statsample 1.5.0 → 2.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.build.sh +15 -0
- data/.gitignore +1 -0
- data/.travis.yml +19 -7
- data/CONTRIBUTING.md +33 -0
- data/History.txt +5 -0
- data/README.md +41 -53
- data/benchmarks/correlation_matrix_15_variables.rb +6 -5
- data/benchmarks/correlation_matrix_5_variables.rb +6 -5
- data/benchmarks/correlation_matrix_methods/correlation_matrix.rb +23 -26
- data/examples/boxplot.rb +17 -5
- data/examples/correlation_matrix.rb +36 -7
- data/examples/dataset.rb +25 -5
- data/examples/dominance_analysis.rb +8 -7
- data/examples/dominance_analysis_bootstrap.rb +16 -11
- data/examples/histogram.rb +16 -2
- data/examples/icc.rb +5 -6
- data/examples/levene.rb +17 -3
- data/examples/multiple_regression.rb +6 -3
- data/examples/parallel_analysis.rb +11 -6
- data/examples/polychoric.rb +26 -13
- data/examples/principal_axis.rb +8 -4
- data/examples/reliability.rb +10 -10
- data/examples/scatterplot.rb +8 -0
- data/examples/t_test.rb +7 -0
- data/examples/u_test.rb +10 -2
- data/examples/vector.rb +9 -6
- data/examples/velicer_map_test.rb +12 -8
- data/lib/statsample.rb +13 -47
- data/lib/statsample/analysis/suite.rb +1 -1
- data/lib/statsample/anova/oneway.rb +6 -6
- data/lib/statsample/anova/twoway.rb +26 -24
- data/lib/statsample/bivariate.rb +78 -61
- data/lib/statsample/bivariate/pearson.rb +2 -2
- data/lib/statsample/codification.rb +45 -32
- data/lib/statsample/converter/csv.rb +15 -53
- data/lib/statsample/converter/spss.rb +6 -5
- data/lib/statsample/converters.rb +50 -211
- data/lib/statsample/crosstab.rb +26 -25
- data/lib/statsample/daru.rb +117 -0
- data/lib/statsample/dataset.rb +70 -942
- data/lib/statsample/dominanceanalysis.rb +16 -17
- data/lib/statsample/dominanceanalysis/bootstrap.rb +26 -28
- data/lib/statsample/factor/parallelanalysis.rb +17 -19
- data/lib/statsample/factor/pca.rb +21 -20
- data/lib/statsample/factor/principalaxis.rb +3 -3
- data/lib/statsample/graph/boxplot.rb +8 -16
- data/lib/statsample/graph/histogram.rb +4 -4
- data/lib/statsample/graph/scatterplot.rb +8 -7
- data/lib/statsample/histogram.rb +128 -119
- data/lib/statsample/matrix.rb +20 -16
- data/lib/statsample/multiset.rb +39 -38
- data/lib/statsample/regression.rb +3 -3
- data/lib/statsample/regression/multiple.rb +8 -10
- data/lib/statsample/regression/multiple/alglibengine.rb +96 -89
- data/lib/statsample/regression/multiple/baseengine.rb +32 -32
- data/lib/statsample/regression/multiple/gslengine.rb +33 -36
- data/lib/statsample/regression/multiple/matrixengine.rb +7 -9
- data/lib/statsample/regression/multiple/rubyengine.rb +39 -41
- data/lib/statsample/reliability.rb +23 -25
- data/lib/statsample/reliability/icc.rb +8 -7
- data/lib/statsample/reliability/multiscaleanalysis.rb +14 -12
- data/lib/statsample/reliability/scaleanalysis.rb +58 -60
- data/lib/statsample/reliability/skillscaleanalysis.rb +34 -29
- data/lib/statsample/resample.rb +1 -1
- data/lib/statsample/shorthand.rb +29 -25
- data/lib/statsample/test/kolmogorovsmirnov.rb +5 -3
- data/lib/statsample/test/levene.rb +28 -27
- data/lib/statsample/test/t.rb +7 -9
- data/lib/statsample/test/umannwhitney.rb +28 -28
- data/lib/statsample/test/wilcoxonsignedrank.rb +45 -43
- data/lib/statsample/vector.rb +70 -1013
- data/lib/statsample/version.rb +1 -1
- data/statsample.gemspec +12 -16
- data/test/helpers_tests.rb +1 -1
- data/test/test_analysis.rb +17 -17
- data/test/test_anova_contrast.rb +6 -6
- data/test/test_anovatwowaywithdataset.rb +8 -8
- data/test/test_anovawithvectors.rb +8 -8
- data/test/test_awesome_print_bug.rb +1 -1
- data/test/test_bartlettsphericity.rb +4 -4
- data/test/test_bivariate.rb +48 -43
- data/test/test_codification.rb +33 -33
- data/test/test_crosstab.rb +9 -9
- data/test/test_dataset.rb +28 -458
- data/test/test_factor.rb +46 -38
- data/test/test_factor_pa.rb +22 -13
- data/test/test_ggobi.rb +4 -4
- data/test/test_gsl.rb +4 -4
- data/test/test_histogram.rb +3 -3
- data/test/test_matrix.rb +13 -13
- data/test/test_multiset.rb +103 -91
- data/test/test_regression.rb +57 -52
- data/test/test_reliability.rb +55 -45
- data/test/test_reliability_icc.rb +8 -8
- data/test/test_reliability_skillscale.rb +26 -24
- data/test/test_resample.rb +1 -1
- data/test/test_statistics.rb +3 -13
- data/test/test_stest.rb +9 -9
- data/test/test_stratified.rb +3 -3
- data/test/test_test_t.rb +12 -12
- data/test/test_umannwhitney.rb +2 -2
- data/test/test_vector.rb +76 -613
- data/test/test_wilcoxonsignedrank.rb +4 -4
- metadata +57 -28
- data/lib/statsample/rserve_extension.rb +0 -20
- data/lib/statsample/vector/gsl.rb +0 -106
- data/test/fixtures/repeated_fields.csv +0 -7
- data/test/fixtures/scientific_notation.csv +0 -4
- data/test/fixtures/test_csv.csv +0 -7
- data/test/fixtures/test_xls.xls +0 -0
- data/test/test_csv.rb +0 -63
- data/test/test_rserve_extension.rb +0 -42
- data/test/test_xls.rb +0 -52
@@ -7,8 +7,8 @@ module Statsample
|
|
7
7
|
# variables.
|
8
8
|
#
|
9
9
|
# == Usage
|
10
|
-
# a = [1,2,3,4,5,6]
|
11
|
-
# b = [2,3,4,5,6,7]
|
10
|
+
# a = Daru::Vector.new([1,2,3,4,5,6])
|
11
|
+
# b = Daru::Vector.new([2,3,4,5,6,7])
|
12
12
|
# pearson = Statsample::Bivariate::Pearson.new(a,b)
|
13
13
|
# puts pearson.r
|
14
14
|
# puts pearson.t
|
@@ -34,24 +34,33 @@ module Statsample
|
|
34
34
|
# will be hashes, with keys = values, for recodification
|
35
35
|
def create_hash(dataset, vectors, sep=Statsample::SPLIT_TOKEN)
|
36
36
|
raise ArgumentError,"Array should't be empty" if vectors.size==0
|
37
|
-
pro_hash=vectors.inject({})
|
38
|
-
|
39
|
-
|
40
|
-
|
37
|
+
pro_hash = vectors.inject({}) do |h,v_name|
|
38
|
+
v_name = v_name.is_a?(Numeric) ? v_name : v_name.to_sym
|
39
|
+
raise Exception, "Vector #{v_name} doesn't exists on Dataset" if
|
40
|
+
!dataset.vectors.include?(v_name)
|
41
|
+
v = dataset[v_name]
|
42
|
+
split_data = v.splitted(sep)
|
43
|
+
.flatten
|
44
|
+
.collect { |c| c.to_s }
|
45
|
+
.find_all{ |c| !c.nil? }
|
41
46
|
|
42
|
-
factors=split_data.uniq
|
43
|
-
|
47
|
+
factors = split_data.uniq
|
48
|
+
.compact
|
49
|
+
.sort
|
50
|
+
.inject({}) { |ac,val| ac[val] = val; ac }
|
51
|
+
h[v_name] = factors
|
44
52
|
h
|
45
|
-
|
53
|
+
end
|
54
|
+
|
46
55
|
pro_hash
|
47
56
|
end
|
48
57
|
# Create a yaml to create a dictionary, based on vectors
|
49
58
|
# The keys will be vectors name on dataset and the values
|
50
59
|
# will be hashes, with keys = values, for recodification
|
51
60
|
#
|
52
|
-
# v1
|
53
|
-
# ds={
|
54
|
-
# Statsample::Codification.create_yaml(ds,[
|
61
|
+
# v1 = Daru::Vector.new(%w{a,b b,c d})
|
62
|
+
# ds = Daru::DataFrame.new({:v1 => v1})
|
63
|
+
# Statsample::Codification.create_yaml(ds,[:v1])
|
55
64
|
# => "--- \nv1: \n a: a\n b: b\n c: c\n d: d\n"
|
56
65
|
def create_yaml(dataset, vectors, io=nil, sep=Statsample::SPLIT_TOKEN)
|
57
66
|
pro_hash=create_hash(dataset, vectors, sep)
|
@@ -69,16 +78,17 @@ module Statsample
|
|
69
78
|
if File.exist?(filename)
|
70
79
|
raise "Exists a file named #{filename}. Delete ir before overwrite."
|
71
80
|
end
|
72
|
-
book
|
81
|
+
book = Spreadsheet::Workbook.new
|
73
82
|
sheet = book.create_worksheet
|
74
|
-
sheet.row(0).concat(%w
|
75
|
-
i=1
|
83
|
+
sheet.row(0).concat(%w(field original recoded))
|
84
|
+
i = 1
|
76
85
|
create_hash(dataset, vectors, sep).sort.each do |field, inner_hash|
|
77
86
|
inner_hash.sort.each do |k,v|
|
78
|
-
sheet.row(i).concat([field.
|
79
|
-
i+=1
|
87
|
+
sheet.row(i).concat([field.to_s,k.to_s,v.to_s])
|
88
|
+
i += 1
|
80
89
|
end
|
81
90
|
end
|
91
|
+
|
82
92
|
book.write(filename)
|
83
93
|
end
|
84
94
|
# From a excel generates a dictionary hash
|
@@ -91,10 +101,11 @@ module Statsample
|
|
91
101
|
sheet= book.worksheet 0
|
92
102
|
row_i=0
|
93
103
|
sheet.each do |row|
|
94
|
-
row_i+=1
|
95
|
-
next if row_i==1 or row[0].nil? or row[1].nil? or row[2].nil?
|
96
|
-
|
97
|
-
h[
|
104
|
+
row_i += 1
|
105
|
+
next if row_i == 1 or row[0].nil? or row[1].nil? or row[2].nil?
|
106
|
+
key = row[0].to_sym
|
107
|
+
h[key] ||= {}
|
108
|
+
h[key][row[1]] = row[2]
|
98
109
|
end
|
99
110
|
h
|
100
111
|
end
|
@@ -110,12 +121,12 @@ module Statsample
|
|
110
121
|
end
|
111
122
|
|
112
123
|
def dictionary(h, sep=Statsample::SPLIT_TOKEN)
|
113
|
-
h.inject({}) {|a,v| a[v[0]]=v[1].split(sep); a }
|
124
|
+
h.inject({}) { |a,v| a[v[0]]=v[1].split(sep); a }
|
114
125
|
end
|
115
126
|
|
116
127
|
def recode_vector(v,h,sep=Statsample::SPLIT_TOKEN)
|
117
|
-
dict=dictionary(h,sep)
|
118
|
-
new_data=v.splitted(sep)
|
128
|
+
dict = dictionary(h,sep)
|
129
|
+
new_data = v.splitted(sep)
|
119
130
|
new_data.collect do |c|
|
120
131
|
if c.nil?
|
121
132
|
nil
|
@@ -134,20 +145,22 @@ module Statsample
|
|
134
145
|
def _recode_dataset(dataset, h , sep=Statsample::SPLIT_TOKEN, split=false)
|
135
146
|
v_names||=h.keys
|
136
147
|
v_names.each do |v_name|
|
137
|
-
raise Exception, "Vector #{v_name} doesn't exists on Dataset" if !dataset.
|
138
|
-
recoded=
|
139
|
-
|
140
|
-
nil
|
141
|
-
|
142
|
-
|
148
|
+
raise Exception, "Vector #{v_name} doesn't exists on Dataset" if !dataset.vectors.include? v_name
|
149
|
+
recoded = Daru::Vector.new(
|
150
|
+
recode_vector(dataset[v_name], h[v_name],sep).collect do |c|
|
151
|
+
if c.nil?
|
152
|
+
nil
|
153
|
+
else
|
154
|
+
c.join(sep)
|
155
|
+
end
|
143
156
|
end
|
144
|
-
|
145
|
-
if
|
157
|
+
)
|
158
|
+
if split
|
146
159
|
recoded.split_by_separator(sep).each {|k,v|
|
147
|
-
dataset[v_name+"_"+k]=v
|
160
|
+
dataset[(v_name.to_s + "_" + k).to_sym] = v
|
148
161
|
}
|
149
162
|
else
|
150
|
-
dataset[v_name+"_recoded"]=recoded
|
163
|
+
dataset[(v_name.to_s + "_recoded").to_sym] = recoded
|
151
164
|
end
|
152
165
|
end
|
153
166
|
end
|
@@ -1,65 +1,27 @@
|
|
1
|
-
|
2
|
-
|
1
|
+
# This module will be removed in the next release.
|
2
|
+
# Please shift to using Daru::DataFrame.from_csv and #write_csv for CSV
|
3
|
+
# related operations.
|
3
4
|
module Statsample
|
4
|
-
class CSV
|
5
|
-
# Default options for processing CSV files. Accept the same options as
|
6
|
-
# Ruby's `CSV#new`.
|
7
|
-
DEFAULT_OPTIONS = {
|
8
|
-
converters: [:numeric]
|
9
|
-
}
|
10
|
-
|
5
|
+
class CSV
|
11
6
|
class << self
|
12
|
-
# Return a
|
7
|
+
# Return a DataFrom created from a csv file.
|
13
8
|
#
|
14
|
-
#
|
15
|
-
#
|
9
|
+
# == NOTE
|
10
|
+
#
|
11
|
+
# This method has been DEPRECATED in favour of Daru::DataFrame.from_csv.
|
12
|
+
# Please switch to using that.
|
16
13
|
def read(filename, empty = [''], ignore_lines = 0, opts = {})
|
17
|
-
|
18
|
-
fields = []
|
19
|
-
ds = nil
|
20
|
-
line_number = 0
|
21
|
-
options = DEFAULT_OPTIONS.merge(opts)
|
22
|
-
|
23
|
-
csv = ::CSV.open(filename, 'rb', options)
|
24
|
-
|
25
|
-
csv.each do |row|
|
26
|
-
line_number += 1
|
27
|
-
|
28
|
-
if (line_number <= ignore_lines)
|
29
|
-
next
|
30
|
-
end
|
31
|
-
|
32
|
-
if first_row
|
33
|
-
fields = extract_fields(row)
|
34
|
-
ds = Statsample::Dataset.new(fields)
|
35
|
-
first_row = false
|
36
|
-
else
|
37
|
-
rowa = process_row(row, empty)
|
38
|
-
ds.add_case(rowa, false)
|
39
|
-
end
|
40
|
-
end
|
41
|
-
|
42
|
-
convert_to_numeric_and_date(ds, fields)
|
43
|
-
ds.update_valid_data
|
44
|
-
ds
|
14
|
+
raise NoMethodError, "Deprecated. Use Daru::DataFrame.from_csv instead."
|
45
15
|
end
|
46
16
|
|
47
17
|
# Save a Dataset on a csv file.
|
48
18
|
#
|
49
|
-
#
|
50
|
-
#
|
19
|
+
# == NOTE
|
20
|
+
#
|
21
|
+
# This method has BEEN DEPRECATED in favor of Daru::DataFrame#write_csv.
|
22
|
+
# Please use that instead.
|
51
23
|
def write(dataset, filename, convert_comma = false, opts = {})
|
52
|
-
|
53
|
-
|
54
|
-
writer = ::CSV.open(filename, 'w', options)
|
55
|
-
writer << dataset.fields
|
56
|
-
|
57
|
-
dataset.each_array do |row|
|
58
|
-
row.collect! { |v| v.to_s.gsub('.', ',') } if convert_comma
|
59
|
-
writer << row
|
60
|
-
end
|
61
|
-
|
62
|
-
writer.close
|
24
|
+
raise NoMethodError, "Deprecated. Use Daru::DataFrame#write_csv instead."
|
63
25
|
end
|
64
26
|
end
|
65
27
|
end
|
@@ -4,26 +4,27 @@ module Statsample
|
|
4
4
|
# Export a SPSS Matrix with tetrachoric correlations .
|
5
5
|
#
|
6
6
|
# Use:
|
7
|
-
# ds=
|
7
|
+
# ds=Daru::DataFrame.from_excel("my_data.xls")
|
8
8
|
# puts Statsample::SPSS.tetrachoric_correlation_matrix(ds)
|
9
9
|
def tetrachoric_correlation_matrix(ds)
|
10
10
|
dsv=ds.dup_only_valid
|
11
11
|
# Delete all vectors doesn't have variation
|
12
|
-
dsv.
|
12
|
+
dsv.vectors.each { |f|
|
13
13
|
if dsv[f].factors.size==1
|
14
14
|
dsv.delete_vector(f)
|
15
15
|
else
|
16
16
|
dsv[f]=dsv[f].dichotomize
|
17
17
|
end
|
18
18
|
}
|
19
|
+
|
19
20
|
tcm=Statsample::Bivariate.tetrachoric_correlation_matrix(dsv)
|
20
|
-
n=dsv.
|
21
|
+
n=dsv.vectors.to_a.collect {|f|
|
21
22
|
sprintf("%d",dsv[f].size)
|
22
23
|
}
|
23
|
-
meanlist=dsv.
|
24
|
+
meanlist=dsv.vectors.to_a.collect{|f|
|
24
25
|
sprintf("%0.3f", dsv[f].mean)
|
25
26
|
}
|
26
|
-
stddevlist=dsv.
|
27
|
+
stddevlist=dsv.vectors.to_a.collect{|f|
|
27
28
|
sprintf("%0.3f", dsv[f].sd)
|
28
29
|
}
|
29
30
|
out=<<-HEREDOC
|
@@ -1,63 +1,36 @@
|
|
1
1
|
require 'statsample/converter/spss'
|
2
2
|
module Statsample
|
3
|
-
|
3
|
+
# Create and dumps Datasets on a database
|
4
|
+
#
|
5
|
+
# == NOTE
|
6
|
+
#
|
7
|
+
# Deprecated. Use Daru::DataFrame.from_sql and Daru::DataFrame#write_sql
|
4
8
|
module Database
|
5
9
|
class << self
|
6
10
|
# Read a database query and returns a Dataset
|
7
11
|
#
|
8
|
-
#
|
9
|
-
#
|
10
|
-
#
|
11
|
-
# Statsample.read(dbh, "SELECT * FROM test")
|
12
|
-
#
|
12
|
+
# == NOTE
|
13
|
+
#
|
14
|
+
# Deprecated. Use Daru::DataFrame.from_sql instead.
|
13
15
|
def read(dbh,query)
|
14
|
-
|
15
|
-
sth=dbh.execute(query)
|
16
|
-
vectors={}
|
17
|
-
fields=[]
|
18
|
-
sth.column_info.each {|c|
|
19
|
-
vectors[c['name']]=Statsample::Vector.new([])
|
20
|
-
vectors[c['name']].name=c['name']
|
21
|
-
vectors[c['name']].type= (c['type_name']=='INTEGER' or c['type_name']=='DOUBLE') ? :numeric : :object
|
22
|
-
fields.push(c['name'])
|
23
|
-
}
|
24
|
-
ds=Statsample::Dataset.new(vectors,fields)
|
25
|
-
sth.fetch do |row|
|
26
|
-
ds.add_case(row.to_a, false )
|
27
|
-
end
|
28
|
-
ds.update_valid_data
|
29
|
-
ds
|
16
|
+
raise NoMethodError, "Deprecated. Use Daru::DataFrame.from_sql instead."
|
30
17
|
end
|
18
|
+
|
31
19
|
# Insert each case of the Dataset on the selected table
|
32
20
|
#
|
33
|
-
#
|
34
|
-
#
|
35
|
-
#
|
36
|
-
# dbh = DBI.connect("DBI:Mysql:database:localhost", "user", "password")
|
37
|
-
# Statsample::Database.insert(ds,dbh,"test")
|
38
|
-
#
|
21
|
+
# == NOTE
|
22
|
+
#
|
23
|
+
# Deprecated. Use Daru::DataFrame#write_sql instead
|
39
24
|
def insert(ds, dbh, table)
|
40
|
-
|
41
|
-
query="INSERT INTO #{table} ("+ds.fields.join(",")+") VALUES ("+((["?"]*ds.fields.size).join(","))+")"
|
42
|
-
sth=dbh.prepare(query)
|
43
|
-
ds.each_array{|c| sth.execute(*c) }
|
44
|
-
return true
|
25
|
+
raise NoMethodError, "Deprecated. Use Daru::DataFrame#write_sql instead."
|
45
26
|
end
|
46
27
|
# Create a sql, basen on a given Dataset
|
47
28
|
#
|
48
|
-
#
|
49
|
-
#
|
50
|
-
#
|
51
|
-
# Statsample::Database.create_sql(ds,'names')
|
52
|
-
# ==>"CREATE TABLE names (id INTEGER,\n name VARCHAR (255)) CHARACTER SET=UTF8;"
|
53
|
-
#
|
29
|
+
# == NOTE
|
30
|
+
#
|
31
|
+
# Deprecated. Use Daru::DataFrame#create_sql instead.
|
54
32
|
def create_sql(ds,table,charset="UTF8")
|
55
|
-
|
56
|
-
fields=ds.fields.collect{|f|
|
57
|
-
v=ds[f]
|
58
|
-
f+" "+v.db_type
|
59
|
-
}
|
60
|
-
sql+fields.join(",\n ")+") CHARACTER SET=#{charset};"
|
33
|
+
raise NoMethodError, "Deprecated. Use Daru::DataFrame#create_sql instead."
|
61
34
|
end
|
62
35
|
end
|
63
36
|
end
|
@@ -65,182 +38,49 @@ module Statsample
|
|
65
38
|
class << self
|
66
39
|
def write(dataset,filename)
|
67
40
|
File.open(filename,"wb") do |fp|
|
68
|
-
fp.puts dataset.
|
69
|
-
dataset.
|
70
|
-
row2=row.
|
41
|
+
fp.puts dataset.vectors.to_a.join("\t")
|
42
|
+
dataset.each_row do |row|
|
43
|
+
row2 = row.map { |v| v.nil? ? "NA" : v.to_s.gsub(/\s+/,"_") }
|
71
44
|
fp.puts row2.join("\t")
|
72
45
|
end
|
73
46
|
end
|
74
47
|
end
|
75
48
|
end
|
76
49
|
end
|
77
|
-
class SpreadsheetBase
|
78
|
-
class << self
|
79
|
-
def extract_fields(row)
|
80
|
-
i=0;
|
81
|
-
fields=row.to_a.collect{|c|
|
82
|
-
if c.nil?
|
83
|
-
i+=1
|
84
|
-
"var%05d" % i
|
85
|
-
else
|
86
|
-
c.to_s.downcase
|
87
|
-
end
|
88
|
-
}
|
89
|
-
fields.recode_repeated
|
90
|
-
end
|
91
50
|
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
else
|
97
|
-
if c.is_a? String and c.is_number?
|
98
|
-
if c=~/^\d+$/
|
99
|
-
c.to_i
|
100
|
-
else
|
101
|
-
c.gsub(",",".").to_f
|
102
|
-
end
|
103
|
-
else
|
104
|
-
c
|
105
|
-
end
|
106
|
-
end
|
107
|
-
end
|
108
|
-
end
|
109
|
-
def convert_to_numeric_and_date(ds,fields)
|
110
|
-
fields.each do |f|
|
111
|
-
if ds[f].can_be_numeric?
|
112
|
-
ds[f].type=:numeric
|
113
|
-
elsif ds[f].can_be_date?
|
114
|
-
ds[f].type=:date
|
115
|
-
end
|
116
|
-
end
|
51
|
+
class PlainText
|
52
|
+
class << self
|
53
|
+
def read(filename, fields)
|
54
|
+
raise NoMethodError, "Deprecated. Use Daru::DataFrame.from_plaintext instead."
|
117
55
|
end
|
118
|
-
|
119
56
|
end
|
120
57
|
end
|
121
|
-
|
122
|
-
|
123
|
-
|
124
|
-
|
125
|
-
fp=File.open(filename,"r")
|
126
|
-
fp.each_line do |line|
|
127
|
-
row=process_row(line.strip.split(/\s+/),[""])
|
128
|
-
next if row==["\x1A"]
|
129
|
-
ds.add_case_array(row)
|
130
|
-
end
|
131
|
-
convert_to_numeric_and_date(ds,fields)
|
132
|
-
ds.update_valid_data
|
133
|
-
fields.each {|f|
|
134
|
-
ds[f].name=f
|
135
|
-
}
|
136
|
-
ds
|
137
|
-
end
|
138
|
-
end
|
139
|
-
end
|
140
|
-
class Excel < SpreadsheetBase
|
58
|
+
|
59
|
+
# This class has been DEPRECATED. Use Daru::DataFrame::from_excel
|
60
|
+
# Daru::DataFrame#write_excel for XLS file operations.
|
61
|
+
class Excel
|
141
62
|
class << self
|
142
63
|
# Write a Excel spreadsheet based on a dataset
|
143
64
|
# * TODO: Format nicely date values
|
65
|
+
#
|
66
|
+
# == NOTE
|
67
|
+
#
|
68
|
+
# Deprecated. Use Daru::DataFrame#write_csv.
|
144
69
|
def write(dataset,filename)
|
145
|
-
|
146
|
-
book = Spreadsheet::Workbook.new
|
147
|
-
sheet = book.create_worksheet
|
148
|
-
format = Spreadsheet::Format.new :color => :blue,
|
149
|
-
:weight => :bold
|
150
|
-
sheet.row(0).concat(dataset.fields.map {|i| i.dup}) # Unfreeze strings
|
151
|
-
sheet.row(0).default_format = format
|
152
|
-
i=1
|
153
|
-
dataset.each_array{|row|
|
154
|
-
sheet.row(i).concat(row)
|
155
|
-
i+=1
|
156
|
-
}
|
157
|
-
book.write(filename)
|
158
|
-
end
|
159
|
-
# This should be fixed.
|
160
|
-
# If we have a Formula, should be resolver first
|
161
|
-
|
162
|
-
def preprocess_row(row, dates)
|
163
|
-
i=-1
|
164
|
-
row.collect!{|c|
|
165
|
-
i+=1
|
166
|
-
if c.is_a? Spreadsheet::Formula
|
167
|
-
if(c.value.is_a? Spreadsheet::Excel::Error)
|
168
|
-
nil
|
169
|
-
else
|
170
|
-
c.value
|
171
|
-
end
|
172
|
-
elsif dates.include? i and !c.nil? and c.is_a? Numeric
|
173
|
-
row.date(i)
|
174
|
-
else
|
175
|
-
c
|
176
|
-
end
|
177
|
-
}
|
70
|
+
raise NoMethodError, "Deprecated. Use Daru::DataFrame#write_excel instead."
|
178
71
|
end
|
179
|
-
private :process_row, :preprocess_row
|
180
72
|
|
181
73
|
# Returns a dataset based on a xls file
|
182
|
-
#
|
183
|
-
#
|
184
|
-
#
|
74
|
+
#
|
75
|
+
# == NOTE
|
76
|
+
#
|
77
|
+
# Deprecated. Use Daru::DataFrame.from_excel instead.
|
185
78
|
def read(filename, opts=Hash.new)
|
186
|
-
|
187
|
-
raise "options should be Hash" unless opts.is_a? Hash
|
188
|
-
opts_default={
|
189
|
-
:worksheet_id=>0,
|
190
|
-
:ignore_lines=>0,
|
191
|
-
:empty=>['']
|
192
|
-
}
|
193
|
-
|
194
|
-
opts=opts_default.merge opts
|
195
|
-
|
196
|
-
worksheet_id=opts[:worksheet_id]
|
197
|
-
ignore_lines=opts[:ignore_lines]
|
198
|
-
empty=opts[:empty]
|
199
|
-
|
200
|
-
first_row=true
|
201
|
-
fields=[]
|
202
|
-
ds=nil
|
203
|
-
line_number=0
|
204
|
-
book = Spreadsheet.open filename
|
205
|
-
sheet= book.worksheet worksheet_id
|
206
|
-
sheet.each do |row|
|
207
|
-
begin
|
208
|
-
dates=[]
|
209
|
-
row.formats.each_index{|i|
|
210
|
-
if !row.formats[i].nil? and row.formats[i].number_format=="DD/MM/YYYY"
|
211
|
-
dates.push(i)
|
212
|
-
end
|
213
|
-
}
|
214
|
-
line_number+=1
|
215
|
-
next if(line_number<=ignore_lines)
|
216
|
-
|
217
|
-
preprocess_row(row,dates)
|
218
|
-
if first_row
|
219
|
-
fields=extract_fields(row)
|
220
|
-
ds=Statsample::Dataset.new(fields)
|
221
|
-
first_row=false
|
222
|
-
else
|
223
|
-
rowa=process_row(row,empty)
|
224
|
-
(fields.size - rowa.size).times {
|
225
|
-
rowa << nil
|
226
|
-
}
|
227
|
-
ds.add_case(rowa,false)
|
228
|
-
end
|
229
|
-
rescue => e
|
230
|
-
error="#{e.to_s}\nError on Line # #{line_number}:#{row.join(",")}"
|
231
|
-
raise
|
232
|
-
end
|
233
|
-
end
|
234
|
-
convert_to_numeric_and_date(ds, fields)
|
235
|
-
ds.update_valid_data
|
236
|
-
fields.each {|f|
|
237
|
-
ds[f].name=f
|
238
|
-
}
|
239
|
-
ds.name=filename
|
240
|
-
ds
|
79
|
+
raise NoMethodError, "Deprecated. Use Daru::DataFrame.from_excel instead."
|
241
80
|
end
|
242
81
|
end
|
243
82
|
end
|
83
|
+
|
244
84
|
module Mx
|
245
85
|
class << self
|
246
86
|
def write(dataset,filename,type=:covariance)
|
@@ -249,12 +89,12 @@ module Statsample
|
|
249
89
|
fp.puts "! #{filename}"
|
250
90
|
fp.puts "! Output generated by Statsample"
|
251
91
|
fp.puts "Data Ninput=#{dataset.fields.size} Nobservations=#{dataset.cases}"
|
252
|
-
fp.puts "Labels "+dataset.
|
92
|
+
fp.puts "Labels " + dataset.vectors.to_a.join(" ")
|
253
93
|
case type
|
254
94
|
when :raw
|
255
95
|
fp.puts "Rectangular"
|
256
96
|
dataset.each do |row|
|
257
|
-
out=dataset.
|
97
|
+
out=dataset.vectors.to_a.collect do |f|
|
258
98
|
if dataset[f].is_valid? row[f]
|
259
99
|
row[f]
|
260
100
|
else
|
@@ -292,18 +132,18 @@ module Statsample
|
|
292
132
|
carrier=OpenStruct.new
|
293
133
|
carrier.categorials=[]
|
294
134
|
carrier.conversions={}
|
295
|
-
variables_def=dataset.
|
135
|
+
variables_def=dataset.vectors.to_a.collect{|k|
|
296
136
|
variable_definition(carrier,dataset[k],k)
|
297
137
|
}.join("\n")
|
298
138
|
|
299
139
|
indexes=carrier.categorials.inject({}) {|s,c|
|
300
|
-
s[dataset.
|
140
|
+
s[dataset.vectors.to_a.index(c)]=c
|
301
141
|
s
|
302
142
|
}
|
303
143
|
records=""
|
304
|
-
dataset.
|
305
|
-
indexes.each{|ik,iv|
|
306
|
-
c[ik]=carrier.conversions[iv][c[ik]]
|
144
|
+
dataset.each_row {|c|
|
145
|
+
indexes.each { |ik,iv|
|
146
|
+
c[ik] = carrier.conversions[iv][c[ik]]
|
307
147
|
}
|
308
148
|
records << "<record>#{values_definition(c, default_opt[:missing])}</record>\n"
|
309
149
|
}
|
@@ -345,7 +185,7 @@ out
|
|
345
185
|
# nickname = nickname
|
346
186
|
def variable_definition(carrier,v,name,nickname=nil)
|
347
187
|
nickname = (nickname.nil? ? "" : "nickname=\"#{nickname}\"" )
|
348
|
-
if v.type==:object or v.
|
188
|
+
if v.type==:object or v.to_a.find {|d| d.is_a? String }
|
349
189
|
carrier.categorials.push(name)
|
350
190
|
carrier.conversions[name]={}
|
351
191
|
factors=v.factors
|
@@ -353,17 +193,16 @@ out
|
|
353
193
|
out << "<levels count=\"#{factors.size}\">\n"
|
354
194
|
out << (1..factors.size).to_a.collect{|i|
|
355
195
|
carrier.conversions[name][factors[i-1]]=i
|
356
|
-
"<level value=\"#{i}\">#{v.
|
196
|
+
"<level value=\"#{i}\">#{(v.labels[factors[i-1]] || factors[i-1])}</level>"
|
357
197
|
}.join("\n")
|
358
198
|
out << "</levels>\n</categoricalvariable>\n"
|
359
199
|
out
|
360
|
-
elsif v.
|
200
|
+
elsif v.to_a.find {|d| d.is_a? Float}
|
361
201
|
"<realvariable name=\"#{name}\" #{nickname} />"
|
362
202
|
else
|
363
203
|
"<integervariable name=\"#{name}\" #{nickname} />"
|
364
204
|
end
|
365
205
|
end
|
366
|
-
|
367
206
|
end
|
368
207
|
end
|
369
208
|
end
|