statsample 1.5.0 → 2.0.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.build.sh +15 -0
- data/.gitignore +1 -0
- data/.travis.yml +19 -7
- data/CONTRIBUTING.md +33 -0
- data/History.txt +5 -0
- data/README.md +41 -53
- data/benchmarks/correlation_matrix_15_variables.rb +6 -5
- data/benchmarks/correlation_matrix_5_variables.rb +6 -5
- data/benchmarks/correlation_matrix_methods/correlation_matrix.rb +23 -26
- data/examples/boxplot.rb +17 -5
- data/examples/correlation_matrix.rb +36 -7
- data/examples/dataset.rb +25 -5
- data/examples/dominance_analysis.rb +8 -7
- data/examples/dominance_analysis_bootstrap.rb +16 -11
- data/examples/histogram.rb +16 -2
- data/examples/icc.rb +5 -6
- data/examples/levene.rb +17 -3
- data/examples/multiple_regression.rb +6 -3
- data/examples/parallel_analysis.rb +11 -6
- data/examples/polychoric.rb +26 -13
- data/examples/principal_axis.rb +8 -4
- data/examples/reliability.rb +10 -10
- data/examples/scatterplot.rb +8 -0
- data/examples/t_test.rb +7 -0
- data/examples/u_test.rb +10 -2
- data/examples/vector.rb +9 -6
- data/examples/velicer_map_test.rb +12 -8
- data/lib/statsample.rb +13 -47
- data/lib/statsample/analysis/suite.rb +1 -1
- data/lib/statsample/anova/oneway.rb +6 -6
- data/lib/statsample/anova/twoway.rb +26 -24
- data/lib/statsample/bivariate.rb +78 -61
- data/lib/statsample/bivariate/pearson.rb +2 -2
- data/lib/statsample/codification.rb +45 -32
- data/lib/statsample/converter/csv.rb +15 -53
- data/lib/statsample/converter/spss.rb +6 -5
- data/lib/statsample/converters.rb +50 -211
- data/lib/statsample/crosstab.rb +26 -25
- data/lib/statsample/daru.rb +117 -0
- data/lib/statsample/dataset.rb +70 -942
- data/lib/statsample/dominanceanalysis.rb +16 -17
- data/lib/statsample/dominanceanalysis/bootstrap.rb +26 -28
- data/lib/statsample/factor/parallelanalysis.rb +17 -19
- data/lib/statsample/factor/pca.rb +21 -20
- data/lib/statsample/factor/principalaxis.rb +3 -3
- data/lib/statsample/graph/boxplot.rb +8 -16
- data/lib/statsample/graph/histogram.rb +4 -4
- data/lib/statsample/graph/scatterplot.rb +8 -7
- data/lib/statsample/histogram.rb +128 -119
- data/lib/statsample/matrix.rb +20 -16
- data/lib/statsample/multiset.rb +39 -38
- data/lib/statsample/regression.rb +3 -3
- data/lib/statsample/regression/multiple.rb +8 -10
- data/lib/statsample/regression/multiple/alglibengine.rb +96 -89
- data/lib/statsample/regression/multiple/baseengine.rb +32 -32
- data/lib/statsample/regression/multiple/gslengine.rb +33 -36
- data/lib/statsample/regression/multiple/matrixengine.rb +7 -9
- data/lib/statsample/regression/multiple/rubyengine.rb +39 -41
- data/lib/statsample/reliability.rb +23 -25
- data/lib/statsample/reliability/icc.rb +8 -7
- data/lib/statsample/reliability/multiscaleanalysis.rb +14 -12
- data/lib/statsample/reliability/scaleanalysis.rb +58 -60
- data/lib/statsample/reliability/skillscaleanalysis.rb +34 -29
- data/lib/statsample/resample.rb +1 -1
- data/lib/statsample/shorthand.rb +29 -25
- data/lib/statsample/test/kolmogorovsmirnov.rb +5 -3
- data/lib/statsample/test/levene.rb +28 -27
- data/lib/statsample/test/t.rb +7 -9
- data/lib/statsample/test/umannwhitney.rb +28 -28
- data/lib/statsample/test/wilcoxonsignedrank.rb +45 -43
- data/lib/statsample/vector.rb +70 -1013
- data/lib/statsample/version.rb +1 -1
- data/statsample.gemspec +12 -16
- data/test/helpers_tests.rb +1 -1
- data/test/test_analysis.rb +17 -17
- data/test/test_anova_contrast.rb +6 -6
- data/test/test_anovatwowaywithdataset.rb +8 -8
- data/test/test_anovawithvectors.rb +8 -8
- data/test/test_awesome_print_bug.rb +1 -1
- data/test/test_bartlettsphericity.rb +4 -4
- data/test/test_bivariate.rb +48 -43
- data/test/test_codification.rb +33 -33
- data/test/test_crosstab.rb +9 -9
- data/test/test_dataset.rb +28 -458
- data/test/test_factor.rb +46 -38
- data/test/test_factor_pa.rb +22 -13
- data/test/test_ggobi.rb +4 -4
- data/test/test_gsl.rb +4 -4
- data/test/test_histogram.rb +3 -3
- data/test/test_matrix.rb +13 -13
- data/test/test_multiset.rb +103 -91
- data/test/test_regression.rb +57 -52
- data/test/test_reliability.rb +55 -45
- data/test/test_reliability_icc.rb +8 -8
- data/test/test_reliability_skillscale.rb +26 -24
- data/test/test_resample.rb +1 -1
- data/test/test_statistics.rb +3 -13
- data/test/test_stest.rb +9 -9
- data/test/test_stratified.rb +3 -3
- data/test/test_test_t.rb +12 -12
- data/test/test_umannwhitney.rb +2 -2
- data/test/test_vector.rb +76 -613
- data/test/test_wilcoxonsignedrank.rb +4 -4
- metadata +57 -28
- data/lib/statsample/rserve_extension.rb +0 -20
- data/lib/statsample/vector/gsl.rb +0 -106
- data/test/fixtures/repeated_fields.csv +0 -7
- data/test/fixtures/scientific_notation.csv +0 -4
- data/test/fixtures/test_csv.csv +0 -7
- data/test/fixtures/test_xls.xls +0 -0
- data/test/test_csv.rb +0 -63
- data/test/test_rserve_extension.rb +0 -42
- data/test/test_xls.rb +0 -52
@@ -7,8 +7,8 @@ module Statsample
|
|
7
7
|
# variables.
|
8
8
|
#
|
9
9
|
# == Usage
|
10
|
-
# a = [1,2,3,4,5,6]
|
11
|
-
# b = [2,3,4,5,6,7]
|
10
|
+
# a = Daru::Vector.new([1,2,3,4,5,6])
|
11
|
+
# b = Daru::Vector.new([2,3,4,5,6,7])
|
12
12
|
# pearson = Statsample::Bivariate::Pearson.new(a,b)
|
13
13
|
# puts pearson.r
|
14
14
|
# puts pearson.t
|
@@ -34,24 +34,33 @@ module Statsample
|
|
34
34
|
# will be hashes, with keys = values, for recodification
|
35
35
|
def create_hash(dataset, vectors, sep=Statsample::SPLIT_TOKEN)
|
36
36
|
raise ArgumentError,"Array should't be empty" if vectors.size==0
|
37
|
-
pro_hash=vectors.inject({})
|
38
|
-
|
39
|
-
|
40
|
-
|
37
|
+
pro_hash = vectors.inject({}) do |h,v_name|
|
38
|
+
v_name = v_name.is_a?(Numeric) ? v_name : v_name.to_sym
|
39
|
+
raise Exception, "Vector #{v_name} doesn't exists on Dataset" if
|
40
|
+
!dataset.vectors.include?(v_name)
|
41
|
+
v = dataset[v_name]
|
42
|
+
split_data = v.splitted(sep)
|
43
|
+
.flatten
|
44
|
+
.collect { |c| c.to_s }
|
45
|
+
.find_all{ |c| !c.nil? }
|
41
46
|
|
42
|
-
factors=split_data.uniq
|
43
|
-
|
47
|
+
factors = split_data.uniq
|
48
|
+
.compact
|
49
|
+
.sort
|
50
|
+
.inject({}) { |ac,val| ac[val] = val; ac }
|
51
|
+
h[v_name] = factors
|
44
52
|
h
|
45
|
-
|
53
|
+
end
|
54
|
+
|
46
55
|
pro_hash
|
47
56
|
end
|
48
57
|
# Create a yaml to create a dictionary, based on vectors
|
49
58
|
# The keys will be vectors name on dataset and the values
|
50
59
|
# will be hashes, with keys = values, for recodification
|
51
60
|
#
|
52
|
-
# v1
|
53
|
-
# ds={
|
54
|
-
# Statsample::Codification.create_yaml(ds,[
|
61
|
+
# v1 = Daru::Vector.new(%w{a,b b,c d})
|
62
|
+
# ds = Daru::DataFrame.new({:v1 => v1})
|
63
|
+
# Statsample::Codification.create_yaml(ds,[:v1])
|
55
64
|
# => "--- \nv1: \n a: a\n b: b\n c: c\n d: d\n"
|
56
65
|
def create_yaml(dataset, vectors, io=nil, sep=Statsample::SPLIT_TOKEN)
|
57
66
|
pro_hash=create_hash(dataset, vectors, sep)
|
@@ -69,16 +78,17 @@ module Statsample
|
|
69
78
|
if File.exist?(filename)
|
70
79
|
raise "Exists a file named #{filename}. Delete ir before overwrite."
|
71
80
|
end
|
72
|
-
book
|
81
|
+
book = Spreadsheet::Workbook.new
|
73
82
|
sheet = book.create_worksheet
|
74
|
-
sheet.row(0).concat(%w
|
75
|
-
i=1
|
83
|
+
sheet.row(0).concat(%w(field original recoded))
|
84
|
+
i = 1
|
76
85
|
create_hash(dataset, vectors, sep).sort.each do |field, inner_hash|
|
77
86
|
inner_hash.sort.each do |k,v|
|
78
|
-
sheet.row(i).concat([field.
|
79
|
-
i+=1
|
87
|
+
sheet.row(i).concat([field.to_s,k.to_s,v.to_s])
|
88
|
+
i += 1
|
80
89
|
end
|
81
90
|
end
|
91
|
+
|
82
92
|
book.write(filename)
|
83
93
|
end
|
84
94
|
# Generates a dictionary hash from an Excel file
|
@@ -91,10 +101,11 @@ module Statsample
|
|
91
101
|
sheet= book.worksheet 0
|
92
102
|
row_i=0
|
93
103
|
sheet.each do |row|
|
94
|
-
row_i+=1
|
95
|
-
next if row_i==1 or row[0].nil? or row[1].nil? or row[2].nil?
|
96
|
-
|
97
|
-
h[
|
104
|
+
row_i += 1
|
105
|
+
next if row_i == 1 or row[0].nil? or row[1].nil? or row[2].nil?
|
106
|
+
key = row[0].to_sym
|
107
|
+
h[key] ||= {}
|
108
|
+
h[key][row[1]] = row[2]
|
98
109
|
end
|
99
110
|
h
|
100
111
|
end
|
@@ -110,12 +121,12 @@ module Statsample
|
|
110
121
|
end
|
111
122
|
|
112
123
|
def dictionary(h, sep=Statsample::SPLIT_TOKEN)
|
113
|
-
h.inject({}) {|a,v| a[v[0]]=v[1].split(sep); a }
|
124
|
+
h.inject({}) { |a,v| a[v[0]]=v[1].split(sep); a }
|
114
125
|
end
|
115
126
|
|
116
127
|
def recode_vector(v,h,sep=Statsample::SPLIT_TOKEN)
|
117
|
-
dict=dictionary(h,sep)
|
118
|
-
new_data=v.splitted(sep)
|
128
|
+
dict = dictionary(h,sep)
|
129
|
+
new_data = v.splitted(sep)
|
119
130
|
new_data.collect do |c|
|
120
131
|
if c.nil?
|
121
132
|
nil
|
@@ -134,20 +145,22 @@ module Statsample
|
|
134
145
|
def _recode_dataset(dataset, h , sep=Statsample::SPLIT_TOKEN, split=false)
|
135
146
|
v_names||=h.keys
|
136
147
|
v_names.each do |v_name|
|
137
|
-
raise Exception, "Vector #{v_name} doesn't exists on Dataset" if !dataset.
|
138
|
-
recoded=
|
139
|
-
|
140
|
-
nil
|
141
|
-
|
142
|
-
|
148
|
+
raise Exception, "Vector #{v_name} doesn't exists on Dataset" if !dataset.vectors.include? v_name
|
149
|
+
recoded = Daru::Vector.new(
|
150
|
+
recode_vector(dataset[v_name], h[v_name],sep).collect do |c|
|
151
|
+
if c.nil?
|
152
|
+
nil
|
153
|
+
else
|
154
|
+
c.join(sep)
|
155
|
+
end
|
143
156
|
end
|
144
|
-
|
145
|
-
if
|
157
|
+
)
|
158
|
+
if split
|
146
159
|
recoded.split_by_separator(sep).each {|k,v|
|
147
|
-
dataset[v_name+"_"+k]=v
|
160
|
+
dataset[(v_name.to_s + "_" + k).to_sym] = v
|
148
161
|
}
|
149
162
|
else
|
150
|
-
dataset[v_name+"_recoded"]=recoded
|
163
|
+
dataset[(v_name.to_s + "_recoded").to_sym] = recoded
|
151
164
|
end
|
152
165
|
end
|
153
166
|
end
|
@@ -1,65 +1,27 @@
|
|
1
|
-
|
2
|
-
|
1
|
+
# This module will be removed in the next release.
|
2
|
+
# Please shift to using Daru::DataFrame.from_csv and #write_csv for CSV
|
3
|
+
# related operations.
|
3
4
|
module Statsample
|
4
|
-
class CSV
|
5
|
-
# Default options for processing CSV files. Accept the same options as
|
6
|
-
# Ruby's `CSV#new`.
|
7
|
-
DEFAULT_OPTIONS = {
|
8
|
-
converters: [:numeric]
|
9
|
-
}
|
10
|
-
|
5
|
+
class CSV
|
11
6
|
class << self
|
12
|
-
# Return a
|
7
|
+
# Return a DataFrame created from a csv file.
|
13
8
|
#
|
14
|
-
#
|
15
|
-
#
|
9
|
+
# == NOTE
|
10
|
+
#
|
11
|
+
# This method has been DEPRECATED in favour of Daru::DataFrame.from_csv.
|
12
|
+
# Please switch to using that.
|
16
13
|
def read(filename, empty = [''], ignore_lines = 0, opts = {})
|
17
|
-
|
18
|
-
fields = []
|
19
|
-
ds = nil
|
20
|
-
line_number = 0
|
21
|
-
options = DEFAULT_OPTIONS.merge(opts)
|
22
|
-
|
23
|
-
csv = ::CSV.open(filename, 'rb', options)
|
24
|
-
|
25
|
-
csv.each do |row|
|
26
|
-
line_number += 1
|
27
|
-
|
28
|
-
if (line_number <= ignore_lines)
|
29
|
-
next
|
30
|
-
end
|
31
|
-
|
32
|
-
if first_row
|
33
|
-
fields = extract_fields(row)
|
34
|
-
ds = Statsample::Dataset.new(fields)
|
35
|
-
first_row = false
|
36
|
-
else
|
37
|
-
rowa = process_row(row, empty)
|
38
|
-
ds.add_case(rowa, false)
|
39
|
-
end
|
40
|
-
end
|
41
|
-
|
42
|
-
convert_to_numeric_and_date(ds, fields)
|
43
|
-
ds.update_valid_data
|
44
|
-
ds
|
14
|
+
raise NoMethodError, "Deprecated. Use Daru::DataFrame.from_csv instead."
|
45
15
|
end
|
46
16
|
|
47
17
|
# Save a Dataset on a csv file.
|
48
18
|
#
|
49
|
-
#
|
50
|
-
#
|
19
|
+
# == NOTE
|
20
|
+
#
|
21
|
+
# This method has been DEPRECATED in favor of Daru::DataFrame#write_csv.
|
22
|
+
# Please use that instead.
|
51
23
|
def write(dataset, filename, convert_comma = false, opts = {})
|
52
|
-
|
53
|
-
|
54
|
-
writer = ::CSV.open(filename, 'w', options)
|
55
|
-
writer << dataset.fields
|
56
|
-
|
57
|
-
dataset.each_array do |row|
|
58
|
-
row.collect! { |v| v.to_s.gsub('.', ',') } if convert_comma
|
59
|
-
writer << row
|
60
|
-
end
|
61
|
-
|
62
|
-
writer.close
|
24
|
+
raise NoMethodError, "Deprecated. Use Daru::DataFrame#write_csv instead."
|
63
25
|
end
|
64
26
|
end
|
65
27
|
end
|
@@ -4,26 +4,27 @@ module Statsample
|
|
4
4
|
# Export a SPSS Matrix with tetrachoric correlations .
|
5
5
|
#
|
6
6
|
# Use:
|
7
|
-
# ds=
|
7
|
+
# ds=Daru::DataFrame.from_excel("my_data.xls")
|
8
8
|
# puts Statsample::SPSS.tetrachoric_correlation_matrix(ds)
|
9
9
|
def tetrachoric_correlation_matrix(ds)
|
10
10
|
dsv=ds.dup_only_valid
|
11
11
|
# Delete all vectors that don't have variation
|
12
|
-
dsv.
|
12
|
+
dsv.vectors.each { |f|
|
13
13
|
if dsv[f].factors.size==1
|
14
14
|
dsv.delete_vector(f)
|
15
15
|
else
|
16
16
|
dsv[f]=dsv[f].dichotomize
|
17
17
|
end
|
18
18
|
}
|
19
|
+
|
19
20
|
tcm=Statsample::Bivariate.tetrachoric_correlation_matrix(dsv)
|
20
|
-
n=dsv.
|
21
|
+
n=dsv.vectors.to_a.collect {|f|
|
21
22
|
sprintf("%d",dsv[f].size)
|
22
23
|
}
|
23
|
-
meanlist=dsv.
|
24
|
+
meanlist=dsv.vectors.to_a.collect{|f|
|
24
25
|
sprintf("%0.3f", dsv[f].mean)
|
25
26
|
}
|
26
|
-
stddevlist=dsv.
|
27
|
+
stddevlist=dsv.vectors.to_a.collect{|f|
|
27
28
|
sprintf("%0.3f", dsv[f].sd)
|
28
29
|
}
|
29
30
|
out=<<-HEREDOC
|
@@ -1,63 +1,36 @@
|
|
1
1
|
require 'statsample/converter/spss'
|
2
2
|
module Statsample
|
3
|
-
|
3
|
+
# Creates and dumps Datasets on a database
|
4
|
+
#
|
5
|
+
# == NOTE
|
6
|
+
#
|
7
|
+
# Deprecated. Use Daru::DataFrame.from_sql and Daru::DataFrame#write_sql
|
4
8
|
module Database
|
5
9
|
class << self
|
6
10
|
# Reads a database query and returns a Dataset
|
7
11
|
#
|
8
|
-
#
|
9
|
-
#
|
10
|
-
#
|
11
|
-
# Statsample.read(dbh, "SELECT * FROM test")
|
12
|
-
#
|
12
|
+
# == NOTE
|
13
|
+
#
|
14
|
+
# Deprecated. Use Daru::DataFrame.from_sql instead.
|
13
15
|
def read(dbh,query)
|
14
|
-
|
15
|
-
sth=dbh.execute(query)
|
16
|
-
vectors={}
|
17
|
-
fields=[]
|
18
|
-
sth.column_info.each {|c|
|
19
|
-
vectors[c['name']]=Statsample::Vector.new([])
|
20
|
-
vectors[c['name']].name=c['name']
|
21
|
-
vectors[c['name']].type= (c['type_name']=='INTEGER' or c['type_name']=='DOUBLE') ? :numeric : :object
|
22
|
-
fields.push(c['name'])
|
23
|
-
}
|
24
|
-
ds=Statsample::Dataset.new(vectors,fields)
|
25
|
-
sth.fetch do |row|
|
26
|
-
ds.add_case(row.to_a, false )
|
27
|
-
end
|
28
|
-
ds.update_valid_data
|
29
|
-
ds
|
16
|
+
raise NoMethodError, "Deprecated. Use Daru::DataFrame.from_sql instead."
|
30
17
|
end
|
18
|
+
|
31
19
|
# Insert each case of the Dataset on the selected table
|
32
20
|
#
|
33
|
-
#
|
34
|
-
#
|
35
|
-
#
|
36
|
-
# dbh = DBI.connect("DBI:Mysql:database:localhost", "user", "password")
|
37
|
-
# Statsample::Database.insert(ds,dbh,"test")
|
38
|
-
#
|
21
|
+
# == NOTE
|
22
|
+
#
|
23
|
+
# Deprecated. Use Daru::DataFrame#write_sql instead
|
39
24
|
def insert(ds, dbh, table)
|
40
|
-
|
41
|
-
query="INSERT INTO #{table} ("+ds.fields.join(",")+") VALUES ("+((["?"]*ds.fields.size).join(","))+")"
|
42
|
-
sth=dbh.prepare(query)
|
43
|
-
ds.each_array{|c| sth.execute(*c) }
|
44
|
-
return true
|
25
|
+
raise NoMethodError, "Deprecated. Use Daru::DataFrame#write_sql instead."
|
45
26
|
end
|
46
27
|
# Create a sql, based on a given Dataset
|
47
28
|
#
|
48
|
-
#
|
49
|
-
#
|
50
|
-
#
|
51
|
-
# Statsample::Database.create_sql(ds,'names')
|
52
|
-
# ==>"CREATE TABLE names (id INTEGER,\n name VARCHAR (255)) CHARACTER SET=UTF8;"
|
53
|
-
#
|
29
|
+
# == NOTE
|
30
|
+
#
|
31
|
+
# Deprecated. Use Daru::DataFrame#create_sql instead.
|
54
32
|
def create_sql(ds,table,charset="UTF8")
|
55
|
-
|
56
|
-
fields=ds.fields.collect{|f|
|
57
|
-
v=ds[f]
|
58
|
-
f+" "+v.db_type
|
59
|
-
}
|
60
|
-
sql+fields.join(",\n ")+") CHARACTER SET=#{charset};"
|
33
|
+
raise NoMethodError, "Deprecated. Use Daru::DataFrame#create_sql instead."
|
61
34
|
end
|
62
35
|
end
|
63
36
|
end
|
@@ -65,182 +38,49 @@ module Statsample
|
|
65
38
|
class << self
|
66
39
|
def write(dataset,filename)
|
67
40
|
File.open(filename,"wb") do |fp|
|
68
|
-
fp.puts dataset.
|
69
|
-
dataset.
|
70
|
-
row2=row.
|
41
|
+
fp.puts dataset.vectors.to_a.join("\t")
|
42
|
+
dataset.each_row do |row|
|
43
|
+
row2 = row.map { |v| v.nil? ? "NA" : v.to_s.gsub(/\s+/,"_") }
|
71
44
|
fp.puts row2.join("\t")
|
72
45
|
end
|
73
46
|
end
|
74
47
|
end
|
75
48
|
end
|
76
49
|
end
|
77
|
-
class SpreadsheetBase
|
78
|
-
class << self
|
79
|
-
def extract_fields(row)
|
80
|
-
i=0;
|
81
|
-
fields=row.to_a.collect{|c|
|
82
|
-
if c.nil?
|
83
|
-
i+=1
|
84
|
-
"var%05d" % i
|
85
|
-
else
|
86
|
-
c.to_s.downcase
|
87
|
-
end
|
88
|
-
}
|
89
|
-
fields.recode_repeated
|
90
|
-
end
|
91
50
|
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
else
|
97
|
-
if c.is_a? String and c.is_number?
|
98
|
-
if c=~/^\d+$/
|
99
|
-
c.to_i
|
100
|
-
else
|
101
|
-
c.gsub(",",".").to_f
|
102
|
-
end
|
103
|
-
else
|
104
|
-
c
|
105
|
-
end
|
106
|
-
end
|
107
|
-
end
|
108
|
-
end
|
109
|
-
def convert_to_numeric_and_date(ds,fields)
|
110
|
-
fields.each do |f|
|
111
|
-
if ds[f].can_be_numeric?
|
112
|
-
ds[f].type=:numeric
|
113
|
-
elsif ds[f].can_be_date?
|
114
|
-
ds[f].type=:date
|
115
|
-
end
|
116
|
-
end
|
51
|
+
class PlainText
|
52
|
+
class << self
|
53
|
+
def read(filename, fields)
|
54
|
+
raise NoMethodError, "Deprecated. Use Daru::DataFrame.from_plaintext instead."
|
117
55
|
end
|
118
|
-
|
119
56
|
end
|
120
57
|
end
|
121
|
-
|
122
|
-
|
123
|
-
|
124
|
-
|
125
|
-
fp=File.open(filename,"r")
|
126
|
-
fp.each_line do |line|
|
127
|
-
row=process_row(line.strip.split(/\s+/),[""])
|
128
|
-
next if row==["\x1A"]
|
129
|
-
ds.add_case_array(row)
|
130
|
-
end
|
131
|
-
convert_to_numeric_and_date(ds,fields)
|
132
|
-
ds.update_valid_data
|
133
|
-
fields.each {|f|
|
134
|
-
ds[f].name=f
|
135
|
-
}
|
136
|
-
ds
|
137
|
-
end
|
138
|
-
end
|
139
|
-
end
|
140
|
-
class Excel < SpreadsheetBase
|
58
|
+
|
59
|
+
# This class has been DEPRECATED. Use Daru::DataFrame.from_excel and
|
60
|
+
# Daru::DataFrame#write_excel for XLS file operations.
|
61
|
+
class Excel
|
141
62
|
class << self
|
142
63
|
# Write a Excel spreadsheet based on a dataset
|
143
64
|
# * TODO: Format nicely date values
|
65
|
+
#
|
66
|
+
# == NOTE
|
67
|
+
#
|
68
|
+
# Deprecated. Use Daru::DataFrame#write_excel.
|
144
69
|
def write(dataset,filename)
|
145
|
-
|
146
|
-
book = Spreadsheet::Workbook.new
|
147
|
-
sheet = book.create_worksheet
|
148
|
-
format = Spreadsheet::Format.new :color => :blue,
|
149
|
-
:weight => :bold
|
150
|
-
sheet.row(0).concat(dataset.fields.map {|i| i.dup}) # Unfreeze strings
|
151
|
-
sheet.row(0).default_format = format
|
152
|
-
i=1
|
153
|
-
dataset.each_array{|row|
|
154
|
-
sheet.row(i).concat(row)
|
155
|
-
i+=1
|
156
|
-
}
|
157
|
-
book.write(filename)
|
158
|
-
end
|
159
|
-
# This should be fixed.
|
160
|
-
# If we have a Formula, should be resolver first
|
161
|
-
|
162
|
-
def preprocess_row(row, dates)
|
163
|
-
i=-1
|
164
|
-
row.collect!{|c|
|
165
|
-
i+=1
|
166
|
-
if c.is_a? Spreadsheet::Formula
|
167
|
-
if(c.value.is_a? Spreadsheet::Excel::Error)
|
168
|
-
nil
|
169
|
-
else
|
170
|
-
c.value
|
171
|
-
end
|
172
|
-
elsif dates.include? i and !c.nil? and c.is_a? Numeric
|
173
|
-
row.date(i)
|
174
|
-
else
|
175
|
-
c
|
176
|
-
end
|
177
|
-
}
|
70
|
+
raise NoMethodError, "Deprecated. Use Daru::DataFrame#write_excel instead."
|
178
71
|
end
|
179
|
-
private :process_row, :preprocess_row
|
180
72
|
|
181
73
|
# Returns a dataset based on a xls file
|
182
|
-
#
|
183
|
-
#
|
184
|
-
#
|
74
|
+
#
|
75
|
+
# == NOTE
|
76
|
+
#
|
77
|
+
# Deprecated. Use Daru::DataFrame.from_excel instead.
|
185
78
|
def read(filename, opts=Hash.new)
|
186
|
-
|
187
|
-
raise "options should be Hash" unless opts.is_a? Hash
|
188
|
-
opts_default={
|
189
|
-
:worksheet_id=>0,
|
190
|
-
:ignore_lines=>0,
|
191
|
-
:empty=>['']
|
192
|
-
}
|
193
|
-
|
194
|
-
opts=opts_default.merge opts
|
195
|
-
|
196
|
-
worksheet_id=opts[:worksheet_id]
|
197
|
-
ignore_lines=opts[:ignore_lines]
|
198
|
-
empty=opts[:empty]
|
199
|
-
|
200
|
-
first_row=true
|
201
|
-
fields=[]
|
202
|
-
ds=nil
|
203
|
-
line_number=0
|
204
|
-
book = Spreadsheet.open filename
|
205
|
-
sheet= book.worksheet worksheet_id
|
206
|
-
sheet.each do |row|
|
207
|
-
begin
|
208
|
-
dates=[]
|
209
|
-
row.formats.each_index{|i|
|
210
|
-
if !row.formats[i].nil? and row.formats[i].number_format=="DD/MM/YYYY"
|
211
|
-
dates.push(i)
|
212
|
-
end
|
213
|
-
}
|
214
|
-
line_number+=1
|
215
|
-
next if(line_number<=ignore_lines)
|
216
|
-
|
217
|
-
preprocess_row(row,dates)
|
218
|
-
if first_row
|
219
|
-
fields=extract_fields(row)
|
220
|
-
ds=Statsample::Dataset.new(fields)
|
221
|
-
first_row=false
|
222
|
-
else
|
223
|
-
rowa=process_row(row,empty)
|
224
|
-
(fields.size - rowa.size).times {
|
225
|
-
rowa << nil
|
226
|
-
}
|
227
|
-
ds.add_case(rowa,false)
|
228
|
-
end
|
229
|
-
rescue => e
|
230
|
-
error="#{e.to_s}\nError on Line # #{line_number}:#{row.join(",")}"
|
231
|
-
raise
|
232
|
-
end
|
233
|
-
end
|
234
|
-
convert_to_numeric_and_date(ds, fields)
|
235
|
-
ds.update_valid_data
|
236
|
-
fields.each {|f|
|
237
|
-
ds[f].name=f
|
238
|
-
}
|
239
|
-
ds.name=filename
|
240
|
-
ds
|
79
|
+
raise NoMethodError, "Deprecated. Use Daru::DataFrame.from_excel instead."
|
241
80
|
end
|
242
81
|
end
|
243
82
|
end
|
83
|
+
|
244
84
|
module Mx
|
245
85
|
class << self
|
246
86
|
def write(dataset,filename,type=:covariance)
|
@@ -249,12 +89,12 @@ module Statsample
|
|
249
89
|
fp.puts "! #{filename}"
|
250
90
|
fp.puts "! Output generated by Statsample"
|
251
91
|
fp.puts "Data Ninput=#{dataset.fields.size} Nobservations=#{dataset.cases}"
|
252
|
-
fp.puts "Labels "+dataset.
|
92
|
+
fp.puts "Labels " + dataset.vectors.to_a.join(" ")
|
253
93
|
case type
|
254
94
|
when :raw
|
255
95
|
fp.puts "Rectangular"
|
256
96
|
dataset.each do |row|
|
257
|
-
out=dataset.
|
97
|
+
out=dataset.vectors.to_a.collect do |f|
|
258
98
|
if dataset[f].is_valid? row[f]
|
259
99
|
row[f]
|
260
100
|
else
|
@@ -292,18 +132,18 @@ module Statsample
|
|
292
132
|
carrier=OpenStruct.new
|
293
133
|
carrier.categorials=[]
|
294
134
|
carrier.conversions={}
|
295
|
-
variables_def=dataset.
|
135
|
+
variables_def=dataset.vectors.to_a.collect{|k|
|
296
136
|
variable_definition(carrier,dataset[k],k)
|
297
137
|
}.join("\n")
|
298
138
|
|
299
139
|
indexes=carrier.categorials.inject({}) {|s,c|
|
300
|
-
s[dataset.
|
140
|
+
s[dataset.vectors.to_a.index(c)]=c
|
301
141
|
s
|
302
142
|
}
|
303
143
|
records=""
|
304
|
-
dataset.
|
305
|
-
indexes.each{|ik,iv|
|
306
|
-
c[ik]=carrier.conversions[iv][c[ik]]
|
144
|
+
dataset.each_row {|c|
|
145
|
+
indexes.each { |ik,iv|
|
146
|
+
c[ik] = carrier.conversions[iv][c[ik]]
|
307
147
|
}
|
308
148
|
records << "<record>#{values_definition(c, default_opt[:missing])}</record>\n"
|
309
149
|
}
|
@@ -345,7 +185,7 @@ out
|
|
345
185
|
# nickname = nickname
|
346
186
|
def variable_definition(carrier,v,name,nickname=nil)
|
347
187
|
nickname = (nickname.nil? ? "" : "nickname=\"#{nickname}\"" )
|
348
|
-
if v.type==:object or v.
|
188
|
+
if v.type==:object or v.to_a.find {|d| d.is_a? String }
|
349
189
|
carrier.categorials.push(name)
|
350
190
|
carrier.conversions[name]={}
|
351
191
|
factors=v.factors
|
@@ -353,17 +193,16 @@ out
|
|
353
193
|
out << "<levels count=\"#{factors.size}\">\n"
|
354
194
|
out << (1..factors.size).to_a.collect{|i|
|
355
195
|
carrier.conversions[name][factors[i-1]]=i
|
356
|
-
"<level value=\"#{i}\">#{v.
|
196
|
+
"<level value=\"#{i}\">#{(v.labels[factors[i-1]] || factors[i-1])}</level>"
|
357
197
|
}.join("\n")
|
358
198
|
out << "</levels>\n</categoricalvariable>\n"
|
359
199
|
out
|
360
|
-
elsif v.
|
200
|
+
elsif v.to_a.find {|d| d.is_a? Float}
|
361
201
|
"<realvariable name=\"#{name}\" #{nickname} />"
|
362
202
|
else
|
363
203
|
"<integervariable name=\"#{name}\" #{nickname} />"
|
364
204
|
end
|
365
205
|
end
|
366
|
-
|
367
206
|
end
|
368
207
|
end
|
369
208
|
end
|