statsample 0.5.0 → 0.5.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/History.txt +11 -0
- data/Manifest.txt +7 -0
- data/README.txt +3 -3
- data/data/repeated_fields.csv +7 -0
- data/data/tetmat_matrix.txt +5 -0
- data/data/tetmat_test.txt +1001 -0
- data/demo/spss_matrix.rb +3 -0
- data/lib/spss.rb +1 -1
- data/lib/statistics2.rb +1 -1
- data/lib/statsample.rb +30 -1
- data/lib/statsample/anova.rb +62 -66
- data/lib/statsample/bivariate.rb +273 -281
- data/lib/statsample/bivariate/tetrachoric.rb +418 -0
- data/lib/statsample/codification.rb +15 -15
- data/lib/statsample/combination.rb +108 -106
- data/lib/statsample/converter/csv18.rb +52 -52
- data/lib/statsample/converter/csv19.rb +45 -48
- data/lib/statsample/converter/spss.rb +47 -0
- data/lib/statsample/converters.rb +74 -77
- data/lib/statsample/crosstab.rb +21 -17
- data/lib/statsample/dataset.rb +595 -543
- data/lib/statsample/dominanceanalysis.rb +7 -10
- data/lib/statsample/htmlreport.rb +23 -0
- data/lib/statsample/regression/multiple/baseengine.rb +59 -59
- data/lib/statsample/regression/multiple/gslengine.rb +1 -1
- data/lib/statsample/reliability.rb +165 -145
- data/lib/statsample/vector.rb +16 -2
- data/test/test_anova.rb +16 -16
- data/test/test_bivariate.rb +146 -0
- data/test/test_csv.rb +6 -0
- data/test/test_dataset.rb +49 -5
- data/test/test_statistics.rb +6 -90
- data/test/test_vector.rb +27 -10
- metadata +10 -4
- data/test/test_r.rb +0 -9
- data/test/test_stata.rb +0 -11
data/lib/statsample/converters.rb
CHANGED
@@ -1,90 +1,82 @@
+require 'statsample/converter/spss'
 module Statsample
   # Create and dumps Datasets on a database
-  module Database
-    class << self
-      # Read a database query and returns a Dataset
-      #
-      # USE:
-      #
-      # dbh = DBI.connect("DBI:Mysql:database:localhost", "user", "password")
-      # Statsample.read(dbh, "SELECT * FROM test")
-      #
-      def read(dbh,query)
-        require 'dbi'
-        sth=dbh.execute(query)
-        vectors={}
-        fields=[]
-        sth.column_info.each {|c|
-          vectors[c['name']]=Statsample::Vector.new([])
-          vectors[c['name']].type= (c['type_name']=='INTEGER' or c['type_name']=='DOUBLE') ? :scale : :nominal
-          fields.push(c['name'])
-        }
-        ds=Statsample::Dataset.new(vectors,fields)
-        sth.fetch do |row|
-          ds.add_case(row.to_a, false )
-        end
-        ds.update_valid_data
-        ds
-      end
-      # Insert each case of the Dataset on the selected table
-      #
-      # USE:
-      #
-      # ds={'id'=>[1,2,3].to_vector, 'name'=>["a","b","c"].to_vector}.to_dataset
-      # dbh = DBI.connect("DBI:Mysql:database:localhost", "user", "password")
-      # Statsample::Database.insert(ds,dbh,"test")
-      #
-      def insert(ds, dbh,table)
-        require 'dbi'
-        query="INSERT INTO #{table} ("+ds.fields.join(",")+") VALUES ("+((["?"]*ds.fields.size).join(","))+")"
-        sth=dbh.prepare(query)
-        ds.each_array{|c|
-          sth.execute(*c)
-        }
-      end
-      # Create a sql, basen on a given Dataset
-      #
-      # USE:
-      #
-      # ds={'id'=>[1,2,3,4,5].to_vector,'name'=>%w{Alex Peter Susan Mary John}.to_vector}.to_dataset
-      # Statsample::Database.create_sql(ds,'names')
-      # ==>"CREATE TABLE names (id INTEGER,\n name VARCHAR (255)) CHARACTER SET=UTF8;"
-      #
-      def create_sql(ds,table,charset="UTF8")
-        sql="CREATE TABLE #{table} ("
-        fields=ds.fields.collect{|f|
-          v=ds[f]
-          f+" "+v.db_type
-        }
-        sql+fields.join(",\n ")+") CHARACTER SET=#{charset};"
+  module Database
+    class << self
+      # Read a database query and returns a Dataset
+      #
+      # USE:
+      #
+      # dbh = DBI.connect("DBI:Mysql:database:localhost", "user", "password")
+      # Statsample.read(dbh, "SELECT * FROM test")
+      #
+      def read(dbh,query)
+        require 'dbi'
+        sth=dbh.execute(query)
+        vectors={}
+        fields=[]
+        sth.column_info.each {|c|
+          vectors[c['name']]=Statsample::Vector.new([])
+          vectors[c['name']].type= (c['type_name']=='INTEGER' or c['type_name']=='DOUBLE') ? :scale : :nominal
+          fields.push(c['name'])
+        }
+        ds=Statsample::Dataset.new(vectors,fields)
+        sth.fetch do |row|
+          ds.add_case(row.to_a, false )
         end
-
+        ds.update_valid_data
+        ds
+      end
+      # Insert each case of the Dataset on the selected table
+      #
+      # USE:
+      #
+      # ds={'id'=>[1,2,3].to_vector, 'name'=>["a","b","c"].to_vector}.to_dataset
+      # dbh = DBI.connect("DBI:Mysql:database:localhost", "user", "password")
+      # Statsample::Database.insert(ds,dbh,"test")
+      #
+      def insert(ds, dbh,table)
+        require 'dbi'
+        query="INSERT INTO #{table} ("+ds.fields.join(",")+") VALUES ("+((["?"]*ds.fields.size).join(","))+")"
+        sth=dbh.prepare(query)
+        ds.each_array{|c| sth.execute(*c) }
+      end
+      # Create a sql, basen on a given Dataset
+      #
+      # USE:
+      #
+      # ds={'id'=>[1,2,3,4,5].to_vector,'name'=>%w{Alex Peter Susan Mary John}.to_vector}.to_dataset
+      # Statsample::Database.create_sql(ds,'names')
+      # ==>"CREATE TABLE names (id INTEGER,\n name VARCHAR (255)) CHARACTER SET=UTF8;"
+      #
+      def create_sql(ds,table,charset="UTF8")
+        sql="CREATE TABLE #{table} ("
+        fields=ds.fields.collect{|f|
+          v=ds[f]
+          f+" "+v.db_type
+        }
+        sql+fields.join(",\n ")+") CHARACTER SET=#{charset};"
+      end
     end
-
-
-
-
-
-
-
-
-
-
-
-    }
-    end
-  end
+  end
+  module Mondrian
+    class << self
+      def write(dataset,filename)
+        File.open(filename,"wb") do |fp|
+          fp.puts dataset.fields.join("\t")
+          dataset.each_array_with_nils do |row|
+            row2=row.collect{|v| v.nil? ? "NA" : v.to_s.gsub(/\s+/,"_") }
+            fp.puts row2.join("\t")
+          end
         end
+      end
     end
+  end
   class SpreadsheetBase
     class << self
       def extract_fields(row)
         fields=row.to_a.collect{|c| c.downcase}
-
-        repeated=fields.inject({}) {|a,v|
-          (a[v].nil? ? a[v]=1 : a[v]+=1); a }.find_all{|k,v| v>1}.collect{|k,v|k}.join(",")
-        raise "There are some repeated fields on the header:#{repeated}. Please, fix"
-        end
-        fields
+        fields.recode_repeated
       end
 
       def process_row(row,empty)
@@ -121,6 +113,7 @@ module Statsample
       fp=File.open(filename,"r")
       fp.each_line do |line|
         row=process_row(line.strip.split(/\s+/),[""])
+        next if row==["\x1A"]
        ds.add_case_array(row)
       end
       convert_to_scale(ds,fields)
@@ -178,7 +171,11 @@ module Statsample
       row.collect!{|c|
         i+=1
         if c.is_a? Spreadsheet::Formula
+          if(c.value.is_a? Spreadsheet::Excel::Error)
+            nil
+          else
           c.value
+          end
         elsif dates.include? i and !c.nil? and c.is_a? Numeric
           row.date(i)
         else
data/lib/statsample/crosstab.rb
CHANGED
@@ -26,22 +26,22 @@ module Statsample
       @v_cols.frequencies
     end
     def frequencies
-
-
-
-
-
-
+      base=rows_names.inject([]){|s,row|
+        s+=cols_names.collect{|col| [row,col]}
+      }.inject({}) {|s,par|
+        s[par]=0
+        s
+      }
       base.update(Statsample::vector_cols_matrix(@v_rows,@v_cols).to_a.to_vector.frequencies)
     end
-
-
-
-
-
-
-
-
+    def to_matrix
+      f=frequencies
+      rn=rows_names
+      cn=cols_names
+      Matrix.rows(rn.collect{|row|
+        cn.collect{|col| f[[row,col]]}
+      })
+    end
     def frequencies_by_row
       f=frequencies
       rows_names.inject({}){|sr,row|
@@ -81,14 +81,18 @@ module Statsample
       }
       Matrix.rows(m)
     end
-    def
+    def cols_empty_hash
+      cols_names.inject({}) {|a,x| a[x]=0;a}
+    end
+
+    def summary(report_type = ConsoleSummary)
       out=""
       out.extend report_type
       fq=frequencies
       rn=rows_names
       cn=cols_names
       total=0
-      total_cols=
+      total_cols=cols_empty_hash
       out.add "Chi Square: #{chi_square}\n"
       out.add(_("Rows: %s\n") % @row_label) unless @row_label.nil?
       out.add(_("Columns: %s\n") % @column_label) unless @column_label.nil?
@@ -122,7 +126,7 @@ module Statsample
       rn=rows_names
       cn=cols_names
       total=0
-      total_cols=
+      total_cols=cols_empty_hash
       max_row_size = rn.inject(0) {|s,x| sl=@v_rows.labeling(x).size; sl>s ? sl : s}
 
       max_row_size=max_row_size<6 ? 6 : max_row_size
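Crosstab#to_matrix, added above, returns the joint frequency table as a Matrix. A small sketch (the row and column order follows rows_names and cols_names, so the exact layout shown is an assumption):

    require 'statsample'

    rows = %w{a a b b}.to_vector(:nominal)
    cols = %w{x y x y}.to_vector(:nominal)
    ct   = Statsample::Crosstab.new(rows, cols)
    # One row per row category, one column per column category.
    ct.to_matrix  # => Matrix[[1, 1], [1, 1]]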
data/lib/statsample/dataset.rb
CHANGED
@@ -1,225 +1,287 @@
 require 'statsample/vector'
 
 class Hash
-
-
-
+  def to_dataset(*args)
+    Statsample::Dataset.new(self,*args)
+  end
 end
 
 class Array
-
-
-
+  def prefix(s)
+    self.collect{|c| s+c.to_s }
+  end
+  def suffix(s)
+    self.collect{|c| c.to_s+s }
+  end
+end
+
+module Statsample
+  class DatasetException < RuntimeError
+    attr_reader :ds,:exp
+    def initialize(ds,e)
+      @ds=ds
+      @exp=e
+    end
+    def to_s
+      m="Error: "+@exp.message+"\n"+@exp.backtrace.join("\n")+"\nOn Dataset:"+@ds.inspect
+      m+="\nRow: #{@i}" unless @i.nil?
+      m
+    end
+  end
+  class Dataset
+    include Writable
+    attr_reader :vectors, :fields, :cases, :i
+    attr_accessor :labels
+
+    # Generates a new dataset, using three vectors
+    # - Rows
+    # - Columns
+    # - Values
+    # For example, you have these values
+    #
+    #   x y v
+    #   a a 0
+    #   a b 1
+    #   b a 1
+    #   b b 0
+    #
+    # You obtain
+    #   id a b
+    #   a  0 1
+    #   b  1 0
+    #
+    # Useful to process outputs from databases
+    #
+    def self.crosstab_by_asignation(rows,columns,values)
+      raise "Three vectors should be equal size" if rows.size!=columns.size or rows.size!=values.size
+      cols_values=columns.factors
+      cols_n=cols_values.size
+      h_rows=rows.factors.inject({}){|a,v| a[v]=cols_values.inject({}){
+        |a1,v1| a1[v1]=nil; a1
+      }
+      ;a}
+      values.each_index{|i|
+        h_rows[rows[i]][columns[i]]=values[i]
+      }
+      ds=Dataset.new(["_id"]+cols_values)
+      cols_values.each{|c|
+        ds[c].type=values.type
+      }
+      rows.factors.each {|row|
+        n_row=Array.new(cols_n+1)
+        n_row[0]=row
+        cols_values.each_index {|i|
+          n_row[i+1]=h_rows[row][cols_values[i]]
         }
+        ds.add_case_array(n_row)
+      }
+      ds.update_valid_data
+      ds
+    end
+    # Creates a new dataset. A dataset is a set of ordered named vectors
+    # of the same size.
+    #
+    # [vectors] With an array, creates a set of empty vectors named as
+    # values on the array. With a hash, each Vector is assigned as
+    # a variable of the Dataset named as its key
+    # [fields] Array of names for vectors. Is only used for set the
+    # order of variables. If empty, vectors keys on alfabethic order as
+    # used as fields
+    # [labels] Hash to set names for fields.
+    #
+    #
+    # Dataset.new()
+    # Dataset.new(%w{v1 v2 v3})
+    # Dataset.new({'v1'=>%w{1 2 3}.to_vector, 'v2'=>%w{4 5 6}.to_vector})
+    # Dataset.new({'v2'=>v2,'v1'=>v1},['v1','v2'])
+    #
+    # The fast way to create a dataset uses Hash#to_dataset, with
+    # fields and labels as arguments
+    # ds = {'v1'=>[1,2,3].to_vector}.to_dataset
+    #
+    def initialize(vectors={}, fields=[], labels={})
+      if vectors.instance_of? Array
+        @fields=vectors.dup
+        @vectors=vectors.inject({}){|a,x| a[x]=Statsample::Vector.new(); a}
+      else
+        # Check vectors
+        @vectors=vectors
+        @fields=fields
+        check_order
+        check_length
+      end
+      @i=nil
+      @labels=labels
+    end
+    def to_gsl_matrix
+      matrix=GSL::Matrix.alloc(cases,@vectors.size)
+      each_array do |row|
+        row.each_index{|y| matrix.set(@i,y,row[y]) }
+      end
+      matrix
+    end
+    def vector_label(v_id)
+      raise "Vector #{v} doesn't exists" unless @fields.include? v_id
+      @labels[v_id].nil? ? v_id : @labels[v_id]
     end
-
-
-
+    # Creates a copy of the given dataset, deleting all the cases with
+    # missing data on one of the vectors
+    def dup_only_valid
+      if @vectors.find{|field,vector| vector.has_missing_data?}
+        ds=dup_empty
+        each_array { |c|
+          ds.add_case_array(c) unless @fields.find{|f| @vectors[f].data_with_nils[@i].nil? }
         }
+        ds.update_valid_data
+      else
+        ds=dup()
+      end
+      ds
     end
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    # Returns an array with the fields from first argumen to last argument
+    def from_to(from,to)
+      raise ArgumentError, "Field #{from} should be on dataset" if !@fields.include? from
+      raise ArgumentError, "Field #{to} should be on dataset" if !@fields.include? to
+      @fields.slice(@fields.index(from)..@fields.index(to))
+    end
+    # Returns a duplicate of the Database
+    # If fields given, only include those vectors
+    def dup(*fields_to_include)
+      if fields_to_include.size==1 and fields_to_include[0].is_a? Array
+        fields_to_include=fields_to_include[0]
+      end
+      fields_to_include=@fields if fields_to_include.size==0
+      vectors={}
+      fields=[]
+      new_labels={}
+      fields_to_include.each{|f|
+        raise "Vector #{f} doesn't exists" unless @vectors.has_key? f
+        vectors[f]=@vectors[f].dup
+        new_labels[f]=@labels[f]
+        fields.push(f)
+      }
+      Dataset.new(vectors,fields,new_labels)
+    end
+    # Creates a copy of the given dataset, without data on vectors
+    def dup_empty
+      vectors=@vectors.inject({}) {|a,v|
+        a[v[0]]=v[1].dup_empty
+        a
+      }
+      Dataset.new(vectors,@fields.dup,@labels.dup)
+    end
+    # Merge vectors from two datasets
+    # In case of name collition, the vectors names are changed to
+    # x_1, x_2 ....
+    def merge(other_ds)
+      raise "Cases should be equal (this:#{@cases}; other:#{other_ds.cases}" unless @cases==other_ds.cases
+      types = @fields.collect{|f| @vectors[f].type} + other_ds.fields.collect{|f| other_ds[f].type}
+      new_fields = (@fields+other_ds.fields).recode_repeated
+      ds_new=Statsample::Dataset.new(new_fields)
+      new_fields.each_index{|i|
+        field=new_fields[i]
+        ds_new[field].type=types[i]
+      }
+      @cases.times {|i|
+        row=case_as_array(i)+other_ds.case_as_array(i)
+        ds_new.add_case_array(row)
+      }
+      ds_new.update_valid_data
+      ds_new
+    end
+    # Returns a dataset with standarized data
+    def standarize
+      ds=dup()
+      ds.fields.each {|f|
+        ds[f]=ds[f].vector_standarized
+      }
+      ds
+    end
+    # Generate a matrix, based on fields of dataset
+    def collect_matrix
+      rows=@fields.collect{|row|
+        @fields.collect{|col|
+          yield row,col
+        }
+      }
+      Matrix.rows(rows)
+    end
+    # We have the same datasets if the labels and vectors are the same
+    def ==(d2)
+      @vectors==d2.vectors and @fields==d2.fields
+    end
+    def col(c)
+      @vectors[c]
+    end
+    alias_method :vector, :col
+    def add_vector(name,vector)
+      raise ArgumentError, "Vector have different size" if vector.size!=@cases
+      @vectors[name]=vector
+      check_order
+    end
+    def has_vector? (v)
+      return @vectors.has_key?(v)
+    end
+    # Creates a dataset with the random data, of a n size
+    # If n not given, uses original number of cases
+    def bootstrap(n=nil)
+      n||=@cases
+      ds_boot=dup_empty
+      for i in 1..n
+        ds_boot.add_case_array(case_as_array(rand(n)))
+      end
+      ds_boot.update_valid_data
+      ds_boot
+    end
+    # Fast version of add case
+    # Can only add one case and no error check if performed
+    # You SHOULD use update_valid_data at the end of insertion cycle
+    def add_case_array(v)
+      v.each_index {|i| d=@vectors[@fields[i]].data; d.push(v[i])}
+    end
+    def add_case(v,uvd=true)
+      case v
+      when Array
+        if (v[0].is_a? Array)
+          v.each{|subv| add_case(subv,false)}
+        else
+          raise ArgumentError, "Input array size (#{v.size}) should be equal to fields number (#{@fields.size})" if @fields.size!=v.size
+          v.each_index {|i| @vectors[@fields[i]].add(v[i],false)}
+        end
+      when Hash
+        raise ArgumentError, "Hash keys should be equal to fields #{(v.keys - @fields).join(",")}" if @fields.sort!=v.keys.sort
+        @fields.each{|f| @vectors[f].add(v[f],false)}
+      else
+        raise TypeError, 'Value must be a Array or a Hash'
+      end
+      if uvd
+        update_valid_data
+      end
+    end
+    def update_valid_data
+      @fields.each{|f| @vectors[f].set_valid_data}
+      check_length
+    end
+    def delete_vector(name)
+      @fields.delete(name)
+      @vectors.delete(name)
+    end
+    def add_vectors_by_split_recode(name,join='-',sep=Statsample::SPLIT_TOKEN)
+      split=@vectors[name].split_by_separator(sep)
+      i=1
+      split.each{|k,v|
+        new_field=name+join+i.to_s
+        @labels[new_field]=name+":"+k
+        add_vector(new_field,v)
+        i+=1
+      }
+    end
+    def add_vectors_by_split(name,join='-',sep=Statsample::SPLIT_TOKEN)
+      split=@vectors[name].split_by_separator(sep)
+      split.each{|k,v|
+        add_vector(name+join+k,v)
+      }
     end
-  class Dataset
-    include Writable
-    attr_reader :vectors, :fields, :cases, :i
-    attr_accessor :labels
-    # Creates a new dataset. A dataset is a set of ordered named vectors
-    # of the same size.
-    #
-    # [vectors] With an array, creates a set of empty vectors named as
-    # values on the array. With a hash, each Vector is assigned as
-    # a variable of the Dataset named as its key
-    # [fields] Array of names for vectors. Is only used for set the
-    # order of variables. If empty, vectors keys on alfabethic order as
-    # used as fields
-    # [labels] Hash to set names for fields.
-    #
-    #
-    # Dataset.new()
-    # Dataset.new(%w{v1 v2 v3})
-    # Dataset.new({'v1'=>%w{1 2 3}.to_vector, 'v2'=>%w{4 5 6}.to_vector})
-    # Dataset.new({'v2'=>v2,'v1'=>v1},['v1','v2'])
-    #
-    # The fast way to create a dataset uses Hash#to_dataset, with
-    # fields and labels as arguments
-    # ds = {'v1'=>[1,2,3].to_vector}.to_dataset
-    #
-    def initialize(vectors={}, fields=[], labels={})
-      if vectors.instance_of? Array
-        @fields=vectors.dup
-        @vectors=vectors.inject({}){|a,x| a[x]=Statsample::Vector.new(); a}
-      else
-        @vectors=vectors
-        @fields=fields
-        check_order
-        check_length
-      end
-      @i=nil
-      @labels=labels
-    end
-    def to_gsl_matrix
-      matrix=GSL::Matrix.alloc(cases,@vectors.size)
-      each_array do |row|
-        row.each_index{|y| matrix.set(@i,y,row[y]) }
-      end
-      matrix
-    end
-    def vector_label(v_id)
-      raise "Vector #{v} doesn't exists" unless @fields.include? v_id
-      @labels[v_id].nil? ? v_id : @labels[v_id]
-    end
-    # Creates a copy of the given dataset, deleting all the cases with
-    # missing data on one of the vectors
-    def dup_only_valid
-      if @vectors.find{|field,vector| vector.has_missing_data?}
-        ds=dup_empty
-        each_array { |c|
-          ds.add_case_array(c) unless @fields.find{|f| @vectors[f].data_with_nils[@i].nil? }
-        }
-        ds.update_valid_data
-      else
-        ds=dup()
-      end
-      ds
-    end
-    # Returns an array with the fields from first argumen to last argument
-    def from_to(from,to)
-      raise ArgumentError, "Field #{from} should be on dataset" if !@fields.include? from
-      raise ArgumentError, "Field #{to} should be on dataset" if !@fields.include? to
-      @fields.slice(@fields.index(from)..@fields.index(to))
-    end
-    # Returns a duplicate of the Database
-    # If fields given, only include those vectors
-    def dup(*fields_to_include)
-      if fields_to_include.size==1 and fields_to_include[0].is_a? Array
-        fields_to_include=fields_to_include[0]
-      end
-      fields_to_include=@fields if fields_to_include.size==0
-      vectors={}
-      fields=[]
-      labels={}
-      fields_to_include.each{|f|
-        raise "Vector #{f} doesn't exists" unless @vectors.has_key? f
-        vectors[f]=@vectors[f].dup
-        labels[f]=@labels[f]
-        fields.push(f)
-      }
-      Dataset.new(vectors,fields,labels)
-    end
-    # Creates a copy of the given dataset, without data on vectors
-    def dup_empty
-      vectors=@vectors.inject({}) {|a,v|
-        a[v[0]]=v[1].dup_empty
-        a
-      }
-      Dataset.new(vectors,@fields.dup,@labels.dup)
-    end
-    # Returns a dataset with standarized data
-    def standarize
-      ds=dup()
-      ds.fields.each {|f|
-        ds[f]=ds[f].vector_standarized
-      }
-      ds
-    end
-    # Generate a matrix, based on fields of dataset
-    def collect_matrix
-      rows=@fields.collect{|row|
-        @fields.collect{|col|
-          yield row,col
-        }
-      }
-      Matrix.rows(rows)
-    end
-    # We have the same datasets if the labels and vectors are the same
-    def ==(d2)
-      @vectors==d2.vectors and @fields==d2.fields
-    end
-    def col(c)
-      @vectors[c]
-    end
-    alias_method :vector, :col
-    def add_vector(name,vector)
-      raise ArgumentError, "Vector have different size" if vector.size!=@cases
-      @vectors[name]=vector
-      check_order
-    end
-    def has_vector? (v)
-      return @vectors.has_key?(v)
-    end
-    # Creates a dataset with the random data, of a n size
-    # If n not given, uses original number of cases
-    def bootstrap(n=nil)
-      n||=@cases
-      ds_boot=dup_empty
-      for i in 1..n
-        ds_boot.add_case_array(case_as_array(rand(n)))
-      end
-      ds_boot.update_valid_data
-      ds_boot
-    end
-    # Fast version of add case
-    # Can only add one case and no error check if performed
-    # You SHOULD use update_valid_data at the end of insertion cycle
-    def add_case_array(v)
-      v.each_index {|i| d=@vectors[@fields[i]].data; d.push(v[i])}
-    end
-    def add_case(v,uvd=true)
-      case v
-      when Array
-        if (v[0].is_a? Array)
-          v.each{|subv| add_case(subv,false)}
-        else
-          raise ArgumentError, "Input array size (#{v.size}) should be equal to fields number (#{@fields.size})" if @fields.size!=v.size
-          v.each_index {|i| @vectors[@fields[i]].add(v[i],false)}
-        end
-      when Hash
-        raise ArgumentError, "Hash keys should be equal to fields" if @fields.sort!=v.keys.sort
-        @fields.each{|f| @vectors[f].add(v[f],false)}
-      else
-        raise TypeError, 'Value must be a Array or a Hash'
-      end
-      if uvd
-        update_valid_data
-      end
-    end
-    def update_valid_data
-      @fields.each{|f| @vectors[f].set_valid_data}
-      check_length
-    end
-    def delete_vector(name)
-      @fields.delete(name)
-      @vectors.delete(name)
-    end
-    def add_vectors_by_split_recode(name,join='-',sep=Statsample::SPLIT_TOKEN)
-      split=@vectors[name].split_by_separator(sep)
-      i=1
-      split.each{|k,v|
-        new_field=name+join+i.to_s
-        @labels[new_field]=name+":"+k
-        add_vector(new_field,v)
-        i+=1
-      }
-    end
-    def add_vectors_by_split(name,join='-',sep=Statsample::SPLIT_TOKEN)
-      split=@vectors[name].split_by_separator(sep)
-      split.each{|k,v|
-        add_vector(name+join+k,v)
-      }
-    end
     def vector_by_calculation(type=:scale)
       a=[]
       each {|row|
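The new Dataset.crosstab_by_asignation pivots three parallel vectors (rows, columns, values) into a rows-by-columns dataset, as its comment block in the hunk above describes. A sketch using the x/y/v values from that comment (the `_id` column name comes from the implementation; the outputs shown are what the documented example implies):

    require 'statsample'

    x = %w{a a b b}.to_vector(:nominal)
    y = %w{a b a b}.to_vector(:nominal)
    v = [0, 1, 1, 0].to_vector(:scale)

    ds = Statsample::Dataset.crosstab_by_asignation(x, y, v)
    ds.fields     # => ["_id", "a", "b"]
    ds['a'].to_a  # => [0, 1]
    ds['b'].to_a  # => [1, 0]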
@@ -238,214 +300,215 @@ module Statsample
         else
           fields.inject(0) {|ac,v| ac + row[v].to_f}
         end
-
+      end
     end
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-          a.push(sum.quo(size-invalids))
-        end
-      end
-      a.to_vector(:scale)
-    end
-    def check_length
-      size=nil
-      @vectors.each do |k,v|
-        raise Exception, "Data #{v.class} is not a vector on key #{k}" if !v.is_a? Statsample::Vector
-        if size.nil?
-          size=v.size
-        else
-          if v.size!=size
-            p v.to_a.size
-            raise Exception, "Vector #{k} have size #{v.size} and dataset have size #{size}"
-          end
-        end
-      end
-      @cases=size
-    end
-    def each_vector
-      @fields.each{|k| yield k,@vectors[k]}
-    end
-    if Statsample::STATSAMPLE__.respond_to?(:case_as_hash)
-      def case_as_hash(c) # :nodoc:
-        Statsample::STATSAMPLE__.case_as_hash(self,c)
-      end
-    else
-      def case_as_hash(c)
-        _case_as_hash(c)
-      end
-    end
-
-    if Statsample::STATSAMPLE__.respond_to?(:case_as_array)
-      def case_as_array(c) # :nodoc:
-        Statsample::STATSAMPLE__.case_as_array(self,c)
-      end
+    def check_fields(fields)
+      fields||=@fields
+      raise "Fields #{(fields-@fields).join(", ")} doesn't exists on dataset" if (fields-@fields).size>0
+      fields
+    end
+    # Returns a vector with the numbers of missing values for a case
+
+    def vector_missing_values(fields=nil)
+      fields=check_fields(fields)
+      collect_with_index do |i,row|
+        fields.inject(0) {|a,v|
+          a+ ((@vectors[v].data_with_nils[i].nil?) ? 1: 0)
+        }
+      end
+    end
+    def vector_count_characters(fields=nil)
+      fields=check_fields(fields)
+      collect_with_index do |i,row|
+        fields.inject(0){|a,v|
+
+          a+((@vectors[v].data_with_nils[i].nil?) ? 0: row[v].to_s.size)
+        }
+      end
+    end
+    # Returns a vector with the mean for a set of fields
+    # if fields parameter is empty, return the mean for all fields
+    # if max invalid parameter > 0, returns the mean for all tuples
+    # with 0 to max_invalid invalid fields
+    def vector_mean(fields=nil,max_invalid=0)
+      a=[]
+      fields=check_fields(fields)
+      size=fields.size
+      each_with_index do |i, row|
+        # numero de invalidos
+        sum=0
+        invalids=0
+        fields.each{|f|
+          if !@vectors[f].data_with_nils[i].nil?
+            sum+=row[f].to_f
+          else
+            invalids+=1
+          end
+        }
+        if(invalids>max_invalid)
+          a.push(nil)
         else
-
-        _case_as_array(c)
-      end
-    end
-    def _case_as_hash(c) # :nodoc:
-      @fields.inject({}) {|a,x| a[x]=@vectors[x][c];a }
-    end
-    def _case_as_array(c) # :nodoc:
-      @fields.collect {|x| @vectors[x][c]}
-    end
-    # Returns each case as a hash
-    def each
-      begin
-        @i=0
-        @cases.times {|i|
-          @i=i
-          row=case_as_hash(i)
-          yield row
-        }
-        @i=nil
-      rescue =>e
-        raise DatasetException.new(self,e)
-      end
+          a.push(sum.quo(size-invalids))
         end
-
-
-
-
-
-
-
-
-
-        @i=nil
-      rescue =>e
-        raise DatasetException.new(self,e)
-      end
-    end
-    # Returns each case as an array, coding missing values as nils
-    def each_array_with_nils
-      m=fields.size
-      @cases.times {|i|
-        @i=i
-        row=Array.new(m)
-        fields.each_index{|j|
-          f=fields[j]
-          row[j]=@vectors[f].data_with_nils[i]
-        }
-        yield row
-      }
-      @i=nil
-    end
-    # Returns each case as an array
-    def each_array
-      @cases.times {|i|
-        @i=i
-        row=case_as_array(i)
-        yield row
-      }
-      @i=nil
-    end
-    def fields=(f)
-      @fields=f
-      check_order
-    end
-    def check_order
-      if(@vectors.keys.sort!=@fields.sort)
-        @fields=@fields&@vectors.keys
-        @fields+=@vectors.keys.sort-@fields
-      end
-    end
-    # Returns the vector named i
-    def[](i)
-      if i.is_a? String
-        raise Exception,"Vector '#{i}' doesn't exists on dataset" unless @vectors.has_key?(i)
-        @vectors[i]
-      elsif i.is_a? Range
-        fields=from_to(i.begin,i.end)
-        vectors=fields.inject({}) {|a,v| a[v]=@vectors[v];a}
-        ds=Dataset.new(vectors,fields)
+      end
+      a.to_vector(:scale)
+    end
+    def check_length
+      size=nil
+      @vectors.each do |k,v|
+        raise Exception, "Data #{v.class} is not a vector on key #{k}" if !v.is_a? Statsample::Vector
+        if size.nil?
+          size=v.size
       else
-
-
-
-
-      data=[]
-      each {|row|
-        data.push(yield(row))
-      }
-      Statsample::Vector.new(data,type)
-    end
-    def collect_with_index(type=:scale)
-      data=[]
-      each_with_index {|i,row|
-        data.push(yield(i,row))
-      }
-      Statsample::Vector.new(data,type)
-    end
-    # Recode a vector based on a block
-    def recode!(vector_name)
-      0.upto(@cases-1) {|i|
-        @vectors[vector_name].data[i]=yield case_as_hash(i)
-      }
-      @vectors[vector_name].set_valid_data
-    end
-    def crosstab(v1,v2)
-      Statsample::Crosstab.new(@vectors[v1],@vectors[v2])
-    end
-    def[]=(i,v)
-      if v.instance_of? Statsample::Vector
-        @vectors[i]=v
-        check_order
-      else
-        raise ArgumentError,"Should pass a Statsample::Vector"
-      end
-    end
-    def to_matrix
-      rows=[]
-      self.each_array{|c|
-        rows.push(c)
-      }
-      Matrix.rows(rows)
+        if v.size!=size
+          p v.to_a.size
+          raise Exception, "Vector #{k} have size #{v.size} and dataset have size #{size}"
+        end
       end
+      end
+      @cases=size
+    end
+    def each_vector
+      @fields.each{|k| yield k,@vectors[k]}
+    end
+    if Statsample::STATSAMPLE__.respond_to?(:case_as_hash)
+      def case_as_hash(c) # :nodoc:
+        Statsample::STATSAMPLE__.case_as_hash(self,c)
+      end
+    else
+      def case_as_hash(c)
+        _case_as_hash(c)
+      end
+    end
+
+    if Statsample::STATSAMPLE__.respond_to?(:case_as_array)
+      def case_as_array(c) # :nodoc:
+        Statsample::STATSAMPLE__.case_as_array(self,c)
+      end
+    else
+      def case_as_array(c)
+        _case_as_array(c)
+      end
+    end
+    def _case_as_hash(c) # :nodoc:
+      @fields.inject({}) {|a,x| a[x]=@vectors[x][c];a }
+    end
+    def _case_as_array(c) # :nodoc:
+      @fields.collect {|x| @vectors[x][c]}
+    end
+    # Returns each case as a hash
+    def each
+      begin
+        @i=0
+        @cases.times {|i|
+          @i=i
+          row=case_as_hash(i)
+          yield row
+        }
+        @i=nil
+      rescue =>e
+        raise DatasetException.new(self,e)
+      end
+    end
+    # Returns each case as index and hash
+    def each_with_index
+      begin
+        @i=0
+        @cases.times{|i|
+          @i=i
+          row=case_as_hash(i)
+          yield i,row
+        }
+        @i=nil
+      rescue =>e
+        raise DatasetException.new(self,e)
+      end
+    end
+    # Returns each case as an array, coding missing values as nils
+    def each_array_with_nils
+      m=fields.size
+      @cases.times {|i|
+        @i=i
+        row=Array.new(m)
+        fields.each_index{|j|
+          f=fields[j]
+          row[j]=@vectors[f].data_with_nils[i]
+        }
+        yield row
+      }
+      @i=nil
+    end
+    # Returns each case as an array
+    def each_array
+      @cases.times {|i|
+        @i=i
+        row=case_as_array(i)
+        yield row
+      }
+      @i=nil
+    end
+    def fields=(f)
+      @fields=f
+      check_order
+    end
+    def check_order
+      if(@vectors.keys.sort!=@fields.sort)
+        @fields=@fields&@vectors.keys
+        @fields+=@vectors.keys.sort-@fields
+      end
+    end
+    # Returns the vector named i
+    def[](i)
+      if i.is_a? String
+        raise Exception,"Vector '#{i}' doesn't exists on dataset" unless @vectors.has_key?(i)
+        @vectors[i]
+      elsif i.is_a? Range
+        fields=from_to(i.begin,i.end)
+        vectors=fields.inject({}) {|a,v| a[v]=@vectors[v];a}
+        ds=Dataset.new(vectors,fields)
+      else
+        raise ArgumentError, "You need a String or a Range"
+      end
+    end
+    def collect(type=:scale)
+      data=[]
+      each {|row|
+        data.push(yield(row))
+      }
+      Statsample::Vector.new(data,type)
+    end
+    def collect_with_index(type=:scale)
+      data=[]
+      each_with_index {|i,row|
+        data.push(yield(i,row))
+      }
+      Statsample::Vector.new(data,type)
+    end
+    # Recode a vector based on a block
+    def recode!(vector_name)
+      0.upto(@cases-1) {|i|
+        @vectors[vector_name].data[i]=yield case_as_hash(i)
+      }
+      @vectors[vector_name].set_valid_data
+    end
+    def crosstab(v1,v2)
+      Statsample::Crosstab.new(@vectors[v1],@vectors[v2])
+    end
+    def[]=(i,v)
+      if v.instance_of? Statsample::Vector
+        @vectors[i]=v
+        check_order
+      else
+        raise ArgumentError,"Should pass a Statsample::Vector"
+      end
+    end
+    def to_matrix
+      rows=[]
+      self.each_array{|c|
+        rows.push(c)
+      }
+      Matrix.rows(rows)
+    end
     def to_multiset_by_split(*fields)
       require 'statsample/multiset'
       if fields.size==1
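vector_missing_values and vector_mean, added in the hunk above, compute per-case statistics across a set of fields. A sketch (the numeric classes in the result depend on Ruby's quo, so the floats shown are approximate):

    require 'statsample'

    v1 = [1, 2, nil, 4].to_vector(:scale)
    v2 = [3, nil, nil, 8].to_vector(:scale)
    ds = {'v1'=>v1, 'v2'=>v2}.to_dataset

    ds.vector_missing_values.to_a  # => [0, 1, 2, 0]
    # Allow up to one missing field per case; cases above the
    # threshold come back as nil.
    ds.vector_mean(nil, 1).to_a    # => [2.0, 2.0, nil, 6.0]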
@@ -454,15 +517,15 @@ module Statsample
         to_multiset_by_split_multiple_fields(*fields)
       end
     end
-
-
-
-
-
-
-
-
-
+    # create a new dataset with all the data which the block returns true
+    def filter
+      ds=self.dup_empty
+      each {|c|
+        ds.add_case(c,false) if yield c
+      }
+      ds.update_valid_data
+      ds
+    end
     # creates a new vector with the data of a given field which the block returns true
     def filter_field(field)
       a=[]
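Dataset#filter, added above, copies into a new dataset only the cases for which the block is true; each case is yielded as a field=>value hash. A minimal sketch:

    require 'statsample'

    ds = {'age'=>[18, 35, 72].to_vector(:scale)}.to_dataset
    under_40 = ds.filter {|row| row['age'] < 40 }
    under_40['age'].to_a  # => [18, 35]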
@@ -471,123 +534,112 @@ module Statsample
       }
       a.to_vector(@vectors[field].type)
     end
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-    collect_with_index {|i,row|
-      invalid=false
-      @fields.each{|f|
-        if @vectors[f].data_with_nils[i].nil?
-          invalid=true
-        end
-      }
-      if invalid
-        nil
-      else
-        eval(text)
-      end
-    }
+    def to_multiset_by_split_one_field(field)
+      raise ArgumentError,"Should use a correct field name" if !@fields.include? field
+      factors=@vectors[field].factors
+      ms=Multiset.new_empty_vectors(@fields,factors)
+      each {|c|
+        ms[c[field]].add_case(c,false)
+      }
+      #puts "Ingreso a los dataset"
+      ms.datasets.each {|k,ds|
+        ds.update_valid_data
+        ds.vectors.each{|k1,v1|
+          # puts "Vector #{k1}:"+v1.to_s
+          v1.type=@vectors[k1].type
+        }
+      }
+      ms
+    end
+    def to_multiset_by_split_multiple_fields(*fields)
+      factors_total=nil
+      fields.each do |f|
+        if factors_total.nil?
+          factors_total=@vectors[f].factors.collect{|c|
+            [c]
+          }
+        else
+          suma=[]
+          factors=@vectors[f].factors
+          factors_total.each{|f1| factors.each{|f2| suma.push(f1+[f2]) } }
+          factors_total=suma
+        end
+      end
+      ms=Multiset.new_empty_vectors(@fields,factors_total)
+      p1=eval "Proc.new {|c| ms[["+fields.collect{|f| "c['#{f}']"}.join(",")+"]].add_case(c,false) }"
+      each{|c| p1.call(c)}
+      ms.datasets.each do |k,ds|
+        ds.update_valid_data
+        ds.vectors.each{|k1,v1| v1.type=@vectors[k1].type }
+      end
+      ms
+
+    end
+    # Returns a vector, based on a string with a calculation based
+    # on vector
+    # The calculation will be eval'ed, so you can put any variable
+    # or expression valid on ruby
+    # For example:
+    # a=[1,2].to_vector(scale)
+    # b=[3,4].to_vector(scale)
+    # ds={'a'=>a,'b'=>b}.to_dataset
+    # ds.compute("a+b")
+    # => Vector [4,6]
+    def compute(text)
+      @fields.each{|f|
+        if @vectors[f].type=:scale
+          text.gsub!(f,"row['#{f}'].to_f")
+        else
+          text.gsub!(f,"row['#{f}']")
+        end
+      }
+      collect_with_index {|i,row|
+        invalid=false
+        @fields.each{|f|
+          if @vectors[f].data_with_nils[i].nil?
+            invalid=true
+          end
+        }
+        if invalid
+          nil
+        else
+          eval(text)
         end
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-    end
-    }
+      }
+    end
+    # Test each row with one or more tests
+    # each test is a Proc with the form
+    # Proc.new {|row| row['age']>0}
+    # The function returns an array with all errors
+    def verify(*tests)
+      if(tests[0].is_a? String)
+        id=tests[0]
+        tests.shift
+      else
+        id=@fields[0]
+      end
+      vr=[]
+      i=0
+      each do |row|
+        i+=1
+        tests.each{|test|
+          if ! test[2].call(row)
+            values=""
+            if test[1].size>0
+              values=" ("+test[1].collect{|k| "#{k}=#{row[k]}"}.join(", ")+")"
            end
-      vr
-
-
-
-
-
-
-
+            vr.push("#{i} [#{row[id]}]: #{test[0]}#{values}")
+          end
+        }
+      end
+      vr
+    end
+    def to_s
+      "#<"+self.class.to_s+":"+self.object_id.to_s+" @fields=["+@fields.join(",")+"] labels="+@labels.inspect+" cases="+@vectors[@fields[0]].size.to_s
+    end
+    def inspect
+      self.to_s
+    end
     def summary
       out=""
       out << "Summary for dataset\n"
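compute and verify, both added above, work on whole rows: compute rewrites field names in the expression into row lookups and evals it per case, while verify takes [message, fields, Proc] triples and collects a report line for every failing case. A short sketch:

    require 'statsample'

    a  = [1, 2].to_vector(:scale)
    b  = [3, 4].to_vector(:scale)
    ds = {'a'=>a, 'b'=>b}.to_dataset

    ds.compute("a+b").to_a  # => [4.0, 6.0]

    positive = ["a should be positive", ['a'], Proc.new {|row| row['a'] > 0 }]
    ds.verify(positive)     # => [] when every case passes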
@@ -600,10 +652,10 @@ module Statsample
       }
       out
     end
-
-
-
-
-    end
+    def as_r
+      require 'rsruby/dataframe'
+      r=RSRuby.instance
+
     end
+  end
 end