statsample 0.5.0 → 0.5.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,90 +1,82 @@
1
+ require 'statsample/converter/spss'
1
2
  module Statsample
2
3
  # Create and dumps Datasets on a database
3
- module Database
4
- class << self
5
- # Read a database query and returns a Dataset
6
- #
7
- # USE:
8
- #
9
- # dbh = DBI.connect("DBI:Mysql:database:localhost", "user", "password")
10
- # Statsample.read(dbh, "SELECT * FROM test")
11
- #
12
- def read(dbh,query)
13
- require 'dbi'
14
- sth=dbh.execute(query)
15
- vectors={}
16
- fields=[]
17
- sth.column_info.each {|c|
18
- vectors[c['name']]=Statsample::Vector.new([])
19
- vectors[c['name']].type= (c['type_name']=='INTEGER' or c['type_name']=='DOUBLE') ? :scale : :nominal
20
- fields.push(c['name'])
21
- }
22
- ds=Statsample::Dataset.new(vectors,fields)
23
- sth.fetch do |row|
24
- ds.add_case(row.to_a, false )
25
- end
26
- ds.update_valid_data
27
- ds
28
- end
29
- # Insert each case of the Dataset on the selected table
30
- #
31
- # USE:
32
- #
33
- # ds={'id'=>[1,2,3].to_vector, 'name'=>["a","b","c"].to_vector}.to_dataset
34
- # dbh = DBI.connect("DBI:Mysql:database:localhost", "user", "password")
35
- # Statsample::Database.insert(ds,dbh,"test")
36
- #
37
- def insert(ds, dbh,table)
38
- require 'dbi'
39
- query="INSERT INTO #{table} ("+ds.fields.join(",")+") VALUES ("+((["?"]*ds.fields.size).join(","))+")"
40
- sth=dbh.prepare(query)
41
- ds.each_array{|c|
42
- sth.execute(*c)
43
- }
44
- end
45
- # Create a sql, basen on a given Dataset
46
- #
47
- # USE:
48
- #
49
- # ds={'id'=>[1,2,3,4,5].to_vector,'name'=>%w{Alex Peter Susan Mary John}.to_vector}.to_dataset
50
- # Statsample::Database.create_sql(ds,'names')
51
- # ==>"CREATE TABLE names (id INTEGER,\n name VARCHAR (255)) CHARACTER SET=UTF8;"
52
- #
53
- def create_sql(ds,table,charset="UTF8")
54
- sql="CREATE TABLE #{table} ("
55
- fields=ds.fields.collect{|f|
56
- v=ds[f]
57
- f+" "+v.db_type
58
- }
59
- sql+fields.join(",\n ")+") CHARACTER SET=#{charset};"
4
+ module Database
5
+ class << self
6
+ # Read a database query and returns a Dataset
7
+ #
8
+ # USE:
9
+ #
10
+ # dbh = DBI.connect("DBI:Mysql:database:localhost", "user", "password")
11
+ # Statsample.read(dbh, "SELECT * FROM test")
12
+ #
13
+ def read(dbh,query)
14
+ require 'dbi'
15
+ sth=dbh.execute(query)
16
+ vectors={}
17
+ fields=[]
18
+ sth.column_info.each {|c|
19
+ vectors[c['name']]=Statsample::Vector.new([])
20
+ vectors[c['name']].type= (c['type_name']=='INTEGER' or c['type_name']=='DOUBLE') ? :scale : :nominal
21
+ fields.push(c['name'])
22
+ }
23
+ ds=Statsample::Dataset.new(vectors,fields)
24
+ sth.fetch do |row|
25
+ ds.add_case(row.to_a, false )
60
26
  end
61
- end
27
+ ds.update_valid_data
28
+ ds
29
+ end
30
+ # Insert each case of the Dataset on the selected table
31
+ #
32
+ # USE:
33
+ #
34
+ # ds={'id'=>[1,2,3].to_vector, 'name'=>["a","b","c"].to_vector}.to_dataset
35
+ # dbh = DBI.connect("DBI:Mysql:database:localhost", "user", "password")
36
+ # Statsample::Database.insert(ds,dbh,"test")
37
+ #
38
+ def insert(ds, dbh,table)
39
+ require 'dbi'
40
+ query="INSERT INTO #{table} ("+ds.fields.join(",")+") VALUES ("+((["?"]*ds.fields.size).join(","))+")"
41
+ sth=dbh.prepare(query)
42
+ ds.each_array{|c| sth.execute(*c) }
43
+ end
44
+ # Create a sql, basen on a given Dataset
45
+ #
46
+ # USE:
47
+ #
48
+ # ds={'id'=>[1,2,3,4,5].to_vector,'name'=>%w{Alex Peter Susan Mary John}.to_vector}.to_dataset
49
+ # Statsample::Database.create_sql(ds,'names')
50
+ # ==>"CREATE TABLE names (id INTEGER,\n name VARCHAR (255)) CHARACTER SET=UTF8;"
51
+ #
52
+ def create_sql(ds,table,charset="UTF8")
53
+ sql="CREATE TABLE #{table} ("
54
+ fields=ds.fields.collect{|f|
55
+ v=ds[f]
56
+ f+" "+v.db_type
57
+ }
58
+ sql+fields.join(",\n ")+") CHARACTER SET=#{charset};"
59
+ end
62
60
  end
63
- module Mondrian
64
- class << self
65
- def write(dataset,filename)
66
- File.open(filename,"wb") do |fp|
67
- fp.puts dataset.fields.join("\t")
68
- dataset.each_array_with_nils{|row|
69
- row2=row.collect{|v|
70
- v.nil? ? "NA" : v.to_s.gsub(/\s+/,"_")
71
- }
72
- fp.puts row2.join("\t")
73
- }
74
- end
75
- end
61
+ end
62
+ module Mondrian
63
+ class << self
64
+ def write(dataset,filename)
65
+ File.open(filename,"wb") do |fp|
66
+ fp.puts dataset.fields.join("\t")
67
+ dataset.each_array_with_nils do |row|
68
+ row2=row.collect{|v| v.nil? ? "NA" : v.to_s.gsub(/\s+/,"_") }
69
+ fp.puts row2.join("\t")
70
+ end
76
71
  end
72
+ end
77
73
  end
74
+ end
78
75
  class SpreadsheetBase
79
76
  class << self
80
77
  def extract_fields(row)
81
78
  fields=row.to_a.collect{|c| c.downcase}
82
- if fields.size!=fields.uniq.size
83
- repeated=fields.inject({}) {|a,v|
84
- (a[v].nil? ? a[v]=1 : a[v]+=1); a }.find_all{|k,v| v>1}.collect{|k,v|k}.join(",")
85
- raise "There are some repeated fields on the header:#{repeated}. Please, fix"
86
- end
87
- fields
79
+ fields.recode_repeated
88
80
  end
89
81
 
90
82
  def process_row(row,empty)
@@ -121,6 +113,7 @@ module Statsample
121
113
  fp=File.open(filename,"r")
122
114
  fp.each_line do |line|
123
115
  row=process_row(line.strip.split(/\s+/),[""])
116
+ next if row==["\x1A"]
124
117
  ds.add_case_array(row)
125
118
  end
126
119
  convert_to_scale(ds,fields)
@@ -178,7 +171,11 @@ module Statsample
178
171
  row.collect!{|c|
179
172
  i+=1
180
173
  if c.is_a? Spreadsheet::Formula
174
+ if(c.value.is_a? Spreadsheet::Excel::Error)
175
+ nil
176
+ else
181
177
  c.value
178
+ end
182
179
  elsif dates.include? i and !c.nil? and c.is_a? Numeric
183
180
  row.date(i)
184
181
  else
@@ -26,22 +26,22 @@ module Statsample
26
26
  @v_cols.frequencies
27
27
  end
28
28
  def frequencies
29
- base=rows_names.inject([]){|s,row|
30
- s+=cols_names.collect{|col| [row,col]}
31
- }.inject({}) {|s,par|
32
- s[par]=0
33
- s
34
- }
29
+ base=rows_names.inject([]){|s,row|
30
+ s+=cols_names.collect{|col| [row,col]}
31
+ }.inject({}) {|s,par|
32
+ s[par]=0
33
+ s
34
+ }
35
35
  base.update(Statsample::vector_cols_matrix(@v_rows,@v_cols).to_a.to_vector.frequencies)
36
36
  end
37
- def to_matrix
38
- f=frequencies
39
- rn=rows_names
40
- cn=cols_names
41
- Matrix.rows(rn.collect{|row|
42
- cn.collect{|col| f[[row,col]]}
43
- })
44
- end
37
+ def to_matrix
38
+ f=frequencies
39
+ rn=rows_names
40
+ cn=cols_names
41
+ Matrix.rows(rn.collect{|row|
42
+ cn.collect{|col| f[[row,col]]}
43
+ })
44
+ end
45
45
  def frequencies_by_row
46
46
  f=frequencies
47
47
  rows_names.inject({}){|sr,row|
@@ -81,14 +81,18 @@ module Statsample
81
81
  }
82
82
  Matrix.rows(m)
83
83
  end
84
- def summary(report_type=ConsoleSummary)
84
+ def cols_empty_hash
85
+ cols_names.inject({}) {|a,x| a[x]=0;a}
86
+ end
87
+
88
+ def summary(report_type = ConsoleSummary)
85
89
  out=""
86
90
  out.extend report_type
87
91
  fq=frequencies
88
92
  rn=rows_names
89
93
  cn=cols_names
90
94
  total=0
91
- total_cols=cn.inject({}) {|a,x| a[x]=0;a}
95
+ total_cols=cols_empty_hash
92
96
  out.add "Chi Square: #{chi_square}\n"
93
97
  out.add(_("Rows: %s\n") % @row_label) unless @row_label.nil?
94
98
  out.add(_("Columns: %s\n") % @column_label) unless @column_label.nil?
@@ -122,7 +126,7 @@ module Statsample
122
126
  rn=rows_names
123
127
  cn=cols_names
124
128
  total=0
125
- total_cols=cn.inject({}) {|a,x| a[x]=0;a}
129
+ total_cols=cols_empty_hash
126
130
  max_row_size = rn.inject(0) {|s,x| sl=@v_rows.labeling(x).size; sl>s ? sl : s}
127
131
 
128
132
  max_row_size=max_row_size<6 ? 6 : max_row_size
@@ -1,225 +1,287 @@
1
1
  require 'statsample/vector'
2
2
 
3
3
  class Hash
4
- def to_dataset(*args)
5
- Statsample::Dataset.new(self,*args)
6
- end
4
+ def to_dataset(*args)
5
+ Statsample::Dataset.new(self,*args)
6
+ end
7
7
  end
8
8
 
9
9
  class Array
10
- def prefix(s)
11
- self.collect{|c|
12
- s+c.to_s
10
+ def prefix(s)
11
+ self.collect{|c| s+c.to_s }
12
+ end
13
+ def suffix(s)
14
+ self.collect{|c| c.to_s+s }
15
+ end
16
+ end
17
+
18
+ module Statsample
19
+ class DatasetException < RuntimeError
20
+ attr_reader :ds,:exp
21
+ def initialize(ds,e)
22
+ @ds=ds
23
+ @exp=e
24
+ end
25
+ def to_s
26
+ m="Error: "+@exp.message+"\n"+@exp.backtrace.join("\n")+"\nOn Dataset:"+@ds.inspect
27
+ m+="\nRow: #{@i}" unless @i.nil?
28
+ m
29
+ end
30
+ end
31
+ class Dataset
32
+ include Writable
33
+ attr_reader :vectors, :fields, :cases, :i
34
+ attr_accessor :labels
35
+
36
+ # Generates a new dataset, using three vectors
37
+ # - Rows
38
+ # - Columns
39
+ # - Values
40
+ # For example, you have these values
41
+ #
42
+ # x y v
43
+ # a a 0
44
+ # a b 1
45
+ # b a 1
46
+ # b b 0
47
+ #
48
+ # You obtain
49
+ # id a b
50
+ # a 0 1
51
+ # b 1 0
52
+ #
53
+ # Useful to process outputs from databases
54
+ #
55
+ def self.crosstab_by_asignation(rows,columns,values)
56
+ raise "Three vectors should be equal size" if rows.size!=columns.size or rows.size!=values.size
57
+ cols_values=columns.factors
58
+ cols_n=cols_values.size
59
+ h_rows=rows.factors.inject({}){|a,v| a[v]=cols_values.inject({}){
60
+ |a1,v1| a1[v1]=nil; a1
61
+ }
62
+ ;a}
63
+ values.each_index{|i|
64
+ h_rows[rows[i]][columns[i]]=values[i]
65
+ }
66
+ ds=Dataset.new(["_id"]+cols_values)
67
+ cols_values.each{|c|
68
+ ds[c].type=values.type
69
+ }
70
+ rows.factors.each {|row|
71
+ n_row=Array.new(cols_n+1)
72
+ n_row[0]=row
73
+ cols_values.each_index {|i|
74
+ n_row[i+1]=h_rows[row][cols_values[i]]
13
75
  }
76
+ ds.add_case_array(n_row)
77
+ }
78
+ ds.update_valid_data
79
+ ds
80
+ end
81
+ # Creates a new dataset. A dataset is a set of ordered named vectors
82
+ # of the same size.
83
+ #
84
+ # [vectors] With an array, creates a set of empty vectors named as
85
+ # values on the array. With a hash, each Vector is assigned as
86
+ # a variable of the Dataset named as its key
87
+ # [fields] Array of names for vectors. Is only used for set the
88
+ # order of variables. If empty, vectors keys on alfabethic order as
89
+ # used as fields
90
+ # [labels] Hash to set names for fields.
91
+ #
92
+ #
93
+ # Dataset.new()
94
+ # Dataset.new(%w{v1 v2 v3})
95
+ # Dataset.new({'v1'=>%w{1 2 3}.to_vector, 'v2'=>%w{4 5 6}.to_vector})
96
+ # Dataset.new({'v2'=>v2,'v1'=>v1},['v1','v2'])
97
+ #
98
+ # The fast way to create a dataset uses Hash#to_dataset, with
99
+ # fields and labels as arguments
100
+ # ds = {'v1'=>[1,2,3].to_vector}.to_dataset
101
+ #
102
+ def initialize(vectors={}, fields=[], labels={})
103
+ if vectors.instance_of? Array
104
+ @fields=vectors.dup
105
+ @vectors=vectors.inject({}){|a,x| a[x]=Statsample::Vector.new(); a}
106
+ else
107
+ # Check vectors
108
+ @vectors=vectors
109
+ @fields=fields
110
+ check_order
111
+ check_length
112
+ end
113
+ @i=nil
114
+ @labels=labels
115
+ end
116
+ def to_gsl_matrix
117
+ matrix=GSL::Matrix.alloc(cases,@vectors.size)
118
+ each_array do |row|
119
+ row.each_index{|y| matrix.set(@i,y,row[y]) }
120
+ end
121
+ matrix
122
+ end
123
+ def vector_label(v_id)
124
+ raise "Vector #{v} doesn't exists" unless @fields.include? v_id
125
+ @labels[v_id].nil? ? v_id : @labels[v_id]
14
126
  end
15
- def suffix(s)
16
- self.collect{|c|
17
- c.to_s+s
127
+ # Creates a copy of the given dataset, deleting all the cases with
128
+ # missing data on one of the vectors
129
+ def dup_only_valid
130
+ if @vectors.find{|field,vector| vector.has_missing_data?}
131
+ ds=dup_empty
132
+ each_array { |c|
133
+ ds.add_case_array(c) unless @fields.find{|f| @vectors[f].data_with_nils[@i].nil? }
18
134
  }
135
+ ds.update_valid_data
136
+ else
137
+ ds=dup()
138
+ end
139
+ ds
19
140
  end
20
- end
21
-
22
- module Statsample
23
- class DatasetException < RuntimeError
24
- attr_reader :ds,:exp
25
- def initialize(ds,e)
26
- @ds=ds
27
- @exp=e
28
- end
29
- def to_s
30
- m="Error: "+@exp.message+"\n"+@exp.backtrace.join("\n")+"\nOn Dataset:"+@ds.inspect
31
- m+="\nRow: #{@i}" unless @i.nil?
32
- m
33
- end
141
+ # Returns an array with the fields from first argumen to last argument
142
+ def from_to(from,to)
143
+ raise ArgumentError, "Field #{from} should be on dataset" if !@fields.include? from
144
+ raise ArgumentError, "Field #{to} should be on dataset" if !@fields.include? to
145
+ @fields.slice(@fields.index(from)..@fields.index(to))
146
+ end
147
+ # Returns a duplicate of the Database
148
+ # If fields given, only include those vectors
149
+ def dup(*fields_to_include)
150
+ if fields_to_include.size==1 and fields_to_include[0].is_a? Array
151
+ fields_to_include=fields_to_include[0]
152
+ end
153
+ fields_to_include=@fields if fields_to_include.size==0
154
+ vectors={}
155
+ fields=[]
156
+ new_labels={}
157
+ fields_to_include.each{|f|
158
+ raise "Vector #{f} doesn't exists" unless @vectors.has_key? f
159
+ vectors[f]=@vectors[f].dup
160
+ new_labels[f]=@labels[f]
161
+ fields.push(f)
162
+ }
163
+ Dataset.new(vectors,fields,new_labels)
164
+ end
165
+ # Creates a copy of the given dataset, without data on vectors
166
+ def dup_empty
167
+ vectors=@vectors.inject({}) {|a,v|
168
+ a[v[0]]=v[1].dup_empty
169
+ a
170
+ }
171
+ Dataset.new(vectors,@fields.dup,@labels.dup)
172
+ end
173
+ # Merge vectors from two datasets
174
+ # In case of name collition, the vectors names are changed to
175
+ # x_1, x_2 ....
176
+ def merge(other_ds)
177
+ raise "Cases should be equal (this:#{@cases}; other:#{other_ds.cases}" unless @cases==other_ds.cases
178
+ types = @fields.collect{|f| @vectors[f].type} + other_ds.fields.collect{|f| other_ds[f].type}
179
+ new_fields = (@fields+other_ds.fields).recode_repeated
180
+ ds_new=Statsample::Dataset.new(new_fields)
181
+ new_fields.each_index{|i|
182
+ field=new_fields[i]
183
+ ds_new[field].type=types[i]
184
+ }
185
+ @cases.times {|i|
186
+ row=case_as_array(i)+other_ds.case_as_array(i)
187
+ ds_new.add_case_array(row)
188
+ }
189
+ ds_new.update_valid_data
190
+ ds_new
191
+ end
192
+ # Returns a dataset with standarized data
193
+ def standarize
194
+ ds=dup()
195
+ ds.fields.each {|f|
196
+ ds[f]=ds[f].vector_standarized
197
+ }
198
+ ds
199
+ end
200
+ # Generate a matrix, based on fields of dataset
201
+ def collect_matrix
202
+ rows=@fields.collect{|row|
203
+ @fields.collect{|col|
204
+ yield row,col
205
+ }
206
+ }
207
+ Matrix.rows(rows)
208
+ end
209
+ # We have the same datasets if the labels and vectors are the same
210
+ def ==(d2)
211
+ @vectors==d2.vectors and @fields==d2.fields
212
+ end
213
+ def col(c)
214
+ @vectors[c]
215
+ end
216
+ alias_method :vector, :col
217
+ def add_vector(name,vector)
218
+ raise ArgumentError, "Vector have different size" if vector.size!=@cases
219
+ @vectors[name]=vector
220
+ check_order
221
+ end
222
+ def has_vector? (v)
223
+ return @vectors.has_key?(v)
224
+ end
225
+ # Creates a dataset with the random data, of a n size
226
+ # If n not given, uses original number of cases
227
+ def bootstrap(n=nil)
228
+ n||=@cases
229
+ ds_boot=dup_empty
230
+ for i in 1..n
231
+ ds_boot.add_case_array(case_as_array(rand(n)))
232
+ end
233
+ ds_boot.update_valid_data
234
+ ds_boot
235
+ end
236
+ # Fast version of add case
237
+ # Can only add one case and no error check if performed
238
+ # You SHOULD use update_valid_data at the end of insertion cycle
239
+ def add_case_array(v)
240
+ v.each_index {|i| d=@vectors[@fields[i]].data; d.push(v[i])}
241
+ end
242
+ def add_case(v,uvd=true)
243
+ case v
244
+ when Array
245
+ if (v[0].is_a? Array)
246
+ v.each{|subv| add_case(subv,false)}
247
+ else
248
+ raise ArgumentError, "Input array size (#{v.size}) should be equal to fields number (#{@fields.size})" if @fields.size!=v.size
249
+ v.each_index {|i| @vectors[@fields[i]].add(v[i],false)}
250
+ end
251
+ when Hash
252
+ raise ArgumentError, "Hash keys should be equal to fields #{(v.keys - @fields).join(",")}" if @fields.sort!=v.keys.sort
253
+ @fields.each{|f| @vectors[f].add(v[f],false)}
254
+ else
255
+ raise TypeError, 'Value must be a Array or a Hash'
256
+ end
257
+ if uvd
258
+ update_valid_data
259
+ end
260
+ end
261
+ def update_valid_data
262
+ @fields.each{|f| @vectors[f].set_valid_data}
263
+ check_length
264
+ end
265
+ def delete_vector(name)
266
+ @fields.delete(name)
267
+ @vectors.delete(name)
268
+ end
269
+ def add_vectors_by_split_recode(name,join='-',sep=Statsample::SPLIT_TOKEN)
270
+ split=@vectors[name].split_by_separator(sep)
271
+ i=1
272
+ split.each{|k,v|
273
+ new_field=name+join+i.to_s
274
+ @labels[new_field]=name+":"+k
275
+ add_vector(new_field,v)
276
+ i+=1
277
+ }
278
+ end
279
+ def add_vectors_by_split(name,join='-',sep=Statsample::SPLIT_TOKEN)
280
+ split=@vectors[name].split_by_separator(sep)
281
+ split.each{|k,v|
282
+ add_vector(name+join+k,v)
283
+ }
34
284
  end
35
- class Dataset
36
- include Writable
37
- attr_reader :vectors, :fields, :cases, :i
38
- attr_accessor :labels
39
- # Creates a new dataset. A dataset is a set of ordered named vectors
40
- # of the same size.
41
- #
42
- # [vectors] With an array, creates a set of empty vectors named as
43
- # values on the array. With a hash, each Vector is assigned as
44
- # a variable of the Dataset named as its key
45
- # [fields] Array of names for vectors. Is only used for set the
46
- # order of variables. If empty, vectors keys on alfabethic order as
47
- # used as fields
48
- # [labels] Hash to set names for fields.
49
- #
50
- #
51
- # Dataset.new()
52
- # Dataset.new(%w{v1 v2 v3})
53
- # Dataset.new({'v1'=>%w{1 2 3}.to_vector, 'v2'=>%w{4 5 6}.to_vector})
54
- # Dataset.new({'v2'=>v2,'v1'=>v1},['v1','v2'])
55
- #
56
- # The fast way to create a dataset uses Hash#to_dataset, with
57
- # fields and labels as arguments
58
- # ds = {'v1'=>[1,2,3].to_vector}.to_dataset
59
- #
60
- def initialize(vectors={}, fields=[], labels={})
61
- if vectors.instance_of? Array
62
- @fields=vectors.dup
63
- @vectors=vectors.inject({}){|a,x| a[x]=Statsample::Vector.new(); a}
64
- else
65
- @vectors=vectors
66
- @fields=fields
67
- check_order
68
- check_length
69
- end
70
- @i=nil
71
- @labels=labels
72
- end
73
- def to_gsl_matrix
74
- matrix=GSL::Matrix.alloc(cases,@vectors.size)
75
- each_array do |row|
76
- row.each_index{|y| matrix.set(@i,y,row[y]) }
77
- end
78
- matrix
79
- end
80
- def vector_label(v_id)
81
- raise "Vector #{v} doesn't exists" unless @fields.include? v_id
82
- @labels[v_id].nil? ? v_id : @labels[v_id]
83
- end
84
- # Creates a copy of the given dataset, deleting all the cases with
85
- # missing data on one of the vectors
86
- def dup_only_valid
87
- if @vectors.find{|field,vector| vector.has_missing_data?}
88
- ds=dup_empty
89
- each_array { |c|
90
- ds.add_case_array(c) unless @fields.find{|f| @vectors[f].data_with_nils[@i].nil? }
91
- }
92
- ds.update_valid_data
93
- else
94
- ds=dup()
95
- end
96
- ds
97
- end
98
- # Returns an array with the fields from first argumen to last argument
99
- def from_to(from,to)
100
- raise ArgumentError, "Field #{from} should be on dataset" if !@fields.include? from
101
- raise ArgumentError, "Field #{to} should be on dataset" if !@fields.include? to
102
- @fields.slice(@fields.index(from)..@fields.index(to))
103
- end
104
- # Returns a duplicate of the Database
105
- # If fields given, only include those vectors
106
- def dup(*fields_to_include)
107
- if fields_to_include.size==1 and fields_to_include[0].is_a? Array
108
- fields_to_include=fields_to_include[0]
109
- end
110
- fields_to_include=@fields if fields_to_include.size==0
111
- vectors={}
112
- fields=[]
113
- labels={}
114
- fields_to_include.each{|f|
115
- raise "Vector #{f} doesn't exists" unless @vectors.has_key? f
116
- vectors[f]=@vectors[f].dup
117
- labels[f]=@labels[f]
118
- fields.push(f)
119
- }
120
- Dataset.new(vectors,fields,labels)
121
- end
122
- # Creates a copy of the given dataset, without data on vectors
123
- def dup_empty
124
- vectors=@vectors.inject({}) {|a,v|
125
- a[v[0]]=v[1].dup_empty
126
- a
127
- }
128
- Dataset.new(vectors,@fields.dup,@labels.dup)
129
- end
130
- # Returns a dataset with standarized data
131
- def standarize
132
- ds=dup()
133
- ds.fields.each {|f|
134
- ds[f]=ds[f].vector_standarized
135
- }
136
- ds
137
- end
138
- # Generate a matrix, based on fields of dataset
139
- def collect_matrix
140
- rows=@fields.collect{|row|
141
- @fields.collect{|col|
142
- yield row,col
143
- }
144
- }
145
- Matrix.rows(rows)
146
- end
147
- # We have the same datasets if the labels and vectors are the same
148
- def ==(d2)
149
- @vectors==d2.vectors and @fields==d2.fields
150
- end
151
- def col(c)
152
- @vectors[c]
153
- end
154
- alias_method :vector, :col
155
- def add_vector(name,vector)
156
- raise ArgumentError, "Vector have different size" if vector.size!=@cases
157
- @vectors[name]=vector
158
- check_order
159
- end
160
- def has_vector? (v)
161
- return @vectors.has_key?(v)
162
- end
163
- # Creates a dataset with the random data, of a n size
164
- # If n not given, uses original number of cases
165
- def bootstrap(n=nil)
166
- n||=@cases
167
- ds_boot=dup_empty
168
- for i in 1..n
169
- ds_boot.add_case_array(case_as_array(rand(n)))
170
- end
171
- ds_boot.update_valid_data
172
- ds_boot
173
- end
174
- # Fast version of add case
175
- # Can only add one case and no error check if performed
176
- # You SHOULD use update_valid_data at the end of insertion cycle
177
- def add_case_array(v)
178
- v.each_index {|i| d=@vectors[@fields[i]].data; d.push(v[i])}
179
- end
180
- def add_case(v,uvd=true)
181
- case v
182
- when Array
183
- if (v[0].is_a? Array)
184
- v.each{|subv| add_case(subv,false)}
185
- else
186
- raise ArgumentError, "Input array size (#{v.size}) should be equal to fields number (#{@fields.size})" if @fields.size!=v.size
187
- v.each_index {|i| @vectors[@fields[i]].add(v[i],false)}
188
- end
189
- when Hash
190
- raise ArgumentError, "Hash keys should be equal to fields" if @fields.sort!=v.keys.sort
191
- @fields.each{|f| @vectors[f].add(v[f],false)}
192
- else
193
- raise TypeError, 'Value must be a Array or a Hash'
194
- end
195
- if uvd
196
- update_valid_data
197
- end
198
- end
199
- def update_valid_data
200
- @fields.each{|f| @vectors[f].set_valid_data}
201
- check_length
202
- end
203
- def delete_vector(name)
204
- @fields.delete(name)
205
- @vectors.delete(name)
206
- end
207
- def add_vectors_by_split_recode(name,join='-',sep=Statsample::SPLIT_TOKEN)
208
- split=@vectors[name].split_by_separator(sep)
209
- i=1
210
- split.each{|k,v|
211
- new_field=name+join+i.to_s
212
- @labels[new_field]=name+":"+k
213
- add_vector(new_field,v)
214
- i+=1
215
- }
216
- end
217
- def add_vectors_by_split(name,join='-',sep=Statsample::SPLIT_TOKEN)
218
- split=@vectors[name].split_by_separator(sep)
219
- split.each{|k,v|
220
- add_vector(name+join+k,v)
221
- }
222
- end
223
285
  def vector_by_calculation(type=:scale)
224
286
  a=[]
225
287
  each {|row|
@@ -238,214 +300,215 @@ module Statsample
238
300
  else
239
301
  fields.inject(0) {|ac,v| ac + row[v].to_f}
240
302
  end
241
- end
303
+ end
242
304
  end
243
- # Returns a vector with the numbers of missing values for a case
244
-
245
- def vector_missing_values(fields=nil)
246
- fields||=@fields
247
- raise "Fields #{(fields-@fields).join(", ")} doesn't exists on dataset" if (fields-@fields).size>0
248
-
249
- collect_with_index do |i,row|
250
- fields.inject(0){|a,v|
251
- a+ ((@vectors[v].data_with_nils[i].nil?) ? 1: 0)
252
- }
253
- end
254
- end
255
- def vector_count_characters(fields=nil)
256
- fields||=@fields
257
- raise "Fields #{(fields-@fields).join(", ")} doesn't exists on dataset" if (fields-@fields).size>0
258
- collect_with_index do |i,row|
259
- fields.inject(0){|a,v|
260
-
261
- a+((@vectors[v].data_with_nils[i].nil?) ? 0: row[v].to_s.size)
262
- }
263
- end
264
- end
265
- # Returns a vector with the mean for a set of fields
266
- # if fields parameter is empty, return the mean for all fields
267
- # if max invalid parameter > 0, returns the mean for all tuples
268
- # with 0 to max_invalid invalid fields
269
- def vector_mean(fields=nil,max_invalid=0)
270
- a=[]
271
- fields||=@fields
272
- size=fields.size
273
- raise "Fields #{(fields-@fields).join(", ")} doesn't exists on dataset" if (fields-@fields).size>0
274
- each_with_index do |i, row|
275
- # numero de invalidos
276
- sum=0
277
- invalids=0
278
- fields.each{|f|
279
- if !@vectors[f].data_with_nils[i].nil?
280
- sum+=row[f].to_f
281
- else
282
- invalids+=1
283
- end
284
- }
285
- if(invalids>max_invalid)
286
- a.push(nil)
287
- else
288
- a.push(sum.quo(size-invalids))
289
- end
290
- end
291
- a.to_vector(:scale)
292
- end
293
- def check_length
294
- size=nil
295
- @vectors.each do |k,v|
296
- raise Exception, "Data #{v.class} is not a vector on key #{k}" if !v.is_a? Statsample::Vector
297
- if size.nil?
298
- size=v.size
299
- else
300
- if v.size!=size
301
- p v.to_a.size
302
- raise Exception, "Vector #{k} have size #{v.size} and dataset have size #{size}"
303
- end
304
- end
305
- end
306
- @cases=size
307
- end
308
- def each_vector
309
- @fields.each{|k| yield k,@vectors[k]}
310
- end
311
- if Statsample::STATSAMPLE__.respond_to?(:case_as_hash)
312
- def case_as_hash(c) # :nodoc:
313
- Statsample::STATSAMPLE__.case_as_hash(self,c)
314
- end
315
- else
316
- def case_as_hash(c)
317
- _case_as_hash(c)
318
- end
319
- end
320
-
321
- if Statsample::STATSAMPLE__.respond_to?(:case_as_array)
322
- def case_as_array(c) # :nodoc:
323
- Statsample::STATSAMPLE__.case_as_array(self,c)
324
- end
305
+ def check_fields(fields)
306
+ fields||=@fields
307
+ raise "Fields #{(fields-@fields).join(", ")} doesn't exists on dataset" if (fields-@fields).size>0
308
+ fields
309
+ end
310
+ # Returns a vector with the numbers of missing values for a case
311
+
312
+ def vector_missing_values(fields=nil)
313
+ fields=check_fields(fields)
314
+ collect_with_index do |i,row|
315
+ fields.inject(0) {|a,v|
316
+ a+ ((@vectors[v].data_with_nils[i].nil?) ? 1: 0)
317
+ }
318
+ end
319
+ end
320
+ def vector_count_characters(fields=nil)
321
+ fields=check_fields(fields)
322
+ collect_with_index do |i,row|
323
+ fields.inject(0){|a,v|
324
+
325
+ a+((@vectors[v].data_with_nils[i].nil?) ? 0: row[v].to_s.size)
326
+ }
327
+ end
328
+ end
329
+ # Returns a vector with the mean for a set of fields
330
+ # if fields parameter is empty, return the mean for all fields
331
+ # if max invalid parameter > 0, returns the mean for all tuples
332
+ # with 0 to max_invalid invalid fields
333
+ def vector_mean(fields=nil,max_invalid=0)
334
+ a=[]
335
+ fields=check_fields(fields)
336
+ size=fields.size
337
+ each_with_index do |i, row|
338
+ # numero de invalidos
339
+ sum=0
340
+ invalids=0
341
+ fields.each{|f|
342
+ if !@vectors[f].data_with_nils[i].nil?
343
+ sum+=row[f].to_f
344
+ else
345
+ invalids+=1
346
+ end
347
+ }
348
+ if(invalids>max_invalid)
349
+ a.push(nil)
325
350
  else
326
- def case_as_array(c)
327
- _case_as_array(c)
328
- end
329
- end
330
- def _case_as_hash(c) # :nodoc:
331
- @fields.inject({}) {|a,x| a[x]=@vectors[x][c];a }
332
- end
333
- def _case_as_array(c) # :nodoc:
334
- @fields.collect {|x| @vectors[x][c]}
335
- end
336
- # Returns each case as a hash
337
- def each
338
- begin
339
- @i=0
340
- @cases.times {|i|
341
- @i=i
342
- row=case_as_hash(i)
343
- yield row
344
- }
345
- @i=nil
346
- rescue =>e
347
- raise DatasetException.new(self,e)
348
- end
351
+ a.push(sum.quo(size-invalids))
349
352
  end
350
- # Returns each case as index and hash
351
- def each_with_index
352
- begin
353
- @i=0
354
- @cases.times{|i|
355
- @i=i
356
- row=case_as_hash(i)
357
- yield i,row
358
- }
359
- @i=nil
360
- rescue =>e
361
- raise DatasetException.new(self,e)
362
- end
363
- end
364
- # Returns each case as an array, coding missing values as nils
365
- def each_array_with_nils
366
- m=fields.size
367
- @cases.times {|i|
368
- @i=i
369
- row=Array.new(m)
370
- fields.each_index{|j|
371
- f=fields[j]
372
- row[j]=@vectors[f].data_with_nils[i]
373
- }
374
- yield row
375
- }
376
- @i=nil
377
- end
378
- # Returns each case as an array
379
- def each_array
380
- @cases.times {|i|
381
- @i=i
382
- row=case_as_array(i)
383
- yield row
384
- }
385
- @i=nil
386
- end
387
- def fields=(f)
388
- @fields=f
389
- check_order
390
- end
391
- def check_order
392
- if(@vectors.keys.sort!=@fields.sort)
393
- @fields=@fields&@vectors.keys
394
- @fields+=@vectors.keys.sort-@fields
395
- end
396
- end
397
- # Returns the vector named i
398
- def[](i)
399
- if i.is_a? String
400
- raise Exception,"Vector '#{i}' doesn't exists on dataset" unless @vectors.has_key?(i)
401
- @vectors[i]
402
- elsif i.is_a? Range
403
- fields=from_to(i.begin,i.end)
404
- vectors=fields.inject({}) {|a,v| a[v]=@vectors[v];a}
405
- ds=Dataset.new(vectors,fields)
353
+ end
354
+ a.to_vector(:scale)
355
+ end
356
+ def check_length
357
+ size=nil
358
+ @vectors.each do |k,v|
359
+ raise Exception, "Data #{v.class} is not a vector on key #{k}" if !v.is_a? Statsample::Vector
360
+ if size.nil?
361
+ size=v.size
406
362
  else
407
- raise ArgumentError, "You need a String or a Range"
408
- end
409
- end
410
- def collect(type=:scale)
411
- data=[]
412
- each {|row|
413
- data.push(yield(row))
414
- }
415
- Statsample::Vector.new(data,type)
416
- end
417
- def collect_with_index(type=:scale)
418
- data=[]
419
- each_with_index {|i,row|
420
- data.push(yield(i,row))
421
- }
422
- Statsample::Vector.new(data,type)
423
- end
424
- # Recode a vector based on a block
425
- def recode!(vector_name)
426
- 0.upto(@cases-1) {|i|
427
- @vectors[vector_name].data[i]=yield case_as_hash(i)
428
- }
429
- @vectors[vector_name].set_valid_data
430
- end
431
- def crosstab(v1,v2)
432
- Statsample::Crosstab.new(@vectors[v1],@vectors[v2])
433
- end
434
- def[]=(i,v)
435
- if v.instance_of? Statsample::Vector
436
- @vectors[i]=v
437
- check_order
438
- else
439
- raise ArgumentError,"Should pass a Statsample::Vector"
440
- end
441
- end
442
- def to_matrix
443
- rows=[]
444
- self.each_array{|c|
445
- rows.push(c)
446
- }
447
- Matrix.rows(rows)
363
+ if v.size!=size
364
+ p v.to_a.size
365
+ raise Exception, "Vector #{k} have size #{v.size} and dataset have size #{size}"
366
+ end
448
367
  end
368
+ end
369
+ @cases=size
370
+ end
371
+ def each_vector
372
+ @fields.each{|k| yield k,@vectors[k]}
373
+ end
374
+ if Statsample::STATSAMPLE__.respond_to?(:case_as_hash)
375
+ def case_as_hash(c) # :nodoc:
376
+ Statsample::STATSAMPLE__.case_as_hash(self,c)
377
+ end
378
+ else
379
+ def case_as_hash(c)
380
+ _case_as_hash(c)
381
+ end
382
+ end
383
+
384
+ if Statsample::STATSAMPLE__.respond_to?(:case_as_array)
385
+ def case_as_array(c) # :nodoc:
386
+ Statsample::STATSAMPLE__.case_as_array(self,c)
387
+ end
388
+ else
389
+ def case_as_array(c)
390
+ _case_as_array(c)
391
+ end
392
+ end
393
+ def _case_as_hash(c) # :nodoc:
394
+ @fields.inject({}) {|a,x| a[x]=@vectors[x][c];a }
395
+ end
396
+ def _case_as_array(c) # :nodoc:
397
+ @fields.collect {|x| @vectors[x][c]}
398
+ end
399
+ # Returns each case as a hash
400
+ def each
401
+ begin
402
+ @i=0
403
+ @cases.times {|i|
404
+ @i=i
405
+ row=case_as_hash(i)
406
+ yield row
407
+ }
408
+ @i=nil
409
+ rescue =>e
410
+ raise DatasetException.new(self,e)
411
+ end
412
+ end
413
+ # Returns each case as index and hash
414
+ def each_with_index
415
+ begin
416
+ @i=0
417
+ @cases.times{|i|
418
+ @i=i
419
+ row=case_as_hash(i)
420
+ yield i,row
421
+ }
422
+ @i=nil
423
+ rescue =>e
424
+ raise DatasetException.new(self,e)
425
+ end
426
+ end
427
+ # Returns each case as an array, coding missing values as nils
428
+ def each_array_with_nils
429
+ m=fields.size
430
+ @cases.times {|i|
431
+ @i=i
432
+ row=Array.new(m)
433
+ fields.each_index{|j|
434
+ f=fields[j]
435
+ row[j]=@vectors[f].data_with_nils[i]
436
+ }
437
+ yield row
438
+ }
439
+ @i=nil
440
+ end
441
+ # Returns each case as an array
442
+ def each_array
443
+ @cases.times {|i|
444
+ @i=i
445
+ row=case_as_array(i)
446
+ yield row
447
+ }
448
+ @i=nil
449
+ end
450
+ def fields=(f)
451
+ @fields=f
452
+ check_order
453
+ end
454
+ def check_order
455
+ if(@vectors.keys.sort!=@fields.sort)
456
+ @fields=@fields&@vectors.keys
457
+ @fields+=@vectors.keys.sort-@fields
458
+ end
459
+ end
460
+ # Returns the vector named i
461
+ def[](i)
462
+ if i.is_a? String
463
+ raise Exception,"Vector '#{i}' doesn't exists on dataset" unless @vectors.has_key?(i)
464
+ @vectors[i]
465
+ elsif i.is_a? Range
466
+ fields=from_to(i.begin,i.end)
467
+ vectors=fields.inject({}) {|a,v| a[v]=@vectors[v];a}
468
+ ds=Dataset.new(vectors,fields)
469
+ else
470
+ raise ArgumentError, "You need a String or a Range"
471
+ end
472
+ end
473
+ def collect(type=:scale)
474
+ data=[]
475
+ each {|row|
476
+ data.push(yield(row))
477
+ }
478
+ Statsample::Vector.new(data,type)
479
+ end
480
+ def collect_with_index(type=:scale)
481
+ data=[]
482
+ each_with_index {|i,row|
483
+ data.push(yield(i,row))
484
+ }
485
+ Statsample::Vector.new(data,type)
486
+ end
487
+ # Recode a vector based on a block
488
+ def recode!(vector_name)
489
+ 0.upto(@cases-1) {|i|
490
+ @vectors[vector_name].data[i]=yield case_as_hash(i)
491
+ }
492
+ @vectors[vector_name].set_valid_data
493
+ end
494
+ def crosstab(v1,v2)
495
+ Statsample::Crosstab.new(@vectors[v1],@vectors[v2])
496
+ end
497
+ def[]=(i,v)
498
+ if v.instance_of? Statsample::Vector
499
+ @vectors[i]=v
500
+ check_order
501
+ else
502
+ raise ArgumentError,"Should pass a Statsample::Vector"
503
+ end
504
+ end
505
+ def to_matrix
506
+ rows=[]
507
+ self.each_array{|c|
508
+ rows.push(c)
509
+ }
510
+ Matrix.rows(rows)
511
+ end
449
512
  def to_multiset_by_split(*fields)
450
513
  require 'statsample/multiset'
451
514
  if fields.size==1
@@ -454,15 +517,15 @@ module Statsample
454
517
  to_multiset_by_split_multiple_fields(*fields)
455
518
  end
456
519
  end
457
- # create a new dataset with all the data which the block returns true
458
- def filter
459
- ds=self.dup_empty
460
- each {|c|
461
- ds.add_case(c,false) if yield c
462
- }
463
- ds.update_valid_data
464
- ds
465
- end
520
# create a new dataset with all the data which the block returns true
def filter
  filtered = dup_empty
  each do |c|
    filtered.add_case(c, false) if yield c
  end
  filtered.update_valid_data
  filtered
end
+ end
466
529
  # creates a new vector with the data of a given field which the block returns true
467
530
  def filter_field(field)
468
531
  a=[]
@@ -471,123 +534,112 @@ module Statsample
471
534
  }
472
535
  a.to_vector(@vectors[field].type)
473
536
  end
474
- def to_multiset_by_split_one_field(field)
475
- raise ArgumentError,"Should use a correct field name" if !@fields.include? field
476
- factors=@vectors[field].factors
477
- ms=Multiset.new_empty_vectors(@fields,factors)
478
- each {|c|
479
- ms[c[field]].add_case(c,false)
480
- }
481
- #puts "Ingreso a los dataset"
482
- ms.datasets.each {|k,ds|
483
- ds.update_valid_data
484
- ds.vectors.each{|k1,v1|
485
- # puts "Vector #{k1}:"+v1.to_s
486
- v1.type=@vectors[k1].type
487
- }
488
- }
489
- ms
490
- end
491
- def to_multiset_by_split_multiple_fields(*fields)
492
- factors_total=nil
493
- fields.each{|f|
494
- if factors_total.nil?
495
- factors_total=@vectors[f].factors.collect{|c|
496
- [c]
497
- }
498
- else
499
- suma=[]
500
- factors=@vectors[f].factors
501
- factors_total.each{|f1|
502
- factors.each{|f2|
503
- suma.push(f1+[f2])
504
- }
505
- }
506
- factors_total=suma
507
- end
508
- }
509
- ms=Multiset.new_empty_vectors(@fields,factors_total)
510
- p1=eval "Proc.new {|c| ms[["+fields.collect{|f| "c['#{f}']"}.join(",")+"]].add_case(c,false) }"
511
- each{|c|
512
- p1.call(c)
513
- }
514
- ms.datasets.each {|k,ds|
515
- ds.update_valid_data
516
- ds.vectors.each{|k1,v1|
517
- # puts "Vector #{k1}:"+v1.to_s
518
- v1.type=@vectors[k1].type
519
- }
520
- }
521
- ms
522
-
523
- end
524
- # Returns a vector, based on a string with a calculation based
525
- # on vector
526
- # The calculation will be eval'ed, so you can put any variable
527
- # or expression valid on ruby
528
- # For example:
529
- # a=[1,2].to_vector(scale)
530
- # b=[3,4].to_vector(scale)
531
- # ds={'a'=>a,'b'=>b}.to_dataset
532
- # ds.calculate("a+b")
533
- # => Vector [4,6]
534
- def compute(text)
535
- @fields.each{|f|
536
- if @vectors[f].type=:scale
537
- text.gsub!(f,"row['#{f}'].to_f")
538
- else
539
- text.gsub!(f,"row['#{f}']")
540
-
541
- end
542
-
543
- }
544
- collect_with_index {|i,row|
545
- invalid=false
546
- @fields.each{|f|
547
- if @vectors[f].data_with_nils[i].nil?
548
- invalid=true
549
- end
550
- }
551
- if invalid
552
- nil
553
- else
554
- eval(text)
555
- end
556
- }
537
# Splits the dataset into a Multiset keyed by the factors of a single
# field: every case is routed to the sub-dataset matching its value
# for +field+. Raises ArgumentError for an unknown field name.
def to_multiset_by_split_one_field(field)
  unless @fields.include? field
    raise ArgumentError,"Should use a correct field name"
  end
  ms = Multiset.new_empty_vectors(@fields, @vectors[field].factors)
  each do |c|
    ms[c[field]].add_case(c, false)
  end
  ms.datasets.each do |key, ds|
    ds.update_valid_data
    # propagate the original vector types onto the split vectors
    ds.vectors.each {|name, vector| vector.type = @vectors[name].type }
  end
  ms
end
554
# Splits the dataset into a Multiset keyed by the combination of the
# factors of several fields. Each case is routed to the sub-dataset
# whose key is the Array of that case's values for +fields+.
#
# FIX: the original built the per-case dispatch with +eval+ on a
# string interpolating the field names; a plain block computing the
# key Array per case is equivalent, avoids eval entirely, and cannot
# break on field names containing quotes.
def to_multiset_by_split_multiple_fields(*fields)
  # Cartesian product of the factors of every split field; each key is
  # an Array like [factor_of_field1, factor_of_field2, ...].
  factors_total = nil
  fields.each do |f|
    current = @vectors[f].factors
    if factors_total.nil?
      factors_total = current.collect {|c| [c] }
    else
      factors_total = factors_total.inject([]) do |acc, combo|
        acc + current.collect {|factor| combo + [factor] }
      end
    end
  end
  ms = Multiset.new_empty_vectors(@fields, factors_total)
  each do |c|
    key = fields.collect {|f| c[f] }
    ms[key].add_case(c, false)
  end
  ms.datasets.each do |k, ds|
    ds.update_valid_data
    # propagate the original vector types onto the split vectors
    ds.vectors.each {|k1, v1| v1.type = @vectors[k1].type }
  end
  ms
end
578
# Returns a vector, based on a string with a calculation based
# on vector
# The calculation will be eval'ed, so you can put any variable
# or expression valid on ruby
# For example:
#   a=[1,2].to_vector(scale)
#   b=[3,4].to_vector(scale)
#   ds={'a'=>a,'b'=>b}.to_dataset
#   ds.compute("a+b")
#   => Vector [4,6]
# Cases with a nil in any field yield nil.
#
# FIX: the original tested `if @vectors[f].type=:scale` — assignment,
# not comparison — which silently overwrote every vector's type with
# :scale and coerced every field with .to_f. Now compares with ==.
# The caller's string is also no longer mutated (we work on a copy).
#
# SECURITY NOTE: the expression string is eval'ed — never pass
# untrusted input.
# NOTE(review): field names are substituted with plain gsub, so a
# field whose name is a substring of another field (or of the
# substituted text) can be rewritten incorrectly — pre-existing
# limitation.
def compute(text)
  expr = text.dup
  @fields.each do |f|
    # scale fields are coerced to Float so arithmetic works on them
    replacement = @vectors[f].type == :scale ? "row['#{f}'].to_f" : "row['#{f}']"
    expr.gsub!(f, replacement)
  end
  collect_with_index do |i, row|
    # a case is invalid (=> nil) when any field is nil at this index
    invalid = @fields.any? {|f| @vectors[f].data_with_nils[i].nil? }
    invalid ? nil : eval(expr)
  end
end
610
# Test each row with one or more tests
# each test is a Proc with the form
#   Proc.new {|row| row['age']>0}
# Each test is a triple [message, fields_to_report, predicate]; an
# optional leading String selects the id field used in the report
# (defaults to the first field). Returns an array with all errors.
def verify(*tests)
  id = tests[0].is_a?(String) ? tests.shift : @fields[0]
  failures = []
  row_number = 0
  each do |row|
    row_number += 1
    tests.each do |message, reported_fields, predicate|
      next if predicate.call(row)
      values = ""
      unless reported_fields.empty?
        values = " (" + reported_fields.collect {|k| "#{k}=#{row[k]}" }.join(", ") + ")"
      end
      failures.push("#{row_number} [#{row[id]}]: #{message}#{values}")
    end
  end
  failures
end
637
+ def to_s
638
+ "#<"+self.class.to_s+":"+self.object_id.to_s+" @fields=["+@fields.join(",")+"] labels="+@labels.inspect+" cases="+@vectors[@fields[0]].size.to_s
639
+ end
640
# Inspection output is identical to #to_s.
def inspect
  to_s
end
591
643
  def summary
592
644
  out=""
593
645
  out << "Summary for dataset\n"
@@ -600,10 +652,10 @@ module Statsample
600
652
  }
601
653
  out
602
654
  end
603
- def as_r
604
- require 'rsruby/dataframe'
605
- r=RSRuby.instance
606
-
607
- end
655
+ def as_r
656
+ require 'rsruby/dataframe'
657
+ r=RSRuby.instance
658
+
608
659
  end
660
+ end
609
661
  end