statsample 0.4.1 → 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,60 @@
module Statsample
  # CSV reader/writer for Datasets, built on the standard +csv+ library.
  class CSV < SpreadsheetBase
    class << self
      # Returns a Dataset based on a csv file
      #
      # USE:
      #     ds=Statsample::CSV.read("test_csv.csv")
      #
      # Parameters:
      # filename::     path of the csv file to read
      # empty::        forwarded untouched to +process_row+ (defined outside
      #                this class; presumably the cell values to treat as
      #                missing -- confirm against SpreadsheetBase)
      # ignore_lines:: number of leading lines skipped before the header row
      # fs::           column separator, passed as +:col_sep+ when not nil
      # rs::           row separator, passed as +:row_sep+ when not nil
      #
      # The first line that is not skipped is handed to +extract_fields+ to
      # obtain the field names; every later line becomes one case.
      def read(filename, empty=[''],ignore_lines=0,fs=nil,rs=nil)
        require 'csv'
        opts={}
        opts[:col_sep]=fs unless fs.nil?
        opts[:row_sep]=rs unless rs.nil?
        fields=[]
        ds=nil
        line_number=0
        # NOTE(review): the options hash is passed positionally, which is the
        # calling convention of the csv library this file was written for;
        # confirm before running on interpreters whose CSV.open only accepts
        # keyword options.
        # The block form guarantees the file handle is closed, even when a
        # malformed line raises in the middle of the file.
        ::CSV.open(filename,'r',opts) do |csv|
          csv.each do |row|
            line_number+=1
            next if line_number<=ignore_lines
            cells=row.collect {|c| c.to_s }
            if ds.nil?
              # Header row: it defines the fields of the dataset
              fields=extract_fields(cells)
              ds=Statsample::Dataset.new(fields)
            else
              ds.add_case(process_row(cells,empty),false)
            end
          end
        end
        convert_to_scale(ds,fields)
        ds.update_valid_data
        ds
      end
      # Save a Dataset on a csv file
      #
      # USE:
      #     Statsample::CSV.write(ds,"test_csv.csv")
      #
      # Parameters:
      # dataset::       dataset to dump; its +fields+ become the header row
      # filename::      path of the csv file to create or overwrite
      # convert_comma:: when true, every value is converted to a String and
      #                 each "." in it is replaced by ","
      # opts::          extra arguments forwarded to ::CSV.open
      #
      # Returns nil.
      def write(dataset,filename, convert_comma=false,*opts)
        require 'csv'
        # Block form: the writer is flushed and closed even if a row raises
        ::CSV.open(filename,'w',*opts) do |writer|
          writer << dataset.fields
          dataset.each_array do |row|
            # Build a new array instead of mutating the one we were handed
            row=row.collect {|v| v.to_s.gsub(".",",") } if convert_comma
            writer << row
          end
        end
        nil
      end
    end
  end
end
@@ -1,7 +1,6 @@
1
1
  module Statsample
2
2
  # Create and dumps Datasets on a database
3
3
  module Database
4
- require 'dbi'
5
4
  class << self
6
5
  # Read a database query and returns a Dataset
7
6
  #
@@ -11,6 +10,7 @@ module Statsample
11
10
  # Statsample.read(dbh, "SELECT * FROM test")
12
11
  #
13
12
  def read(dbh,query)
13
+ require 'dbi'
14
14
  sth=dbh.execute(query)
15
15
  vectors={}
16
16
  fields=[]
@@ -35,6 +35,7 @@ module Statsample
35
35
  # Statsample::Database.insert(ds,dbh,"test")
36
36
  #
37
37
  def insert(ds, dbh,table)
38
+ require 'dbi'
38
39
  query="INSERT INTO #{table} ("+ds.fields.join(",")+") VALUES ("+((["?"]*ds.fields.size).join(","))+")"
39
40
  sth=dbh.prepare(query)
40
41
  ds.each_array{|c|
@@ -64,15 +65,11 @@ module Statsample
64
65
  def write(dataset,filename)
65
66
  File.open(filename,"wb") do |fp|
66
67
  fp.puts dataset.fields.join("\t")
67
- dataset.each {|row|
68
- values=dataset.fields.collect{|f|
69
- if dataset[f].is_valid? row[f]
70
- row[f]
71
- else
72
- ""
73
- end
68
+ dataset.each_array_with_nils{|row|
69
+ row2=row.collect{|v|
70
+ v.nil? ? "NA" : v.to_s.gsub(/\s+/,"_")
74
71
  }
75
- fp.puts(values.join("\t"))
72
+ fp.puts row2.join("\t")
76
73
  }
77
74
  end
78
75
  end
@@ -194,7 +191,7 @@ module Statsample
194
191
  first_row=false
195
192
  else
196
193
  rowa=process_row(row,empty)
197
- (fields.size - rowa.size).times {|i|
194
+ (fields.size - rowa.size).times {
198
195
  rowa << nil
199
196
  }
200
197
  ds.add_case(rowa,false)
@@ -210,59 +207,6 @@ module Statsample
210
207
  end
211
208
  end
212
209
  end
213
- class CSV < SpreadsheetBase
214
- class << self
215
- # Returns a Dataset based on a csv file
216
- #
217
- # USE:
218
- # ds=Statsample::CSV.read("test_csv.csv")
219
- def read(filename, empty=[''],ignore_lines=0,fs=nil,rs=nil)
220
- require 'csv'
221
- first_row=true
222
- fields=[]
223
- fields_data={}
224
- ds=nil
225
- line_number=0
226
- ::CSV.open(filename,'r',fs,rs) do |row|
227
- line_number+=1
228
- if(line_number<=ignore_lines)
229
- #puts "Skip line"
230
- next
231
- end
232
- row.collect!{|c|
233
- c.to_s
234
- }
235
- if first_row
236
- fields=extract_fields(row)
237
- ds=Statsample::Dataset.new(fields)
238
- first_row=false
239
- else
240
- rowa=process_row(row,empty)
241
- ds.add_case(rowa,false)
242
- end
243
- end
244
- convert_to_scale(ds,fields)
245
- ds.update_valid_data
246
- ds
247
- end
248
- # Save a Dataset on a csv file
249
- #
250
- # USE:
251
- # Statsample::CSV.write(ds,"test_csv.csv")
252
- def write(dataset,filename, convert_comma=false,*opts)
253
- require 'csv'
254
- writer=::CSV.open(filename,'w',*opts)
255
- writer << dataset.fields
256
- dataset.each_array{|row|
257
- if(convert_comma)
258
- row.collect!{|v| v.to_s.gsub(".",",")}
259
- end
260
- writer << row
261
- }
262
- writer.close
263
- end
264
- end
265
- end
266
210
  module Mx
267
211
  class << self
268
212
  def write(dataset,filename,type=:covariance)
@@ -309,13 +253,13 @@ module Statsample
309
253
  end
310
254
  def out(dataset,opt={})
311
255
  require 'ostruct'
312
- default_opt = {:dataname => "Default", :description=>""}
256
+ default_opt = {:dataname => "Default", :description=>"", :missing=>"NA"}
313
257
  default_opt.merge! opt
314
258
  carrier=OpenStruct.new
315
259
  carrier.categorials=[]
316
260
  carrier.conversions={}
317
- variables_def=dataset.vectors.collect{|k,v|
318
- variable_definition(carrier,v,k)
261
+ variables_def=dataset.fields.collect{|k|
262
+ variable_definition(carrier,dataset[k],k)
319
263
  }.join("\n")
320
264
 
321
265
  indexes=carrier.categorials.inject({}) {|s,c|
@@ -327,7 +271,7 @@ module Statsample
327
271
  indexes.each{|ik,iv|
328
272
  c[ik]=carrier.conversions[iv][c[ik]]
329
273
  }
330
- records << "<record>#{values_definition(c)}</record>\n"
274
+ records << "<record>#{values_definition(c, default_opt[:missing])}</record>\n"
331
275
  }
332
276
 
333
277
  out=<<EOC
@@ -339,7 +283,7 @@ out=<<EOC
339
283
  <variables count="#{dataset.fields.size}">
340
284
  #{variables_def}
341
285
  </variables>
342
- <records count="#{dataset.cases}">
286
+ <records count="#{dataset.cases}" missingValue="#{default_opt[:missing]}">
343
287
  #{records}
344
288
  </records>
345
289
 
@@ -350,14 +294,14 @@ EOC
350
294
  out
351
295
 
352
296
  end
353
- def values_definition(c)
297
+ def values_definition(c,missing)
354
298
  c.collect{|v|
355
- if v.is_a? Float
356
- "<real>#{v}</real>"
357
- elsif v.is_a? Integer
358
- "<int>#{v}</int>"
299
+ if v.nil?
300
+ "#{missing}"
301
+ elsif v.is_a? Numeric
302
+ "#{v}"
359
303
  else
360
- "<string>#{v}</string>"
304
+ "#{v.gsub(/\s+/,"_")}"
361
305
  end
362
306
  }.join(" ")
363
307
  end
@@ -370,7 +314,7 @@ out
370
314
  if v.type==:nominal or v.data.find {|d| d.is_a? String }
371
315
  carrier.categorials.push(name)
372
316
  carrier.conversions[name]={}
373
- factors=v.data.uniq.sort
317
+ factors=v.factors
374
318
  out ="<categoricalvariable name=\"#{name}\" #{nickname}>\n"
375
319
  out << "<levels count=\"#{factors.size}\">\n"
376
320
  out << (1..factors.size).to_a.collect{|i|
@@ -389,3 +333,10 @@ out
389
333
  end
390
334
  end
391
335
  end
336
+
337
+ if RUBY_VERSION<"1.9"
338
+ require 'statsample/converter/csv18.rb'
339
+ else
340
+ require 'statsample/converter/csv19.rb'
341
+ end
342
+
@@ -292,7 +292,7 @@ module Statsample
292
292
  end
293
293
  def check_length
294
294
  size=nil
295
- @vectors.each{|k,v|
295
+ @vectors.each do |k,v|
296
296
  raise Exception, "Data #{v.class} is not a vector on key #{k}" if !v.is_a? Statsample::Vector
297
297
  if size.nil?
298
298
  size=v.size
@@ -302,38 +302,33 @@ module Statsample
302
302
  raise Exception, "Vector #{k} have size #{v.size} and dataset have size #{size}"
303
303
  end
304
304
  end
305
- }
306
- @cases=size
307
305
  end
308
- def each_vector
309
- @fields.each{|k|
310
- yield k,@vectors[k]
311
- }
306
+ @cases=size
307
+ end
308
+ def each_vector
309
+ @fields.each{|k| yield k,@vectors[k]}
310
+ end
311
+ if Statsample::STATSAMPLE__.respond_to?(:case_as_hash)
312
+ def case_as_hash(c) # :nodoc:
313
+ Statsample::STATSAMPLE__.case_as_hash(self,c)
312
314
  end
313
- if Statsample::STATSAMPLE__.respond_to?(:case_as_hash)
314
- def case_as_hash(c) # :nodoc:
315
- Statsample::STATSAMPLE__.case_as_hash(self,c)
316
- end
317
- else
318
- def case_as_hash(c)
319
- _case_as_hash(c)
320
- end
315
+ else
316
+ def case_as_hash(c)
317
+ _case_as_hash(c)
318
+ end
319
+ end
320
+
321
+ if Statsample::STATSAMPLE__.respond_to?(:case_as_array)
322
+ def case_as_array(c) # :nodoc:
323
+ Statsample::STATSAMPLE__.case_as_array(self,c)
321
324
  end
322
-
323
- if Statsample::STATSAMPLE__.respond_to?(:case_as_array)
324
- def case_as_array(c) # :nodoc:
325
- Statsample::STATSAMPLE__.case_as_array(self,c)
326
- end
327
- else
328
- def case_as_array(c)
329
- _case_as_array(c)
330
- end
325
+ else
326
+ def case_as_array(c)
327
+ _case_as_array(c)
331
328
  end
332
- def _case_as_hash(c) # :nodoc:
333
- @fields.inject({}) {|a,x|
334
- a[x]=@vectors[x][c]
335
- a
336
- }
329
+ end
330
+ def _case_as_hash(c) # :nodoc:
331
+ @fields.inject({}) {|a,x| a[x]=@vectors[x][c];a }
337
332
  end
338
333
  def _case_as_array(c) # :nodoc:
339
334
  @fields.collect {|x| @vectors[x][c]}
@@ -366,6 +361,20 @@ module Statsample
366
361
  raise DatasetException.new(self,e)
367
362
  end
368
363
  end
# Returns each case as an array, coding missing values as nils.
#
# Cases are yielded in order; every row follows the order of +fields+ and
# each cell is read from the +data_with_nils+ of the matching vector.
# While a case is being yielded, @i holds its index; @i is set back to nil
# when the iteration ends. Returns nil.
def each_array_with_nils
  begin
    @cases.times do |i|
      @i=i
      yield fields.collect {|f| @vectors[f].data_with_nils[i] }
    end
    nil
  ensure
    # Clear the cursor even when the caller's block breaks or raises,
    # so a stale case index is never left behind.
    @i=nil
  end
end
369
378
  # Returns each case as an array
370
379
  def each_array
371
380
  @cases.times {|i|
@@ -124,8 +124,8 @@ module Statsample
124
124
  }
125
125
  }
126
126
  out={}
127
- averages.each{|k,v|
128
- out[k]=v.to_vector(:scale).mean
127
+ averages.each{|key,val|
128
+ out[key]=val.to_vector(:scale).mean
129
129
  }
130
130
  out
131
131
  end
@@ -139,8 +139,8 @@ module Statsample
139
139
  }
140
140
  end
141
141
  out={}
142
- averages.each{|k,v|
143
- out[k]=v.to_vector(:scale).mean
142
+ averages.each{|key,val|
143
+ out[key]=val.to_vector(:scale).mean
144
144
  }
145
145
  @general_averages=out
146
146
  end
@@ -152,8 +152,8 @@ module Statsample
152
152
  for i in 1..@fields.size
153
153
  c=Statsample::Combination.new(i,@fields.size)
154
154
  c.each{|data|
155
- convert=data.collect {|i|
156
- @fields[i]
155
+ convert=data.collect {|i1|
156
+ @fields[i1]
157
157
  }
158
158
  @models.push(convert)
159
159
  ds_prev=@ds.dup(convert+[@y_var])
@@ -11,7 +11,8 @@ module Statsample
11
11
  options.each{|k,v|
12
12
  gdc.send(k+"=",v)
13
13
  }
14
- f=File.open(file,"w") {|f|
14
+
15
+ File.open(file,"w") {|f|
15
16
  gdc.out_graph(width,height,f,chart_type, data.length/num_datasets,labels,num_datasets,data)
16
17
  }
17
18
  end
@@ -9,19 +9,20 @@ module Statsample
9
9
  class Vector
10
10
  # Creates a barchart using ruby-gdchart
11
11
  def svggraph_frequencies(file, width=600, height=300, chart_type=SVG::Graph::BarNoOp, options={})
12
- labels,data=[],[]
12
+ labels, data1=[],[]
13
13
  self.frequencies.sort.each{|k,v|
14
14
  labels.push(k.to_s)
15
- data.push(v)
15
+ data1.push(v)
16
16
  }
17
17
  options[:height]=height
18
18
  options[:width]=width
19
19
  options[:fields]=labels
20
20
  graph = chart_type.new(options)
21
21
  graph.add_data(
22
- :data => data,
22
+ :data => data1,
23
23
  :title => "Frequencies"
24
24
  )
25
+
25
26
  File.open(file,"w") {|f|
26
27
  f.puts(graph.burn)
27
28
  }
@@ -49,7 +50,7 @@ module Statsample
49
50
  def svggraph_boxplot(options={})
50
51
  check_type :scale
51
52
  options={:graph_title=>"Boxplot", :fields=>['vector'], :show_graph_title=>true}.merge! options
52
- vx=@data.to_a.to_vector(:scale)
53
+ vx=@valid_data.to_a.to_vector(:scale)
53
54
  graph = Statsample::Graph::SvgBoxplot.new(options)
54
55
  graph.add_data(:title=>"vector", :data=>@data.to_a)
55
56
  graph
@@ -58,8 +59,8 @@ module Statsample
58
59
  def svggraph_lag_plot(options={})
59
60
  check_type :scale
60
61
  options={:graph_title=>"Lag Plot", :show_graph_title=>true}.merge! options
61
- vx=@data[0...(@data.size-1)].to_vector(:scale)
62
- vy=@data[1...@data.size].to_vector(:scale)
62
+ vx=@valid_data[0...(@valid_data.size-1)].to_vector(:scale)
63
+ vy=@valid_data[1...@valid_data.size].to_vector(:scale)
63
64
  ds={'x_minus_1'=>vx,'x'=>vy}.to_dataset
64
65
  graph = Statsample::Graph::SvgScatterplot.new(ds,options)
65
66
  graph.set_x('x_minus_1')
@@ -73,11 +74,11 @@ module Statsample
73
74
  extend Statsample::Util
74
75
  check_type :scale
75
76
  options={:graph_title=>"Normal Probability Plot", :show_graph_title=>true}.merge! options
76
- n=@data.size
77
- vx=(1..@data.size).to_a.collect{|i|
77
+ n=@valid_data.size
78
+ vx=(1..@valid_data.size).to_a.collect{|i|
78
79
  Distribution::Normal.p_value(normal_order_statistic_medians(i,n))
79
80
  }.to_vector(:scale)
80
- vy=@data.sort.to_vector(:scale)
81
+ vy=@valid_data.sort.to_vector(:scale)
81
82
  ds={'normal_order_statistics_medians'=>vx, 'ordered_response'=>vy}.to_dataset
82
83
  graph = Statsample::Graph::SvgScatterplot.new(ds,options)
83
84
  graph.set_x('normal_order_statistics_medians')
@@ -50,13 +50,13 @@ module Statsample
50
50
  class StratifiedSample
51
51
  class << self
52
52
  # mean for an array of vectors
53
- def mean(*v)
53
+ def mean(*vectors)
54
54
  n_total=0
55
- a=v.inject(0){|a,v|
55
+ means=vectors.inject(0){|a,v|
56
56
  n_total+=v.size
57
57
  a+v.sum
58
58
  }
59
- a.to_f/n_total
59
+ means.to_f/n_total
60
60
  end
61
61
 
62
62
  def standard_error_ksd_wr(es)