statsample 0.6.7 → 0.7.0

Sign up to get free protection for your applications and to get access to all the features.
data/History.txt CHANGED
@@ -1,5 +1,12 @@
1
+ === 0.7.0 / 2010-03-25
2
+ * Ported to ReportBuilder 1.x series
3
+ * Ruby-based implementation of covariance and correlation rewritten as clearer code
4
+ * Statsample::Vector#svggraph_frequencies accepts IO
5
+ * Some tests ported to Miniunit
6
+ * CSV on Ruby1.8 uses FasterCSV
7
+
1
8
  === 0.6.7 / 2010-03-23
2
- * Bug fix: dependency on Reportbuilder should be set to "~>0.2.0", not "0.2"
9
+ * Bug fix: dependency on ReportBuilder should be set to "~>0.2.0", not "0.2"
3
10
  === 0.6.6 / 2010-03-22
4
11
  * Set ReportBuilder dependency to '0.2.~' version, because of a future API break
5
12
  * Removed Alglib dependency
data/Manifest.txt CHANGED
@@ -34,8 +34,7 @@ lib/statsample/bivariate/polychoric.rb
34
34
  lib/statsample/bivariate/tetrachoric.rb
35
35
  lib/statsample/codification.rb
36
36
  lib/statsample/combination.rb
37
- lib/statsample/converter/csv18.rb
38
- lib/statsample/converter/csv19.rb
37
+ lib/statsample/converter/csv.rb
39
38
  lib/statsample/converter/spss.rb
40
39
  lib/statsample/converters.rb
41
40
  lib/statsample/crosstab.rb
data/Rakefile CHANGED
@@ -42,7 +42,7 @@ h=Hoe.spec('statsample') do
42
42
  self.version=Statsample::VERSION
43
43
  self.rubyforge_name = "ruby-statsample"
44
44
  self.developer('Claudio Bustos', 'clbustos@gmail.com')
45
- self.extra_deps << ["spreadsheet","~>0.6.0"] << ["svg-graph", "~>1.0"] << ["reportbuilder", "~>0.2.0"] << ["minimization", "~>0.1.0"]
45
+ self.extra_deps << ["spreadsheet","~>0.6.0"] << ["svg-graph", "~>1.0"] << ["reportbuilder", "~>1.0"] << ["minimization", "~>0.1.0"] << ["fastercsv"]
46
46
  self.clean_globs << "test/images/*" << "demo/item_analysis/*" << "demo/Regression"
47
47
  self.need_rdoc=false
48
48
  end
@@ -10,7 +10,7 @@ d=sample.times.collect {rand}.to_scale
10
10
 
11
11
  ds={'a'=>a,'b'=>b,'c'=>c,'d'=>d}.to_dataset
12
12
  ds['y']=ds.collect{|row| row['a']*5+row['b']*3+row['c']*2+row['d']+rand()}
13
- rb=ReportBuilder.new("Dominance Analysis")
13
+ rb=ReportBuilder.new(:name=>"Dominance Analysis")
14
14
 
15
15
  cm=Statsample::Bivariate.correlation_matrix(ds)
16
16
  rb.add(cm)
@@ -25,3 +25,4 @@ rb.add(da)
25
25
 
26
26
 
27
27
  puts rb.to_text
28
+ rb.save_rtf("dominance_analysis.rtf")
data/lib/statsample.rb CHANGED
@@ -23,7 +23,7 @@ $:.unshift(File.expand_path(File.dirname(__FILE__)+"/../ext"))
23
23
 
24
24
  require 'matrix'
25
25
  require 'distribution'
26
- raise "Install reportbuilder ~>0.2.0" unless gem 'reportbuilder','~>0.2.0'
26
+ raise "Install reportbuilder ~>1.0" unless gem 'reportbuilder','~>1.0'
27
27
  require 'reportbuilder'
28
28
  class Numeric
29
29
  def square ; self * self ; end
@@ -113,7 +113,7 @@ module Statsample
113
113
  false
114
114
  end
115
115
  end
116
- VERSION = '0.6.7'
116
+ VERSION = '0.7.0'
117
117
  SPLIT_TOKEN = ","
118
118
  autoload(:Database, 'statsample/converters')
119
119
  autoload(:Anova, 'statsample/anova')
@@ -24,14 +24,16 @@ module Statsample
24
24
  sum
25
25
  end
26
26
 
27
- def covariance_slow(v1a,v2a) # :nodoc:
28
- t=0
27
+ def covariance_slow(v1,v2) # :nodoc:
28
+ v1a,v2a=Statsample.only_valid(v1,v2)
29
+ sum_of_squares(v1a,v2a) / (v1a.size-1)
30
+ end
31
+ def sum_of_squares(v1,v2)
32
+ v1a,v2a=Statsample.only_valid(v1,v2)
29
33
  m1=v1a.mean
30
- m2=v1a.mean
31
- (0...v1a.size).each {|i| t+=((v1a[i]-m1)*(v2a[i]-m2)) }
32
- t.to_f / (v1a.size-1)
34
+ m2=v2a.mean
35
+ (v1a.size).times.inject(0) {|ac,i| ac+(v1a[i]-m1)*(v2a[i]-m2)}
33
36
  end
34
-
35
37
  # Calculate Pearson correlation coefficient (r) between 2 vectors
36
38
  def pearson(v1,v2)
37
39
  v1a,v2a=Statsample.only_valid(v1,v2)
@@ -42,12 +44,18 @@ module Statsample
42
44
  pearson_slow(v1a,v2a)
43
45
  end
44
46
  end
45
- def pearson_slow(v1a,v2a) # :nodoc:
46
- v1s,v2s=v1a.vector_standarized_pop,v2a.vector_standarized_pop
47
+ def pearson_slow(v1,v2) # :nodoc:
48
+ v1a,v2a=Statsample.only_valid(v1,v2)
49
+ # Calculate sum of squares
50
+ ss=sum_of_squares(v1a,v2a)
51
+ ss.quo(Math::sqrt(v1a.sum_of_squares) * Math::sqrt(v2a.sum_of_squares))
52
+ =begin
53
+ v1s,v2s=v1a.vector_standarized,v2a.vector_standarized
47
54
  t=0
48
55
  siz=v1s.size
49
56
  (0...v1s.size).each {|i| t+=(v1s[i]*v2s[i]) }
50
- t.to_f/v2s.size
57
+ t.quo(v2s.size-1)
58
+ =end
51
59
  end
52
60
  # Retrieves the value for t test for a pearson correlation
53
61
  # between two vectors to test the null hypothesis of r=0
@@ -279,6 +287,7 @@ module Statsample
279
287
  }
280
288
  a
281
289
  end
290
+ =begin
282
291
  def sum_of_codeviated(v1,v2)
283
292
  v1a,v2a=Statsample.only_valid(v1,v2)
284
293
  sum=0
@@ -287,7 +296,7 @@ module Statsample
287
296
  }
288
297
  sum-((v1a.sum*v2a.sum) / v1a.size.to_f)
289
298
  end
290
-
299
+ =end
291
300
  # Report the minimum number of cases valid of a covariate matrix
292
301
  # based on a dataset
293
302
  def min_n_valid(ds)
@@ -744,24 +744,24 @@ module Statsample
744
744
  rp.to_text
745
745
  end
746
746
 
747
- def to_reportbuilder(generator) # :nodoc:
747
+ def report_building(generator) # :nodoc:
748
748
  compute if @r.nil?
749
749
  section=ReportBuilder::Section.new(:name=>@name)
750
750
  t=ReportBuilder::Table.new(:name=>_("Contingence Table"),:header=>[""]+(@n.times.collect {|i| "Y=#{i}"})+["Total"])
751
751
  @m.times do |i|
752
- t.add_row(["X = #{i}"]+(@n.times.collect {|j| @matrix[i,j]}) + [@sumr[i]])
752
+ t.row(["X = #{i}"]+(@n.times.collect {|j| @matrix[i,j]}) + [@sumr[i]])
753
753
  end
754
- t.add_hr
755
- t.add_row(["T"]+(@n.times.collect {|j| @sumc[j]})+[@total])
754
+ t.hr
755
+ t.row(["T"]+(@n.times.collect {|j| @sumc[j]})+[@total])
756
756
  section.add(t)
757
757
  #generator.parse_element(t)
758
758
  section.add(sprintf("r: %0.4f",r))
759
759
  t=ReportBuilder::Table.new(:name=>_("Thresholds"), :header=>["","Value"])
760
760
  threshold_x.each_with_index {|val,i|
761
- t.add_row(["Threshold X #{i}", sprintf("%0.4f", val)])
761
+ t.row(["Threshold X #{i}", sprintf("%0.4f", val)])
762
762
  }
763
763
  threshold_y.each_with_index {|val,i|
764
- t.add_row(["Threshold Y #{i}", sprintf("%0.4f", val)])
764
+ t.row(["Threshold Y #{i}", sprintf("%0.4f", val)])
765
765
  }
766
766
  section.add(t)
767
767
  section.add(_("Test of bivariate normality: X2 = %0.3f, df = %d, p= %0.5f" % [ chi_square, chi_square_df, 1-Distribution::ChiSquare.cdf(chi_square, chi_square_df)]))
@@ -114,18 +114,18 @@ module Statsample
114
114
  end
115
115
  # Summary of the analysis
116
116
  def summary
117
- rp=ReportBuilder.new()
117
+ rp=ReportBuilder.new(:name=>@name)
118
118
  rp.add(self)
119
119
  rp.to_text
120
120
  end
121
121
 
122
- def to_reportbuilder(generator) # :nodoc:
122
+ def report_building(generator) # :nodoc:
123
123
  section=ReportBuilder::Section.new(:name=>@name)
124
124
  t=ReportBuilder::Table.new(:name=>_("Contingence Table"),:header=>["","Y=0","Y=1", "T"])
125
- t.add_row(["X=0", @a,@b,@a+@b])
126
- t.add_row(["X=1", @c,@d,@c+@d])
127
- t.add_hr
128
- t.add_row(["T", @a+@c,@b+@d,@a+@b+@c+@d])
125
+ t.row(["X=0", @a,@b,@a+@b])
126
+ t.row(["X=1", @c,@d,@c+@d])
127
+ t.hr
128
+ t.row(["T", @a+@c,@b+@d,@a+@b+@c+@d])
129
129
  section.add(t)
130
130
  #generator.parse_element(t)
131
131
  section.add(sprintf("r: %0.3f",r))
@@ -1,3 +1,12 @@
1
+ if RUBY_VERSION<"1.9"
2
+ require 'fastercsv'
3
+ Statsample::CSV_klass=FasterCSV
4
+ else
5
+ require 'csv'
6
+ Statsample::CSV_klass=CSV
7
+
8
+ end
9
+
1
10
  module Statsample
2
11
  class CSV < SpreadsheetBase
3
12
  class << self
@@ -6,7 +15,7 @@ module Statsample
6
15
  # USE:
7
16
  # ds=Statsample::CSV.read("test_csv.csv")
8
17
  def read(filename, empty=[''],ignore_lines=0,fs=nil,rs=nil)
9
- require 'csv'
18
+
10
19
  first_row=true
11
20
  fields=[]
12
21
  fields_data={}
@@ -15,7 +24,7 @@ module Statsample
15
24
  opts={}
16
25
  opts[:col_sep]=fs unless fs.nil?
17
26
  opts[:row_sep]=rs unless rs.nil?
18
- csv=::CSV.open(filename,'r',opts)
27
+ csv=Statsample::CSV_klass.send(:open, filename,'r',opts)
19
28
  csv.each do |row|
20
29
  line_number+=1
21
30
  if(line_number<=ignore_lines)
@@ -41,8 +50,8 @@ module Statsample
41
50
  # USE:
42
51
  # Statsample::CSV.write(ds,"test_csv.csv")
43
52
  def write(dataset,filename, convert_comma=false,*opts)
44
- require 'csv'
45
- writer=::CSV.open(filename,'w',*opts)
53
+
54
+ writer=Statsample::CSV_klass.send(:open, filename,'w',*opts)
46
55
  writer << dataset.fields
47
56
  dataset.each_array do|row|
48
57
  if(convert_comma)
@@ -351,9 +351,5 @@ out
351
351
  end
352
352
  end
353
353
 
354
- if RUBY_VERSION<"1.9"
355
- require 'statsample/converter/csv18.rb'
356
- else
357
- require 'statsample/converter/csv19.rb'
358
- end
354
+ require 'statsample/converter/csv.rb'
359
355
 
@@ -93,17 +93,17 @@ module Statsample
93
93
  def cols_empty_hash
94
94
  cols_names.inject({}) {|a,x| a[x]=0;a}
95
95
  end
96
- def to_reportbuilder(generator)
97
- anchor=generator.add_toc_entry(_("Crosstab: ")+name)
98
- generator.add_html "<div class='crosstab'>"+_("Crosstab")+" #{@name}<a name='#{anchor}'></a>"
96
+ def report_building(generator)
97
+ anchor=generator.toc_entry(_("Crosstab: ")+name)
98
+ generator.html "<div class='crosstab'>"+_("Crosstab")+" #{@name}<a name='#{anchor}'></a>"
99
99
  fq=frequencies
100
100
  rn=rows_names
101
101
  cn=cols_names
102
102
  total=0
103
103
  total_cols=cols_empty_hash
104
- generator.add_text "Chi Square: #{chi_square}"
105
- generator.add_text(_("Rows: %s") % @row_label) unless @row_label.nil?
106
- generator.add_text(_("Columns: %s") % @column_label) unless @column_label.nil?
104
+ generator.text "Chi Square: #{chi_square}"
105
+ generator.text(_("Rows: %s") % @row_label) unless @row_label.nil?
106
+ generator.text(_("Columns: %s") % @column_label) unless @column_label.nil?
107
107
 
108
108
  t=ReportBuilder::Table.new(:name=>@name+" - "+_("Raw"), :header=>[""]+cols_names.collect {|c| @v_cols.labeling(c)}+[_("Total")])
109
109
  rn.each do |row|
@@ -117,15 +117,15 @@ module Statsample
117
117
  t_row.push(data)
118
118
  end
119
119
  t_row.push(total_row)
120
- t.add_row(t_row)
120
+ t.row(t_row)
121
121
  end
122
- t.add_horizontal_line
122
+ t.hr
123
123
  t_row=[_("Total")]
124
124
  cn.each do |v|
125
125
  t_row.push(total_cols[v])
126
126
  end
127
127
  t_row.push(total)
128
- t.add_row(t_row)
128
+ t.row(t_row)
129
129
  generator.parse_element(t)
130
130
 
131
131
  if(@percentage_row)
@@ -138,7 +138,7 @@ module Statsample
138
138
  table_percentage(generator,:total)
139
139
  end
140
140
 
141
- generator.add_html("</div>")
141
+ generator.html("</div>")
142
142
  end
143
143
 
144
144
 
@@ -174,10 +174,10 @@ module Statsample
174
174
  when :total then @cases
175
175
  end
176
176
  t_row.push(sprintf("%0.2f%%", rt[row]*100.0/total))
177
- t.add_row(t_row)
177
+ t.row(t_row)
178
178
  end
179
179
 
180
- t.add_horizontal_line
180
+ t.hr
181
181
  t_row=[_("Total")]
182
182
  cn.each{|col|
183
183
  total=case type
@@ -188,7 +188,7 @@ module Statsample
188
188
  t_row.push(sprintf("%0.2f%%", ct[col]*100.0/total))
189
189
  }
190
190
  t_row.push("100%")
191
- t.add_row(t_row)
191
+ t.row(t_row)
192
192
  generator.parse_element(t)
193
193
  end
194
194
  end
@@ -154,7 +154,7 @@ module Statsample
154
154
  # Creates a copy of the given dataset, deleting all the cases with
155
155
  # missing data on one of the vectors
156
156
  def dup_only_valid
157
- if @vectors.find{|field,vector| vector.has_missing_data?}
157
+ if @vectors.any?{|field,vector| vector.has_missing_data?}
158
158
  ds=dup_empty
159
159
  each_array { |c|
160
160
  ds.add_case_array(c) unless @fields.find{|f| @vectors[f].data_with_nils[@i].nil? }
@@ -313,44 +313,46 @@ module Statsample
313
313
  rp.add(self)
314
314
  rp.to_text
315
315
  end
316
- def to_reportbuilder(generator)
316
+ def report_building(generator)
317
317
  compute if @models.nil?
318
- anchor=generator.add_toc_entry(_("DA: ")+@name)
319
- generator.add_html "<div class='dominance-analysis'>#{@name}<a name='#{anchor}'></a>"
320
- t=ReportBuilder::Table.new(:name=>_("Dominance Analysis result"))
321
318
 
322
- t.header=["","r2",_("sign")]+@predictors.collect {|c| DominanceAnalysis.predictor_name(c) }
319
+ anchor=generator.toc_entry(_("DA: ")+@name)
320
+
321
+ generator.html "<div class='dominance-analysis'>#{@name}<a name='#{anchor}'></a>"
322
+ header=["","r2",_("sign")]+@predictors.collect {|c| DominanceAnalysis.predictor_name(c) }
323
+ t=ReportBuilder::Table.new(:name=>_("Dominance Analysis result"),:header=>header)
324
+
323
325
  row=[_("Model 0"),"",""]+@predictors.collect{|f|
324
326
  sprintf("%0.3f", md([f]).r2)
325
327
  }
326
- t.add_row(row)
327
- t.add_horizontal_line
328
+
329
+ t.row(row)
330
+ t.hr
328
331
  for i in 1..@predictors.size
329
332
  mk=md_k(i)
330
333
  mk.each{|m|
331
- t.add_row(m.add_table_row)
334
+ t.row(m.add_table_row)
332
335
  }
333
336
  # Report averages
334
337
  a=average_k(i)
335
338
  if !a.nil?
336
- t.add_horizontal_line
339
+ t.hr
337
340
  row=[_("k=%d Average") % i,"",""] + @predictors.collect{|f|
338
341
  sprintf("%0.3f",a[f])
339
342
  }
340
- t.add_row(row)
341
- t.add_horizontal_line
343
+ t.row(row)
344
+ t.hr
342
345
 
343
346
  end
344
-
345
347
  end
346
348
 
347
349
  g=general_averages
348
- t.add_horizontal_line
350
+ t.hr
349
351
 
350
352
  row=[_("Overall averages"),"",""]+@predictors.collect{|f|
351
353
  sprintf("%0.3f",g[f])
352
354
  }
353
- t.add_row(row)
355
+ t.row(row)
354
356
  generator.parse_element(t)
355
357
 
356
358
  td=total_dominance
@@ -360,10 +362,10 @@ module Statsample
360
362
  pairs.each{|p|
361
363
  name=p.join(" - ")
362
364
  row=[name, sprintf("%0.1f",td[p]), sprintf("%0.1f",cd[p]), sprintf("%0.1f",gd[p])]
363
- t.add_row(row)
365
+ t.row(row)
364
366
  }
365
367
  generator.parse_element(t)
366
- generator.add_html("</div>")
368
+ generator.html("</div>")
367
369
  end
368
370
  class ModelData # :nodoc:
369
371
  attr_reader :contributions
@@ -178,39 +178,39 @@ module Statsample
178
178
  def t
179
179
  Distribution::T.p_value(1-((1-@alpha) / 2), @n_samples - 1)
180
180
  end
181
- def to_reportbuilder(generator) # :nodoc:
181
+ def report_building(generator) # :nodoc:
182
182
  raise "You should bootstrap first" if @n_samples==0
183
- anchor=generator.add_toc_entry(_("DAB: ")+@name)
184
- generator.add_html "<div class='dominance-analysis-bootstrap'>#{@name}<a name='#{anchor}'></a>"
183
+ anchor=generator.toc_entry(_("DAB: ")+@name)
184
+ generator.html "<div class='dominance-analysis-bootstrap'>#{@name}<a name='#{anchor}'></a>"
185
185
 
186
- generator.add_text _("Sample size: %d\n") % @n_samples
187
- generator.add_text "t: #{t}\n"
188
- generator.add_text _("Linear Regression Engine: %s") % @regression_class.name
186
+ generator.text _("Sample size: %d\n") % @n_samples
187
+ generator.text "t: #{t}\n"
188
+ generator.text _("Linear Regression Engine: %s") % @regression_class.name
189
189
 
190
190
  table=ReportBuilder::Table.new(:name=>"Bootstrap report", :header => [_("pairs"), "sD","Dij", _("SE(Dij)"), "Pij", "Pji", "Pno", _("Reproducibility")])
191
- table.add_row([_("Complete dominance")])
192
- table.add_horizontal_line
191
+ table.row([_("Complete dominance")])
192
+ table.hr
193
193
  @pairs.each{|pair|
194
194
  std=@samples_td[pair].to_vector(:scale)
195
195
  ttd=da.total_dominance_pairwise(pair[0],pair[1])
196
- table.add_row(summary_pairs(pair,std,ttd))
196
+ table.row(summary_pairs(pair,std,ttd))
197
197
  }
198
- table.add_horizontal_line
199
- table.add_row([_("Conditional dominance")])
200
- table.add_horizontal_line
198
+ table.hr
199
+ table.row([_("Conditional dominance")])
200
+ table.hr
201
201
  @pairs.each{|pair|
202
202
  std=@samples_cd[pair].to_vector(:scale)
203
203
  ttd=da.conditional_dominance_pairwise(pair[0],pair[1])
204
- table.add_row(summary_pairs(pair,std,ttd))
204
+ table.row(summary_pairs(pair,std,ttd))
205
205
 
206
206
  }
207
- table.add_horizontal_line
208
- table.add_row([_("General Dominance")])
209
- table.add_horizontal_line
207
+ table.hr
208
+ table.row([_("General Dominance")])
209
+ table.hr
210
210
  @pairs.each{|pair|
211
211
  std=@samples_gd[pair].to_vector(:scale)
212
212
  ttd=da.general_dominance_pairwise(pair[0],pair[1])
213
- table.add_row(summary_pairs(pair,std,ttd))
213
+ table.row(summary_pairs(pair,std,ttd))
214
214
  }
215
215
  generator.parse_element(table)
216
216
 
@@ -219,12 +219,12 @@ module Statsample
219
219
  @fields.each{|f|
220
220
  v=@samples_ga[f].to_vector(:scale)
221
221
  row=[@ds.label(f), sprintf("%0.3f",v.mean), sprintf("%0.3f",v.sd), sprintf("%0.3f",v.percentil(5)),sprintf("%0.3f",v.percentil(95))]
222
- table.add_row(row)
222
+ table.row(row)
223
223
 
224
224
  }
225
225
 
226
226
  generator.parse_element(table)
227
- generator.add_html("</div>")
227
+ generator.html("</div>")
228
228
  end
229
229
  def summary_pairs(pair,std,ttd)
230
230
  freqs=std.proportions