statsample 0.6.7 → 0.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/History.txt CHANGED
@@ -1,5 +1,12 @@
1
+ === 0.7.0 / 2010-03-25
2
+ * Ported to ReportBuilder 1.x series
3
+ * Implementation of Ruby-based covariance and correlation changed to clearer code
4
+ * Statsample::Vector#svggraph_frequencies accepts IO
5
+ * Some tests ported to Miniunit
6
+ * CSV on Ruby1.8 uses FasterCSV
7
+
1
8
  === 0.6.7 / 2010-03-23
2
- * Bug fix: dependency on Reportbuilder should be set to "~>0.2.0", not "0.2"
9
+ * Bug fix: dependency on ReportBuilder should be set to "~>0.2.0", not "0.2"
3
10
  === 0.6.6 / 2010-03-22
4
11
  * Set ReportBuilder dependency to '0.2.~' version, because of future API breaks
5
12
  * Removed Alglib dependency
data/Manifest.txt CHANGED
@@ -34,8 +34,7 @@ lib/statsample/bivariate/polychoric.rb
34
34
  lib/statsample/bivariate/tetrachoric.rb
35
35
  lib/statsample/codification.rb
36
36
  lib/statsample/combination.rb
37
- lib/statsample/converter/csv18.rb
38
- lib/statsample/converter/csv19.rb
37
+ lib/statsample/converter/csv.rb
39
38
  lib/statsample/converter/spss.rb
40
39
  lib/statsample/converters.rb
41
40
  lib/statsample/crosstab.rb
data/Rakefile CHANGED
@@ -42,7 +42,7 @@ h=Hoe.spec('statsample') do
42
42
  self.version=Statsample::VERSION
43
43
  self.rubyforge_name = "ruby-statsample"
44
44
  self.developer('Claudio Bustos', 'clbustos@gmail.com')
45
- self.extra_deps << ["spreadsheet","~>0.6.0"] << ["svg-graph", "~>1.0"] << ["reportbuilder", "~>0.2.0"] << ["minimization", "~>0.1.0"]
45
+ self.extra_deps << ["spreadsheet","~>0.6.0"] << ["svg-graph", "~>1.0"] << ["reportbuilder", "~>1.0"] << ["minimization", "~>0.1.0"] << ["fastercsv"]
46
46
  self.clean_globs << "test/images/*" << "demo/item_analysis/*" << "demo/Regression"
47
47
  self.need_rdoc=false
48
48
  end
@@ -10,7 +10,7 @@ d=sample.times.collect {rand}.to_scale
10
10
 
11
11
  ds={'a'=>a,'b'=>b,'c'=>c,'d'=>d}.to_dataset
12
12
  ds['y']=ds.collect{|row| row['a']*5+row['b']*3+row['c']*2+row['d']+rand()}
13
- rb=ReportBuilder.new("Dominance Analysis")
13
+ rb=ReportBuilder.new(:name=>"Dominance Analysis")
14
14
 
15
15
  cm=Statsample::Bivariate.correlation_matrix(ds)
16
16
  rb.add(cm)
@@ -25,3 +25,4 @@ rb.add(da)
25
25
 
26
26
 
27
27
  puts rb.to_text
28
+ rb.save_rtf("dominance_analysis.rtf")
data/lib/statsample.rb CHANGED
@@ -23,7 +23,7 @@ $:.unshift(File.expand_path(File.dirname(__FILE__)+"/../ext"))
23
23
 
24
24
  require 'matrix'
25
25
  require 'distribution'
26
- raise "Install reportbuilder ~>0.2.0" unless gem 'reportbuilder','~>0.2.0'
26
+ raise "Install reportbuilder ~>1.0" unless gem 'reportbuilder','~>1.0'
27
27
  require 'reportbuilder'
28
28
  class Numeric
29
29
  def square ; self * self ; end
@@ -113,7 +113,7 @@ module Statsample
113
113
  false
114
114
  end
115
115
  end
116
- VERSION = '0.6.7'
116
+ VERSION = '0.7.0'
117
117
  SPLIT_TOKEN = ","
118
118
  autoload(:Database, 'statsample/converters')
119
119
  autoload(:Anova, 'statsample/anova')
@@ -24,14 +24,16 @@ module Statsample
24
24
  sum
25
25
  end
26
26
 
27
- def covariance_slow(v1a,v2a) # :nodoc:
28
- t=0
27
+ def covariance_slow(v1,v2) # :nodoc:
28
+ v1a,v2a=Statsample.only_valid(v1,v2)
29
+ sum_of_squares(v1a,v2a) / (v1a.size-1)
30
+ end
31
+ def sum_of_squares(v1,v2)
32
+ v1a,v2a=Statsample.only_valid(v1,v2)
29
33
  m1=v1a.mean
30
- m2=v1a.mean
31
- (0...v1a.size).each {|i| t+=((v1a[i]-m1)*(v2a[i]-m2)) }
32
- t.to_f / (v1a.size-1)
34
+ m2=v2a.mean
35
+ (v1a.size).times.inject(0) {|ac,i| ac+(v1a[i]-m1)*(v2a[i]-m2)}
33
36
  end
34
-
35
37
  # Calculate Pearson correlation coefficient (r) between 2 vectors
36
38
  def pearson(v1,v2)
37
39
  v1a,v2a=Statsample.only_valid(v1,v2)
@@ -42,12 +44,18 @@ module Statsample
42
44
  pearson_slow(v1a,v2a)
43
45
  end
44
46
  end
45
- def pearson_slow(v1a,v2a) # :nodoc:
46
- v1s,v2s=v1a.vector_standarized_pop,v2a.vector_standarized_pop
47
+ def pearson_slow(v1,v2) # :nodoc:
48
+ v1a,v2a=Statsample.only_valid(v1,v2)
49
+ # Calculate sum of squares
50
+ ss=sum_of_squares(v1a,v2a)
51
+ ss.quo(Math::sqrt(v1a.sum_of_squares) * Math::sqrt(v2a.sum_of_squares))
52
+ =begin
53
+ v1s,v2s=v1a.vector_standarized,v2a.vector_standarized
47
54
  t=0
48
55
  siz=v1s.size
49
56
  (0...v1s.size).each {|i| t+=(v1s[i]*v2s[i]) }
50
- t.to_f/v2s.size
57
+ t.quo(v2s.size-1)
58
+ =end
51
59
  end
52
60
  # Retrieves the value for t test for a pearson correlation
53
61
  # between two vectors to test the null hypothesis of r=0
@@ -279,6 +287,7 @@ module Statsample
279
287
  }
280
288
  a
281
289
  end
290
+ =begin
282
291
  def sum_of_codeviated(v1,v2)
283
292
  v1a,v2a=Statsample.only_valid(v1,v2)
284
293
  sum=0
@@ -287,7 +296,7 @@ module Statsample
287
296
  }
288
297
  sum-((v1a.sum*v2a.sum) / v1a.size.to_f)
289
298
  end
290
-
299
+ =end
291
300
  # Report the minimum number of valid cases of a covariate matrix
292
301
  # based on a dataset
293
302
  def min_n_valid(ds)
@@ -744,24 +744,24 @@ module Statsample
744
744
  rp.to_text
745
745
  end
746
746
 
747
- def to_reportbuilder(generator) # :nodoc:
747
+ def report_building(generator) # :nodoc:
748
748
  compute if @r.nil?
749
749
  section=ReportBuilder::Section.new(:name=>@name)
750
750
  t=ReportBuilder::Table.new(:name=>_("Contingence Table"),:header=>[""]+(@n.times.collect {|i| "Y=#{i}"})+["Total"])
751
751
  @m.times do |i|
752
- t.add_row(["X = #{i}"]+(@n.times.collect {|j| @matrix[i,j]}) + [@sumr[i]])
752
+ t.row(["X = #{i}"]+(@n.times.collect {|j| @matrix[i,j]}) + [@sumr[i]])
753
753
  end
754
- t.add_hr
755
- t.add_row(["T"]+(@n.times.collect {|j| @sumc[j]})+[@total])
754
+ t.hr
755
+ t.row(["T"]+(@n.times.collect {|j| @sumc[j]})+[@total])
756
756
  section.add(t)
757
757
  #generator.parse_element(t)
758
758
  section.add(sprintf("r: %0.4f",r))
759
759
  t=ReportBuilder::Table.new(:name=>_("Thresholds"), :header=>["","Value"])
760
760
  threshold_x.each_with_index {|val,i|
761
- t.add_row(["Threshold X #{i}", sprintf("%0.4f", val)])
761
+ t.row(["Threshold X #{i}", sprintf("%0.4f", val)])
762
762
  }
763
763
  threshold_y.each_with_index {|val,i|
764
- t.add_row(["Threshold Y #{i}", sprintf("%0.4f", val)])
764
+ t.row(["Threshold Y #{i}", sprintf("%0.4f", val)])
765
765
  }
766
766
  section.add(t)
767
767
  section.add(_("Test of bivariate normality: X2 = %0.3f, df = %d, p= %0.5f" % [ chi_square, chi_square_df, 1-Distribution::ChiSquare.cdf(chi_square, chi_square_df)]))
@@ -114,18 +114,18 @@ module Statsample
114
114
  end
115
115
  # Summary of the analysis
116
116
  def summary
117
- rp=ReportBuilder.new()
117
+ rp=ReportBuilder.new(:name=>@name)
118
118
  rp.add(self)
119
119
  rp.to_text
120
120
  end
121
121
 
122
- def to_reportbuilder(generator) # :nodoc:
122
+ def report_building(generator) # :nodoc:
123
123
  section=ReportBuilder::Section.new(:name=>@name)
124
124
  t=ReportBuilder::Table.new(:name=>_("Contingence Table"),:header=>["","Y=0","Y=1", "T"])
125
- t.add_row(["X=0", @a,@b,@a+@b])
126
- t.add_row(["X=1", @c,@d,@c+@d])
127
- t.add_hr
128
- t.add_row(["T", @a+@c,@b+@d,@a+@b+@c+@d])
125
+ t.row(["X=0", @a,@b,@a+@b])
126
+ t.row(["X=1", @c,@d,@c+@d])
127
+ t.hr
128
+ t.row(["T", @a+@c,@b+@d,@a+@b+@c+@d])
129
129
  section.add(t)
130
130
  #generator.parse_element(t)
131
131
  section.add(sprintf("r: %0.3f",r))
@@ -1,3 +1,12 @@
1
+ if RUBY_VERSION<"1.9"
2
+ require 'fastercsv'
3
+ Statsample::CSV_klass=FasterCSV
4
+ else
5
+ require 'csv'
6
+ Statsample::CSV_klass=CSV
7
+
8
+ end
9
+
1
10
  module Statsample
2
11
  class CSV < SpreadsheetBase
3
12
  class << self
@@ -6,7 +15,7 @@ module Statsample
6
15
  # USE:
7
16
  # ds=Statsample::CSV.read("test_csv.csv")
8
17
  def read(filename, empty=[''],ignore_lines=0,fs=nil,rs=nil)
9
- require 'csv'
18
+
10
19
  first_row=true
11
20
  fields=[]
12
21
  fields_data={}
@@ -15,7 +24,7 @@ module Statsample
15
24
  opts={}
16
25
  opts[:col_sep]=fs unless fs.nil?
17
26
  opts[:row_sep]=rs unless rs.nil?
18
- csv=::CSV.open(filename,'r',opts)
27
+ csv=Statsample::CSV_klass.send(:open, filename,'r',opts)
19
28
  csv.each do |row|
20
29
  line_number+=1
21
30
  if(line_number<=ignore_lines)
@@ -41,8 +50,8 @@ module Statsample
41
50
  # USE:
42
51
  # Statsample::CSV.write(ds,"test_csv.csv")
43
52
  def write(dataset,filename, convert_comma=false,*opts)
44
- require 'csv'
45
- writer=::CSV.open(filename,'w',*opts)
53
+
54
+ writer=Statsample::CSV_klass.send(:open, filename,'w',*opts)
46
55
  writer << dataset.fields
47
56
  dataset.each_array do|row|
48
57
  if(convert_comma)
@@ -351,9 +351,5 @@ out
351
351
  end
352
352
  end
353
353
 
354
- if RUBY_VERSION<"1.9"
355
- require 'statsample/converter/csv18.rb'
356
- else
357
- require 'statsample/converter/csv19.rb'
358
- end
354
+ require 'statsample/converter/csv.rb'
359
355
 
@@ -93,17 +93,17 @@ module Statsample
93
93
  def cols_empty_hash
94
94
  cols_names.inject({}) {|a,x| a[x]=0;a}
95
95
  end
96
- def to_reportbuilder(generator)
97
- anchor=generator.add_toc_entry(_("Crosstab: ")+name)
98
- generator.add_html "<div class='crosstab'>"+_("Crosstab")+" #{@name}<a name='#{anchor}'></a>"
96
+ def report_building(generator)
97
+ anchor=generator.toc_entry(_("Crosstab: ")+name)
98
+ generator.html "<div class='crosstab'>"+_("Crosstab")+" #{@name}<a name='#{anchor}'></a>"
99
99
  fq=frequencies
100
100
  rn=rows_names
101
101
  cn=cols_names
102
102
  total=0
103
103
  total_cols=cols_empty_hash
104
- generator.add_text "Chi Square: #{chi_square}"
105
- generator.add_text(_("Rows: %s") % @row_label) unless @row_label.nil?
106
- generator.add_text(_("Columns: %s") % @column_label) unless @column_label.nil?
104
+ generator.text "Chi Square: #{chi_square}"
105
+ generator.text(_("Rows: %s") % @row_label) unless @row_label.nil?
106
+ generator.text(_("Columns: %s") % @column_label) unless @column_label.nil?
107
107
 
108
108
  t=ReportBuilder::Table.new(:name=>@name+" - "+_("Raw"), :header=>[""]+cols_names.collect {|c| @v_cols.labeling(c)}+[_("Total")])
109
109
  rn.each do |row|
@@ -117,15 +117,15 @@ module Statsample
117
117
  t_row.push(data)
118
118
  end
119
119
  t_row.push(total_row)
120
- t.add_row(t_row)
120
+ t.row(t_row)
121
121
  end
122
- t.add_horizontal_line
122
+ t.hr
123
123
  t_row=[_("Total")]
124
124
  cn.each do |v|
125
125
  t_row.push(total_cols[v])
126
126
  end
127
127
  t_row.push(total)
128
- t.add_row(t_row)
128
+ t.row(t_row)
129
129
  generator.parse_element(t)
130
130
 
131
131
  if(@percentage_row)
@@ -138,7 +138,7 @@ module Statsample
138
138
  table_percentage(generator,:total)
139
139
  end
140
140
 
141
- generator.add_html("</div>")
141
+ generator.html("</div>")
142
142
  end
143
143
 
144
144
 
@@ -174,10 +174,10 @@ module Statsample
174
174
  when :total then @cases
175
175
  end
176
176
  t_row.push(sprintf("%0.2f%%", rt[row]*100.0/total))
177
- t.add_row(t_row)
177
+ t.row(t_row)
178
178
  end
179
179
 
180
- t.add_horizontal_line
180
+ t.hr
181
181
  t_row=[_("Total")]
182
182
  cn.each{|col|
183
183
  total=case type
@@ -188,7 +188,7 @@ module Statsample
188
188
  t_row.push(sprintf("%0.2f%%", ct[col]*100.0/total))
189
189
  }
190
190
  t_row.push("100%")
191
- t.add_row(t_row)
191
+ t.row(t_row)
192
192
  generator.parse_element(t)
193
193
  end
194
194
  end
@@ -154,7 +154,7 @@ module Statsample
154
154
  # Creates a copy of the given dataset, deleting all the cases with
155
155
  # missing data on one of the vectors
156
156
  def dup_only_valid
157
- if @vectors.find{|field,vector| vector.has_missing_data?}
157
+ if @vectors.any?{|field,vector| vector.has_missing_data?}
158
158
  ds=dup_empty
159
159
  each_array { |c|
160
160
  ds.add_case_array(c) unless @fields.find{|f| @vectors[f].data_with_nils[@i].nil? }
@@ -313,44 +313,46 @@ module Statsample
313
313
  rp.add(self)
314
314
  rp.to_text
315
315
  end
316
- def to_reportbuilder(generator)
316
+ def report_building(generator)
317
317
  compute if @models.nil?
318
- anchor=generator.add_toc_entry(_("DA: ")+@name)
319
- generator.add_html "<div class='dominance-analysis'>#{@name}<a name='#{anchor}'></a>"
320
- t=ReportBuilder::Table.new(:name=>_("Dominance Analysis result"))
321
318
 
322
- t.header=["","r2",_("sign")]+@predictors.collect {|c| DominanceAnalysis.predictor_name(c) }
319
+ anchor=generator.toc_entry(_("DA: ")+@name)
320
+
321
+ generator.html "<div class='dominance-analysis'>#{@name}<a name='#{anchor}'></a>"
322
+ header=["","r2",_("sign")]+@predictors.collect {|c| DominanceAnalysis.predictor_name(c) }
323
+ t=ReportBuilder::Table.new(:name=>_("Dominance Analysis result"),:header=>header)
324
+
323
325
  row=[_("Model 0"),"",""]+@predictors.collect{|f|
324
326
  sprintf("%0.3f", md([f]).r2)
325
327
  }
326
- t.add_row(row)
327
- t.add_horizontal_line
328
+
329
+ t.row(row)
330
+ t.hr
328
331
  for i in 1..@predictors.size
329
332
  mk=md_k(i)
330
333
  mk.each{|m|
331
- t.add_row(m.add_table_row)
334
+ t.row(m.add_table_row)
332
335
  }
333
336
  # Report averages
334
337
  a=average_k(i)
335
338
  if !a.nil?
336
- t.add_horizontal_line
339
+ t.hr
337
340
  row=[_("k=%d Average") % i,"",""] + @predictors.collect{|f|
338
341
  sprintf("%0.3f",a[f])
339
342
  }
340
- t.add_row(row)
341
- t.add_horizontal_line
343
+ t.row(row)
344
+ t.hr
342
345
 
343
346
  end
344
-
345
347
  end
346
348
 
347
349
  g=general_averages
348
- t.add_horizontal_line
350
+ t.hr
349
351
 
350
352
  row=[_("Overall averages"),"",""]+@predictors.collect{|f|
351
353
  sprintf("%0.3f",g[f])
352
354
  }
353
- t.add_row(row)
355
+ t.row(row)
354
356
  generator.parse_element(t)
355
357
 
356
358
  td=total_dominance
@@ -360,10 +362,10 @@ module Statsample
360
362
  pairs.each{|p|
361
363
  name=p.join(" - ")
362
364
  row=[name, sprintf("%0.1f",td[p]), sprintf("%0.1f",cd[p]), sprintf("%0.1f",gd[p])]
363
- t.add_row(row)
365
+ t.row(row)
364
366
  }
365
367
  generator.parse_element(t)
366
- generator.add_html("</div>")
368
+ generator.html("</div>")
367
369
  end
368
370
  class ModelData # :nodoc:
369
371
  attr_reader :contributions
@@ -178,39 +178,39 @@ module Statsample
178
178
  def t
179
179
  Distribution::T.p_value(1-((1-@alpha) / 2), @n_samples - 1)
180
180
  end
181
- def to_reportbuilder(generator) # :nodoc:
181
+ def report_building(generator) # :nodoc:
182
182
  raise "You should bootstrap first" if @n_samples==0
183
- anchor=generator.add_toc_entry(_("DAB: ")+@name)
184
- generator.add_html "<div class='dominance-analysis-bootstrap'>#{@name}<a name='#{anchor}'></a>"
183
+ anchor=generator.toc_entry(_("DAB: ")+@name)
184
+ generator.html "<div class='dominance-analysis-bootstrap'>#{@name}<a name='#{anchor}'></a>"
185
185
 
186
- generator.add_text _("Sample size: %d\n") % @n_samples
187
- generator.add_text "t: #{t}\n"
188
- generator.add_text _("Linear Regression Engine: %s") % @regression_class.name
186
+ generator.text _("Sample size: %d\n") % @n_samples
187
+ generator.text "t: #{t}\n"
188
+ generator.text _("Linear Regression Engine: %s") % @regression_class.name
189
189
 
190
190
  table=ReportBuilder::Table.new(:name=>"Bootstrap report", :header => [_("pairs"), "sD","Dij", _("SE(Dij)"), "Pij", "Pji", "Pno", _("Reproducibility")])
191
- table.add_row([_("Complete dominance")])
192
- table.add_horizontal_line
191
+ table.row([_("Complete dominance")])
192
+ table.hr
193
193
  @pairs.each{|pair|
194
194
  std=@samples_td[pair].to_vector(:scale)
195
195
  ttd=da.total_dominance_pairwise(pair[0],pair[1])
196
- table.add_row(summary_pairs(pair,std,ttd))
196
+ table.row(summary_pairs(pair,std,ttd))
197
197
  }
198
- table.add_horizontal_line
199
- table.add_row([_("Conditional dominance")])
200
- table.add_horizontal_line
198
+ table.hr
199
+ table.row([_("Conditional dominance")])
200
+ table.hr
201
201
  @pairs.each{|pair|
202
202
  std=@samples_cd[pair].to_vector(:scale)
203
203
  ttd=da.conditional_dominance_pairwise(pair[0],pair[1])
204
- table.add_row(summary_pairs(pair,std,ttd))
204
+ table.row(summary_pairs(pair,std,ttd))
205
205
 
206
206
  }
207
- table.add_horizontal_line
208
- table.add_row([_("General Dominance")])
209
- table.add_horizontal_line
207
+ table.hr
208
+ table.row([_("General Dominance")])
209
+ table.hr
210
210
  @pairs.each{|pair|
211
211
  std=@samples_gd[pair].to_vector(:scale)
212
212
  ttd=da.general_dominance_pairwise(pair[0],pair[1])
213
- table.add_row(summary_pairs(pair,std,ttd))
213
+ table.row(summary_pairs(pair,std,ttd))
214
214
  }
215
215
  generator.parse_element(table)
216
216
 
@@ -219,12 +219,12 @@ module Statsample
219
219
  @fields.each{|f|
220
220
  v=@samples_ga[f].to_vector(:scale)
221
221
  row=[@ds.label(f), sprintf("%0.3f",v.mean), sprintf("%0.3f",v.sd), sprintf("%0.3f",v.percentil(5)),sprintf("%0.3f",v.percentil(95))]
222
- table.add_row(row)
222
+ table.row(row)
223
223
 
224
224
  }
225
225
 
226
226
  generator.parse_element(table)
227
- generator.add_html("</div>")
227
+ generator.html("</div>")
228
228
  end
229
229
  def summary_pairs(pair,std,ttd)
230
230
  freqs=std.proportions