statsample 0.3.3 → 0.3.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/History.txt CHANGED
@@ -1,8 +1,14 @@
1
+ === 0.3.4 / 2009-08-21
2
+ * Works with statsample-optimization 2.0.0
3
+ * Vector doesn't use delegation. All methods are part of Vector
4
+ * Added Combination. Generates all combinations of n elements taken r at a time
5
+ * Bivariate#prop_pearson now accepts :both, :left, :right, :positive or :negative as its tails parameter
6
+ * Added LICENSE.txt
7
+
1
8
  === 0.3.3 / 2009-08-11
2
9
  * Added i18n support. For now, only spanish translation available
3
10
  * Bug fix: Tests now load libraries from the ../lib path
4
11
  * Excel and CSV importers automatically modify type of vector to Scale when all data are numbers or nil values
5
- *
6
12
 
7
13
  === 0.3.2 / 2009-08-04
8
14
 
data/Manifest.txt CHANGED
@@ -25,6 +25,7 @@ lib/statsample/anova.rb
25
25
  lib/statsample/bivariate.rb
26
26
  lib/statsample/chidistribution.rb
27
27
  lib/statsample/codification.rb
28
+ lib/statsample/combination.rb
28
29
  lib/statsample/converters.rb
29
30
  lib/statsample/crosstab.rb
30
31
  lib/statsample/dataset.rb
@@ -38,6 +39,7 @@ lib/statsample/graph/svgscatterplot.rb
38
39
  lib/statsample/htmlreport.rb
39
40
  lib/statsample/multiset.rb
40
41
  lib/statsample/regression.rb
42
+ lib/statsample/regression/logit.rb
41
43
  lib/statsample/regression/multiple.rb
42
44
  lib/statsample/regression/multiple/alglibengine.rb
43
45
  lib/statsample/regression/multiple/gslengine.rb
@@ -54,6 +56,7 @@ setup.rb
54
56
  test/_test_chart.rb
55
57
  test/test_anova.rb
56
58
  test/test_codification.rb
59
+ test/test_combination.rb
57
60
  test/test_crosstab.rb
58
61
  test/test_csv.csv
59
62
  test/test_csv.rb
data/demo/benchmark.rb CHANGED
@@ -1,4 +1,5 @@
1
- require File.dirname(__FILE__)+'/../lib/statsample.rb'
1
+ $:.unshift(File.dirname(__FILE__)+'/../lib/')
2
+ require 'statsample'
2
3
  require 'benchmark'
3
4
  v=(0..10000).collect{|n|
4
5
  r=rand(100)
@@ -37,14 +38,15 @@ ds=Statsample::Dataset.new({'a'=>a.to_vector(:scale),'b'=>b.to_vector(:scale), '
37
38
 
38
39
  if (true)
39
40
  Benchmark.bm(7) do |x|
40
- x.report("Alglib coeffs") { for i in 1..n; lr=Statsample::Regression::Multiple::AlglibEngine.new(ds,"c"); lr.coeffs; end }
41
- x.report("GslEngine coeffs") { for i in 1..n; lr=Statsample::Regression::Multiple::GslEngine.new(ds,"c"); lr.coeffs; end }
41
+ x.report("Alglib coeffs") { for i in 1..n; lr=Statsample::Regression::Multiple::AlglibEngine.new(ds,"c"); lr.coeffs; lr=nil;end }
42
+
43
+ x.report("GslEngine coeffs") { for i in 1..n; lr=Statsample::Regression::Multiple::GslEngine.new(ds,"c"); lr.coeffs;lr=nil; end }
42
44
  end
43
45
  end
44
46
  if(true)
45
47
  Benchmark.bm(7) do |x|
46
- x.report("Alglib process") { for i in 1..n; lr=Statsample::Regression::Multiple::AlglibEngine.new(ds,"c"); lr.process([1,2]); end }
47
- x.report("GslEngine process") { for i in 1..n; lr=Statsample::Regression::Multiple::GslEngine.new(ds,"c"); lr.process([1,2]); end }
48
+ x.report("Alglib process") { for i in 1..n; lr=Statsample::Regression::Multiple::AlglibEngine.new(ds,"c"); lr.process([rand(10),rand(10)]); end }
49
+ x.report("GslEngine process") { for i in 1..n; lr=Statsample::Regression::Multiple::GslEngine.new(ds,"c"); lr.process([rand(10),rand(10)]); end }
48
50
 
49
51
  end
50
52
  end
data/demo/regression.rb CHANGED
@@ -2,22 +2,36 @@ require File.dirname(__FILE__)+'/../lib/statsample'
2
2
  require 'benchmark'
3
3
  tests=300
4
4
  include Statsample
5
- r = GSL::Rng.alloc(GSL::Rng::TAUS,Time.now.to_i)
6
5
  ds=Dataset.new(%w{a b c d y})
7
6
  ds['a'].type=:scale
8
7
  ds['b'].type=:scale
9
8
  ds['c'].type=:scale
10
9
  ds['d'].type=:scale
11
10
  ds['y'].type=:scale
11
+
12
+ if HAS_GSL
13
+ r = GSL::Rng.alloc(GSL::Rng::TAUS,Time.now.to_i)
12
14
 
13
15
  tests.times {
14
16
  a=r.ugaussian
15
- b=r.ugaussian
17
+ b=a*2+r.ugaussian
16
18
  c=r.ugaussian
17
19
  d=r.ugaussian
18
20
  y=a*70+b*30+c*5+r.ugaussian*5
19
21
  ds.add_case_array([a,b,c,d,y])
20
22
  }
23
+ else
24
+ tests.times {
25
+ a=1-rand()*2.0
26
+ b=1-rand()*2.0
27
+ c=1-rand()*2.0
28
+ d=1-rand()*2.0
29
+ y=a*70+b*30+c*5+(1-rand()*2.0)*5
30
+ ds.add_case_array([a,b,c,d,y])
31
+ }
32
+
33
+
34
+ end
21
35
  ds.update_valid_data
22
36
 
23
37
  if !File.exists? "regression.dab"
@@ -26,15 +40,27 @@ else
26
40
  da=Statsample.load("regression.dab")
27
41
  end
28
42
  times=1
43
+ if(true)
29
44
  Benchmark.bm(7) do |x|
45
+ if HAS_GSL
30
46
  x.report("GslEngine:") {
31
47
  da.lr_class=Regression::Multiple::GslEngine
32
48
  da.bootstrap(times)
33
49
  }
50
+ end
51
+ if(false)
52
+ if HAS_ALGIB
34
53
  x.report("AlglibEngine:") {
35
54
  da.lr_class=Regression::Multiple::AlglibEngine
36
55
  da.bootstrap(times)
37
56
  }
57
+ end
58
+ x.report("RubyEngine:") {
59
+ da.lr_class=Regression::Multiple::RubyEngine
60
+ da.bootstrap(times)
61
+ }
62
+ end
63
+ end
38
64
  end
39
65
 
40
66
  puts da.summary
@@ -45,11 +71,11 @@ lr=Regression::Multiple.listwise(ds,"y")
45
71
  hr=HtmlReport.new("Regression")
46
72
  hr.add_summary("Regression",lr.summary(HtmlSummary))
47
73
  hr.add_summary("Analisis de Dominancia ", da.da.summary(HtmlSummary))
48
-
74
+ hr.add_correlation_matrix(ds)
49
75
  hr.add_summary("Analisis de Dominancia (Bootstrap)", da.summary(HtmlSummary))
50
76
 
51
77
  da.fields.each{|f|
52
- hr.add_histogram("General Dominance #{f}",da.samples_ga[f].to_vector(:scale))
78
+ # hr.add_histogram("General Dominance #{f}",da.samples_ga[f].to_vector(:scale))
53
79
  }
54
80
  hr.save("Regression Dominance.html")
55
81
 
data/lib/statsample.rb CHANGED
@@ -72,16 +72,6 @@ end
72
72
  rescue LoadError
73
73
  HAS_ALGIB=false
74
74
  end
75
-
76
-
77
- begin
78
- require 'statsample/optimization'
79
- rescue LoadError
80
- module Statsample
81
- OPTIMIZED=false
82
- end
83
- end
84
-
85
75
  #
86
76
  # Modules for statistical analysis
87
77
  # See first:
@@ -90,10 +80,11 @@ end
90
80
  # * Dataset: An union of vectors.
91
81
  #
92
82
  module Statsample
93
- VERSION = '0.3.3'
83
+ VERSION = '0.3.4'
94
84
  SPLIT_TOKEN = ","
95
85
  autoload(:Database, 'statsample/converters')
96
86
  autoload(:Anova, 'statsample/anova')
87
+ autoload(:Combination, 'statsample/combination')
97
88
  autoload(:CSV, 'statsample/converters')
98
89
  autoload(:Excel, 'statsample/converters')
99
90
  autoload(:GGobi, 'statsample/converters')
@@ -113,10 +104,15 @@ module Statsample
113
104
  autoload(:Regression, 'statsample/regression')
114
105
  autoload(:Test, 'statsample/test')
115
106
  def self.load(filename)
116
- fp=File.open(filename,"r")
117
- o=Marshal.load(fp)
118
- fp.close
107
+ if File.exists? filename
108
+ o=false
109
+ File.open(filename,"r") {|fp|
110
+ o=Marshal.load(fp)
111
+ }
119
112
  o
113
+ else
114
+ false
115
+ end
120
116
  end
121
117
 
122
118
  module Util
@@ -243,9 +239,21 @@ module Statsample
243
239
  out
244
240
  end
245
241
  end
242
+
243
+ module STATSAMPLE__
244
+ end
245
+
246
+ end
247
+
248
+
249
+ begin
250
+ require 'statsamplert'
251
+ rescue LoadError
252
+ module Statsample
253
+ OPTIMIZED=false
254
+ end
246
255
  end
247
256
 
248
257
  require 'statsample/vector'
249
258
  require 'statsample/dataset'
250
259
  require 'statsample/crosstab'
251
-
@@ -65,15 +65,34 @@ module Statsample
65
65
  r*Math::sqrt(((size)-2).to_f / (1 - r**2))
66
66
  end
67
67
  # Retrieves the probability value (a la SPSS)
68
- # for a given t, size and number of tails
69
- def prop_pearson(t,size, tails=2)
70
- if HAS_GSL
71
- t=-t if t>0
72
- cdf=GSL::Cdf::tdist_P(t,(size)-2)
73
- cdf*tails
74
- else
75
- raise "Needs ruby-gsl"
76
- end
68
+ # for a given t, size and number of tails.
69
+ # Uses a second parameter
70
+ # * :both or 2 : for r!=0
71
+ # * :right, :positive or 1 : for r > 0
72
+ # * :left, :negative : for r < 0
73
+
74
+ def prop_pearson(t,size, tails=:both)
75
+ tails=:both if tails==2
76
+ tails=:right if tails==1 or tails==:positive
77
+ tails=:left if tails==:negative
78
+
79
+ n_tails=case tails
80
+ when :both
81
+ 2
82
+ else
83
+ 1
84
+ end
85
+ if HAS_GSL
86
+ t=-t if t>0 and (tails==:both)
87
+ cdf=GSL::Cdf::tdist_P(t,size-2)
88
+ if(tails==:right)
89
+ 1.0-(cdf*n_tails)
90
+ else
91
+ cdf*n_tails
92
+ end
93
+ else
94
+ raise "Needs ruby-gsl"
95
+ end
77
96
  end
78
97
  # Returns residual score after delete variance
79
98
  # from another variable
@@ -0,0 +1,103 @@
1
+ module Statsample
2
+ # Combination class systematically generates all combinations of n elements, taken r at a time.
3
+ # Uses GSL::Combination if available for extra speed
4
+ # Source: http://snippets.dzone.com/posts/show/4666
5
+ # Use:
6
+ # comb=Statsample::Combination.new(3,5)
7
+ # comb.each{|c|
8
+ # p c
9
+ # }
10
+ class Combination
11
+ attr_reader :d
12
+ def initialize(k,n,only_ruby=false)
13
+ @k=k
14
+ @n=n
15
+ if HAS_GSL and !only_ruby
16
+ @d=CombinationGsl.new(@k,@n)
17
+ else
18
+ @d=CombinationRuby.new(@k,@n)
19
+ end
20
+ end
21
+ def each
22
+ reset
23
+ while a=next_value
24
+ yield a
25
+ end
26
+ end
27
+ def reset
28
+ @d.reset
29
+ end
30
+ def next_value
31
+ @d.next_value
32
+ end
33
+ class CombinationRuby
34
+ attr_reader :data
35
+ def initialize(k,n)
36
+ raise "k<=n" if k>n
37
+ @k=k
38
+ @n=n
39
+ reset
40
+ end
41
+ def reset
42
+ @data=[]
43
+ (0...@k).each {|i|
44
+ @data[i] = i;
45
+ }
46
+ end
47
+ def each
48
+ reset
49
+ while a=next_value
50
+ yield a
51
+ end
52
+ end
53
+ def next_value
54
+ return false if !@data
55
+ old_comb=@data.dup
56
+ i = @k - 1;
57
+ @data[i]+=1
58
+ while ((i >= 0) and (@data[i] >= @n - @k + 1 + i)) do
59
+ i-=1;
60
+ @data[i]+=1;
61
+ end
62
+
63
+ if (@data[0] > @n - @k) # Combination (n-k, n-k+1, ..., n) reached */
64
+ @data=false # No more combinations can be generated
65
+ else
66
+ # comb now looks like (..., x, n, n, n, ..., n).
67
+ # Turn it into (..., x, x + 1, x + 2, ...)
68
+ i = i+1
69
+ (i...@k).each{ |i1|
70
+ @data[i1] = @data[i1 - 1] + 1
71
+ }
72
+ end
73
+ return old_comb
74
+ end
75
+ end
76
+ class CombinationGsl
77
+ def initialize(k,n)
78
+ require 'gsl'
79
+ raise "k<=n" if k>n
80
+ @k=k
81
+ @n=n
82
+ reset
83
+ end
84
+ def reset
85
+ @c= ::GSL::Combination.calloc(@n, @k);
86
+ end
87
+ def next_value
88
+ return false if !@c
89
+ data=@c.data.to_a
90
+ if @c.next != GSL::SUCCESS
91
+ @c=false
92
+ end
93
+ return data
94
+ end
95
+ def each
96
+ reset
97
+ begin
98
+ yield @c.data.to_a
99
+ end while @c.next == GSL::SUCCESS
100
+ end
101
+ end
102
+ end
103
+ end
@@ -148,6 +148,13 @@ module Statsample
148
148
  book = Spreadsheet.open filename
149
149
  sheet= book.worksheet worksheet_id
150
150
  sheet.each do |row|
151
+ begin
152
+ dates=[]
153
+ row.formats.each_index{|i|
154
+ if !row.formats[i].nil? and row.formats[i].number_format=="DD/MM/YYYY"
155
+ dates.push(i)
156
+ end
157
+ }
151
158
  line_number+=1
152
159
  if(line_number<=ignore_lines)
153
160
  #puts "Skip line"
@@ -155,9 +162,13 @@ module Statsample
155
162
  end
156
163
  # This should be fixed.
157
164
  # If we have a Formula, it should be resolved first
165
+ i=-1
158
166
  row.collect!{|c|
167
+ i+=1
159
168
  if c.is_a? Spreadsheet::Formula
160
- nil
169
+ c.value
170
+ elsif dates.include? i and !c.nil? and c.is_a? Numeric
171
+ row.date(i)
161
172
  else
162
173
  c
163
174
  end
@@ -173,6 +184,10 @@ module Statsample
173
184
  }
174
185
  ds.add_case(rowa,false)
175
186
  end
187
+ rescue => e
188
+ error="#{e.to_s}\nError on Line # #{line_number}:#{row.join(",")}"
189
+ raise
190
+ end
176
191
  end
177
192
  convert_to_scale(ds,fields)
178
193
  ds.update_valid_data
@@ -27,7 +27,7 @@ module Statsample
27
27
  @exp=e
28
28
  end
29
29
  def to_s
30
- m="Error:"+@exp.message+@exp.backtrace.join("\n")+"\nOn Dataset:"+@ds.inspect
30
+ m="Error: "+@exp.message+"\n"+@exp.backtrace.join("\n")+"\nOn Dataset:"+@ds.inspect
31
31
  m+="\nRow: #{@i}" unless @i.nil?
32
32
  m
33
33
  end
@@ -158,7 +158,7 @@ module Statsample
158
158
  end
159
159
  # Fast version of add case
160
160
  # Can only add one case and no error check is performed
161
- # You SHOULD use update_valid_data at the the of insertion cycle
161
+ # You SHOULD use update_valid_data at the end of insertion cycle
162
162
  def add_case_array(v)
163
163
  v.each_index {|i| d=@vectors[@fields[i]].data; d.push(v[i])}
164
164
  end
@@ -295,17 +295,35 @@ module Statsample
295
295
  yield k,@vectors[k]
296
296
  }
297
297
  end
298
- if !Statsample::OPTIMIZED
299
- def case_as_hash(c)
300
- @fields.inject({}) {|a,x|
301
- a[x]=@vectors[x][c]
302
- a
303
- }
298
+ if Statsample::STATSAMPLE__.respond_to?(:case_as_hash)
299
+ def case_as_hash(c)
300
+ Statsample::STATSAMPLE__.case_as_hash(self,c)
301
+ end
302
+ else
303
+ def case_as_hash(c)
304
+ _case_as_hash(c)
305
+ end
306
+ end
307
+
308
+ if Statsample::STATSAMPLE__.respond_to?(:case_as_array)
309
+ def case_as_array(c)
310
+ Statsample::STATSAMPLE__.case_as_array(self,c)
311
+ end
312
+ else
313
+ def case_as_array(c)
314
+ _case_as_array(c)
315
+ end
304
316
  end
305
- def case_as_array(c)
306
- @fields.collect {|x| @vectors[x][c]}
307
- end
317
+ def _case_as_hash(c)
318
+ @fields.inject({}) {|a,x|
319
+ a[x]=@vectors[x][c]
320
+ a
321
+ }
322
+ end
323
+ def _case_as_array(c)
324
+ @fields.collect {|x| @vectors[x][c]}
308
325
  end
326
+
309
327
  def each
310
328
  begin
311
329
  @i=0