statsample 0.3.3 → 0.3.4

Sign up to get free protection for your applications and to get access to all the features.
data/History.txt CHANGED
@@ -1,8 +1,14 @@
1
+ === 0.3.4 / 2009-08-21
2
+ * Works with statsample-optimization 2.0.0
3
+ * Vector doesn't use delegation. All methods are part of Vector
4
+ * Added Combination. Generates all combinations of n elements taken r at a time
5
+ * Bivariate#prop_pearson now accepts :both, :left, :right, :positive or :negative as its tails (third) parameter
6
+ * Added LICENSE.txt
7
+
1
8
  === 0.3.3 / 2009-08-11
2
9
  * Added i18n support. For now, only a Spanish translation is available
3
10
  * Bug fix: Tests now load libraries from the ../lib path
4
11
  * Excel and CSV importers automatically modify type of vector to Scale when all data are numbers or nil values
5
- *
6
12
 
7
13
  === 0.3.2 / 2009-08-04
8
14
 
data/Manifest.txt CHANGED
@@ -25,6 +25,7 @@ lib/statsample/anova.rb
25
25
  lib/statsample/bivariate.rb
26
26
  lib/statsample/chidistribution.rb
27
27
  lib/statsample/codification.rb
28
+ lib/statsample/combination.rb
28
29
  lib/statsample/converters.rb
29
30
  lib/statsample/crosstab.rb
30
31
  lib/statsample/dataset.rb
@@ -38,6 +39,7 @@ lib/statsample/graph/svgscatterplot.rb
38
39
  lib/statsample/htmlreport.rb
39
40
  lib/statsample/multiset.rb
40
41
  lib/statsample/regression.rb
42
+ lib/statsample/regression/logit.rb
41
43
  lib/statsample/regression/multiple.rb
42
44
  lib/statsample/regression/multiple/alglibengine.rb
43
45
  lib/statsample/regression/multiple/gslengine.rb
@@ -54,6 +56,7 @@ setup.rb
54
56
  test/_test_chart.rb
55
57
  test/test_anova.rb
56
58
  test/test_codification.rb
59
+ test/test_combination.rb
57
60
  test/test_crosstab.rb
58
61
  test/test_csv.csv
59
62
  test/test_csv.rb
data/demo/benchmark.rb CHANGED
@@ -1,4 +1,5 @@
1
- require File.dirname(__FILE__)+'/../lib/statsample.rb'
1
+ $:.unshift(File.dirname(__FILE__)+'/../lib/')
2
+ require 'statsample'
2
3
  require 'benchmark'
3
4
  v=(0..10000).collect{|n|
4
5
  r=rand(100)
@@ -37,14 +38,15 @@ ds=Statsample::Dataset.new({'a'=>a.to_vector(:scale),'b'=>b.to_vector(:scale), '
37
38
 
38
39
  if (true)
39
40
  Benchmark.bm(7) do |x|
40
- x.report("Alglib coeffs") { for i in 1..n; lr=Statsample::Regression::Multiple::AlglibEngine.new(ds,"c"); lr.coeffs; end }
41
- x.report("GslEngine coeffs") { for i in 1..n; lr=Statsample::Regression::Multiple::GslEngine.new(ds,"c"); lr.coeffs; end }
41
+ x.report("Alglib coeffs") { for i in 1..n; lr=Statsample::Regression::Multiple::AlglibEngine.new(ds,"c"); lr.coeffs; lr=nil;end }
42
+
43
+ x.report("GslEngine coeffs") { for i in 1..n; lr=Statsample::Regression::Multiple::GslEngine.new(ds,"c"); lr.coeffs;lr=nil; end }
42
44
  end
43
45
  end
44
46
  if(true)
45
47
  Benchmark.bm(7) do |x|
46
- x.report("Alglib process") { for i in 1..n; lr=Statsample::Regression::Multiple::AlglibEngine.new(ds,"c"); lr.process([1,2]); end }
47
- x.report("GslEngine process") { for i in 1..n; lr=Statsample::Regression::Multiple::GslEngine.new(ds,"c"); lr.process([1,2]); end }
48
+ x.report("Alglib process") { for i in 1..n; lr=Statsample::Regression::Multiple::AlglibEngine.new(ds,"c"); lr.process([rand(10),rand(10)]); end }
49
+ x.report("GslEngine process") { for i in 1..n; lr=Statsample::Regression::Multiple::GslEngine.new(ds,"c"); lr.process([rand(10),rand(10)]); end }
48
50
 
49
51
  end
50
52
  end
data/demo/regression.rb CHANGED
@@ -2,22 +2,36 @@ require File.dirname(__FILE__)+'/../lib/statsample'
2
2
  require 'benchmark'
3
3
  tests=300
4
4
  include Statsample
5
- r = GSL::Rng.alloc(GSL::Rng::TAUS,Time.now.to_i)
6
5
  ds=Dataset.new(%w{a b c d y})
7
6
  ds['a'].type=:scale
8
7
  ds['b'].type=:scale
9
8
  ds['c'].type=:scale
10
9
  ds['d'].type=:scale
11
10
  ds['y'].type=:scale
11
+
12
+ if HAS_GSL
13
+ r = GSL::Rng.alloc(GSL::Rng::TAUS,Time.now.to_i)
12
14
 
13
15
  tests.times {
14
16
  a=r.ugaussian
15
- b=r.ugaussian
17
+ b=a*2+r.ugaussian
16
18
  c=r.ugaussian
17
19
  d=r.ugaussian
18
20
  y=a*70+b*30+c*5+r.ugaussian*5
19
21
  ds.add_case_array([a,b,c,d,y])
20
22
  }
23
+ else
24
+ tests.times {
25
+ a=1-rand()*2.0
26
+ b=1-rand()*2.0
27
+ c=1-rand()*2.0
28
+ d=1-rand()*2.0
29
+ y=a*70+b*30+c*5+(1-rand()*2.0)*5
30
+ ds.add_case_array([a,b,c,d,y])
31
+ }
32
+
33
+
34
+ end
21
35
  ds.update_valid_data
22
36
 
23
37
  if !File.exists? "regression.dab"
@@ -26,15 +40,27 @@ else
26
40
  da=Statsample.load("regression.dab")
27
41
  end
28
42
  times=1
43
+ if(true)
29
44
  Benchmark.bm(7) do |x|
45
+ if HAS_GSL
30
46
  x.report("GslEngine:") {
31
47
  da.lr_class=Regression::Multiple::GslEngine
32
48
  da.bootstrap(times)
33
49
  }
50
+ end
51
+ if(false)
52
+ if HAS_ALGIB
34
53
  x.report("AlglibEngine:") {
35
54
  da.lr_class=Regression::Multiple::AlglibEngine
36
55
  da.bootstrap(times)
37
56
  }
57
+ end
58
+ x.report("RubyEngine:") {
59
+ da.lr_class=Regression::Multiple::RubyEngine
60
+ da.bootstrap(times)
61
+ }
62
+ end
63
+ end
38
64
  end
39
65
 
40
66
  puts da.summary
@@ -45,11 +71,11 @@ lr=Regression::Multiple.listwise(ds,"y")
45
71
  hr=HtmlReport.new("Regression")
46
72
  hr.add_summary("Regression",lr.summary(HtmlSummary))
47
73
  hr.add_summary("Analisis de Dominancia ", da.da.summary(HtmlSummary))
48
-
74
+ hr.add_correlation_matrix(ds)
49
75
  hr.add_summary("Analisis de Dominancia (Bootstrap)", da.summary(HtmlSummary))
50
76
 
51
77
  da.fields.each{|f|
52
- hr.add_histogram("General Dominance #{f}",da.samples_ga[f].to_vector(:scale))
78
+ # hr.add_histogram("General Dominance #{f}",da.samples_ga[f].to_vector(:scale))
53
79
  }
54
80
  hr.save("Regression Dominance.html")
55
81
 
data/lib/statsample.rb CHANGED
@@ -72,16 +72,6 @@ end
72
72
  rescue LoadError
73
73
  HAS_ALGIB=false
74
74
  end
75
-
76
-
77
- begin
78
- require 'statsample/optimization'
79
- rescue LoadError
80
- module Statsample
81
- OPTIMIZED=false
82
- end
83
- end
84
-
85
75
  #
86
76
  # Modules for statistical analysis
87
77
  # See first:
@@ -90,10 +80,11 @@ end
90
80
  # * Dataset: A union of vectors.
91
81
  #
92
82
  module Statsample
93
- VERSION = '0.3.3'
83
+ VERSION = '0.3.4'
94
84
  SPLIT_TOKEN = ","
95
85
  autoload(:Database, 'statsample/converters')
96
86
  autoload(:Anova, 'statsample/anova')
87
+ autoload(:Combination, 'statsample/combination')
97
88
  autoload(:CSV, 'statsample/converters')
98
89
  autoload(:Excel, 'statsample/converters')
99
90
  autoload(:GGobi, 'statsample/converters')
@@ -113,10 +104,15 @@ module Statsample
113
104
  autoload(:Regression, 'statsample/regression')
114
105
  autoload(:Test, 'statsample/test')
115
106
  def self.load(filename)
116
- fp=File.open(filename,"r")
117
- o=Marshal.load(fp)
118
- fp.close
107
+ if File.exists? filename
108
+ o=false
109
+ File.open(filename,"r") {|fp|
110
+ o=Marshal.load(fp)
111
+ }
119
112
  o
113
+ else
114
+ false
115
+ end
120
116
  end
121
117
 
122
118
  module Util
@@ -243,9 +239,21 @@ module Statsample
243
239
  out
244
240
  end
245
241
  end
242
+
243
+ module STATSAMPLE__
244
+ end
245
+
246
+ end
247
+
248
+
249
+ begin
250
+ require 'statsamplert'
251
+ rescue LoadError
252
+ module Statsample
253
+ OPTIMIZED=false
254
+ end
246
255
  end
247
256
 
248
257
  require 'statsample/vector'
249
258
  require 'statsample/dataset'
250
259
  require 'statsample/crosstab'
251
-
@@ -65,15 +65,34 @@ module Statsample
65
65
  r*Math::sqrt(((size)-2).to_f / (1 - r**2))
66
66
  end
67
67
  # Retrieves the probability value (a la SPSS)
68
- # for a given t, size and number of tails
69
- def prop_pearson(t,size, tails=2)
70
- if HAS_GSL
71
- t=-t if t>0
72
- cdf=GSL::Cdf::tdist_P(t,(size)-2)
73
- cdf*tails
74
- else
75
- raise "Needs ruby-gsl"
76
- end
68
+ # for a given t, size and number of tails.
69
+ # The third parameter (tails) selects the test direction:
70
+ # * :both or 2 : for r!=0
71
+ # * :right, :positive or 1 : for r > 0
72
+ # * :left, :negative : for r < 0
73
+
74
+ def prop_pearson(t,size, tails=:both)
75
+ tails=:both if tails==2
76
+ tails=:right if tails==1 or tails==:positive
77
+ tails=:left if tails==:negative
78
+
79
+ n_tails=case tails
80
+ when :both
81
+ 2
82
+ else
83
+ 1
84
+ end
85
+ if HAS_GSL
86
+ t=-t if t>0 and (tails==:both)
87
+ cdf=GSL::Cdf::tdist_P(t,size-2)
88
+ if(tails==:right)
89
+ 1.0-(cdf*n_tails)
90
+ else
91
+ cdf*n_tails
92
+ end
93
+ else
94
+ raise "Needs ruby-gsl"
95
+ end
77
96
  end
78
97
  # Returns residual score after delete variance
79
98
  # from another variable
@@ -0,0 +1,103 @@
1
+ module Statsample
2
+ # Combination class systematically generates all combinations of n elements, taken r at a time.
3
+ # Uses GSL::Combination if available, for extra speed
4
+ # Source: http://snippets.dzone.com/posts/show/4666
5
+ # Use:
6
+ # comb=Statsample::Combination.new(3,5)
7
+ # comb.each{|c|
8
+ # p c
9
+ # }
10
+ class Combination
11
+ attr_reader :d
12
+ def initialize(k,n,only_ruby=false)
13
+ @k=k
14
+ @n=n
15
+ if HAS_GSL and !only_ruby
16
+ @d=CombinationGsl.new(@k,@n)
17
+ else
18
+ @d=CombinationRuby.new(@k,@n)
19
+ end
20
+ end
21
+ def each
22
+ reset
23
+ while a=next_value
24
+ yield a
25
+ end
26
+ end
27
+ def reset
28
+ @d.reset
29
+ end
30
+ def next_value
31
+ @d.next_value
32
+ end
33
+ class CombinationRuby
34
+ attr_reader :data
35
+ def initialize(k,n)
36
+ raise "k<=n" if k>n
37
+ @k=k
38
+ @n=n
39
+ reset
40
+ end
41
+ def reset
42
+ @data=[]
43
+ (0...@k).each {|i|
44
+ @data[i] = i;
45
+ }
46
+ end
47
+ def each
48
+ reset
49
+ while a=next_value
50
+ yield a
51
+ end
52
+ end
53
+ def next_value
54
+ return false if !@data
55
+ old_comb=@data.dup
56
+ i = @k - 1;
57
+ @data[i]+=1
58
+ while ((i >= 0) and (@data[i] >= @n - @k + 1 + i)) do
59
+ i-=1;
60
+ @data[i]+=1;
61
+ end
62
+
63
+ if (@data[0] > @n - @k) # Combination (n-k, n-k+1, ..., n) reached */
64
+ @data=false # No more combinations can be generated
65
+ else
66
+ # comb now looks like (..., x, n, n, n, ..., n).
67
+ # Turn it into (..., x, x + 1, x + 2, ...)
68
+ i = i+1
69
+ (i...@k).each{ |i1|
70
+ @data[i1] = @data[i1 - 1] + 1
71
+ }
72
+ end
73
+ return old_comb
74
+ end
75
+ end
76
+ class CombinationGsl
77
+ def initialize(k,n)
78
+ require 'gsl'
79
+ raise "k<=n" if k>n
80
+ @k=k
81
+ @n=n
82
+ reset
83
+ end
84
+ def reset
85
+ @c= ::GSL::Combination.calloc(@n, @k);
86
+ end
87
+ def next_value
88
+ return false if !@c
89
+ data=@c.data.to_a
90
+ if @c.next != GSL::SUCCESS
91
+ @c=false
92
+ end
93
+ return data
94
+ end
95
+ def each
96
+ reset
97
+ begin
98
+ yield @c.data.to_a
99
+ end while @c.next == GSL::SUCCESS
100
+ end
101
+ end
102
+ end
103
+ end
@@ -148,6 +148,13 @@ module Statsample
148
148
  book = Spreadsheet.open filename
149
149
  sheet= book.worksheet worksheet_id
150
150
  sheet.each do |row|
151
+ begin
152
+ dates=[]
153
+ row.formats.each_index{|i|
154
+ if !row.formats[i].nil? and row.formats[i].number_format=="DD/MM/YYYY"
155
+ dates.push(i)
156
+ end
157
+ }
151
158
  line_number+=1
152
159
  if(line_number<=ignore_lines)
153
160
  #puts "Skip line"
@@ -155,9 +162,13 @@ module Statsample
155
162
  end
156
163
  # This should be fixed.
157
164
  # If we have a Formula, it should be resolved first
165
+ i=-1
158
166
  row.collect!{|c|
167
+ i+=1
159
168
  if c.is_a? Spreadsheet::Formula
160
- nil
169
+ c.value
170
+ elsif dates.include? i and !c.nil? and c.is_a? Numeric
171
+ row.date(i)
161
172
  else
162
173
  c
163
174
  end
@@ -173,6 +184,10 @@ module Statsample
173
184
  }
174
185
  ds.add_case(rowa,false)
175
186
  end
187
+ rescue => e
188
+ error="#{e.to_s}\nError on Line # #{line_number}:#{row.join(",")}"
189
+ raise
190
+ end
176
191
  end
177
192
  convert_to_scale(ds,fields)
178
193
  ds.update_valid_data
@@ -27,7 +27,7 @@ module Statsample
27
27
  @exp=e
28
28
  end
29
29
  def to_s
30
- m="Error:"+@exp.message+@exp.backtrace.join("\n")+"\nOn Dataset:"+@ds.inspect
30
+ m="Error: "+@exp.message+"\n"+@exp.backtrace.join("\n")+"\nOn Dataset:"+@ds.inspect
31
31
  m+="\nRow: #{@i}" unless @i.nil?
32
32
  m
33
33
  end
@@ -158,7 +158,7 @@ module Statsample
158
158
  end
159
159
  # Fast version of add case
160
160
  # Can only add one case, and no error check is performed
161
- # You SHOULD use update_valid_data at the the of insertion cycle
161
+ # You SHOULD use update_valid_data at the end of the insertion cycle
162
162
  def add_case_array(v)
163
163
  v.each_index {|i| d=@vectors[@fields[i]].data; d.push(v[i])}
164
164
  end
@@ -295,17 +295,35 @@ module Statsample
295
295
  yield k,@vectors[k]
296
296
  }
297
297
  end
298
- if !Statsample::OPTIMIZED
299
- def case_as_hash(c)
300
- @fields.inject({}) {|a,x|
301
- a[x]=@vectors[x][c]
302
- a
303
- }
298
+ if Statsample::STATSAMPLE__.respond_to?(:case_as_hash)
299
+ def case_as_hash(c)
300
+ Statsample::STATSAMPLE__.case_as_hash(self,c)
301
+ end
302
+ else
303
+ def case_as_hash(c)
304
+ _case_as_hash(c)
305
+ end
306
+ end
307
+
308
+ if Statsample::STATSAMPLE__.respond_to?(:case_as_array)
309
+ def case_as_array(c)
310
+ Statsample::STATSAMPLE__.case_as_array(self,c)
311
+ end
312
+ else
313
+ def case_as_array(c)
314
+ _case_as_array(c)
315
+ end
304
316
  end
305
- def case_as_array(c)
306
- @fields.collect {|x| @vectors[x][c]}
307
- end
317
+ def _case_as_hash(c)
318
+ @fields.inject({}) {|a,x|
319
+ a[x]=@vectors[x][c]
320
+ a
321
+ }
322
+ end
323
+ def _case_as_array(c)
324
+ @fields.collect {|x| @vectors[x][c]}
308
325
  end
326
+
309
327
  def each
310
328
  begin
311
329
  @i=0