statsample 0.3.3 → 0.3.4
Sign up to get free protection for your applications and to get access to all the features.
- data/History.txt +7 -1
- data/Manifest.txt +3 -0
- data/demo/benchmark.rb +7 -5
- data/demo/regression.rb +30 -4
- data/lib/statsample.rb +23 -15
- data/lib/statsample/bivariate.rb +28 -9
- data/lib/statsample/combination.rb +103 -0
- data/lib/statsample/converters.rb +16 -1
- data/lib/statsample/dataset.rb +29 -11
- data/lib/statsample/dominanceanalysis.rb +15 -11
- data/lib/statsample/dominanceanalysis/bootstrap.rb +9 -7
- data/lib/statsample/graph/svggraph.rb +1 -3
- data/lib/statsample/regression.rb +1 -0
- data/lib/statsample/regression/logit.rb +35 -0
- data/lib/statsample/regression/multiple.rb +21 -2
- data/lib/statsample/regression/multiple/alglibengine.rb +3 -1
- data/lib/statsample/vector.rb +168 -183
- data/test/test_combination.rb +42 -0
- data/test/test_csv.rb +1 -1
- data/test/test_dataset.rb +5 -0
- data/test/test_statistics.rb +19 -2
- data/test/test_svg_graph.rb +5 -2
- data/test/test_vector.rb +6 -1
- metadata +6 -2
data/History.txt
CHANGED
@@ -1,8 +1,14 @@
|
|
1
|
+
=== 0.3.4 / 2009-08-21
|
2
|
+
* Works with statsample-optimization 2.0.0
|
3
|
+
* Vector doesn't use delegation. All methods are part of Vector
|
4
|
+
* Added Combination. Generates all combinations of n elements taken r at a time
|
5
|
+
* Bivariate#prop_pearson now accepts :both, :left, :right, :positive or :negative as a second parameter
|
6
|
+
* Added LICENSE.txt
|
7
|
+
|
1
8
|
=== 0.3.3 / 2009-08-11
|
2
9
|
* Added i18n support. For now, only a Spanish translation is available
|
3
10
|
* Bug fix: Test now load libraries on ../lib path
|
4
11
|
* Excel and CSV importers automatically modify type of vector to Scale when all data are numbers or nil values
|
5
|
-
*
|
6
12
|
|
7
13
|
=== 0.3.2 / 2009-08-04
|
8
14
|
|
data/Manifest.txt
CHANGED
@@ -25,6 +25,7 @@ lib/statsample/anova.rb
|
|
25
25
|
lib/statsample/bivariate.rb
|
26
26
|
lib/statsample/chidistribution.rb
|
27
27
|
lib/statsample/codification.rb
|
28
|
+
lib/statsample/combination.rb
|
28
29
|
lib/statsample/converters.rb
|
29
30
|
lib/statsample/crosstab.rb
|
30
31
|
lib/statsample/dataset.rb
|
@@ -38,6 +39,7 @@ lib/statsample/graph/svgscatterplot.rb
|
|
38
39
|
lib/statsample/htmlreport.rb
|
39
40
|
lib/statsample/multiset.rb
|
40
41
|
lib/statsample/regression.rb
|
42
|
+
lib/statsample/regression/logit.rb
|
41
43
|
lib/statsample/regression/multiple.rb
|
42
44
|
lib/statsample/regression/multiple/alglibengine.rb
|
43
45
|
lib/statsample/regression/multiple/gslengine.rb
|
@@ -54,6 +56,7 @@ setup.rb
|
|
54
56
|
test/_test_chart.rb
|
55
57
|
test/test_anova.rb
|
56
58
|
test/test_codification.rb
|
59
|
+
test/test_combination.rb
|
57
60
|
test/test_crosstab.rb
|
58
61
|
test/test_csv.csv
|
59
62
|
test/test_csv.rb
|
data/demo/benchmark.rb
CHANGED
@@ -1,4 +1,5 @@
|
|
1
|
-
|
1
|
+
$:.unshift(File.dirname(__FILE__)+'/../lib/')
|
2
|
+
require 'statsample'
|
2
3
|
require 'benchmark'
|
3
4
|
v=(0..10000).collect{|n|
|
4
5
|
r=rand(100)
|
@@ -37,14 +38,15 @@ ds=Statsample::Dataset.new({'a'=>a.to_vector(:scale),'b'=>b.to_vector(:scale), '
|
|
37
38
|
|
38
39
|
if (true)
|
39
40
|
Benchmark.bm(7) do |x|
|
40
|
-
x.report("Alglib coeffs") { for i in 1..n; lr=Statsample::Regression::Multiple::AlglibEngine.new(ds,"c"); lr.coeffs;
|
41
|
-
|
41
|
+
x.report("Alglib coeffs") { for i in 1..n; lr=Statsample::Regression::Multiple::AlglibEngine.new(ds,"c"); lr.coeffs; lr=nil;end }
|
42
|
+
|
43
|
+
x.report("GslEngine coeffs") { for i in 1..n; lr=Statsample::Regression::Multiple::GslEngine.new(ds,"c"); lr.coeffs;lr=nil; end }
|
42
44
|
end
|
43
45
|
end
|
44
46
|
if(true)
|
45
47
|
Benchmark.bm(7) do |x|
|
46
|
-
x.report("Alglib process") { for i in 1..n; lr=Statsample::Regression::Multiple::AlglibEngine.new(ds,"c"); lr.process([
|
47
|
-
x.report("GslEngine process") { for i in 1..n; lr=Statsample::Regression::Multiple::GslEngine.new(ds,"c"); lr.process([
|
48
|
+
x.report("Alglib process") { for i in 1..n; lr=Statsample::Regression::Multiple::AlglibEngine.new(ds,"c"); lr.process([rand(10),rand(10)]); end }
|
49
|
+
x.report("GslEngine process") { for i in 1..n; lr=Statsample::Regression::Multiple::GslEngine.new(ds,"c"); lr.process([rand(10),rand(10)]); end }
|
48
50
|
|
49
51
|
end
|
50
52
|
end
|
data/demo/regression.rb
CHANGED
@@ -2,22 +2,36 @@ require File.dirname(__FILE__)+'/../lib/statsample'
|
|
2
2
|
require 'benchmark'
|
3
3
|
tests=300
|
4
4
|
include Statsample
|
5
|
-
r = GSL::Rng.alloc(GSL::Rng::TAUS,Time.now.to_i)
|
6
5
|
ds=Dataset.new(%w{a b c d y})
|
7
6
|
ds['a'].type=:scale
|
8
7
|
ds['b'].type=:scale
|
9
8
|
ds['c'].type=:scale
|
10
9
|
ds['d'].type=:scale
|
11
10
|
ds['y'].type=:scale
|
11
|
+
|
12
|
+
if HAS_GSL
|
13
|
+
r = GSL::Rng.alloc(GSL::Rng::TAUS,Time.now.to_i)
|
12
14
|
|
13
15
|
tests.times {
|
14
16
|
a=r.ugaussian
|
15
|
-
b=r.ugaussian
|
17
|
+
b=a*2+r.ugaussian
|
16
18
|
c=r.ugaussian
|
17
19
|
d=r.ugaussian
|
18
20
|
y=a*70+b*30+c*5+r.ugaussian*5
|
19
21
|
ds.add_case_array([a,b,c,d,y])
|
20
22
|
}
|
23
|
+
else
|
24
|
+
tests.times {
|
25
|
+
a=1-rand()*2.0
|
26
|
+
b=1-rand()*2.0
|
27
|
+
c=1-rand()*2.0
|
28
|
+
d=1-rand()*2.0
|
29
|
+
y=a*70+b*30+c*5+(1-rand()*2.0)*5
|
30
|
+
ds.add_case_array([a,b,c,d,y])
|
31
|
+
}
|
32
|
+
|
33
|
+
|
34
|
+
end
|
21
35
|
ds.update_valid_data
|
22
36
|
|
23
37
|
if !File.exists? "regression.dab"
|
@@ -26,15 +40,27 @@ else
|
|
26
40
|
da=Statsample.load("regression.dab")
|
27
41
|
end
|
28
42
|
times=1
|
43
|
+
if(true)
|
29
44
|
Benchmark.bm(7) do |x|
|
45
|
+
if HAS_GSL
|
30
46
|
x.report("GslEngine:") {
|
31
47
|
da.lr_class=Regression::Multiple::GslEngine
|
32
48
|
da.bootstrap(times)
|
33
49
|
}
|
50
|
+
end
|
51
|
+
if(false)
|
52
|
+
if HAS_ALGIB
|
34
53
|
x.report("AlglibEngine:") {
|
35
54
|
da.lr_class=Regression::Multiple::AlglibEngine
|
36
55
|
da.bootstrap(times)
|
37
56
|
}
|
57
|
+
end
|
58
|
+
x.report("RubyEngine:") {
|
59
|
+
da.lr_class=Regression::Multiple::RubyEngine
|
60
|
+
da.bootstrap(times)
|
61
|
+
}
|
62
|
+
end
|
63
|
+
end
|
38
64
|
end
|
39
65
|
|
40
66
|
puts da.summary
|
@@ -45,11 +71,11 @@ lr=Regression::Multiple.listwise(ds,"y")
|
|
45
71
|
hr=HtmlReport.new("Regression")
|
46
72
|
hr.add_summary("Regression",lr.summary(HtmlSummary))
|
47
73
|
hr.add_summary("Analisis de Dominancia ", da.da.summary(HtmlSummary))
|
48
|
-
|
74
|
+
hr.add_correlation_matrix(ds)
|
49
75
|
hr.add_summary("Analisis de Dominancia (Bootstrap)", da.summary(HtmlSummary))
|
50
76
|
|
51
77
|
da.fields.each{|f|
|
52
|
-
hr.add_histogram("General Dominance #{f}",da.samples_ga[f].to_vector(:scale))
|
78
|
+
# hr.add_histogram("General Dominance #{f}",da.samples_ga[f].to_vector(:scale))
|
53
79
|
}
|
54
80
|
hr.save("Regression Dominance.html")
|
55
81
|
|
data/lib/statsample.rb
CHANGED
@@ -72,16 +72,6 @@ end
|
|
72
72
|
rescue LoadError
|
73
73
|
HAS_ALGIB=false
|
74
74
|
end
|
75
|
-
|
76
|
-
|
77
|
-
begin
|
78
|
-
require 'statsample/optimization'
|
79
|
-
rescue LoadError
|
80
|
-
module Statsample
|
81
|
-
OPTIMIZED=false
|
82
|
-
end
|
83
|
-
end
|
84
|
-
|
85
75
|
#
|
86
76
|
# Modules for statistical analysis
|
87
77
|
# See first:
|
@@ -90,10 +80,11 @@ end
|
|
90
80
|
# * Dataset: An union of vectors.
|
91
81
|
#
|
92
82
|
module Statsample
|
93
|
-
VERSION = '0.3.
|
83
|
+
VERSION = '0.3.4'
|
94
84
|
SPLIT_TOKEN = ","
|
95
85
|
autoload(:Database, 'statsample/converters')
|
96
86
|
autoload(:Anova, 'statsample/anova')
|
87
|
+
autoload(:Combination, 'statsample/combination')
|
97
88
|
autoload(:CSV, 'statsample/converters')
|
98
89
|
autoload(:Excel, 'statsample/converters')
|
99
90
|
autoload(:GGobi, 'statsample/converters')
|
@@ -113,10 +104,15 @@ module Statsample
|
|
113
104
|
autoload(:Regression, 'statsample/regression')
|
114
105
|
autoload(:Test, 'statsample/test')
|
115
106
|
def self.load(filename)
|
116
|
-
|
117
|
-
|
118
|
-
|
107
|
+
if File.exists? filename
|
108
|
+
o=false
|
109
|
+
File.open(filename,"r") {|fp|
|
110
|
+
o=Marshal.load(fp)
|
111
|
+
}
|
119
112
|
o
|
113
|
+
else
|
114
|
+
false
|
115
|
+
end
|
120
116
|
end
|
121
117
|
|
122
118
|
module Util
|
@@ -243,9 +239,21 @@ module Statsample
|
|
243
239
|
out
|
244
240
|
end
|
245
241
|
end
|
242
|
+
|
243
|
+
module STATSAMPLE__
|
244
|
+
end
|
245
|
+
|
246
|
+
end
|
247
|
+
|
248
|
+
|
249
|
+
begin
|
250
|
+
require 'statsamplert'
|
251
|
+
rescue LoadError
|
252
|
+
module Statsample
|
253
|
+
OPTIMIZED=false
|
254
|
+
end
|
246
255
|
end
|
247
256
|
|
248
257
|
require 'statsample/vector'
|
249
258
|
require 'statsample/dataset'
|
250
259
|
require 'statsample/crosstab'
|
251
|
-
|
data/lib/statsample/bivariate.rb
CHANGED
@@ -65,15 +65,34 @@ module Statsample
|
|
65
65
|
r*Math::sqrt(((size)-2).to_f / (1 - r**2))
|
66
66
|
end
|
67
67
|
# Retrieves the probability value (a la SPSS)
|
68
|
-
# for a given t, size and number of tails
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
68
|
+
# for a given t, size and number of tails.
|
69
|
+
# Uses a second parameter
|
70
|
+
# * :both or 2 : for r!=0
|
71
|
+
# * :right, :positive or 1 : for r > 0
|
72
|
+
# * :left, :negative : for r < 0
|
73
|
+
|
74
|
+
def prop_pearson(t,size, tails=:both)
|
75
|
+
tails=:both if tails==2
|
76
|
+
tails=:right if tails==1 or tails==:positive
|
77
|
+
tails=:left if tails==:negative
|
78
|
+
|
79
|
+
n_tails=case tails
|
80
|
+
when :both
|
81
|
+
2
|
82
|
+
else
|
83
|
+
1
|
84
|
+
end
|
85
|
+
if HAS_GSL
|
86
|
+
t=-t if t>0 and (tails==:both)
|
87
|
+
cdf=GSL::Cdf::tdist_P(t,size-2)
|
88
|
+
if(tails==:right)
|
89
|
+
1.0-(cdf*n_tails)
|
90
|
+
else
|
91
|
+
cdf*n_tails
|
92
|
+
end
|
93
|
+
else
|
94
|
+
raise "Needs ruby-gsl"
|
95
|
+
end
|
77
96
|
end
|
78
97
|
# Returns residual score after delete variance
|
79
98
|
# from another variable
|
@@ -0,0 +1,103 @@
|
|
1
|
+
module Statsample
|
2
|
+
# Combination class systematically generates all combinations of n elements, taken r at a time.
|
3
|
+
# Uses GSL::Combination if available for extra speed
|
4
|
+
# Source: http://snippets.dzone.com/posts/show/4666
|
5
|
+
# Use:
|
6
|
+
# comb=Statsample::Combination.new(3,5)
|
7
|
+
# comb.each{|c|
|
8
|
+
# p c
|
9
|
+
# }
|
10
|
+
class Combination
|
11
|
+
attr_reader :d
|
12
|
+
def initialize(k,n,only_ruby=false)
|
13
|
+
@k=k
|
14
|
+
@n=n
|
15
|
+
if HAS_GSL and !only_ruby
|
16
|
+
@d=CombinationGsl.new(@k,@n)
|
17
|
+
else
|
18
|
+
@d=CombinationRuby.new(@k,@n)
|
19
|
+
end
|
20
|
+
end
|
21
|
+
def each
|
22
|
+
reset
|
23
|
+
while a=next_value
|
24
|
+
yield a
|
25
|
+
end
|
26
|
+
end
|
27
|
+
def reset
|
28
|
+
@d.reset
|
29
|
+
end
|
30
|
+
def next_value
|
31
|
+
@d.next_value
|
32
|
+
end
|
33
|
+
class CombinationRuby
|
34
|
+
attr_reader :data
|
35
|
+
def initialize(k,n)
|
36
|
+
raise "k<=n" if k>n
|
37
|
+
@k=k
|
38
|
+
@n=n
|
39
|
+
reset
|
40
|
+
end
|
41
|
+
def reset
|
42
|
+
@data=[]
|
43
|
+
(0...@k).each {|i|
|
44
|
+
@data[i] = i;
|
45
|
+
}
|
46
|
+
end
|
47
|
+
def each
|
48
|
+
reset
|
49
|
+
while a=next_value
|
50
|
+
yield a
|
51
|
+
end
|
52
|
+
end
|
53
|
+
def next_value
|
54
|
+
return false if !@data
|
55
|
+
old_comb=@data.dup
|
56
|
+
i = @k - 1;
|
57
|
+
@data[i]+=1
|
58
|
+
while ((i >= 0) and (@data[i] >= @n - @k + 1 + i)) do
|
59
|
+
i-=1;
|
60
|
+
@data[i]+=1;
|
61
|
+
end
|
62
|
+
|
63
|
+
if (@data[0] > @n - @k) # Combination (n-k, n-k+1, ..., n) reached */
|
64
|
+
@data=false # No more combinations can be generated
|
65
|
+
else
|
66
|
+
# comb now looks like (..., x, n, n, n, ..., n).
|
67
|
+
# Turn it into (..., x, x + 1, x + 2, ...)
|
68
|
+
i = i+1
|
69
|
+
(i...@k).each{ |i1|
|
70
|
+
@data[i1] = @data[i1 - 1] + 1
|
71
|
+
}
|
72
|
+
end
|
73
|
+
return old_comb
|
74
|
+
end
|
75
|
+
end
|
76
|
+
class CombinationGsl
|
77
|
+
def initialize(k,n)
|
78
|
+
require 'gsl'
|
79
|
+
raise "k<=n" if k>n
|
80
|
+
@k=k
|
81
|
+
@n=n
|
82
|
+
reset
|
83
|
+
end
|
84
|
+
def reset
|
85
|
+
@c= ::GSL::Combination.calloc(@n, @k);
|
86
|
+
end
|
87
|
+
def next_value
|
88
|
+
return false if !@c
|
89
|
+
data=@c.data.to_a
|
90
|
+
if @c.next != GSL::SUCCESS
|
91
|
+
@c=false
|
92
|
+
end
|
93
|
+
return data
|
94
|
+
end
|
95
|
+
def each
|
96
|
+
reset
|
97
|
+
begin
|
98
|
+
yield @c.data.to_a
|
99
|
+
end while @c.next == GSL::SUCCESS
|
100
|
+
end
|
101
|
+
end
|
102
|
+
end
|
103
|
+
end
|
@@ -148,6 +148,13 @@ module Statsample
|
|
148
148
|
book = Spreadsheet.open filename
|
149
149
|
sheet= book.worksheet worksheet_id
|
150
150
|
sheet.each do |row|
|
151
|
+
begin
|
152
|
+
dates=[]
|
153
|
+
row.formats.each_index{|i|
|
154
|
+
if !row.formats[i].nil? and row.formats[i].number_format=="DD/MM/YYYY"
|
155
|
+
dates.push(i)
|
156
|
+
end
|
157
|
+
}
|
151
158
|
line_number+=1
|
152
159
|
if(line_number<=ignore_lines)
|
153
160
|
#puts "Skip line"
|
@@ -155,9 +162,13 @@ module Statsample
|
|
155
162
|
end
|
156
163
|
# This should be fixed.
|
157
164
|
# If we have a Formula, it should be resolved first
|
165
|
+
i=-1
|
158
166
|
row.collect!{|c|
|
167
|
+
i+=1
|
159
168
|
if c.is_a? Spreadsheet::Formula
|
160
|
-
|
169
|
+
c.value
|
170
|
+
elsif dates.include? i and !c.nil? and c.is_a? Numeric
|
171
|
+
row.date(i)
|
161
172
|
else
|
162
173
|
c
|
163
174
|
end
|
@@ -173,6 +184,10 @@ module Statsample
|
|
173
184
|
}
|
174
185
|
ds.add_case(rowa,false)
|
175
186
|
end
|
187
|
+
rescue => e
|
188
|
+
error="#{e.to_s}\nError on Line # #{line_number}:#{row.join(",")}"
|
189
|
+
raise
|
190
|
+
end
|
176
191
|
end
|
177
192
|
convert_to_scale(ds,fields)
|
178
193
|
ds.update_valid_data
|
data/lib/statsample/dataset.rb
CHANGED
@@ -27,7 +27,7 @@ module Statsample
|
|
27
27
|
@exp=e
|
28
28
|
end
|
29
29
|
def to_s
|
30
|
-
m="Error:"+@exp.message+@exp.backtrace.join("\n")+"\nOn Dataset:"+@ds.inspect
|
30
|
+
m="Error: "+@exp.message+"\n"+@exp.backtrace.join("\n")+"\nOn Dataset:"+@ds.inspect
|
31
31
|
m+="\nRow: #{@i}" unless @i.nil?
|
32
32
|
m
|
33
33
|
end
|
@@ -158,7 +158,7 @@ module Statsample
|
|
158
158
|
end
|
159
159
|
# Fast version of add case
|
160
160
|
# Can only add one case and no error check is performed
|
161
|
-
# You SHOULD use update_valid_data at the
|
161
|
+
# You SHOULD use update_valid_data at the end of insertion cycle
|
162
162
|
def add_case_array(v)
|
163
163
|
v.each_index {|i| d=@vectors[@fields[i]].data; d.push(v[i])}
|
164
164
|
end
|
@@ -295,17 +295,35 @@ module Statsample
|
|
295
295
|
yield k,@vectors[k]
|
296
296
|
}
|
297
297
|
end
|
298
|
-
|
299
|
-
|
300
|
-
|
301
|
-
|
302
|
-
|
303
|
-
|
298
|
+
if Statsample::STATSAMPLE__.respond_to?(:case_as_hash)
|
299
|
+
def case_as_hash(c)
|
300
|
+
Statsample::STATSAMPLE__.case_as_hash(self,c)
|
301
|
+
end
|
302
|
+
else
|
303
|
+
def case_as_hash(c)
|
304
|
+
_case_as_hash(c)
|
305
|
+
end
|
306
|
+
end
|
307
|
+
|
308
|
+
if Statsample::STATSAMPLE__.respond_to?(:case_as_array)
|
309
|
+
def case_as_array(c)
|
310
|
+
Statsample::STATSAMPLE__.case_as_array(self,c)
|
311
|
+
end
|
312
|
+
else
|
313
|
+
def case_as_array(c)
|
314
|
+
_case_as_array(c)
|
315
|
+
end
|
304
316
|
end
|
305
|
-
|
306
|
-
|
307
|
-
|
317
|
+
def _case_as_hash(c)
|
318
|
+
@fields.inject({}) {|a,x|
|
319
|
+
a[x]=@vectors[x][c]
|
320
|
+
a
|
321
|
+
}
|
322
|
+
end
|
323
|
+
def _case_as_array(c)
|
324
|
+
@fields.collect {|x| @vectors[x][c]}
|
308
325
|
end
|
326
|
+
|
309
327
|
def each
|
310
328
|
begin
|
311
329
|
@i=0
|