statsample 0.3.3 → 0.3.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/History.txt +7 -1
- data/Manifest.txt +3 -0
- data/demo/benchmark.rb +7 -5
- data/demo/regression.rb +30 -4
- data/lib/statsample.rb +23 -15
- data/lib/statsample/bivariate.rb +28 -9
- data/lib/statsample/combination.rb +103 -0
- data/lib/statsample/converters.rb +16 -1
- data/lib/statsample/dataset.rb +29 -11
- data/lib/statsample/dominanceanalysis.rb +15 -11
- data/lib/statsample/dominanceanalysis/bootstrap.rb +9 -7
- data/lib/statsample/graph/svggraph.rb +1 -3
- data/lib/statsample/regression.rb +1 -0
- data/lib/statsample/regression/logit.rb +35 -0
- data/lib/statsample/regression/multiple.rb +21 -2
- data/lib/statsample/regression/multiple/alglibengine.rb +3 -1
- data/lib/statsample/vector.rb +168 -183
- data/test/test_combination.rb +42 -0
- data/test/test_csv.rb +1 -1
- data/test/test_dataset.rb +5 -0
- data/test/test_statistics.rb +19 -2
- data/test/test_svg_graph.rb +5 -2
- data/test/test_vector.rb +6 -1
- metadata +6 -2
data/History.txt
CHANGED
@@ -1,8 +1,14 @@
|
|
1
|
+
=== 0.3.4 / 2009-08-21
|
2
|
+
* Works with statsample-optimization 2.0.0
|
3
|
+
* Vector doesn't use delegation. All methods are part of Vector
|
4
|
+
* Added Combination. Generates all combinations of n elements taken r at a time
|
5
|
+
* Bivariate#prop_pearson now accepts a second parameter: :both, :left, :right, :positive or :negative
|
6
|
+
* Added LICENSE.txt
|
7
|
+
|
1
8
|
=== 0.3.3 / 2009-08-11
|
2
9
|
* Added i18n support. For now, only spanish translation available
|
3
10
|
* Bug fix: Test now load libraries on ../lib path
|
4
11
|
* Excel and CSV importers automatically modify type of vector to Scale when all data are numbers or nil values
|
5
|
-
*
|
6
12
|
|
7
13
|
=== 0.3.2 / 2009-08-04
|
8
14
|
|
data/Manifest.txt
CHANGED
@@ -25,6 +25,7 @@ lib/statsample/anova.rb
|
|
25
25
|
lib/statsample/bivariate.rb
|
26
26
|
lib/statsample/chidistribution.rb
|
27
27
|
lib/statsample/codification.rb
|
28
|
+
lib/statsample/combination.rb
|
28
29
|
lib/statsample/converters.rb
|
29
30
|
lib/statsample/crosstab.rb
|
30
31
|
lib/statsample/dataset.rb
|
@@ -38,6 +39,7 @@ lib/statsample/graph/svgscatterplot.rb
|
|
38
39
|
lib/statsample/htmlreport.rb
|
39
40
|
lib/statsample/multiset.rb
|
40
41
|
lib/statsample/regression.rb
|
42
|
+
lib/statsample/regression/logit.rb
|
41
43
|
lib/statsample/regression/multiple.rb
|
42
44
|
lib/statsample/regression/multiple/alglibengine.rb
|
43
45
|
lib/statsample/regression/multiple/gslengine.rb
|
@@ -54,6 +56,7 @@ setup.rb
|
|
54
56
|
test/_test_chart.rb
|
55
57
|
test/test_anova.rb
|
56
58
|
test/test_codification.rb
|
59
|
+
test/test_combination.rb
|
57
60
|
test/test_crosstab.rb
|
58
61
|
test/test_csv.csv
|
59
62
|
test/test_csv.rb
|
data/demo/benchmark.rb
CHANGED
@@ -1,4 +1,5 @@
|
|
1
|
-
|
1
|
+
$:.unshift(File.dirname(__FILE__)+'/../lib/')
|
2
|
+
require 'statsample'
|
2
3
|
require 'benchmark'
|
3
4
|
v=(0..10000).collect{|n|
|
4
5
|
r=rand(100)
|
@@ -37,14 +38,15 @@ ds=Statsample::Dataset.new({'a'=>a.to_vector(:scale),'b'=>b.to_vector(:scale), '
|
|
37
38
|
|
38
39
|
if (true)
|
39
40
|
Benchmark.bm(7) do |x|
|
40
|
-
x.report("Alglib coeffs") { for i in 1..n; lr=Statsample::Regression::Multiple::AlglibEngine.new(ds,"c"); lr.coeffs;
|
41
|
-
|
41
|
+
x.report("Alglib coeffs") { for i in 1..n; lr=Statsample::Regression::Multiple::AlglibEngine.new(ds,"c"); lr.coeffs; lr=nil;end }
|
42
|
+
|
43
|
+
x.report("GslEngine coeffs") { for i in 1..n; lr=Statsample::Regression::Multiple::GslEngine.new(ds,"c"); lr.coeffs;lr=nil; end }
|
42
44
|
end
|
43
45
|
end
|
44
46
|
if(true)
|
45
47
|
Benchmark.bm(7) do |x|
|
46
|
-
x.report("Alglib process") { for i in 1..n; lr=Statsample::Regression::Multiple::AlglibEngine.new(ds,"c"); lr.process([
|
47
|
-
x.report("GslEngine process") { for i in 1..n; lr=Statsample::Regression::Multiple::GslEngine.new(ds,"c"); lr.process([
|
48
|
+
x.report("Alglib process") { for i in 1..n; lr=Statsample::Regression::Multiple::AlglibEngine.new(ds,"c"); lr.process([rand(10),rand(10)]); end }
|
49
|
+
x.report("GslEngine process") { for i in 1..n; lr=Statsample::Regression::Multiple::GslEngine.new(ds,"c"); lr.process([rand(10),rand(10)]); end }
|
48
50
|
|
49
51
|
end
|
50
52
|
end
|
data/demo/regression.rb
CHANGED
@@ -2,22 +2,36 @@ require File.dirname(__FILE__)+'/../lib/statsample'
|
|
2
2
|
require 'benchmark'
|
3
3
|
tests=300
|
4
4
|
include Statsample
|
5
|
-
r = GSL::Rng.alloc(GSL::Rng::TAUS,Time.now.to_i)
|
6
5
|
ds=Dataset.new(%w{a b c d y})
|
7
6
|
ds['a'].type=:scale
|
8
7
|
ds['b'].type=:scale
|
9
8
|
ds['c'].type=:scale
|
10
9
|
ds['d'].type=:scale
|
11
10
|
ds['y'].type=:scale
|
11
|
+
|
12
|
+
if HAS_GSL
|
13
|
+
r = GSL::Rng.alloc(GSL::Rng::TAUS,Time.now.to_i)
|
12
14
|
|
13
15
|
tests.times {
|
14
16
|
a=r.ugaussian
|
15
|
-
b=r.ugaussian
|
17
|
+
b=a*2+r.ugaussian
|
16
18
|
c=r.ugaussian
|
17
19
|
d=r.ugaussian
|
18
20
|
y=a*70+b*30+c*5+r.ugaussian*5
|
19
21
|
ds.add_case_array([a,b,c,d,y])
|
20
22
|
}
|
23
|
+
else
|
24
|
+
tests.times {
|
25
|
+
a=1-rand()*2.0
|
26
|
+
b=1-rand()*2.0
|
27
|
+
c=1-rand()*2.0
|
28
|
+
d=1-rand()*2.0
|
29
|
+
y=a*70+b*30+c*5+(1-rand()*2.0)*5
|
30
|
+
ds.add_case_array([a,b,c,d,y])
|
31
|
+
}
|
32
|
+
|
33
|
+
|
34
|
+
end
|
21
35
|
ds.update_valid_data
|
22
36
|
|
23
37
|
if !File.exists? "regression.dab"
|
@@ -26,15 +40,27 @@ else
|
|
26
40
|
da=Statsample.load("regression.dab")
|
27
41
|
end
|
28
42
|
times=1
|
43
|
+
if(true)
|
29
44
|
Benchmark.bm(7) do |x|
|
45
|
+
if HAS_GSL
|
30
46
|
x.report("GslEngine:") {
|
31
47
|
da.lr_class=Regression::Multiple::GslEngine
|
32
48
|
da.bootstrap(times)
|
33
49
|
}
|
50
|
+
end
|
51
|
+
if(false)
|
52
|
+
if HAS_ALGIB
|
34
53
|
x.report("AlglibEngine:") {
|
35
54
|
da.lr_class=Regression::Multiple::AlglibEngine
|
36
55
|
da.bootstrap(times)
|
37
56
|
}
|
57
|
+
end
|
58
|
+
x.report("RubyEngine:") {
|
59
|
+
da.lr_class=Regression::Multiple::RubyEngine
|
60
|
+
da.bootstrap(times)
|
61
|
+
}
|
62
|
+
end
|
63
|
+
end
|
38
64
|
end
|
39
65
|
|
40
66
|
puts da.summary
|
@@ -45,11 +71,11 @@ lr=Regression::Multiple.listwise(ds,"y")
|
|
45
71
|
hr=HtmlReport.new("Regression")
|
46
72
|
hr.add_summary("Regression",lr.summary(HtmlSummary))
|
47
73
|
hr.add_summary("Analisis de Dominancia ", da.da.summary(HtmlSummary))
|
48
|
-
|
74
|
+
hr.add_correlation_matrix(ds)
|
49
75
|
hr.add_summary("Analisis de Dominancia (Bootstrap)", da.summary(HtmlSummary))
|
50
76
|
|
51
77
|
da.fields.each{|f|
|
52
|
-
hr.add_histogram("General Dominance #{f}",da.samples_ga[f].to_vector(:scale))
|
78
|
+
# hr.add_histogram("General Dominance #{f}",da.samples_ga[f].to_vector(:scale))
|
53
79
|
}
|
54
80
|
hr.save("Regression Dominance.html")
|
55
81
|
|
data/lib/statsample.rb
CHANGED
@@ -72,16 +72,6 @@ end
|
|
72
72
|
rescue LoadError
|
73
73
|
HAS_ALGIB=false
|
74
74
|
end
|
75
|
-
|
76
|
-
|
77
|
-
begin
|
78
|
-
require 'statsample/optimization'
|
79
|
-
rescue LoadError
|
80
|
-
module Statsample
|
81
|
-
OPTIMIZED=false
|
82
|
-
end
|
83
|
-
end
|
84
|
-
|
85
75
|
#
|
86
76
|
# Modules for statistical analysis
|
87
77
|
# See first:
|
@@ -90,10 +80,11 @@ end
|
|
90
80
|
# * Dataset: An union of vectors.
|
91
81
|
#
|
92
82
|
module Statsample
|
93
|
-
VERSION = '0.3.
|
83
|
+
VERSION = '0.3.4'
|
94
84
|
SPLIT_TOKEN = ","
|
95
85
|
autoload(:Database, 'statsample/converters')
|
96
86
|
autoload(:Anova, 'statsample/anova')
|
87
|
+
autoload(:Combination, 'statsample/combination')
|
97
88
|
autoload(:CSV, 'statsample/converters')
|
98
89
|
autoload(:Excel, 'statsample/converters')
|
99
90
|
autoload(:GGobi, 'statsample/converters')
|
@@ -113,10 +104,15 @@ module Statsample
|
|
113
104
|
autoload(:Regression, 'statsample/regression')
|
114
105
|
autoload(:Test, 'statsample/test')
|
115
106
|
def self.load(filename)
|
116
|
-
|
117
|
-
|
118
|
-
|
107
|
+
if File.exists? filename
|
108
|
+
o=false
|
109
|
+
File.open(filename,"r") {|fp|
|
110
|
+
o=Marshal.load(fp)
|
111
|
+
}
|
119
112
|
o
|
113
|
+
else
|
114
|
+
false
|
115
|
+
end
|
120
116
|
end
|
121
117
|
|
122
118
|
module Util
|
@@ -243,9 +239,21 @@ module Statsample
|
|
243
239
|
out
|
244
240
|
end
|
245
241
|
end
|
242
|
+
|
243
|
+
module STATSAMPLE__
|
244
|
+
end
|
245
|
+
|
246
|
+
end
|
247
|
+
|
248
|
+
|
249
|
+
begin
|
250
|
+
require 'statsamplert'
|
251
|
+
rescue LoadError
|
252
|
+
module Statsample
|
253
|
+
OPTIMIZED=false
|
254
|
+
end
|
246
255
|
end
|
247
256
|
|
248
257
|
require 'statsample/vector'
|
249
258
|
require 'statsample/dataset'
|
250
259
|
require 'statsample/crosstab'
|
251
|
-
|
data/lib/statsample/bivariate.rb
CHANGED
@@ -65,15 +65,34 @@ module Statsample
|
|
65
65
|
r*Math::sqrt(((size)-2).to_f / (1 - r**2))
|
66
66
|
end
|
67
67
|
# Retrieves the probability value (a la SPSS)
|
68
|
-
# for a given t, size and number of tails
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
68
|
+
# for a given t, size and number of tails.
|
69
|
+
# Uses a second parameter
|
70
|
+
# * :both or 2 : for r!=0
|
71
|
+
# * :right, :positive or 1 : for r > 0
|
72
|
+
# * :left, :negative : for r < 0
|
73
|
+
|
74
|
+
def prop_pearson(t,size, tails=:both)
|
75
|
+
tails=:both if tails==2
|
76
|
+
tails=:right if tails==1 or tails==:positive
|
77
|
+
tails=:left if tails==:negative
|
78
|
+
|
79
|
+
n_tails=case tails
|
80
|
+
when :both
|
81
|
+
2
|
82
|
+
else
|
83
|
+
1
|
84
|
+
end
|
85
|
+
if HAS_GSL
|
86
|
+
t=-t if t>0 and (tails==:both)
|
87
|
+
cdf=GSL::Cdf::tdist_P(t,size-2)
|
88
|
+
if(tails==:right)
|
89
|
+
1.0-(cdf*n_tails)
|
90
|
+
else
|
91
|
+
cdf*n_tails
|
92
|
+
end
|
93
|
+
else
|
94
|
+
raise "Needs ruby-gsl"
|
95
|
+
end
|
77
96
|
end
|
78
97
|
# Returns residual score after delete variance
|
79
98
|
# from another variable
|
@@ -0,0 +1,103 @@
|
|
1
|
+
module Statsample
|
2
|
+
# Combination class systematically generates all combinations of n elements, taken r at a time.
|
3
|
+
# Uses GSL::Combination if available, for extra speed
|
4
|
+
# Source: http://snippets.dzone.com/posts/show/4666
|
5
|
+
# Use:
|
6
|
+
# comb=Statsample::Combination.new(3,5)
|
7
|
+
# comb.each{|c|
|
8
|
+
# p c
|
9
|
+
# }
|
10
|
+
class Combination
|
11
|
+
attr_reader :d
|
12
|
+
def initialize(k,n,only_ruby=false)
|
13
|
+
@k=k
|
14
|
+
@n=n
|
15
|
+
if HAS_GSL and !only_ruby
|
16
|
+
@d=CombinationGsl.new(@k,@n)
|
17
|
+
else
|
18
|
+
@d=CombinationRuby.new(@k,@n)
|
19
|
+
end
|
20
|
+
end
|
21
|
+
def each
|
22
|
+
reset
|
23
|
+
while a=next_value
|
24
|
+
yield a
|
25
|
+
end
|
26
|
+
end
|
27
|
+
def reset
|
28
|
+
@d.reset
|
29
|
+
end
|
30
|
+
def next_value
|
31
|
+
@d.next_value
|
32
|
+
end
|
33
|
+
class CombinationRuby
|
34
|
+
attr_reader :data
|
35
|
+
def initialize(k,n)
|
36
|
+
raise "k<=n" if k>n
|
37
|
+
@k=k
|
38
|
+
@n=n
|
39
|
+
reset
|
40
|
+
end
|
41
|
+
def reset
|
42
|
+
@data=[]
|
43
|
+
(0...@k).each {|i|
|
44
|
+
@data[i] = i;
|
45
|
+
}
|
46
|
+
end
|
47
|
+
def each
|
48
|
+
reset
|
49
|
+
while a=next_value
|
50
|
+
yield a
|
51
|
+
end
|
52
|
+
end
|
53
|
+
def next_value
|
54
|
+
return false if !@data
|
55
|
+
old_comb=@data.dup
|
56
|
+
i = @k - 1;
|
57
|
+
@data[i]+=1
|
58
|
+
while ((i >= 0) and (@data[i] >= @n - @k + 1 + i)) do
|
59
|
+
i-=1;
|
60
|
+
@data[i]+=1;
|
61
|
+
end
|
62
|
+
|
63
|
+
if (@data[0] > @n - @k) # Combination (n-k, n-k+1, ..., n) reached */
|
64
|
+
@data=false # No more combinations can be generated
|
65
|
+
else
|
66
|
+
# comb now looks like (..., x, n, n, n, ..., n).
|
67
|
+
# Turn it into (..., x, x + 1, x + 2, ...)
|
68
|
+
i = i+1
|
69
|
+
(i...@k).each{ |i1|
|
70
|
+
@data[i1] = @data[i1 - 1] + 1
|
71
|
+
}
|
72
|
+
end
|
73
|
+
return old_comb
|
74
|
+
end
|
75
|
+
end
|
76
|
+
class CombinationGsl
|
77
|
+
def initialize(k,n)
|
78
|
+
require 'gsl'
|
79
|
+
raise "k<=n" if k>n
|
80
|
+
@k=k
|
81
|
+
@n=n
|
82
|
+
reset
|
83
|
+
end
|
84
|
+
def reset
|
85
|
+
@c= ::GSL::Combination.calloc(@n, @k);
|
86
|
+
end
|
87
|
+
def next_value
|
88
|
+
return false if !@c
|
89
|
+
data=@c.data.to_a
|
90
|
+
if @c.next != GSL::SUCCESS
|
91
|
+
@c=false
|
92
|
+
end
|
93
|
+
return data
|
94
|
+
end
|
95
|
+
def each
|
96
|
+
reset
|
97
|
+
begin
|
98
|
+
yield @c.data.to_a
|
99
|
+
end while @c.next == GSL::SUCCESS
|
100
|
+
end
|
101
|
+
end
|
102
|
+
end
|
103
|
+
end
|
@@ -148,6 +148,13 @@ module Statsample
|
|
148
148
|
book = Spreadsheet.open filename
|
149
149
|
sheet= book.worksheet worksheet_id
|
150
150
|
sheet.each do |row|
|
151
|
+
begin
|
152
|
+
dates=[]
|
153
|
+
row.formats.each_index{|i|
|
154
|
+
if !row.formats[i].nil? and row.formats[i].number_format=="DD/MM/YYYY"
|
155
|
+
dates.push(i)
|
156
|
+
end
|
157
|
+
}
|
151
158
|
line_number+=1
|
152
159
|
if(line_number<=ignore_lines)
|
153
160
|
#puts "Skip line"
|
@@ -155,9 +162,13 @@ module Statsample
|
|
155
162
|
end
|
156
163
|
# This should be fixed.
|
157
164
|
# If we have a Formula, it should be resolved first
|
165
|
+
i=-1
|
158
166
|
row.collect!{|c|
|
167
|
+
i+=1
|
159
168
|
if c.is_a? Spreadsheet::Formula
|
160
|
-
|
169
|
+
c.value
|
170
|
+
elsif dates.include? i and !c.nil? and c.is_a? Numeric
|
171
|
+
row.date(i)
|
161
172
|
else
|
162
173
|
c
|
163
174
|
end
|
@@ -173,6 +184,10 @@ module Statsample
|
|
173
184
|
}
|
174
185
|
ds.add_case(rowa,false)
|
175
186
|
end
|
187
|
+
rescue => e
|
188
|
+
error="#{e.to_s}\nError on Line # #{line_number}:#{row.join(",")}"
|
189
|
+
raise
|
190
|
+
end
|
176
191
|
end
|
177
192
|
convert_to_scale(ds,fields)
|
178
193
|
ds.update_valid_data
|
data/lib/statsample/dataset.rb
CHANGED
@@ -27,7 +27,7 @@ module Statsample
|
|
27
27
|
@exp=e
|
28
28
|
end
|
29
29
|
def to_s
|
30
|
-
m="Error:"+@exp.message+@exp.backtrace.join("\n")+"\nOn Dataset:"+@ds.inspect
|
30
|
+
m="Error: "+@exp.message+"\n"+@exp.backtrace.join("\n")+"\nOn Dataset:"+@ds.inspect
|
31
31
|
m+="\nRow: #{@i}" unless @i.nil?
|
32
32
|
m
|
33
33
|
end
|
@@ -158,7 +158,7 @@ module Statsample
|
|
158
158
|
end
|
159
159
|
# Fast version of add case
|
160
160
|
# Can only add one case and no error check is performed
|
161
|
-
# You SHOULD use update_valid_data at the
|
161
|
+
# You SHOULD use update_valid_data at the end of the insertion cycle
|
162
162
|
def add_case_array(v)
|
163
163
|
v.each_index {|i| d=@vectors[@fields[i]].data; d.push(v[i])}
|
164
164
|
end
|
@@ -295,17 +295,35 @@ module Statsample
|
|
295
295
|
yield k,@vectors[k]
|
296
296
|
}
|
297
297
|
end
|
298
|
-
|
299
|
-
|
300
|
-
|
301
|
-
|
302
|
-
|
303
|
-
|
298
|
+
if Statsample::STATSAMPLE__.respond_to?(:case_as_hash)
|
299
|
+
def case_as_hash(c)
|
300
|
+
Statsample::STATSAMPLE__.case_as_hash(self,c)
|
301
|
+
end
|
302
|
+
else
|
303
|
+
def case_as_hash(c)
|
304
|
+
_case_as_hash(c)
|
305
|
+
end
|
306
|
+
end
|
307
|
+
|
308
|
+
if Statsample::STATSAMPLE__.respond_to?(:case_as_array)
|
309
|
+
def case_as_array(c)
|
310
|
+
Statsample::STATSAMPLE__.case_as_array(self,c)
|
311
|
+
end
|
312
|
+
else
|
313
|
+
def case_as_array(c)
|
314
|
+
_case_as_array(c)
|
315
|
+
end
|
304
316
|
end
|
305
|
-
|
306
|
-
|
307
|
-
|
317
|
+
def _case_as_hash(c)
|
318
|
+
@fields.inject({}) {|a,x|
|
319
|
+
a[x]=@vectors[x][c]
|
320
|
+
a
|
321
|
+
}
|
322
|
+
end
|
323
|
+
def _case_as_array(c)
|
324
|
+
@fields.collect {|x| @vectors[x][c]}
|
308
325
|
end
|
326
|
+
|
309
327
|
def each
|
310
328
|
begin
|
311
329
|
@i=0
|