statsample 0.3.0 → 0.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/History.txt +5 -0
- data/Manifest.txt +13 -2
- data/demo/benchmark.rb +1 -1
- data/demo/crosstab.rb +7 -0
- data/demo/nunnally_6.rb +34 -0
- data/demo/proportion.rb +1 -1
- data/demo/regression.rb +46 -0
- data/demo/t-student.rb +17 -0
- data/lib/statsample.rb +3 -4
- data/lib/statsample/crosstab.rb +34 -1
- data/lib/statsample/dominanceanalysis.rb +2 -2
- data/lib/statsample/dominanceanalysis/bootstrap.rb +2 -1
- data/lib/statsample/regression.rb +6 -518
- data/lib/statsample/regression/multiple.rb +259 -0
- data/lib/statsample/regression/multiple/alglibengine.rb +117 -0
- data/lib/statsample/regression/multiple/rubyengine.rb +140 -0
- data/lib/statsample/regression/simple.rb +81 -0
- data/test/test_regression.rb +5 -5
- data/test/test_statistics.rb +2 -12
- data/test/test_xls.xls +0 -0
- metadata +14 -3
data/History.txt
CHANGED
@@ -1,3 +1,8 @@
|
|
1
|
+
=== 0.3.1 / 2009-08-03
|
2
|
+
|
3
|
+
* Name and logic of Regression classes changed. Now, you have Regression::Simple class and Regression::Multiple module with two engines: RubyEngine and AlglibEngine
|
4
|
+
* New Crosstab#summary
|
5
|
+
|
1
6
|
=== 0.3.0 / 2009-08-02
|
2
7
|
|
3
8
|
* Statsample renamed to Statsample
|
data/Manifest.txt
CHANGED
@@ -5,15 +5,20 @@ Rakefile
|
|
5
5
|
bin/statsample
|
6
6
|
demo/benchmark.rb
|
7
7
|
demo/chi-square.rb
|
8
|
+
demo/crosstab.rb
|
8
9
|
demo/dice.rb
|
9
10
|
demo/distribution_t.rb
|
10
11
|
demo/graph.rb
|
11
12
|
demo/item_analysis.rb
|
12
13
|
demo/mean.rb
|
14
|
+
demo/nunnally_6.rb
|
13
15
|
demo/proportion.rb
|
16
|
+
demo/regression.rb
|
14
17
|
demo/sample_test.csv
|
15
18
|
demo/strata_proportion.rb
|
16
19
|
demo/stratum.rb
|
20
|
+
demo/t-student.rb
|
21
|
+
lib/spss.rb
|
17
22
|
lib/statsample.rb
|
18
23
|
lib/statsample/anova.rb
|
19
24
|
lib/statsample/bivariate.rb
|
@@ -25,19 +30,22 @@ lib/statsample/dataset.rb
|
|
25
30
|
lib/statsample/dominanceanalysis.rb
|
26
31
|
lib/statsample/dominanceanalysis/bootstrap.rb
|
27
32
|
lib/statsample/graph/gdchart.rb
|
28
|
-
lib/statsample/graph/svggraph.rb
|
29
33
|
lib/statsample/graph/svgboxplot.rb
|
34
|
+
lib/statsample/graph/svggraph.rb
|
30
35
|
lib/statsample/graph/svghistogram.rb
|
31
36
|
lib/statsample/graph/svgscatterplot.rb
|
32
37
|
lib/statsample/htmlreport.rb
|
33
38
|
lib/statsample/multiset.rb
|
34
39
|
lib/statsample/regression.rb
|
40
|
+
lib/statsample/regression/multiple.rb
|
41
|
+
lib/statsample/regression/multiple/alglibengine.rb
|
42
|
+
lib/statsample/regression/multiple/rubyengine.rb
|
43
|
+
lib/statsample/regression/simple.rb
|
35
44
|
lib/statsample/reliability.rb
|
36
45
|
lib/statsample/resample.rb
|
37
46
|
lib/statsample/srs.rb
|
38
47
|
lib/statsample/test.rb
|
39
48
|
lib/statsample/vector.rb
|
40
|
-
lib/spss.rb
|
41
49
|
test/_test_chart.rb
|
42
50
|
test/test_anova.rb
|
43
51
|
test/test_codification.rb
|
@@ -50,7 +58,10 @@ test/test_multiset.rb
|
|
50
58
|
test/test_regression.rb
|
51
59
|
test/test_reliability.rb
|
52
60
|
test/test_resample.rb
|
61
|
+
test/test_srs.rb
|
53
62
|
test/test_statistics.rb
|
54
63
|
test/test_stratified.rb
|
55
64
|
test/test_svg_graph.rb
|
56
65
|
test/test_vector.rb
|
66
|
+
test/test_xls.rb
|
67
|
+
test/test_xls.xls
|
data/demo/benchmark.rb
CHANGED
data/demo/crosstab.rb
ADDED
data/demo/nunnally_6.rb
ADDED
@@ -0,0 +1,34 @@
|
|
1
|
+
require File.dirname(__FILE__)+'/../lib/statsample'
|
2
|
+
|
3
|
+
x1=[7,12,15,10,19,13,10,12,15,14].to_vector(:scale)
|
4
|
+
x2=[9,6,8,8,9,8,6,8,10,9].to_vector(:scale)
|
5
|
+
x3=[7,15,13,9,12,12,13,11,9,10].to_vector(:scale)
|
6
|
+
|
7
|
+
puts Statsample::Bivariate.pearson(x1,x2)
|
8
|
+
puts Statsample::Bivariate.pearson(x2,x3)
|
9
|
+
puts Statsample::Bivariate.pearson(x1,x3)
|
10
|
+
|
11
|
+
puts "Residual x1.x3"
|
12
|
+
res1=Statsample::Bivariate.residuals(x1,x3)
|
13
|
+
puts res1
|
14
|
+
puts "Residual x2.x3"
|
15
|
+
res2=Statsample::Bivariate.residuals(x2,x3)
|
16
|
+
puts res2
|
17
|
+
|
18
|
+
puts "Residual x1.x2"
|
19
|
+
res3=Statsample::Bivariate.residuals(x1,x2)
|
20
|
+
puts res3
|
21
|
+
puts "Residual x3.x2"
|
22
|
+
res4=Statsample::Bivariate.residuals(x3,x2)
|
23
|
+
puts res4
|
24
|
+
|
25
|
+
puts "Partial correlation de 1 y 2, controlando 3"
|
26
|
+
puts Statsample::Bivariate.pearson(res1,res2)
|
27
|
+
puts Statsample::Bivariate.partial_correlation(x1,x2,x3)
|
28
|
+
|
29
|
+
puts "Partial correlation de 1 y 3, controlando 2"
|
30
|
+
puts Statsample::Bivariate.pearson(res3,res4)
|
31
|
+
puts Statsample::Bivariate.partial_correlation(x1,x3,x2)
|
32
|
+
|
33
|
+
puts "Partial correlation de 2 y 3, controlando 1"
|
34
|
+
puts Statsample::Bivariate.partial_correlation(x2,x3,x1)
|
data/demo/proportion.rb
CHANGED
@@ -7,7 +7,7 @@ tests=3000
|
|
7
7
|
sample_size=100
|
8
8
|
# rand a 50%
|
9
9
|
poblacion=([1]*500+[0]*500).to_vector(:scale)
|
10
|
-
prop=poblacion.proportion(1
|
10
|
+
prop=poblacion.proportion(1)
|
11
11
|
puts "Estadísticos"
|
12
12
|
puts "DE con reemplazo:"+Statsample::SRS.proportion_sd_kp_wr(prop, sample_size).to_s
|
13
13
|
puts "DE sin reemplazo:"+Statsample::SRS.proportion_sd_kp_wor(prop, sample_size,poblacion.size).to_s
|
data/demo/regression.rb
ADDED
@@ -0,0 +1,46 @@
|
|
1
|
+
require File.dirname(__FILE__)+'/../lib/statsample'
|
2
|
+
tests=300
|
3
|
+
include Statsample
|
4
|
+
r = GSL::Rng.alloc(GSL::Rng::TAUS,Time.now.to_i)
|
5
|
+
ds=Dataset.new(%w{a b c d y})
|
6
|
+
ds['a'].type=:scale
|
7
|
+
ds['b'].type=:scale
|
8
|
+
ds['c'].type=:scale
|
9
|
+
ds['d'].type=:scale
|
10
|
+
ds['y'].type=:scale
|
11
|
+
|
12
|
+
tests.times {
|
13
|
+
a=r.ugaussian
|
14
|
+
b=r.ugaussian
|
15
|
+
c=r.ugaussian
|
16
|
+
d=r.ugaussian
|
17
|
+
y=a*70+b*30+c*5+r.ugaussian*5
|
18
|
+
ds.add_case_array([a,b,c,d,y])
|
19
|
+
}
|
20
|
+
ds.update_valid_data
|
21
|
+
|
22
|
+
if !File.exists? "regression.dab"
|
23
|
+
da=DominanceAnalysis::Bootstrap.new(ds,"y")
|
24
|
+
else
|
25
|
+
da=Statsample.load("regression.dab")
|
26
|
+
end
|
27
|
+
|
28
|
+
da.lr_class=Regression::Multiple::AlglibEngine
|
29
|
+
da.bootstrap(20)
|
30
|
+
|
31
|
+
puts da.summary
|
32
|
+
da.save("regression.dab")
|
33
|
+
|
34
|
+
lr=Regression::Multiple.listwise(ds,"y")
|
35
|
+
|
36
|
+
hr=HtmlReport.new("Regression")
|
37
|
+
hr.add_summary("Regression",lr.summary(HtmlSummary))
|
38
|
+
hr.add_summary("Analisis de Dominancia ", da.da.summary(HtmlSummary))
|
39
|
+
|
40
|
+
hr.add_summary("Analisis de Dominancia (Bootstrap)", da.summary(HtmlSummary))
|
41
|
+
|
42
|
+
da.fields.each{|f|
|
43
|
+
hr.add_histogram("General Dominance #{f}",da.samples_ga[f].to_vector(:scale))
|
44
|
+
}
|
45
|
+
hr.save("Regression Dominance.html")
|
46
|
+
|
data/demo/t-student.rb
ADDED
@@ -0,0 +1,17 @@
|
|
1
|
+
require File.dirname(__FILE__)+"/../lib/statsample"
|
2
|
+
|
3
|
+
|
4
|
+
tests=3000
|
5
|
+
|
6
|
+
r = GSL::Rng.alloc(GSL::Rng::TAUS, 1)
|
7
|
+
sample_sizes=[5,10,20,30]
|
8
|
+
sample_sizes.each{|sample_size|
|
9
|
+
monte=Statsample::Resample.repeat_and_save(tests) {
|
10
|
+
v=[]
|
11
|
+
sample_size.times{|i|
|
12
|
+
v.push(r.ugaussian)
|
13
|
+
}
|
14
|
+
v.to_vector(:scale).mean
|
15
|
+
|
16
|
+
}
|
17
|
+
}
|
data/lib/statsample.rb
CHANGED
@@ -58,7 +58,7 @@ end
|
|
58
58
|
# :startdoc:
|
59
59
|
#
|
60
60
|
module Statsample
|
61
|
-
VERSION = '0.3.
|
61
|
+
VERSION = '0.3.1'
|
62
62
|
SPLIT_TOKEN = ","
|
63
63
|
autoload(:Database, 'statsample/converters')
|
64
64
|
autoload(:Anova, 'statsample/anova')
|
@@ -74,7 +74,6 @@ module Statsample
|
|
74
74
|
autoload(:Reliability, 'statsample/reliability')
|
75
75
|
autoload(:Bivariate, 'statsample/bivariate')
|
76
76
|
autoload(:Multivariate, 'statsample/multivariate')
|
77
|
-
|
78
77
|
autoload(:Regression, 'statsample/regression')
|
79
78
|
autoload(:Test, 'statsample/test')
|
80
79
|
def self.load(filename)
|
@@ -134,10 +133,10 @@ module Statsample
|
|
134
133
|
end
|
135
134
|
class ReportTable
|
136
135
|
attr_reader :header
|
137
|
-
def initialize(
|
138
|
-
@header=header
|
136
|
+
def initialize(h=[])
|
139
137
|
@rows=[]
|
140
138
|
@max_cols=[]
|
139
|
+
self.header=(h)
|
141
140
|
end
|
142
141
|
def add_row(row)
|
143
142
|
row.each_index{|i|
|
data/lib/statsample/crosstab.rb
CHANGED
@@ -62,7 +62,7 @@ module Statsample
|
|
62
62
|
# Chi square, based on expected and real matrix
|
63
63
|
def chi_square
|
64
64
|
require 'statsample/test'
|
65
|
-
Statsample::Test.chi_square(self.to_matrix,matrix_expected)
|
65
|
+
Statsample::Test.chi_square(self.to_matrix, matrix_expected)
|
66
66
|
end
|
67
67
|
# Useful to obtain chi square
|
68
68
|
def matrix_expected
|
@@ -78,6 +78,39 @@ module Statsample
|
|
78
78
|
}
|
79
79
|
Matrix.rows(m)
|
80
80
|
end
|
81
|
+
def summary(report_type=ConsoleSummary)
|
82
|
+
out=""
|
83
|
+
out.extend report_type
|
84
|
+
fq=frequencies
|
85
|
+
rn=rows_names
|
86
|
+
cn=cols_names
|
87
|
+
total=0
|
88
|
+
total_cols=cn.inject({}) {|a,x| a[x]=0;a}
|
89
|
+
out.add "Chi Square: #{chi_square}"
|
90
|
+
t=Statsample::ReportTable.new([""]+cols_names+["Total"])
|
91
|
+
rn.each{|row|
|
92
|
+
total_row=0
|
93
|
+
t_row=[@v_rows.labeling(row)]
|
94
|
+
cn.each{|col|
|
95
|
+
data=fq[[row,col]]
|
96
|
+
total_row+=fq[[row,col]]
|
97
|
+
total+=fq[[row,col]]
|
98
|
+
total_cols[col]+=fq[[row,col]]
|
99
|
+
t_row.push(data)
|
100
|
+
}
|
101
|
+
t_row.push(total_row)
|
102
|
+
t.add_row(t_row)
|
103
|
+
}
|
104
|
+
t.add_horizontal_line
|
105
|
+
t_row=["Total"]
|
106
|
+
cn.each{|v|
|
107
|
+
t_row.push(total_cols[v])
|
108
|
+
}
|
109
|
+
t_row.push(total)
|
110
|
+
t.add_row(t_row)
|
111
|
+
out.parse_table(t)
|
112
|
+
out
|
113
|
+
end
|
81
114
|
def to_s
|
82
115
|
fq=frequencies
|
83
116
|
rn=rows_names
|
data/lib/statsample/dominanceanalysis.rb
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
require 'statsample/dominanceanalysis/bootstrap'
|
2
2
|
module Statsample
|
3
3
|
class DominanceAnalysis
|
4
|
-
def initialize(ds,y_var, r_class = Regression::
|
4
|
+
def initialize(ds,y_var, r_class = Regression::Multiple::RubyEngine)
|
5
5
|
@y_var=y_var
|
6
6
|
@dy=ds[@y_var]
|
7
7
|
@ds=ds
|
@@ -220,7 +220,7 @@ module Statsample
|
|
220
220
|
@name=name
|
221
221
|
@fields=fields
|
222
222
|
@contributions=@fields.inject({}){|a,v| a[v]=nil;a}
|
223
|
-
r_class=Regression::
|
223
|
+
r_class=Regression::Multiple::RubyEngine if r_class.nil?
|
224
224
|
@lr=r_class.new(ds,y_var)
|
225
225
|
end
|
226
226
|
def add_contribution(f,v)
|
data/lib/statsample/dominanceanalysis/bootstrap.rb
CHANGED
@@ -11,7 +11,7 @@ class DominanceAnalysis
|
|
11
11
|
@fields=ds.fields-[y_var]
|
12
12
|
@samples_ga=@fields.inject({}){|a,v| a[v]=[];a}
|
13
13
|
@n_samples=0
|
14
|
-
@lr_class=Regression::
|
14
|
+
@lr_class=Regression::Multiple::RubyEngine
|
15
15
|
create_samples_pairs
|
16
16
|
end
|
17
17
|
def lr_class=(lr)
|
@@ -68,6 +68,7 @@ class DominanceAnalysis
|
|
68
68
|
out.add "Summary for Bootstrap Dominance Analysis of "+@fields.join(", ")+" over "+@y_var+"\n"
|
69
69
|
out.add "Size of sample: #{@n_samples}\n"
|
70
70
|
out.add "t:#{t}\n"
|
71
|
+
out.add "Linear Regression Engine: #{@lr_class.name}"
|
71
72
|
out.nl
|
72
73
|
table=ReportTable.new
|
73
74
|
header=["pairs","sD","Dij","SE(Dij)","Pij","Pji","Pno","Reprod"]
|
data/lib/statsample/regression.rb
CHANGED
@@ -1,522 +1,10 @@
|
|
1
|
+
require 'statsample/regression/simple'
|
2
|
+
require 'statsample/regression/multiple'
|
3
|
+
require 'statsample/regression/multiple/alglibengine'
|
4
|
+
require 'statsample/regression/multiple/rubyengine'
|
5
|
+
|
1
6
|
module Statsample
|
2
|
-
#
|
7
|
+
# Module for regression procedures
|
3
8
|
module Regression
|
4
|
-
# Class for calculation of linear regressions
|
5
|
-
# To create a SimpleRegression object:
|
6
|
-
# * <tt> SimpleRegression.new_from_vectors(vx,vy)</tt>
|
7
|
-
# * <tt> SimpleRegression.new_from_gsl(gsl) </tt>
|
8
|
-
#
|
9
|
-
class SimpleRegression
|
10
|
-
attr_accessor :a,:b,:cov00, :cov01, :covx1, :chisq, :status
|
11
|
-
private_class_method :new
|
12
|
-
def initialize(init_method, *argv)
|
13
|
-
self.send(init_method, *argv)
|
14
|
-
end
|
15
|
-
def y(val_x)
|
16
|
-
@a+@b*val_x
|
17
|
-
end
|
18
|
-
def x(val_y)
|
19
|
-
(val_y-@a) / @b.to_f
|
20
|
-
end
|
21
|
-
# Sum of square error
|
22
|
-
def sse
|
23
|
-
(0...@vx.size).inject(0) {|acum,i|
|
24
|
-
acum+((@vy[i]-y(@vx[i]))**2)
|
25
|
-
}
|
26
|
-
end
|
27
|
-
def standard_error
|
28
|
-
Math::sqrt(sse / (@vx.size-2).to_f)
|
29
|
-
end
|
30
|
-
# Sum of square regression
|
31
|
-
def ssr
|
32
|
-
vy_mean=@vy.mean
|
33
|
-
(0...@vx.size).inject(0) {|a,i|
|
34
|
-
a+((y(@vx[i])-vy_mean)**2)
|
35
|
-
}
|
36
|
-
|
37
|
-
end
|
38
|
-
# Sum of square total
|
39
|
-
def sst
|
40
|
-
@vy.sum_of_squared_deviation
|
41
|
-
end
|
42
|
-
# Value of r
|
43
|
-
def r
|
44
|
-
@b * (@vx.sds / @vy.sds)
|
45
|
-
end
|
46
|
-
# Value of r^2
|
47
|
-
def r2
|
48
|
-
r**2
|
49
|
-
end
|
50
|
-
class << self
|
51
|
-
def new_from_gsl(ar)
|
52
|
-
new(:init_gsl, *ar)
|
53
|
-
end
|
54
|
-
def new_from_vectors(vx,vy)
|
55
|
-
new(:init_vectors,vx,vy)
|
56
|
-
end
|
57
|
-
end
|
58
|
-
def init_vectors(vx,vy)
|
59
|
-
@vx,@vy=Statsample.only_valid(vx,vy)
|
60
|
-
x_m=@vx.mean
|
61
|
-
y_m=@vy.mean
|
62
|
-
num=den=0
|
63
|
-
(0...@vx.size).each {|i|
|
64
|
-
num+=(@vx[i]-x_m)*(@vy[i]-y_m)
|
65
|
-
den+=(@vx[i]-x_m)**2
|
66
|
-
}
|
67
|
-
@b=num.to_f/den
|
68
|
-
@a=y_m - @b*x_m
|
69
|
-
end
|
70
|
-
def init_gsl(a,b,cov00, cov01, covx1, chisq, status)
|
71
|
-
@a=a
|
72
|
-
@b=b
|
73
|
-
@cov00=cov00
|
74
|
-
@cov01=cov01
|
75
|
-
@covx1=covx1
|
76
|
-
@chisq=chisq
|
77
|
-
@status=status
|
78
|
-
end
|
79
|
-
end
|
80
|
-
|
81
|
-
|
82
|
-
class MultipleRegressionBase
|
83
|
-
def initialize(ds,y_var)
|
84
|
-
@ds=ds
|
85
|
-
@y_var=y_var
|
86
|
-
@r2=nil
|
87
|
-
|
88
|
-
end
|
89
|
-
def assign_names(c)
|
90
|
-
a={}
|
91
|
-
@fields.each_index {|i|
|
92
|
-
a[@fields[i]]=c[i]
|
93
|
-
}
|
94
|
-
a
|
95
|
-
end
|
96
|
-
def predicted
|
97
|
-
(0...@ds.cases).collect { |i|
|
98
|
-
invalid=false
|
99
|
-
vect=@dep_columns.collect {|v| invalid=true if v[i].nil?; v[i]}
|
100
|
-
if invalid
|
101
|
-
nil
|
102
|
-
else
|
103
|
-
process(vect)
|
104
|
-
end
|
105
|
-
}.to_vector(:scale)
|
106
|
-
end
|
107
|
-
def standarized_predicted
|
108
|
-
predicted.standarized
|
109
|
-
end
|
110
|
-
def residuals
|
111
|
-
(0...@ds.cases).collect{|i|
|
112
|
-
invalid=false
|
113
|
-
vect=@dep_columns.collect{|v| invalid=true if v[i].nil?; v[i]}
|
114
|
-
if invalid or @ds[@y_var][i].nil?
|
115
|
-
nil
|
116
|
-
else
|
117
|
-
@ds[@y_var][i] - process(vect)
|
118
|
-
end
|
119
|
-
}.to_vector(:scale)
|
120
|
-
end
|
121
|
-
def r
|
122
|
-
raise "You should implement this"
|
123
|
-
end
|
124
|
-
def sst
|
125
|
-
raise "You should implement this"
|
126
|
-
end
|
127
|
-
def ssr
|
128
|
-
r2*sst
|
129
|
-
end
|
130
|
-
def sse
|
131
|
-
sst - ssr
|
132
|
-
end
|
133
|
-
|
134
|
-
def coeffs_t
|
135
|
-
out={}
|
136
|
-
se=coeffs_se
|
137
|
-
coeffs.each{|k,v|
|
138
|
-
out[k]=v / se[k]
|
139
|
-
}
|
140
|
-
out
|
141
|
-
end
|
142
|
-
|
143
|
-
def mse
|
144
|
-
sse/df_e
|
145
|
-
end
|
146
|
-
|
147
|
-
def df_r
|
148
|
-
@dep_columns.size
|
149
|
-
end
|
150
|
-
def df_e
|
151
|
-
@ds_valid.cases-@dep_columns.size-1
|
152
|
-
end
|
153
|
-
def f
|
154
|
-
(ssr.quo(df_r)).quo(sse.quo(df_e))
|
155
|
-
end
|
156
|
-
# Significance of Fisher
|
157
|
-
def significance
|
158
|
-
if HAS_GSL
|
159
|
-
GSL::Cdf.fdist_Q(f,df_r,df_e)
|
160
|
-
else
|
161
|
-
raise "Need Ruby/GSL"
|
162
|
-
end
|
163
|
-
end
|
164
|
-
# Tolerance for a given variable
|
165
|
-
# http://talkstats.com/showthread.php?t=5056
|
166
|
-
def tolerance(var)
|
167
|
-
ds=assign_names(@dep_columns)
|
168
|
-
ds.each{|k,v|
|
169
|
-
ds[k]=v.to_vector(:scale)
|
170
|
-
}
|
171
|
-
if HAS_ALGIB
|
172
|
-
lr_class=::Statsample::Regression::MultipleRegressionAlglib
|
173
|
-
ds=ds.to_dataset
|
174
|
-
else
|
175
|
-
lr_class=MultipleRegressionPairwise
|
176
|
-
ds=ds.to_dataset.dup_only_valid
|
177
|
-
end
|
178
|
-
lr=lr_class.new(ds,var)
|
179
|
-
1-lr.r2
|
180
|
-
end
|
181
|
-
def coeffs_tolerances
|
182
|
-
@fields.inject({}) {|a,f|
|
183
|
-
a[f]=tolerance(f);
|
184
|
-
a
|
185
|
-
}
|
186
|
-
end
|
187
|
-
def coeffs_se
|
188
|
-
out={}
|
189
|
-
mse=sse.quo(df_e)
|
190
|
-
coeffs.each {|k,v|
|
191
|
-
out[k]=Math::sqrt(mse/(@ds[k].sum_of_squares*tolerance(k)))
|
192
|
-
}
|
193
|
-
out
|
194
|
-
end
|
195
|
-
def estimated_variance_covariance_matrix
|
196
|
-
mse_p=mse
|
197
|
-
columns=[]
|
198
|
-
@ds_valid.each_vector{|k,v|
|
199
|
-
columns.push(v.data) unless k==@y_var
|
200
|
-
}
|
201
|
-
columns.unshift([1.0]*@ds_valid.cases)
|
202
|
-
x=Matrix.columns(columns)
|
203
|
-
matrix=((x.t*x)).inverse * mse
|
204
|
-
matrix.collect {|i|
|
205
|
-
|
206
|
-
Math::sqrt(i) if i>0
|
207
|
-
}
|
208
|
-
end
|
209
|
-
def constant_t
|
210
|
-
constant.to_f/constant_se
|
211
|
-
end
|
212
|
-
def constant_se
|
213
|
-
estimated_variance_covariance_matrix[0,0]
|
214
|
-
end
|
215
|
-
def summary(report_type=ConsoleSummary)
|
216
|
-
c=coeffs
|
217
|
-
out=""
|
218
|
-
out.extend report_type
|
219
|
-
out.add <<HEREDOC
|
220
|
-
Summary for regression of #{@fields.join(',')} over #{@y_var}"
|
221
|
-
*************************************************************
|
222
|
-
Cases(listwise)=#{@ds.cases}(#{@ds_valid.cases})
|
223
|
-
r=#{sprintf("%0.3f",r)}
|
224
|
-
r2=#{sprintf("%0.3f",r2)}
|
225
|
-
ssr=#{sprintf("%0.3f",ssr)}
|
226
|
-
sse=#{sprintf("%0.3f",sse)}
|
227
|
-
sst=#{sprintf("%0.3f",sst)}
|
228
|
-
F#{sprintf("(%d,%d)=%0.3f, p=%0.3f",df_r,df_e,f,significance)}
|
229
|
-
Equation=#{sprintf("%0.3f",constant)}+#{@fields.collect {|k| sprintf("%0.3f%s",c[k],k)}.join(' + ')}
|
230
|
-
|
231
|
-
HEREDOC
|
232
|
-
|
233
|
-
end
|
234
|
-
|
235
|
-
|
236
|
-
# Deprecated
|
237
|
-
# Sum of squares of error (manual calculation)
|
238
|
-
# using the predicted value minus the y_i value
|
239
|
-
def sse_manual
|
240
|
-
pr=predicted
|
241
|
-
cases=0
|
242
|
-
sse=(0...@ds.cases).inject(0) {|a,i|
|
243
|
-
if !@dy.data_with_nils[i].nil? and !pr[i].nil?
|
244
|
-
cases+=1
|
245
|
-
a+((pr[i]-@dy[i])**2)
|
246
|
-
else
|
247
|
-
a
|
248
|
-
end
|
249
|
-
}
|
250
|
-
sse*(min_n_valid-1.0).quo(cases-1)
|
251
|
-
end
|
252
|
-
# Sum of squares of regression
|
253
|
-
# using the predicted value minus y mean
|
254
|
-
def ssr_direct
|
255
|
-
mean=@dy.mean
|
256
|
-
cases=0
|
257
|
-
ssr=(0...@ds.cases).inject(0) {|a,i|
|
258
|
-
invalid=false
|
259
|
-
v=@dep_columns.collect{|c| invalid=true if c[i].nil?; c[i]}
|
260
|
-
if !invalid
|
261
|
-
cases+=1
|
262
|
-
a+((process(v)-mean)**2)
|
263
|
-
else
|
264
|
-
a
|
265
|
-
end
|
266
|
-
}
|
267
|
-
ssr
|
268
|
-
end
|
269
|
-
def sse_direct
|
270
|
-
sst-ssr
|
271
|
-
end
|
272
|
-
|
273
|
-
|
274
|
-
end
|
275
|
-
|
276
|
-
|
277
|
-
|
278
|
-
|
279
|
-
|
280
|
-
if HAS_ALGIB
|
281
|
-
# Class for calculation of multiple regression.
|
282
|
-
# Requires Alglib gem.
|
283
|
-
# To create a SimpleRegression object:
|
284
|
-
# @a=[1,3,2,4,3,5,4,6,5,7].to_vector(:scale)
|
285
|
-
# @b=[3,3,4,4,5,5,6,6,4,4].to_vector(:scale)
|
286
|
-
# @c=[11,22,30,40,50,65,78,79,99,100].to_vector(:scale)
|
287
|
-
# @y=[3,4,5,6,7,8,9,10,20,30].to_vector(:scale)
|
288
|
-
# ds={'a'=>@a,'b'=>@b,'c'=>@c,'y'=>@y}.to_dataset
|
289
|
-
# lr=Statsample::Regression::MultipleRegression.new(ds,'y')
|
290
|
-
#
|
291
|
-
class MultipleRegressionAlglib < MultipleRegressionBase
|
292
|
-
def initialize(ds,y_var)
|
293
|
-
@ds=ds.dup_only_valid
|
294
|
-
@ds_valid=@ds
|
295
|
-
@y_var=y_var
|
296
|
-
@dy=@ds[@y_var]
|
297
|
-
@ds_indep=ds.dup(ds.fields-[y_var])
|
298
|
-
# Create a custom matrix
|
299
|
-
columns=[]
|
300
|
-
@fields=[]
|
301
|
-
@ds.fields.each{|f|
|
302
|
-
if f!=@y_var
|
303
|
-
columns.push(@ds[f].to_a)
|
304
|
-
@fields.push(f)
|
305
|
-
end
|
306
|
-
}
|
307
|
-
@dep_columns=columns.dup
|
308
|
-
columns.push(@ds[@y_var])
|
309
|
-
matrix=Matrix.columns(columns)
|
310
|
-
@lr_s=nil
|
311
|
-
@lr=::Alglib::LinearRegression.build_from_matrix(matrix)
|
312
|
-
end
|
313
|
-
|
314
|
-
def _dump(i)
|
315
|
-
Marshal.dump({'ds'=>@ds,'y_var'=>@y_var})
|
316
|
-
end
|
317
|
-
def self._load(data)
|
318
|
-
h=Marshal.load(data)
|
319
|
-
MultipleRegression.new(h['ds'], h['y_var'])
|
320
|
-
end
|
321
|
-
|
322
|
-
def coeffs
|
323
|
-
assign_names(@lr.coeffs)
|
324
|
-
end
|
325
|
-
# Coefficients using a constant
|
326
|
-
# Based on http://www.xycoon.com/ols1.htm
|
327
|
-
def matrix_resolution
|
328
|
-
mse_p=mse
|
329
|
-
columns=@dep_columns.dup.map {|xi| xi.map{|i| i.to_f}}
|
330
|
-
columns.unshift([1.0]*@ds.cases)
|
331
|
-
y=Matrix.columns([@dy.data.map {|i| i.to_f}])
|
332
|
-
x=Matrix.columns(columns)
|
333
|
-
xt=x.t
|
334
|
-
matrix=((xt*x)).inverse*xt
|
335
|
-
matrix*y
|
336
|
-
end
|
337
|
-
def r2
|
338
|
-
r**2
|
339
|
-
end
|
340
|
-
def r
|
341
|
-
Bivariate::pearson(@dy,predicted)
|
342
|
-
end
|
343
|
-
def sst
|
344
|
-
@dy.ss
|
345
|
-
end
|
346
|
-
def constant
|
347
|
-
@lr.constant
|
348
|
-
end
|
349
|
-
def standarized_coeffs
|
350
|
-
l=lr_s
|
351
|
-
assign_names(l.coeffs)
|
352
|
-
end
|
353
|
-
def lr_s
|
354
|
-
if @lr_s.nil?
|
355
|
-
build_standarized
|
356
|
-
end
|
357
|
-
@lr_s
|
358
|
-
end
|
359
|
-
def build_standarized
|
360
|
-
@ds_s=@ds.standarize
|
361
|
-
columns=[]
|
362
|
-
@ds_s.fields.each{|f|
|
363
|
-
columns.push(@ds_s[f].to_a) unless f==@y_var
|
364
|
-
}
|
365
|
-
@dep_columns_s=columns.dup
|
366
|
-
columns.push(@ds_s[@y_var])
|
367
|
-
matrix=Matrix.columns(columns)
|
368
|
-
@lr_s=Alglib::LinearRegression.build_from_matrix(matrix)
|
369
|
-
end
|
370
|
-
def process(v)
|
371
|
-
@lr.process(v)
|
372
|
-
end
|
373
|
-
def process_s(v)
|
374
|
-
lr_s.process(v)
|
375
|
-
end
|
376
|
-
# ???? Not equal to SPSS output
|
377
|
-
def standarized_residuals
|
378
|
-
res=residuals
|
379
|
-
red_sd=residuals.sds
|
380
|
-
res.collect {|v|
|
381
|
-
v.quo(red_sd)
|
382
|
-
}.to_vector(:scale)
|
383
|
-
end
|
384
|
-
end
|
385
|
-
end
|
386
|
-
|
387
|
-
|
388
|
-
|
389
|
-
|
390
|
-
|
391
|
-
|
392
|
-
|
393
|
-
|
394
|
-
|
395
|
-
|
396
|
-
|
397
|
-
|
398
|
-
class MultipleRegressionPairwise < MultipleRegressionBase
|
399
|
-
def initialize(ds,y_var)
|
400
|
-
super
|
401
|
-
@dy=ds[@y_var]
|
402
|
-
@ds_valid=ds.dup_only_valid
|
403
|
-
@ds_indep=ds.dup(ds.fields-[y_var])
|
404
|
-
@fields=@ds_indep.fields
|
405
|
-
set_dep_columns
|
406
|
-
obtain_y_vector
|
407
|
-
@matrix_x = Bivariate.correlation_matrix(@ds_indep)
|
408
|
-
@coeffs_stan=(@matrix_x.inverse * @matrix_y).column(0).to_a
|
409
|
-
@min_n_valid=nil
|
410
|
-
end
|
411
|
-
def min_n_valid
|
412
|
-
if @min_n_valid.nil?
|
413
|
-
min=@ds.cases
|
414
|
-
m=Bivariate::n_valid_matrix(@ds)
|
415
|
-
for x in 0...m.row_size
|
416
|
-
for y in 0...m.column_size
|
417
|
-
min=m[x,y] if m[x,y] < min
|
418
|
-
end
|
419
|
-
end
|
420
|
-
@min_n_valid=min
|
421
|
-
end
|
422
|
-
@min_n_valid
|
423
|
-
end
|
424
|
-
def set_dep_columns
|
425
|
-
@dep_columns=[]
|
426
|
-
@ds_indep.each_vector{|k,v|
|
427
|
-
@dep_columns.push(v.data_with_nils)
|
428
|
-
}
|
429
|
-
end
|
430
|
-
# Sum of square total
|
431
|
-
def sst
|
432
|
-
#if @sst.nil?
|
433
|
-
@sst=@dy.variance*(min_n_valid-1.0)
|
434
|
-
#end
|
435
|
-
@sst
|
436
|
-
end
|
437
|
-
def r2
|
438
|
-
if @r2.nil?
|
439
|
-
c=@matrix_y
|
440
|
-
rxx=obtain_predictor_matrix
|
441
|
-
matrix=(c.t*rxx.inverse*c)
|
442
|
-
@r2=matrix[0,0]
|
443
|
-
end
|
444
|
-
@r2
|
445
|
-
end
|
446
|
-
def r
|
447
|
-
Math::sqrt(r2)
|
448
|
-
end
|
449
|
-
|
450
|
-
def df_e
|
451
|
-
min_n_valid-@dep_columns.size-1
|
452
|
-
end
|
453
|
-
def fix_with_mean
|
454
|
-
i=0
|
455
|
-
@ds_indep.each{|row|
|
456
|
-
empty=[]
|
457
|
-
row.each{|k,v|
|
458
|
-
empty.push(k) if v.nil?
|
459
|
-
}
|
460
|
-
if empty.size==1
|
461
|
-
@ds_indep[empty[0]][i]=@ds[empty[0]].mean
|
462
|
-
end
|
463
|
-
i+=1
|
464
|
-
}
|
465
|
-
@ds_indep.update_valid_data
|
466
|
-
set_dep_columns
|
467
|
-
end
|
468
|
-
def fix_with_regression
|
469
|
-
i=0
|
470
|
-
@ds_indep.each{|row|
|
471
|
-
empty=[]
|
472
|
-
row.each{|k,v|
|
473
|
-
empty.push(k) if v.nil?
|
474
|
-
}
|
475
|
-
if empty.size==1
|
476
|
-
field=empty[0]
|
477
|
-
lr=MultipleRegression.new(@ds_indep,field)
|
478
|
-
fields=[]
|
479
|
-
@ds_indep.fields.each{|f|
|
480
|
-
fields.push(row[f]) unless f==field
|
481
|
-
}
|
482
|
-
@ds_indep[field][i]=lr.process(fields)
|
483
|
-
end
|
484
|
-
i+=1
|
485
|
-
}
|
486
|
-
@ds_indep.update_valid_data
|
487
|
-
set_dep_columns
|
488
|
-
end
|
489
|
-
def obtain_y_vector
|
490
|
-
@matrix_y=Matrix.columns([@ds_indep.fields.collect{|f|
|
491
|
-
Bivariate.pearson(@dy, @ds_indep[f])
|
492
|
-
}])
|
493
|
-
end
|
494
|
-
def obtain_predictor_matrix
|
495
|
-
Bivariate::correlation_matrix(@ds_indep)
|
496
|
-
end
|
497
|
-
def constant
|
498
|
-
c=coeffs
|
499
|
-
@dy.mean-@fields.inject(0){|a,k| a+(c[k] * @ds_indep[k].mean)}
|
500
|
-
end
|
501
|
-
def process(v)
|
502
|
-
c=coeffs
|
503
|
-
total=constant
|
504
|
-
@fields.each_index{|i|
|
505
|
-
total+=c[@fields[i]]*v[i]
|
506
|
-
}
|
507
|
-
total
|
508
|
-
end
|
509
|
-
def coeffs
|
510
|
-
sc=standarized_coeffs
|
511
|
-
assign_names(@fields.collect{|f|
|
512
|
-
(sc[f]*@dy.sds).quo(@ds_indep[f].sds)
|
513
|
-
})
|
514
|
-
end
|
515
|
-
def standarized_coeffs
|
516
|
-
assign_names(@coeffs_stan)
|
517
|
-
end
|
518
|
-
end
|
519
|
-
|
520
|
-
|
521
9
|
end
|
522
10
|
end
|