statsample 0.3.0 → 0.3.1
Sign up to get free protection for your applications and to get access to all the features.
- data/History.txt +5 -0
- data/Manifest.txt +13 -2
- data/demo/benchmark.rb +1 -1
- data/demo/crosstab.rb +7 -0
- data/demo/nunnally_6.rb +34 -0
- data/demo/proportion.rb +1 -1
- data/demo/regression.rb +46 -0
- data/demo/t-student.rb +17 -0
- data/lib/statsample.rb +3 -4
- data/lib/statsample/crosstab.rb +34 -1
- data/lib/statsample/dominanceanalysis.rb +2 -2
- data/lib/statsample/dominanceanalysis/bootstrap.rb +2 -1
- data/lib/statsample/regression.rb +6 -518
- data/lib/statsample/regression/multiple.rb +259 -0
- data/lib/statsample/regression/multiple/alglibengine.rb +117 -0
- data/lib/statsample/regression/multiple/rubyengine.rb +140 -0
- data/lib/statsample/regression/simple.rb +81 -0
- data/test/test_regression.rb +5 -5
- data/test/test_statistics.rb +2 -12
- data/test/test_xls.xls +0 -0
- metadata +14 -3
data/History.txt
CHANGED
@@ -1,3 +1,8 @@
|
|
1
|
+
=== 0.3.1 / 2009-08-03
|
2
|
+
|
3
|
+
* Name and logic of Regression classes changed. Now, you have Regression::Simple class and Regression::Multiple module with two engines: RubyEngine and AlglibEngine
|
4
|
+
* New Crosstab#summary
|
5
|
+
|
1
6
|
=== 0.3.0 / 2009-08-02
|
2
7
|
|
3
8
|
* Statsample renamed to Statsample
|
data/Manifest.txt
CHANGED
@@ -5,15 +5,20 @@ Rakefile
|
|
5
5
|
bin/statsample
|
6
6
|
demo/benchmark.rb
|
7
7
|
demo/chi-square.rb
|
8
|
+
demo/crosstab.rb
|
8
9
|
demo/dice.rb
|
9
10
|
demo/distribution_t.rb
|
10
11
|
demo/graph.rb
|
11
12
|
demo/item_analysis.rb
|
12
13
|
demo/mean.rb
|
14
|
+
demo/nunnally_6.rb
|
13
15
|
demo/proportion.rb
|
16
|
+
demo/regression.rb
|
14
17
|
demo/sample_test.csv
|
15
18
|
demo/strata_proportion.rb
|
16
19
|
demo/stratum.rb
|
20
|
+
demo/t-student.rb
|
21
|
+
lib/spss.rb
|
17
22
|
lib/statsample.rb
|
18
23
|
lib/statsample/anova.rb
|
19
24
|
lib/statsample/bivariate.rb
|
@@ -25,19 +30,22 @@ lib/statsample/dataset.rb
|
|
25
30
|
lib/statsample/dominanceanalysis.rb
|
26
31
|
lib/statsample/dominanceanalysis/bootstrap.rb
|
27
32
|
lib/statsample/graph/gdchart.rb
|
28
|
-
lib/statsample/graph/svggraph.rb
|
29
33
|
lib/statsample/graph/svgboxplot.rb
|
34
|
+
lib/statsample/graph/svggraph.rb
|
30
35
|
lib/statsample/graph/svghistogram.rb
|
31
36
|
lib/statsample/graph/svgscatterplot.rb
|
32
37
|
lib/statsample/htmlreport.rb
|
33
38
|
lib/statsample/multiset.rb
|
34
39
|
lib/statsample/regression.rb
|
40
|
+
lib/statsample/regression/multiple.rb
|
41
|
+
lib/statsample/regression/multiple/alglibengine.rb
|
42
|
+
lib/statsample/regression/multiple/rubyengine.rb
|
43
|
+
lib/statsample/regression/simple.rb
|
35
44
|
lib/statsample/reliability.rb
|
36
45
|
lib/statsample/resample.rb
|
37
46
|
lib/statsample/srs.rb
|
38
47
|
lib/statsample/test.rb
|
39
48
|
lib/statsample/vector.rb
|
40
|
-
lib/spss.rb
|
41
49
|
test/_test_chart.rb
|
42
50
|
test/test_anova.rb
|
43
51
|
test/test_codification.rb
|
@@ -50,7 +58,10 @@ test/test_multiset.rb
|
|
50
58
|
test/test_regression.rb
|
51
59
|
test/test_reliability.rb
|
52
60
|
test/test_resample.rb
|
61
|
+
test/test_srs.rb
|
53
62
|
test/test_statistics.rb
|
54
63
|
test/test_stratified.rb
|
55
64
|
test/test_svg_graph.rb
|
56
65
|
test/test_vector.rb
|
66
|
+
test/test_xls.rb
|
67
|
+
test/test_xls.xls
|
data/demo/benchmark.rb
CHANGED
data/demo/crosstab.rb
ADDED
data/demo/nunnally_6.rb
ADDED
@@ -0,0 +1,34 @@
|
|
1
|
+
require File.dirname(__FILE__)+'/../lib/statsample'
|
2
|
+
|
3
|
+
x1=[7,12,15,10,19,13,10,12,15,14].to_vector(:scale)
|
4
|
+
x2=[9,6,8,8,9,8,6,8,10,9].to_vector(:scale)
|
5
|
+
x3=[7,15,13,9,12,12,13,11,9,10].to_vector(:scale)
|
6
|
+
|
7
|
+
puts Statsample::Bivariate.pearson(x1,x2)
|
8
|
+
puts Statsample::Bivariate.pearson(x2,x3)
|
9
|
+
puts Statsample::Bivariate.pearson(x1,x3)
|
10
|
+
|
11
|
+
puts "Residual x1.x3"
|
12
|
+
res1=Statsample::Bivariate.residuals(x1,x3)
|
13
|
+
puts res1
|
14
|
+
puts "Residual x2.x3"
|
15
|
+
res2=Statsample::Bivariate.residuals(x2,x3)
|
16
|
+
puts res2
|
17
|
+
|
18
|
+
puts "Residual x1.x2"
|
19
|
+
res3=Statsample::Bivariate.residuals(x1,x2)
|
20
|
+
puts res3
|
21
|
+
puts "Residual x3.x2"
|
22
|
+
res4=Statsample::Bivariate.residuals(x3,x2)
|
23
|
+
puts res4
|
24
|
+
|
25
|
+
puts "Partial correlation de 1 y 2, controlando 3"
|
26
|
+
puts Statsample::Bivariate.pearson(res1,res2)
|
27
|
+
puts Statsample::Bivariate.partial_correlation(x1,x2,x3)
|
28
|
+
|
29
|
+
puts "Partial correlation de 1 y 3, controlando 2"
|
30
|
+
puts Statsample::Bivariate.pearson(res3,res4)
|
31
|
+
puts Statsample::Bivariate.partial_correlation(x1,x3,x2)
|
32
|
+
|
33
|
+
puts "Partial correlation de 2 y 3, controlando 1"
|
34
|
+
puts Statsample::Bivariate.partial_correlation(x2,x3,x1)
|
data/demo/proportion.rb
CHANGED
@@ -7,7 +7,7 @@ tests=3000
|
|
7
7
|
sample_size=100
|
8
8
|
# rand a 50%
|
9
9
|
poblacion=([1]*500+[0]*500).to_vector(:scale)
|
10
|
-
prop=poblacion.proportion(1
|
10
|
+
prop=poblacion.proportion(1)
|
11
11
|
puts "Estadísticos"
|
12
12
|
puts "DE con reemplazo:"+Statsample::SRS.proportion_sd_kp_wr(prop, sample_size).to_s
|
13
13
|
puts "DE sin reemplazo:"+Statsample::SRS.proportion_sd_kp_wor(prop, sample_size,poblacion.size).to_s
|
data/demo/regression.rb
ADDED
@@ -0,0 +1,46 @@
|
|
1
|
+
require File.dirname(__FILE__)+'/../lib/statsample'
|
2
|
+
tests=300
|
3
|
+
include Statsample
|
4
|
+
r = GSL::Rng.alloc(GSL::Rng::TAUS,Time.now.to_i)
|
5
|
+
ds=Dataset.new(%w{a b c d y})
|
6
|
+
ds['a'].type=:scale
|
7
|
+
ds['b'].type=:scale
|
8
|
+
ds['c'].type=:scale
|
9
|
+
ds['d'].type=:scale
|
10
|
+
ds['y'].type=:scale
|
11
|
+
|
12
|
+
tests.times {
|
13
|
+
a=r.ugaussian
|
14
|
+
b=r.ugaussian
|
15
|
+
c=r.ugaussian
|
16
|
+
d=r.ugaussian
|
17
|
+
y=a*70+b*30+c*5+r.ugaussian*5
|
18
|
+
ds.add_case_array([a,b,c,d,y])
|
19
|
+
}
|
20
|
+
ds.update_valid_data
|
21
|
+
|
22
|
+
if !File.exists? "regression.dab"
|
23
|
+
da=DominanceAnalysis::Bootstrap.new(ds,"y")
|
24
|
+
else
|
25
|
+
da=Statsample.load("regression.dab")
|
26
|
+
end
|
27
|
+
|
28
|
+
da.lr_class=Regression::Multiple::AlglibEngine
|
29
|
+
da.bootstrap(20)
|
30
|
+
|
31
|
+
puts da.summary
|
32
|
+
da.save("regression.dab")
|
33
|
+
|
34
|
+
lr=Regression::Multiple.listwise(ds,"y")
|
35
|
+
|
36
|
+
hr=HtmlReport.new("Regression")
|
37
|
+
hr.add_summary("Regression",lr.summary(HtmlSummary))
|
38
|
+
hr.add_summary("Analisis de Dominancia ", da.da.summary(HtmlSummary))
|
39
|
+
|
40
|
+
hr.add_summary("Analisis de Dominancia (Bootstrap)", da.summary(HtmlSummary))
|
41
|
+
|
42
|
+
da.fields.each{|f|
|
43
|
+
hr.add_histogram("General Dominance #{f}",da.samples_ga[f].to_vector(:scale))
|
44
|
+
}
|
45
|
+
hr.save("Regression Dominance.html")
|
46
|
+
|
data/demo/t-student.rb
ADDED
@@ -0,0 +1,17 @@
|
|
1
|
+
require File.dirname(__FILE__)+"/../lib/statsample"
|
2
|
+
|
3
|
+
|
4
|
+
tests=3000
|
5
|
+
|
6
|
+
r = GSL::Rng.alloc(GSL::Rng::TAUS, 1)
|
7
|
+
sample_sizes=[5,10,20,30]
|
8
|
+
sample_sizes.each{|sample_size|
|
9
|
+
monte=Statsample::Resample.repeat_and_save(tests) {
|
10
|
+
v=[]
|
11
|
+
sample_size.times{|i|
|
12
|
+
v.push(r.ugaussian)
|
13
|
+
}
|
14
|
+
v.to_vector(:scale).mean
|
15
|
+
|
16
|
+
}
|
17
|
+
}
|
data/lib/statsample.rb
CHANGED
@@ -58,7 +58,7 @@ end
|
|
58
58
|
# :startdoc:
|
59
59
|
#
|
60
60
|
module Statsample
|
61
|
-
VERSION = '0.3.0'
|
61
|
+
VERSION = '0.3.1'
|
62
62
|
SPLIT_TOKEN = ","
|
63
63
|
autoload(:Database, 'statsample/converters')
|
64
64
|
autoload(:Anova, 'statsample/anova')
|
@@ -74,7 +74,6 @@ module Statsample
|
|
74
74
|
autoload(:Reliability, 'statsample/reliability')
|
75
75
|
autoload(:Bivariate, 'statsample/bivariate')
|
76
76
|
autoload(:Multivariate, 'statsample/multivariate')
|
77
|
-
|
78
77
|
autoload(:Regression, 'statsample/regression')
|
79
78
|
autoload(:Test, 'statsample/test')
|
80
79
|
def self.load(filename)
|
@@ -134,10 +133,10 @@ module Statsample
|
|
134
133
|
end
|
135
134
|
class ReportTable
|
136
135
|
attr_reader :header
|
137
|
-
def initialize(
|
138
|
-
@header=header
|
136
|
+
def initialize(h=[])
|
139
137
|
@rows=[]
|
140
138
|
@max_cols=[]
|
139
|
+
self.header=(h)
|
141
140
|
end
|
142
141
|
def add_row(row)
|
143
142
|
row.each_index{|i|
|
data/lib/statsample/crosstab.rb
CHANGED
@@ -62,7 +62,7 @@ module Statsample
|
|
62
62
|
# Chi square, based on expected and real matrix
|
63
63
|
def chi_square
|
64
64
|
require 'statsample/test'
|
65
|
-
Statsample::Test.chi_square(self.to_matrix,matrix_expected)
|
65
|
+
Statsample::Test.chi_square(self.to_matrix, matrix_expected)
|
66
66
|
end
|
67
67
|
# Useful to obtain chi square
|
68
68
|
def matrix_expected
|
@@ -78,6 +78,39 @@ module Statsample
|
|
78
78
|
}
|
79
79
|
Matrix.rows(m)
|
80
80
|
end
|
81
|
+
def summary(report_type=ConsoleSummary)
|
82
|
+
out=""
|
83
|
+
out.extend report_type
|
84
|
+
fq=frequencies
|
85
|
+
rn=rows_names
|
86
|
+
cn=cols_names
|
87
|
+
total=0
|
88
|
+
total_cols=cn.inject({}) {|a,x| a[x]=0;a}
|
89
|
+
out.add "Chi Square: #{chi_square}"
|
90
|
+
t=Statsample::ReportTable.new([""]+cols_names+["Total"])
|
91
|
+
rn.each{|row|
|
92
|
+
total_row=0
|
93
|
+
t_row=[@v_rows.labeling(row)]
|
94
|
+
cn.each{|col|
|
95
|
+
data=fq[[row,col]]
|
96
|
+
total_row+=fq[[row,col]]
|
97
|
+
total+=fq[[row,col]]
|
98
|
+
total_cols[col]+=fq[[row,col]]
|
99
|
+
t_row.push(data)
|
100
|
+
}
|
101
|
+
t_row.push(total_row)
|
102
|
+
t.add_row(t_row)
|
103
|
+
}
|
104
|
+
t.add_horizontal_line
|
105
|
+
t_row=["Total"]
|
106
|
+
cn.each{|v|
|
107
|
+
t_row.push(total_cols[v])
|
108
|
+
}
|
109
|
+
t_row.push(total)
|
110
|
+
t.add_row(t_row)
|
111
|
+
out.parse_table(t)
|
112
|
+
out
|
113
|
+
end
|
81
114
|
def to_s
|
82
115
|
fq=frequencies
|
83
116
|
rn=rows_names
|
@@ -1,7 +1,7 @@
|
|
1
1
|
require 'statsample/dominanceanalysis/bootstrap'
|
2
2
|
module Statsample
|
3
3
|
class DominanceAnalysis
|
4
|
-
def initialize(ds,y_var, r_class = Regression::
|
4
|
+
def initialize(ds,y_var, r_class = Regression::Multiple::RubyEngine)
|
5
5
|
@y_var=y_var
|
6
6
|
@dy=ds[@y_var]
|
7
7
|
@ds=ds
|
@@ -220,7 +220,7 @@ module Statsample
|
|
220
220
|
@name=name
|
221
221
|
@fields=fields
|
222
222
|
@contributions=@fields.inject({}){|a,v| a[v]=nil;a}
|
223
|
-
r_class=Regression::
|
223
|
+
r_class=Regression::Multiple::RubyEngine if r_class.nil?
|
224
224
|
@lr=r_class.new(ds,y_var)
|
225
225
|
end
|
226
226
|
def add_contribution(f,v)
|
@@ -11,7 +11,7 @@ class DominanceAnalysis
|
|
11
11
|
@fields=ds.fields-[y_var]
|
12
12
|
@samples_ga=@fields.inject({}){|a,v| a[v]=[];a}
|
13
13
|
@n_samples=0
|
14
|
-
@lr_class=Regression::
|
14
|
+
@lr_class=Regression::Multiple::RubyEngine
|
15
15
|
create_samples_pairs
|
16
16
|
end
|
17
17
|
def lr_class=(lr)
|
@@ -68,6 +68,7 @@ class DominanceAnalysis
|
|
68
68
|
out.add "Summary for Bootstrap Dominance Analysis of "+@fields.join(", ")+" over "+@y_var+"\n"
|
69
69
|
out.add "Size of sample: #{@n_samples}\n"
|
70
70
|
out.add "t:#{t}\n"
|
71
|
+
out.add "Linear Regression Engine: #{@lr_class.name}"
|
71
72
|
out.nl
|
72
73
|
table=ReportTable.new
|
73
74
|
header=["pairs","sD","Dij","SE(Dij)","Pij","Pji","Pno","Reprod"]
|
@@ -1,522 +1,10 @@
|
|
1
|
+
require 'statsample/regression/simple'
|
2
|
+
require 'statsample/regression/multiple'
|
3
|
+
require 'statsample/regression/multiple/alglibengine'
|
4
|
+
require 'statsample/regression/multiple/rubyengine'
|
5
|
+
|
1
6
|
module Statsample
|
2
|
-
#
|
7
|
+
# Module for regression procedures
|
3
8
|
module Regression
|
4
|
-
# Class for calculation of linear regressions
|
5
|
-
# To create a SimpleRegression object:
|
6
|
-
# * <tt> SimpleRegression.new_from_vectors(vx,vy)</tt>
|
7
|
-
# * <tt> SimpleRegression.new_from_gsl(gsl) </tt>
|
8
|
-
#
|
9
|
-
class SimpleRegression
|
10
|
-
attr_accessor :a,:b,:cov00, :cov01, :covx1, :chisq, :status
|
11
|
-
private_class_method :new
|
12
|
-
def initialize(init_method, *argv)
|
13
|
-
self.send(init_method, *argv)
|
14
|
-
end
|
15
|
-
def y(val_x)
|
16
|
-
@a+@b*val_x
|
17
|
-
end
|
18
|
-
def x(val_y)
|
19
|
-
(val_y-@a) / @b.to_f
|
20
|
-
end
|
21
|
-
# Sum of square error
|
22
|
-
def sse
|
23
|
-
(0...@vx.size).inject(0) {|acum,i|
|
24
|
-
acum+((@vy[i]-y(@vx[i]))**2)
|
25
|
-
}
|
26
|
-
end
|
27
|
-
def standard_error
|
28
|
-
Math::sqrt(sse / (@vx.size-2).to_f)
|
29
|
-
end
|
30
|
-
# Sum of square regression
|
31
|
-
def ssr
|
32
|
-
vy_mean=@vy.mean
|
33
|
-
(0...@vx.size).inject(0) {|a,i|
|
34
|
-
a+((y(@vx[i])-vy_mean)**2)
|
35
|
-
}
|
36
|
-
|
37
|
-
end
|
38
|
-
# Sum of square total
|
39
|
-
def sst
|
40
|
-
@vy.sum_of_squared_deviation
|
41
|
-
end
|
42
|
-
# Value of r
|
43
|
-
def r
|
44
|
-
@b * (@vx.sds / @vy.sds)
|
45
|
-
end
|
46
|
-
# Value of r^2
|
47
|
-
def r2
|
48
|
-
r**2
|
49
|
-
end
|
50
|
-
class << self
|
51
|
-
def new_from_gsl(ar)
|
52
|
-
new(:init_gsl, *ar)
|
53
|
-
end
|
54
|
-
def new_from_vectors(vx,vy)
|
55
|
-
new(:init_vectors,vx,vy)
|
56
|
-
end
|
57
|
-
end
|
58
|
-
def init_vectors(vx,vy)
|
59
|
-
@vx,@vy=Statsample.only_valid(vx,vy)
|
60
|
-
x_m=@vx.mean
|
61
|
-
y_m=@vy.mean
|
62
|
-
num=den=0
|
63
|
-
(0...@vx.size).each {|i|
|
64
|
-
num+=(@vx[i]-x_m)*(@vy[i]-y_m)
|
65
|
-
den+=(@vx[i]-x_m)**2
|
66
|
-
}
|
67
|
-
@b=num.to_f/den
|
68
|
-
@a=y_m - @b*x_m
|
69
|
-
end
|
70
|
-
def init_gsl(a,b,cov00, cov01, covx1, chisq, status)
|
71
|
-
@a=a
|
72
|
-
@b=b
|
73
|
-
@cov00=cov00
|
74
|
-
@cov01=cov01
|
75
|
-
@covx1=covx1
|
76
|
-
@chisq=chisq
|
77
|
-
@status=status
|
78
|
-
end
|
79
|
-
end
|
80
|
-
|
81
|
-
|
82
|
-
class MultipleRegressionBase
|
83
|
-
def initialize(ds,y_var)
|
84
|
-
@ds=ds
|
85
|
-
@y_var=y_var
|
86
|
-
@r2=nil
|
87
|
-
|
88
|
-
end
|
89
|
-
def assign_names(c)
|
90
|
-
a={}
|
91
|
-
@fields.each_index {|i|
|
92
|
-
a[@fields[i]]=c[i]
|
93
|
-
}
|
94
|
-
a
|
95
|
-
end
|
96
|
-
def predicted
|
97
|
-
(0...@ds.cases).collect { |i|
|
98
|
-
invalid=false
|
99
|
-
vect=@dep_columns.collect {|v| invalid=true if v[i].nil?; v[i]}
|
100
|
-
if invalid
|
101
|
-
nil
|
102
|
-
else
|
103
|
-
process(vect)
|
104
|
-
end
|
105
|
-
}.to_vector(:scale)
|
106
|
-
end
|
107
|
-
def standarized_predicted
|
108
|
-
predicted.standarized
|
109
|
-
end
|
110
|
-
def residuals
|
111
|
-
(0...@ds.cases).collect{|i|
|
112
|
-
invalid=false
|
113
|
-
vect=@dep_columns.collect{|v| invalid=true if v[i].nil?; v[i]}
|
114
|
-
if invalid or @ds[@y_var][i].nil?
|
115
|
-
nil
|
116
|
-
else
|
117
|
-
@ds[@y_var][i] - process(vect)
|
118
|
-
end
|
119
|
-
}.to_vector(:scale)
|
120
|
-
end
|
121
|
-
def r
|
122
|
-
raise "You should implement this"
|
123
|
-
end
|
124
|
-
def sst
|
125
|
-
raise "You should implement this"
|
126
|
-
end
|
127
|
-
def ssr
|
128
|
-
r2*sst
|
129
|
-
end
|
130
|
-
def sse
|
131
|
-
sst - ssr
|
132
|
-
end
|
133
|
-
|
134
|
-
def coeffs_t
|
135
|
-
out={}
|
136
|
-
se=coeffs_se
|
137
|
-
coeffs.each{|k,v|
|
138
|
-
out[k]=v / se[k]
|
139
|
-
}
|
140
|
-
out
|
141
|
-
end
|
142
|
-
|
143
|
-
def mse
|
144
|
-
sse/df_e
|
145
|
-
end
|
146
|
-
|
147
|
-
def df_r
|
148
|
-
@dep_columns.size
|
149
|
-
end
|
150
|
-
def df_e
|
151
|
-
@ds_valid.cases-@dep_columns.size-1
|
152
|
-
end
|
153
|
-
def f
|
154
|
-
(ssr.quo(df_r)).quo(sse.quo(df_e))
|
155
|
-
end
|
156
|
-
# Significance of Fisher
|
157
|
-
def significance
|
158
|
-
if HAS_GSL
|
159
|
-
GSL::Cdf.fdist_Q(f,df_r,df_e)
|
160
|
-
else
|
161
|
-
raise "Need Ruby/GSL"
|
162
|
-
end
|
163
|
-
end
|
164
|
-
# Tolerance for a given variable
|
165
|
-
# http://talkstats.com/showthread.php?t=5056
|
166
|
-
def tolerance(var)
|
167
|
-
ds=assign_names(@dep_columns)
|
168
|
-
ds.each{|k,v|
|
169
|
-
ds[k]=v.to_vector(:scale)
|
170
|
-
}
|
171
|
-
if HAS_ALGIB
|
172
|
-
lr_class=::Statsample::Regression::MultipleRegressionAlglib
|
173
|
-
ds=ds.to_dataset
|
174
|
-
else
|
175
|
-
lr_class=MultipleRegressionPairwise
|
176
|
-
ds=ds.to_dataset.dup_only_valid
|
177
|
-
end
|
178
|
-
lr=lr_class.new(ds,var)
|
179
|
-
1-lr.r2
|
180
|
-
end
|
181
|
-
def coeffs_tolerances
|
182
|
-
@fields.inject({}) {|a,f|
|
183
|
-
a[f]=tolerance(f);
|
184
|
-
a
|
185
|
-
}
|
186
|
-
end
|
187
|
-
def coeffs_se
|
188
|
-
out={}
|
189
|
-
mse=sse.quo(df_e)
|
190
|
-
coeffs.each {|k,v|
|
191
|
-
out[k]=Math::sqrt(mse/(@ds[k].sum_of_squares*tolerance(k)))
|
192
|
-
}
|
193
|
-
out
|
194
|
-
end
|
195
|
-
def estimated_variance_covariance_matrix
|
196
|
-
mse_p=mse
|
197
|
-
columns=[]
|
198
|
-
@ds_valid.each_vector{|k,v|
|
199
|
-
columns.push(v.data) unless k==@y_var
|
200
|
-
}
|
201
|
-
columns.unshift([1.0]*@ds_valid.cases)
|
202
|
-
x=Matrix.columns(columns)
|
203
|
-
matrix=((x.t*x)).inverse * mse
|
204
|
-
matrix.collect {|i|
|
205
|
-
|
206
|
-
Math::sqrt(i) if i>0
|
207
|
-
}
|
208
|
-
end
|
209
|
-
def constant_t
|
210
|
-
constant.to_f/constant_se
|
211
|
-
end
|
212
|
-
def constant_se
|
213
|
-
estimated_variance_covariance_matrix[0,0]
|
214
|
-
end
|
215
|
-
def summary(report_type=ConsoleSummary)
|
216
|
-
c=coeffs
|
217
|
-
out=""
|
218
|
-
out.extend report_type
|
219
|
-
out.add <<HEREDOC
|
220
|
-
Summary for regression of #{@fields.join(',')} over #{@y_var}"
|
221
|
-
*************************************************************
|
222
|
-
Cases(listwise)=#{@ds.cases}(#{@ds_valid.cases})
|
223
|
-
r=#{sprintf("%0.3f",r)}
|
224
|
-
r2=#{sprintf("%0.3f",r2)}
|
225
|
-
ssr=#{sprintf("%0.3f",ssr)}
|
226
|
-
sse=#{sprintf("%0.3f",sse)}
|
227
|
-
sst=#{sprintf("%0.3f",sst)}
|
228
|
-
F#{sprintf("(%d,%d)=%0.3f, p=%0.3f",df_r,df_e,f,significance)}
|
229
|
-
Equation=#{sprintf("%0.3f",constant)}+#{@fields.collect {|k| sprintf("%0.3f%s",c[k],k)}.join(' + ')}
|
230
|
-
|
231
|
-
HEREDOC
|
232
|
-
|
233
|
-
end
|
234
|
-
|
235
|
-
|
236
|
-
# Deprecated
|
237
|
-
# Sum of squares of error (manual calculation)
|
238
|
-
# using the predicted value minus the y_i value
|
239
|
-
def sse_manual
|
240
|
-
pr=predicted
|
241
|
-
cases=0
|
242
|
-
sse=(0...@ds.cases).inject(0) {|a,i|
|
243
|
-
if !@dy.data_with_nils[i].nil? and !pr[i].nil?
|
244
|
-
cases+=1
|
245
|
-
a+((pr[i]-@dy[i])**2)
|
246
|
-
else
|
247
|
-
a
|
248
|
-
end
|
249
|
-
}
|
250
|
-
sse*(min_n_valid-1.0).quo(cases-1)
|
251
|
-
end
|
252
|
-
# Sum of squares of regression
|
253
|
-
# using the predicted value minus y mean
|
254
|
-
def ssr_direct
|
255
|
-
mean=@dy.mean
|
256
|
-
cases=0
|
257
|
-
ssr=(0...@ds.cases).inject(0) {|a,i|
|
258
|
-
invalid=false
|
259
|
-
v=@dep_columns.collect{|c| invalid=true if c[i].nil?; c[i]}
|
260
|
-
if !invalid
|
261
|
-
cases+=1
|
262
|
-
a+((process(v)-mean)**2)
|
263
|
-
else
|
264
|
-
a
|
265
|
-
end
|
266
|
-
}
|
267
|
-
ssr
|
268
|
-
end
|
269
|
-
def sse_direct
|
270
|
-
sst-ssr
|
271
|
-
end
|
272
|
-
|
273
|
-
|
274
|
-
end
|
275
|
-
|
276
|
-
|
277
|
-
|
278
|
-
|
279
|
-
|
280
|
-
if HAS_ALGIB
|
281
|
-
# Class for calculation of multiple regression.
|
282
|
-
# Requires Alglib gem.
|
283
|
-
# To create a SimpleRegression object:
|
284
|
-
# @a=[1,3,2,4,3,5,4,6,5,7].to_vector(:scale)
|
285
|
-
# @b=[3,3,4,4,5,5,6,6,4,4].to_vector(:scale)
|
286
|
-
# @c=[11,22,30,40,50,65,78,79,99,100].to_vector(:scale)
|
287
|
-
# @y=[3,4,5,6,7,8,9,10,20,30].to_vector(:scale)
|
288
|
-
# ds={'a'=>@a,'b'=>@b,'c'=>@c,'y'=>@y}.to_dataset
|
289
|
-
# lr=Statsample::Regression::MultipleRegression.new(ds,'y')
|
290
|
-
#
|
291
|
-
class MultipleRegressionAlglib < MultipleRegressionBase
|
292
|
-
def initialize(ds,y_var)
|
293
|
-
@ds=ds.dup_only_valid
|
294
|
-
@ds_valid=@ds
|
295
|
-
@y_var=y_var
|
296
|
-
@dy=@ds[@y_var]
|
297
|
-
@ds_indep=ds.dup(ds.fields-[y_var])
|
298
|
-
# Create a custom matrix
|
299
|
-
columns=[]
|
300
|
-
@fields=[]
|
301
|
-
@ds.fields.each{|f|
|
302
|
-
if f!=@y_var
|
303
|
-
columns.push(@ds[f].to_a)
|
304
|
-
@fields.push(f)
|
305
|
-
end
|
306
|
-
}
|
307
|
-
@dep_columns=columns.dup
|
308
|
-
columns.push(@ds[@y_var])
|
309
|
-
matrix=Matrix.columns(columns)
|
310
|
-
@lr_s=nil
|
311
|
-
@lr=::Alglib::LinearRegression.build_from_matrix(matrix)
|
312
|
-
end
|
313
|
-
|
314
|
-
def _dump(i)
|
315
|
-
Marshal.dump({'ds'=>@ds,'y_var'=>@y_var})
|
316
|
-
end
|
317
|
-
def self._load(data)
|
318
|
-
h=Marshal.load(data)
|
319
|
-
MultipleRegression.new(h['ds'], h['y_var'])
|
320
|
-
end
|
321
|
-
|
322
|
-
def coeffs
|
323
|
-
assign_names(@lr.coeffs)
|
324
|
-
end
|
325
|
-
# Coefficients using a constant
|
326
|
-
# Based on http://www.xycoon.com/ols1.htm
|
327
|
-
def matrix_resolution
|
328
|
-
mse_p=mse
|
329
|
-
columns=@dep_columns.dup.map {|xi| xi.map{|i| i.to_f}}
|
330
|
-
columns.unshift([1.0]*@ds.cases)
|
331
|
-
y=Matrix.columns([@dy.data.map {|i| i.to_f}])
|
332
|
-
x=Matrix.columns(columns)
|
333
|
-
xt=x.t
|
334
|
-
matrix=((xt*x)).inverse*xt
|
335
|
-
matrix*y
|
336
|
-
end
|
337
|
-
def r2
|
338
|
-
r**2
|
339
|
-
end
|
340
|
-
def r
|
341
|
-
Bivariate::pearson(@dy,predicted)
|
342
|
-
end
|
343
|
-
def sst
|
344
|
-
@dy.ss
|
345
|
-
end
|
346
|
-
def constant
|
347
|
-
@lr.constant
|
348
|
-
end
|
349
|
-
def standarized_coeffs
|
350
|
-
l=lr_s
|
351
|
-
assign_names(l.coeffs)
|
352
|
-
end
|
353
|
-
def lr_s
|
354
|
-
if @lr_s.nil?
|
355
|
-
build_standarized
|
356
|
-
end
|
357
|
-
@lr_s
|
358
|
-
end
|
359
|
-
def build_standarized
|
360
|
-
@ds_s=@ds.standarize
|
361
|
-
columns=[]
|
362
|
-
@ds_s.fields.each{|f|
|
363
|
-
columns.push(@ds_s[f].to_a) unless f==@y_var
|
364
|
-
}
|
365
|
-
@dep_columns_s=columns.dup
|
366
|
-
columns.push(@ds_s[@y_var])
|
367
|
-
matrix=Matrix.columns(columns)
|
368
|
-
@lr_s=Alglib::LinearRegression.build_from_matrix(matrix)
|
369
|
-
end
|
370
|
-
def process(v)
|
371
|
-
@lr.process(v)
|
372
|
-
end
|
373
|
-
def process_s(v)
|
374
|
-
lr_s.process(v)
|
375
|
-
end
|
376
|
-
# ???? Not equal to SPSS output
|
377
|
-
def standarized_residuals
|
378
|
-
res=residuals
|
379
|
-
red_sd=residuals.sds
|
380
|
-
res.collect {|v|
|
381
|
-
v.quo(red_sd)
|
382
|
-
}.to_vector(:scale)
|
383
|
-
end
|
384
|
-
end
|
385
|
-
end
|
386
|
-
|
387
|
-
|
388
|
-
|
389
|
-
|
390
|
-
|
391
|
-
|
392
|
-
|
393
|
-
|
394
|
-
|
395
|
-
|
396
|
-
|
397
|
-
|
398
|
-
class MultipleRegressionPairwise < MultipleRegressionBase
|
399
|
-
def initialize(ds,y_var)
|
400
|
-
super
|
401
|
-
@dy=ds[@y_var]
|
402
|
-
@ds_valid=ds.dup_only_valid
|
403
|
-
@ds_indep=ds.dup(ds.fields-[y_var])
|
404
|
-
@fields=@ds_indep.fields
|
405
|
-
set_dep_columns
|
406
|
-
obtain_y_vector
|
407
|
-
@matrix_x = Bivariate.correlation_matrix(@ds_indep)
|
408
|
-
@coeffs_stan=(@matrix_x.inverse * @matrix_y).column(0).to_a
|
409
|
-
@min_n_valid=nil
|
410
|
-
end
|
411
|
-
def min_n_valid
|
412
|
-
if @min_n_valid.nil?
|
413
|
-
min=@ds.cases
|
414
|
-
m=Bivariate::n_valid_matrix(@ds)
|
415
|
-
for x in 0...m.row_size
|
416
|
-
for y in 0...m.column_size
|
417
|
-
min=m[x,y] if m[x,y] < min
|
418
|
-
end
|
419
|
-
end
|
420
|
-
@min_n_valid=min
|
421
|
-
end
|
422
|
-
@min_n_valid
|
423
|
-
end
|
424
|
-
def set_dep_columns
|
425
|
-
@dep_columns=[]
|
426
|
-
@ds_indep.each_vector{|k,v|
|
427
|
-
@dep_columns.push(v.data_with_nils)
|
428
|
-
}
|
429
|
-
end
|
430
|
-
# Sum of square total
|
431
|
-
def sst
|
432
|
-
#if @sst.nil?
|
433
|
-
@sst=@dy.variance*(min_n_valid-1.0)
|
434
|
-
#end
|
435
|
-
@sst
|
436
|
-
end
|
437
|
-
def r2
|
438
|
-
if @r2.nil?
|
439
|
-
c=@matrix_y
|
440
|
-
rxx=obtain_predictor_matrix
|
441
|
-
matrix=(c.t*rxx.inverse*c)
|
442
|
-
@r2=matrix[0,0]
|
443
|
-
end
|
444
|
-
@r2
|
445
|
-
end
|
446
|
-
def r
|
447
|
-
Math::sqrt(r2)
|
448
|
-
end
|
449
|
-
|
450
|
-
def df_e
|
451
|
-
min_n_valid-@dep_columns.size-1
|
452
|
-
end
|
453
|
-
def fix_with_mean
|
454
|
-
i=0
|
455
|
-
@ds_indep.each{|row|
|
456
|
-
empty=[]
|
457
|
-
row.each{|k,v|
|
458
|
-
empty.push(k) if v.nil?
|
459
|
-
}
|
460
|
-
if empty.size==1
|
461
|
-
@ds_indep[empty[0]][i]=@ds[empty[0]].mean
|
462
|
-
end
|
463
|
-
i+=1
|
464
|
-
}
|
465
|
-
@ds_indep.update_valid_data
|
466
|
-
set_dep_columns
|
467
|
-
end
|
468
|
-
def fix_with_regression
|
469
|
-
i=0
|
470
|
-
@ds_indep.each{|row|
|
471
|
-
empty=[]
|
472
|
-
row.each{|k,v|
|
473
|
-
empty.push(k) if v.nil?
|
474
|
-
}
|
475
|
-
if empty.size==1
|
476
|
-
field=empty[0]
|
477
|
-
lr=MultipleRegression.new(@ds_indep,field)
|
478
|
-
fields=[]
|
479
|
-
@ds_indep.fields.each{|f|
|
480
|
-
fields.push(row[f]) unless f==field
|
481
|
-
}
|
482
|
-
@ds_indep[field][i]=lr.process(fields)
|
483
|
-
end
|
484
|
-
i+=1
|
485
|
-
}
|
486
|
-
@ds_indep.update_valid_data
|
487
|
-
set_dep_columns
|
488
|
-
end
|
489
|
-
def obtain_y_vector
|
490
|
-
@matrix_y=Matrix.columns([@ds_indep.fields.collect{|f|
|
491
|
-
Bivariate.pearson(@dy, @ds_indep[f])
|
492
|
-
}])
|
493
|
-
end
|
494
|
-
def obtain_predictor_matrix
|
495
|
-
Bivariate::correlation_matrix(@ds_indep)
|
496
|
-
end
|
497
|
-
def constant
|
498
|
-
c=coeffs
|
499
|
-
@dy.mean-@fields.inject(0){|a,k| a+(c[k] * @ds_indep[k].mean)}
|
500
|
-
end
|
501
|
-
def process(v)
|
502
|
-
c=coeffs
|
503
|
-
total=constant
|
504
|
-
@fields.each_index{|i|
|
505
|
-
total+=c[@fields[i]]*v[i]
|
506
|
-
}
|
507
|
-
total
|
508
|
-
end
|
509
|
-
def coeffs
|
510
|
-
sc=standarized_coeffs
|
511
|
-
assign_names(@fields.collect{|f|
|
512
|
-
(sc[f]*@dy.sds).quo(@ds_indep[f].sds)
|
513
|
-
})
|
514
|
-
end
|
515
|
-
def standarized_coeffs
|
516
|
-
assign_names(@coeffs_stan)
|
517
|
-
end
|
518
|
-
end
|
519
|
-
|
520
|
-
|
521
9
|
end
|
522
10
|
end
|