statsample-ekatena 2.0.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +15 -0
- data/.travis.yml +23 -0
- data/CONTRIBUTING.md +17 -0
- data/Gemfile +2 -0
- data/History.txt +457 -0
- data/LICENSE.txt +12 -0
- data/README.md +175 -0
- data/Rakefile +44 -0
- data/benchmarks/correlation_matrix_15_variables.rb +32 -0
- data/benchmarks/correlation_matrix_5_variables.rb +33 -0
- data/benchmarks/correlation_matrix_methods/correlation_matrix.ds +0 -0
- data/benchmarks/correlation_matrix_methods/correlation_matrix.html +93 -0
- data/benchmarks/correlation_matrix_methods/correlation_matrix.rb +71 -0
- data/benchmarks/correlation_matrix_methods/correlation_matrix.xls +0 -0
- data/benchmarks/correlation_matrix_methods/correlation_matrix_gsl_ruby.ods +0 -0
- data/benchmarks/correlation_matrix_methods/correlation_matrix_with_graphics.ods +0 -0
- data/benchmarks/correlation_matrix_methods/results.ds +0 -0
- data/benchmarks/factor_map.rb +37 -0
- data/benchmarks/helpers_benchmark.rb +5 -0
- data/data/locale/es/LC_MESSAGES/statsample.mo +0 -0
- data/doc_latex/manual/equations.tex +78 -0
- data/examples/boxplot.rb +28 -0
- data/examples/chisquare_test.rb +23 -0
- data/examples/correlation_matrix.rb +32 -0
- data/examples/dataset.rb +30 -0
- data/examples/dominance_analysis.rb +33 -0
- data/examples/dominance_analysis_bootstrap.rb +32 -0
- data/examples/histogram.rb +26 -0
- data/examples/icc.rb +24 -0
- data/examples/levene.rb +29 -0
- data/examples/multiple_regression.rb +20 -0
- data/examples/multivariate_correlation.rb +33 -0
- data/examples/parallel_analysis.rb +40 -0
- data/examples/polychoric.rb +40 -0
- data/examples/principal_axis.rb +26 -0
- data/examples/reliability.rb +31 -0
- data/examples/scatterplot.rb +25 -0
- data/examples/t_test.rb +27 -0
- data/examples/tetrachoric.rb +17 -0
- data/examples/u_test.rb +24 -0
- data/examples/vector.rb +20 -0
- data/examples/velicer_map_test.rb +46 -0
- data/grab_references.rb +29 -0
- data/lib/spss.rb +134 -0
- data/lib/statsample-ekatena/analysis.rb +100 -0
- data/lib/statsample-ekatena/analysis/suite.rb +89 -0
- data/lib/statsample-ekatena/analysis/suitereportbuilder.rb +44 -0
- data/lib/statsample-ekatena/anova.rb +24 -0
- data/lib/statsample-ekatena/anova/contrast.rb +79 -0
- data/lib/statsample-ekatena/anova/oneway.rb +187 -0
- data/lib/statsample-ekatena/anova/twoway.rb +207 -0
- data/lib/statsample-ekatena/bivariate.rb +406 -0
- data/lib/statsample-ekatena/bivariate/pearson.rb +54 -0
- data/lib/statsample-ekatena/codification.rb +182 -0
- data/lib/statsample-ekatena/converter/csv.rb +28 -0
- data/lib/statsample-ekatena/converter/spss.rb +48 -0
- data/lib/statsample-ekatena/converters.rb +211 -0
- data/lib/statsample-ekatena/crosstab.rb +188 -0
- data/lib/statsample-ekatena/daru.rb +115 -0
- data/lib/statsample-ekatena/dataset.rb +10 -0
- data/lib/statsample-ekatena/dominanceanalysis.rb +425 -0
- data/lib/statsample-ekatena/dominanceanalysis/bootstrap.rb +232 -0
- data/lib/statsample-ekatena/factor.rb +104 -0
- data/lib/statsample-ekatena/factor/map.rb +124 -0
- data/lib/statsample-ekatena/factor/parallelanalysis.rb +166 -0
- data/lib/statsample-ekatena/factor/pca.rb +242 -0
- data/lib/statsample-ekatena/factor/principalaxis.rb +243 -0
- data/lib/statsample-ekatena/factor/rotation.rb +198 -0
- data/lib/statsample-ekatena/formula/fit_model.rb +46 -0
- data/lib/statsample-ekatena/formula/formula.rb +306 -0
- data/lib/statsample-ekatena/graph.rb +11 -0
- data/lib/statsample-ekatena/graph/boxplot.rb +236 -0
- data/lib/statsample-ekatena/graph/histogram.rb +198 -0
- data/lib/statsample-ekatena/graph/scatterplot.rb +213 -0
- data/lib/statsample-ekatena/histogram.rb +180 -0
- data/lib/statsample-ekatena/matrix.rb +329 -0
- data/lib/statsample-ekatena/multiset.rb +310 -0
- data/lib/statsample-ekatena/regression.rb +65 -0
- data/lib/statsample-ekatena/regression/multiple.rb +89 -0
- data/lib/statsample-ekatena/regression/multiple/alglibengine.rb +128 -0
- data/lib/statsample-ekatena/regression/multiple/baseengine.rb +251 -0
- data/lib/statsample-ekatena/regression/multiple/gslengine.rb +129 -0
- data/lib/statsample-ekatena/regression/multiple/matrixengine.rb +205 -0
- data/lib/statsample-ekatena/regression/multiple/rubyengine.rb +86 -0
- data/lib/statsample-ekatena/regression/simple.rb +121 -0
- data/lib/statsample-ekatena/reliability.rb +150 -0
- data/lib/statsample-ekatena/reliability/icc.rb +415 -0
- data/lib/statsample-ekatena/reliability/multiscaleanalysis.rb +181 -0
- data/lib/statsample-ekatena/reliability/scaleanalysis.rb +233 -0
- data/lib/statsample-ekatena/reliability/skillscaleanalysis.rb +114 -0
- data/lib/statsample-ekatena/resample.rb +15 -0
- data/lib/statsample-ekatena/shorthand.rb +125 -0
- data/lib/statsample-ekatena/srs.rb +169 -0
- data/lib/statsample-ekatena/test.rb +82 -0
- data/lib/statsample-ekatena/test/bartlettsphericity.rb +45 -0
- data/lib/statsample-ekatena/test/chisquare.rb +73 -0
- data/lib/statsample-ekatena/test/f.rb +52 -0
- data/lib/statsample-ekatena/test/kolmogorovsmirnov.rb +63 -0
- data/lib/statsample-ekatena/test/levene.rb +88 -0
- data/lib/statsample-ekatena/test/t.rb +309 -0
- data/lib/statsample-ekatena/test/umannwhitney.rb +208 -0
- data/lib/statsample-ekatena/test/wilcoxonsignedrank.rb +90 -0
- data/lib/statsample-ekatena/vector.rb +19 -0
- data/lib/statsample-ekatena/version.rb +3 -0
- data/lib/statsample.rb +282 -0
- data/po/es/statsample.mo +0 -0
- data/po/es/statsample.po +959 -0
- data/po/statsample.pot +947 -0
- data/references.txt +24 -0
- data/statsample-ekatena.gemspec +49 -0
- data/test/fixtures/bank2.dat +200 -0
- data/test/fixtures/correlation_matrix.rb +17 -0
- data/test/fixtures/df.csv +15 -0
- data/test/fixtures/hartman_23.matrix +9 -0
- data/test/fixtures/stock_data.csv +500 -0
- data/test/fixtures/tetmat_matrix.txt +5 -0
- data/test/fixtures/tetmat_test.txt +1001 -0
- data/test/helpers_tests.rb +83 -0
- data/test/test_analysis.rb +176 -0
- data/test/test_anova_contrast.rb +36 -0
- data/test/test_anovaoneway.rb +26 -0
- data/test/test_anovatwoway.rb +37 -0
- data/test/test_anovatwowaywithdataset.rb +47 -0
- data/test/test_anovawithvectors.rb +102 -0
- data/test/test_awesome_print_bug.rb +16 -0
- data/test/test_bartlettsphericity.rb +25 -0
- data/test/test_bivariate.rb +164 -0
- data/test/test_codification.rb +78 -0
- data/test/test_crosstab.rb +67 -0
- data/test/test_dominance_analysis.rb +39 -0
- data/test/test_factor.rb +228 -0
- data/test/test_factor_map.rb +38 -0
- data/test/test_factor_pa.rb +56 -0
- data/test/test_fit_model.rb +88 -0
- data/test/test_ggobi.rb +35 -0
- data/test/test_gsl.rb +15 -0
- data/test/test_histogram.rb +109 -0
- data/test/test_matrix.rb +48 -0
- data/test/test_multiset.rb +176 -0
- data/test/test_regression.rb +231 -0
- data/test/test_reliability.rb +223 -0
- data/test/test_reliability_icc.rb +198 -0
- data/test/test_reliability_skillscale.rb +57 -0
- data/test/test_resample.rb +24 -0
- data/test/test_srs.rb +9 -0
- data/test/test_statistics.rb +69 -0
- data/test/test_stest.rb +69 -0
- data/test/test_stratified.rb +17 -0
- data/test/test_test_f.rb +33 -0
- data/test/test_test_kolmogorovsmirnov.rb +34 -0
- data/test/test_test_t.rb +62 -0
- data/test/test_umannwhitney.rb +27 -0
- data/test/test_vector.rb +12 -0
- data/test/test_wilcoxonsignedrank.rb +64 -0
- metadata +570 -0
@@ -0,0 +1,128 @@
|
|
1
|
+
if HAS_ALGIB
|
2
|
+
module Statsample
|
3
|
+
module Regression
|
4
|
+
module Multiple
|
5
|
+
# Class for Multiple Regression Analysis
|
6
|
+
# Requires the Alglib gem and uses a listwise approach.
|
7
|
+
# Faster than GslEngine on massive prediction use, because process is c-based.
|
8
|
+
# Prefer GslEngine if you need good memory use.
|
9
|
+
# If you need pairwise, use RubyEngine
|
10
|
+
# Example:
|
11
|
+
#
|
12
|
+
# @a = Daru::Vector.new([1,3,2,4,3,5,4,6,5,7])
|
13
|
+
# @b = Daru::Vector.new([3,3,4,4,5,5,6,6,4,4])
|
14
|
+
# @c = Daru::Vector.new([11,22,30,40,50,65,78,79,99,100])
|
15
|
+
# @y = Daru::Vector.new([3,4,5,6,7,8,9,10,20,30])
|
16
|
+
# ds = Daru::DataFrame.new({:a => @a,:b => @b,:c => @c,:y => @y})
|
17
|
+
# lr=Statsample::Regression::Multiple::AlglibEngine.new(ds, :y)
|
18
|
+
#
|
19
|
+
class AlglibEngine < BaseEngine
|
20
|
+
# Fits a multiple regression of +y_var+ on every other vector of +ds+
# using Alglib, after listwise deletion of rows with missing values.
#
# ds::    dataset holding the predictors and the dependent variable
# y_var:: name of the dependent variable inside +ds+
# opts::  options hash forwarded untouched to BaseEngine#initialize
def initialize(ds, y_var, opts = Hash.new)
  super
  # Listwise deletion: from here on @ds only holds complete rows.
  @ds = ds.reject_values(*Daru::MISSING_VALUES)
  @ds_valid = @ds
  # BaseEngine#df_e reads @valid_cases; GslEngine sets it the same way.
  @valid_cases = @ds_valid.nrows
  @dy = @ds[@y_var]
  @ds_indep = ds.dup(ds.vectors.to_a - [y_var])
  # Predictor names, in dataset order, and their values as plain arrays.
  @fields = @ds.vectors.to_a.reject { |f| f == @y_var }
  @dep_columns = @fields.map { |f| @ds[f].to_a }
  # Alglib expects one matrix: predictor columns first, y last.
  # The y column is converted with to_a so Matrix.columns only sees Arrays.
  matrix = Matrix.columns(@dep_columns + [@dy.to_a])
  @lr_s = nil
  @lr = ::Alglib::LinearRegression.build_from_matrix(matrix)
  @coeffs = assign_names(@lr.coeffs)
end
|
42
|
+
|
43
|
+
# Marshal hook. Only the (listwise-cleaned) dataset and the name of the
# dependent variable are serialized; the fitted model is not stored.
# The +i+ argument is not used.
def _dump(i)
  Marshal.dump({'ds'=>@ds,'y_var'=>@y_var})
end
|
46
|
+
|
47
|
+
# Marshal hook. Rebuilds the engine by running the regression again on the
# dataset stored by #_dump.
# NOTE(review): Marshal.load must only ever be given trusted data.
def self._load(data)
  h=Marshal.load(data)
  self.new(h['ds'], h['y_var'])
end
|
51
|
+
|
52
|
+
# Hash of predictor name => coefficient, built with assign_names in
# #initialize from the coefficients reported by the Alglib model.
def coeffs
  @coeffs
end
|
55
|
+
# Coefficients including the constant, solved directly with the normal
# equations: (X'X)^-1 X'y, where X has a leading column of ones.
# Based on http://www.xycoon.com/ols1.htm
#
# Returns a one-column Matrix: the constant first, then one row per
# predictor in the order of @dep_columns.
def matrix_resolution
  columns = @dep_columns.map { |xi| xi.map { |i| i.to_f } }
  # Leading column of ones for the intercept; one entry per row of @ds.
  columns.unshift([1.0] * @ds.nrows)
  y = Matrix.columns([@dy.to_a.map { |i| i.to_f }])
  x = Matrix.columns(columns)
  xt = x.t
  (xt * x).inverse * xt * y
end
|
67
|
+
|
68
|
+
# Coefficient of determination: the square of #r.
def r2
  r**2
end
|
71
|
+
|
72
|
+
# Multiple correlation: Pearson correlation between the observed dependent
# variable (@dy) and the values returned by #predicted.
def r
  Bivariate::pearson(@dy,predicted)
end
|
75
|
+
|
76
|
+
# Total sum of squares, delegated to the dependent variable vector (@dy.ss).
def sst
  @dy.ss
end
|
79
|
+
|
80
|
+
# Constant (intercept) as reported by the Alglib model.
def constant
  @lr.constant
end
|
83
|
+
|
84
|
+
# Coefficients of the model fitted on the standardized dataset (see #lr_s),
# returned as a hash keyed by predictor name.
def standarized_coeffs
  l=lr_s
  assign_names(l.coeffs)
end
|
88
|
+
|
89
|
+
# Model fitted on the standardized dataset. It is built on first access by
# #build_standarized and reused afterwards.
def lr_s
  build_standarized if @lr_s.nil?
  @lr_s
end
|
95
|
+
|
96
|
+
# Fits a second Alglib model on the standardized dataset and stores it in
# @lr_s. Also stores the standardized dataset (@ds_s) and the standardized
# predictor columns (@dep_columns_s).
#
# Returns the fitted model.
def build_standarized
  @ds_s = @ds.standardize
  predictors = @ds_s.vectors.to_a.reject { |f| f == @y_var }
  @dep_columns_s = predictors.map { |f| @ds_s[f].to_a }
  # Predictor columns first, y last; every column is a plain Array so that
  # Matrix.columns does not depend on the vector class being convertible.
  matrix = Matrix.columns(@dep_columns_s + [@ds_s[@y_var].to_a])
  # Same constant lookup as #initialize (top-level ::Alglib).
  @lr_s = ::Alglib::LinearRegression.build_from_matrix(matrix)
end
|
107
|
+
|
108
|
+
# Predicted value for one row of predictor values +v+, delegated to the
# Alglib model instead of the pure-Ruby BaseEngine#process.
def process(v)
  @lr.process(v)
end
|
111
|
+
|
112
|
+
# Same as #process, but using the model fitted on standardized data
# (built on demand by #lr_s).
def process_s(v)
  lr_s.process(v)
end
|
115
|
+
# Residuals divided by their sample standard deviation.
# NOTE(review): the original author marked the result as not matching
# SPSS output; the definition itself is unchanged here.
#
# Returns a Daru::Vector; positions where the residual is nil stay nil.
def standarized_residuals
  # Compute the residuals once and reuse them for both the sd and the scaling.
  res = residuals
  res_sd = res.sds
  Daru::Vector.new(res.collect { |v| v.nil? ? nil : v.quo(res_sd) })
end
|
121
|
+
end
|
122
|
+
end
|
123
|
+
end
|
124
|
+
end # for Statsample
|
125
|
+
end # for if
|
126
|
+
|
127
|
+
|
128
|
+
|
@@ -0,0 +1,251 @@
|
|
1
|
+
module Statsample
|
2
|
+
module Regression
|
3
|
+
module Multiple
|
4
|
+
# Base class for Multiple Regression Engines
|
5
|
+
class BaseEngine
|
6
|
+
include Statsample::Summarizable
|
7
|
+
# Name of analysis
|
8
|
+
attr_accessor :name
|
9
|
+
# Minimum number of valid case for pairs of correlation
|
10
|
+
attr_reader :cases
|
11
|
+
# Number of valid cases (listwise)
|
12
|
+
attr_reader :valid_cases
|
13
|
+
# Number of total cases (dataset.cases)
|
14
|
+
attr_reader :total_cases
|
15
|
+
|
16
|
+
attr_accessor :digits
|
17
|
+
# Always true for this base class.
def self.univariate?
  true
end
|
20
|
+
# Stores the dataset and the basic counters shared by all engines.
#
# ds::    dataset with the predictors and the dependent variable
# y_var:: name of the dependent variable
# opts::  options; each key that has a public writer on this object
#         (for example :digits or :name) is assigned. Default: :digits => 3.
#
# @valid_cases is not set here; engines that use #df_e must set it.
def initialize(ds, y_var, opts = Hash.new)
  @ds = ds
  @predictors_n = @ds.vectors.size - 1
  @total_cases = @ds.nrows
  @cases = @ds.nrows
  @y_var = y_var
  @r2 = nil
  @name = _("Multiple Regression: %s over %s") % [ds.vectors.to_a.join(","), @y_var]

  opts_default = { :digits => 3 }
  @opts = opts_default.merge opts

  @opts.each do |k, v|
    # Check for the writer, not the reader: a read-only attribute such as
    # :cases must be skipped instead of raising NoMethodError.
    setter = "#{k}="
    send(setter, v) if respond_to?(setter)
  end
end
|
36
|
+
# F test for the regression. Builds a Statsample::Anova::OneWay from
# #ssr, #sse, #df_r and #df_e on first call and reuses it afterwards.
def anova
  @anova||=Statsample::Anova::OneWay.new(:ss_num=>ssr, :ss_den=>sse, :df_num=>df_r, :df_den=>df_e, :name_numerator=>_("Regression"), :name_denominator=>_("Error"), :name=>"ANOVA")
end
|
40
|
+
# Standard error of estimate: square root of #sse divided by #df_e.
def se_estimate
  Math::sqrt(sse.quo(df_e))
end
|
44
|
+
# Vector of predicted y values, one entry per case in 0...@total_cases.
# A case with a nil value in any predictor column gets nil.
def predicted
  values = (0...@total_cases).map do |i|
    row = @dep_columns.map { |column| column[i] }
    row.any?(&:nil?) ? nil : process(row)
  end
  Daru::Vector.new(values)
end
|
58
|
+
# Vector with the standardized predicted values for y.
# Uses Daru's spelling +standardize+, the same method name this file
# already calls on the dataset.
def standarized_predicted
  predicted.standardize
end
|
62
|
+
# Vector of residuals (observed y minus predicted y), one entry per case
# in 0...@total_cases. A case gets nil when any predictor value or the
# observed y is nil.
def residuals
  values = (0...@total_cases).map do |i|
    row = @dep_columns.map { |column| column[i] }
    # Skip before touching y, exactly as the predictor check comes first.
    next nil if row.any?(&:nil?)
    observed = @ds[@y_var][i]
    observed.nil? ? nil : observed - process(row)
  end
  Daru::Vector.new(values)
end
|
76
|
+
# Multiple R. Not implemented in the base class: raises a RuntimeError;
# the engines in this file (AlglibEngine, GslEngine) define their own.
def r
  raise "You should implement this"
end
|
80
|
+
# Total sum of squares. Not implemented in the base class: raises a
# RuntimeError; the engines in this file define their own.
def sst
  raise "You should implement this"
end
|
84
|
+
# Adjusted R^2.
# Estimates the population R^2 using Ezekiel's formula, written here as
# R^2 - ((1 - R^2) * predictors) / df_e.
# Always lower than sample R^2
# == Reference:
# * Leach, L. & Henson, R. (2007). The Use and Impact of Adjusted R2 Effects in Published Regression Research. Multiple Linear Regression Viewpoints, 33(1), 1-11.
def r2_adjusted
  r2-((1-r2)*@predictors_n).quo(df_e)
end
|
92
|
+
# Sum of squares (regression), computed as #r2 times #sst.
def ssr
  r2*sst
end
|
96
|
+
# Sum of squares (error), computed as #sst minus #ssr.
def sse
  sst - ssr
end
|
100
|
+
# t values for the coefficients: each coefficient divided by its standard
# error. Returns a hash keyed like #coeffs.
def coeffs_t
  errors = coeffs_se
  coeffs.each_with_object({}) do |(name, b), t_values|
    t_values[name] = b / errors[name]
  end
end
|
109
|
+
# Mean square (regression): #ssr divided by #df_r.
def msr
  ssr.quo(df_r)
end
|
113
|
+
# Mean square (error): #sse divided by #df_e.
def mse
  sse.quo(df_e)
end
|
117
|
+
# Degrees of freedom for regression: the number of predictors
# (vectors in the dataset minus one, as counted in #initialize).
def df_r
  @predictors_n
end
|
121
|
+
# Degrees of freedom for error: valid cases - predictors - 1.
# @valid_cases is not assigned by BaseEngine#initialize; the engine
# subclass has to set it (GslEngine does so in its own initialize).
def df_e
  @valid_cases-@predictors_n-1
end
|
125
|
+
# F statistic, taken from the memoized #anova object.
def f
  anova.f
end
|
129
|
+
# p-value of the F test, taken from the memoized #anova object.
def probability
  anova.probability
end
|
133
|
+
# Tolerance for a given variable: 1 - R^2 of a regression, run with the
# same engine class, of +var+ on the remaining predictors.
# http://talkstats.com/showthread.php?t=5056
def tolerance(var)
  predictor_vectors = assign_names(@dep_columns).each_with_object({}) do |(name, column), acc|
    acc[name] = Daru::Vector.new(column)
  end
  auxiliary = self.class.new(Daru::DataFrame.new(predictor_vectors), var)
  1 - auxiliary.r2
end
|
141
|
+
# Tolerance of every predictor, as a hash of predictor name => #tolerance.
def coeffs_tolerances
  @fields.each_with_object({}) do |field, tolerances|
    tolerances[field] = tolerance(field)
  end
end
|
148
|
+
# Standard error for each coefficient:
# sqrt(MSE / (sum of squares of the predictor * tolerance of the predictor)).
# Returns a hash keyed like #coeffs.
def coeffs_se
  mean_square_error = sse.quo(df_e)
  coeffs.each_with_object({}) do |(name, _b), errors|
    denominator = @ds[name].sum_of_squares * tolerance(name)
    errors[name] = Math.sqrt(mean_square_error / denominator)
  end
end
|
157
|
+
# Standard error of R^2.
# NOTE(review): the original author marked this formula with "????";
# it has not been verified against a reference.
def se_r2
  Math::sqrt((4*r2*(1-r2)**2*(df_e)**2).quo((@cases**2-1)*(@cases+3)))
end
|
162
|
+
|
163
|
+
# Estimated variance-covariance matrix, used for the standard error of the
# constant. Computes (X'X)^-1 * MSE, where X has a leading column of ones
# followed by the predictor columns of @ds_valid, and then replaces every
# non-negative cell by its square root (negative cells become nil).
def estimated_variance_covariance_matrix
  predictor_names = @ds_valid.vectors.to_a.reject { |name| name == @y_var }
  predictor_columns = predictor_names.map { |name| @ds_valid[name].to_a }
  design = ::Matrix.columns([[1.0] * @valid_cases] + predictor_columns)
  scaled = (design.t * design).inverse * mse
  scaled.collect { |cell| cell >= 0 ? Math.sqrt(cell) : nil }
end
|
177
|
+
# t value for the constant: #constant divided by #constant_se.
def constant_t
  constant.to_f/constant_se
end
|
181
|
+
# Standard error for the constant: cell [0,0] of
# #estimated_variance_covariance_matrix (the intercept column comes first).
def constant_se
  estimated_variance_covariance_matrix[0,0]
end
|
185
|
+
# Writes the regression report into the builder +b+: a section named after
# the analysis with the engine class, case counts, R, R^2, adjusted R^2,
# standard error of estimate, the fitted equation, the ANOVA element and a
# table of coefficients.
def report_building(b)
  # Number format for every figure, e.g. "%0.3f" when digits is 3.
  di="%0.#{digits}f"
  b.section(:name=>@name) do |g|
    c=coeffs
    g.text _("Engine: %s") % self.class
    g.text(_("Cases(listwise)=%d(%d)") % [@total_cases, @valid_cases])
    g.text _("R=")+(di % r)
    g.text _("R^2=")+(di % r2)
    g.text _("R^2 Adj=")+(di % r2_adjusted)
    g.text _("Std.Error R=")+ (di % se_estimate)

    # Equation: constant followed by "coefficient + predictor name" terms.
    g.text(_("Equation")+"="+ sprintf(di,constant) +" + "+ @fields.collect {|k| sprintf("#{di}%s",c[k],k)}.join(' + ') )

    g.parse_element(anova)
    sc=standarized_coeffs

    cse=coeffs_se
    g.table(:name=>_("Beta coefficients"), :header=>%w{coeff b beta se t}.collect{|field| _(field)} ) do |t|
      # Constant row has no beta; se and t are left blank when nil.
      t.row([_("Constant"), sprintf(di, constant), "-", constant_se.nil? ? "": sprintf(di, constant_se), constant_t.nil? ? "" : sprintf(di, constant_t)])
      @fields.each do |f|
        # Columns: name, b, standardized beta, standard error, t (b / se).
        t.row([f, sprintf(di, c[f]), sprintf(di, sc[f]), sprintf(di, cse[f]), sprintf(di, c[f].quo(cse[f]))])
      end
    end
  end
end
|
210
|
+
|
211
|
+
|
212
|
+
# Pairs each predictor name in @fields with the element of +c+ found at
# the same position. Returns a hash of name => value.
def assign_names(c)
  @fields.each_with_index.each_with_object({}) do |(field, position), named|
    named[field] = c[position]
  end
end
|
219
|
+
|
220
|
+
# Sum of squares of regression, computed directly as the sum of
# (predicted value - mean of y)^2 over the rows of @ds.
# Rows with a nil value in any predictor column are skipped.
def ssr_direct
  mean = @dy.mean
  # nrows is the row count used everywhere else in this file.
  (0...@ds.nrows).inject(0) do |sum, i|
    row = @dep_columns.map { |column| column[i] }
    next sum if row.any?(&:nil?)
    sum + (process(row) - mean)**2
  end
end
|
237
|
+
# Sum of squares (error), computed as #sst minus #ssr.
# NOTE(review): this is the same expression as #sse; given the name it
# presumably was meant to use #ssr_direct - confirm before changing.
def sse_direct
  sst-ssr
end
|
240
|
+
# Predicted value for one row of predictor values +v+ (same order as
# @fields): the constant plus the sum of coefficient * value.
def process(v)
  weights = coeffs
  @fields.each_with_index.inject(constant) do |total, (field, position)|
    total + weights[field] * v[position]
  end
end
|
248
|
+
end
|
249
|
+
end
|
250
|
+
end
|
251
|
+
end
|
@@ -0,0 +1,129 @@
|
|
1
|
+
if Statsample.has_gsl?
|
2
|
+
module Statsample
|
3
|
+
module Regression
|
4
|
+
module Multiple
|
5
|
+
# Class for Multiple Regression Analysis
|
6
|
+
# Requires rbgsl and uses a listwise approach.
|
7
|
+
# Slower on prediction of values than Alglib, because predict is ruby based.
|
8
|
+
# Better memory management on multiple (+1000) series of regression.
|
9
|
+
# If you need pairwise, use RubyEngine
|
10
|
+
# Example:
|
11
|
+
#
|
12
|
+
# @a = Daru::Vector.new([1,3,2,4,3,5,4,6,5,7])
|
13
|
+
# @b = Daru::Vector.new([3,3,4,4,5,5,6,6,4,4])
|
14
|
+
# @c = Daru::Vector.new([11,22,30,40,50,65,78,79,99,100])
|
15
|
+
# @y = Daru::Vector.new([3,4,5,6,7,8,9,10,20,30])
|
16
|
+
# ds = Daru::DataFrame.new({:a => @a,:b => @b,:c => @c,:y => @y})
|
17
|
+
# lr=Statsample::Regression::Multiple::GslEngine.new(ds,:y)
|
18
|
+
#
|
19
|
+
class GslEngine < BaseEngine
|
20
|
+
# Fits a multiple regression of +y_var+ on every other vector of +ds+
# with GSL::MultiFit.linear, after listwise deletion of rows with
# missing values.
#
# ds::    dataset holding the predictors and the dependent variable
# y_var:: name of the dependent variable inside +ds+
# opts::  options hash forwarded untouched to BaseEngine#initialize
def initialize(ds,y_var, opts=Hash.new)
  super
  # Listwise deletion: from here on @ds only holds complete rows.
  @ds = ds.reject_values(*Daru::MISSING_VALUES)
  @ds_valid = @ds
  @valid_cases = @ds_valid.nrows
  @dy = @ds[@y_var]
  @ds_indep = ds.dup(ds.vectors.to_a - [y_var])
  # Create a custom matrix
  columns=[]
  @fields=[]
  # Design matrix: one column per predictor plus a last column of ones for
  # the constant. Its width equals the number of vectors because the slot
  # that y would occupy is used for the constant.
  max_deps = GSL::Matrix.alloc(@ds.nrows, @ds.vectors.size)
  constant_col=@ds.vectors.size-1
  for i in 0...@ds.nrows
    max_deps.set(i,constant_col,1)
  end
  # j is the design-matrix column of the predictor being copied.
  j = 0
  @ds.vectors.each do |f|
    if f != @y_var
      # NOTE(review): each_index values are used as matrix row numbers;
      # presumably assumes a 0..n-1 index after reject_values - confirm.
      @ds[f].each_index do |i1|
        max_deps.set(i1,j,@ds[f][i1])
      end

      columns.push(@ds[f].to_a)
      @fields.push(f)
      j += 1
    end
  end
  @dep_columns = columns.dup
  @lr_s = nil
  c, @cov, @chisq, @status = GSL::MultiFit.linear(max_deps, @dy.to_gsl)
  # The constant sits in the last position, predictors before it.
  @constant=c[constant_col]
  @coeffs_a=c.to_a.slice(0...constant_col)
  @coeffs=assign_names(@coeffs_a)
  c=nil
end
|
55
|
+
|
56
|
+
# Marshal hook. Only the (listwise-cleaned) dataset and the name of the
# dependent variable are serialized; the fitted model is not stored.
# The +i+ argument is not used.
def _dump(i)
  Marshal.dump({'ds'=>@ds,'y_var'=>@y_var})
end
|
59
|
+
# Marshal hook. Rebuilds the engine by running the regression again on the
# dataset stored by #_dump.
# NOTE(review): Marshal.load must only ever be given trusted data.
def self._load(data)
  h=Marshal.load(data)
  self.new(h['ds'], h['y_var'])
end
|
63
|
+
|
64
|
+
# Hash of predictor name => coefficient, built with assign_names in
# #initialize from the GSL fit (constant excluded).
def coeffs
  @coeffs
end
|
67
|
+
# Coefficients including the constant, solved directly with the normal
# equations: (X'X)^-1 X'y, where X has a leading column of ones.
# Based on http://www.xycoon.com/ols1.htm
#
# Returns a one-column Matrix: the constant first, then one row per
# predictor in the order of @dep_columns.
def matrix_resolution
  columns = @dep_columns.map { |xi| xi.map { |i| i.to_f } }
  # Leading column of ones for the intercept; nrows is the row count this
  # class already uses for @ds in #initialize.
  columns.unshift([1.0] * @ds.nrows)
  y = Matrix.columns([@dy.to_a.map { |i| i.to_f }])
  x = Matrix.columns(columns)
  xt = x.t
  (xt * x).inverse * xt * y
end
|
78
|
+
# Coefficient of determination: the square of #r.
def r2
  r**2
end
|
81
|
+
# Multiple correlation: Pearson correlation between the observed dependent
# variable (@dy) and the values returned by #predicted.
def r
  Bivariate::pearson(@dy, predicted)
end
|
84
|
+
# Total sum of squares, delegated to the dependent variable vector (@dy.ss).
def sst
  @dy.ss
end
|
87
|
+
# Constant (intercept), extracted from the GSL fit in #initialize.
def constant
  @constant
end
|
90
|
+
# Coefficients of the engine fitted on the standardized dataset
# (see #lr_s), as returned by that engine's #coeffs.
def standarized_coeffs
  l=lr_s
  l.coeffs
end
|
94
|
+
# Engine fitted on the standardized dataset. It is built on first access
# by #build_standarized and reused afterwards.
def lr_s
  build_standarized if @lr_s.nil?
  @lr_s
end
|
100
|
+
# Standardizes the dataset (stored in @ds_s) and fits a new GslEngine on
# it for the same dependent variable; the engine is stored in @lr_s.
def build_standarized
  @ds_s=@ds.standardize
  @lr_s=GslEngine.new(@ds_s,@y_var)
end
|
104
|
+
# Predicted value for one row of predictor values +v+ using the engine
# fitted on standardized data (built on demand by #lr_s).
def process_s(v)
  lr_s.process(v)
end
|
107
|
+
# Residuals divided by their sample standard deviation.
# NOTE(review): the original author marked the result as not matching
# SPSS output; the definition itself is unchanged here.
#
# Returns a Daru::Vector; positions where the residual is nil stay nil.
def standarized_residuals
  # Compute the residuals once and reuse them for both the sd and the scaling.
  res = residuals
  res_sd = res.sds
  Daru::Vector.new(res.collect { |v| v.nil? ? nil : v.quo(res_sd) })
end
|
113
|
+
|
114
|
+
# Standard error for each coefficient, read from the diagonal of
# #estimated_variance_covariance_matrix.
#
# That matrix is built from a leading column of ones followed by the
# predictor columns (every vector of @ds_valid except y, in dataset order),
# so the n-th predictor sits at diagonal position n + 1 wherever the y
# column appears in the dataset.
#
# Returns a hash of predictor name => standard error.
def coeffs_se
  evcm = estimated_variance_covariance_matrix
  predictors = @ds_valid.vectors.to_a.reject { |f| f == @y_var }
  predictors.each_with_index.each_with_object({}) do |(f, i), out|
    # +1 skips the constant, which occupies position 0.
    out[f] = evcm[i + 1, i + 1]
  end
end
|
125
|
+
end
|
126
|
+
end
|
127
|
+
end
|
128
|
+
end # for Statsample
|
129
|
+
end # for if
|