statsample-ekatena 2.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +15 -0
- data/.travis.yml +23 -0
- data/CONTRIBUTING.md +17 -0
- data/Gemfile +2 -0
- data/History.txt +457 -0
- data/LICENSE.txt +12 -0
- data/README.md +175 -0
- data/Rakefile +44 -0
- data/benchmarks/correlation_matrix_15_variables.rb +32 -0
- data/benchmarks/correlation_matrix_5_variables.rb +33 -0
- data/benchmarks/correlation_matrix_methods/correlation_matrix.ds +0 -0
- data/benchmarks/correlation_matrix_methods/correlation_matrix.html +93 -0
- data/benchmarks/correlation_matrix_methods/correlation_matrix.rb +71 -0
- data/benchmarks/correlation_matrix_methods/correlation_matrix.xls +0 -0
- data/benchmarks/correlation_matrix_methods/correlation_matrix_gsl_ruby.ods +0 -0
- data/benchmarks/correlation_matrix_methods/correlation_matrix_with_graphics.ods +0 -0
- data/benchmarks/correlation_matrix_methods/results.ds +0 -0
- data/benchmarks/factor_map.rb +37 -0
- data/benchmarks/helpers_benchmark.rb +5 -0
- data/data/locale/es/LC_MESSAGES/statsample.mo +0 -0
- data/doc_latex/manual/equations.tex +78 -0
- data/examples/boxplot.rb +28 -0
- data/examples/chisquare_test.rb +23 -0
- data/examples/correlation_matrix.rb +32 -0
- data/examples/dataset.rb +30 -0
- data/examples/dominance_analysis.rb +33 -0
- data/examples/dominance_analysis_bootstrap.rb +32 -0
- data/examples/histogram.rb +26 -0
- data/examples/icc.rb +24 -0
- data/examples/levene.rb +29 -0
- data/examples/multiple_regression.rb +20 -0
- data/examples/multivariate_correlation.rb +33 -0
- data/examples/parallel_analysis.rb +40 -0
- data/examples/polychoric.rb +40 -0
- data/examples/principal_axis.rb +26 -0
- data/examples/reliability.rb +31 -0
- data/examples/scatterplot.rb +25 -0
- data/examples/t_test.rb +27 -0
- data/examples/tetrachoric.rb +17 -0
- data/examples/u_test.rb +24 -0
- data/examples/vector.rb +20 -0
- data/examples/velicer_map_test.rb +46 -0
- data/grab_references.rb +29 -0
- data/lib/spss.rb +134 -0
- data/lib/statsample-ekatena/analysis.rb +100 -0
- data/lib/statsample-ekatena/analysis/suite.rb +89 -0
- data/lib/statsample-ekatena/analysis/suitereportbuilder.rb +44 -0
- data/lib/statsample-ekatena/anova.rb +24 -0
- data/lib/statsample-ekatena/anova/contrast.rb +79 -0
- data/lib/statsample-ekatena/anova/oneway.rb +187 -0
- data/lib/statsample-ekatena/anova/twoway.rb +207 -0
- data/lib/statsample-ekatena/bivariate.rb +406 -0
- data/lib/statsample-ekatena/bivariate/pearson.rb +54 -0
- data/lib/statsample-ekatena/codification.rb +182 -0
- data/lib/statsample-ekatena/converter/csv.rb +28 -0
- data/lib/statsample-ekatena/converter/spss.rb +48 -0
- data/lib/statsample-ekatena/converters.rb +211 -0
- data/lib/statsample-ekatena/crosstab.rb +188 -0
- data/lib/statsample-ekatena/daru.rb +115 -0
- data/lib/statsample-ekatena/dataset.rb +10 -0
- data/lib/statsample-ekatena/dominanceanalysis.rb +425 -0
- data/lib/statsample-ekatena/dominanceanalysis/bootstrap.rb +232 -0
- data/lib/statsample-ekatena/factor.rb +104 -0
- data/lib/statsample-ekatena/factor/map.rb +124 -0
- data/lib/statsample-ekatena/factor/parallelanalysis.rb +166 -0
- data/lib/statsample-ekatena/factor/pca.rb +242 -0
- data/lib/statsample-ekatena/factor/principalaxis.rb +243 -0
- data/lib/statsample-ekatena/factor/rotation.rb +198 -0
- data/lib/statsample-ekatena/formula/fit_model.rb +46 -0
- data/lib/statsample-ekatena/formula/formula.rb +306 -0
- data/lib/statsample-ekatena/graph.rb +11 -0
- data/lib/statsample-ekatena/graph/boxplot.rb +236 -0
- data/lib/statsample-ekatena/graph/histogram.rb +198 -0
- data/lib/statsample-ekatena/graph/scatterplot.rb +213 -0
- data/lib/statsample-ekatena/histogram.rb +180 -0
- data/lib/statsample-ekatena/matrix.rb +329 -0
- data/lib/statsample-ekatena/multiset.rb +310 -0
- data/lib/statsample-ekatena/regression.rb +65 -0
- data/lib/statsample-ekatena/regression/multiple.rb +89 -0
- data/lib/statsample-ekatena/regression/multiple/alglibengine.rb +128 -0
- data/lib/statsample-ekatena/regression/multiple/baseengine.rb +251 -0
- data/lib/statsample-ekatena/regression/multiple/gslengine.rb +129 -0
- data/lib/statsample-ekatena/regression/multiple/matrixengine.rb +205 -0
- data/lib/statsample-ekatena/regression/multiple/rubyengine.rb +86 -0
- data/lib/statsample-ekatena/regression/simple.rb +121 -0
- data/lib/statsample-ekatena/reliability.rb +150 -0
- data/lib/statsample-ekatena/reliability/icc.rb +415 -0
- data/lib/statsample-ekatena/reliability/multiscaleanalysis.rb +181 -0
- data/lib/statsample-ekatena/reliability/scaleanalysis.rb +233 -0
- data/lib/statsample-ekatena/reliability/skillscaleanalysis.rb +114 -0
- data/lib/statsample-ekatena/resample.rb +15 -0
- data/lib/statsample-ekatena/shorthand.rb +125 -0
- data/lib/statsample-ekatena/srs.rb +169 -0
- data/lib/statsample-ekatena/test.rb +82 -0
- data/lib/statsample-ekatena/test/bartlettsphericity.rb +45 -0
- data/lib/statsample-ekatena/test/chisquare.rb +73 -0
- data/lib/statsample-ekatena/test/f.rb +52 -0
- data/lib/statsample-ekatena/test/kolmogorovsmirnov.rb +63 -0
- data/lib/statsample-ekatena/test/levene.rb +88 -0
- data/lib/statsample-ekatena/test/t.rb +309 -0
- data/lib/statsample-ekatena/test/umannwhitney.rb +208 -0
- data/lib/statsample-ekatena/test/wilcoxonsignedrank.rb +90 -0
- data/lib/statsample-ekatena/vector.rb +19 -0
- data/lib/statsample-ekatena/version.rb +3 -0
- data/lib/statsample.rb +282 -0
- data/po/es/statsample.mo +0 -0
- data/po/es/statsample.po +959 -0
- data/po/statsample.pot +947 -0
- data/references.txt +24 -0
- data/statsample-ekatena.gemspec +49 -0
- data/test/fixtures/bank2.dat +200 -0
- data/test/fixtures/correlation_matrix.rb +17 -0
- data/test/fixtures/df.csv +15 -0
- data/test/fixtures/hartman_23.matrix +9 -0
- data/test/fixtures/stock_data.csv +500 -0
- data/test/fixtures/tetmat_matrix.txt +5 -0
- data/test/fixtures/tetmat_test.txt +1001 -0
- data/test/helpers_tests.rb +83 -0
- data/test/test_analysis.rb +176 -0
- data/test/test_anova_contrast.rb +36 -0
- data/test/test_anovaoneway.rb +26 -0
- data/test/test_anovatwoway.rb +37 -0
- data/test/test_anovatwowaywithdataset.rb +47 -0
- data/test/test_anovawithvectors.rb +102 -0
- data/test/test_awesome_print_bug.rb +16 -0
- data/test/test_bartlettsphericity.rb +25 -0
- data/test/test_bivariate.rb +164 -0
- data/test/test_codification.rb +78 -0
- data/test/test_crosstab.rb +67 -0
- data/test/test_dominance_analysis.rb +39 -0
- data/test/test_factor.rb +228 -0
- data/test/test_factor_map.rb +38 -0
- data/test/test_factor_pa.rb +56 -0
- data/test/test_fit_model.rb +88 -0
- data/test/test_ggobi.rb +35 -0
- data/test/test_gsl.rb +15 -0
- data/test/test_histogram.rb +109 -0
- data/test/test_matrix.rb +48 -0
- data/test/test_multiset.rb +176 -0
- data/test/test_regression.rb +231 -0
- data/test/test_reliability.rb +223 -0
- data/test/test_reliability_icc.rb +198 -0
- data/test/test_reliability_skillscale.rb +57 -0
- data/test/test_resample.rb +24 -0
- data/test/test_srs.rb +9 -0
- data/test/test_statistics.rb +69 -0
- data/test/test_stest.rb +69 -0
- data/test/test_stratified.rb +17 -0
- data/test/test_test_f.rb +33 -0
- data/test/test_test_kolmogorovsmirnov.rb +34 -0
- data/test/test_test_t.rb +62 -0
- data/test/test_umannwhitney.rb +27 -0
- data/test/test_vector.rb +12 -0
- data/test/test_wilcoxonsignedrank.rb +64 -0
- metadata +570 -0
|
@@ -0,0 +1,128 @@
|
|
|
1
|
+
# NOTE(review): the guard constant is spelled HAS_ALGIB here; confirm this
# matches the spelling used where the constant is defined.
if HAS_ALGIB
  module Statsample
    module Regression
      module Multiple
        # Class for Multiple Regression Analysis.
        # Requires the Alglib gem and uses a listwise approach: every row
        # holding a missing value is dropped before the model is fitted.
        # Faster than GslEngine on massive prediction use, because process is c-based.
        # Prefer GslEngine if you need good memory use.
        # If you need pairwise, use RubyEngine
        # Example:
        #
        #   @a = Daru::Vector.new([1,3,2,4,3,5,4,6,5,7])
        #   @b = Daru::Vector.new([3,3,4,4,5,5,6,6,4,4])
        #   @c = Daru::Vector.new([11,22,30,40,50,65,78,79,99,100])
        #   @y = Daru::Vector.new([3,4,5,6,7,8,9,10,20,30])
        #   ds = Daru::DataFrame.new({:a => @a,:b => @b,:c => @c,:y => @y})
        #   lr=Statsample::Regression::Multiple::AlglibEngine.new(ds, :y)
        #
        class AlglibEngine < BaseEngine
          # Fits the regression of +y_var+ on every other vector of +ds+.
          #
          # ds::    data frame holding predictors and the dependent variable
          # y_var:: name of the dependent vector inside +ds+
          # opts::  options forwarded to BaseEngine#initialize
          def initialize(ds, y_var, opts = Hash.new)
            super
            @ds = ds.reject_values(*Daru::MISSING_VALUES)
            @ds_valid = @ds
            # BaseEngine#df_e (and therefore mse, anova and the report) reads
            # @valid_cases, so it has to be set once the listwise data is known.
            @valid_cases = @ds_valid.nrows
            @dy = @ds[@y_var]
            @ds_indep = ds.dup(ds.vectors.to_a - [y_var])

            # Predictor names and their value arrays, in data frame order.
            @fields = @ds.vectors.to_a.reject { |f| f == @y_var }
            @dep_columns = @fields.map { |f| @ds[f].to_a }

            # Alglib receives one matrix: predictors first, dependent variable
            # last. Every column is a plain Array so Matrix.columns accepts it.
            matrix = Matrix.columns(@dep_columns + [@dy.to_a])
            @lr_s = nil
            @lr = ::Alglib::LinearRegression.build_from_matrix(matrix)
            @coeffs = assign_names(@lr.coeffs)
          end

          # Marshal hook: only the data and the dependent variable name are
          # stored; the model is rebuilt on load.
          def _dump(i)
            Marshal.dump({'ds'=>@ds,'y_var'=>@y_var})
          end

          # Marshal hook: rebuilds the engine from the dumped data frame.
          def self._load(data)
            h = Marshal.load(data)
            self.new(h['ds'], h['y_var'])
          end

          # Hash of predictor name => unstandardized coefficient.
          def coeffs
            @coeffs
          end

          # Coefficients using a constant
          # Based on http://www.xycoon.com/ols1.htm
          #
          # Returns the column matrix (X'X)^-1 X'y, where X is the predictor
          # matrix with a leading column of ones.
          def matrix_resolution
            columns = @dep_columns.map { |xi| xi.map { |i| i.to_f } }
            columns.unshift([1.0] * @ds.nrows)
            y = Matrix.columns([@dy.to_a.map { |i| i.to_f }])
            x = Matrix.columns(columns)
            xt = x.t
            (xt * x).inverse * xt * y
          end

          # Squared multiple correlation.
          def r2
            r**2
          end

          # Multiple correlation: Pearson correlation between the observed
          # and the predicted values of the dependent variable.
          def r
            Bivariate::pearson(@dy, predicted)
          end

          # Total sum of squares of the dependent variable.
          def sst
            @dy.ss
          end

          # Constant term reported by the Alglib model.
          def constant
            @lr.constant
          end

          # Hash of predictor name => coefficient of the model fitted on the
          # standardized data.
          def standarized_coeffs
            assign_names(lr_s.coeffs)
          end

          # Alglib model fitted on standardized data; built on first use.
          def lr_s
            build_standarized if @lr_s.nil?
            @lr_s
          end

          # Standardizes the listwise data frame and fits a second Alglib
          # model on it, stored in @lr_s.
          def build_standarized
            @ds_s = @ds.standardize
            predictors = @ds_s.vectors.to_a.reject { |f| f == @y_var }
            @dep_columns_s = predictors.map { |f| @ds_s[f].to_a }
            matrix = Matrix.columns(@dep_columns_s + [@ds_s[@y_var].to_a])
            @lr_s = ::Alglib::LinearRegression.build_from_matrix(matrix)
          end

          # Predicted value for one row of predictor values +v+.
          def process(v)
            @lr.process(v)
          end

          # Predicted value for one row +v+ using the standardized model.
          def process_s(v)
            lr_s.process(v)
          end

          # ???? Not equal to SPSS output
          #
          # Residuals divided by their sample standard deviation. Entries that
          # BaseEngine#residuals reports as nil stay nil.
          def standarized_residuals
            res = residuals
            res_sd = res.sds
            Daru::Vector.new(res.collect { |v| v.nil? ? nil : v.quo(res_sd) })
          end
        end
      end
    end
  end # for Statsample
end # for if
|
|
126
|
+
|
|
127
|
+
|
|
128
|
+
|
|
@@ -0,0 +1,251 @@
|
|
|
1
|
+
module Statsample
  module Regression
    module Multiple
      # Base class for Multiple Regression Engines.
      #
      # This class holds the statistics that are derived from a fitted model.
      # It does not fit anything itself: a concrete engine must implement
      # +r+, +r2+, +sst+, +coeffs+, +constant+ and +standarized_coeffs+, and
      # must set @fields, @dep_columns, @dy, @ds_valid and @valid_cases,
      # all of which are read by the methods below.
      class BaseEngine
        include Statsample::Summarizable
        # Name of analysis
        attr_accessor :name
        # Minimum number of valid case for pairs of correlation
        attr_reader :cases
        # Number of valid cases (listwise)
        attr_reader :valid_cases
        # Number of total cases (dataset.cases)
        attr_reader :total_cases

        # Number of decimal digits used by the report.
        attr_accessor :digits

        def self.univariate?
          true
        end

        # ds::    data frame with the predictors and the dependent variable
        # y_var:: name of the dependent vector
        # opts::  option hash; each key the engine responds to is assigned
        #         through its writer (default: :digits => 3)
        def initialize(ds, y_var, opts = Hash.new)
          @ds = ds
          @predictors_n = @ds.vectors.size - 1
          @total_cases = @ds.nrows
          @cases = @ds.nrows
          @y_var = y_var
          @r2 = nil
          @name = _("Multiple Regression: %s over %s") % [ds.vectors.to_a.join(","), @y_var]

          opts_default = {:digits=>3}
          @opts = opts_default.merge opts

          @opts.each do |k, v|
            send("#{k}=", v) if respond_to? k
          end
        end

        # Calculate F Test
        def anova
          @anova ||= Statsample::Anova::OneWay.new(:ss_num=>ssr, :ss_den=>sse, :df_num=>df_r, :df_den=>df_e, :name_numerator=>_("Regression"), :name_denominator=>_("Error"), :name=>"ANOVA")
        end

        # Standard error of estimate
        def se_estimate
          Math::sqrt(sse.quo(df_e))
        end

        # Retrieves a vector with predicted values for y.
        # A row with a nil predictor yields nil.
        def predicted
          values = @total_cases.times.collect do |i|
            row = @dep_columns.collect { |column| column[i] }
            row.any? { |value| value.nil? } ? nil : process(row)
          end
          Daru::Vector.new(values)
        end

        # Retrieves a vector with standarized values for y
        def standarized_predicted
          # Daru spells this method `standardize`.
          predicted.standardize
        end

        # Retrieves a vector with residuals values for y.
        # A row with a nil predictor or a nil observed value yields nil.
        def residuals
          # Read y positionally, like the @dep_columns arrays, so row i of the
          # predictors is always paired with row i of the dependent variable.
          observed = @ds[@y_var].to_a
          values = (0...@total_cases).collect do |i|
            row = @dep_columns.collect { |column| column[i] }
            y = observed[i]
            if y.nil? || row.any? { |value| value.nil? }
              nil
            else
              y - process(row)
            end
          end
          Daru::Vector.new(values)
        end

        # R Multiple
        def r
          raise "You should implement this"
        end

        # Sum of squares Total
        def sst
          raise "You should implement this"
        end

        # R^2 Adjusted.
        # Estimate Population R^2 using Ezequiel formula.
        # Always lower than sample R^2
        # == Reference:
        # * Leach, L. & Henson, R. (2007). The Use and Impact of Adjusted R2 Effects in Published Regression Research. Multiple Linear Regression Viewpoints, 33(1), 1-11.
        def r2_adjusted
          r2 - ((1 - r2) * @predictors_n).quo(df_e)
        end

        # Sum of squares (regression)
        def ssr
          r2 * sst
        end

        # Sum of squares (Error)
        def sse
          sst - ssr
        end

        # T values for coeffs: each coefficient divided by its standard error.
        def coeffs_t
          se = coeffs_se
          out = {}
          coeffs.each { |k, v| out[k] = v / se[k] }
          out
        end

        # Mean square Regression
        def msr
          ssr.quo(df_r)
        end

        # Mean Square Error
        def mse
          sse.quo(df_e)
        end

        # Degrees of freedom for regression
        def df_r
          @predictors_n
        end

        # Degrees of freedom for error
        def df_e
          @valid_cases - @predictors_n - 1
        end

        # Fisher for Anova
        def f
          anova.f
        end

        # p-value of Fisher
        def probability
          anova.probability
        end

        # Tolerance for a given variable
        # http://talkstats.com/showthread.php?t=5056
        #
        # Regresses +var+ on the remaining predictors with a new engine of the
        # same class and returns 1 - R^2 of that auxiliary model.
        def tolerance(var)
          vectors = {}
          assign_names(@dep_columns).each do |name, values|
            vectors[name] = Daru::Vector.new(values)
          end
          lr = self.class.new(Daru::DataFrame.new(vectors), var)
          1 - lr.r2
        end

        # Tolerances for each coefficient
        def coeffs_tolerances
          @fields.inject({}) do |a, f|
            a[f] = tolerance(f)
            a
          end
        end

        # Standard Error for coefficients
        def coeffs_se
          mean_square_error = mse
          out = {}
          coeffs.each do |k, _v|
            out[k] = Math::sqrt(mean_square_error / (@ds[k].sum_of_squares * tolerance(k)))
          end
          out
        end

        # Standard error of R^2
        # ????
        def se_r2
          Math::sqrt((4*r2*(1-r2)**2*(df_e)**2).quo((@cases**2-1)*(@cases+3)))
        end

        # Estimated Variance-Covariance Matrix
        # Used for calculation of se of constant
        #
        # Builds X from a column of ones followed by the predictors of
        # @ds_valid, computes (X'X)^-1 * mse and returns its element-wise
        # square root; negative elements become nil.
        def estimated_variance_covariance_matrix
          predictors = @ds_valid.vectors.to_a.reject { |k| k == @y_var }
          columns = predictors.map { |k| @ds_valid[k].to_a }
          columns.unshift([1.0] * @valid_cases)
          x = ::Matrix.columns(columns)
          matrix = (x.t * x).inverse * mse
          matrix.collect { |i| Math::sqrt(i) if i >= 0 }
        end

        # T for constant.
        # Returns nil when the standard error of the constant is nil, which is
        # the case report_building already checks for.
        def constant_t
          se = constant_se
          return nil if se.nil?
          constant.to_f / se
        end

        # Standard error for constant
        def constant_se
          estimated_variance_covariance_matrix[0,0]
        end

        # Writes the summary of the analysis into the report builder +b+.
        def report_building(b)
          di = "%0.#{digits}f"
          b.section(:name=>@name) do |g|
            c = coeffs
            g.text(_("Engine: %s") % self.class)
            g.text(_("Cases(listwise)=%d(%d)") % [@total_cases, @valid_cases])
            g.text(_("R=") + (di % r))
            g.text(_("R^2=") + (di % r2))
            g.text(_("R^2 Adj=") + (di % r2_adjusted))
            g.text(_("Std.Error R=") + (di % se_estimate))

            g.text(_("Equation")+"="+ sprintf(di,constant) +" + "+ @fields.collect {|k| sprintf("#{di}%s",c[k],k)}.join(' + ') )

            g.parse_element(anova)
            sc = standarized_coeffs

            cse = coeffs_se
            # Each of these rebuilds the variance-covariance matrix, so they
            # are evaluated once and reused for the row below.
            c_se = constant_se
            c_t = constant_t
            g.table(:name=>_("Beta coefficients"), :header=>%w{coeff b beta se t}.collect{|field| _(field)} ) do |t|
              t.row([_("Constant"), sprintf(di, constant), "-", c_se.nil? ? "" : sprintf(di, c_se), c_t.nil? ? "" : sprintf(di, c_t)])
              @fields.each do |f|
                t.row([f, sprintf(di, c[f]), sprintf(di, sc[f]), sprintf(di, cse[f]), sprintf(di, c[f].quo(cse[f]))])
              end
            end
          end
        end

        # Pairs the elements of +c+ with the predictor names in @fields, by
        # position, and returns the resulting hash.
        def assign_names(c)
          a = {}
          @fields.each_with_index { |field, i| a[field] = c[i] }
          a
        end

        # Sum of squares of regression
        # using the predicted value minus y mean
        def ssr_direct
          mean = @dy.mean
          (0...@ds.nrows).inject(0) do |acc, i|
            row = @dep_columns.collect { |column| column[i] }
            if row.any? { |value| value.nil? }
              acc
            else
              acc + ((process(row) - mean)**2)
            end
          end
        end

        # NOTE(review): despite its name this subtracts +ssr+ (r2 * sst), not
        # +ssr_direct+, so it returns the same value as +sse+. Confirm which
        # one is intended before changing it.
        def sse_direct
          sst - ssr
        end

        # Predicted value for one row of predictor values +v+, given in the
        # order of @fields: constant + sum(coefficient * value).
        def process(v)
          c = coeffs
          total = constant
          @fields.each_with_index do |field, i|
            total += c[field] * v[i]
          end
          total
        end
      end
    end
  end
end
|
|
@@ -0,0 +1,129 @@
|
|
|
1
|
+
if Statsample.has_gsl?
  module Statsample
    module Regression
      module Multiple
        # Class for Multiple Regression Analysis.
        # Requires rbgsl and uses a listwise approach: every row holding a
        # missing value is dropped before the model is fitted.
        # Slower on prediction of values than Alglib, because predict is ruby based.
        # Better memory management on multiple (+1000) series of regression.
        # If you need pairwise, use RubyEngine
        # Example:
        #
        #   @a = Daru::Vector.new([1,3,2,4,3,5,4,6,5,7])
        #   @b = Daru::Vector.new([3,3,4,4,5,5,6,6,4,4])
        #   @c = Daru::Vector.new([11,22,30,40,50,65,78,79,99,100])
        #   @y = Daru::Vector.new([3,4,5,6,7,8,9,10,20,30])
        #   ds = Daru::DataFrame.new({:a => @a,:b => @b,:c => @c,:y => @y})
        #   lr=Statsample::Regression::Multiple::GslEngine.new(ds,:y)
        #
        class GslEngine < BaseEngine
          # Fits the regression of +y_var+ on every other vector of +ds+.
          #
          # ds::    data frame holding predictors and the dependent variable
          # y_var:: name of the dependent vector inside +ds+
          # opts::  options forwarded to BaseEngine#initialize
          def initialize(ds, y_var, opts = Hash.new)
            super
            @ds = ds.reject_values(*Daru::MISSING_VALUES)
            @ds_valid = @ds
            @valid_cases = @ds_valid.nrows
            @dy = @ds[@y_var]
            @ds_indep = ds.dup(ds.vectors.to_a - [y_var])

            # Predictor names and their value arrays, in data frame order.
            @fields = @ds.vectors.to_a.reject { |f| f == @y_var }
            @dep_columns = @fields.map { |f| @ds[f].to_a }

            # Design matrix: predictors in the first columns, a column of ones
            # (the constant) in the last one.
            constant_col = @ds.vectors.size - 1
            max_deps = GSL::Matrix.alloc(@ds.nrows, @ds.vectors.size)
            @ds.nrows.times { |row| max_deps.set(row, constant_col, 1) }
            # Rows are filled by position. The index labels of the filtered
            # data frame are not used as row numbers, because they need not be
            # 0...nrows once rows have been dropped.
            @dep_columns.each_with_index do |column, j|
              column.each_with_index { |value, row| max_deps.set(row, j, value) }
            end

            @lr_s = nil
            c, @cov, @chisq, @status = GSL::MultiFit.linear(max_deps, @dy.to_gsl)
            @constant = c[constant_col]
            @coeffs_a = c.to_a.slice(0...constant_col)
            @coeffs = assign_names(@coeffs_a)
          end

          # Marshal hook: only the data and the dependent variable name are
          # stored; the model is rebuilt on load.
          def _dump(i)
            Marshal.dump({'ds'=>@ds,'y_var'=>@y_var})
          end

          # Marshal hook: rebuilds the engine from the dumped data frame.
          def self._load(data)
            h = Marshal.load(data)
            self.new(h['ds'], h['y_var'])
          end

          # Hash of predictor name => unstandardized coefficient.
          def coeffs
            @coeffs
          end

          # Coefficients using a constant
          # Based on http://www.xycoon.com/ols1.htm
          #
          # Returns the column matrix (X'X)^-1 X'y, where X is the predictor
          # matrix with a leading column of ones.
          def matrix_resolution
            columns = @dep_columns.map { |xi| xi.map { |i| i.to_f } }
            columns.unshift([1.0] * @ds.nrows)
            y = Matrix.columns([@dy.to_a.map { |i| i.to_f }])
            x = Matrix.columns(columns)
            xt = x.t
            (xt * x).inverse * xt * y
          end

          # Squared multiple correlation.
          def r2
            r**2
          end

          # Multiple correlation: Pearson correlation between the observed
          # and the predicted values of the dependent variable.
          def r
            Bivariate::pearson(@dy, predicted)
          end

          # Total sum of squares of the dependent variable.
          def sst
            @dy.ss
          end

          # Constant term estimated by GSL.
          def constant
            @constant
          end

          # Coefficients of the engine fitted on the standardized data.
          def standarized_coeffs
            lr_s.coeffs
          end

          # Engine fitted on standardized data; built on first use.
          def lr_s
            build_standarized if @lr_s.nil?
            @lr_s
          end

          # Standardizes the listwise data frame and fits a second GslEngine
          # on it, stored in @lr_s.
          def build_standarized
            @ds_s = @ds.standardize
            @lr_s = GslEngine.new(@ds_s, @y_var)
          end

          # Predicted value for one row +v+ using the standardized engine.
          def process_s(v)
            lr_s.process(v)
          end

          # ???? Not equal to SPSS output
          #
          # Residuals divided by their sample standard deviation. Entries that
          # BaseEngine#residuals reports as nil stay nil.
          def standarized_residuals
            res = residuals
            res_sd = res.sds
            Daru::Vector.new(res.collect { |v| v.nil? ? nil : v.quo(res_sd) })
          end

          # Standard error for coeffs
          #
          # Reads the diagonal of estimated_variance_covariance_matrix. That
          # matrix is built from a column of ones followed by the predictors
          # (the dependent variable is skipped), so predictor number i of
          # @fields sits at row/column i + 1 wherever the dependent variable
          # is placed in the data frame.
          def coeffs_se
            evcm = estimated_variance_covariance_matrix
            out = {}
            @fields.each_with_index do |f, i|
              out[f] = evcm[i + 1, i + 1]
            end
            out
          end
        end
      end
    end
  end # for Statsample
end # for if
|