statsample-ekatena 2.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +15 -0
- data/.travis.yml +23 -0
- data/CONTRIBUTING.md +17 -0
- data/Gemfile +2 -0
- data/History.txt +457 -0
- data/LICENSE.txt +12 -0
- data/README.md +175 -0
- data/Rakefile +44 -0
- data/benchmarks/correlation_matrix_15_variables.rb +32 -0
- data/benchmarks/correlation_matrix_5_variables.rb +33 -0
- data/benchmarks/correlation_matrix_methods/correlation_matrix.ds +0 -0
- data/benchmarks/correlation_matrix_methods/correlation_matrix.html +93 -0
- data/benchmarks/correlation_matrix_methods/correlation_matrix.rb +71 -0
- data/benchmarks/correlation_matrix_methods/correlation_matrix.xls +0 -0
- data/benchmarks/correlation_matrix_methods/correlation_matrix_gsl_ruby.ods +0 -0
- data/benchmarks/correlation_matrix_methods/correlation_matrix_with_graphics.ods +0 -0
- data/benchmarks/correlation_matrix_methods/results.ds +0 -0
- data/benchmarks/factor_map.rb +37 -0
- data/benchmarks/helpers_benchmark.rb +5 -0
- data/data/locale/es/LC_MESSAGES/statsample.mo +0 -0
- data/doc_latex/manual/equations.tex +78 -0
- data/examples/boxplot.rb +28 -0
- data/examples/chisquare_test.rb +23 -0
- data/examples/correlation_matrix.rb +32 -0
- data/examples/dataset.rb +30 -0
- data/examples/dominance_analysis.rb +33 -0
- data/examples/dominance_analysis_bootstrap.rb +32 -0
- data/examples/histogram.rb +26 -0
- data/examples/icc.rb +24 -0
- data/examples/levene.rb +29 -0
- data/examples/multiple_regression.rb +20 -0
- data/examples/multivariate_correlation.rb +33 -0
- data/examples/parallel_analysis.rb +40 -0
- data/examples/polychoric.rb +40 -0
- data/examples/principal_axis.rb +26 -0
- data/examples/reliability.rb +31 -0
- data/examples/scatterplot.rb +25 -0
- data/examples/t_test.rb +27 -0
- data/examples/tetrachoric.rb +17 -0
- data/examples/u_test.rb +24 -0
- data/examples/vector.rb +20 -0
- data/examples/velicer_map_test.rb +46 -0
- data/grab_references.rb +29 -0
- data/lib/spss.rb +134 -0
- data/lib/statsample-ekatena/analysis.rb +100 -0
- data/lib/statsample-ekatena/analysis/suite.rb +89 -0
- data/lib/statsample-ekatena/analysis/suitereportbuilder.rb +44 -0
- data/lib/statsample-ekatena/anova.rb +24 -0
- data/lib/statsample-ekatena/anova/contrast.rb +79 -0
- data/lib/statsample-ekatena/anova/oneway.rb +187 -0
- data/lib/statsample-ekatena/anova/twoway.rb +207 -0
- data/lib/statsample-ekatena/bivariate.rb +406 -0
- data/lib/statsample-ekatena/bivariate/pearson.rb +54 -0
- data/lib/statsample-ekatena/codification.rb +182 -0
- data/lib/statsample-ekatena/converter/csv.rb +28 -0
- data/lib/statsample-ekatena/converter/spss.rb +48 -0
- data/lib/statsample-ekatena/converters.rb +211 -0
- data/lib/statsample-ekatena/crosstab.rb +188 -0
- data/lib/statsample-ekatena/daru.rb +115 -0
- data/lib/statsample-ekatena/dataset.rb +10 -0
- data/lib/statsample-ekatena/dominanceanalysis.rb +425 -0
- data/lib/statsample-ekatena/dominanceanalysis/bootstrap.rb +232 -0
- data/lib/statsample-ekatena/factor.rb +104 -0
- data/lib/statsample-ekatena/factor/map.rb +124 -0
- data/lib/statsample-ekatena/factor/parallelanalysis.rb +166 -0
- data/lib/statsample-ekatena/factor/pca.rb +242 -0
- data/lib/statsample-ekatena/factor/principalaxis.rb +243 -0
- data/lib/statsample-ekatena/factor/rotation.rb +198 -0
- data/lib/statsample-ekatena/formula/fit_model.rb +46 -0
- data/lib/statsample-ekatena/formula/formula.rb +306 -0
- data/lib/statsample-ekatena/graph.rb +11 -0
- data/lib/statsample-ekatena/graph/boxplot.rb +236 -0
- data/lib/statsample-ekatena/graph/histogram.rb +198 -0
- data/lib/statsample-ekatena/graph/scatterplot.rb +213 -0
- data/lib/statsample-ekatena/histogram.rb +180 -0
- data/lib/statsample-ekatena/matrix.rb +329 -0
- data/lib/statsample-ekatena/multiset.rb +310 -0
- data/lib/statsample-ekatena/regression.rb +65 -0
- data/lib/statsample-ekatena/regression/multiple.rb +89 -0
- data/lib/statsample-ekatena/regression/multiple/alglibengine.rb +128 -0
- data/lib/statsample-ekatena/regression/multiple/baseengine.rb +251 -0
- data/lib/statsample-ekatena/regression/multiple/gslengine.rb +129 -0
- data/lib/statsample-ekatena/regression/multiple/matrixengine.rb +205 -0
- data/lib/statsample-ekatena/regression/multiple/rubyengine.rb +86 -0
- data/lib/statsample-ekatena/regression/simple.rb +121 -0
- data/lib/statsample-ekatena/reliability.rb +150 -0
- data/lib/statsample-ekatena/reliability/icc.rb +415 -0
- data/lib/statsample-ekatena/reliability/multiscaleanalysis.rb +181 -0
- data/lib/statsample-ekatena/reliability/scaleanalysis.rb +233 -0
- data/lib/statsample-ekatena/reliability/skillscaleanalysis.rb +114 -0
- data/lib/statsample-ekatena/resample.rb +15 -0
- data/lib/statsample-ekatena/shorthand.rb +125 -0
- data/lib/statsample-ekatena/srs.rb +169 -0
- data/lib/statsample-ekatena/test.rb +82 -0
- data/lib/statsample-ekatena/test/bartlettsphericity.rb +45 -0
- data/lib/statsample-ekatena/test/chisquare.rb +73 -0
- data/lib/statsample-ekatena/test/f.rb +52 -0
- data/lib/statsample-ekatena/test/kolmogorovsmirnov.rb +63 -0
- data/lib/statsample-ekatena/test/levene.rb +88 -0
- data/lib/statsample-ekatena/test/t.rb +309 -0
- data/lib/statsample-ekatena/test/umannwhitney.rb +208 -0
- data/lib/statsample-ekatena/test/wilcoxonsignedrank.rb +90 -0
- data/lib/statsample-ekatena/vector.rb +19 -0
- data/lib/statsample-ekatena/version.rb +3 -0
- data/lib/statsample.rb +282 -0
- data/po/es/statsample.mo +0 -0
- data/po/es/statsample.po +959 -0
- data/po/statsample.pot +947 -0
- data/references.txt +24 -0
- data/statsample-ekatena.gemspec +49 -0
- data/test/fixtures/bank2.dat +200 -0
- data/test/fixtures/correlation_matrix.rb +17 -0
- data/test/fixtures/df.csv +15 -0
- data/test/fixtures/hartman_23.matrix +9 -0
- data/test/fixtures/stock_data.csv +500 -0
- data/test/fixtures/tetmat_matrix.txt +5 -0
- data/test/fixtures/tetmat_test.txt +1001 -0
- data/test/helpers_tests.rb +83 -0
- data/test/test_analysis.rb +176 -0
- data/test/test_anova_contrast.rb +36 -0
- data/test/test_anovaoneway.rb +26 -0
- data/test/test_anovatwoway.rb +37 -0
- data/test/test_anovatwowaywithdataset.rb +47 -0
- data/test/test_anovawithvectors.rb +102 -0
- data/test/test_awesome_print_bug.rb +16 -0
- data/test/test_bartlettsphericity.rb +25 -0
- data/test/test_bivariate.rb +164 -0
- data/test/test_codification.rb +78 -0
- data/test/test_crosstab.rb +67 -0
- data/test/test_dominance_analysis.rb +39 -0
- data/test/test_factor.rb +228 -0
- data/test/test_factor_map.rb +38 -0
- data/test/test_factor_pa.rb +56 -0
- data/test/test_fit_model.rb +88 -0
- data/test/test_ggobi.rb +35 -0
- data/test/test_gsl.rb +15 -0
- data/test/test_histogram.rb +109 -0
- data/test/test_matrix.rb +48 -0
- data/test/test_multiset.rb +176 -0
- data/test/test_regression.rb +231 -0
- data/test/test_reliability.rb +223 -0
- data/test/test_reliability_icc.rb +198 -0
- data/test/test_reliability_skillscale.rb +57 -0
- data/test/test_resample.rb +24 -0
- data/test/test_srs.rb +9 -0
- data/test/test_statistics.rb +69 -0
- data/test/test_stest.rb +69 -0
- data/test/test_stratified.rb +17 -0
- data/test/test_test_f.rb +33 -0
- data/test/test_test_kolmogorovsmirnov.rb +34 -0
- data/test/test_test_t.rb +62 -0
- data/test/test_umannwhitney.rb +27 -0
- data/test/test_vector.rb +12 -0
- data/test/test_wilcoxonsignedrank.rb +64 -0
- metadata +570 -0
|
@@ -0,0 +1,205 @@
|
|
|
1
|
+
module Statsample
|
|
2
|
+
module Regression
|
|
3
|
+
module Multiple
|
|
4
|
+
# Pure Ruby Class for Multiple Regression Analysis, based on a covariance or correlation matrix.
|
|
5
|
+
#
|
|
6
|
+
# Use Statsample::Regression::Multiple::RubyEngine if you have a
|
|
7
|
+
# Dataset, to avoid setting all details.
|
|
8
|
+
#
|
|
9
|
+
# <b>Remember:</b> NEVER use a covariance matrix if you have missing data. Use only a correlation matrix in that case.
|
|
10
|
+
#
|
|
11
|
+
#
|
|
12
|
+
# Example:
|
|
13
|
+
#
|
|
14
|
+
# matrix=[[1.0, 0.5, 0.2], [0.5, 1.0, 0.7], [0.2, 0.7, 1.0]]
|
|
15
|
+
#
|
|
16
|
+
# lr=Statsample::Regression::Multiple::MatrixEngine.new(matrix,2)
|
|
17
|
+
|
|
18
|
+
class MatrixEngine < BaseEngine
|
|
19
|
+
# Hash of standard deviation of predictors.
|
|
20
|
+
# Only useful for Correlation Matrix, because by default is set to 1
|
|
21
|
+
attr_accessor :x_sd
|
|
22
|
+
# Standard deviation of criterion
|
|
23
|
+
# Only useful for Correlation Matrix, because by default is set to 1
|
|
24
|
+
attr_accessor :y_sd
|
|
25
|
+
# Hash of mean for predictors. By default, set to 0
|
|
26
|
+
attr_accessor :x_mean
|
|
27
|
+
|
|
28
|
+
# Mean for criteria. By default, set to 0
|
|
29
|
+
attr_accessor :y_mean
|
|
30
|
+
|
|
31
|
+
# Number of cases
|
|
32
|
+
attr_writer :cases
|
|
33
|
+
attr_writer :digits
|
|
34
|
+
# Create object
|
|
35
|
+
#
|
|
36
|
+
# Builds the engine from a covariance or correlation matrix.
#
# matrix:: matrix object; it is extended with Statsample::CovariateMatrix
#          and must list +y_var+ among its fields.
# y_var::  name of the criterion variable.
# opts::   option hash merged over {:digits=>3}; every key for which the
#          object responds to a same-named method is assigned through
#          the "key=" writer.
#
# Raises a RuntimeError when +y_var+ is not a field of the matrix and
# LinearDependency when the determinant of the predictor correlation
# submatrix is below 1e-15.
def initialize(matrix,y_var, opts=Hash.new)
  matrix.extend Statsample::CovariateMatrix
  raise "#{y_var} variable should be on data" unless matrix.fields.include? y_var

  covariance_input = (matrix._type == :covariance)
  if covariance_input
    @matrix_cov = matrix
    @matrix_cor = matrix.correlation
    @no_covariance = false
  else
    # Only a correlation matrix is available: it stands in for both roles.
    @matrix_cor = matrix
    @matrix_cov = matrix
    @no_covariance = true
  end

  @y_var = y_var
  @fields = matrix.fields - [y_var]

  @n_predictors = @fields.size
  @predictors_n = @n_predictors
  @matrix_x = @matrix_cor.submatrix(@fields)
  @matrix_x_cov = @matrix_cov.submatrix(@fields)
  raise LinearDependency, "Regressors are linearly dependent" if @matrix_x.determinant<1e-15

  @matrix_y = @matrix_cor.submatrix(@fields, [y_var])
  @matrix_y_cov = @matrix_cov.submatrix(@fields, [y_var])

  @y_sd = Math::sqrt(@matrix_cov.submatrix([y_var])[0,0])

  # Standard deviations come from the diagonal of the predictor matrix.
  @x_sd = {}
  @n_predictors.times do |i|
    @x_sd[@matrix_x_cov.fields[i]] = Math::sqrt(@matrix_x_cov[i,i])
  end

  @cases = nil
  @x_mean = {}
  @fields.each { |f| @x_mean[f] = 0.0 }

  @y_mean = 0.0
  @name = _("Multiple reggresion of %s on %s") % [@fields.join(","), @y_var]

  # Options are applied before the coefficients are derived, so values
  # such as :x_sd / :y_sd passed by the caller take part in the rescaling.
  opts_default = {:digits=>3}
  opts = opts_default.merge opts
  opts.each { |k,v|
    self.send("#{k}=",v) if self.respond_to? k
  }
  result_matrix = @matrix_x_cov.inverse * @matrix_y_cov

  if covariance_input
    @coeffs = result_matrix.column(0).to_a
    # Use the value yielded with each name instead of calling coeffs
    # again (and rebuilding the named hash) on every iteration.
    @coeffs_stan = coeffs.collect { |k, b|
      b * @x_sd[k].quo(@y_sd)
    }
  else
    @coeffs_stan = result_matrix.column(0).to_a
    @coeffs = standarized_coeffs.collect { |k, beta|
      beta * @y_sd.quo(@x_sd[k])
    }
  end
  @total_cases = @valid_cases = @cases
end
|
|
98
|
+
# Number of valid cases. Raises a RuntimeError while no value has been
# assigned, because @cases is initialised to nil.
def cases
  return @cases unless @cases.nil?
  raise "You should define the number of valid cases first"
end
|
|
102
|
+
# Get R^2 for the regression
|
|
103
|
+
# For fixed models is the coefficient of determination.
|
|
104
|
+
# On random models, is the 'squared-multiple correlation'
|
|
105
|
+
# Equal to
|
|
106
|
+
# * 1-(|R| / |R_x|) or
|
|
107
|
+
# * Sum(b_i*r_yi) <- used
|
|
108
|
+
# R^2 computed as Sum(b_i * r_yi): each standardized coefficient times
# the matching entry of the first column of @matrix_y.
def r2
  total = 0
  @n_predictors.times do |i|
    total = total + @coeffs_stan[i] * @matrix_y[i, 0]
  end
  total
end
|
|
111
|
+
# Multiple correlation, on random models.
|
|
112
|
+
# Multiple correlation: square root of r2.
def r
  Math.sqrt(r2)
end
|
|
115
|
+
# Value of constant
|
|
116
|
+
# Intercept: @y_mean minus the sum, over the predictor fields, of the
# raw coefficient times that predictor's mean.
def constant
  named = coeffs
  weighted_means = 0
  @fields.each do |field|
    weighted_means = weighted_means + (named[field] * @x_mean[field])
  end
  @y_mean - weighted_means
end
|
|
120
|
+
# Hash of b or raw coefficients
|
|
121
|
+
# Raw (b) coefficients, passed through assign_names so they are
# returned in named form.
def coeffs
  assign_names @coeffs
end
|
|
124
|
+
# Hash of beta or standarized coefficients
|
|
125
|
+
|
|
126
|
+
# Standardized (beta) coefficients, passed through assign_names so they
# are returned in named form.
def standarized_coeffs
  assign_names @coeffs_stan
end
|
|
129
|
+
# Total sum of squares
|
|
130
|
+
# Total sum of squares: criterion variance (@y_sd squared) times
# (cases - 1).
def sst
  variance_y = @y_sd ** 2
  variance_y * (cases - 1.0)
end
|
|
133
|
+
|
|
134
|
+
# Degrees of freedom for regression
|
|
135
|
+
# Regression degrees of freedom: the number of predictors.
def df_r
  @n_predictors
end
|
|
138
|
+
# Degrees of freedom for error
|
|
139
|
+
# Error degrees of freedom: cases minus predictors minus one.
def df_e
  cases - @n_predictors - 1
end
|
|
142
|
+
# Tolerance for a given variable
|
|
143
|
+
# defined as (1-R^2) of regression of other independent variables
|
|
144
|
+
# over the selected
|
|
145
|
+
# == Reference:
|
|
146
|
+
# * http://talkstats.com/showthread.php?t=5056
|
|
147
|
+
# Tolerance of predictor +var+: 1 - R^2 of a MatrixEngine fitted on the
# predictor matrix with +var+ as criterion. With a single predictor
# column the result is 1.
def tolerance(var)
  return 1 if @matrix_x.column_size == 1
  auxiliary = Statsample::Regression::Multiple::MatrixEngine.new(@matrix_x, var)
  1 - auxiliary.r2
end
|
|
152
|
+
# Standard Error for coefficients.
|
|
153
|
+
# Standard error of a coefficients depends on
|
|
154
|
+
# * Tolerance of the coefficient: higher tolerance implies lower error
|
|
155
|
+
# * Higher r2 implies lower error
|
|
156
|
+
# == Reference:
|
|
157
|
+
# * Cohen et al. (2003). Applied Multiple Reggression / Correlation Analysis for the Behavioral Sciences
|
|
158
|
+
#
|
|
159
|
+
# Standard error of every raw coefficient, keyed like coeffs:
# (y_sd / x_sd) * sqrt(1 / tolerance) * sqrt((1 - r2) / df_e).
def coeffs_se
  coeffs.each_with_object({}) do |(name, _value), errors|
    scale = @y_sd.quo(@x_sd[name])
    errors[name] = scale * Math.sqrt(1.quo(tolerance(name))) * Math.sqrt((1 - r2).quo(df_e))
  end
end
|
|
167
|
+
# t value for constant
|
|
168
|
+
# t value for the constant: constant / constant_se.
# Returns nil when constant_se is nil.
def constant_t
  # constant_se is evaluated once and reused for both the nil check
  # and the division.
  se = constant_se
  return nil if se.nil?
  constant.to_f / se
end
|
|
172
|
+
# Standard error for constant.
|
|
173
|
+
# This method recreates the estimated variance-covariance matrix
|
|
174
|
+
# using means, standard deviation and covariance matrix.
|
|
175
|
+
# So, needs the covariance matrix.
|
|
176
|
+
# Standard error for the constant.
#
# Rebuilds X'X from the predictor means, standard deviations and the
# covariance matrix, inverts it, multiplies by mse and returns the
# square root of element [0,0] (nil when that element is not positive).
# Returns nil when the engine was built without a covariance matrix.
def constant_se
  return nil if @no_covariance
  # Work on copies: the :constant bookkeeping entries must not leak
  # into the hashes exposed through the x_mean / x_sd accessors.
  means = @x_mean.merge(:constant => 1)
  sd = @x_sd.merge(:constant => 0)
  fields = [:constant] + @matrix_cov.fields - [@y_var]
  # Recreate X'X using the variance-covariance matrix
  xt_x = ::Matrix.rows(fields.collect { |i|
    fields.collect { |j|
      if i == :constant || j == :constant
        cov = 0
      elsif i == j
        cov = sd[i]**2
      else
        cov = @matrix_cov.submatrix(i..i, j..j)[0,0]
      end
      cov * (@cases - 1) + @cases * means[i] * means[j]
    }
  })
  matrix = xt_x.inverse * mse
  matrix.collect { |i| Math::sqrt(i) if i > 0 }[0,0]
end
|
|
201
|
+
|
|
202
|
+
end
|
|
203
|
+
end
|
|
204
|
+
end
|
|
205
|
+
end
|
|
@@ -0,0 +1,86 @@
|
|
|
1
|
+
module Statsample
|
|
2
|
+
module Regression
|
|
3
|
+
module Multiple
|
|
4
|
+
# Pure Ruby Class for Multiple Regression Analysis.
|
|
5
|
+
# Slower than AlglibEngine, but is pure ruby and can use a pairwise approach for missing values.
|
|
6
|
+
# Coefficient calculation uses correlation matrix between the vectors
|
|
7
|
+
# If you need a listwise approach for missing values, use AlglibEngine, because it is faster.
|
|
8
|
+
#
|
|
9
|
+
# Example:
|
|
10
|
+
#
|
|
11
|
+
# @a = Daru::Vector.new([1,3,2,4,3,5,4,6,5,7])
|
|
12
|
+
# @b = Daru::Vector.new([3,3,4,4,5,5,6,6,4,4])
|
|
13
|
+
# @c = Daru::Vector.new([11,22,30,40,50,65,78,79,99,100])
|
|
14
|
+
# @y = Daru::Vector.new([3,4,5,6,7,8,9,10,20,30])
|
|
15
|
+
# ds = Daru::DataFrame.new({:a => @a,:b => @b,:c => @c,:y => @y})
|
|
16
|
+
# lr=Statsample::Regression::Multiple::RubyEngine.new(ds,:y)
|
|
17
|
+
|
|
18
|
+
class RubyEngine < MatrixEngine
|
|
19
|
+
# Builds the engine from a dataset.
#
# The correlation matrix of +ds+ is handed to the parent constructor
# together with means, standard deviations and the case count taken
# from the dataset. Those computed values are merged over +opts+, so
# they replace same-named keys supplied by the caller.
def initialize(ds,y_var, opts=Hash.new)
  matrix = Statsample::Bivariate.correlation_matrix ds
  fields_indep = ds.vectors.to_a - [y_var]

  computed = {}
  computed[:y_mean] = ds[y_var].mean
  computed[:x_mean] = fields_indep.each_with_object({}) { |f, acc| acc[f] = ds[f].mean }
  computed[:y_sd] = ds[y_var].sd
  computed[:x_sd] = fields_indep.each_with_object({}) { |f, acc| acc[f] = ds[f].sd }
  computed[:cases] = Statsample::Bivariate.min_n_valid(ds)

  opts = opts.merge(computed)
  super(matrix, y_var, opts)

  @ds = ds
  @dy = ds[@y_var]
  @ds_valid = ds.reject_values(*Daru::MISSING_VALUES)
  @total_cases = @ds.nrows
  @valid_cases = @ds_valid.nrows
  @ds_indep = ds.dup(ds.vectors.to_a - [y_var])
  set_dep_columns
end
|
|
39
|
+
|
|
40
|
+
# Rebuilds @dep_columns as one plain array per vector of @ds_indep.
def set_dep_columns
  @dep_columns = []
  @ds_indep.each_vector do |vector|
    @dep_columns << vector.to_a
  end
end
|
|
44
|
+
|
|
45
|
+
# For every row of @ds_indep with exactly one nil entry, stores the
# mean of that vector (taken from @ds) at the row's position, then
# refreshes @dep_columns.
def fix_with_mean
  row_number = 0
  @ds_indep.each(:row) do |row|
    missing = []
    # NOTE(review): the two-argument block assumes row.each yields
    # (name, value) pairs - confirm for the row objects produced by
    # @ds_indep.each(:row).
    row.each { |k, v| missing << k if v.nil? }

    if missing.size == 1
      name = missing.first
      @ds_indep[name][row_number] = @ds[name].mean
    end
    row_number += 1
  end
  set_dep_columns
end
|
|
60
|
+
# For every row of @ds_indep with exactly one nil entry, fits a
# MultipleRegression with that vector as criterion and stores the value
# returned by its process call (fed with the row's other entries) at
# the row's position. Refreshes @dep_columns afterwards.
def fix_with_regression
  row_number = 0
  @ds_indep.each(:row) do |row|
    missing = []
    # NOTE(review): assumes row.each yields (name, value) pairs - confirm.
    row.each { |k, v| missing << k if v.nil? }
    if missing.size == 1
      target = missing.first
      # NOTE(review): MultipleRegression is not defined in the visible
      # source - verify the constant resolves at runtime.
      lr = MultipleRegression.new(@ds_indep, target)
      known = []
      @ds_indep.vectors.each do |f|
        known << row[f] unless f == target
      end

      @ds_indep[target][row_number] = lr.process(known)
    end
    row_number += 1
  end
  set_dep_columns
end
|
|
79
|
+
# Standard error for constant
|
|
80
|
+
# Element [0,0] of estimated_variance_covariance_matrix.
# NOTE(review): that helper is defined outside this file section;
# presumably its first row/column belongs to the constant - confirm.
def constant_se
  matrix = estimated_variance_covariance_matrix
  matrix[0, 0]
end
|
|
83
|
+
end
|
|
84
|
+
end
|
|
85
|
+
end
|
|
86
|
+
end
|
|
@@ -0,0 +1,121 @@
|
|
|
1
|
+
module Statsample
|
|
2
|
+
module Regression
|
|
3
|
+
# Class for calculation of linear regressions with form
|
|
4
|
+
# y = a+bx
|
|
5
|
+
# To create a Statsample::Regression::Simple object:
|
|
6
|
+
# * <tt> Statsample::Regression::Simple.new_from_dataset(ds,x,y)</tt>
|
|
7
|
+
# * <tt> Statsample::Regression::Simple.new_from_vectors(vx,vy)</tt>
|
|
8
|
+
# * <tt> Statsample::Regression::Simple.new_from_gsl(gsl) </tt>
|
|
9
|
+
#
|
|
10
|
+
class Simple
|
|
11
|
+
include Summarizable
|
|
12
|
+
attr_accessor :a,:b,:cov00, :cov01, :covx1, :chisq, :status
|
|
13
|
+
attr_accessor :name
|
|
14
|
+
attr_accessor :digits
|
|
15
|
+
# Dispatches construction to the initializer named by +init_method+,
# forwarding the remaining arguments to it.
def initialize(init_method, *argv)
  __send__(init_method, *argv)
end
|
|
18
|
+
private_class_method :new
|
|
19
|
+
# Obtain y value given x value
|
|
20
|
+
# y=a+bx
|
|
21
|
+
|
|
22
|
+
# Predicted y for +val_x+: a + b*x.
def y(val_x)
  slope_part = @b * val_x
  @a + slope_part
end
|
|
25
|
+
# Obtain x value given y value
|
|
26
|
+
# x=(y-a)/b
|
|
27
|
+
# x for a given +val_y+: (y - a) / b, using float division.
def x(val_y)
  offset = val_y - @a
  offset / @b.to_f
end
|
|
30
|
+
# Sum of square error
|
|
31
|
+
# Sum of squared errors: squared difference between each observed y and
# the value predicted from the matching x.
def sse
  total = 0
  @vx.size.times do |i|
    total = total + ((@vy[i] - y(@vx[i]))**2)
  end
  total
end
|
|
35
|
+
# Square root of sse divided by (number of x values - 2).
def standard_error
  degrees = (@vx.size - 2).to_f
  Math.sqrt(sse / degrees)
end
|
|
38
|
+
# Sum of square regression
|
|
39
|
+
# Regression sum of squares: squared difference between each predicted
# y and the mean of the observed y values.
def ssr
  mean_y = @vy.mean
  total = 0
  @vx.size.times do |i|
    total = total + ((y(@vx[i]) - mean_y)**2)
  end
  total
end
|
|
46
|
+
# Sum of square total
|
|
47
|
+
# Total sum of squares, delegated to the y vector's
# sum_of_squared_deviation.
def sst
  observed = @vy
  observed.sum_of_squared_deviation
end
|
|
50
|
+
# Value of r
|
|
51
|
+
# Correlation coefficient: slope b scaled by sds(x) / sds(y).
def r
  ratio = @vx.sds / @vy.sds
  @b * ratio
end
|
|
54
|
+
# Value of r^2
|
|
55
|
+
# Coefficient of determination: r squared.
def r2
  r ** 2
end
|
|
58
|
+
class << self
|
|
59
|
+
# Create a regression object giving an array with following parameters:
|
|
60
|
+
# <tt>a,b,cov00, cov01, covx1, chisq, status</tt>
|
|
61
|
+
# Useful to obtain x and y values with a and b values.
|
|
62
|
+
# Builds an instance through :init_gsl, splatting the elements of +ar+
# as its arguments.
def new_from_gsl(ar)
  new :init_gsl, *ar
end
|
|
65
|
+
# Create a simple regression using two vectors
|
|
66
|
+
# Builds an instance through :init_vectors from two vectors and an
# options hash.
def new_from_vectors(vx,vy, opts=Hash.new)
  new :init_vectors, vx, vy, opts
end
|
|
69
|
+
# Create a simple regression using a dataset and two vector names.
|
|
70
|
+
# Builds an instance through :init_vectors using ds[x] and ds[y] as the
# two vectors.
def new_from_dataset(ds,x,y, opts=Hash.new)
  predictor = ds[x]
  criterion = ds[y]
  new :init_vectors, predictor, criterion, opts
end
|
|
73
|
+
end
|
|
74
|
+
# Fits y = a + b*x on the pair returned by Statsample.only_valid_clone.
#
# b is the sum of cross-deviations over the sum of squared x
# deviations; a is mean(y) - b * mean(x). Options (defaults: :digits 3
# and a generated :name) are then assigned through their writers when
# the object responds to the key.
def init_vectors(vx,vy, opts=Hash.new)
  @vx, @vy = Statsample.only_valid_clone(vx, vy)
  mean_x = @vx.mean
  mean_y = @vy.mean

  cross = 0
  spread = 0
  @vx.size.times do |i|
    cross += (@vx[i] - mean_x) * (@vy[i] - mean_y)
    spread += (@vx[i] - mean_x)**2
  end
  @b = cross.to_f / spread
  @a = mean_y - @b * mean_x

  defaults = {
    :digits => 3,
    :name => _("Regression of %s over %s") % [@vx.name, @vy.name]
  }
  @opts = defaults.merge opts

  @opts.each do |k, v|
    self.send("#{k}=", v) if self.respond_to? k
  end
end
|
|
97
|
+
# Stores the seven regression values, given in the order documented for
# new_from_gsl, in the matching instance variables.
def init_gsl(a,b,cov00, cov01, covx1, chisq, status)
  @a, @b = a, b
  @cov00, @cov01, @covx1 = cov00, cov01, covx1
  @chisq = chisq
  @status = status
end
|
|
106
|
+
# Writes a section named after the regression into +gen+, holding a
# two-column table with r, r^2, a, b and the standard error, each
# formatted with +digits+ decimals.
def report_building(gen)
  number_format = "%0.#{digits}f"
  gen.section(:name=>name) do |section|
    section.table(:header=>[_("Variable"), _("Value")]) do |table|
      table.row [_("r"), number_format % r]
      table.row [_("r^2"), number_format % r2]
      table.row [_("a"), number_format % a]
      table.row [_("b"), number_format % b]
      table.row [_("s.e"), number_format % standard_error]
    end
  end
end
|
|
118
|
+
private :init_vectors, :init_gsl
|
|
119
|
+
end
|
|
120
|
+
end
|
|
121
|
+
end
|
|
@@ -0,0 +1,150 @@
|
|
|
1
|
+
module Statsample
|
|
2
|
+
module Reliability
|
|
3
|
+
class << self
|
|
4
|
+
# Calculate Cronbach's alpha for a given dataset.
|
|
5
|
+
# only uses tuples without missing data
|
|
6
|
+
# Cronbach's alpha of +ods+ after rejecting Daru::MISSING_VALUES:
# (k / (k - 1)) * (1 - sum of item variances / variance of vector_sum).
# Returns nil when the dataset has one column or fewer.
def cronbach_alpha(ods)
  ds = ods.reject_values(*Daru::MISSING_VALUES)
  n_items = ds.ncols
  return nil if n_items <= 1

  item_variance_sum = 0
  ds.to_hash.values.each do |vector|
    item_variance_sum = item_variance_sum + vector.variance
  end
  total = ds.vector_sum

  correction = n_items.quo(n_items - 1)
  correction * (1 - item_variance_sum.quo(total.variance))
end
|
|
16
|
+
# Calculate Cronbach's alpha for a given dataset
|
|
17
|
+
# using standarized values for every vector.
|
|
18
|
+
# Only uses tuples without missing data
|
|
19
|
+
# Return nil if one or more vectors has 0 variance
|
|
20
|
+
# Cronbach's alpha on standardized vectors.
#
# Returns nil if any vector of the dataset without missing values has
# zero variance. Otherwise a new Daru::DataFrame is built from the
# standardized vectors of +ods+ (the dataset as passed in) and handed
# to cronbach_alpha.
def cronbach_alpha_standarized(ods)
  ds = ods.reject_values(*Daru::MISSING_VALUES)
  return nil if ds.any? { |v| v.variance==0}

  standardized = {}
  ds.vectors.to_a.each do |name|
    standardized[name] = ods[name].standardize
  end

  cronbach_alpha(Daru::DataFrame.new(standardized))
end
|
|
33
|
+
# Predicted reliability of a test by replicating
|
|
34
|
+
# +n+ times the number of items
|
|
35
|
+
# Spearman-Brown prediction: n*r / (1 + (n - 1)*r).
def spearman_brown_prophecy(r,n)
  numerator = n * r
  denominator = 1 + (n - 1) * r
  numerator.quo(denominator)
end
|
|
38
|
+
|
|
39
|
+
alias :sbp :spearman_brown_prophecy
|
|
40
|
+
# Returns the number of items
|
|
41
|
+
# to obtain +r_d+ desired reliability
|
|
42
|
+
# from +r+ current reliability, achieved with
|
|
43
|
+
# +n+ items
|
|
44
|
+
# Items needed to reach reliability +r_d+ given reliability +r+
# obtained with +n+ items: (r_d*(1-r)) / (r*(1-r_d)) * n.
# Returns nil when +r+ is nil.
def n_for_desired_reliability(r,r_d,n=1)
  return nil if r.nil?
  numerator = r_d * (1 - r)
  denominator = r * (1 - r_d)
  numerator.quo(denominator) * n
end
|
|
48
|
+
|
|
49
|
+
# Get Cronbach alpha from <tt>n</tt> cases,
|
|
50
|
+
# <tt>s2</tt> mean variance and <tt>cov</tt>
|
|
51
|
+
# mean covariance
|
|
52
|
+
# Cronbach's alpha from +n+, mean variance +s2+ and mean covariance
# +cov+: (n/(n-1)) * (1 - s2 / (s2 + (n-1)*cov)).
def cronbach_alpha_from_n_s2_cov(n,s2,cov)
  correction = n.quo(n - 1)
  variance_share = s2.quo(s2 + (n - 1) * cov)
  correction * (1 - variance_share)
end
|
|
55
|
+
# Get Cronbach's alpha from a covariance matrix
|
|
56
|
+
# Cronbach's alpha from a covariance matrix:
# (n/(n-1)) * (1 - sum of the diagonal / cov.total_sum).
# Raises a RuntimeError when the matrix has fewer than 2 rows.
def cronbach_alpha_from_covariance_matrix(cov)
  n = cov.row_size
  raise "covariance matrix should have at least 2 variables" if n < 2
  diagonal_sum = 0
  n.times { |i| diagonal_sum = diagonal_sum + cov[i, i] }
  n.quo(n - 1) * (1 - diagonal_sum.quo(cov.total_sum))
end
|
|
62
|
+
# Returns n necessary to obtain specific alpha
|
|
63
|
+
# given variance and covariance mean of items
|
|
64
|
+
# Searches for the number of items whose alpha (as computed by
# cronbach_alpha_from_n_s2_cov) matches +alpha+.
#
# Starts at 50 items inside the bracket [2, 1000] and moves n by half
# the bracket width, narrowing the bracket each step. The search stops
# when the difference is within 0.0001 or n no longer changes.
def n_for_desired_alpha(alpha,s2,cov)
  lower = 2
  upper = 1000
  n = 50
  previous_n = 0
  epsilon = 0.0001
  gap = cronbach_alpha_from_n_s2_cov(n, s2, cov) - alpha
  while gap.abs > epsilon && n != previous_n
    previous_n = n
    if gap < 0
      # alpha too low: raise the lower bound and move up
      lower = n
      n = (n + (upper - lower).quo(2)).to_i
    else
      # alpha too high: drop the upper bound and move down
      upper = n
      n = (n - (upper - lower).quo(2)).to_i
    end
    gap = cronbach_alpha_from_n_s2_cov(n, s2, cov) - alpha
  end
  n
end
|
|
88
|
+
# First derivative for alpha
|
|
89
|
+
# Parameters
|
|
90
|
+
# <tt>n</tt>: Number of items
|
|
91
|
+
# <tt>sx</tt>: mean of variances
|
|
92
|
+
# <tt>sxy</tt>: mean of covariances
|
|
93
|
+
|
|
94
|
+
# sxy*(sx - sxy) / (sxy*(n-1) + sx)^2
def alpha_first_derivative(n,sx,sxy)
  numerator = sxy * (sx - sxy)
  base = (sxy * (n - 1)) + sx
  numerator.quo(base**2)
end
|
|
97
|
+
# Second derivative for alpha
|
|
98
|
+
# Parameters
|
|
99
|
+
# <tt>n</tt>: Number of items
|
|
100
|
+
# <tt>sx</tt>: mean of variances
|
|
101
|
+
# <tt>sxy</tt>: mean of covariances
|
|
102
|
+
|
|
103
|
+
# 2*sxy^2*(sxy - sx) / (sxy*(n-1) + sx)^3
def alfa_second_derivative(n,sx,sxy)
  numerator = 2 * (sxy**2) * (sxy - sx)
  base = (sxy * (n - 1)) + sx
  numerator.quo(base**3)
end
|
|
106
|
+
end
|
|
107
|
+
# Tabulates, for every total score, how many rows obtained it and how
# often each item value (as a string) occurs among those rows.
class ItemCharacteristicCurve
  attr_reader :totals, :counts, :vector_total

  # ds::           dataset whose vectors are the items.
  # vector_total:: one total per row; defaults to ds.vector_sum.
  #
  # Raises ArgumentError when the totals and the dataset differ in size.
  def initialize(ds, vector_total=nil)
    vector_total ||= ds.vector_sum
    raise ArgumentError, "Total size != Dataset size" if vector_total.size != ds.nrows
    @vector_total = vector_total
    @ds = ds
    @totals = {}
    @counts = {}
    @ds.vectors.to_a.each { |name| @counts[name] = {} }
    process
  end

  # Fills @totals (total => number of rows) and
  # @counts (item => total => value string => number of rows).
  def process
    row_number = 0
    @ds.each_row do |row|
      score = @vector_total[row_number]
      @totals[score] = (@totals[score] || 0) + 1
      @ds.vectors.each do |f|
        answer = row[f].to_s
        by_answer = (@counts[f][score] ||= {})
        by_answer[answer] = (by_answer[answer] || 0) + 1
      end
      row_number += 1
    end
  end

  # Return a hash mapping every total score to the proportion of rows
  # with that total whose +field+ equals +item+ (compared as strings).
  def curve_field(field, item)
    key = item.to_s
    @totals.each_with_object({}) do |(score, n), out|
      hits = @counts[field][score][key]
      hits = 0 if hits.nil?
      out[score] = hits.quo(n)
    end
  end
end # ItemCharacteristicCurve
|
|
144
|
+
end # Reliability
|
|
145
|
+
end # Statsample
|
|
146
|
+
|
|
147
|
+
require 'statsample/reliability/icc.rb'
|
|
148
|
+
require 'statsample/reliability/scaleanalysis.rb'
|
|
149
|
+
require 'statsample/reliability/skillscaleanalysis.rb'
|
|
150
|
+
require 'statsample/reliability/multiscaleanalysis.rb'
|