statsample 0.6.5 → 0.6.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/History.txt +15 -0
- data/Manifest.txt +6 -0
- data/README.txt +30 -12
- data/Rakefile +91 -0
- data/demo/levene.rb +9 -0
- data/demo/multiple_regression.rb +1 -7
- data/demo/polychoric.rb +1 -0
- data/demo/principal_axis.rb +8 -0
- data/lib/distribution/f.rb +22 -22
- data/lib/spss.rb +99 -99
- data/lib/statsample/bivariate/polychoric.rb +32 -22
- data/lib/statsample/bivariate/tetrachoric.rb +212 -207
- data/lib/statsample/bivariate.rb +6 -6
- data/lib/statsample/codification.rb +65 -65
- data/lib/statsample/combination.rb +60 -59
- data/lib/statsample/converter/csv19.rb +12 -12
- data/lib/statsample/converters.rb +1 -1
- data/lib/statsample/dataset.rb +93 -36
- data/lib/statsample/dominanceanalysis/bootstrap.rb +66 -3
- data/lib/statsample/dominanceanalysis.rb +5 -6
- data/lib/statsample/factor/pca.rb +41 -11
- data/lib/statsample/factor/principalaxis.rb +105 -29
- data/lib/statsample/factor/rotation.rb +20 -3
- data/lib/statsample/factor.rb +1 -1
- data/lib/statsample/graph/gdchart.rb +13 -13
- data/lib/statsample/graph/svggraph.rb +166 -167
- data/lib/statsample/matrix.rb +22 -12
- data/lib/statsample/mle/logit.rb +3 -2
- data/lib/statsample/mle/probit.rb +7 -5
- data/lib/statsample/mle.rb +4 -2
- data/lib/statsample/multiset.rb +125 -124
- data/lib/statsample/permutation.rb +2 -1
- data/lib/statsample/regression/binomial/logit.rb +4 -3
- data/lib/statsample/regression/binomial/probit.rb +2 -1
- data/lib/statsample/regression/binomial.rb +62 -81
- data/lib/statsample/regression/multiple/baseengine.rb +1 -1
- data/lib/statsample/regression/multiple/gslengine.rb +1 -1
- data/lib/statsample/regression/multiple/matrixengine.rb +12 -6
- data/lib/statsample/regression/multiple.rb +15 -42
- data/lib/statsample/regression/simple.rb +93 -78
- data/lib/statsample/regression.rb +74 -2
- data/lib/statsample/reliability.rb +117 -120
- data/lib/statsample/srs.rb +156 -153
- data/lib/statsample/test/levene.rb +90 -0
- data/lib/statsample/test/umannwhitney.rb +25 -9
- data/lib/statsample/test.rb +2 -0
- data/lib/statsample/vector.rb +388 -413
- data/lib/statsample.rb +74 -30
- data/po/es/statsample.mo +0 -0
- data/test/test_bivariate.rb +5 -4
- data/test/test_combination.rb +1 -1
- data/test/test_dataset.rb +2 -2
- data/test/test_factor.rb +53 -6
- data/test/test_gsl.rb +1 -1
- data/test/test_mle.rb +1 -1
- data/test/test_regression.rb +18 -33
- data/test/test_statistics.rb +15 -33
- data/test/test_stest.rb +35 -0
- data/test/test_svg_graph.rb +2 -2
- data/test/test_vector.rb +331 -333
- metadata +38 -11
@@ -101,7 +101,7 @@ class MatrixEngine < BaseEngine
|
|
101
101
|
# Get R^2 for the regression
|
102
102
|
# Equal to
|
103
103
|
# * 1-(|R| / |R_x|) or
|
104
|
-
# * Sum(b_i*r_yi)
|
104
|
+
# * Sum(b_i*r_yi) <- used
|
105
105
|
def r2
|
106
106
|
@n_predictors.times.inject(0) {|ac,i| ac+@coeffs_stan[i]* @matrix_y[i,0]}
|
107
107
|
end
|
@@ -113,13 +113,16 @@ class MatrixEngine < BaseEngine
|
|
113
113
|
c=coeffs
|
114
114
|
@y_mean - @fields.inject(0){|a,k| a + (c[k] * @x_mean[k])}
|
115
115
|
end
|
116
|
+
# Hash of b or raw coefficients
|
116
117
|
def coeffs
|
117
118
|
assign_names(@coeffs)
|
118
119
|
end
|
120
|
+
# Hash of beta or standardized coefficients
|
121
|
+
|
119
122
|
def standarized_coeffs
|
120
123
|
assign_names(@coeffs_stan)
|
121
124
|
end
|
122
|
-
|
125
|
+
# Total sum of squares
|
123
126
|
def sst
|
124
127
|
@y_sd**2*(cases-1.0)
|
125
128
|
end
|
@@ -134,9 +137,11 @@ class MatrixEngine < BaseEngine
|
|
134
137
|
end
|
135
138
|
|
136
139
|
# Tolerance for a given variable
|
137
|
-
# defined as (1-
|
140
|
+
# defined as (1-R^2) of regression of other independent variables
|
138
141
|
# over the selected
|
139
|
-
#
|
142
|
+
# Reference:
|
143
|
+
#
|
144
|
+
# * http://talkstats.com/showthread.php?t=5056
|
140
145
|
def tolerance(var)
|
141
146
|
lr=Statsample::Regression::Multiple::MatrixEngine.new(@matrix_x, var)
|
142
147
|
1-lr.r2
|
@@ -146,7 +151,8 @@ class MatrixEngine < BaseEngine
|
|
146
151
|
# * Tolerance of the coefficients: Higher tolerances implies higher error
|
147
152
|
# * Higher r2 implies lower error
|
148
153
|
|
149
|
-
# Reference:
|
154
|
+
# Reference:
|
155
|
+
# * Cohen et al. (2003). Applied Multiple Regression / Correlation Analysis for the Behavioral Sciences
|
150
156
|
#
|
151
157
|
def coeffs_se
|
152
158
|
out={}
|
@@ -188,7 +194,7 @@ class MatrixEngine < BaseEngine
|
|
188
194
|
matrix.collect {|i| Math::sqrt(i) if i>0 }[0,0]
|
189
195
|
end
|
190
196
|
|
191
|
-
def to_reportbuilder(generator)
|
197
|
+
def to_reportbuilder(generator) # :nodoc:
|
192
198
|
anchor=generator.add_toc_entry(_("Multiple Regression: ")+@name)
|
193
199
|
generator.add_html "<div class='multiple-regression'>#{@name}<a name='#{anchor}'></a>"
|
194
200
|
c=coeffs
|
@@ -1,9 +1,7 @@
|
|
1
1
|
require 'statsample/regression/multiple/baseengine'
|
2
2
|
module Statsample
|
3
3
|
module Regression
|
4
|
-
# Module for
|
5
|
-
#
|
6
|
-
# You can call Statsample::Regression::Multiple.listwise, Statsample::Regression::Multiple.pairwise or instance directly the engines.
|
4
|
+
# Module for OLS Multiple Regression Analysis.
|
7
5
|
#
|
8
6
|
# Use:.
|
9
7
|
#
|
@@ -13,7 +11,7 @@ module Statsample
|
|
13
11
|
# c=1000.times.collect {rand}.to_scale
|
14
12
|
# ds={'a'=>a,'b'=>b,'c'=>c}.to_dataset
|
15
13
|
# ds['y']=ds.collect{|row| row['a']*5+row['b']*3+row['c']*2+rand()}
|
16
|
-
# lr=Statsample::Regression
|
14
|
+
# lr=Statsample::Regression.multiple(ds,'y')
|
17
15
|
# puts lr.summary
|
18
16
|
# Summary for regression of a,b,c over y
|
19
17
|
# *************************************************************
|
@@ -42,29 +40,6 @@ module Statsample
|
|
42
40
|
# -----------------------------------------------
|
43
41
|
#
|
44
42
|
module Multiple
|
45
|
-
# Creates an object for listwise regression.
|
46
|
-
# Alglib is faster, so is prefered over GSL
|
47
|
-
# lr=Statsample::Regression::Multiple.listwise(ds,'y')
|
48
|
-
def self.listwise(ds,y_var)
|
49
|
-
if HAS_ALGIB
|
50
|
-
AlglibEngine.new(ds,y_var)
|
51
|
-
elsif HAS_GSL
|
52
|
-
GslEngine.new(ds,y_var)
|
53
|
-
else
|
54
|
-
ds2=ds.dup_only_valid
|
55
|
-
RubyEngine.new(ds2,y_var)
|
56
|
-
end
|
57
|
-
end
|
58
|
-
|
59
|
-
# Creates an object for pairwise regression
|
60
|
-
# For now, always retrieves a RubyEngine
|
61
|
-
# lr=Statsample::Regression::Multiple.listwise(ds,'y')
|
62
|
-
def self.pairwise(ds,y_var)
|
63
|
-
RubyEngine.new(ds,y_var)
|
64
|
-
end
|
65
|
-
def self.listwise_by_exp(ds,exp)
|
66
|
-
raise "Not implemented yet"
|
67
|
-
end
|
68
43
|
# Obtain r2 for regressors
|
69
44
|
def self.r2_from_matrices(rxx,rxy)
|
70
45
|
matrix=(rxy.transpose*rxx.inverse*rxy)
|
@@ -76,21 +51,19 @@ module Statsample
|
|
76
51
|
0.0
|
77
52
|
end
|
78
53
|
def initialize(matrix,y_var, opts=Hash.new)
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
54
|
+
matrix.extend Statsample::CovariateMatrix
|
55
|
+
@matrix=matrix
|
56
|
+
@fields=matrix.fields-y_var
|
57
|
+
@y_var=y_var
|
58
|
+
@q=@y_var.size
|
59
|
+
@matrix_cor=matrix.correlation
|
60
|
+
@matrix_cor_xx = @matrix_cor.submatrix(@fields)
|
61
|
+
@matrix_cor_yy = @matrix_cor.submatrix(y_var, y_var)
|
62
|
+
|
63
|
+
@sxx = @matrix.submatrix(@fields)
|
64
|
+
@syy = @matrix.submatrix(y_var, y_var)
|
65
|
+
@sxy = @matrix.submatrix(@fields, y_var)
|
66
|
+
@syx = @sxy.t
|
94
67
|
end
|
95
68
|
|
96
69
|
def r2yx
|
@@ -1,81 +1,96 @@
|
|
1
1
|
module Statsample
|
2
|
-
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
2
|
+
module Regression
|
3
|
+
# Class for calculation of linear regressions with form
|
4
|
+
# y = a+bx
|
5
|
+
# To create a SimpleRegression object:
|
6
|
+
# * <tt> SimpleRegression.new_from_dataset(ds,x,y)</tt>
|
7
|
+
# * <tt> SimpleRegression.new_from_vectors(vx,vy)</tt>
|
8
|
+
# * <tt> SimpleRegression.new_from_gsl(gsl) </tt>
|
9
|
+
#
|
10
|
+
class Simple
|
11
|
+
attr_accessor :a,:b,:cov00, :cov01, :covx1, :chisq, :status
|
12
|
+
|
13
|
+
def initialize(init_method, *argv)
|
14
|
+
self.send(init_method, *argv)
|
15
|
+
end
|
16
|
+
private_class_method :new
|
17
|
+
# Obtain y value given x value
|
18
|
+
# y=a+bx
|
19
|
+
|
20
|
+
def y(val_x)
|
21
|
+
@a+@b*val_x
|
22
|
+
end
|
23
|
+
# Obtain x value given y value
|
24
|
+
# x=(y-a)/b
|
25
|
+
def x(val_y)
|
26
|
+
(val_y-@a) / @b.to_f
|
27
|
+
end
|
28
|
+
# Sum of square error
|
29
|
+
def sse
|
30
|
+
(0...@vx.size).inject(0) {|acum,i| acum+((@vy[i]-y(@vx[i]))**2)
|
31
|
+
}
|
32
|
+
end
|
33
|
+
def standard_error
|
34
|
+
Math::sqrt(sse / (@vx.size-2).to_f)
|
35
|
+
end
|
36
|
+
# Sum of square regression
|
37
|
+
def ssr
|
38
|
+
vy_mean=@vy.mean
|
39
|
+
(0...@vx.size).inject(0) {|a,i|
|
40
|
+
a+((y(@vx[i])-vy_mean)**2)
|
41
|
+
}
|
42
|
+
|
43
|
+
end
|
44
|
+
# Sum of square total
|
45
|
+
def sst
|
46
|
+
@vy.sum_of_squared_deviation
|
47
|
+
end
|
48
|
+
# Value of r
|
49
|
+
def r
|
50
|
+
@b * (@vx.sds / @vy.sds)
|
51
|
+
end
|
52
|
+
# Value of r^2
|
53
|
+
def r2
|
54
|
+
r**2
|
55
|
+
end
|
56
|
+
class << self
|
57
|
+
# Create a regression object giving an array with following parameters:
|
58
|
+
# <tt>a,b,cov00, cov01, covx1, chisq, status</tt>
|
59
|
+
# Useful to obtain x and y values with a and b values.
|
60
|
+
def new_from_gsl(ar)
|
61
|
+
new(:init_gsl, *ar)
|
62
|
+
end
|
63
|
+
# Create a simple regression using two vectors
|
64
|
+
def new_from_vectors(vx,vy)
|
65
|
+
new(:init_vectors,vx,vy)
|
66
|
+
end
|
67
|
+
# Create a simple regression using a dataset and two vector names.
|
68
|
+
def new_from_dataset(ds,x,y)
|
69
|
+
new(:init_vectors,ds[x],ds[y])
|
70
|
+
end
|
71
|
+
end
|
72
|
+
def init_vectors(vx,vy)
|
73
|
+
@vx,@vy=Statsample.only_valid(vx,vy)
|
74
|
+
x_m=@vx.mean
|
75
|
+
y_m=@vy.mean
|
76
|
+
num=den=0
|
77
|
+
(0...@vx.size).each {|i|
|
78
|
+
num+=(@vx[i]-x_m)*(@vy[i]-y_m)
|
79
|
+
den+=(@vx[i]-x_m)**2
|
80
|
+
}
|
81
|
+
@b=num.to_f/den
|
82
|
+
@a=y_m - @b*x_m
|
83
|
+
end
|
84
|
+
def init_gsl(a,b,cov00, cov01, covx1, chisq, status)
|
85
|
+
@a=a
|
86
|
+
@b=b
|
87
|
+
@cov00=cov00
|
88
|
+
@cov01=cov01
|
89
|
+
@covx1=covx1
|
90
|
+
@chisq=chisq
|
91
|
+
@status=status
|
92
|
+
end
|
93
|
+
private :init_vectors, :init_gsl
|
80
94
|
end
|
95
|
+
end
|
81
96
|
end
|
@@ -2,7 +2,6 @@ require 'statsample/regression/simple'
|
|
2
2
|
require 'statsample/regression/multiple'
|
3
3
|
|
4
4
|
require 'statsample/regression/multiple/matrixengine'
|
5
|
-
require 'statsample/regression/multiple/alglibengine'
|
6
5
|
require 'statsample/regression/multiple/rubyengine'
|
7
6
|
require 'statsample/regression/multiple/gslengine'
|
8
7
|
|
@@ -11,7 +10,80 @@ require 'statsample/regression/binomial/logit'
|
|
11
10
|
require 'statsample/regression/binomial/probit'
|
12
11
|
|
13
12
|
module Statsample
|
14
|
-
# Module for regression procedures.
|
13
|
+
# = Module for regression procedures.
|
14
|
+
# Use the methods of this module to generate an
|
15
|
+
# analysis.
|
16
|
+
# If you need more control, you can
|
17
|
+
# directly create and control the objects that compute
|
18
|
+
# the regressions.
|
19
|
+
#
|
20
|
+
# * Simple Regression : Statsample::Regression::Simple
|
21
|
+
# * Multiple Regression: Statsample::Regression::Multiple
|
22
|
+
# * Logit Regression: Statsample::Regression::Binomial::Logit
|
23
|
+
# * Probit Regression: Statsample::Regression::Binomial::Probit
|
15
24
|
module Regression
|
25
|
+
# Create a Statsample::Regression::Simple object, for simple regression
|
26
|
+
# * x: independent Vector
|
27
|
+
# * y: dependent Vector
|
28
|
+
# <b>Usage:</b>
|
29
|
+
# x=100.times.collect {|i| rand(100)}.to_scale
|
30
|
+
# y=100.times.collect {|i| 2+x[i]*2+rand()}.to_scale
|
31
|
+
# sr=Statsample::Regression.simple(x,y)
|
32
|
+
# sr.a
|
33
|
+
# => 2.51763295177808
|
34
|
+
# sr.b
|
35
|
+
# => 1.99973746599856
|
36
|
+
# sr.r
|
37
|
+
# => 0.999987881153254
|
38
|
+
|
39
|
+
def self.simple(x,y)
|
40
|
+
Statsample::Regression::Simple.new_from_vectors(x,y)
|
41
|
+
end
|
42
|
+
# Create a Binomial::Logit object, for logit regression.
|
43
|
+
# * ds:: Dataset
|
44
|
+
# * y:: Name of dependent vector
|
45
|
+
# <b>Usage</b>
|
46
|
+
# dataset=Statsample::CSV.read("data.csv")
|
47
|
+
# lr=Statsample::Regression.logit(dataset,'y')
|
48
|
+
#
|
49
|
+
def self.logit(ds,y_var)
|
50
|
+
Statsample::Regression::Binomial::Logit.new(ds,y_var)
|
51
|
+
end
|
52
|
+
# Create a Binomial::Probit object, for probit regression
|
53
|
+
# * ds:: Dataset
|
54
|
+
# * y:: Name of dependent vector
|
55
|
+
# <b>Usage</b>
|
56
|
+
# dataset=Statsample::CSV.read("data.csv")
|
57
|
+
# lr=Statsample::Regression.probit(dataset,'y')
|
58
|
+
#
|
59
|
+
|
60
|
+
def self.probit(ds,y_var)
|
61
|
+
Statsample::Regression::Binomial::Probit.new(ds,y_var)
|
62
|
+
end
|
63
|
+
|
64
|
+
|
65
|
+
# Creates one of the Statsample::Regression::Multiple objects,
|
66
|
+
# for OLS multiple regression.
|
67
|
+
# Parameters:
|
68
|
+
# * ds: Dataset.
|
69
|
+
# * y: Name of dependent variable.
|
70
|
+
# * missing_data: Could be
|
71
|
+
# * :listwise: delete cases with one or more empty data (default).
|
72
|
+
# * :pairwise: uses correlation matrix. Use with caution.
|
73
|
+
#
|
74
|
+
# <b>Usage:</b>
|
75
|
+
# lr=Statsample::Regression::multiple(ds,'y')
|
76
|
+
def self.multiple(ds,y_var, missing_data=:listwise)
|
77
|
+
if missing_data==:pairwise
|
78
|
+
RubyEngine.new(ds,y_var)
|
79
|
+
else
|
80
|
+
if Statsample.has_gsl?
|
81
|
+
Statsample::Regression::Multiple::GslEngine.new(ds,y_var)
|
82
|
+
else
|
83
|
+
ds2=ds.dup_only_valid
|
84
|
+
Statsample::Regression::Multiple::RubyEngine.new(ds2,y_var)
|
85
|
+
end
|
86
|
+
end
|
87
|
+
end
|
16
88
|
end
|
17
89
|
end
|