statsample 0.18.0 → 1.0.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (121) hide show
  1. data.tar.gz.sig +0 -0
  2. data/History.txt +23 -0
  3. data/Manifest.txt +28 -17
  4. data/Rakefile +3 -2
  5. data/benchmarks/correlation_matrix_15_variables.rb +31 -0
  6. data/benchmarks/correlation_matrix_5_variables.rb +32 -0
  7. data/benchmarks/correlation_matrix_methods/correlation_matrix.ds +0 -0
  8. data/benchmarks/correlation_matrix_methods/correlation_matrix.html +93 -0
  9. data/benchmarks/correlation_matrix_methods/correlation_matrix.rb +75 -0
  10. data/benchmarks/correlation_matrix_methods/correlation_matrix.xls +0 -0
  11. data/benchmarks/correlation_matrix_methods/correlation_matrix_gsl_ruby.ods +0 -0
  12. data/benchmarks/correlation_matrix_methods/correlation_matrix_with_graphics.ods +0 -0
  13. data/benchmarks/correlation_matrix_methods/results.ds +0 -0
  14. data/benchmarks/factor_map.rb +37 -0
  15. data/benchmarks/helpers_benchmark.rb +5 -0
  16. data/examples/boxplot.rb +13 -14
  17. data/examples/correlation_matrix.rb +16 -8
  18. data/examples/dataset.rb +13 -4
  19. data/examples/dominance_analysis.rb +23 -17
  20. data/examples/dominance_analysis_bootstrap.rb +28 -22
  21. data/examples/histogram.rb +8 -9
  22. data/examples/icc.rb +20 -21
  23. data/examples/levene.rb +10 -4
  24. data/examples/multiple_regression.rb +9 -28
  25. data/examples/multivariate_correlation.rb +9 -3
  26. data/examples/parallel_analysis.rb +20 -16
  27. data/examples/polychoric.rb +15 -9
  28. data/examples/principal_axis.rb +18 -6
  29. data/examples/reliability.rb +26 -13
  30. data/examples/scatterplot.rb +10 -6
  31. data/examples/t_test.rb +15 -6
  32. data/examples/tetrachoric.rb +9 -2
  33. data/examples/u_test.rb +12 -4
  34. data/examples/vector.rb +13 -2
  35. data/examples/velicer_map_test.rb +33 -26
  36. data/lib/statsample.rb +32 -12
  37. data/lib/statsample/analysis.rb +79 -0
  38. data/lib/statsample/analysis/suite.rb +72 -0
  39. data/lib/statsample/analysis/suitereportbuilder.rb +38 -0
  40. data/lib/statsample/bivariate.rb +70 -16
  41. data/lib/statsample/dataset.rb +25 -19
  42. data/lib/statsample/dominanceanalysis.rb +2 -2
  43. data/lib/statsample/factor.rb +2 -0
  44. data/lib/statsample/factor/map.rb +16 -10
  45. data/lib/statsample/factor/parallelanalysis.rb +9 -3
  46. data/lib/statsample/factor/pca.rb +28 -32
  47. data/lib/statsample/factor/rotation.rb +15 -8
  48. data/lib/statsample/graph/boxplot.rb +3 -4
  49. data/lib/statsample/graph/histogram.rb +2 -1
  50. data/lib/statsample/graph/scatterplot.rb +1 -0
  51. data/lib/statsample/matrix.rb +106 -16
  52. data/lib/statsample/regression.rb +4 -1
  53. data/lib/statsample/regression/binomial.rb +1 -1
  54. data/lib/statsample/regression/multiple/baseengine.rb +19 -9
  55. data/lib/statsample/regression/multiple/gslengine.rb +127 -126
  56. data/lib/statsample/regression/multiple/matrixengine.rb +8 -5
  57. data/lib/statsample/regression/multiple/rubyengine.rb +1 -1
  58. data/lib/statsample/regression/simple.rb +31 -6
  59. data/lib/statsample/reliability.rb +11 -3
  60. data/lib/statsample/reliability/scaleanalysis.rb +4 -4
  61. data/lib/statsample/shorthand.rb +81 -0
  62. data/lib/statsample/test/chisquare.rb +1 -1
  63. data/lib/statsample/vector.rb +163 -163
  64. data/lib/statsample/vector/gsl.rb +106 -0
  65. data/references.txt +2 -2
  66. data/{data → test/fixtures}/crime.txt +0 -0
  67. data/{data → test/fixtures}/hartman_23.matrix +0 -0
  68. data/{data → test/fixtures}/repeated_fields.csv +0 -0
  69. data/{data → test/fixtures}/test_binomial.csv +0 -0
  70. data/test/{test_csv.csv → fixtures/test_csv.csv} +0 -0
  71. data/test/{test_xls.xls → fixtures/test_xls.xls} +0 -0
  72. data/{data → test/fixtures}/tetmat_matrix.txt +0 -0
  73. data/{data → test/fixtures}/tetmat_test.txt +0 -0
  74. data/test/helpers_tests.rb +18 -2
  75. data/test/test_analysis.rb +118 -0
  76. data/test/test_anovatwoway.rb +1 -1
  77. data/test/test_anovatwowaywithdataset.rb +1 -1
  78. data/test/test_anovawithvectors.rb +1 -2
  79. data/test/test_bartlettsphericity.rb +1 -2
  80. data/test/test_bivariate.rb +64 -22
  81. data/test/test_codification.rb +1 -2
  82. data/test/test_crosstab.rb +1 -2
  83. data/test/test_csv.rb +3 -4
  84. data/test/test_dataset.rb +24 -3
  85. data/test/test_dominance_analysis.rb +1 -2
  86. data/test/test_factor.rb +8 -69
  87. data/test/test_factor_map.rb +43 -0
  88. data/test/test_factor_pa.rb +54 -0
  89. data/test/test_ggobi.rb +1 -1
  90. data/test/test_gsl.rb +12 -18
  91. data/test/test_histogram.rb +1 -2
  92. data/test/test_logit.rb +62 -18
  93. data/test/test_matrix.rb +4 -5
  94. data/test/test_mle.rb +3 -4
  95. data/test/test_regression.rb +21 -2
  96. data/test/test_reliability.rb +3 -3
  97. data/test/test_reliability_icc.rb +1 -1
  98. data/test/test_reliability_skillscale.rb +20 -4
  99. data/test/test_resample.rb +1 -2
  100. data/test/test_rserve_extension.rb +1 -2
  101. data/test/test_srs.rb +1 -2
  102. data/test/test_statistics.rb +1 -2
  103. data/test/test_stest.rb +1 -2
  104. data/test/test_stratified.rb +1 -2
  105. data/test/test_test_f.rb +1 -2
  106. data/test/test_test_t.rb +1 -2
  107. data/test/test_umannwhitney.rb +1 -2
  108. data/test/test_vector.rb +117 -18
  109. data/test/test_xls.rb +2 -3
  110. data/web/Rakefile +39 -0
  111. metadata +109 -29
  112. metadata.gz.sig +0 -0
  113. data/examples/parallel_analysis_tetrachoric.rb +0 -31
  114. data/lib/distribution.rb +0 -25
  115. data/lib/distribution/chisquare.rb +0 -23
  116. data/lib/distribution/f.rb +0 -35
  117. data/lib/distribution/normal.rb +0 -60
  118. data/lib/distribution/normalbivariate.rb +0 -284
  119. data/lib/distribution/normalmultivariate.rb +0 -73
  120. data/lib/distribution/t.rb +0 -55
  121. data/test/test_distribution.rb +0 -73
@@ -27,7 +27,7 @@ module Factor
27
27
  attr_accessor :max_iterations
28
28
  # Maximum precision
29
29
  attr_accessor :epsilon
30
-
30
+ attr_accessor :use_gsl
31
31
  dirty_writer :max_iterations, :epsilon
32
32
  dirty_memoize :iterations, :rotated, :component_transformation_matrix, :h2
33
33
 
@@ -41,6 +41,7 @@ module Factor
41
41
  @epsilon=EPSILON
42
42
  @rotated=nil
43
43
  @h2=(@matrix.collect {|c| c**2} * Matrix.column_vector([1]*@m)).column(0).to_a
44
+ @use_gsl=Statsample.has_gsl?
44
45
  opts.each{|k,v|
45
46
  self.send("#{k}=",v) if self.respond_to? k
46
47
  }
@@ -58,11 +59,12 @@ module Factor
58
59
  end
59
60
  # Start iteration
60
61
  def iterate
61
- t=Matrix.identity(@m)
62
- b=@matrix.dup
63
- h=Matrix.diagonal(*@h2).collect {|c| Math::sqrt(c)}
62
+ k_matrix=@use_gsl ? GSL::Matrix : ::Matrix
63
+ t=k_matrix.identity(@m)
64
+ b=(@use_gsl ? @matrix.to_gsl : @matrix.dup)
65
+ h=k_matrix.diagonal(*@h2).collect {|c| Math::sqrt(c)}
64
66
  h_inverse=h.collect {|c| c!=0 ? 1/c : 0 }
65
- bh=h_inverse*b
67
+ bh=h_inverse * b
66
68
  @not_converged=true
67
69
  @iterations=0
68
70
  while @not_converged
@@ -110,9 +112,14 @@ module Factor
110
112
  t[row_i][i]=tx_rot[row_i]
111
113
  t[row_i][j]=ty_rot[row_i]
112
114
  }
113
-
114
- bh=Matrix.rows(bh)
115
- t=Matrix.rows(t)
115
+ #if @use_gsl
116
+ bh=k_matrix.[](*bh)
117
+ t=k_matrix.[](*t)
118
+ #else
119
+ # bh=Matrix.rows(bh)
120
+ # t=Matrix.rows(t)
121
+
122
+ #end
116
123
  else
117
124
  num_pairs=num_pairs-1
118
125
  @not_converged=false if num_pairs==0
@@ -49,7 +49,6 @@ module Statsample
49
49
  # to the anchor location. For example, with the default left alignment,
50
50
  # an angle of Math.PI / 2 causes text to proceed downwards. The default angle is zero.
51
51
  attr_accessor :label_angle
52
-
53
52
  attr_reader :x_scale, :y_scale
54
53
  # Create a new Boxplot.
55
54
  # Parameters: Hash of options
@@ -223,11 +222,11 @@ module Statsample
223
222
  dot.bottom {|v| y_scale.scale(v)}
224
223
  dot.title {|v| v}
225
224
  end
226
-
227
-
228
- end
225
+ end
229
226
  end
227
+ vis
230
228
  end
229
+
231
230
  # Returns SVG with scatterplot
232
231
  def to_svg
233
232
  rp=rubyvis_panel
@@ -120,7 +120,7 @@ module Statsample
120
120
  y_scale=Rubyvis::Scale.linear(@minimum_y, @maximum_y).range(0, height - margin_vert)
121
121
 
122
122
  y_scale.nice
123
- max_range=@hist.max
123
+
124
124
  bins=@hist.bins.times.map {|i|
125
125
  {
126
126
  :low =>@hist.get_range(i)[0],
@@ -170,6 +170,7 @@ module Statsample
170
170
  end
171
171
  rubyvis_normal_distribution(pan) if @line_normal_distribution
172
172
  end
173
+ vis
173
174
  end
174
175
  # Returns SVG with scatterplot
175
176
  def to_svg
@@ -193,6 +193,7 @@ module Statsample
193
193
  end
194
194
  end
195
195
  end
196
+ vis
196
197
  end
197
198
  # Returns SVG with scatterplot
198
199
  def to_svg
@@ -2,6 +2,9 @@ class ::Vector
2
2
  def to_matrix
3
3
  ::Matrix.columns([self.to_a])
4
4
  end
5
+ def to_vector
6
+ self
7
+ end
5
8
  end
6
9
  class ::Matrix
7
10
  def to_matrix
@@ -28,18 +31,24 @@ class ::Matrix
28
31
  if Statsample.has_gsl?
29
32
  # Optimize eigenpairs of extendmatrix module using gsl
30
33
  def eigenpairs
31
- eigval, eigvec= GSL::Eigen.symmv(self.to_gsl)
32
- ep=eigval.size.times.map {|i|
33
- [eigval[i], eigvec.get_col(i).to_a]
34
- }
35
- ep.sort{|a,b| a[0]<=>b[0]}.reverse
34
+ to_gsl.eigenpairs
36
35
  end
37
36
  end
38
37
 
39
38
  def eigenvalues
40
- eigen[:eigenvalues]
39
+ eigenpairs.collect {|v| v[0]}
40
+ end
41
+ def eigenvectors
42
+ eigenpairs.collect {|v| v[1]}
43
+ end
44
+ def eigenvectors_matrix
45
+ Matrix.columns(eigenvectors)
41
46
  end
47
+
48
+
42
49
 
50
+
51
+
43
52
  def to_gsl
44
53
  out=[]
45
54
  self.row_size.times{|i|
@@ -55,18 +64,94 @@ module GSL
55
64
  def to_matrix
56
65
  ::Matrix.columns([self.size.times.map {|i| self[i]}])
57
66
  end
67
+ def to_ary
68
+ to_a
69
+ end
70
+ def to_gsl
71
+ self
72
+ end
58
73
  end
59
74
  end
60
75
  class Matrix
61
76
  def to_gsl
62
77
  self
63
78
  end
79
+
80
+ def to_dataset
81
+ f = (self.respond_to? :fields_y) ? fields_y : column_size.times.map {|i| _("VAR_%d") % (i+1) }
82
+ ds=Statsample::Dataset.new(f)
83
+ f.each do |ff|
84
+ ds[ff].type=:scale
85
+ ds[ff].name=ff
86
+ end
87
+ row_size.times {|i|
88
+ ds.add_case_array(self.row(i).to_a)
89
+ }
90
+ ds.update_valid_data
91
+ ds.name=self.name if self.respond_to? :name
92
+ ds
93
+ end
94
+
95
+ def row_size
96
+ size1
97
+ end
98
+ def column_size
99
+ size2
100
+ end
101
+ def determinant
102
+ det
103
+ end
104
+ def inverse
105
+ GSL::Linalg::LU.invert(self)
106
+ end
107
+ def eigenvalues
108
+ eigenpairs.collect {|v| v[0]}
109
+ end
110
+ def eigenvectors
111
+ eigenpairs.collect {|v| v[1]}
112
+ end
113
+
114
+ # Matrix sum of squares
115
+ def mssq
116
+ sum=0
117
+ to_v.each {|i| sum+=i**2}
118
+ sum
119
+ end
120
+
121
+ def eigenvectors_matrix
122
+ eigval, eigvec= GSL::Eigen.symmv(self)
123
+ GSL::Eigen::symmv_sort(eigval, eigvec, GSL::Eigen::SORT_VAL_DESC)
124
+ eigvec
125
+ end
126
+ def eigenpairs
127
+ eigval, eigvec= GSL::Eigen.symmv(self)
128
+ GSL::Eigen::symmv_sort(eigval, eigvec, GSL::Eigen::SORT_VAL_DESC)
129
+ @eigenpairs=eigval.size.times.map {|i|
130
+ [eigval[i],eigvec.get_col(i)]
131
+ }
132
+ end
133
+
134
+ #def eigenpairs_ruby
135
+ # self.to_matrix.eigenpairs_ruby
136
+ #end
137
+ def square?
138
+ size1==size2
139
+ end
64
140
  def to_matrix
65
141
  rows=self.size1
66
142
  cols=self.size2
67
143
  out=(0...rows).collect{|i| (0...cols).collect {|j| self[i,j]} }
68
144
  ::Matrix.rows(out)
69
145
  end
146
+ def total_sum
147
+ sum=0
148
+ size1.times {|i|
149
+ size2.times {|j|
150
+ sum+=self[i,j]
151
+ }
152
+ }
153
+ sum
154
+ end
70
155
  end
71
156
  end
72
157
 
@@ -122,7 +207,7 @@ module Statsample
122
207
  @@covariatematrix=0
123
208
 
124
209
  # Get type of covariate matrix. Could be :covariance or :correlation
125
- def type
210
+ def _type
126
211
  if row_size==column_size
127
212
  if row_size.times.find {|i| self[i,i]!=1.0}
128
213
  :covariance
@@ -134,11 +219,11 @@ module Statsample
134
219
  end
135
220
 
136
221
  end
137
- def type=(t)
222
+ def _type=(t)
138
223
  @type=t
139
224
  end
140
225
  def correlation
141
- if(type==:covariance)
226
+ if(_type==:covariance)
142
227
  matrix=Matrix.rows(row_size.times.collect { |i|
143
228
  column_size.times.collect { |j|
144
229
  if i==j
@@ -151,7 +236,7 @@ module Statsample
151
236
  matrix.extend CovariateMatrix
152
237
  matrix.fields_x=fields_x
153
238
  matrix.fields_y=fields_y
154
- matrix.type=:correlation
239
+ matrix._type=:correlation
155
240
  matrix
156
241
  else
157
242
  self
@@ -192,12 +277,17 @@ module Statsample
192
277
  columns||=rows
193
278
  # Convert all labels on index
194
279
  row_index=rows.collect {|v|
195
- v.is_a?(Numeric) ? v : fields_x.index(v)
280
+ r=v.is_a?(Numeric) ? v : fields_x.index(v)
281
+ raise "Index #{v} doesn't exists on matrix" if r.nil?
282
+ r
196
283
  }
197
284
  column_index=columns.collect {|v|
198
- v.is_a?(Numeric) ? v : fields_y.index(v)
285
+ r=v.is_a?(Numeric) ? v : fields_y.index(v)
286
+ raise "Index #{v} doesn't exists on matrix" if r.nil?
287
+ r
199
288
  }
200
-
289
+
290
+
201
291
  fx=row_index.collect {|v| fields_x[v]}
202
292
  fy=column_index.collect {|v| fields_y[v]}
203
293
 
@@ -206,14 +296,14 @@ module Statsample
206
296
  matrix.extend CovariateMatrix
207
297
  matrix.fields_x=fx
208
298
  matrix.fields_y=fy
209
- matrix.type=type
299
+ matrix._type=_type
210
300
  matrix
211
301
  end
212
302
  def report_building(generator)
213
- @name||= (type==:correlation ? _("Correlation"):_("Covariance"))+_(" Matrix")
303
+ @name||= (_type==:correlation ? _("Correlation"):_("Covariance"))+_(" Matrix")
214
304
  generator.table(:name=>@name, :header=>[""]+fields_y) do |t|
215
305
  row_size.times {|i|
216
- t.row([fields_x[i]]+@rows[i].collect {|i1|
306
+ t.row([fields_x[i]]+row(i).to_a.collect {|i1|
217
307
  i1.nil? ? "--" : sprintf("%0.3f",i1).gsub("0.",".")
218
308
  })
219
309
  }
@@ -22,6 +22,9 @@ module Statsample
22
22
  # * Logit Regression: Statsample::Regression::Binomial::Logit
23
23
  # * Probit Regression: Statsample::Regression::Binomial::Probit
24
24
  module Regression
25
+
26
+ LinearDependency=Class.new(Exception)
27
+
25
28
  # Create a Statsample::Regression::Simple object, for simple regression
26
29
  # * x: independent Vector
27
30
  # * y: dependent Vector
@@ -78,7 +81,7 @@ module Statsample
78
81
  if missing_data==:pairwise
79
82
  Statsample::Regression::Multiple::RubyEngine.new(ds,y_var, opts)
80
83
  else
81
- if Statsample.has_gsl?
84
+ if Statsample.has_gsl? and false
82
85
  Statsample::Regression::Multiple::GslEngine.new(ds, y_var, opts)
83
86
  else
84
87
  ds2=ds.dup_only_valid
@@ -49,7 +49,7 @@ module Statsample
49
49
  end
50
50
  # Constant standard error
51
51
  def constant_se
52
- i=@fields.index :_constant
52
+ i=@fields.index "_constant"
53
53
  Math::sqrt(@var_cov_matrix[i,i])
54
54
  end
55
55
  # Regression coefficients
@@ -12,6 +12,8 @@ module Statsample
12
12
  attr_reader :valid_cases
13
13
  # Number of total cases (dataset.cases)
14
14
  attr_reader :total_cases
15
+
16
+ attr_accessor :digits
15
17
  def self.univariate?
16
18
  true
17
19
  end
@@ -23,9 +25,15 @@ module Statsample
23
25
  @y_var=y_var
24
26
  @r2=nil
25
27
  @name=_("Multiple Regression: %s over %s") % [ ds.fields.join(",") , @y_var]
26
- opts.each{|k,v|
28
+
29
+
30
+ opts_default={:digits=>3}
31
+ @opts=opts_default.merge opts
32
+
33
+ @opts.each{|k,v|
27
34
  self.send("#{k}=",v) if self.respond_to? k
28
35
  }
36
+
29
37
  end
30
38
  # Calculate F Test
31
39
  def anova
@@ -159,7 +167,7 @@ module Statsample
159
167
  columns.unshift([1.0]*@valid_cases)
160
168
  x=Matrix.columns(columns)
161
169
  matrix=((x.t*x)).inverse * mse
162
- matrix.collect {|i| Math::sqrt(i) if i>0 }
170
+ matrix.collect {|i| Math::sqrt(i) if i>=0 }
163
171
  end
164
172
  # T for constant
165
173
  def constant_t
@@ -170,24 +178,26 @@ module Statsample
170
178
  estimated_variance_covariance_matrix[0,0]
171
179
  end
172
180
  def report_building(b)
181
+ di="%0.#{digits}f"
173
182
  b.section(:name=>@name) do |g|
174
183
  c=coeffs
175
184
  g.text _("Engine: %s") % self.class
176
185
  g.text(_("Cases(listwise)=%d(%d)") % [@total_cases, @valid_cases])
177
- g.text _("R=%0.3f") % r
178
- g.text _("R^2=%0.3f") % r2
179
- g.text _"R^2 Adj=%0.3f" % r2_adjusted
180
- g.text _("Std.Error R=%0.3f") % se_estimate
186
+ g.text _("R=#{di}") % r
187
+ g.text _("R^2=#{di}") % r2
188
+ g.text _"R^2 Adj=#{di}" % r2_adjusted
189
+ g.text _("Std.Error R=#{di}") % se_estimate
181
190
 
182
- g.text(_("Equation")+"="+ sprintf('%0.3f',constant) +" + "+ @fields.collect {|k| sprintf('%0.3f%s',c[k],k)}.join(' + ') )
191
+ g.text(_("Equation")+"="+ sprintf(di,constant) +" + "+ @fields.collect {|k| sprintf("#{di}%s",c[k],k)}.join(' + ') )
183
192
 
184
193
  g.parse_element(anova)
185
194
  sc=standarized_coeffs
195
+
186
196
  cse=coeffs_se
187
197
  g.table(:name=>_("Beta coefficients"), :header=>%w{coeff b beta se t}.collect{|field| _(field)} ) do |t|
188
- t.row([_("Constant"), sprintf("%0.3f", constant), "-", constant_se.nil? ? "": sprintf("%0.3f", constant_se), constant_t.nil? ? "" : sprintf("%0.3f", constant_t)])
198
+ t.row([_("Constant"), sprintf(di, constant), "-", constant_se.nil? ? "": sprintf(di, constant_se), constant_t.nil? ? "" : sprintf(di, constant_t)])
189
199
  @fields.each do |f|
190
- t.row([f, sprintf("%0.3f", c[f]), sprintf("%0.3f", sc[f]), sprintf("%0.3f", cse[f]), sprintf("%0.3f", c[f].quo(cse[f]))])
200
+ t.row([f, sprintf(di, c[f]), sprintf(di, sc[f]), sprintf(di, cse[f]), sprintf(di, c[f].quo(cse[f]))])
191
201
  end
192
202
  end
193
203
  end
@@ -1,131 +1,132 @@
1
1
  if Statsample.has_gsl?
2
- module Statsample
3
- module Regression
4
- module Multiple
5
- # Class for Multiple Regression Analysis
6
- # Requires rbgsl and uses a listwise aproach.
7
- # Slower on prediction of values than Alglib, because predict is ruby based.
8
- # Better memory management on multiple (+1000) series of regression.
9
- # If you need pairwise, use RubyEngine
10
- # Example:
11
- #
12
- # @a=[1,3,2,4,3,5,4,6,5,7].to_vector(:scale)
13
- # @b=[3,3,4,4,5,5,6,6,4,4].to_vector(:scale)
14
- # @c=[11,22,30,40,50,65,78,79,99,100].to_vector(:scale)
15
- # @y=[3,4,5,6,7,8,9,10,20,30].to_vector(:scale)
16
- # ds={'a'=>@a,'b'=>@b,'c'=>@c,'y'=>@y}.to_dataset
17
- # lr=Statsample::Regression::Multiple::GslEngine.new(ds,'y')
18
- #
19
- class GslEngine < BaseEngine
20
- def initialize(ds,y_var, opts=Hash.new)
21
- super
22
- @ds=ds.dup_only_valid
23
- @ds_valid=@ds
24
- @valid_cases=@ds_valid.cases
25
- @dy=@ds[@y_var]
26
- @ds_indep=ds.dup(ds.fields-[y_var])
27
- # Create a custom matrix
28
- columns=[]
29
- @fields=[]
30
- max_deps = GSL::Matrix.alloc(@ds.cases, @ds.fields.size)
31
- constant_col=@ds.fields.size-1
32
- for i in 0...@ds.cases
33
- max_deps.set(i,constant_col,1)
34
- end
35
- j=0
36
- @ds.fields.each{|f|
37
- if f!=@y_var
38
- @ds[f].each_index{|i1|
39
- max_deps.set(i1,j,@ds[f][i1])
2
+ module Statsample
3
+ module Regression
4
+ module Multiple
5
+ # Class for Multiple Regression Analysis
6
+ # Requires rbgsl and uses a listwise aproach.
7
+ # Slower on prediction of values than Alglib, because predict is ruby based.
8
+ # Better memory management on multiple (+1000) series of regression.
9
+ # If you need pairwise, use RubyEngine
10
+ # Example:
11
+ #
12
+ # @a=[1,3,2,4,3,5,4,6,5,7].to_vector(:scale)
13
+ # @b=[3,3,4,4,5,5,6,6,4,4].to_vector(:scale)
14
+ # @c=[11,22,30,40,50,65,78,79,99,100].to_vector(:scale)
15
+ # @y=[3,4,5,6,7,8,9,10,20,30].to_vector(:scale)
16
+ # ds={'a'=>@a,'b'=>@b,'c'=>@c,'y'=>@y}.to_dataset
17
+ # lr=Statsample::Regression::Multiple::GslEngine.new(ds,'y')
18
+ #
19
+ class GslEngine < BaseEngine
20
+ def initialize(ds,y_var, opts=Hash.new)
21
+ super
22
+ @ds=ds.dup_only_valid
23
+ @ds_valid=@ds
24
+ @valid_cases=@ds_valid.cases
25
+ @dy=@ds[@y_var]
26
+ @ds_indep=ds.dup(ds.fields-[y_var])
27
+ # Create a custom matrix
28
+ columns=[]
29
+ @fields=[]
30
+ max_deps = GSL::Matrix.alloc(@ds.cases, @ds.fields.size)
31
+ constant_col=@ds.fields.size-1
32
+ for i in 0...@ds.cases
33
+ max_deps.set(i,constant_col,1)
34
+ end
35
+ j=0
36
+ @ds.fields.each{|f|
37
+ if f!=@y_var
38
+ @ds[f].each_index{|i1|
39
+ max_deps.set(i1,j,@ds[f][i1])
40
+ }
41
+ columns.push(@ds[f].to_a)
42
+ @fields.push(f)
43
+ j+=1
44
+ end
40
45
  }
41
- columns.push(@ds[f].to_a)
42
- @fields.push(f)
43
- j+=1
46
+ @dep_columns=columns.dup
47
+ @lr_s=nil
48
+ c, @cov, @chisq, @status = GSL::MultiFit.linear(max_deps, @dy.gsl)
49
+ @constant=c[constant_col]
50
+ @coeffs_a=c.to_a.slice(0...constant_col)
51
+ @coeffs=assign_names(@coeffs_a)
52
+ c=nil
53
+ end
54
+
55
+ def _dump(i)
56
+ Marshal.dump({'ds'=>@ds,'y_var'=>@y_var})
57
+ end
58
+ def self._load(data)
59
+ h=Marshal.load(data)
60
+ self.new(h['ds'], h['y_var'])
61
+ end
62
+
63
+ def coeffs
64
+ @coeffs
65
+ end
66
+ # Coefficients using a constant
67
+ # Based on http://www.xycoon.com/ols1.htm
68
+ def matrix_resolution
69
+ columns=@dep_columns.dup.map {|xi| xi.map{|i| i.to_f}}
70
+ columns.unshift([1.0]*@ds.cases)
71
+ y=Matrix.columns([@dy.data.map {|i| i.to_f}])
72
+ x=Matrix.columns(columns)
73
+ xt=x.t
74
+ matrix=((xt*x)).inverse*xt
75
+ matrix*y
76
+ end
77
+ def r2
78
+ r**2
79
+ end
80
+ def r
81
+ Bivariate::pearson(@dy, predicted)
82
+ end
83
+ def sst
84
+ @dy.ss
85
+ end
86
+ def constant
87
+ @constant
88
+ end
89
+ def standarized_coeffs
90
+ l=lr_s
91
+ l.coeffs
92
+ end
93
+ def lr_s
94
+ if @lr_s.nil?
95
+ build_standarized
96
+ end
97
+ @lr_s
98
+ end
99
+ def build_standarized
100
+ @ds_s=@ds.standarize
101
+ @lr_s=GslEngine.new(@ds_s,@y_var)
102
+ end
103
+ def process_s(v)
104
+ lr_s.process(v)
105
+ end
106
+ # ???? Not equal to SPSS output
107
+ def standarized_residuals
108
+ res=residuals
109
+ red_sd=residuals.sds
110
+ res.collect {|v|
111
+ v.quo(red_sd)
112
+ }.to_vector(:scale)
113
+ end
114
+
115
+ # Standard error for coeffs
116
+ def coeffs_se
117
+ out={}
118
+ evcm=estimated_variance_covariance_matrix
119
+ @ds_valid.fields.each_with_index do |f,i|
120
+
121
+ mi=i+1
122
+ next if f==@y_var
123
+ out[f]=evcm[mi,mi]
124
+ end
125
+ out
126
+ end
127
+
44
128
  end
45
- }
46
- @dep_columns=columns.dup
47
- @lr_s=nil
48
- c, @cov, @chisq, @status = GSL::MultiFit.linear(max_deps, @dy.gsl)
49
- @constant=c[constant_col]
50
- @coeffs_a=c.to_a.slice(0...constant_col)
51
- @coeffs=assign_names(@coeffs_a)
52
- c=nil
53
- end
54
-
55
- def _dump(i)
56
- Marshal.dump({'ds'=>@ds,'y_var'=>@y_var})
57
- end
58
- def self._load(data)
59
- h=Marshal.load(data)
60
- self.new(h['ds'], h['y_var'])
61
- end
62
-
63
- def coeffs
64
- @coeffs
65
- end
66
- # Coefficients using a constant
67
- # Based on http://www.xycoon.com/ols1.htm
68
- def matrix_resolution
69
- columns=@dep_columns.dup.map {|xi| xi.map{|i| i.to_f}}
70
- columns.unshift([1.0]*@ds.cases)
71
- y=Matrix.columns([@dy.data.map {|i| i.to_f}])
72
- x=Matrix.columns(columns)
73
- xt=x.t
74
- matrix=((xt*x)).inverse*xt
75
- matrix*y
76
- end
77
- def r2
78
- r**2
79
- end
80
- def r
81
- Bivariate::pearson(@dy, predicted)
82
- end
83
- def sst
84
- @dy.ss
85
- end
86
- def constant
87
- @constant
88
- end
89
- def standarized_coeffs
90
- l=lr_s
91
- l.coeffs
92
- end
93
- def lr_s
94
- if @lr_s.nil?
95
- build_standarized
96
- end
97
- @lr_s
98
- end
99
- def build_standarized
100
- @ds_s=@ds.standarize
101
- @lr_s=GslEngine.new(@ds_s,@y_var)
102
- end
103
- def process_s(v)
104
- lr_s.process(v)
105
- end
106
- # ???? Not equal to SPSS output
107
- def standarized_residuals
108
- res=residuals
109
- red_sd=residuals.sds
110
- res.collect {|v|
111
- v.quo(red_sd)
112
- }.to_vector(:scale)
113
- end
114
-
115
- # Standard error for coeffs
116
- def coeffs_se
117
- out={}
118
- evcm=estimated_variance_covariance_matrix
119
- @ds_valid.fields.each_with_index do |f,i|
120
- mi=i+1
121
- next if f==@y_var
122
- out[f]=evcm[mi,mi]
129
+ end
123
130
  end
124
- out
125
- end
126
-
127
- end
128
- end
129
- end
130
- end # for Statsample
131
+ end # for Statsample
131
132
  end # for if