statsample 0.18.0 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (121)
  1. data.tar.gz.sig +0 -0
  2. data/History.txt +23 -0
  3. data/Manifest.txt +28 -17
  4. data/Rakefile +3 -2
  5. data/benchmarks/correlation_matrix_15_variables.rb +31 -0
  6. data/benchmarks/correlation_matrix_5_variables.rb +32 -0
  7. data/benchmarks/correlation_matrix_methods/correlation_matrix.ds +0 -0
  8. data/benchmarks/correlation_matrix_methods/correlation_matrix.html +93 -0
  9. data/benchmarks/correlation_matrix_methods/correlation_matrix.rb +75 -0
  10. data/benchmarks/correlation_matrix_methods/correlation_matrix.xls +0 -0
  11. data/benchmarks/correlation_matrix_methods/correlation_matrix_gsl_ruby.ods +0 -0
  12. data/benchmarks/correlation_matrix_methods/correlation_matrix_with_graphics.ods +0 -0
  13. data/benchmarks/correlation_matrix_methods/results.ds +0 -0
  14. data/benchmarks/factor_map.rb +37 -0
  15. data/benchmarks/helpers_benchmark.rb +5 -0
  16. data/examples/boxplot.rb +13 -14
  17. data/examples/correlation_matrix.rb +16 -8
  18. data/examples/dataset.rb +13 -4
  19. data/examples/dominance_analysis.rb +23 -17
  20. data/examples/dominance_analysis_bootstrap.rb +28 -22
  21. data/examples/histogram.rb +8 -9
  22. data/examples/icc.rb +20 -21
  23. data/examples/levene.rb +10 -4
  24. data/examples/multiple_regression.rb +9 -28
  25. data/examples/multivariate_correlation.rb +9 -3
  26. data/examples/parallel_analysis.rb +20 -16
  27. data/examples/polychoric.rb +15 -9
  28. data/examples/principal_axis.rb +18 -6
  29. data/examples/reliability.rb +26 -13
  30. data/examples/scatterplot.rb +10 -6
  31. data/examples/t_test.rb +15 -6
  32. data/examples/tetrachoric.rb +9 -2
  33. data/examples/u_test.rb +12 -4
  34. data/examples/vector.rb +13 -2
  35. data/examples/velicer_map_test.rb +33 -26
  36. data/lib/statsample.rb +32 -12
  37. data/lib/statsample/analysis.rb +79 -0
  38. data/lib/statsample/analysis/suite.rb +72 -0
  39. data/lib/statsample/analysis/suitereportbuilder.rb +38 -0
  40. data/lib/statsample/bivariate.rb +70 -16
  41. data/lib/statsample/dataset.rb +25 -19
  42. data/lib/statsample/dominanceanalysis.rb +2 -2
  43. data/lib/statsample/factor.rb +2 -0
  44. data/lib/statsample/factor/map.rb +16 -10
  45. data/lib/statsample/factor/parallelanalysis.rb +9 -3
  46. data/lib/statsample/factor/pca.rb +28 -32
  47. data/lib/statsample/factor/rotation.rb +15 -8
  48. data/lib/statsample/graph/boxplot.rb +3 -4
  49. data/lib/statsample/graph/histogram.rb +2 -1
  50. data/lib/statsample/graph/scatterplot.rb +1 -0
  51. data/lib/statsample/matrix.rb +106 -16
  52. data/lib/statsample/regression.rb +4 -1
  53. data/lib/statsample/regression/binomial.rb +1 -1
  54. data/lib/statsample/regression/multiple/baseengine.rb +19 -9
  55. data/lib/statsample/regression/multiple/gslengine.rb +127 -126
  56. data/lib/statsample/regression/multiple/matrixengine.rb +8 -5
  57. data/lib/statsample/regression/multiple/rubyengine.rb +1 -1
  58. data/lib/statsample/regression/simple.rb +31 -6
  59. data/lib/statsample/reliability.rb +11 -3
  60. data/lib/statsample/reliability/scaleanalysis.rb +4 -4
  61. data/lib/statsample/shorthand.rb +81 -0
  62. data/lib/statsample/test/chisquare.rb +1 -1
  63. data/lib/statsample/vector.rb +163 -163
  64. data/lib/statsample/vector/gsl.rb +106 -0
  65. data/references.txt +2 -2
  66. data/{data → test/fixtures}/crime.txt +0 -0
  67. data/{data → test/fixtures}/hartman_23.matrix +0 -0
  68. data/{data → test/fixtures}/repeated_fields.csv +0 -0
  69. data/{data → test/fixtures}/test_binomial.csv +0 -0
  70. data/test/{test_csv.csv → fixtures/test_csv.csv} +0 -0
  71. data/test/{test_xls.xls → fixtures/test_xls.xls} +0 -0
  72. data/{data → test/fixtures}/tetmat_matrix.txt +0 -0
  73. data/{data → test/fixtures}/tetmat_test.txt +0 -0
  74. data/test/helpers_tests.rb +18 -2
  75. data/test/test_analysis.rb +118 -0
  76. data/test/test_anovatwoway.rb +1 -1
  77. data/test/test_anovatwowaywithdataset.rb +1 -1
  78. data/test/test_anovawithvectors.rb +1 -2
  79. data/test/test_bartlettsphericity.rb +1 -2
  80. data/test/test_bivariate.rb +64 -22
  81. data/test/test_codification.rb +1 -2
  82. data/test/test_crosstab.rb +1 -2
  83. data/test/test_csv.rb +3 -4
  84. data/test/test_dataset.rb +24 -3
  85. data/test/test_dominance_analysis.rb +1 -2
  86. data/test/test_factor.rb +8 -69
  87. data/test/test_factor_map.rb +43 -0
  88. data/test/test_factor_pa.rb +54 -0
  89. data/test/test_ggobi.rb +1 -1
  90. data/test/test_gsl.rb +12 -18
  91. data/test/test_histogram.rb +1 -2
  92. data/test/test_logit.rb +62 -18
  93. data/test/test_matrix.rb +4 -5
  94. data/test/test_mle.rb +3 -4
  95. data/test/test_regression.rb +21 -2
  96. data/test/test_reliability.rb +3 -3
  97. data/test/test_reliability_icc.rb +1 -1
  98. data/test/test_reliability_skillscale.rb +20 -4
  99. data/test/test_resample.rb +1 -2
  100. data/test/test_rserve_extension.rb +1 -2
  101. data/test/test_srs.rb +1 -2
  102. data/test/test_statistics.rb +1 -2
  103. data/test/test_stest.rb +1 -2
  104. data/test/test_stratified.rb +1 -2
  105. data/test/test_test_f.rb +1 -2
  106. data/test/test_test_t.rb +1 -2
  107. data/test/test_umannwhitney.rb +1 -2
  108. data/test/test_vector.rb +117 -18
  109. data/test/test_xls.rb +2 -3
  110. data/web/Rakefile +39 -0
  111. metadata +109 -29
  112. metadata.gz.sig +0 -0
  113. data/examples/parallel_analysis_tetrachoric.rb +0 -31
  114. data/lib/distribution.rb +0 -25
  115. data/lib/distribution/chisquare.rb +0 -23
  116. data/lib/distribution/f.rb +0 -35
  117. data/lib/distribution/normal.rb +0 -60
  118. data/lib/distribution/normalbivariate.rb +0 -284
  119. data/lib/distribution/normalmultivariate.rb +0 -73
  120. data/lib/distribution/t.rb +0 -55
  121. data/test/test_distribution.rb +0 -73
@@ -27,7 +27,7 @@ module Factor
27
27
  attr_accessor :max_iterations
28
28
  # Maximum precision
29
29
  attr_accessor :epsilon
30
-
30
+ attr_accessor :use_gsl
31
31
  dirty_writer :max_iterations, :epsilon
32
32
  dirty_memoize :iterations, :rotated, :component_transformation_matrix, :h2
33
33
 
@@ -41,6 +41,7 @@ module Factor
41
41
  @epsilon=EPSILON
42
42
  @rotated=nil
43
43
  @h2=(@matrix.collect {|c| c**2} * Matrix.column_vector([1]*@m)).column(0).to_a
44
+ @use_gsl=Statsample.has_gsl?
44
45
  opts.each{|k,v|
45
46
  self.send("#{k}=",v) if self.respond_to? k
46
47
  }
@@ -58,11 +59,12 @@ module Factor
58
59
  end
59
60
  # Start iteration
60
61
  def iterate
61
- t=Matrix.identity(@m)
62
- b=@matrix.dup
63
- h=Matrix.diagonal(*@h2).collect {|c| Math::sqrt(c)}
62
+ k_matrix=@use_gsl ? GSL::Matrix : ::Matrix
63
+ t=k_matrix.identity(@m)
64
+ b=(@use_gsl ? @matrix.to_gsl : @matrix.dup)
65
+ h=k_matrix.diagonal(*@h2).collect {|c| Math::sqrt(c)}
64
66
  h_inverse=h.collect {|c| c!=0 ? 1/c : 0 }
65
- bh=h_inverse*b
67
+ bh=h_inverse * b
66
68
  @not_converged=true
67
69
  @iterations=0
68
70
  while @not_converged
@@ -110,9 +112,14 @@ module Factor
110
112
  t[row_i][i]=tx_rot[row_i]
111
113
  t[row_i][j]=ty_rot[row_i]
112
114
  }
113
-
114
- bh=Matrix.rows(bh)
115
- t=Matrix.rows(t)
115
+ #if @use_gsl
116
+ bh=k_matrix.[](*bh)
117
+ t=k_matrix.[](*t)
118
+ #else
119
+ # bh=Matrix.rows(bh)
120
+ # t=Matrix.rows(t)
121
+
122
+ #end
116
123
  else
117
124
  num_pairs=num_pairs-1
118
125
  @not_converged=false if num_pairs==0
@@ -49,7 +49,6 @@ module Statsample
49
49
  # to the anchor location. For example, with the default left alignment,
50
50
  # an angle of Math.PI / 2 causes text to proceed downwards. The default angle is zero.
51
51
  attr_accessor :label_angle
52
-
53
52
  attr_reader :x_scale, :y_scale
54
53
  # Create a new Boxplot.
55
54
  # Parameters: Hash of options
@@ -223,11 +222,11 @@ module Statsample
223
222
  dot.bottom {|v| y_scale.scale(v)}
224
223
  dot.title {|v| v}
225
224
  end
226
-
227
-
228
- end
225
+ end
229
226
  end
227
+ vis
230
228
  end
229
+
231
230
  # Returns SVG with scatterplot
232
231
  def to_svg
233
232
  rp=rubyvis_panel
@@ -120,7 +120,7 @@ module Statsample
120
120
  y_scale=Rubyvis::Scale.linear(@minimum_y, @maximum_y).range(0, height - margin_vert)
121
121
 
122
122
  y_scale.nice
123
- max_range=@hist.max
123
+
124
124
  bins=@hist.bins.times.map {|i|
125
125
  {
126
126
  :low =>@hist.get_range(i)[0],
@@ -170,6 +170,7 @@ module Statsample
170
170
  end
171
171
  rubyvis_normal_distribution(pan) if @line_normal_distribution
172
172
  end
173
+ vis
173
174
  end
174
175
  # Returns SVG with scatterplot
175
176
  def to_svg
@@ -193,6 +193,7 @@ module Statsample
193
193
  end
194
194
  end
195
195
  end
196
+ vis
196
197
  end
197
198
  # Returns SVG with scatterplot
198
199
  def to_svg
@@ -2,6 +2,9 @@ class ::Vector
2
2
  def to_matrix
3
3
  ::Matrix.columns([self.to_a])
4
4
  end
5
+ def to_vector
6
+ self
7
+ end
5
8
  end
6
9
  class ::Matrix
7
10
  def to_matrix
@@ -28,18 +31,24 @@ class ::Matrix
28
31
  if Statsample.has_gsl?
29
32
  # Optimize eigenpairs of extendmatrix module using gsl
30
33
  def eigenpairs
31
- eigval, eigvec= GSL::Eigen.symmv(self.to_gsl)
32
- ep=eigval.size.times.map {|i|
33
- [eigval[i], eigvec.get_col(i).to_a]
34
- }
35
- ep.sort{|a,b| a[0]<=>b[0]}.reverse
34
+ to_gsl.eigenpairs
36
35
  end
37
36
  end
38
37
 
39
38
  def eigenvalues
40
- eigen[:eigenvalues]
39
+ eigenpairs.collect {|v| v[0]}
40
+ end
41
+ def eigenvectors
42
+ eigenpairs.collect {|v| v[1]}
43
+ end
44
+ def eigenvectors_matrix
45
+ Matrix.columns(eigenvectors)
41
46
  end
47
+
48
+
42
49
 
50
+
51
+
43
52
  def to_gsl
44
53
  out=[]
45
54
  self.row_size.times{|i|
@@ -55,18 +64,94 @@ module GSL
55
64
  def to_matrix
56
65
  ::Matrix.columns([self.size.times.map {|i| self[i]}])
57
66
  end
67
+ def to_ary
68
+ to_a
69
+ end
70
+ def to_gsl
71
+ self
72
+ end
58
73
  end
59
74
  end
60
75
  class Matrix
61
76
  def to_gsl
62
77
  self
63
78
  end
79
+
80
+ def to_dataset
81
+ f = (self.respond_to? :fields_y) ? fields_y : column_size.times.map {|i| _("VAR_%d") % (i+1) }
82
+ ds=Statsample::Dataset.new(f)
83
+ f.each do |ff|
84
+ ds[ff].type=:scale
85
+ ds[ff].name=ff
86
+ end
87
+ row_size.times {|i|
88
+ ds.add_case_array(self.row(i).to_a)
89
+ }
90
+ ds.update_valid_data
91
+ ds.name=self.name if self.respond_to? :name
92
+ ds
93
+ end
94
+
95
+ def row_size
96
+ size1
97
+ end
98
+ def column_size
99
+ size2
100
+ end
101
+ def determinant
102
+ det
103
+ end
104
+ def inverse
105
+ GSL::Linalg::LU.invert(self)
106
+ end
107
+ def eigenvalues
108
+ eigenpairs.collect {|v| v[0]}
109
+ end
110
+ def eigenvectors
111
+ eigenpairs.collect {|v| v[1]}
112
+ end
113
+
114
+ # Matrix sum of squares
115
+ def mssq
116
+ sum=0
117
+ to_v.each {|i| sum+=i**2}
118
+ sum
119
+ end
120
+
121
+ def eigenvectors_matrix
122
+ eigval, eigvec= GSL::Eigen.symmv(self)
123
+ GSL::Eigen::symmv_sort(eigval, eigvec, GSL::Eigen::SORT_VAL_DESC)
124
+ eigvec
125
+ end
126
+ def eigenpairs
127
+ eigval, eigvec= GSL::Eigen.symmv(self)
128
+ GSL::Eigen::symmv_sort(eigval, eigvec, GSL::Eigen::SORT_VAL_DESC)
129
+ @eigenpairs=eigval.size.times.map {|i|
130
+ [eigval[i],eigvec.get_col(i)]
131
+ }
132
+ end
133
+
134
+ #def eigenpairs_ruby
135
+ # self.to_matrix.eigenpairs_ruby
136
+ #end
137
+ def square?
138
+ size1==size2
139
+ end
64
140
  def to_matrix
65
141
  rows=self.size1
66
142
  cols=self.size2
67
143
  out=(0...rows).collect{|i| (0...cols).collect {|j| self[i,j]} }
68
144
  ::Matrix.rows(out)
69
145
  end
146
+ def total_sum
147
+ sum=0
148
+ size1.times {|i|
149
+ size2.times {|j|
150
+ sum+=self[i,j]
151
+ }
152
+ }
153
+ sum
154
+ end
70
155
  end
71
156
  end
72
157
 
@@ -122,7 +207,7 @@ module Statsample
122
207
  @@covariatematrix=0
123
208
 
124
209
  # Get type of covariate matrix. Could be :covariance or :correlation
125
- def type
210
+ def _type
126
211
  if row_size==column_size
127
212
  if row_size.times.find {|i| self[i,i]!=1.0}
128
213
  :covariance
@@ -134,11 +219,11 @@ module Statsample
134
219
  end
135
220
 
136
221
  end
137
- def type=(t)
222
+ def _type=(t)
138
223
  @type=t
139
224
  end
140
225
  def correlation
141
- if(type==:covariance)
226
+ if(_type==:covariance)
142
227
  matrix=Matrix.rows(row_size.times.collect { |i|
143
228
  column_size.times.collect { |j|
144
229
  if i==j
@@ -151,7 +236,7 @@ module Statsample
151
236
  matrix.extend CovariateMatrix
152
237
  matrix.fields_x=fields_x
153
238
  matrix.fields_y=fields_y
154
- matrix.type=:correlation
239
+ matrix._type=:correlation
155
240
  matrix
156
241
  else
157
242
  self
@@ -192,12 +277,17 @@ module Statsample
192
277
  columns||=rows
193
278
  # Convert all labels on index
194
279
  row_index=rows.collect {|v|
195
- v.is_a?(Numeric) ? v : fields_x.index(v)
280
+ r=v.is_a?(Numeric) ? v : fields_x.index(v)
281
+ raise "Index #{v} doesn't exists on matrix" if r.nil?
282
+ r
196
283
  }
197
284
  column_index=columns.collect {|v|
198
- v.is_a?(Numeric) ? v : fields_y.index(v)
285
+ r=v.is_a?(Numeric) ? v : fields_y.index(v)
286
+ raise "Index #{v} doesn't exists on matrix" if r.nil?
287
+ r
199
288
  }
200
-
289
+
290
+
201
291
  fx=row_index.collect {|v| fields_x[v]}
202
292
  fy=column_index.collect {|v| fields_y[v]}
203
293
 
@@ -206,14 +296,14 @@ module Statsample
206
296
  matrix.extend CovariateMatrix
207
297
  matrix.fields_x=fx
208
298
  matrix.fields_y=fy
209
- matrix.type=type
299
+ matrix._type=_type
210
300
  matrix
211
301
  end
212
302
  def report_building(generator)
213
- @name||= (type==:correlation ? _("Correlation"):_("Covariance"))+_(" Matrix")
303
+ @name||= (_type==:correlation ? _("Correlation"):_("Covariance"))+_(" Matrix")
214
304
  generator.table(:name=>@name, :header=>[""]+fields_y) do |t|
215
305
  row_size.times {|i|
216
- t.row([fields_x[i]]+@rows[i].collect {|i1|
306
+ t.row([fields_x[i]]+row(i).to_a.collect {|i1|
217
307
  i1.nil? ? "--" : sprintf("%0.3f",i1).gsub("0.",".")
218
308
  })
219
309
  }
@@ -22,6 +22,9 @@ module Statsample
22
22
  # * Logit Regression: Statsample::Regression::Binomial::Logit
23
23
  # * Probit Regression: Statsample::Regression::Binomial::Probit
24
24
  module Regression
25
+
26
+ LinearDependency=Class.new(Exception)
27
+
25
28
  # Create a Statsample::Regression::Simple object, for simple regression
26
29
  # * x: independent Vector
27
30
  # * y: dependent Vector
@@ -78,7 +81,7 @@ module Statsample
78
81
  if missing_data==:pairwise
79
82
  Statsample::Regression::Multiple::RubyEngine.new(ds,y_var, opts)
80
83
  else
81
- if Statsample.has_gsl?
84
+ if Statsample.has_gsl? and false
82
85
  Statsample::Regression::Multiple::GslEngine.new(ds, y_var, opts)
83
86
  else
84
87
  ds2=ds.dup_only_valid
@@ -49,7 +49,7 @@ module Statsample
49
49
  end
50
50
  # Constant standard error
51
51
  def constant_se
52
- i=@fields.index :_constant
52
+ i=@fields.index "_constant"
53
53
  Math::sqrt(@var_cov_matrix[i,i])
54
54
  end
55
55
  # Regression coefficients
@@ -12,6 +12,8 @@ module Statsample
12
12
  attr_reader :valid_cases
13
13
  # Number of total cases (dataset.cases)
14
14
  attr_reader :total_cases
15
+
16
+ attr_accessor :digits
15
17
  def self.univariate?
16
18
  true
17
19
  end
@@ -23,9 +25,15 @@ module Statsample
23
25
  @y_var=y_var
24
26
  @r2=nil
25
27
  @name=_("Multiple Regression: %s over %s") % [ ds.fields.join(",") , @y_var]
26
- opts.each{|k,v|
28
+
29
+
30
+ opts_default={:digits=>3}
31
+ @opts=opts_default.merge opts
32
+
33
+ @opts.each{|k,v|
27
34
  self.send("#{k}=",v) if self.respond_to? k
28
35
  }
36
+
29
37
  end
30
38
  # Calculate F Test
31
39
  def anova
@@ -159,7 +167,7 @@ module Statsample
159
167
  columns.unshift([1.0]*@valid_cases)
160
168
  x=Matrix.columns(columns)
161
169
  matrix=((x.t*x)).inverse * mse
162
- matrix.collect {|i| Math::sqrt(i) if i>0 }
170
+ matrix.collect {|i| Math::sqrt(i) if i>=0 }
163
171
  end
164
172
  # T for constant
165
173
  def constant_t
@@ -170,24 +178,26 @@ module Statsample
170
178
  estimated_variance_covariance_matrix[0,0]
171
179
  end
172
180
  def report_building(b)
181
+ di="%0.#{digits}f"
173
182
  b.section(:name=>@name) do |g|
174
183
  c=coeffs
175
184
  g.text _("Engine: %s") % self.class
176
185
  g.text(_("Cases(listwise)=%d(%d)") % [@total_cases, @valid_cases])
177
- g.text _("R=%0.3f") % r
178
- g.text _("R^2=%0.3f") % r2
179
- g.text _"R^2 Adj=%0.3f" % r2_adjusted
180
- g.text _("Std.Error R=%0.3f") % se_estimate
186
+ g.text _("R=#{di}") % r
187
+ g.text _("R^2=#{di}") % r2
188
+ g.text _"R^2 Adj=#{di}" % r2_adjusted
189
+ g.text _("Std.Error R=#{di}") % se_estimate
181
190
 
182
- g.text(_("Equation")+"="+ sprintf('%0.3f',constant) +" + "+ @fields.collect {|k| sprintf('%0.3f%s',c[k],k)}.join(' + ') )
191
+ g.text(_("Equation")+"="+ sprintf(di,constant) +" + "+ @fields.collect {|k| sprintf("#{di}%s",c[k],k)}.join(' + ') )
183
192
 
184
193
  g.parse_element(anova)
185
194
  sc=standarized_coeffs
195
+
186
196
  cse=coeffs_se
187
197
  g.table(:name=>_("Beta coefficients"), :header=>%w{coeff b beta se t}.collect{|field| _(field)} ) do |t|
188
- t.row([_("Constant"), sprintf("%0.3f", constant), "-", constant_se.nil? ? "": sprintf("%0.3f", constant_se), constant_t.nil? ? "" : sprintf("%0.3f", constant_t)])
198
+ t.row([_("Constant"), sprintf(di, constant), "-", constant_se.nil? ? "": sprintf(di, constant_se), constant_t.nil? ? "" : sprintf(di, constant_t)])
189
199
  @fields.each do |f|
190
- t.row([f, sprintf("%0.3f", c[f]), sprintf("%0.3f", sc[f]), sprintf("%0.3f", cse[f]), sprintf("%0.3f", c[f].quo(cse[f]))])
200
+ t.row([f, sprintf(di, c[f]), sprintf(di, sc[f]), sprintf(di, cse[f]), sprintf(di, c[f].quo(cse[f]))])
191
201
  end
192
202
  end
193
203
  end
@@ -1,131 +1,132 @@
1
1
  if Statsample.has_gsl?
2
- module Statsample
3
- module Regression
4
- module Multiple
5
- # Class for Multiple Regression Analysis
6
- # Requires rbgsl and uses a listwise aproach.
7
- # Slower on prediction of values than Alglib, because predict is ruby based.
8
- # Better memory management on multiple (+1000) series of regression.
9
- # If you need pairwise, use RubyEngine
10
- # Example:
11
- #
12
- # @a=[1,3,2,4,3,5,4,6,5,7].to_vector(:scale)
13
- # @b=[3,3,4,4,5,5,6,6,4,4].to_vector(:scale)
14
- # @c=[11,22,30,40,50,65,78,79,99,100].to_vector(:scale)
15
- # @y=[3,4,5,6,7,8,9,10,20,30].to_vector(:scale)
16
- # ds={'a'=>@a,'b'=>@b,'c'=>@c,'y'=>@y}.to_dataset
17
- # lr=Statsample::Regression::Multiple::GslEngine.new(ds,'y')
18
- #
19
- class GslEngine < BaseEngine
20
- def initialize(ds,y_var, opts=Hash.new)
21
- super
22
- @ds=ds.dup_only_valid
23
- @ds_valid=@ds
24
- @valid_cases=@ds_valid.cases
25
- @dy=@ds[@y_var]
26
- @ds_indep=ds.dup(ds.fields-[y_var])
27
- # Create a custom matrix
28
- columns=[]
29
- @fields=[]
30
- max_deps = GSL::Matrix.alloc(@ds.cases, @ds.fields.size)
31
- constant_col=@ds.fields.size-1
32
- for i in 0...@ds.cases
33
- max_deps.set(i,constant_col,1)
34
- end
35
- j=0
36
- @ds.fields.each{|f|
37
- if f!=@y_var
38
- @ds[f].each_index{|i1|
39
- max_deps.set(i1,j,@ds[f][i1])
2
+ module Statsample
3
+ module Regression
4
+ module Multiple
5
+ # Class for Multiple Regression Analysis
6
+ # Requires rbgsl and uses a listwise aproach.
7
+ # Slower on prediction of values than Alglib, because predict is ruby based.
8
+ # Better memory management on multiple (+1000) series of regression.
9
+ # If you need pairwise, use RubyEngine
10
+ # Example:
11
+ #
12
+ # @a=[1,3,2,4,3,5,4,6,5,7].to_vector(:scale)
13
+ # @b=[3,3,4,4,5,5,6,6,4,4].to_vector(:scale)
14
+ # @c=[11,22,30,40,50,65,78,79,99,100].to_vector(:scale)
15
+ # @y=[3,4,5,6,7,8,9,10,20,30].to_vector(:scale)
16
+ # ds={'a'=>@a,'b'=>@b,'c'=>@c,'y'=>@y}.to_dataset
17
+ # lr=Statsample::Regression::Multiple::GslEngine.new(ds,'y')
18
+ #
19
+ class GslEngine < BaseEngine
20
+ def initialize(ds,y_var, opts=Hash.new)
21
+ super
22
+ @ds=ds.dup_only_valid
23
+ @ds_valid=@ds
24
+ @valid_cases=@ds_valid.cases
25
+ @dy=@ds[@y_var]
26
+ @ds_indep=ds.dup(ds.fields-[y_var])
27
+ # Create a custom matrix
28
+ columns=[]
29
+ @fields=[]
30
+ max_deps = GSL::Matrix.alloc(@ds.cases, @ds.fields.size)
31
+ constant_col=@ds.fields.size-1
32
+ for i in 0...@ds.cases
33
+ max_deps.set(i,constant_col,1)
34
+ end
35
+ j=0
36
+ @ds.fields.each{|f|
37
+ if f!=@y_var
38
+ @ds[f].each_index{|i1|
39
+ max_deps.set(i1,j,@ds[f][i1])
40
+ }
41
+ columns.push(@ds[f].to_a)
42
+ @fields.push(f)
43
+ j+=1
44
+ end
40
45
  }
41
- columns.push(@ds[f].to_a)
42
- @fields.push(f)
43
- j+=1
46
+ @dep_columns=columns.dup
47
+ @lr_s=nil
48
+ c, @cov, @chisq, @status = GSL::MultiFit.linear(max_deps, @dy.gsl)
49
+ @constant=c[constant_col]
50
+ @coeffs_a=c.to_a.slice(0...constant_col)
51
+ @coeffs=assign_names(@coeffs_a)
52
+ c=nil
53
+ end
54
+
55
+ def _dump(i)
56
+ Marshal.dump({'ds'=>@ds,'y_var'=>@y_var})
57
+ end
58
+ def self._load(data)
59
+ h=Marshal.load(data)
60
+ self.new(h['ds'], h['y_var'])
61
+ end
62
+
63
+ def coeffs
64
+ @coeffs
65
+ end
66
+ # Coefficients using a constant
67
+ # Based on http://www.xycoon.com/ols1.htm
68
+ def matrix_resolution
69
+ columns=@dep_columns.dup.map {|xi| xi.map{|i| i.to_f}}
70
+ columns.unshift([1.0]*@ds.cases)
71
+ y=Matrix.columns([@dy.data.map {|i| i.to_f}])
72
+ x=Matrix.columns(columns)
73
+ xt=x.t
74
+ matrix=((xt*x)).inverse*xt
75
+ matrix*y
76
+ end
77
+ def r2
78
+ r**2
79
+ end
80
+ def r
81
+ Bivariate::pearson(@dy, predicted)
82
+ end
83
+ def sst
84
+ @dy.ss
85
+ end
86
+ def constant
87
+ @constant
88
+ end
89
+ def standarized_coeffs
90
+ l=lr_s
91
+ l.coeffs
92
+ end
93
+ def lr_s
94
+ if @lr_s.nil?
95
+ build_standarized
96
+ end
97
+ @lr_s
98
+ end
99
+ def build_standarized
100
+ @ds_s=@ds.standarize
101
+ @lr_s=GslEngine.new(@ds_s,@y_var)
102
+ end
103
+ def process_s(v)
104
+ lr_s.process(v)
105
+ end
106
+ # ???? Not equal to SPSS output
107
+ def standarized_residuals
108
+ res=residuals
109
+ red_sd=residuals.sds
110
+ res.collect {|v|
111
+ v.quo(red_sd)
112
+ }.to_vector(:scale)
113
+ end
114
+
115
+ # Standard error for coeffs
116
+ def coeffs_se
117
+ out={}
118
+ evcm=estimated_variance_covariance_matrix
119
+ @ds_valid.fields.each_with_index do |f,i|
120
+
121
+ mi=i+1
122
+ next if f==@y_var
123
+ out[f]=evcm[mi,mi]
124
+ end
125
+ out
126
+ end
127
+
44
128
  end
45
- }
46
- @dep_columns=columns.dup
47
- @lr_s=nil
48
- c, @cov, @chisq, @status = GSL::MultiFit.linear(max_deps, @dy.gsl)
49
- @constant=c[constant_col]
50
- @coeffs_a=c.to_a.slice(0...constant_col)
51
- @coeffs=assign_names(@coeffs_a)
52
- c=nil
53
- end
54
-
55
- def _dump(i)
56
- Marshal.dump({'ds'=>@ds,'y_var'=>@y_var})
57
- end
58
- def self._load(data)
59
- h=Marshal.load(data)
60
- self.new(h['ds'], h['y_var'])
61
- end
62
-
63
- def coeffs
64
- @coeffs
65
- end
66
- # Coefficients using a constant
67
- # Based on http://www.xycoon.com/ols1.htm
68
- def matrix_resolution
69
- columns=@dep_columns.dup.map {|xi| xi.map{|i| i.to_f}}
70
- columns.unshift([1.0]*@ds.cases)
71
- y=Matrix.columns([@dy.data.map {|i| i.to_f}])
72
- x=Matrix.columns(columns)
73
- xt=x.t
74
- matrix=((xt*x)).inverse*xt
75
- matrix*y
76
- end
77
- def r2
78
- r**2
79
- end
80
- def r
81
- Bivariate::pearson(@dy, predicted)
82
- end
83
- def sst
84
- @dy.ss
85
- end
86
- def constant
87
- @constant
88
- end
89
- def standarized_coeffs
90
- l=lr_s
91
- l.coeffs
92
- end
93
- def lr_s
94
- if @lr_s.nil?
95
- build_standarized
96
- end
97
- @lr_s
98
- end
99
- def build_standarized
100
- @ds_s=@ds.standarize
101
- @lr_s=GslEngine.new(@ds_s,@y_var)
102
- end
103
- def process_s(v)
104
- lr_s.process(v)
105
- end
106
- # ???? Not equal to SPSS output
107
- def standarized_residuals
108
- res=residuals
109
- red_sd=residuals.sds
110
- res.collect {|v|
111
- v.quo(red_sd)
112
- }.to_vector(:scale)
113
- end
114
-
115
- # Standard error for coeffs
116
- def coeffs_se
117
- out={}
118
- evcm=estimated_variance_covariance_matrix
119
- @ds_valid.fields.each_with_index do |f,i|
120
- mi=i+1
121
- next if f==@y_var
122
- out[f]=evcm[mi,mi]
129
+ end
123
130
  end
124
- out
125
- end
126
-
127
- end
128
- end
129
- end
130
- end # for Statsample
131
+ end # for Statsample
131
132
  end # for if