statsample 1.5.0 → 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (114)
  1. checksums.yaml +4 -4
  2. data/.build.sh +15 -0
  3. data/.gitignore +1 -0
  4. data/.travis.yml +19 -7
  5. data/CONTRIBUTING.md +33 -0
  6. data/History.txt +5 -0
  7. data/README.md +41 -53
  8. data/benchmarks/correlation_matrix_15_variables.rb +6 -5
  9. data/benchmarks/correlation_matrix_5_variables.rb +6 -5
  10. data/benchmarks/correlation_matrix_methods/correlation_matrix.rb +23 -26
  11. data/examples/boxplot.rb +17 -5
  12. data/examples/correlation_matrix.rb +36 -7
  13. data/examples/dataset.rb +25 -5
  14. data/examples/dominance_analysis.rb +8 -7
  15. data/examples/dominance_analysis_bootstrap.rb +16 -11
  16. data/examples/histogram.rb +16 -2
  17. data/examples/icc.rb +5 -6
  18. data/examples/levene.rb +17 -3
  19. data/examples/multiple_regression.rb +6 -3
  20. data/examples/parallel_analysis.rb +11 -6
  21. data/examples/polychoric.rb +26 -13
  22. data/examples/principal_axis.rb +8 -4
  23. data/examples/reliability.rb +10 -10
  24. data/examples/scatterplot.rb +8 -0
  25. data/examples/t_test.rb +7 -0
  26. data/examples/u_test.rb +10 -2
  27. data/examples/vector.rb +9 -6
  28. data/examples/velicer_map_test.rb +12 -8
  29. data/lib/statsample.rb +13 -47
  30. data/lib/statsample/analysis/suite.rb +1 -1
  31. data/lib/statsample/anova/oneway.rb +6 -6
  32. data/lib/statsample/anova/twoway.rb +26 -24
  33. data/lib/statsample/bivariate.rb +78 -61
  34. data/lib/statsample/bivariate/pearson.rb +2 -2
  35. data/lib/statsample/codification.rb +45 -32
  36. data/lib/statsample/converter/csv.rb +15 -53
  37. data/lib/statsample/converter/spss.rb +6 -5
  38. data/lib/statsample/converters.rb +50 -211
  39. data/lib/statsample/crosstab.rb +26 -25
  40. data/lib/statsample/daru.rb +117 -0
  41. data/lib/statsample/dataset.rb +70 -942
  42. data/lib/statsample/dominanceanalysis.rb +16 -17
  43. data/lib/statsample/dominanceanalysis/bootstrap.rb +26 -28
  44. data/lib/statsample/factor/parallelanalysis.rb +17 -19
  45. data/lib/statsample/factor/pca.rb +21 -20
  46. data/lib/statsample/factor/principalaxis.rb +3 -3
  47. data/lib/statsample/graph/boxplot.rb +8 -16
  48. data/lib/statsample/graph/histogram.rb +4 -4
  49. data/lib/statsample/graph/scatterplot.rb +8 -7
  50. data/lib/statsample/histogram.rb +128 -119
  51. data/lib/statsample/matrix.rb +20 -16
  52. data/lib/statsample/multiset.rb +39 -38
  53. data/lib/statsample/regression.rb +3 -3
  54. data/lib/statsample/regression/multiple.rb +8 -10
  55. data/lib/statsample/regression/multiple/alglibengine.rb +96 -89
  56. data/lib/statsample/regression/multiple/baseengine.rb +32 -32
  57. data/lib/statsample/regression/multiple/gslengine.rb +33 -36
  58. data/lib/statsample/regression/multiple/matrixengine.rb +7 -9
  59. data/lib/statsample/regression/multiple/rubyengine.rb +39 -41
  60. data/lib/statsample/reliability.rb +23 -25
  61. data/lib/statsample/reliability/icc.rb +8 -7
  62. data/lib/statsample/reliability/multiscaleanalysis.rb +14 -12
  63. data/lib/statsample/reliability/scaleanalysis.rb +58 -60
  64. data/lib/statsample/reliability/skillscaleanalysis.rb +34 -29
  65. data/lib/statsample/resample.rb +1 -1
  66. data/lib/statsample/shorthand.rb +29 -25
  67. data/lib/statsample/test/kolmogorovsmirnov.rb +5 -3
  68. data/lib/statsample/test/levene.rb +28 -27
  69. data/lib/statsample/test/t.rb +7 -9
  70. data/lib/statsample/test/umannwhitney.rb +28 -28
  71. data/lib/statsample/test/wilcoxonsignedrank.rb +45 -43
  72. data/lib/statsample/vector.rb +70 -1013
  73. data/lib/statsample/version.rb +1 -1
  74. data/statsample.gemspec +12 -16
  75. data/test/helpers_tests.rb +1 -1
  76. data/test/test_analysis.rb +17 -17
  77. data/test/test_anova_contrast.rb +6 -6
  78. data/test/test_anovatwowaywithdataset.rb +8 -8
  79. data/test/test_anovawithvectors.rb +8 -8
  80. data/test/test_awesome_print_bug.rb +1 -1
  81. data/test/test_bartlettsphericity.rb +4 -4
  82. data/test/test_bivariate.rb +48 -43
  83. data/test/test_codification.rb +33 -33
  84. data/test/test_crosstab.rb +9 -9
  85. data/test/test_dataset.rb +28 -458
  86. data/test/test_factor.rb +46 -38
  87. data/test/test_factor_pa.rb +22 -13
  88. data/test/test_ggobi.rb +4 -4
  89. data/test/test_gsl.rb +4 -4
  90. data/test/test_histogram.rb +3 -3
  91. data/test/test_matrix.rb +13 -13
  92. data/test/test_multiset.rb +103 -91
  93. data/test/test_regression.rb +57 -52
  94. data/test/test_reliability.rb +55 -45
  95. data/test/test_reliability_icc.rb +8 -8
  96. data/test/test_reliability_skillscale.rb +26 -24
  97. data/test/test_resample.rb +1 -1
  98. data/test/test_statistics.rb +3 -13
  99. data/test/test_stest.rb +9 -9
  100. data/test/test_stratified.rb +3 -3
  101. data/test/test_test_t.rb +12 -12
  102. data/test/test_umannwhitney.rb +2 -2
  103. data/test/test_vector.rb +76 -613
  104. data/test/test_wilcoxonsignedrank.rb +4 -4
  105. metadata +57 -28
  106. data/lib/statsample/rserve_extension.rb +0 -20
  107. data/lib/statsample/vector/gsl.rb +0 -106
  108. data/test/fixtures/repeated_fields.csv +0 -7
  109. data/test/fixtures/scientific_notation.csv +0 -4
  110. data/test/fixtures/test_csv.csv +0 -7
  111. data/test/fixtures/test_xls.xls +0 -0
  112. data/test/test_csv.rb +0 -63
  113. data/test/test_rserve_extension.rb +0 -42
  114. data/test/test_xls.rb +0 -52
@@ -1,5 +1,8 @@
  #!/usr/bin/ruby
  $:.unshift(File.dirname(__FILE__)+'/../lib/')
+ # == Description
+ #
+ # Velicer MAP test.

  require 'statsample'

@@ -15,17 +18,18 @@ Statsample::Analysis.store(Statsample::Factor::MAP) do
  vectors={}

  variables.times do |i|
- vectors["v#{i}"]=samples.times.collect {|nv|
- if i<5
- f1[nv]*5 + f2[nv] *2 +rng.call
- else
- f1[nv]*2 + f2[nv] *3 +rng.call
- end
- }.to_numeric
+ vectors["v#{i}".to_sym]= Daru::Vector.new(
+ samples.times.collect do |nv|
+ if i<5
+ f1[nv]*5 + f2[nv] *2 +rng.call
+ else
+ f1[nv]*2 + f2[nv] *3 +rng.call
+ end
+ end)
  end


- ds=vectors.to_dataset
+ ds = Daru::DataFrame.new(vectors)
  cor=cor(ds)
  pca=pca(cor)

@@ -22,6 +22,8 @@ require 'extendmatrix'
  require 'distribution'
  require 'dirty-memoize'
  require 'reportbuilder'
+ require 'daru'
+ require 'statsample/daru'

  class Numeric
  def square
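The two new requires above pull daru into the core library, which replaces the old built-in Vector/Dataset classes throughout this release (the example file rewritten earlier in this diff shows the same shift). A minimal sketch of the 2.0.0 construction style; the column names and data below are illustrative, not taken from the gem:

    require 'daru'

    # Columns are Daru::Vector objects keyed by symbols...
    vectors = {
      v0: Daru::Vector.new([1.0, 2.0, 3.0]),
      v1: Daru::Vector.new([2.0, 4.0, 6.0])
    }

    # ...and a dataset is now a Daru::DataFrame wrapping that hash,
    # where 1.5.0 used hash.to_dataset.
    ds = Daru::DataFrame.new(vectors)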
@@ -52,42 +54,6 @@ class Module
  end

  class Array
- # Recode repeated values on an array, adding the number of repetition
- # at the end
- # Example:
- # a=%w{a b c c d d d e}
- # a.recode_repeated
- # => ["a","b","c_1","c_2","d_1","d_2","d_3","e"]
- def recode_repeated
- if size != uniq.size
- # Find repeated
- repeated = inject({}) do |acc, v|
- if acc[v].nil?
- acc[v] = 1
- else
- acc[v] += 1
- end
- acc
- end.select { |_k, v| v > 1 }.keys
-
- ns = repeated.inject({}) do |acc, v|
- acc[v] = 0
- acc
- end
-
- collect do |f|
- if repeated.include? f
- ns[f] += 1
- sprintf('%s_%d', f, ns[f])
- else
- f
- end
- end
- else
- self
- end
- end
-
  def sum
  inject(:+)
  end
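The Array#recode_repeated core extension is gone in 2.0.0. Code that still needs the old behaviour can carry its own copy; below is a standalone sketch based on the removed implementation above (the free-standing method name is ours, not the gem's):

    # Suffixes repeated values with _1, _2, ... in order of appearance,
    # mirroring the removed Array#recode_repeated monkey patch.
    def recode_repeated(values)
      return values if values.size == values.uniq.size

      counts   = values.each_with_object(Hash.new(0)) { |v, acc| acc[v] += 1 }
      repeated = counts.select { |_k, c| c > 1 }.keys
      seen     = Hash.new(0)

      values.map do |v|
        next v unless repeated.include?(v)
        seen[v] += 1
        format('%s_%d', v, seen[v])
      end
    end

    recode_repeated(%w{a b c c d d d e})
    # => ["a", "b", "c_1", "c_2", "d_1", "d_2", "d_3", "e"]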
@@ -218,7 +184,7 @@ module Statsample
  size = vs[0].size

  vs.each do |v|
- fail ArgumentError, 'Arguments should be Vector' unless v.instance_of? Statsample::Vector
+ fail ArgumentError, 'Arguments should be Vector' unless v.instance_of? Daru::Vector
  fail ArgumentError, 'Vectors size should be the same' if v.size != size
  end

@@ -228,26 +194,26 @@ module Statsample
  # Returns a duplicate of the input vectors, without missing data
  # for any of the vectors.
  #
- # a=[1,2,3,6,7,nil,3,5].to_numeric
- # b=[nil,nil,5,6,4,5,10,2].to_numeric
- # c=[2,4,6,7,4,5,6,7].to_numeric
+ # a = Daru::Vector.new([1,2,3,6,7,nil,3,5])
+ # b = Daru::Vector.new([nil,nil,5,6,4,5,10,2])
+ # c = Daru::Vector.new([2,4,6,7,4,5,6,7])
  # a2,b2,c2=Statsample.only_valid(a,b,c)
- # => [#<Statsample::Scale:0xb748c8c8 @data=[3, 6, 7, 3, 5]>,
- # #<Statsample::Scale:0xb748c814 @data=[5, 6, 4, 10, 2]>,
- # #<Statsample::Scale:0xb748c760 @data=[6, 7, 4, 6, 7]>]
+ # => [#<Daru::Vector:0xb748c8c8 @data=[3, 6, 7, 3, 5]>,
+ # #<Daru::Vector:0xb748c814 @data=[5, 6, 4, 10, 2]>,
+ # #<Daru::Vector:0xb748c760 @data=[6, 7, 4, 6, 7]>]
  #
  def only_valid(*vs)
  i = 1
- h = vs.inject({}) { |acc, v| acc["v#{i}"] = v; i += 1; acc }
- ds = Statsample::Dataset.new(h).dup_only_valid
- ds.vectors.values
+ h = vs.inject({}) { |acc, v| acc["v#{i}".to_sym] = v; i += 1; acc }
+ df = Daru::DataFrame.new(h).dup_only_valid
+ df.map { |v| v }
  end

  # Cheap version of #only_valid.
  # If any vectors have missing_values, return only valid.
  # If not, return the vectors itself
  def only_valid_clone(*vs)
- if vs.any?(&:flawed?)
+ if vs.any?(&:has_missing_data?)
  only_valid(*vs)
  else
  vs
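The updated docstring above already shows the Daru calling convention; restated as a runnable snippet (the output values are the ones quoted in that docstring):

    require 'statsample'

    a = Daru::Vector.new([1, 2, 3, 6, 7, nil, 3, 5])
    b = Daru::Vector.new([nil, nil, 5, 6, 4, 5, 10, 2])
    c = Daru::Vector.new([2, 4, 6, 7, 4, 5, 6, 7])

    # Rows where any of the three vectors is missing are dropped from all of them.
    a2, b2, c2 = Statsample.only_valid(a, b, c)
    a2.to_a  # => [3, 6, 7, 3, 5]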
@@ -80,7 +80,7 @@ module Statsample

  def method_missing(name, *args,&block)
  @attached.reverse.each do |ds|
- return ds[name.to_s] if ds.fields.include? (name.to_s)
+ return ds[name] if ds.vectors.to_a.include? (name)
  end
  raise "Method #{name} doesn't exists"
  end
@@ -67,9 +67,9 @@ module Statsample

  # One Way Anova with vectors
  # Example:
- # v1=[2,3,4,5,6].to_numeric
- # v2=[3,3,4,5,6].to_numeric
- # v3=[5,3,1,5,6].to_numeric
+ # v1 = Daru::Vector.new([2,3,4,5,6])
+ # v2 = Daru::Vector.new([3,3,4,5,6])
+ # v3 = Daru::Vector.new([5,3,1,5,6])
  # anova=Statsample::Anova::OneWayWithVectors.new([v1,v2,v3])
  # anova.f
  # => 0.0243902439024391
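Restating the updated docstring as a runnable call, with the input vectors built as Daru::Vector instances (the constructor shown in the next hunk now filters its arguments with is_a? Daru::Vector):

    require 'statsample'

    v1 = Daru::Vector.new([2, 3, 4, 5, 6])
    v2 = Daru::Vector.new([3, 3, 4, 5, 6])
    v3 = Daru::Vector.new([5, 3, 1, 5, 6])

    anova = Statsample::Anova::OneWayWithVectors.new([v1, v2, v3])
    anova.f  # => 0.0243902439024391 (value quoted in the docstring above)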
@@ -90,10 +90,10 @@ module Statsample

  def initialize(*args)
  if args[0].is_a? Array
- @vectors=args.shift
+ @vectors = args.shift
  else
- @vectors=args.find_all {|v| v.is_a? Statsample::Vector}
- opts=args.find {|v| v.is_a? Hash}
+ @vectors = args.find_all {|v| v.is_a? Daru::Vector}
+ opts = args.find {|v| v.is_a? Hash}
  end
  opts||=Hash.new
  opts_default={:name=>_("Anova One-Way"),
@@ -107,9 +107,9 @@ module Statsample

  # Two Way Anova with vectors
  # Example:
- # v1=[1,1,2,2].to_numeric
- # v2=[1,2,1,2].to_numeric
- # v3=[5,3,1,5].to_numeric
+ # v1 = Daru::Vector.new([1,1,2,2])
+ # v2 = Daru::Vector.new([1,2,1,2])
+ # v3 = Daru::Vector.new([5,3,1,5])
  # anova=Statsample::Anova::TwoWayWithVectors.new(:a=>v1,:b=>v2, :dependent=>v3)
  #
  class TwoWayWithVectors < TwoWay
@@ -121,25 +121,26 @@ module Statsample
  # For now, only equal sample cells allowed
  def initialize(opts=Hash.new)
  raise "You should insert at least :a, :b and :dependent" unless [:a, :b, :dependent].all? {|v| opts.has_key? v}
- @a_var='a'
- @b_var='b'
- @dep_var='dependent'
- @a_vector, @b_vector, @dep_vector=Statsample.only_valid_clone opts[:a], opts[:b], opts[:dependent]
+ @a_var = :a
+ @b_var = :b
+ @dep_var = :dependent
+ @a_vector, @b_vector, @dep_vector =
+ Statsample.only_valid_clone opts[:a], opts[:b], opts[:dependent]

- ds={@a_var=>@a_vector, @b_var=>@b_vector, @dep_var=>@dep_vector}.to_dataset
- @ds=ds.clone_only_valid
- _p=@a_vector.factors.size
- _q=@b_vector.factors.size
- @x_general=@dep_vector.mean
- @axb_means={}
- @axb_sd={}
- @vectors=[]
+ ds = Daru::DataFrame.new({@a_var=>@a_vector, @b_var=>@b_vector, @dep_var=>@dep_vector})
+ @ds = ds.clone_only_valid
+ _p = @a_vector.factors.size
+ _q = @b_vector.factors.size
+ @x_general = @dep_vector.mean
+ @axb_means = {}
+ @axb_sd = {}
+ @vectors = []
  n=nil
  @ds.to_multiset_by_split(a_var,b_var).each_vector(dep_var) {|k,v|
- @axb_means[k]=v.mean
- @axb_sd[k]=v.sd
+ @axb_means[k] = v.mean
+ @axb_sd[k] = v.sd
  @vectors << v
- n||=v.size
+ n ||= v.size
  raise "All cell sizes should be equal" if n!=v.size
  }

@@ -151,20 +152,21 @@ module Statsample
  @ds.to_multiset_by_split(b_var).each_vector(dep_var) {|k,v|
  @b_means[k]=v.mean
  }
- ss_a=n*_q*@ds[a_var].factors.inject(0) {|ac,v|
- ac+(@a_means[v]-@x_general)**2
+ ss_a = n*_q*@ds[a_var].factors.inject(0) {|ac,v|
+ ac + (@a_means[v]-@x_general)**2
  }
  ss_b=n*_p*@ds[b_var].factors.inject(0) {|ac,v|
  ac+(@b_means[v]-@x_general)**2
  }
- ss_within=@ds.collect {|row|
+ ss_within = @ds.collect(:row) { |row|
  (row[dep_var]-@axb_means[[row[a_var],row[b_var]]])**2
  }.sum
- ss_axb=n*@axb_means.inject(0) {|ac,v|
+ ss_axb = n*@axb_means.inject(0) {|ac,v|
  j,k=v[0]
  xjk=v[1]
  ac+(xjk-@a_means[j]-@b_means[k]+@x_general)**2
  }
+
  df_a=_p-1
  df_b=_q-1
  df_within=(_p*_q)*(n-1)
@@ -186,9 +188,9 @@ module Statsample
  def report_building(builder) #:nodoc:#
  builder.section(:name=>@name) do |s|
  if summary_descriptives
- s.table(:header =>['']+@ds[a_var].factors.map {|a| @ds[a_var].labeling(a)}+[_("%s Mean") % @name_b]) do |t|
+ s.table(:header =>['']+@ds[a_var].factors.map {|a| @ds[a_var].index_of(a)}+[_("%s Mean") % @name_b]) do |t|
  @ds[b_var].factors.each do |b|
- t.row([@ds[b_var].labeling(b)]+@ds[a_var].factors.map {|a| "%0.3f" % @axb_means[[a,b]] } + ["%0.3f" % @b_means[b]])
+ t.row([@ds[b_var].index_of(b)]+@ds[a_var].factors.map {|a| "%0.3f" % @axb_means[[a,b]] } + ["%0.3f" % @b_means[b]])
  end
  t.row([_("%s Mean") % @name_a]+@ds[a_var].factors.map {|a| "%0.3f" % @a_means[a]}+ ["%0.3f" % @x_general])
  end
@@ -12,9 +12,10 @@ module Statsample
  # Covariance between two vectors
  def covariance(v1,v2)
  v1a,v2a=Statsample.only_valid_clone(v1,v2)
+
  return nil if v1a.size==0
  if Statsample.has_gsl?
- GSL::Stats::covariance(v1a.gsl, v2a.gsl)
+ GSL::Stats::covariance(v1a.to_gsl, v2a.to_gsl)
  else
  covariance_slow(v1a,v2a)
  end
@@ -34,7 +35,9 @@ module Statsample
  sum_of_squares(v1a,v2a) / (v1a.size-1)
  end
  def sum_of_squares(v1,v2)
- v1a,v2a=Statsample.only_valid_clone(v1,v2)
+ v1a,v2a=Statsample.only_valid_clone(v1,v2)
+ v1a.reset_index!
+ v2a.reset_index!
  m1=v1a.mean
  m2=v2a.mean
  (v1a.size).times.inject(0) {|ac,i| ac+(v1a[i]-m1)*(v2a[i]-m2)}
@@ -44,13 +47,14 @@ module Statsample
  v1a,v2a=Statsample.only_valid_clone(v1,v2)
  return nil if v1a.size ==0
  if Statsample.has_gsl?
- GSL::Stats::correlation(v1a.gsl, v2a.gsl)
+ GSL::Stats::correlation(v1a.to_gsl, v2a.to_gsl)
  else
  pearson_slow(v1a,v2a)
  end
  end
  def pearson_slow(v1,v2) # :nodoc:
  v1a,v2a=Statsample.only_valid_clone(v1,v2)
+
  # Calculate sum of squares
  ss=sum_of_squares(v1a,v2a)
  ss.quo(Math::sqrt(v1a.sum_of_squares) * Math::sqrt(v2a.sum_of_squares))
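covariance and pearson keep their public signatures; the changes above only swap the GSL conversion to to_gsl and re-index the cleaned vectors before the slow path. A small usage sketch with Daru vectors, assuming the module-function call style this file already uses for Statsample::Bivariate.pearson:

    require 'statsample'

    x = Daru::Vector.new([1, 2, 3, 4, 5])
    y = Daru::Vector.new([2, 4, 6, 8, 10])

    # Missing data is stripped internally via Statsample.only_valid_clone;
    # the GSL branch is taken only when the gsl gem is available.
    Statsample::Bivariate.pearson(x, y)     # => 1.0
    Statsample::Bivariate.covariance(x, y)  # => 5.0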
@@ -118,14 +122,16 @@ module Statsample
  r=Statsample::Bivariate.pearson(from,del)
  froms, dels = from.vector_standarized, del.vector_standarized
  nv=[]
- froms.data_with_nils.each_index do |i|
+ froms.reset_index!
+ dels.reset_index!
+ froms.each_index do |i|
  if froms[i].nil? or dels[i].nil?
  nv.push(nil)
  else
  nv.push(froms[i]-r*dels[i])
  end
  end
- nv.to_vector(:numeric)
+ Daru::Vector.new(nv)
  end
  # Correlation between v1 and v2, controling the effect of
  # control on both.
@@ -135,7 +141,6 @@ module Statsample
  rv1con=pearson(v1a,cona)
  rv2con=pearson(v2a,cona)
  (rv1v2-(rv1con*rv2con)).quo(Math::sqrt(1-rv1con**2) * Math::sqrt(1-rv2con**2))
-
  end

  def covariance_matrix_optimized(ds)
@@ -153,50 +158,53 @@ module Statsample
  # Order of rows and columns depends on Dataset#fields order

  def covariance_matrix(ds)
- vars,cases=ds.fields.size,ds.cases
+ vars,cases = ds.ncols, ds.nrows
  if !ds.has_missing_data? and Statsample.has_gsl? and prediction_optimized(vars,cases) < prediction_pairwise(vars,cases)
  cm=covariance_matrix_optimized(ds)
  else
  cm=covariance_matrix_pairwise(ds)
-
  end
  cm.extend(Statsample::CovariateMatrix)
- cm.fields=ds.fields
+ cm.fields = ds.vectors.to_a
  cm
  end


  def covariance_matrix_pairwise(ds)
  cache={}
- matrix=ds.collect_matrix do |row,col|
- if (ds[row].type!=:numeric or ds[col].type!=:numeric)
- nil
- elsif row==col
- ds[row].variance
- else
- if cache[[col,row]].nil?
- cov=covariance(ds[row],ds[col])
- cache[[row,col]]=cov
- cov
+ vectors = ds.vectors.to_a
+ mat_rows = vectors.collect do |row|
+ vectors.collect do |col|
+ if (ds[row].type!=:numeric or ds[col].type!=:numeric)
+ nil
+ elsif row==col
+ ds[row].variance
  else
- cache[[col,row]]
+ if cache[[col,row]].nil?
+ cov=covariance(ds[row],ds[col])
+ cache[[row,col]]=cov
+ cov
+ else
+ cache[[col,row]]
+ end
  end
  end
  end
- matrix
+
+ Matrix.rows mat_rows
  end

  # Correlation matrix.
  # Order of rows and columns depends on Dataset#fields order
  def correlation_matrix(ds)
- vars,cases=ds.fields.size,ds.cases
+ vars, cases = ds.ncols, ds.nrows
  if !ds.has_missing_data? and Statsample.has_gsl? and prediction_optimized(vars,cases) < prediction_pairwise(vars,cases)
  cm=correlation_matrix_optimized(ds)
  else
  cm=correlation_matrix_pairwise(ds)
  end
  cm.extend(Statsample::CovariateMatrix)
- cm.fields=ds.fields
+ cm.fields = ds.vectors.to_a
  cm
  end

@@ -212,33 +220,43 @@ module Statsample
  end
  def correlation_matrix_pairwise(ds)
  cache={}
- cm=ds.collect_matrix do |row,col|
- if row==col
- 1.0
- elsif (ds[row].type!=:numeric or ds[col].type!=:numeric)
- nil
- else
- if cache[[col,row]].nil?
- r=pearson(ds[row],ds[col])
- cache[[row,col]]=r
- r
+ vectors = ds.vectors.to_a
+ cm = vectors.collect do |row|
+ vectors.collect do |col|
+ if row==col
+ 1.0
+ elsif (ds[row].type!=:numeric or ds[col].type!=:numeric)
+ nil
  else
- cache[[col,row]]
- end
+ if cache[[col,row]].nil?
+ r=pearson(ds[row],ds[col])
+ cache[[row,col]]=r
+ r
+ else
+ cache[[col,row]]
+ end
+ end
  end
  end
+
+ Matrix.rows cm
  end

  # Retrieves the n valid pairwise.
  def n_valid_matrix(ds)
- ds.collect_matrix do |row,col|
- if row==col
- ds[row].valid_data.size
- else
- rowa,rowb=Statsample.only_valid_clone(ds[row],ds[col])
- rowa.size
+ vectors = ds.vectors.to_a
+ m = vectors.collect do |row|
+ vectors.collect do |col|
+ if row==col
+ ds[row].only_valid.size
+ else
+ rowa,rowb = Statsample.only_valid_clone(ds[row],ds[col])
+ rowa.size
+ end
  end
  end
+
+ Matrix.rows m
  end
  end
  # Matrix of correlation probabilities.
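Both matrix builders now take their dimensions from ncols/nrows, iterate over ds.vectors.to_a instead of Dataset#fields, and assemble a plain ::Matrix with Matrix.rows. A hedged usage sketch (the sample frame is illustrative, not from the gem's tests):

    require 'statsample'

    df = Daru::DataFrame.new({
      :a => Daru::Vector.new([1, 2, 3, 4, 5]),
      :b => Daru::Vector.new([2, 4, 5, 4, 1])
    })

    # Returns a ::Matrix extended with Statsample::CovariateMatrix;
    # row/column order follows df.vectors.to_a, i.e. [:a, :b].
    r = Statsample::Bivariate.correlation_matrix(df)
    r[0, 1]  # Pearson r between :a and :b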
@@ -256,27 +274,27 @@ module Statsample

  # Spearman ranked correlation coefficient (rho) between 2 vectors
  def spearman(v1,v2)
- v1a,v2a=Statsample.only_valid_clone(v1,v2)
- v1r,v2r=v1a.ranked(:numeric),v2a.ranked(:numeric)
+ v1a,v2a = Statsample.only_valid_clone(v1,v2)
+ v1r,v2r = v1a.ranked, v2a.ranked
  pearson(v1r,v2r)
  end
  # Calculate Point biserial correlation. Equal to Pearson correlation, with
  # one dichotomous value replaced by "0" and the other by "1"
  def point_biserial(dichotomous,continous)
- ds={'d'=>dichotomous,'c'=>continous}.to_dataset.dup_only_valid
- raise(TypeError, "First vector should be dichotomous") if ds['d'].factors.size!=2
- raise(TypeError, "Second vector should be continous") if ds['c'].type!=:numeric
- f0=ds['d'].factors.sort[0]
- m0=ds.filter_field('c') {|c| c['d']==f0}
- m1=ds.filter_field('c') {|c| c['d']!=f0}
- ((m1.mean-m0.mean).to_f / ds['c'].sdp) * Math::sqrt(m0.size*m1.size.to_f / ds.cases**2)
+ ds = Daru::DataFrame.new({:d=>dichotomous,:c=>continous}).dup_only_valid
+ raise(TypeError, "First vector should be dichotomous") if ds[:d].factors.size != 2
+ raise(TypeError, "Second vector should be continous") if ds[:c].type != :numeric
+ f0=ds[:d].factors.sort.to_a[0]
+ m0=ds.filter_vector(:c) {|c| c[:d] == f0}
+ m1=ds.filter_vector(:c) {|c| c[:d] != f0}
+ ((m1.mean-m0.mean).to_f / ds[:c].sdp) * Math::sqrt(m0.size*m1.size.to_f / ds.nrows**2)
  end
  # Kendall Rank Correlation Coefficient (Tau a)
  # Based on Hervé Adbi article
  def tau_a(v1,v2)
  v1a,v2a=Statsample.only_valid_clone(v1,v2)
  n=v1.size
- v1r,v2r=v1a.ranked(:numeric),v2a.ranked(:numeric)
+ v1r,v2r=v1a.ranked,v2a.ranked
  o1=ordered_pairs(v1r)
  o2=ordered_pairs(v2r)
  delta= o1.size*2-(o2 & o1).size*2
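spearman and point_biserial keep the same signatures; ranking no longer takes a type argument and the internal dataset is a Daru::DataFrame keyed by symbols. A short usage sketch assuming the module-function call style used elsewhere in this file (data is illustrative):

    require 'statsample'

    dichotomous = Daru::Vector.new([0, 0, 0, 1, 1, 1])
    continuous  = Daru::Vector.new([1.0, 2.0, 3.5, 4.0, 5.5, 6.0])
    other       = Daru::Vector.new([2.0, 1.0, 3.0, 5.0, 4.5, 6.0])

    # Spearman rho over the two rank-transformed vectors.
    Statsample::Bivariate.spearman(continuous, other)

    # Point-biserial r: the first argument must have exactly two factor
    # levels, the second must be numeric.
    Statsample::Bivariate.point_biserial(dichotomous, continuous)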
@@ -348,14 +366,15 @@ module Statsample
  }
  {'P'=>conc,'Q'=>disc,'Y'=>ties_y,'X'=>ties_x}
  end
+
  def ordered_pairs(vector)
- d=vector.data
- a=[]
- (0...(d.size-1)).each{|i|
- ((i+1)...(d.size)).each {|j|
+ d = vector.to_a
+ a = []
+ (0...(d.size-1)).each do |i|
+ ((i+1)...(d.size)).each do |j|
  a.push([d[i],d[j]])
- }
- }
+ end
+ end
  a
  end
  =begin
@@ -371,8 +390,8 @@ module Statsample
  # Report the minimum number of cases valid of a covariate matrix
  # based on a dataset
  def min_n_valid(ds)
- min=ds.cases
- m=n_valid_matrix(ds)
+ min = ds.nrows
+ m = n_valid_matrix(ds)
  for x in 0...m.row_size
  for y in 0...m.column_size
  min=m[x,y] if m[x,y] < min
@@ -380,8 +399,6 @@
  end
  min
  end
-
-
  end
  end
  end