statsample 1.5.0 → 2.0.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (114) hide show
  1. checksums.yaml +4 -4
  2. data/.build.sh +15 -0
  3. data/.gitignore +1 -0
  4. data/.travis.yml +19 -7
  5. data/CONTRIBUTING.md +33 -0
  6. data/History.txt +5 -0
  7. data/README.md +41 -53
  8. data/benchmarks/correlation_matrix_15_variables.rb +6 -5
  9. data/benchmarks/correlation_matrix_5_variables.rb +6 -5
  10. data/benchmarks/correlation_matrix_methods/correlation_matrix.rb +23 -26
  11. data/examples/boxplot.rb +17 -5
  12. data/examples/correlation_matrix.rb +36 -7
  13. data/examples/dataset.rb +25 -5
  14. data/examples/dominance_analysis.rb +8 -7
  15. data/examples/dominance_analysis_bootstrap.rb +16 -11
  16. data/examples/histogram.rb +16 -2
  17. data/examples/icc.rb +5 -6
  18. data/examples/levene.rb +17 -3
  19. data/examples/multiple_regression.rb +6 -3
  20. data/examples/parallel_analysis.rb +11 -6
  21. data/examples/polychoric.rb +26 -13
  22. data/examples/principal_axis.rb +8 -4
  23. data/examples/reliability.rb +10 -10
  24. data/examples/scatterplot.rb +8 -0
  25. data/examples/t_test.rb +7 -0
  26. data/examples/u_test.rb +10 -2
  27. data/examples/vector.rb +9 -6
  28. data/examples/velicer_map_test.rb +12 -8
  29. data/lib/statsample.rb +13 -47
  30. data/lib/statsample/analysis/suite.rb +1 -1
  31. data/lib/statsample/anova/oneway.rb +6 -6
  32. data/lib/statsample/anova/twoway.rb +26 -24
  33. data/lib/statsample/bivariate.rb +78 -61
  34. data/lib/statsample/bivariate/pearson.rb +2 -2
  35. data/lib/statsample/codification.rb +45 -32
  36. data/lib/statsample/converter/csv.rb +15 -53
  37. data/lib/statsample/converter/spss.rb +6 -5
  38. data/lib/statsample/converters.rb +50 -211
  39. data/lib/statsample/crosstab.rb +26 -25
  40. data/lib/statsample/daru.rb +117 -0
  41. data/lib/statsample/dataset.rb +70 -942
  42. data/lib/statsample/dominanceanalysis.rb +16 -17
  43. data/lib/statsample/dominanceanalysis/bootstrap.rb +26 -28
  44. data/lib/statsample/factor/parallelanalysis.rb +17 -19
  45. data/lib/statsample/factor/pca.rb +21 -20
  46. data/lib/statsample/factor/principalaxis.rb +3 -3
  47. data/lib/statsample/graph/boxplot.rb +8 -16
  48. data/lib/statsample/graph/histogram.rb +4 -4
  49. data/lib/statsample/graph/scatterplot.rb +8 -7
  50. data/lib/statsample/histogram.rb +128 -119
  51. data/lib/statsample/matrix.rb +20 -16
  52. data/lib/statsample/multiset.rb +39 -38
  53. data/lib/statsample/regression.rb +3 -3
  54. data/lib/statsample/regression/multiple.rb +8 -10
  55. data/lib/statsample/regression/multiple/alglibengine.rb +96 -89
  56. data/lib/statsample/regression/multiple/baseengine.rb +32 -32
  57. data/lib/statsample/regression/multiple/gslengine.rb +33 -36
  58. data/lib/statsample/regression/multiple/matrixengine.rb +7 -9
  59. data/lib/statsample/regression/multiple/rubyengine.rb +39 -41
  60. data/lib/statsample/reliability.rb +23 -25
  61. data/lib/statsample/reliability/icc.rb +8 -7
  62. data/lib/statsample/reliability/multiscaleanalysis.rb +14 -12
  63. data/lib/statsample/reliability/scaleanalysis.rb +58 -60
  64. data/lib/statsample/reliability/skillscaleanalysis.rb +34 -29
  65. data/lib/statsample/resample.rb +1 -1
  66. data/lib/statsample/shorthand.rb +29 -25
  67. data/lib/statsample/test/kolmogorovsmirnov.rb +5 -3
  68. data/lib/statsample/test/levene.rb +28 -27
  69. data/lib/statsample/test/t.rb +7 -9
  70. data/lib/statsample/test/umannwhitney.rb +28 -28
  71. data/lib/statsample/test/wilcoxonsignedrank.rb +45 -43
  72. data/lib/statsample/vector.rb +70 -1013
  73. data/lib/statsample/version.rb +1 -1
  74. data/statsample.gemspec +12 -16
  75. data/test/helpers_tests.rb +1 -1
  76. data/test/test_analysis.rb +17 -17
  77. data/test/test_anova_contrast.rb +6 -6
  78. data/test/test_anovatwowaywithdataset.rb +8 -8
  79. data/test/test_anovawithvectors.rb +8 -8
  80. data/test/test_awesome_print_bug.rb +1 -1
  81. data/test/test_bartlettsphericity.rb +4 -4
  82. data/test/test_bivariate.rb +48 -43
  83. data/test/test_codification.rb +33 -33
  84. data/test/test_crosstab.rb +9 -9
  85. data/test/test_dataset.rb +28 -458
  86. data/test/test_factor.rb +46 -38
  87. data/test/test_factor_pa.rb +22 -13
  88. data/test/test_ggobi.rb +4 -4
  89. data/test/test_gsl.rb +4 -4
  90. data/test/test_histogram.rb +3 -3
  91. data/test/test_matrix.rb +13 -13
  92. data/test/test_multiset.rb +103 -91
  93. data/test/test_regression.rb +57 -52
  94. data/test/test_reliability.rb +55 -45
  95. data/test/test_reliability_icc.rb +8 -8
  96. data/test/test_reliability_skillscale.rb +26 -24
  97. data/test/test_resample.rb +1 -1
  98. data/test/test_statistics.rb +3 -13
  99. data/test/test_stest.rb +9 -9
  100. data/test/test_stratified.rb +3 -3
  101. data/test/test_test_t.rb +12 -12
  102. data/test/test_umannwhitney.rb +2 -2
  103. data/test/test_vector.rb +76 -613
  104. data/test/test_wilcoxonsignedrank.rb +4 -4
  105. metadata +57 -28
  106. data/lib/statsample/rserve_extension.rb +0 -20
  107. data/lib/statsample/vector/gsl.rb +0 -106
  108. data/test/fixtures/repeated_fields.csv +0 -7
  109. data/test/fixtures/scientific_notation.csv +0 -4
  110. data/test/fixtures/test_csv.csv +0 -7
  111. data/test/fixtures/test_xls.xls +0 -0
  112. data/test/test_csv.rb +0 -63
  113. data/test/test_rserve_extension.rb +0 -42
  114. data/test/test_xls.rb +0 -52
@@ -1,5 +1,8 @@
1
1
  #!/usr/bin/ruby
2
2
  $:.unshift(File.dirname(__FILE__)+'/../lib/')
3
+ # == Description
4
+ #
5
+ # Velicer MAP test.
3
6
 
4
7
  require 'statsample'
5
8
 
@@ -15,17 +18,18 @@ Statsample::Analysis.store(Statsample::Factor::MAP) do
15
18
  vectors={}
16
19
 
17
20
  variables.times do |i|
18
- vectors["v#{i}"]=samples.times.collect {|nv|
19
- if i<5
20
- f1[nv]*5 + f2[nv] *2 +rng.call
21
- else
22
- f1[nv]*2 + f2[nv] *3 +rng.call
23
- end
24
- }.to_numeric
21
+ vectors["v#{i}".to_sym]= Daru::Vector.new(
22
+ samples.times.collect do |nv|
23
+ if i<5
24
+ f1[nv]*5 + f2[nv] *2 +rng.call
25
+ else
26
+ f1[nv]*2 + f2[nv] *3 +rng.call
27
+ end
28
+ end)
25
29
  end
26
30
 
27
31
 
28
- ds=vectors.to_dataset
32
+ ds = Daru::DataFrame.new(vectors)
29
33
  cor=cor(ds)
30
34
  pca=pca(cor)
31
35
 
@@ -22,6 +22,8 @@ require 'extendmatrix'
22
22
  require 'distribution'
23
23
  require 'dirty-memoize'
24
24
  require 'reportbuilder'
25
+ require 'daru'
26
+ require 'statsample/daru'
25
27
 
26
28
  class Numeric
27
29
  def square
@@ -52,42 +54,6 @@ class Module
52
54
  end
53
55
 
54
56
  class Array
55
- # Recode repeated values on an array, adding the number of repetition
56
- # at the end
57
- # Example:
58
- # a=%w{a b c c d d d e}
59
- # a.recode_repeated
60
- # => ["a","b","c_1","c_2","d_1","d_2","d_3","e"]
61
- def recode_repeated
62
- if size != uniq.size
63
- # Find repeated
64
- repeated = inject({}) do |acc, v|
65
- if acc[v].nil?
66
- acc[v] = 1
67
- else
68
- acc[v] += 1
69
- end
70
- acc
71
- end.select { |_k, v| v > 1 }.keys
72
-
73
- ns = repeated.inject({}) do |acc, v|
74
- acc[v] = 0
75
- acc
76
- end
77
-
78
- collect do |f|
79
- if repeated.include? f
80
- ns[f] += 1
81
- sprintf('%s_%d', f, ns[f])
82
- else
83
- f
84
- end
85
- end
86
- else
87
- self
88
- end
89
- end
90
-
91
57
  def sum
92
58
  inject(:+)
93
59
  end
@@ -218,7 +184,7 @@ module Statsample
218
184
  size = vs[0].size
219
185
 
220
186
  vs.each do |v|
221
- fail ArgumentError, 'Arguments should be Vector' unless v.instance_of? Statsample::Vector
187
+ fail ArgumentError, 'Arguments should be Vector' unless v.instance_of? Daru::Vector
222
188
  fail ArgumentError, 'Vectors size should be the same' if v.size != size
223
189
  end
224
190
 
@@ -228,26 +194,26 @@ module Statsample
228
194
  # Returns a duplicate of the input vectors, without missing data
229
195
  # for any of the vectors.
230
196
  #
231
- # a=[1,2,3,6,7,nil,3,5].to_numeric
232
- # b=[nil,nil,5,6,4,5,10,2].to_numeric
233
- # c=[2,4,6,7,4,5,6,7].to_numeric
197
+ # a = Daru::Vector.new([1,2,3,6,7,nil,3,5])
198
+ # b = Daru::Vector.new([nil,nil,5,6,4,5,10,2])
199
+ # c = Daru::Vector.new([2,4,6,7,4,5,6,7])
234
200
  # a2,b2,c2=Statsample.only_valid(a,b,c)
235
- # => [#<Statsample::Scale:0xb748c8c8 @data=[3, 6, 7, 3, 5]>,
236
- # #<Statsample::Scale:0xb748c814 @data=[5, 6, 4, 10, 2]>,
237
- # #<Statsample::Scale:0xb748c760 @data=[6, 7, 4, 6, 7]>]
201
+ # => [#<Daru::Vector:0xb748c8c8 @data=[3, 6, 7, 3, 5]>,
202
+ # #<Daru::Vector:0xb748c814 @data=[5, 6, 4, 10, 2]>,
203
+ # #<Daru::Vector:0xb748c760 @data=[6, 7, 4, 6, 7]>]
238
204
  #
239
205
  def only_valid(*vs)
240
206
  i = 1
241
- h = vs.inject({}) { |acc, v| acc["v#{i}"] = v; i += 1; acc }
242
- ds = Statsample::Dataset.new(h).dup_only_valid
243
- ds.vectors.values
207
+ h = vs.inject({}) { |acc, v| acc["v#{i}".to_sym] = v; i += 1; acc }
208
+ df = Daru::DataFrame.new(h).dup_only_valid
209
+ df.map { |v| v }
244
210
  end
245
211
 
246
212
  # Cheap version of #only_valid.
247
213
  # If any vectors have missing_values, return only valid.
248
214
  # If not, return the vectors themselves
249
215
  def only_valid_clone(*vs)
250
- if vs.any?(&:flawed?)
216
+ if vs.any?(&:has_missing_data?)
251
217
  only_valid(*vs)
252
218
  else
253
219
  vs
@@ -80,7 +80,7 @@ module Statsample
80
80
 
81
81
  def method_missing(name, *args,&block)
82
82
  @attached.reverse.each do |ds|
83
- return ds[name.to_s] if ds.fields.include? (name.to_s)
83
+ return ds[name] if ds.vectors.to_a.include? (name)
84
84
  end
85
85
  raise "Method #{name} doesn't exists"
86
86
  end
@@ -67,9 +67,9 @@ module Statsample
67
67
 
68
68
  # One Way Anova with vectors
69
69
  # Example:
70
- # v1=[2,3,4,5,6].to_numeric
71
- # v2=[3,3,4,5,6].to_numeric
72
- # v3=[5,3,1,5,6].to_numeric
70
+ # v1 = Daru::Vector.new([2,3,4,5,6])
71
+ # v2 = Daru::Vector.new([3,3,4,5,6])
72
+ # v3 = Daru::Vector.new([5,3,1,5,6])
73
73
  # anova=Statsample::Anova::OneWayWithVectors.new([v1,v2,v3])
74
74
  # anova.f
75
75
  # => 0.0243902439024391
@@ -90,10 +90,10 @@ module Statsample
90
90
 
91
91
  def initialize(*args)
92
92
  if args[0].is_a? Array
93
- @vectors=args.shift
93
+ @vectors = args.shift
94
94
  else
95
- @vectors=args.find_all {|v| v.is_a? Statsample::Vector}
96
- opts=args.find {|v| v.is_a? Hash}
95
+ @vectors = args.find_all {|v| v.is_a? Daru::Vector}
96
+ opts = args.find {|v| v.is_a? Hash}
97
97
  end
98
98
  opts||=Hash.new
99
99
  opts_default={:name=>_("Anova One-Way"),
@@ -107,9 +107,9 @@ module Statsample
107
107
 
108
108
  # Two Way Anova with vectors
109
109
  # Example:
110
- # v1=[1,1,2,2].to_numeric
111
- # v2=[1,2,1,2].to_numeric
112
- # v3=[5,3,1,5].to_numeric
110
+ # v1 = Daru::Vector.new([1,1,2,2])
111
+ # v2 = Daru::Vector.new([1,2,1,2])
112
+ # v3 = Daru::Vector.new([5,3,1,5])
113
113
  # anova=Statsample::Anova::TwoWayWithVectors.new(:a=>v1,:b=>v2, :dependent=>v3)
114
114
  #
115
115
  class TwoWayWithVectors < TwoWay
@@ -121,25 +121,26 @@ module Statsample
121
121
  # For now, only equal sample cells allowed
122
122
  def initialize(opts=Hash.new)
123
123
  raise "You should insert at least :a, :b and :dependent" unless [:a, :b, :dependent].all? {|v| opts.has_key? v}
124
- @a_var='a'
125
- @b_var='b'
126
- @dep_var='dependent'
127
- @a_vector, @b_vector, @dep_vector=Statsample.only_valid_clone opts[:a], opts[:b], opts[:dependent]
124
+ @a_var = :a
125
+ @b_var = :b
126
+ @dep_var = :dependent
127
+ @a_vector, @b_vector, @dep_vector =
128
+ Statsample.only_valid_clone opts[:a], opts[:b], opts[:dependent]
128
129
 
129
- ds={@a_var=>@a_vector, @b_var=>@b_vector, @dep_var=>@dep_vector}.to_dataset
130
- @ds=ds.clone_only_valid
131
- _p=@a_vector.factors.size
132
- _q=@b_vector.factors.size
133
- @x_general=@dep_vector.mean
134
- @axb_means={}
135
- @axb_sd={}
136
- @vectors=[]
130
+ ds = Daru::DataFrame.new({@a_var=>@a_vector, @b_var=>@b_vector, @dep_var=>@dep_vector})
131
+ @ds = ds.clone_only_valid
132
+ _p = @a_vector.factors.size
133
+ _q = @b_vector.factors.size
134
+ @x_general = @dep_vector.mean
135
+ @axb_means = {}
136
+ @axb_sd = {}
137
+ @vectors = []
137
138
  n=nil
138
139
  @ds.to_multiset_by_split(a_var,b_var).each_vector(dep_var) {|k,v|
139
- @axb_means[k]=v.mean
140
- @axb_sd[k]=v.sd
140
+ @axb_means[k] = v.mean
141
+ @axb_sd[k] = v.sd
141
142
  @vectors << v
142
- n||=v.size
143
+ n ||= v.size
143
144
  raise "All cell sizes should be equal" if n!=v.size
144
145
  }
145
146
 
@@ -151,20 +152,21 @@ module Statsample
151
152
  @ds.to_multiset_by_split(b_var).each_vector(dep_var) {|k,v|
152
153
  @b_means[k]=v.mean
153
154
  }
154
- ss_a=n*_q*@ds[a_var].factors.inject(0) {|ac,v|
155
- ac+(@a_means[v]-@x_general)**2
155
+ ss_a = n*_q*@ds[a_var].factors.inject(0) {|ac,v|
156
+ ac + (@a_means[v]-@x_general)**2
156
157
  }
157
158
  ss_b=n*_p*@ds[b_var].factors.inject(0) {|ac,v|
158
159
  ac+(@b_means[v]-@x_general)**2
159
160
  }
160
- ss_within=@ds.collect {|row|
161
+ ss_within = @ds.collect(:row) { |row|
161
162
  (row[dep_var]-@axb_means[[row[a_var],row[b_var]]])**2
162
163
  }.sum
163
- ss_axb=n*@axb_means.inject(0) {|ac,v|
164
+ ss_axb = n*@axb_means.inject(0) {|ac,v|
164
165
  j,k=v[0]
165
166
  xjk=v[1]
166
167
  ac+(xjk-@a_means[j]-@b_means[k]+@x_general)**2
167
168
  }
169
+
168
170
  df_a=_p-1
169
171
  df_b=_q-1
170
172
  df_within=(_p*_q)*(n-1)
@@ -186,9 +188,9 @@ module Statsample
186
188
  def report_building(builder) #:nodoc:#
187
189
  builder.section(:name=>@name) do |s|
188
190
  if summary_descriptives
189
- s.table(:header =>['']+@ds[a_var].factors.map {|a| @ds[a_var].labeling(a)}+[_("%s Mean") % @name_b]) do |t|
191
+ s.table(:header =>['']+@ds[a_var].factors.map {|a| @ds[a_var].index_of(a)}+[_("%s Mean") % @name_b]) do |t|
190
192
  @ds[b_var].factors.each do |b|
191
- t.row([@ds[b_var].labeling(b)]+@ds[a_var].factors.map {|a| "%0.3f" % @axb_means[[a,b]] } + ["%0.3f" % @b_means[b]])
193
+ t.row([@ds[b_var].index_of(b)]+@ds[a_var].factors.map {|a| "%0.3f" % @axb_means[[a,b]] } + ["%0.3f" % @b_means[b]])
192
194
  end
193
195
  t.row([_("%s Mean") % @name_a]+@ds[a_var].factors.map {|a| "%0.3f" % @a_means[a]}+ ["%0.3f" % @x_general])
194
196
  end
@@ -12,9 +12,10 @@ module Statsample
12
12
  # Covariance between two vectors
13
13
  def covariance(v1,v2)
14
14
  v1a,v2a=Statsample.only_valid_clone(v1,v2)
15
+
15
16
  return nil if v1a.size==0
16
17
  if Statsample.has_gsl?
17
- GSL::Stats::covariance(v1a.gsl, v2a.gsl)
18
+ GSL::Stats::covariance(v1a.to_gsl, v2a.to_gsl)
18
19
  else
19
20
  covariance_slow(v1a,v2a)
20
21
  end
@@ -34,7 +35,9 @@ module Statsample
34
35
  sum_of_squares(v1a,v2a) / (v1a.size-1)
35
36
  end
36
37
  def sum_of_squares(v1,v2)
37
- v1a,v2a=Statsample.only_valid_clone(v1,v2)
38
+ v1a,v2a=Statsample.only_valid_clone(v1,v2)
39
+ v1a.reset_index!
40
+ v2a.reset_index!
38
41
  m1=v1a.mean
39
42
  m2=v2a.mean
40
43
  (v1a.size).times.inject(0) {|ac,i| ac+(v1a[i]-m1)*(v2a[i]-m2)}
@@ -44,13 +47,14 @@ module Statsample
44
47
  v1a,v2a=Statsample.only_valid_clone(v1,v2)
45
48
  return nil if v1a.size ==0
46
49
  if Statsample.has_gsl?
47
- GSL::Stats::correlation(v1a.gsl, v2a.gsl)
50
+ GSL::Stats::correlation(v1a.to_gsl, v2a.to_gsl)
48
51
  else
49
52
  pearson_slow(v1a,v2a)
50
53
  end
51
54
  end
52
55
  def pearson_slow(v1,v2) # :nodoc:
53
56
  v1a,v2a=Statsample.only_valid_clone(v1,v2)
57
+
54
58
  # Calculate sum of squares
55
59
  ss=sum_of_squares(v1a,v2a)
56
60
  ss.quo(Math::sqrt(v1a.sum_of_squares) * Math::sqrt(v2a.sum_of_squares))
@@ -118,14 +122,16 @@ module Statsample
118
122
  r=Statsample::Bivariate.pearson(from,del)
119
123
  froms, dels = from.vector_standarized, del.vector_standarized
120
124
  nv=[]
121
- froms.data_with_nils.each_index do |i|
125
+ froms.reset_index!
126
+ dels.reset_index!
127
+ froms.each_index do |i|
122
128
  if froms[i].nil? or dels[i].nil?
123
129
  nv.push(nil)
124
130
  else
125
131
  nv.push(froms[i]-r*dels[i])
126
132
  end
127
133
  end
128
- nv.to_vector(:numeric)
134
+ Daru::Vector.new(nv)
129
135
  end
130
136
  # Correlation between v1 and v2, controlling the effect of
131
137
  # control on both.
@@ -135,7 +141,6 @@ module Statsample
135
141
  rv1con=pearson(v1a,cona)
136
142
  rv2con=pearson(v2a,cona)
137
143
  (rv1v2-(rv1con*rv2con)).quo(Math::sqrt(1-rv1con**2) * Math::sqrt(1-rv2con**2))
138
-
139
144
  end
140
145
 
141
146
  def covariance_matrix_optimized(ds)
@@ -153,50 +158,53 @@ module Statsample
153
158
  # Order of rows and columns depends on Dataset#fields order
154
159
 
155
160
  def covariance_matrix(ds)
156
- vars,cases=ds.fields.size,ds.cases
161
+ vars,cases = ds.ncols, ds.nrows
157
162
  if !ds.has_missing_data? and Statsample.has_gsl? and prediction_optimized(vars,cases) < prediction_pairwise(vars,cases)
158
163
  cm=covariance_matrix_optimized(ds)
159
164
  else
160
165
  cm=covariance_matrix_pairwise(ds)
161
-
162
166
  end
163
167
  cm.extend(Statsample::CovariateMatrix)
164
- cm.fields=ds.fields
168
+ cm.fields = ds.vectors.to_a
165
169
  cm
166
170
  end
167
171
 
168
172
 
169
173
  def covariance_matrix_pairwise(ds)
170
174
  cache={}
171
- matrix=ds.collect_matrix do |row,col|
172
- if (ds[row].type!=:numeric or ds[col].type!=:numeric)
173
- nil
174
- elsif row==col
175
- ds[row].variance
176
- else
177
- if cache[[col,row]].nil?
178
- cov=covariance(ds[row],ds[col])
179
- cache[[row,col]]=cov
180
- cov
175
+ vectors = ds.vectors.to_a
176
+ mat_rows = vectors.collect do |row|
177
+ vectors.collect do |col|
178
+ if (ds[row].type!=:numeric or ds[col].type!=:numeric)
179
+ nil
180
+ elsif row==col
181
+ ds[row].variance
181
182
  else
182
- cache[[col,row]]
183
+ if cache[[col,row]].nil?
184
+ cov=covariance(ds[row],ds[col])
185
+ cache[[row,col]]=cov
186
+ cov
187
+ else
188
+ cache[[col,row]]
189
+ end
183
190
  end
184
191
  end
185
192
  end
186
- matrix
193
+
194
+ Matrix.rows mat_rows
187
195
  end
188
196
 
189
197
  # Correlation matrix.
190
198
  # Order of rows and columns depends on Dataset#fields order
191
199
  def correlation_matrix(ds)
192
- vars,cases=ds.fields.size,ds.cases
200
+ vars, cases = ds.ncols, ds.nrows
193
201
  if !ds.has_missing_data? and Statsample.has_gsl? and prediction_optimized(vars,cases) < prediction_pairwise(vars,cases)
194
202
  cm=correlation_matrix_optimized(ds)
195
203
  else
196
204
  cm=correlation_matrix_pairwise(ds)
197
205
  end
198
206
  cm.extend(Statsample::CovariateMatrix)
199
- cm.fields=ds.fields
207
+ cm.fields = ds.vectors.to_a
200
208
  cm
201
209
  end
202
210
 
@@ -212,33 +220,43 @@ module Statsample
212
220
  end
213
221
  def correlation_matrix_pairwise(ds)
214
222
  cache={}
215
- cm=ds.collect_matrix do |row,col|
216
- if row==col
217
- 1.0
218
- elsif (ds[row].type!=:numeric or ds[col].type!=:numeric)
219
- nil
220
- else
221
- if cache[[col,row]].nil?
222
- r=pearson(ds[row],ds[col])
223
- cache[[row,col]]=r
224
- r
223
+ vectors = ds.vectors.to_a
224
+ cm = vectors.collect do |row|
225
+ vectors.collect do |col|
226
+ if row==col
227
+ 1.0
228
+ elsif (ds[row].type!=:numeric or ds[col].type!=:numeric)
229
+ nil
225
230
  else
226
- cache[[col,row]]
227
- end
231
+ if cache[[col,row]].nil?
232
+ r=pearson(ds[row],ds[col])
233
+ cache[[row,col]]=r
234
+ r
235
+ else
236
+ cache[[col,row]]
237
+ end
238
+ end
228
239
  end
229
240
  end
241
+
242
+ Matrix.rows cm
230
243
  end
231
244
 
232
245
  # Retrieves the matrix of pairwise valid n (number of cases valid in both vectors of each pair).
233
246
  def n_valid_matrix(ds)
234
- ds.collect_matrix do |row,col|
235
- if row==col
236
- ds[row].valid_data.size
237
- else
238
- rowa,rowb=Statsample.only_valid_clone(ds[row],ds[col])
239
- rowa.size
247
+ vectors = ds.vectors.to_a
248
+ m = vectors.collect do |row|
249
+ vectors.collect do |col|
250
+ if row==col
251
+ ds[row].only_valid.size
252
+ else
253
+ rowa,rowb = Statsample.only_valid_clone(ds[row],ds[col])
254
+ rowa.size
255
+ end
240
256
  end
241
257
  end
258
+
259
+ Matrix.rows m
242
260
  end
243
261
 
244
262
  # Matrix of correlation probabilities.
@@ -256,27 +274,27 @@ module Statsample
256
274
 
257
275
  # Spearman ranked correlation coefficient (rho) between 2 vectors
258
276
  def spearman(v1,v2)
259
- v1a,v2a=Statsample.only_valid_clone(v1,v2)
260
- v1r,v2r=v1a.ranked(:numeric),v2a.ranked(:numeric)
277
+ v1a,v2a = Statsample.only_valid_clone(v1,v2)
278
+ v1r,v2r = v1a.ranked, v2a.ranked
261
279
  pearson(v1r,v2r)
262
280
  end
263
281
  # Calculate Point biserial correlation. Equal to Pearson correlation, with
264
282
  # one dichotomous value replaced by "0" and the other by "1"
265
283
  def point_biserial(dichotomous,continous)
266
- ds={'d'=>dichotomous,'c'=>continous}.to_dataset.dup_only_valid
267
- raise(TypeError, "First vector should be dichotomous") if ds['d'].factors.size!=2
268
- raise(TypeError, "Second vector should be continous") if ds['c'].type!=:numeric
269
- f0=ds['d'].factors.sort[0]
270
- m0=ds.filter_field('c') {|c| c['d']==f0}
271
- m1=ds.filter_field('c') {|c| c['d']!=f0}
272
- ((m1.mean-m0.mean).to_f / ds['c'].sdp) * Math::sqrt(m0.size*m1.size.to_f / ds.cases**2)
284
+ ds = Daru::DataFrame.new({:d=>dichotomous,:c=>continous}).dup_only_valid
285
+ raise(TypeError, "First vector should be dichotomous") if ds[:d].factors.size != 2
286
+ raise(TypeError, "Second vector should be continous") if ds[:c].type != :numeric
287
+ f0=ds[:d].factors.sort.to_a[0]
288
+ m0=ds.filter_vector(:c) {|c| c[:d] == f0}
289
+ m1=ds.filter_vector(:c) {|c| c[:d] != f0}
290
+ ((m1.mean-m0.mean).to_f / ds[:c].sdp) * Math::sqrt(m0.size*m1.size.to_f / ds.nrows**2)
273
291
  end
274
292
  # Kendall Rank Correlation Coefficient (Tau a)
275
293
  # Based on Hervé Abdi's article
276
294
  def tau_a(v1,v2)
277
295
  v1a,v2a=Statsample.only_valid_clone(v1,v2)
278
296
  n=v1.size
279
- v1r,v2r=v1a.ranked(:numeric),v2a.ranked(:numeric)
297
+ v1r,v2r=v1a.ranked,v2a.ranked
280
298
  o1=ordered_pairs(v1r)
281
299
  o2=ordered_pairs(v2r)
282
300
  delta= o1.size*2-(o2 & o1).size*2
@@ -348,14 +366,15 @@ module Statsample
348
366
  }
349
367
  {'P'=>conc,'Q'=>disc,'Y'=>ties_y,'X'=>ties_x}
350
368
  end
369
+
351
370
  def ordered_pairs(vector)
352
- d=vector.data
353
- a=[]
354
- (0...(d.size-1)).each{|i|
355
- ((i+1)...(d.size)).each {|j|
371
+ d = vector.to_a
372
+ a = []
373
+ (0...(d.size-1)).each do |i|
374
+ ((i+1)...(d.size)).each do |j|
356
375
  a.push([d[i],d[j]])
357
- }
358
- }
376
+ end
377
+ end
359
378
  a
360
379
  end
361
380
  =begin
@@ -371,8 +390,8 @@ module Statsample
371
390
  # Report the minimum number of valid cases of a covariate matrix
372
391
  # based on a dataset
373
392
  def min_n_valid(ds)
374
- min=ds.cases
375
- m=n_valid_matrix(ds)
393
+ min = ds.nrows
394
+ m = n_valid_matrix(ds)
376
395
  for x in 0...m.row_size
377
396
  for y in 0...m.column_size
378
397
  min=m[x,y] if m[x,y] < min
@@ -380,8 +399,6 @@ module Statsample
380
399
  end
381
400
  min
382
401
  end
383
-
384
-
385
402
  end
386
403
  end
387
404
  end