statsample 1.5.0 → 2.0.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.build.sh +15 -0
- data/.gitignore +1 -0
- data/.travis.yml +19 -7
- data/CONTRIBUTING.md +33 -0
- data/History.txt +5 -0
- data/README.md +41 -53
- data/benchmarks/correlation_matrix_15_variables.rb +6 -5
- data/benchmarks/correlation_matrix_5_variables.rb +6 -5
- data/benchmarks/correlation_matrix_methods/correlation_matrix.rb +23 -26
- data/examples/boxplot.rb +17 -5
- data/examples/correlation_matrix.rb +36 -7
- data/examples/dataset.rb +25 -5
- data/examples/dominance_analysis.rb +8 -7
- data/examples/dominance_analysis_bootstrap.rb +16 -11
- data/examples/histogram.rb +16 -2
- data/examples/icc.rb +5 -6
- data/examples/levene.rb +17 -3
- data/examples/multiple_regression.rb +6 -3
- data/examples/parallel_analysis.rb +11 -6
- data/examples/polychoric.rb +26 -13
- data/examples/principal_axis.rb +8 -4
- data/examples/reliability.rb +10 -10
- data/examples/scatterplot.rb +8 -0
- data/examples/t_test.rb +7 -0
- data/examples/u_test.rb +10 -2
- data/examples/vector.rb +9 -6
- data/examples/velicer_map_test.rb +12 -8
- data/lib/statsample.rb +13 -47
- data/lib/statsample/analysis/suite.rb +1 -1
- data/lib/statsample/anova/oneway.rb +6 -6
- data/lib/statsample/anova/twoway.rb +26 -24
- data/lib/statsample/bivariate.rb +78 -61
- data/lib/statsample/bivariate/pearson.rb +2 -2
- data/lib/statsample/codification.rb +45 -32
- data/lib/statsample/converter/csv.rb +15 -53
- data/lib/statsample/converter/spss.rb +6 -5
- data/lib/statsample/converters.rb +50 -211
- data/lib/statsample/crosstab.rb +26 -25
- data/lib/statsample/daru.rb +117 -0
- data/lib/statsample/dataset.rb +70 -942
- data/lib/statsample/dominanceanalysis.rb +16 -17
- data/lib/statsample/dominanceanalysis/bootstrap.rb +26 -28
- data/lib/statsample/factor/parallelanalysis.rb +17 -19
- data/lib/statsample/factor/pca.rb +21 -20
- data/lib/statsample/factor/principalaxis.rb +3 -3
- data/lib/statsample/graph/boxplot.rb +8 -16
- data/lib/statsample/graph/histogram.rb +4 -4
- data/lib/statsample/graph/scatterplot.rb +8 -7
- data/lib/statsample/histogram.rb +128 -119
- data/lib/statsample/matrix.rb +20 -16
- data/lib/statsample/multiset.rb +39 -38
- data/lib/statsample/regression.rb +3 -3
- data/lib/statsample/regression/multiple.rb +8 -10
- data/lib/statsample/regression/multiple/alglibengine.rb +96 -89
- data/lib/statsample/regression/multiple/baseengine.rb +32 -32
- data/lib/statsample/regression/multiple/gslengine.rb +33 -36
- data/lib/statsample/regression/multiple/matrixengine.rb +7 -9
- data/lib/statsample/regression/multiple/rubyengine.rb +39 -41
- data/lib/statsample/reliability.rb +23 -25
- data/lib/statsample/reliability/icc.rb +8 -7
- data/lib/statsample/reliability/multiscaleanalysis.rb +14 -12
- data/lib/statsample/reliability/scaleanalysis.rb +58 -60
- data/lib/statsample/reliability/skillscaleanalysis.rb +34 -29
- data/lib/statsample/resample.rb +1 -1
- data/lib/statsample/shorthand.rb +29 -25
- data/lib/statsample/test/kolmogorovsmirnov.rb +5 -3
- data/lib/statsample/test/levene.rb +28 -27
- data/lib/statsample/test/t.rb +7 -9
- data/lib/statsample/test/umannwhitney.rb +28 -28
- data/lib/statsample/test/wilcoxonsignedrank.rb +45 -43
- data/lib/statsample/vector.rb +70 -1013
- data/lib/statsample/version.rb +1 -1
- data/statsample.gemspec +12 -16
- data/test/helpers_tests.rb +1 -1
- data/test/test_analysis.rb +17 -17
- data/test/test_anova_contrast.rb +6 -6
- data/test/test_anovatwowaywithdataset.rb +8 -8
- data/test/test_anovawithvectors.rb +8 -8
- data/test/test_awesome_print_bug.rb +1 -1
- data/test/test_bartlettsphericity.rb +4 -4
- data/test/test_bivariate.rb +48 -43
- data/test/test_codification.rb +33 -33
- data/test/test_crosstab.rb +9 -9
- data/test/test_dataset.rb +28 -458
- data/test/test_factor.rb +46 -38
- data/test/test_factor_pa.rb +22 -13
- data/test/test_ggobi.rb +4 -4
- data/test/test_gsl.rb +4 -4
- data/test/test_histogram.rb +3 -3
- data/test/test_matrix.rb +13 -13
- data/test/test_multiset.rb +103 -91
- data/test/test_regression.rb +57 -52
- data/test/test_reliability.rb +55 -45
- data/test/test_reliability_icc.rb +8 -8
- data/test/test_reliability_skillscale.rb +26 -24
- data/test/test_resample.rb +1 -1
- data/test/test_statistics.rb +3 -13
- data/test/test_stest.rb +9 -9
- data/test/test_stratified.rb +3 -3
- data/test/test_test_t.rb +12 -12
- data/test/test_umannwhitney.rb +2 -2
- data/test/test_vector.rb +76 -613
- data/test/test_wilcoxonsignedrank.rb +4 -4
- metadata +57 -28
- data/lib/statsample/rserve_extension.rb +0 -20
- data/lib/statsample/vector/gsl.rb +0 -106
- data/test/fixtures/repeated_fields.csv +0 -7
- data/test/fixtures/scientific_notation.csv +0 -4
- data/test/fixtures/test_csv.csv +0 -7
- data/test/fixtures/test_xls.xls +0 -0
- data/test/test_csv.rb +0 -63
- data/test/test_rserve_extension.rb +0 -42
- data/test/test_xls.rb +0 -52
@@ -1,5 +1,8 @@
|
|
1
1
|
#!/usr/bin/ruby
|
2
2
|
$:.unshift(File.dirname(__FILE__)+'/../lib/')
|
3
|
+
# == Description
|
4
|
+
#
|
5
|
+
# Velicer MAP test.
|
3
6
|
|
4
7
|
require 'statsample'
|
5
8
|
|
@@ -15,17 +18,18 @@ Statsample::Analysis.store(Statsample::Factor::MAP) do
|
|
15
18
|
vectors={}
|
16
19
|
|
17
20
|
variables.times do |i|
|
18
|
-
vectors["v#{i}"]=
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
21
|
+
vectors["v#{i}".to_sym]= Daru::Vector.new(
|
22
|
+
samples.times.collect do |nv|
|
23
|
+
if i<5
|
24
|
+
f1[nv]*5 + f2[nv] *2 +rng.call
|
25
|
+
else
|
26
|
+
f1[nv]*2 + f2[nv] *3 +rng.call
|
27
|
+
end
|
28
|
+
end)
|
25
29
|
end
|
26
30
|
|
27
31
|
|
28
|
-
ds=vectors
|
32
|
+
ds = Daru::DataFrame.new(vectors)
|
29
33
|
cor=cor(ds)
|
30
34
|
pca=pca(cor)
|
31
35
|
|
data/lib/statsample.rb
CHANGED
@@ -22,6 +22,8 @@ require 'extendmatrix'
|
|
22
22
|
require 'distribution'
|
23
23
|
require 'dirty-memoize'
|
24
24
|
require 'reportbuilder'
|
25
|
+
require 'daru'
|
26
|
+
require 'statsample/daru'
|
25
27
|
|
26
28
|
class Numeric
|
27
29
|
def square
|
@@ -52,42 +54,6 @@ class Module
|
|
52
54
|
end
|
53
55
|
|
54
56
|
class Array
|
55
|
-
# Recode repeated values on an array, adding the number of repetition
|
56
|
-
# at the end
|
57
|
-
# Example:
|
58
|
-
# a=%w{a b c c d d d e}
|
59
|
-
# a.recode_repeated
|
60
|
-
# => ["a","b","c_1","c_2","d_1","d_2","d_3","e"]
|
61
|
-
def recode_repeated
|
62
|
-
if size != uniq.size
|
63
|
-
# Find repeated
|
64
|
-
repeated = inject({}) do |acc, v|
|
65
|
-
if acc[v].nil?
|
66
|
-
acc[v] = 1
|
67
|
-
else
|
68
|
-
acc[v] += 1
|
69
|
-
end
|
70
|
-
acc
|
71
|
-
end.select { |_k, v| v > 1 }.keys
|
72
|
-
|
73
|
-
ns = repeated.inject({}) do |acc, v|
|
74
|
-
acc[v] = 0
|
75
|
-
acc
|
76
|
-
end
|
77
|
-
|
78
|
-
collect do |f|
|
79
|
-
if repeated.include? f
|
80
|
-
ns[f] += 1
|
81
|
-
sprintf('%s_%d', f, ns[f])
|
82
|
-
else
|
83
|
-
f
|
84
|
-
end
|
85
|
-
end
|
86
|
-
else
|
87
|
-
self
|
88
|
-
end
|
89
|
-
end
|
90
|
-
|
91
57
|
def sum
|
92
58
|
inject(:+)
|
93
59
|
end
|
@@ -218,7 +184,7 @@ module Statsample
|
|
218
184
|
size = vs[0].size
|
219
185
|
|
220
186
|
vs.each do |v|
|
221
|
-
fail ArgumentError, 'Arguments should be Vector' unless v.instance_of?
|
187
|
+
fail ArgumentError, 'Arguments should be Vector' unless v.instance_of? Daru::Vector
|
222
188
|
fail ArgumentError, 'Vectors size should be the same' if v.size != size
|
223
189
|
end
|
224
190
|
|
@@ -228,26 +194,26 @@ module Statsample
|
|
228
194
|
# Returns a duplicate of the input vectors, without missing data
|
229
195
|
# for any of the vectors.
|
230
196
|
#
|
231
|
-
# a=[1,2,3,6,7,nil,3,5]
|
232
|
-
# b=[nil,nil,5,6,4,5,10,2]
|
233
|
-
# c=[2,4,6,7,4,5,6,7]
|
197
|
+
# a = Daru::Vector.new([1,2,3,6,7,nil,3,5])
|
198
|
+
# b = Daru::Vector.new([nil,nil,5,6,4,5,10,2])
|
199
|
+
# c = Daru::Vector.new([2,4,6,7,4,5,6,7])
|
234
200
|
# a2,b2,c2=Statsample.only_valid(a,b,c)
|
235
|
-
# => [#<
|
236
|
-
# #<
|
237
|
-
# #<
|
201
|
+
# => [#<Daru::Vector:0xb748c8c8 @data=[3, 6, 7, 3, 5]>,
|
202
|
+
# #<Daru::Vector:0xb748c814 @data=[5, 6, 4, 10, 2]>,
|
203
|
+
# #<Daru::Vector:0xb748c760 @data=[6, 7, 4, 6, 7]>]
|
238
204
|
#
|
239
205
|
def only_valid(*vs)
|
240
206
|
i = 1
|
241
|
-
h = vs.inject({}) { |acc, v| acc["v#{i}"] = v; i += 1; acc }
|
242
|
-
|
243
|
-
|
207
|
+
h = vs.inject({}) { |acc, v| acc["v#{i}".to_sym] = v; i += 1; acc }
|
208
|
+
df = Daru::DataFrame.new(h).dup_only_valid
|
209
|
+
df.map { |v| v }
|
244
210
|
end
|
245
211
|
|
246
212
|
# Cheap version of #only_valid.
|
247
213
|
# If any vectors have missing_values, return only valid.
|
248
214
|
# If not, return the vectors itself
|
249
215
|
def only_valid_clone(*vs)
|
250
|
-
if vs.any?(&:
|
216
|
+
if vs.any?(&:has_missing_data?)
|
251
217
|
only_valid(*vs)
|
252
218
|
else
|
253
219
|
vs
|
@@ -80,7 +80,7 @@ module Statsample
|
|
80
80
|
|
81
81
|
def method_missing(name, *args,&block)
|
82
82
|
@attached.reverse.each do |ds|
|
83
|
-
return ds[name
|
83
|
+
return ds[name] if ds.vectors.to_a.include? (name)
|
84
84
|
end
|
85
85
|
raise "Method #{name} doesn't exists"
|
86
86
|
end
|
@@ -67,9 +67,9 @@ module Statsample
|
|
67
67
|
|
68
68
|
# One Way Anova with vectors
|
69
69
|
# Example:
|
70
|
-
# v1=[2,3,4,5,6]
|
71
|
-
# v2=[3,3,4,5,6]
|
72
|
-
# v3=[5,3,1,5,6]
|
70
|
+
# v1 = Daru::Vector.new([2,3,4,5,6])
|
71
|
+
# v2 = Daru::Vector.new([3,3,4,5,6])
|
72
|
+
# v3 = Daru::Vector.new([5,3,1,5,6])
|
73
73
|
# anova=Statsample::Anova::OneWayWithVectors.new([v1,v2,v3])
|
74
74
|
# anova.f
|
75
75
|
# => 0.0243902439024391
|
@@ -90,10 +90,10 @@ module Statsample
|
|
90
90
|
|
91
91
|
def initialize(*args)
|
92
92
|
if args[0].is_a? Array
|
93
|
-
@vectors=args.shift
|
93
|
+
@vectors = args.shift
|
94
94
|
else
|
95
|
-
@vectors=args.find_all {|v| v.is_a?
|
96
|
-
opts=args.find {|v| v.is_a? Hash}
|
95
|
+
@vectors = args.find_all {|v| v.is_a? Daru::Vector}
|
96
|
+
opts = args.find {|v| v.is_a? Hash}
|
97
97
|
end
|
98
98
|
opts||=Hash.new
|
99
99
|
opts_default={:name=>_("Anova One-Way"),
|
@@ -107,9 +107,9 @@ module Statsample
|
|
107
107
|
|
108
108
|
# Two Way Anova with vectors
|
109
109
|
# Example:
|
110
|
-
# v1=[1,1,2,2]
|
111
|
-
# v2=[1,2,1,2]
|
112
|
-
# v3=[5,3,1,5]
|
110
|
+
# v1 = Daru::Vector.new([1,1,2,2])
|
111
|
+
# v2 = Daru::Vector.new([1,2,1,2])
|
112
|
+
# v3 = Daru::Vector.new([5,3,1,5])
|
113
113
|
# anova=Statsample::Anova::TwoWayWithVectors.new(:a=>v1,:b=>v2, :dependent=>v3)
|
114
114
|
#
|
115
115
|
class TwoWayWithVectors < TwoWay
|
@@ -121,25 +121,26 @@ module Statsample
|
|
121
121
|
# For now, only equal sample cells allowed
|
122
122
|
def initialize(opts=Hash.new)
|
123
123
|
raise "You should insert at least :a, :b and :dependent" unless [:a, :b, :dependent].all? {|v| opts.has_key? v}
|
124
|
-
@a_var=
|
125
|
-
@b_var=
|
126
|
-
@dep_var=
|
127
|
-
@a_vector, @b_vector, @dep_vector=
|
124
|
+
@a_var = :a
|
125
|
+
@b_var = :b
|
126
|
+
@dep_var = :dependent
|
127
|
+
@a_vector, @b_vector, @dep_vector =
|
128
|
+
Statsample.only_valid_clone opts[:a], opts[:b], opts[:dependent]
|
128
129
|
|
129
|
-
ds={@a_var=>@a_vector, @b_var=>@b_vector, @dep_var=>@dep_vector}
|
130
|
-
@ds=ds.clone_only_valid
|
131
|
-
_p
|
132
|
-
_q
|
133
|
-
@x_general
|
134
|
-
@axb_means={}
|
135
|
-
@axb_sd={}
|
136
|
-
@vectors=[]
|
130
|
+
ds = Daru::DataFrame.new({@a_var=>@a_vector, @b_var=>@b_vector, @dep_var=>@dep_vector})
|
131
|
+
@ds = ds.clone_only_valid
|
132
|
+
_p = @a_vector.factors.size
|
133
|
+
_q = @b_vector.factors.size
|
134
|
+
@x_general = @dep_vector.mean
|
135
|
+
@axb_means = {}
|
136
|
+
@axb_sd = {}
|
137
|
+
@vectors = []
|
137
138
|
n=nil
|
138
139
|
@ds.to_multiset_by_split(a_var,b_var).each_vector(dep_var) {|k,v|
|
139
|
-
@axb_means[k]=v.mean
|
140
|
-
@axb_sd[k]=v.sd
|
140
|
+
@axb_means[k] = v.mean
|
141
|
+
@axb_sd[k] = v.sd
|
141
142
|
@vectors << v
|
142
|
-
n||=v.size
|
143
|
+
n ||= v.size
|
143
144
|
raise "All cell sizes should be equal" if n!=v.size
|
144
145
|
}
|
145
146
|
|
@@ -151,20 +152,21 @@ module Statsample
|
|
151
152
|
@ds.to_multiset_by_split(b_var).each_vector(dep_var) {|k,v|
|
152
153
|
@b_means[k]=v.mean
|
153
154
|
}
|
154
|
-
ss_a=n*_q*@ds[a_var].factors.inject(0) {|ac,v|
|
155
|
-
ac+(@a_means[v]-@x_general)**2
|
155
|
+
ss_a = n*_q*@ds[a_var].factors.inject(0) {|ac,v|
|
156
|
+
ac + (@a_means[v]-@x_general)**2
|
156
157
|
}
|
157
158
|
ss_b=n*_p*@ds[b_var].factors.inject(0) {|ac,v|
|
158
159
|
ac+(@b_means[v]-@x_general)**2
|
159
160
|
}
|
160
|
-
ss_within
|
161
|
+
ss_within = @ds.collect(:row) { |row|
|
161
162
|
(row[dep_var]-@axb_means[[row[a_var],row[b_var]]])**2
|
162
163
|
}.sum
|
163
|
-
ss_axb=n*@axb_means.inject(0) {|ac,v|
|
164
|
+
ss_axb = n*@axb_means.inject(0) {|ac,v|
|
164
165
|
j,k=v[0]
|
165
166
|
xjk=v[1]
|
166
167
|
ac+(xjk-@a_means[j]-@b_means[k]+@x_general)**2
|
167
168
|
}
|
169
|
+
|
168
170
|
df_a=_p-1
|
169
171
|
df_b=_q-1
|
170
172
|
df_within=(_p*_q)*(n-1)
|
@@ -186,9 +188,9 @@ module Statsample
|
|
186
188
|
def report_building(builder) #:nodoc:#
|
187
189
|
builder.section(:name=>@name) do |s|
|
188
190
|
if summary_descriptives
|
189
|
-
s.table(:header =>['']+@ds[a_var].factors.map {|a| @ds[a_var].
|
191
|
+
s.table(:header =>['']+@ds[a_var].factors.map {|a| @ds[a_var].index_of(a)}+[_("%s Mean") % @name_b]) do |t|
|
190
192
|
@ds[b_var].factors.each do |b|
|
191
|
-
t.row([@ds[b_var].
|
193
|
+
t.row([@ds[b_var].index_of(b)]+@ds[a_var].factors.map {|a| "%0.3f" % @axb_means[[a,b]] } + ["%0.3f" % @b_means[b]])
|
192
194
|
end
|
193
195
|
t.row([_("%s Mean") % @name_a]+@ds[a_var].factors.map {|a| "%0.3f" % @a_means[a]}+ ["%0.3f" % @x_general])
|
194
196
|
end
|
data/lib/statsample/bivariate.rb
CHANGED
@@ -12,9 +12,10 @@ module Statsample
|
|
12
12
|
# Covariance between two vectors
|
13
13
|
def covariance(v1,v2)
|
14
14
|
v1a,v2a=Statsample.only_valid_clone(v1,v2)
|
15
|
+
|
15
16
|
return nil if v1a.size==0
|
16
17
|
if Statsample.has_gsl?
|
17
|
-
GSL::Stats::covariance(v1a.
|
18
|
+
GSL::Stats::covariance(v1a.to_gsl, v2a.to_gsl)
|
18
19
|
else
|
19
20
|
covariance_slow(v1a,v2a)
|
20
21
|
end
|
@@ -34,7 +35,9 @@ module Statsample
|
|
34
35
|
sum_of_squares(v1a,v2a) / (v1a.size-1)
|
35
36
|
end
|
36
37
|
def sum_of_squares(v1,v2)
|
37
|
-
v1a,v2a=Statsample.only_valid_clone(v1,v2)
|
38
|
+
v1a,v2a=Statsample.only_valid_clone(v1,v2)
|
39
|
+
v1a.reset_index!
|
40
|
+
v2a.reset_index!
|
38
41
|
m1=v1a.mean
|
39
42
|
m2=v2a.mean
|
40
43
|
(v1a.size).times.inject(0) {|ac,i| ac+(v1a[i]-m1)*(v2a[i]-m2)}
|
@@ -44,13 +47,14 @@ module Statsample
|
|
44
47
|
v1a,v2a=Statsample.only_valid_clone(v1,v2)
|
45
48
|
return nil if v1a.size ==0
|
46
49
|
if Statsample.has_gsl?
|
47
|
-
GSL::Stats::correlation(v1a.
|
50
|
+
GSL::Stats::correlation(v1a.to_gsl, v2a.to_gsl)
|
48
51
|
else
|
49
52
|
pearson_slow(v1a,v2a)
|
50
53
|
end
|
51
54
|
end
|
52
55
|
def pearson_slow(v1,v2) # :nodoc:
|
53
56
|
v1a,v2a=Statsample.only_valid_clone(v1,v2)
|
57
|
+
|
54
58
|
# Calculate sum of squares
|
55
59
|
ss=sum_of_squares(v1a,v2a)
|
56
60
|
ss.quo(Math::sqrt(v1a.sum_of_squares) * Math::sqrt(v2a.sum_of_squares))
|
@@ -118,14 +122,16 @@ module Statsample
|
|
118
122
|
r=Statsample::Bivariate.pearson(from,del)
|
119
123
|
froms, dels = from.vector_standarized, del.vector_standarized
|
120
124
|
nv=[]
|
121
|
-
froms.
|
125
|
+
froms.reset_index!
|
126
|
+
dels.reset_index!
|
127
|
+
froms.each_index do |i|
|
122
128
|
if froms[i].nil? or dels[i].nil?
|
123
129
|
nv.push(nil)
|
124
130
|
else
|
125
131
|
nv.push(froms[i]-r*dels[i])
|
126
132
|
end
|
127
133
|
end
|
128
|
-
|
134
|
+
Daru::Vector.new(nv)
|
129
135
|
end
|
130
136
|
# Correlation between v1 and v2, controling the effect of
|
131
137
|
# control on both.
|
@@ -135,7 +141,6 @@ module Statsample
|
|
135
141
|
rv1con=pearson(v1a,cona)
|
136
142
|
rv2con=pearson(v2a,cona)
|
137
143
|
(rv1v2-(rv1con*rv2con)).quo(Math::sqrt(1-rv1con**2) * Math::sqrt(1-rv2con**2))
|
138
|
-
|
139
144
|
end
|
140
145
|
|
141
146
|
def covariance_matrix_optimized(ds)
|
@@ -153,50 +158,53 @@ module Statsample
|
|
153
158
|
# Order of rows and columns depends on Dataset#fields order
|
154
159
|
|
155
160
|
def covariance_matrix(ds)
|
156
|
-
vars,cases=ds.
|
161
|
+
vars,cases = ds.ncols, ds.nrows
|
157
162
|
if !ds.has_missing_data? and Statsample.has_gsl? and prediction_optimized(vars,cases) < prediction_pairwise(vars,cases)
|
158
163
|
cm=covariance_matrix_optimized(ds)
|
159
164
|
else
|
160
165
|
cm=covariance_matrix_pairwise(ds)
|
161
|
-
|
162
166
|
end
|
163
167
|
cm.extend(Statsample::CovariateMatrix)
|
164
|
-
cm.fields=ds.
|
168
|
+
cm.fields = ds.vectors.to_a
|
165
169
|
cm
|
166
170
|
end
|
167
171
|
|
168
172
|
|
169
173
|
def covariance_matrix_pairwise(ds)
|
170
174
|
cache={}
|
171
|
-
|
172
|
-
|
173
|
-
|
174
|
-
|
175
|
-
|
176
|
-
|
177
|
-
|
178
|
-
cov=covariance(ds[row],ds[col])
|
179
|
-
cache[[row,col]]=cov
|
180
|
-
cov
|
175
|
+
vectors = ds.vectors.to_a
|
176
|
+
mat_rows = vectors.collect do |row|
|
177
|
+
vectors.collect do |col|
|
178
|
+
if (ds[row].type!=:numeric or ds[col].type!=:numeric)
|
179
|
+
nil
|
180
|
+
elsif row==col
|
181
|
+
ds[row].variance
|
181
182
|
else
|
182
|
-
|
183
|
+
if cache[[col,row]].nil?
|
184
|
+
cov=covariance(ds[row],ds[col])
|
185
|
+
cache[[row,col]]=cov
|
186
|
+
cov
|
187
|
+
else
|
188
|
+
cache[[col,row]]
|
189
|
+
end
|
183
190
|
end
|
184
191
|
end
|
185
192
|
end
|
186
|
-
|
193
|
+
|
194
|
+
Matrix.rows mat_rows
|
187
195
|
end
|
188
196
|
|
189
197
|
# Correlation matrix.
|
190
198
|
# Order of rows and columns depends on Dataset#fields order
|
191
199
|
def correlation_matrix(ds)
|
192
|
-
vars,cases=ds.
|
200
|
+
vars, cases = ds.ncols, ds.nrows
|
193
201
|
if !ds.has_missing_data? and Statsample.has_gsl? and prediction_optimized(vars,cases) < prediction_pairwise(vars,cases)
|
194
202
|
cm=correlation_matrix_optimized(ds)
|
195
203
|
else
|
196
204
|
cm=correlation_matrix_pairwise(ds)
|
197
205
|
end
|
198
206
|
cm.extend(Statsample::CovariateMatrix)
|
199
|
-
cm.fields=ds.
|
207
|
+
cm.fields = ds.vectors.to_a
|
200
208
|
cm
|
201
209
|
end
|
202
210
|
|
@@ -212,33 +220,43 @@ module Statsample
|
|
212
220
|
end
|
213
221
|
def correlation_matrix_pairwise(ds)
|
214
222
|
cache={}
|
215
|
-
|
216
|
-
|
217
|
-
|
218
|
-
|
219
|
-
|
220
|
-
|
221
|
-
|
222
|
-
r=pearson(ds[row],ds[col])
|
223
|
-
cache[[row,col]]=r
|
224
|
-
r
|
223
|
+
vectors = ds.vectors.to_a
|
224
|
+
cm = vectors.collect do |row|
|
225
|
+
vectors.collect do |col|
|
226
|
+
if row==col
|
227
|
+
1.0
|
228
|
+
elsif (ds[row].type!=:numeric or ds[col].type!=:numeric)
|
229
|
+
nil
|
225
230
|
else
|
226
|
-
cache[[col,row]]
|
227
|
-
|
231
|
+
if cache[[col,row]].nil?
|
232
|
+
r=pearson(ds[row],ds[col])
|
233
|
+
cache[[row,col]]=r
|
234
|
+
r
|
235
|
+
else
|
236
|
+
cache[[col,row]]
|
237
|
+
end
|
238
|
+
end
|
228
239
|
end
|
229
240
|
end
|
241
|
+
|
242
|
+
Matrix.rows cm
|
230
243
|
end
|
231
244
|
|
232
245
|
# Retrieves the n valid pairwise.
|
233
246
|
def n_valid_matrix(ds)
|
234
|
-
ds.
|
235
|
-
|
236
|
-
|
237
|
-
|
238
|
-
|
239
|
-
|
247
|
+
vectors = ds.vectors.to_a
|
248
|
+
m = vectors.collect do |row|
|
249
|
+
vectors.collect do |col|
|
250
|
+
if row==col
|
251
|
+
ds[row].only_valid.size
|
252
|
+
else
|
253
|
+
rowa,rowb = Statsample.only_valid_clone(ds[row],ds[col])
|
254
|
+
rowa.size
|
255
|
+
end
|
240
256
|
end
|
241
257
|
end
|
258
|
+
|
259
|
+
Matrix.rows m
|
242
260
|
end
|
243
261
|
|
244
262
|
# Matrix of correlation probabilities.
|
@@ -256,27 +274,27 @@ module Statsample
|
|
256
274
|
|
257
275
|
# Spearman ranked correlation coefficient (rho) between 2 vectors
|
258
276
|
def spearman(v1,v2)
|
259
|
-
v1a,v2a=Statsample.only_valid_clone(v1,v2)
|
260
|
-
v1r,v2r=v1a.ranked
|
277
|
+
v1a,v2a = Statsample.only_valid_clone(v1,v2)
|
278
|
+
v1r,v2r = v1a.ranked, v2a.ranked
|
261
279
|
pearson(v1r,v2r)
|
262
280
|
end
|
263
281
|
# Calculate Point biserial correlation. Equal to Pearson correlation, with
|
264
282
|
# one dichotomous value replaced by "0" and the other by "1"
|
265
283
|
def point_biserial(dichotomous,continous)
|
266
|
-
ds={
|
267
|
-
raise(TypeError, "First vector should be dichotomous") if ds[
|
268
|
-
raise(TypeError, "Second vector should be continous") if ds[
|
269
|
-
f0=ds[
|
270
|
-
m0=ds.
|
271
|
-
m1=ds.
|
272
|
-
((m1.mean-m0.mean).to_f / ds[
|
284
|
+
ds = Daru::DataFrame.new({:d=>dichotomous,:c=>continous}).dup_only_valid
|
285
|
+
raise(TypeError, "First vector should be dichotomous") if ds[:d].factors.size != 2
|
286
|
+
raise(TypeError, "Second vector should be continous") if ds[:c].type != :numeric
|
287
|
+
f0=ds[:d].factors.sort.to_a[0]
|
288
|
+
m0=ds.filter_vector(:c) {|c| c[:d] == f0}
|
289
|
+
m1=ds.filter_vector(:c) {|c| c[:d] != f0}
|
290
|
+
((m1.mean-m0.mean).to_f / ds[:c].sdp) * Math::sqrt(m0.size*m1.size.to_f / ds.nrows**2)
|
273
291
|
end
|
274
292
|
# Kendall Rank Correlation Coefficient (Tau a)
|
275
293
|
# Based on Hervé Adbi article
|
276
294
|
def tau_a(v1,v2)
|
277
295
|
v1a,v2a=Statsample.only_valid_clone(v1,v2)
|
278
296
|
n=v1.size
|
279
|
-
v1r,v2r=v1a.ranked
|
297
|
+
v1r,v2r=v1a.ranked,v2a.ranked
|
280
298
|
o1=ordered_pairs(v1r)
|
281
299
|
o2=ordered_pairs(v2r)
|
282
300
|
delta= o1.size*2-(o2 & o1).size*2
|
@@ -348,14 +366,15 @@ module Statsample
|
|
348
366
|
}
|
349
367
|
{'P'=>conc,'Q'=>disc,'Y'=>ties_y,'X'=>ties_x}
|
350
368
|
end
|
369
|
+
|
351
370
|
def ordered_pairs(vector)
|
352
|
-
d=vector.
|
353
|
-
a=[]
|
354
|
-
(0...(d.size-1)).each
|
355
|
-
((i+1)...(d.size)).each
|
371
|
+
d = vector.to_a
|
372
|
+
a = []
|
373
|
+
(0...(d.size-1)).each do |i|
|
374
|
+
((i+1)...(d.size)).each do |j|
|
356
375
|
a.push([d[i],d[j]])
|
357
|
-
|
358
|
-
|
376
|
+
end
|
377
|
+
end
|
359
378
|
a
|
360
379
|
end
|
361
380
|
=begin
|
@@ -371,8 +390,8 @@ module Statsample
|
|
371
390
|
# Report the minimum number of cases valid of a covariate matrix
|
372
391
|
# based on a dataset
|
373
392
|
def min_n_valid(ds)
|
374
|
-
min=ds.
|
375
|
-
m=n_valid_matrix(ds)
|
393
|
+
min = ds.nrows
|
394
|
+
m = n_valid_matrix(ds)
|
376
395
|
for x in 0...m.row_size
|
377
396
|
for y in 0...m.column_size
|
378
397
|
min=m[x,y] if m[x,y] < min
|
@@ -380,8 +399,6 @@ module Statsample
|
|
380
399
|
end
|
381
400
|
min
|
382
401
|
end
|
383
|
-
|
384
|
-
|
385
402
|
end
|
386
403
|
end
|
387
404
|
end
|