statsample 0.5.0 → 0.5.1
This diff shows the changes between publicly released versions of the package as they appear in their public registry, and is provided for informational purposes only.
- data/History.txt +11 -0
- data/Manifest.txt +7 -0
- data/README.txt +3 -3
- data/data/repeated_fields.csv +7 -0
- data/data/tetmat_matrix.txt +5 -0
- data/data/tetmat_test.txt +1001 -0
- data/demo/spss_matrix.rb +3 -0
- data/lib/spss.rb +1 -1
- data/lib/statistics2.rb +1 -1
- data/lib/statsample.rb +30 -1
- data/lib/statsample/anova.rb +62 -66
- data/lib/statsample/bivariate.rb +273 -281
- data/lib/statsample/bivariate/tetrachoric.rb +418 -0
- data/lib/statsample/codification.rb +15 -15
- data/lib/statsample/combination.rb +108 -106
- data/lib/statsample/converter/csv18.rb +52 -52
- data/lib/statsample/converter/csv19.rb +45 -48
- data/lib/statsample/converter/spss.rb +47 -0
- data/lib/statsample/converters.rb +74 -77
- data/lib/statsample/crosstab.rb +21 -17
- data/lib/statsample/dataset.rb +595 -543
- data/lib/statsample/dominanceanalysis.rb +7 -10
- data/lib/statsample/htmlreport.rb +23 -0
- data/lib/statsample/regression/multiple/baseengine.rb +59 -59
- data/lib/statsample/regression/multiple/gslengine.rb +1 -1
- data/lib/statsample/reliability.rb +165 -145
- data/lib/statsample/vector.rb +16 -2
- data/test/test_anova.rb +16 -16
- data/test/test_bivariate.rb +146 -0
- data/test/test_csv.rb +6 -0
- data/test/test_dataset.rb +49 -5
- data/test/test_statistics.rb +6 -90
- data/test/test_vector.rb +27 -10
- metadata +10 -4
- data/test/test_r.rb +0 -9
- data/test/test_stata.rb +0 -11
data/demo/spss_matrix.rb
ADDED
data/lib/spss.rb
CHANGED
data/lib/statistics2.rb
CHANGED
data/lib/statsample.rb
CHANGED
@@ -38,6 +38,34 @@ class String
   end
 end
 
+
+class Array
+  # Recode repeated values on an array, adding the number of repetition
+  # at the end
+  # Example:
+  #   a=%w{a b c c d d d e}
+  #   a.recode_repeated
+  #   => ["a","b","c_1","c_2","d_1","d_2","d_3","e"]
+  def recode_repeated
+    if self.size!=self.uniq.size
+      # Find repeated
+      repeated=self.inject({}) {|a,v|
+        (a[v].nil? ? a[v]=1 : a[v]+=1); a }.find_all{|k,v| v>1}.collect{|k,v| k}
+      ns=repeated.inject({}) {|a,v| a[v]=0;a}
+      self.collect do |f|
+        if repeated.include? f
+          ns[f]+=1
+          sprintf("%s_%d",f,ns[f])
+        else
+          f
+        end
+      end
+    else
+      self
+    end
+  end
+end
+
 def create_test(*args,&proc)
   description=args.shift
   fields=args
@@ -80,7 +108,7 @@ end
 # * Dataset: An union of vectors.
 #
 module Statsample
-  VERSION = '0.5.0'
+  VERSION = '0.5.1'
   SPLIT_TOKEN = ","
   autoload(:Database, 'statsample/converters')
   autoload(:Anova, 'statsample/anova')
@@ -89,6 +117,7 @@ module Statsample
   autoload(:PlainText, 'statsample/converters')
   autoload(:Excel, 'statsample/converters')
   autoload(:GGobi, 'statsample/converters')
+  autoload(:SPSS, 'statsample/converter/spss')
   autoload(:DominanceAnalysis, 'statsample/dominanceanalysis')
   autoload(:HtmlReport, 'statsample/htmlreport')
   autoload(:Mx, 'statsample/converters')
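The most visible addition to statsample.rb is the Array#recode_repeated helper. A minimal usage sketch in Ruby, taken directly from the method's RDoc example above (the output shown is the one documented there, not re-run here):

  require 'statsample'

  a = %w{a b c c d d d e}
  a.recode_repeated
  # => ["a", "b", "c_1", "c_2", "d_1", "d_2", "d_3", "e"]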
data/lib/statsample/anova.rb
CHANGED
@@ -1,70 +1,66 @@
 module Statsample
+  module Anova
+    # One Way Anova
+    # Example:
+    #   v1=[2,3,4,5,6].to_scale
+    #   v2=[3,3,4,5,6].to_scale
+    #   v3=[5,3,1,5,6].to_scale
+    #   anova=Statsample::Anova::OneWay.new([v1,v2,v3])
+    #   puts anova.f
+    #   puts anova.significance
+    class OneWay
+      def initialize(vectors)
+        @vectors=vectors
+      end
+      # Total sum
+      def sum
+        @vectors.inject(0){|a,v| a+v.sum}
+      end
+      # Total mean
+      def mean
+        sum.quo(n)
+      end
+      # Total sum of squares
+      def sst
+        m=mean.to_f
+        @vectors.inject(0) {|total,vector| total+vector.sum_of_squares(m) }
+      end
+      # Sum of squares within groups
+      def sswg
+        @vectors.inject(0) {|total,vector| total+vector.sum_of_squares }
+      end
+      # Sum of squares between groups
+      def ssbg
+        m=mean
+        @vectors.inject(0) do |total,vector|
+          total + (vector.mean-m).square * vector.size
         end
-      }
-    end
-    # Degrees of freedom within groups
-    def df_wg
-      @vectors.inject(0) {|a,v| a+(v.size-1)}
-    end
-    # Degrees of freedom between groups
-    def df_bg
-      @vectors.size-1
-    end
-    # Total Degrees of freedom
-    def df_total
-      n-1
-    end
-    # Total number of cases
-    def n
-      @vectors.inject(0){|a,v| a+v.size}
-    end
-    # Fisher
-    def f
-      k=@vectors.size
-      (ssbg*(n-k)) / (sswg*(k-1))
-    end
-    # Significance of Fisher
-    def significance
-      1.0-Distribution::F.cdf(f,df_bg,df_wg)
-    end
-  end
+      end
+      # Degrees of freedom within groups
+      def df_wg
+        @vectors.inject(0) {|a,v| a+(v.size-1)}
+      end
+      # Degrees of freedom between groups
+      def df_bg
+        @vectors.size-1
+      end
+      # Total Degrees of freedom
+      def df_total
+        n-1
+      end
+      # Total number of cases
+      def n
+        @vectors.inject(0){|a,v| a+v.size}
+      end
+      # Fisher
+      def f
+        k=@vectors.size
+        (ssbg*(n-k)) / (sswg*(k-1))
+      end
+      # Significance of Fisher
+      def significance
+        1.0-Distribution::F.cdf(f,df_bg,df_wg)
+      end
     end
+  end
 end
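The anova.rb change is essentially a reformatting of Statsample::Anova::OneWay; the documented API is unchanged. A short usage sketch assembled from the RDoc example in the hunk above (the printed values are not reproduced here):

  require 'statsample'

  v1 = [2,3,4,5,6].to_scale
  v2 = [3,3,4,5,6].to_scale
  v3 = [5,3,1,5,6].to_scale

  anova = Statsample::Anova::OneWay.new([v1, v2, v3])
  puts anova.f             # Fisher's F: (ssbg*(n-k)) / (sswg*(k-1))
  puts anova.significance  # 1.0 - Distribution::F.cdf(f, df_bg, df_wg)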
data/lib/statsample/bivariate.rb
CHANGED
@@ -1,286 +1,278 @@
+require 'statsample/bivariate/tetrachoric'
 module Statsample
-    def correlation_probability_matrix(ds, tails=:both)
-      rows=ds.fields.collect{|row|
-        ds.fields.collect{|col|
-          v1a,v2a=Statsample.only_valid(ds[row],ds[col])
-          (row==col or ds[row].type!=:scale or ds[col].type!=:scale) ? nil : prop_pearson(t_pearson(ds[row],ds[col]), v1a.size, tails)
-        }
-      }
-      Matrix.rows(rows)
-    end
-    # Spearman ranked correlation coefficient between 2 vectors
-    def spearman(v1,v2)
-      v1a,v2a=Statsample.only_valid(v1,v2)
-      v1r,v2r=v1a.ranked(:scale),v2a.ranked(:scale)
-      pearson(v1r,v2r)
-    end
-    # Calculate Point biserial correlation.
-    # Equal to Pearson correlation, with one dichotomous value replaced
-    # by "0" and the other by "1"
-    def point_biserial(dichotomous,continous)
-      ds={'d'=>dichotomous,'c'=>continous}.to_dataset.dup_only_valid
-      raise(TypeError, "First vector should be dichotomous") if ds['d'].factors.size!=2
-      raise(TypeError, "Second vector should be continous") if ds['c'].type!=:scale
-      f0=ds['d'].factors.sort[0]
-      m0=ds.filter_field('c') {|c| c['d']==f0}
-      m1=ds.filter_field('c') {|c| c['d']!=f0}
-      ((m1.mean-m0.mean).to_f / ds['c'].sdp) * Math::sqrt(m0.size*m1.size.to_f / ds.cases**2)
-    end
-    # Kendall Rank Correlation Coefficient.
-    #
-    # Based on Hervé Adbi article
-    def tau_a(v1,v2)
-      v1a,v2a=Statsample.only_valid(v1,v2)
-      n=v1.size
-      v1r,v2r=v1a.ranked(:scale),v2a.ranked(:scale)
-      o1=ordered_pairs(v1r)
-      o2=ordered_pairs(v2r)
-      delta= o1.size*2-(o2 & o1).size*2
-      1-(delta * 2 / (n*(n-1)).to_f)
-    end
-    # Calculates Tau b correlation.
-    #
-    # Tau-b defines perfect association as strict monotonicity.
-    # Although it requires strict monotonicity to reach 1.0,
-    # it does not penalize ties as much as some other measures.
-    #
-    # Source: http://faculty.chass.ncsu.edu/garson/PA765/assocordinal.htm
-    def tau_b(matrix)
-      v=pairs(matrix)
-      ((v['P']-v['Q']).to_f / Math::sqrt((v['P']+v['Q']+v['Y'])*(v['P']+v['Q']+v['X'])).to_f)
-    end
-    # Calculates Goodman and Kruskal's gamma.
-    #
-    # Gamma is the surplus of concordant pairs over discordant pairs,
-    # as a percentage of all pairs ignoring ties.
-    #
-    # Source: http://faculty.chass.ncsu.edu/garson/PA765/assocordinal.htm
-    def gamma(matrix)
-      v=pairs(matrix)
-      (v['P']-v['Q']).to_f / (v['P']+v['Q']).to_f
-    end
-    # Calculate indexes for a matrix
-    # the rows and cols has to be ordered
-    def pairs(matrix)
-      # calculate concordant
-      #p matrix
-      rs=matrix.row_size
-      cs=matrix.column_size
-      conc=disc=ties_x=ties_y=0
-      (0...(rs-1)).each {|x|
-        (0...(cs-1)).each{|y|
-          ((x+1)...rs).each{|x2|
-            ((y+1)...cs).each{|y2|
-              #p sprintf("%d:%d,%d:%d",x,y,x2,y2)
-              conc+=matrix[x,y]*matrix[x2,y2]
-            }
-          }
-        }
-      }
-      (0...(rs-1)).each {|x|
-        (1...(cs)).each{|y|
-          ((x+1)...rs).each{|x2|
-            (0...y).each{|y2|
-              #p sprintf("%d:%d,%d:%d",x,y,x2,y2)
-              disc+=matrix[x,y]*matrix[x2,y2]
-            }
-          }
-        }
-      }
-      (0...(rs-1)).each {|x|
-        (0...(cs)).each{|y|
-          ((x+1)...(rs)).each{|x2|
-            ties_x+=matrix[x,y]*matrix[x2,y]
-          }
-        }
-      }
-      (0...rs).each {|x|
-        (0...(cs-1)).each{|y|
-          ((y+1)...(cs)).each{|y2|
-            ties_y+=matrix[x,y]*matrix[x,y2]
-          }
-        }
-      }
-      {'P'=>conc,'Q'=>disc,'Y'=>ties_y,'X'=>ties_x}
-    end
-    def ordered_pairs(vector)
-      d=vector.data
-      a=[]
-      (0...(d.size-1)).each{|i|
-        ((i+1)...(d.size)).each {|j|
-          a.push([d[i],d[j]])
-        }
-      }
-      a
-    end
-    def sum_of_codeviated(v1,v2)
-      v1a,v2a=Statsample.only_valid(v1,v2)
-      sum=0
-      (0...v1a.size).each{|i|
-        sum+=v1a[i]*v2a[i]
-      }
-      sum-((v1a.sum*v2a.sum) / v1a.size.to_f)
-    end
+  # Diverse correlation methods
+  module Bivariate
+    class << self
+      # Covariance between two vectors
+      def covariance(v1,v2)
+        v1a,v2a=Statsample.only_valid(v1,v2)
+        return nil if v1a.size==0
+        if HAS_GSL
+          GSL::Stats::covariance(v1a.gsl, v2a.gsl)
+        else
+          covariance_slow(v1a,v2a)
+        end
+      end
+      def maximum_likehood_dichotomic(pred,real)
+        preda,reala=Statsample.only_valid(pred,real)
+        sum=0
+        pred.each_index{|i|
+          sum+=(real[i]*Math::log(pred[i])) + ((1-real[i])*Math::log(1-pred[i]))
+        }
+        sum
+      end
+
+      def covariance_slow(v1a,v2a) # :nodoc:
+        t=0
+        m1=v1a.mean
+        m2=v1a.mean
+        (0...v1a.size).each {|i| t+=((v1a[i]-m1)*(v2a[i]-m2)) }
+        t.to_f / (v1a.size-1)
+      end
+      # Calculate Pearson correlation coefficient between 2 vectors
+      def pearson(v1,v2)
+        v1a,v2a=Statsample.only_valid(v1,v2)
+        return nil if v1a.size ==0
+        if HAS_GSL
+          GSL::Stats::correlation(v1a.gsl, v2a.gsl)
+        else
+          pearson_slow(v1a,v2a)
+        end
+      end
+      def pearson_slow(v1a,v2a) # :nodoc:
+        v1s,v2s=v1a.vector_standarized_pop,v2a.vector_standarized_pop
+        t=0
+        siz=v1s.size
+        (0...v1s.size).each {|i| t+=(v1s[i]*v2s[i]) }
+        t.to_f/v2s.size
+      end
+      # Retrieves the value for t test for a pearson correlation
+      # between two vectors to test the null hipothesis of r=0
+      def t_pearson(v1,v2)
+        v1a,v2a=Statsample.only_valid(v1,v2)
+        r=pearson(v1a,v2a)
+        if(r==1.0)
+          0
+        else
+          t_r(r,v1a.size)
+        end
+      end
+      # Retrieves the value for t test for a pearson correlation
+      # giving r and vector size
+      def t_r(r,size)
+        r * Math::sqrt(((size)-2).to_f / (1 - r**2))
+      end
+      # Retrieves the probability value (a la SPSS)
+      # for a given t, size and number of tails.
+      # Uses a second parameter
+      # * :both or 2 : for r!=0
+      # * :right, :positive or 1 : for r > 0
+      # * :left, :negative : for r < 0
+
+      def prop_pearson(t, size, tails=:both)
+        tails=:both if tails==2
+        tails=:right if tails==1 or tails==:positive
+        tails=:left if tails==:negative
+
+        n_tails=case tails
+          when :both then 2
+          else 1
+        end
+        t=-t if t>0 and (tails==:both)
+        cdf=Distribution::T.cdf(t, size-2)
+        if(tails==:right)
+          1.0-(cdf*n_tails)
+        else
+          cdf*n_tails
+        end
+      end
+      # Returns residual score after delete variance
+      # from another variable
+      #
+      def residuals(from,del)
+        r=Statsample::Bivariate.pearson(from,del)
+        froms, dels = from.vector_standarized, del.vector_standarized
+        nv=[]
+        froms.data_with_nils.each_index do |i|
+          if froms[i].nil? or dels[i].nil?
+            nv.push(nil)
+          else
+            nv.push(froms[i]-r*dels[i])
+          end
+        end
+        nv.to_vector(:scale)
+      end
+      # Correlation between v1 and v2, controling the effect of
+      # control on both.
+      def partial_correlation(v1,v2,control)
+        v1a,v2a,cona=Statsample.only_valid(v1,v2,control)
+        rv1v2=pearson(v1a,v2a)
+        rv1con=pearson(v1a,cona)
+        rv2con=pearson(v2a,cona)
+        (rv1v2-(rv1con*rv2con)).quo(Math::sqrt(1-rv1con**2) * Math::sqrt(1-rv2con**2))
+
+      end
+      # Covariance matrix.
+      # Order of rows and columns depends on Dataset#fields order
+
+      def covariance_matrix(ds)
+        ds.collect_matrix do |row,col|
+          if (ds[row].type!=:scale or ds[col].type!=:scale)
+            nil
+          else
+            covariance(ds[row],ds[col])
+          end
+        end
+      end
+
+      # Correlation matrix.
+      # Order of rows and columns depends on Dataset#fields order
+
+      def correlation_matrix(ds)
+        ds.collect_matrix do |row,col|
+          if row==col
+            1.0
+          elsif (ds[row].type!=:scale or ds[col].type!=:scale)
+            nil
+          else
+            pearson(ds[row],ds[col])
+          end
+        end
+      end
+      # Retrieves the n valid pairwise
+      def n_valid_matrix(ds)
+        ds.collect_matrix do |row,col|
+          if row==col
+            ds[row].valid_data.size
+          else
+            rowa,rowb=Statsample.only_valid(ds[row],ds[col])
+            rowa.size
+          end
+        end
+      end
+      # Matrix of correlation probability
+      # Order of rows and columns depends on Dataset#fields order
+
+      def correlation_probability_matrix(ds, tails=:both)
+        rows=ds.fields.collect do |row|
+          ds.fields.collect do |col|
+            v1a,v2a=Statsample.only_valid(ds[row],ds[col])
+            (row==col or ds[row].type!=:scale or ds[col].type!=:scale) ? nil : prop_pearson(t_pearson(ds[row],ds[col]), v1a.size, tails)
+          end
         end
+        Matrix.rows(rows)
+      end
+      # Spearman ranked correlation coefficient between 2 vectors
+      def spearman(v1,v2)
+        v1a,v2a=Statsample.only_valid(v1,v2)
+        v1r,v2r=v1a.ranked(:scale),v2a.ranked(:scale)
+        pearson(v1r,v2r)
+      end
+      # Calculate Point biserial correlation. Equal to Pearson correlation, with
+      # one dichotomous value replaced by "0" and the other by "1"
+      def point_biserial(dichotomous,continous)
+        ds={'d'=>dichotomous,'c'=>continous}.to_dataset.dup_only_valid
+        raise(TypeError, "First vector should be dichotomous") if ds['d'].factors.size!=2
+        raise(TypeError, "Second vector should be continous") if ds['c'].type!=:scale
+        f0=ds['d'].factors.sort[0]
+        m0=ds.filter_field('c') {|c| c['d']==f0}
+        m1=ds.filter_field('c') {|c| c['d']!=f0}
+        ((m1.mean-m0.mean).to_f / ds['c'].sdp) * Math::sqrt(m0.size*m1.size.to_f / ds.cases**2)
+      end
+      # Kendall Rank Correlation Coefficient.
+      #
+      # Based on Hervé Adbi article
+      def tau_a(v1,v2)
+        v1a,v2a=Statsample.only_valid(v1,v2)
+        n=v1.size
+        v1r,v2r=v1a.ranked(:scale),v2a.ranked(:scale)
+        o1=ordered_pairs(v1r)
+        o2=ordered_pairs(v2r)
+        delta= o1.size*2-(o2 & o1).size*2
+        1-(delta * 2 / (n*(n-1)).to_f)
+      end
+      # Calculates Tau b correlation.
+      #
+      # Tau-b defines perfect association as strict monotonicity. Although it
+      # requires strict monotonicity to reach 1.0, it does not penalize ties as
+      # much as some other measures.
+      #
+      # Source: http://faculty.chass.ncsu.edu/garson/PA765/assocordinal.htm
+      def tau_b(matrix)
+        v=pairs(matrix)
+        ((v['P']-v['Q']).to_f / Math::sqrt((v['P']+v['Q']+v['Y'])*(v['P']+v['Q']+v['X'])).to_f)
+      end
+      # Calculates Goodman and Kruskal's gamma.
+      #
+      # Gamma is the surplus of concordant pairs over discordant pairs, as a
+      # percentage of all pairs ignoring ties.
+      #
+      # Source: http://faculty.chass.ncsu.edu/garson/PA765/assocordinal.htm
+      def gamma(matrix)
+        v=pairs(matrix)
+        (v['P']-v['Q']).to_f / (v['P']+v['Q']).to_f
+      end
+      # Calculate indexes for a matrix the rows and cols has to be ordered
+      def pairs(matrix)
+        # calculate concordant #p matrix
+        rs=matrix.row_size
+        cs=matrix.column_size
+        conc=disc=ties_x=ties_y=0
+        (0...(rs-1)).each {|x|
+          (0...(cs-1)).each{|y|
+            ((x+1)...rs).each{|x2|
+              ((y+1)...cs).each{|y2|
+                # #p sprintf("%d:%d,%d:%d",x,y,x2,y2)
+                conc+=matrix[x,y]*matrix[x2,y2]
+              }
+            }
+          }
+        }
+        (0...(rs-1)).each {|x|
+          (1...(cs)).each{|y|
+            ((x+1)...rs).each{|x2|
+              (0...y).each{|y2|
+                # #p sprintf("%d:%d,%d:%d",x,y,x2,y2)
+                disc+=matrix[x,y]*matrix[x2,y2]
+              }
+            }
+          }
+        }
+        (0...(rs-1)).each {|x|
+          (0...(cs)).each{|y|
+            ((x+1)...(rs)).each{|x2|
+              ties_x+=matrix[x,y]*matrix[x2,y]
+            }
+          }
+        }
+        (0...rs).each {|x|
+          (0...(cs-1)).each{|y|
+            ((y+1)...(cs)).each{|y2|
+              ties_y+=matrix[x,y]*matrix[x,y2]
+            }
+          }
+        }
+        {'P'=>conc,'Q'=>disc,'Y'=>ties_y,'X'=>ties_x}
+      end
+      def ordered_pairs(vector)
+        d=vector.data
+        a=[]
+        (0...(d.size-1)).each{|i|
+          ((i+1)...(d.size)).each {|j|
+            a.push([d[i],d[j]])
+          }
+        }
+        a
+      end
+      def sum_of_codeviated(v1,v2)
+        v1a,v2a=Statsample.only_valid(v1,v2)
+        sum=0
+        (0...v1a.size).each{|i|
+          sum+=v1a[i]*v2a[i]
+        }
+        sum-((v1a.sum*v2a.sum) / v1a.size.to_f)
+      end
     end
+  end
 end
 
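The bivariate.rb rewrite keeps the correlation helpers as module methods of Statsample::Bivariate (now defined under class << self) and pulls in the new tetrachoric correlation file. A minimal sketch of the Pearson/Spearman path, assuming the to_scale conversion shown in the anova example also applies to plain arrays here:

  require 'statsample'

  v1 = [1,2,3,4,5,6].to_scale
  v2 = [2,3,3,6,5,7].to_scale

  r = Statsample::Bivariate.pearson(v1, v2)               # product-moment correlation
  t = Statsample::Bivariate.t_pearson(v1, v2)             # t statistic for H0: r = 0
  p Statsample::Bivariate.prop_pearson(t, v1.size, :both) # two-tailed probability
  p Statsample::Bivariate.spearman(v1, v2)                # rank correlation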