statsample 0.5.0 → 0.5.1
Sign up to get free protection for your applications and to get access to all the features.
- data/History.txt +11 -0
- data/Manifest.txt +7 -0
- data/README.txt +3 -3
- data/data/repeated_fields.csv +7 -0
- data/data/tetmat_matrix.txt +5 -0
- data/data/tetmat_test.txt +1001 -0
- data/demo/spss_matrix.rb +3 -0
- data/lib/spss.rb +1 -1
- data/lib/statistics2.rb +1 -1
- data/lib/statsample.rb +30 -1
- data/lib/statsample/anova.rb +62 -66
- data/lib/statsample/bivariate.rb +273 -281
- data/lib/statsample/bivariate/tetrachoric.rb +418 -0
- data/lib/statsample/codification.rb +15 -15
- data/lib/statsample/combination.rb +108 -106
- data/lib/statsample/converter/csv18.rb +52 -52
- data/lib/statsample/converter/csv19.rb +45 -48
- data/lib/statsample/converter/spss.rb +47 -0
- data/lib/statsample/converters.rb +74 -77
- data/lib/statsample/crosstab.rb +21 -17
- data/lib/statsample/dataset.rb +595 -543
- data/lib/statsample/dominanceanalysis.rb +7 -10
- data/lib/statsample/htmlreport.rb +23 -0
- data/lib/statsample/regression/multiple/baseengine.rb +59 -59
- data/lib/statsample/regression/multiple/gslengine.rb +1 -1
- data/lib/statsample/reliability.rb +165 -145
- data/lib/statsample/vector.rb +16 -2
- data/test/test_anova.rb +16 -16
- data/test/test_bivariate.rb +146 -0
- data/test/test_csv.rb +6 -0
- data/test/test_dataset.rb +49 -5
- data/test/test_statistics.rb +6 -90
- data/test/test_vector.rb +27 -10
- metadata +10 -4
- data/test/test_r.rb +0 -9
- data/test/test_stata.rb +0 -11
data/demo/spss_matrix.rb
ADDED
data/lib/spss.rb
CHANGED
data/lib/statistics2.rb
CHANGED
data/lib/statsample.rb
CHANGED
@@ -38,6 +38,34 @@ class String
|
|
38
38
|
end
|
39
39
|
end
|
40
40
|
|
41
|
+
|
42
|
+
class Array
  # Recode repeated values on an array, appending "_<occurrence number>"
  # to every value that appears more than once. Values that appear only
  # once are left untouched.
  #
  # Example:
  #   a=%w{a b c c d d d e}
  #   a.recode_repeated
  #   => ["a","b","c_1","c_2","d_1","d_2","d_3","e"]
  #
  # Returns the receiver itself (not a copy) when no value is repeated,
  # otherwise a new Array; recoded entries are always Strings.
  def recode_repeated
    return self if size == uniq.size

    # How many times each value occurs. Hash lookup keeps the whole
    # method linear instead of scanning a list of repeated values for
    # every element.
    counts = Hash.new(0)
    each { |value| counts[value] += 1 }

    # Running occurrence number for each repeated value.
    seen = Hash.new(0)
    collect do |value|
      if counts[value] > 1
        seen[value] += 1
        sprintf("%s_%d", value, seen[value])
      else
        value
      end
    end
  end
end
|
68
|
+
|
41
69
|
def create_test(*args,&proc)
|
42
70
|
description=args.shift
|
43
71
|
fields=args
|
@@ -80,7 +108,7 @@ end
|
|
80
108
|
# * Dataset: An union of vectors.
|
81
109
|
#
|
82
110
|
module Statsample
|
83
|
-
VERSION = '0.5.0'
|
111
|
+
VERSION = '0.5.1'
|
84
112
|
SPLIT_TOKEN = ","
|
85
113
|
autoload(:Database, 'statsample/converters')
|
86
114
|
autoload(:Anova, 'statsample/anova')
|
@@ -89,6 +117,7 @@ module Statsample
|
|
89
117
|
autoload(:PlainText, 'statsample/converters')
|
90
118
|
autoload(:Excel, 'statsample/converters')
|
91
119
|
autoload(:GGobi, 'statsample/converters')
|
120
|
+
autoload(:SPSS, 'statsample/converter/spss')
|
92
121
|
autoload(:DominanceAnalysis, 'statsample/dominanceanalysis')
|
93
122
|
autoload(:HtmlReport, 'statsample/htmlreport')
|
94
123
|
autoload(:Mx, 'statsample/converters')
|
data/lib/statsample/anova.rb
CHANGED
@@ -1,70 +1,66 @@
|
|
1
1
|
module Statsample
|
2
|
-
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
2
|
+
module Anova
  # One Way Anova
  #
  # Takes an array of group vectors. Each vector is expected to respond
  # to +sum+, +size+, +mean+ and +sum_of_squares+ (those are the only
  # messages sent to it here).
  #
  # Example:
  #   v1=[2,3,4,5,6].to_scale
  #   v2=[3,3,4,5,6].to_scale
  #   v3=[5,3,1,5,6].to_scale
  #   anova=Statsample::Anova::OneWay.new([v1,v2,v3])
  #   puts anova.f
  #   puts anova.significance
  class OneWay
    # vectors:: Array with one vector per group.
    def initialize(vectors)
      @vectors=vectors
    end

    # Total sum of all values across groups.
    def sum
      @vectors.inject(0) {|total,vector| total+vector.sum }
    end

    # Grand mean (total sum over total number of cases).
    # Uses +quo+, so integer sums give an exact Rational, not a
    # truncated Integer.
    def mean
      sum.quo(n)
    end

    # Total sum of squares: squared deviations of every group's values
    # from the grand mean.
    def sst
      grand_mean=mean.to_f
      @vectors.inject(0) {|total,vector| total+vector.sum_of_squares(grand_mean) }
    end

    # Sum of squares within groups. Each vector's +sum_of_squares+ is
    # called without an argument (presumably deviations from the
    # group's own mean - confirm against Vector#sum_of_squares).
    def sswg
      @vectors.inject(0) {|total,vector| total+vector.sum_of_squares }
    end

    # Sum of squares between groups: squared distance of each group
    # mean from the grand mean, weighted by group size.
    def ssbg
      # Same Float grand mean as #sst, so the result (and therefore #f)
      # is always a Float regardless of what the vectors' #mean returns.
      grand_mean=mean.to_f
      @vectors.inject(0) do |total,vector|
        deviation=vector.mean-grand_mean
        total + (deviation*deviation) * vector.size
      end
    end

    # Degrees of freedom within groups
    def df_wg
      @vectors.inject(0) {|total,vector| total+(vector.size-1) }
    end

    # Degrees of freedom between groups
    def df_bg
      @vectors.size-1
    end

    # Total Degrees of freedom
    def df_total
      n-1
    end

    # Total number of cases
    def n
      @vectors.inject(0) {|total,vector| total+vector.size }
    end

    # Fisher's F: (ssbg / (k-1)) / (sswg / (n-k)), k being the number
    # of groups.
    def f
      k=@vectors.size
      (ssbg*(n-k)) / (sswg*(k-1))
    end

    # Significance of Fisher: upper tail of the F distribution with
    # (df_bg, df_wg) degrees of freedom.
    def significance
      1.0-Distribution::F.cdf(f,df_bg,df_wg)
    end
  end
end
|
70
66
|
end
|
data/lib/statsample/bivariate.rb
CHANGED
@@ -1,286 +1,278 @@
|
|
1
|
+
require 'statsample/bivariate/tetrachoric'
|
1
2
|
module Statsample
|
2
|
-
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
|
134
|
-
|
135
|
-
|
136
|
-
|
137
|
-
|
138
|
-
|
139
|
-
|
140
|
-
|
141
|
-
|
142
|
-
|
143
|
-
|
144
|
-
|
145
|
-
|
146
|
-
|
147
|
-
|
148
|
-
|
149
|
-
|
150
|
-
|
151
|
-
|
152
|
-
|
153
|
-
|
154
|
-
|
155
|
-
|
156
|
-
|
157
|
-
|
158
|
-
|
159
|
-
|
160
|
-
|
161
|
-
def correlation_probability_matrix(ds, tails=:both)
|
162
|
-
rows=ds.fields.collect{|row|
|
163
|
-
ds.fields.collect{|col|
|
164
|
-
v1a,v2a=Statsample.only_valid(ds[row],ds[col])
|
165
|
-
(row==col or ds[row].type!=:scale or ds[col].type!=:scale) ? nil : prop_pearson(t_pearson(ds[row],ds[col]), v1a.size, tails)
|
166
|
-
}
|
167
|
-
}
|
168
|
-
Matrix.rows(rows)
|
169
|
-
end
|
170
|
-
# Spearman ranked correlation coefficient between 2 vectors
|
171
|
-
def spearman(v1,v2)
|
172
|
-
v1a,v2a=Statsample.only_valid(v1,v2)
|
173
|
-
v1r,v2r=v1a.ranked(:scale),v2a.ranked(:scale)
|
174
|
-
pearson(v1r,v2r)
|
175
|
-
end
|
176
|
-
# Calculate Point biserial correlation.
|
177
|
-
# Equal to Pearson correlation, with one dichotomous value replaced
|
178
|
-
# by "0" and the other by "1"
|
179
|
-
def point_biserial(dichotomous,continous)
|
180
|
-
ds={'d'=>dichotomous,'c'=>continous}.to_dataset.dup_only_valid
|
181
|
-
raise(TypeError, "First vector should be dichotomous") if ds['d'].factors.size!=2
|
182
|
-
raise(TypeError, "Second vector should be continous") if ds['c'].type!=:scale
|
183
|
-
f0=ds['d'].factors.sort[0]
|
184
|
-
m0=ds.filter_field('c') {|c| c['d']==f0}
|
185
|
-
m1=ds.filter_field('c') {|c| c['d']!=f0}
|
186
|
-
((m1.mean-m0.mean).to_f / ds['c'].sdp) * Math::sqrt(m0.size*m1.size.to_f / ds.cases**2)
|
187
|
-
end
|
188
|
-
# Kendall Rank Correlation Coefficient.
|
189
|
-
#
|
190
|
-
# Based on Hervé Adbi article
|
191
|
-
def tau_a(v1,v2)
|
192
|
-
v1a,v2a=Statsample.only_valid(v1,v2)
|
193
|
-
n=v1.size
|
194
|
-
v1r,v2r=v1a.ranked(:scale),v2a.ranked(:scale)
|
195
|
-
o1=ordered_pairs(v1r)
|
196
|
-
o2=ordered_pairs(v2r)
|
197
|
-
delta= o1.size*2-(o2 & o1).size*2
|
198
|
-
1-(delta * 2 / (n*(n-1)).to_f)
|
199
|
-
end
|
200
|
-
# Calculates Tau b correlation.
|
201
|
-
#
|
202
|
-
# Tau-b defines perfect association as strict monotonicity.
|
203
|
-
# Although it requires strict monotonicity to reach 1.0,
|
204
|
-
# it does not penalize ties as much as some other measures.
|
205
|
-
#
|
206
|
-
# Source: http://faculty.chass.ncsu.edu/garson/PA765/assocordinal.htm
|
207
|
-
def tau_b(matrix)
|
208
|
-
v=pairs(matrix)
|
209
|
-
((v['P']-v['Q']).to_f / Math::sqrt((v['P']+v['Q']+v['Y'])*(v['P']+v['Q']+v['X'])).to_f)
|
210
|
-
end
|
211
|
-
# Calculates Goodman and Kruskal's gamma.
|
212
|
-
#
|
213
|
-
# Gamma is the surplus of concordant pairs over discordant pairs,
|
214
|
-
# as a percentage of all pairs ignoring ties.
|
215
|
-
#
|
216
|
-
# Source: http://faculty.chass.ncsu.edu/garson/PA765/assocordinal.htm
|
217
|
-
def gamma(matrix)
|
218
|
-
v=pairs(matrix)
|
219
|
-
(v['P']-v['Q']).to_f / (v['P']+v['Q']).to_f
|
220
|
-
end
|
221
|
-
# Calculate indexes for a matrix
|
222
|
-
# the rows and cols has to be ordered
|
223
|
-
def pairs(matrix)
|
224
|
-
# calculate concordant
|
225
|
-
#p matrix
|
226
|
-
rs=matrix.row_size
|
227
|
-
cs=matrix.column_size
|
228
|
-
conc=disc=ties_x=ties_y=0
|
229
|
-
(0...(rs-1)).each {|x|
|
230
|
-
(0...(cs-1)).each{|y|
|
231
|
-
((x+1)...rs).each{|x2|
|
232
|
-
((y+1)...cs).each{|y2|
|
233
|
-
#p sprintf("%d:%d,%d:%d",x,y,x2,y2)
|
234
|
-
conc+=matrix[x,y]*matrix[x2,y2]
|
235
|
-
}
|
236
|
-
}
|
237
|
-
}
|
238
|
-
}
|
239
|
-
(0...(rs-1)).each {|x|
|
240
|
-
(1...(cs)).each{|y|
|
241
|
-
((x+1)...rs).each{|x2|
|
242
|
-
(0...y).each{|y2|
|
243
|
-
#p sprintf("%d:%d,%d:%d",x,y,x2,y2)
|
244
|
-
disc+=matrix[x,y]*matrix[x2,y2]
|
245
|
-
}
|
246
|
-
}
|
247
|
-
}
|
248
|
-
}
|
249
|
-
(0...(rs-1)).each {|x|
|
250
|
-
(0...(cs)).each{|y|
|
251
|
-
((x+1)...(rs)).each{|x2|
|
252
|
-
ties_x+=matrix[x,y]*matrix[x2,y]
|
253
|
-
}
|
254
|
-
}
|
255
|
-
}
|
256
|
-
(0...rs).each {|x|
|
257
|
-
(0...(cs-1)).each{|y|
|
258
|
-
((y+1)...(cs)).each{|y2|
|
259
|
-
ties_y+=matrix[x,y]*matrix[x,y2]
|
260
|
-
}
|
261
|
-
}
|
262
|
-
}
|
263
|
-
{'P'=>conc,'Q'=>disc,'Y'=>ties_y,'X'=>ties_x}
|
264
|
-
end
|
265
|
-
def ordered_pairs(vector)
|
266
|
-
d=vector.data
|
267
|
-
a=[]
|
268
|
-
(0...(d.size-1)).each{|i|
|
269
|
-
((i+1)...(d.size)).each {|j|
|
270
|
-
a.push([d[i],d[j]])
|
271
|
-
}
|
272
|
-
}
|
273
|
-
a
|
274
|
-
end
|
275
|
-
def sum_of_codeviated(v1,v2)
|
276
|
-
v1a,v2a=Statsample.only_valid(v1,v2)
|
277
|
-
sum=0
|
278
|
-
(0...v1a.size).each{|i|
|
279
|
-
sum+=v1a[i]*v2a[i]
|
280
|
-
}
|
281
|
-
sum-((v1a.sum*v2a.sum) / v1a.size.to_f)
|
282
|
-
end
|
3
|
+
# Diverse correlation methods
|
4
|
+
module Bivariate
  class << self
    # Covariance between two vectors.
    #
    # Both vectors are first passed through Statsample.only_valid
    # (presumably pairwise removal of missing cases - confirm there).
    # Returns nil when no case is left. Uses GSL when HAS_GSL is true,
    # otherwise the pure Ruby #covariance_slow.
    def covariance(v1,v2)
      v1a,v2a=Statsample.only_valid(v1,v2)
      return nil if v1a.size==0
      if HAS_GSL
        GSL::Stats::covariance(v1a.gsl, v2a.gsl)
      else
        covariance_slow(v1a,v2a)
      end
    end

    # Sum over cases of real*log(pred) + (1-real)*log(1-pred), i.e. the
    # log-likelihood of 0/1 outcomes +real+ under predicted
    # probabilities +pred+. Only the cases returned by
    # Statsample.only_valid are used.
    def maximum_likehood_dichotomic(pred,real)
      preda,reala=Statsample.only_valid(pred,real)
      sum=0
      # Iterate the filtered vectors: the raw ones may hold missing values.
      (0...preda.size).each do |i|
        sum+=(reala[i]*Math::log(preda[i])) + ((1-reala[i])*Math::log(1-preda[i]))
      end
      sum
    end

    # Pure Ruby covariance with an n-1 denominator. Expects vectors
    # already free of missing data.
    def covariance_slow(v1a,v2a) # :nodoc:
      m1=v1a.mean
      m2=v2a.mean
      t=0
      (0...v1a.size).each {|i| t+=((v1a[i]-m1)*(v2a[i]-m2)) }
      t.to_f / (v1a.size-1)
    end

    # Calculate Pearson correlation coefficient between 2 vectors.
    # Returns nil when Statsample.only_valid leaves no case.
    def pearson(v1,v2)
      v1a,v2a=Statsample.only_valid(v1,v2)
      return nil if v1a.size==0
      if HAS_GSL
        GSL::Stats::correlation(v1a.gsl, v2a.gsl)
      else
        pearson_slow(v1a,v2a)
      end
    end

    # Pure Ruby Pearson correlation: mean of the products of the
    # vectors returned by +vector_standarized_pop+.
    def pearson_slow(v1a,v2a) # :nodoc:
      v1s,v2s=v1a.vector_standarized_pop,v2a.vector_standarized_pop
      t=0
      (0...v1s.size).each {|i| t+=(v1s[i]*v2s[i]) }
      t.to_f/v2s.size
    end

    # Retrieves the value for t test for a pearson correlation
    # between two vectors to test the null hypothesis of r=0.
    #
    # Returns 0 when r is exactly 1.0, which avoids the division by
    # zero in #t_r. NOTE(review): r == -1.0 is not special-cased.
    def t_pearson(v1,v2)
      v1a,v2a=Statsample.only_valid(v1,v2)
      r=pearson(v1a,v2a)
      if r==1.0
        0
      else
        t_r(r,v1a.size)
      end
    end

    # Retrieves the value for t test for a pearson correlation
    # giving r and vector size: r * sqrt((size-2) / (1-r^2)).
    def t_r(r,size)
      r * Math::sqrt(((size)-2).to_f / (1 - r**2))
    end

    # Retrieves the probability value (a la SPSS)
    # for a given t, size and number of tails.
    # Uses a second parameter
    # * :both  or 2  : for r!=0
    # * :right, :positive or 1  : for r > 0
    # * :left, :negative        : for r < 0
    def prop_pearson(t, size, tails=:both)
      # Normalize the accepted aliases to :both / :right / :left.
      tails=:both if tails==2
      tails=:right if tails==1 || tails==:positive
      tails=:left if tails==:negative

      n_tails=(tails==:both) ? 2 : 1
      # For a two-tailed test work on the lower tail and double it.
      t=-t if t>0 && tails==:both
      cdf=Distribution::T.cdf(t, size-2)
      if tails==:right
        1.0-(cdf*n_tails)
      else
        cdf*n_tails
      end
    end

    # Returns residual scores of +from+ after removing the variance
    # shared with +del+: standardized from minus r times standardized
    # del. Positions where either standardized value is nil yield nil.
    def residuals(from,del)
      r=pearson(from,del)
      froms, dels = from.vector_standarized, del.vector_standarized
      nv=[]
      froms.data_with_nils.each_index do |i|
        if froms[i].nil? || dels[i].nil?
          nv.push(nil)
        else
          nv.push(froms[i]-r*dels[i])
        end
      end
      nv.to_vector(:scale)
    end

    # Correlation between v1 and v2, controlling the effect of
    # control on both.
    def partial_correlation(v1,v2,control)
      v1a,v2a,cona=Statsample.only_valid(v1,v2,control)
      rv1v2=pearson(v1a,v2a)
      rv1con=pearson(v1a,cona)
      rv2con=pearson(v2a,cona)
      (rv1v2-(rv1con*rv2con)).quo(Math::sqrt(1-rv1con**2) * Math::sqrt(1-rv2con**2))
    end

    # Covariance matrix.
    # Order of rows and columns depends on Dataset#fields order.
    # Cells involving a vector whose type is not :scale are nil.
    def covariance_matrix(ds)
      ds.collect_matrix do |row,col|
        if ds[row].type!=:scale || ds[col].type!=:scale
          nil
        else
          covariance(ds[row],ds[col])
        end
      end
    end

    # Correlation matrix.
    # Order of rows and columns depends on Dataset#fields order.
    # The diagonal is 1.0; cells involving a vector whose type is not
    # :scale are nil.
    def correlation_matrix(ds)
      ds.collect_matrix do |row,col|
        if row==col
          1.0
        elsif ds[row].type!=:scale || ds[col].type!=:scale
          nil
        else
          pearson(ds[row],ds[col])
        end
      end
    end

    # Retrieves the n valid pairwise: the diagonal holds the number of
    # valid data of each vector, the other cells the number of cases
    # left by Statsample.only_valid for that pair.
    def n_valid_matrix(ds)
      ds.collect_matrix do |row,col|
        if row==col
          ds[row].valid_data.size
        else
          rowa,_rowb=Statsample.only_valid(ds[row],ds[col])
          rowa.size
        end
      end
    end

    # Matrix of correlation probability.
    # Order of rows and columns depends on Dataset#fields order.
    # The diagonal and cells involving a non :scale vector are nil.
    def correlation_probability_matrix(ds, tails=:both)
      rows=ds.fields.collect do |row|
        ds.fields.collect do |col|
          v1a,_v2a=Statsample.only_valid(ds[row],ds[col])
          if row==col || ds[row].type!=:scale || ds[col].type!=:scale
            nil
          else
            prop_pearson(t_pearson(ds[row],ds[col]), v1a.size, tails)
          end
        end
      end
      Matrix.rows(rows)
    end

    # Spearman ranked correlation coefficient between 2 vectors:
    # Pearson correlation computed on the ranked vectors.
    def spearman(v1,v2)
      v1a,v2a=Statsample.only_valid(v1,v2)
      v1r,v2r=v1a.ranked(:scale),v2a.ranked(:scale)
      pearson(v1r,v2r)
    end

    # Calculate Point biserial correlation. Equal to Pearson correlation, with
    # one dichotomous value replaced by "0" and the other by "1".
    #
    # Raises TypeError when the first vector does not have exactly two
    # factors or the second one is not of type :scale.
    def point_biserial(dichotomous,continous)
      ds={'d'=>dichotomous,'c'=>continous}.to_dataset.dup_only_valid
      raise(TypeError, "First vector should be dichotomous") if ds['d'].factors.size!=2
      raise(TypeError, "Second vector should be continous") if ds['c'].type!=:scale
      # The lowest factor plays the role of "0", the other one of "1".
      f0=ds['d'].factors.sort[0]
      m0=ds.filter_field('c') {|c| c['d']==f0}
      m1=ds.filter_field('c') {|c| c['d']!=f0}
      ((m1.mean-m0.mean).to_f / ds['c'].sdp) * Math::sqrt(m0.size*m1.size.to_f / ds.cases**2)
    end

    # Kendall Rank Correlation Coefficient.
    #
    # Based on Hervé Abdi's article.
    def tau_a(v1,v2)
      v1a,v2a=Statsample.only_valid(v1,v2)
      # n must count the pairs actually ranked, not the raw vector size.
      n=v1a.size
      v1r,v2r=v1a.ranked(:scale),v2a.ranked(:scale)
      o1=ordered_pairs(v1r)
      o2=ordered_pairs(v2r)
      delta= o1.size*2-(o2 & o1).size*2
      1-(delta * 2 / (n*(n-1)).to_f)
    end

    # Calculates Tau b correlation.
    #
    # Tau-b defines perfect association as strict monotonicity. Although it
    # requires strict monotonicity to reach 1.0, it does not penalize ties as
    # much as some other measures.
    #
    # Source: http://faculty.chass.ncsu.edu/garson/PA765/assocordinal.htm
    def tau_b(matrix)
      v=pairs(matrix)
      ((v['P']-v['Q']).to_f / Math::sqrt((v['P']+v['Q']+v['Y'])*(v['P']+v['Q']+v['X'])).to_f)
    end

    # Calculates Goodman and Kruskal's gamma.
    #
    # Gamma is the surplus of concordant pairs over discordant pairs, as a
    # percentage of all pairs ignoring ties.
    #
    # Source: http://faculty.chass.ncsu.edu/garson/PA765/assocordinal.htm
    def gamma(matrix)
      v=pairs(matrix)
      (v['P']-v['Q']).to_f / (v['P']+v['Q']).to_f
    end

    # Calculate pair counts for a contingency matrix whose rows and
    # columns are already ordered.
    #
    # Returns a Hash with keys 'P' (concordant pairs), 'Q' (discordant
    # pairs), 'X' (pairs in the same column, different rows) and 'Y'
    # (pairs in the same row, different columns).
    def pairs(matrix)
      rs=matrix.row_size
      cs=matrix.column_size
      conc=disc=ties_x=ties_y=0
      # Concordant: second cell is below and to the right of the first.
      (0...(rs-1)).each do |x|
        (0...(cs-1)).each do |y|
          ((x+1)...rs).each do |x2|
            ((y+1)...cs).each do |y2|
              conc+=matrix[x,y]*matrix[x2,y2]
            end
          end
        end
      end
      # Discordant: second cell is below and to the left of the first.
      (0...(rs-1)).each do |x|
        (1...cs).each do |y|
          ((x+1)...rs).each do |x2|
            (0...y).each do |y2|
              disc+=matrix[x,y]*matrix[x2,y2]
            end
          end
        end
      end
      # Same column, lower row.
      (0...(rs-1)).each do |x|
        (0...cs).each do |y|
          ((x+1)...rs).each do |x2|
            ties_x+=matrix[x,y]*matrix[x2,y]
          end
        end
      end
      # Same row, column further right.
      (0...rs).each do |x|
        (0...(cs-1)).each do |y|
          ((y+1)...cs).each do |y2|
            ties_y+=matrix[x,y]*matrix[x,y2]
          end
        end
      end
      {'P'=>conc,'Q'=>disc,'Y'=>ties_y,'X'=>ties_x}
    end

    # Every pair [d[i], d[j]] with i < j taken from vector.data, in
    # order of appearance.
    def ordered_pairs(vector)
      d=vector.data
      a=[]
      (0...(d.size-1)).each do |i|
        ((i+1)...(d.size)).each do |j|
          a.push([d[i],d[j]])
        end
      end
      a
    end

    # Sum of cross products minus (sum(v1)*sum(v2))/n, computed on the
    # cases returned by Statsample.only_valid.
    def sum_of_codeviated(v1,v2)
      v1a,v2a=Statsample.only_valid(v1,v2)
      sum=0
      (0...v1a.size).each do |i|
        sum+=v1a[i]*v2a[i]
      end
      sum-((v1a.sum*v2a.sum) / v1a.size.to_f)
    end
  end
end
|
285
277
|
end
|
286
278
|
|