statsample 0.5.0 → 0.5.1

@@ -0,0 +1,3 @@
+ require File.dirname(__FILE__)+"/../lib/statsample"
+ ds=Statsample::PlainText.read(File.dirname(__FILE__)+"/../data/tetmat_test.txt", %w{a b c d e})
+ puts Statsample::SPSS.tetrachoric_correlation_matrix(ds)
data/lib/spss.rb CHANGED
@@ -6,7 +6,7 @@
  #
  # Claudio Bustos mailto:clbustos@gmail.com

- module SPSS
+ module SPSS # :nodoc: all
  module Dictionary
  class Element
  def add(a)
data/lib/statistics2.rb CHANGED
@@ -10,7 +10,7 @@
  # [1] http://www.matsusaka-u.ac.jp/~okumura/algo/
  # [2] http://www5.airnet.ne.jp/tomy/cpro/sslib11.htm

- module Statistics2
+ module Statistics2 # :nodoc:
  SQ2PI = Math.sqrt(2 * Math::PI)

  # Newton approximation
data/lib/statsample.rb CHANGED
@@ -38,6 +38,34 @@ class String
  end
  end

+
+ class Array
+ # Recode repeated values on an array, adding the number of repetition
+ # at the end
+ # Example:
+ # a=%w{a b c c d d d e}
+ # a.recode_repeated
+ # => ["a","b","c_1","c_2","d_1","d_2","d_3","e"]
+ def recode_repeated
+ if self.size!=self.uniq.size
+ # Find repeated
+ repeated=self.inject({}) {|a,v|
+ (a[v].nil? ? a[v]=1 : a[v]+=1); a }.find_all{|k,v| v>1}.collect{|k,v| k}
+ ns=repeated.inject({}) {|a,v| a[v]=0;a}
+ self.collect do |f|
+ if repeated.include? f
+ ns[f]+=1
+ sprintf("%s_%d",f,ns[f])
+ else
+ f
+ end
+ end
+ else
+ self
+ end
+ end
+ end
+
  def create_test(*args,&proc)
  description=args.shift
  fields=args
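
The new Array#recode_repeated added above can be exercised as in this minimal sketch (it follows the RDoc example in the hunk and assumes the statsample gem is on the load path):

    require 'statsample'
    # Repeated values get a running suffix so every entry becomes unique,
    # per the RDoc example above.
    fields = %w{a b c c d d d e}
    p fields.recode_repeated
    # => ["a", "b", "c_1", "c_2", "d_1", "d_2", "d_3", "e"]
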
@@ -80,7 +108,7 @@ end
  # * Dataset: An union of vectors.
  #
  module Statsample
- VERSION = '0.5.0'
+ VERSION = '0.5.1'
  SPLIT_TOKEN = ","
  autoload(:Database, 'statsample/converters')
  autoload(:Anova, 'statsample/anova')
@@ -89,6 +117,7 @@ module Statsample
  autoload(:PlainText, 'statsample/converters')
  autoload(:Excel, 'statsample/converters')
  autoload(:GGobi, 'statsample/converters')
+ autoload(:SPSS, 'statsample/converter/spss')
  autoload(:DominanceAnalysis, 'statsample/dominanceanalysis')
  autoload(:HtmlReport, 'statsample/htmlreport')
  autoload(:Mx, 'statsample/converters')
@@ -1,70 +1,66 @@
  module Statsample
- module Anova
- # One Way Anova
- # Example:
- # v1=[2,3,4,5,6].to_vector(:scale)
- # v2=[3,3,4,5,6].to_vector(:scale)
- # v3=[5,3,1,5,6].to_vector(:scale)
- # anova=Statsample::Anova::OneWay.new([v1,v2,v3])
- # puts anova.f
- # puts anova.significance
- class OneWay
- def initialize(vectors)
- @vectors=vectors
+ module Anova
+ # One Way Anova
+ # Example:
+ # v1=[2,3,4,5,6].to_scale
+ # v2=[3,3,4,5,6].to_scale
+ # v3=[5,3,1,5,6].to_scale
+ # anova=Statsample::Anova::OneWay.new([v1,v2,v3])
+ # puts anova.f
+ # puts anova.significance
+ class OneWay
+ def initialize(vectors)
+ @vectors=vectors
+ end
+ # Total sum
+ def sum
+ @vectors.inject(0){|a,v| a+v.sum}
+ end
+ # Total mean
+ def mean
+ sum.quo(n)
+ end
+ # Total sum of squares
+ def sst
+ m=mean.to_f
+ @vectors.inject(0) {|total,vector| total+vector.sum_of_squares(m) }
+ end
+ # Sum of squares within groups
+ def sswg
+ @vectors.inject(0) {|total,vector| total+vector.sum_of_squares }
+ end
+ # Sum of squares between groups
+ def ssbg
+ m=mean
+ @vectors.inject(0) do |total,vector|
+ total + (vector.mean-m).square * vector.size
  end
- # Total sum
- def sum
- @vectors.inject(0){|a,v| a+v.sum}
- end
- # Total mean
- def mean
- sum.quo(n)
- end
- # Total sum of squares
- def sst
- m=mean.to_f
- @vectors.inject(0) {|total,vector|
- total+vector.sum_of_squares(m)
- }
- end
- # Sum of squares within groups
- def sswg
- @vectors.inject(0) {|total,vector|
- total+vector.sum_of_squares
- }
- end
- # Sum of squares between groups
- def ssbg
- m=mean
- @vectors.inject(0) {|total,vector|
- total+(vector.mean-m).square*vector.size
- }
- end
- # Degrees of freedom within groups
- def df_wg
- @vectors.inject(0) {|a,v| a+(v.size-1)}
- end
- # Degrees of freedom between groups
- def df_bg
- @vectors.size-1
- end
- # Total Degrees of freedom
- def df_total
- n-1
- end
- # Total number of cases
- def n
- @vectors.inject(0){|a,v| a+v.size}
- end
- # Fisher
- def f
- k=@vectors.size
- (ssbg*(n-k)) / (sswg*(k-1))
- end
- # Significance of Fisher
- def significance
- 1.0-Distribution::F.cdf(f,df_bg,df_wg)
- end
- end
+ end
+ # Degrees of freedom within groups
+ def df_wg
+ @vectors.inject(0) {|a,v| a+(v.size-1)}
+ end
+ # Degrees of freedom between groups
+ def df_bg
+ @vectors.size-1
+ end
+ # Total Degrees of freedom
+ def df_total
+ n-1
+ end
+ # Total number of cases
+ def n
+ @vectors.inject(0){|a,v| a+v.size}
+ end
+ # Fisher
+ def f
+ k=@vectors.size
+ (ssbg*(n-k)) / (sswg*(k-1))
+ end
+ # Significance of Fisher
+ def significance
+ 1.0-Distribution::F.cdf(f,df_bg,df_wg)
+ end
  end
+ end
  end
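
A minimal usage sketch of the reworked Anova::OneWay, following the updated RDoc example in the hunk above (it assumes statsample 0.5.1, where vectors are built with to_scale):

    require 'statsample'
    v1 = [2, 3, 4, 5, 6].to_scale
    v2 = [3, 3, 4, 5, 6].to_scale
    v3 = [5, 3, 1, 5, 6].to_scale
    anova = Statsample::Anova::OneWay.new([v1, v2, v3])
    puts anova.f            # F statistic: (ssbg*(n-k)) / (sswg*(k-1))
    puts anova.significance # 1.0 - Distribution::F.cdf(f, df_bg, df_wg)
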
@@ -1,286 +1,278 @@
+ require 'statsample/bivariate/tetrachoric'
  module Statsample
- # Diverse correlation methods
- module Bivariate
- class << self
- # Covariance between two vectors
- def covariance(v1,v2)
- v1a,v2a=Statsample.only_valid(v1,v2)
- return nil if v1a.size==0
- if HAS_GSL
- GSL::Stats::covariance(v1a.gsl, v2a.gsl)
- else
- covariance_slow(v1a,v2a)
- end
- end
- def maximum_likehood_dichotomic(pred,real)
- preda,reala=Statsample.only_valid(pred,real)
- sum=0
- pred.each_index{|i|
- sum+=(real[i]*Math::log(pred[i])) + ((1-real[i])*Math::log(1-pred[i]))
- }
- sum
- end
-
- def covariance_slow(v1a,v2a) # :nodoc:
- t=0
- m1=v1a.mean
- m2=v1a.mean
- (0...v1a.size).each {|i|
- t+=((v1a[i]-m1)*(v2a[i]-m2))
- }
- t.to_f / (v1a.size-1)
- end
- # Calculate Pearson correlation coefficient between 2 vectors
- def pearson(v1,v2)
- v1a,v2a=Statsample.only_valid(v1,v2)
- return nil if v1a.size ==0
- if HAS_GSL
- GSL::Stats::correlation(v1a.gsl, v2a.gsl)
- else
- pearson_slow(v1a,v2a)
- end
- end
- def pearson_slow(v1a,v2a) # :nodoc:
-
- v1s,v2s=v1a.vector_standarized_pop,v2a.vector_standarized_pop
- t=0
- siz=v1s.size
- (0...v1s.size).each {|i| t+=(v1s[i]*v2s[i]) }
- t.to_f/v2s.size
- end
- # Retrieves the value for t test for a pearson correlation
- # between two vectors to test the null hipothesis of r=0
- def t_pearson(v1,v2)
- v1a,v2a=Statsample.only_valid(v1,v2)
- r=pearson(v1a,v2a)
- if(r==1.0)
- 0
- else
- t_r(r,v1a.size)
- end
- end
- # Retrieves the value for t test for a pearson correlation
- # giving r and vector size
- def t_r(r,size)
- r * Math::sqrt(((size)-2).to_f / (1 - r**2))
- end
- # Retrieves the probability value (a la SPSS)
- # for a given t, size and number of tails.
- # Uses a second parameter
- # * :both or 2 : for r!=0
- # * :right, :positive or 1 : for r > 0
- # * :left, :negative : for r < 0
-
- def prop_pearson(t, size, tails=:both)
- tails=:both if tails==2
- tails=:right if tails==1 or tails==:positive
- tails=:left if tails==:negative
-
- n_tails=case tails
- when :both
- 2
- else
- 1
- end
- t=-t if t>0 and (tails==:both)
- cdf=Distribution::T.cdf(t, size-2)
- if(tails==:right)
- 1.0-(cdf*n_tails)
- else
- cdf*n_tails
- end
- end
- # Returns residual score after delete variance
- # from another variable
- #
- def residuals(from,del)
- r=Statsample::Bivariate.pearson(from,del)
- froms, dels = from.vector_standarized, del.vector_standarized
- nv=[]
- froms.data_with_nils.each_index{|i|
- if froms[i].nil? or dels[i].nil?
- nv.push(nil)
- else
- nv.push(froms[i]-r*dels[i])
- end
- }
- nv.to_vector(:scale)
- end
- # Correlation between v1 and v2, controling the effect of
- # control on both.
- def partial_correlation(v1,v2,control)
- v1a,v2a,cona=Statsample.only_valid(v1,v2,control)
- rv1v2=pearson(v1a,v2a)
- rv1con=pearson(v1a,cona)
- rv2con=pearson(v2a,cona)
-
- (rv1v2-(rv1con*rv2con)).quo(Math::sqrt(1-rv1con**2) * Math::sqrt(1-rv2con**2))
-
- end
- # Covariance matrix.
- # Order of rows and columns depends on Dataset#fields order
-
- def covariance_matrix(ds)
- ds.collect_matrix do |row,col|
- if (ds[row].type!=:scale or ds[col].type!=:scale)
- nil
- else
- covariance(ds[row],ds[col])
- end
- end
- end
-
- # Correlation matrix.
- # Order of rows and columns depends on Dataset#fields order
-
- def correlation_matrix(ds)
- ds.collect_matrix {|row,col|
- if row==col
- 1.0
- elsif (ds[row].type!=:scale or ds[col].type!=:scale)
- nil
- else
- pearson(ds[row],ds[col])
- end
- }
- end
- # Retrieves the n valid pairwise
- def n_valid_matrix(ds)
- ds.collect_matrix {|row,col|
- if row==col
- ds[row].valid_data.size
- else
- rowa,rowb=Statsample.only_valid(ds[row],ds[col])
- rowa.size
- end
- }
- end
- # Matrix of correlation probability
- # Order of rows and columns depends on Dataset#fields order
-
- def correlation_probability_matrix(ds, tails=:both)
- rows=ds.fields.collect{|row|
- ds.fields.collect{|col|
- v1a,v2a=Statsample.only_valid(ds[row],ds[col])
- (row==col or ds[row].type!=:scale or ds[col].type!=:scale) ? nil : prop_pearson(t_pearson(ds[row],ds[col]), v1a.size, tails)
- }
- }
- Matrix.rows(rows)
- end
- # Spearman ranked correlation coefficient between 2 vectors
- def spearman(v1,v2)
- v1a,v2a=Statsample.only_valid(v1,v2)
- v1r,v2r=v1a.ranked(:scale),v2a.ranked(:scale)
- pearson(v1r,v2r)
- end
- # Calculate Point biserial correlation.
- # Equal to Pearson correlation, with one dichotomous value replaced
- # by "0" and the other by "1"
- def point_biserial(dichotomous,continous)
- ds={'d'=>dichotomous,'c'=>continous}.to_dataset.dup_only_valid
- raise(TypeError, "First vector should be dichotomous") if ds['d'].factors.size!=2
- raise(TypeError, "Second vector should be continous") if ds['c'].type!=:scale
- f0=ds['d'].factors.sort[0]
- m0=ds.filter_field('c') {|c| c['d']==f0}
- m1=ds.filter_field('c') {|c| c['d']!=f0}
- ((m1.mean-m0.mean).to_f / ds['c'].sdp) * Math::sqrt(m0.size*m1.size.to_f / ds.cases**2)
- end
- # Kendall Rank Correlation Coefficient.
- #
- # Based on Hervé Adbi article
- def tau_a(v1,v2)
- v1a,v2a=Statsample.only_valid(v1,v2)
- n=v1.size
- v1r,v2r=v1a.ranked(:scale),v2a.ranked(:scale)
- o1=ordered_pairs(v1r)
- o2=ordered_pairs(v2r)
- delta= o1.size*2-(o2 & o1).size*2
- 1-(delta * 2 / (n*(n-1)).to_f)
- end
- # Calculates Tau b correlation.
- #
- # Tau-b defines perfect association as strict monotonicity.
- # Although it requires strict monotonicity to reach 1.0,
- # it does not penalize ties as much as some other measures.
- #
- # Source: http://faculty.chass.ncsu.edu/garson/PA765/assocordinal.htm
- def tau_b(matrix)
- v=pairs(matrix)
- ((v['P']-v['Q']).to_f / Math::sqrt((v['P']+v['Q']+v['Y'])*(v['P']+v['Q']+v['X'])).to_f)
- end
- # Calculates Goodman and Kruskal's gamma.
- #
- # Gamma is the surplus of concordant pairs over discordant pairs,
- # as a percentage of all pairs ignoring ties.
- #
- # Source: http://faculty.chass.ncsu.edu/garson/PA765/assocordinal.htm
- def gamma(matrix)
- v=pairs(matrix)
- (v['P']-v['Q']).to_f / (v['P']+v['Q']).to_f
- end
- # Calculate indexes for a matrix
- # the rows and cols has to be ordered
- def pairs(matrix)
- # calculate concordant
- #p matrix
- rs=matrix.row_size
- cs=matrix.column_size
- conc=disc=ties_x=ties_y=0
- (0...(rs-1)).each {|x|
- (0...(cs-1)).each{|y|
- ((x+1)...rs).each{|x2|
- ((y+1)...cs).each{|y2|
- #p sprintf("%d:%d,%d:%d",x,y,x2,y2)
- conc+=matrix[x,y]*matrix[x2,y2]
- }
- }
- }
- }
- (0...(rs-1)).each {|x|
- (1...(cs)).each{|y|
- ((x+1)...rs).each{|x2|
- (0...y).each{|y2|
- #p sprintf("%d:%d,%d:%d",x,y,x2,y2)
- disc+=matrix[x,y]*matrix[x2,y2]
- }
- }
- }
- }
- (0...(rs-1)).each {|x|
- (0...(cs)).each{|y|
- ((x+1)...(rs)).each{|x2|
- ties_x+=matrix[x,y]*matrix[x2,y]
- }
- }
- }
- (0...rs).each {|x|
- (0...(cs-1)).each{|y|
- ((y+1)...(cs)).each{|y2|
- ties_y+=matrix[x,y]*matrix[x,y2]
- }
- }
- }
- {'P'=>conc,'Q'=>disc,'Y'=>ties_y,'X'=>ties_x}
- end
- def ordered_pairs(vector)
- d=vector.data
- a=[]
- (0...(d.size-1)).each{|i|
- ((i+1)...(d.size)).each {|j|
- a.push([d[i],d[j]])
- }
- }
- a
- end
- def sum_of_codeviated(v1,v2)
- v1a,v2a=Statsample.only_valid(v1,v2)
- sum=0
- (0...v1a.size).each{|i|
- sum+=v1a[i]*v2a[i]
- }
- sum-((v1a.sum*v2a.sum) / v1a.size.to_f)
- end
+ # Diverse correlation methods
+ module Bivariate
+ class << self
+ # Covariance between two vectors
+ def covariance(v1,v2)
+ v1a,v2a=Statsample.only_valid(v1,v2)
+ return nil if v1a.size==0
+ if HAS_GSL
+ GSL::Stats::covariance(v1a.gsl, v2a.gsl)
+ else
+ covariance_slow(v1a,v2a)
+ end
+ end
+ def maximum_likehood_dichotomic(pred,real)
+ preda,reala=Statsample.only_valid(pred,real)
+ sum=0
+ pred.each_index{|i|
+ sum+=(real[i]*Math::log(pred[i])) + ((1-real[i])*Math::log(1-pred[i]))
+ }
+ sum
+ end
+
+ def covariance_slow(v1a,v2a) # :nodoc:
+ t=0
+ m1=v1a.mean
+ m2=v1a.mean
+ (0...v1a.size).each {|i| t+=((v1a[i]-m1)*(v2a[i]-m2)) }
+ t.to_f / (v1a.size-1)
+ end
+ # Calculate Pearson correlation coefficient between 2 vectors
+ def pearson(v1,v2)
+ v1a,v2a=Statsample.only_valid(v1,v2)
+ return nil if v1a.size ==0
+ if HAS_GSL
+ GSL::Stats::correlation(v1a.gsl, v2a.gsl)
+ else
+ pearson_slow(v1a,v2a)
+ end
+ end
+ def pearson_slow(v1a,v2a) # :nodoc:
+ v1s,v2s=v1a.vector_standarized_pop,v2a.vector_standarized_pop
+ t=0
+ siz=v1s.size
+ (0...v1s.size).each {|i| t+=(v1s[i]*v2s[i]) }
+ t.to_f/v2s.size
+ end
+ # Retrieves the value for t test for a pearson correlation
+ # between two vectors to test the null hipothesis of r=0
+ def t_pearson(v1,v2)
+ v1a,v2a=Statsample.only_valid(v1,v2)
+ r=pearson(v1a,v2a)
+ if(r==1.0)
+ 0
+ else
+ t_r(r,v1a.size)
+ end
+ end
+ # Retrieves the value for t test for a pearson correlation
+ # giving r and vector size
+ def t_r(r,size)
+ r * Math::sqrt(((size)-2).to_f / (1 - r**2))
+ end
+ # Retrieves the probability value (a la SPSS)
+ # for a given t, size and number of tails.
+ # Uses a second parameter
+ # * :both or 2 : for r!=0
+ # * :right, :positive or 1 : for r > 0
+ # * :left, :negative : for r < 0
+
+ def prop_pearson(t, size, tails=:both)
+ tails=:both if tails==2
+ tails=:right if tails==1 or tails==:positive
+ tails=:left if tails==:negative
+
+ n_tails=case tails
+ when :both then 2
+ else 1
+ end
+ t=-t if t>0 and (tails==:both)
+ cdf=Distribution::T.cdf(t, size-2)
+ if(tails==:right)
+ 1.0-(cdf*n_tails)
+ else
+ cdf*n_tails
+ end
+ end
+ # Returns residual score after delete variance
+ # from another variable
+ #
+ def residuals(from,del)
+ r=Statsample::Bivariate.pearson(from,del)
+ froms, dels = from.vector_standarized, del.vector_standarized
+ nv=[]
+ froms.data_with_nils.each_index do |i|
+ if froms[i].nil? or dels[i].nil?
+ nv.push(nil)
+ else
+ nv.push(froms[i]-r*dels[i])
+ end
+ end
+ nv.to_vector(:scale)
+ end
+ # Correlation between v1 and v2, controling the effect of
+ # control on both.
+ def partial_correlation(v1,v2,control)
+ v1a,v2a,cona=Statsample.only_valid(v1,v2,control)
+ rv1v2=pearson(v1a,v2a)
+ rv1con=pearson(v1a,cona)
+ rv2con=pearson(v2a,cona)
+ (rv1v2-(rv1con*rv2con)).quo(Math::sqrt(1-rv1con**2) * Math::sqrt(1-rv2con**2))
+
+ end
+ # Covariance matrix.
+ # Order of rows and columns depends on Dataset#fields order
+
+ def covariance_matrix(ds)
+ ds.collect_matrix do |row,col|
+ if (ds[row].type!=:scale or ds[col].type!=:scale)
+ nil
+ else
+ covariance(ds[row],ds[col])
+ end
+ end
+ end
+
+ # Correlation matrix.
+ # Order of rows and columns depends on Dataset#fields order
+
+ def correlation_matrix(ds)
+ ds.collect_matrix do |row,col|
+ if row==col
+ 1.0
+ elsif (ds[row].type!=:scale or ds[col].type!=:scale)
+ nil
+ else
+ pearson(ds[row],ds[col])
+ end
+ end
+ end
+ # Retrieves the n valid pairwise
+ def n_valid_matrix(ds)
+ ds.collect_matrix do |row,col|
+ if row==col
+ ds[row].valid_data.size
+ else
+ rowa,rowb=Statsample.only_valid(ds[row],ds[col])
+ rowa.size
+ end
+ end
+ end
+ # Matrix of correlation probability
+ # Order of rows and columns depends on Dataset#fields order
+
+ def correlation_probability_matrix(ds, tails=:both)
+ rows=ds.fields.collect do |row|
+ ds.fields.collect do |col|
+ v1a,v2a=Statsample.only_valid(ds[row],ds[col])
+ (row==col or ds[row].type!=:scale or ds[col].type!=:scale) ? nil : prop_pearson(t_pearson(ds[row],ds[col]), v1a.size, tails)
+ end
  end
+ Matrix.rows(rows)
+ end
+ # Spearman ranked correlation coefficient between 2 vectors
+ def spearman(v1,v2)
+ v1a,v2a=Statsample.only_valid(v1,v2)
+ v1r,v2r=v1a.ranked(:scale),v2a.ranked(:scale)
+ pearson(v1r,v2r)
+ end
+ # Calculate Point biserial correlation. Equal to Pearson correlation, with
+ # one dichotomous value replaced by "0" and the other by "1"
+ def point_biserial(dichotomous,continous)
+ ds={'d'=>dichotomous,'c'=>continous}.to_dataset.dup_only_valid
+ raise(TypeError, "First vector should be dichotomous") if ds['d'].factors.size!=2
+ raise(TypeError, "Second vector should be continous") if ds['c'].type!=:scale
+ f0=ds['d'].factors.sort[0]
+ m0=ds.filter_field('c') {|c| c['d']==f0}
+ m1=ds.filter_field('c') {|c| c['d']!=f0}
+ ((m1.mean-m0.mean).to_f / ds['c'].sdp) * Math::sqrt(m0.size*m1.size.to_f / ds.cases**2)
+ end
+ # Kendall Rank Correlation Coefficient.
+ #
+ # Based on Hervé Adbi article
+ def tau_a(v1,v2)
+ v1a,v2a=Statsample.only_valid(v1,v2)
+ n=v1.size
+ v1r,v2r=v1a.ranked(:scale),v2a.ranked(:scale)
+ o1=ordered_pairs(v1r)
+ o2=ordered_pairs(v2r)
+ delta= o1.size*2-(o2 & o1).size*2
+ 1-(delta * 2 / (n*(n-1)).to_f)
+ end
+ # Calculates Tau b correlation.
+ #
+ # Tau-b defines perfect association as strict monotonicity. Although it
+ # requires strict monotonicity to reach 1.0, it does not penalize ties as
+ # much as some other measures.
+ #
+ # Source: http://faculty.chass.ncsu.edu/garson/PA765/assocordinal.htm
+ def tau_b(matrix)
+ v=pairs(matrix)
+ ((v['P']-v['Q']).to_f / Math::sqrt((v['P']+v['Q']+v['Y'])*(v['P']+v['Q']+v['X'])).to_f)
+ end
+ # Calculates Goodman and Kruskal's gamma.
+ #
+ # Gamma is the surplus of concordant pairs over discordant pairs, as a
+ # percentage of all pairs ignoring ties.
+ #
+ # Source: http://faculty.chass.ncsu.edu/garson/PA765/assocordinal.htm
+ def gamma(matrix)
+ v=pairs(matrix)
+ (v['P']-v['Q']).to_f / (v['P']+v['Q']).to_f
+ end
+ # Calculate indexes for a matrix the rows and cols has to be ordered
+ def pairs(matrix)
+ # calculate concordant #p matrix
+ rs=matrix.row_size
+ cs=matrix.column_size
+ conc=disc=ties_x=ties_y=0
+ (0...(rs-1)).each {|x|
+ (0...(cs-1)).each{|y|
+ ((x+1)...rs).each{|x2|
+ ((y+1)...cs).each{|y2|
+ # #p sprintf("%d:%d,%d:%d",x,y,x2,y2)
+ conc+=matrix[x,y]*matrix[x2,y2]
+ }
+ }
+ }
+ }
+ (0...(rs-1)).each {|x|
+ (1...(cs)).each{|y|
+ ((x+1)...rs).each{|x2|
+ (0...y).each{|y2|
+ # #p sprintf("%d:%d,%d:%d",x,y,x2,y2)
+ disc+=matrix[x,y]*matrix[x2,y2]
+ }
+ }
+ }
+ }
+ (0...(rs-1)).each {|x|
+ (0...(cs)).each{|y|
+ ((x+1)...(rs)).each{|x2|
+ ties_x+=matrix[x,y]*matrix[x2,y]
+ }
+ }
+ }
+ (0...rs).each {|x|
+ (0...(cs-1)).each{|y|
+ ((y+1)...(cs)).each{|y2|
+ ties_y+=matrix[x,y]*matrix[x,y2]
+ }
+ }
+ }
+ {'P'=>conc,'Q'=>disc,'Y'=>ties_y,'X'=>ties_x}
+ end
+ def ordered_pairs(vector)
+ d=vector.data
+ a=[]
+ (0...(d.size-1)).each{|i|
+ ((i+1)...(d.size)).each {|j|
+ a.push([d[i],d[j]])
+ }
+ }
+ a
+ end
+ def sum_of_codeviated(v1,v2)
+ v1a,v2a=Statsample.only_valid(v1,v2)
+ sum=0
+ (0...v1a.size).each{|i|
+ sum+=v1a[i]*v2a[i]
+ }
+ sum-((v1a.sum*v2a.sum) / v1a.size.to_f)
+ end
  end
+ end
  end
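
For orientation, a minimal sketch of the Statsample::Bivariate helpers touched in this hunk (the vectors and values are illustrative only; it assumes statsample 0.5.1):

    require 'statsample'
    a = [1, 2, 3, 4, 5].to_vector(:scale)
    b = [2, 3, 5, 8, 11].to_vector(:scale)
    r   = Statsample::Bivariate.pearson(a, b)            # Pearson r
    t   = Statsample::Bivariate.t_pearson(a, b)          # t statistic for H0: r=0
    p_r = Statsample::Bivariate.prop_pearson(t, a.size)  # two-tailed probability, a la SPSS
    rho = Statsample::Bivariate.spearman(a, b)           # Spearman rank correlation
    puts [r, t, p_r, rho].inspect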