statsample 0.5.0 → 0.5.1

This diff shows the content of publicly released package versions as they appear in their public registries and is provided for informational purposes only.
@@ -0,0 +1,3 @@
+ require File.dirname(__FILE__)+"/../lib/statsample"
+ ds=Statsample::PlainText.read(File.dirname(__FILE__)+"/../data/tetmat_test.txt", %w{a b c d e})
+ puts Statsample::SPSS.tetrachoric_correlation_matrix(ds)
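The three added lines above are a small example script (its path is not shown in this hunk): it loads a plain-text dataset and prints the tetrachoric correlation matrix via the new Statsample::SPSS entry point. A minimal sketch of the same call on an in-memory dataset, assuming dichotomous 0/1 vectors and the Array#to_vector / Hash#to_dataset helpers statsample already provides:

  require 'statsample'
  # hypothetical 0/1 data standing in for data/tetmat_test.txt
  a=[0,0,1,1,0,1,1,0].to_vector
  b=[1,0,1,1,0,1,0,0].to_vector
  ds={'a'=>a,'b'=>b}.to_dataset
  puts Statsample::SPSS.tetrachoric_correlation_matrix(ds)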
data/lib/spss.rb CHANGED
@@ -6,7 +6,7 @@
  #
  # Claudio Bustos mailto:clbustos@gmail.com
 
- module SPSS
+ module SPSS # :nodoc: all
  module Dictionary
  class Element
  def add(a)
data/lib/statistics2.rb CHANGED
@@ -10,7 +10,7 @@
  # [1] http://www.matsusaka-u.ac.jp/~okumura/algo/
  # [2] http://www5.airnet.ne.jp/tomy/cpro/sslib11.htm
 
- module Statistics2
+ module Statistics2 # :nodoc:
  SQ2PI = Math.sqrt(2 * Math::PI)
 
  # Newton approximation
data/lib/statsample.rb CHANGED
@@ -38,6 +38,34 @@ class String
  end
  end
 
+
+ class Array
+ # Recode repeated values on an array, adding the number of repetition
+ # at the end
+ # Example:
+ # a=%w{a b c c d d d e}
+ # a.recode_repeated
+ # => ["a","b","c_1","c_2","d_1","d_2","d_3","e"]
+ def recode_repeated
+ if self.size!=self.uniq.size
+ # Find repeated
+ repeated=self.inject({}) {|a,v|
+ (a[v].nil? ? a[v]=1 : a[v]+=1); a }.find_all{|k,v| v>1}.collect{|k,v| k}
+ ns=repeated.inject({}) {|a,v| a[v]=0;a}
+ self.collect do |f|
+ if repeated.include? f
+ ns[f]+=1
+ sprintf("%s_%d",f,ns[f])
+ else
+ f
+ end
+ end
+ else
+ self
+ end
+ end
+ end
+
  def create_test(*args,&proc)
  description=args.shift
  fields=args
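A minimal sketch of the new Array#recode_repeated patch in use, following the example in its own comment:

  require 'statsample'
  %w{a b c c d d d e}.recode_repeated
  # => ["a", "b", "c_1", "c_2", "d_1", "d_2", "d_3", "e"]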
@@ -80,7 +108,7 @@ end
  # * Dataset: An union of vectors.
  #
  module Statsample
- VERSION = '0.5.0'
+ VERSION = '0.5.1'
  SPLIT_TOKEN = ","
  autoload(:Database, 'statsample/converters')
  autoload(:Anova, 'statsample/anova')
@@ -89,6 +117,7 @@ module Statsample
  autoload(:PlainText, 'statsample/converters')
  autoload(:Excel, 'statsample/converters')
  autoload(:GGobi, 'statsample/converters')
+ autoload(:SPSS, 'statsample/converter/spss')
  autoload(:DominanceAnalysis, 'statsample/dominanceanalysis')
  autoload(:HtmlReport, 'statsample/htmlreport')
  autoload(:Mx, 'statsample/converters')
@@ -1,70 +1,66 @@
  module Statsample
- module Anova
- # One Way Anova
- # Example:
- # v1=[2,3,4,5,6].to_vector(:scale)
- # v2=[3,3,4,5,6].to_vector(:scale)
- # v3=[5,3,1,5,6].to_vector(:scale)
- # anova=Statsample::Anova::OneWay.new([v1,v2,v3])
- # puts anova.f
- # puts anova.significance
- class OneWay
- def initialize(vectors)
- @vectors=vectors
+ module Anova
+ # One Way Anova
+ # Example:
+ # v1=[2,3,4,5,6].to_scale
+ # v2=[3,3,4,5,6].to_scale
+ # v3=[5,3,1,5,6].to_scale
+ # anova=Statsample::Anova::OneWay.new([v1,v2,v3])
+ # puts anova.f
+ # puts anova.significance
+ class OneWay
+ def initialize(vectors)
+ @vectors=vectors
+ end
+ # Total sum
+ def sum
+ @vectors.inject(0){|a,v| a+v.sum}
+ end
+ # Total mean
+ def mean
+ sum.quo(n)
+ end
+ # Total sum of squares
+ def sst
+ m=mean.to_f
+ @vectors.inject(0) {|total,vector| total+vector.sum_of_squares(m) }
+ end
+ # Sum of squares within groups
+ def sswg
+ @vectors.inject(0) {|total,vector| total+vector.sum_of_squares }
+ end
+ # Sum of squares between groups
+ def ssbg
+ m=mean
+ @vectors.inject(0) do |total,vector|
+ total + (vector.mean-m).square * vector.size
  end
- # Total sum
- def sum
- @vectors.inject(0){|a,v| a+v.sum}
- end
- # Total mean
- def mean
- sum.quo(n)
- end
- # Total sum of squares
- def sst
- m=mean.to_f
- @vectors.inject(0) {|total,vector|
- total+vector.sum_of_squares(m)
- }
- end
- # Sum of squares within groups
- def sswg
- @vectors.inject(0) {|total,vector|
- total+vector.sum_of_squares
- }
- end
- # Sum of squares between groups
- def ssbg
- m=mean
- @vectors.inject(0) {|total,vector|
- total+(vector.mean-m).square*vector.size
- }
- end
- # Degrees of freedom within groups
- def df_wg
- @vectors.inject(0) {|a,v| a+(v.size-1)}
- end
- # Degrees of freedom between groups
- def df_bg
- @vectors.size-1
- end
- # Total Degrees of freedom
- def df_total
- n-1
- end
- # Total number of cases
- def n
- @vectors.inject(0){|a,v| a+v.size}
- end
- # Fisher
- def f
- k=@vectors.size
- (ssbg*(n-k)) / (sswg*(k-1))
- end
- # Significance of Fisher
- def significance
- 1.0-Distribution::F.cdf(f,df_bg,df_wg)
- end
- end
+ end
+ # Degrees of freedom within groups
+ def df_wg
+ @vectors.inject(0) {|a,v| a+(v.size-1)}
+ end
+ # Degrees of freedom between groups
+ def df_bg
+ @vectors.size-1
+ end
+ # Total Degrees of freedom
+ def df_total
+ n-1
+ end
+ # Total number of cases
+ def n
+ @vectors.inject(0){|a,v| a+v.size}
+ end
+ # Fisher
+ def f
+ k=@vectors.size
+ (ssbg*(n-k)) / (sswg*(k-1))
+ end
+ # Significance of Fisher
+ def significance
+ 1.0-Distribution::F.cdf(f,df_bg,df_wg)
+ end
  end
+ end
  end
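This hunk is mostly re-indentation and block-style cleanup; the only documentation change is that the example now uses the shorter .to_scale form instead of .to_vector(:scale). A minimal usage sketch taken from that comment:

  require 'statsample'
  v1=[2,3,4,5,6].to_scale
  v2=[3,3,4,5,6].to_scale
  v3=[5,3,1,5,6].to_scale
  anova=Statsample::Anova::OneWay.new([v1,v2,v3])
  puts anova.f            # F statistic
  puts anova.significance # p value from the F distribution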
@@ -1,286 +1,278 @@
+ require 'statsample/bivariate/tetrachoric'
  module Statsample
- # Diverse correlation methods
- module Bivariate
- class << self
- # Covariance between two vectors
- def covariance(v1,v2)
- v1a,v2a=Statsample.only_valid(v1,v2)
- return nil if v1a.size==0
- if HAS_GSL
- GSL::Stats::covariance(v1a.gsl, v2a.gsl)
- else
- covariance_slow(v1a,v2a)
- end
- end
- def maximum_likehood_dichotomic(pred,real)
- preda,reala=Statsample.only_valid(pred,real)
- sum=0
- pred.each_index{|i|
- sum+=(real[i]*Math::log(pred[i])) + ((1-real[i])*Math::log(1-pred[i]))
- }
- sum
- end
-
- def covariance_slow(v1a,v2a) # :nodoc:
- t=0
- m1=v1a.mean
- m2=v1a.mean
- (0...v1a.size).each {|i|
- t+=((v1a[i]-m1)*(v2a[i]-m2))
- }
- t.to_f / (v1a.size-1)
- end
- # Calculate Pearson correlation coefficient between 2 vectors
- def pearson(v1,v2)
- v1a,v2a=Statsample.only_valid(v1,v2)
- return nil if v1a.size ==0
- if HAS_GSL
- GSL::Stats::correlation(v1a.gsl, v2a.gsl)
- else
- pearson_slow(v1a,v2a)
- end
- end
- def pearson_slow(v1a,v2a) # :nodoc:
-
- v1s,v2s=v1a.vector_standarized_pop,v2a.vector_standarized_pop
- t=0
- siz=v1s.size
- (0...v1s.size).each {|i| t+=(v1s[i]*v2s[i]) }
- t.to_f/v2s.size
- end
- # Retrieves the value for t test for a pearson correlation
- # between two vectors to test the null hipothesis of r=0
- def t_pearson(v1,v2)
- v1a,v2a=Statsample.only_valid(v1,v2)
- r=pearson(v1a,v2a)
- if(r==1.0)
- 0
- else
- t_r(r,v1a.size)
- end
- end
- # Retrieves the value for t test for a pearson correlation
- # giving r and vector size
- def t_r(r,size)
- r * Math::sqrt(((size)-2).to_f / (1 - r**2))
- end
- # Retrieves the probability value (a la SPSS)
- # for a given t, size and number of tails.
- # Uses a second parameter
- # * :both or 2 : for r!=0
- # * :right, :positive or 1 : for r > 0
- # * :left, :negative : for r < 0
-
- def prop_pearson(t, size, tails=:both)
- tails=:both if tails==2
- tails=:right if tails==1 or tails==:positive
- tails=:left if tails==:negative
-
- n_tails=case tails
- when :both
- 2
- else
- 1
- end
- t=-t if t>0 and (tails==:both)
- cdf=Distribution::T.cdf(t, size-2)
- if(tails==:right)
- 1.0-(cdf*n_tails)
- else
- cdf*n_tails
- end
- end
- # Returns residual score after delete variance
- # from another variable
- #
- def residuals(from,del)
- r=Statsample::Bivariate.pearson(from,del)
- froms, dels = from.vector_standarized, del.vector_standarized
- nv=[]
- froms.data_with_nils.each_index{|i|
- if froms[i].nil? or dels[i].nil?
- nv.push(nil)
- else
- nv.push(froms[i]-r*dels[i])
- end
- }
- nv.to_vector(:scale)
- end
- # Correlation between v1 and v2, controling the effect of
- # control on both.
- def partial_correlation(v1,v2,control)
- v1a,v2a,cona=Statsample.only_valid(v1,v2,control)
- rv1v2=pearson(v1a,v2a)
- rv1con=pearson(v1a,cona)
- rv2con=pearson(v2a,cona)
-
- (rv1v2-(rv1con*rv2con)).quo(Math::sqrt(1-rv1con**2) * Math::sqrt(1-rv2con**2))
-
- end
- # Covariance matrix.
- # Order of rows and columns depends on Dataset#fields order
-
- def covariance_matrix(ds)
- ds.collect_matrix do |row,col|
- if (ds[row].type!=:scale or ds[col].type!=:scale)
- nil
- else
- covariance(ds[row],ds[col])
- end
- end
- end
-
- # Correlation matrix.
- # Order of rows and columns depends on Dataset#fields order
-
- def correlation_matrix(ds)
- ds.collect_matrix {|row,col|
- if row==col
- 1.0
- elsif (ds[row].type!=:scale or ds[col].type!=:scale)
- nil
- else
- pearson(ds[row],ds[col])
- end
- }
- end
- # Retrieves the n valid pairwise
- def n_valid_matrix(ds)
- ds.collect_matrix {|row,col|
- if row==col
- ds[row].valid_data.size
- else
- rowa,rowb=Statsample.only_valid(ds[row],ds[col])
- rowa.size
- end
- }
- end
- # Matrix of correlation probability
- # Order of rows and columns depends on Dataset#fields order
-
- def correlation_probability_matrix(ds, tails=:both)
- rows=ds.fields.collect{|row|
- ds.fields.collect{|col|
- v1a,v2a=Statsample.only_valid(ds[row],ds[col])
- (row==col or ds[row].type!=:scale or ds[col].type!=:scale) ? nil : prop_pearson(t_pearson(ds[row],ds[col]), v1a.size, tails)
- }
- }
- Matrix.rows(rows)
- end
- # Spearman ranked correlation coefficient between 2 vectors
- def spearman(v1,v2)
- v1a,v2a=Statsample.only_valid(v1,v2)
- v1r,v2r=v1a.ranked(:scale),v2a.ranked(:scale)
- pearson(v1r,v2r)
- end
- # Calculate Point biserial correlation.
- # Equal to Pearson correlation, with one dichotomous value replaced
- # by "0" and the other by "1"
- def point_biserial(dichotomous,continous)
- ds={'d'=>dichotomous,'c'=>continous}.to_dataset.dup_only_valid
- raise(TypeError, "First vector should be dichotomous") if ds['d'].factors.size!=2
- raise(TypeError, "Second vector should be continous") if ds['c'].type!=:scale
- f0=ds['d'].factors.sort[0]
- m0=ds.filter_field('c') {|c| c['d']==f0}
- m1=ds.filter_field('c') {|c| c['d']!=f0}
- ((m1.mean-m0.mean).to_f / ds['c'].sdp) * Math::sqrt(m0.size*m1.size.to_f / ds.cases**2)
- end
- # Kendall Rank Correlation Coefficient.
- #
- # Based on Hervé Adbi article
- def tau_a(v1,v2)
- v1a,v2a=Statsample.only_valid(v1,v2)
- n=v1.size
- v1r,v2r=v1a.ranked(:scale),v2a.ranked(:scale)
- o1=ordered_pairs(v1r)
- o2=ordered_pairs(v2r)
- delta= o1.size*2-(o2 & o1).size*2
- 1-(delta * 2 / (n*(n-1)).to_f)
- end
- # Calculates Tau b correlation.
- #
- # Tau-b defines perfect association as strict monotonicity.
- # Although it requires strict monotonicity to reach 1.0,
- # it does not penalize ties as much as some other measures.
- #
- # Source: http://faculty.chass.ncsu.edu/garson/PA765/assocordinal.htm
- def tau_b(matrix)
- v=pairs(matrix)
- ((v['P']-v['Q']).to_f / Math::sqrt((v['P']+v['Q']+v['Y'])*(v['P']+v['Q']+v['X'])).to_f)
- end
- # Calculates Goodman and Kruskal's gamma.
- #
- # Gamma is the surplus of concordant pairs over discordant pairs,
- # as a percentage of all pairs ignoring ties.
- #
- # Source: http://faculty.chass.ncsu.edu/garson/PA765/assocordinal.htm
- def gamma(matrix)
- v=pairs(matrix)
- (v['P']-v['Q']).to_f / (v['P']+v['Q']).to_f
- end
- # Calculate indexes for a matrix
- # the rows and cols has to be ordered
- def pairs(matrix)
- # calculate concordant
- #p matrix
- rs=matrix.row_size
- cs=matrix.column_size
- conc=disc=ties_x=ties_y=0
- (0...(rs-1)).each {|x|
- (0...(cs-1)).each{|y|
- ((x+1)...rs).each{|x2|
- ((y+1)...cs).each{|y2|
- #p sprintf("%d:%d,%d:%d",x,y,x2,y2)
- conc+=matrix[x,y]*matrix[x2,y2]
- }
- }
- }
- }
- (0...(rs-1)).each {|x|
- (1...(cs)).each{|y|
- ((x+1)...rs).each{|x2|
- (0...y).each{|y2|
- #p sprintf("%d:%d,%d:%d",x,y,x2,y2)
- disc+=matrix[x,y]*matrix[x2,y2]
- }
- }
- }
- }
- (0...(rs-1)).each {|x|
- (0...(cs)).each{|y|
- ((x+1)...(rs)).each{|x2|
- ties_x+=matrix[x,y]*matrix[x2,y]
- }
- }
- }
- (0...rs).each {|x|
- (0...(cs-1)).each{|y|
- ((y+1)...(cs)).each{|y2|
- ties_y+=matrix[x,y]*matrix[x,y2]
- }
- }
- }
- {'P'=>conc,'Q'=>disc,'Y'=>ties_y,'X'=>ties_x}
- end
- def ordered_pairs(vector)
- d=vector.data
- a=[]
- (0...(d.size-1)).each{|i|
- ((i+1)...(d.size)).each {|j|
- a.push([d[i],d[j]])
- }
- }
- a
- end
- def sum_of_codeviated(v1,v2)
- v1a,v2a=Statsample.only_valid(v1,v2)
- sum=0
- (0...v1a.size).each{|i|
- sum+=v1a[i]*v2a[i]
- }
- sum-((v1a.sum*v2a.sum) / v1a.size.to_f)
- end
+ # Diverse correlation methods
+ module Bivariate
+ class << self
+ # Covariance between two vectors
+ def covariance(v1,v2)
+ v1a,v2a=Statsample.only_valid(v1,v2)
+ return nil if v1a.size==0
+ if HAS_GSL
+ GSL::Stats::covariance(v1a.gsl, v2a.gsl)
+ else
+ covariance_slow(v1a,v2a)
+ end
+ end
+ def maximum_likehood_dichotomic(pred,real)
+ preda,reala=Statsample.only_valid(pred,real)
+ sum=0
+ pred.each_index{|i|
+ sum+=(real[i]*Math::log(pred[i])) + ((1-real[i])*Math::log(1-pred[i]))
+ }
+ sum
+ end
+
+ def covariance_slow(v1a,v2a) # :nodoc:
+ t=0
+ m1=v1a.mean
+ m2=v1a.mean
+ (0...v1a.size).each {|i| t+=((v1a[i]-m1)*(v2a[i]-m2)) }
+ t.to_f / (v1a.size-1)
+ end
+ # Calculate Pearson correlation coefficient between 2 vectors
+ def pearson(v1,v2)
+ v1a,v2a=Statsample.only_valid(v1,v2)
+ return nil if v1a.size ==0
+ if HAS_GSL
+ GSL::Stats::correlation(v1a.gsl, v2a.gsl)
+ else
+ pearson_slow(v1a,v2a)
+ end
+ end
+ def pearson_slow(v1a,v2a) # :nodoc:
+ v1s,v2s=v1a.vector_standarized_pop,v2a.vector_standarized_pop
+ t=0
+ siz=v1s.size
+ (0...v1s.size).each {|i| t+=(v1s[i]*v2s[i]) }
+ t.to_f/v2s.size
+ end
+ # Retrieves the value for t test for a pearson correlation
+ # between two vectors to test the null hipothesis of r=0
+ def t_pearson(v1,v2)
+ v1a,v2a=Statsample.only_valid(v1,v2)
+ r=pearson(v1a,v2a)
+ if(r==1.0)
+ 0
+ else
+ t_r(r,v1a.size)
+ end
+ end
+ # Retrieves the value for t test for a pearson correlation
+ # giving r and vector size
+ def t_r(r,size)
+ r * Math::sqrt(((size)-2).to_f / (1 - r**2))
+ end
+ # Retrieves the probability value (a la SPSS)
+ # for a given t, size and number of tails.
+ # Uses a second parameter
+ # * :both or 2 : for r!=0
+ # * :right, :positive or 1 : for r > 0
+ # * :left, :negative : for r < 0
+
+ def prop_pearson(t, size, tails=:both)
+ tails=:both if tails==2
+ tails=:right if tails==1 or tails==:positive
+ tails=:left if tails==:negative
+
+ n_tails=case tails
+ when :both then 2
+ else 1
+ end
+ t=-t if t>0 and (tails==:both)
+ cdf=Distribution::T.cdf(t, size-2)
+ if(tails==:right)
+ 1.0-(cdf*n_tails)
+ else
+ cdf*n_tails
+ end
+ end
+ # Returns residual score after delete variance
+ # from another variable
+ #
+ def residuals(from,del)
+ r=Statsample::Bivariate.pearson(from,del)
+ froms, dels = from.vector_standarized, del.vector_standarized
+ nv=[]
+ froms.data_with_nils.each_index do |i|
+ if froms[i].nil? or dels[i].nil?
+ nv.push(nil)
+ else
+ nv.push(froms[i]-r*dels[i])
+ end
+ end
+ nv.to_vector(:scale)
+ end
+ # Correlation between v1 and v2, controling the effect of
+ # control on both.
+ def partial_correlation(v1,v2,control)
+ v1a,v2a,cona=Statsample.only_valid(v1,v2,control)
+ rv1v2=pearson(v1a,v2a)
+ rv1con=pearson(v1a,cona)
+ rv2con=pearson(v2a,cona)
+ (rv1v2-(rv1con*rv2con)).quo(Math::sqrt(1-rv1con**2) * Math::sqrt(1-rv2con**2))
+
+ end
+ # Covariance matrix.
+ # Order of rows and columns depends on Dataset#fields order
+
+ def covariance_matrix(ds)
+ ds.collect_matrix do |row,col|
+ if (ds[row].type!=:scale or ds[col].type!=:scale)
+ nil
+ else
+ covariance(ds[row],ds[col])
+ end
+ end
+ end
+
+ # Correlation matrix.
+ # Order of rows and columns depends on Dataset#fields order
+
+ def correlation_matrix(ds)
+ ds.collect_matrix do |row,col|
+ if row==col
+ 1.0
+ elsif (ds[row].type!=:scale or ds[col].type!=:scale)
+ nil
+ else
+ pearson(ds[row],ds[col])
+ end
+ end
+ end
+ # Retrieves the n valid pairwise
+ def n_valid_matrix(ds)
+ ds.collect_matrix do |row,col|
+ if row==col
+ ds[row].valid_data.size
+ else
+ rowa,rowb=Statsample.only_valid(ds[row],ds[col])
+ rowa.size
+ end
+ end
+ end
+ # Matrix of correlation probability
+ # Order of rows and columns depends on Dataset#fields order
+
+ def correlation_probability_matrix(ds, tails=:both)
+ rows=ds.fields.collect do |row|
+ ds.fields.collect do |col|
+ v1a,v2a=Statsample.only_valid(ds[row],ds[col])
+ (row==col or ds[row].type!=:scale or ds[col].type!=:scale) ? nil : prop_pearson(t_pearson(ds[row],ds[col]), v1a.size, tails)
+ end
  end
+ Matrix.rows(rows)
+ end
+ # Spearman ranked correlation coefficient between 2 vectors
+ def spearman(v1,v2)
+ v1a,v2a=Statsample.only_valid(v1,v2)
+ v1r,v2r=v1a.ranked(:scale),v2a.ranked(:scale)
+ pearson(v1r,v2r)
+ end
+ # Calculate Point biserial correlation. Equal to Pearson correlation, with
+ # one dichotomous value replaced by "0" and the other by "1"
+ def point_biserial(dichotomous,continous)
+ ds={'d'=>dichotomous,'c'=>continous}.to_dataset.dup_only_valid
+ raise(TypeError, "First vector should be dichotomous") if ds['d'].factors.size!=2
+ raise(TypeError, "Second vector should be continous") if ds['c'].type!=:scale
+ f0=ds['d'].factors.sort[0]
+ m0=ds.filter_field('c') {|c| c['d']==f0}
+ m1=ds.filter_field('c') {|c| c['d']!=f0}
+ ((m1.mean-m0.mean).to_f / ds['c'].sdp) * Math::sqrt(m0.size*m1.size.to_f / ds.cases**2)
+ end
+ # Kendall Rank Correlation Coefficient.
+ #
+ # Based on Hervé Adbi article
+ def tau_a(v1,v2)
+ v1a,v2a=Statsample.only_valid(v1,v2)
+ n=v1.size
+ v1r,v2r=v1a.ranked(:scale),v2a.ranked(:scale)
+ o1=ordered_pairs(v1r)
+ o2=ordered_pairs(v2r)
+ delta= o1.size*2-(o2 & o1).size*2
+ 1-(delta * 2 / (n*(n-1)).to_f)
+ end
+ # Calculates Tau b correlation.
+ #
+ # Tau-b defines perfect association as strict monotonicity. Although it
+ # requires strict monotonicity to reach 1.0, it does not penalize ties as
+ # much as some other measures.
+ #
+ # Source: http://faculty.chass.ncsu.edu/garson/PA765/assocordinal.htm
+ def tau_b(matrix)
+ v=pairs(matrix)
+ ((v['P']-v['Q']).to_f / Math::sqrt((v['P']+v['Q']+v['Y'])*(v['P']+v['Q']+v['X'])).to_f)
+ end
+ # Calculates Goodman and Kruskal's gamma.
+ #
+ # Gamma is the surplus of concordant pairs over discordant pairs, as a
+ # percentage of all pairs ignoring ties.
+ #
+ # Source: http://faculty.chass.ncsu.edu/garson/PA765/assocordinal.htm
+ def gamma(matrix)
+ v=pairs(matrix)
+ (v['P']-v['Q']).to_f / (v['P']+v['Q']).to_f
+ end
+ # Calculate indexes for a matrix the rows and cols has to be ordered
+ def pairs(matrix)
+ # calculate concordant #p matrix
+ rs=matrix.row_size
+ cs=matrix.column_size
+ conc=disc=ties_x=ties_y=0
+ (0...(rs-1)).each {|x|
+ (0...(cs-1)).each{|y|
+ ((x+1)...rs).each{|x2|
+ ((y+1)...cs).each{|y2|
+ # #p sprintf("%d:%d,%d:%d",x,y,x2,y2)
+ conc+=matrix[x,y]*matrix[x2,y2]
+ }
+ }
+ }
+ }
+ (0...(rs-1)).each {|x|
+ (1...(cs)).each{|y|
+ ((x+1)...rs).each{|x2|
+ (0...y).each{|y2|
+ # #p sprintf("%d:%d,%d:%d",x,y,x2,y2)
+ disc+=matrix[x,y]*matrix[x2,y2]
+ }
+ }
+ }
+ }
+ (0...(rs-1)).each {|x|
+ (0...(cs)).each{|y|
+ ((x+1)...(rs)).each{|x2|
+ ties_x+=matrix[x,y]*matrix[x2,y]
+ }
+ }
+ }
+ (0...rs).each {|x|
+ (0...(cs-1)).each{|y|
+ ((y+1)...(cs)).each{|y2|
+ ties_y+=matrix[x,y]*matrix[x,y2]
+ }
+ }
+ }
+ {'P'=>conc,'Q'=>disc,'Y'=>ties_y,'X'=>ties_x}
+ end
+ def ordered_pairs(vector)
+ d=vector.data
+ a=[]
+ (0...(d.size-1)).each{|i|
+ ((i+1)...(d.size)).each {|j|
+ a.push([d[i],d[j]])
+ }
+ }
+ a
+ end
+ def sum_of_codeviated(v1,v2)
+ v1a,v2a=Statsample.only_valid(v1,v2)
+ sum=0
+ (0...v1a.size).each{|i|
+ sum+=v1a[i]*v2a[i]
+ }
+ sum-((v1a.sum*v2a.sum) / v1a.size.to_f)
+ end
  end
+ end
  end
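As with the Anova file, most of this hunk is re-indentation and block-style cleanup; the substantive addition is the require 'statsample/bivariate/tetrachoric' at the top of the file. A minimal sketch of the module-level Bivariate helpers the reworked file still exposes, using the .to_scale helper shown elsewhere in this diff (the data is hypothetical):

  require 'statsample'
  v1=[1,2,3,4,5,6].to_scale
  v2=[2,3,3,5,7,8].to_scale
  puts Statsample::Bivariate.pearson(v1,v2)   # Pearson r
  puts Statsample::Bivariate.spearman(v1,v2)  # Spearman rho
  puts Statsample::Bivariate.tau_a(v1,v2)     # Kendall tau-a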