statsample 1.5.0 → 2.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.build.sh +15 -0
- data/.gitignore +1 -0
- data/.travis.yml +19 -7
- data/CONTRIBUTING.md +33 -0
- data/History.txt +5 -0
- data/README.md +41 -53
- data/benchmarks/correlation_matrix_15_variables.rb +6 -5
- data/benchmarks/correlation_matrix_5_variables.rb +6 -5
- data/benchmarks/correlation_matrix_methods/correlation_matrix.rb +23 -26
- data/examples/boxplot.rb +17 -5
- data/examples/correlation_matrix.rb +36 -7
- data/examples/dataset.rb +25 -5
- data/examples/dominance_analysis.rb +8 -7
- data/examples/dominance_analysis_bootstrap.rb +16 -11
- data/examples/histogram.rb +16 -2
- data/examples/icc.rb +5 -6
- data/examples/levene.rb +17 -3
- data/examples/multiple_regression.rb +6 -3
- data/examples/parallel_analysis.rb +11 -6
- data/examples/polychoric.rb +26 -13
- data/examples/principal_axis.rb +8 -4
- data/examples/reliability.rb +10 -10
- data/examples/scatterplot.rb +8 -0
- data/examples/t_test.rb +7 -0
- data/examples/u_test.rb +10 -2
- data/examples/vector.rb +9 -6
- data/examples/velicer_map_test.rb +12 -8
- data/lib/statsample.rb +13 -47
- data/lib/statsample/analysis/suite.rb +1 -1
- data/lib/statsample/anova/oneway.rb +6 -6
- data/lib/statsample/anova/twoway.rb +26 -24
- data/lib/statsample/bivariate.rb +78 -61
- data/lib/statsample/bivariate/pearson.rb +2 -2
- data/lib/statsample/codification.rb +45 -32
- data/lib/statsample/converter/csv.rb +15 -53
- data/lib/statsample/converter/spss.rb +6 -5
- data/lib/statsample/converters.rb +50 -211
- data/lib/statsample/crosstab.rb +26 -25
- data/lib/statsample/daru.rb +117 -0
- data/lib/statsample/dataset.rb +70 -942
- data/lib/statsample/dominanceanalysis.rb +16 -17
- data/lib/statsample/dominanceanalysis/bootstrap.rb +26 -28
- data/lib/statsample/factor/parallelanalysis.rb +17 -19
- data/lib/statsample/factor/pca.rb +21 -20
- data/lib/statsample/factor/principalaxis.rb +3 -3
- data/lib/statsample/graph/boxplot.rb +8 -16
- data/lib/statsample/graph/histogram.rb +4 -4
- data/lib/statsample/graph/scatterplot.rb +8 -7
- data/lib/statsample/histogram.rb +128 -119
- data/lib/statsample/matrix.rb +20 -16
- data/lib/statsample/multiset.rb +39 -38
- data/lib/statsample/regression.rb +3 -3
- data/lib/statsample/regression/multiple.rb +8 -10
- data/lib/statsample/regression/multiple/alglibengine.rb +96 -89
- data/lib/statsample/regression/multiple/baseengine.rb +32 -32
- data/lib/statsample/regression/multiple/gslengine.rb +33 -36
- data/lib/statsample/regression/multiple/matrixengine.rb +7 -9
- data/lib/statsample/regression/multiple/rubyengine.rb +39 -41
- data/lib/statsample/reliability.rb +23 -25
- data/lib/statsample/reliability/icc.rb +8 -7
- data/lib/statsample/reliability/multiscaleanalysis.rb +14 -12
- data/lib/statsample/reliability/scaleanalysis.rb +58 -60
- data/lib/statsample/reliability/skillscaleanalysis.rb +34 -29
- data/lib/statsample/resample.rb +1 -1
- data/lib/statsample/shorthand.rb +29 -25
- data/lib/statsample/test/kolmogorovsmirnov.rb +5 -3
- data/lib/statsample/test/levene.rb +28 -27
- data/lib/statsample/test/t.rb +7 -9
- data/lib/statsample/test/umannwhitney.rb +28 -28
- data/lib/statsample/test/wilcoxonsignedrank.rb +45 -43
- data/lib/statsample/vector.rb +70 -1013
- data/lib/statsample/version.rb +1 -1
- data/statsample.gemspec +12 -16
- data/test/helpers_tests.rb +1 -1
- data/test/test_analysis.rb +17 -17
- data/test/test_anova_contrast.rb +6 -6
- data/test/test_anovatwowaywithdataset.rb +8 -8
- data/test/test_anovawithvectors.rb +8 -8
- data/test/test_awesome_print_bug.rb +1 -1
- data/test/test_bartlettsphericity.rb +4 -4
- data/test/test_bivariate.rb +48 -43
- data/test/test_codification.rb +33 -33
- data/test/test_crosstab.rb +9 -9
- data/test/test_dataset.rb +28 -458
- data/test/test_factor.rb +46 -38
- data/test/test_factor_pa.rb +22 -13
- data/test/test_ggobi.rb +4 -4
- data/test/test_gsl.rb +4 -4
- data/test/test_histogram.rb +3 -3
- data/test/test_matrix.rb +13 -13
- data/test/test_multiset.rb +103 -91
- data/test/test_regression.rb +57 -52
- data/test/test_reliability.rb +55 -45
- data/test/test_reliability_icc.rb +8 -8
- data/test/test_reliability_skillscale.rb +26 -24
- data/test/test_resample.rb +1 -1
- data/test/test_statistics.rb +3 -13
- data/test/test_stest.rb +9 -9
- data/test/test_stratified.rb +3 -3
- data/test/test_test_t.rb +12 -12
- data/test/test_umannwhitney.rb +2 -2
- data/test/test_vector.rb +76 -613
- data/test/test_wilcoxonsignedrank.rb +4 -4
- metadata +57 -28
- data/lib/statsample/rserve_extension.rb +0 -20
- data/lib/statsample/vector/gsl.rb +0 -106
- data/test/fixtures/repeated_fields.csv +0 -7
- data/test/fixtures/scientific_notation.csv +0 -4
- data/test/fixtures/test_csv.csv +0 -7
- data/test/fixtures/test_xls.xls +0 -0
- data/test/test_csv.rb +0 -63
- data/test/test_rserve_extension.rb +0 -42
- data/test/test_xls.rb +0 -52
@@ -1,5 +1,8 @@
|
|
1
1
|
#!/usr/bin/ruby
|
2
2
|
$:.unshift(File.dirname(__FILE__)+'/../lib/')
|
3
|
+
# == Description
|
4
|
+
#
|
5
|
+
# Velicer MAP test.
|
3
6
|
|
4
7
|
require 'statsample'
|
5
8
|
|
@@ -15,17 +18,18 @@ Statsample::Analysis.store(Statsample::Factor::MAP) do
|
|
15
18
|
vectors={}
|
16
19
|
|
17
20
|
variables.times do |i|
|
18
|
-
vectors["v#{i}"]=
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
21
|
+
vectors["v#{i}".to_sym]= Daru::Vector.new(
|
22
|
+
samples.times.collect do |nv|
|
23
|
+
if i<5
|
24
|
+
f1[nv]*5 + f2[nv] *2 +rng.call
|
25
|
+
else
|
26
|
+
f1[nv]*2 + f2[nv] *3 +rng.call
|
27
|
+
end
|
28
|
+
end)
|
25
29
|
end
|
26
30
|
|
27
31
|
|
28
|
-
ds=vectors
|
32
|
+
ds = Daru::DataFrame.new(vectors)
|
29
33
|
cor=cor(ds)
|
30
34
|
pca=pca(cor)
|
31
35
|
|
data/lib/statsample.rb
CHANGED
@@ -22,6 +22,8 @@ require 'extendmatrix'
|
|
22
22
|
require 'distribution'
|
23
23
|
require 'dirty-memoize'
|
24
24
|
require 'reportbuilder'
|
25
|
+
require 'daru'
|
26
|
+
require 'statsample/daru'
|
25
27
|
|
26
28
|
class Numeric
|
27
29
|
def square
|
@@ -52,42 +54,6 @@ class Module
|
|
52
54
|
end
|
53
55
|
|
54
56
|
class Array
|
55
|
-
# Recode repeated values on an array, adding the number of repetition
|
56
|
-
# at the end
|
57
|
-
# Example:
|
58
|
-
# a=%w{a b c c d d d e}
|
59
|
-
# a.recode_repeated
|
60
|
-
# => ["a","b","c_1","c_2","d_1","d_2","d_3","e"]
|
61
|
-
def recode_repeated
|
62
|
-
if size != uniq.size
|
63
|
-
# Find repeated
|
64
|
-
repeated = inject({}) do |acc, v|
|
65
|
-
if acc[v].nil?
|
66
|
-
acc[v] = 1
|
67
|
-
else
|
68
|
-
acc[v] += 1
|
69
|
-
end
|
70
|
-
acc
|
71
|
-
end.select { |_k, v| v > 1 }.keys
|
72
|
-
|
73
|
-
ns = repeated.inject({}) do |acc, v|
|
74
|
-
acc[v] = 0
|
75
|
-
acc
|
76
|
-
end
|
77
|
-
|
78
|
-
collect do |f|
|
79
|
-
if repeated.include? f
|
80
|
-
ns[f] += 1
|
81
|
-
sprintf('%s_%d', f, ns[f])
|
82
|
-
else
|
83
|
-
f
|
84
|
-
end
|
85
|
-
end
|
86
|
-
else
|
87
|
-
self
|
88
|
-
end
|
89
|
-
end
|
90
|
-
|
91
57
|
def sum
|
92
58
|
inject(:+)
|
93
59
|
end
|
@@ -218,7 +184,7 @@ module Statsample
|
|
218
184
|
size = vs[0].size
|
219
185
|
|
220
186
|
vs.each do |v|
|
221
|
-
fail ArgumentError, 'Arguments should be Vector' unless v.instance_of?
|
187
|
+
fail ArgumentError, 'Arguments should be Vector' unless v.instance_of? Daru::Vector
|
222
188
|
fail ArgumentError, 'Vectors size should be the same' if v.size != size
|
223
189
|
end
|
224
190
|
|
@@ -228,26 +194,26 @@ module Statsample
|
|
228
194
|
# Returns a duplicate of the input vectors, without missing data
|
229
195
|
# for any of the vectors.
|
230
196
|
#
|
231
|
-
# a=[1,2,3,6,7,nil,3,5]
|
232
|
-
# b=[nil,nil,5,6,4,5,10,2]
|
233
|
-
# c=[2,4,6,7,4,5,6,7]
|
197
|
+
# a = Daru::Vector.new([1,2,3,6,7,nil,3,5])
|
198
|
+
# b = Daru::Vector.new([nil,nil,5,6,4,5,10,2])
|
199
|
+
# c = Daru::Vector.new([2,4,6,7,4,5,6,7])
|
234
200
|
# a2,b2,c2=Statsample.only_valid(a,b,c)
|
235
|
-
# => [#<
|
236
|
-
# #<
|
237
|
-
# #<
|
201
|
+
# => [#<Daru::Vector:0xb748c8c8 @data=[3, 6, 7, 3, 5]>,
|
202
|
+
# #<Daru::Vector:0xb748c814 @data=[5, 6, 4, 10, 2]>,
|
203
|
+
# #<Daru::Vector:0xb748c760 @data=[6, 7, 4, 6, 7]>]
|
238
204
|
#
|
239
205
|
def only_valid(*vs)
|
240
206
|
i = 1
|
241
|
-
h = vs.inject({}) { |acc, v| acc["v#{i}"] = v; i += 1; acc }
|
242
|
-
|
243
|
-
|
207
|
+
h = vs.inject({}) { |acc, v| acc["v#{i}".to_sym] = v; i += 1; acc }
|
208
|
+
df = Daru::DataFrame.new(h).dup_only_valid
|
209
|
+
df.map { |v| v }
|
244
210
|
end
|
245
211
|
|
246
212
|
# Cheap version of #only_valid.
|
247
213
|
# If any vectors have missing_values, return only valid.
|
248
214
|
# If not, return the vectors itself
|
249
215
|
def only_valid_clone(*vs)
|
250
|
-
if vs.any?(&:
|
216
|
+
if vs.any?(&:has_missing_data?)
|
251
217
|
only_valid(*vs)
|
252
218
|
else
|
253
219
|
vs
|
@@ -80,7 +80,7 @@ module Statsample
|
|
80
80
|
|
81
81
|
def method_missing(name, *args,&block)
|
82
82
|
@attached.reverse.each do |ds|
|
83
|
-
return ds[name
|
83
|
+
return ds[name] if ds.vectors.to_a.include? (name)
|
84
84
|
end
|
85
85
|
raise "Method #{name} doesn't exists"
|
86
86
|
end
|
@@ -67,9 +67,9 @@ module Statsample
|
|
67
67
|
|
68
68
|
# One Way Anova with vectors
|
69
69
|
# Example:
|
70
|
-
# v1=[2,3,4,5,6]
|
71
|
-
# v2=[3,3,4,5,6]
|
72
|
-
# v3=[5,3,1,5,6]
|
70
|
+
# v1 = Daru::Vector.new([2,3,4,5,6])
|
71
|
+
# v2 = Daru::Vector.new([3,3,4,5,6])
|
72
|
+
# v3 = Daru::Vector.new([5,3,1,5,6])
|
73
73
|
# anova=Statsample::Anova::OneWayWithVectors.new([v1,v2,v3])
|
74
74
|
# anova.f
|
75
75
|
# => 0.0243902439024391
|
@@ -90,10 +90,10 @@ module Statsample
|
|
90
90
|
|
91
91
|
def initialize(*args)
|
92
92
|
if args[0].is_a? Array
|
93
|
-
@vectors=args.shift
|
93
|
+
@vectors = args.shift
|
94
94
|
else
|
95
|
-
@vectors=args.find_all {|v| v.is_a?
|
96
|
-
opts=args.find {|v| v.is_a? Hash}
|
95
|
+
@vectors = args.find_all {|v| v.is_a? Daru::Vector}
|
96
|
+
opts = args.find {|v| v.is_a? Hash}
|
97
97
|
end
|
98
98
|
opts||=Hash.new
|
99
99
|
opts_default={:name=>_("Anova One-Way"),
|
@@ -107,9 +107,9 @@ module Statsample
|
|
107
107
|
|
108
108
|
# Two Way Anova with vectors
|
109
109
|
# Example:
|
110
|
-
# v1=[1,1,2,2]
|
111
|
-
# v2=[1,2,1,2]
|
112
|
-
# v3=[5,3,1,5]
|
110
|
+
# v1 = Daru::Vector.new([1,1,2,2])
|
111
|
+
# v2 = Daru::Vector.new([1,2,1,2])
|
112
|
+
# v3 = Daru::Vector.new([5,3,1,5])
|
113
113
|
# anova=Statsample::Anova::TwoWayWithVectors.new(:a=>v1,:b=>v2, :dependent=>v3)
|
114
114
|
#
|
115
115
|
class TwoWayWithVectors < TwoWay
|
@@ -121,25 +121,26 @@ module Statsample
|
|
121
121
|
# For now, only equal sample cells allowed
|
122
122
|
def initialize(opts=Hash.new)
|
123
123
|
raise "You should insert at least :a, :b and :dependent" unless [:a, :b, :dependent].all? {|v| opts.has_key? v}
|
124
|
-
@a_var=
|
125
|
-
@b_var=
|
126
|
-
@dep_var=
|
127
|
-
@a_vector, @b_vector, @dep_vector=
|
124
|
+
@a_var = :a
|
125
|
+
@b_var = :b
|
126
|
+
@dep_var = :dependent
|
127
|
+
@a_vector, @b_vector, @dep_vector =
|
128
|
+
Statsample.only_valid_clone opts[:a], opts[:b], opts[:dependent]
|
128
129
|
|
129
|
-
ds={@a_var=>@a_vector, @b_var=>@b_vector, @dep_var=>@dep_vector}
|
130
|
-
@ds=ds.clone_only_valid
|
131
|
-
_p
|
132
|
-
_q
|
133
|
-
@x_general
|
134
|
-
@axb_means={}
|
135
|
-
@axb_sd={}
|
136
|
-
@vectors=[]
|
130
|
+
ds = Daru::DataFrame.new({@a_var=>@a_vector, @b_var=>@b_vector, @dep_var=>@dep_vector})
|
131
|
+
@ds = ds.clone_only_valid
|
132
|
+
_p = @a_vector.factors.size
|
133
|
+
_q = @b_vector.factors.size
|
134
|
+
@x_general = @dep_vector.mean
|
135
|
+
@axb_means = {}
|
136
|
+
@axb_sd = {}
|
137
|
+
@vectors = []
|
137
138
|
n=nil
|
138
139
|
@ds.to_multiset_by_split(a_var,b_var).each_vector(dep_var) {|k,v|
|
139
|
-
@axb_means[k]=v.mean
|
140
|
-
@axb_sd[k]=v.sd
|
140
|
+
@axb_means[k] = v.mean
|
141
|
+
@axb_sd[k] = v.sd
|
141
142
|
@vectors << v
|
142
|
-
n||=v.size
|
143
|
+
n ||= v.size
|
143
144
|
raise "All cell sizes should be equal" if n!=v.size
|
144
145
|
}
|
145
146
|
|
@@ -151,20 +152,21 @@ module Statsample
|
|
151
152
|
@ds.to_multiset_by_split(b_var).each_vector(dep_var) {|k,v|
|
152
153
|
@b_means[k]=v.mean
|
153
154
|
}
|
154
|
-
ss_a=n*_q*@ds[a_var].factors.inject(0) {|ac,v|
|
155
|
-
ac+(@a_means[v]-@x_general)**2
|
155
|
+
ss_a = n*_q*@ds[a_var].factors.inject(0) {|ac,v|
|
156
|
+
ac + (@a_means[v]-@x_general)**2
|
156
157
|
}
|
157
158
|
ss_b=n*_p*@ds[b_var].factors.inject(0) {|ac,v|
|
158
159
|
ac+(@b_means[v]-@x_general)**2
|
159
160
|
}
|
160
|
-
ss_within
|
161
|
+
ss_within = @ds.collect(:row) { |row|
|
161
162
|
(row[dep_var]-@axb_means[[row[a_var],row[b_var]]])**2
|
162
163
|
}.sum
|
163
|
-
ss_axb=n*@axb_means.inject(0) {|ac,v|
|
164
|
+
ss_axb = n*@axb_means.inject(0) {|ac,v|
|
164
165
|
j,k=v[0]
|
165
166
|
xjk=v[1]
|
166
167
|
ac+(xjk-@a_means[j]-@b_means[k]+@x_general)**2
|
167
168
|
}
|
169
|
+
|
168
170
|
df_a=_p-1
|
169
171
|
df_b=_q-1
|
170
172
|
df_within=(_p*_q)*(n-1)
|
@@ -186,9 +188,9 @@ module Statsample
|
|
186
188
|
def report_building(builder) #:nodoc:#
|
187
189
|
builder.section(:name=>@name) do |s|
|
188
190
|
if summary_descriptives
|
189
|
-
s.table(:header =>['']+@ds[a_var].factors.map {|a| @ds[a_var].
|
191
|
+
s.table(:header =>['']+@ds[a_var].factors.map {|a| @ds[a_var].index_of(a)}+[_("%s Mean") % @name_b]) do |t|
|
190
192
|
@ds[b_var].factors.each do |b|
|
191
|
-
t.row([@ds[b_var].
|
193
|
+
t.row([@ds[b_var].index_of(b)]+@ds[a_var].factors.map {|a| "%0.3f" % @axb_means[[a,b]] } + ["%0.3f" % @b_means[b]])
|
192
194
|
end
|
193
195
|
t.row([_("%s Mean") % @name_a]+@ds[a_var].factors.map {|a| "%0.3f" % @a_means[a]}+ ["%0.3f" % @x_general])
|
194
196
|
end
|
data/lib/statsample/bivariate.rb
CHANGED
@@ -12,9 +12,10 @@ module Statsample
|
|
12
12
|
# Covariance between two vectors
|
13
13
|
def covariance(v1,v2)
|
14
14
|
v1a,v2a=Statsample.only_valid_clone(v1,v2)
|
15
|
+
|
15
16
|
return nil if v1a.size==0
|
16
17
|
if Statsample.has_gsl?
|
17
|
-
GSL::Stats::covariance(v1a.
|
18
|
+
GSL::Stats::covariance(v1a.to_gsl, v2a.to_gsl)
|
18
19
|
else
|
19
20
|
covariance_slow(v1a,v2a)
|
20
21
|
end
|
@@ -34,7 +35,9 @@ module Statsample
|
|
34
35
|
sum_of_squares(v1a,v2a) / (v1a.size-1)
|
35
36
|
end
|
36
37
|
def sum_of_squares(v1,v2)
|
37
|
-
v1a,v2a=Statsample.only_valid_clone(v1,v2)
|
38
|
+
v1a,v2a=Statsample.only_valid_clone(v1,v2)
|
39
|
+
v1a.reset_index!
|
40
|
+
v2a.reset_index!
|
38
41
|
m1=v1a.mean
|
39
42
|
m2=v2a.mean
|
40
43
|
(v1a.size).times.inject(0) {|ac,i| ac+(v1a[i]-m1)*(v2a[i]-m2)}
|
@@ -44,13 +47,14 @@ module Statsample
|
|
44
47
|
v1a,v2a=Statsample.only_valid_clone(v1,v2)
|
45
48
|
return nil if v1a.size ==0
|
46
49
|
if Statsample.has_gsl?
|
47
|
-
GSL::Stats::correlation(v1a.
|
50
|
+
GSL::Stats::correlation(v1a.to_gsl, v2a.to_gsl)
|
48
51
|
else
|
49
52
|
pearson_slow(v1a,v2a)
|
50
53
|
end
|
51
54
|
end
|
52
55
|
def pearson_slow(v1,v2) # :nodoc:
|
53
56
|
v1a,v2a=Statsample.only_valid_clone(v1,v2)
|
57
|
+
|
54
58
|
# Calculate sum of squares
|
55
59
|
ss=sum_of_squares(v1a,v2a)
|
56
60
|
ss.quo(Math::sqrt(v1a.sum_of_squares) * Math::sqrt(v2a.sum_of_squares))
|
@@ -118,14 +122,16 @@ module Statsample
|
|
118
122
|
r=Statsample::Bivariate.pearson(from,del)
|
119
123
|
froms, dels = from.vector_standarized, del.vector_standarized
|
120
124
|
nv=[]
|
121
|
-
froms.
|
125
|
+
froms.reset_index!
|
126
|
+
dels.reset_index!
|
127
|
+
froms.each_index do |i|
|
122
128
|
if froms[i].nil? or dels[i].nil?
|
123
129
|
nv.push(nil)
|
124
130
|
else
|
125
131
|
nv.push(froms[i]-r*dels[i])
|
126
132
|
end
|
127
133
|
end
|
128
|
-
|
134
|
+
Daru::Vector.new(nv)
|
129
135
|
end
|
130
136
|
# Correlation between v1 and v2, controling the effect of
|
131
137
|
# control on both.
|
@@ -135,7 +141,6 @@ module Statsample
|
|
135
141
|
rv1con=pearson(v1a,cona)
|
136
142
|
rv2con=pearson(v2a,cona)
|
137
143
|
(rv1v2-(rv1con*rv2con)).quo(Math::sqrt(1-rv1con**2) * Math::sqrt(1-rv2con**2))
|
138
|
-
|
139
144
|
end
|
140
145
|
|
141
146
|
def covariance_matrix_optimized(ds)
|
@@ -153,50 +158,53 @@ module Statsample
|
|
153
158
|
# Order of rows and columns depends on Dataset#fields order
|
154
159
|
|
155
160
|
def covariance_matrix(ds)
|
156
|
-
vars,cases=ds.
|
161
|
+
vars,cases = ds.ncols, ds.nrows
|
157
162
|
if !ds.has_missing_data? and Statsample.has_gsl? and prediction_optimized(vars,cases) < prediction_pairwise(vars,cases)
|
158
163
|
cm=covariance_matrix_optimized(ds)
|
159
164
|
else
|
160
165
|
cm=covariance_matrix_pairwise(ds)
|
161
|
-
|
162
166
|
end
|
163
167
|
cm.extend(Statsample::CovariateMatrix)
|
164
|
-
cm.fields=ds.
|
168
|
+
cm.fields = ds.vectors.to_a
|
165
169
|
cm
|
166
170
|
end
|
167
171
|
|
168
172
|
|
169
173
|
def covariance_matrix_pairwise(ds)
|
170
174
|
cache={}
|
171
|
-
|
172
|
-
|
173
|
-
|
174
|
-
|
175
|
-
|
176
|
-
|
177
|
-
|
178
|
-
cov=covariance(ds[row],ds[col])
|
179
|
-
cache[[row,col]]=cov
|
180
|
-
cov
|
175
|
+
vectors = ds.vectors.to_a
|
176
|
+
mat_rows = vectors.collect do |row|
|
177
|
+
vectors.collect do |col|
|
178
|
+
if (ds[row].type!=:numeric or ds[col].type!=:numeric)
|
179
|
+
nil
|
180
|
+
elsif row==col
|
181
|
+
ds[row].variance
|
181
182
|
else
|
182
|
-
|
183
|
+
if cache[[col,row]].nil?
|
184
|
+
cov=covariance(ds[row],ds[col])
|
185
|
+
cache[[row,col]]=cov
|
186
|
+
cov
|
187
|
+
else
|
188
|
+
cache[[col,row]]
|
189
|
+
end
|
183
190
|
end
|
184
191
|
end
|
185
192
|
end
|
186
|
-
|
193
|
+
|
194
|
+
Matrix.rows mat_rows
|
187
195
|
end
|
188
196
|
|
189
197
|
# Correlation matrix.
|
190
198
|
# Order of rows and columns depends on Dataset#fields order
|
191
199
|
def correlation_matrix(ds)
|
192
|
-
vars,cases=ds.
|
200
|
+
vars, cases = ds.ncols, ds.nrows
|
193
201
|
if !ds.has_missing_data? and Statsample.has_gsl? and prediction_optimized(vars,cases) < prediction_pairwise(vars,cases)
|
194
202
|
cm=correlation_matrix_optimized(ds)
|
195
203
|
else
|
196
204
|
cm=correlation_matrix_pairwise(ds)
|
197
205
|
end
|
198
206
|
cm.extend(Statsample::CovariateMatrix)
|
199
|
-
cm.fields=ds.
|
207
|
+
cm.fields = ds.vectors.to_a
|
200
208
|
cm
|
201
209
|
end
|
202
210
|
|
@@ -212,33 +220,43 @@ module Statsample
|
|
212
220
|
end
|
213
221
|
def correlation_matrix_pairwise(ds)
|
214
222
|
cache={}
|
215
|
-
|
216
|
-
|
217
|
-
|
218
|
-
|
219
|
-
|
220
|
-
|
221
|
-
|
222
|
-
r=pearson(ds[row],ds[col])
|
223
|
-
cache[[row,col]]=r
|
224
|
-
r
|
223
|
+
vectors = ds.vectors.to_a
|
224
|
+
cm = vectors.collect do |row|
|
225
|
+
vectors.collect do |col|
|
226
|
+
if row==col
|
227
|
+
1.0
|
228
|
+
elsif (ds[row].type!=:numeric or ds[col].type!=:numeric)
|
229
|
+
nil
|
225
230
|
else
|
226
|
-
cache[[col,row]]
|
227
|
-
|
231
|
+
if cache[[col,row]].nil?
|
232
|
+
r=pearson(ds[row],ds[col])
|
233
|
+
cache[[row,col]]=r
|
234
|
+
r
|
235
|
+
else
|
236
|
+
cache[[col,row]]
|
237
|
+
end
|
238
|
+
end
|
228
239
|
end
|
229
240
|
end
|
241
|
+
|
242
|
+
Matrix.rows cm
|
230
243
|
end
|
231
244
|
|
232
245
|
# Retrieves the n valid pairwise.
|
233
246
|
def n_valid_matrix(ds)
|
234
|
-
ds.
|
235
|
-
|
236
|
-
|
237
|
-
|
238
|
-
|
239
|
-
|
247
|
+
vectors = ds.vectors.to_a
|
248
|
+
m = vectors.collect do |row|
|
249
|
+
vectors.collect do |col|
|
250
|
+
if row==col
|
251
|
+
ds[row].only_valid.size
|
252
|
+
else
|
253
|
+
rowa,rowb = Statsample.only_valid_clone(ds[row],ds[col])
|
254
|
+
rowa.size
|
255
|
+
end
|
240
256
|
end
|
241
257
|
end
|
258
|
+
|
259
|
+
Matrix.rows m
|
242
260
|
end
|
243
261
|
|
244
262
|
# Matrix of correlation probabilities.
|
@@ -256,27 +274,27 @@ module Statsample
|
|
256
274
|
|
257
275
|
# Spearman ranked correlation coefficient (rho) between 2 vectors
|
258
276
|
def spearman(v1,v2)
|
259
|
-
v1a,v2a=Statsample.only_valid_clone(v1,v2)
|
260
|
-
v1r,v2r=v1a.ranked
|
277
|
+
v1a,v2a = Statsample.only_valid_clone(v1,v2)
|
278
|
+
v1r,v2r = v1a.ranked, v2a.ranked
|
261
279
|
pearson(v1r,v2r)
|
262
280
|
end
|
263
281
|
# Calculate Point biserial correlation. Equal to Pearson correlation, with
|
264
282
|
# one dichotomous value replaced by "0" and the other by "1"
|
265
283
|
def point_biserial(dichotomous,continous)
|
266
|
-
ds={
|
267
|
-
raise(TypeError, "First vector should be dichotomous") if ds[
|
268
|
-
raise(TypeError, "Second vector should be continous") if ds[
|
269
|
-
f0=ds[
|
270
|
-
m0=ds.
|
271
|
-
m1=ds.
|
272
|
-
((m1.mean-m0.mean).to_f / ds[
|
284
|
+
ds = Daru::DataFrame.new({:d=>dichotomous,:c=>continous}).dup_only_valid
|
285
|
+
raise(TypeError, "First vector should be dichotomous") if ds[:d].factors.size != 2
|
286
|
+
raise(TypeError, "Second vector should be continous") if ds[:c].type != :numeric
|
287
|
+
f0=ds[:d].factors.sort.to_a[0]
|
288
|
+
m0=ds.filter_vector(:c) {|c| c[:d] == f0}
|
289
|
+
m1=ds.filter_vector(:c) {|c| c[:d] != f0}
|
290
|
+
((m1.mean-m0.mean).to_f / ds[:c].sdp) * Math::sqrt(m0.size*m1.size.to_f / ds.nrows**2)
|
273
291
|
end
|
274
292
|
# Kendall Rank Correlation Coefficient (Tau a)
|
275
293
|
# Based on Hervé Adbi article
|
276
294
|
def tau_a(v1,v2)
|
277
295
|
v1a,v2a=Statsample.only_valid_clone(v1,v2)
|
278
296
|
n=v1.size
|
279
|
-
v1r,v2r=v1a.ranked
|
297
|
+
v1r,v2r=v1a.ranked,v2a.ranked
|
280
298
|
o1=ordered_pairs(v1r)
|
281
299
|
o2=ordered_pairs(v2r)
|
282
300
|
delta= o1.size*2-(o2 & o1).size*2
|
@@ -348,14 +366,15 @@ module Statsample
|
|
348
366
|
}
|
349
367
|
{'P'=>conc,'Q'=>disc,'Y'=>ties_y,'X'=>ties_x}
|
350
368
|
end
|
369
|
+
|
351
370
|
def ordered_pairs(vector)
|
352
|
-
d=vector.
|
353
|
-
a=[]
|
354
|
-
(0...(d.size-1)).each
|
355
|
-
((i+1)...(d.size)).each
|
371
|
+
d = vector.to_a
|
372
|
+
a = []
|
373
|
+
(0...(d.size-1)).each do |i|
|
374
|
+
((i+1)...(d.size)).each do |j|
|
356
375
|
a.push([d[i],d[j]])
|
357
|
-
|
358
|
-
|
376
|
+
end
|
377
|
+
end
|
359
378
|
a
|
360
379
|
end
|
361
380
|
=begin
|
@@ -371,8 +390,8 @@ module Statsample
|
|
371
390
|
# Report the minimum number of cases valid of a covariate matrix
|
372
391
|
# based on a dataset
|
373
392
|
def min_n_valid(ds)
|
374
|
-
min=ds.
|
375
|
-
m=n_valid_matrix(ds)
|
393
|
+
min = ds.nrows
|
394
|
+
m = n_valid_matrix(ds)
|
376
395
|
for x in 0...m.row_size
|
377
396
|
for y in 0...m.column_size
|
378
397
|
min=m[x,y] if m[x,y] < min
|
@@ -380,8 +399,6 @@ module Statsample
|
|
380
399
|
end
|
381
400
|
min
|
382
401
|
end
|
383
|
-
|
384
|
-
|
385
402
|
end
|
386
403
|
end
|
387
404
|
end
|