statsample-ekatena 2.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +15 -0
- data/.travis.yml +23 -0
- data/CONTRIBUTING.md +17 -0
- data/Gemfile +2 -0
- data/History.txt +457 -0
- data/LICENSE.txt +12 -0
- data/README.md +175 -0
- data/Rakefile +44 -0
- data/benchmarks/correlation_matrix_15_variables.rb +32 -0
- data/benchmarks/correlation_matrix_5_variables.rb +33 -0
- data/benchmarks/correlation_matrix_methods/correlation_matrix.ds +0 -0
- data/benchmarks/correlation_matrix_methods/correlation_matrix.html +93 -0
- data/benchmarks/correlation_matrix_methods/correlation_matrix.rb +71 -0
- data/benchmarks/correlation_matrix_methods/correlation_matrix.xls +0 -0
- data/benchmarks/correlation_matrix_methods/correlation_matrix_gsl_ruby.ods +0 -0
- data/benchmarks/correlation_matrix_methods/correlation_matrix_with_graphics.ods +0 -0
- data/benchmarks/correlation_matrix_methods/results.ds +0 -0
- data/benchmarks/factor_map.rb +37 -0
- data/benchmarks/helpers_benchmark.rb +5 -0
- data/data/locale/es/LC_MESSAGES/statsample.mo +0 -0
- data/doc_latex/manual/equations.tex +78 -0
- data/examples/boxplot.rb +28 -0
- data/examples/chisquare_test.rb +23 -0
- data/examples/correlation_matrix.rb +32 -0
- data/examples/dataset.rb +30 -0
- data/examples/dominance_analysis.rb +33 -0
- data/examples/dominance_analysis_bootstrap.rb +32 -0
- data/examples/histogram.rb +26 -0
- data/examples/icc.rb +24 -0
- data/examples/levene.rb +29 -0
- data/examples/multiple_regression.rb +20 -0
- data/examples/multivariate_correlation.rb +33 -0
- data/examples/parallel_analysis.rb +40 -0
- data/examples/polychoric.rb +40 -0
- data/examples/principal_axis.rb +26 -0
- data/examples/reliability.rb +31 -0
- data/examples/scatterplot.rb +25 -0
- data/examples/t_test.rb +27 -0
- data/examples/tetrachoric.rb +17 -0
- data/examples/u_test.rb +24 -0
- data/examples/vector.rb +20 -0
- data/examples/velicer_map_test.rb +46 -0
- data/grab_references.rb +29 -0
- data/lib/spss.rb +134 -0
- data/lib/statsample-ekatena/analysis.rb +100 -0
- data/lib/statsample-ekatena/analysis/suite.rb +89 -0
- data/lib/statsample-ekatena/analysis/suitereportbuilder.rb +44 -0
- data/lib/statsample-ekatena/anova.rb +24 -0
- data/lib/statsample-ekatena/anova/contrast.rb +79 -0
- data/lib/statsample-ekatena/anova/oneway.rb +187 -0
- data/lib/statsample-ekatena/anova/twoway.rb +207 -0
- data/lib/statsample-ekatena/bivariate.rb +406 -0
- data/lib/statsample-ekatena/bivariate/pearson.rb +54 -0
- data/lib/statsample-ekatena/codification.rb +182 -0
- data/lib/statsample-ekatena/converter/csv.rb +28 -0
- data/lib/statsample-ekatena/converter/spss.rb +48 -0
- data/lib/statsample-ekatena/converters.rb +211 -0
- data/lib/statsample-ekatena/crosstab.rb +188 -0
- data/lib/statsample-ekatena/daru.rb +115 -0
- data/lib/statsample-ekatena/dataset.rb +10 -0
- data/lib/statsample-ekatena/dominanceanalysis.rb +425 -0
- data/lib/statsample-ekatena/dominanceanalysis/bootstrap.rb +232 -0
- data/lib/statsample-ekatena/factor.rb +104 -0
- data/lib/statsample-ekatena/factor/map.rb +124 -0
- data/lib/statsample-ekatena/factor/parallelanalysis.rb +166 -0
- data/lib/statsample-ekatena/factor/pca.rb +242 -0
- data/lib/statsample-ekatena/factor/principalaxis.rb +243 -0
- data/lib/statsample-ekatena/factor/rotation.rb +198 -0
- data/lib/statsample-ekatena/formula/fit_model.rb +46 -0
- data/lib/statsample-ekatena/formula/formula.rb +306 -0
- data/lib/statsample-ekatena/graph.rb +11 -0
- data/lib/statsample-ekatena/graph/boxplot.rb +236 -0
- data/lib/statsample-ekatena/graph/histogram.rb +198 -0
- data/lib/statsample-ekatena/graph/scatterplot.rb +213 -0
- data/lib/statsample-ekatena/histogram.rb +180 -0
- data/lib/statsample-ekatena/matrix.rb +329 -0
- data/lib/statsample-ekatena/multiset.rb +310 -0
- data/lib/statsample-ekatena/regression.rb +65 -0
- data/lib/statsample-ekatena/regression/multiple.rb +89 -0
- data/lib/statsample-ekatena/regression/multiple/alglibengine.rb +128 -0
- data/lib/statsample-ekatena/regression/multiple/baseengine.rb +251 -0
- data/lib/statsample-ekatena/regression/multiple/gslengine.rb +129 -0
- data/lib/statsample-ekatena/regression/multiple/matrixengine.rb +205 -0
- data/lib/statsample-ekatena/regression/multiple/rubyengine.rb +86 -0
- data/lib/statsample-ekatena/regression/simple.rb +121 -0
- data/lib/statsample-ekatena/reliability.rb +150 -0
- data/lib/statsample-ekatena/reliability/icc.rb +415 -0
- data/lib/statsample-ekatena/reliability/multiscaleanalysis.rb +181 -0
- data/lib/statsample-ekatena/reliability/scaleanalysis.rb +233 -0
- data/lib/statsample-ekatena/reliability/skillscaleanalysis.rb +114 -0
- data/lib/statsample-ekatena/resample.rb +15 -0
- data/lib/statsample-ekatena/shorthand.rb +125 -0
- data/lib/statsample-ekatena/srs.rb +169 -0
- data/lib/statsample-ekatena/test.rb +82 -0
- data/lib/statsample-ekatena/test/bartlettsphericity.rb +45 -0
- data/lib/statsample-ekatena/test/chisquare.rb +73 -0
- data/lib/statsample-ekatena/test/f.rb +52 -0
- data/lib/statsample-ekatena/test/kolmogorovsmirnov.rb +63 -0
- data/lib/statsample-ekatena/test/levene.rb +88 -0
- data/lib/statsample-ekatena/test/t.rb +309 -0
- data/lib/statsample-ekatena/test/umannwhitney.rb +208 -0
- data/lib/statsample-ekatena/test/wilcoxonsignedrank.rb +90 -0
- data/lib/statsample-ekatena/vector.rb +19 -0
- data/lib/statsample-ekatena/version.rb +3 -0
- data/lib/statsample.rb +282 -0
- data/po/es/statsample.mo +0 -0
- data/po/es/statsample.po +959 -0
- data/po/statsample.pot +947 -0
- data/references.txt +24 -0
- data/statsample-ekatena.gemspec +49 -0
- data/test/fixtures/bank2.dat +200 -0
- data/test/fixtures/correlation_matrix.rb +17 -0
- data/test/fixtures/df.csv +15 -0
- data/test/fixtures/hartman_23.matrix +9 -0
- data/test/fixtures/stock_data.csv +500 -0
- data/test/fixtures/tetmat_matrix.txt +5 -0
- data/test/fixtures/tetmat_test.txt +1001 -0
- data/test/helpers_tests.rb +83 -0
- data/test/test_analysis.rb +176 -0
- data/test/test_anova_contrast.rb +36 -0
- data/test/test_anovaoneway.rb +26 -0
- data/test/test_anovatwoway.rb +37 -0
- data/test/test_anovatwowaywithdataset.rb +47 -0
- data/test/test_anovawithvectors.rb +102 -0
- data/test/test_awesome_print_bug.rb +16 -0
- data/test/test_bartlettsphericity.rb +25 -0
- data/test/test_bivariate.rb +164 -0
- data/test/test_codification.rb +78 -0
- data/test/test_crosstab.rb +67 -0
- data/test/test_dominance_analysis.rb +39 -0
- data/test/test_factor.rb +228 -0
- data/test/test_factor_map.rb +38 -0
- data/test/test_factor_pa.rb +56 -0
- data/test/test_fit_model.rb +88 -0
- data/test/test_ggobi.rb +35 -0
- data/test/test_gsl.rb +15 -0
- data/test/test_histogram.rb +109 -0
- data/test/test_matrix.rb +48 -0
- data/test/test_multiset.rb +176 -0
- data/test/test_regression.rb +231 -0
- data/test/test_reliability.rb +223 -0
- data/test/test_reliability_icc.rb +198 -0
- data/test/test_reliability_skillscale.rb +57 -0
- data/test/test_resample.rb +24 -0
- data/test/test_srs.rb +9 -0
- data/test/test_statistics.rb +69 -0
- data/test/test_stest.rb +69 -0
- data/test/test_stratified.rb +17 -0
- data/test/test_test_f.rb +33 -0
- data/test/test_test_kolmogorovsmirnov.rb +34 -0
- data/test/test_test_t.rb +62 -0
- data/test/test_umannwhitney.rb +27 -0
- data/test/test_vector.rb +12 -0
- data/test/test_wilcoxonsignedrank.rb +64 -0
- metadata +570 -0
|
@@ -0,0 +1,207 @@
|
|
|
1
|
+
module Statsample
|
|
2
|
+
module Anova
|
|
3
|
+
# = Generic Anova two-way.
|
|
4
|
+
# You could enter the sum of squares or the mean squares for a, b, axb and within.
|
|
5
|
+
# You should enter the degrees of freedom for a,b and within, because df_axb=df_a*df_b
|
|
6
|
+
# == Usage
|
|
7
|
+
# anova=Statsample::Anova::TwoWay.new(:ss_a=>10, :ss_b=>20, :ss_axb=>10, :ss_within=>20, :df_a=>2, :df_b=>3, :df_within=>100, :name=>"ANOVA for....")
|
|
8
|
+
class TwoWay
  include Summarizable
  # Degrees of freedom for factor a, factor b, the a x b interaction,
  # the within term, and the sum of the four.
  attr_reader :df_a, :df_b, :df_axb, :df_within, :df_total
  # Sums of squares for a, b, a x b, within, and the sum of the four.
  attr_reader :ss_a, :ss_b, :ss_axb, :ss_within, :ss_total
  # Mean squares for a, b, a x b and within. ms_total is the plain sum
  # of those four mean squares.
  attr_reader :ms_a, :ms_b, :ms_axb, :ms_within, :ms_total
  # Name of ANOVA Analysis
  attr_accessor :name
  # Name of a factor
  attr_accessor :name_a
  # Name of b factor
  attr_accessor :name_b
  # Name of within factor
  attr_accessor :name_within

  # F test objects (Statsample::Test::F) for a, b and the a x b interaction,
  # each one built against the within mean square.
  attr_reader :f_a_object, :f_b_object, :f_axb_object

  # Builds the analysis from summary statistics.
  #
  # +opts+ must contain :df_a, :df_b and :df_within, plus either the four
  # sums of squares (:ss_a, :ss_b, :ss_axb, :ss_within) or the four mean
  # squares (:ms_a, :ms_b, :ms_axb, :ms_within). Optional keys: :name,
  # :name_a, :name_b and :name_within.
  #
  # Raises ArgumentError when a d.f. is missing and RuntimeError when
  # neither a complete ss set nor a complete ms set is given.
  def initialize(opts=Hash.new)
    raise ArgumentError, "You should set all d.f." unless [:df_a, :df_b, :df_within].all? {|v| opts.has_key? v}

    # The keys consumed below are deleted from a copy, so the hash
    # handed in by the caller is left untouched.
    opts = opts.dup

    @df_a = opts.delete :df_a
    @df_b = opts.delete :df_b
    @df_axb = @df_a * @df_b
    @df_within = opts.delete :df_within
    @df_total = @df_a + @df_b + @df_axb + @df_within

    # First see if sum of squares or mean squares are entered
    if [:ss_a, :ss_b, :ss_axb, :ss_within].all? {|v| opts.has_key? v}
      @ss_a = opts.delete :ss_a
      @ss_b = opts.delete :ss_b
      @ss_axb = opts.delete :ss_axb
      @ss_within = opts.delete :ss_within

      @ms_a = @ss_a.quo(@df_a)
      @ms_b = @ss_b.quo(@df_b)
      @ms_axb = @ss_axb.quo(@df_axb)
      @ms_within = @ss_within.quo(@df_within)
    elsif [:ms_a, :ms_b, :ms_axb, :ms_within].all? {|v| opts.has_key? v}
      @ms_a = opts.delete :ms_a
      @ms_b = opts.delete :ms_b
      @ms_axb = opts.delete :ms_axb
      @ms_within = opts.delete :ms_within

      @ss_a = @ms_a * @df_a
      @ss_b = @ms_b * @df_b
      @ss_axb = @ms_axb * @df_axb
      @ss_within = @ms_within * @df_within
    else
      raise "You should set all ss or ms"
    end
    @ss_total = @ss_a + @ss_b + @ss_axb + @ss_within
    @ms_total = @ms_a + @ms_b + @ms_axb + @ms_within
    opts_default = {:name=>_("ANOVA Two-Way"),
      :name_a=>_("A"),
      :name_b=>_("B"),
      :name_within=>_("Within")
    }
    @opts = opts_default.merge(opts)
    # Only the keys with a default are pushed through their writers.
    opts_default.keys.each {|k|
      send("#{k}=", @opts[k])
    }
    @f_a_object = Statsample::Test::F.new(@ms_a, @ms_within, @df_a, @df_within)
    @f_b_object = Statsample::Test::F.new(@ms_b, @ms_within, @df_b, @df_within)
    @f_axb_object = Statsample::Test::F.new(@ms_axb, @ms_within, @df_axb, @df_within)
  end

  # F value for factor a
  def f_a
    @f_a_object.f
  end

  # F value for factor b
  def f_b
    @f_b_object.f
  end

  # F value for the a x b interaction
  def f_axb
    @f_axb_object.f
  end

  # Probability reported by the F test object of factor a
  def f_a_probability
    @f_a_object.probability
  end

  # Probability reported by the F test object of factor b
  def f_b_probability
    @f_b_object.probability
  end

  # Probability reported by the F test object of the a x b interaction
  def f_axb_probability
    @f_axb_object.probability
  end

  def report_building(builder) #:nodoc:
    builder.section(:name=>@name) do |b|
      report_building_table(b)
    end
  end

  def report_building_table(builder) #:nodoc:
    builder.table(:name=>_("%s Table") % @name, :header=>%w{source ss df ms f p}.map {|v| _(v)}) do |t|
      t.row([@name_a, "%0.3f" % @ss_a, @df_a, "%0.3f" % @ms_a , "%0.3f" % f_a, "%0.4f" % f_a_probability] )
      t.row([@name_b, "%0.3f" % @ss_b, @df_b, "%0.3f" % @ms_b , "%0.3f" % f_b, "%0.4f" % f_b_probability] )
      t.row(["%s X %s" % [@name_a, @name_b], "%0.3f" % @ss_axb, @df_axb, "%0.3f" % @ms_axb , "%0.3f" % f_axb, "%0.4f" % f_axb_probability] )
      t.row([@name_within, "%0.3f" % @ss_within, @df_within, nil,nil,nil] )
      t.row([_("Total"), "%0.3f" % @ss_total, @df_total, nil,nil,nil] )
    end
  end
end
|
|
107
|
+
|
|
108
|
+
# Two Way Anova with vectors
|
|
109
|
+
# Example:
|
|
110
|
+
# v1 = Daru::Vector.new([1,1,2,2])
|
|
111
|
+
# v2 = Daru::Vector.new([1,2,1,2])
|
|
112
|
+
# v3 = Daru::Vector.new([5,3,1,5])
|
|
113
|
+
# anova=Statsample::Anova::TwoWayWithVectors.new(:a=>v1,:b=>v2, :dependent=>v3)
|
|
114
|
+
#
|
|
115
|
+
class TwoWayWithVectors < TwoWay
  # Show summary Levene test
  attr_accessor :summary_levene
  # Show summary descriptives for variables (means)
  attr_accessor :summary_descriptives
  # Column names used inside the internal data frame (:a, :b, :dependent)
  attr_reader :a_var, :b_var, :dep_var

  # Computes the sums of squares and degrees of freedom from the vectors
  # given as :a, :b and :dependent, then hands them to TwoWay#initialize.
  # For now, only equal sample cells allowed.
  def initialize(opts=Hash.new)
    required = [:a, :b, :dependent]
    raise "You should insert at least :a, :b and :dependent" unless required.all? { |key| opts.has_key?(key) }

    @a_var, @b_var, @dep_var = :a, :b, :dependent
    @a_vector, @b_vector, @dep_vector =
      Statsample.only_valid_clone(opts[:a], opts[:b], opts[:dependent])

    frame = Daru::DataFrame.new({@a_var=>@a_vector, @b_var=>@b_vector, @dep_var=>@dep_vector})
    @ds = frame.clone_only_valid
    levels_a = @a_vector.factors.size
    levels_b = @b_vector.factors.size
    @x_general = @dep_vector.mean
    @axb_means = {}
    @axb_sd = {}
    @vectors = []

    # Size of the first cell seen; every later cell has to match it.
    cell_n = nil
    @ds.to_multiset_by_split(a_var, b_var).each_vector(dep_var) do |cell, values|
      @axb_means[cell] = values.mean
      @axb_sd[cell] = values.sd
      @vectors << values
      cell_n ||= values.size
      raise "All cell sizes should be equal" if cell_n != values.size
    end

    @a_means = {}
    @ds.to_multiset_by_split(a_var).each_vector(dep_var) do |level, values|
      @a_means[level] = values.mean
    end
    @b_means = {}
    @ds.to_multiset_by_split(b_var).each_vector(dep_var) do |level, values|
      @b_means[level] = values.mean
    end

    ss_a = cell_n * levels_b * @ds[a_var].factors.inject(0) do |acc, level|
      acc + (@a_means[level] - @x_general)**2
    end
    ss_b = cell_n * levels_a * @ds[b_var].factors.inject(0) do |acc, level|
      acc + (@b_means[level] - @x_general)**2
    end
    squared_residuals = @ds.collect(:row) do |row|
      (row[dep_var] - @axb_means[[row[a_var], row[b_var]]])**2
    end
    ss_within = squared_residuals.sum
    # Each entry of @axb_means is keyed by the [a level, b level] pair.
    ss_axb = cell_n * @axb_means.inject(0) do |acc, ((level_a, level_b), cell_mean)|
      acc + (cell_mean - @a_means[level_a] - @b_means[level_b] + @x_general)**2
    end

    df_a = levels_a - 1
    df_b = levels_b - 1
    df_within = (levels_a * levels_b) * (cell_n - 1)

    opts_default = {
      :name=>_("Anova Two-Way on %s") % @ds[dep_var].name,
      :name_a=>@ds[a_var].name,
      :name_b=>@ds[b_var].name,
      :summary_descriptives=>true,
      :summary_levene=>false
    }
    computed = {:ss_a=>ss_a, :ss_b=>ss_b, :ss_axb=>ss_axb, :ss_within=>ss_within, :df_a=>df_a, :df_b=>df_b, :df_within=>df_within}
    @opts = opts_default.merge(opts).merge(computed)

    super(@opts)
  end

  # Levene test over the dependent-variable vectors of every a x b cell.
  def levene
    Statsample::Test.levene(@vectors, :name=>_("Test of Homogeneity of variances (Levene)"))
  end

  def report_building(builder) #:nodoc:#
    builder.section(:name=>@name) do |s|
      if summary_descriptives
        header = [''] + @ds[a_var].factors.map { |a| @ds[a_var].index_of(a) } + [_("%s Mean") % @name_b]
        s.table(:header => header) do |t|
          @ds[b_var].factors.each do |b|
            cell_means = @ds[a_var].factors.map { |a| "%0.3f" % @axb_means[[a, b]] }
            t.row([@ds[b_var].index_of(b)] + cell_means + ["%0.3f" % @b_means[b]])
          end
          a_level_means = @ds[a_var].factors.map { |a| "%0.3f" % @a_means[a] }
          t.row([_("%s Mean") % @name_a] + a_level_means + ["%0.3f" % @x_general])
        end
      end
      s.parse_element(levene) if summary_levene
      report_building_table(s)
    end
  end
end
|
|
206
|
+
end
|
|
207
|
+
end
|
|
@@ -0,0 +1,406 @@
|
|
|
1
|
+
require 'statsample/bivariate/pearson'
|
|
2
|
+
module Statsample
|
|
3
|
+
# Diverse methods and classes to calculate bivariate relations
|
|
4
|
+
# Specific classes:
|
|
5
|
+
# * Statsample::Bivariate::Pearson : Pearson correlation coefficient (r)
|
|
6
|
+
# * Statsample::Bivariate::Tetrachoric : Tetrachoric correlation
|
|
7
|
+
# * Statsample::Bivariate::Polychoric : Polychoric correlation (using joint, two-step and polychoric series)
|
|
8
|
+
module Bivariate
|
|
9
|
+
autoload(:Polychoric, 'statsample/bivariate/polychoric')
|
|
10
|
+
autoload(:Tetrachoric, 'statsample/bivariate/tetrachoric')
|
|
11
|
+
class << self
|
|
12
|
+
# Covariance between two vectors.
# Returns nil when no valid pair is left; uses GSL when
# Statsample.has_gsl? is true and covariance_slow otherwise.
def covariance(v1,v2)
  valid1, valid2 = Statsample.only_valid_clone(v1, v2)
  return nil if valid1.size == 0
  return covariance_slow(valid1, valid2) unless Statsample.has_gsl?

  GSL::Stats::covariance(valid1.to_gsl, valid2.to_gsl)
end
|
|
23
|
+
# Estimate the ML between two dichotomic vectors.
# Adds real*log(pred) + (1-real)*log(1-pred) over every index
# of the valid prediction vector.
def maximum_likehood_dichotomic(pred,real)
  predicted, observed = Statsample.only_valid_clone(pred, real)
  total = 0
  predicted.each_index do |i|
    hit_term = observed[i] * Math.log(predicted[i])
    miss_term = (1 - observed[i]) * Math.log(1 - predicted[i])
    total += hit_term + miss_term
  end
  total
end
|
|
32
|
+
|
|
33
|
+
# Pure Ruby covariance: cross-product sum of deviations divided by n-1.
def covariance_slow(v1,v2) # :nodoc:
  valid1, valid2 = Statsample.only_valid(v1, v2)
  cross_products = sum_of_squares(valid1, valid2)
  cross_products / (valid1.size - 1)
end
|
|
37
|
+
# Sum over all valid cases of (v1[i] - mean(v1)) * (v2[i] - mean(v2)).
def sum_of_squares(v1,v2)
  xs, ys = Statsample.only_valid_clone(v1, v2)
  # Re-index both clones so positions 0...size can be used below.
  xs.reset_index!
  ys.reset_index!
  mean_x = xs.mean
  mean_y = ys.mean
  (0...xs.size).inject(0) do |acc, i|
    acc + (xs[i] - mean_x) * (ys[i] - mean_y)
  end
end
|
|
45
|
+
# Calculate Pearson correlation coefficient (r) between 2 vectors.
# Returns nil when no valid pair is left; uses GSL when
# Statsample.has_gsl? is true and pearson_slow otherwise.
def pearson(v1,v2)
  valid1, valid2 = Statsample.only_valid_clone(v1, v2)
  return nil if valid1.size == 0
  return pearson_slow(valid1, valid2) unless Statsample.has_gsl?

  GSL::Stats::correlation(valid1.to_gsl, valid2.to_gsl)
end
|
|
55
|
+
# Pure Ruby Pearson r: cross-product sum divided by the product of the
# square roots of each vector's own sum of squares.
def pearson_slow(v1,v2) # :nodoc:
  xs, ys = Statsample.only_valid_clone(v1, v2)
  cross_products = sum_of_squares(xs, ys)
  scale = Math.sqrt(xs.sum_of_squares) * Math.sqrt(ys.sum_of_squares)
  cross_products.quo(scale)
end
|
|
62
|
+
alias :correlation :pearson
|
|
63
|
+
# Retrieves the value for t test for a pearson correlation
# between two vectors to test the null hypothesis of r=0.
# Returns 0 when r is exactly 1.0, otherwise delegates to t_r.
def t_pearson(v1,v2)
  valid1, valid2 = Statsample.only_valid_clone(v1, v2)
  r = pearson(valid1, valid2)
  return 0 if r == 1.0

  t_r(r, valid1.size)
end
|
|
74
|
+
# Retrieves the value for t test for a pearson correlation
# giving r and vector size: r * sqrt((size - 2) / (1 - r^2)).
# Source : http://faculty.chass.ncsu.edu/garson/PA765/correl.htm
def t_r(r,size)
  degrees_of_freedom = (size - 2).to_f
  unexplained = 1 - r**2
  r * Math.sqrt(degrees_of_freedom / unexplained)
end
|
|
80
|
+
# Retrieves the probability value (a la SPSS)
# for a given t, size and number of tails.
# The third parameter selects the tails:
# * :both or 2 : for r!=0 (default)
# * :right, :positive or 1 : for r > 0
# * :left, :negative : for r < 0
# The t distribution is evaluated with size-2 degrees of freedom.
def prop_pearson(t, size, tails=:both)
  # Normalize the accepted aliases to :both, :right or :left.
  tails = case tails
          when 2 then :both
          when 1, :positive then :right
          when :negative then :left
          else tails
          end

  two_tailed = (tails == :both)
  n_tails = two_tailed ? 2 : 1
  # A two-tailed test always reads the lower tail.
  t = -t if t > 0 && two_tailed
  cdf = Distribution::T.cdf(t, size - 2)
  tails == :right ? 1.0 - (cdf * n_tails) : cdf * n_tails
end
|
|
104
|
+
|
|
105
|
+
|
|
106
|
+
# Predicted time for pairwise correlation matrix, in milliseconds.
# See benchmarks/correlation_matrix.rb to see mode of calculation.
def prediction_pairwise(vars,cases)
  fitted = -0.518111-0.000746*cases+1.235608*vars+0.000740*cases*vars
  (fitted**2) / 100
end
|
|
112
|
+
# Predicted time for optimized correlation matrix, in milliseconds.
# See benchmarks/correlation_matrix.rb to see mode of calculation.
def prediction_optimized(vars,cases)
  fitted = 4+0.018128*cases+0.246871*vars+0.001169*vars*cases
  (fitted**2) / 100
end
|
|
118
|
+
# Returns residual score after delete variance
# from another variable: standardized +from+ minus r times
# standardized +del+, with nil wherever either score is nil.
#
def residuals(from,del)
  r = Statsample::Bivariate.pearson(from, del)
  z_from = from.vector_standarized
  z_del = del.vector_standarized
  z_from.reset_index!
  z_del.reset_index!
  scores = []
  z_from.each_index do |i|
    scores << (z_from[i].nil? || z_del[i].nil? ? nil : z_from[i] - r * z_del[i])
  end
  Daru::Vector.new(scores)
end
|
|
136
|
+
# Correlation between v1 and v2, controlling the effect of
# control on both.
def partial_correlation(v1,v2,control)
  xs, ys, zs = Statsample.only_valid_clone(v1, v2, control)
  r_xy = pearson(xs, ys)
  r_xz = pearson(xs, zs)
  r_yz = pearson(ys, zs)
  numerator = r_xy - (r_xz * r_yz)
  denominator = Math.sqrt(1 - r_xz**2) * Math.sqrt(1 - r_yz**2)
  numerator.quo(denominator)
end
|
|
145
|
+
|
|
146
|
+
# Covariance matrix computed with GSL matrix algebra: columns are
# centered on their means and the cross-product matrix is divided by n-1.
def covariance_matrix_optimized(ds)
  data = ds.to_gsl
  n_rows = data.row_size
  n_cols = data.column_size
  column_means = ((1/n_rows.to_f)*GSL::Matrix.ones(1,n_rows)*data).row(0)
  deviations = data-(GSL::Matrix.ones(n_rows,n_cols)*GSL::Matrix.diag(column_means))
  cross_products = deviations.transpose*deviations
  ((1/(n_rows-1).to_f))*cross_products
end
|
|
156
|
+
|
|
157
|
+
# Covariance matrix.
# Order of rows and columns follows ds.vectors.
# The GSL implementation is used only when the dataset has no missing
# values, GSL is available and its predicted time is lower than the
# pairwise one.
def covariance_matrix(ds)
  vars, cases = ds.ncols, ds.nrows
  use_optimized = !ds.include_values?(*Daru::MISSING_VALUES) &&
                  Statsample.has_gsl? &&
                  prediction_optimized(vars, cases) < prediction_pairwise(vars, cases)
  cm = use_optimized ? covariance_matrix_optimized(ds) : covariance_matrix_pairwise(ds)
  cm.extend(Statsample::CovariateMatrix)
  cm.fields = ds.vectors.to_a
  cm
end
|
|
171
|
+
|
|
172
|
+
|
|
173
|
+
# Covariance matrix built one pair of vectors at a time.
# Cells involving a non numeric vector are nil; the diagonal holds the
# variance; each off-diagonal covariance is computed once and reused
# for its mirrored cell.
def covariance_matrix_pairwise(ds)
  computed = {}
  names = ds.vectors.to_a
  mat_rows = names.map do |row|
    names.map do |col|
      next nil if ds[row].type != :numeric || ds[col].type != :numeric
      next ds[row].variance if row == col

      mirrored = computed[[col, row]]
      next mirrored unless mirrored.nil?

      computed[[row, col]] = covariance(ds[row], ds[col])
    end
  end

  Matrix.rows mat_rows
end
|
|
196
|
+
|
|
197
|
+
# Correlation matrix.
# Order of rows and columns follows ds.vectors.
# The GSL implementation is used only when the dataset has no missing
# values, GSL is available and its predicted time is lower than the
# pairwise one.
def correlation_matrix(ds)
  vars, cases = ds.ncols, ds.nrows
  use_optimized = !ds.include_values?(*Daru::MISSING_VALUES) &&
                  Statsample.has_gsl? &&
                  prediction_optimized(vars, cases) < prediction_pairwise(vars, cases)
  cm = use_optimized ? correlation_matrix_optimized(ds) : correlation_matrix_pairwise(ds)
  cm.extend(Statsample::CovariateMatrix)
  cm.fields = ds.vectors.to_a
  cm
end
|
|
210
|
+
|
|
211
|
+
# Correlation matrix derived from the GSL covariance matrix by scaling
# it on both sides with the inverse standard deviations.
def correlation_matrix_optimized(ds)
  covariances = covariance_matrix_optimized(ds)
  inverse_sds = GSL::Matrix.diagonal(covariances.diagonal.sqrt.pow(-1))
  cm = inverse_sds*covariances*inverse_sds
  # Fix diagonal
  covariances.row_size.times do |i|
    cm[i,i] = 1.0
  end
  cm
end
|
|
221
|
+
# Correlation matrix built one pair of vectors at a time.
# The diagonal is always 1.0; other cells involving a non numeric
# vector are nil; each off-diagonal r is computed once and reused for
# its mirrored cell.
def correlation_matrix_pairwise(ds)
  computed = {}
  names = ds.vectors.to_a
  cm = names.map do |row|
    names.map do |col|
      next 1.0 if row == col
      next nil if ds[row].type != :numeric || ds[col].type != :numeric

      mirrored = computed[[col, row]]
      next mirrored unless mirrored.nil?

      computed[[row, col]] = pearson(ds[row], ds[col])
    end
  end

  Matrix.rows cm
end
|
|
244
|
+
|
|
245
|
+
# Retrieves the n valid pairwise.
# Diagonal cells count the non missing values of one vector; the other
# cells count the cases kept by Statsample.only_valid_clone for the pair.
def n_valid_matrix(ds)
  names = ds.vectors.to_a
  counts = names.map do |row|
    names.map do |col|
      if row == col
        ds[row].reject_values(*Daru::MISSING_VALUES).size
      else
        valid_row, _valid_col = Statsample.only_valid_clone(ds[row], ds[col])
        valid_row.size
      end
    end
  end

  Matrix.rows counts
end
|
|
261
|
+
|
|
262
|
+
# Matrix of correlation probabilities.
# Order of rows and columns follows the dataset's vector names.
#
# +tails+ is passed to prop_pearson unchanged. Diagonal cells and cells
# involving a non numeric vector are nil.
def correlation_probability_matrix(ds, tails=:both)
  # The sibling matrix builders read the names from ds.vectors; #fields
  # is still honoured for dataset objects that provide it.
  names = ds.respond_to?(:fields) ? ds.fields : ds.vectors.to_a
  rows = names.map do |row|
    names.map do |col|
      next nil if row == col || ds[row].type != :numeric || ds[col].type != :numeric

      # Only the number of valid pairs is needed, as the sample size.
      valid_row, _valid_col = Statsample.only_valid_clone(ds[row], ds[col])
      prop_pearson(t_pearson(ds[row], ds[col]), valid_row.size, tails)
    end
  end
  Matrix.rows(rows)
end
|
|
274
|
+
|
|
275
|
+
# Spearman ranked correlation coefficient (rho) between 2 vectors:
# the Pearson correlation of the ranked valid cases.
def spearman(v1,v2)
  valid1, valid2 = Statsample.only_valid_clone(v1, v2)
  ranks1 = valid1.ranked
  ranks2 = valid2.ranked
  pearson(ranks1, ranks2)
end
|
|
281
|
+
# Calculate Point biserial correlation. Equal to Pearson correlation, with
# one dichotomous value replaced by "0" and the other by "1".
# Raises TypeError when the first vector does not have exactly two
# factors or the second one is not numeric.
def point_biserial(dichotomous,continous)
  frame = Daru::DataFrame.new({:d=>dichotomous,:c=>continous}).reject_values(*Daru::MISSING_VALUES)
  raise(TypeError, "First vector should be dichotomous") if frame[:d].factors.size != 2
  raise(TypeError, "Second vector should be continous") if frame[:c].type != :numeric

  # The lowest factor after sorting plays the role of "0".
  low_level = frame[:d].factors.sort.to_a[0]
  group0 = frame.filter_vector(:c) { |row| row[:d] == low_level }
  group1 = frame.filter_vector(:c) { |row| row[:d] != low_level }
  standardized_gap = (group1.mean - group0.mean).to_f / frame[:c].sdp
  group_balance = Math.sqrt(group0.size * group1.size.to_f / frame.nrows**2)
  standardized_gap * group_balance
end
|
|
292
|
+
# Kendall Rank Correlation Coefficient (Tau a)
# Based on Hervé Abdi article.
#
# Only the cases valid in both vectors are ranked and compared, so the
# number of cases used in the denominator is the size of the valid
# vectors, not the size of the vectors passed in.
def tau_a(v1,v2)
  v1a, v2a = Statsample.only_valid_clone(v1, v2)
  n = v1a.size
  v1r, v2r = v1a.ranked, v2a.ranked
  o1 = ordered_pairs(v1r)
  o2 = ordered_pairs(v2r)
  # Number of ordered pairs that appear in only one of the two sets.
  delta = o1.size*2 - (o2 & o1).size*2
  1 - (delta * 2 / (n*(n-1)).to_f)
end
|
|
303
|
+
# Calculates Goodman and Kruskal’s Tau b correlation.
# Tb is an asymmetric P-R-E measure of association for nominal scales
# (Mielke, X)
#
# Tau-b defines perfect association as strict monotonicity. Although it
# requires strict monotonicity to reach 1.0, it does not penalize ties as
# much as some other measures.
#
# Computed as (P-Q) / sqrt((P+Q+Y)*(P+Q+X)) from the counts given by #pairs.
# NOTE(review): this formula is the one usually published as Kendall's
# tau-b - confirm the attribution above.
# == Reference
# Mielke, P. GOODMAN–KRUSKAL TAU AND GAMMA.
# Source: http://faculty.chass.ncsu.edu/garson/PA765/assocordinal.htm
def tau_b(matrix)
  counts = pairs(matrix)
  surplus = (counts['P'] - counts['Q']).to_f
  scale = Math.sqrt((counts['P'] + counts['Q'] + counts['Y']) * (counts['P'] + counts['Q'] + counts['X'])).to_f
  surplus / scale
end
|
|
317
|
+
# Calculates Goodman and Kruskal's gamma.
#
# Gamma is the surplus of concordant pairs over discordant pairs, as a
# percentage of all pairs ignoring ties: (P-Q) / (P+Q).
#
# Source: http://faculty.chass.ncsu.edu/garson/PA765/assocordinal.htm
def gamma(matrix)
  counts = pairs(matrix)
  surplus = (counts['P'] - counts['Q']).to_f
  untied = (counts['P'] + counts['Q']).to_f
  surplus / untied
end
|
|
327
|
+
# Calculate indexes for a matrix. The rows and cols have to be ordered.
#
# Returns a hash of summed cell products:
# * 'P': cell times every cell in a later row and a later column
# * 'Q': cell times every cell in a later row and an earlier column
# * 'X': cell times every cell in a later row of the same column
# * 'Y': cell times every cell in a later column of the same row
def pairs(matrix)
  row_ids = (0...matrix.row_size).to_a
  col_ids = (0...matrix.column_size).to_a
  conc = disc = ties_x = ties_y = 0

  row_ids.each do |x|
    rows_below = row_ids.drop(x + 1)
    col_ids.each do |y|
      # Pair the cell with every cell of the rows below it; the column
      # position of the second cell decides which counter it feeds.
      rows_below.each do |x2|
        col_ids.each do |y2|
          product = matrix[x, y] * matrix[x2, y2]
          if y2 > y
            conc += product
          elsif y2 < y
            disc += product
          else
            ties_x += product
          end
        end
      end
      # Same row, later columns.
      col_ids.drop(y + 1).each do |y2|
        ties_y += matrix[x, y] * matrix[x, y2]
      end
    end
  end

  {'P'=>conc,'Q'=>disc,'Y'=>ties_y,'X'=>ties_x}
end
|
|
369
|
+
|
|
370
|
+
# Every [earlier, later] pair of elements of the vector, taken in
# positional order.
def ordered_pairs(vector)
  values = vector.to_a
  values.each_index.flat_map do |i|
    values.drop(i + 1).map { |later| [values[i], later] }
  end
end
|
|
380
|
+
=begin
|
|
381
|
+
def sum_of_codeviated(v1,v2)
|
|
382
|
+
v1a,v2a=Statsample.only_valid(v1,v2)
|
|
383
|
+
sum=0
|
|
384
|
+
(0...v1a.size).each{|i|
|
|
385
|
+
sum+=v1a[i]*v2a[i]
|
|
386
|
+
}
|
|
387
|
+
sum-((v1a.sum*v2a.sum) / v1a.size.to_f)
|
|
388
|
+
end
|
|
389
|
+
=end
|
|
390
|
+
# Report the minimum number of valid cases of a covariate matrix
# based on a dataset: the smallest cell of n_valid_matrix(ds), capped
# by the number of rows of the dataset.
def min_n_valid(ds)
  smallest = ds.nrows
  counts = n_valid_matrix(ds)
  counts.row_size.times do |x|
    counts.column_size.times do |y|
      cell = counts[x, y]
      smallest = cell if cell < smallest
    end
  end
  smallest
end
|
|
402
|
+
end
|
|
403
|
+
end
|
|
404
|
+
end
|
|
405
|
+
|
|
406
|
+
|