statsample-ekatena 2.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +15 -0
- data/.travis.yml +23 -0
- data/CONTRIBUTING.md +17 -0
- data/Gemfile +2 -0
- data/History.txt +457 -0
- data/LICENSE.txt +12 -0
- data/README.md +175 -0
- data/Rakefile +44 -0
- data/benchmarks/correlation_matrix_15_variables.rb +32 -0
- data/benchmarks/correlation_matrix_5_variables.rb +33 -0
- data/benchmarks/correlation_matrix_methods/correlation_matrix.ds +0 -0
- data/benchmarks/correlation_matrix_methods/correlation_matrix.html +93 -0
- data/benchmarks/correlation_matrix_methods/correlation_matrix.rb +71 -0
- data/benchmarks/correlation_matrix_methods/correlation_matrix.xls +0 -0
- data/benchmarks/correlation_matrix_methods/correlation_matrix_gsl_ruby.ods +0 -0
- data/benchmarks/correlation_matrix_methods/correlation_matrix_with_graphics.ods +0 -0
- data/benchmarks/correlation_matrix_methods/results.ds +0 -0
- data/benchmarks/factor_map.rb +37 -0
- data/benchmarks/helpers_benchmark.rb +5 -0
- data/data/locale/es/LC_MESSAGES/statsample.mo +0 -0
- data/doc_latex/manual/equations.tex +78 -0
- data/examples/boxplot.rb +28 -0
- data/examples/chisquare_test.rb +23 -0
- data/examples/correlation_matrix.rb +32 -0
- data/examples/dataset.rb +30 -0
- data/examples/dominance_analysis.rb +33 -0
- data/examples/dominance_analysis_bootstrap.rb +32 -0
- data/examples/histogram.rb +26 -0
- data/examples/icc.rb +24 -0
- data/examples/levene.rb +29 -0
- data/examples/multiple_regression.rb +20 -0
- data/examples/multivariate_correlation.rb +33 -0
- data/examples/parallel_analysis.rb +40 -0
- data/examples/polychoric.rb +40 -0
- data/examples/principal_axis.rb +26 -0
- data/examples/reliability.rb +31 -0
- data/examples/scatterplot.rb +25 -0
- data/examples/t_test.rb +27 -0
- data/examples/tetrachoric.rb +17 -0
- data/examples/u_test.rb +24 -0
- data/examples/vector.rb +20 -0
- data/examples/velicer_map_test.rb +46 -0
- data/grab_references.rb +29 -0
- data/lib/spss.rb +134 -0
- data/lib/statsample-ekatena/analysis.rb +100 -0
- data/lib/statsample-ekatena/analysis/suite.rb +89 -0
- data/lib/statsample-ekatena/analysis/suitereportbuilder.rb +44 -0
- data/lib/statsample-ekatena/anova.rb +24 -0
- data/lib/statsample-ekatena/anova/contrast.rb +79 -0
- data/lib/statsample-ekatena/anova/oneway.rb +187 -0
- data/lib/statsample-ekatena/anova/twoway.rb +207 -0
- data/lib/statsample-ekatena/bivariate.rb +406 -0
- data/lib/statsample-ekatena/bivariate/pearson.rb +54 -0
- data/lib/statsample-ekatena/codification.rb +182 -0
- data/lib/statsample-ekatena/converter/csv.rb +28 -0
- data/lib/statsample-ekatena/converter/spss.rb +48 -0
- data/lib/statsample-ekatena/converters.rb +211 -0
- data/lib/statsample-ekatena/crosstab.rb +188 -0
- data/lib/statsample-ekatena/daru.rb +115 -0
- data/lib/statsample-ekatena/dataset.rb +10 -0
- data/lib/statsample-ekatena/dominanceanalysis.rb +425 -0
- data/lib/statsample-ekatena/dominanceanalysis/bootstrap.rb +232 -0
- data/lib/statsample-ekatena/factor.rb +104 -0
- data/lib/statsample-ekatena/factor/map.rb +124 -0
- data/lib/statsample-ekatena/factor/parallelanalysis.rb +166 -0
- data/lib/statsample-ekatena/factor/pca.rb +242 -0
- data/lib/statsample-ekatena/factor/principalaxis.rb +243 -0
- data/lib/statsample-ekatena/factor/rotation.rb +198 -0
- data/lib/statsample-ekatena/formula/fit_model.rb +46 -0
- data/lib/statsample-ekatena/formula/formula.rb +306 -0
- data/lib/statsample-ekatena/graph.rb +11 -0
- data/lib/statsample-ekatena/graph/boxplot.rb +236 -0
- data/lib/statsample-ekatena/graph/histogram.rb +198 -0
- data/lib/statsample-ekatena/graph/scatterplot.rb +213 -0
- data/lib/statsample-ekatena/histogram.rb +180 -0
- data/lib/statsample-ekatena/matrix.rb +329 -0
- data/lib/statsample-ekatena/multiset.rb +310 -0
- data/lib/statsample-ekatena/regression.rb +65 -0
- data/lib/statsample-ekatena/regression/multiple.rb +89 -0
- data/lib/statsample-ekatena/regression/multiple/alglibengine.rb +128 -0
- data/lib/statsample-ekatena/regression/multiple/baseengine.rb +251 -0
- data/lib/statsample-ekatena/regression/multiple/gslengine.rb +129 -0
- data/lib/statsample-ekatena/regression/multiple/matrixengine.rb +205 -0
- data/lib/statsample-ekatena/regression/multiple/rubyengine.rb +86 -0
- data/lib/statsample-ekatena/regression/simple.rb +121 -0
- data/lib/statsample-ekatena/reliability.rb +150 -0
- data/lib/statsample-ekatena/reliability/icc.rb +415 -0
- data/lib/statsample-ekatena/reliability/multiscaleanalysis.rb +181 -0
- data/lib/statsample-ekatena/reliability/scaleanalysis.rb +233 -0
- data/lib/statsample-ekatena/reliability/skillscaleanalysis.rb +114 -0
- data/lib/statsample-ekatena/resample.rb +15 -0
- data/lib/statsample-ekatena/shorthand.rb +125 -0
- data/lib/statsample-ekatena/srs.rb +169 -0
- data/lib/statsample-ekatena/test.rb +82 -0
- data/lib/statsample-ekatena/test/bartlettsphericity.rb +45 -0
- data/lib/statsample-ekatena/test/chisquare.rb +73 -0
- data/lib/statsample-ekatena/test/f.rb +52 -0
- data/lib/statsample-ekatena/test/kolmogorovsmirnov.rb +63 -0
- data/lib/statsample-ekatena/test/levene.rb +88 -0
- data/lib/statsample-ekatena/test/t.rb +309 -0
- data/lib/statsample-ekatena/test/umannwhitney.rb +208 -0
- data/lib/statsample-ekatena/test/wilcoxonsignedrank.rb +90 -0
- data/lib/statsample-ekatena/vector.rb +19 -0
- data/lib/statsample-ekatena/version.rb +3 -0
- data/lib/statsample.rb +282 -0
- data/po/es/statsample.mo +0 -0
- data/po/es/statsample.po +959 -0
- data/po/statsample.pot +947 -0
- data/references.txt +24 -0
- data/statsample-ekatena.gemspec +49 -0
- data/test/fixtures/bank2.dat +200 -0
- data/test/fixtures/correlation_matrix.rb +17 -0
- data/test/fixtures/df.csv +15 -0
- data/test/fixtures/hartman_23.matrix +9 -0
- data/test/fixtures/stock_data.csv +500 -0
- data/test/fixtures/tetmat_matrix.txt +5 -0
- data/test/fixtures/tetmat_test.txt +1001 -0
- data/test/helpers_tests.rb +83 -0
- data/test/test_analysis.rb +176 -0
- data/test/test_anova_contrast.rb +36 -0
- data/test/test_anovaoneway.rb +26 -0
- data/test/test_anovatwoway.rb +37 -0
- data/test/test_anovatwowaywithdataset.rb +47 -0
- data/test/test_anovawithvectors.rb +102 -0
- data/test/test_awesome_print_bug.rb +16 -0
- data/test/test_bartlettsphericity.rb +25 -0
- data/test/test_bivariate.rb +164 -0
- data/test/test_codification.rb +78 -0
- data/test/test_crosstab.rb +67 -0
- data/test/test_dominance_analysis.rb +39 -0
- data/test/test_factor.rb +228 -0
- data/test/test_factor_map.rb +38 -0
- data/test/test_factor_pa.rb +56 -0
- data/test/test_fit_model.rb +88 -0
- data/test/test_ggobi.rb +35 -0
- data/test/test_gsl.rb +15 -0
- data/test/test_histogram.rb +109 -0
- data/test/test_matrix.rb +48 -0
- data/test/test_multiset.rb +176 -0
- data/test/test_regression.rb +231 -0
- data/test/test_reliability.rb +223 -0
- data/test/test_reliability_icc.rb +198 -0
- data/test/test_reliability_skillscale.rb +57 -0
- data/test/test_resample.rb +24 -0
- data/test/test_srs.rb +9 -0
- data/test/test_statistics.rb +69 -0
- data/test/test_stest.rb +69 -0
- data/test/test_stratified.rb +17 -0
- data/test/test_test_f.rb +33 -0
- data/test/test_test_kolmogorovsmirnov.rb +34 -0
- data/test/test_test_t.rb +62 -0
- data/test/test_umannwhitney.rb +27 -0
- data/test/test_vector.rb +12 -0
- data/test/test_wilcoxonsignedrank.rb +64 -0
- metadata +570 -0
|
@@ -0,0 +1,169 @@
|
|
|
1
|
+
module Statsample
  # Estimators for simple random sampling (SRS): finite population
  # corrections, sample-size estimation, standard deviations / standard
  # errors and confidence intervals for proportions and means.
  #
  # Throughout this module +sam+ / +n_sample+ is the sample size and
  # +pop+ / +n_population+ is the population size.
  #
  # == Reference:
  # * Cochran, W.(1972). Sampling Techniques [spanish edition].
  # * http://stattrek.com/Lesson6/SRS.aspx
  module SRS
    class << self
      ########################
      #
      # :SECTION: Proportion estimation
      #
      # Functions for estimation of proportions
      ########################

      # Finite population correction (over variance):
      # (pop - sam) / (pop - 1).
      # Source: Cochran(1972)
      def fpc_var(sam, pop)
        (pop - sam).quo(pop - 1)
      end

      # Finite population correction (over standard deviation):
      # the square root of #fpc_var.
      def fpc(sam, pop)
        Math.sqrt(fpc_var(sam, pop))
      end

      # Non sample fraction: 1 - (sam / pop).
      def qf(sam, pop)
        1 - sam.quo(pop)
      end

      # Sample size estimation for proportions, infinite population.
      #
      # Computes t**2 * prop * (1 - prop) / d**2, where t is obtained from
      # Distribution::Normal.p_value at 1 - (1 - margin) / 2.
      # * +d+: divisor of the formula; presumably the tolerated absolute
      #   error -- confirm against Cochran(1972).
      # * +prop+: proportion.
      # * +margin+: confidence level (0.95 by default).
      def estimation_n0(d, prop, margin = 0.95)
        t = Distribution::Normal.p_value(1 - (1 - margin).quo(2))
        var = prop * (1 - prop)
        t**2 * var.quo(d**2)
      end

      # Sample size estimation for proportions, finite population.
      #
      # Adjusts #estimation_n0 by the population size +n_pobl+:
      # n0 / (1 + (n0 - 1) / n_pobl).
      def estimation_n(d, prop, n_pobl, margin = 0.95)
        n0 = estimation_n0(d, prop, margin)
        n0.quo(1 + (n0 - 1).quo(n_pobl))
      end

      # Proportion confidence interval with t values.
      # Uses estimated proportion, sample without replacement.
      #
      # The t value comes from Distribution::T.p_value with
      # n_sample - 1 degrees of freedom. Returns [lower, upper].
      def proportion_confidence_interval_t(prop, n_sample, n_population, margin = 0.95)
        t = Distribution::T.p_value(1 - (1 - margin).quo(2), n_sample - 1)
        proportion_confidence_interval(prop, n_sample, n_population, t)
      end

      # Proportion confidence interval with z values.
      # Uses estimated proportion, sample without replacement.
      #
      # The z value comes from Distribution::Normal.p_value.
      # Returns [lower, upper].
      def proportion_confidence_interval_z(p, n_sample, n_population, margin = 0.95)
        z = Distribution::Normal.p_value(1 - (1 - margin).quo(2))
        proportion_confidence_interval(p, n_sample, n_population, z)
      end

      # Proportion confidence interval with an arbitrary critical value +x+.
      # Uses estimated proportion, sample without replacement.
      #
      # Half-width = x * sqrt(qf * p * (1 - p) / (sam - 1)) + 1 / (2 * sam).
      # NOTE(review): the trailing 1 / (2 * sam) term looks like a
      # continuity correction -- confirm against the reference.
      # Returns [p - half_width, p + half_width].
      def proportion_confidence_interval(p, sam, pop, x)
        one_range = x * Math.sqrt((qf(sam, pop) * p * (1 - p)).quo(sam - 1)) + 1.quo(sam * 2.0)
        [p - one_range, p + one_range]
      end

      # Standard deviation for sample distribution of a proportion.
      # Known proportion, sample with replacement.
      # Based on http://stattrek.com/Lesson6/SRS.aspx
      def proportion_sd_kp_wr(p, n_sample)
        Math.sqrt(p * (1 - p).quo(n_sample))
      end

      # Standard deviation for sample distribution of a proportion.
      # Known proportion, sample without replacement.
      #
      # Sources:
      # * Cochran(1972)
      def proportion_sd_kp_wor(p, sam, pop)
        fpc(sam, pop) * Math.sqrt(p * (1 - p).quo(sam))
      end

      # Standard deviation for sample distribution of a proportion.
      # Estimated proportion, sample with replacement.
      # Based on http://stattrek.com/Lesson6/SRS.aspx.
      def proportion_sd_ep_wr(p, n_sample)
        Math.sqrt(p * (1 - p).quo(n_sample - 1))
      end

      # Standard deviation for sample distribution of a proportion.
      # Estimated proportion, sample without replacement.
      # Reference:
      # * Cochran, 1972, Técnicas de muestreo
      def proportion_sd_ep_wor(p, sam, pop)
        fsc = (pop - sam).quo((sam - 1) * pop)
        Math.sqrt(fsc * p * (1 - p))
      end

      # Total estimation sd based on sample.
      # Known proportion, sample without replacement.
      #
      # Returns pop * proportion_sd_kp_wor(prop, sam, pop).
      # Reference:
      # * Cochran(1972)
      def proportion_total_sd_kp_wor(prop, sam, pop)
        # Use the actual parameter names; the previous body referenced
        # undefined locals and raised NameError on every call.
        pop * proportion_sd_kp_wor(prop, sam, pop)
      end

      # Total estimation sd based on sample.
      # Estimated proportion, sample without replacement.
      # Source: Cochran(1972)
      def proportion_total_sd_ep_wor(prop, sam, pop)
        fsc = (pop - sam).to_f / (sam - 1)
        Math.sqrt(fsc * pop * prop * (1 - prop))
      end

      ########################
      #
      # :SECTION: Mean estimation
      #
      ########################

      # Standard error. Known variance, sample with replacement.
      #
      # Computes s / sqrt(sam) * sqrt((pop - 1) / pop).
      def standard_error_ksd_wr(s, sam, pop)
        s.quo(Math.sqrt(sam)) * Math.sqrt((pop - 1).quo(pop))
      end

      # Standard error of the mean. Known variance, sample w/o replacement.
      #
      # Computes s / sqrt(sam) * sqrt(qf(sam, pop)).
      def standard_error_ksd_wor(s, sam, pop)
        s.quo(Math.sqrt(sam)) * Math.sqrt(qf(sam, pop))
      end

      alias_method :standard_error_esd_wr, :standard_error_ksd_wr

      # Standard error of the mean.
      # Estimated variance, without replacement.
      # Cochran (1972) p.47
      def standard_error_esd_wor(s, sam, pop)
        s.quo(Math.sqrt(sam)) * Math.sqrt(qf(sam, pop))
      end

      alias_method :standard_error, :standard_error_esd_wor
      alias_method :se, :standard_error_esd_wor

      # Standard error of total estimation: pop * se(s, sam, pop).
      def standard_error_total(s, sam, pop)
        pop * se(s, sam, pop)
      end

      # Confidence interval using T-Student.
      # Use with n < 60.
      # Returns [lower, upper].
      def mean_confidence_interval_t(mean, s, n_sample, n_population, margin = 0.95)
        t = Distribution::T.p_value(1 - ((1 - margin) / 2), n_sample - 1)
        mean_confidence_interval(mean, s, n_sample, n_population, t)
      end

      # Confidence interval using Z.
      # Use with n > 60.
      # Returns [lower, upper].
      def mean_confidence_interval_z(mean, s, n_sample, n_population, margin = 0.95)
        z = Distribution::Normal.p_value(1 - ((1 - margin) / 2))
        mean_confidence_interval(mean, s, n_sample, n_population, z)
      end

      # Confidence interval using an arbitrary critical value +x+.
      #
      # Better use mean_confidence_interval_z or mean_confidence_interval_t.
      # Returns [mean - x * se, mean + x * se].
      def mean_confidence_interval(mean, s, n_sample, n_population, x)
        range = x * se(s, n_sample, n_population)
        [mean - range, mean + range]
      end
    end
  end
end
|
|
@@ -0,0 +1,82 @@
|
|
|
1
|
+
module Statsample
  # Namespace for statistical tests. Besides lazily loading the individual
  # test classes, it offers helpers to turn a CDF value into a p-value,
  # critical values for confidence intervals, and shorthand constructors.
  module Test
    # Test classes are loaded on first reference.
    # NOTE(review): these paths start with 'statsample/', while this
    # package lists its files under lib/statsample-ekatena/ -- confirm the
    # load path resolves them.
    {
      UMannWhitney: 'statsample/test/umannwhitney',
      Levene: 'statsample/test/levene',
      T: 'statsample/test/t',
      F: 'statsample/test/f',
      ChiSquare: 'statsample/test/chisquare',
      BartlettSphericity: 'statsample/test/bartlettsphericity',
      KolmogorovSmirnov: 'statsample/test/kolmogorovsmirnov',
      WilcoxonSignedRank: 'statsample/test/wilcoxonsignedrank'
    }.each { |const_name, path| autoload(const_name, path) }

    # Returns the probability of getting a value lower or higher
    # than the sample, given its cdf and the number of tails.
    #
    # * <tt>:left</tt> (or <tt>:negative</tt>): returns cdf
    # * <tt>:right</tt> (or <tt>1</tt>, <tt>:positive</tt>): returns 1-cdf
    # * <tt>:both</tt> (or <tt>2</tt>, <tt>:two</tt>): returns twice the
    #   smaller of cdf and 1-cdf
    #
    # Any other value of +tails+ yields nil.
    def p_using_cdf(cdf, tails = :both)
      case tails
      when :left, :negative
        cdf
      when :right, 1, :positive
        1 - cdf
      when :both, 2, :two
        2 * (cdf >= 0.5 ? 1 - cdf : cdf)
      end
    end

    # Critical t to create a confidence interval: the negated value of
    # Distribution::T.p_value at (1 - confidence_level) / 2 with +df+
    # degrees of freedom.
    def t_critical(confidence_level, df)
      lower_tail = (1 - confidence_level) / 2.0
      -Distribution::T.p_value(lower_tail, df)
    end

    # Critical z to create a confidence interval: the negated value of
    # Distribution::Z.p_value at (1 - confidence_level) / 2.
    def z_critical(confidence_level)
      lower_tail = (1 - confidence_level) / 2.0
      -Distribution::Z.p_value(lower_tail)
    end

    extend self

    class << self
      # Builds a chi square test object for +observed+, which must be a
      # Vector or a Matrix; anything else raises.
      def chi_square(observed, expected = nil)
        if Vector === observed
          ChiSquare::WithVector.new(observed, expected)
        elsif Matrix === observed
          ChiSquare::WithMatrix.new(observed, expected)
        else
          raise "Not implemented for #{observed.class}"
        end
      end

      # Shorthand for Statsample::Test::UMannWhitney.new
      #
      # * <tt>v1</tt> and <tt>v2</tt> should be Statsample::Vector.
      def u_mannwhitney(v1, v2)
        Statsample::Test::UMannWhitney.new(v1, v2)
      end

      # Shorthand for Statsample::Test::T::OneSample.new
      def t_one_sample(vector, opts = {})
        Statsample::Test::T::OneSample.new(vector, opts)
      end

      # Shorthand for Statsample::Test::T::TwoSamplesIndependent.new
      def t_two_samples_independent(v1, v2, opts = {})
        Statsample::Test::T::TwoSamplesIndependent.new(v1, v2, opts)
      end

      # Shorthand for Statsample::Test::WilcoxonSignedRank.new
      def wilcoxon_signed_rank(v1, v2, opts = {})
        Statsample::Test::WilcoxonSignedRank.new(v1, v2, opts)
      end

      # Shorthand for Statsample::Test::Levene.new
      def levene(input, opts = {})
        Statsample::Test::Levene.new(input, opts)
      end
    end
  end
end
|
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
module Statsample
  module Test
    # == Bartlett's test of Sphericity.
    # Tests the hypothesis that the sample correlation matrix
    # comes from a multivariate normal population where variables
    # are independent. In other words, the population correlation
    # matrix is the identity matrix.
    # == Reference
    # * Dziuban, C., & Shirkey E. (1974). When is a correlation matrix appropriate for factor analysis? Some decision rules. Psychological Bulletin, 81(6), 358-361.
    class BartlettSphericity
      include Statsample::Test
      include Summarizable

      # Name of the analysis, used as the report label.
      attr_accessor :name
      # Number of cases, number of variables (rows of the matrix),
      # test statistic and degrees of freedom.
      attr_reader :ncases, :nvars, :value, :df

      # Args
      # * _matrix_: correlation matrix
      # * _ncases_: number of cases
      #
      # The statistic and its degrees of freedom are computed immediately.
      def initialize(matrix, ncases)
        @matrix = matrix
        @ncases = ncases
        @nvars = @matrix.row_size
        @name = _("Bartlett's test of sphericity")
        compute
      end

      # Uses SPSS formula:
      # value = -((ncases - 1) - (2 * nvars + 5) / 6) * ln(det(matrix)),
      # df = nvars * (nvars - 1) / 2.
      # On Dziuban & Shirkey, the minus between the first and second
      # statement is a *!!!
      def compute
        scale = (@ncases - 1) - (2 * @nvars + 5).quo(6)
        @value = -scale * Math.log(@matrix.determinant)
        @df = (@nvars * (@nvars - 1)) / 2
      end

      # Right-tail probability of the statistic under a chi square
      # distribution with +df+ degrees of freedom.
      def probability
        1 - Distribution::ChiSquare.cdf(@value, @df)
      end

      # Writes a one-line summary of the test to +builder+.
      def report_building(builder) # :nodoc:
        summary = "%s : X(%d) = %0.4f , p = %0.4f" % [@name, @df, @value, probability]
        builder.text summary
      end
    end
  end
end
|
|
@@ -0,0 +1,73 @@
|
|
|
1
|
+
module Statsample
  module Test
    # Chi square statistics for an observed/expected pair of vectors or
    # matrices.
    module ChiSquare
      # Readers shared by the vector and matrix variants. Including
      # classes must set @value (the statistic) and @df.
      module Shared
        # Degrees of freedom.
        attr_reader :df
        # Chi square statistic.
        attr_reader :value

        # The chi square statistic (same as #value).
        def to_f
          @value
        end

        # The chi square statistic (same as #value).
        def chi_square
          @value
        end

        # Right-tail probability of the statistic under a chi square
        # distribution with +df+ degrees of freedom.
        def probability
          1 - Distribution::ChiSquare.cdf(@value.to_f, @df)
        end
      end

      # Chi square for a two-way table.
      class WithMatrix
        include Statsample::Test::ChiSquare::Shared

        # * +observed+: matrix of observed values.
        # * +expected+: matrix of expected values; when nil it is derived
        #   from the marginals of +observed+ (see #calculate_expected).
        #
        # Raises if both matrices do not have the same dimensions.
        def initialize(observed, expected = nil)
          @observed = observed
          # `||` makes the fallback explicit; the previous `or` form only
          # worked because calculate_expected assigns @expected itself.
          @expected = expected || calculate_expected
          raise "Observed size!=expected size" if @observed.row_size != @expected.row_size || @observed.column_size != @expected.column_size
          @df = (@observed.row_size - 1) * (@observed.column_size - 1)
          @value = compute_chi
        end

        # Builds, stores and returns the expected matrix, where cell
        # (i, j) is (row_sum[i] / total) * (column_sum[j] / total) * total.
        def calculate_expected
          total = @observed.total_sum
          # Marginals do not depend on the cell, so fetch them once
          # instead of once per cell.
          row_totals = @observed.row_sum
          column_totals = @observed.column_sum
          @expected = Matrix.rows(
            @observed.row_size.times.map do |i|
              @observed.column_size.times.map do |j|
                (row_totals[i].quo(total) * column_totals[j].quo(total)) * total
              end
            end
          )
        end

        # Sum over all cells of (observed - expected)**2 / expected,
        # accumulated in row-major order.
        def compute_chi
          (0...@observed.row_size).inject(0) do |acc, i|
            (0...@observed.column_size).inject(acc) do |row_acc, j|
              row_acc + ((@observed[i, j] - @expected[i, j])**2).quo(@expected[i, j])
            end
          end
        end
      end

      # Chi square for two indexable sequences of the same size.
      class WithVector
        include Statsample::Test::ChiSquare::Shared

        # * +observed+: observed values.
        # * +expected+: expected values.
        #
        # Raises if both do not report the same size.
        def initialize(observed, expected)
          @observed = observed
          @expected = expected
          raise "Observed size!=expected size" if @observed.size != @expected.size
          @df = @observed.size - 1
          @value = compute_chi
        end

        # Sum over all positions of (observed - expected)**2 / expected.
        def compute_chi
          (0...@observed.size).inject(0) do |acc, i|
            acc + ((@observed[i] - @expected[i])**2).quo(@expected[i])
          end
        end
      end
    end
  end
end
|
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
module Statsample
  module Test
    # From Wikipedia:
    # An F-test is any statistical test in which the test statistic has an F-distribution under the null hypothesis. It is most often used when comparing statistical models that have been fit to a data set, in order to identify the model that best fits the population from which the data were sampled.
    class F
      include Statsample::Test
      include Summarizable

      # Numerator/denominator variances and degrees of freedom, plus
      # their sums (+var_total+, +df_total+).
      attr_reader :var_num, :var_den, :df_num, :df_den, :var_total, :df_total
      # Tails for probability (:both, :left or :right)
      attr_accessor :tails
      # Name of F analysis
      attr_accessor :name

      # Parameters:
      # * var_num: variance numerator
      # * var_den: variance denominator
      # * df_num: degrees of freedom numerator
      # * df_den: degrees of freedom denominator
      # * opts: optional :tails (default :right) and :name
      #
      # Raises if :tails is :both.
      def initialize(var_num, var_den, df_num, df_den, opts = {})
        @var_num, @var_den = var_num, var_den
        @df_num, @df_den = df_num, df_den
        @var_total = var_num + var_den
        @df_total = df_num + df_den
        defaults = { :tails => :right, :name => _("F Test") }
        @opts = defaults.merge(opts)
        raise "Tails should be right or left, not both" if @opts[:tails] == :both
        # Only the known option keys are applied, through their writers.
        self.tails = @opts[:tails]
        self.name = @opts[:name]
      end

      # The F statistic: var_num / var_den.
      def f
        @var_num.quo(@var_den)
      end

      # Same as #f.
      def to_f
        f
      end

      # Probability of the statistic for the configured tail, derived
      # from the F distribution cdf.
      def probability
        cdf = Distribution::F.cdf(f, @df_num, @df_den)
        p_using_cdf(cdf, tails)
      end

      # Writes a one-line summary to +builder+; degrees of freedom are
      # printed as integers only when both are Integer.
      def report_building(builder) #:nodoc:
        integer_dfs = @df_num.is_a?(Integer) && @df_den.is_a?(Integer)
        template = if integer_dfs
                     "%s : F(%d, %d) = %0.4f , p = %0.4f"
                   else
                     "%s : F(%0.2f, %0.2f) = %0.4f , p = %0.4f"
                   end
        builder.text template % [@name, @df_num, @df_den, f, probability]
      end
    end
  end
end
|
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
module Statsample
  module Test
    # == Kolmogorov-Smirnov's test of equality of distributions.
    class KolmogorovSmirnov
      # Largest absolute difference found between both distributions.
      attr_reader :d
      include Statsample::Test
      include Summarizable

      # Creates a new Kolmogorov-Smirnov test.
      # * +d1+ should have an each method; it is wrapped in an
      #   EmpiricDistribution.
      # * +d2+ could be a Distribution class, with a cdf method,
      #   a vector (anything with each) or a lambda.
      #
      # Raises if either argument does not match those requirements.
      def initialize(d1, d2)
        raise "First argument should have each method" unless d1.respond_to? :each
        @d1 = make_cdf(d1)
        if d2.respond_to?(:cdf) || d2.is_a?(Proc)
          @d2 = d2
        elsif d2.respond_to? :each
          @d2 = make_cdf(d2)
        else
          raise "Second argument should respond to cdf or each"
        end
        calculate
      end

      # Evaluates both CDFs at every value of the first sample and stores
      # the largest absolute difference (as a Float, or 0 if none exceeds
      # zero) in @d.
      def calculate
        max_gap = 0
        @d1.each do |x|
          f1 = @d1.cdf(x)
          f2 = @d2.is_a?(Proc) ? @d2.call(x) : @d2.cdf(x)
          gap = (f1 - f2).abs
          max_gap = gap.to_f if gap > max_gap
        end
        @d = max_gap
      end

      # Wraps +v+ in an EmpiricDistribution. For a Daru::Vector only the
      # valid (non-missing) data is used.
      def make_cdf(v)
        v.is_a?(Daru::Vector) ? EmpiricDistribution.new(v.only_valid.to_a) : EmpiricDistribution.new(v)
      end

      # Empirical distribution of a finite sample.
      class EmpiricDistribution
        # +data+ must respond to min, max, sort and size.
        def initialize(data)
          @min = data.min
          @max = data.max
          @data = data.sort
          @n = data.size
        end

        # Yields each observation in ascending order.
        def each
          @data.each { |x| yield x }
        end

        # Empirical CDF: fraction of observations lower than or equal
        # to +x+. Returns 0 below the minimum and 1 from the maximum on.
        def cdf(x)
          return 0 if x < @min
          return 1 if x >= @max
          # Count every observation <= x so repeated values are all
          # included; locating only the first match undercounts ties.
          @data.count { |value| value <= x }.quo(@n)
        end
      end # End EmpiricDistribution
    end
  end
end
|