statsample-ekatena 2.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +15 -0
- data/.travis.yml +23 -0
- data/CONTRIBUTING.md +17 -0
- data/Gemfile +2 -0
- data/History.txt +457 -0
- data/LICENSE.txt +12 -0
- data/README.md +175 -0
- data/Rakefile +44 -0
- data/benchmarks/correlation_matrix_15_variables.rb +32 -0
- data/benchmarks/correlation_matrix_5_variables.rb +33 -0
- data/benchmarks/correlation_matrix_methods/correlation_matrix.ds +0 -0
- data/benchmarks/correlation_matrix_methods/correlation_matrix.html +93 -0
- data/benchmarks/correlation_matrix_methods/correlation_matrix.rb +71 -0
- data/benchmarks/correlation_matrix_methods/correlation_matrix.xls +0 -0
- data/benchmarks/correlation_matrix_methods/correlation_matrix_gsl_ruby.ods +0 -0
- data/benchmarks/correlation_matrix_methods/correlation_matrix_with_graphics.ods +0 -0
- data/benchmarks/correlation_matrix_methods/results.ds +0 -0
- data/benchmarks/factor_map.rb +37 -0
- data/benchmarks/helpers_benchmark.rb +5 -0
- data/data/locale/es/LC_MESSAGES/statsample.mo +0 -0
- data/doc_latex/manual/equations.tex +78 -0
- data/examples/boxplot.rb +28 -0
- data/examples/chisquare_test.rb +23 -0
- data/examples/correlation_matrix.rb +32 -0
- data/examples/dataset.rb +30 -0
- data/examples/dominance_analysis.rb +33 -0
- data/examples/dominance_analysis_bootstrap.rb +32 -0
- data/examples/histogram.rb +26 -0
- data/examples/icc.rb +24 -0
- data/examples/levene.rb +29 -0
- data/examples/multiple_regression.rb +20 -0
- data/examples/multivariate_correlation.rb +33 -0
- data/examples/parallel_analysis.rb +40 -0
- data/examples/polychoric.rb +40 -0
- data/examples/principal_axis.rb +26 -0
- data/examples/reliability.rb +31 -0
- data/examples/scatterplot.rb +25 -0
- data/examples/t_test.rb +27 -0
- data/examples/tetrachoric.rb +17 -0
- data/examples/u_test.rb +24 -0
- data/examples/vector.rb +20 -0
- data/examples/velicer_map_test.rb +46 -0
- data/grab_references.rb +29 -0
- data/lib/spss.rb +134 -0
- data/lib/statsample-ekatena/analysis.rb +100 -0
- data/lib/statsample-ekatena/analysis/suite.rb +89 -0
- data/lib/statsample-ekatena/analysis/suitereportbuilder.rb +44 -0
- data/lib/statsample-ekatena/anova.rb +24 -0
- data/lib/statsample-ekatena/anova/contrast.rb +79 -0
- data/lib/statsample-ekatena/anova/oneway.rb +187 -0
- data/lib/statsample-ekatena/anova/twoway.rb +207 -0
- data/lib/statsample-ekatena/bivariate.rb +406 -0
- data/lib/statsample-ekatena/bivariate/pearson.rb +54 -0
- data/lib/statsample-ekatena/codification.rb +182 -0
- data/lib/statsample-ekatena/converter/csv.rb +28 -0
- data/lib/statsample-ekatena/converter/spss.rb +48 -0
- data/lib/statsample-ekatena/converters.rb +211 -0
- data/lib/statsample-ekatena/crosstab.rb +188 -0
- data/lib/statsample-ekatena/daru.rb +115 -0
- data/lib/statsample-ekatena/dataset.rb +10 -0
- data/lib/statsample-ekatena/dominanceanalysis.rb +425 -0
- data/lib/statsample-ekatena/dominanceanalysis/bootstrap.rb +232 -0
- data/lib/statsample-ekatena/factor.rb +104 -0
- data/lib/statsample-ekatena/factor/map.rb +124 -0
- data/lib/statsample-ekatena/factor/parallelanalysis.rb +166 -0
- data/lib/statsample-ekatena/factor/pca.rb +242 -0
- data/lib/statsample-ekatena/factor/principalaxis.rb +243 -0
- data/lib/statsample-ekatena/factor/rotation.rb +198 -0
- data/lib/statsample-ekatena/formula/fit_model.rb +46 -0
- data/lib/statsample-ekatena/formula/formula.rb +306 -0
- data/lib/statsample-ekatena/graph.rb +11 -0
- data/lib/statsample-ekatena/graph/boxplot.rb +236 -0
- data/lib/statsample-ekatena/graph/histogram.rb +198 -0
- data/lib/statsample-ekatena/graph/scatterplot.rb +213 -0
- data/lib/statsample-ekatena/histogram.rb +180 -0
- data/lib/statsample-ekatena/matrix.rb +329 -0
- data/lib/statsample-ekatena/multiset.rb +310 -0
- data/lib/statsample-ekatena/regression.rb +65 -0
- data/lib/statsample-ekatena/regression/multiple.rb +89 -0
- data/lib/statsample-ekatena/regression/multiple/alglibengine.rb +128 -0
- data/lib/statsample-ekatena/regression/multiple/baseengine.rb +251 -0
- data/lib/statsample-ekatena/regression/multiple/gslengine.rb +129 -0
- data/lib/statsample-ekatena/regression/multiple/matrixengine.rb +205 -0
- data/lib/statsample-ekatena/regression/multiple/rubyengine.rb +86 -0
- data/lib/statsample-ekatena/regression/simple.rb +121 -0
- data/lib/statsample-ekatena/reliability.rb +150 -0
- data/lib/statsample-ekatena/reliability/icc.rb +415 -0
- data/lib/statsample-ekatena/reliability/multiscaleanalysis.rb +181 -0
- data/lib/statsample-ekatena/reliability/scaleanalysis.rb +233 -0
- data/lib/statsample-ekatena/reliability/skillscaleanalysis.rb +114 -0
- data/lib/statsample-ekatena/resample.rb +15 -0
- data/lib/statsample-ekatena/shorthand.rb +125 -0
- data/lib/statsample-ekatena/srs.rb +169 -0
- data/lib/statsample-ekatena/test.rb +82 -0
- data/lib/statsample-ekatena/test/bartlettsphericity.rb +45 -0
- data/lib/statsample-ekatena/test/chisquare.rb +73 -0
- data/lib/statsample-ekatena/test/f.rb +52 -0
- data/lib/statsample-ekatena/test/kolmogorovsmirnov.rb +63 -0
- data/lib/statsample-ekatena/test/levene.rb +88 -0
- data/lib/statsample-ekatena/test/t.rb +309 -0
- data/lib/statsample-ekatena/test/umannwhitney.rb +208 -0
- data/lib/statsample-ekatena/test/wilcoxonsignedrank.rb +90 -0
- data/lib/statsample-ekatena/vector.rb +19 -0
- data/lib/statsample-ekatena/version.rb +3 -0
- data/lib/statsample.rb +282 -0
- data/po/es/statsample.mo +0 -0
- data/po/es/statsample.po +959 -0
- data/po/statsample.pot +947 -0
- data/references.txt +24 -0
- data/statsample-ekatena.gemspec +49 -0
- data/test/fixtures/bank2.dat +200 -0
- data/test/fixtures/correlation_matrix.rb +17 -0
- data/test/fixtures/df.csv +15 -0
- data/test/fixtures/hartman_23.matrix +9 -0
- data/test/fixtures/stock_data.csv +500 -0
- data/test/fixtures/tetmat_matrix.txt +5 -0
- data/test/fixtures/tetmat_test.txt +1001 -0
- data/test/helpers_tests.rb +83 -0
- data/test/test_analysis.rb +176 -0
- data/test/test_anova_contrast.rb +36 -0
- data/test/test_anovaoneway.rb +26 -0
- data/test/test_anovatwoway.rb +37 -0
- data/test/test_anovatwowaywithdataset.rb +47 -0
- data/test/test_anovawithvectors.rb +102 -0
- data/test/test_awesome_print_bug.rb +16 -0
- data/test/test_bartlettsphericity.rb +25 -0
- data/test/test_bivariate.rb +164 -0
- data/test/test_codification.rb +78 -0
- data/test/test_crosstab.rb +67 -0
- data/test/test_dominance_analysis.rb +39 -0
- data/test/test_factor.rb +228 -0
- data/test/test_factor_map.rb +38 -0
- data/test/test_factor_pa.rb +56 -0
- data/test/test_fit_model.rb +88 -0
- data/test/test_ggobi.rb +35 -0
- data/test/test_gsl.rb +15 -0
- data/test/test_histogram.rb +109 -0
- data/test/test_matrix.rb +48 -0
- data/test/test_multiset.rb +176 -0
- data/test/test_regression.rb +231 -0
- data/test/test_reliability.rb +223 -0
- data/test/test_reliability_icc.rb +198 -0
- data/test/test_reliability_skillscale.rb +57 -0
- data/test/test_resample.rb +24 -0
- data/test/test_srs.rb +9 -0
- data/test/test_statistics.rb +69 -0
- data/test/test_stest.rb +69 -0
- data/test/test_stratified.rb +17 -0
- data/test/test_test_f.rb +33 -0
- data/test/test_test_kolmogorovsmirnov.rb +34 -0
- data/test/test_test_t.rb +62 -0
- data/test/test_umannwhitney.rb +27 -0
- data/test/test_vector.rb +12 -0
- data/test/test_wilcoxonsignedrank.rb +64 -0
- metadata +570 -0
|
@@ -0,0 +1,310 @@
|
|
|
1
|
+
module Statsample
|
|
2
|
+
# Multiset joins multiple datasets with the same fields and vectors
|
|
3
|
+
# but with different number of cases.
|
|
4
|
+
# This is the base class for stratified and cluster sampling estimation
|
|
5
|
+
class Multiset
  # Name of fields
  attr_reader :fields
  # Hash of datasets (Daru::DataFrame), keyed by dataset name
  attr_reader :datasets

  # To create a multiset
  # * Multiset.new(%w{f1 f2 f3}) # define only fields
  def initialize(fields)
    @fields = fields
    @datasets = {}
  end

  # Builds a multiset holding one empty Daru::DataFrame (ordered by
  # +fields+) for every name in +ds_names+.
  def self.new_empty_vectors(fields, ds_names)
    ms = Multiset.new(fields)
    ds_names.each do |d|
      ms.add_dataset(d, Daru::DataFrame.new({}, order: fields))
    end

    ms
  end

  # Generate a new dataset as a union of partial dataset
  # If block given, this is applied to each dataset before union
  # (the block receives the dataset name and a +dup+ of the dataset).
  # Datasets without rows are skipped, because iteration goes through #each.
  def union(&block)
    # Seed every field with an empty array so a multiset whose datasets are
    # all empty still yields empty vectors instead of handing nil to Daru.
    union_field = @fields.each_with_object({}) { |f, acc| acc[f] = [] }
    names = {}
    each do |k, ds|
      if block
        ds = ds.dup
        yield k, ds
      end
      @fields.each do |f|
        union_field[f].concat(ds[f].to_a)
        # The first dataset that has the field decides the vector name.
        names[f] ||= ds[f].name
      end
    end

    @fields.each do |f|
      union_field[f] = Daru::Vector.new(union_field[f], name: names[f])
    end

    Daru::DataFrame.new(union_field, order: @fields)
  end

  # Sorted list of dataset names
  def datasets_names
    @datasets.keys.sort
  end

  # Number of datasets stored, empty ones included
  def n_datasets
    @datasets.size
  end

  # Stores +ds+ under +key+.
  # Raises ArgumentError unless the dataset vectors are exactly the
  # multiset fields (same names, same order).
  def add_dataset(key, ds)
    if ds.vectors.to_a != @fields
      raise ArgumentError, "Dataset(#{ds.vectors.to_a.to_s})must have the same fields of the Multiset(#{@fields})"
    else
      @datasets[key] = ds
    end
  end

  # Yields (dataset name, vector for +field+) for every dataset and
  # returns the sum of the block results, starting from 0.
  def sum_field(field)
    @datasets.inject(0) do |acc, (stratum_name, ds)|
      acc + yield(stratum_name, ds[field])
    end
  end

  # Yields (dataset name, vector for +field+) for every dataset and
  # returns an array with the block results.
  def collect_vector(field)
    @datasets.collect { |k, v| yield k, v[field] }
  end

  # Yields (dataset name, vector for +field+) for every dataset.
  def each_vector(field)
    @datasets.each { |k, v| yield k, v[field] }
  end

  # Dataset stored under name +i+ (nil when absent)
  def [](i)
    @datasets[i]
  end

  # Yields (name, dataset) for every dataset that has at least one row.
  # Without a block an Enumerator over the same pairs is returned.
  def each(&block)
    return enum_for(:each) unless block

    @datasets.each do |k, ds|
      next if ds.nrows == 0
      block.call(k, ds)
    end
  end
end
|
|
95
|
+
# Estimators for stratified samples built on top of a Statsample::Multiset.
#
# The class-level helpers receive +es+, an array with one hash per stratum:
# * 'N': stratum size (population)
# * 'n': sample size taken from the stratum
# * 's': sample standard deviation (mean estimators)
# * 'p': sample proportion (proportion estimators)
class StratifiedSample
  class << self
    # mean for an array of vectors
    # Each vector only needs to answer +size+ and +sum+.
    def mean(*vectors)
      n_total = 0
      total = vectors.inject(0) do |acc, v|
        n_total += v.size
        acc + v.sum
      end
      total.to_f / n_total
    end

    # (1/N) * sqrt(sum(N_h^2 * s_h^2 / n_h))
    def standard_error_ksd_wr(es)
      n_total = 0
      sum = es.inject(0) do |acc, h|
        n_total += h['N']
        acc + ((h['N']**2 * h['s']**2) / h['n'].to_f)
      end
      (1.to_f / n_total) * Math::sqrt(sum)
    end

    # Square of standard_error_ksd_wr
    def variance_ksd_wr(es)
      standard_error_ksd_wr(es)**2
    end

    # Sum of the strata sizes ('N') of +es+
    def calculate_n_total(es)
      es.inject(0) { |acc, h| acc + h['N'] }
    end

    # Source : Cochran (1972)
    # sum((N_h/N)^2 * (s_h^2 / n_h) * (1 - n_h/N_h))
    def variance_ksd_wor(es)
      n_total = calculate_n_total(es)
      es.inject(0) do |acc, h|
        val = ((h['N'].to_f / n_total)**2) * (h['s']**2 / h['n'].to_f) * (1 - (h['n'].to_f / h['N']))
        acc + val
      end
    end

    # Square root of variance_ksd_wor
    def standard_error_ksd_wor(es)
      Math::sqrt(variance_ksd_wor(es))
    end

    # (1/N^2) * sum(N_h * (N_h - n_h) * s_h^2 / n_h)
    def variance_esd_wor(es)
      n_total = calculate_n_total(es)
      sum = es.inject(0) do |acc, h|
        val = h['N'] * (h['N'] - h['n']) * (h['s']**2 / h['n'].to_f)
        acc + val
      end
      (1.0 / (n_total**2)) * sum
    end

    # Square root of variance_esd_wor.
    # variance_esd_wor and variance_ksd_wor are algebraically the same
    # expression, so this differs from the ksd flavour only by rounding.
    def standard_error_esd_wor(es)
      Math::sqrt(variance_esd_wor(es))
    end

    # Based on http://stattrek.com/Lesson6/STRAnalysis.aspx
    # (1/N^2) * sum(s_h^2 * N_h^2 / n_h)
    def variance_esd_wr(es)
      n_total = calculate_n_total(es)
      sum = es.inject(0) do |acc, h|
        val = ((h['s']**2 * h['N']**2) / h['n'].to_f)
        acc + val
      end
      (1.0 / (n_total**2)) * sum
    end

    # Square root of variance_esd_wr
    def standard_error_esd_wr(es)
      Math::sqrt(variance_esd_wr(es))
    end

    # sum((N_h/N)^2 * p_h * (1 - p_h) / n_h * (1 - n_h/N_h))
    def proportion_variance_ksd_wor(es)
      n_total = calculate_n_total(es)
      es.inject(0) do |acc, h|
        val = (((h['N'].to_f / n_total)**2 * h['p'] * (1 - h['p'])) / (h['n'])) * (1 - (h['n'].to_f / h['N']))
        acc + val
      end
    end

    # Square root of proportion_variance_ksd_wor
    def proportion_sd_ksd_wor(es)
      Math::sqrt(proportion_variance_ksd_wor(es))
    end

    # (1/N) * sqrt(sum(N_h^2 * p_h * (1 - p_h) / n_h))
    def proportion_sd_ksd_wr(es)
      n_total = calculate_n_total(es)
      sum = es.inject(0) do |acc, h|
        val = (h['N']**2 * h['p'] * (1 - h['p'])) / h['n'].to_f
        acc + val
      end
      Math::sqrt(sum) * (1.0 / n_total)
    end

    # Square of proportion_sd_ksd_wr.
    # (Previously squared the *without replacement* variance by mistake.)
    def proportion_variance_ksd_wr(es)
      proportion_sd_ksd_wr(es)**2
    end

    # (1/N^2) * sum(N_h^2 * (N_h - n_h) / (N_h - 1) * p_h * (1 - p_h) / (n_h - 1))
    # Same per-stratum term as the instance method variance_pst.
    # NOTE(review): a stratum with n_h == 1 or N_h == 1 makes the
    # denominator zero - confirm callers never pass such strata.
    def proportion_variance_esd_wor(es)
      n_total = calculate_n_total(es)

      sum = es.inject(0) do |acc, h|
        val = (h['N']**2 * (h['N'] - h['n']) * h['p'] * (1.0 - h['p'])) / ((h['n'] - 1) * (h['N'] - 1))
        acc + val
      end
      (1.0 / n_total**2) * sum
    end

    # Square root of proportion_variance_esd_wor
    def proportion_sd_esd_wor(es)
      Math::sqrt(proportion_variance_esd_wor(es))
    end
  end

  # Number of strata
  attr_reader :strata_number
  # Population size. Equal to sum of strata sizes
  # Symbol: N<sub>h</sub>
  attr_reader :population_size
  # Sample size. Equal to sum of sample of each stratum
  attr_reader :sample_size

  # * ms: a Statsample::Multiset with one dataset per stratum
  # * strata_sizes: hash {dataset name => population size of the stratum}
  #
  # Raises TypeError when +ms+ is not a Multiset and ArgumentError when
  # the keys of +strata_sizes+ do not match the dataset names.
  def initialize(ms, strata_sizes)
    raise TypeError, "ms should be a Multiset" unless ms.is_a? Statsample::Multiset
    @ms = ms
    raise ArgumentError, "You should put a strata size for each dataset" if strata_sizes.keys.sort != ms.datasets_names
    @strata_sizes = strata_sizes
    @population_size = @strata_sizes.inject(0) { |acc, (_name, size)| acc + size }
    @strata_number = @ms.n_datasets
    @sample_size = @ms.datasets.inject(0) { |acc, (_name, ds)| acc + ds.nrows }
  end

  # Size of stratum x
  def stratum_size(h)
    @strata_sizes[h]
  end

  # Array with the vector +field+ of every dataset
  def vectors_by_field(field)
    @ms.datasets.collect { |_name, ds| ds[field] }
  end

  # Population proportion based on strata
  def proportion(field, v=1)
    @ms.sum_field(field) do |s_name, vector|
      stratum_ponderation(s_name) * vector.proportion(v)
    end
  end

  # Stratum ponderation.
  # Symbol: W\<sub>h\</sub>
  def stratum_ponderation(h)
    @strata_sizes[h].to_f / @population_size
  end
  alias_method :wh, :stratum_ponderation

  # Population mean based on strata
  def mean(field)
    @ms.sum_field(field) do |s_name, vector|
      stratum_ponderation(s_name) * vector.mean
    end
  end

  # Standard error with estimated population variance and without replacement.
  # Source: Cochran (1972)
  def standard_error_wor(field)
    es = @ms.collect_vector(field) do |s_n, vector|
      {'N'=>@strata_sizes[s_n], 'n'=>vector.size, 's'=>vector.sds}
    end

    StratifiedSample.standard_error_esd_wor(es)
  end

  # Standard error with estimated population variance and without replacement.
  # Source: http://stattrek.com/Lesson6/STRAnalysis.aspx
  def standard_error_wor_2(field)
    sum = @ms.sum_field(field) do |s_name, vector|
      s_size = @strata_sizes[s_name]
      (s_size**2 * (1 - (vector.size.to_f / s_size)) * vector.variance_sample / vector.size.to_f)
    end
    (1 / @population_size.to_f) * Math::sqrt(sum)
  end

  # Standard error with estimated population variance, with replacement.
  def standard_error_wr(field)
    es = @ms.collect_vector(field) do |s_n, vector|
      {'N'=>@strata_sizes[s_n], 'n'=>vector.size, 's'=>vector.sds}
    end

    StratifiedSample.standard_error_esd_wr(es)
  end

  # Standard deviation of the proportion of value +v+ on +field+,
  # delegated to StratifiedSample.proportion_sd_esd_wor.
  def proportion_sd_esd_wor(field, v=1)
    es = @ms.collect_vector(field) do |s_n, vector|
      {'N'=>@strata_sizes[s_n], 'n'=>vector.size, 'p'=>vector.proportion(v)}
    end

    StratifiedSample.proportion_sd_esd_wor(es)
  end

  # Standard error of the proportion of value +v+ on +field+.
  # NOTE(review): every stratum uses the overall stratified proportion,
  # not its own - kept as in the original, confirm against the reference.
  def proportion_standard_error(field, v=1)
    prop = proportion(field, v)
    sum = @ms.sum_field(field) do |s_name, vector|
      nh = vector.size
      s_size = @strata_sizes[s_name]
      # Float division: with integers nh / s_size truncated to 0 and the
      # finite population correction was silently dropped.
      (s_size**2 * (1 - (nh.to_f / s_size)) * prop * (1 - prop) / (nh - 1))
    end
    (1.quo(@population_size)) * Math::sqrt(sum)
  end

  # Cochran(1971), p. 150
  def variance_pst(field, v=1)
    sum = @ms.datasets.inject(0) do |acc, (stratum_name, ds)|
      # nrows is the row count accessor used everywhere else in this file.
      nh = ds.nrows.to_f
      s_size = @strata_sizes[stratum_name]
      prop = ds[field].proportion(v)
      acc + (((s_size**2 * (s_size - nh)) / (s_size - 1)) * (prop * (1 - prop) / (nh - 1)))
    end
    (1 / @population_size.to_f ** 2) * sum
  end
end
|
|
310
|
+
end
|
|
@@ -0,0 +1,65 @@
|
|
|
1
|
+
require 'statsample/regression/simple'
|
|
2
|
+
require 'statsample/regression/multiple'
|
|
3
|
+
|
|
4
|
+
require 'statsample/regression/multiple/matrixengine'
|
|
5
|
+
require 'statsample/regression/multiple/rubyengine'
|
|
6
|
+
require 'statsample/regression/multiple/gslengine'
|
|
7
|
+
|
|
8
|
+
module Statsample
|
|
9
|
+
# = Module for regression procedures.
|
|
10
|
+
# Use the method on this class to generate
|
|
11
|
+
# analysis.
|
|
12
|
+
# If you need more control, you can
|
|
13
|
+
# create and control directly the objects who computes
|
|
14
|
+
# the regressions.
|
|
15
|
+
#
|
|
16
|
+
# * Simple Regression : Statsample::Regression::Simple
|
|
17
|
+
# * Multiple Regression: Statsample::Regression::Multiple
|
|
18
|
+
module Regression

  # Exception class exposed by the regression module.
  # NOTE(review): it inherits from Exception, not StandardError, so a bare
  # `rescue` does not catch it. Kept unchanged for backward compatibility.
  LinearDependency=Class.new(Exception)

  # Create a Statsample::Regression::Simple object, for simple regression
  # * x: independent Vector
  # * y: dependent Vector
  # <b>Usage:</b>
  #   x = Daru::Vector.new(100.times.collect {|i| rand(100)})
  #   y = Daru::Vector.new(100.times.collect {|i| 2+x[i]*2+rand()})
  #   sr=Statsample::Regression.simple(x,y)
  #   sr.a
  #   => 2.51763295177808
  #   sr.b
  #   => 1.99973746599856
  #   sr.r
  #   => 0.999987881153254
  def self.simple(x, y)
    Statsample::Regression::Simple.new_from_vectors(x, y)
  end

  # Creates one of the Statsample::Regression::Multiple object,
  # for OLS multiple regression.
  # Parameters:
  # * <tt>ds</tt>: Dataset.
  # * y: Name of dependent variable.
  # * opts: A hash with options
  #   * missing_data: Could be
  #     * :listwise: delete cases with one or more empty data (default).
  #     * :pairwise: uses correlation matrix. Use with caution.
  #
  # The caller's +opts+ hash is not modified: the :missing_data entry is
  # stripped from a copy before the remaining options reach the engine.
  #
  # <b>Usage:</b>
  #   lr=Statsample::Regression::multiple(ds,:y)
  def self.multiple(ds, y_var, opts=Hash.new)
    engine_opts = opts.dup
    missing_data = opts[:missing_data].nil? ? :listwise : engine_opts.delete(:missing_data)

    # The GslEngine branch was permanently disabled in the original
    # (`Statsample.has_gsl? and false`), so RubyEngine is always used.
    # Listwise handling drops the rows holding missing values first.
    source = if missing_data == :pairwise
               ds
             else
               ds.reject_values(*Daru::MISSING_VALUES)
             end
    Statsample::Regression::Multiple::RubyEngine.new(source, y_var, engine_opts)
  end
end
|
|
65
|
+
end
|
|
@@ -0,0 +1,89 @@
|
|
|
1
|
+
require 'statsample/regression/multiple/baseengine'
|
|
2
|
+
module Statsample
|
|
3
|
+
module Regression
|
|
4
|
+
# Module for OLS Multiple Regression Analysis.
|
|
5
|
+
#
|
|
6
|
+
# Usage:
|
|
7
|
+
#
|
|
8
|
+
# require 'statsample'
|
|
9
|
+
# a = Daru::Vector.new(1000.times.collect {rand})
|
|
10
|
+
# b = Daru::Vector.new(1000.times.collect {rand})
|
|
11
|
+
# c = Daru::Vector.new(1000.times.collect {rand})
|
|
12
|
+
# ds= Daru::DataFrame.new({:a => a,:b => b,:c => c})
|
|
13
|
+
# ds[:y]=ds.collect{|row| row[:a]*5 + row[:b]*3 + row[:c]*2 + rand()}
|
|
14
|
+
# lr=Statsample::Regression.multiple(ds, :y)
|
|
15
|
+
# puts lr.summary
|
|
16
|
+
# Summary for regression of a,b,c over y
|
|
17
|
+
# *************************************************************
|
|
18
|
+
# Engine: Statsample::Regression::Multiple::AlglibEngine
|
|
19
|
+
# Cases(listwise)=1000(1000)
|
|
20
|
+
# r=0.986
|
|
21
|
+
# r2=0.973
|
|
22
|
+
# Equation=0.504+5.011a + 2.995b + 1.988c
|
|
23
|
+
# ----------------------------
|
|
24
|
+
# ANOVA TABLE
|
|
25
|
+
# --------------------------------------------------------------
|
|
26
|
+
# | source | ss | df | ms | f | s |
|
|
27
|
+
# --------------------------------------------------------------
|
|
28
|
+
# | Regression | 2979.321 | 3 | 993.107 | 12040.067 | 0.000 |
|
|
29
|
+
# | Error | 82.154 | 996 | 0.082 | | |
|
|
30
|
+
# | Total | 3061.475 | 999 | | | |
|
|
31
|
+
# --------------------------------------------------------------
|
|
32
|
+
# Beta coefficientes
|
|
33
|
+
# -----------------------------------------------
|
|
34
|
+
# | coeff | b | beta | se | t |
|
|
35
|
+
# -----------------------------------------------
|
|
36
|
+
# | Constant | 0.504 | - | 0.030 | 16.968 |
|
|
37
|
+
# | a | 5.011 | 0.832 | 0.031 | 159.486 |
|
|
38
|
+
# | b | 2.995 | 0.492 | 0.032 | 94.367 |
|
|
39
|
+
# | c | 1.988 | 0.323 | 0.032 | 62.132 |
|
|
40
|
+
# -----------------------------------------------
|
|
41
|
+
#
|
|
42
|
+
module Multiple
  # Obtain r2 for regressors.
  # Returns the [0,0] cell of rxy' * rxx^-1 * rxy.
  def self.r2_from_matrices(rxx, rxy)
    product = rxy.transpose * rxx.inverse * rxy
    product[0, 0]
  end

  # Measures of association between a set of predictors and several
  # dependent variables, computed from a matrix that is extended with
  # Statsample::CovariateMatrix.
  class MultipleDependent
    # * matrix: matrix holding predictors and dependent variables
    # * y_var: names of the dependent variables (subtracted from
    #   matrix.fields to obtain the predictors)
    # * opts: accepted but not read by this constructor
    def initialize(matrix, y_var, opts=Hash.new)
      matrix.extend Statsample::CovariateMatrix
      @matrix = matrix
      @fields = matrix.fields - y_var
      @y_var = y_var
      @q = @y_var.size

      # Correlation matrix and its predictor / dependent blocks
      @matrix_cor = matrix.correlation
      @matrix_cor_xx = @matrix_cor.submatrix(@fields)
      @matrix_cor_yy = @matrix_cor.submatrix(y_var, y_var)

      # Blocks of the original matrix
      @sxx = @matrix.submatrix(@fields)
      @syy = @matrix.submatrix(y_var, y_var)
      @sxy = @matrix.submatrix(@fields, y_var)
      @syx = @sxy.t
    end

    # Always 0.0
    def significance
      0.0
    end

    # 1 - |R| / (|Ryy| * |Rxx|), using the correlation matrices
    def r2yx
      numerator = @matrix_cor.determinant
      denominator = @matrix_cor_yy.determinant * @matrix_cor_xx.determinant
      1 - numerator.quo(denominator)
    end

    # Residual covariance of Y after accounting for its linear relation with X
    def syyx
      explained = @syx * @sxx.inverse * @sxy
      @syy - explained
    end

    # 1 - |Syy.x| / |Syy|
    def r2yx_covariance
      ratio = syyx.determinant.quo(@syy.determinant)
      1 - ratio
    end

    # q - trace(Syy^-1 * Syy.x), where q is the number of dependent variables
    def vxy
      residual_share = (@syy.inverse * syyx).trace
      @q - residual_share
    end

    # vxy divided by the number of dependent variables
    def p2yx
      vxy.quo(@q)
    end
  end
end
|
|
88
|
+
end
|
|
89
|
+
end
|