statsample-ekatena 2.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +15 -0
- data/.travis.yml +23 -0
- data/CONTRIBUTING.md +17 -0
- data/Gemfile +2 -0
- data/History.txt +457 -0
- data/LICENSE.txt +12 -0
- data/README.md +175 -0
- data/Rakefile +44 -0
- data/benchmarks/correlation_matrix_15_variables.rb +32 -0
- data/benchmarks/correlation_matrix_5_variables.rb +33 -0
- data/benchmarks/correlation_matrix_methods/correlation_matrix.ds +0 -0
- data/benchmarks/correlation_matrix_methods/correlation_matrix.html +93 -0
- data/benchmarks/correlation_matrix_methods/correlation_matrix.rb +71 -0
- data/benchmarks/correlation_matrix_methods/correlation_matrix.xls +0 -0
- data/benchmarks/correlation_matrix_methods/correlation_matrix_gsl_ruby.ods +0 -0
- data/benchmarks/correlation_matrix_methods/correlation_matrix_with_graphics.ods +0 -0
- data/benchmarks/correlation_matrix_methods/results.ds +0 -0
- data/benchmarks/factor_map.rb +37 -0
- data/benchmarks/helpers_benchmark.rb +5 -0
- data/data/locale/es/LC_MESSAGES/statsample.mo +0 -0
- data/doc_latex/manual/equations.tex +78 -0
- data/examples/boxplot.rb +28 -0
- data/examples/chisquare_test.rb +23 -0
- data/examples/correlation_matrix.rb +32 -0
- data/examples/dataset.rb +30 -0
- data/examples/dominance_analysis.rb +33 -0
- data/examples/dominance_analysis_bootstrap.rb +32 -0
- data/examples/histogram.rb +26 -0
- data/examples/icc.rb +24 -0
- data/examples/levene.rb +29 -0
- data/examples/multiple_regression.rb +20 -0
- data/examples/multivariate_correlation.rb +33 -0
- data/examples/parallel_analysis.rb +40 -0
- data/examples/polychoric.rb +40 -0
- data/examples/principal_axis.rb +26 -0
- data/examples/reliability.rb +31 -0
- data/examples/scatterplot.rb +25 -0
- data/examples/t_test.rb +27 -0
- data/examples/tetrachoric.rb +17 -0
- data/examples/u_test.rb +24 -0
- data/examples/vector.rb +20 -0
- data/examples/velicer_map_test.rb +46 -0
- data/grab_references.rb +29 -0
- data/lib/spss.rb +134 -0
- data/lib/statsample-ekatena/analysis.rb +100 -0
- data/lib/statsample-ekatena/analysis/suite.rb +89 -0
- data/lib/statsample-ekatena/analysis/suitereportbuilder.rb +44 -0
- data/lib/statsample-ekatena/anova.rb +24 -0
- data/lib/statsample-ekatena/anova/contrast.rb +79 -0
- data/lib/statsample-ekatena/anova/oneway.rb +187 -0
- data/lib/statsample-ekatena/anova/twoway.rb +207 -0
- data/lib/statsample-ekatena/bivariate.rb +406 -0
- data/lib/statsample-ekatena/bivariate/pearson.rb +54 -0
- data/lib/statsample-ekatena/codification.rb +182 -0
- data/lib/statsample-ekatena/converter/csv.rb +28 -0
- data/lib/statsample-ekatena/converter/spss.rb +48 -0
- data/lib/statsample-ekatena/converters.rb +211 -0
- data/lib/statsample-ekatena/crosstab.rb +188 -0
- data/lib/statsample-ekatena/daru.rb +115 -0
- data/lib/statsample-ekatena/dataset.rb +10 -0
- data/lib/statsample-ekatena/dominanceanalysis.rb +425 -0
- data/lib/statsample-ekatena/dominanceanalysis/bootstrap.rb +232 -0
- data/lib/statsample-ekatena/factor.rb +104 -0
- data/lib/statsample-ekatena/factor/map.rb +124 -0
- data/lib/statsample-ekatena/factor/parallelanalysis.rb +166 -0
- data/lib/statsample-ekatena/factor/pca.rb +242 -0
- data/lib/statsample-ekatena/factor/principalaxis.rb +243 -0
- data/lib/statsample-ekatena/factor/rotation.rb +198 -0
- data/lib/statsample-ekatena/formula/fit_model.rb +46 -0
- data/lib/statsample-ekatena/formula/formula.rb +306 -0
- data/lib/statsample-ekatena/graph.rb +11 -0
- data/lib/statsample-ekatena/graph/boxplot.rb +236 -0
- data/lib/statsample-ekatena/graph/histogram.rb +198 -0
- data/lib/statsample-ekatena/graph/scatterplot.rb +213 -0
- data/lib/statsample-ekatena/histogram.rb +180 -0
- data/lib/statsample-ekatena/matrix.rb +329 -0
- data/lib/statsample-ekatena/multiset.rb +310 -0
- data/lib/statsample-ekatena/regression.rb +65 -0
- data/lib/statsample-ekatena/regression/multiple.rb +89 -0
- data/lib/statsample-ekatena/regression/multiple/alglibengine.rb +128 -0
- data/lib/statsample-ekatena/regression/multiple/baseengine.rb +251 -0
- data/lib/statsample-ekatena/regression/multiple/gslengine.rb +129 -0
- data/lib/statsample-ekatena/regression/multiple/matrixengine.rb +205 -0
- data/lib/statsample-ekatena/regression/multiple/rubyengine.rb +86 -0
- data/lib/statsample-ekatena/regression/simple.rb +121 -0
- data/lib/statsample-ekatena/reliability.rb +150 -0
- data/lib/statsample-ekatena/reliability/icc.rb +415 -0
- data/lib/statsample-ekatena/reliability/multiscaleanalysis.rb +181 -0
- data/lib/statsample-ekatena/reliability/scaleanalysis.rb +233 -0
- data/lib/statsample-ekatena/reliability/skillscaleanalysis.rb +114 -0
- data/lib/statsample-ekatena/resample.rb +15 -0
- data/lib/statsample-ekatena/shorthand.rb +125 -0
- data/lib/statsample-ekatena/srs.rb +169 -0
- data/lib/statsample-ekatena/test.rb +82 -0
- data/lib/statsample-ekatena/test/bartlettsphericity.rb +45 -0
- data/lib/statsample-ekatena/test/chisquare.rb +73 -0
- data/lib/statsample-ekatena/test/f.rb +52 -0
- data/lib/statsample-ekatena/test/kolmogorovsmirnov.rb +63 -0
- data/lib/statsample-ekatena/test/levene.rb +88 -0
- data/lib/statsample-ekatena/test/t.rb +309 -0
- data/lib/statsample-ekatena/test/umannwhitney.rb +208 -0
- data/lib/statsample-ekatena/test/wilcoxonsignedrank.rb +90 -0
- data/lib/statsample-ekatena/vector.rb +19 -0
- data/lib/statsample-ekatena/version.rb +3 -0
- data/lib/statsample.rb +282 -0
- data/po/es/statsample.mo +0 -0
- data/po/es/statsample.po +959 -0
- data/po/statsample.pot +947 -0
- data/references.txt +24 -0
- data/statsample-ekatena.gemspec +49 -0
- data/test/fixtures/bank2.dat +200 -0
- data/test/fixtures/correlation_matrix.rb +17 -0
- data/test/fixtures/df.csv +15 -0
- data/test/fixtures/hartman_23.matrix +9 -0
- data/test/fixtures/stock_data.csv +500 -0
- data/test/fixtures/tetmat_matrix.txt +5 -0
- data/test/fixtures/tetmat_test.txt +1001 -0
- data/test/helpers_tests.rb +83 -0
- data/test/test_analysis.rb +176 -0
- data/test/test_anova_contrast.rb +36 -0
- data/test/test_anovaoneway.rb +26 -0
- data/test/test_anovatwoway.rb +37 -0
- data/test/test_anovatwowaywithdataset.rb +47 -0
- data/test/test_anovawithvectors.rb +102 -0
- data/test/test_awesome_print_bug.rb +16 -0
- data/test/test_bartlettsphericity.rb +25 -0
- data/test/test_bivariate.rb +164 -0
- data/test/test_codification.rb +78 -0
- data/test/test_crosstab.rb +67 -0
- data/test/test_dominance_analysis.rb +39 -0
- data/test/test_factor.rb +228 -0
- data/test/test_factor_map.rb +38 -0
- data/test/test_factor_pa.rb +56 -0
- data/test/test_fit_model.rb +88 -0
- data/test/test_ggobi.rb +35 -0
- data/test/test_gsl.rb +15 -0
- data/test/test_histogram.rb +109 -0
- data/test/test_matrix.rb +48 -0
- data/test/test_multiset.rb +176 -0
- data/test/test_regression.rb +231 -0
- data/test/test_reliability.rb +223 -0
- data/test/test_reliability_icc.rb +198 -0
- data/test/test_reliability_skillscale.rb +57 -0
- data/test/test_resample.rb +24 -0
- data/test/test_srs.rb +9 -0
- data/test/test_statistics.rb +69 -0
- data/test/test_stest.rb +69 -0
- data/test/test_stratified.rb +17 -0
- data/test/test_test_f.rb +33 -0
- data/test/test_test_kolmogorovsmirnov.rb +34 -0
- data/test/test_test_t.rb +62 -0
- data/test/test_umannwhitney.rb +27 -0
- data/test/test_vector.rb +12 -0
- data/test/test_wilcoxonsignedrank.rb +64 -0
- metadata +570 -0
|
@@ -0,0 +1,188 @@
|
|
|
1
|
+
module Statsample
|
|
2
|
+
# Class to create crosstab of data
|
|
3
|
+
# With this, you can create reports and do chi square test
|
|
4
|
+
# The first vector will be at the rows and the second will be at the columns
|
|
5
|
+
#
|
|
6
|
+
class Crosstab
  include Summarizable

  # Row and column vectors, as returned by Statsample.only_valid_clone.
  attr_reader :v_rows, :v_cols
  # Labels, report name and percentage-table switches. Every one of them can
  # also be given as a key of the +opts+ hash passed to #initialize.
  attr_accessor :row_label, :column_label, :name, :percentage_row, :percentage_column, :percentage_total

  # Creates a crosstab of +v1+ (rows) against +v2+ (columns).
  #
  # * v1, v2: objects responding to #size that Daru::Vector.new accepts.
  #   If they respond to #name, the names are used as default labels.
  # * opts: hash of attribute => value; only keys with a public writer on
  #   this object are applied, the rest are ignored.
  #
  # Raises ArgumentError when both inputs differ in size.
  def initialize(v1, v2, opts=Hash.new)
    raise ArgumentError, "Vectors should be the same size" unless v1.size==v2.size
    @v_rows, @v_cols = Statsample.only_valid_clone(
      Daru::Vector.new(v1),
      Daru::Vector.new(v2))
    @cases = @v_rows.size
    # Inputs are wrapped with Daru::Vector.new above, so objects without
    # #name (e.g. plain arrays) must not break label detection.
    @row_label = v1.respond_to?(:name) ? v1.name : nil
    @column_label = v2.respond_to?(:name) ? v2.name : nil
    @name = nil
    @percentage_row = @percentage_column = @percentage_total=false
    opts.each do |k,v|
      # Check the writer, not the reader: read-only attributes such as
      # :v_rows answer to the reader but have no "v_rows=" method.
      setter = "#{k}="
      public_send(setter, v) if respond_to?(setter)
    end
    @name ||= _("Crosstab %s - %s") % [@row_label, @column_label]
  end

  # Sorted factors of the row vector, with the index reset.
  def rows_names
    @v_rows.factors.sort.reset_index!
  end

  # Sorted factors of the column vector, with the index reset.
  def cols_names
    @v_cols.factors.sort.reset_index!
  end

  # Frequencies of the row vector (row marginals).
  def rows_total
    @v_rows.frequencies
  end

  # Frequencies of the column vector (column marginals).
  def cols_total
    @v_cols.frequencies
  end

  # Hash of [row, col] => count. Every combination of row and column names is
  # present, with 0 for the combinations that never occur.
  def frequencies
    cn = cols_names
    base = rows_names.inject({}) do |acc,row|
      cn.each { |col| acc[[row,col]] = 0 }
      acc
    end
    base.update(Daru::Vector.new(Statsample::vector_cols_matrix(@v_rows,@v_cols).to_a).frequencies)
  end

  # Observed counts as a Matrix (rows_names x cols_names).
  def to_matrix
    f = frequencies
    rn = rows_names
    cn = cols_names
    Matrix.rows(rn.collect{|row|
      cn.collect{|col| f[[row,col]]}
    })
  end

  # Nested hash row => { col => count }.
  def frequencies_by_row
    f=frequencies
    cn=cols_names
    rows_names.inject({}){|sr,row|
      sr[row]=cn.inject({}) {|sc,col| sc[col]=f[[row,col]]; sc}
      sr
    }
  end

  # Nested hash col => { row => count }.
  def frequencies_by_col
    f=frequencies
    rn=rows_names
    cols_names.inject({}){|sc,col|
      sc[col]=rn.inject({}) {|sr,row| sr[row]=f[[row,col]]; sr}
      sc
    }
  end

  # Chi square, based on expected and real matrix
  def chi_square
    require 'statsample/test'
    Statsample::Test.chi_square(self.to_matrix, matrix_expected)
  end

  # Matrix of expected counts: (row total * column total) / number of cases.
  # Useful to obtain chi square
  def matrix_expected
    rn=rows_names
    cn=cols_names
    rt=rows_total
    ct=cols_total
    t=@v_rows.size
    m=rn.collect{|row|
      cn.collect{|col|
        (rt[row]*ct[col]).quo(t)
      }
    }
    Matrix.rows(m)
  end

  # Hash with every column name mapped to 0; used as an accumulator.
  def cols_empty_hash
    cols_names.inject({}) {|a,x| a[x]=0;a}
  end

  # Adds to +builder+ a section with the chi square result, the raw
  # frequency table and every percentage table that was switched on.
  def report_building(builder)
    builder.section(:name=>@name) do |generator|
      fq=frequencies
      rn=rows_names
      cn=cols_names
      total=0
      total_cols=cols_empty_hash
      generator.text "Chi Square: #{chi_square}"
      generator.text(_("Rows: %s") % @row_label) unless @row_label.nil?
      generator.text(_("Columns: %s") % @column_label) unless @column_label.nil?

      t=ReportBuilder::Table.new(:name=>@name+" - "+_("Raw"), :header=>[""]+cn.collect {|c| @v_cols.index_of(c)}+[_("Total")])
      rn.each do |row|
        total_row=0
        t_row=[@v_rows.index_of(row)]
        cn.each do |col|
          data=fq[[row,col]]
          total_row+=data
          total+=data
          total_cols[col]+=data
          t_row.push(data)
        end
        t_row.push(total_row)
        t.row(t_row)
      end
      t.hr
      t_row=[_("Total")]
      cn.each do |v|
        t_row.push(total_cols[v])
      end
      t_row.push(total)
      t.row(t_row)
      generator.parse_element(t)

      table_percentage(generator,:row) if @percentage_row
      table_percentage(generator,:column) if @percentage_column
      table_percentage(generator,:total) if @percentage_total
    end
  end

  # Adds to +generator+ a table of percentages.
  #
  # +type+ selects the denominator of each cell: the row total (:row), the
  # column total (:column) or the number of cases (:total). Any other value
  # raises ArgumentError.
  def table_percentage(generator,type)
    type_name=case type
      when :row then _("% Row")
      when :column then _("% Column")
      when :total then _("% Total")
      else raise ArgumentError, "Unknown percentage type: #{type.inspect}"
    end
    fq=frequencies
    cn=cols_names
    rn=rows_names
    rt=rows_total
    ct=cols_total

    # type_name is already translated; it must not go through _() again.
    t=ReportBuilder::Table.new(:name=>@name+" - "+type_name, :header=>[""]+cn.collect {|c| @v_cols.index_of(c) } + [_("Total")])
    rn.each do |row|
      t_row=[@v_rows.index_of(row)]
      cn.each do |col|
        base=case type
          when :row then rt[row]
          when :column then ct[col]
          when :total then @cases
        end
        t_row.push(sprintf("%0.2f%%", fq[[row,col]]*100.0/ base ))
      end
      # Row marginal: 100% for :row tables, share of all cases otherwise.
      base=(type==:row) ? rt[row] : @cases
      t_row.push(sprintf("%0.2f%%", rt[row]*100.0/base))
      t.row(t_row)
    end

    t.hr
    t_row=[_("Total")]
    cn.each{|col|
      # Column marginal: 100% for :column tables, share of all cases otherwise.
      base=(type==:column) ? ct[col] : @cases
      t_row.push(sprintf("%0.2f%%", ct[col]*100.0/base))
    }
    t_row.push("100%")
    t.row(t_row)
    generator.parse_element(t)
  end
end
|
|
188
|
+
end
|
|
@@ -0,0 +1,115 @@
|
|
|
1
|
+
# Opening the Daru::DataFrame class for adding methods to convert from
|
|
2
|
+
# data structures to specialized statsample data structures like Multiset.
|
|
3
|
+
module Daru
|
|
4
|
+
class Vector
  # Builds a Statsample::Histogram from the non-missing values of the vector.
  #
  # * bins: either an Array, handed as is to Statsample::Histogram.alloc, or
  #   the number of bins to allocate over the "nice" range returned by
  #   Statsample::Util.nice for the minimum and maximum valid values.
  #
  # Raises TypeError unless the vector type is :numeric.
  def histogram(bins=10)
    raise TypeError, "Only numeric Vectors can do this operation." unless type == :numeric

    # Needed by both branches: it feeds h.increment below, so it cannot be
    # computed only when the bins are not given explicitly.
    valid = reject_values(*Daru::MISSING_VALUES)

    if bins.is_a? Array
      h = Statsample::Histogram.alloc(bins)
    else
      # ugly patch. The upper limit for a bin has the form
      # x < range
      min,max=Statsample::Util.nice(valid.min,valid.max)
      # Widen the range slightly so the largest value falls inside the last bin.
      max += 1e-10 if max == valid.max
      h = Statsample::Histogram.alloc(bins,[min,max])
    end

    h.increment(valid)
    h
  end

  # Variance of the proportion of +v+, according to population size.
  # The sample size is the number of non-missing values.
  def variance_proportion(n_poblation, v=1)
    Statsample::proportion_variance_sample(self.proportion(v), reject_values(*Daru::MISSING_VALUES).size, n_poblation)
  end

  # Variance of the total, according to population size.
  # The sample size is the number of non-missing values.
  def variance_total(n_poblation, v=1)
    Statsample::total_variance_sample(self.proportion(v), reject_values(*Daru::MISSING_VALUES).size, n_poblation)
  end

  # Confidence interval for the proportion of +v+, delegated to
  # Statsample::proportion_confidence_interval_t with the given +margin+.
  def proportion_confidence_interval_t(n_poblation,margin=0.95,v=1)
    Statsample::proportion_confidence_interval_t(proportion(v), reject_values(*Daru::MISSING_VALUES).size, n_poblation, margin)
  end

  # Confidence interval for the proportion of +v+, delegated to
  # Statsample::proportion_confidence_interval_z with the given +margin+.
  def proportion_confidence_interval_z(n_poblation,margin=0.95,v=1)
    Statsample::proportion_confidence_interval_z(proportion(v), reject_values(*Daru::MISSING_VALUES).size, n_poblation, margin)
  end
end
|
|
46
|
+
|
|
47
|
+
class DataFrame
  # Shortcut for Statsample::Crosstab.new over the vectors named +v1+
  # (rows) and +v2+ (columns); +opts+ is passed through unchanged.
  def crosstab(v1,v2,opts={})
    Statsample::Crosstab.new(self[v1], self[v2],opts)
  end

  # Functions for converting to Statsample::Multiset
  #
  # Splits the data frame by the values of one or more vectors, dispatching
  # to the one-field or multiple-fields implementation.
  def to_multiset_by_split(*vecs)
    require 'statsample/multiset'

    if vecs.size == 1
      to_multiset_by_split_one_field(vecs[0])
    else
      to_multiset_by_split_multiple_fields(*vecs)
    end
  end

  # Creates a Statsample::Multiset, using one field
  #
  # One dataset is created per factor of +field+ and every row is added to
  # the dataset matching its value. Raises ArgumentError when +field+ is
  # not one of the vectors of the data frame.
  def to_multiset_by_split_one_field(field)
    raise ArgumentError,"Should use a correct field name" unless @vectors.include? field

    splitter = self[field]
    ms = Statsample::Multiset.new_empty_vectors(@vectors.to_a, splitter.factors)
    each_row do |row|
      ms[row[field]].add_row(row)
    end
    ms.datasets.each do |k,ds|
      ds.rename splitter.index_of(k)
    end

    ms
  end

  # Creates a Statsample::Multiset, using several fields.
  #
  # One dataset is created per combination of factors of +fields+ (the key
  # is the array of values, in field order) and every row is added to the
  # dataset matching its values. Field names are converted to symbols.
  # Raises ArgumentError when no field is given.
  def to_multiset_by_split_multiple_fields(*fields)
    fields = fields.map(&:to_sym)
    raise ArgumentError, "Should use at least one field name" if fields.empty?

    # Cartesian product of the factors, first field varying slowest.
    first, *rest = fields.map { |f| self[f].factors.to_a }
    factors_total = first.product(*rest)
    ms = Statsample::Multiset.new_empty_vectors(vectors.to_a, factors_total)

    # A plain block replaces the former string eval: field names are used as
    # data only, so quotes or other special characters in them are harmless.
    each_row do |row|
      ms[fields.map { |f| row[f] }].add_row(row)
    end

    ms.datasets.each do |k,ds|
      ds.rename(
        fields.each_with_index.map do |f,i|
          self[f].index_of(k[i])
        end.join("-")
      )
    end
    ms
  end
end
|
|
115
|
+
end
|
|
@@ -0,0 +1,425 @@
|
|
|
1
|
+
module Statsample
|
|
2
|
+
# Dominance Analysis is a procedure based on an examination of the R<sup>2</sup> values
|
|
3
|
+
# for all possible subset models, to identify the relevance of one or more
|
|
4
|
+
# predictors in the prediction of the criterion.
|
|
5
|
+
#
|
|
6
|
+
# See Budescu(1993), Azen & Budescu (2003, 2006) for more information.
|
|
7
|
+
#
|
|
8
|
+
# == Use
|
|
9
|
+
#
|
|
10
|
+
# a = Daru::Vector.new(1000.times.collect {rand})
|
|
11
|
+
# b = Daru::Vector.new(1000.times.collect {rand})
|
|
12
|
+
# c = Daru::Vector.new(1000.times.collect {rand})
|
|
13
|
+
# ds= Daru::DataFrame.new({:a => a,:b => b,:c => c})
|
|
14
|
+
# ds[:y] = ds.collect_rows {|row| row[:a]*5 + row[:b]*3 + row[:c]*2 + rand()}
|
|
15
|
+
# da=Statsample::DominanceAnalysis.new(ds, :y)
|
|
16
|
+
# puts da.summary
|
|
17
|
+
#
|
|
18
|
+
# === Output:
|
|
19
|
+
#
|
|
20
|
+
# Report: Report 2010-02-08 19:10:11 -0300
|
|
21
|
+
# Table: Dominance Analysis result
|
|
22
|
+
# ------------------------------------------------------------
|
|
23
|
+
# | | r2 | sign | a | b | c |
|
|
24
|
+
# ------------------------------------------------------------
|
|
25
|
+
# | Model 0 | | | 0.648 | 0.265 | 0.109 |
|
|
26
|
+
# ------------------------------------------------------------
|
|
27
|
+
# | a | 0.648 | 0.000 | -- | 0.229 | 0.104 |
|
|
28
|
+
# | b | 0.265 | 0.000 | 0.612 | -- | 0.104 |
|
|
29
|
+
# | c | 0.109 | 0.000 | 0.643 | 0.260 | -- |
|
|
30
|
+
# ------------------------------------------------------------
|
|
31
|
+
# | k=1 Average | | | 0.627 | 0.244 | 0.104 |
|
|
32
|
+
# ------------------------------------------------------------
|
|
33
|
+
# | a*b | 0.877 | 0.000 | -- | -- | 0.099 |
|
|
34
|
+
# | a*c | 0.752 | 0.000 | -- | 0.224 | -- |
|
|
35
|
+
# | b*c | 0.369 | 0.000 | 0.607 | -- | -- |
|
|
36
|
+
# ------------------------------------------------------------
|
|
37
|
+
# | k=2 Average | | | 0.607 | 0.224 | 0.099 |
|
|
38
|
+
# ------------------------------------------------------------
|
|
39
|
+
# | a*b*c | 0.976 | 0.000 | -- | -- | -- |
|
|
40
|
+
# ------------------------------------------------------------
|
|
41
|
+
# | Overall averages | | | 0.628 | 0.245 | 0.104 |
|
|
42
|
+
# ------------------------------------------------------------
|
|
43
|
+
#
|
|
44
|
+
# Table: Pairwise dominance
|
|
45
|
+
# -----------------------------------------
|
|
46
|
+
# | Pairs | Total | Conditional | General |
|
|
47
|
+
# -----------------------------------------
|
|
48
|
+
# | a - b | 1.0 | 1.0 | 1.0 |
|
|
49
|
+
# | a - c | 1.0 | 1.0 | 1.0 |
|
|
50
|
+
# | b - c | 1.0 | 1.0 | 1.0 |
|
|
51
|
+
# -----------------------------------------
|
|
52
|
+
#
|
|
53
|
+
# == Reference:
|
|
54
|
+
# * Budescu, D. V. (1993). Dominance analysis: a new approach to the problem of relative importance of predictors in multiple regression. <em>Psychological Bulletin, 114</em>, 542-551.
|
|
55
|
+
# * Azen, R. & Budescu, D.V. (2003). The dominance analysis approach for comparing predictors in multiple regression. <em>Psychological Methods, 8</em>(2), 129-148.
|
|
56
|
+
# * Azen, R. & Budescu, D.V. (2006). Comparing predictors in Multivariate Regression Models: An extension of Dominance Analysis. <em>Journal of Educational and Behavioral Statistics, 31</em>(2), 157-180.
|
|
57
|
+
#
|
|
58
|
+
class DominanceAnalysis
  include Summarizable
  # Class to generate the regressions. Default to Statsample::Regression::Multiple::MatrixEngine
  attr_accessor :regression_class
  # Name of analysis
  attr_accessor :name
  # Set to true if you want to build from dataset, not correlation matrix
  attr_accessor :build_from_dataset
  # Array with independent variables. You could create subarrays,
  # to test groups of predictors as blocks
  attr_accessor :predictors
  # If you provide a matrix as input, you should set
  # the number of cases to define significance of R^2
  attr_accessor :cases
  # Method of :regression_class used to measure association.
  #
  # Only necessary to change if you have multivariate dependent.
  # * :r2yx (R^2_yx), the default option, is the option when distinction
  #   between independent and dependents variable is arbitrary
  # * :p2yx is the option when the distinction between independent and dependents variables is real.
  #
  attr_accessor :method_association

  # Dependent variable(s), always stored as an Array.
  attr_reader :dependent

  UNIVARIATE_REGRESSION_CLASS=Statsample::Regression::Multiple::MatrixEngine
  MULTIVARIATE_REGRESSION_CLASS=Statsample::Regression::Multiple::MultipleDependent

  # Printable name of a predictor: blocks of predictors (arrays) are shown
  # as "(a,b)", single predictors are returned untouched.
  def self.predictor_name(variable)
    if variable.is_a? Array
      sprintf("(%s)", variable.join(","))
    else
      variable
    end
  end

  # Creates a new DominanceAnalysis object
  # Parameters:
  # * input: A Matrix or Dataset object
  # * dependent: Name of dependent variable. Could be an array, if you want to
  #   do an Multivariate Regression Analysis.
  # * opts: hash of attribute => value; only keys with a public writer on
  #   this object are applied. If :predictors is not given, it is set to
  #   all fields on input, except the dependent ones.
  #
  # Raises ArgumentError if input is neither a Daru::DataFrame nor a Matrix.
  def initialize(input, dependent, opts=Hash.new)
    @build_from_dataset=false
    if dependent.is_a? Array
      @regression_class= MULTIVARIATE_REGRESSION_CLASS
      @method_association=:r2yx
    else
      @regression_class= UNIVARIATE_REGRESSION_CLASS
      @method_association=:r2
    end

    @name=nil
    opts.each{|k,v|
      # Check the writer, not the reader: read-only attributes such as
      # :dependent answer to the reader but have no "dependent=" method.
      setter="#{k}="
      public_send(setter,v) if respond_to?(setter)
    }
    @dependent=dependent
    @dependent=[@dependent] unless @dependent.is_a? Array

    if input.kind_of? Daru::DataFrame
      @predictors ||= input.vectors.to_a - @dependent
      @ds=input
      @matrix=Statsample::Bivariate.correlation_matrix(input)
      @cases=Statsample::Bivariate.min_n_valid(input)
    elsif input.is_a? ::Matrix
      @predictors ||= input.fields-@dependent
      @ds=nil
      @matrix=input
    else
      raise ArgumentError, "You should use a Matrix or a Dataset"
    end

    @name=_("Dominance Analysis: %s over %s") % [ @predictors.flatten.join(",") , @dependent.join(",")] if @name.nil?
    @models=nil
    @models_data=nil
    @general_averages=nil
  end

  # Compute models.
  def compute
    create_models
    fill_models
  end

  # List of models (arrays of predictors), computed on first use.
  def models
    compute if @models.nil?
    @models
  end

  # Hash of sorted model => ModelData, computed on first use.
  def models_data
    compute if @models_data.nil?
    @models_data
  end

  # Builds one ModelData per non-empty subset of predictors, keeping the
  # subsets in @models and the fitted data in @models_data.
  def create_models
    @models=[]
    @models_data={}
    (1..@predictors.size).each do |size|
      @predictors.combination(size).each do |independent|
        @models.push(independent)
        fields=independent.flatten+@dependent
        data=if @build_from_dataset
          @ds.dup(fields)
        else
          @matrix.submatrix(fields)
        end
        @models_data[independent.sort {|a,b| a.to_s<=>b.to_s}]=ModelData.new(independent, data, self)
      end
    end
  end

  # For every model, registers the r2 of the model extended with each
  # predictor it lacks; ModelData stores the difference as contribution.
  def fill_models
    @models.each do |m|
      base_model=md(m)
      @predictors.each do |f|
        next if m.include? f
        base_model.add_contribution(f, md(m+[f]).r2)
      end
    end
  end
  private :create_models, :fill_models

  # Compares the single-predictor models of i and j.
  # Returns 1 if r2 of i is bigger, 0 if r2 of j is bigger, 0.5 otherwise.
  def dominance_for_nil_model(i,j)
    r2_i=md([i]).r2
    r2_j=md([j]).r2
    if r2_i>r2_j
      1
    elsif r2_i<r2_j
      0
    else
      0.5
    end
  end

  # Returns 1 if i D j, 0 if j dominates i and 0.5 if undetermined
  def total_dominance_pairwise(i,j)
    dm=dominance_for_nil_model(i,j)
    return 0.5 if dm==0.5
    dominances=[dm]
    models_data.each do |k,m|
      ci=m.contributions[i]
      cj=m.contributions[j]
      # Only models that lack both predictors can be compared.
      next if ci.nil? or cj.nil?
      if ci>cj
        dominances.push(1)
      elsif ci<cj
        dominances.push(0)
      else
        return 0.5
      end
    end
    final=dominances.uniq
    final.size>1 ? 0.5 : final[0]
  end

  # Returns 1 if i cD j, 0 if j cD i and 0.5 if undetermined
  def conditional_dominance_pairwise(i,j)
    dm=dominance_for_nil_model(i,j)
    return 0.5 if dm==0.5
    dominances=[dm]
    (1...@predictors.size).each do |k|
      a=average_k(k)
      if a[i]>a[j]
        dominances.push(1)
      elsif a[i]<a[j]
        dominances.push(0)
      else
        return 0.5
      end
    end
    final=dominances.uniq
    final.size>1 ? 0.5 : final[0]
  end

  # Returns 1 if i gD j, 0 if j gD i and 0.5 if undetermined
  def general_dominance_pairwise(i,j)
    ga=general_averages
    if ga[i]>ga[j]
      1
    elsif ga[i]<ga[j]
      0
    else
      0.5
    end
  end

  # Models made of exactly two predictors.
  def pairs
    models.find_all{|m| m.size==2}
  end

  # Hash pair => total dominance (see #total_dominance_pairwise).
  def total_dominance
    pairs.inject({}){|a,pair| a[pair]=total_dominance_pairwise(pair[0], pair[1])
      a
    }
  end

  # Hash pair => conditional dominance (see #conditional_dominance_pairwise).
  def conditional_dominance
    pairs.inject({}){|a,pair| a[pair]=conditional_dominance_pairwise(pair[0], pair[1])
      a
    }
  end

  # Hash pair => general dominance (see #general_dominance_pairwise).
  def general_dominance
    pairs.inject({}){|a,pair| a[pair]=general_dominance_pairwise(pair[0], pair[1])
      a
    }
  end

  # ModelData for model +m+; the lookup key is +m+ sorted by string value.
  def md(m)
    models_data[m.sort {|a,b| a.to_s <=> b.to_s}]
  end

  # Get all model of size k
  def md_k(k)
    out=[]
    models.each{ |m| out.push(md(m)) if m.size==k }
    out
  end

  # For a hash with arrays of numbers as values
  # Returns a hash with same keys and
  # value as the mean of values of original hash
  def get_averages(averages)
    out={}
    averages.each{ |key,val| out[key] = Daru::Vector.new(val).mean }
    out
  end

  # Hash with average for each k size model.
  # Returns nil for the full model, which has no contributions.
  def average_k(k)
    return nil if k==@predictors.size
    averages=@predictors.inject({}) {|a,v| a[v]=[];a}
    md_k(k).each do |m|
      @predictors.each do |f|
        averages[f].push(m.contributions[f]) unless m.contributions[f].nil?
      end
    end
    get_averages(averages)
  end

  # Hash predictor => mean of its single-predictor r2 and its average
  # contribution at every model size. The result is memoized.
  def general_averages
    if @general_averages.nil?
      averages=@predictors.inject({}) {|a,v| a[v]=[md([v]).r2];a}
      (1...@predictors.size).each do |k|
        ak=average_k(k)
        @predictors.each do |f|
          averages[f].push(ak[f])
        end
      end
      @general_averages=get_averages(averages)
    end
    @general_averages
  end

  # Adds to builder +g+ a section with the dominance analysis table and the
  # pairwise dominance table.
  def report_building(g)
    compute if @models.nil?
    g.section(:name=>@name) do |generator|
      header=["","r2",_("sign")]+@predictors.collect {|c| DominanceAnalysis.predictor_name(c) }

      generator.table(:name=>_("Dominance Analysis result"), :header=>header) do |t|
        row=[_("Model 0"),"",""]+@predictors.collect{|f|
          sprintf("%0.3f",md([f]).r2)
        }

        t.row(row)
        t.hr
        (1..@predictors.size).each do |i|
          md_k(i).each{|m|
            t.row(m.add_table_row)
          }
          # Report averages
          a=average_k(i)
          next if a.nil?
          t.hr
          row=[_("k=%d Average") % i,"",""] + @predictors.collect{|f|
            sprintf("%0.3f",a[f])
          }
          t.row(row)
          t.hr
        end

        overall=general_averages
        t.hr

        row=[_("Overall averages"),"",""]+@predictors.collect{|f|
          sprintf("%0.3f",overall[f])
        }
        t.row(row)
      end

      td=total_dominance
      cd=conditional_dominance
      gd=general_dominance
      generator.table(:name=>_("Pairwise dominance"), :header=>[_("Pairs"),_("Total"),_("Conditional"),_("General")]) do |t|
        pairs.each{|pair|
          name=pair.map{|v| v.is_a?(Array) ? "("+v.join("-")+")" : v}.join(" - ")
          row=[name, sprintf("%0.1f",td[pair]), sprintf("%0.1f",cd[pair]), sprintf("%0.1f",gd[pair])]
          t.row(row)
        }
      end
    end
  end

  # Regression of one subset of predictors, plus the contribution of every
  # predictor left out of the subset.
  class ModelData # :nodoc:
    # Hash predictor => contribution. Predictors of the model itself are
    # present with a nil value.
    attr_reader :contributions

    # * independent: predictors of this model
    # * data: input handed to the regression class
    # * da: DominanceAnalysis that provides predictors, dependent, cases,
    #   association method and regression class
    def initialize(independent, data, da)
      @independent=independent
      @data=data
      @predictors=da.predictors
      @dependent=da.dependent
      @cases=da.cases
      @method=da.method_association
      @contributions=@independent.inject({}){|a,v| a[v]=nil;a}

      r_class=da.regression_class

      if @dependent.size==1
        @lr=r_class.new(data, @dependent[0], :cases=>@cases)
      else
        @lr=r_class.new(data, @dependent, :cases=>@cases)
      end
    end

    # Stores the contribution of +f+: +v+ (association of the model that
    # also includes +f+) minus the association of this model.
    def add_contribution(f, v)
      @contributions[f]=v-r2
    end

    # Association measure of this model, read from the regression object
    # with the configured method.
    def r2
      @lr.send(@method)
    end

    # Predictor names joined by "*".
    def name
      @independent.collect {|variable|
        DominanceAnalysis.predictor_name(variable)
      }.join("*")
    end

    # Report row: name, r2, significance ("???" when the number of cases is
    # unknown) and one contribution per predictor ("--" when missing).
    def add_table_row
      if @cases
        sign=sprintf("%0.3f", @lr.probability)
      else
        sign="???"
      end

      [name, sprintf("%0.3f",r2), sign] + @predictors.collect{|k|
        v=@contributions[k]
        if v.nil?
          "--"
        else
          sprintf("%0.3f",v)
        end
      }
    end

    # Two-line text summary of the model and its contributions.
    def summary
      # The format string has exactly three fields; no extra arguments.
      out=sprintf("%s: r2=%0.3f(p=%0.2f)\n",name, r2, @lr.significance)
      out << @predictors.collect{|k|
        v=@contributions[k]
        if v.nil?
          "--"
        else
          sprintf("%s=%0.3f",k,v)
        end
      }.join(" | ")
      out << "\n"
      return out
    end
  end # end ModelData
end # end Dominance Analysis
|
|
423
|
+
end
|
|
424
|
+
|
|
425
|
+
require 'statsample/dominanceanalysis/bootstrap'
|