statsample-ekatena 2.0.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +15 -0
- data/.travis.yml +23 -0
- data/CONTRIBUTING.md +17 -0
- data/Gemfile +2 -0
- data/History.txt +457 -0
- data/LICENSE.txt +12 -0
- data/README.md +175 -0
- data/Rakefile +44 -0
- data/benchmarks/correlation_matrix_15_variables.rb +32 -0
- data/benchmarks/correlation_matrix_5_variables.rb +33 -0
- data/benchmarks/correlation_matrix_methods/correlation_matrix.ds +0 -0
- data/benchmarks/correlation_matrix_methods/correlation_matrix.html +93 -0
- data/benchmarks/correlation_matrix_methods/correlation_matrix.rb +71 -0
- data/benchmarks/correlation_matrix_methods/correlation_matrix.xls +0 -0
- data/benchmarks/correlation_matrix_methods/correlation_matrix_gsl_ruby.ods +0 -0
- data/benchmarks/correlation_matrix_methods/correlation_matrix_with_graphics.ods +0 -0
- data/benchmarks/correlation_matrix_methods/results.ds +0 -0
- data/benchmarks/factor_map.rb +37 -0
- data/benchmarks/helpers_benchmark.rb +5 -0
- data/data/locale/es/LC_MESSAGES/statsample.mo +0 -0
- data/doc_latex/manual/equations.tex +78 -0
- data/examples/boxplot.rb +28 -0
- data/examples/chisquare_test.rb +23 -0
- data/examples/correlation_matrix.rb +32 -0
- data/examples/dataset.rb +30 -0
- data/examples/dominance_analysis.rb +33 -0
- data/examples/dominance_analysis_bootstrap.rb +32 -0
- data/examples/histogram.rb +26 -0
- data/examples/icc.rb +24 -0
- data/examples/levene.rb +29 -0
- data/examples/multiple_regression.rb +20 -0
- data/examples/multivariate_correlation.rb +33 -0
- data/examples/parallel_analysis.rb +40 -0
- data/examples/polychoric.rb +40 -0
- data/examples/principal_axis.rb +26 -0
- data/examples/reliability.rb +31 -0
- data/examples/scatterplot.rb +25 -0
- data/examples/t_test.rb +27 -0
- data/examples/tetrachoric.rb +17 -0
- data/examples/u_test.rb +24 -0
- data/examples/vector.rb +20 -0
- data/examples/velicer_map_test.rb +46 -0
- data/grab_references.rb +29 -0
- data/lib/spss.rb +134 -0
- data/lib/statsample-ekatena/analysis.rb +100 -0
- data/lib/statsample-ekatena/analysis/suite.rb +89 -0
- data/lib/statsample-ekatena/analysis/suitereportbuilder.rb +44 -0
- data/lib/statsample-ekatena/anova.rb +24 -0
- data/lib/statsample-ekatena/anova/contrast.rb +79 -0
- data/lib/statsample-ekatena/anova/oneway.rb +187 -0
- data/lib/statsample-ekatena/anova/twoway.rb +207 -0
- data/lib/statsample-ekatena/bivariate.rb +406 -0
- data/lib/statsample-ekatena/bivariate/pearson.rb +54 -0
- data/lib/statsample-ekatena/codification.rb +182 -0
- data/lib/statsample-ekatena/converter/csv.rb +28 -0
- data/lib/statsample-ekatena/converter/spss.rb +48 -0
- data/lib/statsample-ekatena/converters.rb +211 -0
- data/lib/statsample-ekatena/crosstab.rb +188 -0
- data/lib/statsample-ekatena/daru.rb +115 -0
- data/lib/statsample-ekatena/dataset.rb +10 -0
- data/lib/statsample-ekatena/dominanceanalysis.rb +425 -0
- data/lib/statsample-ekatena/dominanceanalysis/bootstrap.rb +232 -0
- data/lib/statsample-ekatena/factor.rb +104 -0
- data/lib/statsample-ekatena/factor/map.rb +124 -0
- data/lib/statsample-ekatena/factor/parallelanalysis.rb +166 -0
- data/lib/statsample-ekatena/factor/pca.rb +242 -0
- data/lib/statsample-ekatena/factor/principalaxis.rb +243 -0
- data/lib/statsample-ekatena/factor/rotation.rb +198 -0
- data/lib/statsample-ekatena/formula/fit_model.rb +46 -0
- data/lib/statsample-ekatena/formula/formula.rb +306 -0
- data/lib/statsample-ekatena/graph.rb +11 -0
- data/lib/statsample-ekatena/graph/boxplot.rb +236 -0
- data/lib/statsample-ekatena/graph/histogram.rb +198 -0
- data/lib/statsample-ekatena/graph/scatterplot.rb +213 -0
- data/lib/statsample-ekatena/histogram.rb +180 -0
- data/lib/statsample-ekatena/matrix.rb +329 -0
- data/lib/statsample-ekatena/multiset.rb +310 -0
- data/lib/statsample-ekatena/regression.rb +65 -0
- data/lib/statsample-ekatena/regression/multiple.rb +89 -0
- data/lib/statsample-ekatena/regression/multiple/alglibengine.rb +128 -0
- data/lib/statsample-ekatena/regression/multiple/baseengine.rb +251 -0
- data/lib/statsample-ekatena/regression/multiple/gslengine.rb +129 -0
- data/lib/statsample-ekatena/regression/multiple/matrixengine.rb +205 -0
- data/lib/statsample-ekatena/regression/multiple/rubyengine.rb +86 -0
- data/lib/statsample-ekatena/regression/simple.rb +121 -0
- data/lib/statsample-ekatena/reliability.rb +150 -0
- data/lib/statsample-ekatena/reliability/icc.rb +415 -0
- data/lib/statsample-ekatena/reliability/multiscaleanalysis.rb +181 -0
- data/lib/statsample-ekatena/reliability/scaleanalysis.rb +233 -0
- data/lib/statsample-ekatena/reliability/skillscaleanalysis.rb +114 -0
- data/lib/statsample-ekatena/resample.rb +15 -0
- data/lib/statsample-ekatena/shorthand.rb +125 -0
- data/lib/statsample-ekatena/srs.rb +169 -0
- data/lib/statsample-ekatena/test.rb +82 -0
- data/lib/statsample-ekatena/test/bartlettsphericity.rb +45 -0
- data/lib/statsample-ekatena/test/chisquare.rb +73 -0
- data/lib/statsample-ekatena/test/f.rb +52 -0
- data/lib/statsample-ekatena/test/kolmogorovsmirnov.rb +63 -0
- data/lib/statsample-ekatena/test/levene.rb +88 -0
- data/lib/statsample-ekatena/test/t.rb +309 -0
- data/lib/statsample-ekatena/test/umannwhitney.rb +208 -0
- data/lib/statsample-ekatena/test/wilcoxonsignedrank.rb +90 -0
- data/lib/statsample-ekatena/vector.rb +19 -0
- data/lib/statsample-ekatena/version.rb +3 -0
- data/lib/statsample.rb +282 -0
- data/po/es/statsample.mo +0 -0
- data/po/es/statsample.po +959 -0
- data/po/statsample.pot +947 -0
- data/references.txt +24 -0
- data/statsample-ekatena.gemspec +49 -0
- data/test/fixtures/bank2.dat +200 -0
- data/test/fixtures/correlation_matrix.rb +17 -0
- data/test/fixtures/df.csv +15 -0
- data/test/fixtures/hartman_23.matrix +9 -0
- data/test/fixtures/stock_data.csv +500 -0
- data/test/fixtures/tetmat_matrix.txt +5 -0
- data/test/fixtures/tetmat_test.txt +1001 -0
- data/test/helpers_tests.rb +83 -0
- data/test/test_analysis.rb +176 -0
- data/test/test_anova_contrast.rb +36 -0
- data/test/test_anovaoneway.rb +26 -0
- data/test/test_anovatwoway.rb +37 -0
- data/test/test_anovatwowaywithdataset.rb +47 -0
- data/test/test_anovawithvectors.rb +102 -0
- data/test/test_awesome_print_bug.rb +16 -0
- data/test/test_bartlettsphericity.rb +25 -0
- data/test/test_bivariate.rb +164 -0
- data/test/test_codification.rb +78 -0
- data/test/test_crosstab.rb +67 -0
- data/test/test_dominance_analysis.rb +39 -0
- data/test/test_factor.rb +228 -0
- data/test/test_factor_map.rb +38 -0
- data/test/test_factor_pa.rb +56 -0
- data/test/test_fit_model.rb +88 -0
- data/test/test_ggobi.rb +35 -0
- data/test/test_gsl.rb +15 -0
- data/test/test_histogram.rb +109 -0
- data/test/test_matrix.rb +48 -0
- data/test/test_multiset.rb +176 -0
- data/test/test_regression.rb +231 -0
- data/test/test_reliability.rb +223 -0
- data/test/test_reliability_icc.rb +198 -0
- data/test/test_reliability_skillscale.rb +57 -0
- data/test/test_resample.rb +24 -0
- data/test/test_srs.rb +9 -0
- data/test/test_statistics.rb +69 -0
- data/test/test_stest.rb +69 -0
- data/test/test_stratified.rb +17 -0
- data/test/test_test_f.rb +33 -0
- data/test/test_test_kolmogorovsmirnov.rb +34 -0
- data/test/test_test_t.rb +62 -0
- data/test/test_umannwhitney.rb +27 -0
- data/test/test_vector.rb +12 -0
- data/test/test_wilcoxonsignedrank.rb +64 -0
- metadata +570 -0
@@ -0,0 +1,188 @@
|
|
1
|
+
module Statsample
|
2
|
+
# Class to create crosstab of data
|
3
|
+
# With this, you can create reports and do chi square test
|
4
|
+
# The first vector defines the rows and the second defines the columns
|
5
|
+
#
|
6
|
+
class Crosstab
|
7
|
+
include Summarizable
|
8
|
+
attr_reader :v_rows, :v_cols
|
9
|
+
attr_accessor :row_label, :column_label, :name, :percentage_row, :percentage_column, :percentage_total
|
10
|
+
# Builds a crosstab from two equally sized inputs.
#
# * v1: data for the rows (wrapped with Daru::Vector.new)
# * v2: data for the columns (wrapped with Daru::Vector.new)
# * opts: attribute => value pairs; only keys that have a matching
#   writer (for example :name or :percentage_row) are applied
#
# Raises ArgumentError when the inputs differ in size.
def initialize(v1, v2, opts=Hash.new)
  raise ArgumentError, "Vectors should be the same size" unless v1.size==v2.size
  @v_rows, @v_cols = Statsample.only_valid_clone(
    Daru::Vector.new(v1),
    Daru::Vector.new(v2))
  @cases = @v_rows.size
  # Inputs without a #name (e.g. plain arrays) simply get no label;
  # report_building already skips nil labels.
  @row_label = v1.respond_to?(:name) ? v1.name : nil
  @column_label = v2.respond_to?(:name) ? v2.name : nil
  @name = nil
  @percentage_row = @percentage_column = @percentage_total = false
  opts.each do |key, value|
    writer = "#{key}="
    # Check the writer, not the reader: read-only attributes such as
    # :v_rows respond to the reader but have no setter.
    send(writer, value) if respond_to?(writer)
  end
  @name ||= _("Crosstab %s - %s") % [@row_label, @column_label]
end
|
25
|
+
# Sorted factors of the row vector, with the index reset.
def rows_names
  sorted_rows = @v_rows.factors.sort
  sorted_rows.reset_index!
end
|
28
|
+
# Sorted factors of the column vector, with the index reset.
def cols_names
  sorted_cols = @v_cols.factors.sort
  sorted_cols.reset_index!
end
|
31
|
+
# Frequencies of the row vector (used as row marginals).
def rows_total
  row_vector = @v_rows
  row_vector.frequencies
end
|
34
|
+
# Frequencies of the column vector (used as column marginals).
def cols_total
  column_vector = @v_cols
  column_vector.frequencies
end
|
37
|
+
|
38
|
+
# Counts for every [row, col] pair.
#
# Starts from a zero-filled hash over all combinations of rows_names and
# cols_names, so pairs that never occur are present with 0, and then
# merges in the observed frequencies of the paired data.
def frequencies
  # cols_names is computed once instead of once per row.
  col_keys = cols_names.to_a
  base = rows_names.to_a.product(col_keys).each_with_object({}) do |pair, counts|
    counts[pair] = 0
  end
  observed = Daru::Vector.new(Statsample::vector_cols_matrix(@v_rows,@v_cols).to_a).frequencies
  base.update(observed)
end
|
47
|
+
# Matrix of counts: one matrix row per rows_names entry and one column
# per cols_names entry.
def to_matrix
  counts = frequencies
  row_keys = rows_names
  col_keys = cols_names
  table = row_keys.collect do |row|
    col_keys.collect { |col| counts[[row,col]] }
  end
  Matrix.rows(table)
end
|
55
|
+
# Nested hash of counts: row => { col => count }.
def frequencies_by_row
  counts = frequencies
  rows_names.each_with_object({}) do |row, by_row|
    by_row[row] = cols_names.each_with_object({}) do |col, by_col|
      by_col[col] = counts[[row,col]]
    end
  end
end
|
62
|
+
# Nested hash of counts: col => { row => count }.
def frequencies_by_col
  counts = frequencies
  cols_names.each_with_object({}) do |col, by_col|
    by_col[col] = rows_names.each_with_object({}) do |row, by_row|
      by_row[row] = counts[[row,col]]
    end
  end
end
|
69
|
+
# Chi square, based on expected and real matrix
|
70
|
+
# Chi square test of the observed matrix against matrix_expected.
def chi_square
  # NOTE(review): the file listing places the test code under
  # lib/statsample-ekatena/; confirm this require path resolves.
  require 'statsample/test'
  observed = to_matrix
  Statsample::Test.chi_square(observed, matrix_expected)
end
|
74
|
+
# Useful to obtain chi square
|
75
|
+
# Matrix of expected counts: (row total * column total) / number of
# cases, for every row/column combination.
def matrix_expected
  row_keys = rows_names
  col_keys = cols_names
  row_sums = rows_total
  col_sums = cols_total
  n = @v_rows.size
  expected = row_keys.collect do |row|
    col_keys.collect { |col| (row_sums[row]*col_sums[col]).quo(n) }
  end
  Matrix.rows(expected)
end
|
88
|
+
# Hash with one zero entry per column name.
def cols_empty_hash
  cols_names.each_with_object({}) { |col, zeros| zeros[col] = 0 }
end
|
91
|
+
# Adds the crosstab report to +builder+: a section holding the chi square
# text, the optional row/column labels, a raw count table with marginal
# totals and, when enabled, the percentage tables.
def report_building(builder)
  builder.section(:name=>@name) do |generator|
    counts = frequencies
    row_keys = rows_names
    col_keys = cols_names
    grand_total = 0
    column_sums = cols_empty_hash
    generator.text "Chi Square: #{chi_square}"
    generator.text(_("Rows: %s") % @row_label) unless @row_label.nil?
    generator.text(_("Columns: %s") % @column_label) unless @column_label.nil?

    table_name = @name+" - "+_("Raw")
    header = [""]+cols_names.collect { |c| @v_cols.index_of(c) }+[_("Total")]
    table = ReportBuilder::Table.new(:name=>table_name, :header=>header)

    row_keys.each do |row|
      row_sum = 0
      line = [@v_rows.index_of(row)]
      col_keys.each do |col|
        cell = counts[[row,col]]
        row_sum += cell
        grand_total += cell
        column_sums[col] += cell
        line.push(cell)
      end
      line.push(row_sum)
      table.row(line)
    end

    table.hr
    footer = [_("Total")]
    col_keys.each { |col| footer.push(column_sums[col]) }
    footer.push(grand_total)
    table.row(footer)
    generator.parse_element(table)

    table_percentage(generator,:row) if @percentage_row
    table_percentage(generator,:column) if @percentage_column
    table_percentage(generator,:total) if @percentage_total
  end
end
|
136
|
+
|
137
|
+
|
138
|
+
|
139
|
+
# Adds a percentage table to +generator+.
#
# +type+ selects the denominator of every cell: :row uses the row total,
# :column the column total and :total the number of cases.
def table_percentage(generator,type)
  counts = frequencies
  col_keys = cols_names
  row_keys = rows_names
  row_sums = rows_total
  col_sums = cols_total

  type_name = case type
              when :row then _("% Row")
              when :column then _("% Column")
              when :total then _("% Total")
              end

  table_name = @name+" - "+_(type_name)
  header = [""]+cols_names.collect { |c| @v_cols.index_of(c) }+[_("Total")]
  table = ReportBuilder::Table.new(:name=>table_name, :header=>header)

  row_keys.each do |row|
    line = [@v_rows.index_of(row)]
    col_keys.each do |col|
      cell_base = case type
                  when :row then row_sums[row]
                  when :column then col_sums[col]
                  when :total then @cases
                  end
      line.push(sprintf("%0.2f%%", counts[[row,col]]*100.0/ cell_base ))
    end
    # Right-hand margin: the row total against itself for :row,
    # against the number of cases otherwise.
    margin_base = case type
                  when :row then row_sums[row]
                  when :column, :total then @cases
                  end
    line.push(sprintf("%0.2f%%", row_sums[row]*100.0/margin_base))
    table.row(line)
  end

  table.hr
  footer = [_("Total")]
  col_keys.each do |col|
    # Bottom margin: the column total against itself for :column,
    # against the number of cases otherwise.
    footer_base = case type
                  when :column then col_sums[col]
                  when :row, :total then @cases
                  end
    footer.push(sprintf("%0.2f%%", col_sums[col]*100.0/footer_base))
  end
  footer.push("100%")
  table.row(footer)
  generator.parse_element(table)
end
|
187
|
+
end
|
188
|
+
end
|
@@ -0,0 +1,115 @@
|
|
1
|
+
# Opening the Daru::DataFrame class for adding methods to convert from
|
2
|
+
# data structures to specialized statsample data structures like Multiset.
|
3
|
+
module Daru
|
4
|
+
class Vector
|
5
|
+
# Builds a Statsample::Histogram from the non-missing values.
#
# * bins: either an Array, passed straight to Statsample::Histogram.alloc,
#   or a number of bins spread over a "nice" [min, max] range
#
# Raises TypeError unless the vector type is :numeric.
def histogram(bins=10)
  type == :numeric or raise TypeError, "Only numeric Vectors can do this operation."

  # Computed before branching: it used to be assigned only in the
  # non-Array branch, so an Array of bins incremented the histogram
  # with nil.
  valid = reject_values(*Daru::MISSING_VALUES)

  if bins.is_a? Array
    h = Statsample::Histogram.alloc(bins)
  else
    min, max = Statsample::Util.nice(valid.min, valid.max)
    # The upper limit of a bin is exclusive (x < limit); widen the range
    # slightly so the maximum value still lands in the last bin.
    max += 1e-10 if max == valid.max
    h = Statsample::Histogram.alloc(bins, [min, max])
  end

  h.increment(valid)
  h
end
|
27
|
+
|
28
|
+
# Variance of the proportion p, given the population size
|
29
|
+
# Passes the proportion of value +v+, the number of non-missing values
# and the population size +n_poblation+ to
# Statsample::proportion_variance_sample.
def variance_proportion(n_poblation, v=1)
  # @valid_data is never assigned in this class; count the valid values
  # the same way #histogram obtains them.
  n_valid = reject_values(*Daru::MISSING_VALUES).size
  # NOTE(review): confirm proportion_variance_sample is reachable on
  # Statsample itself rather than on a nested module.
  Statsample::proportion_variance_sample(self.proportion(v), n_valid, n_poblation)
end
|
32
|
+
|
33
|
+
# Variance of the total, given the population size
|
34
|
+
# Passes the proportion of value +v+, the number of non-missing values
# and the population size +n_poblation+ to
# Statsample::total_variance_sample.
def variance_total(n_poblation, v=1)
  # @valid_data is never assigned in this class; count the valid values
  # the same way #histogram obtains them.
  n_valid = reject_values(*Daru::MISSING_VALUES).size
  Statsample::total_variance_sample(self.proportion(v), n_valid, n_poblation)
end
|
37
|
+
|
38
|
+
# Delegates to Statsample::proportion_confidence_interval_t with the
# proportion of value +v+, the number of non-missing values, the
# population size +n_poblation+ and +margin+.
def proportion_confidence_interval_t(n_poblation,margin=0.95,v=1)
  # @valid_data is never assigned in this class; count the valid values
  # the same way #histogram obtains them.
  n_valid = reject_values(*Daru::MISSING_VALUES).size
  Statsample::proportion_confidence_interval_t(proportion(v), n_valid, n_poblation, margin)
end
|
41
|
+
|
42
|
+
# Delegates to Statsample::proportion_confidence_interval_z with the
# proportion of value +v+, the number of non-missing values, the
# population size +n_poblation+ and +margin+.
def proportion_confidence_interval_z(n_poblation,margin=0.95,v=1)
  # @valid_data is never assigned in this class; count the valid values
  # the same way #histogram obtains them.
  n_valid = reject_values(*Daru::MISSING_VALUES).size
  Statsample::proportion_confidence_interval_z(proportion(v), n_valid, n_poblation, margin)
end
|
45
|
+
end
|
46
|
+
|
47
|
+
class DataFrame
|
48
|
+
# Builds a Statsample::Crosstab from two vectors of this data frame:
# +v1+ supplies the rows and +v2+ the columns; +opts+ is passed through.
def crosstab(v1,v2,opts={})
  row_vector = self[v1]
  column_vector = self[v2]
  Statsample::Crosstab.new(row_vector, column_vector, opts)
end
|
51
|
+
|
52
|
+
# Functions for converting to Statsample::Multiset
|
53
|
+
# Splits the data frame into a multiset. A single field goes to
# to_multiset_by_split_one_field; any other number of fields goes to
# to_multiset_by_split_multiple_fields.
def to_multiset_by_split(*vecs)
  require 'statsample/multiset'

  return to_multiset_by_split_one_field(vecs[0]) if vecs.size == 1

  to_multiset_by_split_multiple_fields(*vecs)
end
|
62
|
+
|
63
|
+
# Creates a Statsample::Multiset, using one field
|
64
|
+
# Creates a Statsample::Multiset with one dataset per factor of +field+
# and adds each row to the dataset keyed by that row's +field+ value.
#
# Raises ArgumentError when +field+ is not one of the vectors.
def to_multiset_by_split_one_field(field)
  unless @vectors.include? field
    raise ArgumentError,"Should use a correct field name"
  end

  field_factors = self[field].factors
  ms = Statsample::Multiset.new_empty_vectors(@vectors.to_a, field_factors)
  each_row { |row| ms[row[field]].add_row(row) }
  ms.datasets.each { |key, dataset| dataset.rename self[field].index_of(key) }

  ms
end
|
80
|
+
|
81
|
+
# Creates a Statsample::Multiset with one dataset per combination of the
# factors of +fields+ and adds each row to the dataset keyed by the array
# of that row's values for those fields. Each dataset is renamed to the
# per-field index_of results joined with "-".
def to_multiset_by_split_multiple_fields(*fields)
  fields.map!(&:to_sym)

  # Every combination of factors, ordered with the first field varying
  # slowest; nil when no fields are given.
  factor_lists = fields.map { |f| self[f].factors.to_a }
  factors_total =
    if factor_lists.empty?
      nil
    else
      factor_lists.first.product(*factor_lists.drop(1))
    end
  ms = Statsample::Multiset.new_empty_vectors(vectors.to_a, factors_total)

  # Direct lookup replaces the Proc that was assembled with eval from
  # interpolated field names.
  each_row { |row| ms[fields.map { |f| row[f] }].add_row(row) }

  ms.datasets.each do |key, dataset|
    label = fields.each_with_index.map { |f, i| self[f].index_of(key[i]) }.join("-")
    dataset.rename(label)
  end
  ms
end
|
114
|
+
end
|
115
|
+
end
|
@@ -0,0 +1,425 @@
|
|
1
|
+
module Statsample
|
2
|
+
# Dominance Analysis is a procedure based on an examination of the R<sup>2</sup> values
|
3
|
+
# for all possible subset models, to identify the relevance of one or more
|
4
|
+
# predictors in the prediction of the criterion.
|
5
|
+
#
|
6
|
+
# See Budescu(1993), Azen & Budescu (2003, 2006) for more information.
|
7
|
+
#
|
8
|
+
# == Use
|
9
|
+
#
|
10
|
+
# a = Daru::Vector.new(1000.times.collect {rand})
|
11
|
+
# b = Daru::Vector.new(1000.times.collect {rand})
|
12
|
+
# c = Daru::Vector.new(1000.times.collect {rand})
|
13
|
+
# ds= Daru::DataFrame.new({:a => a,:b => b,:c => c})
|
14
|
+
# ds[:y] = ds.collect_rows {|row| row[:a]*5 + row[:b]*3 + row[:c]*2 + rand()}
|
15
|
+
# da=Statsample::DominanceAnalysis.new(ds, :y)
|
16
|
+
# puts da.summary
|
17
|
+
#
|
18
|
+
# === Output:
|
19
|
+
#
|
20
|
+
# Report: Report 2010-02-08 19:10:11 -0300
|
21
|
+
# Table: Dominance Analysis result
|
22
|
+
# ------------------------------------------------------------
|
23
|
+
# | | r2 | sign | a | b | c |
|
24
|
+
# ------------------------------------------------------------
|
25
|
+
# | Model 0 | | | 0.648 | 0.265 | 0.109 |
|
26
|
+
# ------------------------------------------------------------
|
27
|
+
# | a | 0.648 | 0.000 | -- | 0.229 | 0.104 |
|
28
|
+
# | b | 0.265 | 0.000 | 0.612 | -- | 0.104 |
|
29
|
+
# | c | 0.109 | 0.000 | 0.643 | 0.260 | -- |
|
30
|
+
# ------------------------------------------------------------
|
31
|
+
# | k=1 Average | | | 0.627 | 0.244 | 0.104 |
|
32
|
+
# ------------------------------------------------------------
|
33
|
+
# | a*b | 0.877 | 0.000 | -- | -- | 0.099 |
|
34
|
+
# | a*c | 0.752 | 0.000 | -- | 0.224 | -- |
|
35
|
+
# | b*c | 0.369 | 0.000 | 0.607 | -- | -- |
|
36
|
+
# ------------------------------------------------------------
|
37
|
+
# | k=2 Average | | | 0.607 | 0.224 | 0.099 |
|
38
|
+
# ------------------------------------------------------------
|
39
|
+
# | a*b*c | 0.976 | 0.000 | -- | -- | -- |
|
40
|
+
# ------------------------------------------------------------
|
41
|
+
# | Overall averages | | | 0.628 | 0.245 | 0.104 |
|
42
|
+
# ------------------------------------------------------------
|
43
|
+
#
|
44
|
+
# Table: Pairwise dominance
|
45
|
+
# -----------------------------------------
|
46
|
+
# | Pairs | Total | Conditional | General |
|
47
|
+
# -----------------------------------------
|
48
|
+
# | a - b | 1.0 | 1.0 | 1.0 |
|
49
|
+
# | a - c | 1.0 | 1.0 | 1.0 |
|
50
|
+
# | b - c | 1.0 | 1.0 | 1.0 |
|
51
|
+
# -----------------------------------------
|
52
|
+
#
|
53
|
+
# == Reference:
|
54
|
+
# * Budescu, D. V. (1993). Dominance analysis: a new approach to the problem of relative importance of predictors in multiple regression. <em>Psychological Bulletin, 114</em>, 542-551.
|
55
|
+
# * Azen, R. & Budescu, D.V. (2003). The dominance analysis approach for comparing predictors in multiple regression. <em>Psychological Methods, 8</em>(2), 129-148.
|
56
|
+
# * Azen, R. & Budescu, D.V. (2006). Comparing predictors in Multivariate Regression Models: An extension of Dominance Analysis. <em>Journal of Educational and Behavioral Statistics, 31</em>(2), 157-180.
|
57
|
+
#
|
58
|
+
class DominanceAnalysis
|
59
|
+
include Summarizable
|
60
|
+
# Class to generate the regressions. Default to Statsample::Regression::Multiple::MatrixEngine
|
61
|
+
attr_accessor :regression_class
|
62
|
+
# Name of analysis
|
63
|
+
attr_accessor :name
|
64
|
+
# Set to true if you want to build from dataset, not correlation matrix
|
65
|
+
attr_accessor :build_from_dataset
|
66
|
+
# Array with independent variables. You could create subarrays,
|
67
|
+
# to test groups of predictors as blocks
|
68
|
+
attr_accessor :predictors
|
69
|
+
# If you provide a matrix as input, you should set
|
70
|
+
# the number of cases to define significance of R^2
|
71
|
+
attr_accessor :cases
|
72
|
+
# Method of :regression_class used to measure association.
|
73
|
+
#
|
74
|
+
# Only necessary to change if you have multivariate dependent.
|
75
|
+
# * :r2yx (R^2_yx), the default option, is the option when distinction
|
76
|
+
# between independent and dependents variable is arbitrary
|
77
|
+
# * :p2yx is the option when the distinction between independent and dependents variables is real.
|
78
|
+
#
|
79
|
+
|
80
|
+
attr_accessor :method_association
|
81
|
+
|
82
|
+
|
83
|
+
attr_reader :dependent
|
84
|
+
|
85
|
+
UNIVARIATE_REGRESSION_CLASS=Statsample::Regression::Multiple::MatrixEngine
|
86
|
+
MULTIVARIATE_REGRESSION_CLASS=Statsample::Regression::Multiple::MultipleDependent
|
87
|
+
|
88
|
+
# Display name of a predictor: an Array (a block of predictors) becomes
# "(a,b,...)"; anything else is returned unchanged.
def self.predictor_name(variable)
  return variable unless variable.is_a? Array

  "(#{variable.join(",")})"
end
|
95
|
+
# Creates a new DominanceAnalysis object
|
96
|
+
# Parameters:
|
97
|
+
# * input: A Matrix or Dataset object
|
98
|
+
# * dependent: Name of dependent variable. Could be an array, if you want to
|
99
|
+
# do an Multivariate Regression Analysis. If nil, set to all
|
100
|
+
# fields on input, except criteria
|
101
|
+
|
102
|
+
# Creates a new DominanceAnalysis object.
#
# * input: a Daru::DataFrame or a ::Matrix
# * dependent: name of the dependent variable, or an Array of names,
#   which selects the multivariate regression class and :r2yx
# * opts: attribute => value pairs; only keys that have a matching
#   writer (e.g. :name, :predictors, :cases) are applied
#
# Raises ArgumentError when +input+ is neither of the supported types.
def initialize(input, dependent, opts=Hash.new)
  @build_from_dataset = false
  if dependent.is_a? Array
    @regression_class = MULTIVARIATE_REGRESSION_CLASS
    @method_association = :r2yx
  else
    @regression_class = UNIVARIATE_REGRESSION_CLASS
    @method_association = :r2
  end

  @name = nil
  opts.each do |key, value|
    writer = "#{key}="
    # Check the writer, not the reader: :dependent is reader-only, so
    # testing the reader and then calling the setter raised NoMethodError.
    send(writer, value) if respond_to?(writer)
  end
  @dependent = dependent.is_a?(Array) ? dependent : [dependent]

  if input.kind_of? Daru::DataFrame
    @predictors ||= input.vectors.to_a - @dependent
    @ds = input
    @matrix = Statsample::Bivariate.correlation_matrix(input)
    @cases = Statsample::Bivariate.min_n_valid(input)
  elsif input.is_a? ::Matrix
    # With a matrix, @cases keeps whatever value opts supplied.
    @predictors ||= input.fields - @dependent
    @ds = nil
    @matrix = input
  else
    raise ArgumentError, "You should use a Matrix or a Dataset"
  end

  @name = _("Dominance Analysis: %s over %s") % [ @predictors.flatten.join(",") , @dependent.join(",")] if @name.nil?
  # Filled in lazily by #compute and #general_averages.
  @models = nil
  @models_data = nil
  @general_averages = nil
end
|
137
|
+
# Compute models.
|
138
|
+
# Builds every model and then records each model's contributions.
def compute
  create_models
  fill_models
end
|
142
|
+
# List of models (arrays of predictors), computed on first use.
def models
  compute if @models.nil?
  @models
end
|
148
|
+
|
149
|
+
# Hash of sorted predictor list => ModelData, computed on first use.
def models_data
  compute if @models_data.nil?
  @models_data
end
|
155
|
+
# Builds one ModelData for every non-empty combination of predictors.
# @models keeps the combinations in creation order; @models_data is keyed
# by each combination sorted by string value.
def create_models
  @models = []
  @models_data = {}
  positions = (0...@predictors.size).to_a
  (1..@predictors.size).each do |size|
    positions.combination(size).each do |combo|
      independent = combo.collect { |position| @predictors[position] }
      @models.push(independent)
      fields = independent.flatten+@dependent
      subset = @build_from_dataset ? @ds.dup(fields) : @matrix.submatrix(fields)

      model_data = ModelData.new(independent, subset, self)
      models_data[independent.sort { |a,b| a.to_s<=>b.to_s }] = model_data
    end
  end
end
|
175
|
+
# For every model and every predictor it does not contain, passes the
# r2 of the model extended with that predictor to add_contribution on
# the base model.
def fill_models
  @models.each do |model|
    missing = @predictors.reject { |predictor| model.include? predictor }
    missing.each do |predictor|
      base = md(model)
      extended = md(model+[predictor])
      base.add_contribution(predictor, extended.r2)
    end
  end
end
|
185
|
+
private :create_models, :fill_models
|
186
|
+
|
187
|
+
# Compares the r2 of the single-predictor models for +i+ and +j+:
# 1 when i's is larger, 0 when it is smaller, 0.5 otherwise.
def dominance_for_nil_model(i,j)
  r2_i = md([i]).r2
  r2_j = md([j]).r2
  return 1 if r2_i > r2_j
  return 0 if r2_i < r2_j

  0.5
end
|
196
|
+
# Returns 1 if i D j (i dominates j), 0 if j dominates i and 0.5 if undetermined
|
197
|
+
# Total dominance of +i+ over +j+: compares their contributions in every
# model that has a value for both, plus the nil model. Returns 1 or 0
# when every comparison agrees and 0.5 on any tie or disagreement.
def total_dominance_pairwise(i,j)
  nil_model = dominance_for_nil_model(i,j)
  return 0.5 if nil_model==0.5

  outcomes = [nil_model]
  models_data.each do |_key, model|
    contrib_i = model.contributions[i]
    contrib_j = model.contributions[j]
    next if contrib_i.nil? || contrib_j.nil?

    if contrib_i > contrib_j
      outcomes.push(1)
    elsif contrib_i < contrib_j
      outcomes.push(0)
    else
      # A tie in any model leaves dominance undetermined.
      return 0.5
    end
  end
  distinct = outcomes.uniq
  distinct.size>1 ? 0.5 : distinct[0]
end
|
216
|
+
|
217
|
+
# Returns 1 if i cD j (i conditionally dominates j), 0 if j cD i and 0.5 if undetermined
|
218
|
+
# Conditional dominance of +i+ over +j+: compares their average
# contributions at every model size below the number of predictors, plus
# the nil model. Returns 1 or 0 when every comparison agrees and 0.5 on
# any tie or disagreement.
def conditional_dominance_pairwise(i,j)
  nil_model = dominance_for_nil_model(i,j)
  return 0.5 if nil_model==0.5

  outcomes = [nil_model]
  (1...@predictors.size).each do |size|
    level = average_k(size)
    if level[i] > level[j]
      outcomes.push(1)
    elsif level[i] < level[j]
      outcomes.push(0)
    else
      # A tie at any size leaves dominance undetermined.
      return 0.5
    end
  end
  distinct = outcomes.uniq
  distinct.size>1 ? 0.5 : distinct[0]
end
|
236
|
+
# Returns 1 if i gD j (i generally dominates j), 0 if j gD i and 0.5 if undetermined
|
237
|
+
# General dominance of +i+ over +j+, from the overall averages:
# 1 when i's average is larger, 0 when smaller, 0.5 otherwise.
def general_dominance_pairwise(i,j)
  overall = general_averages
  return 1 if overall[i] > overall[j]
  return 0 if overall[i] < overall[j]

  0.5
end
|
247
|
+
# Models that contain exactly two predictors.
def pairs
  models.select { |model| model.size == 2 }
end
|
250
|
+
# Hash of pair => total dominance value for every pair of predictors.
def total_dominance
  pairs.each_with_object({}) do |pair, result|
    result[pair] = total_dominance_pairwise(pair[0], pair[1])
  end
end
|
255
|
+
# Hash of pair => conditional dominance value for every pair of predictors.
def conditional_dominance
  pairs.each_with_object({}) do |pair, result|
    result[pair] = conditional_dominance_pairwise(pair[0], pair[1])
  end
end
|
260
|
+
# Hash of pair => general dominance value for every pair of predictors.
def general_dominance
  pairs.each_with_object({}) do |pair, result|
    result[pair] = general_dominance_pairwise(pair[0], pair[1])
  end
end
|
265
|
+
|
266
|
+
# ModelData for the model +m+. The lookup key is +m+ sorted by string
# value, matching the keys stored by create_models.
def md(m)
  key = m.sort { |a,b| a.to_s <=> b.to_s }
  models_data[key]
end
|
269
|
+
# Get all models of size k
|
270
|
+
# ModelData of every model in @models that has exactly +k+ predictors.
def md_k(k)
  @models.select { |model| model.size == k }.map { |model| md(model) }
end
|
275
|
+
|
276
|
+
# For a hash with arrays of numbers as values
|
277
|
+
# Returns a hash with same keys and
|
278
|
+
# value as the mean of values of original hash
|
279
|
+
# Takes a hash whose values are arrays of numbers and returns a hash
# with the same keys, each mapped to the mean of its array.
def get_averages(averages)
  averages.each_with_object({}) do |(key, values), means|
    means[key] = Daru::Vector.new(values).mean
  end
end
|
284
|
+
# Hash with average for each k size model.
|
285
|
+
# Hash of predictor => average contribution across the models of size
# +k+. Returns nil when +k+ equals the number of predictors.
def average_k(k)
  return nil if k==@predictors.size

  collected = @predictors.each_with_object({}) { |predictor, lists| lists[predictor] = [] }
  md_k(k).each do |model|
    @predictors.each do |predictor|
      contribution = model.contributions[predictor]
      # nil means there is no contribution recorded for this predictor.
      collected[predictor].push(contribution) unless contribution.nil?
    end
  end
  get_averages(collected)
end
|
296
|
+
# Hash of predictor => overall average, memoized in @general_averages.
# Each predictor's list holds the r2 of its single-predictor model
# followed by its average_k value for every size below the number of
# predictors.
def general_averages
  return @general_averages unless @general_averages.nil?

  series = @predictors.each_with_object({}) do |predictor, lists|
    lists[predictor] = [md([predictor]).r2]
  end
  (1...@predictors.size).each do |size|
    level = average_k(size)
    @predictors.each { |predictor| series[predictor].push(level[predictor]) }
  end
  @general_averages = get_averages(series)
end
|
309
|
+
|
310
|
+
|
311
|
+
# Adds the dominance analysis report to builder +g+: a result table with
# every model, the per-size averages and the overall averages, followed
# by a table of pairwise total, conditional and general dominance.
def report_building(g)
  compute if @models.nil?
  g.section(:name=>@name) do |generator|
    header = ["","r2",_("sign")]+@predictors.collect { |c| DominanceAnalysis.predictor_name(c) }

    generator.table(:name=>_("Dominance Analysis result"), :header=>header) do |t|
      # "Model 0" row: r2 of each predictor on its own.
      t.row([_("Model 0"),"",""]+@predictors.collect { |f| sprintf("%0.3f",md([f]).r2) })
      t.hr
      (1..@predictors.size).each do |size|
        md_k(size).each { |model| t.row(model.add_table_row) }
        # average_k returns nil for the full model, which gets no average row.
        level = average_k(size)
        next if level.nil?

        t.hr
        t.row([_("k=%d Average") % size,"",""] + @predictors.collect { |f| sprintf("%0.3f",level[f]) })
        t.hr
      end

      overall = general_averages
      t.hr
      t.row([_("Overall averages"),"",""]+@predictors.collect { |f| sprintf("%0.3f",overall[f]) })
    end

    total = total_dominance
    conditional = conditional_dominance
    general = general_dominance
    generator.table(:name=>_("Pairwise dominance"), :header=>[_("Pairs"),_("Total"),_("Conditional"),_("General")]) do |t|
      pairs.each do |pair|
        label = pair.map { |v| v.is_a?(Array) ? "("+v.join("-")+")" : v }.join(" - ")
        t.row([label, sprintf("%0.1f",total[pair]), sprintf("%0.1f",conditional[pair]), sprintf("%0.1f",general[pair])])
      end
    end
  end
end
|
362
|
+
# One regression model of a dominance analysis: a set of independent
# variables, the regression object fitted for them, and the contribution
# each remaining predictor adds on top of this model.
class ModelData # :nodoc:
  attr_reader :contributions

  # * independent: predictors included in this model
  # * data: input handed to the regression class
  # * da: the owning analysis; supplies predictors, dependent, cases,
  #   method_association and regression_class
  def initialize(independent, data, da)
    @independent = independent
    @data = data
    @predictors = da.predictors
    @dependent = da.dependent
    @cases = da.cases
    @method = da.method_association
    # Predictors already in the model start with a nil contribution.
    @contributions = @independent.each_with_object({}) { |v, acc| acc[v] = nil }

    # A single dependent is passed as a scalar, several as an array.
    target = @dependent.size == 1 ? @dependent[0] : @dependent
    @lr = da.regression_class.new(data, target, :cases=>@cases)
  end

  # Stores the contribution of predictor +f+: +v+ minus this model's r2.
  def add_contribution(f, v)
    @contributions[f] = v - r2
  end

  # Association measure of the model, read from the regression object
  # through the analysis' method_association.
  def r2
    @lr.send(@method)
  end

  # Predictor names joined with "*".
  def name
    @independent.collect { |variable|
      DominanceAnalysis.predictor_name(variable)
    }.join("*")
  end

  # Row for the result table: name, r2, significance ("???" when the
  # number of cases is unknown) and one cell per predictor ("--" when no
  # contribution is recorded).
  def add_table_row
    sign = @cases ? sprintf("%0.3f", @lr.probability) : "???"

    [name, sprintf("%0.3f", r2), sign] + @predictors.collect { |k|
      v = @contributions[k]
      v.nil? ? "--" : sprintf("%0.3f", v)
    }
  end

  # Two-line text summary: model name with r2 and p, then the
  # contributions separated by " | ".
  def summary
    # The format string has three placeholders; the fourth argument
    # (@lr.sst) that used to be passed was never printed.
    out = sprintf("%s: r2=%0.3f(p=%0.2f)\n", name, r2, @lr.significance)
    out << @predictors.collect { |k|
      v = @contributions[k]
      v.nil? ? "--" : sprintf("%s=%0.3f", k, v)
    }.join(" | ")
    out << "\n"
    out
  end
end # end ModelData
|
422
|
+
end # end Dominance Analysis
|
423
|
+
end
|
424
|
+
|
425
|
+
require 'statsample/dominanceanalysis/bootstrap'
|