statsample 0.6.3 → 0.6.4
Sign up to get free protection for your applications and to get access to all the features.
- data/History.txt +6 -0
- data/Manifest.txt +4 -0
- data/README.txt +5 -5
- data/demo/dominance_analysis_bootstrap.rb +9 -3
- data/demo/dominanceanalysis.rb +23 -7
- data/demo/multivariate_correlation.rb +26 -0
- data/lib/statsample.rb +1 -1
- data/lib/statsample/bivariate.rb +24 -4
- data/lib/statsample/bivariate/polychoric.rb +15 -14
- data/lib/statsample/converters.rb +27 -23
- data/lib/statsample/crosstab.rb +1 -44
- data/lib/statsample/dominanceanalysis.rb +158 -64
- data/lib/statsample/dominanceanalysis/bootstrap.rb +16 -7
- data/lib/statsample/matrix.rb +145 -13
- data/lib/statsample/multiset.rb +248 -265
- data/lib/statsample/regression.rb +3 -0
- data/lib/statsample/regression/multiple.rb +65 -23
- data/lib/statsample/regression/multiple/baseengine.rb +19 -20
- data/lib/statsample/regression/multiple/matrixengine.rb +187 -0
- data/lib/statsample/regression/multiple/rubyengine.rb +58 -98
- data/test/test_bivariate.rb +1 -0
- data/test/test_crosstab.rb +0 -3
- data/test/test_dataset.rb +379 -379
- data/test/test_dominance_analysis.rb +43 -0
- data/test/test_matrix.rb +52 -0
- data/test/test_regression.rb +174 -129
- data/test/test_svg_graph.rb +51 -51
- metadata +29 -3
data/History.txt
CHANGED
@@ -1,6 +1,12 @@
|
|
1
|
+
=== 0.6.4 / 2010-02-19
|
2
|
+
* Dominance Analysis and Dominance Analysis Bootstrap allow multivariate dependent analysis.
|
3
|
+
* Test suite for Dominance Analysis, using Azen and Budescu papers as references
|
4
|
+
* X^2 for polychoric correlation
|
5
|
+
|
1
6
|
=== 0.6.3 / 2010-02-15
|
2
7
|
* Statsample::Bivariate::Polychoric has joint estimation.
|
3
8
|
* Some extra documentation and bug fixes
|
9
|
+
|
4
10
|
=== 0.6.2 / 2010-02-11
|
5
11
|
* New Statsample::Bivariate::Polychoric. To be implemented: X2 and G2
|
6
12
|
* New matrix.rb, for faster development of Contingency Tables and Correlation Matrix
|
data/Manifest.txt
CHANGED
@@ -13,6 +13,7 @@ demo/correlation_matrix.rb
|
|
13
13
|
demo/dominance_analysis_bootstrap.rb
|
14
14
|
demo/dominanceanalysis.rb
|
15
15
|
demo/multiple_regression.rb
|
16
|
+
demo/multivariate_correlation.rb
|
16
17
|
demo/polychoric.rb
|
17
18
|
demo/tetrachoric.rb
|
18
19
|
lib/distribution.rb
|
@@ -63,6 +64,7 @@ lib/statsample/regression/multiple.rb
|
|
63
64
|
lib/statsample/regression/multiple/alglibengine.rb
|
64
65
|
lib/statsample/regression/multiple/baseengine.rb
|
65
66
|
lib/statsample/regression/multiple/gslengine.rb
|
67
|
+
lib/statsample/regression/multiple/matrixengine.rb
|
66
68
|
lib/statsample/regression/multiple/rubyengine.rb
|
67
69
|
lib/statsample/regression/simple.rb
|
68
70
|
lib/statsample/reliability.rb
|
@@ -83,11 +85,13 @@ test/test_csv.csv
|
|
83
85
|
test/test_csv.rb
|
84
86
|
test/test_dataset.rb
|
85
87
|
test/test_distribution.rb
|
88
|
+
test/test_dominance_analysis.rb
|
86
89
|
test/test_factor.rb
|
87
90
|
test/test_ggobi.rb
|
88
91
|
test/test_gsl.rb
|
89
92
|
test/test_histogram.rb
|
90
93
|
test/test_logit.rb
|
94
|
+
test/test_matrix.rb
|
91
95
|
test/test_mle.rb
|
92
96
|
test/test_multiset.rb
|
93
97
|
test/test_permutation.rb
|
data/README.txt
CHANGED
@@ -11,16 +11,16 @@ A suite for basic and advanced statistics. Includes:
|
|
11
11
|
* Correlations: Pearson (r), Rho, Tetrachoric, Polychoric
|
12
12
|
* Regression: Simple, Multiple, Probit and Logit
|
13
13
|
* Factorial Analysis: Extraction (PCA and Principal Axis) and Rotation (Varimax and relatives)
|
14
|
-
* Dominance Analysis (Azen & Budescu)
|
14
|
+
* Dominance Analysis, with multivariate dependent and bootstrap (Azen & Budescu)
|
15
15
|
* Sample calculation related formulas
|
16
16
|
|
17
17
|
== DETAILED FEATURES:
|
18
18
|
|
19
19
|
* Factorial Analysis. Principal Component Analysis and Principal Axis extraction, with orthogonal rotations (Varimax, Equimax, Quartimax)
|
20
|
-
* Multiple Regression. Listwise analysis optimized with use of Alglib library. Pairwise analysis is executed on pure ruby and reports same values as SPSS
|
20
|
+
* Multiple Regression. Listwise analysis optimized with use of Alglib library. Pairwise analysis is executed on pure ruby with matrices and reports same values as SPSS
|
21
21
|
* Module Bivariate provides covariance and pearson, spearman, point biserial, tau a, tau b, gamma, tetrachoric and polychoric correlations. Includes methods to create correlation (pearson and tetrachoric) and covariance matrices
|
22
22
|
* Regression module provides linear regression methods
|
23
|
-
* Dominance Analysis. Based on Budescu and Azen papers, <strong>DominanceAnalysis</strong> class can report dominance analysis for a sample and <strong>DominanceAnalysisBootstrap</strong> can execute bootstrap analysis to determine dominance stability, as recomended by Azen & Budescu (2003) link[http://psycnet.apa.org/journals/met/8/2/129/].
|
23
|
+
* Dominance Analysis. Based on Budescu and Azen papers, <strong>DominanceAnalysis</strong> class can report dominance analysis for a sample, using uni or multivariate dependent variables and <strong>DominanceAnalysisBootstrap</strong> can execute bootstrap analysis to determine dominance stability, as recommended by Azen & Budescu (2003) link[http://psycnet.apa.org/journals/met/8/2/129/].
|
24
24
|
* Classes for Vector, Datasets (set of Vectors) and Multisets (multiple datasets with same fields and type of vectors), and multiple methods to manipulate them
|
25
25
|
* Module Codification, to help to codify open questions
|
26
26
|
* Converters to and from database and csv files, and to output Mx and GGobi files
|
@@ -66,7 +66,7 @@ A suite for basic and advanced statistics. Includes:
|
|
66
66
|
Optional:
|
67
67
|
|
68
68
|
* Plotting: gnuplot and rbgnuplot, SVG::Graph
|
69
|
-
* Factorial analysis and polychorical correlation: gsl and rb-gsl (http://rb-gsl.rubyforge.org/)
|
69
|
+
* Factorial analysis and polychoric correlation: gsl library and rb-gsl (http://rb-gsl.rubyforge.org/). You should install it using <tt>gem install gsl</tt>
|
70
70
|
|
71
71
|
== DOWNLOAD
|
72
72
|
* Gems and bugs report: http://rubyforge.org/projects/ruby-statsample/
|
@@ -78,7 +78,7 @@ Optional:
|
|
78
78
|
|
79
79
|
For optimization on *nix env
|
80
80
|
|
81
|
-
sudo gem install ruby-statsample-optimization
|
81
|
+
sudo gem install gsl ruby-statsample-optimization
|
82
82
|
|
83
83
|
Available setup.rb file
|
84
84
|
|
@@ -10,7 +10,13 @@ d=100.times.collect {rand}.to_scale
|
|
10
10
|
|
11
11
|
ds={'a'=>a,'b'=>b,'c'=>c,'d'=>d}.to_dataset
|
12
12
|
|
13
|
-
ds['
|
14
|
-
|
15
|
-
|
13
|
+
ds['y1']=ds.collect{|row| row['a']*5+row['b']*2+row['c']*2+row['d']*2+10*rand()}
|
14
|
+
ds['y2']=ds.collect{|row| row['a']*10+rand()}
|
15
|
+
|
16
|
+
dab=Statsample::DominanceAnalysis::Bootstrap.new(ds, ['y1','y2'], :debug=>true)
|
17
|
+
dab.bootstrap(100,nil)
|
18
|
+
puts dab.summary
|
19
|
+
ds2=ds['a'..'y1']
|
20
|
+
dab=Statsample::DominanceAnalysis::Bootstrap.new(ds2, 'y1', :debug=>true)
|
21
|
+
dab.bootstrap(100,nil)
|
16
22
|
puts dab.summary
|
data/demo/dominanceanalysis.rb
CHANGED
@@ -2,10 +2,26 @@
|
|
2
2
|
$:.unshift(File.dirname(__FILE__)+'/../lib/')
|
3
3
|
|
4
4
|
require 'statsample'
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
5
|
+
sample=200
|
6
|
+
a=sample.times.collect {rand}.to_scale
|
7
|
+
b=sample.times.collect {rand}.to_scale
|
8
|
+
c=sample.times.collect {rand}.to_scale
|
9
|
+
d=sample.times.collect {rand}.to_scale
|
10
|
+
|
11
|
+
ds={'a'=>a,'b'=>b,'c'=>c,'d'=>d}.to_dataset
|
12
|
+
ds['y']=ds.collect{|row| row['a']*5+row['b']*3+row['c']*2+row['d']+rand()}
|
13
|
+
rb=ReportBuilder.new("Dominance Analysis")
|
14
|
+
|
15
|
+
cm=Statsample::Bivariate.correlation_matrix(ds)
|
16
|
+
rb.add(cm)
|
17
|
+
lr=Statsample::Regression::Multiple::RubyEngine.new(ds,'y')
|
18
|
+
rb.add(lr)
|
19
|
+
|
20
|
+
#da=Statsample::DominanceAnalysis.new(ds,'y')
|
21
|
+
#rb.add(da)
|
22
|
+
|
23
|
+
da=Statsample::DominanceAnalysis.new(ds,'y',:name=>"Dominance Analysis using group of predictors", :predictors=>['a', 'b', %w{c d}])
|
24
|
+
rb.add(da)
|
25
|
+
|
26
|
+
|
27
|
+
puts rb.to_text
|
@@ -0,0 +1,26 @@
|
|
1
|
+
#!/usr/bin/ruby
|
2
|
+
$:.unshift(File.dirname(__FILE__)+'/../lib/')
|
3
|
+
|
4
|
+
require 'statsample'
|
5
|
+
require 'mathn'
|
6
|
+
|
7
|
+
|
8
|
+
|
9
|
+
complete=Matrix[
|
10
|
+
[1,0.53,0.62,0.19,-0.09,0.08,0.02,-0.12,0.08],
|
11
|
+
[0.53,1,0.61,0.23,0.1,0.18,0.02,-0.1,0.15],
|
12
|
+
[0.62,0.61,1,0.03,0.1,0.12,0.03,-0.06,0.12],
|
13
|
+
[0.19,0.23,0.03,1,-0.02,0.02,0,-0.02,-0.02],
|
14
|
+
[-0.09,0.1,0.1,-0.02,1,0.05,0.06,0.18,0.02],
|
15
|
+
[0.08,0.18,0.12,0.02,0.05,1,0.22,-0.07,0.36],
|
16
|
+
[0.02,0.02,0.03,0,0.06,0.22,1,-0.01,-0.05],
|
17
|
+
[-0.12,-0.1,-0.06,-0.02,0.18,-0.07,-0.01,1,-0.03],
|
18
|
+
[0.08,0.15,0.12,-0.02,0.02,0.36,-0.05,-0.03,1]]
|
19
|
+
complete.extend Statsample::CovariateMatrix
|
20
|
+
complete.fields=%w{adhd cd odd sex age monly mwork mage poverty}
|
21
|
+
|
22
|
+
lr=Statsample::Regression::Multiple::MultipleDependent.new(complete, %w{adhd cd odd})
|
23
|
+
|
24
|
+
puts "R^2_yx #{lr.r2yx}"
|
25
|
+
puts "P^2_yx #{lr.p2yx}"
|
26
|
+
|
data/lib/statsample.rb
CHANGED
data/lib/statsample/bivariate.rb
CHANGED
@@ -120,13 +120,18 @@ module Statsample
|
|
120
120
|
# Order of rows and columns depends on Dataset#fields order
|
121
121
|
|
122
122
|
def covariance_matrix(ds)
|
123
|
-
ds.collect_matrix do |row,col|
|
123
|
+
matrix=ds.collect_matrix do |row,col|
|
124
124
|
if (ds[row].type!=:scale or ds[col].type!=:scale)
|
125
125
|
nil
|
126
|
+
elsif row==col
|
127
|
+
ds[row].variance
|
126
128
|
else
|
127
|
-
covariance(ds[row],ds[col])
|
129
|
+
covariance(ds[row], ds[col])
|
128
130
|
end
|
129
131
|
end
|
132
|
+
matrix.extend CovariateMatrix
|
133
|
+
matrix.fields=ds.fields
|
134
|
+
matrix
|
130
135
|
end
|
131
136
|
|
132
137
|
# Correlation matrix.
|
@@ -142,8 +147,8 @@ module Statsample
|
|
142
147
|
pearson(ds[row],ds[col])
|
143
148
|
end
|
144
149
|
end
|
145
|
-
cm.extend(Statsample::
|
146
|
-
cm.
|
150
|
+
cm.extend(Statsample::CovariateMatrix)
|
151
|
+
cm.fields=ds.fields
|
147
152
|
cm
|
148
153
|
end
|
149
154
|
|
@@ -282,6 +287,21 @@ module Statsample
|
|
282
287
|
}
|
283
288
|
sum-((v1a.sum*v2a.sum) / v1a.size.to_f)
|
284
289
|
end
|
290
|
+
|
291
|
+
# Report the minimum number of valid cases of a covariate matrix
|
292
|
+
# based on a dataset
|
293
|
+
def min_n_valid(ds)
|
294
|
+
min=ds.cases
|
295
|
+
m=n_valid_matrix(ds)
|
296
|
+
for x in 0...m.row_size
|
297
|
+
for y in 0...m.column_size
|
298
|
+
min=m[x,y] if m[x,y] < min
|
299
|
+
end
|
300
|
+
end
|
301
|
+
min
|
302
|
+
end
|
303
|
+
|
304
|
+
|
285
305
|
end
|
286
306
|
end
|
287
307
|
end
|
@@ -48,21 +48,21 @@ module Statsample
|
|
48
48
|
attr_accessor :max_iterations
|
49
49
|
# Debug algorithm (See iterations, for example)
|
50
50
|
attr_accessor :debug
|
51
|
-
# Minimizer type. Default
|
51
|
+
# Minimizer type for two step. Default "brent"
|
52
52
|
# See http://rb-gsl.rubyforge.org/min.html for reference.
|
53
53
|
attr_accessor :minimizer_type_two_step
|
54
54
|
|
55
|
-
# Minimizer type. Default
|
55
|
+
# Minimizer type for joint estimate. Default "nmsimplex"
|
56
56
|
# See http://rb-gsl.rubyforge.org/min.html for reference.
|
57
57
|
attr_accessor :minimizer_type_joint
|
58
58
|
|
59
59
|
|
60
60
|
# Method of calculation of polychoric series.
|
61
61
|
#
|
62
|
-
# :two_step:: two-step ML, based on code by Gegenfurtner(1992)
|
62
|
+
# :two_step:: two-step ML, based on code by Gegenfurtner(1992).
|
63
63
|
# :polychoric_series:: polychoric series estimate, using
|
64
|
-
# algorithm AS87 by Martinson and Hamdan (1975)
|
65
|
-
# :joint
|
64
|
+
# algorithm AS87 by Martinson and Hamdan (1975).
|
65
|
+
# :joint:: one-step ML, based on R package 'polycor'
|
66
66
|
# by J.Fox.
|
67
67
|
attr_accessor :method
|
68
68
|
# Absolute error for iteration.
|
@@ -73,7 +73,9 @@ module Statsample
|
|
73
73
|
|
74
74
|
# Log of algorithm
|
75
75
|
attr_reader :log
|
76
|
-
|
76
|
+
|
77
|
+
|
78
|
+
attr_reader :loglike_model
|
77
79
|
|
78
80
|
METHOD=:two_step
|
79
81
|
MAX_ITERATIONS=300
|
@@ -162,16 +164,15 @@ module Statsample
|
|
162
164
|
|
163
165
|
def loglike_data
|
164
166
|
loglike=0
|
165
|
-
@nr.times
|
166
|
-
@nc.times
|
167
|
+
@nr.times do |i|
|
168
|
+
@nc.times do |j|
|
167
169
|
res=@matrix[i,j].quo(@total)
|
168
170
|
if (res==0)
|
169
|
-
|
170
|
-
|
171
|
-
end
|
171
|
+
res=1e-16
|
172
|
+
end
|
172
173
|
loglike+= @matrix[i,j] * Math::log(res )
|
173
|
-
|
174
|
-
|
174
|
+
end
|
175
|
+
end
|
175
176
|
loglike
|
176
177
|
end
|
177
178
|
def chi_square
|
@@ -346,7 +347,7 @@ module Statsample
|
|
346
347
|
end
|
347
348
|
message+=sprintf("f() = %7.3f size = %.3f\n", minimizer.fval, minimizer.size)+"\n";
|
348
349
|
end while status == GSL::CONTINUE and iter < @max_iterations
|
349
|
-
@iteration
|
350
|
+
@iteration=iter
|
350
351
|
@log+=message
|
351
352
|
puts message if @debug
|
352
353
|
@r=minimizer.x[0]
|
@@ -155,6 +155,28 @@ raise "Should'nt be empty headers: [#{row.to_a.join(",")}]" if row.to_a.find_all
|
|
155
155
|
}
|
156
156
|
book.write(filename)
|
157
157
|
end
|
158
|
+
# This should be fixed.
|
159
|
+
# If we have a Formula, it should be resolved first
|
160
|
+
|
161
|
+
def preprocess_row(row, dates)
|
162
|
+
i=-1
|
163
|
+
row.collect!{|c|
|
164
|
+
i+=1
|
165
|
+
if c.is_a? Spreadsheet::Formula
|
166
|
+
if(c.value.is_a? Spreadsheet::Excel::Error)
|
167
|
+
nil
|
168
|
+
else
|
169
|
+
c.value
|
170
|
+
end
|
171
|
+
elsif dates.include? i and !c.nil? and c.is_a? Numeric
|
172
|
+
row.date(i)
|
173
|
+
else
|
174
|
+
c
|
175
|
+
end
|
176
|
+
}
|
177
|
+
end
|
178
|
+
private :process_row
|
179
|
+
|
158
180
|
# Returns a dataset based on a xls file
|
159
181
|
# USE:
|
160
182
|
# ds = Statsample::Excel.read("test.xls")
|
@@ -177,27 +199,9 @@ raise "Should'nt be empty headers: [#{row.to_a.join(",")}]" if row.to_a.find_all
|
|
177
199
|
end
|
178
200
|
}
|
179
201
|
line_number+=1
|
180
|
-
if(line_number<=ignore_lines)
|
181
|
-
|
182
|
-
|
183
|
-
end
|
184
|
-
# This should be fixed.
|
185
|
-
# If we have a Formula, should be resolver first
|
186
|
-
i=-1
|
187
|
-
row.collect!{|c|
|
188
|
-
i+=1
|
189
|
-
if c.is_a? Spreadsheet::Formula
|
190
|
-
if(c.value.is_a? Spreadsheet::Excel::Error)
|
191
|
-
nil
|
192
|
-
else
|
193
|
-
c.value
|
194
|
-
end
|
195
|
-
elsif dates.include? i and !c.nil? and c.is_a? Numeric
|
196
|
-
row.date(i)
|
197
|
-
else
|
198
|
-
c
|
199
|
-
end
|
200
|
-
}
|
202
|
+
next if(line_number<=ignore_lines)
|
203
|
+
|
204
|
+
preprocess_row(row,dates)
|
201
205
|
if first_row
|
202
206
|
fields=extract_fields(row)
|
203
207
|
ds=Statsample::Dataset.new(fields)
|
@@ -210,8 +214,8 @@ raise "Should'nt be empty headers: [#{row.to_a.join(",")}]" if row.to_a.find_all
|
|
210
214
|
ds.add_case(rowa,false)
|
211
215
|
end
|
212
216
|
rescue => e
|
213
|
-
|
214
|
-
|
217
|
+
error="#{e.to_s}\nError on Line # #{line_number}:#{row.join(",")}"
|
218
|
+
raise
|
215
219
|
end
|
216
220
|
end
|
217
221
|
convert_to_scale_and_date(ds, fields)
|
data/lib/statsample/crosstab.rb
CHANGED
@@ -8,7 +8,7 @@ module Statsample
|
|
8
8
|
bindtextdomain("statsample")
|
9
9
|
attr_reader :v_rows, :v_cols
|
10
10
|
attr_accessor :row_label, :column_label, :name, :percentage_row, :percentage_column, :percentage_total
|
11
|
-
def initialize(v1,v2,opts=Hash.new)
|
11
|
+
def initialize(v1, v2, opts=Hash.new)
|
12
12
|
raise ArgumentError, "Both arguments should be Vectors" unless v1.is_a? Statsample::Vector and v2.is_a? Statsample::Vector
|
13
13
|
raise ArgumentError, "Vectors should be the same size" unless v1.size==v2.size
|
14
14
|
@v_rows, @v_cols=Statsample.only_valid(v1,v2)
|
@@ -191,48 +191,5 @@ module Statsample
|
|
191
191
|
t.add_row(t_row)
|
192
192
|
generator.parse_element(t)
|
193
193
|
end
|
194
|
-
|
195
|
-
|
196
|
-
|
197
|
-
def to_s
|
198
|
-
fq=frequencies
|
199
|
-
rn=rows_names
|
200
|
-
cn=cols_names
|
201
|
-
total=0
|
202
|
-
total_cols=cols_empty_hash
|
203
|
-
max_row_size = rn.inject(0) {|s,x| sl=@v_rows.labeling(x).size; sl>s ? sl : s}
|
204
|
-
|
205
|
-
max_row_size=max_row_size<6 ? 6 : max_row_size
|
206
|
-
|
207
|
-
max_col_size = cn.inject(0) {|s,x| sl=@v_cols.labeling(x).size; sl>s ? sl : s}
|
208
|
-
max_col_size = frequencies.inject(max_col_size) {|s,x| x[1].to_s.size>s ? x[1].to_s.size : s}
|
209
|
-
|
210
|
-
out=""
|
211
|
-
out << " " * (max_row_size+2) << "|" << cn.collect{|c| name=@v_cols.labeling(c); " "+name+(" "*(max_col_size-name.size))+" "}.join("|") << "| Total\n"
|
212
|
-
linea="-" * (max_row_size+2) << "|" << ("-"*(max_col_size+2) +"|")*cn.size << "-"*7 << "\n"
|
213
|
-
out << linea
|
214
|
-
rn.each{|row|
|
215
|
-
total_row=0;
|
216
|
-
name=@v_rows.labeling(row)
|
217
|
-
out << " " +name << " "*(max_row_size-name.size) << " | "
|
218
|
-
cn.each{|col|
|
219
|
-
data=fq[[row,col]].to_s
|
220
|
-
total_row+=fq[[row,col]]
|
221
|
-
total+=fq[[row,col]]
|
222
|
-
total_cols[col]+=fq[[row,col]]
|
223
|
-
out << " " << data << " "*(max_col_size-data.size) << "| "
|
224
|
-
}
|
225
|
-
out << " " << total_row.to_s
|
226
|
-
out << "\n"
|
227
|
-
}
|
228
|
-
out << linea
|
229
|
-
out << " Total " << " "*(max_row_size-5) << "| "
|
230
|
-
cn.each{|v|
|
231
|
-
data=total_cols[v].to_s
|
232
|
-
out << " " << data << " "*(max_col_size-data.size) << "| "
|
233
|
-
}
|
234
|
-
out << " " << total.to_s
|
235
|
-
out
|
236
|
-
end
|
237
194
|
end
|
238
195
|
end
|
@@ -4,7 +4,8 @@ module Statsample
|
|
4
4
|
# for all possible subset models, to identify the relevance of one or more
|
5
5
|
# predictors in the prediction of the criterion.
|
6
6
|
#
|
7
|
-
#
|
7
|
+
#
|
8
|
+
# See Budescu(1993), Azen & Budescu (2003, 2006) for more information.
|
8
9
|
#
|
9
10
|
# Example:
|
10
11
|
#
|
@@ -53,39 +54,127 @@ module Statsample
|
|
53
54
|
|
54
55
|
#
|
55
56
|
# == References:
|
56
|
-
# * Budescu, D. V. (1993). Dominance analysis: a new approach to the problem of relative importance of predictors in multiple regression.
|
57
|
-
# * Azen, R. & Budescu, D.V. (2003). The dominance analysis approach for comparing predictors in multiple regression.
|
57
|
+
# * Budescu, D. V. (1993). Dominance analysis: a new approach to the problem of relative importance of predictors in multiple regression. <em>Psychological Bulletin, 114</em>, 542-551.
|
58
|
+
# * Azen, R. & Budescu, D.V. (2003). The dominance analysis approach for comparing predictors in multiple regression. <em>Psychological Methods, 8</em>(2), 129-148.
|
59
|
+
# * Azen, R. & Budescu, D.V. (2006). Comparing predictors in Multivariate Regression Models: An extension of Dominance Analysis. <em>Journal of Educational and Behavioral Statistics, 31</em>(2), 157-180.
|
58
60
|
class DominanceAnalysis
|
59
61
|
include GetText
|
60
62
|
bindtextdomain("statsample")
|
61
|
-
# Class to generate the regressions. Default to Statsample::Regression::Multiple::
|
63
|
+
# Class to generate the regressions. Default to Statsample::Regression::Multiple::MatrixEngine
|
62
64
|
attr_accessor :regression_class
|
63
65
|
# Name of analysis
|
64
66
|
attr_accessor :name
|
67
|
+
# Set to true if you want to build from dataset, not correlation matrix
|
68
|
+
attr_accessor :build_from_dataset
|
69
|
+
# Array with independent variables. You could create subarrays,
|
70
|
+
# to test groups of predictors as blocks
|
71
|
+
attr_accessor :predictors
|
72
|
+
# If you provide a matrix as input, you should set
|
73
|
+
# the number of cases to define significance of R^2
|
74
|
+
attr_accessor :cases
|
75
|
+
# Method of :regression_class used to measure association.
|
76
|
+
#
|
77
|
+
# Only necessary to change if you have multivariate dependent.
|
78
|
+
# * :r2yx (R^2_yx), the default option, is the option when distinction
|
79
|
+
# between independent and dependent variables is arbitrary
|
80
|
+
# * :p2yx is the option when the distinction between independent and dependent variables is real.
|
81
|
+
#
|
82
|
+
|
83
|
+
attr_accessor :method_association
|
84
|
+
|
85
|
+
|
86
|
+
attr_reader :dependent
|
87
|
+
|
88
|
+
UNIVARIATE_REGRESSION_CLASS=Statsample::Regression::Multiple::MatrixEngine
|
89
|
+
MULTIVARIATE_REGRESSION_CLASS=Statsample::Regression::Multiple::MultipleDependent
|
65
90
|
|
91
|
+
def self.predictor_name(variable)
|
92
|
+
if variable.is_a? Array
|
93
|
+
sprintf("(%s)", variable.join(","))
|
94
|
+
else
|
95
|
+
variable
|
96
|
+
end
|
97
|
+
end
|
66
98
|
# Creates a new DominanceAnalysis object
|
67
|
-
#
|
68
|
-
# *
|
69
|
-
# *
|
70
|
-
#
|
71
|
-
#
|
72
|
-
|
73
|
-
|
74
|
-
@
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
99
|
+
# Parameters:
|
100
|
+
# * input: A Matrix or Dataset object
|
101
|
+
# * dependent: Name of dependent variable. Could be an array, if you want to
|
102
|
+
# do a Multivariate Regression Analysis. If nil, set to all
|
103
|
+
# fields on input, except criteria
|
104
|
+
|
105
|
+
def initialize(input, dependent, opts=Hash.new)
|
106
|
+
@build_from_dataset=false
|
107
|
+
if dependent.is_a? Array
|
108
|
+
@regression_class= MULTIVARIATE_REGRESSION_CLASS
|
109
|
+
@method_association=:r2yx
|
110
|
+
else
|
111
|
+
@regression_class= UNIVARIATE_REGRESSION_CLASS
|
112
|
+
@method_association=:r2
|
113
|
+
|
114
|
+
end
|
80
115
|
opts.each{|k,v|
|
81
116
|
self.send("#{k}=",v) if self.respond_to? k
|
82
117
|
}
|
118
|
+
@dependent=dependent
|
119
|
+
@dependent=[@dependent] unless @dependent.is_a? Array
|
120
|
+
|
121
|
+
@predictors ||= input.fields-@dependent
|
122
|
+
|
123
|
+
@name=_("Dominance Analysis: %s over %s") % [ @predictors.flatten.join(",") , @dependent.join(",")] if @name.nil?
|
124
|
+
|
125
|
+
if input.is_a? Statsample::Dataset
|
126
|
+
@ds=input
|
127
|
+
@matrix=Statsample::Bivariate.correlation_matrix(input)
|
128
|
+
@cases=Statsample::Bivariate.min_n_valid(input)
|
129
|
+
elsif input.is_a? ::Matrix
|
130
|
+
@ds=nil
|
131
|
+
@matrix=input
|
132
|
+
else
|
133
|
+
raise ArgumentError.new("You should use a Matrix or a Dataset")
|
134
|
+
end
|
135
|
+
@models=nil
|
136
|
+
|
137
|
+
end
|
138
|
+
# Compute models.
|
139
|
+
def compute
|
83
140
|
create_models
|
84
141
|
fill_models
|
85
142
|
end
|
143
|
+
def models
|
144
|
+
if @models.nil?
|
145
|
+
compute
|
146
|
+
end
|
147
|
+
@models
|
148
|
+
end
|
149
|
+
|
150
|
+
def models_data
|
151
|
+
if @models_data.nil?
|
152
|
+
compute
|
153
|
+
end
|
154
|
+
@models_data
|
155
|
+
end
|
156
|
+
def create_models
|
157
|
+
@models=[]
|
158
|
+
@models_data={}
|
159
|
+
for i in 1..@predictors.size
|
160
|
+
c=Statsample::Combination.new(i,@predictors.size)
|
161
|
+
c.each do |data|
|
162
|
+
independent=data.collect {|i1| @predictors[i1] }
|
163
|
+
@models.push(independent)
|
164
|
+
if (@build_from_dataset)
|
165
|
+
data=@ds.dup(independent.flatten+@dependent)
|
166
|
+
else
|
167
|
+
data=@matrix.submatrix(independent.flatten+@dependent)
|
168
|
+
end
|
169
|
+
|
170
|
+
modeldata=ModelData.new(independent, data, self)
|
171
|
+
models_data[independent.sort {|a,b| a.to_s<=>b.to_s}]=modeldata
|
172
|
+
end
|
173
|
+
end
|
174
|
+
end
|
86
175
|
def fill_models
|
87
176
|
@models.each do |m|
|
88
|
-
@
|
177
|
+
@predictors.each do |f|
|
89
178
|
next if m.include? f
|
90
179
|
base_model=md(m)
|
91
180
|
comp_model=md(m+[f])
|
@@ -93,6 +182,8 @@ module Statsample
|
|
93
182
|
end
|
94
183
|
end
|
95
184
|
end
|
185
|
+
private :create_models, :fill_models
|
186
|
+
|
96
187
|
def dominance_for_nil_model(i,j)
|
97
188
|
if md([i]).r2>md([j]).r2
|
98
189
|
1
|
@@ -107,7 +198,7 @@ module Statsample
|
|
107
198
|
dm=dominance_for_nil_model(i,j)
|
108
199
|
return 0.5 if dm==0.5
|
109
200
|
dominances=[dm]
|
110
|
-
|
201
|
+
models_data.each do |k,m|
|
111
202
|
if !m.contributions[i].nil? and !m.contributions[j].nil?
|
112
203
|
if m.contributions[i]>m.contributions[j]
|
113
204
|
dominances.push(1)
|
@@ -128,7 +219,7 @@ module Statsample
|
|
128
219
|
dm=dominance_for_nil_model(i,j)
|
129
220
|
return 0.5 if dm==0.5
|
130
221
|
dominances=[dm]
|
131
|
-
for k in 1...@
|
222
|
+
for k in 1...@predictors.size
|
132
223
|
a=average_k(k)
|
133
224
|
if a[i]>a[j]
|
134
225
|
dominances.push(1)
|
@@ -154,7 +245,7 @@ module Statsample
|
|
154
245
|
end
|
155
246
|
end
|
156
247
|
def pairs
|
157
|
-
|
248
|
+
models.find_all{|m| m.size==2}
|
158
249
|
end
|
159
250
|
def total_dominance
|
160
251
|
pairs.inject({}){|a,pair| a[pair]=total_dominance_pairwise(pair[0], pair[1])
|
@@ -162,20 +253,18 @@ module Statsample
|
|
162
253
|
}
|
163
254
|
end
|
164
255
|
def conditional_dominance
|
165
|
-
pairs.inject({}){|a,pair|
|
166
|
-
a[pair]=conditional_dominance_pairwise(pair[0], pair[1])
|
256
|
+
pairs.inject({}){|a,pair| a[pair]=conditional_dominance_pairwise(pair[0], pair[1])
|
167
257
|
a
|
168
258
|
}
|
169
259
|
end
|
170
260
|
def general_dominance
|
171
|
-
pairs.inject({}){|a,pair|
|
172
|
-
a[pair]=general_dominance_pairwise(pair[0], pair[1])
|
261
|
+
pairs.inject({}){|a,pair| a[pair]=general_dominance_pairwise(pair[0], pair[1])
|
173
262
|
a
|
174
263
|
}
|
175
264
|
end
|
176
265
|
|
177
266
|
def md(m)
|
178
|
-
|
267
|
+
models_data[m.sort {|a,b| a.to_s<=>b.to_s}]
|
179
268
|
end
|
180
269
|
# Get all models of size k
|
181
270
|
def md_k(k)
|
@@ -195,11 +284,11 @@ module Statsample
|
|
195
284
|
end
|
196
285
|
# Hash with average for each k size model.
|
197
286
|
def average_k(k)
|
198
|
-
return nil if k==@
|
287
|
+
return nil if k==@predictors.size
|
199
288
|
models=md_k(k)
|
200
|
-
averages=@
|
289
|
+
averages=@predictors.inject({}) {|a,v| a[v]=[];a}
|
201
290
|
models.each do |m|
|
202
|
-
@
|
291
|
+
@predictors.each do |f|
|
203
292
|
averages[f].push(m.contributions[f]) unless m.contributions[f].nil?
|
204
293
|
end
|
205
294
|
end
|
@@ -207,10 +296,10 @@ module Statsample
|
|
207
296
|
end
|
208
297
|
def general_averages
|
209
298
|
if @general_averages.nil?
|
210
|
-
averages=@
|
211
|
-
for k in 1...@
|
299
|
+
averages=@predictors.inject({}) {|a,v| a[v]=[md([v]).r2];a}
|
300
|
+
for k in 1...@predictors.size
|
212
301
|
ak=average_k(k)
|
213
|
-
@
|
302
|
+
@predictors.each do |f|
|
214
303
|
averages[f].push(ak[f])
|
215
304
|
end
|
216
305
|
end
|
@@ -218,36 +307,25 @@ module Statsample
|
|
218
307
|
end
|
219
308
|
@general_averages
|
220
309
|
end
|
221
|
-
|
222
|
-
@models=[]
|
223
|
-
@models_data={}
|
224
|
-
for i in 1..@fields.size
|
225
|
-
c=Statsample::Combination.new(i,@fields.size)
|
226
|
-
c.each do |data|
|
227
|
-
convert=data.collect {|i1| @fields[i1] }
|
228
|
-
@models.push(convert)
|
229
|
-
ds_prev=@ds.dup(convert+[@y_var])
|
230
|
-
modeldata=ModelData.new(convert,ds_prev, @y_var, @fields, @regression_class)
|
231
|
-
@models_data[convert.sort]=modeldata
|
232
|
-
end
|
233
|
-
end
|
234
|
-
end
|
310
|
+
|
235
311
|
def summary
|
236
312
|
rp=ReportBuilder.new()
|
237
313
|
rp.add(self)
|
238
314
|
rp.to_text
|
239
315
|
end
|
240
316
|
def to_reportbuilder(generator)
|
317
|
+
compute if @models.nil?
|
241
318
|
anchor=generator.add_toc_entry(_("DA: ")+@name)
|
242
319
|
generator.add_html "<div class='dominance-analysis'>#{@name}<a name='#{anchor}'></a>"
|
243
320
|
t=ReportBuilder::Table.new(:name=>_("Dominance Analysis result"))
|
244
|
-
|
245
|
-
|
321
|
+
|
322
|
+
t.header=["","r2",_("sign")]+@predictors.collect {|c| DominanceAnalysis.predictor_name(c) }
|
323
|
+
row=[_("Model 0"),"",""]+@predictors.collect{|f|
|
246
324
|
sprintf("%0.3f", md([f]).r2)
|
247
325
|
}
|
248
326
|
t.add_row(row)
|
249
327
|
t.add_horizontal_line
|
250
|
-
for i in 1..@
|
328
|
+
for i in 1..@predictors.size
|
251
329
|
mk=md_k(i)
|
252
330
|
mk.each{|m|
|
253
331
|
t.add_row(m.add_table_row)
|
@@ -256,7 +334,7 @@ module Statsample
|
|
256
334
|
a=average_k(i)
|
257
335
|
if !a.nil?
|
258
336
|
t.add_horizontal_line
|
259
|
-
row=[_("k=%d Average") % i,"",""] + @
|
337
|
+
row=[_("k=%d Average") % i,"",""] + @predictors.collect{|f|
|
260
338
|
sprintf("%0.3f",a[f])
|
261
339
|
}
|
262
340
|
t.add_row(row)
|
@@ -269,7 +347,7 @@ module Statsample
|
|
269
347
|
g=general_averages
|
270
348
|
t.add_horizontal_line
|
271
349
|
|
272
|
-
row=[_("Overall averages"),"",""]+@
|
350
|
+
row=[_("Overall averages"),"",""]+@predictors.collect{|f|
|
273
351
|
sprintf("%0.3f",g[f])
|
274
352
|
}
|
275
353
|
t.add_row(row)
|
@@ -289,26 +367,42 @@ module Statsample
|
|
289
367
|
end
|
290
368
|
class ModelData
|
291
369
|
attr_reader :contributions
|
292
|
-
def initialize(
|
293
|
-
@
|
294
|
-
@
|
295
|
-
@
|
296
|
-
|
297
|
-
@
|
370
|
+
def initialize(independent, data, da)
|
371
|
+
@independent=independent
|
372
|
+
@data=data
|
373
|
+
@predictors=da.predictors
|
374
|
+
@dependent=da.dependent
|
375
|
+
@cases=da.cases
|
376
|
+
@method=da.method_association
|
377
|
+
@contributions=@independent.inject({}){|a,v| a[v]=nil;a}
|
378
|
+
|
379
|
+
r_class=da.regression_class
|
380
|
+
|
381
|
+
if @dependent.size==1
|
382
|
+
@lr=r_class.new(data, @dependent[0], :cases=>@cases)
|
383
|
+
else
|
384
|
+
@lr=r_class.new(data, @dependent, :cases=>@cases)
|
385
|
+
end
|
298
386
|
end
|
299
|
-
def add_contribution(f,v)
|
387
|
+
def add_contribution(f, v)
|
300
388
|
@contributions[f]=v-r2
|
301
389
|
end
|
302
390
|
def r2
|
303
|
-
@lr.
|
391
|
+
@lr.send(@method)
|
392
|
+
end
|
393
|
+
def name
|
394
|
+
@independent.collect {|variable|
|
395
|
+
DominanceAnalysis.predictor_name(variable)
|
396
|
+
}.join("*")
|
304
397
|
end
|
305
398
|
def add_table_row
|
306
399
|
begin
|
307
|
-
|
400
|
+
sign=sprintf("%0.3f", @lr.significance)
|
308
401
|
rescue RuntimeError
|
309
|
-
|
402
|
+
sign="???"
|
310
403
|
end
|
311
|
-
|
404
|
+
|
405
|
+
[name, sprintf("%0.3f",r2), sign] + @predictors.collect{|k|
|
312
406
|
v=@contributions[k]
|
313
407
|
if v.nil?
|
314
408
|
"--"
|
@@ -318,8 +412,8 @@ module Statsample
|
|
318
412
|
}
|
319
413
|
end
|
320
414
|
def summary
|
321
|
-
out=sprintf("%s: r2=%0.3f(p=%0.2f)\n"
|
322
|
-
out << @
|
415
|
+
out=sprintf("%s: r2=%0.3f(p=%0.2f)\n",name, r2, @lr.significance, @lr.sst)
|
416
|
+
out << @predictors.collect{|k|
|
323
417
|
v=@contributions[k]
|
324
418
|
if v.nil?
|
325
419
|
"--"
|