statsample 0.6.3 → 0.6.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/History.txt +6 -0
- data/Manifest.txt +4 -0
- data/README.txt +5 -5
- data/demo/dominance_analysis_bootstrap.rb +9 -3
- data/demo/dominanceanalysis.rb +23 -7
- data/demo/multivariate_correlation.rb +26 -0
- data/lib/statsample.rb +1 -1
- data/lib/statsample/bivariate.rb +24 -4
- data/lib/statsample/bivariate/polychoric.rb +15 -14
- data/lib/statsample/converters.rb +27 -23
- data/lib/statsample/crosstab.rb +1 -44
- data/lib/statsample/dominanceanalysis.rb +158 -64
- data/lib/statsample/dominanceanalysis/bootstrap.rb +16 -7
- data/lib/statsample/matrix.rb +145 -13
- data/lib/statsample/multiset.rb +248 -265
- data/lib/statsample/regression.rb +3 -0
- data/lib/statsample/regression/multiple.rb +65 -23
- data/lib/statsample/regression/multiple/baseengine.rb +19 -20
- data/lib/statsample/regression/multiple/matrixengine.rb +187 -0
- data/lib/statsample/regression/multiple/rubyengine.rb +58 -98
- data/test/test_bivariate.rb +1 -0
- data/test/test_crosstab.rb +0 -3
- data/test/test_dataset.rb +379 -379
- data/test/test_dominance_analysis.rb +43 -0
- data/test/test_matrix.rb +52 -0
- data/test/test_regression.rb +174 -129
- data/test/test_svg_graph.rb +51 -51
- metadata +29 -3
data/History.txt
CHANGED
@@ -1,6 +1,12 @@
|
|
1
|
+
=== 0.6.4 / 2010-02-19
|
2
|
+
* Dominance Analysis and Dominance Analysis Bootstrap allow multivariate dependent analysis.
|
3
|
+
* Test suite for Dominance Analysis, using Azen and Budescu papers as references
|
4
|
+
* X^2 for polychoric correlation
|
5
|
+
|
1
6
|
=== 0.6.3 / 2010-02-15
|
2
7
|
* Statsample::Bivariate::Polychoric has joint estimation.
|
3
8
|
* Some extra documentation and bug fixes
|
9
|
+
|
4
10
|
=== 0.6.2 / 2010-02-11
|
5
11
|
* New Statsample::Bivariate::Polychoric. To be implemented: X2 and G2
|
6
12
|
* New matrix.rb, for faster development of Contingency Tables and Correlation Matrices
|
data/Manifest.txt
CHANGED
@@ -13,6 +13,7 @@ demo/correlation_matrix.rb
|
|
13
13
|
demo/dominance_analysis_bootstrap.rb
|
14
14
|
demo/dominanceanalysis.rb
|
15
15
|
demo/multiple_regression.rb
|
16
|
+
demo/multivariate_correlation.rb
|
16
17
|
demo/polychoric.rb
|
17
18
|
demo/tetrachoric.rb
|
18
19
|
lib/distribution.rb
|
@@ -63,6 +64,7 @@ lib/statsample/regression/multiple.rb
|
|
63
64
|
lib/statsample/regression/multiple/alglibengine.rb
|
64
65
|
lib/statsample/regression/multiple/baseengine.rb
|
65
66
|
lib/statsample/regression/multiple/gslengine.rb
|
67
|
+
lib/statsample/regression/multiple/matrixengine.rb
|
66
68
|
lib/statsample/regression/multiple/rubyengine.rb
|
67
69
|
lib/statsample/regression/simple.rb
|
68
70
|
lib/statsample/reliability.rb
|
@@ -83,11 +85,13 @@ test/test_csv.csv
|
|
83
85
|
test/test_csv.rb
|
84
86
|
test/test_dataset.rb
|
85
87
|
test/test_distribution.rb
|
88
|
+
test/test_dominance_analysis.rb
|
86
89
|
test/test_factor.rb
|
87
90
|
test/test_ggobi.rb
|
88
91
|
test/test_gsl.rb
|
89
92
|
test/test_histogram.rb
|
90
93
|
test/test_logit.rb
|
94
|
+
test/test_matrix.rb
|
91
95
|
test/test_mle.rb
|
92
96
|
test/test_multiset.rb
|
93
97
|
test/test_permutation.rb
|
data/README.txt
CHANGED
@@ -11,16 +11,16 @@ A suite for basic and advanced statistics. Includes:
|
|
11
11
|
* Correlations: Pearson (r), Rho, Tetrachoric, Polychoric
|
12
12
|
* Regression: Simple, Multiple, Probit and Logit
|
13
13
|
* Factorial Analysis: Extraction (PCA and Principal Axis) and Rotation (Varimax and relatives)
|
14
|
-
* Dominance Analysis (Azen & Budescu)
|
14
|
+
* Dominance Analysis, with multivariate dependent and bootstrap (Azen & Budescu)
|
15
15
|
* Sample calculation related formulas
|
16
16
|
|
17
17
|
== DETAILED FEATURES:
|
18
18
|
|
19
19
|
* Factorial Analysis. Principal Component Analysis and Principal Axis extraction, with orthogonal rotations (Varimax, Equimax, Quartimax)
|
20
|
-
* Multiple Regression. Listwise analysis optimized with use of Alglib library. Pairwise analysis is executed on pure ruby and reports same values as SPSS
|
20
|
+
* Multiple Regression. Listwise analysis optimized with use of Alglib library. Pairwise analysis is executed on pure ruby with matrices and reports same values as SPSS
|
21
21
|
* Module Bivariate provides covariance and pearson, spearman, point biserial, tau a, tau b, gamma, tetrachoric and polychoric correlations. Includes methods to create correlation (pearson and tetrachoric) and covariance matrices
|
22
22
|
* Regression module provides linear regression methods
|
23
|
-
* Dominance Analysis. Based on Budescu and Azen papers, <strong>DominanceAnalysis</strong> class can report dominance analysis for a sample and <strong>DominanceAnalysisBootstrap</strong> can execute bootstrap analysis to determine dominance stability, as recomended by Azen & Budescu (2003) link[http://psycnet.apa.org/journals/met/8/2/129/].
|
23
|
+
* Dominance Analysis. Based on Budescu and Azen papers, <strong>DominanceAnalysis</strong> class can report dominance analysis for a sample, using uni or multivariate dependent variables and <strong>DominanceAnalysisBootstrap</strong> can execute bootstrap analysis to determine dominance stability, as recommended by Azen & Budescu (2003) link[http://psycnet.apa.org/journals/met/8/2/129/].
|
24
24
|
* Classes for Vector, Datasets (set of Vectors) and Multisets (multiple datasets with same fields and type of vectors), and multiple methods to manipulate them
|
25
25
|
* Module Codification, to help to codify open questions
|
26
26
|
* Converters to and from database and csv files, and to output Mx and GGobi files
|
@@ -66,7 +66,7 @@ A suite for basic and advanced statistics. Includes:
|
|
66
66
|
Optional:
|
67
67
|
|
68
68
|
* Plotting: gnuplot and rbgnuplot, SVG::Graph
|
69
|
-
* Factorial analysis and polychorical correlation: gsl and rb-gsl (http://rb-gsl.rubyforge.org/)
|
69
|
+
* Factorial analysis and polychoric correlation: gsl library and rb-gsl (http://rb-gsl.rubyforge.org/). You should install it using <tt>gem install gsl</tt>
|
70
70
|
|
71
71
|
== DOWNLOAD
|
72
72
|
* Gems and bugs report: http://rubyforge.org/projects/ruby-statsample/
|
@@ -78,7 +78,7 @@ Optional:
|
|
78
78
|
|
79
79
|
For optimization on *nix env
|
80
80
|
|
81
|
-
sudo gem install ruby-statsample-optimization
|
81
|
+
sudo gem install gsl ruby-statsample-optimization
|
82
82
|
|
83
83
|
Available setup.rb file
|
84
84
|
|
@@ -10,7 +10,13 @@ d=100.times.collect {rand}.to_scale
|
|
10
10
|
|
11
11
|
ds={'a'=>a,'b'=>b,'c'=>c,'d'=>d}.to_dataset
|
12
12
|
|
13
|
-
ds['
|
14
|
-
|
15
|
-
|
13
|
+
ds['y1']=ds.collect{|row| row['a']*5+row['b']*2+row['c']*2+row['d']*2+10*rand()}
|
14
|
+
ds['y2']=ds.collect{|row| row['a']*10+rand()}
|
15
|
+
|
16
|
+
dab=Statsample::DominanceAnalysis::Bootstrap.new(ds, ['y1','y2'], :debug=>true)
|
17
|
+
dab.bootstrap(100,nil)
|
18
|
+
puts dab.summary
|
19
|
+
ds2=ds['a'..'y1']
|
20
|
+
dab=Statsample::DominanceAnalysis::Bootstrap.new(ds2, 'y1', :debug=>true)
|
21
|
+
dab.bootstrap(100,nil)
|
16
22
|
puts dab.summary
|
data/demo/dominanceanalysis.rb
CHANGED
@@ -2,10 +2,26 @@
|
|
2
2
|
$:.unshift(File.dirname(__FILE__)+'/../lib/')
|
3
3
|
|
4
4
|
require 'statsample'
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
5
|
+
sample=200
|
6
|
+
a=sample.times.collect {rand}.to_scale
|
7
|
+
b=sample.times.collect {rand}.to_scale
|
8
|
+
c=sample.times.collect {rand}.to_scale
|
9
|
+
d=sample.times.collect {rand}.to_scale
|
10
|
+
|
11
|
+
ds={'a'=>a,'b'=>b,'c'=>c,'d'=>d}.to_dataset
|
12
|
+
ds['y']=ds.collect{|row| row['a']*5+row['b']*3+row['c']*2+row['d']+rand()}
|
13
|
+
rb=ReportBuilder.new("Dominance Analysis")
|
14
|
+
|
15
|
+
cm=Statsample::Bivariate.correlation_matrix(ds)
|
16
|
+
rb.add(cm)
|
17
|
+
lr=Statsample::Regression::Multiple::RubyEngine.new(ds,'y')
|
18
|
+
rb.add(lr)
|
19
|
+
|
20
|
+
#da=Statsample::DominanceAnalysis.new(ds,'y')
|
21
|
+
#rb.add(da)
|
22
|
+
|
23
|
+
da=Statsample::DominanceAnalysis.new(ds,'y',:name=>"Dominance Analysis using group of predictors", :predictors=>['a', 'b', %w{c d}])
|
24
|
+
rb.add(da)
|
25
|
+
|
26
|
+
|
27
|
+
puts rb.to_text
|
@@ -0,0 +1,26 @@
|
|
1
|
+
#!/usr/bin/ruby
|
2
|
+
$:.unshift(File.dirname(__FILE__)+'/../lib/')
|
3
|
+
|
4
|
+
require 'statsample'
|
5
|
+
require 'mathn'
|
6
|
+
|
7
|
+
|
8
|
+
|
9
|
+
complete=Matrix[
|
10
|
+
[1,0.53,0.62,0.19,-0.09,0.08,0.02,-0.12,0.08],
|
11
|
+
[0.53,1,0.61,0.23,0.1,0.18,0.02,-0.1,0.15],
|
12
|
+
[0.62,0.61,1,0.03,0.1,0.12,0.03,-0.06,0.12],
|
13
|
+
[0.19,0.23,0.03,1,-0.02,0.02,0,-0.02,-0.02],
|
14
|
+
[-0.09,0.1,0.1,-0.02,1,0.05,0.06,0.18,0.02],
|
15
|
+
[0.08,0.18,0.12,0.02,0.05,1,0.22,-0.07,0.36],
|
16
|
+
[0.02,0.02,0.03,0,0.06,0.22,1,-0.01,-0.05],
|
17
|
+
[-0.12,-0.1,-0.06,-0.02,0.18,-0.07,-0.01,1,-0.03],
|
18
|
+
[0.08,0.15,0.12,-0.02,0.02,0.36,-0.05,-0.03,1]]
|
19
|
+
complete.extend Statsample::CovariateMatrix
|
20
|
+
complete.fields=%w{adhd cd odd sex age monly mwork mage poverty}
|
21
|
+
|
22
|
+
lr=Statsample::Regression::Multiple::MultipleDependent.new(complete, %w{adhd cd odd})
|
23
|
+
|
24
|
+
puts "R^2_yx #{lr.r2yx}"
|
25
|
+
puts "P^2_yx #{lr.p2yx}"
|
26
|
+
|
data/lib/statsample.rb
CHANGED
data/lib/statsample/bivariate.rb
CHANGED
@@ -120,13 +120,18 @@ module Statsample
|
|
120
120
|
# Order of rows and columns depends on Dataset#fields order
|
121
121
|
|
122
122
|
def covariance_matrix(ds)
|
123
|
-
ds.collect_matrix do |row,col|
|
123
|
+
matrix=ds.collect_matrix do |row,col|
|
124
124
|
if (ds[row].type!=:scale or ds[col].type!=:scale)
|
125
125
|
nil
|
126
|
+
elsif row==col
|
127
|
+
ds[row].variance
|
126
128
|
else
|
127
|
-
covariance(ds[row],ds[col])
|
129
|
+
covariance(ds[row], ds[col])
|
128
130
|
end
|
129
131
|
end
|
132
|
+
matrix.extend CovariateMatrix
|
133
|
+
matrix.fields=ds.fields
|
134
|
+
matrix
|
130
135
|
end
|
131
136
|
|
132
137
|
# Correlation matrix.
|
@@ -142,8 +147,8 @@ module Statsample
|
|
142
147
|
pearson(ds[row],ds[col])
|
143
148
|
end
|
144
149
|
end
|
145
|
-
cm.extend(Statsample::
|
146
|
-
cm.
|
150
|
+
cm.extend(Statsample::CovariateMatrix)
|
151
|
+
cm.fields=ds.fields
|
147
152
|
cm
|
148
153
|
end
|
149
154
|
|
@@ -282,6 +287,21 @@ module Statsample
|
|
282
287
|
}
|
283
288
|
sum-((v1a.sum*v2a.sum) / v1a.size.to_f)
|
284
289
|
end
|
290
|
+
|
291
|
+
# Report the minimum number of valid cases of a covariate matrix
|
292
|
+
# based on a dataset
|
293
|
+
def min_n_valid(ds)
|
294
|
+
min=ds.cases
|
295
|
+
m=n_valid_matrix(ds)
|
296
|
+
for x in 0...m.row_size
|
297
|
+
for y in 0...m.column_size
|
298
|
+
min=m[x,y] if m[x,y] < min
|
299
|
+
end
|
300
|
+
end
|
301
|
+
min
|
302
|
+
end
|
303
|
+
|
304
|
+
|
285
305
|
end
|
286
306
|
end
|
287
307
|
end
|
@@ -48,21 +48,21 @@ module Statsample
|
|
48
48
|
attr_accessor :max_iterations
|
49
49
|
# Debug algorithm (See iterations, for example)
|
50
50
|
attr_accessor :debug
|
51
|
-
# Minimizer type. Default
|
51
|
+
# Minimizer type for two step. Default "brent"
|
52
52
|
# See http://rb-gsl.rubyforge.org/min.html for reference.
|
53
53
|
attr_accessor :minimizer_type_two_step
|
54
54
|
|
55
|
-
# Minimizer type. Default
|
55
|
+
# Minimizer type for joint estimate. Default "nmsimplex"
|
56
56
|
# See http://rb-gsl.rubyforge.org/min.html for reference.
|
57
57
|
attr_accessor :minimizer_type_joint
|
58
58
|
|
59
59
|
|
60
60
|
# Method of calculation of polychoric series.
|
61
61
|
#
|
62
|
-
# :two_step:: two-step ML, based on code by Gegenfurtner(1992)
|
62
|
+
# :two_step:: two-step ML, based on code by Gegenfurtner(1992).
|
63
63
|
# :polychoric_series:: polychoric series estimate, using
|
64
|
-
# algorithm AS87 by Martinson and Hamdan (1975)
|
65
|
-
# :joint
|
64
|
+
# algorithm AS87 by Martinson and Hamdan (1975).
|
65
|
+
# :joint:: one-step ML, based on R package 'polycor'
|
66
66
|
# by J.Fox.
|
67
67
|
attr_accessor :method
|
68
68
|
# Absolute error for iteration.
|
@@ -73,7 +73,9 @@ module Statsample
|
|
73
73
|
|
74
74
|
# Log of algorithm
|
75
75
|
attr_reader :log
|
76
|
-
|
76
|
+
|
77
|
+
|
78
|
+
attr_reader :loglike_model
|
77
79
|
|
78
80
|
METHOD=:two_step
|
79
81
|
MAX_ITERATIONS=300
|
@@ -162,16 +164,15 @@ module Statsample
|
|
162
164
|
|
163
165
|
def loglike_data
|
164
166
|
loglike=0
|
165
|
-
@nr.times
|
166
|
-
@nc.times
|
167
|
+
@nr.times do |i|
|
168
|
+
@nc.times do |j|
|
167
169
|
res=@matrix[i,j].quo(@total)
|
168
170
|
if (res==0)
|
169
|
-
|
170
|
-
|
171
|
-
end
|
171
|
+
res=1e-16
|
172
|
+
end
|
172
173
|
loglike+= @matrix[i,j] * Math::log(res )
|
173
|
-
|
174
|
-
|
174
|
+
end
|
175
|
+
end
|
175
176
|
loglike
|
176
177
|
end
|
177
178
|
def chi_square
|
@@ -346,7 +347,7 @@ module Statsample
|
|
346
347
|
end
|
347
348
|
message+=sprintf("f() = %7.3f size = %.3f\n", minimizer.fval, minimizer.size)+"\n";
|
348
349
|
end while status == GSL::CONTINUE and iter < @max_iterations
|
349
|
-
@iteration
|
350
|
+
@iteration=iter
|
350
351
|
@log+=message
|
351
352
|
puts message if @debug
|
352
353
|
@r=minimizer.x[0]
|
@@ -155,6 +155,28 @@ raise "Should'nt be empty headers: [#{row.to_a.join(",")}]" if row.to_a.find_all
|
|
155
155
|
}
|
156
156
|
book.write(filename)
|
157
157
|
end
|
158
|
+
# This should be fixed.
|
159
|
+
# If we have a Formula, should be resolved first
|
160
|
+
|
161
|
+
def preprocess_row(row, dates)
|
162
|
+
i=-1
|
163
|
+
row.collect!{|c|
|
164
|
+
i+=1
|
165
|
+
if c.is_a? Spreadsheet::Formula
|
166
|
+
if(c.value.is_a? Spreadsheet::Excel::Error)
|
167
|
+
nil
|
168
|
+
else
|
169
|
+
c.value
|
170
|
+
end
|
171
|
+
elsif dates.include? i and !c.nil? and c.is_a? Numeric
|
172
|
+
row.date(i)
|
173
|
+
else
|
174
|
+
c
|
175
|
+
end
|
176
|
+
}
|
177
|
+
end
|
178
|
+
private :process_row
|
179
|
+
|
158
180
|
# Returns a dataset based on a xls file
|
159
181
|
# USE:
|
160
182
|
# ds = Statsample::Excel.read("test.xls")
|
@@ -177,27 +199,9 @@ raise "Should'nt be empty headers: [#{row.to_a.join(",")}]" if row.to_a.find_all
|
|
177
199
|
end
|
178
200
|
}
|
179
201
|
line_number+=1
|
180
|
-
if(line_number<=ignore_lines)
|
181
|
-
|
182
|
-
|
183
|
-
end
|
184
|
-
# This should be fixed.
|
185
|
-
# If we have a Formula, should be resolver first
|
186
|
-
i=-1
|
187
|
-
row.collect!{|c|
|
188
|
-
i+=1
|
189
|
-
if c.is_a? Spreadsheet::Formula
|
190
|
-
if(c.value.is_a? Spreadsheet::Excel::Error)
|
191
|
-
nil
|
192
|
-
else
|
193
|
-
c.value
|
194
|
-
end
|
195
|
-
elsif dates.include? i and !c.nil? and c.is_a? Numeric
|
196
|
-
row.date(i)
|
197
|
-
else
|
198
|
-
c
|
199
|
-
end
|
200
|
-
}
|
202
|
+
next if(line_number<=ignore_lines)
|
203
|
+
|
204
|
+
preprocess_row(row,dates)
|
201
205
|
if first_row
|
202
206
|
fields=extract_fields(row)
|
203
207
|
ds=Statsample::Dataset.new(fields)
|
@@ -210,8 +214,8 @@ raise "Should'nt be empty headers: [#{row.to_a.join(",")}]" if row.to_a.find_all
|
|
210
214
|
ds.add_case(rowa,false)
|
211
215
|
end
|
212
216
|
rescue => e
|
213
|
-
|
214
|
-
|
217
|
+
error="#{e.to_s}\nError on Line # #{line_number}:#{row.join(",")}"
|
218
|
+
raise
|
215
219
|
end
|
216
220
|
end
|
217
221
|
convert_to_scale_and_date(ds, fields)
|
data/lib/statsample/crosstab.rb
CHANGED
@@ -8,7 +8,7 @@ module Statsample
|
|
8
8
|
bindtextdomain("statsample")
|
9
9
|
attr_reader :v_rows, :v_cols
|
10
10
|
attr_accessor :row_label, :column_label, :name, :percentage_row, :percentage_column, :percentage_total
|
11
|
-
def initialize(v1,v2,opts=Hash.new)
|
11
|
+
def initialize(v1, v2, opts=Hash.new)
|
12
12
|
raise ArgumentError, "Both arguments should be Vectors" unless v1.is_a? Statsample::Vector and v2.is_a? Statsample::Vector
|
13
13
|
raise ArgumentError, "Vectors should be the same size" unless v1.size==v2.size
|
14
14
|
@v_rows, @v_cols=Statsample.only_valid(v1,v2)
|
@@ -191,48 +191,5 @@ module Statsample
|
|
191
191
|
t.add_row(t_row)
|
192
192
|
generator.parse_element(t)
|
193
193
|
end
|
194
|
-
|
195
|
-
|
196
|
-
|
197
|
-
def to_s
|
198
|
-
fq=frequencies
|
199
|
-
rn=rows_names
|
200
|
-
cn=cols_names
|
201
|
-
total=0
|
202
|
-
total_cols=cols_empty_hash
|
203
|
-
max_row_size = rn.inject(0) {|s,x| sl=@v_rows.labeling(x).size; sl>s ? sl : s}
|
204
|
-
|
205
|
-
max_row_size=max_row_size<6 ? 6 : max_row_size
|
206
|
-
|
207
|
-
max_col_size = cn.inject(0) {|s,x| sl=@v_cols.labeling(x).size; sl>s ? sl : s}
|
208
|
-
max_col_size = frequencies.inject(max_col_size) {|s,x| x[1].to_s.size>s ? x[1].to_s.size : s}
|
209
|
-
|
210
|
-
out=""
|
211
|
-
out << " " * (max_row_size+2) << "|" << cn.collect{|c| name=@v_cols.labeling(c); " "+name+(" "*(max_col_size-name.size))+" "}.join("|") << "| Total\n"
|
212
|
-
linea="-" * (max_row_size+2) << "|" << ("-"*(max_col_size+2) +"|")*cn.size << "-"*7 << "\n"
|
213
|
-
out << linea
|
214
|
-
rn.each{|row|
|
215
|
-
total_row=0;
|
216
|
-
name=@v_rows.labeling(row)
|
217
|
-
out << " " +name << " "*(max_row_size-name.size) << " | "
|
218
|
-
cn.each{|col|
|
219
|
-
data=fq[[row,col]].to_s
|
220
|
-
total_row+=fq[[row,col]]
|
221
|
-
total+=fq[[row,col]]
|
222
|
-
total_cols[col]+=fq[[row,col]]
|
223
|
-
out << " " << data << " "*(max_col_size-data.size) << "| "
|
224
|
-
}
|
225
|
-
out << " " << total_row.to_s
|
226
|
-
out << "\n"
|
227
|
-
}
|
228
|
-
out << linea
|
229
|
-
out << " Total " << " "*(max_row_size-5) << "| "
|
230
|
-
cn.each{|v|
|
231
|
-
data=total_cols[v].to_s
|
232
|
-
out << " " << data << " "*(max_col_size-data.size) << "| "
|
233
|
-
}
|
234
|
-
out << " " << total.to_s
|
235
|
-
out
|
236
|
-
end
|
237
194
|
end
|
238
195
|
end
|
@@ -4,7 +4,8 @@ module Statsample
|
|
4
4
|
# for all possible subset models, to identify the relevance of one or more
|
5
5
|
# predictors in the prediction of the criterion.
|
6
6
|
#
|
7
|
-
#
|
7
|
+
#
|
8
|
+
# See Budescu(1993), Azen & Budescu (2003, 2006) for more information.
|
8
9
|
#
|
9
10
|
# Example:
|
10
11
|
#
|
@@ -53,39 +54,127 @@ module Statsample
|
|
53
54
|
|
54
55
|
#
|
55
56
|
# == References:
|
56
|
-
# * Budescu, D. V. (1993). Dominance analysis: a new approach to the problem of relative importance of predictors in multiple regression.
|
57
|
-
# * Azen, R. & Budescu, D.V. (2003). The dominance analysis approach for comparing predictors in multiple regression.
|
57
|
+
# * Budescu, D. V. (1993). Dominance analysis: a new approach to the problem of relative importance of predictors in multiple regression. <em>Psychological Bulletin, 114</em>, 542-551.
|
58
|
+
# * Azen, R. & Budescu, D.V. (2003). The dominance analysis approach for comparing predictors in multiple regression. <em>Psychological Methods, 8</em>(2), 129-148.
|
59
|
+
# * Azen, R. & Budescu, D.V. (2006). Comparing predictors in Multivariate Regression Models: An extension of Dominance Analysis. <em>Journal of Educational and Behavioral Statistics, 31</em>(2), 157-180.
|
58
60
|
class DominanceAnalysis
|
59
61
|
include GetText
|
60
62
|
bindtextdomain("statsample")
|
61
|
-
# Class to generate the regressions. Default to Statsample::Regression::Multiple::
|
63
|
+
# Class to generate the regressions. Default to Statsample::Regression::Multiple::MatrixEngine
|
62
64
|
attr_accessor :regression_class
|
63
65
|
# Name of analysis
|
64
66
|
attr_accessor :name
|
67
|
+
# Set to true if you want to build from dataset, not correlation matrix
|
68
|
+
attr_accessor :build_from_dataset
|
69
|
+
# Array with independent variables. You could create subarrays,
|
70
|
+
# to test groups of predictors as blocks
|
71
|
+
attr_accessor :predictors
|
72
|
+
# If you provide a matrix as input, you should set
|
73
|
+
# the number of cases to define significance of R^2
|
74
|
+
attr_accessor :cases
|
75
|
+
# Method of :regression_class used to measure association.
|
76
|
+
#
|
77
|
+
# Only necessary to change if you have multivariate dependent.
|
78
|
+
# * :r2yx (R^2_yx), the default option, is the option when distinction
|
79
|
+
# between independent and dependent variables is arbitrary
|
80
|
+
# * :p2yx is the option when the distinction between independent and dependent variables is real.
|
81
|
+
#
|
82
|
+
|
83
|
+
attr_accessor :method_association
|
84
|
+
|
85
|
+
|
86
|
+
attr_reader :dependent
|
87
|
+
|
88
|
+
UNIVARIATE_REGRESSION_CLASS=Statsample::Regression::Multiple::MatrixEngine
|
89
|
+
MULTIVARIATE_REGRESSION_CLASS=Statsample::Regression::Multiple::MultipleDependent
|
65
90
|
|
91
|
+
def self.predictor_name(variable)
|
92
|
+
if variable.is_a? Array
|
93
|
+
sprintf("(%s)", variable.join(","))
|
94
|
+
else
|
95
|
+
variable
|
96
|
+
end
|
97
|
+
end
|
66
98
|
# Creates a new DominanceAnalysis object
|
67
|
-
#
|
68
|
-
# *
|
69
|
-
# *
|
70
|
-
#
|
71
|
-
#
|
72
|
-
|
73
|
-
|
74
|
-
@
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
99
|
+
# Parameters:
|
100
|
+
# * input: A Matrix or Dataset object
|
101
|
+
# * dependent: Name of dependent variable. Could be an array, if you want to
|
102
|
+
# do a Multivariate Regression Analysis. If nil, set to all
|
103
|
+
# fields on input, except criteria
|
104
|
+
|
105
|
+
def initialize(input, dependent, opts=Hash.new)
|
106
|
+
@build_from_dataset=false
|
107
|
+
if dependent.is_a? Array
|
108
|
+
@regression_class= MULTIVARIATE_REGRESSION_CLASS
|
109
|
+
@method_association=:r2yx
|
110
|
+
else
|
111
|
+
@regression_class= UNIVARIATE_REGRESSION_CLASS
|
112
|
+
@method_association=:r2
|
113
|
+
|
114
|
+
end
|
80
115
|
opts.each{|k,v|
|
81
116
|
self.send("#{k}=",v) if self.respond_to? k
|
82
117
|
}
|
118
|
+
@dependent=dependent
|
119
|
+
@dependent=[@dependent] unless @dependent.is_a? Array
|
120
|
+
|
121
|
+
@predictors ||= input.fields-@dependent
|
122
|
+
|
123
|
+
@name=_("Dominance Analysis: %s over %s") % [ @predictors.flatten.join(",") , @dependent.join(",")] if @name.nil?
|
124
|
+
|
125
|
+
if input.is_a? Statsample::Dataset
|
126
|
+
@ds=input
|
127
|
+
@matrix=Statsample::Bivariate.correlation_matrix(input)
|
128
|
+
@cases=Statsample::Bivariate.min_n_valid(input)
|
129
|
+
elsif input.is_a? ::Matrix
|
130
|
+
@ds=nil
|
131
|
+
@matrix=input
|
132
|
+
else
|
133
|
+
raise ArgumentError.new("You should use a Matrix or a Dataset")
|
134
|
+
end
|
135
|
+
@models=nil
|
136
|
+
|
137
|
+
end
|
138
|
+
# Compute models.
|
139
|
+
def compute
|
83
140
|
create_models
|
84
141
|
fill_models
|
85
142
|
end
|
143
|
+
def models
|
144
|
+
if @models.nil?
|
145
|
+
compute
|
146
|
+
end
|
147
|
+
@models
|
148
|
+
end
|
149
|
+
|
150
|
+
def models_data
|
151
|
+
if @models_data.nil?
|
152
|
+
compute
|
153
|
+
end
|
154
|
+
@models_data
|
155
|
+
end
|
156
|
+
def create_models
|
157
|
+
@models=[]
|
158
|
+
@models_data={}
|
159
|
+
for i in 1..@predictors.size
|
160
|
+
c=Statsample::Combination.new(i,@predictors.size)
|
161
|
+
c.each do |data|
|
162
|
+
independent=data.collect {|i1| @predictors[i1] }
|
163
|
+
@models.push(independent)
|
164
|
+
if (@build_from_dataset)
|
165
|
+
data=@ds.dup(independent.flatten+@dependent)
|
166
|
+
else
|
167
|
+
data=@matrix.submatrix(independent.flatten+@dependent)
|
168
|
+
end
|
169
|
+
|
170
|
+
modeldata=ModelData.new(independent, data, self)
|
171
|
+
models_data[independent.sort {|a,b| a.to_s<=>b.to_s}]=modeldata
|
172
|
+
end
|
173
|
+
end
|
174
|
+
end
|
86
175
|
def fill_models
|
87
176
|
@models.each do |m|
|
88
|
-
@
|
177
|
+
@predictors.each do |f|
|
89
178
|
next if m.include? f
|
90
179
|
base_model=md(m)
|
91
180
|
comp_model=md(m+[f])
|
@@ -93,6 +182,8 @@ module Statsample
|
|
93
182
|
end
|
94
183
|
end
|
95
184
|
end
|
185
|
+
private :create_models, :fill_models
|
186
|
+
|
96
187
|
def dominance_for_nil_model(i,j)
|
97
188
|
if md([i]).r2>md([j]).r2
|
98
189
|
1
|
@@ -107,7 +198,7 @@ module Statsample
|
|
107
198
|
dm=dominance_for_nil_model(i,j)
|
108
199
|
return 0.5 if dm==0.5
|
109
200
|
dominances=[dm]
|
110
|
-
|
201
|
+
models_data.each do |k,m|
|
111
202
|
if !m.contributions[i].nil? and !m.contributions[j].nil?
|
112
203
|
if m.contributions[i]>m.contributions[j]
|
113
204
|
dominances.push(1)
|
@@ -128,7 +219,7 @@ module Statsample
|
|
128
219
|
dm=dominance_for_nil_model(i,j)
|
129
220
|
return 0.5 if dm==0.5
|
130
221
|
dominances=[dm]
|
131
|
-
for k in 1...@
|
222
|
+
for k in 1...@predictors.size
|
132
223
|
a=average_k(k)
|
133
224
|
if a[i]>a[j]
|
134
225
|
dominances.push(1)
|
@@ -154,7 +245,7 @@ module Statsample
|
|
154
245
|
end
|
155
246
|
end
|
156
247
|
def pairs
|
157
|
-
|
248
|
+
models.find_all{|m| m.size==2}
|
158
249
|
end
|
159
250
|
def total_dominance
|
160
251
|
pairs.inject({}){|a,pair| a[pair]=total_dominance_pairwise(pair[0], pair[1])
|
@@ -162,20 +253,18 @@ module Statsample
|
|
162
253
|
}
|
163
254
|
end
|
164
255
|
def conditional_dominance
|
165
|
-
pairs.inject({}){|a,pair|
|
166
|
-
a[pair]=conditional_dominance_pairwise(pair[0], pair[1])
|
256
|
+
pairs.inject({}){|a,pair| a[pair]=conditional_dominance_pairwise(pair[0], pair[1])
|
167
257
|
a
|
168
258
|
}
|
169
259
|
end
|
170
260
|
def general_dominance
|
171
|
-
pairs.inject({}){|a,pair|
|
172
|
-
a[pair]=general_dominance_pairwise(pair[0], pair[1])
|
261
|
+
pairs.inject({}){|a,pair| a[pair]=general_dominance_pairwise(pair[0], pair[1])
|
173
262
|
a
|
174
263
|
}
|
175
264
|
end
|
176
265
|
|
177
266
|
def md(m)
|
178
|
-
|
267
|
+
models_data[m.sort {|a,b| a.to_s<=>b.to_s}]
|
179
268
|
end
|
180
269
|
# Get all model of size k
|
181
270
|
def md_k(k)
|
@@ -195,11 +284,11 @@ module Statsample
|
|
195
284
|
end
|
196
285
|
# Hash with average for each k size model.
|
197
286
|
def average_k(k)
|
198
|
-
return nil if k==@
|
287
|
+
return nil if k==@predictors.size
|
199
288
|
models=md_k(k)
|
200
|
-
averages=@
|
289
|
+
averages=@predictors.inject({}) {|a,v| a[v]=[];a}
|
201
290
|
models.each do |m|
|
202
|
-
@
|
291
|
+
@predictors.each do |f|
|
203
292
|
averages[f].push(m.contributions[f]) unless m.contributions[f].nil?
|
204
293
|
end
|
205
294
|
end
|
@@ -207,10 +296,10 @@ module Statsample
|
|
207
296
|
end
|
208
297
|
def general_averages
|
209
298
|
if @general_averages.nil?
|
210
|
-
averages=@
|
211
|
-
for k in 1...@
|
299
|
+
averages=@predictors.inject({}) {|a,v| a[v]=[md([v]).r2];a}
|
300
|
+
for k in 1...@predictors.size
|
212
301
|
ak=average_k(k)
|
213
|
-
@
|
302
|
+
@predictors.each do |f|
|
214
303
|
averages[f].push(ak[f])
|
215
304
|
end
|
216
305
|
end
|
@@ -218,36 +307,25 @@ module Statsample
|
|
218
307
|
end
|
219
308
|
@general_averages
|
220
309
|
end
|
221
|
-
|
222
|
-
@models=[]
|
223
|
-
@models_data={}
|
224
|
-
for i in 1..@fields.size
|
225
|
-
c=Statsample::Combination.new(i,@fields.size)
|
226
|
-
c.each do |data|
|
227
|
-
convert=data.collect {|i1| @fields[i1] }
|
228
|
-
@models.push(convert)
|
229
|
-
ds_prev=@ds.dup(convert+[@y_var])
|
230
|
-
modeldata=ModelData.new(convert,ds_prev, @y_var, @fields, @regression_class)
|
231
|
-
@models_data[convert.sort]=modeldata
|
232
|
-
end
|
233
|
-
end
|
234
|
-
end
|
310
|
+
|
235
311
|
def summary
|
236
312
|
rp=ReportBuilder.new()
|
237
313
|
rp.add(self)
|
238
314
|
rp.to_text
|
239
315
|
end
|
240
316
|
def to_reportbuilder(generator)
|
317
|
+
compute if @models.nil?
|
241
318
|
anchor=generator.add_toc_entry(_("DA: ")+@name)
|
242
319
|
generator.add_html "<div class='dominance-analysis'>#{@name}<a name='#{anchor}'></a>"
|
243
320
|
t=ReportBuilder::Table.new(:name=>_("Dominance Analysis result"))
|
244
|
-
|
245
|
-
|
321
|
+
|
322
|
+
t.header=["","r2",_("sign")]+@predictors.collect {|c| DominanceAnalysis.predictor_name(c) }
|
323
|
+
row=[_("Model 0"),"",""]+@predictors.collect{|f|
|
246
324
|
sprintf("%0.3f", md([f]).r2)
|
247
325
|
}
|
248
326
|
t.add_row(row)
|
249
327
|
t.add_horizontal_line
|
250
|
-
for i in 1..@
|
328
|
+
for i in 1..@predictors.size
|
251
329
|
mk=md_k(i)
|
252
330
|
mk.each{|m|
|
253
331
|
t.add_row(m.add_table_row)
|
@@ -256,7 +334,7 @@ module Statsample
|
|
256
334
|
a=average_k(i)
|
257
335
|
if !a.nil?
|
258
336
|
t.add_horizontal_line
|
259
|
-
row=[_("k=%d Average") % i,"",""] + @
|
337
|
+
row=[_("k=%d Average") % i,"",""] + @predictors.collect{|f|
|
260
338
|
sprintf("%0.3f",a[f])
|
261
339
|
}
|
262
340
|
t.add_row(row)
|
@@ -269,7 +347,7 @@ module Statsample
|
|
269
347
|
g=general_averages
|
270
348
|
t.add_horizontal_line
|
271
349
|
|
272
|
-
row=[_("Overall averages"),"",""]+@
|
350
|
+
row=[_("Overall averages"),"",""]+@predictors.collect{|f|
|
273
351
|
sprintf("%0.3f",g[f])
|
274
352
|
}
|
275
353
|
t.add_row(row)
|
@@ -289,26 +367,42 @@ module Statsample
|
|
289
367
|
end
|
290
368
|
class ModelData
|
291
369
|
attr_reader :contributions
|
292
|
-
def initialize(
|
293
|
-
@
|
294
|
-
@
|
295
|
-
@
|
296
|
-
|
297
|
-
@
|
370
|
+
def initialize(independent, data, da)
|
371
|
+
@independent=independent
|
372
|
+
@data=data
|
373
|
+
@predictors=da.predictors
|
374
|
+
@dependent=da.dependent
|
375
|
+
@cases=da.cases
|
376
|
+
@method=da.method_association
|
377
|
+
@contributions=@independent.inject({}){|a,v| a[v]=nil;a}
|
378
|
+
|
379
|
+
r_class=da.regression_class
|
380
|
+
|
381
|
+
if @dependent.size==1
|
382
|
+
@lr=r_class.new(data, @dependent[0], :cases=>@cases)
|
383
|
+
else
|
384
|
+
@lr=r_class.new(data, @dependent, :cases=>@cases)
|
385
|
+
end
|
298
386
|
end
|
299
|
-
def add_contribution(f,v)
|
387
|
+
def add_contribution(f, v)
|
300
388
|
@contributions[f]=v-r2
|
301
389
|
end
|
302
390
|
def r2
|
303
|
-
@lr.
|
391
|
+
@lr.send(@method)
|
392
|
+
end
|
393
|
+
def name
|
394
|
+
@independent.collect {|variable|
|
395
|
+
DominanceAnalysis.predictor_name(variable)
|
396
|
+
}.join("*")
|
304
397
|
end
|
305
398
|
def add_table_row
|
306
399
|
begin
|
307
|
-
|
400
|
+
sign=sprintf("%0.3f", @lr.significance)
|
308
401
|
rescue RuntimeError
|
309
|
-
|
402
|
+
sign="???"
|
310
403
|
end
|
311
|
-
|
404
|
+
|
405
|
+
[name, sprintf("%0.3f",r2), sign] + @predictors.collect{|k|
|
312
406
|
v=@contributions[k]
|
313
407
|
if v.nil?
|
314
408
|
"--"
|
@@ -318,8 +412,8 @@ module Statsample
|
|
318
412
|
}
|
319
413
|
end
|
320
414
|
def summary
|
321
|
-
out=sprintf("%s: r2=%0.3f(p=%0.2f)\n"
|
322
|
-
out << @
|
415
|
+
out=sprintf("%s: r2=%0.3f(p=%0.2f)\n",name, r2, @lr.significance, @lr.sst)
|
416
|
+
out << @predictors.collect{|k|
|
323
417
|
v=@contributions[k]
|
324
418
|
if v.nil?
|
325
419
|
"--"
|