statsample-ekatena 2.0.2

Sign up to get free protection for your applications and to get access to all the features.
Files changed (156) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +15 -0
  3. data/.travis.yml +23 -0
  4. data/CONTRIBUTING.md +17 -0
  5. data/Gemfile +2 -0
  6. data/History.txt +457 -0
  7. data/LICENSE.txt +12 -0
  8. data/README.md +175 -0
  9. data/Rakefile +44 -0
  10. data/benchmarks/correlation_matrix_15_variables.rb +32 -0
  11. data/benchmarks/correlation_matrix_5_variables.rb +33 -0
  12. data/benchmarks/correlation_matrix_methods/correlation_matrix.ds +0 -0
  13. data/benchmarks/correlation_matrix_methods/correlation_matrix.html +93 -0
  14. data/benchmarks/correlation_matrix_methods/correlation_matrix.rb +71 -0
  15. data/benchmarks/correlation_matrix_methods/correlation_matrix.xls +0 -0
  16. data/benchmarks/correlation_matrix_methods/correlation_matrix_gsl_ruby.ods +0 -0
  17. data/benchmarks/correlation_matrix_methods/correlation_matrix_with_graphics.ods +0 -0
  18. data/benchmarks/correlation_matrix_methods/results.ds +0 -0
  19. data/benchmarks/factor_map.rb +37 -0
  20. data/benchmarks/helpers_benchmark.rb +5 -0
  21. data/data/locale/es/LC_MESSAGES/statsample.mo +0 -0
  22. data/doc_latex/manual/equations.tex +78 -0
  23. data/examples/boxplot.rb +28 -0
  24. data/examples/chisquare_test.rb +23 -0
  25. data/examples/correlation_matrix.rb +32 -0
  26. data/examples/dataset.rb +30 -0
  27. data/examples/dominance_analysis.rb +33 -0
  28. data/examples/dominance_analysis_bootstrap.rb +32 -0
  29. data/examples/histogram.rb +26 -0
  30. data/examples/icc.rb +24 -0
  31. data/examples/levene.rb +29 -0
  32. data/examples/multiple_regression.rb +20 -0
  33. data/examples/multivariate_correlation.rb +33 -0
  34. data/examples/parallel_analysis.rb +40 -0
  35. data/examples/polychoric.rb +40 -0
  36. data/examples/principal_axis.rb +26 -0
  37. data/examples/reliability.rb +31 -0
  38. data/examples/scatterplot.rb +25 -0
  39. data/examples/t_test.rb +27 -0
  40. data/examples/tetrachoric.rb +17 -0
  41. data/examples/u_test.rb +24 -0
  42. data/examples/vector.rb +20 -0
  43. data/examples/velicer_map_test.rb +46 -0
  44. data/grab_references.rb +29 -0
  45. data/lib/spss.rb +134 -0
  46. data/lib/statsample-ekatena/analysis.rb +100 -0
  47. data/lib/statsample-ekatena/analysis/suite.rb +89 -0
  48. data/lib/statsample-ekatena/analysis/suitereportbuilder.rb +44 -0
  49. data/lib/statsample-ekatena/anova.rb +24 -0
  50. data/lib/statsample-ekatena/anova/contrast.rb +79 -0
  51. data/lib/statsample-ekatena/anova/oneway.rb +187 -0
  52. data/lib/statsample-ekatena/anova/twoway.rb +207 -0
  53. data/lib/statsample-ekatena/bivariate.rb +406 -0
  54. data/lib/statsample-ekatena/bivariate/pearson.rb +54 -0
  55. data/lib/statsample-ekatena/codification.rb +182 -0
  56. data/lib/statsample-ekatena/converter/csv.rb +28 -0
  57. data/lib/statsample-ekatena/converter/spss.rb +48 -0
  58. data/lib/statsample-ekatena/converters.rb +211 -0
  59. data/lib/statsample-ekatena/crosstab.rb +188 -0
  60. data/lib/statsample-ekatena/daru.rb +115 -0
  61. data/lib/statsample-ekatena/dataset.rb +10 -0
  62. data/lib/statsample-ekatena/dominanceanalysis.rb +425 -0
  63. data/lib/statsample-ekatena/dominanceanalysis/bootstrap.rb +232 -0
  64. data/lib/statsample-ekatena/factor.rb +104 -0
  65. data/lib/statsample-ekatena/factor/map.rb +124 -0
  66. data/lib/statsample-ekatena/factor/parallelanalysis.rb +166 -0
  67. data/lib/statsample-ekatena/factor/pca.rb +242 -0
  68. data/lib/statsample-ekatena/factor/principalaxis.rb +243 -0
  69. data/lib/statsample-ekatena/factor/rotation.rb +198 -0
  70. data/lib/statsample-ekatena/formula/fit_model.rb +46 -0
  71. data/lib/statsample-ekatena/formula/formula.rb +306 -0
  72. data/lib/statsample-ekatena/graph.rb +11 -0
  73. data/lib/statsample-ekatena/graph/boxplot.rb +236 -0
  74. data/lib/statsample-ekatena/graph/histogram.rb +198 -0
  75. data/lib/statsample-ekatena/graph/scatterplot.rb +213 -0
  76. data/lib/statsample-ekatena/histogram.rb +180 -0
  77. data/lib/statsample-ekatena/matrix.rb +329 -0
  78. data/lib/statsample-ekatena/multiset.rb +310 -0
  79. data/lib/statsample-ekatena/regression.rb +65 -0
  80. data/lib/statsample-ekatena/regression/multiple.rb +89 -0
  81. data/lib/statsample-ekatena/regression/multiple/alglibengine.rb +128 -0
  82. data/lib/statsample-ekatena/regression/multiple/baseengine.rb +251 -0
  83. data/lib/statsample-ekatena/regression/multiple/gslengine.rb +129 -0
  84. data/lib/statsample-ekatena/regression/multiple/matrixengine.rb +205 -0
  85. data/lib/statsample-ekatena/regression/multiple/rubyengine.rb +86 -0
  86. data/lib/statsample-ekatena/regression/simple.rb +121 -0
  87. data/lib/statsample-ekatena/reliability.rb +150 -0
  88. data/lib/statsample-ekatena/reliability/icc.rb +415 -0
  89. data/lib/statsample-ekatena/reliability/multiscaleanalysis.rb +181 -0
  90. data/lib/statsample-ekatena/reliability/scaleanalysis.rb +233 -0
  91. data/lib/statsample-ekatena/reliability/skillscaleanalysis.rb +114 -0
  92. data/lib/statsample-ekatena/resample.rb +15 -0
  93. data/lib/statsample-ekatena/shorthand.rb +125 -0
  94. data/lib/statsample-ekatena/srs.rb +169 -0
  95. data/lib/statsample-ekatena/test.rb +82 -0
  96. data/lib/statsample-ekatena/test/bartlettsphericity.rb +45 -0
  97. data/lib/statsample-ekatena/test/chisquare.rb +73 -0
  98. data/lib/statsample-ekatena/test/f.rb +52 -0
  99. data/lib/statsample-ekatena/test/kolmogorovsmirnov.rb +63 -0
  100. data/lib/statsample-ekatena/test/levene.rb +88 -0
  101. data/lib/statsample-ekatena/test/t.rb +309 -0
  102. data/lib/statsample-ekatena/test/umannwhitney.rb +208 -0
  103. data/lib/statsample-ekatena/test/wilcoxonsignedrank.rb +90 -0
  104. data/lib/statsample-ekatena/vector.rb +19 -0
  105. data/lib/statsample-ekatena/version.rb +3 -0
  106. data/lib/statsample.rb +282 -0
  107. data/po/es/statsample.mo +0 -0
  108. data/po/es/statsample.po +959 -0
  109. data/po/statsample.pot +947 -0
  110. data/references.txt +24 -0
  111. data/statsample-ekatena.gemspec +49 -0
  112. data/test/fixtures/bank2.dat +200 -0
  113. data/test/fixtures/correlation_matrix.rb +17 -0
  114. data/test/fixtures/df.csv +15 -0
  115. data/test/fixtures/hartman_23.matrix +9 -0
  116. data/test/fixtures/stock_data.csv +500 -0
  117. data/test/fixtures/tetmat_matrix.txt +5 -0
  118. data/test/fixtures/tetmat_test.txt +1001 -0
  119. data/test/helpers_tests.rb +83 -0
  120. data/test/test_analysis.rb +176 -0
  121. data/test/test_anova_contrast.rb +36 -0
  122. data/test/test_anovaoneway.rb +26 -0
  123. data/test/test_anovatwoway.rb +37 -0
  124. data/test/test_anovatwowaywithdataset.rb +47 -0
  125. data/test/test_anovawithvectors.rb +102 -0
  126. data/test/test_awesome_print_bug.rb +16 -0
  127. data/test/test_bartlettsphericity.rb +25 -0
  128. data/test/test_bivariate.rb +164 -0
  129. data/test/test_codification.rb +78 -0
  130. data/test/test_crosstab.rb +67 -0
  131. data/test/test_dominance_analysis.rb +39 -0
  132. data/test/test_factor.rb +228 -0
  133. data/test/test_factor_map.rb +38 -0
  134. data/test/test_factor_pa.rb +56 -0
  135. data/test/test_fit_model.rb +88 -0
  136. data/test/test_ggobi.rb +35 -0
  137. data/test/test_gsl.rb +15 -0
  138. data/test/test_histogram.rb +109 -0
  139. data/test/test_matrix.rb +48 -0
  140. data/test/test_multiset.rb +176 -0
  141. data/test/test_regression.rb +231 -0
  142. data/test/test_reliability.rb +223 -0
  143. data/test/test_reliability_icc.rb +198 -0
  144. data/test/test_reliability_skillscale.rb +57 -0
  145. data/test/test_resample.rb +24 -0
  146. data/test/test_srs.rb +9 -0
  147. data/test/test_statistics.rb +69 -0
  148. data/test/test_stest.rb +69 -0
  149. data/test/test_stratified.rb +17 -0
  150. data/test/test_test_f.rb +33 -0
  151. data/test/test_test_kolmogorovsmirnov.rb +34 -0
  152. data/test/test_test_t.rb +62 -0
  153. data/test/test_umannwhitney.rb +27 -0
  154. data/test/test_vector.rb +12 -0
  155. data/test/test_wilcoxonsignedrank.rb +64 -0
  156. metadata +570 -0
@@ -0,0 +1,188 @@
1
+ module Statsample
2
+ # Class to create crosstab of data
3
+ # With this, you can create reports and do chi square test
4
+ # The first vector will be at the rows and the second will be at the columns
5
+ #
6
# Cross-tabulation of two vectors: the first vector provides the rows and
# the second one the columns. It exposes the cell frequencies in several
# shapes, the matrix of expected frequencies, a chi square test and a
# report with optional row/column/total percentage tables.
class Crosstab
  include Summarizable
  attr_reader :v_rows, :v_cols
  attr_accessor :row_label, :column_label, :name, :percentage_row, :percentage_column, :percentage_total

  # Builds the crosstab.
  #
  # * v1: data for the rows
  # * v2: data for the columns
  # * opts: Hash of attribute name => value; every key that has a
  #   matching writer (see attr_accessor above) is assigned, the rest
  #   are ignored.
  #
  # Raises ArgumentError when the two vectors differ in size.
  def initialize(v1, v2, opts = Hash.new)
    raise ArgumentError, "Vectors should be the same size" unless v1.size == v2.size
    @v_rows, @v_cols = Statsample.only_valid_clone(
      Daru::Vector.new(v1),
      Daru::Vector.new(v2))
    @cases = @v_rows.size
    @row_label = v1.name
    @column_label = v2.name
    @name = nil
    @percentage_row = @percentage_column = @percentage_total = false
    opts.each do |key, value|
      # Check for the writer, not the reader: read-only attributes such as
      # :v_rows respond to the getter but have no setter to call.
      writer = "#{key}="
      send(writer, value) if respond_to?(writer)
    end
    @name ||= _("Crosstab %s - %s") % [@row_label, @column_label]
  end

  # Sorted factors of the rows vector.
  def rows_names
    @v_rows.factors.sort.reset_index!
  end

  # Sorted factors of the columns vector.
  def cols_names
    @v_cols.factors.sort.reset_index!
  end

  # Frequencies of the rows vector.
  def rows_total
    @v_rows.frequencies
  end

  # Frequencies of the columns vector.
  def cols_total
    @v_cols.frequencies
  end

  # Hash keyed by [row, col] with the count of each cell. Every
  # combination of row and column factor starts at 0, so cells without
  # observations are present in the result.
  def frequencies
    cn = cols_names
    base = {}
    rows_names.each do |row|
      cn.each { |col| base[[row, col]] = 0 }
    end
    base.update(Daru::Vector.new(Statsample::vector_cols_matrix(@v_rows, @v_cols).to_a).frequencies)
  end

  # Matrix of observed frequencies, rows and columns in sorted factor order.
  def to_matrix
    f = frequencies
    rn = rows_names
    cn = cols_names
    Matrix.rows(rn.collect { |row|
      cn.collect { |col| f[[row, col]] }
    })
  end

  # Nested Hash: row => { col => frequency }.
  def frequencies_by_row
    f = frequencies
    cn = cols_names
    rows_names.each_with_object({}) do |row, by_row|
      by_row[row] = cn.each_with_object({}) { |col, cells| cells[col] = f[[row, col]] }
    end
  end

  # Nested Hash: col => { row => frequency }.
  def frequencies_by_col
    f = frequencies
    rn = rows_names
    cols_names.each_with_object({}) do |col, by_col|
      by_col[col] = rn.each_with_object({}) { |row, cells| cells[row] = f[[row, col]] }
    end
  end

  # Chi square, based on expected and real matrix
  def chi_square
    require 'statsample/test'
    Statsample::Test.chi_square(to_matrix, matrix_expected)
  end

  # Matrix of expected frequencies: (row total * column total) / cases.
  # Useful to obtain chi square.
  def matrix_expected
    rn = rows_names
    cn = cols_names
    rt = rows_total
    ct = cols_total
    t = @v_rows.size
    m = rn.collect { |row|
      cn.collect { |col|
        (rt[row] * ct[col]).quo(t)
      }
    }
    Matrix.rows(m)
  end

  # Hash with every column factor mapped to 0.
  def cols_empty_hash
    cols_names.each_with_object({}) { |col, acc| acc[col] = 0 }
  end

  # Writes the report on +builder+: chi square, labels, the raw frequency
  # table with totals and, when enabled, the percentage tables.
  def report_building(builder)
    builder.section(:name => @name) do |generator|
      fq = frequencies
      rn = rows_names
      cn = cols_names
      total = 0
      total_cols = cols_empty_hash
      generator.text "Chi Square: #{chi_square}"
      generator.text(_("Rows: %s") % @row_label) unless @row_label.nil?
      generator.text(_("Columns: %s") % @column_label) unless @column_label.nil?

      t = ReportBuilder::Table.new(:name => @name + " - " + _("Raw"), :header => [""] + cn.collect { |c| @v_cols.index_of(c) } + [_("Total")])
      rn.each do |row|
        total_row = 0
        t_row = [@v_rows.index_of(row)]
        cn.each do |col|
          count = fq[[row, col]]
          total_row += count
          total += count
          total_cols[col] += count
          t_row.push(count)
        end
        t_row.push(total_row)
        t.row(t_row)
      end
      t.hr
      t_row = [_("Total")]
      cn.each do |col|
        t_row.push(total_cols[col])
      end
      t_row.push(total)
      t.row(t_row)
      generator.parse_element(t)

      table_percentage(generator, :row) if @percentage_row
      table_percentage(generator, :column) if @percentage_column
      table_percentage(generator, :total) if @percentage_total
    end
  end

  # Adds a percentage table to +generator+.
  # +type+ selects the denominator of each cell: :row (row total),
  # :column (column total) or :total (number of cases).
  def table_percentage(generator, type)
    fq = frequencies
    cn = cols_names
    rn = rows_names
    rt = rows_total
    ct = cols_total

    # Already translated here; it must not go through _() a second time.
    type_name = case type
                when :row then _("% Row")
                when :column then _("% Column")
                when :total then _("% Total")
                end

    t = ReportBuilder::Table.new(:name => @name + " - " + type_name, :header => [""] + cn.collect { |c| @v_cols.index_of(c) } + [_("Total")])
    rn.each do |row|
      t_row = [@v_rows.index_of(row)]
      cn.each do |col|
        total = case type
                when :row then rt[row]
                when :column then ct[col]
                when :total then @cases
                end
        t_row.push(sprintf("%0.2f%%", fq[[row, col]] * 100.0 / total))
      end
      # Last column: share of the row, 100% only for the :row table.
      total = case type
              when :row then rt[row]
              when :column then @cases
              when :total then @cases
              end
      t_row.push(sprintf("%0.2f%%", rt[row] * 100.0 / total))
      t.row(t_row)
    end

    t.hr
    t_row = [_("Total")]
    cn.each do |col|
      # Bottom row: share of the column, 100% only for the :column table.
      total = case type
              when :row then @cases
              when :column then ct[col]
              when :total then @cases
              end
      t_row.push(sprintf("%0.2f%%", ct[col] * 100.0 / total))
    end
    t_row.push("100%")
    t.row(t_row)
    generator.parse_element(t)
  end
end
188
+ end
@@ -0,0 +1,115 @@
1
+ # Opening the Daru::DataFrame class for adding methods to convert from
2
+ # data structures to specialized statsample data structures like Multiset.
3
+ module Daru
4
class Vector
  # Builds a Statsample::Histogram from the non-missing values.
  #
  # * bins: either an Array, handed straight to Statsample::Histogram.alloc,
  #   or a number of bins; in the latter case the range is obtained from
  #   Statsample::Util.nice over the minimum and maximum valid values.
  #
  # Raises TypeError unless the vector type is :numeric.
  def histogram(bins = 10)
    type == :numeric or raise TypeError, "Only numeric Vectors can do this operation."

    # Needed by both branches: it is what gets counted at the end.
    valid = reject_values(*Daru::MISSING_VALUES)

    if bins.is_a? Array
      h = Statsample::Histogram.alloc(bins)
    else
      # ugly patch. The upper limit for a bin has the form
      # x < range
      min, max = Statsample::Util.nice(valid.min, valid.max)
      # Nudge the upper limit so the largest value falls inside the last bin.
      max += 1e-10 if max == valid.max
      h = Statsample::Histogram.alloc(bins, [min, max])
    end

    h.increment(valid)
    h
  end

  # Variance of p, according to population size
  # NOTE(review): relies on @valid_data being set by the vector
  # implementation; not visible from here - confirm.
  def variance_proportion(n_poblation, v = 1)
    Statsample::proportion_variance_sample(proportion(v), @valid_data.size, n_poblation)
  end

  # Variance of the total, according to population size
  # (delegates to Statsample::total_variance_sample).
  def variance_total(n_poblation, v = 1)
    Statsample::total_variance_sample(proportion(v), @valid_data.size, n_poblation)
  end

  # Confidence interval for the proportion of +v+, delegating to
  # Statsample::proportion_confidence_interval_t.
  def proportion_confidence_interval_t(n_poblation, margin = 0.95, v = 1)
    Statsample::proportion_confidence_interval_t(proportion(v), @valid_data.size, n_poblation, margin)
  end

  # Confidence interval for the proportion of +v+, delegating to
  # Statsample::proportion_confidence_interval_z.
  def proportion_confidence_interval_z(n_poblation, margin = 0.95, v = 1)
    Statsample::proportion_confidence_interval_z(proportion(v), @valid_data.size, n_poblation, margin)
  end
end
46
+
47
class DataFrame
  # Crosstab of two vectors of this data frame; +opts+ is forwarded to
  # Statsample::Crosstab.new.
  def crosstab(v1, v2, opts = {})
    Statsample::Crosstab.new(self[v1], self[v2], opts)
  end

  # Functions for converting to Statsample::Multiset
  #
  # Splits by one field when a single name is given, by the combination
  # of all of them otherwise.
  def to_multiset_by_split(*vecs)
    require 'statsample/multiset'

    if vecs.size == 1
      to_multiset_by_split_one_field(vecs[0])
    else
      to_multiset_by_split_multiple_fields(*vecs)
    end
  end

  # Creates a Statsample::Multiset, using one field: one dataset per
  # factor of +field+, each receiving the rows that carry that value.
  #
  # Raises ArgumentError when +field+ is not one of the vectors.
  def to_multiset_by_split_one_field(field)
    raise ArgumentError, "Should use a correct field name" unless @vectors.include? field

    factors = self[field].factors
    ms = Statsample::Multiset.new_empty_vectors(@vectors.to_a, factors)
    each_row do |row|
      ms[row[field]].add_row(row)
    end
    ms.datasets.each do |k, ds|
      ds.rename self[field].index_of(k)
    end

    ms
  end

  # Creates a Statsample::Multiset with one dataset per combination of
  # the factors of +fields+ (names are converted to symbols). Each
  # dataset is renamed to the values of its combination joined by "-".
  def to_multiset_by_split_multiple_fields(*fields)
    fields.map!(&:to_sym)

    # Cartesian product of the factors of every field, in field order.
    factors_total = nil
    fields.each do |f|
      factors = self[f].factors
      factors_total =
        if factors_total.nil?
          factors.collect { |c| [c] }
        else
          factors_total.flat_map do |prefix|
            factors.collect { |value| prefix + [value] }
          end
        end
    end
    ms = Statsample::Multiset.new_empty_vectors(vectors.to_a, factors_total)

    # Plain lookup instead of an eval-built Proc: field names containing
    # quotes no longer break, and no code is generated from data.
    each_row do |row|
      ms[fields.map { |f| row[f] }].add_row(row)
    end

    ms.datasets.each do |k, ds|
      ds.rename(
        fields.each_with_index.map { |f, i| self[f].index_of(k[i]) }.join("-")
      )
    end
    ms
  end
end
115
+ end
@@ -0,0 +1,10 @@
1
+ require 'statsample/vector'
2
+
3
class Hash
  # Builds a Daru::DataFrame from this Hash; any extra arguments are
  # passed through to Daru::DataFrame.new. Also available as #to_dataset.
  def to_dataframe(*args)
    Daru::DataFrame.new(self, *args)
  end

  alias_method :to_dataset, :to_dataframe
end
@@ -0,0 +1,425 @@
1
+ module Statsample
2
+ # Dominance Analysis is a procedure based on an examination of the R<sup>2</sup> values
3
+ # for all possible subset models, to identify the relevance of one or more
4
+ # predictors in the prediction of the criterion.
5
+ #
6
+ # See Budescu(1993), Azen & Budescu (2003, 2006) for more information.
7
+ #
8
+ # == Use
9
+ #
10
+ # a = Daru::Vector.new(1000.times.collect {rand})
11
+ # b = Daru::Vector.new(1000.times.collect {rand})
12
+ # c = Daru::Vector.new(1000.times.collect {rand})
13
+ # ds= Daru::DataFrame.new({:a => a,:b => b,:c => c})
14
+ # ds[:y] = ds.collect_rows {|row| row[:a]*5 + row[:b]*3 + row[:c]*2 + rand()}
15
+ # da=Statsample::DominanceAnalysis.new(ds, :y)
16
+ # puts da.summary
17
+ #
18
+ # === Output:
19
+ #
20
+ # Report: Report 2010-02-08 19:10:11 -0300
21
+ # Table: Dominance Analysis result
22
+ # ------------------------------------------------------------
23
+ # | | r2 | sign | a | b | c |
24
+ # ------------------------------------------------------------
25
+ # | Model 0 | | | 0.648 | 0.265 | 0.109 |
26
+ # ------------------------------------------------------------
27
+ # | a | 0.648 | 0.000 | -- | 0.229 | 0.104 |
28
+ # | b | 0.265 | 0.000 | 0.612 | -- | 0.104 |
29
+ # | c | 0.109 | 0.000 | 0.643 | 0.260 | -- |
30
+ # ------------------------------------------------------------
31
+ # | k=1 Average | | | 0.627 | 0.244 | 0.104 |
32
+ # ------------------------------------------------------------
33
+ # | a*b | 0.877 | 0.000 | -- | -- | 0.099 |
34
+ # | a*c | 0.752 | 0.000 | -- | 0.224 | -- |
35
+ # | b*c | 0.369 | 0.000 | 0.607 | -- | -- |
36
+ # ------------------------------------------------------------
37
+ # | k=2 Average | | | 0.607 | 0.224 | 0.099 |
38
+ # ------------------------------------------------------------
39
+ # | a*b*c | 0.976 | 0.000 | -- | -- | -- |
40
+ # ------------------------------------------------------------
41
+ # | Overall averages | | | 0.628 | 0.245 | 0.104 |
42
+ # ------------------------------------------------------------
43
+ #
44
+ # Table: Pairwise dominance
45
+ # -----------------------------------------
46
+ # | Pairs | Total | Conditional | General |
47
+ # -----------------------------------------
48
+ # | a - b | 1.0 | 1.0 | 1.0 |
49
+ # | a - c | 1.0 | 1.0 | 1.0 |
50
+ # | b - c | 1.0 | 1.0 | 1.0 |
51
+ # -----------------------------------------
52
+ #
53
+ # == Reference:
54
+ # * Budescu, D. V. (1993). Dominance analysis: a new approach to the problem of relative importance of predictors in multiple regression. <em>Psychological Bulletin, 114</em>, 542-551.
55
+ # * Azen, R. & Budescu, D.V. (2003). The dominance analysis approach for comparing predictors in multiple regression. <em>Psychological Methods, 8</em>(2), 129-148.
56
+ # * Azen, R. & Budescu, D.V. (2006). Comparing predictors in Multivariate Regression Models: An extension of Dominance Analysis. <em>Journal of Educational and Behavioral Statistics, 31</em>(2), 157-180.
57
+ #
58
class DominanceAnalysis
  include Summarizable
  # Class to generate the regressions. Default to Statsample::Regression::Multiple::MatrixEngine
  attr_accessor :regression_class
  # Name of analysis
  attr_accessor :name
  # Set to true if you want to build from dataset, not correlation matrix
  attr_accessor :build_from_dataset
  # Array with independent variables. You could create subarrays,
  # to test groups of predictors as blocks
  attr_accessor :predictors
  # If you provide a matrix as input, you should set
  # the number of cases to define significance of R^2
  attr_accessor :cases
  # Method of :regression_class used to measure association.
  #
  # Only necessary to change if you have multivariate dependent.
  # * :r2yx (R^2_yx), the default option, is the option when distinction
  #   between independent and dependents variable is arbitrary
  # * :p2yx is the option when the distinction between independent and dependents variables is real.
  #
  attr_accessor :method_association

  # Array with the dependent variable(s); always an Array, even when a
  # single name was given.
  attr_reader :dependent

  UNIVARIATE_REGRESSION_CLASS = Statsample::Regression::Multiple::MatrixEngine
  MULTIVARIATE_REGRESSION_CLASS = Statsample::Regression::Multiple::MultipleDependent

  # Display name of a predictor: a block of predictors (Array) is shown
  # as "(a,b)", anything else is returned as is.
  def self.predictor_name(variable)
    if variable.is_a? Array
      sprintf("(%s)", variable.join(","))
    else
      variable
    end
  end

  # Creates a new DominanceAnalysis object
  # Parameters:
  # * input: A Matrix or a Daru::DataFrame
  # * dependent: Name of dependent variable. Could be an array, if you want to
  #   do a Multivariate Regression Analysis.
  # * opts: Hash of attribute name => value; every key that has a
  #   matching writer is assigned, the rest are ignored. When :predictors
  #   is not given, all fields of input except the dependent ones are used.
  #
  # Raises ArgumentError when input is neither a Matrix nor a DataFrame.
  def initialize(input, dependent, opts = Hash.new)
    @build_from_dataset = false
    if dependent.is_a? Array
      @regression_class = MULTIVARIATE_REGRESSION_CLASS
      @method_association = :r2yx
    else
      @regression_class = UNIVARIATE_REGRESSION_CLASS
      @method_association = :r2
    end

    @name = nil
    opts.each do |key, value|
      # Check for the writer, not the reader: read-only attributes such as
      # :dependent respond to the getter but have no setter to call.
      writer = "#{key}="
      send(writer, value) if respond_to?(writer)
    end
    @dependent = dependent.is_a?(Array) ? dependent : [dependent]

    if input.kind_of? Daru::DataFrame
      @predictors ||= input.vectors.to_a - @dependent
      @ds = input
      @matrix = Statsample::Bivariate.correlation_matrix(input)
      @cases = Statsample::Bivariate.min_n_valid(input)
    elsif input.is_a? ::Matrix
      @predictors ||= input.fields - @dependent
      @ds = nil
      @matrix = input
    else
      raise ArgumentError.new("You should use a Matrix or a Dataset")
    end

    @name = _("Dominance Analysis: %s over %s") % [@predictors.flatten.join(","), @dependent.join(",")] if @name.nil?
    @models = nil
    @models_data = nil
    @general_averages = nil
  end

  # Compute models.
  def compute
    create_models
    fill_models
  end

  # Array with every subset of predictors, computed on first use.
  def models
    compute if @models.nil?
    @models
  end

  # Hash of ModelData keyed by the sorted subset of predictors,
  # computed on first use.
  def models_data
    compute if @models_data.nil?
    @models_data
  end

  # Builds one ModelData for each non-empty subset of predictors, from
  # the dataset or from the correlation matrix (see build_from_dataset).
  def create_models
    @models = []
    @models_data = {}
    (1..@predictors.size).each do |size|
      @predictors.combination(size).each do |independent|
        @models.push(independent)
        fields = independent.flatten + @dependent
        data = @build_from_dataset ? @ds.dup(fields) : @matrix.submatrix(fields)
        key = independent.sort { |a, b| a.to_s <=> b.to_s }
        @models_data[key] = ModelData.new(independent, data, self)
      end
    end
  end

  # For every model, records the contribution of each predictor that is
  # not part of it, using the r2 of the model that adds that predictor.
  def fill_models
    @models.each do |m|
      base_model = md(m)
      @predictors.each do |f|
        next if m.include? f
        base_model.add_contribution(f, md(m + [f]).r2)
      end
    end
  end
  private :create_models, :fill_models

  # Compares the single-predictor models: 1 if r2 of i is greater than
  # r2 of j, 0 if it is lower and 0.5 otherwise.
  def dominance_for_nil_model(i, j)
    r2_i = md([i]).r2
    r2_j = md([j]).r2
    if r2_i > r2_j
      1
    elsif r2_i < r2_j
      0
    else
      0.5
    end
  end

  # Returns 1 if i D j, 0 if j dominates i and 0.5 if undetermined
  def total_dominance_pairwise(i, j)
    dm = dominance_for_nil_model(i, j)
    return 0.5 if dm == 0.5
    dominances = [dm]
    models_data.each_value do |m|
      contrib_i = m.contributions[i]
      contrib_j = m.contributions[j]
      # Only models where both predictors can still be added are compared.
      next if contrib_i.nil? || contrib_j.nil?
      if contrib_i > contrib_j
        dominances.push(1)
      elsif contrib_i < contrib_j
        dominances.push(0)
      else
        return 0.5
      end
    end
    final = dominances.uniq
    final.size > 1 ? 0.5 : final[0]
  end

  # Returns 1 if i cD j, 0 if j cD i and 0.5 if undetermined
  def conditional_dominance_pairwise(i, j)
    dm = dominance_for_nil_model(i, j)
    return 0.5 if dm == 0.5
    dominances = [dm]
    (1...@predictors.size).each do |k|
      a = average_k(k)
      if a[i] > a[j]
        dominances.push(1)
      elsif a[i] < a[j]
        dominances.push(0)
      else
        return 0.5
      end
    end
    final = dominances.uniq
    final.size > 1 ? 0.5 : final[0]
  end

  # Returns 1 if i gD j, 0 if j gD i and 0.5 if undetermined
  def general_dominance_pairwise(i, j)
    ga = general_averages
    if ga[i] > ga[j]
      1
    elsif ga[i] < ga[j]
      0
    else
      0.5
    end
  end

  # Every model made of exactly two predictors.
  def pairs
    models.find_all { |m| m.size == 2 }
  end

  # Hash pair => total dominance of the first over the second.
  def total_dominance
    pairs.each_with_object({}) do |pair, result|
      result[pair] = total_dominance_pairwise(pair[0], pair[1])
    end
  end

  # Hash pair => conditional dominance of the first over the second.
  def conditional_dominance
    pairs.each_with_object({}) do |pair, result|
      result[pair] = conditional_dominance_pairwise(pair[0], pair[1])
    end
  end

  # Hash pair => general dominance of the first over the second.
  def general_dominance
    pairs.each_with_object({}) do |pair, result|
      result[pair] = general_dominance_pairwise(pair[0], pair[1])
    end
  end

  # ModelData for the model +m+ (Array of predictors, in any order).
  def md(m)
    models_data[m.sort { |a, b| a.to_s <=> b.to_s }]
  end

  # Get all model of size k
  def md_k(k)
    @models.select { |m| m.size == k }.map { |m| md(m) }
  end

  # For a hash with arrays of numbers as values
  # Returns a hash with same keys and
  # value as the mean of values of original hash
  def get_averages(averages)
    averages.each_with_object({}) do |(key, values), out|
      out[key] = Daru::Vector.new(values).mean
    end
  end

  # Hash with average for each k size model.
  # Returns nil for the full model, which has no contributions.
  def average_k(k)
    return nil if k == @predictors.size
    averages = @predictors.each_with_object({}) { |v, acc| acc[v] = [] }
    md_k(k).each do |m|
      @predictors.each do |f|
        contribution = m.contributions[f]
        averages[f].push(contribution) unless contribution.nil?
      end
    end
    get_averages(averages)
  end

  # Hash predictor => mean of its single-predictor r2 and its average
  # contribution at each model size. Memoized.
  def general_averages
    if @general_averages.nil?
      averages = @predictors.each_with_object({}) { |v, acc| acc[v] = [md([v]).r2] }
      (1...@predictors.size).each do |k|
        ak = average_k(k)
        @predictors.each do |f|
          averages[f].push(ak[f])
        end
      end
      @general_averages = get_averages(averages)
    end
    @general_averages
  end

  # Writes the report on +g+: the table of models with contributions and
  # averages, followed by the pairwise dominance table.
  def report_building(g)
    compute if @models.nil?
    g.section(:name => @name) do |generator|
      header = ["", "r2", _("sign")] + @predictors.collect { |c| DominanceAnalysis.predictor_name(c) }

      generator.table(:name => _("Dominance Analysis result"), :header => header) do |t|
        t.row([_("Model 0"), "", ""] + @predictors.collect { |f| sprintf("%0.3f", md([f]).r2) })
        t.hr
        (1..@predictors.size).each do |k|
          md_k(k).each { |m| t.row(m.add_table_row) }
          # Report averages (there are none for the full model).
          averages = average_k(k)
          next if averages.nil?
          t.hr
          t.row([_("k=%d Average") % k, "", ""] + @predictors.collect { |f| sprintf("%0.3f", averages[f]) })
          t.hr
        end

        # Separate local: the builder parameter +g+ is left untouched.
        overall = general_averages
        t.hr
        t.row([_("Overall averages"), "", ""] + @predictors.collect { |f| sprintf("%0.3f", overall[f]) })
      end

      td = total_dominance
      cd = conditional_dominance
      gd = general_dominance
      generator.table(:name => _("Pairwise dominance"), :header => [_("Pairs"), _("Total"), _("Conditional"), _("General")]) do |t|
        pairs.each do |pair|
          label = pair.map { |v| v.is_a?(Array) ? "(" + v.join("-") + ")" : v }.join(" - ")
          t.row([label, sprintf("%0.1f", td[pair]), sprintf("%0.1f", cd[pair]), sprintf("%0.1f", gd[pair])])
        end
      end
    end
  end

  class ModelData # :nodoc:
    attr_reader :contributions

    # * independent: predictors of this model
    # * data: input handed to the regression class
    # * da: the DominanceAnalysis, source of predictors, dependent,
    #   cases, association method and regression class
    def initialize(independent, data, da)
      @independent = independent
      @data = data
      @predictors = da.predictors
      @dependent = da.dependent
      @cases = da.cases
      @method = da.method_association
      @contributions = @independent.each_with_object({}) { |v, acc| acc[v] = nil }

      r_class = da.regression_class
      # A single dependent is passed as a name, several as an Array.
      target = @dependent.size == 1 ? @dependent[0] : @dependent
      @lr = r_class.new(data, target, :cases => @cases)
    end

    # Stores the contribution of +f+: +v+ minus the r2 of this model.
    def add_contribution(f, v)
      @contributions[f] = v - r2
    end

    # Association measure of the regression (see method_association).
    def r2
      @lr.send(@method)
    end

    # Predictor names joined by "*".
    def name
      @independent.collect { |variable|
        DominanceAnalysis.predictor_name(variable)
      }.join("*")
    end

    # Row for the report table: name, r2, significance ("???" when the
    # number of cases is unknown) and one cell per predictor.
    def add_table_row
      sign = @cases ? sprintf("%0.3f", @lr.probability) : "???"

      [name, sprintf("%0.3f", r2), sign] + @predictors.collect { |k|
        v = @contributions[k]
        v.nil? ? "--" : sprintf("%0.3f", v)
      }
    end

    # One-model text summary.
    # NOTE(review): uses @lr.significance while add_table_row uses
    # @lr.probability - confirm both exist on every regression class.
    def summary
      # The format string has three directives; the surplus @lr.sst
      # argument was dropped.
      out = sprintf("%s: r2=%0.3f(p=%0.2f)\n", name, r2, @lr.significance)
      out << @predictors.collect { |k|
        v = @contributions[k]
        v.nil? ? "--" : sprintf("%s=%0.3f", k, v)
      }.join(" | ")
      out << "\n"
      out
    end
  end # end ModelData
end # end Dominance Analysis
423
+ end
424
+
425
+ require 'statsample/dominanceanalysis/bootstrap'