statsample-ekatena 2.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (156) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +15 -0
  3. data/.travis.yml +23 -0
  4. data/CONTRIBUTING.md +17 -0
  5. data/Gemfile +2 -0
  6. data/History.txt +457 -0
  7. data/LICENSE.txt +12 -0
  8. data/README.md +175 -0
  9. data/Rakefile +44 -0
  10. data/benchmarks/correlation_matrix_15_variables.rb +32 -0
  11. data/benchmarks/correlation_matrix_5_variables.rb +33 -0
  12. data/benchmarks/correlation_matrix_methods/correlation_matrix.ds +0 -0
  13. data/benchmarks/correlation_matrix_methods/correlation_matrix.html +93 -0
  14. data/benchmarks/correlation_matrix_methods/correlation_matrix.rb +71 -0
  15. data/benchmarks/correlation_matrix_methods/correlation_matrix.xls +0 -0
  16. data/benchmarks/correlation_matrix_methods/correlation_matrix_gsl_ruby.ods +0 -0
  17. data/benchmarks/correlation_matrix_methods/correlation_matrix_with_graphics.ods +0 -0
  18. data/benchmarks/correlation_matrix_methods/results.ds +0 -0
  19. data/benchmarks/factor_map.rb +37 -0
  20. data/benchmarks/helpers_benchmark.rb +5 -0
  21. data/data/locale/es/LC_MESSAGES/statsample.mo +0 -0
  22. data/doc_latex/manual/equations.tex +78 -0
  23. data/examples/boxplot.rb +28 -0
  24. data/examples/chisquare_test.rb +23 -0
  25. data/examples/correlation_matrix.rb +32 -0
  26. data/examples/dataset.rb +30 -0
  27. data/examples/dominance_analysis.rb +33 -0
  28. data/examples/dominance_analysis_bootstrap.rb +32 -0
  29. data/examples/histogram.rb +26 -0
  30. data/examples/icc.rb +24 -0
  31. data/examples/levene.rb +29 -0
  32. data/examples/multiple_regression.rb +20 -0
  33. data/examples/multivariate_correlation.rb +33 -0
  34. data/examples/parallel_analysis.rb +40 -0
  35. data/examples/polychoric.rb +40 -0
  36. data/examples/principal_axis.rb +26 -0
  37. data/examples/reliability.rb +31 -0
  38. data/examples/scatterplot.rb +25 -0
  39. data/examples/t_test.rb +27 -0
  40. data/examples/tetrachoric.rb +17 -0
  41. data/examples/u_test.rb +24 -0
  42. data/examples/vector.rb +20 -0
  43. data/examples/velicer_map_test.rb +46 -0
  44. data/grab_references.rb +29 -0
  45. data/lib/spss.rb +134 -0
  46. data/lib/statsample-ekatena/analysis.rb +100 -0
  47. data/lib/statsample-ekatena/analysis/suite.rb +89 -0
  48. data/lib/statsample-ekatena/analysis/suitereportbuilder.rb +44 -0
  49. data/lib/statsample-ekatena/anova.rb +24 -0
  50. data/lib/statsample-ekatena/anova/contrast.rb +79 -0
  51. data/lib/statsample-ekatena/anova/oneway.rb +187 -0
  52. data/lib/statsample-ekatena/anova/twoway.rb +207 -0
  53. data/lib/statsample-ekatena/bivariate.rb +406 -0
  54. data/lib/statsample-ekatena/bivariate/pearson.rb +54 -0
  55. data/lib/statsample-ekatena/codification.rb +182 -0
  56. data/lib/statsample-ekatena/converter/csv.rb +28 -0
  57. data/lib/statsample-ekatena/converter/spss.rb +48 -0
  58. data/lib/statsample-ekatena/converters.rb +211 -0
  59. data/lib/statsample-ekatena/crosstab.rb +188 -0
  60. data/lib/statsample-ekatena/daru.rb +115 -0
  61. data/lib/statsample-ekatena/dataset.rb +10 -0
  62. data/lib/statsample-ekatena/dominanceanalysis.rb +425 -0
  63. data/lib/statsample-ekatena/dominanceanalysis/bootstrap.rb +232 -0
  64. data/lib/statsample-ekatena/factor.rb +104 -0
  65. data/lib/statsample-ekatena/factor/map.rb +124 -0
  66. data/lib/statsample-ekatena/factor/parallelanalysis.rb +166 -0
  67. data/lib/statsample-ekatena/factor/pca.rb +242 -0
  68. data/lib/statsample-ekatena/factor/principalaxis.rb +243 -0
  69. data/lib/statsample-ekatena/factor/rotation.rb +198 -0
  70. data/lib/statsample-ekatena/formula/fit_model.rb +46 -0
  71. data/lib/statsample-ekatena/formula/formula.rb +306 -0
  72. data/lib/statsample-ekatena/graph.rb +11 -0
  73. data/lib/statsample-ekatena/graph/boxplot.rb +236 -0
  74. data/lib/statsample-ekatena/graph/histogram.rb +198 -0
  75. data/lib/statsample-ekatena/graph/scatterplot.rb +213 -0
  76. data/lib/statsample-ekatena/histogram.rb +180 -0
  77. data/lib/statsample-ekatena/matrix.rb +329 -0
  78. data/lib/statsample-ekatena/multiset.rb +310 -0
  79. data/lib/statsample-ekatena/regression.rb +65 -0
  80. data/lib/statsample-ekatena/regression/multiple.rb +89 -0
  81. data/lib/statsample-ekatena/regression/multiple/alglibengine.rb +128 -0
  82. data/lib/statsample-ekatena/regression/multiple/baseengine.rb +251 -0
  83. data/lib/statsample-ekatena/regression/multiple/gslengine.rb +129 -0
  84. data/lib/statsample-ekatena/regression/multiple/matrixengine.rb +205 -0
  85. data/lib/statsample-ekatena/regression/multiple/rubyengine.rb +86 -0
  86. data/lib/statsample-ekatena/regression/simple.rb +121 -0
  87. data/lib/statsample-ekatena/reliability.rb +150 -0
  88. data/lib/statsample-ekatena/reliability/icc.rb +415 -0
  89. data/lib/statsample-ekatena/reliability/multiscaleanalysis.rb +181 -0
  90. data/lib/statsample-ekatena/reliability/scaleanalysis.rb +233 -0
  91. data/lib/statsample-ekatena/reliability/skillscaleanalysis.rb +114 -0
  92. data/lib/statsample-ekatena/resample.rb +15 -0
  93. data/lib/statsample-ekatena/shorthand.rb +125 -0
  94. data/lib/statsample-ekatena/srs.rb +169 -0
  95. data/lib/statsample-ekatena/test.rb +82 -0
  96. data/lib/statsample-ekatena/test/bartlettsphericity.rb +45 -0
  97. data/lib/statsample-ekatena/test/chisquare.rb +73 -0
  98. data/lib/statsample-ekatena/test/f.rb +52 -0
  99. data/lib/statsample-ekatena/test/kolmogorovsmirnov.rb +63 -0
  100. data/lib/statsample-ekatena/test/levene.rb +88 -0
  101. data/lib/statsample-ekatena/test/t.rb +309 -0
  102. data/lib/statsample-ekatena/test/umannwhitney.rb +208 -0
  103. data/lib/statsample-ekatena/test/wilcoxonsignedrank.rb +90 -0
  104. data/lib/statsample-ekatena/vector.rb +19 -0
  105. data/lib/statsample-ekatena/version.rb +3 -0
  106. data/lib/statsample.rb +282 -0
  107. data/po/es/statsample.mo +0 -0
  108. data/po/es/statsample.po +959 -0
  109. data/po/statsample.pot +947 -0
  110. data/references.txt +24 -0
  111. data/statsample-ekatena.gemspec +49 -0
  112. data/test/fixtures/bank2.dat +200 -0
  113. data/test/fixtures/correlation_matrix.rb +17 -0
  114. data/test/fixtures/df.csv +15 -0
  115. data/test/fixtures/hartman_23.matrix +9 -0
  116. data/test/fixtures/stock_data.csv +500 -0
  117. data/test/fixtures/tetmat_matrix.txt +5 -0
  118. data/test/fixtures/tetmat_test.txt +1001 -0
  119. data/test/helpers_tests.rb +83 -0
  120. data/test/test_analysis.rb +176 -0
  121. data/test/test_anova_contrast.rb +36 -0
  122. data/test/test_anovaoneway.rb +26 -0
  123. data/test/test_anovatwoway.rb +37 -0
  124. data/test/test_anovatwowaywithdataset.rb +47 -0
  125. data/test/test_anovawithvectors.rb +102 -0
  126. data/test/test_awesome_print_bug.rb +16 -0
  127. data/test/test_bartlettsphericity.rb +25 -0
  128. data/test/test_bivariate.rb +164 -0
  129. data/test/test_codification.rb +78 -0
  130. data/test/test_crosstab.rb +67 -0
  131. data/test/test_dominance_analysis.rb +39 -0
  132. data/test/test_factor.rb +228 -0
  133. data/test/test_factor_map.rb +38 -0
  134. data/test/test_factor_pa.rb +56 -0
  135. data/test/test_fit_model.rb +88 -0
  136. data/test/test_ggobi.rb +35 -0
  137. data/test/test_gsl.rb +15 -0
  138. data/test/test_histogram.rb +109 -0
  139. data/test/test_matrix.rb +48 -0
  140. data/test/test_multiset.rb +176 -0
  141. data/test/test_regression.rb +231 -0
  142. data/test/test_reliability.rb +223 -0
  143. data/test/test_reliability_icc.rb +198 -0
  144. data/test/test_reliability_skillscale.rb +57 -0
  145. data/test/test_resample.rb +24 -0
  146. data/test/test_srs.rb +9 -0
  147. data/test/test_statistics.rb +69 -0
  148. data/test/test_stest.rb +69 -0
  149. data/test/test_stratified.rb +17 -0
  150. data/test/test_test_f.rb +33 -0
  151. data/test/test_test_kolmogorovsmirnov.rb +34 -0
  152. data/test/test_test_t.rb +62 -0
  153. data/test/test_umannwhitney.rb +27 -0
  154. data/test/test_vector.rb +12 -0
  155. data/test/test_wilcoxonsignedrank.rb +64 -0
  156. metadata +570 -0
@@ -0,0 +1,188 @@
1
+ module Statsample
2
+ # Class to create crosstab of data
3
+ # With this, you can create reports and do chi square test
4
+ # The first vector will be the rows and the second will be the columns
5
+ #
6
+ class Crosstab
7
+ include Summarizable
8
+ attr_reader :v_rows, :v_cols
9
+ attr_accessor :row_label, :column_label, :name, :percentage_row, :percentage_column, :percentage_total
10
# Builds a crosstab from two equally sized vectors.
#
# * v1: data for the rows (wrapped with Daru::Vector.new); its +name+
#   becomes the default row label
# * v2: data for the columns (wrapped with Daru::Vector.new); its +name+
#   becomes the default column label
# * opts: attribute => value pairs; every key that has a matching writer
#   (+key=+) is assigned, any other key is ignored
#
# Raises ArgumentError when the two inputs differ in size.
def initialize(v1, v2, opts=Hash.new)
  raise ArgumentError, "Vectors should be the same size" unless v1.size == v2.size
  @v_rows, @v_cols = Statsample.only_valid_clone(
    Daru::Vector.new(v1),
    Daru::Vector.new(v2))
  @cases = @v_rows.size
  @row_label = v1.name
  @column_label = v2.name
  @name = nil
  @percentage_row = @percentage_column = @percentage_total = false
  opts.each do |k, v|
    # Test for the writer, not the reader: :v_rows and :v_cols only have
    # readers, so checking the getter would pass and then fail on the
    # missing setter.
    setter = "#{k}="
    send(setter, v) if respond_to?(setter)
  end
  @name ||= _("Crosstab %s - %s") % [@row_label, @column_label]
end
25
# Distinct values (factors) of the row vector, sorted and re-indexed.
def rows_names
  sorted_factors = @v_rows.factors.sort
  sorted_factors.reset_index!
end
28
# Distinct values (factors) of the column vector, sorted and re-indexed.
def cols_names
  sorted_factors = @v_cols.factors.sort
  sorted_factors.reset_index!
end
31
# Marginal totals for the rows: the frequencies reported by the row vector.
def rows_total
  row_vector = @v_rows
  row_vector.frequencies
end
34
# Marginal totals for the columns: the frequencies reported by the column
# vector.
def cols_total
  col_vector = @v_cols
  col_vector.frequencies
end
37
+
38
# Returns a hash keyed by [row, col] pairs with the number of cases in each
# cell. Every combination of row and column name is present; combinations
# that never occur keep the initial count of 0.
def frequencies
  # Fetch the column names once: cols_names re-sorts and re-indexes on
  # every call, so it should not run once per row.
  cols = cols_names
  base = {}
  rows_names.each do |row|
    cols.each { |col| base[[row, col]] = 0 }
  end
  observed = Daru::Vector.new(Statsample::vector_cols_matrix(@v_rows, @v_cols).to_a).frequencies
  base.update(observed)
end
47
# Cell counts as a Matrix: one matrix row per row name, one column per
# column name, in the order given by rows_names / cols_names.
def to_matrix
  counts = frequencies
  row_keys = rows_names
  col_keys = cols_names
  table = row_keys.collect do |row|
    col_keys.collect { |col| counts[[row, col]] }
  end
  Matrix.rows(table)
end
55
# Cell counts as a nested hash: row name => { column name => count }.
def frequencies_by_row
  counts = frequencies
  # Hoisted: cols_names re-sorts the factors on each call.
  cols = cols_names
  rows_names.inject({}) do |by_row, row|
    by_row[row] = cols.inject({}) { |cells, col| cells[col] = counts[[row, col]]; cells }
    by_row
  end
end
62
# Cell counts as a nested hash: column name => { row name => count }.
def frequencies_by_col
  counts = frequencies
  # Hoisted: rows_names re-sorts the factors on each call.
  rows = rows_names
  cols_names.inject({}) do |by_col, col|
    by_col[col] = rows.inject({}) { |cells, row| cells[row] = counts[[row, col]]; cells }
    by_col
  end
end
69
+ # Chi square, based on expected and real matrix
70
# Runs Statsample::Test.chi_square on the observed matrix (to_matrix)
# against the expected matrix (matrix_expected) and returns its result.
def chi_square
  # This gem ships its sources under lib/statsample-ekatena/, so the test
  # helpers are loaded from there instead of 'statsample/test'.
  require 'statsample-ekatena/test'
  Statsample::Test.chi_square(to_matrix, matrix_expected)
end
74
+ # Useful to obtain chi square
75
# Matrix of expected counts: for each cell, (row total * column total)
# divided by the number of valid cases.
def matrix_expected
  row_keys = rows_names
  col_keys = cols_names
  row_totals = rows_total
  col_totals = cols_total
  n_cases = @v_rows.size
  expected = row_keys.collect do |row|
    col_keys.collect { |col| (row_totals[row] * col_totals[col]).quo(n_cases) }
  end
  Matrix.rows(expected)
end
88
# Hash with one entry per column name, each initialised to 0.
def cols_empty_hash
  zeroed = {}
  cols_names.each { |col| zeroed[col] = 0 }
  zeroed
end
91
# Writes the crosstab into +builder+: a section holding the chi square
# text, the row/column labels (when set), the raw count table with
# marginal totals, and one percentage table per enabled percentage flag.
def report_building(builder)
  builder.section(:name => @name) do |generator|
    counts = frequencies
    row_keys = rows_names
    col_keys = cols_names
    grand_total = 0
    column_sums = cols_empty_hash
    generator.text "Chi Square: #{chi_square}"
    generator.text(_("Rows: %s") % @row_label) unless @row_label.nil?
    generator.text(_("Columns: %s") % @column_label) unless @column_label.nil?

    table = ReportBuilder::Table.new(
      :name => @name + " - " + _("Raw"),
      :header => [""] + cols_names.collect { |c| @v_cols.index_of(c) } + [_("Total")])
    row_keys.each do |row|
      row_sum = 0
      cells = [@v_rows.index_of(row)]
      col_keys.each do |col|
        count = counts[[row, col]]
        row_sum += count
        grand_total += count
        column_sums[col] += count
        cells.push(count)
      end
      cells.push(row_sum)
      table.row(cells)
    end
    table.hr
    # Footer: per-column sums followed by the grand total.
    footer = [_("Total")]
    col_keys.each { |col| footer.push(column_sums[col]) }
    footer.push(grand_total)
    table.row(footer)
    generator.parse_element(table)

    table_percentage(generator, :row) if @percentage_row
    table_percentage(generator, :column) if @percentage_column
    table_percentage(generator, :total) if @percentage_total
  end
end
136
+
137
+
138
+
139
# Adds a percentage table to +generator+.
#
# * type: :row (cells relative to their row total), :column (relative to
#   their column total) or :total (relative to the number of cases)
#
# The last column holds each row's margin and the last row each column's
# margin; a margin is 100% along the dimension being percentaged and a
# share of all cases otherwise.
def table_percentage(generator, type)
  counts = frequencies
  col_keys = cols_names
  row_keys = rows_names
  row_totals = rows_total
  col_totals = cols_total

  # Already translated here, so it must not be passed through _() again.
  type_name = case type
              when :row then _("% Row")
              when :column then _("% Column")
              when :total then _("% Total")
              end

  table = ReportBuilder::Table.new(
    :name => @name + " - " + type_name,
    :header => [""] + cols_names.collect { |c| @v_cols.index_of(c) } + [_("Total")])
  row_keys.each do |row|
    cells = [@v_rows.index_of(row)]
    col_keys.each do |col|
      base = case type
             when :row then row_totals[row]
             when :column then col_totals[col]
             when :total then @cases
             end
      cells.push(sprintf("%0.2f%%", counts[[row, col]] * 100.0 / base))
    end
    margin_base = case type
                  when :row then row_totals[row]
                  when :column then @cases
                  when :total then @cases
                  end
    cells.push(sprintf("%0.2f%%", row_totals[row] * 100.0 / margin_base))
    table.row(cells)
  end

  table.hr
  footer = [_("Total")]
  col_keys.each do |col|
    margin_base = case type
                  when :row then @cases
                  when :column then col_totals[col]
                  when :total then @cases
                  end
    footer.push(sprintf("%0.2f%%", col_totals[col] * 100.0 / margin_base))
  end
  footer.push("100%")
  table.row(footer)
  generator.parse_element(table)
end
187
+ end
188
+ end
@@ -0,0 +1,115 @@
1
+ # Opening the Daru::DataFrame class for adding methods to convert from
2
+ # data structures to specialized statsample data structures like Multiset.
3
+ module Daru
4
+ class Vector
5
# Builds a Statsample::Histogram from the non-missing values.
#
# * bins: either an Array, passed straight to Statsample::Histogram.alloc,
#   or a number of bins (default 10) spread over a range derived from the
#   data with Statsample::Util.nice
#
# Raises TypeError unless the vector's type is :numeric.
def histogram(bins=10)
  type == :numeric or raise TypeError, "Only numeric Vectors can do this operation."

  # Computed before branching: both branches feed it to h.increment below.
  valid = reject_values(*Daru::MISSING_VALUES)

  if bins.is_a? Array
    h = Statsample::Histogram.alloc(bins)
  else
    # The upper limit of a bin has the form x < range, so when the nice
    # maximum equals the data maximum it is nudged up to keep the largest
    # value inside the last bin.
    min, max = Statsample::Util.nice(valid.min, valid.max)
    max += 1e-10 if max == valid.max
    h = Statsample::Histogram.alloc(bins, [min, max])
  end

  h.increment(valid)
  h
end
27
+
28
+ # Variance of p, according to population size
29
# Delegates to Statsample::proportion_variance_sample with the proportion
# of +v+, the number of non-missing values and the population size.
def variance_proportion(n_poblation, v=1)
  # Count valid values through reject_values; @valid_data is not set
  # anywhere in this file (presumably a leftover from the pre-Daru vector
  # class).
  n_valid = reject_values(*Daru::MISSING_VALUES).size
  Statsample::proportion_variance_sample(proportion(v), n_valid, n_poblation)
end
32
+
33
+ # Variance of the total, according to population size
34
# Delegates to Statsample::total_variance_sample with the proportion of
# +v+, the number of non-missing values and the population size.
def variance_total(n_poblation, v=1)
  # Count valid values through reject_values; @valid_data is not set
  # anywhere in this file (presumably a leftover from the pre-Daru vector
  # class).
  n_valid = reject_values(*Daru::MISSING_VALUES).size
  Statsample::total_variance_sample(proportion(v), n_valid, n_poblation)
end
37
+
38
# Delegates to Statsample::proportion_confidence_interval_t with the
# proportion of +v+, the number of non-missing values, the population size
# and +margin+ (default 0.95).
def proportion_confidence_interval_t(n_poblation,margin=0.95,v=1)
  # Count valid values through reject_values; @valid_data is not set
  # anywhere in this file (presumably a leftover from the pre-Daru vector
  # class).
  n_valid = reject_values(*Daru::MISSING_VALUES).size
  Statsample::proportion_confidence_interval_t(proportion(v), n_valid, n_poblation, margin)
end
41
+
42
# Delegates to Statsample::proportion_confidence_interval_z with the
# proportion of +v+, the number of non-missing values, the population size
# and +margin+ (default 0.95).
def proportion_confidence_interval_z(n_poblation,margin=0.95,v=1)
  # Count valid values through reject_values; @valid_data is not set
  # anywhere in this file (presumably a leftover from the pre-Daru vector
  # class).
  n_valid = reject_values(*Daru::MISSING_VALUES).size
  Statsample::proportion_confidence_interval_z(proportion(v), n_valid, n_poblation, margin)
end
45
+ end
46
+
47
+ class DataFrame
48
# Builds a Statsample::Crosstab from two vectors of this data frame,
# looked up by name; +opts+ is forwarded to Statsample::Crosstab.new.
def crosstab(v1,v2,opts={})
  row_vector = self[v1]
  col_vector = self[v2]
  Statsample::Crosstab.new(row_vector, col_vector, opts)
end
51
+
52
+ # Functions for converting to Statsample::Multiset
53
# Splits the data frame into a multiset. With a single vector name the
# split is done by to_multiset_by_split_one_field; with any other number
# of names by to_multiset_by_split_multiple_fields.
def to_multiset_by_split(*vecs)
  # This gem ships its sources under lib/statsample-ekatena/, so Multiset
  # is loaded from there instead of 'statsample/multiset'.
  require 'statsample-ekatena/multiset'

  if vecs.size == 1
    to_multiset_by_split_one_field(vecs[0])
  else
    to_multiset_by_split_multiple_fields(*vecs)
  end
end
62
+
63
+ # Creates a Statsample::Multiset, using one field
64
# Splits the rows by the value each has in +field+: one dataset per factor
# of that vector. Each dataset is then renamed using index_of for its key.
#
# Raises ArgumentError when +field+ is not one of the vectors.
def to_multiset_by_split_one_field(field)
  unless @vectors.include? field
    raise ArgumentError, "Should use a correct field name"
  end

  split_values = self[field].factors
  multiset = Statsample::Multiset.new_empty_vectors(@vectors.to_a, split_values)
  each_row { |row| multiset[row[field]].add_row(row) }
  multiset.datasets.each do |key, dataset|
    dataset.rename self[field].index_of(key)
  end

  multiset
end
80
+
81
# Splits the rows by the combination of values they have in +fields+.
# One dataset is created for every combination of the fields' factors
# (first field varying slowest); each row is added to the dataset keyed by
# its own values, and each dataset is renamed to the "-"-joined index_of
# results for its key.
def to_multiset_by_split_multiple_fields(*fields)
  fields.map!(&:to_sym)
  factors_total = nil
  fields.each do |f|
    factors = self[f].factors
    if factors_total.nil?
      factors_total = factors.collect { |c| [c] }
    else
      combined = []
      factors_total.each do |prefix|
        factors.each { |value| combined.push(prefix + [value]) }
      end
      factors_total = combined
    end
  end
  ms = Statsample::Multiset.new_empty_vectors(vectors.to_a, factors_total)

  # A plain block instead of an eval'd Proc: field names are used as data,
  # never spliced into source code.
  each_row do |row|
    ms[fields.map { |f| row[f] }].add_row(row)
  end

  ms.datasets.each do |key, ds|
    label = fields.each_with_index.map { |f, i| self[f].index_of(key[i]) }.join("-")
    ds.rename(label)
  end
  ms
end
114
+ end
115
+ end
@@ -0,0 +1,10 @@
1
+ require 'statsample/vector'
2
+
3
class Hash
  # Builds a Daru::DataFrame from this hash. Any extra arguments are
  # forwarded unchanged to Daru::DataFrame.new.
  def to_dataframe(*args)
    Daru::DataFrame.new(self, *args)
  end

  # to_dataset is kept as a second name for to_dataframe.
  alias_method :to_dataset, :to_dataframe
end
@@ -0,0 +1,425 @@
1
+ module Statsample
2
+ # Dominance Analysis is a procedure based on an examination of the R<sup>2</sup> values
3
+ # for all possible subset models, to identify the relevance of one or more
4
+ # predictors in the prediction of the criterion.
5
+ #
6
+ # See Budescu(1993), Azen & Budescu (2003, 2006) for more information.
7
+ #
8
+ # == Use
9
+ #
10
+ # a = Daru::Vector.new(1000.times.collect {rand})
11
+ # b = Daru::Vector.new(1000.times.collect {rand})
12
+ # c = Daru::Vector.new(1000.times.collect {rand})
13
+ # ds= Daru::DataFrame.new({:a => a,:b => b,:c => c})
14
+ # ds[:y] = ds.collect_rows {|row| row[:a]*5 + row[:b]*3 + row[:c]*2 + rand()}
15
+ # da=Statsample::DominanceAnalysis.new(ds, :y)
16
+ # puts da.summary
17
+ #
18
+ # === Output:
19
+ #
20
+ # Report: Report 2010-02-08 19:10:11 -0300
21
+ # Table: Dominance Analysis result
22
+ # ------------------------------------------------------------
23
+ # | | r2 | sign | a | b | c |
24
+ # ------------------------------------------------------------
25
+ # | Model 0 | | | 0.648 | 0.265 | 0.109 |
26
+ # ------------------------------------------------------------
27
+ # | a | 0.648 | 0.000 | -- | 0.229 | 0.104 |
28
+ # | b | 0.265 | 0.000 | 0.612 | -- | 0.104 |
29
+ # | c | 0.109 | 0.000 | 0.643 | 0.260 | -- |
30
+ # ------------------------------------------------------------
31
+ # | k=1 Average | | | 0.627 | 0.244 | 0.104 |
32
+ # ------------------------------------------------------------
33
+ # | a*b | 0.877 | 0.000 | -- | -- | 0.099 |
34
+ # | a*c | 0.752 | 0.000 | -- | 0.224 | -- |
35
+ # | b*c | 0.369 | 0.000 | 0.607 | -- | -- |
36
+ # ------------------------------------------------------------
37
+ # | k=2 Average | | | 0.607 | 0.224 | 0.099 |
38
+ # ------------------------------------------------------------
39
+ # | a*b*c | 0.976 | 0.000 | -- | -- | -- |
40
+ # ------------------------------------------------------------
41
+ # | Overall averages | | | 0.628 | 0.245 | 0.104 |
42
+ # ------------------------------------------------------------
43
+ #
44
+ # Table: Pairwise dominance
45
+ # -----------------------------------------
46
+ # | Pairs | Total | Conditional | General |
47
+ # -----------------------------------------
48
+ # | a - b | 1.0 | 1.0 | 1.0 |
49
+ # | a - c | 1.0 | 1.0 | 1.0 |
50
+ # | b - c | 1.0 | 1.0 | 1.0 |
51
+ # -----------------------------------------
52
+ #
53
+ # == Reference:
54
+ # * Budescu, D. V. (1993). Dominance analysis: a new approach to the problem of relative importance of predictors in multiple regression. <em>Psychological Bulletin, 114</em>, 542-551.
55
+ # * Azen, R. & Budescu, D.V. (2003). The dominance analysis approach for comparing predictors in multiple regression. <em>Psychological Methods, 8</em>(2), 129-148.
56
+ # * Azen, R. & Budescu, D.V. (2006). Comparing predictors in Multivariate Regression Models: An extension of Dominance Analysis. <em>Journal of Educational and Behavioral Statistics, 31</em>(2), 157-180.
57
+ #
58
+ class DominanceAnalysis
59
+ include Summarizable
60
+ # Class to generate the regressions. Default to Statsample::Regression::Multiple::MatrixEngine
61
+ attr_accessor :regression_class
62
+ # Name of analysis
63
+ attr_accessor :name
64
+ # Set to true if you want to build from dataset, not correlation matrix
65
+ attr_accessor :build_from_dataset
66
+ # Array with independent variables. You could create subarrays,
67
+ # to test groups of predictors as blocks
68
+ attr_accessor :predictors
69
+ # If you provide a matrix as input, you should set
70
+ # the number of cases to define significance of R^2
71
+ attr_accessor :cases
72
+ # Method of :regression_class used to measure association.
73
+ #
74
+ # Only necessary to change if you have multivariate dependent.
75
+ # * :r2yx (R^2_yx), the default option, is the option when distinction
76
+ # between independent and dependents variable is arbitrary
77
+ # * :p2yx is the option when the distinction between independent and dependents variables is real.
78
+ #
79
+
80
+ attr_accessor :method_association
81
+
82
+
83
+ attr_reader :dependent
84
+
85
+ UNIVARIATE_REGRESSION_CLASS=Statsample::Regression::Multiple::MatrixEngine
86
+ MULTIVARIATE_REGRESSION_CLASS=Statsample::Regression::Multiple::MultipleDependent
87
+
88
# Display name for a predictor: an Array (a block of predictors) becomes
# "(a,b,...)"; anything else is returned untouched.
def self.predictor_name(variable)
  return variable unless variable.is_a? Array
  "(#{variable.join(',')})"
end
95
+ # Creates a new DominanceAnalysis object
96
+ # Parameters:
97
+ # * input: A Matrix or Dataset object
98
+ # * dependent: Name of dependent variable. Could be an array, if you want to
99
+ # do a Multivariate Regression Analysis. If nil, set to all
100
+ # fields on input, except criteria
101
+
102
# * input: a Daru::DataFrame or a ::Matrix; anything else raises
#   ArgumentError
# * dependent: name of the dependent variable, or an Array of names. An
#   Array selects MULTIVARIATE_REGRESSION_CLASS with :r2yx, anything else
#   UNIVARIATE_REGRESSION_CLASS with :r2
# * opts: attribute => value pairs; every key that has a matching writer
#   (+key=+) is assigned, any other key is ignored
#
# Predictors default to every field of the input except the dependent
# one(s). For a data frame the correlation matrix and the number of cases
# are computed from the input, replacing any :cases passed in opts.
def initialize(input, dependent, opts=Hash.new)
  @build_from_dataset = false
  if dependent.is_a? Array
    @regression_class = MULTIVARIATE_REGRESSION_CLASS
    @method_association = :r2yx
  else
    @regression_class = UNIVARIATE_REGRESSION_CLASS
    @method_association = :r2
  end

  @name = nil
  opts.each do |k, v|
    # Test for the writer, not the reader: read-only names such as
    # :dependent or :models would pass a getter check and then fail on the
    # missing setter.
    setter = "#{k}="
    send(setter, v) if respond_to?(setter)
  end
  @dependent = dependent
  @dependent = [@dependent] unless @dependent.is_a? Array

  if input.kind_of? Daru::DataFrame
    @predictors ||= input.vectors.to_a - @dependent
    @ds = input
    @matrix = Statsample::Bivariate.correlation_matrix(input)
    @cases = Statsample::Bivariate.min_n_valid(input)
  elsif input.is_a? ::Matrix
    @predictors ||= input.fields - @dependent
    @ds = nil
    @matrix = input
  else
    raise ArgumentError.new("You should use a Matrix or a Dataset")
  end

  @name = _("Dominance Analysis: %s over %s") % [ @predictors.flatten.join(",") , @dependent.join(",")] if @name.nil?
  # Filled lazily by compute / general_averages.
  @models = nil
  @models_data = nil
  @general_averages = nil
end
137
+ # Compute models.
138
# Builds every subset model, then records each predictor's contribution.
def compute
  create_models
  fill_models
end
142
# List of models (arrays of predictors); computed on first access.
def models
  compute if @models.nil?
  @models
end
148
+
149
# Hash of sorted predictor list => ModelData; computed on first access.
def models_data
  compute if @models_data.nil?
  @models_data
end
155
# Enumerates every non-empty combination of predictors, from size 1 up to
# all of them. Each combination is appended to @models and a ModelData for
# it is stored in @models_data under the combination sorted by to_s.
# The model input is a subset of the dataset when @build_from_dataset is
# set, otherwise a submatrix of the correlation matrix.
def create_models
  @models = []
  @models_data = {}
  positions = (0...@predictors.size).to_a
  1.upto(@predictors.size) do |size|
    positions.combination(size).each do |chosen|
      independent = chosen.collect { |pos| @predictors[pos] }
      @models.push(independent)
      fields = independent.flatten + @dependent
      subset = if @build_from_dataset
                 @ds.dup(fields)
               else
                 @matrix.submatrix(fields)
               end

      modeldata = ModelData.new(independent, subset, self)
      models_data[independent.sort { |a, b| a.to_s <=> b.to_s }] = modeldata
    end
  end
end
175
# For every model and every predictor not already in it, registers on the
# model the r2 of the model extended with that predictor (add_contribution
# turns it into the gain over the base model).
def fill_models
  @models.each do |model|
    @predictors.each do |predictor|
      next if model.include? predictor
      base_model = md(model)
      extended_model = md(model + [predictor])
      base_model.add_contribution(predictor, extended_model.r2)
    end
  end
end
185
+ private :create_models, :fill_models
186
+
187
# Compares the r2 of the single-predictor models for +i+ and +j+:
# 1 when i's is larger, 0 when j's is larger, 0.5 otherwise.
def dominance_for_nil_model(i,j)
  r2_i = md([i]).r2
  r2_j = md([j]).r2
  return 1 if r2_i > r2_j
  return 0 if r2_i < r2_j
  0.5
end
196
+ # Returns 1 if i D j (i dominates j), 0 if j dominates i and 0.5 if undetermined
197
# Compares +i+ and +j+ in the single-predictor models and in every model
# that has a contribution for both. A tie anywhere, or disagreement
# between comparisons, yields 0.5; otherwise the common outcome (1 or 0).
def total_dominance_pairwise(i,j)
  nil_model = dominance_for_nil_model(i, j)
  return 0.5 if nil_model == 0.5
  outcomes = [nil_model]
  models_data.each do |_key, model|
    contrib_i = model.contributions[i]
    contrib_j = model.contributions[j]
    # Only models lacking both predictors have a contribution for each.
    next if contrib_i.nil? || contrib_j.nil?
    if contrib_i > contrib_j
      outcomes.push(1)
    elsif contrib_i < contrib_j
      outcomes.push(0)
    else
      return 0.5
    end
  end
  distinct = outcomes.uniq
  distinct.size > 1 ? 0.5 : distinct[0]
end
216
+
217
+ # Returns 1 if i cD j, 0 if j cD i and 0.5 if undetermined
218
# Compares +i+ and +j+ in the single-predictor models and on the average
# contribution for every model size k below the number of predictors.
# A tie anywhere, or disagreement between comparisons, yields 0.5;
# otherwise the common outcome (1 or 0).
def conditional_dominance_pairwise(i,j)
  nil_model = dominance_for_nil_model(i, j)
  return 0.5 if nil_model == 0.5
  outcomes = [nil_model]
  (1...@predictors.size).each do |k|
    averages = average_k(k)
    if averages[i] > averages[j]
      outcomes.push(1)
    elsif averages[i] < averages[j]
      outcomes.push(0)
    else
      return 0.5
    end
  end
  distinct = outcomes.uniq
  distinct.size > 1 ? 0.5 : distinct[0]
end
236
+ # Returns 1 if i gD j, 0 if j gD i and 0.5 if undetermined
237
# Compares the general averages of +i+ and +j+.
def general_dominance_pairwise(i,j)
  overall = general_averages
  return 1 if overall[i] > overall[j]
  return 0 if overall[i] < overall[j]
  0.5
end
247
# All models made of exactly two predictors.
def pairs
  models.select { |model| model.size == 2 }
end
250
# Hash of pair => total_dominance_pairwise result, for every pair.
def total_dominance
  pairs.each_with_object({}) do |pair, result|
    result[pair] = total_dominance_pairwise(pair[0], pair[1])
  end
end
255
# Hash of pair => conditional_dominance_pairwise result, for every pair.
def conditional_dominance
  pairs.each_with_object({}) do |pair, result|
    result[pair] = conditional_dominance_pairwise(pair[0], pair[1])
  end
end
260
# Hash of pair => general_dominance_pairwise result, for every pair.
def general_dominance
  pairs.each_with_object({}) do |pair, result|
    result[pair] = general_dominance_pairwise(pair[0], pair[1])
  end
end
265
+
266
# ModelData for the model +m+ (an array of predictors). Models are stored
# under their predictors sorted by to_s, so the lookup key is built the
# same way.
def md(m)
  key = m.sort { |x, y| x.to_s <=> y.to_s }
  models_data[key]
end
269
+ # Get all models of size k
270
# ModelData of every model with exactly +k+ predictors, in @models order.
def md_k(k)
  of_size_k = @models.select { |model| model.size == k }
  of_size_k.map { |model| md(model) }
end
275
+
276
+ # For a hash with arrays of numbers as values
277
+ # Returns a hash with same keys and
278
+ # value as the mean of values of original hash
279
# Maps each key of +averages+ to the mean of its array of numbers, using
# Daru::Vector#mean.
def get_averages(averages)
  averages.each_with_object({}) do |(key, values), means|
    means[key] = Daru::Vector.new(values).mean
  end
end
284
+ # Hash with average for each k size model.
285
# Average contribution of each predictor over the models of size +k+.
# Returns nil when +k+ equals the number of predictors (the full model has
# nothing left to add).
def average_k(k)
  return nil if k == @predictors.size
  size_k_models = md_k(k)
  collected = @predictors.each_with_object({}) { |predictor, acc| acc[predictor] = [] }
  size_k_models.each do |model|
    @predictors.each do |predictor|
      contribution = model.contributions[predictor]
      collected[predictor].push(contribution) unless contribution.nil?
    end
  end
  get_averages(collected)
end
296
# For each predictor, the mean of its single-predictor r2 and its average
# contribution at every model size below the number of predictors.
# The result is cached in @general_averages.
def general_averages
  return @general_averages unless @general_averages.nil?
  series = @predictors.each_with_object({}) do |predictor, acc|
    acc[predictor] = [md([predictor]).r2]
  end
  (1...@predictors.size).each do |k|
    by_size = average_k(k)
    @predictors.each { |predictor| series[predictor].push(by_size[predictor]) }
  end
  @general_averages = get_averages(series)
end
309
+
310
+
311
# Writes the analysis into the report builder +g+: a results table (the
# "Model 0" row with single-predictor r2, every model grouped by size with
# the per-size averages, and the overall averages) followed by the
# pairwise dominance table. Models are computed first if needed.
def report_building(g)
  compute if @models.nil?
  g.section(:name => @name) do |generator|
    header = ["", "r2", _("sign")] + @predictors.collect { |c| DominanceAnalysis.predictor_name(c) }

    generator.table(:name => _("Dominance Analysis result"), :header => header) do |t|
      model_zero = [_("Model 0"), "", ""] + @predictors.collect { |f|
        sprintf("%0.3f", md([f]).r2)
      }
      t.row(model_zero)
      t.hr

      1.upto(@predictors.size) do |size|
        md_k(size).each { |model| t.row(model.add_table_row) }
        # average_k is nil for the full model, which gets no average row.
        averages = average_k(size)
        next if averages.nil?
        t.hr
        average_row = [_("k=%d Average") % size, "", ""] + @predictors.collect { |f|
          sprintf("%0.3f", averages[f])
        }
        t.row(average_row)
        t.hr
      end

      overall = general_averages
      t.hr
      overall_row = [_("Overall averages"), "", ""] + @predictors.collect { |f|
        sprintf("%0.3f", overall[f])
      }
      t.row(overall_row)
    end

    td = total_dominance
    cd = conditional_dominance
    gd = general_dominance
    generator.table(:name => _("Pairwise dominance"), :header => [_("Pairs"), _("Total"), _("Conditional"), _("General")]) do |t|
      pairs.each do |pair|
        label = pair.map { |v| v.is_a?(Array) ? "(" + v.join("-") + ")" : v }.join(" - ")
        t.row([label, sprintf("%0.1f", td[pair]), sprintf("%0.1f", cd[pair]), sprintf("%0.1f", gd[pair])])
      end
    end
  end
end
362
class ModelData # :nodoc:
  # Hash of predictor => gain in r2 obtained by adding that predictor to
  # this model. Predictors of the model itself start (and stay) at nil.
  attr_reader :contributions

  # * independent: predictors that make up this model
  # * data: input handed to the regression class
  # * da: the owning analysis; predictors, dependent, cases,
  #   method_association and regression_class are read from it
  def initialize(independent, data, da)
    @independent = independent
    @data = data
    @predictors = da.predictors
    @dependent = da.dependent
    @cases = da.cases
    @method = da.method_association
    @contributions = @independent.inject({}) { |a, v| a[v] = nil; a }

    r_class = da.regression_class

    # A single dependent variable is passed by name, several as an array.
    criterion = @dependent.size == 1 ? @dependent[0] : @dependent
    @lr = r_class.new(data, criterion, :cases => @cases)
  end

  # Stores the contribution of predictor +f+: +v+ (the r2 of the model
  # extended with +f+) minus this model's own r2.
  def add_contribution(f, v)
    @contributions[f] = v - r2
  end

  # Association measure of this model, read from the regression object
  # through the analysis' method_association.
  def r2
    @lr.send(@method)
  end

  # Predictor names joined with "*".
  def name
    @independent.collect { |variable|
      DominanceAnalysis.predictor_name(variable)
    }.join("*")
  end

  # Report row: name, r2, significance ("???" when the number of cases is
  # unknown) and one cell per predictor ("--" where there is no
  # contribution).
  def add_table_row
    sign = if @cases
             sprintf("%0.3f", @lr.probability)
           else
             "???"
           end

    [name, sprintf("%0.3f", r2), sign] + @predictors.collect { |k|
      v = @contributions[k]
      v.nil? ? "--" : sprintf("%0.3f", v)
    }
  end

  # Two-line text summary: name, r2 and p on the first line, then the
  # contributions separated by " | ".
  def summary
    # The format string has three placeholders, so only three values are
    # passed; the former fourth argument (@lr.sst) was never printed.
    out = sprintf("%s: r2=%0.3f(p=%0.2f)\n", name, r2, @lr.significance)
    out << @predictors.collect { |k|
      v = @contributions[k]
      v.nil? ? "--" : sprintf("%s=%0.3f", k, v)
    }.join(" | ")
    out << "\n"
    return out
  end
end # end ModelData
422
+ end # end Dominance Analysis
423
+ end
424
+
425
+ require 'statsample/dominanceanalysis/bootstrap'