statsample-ekatena 2.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (156) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +15 -0
  3. data/.travis.yml +23 -0
  4. data/CONTRIBUTING.md +17 -0
  5. data/Gemfile +2 -0
  6. data/History.txt +457 -0
  7. data/LICENSE.txt +12 -0
  8. data/README.md +175 -0
  9. data/Rakefile +44 -0
  10. data/benchmarks/correlation_matrix_15_variables.rb +32 -0
  11. data/benchmarks/correlation_matrix_5_variables.rb +33 -0
  12. data/benchmarks/correlation_matrix_methods/correlation_matrix.ds +0 -0
  13. data/benchmarks/correlation_matrix_methods/correlation_matrix.html +93 -0
  14. data/benchmarks/correlation_matrix_methods/correlation_matrix.rb +71 -0
  15. data/benchmarks/correlation_matrix_methods/correlation_matrix.xls +0 -0
  16. data/benchmarks/correlation_matrix_methods/correlation_matrix_gsl_ruby.ods +0 -0
  17. data/benchmarks/correlation_matrix_methods/correlation_matrix_with_graphics.ods +0 -0
  18. data/benchmarks/correlation_matrix_methods/results.ds +0 -0
  19. data/benchmarks/factor_map.rb +37 -0
  20. data/benchmarks/helpers_benchmark.rb +5 -0
  21. data/data/locale/es/LC_MESSAGES/statsample.mo +0 -0
  22. data/doc_latex/manual/equations.tex +78 -0
  23. data/examples/boxplot.rb +28 -0
  24. data/examples/chisquare_test.rb +23 -0
  25. data/examples/correlation_matrix.rb +32 -0
  26. data/examples/dataset.rb +30 -0
  27. data/examples/dominance_analysis.rb +33 -0
  28. data/examples/dominance_analysis_bootstrap.rb +32 -0
  29. data/examples/histogram.rb +26 -0
  30. data/examples/icc.rb +24 -0
  31. data/examples/levene.rb +29 -0
  32. data/examples/multiple_regression.rb +20 -0
  33. data/examples/multivariate_correlation.rb +33 -0
  34. data/examples/parallel_analysis.rb +40 -0
  35. data/examples/polychoric.rb +40 -0
  36. data/examples/principal_axis.rb +26 -0
  37. data/examples/reliability.rb +31 -0
  38. data/examples/scatterplot.rb +25 -0
  39. data/examples/t_test.rb +27 -0
  40. data/examples/tetrachoric.rb +17 -0
  41. data/examples/u_test.rb +24 -0
  42. data/examples/vector.rb +20 -0
  43. data/examples/velicer_map_test.rb +46 -0
  44. data/grab_references.rb +29 -0
  45. data/lib/spss.rb +134 -0
  46. data/lib/statsample-ekatena/analysis.rb +100 -0
  47. data/lib/statsample-ekatena/analysis/suite.rb +89 -0
  48. data/lib/statsample-ekatena/analysis/suitereportbuilder.rb +44 -0
  49. data/lib/statsample-ekatena/anova.rb +24 -0
  50. data/lib/statsample-ekatena/anova/contrast.rb +79 -0
  51. data/lib/statsample-ekatena/anova/oneway.rb +187 -0
  52. data/lib/statsample-ekatena/anova/twoway.rb +207 -0
  53. data/lib/statsample-ekatena/bivariate.rb +406 -0
  54. data/lib/statsample-ekatena/bivariate/pearson.rb +54 -0
  55. data/lib/statsample-ekatena/codification.rb +182 -0
  56. data/lib/statsample-ekatena/converter/csv.rb +28 -0
  57. data/lib/statsample-ekatena/converter/spss.rb +48 -0
  58. data/lib/statsample-ekatena/converters.rb +211 -0
  59. data/lib/statsample-ekatena/crosstab.rb +188 -0
  60. data/lib/statsample-ekatena/daru.rb +115 -0
  61. data/lib/statsample-ekatena/dataset.rb +10 -0
  62. data/lib/statsample-ekatena/dominanceanalysis.rb +425 -0
  63. data/lib/statsample-ekatena/dominanceanalysis/bootstrap.rb +232 -0
  64. data/lib/statsample-ekatena/factor.rb +104 -0
  65. data/lib/statsample-ekatena/factor/map.rb +124 -0
  66. data/lib/statsample-ekatena/factor/parallelanalysis.rb +166 -0
  67. data/lib/statsample-ekatena/factor/pca.rb +242 -0
  68. data/lib/statsample-ekatena/factor/principalaxis.rb +243 -0
  69. data/lib/statsample-ekatena/factor/rotation.rb +198 -0
  70. data/lib/statsample-ekatena/formula/fit_model.rb +46 -0
  71. data/lib/statsample-ekatena/formula/formula.rb +306 -0
  72. data/lib/statsample-ekatena/graph.rb +11 -0
  73. data/lib/statsample-ekatena/graph/boxplot.rb +236 -0
  74. data/lib/statsample-ekatena/graph/histogram.rb +198 -0
  75. data/lib/statsample-ekatena/graph/scatterplot.rb +213 -0
  76. data/lib/statsample-ekatena/histogram.rb +180 -0
  77. data/lib/statsample-ekatena/matrix.rb +329 -0
  78. data/lib/statsample-ekatena/multiset.rb +310 -0
  79. data/lib/statsample-ekatena/regression.rb +65 -0
  80. data/lib/statsample-ekatena/regression/multiple.rb +89 -0
  81. data/lib/statsample-ekatena/regression/multiple/alglibengine.rb +128 -0
  82. data/lib/statsample-ekatena/regression/multiple/baseengine.rb +251 -0
  83. data/lib/statsample-ekatena/regression/multiple/gslengine.rb +129 -0
  84. data/lib/statsample-ekatena/regression/multiple/matrixengine.rb +205 -0
  85. data/lib/statsample-ekatena/regression/multiple/rubyengine.rb +86 -0
  86. data/lib/statsample-ekatena/regression/simple.rb +121 -0
  87. data/lib/statsample-ekatena/reliability.rb +150 -0
  88. data/lib/statsample-ekatena/reliability/icc.rb +415 -0
  89. data/lib/statsample-ekatena/reliability/multiscaleanalysis.rb +181 -0
  90. data/lib/statsample-ekatena/reliability/scaleanalysis.rb +233 -0
  91. data/lib/statsample-ekatena/reliability/skillscaleanalysis.rb +114 -0
  92. data/lib/statsample-ekatena/resample.rb +15 -0
  93. data/lib/statsample-ekatena/shorthand.rb +125 -0
  94. data/lib/statsample-ekatena/srs.rb +169 -0
  95. data/lib/statsample-ekatena/test.rb +82 -0
  96. data/lib/statsample-ekatena/test/bartlettsphericity.rb +45 -0
  97. data/lib/statsample-ekatena/test/chisquare.rb +73 -0
  98. data/lib/statsample-ekatena/test/f.rb +52 -0
  99. data/lib/statsample-ekatena/test/kolmogorovsmirnov.rb +63 -0
  100. data/lib/statsample-ekatena/test/levene.rb +88 -0
  101. data/lib/statsample-ekatena/test/t.rb +309 -0
  102. data/lib/statsample-ekatena/test/umannwhitney.rb +208 -0
  103. data/lib/statsample-ekatena/test/wilcoxonsignedrank.rb +90 -0
  104. data/lib/statsample-ekatena/vector.rb +19 -0
  105. data/lib/statsample-ekatena/version.rb +3 -0
  106. data/lib/statsample.rb +282 -0
  107. data/po/es/statsample.mo +0 -0
  108. data/po/es/statsample.po +959 -0
  109. data/po/statsample.pot +947 -0
  110. data/references.txt +24 -0
  111. data/statsample-ekatena.gemspec +49 -0
  112. data/test/fixtures/bank2.dat +200 -0
  113. data/test/fixtures/correlation_matrix.rb +17 -0
  114. data/test/fixtures/df.csv +15 -0
  115. data/test/fixtures/hartman_23.matrix +9 -0
  116. data/test/fixtures/stock_data.csv +500 -0
  117. data/test/fixtures/tetmat_matrix.txt +5 -0
  118. data/test/fixtures/tetmat_test.txt +1001 -0
  119. data/test/helpers_tests.rb +83 -0
  120. data/test/test_analysis.rb +176 -0
  121. data/test/test_anova_contrast.rb +36 -0
  122. data/test/test_anovaoneway.rb +26 -0
  123. data/test/test_anovatwoway.rb +37 -0
  124. data/test/test_anovatwowaywithdataset.rb +47 -0
  125. data/test/test_anovawithvectors.rb +102 -0
  126. data/test/test_awesome_print_bug.rb +16 -0
  127. data/test/test_bartlettsphericity.rb +25 -0
  128. data/test/test_bivariate.rb +164 -0
  129. data/test/test_codification.rb +78 -0
  130. data/test/test_crosstab.rb +67 -0
  131. data/test/test_dominance_analysis.rb +39 -0
  132. data/test/test_factor.rb +228 -0
  133. data/test/test_factor_map.rb +38 -0
  134. data/test/test_factor_pa.rb +56 -0
  135. data/test/test_fit_model.rb +88 -0
  136. data/test/test_ggobi.rb +35 -0
  137. data/test/test_gsl.rb +15 -0
  138. data/test/test_histogram.rb +109 -0
  139. data/test/test_matrix.rb +48 -0
  140. data/test/test_multiset.rb +176 -0
  141. data/test/test_regression.rb +231 -0
  142. data/test/test_reliability.rb +223 -0
  143. data/test/test_reliability_icc.rb +198 -0
  144. data/test/test_reliability_skillscale.rb +57 -0
  145. data/test/test_resample.rb +24 -0
  146. data/test/test_srs.rb +9 -0
  147. data/test/test_statistics.rb +69 -0
  148. data/test/test_stest.rb +69 -0
  149. data/test/test_stratified.rb +17 -0
  150. data/test/test_test_f.rb +33 -0
  151. data/test/test_test_kolmogorovsmirnov.rb +34 -0
  152. data/test/test_test_t.rb +62 -0
  153. data/test/test_umannwhitney.rb +27 -0
  154. data/test/test_vector.rb +12 -0
  155. data/test/test_wilcoxonsignedrank.rb +64 -0
  156. metadata +570 -0
@@ -0,0 +1,198 @@
1
+ module Statsample
2
+ module Factor
3
# Base class for component matrix rotation.
#
# == Reference:
# * SPSS Manual
# * Lin, J. (2007). VARIMAX_K58 [Source code]. [http://www.johnny-lin.com/idl_code/varimax_k58.pro]
#
# Use subclasses Varimax, Equimax or Quartimax for desired type of rotation.
# Subclasses must provide +x(a,b,c,d)+, +y(a,b,c,d)+ and +rotation_name+.
# Use:
#   a = Matrix[ [ 0.4320,  0.8129,  0.3872]
#             , [ 0.7950, -0.5416,  0.2565]
#             , [ 0.5944,  0.7234, -0.3441]
#             , [ 0.8945, -0.3921, -0.1863] ]
#   rotation = Statsample::Factor::Varimax.new(a)
#   rotation.iterate
#   p rotation.rotated
#   p rotation.component_transformation_matrix
#
class Rotation
  EPSILON=1e-15
  MAX_ITERATIONS=25
  include Summarizable
  include DirtyMemoize
  attr_reader :iterations, :rotated, :component_transformation_matrix, :h2
  # Maximum number of iterations
  attr_accessor :max_iterations
  # Maximum precision
  attr_accessor :epsilon
  attr_accessor :use_gsl
  dirty_writer :max_iterations, :epsilon
  dirty_memoize :iterations, :rotated, :component_transformation_matrix, :h2

  # matrix:: loading matrix with one row per variable and one column per factor
  # opts::   hash of option => value; each pair is applied through the
  #          matching writer (for example :max_iterations, :epsilon, :use_gsl)
  def initialize(matrix, opts=Hash.new)
    @name=_("%s rotation") % rotation_name
    @matrix=matrix
    @n=@matrix.row_size # Variables, p on original
    @m=@matrix.column_size # Factors, r on original
    @component_transformation_matrix=nil
    @max_iterations=MAX_ITERATIONS
    @epsilon=EPSILON
    @rotated=nil
    # Row sums of the squared loadings (exposed as +communalities+)
    @h2=(@matrix.collect {|c| c**2} * Matrix.column_vector([1]*@m)).column(0).to_a
    @use_gsl=Statsample.has_gsl?
    opts.each do |k,v|
      # Test for the writer that is actually invoked. Testing for the reader
      # let keys that only have a reader (e.g. :iterations) through and then
      # failed with NoMethodError on the missing writer.
      setter="#{k}="
      send(setter,v) if respond_to?(setter)
    end
  end

  # Adds a section with the rotated matrix and the component
  # transformation matrix to the report generator +g+.
  def report_building(g)
    g.section(:name=>@name) do |s|
      s.parse_element(rotated)
      s.parse_element(component_transformation_matrix)
    end
  end
  alias_method :communalities, :h2
  alias_method :rotated_component_matrix, :rotated

  def compute
    iterate
  end

  # Start iteration.
  #
  # Each row of the loading matrix is divided by the square root of its
  # communality, every pair of factor columns is then rotated in its plane
  # by phi = atan2(x, y) / 4 (x and y come from the subclass), and the
  # rows are finally rescaled. A sweep in which every pair has
  # sin(|phi|) < epsilon ends the iteration.
  #
  # Sets @iterations, @rotated and @component_transformation_matrix and
  # returns @rotated.
  def iterate
    k_matrix=@use_gsl ? GSL::Matrix : ::Matrix
    t=k_matrix.identity(@m)
    loadings=(@use_gsl ? @matrix.to_gsl : @matrix.dup)
    h=k_matrix.diagonal(*@h2).collect {|c| Math::sqrt(c)}
    # Zero entries stay zero, so this is the inverse of the diagonal scaling
    h_inverse=h.collect {|c| c!=0 ? 1/c : 0 }
    bh=h_inverse * loadings
    @not_converged=true
    @iterations=0
    while @not_converged
      # NOTE(review): the comparison is `>`, so up to max_iterations + 1
      # sweeps can run. Left untouched to keep numerical results stable.
      break if @iterations>@max_iterations
      @iterations+=1
      # Pairs still to be found converged in this sweep
      num_pairs=@m*(@m-1).quo(2)
      (0..(@m-2)).each do |i| #+ go through factor index 0:r-1-1 (begin)
        ((i+1)..(@m-1)).each do |j| #+ pair i to "rest" of factors (begin)

          xx = bh.column(i)
          yy = bh.column(j)
          tx = t.column(i)
          ty = t.column(j)

          uu = @n.times.collect {|var_i| xx[var_i]**2-yy[var_i]**2}
          vv = @n.times.collect {|var_i| 2*xx[var_i]*yy[var_i]}

          # Scalar sums handed to the subclass criterion. They have their own
          # names so they cannot be confused with the loading matrix above.
          sum_a = @n.times.inject(0) {|ac,var_i| ac+ uu[var_i] }
          sum_b = @n.times.inject(0) {|ac,var_i| ac+ vv[var_i] }
          sum_c = @n.times.inject(0) {|ac,var_i| ac+ (uu[var_i]**2 - vv[var_i]**2) }
          sum_d = @n.times.inject(0) {|ac,var_i| ac+ (2*uu[var_i]*vv[var_i]) }
          num=x(sum_a,sum_b,sum_c,sum_d)
          den=y(sum_a,sum_b,sum_c,sum_d)
          phi=Math::atan2(num,den) / 4.0

          if(Math::sin(phi.abs) >= @epsilon)
            xx_rot=( Math::cos(phi)*xx)+(Math::sin(phi)*yy)
            yy_rot=((-Math::sin(phi))*xx)+(Math::cos(phi)*yy)

            tx_rot=( Math::cos(phi)*tx)+(Math::sin(phi)*ty)
            ty_rot=((-Math::sin(phi))*tx)+(Math::cos(phi)*ty)

            # Write the rotated columns back through plain arrays, then
            # rebuild matrices of the selected class.
            bh=bh.to_a
            @n.times {|row_i|
              bh[row_i][i] = xx_rot[row_i]
              bh[row_i][j] = yy_rot[row_i]
            }
            t=t.to_a
            @m.times {|row_i|
              t[row_i][i]=tx_rot[row_i]
              t[row_i][j]=ty_rot[row_i]
            }
            bh=k_matrix[*bh]
            t=k_matrix[*t]
          else
            num_pairs=num_pairs-1
            @not_converged=false if num_pairs==0
          end # if
        end #j
      end #i
    end # while
    @rotated=h*bh
    @rotated.extend CovariateMatrix
    @rotated.name=_("Rotated Component matrix")

    if @matrix.respond_to? :fields_x
      @rotated.fields_x = @matrix.fields_x
    else
      @rotated.fields_x = @n.times.map {|i| "var_#{i+1}"}
    end
    if @matrix.respond_to? :fields_y
      @rotated.fields_y = @matrix.fields_y
    else
      @rotated.fields_y = @m.times.map {|i| "var_#{i+1}"}
    end

    @component_transformation_matrix=t
    @component_transformation_matrix.extend CovariateMatrix
    @component_transformation_matrix.name=_("Component transformation matrix")

    if @matrix.respond_to? :fields_y
      @component_transformation_matrix.fields = @matrix.fields_y
    else
      @component_transformation_matrix.fields = @m.times.map {|i| "var_#{i+1}"}
    end

    @rotated
  end

end
162
# Varimax criterion. +x+ and +y+ are the two arguments that
# Rotation#iterate passes to atan2 when computing the rotation angle.
class Varimax < Rotation
  # Numerator term: d - 2ab/n, with n the number of variables.
  def x(a,b,c,d)
    variables = @n.to_f
    d - ((2 * a * b) / variables)
  end

  # Denominator term: c - (a^2 - b^2)/n.
  def y(a,b,c,d)
    variables = @n.to_f
    c - (((a**2) - (b**2)) / variables)
  end

  # Label interpolated into the rotation's report name.
  def rotation_name
    "Varimax"
  end
end
173
# Equimax criterion. Same shape as Varimax, with the correction terms
# weighted by the number of factors (@m).
class Equimax < Rotation
  # Numerator term: d - m*a*b/n.
  def x(a,b,c,d)
    variables = @n.to_f
    d - ((@m * a * b) / variables)
  end

  # Denominator term: c - m*(a^2 - b^2)/(2n).
  def y(a,b,c,d)
    scaled_difference = ((a**2) - (b**2)) / (2 * @n.to_f)
    c - (@m * scaled_difference)
  end

  # Label interpolated into the rotation's report name.
  def rotation_name
    "Equimax"
  end
end
185
# Quartimax criterion: the sums c and d are used without any correction
# term, so +a+ and +b+ are accepted only to keep the shared signature.
class Quartimax < Rotation
  # Numerator term: d, unchanged.
  def x(a,b,c,d)
    d
  end

  # Denominator term: c, unchanged.
  def y(a,b,c,d)
    c
  end

  # Label interpolated into the rotation's report name.
  def rotation_name
    "Quartimax"
  end
end
197
+ end
198
+ end
@@ -0,0 +1,46 @@
1
+ require 'statsample/formula/formula'
2
+
3
+ module Statsample
4
# Class for performing regression.
#
# Wraps a formula string and a dataframe: the formula is parsed by
# FormulaWrapper, the dataframe is expanded into the columns the formula
# asks for, and the result is handed to Statsample::Regression.multiple.
class FitModel
  # @param [String] formula formula passed to FormulaWrapper
  # @param df dataframe holding the vectors named in the formula
  # @param [Hash] opts options forwarded to Statsample::Regression.multiple
  def initialize(formula, df, opts = {})
    @formula = FormulaWrapper.new formula, df
    @df = df
    @opts = opts
  end

  # Returns the fitted model, fitting it on first use.
  def model
    @model || fit_model
  end

  # Expands +new_data+ the same way as the training data and asks the
  # fitted model for predictions.
  def predict(new_data)
    model.predict(df_for_prediction(new_data))
  end

  def df_for_prediction df
    canonicalize_df(df)
  end

  # Expanded predictors plus the response column copied from the
  # original dataframe.
  def df_for_regression
    df = canonicalize_df(@df)
    df[@formula.y.value] = @df[@formula.y.value]
    df
  end

  # Builds the dataframe of predictor columns for +orig_df+ by converting
  # every canonical token (except a leading intercept token '1') with
  # Token#to_df and merging the pieces.
  def canonicalize_df(orig_df)
    tokens = @formula.canonical_tokens
    leading = tokens.first
    # Skip the intercept without touching the formula's own array. The
    # previous `shift` removed it from @formula.canonical_tokens for good,
    # and `tokens.first.value` raised on an empty token list.
    tokens = tokens.drop(1) if leading && leading.value == '1'
    tokens.map { |t| t.to_df orig_df }.reduce(&:merge)
  end

  # Fits and caches the regression model.
  def fit_model
    # TODO: Add support for inclusion/exclusion of intercept
    @model = Statsample::Regression.multiple(
      df_for_regression,
      @formula.y.value,
      @opts
    )
  end
end
46
+ end
@@ -0,0 +1,306 @@
1
+ module Statsample
2
# This class recognizes what terms are numeric
# and accordingly forms groups which are fed to Formula.
# Once they are parsed with Formula, they are combined back.
class FormulaWrapper
  attr_reader :tokens, :y, :canonical_tokens

  # Initializes formula wrapper object to parse a given formula into
  # some tokens which do not overlap one another.
  # @note Specify 0 as a term in the formula if you do not want constant
  #   to be included in the parsed formula
  # @param [String] formula to parse, of the form 'y~a+b'
  # @param [Daru::DataFrame] df dataframe required to know what vectors
  #   are numerical
  # @raise [ArgumentError] if the formula has no left-hand side or no
  #   right-hand side
  # @example
  #   df = Daru::DataFrame.from_csv 'spec/data/df.csv'
  #   df.to_category 'c', 'd', 'e'
  #   formula = Statsample::FormulaWrapper.new 'y~a+d:c', df
  #   formula.canonical_to_s
  #   #=> "1+c(-)+d(-):c+a"
  def initialize(formula, df)
    @df = df
    # @y store the LHS term that is name of vector to be predicted
    # @tokens store the RHS terms of the formula
    @y, *@tokens = split_to_tokens(formula)
    @tokens = @tokens.uniq.sort
    manage_constant_term
    @canonical_tokens = non_redundant_tokens
  end

  # Returns canonical tokens in a readable form.
  # @return [String] canonical tokens in a readable form.
  # @note 'y~a+b(-)' means 'a' exist in full rank expansion
  #   and 'b(-)' exist in reduced rank expansion
  # @example
  #   df = Daru::DataFrame.from_csv 'spec/data/df.csv'
  #   df.to_category 'c', 'd', 'e'
  #   formula = Statsample::FormulaWrapper.new 'y~a+d:c', df
  #   formula.canonical_to_s
  #   #=> "1+c(-)+d(-):c+a"
  def canonical_to_s
    canonical_tokens.join '+'
  end

  # Returns tokens to produce non-redundant design matrix
  # @return [Array] array of tokens that do not produce redundant matrix
  def non_redundant_tokens
    groups = split_to_groups
    # TODO: An enhancement
    # Right now x:c appears as c:x
    groups.each { |k, v| groups[k] = strip_numeric v, k }
    groups.each { |k, v| groups[k] = Formula.new(v).canonical_tokens }
    groups.flat_map { |k, v| add_numeric v, k }
  end

  private

  # Removes intercept token if term '0' is found in the formula.
  # Intercept token remains if term '1' is found.
  # If neither term '0' nor term '1' is found then, intercept token is added.
  def manage_constant_term
    intercept = Token.new('1')
    no_intercept = Token.new('0')
    unless @tokens.include?(intercept) || @tokens.include?(no_intercept)
      @tokens.unshift intercept
    end
    @tokens.delete no_intercept
  end

  # Groups the tokens to groups based on the numerical terms
  # they are interacting with.
  def split_to_groups
    @tokens.group_by { |t| extract_numeric t }
  end

  # Add numeric interaction term which was removed earlier
  # @param [Array] tokens tokens on which to add numerical terms
  # @param [Array] numeric array of numeric terms to add
  def add_numeric(tokens, numeric)
    tokens.map do |t|
      terms = t.interact_terms + numeric
      if terms == ['1']
        Token.new('1')
      else
        terms = terms.reject { |i| i == '1' }
        Token.new terms.join(':'), t.full
      end
    end
  end

  # Strip numerical interacting terms
  # @param [Array] tokens tokens from which to strip numeric
  # @param [Array] numeric array of numeric terms to strip from tokens
  # @return [Array] array of tokens with stripped numerical terms
  def strip_numeric(tokens, numeric)
    tokens.map do |t|
      terms = t.interact_terms - numeric
      terms = ['1'] if terms.empty?
      Token.new terms.join(':')
    end
  end

  # Extract numeric interacting terms
  # @param [Statsample::Token] token from which to extract numeric terms
  # @return [Array] array of numerical terms
  def extract_numeric(token)
    terms = token.interact_terms
    return [] if terms == ['1']
    terms.reject { |t| @df[t].category? }
  end

  # Splits 'lhs~t1+t2+...' (whitespace ignored) into tokens, LHS first.
  # @raise [ArgumentError] when either side of '~' is missing; without this
  #   check a formula such as 'y' failed later with NoMethodError on nil
  def split_to_tokens(formula)
    formula = formula.gsub(/\s+/, '')
    lhs_term, rhs = formula.split '~'
    if lhs_term.nil? || lhs_term.empty? || rhs.nil?
      raise ArgumentError,
            "formula must be of the form 'y~a+b', got #{formula.inspect}"
    end
    rhs_terms = rhs.split '+'
    ([lhs_term] + rhs_terms).map { |t| Token.new t }
  end
end
117
+
118
# To process formula language.
#
# Takes an ordered list of tokens and reduces it to +canonical_tokens+:
# each token is expanded, anything already covered by earlier tokens is
# dropped, and the remainder is merged back together where Token#add allows.
class Formula
  attr_reader :tokens, :canonical_tokens

  def initialize(tokens)
    @tokens = tokens
    @canonical_tokens = parse_formula
  end

  # Canonical tokens joined with '+'.
  def canonical_to_s
    canonical_tokens.join('+')
  end

  private

  # Walks the tokens in order, appending only what each one contributes
  # beyond the tokens collected so far.
  def parse_formula
    @tokens.each_with_object([]) do |token, collected|
      collected.concat(add_non_redundant_elements(token, collected))
    end
  end

  # Returns the pieces of +token+ not already present in the expansion of
  # +result_so_far+. The intercept token '1' is passed through untouched.
  def add_non_redundant_elements(token, result_so_far)
    return [token] if token.value == '1'

    pieces = token.expand
    covered = result_so_far.flat_map(&:expand)
    contract_if_possible(pieces - covered)
  end

  # Repeatedly replaces the first pair that Token#add can merge with the
  # merged token; once no pair merges, returns the tokens sorted.
  def contract_if_possible(tokens)
    merged = nil
    pair = tokens.combination(2).find { |left, right| merged = left.add(right) }
    return tokens.sort unless pair

    pair.each { |member| tokens.delete(member) }
    tokens << merged
    contract_if_possible(tokens)
  end
end
159
+
160
# To encapsulate interaction as well as non-interaction terms.
#
# A token stores the ':'-separated terms of one formula term
# (+interact_terms+) and a parallel array of flags (+full+), one per term.
# +to_s+ prints a term whose flag is false with a "(-)" suffix.
class Token
  # +value+ is defined as a method below, so it is not listed here; having
  # it in attr_reader as well only produced a method-redefined warning.
  attr_reader :full, :interact_terms

  # @param [String] value a term such as 'a', '1' or 'a:b'
  # @param [Boolean, Array<Boolean>] full one flag applied to every term,
  #   or one flag per term (missing trailing flags are filled with true)
  def initialize(value, full = true)
    @interact_terms = value.include?(':') ? value.split(':') : [value]
    @full = coerce_full full
  end

  # The terms joined back with ':'.
  def value
    interact_terms.join(':')
  end

  def size
    # TODO: Return size 1 for value '1' also
    # Can't do this at the moment because have to make
    # changes in sorting first
    value == '1' ? 0 : interact_terms.size
  end

  # Tries to merge this token with +other+.
  # @return [Token, nil] the merged token, or nil when the two do not combine
  def add(other)
    # ANYTHING + FACTOR- : ANYTHING = FACTOR : ANYTHING
    # ANYTHING + ANYTHING : FACTOR- = ANYTHING : FACTOR
    if size > other.size
      # Let the smaller token be the receiver so the cases below apply
      other.add self

    elsif other.size == 2 &&
          size == 1 &&
          other.interact_terms.last == value &&
          other.full.last == full.first &&
          other.full.first == false
      Token.new(
        "#{other.interact_terms.first}:#{value}",
        [true, other.full.last]
      )

    elsif other.size == 2 &&
          size == 1 &&
          other.interact_terms.first == value &&
          other.full.first == full.first &&
          other.full.last == false
      Token.new(
        "#{value}:#{other.interact_terms.last}",
        [other.full.first, true]
      )

    elsif value == '1' &&
          other.size == 1
      Token.new(other.value, true)
    end
  end

  # Tokens are equal when both value and flags match. Objects that do not
  # expose +value+ and +full+ (nil, strings, ...) compare as unequal
  # instead of raising NoMethodError.
  def ==(other)
    other.respond_to?(:value) &&
      other.respond_to?(:full) &&
      value == other.value &&
      full == other.full
  end

  alias eql? ==

  def hash
    value.hash ^ full.hash
  end

  # Orders tokens by number of terms only.
  def <=>(other)
    size <=> other.size
  end

  def to_s
    interact_terms
      .zip(full)
      .map { |t, f| f ? t : t + '(-)' }
      .join ':'
  end

  # Breaks the token into the intercept plus reduced-flag components.
  # @return [Array<Token>]
  # @raise [ArgumentError] for interactions of more than two terms, which
  #   previously returned nil and failed later with an unrelated error
  def expand
    case size
    when 0
      [self]
    when 1
      [Token.new('1'), Token.new(value, false)]
    when 2
      a, b = interact_terms
      [Token.new('1'), Token.new(a, false), Token.new(b, false),
       Token.new(a + ':' + b, [false, false])]
    else
      raise ArgumentError,
            "cannot expand '#{value}': interactions of more than two " \
            'terms are not supported'
    end
  end

  # Builds the dataframe columns this token stands for, reading the named
  # vectors from +df+. Returns nil for sizes other than 1 and 2.
  def to_df(df)
    case size
    when 1
      if df[value].category?
        df[value].contrast_code full: full.first
      else
        Daru::DataFrame.new value => df[value].to_a
      end
    when 2
      to_df_when_interaction(df)
    end
  end

  private

  # Normalizes the +full+ argument to one flag per interacting term.
  def coerce_full(value)
    if value.is_a? Array
      value + Array.new((@interact_terms.size - value.size), true)
    else
      [value] * @interact_terms.size
    end
  end

  # Dispatches on which of the two terms are categorical in +df+.
  def to_df_when_interaction(df)
    case interact_terms.map { |t| df[t].category? }
    when [true, true]
      df.interact_code(interact_terms, full)
    when [false, false]
      to_df_numeric_interact_with_numeric df
    when [true, false]
      to_df_category_interact_with_numeric df
    when [false, true]
      to_df_numeric_interact_with_category df
    end
  end

  # Single column holding the product of the two vectors.
  def to_df_numeric_interact_with_numeric(df)
    Daru::DataFrame.new value => (df[interact_terms.first] *
      df[interact_terms.last]).to_a
  end

  # One column per contrast-coded vector of the first term, each multiplied
  # by the numeric second term.
  def to_df_category_interact_with_numeric(df)
    a, b = interact_terms
    Daru::DataFrame.new(
      df[a].contrast_code(full: full.first)
        .map { |dv| ["#{dv.name}:#{b}", (dv * df[b]).to_a] }
        .to_h
    )
  end

  # One column per contrast-coded vector of the second term, each
  # multiplied by the numeric first term.
  def to_df_numeric_interact_with_category(df)
    a, b = interact_terms
    Daru::DataFrame.new(
      df[b].contrast_code(full: full.last)
        .map { |dv| ["#{a}:#{dv.name}", (dv * df[a]).to_a] }
        .to_h
    )
  end
end
306
+ end