statsample-ekatena 2.0.2

Sign up to get free protection for your applications and to get access to all the features.
Files changed (156) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +15 -0
  3. data/.travis.yml +23 -0
  4. data/CONTRIBUTING.md +17 -0
  5. data/Gemfile +2 -0
  6. data/History.txt +457 -0
  7. data/LICENSE.txt +12 -0
  8. data/README.md +175 -0
  9. data/Rakefile +44 -0
  10. data/benchmarks/correlation_matrix_15_variables.rb +32 -0
  11. data/benchmarks/correlation_matrix_5_variables.rb +33 -0
  12. data/benchmarks/correlation_matrix_methods/correlation_matrix.ds +0 -0
  13. data/benchmarks/correlation_matrix_methods/correlation_matrix.html +93 -0
  14. data/benchmarks/correlation_matrix_methods/correlation_matrix.rb +71 -0
  15. data/benchmarks/correlation_matrix_methods/correlation_matrix.xls +0 -0
  16. data/benchmarks/correlation_matrix_methods/correlation_matrix_gsl_ruby.ods +0 -0
  17. data/benchmarks/correlation_matrix_methods/correlation_matrix_with_graphics.ods +0 -0
  18. data/benchmarks/correlation_matrix_methods/results.ds +0 -0
  19. data/benchmarks/factor_map.rb +37 -0
  20. data/benchmarks/helpers_benchmark.rb +5 -0
  21. data/data/locale/es/LC_MESSAGES/statsample.mo +0 -0
  22. data/doc_latex/manual/equations.tex +78 -0
  23. data/examples/boxplot.rb +28 -0
  24. data/examples/chisquare_test.rb +23 -0
  25. data/examples/correlation_matrix.rb +32 -0
  26. data/examples/dataset.rb +30 -0
  27. data/examples/dominance_analysis.rb +33 -0
  28. data/examples/dominance_analysis_bootstrap.rb +32 -0
  29. data/examples/histogram.rb +26 -0
  30. data/examples/icc.rb +24 -0
  31. data/examples/levene.rb +29 -0
  32. data/examples/multiple_regression.rb +20 -0
  33. data/examples/multivariate_correlation.rb +33 -0
  34. data/examples/parallel_analysis.rb +40 -0
  35. data/examples/polychoric.rb +40 -0
  36. data/examples/principal_axis.rb +26 -0
  37. data/examples/reliability.rb +31 -0
  38. data/examples/scatterplot.rb +25 -0
  39. data/examples/t_test.rb +27 -0
  40. data/examples/tetrachoric.rb +17 -0
  41. data/examples/u_test.rb +24 -0
  42. data/examples/vector.rb +20 -0
  43. data/examples/velicer_map_test.rb +46 -0
  44. data/grab_references.rb +29 -0
  45. data/lib/spss.rb +134 -0
  46. data/lib/statsample-ekatena/analysis.rb +100 -0
  47. data/lib/statsample-ekatena/analysis/suite.rb +89 -0
  48. data/lib/statsample-ekatena/analysis/suitereportbuilder.rb +44 -0
  49. data/lib/statsample-ekatena/anova.rb +24 -0
  50. data/lib/statsample-ekatena/anova/contrast.rb +79 -0
  51. data/lib/statsample-ekatena/anova/oneway.rb +187 -0
  52. data/lib/statsample-ekatena/anova/twoway.rb +207 -0
  53. data/lib/statsample-ekatena/bivariate.rb +406 -0
  54. data/lib/statsample-ekatena/bivariate/pearson.rb +54 -0
  55. data/lib/statsample-ekatena/codification.rb +182 -0
  56. data/lib/statsample-ekatena/converter/csv.rb +28 -0
  57. data/lib/statsample-ekatena/converter/spss.rb +48 -0
  58. data/lib/statsample-ekatena/converters.rb +211 -0
  59. data/lib/statsample-ekatena/crosstab.rb +188 -0
  60. data/lib/statsample-ekatena/daru.rb +115 -0
  61. data/lib/statsample-ekatena/dataset.rb +10 -0
  62. data/lib/statsample-ekatena/dominanceanalysis.rb +425 -0
  63. data/lib/statsample-ekatena/dominanceanalysis/bootstrap.rb +232 -0
  64. data/lib/statsample-ekatena/factor.rb +104 -0
  65. data/lib/statsample-ekatena/factor/map.rb +124 -0
  66. data/lib/statsample-ekatena/factor/parallelanalysis.rb +166 -0
  67. data/lib/statsample-ekatena/factor/pca.rb +242 -0
  68. data/lib/statsample-ekatena/factor/principalaxis.rb +243 -0
  69. data/lib/statsample-ekatena/factor/rotation.rb +198 -0
  70. data/lib/statsample-ekatena/formula/fit_model.rb +46 -0
  71. data/lib/statsample-ekatena/formula/formula.rb +306 -0
  72. data/lib/statsample-ekatena/graph.rb +11 -0
  73. data/lib/statsample-ekatena/graph/boxplot.rb +236 -0
  74. data/lib/statsample-ekatena/graph/histogram.rb +198 -0
  75. data/lib/statsample-ekatena/graph/scatterplot.rb +213 -0
  76. data/lib/statsample-ekatena/histogram.rb +180 -0
  77. data/lib/statsample-ekatena/matrix.rb +329 -0
  78. data/lib/statsample-ekatena/multiset.rb +310 -0
  79. data/lib/statsample-ekatena/regression.rb +65 -0
  80. data/lib/statsample-ekatena/regression/multiple.rb +89 -0
  81. data/lib/statsample-ekatena/regression/multiple/alglibengine.rb +128 -0
  82. data/lib/statsample-ekatena/regression/multiple/baseengine.rb +251 -0
  83. data/lib/statsample-ekatena/regression/multiple/gslengine.rb +129 -0
  84. data/lib/statsample-ekatena/regression/multiple/matrixengine.rb +205 -0
  85. data/lib/statsample-ekatena/regression/multiple/rubyengine.rb +86 -0
  86. data/lib/statsample-ekatena/regression/simple.rb +121 -0
  87. data/lib/statsample-ekatena/reliability.rb +150 -0
  88. data/lib/statsample-ekatena/reliability/icc.rb +415 -0
  89. data/lib/statsample-ekatena/reliability/multiscaleanalysis.rb +181 -0
  90. data/lib/statsample-ekatena/reliability/scaleanalysis.rb +233 -0
  91. data/lib/statsample-ekatena/reliability/skillscaleanalysis.rb +114 -0
  92. data/lib/statsample-ekatena/resample.rb +15 -0
  93. data/lib/statsample-ekatena/shorthand.rb +125 -0
  94. data/lib/statsample-ekatena/srs.rb +169 -0
  95. data/lib/statsample-ekatena/test.rb +82 -0
  96. data/lib/statsample-ekatena/test/bartlettsphericity.rb +45 -0
  97. data/lib/statsample-ekatena/test/chisquare.rb +73 -0
  98. data/lib/statsample-ekatena/test/f.rb +52 -0
  99. data/lib/statsample-ekatena/test/kolmogorovsmirnov.rb +63 -0
  100. data/lib/statsample-ekatena/test/levene.rb +88 -0
  101. data/lib/statsample-ekatena/test/t.rb +309 -0
  102. data/lib/statsample-ekatena/test/umannwhitney.rb +208 -0
  103. data/lib/statsample-ekatena/test/wilcoxonsignedrank.rb +90 -0
  104. data/lib/statsample-ekatena/vector.rb +19 -0
  105. data/lib/statsample-ekatena/version.rb +3 -0
  106. data/lib/statsample.rb +282 -0
  107. data/po/es/statsample.mo +0 -0
  108. data/po/es/statsample.po +959 -0
  109. data/po/statsample.pot +947 -0
  110. data/references.txt +24 -0
  111. data/statsample-ekatena.gemspec +49 -0
  112. data/test/fixtures/bank2.dat +200 -0
  113. data/test/fixtures/correlation_matrix.rb +17 -0
  114. data/test/fixtures/df.csv +15 -0
  115. data/test/fixtures/hartman_23.matrix +9 -0
  116. data/test/fixtures/stock_data.csv +500 -0
  117. data/test/fixtures/tetmat_matrix.txt +5 -0
  118. data/test/fixtures/tetmat_test.txt +1001 -0
  119. data/test/helpers_tests.rb +83 -0
  120. data/test/test_analysis.rb +176 -0
  121. data/test/test_anova_contrast.rb +36 -0
  122. data/test/test_anovaoneway.rb +26 -0
  123. data/test/test_anovatwoway.rb +37 -0
  124. data/test/test_anovatwowaywithdataset.rb +47 -0
  125. data/test/test_anovawithvectors.rb +102 -0
  126. data/test/test_awesome_print_bug.rb +16 -0
  127. data/test/test_bartlettsphericity.rb +25 -0
  128. data/test/test_bivariate.rb +164 -0
  129. data/test/test_codification.rb +78 -0
  130. data/test/test_crosstab.rb +67 -0
  131. data/test/test_dominance_analysis.rb +39 -0
  132. data/test/test_factor.rb +228 -0
  133. data/test/test_factor_map.rb +38 -0
  134. data/test/test_factor_pa.rb +56 -0
  135. data/test/test_fit_model.rb +88 -0
  136. data/test/test_ggobi.rb +35 -0
  137. data/test/test_gsl.rb +15 -0
  138. data/test/test_histogram.rb +109 -0
  139. data/test/test_matrix.rb +48 -0
  140. data/test/test_multiset.rb +176 -0
  141. data/test/test_regression.rb +231 -0
  142. data/test/test_reliability.rb +223 -0
  143. data/test/test_reliability_icc.rb +198 -0
  144. data/test/test_reliability_skillscale.rb +57 -0
  145. data/test/test_resample.rb +24 -0
  146. data/test/test_srs.rb +9 -0
  147. data/test/test_statistics.rb +69 -0
  148. data/test/test_stest.rb +69 -0
  149. data/test/test_stratified.rb +17 -0
  150. data/test/test_test_f.rb +33 -0
  151. data/test/test_test_kolmogorovsmirnov.rb +34 -0
  152. data/test/test_test_t.rb +62 -0
  153. data/test/test_umannwhitney.rb +27 -0
  154. data/test/test_vector.rb +12 -0
  155. data/test/test_wilcoxonsignedrank.rb +64 -0
  156. metadata +570 -0
@@ -0,0 +1,198 @@
1
+ module Statsample
2
+ module Factor
3
# Base class for component matrix rotation.
#
# == Reference:
# * SPSS Manual
# * Lin, J. (2007). VARIMAX_K58 [Source code]. [http://www.johnny-lin.com/idl_code/varimax_k58.pro]
#
# Subclasses supply +x(a, b, c, d)+ and +y(a, b, c, d)+ (the two arguments
# handed to +atan2+ when the rotation angle of a factor pair is computed)
# and +rotation_name+.
#
# Use subclasses Varimax, Equimax or Quartimax for desired type of rotation
# Use:
#   a = Matrix[ [ 0.4320,  0.8129,  0.3872]
#             , [ 0.7950, -0.5416,  0.2565]
#             , [ 0.5944,  0.7234, -0.3441]
#             , [ 0.8945, -0.3921, -0.1863] ]
#   rotation = Statsample::Factor::Varimax.new(a)
#   rotation.iterate
#   p rotation.rotated
#   p rotation.component_transformation_matrix
#
class Rotation
  EPSILON = 1e-15
  MAX_ITERATIONS = 25
  include Summarizable
  include DirtyMemoize
  attr_reader :iterations, :rotated, :component_transformation_matrix, :h2
  # Maximum number of iterations
  attr_accessor :max_iterations
  # Maximum precision
  attr_accessor :epsilon
  attr_accessor :use_gsl
  dirty_writer :max_iterations, :epsilon
  dirty_memoize :iterations, :rotated, :component_transformation_matrix, :h2

  # @param matrix component matrix to rotate (rows: variables, columns: factors)
  # @param opts [Hash] option name => value; an entry is applied only when
  #   the object exposes a public writer for it (e.g. :max_iterations,
  #   :epsilon, :use_gsl), anything else is ignored
  def initialize(matrix, opts = Hash.new)
    @name = _("%s rotation") % rotation_name
    @matrix = matrix
    @n = @matrix.row_size # Variables, p on original
    @m = @matrix.column_size # Factors, r on original
    @component_transformation_matrix = nil
    @max_iterations = MAX_ITERATIONS
    @epsilon = EPSILON
    @rotated = nil
    # Row sums of the squared loadings (exposed as #h2 / #communalities).
    @h2 = (@matrix.collect { |c| c**2 } * Matrix.column_vector([1] * @m)).column(0).to_a
    @use_gsl = Statsample.has_gsl?
    opts.each do |k, v|
      # Check for the writer itself: several readable attributes
      # (iterations, rotated, ...) have no setter.
      setter = "#{k}="
      public_send(setter, v) if respond_to?(setter)
    end
  end

  def report_building(g)
    g.section(:name => @name) do |s|
      s.parse_element(rotated)
      s.parse_element(component_transformation_matrix)
    end
  end

  alias_method :communalities, :h2
  alias_method :rotated_component_matrix, :rotated

  def compute
    iterate
  end

  # Start iteration.
  #
  # Sweeps over every pair of factors, rotating each pair in its plane by
  # the angle derived from the subclass criterion, until one full sweep
  # rotates no pair (every angle has sin(|phi|) below +epsilon+) or
  # +max_iterations+ sweeps have run.
  #
  # Sets @iterations, @rotated and @component_transformation_matrix.
  # @return the rotated component matrix
  def iterate
    k_matrix = @use_gsl ? GSL::Matrix : ::Matrix
    t = k_matrix.identity(@m)
    loadings = @use_gsl ? @matrix.to_gsl : @matrix.dup
    h = k_matrix.diagonal(*@h2).collect { |c| Math.sqrt(c) }
    h_inverse = h.collect { |c| c != 0 ? 1 / c : 0 }
    # Rows scaled by 1/sqrt(h2); the scaling is undone after the loop.
    bh = h_inverse * loadings
    @not_converged = true
    @iterations = 0
    while @not_converged
      # ">=" so that at most max_iterations sweeps are executed.
      break if @iterations >= @max_iterations
      @iterations += 1
      rotated_any = false
      (0..(@m - 2)).each do |i| # go through factor index 0:r-1-1
        ((i + 1)..(@m - 1)).each do |j| # pair i to "rest" of factors
          xx = bh.column(i)
          yy = bh.column(j)
          tx = t.column(i)
          ty = t.column(j)

          uu = @n.times.collect { |row| xx[row]**2 - yy[row]**2 }
          vv = @n.times.collect { |row| 2 * xx[row] * yy[row] }

          a = @n.times.inject(0) { |ac, row| ac + uu[row] }
          b = @n.times.inject(0) { |ac, row| ac + vv[row] }
          c = @n.times.inject(0) { |ac, row| ac + (uu[row]**2 - vv[row]**2) }
          d = @n.times.inject(0) { |ac, row| ac + (2 * uu[row] * vv[row]) }
          phi = Math.atan2(x(a, b, c, d), y(a, b, c, d)) / 4.0

          # Angle too small (or NaN): leave this pair untouched.
          next unless Math.sin(phi.abs) >= @epsilon

          rotated_any = true
          cos_phi = Math.cos(phi)
          sin_phi = Math.sin(phi)
          xx_rot = (cos_phi * xx) + (sin_phi * yy)
          yy_rot = ((-sin_phi) * xx) + (cos_phi * yy)
          tx_rot = (cos_phi * tx) + (sin_phi * ty)
          ty_rot = ((-sin_phi) * tx) + (cos_phi * ty)

          # Write the rotated columns back through plain arrays, then
          # rebuild matrices of the active backend.
          bh = bh.to_a
          @n.times do |row|
            bh[row][i] = xx_rot[row]
            bh[row][j] = yy_rot[row]
          end
          t = t.to_a
          @m.times do |row|
            t[row][i] = tx_rot[row]
            t[row][j] = ty_rot[row]
          end
          bh = k_matrix[*bh]
          t = k_matrix[*t]
        end
      end
      # Converged once a whole sweep rotates nothing; this also covers
      # matrices with fewer than two factors, which have no pairs at all.
      @not_converged = rotated_any
    end

    @rotated = h * bh
    @rotated.extend CovariateMatrix
    @rotated.name = _("Rotated Component matrix")
    @rotated.fields_x =
      @matrix.respond_to?(:fields_x) ? @matrix.fields_x : default_field_names(@n)
    @rotated.fields_y =
      @matrix.respond_to?(:fields_y) ? @matrix.fields_y : default_field_names(@m)

    @component_transformation_matrix = t
    @component_transformation_matrix.extend CovariateMatrix
    @component_transformation_matrix.name = _("Component transformation matrix")
    @component_transformation_matrix.fields =
      @matrix.respond_to?(:fields_y) ? @matrix.fields_y : default_field_names(@m)

    @rotated
  end

  private

  # Fallback labels "var_1".."var_<count>" used when the input matrix
  # carries no field names.
  def default_field_names(count)
    count.times.map { |i| "var_#{i + 1}" }
  end
end
162
# Rotation whose angle terms subtract the product/difference of the
# summed terms divided by the number of variables (@n).
class Varimax < Rotation
  def rotation_name
    "Varimax"
  end

  # First atan2 argument for a factor pair.
  def x(a, b, c, d)
    variables = @n.to_f
    correction = (2 * a * b) / variables
    d - correction
  end

  # Second atan2 argument for a factor pair.
  def y(a, b, c, d)
    variables = @n.to_f
    correction = (a**2 - b**2) / variables
    c - correction
  end
end
173
# Rotation whose correction terms are additionally weighted by the
# number of factors (@m).
class Equimax < Rotation
  def rotation_name
    "Equimax"
  end

  # First atan2 argument for a factor pair.
  def x(a, b, c, d)
    correction = (@m * a * b) / @n.to_f
    d - correction
  end

  # Second atan2 argument for a factor pair.
  def y(a, b, c, d)
    halved = (a**2 - b**2) / (2 * @n.to_f)
    c - (@m * halved)
  end
end
185
# Rotation that uses the raw +d+ and +c+ sums, with no correction term.
class Quartimax < Rotation
  def rotation_name
    "Quartimax"
  end

  # First atan2 argument for a factor pair: +d+ unchanged.
  def x(a, b, c, d)
    d
  end

  # Second atan2 argument for a factor pair: +c+ unchanged.
  def y(a, b, c, d)
    c
  end
end
197
+ end
198
+ end
@@ -0,0 +1,46 @@
1
+ require 'statsample/formula/formula'
2
+
3
+ module Statsample
4
# Class for performing regression.
#
# Parses a formula against a data frame, builds the design data frame
# from the formula's canonical tokens and fits it with
# Statsample::Regression.multiple.
class FitModel
  # @param formula [String] formula handed to FormulaWrapper
  # @param df data frame holding the vectors named in the formula
  # @param opts [Hash] options forwarded to Statsample::Regression.multiple
  def initialize(formula, df, opts = {})
    @formula = FormulaWrapper.new formula, df
    @df = df
    @opts = opts
  end

  # @return the fitted regression model; fitted on first access and cached
  def model
    @model ||= fit_model
  end

  # @param new_data data frame with the predictor vectors
  # @return whatever the fitted model's #predict returns for +new_data+
  def predict(new_data)
    model.predict(df_for_prediction(new_data))
  end

  # @return design data frame built from +df+ (predictors only)
  def df_for_prediction(df)
    canonicalize_df(df)
  end

  # @return design data frame of the training data plus the response vector
  def df_for_regression
    df = canonicalize_df(@df)
    df[@formula.y.value] = @df[@formula.y.value]
    df
  end

  # Expands every canonical token (except a leading intercept token '1')
  # into a data frame and merges the pieces.
  #
  # The formula's own token list is left untouched, so the intercept
  # stays visible to the formula object after the model has been fitted.
  #
  # @param orig_df data frame the tokens are evaluated against
  # @return merged data frame, or nil when no token remains
  def canonicalize_df(orig_df)
    tokens = @formula.canonical_tokens
    leading = tokens.first
    # drop returns a new array; shift would mutate the formula's state.
    tokens = tokens.drop(1) if leading && leading.value == '1'
    tokens.map { |t| t.to_df orig_df }.reduce(&:merge)
  end

  # Fits the regression and stores it in @model.
  def fit_model
    # TODO: Add support for inclusion/exclusion of intercept
    @model = Statsample::Regression.multiple(
      df_for_regression,
      @formula.y.value,
      @opts
    )
  end
end
46
+ end
@@ -0,0 +1,306 @@
1
+ module Statsample
2
# This class recognizes what terms are numeric
# and accordingly forms groups which are fed to Formula.
# Once they are parsed with Formula, they are combined back.
class FormulaWrapper
  attr_reader :tokens, :y, :canonical_tokens

  # Initializes formula wrapper object to parse a given formula into
  # some tokens which do not overlap one another.
  # @note Specify 0 as a term in the formula if you do not want constant
  #   to be included in the parsed formula
  # @param [String] formula to parse, of the form 'lhs~term+term+...'
  # @param [Daru::DataFrame] df dataframe required to know what vectors
  #   are numerical
  # @raise [ArgumentError] if the formula lacks a left- or right-hand side
  # @example
  #   df = Daru::DataFrame.from_csv 'spec/data/df.csv'
  #   df.to_category 'c', 'd', 'e'
  #   formula = Statsample::FormulaWrapper.new 'y~a+d:c', df
  #   formula.canonical_to_s
  #   #=> "1+c(-)+d(-):c+a"
  def initialize(formula, df)
    @df = df
    # @y stores the LHS term, i.e. the name of the vector to be predicted
    # @tokens stores the RHS terms of the formula
    @y, *@tokens = split_to_tokens(formula)
    @tokens = @tokens.uniq.sort
    manage_constant_term
    @canonical_tokens = non_redundant_tokens
  end

  # Returns canonical tokens in a readable form.
  # @return [String] canonical tokens in a readable form.
  # @note 'y~a+b(-)' means 'a' exists in full rank expansion
  #   and 'b(-)' exists in reduced rank expansion
  # @example
  #   df = Daru::DataFrame.from_csv 'spec/data/df.csv'
  #   df.to_category 'c', 'd', 'e'
  #   formula = Statsample::FormulaWrapper.new 'y~a+d:c', df
  #   formula.canonical_to_s
  #   #=> "1+c(-)+d(-):c+a"
  def canonical_to_s
    canonical_tokens.join '+'
  end

  # Returns tokens to produce non-redundant design matrix
  # @return [Array] array of tokens that do not produce redundant matrix
  def non_redundant_tokens
    # TODO: An enhancement
    # Right now x:c appears as c:x
    split_to_groups.flat_map do |numeric, group|
      # Strip the shared numeric terms, reduce what is left with Formula,
      # then attach the numeric terms again.
      reduced = Formula.new(strip_numeric(group, numeric)).canonical_tokens
      add_numeric reduced, numeric
    end
  end

  private

  # Removes intercept token if term '0' is found in the formula.
  # Intercept token remains if term '1' is found.
  # If neither term '0' nor term '1' is found then, intercept token is added.
  def manage_constant_term
    intercept = Token.new('1')
    no_intercept = Token.new('0')
    explicit = @tokens.include?(intercept) || @tokens.include?(no_intercept)
    @tokens.unshift intercept unless explicit
    @tokens.delete no_intercept
  end

  # Groups the tokens to groups based on the numerical terms
  # they are interacting with.
  # @return [Hash] array of numeric term names => tokens sharing them
  def split_to_groups
    @tokens.group_by { |t| extract_numeric t }
  end

  # Add numeric interaction term which was removed earlier
  # @param [Array] tokens tokens on which to add numerical terms
  # @param [Array] numeric array of numeric terms to add
  # @return [Array] tokens with the numeric terms appended
  def add_numeric(tokens, numeric)
    tokens.map do |t|
      terms = t.interact_terms + numeric
      if terms == ['1']
        Token.new('1')
      else
        # '1' only stood in for "no other terms"; drop the placeholder.
        terms = terms.reject { |i| i == '1' }
        Token.new terms.join(':'), t.full
      end
    end
  end

  # Strip numerical interacting terms
  # @param [Array] tokens tokens from which to strip numeric
  # @param [Array] numeric array of numeric terms to strip from tokens
  # @return [Array] array of tokens with stripped numerical terms
  def strip_numeric(tokens, numeric)
    tokens.map do |t|
      terms = t.interact_terms - numeric
      terms = ['1'] if terms.empty?
      Token.new terms.join(':')
    end
  end

  # Extract numeric interacting terms
  # @param [Statsample::Token] token from which to extract numeric terms
  # @return [Array] array of numerical terms
  def extract_numeric(token)
    terms = token.interact_terms
    return [] if terms == ['1']
    terms.reject { |t| @df[t].category? }
  end

  # Splits 'lhs~a+b+...' (whitespace ignored) into tokens, LHS first.
  # @param [String] formula formula to split
  # @return [Array] tokens; the first one is the LHS term
  # @raise [ArgumentError] if either side of '~' is missing
  def split_to_tokens(formula)
    formula = formula.gsub(/\s+/, '')
    lhs_term, rhs = formula.split '~'
    if lhs_term.nil? || lhs_term.empty? || rhs.nil? || rhs.empty?
      raise ArgumentError,
            "formula must have the form 'y~term+term', got #{formula.inspect}"
    end
    rhs_terms = rhs.split '+'
    ([lhs_term] + rhs_terms).map { |t| Token.new t }
  end
end
117
+
118
# Reduces a list of tokens to canonical tokens: each token contributes
# only those expanded terms that earlier tokens have not already covered.
class Formula
  attr_reader :tokens, :canonical_tokens

  # @param [Array] tokens tokens to reduce
  def initialize(tokens)
    @tokens = tokens
    @canonical_tokens = parse_formula
  end

  # @return [String] canonical tokens joined with '+'
  def canonical_to_s
    canonical_tokens.join '+'
  end

  private

  # Walks the tokens in order, appending the non-redundant part of each.
  def parse_formula
    @tokens.each_with_object([]) do |token, collected|
      collected.concat add_non_redundant_elements(token, collected)
    end
  end

  # Terms of +token+ not yet present in the expansion of +result_so_far+,
  # contracted back as far as possible. The intercept is passed through.
  def add_non_redundant_elements(token, result_so_far)
    return [token] if token.value == '1'

    already_covered = result_so_far.flat_map(&:expand)
    fresh = token.expand - already_covered
    contract_if_possible fresh
  end

  # Repeatedly replaces the first pair of tokens that can be added
  # together by their sum, then returns the remaining tokens sorted.
  def contract_if_possible(tokens)
    loop do
      merged = nil
      pair = tokens.combination(2).find { |a, b| merged = a.add(b) }
      break unless pair

      pair.each { |member| tokens.delete member }
      tokens << merged
    end
    tokens.sort
  end
end
159
+
160
# To encapsulate interaction as well as non-interaction terms.
#
# A token is one formula term: '1' (intercept), a single name ('a') or
# an interaction ('a:b'). +full+ holds one boolean per interacting term;
# #to_s renders a false entry as 'name(-)'.
class Token
  # :value is defined explicitly below, so it is not listed here.
  attr_reader :full, :interact_terms

  # @param [String] value term text; ':' separates interacting terms
  # @param [Boolean, Array<Boolean>] full one flag for every term, or an
  #   array of per-term flags (padded with true when too short)
  def initialize(value, full = true)
    @interact_terms = value.include?(':') ? value.split(':') : [value]
    @full = coerce_full full
  end

  # @return [String] the term text, interacting terms joined with ':'
  def value
    interact_terms.join(':')
  end

  # @return [Integer] number of interacting terms; 0 for the intercept '1'
  def size
    # TODO: Return size 1 for value '1' also
    # Can't do this at the moment because have to make
    # changes in sorting first
    value == '1' ? 0 : interact_terms.size
  end

  # Combines two tokens into one when a contraction rule applies:
  #   ANYTHING + FACTOR- : ANYTHING = FACTOR : ANYTHING
  #   ANYTHING + ANYTHING : FACTOR- = ANYTHING : FACTOR
  #   1 + ANYTHING                  = ANYTHING (marked full)
  # @param [Token] other token to combine with
  # @return [Token, nil] the combined token, or nil when no rule applies
  def add(other)
    # Always apply the rules with the smaller token as receiver.
    return other.add(self) if size > other.size

    if other.size == 2 && size == 1
      absorb_into_interaction(other)
    elsif value == '1' && other.size == 1
      Token.new(other.value, true)
    end
  end

  def ==(other)
    value == other.value &&
      full == other.full
  end

  alias eql? ==

  # Consistent with #eql?: derived from the same two fields.
  def hash
    [value, full].hash
  end

  # Orders tokens by number of interacting terms only.
  def <=>(other)
    size <=> other.size
  end

  # @return [String] terms joined with ':', each non-full term suffixed '(-)'
  def to_s
    interact_terms
      .zip(full)
      .map { |t, f| f ? t : "#{t}(-)" }
      .join ':'
  end

  # Splits the token into the intercept, its single terms and (for an
  # interaction) the interaction itself, all but the intercept non-full.
  # @return [Array<Token>]
  # @raise [ArgumentError] for interactions of more than two terms,
  #   which are not supported
  def expand
    case size
    when 0
      [self]
    when 1
      [Token.new('1'), Token.new(value, false)]
    when 2
      a, b = interact_terms
      [Token.new('1'), Token.new(a, false), Token.new(b, false),
       Token.new("#{a}:#{b}", [false, false])]
    else
      raise ArgumentError,
            "cannot expand '#{value}': interactions of more than two terms are not supported"
    end
  end

  # Builds the data-frame columns this token stands for.
  # @param df data frame holding the vectors named by the token
  # @return a data frame, or nil for the intercept and for interactions
  #   of more than two terms
  def to_df(df)
    case size
    when 1
      if df[value].category?
        df[value].contrast_code full: full.first
      else
        Daru::DataFrame.new value => df[value].to_a
      end
    when 2
      to_df_when_interaction(df)
    end
  end

  private

  # Normalizes +value+ to one boolean per interacting term.
  def coerce_full(value)
    if value.is_a? Array
      value + Array.new((@interact_terms.size - value.size), true)
    else
      [value] * @interact_terms.size
    end
  end

  # Contraction rules for a single term (self) and a two-term
  # interaction (+other+); returns nil when neither rule matches.
  def absorb_into_interaction(other)
    left, right = other.interact_terms
    if right == value &&
       other.full.last == full.first &&
       other.full.first == false
      Token.new("#{left}:#{value}", [true, other.full.last])
    elsif left == value &&
          other.full.first == full.first &&
          other.full.last == false
      Token.new("#{value}:#{right}", [other.full.first, true])
    end
  end

  # Dispatches on which of the two interacting vectors are categorical.
  def to_df_when_interaction(df)
    case interact_terms.map { |t| df[t].category? }
    when [true, true]
      df.interact_code(interact_terms, full)
    when [false, false]
      to_df_numeric_interact_with_numeric df
    when [true, false]
      to_df_category_interact_with_numeric df
    when [false, true]
      to_df_numeric_interact_with_category df
    end
  end

  # Single column: element-wise product of the two vectors.
  def to_df_numeric_interact_with_numeric(df)
    Daru::DataFrame.new value => (df[interact_terms.first] *
      df[interact_terms.last]).to_a
  end

  # One column per contrast-coded vector of the first term, each
  # multiplied by the numeric second term.
  def to_df_category_interact_with_numeric(df)
    a, b = interact_terms
    Daru::DataFrame.new(
      df[a].contrast_code(full: full.first)
        .map { |dv| ["#{dv.name}:#{b}", (dv * df[b]).to_a] }
        .to_h
    )
  end

  # One column per contrast-coded vector of the second term, each
  # multiplied by the numeric first term.
  def to_df_numeric_interact_with_category(df)
    a, b = interact_terms
    Daru::DataFrame.new(
      df[b].contrast_code(full: full.last)
        .map { |dv| ["#{a}:#{dv.name}", (dv * df[a]).to_a] }
        .to_h
    )
  end
end
306
+ end