statsample 2.0.2 → 2.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41) hide show
  1. checksums.yaml +4 -4
  2. data/History.txt +7 -0
  3. data/README.md +2 -4
  4. data/Rakefile +6 -0
  5. data/benchmarks/correlation_matrix_methods/correlation_matrix.rb +0 -1
  6. data/examples/correlation_matrix.rb +0 -16
  7. data/examples/dataset.rb +0 -7
  8. data/examples/dominance_analysis_bootstrap.rb +0 -6
  9. data/examples/reliability.rb +0 -2
  10. data/lib/statsample.rb +3 -2
  11. data/lib/statsample/anova/oneway.rb +1 -1
  12. data/lib/statsample/bivariate.rb +4 -4
  13. data/lib/statsample/converter/spss.rb +1 -1
  14. data/lib/statsample/crosstab.rb +3 -3
  15. data/lib/statsample/daru.rb +1 -3
  16. data/lib/statsample/factor/parallelanalysis.rb +1 -3
  17. data/lib/statsample/formula/fit_model.rb +46 -0
  18. data/lib/statsample/formula/formula.rb +306 -0
  19. data/lib/statsample/matrix.rb +0 -2
  20. data/lib/statsample/regression.rb +1 -3
  21. data/lib/statsample/regression/multiple/alglibengine.rb +1 -1
  22. data/lib/statsample/regression/multiple/gslengine.rb +1 -1
  23. data/lib/statsample/regression/multiple/rubyengine.rb +1 -3
  24. data/lib/statsample/reliability.rb +3 -3
  25. data/lib/statsample/reliability/icc.rb +1 -2
  26. data/lib/statsample/reliability/multiscaleanalysis.rb +0 -1
  27. data/lib/statsample/reliability/scaleanalysis.rb +2 -3
  28. data/lib/statsample/reliability/skillscaleanalysis.rb +1 -1
  29. data/lib/statsample/test/levene.rb +4 -4
  30. data/lib/statsample/test/t.rb +10 -10
  31. data/lib/statsample/test/umannwhitney.rb +3 -3
  32. data/lib/statsample/version.rb +1 -1
  33. data/statsample.gemspec +4 -1
  34. data/test/fixtures/df.csv +15 -0
  35. data/test/helpers_tests.rb +7 -0
  36. data/test/test_factor.rb +0 -5
  37. data/test/test_factor_pa.rb +1 -6
  38. data/test/test_fit_model.rb +88 -0
  39. data/test/test_reliability.rb +0 -10
  40. data/test/test_statistics.rb +1 -1
  41. metadata +52 -48
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: ee62b72f947f9760824885a479c92ce6dbc55127
4
- data.tar.gz: 7cb2c7057856eee78f69be2f7c1e43671cc8007e
3
+ metadata.gz: e2a80fff135f963dcabfe1593de243611eb8ab9a
4
+ data.tar.gz: cb2d80e85339201f8a37ea1b8e934953f26b5591
5
5
  SHA512:
6
- metadata.gz: 26b9d9aab40c4dc700fc4632a30cd195f5f0f0dfe6ab36a84d055f5d0e22ba7992c143ac2507069c02e3ec6e3b02b31f20ae1bf4c1f47a8846339c6d0e0b67b6
7
- data.tar.gz: 8eeac7c1f6aca3ed959ff15cfd97407876d46dfb8d9adf13aa892831772b02209142d73a46ac1581e848cf020b86cfccb93e21bd9f450da106d08ed755d5bd1b
6
+ metadata.gz: b9412e202d3364c6fe7f982a9ceb2828061312ffd0274c7bc25e8b5747abd4b11aca5edd960e22fb6ee8cfec409a6352961c51e283d9523fb608e5d66bf65377
7
+ data.tar.gz: 5db058e78ae638c155727ca51af3e303503bfd42b518de095aa93e9639194c9a273a8b4e3ae3e069d91e25500612beba48140f2e7024e2c329ba16adeca54bef
@@ -1,3 +1,10 @@
1
+ === 2.1.0 / 2017-08-10
2
+ * Update documentation to reflect methods that have been removed (@lokeshh)
3
+ * Update daru dependency to v0.1.6 (@lokeshh)
4
+ * Remove pre-daru legacy methods like n_valid, missing value functions (@lokeshh)
5
+ * Update test suite with rubocop and rake. New tests for methods like Regression (@lokeshh)
6
+ * Introduce fitting a regression using string formulas (@lokeshh)
7
+
1
8
  === 2.0.2 / 2016-03-11
2
9
  * Update dependencies (spreadsheet, GSL)
3
10
 
data/README.md CHANGED
@@ -11,7 +11,7 @@ Homepage :: https://github.com/sciruby/statsample
11
11
  You should have a recent version of GSL and R (with the `irr` and `Rserve` libraries) installed. In Ubuntu:
12
12
 
13
13
  ```bash
14
- $ sudo apt-get install libgs10-dev r-base r-base-dev
14
+ $ sudo apt-get install libgsl0-dev r-base r-base-dev
15
15
  $ sudo Rscript -e "install.packages(c('Rserve', 'irr'))"
16
16
  ```
17
17
 
@@ -86,7 +86,7 @@ Include:
86
86
  - Intra-class correlation
87
87
  - Anova: generic and vector-based One-way ANOVA and Two-way ANOVA, with contrasts for One-way ANOVA.
88
88
  - Tests: F, T, Levene, U-Mannwhitney.
89
- - Regression: Simple, Multiple (OLS), Probit and Logit
89
+ - Regression: Simple, Multiple (OLS)
90
90
  - Factorial Analysis: Extraction (PCA and Principal Axis), Rotation (Varimax, Equimax, Quartimax) and Parallel Analysis and Velicer's MAP test, for estimation of number of factors.
91
91
  - Reliability analysis for simple scale and a DSL to easily analyze multiple scales using factor analysis and correlations, if you want it.
92
92
  - Basic time series support
@@ -120,8 +120,6 @@ Include:
120
120
  - Multiple types of regression.
121
121
  - Simple Regression : Statsample::Regression::Simple
122
122
  - Multiple Regression: Statsample::Regression::Multiple
123
- - Logit Regression: Statsample::Regression::Binomial::Logit
124
- - Probit Regression: Statsample::Regression::Binomial::Probit
125
123
  - Factorial Analysis algorithms on Statsample::Factor module.
126
124
  - Classes for Extraction of factors:
127
125
  - Statsample::Factor::PCA
data/Rakefile CHANGED
@@ -1,4 +1,5 @@
1
1
  $:.unshift File.expand_path("../lib/", __FILE__)
2
+ lib_folder = File.expand_path("../lib", __FILE__)
2
3
 
3
4
  require 'statsample/version'
4
5
  require 'rake'
@@ -36,3 +37,8 @@ task "gettext:makemo" do
36
37
  require 'gettext/tools'
37
38
  GetText.create_mofiles()
38
39
  end
40
+
41
+ desc 'Run pry'
42
+ task :pry do |task|
43
+ sh "pry -r #{lib_folder}/statsample.rb"
44
+ end
@@ -60,7 +60,6 @@ end
60
60
 
61
61
  rs[:c_v] = rs.collect {|row| row[:cases]*row[:vars]}
62
62
 
63
- rs.update
64
63
  rs.save("correlation_matrix.ds")
65
64
  Statsample::Excel.write(rs,"correlation_matrix.xls")
66
65
 
@@ -7,18 +7,6 @@ $:.unshift(File.dirname(__FILE__)+'/../lib/')
7
7
 
8
8
  require 'statsample'
9
9
  Statsample::Analysis.store("Statsample::Bivariate.correlation_matrix") do
10
- # It so happens that Daru::Vector and Daru::DataFrame must update metadata
11
- # like positions of missing values every time they are created.
12
- #
13
- # Since we dont have any missing values in the data that we are creating,
14
- # we set Daru.lazy_update = true so that missing data is not updated every
15
- # time and things happen much faster.
16
- #
17
- # In case you do have missing data and lazy_update has been set to *true*,
18
- # you _SHOULD_ called `#update` on the concerned Vector or DataFrame object
19
- # everytime an assingment or deletion cycle is complete.
20
- Daru.lazy_update = true
21
-
22
10
  # Create a Daru::DataFrame containing 4 vectors a, b, c and d.
23
11
  #
24
12
  # Notice that the `clone` option has been set to *false*. This tells Daru
@@ -36,10 +24,6 @@ Statsample::Analysis.store("Statsample::Bivariate.correlation_matrix") do
36
24
  # Calculate correlation matrix by calling the `cor` shorthand.
37
25
  cm = cor(ds)
38
26
  summary(cm)
39
-
40
- # Set lazy_update to *false* once our job is done so that this analysis does
41
- # not accidentally affect code elsewhere.
42
- Daru.lazy_update = false
43
27
  end
44
28
 
45
29
  if __FILE__==$0
@@ -6,10 +6,6 @@ $:.unshift(File.dirname(__FILE__)+'/../lib/')
6
6
  require 'statsample'
7
7
 
8
8
  Statsample::Analysis.store(Daru::DataFrame) do
9
- # We set lazy_update to *true* so that time is not wasted in updating
10
- # metdata every time an assignment happens.
11
- Daru.lazy_update = true
12
-
13
9
  samples = 1000
14
10
 
15
11
  # The 'new_with_size' function lets you specify the size of the
@@ -26,9 +22,6 @@ Statsample::Analysis.store(Daru::DataFrame) do
26
22
  # order by default.
27
23
  ds = Daru::DataFrame.new({:a=>a,:b=>b}, order: [:b, :a])
28
24
  summary(ds)
29
-
30
- # Reset lazy_update to *false* to prevent other code from breaking.
31
- Daru.lazy_update = false
32
25
  end
33
26
 
34
27
  if __FILE__==$0
@@ -3,10 +3,6 @@ $:.unshift(File.dirname(__FILE__)+'/../lib/')
3
3
  require 'statsample'
4
4
 
5
5
  Statsample::Analysis.store(Statsample::DominanceAnalysis::Bootstrap) do
6
- # Remember to call *update* after an assignment/deletion cycle if lazy_update
7
- # is *false*.
8
- Daru.lazy_update = true
9
-
10
6
  sample=300
11
7
  a=rnorm(sample)
12
8
  b=rnorm(sample)
@@ -29,8 +25,6 @@ Statsample::Analysis.store(Statsample::DominanceAnalysis::Bootstrap) do
29
25
  dab2=dominance_analysis_bootstrap(ds2, :y1, :debug=>true)
30
26
  dab2.bootstrap(100,nil)
31
27
  summary(dab2)
32
-
33
- Daru.lazy_update = false
34
28
  end
35
29
 
36
30
  if __FILE__==$0
@@ -15,8 +15,6 @@ Statsample::Analysis.store(Statsample::Reliability) do
15
15
  ds["v#{i}".to_sym]= a + rnorm(samples,0,0.2)
16
16
  end
17
17
 
18
- ds.update
19
-
20
18
  rel=Statsample::Reliability::ScaleAnalysis.new(ds)
21
19
  summary rel
22
20
 
@@ -160,6 +160,7 @@ module Statsample
160
160
  autoload(:StratifiedSample, 'statsample/multiset')
161
161
  autoload(:MLE, 'statsample/mle')
162
162
  autoload(:Regression, 'statsample/regression')
163
+ autoload(:FitModel, 'statsample/formula/fit_model')
163
164
  autoload(:Test, 'statsample/test')
164
165
  autoload(:Factor, 'statsample/factor')
165
166
  autoload(:Graph, 'statsample/graph')
@@ -206,7 +207,7 @@ module Statsample
206
207
  def only_valid(*vs)
207
208
  i = 1
208
209
  h = vs.inject({}) { |acc, v| acc["v#{i}".to_sym] = v; i += 1; acc }
209
- df = Daru::DataFrame.new(h).dup_only_valid
210
+ df = Daru::DataFrame.new(h).reject_values(*Daru::MISSING_VALUES)
210
211
  df.map { |v| v }
211
212
  end
212
213
 
@@ -214,7 +215,7 @@ module Statsample
214
215
  # If any vectors have missing_values, return only valid.
215
216
  # If not, return the vectors itself
216
217
  def only_valid_clone(*vs)
217
- if vs.any?(&:has_missing_data?)
218
+ if vs.any? { |v| v.include_values?(*Daru::MISSING_VALUES) }
218
219
  only_valid(*vs)
219
220
  else
220
221
  vs
@@ -164,7 +164,7 @@ module Statsample
164
164
  if summary_descriptives
165
165
  s.table(:name=>_("Descriptives"),:header=>%w{Name N Mean SD Min Max}.map {|v| _(v)}) do |t|
166
166
  @vectors.each do |v|
167
- t.row [v.name, v.n_valid, "%0.4f" % v.mean, "%0.4f" % v.sd, "%0.4f" % v.min, "%0.4f" % v.max]
167
+ t.row [v.name, v.reject_values(*Daru::MISSING_VALUES).size, "%0.4f" % v.mean, "%0.4f" % v.sd, "%0.4f" % v.min, "%0.4f" % v.max]
168
168
  end
169
169
  end
170
170
  end
@@ -159,7 +159,7 @@ module Statsample
159
159
 
160
160
  def covariance_matrix(ds)
161
161
  vars,cases = ds.ncols, ds.nrows
162
- if !ds.has_missing_data? and Statsample.has_gsl? and prediction_optimized(vars,cases) < prediction_pairwise(vars,cases)
162
+ if !ds.include_values?(*Daru::MISSING_VALUES) and Statsample.has_gsl? and prediction_optimized(vars,cases) < prediction_pairwise(vars,cases)
163
163
  cm=covariance_matrix_optimized(ds)
164
164
  else
165
165
  cm=covariance_matrix_pairwise(ds)
@@ -198,7 +198,7 @@ module Statsample
198
198
  # Order of rows and columns depends on Dataset#fields order
199
199
  def correlation_matrix(ds)
200
200
  vars, cases = ds.ncols, ds.nrows
201
- if !ds.has_missing_data? and Statsample.has_gsl? and prediction_optimized(vars,cases) < prediction_pairwise(vars,cases)
201
+ if !ds.include_values?(*Daru::MISSING_VALUES) and Statsample.has_gsl? and prediction_optimized(vars,cases) < prediction_pairwise(vars,cases)
202
202
  cm=correlation_matrix_optimized(ds)
203
203
  else
204
204
  cm=correlation_matrix_pairwise(ds)
@@ -248,7 +248,7 @@ module Statsample
248
248
  m = vectors.collect do |row|
249
249
  vectors.collect do |col|
250
250
  if row==col
251
- ds[row].only_valid.size
251
+ ds[row].reject_values(*Daru::MISSING_VALUES).size
252
252
  else
253
253
  rowa,rowb = Statsample.only_valid_clone(ds[row],ds[col])
254
254
  rowa.size
@@ -281,7 +281,7 @@ module Statsample
281
281
  # Calculate Point biserial correlation. Equal to Pearson correlation, with
282
282
  # one dichotomous value replaced by "0" and the other by "1"
283
283
  def point_biserial(dichotomous,continous)
284
- ds = Daru::DataFrame.new({:d=>dichotomous,:c=>continous}).dup_only_valid
284
+ ds = Daru::DataFrame.new({:d=>dichotomous,:c=>continous}).reject_values(*Daru::MISSING_VALUES)
285
285
  raise(TypeError, "First vector should be dichotomous") if ds[:d].factors.size != 2
286
286
  raise(TypeError, "Second vector should be continous") if ds[:c].type != :numeric
287
287
  f0=ds[:d].factors.sort.to_a[0]
@@ -7,7 +7,7 @@ module Statsample
7
7
  # ds=Daru::DataFrame.from_excel("my_data.xls")
8
8
  # puts Statsample::SPSS.tetrachoric_correlation_matrix(ds)
9
9
  def tetrachoric_correlation_matrix(ds)
10
- dsv=ds.dup_only_valid
10
+ dsv=ds.reject_values(*Daru::MISSING_VALUES)
11
11
  # Delete all vectors doesn't have variation
12
12
  dsv.vectors.each { |f|
13
13
  if dsv[f].factors.size==1
@@ -29,10 +29,10 @@ module Statsample
29
29
  @v_cols.factors.sort.reset_index!
30
30
  end
31
31
  def rows_total
32
- @v_rows.frequencies
32
+ @v_rows.frequencies.to_h
33
33
  end
34
34
  def cols_total
35
- @v_cols.frequencies
35
+ @v_cols.frequencies.to_h
36
36
  end
37
37
 
38
38
  def frequencies
@@ -42,7 +42,7 @@ module Statsample
42
42
  s[par]=0
43
43
  s
44
44
  end
45
- base.update(Daru::Vector.new(Statsample::vector_cols_matrix(@v_rows,@v_cols).to_a).frequencies)
45
+ base.update(Daru::Vector.new(Statsample::vector_cols_matrix(@v_rows,@v_cols).to_a).frequencies.to_h)
46
46
  end
47
47
  def to_matrix
48
48
  f = frequencies
@@ -11,7 +11,7 @@ module Daru
11
11
  # ugly patch. The upper limit for a bin has the form
12
12
  # x < range
13
13
  #h=Statsample::Histogram.new(self, bins)
14
- valid = only_valid
14
+ valid = reject_values(*Daru::MISSING_VALUES)
15
15
  min,max=Statsample::Util.nice(valid.min,valid.max)
16
16
  # fix last data
17
17
  if max == valid.max
@@ -72,7 +72,6 @@ module Daru
72
72
  end
73
73
  #puts "Ingreso a los dataset"
74
74
  ms.datasets.each do |k,ds|
75
- ds.update
76
75
  ds.rename self[field].index_of(k)
77
76
  end
78
77
 
@@ -102,7 +101,6 @@ module Daru
102
101
  each_row { |r| p1.call(r) }
103
102
 
104
103
  ms.datasets.each do |k,ds|
105
- ds.update
106
104
  ds.rename(
107
105
  fields.size.times.map do |i|
108
106
  f = fields[i]
@@ -142,8 +142,7 @@ module Statsample
142
142
  raise "bootstrap_method doesn't recogniced"
143
143
  end
144
144
  end
145
- ds_bootstrap.update
146
-
145
+
147
146
  matrix=Statsample::Bivariate.send(matrix_method, ds_bootstrap)
148
147
  matrix=matrix.to_gsl if @use_gsl
149
148
  if smc
@@ -159,7 +158,6 @@ module Statsample
159
158
  redo
160
159
  end
161
160
  end
162
- @ds_eigenvalues.update
163
161
  end
164
162
  dirty_memoize :number_of_factors, :ds_eigenvalues
165
163
  dirty_writer :iterations, :bootstrap_method, :percentil, :smc
@@ -0,0 +1,46 @@
1
+ require 'statsample/formula/formula'
2
+
3
+ module Statsample
4
+ # Class for performing regression
5
+ class FitModel
6
+ def initialize(formula, df, opts = {})
7
+ @formula = FormulaWrapper.new formula, df
8
+ @df = df
9
+ @opts = opts
10
+ end
11
+
12
+ def model
13
+ @model || fit_model
14
+ end
15
+
16
+ def predict(new_data)
17
+ model.predict(df_for_prediction(new_data))
18
+ end
19
+
20
+ def df_for_prediction df
21
+ canonicalize_df(df)
22
+ end
23
+
24
+ def df_for_regression
25
+ df = canonicalize_df(@df)
26
+ df[@formula.y.value] = @df[@formula.y.value]
27
+ df
28
+ end
29
+
30
+ def canonicalize_df(orig_df)
31
+ tokens = @formula.canonical_tokens
32
+ tokens.shift if tokens.first.value == '1'
33
+ df = tokens.map { |t| t.to_df orig_df }.reduce(&:merge)
34
+ df
35
+ end
36
+
37
+ def fit_model
38
+ # TODO: Add support for inclusion/exclusion of intercept
39
+ @model = Statsample::Regression.multiple(
40
+ df_for_regression,
41
+ @formula.y.value,
42
+ @opts
43
+ )
44
+ end
45
+ end
46
+ end
@@ -0,0 +1,306 @@
1
+ module Statsample
2
+ # This class recognizes what terms are numeric
3
+ # and accordingly forms groups which are fed to Formula
4
+ # Once they are parsed with Formula, they are combined back
5
+ class FormulaWrapper
6
+ attr_reader :tokens, :y, :canonical_tokens
7
+
8
+ # Initializes formula wrapper object to parse a given formula into
9
+ # some tokens which do not overlap one another.
10
+ # @note Specify 0 as a term in the formula if you do not want constant
11
+ # to be included in the parsed formula
12
+ # @param [string] formula to parse
13
+ # @param [Daru::DataFrame] df dataframe required to know what vectors
14
+ # are numerical
15
+ # @example
16
+ # df = Daru::DataFrame.from_csv 'spec/data/df.csv'
17
+ # df.to_category 'c', 'd', 'e'
18
+ # formula = Statsample::GLM::FormulaWrapper.new 'y~a+d:c', df
19
+ # formula.canonical_to_s
20
+ # #=> "1+c(-)+d(-):c+a"
21
+ def initialize(formula, df)
22
+ @df = df
23
+ # @y store the LHS term that is name of vector to be predicted
24
+ # @tokens store the RHS terms of the formula
25
+ @y, *@tokens = split_to_tokens(formula)
26
+ @tokens = @tokens.uniq.sort
27
+ manage_constant_term
28
+ @canonical_tokens = non_redundant_tokens
29
+ end
30
+
31
+ # Returns canonical tokens in a readable form.
32
+ # @return [String] canonical tokens in a readable form.
33
+ # @note 'y~a+b(-)' means 'a' exist in full rank expansion
34
+ # and 'b(-)' exist in reduced rank expansion
35
+ # @example
36
+ # df = Daru::DataFrame.from_csv 'spec/data/df.csv'
37
+ # df.to_category 'c', 'd', 'e'
38
+ # formula = Statsample::GLM::FormulaWrapper.new 'y~a+d:c', df
39
+ # formula.canonical_to_s
40
+ # #=> "1+c(-)+d(-):c+a"
41
+ def canonical_to_s
42
+ canonical_tokens.join '+'
43
+ end
44
+
45
+ # Returns tokens to produce non-redundant design matrix
46
+ # @return [Array] array of tokens that do not produce redundant matrix
47
+ def non_redundant_tokens
48
+ groups = split_to_groups
49
+ # TODO: An enhancement
50
+ # Right now x:c appears as c:x
51
+ groups.each { |k, v| groups[k] = strip_numeric v, k }
52
+ groups.each { |k, v| groups[k] = Formula.new(v).canonical_tokens }
53
+ groups.flat_map { |k, v| add_numeric v, k }
54
+ end
55
+
56
+ private
57
+
58
+ # Removes intercept token if term '0' is found in the formula.
59
+ # Intercept token remains if term '1' is found.
60
+ # If neither term '0' nor term '1' is found, then an intercept token is added.
61
+ def manage_constant_term
62
+ @tokens.unshift Token.new('1') unless
63
+ @tokens.include?(Token.new('1')) ||
64
+ @tokens.include?(Token.new('0'))
65
+ @tokens.delete Token.new('0')
66
+ end
67
+
68
+ # Groups the tokens into groups based on the numerical terms
69
+ # they are interacting with.
70
+ def split_to_groups
71
+ @tokens.group_by { |t| extract_numeric t }
72
+ end
73
+
74
+ # Add numeric interaction term which was removed earlier
75
+ # @param [Array] tokens tokens on which to add numerical terms
76
+ # @param [Array] numeric array of numeric terms to add
77
+ def add_numeric(tokens, numeric)
78
+ tokens.map do |t|
79
+ terms = t.interact_terms + numeric
80
+ if terms == ['1']
81
+ Token.new('1')
82
+ else
83
+ terms = terms.reject { |i| i == '1' }
84
+ Token.new terms.join(':'), t.full
85
+ end
86
+ end
87
+ end
88
+
89
+ # Strip numerical interacting terms
90
+ # @param [Array] tokens tokens from which to strip numeric
91
+ # @param [Array] numeric array of numeric terms to strip from tokens
92
+ # @return [Array] array of tokens with stripped numerical terms
93
+ def strip_numeric(tokens, numeric)
94
+ tokens.map do |t|
95
+ terms = t.interact_terms - numeric
96
+ terms = ['1'] if terms.empty?
97
+ Token.new terms.join(':')
98
+ end
99
+ end
100
+
101
+ # Extract numeric interacting terms
102
+ # @param [Statsample::GLM::Token] token from which to extract numeric terms
103
+ # @return [Array] array of numerical terms
104
+ def extract_numeric(token)
105
+ terms = token.interact_terms
106
+ return [] if terms == ['1']
107
+ terms.reject { |t| @df[t].category? }
108
+ end
109
+
110
+ def split_to_tokens(formula)
111
+ formula = formula.gsub(/\s+/, '')
112
+ lhs_term, rhs = formula.split '~'
113
+ rhs_terms = rhs.split '+'
114
+ ([lhs_term] + rhs_terms).map { |t| Token.new t }
115
+ end
116
+ end
117
+
118
+ # To process formula language
119
+ class Formula
120
+ attr_reader :tokens, :canonical_tokens
121
+
122
+ def initialize(tokens)
123
+ @tokens = tokens
124
+ @canonical_tokens = parse_formula
125
+ end
126
+
127
+ def canonical_to_s
128
+ canonical_tokens.join '+'
129
+ end
130
+
131
+ private
132
+
133
+ def parse_formula
134
+ @tokens.inject([]) do |acc, token|
135
+ acc + add_non_redundant_elements(token, acc)
136
+ end
137
+ end
138
+
139
+ def add_non_redundant_elements(token, result_so_far)
140
+ return [token] if token.value == '1'
141
+ tokens = token.expand
142
+ result_so_far = result_so_far.flat_map(&:expand)
143
+ tokens -= result_so_far
144
+ contract_if_possible tokens
145
+ end
146
+
147
+ def contract_if_possible(tokens)
148
+ tokens.combination(2).each do |a, b|
149
+ result = a.add b
150
+ next unless result
151
+ tokens.delete a
152
+ tokens.delete b
153
+ tokens << result
154
+ return contract_if_possible tokens
155
+ end
156
+ tokens.sort
157
+ end
158
+ end
159
+
160
+ # To encapsulate interaction as well as non-interaction terms
161
+ class Token
162
+ attr_reader :value, :full, :interact_terms
163
+
164
+ def initialize(value, full = true)
165
+ @interact_terms = value.include?(':') ? value.split(':') : [value]
166
+ @full = coerce_full full
167
+ end
168
+
169
+ def value
170
+ interact_terms.join(':')
171
+ end
172
+
173
+ def size
174
+ # TODO: Return size 1 for value '1' also
175
+ # CAn't do this at the moment because have to make
176
+ # changes in sorting first
177
+ value == '1' ? 0 : interact_terms.size
178
+ end
179
+
180
+ def add(other)
181
+ # ANYTHING + FACTOR- : ANYTHING = FACTOR : ANYTHING
182
+ # ANYTHING + ANYTHING : FACTOR- = ANYTHING : FACTOR
183
+ if size > other.size
184
+ other.add self
185
+
186
+ elsif other.size == 2 &&
187
+ size == 1 &&
188
+ other.interact_terms.last == value &&
189
+ other.full.last == full.first &&
190
+ other.full.first == false
191
+ Token.new(
192
+ "#{other.interact_terms.first}:#{value}",
193
+ [true, other.full.last]
194
+ )
195
+
196
+ elsif other.size == 2 &&
197
+ size == 1 &&
198
+ other.interact_terms.first == value &&
199
+ other.full.first == full.first &&
200
+ other.full.last == false
201
+ Token.new(
202
+ "#{value}:#{other.interact_terms.last}",
203
+ [other.full.first, true]
204
+ )
205
+
206
+ elsif value == '1' &&
207
+ other.size == 1
208
+ Token.new(other.value, true)
209
+ end
210
+ end
211
+
212
+ def ==(other)
213
+ value == other.value &&
214
+ full == other.full
215
+ end
216
+
217
+ alias eql? ==
218
+
219
+ def hash
220
+ value.hash ^ full.hash
221
+ end
222
+
223
+ def <=>(other)
224
+ size <=> other.size
225
+ end
226
+
227
+ def to_s
228
+ interact_terms
229
+ .zip(full)
230
+ .map { |t, f| f ? t : t + '(-)' }
231
+ .join ':'
232
+ end
233
+
234
+ def expand
235
+ case size
236
+ when 0
237
+ [self]
238
+ when 1
239
+ [Token.new('1'), Token.new(value, false)]
240
+ when 2
241
+ a, b = interact_terms
242
+ [Token.new('1'), Token.new(a, false), Token.new(b, false),
243
+ Token.new(a + ':' + b, [false, false])]
244
+ end
245
+ end
246
+
247
+ def to_df(df)
248
+ case size
249
+ when 1
250
+ if df[value].category?
251
+ df[value].contrast_code full: full.first
252
+ else
253
+ Daru::DataFrame.new value => df[value].to_a
254
+ end
255
+ when 2
256
+ to_df_when_interaction(df)
257
+ end
258
+ end
259
+
260
+ private
261
+
262
+ def coerce_full(value)
263
+ if value.is_a? Array
264
+ value + Array.new((@interact_terms.size - value.size), true)
265
+ else
266
+ [value] * @interact_terms.size
267
+ end
268
+ end
269
+
270
+ def to_df_when_interaction(df)
271
+ case interact_terms.map { |t| df[t].category? }
272
+ when [true, true]
273
+ df.interact_code(interact_terms, full)
274
+ when [false, false]
275
+ to_df_numeric_interact_with_numeric df
276
+ when [true, false]
277
+ to_df_category_interact_with_numeric df
278
+ when [false, true]
279
+ to_df_numeric_interact_with_category df
280
+ end
281
+ end
282
+
283
+ def to_df_numeric_interact_with_numeric(df)
284
+ Daru::DataFrame.new value => (df[interact_terms.first] *
285
+ df[interact_terms.last]).to_a
286
+ end
287
+
288
+ def to_df_category_interact_with_numeric(df)
289
+ a, b = interact_terms
290
+ Daru::DataFrame.new(
291
+ df[a].contrast_code(full: full.first)
292
+ .map { |dv| ["#{dv.name}:#{b}", (dv * df[b]).to_a] }
293
+ .to_h
294
+ )
295
+ end
296
+
297
+ def to_df_numeric_interact_with_category(df)
298
+ a, b = interact_terms
299
+ Daru::DataFrame.new(
300
+ df[b].contrast_code(full: full.last)
301
+ .map { |dv| ["#{a}:#{dv.name}", (dv * df[a]).to_a] }
302
+ .to_h
303
+ )
304
+ end
305
+ end
306
+ end