statsample 2.0.2 → 2.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (41) hide show
  1. checksums.yaml +4 -4
  2. data/History.txt +7 -0
  3. data/README.md +2 -4
  4. data/Rakefile +6 -0
  5. data/benchmarks/correlation_matrix_methods/correlation_matrix.rb +0 -1
  6. data/examples/correlation_matrix.rb +0 -16
  7. data/examples/dataset.rb +0 -7
  8. data/examples/dominance_analysis_bootstrap.rb +0 -6
  9. data/examples/reliability.rb +0 -2
  10. data/lib/statsample.rb +3 -2
  11. data/lib/statsample/anova/oneway.rb +1 -1
  12. data/lib/statsample/bivariate.rb +4 -4
  13. data/lib/statsample/converter/spss.rb +1 -1
  14. data/lib/statsample/crosstab.rb +3 -3
  15. data/lib/statsample/daru.rb +1 -3
  16. data/lib/statsample/factor/parallelanalysis.rb +1 -3
  17. data/lib/statsample/formula/fit_model.rb +46 -0
  18. data/lib/statsample/formula/formula.rb +306 -0
  19. data/lib/statsample/matrix.rb +0 -2
  20. data/lib/statsample/regression.rb +1 -3
  21. data/lib/statsample/regression/multiple/alglibengine.rb +1 -1
  22. data/lib/statsample/regression/multiple/gslengine.rb +1 -1
  23. data/lib/statsample/regression/multiple/rubyengine.rb +1 -3
  24. data/lib/statsample/reliability.rb +3 -3
  25. data/lib/statsample/reliability/icc.rb +1 -2
  26. data/lib/statsample/reliability/multiscaleanalysis.rb +0 -1
  27. data/lib/statsample/reliability/scaleanalysis.rb +2 -3
  28. data/lib/statsample/reliability/skillscaleanalysis.rb +1 -1
  29. data/lib/statsample/test/levene.rb +4 -4
  30. data/lib/statsample/test/t.rb +10 -10
  31. data/lib/statsample/test/umannwhitney.rb +3 -3
  32. data/lib/statsample/version.rb +1 -1
  33. data/statsample.gemspec +4 -1
  34. data/test/fixtures/df.csv +15 -0
  35. data/test/helpers_tests.rb +7 -0
  36. data/test/test_factor.rb +0 -5
  37. data/test/test_factor_pa.rb +1 -6
  38. data/test/test_fit_model.rb +88 -0
  39. data/test/test_reliability.rb +0 -10
  40. data/test/test_statistics.rb +1 -1
  41. metadata +52 -48
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: ee62b72f947f9760824885a479c92ce6dbc55127
4
- data.tar.gz: 7cb2c7057856eee78f69be2f7c1e43671cc8007e
3
+ metadata.gz: e2a80fff135f963dcabfe1593de243611eb8ab9a
4
+ data.tar.gz: cb2d80e85339201f8a37ea1b8e934953f26b5591
5
5
  SHA512:
6
- metadata.gz: 26b9d9aab40c4dc700fc4632a30cd195f5f0f0dfe6ab36a84d055f5d0e22ba7992c143ac2507069c02e3ec6e3b02b31f20ae1bf4c1f47a8846339c6d0e0b67b6
7
- data.tar.gz: 8eeac7c1f6aca3ed959ff15cfd97407876d46dfb8d9adf13aa892831772b02209142d73a46ac1581e848cf020b86cfccb93e21bd9f450da106d08ed755d5bd1b
6
+ metadata.gz: b9412e202d3364c6fe7f982a9ceb2828061312ffd0274c7bc25e8b5747abd4b11aca5edd960e22fb6ee8cfec409a6352961c51e283d9523fb608e5d66bf65377
7
+ data.tar.gz: 5db058e78ae638c155727ca51af3e303503bfd42b518de095aa93e9639194c9a273a8b4e3ae3e069d91e25500612beba48140f2e7024e2c329ba16adeca54bef
@@ -1,3 +1,10 @@
1
+ === 2.1.0 / 2017-08-10
2
+ * Update documentation to reflect methods that have been removed (@lokeshh)
3
+ * Update daru dependency to v0.1.6 (@lokeshh)
4
+ * Remove pre-daru legacy methods like n_valid, missing value functions (@lokeshh)
5
+ * Update test suite with rubocop and rake. New tests for methods like Regression (@lokeshh)
6
+ * Introduce fitting a regression using string formulas (@lokeshh)
7
+
1
8
  === 2.0.2 / 2016-03-11
2
9
  * Update dependencies (spreadsheet, GSL)
3
10
 
data/README.md CHANGED
@@ -11,7 +11,7 @@ Homepage :: https://github.com/sciruby/statsample
11
11
  You should have a recent version of GSL and R (with the `irr` and `Rserve` libraries) installed. In Ubuntu:
12
12
 
13
13
  ```bash
14
- $ sudo apt-get install libgs10-dev r-base r-base-dev
14
+ $ sudo apt-get install libgsl0-dev r-base r-base-dev
15
15
  $ sudo Rscript -e "install.packages(c('Rserve', 'irr'))"
16
16
  ```
17
17
 
@@ -86,7 +86,7 @@ Include:
86
86
  - Intra-class correlation
87
87
  - Anova: generic and vector-based One-way ANOVA and Two-way ANOVA, with contrasts for One-way ANOVA.
88
88
  - Tests: F, T, Levene, U-Mannwhitney.
89
- - Regression: Simple, Multiple (OLS), Probit and Logit
89
+ - Regression: Simple, Multiple (OLS)
90
90
  - Factorial Analysis: Extraction (PCA and Principal Axis), Rotation (Varimax, Equimax, Quartimax) and Parallel Analysis and Velicer's MAP test, for estimation of number of factors.
91
91
  - Reliability analysis for simple scale and a DSL to easily analyze multiple scales using factor analysis and correlations, if you want it.
92
92
  - Basic time series support
@@ -120,8 +120,6 @@ Include:
120
120
  - Multiple types of regression.
121
121
  - Simple Regression : Statsample::Regression::Simple
122
122
  - Multiple Regression: Statsample::Regression::Multiple
123
- - Logit Regression: Statsample::Regression::Binomial::Logit
124
- - Probit Regression: Statsample::Regression::Binomial::Probit
125
123
  - Factorial Analysis algorithms on Statsample::Factor module.
126
124
  - Classes for Extraction of factors:
127
125
  - Statsample::Factor::PCA
data/Rakefile CHANGED
@@ -1,4 +1,5 @@
1
1
  $:.unshift File.expand_path("../lib/", __FILE__)
2
+ lib_folder = File.expand_path("../lib", __FILE__)
2
3
 
3
4
  require 'statsample/version'
4
5
  require 'rake'
@@ -36,3 +37,8 @@ task "gettext:makemo" do
36
37
  require 'gettext/tools'
37
38
  GetText.create_mofiles()
38
39
  end
40
+
41
+ desc 'Run pry'
42
+ task :pry do |task|
43
+ sh "pry -r #{lib_folder}/statsample.rb"
44
+ end
@@ -60,7 +60,6 @@ end
60
60
 
61
61
  rs[:c_v] = rs.collect {|row| row[:cases]*row[:vars]}
62
62
 
63
- rs.update
64
63
  rs.save("correlation_matrix.ds")
65
64
  Statsample::Excel.write(rs,"correlation_matrix.xls")
66
65
 
@@ -7,18 +7,6 @@ $:.unshift(File.dirname(__FILE__)+'/../lib/')
7
7
 
8
8
  require 'statsample'
9
9
  Statsample::Analysis.store("Statsample::Bivariate.correlation_matrix") do
10
- # It so happens that Daru::Vector and Daru::DataFrame must update metadata
11
- # like positions of missing values every time they are created.
12
- #
13
- # Since we dont have any missing values in the data that we are creating,
14
- # we set Daru.lazy_update = true so that missing data is not updated every
15
- # time and things happen much faster.
16
- #
17
- # In case you do have missing data and lazy_update has been set to *true*,
18
- # you _SHOULD_ called `#update` on the concerned Vector or DataFrame object
19
- # everytime an assingment or deletion cycle is complete.
20
- Daru.lazy_update = true
21
-
22
10
  # Create a Daru::DataFrame containing 4 vectors a, b, c and d.
23
11
  #
24
12
  # Notice that the `clone` option has been set to *false*. This tells Daru
@@ -36,10 +24,6 @@ Statsample::Analysis.store("Statsample::Bivariate.correlation_matrix") do
36
24
  # Calculate correlation matrix by calling the `cor` shorthand.
37
25
  cm = cor(ds)
38
26
  summary(cm)
39
-
40
- # Set lazy_update to *false* once our job is done so that this analysis does
41
- # not accidentally affect code elsewhere.
42
- Daru.lazy_update = false
43
27
  end
44
28
 
45
29
  if __FILE__==$0
@@ -6,10 +6,6 @@ $:.unshift(File.dirname(__FILE__)+'/../lib/')
6
6
  require 'statsample'
7
7
 
8
8
  Statsample::Analysis.store(Daru::DataFrame) do
9
- # We set lazy_update to *true* so that time is not wasted in updating
10
- # metdata every time an assignment happens.
11
- Daru.lazy_update = true
12
-
13
9
  samples = 1000
14
10
 
15
11
  # The 'new_with_size' function lets you specify the size of the
@@ -26,9 +22,6 @@ Statsample::Analysis.store(Daru::DataFrame) do
26
22
  # order by default.
27
23
  ds = Daru::DataFrame.new({:a=>a,:b=>b}, order: [:b, :a])
28
24
  summary(ds)
29
-
30
- # Reset lazy_update to *false* to prevent other code from breaking.
31
- Daru.lazy_update = false
32
25
  end
33
26
 
34
27
  if __FILE__==$0
@@ -3,10 +3,6 @@ $:.unshift(File.dirname(__FILE__)+'/../lib/')
3
3
  require 'statsample'
4
4
 
5
5
  Statsample::Analysis.store(Statsample::DominanceAnalysis::Bootstrap) do
6
- # Remember to call *update* after an assignment/deletion cycle if lazy_update
7
- # is *false*.
8
- Daru.lazy_update = true
9
-
10
6
  sample=300
11
7
  a=rnorm(sample)
12
8
  b=rnorm(sample)
@@ -29,8 +25,6 @@ Statsample::Analysis.store(Statsample::DominanceAnalysis::Bootstrap) do
29
25
  dab2=dominance_analysis_bootstrap(ds2, :y1, :debug=>true)
30
26
  dab2.bootstrap(100,nil)
31
27
  summary(dab2)
32
-
33
- Daru.lazy_update = false
34
28
  end
35
29
 
36
30
  if __FILE__==$0
@@ -15,8 +15,6 @@ Statsample::Analysis.store(Statsample::Reliability) do
15
15
  ds["v#{i}".to_sym]= a + rnorm(samples,0,0.2)
16
16
  end
17
17
 
18
- ds.update
19
-
20
18
  rel=Statsample::Reliability::ScaleAnalysis.new(ds)
21
19
  summary rel
22
20
 
@@ -160,6 +160,7 @@ module Statsample
160
160
  autoload(:StratifiedSample, 'statsample/multiset')
161
161
  autoload(:MLE, 'statsample/mle')
162
162
  autoload(:Regression, 'statsample/regression')
163
+ autoload(:FitModel, 'statsample/formula/fit_model')
163
164
  autoload(:Test, 'statsample/test')
164
165
  autoload(:Factor, 'statsample/factor')
165
166
  autoload(:Graph, 'statsample/graph')
@@ -206,7 +207,7 @@ module Statsample
206
207
  def only_valid(*vs)
207
208
  i = 1
208
209
  h = vs.inject({}) { |acc, v| acc["v#{i}".to_sym] = v; i += 1; acc }
209
- df = Daru::DataFrame.new(h).dup_only_valid
210
+ df = Daru::DataFrame.new(h).reject_values(*Daru::MISSING_VALUES)
210
211
  df.map { |v| v }
211
212
  end
212
213
 
@@ -214,7 +215,7 @@ module Statsample
214
215
  # If any vectors have missing_values, return only valid.
215
216
  # If not, return the vectors itself
216
217
  def only_valid_clone(*vs)
217
- if vs.any?(&:has_missing_data?)
218
+ if vs.any? { |v| v.include_values?(*Daru::MISSING_VALUES) }
218
219
  only_valid(*vs)
219
220
  else
220
221
  vs
@@ -164,7 +164,7 @@ module Statsample
164
164
  if summary_descriptives
165
165
  s.table(:name=>_("Descriptives"),:header=>%w{Name N Mean SD Min Max}.map {|v| _(v)}) do |t|
166
166
  @vectors.each do |v|
167
- t.row [v.name, v.n_valid, "%0.4f" % v.mean, "%0.4f" % v.sd, "%0.4f" % v.min, "%0.4f" % v.max]
167
+ t.row [v.name, v.reject_values(*Daru::MISSING_VALUES).size, "%0.4f" % v.mean, "%0.4f" % v.sd, "%0.4f" % v.min, "%0.4f" % v.max]
168
168
  end
169
169
  end
170
170
  end
@@ -159,7 +159,7 @@ module Statsample
159
159
 
160
160
  def covariance_matrix(ds)
161
161
  vars,cases = ds.ncols, ds.nrows
162
- if !ds.has_missing_data? and Statsample.has_gsl? and prediction_optimized(vars,cases) < prediction_pairwise(vars,cases)
162
+ if !ds.include_values?(*Daru::MISSING_VALUES) and Statsample.has_gsl? and prediction_optimized(vars,cases) < prediction_pairwise(vars,cases)
163
163
  cm=covariance_matrix_optimized(ds)
164
164
  else
165
165
  cm=covariance_matrix_pairwise(ds)
@@ -198,7 +198,7 @@ module Statsample
198
198
  # Order of rows and columns depends on Dataset#fields order
199
199
  def correlation_matrix(ds)
200
200
  vars, cases = ds.ncols, ds.nrows
201
- if !ds.has_missing_data? and Statsample.has_gsl? and prediction_optimized(vars,cases) < prediction_pairwise(vars,cases)
201
+ if !ds.include_values?(*Daru::MISSING_VALUES) and Statsample.has_gsl? and prediction_optimized(vars,cases) < prediction_pairwise(vars,cases)
202
202
  cm=correlation_matrix_optimized(ds)
203
203
  else
204
204
  cm=correlation_matrix_pairwise(ds)
@@ -248,7 +248,7 @@ module Statsample
248
248
  m = vectors.collect do |row|
249
249
  vectors.collect do |col|
250
250
  if row==col
251
- ds[row].only_valid.size
251
+ ds[row].reject_values(*Daru::MISSING_VALUES).size
252
252
  else
253
253
  rowa,rowb = Statsample.only_valid_clone(ds[row],ds[col])
254
254
  rowa.size
@@ -281,7 +281,7 @@ module Statsample
281
281
  # Calculate Point biserial correlation. Equal to Pearson correlation, with
282
282
  # one dichotomous value replaced by "0" and the other by "1"
283
283
  def point_biserial(dichotomous,continous)
284
- ds = Daru::DataFrame.new({:d=>dichotomous,:c=>continous}).dup_only_valid
284
+ ds = Daru::DataFrame.new({:d=>dichotomous,:c=>continous}).reject_values(*Daru::MISSING_VALUES)
285
285
  raise(TypeError, "First vector should be dichotomous") if ds[:d].factors.size != 2
286
286
  raise(TypeError, "Second vector should be continous") if ds[:c].type != :numeric
287
287
  f0=ds[:d].factors.sort.to_a[0]
@@ -7,7 +7,7 @@ module Statsample
7
7
  # ds=Daru::DataFrame.from_excel("my_data.xls")
8
8
  # puts Statsample::SPSS.tetrachoric_correlation_matrix(ds)
9
9
  def tetrachoric_correlation_matrix(ds)
10
- dsv=ds.dup_only_valid
10
+ dsv=ds.reject_values(*Daru::MISSING_VALUES)
11
11
  # Delete all vectors doesn't have variation
12
12
  dsv.vectors.each { |f|
13
13
  if dsv[f].factors.size==1
@@ -29,10 +29,10 @@ module Statsample
29
29
  @v_cols.factors.sort.reset_index!
30
30
  end
31
31
  def rows_total
32
- @v_rows.frequencies
32
+ @v_rows.frequencies.to_h
33
33
  end
34
34
  def cols_total
35
- @v_cols.frequencies
35
+ @v_cols.frequencies.to_h
36
36
  end
37
37
 
38
38
  def frequencies
@@ -42,7 +42,7 @@ module Statsample
42
42
  s[par]=0
43
43
  s
44
44
  end
45
- base.update(Daru::Vector.new(Statsample::vector_cols_matrix(@v_rows,@v_cols).to_a).frequencies)
45
+ base.update(Daru::Vector.new(Statsample::vector_cols_matrix(@v_rows,@v_cols).to_a).frequencies.to_h)
46
46
  end
47
47
  def to_matrix
48
48
  f = frequencies
@@ -11,7 +11,7 @@ module Daru
11
11
  # ugly patch. The upper limit for a bin has the form
12
12
  # x < range
13
13
  #h=Statsample::Histogram.new(self, bins)
14
- valid = only_valid
14
+ valid = reject_values(*Daru::MISSING_VALUES)
15
15
  min,max=Statsample::Util.nice(valid.min,valid.max)
16
16
  # fix last data
17
17
  if max == valid.max
@@ -72,7 +72,6 @@ module Daru
72
72
  end
73
73
  #puts "Ingreso a los dataset"
74
74
  ms.datasets.each do |k,ds|
75
- ds.update
76
75
  ds.rename self[field].index_of(k)
77
76
  end
78
77
 
@@ -102,7 +101,6 @@ module Daru
102
101
  each_row { |r| p1.call(r) }
103
102
 
104
103
  ms.datasets.each do |k,ds|
105
- ds.update
106
104
  ds.rename(
107
105
  fields.size.times.map do |i|
108
106
  f = fields[i]
@@ -142,8 +142,7 @@ module Statsample
142
142
  raise "bootstrap_method doesn't recogniced"
143
143
  end
144
144
  end
145
- ds_bootstrap.update
146
-
145
+
147
146
  matrix=Statsample::Bivariate.send(matrix_method, ds_bootstrap)
148
147
  matrix=matrix.to_gsl if @use_gsl
149
148
  if smc
@@ -159,7 +158,6 @@ module Statsample
159
158
  redo
160
159
  end
161
160
  end
162
- @ds_eigenvalues.update
163
161
  end
164
162
  dirty_memoize :number_of_factors, :ds_eigenvalues
165
163
  dirty_writer :iterations, :bootstrap_method, :percentil, :smc
@@ -0,0 +1,46 @@
1
+ require 'statsample/formula/formula'
2
+
3
+ module Statsample
4
+ # Class for performing regression
5
+ class FitModel
6
+ def initialize(formula, df, opts = {})
7
+ @formula = FormulaWrapper.new formula, df
8
+ @df = df
9
+ @opts = opts
10
+ end
11
+
12
+ def model
13
+ @model || fit_model
14
+ end
15
+
16
+ def predict(new_data)
17
+ model.predict(df_for_prediction(new_data))
18
+ end
19
+
20
+ def df_for_prediction df
21
+ canonicalize_df(df)
22
+ end
23
+
24
+ def df_for_regression
25
+ df = canonicalize_df(@df)
26
+ df[@formula.y.value] = @df[@formula.y.value]
27
+ df
28
+ end
29
+
30
+ def canonicalize_df(orig_df)
31
+ tokens = @formula.canonical_tokens
32
+ tokens.shift if tokens.first.value == '1'
33
+ df = tokens.map { |t| t.to_df orig_df }.reduce(&:merge)
34
+ df
35
+ end
36
+
37
+ def fit_model
38
+ # TODO: Add support for inclusion/exclusion of intercept
39
+ @model = Statsample::Regression.multiple(
40
+ df_for_regression,
41
+ @formula.y.value,
42
+ @opts
43
+ )
44
+ end
45
+ end
46
+ end
@@ -0,0 +1,306 @@
1
+ module Statsample
2
+ # This class recognizes what terms are numeric
3
+ # and accordingly forms groups which are fed to Formula
4
+ # Once they are parsed with Formula, they are combined back
5
+ class FormulaWrapper
6
+ attr_reader :tokens, :y, :canonical_tokens
7
+
8
+ # Initializes formula wrapper object to parse a given formula into
9
+ # some tokens which do not overlap one another.
10
+ # @note Specify 0 as a term in the formula if you do not want constant
11
+ # to be included in the parsed formula
12
+ # @param [string] formula to parse
13
+ # @param [Daru::DataFrame] df dataframe required to know what vectors
14
+ # are numerical
15
+ # @example
16
+ # df = Daru::DataFrame.from_csv 'spec/data/df.csv'
17
+ # df.to_category 'c', 'd', 'e'
18
+ # formula = Statsample::GLM::FormulaWrapper.new 'y~a+d:c', df
19
+ # formula.canonical_to_s
20
+ # #=> "1+c(-)+d(-):c+a"
21
+ def initialize(formula, df)
22
+ @df = df
23
+ # @y store the LHS term that is name of vector to be predicted
24
+ # @tokens store the RHS terms of the formula
25
+ @y, *@tokens = split_to_tokens(formula)
26
+ @tokens = @tokens.uniq.sort
27
+ manage_constant_term
28
+ @canonical_tokens = non_redundant_tokens
29
+ end
30
+
31
+ # Returns canonical tokens in a readable form.
32
+ # @return [String] canonical tokens in a readable form.
33
+ # @note 'y~a+b(-)' means 'a' exist in full rank expansion
34
+ # and 'b(-)' exist in reduced rank expansion
35
+ # @example
36
+ # df = Daru::DataFrame.from_csv 'spec/data/df.csv'
37
+ # df.to_category 'c', 'd', 'e'
38
+ # formula = Statsample::GLM::FormulaWrapper.new 'y~a+d:c', df
39
+ # formula.canonical_to_s
40
+ # #=> "1+c(-)+d(-):c+a"
41
+ def canonical_to_s
42
+ canonical_tokens.join '+'
43
+ end
44
+
45
+ # Returns tokens to produce non-redundant design matrix
46
+ # @return [Array] array of tokens that do not produce redundant matrix
47
+ def non_redundant_tokens
48
+ groups = split_to_groups
49
+ # TODO: An enhancement
50
+ # Right now x:c appears as c:x
51
+ groups.each { |k, v| groups[k] = strip_numeric v, k }
52
+ groups.each { |k, v| groups[k] = Formula.new(v).canonical_tokens }
53
+ groups.flat_map { |k, v| add_numeric v, k }
54
+ end
55
+
56
+ private
57
+
58
+ # Removes intercept token if term '0' is found in the formula.
59
+ # Intercept token remains if term '1' is found.
60
+ # If neither term '0' nor term '1' is found then, intercept token is added.
61
+ def manage_constant_term
62
+ @tokens.unshift Token.new('1') unless
63
+ @tokens.include?(Token.new('1')) ||
64
+ @tokens.include?(Token.new('0'))
65
+ @tokens.delete Token.new('0')
66
+ end
67
+
68
+ # Groups the tokens into groups based on the numerical terms
69
+ # they are interacting with.
70
+ def split_to_groups
71
+ @tokens.group_by { |t| extract_numeric t }
72
+ end
73
+
74
+ # Add numeric interaction term which was removed earlier
75
+ # @param [Array] tokens tokens on which to add numerical terms
76
+ # @param [Array] numeric array of numeric terms to add
77
+ def add_numeric(tokens, numeric)
78
+ tokens.map do |t|
79
+ terms = t.interact_terms + numeric
80
+ if terms == ['1']
81
+ Token.new('1')
82
+ else
83
+ terms = terms.reject { |i| i == '1' }
84
+ Token.new terms.join(':'), t.full
85
+ end
86
+ end
87
+ end
88
+
89
+ # Strip numerical interacting terms
90
+ # @param [Array] tokens tokens from which to strip numeric
91
+ # @param [Array] numeric array of numeric terms to strip from tokens
92
+ # @return [Array] array of tokens with striped numerical terms
93
+ def strip_numeric(tokens, numeric)
94
+ tokens.map do |t|
95
+ terms = t.interact_terms - numeric
96
+ terms = ['1'] if terms.empty?
97
+ Token.new terms.join(':')
98
+ end
99
+ end
100
+
101
+ # Extract numeric interacting terms
102
+ # @param [Statsample::GLM::Token] token from which to extract numeric terms
103
+ # @return [Array] array of numerical terms
104
+ def extract_numeric(token)
105
+ terms = token.interact_terms
106
+ return [] if terms == ['1']
107
+ terms.reject { |t| @df[t].category? }
108
+ end
109
+
110
+ def split_to_tokens(formula)
111
+ formula = formula.gsub(/\s+/, '')
112
+ lhs_term, rhs = formula.split '~'
113
+ rhs_terms = rhs.split '+'
114
+ ([lhs_term] + rhs_terms).map { |t| Token.new t }
115
+ end
116
+ end
117
+
118
+ # To process formula language
119
+ class Formula
120
+ attr_reader :tokens, :canonical_tokens
121
+
122
+ def initialize(tokens)
123
+ @tokens = tokens
124
+ @canonical_tokens = parse_formula
125
+ end
126
+
127
+ def canonical_to_s
128
+ canonical_tokens.join '+'
129
+ end
130
+
131
+ private
132
+
133
+ def parse_formula
134
+ @tokens.inject([]) do |acc, token|
135
+ acc + add_non_redundant_elements(token, acc)
136
+ end
137
+ end
138
+
139
+ def add_non_redundant_elements(token, result_so_far)
140
+ return [token] if token.value == '1'
141
+ tokens = token.expand
142
+ result_so_far = result_so_far.flat_map(&:expand)
143
+ tokens -= result_so_far
144
+ contract_if_possible tokens
145
+ end
146
+
147
+ def contract_if_possible(tokens)
148
+ tokens.combination(2).each do |a, b|
149
+ result = a.add b
150
+ next unless result
151
+ tokens.delete a
152
+ tokens.delete b
153
+ tokens << result
154
+ return contract_if_possible tokens
155
+ end
156
+ tokens.sort
157
+ end
158
+ end
159
+
160
+ # To encapsulate interaction as well as non-interaction terms
161
+ class Token
162
+ attr_reader :value, :full, :interact_terms
163
+
164
+ def initialize(value, full = true)
165
+ @interact_terms = value.include?(':') ? value.split(':') : [value]
166
+ @full = coerce_full full
167
+ end
168
+
169
+ def value
170
+ interact_terms.join(':')
171
+ end
172
+
173
+ def size
174
+ # TODO: Return size 1 for value '1' also
175
+ # Can't do this at the moment because we have to make
176
+ # changes in sorting first
177
+ value == '1' ? 0 : interact_terms.size
178
+ end
179
+
180
+ def add(other)
181
+ # ANYTHING + FACTOR- : ANYTHING = FACTOR : ANYTHING
182
+ # ANYTHING + ANYTHING : FACTOR- = ANYTHING : FACTOR
183
+ if size > other.size
184
+ other.add self
185
+
186
+ elsif other.size == 2 &&
187
+ size == 1 &&
188
+ other.interact_terms.last == value &&
189
+ other.full.last == full.first &&
190
+ other.full.first == false
191
+ Token.new(
192
+ "#{other.interact_terms.first}:#{value}",
193
+ [true, other.full.last]
194
+ )
195
+
196
+ elsif other.size == 2 &&
197
+ size == 1 &&
198
+ other.interact_terms.first == value &&
199
+ other.full.first == full.first &&
200
+ other.full.last == false
201
+ Token.new(
202
+ "#{value}:#{other.interact_terms.last}",
203
+ [other.full.first, true]
204
+ )
205
+
206
+ elsif value == '1' &&
207
+ other.size == 1
208
+ Token.new(other.value, true)
209
+ end
210
+ end
211
+
212
+ def ==(other)
213
+ value == other.value &&
214
+ full == other.full
215
+ end
216
+
217
+ alias eql? ==
218
+
219
+ def hash
220
+ value.hash ^ full.hash
221
+ end
222
+
223
+ def <=>(other)
224
+ size <=> other.size
225
+ end
226
+
227
+ def to_s
228
+ interact_terms
229
+ .zip(full)
230
+ .map { |t, f| f ? t : t + '(-)' }
231
+ .join ':'
232
+ end
233
+
234
+ def expand
235
+ case size
236
+ when 0
237
+ [self]
238
+ when 1
239
+ [Token.new('1'), Token.new(value, false)]
240
+ when 2
241
+ a, b = interact_terms
242
+ [Token.new('1'), Token.new(a, false), Token.new(b, false),
243
+ Token.new(a + ':' + b, [false, false])]
244
+ end
245
+ end
246
+
247
+ def to_df(df)
248
+ case size
249
+ when 1
250
+ if df[value].category?
251
+ df[value].contrast_code full: full.first
252
+ else
253
+ Daru::DataFrame.new value => df[value].to_a
254
+ end
255
+ when 2
256
+ to_df_when_interaction(df)
257
+ end
258
+ end
259
+
260
+ private
261
+
262
+ def coerce_full(value)
263
+ if value.is_a? Array
264
+ value + Array.new((@interact_terms.size - value.size), true)
265
+ else
266
+ [value] * @interact_terms.size
267
+ end
268
+ end
269
+
270
+ def to_df_when_interaction(df)
271
+ case interact_terms.map { |t| df[t].category? }
272
+ when [true, true]
273
+ df.interact_code(interact_terms, full)
274
+ when [false, false]
275
+ to_df_numeric_interact_with_numeric df
276
+ when [true, false]
277
+ to_df_category_interact_with_numeric df
278
+ when [false, true]
279
+ to_df_numeric_interact_with_category df
280
+ end
281
+ end
282
+
283
+ def to_df_numeric_interact_with_numeric(df)
284
+ Daru::DataFrame.new value => (df[interact_terms.first] *
285
+ df[interact_terms.last]).to_a
286
+ end
287
+
288
+ def to_df_category_interact_with_numeric(df)
289
+ a, b = interact_terms
290
+ Daru::DataFrame.new(
291
+ df[a].contrast_code(full: full.first)
292
+ .map { |dv| ["#{dv.name}:#{b}", (dv * df[b]).to_a] }
293
+ .to_h
294
+ )
295
+ end
296
+
297
+ def to_df_numeric_interact_with_category(df)
298
+ a, b = interact_terms
299
+ Daru::DataFrame.new(
300
+ df[b].contrast_code(full: full.last)
301
+ .map { |dv| ["#{a}:#{dv.name}", (dv * df[a]).to_a] }
302
+ .to_h
303
+ )
304
+ end
305
+ end
306
+ end