statsample 2.0.2 → 2.1.0
This diff compares the contents of two publicly released versions of the package as they appear in their respective public registries, and is provided for informational purposes only.
- checksums.yaml +4 -4
- data/History.txt +7 -0
- data/README.md +2 -4
- data/Rakefile +6 -0
- data/benchmarks/correlation_matrix_methods/correlation_matrix.rb +0 -1
- data/examples/correlation_matrix.rb +0 -16
- data/examples/dataset.rb +0 -7
- data/examples/dominance_analysis_bootstrap.rb +0 -6
- data/examples/reliability.rb +0 -2
- data/lib/statsample.rb +3 -2
- data/lib/statsample/anova/oneway.rb +1 -1
- data/lib/statsample/bivariate.rb +4 -4
- data/lib/statsample/converter/spss.rb +1 -1
- data/lib/statsample/crosstab.rb +3 -3
- data/lib/statsample/daru.rb +1 -3
- data/lib/statsample/factor/parallelanalysis.rb +1 -3
- data/lib/statsample/formula/fit_model.rb +46 -0
- data/lib/statsample/formula/formula.rb +306 -0
- data/lib/statsample/matrix.rb +0 -2
- data/lib/statsample/regression.rb +1 -3
- data/lib/statsample/regression/multiple/alglibengine.rb +1 -1
- data/lib/statsample/regression/multiple/gslengine.rb +1 -1
- data/lib/statsample/regression/multiple/rubyengine.rb +1 -3
- data/lib/statsample/reliability.rb +3 -3
- data/lib/statsample/reliability/icc.rb +1 -2
- data/lib/statsample/reliability/multiscaleanalysis.rb +0 -1
- data/lib/statsample/reliability/scaleanalysis.rb +2 -3
- data/lib/statsample/reliability/skillscaleanalysis.rb +1 -1
- data/lib/statsample/test/levene.rb +4 -4
- data/lib/statsample/test/t.rb +10 -10
- data/lib/statsample/test/umannwhitney.rb +3 -3
- data/lib/statsample/version.rb +1 -1
- data/statsample.gemspec +4 -1
- data/test/fixtures/df.csv +15 -0
- data/test/helpers_tests.rb +7 -0
- data/test/test_factor.rb +0 -5
- data/test/test_factor_pa.rb +1 -6
- data/test/test_fit_model.rb +88 -0
- data/test/test_reliability.rb +0 -10
- data/test/test_statistics.rb +1 -1
- metadata +52 -48
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: e2a80fff135f963dcabfe1593de243611eb8ab9a
+  data.tar.gz: cb2d80e85339201f8a37ea1b8e934953f26b5591
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: b9412e202d3364c6fe7f982a9ceb2828061312ffd0274c7bc25e8b5747abd4b11aca5edd960e22fb6ee8cfec409a6352961c51e283d9523fb608e5d66bf65377
+  data.tar.gz: 5db058e78ae638c155727ca51af3e303503bfd42b518de095aa93e9639194c9a273a8b4e3ae3e069d91e25500612beba48140f2e7024e2c329ba16adeca54bef
data/History.txt
CHANGED
@@ -1,3 +1,10 @@
+=== 2.1.0 / 2017-08-10
+* Update documentation to reflect methods that have been removed (@lokeshh)
+* Update daru dependency to v0.1.6 (@lokeshh)
+* Remove pre-daru legacy methods like n_valid, missing value functions (@lokeshh)
+* Update test suite with rubocop and rake. New tests for methods like Regression (@lokeshh)
+* Introduce fitting a regression using string formulas (@lokeshh)
+
 === 2.0.2 / 2016-03-11
 * Update dependencies (spreadsheet, GSL)
 
data/README.md
CHANGED
@@ -11,7 +11,7 @@ Homepage :: https://github.com/sciruby/statsample
 You should have a recent version of GSL and R (with the `irr` and `Rserve` libraries) installed. In Ubuntu:
 
 ```bash
-$ sudo apt-get install
+$ sudo apt-get install libgsl0-dev r-base r-base-dev
 $ sudo Rscript -e "install.packages(c('Rserve', 'irr'))"
 ```
 
@@ -86,7 +86,7 @@ Include:
 - Intra-class correlation
 - Anova: generic and vector-based One-way ANOVA and Two-way ANOVA, with contrasts for One-way ANOVA.
 - Tests: F, T, Levene, U-Mannwhitney.
-- Regression: Simple, Multiple (OLS)
+- Regression: Simple, Multiple (OLS)
 - Factorial Analysis: Extraction (PCA and Principal Axis), Rotation (Varimax, Equimax, Quartimax) and Parallel Analysis and Velicer's MAP test, for estimation of number of factors.
 - Reliability analysis for simple scale and a DSL to easily analyze multiple scales using factor analysis and correlations, if you want it.
 - Basic time series support
@@ -120,8 +120,6 @@ Include:
 - Multiple types of regression.
   - Simple Regression : Statsample::Regression::Simple
   - Multiple Regression: Statsample::Regression::Multiple
-  - Logit Regression: Statsample::Regression::Binomial::Logit
-  - Probit Regression: Statsample::Regression::Binomial::Probit
 - Factorial Analysis algorithms on Statsample::Factor module.
   - Classes for Extraction of factors:
     - Statsample::Factor::PCA
data/Rakefile
CHANGED
@@ -1,4 +1,5 @@
 $:.unshift File.expand_path("../lib/", __FILE__)
+lib_folder = File.expand_path("../lib", __FILE__)
 
 require 'statsample/version'
 require 'rake'
@@ -36,3 +37,8 @@ task "gettext:makemo" do
   require 'gettext/tools'
   GetText.create_mofiles()
 end
+
+desc 'Run pry'
+task :pry do |task|
+  sh "pry -r #{lib_folder}/statsample.rb"
+end
data/examples/correlation_matrix.rb
CHANGED
@@ -7,18 +7,6 @@ $:.unshift(File.dirname(__FILE__)+'/../lib/')
 
 require 'statsample'
 Statsample::Analysis.store("Statsample::Bivariate.correlation_matrix") do
-  # It so happens that Daru::Vector and Daru::DataFrame must update metadata
-  # like positions of missing values every time they are created.
-  #
-  # Since we dont have any missing values in the data that we are creating,
-  # we set Daru.lazy_update = true so that missing data is not updated every
-  # time and things happen much faster.
-  #
-  # In case you do have missing data and lazy_update has been set to *true*,
-  # you _SHOULD_ called `#update` on the concerned Vector or DataFrame object
-  # everytime an assingment or deletion cycle is complete.
-  Daru.lazy_update = true
-
   # Create a Daru::DataFrame containing 4 vectors a, b, c and d.
   #
   # Notice that the `clone` option has been set to *false*. This tells Daru
@@ -36,10 +24,6 @@ Statsample::Analysis.store("Statsample::Bivariate.correlation_matrix") do
   # Calculate correlation matrix by calling the `cor` shorthand.
   cm = cor(ds)
   summary(cm)
-
-  # Set lazy_update to *false* once our job is done so that this analysis does
-  # not accidentally affect code elsewhere.
-  Daru.lazy_update = false
 end
 
 if __FILE__==$0
data/examples/dataset.rb
CHANGED
@@ -6,10 +6,6 @@ $:.unshift(File.dirname(__FILE__)+'/../lib/')
 require 'statsample'
 
 Statsample::Analysis.store(Daru::DataFrame) do
-  # We set lazy_update to *true* so that time is not wasted in updating
-  # metdata every time an assignment happens.
-  Daru.lazy_update = true
-
   samples = 1000
 
   # The 'new_with_size' function lets you specify the size of the
@@ -26,9 +22,6 @@ Statsample::Analysis.store(Daru::DataFrame) do
   # order by default.
   ds = Daru::DataFrame.new({:a=>a,:b=>b}, order: [:b, :a])
   summary(ds)
-
-  # Reset lazy_update to *false* to prevent other code from breaking.
-  Daru.lazy_update = false
 end
 
 if __FILE__==$0
data/examples/dominance_analysis_bootstrap.rb
CHANGED
@@ -3,10 +3,6 @@ $:.unshift(File.dirname(__FILE__)+'/../lib/')
 require 'statsample'
 
 Statsample::Analysis.store(Statsample::DominanceAnalysis::Bootstrap) do
-  # Remember to call *update* after an assignment/deletion cycle if lazy_update
-  # is *false*.
-  Daru.lazy_update = true
-
   sample=300
   a=rnorm(sample)
   b=rnorm(sample)
@@ -29,8 +25,6 @@ Statsample::Analysis.store(Statsample::DominanceAnalysis::Bootstrap) do
   dab2=dominance_analysis_bootstrap(ds2, :y1, :debug=>true)
   dab2.bootstrap(100,nil)
   summary(dab2)
-
-  Daru.lazy_update = false
 end
 
 if __FILE__==$0
data/examples/reliability.rb
CHANGED
data/lib/statsample.rb
CHANGED
@@ -160,6 +160,7 @@ module Statsample
   autoload(:StratifiedSample, 'statsample/multiset')
   autoload(:MLE, 'statsample/mle')
   autoload(:Regression, 'statsample/regression')
+  autoload(:FitModel, 'statsample/formula/fit_model')
   autoload(:Test, 'statsample/test')
   autoload(:Factor, 'statsample/factor')
   autoload(:Graph, 'statsample/graph')
@@ -206,7 +207,7 @@ module Statsample
   def only_valid(*vs)
     i = 1
     h = vs.inject({}) { |acc, v| acc["v#{i}".to_sym] = v; i += 1; acc }
-    df = Daru::DataFrame.new(h).
+    df = Daru::DataFrame.new(h).reject_values(*Daru::MISSING_VALUES)
     df.map { |v| v }
   end
 
@@ -214,7 +215,7 @@ module Statsample
   # If any vectors have missing_values, return only valid.
   # If not, return the vectors itself
   def only_valid_clone(*vs)
-    if vs.any?(
+    if vs.any? { |v| v.include_values?(*Daru::MISSING_VALUES) }
       only_valid(*vs)
     else
       vs
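The two `only_valid` hunks above replace statsample's old missing-data bookkeeping with daru's own API. A minimal sketch of what those daru calls do (not part of the diff; it assumes daru >= 0.1.5, where `Daru::MISSING_VALUES`, `#include_values?` and `#reject_values` are available, and the sample data is made up):

```ruby
require 'daru'

v1 = Daru::Vector.new([1, 2, nil, 4])
v2 = Daru::Vector.new([5, nil, 7, 8])

# include_values? reports whether a vector contains any of the given values.
v1.include_values?(*Daru::MISSING_VALUES)   # => true

# reject_values on a DataFrame drops every row holding one of those values,
# which is what the rewritten Statsample.only_valid relies on.
df    = Daru::DataFrame.new(v1: v1, v2: v2)
clean = df.reject_values(*Daru::MISSING_VALUES)
clean[:v1].to_a                             # => [1, 4]
```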
data/lib/statsample/anova/oneway.rb
CHANGED
@@ -164,7 +164,7 @@ module Statsample
       if summary_descriptives
         s.table(:name=>_("Descriptives"),:header=>%w{Name N Mean SD Min Max}.map {|v| _(v)}) do |t|
           @vectors.each do |v|
-            t.row [v.name, v.
+            t.row [v.name, v.reject_values(*Daru::MISSING_VALUES).size, "%0.4f" % v.mean, "%0.4f" % v.sd, "%0.4f" % v.min, "%0.4f" % v.max]
           end
         end
       end
data/lib/statsample/bivariate.rb
CHANGED
@@ -159,7 +159,7 @@ module Statsample
 
     def covariance_matrix(ds)
       vars,cases = ds.ncols, ds.nrows
-      if !ds.
+      if !ds.include_values?(*Daru::MISSING_VALUES) and Statsample.has_gsl? and prediction_optimized(vars,cases) < prediction_pairwise(vars,cases)
         cm=covariance_matrix_optimized(ds)
       else
         cm=covariance_matrix_pairwise(ds)
@@ -198,7 +198,7 @@ module Statsample
     # Order of rows and columns depends on Dataset#fields order
     def correlation_matrix(ds)
       vars, cases = ds.ncols, ds.nrows
-      if !ds.
+      if !ds.include_values?(*Daru::MISSING_VALUES) and Statsample.has_gsl? and prediction_optimized(vars,cases) < prediction_pairwise(vars,cases)
         cm=correlation_matrix_optimized(ds)
       else
         cm=correlation_matrix_pairwise(ds)
@@ -248,7 +248,7 @@ module Statsample
       m = vectors.collect do |row|
         vectors.collect do |col|
           if row==col
-            ds[row].
+            ds[row].reject_values(*Daru::MISSING_VALUES).size
           else
             rowa,rowb = Statsample.only_valid_clone(ds[row],ds[col])
             rowa.size
@@ -281,7 +281,7 @@ module Statsample
     # Calculate Point biserial correlation. Equal to Pearson correlation, with
     # one dichotomous value replaced by "0" and the other by "1"
     def point_biserial(dichotomous,continous)
-      ds = Daru::DataFrame.new({:d=>dichotomous,:c=>continous}).
+      ds = Daru::DataFrame.new({:d=>dichotomous,:c=>continous}).reject_values(*Daru::MISSING_VALUES)
       raise(TypeError, "First vector should be dichotomous") if ds[:d].factors.size != 2
       raise(TypeError, "Second vector should be continous") if ds[:c].type != :numeric
       f0=ds[:d].factors.sort.to_a[0]
data/lib/statsample/converter/spss.rb
CHANGED
@@ -7,7 +7,7 @@ module Statsample
     #   ds=Daru::DataFrame.from_excel("my_data.xls")
     #   puts Statsample::SPSS.tetrachoric_correlation_matrix(ds)
     def tetrachoric_correlation_matrix(ds)
-      dsv=ds.
+      dsv=ds.reject_values(*Daru::MISSING_VALUES)
       # Delete all vectors doesn't have variation
       dsv.vectors.each { |f|
         if dsv[f].factors.size==1
data/lib/statsample/crosstab.rb
CHANGED
@@ -29,10 +29,10 @@ module Statsample
       @v_cols.factors.sort.reset_index!
     end
     def rows_total
-      @v_rows.frequencies
+      @v_rows.frequencies.to_h
     end
     def cols_total
-      @v_cols.frequencies
+      @v_cols.frequencies.to_h
     end
 
     def frequencies
@@ -42,7 +42,7 @@ module Statsample
         s[par]=0
         s
       end
-      base.update(Daru::Vector.new(Statsample::vector_cols_matrix(@v_rows,@v_cols).to_a).frequencies)
+      base.update(Daru::Vector.new(Statsample::vector_cols_matrix(@v_rows,@v_cols).to_a).frequencies.to_h)
     end
     def to_matrix
       f = frequencies
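For context on the added `.to_h` calls: with current daru, `Daru::Vector#frequencies` returns a Daru::Vector of counts indexed by value, while the crosstab arithmetic expects a plain Hash. A rough illustration with toy data (an assumption inferred from the change itself, not shown in this diff):

```ruby
require 'daru'

v = Daru::Vector.new(%w[a b a c a b])
v.frequencies        # => a Daru::Vector of counts, indexed by "a", "b", "c"
v.frequencies.to_h   # => {"a"=>3, "b"=>2, "c"=>1}
```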
data/lib/statsample/daru.rb
CHANGED
@@ -11,7 +11,7 @@ module Daru
       # ugly patch. The upper limit for a bin has the form
       # x < range
       #h=Statsample::Histogram.new(self, bins)
-      valid =
+      valid = reject_values(*Daru::MISSING_VALUES)
       min,max=Statsample::Util.nice(valid.min,valid.max)
       # fix last data
       if max == valid.max
@@ -72,7 +72,6 @@ module Daru
       end
       #puts "Ingreso a los dataset"
       ms.datasets.each do |k,ds|
-        ds.update
         ds.rename self[field].index_of(k)
       end
 
@@ -102,7 +101,6 @@ module Daru
       each_row { |r| p1.call(r) }
 
       ms.datasets.each do |k,ds|
-        ds.update
         ds.rename(
           fields.size.times.map do |i|
             f = fields[i]
data/lib/statsample/factor/parallelanalysis.rb
CHANGED
@@ -142,8 +142,7 @@ module Statsample
             raise "bootstrap_method doesn't recogniced"
           end
         end
-
-
+
         matrix=Statsample::Bivariate.send(matrix_method, ds_bootstrap)
         matrix=matrix.to_gsl if @use_gsl
         if smc
@@ -159,7 +158,6 @@ module Statsample
             redo
           end
         end
-        @ds_eigenvalues.update
       end
       dirty_memoize :number_of_factors, :ds_eigenvalues
       dirty_writer :iterations, :bootstrap_method, :percentil, :smc
data/lib/statsample/formula/fit_model.rb
ADDED
@@ -0,0 +1,46 @@
+require 'statsample/formula/formula'
+
+module Statsample
+  # Class for performing regression
+  class FitModel
+    def initialize(formula, df, opts = {})
+      @formula = FormulaWrapper.new formula, df
+      @df = df
+      @opts = opts
+    end
+
+    def model
+      @model || fit_model
+    end
+
+    def predict(new_data)
+      model.predict(df_for_prediction(new_data))
+    end
+
+    def df_for_prediction df
+      canonicalize_df(df)
+    end
+
+    def df_for_regression
+      df = canonicalize_df(@df)
+      df[@formula.y.value] = @df[@formula.y.value]
+      df
+    end
+
+    def canonicalize_df(orig_df)
+      tokens = @formula.canonical_tokens
+      tokens.shift if tokens.first.value == '1'
+      df = tokens.map { |t| t.to_df orig_df }.reduce(&:merge)
+      df
+    end
+
+    def fit_model
+      # TODO: Add support for inclusion/exclusion of intercept
+      @model = Statsample::Regression.multiple(
+        df_for_regression,
+        @formula.y.value,
+        @opts
+      )
+    end
+  end
+end
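This is the string-formula front end mentioned in the changelog. A hedged usage sketch based only on the class above; the dataframe, its column names and the `coeffs` call on the returned regression engine are illustrative, not taken from this diff:

```ruby
require 'statsample'

df = Daru::DataFrame.new(
  'a' => [1, 2, 3, 4, 5],
  'b' => [3, 1, 4, 1, 5],
  'y' => [4, 4, 8, 7, 12]
)

# FitModel parses the R-style formula, builds the design dataframe through
# FormulaWrapper and hands it to Statsample::Regression.multiple.
fit = Statsample::FitModel.new('y~a+b', df)
lr  = fit.model   # a multiple-regression (OLS) engine
lr.coeffs         # => coefficients for 'a' and 'b'
```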
data/lib/statsample/formula/formula.rb
ADDED
@@ -0,0 +1,306 @@
+module Statsample
+  # This class recognizes what terms are numeric
+  # and accordingly forms groups which are fed to Formula
+  # Once they are parsed with Formula, they are combined back
+  class FormulaWrapper
+    attr_reader :tokens, :y, :canonical_tokens
+
+    # Initializes formula wrapper object to parse a given formula into
+    # some tokens which do not overlap one another.
+    # @note Specify 0 as a term in the formula if you do not want constant
+    #   to be included in the parsed formula
+    # @param [string] formula to parse
+    # @param [Daru::DataFrame] df dataframe requried to know what vectors
+    #   are numerical
+    # @example
+    #   df = Daru::DataFrame.from_csv 'spec/data/df.csv'
+    #   df.to_category 'c', 'd', 'e'
+    #   formula = Statsample::GLM::FormulaWrapper.new 'y~a+d:c', df
+    #   formula.canonical_to_s
+    #   #=> "1+c(-)+d(-):c+a"
+    def initialize(formula, df)
+      @df = df
+      # @y store the LHS term that is name of vector to be predicted
+      # @tokens store the RHS terms of the formula
+      @y, *@tokens = split_to_tokens(formula)
+      @tokens = @tokens.uniq.sort
+      manage_constant_term
+      @canonical_tokens = non_redundant_tokens
+    end
+
+    # Returns canonical tokens in a readable form.
+    # @return [String] canonical tokens in a readable form.
+    # @note 'y~a+b(-)' means 'a' exist in full rank expansion
+    #   and 'b(-)' exist in reduced rank expansion
+    # @example
+    #   df = Daru::DataFrame.from_csv 'spec/data/df.csv'
+    #   df.to_category 'c', 'd', 'e'
+    #   formula = Statsample::GLM::FormulaWrapper.new 'y~a+d:c', df
+    #   formula.canonical_to_s
+    #   #=> "1+c(-)+d(-):c+a"
+    def canonical_to_s
+      canonical_tokens.join '+'
+    end
+
+    # Returns tokens to produce non-redundant design matrix
+    # @return [Array] array of tokens that do not produce redundant matrix
+    def non_redundant_tokens
+      groups = split_to_groups
+      # TODO: An enhancement
+      # Right now x:c appears as c:x
+      groups.each { |k, v| groups[k] = strip_numeric v, k }
+      groups.each { |k, v| groups[k] = Formula.new(v).canonical_tokens }
+      groups.flat_map { |k, v| add_numeric v, k }
+    end
+
+    private
+
+    # Removes intercept token if term '0' is found in the formula.
+    # Intercept token remains if term '1' is found.
+    # If neither term '0' nor term '1' is found then, intercept token is added.
+    def manage_constant_term
+      @tokens.unshift Token.new('1') unless
+        @tokens.include?(Token.new('1')) ||
+        @tokens.include?(Token.new('0'))
+      @tokens.delete Token.new('0')
+    end
+
+    # Groups the tokens to gropus based on the numerical terms
+    # they are interacting with.
+    def split_to_groups
+      @tokens.group_by { |t| extract_numeric t }
+    end
+
+    # Add numeric interaction term which was removed earlier
+    # @param [Array] tokens tokens on which to add numerical terms
+    # @param [Array] numeric array of numeric terms to add
+    def add_numeric(tokens, numeric)
+      tokens.map do |t|
+        terms = t.interact_terms + numeric
+        if terms == ['1']
+          Token.new('1')
+        else
+          terms = terms.reject { |i| i == '1' }
+          Token.new terms.join(':'), t.full
+        end
+      end
+    end
+
+    # Strip numerical interacting terms
+    # @param [Array] tokens tokens from which to strip numeric
+    # @param [Array] numeric array of numeric terms to strip from tokens
+    # @return [Array] array of tokens with striped numerical terms
+    def strip_numeric(tokens, numeric)
+      tokens.map do |t|
+        terms = t.interact_terms - numeric
+        terms = ['1'] if terms.empty?
+        Token.new terms.join(':')
+      end
+    end
+
+    # Extract numeric interacting terms
+    # @param [Statsample::GLM::Token] token form which to extract numeric terms
+    # @return [Array] array of numericl terms
+    def extract_numeric(token)
+      terms = token.interact_terms
+      return [] if terms == ['1']
+      terms.reject { |t| @df[t].category? }
+    end
+
+    def split_to_tokens(formula)
+      formula = formula.gsub(/\s+/, '')
+      lhs_term, rhs = formula.split '~'
+      rhs_terms = rhs.split '+'
+      ([lhs_term] + rhs_terms).map { |t| Token.new t }
+    end
+  end
+
+  # To process formula language
+  class Formula
+    attr_reader :tokens, :canonical_tokens
+
+    def initialize(tokens)
+      @tokens = tokens
+      @canonical_tokens = parse_formula
+    end
+
+    def canonical_to_s
+      canonical_tokens.join '+'
+    end
+
+    private
+
+    def parse_formula
+      @tokens.inject([]) do |acc, token|
+        acc + add_non_redundant_elements(token, acc)
+      end
+    end
+
+    def add_non_redundant_elements(token, result_so_far)
+      return [token] if token.value == '1'
+      tokens = token.expand
+      result_so_far = result_so_far.flat_map(&:expand)
+      tokens -= result_so_far
+      contract_if_possible tokens
+    end
+
+    def contract_if_possible(tokens)
+      tokens.combination(2).each do |a, b|
+        result = a.add b
+        next unless result
+        tokens.delete a
+        tokens.delete b
+        tokens << result
+        return contract_if_possible tokens
+      end
+      tokens.sort
+    end
+  end
+
+  # To encapsulate interaction as well as non-interaction terms
+  class Token
+    attr_reader :value, :full, :interact_terms
+
+    def initialize(value, full = true)
+      @interact_terms = value.include?(':') ? value.split(':') : [value]
+      @full = coerce_full full
+    end
+
+    def value
+      interact_terms.join(':')
+    end
+
+    def size
+      # TODO: Return size 1 for value '1' also
+      # CAn't do this at the moment because have to make
+      # changes in sorting first
+      value == '1' ? 0 : interact_terms.size
+    end
+
+    def add(other)
+      # ANYTHING + FACTOR- : ANYTHING = FACTOR : ANYTHING
+      # ANYTHING + ANYTHING : FACTOR- = ANYTHING : FACTOR
+      if size > other.size
+        other.add self
+
+      elsif other.size == 2 &&
+            size == 1 &&
+            other.interact_terms.last == value &&
+            other.full.last == full.first &&
+            other.full.first == false
+        Token.new(
+          "#{other.interact_terms.first}:#{value}",
+          [true, other.full.last]
+        )
+
+      elsif other.size == 2 &&
+            size == 1 &&
+            other.interact_terms.first == value &&
+            other.full.first == full.first &&
+            other.full.last == false
+        Token.new(
+          "#{value}:#{other.interact_terms.last}",
+          [other.full.first, true]
+        )
+
+      elsif value == '1' &&
+            other.size == 1
+        Token.new(other.value, true)
+      end
+    end
+
+    def ==(other)
+      value == other.value &&
+        full == other.full
+    end
+
+    alias eql? ==
+
+    def hash
+      value.hash ^ full.hash
+    end
+
+    def <=>(other)
+      size <=> other.size
+    end
+
+    def to_s
+      interact_terms
+        .zip(full)
+        .map { |t, f| f ? t : t + '(-)' }
+        .join ':'
+    end
+
+    def expand
+      case size
+      when 0
+        [self]
+      when 1
+        [Token.new('1'), Token.new(value, false)]
+      when 2
+        a, b = interact_terms
+        [Token.new('1'), Token.new(a, false), Token.new(b, false),
+         Token.new(a + ':' + b, [false, false])]
+      end
+    end
+
+    def to_df(df)
+      case size
+      when 1
+        if df[value].category?
+          df[value].contrast_code full: full.first
+        else
+          Daru::DataFrame.new value => df[value].to_a
+        end
+      when 2
+        to_df_when_interaction(df)
+      end
+    end
+
+    private
+
+    def coerce_full(value)
+      if value.is_a? Array
+        value + Array.new((@interact_terms.size - value.size), true)
+      else
+        [value] * @interact_terms.size
+      end
+    end
+
+    def to_df_when_interaction(df)
+      case interact_terms.map { |t| df[t].category? }
+      when [true, true]
+        df.interact_code(interact_terms, full)
+      when [false, false]
+        to_df_numeric_interact_with_numeric df
+      when [true, false]
+        to_df_category_interact_with_numeric df
+      when [false, true]
+        to_df_numeric_interact_with_category df
+      end
+    end
+
+    def to_df_numeric_interact_with_numeric(df)
+      Daru::DataFrame.new value => (df[interact_terms.first] *
+        df[interact_terms.last]).to_a
+    end
+
+    def to_df_category_interact_with_numeric(df)
+      a, b = interact_terms
+      Daru::DataFrame.new(
+        df[a].contrast_code(full: full.first)
+          .map { |dv| ["#{dv.name}:#{b}", (dv * df[b]).to_a] }
+          .to_h
+      )
+    end
+
+    def to_df_numeric_interact_with_category(df)
+      a, b = interact_terms
+      Daru::DataFrame.new(
+        df[b].contrast_code(full: full.last)
+          .map { |dv| ["#{a}:#{dv.name}", (dv * df[a]).to_a] }
+          .to_h
+      )
+    end
+  end
+end