statsample 2.0.2 → 2.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/History.txt +7 -0
- data/README.md +2 -4
- data/Rakefile +6 -0
- data/benchmarks/correlation_matrix_methods/correlation_matrix.rb +0 -1
- data/examples/correlation_matrix.rb +0 -16
- data/examples/dataset.rb +0 -7
- data/examples/dominance_analysis_bootstrap.rb +0 -6
- data/examples/reliability.rb +0 -2
- data/lib/statsample.rb +3 -2
- data/lib/statsample/anova/oneway.rb +1 -1
- data/lib/statsample/bivariate.rb +4 -4
- data/lib/statsample/converter/spss.rb +1 -1
- data/lib/statsample/crosstab.rb +3 -3
- data/lib/statsample/daru.rb +1 -3
- data/lib/statsample/factor/parallelanalysis.rb +1 -3
- data/lib/statsample/formula/fit_model.rb +46 -0
- data/lib/statsample/formula/formula.rb +306 -0
- data/lib/statsample/matrix.rb +0 -2
- data/lib/statsample/regression.rb +1 -3
- data/lib/statsample/regression/multiple/alglibengine.rb +1 -1
- data/lib/statsample/regression/multiple/gslengine.rb +1 -1
- data/lib/statsample/regression/multiple/rubyengine.rb +1 -3
- data/lib/statsample/reliability.rb +3 -3
- data/lib/statsample/reliability/icc.rb +1 -2
- data/lib/statsample/reliability/multiscaleanalysis.rb +0 -1
- data/lib/statsample/reliability/scaleanalysis.rb +2 -3
- data/lib/statsample/reliability/skillscaleanalysis.rb +1 -1
- data/lib/statsample/test/levene.rb +4 -4
- data/lib/statsample/test/t.rb +10 -10
- data/lib/statsample/test/umannwhitney.rb +3 -3
- data/lib/statsample/version.rb +1 -1
- data/statsample.gemspec +4 -1
- data/test/fixtures/df.csv +15 -0
- data/test/helpers_tests.rb +7 -0
- data/test/test_factor.rb +0 -5
- data/test/test_factor_pa.rb +1 -6
- data/test/test_fit_model.rb +88 -0
- data/test/test_reliability.rb +0 -10
- data/test/test_statistics.rb +1 -1
- metadata +52 -48
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: e2a80fff135f963dcabfe1593de243611eb8ab9a
|
4
|
+
data.tar.gz: cb2d80e85339201f8a37ea1b8e934953f26b5591
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: b9412e202d3364c6fe7f982a9ceb2828061312ffd0274c7bc25e8b5747abd4b11aca5edd960e22fb6ee8cfec409a6352961c51e283d9523fb608e5d66bf65377
|
7
|
+
data.tar.gz: 5db058e78ae638c155727ca51af3e303503bfd42b518de095aa93e9639194c9a273a8b4e3ae3e069d91e25500612beba48140f2e7024e2c329ba16adeca54bef
|
data/History.txt
CHANGED
@@ -1,3 +1,10 @@
|
|
1
|
+
=== 2.1.0 / 2017-08-10
|
2
|
+
* Update documentation to reflect methods that have been removed (@lokeshh)
|
3
|
+
* Update daru dependency to v0.1.6 (@lokeshh)
|
4
|
+
* Remove pre-daru legacy methods like n_valid, missing value functions (@lokeshh)
|
5
|
+
* Update test suite with rubocop and rake. New tests for methods like Regression (@lokeshh)
|
6
|
+
* Introduce fitting a regression using string formulas (@lokeshh)
|
7
|
+
|
1
8
|
=== 2.0.2 / 2016-03-11
|
2
9
|
* Update dependencies (spreadsheet, GSL)
|
3
10
|
|
data/README.md
CHANGED
@@ -11,7 +11,7 @@ Homepage :: https://github.com/sciruby/statsample
|
|
11
11
|
You should have a recent version of GSL and R (with the `irr` and `Rserve` libraries) installed. In Ubuntu:
|
12
12
|
|
13
13
|
```bash
|
14
|
-
$ sudo apt-get install
|
14
|
+
$ sudo apt-get install libgsl0-dev r-base r-base-dev
|
15
15
|
$ sudo Rscript -e "install.packages(c('Rserve', 'irr'))"
|
16
16
|
```
|
17
17
|
|
@@ -86,7 +86,7 @@ Include:
|
|
86
86
|
- Intra-class correlation
|
87
87
|
- Anova: generic and vector-based One-way ANOVA and Two-way ANOVA, with contrasts for One-way ANOVA.
|
88
88
|
- Tests: F, T, Levene, U-Mannwhitney.
|
89
|
-
- Regression: Simple, Multiple (OLS)
|
89
|
+
- Regression: Simple, Multiple (OLS)
|
90
90
|
- Factorial Analysis: Extraction (PCA and Principal Axis), Rotation (Varimax, Equimax, Quartimax) and Parallel Analysis and Velicer's MAP test, for estimation of number of factors.
|
91
91
|
- Reliability analysis for simple scale and a DSL to easily analyze multiple scales using factor analysis and correlations, if you want it.
|
92
92
|
- Basic time series support
|
@@ -120,8 +120,6 @@ Include:
|
|
120
120
|
- Multiple types of regression.
|
121
121
|
- Simple Regression : Statsample::Regression::Simple
|
122
122
|
- Multiple Regression: Statsample::Regression::Multiple
|
123
|
-
- Logit Regression: Statsample::Regression::Binomial::Logit
|
124
|
-
- Probit Regression: Statsample::Regression::Binomial::Probit
|
125
123
|
- Factorial Analysis algorithms on Statsample::Factor module.
|
126
124
|
- Classes for Extraction of factors:
|
127
125
|
- Statsample::Factor::PCA
|
data/Rakefile
CHANGED
@@ -1,4 +1,5 @@
|
|
1
1
|
$:.unshift File.expand_path("../lib/", __FILE__)
|
2
|
+
lib_folder = File.expand_path("../lib", __FILE__)
|
2
3
|
|
3
4
|
require 'statsample/version'
|
4
5
|
require 'rake'
|
@@ -36,3 +37,8 @@ task "gettext:makemo" do
|
|
36
37
|
require 'gettext/tools'
|
37
38
|
GetText.create_mofiles()
|
38
39
|
end
|
40
|
+
|
41
|
+
desc 'Run pry'
|
42
|
+
task :pry do |task|
|
43
|
+
sh "pry -r #{lib_folder}/statsample.rb"
|
44
|
+
end
|
@@ -7,18 +7,6 @@ $:.unshift(File.dirname(__FILE__)+'/../lib/')
|
|
7
7
|
|
8
8
|
require 'statsample'
|
9
9
|
Statsample::Analysis.store("Statsample::Bivariate.correlation_matrix") do
|
10
|
-
# It so happens that Daru::Vector and Daru::DataFrame must update metadata
|
11
|
-
# like positions of missing values every time they are created.
|
12
|
-
#
|
13
|
-
# Since we dont have any missing values in the data that we are creating,
|
14
|
-
# we set Daru.lazy_update = true so that missing data is not updated every
|
15
|
-
# time and things happen much faster.
|
16
|
-
#
|
17
|
-
# In case you do have missing data and lazy_update has been set to *true*,
|
18
|
-
# you _SHOULD_ called `#update` on the concerned Vector or DataFrame object
|
19
|
-
# everytime an assingment or deletion cycle is complete.
|
20
|
-
Daru.lazy_update = true
|
21
|
-
|
22
10
|
# Create a Daru::DataFrame containing 4 vectors a, b, c and d.
|
23
11
|
#
|
24
12
|
# Notice that the `clone` option has been set to *false*. This tells Daru
|
@@ -36,10 +24,6 @@ Statsample::Analysis.store("Statsample::Bivariate.correlation_matrix") do
|
|
36
24
|
# Calculate correlation matrix by calling the `cor` shorthand.
|
37
25
|
cm = cor(ds)
|
38
26
|
summary(cm)
|
39
|
-
|
40
|
-
# Set lazy_update to *false* once our job is done so that this analysis does
|
41
|
-
# not accidentally affect code elsewhere.
|
42
|
-
Daru.lazy_update = false
|
43
27
|
end
|
44
28
|
|
45
29
|
if __FILE__==$0
|
data/examples/dataset.rb
CHANGED
@@ -6,10 +6,6 @@ $:.unshift(File.dirname(__FILE__)+'/../lib/')
|
|
6
6
|
require 'statsample'
|
7
7
|
|
8
8
|
Statsample::Analysis.store(Daru::DataFrame) do
|
9
|
-
# We set lazy_update to *true* so that time is not wasted in updating
|
10
|
-
# metdata every time an assignment happens.
|
11
|
-
Daru.lazy_update = true
|
12
|
-
|
13
9
|
samples = 1000
|
14
10
|
|
15
11
|
# The 'new_with_size' function lets you specify the size of the
|
@@ -26,9 +22,6 @@ Statsample::Analysis.store(Daru::DataFrame) do
|
|
26
22
|
# order by default.
|
27
23
|
ds = Daru::DataFrame.new({:a=>a,:b=>b}, order: [:b, :a])
|
28
24
|
summary(ds)
|
29
|
-
|
30
|
-
# Reset lazy_update to *false* to prevent other code from breaking.
|
31
|
-
Daru.lazy_update = false
|
32
25
|
end
|
33
26
|
|
34
27
|
if __FILE__==$0
|
@@ -3,10 +3,6 @@ $:.unshift(File.dirname(__FILE__)+'/../lib/')
|
|
3
3
|
require 'statsample'
|
4
4
|
|
5
5
|
Statsample::Analysis.store(Statsample::DominanceAnalysis::Bootstrap) do
|
6
|
-
# Remember to call *update* after an assignment/deletion cycle if lazy_update
|
7
|
-
# is *false*.
|
8
|
-
Daru.lazy_update = true
|
9
|
-
|
10
6
|
sample=300
|
11
7
|
a=rnorm(sample)
|
12
8
|
b=rnorm(sample)
|
@@ -29,8 +25,6 @@ Statsample::Analysis.store(Statsample::DominanceAnalysis::Bootstrap) do
|
|
29
25
|
dab2=dominance_analysis_bootstrap(ds2, :y1, :debug=>true)
|
30
26
|
dab2.bootstrap(100,nil)
|
31
27
|
summary(dab2)
|
32
|
-
|
33
|
-
Daru.lazy_update = false
|
34
28
|
end
|
35
29
|
|
36
30
|
if __FILE__==$0
|
data/examples/reliability.rb
CHANGED
data/lib/statsample.rb
CHANGED
@@ -160,6 +160,7 @@ module Statsample
|
|
160
160
|
autoload(:StratifiedSample, 'statsample/multiset')
|
161
161
|
autoload(:MLE, 'statsample/mle')
|
162
162
|
autoload(:Regression, 'statsample/regression')
|
163
|
+
autoload(:FitModel, 'statsample/formula/fit_model')
|
163
164
|
autoload(:Test, 'statsample/test')
|
164
165
|
autoload(:Factor, 'statsample/factor')
|
165
166
|
autoload(:Graph, 'statsample/graph')
|
@@ -206,7 +207,7 @@ module Statsample
|
|
206
207
|
def only_valid(*vs)
|
207
208
|
i = 1
|
208
209
|
h = vs.inject({}) { |acc, v| acc["v#{i}".to_sym] = v; i += 1; acc }
|
209
|
-
df = Daru::DataFrame.new(h).
|
210
|
+
df = Daru::DataFrame.new(h).reject_values(*Daru::MISSING_VALUES)
|
210
211
|
df.map { |v| v }
|
211
212
|
end
|
212
213
|
|
@@ -214,7 +215,7 @@ module Statsample
|
|
214
215
|
# If any vectors have missing_values, return only valid.
|
215
216
|
# If not, return the vectors itself
|
216
217
|
def only_valid_clone(*vs)
|
217
|
-
if vs.any?(
|
218
|
+
if vs.any? { |v| v.include_values?(*Daru::MISSING_VALUES) }
|
218
219
|
only_valid(*vs)
|
219
220
|
else
|
220
221
|
vs
|
@@ -164,7 +164,7 @@ module Statsample
|
|
164
164
|
if summary_descriptives
|
165
165
|
s.table(:name=>_("Descriptives"),:header=>%w{Name N Mean SD Min Max}.map {|v| _(v)}) do |t|
|
166
166
|
@vectors.each do |v|
|
167
|
-
t.row [v.name, v.
|
167
|
+
t.row [v.name, v.reject_values(*Daru::MISSING_VALUES).size, "%0.4f" % v.mean, "%0.4f" % v.sd, "%0.4f" % v.min, "%0.4f" % v.max]
|
168
168
|
end
|
169
169
|
end
|
170
170
|
end
|
data/lib/statsample/bivariate.rb
CHANGED
@@ -159,7 +159,7 @@ module Statsample
|
|
159
159
|
|
160
160
|
def covariance_matrix(ds)
|
161
161
|
vars,cases = ds.ncols, ds.nrows
|
162
|
-
if !ds.
|
162
|
+
if !ds.include_values?(*Daru::MISSING_VALUES) and Statsample.has_gsl? and prediction_optimized(vars,cases) < prediction_pairwise(vars,cases)
|
163
163
|
cm=covariance_matrix_optimized(ds)
|
164
164
|
else
|
165
165
|
cm=covariance_matrix_pairwise(ds)
|
@@ -198,7 +198,7 @@ module Statsample
|
|
198
198
|
# Order of rows and columns depends on Dataset#fields order
|
199
199
|
def correlation_matrix(ds)
|
200
200
|
vars, cases = ds.ncols, ds.nrows
|
201
|
-
if !ds.
|
201
|
+
if !ds.include_values?(*Daru::MISSING_VALUES) and Statsample.has_gsl? and prediction_optimized(vars,cases) < prediction_pairwise(vars,cases)
|
202
202
|
cm=correlation_matrix_optimized(ds)
|
203
203
|
else
|
204
204
|
cm=correlation_matrix_pairwise(ds)
|
@@ -248,7 +248,7 @@ module Statsample
|
|
248
248
|
m = vectors.collect do |row|
|
249
249
|
vectors.collect do |col|
|
250
250
|
if row==col
|
251
|
-
ds[row].
|
251
|
+
ds[row].reject_values(*Daru::MISSING_VALUES).size
|
252
252
|
else
|
253
253
|
rowa,rowb = Statsample.only_valid_clone(ds[row],ds[col])
|
254
254
|
rowa.size
|
@@ -281,7 +281,7 @@ module Statsample
|
|
281
281
|
# Calculate Point biserial correlation. Equal to Pearson correlation, with
|
282
282
|
# one dichotomous value replaced by "0" and the other by "1"
|
283
283
|
def point_biserial(dichotomous,continous)
|
284
|
-
ds = Daru::DataFrame.new({:d=>dichotomous,:c=>continous}).
|
284
|
+
ds = Daru::DataFrame.new({:d=>dichotomous,:c=>continous}).reject_values(*Daru::MISSING_VALUES)
|
285
285
|
raise(TypeError, "First vector should be dichotomous") if ds[:d].factors.size != 2
|
286
286
|
raise(TypeError, "Second vector should be continous") if ds[:c].type != :numeric
|
287
287
|
f0=ds[:d].factors.sort.to_a[0]
|
@@ -7,7 +7,7 @@ module Statsample
|
|
7
7
|
# ds=Daru::DataFrame.from_excel("my_data.xls")
|
8
8
|
# puts Statsample::SPSS.tetrachoric_correlation_matrix(ds)
|
9
9
|
def tetrachoric_correlation_matrix(ds)
|
10
|
-
dsv=ds.
|
10
|
+
dsv=ds.reject_values(*Daru::MISSING_VALUES)
|
11
11
|
# Delete all vectors that have no variation
|
12
12
|
dsv.vectors.each { |f|
|
13
13
|
if dsv[f].factors.size==1
|
data/lib/statsample/crosstab.rb
CHANGED
@@ -29,10 +29,10 @@ module Statsample
|
|
29
29
|
@v_cols.factors.sort.reset_index!
|
30
30
|
end
|
31
31
|
def rows_total
|
32
|
-
@v_rows.frequencies
|
32
|
+
@v_rows.frequencies.to_h
|
33
33
|
end
|
34
34
|
def cols_total
|
35
|
-
@v_cols.frequencies
|
35
|
+
@v_cols.frequencies.to_h
|
36
36
|
end
|
37
37
|
|
38
38
|
def frequencies
|
@@ -42,7 +42,7 @@ module Statsample
|
|
42
42
|
s[par]=0
|
43
43
|
s
|
44
44
|
end
|
45
|
-
base.update(Daru::Vector.new(Statsample::vector_cols_matrix(@v_rows,@v_cols).to_a).frequencies)
|
45
|
+
base.update(Daru::Vector.new(Statsample::vector_cols_matrix(@v_rows,@v_cols).to_a).frequencies.to_h)
|
46
46
|
end
|
47
47
|
def to_matrix
|
48
48
|
f = frequencies
|
data/lib/statsample/daru.rb
CHANGED
@@ -11,7 +11,7 @@ module Daru
|
|
11
11
|
# ugly patch. The upper limit for a bin has the form
|
12
12
|
# x < range
|
13
13
|
#h=Statsample::Histogram.new(self, bins)
|
14
|
-
valid =
|
14
|
+
valid = reject_values(*Daru::MISSING_VALUES)
|
15
15
|
min,max=Statsample::Util.nice(valid.min,valid.max)
|
16
16
|
# fix last data
|
17
17
|
if max == valid.max
|
@@ -72,7 +72,6 @@ module Daru
|
|
72
72
|
end
|
73
73
|
#puts "Ingreso a los dataset"
|
74
74
|
ms.datasets.each do |k,ds|
|
75
|
-
ds.update
|
76
75
|
ds.rename self[field].index_of(k)
|
77
76
|
end
|
78
77
|
|
@@ -102,7 +101,6 @@ module Daru
|
|
102
101
|
each_row { |r| p1.call(r) }
|
103
102
|
|
104
103
|
ms.datasets.each do |k,ds|
|
105
|
-
ds.update
|
106
104
|
ds.rename(
|
107
105
|
fields.size.times.map do |i|
|
108
106
|
f = fields[i]
|
@@ -142,8 +142,7 @@ module Statsample
|
|
142
142
|
raise "bootstrap_method doesn't recogniced"
|
143
143
|
end
|
144
144
|
end
|
145
|
-
|
146
|
-
|
145
|
+
|
147
146
|
matrix=Statsample::Bivariate.send(matrix_method, ds_bootstrap)
|
148
147
|
matrix=matrix.to_gsl if @use_gsl
|
149
148
|
if smc
|
@@ -159,7 +158,6 @@ module Statsample
|
|
159
158
|
redo
|
160
159
|
end
|
161
160
|
end
|
162
|
-
@ds_eigenvalues.update
|
163
161
|
end
|
164
162
|
dirty_memoize :number_of_factors, :ds_eigenvalues
|
165
163
|
dirty_writer :iterations, :bootstrap_method, :percentil, :smc
|
@@ -0,0 +1,46 @@
|
|
1
|
+
require 'statsample/formula/formula'
|
2
|
+
|
3
|
+
module Statsample
|
4
|
+
# Class for performing regression
|
5
|
+
class FitModel
|
6
|
+
def initialize(formula, df, opts = {})
|
7
|
+
@formula = FormulaWrapper.new formula, df
|
8
|
+
@df = df
|
9
|
+
@opts = opts
|
10
|
+
end
|
11
|
+
|
12
|
+
def model
|
13
|
+
@model || fit_model
|
14
|
+
end
|
15
|
+
|
16
|
+
def predict(new_data)
|
17
|
+
model.predict(df_for_prediction(new_data))
|
18
|
+
end
|
19
|
+
|
20
|
+
def df_for_prediction df
|
21
|
+
canonicalize_df(df)
|
22
|
+
end
|
23
|
+
|
24
|
+
def df_for_regression
|
25
|
+
df = canonicalize_df(@df)
|
26
|
+
df[@formula.y.value] = @df[@formula.y.value]
|
27
|
+
df
|
28
|
+
end
|
29
|
+
|
30
|
+
def canonicalize_df(orig_df)
|
31
|
+
tokens = @formula.canonical_tokens
|
32
|
+
tokens.shift if tokens.first.value == '1'
|
33
|
+
df = tokens.map { |t| t.to_df orig_df }.reduce(&:merge)
|
34
|
+
df
|
35
|
+
end
|
36
|
+
|
37
|
+
def fit_model
|
38
|
+
# TODO: Add support for inclusion/exclusion of intercept
|
39
|
+
@model = Statsample::Regression.multiple(
|
40
|
+
df_for_regression,
|
41
|
+
@formula.y.value,
|
42
|
+
@opts
|
43
|
+
)
|
44
|
+
end
|
45
|
+
end
|
46
|
+
end
|
@@ -0,0 +1,306 @@
|
|
1
|
+
module Statsample
|
2
|
+
# This class recognizes what terms are numeric
|
3
|
+
# and accordingly forms groups which are fed to Formula
|
4
|
+
# Once they are parsed with Formula, they are combined back
|
5
|
+
class FormulaWrapper
|
6
|
+
attr_reader :tokens, :y, :canonical_tokens
|
7
|
+
|
8
|
+
# Initializes formula wrapper object to parse a given formula into
|
9
|
+
# some tokens which do not overlap one another.
|
10
|
+
# @note Specify 0 as a term in the formula if you do not want constant
|
11
|
+
# to be included in the parsed formula
|
12
|
+
# @param [string] formula to parse
|
13
|
+
# @param [Daru::DataFrame] df dataframe required to know what vectors
|
14
|
+
# are numerical
|
15
|
+
# @example
|
16
|
+
# df = Daru::DataFrame.from_csv 'spec/data/df.csv'
|
17
|
+
# df.to_category 'c', 'd', 'e'
|
18
|
+
# formula = Statsample::FormulaWrapper.new 'y~a+d:c', df
|
19
|
+
# formula.canonical_to_s
|
20
|
+
# #=> "1+c(-)+d(-):c+a"
|
21
|
+
def initialize(formula, df)
|
22
|
+
@df = df
|
23
|
+
# @y store the LHS term that is name of vector to be predicted
|
24
|
+
# @tokens store the RHS terms of the formula
|
25
|
+
@y, *@tokens = split_to_tokens(formula)
|
26
|
+
@tokens = @tokens.uniq.sort
|
27
|
+
manage_constant_term
|
28
|
+
@canonical_tokens = non_redundant_tokens
|
29
|
+
end
|
30
|
+
|
31
|
+
# Returns canonical tokens in a readable form.
|
32
|
+
# @return [String] canonical tokens in a readable form.
|
33
|
+
# @note 'y~a+b(-)' means 'a' exist in full rank expansion
|
34
|
+
# and 'b(-)' exist in reduced rank expansion
|
35
|
+
# @example
|
36
|
+
# df = Daru::DataFrame.from_csv 'spec/data/df.csv'
|
37
|
+
# df.to_category 'c', 'd', 'e'
|
38
|
+
# formula = Statsample::FormulaWrapper.new 'y~a+d:c', df
|
39
|
+
# formula.canonical_to_s
|
40
|
+
# #=> "1+c(-)+d(-):c+a"
|
41
|
+
def canonical_to_s
|
42
|
+
canonical_tokens.join '+'
|
43
|
+
end
|
44
|
+
|
45
|
+
# Returns tokens to produce non-redundant design matrix
|
46
|
+
# @return [Array] array of tokens that do not produce redundant matrix
|
47
|
+
def non_redundant_tokens
|
48
|
+
groups = split_to_groups
|
49
|
+
# TODO: An enhancement
|
50
|
+
# Right now x:c appears as c:x
|
51
|
+
groups.each { |k, v| groups[k] = strip_numeric v, k }
|
52
|
+
groups.each { |k, v| groups[k] = Formula.new(v).canonical_tokens }
|
53
|
+
groups.flat_map { |k, v| add_numeric v, k }
|
54
|
+
end
|
55
|
+
|
56
|
+
private
|
57
|
+
|
58
|
+
# Removes intercept token if term '0' is found in the formula.
|
59
|
+
# Intercept token remains if term '1' is found.
|
60
|
+
# If neither term '0' nor term '1' is found, then the intercept token is added.
|
61
|
+
def manage_constant_term
|
62
|
+
@tokens.unshift Token.new('1') unless
|
63
|
+
@tokens.include?(Token.new('1')) ||
|
64
|
+
@tokens.include?(Token.new('0'))
|
65
|
+
@tokens.delete Token.new('0')
|
66
|
+
end
|
67
|
+
|
68
|
+
# Groups the tokens into groups based on the numerical terms
|
69
|
+
# they are interacting with.
|
70
|
+
def split_to_groups
|
71
|
+
@tokens.group_by { |t| extract_numeric t }
|
72
|
+
end
|
73
|
+
|
74
|
+
# Add numeric interaction term which was removed earlier
|
75
|
+
# @param [Array] tokens tokens on which to add numerical terms
|
76
|
+
# @param [Array] numeric array of numeric terms to add
|
77
|
+
def add_numeric(tokens, numeric)
|
78
|
+
tokens.map do |t|
|
79
|
+
terms = t.interact_terms + numeric
|
80
|
+
if terms == ['1']
|
81
|
+
Token.new('1')
|
82
|
+
else
|
83
|
+
terms = terms.reject { |i| i == '1' }
|
84
|
+
Token.new terms.join(':'), t.full
|
85
|
+
end
|
86
|
+
end
|
87
|
+
end
|
88
|
+
|
89
|
+
# Strip numerical interacting terms
|
90
|
+
# @param [Array] tokens tokens from which to strip numeric
|
91
|
+
# @param [Array] numeric array of numeric terms to strip from tokens
|
92
|
+
# @return [Array] array of tokens with stripped numerical terms
|
93
|
+
def strip_numeric(tokens, numeric)
|
94
|
+
tokens.map do |t|
|
95
|
+
terms = t.interact_terms - numeric
|
96
|
+
terms = ['1'] if terms.empty?
|
97
|
+
Token.new terms.join(':')
|
98
|
+
end
|
99
|
+
end
|
100
|
+
|
101
|
+
# Extract numeric interacting terms
|
102
|
+
# @param [Statsample::Token] token from which to extract numeric terms
|
103
|
+
# @return [Array] array of numerical terms
|
104
|
+
def extract_numeric(token)
|
105
|
+
terms = token.interact_terms
|
106
|
+
return [] if terms == ['1']
|
107
|
+
terms.reject { |t| @df[t].category? }
|
108
|
+
end
|
109
|
+
|
110
|
+
def split_to_tokens(formula)
|
111
|
+
formula = formula.gsub(/\s+/, '')
|
112
|
+
lhs_term, rhs = formula.split '~'
|
113
|
+
rhs_terms = rhs.split '+'
|
114
|
+
([lhs_term] + rhs_terms).map { |t| Token.new t }
|
115
|
+
end
|
116
|
+
end
|
117
|
+
|
118
|
+
# To process formula language
|
119
|
+
class Formula
|
120
|
+
attr_reader :tokens, :canonical_tokens
|
121
|
+
|
122
|
+
def initialize(tokens)
|
123
|
+
@tokens = tokens
|
124
|
+
@canonical_tokens = parse_formula
|
125
|
+
end
|
126
|
+
|
127
|
+
def canonical_to_s
|
128
|
+
canonical_tokens.join '+'
|
129
|
+
end
|
130
|
+
|
131
|
+
private
|
132
|
+
|
133
|
+
def parse_formula
|
134
|
+
@tokens.inject([]) do |acc, token|
|
135
|
+
acc + add_non_redundant_elements(token, acc)
|
136
|
+
end
|
137
|
+
end
|
138
|
+
|
139
|
+
def add_non_redundant_elements(token, result_so_far)
|
140
|
+
return [token] if token.value == '1'
|
141
|
+
tokens = token.expand
|
142
|
+
result_so_far = result_so_far.flat_map(&:expand)
|
143
|
+
tokens -= result_so_far
|
144
|
+
contract_if_possible tokens
|
145
|
+
end
|
146
|
+
|
147
|
+
def contract_if_possible(tokens)
|
148
|
+
tokens.combination(2).each do |a, b|
|
149
|
+
result = a.add b
|
150
|
+
next unless result
|
151
|
+
tokens.delete a
|
152
|
+
tokens.delete b
|
153
|
+
tokens << result
|
154
|
+
return contract_if_possible tokens
|
155
|
+
end
|
156
|
+
tokens.sort
|
157
|
+
end
|
158
|
+
end
|
159
|
+
|
160
|
+
# To encapsulate interaction as well as non-interaction terms
|
161
|
+
class Token
|
162
|
+
attr_reader :value, :full, :interact_terms
|
163
|
+
|
164
|
+
def initialize(value, full = true)
|
165
|
+
@interact_terms = value.include?(':') ? value.split(':') : [value]
|
166
|
+
@full = coerce_full full
|
167
|
+
end
|
168
|
+
|
169
|
+
def value
|
170
|
+
interact_terms.join(':')
|
171
|
+
end
|
172
|
+
|
173
|
+
def size
|
174
|
+
# TODO: Return size 1 for value '1' also
|
175
|
+
# Can't do this at the moment because we have to make
|
176
|
+
# changes in sorting first
|
177
|
+
value == '1' ? 0 : interact_terms.size
|
178
|
+
end
|
179
|
+
|
180
|
+
def add(other)
|
181
|
+
# ANYTHING + FACTOR- : ANYTHING = FACTOR : ANYTHING
|
182
|
+
# ANYTHING + ANYTHING : FACTOR- = ANYTHING : FACTOR
|
183
|
+
if size > other.size
|
184
|
+
other.add self
|
185
|
+
|
186
|
+
elsif other.size == 2 &&
|
187
|
+
size == 1 &&
|
188
|
+
other.interact_terms.last == value &&
|
189
|
+
other.full.last == full.first &&
|
190
|
+
other.full.first == false
|
191
|
+
Token.new(
|
192
|
+
"#{other.interact_terms.first}:#{value}",
|
193
|
+
[true, other.full.last]
|
194
|
+
)
|
195
|
+
|
196
|
+
elsif other.size == 2 &&
|
197
|
+
size == 1 &&
|
198
|
+
other.interact_terms.first == value &&
|
199
|
+
other.full.first == full.first &&
|
200
|
+
other.full.last == false
|
201
|
+
Token.new(
|
202
|
+
"#{value}:#{other.interact_terms.last}",
|
203
|
+
[other.full.first, true]
|
204
|
+
)
|
205
|
+
|
206
|
+
elsif value == '1' &&
|
207
|
+
other.size == 1
|
208
|
+
Token.new(other.value, true)
|
209
|
+
end
|
210
|
+
end
|
211
|
+
|
212
|
+
def ==(other)
|
213
|
+
value == other.value &&
|
214
|
+
full == other.full
|
215
|
+
end
|
216
|
+
|
217
|
+
alias eql? ==
|
218
|
+
|
219
|
+
def hash
|
220
|
+
value.hash ^ full.hash
|
221
|
+
end
|
222
|
+
|
223
|
+
def <=>(other)
|
224
|
+
size <=> other.size
|
225
|
+
end
|
226
|
+
|
227
|
+
def to_s
|
228
|
+
interact_terms
|
229
|
+
.zip(full)
|
230
|
+
.map { |t, f| f ? t : t + '(-)' }
|
231
|
+
.join ':'
|
232
|
+
end
|
233
|
+
|
234
|
+
def expand
|
235
|
+
case size
|
236
|
+
when 0
|
237
|
+
[self]
|
238
|
+
when 1
|
239
|
+
[Token.new('1'), Token.new(value, false)]
|
240
|
+
when 2
|
241
|
+
a, b = interact_terms
|
242
|
+
[Token.new('1'), Token.new(a, false), Token.new(b, false),
|
243
|
+
Token.new(a + ':' + b, [false, false])]
|
244
|
+
end
|
245
|
+
end
|
246
|
+
|
247
|
+
def to_df(df)
|
248
|
+
case size
|
249
|
+
when 1
|
250
|
+
if df[value].category?
|
251
|
+
df[value].contrast_code full: full.first
|
252
|
+
else
|
253
|
+
Daru::DataFrame.new value => df[value].to_a
|
254
|
+
end
|
255
|
+
when 2
|
256
|
+
to_df_when_interaction(df)
|
257
|
+
end
|
258
|
+
end
|
259
|
+
|
260
|
+
private
|
261
|
+
|
262
|
+
def coerce_full(value)
|
263
|
+
if value.is_a? Array
|
264
|
+
value + Array.new((@interact_terms.size - value.size), true)
|
265
|
+
else
|
266
|
+
[value] * @interact_terms.size
|
267
|
+
end
|
268
|
+
end
|
269
|
+
|
270
|
+
def to_df_when_interaction(df)
|
271
|
+
case interact_terms.map { |t| df[t].category? }
|
272
|
+
when [true, true]
|
273
|
+
df.interact_code(interact_terms, full)
|
274
|
+
when [false, false]
|
275
|
+
to_df_numeric_interact_with_numeric df
|
276
|
+
when [true, false]
|
277
|
+
to_df_category_interact_with_numeric df
|
278
|
+
when [false, true]
|
279
|
+
to_df_numeric_interact_with_category df
|
280
|
+
end
|
281
|
+
end
|
282
|
+
|
283
|
+
def to_df_numeric_interact_with_numeric(df)
|
284
|
+
Daru::DataFrame.new value => (df[interact_terms.first] *
|
285
|
+
df[interact_terms.last]).to_a
|
286
|
+
end
|
287
|
+
|
288
|
+
def to_df_category_interact_with_numeric(df)
|
289
|
+
a, b = interact_terms
|
290
|
+
Daru::DataFrame.new(
|
291
|
+
df[a].contrast_code(full: full.first)
|
292
|
+
.map { |dv| ["#{dv.name}:#{b}", (dv * df[b]).to_a] }
|
293
|
+
.to_h
|
294
|
+
)
|
295
|
+
end
|
296
|
+
|
297
|
+
def to_df_numeric_interact_with_category(df)
|
298
|
+
a, b = interact_terms
|
299
|
+
Daru::DataFrame.new(
|
300
|
+
df[b].contrast_code(full: full.last)
|
301
|
+
.map { |dv| ["#{a}:#{dv.name}", (dv * df[a]).to_a] }
|
302
|
+
.to_h
|
303
|
+
)
|
304
|
+
end
|
305
|
+
end
|
306
|
+
end
|