statsample-ekatena 2.0.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +15 -0
- data/.travis.yml +23 -0
- data/CONTRIBUTING.md +17 -0
- data/Gemfile +2 -0
- data/History.txt +457 -0
- data/LICENSE.txt +12 -0
- data/README.md +175 -0
- data/Rakefile +44 -0
- data/benchmarks/correlation_matrix_15_variables.rb +32 -0
- data/benchmarks/correlation_matrix_5_variables.rb +33 -0
- data/benchmarks/correlation_matrix_methods/correlation_matrix.ds +0 -0
- data/benchmarks/correlation_matrix_methods/correlation_matrix.html +93 -0
- data/benchmarks/correlation_matrix_methods/correlation_matrix.rb +71 -0
- data/benchmarks/correlation_matrix_methods/correlation_matrix.xls +0 -0
- data/benchmarks/correlation_matrix_methods/correlation_matrix_gsl_ruby.ods +0 -0
- data/benchmarks/correlation_matrix_methods/correlation_matrix_with_graphics.ods +0 -0
- data/benchmarks/correlation_matrix_methods/results.ds +0 -0
- data/benchmarks/factor_map.rb +37 -0
- data/benchmarks/helpers_benchmark.rb +5 -0
- data/data/locale/es/LC_MESSAGES/statsample.mo +0 -0
- data/doc_latex/manual/equations.tex +78 -0
- data/examples/boxplot.rb +28 -0
- data/examples/chisquare_test.rb +23 -0
- data/examples/correlation_matrix.rb +32 -0
- data/examples/dataset.rb +30 -0
- data/examples/dominance_analysis.rb +33 -0
- data/examples/dominance_analysis_bootstrap.rb +32 -0
- data/examples/histogram.rb +26 -0
- data/examples/icc.rb +24 -0
- data/examples/levene.rb +29 -0
- data/examples/multiple_regression.rb +20 -0
- data/examples/multivariate_correlation.rb +33 -0
- data/examples/parallel_analysis.rb +40 -0
- data/examples/polychoric.rb +40 -0
- data/examples/principal_axis.rb +26 -0
- data/examples/reliability.rb +31 -0
- data/examples/scatterplot.rb +25 -0
- data/examples/t_test.rb +27 -0
- data/examples/tetrachoric.rb +17 -0
- data/examples/u_test.rb +24 -0
- data/examples/vector.rb +20 -0
- data/examples/velicer_map_test.rb +46 -0
- data/grab_references.rb +29 -0
- data/lib/spss.rb +134 -0
- data/lib/statsample-ekatena/analysis.rb +100 -0
- data/lib/statsample-ekatena/analysis/suite.rb +89 -0
- data/lib/statsample-ekatena/analysis/suitereportbuilder.rb +44 -0
- data/lib/statsample-ekatena/anova.rb +24 -0
- data/lib/statsample-ekatena/anova/contrast.rb +79 -0
- data/lib/statsample-ekatena/anova/oneway.rb +187 -0
- data/lib/statsample-ekatena/anova/twoway.rb +207 -0
- data/lib/statsample-ekatena/bivariate.rb +406 -0
- data/lib/statsample-ekatena/bivariate/pearson.rb +54 -0
- data/lib/statsample-ekatena/codification.rb +182 -0
- data/lib/statsample-ekatena/converter/csv.rb +28 -0
- data/lib/statsample-ekatena/converter/spss.rb +48 -0
- data/lib/statsample-ekatena/converters.rb +211 -0
- data/lib/statsample-ekatena/crosstab.rb +188 -0
- data/lib/statsample-ekatena/daru.rb +115 -0
- data/lib/statsample-ekatena/dataset.rb +10 -0
- data/lib/statsample-ekatena/dominanceanalysis.rb +425 -0
- data/lib/statsample-ekatena/dominanceanalysis/bootstrap.rb +232 -0
- data/lib/statsample-ekatena/factor.rb +104 -0
- data/lib/statsample-ekatena/factor/map.rb +124 -0
- data/lib/statsample-ekatena/factor/parallelanalysis.rb +166 -0
- data/lib/statsample-ekatena/factor/pca.rb +242 -0
- data/lib/statsample-ekatena/factor/principalaxis.rb +243 -0
- data/lib/statsample-ekatena/factor/rotation.rb +198 -0
- data/lib/statsample-ekatena/formula/fit_model.rb +46 -0
- data/lib/statsample-ekatena/formula/formula.rb +306 -0
- data/lib/statsample-ekatena/graph.rb +11 -0
- data/lib/statsample-ekatena/graph/boxplot.rb +236 -0
- data/lib/statsample-ekatena/graph/histogram.rb +198 -0
- data/lib/statsample-ekatena/graph/scatterplot.rb +213 -0
- data/lib/statsample-ekatena/histogram.rb +180 -0
- data/lib/statsample-ekatena/matrix.rb +329 -0
- data/lib/statsample-ekatena/multiset.rb +310 -0
- data/lib/statsample-ekatena/regression.rb +65 -0
- data/lib/statsample-ekatena/regression/multiple.rb +89 -0
- data/lib/statsample-ekatena/regression/multiple/alglibengine.rb +128 -0
- data/lib/statsample-ekatena/regression/multiple/baseengine.rb +251 -0
- data/lib/statsample-ekatena/regression/multiple/gslengine.rb +129 -0
- data/lib/statsample-ekatena/regression/multiple/matrixengine.rb +205 -0
- data/lib/statsample-ekatena/regression/multiple/rubyengine.rb +86 -0
- data/lib/statsample-ekatena/regression/simple.rb +121 -0
- data/lib/statsample-ekatena/reliability.rb +150 -0
- data/lib/statsample-ekatena/reliability/icc.rb +415 -0
- data/lib/statsample-ekatena/reliability/multiscaleanalysis.rb +181 -0
- data/lib/statsample-ekatena/reliability/scaleanalysis.rb +233 -0
- data/lib/statsample-ekatena/reliability/skillscaleanalysis.rb +114 -0
- data/lib/statsample-ekatena/resample.rb +15 -0
- data/lib/statsample-ekatena/shorthand.rb +125 -0
- data/lib/statsample-ekatena/srs.rb +169 -0
- data/lib/statsample-ekatena/test.rb +82 -0
- data/lib/statsample-ekatena/test/bartlettsphericity.rb +45 -0
- data/lib/statsample-ekatena/test/chisquare.rb +73 -0
- data/lib/statsample-ekatena/test/f.rb +52 -0
- data/lib/statsample-ekatena/test/kolmogorovsmirnov.rb +63 -0
- data/lib/statsample-ekatena/test/levene.rb +88 -0
- data/lib/statsample-ekatena/test/t.rb +309 -0
- data/lib/statsample-ekatena/test/umannwhitney.rb +208 -0
- data/lib/statsample-ekatena/test/wilcoxonsignedrank.rb +90 -0
- data/lib/statsample-ekatena/vector.rb +19 -0
- data/lib/statsample-ekatena/version.rb +3 -0
- data/lib/statsample.rb +282 -0
- data/po/es/statsample.mo +0 -0
- data/po/es/statsample.po +959 -0
- data/po/statsample.pot +947 -0
- data/references.txt +24 -0
- data/statsample-ekatena.gemspec +49 -0
- data/test/fixtures/bank2.dat +200 -0
- data/test/fixtures/correlation_matrix.rb +17 -0
- data/test/fixtures/df.csv +15 -0
- data/test/fixtures/hartman_23.matrix +9 -0
- data/test/fixtures/stock_data.csv +500 -0
- data/test/fixtures/tetmat_matrix.txt +5 -0
- data/test/fixtures/tetmat_test.txt +1001 -0
- data/test/helpers_tests.rb +83 -0
- data/test/test_analysis.rb +176 -0
- data/test/test_anova_contrast.rb +36 -0
- data/test/test_anovaoneway.rb +26 -0
- data/test/test_anovatwoway.rb +37 -0
- data/test/test_anovatwowaywithdataset.rb +47 -0
- data/test/test_anovawithvectors.rb +102 -0
- data/test/test_awesome_print_bug.rb +16 -0
- data/test/test_bartlettsphericity.rb +25 -0
- data/test/test_bivariate.rb +164 -0
- data/test/test_codification.rb +78 -0
- data/test/test_crosstab.rb +67 -0
- data/test/test_dominance_analysis.rb +39 -0
- data/test/test_factor.rb +228 -0
- data/test/test_factor_map.rb +38 -0
- data/test/test_factor_pa.rb +56 -0
- data/test/test_fit_model.rb +88 -0
- data/test/test_ggobi.rb +35 -0
- data/test/test_gsl.rb +15 -0
- data/test/test_histogram.rb +109 -0
- data/test/test_matrix.rb +48 -0
- data/test/test_multiset.rb +176 -0
- data/test/test_regression.rb +231 -0
- data/test/test_reliability.rb +223 -0
- data/test/test_reliability_icc.rb +198 -0
- data/test/test_reliability_skillscale.rb +57 -0
- data/test/test_resample.rb +24 -0
- data/test/test_srs.rb +9 -0
- data/test/test_statistics.rb +69 -0
- data/test/test_stest.rb +69 -0
- data/test/test_stratified.rb +17 -0
- data/test/test_test_f.rb +33 -0
- data/test/test_test_kolmogorovsmirnov.rb +34 -0
- data/test/test_test_t.rb +62 -0
- data/test/test_umannwhitney.rb +27 -0
- data/test/test_vector.rb +12 -0
- data/test/test_wilcoxonsignedrank.rb +64 -0
- metadata +570 -0
@@ -0,0 +1,198 @@
|
|
1
|
+
module Statsample
|
2
|
+
module Factor
|
3
|
+
# Base class for component matrix rotation.
#
# == Reference:
# * SPSS Manual
# * Lin, J. (2007). VARIMAX_K58 [Source code]. [http://www.johnny-lin.com/idl_code/varimax_k58.pro]
#
# Use subclasses Varimax, Equimax or Quartimax for desired type of rotation.
# A subclass must provide +rotation_name+ (used to build the report name)
# and the two criterion methods +x(a,b,c,d)+ and +y(a,b,c,d)+, whose results
# are fed to atan2 to obtain the rotation angle of every factor pair.
#
# Use:
#   a = Matrix[ [ 0.4320,  0.8129,  0.3872]
#             , [ 0.7950, -0.5416,  0.2565]
#             , [ 0.5944,  0.7234, -0.3441]
#             , [ 0.8945, -0.3921, -0.1863] ]
#   rotation = Statsample::Factor::Varimax.new(a)
#   rotation.iterate
#   p rotation.rotated
#   p rotation.component_transformation_matrix
#
class Rotation
  EPSILON=1e-15
  MAX_ITERATIONS=25
  include Summarizable
  include DirtyMemoize
  attr_reader :iterations, :rotated, :component_transformation_matrix, :h2
  # Maximum number of iterations
  attr_accessor :max_iterations
  # Maximum precision
  attr_accessor :epsilon
  attr_accessor :use_gsl
  dirty_writer :max_iterations, :epsilon
  dirty_memoize :iterations, :rotated, :component_transformation_matrix, :h2

  # matrix:: component matrix to rotate (rows are variables, columns are factors)
  # opts::   hash of attribute => value; every key that has a public writer
  #          (e.g. :max_iterations, :epsilon, :use_gsl) is assigned.
  def initialize(matrix, opts=Hash.new)
    @name=_("%s rotation") % rotation_name
    @matrix=matrix
    @n=@matrix.row_size # Variables, p on original
    @m=@matrix.column_size # Factors, r on original
    @component_transformation_matrix=nil
    @max_iterations=MAX_ITERATIONS
    @epsilon=EPSILON
    @rotated=nil
    # Row sums of the squared loadings (exposed as #communalities)
    @h2=(@matrix.collect {|c| c**2} * Matrix.column_vector([1]*@m)).column(0).to_a
    @use_gsl=Statsample.has_gsl?
    opts.each{|k,v|
      # Check for the writer we are about to call; checking the reader
      # made read-only attributes (e.g. :rotated) raise NoMethodError.
      self.send("#{k}=",v) if self.respond_to? "#{k}="
    }
  end

  # Adds the rotated matrix and the transformation matrix to a report section.
  def report_building(g)
    g.section(:name=>@name) do |s|
      s.parse_element(rotated)
      s.parse_element(component_transformation_matrix)
    end
  end
  alias_method :communalities, :h2
  alias_method :rotated_component_matrix, :rotated

  def compute
    iterate
  end

  # Start iteration.
  #
  # Rows of the loading matrix are first divided by the square root of their
  # communality, then every pair of factors is rotated in turn until a
  # complete sweep leaves all pairs untouched (sin(|phi|) < epsilon).
  # The loop stops once @iterations exceeds max_iterations, so up to
  # max_iterations + 1 sweeps may be executed.
  #
  # Sets @rotated, @component_transformation_matrix and @iterations and
  # returns the rotated matrix.
  def iterate
    k_matrix=@use_gsl ? GSL::Matrix : ::Matrix
    t=k_matrix.identity(@m)
    loadings=(@use_gsl ? @matrix.to_gsl : @matrix.dup)
    h=k_matrix.diagonal(*@h2).collect {|c| Math::sqrt(c)}
    h_inverse=h.collect {|c| c!=0 ? 1/c : 0 }
    bh=h_inverse * loadings
    # Number of distinct factor pairs visited on every sweep
    num_pairs=@m*(@m-1)/2
    @not_converged=true
    @iterations=0
    while @not_converged
      break if @iterations>@max_iterations
      @iterations+=1
      unrotated=0
      (0..(@m-2)).each do |i| #+ go through factor index 0:r-1-1 (begin)
        ((i+1)..(@m-1)).each do |j| #+ pair i to "rest" of factors (begin)

          xx = bh.column(i)
          yy = bh.column(j)
          tx = t.column(i)
          ty = t.column(j)

          uu = @n.times.collect {|var_i| xx[var_i]**2-yy[var_i]**2}
          vv = @n.times.collect {|var_i| 2*xx[var_i]*yy[var_i]}

          a = @n.times.inject(0) {|ac,var_i| ac+ uu[var_i] }
          b = @n.times.inject(0) {|ac,var_i| ac+ vv[var_i] }
          c = @n.times.inject(0) {|ac,var_i| ac+ (uu[var_i]**2 - vv[var_i]**2) }
          d = @n.times.inject(0) {|ac,var_i| ac+ (2*uu[var_i]*vv[var_i]) }
          num=x(a,b,c,d)
          den=y(a,b,c,d)
          phi=Math::atan2(num,den) / 4.0

          if(Math::sin(phi.abs) >= @epsilon)
            xx_rot=( Math::cos(phi)*xx)+(Math::sin(phi)*yy)
            yy_rot=((-Math::sin(phi))*xx)+(Math::cos(phi)*yy)

            tx_rot=( Math::cos(phi)*tx)+(Math::sin(phi)*ty)
            ty_rot=((-Math::sin(phi))*tx)+(Math::cos(phi)*ty)

            # Write the rotated columns back through plain arrays and
            # rebuild the matrices with the selected backend.
            bh=bh.to_a
            @n.times {|row_i|
              bh[row_i][i] = xx_rot[row_i]
              bh[row_i][j] = yy_rot[row_i]
            }
            t=t.to_a
            @m.times {|row_i|
              t[row_i][i]=tx_rot[row_i]
              t[row_i][j]=ty_rot[row_i]
            }
            bh=k_matrix[*bh]
            t=k_matrix[*t]
          else
            unrotated+=1
          end # if
        end #j
      end #i
      # Converged when no pair needed rotation on this sweep. This also
      # covers matrices with fewer than two factors, which have no pairs.
      @not_converged=false if unrotated==num_pairs
    end # while
    @rotated=h*bh
    @rotated.extend CovariateMatrix
    @rotated.name=_("Rotated Component matrix")

    if @matrix.respond_to? :fields_x
      @rotated.fields_x = @matrix.fields_x
    else
      @rotated.fields_x = @n.times.map {|i| "var_#{i+1}"}
    end
    if @matrix.respond_to? :fields_y
      @rotated.fields_y = @matrix.fields_y
    else
      @rotated.fields_y = @m.times.map {|i| "var_#{i+1}"}
    end

    @component_transformation_matrix=t
    @component_transformation_matrix.extend CovariateMatrix
    @component_transformation_matrix.name=_("Component transformation matrix")

    if @matrix.respond_to? :fields_y
      @component_transformation_matrix.fields = @matrix.fields_y
    else
      @component_transformation_matrix.fields = @m.times.map {|i| "var_#{i+1}"}
    end

    @rotated
  end

end
|
162
|
+
# Rotation whose reported name is "Varimax".
# +x+ and +y+ receive the four pair sums computed by Rotation#iterate and
# return the two arguments handed to atan2; @n is the number of variables.
class Varimax < Rotation
  def rotation_name
    "Varimax"
  end

  # First atan2 argument: d minus 2ab / n.
  def x(a, b, c, d)
    correction = (2 * a * b) / @n.to_f
    d - correction
  end

  # Second atan2 argument: c minus (a^2 - b^2) / n.
  def y(a, b, c, d)
    squares_gap = a**2 - b**2
    c - (squares_gap / @n.to_f)
  end
end
|
173
|
+
# Rotation whose reported name is "Equimax".
# +x+ and +y+ receive the four pair sums computed by Rotation#iterate and
# return the two arguments handed to atan2; @n is the number of variables
# and @m the number of factors.
class Equimax < Rotation
  def rotation_name
    "Equimax"
  end

  # First atan2 argument: d minus m*a*b / n.
  def x(a, b, c, d)
    scaled_product = @m * a * b
    d - (scaled_product / @n.to_f)
  end

  # Second atan2 argument: c minus m * (a^2 - b^2) / (2n).
  def y(a, b, c, d)
    half_ratio = (a**2 - b**2) / (2 * @n.to_f)
    c - @m * half_ratio
  end
end
|
185
|
+
# Rotation whose reported name is "Quartimax".
# The atan2 arguments are the pair sums +d+ and +c+ themselves, with no
# correction term; +a+ and +b+ are accepted only to keep the signature
# shared with the other rotations.
class Quartimax < Rotation
  def rotation_name
    "Quartimax"
  end

  # First atan2 argument.
  def x(a, b, c, d)
    d
  end

  # Second atan2 argument.
  def y(a, b, c, d)
    c
  end
end
|
197
|
+
end
|
198
|
+
end
|
@@ -0,0 +1,46 @@
|
|
1
|
+
require 'statsample/formula/formula'
|
2
|
+
|
3
|
+
module Statsample
|
4
|
+
# Class for performing regression from a formula string and a dataframe.
class FitModel
  # @param formula [String] formula handed to FormulaWrapper
  # @param df dataframe holding the vectors named in the formula
  # @param opts [Hash] options forwarded to Statsample::Regression.multiple
  def initialize(formula, df, opts = {})
    @formula = FormulaWrapper.new formula, df
    @df = df
    @opts = opts
  end

  # Returns the fitted model, fitting it on first use.
  def model
    @model || fit_model
  end

  # Predicts using the fitted model on the canonicalized form of +new_data+.
  def predict(new_data)
    model.predict(df_for_prediction(new_data))
  end

  # Dataframe of predictor columns built from +df+.
  def df_for_prediction(df)
    canonicalize_df(df)
  end

  # Dataframe of predictor columns plus the response vector.
  def df_for_regression
    df = canonicalize_df(@df)
    df[@formula.y.value] = @df[@formula.y.value]
    df
  end

  # Expands every canonical token of the formula into columns of
  # +orig_df+ and merges them into a single dataframe.
  # A leading intercept token ('1') is skipped.
  def canonicalize_df(orig_df)
    tokens = @formula.canonical_tokens
    leading = tokens.first
    # Work on a copy: shifting in place would permanently strip the
    # intercept from the formula object shared with other callers.
    tokens = tokens.drop(1) if leading && leading.value == '1'
    tokens.map { |t| t.to_df orig_df }.reduce(&:merge)
  end

  # Fits the multiple regression and caches it in @model.
  def fit_model
    # TODO: Add support for inclusion/exclusion of intercept
    @model = Statsample::Regression.multiple(
      df_for_regression,
      @formula.y.value,
      @opts
    )
  end
end
|
46
|
+
end
|
@@ -0,0 +1,306 @@
|
|
1
|
+
module Statsample
|
2
|
+
# This class recognizes what terms are numeric
# and accordingly forms groups which are fed to Formula
# Once they are parsed with Formula, they are combined back
class FormulaWrapper
  attr_reader :tokens, :y, :canonical_tokens

  # Initializes formula wrapper object to parse a given formula into
  # some tokens which do not overlap one another.
  # @note Specify 0 as a term in the formula if you do not want constant
  #   to be included in the parsed formula
  # @param [String] formula to parse
  # @param [Daru::DataFrame] df dataframe required to know what vectors
  #   are numerical
  # @raise [ArgumentError] if the formula has no '~' or no right-hand side
  # @example
  #   df = Daru::DataFrame.from_csv 'test/fixtures/df.csv'
  #   df.to_category 'c', 'd', 'e'
  #   formula = Statsample::FormulaWrapper.new 'y~a+d:c', df
  #   formula.canonical_to_s
  #   #=> "1+c(-)+d(-):c+a"
  def initialize(formula, df)
    @df = df
    # @y store the LHS term that is name of vector to be predicted
    # @tokens store the RHS terms of the formula
    @y, *@tokens = split_to_tokens(formula)
    @tokens = @tokens.uniq.sort
    manage_constant_term
    @canonical_tokens = non_redundant_tokens
  end

  # Returns canonical tokens in a readable form.
  # @return [String] canonical tokens in a readable form.
  # @note 'y~a+b(-)' means 'a' exist in full rank expansion
  #   and 'b(-)' exist in reduced rank expansion
  # @example
  #   df = Daru::DataFrame.from_csv 'test/fixtures/df.csv'
  #   df.to_category 'c', 'd', 'e'
  #   formula = Statsample::FormulaWrapper.new 'y~a+d:c', df
  #   formula.canonical_to_s
  #   #=> "1+c(-)+d(-):c+a"
  def canonical_to_s
    canonical_tokens.join '+'
  end

  # Returns tokens to produce non-redundant design matrix
  # @return [Array] array of tokens that do not produce redundant matrix
  def non_redundant_tokens
    groups = split_to_groups
    # TODO: An enhancement
    # Right now x:c appears as c:x
    groups.each { |k, v| groups[k] = strip_numeric v, k }
    groups.each { |k, v| groups[k] = Formula.new(v).canonical_tokens }
    groups.flat_map { |k, v| add_numeric v, k }
  end

  private

  # Removes intercept token if term '0' is found in the formula.
  # Intercept token remains if term '1' is found.
  # If neither term '0' nor term '1' is found then, intercept token is added.
  def manage_constant_term
    @tokens.unshift Token.new('1') unless
      @tokens.include?(Token.new('1')) ||
      @tokens.include?(Token.new('0'))
    @tokens.delete Token.new('0')
  end

  # Groups the tokens to groups based on the numerical terms
  # they are interacting with.
  def split_to_groups
    @tokens.group_by { |t| extract_numeric t }
  end

  # Add numeric interaction term which was removed earlier
  # @param [Array] tokens tokens on which to add numerical terms
  # @param [Array] numeric array of numeric terms to add
  def add_numeric(tokens, numeric)
    tokens.map do |t|
      terms = t.interact_terms + numeric
      if terms == ['1']
        Token.new('1')
      else
        terms = terms.reject { |i| i == '1' }
        Token.new terms.join(':'), t.full
      end
    end
  end

  # Strip numerical interacting terms
  # @param [Array] tokens tokens from which to strip numeric
  # @param [Array] numeric array of numeric terms to strip from tokens
  # @return [Array] array of tokens with stripped numerical terms
  def strip_numeric(tokens, numeric)
    tokens.map do |t|
      terms = t.interact_terms - numeric
      terms = ['1'] if terms.empty?
      Token.new terms.join(':')
    end
  end

  # Extract numeric interacting terms
  # @param [Statsample::Token] token from which to extract numeric terms
  # @return [Array] array of numerical terms
  def extract_numeric(token)
    terms = token.interact_terms
    return [] if terms == ['1']
    terms.reject { |t| @df[t].category? }
  end

  # Splits 'lhs~t1+t2+...' (whitespace ignored) into tokens,
  # the LHS token first followed by one token per RHS term.
  # @raise [ArgumentError] if there is nothing after (or no) '~'
  def split_to_tokens(formula)
    formula = formula.gsub(/\s+/, '')
    lhs_term, rhs = formula.split '~'
    # Without this guard a missing RHS surfaced as NoMethodError on nil
    if rhs.nil?
      raise ArgumentError,
            "formula must have the form 'y~a+b' (got #{formula.inspect})"
    end
    rhs_terms = rhs.split '+'
    ([lhs_term] + rhs_terms).map { |t| Token.new t }
  end
end
|
117
|
+
|
118
|
+
# Reduces a list of tokens to a canonical, non-overlapping list.
# Tokens are duck-typed: they must answer +value+, +expand+, +add+
# and be sortable and comparable for array subtraction.
class Formula
  attr_reader :tokens, :canonical_tokens

  # @param tokens [Array] tokens to canonicalize, processed in order
  def initialize(tokens)
    @tokens = tokens
    @canonical_tokens = parse_formula
  end

  # @return [String] canonical tokens joined with '+'
  def canonical_to_s
    canonical_tokens.join '+'
  end

  private

  # Walks the tokens in order, appending for each one only the pieces
  # that the tokens accepted so far do not already cover.
  def parse_formula
    @tokens.each_with_object([]) do |token, canonical|
      canonical.concat add_non_redundant_elements(token, canonical)
    end
  end

  # The intercept token ('1') is kept as is. Any other token is expanded,
  # the expansions of +result_so_far+ are removed from it and what is
  # left is contracted again.
  def add_non_redundant_elements(token, result_so_far)
    return [token] if token.value == '1'
    pieces = token.expand
    covered = result_so_far.flat_map(&:expand)
    contract_if_possible(pieces - covered)
  end

  # Repeatedly replaces the first pair of tokens that +add+ can merge
  # by the merged token, until no pair merges; returns them sorted.
  def contract_if_possible(tokens)
    loop do
      merged = nil
      pair = tokens.combination(2).find { |left, right| merged = left.add(right) }
      break unless pair
      pair.each { |member| tokens.delete member }
      tokens << merged
    end
    tokens.sort
  end
end
|
159
|
+
|
160
|
+
# To encapsulate interaction as well as non-interaction terms.
# A token holds the terms of one formula term ('a' or 'a:b') and, per
# term, a flag telling whether it is in full (true) or reduced (false,
# printed as 'a(-)') rank expansion.
class Token
  attr_reader :full, :interact_terms

  # @param value [String] term name, interacting terms separated by ':'
  # @param full [Boolean, Array<Boolean>] rank flag for all terms, or one
  #   flag per term (missing trailing flags default to true)
  def initialize(value, full = true)
    @interact_terms = value.include?(':') ? value.split(':') : [value]
    @full = coerce_full full
  end

  # @return [String] the terms joined back with ':'
  def value
    interact_terms.join(':')
  end

  # Number of interacting terms; the intercept token '1' has size 0.
  def size
    # TODO: Return size 1 for value '1' also
    # CAn't do this at the moment because have to make
    # changes in sorting first
    value == '1' ? 0 : interact_terms.size
  end

  # Tries to merge two tokens into one of higher rank.
  # @return [Token, nil] merged token, or nil when they cannot be merged
  def add(other)
    # ANYTHING + FACTOR- : ANYTHING = FACTOR : ANYTHING
    # ANYTHING + ANYTHING : FACTOR- = ANYTHING : FACTOR
    if size > other.size
      other.add self

    elsif other.size == 2 &&
          size == 1 &&
          other.interact_terms.last == value &&
          other.full.last == full.first &&
          other.full.first == false
      Token.new(
        "#{other.interact_terms.first}:#{value}",
        [true, other.full.last]
      )

    elsif other.size == 2 &&
          size == 1 &&
          other.interact_terms.first == value &&
          other.full.first == full.first &&
          other.full.last == false
      Token.new(
        "#{value}:#{other.interact_terms.last}",
        [other.full.first, true]
      )

    elsif value == '1' &&
          other.size == 1
      Token.new(other.value, true)
    end
  end

  # Tokens are equal when both value and rank flags match.
  # Anything that is not a Token is simply unequal (no NoMethodError).
  def ==(other)
    other.is_a?(Token) &&
      value == other.value &&
      full == other.full
  end

  alias eql? ==

  def hash
    value.hash ^ full.hash
  end

  # Orders tokens by size only; nil for non-Token operands.
  def <=>(other)
    return nil unless other.is_a?(Token)
    size <=> other.size
  end

  # @return [String] terms joined with ':', reduced-rank ones as 'x(-)'
  def to_s
    interact_terms
      .zip(full)
      .map { |t, f| f ? t : t + '(-)' }
      .join ':'
  end

  # Expands the token into intercept plus reduced-rank pieces.
  # @return [Array<Token>, nil] nil for tokens of more than two terms
  def expand
    case size
    when 0
      [self]
    when 1
      [Token.new('1'), Token.new(value, false)]
    when 2
      a, b = interact_terms
      [Token.new('1'), Token.new(a, false), Token.new(b, false),
       Token.new(a + ':' + b, [false, false])]
    end
  end

  # Builds the dataframe columns this token contributes, reading the
  # vectors from +df+. Returns nil for sizes other than 1 and 2.
  def to_df(df)
    case size
    when 1
      if df[value].category?
        df[value].contrast_code full: full.first
      else
        Daru::DataFrame.new value => df[value].to_a
      end
    when 2
      to_df_when_interaction(df)
    end
  end

  private

  # Normalizes the rank flags to one boolean per interacting term.
  def coerce_full(value)
    if value.is_a? Array
      value + Array.new((@interact_terms.size - value.size), true)
    else
      [value] * @interact_terms.size
    end
  end

  # Dispatches on which of the two terms are categorical in +df+.
  def to_df_when_interaction(df)
    case interact_terms.map { |t| df[t].category? }
    when [true, true]
      df.interact_code(interact_terms, full)
    when [false, false]
      to_df_numeric_interact_with_numeric df
    when [true, false]
      to_df_category_interact_with_numeric df
    when [false, true]
      to_df_numeric_interact_with_category df
    end
  end

  def to_df_numeric_interact_with_numeric(df)
    Daru::DataFrame.new value => (df[interact_terms.first] *
      df[interact_terms.last]).to_a
  end

  def to_df_category_interact_with_numeric(df)
    a, b = interact_terms
    Daru::DataFrame.new(
      df[a].contrast_code(full: full.first)
        .map { |dv| ["#{dv.name}:#{b}", (dv * df[b]).to_a] }
        .to_h
    )
  end

  def to_df_numeric_interact_with_category(df)
    a, b = interact_terms
    Daru::DataFrame.new(
      df[b].contrast_code(full: full.last)
        .map { |dv| ["#{a}:#{dv.name}", (dv * df[a]).to_a] }
        .to_h
    )
  end
end
|
306
|
+
end
|