statsample-ekatena 2.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +15 -0
- data/.travis.yml +23 -0
- data/CONTRIBUTING.md +17 -0
- data/Gemfile +2 -0
- data/History.txt +457 -0
- data/LICENSE.txt +12 -0
- data/README.md +175 -0
- data/Rakefile +44 -0
- data/benchmarks/correlation_matrix_15_variables.rb +32 -0
- data/benchmarks/correlation_matrix_5_variables.rb +33 -0
- data/benchmarks/correlation_matrix_methods/correlation_matrix.ds +0 -0
- data/benchmarks/correlation_matrix_methods/correlation_matrix.html +93 -0
- data/benchmarks/correlation_matrix_methods/correlation_matrix.rb +71 -0
- data/benchmarks/correlation_matrix_methods/correlation_matrix.xls +0 -0
- data/benchmarks/correlation_matrix_methods/correlation_matrix_gsl_ruby.ods +0 -0
- data/benchmarks/correlation_matrix_methods/correlation_matrix_with_graphics.ods +0 -0
- data/benchmarks/correlation_matrix_methods/results.ds +0 -0
- data/benchmarks/factor_map.rb +37 -0
- data/benchmarks/helpers_benchmark.rb +5 -0
- data/data/locale/es/LC_MESSAGES/statsample.mo +0 -0
- data/doc_latex/manual/equations.tex +78 -0
- data/examples/boxplot.rb +28 -0
- data/examples/chisquare_test.rb +23 -0
- data/examples/correlation_matrix.rb +32 -0
- data/examples/dataset.rb +30 -0
- data/examples/dominance_analysis.rb +33 -0
- data/examples/dominance_analysis_bootstrap.rb +32 -0
- data/examples/histogram.rb +26 -0
- data/examples/icc.rb +24 -0
- data/examples/levene.rb +29 -0
- data/examples/multiple_regression.rb +20 -0
- data/examples/multivariate_correlation.rb +33 -0
- data/examples/parallel_analysis.rb +40 -0
- data/examples/polychoric.rb +40 -0
- data/examples/principal_axis.rb +26 -0
- data/examples/reliability.rb +31 -0
- data/examples/scatterplot.rb +25 -0
- data/examples/t_test.rb +27 -0
- data/examples/tetrachoric.rb +17 -0
- data/examples/u_test.rb +24 -0
- data/examples/vector.rb +20 -0
- data/examples/velicer_map_test.rb +46 -0
- data/grab_references.rb +29 -0
- data/lib/spss.rb +134 -0
- data/lib/statsample-ekatena/analysis.rb +100 -0
- data/lib/statsample-ekatena/analysis/suite.rb +89 -0
- data/lib/statsample-ekatena/analysis/suitereportbuilder.rb +44 -0
- data/lib/statsample-ekatena/anova.rb +24 -0
- data/lib/statsample-ekatena/anova/contrast.rb +79 -0
- data/lib/statsample-ekatena/anova/oneway.rb +187 -0
- data/lib/statsample-ekatena/anova/twoway.rb +207 -0
- data/lib/statsample-ekatena/bivariate.rb +406 -0
- data/lib/statsample-ekatena/bivariate/pearson.rb +54 -0
- data/lib/statsample-ekatena/codification.rb +182 -0
- data/lib/statsample-ekatena/converter/csv.rb +28 -0
- data/lib/statsample-ekatena/converter/spss.rb +48 -0
- data/lib/statsample-ekatena/converters.rb +211 -0
- data/lib/statsample-ekatena/crosstab.rb +188 -0
- data/lib/statsample-ekatena/daru.rb +115 -0
- data/lib/statsample-ekatena/dataset.rb +10 -0
- data/lib/statsample-ekatena/dominanceanalysis.rb +425 -0
- data/lib/statsample-ekatena/dominanceanalysis/bootstrap.rb +232 -0
- data/lib/statsample-ekatena/factor.rb +104 -0
- data/lib/statsample-ekatena/factor/map.rb +124 -0
- data/lib/statsample-ekatena/factor/parallelanalysis.rb +166 -0
- data/lib/statsample-ekatena/factor/pca.rb +242 -0
- data/lib/statsample-ekatena/factor/principalaxis.rb +243 -0
- data/lib/statsample-ekatena/factor/rotation.rb +198 -0
- data/lib/statsample-ekatena/formula/fit_model.rb +46 -0
- data/lib/statsample-ekatena/formula/formula.rb +306 -0
- data/lib/statsample-ekatena/graph.rb +11 -0
- data/lib/statsample-ekatena/graph/boxplot.rb +236 -0
- data/lib/statsample-ekatena/graph/histogram.rb +198 -0
- data/lib/statsample-ekatena/graph/scatterplot.rb +213 -0
- data/lib/statsample-ekatena/histogram.rb +180 -0
- data/lib/statsample-ekatena/matrix.rb +329 -0
- data/lib/statsample-ekatena/multiset.rb +310 -0
- data/lib/statsample-ekatena/regression.rb +65 -0
- data/lib/statsample-ekatena/regression/multiple.rb +89 -0
- data/lib/statsample-ekatena/regression/multiple/alglibengine.rb +128 -0
- data/lib/statsample-ekatena/regression/multiple/baseengine.rb +251 -0
- data/lib/statsample-ekatena/regression/multiple/gslengine.rb +129 -0
- data/lib/statsample-ekatena/regression/multiple/matrixengine.rb +205 -0
- data/lib/statsample-ekatena/regression/multiple/rubyengine.rb +86 -0
- data/lib/statsample-ekatena/regression/simple.rb +121 -0
- data/lib/statsample-ekatena/reliability.rb +150 -0
- data/lib/statsample-ekatena/reliability/icc.rb +415 -0
- data/lib/statsample-ekatena/reliability/multiscaleanalysis.rb +181 -0
- data/lib/statsample-ekatena/reliability/scaleanalysis.rb +233 -0
- data/lib/statsample-ekatena/reliability/skillscaleanalysis.rb +114 -0
- data/lib/statsample-ekatena/resample.rb +15 -0
- data/lib/statsample-ekatena/shorthand.rb +125 -0
- data/lib/statsample-ekatena/srs.rb +169 -0
- data/lib/statsample-ekatena/test.rb +82 -0
- data/lib/statsample-ekatena/test/bartlettsphericity.rb +45 -0
- data/lib/statsample-ekatena/test/chisquare.rb +73 -0
- data/lib/statsample-ekatena/test/f.rb +52 -0
- data/lib/statsample-ekatena/test/kolmogorovsmirnov.rb +63 -0
- data/lib/statsample-ekatena/test/levene.rb +88 -0
- data/lib/statsample-ekatena/test/t.rb +309 -0
- data/lib/statsample-ekatena/test/umannwhitney.rb +208 -0
- data/lib/statsample-ekatena/test/wilcoxonsignedrank.rb +90 -0
- data/lib/statsample-ekatena/vector.rb +19 -0
- data/lib/statsample-ekatena/version.rb +3 -0
- data/lib/statsample.rb +282 -0
- data/po/es/statsample.mo +0 -0
- data/po/es/statsample.po +959 -0
- data/po/statsample.pot +947 -0
- data/references.txt +24 -0
- data/statsample-ekatena.gemspec +49 -0
- data/test/fixtures/bank2.dat +200 -0
- data/test/fixtures/correlation_matrix.rb +17 -0
- data/test/fixtures/df.csv +15 -0
- data/test/fixtures/hartman_23.matrix +9 -0
- data/test/fixtures/stock_data.csv +500 -0
- data/test/fixtures/tetmat_matrix.txt +5 -0
- data/test/fixtures/tetmat_test.txt +1001 -0
- data/test/helpers_tests.rb +83 -0
- data/test/test_analysis.rb +176 -0
- data/test/test_anova_contrast.rb +36 -0
- data/test/test_anovaoneway.rb +26 -0
- data/test/test_anovatwoway.rb +37 -0
- data/test/test_anovatwowaywithdataset.rb +47 -0
- data/test/test_anovawithvectors.rb +102 -0
- data/test/test_awesome_print_bug.rb +16 -0
- data/test/test_bartlettsphericity.rb +25 -0
- data/test/test_bivariate.rb +164 -0
- data/test/test_codification.rb +78 -0
- data/test/test_crosstab.rb +67 -0
- data/test/test_dominance_analysis.rb +39 -0
- data/test/test_factor.rb +228 -0
- data/test/test_factor_map.rb +38 -0
- data/test/test_factor_pa.rb +56 -0
- data/test/test_fit_model.rb +88 -0
- data/test/test_ggobi.rb +35 -0
- data/test/test_gsl.rb +15 -0
- data/test/test_histogram.rb +109 -0
- data/test/test_matrix.rb +48 -0
- data/test/test_multiset.rb +176 -0
- data/test/test_regression.rb +231 -0
- data/test/test_reliability.rb +223 -0
- data/test/test_reliability_icc.rb +198 -0
- data/test/test_reliability_skillscale.rb +57 -0
- data/test/test_resample.rb +24 -0
- data/test/test_srs.rb +9 -0
- data/test/test_statistics.rb +69 -0
- data/test/test_stest.rb +69 -0
- data/test/test_stratified.rb +17 -0
- data/test/test_test_f.rb +33 -0
- data/test/test_test_kolmogorovsmirnov.rb +34 -0
- data/test/test_test_t.rb +62 -0
- data/test/test_umannwhitney.rb +27 -0
- data/test/test_vector.rb +12 -0
- data/test/test_wilcoxonsignedrank.rb +64 -0
- metadata +570 -0
|
@@ -0,0 +1,198 @@
|
|
|
1
|
+
module Statsample
|
|
2
|
+
module Factor
|
|
3
|
+
# Base class for component matrix rotation.
|
|
4
|
+
#
|
|
5
|
+
# == Reference:
|
|
6
|
+
# * SPSS Manual
|
|
7
|
+
# * Lin, J. (2007). VARIMAX_K58 [Source code]. [http://www.johnny-lin.com/idl_code/varimax_k58.pro]
|
|
8
|
+
#
|
|
9
|
+
# Use subclasses Varimax, Equimax or Quartimax for desired type of rotation
|
|
10
|
+
# Use:
|
|
11
|
+
# a = Matrix[ [ 0.4320, 0.8129, 0.3872]
|
|
12
|
+
# , [ 0.7950, -0.5416, 0.2565]
|
|
13
|
+
# , [ 0.5944, 0.7234, -0.3441]
|
|
14
|
+
# , [ 0.8945, -0.3921, -0.1863] ]
|
|
15
|
+
#   rotation = Statsample::Factor::Varimax.new(a)
|
|
16
|
+
# rotation.iterate
|
|
17
|
+
# p rotation.rotated
|
|
18
|
+
# p rotation.component_transformation_matrix
|
|
19
|
+
#
|
|
20
|
+
class Rotation
  # Default convergence threshold; compared against sin(|phi|) of every
  # pairwise rotation angle in #iterate.
  EPSILON=1e-15
  # Default value for #max_iterations.
  MAX_ITERATIONS=25
  include Summarizable
  include DirtyMemoize
  attr_reader :iterations, :rotated, :component_transformation_matrix, :h2
  # Maximum number of iterations
  attr_accessor :max_iterations
  # Maximum precision
  attr_accessor :epsilon
  # When true, #iterate works on GSL::Matrix objects instead of ::Matrix.
  # Defaults to Statsample.has_gsl?.
  attr_accessor :use_gsl
  # NOTE(review): dirty_writer / dirty_memoize come from DirtyMemoize, which
  # is not visible here; presumably the readers trigger #compute lazily and
  # the writers invalidate the cached results -- confirm against that gem.
  dirty_writer :max_iterations, :epsilon
  dirty_memoize :iterations, :rotated, :component_transformation_matrix, :h2

  # Builds a rotation for a component (loading) matrix.
  #
  # matrix:: object responding to +row_size+, +column_size+, +collect+ and
  #          +dup+ (and +to_gsl+ when GSL is used). Rows are variables,
  #          columns are factors.
  # opts::   optional Hash of attribute => value; every key that has a
  #          matching public writer (e.g. :max_iterations, :epsilon,
  #          :use_gsl) is assigned, other keys are ignored.
  #
  # Subclasses must define +rotation_name+, +x+ and +y+.
  def initialize(matrix, opts=Hash.new)
    @name=_("%s rotation") % rotation_name
    @matrix=matrix
    @n=@matrix.row_size # Variables, p on original
    @m=@matrix.column_size # Factors, r on original
    @component_transformation_matrix=nil
    @max_iterations=MAX_ITERATIONS
    @epsilon=EPSILON
    @rotated=nil
    # Communalities: per-row sum of squared loadings.
    @h2=(@matrix.collect {|c| c**2} * Matrix.column_vector([1]*@m)).column(0).to_a
    @use_gsl=Statsample.has_gsl?
    opts.each do |k, v|
      # Check for the writer we are about to call. Testing the reader (as
      # before) raised NoMethodError for reader-only keys such as :rotated.
      writer = "#{k}="
      send(writer, v) if respond_to?(writer)
    end
  end

  # Adds a section named after this rotation to the report builder +g+,
  # containing the rotated matrix and the component transformation matrix.
  def report_building(g)
    g.section(:name=>@name) do |s|
      s.parse_element(rotated)
      s.parse_element(component_transformation_matrix)
    end
  end
  alias_method :communalities, :h2
  alias_method :rotated_component_matrix, :rotated

  # Entry point that simply delegates to #iterate.
  def compute
    iterate
  end

  # Start iteration
  #
  # Normalizes each row of the loading matrix by the square root of its
  # communality, then repeatedly sweeps over every pair of factor columns
  # (i, j), rotating the pair by the angle
  #   phi = atan2(x(a,b,c,d), y(a,b,c,d)) / 4
  # where +x+ and +y+ are supplied by the subclass. A sweep in which every
  # pair has sin(|phi|) < epsilon marks convergence. Afterwards the rows
  # are rescaled and the results are stored in @rotated and
  # @component_transformation_matrix.
  #
  # NOTE(review): the limit is tested with ">" before incrementing, so up
  # to max_iterations + 1 sweeps can run. Left unchanged to keep numerical
  # results identical -- confirm whether ">=" was intended.
  #
  # Returns the rotated matrix.
  def iterate
    k_matrix = @use_gsl ? GSL::Matrix : ::Matrix
    t = k_matrix.identity(@m)
    loadings = (@use_gsl ? @matrix.to_gsl : @matrix.dup)
    h = k_matrix.diagonal(*@h2).collect {|c| Math::sqrt(c)}
    # Rows with zero communality are left at zero instead of dividing by 0.
    h_inverse = h.collect {|c| c!=0 ? 1/c : 0 }
    bh = h_inverse * loadings
    @not_converged = true
    @iterations = 0
    while @not_converged
      break if @iterations > @max_iterations
      @iterations += 1
      # Number of column pairs still counted as "rotated" in this sweep.
      num_pairs = @m*(@m-1).quo(2)
      (0..(@m-2)).each do |i| # first factor of the pair
        ((i+1)..(@m-1)).each do |j| # pair i with each remaining factor
          xx = bh.column(i)
          yy = bh.column(j)
          tx = t.column(i)
          ty = t.column(j)

          uu = @n.times.collect {|var_i| xx[var_i]**2-yy[var_i]**2}
          vv = @n.times.collect {|var_i| 2*xx[var_i]*yy[var_i]}

          a = @n.times.inject(0) {|ac,var_i| ac+ uu[var_i] }
          b = @n.times.inject(0) {|ac,var_i| ac+ vv[var_i] }
          c = @n.times.inject(0) {|ac,var_i| ac+ (uu[var_i]**2 - vv[var_i]**2) }
          d = @n.times.inject(0) {|ac,var_i| ac+ (2*uu[var_i]*vv[var_i]) }
          num = x(a,b,c,d)
          den = y(a,b,c,d)
          phi = Math::atan2(num,den) / 4.0

          if Math::sin(phi.abs) >= @epsilon
            xx_rot = ( Math::cos(phi)*xx)+(Math::sin(phi)*yy)
            yy_rot = ((-Math::sin(phi))*xx)+(Math::cos(phi)*yy)

            tx_rot = ( Math::cos(phi)*tx)+(Math::sin(phi)*ty)
            ty_rot = ((-Math::sin(phi))*tx)+(Math::cos(phi)*ty)

            # Write the rotated columns back through plain arrays, then
            # rebuild matrices of the active backend.
            bh = bh.to_a
            @n.times {|row_i|
              bh[row_i][i] = xx_rot[row_i]
              bh[row_i][j] = yy_rot[row_i]
            }
            t = t.to_a
            @m.times {|row_i|
              t[row_i][i] = tx_rot[row_i]
              t[row_i][j] = ty_rot[row_i]
            }
            bh = k_matrix.[](*bh)
            t = k_matrix.[](*t)
          else
            num_pairs = num_pairs-1
            @not_converged = false if num_pairs==0
          end # if
        end #j
      end #i
    end # while
    @rotated = h*bh
    @rotated.extend CovariateMatrix
    @rotated.name = _("Rotated Component matrix")

    if @matrix.respond_to? :fields_x
      @rotated.fields_x = @matrix.fields_x
    else
      @rotated.fields_x = @n.times.map {|i| "var_#{i+1}"}
    end
    if @matrix.respond_to? :fields_y
      @rotated.fields_y = @matrix.fields_y
    else
      @rotated.fields_y = @m.times.map {|i| "var_#{i+1}"}
    end

    @component_transformation_matrix = t
    @component_transformation_matrix.extend CovariateMatrix
    @component_transformation_matrix.name = _("Component transformation matrix")

    if @matrix.respond_to? :fields_y
      @component_transformation_matrix.fields = @matrix.fields_y
    else
      @component_transformation_matrix.fields = @m.times.map {|i| "var_#{i+1}"}
    end

    @rotated
  end

end
|
|
162
|
+
class Varimax < Rotation
  # Numerator handed to atan2 by Rotation#iterate when computing the
  # rotation angle of a factor pair. +c+ is accepted but not used.
  def x(a, b, c, d)
    cross_term = 2 * a * b
    d - (cross_term / @n.to_f)
  end

  # Denominator handed to atan2 by Rotation#iterate. +d+ is accepted but
  # not used.
  def y(a, b, c, d)
    squared_diff = a**2 - b**2
    c - (squared_diff / @n.to_f)
  end

  # Label interpolated into the rotation's display name.
  def rotation_name
    "Varimax"
  end
end
|
|
173
|
+
class Equimax < Rotation
  # Numerator handed to atan2 by Rotation#iterate; the cross term is
  # weighted by the number of factors (@m). +c+ is accepted but not used.
  def x(a, b, c, d)
    weighted_cross = @m * a * b
    d - (weighted_cross / @n.to_f)
  end

  # Denominator handed to atan2 by Rotation#iterate. +d+ is accepted but
  # not used.
  def y(a, b, c, d)
    ratio = (a**2 - b**2) / (2 * @n.to_f)
    c - (@m * ratio)
  end

  # Label interpolated into the rotation's display name.
  def rotation_name
    "Equimax"
  end
end
|
|
185
|
+
class Quartimax < Rotation
  # Numerator handed to atan2 by Rotation#iterate: +d+ unchanged, with no
  # correction term. The other arguments are accepted but not used.
  def x(a, b, c, d)
    d
  end

  # Denominator handed to atan2 by Rotation#iterate: +c+ unchanged, with no
  # correction term. The other arguments are accepted but not used.
  def y(a, b, c, d)
    c
  end

  # Label interpolated into the rotation's display name.
  def rotation_name
    "Quartimax"
  end
end
|
|
197
|
+
end
|
|
198
|
+
end
|
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
require 'statsample/formula/formula'
|
|
2
|
+
|
|
3
|
+
module Statsample
|
|
4
|
+
# Class for performing regression
|
|
5
|
+
class FitModel
  # @param formula [String] model formula, parsed by FormulaWrapper
  # @param df dataframe holding the variables named in the formula
  #   (presumably a Daru::DataFrame -- it is indexed by column name and
  #   passed to Token#to_df)
  # @param opts [Hash] passed through unchanged to
  #   Statsample::Regression.multiple
  def initialize(formula, df, opts = {})
    @formula = FormulaWrapper.new formula, df
    @df = df
    @opts = opts
  end

  # Returns the fitted regression model, fitting it on first access.
  def model
    @model || fit_model
  end

  # Expands +new_data+ the same way as the training data and forwards it
  # to the fitted model's +predict+.
  def predict(new_data)
    model.predict(df_for_prediction(new_data))
  end

  # Design dataframe (predictors only) built from +df+.
  def df_for_prediction(df)
    canonicalize_df(df)
  end

  # Design dataframe for the training data with the response column
  # (the formula's left-hand side) added back.
  def df_for_regression
    df = canonicalize_df(@df)
    df[@formula.y.value] = @df[@formula.y.value]
    df
  end

  # Converts every canonical formula token except a leading intercept
  # token ('1') into a dataframe via Token#to_df and merges the pieces.
  #
  # The token list is copied rather than shifted in place: the array
  # belongs to the FormulaWrapper, and mutating it permanently removed the
  # intercept from the wrapper's canonical tokens.
  #
  # Returns nil when there are no non-intercept tokens.
  def canonicalize_df(orig_df)
    tokens = @formula.canonical_tokens
    tokens = tokens.drop(1) if !tokens.empty? && tokens.first.value == '1'
    tokens.map { |t| t.to_df orig_df }.reduce(&:merge)
  end

  # Fits the multiple regression and caches it in @model.
  def fit_model
    # TODO: Add support for inclusion/exclusion of intercept
    @model = Statsample::Regression.multiple(
      df_for_regression,
      @formula.y.value,
      @opts
    )
  end
end
|
|
46
|
+
end
|
|
@@ -0,0 +1,306 @@
|
|
|
1
|
+
module Statsample
|
|
2
|
+
# This class recognizes what terms are numeric
|
|
3
|
+
# and accordingly forms groups which are fed to Formula
|
|
4
|
+
# Once they are parsed with Formula, they are combined back
|
|
5
|
+
class FormulaWrapper
  attr_reader :tokens, :y, :canonical_tokens

  # Initializes formula wrapper object to parse a given formula into
  # some tokens which do not overlap one another.
  # @note Specify 0 as a term in the formula if you do not want constant
  #   to be included in the parsed formula
  # @param [String] formula to parse, of the form 'y~term+term'
  # @param [Daru::DataFrame] df dataframe required to know what vectors
  #   are numerical
  # @raise [ArgumentError] if the formula has no '~' or nothing after it
  # @example
  #   df = Daru::DataFrame.from_csv 'spec/data/df.csv'
  #   df.to_category 'c', 'd', 'e'
  #   formula = Statsample::FormulaWrapper.new 'y~a+d:c', df
  #   formula.canonical_to_s
  #   #=> "1+c(-)+d(-):c+a"
  def initialize(formula, df)
    @df = df
    # @y stores the LHS term, i.e. the name of the vector to be predicted
    # @tokens stores the RHS terms of the formula
    @y, *@tokens = split_to_tokens(formula)
    @tokens = @tokens.uniq.sort
    manage_constant_term
    @canonical_tokens = non_redundant_tokens
  end

  # Returns canonical tokens in a readable form.
  # @return [String] canonical tokens joined with '+'.
  # @note 'y~a+b(-)' means 'a' exists in full rank expansion
  #   and 'b(-)' exists in reduced rank expansion
  # @example
  #   df = Daru::DataFrame.from_csv 'spec/data/df.csv'
  #   df.to_category 'c', 'd', 'e'
  #   formula = Statsample::FormulaWrapper.new 'y~a+d:c', df
  #   formula.canonical_to_s
  #   #=> "1+c(-)+d(-):c+a"
  def canonical_to_s
    canonical_tokens.join '+'
  end

  # Returns tokens to produce non-redundant design matrix.
  #
  # Tokens are grouped by the numeric terms they contain; within each
  # group the numeric terms are stripped, the remainder is canonicalized
  # by Formula, and the numeric terms are appended again.
  # @return [Array] array of tokens that do not produce redundant matrix
  def non_redundant_tokens
    # TODO: An enhancement
    # Right now x:c appears as c:x
    split_to_groups.flat_map do |numeric, group|
      stripped = strip_numeric(group, numeric)
      add_numeric(Formula.new(stripped).canonical_tokens, numeric)
    end
  end

  private

  # Removes intercept token if term '0' is found in the formula.
  # Intercept token remains if term '1' is found.
  # If neither term '0' nor term '1' is found, the intercept token is added.
  def manage_constant_term
    intercept = Token.new('1')
    no_intercept = Token.new('0')
    unless @tokens.include?(intercept) || @tokens.include?(no_intercept)
      @tokens.unshift intercept
    end
    @tokens.delete no_intercept
  end

  # Groups the tokens based on the numerical terms they are
  # interacting with.
  # @return [Hash] numeric term names (Array) => tokens (Array)
  def split_to_groups
    @tokens.group_by { |t| extract_numeric t }
  end

  # Add numeric interaction terms which were removed earlier
  # @param [Array] tokens tokens on which to add numerical terms
  # @param [Array] numeric array of numeric terms to add
  # @return [Array] tokens with the numeric terms appended; a token that
  #   consists only of '1' stays the intercept token
  def add_numeric(tokens, numeric)
    tokens.map do |t|
      terms = t.interact_terms + numeric
      if terms == ['1']
        Token.new('1')
      else
        terms = terms.reject { |i| i == '1' }
        Token.new terms.join(':'), t.full
      end
    end
  end

  # Strip numerical interacting terms
  # @param [Array] tokens tokens from which to strip numeric
  # @param [Array] numeric array of numeric terms to strip from tokens
  # @return [Array] array of tokens with stripped numerical terms; a token
  #   left with no terms becomes the token '1'
  def strip_numeric(tokens, numeric)
    tokens.map do |t|
      terms = t.interact_terms - numeric
      terms = ['1'] if terms.empty?
      Token.new terms.join(':')
    end
  end

  # Extract numeric interacting terms
  # @param [Statsample::Token] token from which to extract numeric terms
  # @return [Array] array of numerical terms, i.e. those whose vector in
  #   the dataframe does not answer true to +category?+
  def extract_numeric(token)
    terms = token.interact_terms
    return [] if terms == ['1']
    terms.reject { |t| @df[t].category? }
  end

  # Splits 'lhs~t1+t2+...' (whitespace ignored) into tokens, LHS first.
  # @raise [ArgumentError] if there is no '~' or nothing follows it
  def split_to_tokens(formula)
    lhs_term, rhs = formula.gsub(/\s+/, '').split '~'
    if rhs.nil?
      # Previously this surfaced as NoMethodError on nil.
      raise ArgumentError,
            "formula must have the form 'y~terms', got #{formula.inspect}"
    end
    ([lhs_term] + rhs.split('+')).map { |t| Token.new t }
  end
end
|
|
117
|
+
|
|
118
|
+
# To process formula language
|
|
119
|
+
class Formula
  attr_reader :tokens, :canonical_tokens

  # @param tokens [Array] tokens to canonicalize; each must respond to
  #   +value+, +expand+, +add+ and be sortable
  def initialize(tokens)
    @tokens = tokens
    @canonical_tokens = parse_formula
  end

  # @return [String] canonical tokens joined with '+'
  def canonical_to_s
    canonical_tokens.join('+')
  end

  private

  # Walks the tokens in order, each time appending only what the tokens
  # accepted so far do not already cover.
  def parse_formula
    @tokens.each_with_object([]) do |token, accepted|
      accepted.concat(add_non_redundant_elements(token, accepted))
    end
  end

  # The intercept token ('1') is passed through untouched. Any other
  # token is expanded, the pieces already present in the expansion of
  # +result_so_far+ are removed, and the rest is contracted again.
  def add_non_redundant_elements(token, result_so_far)
    return [token] if token.value == '1'

    already_covered = result_so_far.flat_map(&:expand)
    contract_if_possible(token.expand - already_covered)
  end

  # Repeatedly replaces the first pair of tokens (in combination order)
  # whose +add+ yields a result by that result, until no pair can be
  # merged, then returns the tokens sorted. Mutates +tokens+.
  def contract_if_possible(tokens)
    loop do
      merged = nil
      pair = tokens.combination(2).find { |left, right| merged = left.add(right) }
      break unless pair

      tokens.delete pair.first
      tokens.delete pair.last
      tokens << merged
    end
    tokens.sort
  end
end
|
|
159
|
+
|
|
160
|
+
# To encapsulate interaction as well as non-interaction terms
|
|
161
|
+
class Token
  # full::           Array of booleans, one per interacting term; a false
  #                  entry is rendered as "term(-)" by #to_s and passed as
  #                  the +full:+ option when contrast-coding.
  # interact_terms:: Array of term names (the value split on ':').
  attr_reader :full, :interact_terms

  # @param value [String] a term such as 'a', '1' or an interaction 'a:b'
  # @param full [Boolean, Array<Boolean>] a single flag applied to every
  #   term, or one flag per term (missing trailing flags default to true)
  def initialize(value, full = true)
    @interact_terms = value.include?(':') ? value.split(':') : [value]
    @full = coerce_full full
  end

  # @return [String] the terms joined with ':'
  def value
    interact_terms.join(':')
  end

  # @return [Integer] number of interacting terms; 0 for the token '1'
  def size
    # TODO: Return size 1 for value '1' also
    # Can't do this at the moment because have to make
    # changes in sorting first
    value == '1' ? 0 : interact_terms.size
  end

  # Tries to merge this token with +other+ into a single token.
  #
  #   ANYTHING + FACTOR- : ANYTHING = FACTOR : ANYTHING
  #   ANYTHING + ANYTHING : FACTOR- = ANYTHING : FACTOR
  #   1        + TERM               = TERM (full)
  #
  # @return [Token, nil] the merged token, or nil when no rule applies
  def add(other)
    if size > other.size
      # Rules are written with the smaller token as the receiver.
      other.add self

    elsif other.size == 2 &&
          size == 1 &&
          other.interact_terms.last == value &&
          other.full.last == full.first &&
          other.full.first == false
      Token.new(
        "#{other.interact_terms.first}:#{value}",
        [true, other.full.last]
      )

    elsif other.size == 2 &&
          size == 1 &&
          other.interact_terms.first == value &&
          other.full.first == full.first &&
          other.full.last == false
      Token.new(
        "#{value}:#{other.interact_terms.last}",
        [other.full.first, true]
      )

    elsif value == '1' &&
          other.size == 1
      Token.new(other.value, true)
    end
  end

  # Two tokens are equal when both their value and their full flags match.
  # Objects that do not look like a token (e.g. nil) compare as not equal
  # instead of raising.
  def ==(other)
    other.respond_to?(:value) && other.respond_to?(:full) &&
      value == other.value &&
      full == other.full
  end

  alias eql? ==

  # Hash consistent with #==, so tokens work with uniq, Array#- and as
  # Hash keys.
  def hash
    value.hash ^ full.hash
  end

  # Orders tokens by #size only.
  def <=>(other)
    size <=> other.size
  end

  # @return [String] terms joined with ':'; a term whose full flag is
  #   false gets the suffix '(-)'
  def to_s
    interact_terms
      .zip(full)
      .map { |term, is_full| is_full ? term : "#{term}(-)" }
      .join ':'
  end

  # Expands the token into the intercept, each single term and (for a
  # two-term interaction) the interaction itself, all with full = false.
  # The token '1' expands to itself.
  #
  # @return [Array<Token>, nil] nil for tokens with more than two terms
  def expand
    case size
    when 0
      [self]
    when 1
      [Token.new('1'), Token.new(value, false)]
    when 2
      a, b = interact_terms
      [Token.new('1'), Token.new(a, false), Token.new(b, false),
       Token.new(a + ':' + b, [false, false])]
    end
  end

  # Builds the design-matrix columns for this token from +df+.
  #
  # A single categorical term is contrast-coded, a single numeric term is
  # copied into a new Daru::DataFrame, and a two-term interaction is
  # delegated according to which of the terms are categorical.
  #
  # @return the resulting dataframe, or nil when size is neither 1 nor 2
  def to_df(df)
    case size
    when 1
      if df[value].category?
        df[value].contrast_code full: full.first
      else
        Daru::DataFrame.new value => df[value].to_a
      end
    when 2
      to_df_when_interaction(df)
    end
  end

  private

  # Normalizes the +full+ argument to one flag per interacting term: an
  # Array is padded with true, a scalar is repeated.
  def coerce_full(value)
    if value.is_a? Array
      value + Array.new((@interact_terms.size - value.size), true)
    else
      [value] * @interact_terms.size
    end
  end

  # Dispatches a two-term interaction on whether each term is categorical.
  def to_df_when_interaction(df)
    case interact_terms.map { |t| df[t].category? }
    when [true, true]
      df.interact_code(interact_terms, full)
    when [false, false]
      to_df_numeric_interact_with_numeric df
    when [true, false]
      to_df_category_interact_with_numeric df
    when [false, true]
      to_df_numeric_interact_with_category df
    end
  end

  # Single column named after the token: product of the two vectors.
  def to_df_numeric_interact_with_numeric(df)
    Daru::DataFrame.new value => (df[interact_terms.first] *
      df[interact_terms.last]).to_a
  end

  # One column per contrast-coded vector of the first (categorical) term,
  # each multiplied by the second (numeric) vector.
  def to_df_category_interact_with_numeric(df)
    a, b = interact_terms
    Daru::DataFrame.new(
      df[a].contrast_code(full: full.first)
        .map { |dv| ["#{dv.name}:#{b}", (dv * df[b]).to_a] }
        .to_h
    )
  end

  # One column per contrast-coded vector of the second (categorical)
  # term, each multiplied by the first (numeric) vector.
  def to_df_numeric_interact_with_category(df)
    a, b = interact_terms
    Daru::DataFrame.new(
      df[b].contrast_code(full: full.last)
        .map { |dv| ["#{a}:#{dv.name}", (dv * df[a]).to_a] }
        .to_h
    )
  end
end
|
|
306
|
+
end
|