eps 0.2.1 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +14 -0
- data/LICENSE.txt +1 -1
- data/README.md +183 -243
- data/lib/eps.rb +27 -3
- data/lib/eps/base_estimator.rb +316 -47
- data/lib/eps/data_frame.rb +141 -0
- data/lib/eps/evaluators/lightgbm.rb +116 -0
- data/lib/eps/evaluators/linear_regression.rb +54 -0
- data/lib/eps/evaluators/naive_bayes.rb +95 -0
- data/lib/eps/evaluators/node.rb +26 -0
- data/lib/eps/label_encoder.rb +41 -0
- data/lib/eps/lightgbm.rb +237 -0
- data/lib/eps/linear_regression.rb +132 -386
- data/lib/eps/metrics.rb +46 -0
- data/lib/eps/model.rb +16 -58
- data/lib/eps/naive_bayes.rb +175 -164
- data/lib/eps/pmml_generators/lightgbm.rb +187 -0
- data/lib/eps/statistics.rb +79 -0
- data/lib/eps/text_encoder.rb +81 -0
- data/lib/eps/utils.rb +22 -0
- data/lib/eps/version.rb +1 -1
- metadata +33 -7
@@ -1,28 +1,105 @@
|
|
1
1
|
module Eps
|
2
2
|
class LinearRegression < BaseEstimator
|
3
|
-
|
4
|
-
|
5
|
-
|
3
|
+
# pmml
|
4
|
+
|
5
|
+
def self.load_pmml(data)
|
6
|
+
super do |data|
|
7
|
+
# TODO more validation
|
8
|
+
node = data.css("RegressionTable")
|
9
|
+
|
10
|
+
coefficients = {
|
11
|
+
"_intercept" => node.attribute("intercept").value.to_f
|
12
|
+
}
|
13
|
+
|
14
|
+
features = {}
|
15
|
+
|
16
|
+
text_features, derived_fields = extract_text_features(data, features)
|
17
|
+
|
18
|
+
node.css("NumericPredictor").each do |n|
|
19
|
+
name = n.attribute("name").value
|
20
|
+
if derived_fields[name]
|
21
|
+
name = derived_fields[name]
|
22
|
+
else
|
23
|
+
features[name] = "numeric"
|
24
|
+
end
|
25
|
+
coefficients[name] = n.attribute("coefficient").value.to_f
|
26
|
+
end
|
27
|
+
|
28
|
+
node.css("CategoricalPredictor").each do |n|
|
29
|
+
name = n.attribute("name").value
|
30
|
+
coefficients[[name, n.attribute("value").value]] = n.attribute("coefficient").value.to_f
|
31
|
+
features[name] = "categorical"
|
32
|
+
end
|
33
|
+
|
34
|
+
Evaluators::LinearRegression.new(coefficients: coefficients, features: features, text_features: text_features)
|
35
|
+
end
|
36
|
+
end
|
37
|
+
|
38
|
+
def coefficients
|
39
|
+
@evaluator.coefficients
|
40
|
+
end
|
41
|
+
|
42
|
+
def r2
|
43
|
+
@r2 ||= (sst - sse) / sst
|
44
|
+
end
|
45
|
+
|
46
|
+
def adjusted_r2
|
47
|
+
@adjusted_r2 ||= (mst - mse) / mst
|
48
|
+
end
|
49
|
+
|
50
|
+
private
|
51
|
+
|
52
|
+
# https://people.richland.edu/james/ictcm/2004/multiple.html
|
53
|
+
def _summary(extended: false)
|
54
|
+
coefficients = @coefficients
|
55
|
+
str = String.new("")
|
56
|
+
len = [coefficients.keys.map(&:size).max, 15].max
|
57
|
+
if extended
|
58
|
+
str += "%-#{len}s %12s %12s %12s %12s\n" % ["", "coef", "stderr", "t", "p"]
|
59
|
+
else
|
60
|
+
str += "%-#{len}s %12s %12s\n" % ["", "coef", "p"]
|
61
|
+
end
|
62
|
+
coefficients.each do |k, v|
|
63
|
+
if extended
|
64
|
+
str += "%-#{len}s %12.2f %12.2f %12.2f %12.3f\n" % [display_field(k), v, std_err[k], t_value[k], p_value[k]]
|
65
|
+
else
|
66
|
+
str += "%-#{len}s %12.2f %12.3f\n" % [display_field(k), v, p_value[k]]
|
67
|
+
end
|
68
|
+
end
|
69
|
+
str += "\n"
|
70
|
+
str += "r2: %.3f\n" % [r2] if extended
|
71
|
+
str += "adjusted r2: %.3f\n" % [adjusted_r2]
|
72
|
+
str
|
6
73
|
end
|
7
74
|
|
8
|
-
def
|
9
|
-
|
75
|
+
def _train(**options)
|
76
|
+
raise "Target must be numeric" if @target_type != "numeric"
|
77
|
+
check_missing_value(@train_set)
|
78
|
+
check_missing_value(@validation_set) if @validation_set
|
10
79
|
|
11
|
-
|
80
|
+
data = prep_x(@train_set)
|
12
81
|
|
13
|
-
if
|
14
|
-
raise "Number of
|
82
|
+
if data.size < data.columns.size + 2
|
83
|
+
raise "Number of data points must be at least two more than number of features"
|
15
84
|
end
|
16
85
|
|
86
|
+
x = data.map_rows(&:to_a)
|
87
|
+
data.size.times do |i|
|
88
|
+
# add intercept
|
89
|
+
x[i].unshift(1)
|
90
|
+
end
|
91
|
+
|
92
|
+
gsl = options.key?(:gsl) ? options[:gsl] : defined?(GSL)
|
93
|
+
|
17
94
|
v3 =
|
18
|
-
if
|
95
|
+
if gsl
|
19
96
|
x = GSL::Matrix.alloc(*x)
|
20
|
-
y = GSL::Vector.alloc(
|
97
|
+
y = GSL::Vector.alloc(data.label)
|
21
98
|
c, @covariance, _, _ = GSL::MultiFit::linear(x, y)
|
22
99
|
c.to_a
|
23
100
|
else
|
24
101
|
x = Matrix.rows(x)
|
25
|
-
y = Matrix.column_vector(
|
102
|
+
y = Matrix.column_vector(data.label)
|
26
103
|
removed = []
|
27
104
|
|
28
105
|
# https://statsmaths.github.io/stat612/lectures/lec13/lecture13.pdf
|
@@ -67,7 +144,10 @@ module Eps
|
|
67
144
|
end
|
68
145
|
# huge performance boost
|
69
146
|
# by multiplying xt * y first
|
70
|
-
v2 =
|
147
|
+
v2 = @xtxi * (xt * y)
|
148
|
+
|
149
|
+
# convert to array
|
150
|
+
v2 = v2.to_a.map { |xi| xi[0].to_f }
|
71
151
|
|
72
152
|
# add back removed
|
73
153
|
removed.sort.each do |i|
|
@@ -78,198 +158,62 @@ module Eps
|
|
78
158
|
v2
|
79
159
|
end
|
80
160
|
|
161
|
+
@coefficient_names = ["_intercept"] + data.columns.keys
|
81
162
|
@coefficients = Hash[@coefficient_names.zip(v3)]
|
163
|
+
Evaluators::LinearRegression.new(coefficients: @coefficients, features: @features, text_features: @text_features)
|
82
164
|
end
|
83
165
|
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
Hash[@coefficients.map { |k, v| [Array(k).join.to_sym, v] }]
|
88
|
-
end
|
89
|
-
|
90
|
-
# ruby
|
91
|
-
|
92
|
-
def self.load(data)
|
93
|
-
new(Hash[data.map { |k, v| [k.to_sym, v] }])
|
94
|
-
end
|
95
|
-
|
96
|
-
def dump
|
97
|
-
{coefficients: coefficients}
|
98
|
-
end
|
99
|
-
|
100
|
-
# json
|
101
|
-
|
102
|
-
def self.load_json(data)
|
103
|
-
data = JSON.parse(data) if data.is_a?(String)
|
104
|
-
coefficients = data["coefficients"]
|
105
|
-
|
106
|
-
# for R models
|
107
|
-
if coefficients["(Intercept)"]
|
108
|
-
coefficients = coefficients.dup
|
109
|
-
coefficients["_intercept"] = coefficients.delete("(Intercept)")
|
110
|
-
end
|
111
|
-
|
112
|
-
new(coefficients: coefficients)
|
113
|
-
end
|
114
|
-
|
115
|
-
def to_json
|
116
|
-
JSON.generate(dump)
|
117
|
-
end
|
118
|
-
|
119
|
-
# pmml
|
120
|
-
|
121
|
-
def self.load_pmml(data)
|
122
|
-
# TODO more validation
|
123
|
-
node = data.css("RegressionTable")
|
124
|
-
coefficients = {
|
125
|
-
_intercept: node.attribute("intercept").value.to_f
|
126
|
-
}
|
127
|
-
node.css("NumericPredictor").each do |n|
|
128
|
-
coefficients[n.attribute("name").value] = n.attribute("coefficient").value.to_f
|
129
|
-
end
|
130
|
-
node.css("CategoricalPredictor").each do |n|
|
131
|
-
coefficients[[n.attribute("name").value.to_sym, n.attribute("value").value]] = n.attribute("coefficient").value.to_f
|
132
|
-
end
|
133
|
-
new(coefficients: coefficients)
|
134
|
-
end
|
135
|
-
|
136
|
-
def to_pmml
|
137
|
-
predictors = @coefficients.reject { |k| k == :_intercept }
|
166
|
+
def generate_pmml
|
167
|
+
predictors = @coefficients.dup
|
168
|
+
predictors.delete("_intercept")
|
138
169
|
|
139
170
|
data_fields = {}
|
140
|
-
|
141
|
-
if
|
142
|
-
|
171
|
+
@features.each do |k, type|
|
172
|
+
if type == "categorical"
|
173
|
+
data_fields[k] = predictors.keys.select { |k, v| k.is_a?(Array) && k.first == k }.map(&:last)
|
143
174
|
else
|
144
175
|
data_fields[k] = nil
|
145
176
|
end
|
146
177
|
end
|
147
178
|
|
148
|
-
|
149
|
-
xml.
|
150
|
-
xml.
|
151
|
-
|
152
|
-
|
153
|
-
if vs
|
154
|
-
xml.DataField(name: k, optype: "categorical", dataType: "string") do
|
155
|
-
vs.each do |v|
|
156
|
-
xml.Value(value: v)
|
157
|
-
end
|
158
|
-
end
|
159
|
-
else
|
160
|
-
xml.DataField(name: k, optype: "continuous", dataType: "double")
|
161
|
-
end
|
179
|
+
build_pmml(data_fields) do |xml|
|
180
|
+
xml.RegressionModel(functionName: "regression") do
|
181
|
+
xml.MiningSchema do
|
182
|
+
@features.each do |k, _|
|
183
|
+
xml.MiningField(name: k)
|
162
184
|
end
|
163
185
|
end
|
164
|
-
xml
|
165
|
-
|
166
|
-
|
167
|
-
|
168
|
-
|
169
|
-
|
170
|
-
xml.RegressionTable(intercept: @coefficients[:_intercept]) do
|
171
|
-
predictors.each do |k, v|
|
172
|
-
if k.is_a?(Array)
|
173
|
-
xml.CategoricalPredictor(name: k[0], value: k[1], coefficient: v)
|
186
|
+
pmml_local_transformations(xml)
|
187
|
+
xml.RegressionTable(intercept: @coefficients["_intercept"]) do
|
188
|
+
predictors.each do |k, v|
|
189
|
+
if k.is_a?(Array)
|
190
|
+
if @features[k.first] == "text"
|
191
|
+
xml.NumericPredictor(name: display_field(k), coefficient: v)
|
174
192
|
else
|
175
|
-
xml.
|
193
|
+
xml.CategoricalPredictor(name: k[0], value: k[1], coefficient: v)
|
176
194
|
end
|
195
|
+
else
|
196
|
+
xml.NumericPredictor(name: k, coefficient: v)
|
177
197
|
end
|
178
198
|
end
|
179
199
|
end
|
180
200
|
end
|
181
|
-
end.to_xml
|
182
|
-
end
|
183
|
-
|
184
|
-
# pfa
|
185
|
-
|
186
|
-
def self.load_pfa(data)
|
187
|
-
data = JSON.parse(data) if data.is_a?(String)
|
188
|
-
init = data["cells"].first[1]["init"]
|
189
|
-
names =
|
190
|
-
if data["input"]["fields"]
|
191
|
-
data["input"]["fields"].map { |f| f["name"] }
|
192
|
-
else
|
193
|
-
init["coeff"].map.with_index { |_, i| "x#{i}" }
|
194
|
-
end
|
195
|
-
coefficients = {
|
196
|
-
_intercept: init["const"]
|
197
|
-
}
|
198
|
-
init["coeff"].each_with_index do |c, i|
|
199
|
-
name = names[i]
|
200
|
-
# R can export coefficients with same name
|
201
|
-
raise "Coefficients with same name" if coefficients[name]
|
202
|
-
coefficients[name] = c
|
203
201
|
end
|
204
|
-
new(coefficients: coefficients)
|
205
|
-
end
|
206
|
-
|
207
|
-
# metrics
|
208
|
-
|
209
|
-
def self.metrics(actual, estimated)
|
210
|
-
errors = actual.zip(estimated).map { |yi, yi2| yi - yi2 }
|
211
|
-
|
212
|
-
{
|
213
|
-
me: mean(errors),
|
214
|
-
mae: mean(errors.map { |v| v.abs }),
|
215
|
-
rmse: Math.sqrt(mean(errors.map { |v| v**2 }))
|
216
|
-
}
|
217
202
|
end
|
218
203
|
|
219
|
-
|
220
|
-
|
221
|
-
|
222
|
-
|
223
|
-
|
224
|
-
|
225
|
-
|
226
|
-
|
227
|
-
|
228
|
-
len = [coefficients.keys.map(&:size).max, 15].max
|
229
|
-
if extended
|
230
|
-
str += "%-#{len}s %12s %12s %12s %12s\n" % ["", "coef", "stderr", "t", "p"]
|
231
|
-
else
|
232
|
-
str += "%-#{len}s %12s %12s\n" % ["", "coef", "p"]
|
233
|
-
end
|
234
|
-
coefficients.each do |k, v|
|
235
|
-
if extended
|
236
|
-
str += "%-#{len}s %12.2f %12.2f %12.2f %12.3f\n" % [display_field(k), v, std_err[k], t_value[k], p_value[k]]
|
237
|
-
else
|
238
|
-
str += "%-#{len}s %12.2f %12.3f\n" % [display_field(k), v, p_value[k]]
|
204
|
+
def prep_x(x)
|
205
|
+
x = x.dup
|
206
|
+
@features.each do |k, type|
|
207
|
+
if type == "categorical"
|
208
|
+
values = x.columns.delete(k)
|
209
|
+
labels = values.uniq[1..-1]
|
210
|
+
labels.each do |label|
|
211
|
+
x.columns[[k, label]] = values.map { |v| v == label ? 1 : 0 }
|
212
|
+
end
|
239
213
|
end
|
240
214
|
end
|
241
|
-
|
242
|
-
|
243
|
-
str += "adjusted r2: %.3f\n" % [adjusted_r2]
|
244
|
-
str
|
245
|
-
end
|
246
|
-
|
247
|
-
def r2
|
248
|
-
@r2 ||= (sst - sse) / sst
|
249
|
-
end
|
250
|
-
|
251
|
-
def adjusted_r2
|
252
|
-
@adjusted_r2 ||= (mst - mse) / mst
|
253
|
-
end
|
254
|
-
|
255
|
-
private
|
256
|
-
|
257
|
-
def _predict(x)
|
258
|
-
x, c = prep_x(x, train: false)
|
259
|
-
coef = c.map do |v|
|
260
|
-
# use 0 if coefficient does not exist
|
261
|
-
# this can happen for categorical features
|
262
|
-
# since only n-1 coefficients are stored
|
263
|
-
@coefficients[v] || 0
|
264
|
-
end
|
265
|
-
|
266
|
-
x = Matrix.rows(x)
|
267
|
-
c = Matrix.column_vector(coef)
|
268
|
-
matrix_arr(x * c)
|
269
|
-
end
|
270
|
-
|
271
|
-
def display_field(k)
|
272
|
-
k.is_a?(Array) ? k.join("") : k
|
215
|
+
prep_text_features(x)
|
216
|
+
x
|
273
217
|
end
|
274
218
|
|
275
219
|
def constant?(arr)
|
@@ -289,7 +233,7 @@ module Eps
|
|
289
233
|
if @gsl
|
290
234
|
GSL::Cdf.tdist_P(t_value[k].abs, degrees_of_freedom)
|
291
235
|
else
|
292
|
-
tdist_p(t_value[k].abs, degrees_of_freedom)
|
236
|
+
Eps::Statistics.tdist_p(t_value[k].abs, degrees_of_freedom)
|
293
237
|
end
|
294
238
|
|
295
239
|
[k, 2 * (1 - tp)]
|
@@ -322,26 +266,26 @@ module Eps
|
|
322
266
|
end
|
323
267
|
|
324
268
|
def y_bar
|
325
|
-
@y_bar ||= mean(@
|
269
|
+
@y_bar ||= mean(@train_set.label)
|
326
270
|
end
|
327
271
|
|
328
272
|
def y_hat
|
329
|
-
@y_hat ||= predict(@
|
273
|
+
@y_hat ||= predict(@train_set)
|
330
274
|
end
|
331
275
|
|
332
276
|
# total sum of squares
|
333
277
|
def sst
|
334
|
-
@sst ||= @
|
278
|
+
@sst ||= @train_set.label.map { |y| (y - y_bar)**2 }.sum
|
335
279
|
end
|
336
280
|
|
337
281
|
# sum of squared errors of prediction
|
338
282
|
# not to be confused with "explained sum of squares"
|
339
283
|
def sse
|
340
|
-
@sse ||= @
|
284
|
+
@sse ||= @train_set.label.zip(y_hat).map { |y, yh| (y - yh)**2 }.sum
|
341
285
|
end
|
342
286
|
|
343
287
|
def mst
|
344
|
-
@mst ||= sst / (@
|
288
|
+
@mst ||= sst / (@train_set.size - 1)
|
345
289
|
end
|
346
290
|
|
347
291
|
def mse
|
@@ -349,209 +293,11 @@ module Eps
|
|
349
293
|
end
|
350
294
|
|
351
295
|
def degrees_of_freedom
|
352
|
-
@
|
296
|
+
@train_set.size - @coefficients.size
|
353
297
|
end
|
354
298
|
|
355
299
|
def mean(arr)
|
356
300
|
arr.sum / arr.size.to_f
|
357
301
|
end
|
358
|
-
|
359
|
-
### Extracted from https://github.com/estebanz01/ruby-statistics
|
360
|
-
### The Ruby author is Esteban Zapata Rojas
|
361
|
-
###
|
362
|
-
### Originally extracted from https://codeplea.com/incomplete-beta-function-c
|
363
|
-
### This function is shared under zlib license and the author is Lewis Van Winkle
|
364
|
-
def tdist_p(value, degrees_of_freedom)
|
365
|
-
upper = (value + Math.sqrt(value * value + degrees_of_freedom))
|
366
|
-
lower = (2.0 * Math.sqrt(value * value + degrees_of_freedom))
|
367
|
-
|
368
|
-
x = upper/lower
|
369
|
-
|
370
|
-
alpha = degrees_of_freedom/2.0
|
371
|
-
beta = degrees_of_freedom/2.0
|
372
|
-
|
373
|
-
incomplete_beta_function(x, alpha, beta)
|
374
|
-
end
|
375
|
-
|
376
|
-
### Extracted from https://github.com/estebanz01/ruby-statistics
|
377
|
-
### The Ruby author is Esteban Zapata Rojas
|
378
|
-
###
|
379
|
-
### This implementation is an adaptation of the incomplete beta function made in C by
|
380
|
-
### Lewis Van Winkle, which released the code under the zlib license.
|
381
|
-
### The whole math behind this code is described in the following post: https://codeplea.com/incomplete-beta-function-c
|
382
|
-
def incomplete_beta_function(x, alp, bet)
|
383
|
-
return if x < 0.0
|
384
|
-
return 1.0 if x > 1.0
|
385
|
-
|
386
|
-
tiny = 1.0E-50
|
387
|
-
|
388
|
-
if x > ((alp + 1.0)/(alp + bet + 2.0))
|
389
|
-
return 1.0 - incomplete_beta_function(1.0 - x, bet, alp)
|
390
|
-
end
|
391
|
-
|
392
|
-
# To avoid overflow problems, the implementation applies the logarithm properties
|
393
|
-
# to calculate in a faster and safer way the values.
|
394
|
-
lbet_ab = (Math.lgamma(alp)[0] + Math.lgamma(bet)[0] - Math.lgamma(alp + bet)[0]).freeze
|
395
|
-
front = (Math.exp(Math.log(x) * alp + Math.log(1.0 - x) * bet - lbet_ab) / alp.to_f).freeze
|
396
|
-
|
397
|
-
# This is the non-log version of the left part of the formula (before the continuous fraction)
|
398
|
-
# down_left = alp * self.beta_function(alp, bet)
|
399
|
-
# upper_left = (x ** alp) * ((1.0 - x) ** bet)
|
400
|
-
# front = upper_left/down_left
|
401
|
-
|
402
|
-
f, c, d = 1.0, 1.0, 0.0
|
403
|
-
|
404
|
-
returned_value = nil
|
405
|
-
|
406
|
-
# Let's do more iterations than the proposed implementation (200 iters)
|
407
|
-
(0..500).each do |number|
|
408
|
-
m = number/2
|
409
|
-
|
410
|
-
numerator = if number == 0
|
411
|
-
1.0
|
412
|
-
elsif number % 2 == 0
|
413
|
-
(m * (bet - m) * x)/((alp + 2.0 * m - 1.0)* (alp + 2.0 * m))
|
414
|
-
else
|
415
|
-
top = -((alp + m) * (alp + bet + m) * x)
|
416
|
-
down = ((alp + 2.0 * m) * (alp + 2.0 * m + 1.0))
|
417
|
-
|
418
|
-
top/down
|
419
|
-
end
|
420
|
-
|
421
|
-
d = 1.0 + numerator * d
|
422
|
-
d = tiny if d.abs < tiny
|
423
|
-
d = 1.0 / d
|
424
|
-
|
425
|
-
c = 1.0 + numerator / c
|
426
|
-
c = tiny if c.abs < tiny
|
427
|
-
|
428
|
-
cd = (c*d).freeze
|
429
|
-
f = f * cd
|
430
|
-
|
431
|
-
if (1.0 - cd).abs < 1.0E-10
|
432
|
-
returned_value = front * (f - 1.0)
|
433
|
-
break
|
434
|
-
end
|
435
|
-
end
|
436
|
-
|
437
|
-
returned_value
|
438
|
-
end
|
439
|
-
|
440
|
-
def prep_x(x, train: true)
|
441
|
-
coefficients = @coefficients
|
442
|
-
|
443
|
-
if daru?(x)
|
444
|
-
x = x.to_a[0]
|
445
|
-
else
|
446
|
-
x = x.map do |xi|
|
447
|
-
case xi
|
448
|
-
when Hash
|
449
|
-
xi
|
450
|
-
when Array
|
451
|
-
Hash[xi.map.with_index { |v, i| [:"x#{i}", v] }]
|
452
|
-
else
|
453
|
-
{x0: xi}
|
454
|
-
end
|
455
|
-
end
|
456
|
-
end
|
457
|
-
|
458
|
-
# get column types
|
459
|
-
if train
|
460
|
-
column_types = {}
|
461
|
-
if x.any?
|
462
|
-
row = x.first
|
463
|
-
row.each do |k, v|
|
464
|
-
column_types[k] = categorical?(v) ? "categorical" : "numeric"
|
465
|
-
end
|
466
|
-
end
|
467
|
-
else
|
468
|
-
# get column types for prediction
|
469
|
-
column_types = {}
|
470
|
-
coefficients.each do |k, v|
|
471
|
-
next if k == :_intercept
|
472
|
-
if k.is_a?(Array)
|
473
|
-
column_types[k.first] = "categorical"
|
474
|
-
else
|
475
|
-
column_types[k] = "numeric"
|
476
|
-
end
|
477
|
-
end
|
478
|
-
end
|
479
|
-
|
480
|
-
# if !train && x.any?
|
481
|
-
# # check first row against coefficients
|
482
|
-
# ckeys = coefficients.keys.map(&:to_s)
|
483
|
-
# bad_keys = x[0].keys.map(&:to_s).reject { |k| ckeys.any? { |c| c.start_with?(k) } }
|
484
|
-
# raise "Unknown keys: #{bad_keys.join(", ")}" if bad_keys.any?
|
485
|
-
# end
|
486
|
-
|
487
|
-
supports_categorical = train || coefficients.any? { |k, _| k.is_a?(Array) }
|
488
|
-
|
489
|
-
cache = {}
|
490
|
-
first_key = {}
|
491
|
-
i = 0
|
492
|
-
rows = []
|
493
|
-
x.each do |xi|
|
494
|
-
row = {}
|
495
|
-
xi.each do |k, v|
|
496
|
-
categorical = column_types[k.to_sym] == "categorical" || (!supports_categorical && categorical?(v))
|
497
|
-
|
498
|
-
key = categorical ? [k.to_sym, v.to_s] : k.to_sym
|
499
|
-
v2 = categorical ? 1 : v
|
500
|
-
|
501
|
-
# TODO make more efficient
|
502
|
-
check_key = supports_categorical ? key : symbolize_coef(key)
|
503
|
-
next if !train && !coefficients.key?(check_key)
|
504
|
-
|
505
|
-
raise "Missing data" if v2.nil?
|
506
|
-
|
507
|
-
unless cache[key]
|
508
|
-
cache[key] = i
|
509
|
-
first_key[k] ||= key if categorical
|
510
|
-
i += 1
|
511
|
-
end
|
512
|
-
|
513
|
-
row[key] = v2
|
514
|
-
end
|
515
|
-
rows << row
|
516
|
-
end
|
517
|
-
|
518
|
-
if train
|
519
|
-
# remove one degree of freedom
|
520
|
-
first_key.values.each do |v|
|
521
|
-
num = cache.delete(v)
|
522
|
-
cache.each do |k, v2|
|
523
|
-
cache[k] -= 1 if v2 > num
|
524
|
-
end
|
525
|
-
end
|
526
|
-
end
|
527
|
-
|
528
|
-
ret2 = []
|
529
|
-
rows.each do |row|
|
530
|
-
ret = [0] * cache.size
|
531
|
-
row.each do |k, v|
|
532
|
-
if cache[k]
|
533
|
-
ret[cache[k]] = v
|
534
|
-
end
|
535
|
-
end
|
536
|
-
ret2 << ([1] + ret)
|
537
|
-
end
|
538
|
-
|
539
|
-
# flatten keys
|
540
|
-
c = [:_intercept] + cache.sort_by { |_, v| v }.map(&:first)
|
541
|
-
|
542
|
-
unless supports_categorical
|
543
|
-
c = c.map { |v| symbolize_coef(v) }
|
544
|
-
end
|
545
|
-
|
546
|
-
[ret2, c]
|
547
|
-
end
|
548
|
-
|
549
|
-
def symbolize_coef(k)
|
550
|
-
(k.is_a?(Array) ? k.join("") : k).to_sym
|
551
|
-
end
|
552
|
-
|
553
|
-
def matrix_arr(matrix)
|
554
|
-
matrix.to_a.map { |xi| xi[0].to_f }
|
555
|
-
end
|
556
302
|
end
|
557
303
|
end
|