eps 0.2.1 → 0.3.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +14 -0
- data/LICENSE.txt +1 -1
- data/README.md +183 -243
- data/lib/eps.rb +27 -3
- data/lib/eps/base_estimator.rb +316 -47
- data/lib/eps/data_frame.rb +141 -0
- data/lib/eps/evaluators/lightgbm.rb +116 -0
- data/lib/eps/evaluators/linear_regression.rb +54 -0
- data/lib/eps/evaluators/naive_bayes.rb +95 -0
- data/lib/eps/evaluators/node.rb +26 -0
- data/lib/eps/label_encoder.rb +41 -0
- data/lib/eps/lightgbm.rb +237 -0
- data/lib/eps/linear_regression.rb +132 -386
- data/lib/eps/metrics.rb +46 -0
- data/lib/eps/model.rb +16 -58
- data/lib/eps/naive_bayes.rb +175 -164
- data/lib/eps/pmml_generators/lightgbm.rb +187 -0
- data/lib/eps/statistics.rb +79 -0
- data/lib/eps/text_encoder.rb +81 -0
- data/lib/eps/utils.rb +22 -0
- data/lib/eps/version.rb +1 -1
- metadata +33 -7
@@ -1,28 +1,105 @@
|
|
1
1
|
module Eps
|
2
2
|
class LinearRegression < BaseEstimator
|
3
|
-
|
4
|
-
|
5
|
-
|
3
|
+
# pmml
|
4
|
+
|
5
|
+
def self.load_pmml(data)
|
6
|
+
super do |data|
|
7
|
+
# TODO more validation
|
8
|
+
node = data.css("RegressionTable")
|
9
|
+
|
10
|
+
coefficients = {
|
11
|
+
"_intercept" => node.attribute("intercept").value.to_f
|
12
|
+
}
|
13
|
+
|
14
|
+
features = {}
|
15
|
+
|
16
|
+
text_features, derived_fields = extract_text_features(data, features)
|
17
|
+
|
18
|
+
node.css("NumericPredictor").each do |n|
|
19
|
+
name = n.attribute("name").value
|
20
|
+
if derived_fields[name]
|
21
|
+
name = derived_fields[name]
|
22
|
+
else
|
23
|
+
features[name] = "numeric"
|
24
|
+
end
|
25
|
+
coefficients[name] = n.attribute("coefficient").value.to_f
|
26
|
+
end
|
27
|
+
|
28
|
+
node.css("CategoricalPredictor").each do |n|
|
29
|
+
name = n.attribute("name").value
|
30
|
+
coefficients[[name, n.attribute("value").value]] = n.attribute("coefficient").value.to_f
|
31
|
+
features[name] = "categorical"
|
32
|
+
end
|
33
|
+
|
34
|
+
Evaluators::LinearRegression.new(coefficients: coefficients, features: features, text_features: text_features)
|
35
|
+
end
|
36
|
+
end
|
37
|
+
|
38
|
+
def coefficients
|
39
|
+
@evaluator.coefficients
|
40
|
+
end
|
41
|
+
|
42
|
+
def r2
|
43
|
+
@r2 ||= (sst - sse) / sst
|
44
|
+
end
|
45
|
+
|
46
|
+
def adjusted_r2
|
47
|
+
@adjusted_r2 ||= (mst - mse) / mst
|
48
|
+
end
|
49
|
+
|
50
|
+
private
|
51
|
+
|
52
|
+
# https://people.richland.edu/james/ictcm/2004/multiple.html
|
53
|
+
def _summary(extended: false)
|
54
|
+
coefficients = @coefficients
|
55
|
+
str = String.new("")
|
56
|
+
len = [coefficients.keys.map(&:size).max, 15].max
|
57
|
+
if extended
|
58
|
+
str += "%-#{len}s %12s %12s %12s %12s\n" % ["", "coef", "stderr", "t", "p"]
|
59
|
+
else
|
60
|
+
str += "%-#{len}s %12s %12s\n" % ["", "coef", "p"]
|
61
|
+
end
|
62
|
+
coefficients.each do |k, v|
|
63
|
+
if extended
|
64
|
+
str += "%-#{len}s %12.2f %12.2f %12.2f %12.3f\n" % [display_field(k), v, std_err[k], t_value[k], p_value[k]]
|
65
|
+
else
|
66
|
+
str += "%-#{len}s %12.2f %12.3f\n" % [display_field(k), v, p_value[k]]
|
67
|
+
end
|
68
|
+
end
|
69
|
+
str += "\n"
|
70
|
+
str += "r2: %.3f\n" % [r2] if extended
|
71
|
+
str += "adjusted r2: %.3f\n" % [adjusted_r2]
|
72
|
+
str
|
6
73
|
end
|
7
74
|
|
8
|
-
def
|
9
|
-
|
75
|
+
def _train(**options)
|
76
|
+
raise "Target must be numeric" if @target_type != "numeric"
|
77
|
+
check_missing_value(@train_set)
|
78
|
+
check_missing_value(@validation_set) if @validation_set
|
10
79
|
|
11
|
-
|
80
|
+
data = prep_x(@train_set)
|
12
81
|
|
13
|
-
if
|
14
|
-
raise "Number of
|
82
|
+
if data.size < data.columns.size + 2
|
83
|
+
raise "Number of data points must be at least two more than number of features"
|
15
84
|
end
|
16
85
|
|
86
|
+
x = data.map_rows(&:to_a)
|
87
|
+
data.size.times do |i|
|
88
|
+
# add intercept
|
89
|
+
x[i].unshift(1)
|
90
|
+
end
|
91
|
+
|
92
|
+
gsl = options.key?(:gsl) ? options[:gsl] : defined?(GSL)
|
93
|
+
|
17
94
|
v3 =
|
18
|
-
if
|
95
|
+
if gsl
|
19
96
|
x = GSL::Matrix.alloc(*x)
|
20
|
-
y = GSL::Vector.alloc(
|
97
|
+
y = GSL::Vector.alloc(data.label)
|
21
98
|
c, @covariance, _, _ = GSL::MultiFit::linear(x, y)
|
22
99
|
c.to_a
|
23
100
|
else
|
24
101
|
x = Matrix.rows(x)
|
25
|
-
y = Matrix.column_vector(
|
102
|
+
y = Matrix.column_vector(data.label)
|
26
103
|
removed = []
|
27
104
|
|
28
105
|
# https://statsmaths.github.io/stat612/lectures/lec13/lecture13.pdf
|
@@ -67,7 +144,10 @@ module Eps
|
|
67
144
|
end
|
68
145
|
# huge performance boost
|
69
146
|
# by multiplying xt * y first
|
70
|
-
v2 =
|
147
|
+
v2 = @xtxi * (xt * y)
|
148
|
+
|
149
|
+
# convert to array
|
150
|
+
v2 = v2.to_a.map { |xi| xi[0].to_f }
|
71
151
|
|
72
152
|
# add back removed
|
73
153
|
removed.sort.each do |i|
|
@@ -78,198 +158,62 @@ module Eps
|
|
78
158
|
v2
|
79
159
|
end
|
80
160
|
|
161
|
+
@coefficient_names = ["_intercept"] + data.columns.keys
|
81
162
|
@coefficients = Hash[@coefficient_names.zip(v3)]
|
163
|
+
Evaluators::LinearRegression.new(coefficients: @coefficients, features: @features, text_features: @text_features)
|
82
164
|
end
|
83
165
|
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
Hash[@coefficients.map { |k, v| [Array(k).join.to_sym, v] }]
|
88
|
-
end
|
89
|
-
|
90
|
-
# ruby
|
91
|
-
|
92
|
-
def self.load(data)
|
93
|
-
new(Hash[data.map { |k, v| [k.to_sym, v] }])
|
94
|
-
end
|
95
|
-
|
96
|
-
def dump
|
97
|
-
{coefficients: coefficients}
|
98
|
-
end
|
99
|
-
|
100
|
-
# json
|
101
|
-
|
102
|
-
def self.load_json(data)
|
103
|
-
data = JSON.parse(data) if data.is_a?(String)
|
104
|
-
coefficients = data["coefficients"]
|
105
|
-
|
106
|
-
# for R models
|
107
|
-
if coefficients["(Intercept)"]
|
108
|
-
coefficients = coefficients.dup
|
109
|
-
coefficients["_intercept"] = coefficients.delete("(Intercept)")
|
110
|
-
end
|
111
|
-
|
112
|
-
new(coefficients: coefficients)
|
113
|
-
end
|
114
|
-
|
115
|
-
def to_json
|
116
|
-
JSON.generate(dump)
|
117
|
-
end
|
118
|
-
|
119
|
-
# pmml
|
120
|
-
|
121
|
-
def self.load_pmml(data)
|
122
|
-
# TODO more validation
|
123
|
-
node = data.css("RegressionTable")
|
124
|
-
coefficients = {
|
125
|
-
_intercept: node.attribute("intercept").value.to_f
|
126
|
-
}
|
127
|
-
node.css("NumericPredictor").each do |n|
|
128
|
-
coefficients[n.attribute("name").value] = n.attribute("coefficient").value.to_f
|
129
|
-
end
|
130
|
-
node.css("CategoricalPredictor").each do |n|
|
131
|
-
coefficients[[n.attribute("name").value.to_sym, n.attribute("value").value]] = n.attribute("coefficient").value.to_f
|
132
|
-
end
|
133
|
-
new(coefficients: coefficients)
|
134
|
-
end
|
135
|
-
|
136
|
-
def to_pmml
|
137
|
-
predictors = @coefficients.reject { |k| k == :_intercept }
|
166
|
+
def generate_pmml
|
167
|
+
predictors = @coefficients.dup
|
168
|
+
predictors.delete("_intercept")
|
138
169
|
|
139
170
|
data_fields = {}
|
140
|
-
|
141
|
-
if
|
142
|
-
|
171
|
+
@features.each do |k, type|
|
172
|
+
if type == "categorical"
|
173
|
+
data_fields[k] = predictors.keys.select { |k, v| k.is_a?(Array) && k.first == k }.map(&:last)
|
143
174
|
else
|
144
175
|
data_fields[k] = nil
|
145
176
|
end
|
146
177
|
end
|
147
178
|
|
148
|
-
|
149
|
-
xml.
|
150
|
-
xml.
|
151
|
-
|
152
|
-
|
153
|
-
if vs
|
154
|
-
xml.DataField(name: k, optype: "categorical", dataType: "string") do
|
155
|
-
vs.each do |v|
|
156
|
-
xml.Value(value: v)
|
157
|
-
end
|
158
|
-
end
|
159
|
-
else
|
160
|
-
xml.DataField(name: k, optype: "continuous", dataType: "double")
|
161
|
-
end
|
179
|
+
build_pmml(data_fields) do |xml|
|
180
|
+
xml.RegressionModel(functionName: "regression") do
|
181
|
+
xml.MiningSchema do
|
182
|
+
@features.each do |k, _|
|
183
|
+
xml.MiningField(name: k)
|
162
184
|
end
|
163
185
|
end
|
164
|
-
xml
|
165
|
-
|
166
|
-
|
167
|
-
|
168
|
-
|
169
|
-
|
170
|
-
xml.RegressionTable(intercept: @coefficients[:_intercept]) do
|
171
|
-
predictors.each do |k, v|
|
172
|
-
if k.is_a?(Array)
|
173
|
-
xml.CategoricalPredictor(name: k[0], value: k[1], coefficient: v)
|
186
|
+
pmml_local_transformations(xml)
|
187
|
+
xml.RegressionTable(intercept: @coefficients["_intercept"]) do
|
188
|
+
predictors.each do |k, v|
|
189
|
+
if k.is_a?(Array)
|
190
|
+
if @features[k.first] == "text"
|
191
|
+
xml.NumericPredictor(name: display_field(k), coefficient: v)
|
174
192
|
else
|
175
|
-
xml.
|
193
|
+
xml.CategoricalPredictor(name: k[0], value: k[1], coefficient: v)
|
176
194
|
end
|
195
|
+
else
|
196
|
+
xml.NumericPredictor(name: k, coefficient: v)
|
177
197
|
end
|
178
198
|
end
|
179
199
|
end
|
180
200
|
end
|
181
|
-
end.to_xml
|
182
|
-
end
|
183
|
-
|
184
|
-
# pfa
|
185
|
-
|
186
|
-
def self.load_pfa(data)
|
187
|
-
data = JSON.parse(data) if data.is_a?(String)
|
188
|
-
init = data["cells"].first[1]["init"]
|
189
|
-
names =
|
190
|
-
if data["input"]["fields"]
|
191
|
-
data["input"]["fields"].map { |f| f["name"] }
|
192
|
-
else
|
193
|
-
init["coeff"].map.with_index { |_, i| "x#{i}" }
|
194
|
-
end
|
195
|
-
coefficients = {
|
196
|
-
_intercept: init["const"]
|
197
|
-
}
|
198
|
-
init["coeff"].each_with_index do |c, i|
|
199
|
-
name = names[i]
|
200
|
-
# R can export coefficients with same name
|
201
|
-
raise "Coefficients with same name" if coefficients[name]
|
202
|
-
coefficients[name] = c
|
203
201
|
end
|
204
|
-
new(coefficients: coefficients)
|
205
|
-
end
|
206
|
-
|
207
|
-
# metrics
|
208
|
-
|
209
|
-
def self.metrics(actual, estimated)
|
210
|
-
errors = actual.zip(estimated).map { |yi, yi2| yi - yi2 }
|
211
|
-
|
212
|
-
{
|
213
|
-
me: mean(errors),
|
214
|
-
mae: mean(errors.map { |v| v.abs }),
|
215
|
-
rmse: Math.sqrt(mean(errors.map { |v| v**2 }))
|
216
|
-
}
|
217
202
|
end
|
218
203
|
|
219
|
-
|
220
|
-
|
221
|
-
|
222
|
-
|
223
|
-
|
224
|
-
|
225
|
-
|
226
|
-
|
227
|
-
|
228
|
-
len = [coefficients.keys.map(&:size).max, 15].max
|
229
|
-
if extended
|
230
|
-
str += "%-#{len}s %12s %12s %12s %12s\n" % ["", "coef", "stderr", "t", "p"]
|
231
|
-
else
|
232
|
-
str += "%-#{len}s %12s %12s\n" % ["", "coef", "p"]
|
233
|
-
end
|
234
|
-
coefficients.each do |k, v|
|
235
|
-
if extended
|
236
|
-
str += "%-#{len}s %12.2f %12.2f %12.2f %12.3f\n" % [display_field(k), v, std_err[k], t_value[k], p_value[k]]
|
237
|
-
else
|
238
|
-
str += "%-#{len}s %12.2f %12.3f\n" % [display_field(k), v, p_value[k]]
|
204
|
+
def prep_x(x)
|
205
|
+
x = x.dup
|
206
|
+
@features.each do |k, type|
|
207
|
+
if type == "categorical"
|
208
|
+
values = x.columns.delete(k)
|
209
|
+
labels = values.uniq[1..-1]
|
210
|
+
labels.each do |label|
|
211
|
+
x.columns[[k, label]] = values.map { |v| v == label ? 1 : 0 }
|
212
|
+
end
|
239
213
|
end
|
240
214
|
end
|
241
|
-
|
242
|
-
|
243
|
-
str += "adjusted r2: %.3f\n" % [adjusted_r2]
|
244
|
-
str
|
245
|
-
end
|
246
|
-
|
247
|
-
def r2
|
248
|
-
@r2 ||= (sst - sse) / sst
|
249
|
-
end
|
250
|
-
|
251
|
-
def adjusted_r2
|
252
|
-
@adjusted_r2 ||= (mst - mse) / mst
|
253
|
-
end
|
254
|
-
|
255
|
-
private
|
256
|
-
|
257
|
-
def _predict(x)
|
258
|
-
x, c = prep_x(x, train: false)
|
259
|
-
coef = c.map do |v|
|
260
|
-
# use 0 if coefficient does not exist
|
261
|
-
# this can happen for categorical features
|
262
|
-
# since only n-1 coefficients are stored
|
263
|
-
@coefficients[v] || 0
|
264
|
-
end
|
265
|
-
|
266
|
-
x = Matrix.rows(x)
|
267
|
-
c = Matrix.column_vector(coef)
|
268
|
-
matrix_arr(x * c)
|
269
|
-
end
|
270
|
-
|
271
|
-
def display_field(k)
|
272
|
-
k.is_a?(Array) ? k.join("") : k
|
215
|
+
prep_text_features(x)
|
216
|
+
x
|
273
217
|
end
|
274
218
|
|
275
219
|
def constant?(arr)
|
@@ -289,7 +233,7 @@ module Eps
|
|
289
233
|
if @gsl
|
290
234
|
GSL::Cdf.tdist_P(t_value[k].abs, degrees_of_freedom)
|
291
235
|
else
|
292
|
-
tdist_p(t_value[k].abs, degrees_of_freedom)
|
236
|
+
Eps::Statistics.tdist_p(t_value[k].abs, degrees_of_freedom)
|
293
237
|
end
|
294
238
|
|
295
239
|
[k, 2 * (1 - tp)]
|
@@ -322,26 +266,26 @@ module Eps
|
|
322
266
|
end
|
323
267
|
|
324
268
|
def y_bar
|
325
|
-
@y_bar ||= mean(@
|
269
|
+
@y_bar ||= mean(@train_set.label)
|
326
270
|
end
|
327
271
|
|
328
272
|
def y_hat
|
329
|
-
@y_hat ||= predict(@
|
273
|
+
@y_hat ||= predict(@train_set)
|
330
274
|
end
|
331
275
|
|
332
276
|
# total sum of squares
|
333
277
|
def sst
|
334
|
-
@sst ||= @
|
278
|
+
@sst ||= @train_set.label.map { |y| (y - y_bar)**2 }.sum
|
335
279
|
end
|
336
280
|
|
337
281
|
# sum of squared errors of prediction
|
338
282
|
# not to be confused with "explained sum of squares"
|
339
283
|
def sse
|
340
|
-
@sse ||= @
|
284
|
+
@sse ||= @train_set.label.zip(y_hat).map { |y, yh| (y - yh)**2 }.sum
|
341
285
|
end
|
342
286
|
|
343
287
|
def mst
|
344
|
-
@mst ||= sst / (@
|
288
|
+
@mst ||= sst / (@train_set.size - 1)
|
345
289
|
end
|
346
290
|
|
347
291
|
def mse
|
@@ -349,209 +293,11 @@ module Eps
|
|
349
293
|
end
|
350
294
|
|
351
295
|
def degrees_of_freedom
|
352
|
-
@
|
296
|
+
@train_set.size - @coefficients.size
|
353
297
|
end
|
354
298
|
|
355
299
|
def mean(arr)
|
356
300
|
arr.sum / arr.size.to_f
|
357
301
|
end
|
358
|
-
|
359
|
-
### Extracted from https://github.com/estebanz01/ruby-statistics
|
360
|
-
### The Ruby author is Esteban Zapata Rojas
|
361
|
-
###
|
362
|
-
### Originally extracted from https://codeplea.com/incomplete-beta-function-c
|
363
|
-
### This function is shared under zlib license and the author is Lewis Van Winkle
|
364
|
-
def tdist_p(value, degrees_of_freedom)
|
365
|
-
upper = (value + Math.sqrt(value * value + degrees_of_freedom))
|
366
|
-
lower = (2.0 * Math.sqrt(value * value + degrees_of_freedom))
|
367
|
-
|
368
|
-
x = upper/lower
|
369
|
-
|
370
|
-
alpha = degrees_of_freedom/2.0
|
371
|
-
beta = degrees_of_freedom/2.0
|
372
|
-
|
373
|
-
incomplete_beta_function(x, alpha, beta)
|
374
|
-
end
|
375
|
-
|
376
|
-
### Extracted from https://github.com/estebanz01/ruby-statistics
|
377
|
-
### The Ruby author is Esteban Zapata Rojas
|
378
|
-
###
|
379
|
-
### This implementation is an adaptation of the incomplete beta function made in C by
|
380
|
-
### Lewis Van Winkle, which released the code under the zlib license.
|
381
|
-
### The whole math behind this code is described in the following post: https://codeplea.com/incomplete-beta-function-c
|
382
|
-
def incomplete_beta_function(x, alp, bet)
|
383
|
-
return if x < 0.0
|
384
|
-
return 1.0 if x > 1.0
|
385
|
-
|
386
|
-
tiny = 1.0E-50
|
387
|
-
|
388
|
-
if x > ((alp + 1.0)/(alp + bet + 2.0))
|
389
|
-
return 1.0 - incomplete_beta_function(1.0 - x, bet, alp)
|
390
|
-
end
|
391
|
-
|
392
|
-
# To avoid overflow problems, the implementation applies the logarithm properties
|
393
|
-
# to calculate in a faster and safer way the values.
|
394
|
-
lbet_ab = (Math.lgamma(alp)[0] + Math.lgamma(bet)[0] - Math.lgamma(alp + bet)[0]).freeze
|
395
|
-
front = (Math.exp(Math.log(x) * alp + Math.log(1.0 - x) * bet - lbet_ab) / alp.to_f).freeze
|
396
|
-
|
397
|
-
# This is the non-log version of the left part of the formula (before the continuous fraction)
|
398
|
-
# down_left = alp * self.beta_function(alp, bet)
|
399
|
-
# upper_left = (x ** alp) * ((1.0 - x) ** bet)
|
400
|
-
# front = upper_left/down_left
|
401
|
-
|
402
|
-
f, c, d = 1.0, 1.0, 0.0
|
403
|
-
|
404
|
-
returned_value = nil
|
405
|
-
|
406
|
-
# Let's do more iterations than the proposed implementation (200 iters)
|
407
|
-
(0..500).each do |number|
|
408
|
-
m = number/2
|
409
|
-
|
410
|
-
numerator = if number == 0
|
411
|
-
1.0
|
412
|
-
elsif number % 2 == 0
|
413
|
-
(m * (bet - m) * x)/((alp + 2.0 * m - 1.0)* (alp + 2.0 * m))
|
414
|
-
else
|
415
|
-
top = -((alp + m) * (alp + bet + m) * x)
|
416
|
-
down = ((alp + 2.0 * m) * (alp + 2.0 * m + 1.0))
|
417
|
-
|
418
|
-
top/down
|
419
|
-
end
|
420
|
-
|
421
|
-
d = 1.0 + numerator * d
|
422
|
-
d = tiny if d.abs < tiny
|
423
|
-
d = 1.0 / d
|
424
|
-
|
425
|
-
c = 1.0 + numerator / c
|
426
|
-
c = tiny if c.abs < tiny
|
427
|
-
|
428
|
-
cd = (c*d).freeze
|
429
|
-
f = f * cd
|
430
|
-
|
431
|
-
if (1.0 - cd).abs < 1.0E-10
|
432
|
-
returned_value = front * (f - 1.0)
|
433
|
-
break
|
434
|
-
end
|
435
|
-
end
|
436
|
-
|
437
|
-
returned_value
|
438
|
-
end
|
439
|
-
|
440
|
-
def prep_x(x, train: true)
|
441
|
-
coefficients = @coefficients
|
442
|
-
|
443
|
-
if daru?(x)
|
444
|
-
x = x.to_a[0]
|
445
|
-
else
|
446
|
-
x = x.map do |xi|
|
447
|
-
case xi
|
448
|
-
when Hash
|
449
|
-
xi
|
450
|
-
when Array
|
451
|
-
Hash[xi.map.with_index { |v, i| [:"x#{i}", v] }]
|
452
|
-
else
|
453
|
-
{x0: xi}
|
454
|
-
end
|
455
|
-
end
|
456
|
-
end
|
457
|
-
|
458
|
-
# get column types
|
459
|
-
if train
|
460
|
-
column_types = {}
|
461
|
-
if x.any?
|
462
|
-
row = x.first
|
463
|
-
row.each do |k, v|
|
464
|
-
column_types[k] = categorical?(v) ? "categorical" : "numeric"
|
465
|
-
end
|
466
|
-
end
|
467
|
-
else
|
468
|
-
# get column types for prediction
|
469
|
-
column_types = {}
|
470
|
-
coefficients.each do |k, v|
|
471
|
-
next if k == :_intercept
|
472
|
-
if k.is_a?(Array)
|
473
|
-
column_types[k.first] = "categorical"
|
474
|
-
else
|
475
|
-
column_types[k] = "numeric"
|
476
|
-
end
|
477
|
-
end
|
478
|
-
end
|
479
|
-
|
480
|
-
# if !train && x.any?
|
481
|
-
# # check first row against coefficients
|
482
|
-
# ckeys = coefficients.keys.map(&:to_s)
|
483
|
-
# bad_keys = x[0].keys.map(&:to_s).reject { |k| ckeys.any? { |c| c.start_with?(k) } }
|
484
|
-
# raise "Unknown keys: #{bad_keys.join(", ")}" if bad_keys.any?
|
485
|
-
# end
|
486
|
-
|
487
|
-
supports_categorical = train || coefficients.any? { |k, _| k.is_a?(Array) }
|
488
|
-
|
489
|
-
cache = {}
|
490
|
-
first_key = {}
|
491
|
-
i = 0
|
492
|
-
rows = []
|
493
|
-
x.each do |xi|
|
494
|
-
row = {}
|
495
|
-
xi.each do |k, v|
|
496
|
-
categorical = column_types[k.to_sym] == "categorical" || (!supports_categorical && categorical?(v))
|
497
|
-
|
498
|
-
key = categorical ? [k.to_sym, v.to_s] : k.to_sym
|
499
|
-
v2 = categorical ? 1 : v
|
500
|
-
|
501
|
-
# TODO make more efficient
|
502
|
-
check_key = supports_categorical ? key : symbolize_coef(key)
|
503
|
-
next if !train && !coefficients.key?(check_key)
|
504
|
-
|
505
|
-
raise "Missing data" if v2.nil?
|
506
|
-
|
507
|
-
unless cache[key]
|
508
|
-
cache[key] = i
|
509
|
-
first_key[k] ||= key if categorical
|
510
|
-
i += 1
|
511
|
-
end
|
512
|
-
|
513
|
-
row[key] = v2
|
514
|
-
end
|
515
|
-
rows << row
|
516
|
-
end
|
517
|
-
|
518
|
-
if train
|
519
|
-
# remove one degree of freedom
|
520
|
-
first_key.values.each do |v|
|
521
|
-
num = cache.delete(v)
|
522
|
-
cache.each do |k, v2|
|
523
|
-
cache[k] -= 1 if v2 > num
|
524
|
-
end
|
525
|
-
end
|
526
|
-
end
|
527
|
-
|
528
|
-
ret2 = []
|
529
|
-
rows.each do |row|
|
530
|
-
ret = [0] * cache.size
|
531
|
-
row.each do |k, v|
|
532
|
-
if cache[k]
|
533
|
-
ret[cache[k]] = v
|
534
|
-
end
|
535
|
-
end
|
536
|
-
ret2 << ([1] + ret)
|
537
|
-
end
|
538
|
-
|
539
|
-
# flatten keys
|
540
|
-
c = [:_intercept] + cache.sort_by { |_, v| v }.map(&:first)
|
541
|
-
|
542
|
-
unless supports_categorical
|
543
|
-
c = c.map { |v| symbolize_coef(v) }
|
544
|
-
end
|
545
|
-
|
546
|
-
[ret2, c]
|
547
|
-
end
|
548
|
-
|
549
|
-
def symbolize_coef(k)
|
550
|
-
(k.is_a?(Array) ? k.join("") : k).to_sym
|
551
|
-
end
|
552
|
-
|
553
|
-
def matrix_arr(matrix)
|
554
|
-
matrix.to_a.map { |xi| xi[0].to_f }
|
555
|
-
end
|
556
302
|
end
|
557
303
|
end
|