eps 0.3.0 → 0.3.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +12 -5
- data/README.md +34 -0
- data/lib/eps.rb +19 -10
- data/lib/eps/base_estimator.rb +35 -129
- data/lib/eps/data_frame.rb +7 -1
- data/lib/eps/evaluators/linear_regression.rb +1 -1
- data/lib/eps/label_encoder.rb +7 -3
- data/lib/eps/lightgbm.rb +36 -76
- data/lib/eps/linear_regression.rb +26 -79
- data/lib/eps/metrics.rb +24 -12
- data/lib/eps/model.rb +6 -6
- data/lib/eps/naive_bayes.rb +2 -139
- data/lib/eps/pmml.rb +14 -0
- data/lib/eps/pmml/generator.rb +422 -0
- data/lib/eps/pmml/loader.rb +241 -0
- data/lib/eps/version.rb +1 -1
- metadata +7 -5
- data/lib/eps/pmml_generators/lightgbm.rb +0 -187
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: a59850fe508d404a023145710505e721f1bfc24935a30a090aee09d179887d3a
|
4
|
+
data.tar.gz: 8218bc5bb63ee5ebbd23a8e9a129bcd76789b1f6bb628d57b015f1d5740183ac
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: db1011e9228763dc0a98e1e57d1c9e18a297d362cea18b33bf8eeffecce853ea49d4273ae4e782a6de2be37711e9e6373810e5517558248489e696b477c0848b
|
7
|
+
data.tar.gz: 6b9f52453be9d2ad7a29a4703508763988447de64a7599c53f9b9d3b0135e105130aba3c2679fed17ea60ba7242b6bd0d3cac9c5c2b796fe93f9009f0bbbcb30
|
data/CHANGELOG.md
CHANGED
@@ -1,4 +1,11 @@
|
|
1
|
-
## 0.3.
|
1
|
+
## 0.3.1 (2019-12-06)
|
2
|
+
|
3
|
+
- Added `weight` option for LightGBM and linear regression
|
4
|
+
- Added `intercept` option for linear regression
|
5
|
+
- Fixed `Unknown label` error for LightGBM
|
6
|
+
- Fixed error message for unstable solutions with linear regression
|
7
|
+
|
8
|
+
## 0.3.0 (2019-09-05)
|
2
9
|
|
3
10
|
- Added support for LightGBM
|
4
11
|
- Added text features
|
@@ -12,22 +19,22 @@ Breaking
|
|
12
19
|
- Removed support for JSON and PFA formats
|
13
20
|
- Added smoothing to naive Bayes
|
14
21
|
|
15
|
-
## 0.2.1
|
22
|
+
## 0.2.1 (2019-05-19)
|
16
23
|
|
17
24
|
- Fixed error with `summary`
|
18
25
|
- Fixed error with `predict` in `Eps::Base`
|
19
26
|
- Fixed error with loaded classification models
|
20
27
|
|
21
|
-
## 0.2.0
|
28
|
+
## 0.2.0 (2019-05-19)
|
22
29
|
|
23
30
|
- Added support for classification
|
24
31
|
- Added `to_pmml` method
|
25
32
|
- Added `Eps::Base`
|
26
33
|
|
27
|
-
## 0.1.1
|
34
|
+
## 0.1.1 (2018-07-05)
|
28
35
|
|
29
36
|
- Huge performance boost
|
30
37
|
|
31
|
-
## 0.1.0
|
38
|
+
## 0.1.0 (2018-07-03)
|
32
39
|
|
33
40
|
- First release
|
data/README.md
CHANGED
@@ -369,6 +369,12 @@ gem 'gsl', group: :development
|
|
369
369
|
|
370
370
|
It only needs to be available in environments used to build the model.
|
371
371
|
|
372
|
+
By default, an intercept is included. Disable this with:
|
373
|
+
|
374
|
+
```ruby
|
375
|
+
Eps::Model.new(data, intercept: false)
|
376
|
+
```
|
377
|
+
|
372
378
|
## Validation Options
|
373
379
|
|
374
380
|
Pass your own validation set with:
|
@@ -389,6 +395,12 @@ Specify the validation set size (the default is `0.25`, which is 25%)
|
|
389
395
|
Eps::Model.new(data, split: {validation_size: 0.2})
|
390
396
|
```
|
391
397
|
|
398
|
+
Disable the validation set completely with:
|
399
|
+
|
400
|
+
```ruby
|
401
|
+
Eps::Model.new(data, split: false)
|
402
|
+
```
|
403
|
+
|
392
404
|
## Database Storage
|
393
405
|
|
394
406
|
The database is another place you can store models. It’s good if you retrain models automatically.
|
@@ -419,6 +431,28 @@ model = Eps::Model.load_pmml(data)
|
|
419
431
|
|
420
432
|
You can use [IRuby](https://github.com/SciRuby/iruby) to run Eps in [Jupyter](https://jupyter.org/) notebooks. Here’s how to get [IRuby working with Rails](https://ankane.org/jupyter-rails).
|
421
433
|
|
434
|
+
## Weights
|
435
|
+
|
436
|
+
Specify a weight for each data point
|
437
|
+
|
438
|
+
```ruby
|
439
|
+
Eps::Model.new(data, weight: :weight)
|
440
|
+
```
|
441
|
+
|
442
|
+
You can also pass an array
|
443
|
+
|
444
|
+
```ruby
|
445
|
+
Eps::Model.new(data, weight: [1, 2, 3])
|
446
|
+
```
|
447
|
+
|
448
|
+
Weights are supported for metrics as well
|
449
|
+
|
450
|
+
```ruby
|
451
|
+
Eps.metrics(actual, predicted, weight: weight)
|
452
|
+
```
|
453
|
+
|
454
|
+
Reweighing is one method to [mitigate bias](http://aif360.mybluemix.net/) in training data
|
455
|
+
|
422
456
|
## Upgrading
|
423
457
|
|
424
458
|
## 0.3.0
|
data/lib/eps.rb
CHANGED
@@ -1,5 +1,4 @@
|
|
1
1
|
# dependencies
|
2
|
-
require "bigdecimal"
|
3
2
|
require "json"
|
4
3
|
require "lightgbm"
|
5
4
|
require "matrix"
|
@@ -9,10 +8,6 @@ require "nokogiri"
|
|
9
8
|
require "eps/base"
|
10
9
|
require "eps/base_estimator"
|
11
10
|
require "eps/data_frame"
|
12
|
-
require "eps/evaluators/linear_regression"
|
13
|
-
require "eps/evaluators/lightgbm"
|
14
|
-
require "eps/evaluators/naive_bayes"
|
15
|
-
require "eps/evaluators/node"
|
16
11
|
require "eps/label_encoder"
|
17
12
|
require "eps/lightgbm"
|
18
13
|
require "eps/linear_regression"
|
@@ -24,17 +19,31 @@ require "eps/text_encoder"
|
|
24
19
|
require "eps/utils"
|
25
20
|
require "eps/version"
|
26
21
|
|
22
|
+
# pmml
|
23
|
+
require "eps/pmml"
|
24
|
+
require "eps/pmml/generator"
|
25
|
+
require "eps/pmml/loader"
|
26
|
+
|
27
|
+
# evaluators
|
28
|
+
require "eps/evaluators/linear_regression"
|
29
|
+
require "eps/evaluators/lightgbm"
|
30
|
+
require "eps/evaluators/naive_bayes"
|
31
|
+
require "eps/evaluators/node"
|
32
|
+
|
27
33
|
module Eps
|
28
|
-
|
34
|
+
class Error < StandardError; end
|
35
|
+
class UnstableSolution < Error; end
|
36
|
+
|
37
|
+
def self.metrics(y_true, y_pred, weight: nil)
|
29
38
|
if Utils.column_type(y_true, "actual") == "numeric"
|
30
39
|
{
|
31
|
-
rmse: Metrics.rmse(y_true, y_pred),
|
32
|
-
mae: Metrics.mae(y_true, y_pred),
|
33
|
-
me: Metrics.me(y_true, y_pred)
|
40
|
+
rmse: Metrics.rmse(y_true, y_pred, weight: weight),
|
41
|
+
mae: Metrics.mae(y_true, y_pred, weight: weight),
|
42
|
+
me: Metrics.me(y_true, y_pred, weight: weight)
|
34
43
|
}
|
35
44
|
else
|
36
45
|
{
|
37
|
-
accuracy: Metrics.accuracy(y_true, y_pred)
|
46
|
+
accuracy: Metrics.accuracy(y_true, y_pred, weight: weight)
|
38
47
|
}
|
39
48
|
end
|
40
49
|
end
|
data/lib/eps/base_estimator.rb
CHANGED
@@ -1,6 +1,9 @@
|
|
1
1
|
module Eps
|
2
2
|
class BaseEstimator
|
3
3
|
def initialize(data = nil, y = nil, **options)
|
4
|
+
@options = options.dup
|
5
|
+
# TODO better pattern - don't pass most options to train
|
6
|
+
options.delete(:intercept)
|
4
7
|
train(data, y, **options) if data
|
5
8
|
end
|
6
9
|
|
@@ -28,22 +31,19 @@ module Eps
|
|
28
31
|
singular ? predictions.first : predictions
|
29
32
|
end
|
30
33
|
|
31
|
-
def evaluate(data, y = nil, target: nil)
|
32
|
-
data, target = prep_data(data, y, target || @target)
|
33
|
-
Eps.metrics(data.label, predict(data))
|
34
|
+
def evaluate(data, y = nil, target: nil, weight: nil)
|
35
|
+
data, target = prep_data(data, y, target || @target, weight)
|
36
|
+
Eps.metrics(data.label, predict(data), weight: data.weight)
|
34
37
|
end
|
35
38
|
|
36
39
|
def to_pmml
|
37
|
-
|
40
|
+
@pmml ||= PMML.generate(self)
|
38
41
|
end
|
39
42
|
|
40
|
-
def self.load_pmml(
|
41
|
-
if data.is_a?(String)
|
42
|
-
data = Nokogiri::XML(data) { |config| config.strict }
|
43
|
-
end
|
43
|
+
def self.load_pmml(pmml)
|
44
44
|
model = new
|
45
|
-
model.instance_variable_set("@
|
46
|
-
model.instance_variable_set("@
|
45
|
+
model.instance_variable_set("@evaluator", PMML.load(pmml))
|
46
|
+
model.instance_variable_set("@pmml", pmml.respond_to?(:to_xml) ? pmml.to_xml : pmml) # cache data
|
47
47
|
model
|
48
48
|
end
|
49
49
|
|
@@ -57,11 +57,11 @@ module Eps
|
|
57
57
|
case @target_type
|
58
58
|
when "numeric"
|
59
59
|
metric_name = "RMSE"
|
60
|
-
v = Metrics.rmse(y_true, y_pred)
|
60
|
+
v = Metrics.rmse(y_true, y_pred, weight: @validation_set.weight)
|
61
61
|
metric_value = v.round >= 1000 ? v.round.to_s : "%.3g" % v
|
62
62
|
else
|
63
63
|
metric_name = "accuracy"
|
64
|
-
metric_value = "%.1f%%" % (100 * Metrics.accuracy(y_true, y_pred)).round(1)
|
64
|
+
metric_value = "%.1f%%" % (100 * Metrics.accuracy(y_true, y_pred, weight: @validation_set.weight)).round(1)
|
65
65
|
end
|
66
66
|
str << "Validation %s: %s\n\n" % [metric_name, metric_value]
|
67
67
|
end
|
@@ -70,50 +70,10 @@ module Eps
|
|
70
70
|
str
|
71
71
|
end
|
72
72
|
|
73
|
-
# private
|
74
|
-
def self.extract_text_features(data, features)
|
75
|
-
# updates features object
|
76
|
-
vocabulary = {}
|
77
|
-
function_mapping = {}
|
78
|
-
derived_fields = {}
|
79
|
-
data.css("LocalTransformations DerivedField, TransformationDictionary DerivedField").each do |n|
|
80
|
-
name = n.attribute("name")&.value
|
81
|
-
field = n.css("FieldRef").attribute("field").value
|
82
|
-
value = n.css("Constant").text
|
83
|
-
|
84
|
-
field = field[10..-2] if field =~ /\Alowercase\(.+\)\z/
|
85
|
-
next if value.empty?
|
86
|
-
|
87
|
-
(vocabulary[field] ||= []) << value
|
88
|
-
|
89
|
-
function_mapping[field] = n.css("Apply").attribute("function").value
|
90
|
-
|
91
|
-
derived_fields[name] = [field, value]
|
92
|
-
end
|
93
|
-
|
94
|
-
functions = {}
|
95
|
-
data.css("TransformationDictionary DefineFunction").each do |n|
|
96
|
-
name = n.attribute("name").value
|
97
|
-
text_index = n.css("TextIndex")
|
98
|
-
functions[name] = {
|
99
|
-
tokenizer: Regexp.new(text_index.attribute("wordSeparatorCharacterRE").value),
|
100
|
-
case_sensitive: text_index.attribute("isCaseSensitive")&.value == "true"
|
101
|
-
}
|
102
|
-
end
|
103
|
-
|
104
|
-
text_features = {}
|
105
|
-
function_mapping.each do |field, function|
|
106
|
-
text_features[field] = functions[function].merge(vocabulary: vocabulary[field])
|
107
|
-
features[field] = "text"
|
108
|
-
end
|
109
|
-
|
110
|
-
[text_features, derived_fields]
|
111
|
-
end
|
112
|
-
|
113
73
|
private
|
114
74
|
|
115
|
-
def train(data, y = nil, target: nil, split: nil, validation_set: nil, verbose: nil, text_features: nil, early_stopping: nil)
|
116
|
-
data, @target = prep_data(data, y, target)
|
75
|
+
def train(data, y = nil, target: nil, weight: nil, split: nil, validation_set: nil, verbose: nil, text_features: nil, early_stopping: nil)
|
76
|
+
data, @target = prep_data(data, y, target, weight)
|
117
77
|
@target_type = Utils.column_type(data.label, @target)
|
118
78
|
|
119
79
|
if split.nil?
|
@@ -121,6 +81,7 @@ module Eps
|
|
121
81
|
end
|
122
82
|
|
123
83
|
# cross validation
|
84
|
+
# TODO adjust based on weight
|
124
85
|
if split && !validation_set
|
125
86
|
split = {} if split == true
|
126
87
|
split = {column: split} unless split.is_a?(Hash)
|
@@ -193,8 +154,9 @@ module Eps
|
|
193
154
|
else
|
194
155
|
@train_set = data.dup
|
195
156
|
if validation_set
|
196
|
-
|
197
|
-
|
157
|
+
raise "Target required for validation set" unless target
|
158
|
+
raise "Weight required for validation set" if data.weight && !weight
|
159
|
+
validation_set, _ = prep_data(validation_set, nil, @target, weight)
|
198
160
|
end
|
199
161
|
end
|
200
162
|
|
@@ -210,12 +172,27 @@ module Eps
|
|
210
172
|
nil
|
211
173
|
end
|
212
174
|
|
213
|
-
def prep_data(data, y, target)
|
175
|
+
def prep_data(data, y, target, weight)
|
214
176
|
data = Eps::DataFrame.new(data)
|
177
|
+
|
178
|
+
# target
|
215
179
|
target = (target || "target").to_s
|
216
180
|
y ||= data.columns.delete(target)
|
217
181
|
check_missing(y, target)
|
218
182
|
data.label = y.to_a
|
183
|
+
|
184
|
+
# weight
|
185
|
+
if weight
|
186
|
+
weight =
|
187
|
+
if weight.respond_to?(:to_a)
|
188
|
+
weight.to_a
|
189
|
+
else
|
190
|
+
data.columns.delete(weight.to_s)
|
191
|
+
end
|
192
|
+
check_missing(weight, "weight")
|
193
|
+
data.weight = weight.to_a
|
194
|
+
end
|
195
|
+
|
219
196
|
check_data(data)
|
220
197
|
[data, target]
|
221
198
|
end
|
@@ -251,6 +228,7 @@ module Eps
|
|
251
228
|
def check_data(data)
|
252
229
|
raise "No data" if data.empty?
|
253
230
|
raise "Number of data points differs from target" if data.size != data.label.size
|
231
|
+
raise "Number of data points differs from weight" if data.weight && data.size != data.weight.size
|
254
232
|
end
|
255
233
|
|
256
234
|
def check_missing(c, name)
|
@@ -275,77 +253,5 @@ module Eps
|
|
275
253
|
k
|
276
254
|
end
|
277
255
|
end
|
278
|
-
|
279
|
-
# pmml
|
280
|
-
|
281
|
-
def build_pmml(data_fields)
|
282
|
-
Nokogiri::XML::Builder.new do |xml|
|
283
|
-
xml.PMML(version: "4.4", xmlns: "http://www.dmg.org/PMML-4_4", "xmlns:xsi" => "http://www.w3.org/2001/XMLSchema-instance") do
|
284
|
-
pmml_header(xml)
|
285
|
-
pmml_data_dictionary(xml, data_fields)
|
286
|
-
pmml_transformation_dictionary(xml)
|
287
|
-
yield xml
|
288
|
-
end
|
289
|
-
end
|
290
|
-
end
|
291
|
-
|
292
|
-
def pmml_header(xml)
|
293
|
-
xml.Header do
|
294
|
-
xml.Application(name: "Eps", version: Eps::VERSION)
|
295
|
-
# xml.Timestamp Time.now.utc.iso8601
|
296
|
-
end
|
297
|
-
end
|
298
|
-
|
299
|
-
def pmml_data_dictionary(xml, data_fields)
|
300
|
-
xml.DataDictionary do
|
301
|
-
data_fields.each do |k, vs|
|
302
|
-
case @features[k]
|
303
|
-
when "categorical", nil
|
304
|
-
xml.DataField(name: k, optype: "categorical", dataType: "string") do
|
305
|
-
vs.map(&:to_s).sort.each do |v|
|
306
|
-
xml.Value(value: v)
|
307
|
-
end
|
308
|
-
end
|
309
|
-
when "text"
|
310
|
-
xml.DataField(name: k, optype: "categorical", dataType: "string")
|
311
|
-
else
|
312
|
-
xml.DataField(name: k, optype: "continuous", dataType: "double")
|
313
|
-
end
|
314
|
-
end
|
315
|
-
end
|
316
|
-
end
|
317
|
-
|
318
|
-
def pmml_transformation_dictionary(xml)
|
319
|
-
if @text_features.any?
|
320
|
-
xml.TransformationDictionary do
|
321
|
-
@text_features.each do |k, text_options|
|
322
|
-
xml.DefineFunction(name: "#{k}Transform", optype: "continuous") do
|
323
|
-
xml.ParameterField(name: "text")
|
324
|
-
xml.ParameterField(name: "term")
|
325
|
-
xml.TextIndex(textField: "text", localTermWeights: "termFrequency", wordSeparatorCharacterRE: text_options[:tokenizer].source, isCaseSensitive: !!text_options[:case_sensitive]) do
|
326
|
-
xml.FieldRef(field: "term")
|
327
|
-
end
|
328
|
-
end
|
329
|
-
end
|
330
|
-
end
|
331
|
-
end
|
332
|
-
end
|
333
|
-
|
334
|
-
def pmml_local_transformations(xml)
|
335
|
-
if @text_features.any?
|
336
|
-
xml.LocalTransformations do
|
337
|
-
@text_features.each do |k, _|
|
338
|
-
@text_encoders[k].vocabulary.each do |v|
|
339
|
-
xml.DerivedField(name: display_field([k, v]), optype: "continuous", dataType: "integer") do
|
340
|
-
xml.Apply(function: "#{k}Transform") do
|
341
|
-
xml.FieldRef(field: k)
|
342
|
-
xml.Constant v
|
343
|
-
end
|
344
|
-
end
|
345
|
-
end
|
346
|
-
end
|
347
|
-
end
|
348
|
-
end
|
349
|
-
end
|
350
256
|
end
|
351
257
|
end
|
data/lib/eps/data_frame.rb
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
module Eps
|
2
2
|
class DataFrame
|
3
3
|
attr_reader :columns
|
4
|
-
attr_accessor :label
|
4
|
+
attr_accessor :label, :weight
|
5
5
|
|
6
6
|
def initialize(data = [])
|
7
7
|
@columns = {}
|
@@ -78,6 +78,10 @@ module Eps
|
|
78
78
|
rows = Range.new(rows.begin, size - 1)
|
79
79
|
elsif rows.end < 0
|
80
80
|
rows = Range.new(rows.begin, size + rows.end, rows.exclude_end?)
|
81
|
+
else
|
82
|
+
finish = rows.end
|
83
|
+
finish -= 1 if rows.exclude_end?
|
84
|
+
rows = Range.new(rows.begin, size - 1) if finish >= size - 1
|
81
85
|
end
|
82
86
|
end
|
83
87
|
|
@@ -115,6 +119,7 @@ module Eps
|
|
115
119
|
df.columns[c] = columns[c].values_at(*rows)
|
116
120
|
end
|
117
121
|
df.label = label.values_at(*rows) if label
|
122
|
+
df.weight = weight.values_at(*rows) if weight
|
118
123
|
|
119
124
|
singular ? df.columns[cols[0]] : df
|
120
125
|
end
|
@@ -129,6 +134,7 @@ module Eps
|
|
129
134
|
df.columns[k] = v
|
130
135
|
end
|
131
136
|
df.label = label
|
137
|
+
df.weight = weight
|
132
138
|
df
|
133
139
|
end
|
134
140
|
|
data/lib/eps/label_encoder.rb
CHANGED
@@ -24,9 +24,13 @@ module Eps
|
|
24
24
|
if yi.nil?
|
25
25
|
nil
|
26
26
|
else
|
27
|
-
|
28
|
-
|
29
|
-
|
27
|
+
# use an additional label for unseen values
|
28
|
+
# this is only used during training for the LightGBM eval_set
|
29
|
+
# LightGBM ignores them (only uses seen categories for predictions)
|
30
|
+
# https://github.com/microsoft/LightGBM/issues/1936
|
31
|
+
# the evaluator also ignores them (to be consistent with LightGBM)
|
32
|
+
# but doesn't use this code
|
33
|
+
@labels[yi.to_s] || @labels.size
|
30
34
|
end
|
31
35
|
end
|
32
36
|
end
|