eps 0.3.0 → 0.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +12 -5
- data/README.md +34 -0
- data/lib/eps.rb +19 -10
- data/lib/eps/base_estimator.rb +35 -129
- data/lib/eps/data_frame.rb +7 -1
- data/lib/eps/evaluators/linear_regression.rb +1 -1
- data/lib/eps/label_encoder.rb +7 -3
- data/lib/eps/lightgbm.rb +36 -76
- data/lib/eps/linear_regression.rb +26 -79
- data/lib/eps/metrics.rb +24 -12
- data/lib/eps/model.rb +6 -6
- data/lib/eps/naive_bayes.rb +2 -139
- data/lib/eps/pmml.rb +14 -0
- data/lib/eps/pmml/generator.rb +422 -0
- data/lib/eps/pmml/loader.rb +241 -0
- data/lib/eps/version.rb +1 -1
- metadata +7 -5
- data/lib/eps/pmml_generators/lightgbm.rb +0 -187
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: a59850fe508d404a023145710505e721f1bfc24935a30a090aee09d179887d3a
|
4
|
+
data.tar.gz: 8218bc5bb63ee5ebbd23a8e9a129bcd76789b1f6bb628d57b015f1d5740183ac
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: db1011e9228763dc0a98e1e57d1c9e18a297d362cea18b33bf8eeffecce853ea49d4273ae4e782a6de2be37711e9e6373810e5517558248489e696b477c0848b
|
7
|
+
data.tar.gz: 6b9f52453be9d2ad7a29a4703508763988447de64a7599c53f9b9d3b0135e105130aba3c2679fed17ea60ba7242b6bd0d3cac9c5c2b796fe93f9009f0bbbcb30
|
data/CHANGELOG.md
CHANGED
@@ -1,4 +1,11 @@
|
|
1
|
-
## 0.3.0
|
1
|
+
## 0.3.1 (2019-12-06)
|
2
|
+
|
3
|
+
- Added `weight` option for LightGBM and linear regression
|
4
|
+
- Added `intercept` option for linear regression
|
5
|
+
- Fixed `Unknown label` error for LightGBM
|
6
|
+
- Fixed error message for unstable solutions with linear regression
|
7
|
+
|
8
|
+
## 0.3.0 (2019-09-05)
|
2
9
|
|
3
10
|
- Added support for LightGBM
|
4
11
|
- Added text features
|
@@ -12,22 +19,22 @@ Breaking
|
|
12
19
|
- Removed support for JSON and PFA formats
|
13
20
|
- Added smoothing to naive Bayes
|
14
21
|
|
15
|
-
## 0.2.1
|
22
|
+
## 0.2.1 (2019-05-19)
|
16
23
|
|
17
24
|
- Fixed error with `summary`
|
18
25
|
- Fixed error with `predict` in `Eps::Base`
|
19
26
|
- Fixed error with loaded classification models
|
20
27
|
|
21
|
-
## 0.2.0
|
28
|
+
## 0.2.0 (2019-05-19)
|
22
29
|
|
23
30
|
- Added support for classification
|
24
31
|
- Added `to_pmml` method
|
25
32
|
- Added `Eps::Base`
|
26
33
|
|
27
|
-
## 0.1.1
|
34
|
+
## 0.1.1 (2018-07-05)
|
28
35
|
|
29
36
|
- Huge performance boost
|
30
37
|
|
31
|
-
## 0.1.0
|
38
|
+
## 0.1.0 (2018-07-03)
|
32
39
|
|
33
40
|
- First release
|
data/README.md
CHANGED
@@ -369,6 +369,12 @@ gem 'gsl', group: :development
|
|
369
369
|
|
370
370
|
It only needs to be available in environments used to build the model.
|
371
371
|
|
372
|
+
By default, an intercept is included. Disable this with:
|
373
|
+
|
374
|
+
```ruby
|
375
|
+
Eps::Model.new(data, intercept: false)
|
376
|
+
```
|
377
|
+
|
372
378
|
## Validation Options
|
373
379
|
|
374
380
|
Pass your own validation set with:
|
@@ -389,6 +395,12 @@ Specify the validation set size (the default is `0.25`, which is 25%)
|
|
389
395
|
Eps::Model.new(data, split: {validation_size: 0.2})
|
390
396
|
```
|
391
397
|
|
398
|
+
Disable the validation set completely with:
|
399
|
+
|
400
|
+
```ruby
|
401
|
+
Eps::Model.new(data, split: false)
|
402
|
+
```
|
403
|
+
|
392
404
|
## Database Storage
|
393
405
|
|
394
406
|
The database is another place you can store models. It’s good if you retrain models automatically.
|
@@ -419,6 +431,28 @@ model = Eps::Model.load_pmml(data)
|
|
419
431
|
|
420
432
|
You can use [IRuby](https://github.com/SciRuby/iruby) to run Eps in [Jupyter](https://jupyter.org/) notebooks. Here’s how to get [IRuby working with Rails](https://ankane.org/jupyter-rails).
|
421
433
|
|
434
|
+
## Weights
|
435
|
+
|
436
|
+
Specify a weight for each data point
|
437
|
+
|
438
|
+
```ruby
|
439
|
+
Eps::Model.new(data, weight: :weight)
|
440
|
+
```
|
441
|
+
|
442
|
+
You can also pass an array
|
443
|
+
|
444
|
+
```ruby
|
445
|
+
Eps::Model.new(data, weight: [1, 2, 3])
|
446
|
+
```
|
447
|
+
|
448
|
+
Weights are supported for metrics as well
|
449
|
+
|
450
|
+
```ruby
|
451
|
+
Eps.metrics(actual, predicted, weight: weight)
|
452
|
+
```
|
453
|
+
|
454
|
+
Reweighing is one method to [mitigate bias](http://aif360.mybluemix.net/) in training data
|
455
|
+
|
422
456
|
## Upgrading
|
423
457
|
|
424
458
|
## 0.3.0
|
data/lib/eps.rb
CHANGED
@@ -1,5 +1,4 @@
|
|
1
1
|
# dependencies
|
2
|
-
require "bigdecimal"
|
3
2
|
require "json"
|
4
3
|
require "lightgbm"
|
5
4
|
require "matrix"
|
@@ -9,10 +8,6 @@ require "nokogiri"
|
|
9
8
|
require "eps/base"
|
10
9
|
require "eps/base_estimator"
|
11
10
|
require "eps/data_frame"
|
12
|
-
require "eps/evaluators/linear_regression"
|
13
|
-
require "eps/evaluators/lightgbm"
|
14
|
-
require "eps/evaluators/naive_bayes"
|
15
|
-
require "eps/evaluators/node"
|
16
11
|
require "eps/label_encoder"
|
17
12
|
require "eps/lightgbm"
|
18
13
|
require "eps/linear_regression"
|
@@ -24,17 +19,31 @@ require "eps/text_encoder"
|
|
24
19
|
require "eps/utils"
|
25
20
|
require "eps/version"
|
26
21
|
|
22
|
+
# pmml
|
23
|
+
require "eps/pmml"
|
24
|
+
require "eps/pmml/generator"
|
25
|
+
require "eps/pmml/loader"
|
26
|
+
|
27
|
+
# evaluators
|
28
|
+
require "eps/evaluators/linear_regression"
|
29
|
+
require "eps/evaluators/lightgbm"
|
30
|
+
require "eps/evaluators/naive_bayes"
|
31
|
+
require "eps/evaluators/node"
|
32
|
+
|
27
33
|
module Eps
|
28
|
-
|
34
|
+
class Error < StandardError; end
|
35
|
+
class UnstableSolution < Error; end
|
36
|
+
|
37
|
+
def self.metrics(y_true, y_pred, weight: nil)
|
29
38
|
if Utils.column_type(y_true, "actual") == "numeric"
|
30
39
|
{
|
31
|
-
rmse: Metrics.rmse(y_true, y_pred),
|
32
|
-
mae: Metrics.mae(y_true, y_pred),
|
33
|
-
me: Metrics.me(y_true, y_pred)
|
40
|
+
rmse: Metrics.rmse(y_true, y_pred, weight: weight),
|
41
|
+
mae: Metrics.mae(y_true, y_pred, weight: weight),
|
42
|
+
me: Metrics.me(y_true, y_pred, weight: weight)
|
34
43
|
}
|
35
44
|
else
|
36
45
|
{
|
37
|
-
accuracy: Metrics.accuracy(y_true, y_pred)
|
46
|
+
accuracy: Metrics.accuracy(y_true, y_pred, weight: weight)
|
38
47
|
}
|
39
48
|
end
|
40
49
|
end
|
data/lib/eps/base_estimator.rb
CHANGED
@@ -1,6 +1,9 @@
|
|
1
1
|
module Eps
|
2
2
|
class BaseEstimator
|
3
3
|
def initialize(data = nil, y = nil, **options)
|
4
|
+
@options = options.dup
|
5
|
+
# TODO better pattern - don't pass most options to train
|
6
|
+
options.delete(:intercept)
|
4
7
|
train(data, y, **options) if data
|
5
8
|
end
|
6
9
|
|
@@ -28,22 +31,19 @@ module Eps
|
|
28
31
|
singular ? predictions.first : predictions
|
29
32
|
end
|
30
33
|
|
31
|
-
def evaluate(data, y = nil, target: nil)
|
32
|
-
data, target = prep_data(data, y, target || @target)
|
33
|
-
Eps.metrics(data.label, predict(data))
|
34
|
+
def evaluate(data, y = nil, target: nil, weight: nil)
|
35
|
+
data, target = prep_data(data, y, target || @target, weight)
|
36
|
+
Eps.metrics(data.label, predict(data), weight: data.weight)
|
34
37
|
end
|
35
38
|
|
36
39
|
def to_pmml
|
37
|
-
|
40
|
+
@pmml ||= PMML.generate(self)
|
38
41
|
end
|
39
42
|
|
40
|
-
def self.load_pmml(
|
41
|
-
if data.is_a?(String)
|
42
|
-
data = Nokogiri::XML(data) { |config| config.strict }
|
43
|
-
end
|
43
|
+
def self.load_pmml(pmml)
|
44
44
|
model = new
|
45
|
-
model.instance_variable_set("@
|
46
|
-
model.instance_variable_set("@
|
45
|
+
model.instance_variable_set("@evaluator", PMML.load(pmml))
|
46
|
+
model.instance_variable_set("@pmml", pmml.respond_to?(:to_xml) ? pmml.to_xml : pmml) # cache data
|
47
47
|
model
|
48
48
|
end
|
49
49
|
|
@@ -57,11 +57,11 @@ module Eps
|
|
57
57
|
case @target_type
|
58
58
|
when "numeric"
|
59
59
|
metric_name = "RMSE"
|
60
|
-
v = Metrics.rmse(y_true, y_pred)
|
60
|
+
v = Metrics.rmse(y_true, y_pred, weight: @validation_set.weight)
|
61
61
|
metric_value = v.round >= 1000 ? v.round.to_s : "%.3g" % v
|
62
62
|
else
|
63
63
|
metric_name = "accuracy"
|
64
|
-
metric_value = "%.1f%%" % (100 * Metrics.accuracy(y_true, y_pred)).round(1)
|
64
|
+
metric_value = "%.1f%%" % (100 * Metrics.accuracy(y_true, y_pred, weight: @validation_set.weight)).round(1)
|
65
65
|
end
|
66
66
|
str << "Validation %s: %s\n\n" % [metric_name, metric_value]
|
67
67
|
end
|
@@ -70,50 +70,10 @@ module Eps
|
|
70
70
|
str
|
71
71
|
end
|
72
72
|
|
73
|
-
# private
|
74
|
-
def self.extract_text_features(data, features)
|
75
|
-
# updates features object
|
76
|
-
vocabulary = {}
|
77
|
-
function_mapping = {}
|
78
|
-
derived_fields = {}
|
79
|
-
data.css("LocalTransformations DerivedField, TransformationDictionary DerivedField").each do |n|
|
80
|
-
name = n.attribute("name")&.value
|
81
|
-
field = n.css("FieldRef").attribute("field").value
|
82
|
-
value = n.css("Constant").text
|
83
|
-
|
84
|
-
field = field[10..-2] if field =~ /\Alowercase\(.+\)\z/
|
85
|
-
next if value.empty?
|
86
|
-
|
87
|
-
(vocabulary[field] ||= []) << value
|
88
|
-
|
89
|
-
function_mapping[field] = n.css("Apply").attribute("function").value
|
90
|
-
|
91
|
-
derived_fields[name] = [field, value]
|
92
|
-
end
|
93
|
-
|
94
|
-
functions = {}
|
95
|
-
data.css("TransformationDictionary DefineFunction").each do |n|
|
96
|
-
name = n.attribute("name").value
|
97
|
-
text_index = n.css("TextIndex")
|
98
|
-
functions[name] = {
|
99
|
-
tokenizer: Regexp.new(text_index.attribute("wordSeparatorCharacterRE").value),
|
100
|
-
case_sensitive: text_index.attribute("isCaseSensitive")&.value == "true"
|
101
|
-
}
|
102
|
-
end
|
103
|
-
|
104
|
-
text_features = {}
|
105
|
-
function_mapping.each do |field, function|
|
106
|
-
text_features[field] = functions[function].merge(vocabulary: vocabulary[field])
|
107
|
-
features[field] = "text"
|
108
|
-
end
|
109
|
-
|
110
|
-
[text_features, derived_fields]
|
111
|
-
end
|
112
|
-
|
113
73
|
private
|
114
74
|
|
115
|
-
def train(data, y = nil, target: nil, split: nil, validation_set: nil, verbose: nil, text_features: nil, early_stopping: nil)
|
116
|
-
data, @target = prep_data(data, y, target)
|
75
|
+
def train(data, y = nil, target: nil, weight: nil, split: nil, validation_set: nil, verbose: nil, text_features: nil, early_stopping: nil)
|
76
|
+
data, @target = prep_data(data, y, target, weight)
|
117
77
|
@target_type = Utils.column_type(data.label, @target)
|
118
78
|
|
119
79
|
if split.nil?
|
@@ -121,6 +81,7 @@ module Eps
|
|
121
81
|
end
|
122
82
|
|
123
83
|
# cross validation
|
84
|
+
# TODO adjust based on weight
|
124
85
|
if split && !validation_set
|
125
86
|
split = {} if split == true
|
126
87
|
split = {column: split} unless split.is_a?(Hash)
|
@@ -193,8 +154,9 @@ module Eps
|
|
193
154
|
else
|
194
155
|
@train_set = data.dup
|
195
156
|
if validation_set
|
196
|
-
|
197
|
-
|
157
|
+
raise "Target required for validation set" unless target
|
158
|
+
raise "Weight required for validation set" if data.weight && !weight
|
159
|
+
validation_set, _ = prep_data(validation_set, nil, @target, weight)
|
198
160
|
end
|
199
161
|
end
|
200
162
|
|
@@ -210,12 +172,27 @@ module Eps
|
|
210
172
|
nil
|
211
173
|
end
|
212
174
|
|
213
|
-
def prep_data(data, y, target)
|
175
|
+
def prep_data(data, y, target, weight)
|
214
176
|
data = Eps::DataFrame.new(data)
|
177
|
+
|
178
|
+
# target
|
215
179
|
target = (target || "target").to_s
|
216
180
|
y ||= data.columns.delete(target)
|
217
181
|
check_missing(y, target)
|
218
182
|
data.label = y.to_a
|
183
|
+
|
184
|
+
# weight
|
185
|
+
if weight
|
186
|
+
weight =
|
187
|
+
if weight.respond_to?(:to_a)
|
188
|
+
weight.to_a
|
189
|
+
else
|
190
|
+
data.columns.delete(weight.to_s)
|
191
|
+
end
|
192
|
+
check_missing(weight, "weight")
|
193
|
+
data.weight = weight.to_a
|
194
|
+
end
|
195
|
+
|
219
196
|
check_data(data)
|
220
197
|
[data, target]
|
221
198
|
end
|
@@ -251,6 +228,7 @@ module Eps
|
|
251
228
|
def check_data(data)
|
252
229
|
raise "No data" if data.empty?
|
253
230
|
raise "Number of data points differs from target" if data.size != data.label.size
|
231
|
+
raise "Number of data points differs from weight" if data.weight && data.size != data.weight.size
|
254
232
|
end
|
255
233
|
|
256
234
|
def check_missing(c, name)
|
@@ -275,77 +253,5 @@ module Eps
|
|
275
253
|
k
|
276
254
|
end
|
277
255
|
end
|
278
|
-
|
279
|
-
# pmml
|
280
|
-
|
281
|
-
def build_pmml(data_fields)
|
282
|
-
Nokogiri::XML::Builder.new do |xml|
|
283
|
-
xml.PMML(version: "4.4", xmlns: "http://www.dmg.org/PMML-4_4", "xmlns:xsi" => "http://www.w3.org/2001/XMLSchema-instance") do
|
284
|
-
pmml_header(xml)
|
285
|
-
pmml_data_dictionary(xml, data_fields)
|
286
|
-
pmml_transformation_dictionary(xml)
|
287
|
-
yield xml
|
288
|
-
end
|
289
|
-
end
|
290
|
-
end
|
291
|
-
|
292
|
-
def pmml_header(xml)
|
293
|
-
xml.Header do
|
294
|
-
xml.Application(name: "Eps", version: Eps::VERSION)
|
295
|
-
# xml.Timestamp Time.now.utc.iso8601
|
296
|
-
end
|
297
|
-
end
|
298
|
-
|
299
|
-
def pmml_data_dictionary(xml, data_fields)
|
300
|
-
xml.DataDictionary do
|
301
|
-
data_fields.each do |k, vs|
|
302
|
-
case @features[k]
|
303
|
-
when "categorical", nil
|
304
|
-
xml.DataField(name: k, optype: "categorical", dataType: "string") do
|
305
|
-
vs.map(&:to_s).sort.each do |v|
|
306
|
-
xml.Value(value: v)
|
307
|
-
end
|
308
|
-
end
|
309
|
-
when "text"
|
310
|
-
xml.DataField(name: k, optype: "categorical", dataType: "string")
|
311
|
-
else
|
312
|
-
xml.DataField(name: k, optype: "continuous", dataType: "double")
|
313
|
-
end
|
314
|
-
end
|
315
|
-
end
|
316
|
-
end
|
317
|
-
|
318
|
-
def pmml_transformation_dictionary(xml)
|
319
|
-
if @text_features.any?
|
320
|
-
xml.TransformationDictionary do
|
321
|
-
@text_features.each do |k, text_options|
|
322
|
-
xml.DefineFunction(name: "#{k}Transform", optype: "continuous") do
|
323
|
-
xml.ParameterField(name: "text")
|
324
|
-
xml.ParameterField(name: "term")
|
325
|
-
xml.TextIndex(textField: "text", localTermWeights: "termFrequency", wordSeparatorCharacterRE: text_options[:tokenizer].source, isCaseSensitive: !!text_options[:case_sensitive]) do
|
326
|
-
xml.FieldRef(field: "term")
|
327
|
-
end
|
328
|
-
end
|
329
|
-
end
|
330
|
-
end
|
331
|
-
end
|
332
|
-
end
|
333
|
-
|
334
|
-
def pmml_local_transformations(xml)
|
335
|
-
if @text_features.any?
|
336
|
-
xml.LocalTransformations do
|
337
|
-
@text_features.each do |k, _|
|
338
|
-
@text_encoders[k].vocabulary.each do |v|
|
339
|
-
xml.DerivedField(name: display_field([k, v]), optype: "continuous", dataType: "integer") do
|
340
|
-
xml.Apply(function: "#{k}Transform") do
|
341
|
-
xml.FieldRef(field: k)
|
342
|
-
xml.Constant v
|
343
|
-
end
|
344
|
-
end
|
345
|
-
end
|
346
|
-
end
|
347
|
-
end
|
348
|
-
end
|
349
|
-
end
|
350
256
|
end
|
351
257
|
end
|
data/lib/eps/data_frame.rb
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
module Eps
|
2
2
|
class DataFrame
|
3
3
|
attr_reader :columns
|
4
|
-
attr_accessor :label
|
4
|
+
attr_accessor :label, :weight
|
5
5
|
|
6
6
|
def initialize(data = [])
|
7
7
|
@columns = {}
|
@@ -78,6 +78,10 @@ module Eps
|
|
78
78
|
rows = Range.new(rows.begin, size - 1)
|
79
79
|
elsif rows.end < 0
|
80
80
|
rows = Range.new(rows.begin, size + rows.end, rows.exclude_end?)
|
81
|
+
else
|
82
|
+
finish = rows.end
|
83
|
+
finish -= 1 if rows.exclude_end?
|
84
|
+
rows = Range.new(rows.begin, size - 1) if finish >= size - 1
|
81
85
|
end
|
82
86
|
end
|
83
87
|
|
@@ -115,6 +119,7 @@ module Eps
|
|
115
119
|
df.columns[c] = columns[c].values_at(*rows)
|
116
120
|
end
|
117
121
|
df.label = label.values_at(*rows) if label
|
122
|
+
df.weight = weight.values_at(*rows) if weight
|
118
123
|
|
119
124
|
singular ? df.columns[cols[0]] : df
|
120
125
|
end
|
@@ -129,6 +134,7 @@ module Eps
|
|
129
134
|
df.columns[k] = v
|
130
135
|
end
|
131
136
|
df.label = label
|
137
|
+
df.weight = weight
|
132
138
|
df
|
133
139
|
end
|
134
140
|
|
data/lib/eps/label_encoder.rb
CHANGED
@@ -24,9 +24,13 @@ module Eps
|
|
24
24
|
if yi.nil?
|
25
25
|
nil
|
26
26
|
else
|
27
|
-
|
28
|
-
|
29
|
-
|
27
|
+
# use an additional label for unseen values
|
28
|
+
# this is only used during training for the LightGBM eval_set
|
29
|
+
# LightGBM ignores them (only uses seen categories for predictions)
|
30
|
+
# https://github.com/microsoft/LightGBM/issues/1936
|
31
|
+
# the evaluator also ignores them (to be consistent with LightGBM)
|
32
|
+
# but doesn't use this code
|
33
|
+
@labels[yi.to_s] || @labels.size
|
30
34
|
end
|
31
35
|
end
|
32
36
|
end
|