eps 0.3.0 → 0.3.5
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +31 -5
- data/README.md +77 -9
- data/lib/eps.rb +19 -10
- data/lib/eps/base_estimator.rb +63 -145
- data/lib/eps/data_frame.rb +19 -3
- data/lib/eps/evaluators/lightgbm.rb +20 -7
- data/lib/eps/evaluators/linear_regression.rb +7 -4
- data/lib/eps/evaluators/naive_bayes.rb +9 -7
- data/lib/eps/label_encoder.rb +7 -3
- data/lib/eps/lightgbm.rb +43 -78
- data/lib/eps/linear_regression.rb +53 -83
- data/lib/eps/metrics.rb +24 -12
- data/lib/eps/model.rb +6 -6
- data/lib/eps/naive_bayes.rb +3 -140
- data/lib/eps/pmml.rb +14 -0
- data/lib/eps/pmml/generator.rb +422 -0
- data/lib/eps/pmml/loader.rb +241 -0
- data/lib/eps/version.rb +1 -1
- metadata +36 -6
- data/lib/eps/pmml_generators/lightgbm.rb +0 -187
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 1369016c3cae228f169fe580b54fca3c0d240cda202fa7d03ecc7a4e156ee8c7
|
4
|
+
data.tar.gz: bf83ca424c509798d1a1436806b52cba0cfdbefecb8d827d5b17aec7b807b121
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 2bf47d80a301eb546c348aaa71f847fa22ace5bed63d97a1f19eb14bc15388b056cd3f545ccf251b2bbf2afc485ef81e5559849ff7459e9dd9f88a71c7cbf83a
|
7
|
+
data.tar.gz: 82d65d84e95a6518cd132c2a42cdec20afd05c0013192941b59ee0edb524874d12b2dd9082dd89be1422872c88e827e031469e43b80336c48c7eab7ff4fe611e
|
data/CHANGELOG.md
CHANGED
@@ -1,4 +1,30 @@
|
|
1
|
-
## 0.3.0
|
1
|
+
## 0.3.5 (2020-06-10)
|
2
|
+
|
3
|
+
- Added `learning_rate` option for LightGBM
|
4
|
+
- Added support for Numo and Rover
|
5
|
+
|
6
|
+
## 0.3.4 (2020-04-05)
|
7
|
+
|
8
|
+
- Added `predict_probability` for classification
|
9
|
+
|
10
|
+
## 0.3.3 (2020-02-24)
|
11
|
+
|
12
|
+
- Fixed errors and incorrect predictions with boolean columns
|
13
|
+
- Fixed deprecation warnings in Ruby 2.7
|
14
|
+
|
15
|
+
## 0.3.2 (2019-12-08)
|
16
|
+
|
17
|
+
- Added support for GSLR
|
18
|
+
|
19
|
+
## 0.3.1 (2019-12-06)
|
20
|
+
|
21
|
+
- Added `weight` option for LightGBM and linear regression
|
22
|
+
- Added `intercept` option for linear regression
|
23
|
+
- Added LightGBM evaluator safety check
|
24
|
+
- Fixed `Unknown label` error for LightGBM
|
25
|
+
- Fixed error message for unstable solutions with linear regression
|
26
|
+
|
27
|
+
## 0.3.0 (2019-09-05)
|
2
28
|
|
3
29
|
- Added support for LightGBM
|
4
30
|
- Added text features
|
@@ -12,22 +38,22 @@ Breaking
|
|
12
38
|
- Removed support for JSON and PFA formats
|
13
39
|
- Added smoothing to naive Bayes
|
14
40
|
|
15
|
-
## 0.2.1
|
41
|
+
## 0.2.1 (2019-05-19)
|
16
42
|
|
17
43
|
- Fixed error with `summary`
|
18
44
|
- Fixed error with `predict` in `Eps::Base`
|
19
45
|
- Fixed error with loaded classification models
|
20
46
|
|
21
|
-
## 0.2.0
|
47
|
+
## 0.2.0 (2019-05-19)
|
22
48
|
|
23
49
|
- Added support for classification
|
24
50
|
- Added `to_pmml` method
|
25
51
|
- Added `Eps::Base`
|
26
52
|
|
27
|
-
## 0.1.1
|
53
|
+
## 0.1.1 (2018-07-05)
|
28
54
|
|
29
55
|
- Huge performance boost
|
30
56
|
|
31
|
-
## 0.1.0
|
57
|
+
## 0.1.0 (2018-07-03)
|
32
58
|
|
33
59
|
- First release
|
data/README.md
CHANGED
@@ -4,7 +4,6 @@ Machine learning for Ruby
|
|
4
4
|
|
5
5
|
- Build predictive models quickly and easily
|
6
6
|
- Serve models built in Ruby, Python, R, and more
|
7
|
-
- No prior knowledge of machine learning required :tada:
|
8
7
|
|
9
8
|
Check out [this post](https://ankane.org/rails-meet-data-science) for more info on machine learning with Rails
|
10
9
|
|
@@ -314,7 +313,7 @@ y = [1, 2, 3]
|
|
314
313
|
Eps::Model.new(x, y)
|
315
314
|
```
|
316
315
|
|
317
|
-
|
316
|
+
Data can be an array of arrays
|
318
317
|
|
319
318
|
```ruby
|
320
319
|
x = [[1, 2], [2, 0], [3, 1]]
|
@@ -322,9 +321,22 @@ y = [1, 2, 3]
|
|
322
321
|
Eps::Model.new(x, y)
|
323
322
|
```
|
324
323
|
|
325
|
-
|
324
|
+
Or Numo arrays
|
326
325
|
|
327
|
-
|
326
|
+
```ruby
|
327
|
+
x = Numo::NArray.cast([[1, 2], [2, 0], [3, 1]])
|
328
|
+
y = Numo::NArray.cast([1, 2, 3])
|
329
|
+
Eps::Model.new(x, y)
|
330
|
+
```
|
331
|
+
|
332
|
+
Or a Rover data frame
|
333
|
+
|
334
|
+
```ruby
|
335
|
+
df = Rover.read_csv("houses.csv")
|
336
|
+
Eps::Model.new(df, target: "price")
|
337
|
+
```
|
338
|
+
|
339
|
+
Or a Daru data frame
|
328
340
|
|
329
341
|
```ruby
|
330
342
|
df = Daru::DataFrame.from_csv("houses.csv")
|
@@ -353,9 +365,19 @@ Eps supports:
|
|
353
365
|
- Linear Regression
|
354
366
|
- Naive Bayes
|
355
367
|
|
368
|
+
### LightGBM
|
369
|
+
|
370
|
+
Pass the learning rate with:
|
371
|
+
|
372
|
+
```ruby
|
373
|
+
Eps::Model.new(data, learning_rate: 0.01)
|
374
|
+
```
|
375
|
+
|
356
376
|
### Linear Regression
|
357
377
|
|
358
|
-
|
378
|
+
#### Performance
|
379
|
+
|
380
|
+
To speed up training on large datasets with linear regression, [install GSL](https://github.com/ankane/gslr#gsl-installation). With Homebrew, you can use:
|
359
381
|
|
360
382
|
```sh
|
361
383
|
brew install gsl
|
@@ -364,11 +386,29 @@ brew install gsl
|
|
364
386
|
Then, add this line to your application’s Gemfile:
|
365
387
|
|
366
388
|
```ruby
|
367
|
-
gem 'gsl', group: :development
|
389
|
+
gem 'gslr', group: :development
|
368
390
|
```
|
369
391
|
|
370
392
|
It only needs to be available in environments used to build the model.
|
371
393
|
|
394
|
+
#### Options
|
395
|
+
|
396
|
+
By default, an intercept is included. Disable this with:
|
397
|
+
|
398
|
+
```ruby
|
399
|
+
Eps::Model.new(data, intercept: false)
|
400
|
+
```
|
401
|
+
|
402
|
+
## Probability
|
403
|
+
|
404
|
+
To get the probability of each category for predictions with classification, use:
|
405
|
+
|
406
|
+
```ruby
|
407
|
+
model.predict_probability(data)
|
408
|
+
```
|
409
|
+
|
410
|
+
Naive Bayes is known to produce poor probability estimates, so stick with LightGBM if you need this.
|
411
|
+
|
372
412
|
## Validation Options
|
373
413
|
|
374
414
|
Pass your own validation set with:
|
@@ -389,6 +429,12 @@ Specify the validation set size (the default is `0.25`, which is 25%)
|
|
389
429
|
Eps::Model.new(data, split: {validation_size: 0.2})
|
390
430
|
```
|
391
431
|
|
432
|
+
Disable the validation set completely with:
|
433
|
+
|
434
|
+
```ruby
|
435
|
+
Eps::Model.new(data, split: false)
|
436
|
+
```
|
437
|
+
|
392
438
|
## Database Storage
|
393
439
|
|
394
440
|
The database is another place you can store models. It’s good if you retrain models automatically.
|
@@ -398,7 +444,7 @@ The database is another place you can store models. It’s good if you retrain m
|
|
398
444
|
Create an ActiveRecord model to store the predictive model.
|
399
445
|
|
400
446
|
```sh
|
401
|
-
rails g model Model key:string:uniq data:text
|
447
|
+
rails generate model Model key:string:uniq data:text
|
402
448
|
```
|
403
449
|
|
404
450
|
Store the model with:
|
@@ -419,6 +465,28 @@ model = Eps::Model.load_pmml(data)
|
|
419
465
|
|
420
466
|
You can use [IRuby](https://github.com/SciRuby/iruby) to run Eps in [Jupyter](https://jupyter.org/) notebooks. Here’s how to get [IRuby working with Rails](https://ankane.org/jupyter-rails).
|
421
467
|
|
468
|
+
## Weights
|
469
|
+
|
470
|
+
Specify a weight for each data point
|
471
|
+
|
472
|
+
```ruby
|
473
|
+
Eps::Model.new(data, weight: :weight)
|
474
|
+
```
|
475
|
+
|
476
|
+
You can also pass an array
|
477
|
+
|
478
|
+
```ruby
|
479
|
+
Eps::Model.new(data, weight: [1, 2, 3])
|
480
|
+
```
|
481
|
+
|
482
|
+
Weights are supported for metrics as well
|
483
|
+
|
484
|
+
```ruby
|
485
|
+
Eps.metrics(actual, predicted, weight: weight)
|
486
|
+
```
|
487
|
+
|
488
|
+
Reweighing is one method to [mitigate bias](http://aif360.mybluemix.net/) in training data
|
489
|
+
|
422
490
|
## Upgrading
|
423
491
|
|
424
492
|
## 0.3.0
|
@@ -486,11 +554,11 @@ Everyone is encouraged to help improve this project. Here are a few ways you can
|
|
486
554
|
- Write, clarify, or fix documentation
|
487
555
|
- Suggest or add new features
|
488
556
|
|
489
|
-
To get started with development
|
557
|
+
To get started with development:
|
490
558
|
|
491
559
|
```sh
|
492
560
|
git clone https://github.com/ankane/eps.git
|
493
561
|
cd eps
|
494
562
|
bundle install
|
495
|
-
rake test
|
563
|
+
bundle exec rake test
|
496
564
|
```
|
data/lib/eps.rb
CHANGED
@@ -1,5 +1,4 @@
|
|
1
1
|
# dependencies
|
2
|
-
require "bigdecimal"
|
3
2
|
require "json"
|
4
3
|
require "lightgbm"
|
5
4
|
require "matrix"
|
@@ -9,10 +8,6 @@ require "nokogiri"
|
|
9
8
|
require "eps/base"
|
10
9
|
require "eps/base_estimator"
|
11
10
|
require "eps/data_frame"
|
12
|
-
require "eps/evaluators/linear_regression"
|
13
|
-
require "eps/evaluators/lightgbm"
|
14
|
-
require "eps/evaluators/naive_bayes"
|
15
|
-
require "eps/evaluators/node"
|
16
11
|
require "eps/label_encoder"
|
17
12
|
require "eps/lightgbm"
|
18
13
|
require "eps/linear_regression"
|
@@ -24,17 +19,31 @@ require "eps/text_encoder"
|
|
24
19
|
require "eps/utils"
|
25
20
|
require "eps/version"
|
26
21
|
|
22
|
+
# pmml
|
23
|
+
require "eps/pmml"
|
24
|
+
require "eps/pmml/generator"
|
25
|
+
require "eps/pmml/loader"
|
26
|
+
|
27
|
+
# evaluators
|
28
|
+
require "eps/evaluators/linear_regression"
|
29
|
+
require "eps/evaluators/lightgbm"
|
30
|
+
require "eps/evaluators/naive_bayes"
|
31
|
+
require "eps/evaluators/node"
|
32
|
+
|
27
33
|
module Eps
|
28
|
-
|
34
|
+
class Error < StandardError; end
|
35
|
+
class UnstableSolution < Error; end
|
36
|
+
|
37
|
+
def self.metrics(y_true, y_pred, weight: nil)
|
29
38
|
if Utils.column_type(y_true, "actual") == "numeric"
|
30
39
|
{
|
31
|
-
rmse: Metrics.rmse(y_true, y_pred),
|
32
|
-
mae: Metrics.mae(y_true, y_pred),
|
33
|
-
me: Metrics.me(y_true, y_pred)
|
40
|
+
rmse: Metrics.rmse(y_true, y_pred, weight: weight),
|
41
|
+
mae: Metrics.mae(y_true, y_pred, weight: weight),
|
42
|
+
me: Metrics.me(y_true, y_pred, weight: weight)
|
34
43
|
}
|
35
44
|
else
|
36
45
|
{
|
37
|
-
accuracy: Metrics.accuracy(y_true, y_pred)
|
46
|
+
accuracy: Metrics.accuracy(y_true, y_pred, weight: weight)
|
38
47
|
}
|
39
48
|
end
|
40
49
|
end
|
data/lib/eps/base_estimator.rb
CHANGED
@@ -1,53 +1,39 @@
|
|
1
1
|
module Eps
|
2
2
|
class BaseEstimator
|
3
3
|
def initialize(data = nil, y = nil, **options)
|
4
|
+
@options = options.dup
|
5
|
+
@trained = false
|
6
|
+
# TODO better pattern - don't pass most options to train
|
4
7
|
train(data, y, **options) if data
|
5
8
|
end
|
6
9
|
|
7
10
|
def predict(data)
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
data = Eps::DataFrame.new(data)
|
12
|
-
|
13
|
-
@evaluator.features.each do |k, type|
|
14
|
-
values = data.columns[k]
|
15
|
-
raise ArgumentError, "Missing column: #{k}" if !values
|
16
|
-
column_type = Utils.column_type(values.compact, k) if values
|
17
|
-
|
18
|
-
if !column_type.nil?
|
19
|
-
if (type == "numeric" && column_type != "numeric") || (type != "numeric" && column_type != "categorical")
|
20
|
-
raise ArgumentError, "Bad type for column #{k}: Expected #{type} but got #{column_type}"
|
21
|
-
end
|
22
|
-
end
|
23
|
-
# TODO check for unknown values for categorical features
|
24
|
-
end
|
25
|
-
|
26
|
-
predictions = @evaluator.predict(data)
|
11
|
+
_predict(data, false)
|
12
|
+
end
|
27
13
|
|
28
|
-
|
14
|
+
def predict_probability(data)
|
15
|
+
_predict(data, true)
|
29
16
|
end
|
30
17
|
|
31
|
-
def evaluate(data, y = nil, target: nil)
|
32
|
-
data, target = prep_data(data, y, target || @target)
|
33
|
-
Eps.metrics(data.label, predict(data))
|
18
|
+
def evaluate(data, y = nil, target: nil, weight: nil)
|
19
|
+
data, target = prep_data(data, y, target || @target, weight)
|
20
|
+
Eps.metrics(data.label, predict(data), weight: data.weight)
|
34
21
|
end
|
35
22
|
|
36
23
|
def to_pmml
|
37
|
-
|
24
|
+
@pmml ||= PMML.generate(self)
|
38
25
|
end
|
39
26
|
|
40
|
-
def self.load_pmml(data)
|
41
|
-
if data.is_a?(String)
|
42
|
-
data = Nokogiri::XML(data) { |config| config.strict }
|
43
|
-
end
|
27
|
+
def self.load_pmml(pmml)
|
44
28
|
model = new
|
45
|
-
model.instance_variable_set("@
|
46
|
-
model.instance_variable_set("@
|
29
|
+
model.instance_variable_set("@evaluator", PMML.load(pmml))
|
30
|
+
model.instance_variable_set("@pmml", pmml.respond_to?(:to_xml) ? pmml.to_xml : pmml) # cache data
|
47
31
|
model
|
48
32
|
end
|
49
33
|
|
50
34
|
def summary(extended: false)
|
35
|
+
raise "Summary not available for loaded models" unless @trained
|
36
|
+
|
51
37
|
str = String.new("")
|
52
38
|
|
53
39
|
if @validation_set
|
@@ -57,11 +43,11 @@ module Eps
|
|
57
43
|
case @target_type
|
58
44
|
when "numeric"
|
59
45
|
metric_name = "RMSE"
|
60
|
-
v = Metrics.rmse(y_true, y_pred)
|
46
|
+
v = Metrics.rmse(y_true, y_pred, weight: @validation_set.weight)
|
61
47
|
metric_value = v.round >= 1000 ? v.round.to_s : "%.3g" % v
|
62
48
|
else
|
63
49
|
metric_name = "accuracy"
|
64
|
-
metric_value = "%.1f%%" % (100 * Metrics.accuracy(y_true, y_pred)).round(1)
|
50
|
+
metric_value = "%.1f%%" % (100 * Metrics.accuracy(y_true, y_pred, weight: @validation_set.weight)).round(1)
|
65
51
|
end
|
66
52
|
str << "Validation %s: %s\n\n" % [metric_name, metric_value]
|
67
53
|
end
|
@@ -70,50 +56,34 @@ module Eps
|
|
70
56
|
str
|
71
57
|
end
|
72
58
|
|
73
|
-
|
74
|
-
def self.extract_text_features(data, features)
|
75
|
-
# updates features object
|
76
|
-
vocabulary = {}
|
77
|
-
function_mapping = {}
|
78
|
-
derived_fields = {}
|
79
|
-
data.css("LocalTransformations DerivedField, TransformationDictionary DerivedField").each do |n|
|
80
|
-
name = n.attribute("name")&.value
|
81
|
-
field = n.css("FieldRef").attribute("field").value
|
82
|
-
value = n.css("Constant").text
|
83
|
-
|
84
|
-
field = field[10..-2] if field =~ /\Alowercase\(.+\)\z/
|
85
|
-
next if value.empty?
|
59
|
+
private
|
86
60
|
|
87
|
-
|
61
|
+
def _predict(data, probabilities)
|
62
|
+
singular = data.is_a?(Hash)
|
63
|
+
data = [data] if singular
|
88
64
|
|
89
|
-
|
65
|
+
data = Eps::DataFrame.new(data)
|
90
66
|
|
91
|
-
|
92
|
-
|
67
|
+
@evaluator.features.each do |k, type|
|
68
|
+
values = data.columns[k]
|
69
|
+
raise ArgumentError, "Missing column: #{k}" if !values
|
70
|
+
column_type = Utils.column_type(values.compact, k) if values
|
93
71
|
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
case_sensitive: text_index.attribute("isCaseSensitive")&.value == "true"
|
101
|
-
}
|
72
|
+
if !column_type.nil?
|
73
|
+
if (type == "numeric" && column_type != "numeric") || (type != "numeric" && column_type != "categorical")
|
74
|
+
raise ArgumentError, "Bad type for column #{k}: Expected #{type} but got #{column_type}"
|
75
|
+
end
|
76
|
+
end
|
77
|
+
# TODO check for unknown values for categorical features
|
102
78
|
end
|
103
79
|
|
104
|
-
|
105
|
-
function_mapping.each do |field, function|
|
106
|
-
text_features[field] = functions[function].merge(vocabulary: vocabulary[field])
|
107
|
-
features[field] = "text"
|
108
|
-
end
|
80
|
+
predictions = @evaluator.predict(data, probabilities: probabilities)
|
109
81
|
|
110
|
-
|
82
|
+
singular ? predictions.first : predictions
|
111
83
|
end
|
112
84
|
|
113
|
-
|
114
|
-
|
115
|
-
def train(data, y = nil, target: nil, split: nil, validation_set: nil, verbose: nil, text_features: nil, early_stopping: nil)
|
116
|
-
data, @target = prep_data(data, y, target)
|
85
|
+
def train(data, y = nil, target: nil, weight: nil, split: nil, validation_set: nil, text_features: nil, **options)
|
86
|
+
data, @target = prep_data(data, y, target, weight)
|
117
87
|
@target_type = Utils.column_type(data.label, @target)
|
118
88
|
|
119
89
|
if split.nil?
|
@@ -121,6 +91,7 @@ module Eps
|
|
121
91
|
end
|
122
92
|
|
123
93
|
# cross validation
|
94
|
+
# TODO adjust based on weight
|
124
95
|
if split && !validation_set
|
125
96
|
split = {} if split == true
|
126
97
|
split = {column: split} unless split.is_a?(Hash)
|
@@ -193,8 +164,9 @@ module Eps
|
|
193
164
|
else
|
194
165
|
@train_set = data.dup
|
195
166
|
if validation_set
|
196
|
-
|
197
|
-
|
167
|
+
raise "Target required for validation set" unless target
|
168
|
+
raise "Weight required for validation set" if data.weight && !weight
|
169
|
+
validation_set, _ = prep_data(validation_set, nil, @target, weight)
|
198
170
|
end
|
199
171
|
end
|
200
172
|
|
@@ -202,20 +174,37 @@ module Eps
|
|
202
174
|
raise "No data in validation set" if validation_set && validation_set.empty?
|
203
175
|
|
204
176
|
@validation_set = validation_set
|
205
|
-
@evaluator = _train(
|
177
|
+
@evaluator = _train(**options)
|
206
178
|
|
207
179
|
# reset pmml
|
208
180
|
@pmml = nil
|
209
181
|
|
182
|
+
@trained = true
|
183
|
+
|
210
184
|
nil
|
211
185
|
end
|
212
186
|
|
213
|
-
def prep_data(data, y, target)
|
187
|
+
def prep_data(data, y, target, weight)
|
214
188
|
data = Eps::DataFrame.new(data)
|
189
|
+
|
190
|
+
# target
|
215
191
|
target = (target || "target").to_s
|
216
192
|
y ||= data.columns.delete(target)
|
217
193
|
check_missing(y, target)
|
218
194
|
data.label = y.to_a
|
195
|
+
|
196
|
+
# weight
|
197
|
+
if weight
|
198
|
+
weight =
|
199
|
+
if weight.respond_to?(:to_a)
|
200
|
+
weight.to_a
|
201
|
+
else
|
202
|
+
data.columns.delete(weight.to_s)
|
203
|
+
end
|
204
|
+
check_missing(weight, "weight")
|
205
|
+
data.weight = weight.to_a
|
206
|
+
end
|
207
|
+
|
219
208
|
check_data(data)
|
220
209
|
[data, target]
|
221
210
|
end
|
@@ -228,7 +217,7 @@ module Eps
|
|
228
217
|
|
229
218
|
# TODO determine max features automatically
|
230
219
|
# start based on number of rows
|
231
|
-
encoder = Eps::TextEncoder.new(v)
|
220
|
+
encoder = Eps::TextEncoder.new(**v)
|
232
221
|
counts = encoder.fit(train_set.columns.delete(k))
|
233
222
|
encoder.vocabulary.each do |word|
|
234
223
|
train_set.columns[[k, word]] = [0] * counts.size
|
@@ -251,11 +240,12 @@ module Eps
|
|
251
240
|
def check_data(data)
|
252
241
|
raise "No data" if data.empty?
|
253
242
|
raise "Number of data points differs from target" if data.size != data.label.size
|
243
|
+
raise "Number of data points differs from weight" if data.weight && data.size != data.weight.size
|
254
244
|
end
|
255
245
|
|
256
246
|
def check_missing(c, name)
|
257
247
|
raise ArgumentError, "Missing column: #{name}" if !c
|
258
|
-
raise ArgumentError, "Missing values in column #{name}" if c.any?(&:nil?)
|
248
|
+
raise ArgumentError, "Missing values in column #{name}" if c.to_a.any?(&:nil?)
|
259
249
|
end
|
260
250
|
|
261
251
|
def check_missing_value(df)
|
@@ -275,77 +265,5 @@ module Eps
|
|
275
265
|
k
|
276
266
|
end
|
277
267
|
end
|
278
|
-
|
279
|
-
# pmml
|
280
|
-
|
281
|
-
def build_pmml(data_fields)
|
282
|
-
Nokogiri::XML::Builder.new do |xml|
|
283
|
-
xml.PMML(version: "4.4", xmlns: "http://www.dmg.org/PMML-4_4", "xmlns:xsi" => "http://www.w3.org/2001/XMLSchema-instance") do
|
284
|
-
pmml_header(xml)
|
285
|
-
pmml_data_dictionary(xml, data_fields)
|
286
|
-
pmml_transformation_dictionary(xml)
|
287
|
-
yield xml
|
288
|
-
end
|
289
|
-
end
|
290
|
-
end
|
291
|
-
|
292
|
-
def pmml_header(xml)
|
293
|
-
xml.Header do
|
294
|
-
xml.Application(name: "Eps", version: Eps::VERSION)
|
295
|
-
# xml.Timestamp Time.now.utc.iso8601
|
296
|
-
end
|
297
|
-
end
|
298
|
-
|
299
|
-
def pmml_data_dictionary(xml, data_fields)
|
300
|
-
xml.DataDictionary do
|
301
|
-
data_fields.each do |k, vs|
|
302
|
-
case @features[k]
|
303
|
-
when "categorical", nil
|
304
|
-
xml.DataField(name: k, optype: "categorical", dataType: "string") do
|
305
|
-
vs.map(&:to_s).sort.each do |v|
|
306
|
-
xml.Value(value: v)
|
307
|
-
end
|
308
|
-
end
|
309
|
-
when "text"
|
310
|
-
xml.DataField(name: k, optype: "categorical", dataType: "string")
|
311
|
-
else
|
312
|
-
xml.DataField(name: k, optype: "continuous", dataType: "double")
|
313
|
-
end
|
314
|
-
end
|
315
|
-
end
|
316
|
-
end
|
317
|
-
|
318
|
-
def pmml_transformation_dictionary(xml)
|
319
|
-
if @text_features.any?
|
320
|
-
xml.TransformationDictionary do
|
321
|
-
@text_features.each do |k, text_options|
|
322
|
-
xml.DefineFunction(name: "#{k}Transform", optype: "continuous") do
|
323
|
-
xml.ParameterField(name: "text")
|
324
|
-
xml.ParameterField(name: "term")
|
325
|
-
xml.TextIndex(textField: "text", localTermWeights: "termFrequency", wordSeparatorCharacterRE: text_options[:tokenizer].source, isCaseSensitive: !!text_options[:case_sensitive]) do
|
326
|
-
xml.FieldRef(field: "term")
|
327
|
-
end
|
328
|
-
end
|
329
|
-
end
|
330
|
-
end
|
331
|
-
end
|
332
|
-
end
|
333
|
-
|
334
|
-
def pmml_local_transformations(xml)
|
335
|
-
if @text_features.any?
|
336
|
-
xml.LocalTransformations do
|
337
|
-
@text_features.each do |k, _|
|
338
|
-
@text_encoders[k].vocabulary.each do |v|
|
339
|
-
xml.DerivedField(name: display_field([k, v]), optype: "continuous", dataType: "integer") do
|
340
|
-
xml.Apply(function: "#{k}Transform") do
|
341
|
-
xml.FieldRef(field: k)
|
342
|
-
xml.Constant v
|
343
|
-
end
|
344
|
-
end
|
345
|
-
end
|
346
|
-
end
|
347
|
-
end
|
348
|
-
end
|
349
|
-
end
|
350
268
|
end
|
351
269
|
end
|