eps 0.3.0 → 0.3.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +31 -5
- data/README.md +77 -9
- data/lib/eps.rb +19 -10
- data/lib/eps/base_estimator.rb +63 -145
- data/lib/eps/data_frame.rb +19 -3
- data/lib/eps/evaluators/lightgbm.rb +20 -7
- data/lib/eps/evaluators/linear_regression.rb +7 -4
- data/lib/eps/evaluators/naive_bayes.rb +9 -7
- data/lib/eps/label_encoder.rb +7 -3
- data/lib/eps/lightgbm.rb +43 -78
- data/lib/eps/linear_regression.rb +53 -83
- data/lib/eps/metrics.rb +24 -12
- data/lib/eps/model.rb +6 -6
- data/lib/eps/naive_bayes.rb +3 -140
- data/lib/eps/pmml.rb +14 -0
- data/lib/eps/pmml/generator.rb +422 -0
- data/lib/eps/pmml/loader.rb +241 -0
- data/lib/eps/version.rb +1 -1
- metadata +36 -6
- data/lib/eps/pmml_generators/lightgbm.rb +0 -187
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 1369016c3cae228f169fe580b54fca3c0d240cda202fa7d03ecc7a4e156ee8c7
+  data.tar.gz: bf83ca424c509798d1a1436806b52cba0cfdbefecb8d827d5b17aec7b807b121
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 2bf47d80a301eb546c348aaa71f847fa22ace5bed63d97a1f19eb14bc15388b056cd3f545ccf251b2bbf2afc485ef81e5559849ff7459e9dd9f88a71c7cbf83a
+  data.tar.gz: 82d65d84e95a6518cd132c2a42cdec20afd05c0013192941b59ee0edb524874d12b2dd9082dd89be1422872c88e827e031469e43b80336c48c7eab7ff4fe611e
data/CHANGELOG.md
CHANGED
@@ -1,4 +1,30 @@
-## 0.3.0
+## 0.3.5 (2020-06-10)
+
+- Added `learning_rate` option for LightGBM
+- Added support for Numo and Rover
+
+## 0.3.4 (2020-04-05)
+
+- Added `predict_probability` for classification
+
+## 0.3.3 (2020-02-24)
+
+- Fixed errors and incorrect predictions with boolean columns
+- Fixed deprecation warnings in Ruby 2.7
+
+## 0.3.2 (2019-12-08)
+
+- Added support for GSLR
+
+## 0.3.1 (2019-12-06)
+
+- Added `weight` option for LightGBM and linear regression
+- Added `intercept` option for linear regression
+- Added LightGBM evaluator safety check
+- Fixed `Unknown label` error for LightGBM
+- Fixed error message for unstable solutions with linear regression
+
+## 0.3.0 (2019-09-05)
 
 - Added support for LightGBM
 - Added text features
@@ -12,22 +38,22 @@ Breaking
 - Removed support for JSON and PFA formats
 - Added smoothing to naive Bayes
 
-## 0.2.1
+## 0.2.1 (2019-05-19)
 
 - Fixed error with `summary`
 - Fixed error with `predict` in `Eps::Base`
 - Fixed error with loaded classification models
 
-## 0.2.0
+## 0.2.0 (2019-05-19)
 
 - Added support for classification
 - Added `to_pmml` method
 - Added `Eps::Base`
 
-## 0.1.1
+## 0.1.1 (2018-07-05)
 
 - Huge performance boost
 
-## 0.1.0
+## 0.1.0 (2018-07-03)
 
 - First release
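For orientation, the options introduced in 0.3.1–0.3.5 above combine roughly like this. This is an illustrative sketch only; `data`, `new_data`, the `"label"` target, and the `:weight` column are placeholders, not part of this diff:

```ruby
require "eps"

# learning_rate (0.3.5) and per-row weights (0.3.1) are passed at training time
model = Eps::Model.new(data, target: "label", learning_rate: 0.01, weight: :weight)

# class probabilities for classification models (0.3.4)
model.predict_probability(new_data)
```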
data/README.md
CHANGED
@@ -4,7 +4,6 @@ Machine learning for Ruby
 
 - Build predictive models quickly and easily
 - Serve models built in Ruby, Python, R, and more
-- No prior knowledge of machine learning required :tada:
 
 Check out [this post](https://ankane.org/rails-meet-data-science) for more info on machine learning with Rails
 
@@ -314,7 +313,7 @@ y = [1, 2, 3]
 Eps::Model.new(x, y)
 ```
 
-
+Data can be an array of arrays
 
 ```ruby
 x = [[1, 2], [2, 0], [3, 1]]
@@ -322,9 +321,22 @@ y = [1, 2, 3]
 Eps::Model.new(x, y)
 ```
 
-
+Or Numo arrays
 
-
+```ruby
+x = Numo::NArray.cast([[1, 2], [2, 0], [3, 1]])
+y = Numo::NArray.cast([1, 2, 3])
+Eps::Model.new(x, y)
+```
+
+Or a Rover data frame
+
+```ruby
+df = Rover.read_csv("houses.csv")
+Eps::Model.new(df, target: "price")
+```
+
+Or a Daru data frame
 
 ```ruby
 df = Daru::DataFrame.from_csv("houses.csv")
@@ -353,9 +365,19 @@ Eps supports:
 - Linear Regression
 - Naive Bayes
 
+### LightGBM
+
+Pass the learning rate with:
+
+```ruby
+Eps::Model.new(data, learning_rate: 0.01)
+```
+
 ### Linear Regression
 
-
+#### Performance
+
+To speed up training on large datasets with linear regression, [install GSL](https://github.com/ankane/gslr#gsl-installation). With Homebrew, you can use:
 
 ```sh
 brew install gsl
@@ -364,11 +386,29 @@ brew install gsl
 Then, add this line to your application’s Gemfile:
 
 ```ruby
-gem '
+gem 'gslr', group: :development
 ```
 
 It only needs to be available in environments used to build the model.
 
+#### Options
+
+By default, an intercept is included. Disable this with:
+
+```ruby
+Eps::Model.new(data, intercept: false)
+```
+
+## Probability
+
+To get the probability of each category for predictions with classification, use:
+
+```ruby
+model.predict_probability(data)
+```
+
+Naive Bayes is known to produce poor probability estimates, so stick with LightGBM if you need this.
+
 ## Validation Options
 
 Pass your own validation set with:
@@ -389,6 +429,12 @@ Specify the validation set size (the default is `0.25`, which is 25%)
 Eps::Model.new(data, split: {validation_size: 0.2})
 ```
 
+Disable the validation set completely with:
+
+```ruby
+Eps::Model.new(data, split: false)
+```
+
 ## Database Storage
 
 The database is another place you can store models. It’s good if you retrain models automatically.
@@ -398,7 +444,7 @@ The database is another place you can store models. It’s good if you retrain m
 Create an ActiveRecord model to store the predictive model.
 
 ```sh
-rails
+rails generate model Model key:string:uniq data:text
 ```
 
 Store the model with:
@@ -419,6 +465,28 @@ model = Eps::Model.load_pmml(data)
 
 You can use [IRuby](https://github.com/SciRuby/iruby) to run Eps in [Jupyter](https://jupyter.org/) notebooks. Here’s how to get [IRuby working with Rails](https://ankane.org/jupyter-rails).
 
+## Weights
+
+Specify a weight for each data point
+
+```ruby
+Eps::Model.new(data, weight: :weight)
+```
+
+You can also pass an array
+
+```ruby
+Eps::Model.new(data, weight: [1, 2, 3])
+```
+
+Weights are supported for metrics as well
+
+```ruby
+Eps.metrics(actual, predicted, weight: weight)
+```
+
+Reweighing is one method to [mitigate bias](http://aif360.mybluemix.net/) in training data
+
 ## Upgrading
 
 ## 0.3.0
@@ -486,11 +554,11 @@ Everyone is encouraged to help improve this project. Here are a few ways you can
 - Write, clarify, or fix documentation
 - Suggest or add new features
 
-To get started with development
+To get started with development:
 
 ```sh
 git clone https://github.com/ankane/eps.git
 cd eps
 bundle install
-rake test
+bundle exec rake test
 ```
data/lib/eps.rb
CHANGED
@@ -1,5 +1,4 @@
 # dependencies
-require "bigdecimal"
 require "json"
 require "lightgbm"
 require "matrix"
@@ -9,10 +8,6 @@ require "nokogiri"
 require "eps/base"
 require "eps/base_estimator"
 require "eps/data_frame"
-require "eps/evaluators/linear_regression"
-require "eps/evaluators/lightgbm"
-require "eps/evaluators/naive_bayes"
-require "eps/evaluators/node"
 require "eps/label_encoder"
 require "eps/lightgbm"
 require "eps/linear_regression"
@@ -24,17 +19,31 @@ require "eps/text_encoder"
 require "eps/utils"
 require "eps/version"
 
+# pmml
+require "eps/pmml"
+require "eps/pmml/generator"
+require "eps/pmml/loader"
+
+# evaluators
+require "eps/evaluators/linear_regression"
+require "eps/evaluators/lightgbm"
+require "eps/evaluators/naive_bayes"
+require "eps/evaluators/node"
+
 module Eps
-
+  class Error < StandardError; end
+  class UnstableSolution < Error; end
+
+  def self.metrics(y_true, y_pred, weight: nil)
     if Utils.column_type(y_true, "actual") == "numeric"
       {
-        rmse: Metrics.rmse(y_true, y_pred),
-        mae: Metrics.mae(y_true, y_pred),
-        me: Metrics.me(y_true, y_pred)
+        rmse: Metrics.rmse(y_true, y_pred, weight: weight),
+        mae: Metrics.mae(y_true, y_pred, weight: weight),
+        me: Metrics.me(y_true, y_pred, weight: weight)
       }
     else
      {
-        accuracy: Metrics.accuracy(y_true, y_pred)
+        accuracy: Metrics.accuracy(y_true, y_pred, weight: weight)
      }
    end
  end
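The net effect of the change above is that `Eps.metrics` now accepts an optional `weight:` array and forwards it to each metric. A minimal sketch with made-up values:

```ruby
actual    = [1.0, 2.0, 3.0]
predicted = [1.1, 1.9, 3.4]
weights   = [1, 1, 2]

# numeric targets return rmse/mae/me; categorical targets return accuracy
Eps.metrics(actual, predicted, weight: weights)
```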
data/lib/eps/base_estimator.rb
CHANGED
@@ -1,53 +1,39 @@
 module Eps
   class BaseEstimator
     def initialize(data = nil, y = nil, **options)
+      @options = options.dup
+      @trained = false
+      # TODO better pattern - don't pass most options to train
       train(data, y, **options) if data
     end
 
     def predict(data)
-
-
-
-      data = Eps::DataFrame.new(data)
-
-      @evaluator.features.each do |k, type|
-        values = data.columns[k]
-        raise ArgumentError, "Missing column: #{k}" if !values
-        column_type = Utils.column_type(values.compact, k) if values
-
-        if !column_type.nil?
-          if (type == "numeric" && column_type != "numeric") || (type != "numeric" && column_type != "categorical")
-            raise ArgumentError, "Bad type for column #{k}: Expected #{type} but got #{column_type}"
-          end
-        end
-        # TODO check for unknown values for categorical features
-      end
-
-      predictions = @evaluator.predict(data)
+      _predict(data, false)
+    end
 
-
+    def predict_probability(data)
+      _predict(data, true)
     end
 
-    def evaluate(data, y = nil, target: nil)
-      data, target = prep_data(data, y, target || @target)
-      Eps.metrics(data.label, predict(data))
+    def evaluate(data, y = nil, target: nil, weight: nil)
+      data, target = prep_data(data, y, target || @target, weight)
+      Eps.metrics(data.label, predict(data), weight: data.weight)
    end
 
     def to_pmml
-
+      @pmml ||= PMML.generate(self)
     end
 
-    def self.load_pmml(
-      if data.is_a?(String)
-        data = Nokogiri::XML(data) { |config| config.strict }
-      end
+    def self.load_pmml(pmml)
       model = new
-      model.instance_variable_set("@
-      model.instance_variable_set("@
+      model.instance_variable_set("@evaluator", PMML.load(pmml))
+      model.instance_variable_set("@pmml", pmml.respond_to?(:to_xml) ? pmml.to_xml : pmml) # cache data
       model
     end
 
     def summary(extended: false)
+      raise "Summary not available for loaded models" unless @trained
+
       str = String.new("")
 
       if @validation_set
@@ -57,11 +43,11 @@ module Eps
         case @target_type
         when "numeric"
           metric_name = "RMSE"
-          v = Metrics.rmse(y_true, y_pred)
+          v = Metrics.rmse(y_true, y_pred, weight: @validation_set.weight)
           metric_value = v.round >= 1000 ? v.round.to_s : "%.3g" % v
         else
           metric_name = "accuracy"
-          metric_value = "%.1f%%" % (100 * Metrics.accuracy(y_true, y_pred)).round(1)
+          metric_value = "%.1f%%" % (100 * Metrics.accuracy(y_true, y_pred, weight: @validation_set.weight)).round(1)
         end
         str << "Validation %s: %s\n\n" % [metric_name, metric_value]
       end
@@ -70,50 +56,34 @@ module Eps
       str
     end
 
-
-    def self.extract_text_features(data, features)
-      # updates features object
-      vocabulary = {}
-      function_mapping = {}
-      derived_fields = {}
-      data.css("LocalTransformations DerivedField, TransformationDictionary DerivedField").each do |n|
-        name = n.attribute("name")&.value
-        field = n.css("FieldRef").attribute("field").value
-        value = n.css("Constant").text
-
-        field = field[10..-2] if field =~ /\Alowercase\(.+\)\z/
-        next if value.empty?
+    private
 
-
+    def _predict(data, probabilities)
+      singular = data.is_a?(Hash)
+      data = [data] if singular
 
-
+      data = Eps::DataFrame.new(data)
 
-
-
+      @evaluator.features.each do |k, type|
+        values = data.columns[k]
+        raise ArgumentError, "Missing column: #{k}" if !values
+        column_type = Utils.column_type(values.compact, k) if values
 
-
-
-
-
-
-
-          case_sensitive: text_index.attribute("isCaseSensitive")&.value == "true"
-        }
+        if !column_type.nil?
+          if (type == "numeric" && column_type != "numeric") || (type != "numeric" && column_type != "categorical")
+            raise ArgumentError, "Bad type for column #{k}: Expected #{type} but got #{column_type}"
+          end
+        end
+        # TODO check for unknown values for categorical features
       end
 
-
-      function_mapping.each do |field, function|
-        text_features[field] = functions[function].merge(vocabulary: vocabulary[field])
-        features[field] = "text"
-      end
+      predictions = @evaluator.predict(data, probabilities: probabilities)
 
-
+      singular ? predictions.first : predictions
     end
 
-
-
-    def train(data, y = nil, target: nil, split: nil, validation_set: nil, verbose: nil, text_features: nil, early_stopping: nil)
-      data, @target = prep_data(data, y, target)
+    def train(data, y = nil, target: nil, weight: nil, split: nil, validation_set: nil, text_features: nil, **options)
+      data, @target = prep_data(data, y, target, weight)
       @target_type = Utils.column_type(data.label, @target)
 
       if split.nil?
@@ -121,6 +91,7 @@ module Eps
       end
 
       # cross validation
+      # TODO adjust based on weight
      if split && !validation_set
        split = {} if split == true
        split = {column: split} unless split.is_a?(Hash)
@@ -193,8 +164,9 @@ module Eps
      else
        @train_set = data.dup
        if validation_set
-
-
+          raise "Target required for validation set" unless target
+          raise "Weight required for validation set" if data.weight && !weight
+          validation_set, _ = prep_data(validation_set, nil, @target, weight)
        end
      end
 
@@ -202,20 +174,37 @@ module Eps
       raise "No data in validation set" if validation_set && validation_set.empty?
 
       @validation_set = validation_set
-      @evaluator = _train(
+      @evaluator = _train(**options)
 
       # reset pmml
       @pmml = nil
 
+      @trained = true
+
       nil
     end
 
-    def prep_data(data, y, target)
+    def prep_data(data, y, target, weight)
       data = Eps::DataFrame.new(data)
+
+      # target
       target = (target || "target").to_s
       y ||= data.columns.delete(target)
       check_missing(y, target)
       data.label = y.to_a
+
+      # weight
+      if weight
+        weight =
+          if weight.respond_to?(:to_a)
+            weight.to_a
+          else
+            data.columns.delete(weight.to_s)
+          end
+        check_missing(weight, "weight")
+        data.weight = weight.to_a
+      end
+
       check_data(data)
       [data, target]
     end
@@ -228,7 +217,7 @@ module Eps
 
         # TODO determine max features automatically
         # start based on number of rows
-        encoder = Eps::TextEncoder.new(v)
+        encoder = Eps::TextEncoder.new(**v)
         counts = encoder.fit(train_set.columns.delete(k))
         encoder.vocabulary.each do |word|
           train_set.columns[[k, word]] = [0] * counts.size
@@ -251,11 +240,12 @@ module Eps
     def check_data(data)
       raise "No data" if data.empty?
       raise "Number of data points differs from target" if data.size != data.label.size
+      raise "Number of data points differs from weight" if data.weight && data.size != data.weight.size
     end
 
     def check_missing(c, name)
       raise ArgumentError, "Missing column: #{name}" if !c
-      raise ArgumentError, "Missing values in column #{name}" if c.any?(&:nil?)
+      raise ArgumentError, "Missing values in column #{name}" if c.to_a.any?(&:nil?)
     end
 
     def check_missing_value(df)
@@ -275,77 +265,5 @@ module Eps
         k
       end
     end
-
-    # pmml
-
-    def build_pmml(data_fields)
-      Nokogiri::XML::Builder.new do |xml|
-        xml.PMML(version: "4.4", xmlns: "http://www.dmg.org/PMML-4_4", "xmlns:xsi" => "http://www.w3.org/2001/XMLSchema-instance") do
-          pmml_header(xml)
-          pmml_data_dictionary(xml, data_fields)
-          pmml_transformation_dictionary(xml)
-          yield xml
-        end
-      end
-    end
-
-    def pmml_header(xml)
-      xml.Header do
-        xml.Application(name: "Eps", version: Eps::VERSION)
-        # xml.Timestamp Time.now.utc.iso8601
-      end
-    end
-
-    def pmml_data_dictionary(xml, data_fields)
-      xml.DataDictionary do
-        data_fields.each do |k, vs|
-          case @features[k]
-          when "categorical", nil
-            xml.DataField(name: k, optype: "categorical", dataType: "string") do
-              vs.map(&:to_s).sort.each do |v|
-                xml.Value(value: v)
-              end
-            end
-          when "text"
-            xml.DataField(name: k, optype: "categorical", dataType: "string")
-          else
-            xml.DataField(name: k, optype: "continuous", dataType: "double")
-          end
-        end
-      end
-    end
-
-    def pmml_transformation_dictionary(xml)
-      if @text_features.any?
-        xml.TransformationDictionary do
-          @text_features.each do |k, text_options|
-            xml.DefineFunction(name: "#{k}Transform", optype: "continuous") do
-              xml.ParameterField(name: "text")
-              xml.ParameterField(name: "term")
-              xml.TextIndex(textField: "text", localTermWeights: "termFrequency", wordSeparatorCharacterRE: text_options[:tokenizer].source, isCaseSensitive: !!text_options[:case_sensitive]) do
-                xml.FieldRef(field: "term")
-              end
-            end
-          end
-        end
-      end
-    end
-
-    def pmml_local_transformations(xml)
-      if @text_features.any?
-        xml.LocalTransformations do
-          @text_features.each do |k, _|
-            @text_encoders[k].vocabulary.each do |v|
-              xml.DerivedField(name: display_field([k, v]), optype: "continuous", dataType: "integer") do
-                xml.Apply(function: "#{k}Transform") do
-                  xml.FieldRef(field: k)
-                  xml.Constant v
-                end
-              end
-            end
-          end
-        end
-      end
-    end
   end
 end
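To summarize the public behavior implemented by `_predict` and `load_pmml` above, here is a rough usage sketch; the file name and feature names are placeholders and not part of this diff:

```ruby
model = Eps::Model.load_pmml(File.read("model.pmml"))

model.predict({bedrooms: 4, bathrooms: 2})    # hash in, single prediction out
model.predict([{bedrooms: 4, bathrooms: 2}])  # array in, array of predictions out

# classification models only (0.3.4+)
model.predict_probability({bedrooms: 4, bathrooms: 2})

# summary now raises "Summary not available for loaded models" for models
# loaded from PMML, since @trained is only set during training
```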