eps 0.3.0 → 0.3.1

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 3ca27ba2379d1cbfb6f3407ace5ad9dd5fcb71b08e48b8805ddda6483c026194
4
- data.tar.gz: 91bb0beb50664dda5c2a42684414b1972e2bff91c3a993926639939c91272ccd
3
+ metadata.gz: a59850fe508d404a023145710505e721f1bfc24935a30a090aee09d179887d3a
4
+ data.tar.gz: 8218bc5bb63ee5ebbd23a8e9a129bcd76789b1f6bb628d57b015f1d5740183ac
5
5
  SHA512:
6
- metadata.gz: 648d8098928d0ed952ad4cf2195b3e2562db5a38249357b76eb39c0aa17d8f8f974936c4773b2395ae1b1197aedb6e47c8fd018675496f3f966ee2feebb1ed2d
7
- data.tar.gz: aa48887027114d9b654f3564715586a1740b742fe7778602d8db770b4921cff8acfbf90baea3ae6092d7c3962f37763c630857d71fbcd573402dfb016159f0c2
6
+ metadata.gz: db1011e9228763dc0a98e1e57d1c9e18a297d362cea18b33bf8eeffecce853ea49d4273ae4e782a6de2be37711e9e6373810e5517558248489e696b477c0848b
7
+ data.tar.gz: 6b9f52453be9d2ad7a29a4703508763988447de64a7599c53f9b9d3b0135e105130aba3c2679fed17ea60ba7242b6bd0d3cac9c5c2b796fe93f9009f0bbbcb30
data/CHANGELOG.md CHANGED
@@ -1,4 +1,11 @@
1
- ## 0.3.0
1
+ ## 0.3.1 (2019-12-06)
2
+
3
+ - Added `weight` option for LightGBM and linear regression
4
+ - Added `intercept` option for linear regression
5
+ - Fixed `Unknown label` error for LightGBM
6
+ - Fixed error message for unstable solutions with linear regression
7
+
8
+ ## 0.3.0 (2019-09-05)
2
9
 
3
10
  - Added support for LightGBM
4
11
  - Added text features
@@ -12,22 +19,22 @@ Breaking
12
19
  - Removed support for JSON and PFA formats
13
20
  - Added smoothing to naive Bayes
14
21
 
15
- ## 0.2.1
22
+ ## 0.2.1 (2019-05-19)
16
23
 
17
24
  - Fixed error with `summary`
18
25
  - Fixed error with `predict` in `Eps::Base`
19
26
  - Fixed error with loaded classification models
20
27
 
21
- ## 0.2.0
28
+ ## 0.2.0 (2019-05-19)
22
29
 
23
30
  - Added support for classification
24
31
  - Added `to_pmml` method
25
32
  - Added `Eps::Base`
26
33
 
27
- ## 0.1.1
34
+ ## 0.1.1 (2018-07-05)
28
35
 
29
36
  - Huge performance boost
30
37
 
31
- ## 0.1.0
38
+ ## 0.1.0 (2018-07-03)
32
39
 
33
40
  - First release
data/README.md CHANGED
@@ -369,6 +369,12 @@ gem 'gsl', group: :development
369
369
 
370
370
  It only needs to be available in environments used to build the model.
371
371
 
372
+ By default, an intercept is included. Disable this with:
373
+
374
+ ```ruby
375
+ Eps::Model.new(data, intercept: false)
376
+ ```
377
+
372
378
  ## Validation Options
373
379
 
374
380
  Pass your own validation set with:
@@ -389,6 +395,12 @@ Specify the validation set size (the default is `0.25`, which is 25%)
389
395
  Eps::Model.new(data, split: {validation_size: 0.2})
390
396
  ```
391
397
 
398
+ Disable the validation set completely with:
399
+
400
+ ```ruby
401
+ Eps::Model.new(data, split: false)
402
+ ```
403
+
392
404
  ## Database Storage
393
405
 
394
406
  The database is another place you can store models. It’s good if you retrain models automatically.
@@ -419,6 +431,28 @@ model = Eps::Model.load_pmml(data)
419
431
 
420
432
  You can use [IRuby](https://github.com/SciRuby/iruby) to run Eps in [Jupyter](https://jupyter.org/) notebooks. Here’s how to get [IRuby working with Rails](https://ankane.org/jupyter-rails).
421
433
 
434
+ ## Weights
435
+
436
+ Specify a weight for each data point
437
+
438
+ ```ruby
439
+ Eps::Model.new(data, weight: :weight)
440
+ ```
441
+
442
+ You can also pass an array
443
+
444
+ ```ruby
445
+ Eps::Model.new(data, weight: [1, 2, 3])
446
+ ```
447
+
448
+ Weights are supported for metrics as well
449
+
450
+ ```ruby
451
+ Eps.metrics(actual, predicted, weight: weight)
452
+ ```
453
+
454
+ Reweighing is one method to [mitigate bias](http://aif360.mybluemix.net/) in training data
455
+
422
456
  ## Upgrading
423
457
 
424
458
  ## 0.3.0
data/lib/eps.rb CHANGED
@@ -1,5 +1,4 @@
1
1
  # dependencies
2
- require "bigdecimal"
3
2
  require "json"
4
3
  require "lightgbm"
5
4
  require "matrix"
@@ -9,10 +8,6 @@ require "nokogiri"
9
8
  require "eps/base"
10
9
  require "eps/base_estimator"
11
10
  require "eps/data_frame"
12
- require "eps/evaluators/linear_regression"
13
- require "eps/evaluators/lightgbm"
14
- require "eps/evaluators/naive_bayes"
15
- require "eps/evaluators/node"
16
11
  require "eps/label_encoder"
17
12
  require "eps/lightgbm"
18
13
  require "eps/linear_regression"
@@ -24,17 +19,31 @@ require "eps/text_encoder"
24
19
  require "eps/utils"
25
20
  require "eps/version"
26
21
 
22
+ # pmml
23
+ require "eps/pmml"
24
+ require "eps/pmml/generator"
25
+ require "eps/pmml/loader"
26
+
27
+ # evaluators
28
+ require "eps/evaluators/linear_regression"
29
+ require "eps/evaluators/lightgbm"
30
+ require "eps/evaluators/naive_bayes"
31
+ require "eps/evaluators/node"
32
+
27
33
  module Eps
28
- def self.metrics(y_true, y_pred)
34
+ class Error < StandardError; end
35
+ class UnstableSolution < Error; end
36
+
37
+ def self.metrics(y_true, y_pred, weight: nil)
29
38
  if Utils.column_type(y_true, "actual") == "numeric"
30
39
  {
31
- rmse: Metrics.rmse(y_true, y_pred),
32
- mae: Metrics.mae(y_true, y_pred),
33
- me: Metrics.me(y_true, y_pred)
40
+ rmse: Metrics.rmse(y_true, y_pred, weight: weight),
41
+ mae: Metrics.mae(y_true, y_pred, weight: weight),
42
+ me: Metrics.me(y_true, y_pred, weight: weight)
34
43
  }
35
44
  else
36
45
  {
37
- accuracy: Metrics.accuracy(y_true, y_pred)
46
+ accuracy: Metrics.accuracy(y_true, y_pred, weight: weight)
38
47
  }
39
48
  end
40
49
  end
data/lib/eps/base_estimator.rb CHANGED
@@ -1,6 +1,9 @@
1
1
  module Eps
2
2
  class BaseEstimator
3
3
  def initialize(data = nil, y = nil, **options)
4
+ @options = options.dup
5
+ # TODO better pattern - don't pass most options to train
6
+ options.delete(:intercept)
4
7
  train(data, y, **options) if data
5
8
  end
6
9
 
@@ -28,22 +31,19 @@ module Eps
28
31
  singular ? predictions.first : predictions
29
32
  end
30
33
 
31
- def evaluate(data, y = nil, target: nil)
32
- data, target = prep_data(data, y, target || @target)
33
- Eps.metrics(data.label, predict(data))
34
+ def evaluate(data, y = nil, target: nil, weight: nil)
35
+ data, target = prep_data(data, y, target || @target, weight)
36
+ Eps.metrics(data.label, predict(data), weight: data.weight)
34
37
  end
35
38
 
36
39
  def to_pmml
37
- (@pmml ||= generate_pmml).to_xml
40
+ @pmml ||= PMML.generate(self)
38
41
  end
39
42
 
40
- def self.load_pmml(data)
41
- if data.is_a?(String)
42
- data = Nokogiri::XML(data) { |config| config.strict }
43
- end
43
+ def self.load_pmml(pmml)
44
44
  model = new
45
- model.instance_variable_set("@pmml", data) # cache data
46
- model.instance_variable_set("@evaluator", yield(data))
45
+ model.instance_variable_set("@evaluator", PMML.load(pmml))
46
+ model.instance_variable_set("@pmml", pmml.respond_to?(:to_xml) ? pmml.to_xml : pmml) # cache data
47
47
  model
48
48
  end
49
49
 
@@ -57,11 +57,11 @@ module Eps
57
57
  case @target_type
58
58
  when "numeric"
59
59
  metric_name = "RMSE"
60
- v = Metrics.rmse(y_true, y_pred)
60
+ v = Metrics.rmse(y_true, y_pred, weight: @validation_set.weight)
61
61
  metric_value = v.round >= 1000 ? v.round.to_s : "%.3g" % v
62
62
  else
63
63
  metric_name = "accuracy"
64
- metric_value = "%.1f%%" % (100 * Metrics.accuracy(y_true, y_pred)).round(1)
64
+ metric_value = "%.1f%%" % (100 * Metrics.accuracy(y_true, y_pred, weight: @validation_set.weight)).round(1)
65
65
  end
66
66
  str << "Validation %s: %s\n\n" % [metric_name, metric_value]
67
67
  end
@@ -70,50 +70,10 @@ module Eps
70
70
  str
71
71
  end
72
72
 
73
- # private
74
- def self.extract_text_features(data, features)
75
- # updates features object
76
- vocabulary = {}
77
- function_mapping = {}
78
- derived_fields = {}
79
- data.css("LocalTransformations DerivedField, TransformationDictionary DerivedField").each do |n|
80
- name = n.attribute("name")&.value
81
- field = n.css("FieldRef").attribute("field").value
82
- value = n.css("Constant").text
83
-
84
- field = field[10..-2] if field =~ /\Alowercase\(.+\)\z/
85
- next if value.empty?
86
-
87
- (vocabulary[field] ||= []) << value
88
-
89
- function_mapping[field] = n.css("Apply").attribute("function").value
90
-
91
- derived_fields[name] = [field, value]
92
- end
93
-
94
- functions = {}
95
- data.css("TransformationDictionary DefineFunction").each do |n|
96
- name = n.attribute("name").value
97
- text_index = n.css("TextIndex")
98
- functions[name] = {
99
- tokenizer: Regexp.new(text_index.attribute("wordSeparatorCharacterRE").value),
100
- case_sensitive: text_index.attribute("isCaseSensitive")&.value == "true"
101
- }
102
- end
103
-
104
- text_features = {}
105
- function_mapping.each do |field, function|
106
- text_features[field] = functions[function].merge(vocabulary: vocabulary[field])
107
- features[field] = "text"
108
- end
109
-
110
- [text_features, derived_fields]
111
- end
112
-
113
73
  private
114
74
 
115
- def train(data, y = nil, target: nil, split: nil, validation_set: nil, verbose: nil, text_features: nil, early_stopping: nil)
116
- data, @target = prep_data(data, y, target)
75
+ def train(data, y = nil, target: nil, weight: nil, split: nil, validation_set: nil, verbose: nil, text_features: nil, early_stopping: nil)
76
+ data, @target = prep_data(data, y, target, weight)
117
77
  @target_type = Utils.column_type(data.label, @target)
118
78
 
119
79
  if split.nil?
@@ -121,6 +81,7 @@ module Eps
121
81
  end
122
82
 
123
83
  # cross validation
84
+ # TODO adjust based on weight
124
85
  if split && !validation_set
125
86
  split = {} if split == true
126
87
  split = {column: split} unless split.is_a?(Hash)
@@ -193,8 +154,9 @@ module Eps
193
154
  else
194
155
  @train_set = data.dup
195
156
  if validation_set
196
- validation_set = Eps::DataFrame.new(validation_set)
197
- validation_set.label = validation_set.columns.delete(@target)
157
+ raise "Target required for validation set" unless target
158
+ raise "Weight required for validation set" if data.weight && !weight
159
+ validation_set, _ = prep_data(validation_set, nil, @target, weight)
198
160
  end
199
161
  end
200
162
 
@@ -210,12 +172,27 @@ module Eps
210
172
  nil
211
173
  end
212
174
 
213
- def prep_data(data, y, target)
175
+ def prep_data(data, y, target, weight)
214
176
  data = Eps::DataFrame.new(data)
177
+
178
+ # target
215
179
  target = (target || "target").to_s
216
180
  y ||= data.columns.delete(target)
217
181
  check_missing(y, target)
218
182
  data.label = y.to_a
183
+
184
+ # weight
185
+ if weight
186
+ weight =
187
+ if weight.respond_to?(:to_a)
188
+ weight.to_a
189
+ else
190
+ data.columns.delete(weight.to_s)
191
+ end
192
+ check_missing(weight, "weight")
193
+ data.weight = weight.to_a
194
+ end
195
+
219
196
  check_data(data)
220
197
  [data, target]
221
198
  end
@@ -251,6 +228,7 @@ module Eps
251
228
  def check_data(data)
252
229
  raise "No data" if data.empty?
253
230
  raise "Number of data points differs from target" if data.size != data.label.size
231
+ raise "Number of data points differs from weight" if data.weight && data.size != data.weight.size
254
232
  end
255
233
 
256
234
  def check_missing(c, name)
@@ -275,77 +253,5 @@ module Eps
275
253
  k
276
254
  end
277
255
  end
278
-
279
- # pmml
280
-
281
- def build_pmml(data_fields)
282
- Nokogiri::XML::Builder.new do |xml|
283
- xml.PMML(version: "4.4", xmlns: "http://www.dmg.org/PMML-4_4", "xmlns:xsi" => "http://www.w3.org/2001/XMLSchema-instance") do
284
- pmml_header(xml)
285
- pmml_data_dictionary(xml, data_fields)
286
- pmml_transformation_dictionary(xml)
287
- yield xml
288
- end
289
- end
290
- end
291
-
292
- def pmml_header(xml)
293
- xml.Header do
294
- xml.Application(name: "Eps", version: Eps::VERSION)
295
- # xml.Timestamp Time.now.utc.iso8601
296
- end
297
- end
298
-
299
- def pmml_data_dictionary(xml, data_fields)
300
- xml.DataDictionary do
301
- data_fields.each do |k, vs|
302
- case @features[k]
303
- when "categorical", nil
304
- xml.DataField(name: k, optype: "categorical", dataType: "string") do
305
- vs.map(&:to_s).sort.each do |v|
306
- xml.Value(value: v)
307
- end
308
- end
309
- when "text"
310
- xml.DataField(name: k, optype: "categorical", dataType: "string")
311
- else
312
- xml.DataField(name: k, optype: "continuous", dataType: "double")
313
- end
314
- end
315
- end
316
- end
317
-
318
- def pmml_transformation_dictionary(xml)
319
- if @text_features.any?
320
- xml.TransformationDictionary do
321
- @text_features.each do |k, text_options|
322
- xml.DefineFunction(name: "#{k}Transform", optype: "continuous") do
323
- xml.ParameterField(name: "text")
324
- xml.ParameterField(name: "term")
325
- xml.TextIndex(textField: "text", localTermWeights: "termFrequency", wordSeparatorCharacterRE: text_options[:tokenizer].source, isCaseSensitive: !!text_options[:case_sensitive]) do
326
- xml.FieldRef(field: "term")
327
- end
328
- end
329
- end
330
- end
331
- end
332
- end
333
-
334
- def pmml_local_transformations(xml)
335
- if @text_features.any?
336
- xml.LocalTransformations do
337
- @text_features.each do |k, _|
338
- @text_encoders[k].vocabulary.each do |v|
339
- xml.DerivedField(name: display_field([k, v]), optype: "continuous", dataType: "integer") do
340
- xml.Apply(function: "#{k}Transform") do
341
- xml.FieldRef(field: k)
342
- xml.Constant v
343
- end
344
- end
345
- end
346
- end
347
- end
348
- end
349
- end
350
256
  end
351
257
  end
data/lib/eps/data_frame.rb CHANGED
@@ -1,7 +1,7 @@
1
1
  module Eps
2
2
  class DataFrame
3
3
  attr_reader :columns
4
- attr_accessor :label
4
+ attr_accessor :label, :weight
5
5
 
6
6
  def initialize(data = [])
7
7
  @columns = {}
@@ -78,6 +78,10 @@ module Eps
78
78
  rows = Range.new(rows.begin, size - 1)
79
79
  elsif rows.end < 0
80
80
  rows = Range.new(rows.begin, size + rows.end, rows.exclude_end?)
81
+ else
82
+ finish = rows.end
83
+ finish -= 1 if rows.exclude_end?
84
+ rows = Range.new(rows.begin, size - 1) if finish >= size - 1
81
85
  end
82
86
  end
83
87
 
@@ -115,6 +119,7 @@ module Eps
115
119
  df.columns[c] = columns[c].values_at(*rows)
116
120
  end
117
121
  df.label = label.values_at(*rows) if label
122
+ df.weight = weight.values_at(*rows) if weight
118
123
 
119
124
  singular ? df.columns[cols[0]] : df
120
125
  end
@@ -129,6 +134,7 @@ module Eps
129
134
  df.columns[k] = v
130
135
  end
131
136
  df.label = label
137
+ df.weight = weight
132
138
  df
133
139
  end
134
140
 
data/lib/eps/evaluators/linear_regression.rb CHANGED
@@ -10,7 +10,7 @@ module Eps
10
10
  end
11
11
 
12
12
  def predict(x)
13
- intercept = @coefficients["_intercept"]
13
+ intercept = @coefficients["_intercept"] || 0.0
14
14
  scores = [intercept] * x.size
15
15
 
16
16
  @features.each do |k, type|
data/lib/eps/label_encoder.rb CHANGED
@@ -24,9 +24,13 @@ module Eps
24
24
  if yi.nil?
25
25
  nil
26
26
  else
27
- v = @labels[yi.to_s]
28
- raise "Unknown label: #{yi}" unless v
29
- v
27
+ # use an additional label for unseen values
28
+ # this is only used during training for the LightGBM eval_set
29
+ # LightGBM ignores them (only uses seen categories for predictions)
30
+ # https://github.com/microsoft/LightGBM/issues/1936
31
+ # the evaluator also ignores them (to be consistent with LightGBM)
32
+ # but doesn't use this code
33
+ @labels[yi.to_s] || @labels.size
30
34
  end
31
35
  end
32
36
  end