eps 0.2.1 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/lib/eps.rb CHANGED
@@ -1,18 +1,42 @@
1
1
  # dependencies
2
- require "matrix"
2
+ require "bigdecimal"
3
3
  require "json"
4
+ require "lightgbm"
5
+ require "matrix"
6
+ require "nokogiri"
4
7
 
5
8
  # modules
6
9
  require "eps/base"
7
10
  require "eps/base_estimator"
11
+ require "eps/data_frame"
12
+ require "eps/evaluators/linear_regression"
13
+ require "eps/evaluators/lightgbm"
14
+ require "eps/evaluators/naive_bayes"
15
+ require "eps/evaluators/node"
16
+ require "eps/label_encoder"
17
+ require "eps/lightgbm"
8
18
  require "eps/linear_regression"
19
+ require "eps/metrics"
9
20
  require "eps/model"
10
21
  require "eps/naive_bayes"
22
+ require "eps/statistics"
23
+ require "eps/text_encoder"
24
+ require "eps/utils"
11
25
  require "eps/version"
12
26
 
13
27
  module Eps
14
- def self.metrics(actual, estimated)
15
- Eps::Model.metrics(actual, estimated)
28
# Returns a Hash of evaluation metrics comparing true and predicted values.
#
# When Utils.column_type reports y_true as "numeric" the Hash has the keys
# :rmse, :mae and :me; for any other column type it has the single key
# :accuracy. All values are delegated to the Metrics module.
def self.metrics(y_true, y_pred)
  # anything that is not numeric is scored by accuracy only
  unless Utils.column_type(y_true, "actual") == "numeric"
    return {accuracy: Metrics.accuracy(y_true, y_pred)}
  end

  {
    rmse: Metrics.rmse(y_true, y_pred),
    mae: Metrics.mae(y_true, y_pred),
    me: Metrics.me(y_true, y_pred)
  }
end
17
41
 
18
42
  # backwards compatibility
@@ -1,81 +1,350 @@
1
1
  module Eps
2
2
  class BaseEstimator
3
- def train(data, y, target: nil, **options)
4
- # TODO more performant conversion
5
- if daru?(data)
6
- x = data.dup
7
- x = x.delete_vector(target) if target
8
- else
9
- x = data.map(&:dup)
10
- x.each { |r| r.delete(target) } if target
11
- end
3
+ def initialize(data = nil, y = nil, **options)
4
+ train(data, y, **options) if data
5
+ end
12
6
 
13
- y = prep_y(y.to_a)
7
+ def predict(data)
8
+ singular = data.is_a?(Hash)
9
+ data = [data] if singular
14
10
 
15
- if x.size != y.size
16
- raise "Number of samples differs from target"
11
+ data = Eps::DataFrame.new(data)
12
+
13
+ @evaluator.features.each do |k, type|
14
+ values = data.columns[k]
15
+ raise ArgumentError, "Missing column: #{k}" if !values
16
+ column_type = Utils.column_type(values.compact, k) if values
17
+
18
+ if !column_type.nil?
19
+ if (type == "numeric" && column_type != "numeric") || (type != "numeric" && column_type != "categorical")
20
+ raise ArgumentError, "Bad type for column #{k}: Expected #{type} but got #{column_type}"
21
+ end
22
+ end
23
+ # TODO check for unknown values for categorical features
17
24
  end
18
25
 
19
- @x = x
20
- @y = y
21
- @target = target || "target"
26
+ predictions = @evaluator.predict(data)
27
+
28
+ singular ? predictions.first : predictions
22
29
  end
23
30
 
24
- def predict(x)
25
- singular = !(x.is_a?(Array) || daru?(x))
26
- x = [x] if singular
31
+ def evaluate(data, y = nil, target: nil)
32
+ data, target = prep_data(data, y, target || @target)
33
+ Eps.metrics(data.label, predict(data))
34
+ end
27
35
 
28
- pred = _predict(x)
36
+ def to_pmml
37
+ (@pmml ||= generate_pmml).to_xml
38
+ end
29
39
 
30
- singular ? pred[0] : pred
40
+ def self.load_pmml(data)
41
+ if data.is_a?(String)
42
+ data = Nokogiri::XML(data) { |config| config.strict }
43
+ end
44
+ model = new
45
+ model.instance_variable_set("@pmml", data) # cache data
46
+ model.instance_variable_set("@evaluator", yield(data))
47
+ model
31
48
  end
32
49
 
33
- def evaluate(data, y = nil, target: nil)
34
- target ||= @target
35
- raise ArgumentError, "missing target" if !target && !y
50
+ def summary(extended: false)
51
+ str = String.new("")
52
+
53
+ if @validation_set
54
+ y_true = @validation_set.label
55
+ y_pred = predict(@validation_set)
36
56
 
37
- actual = y
38
- actual ||=
39
- if daru?(data)
40
- data[target].to_a
57
+ case @target_type
58
+ when "numeric"
59
+ metric_name = "RMSE"
60
+ v = Metrics.rmse(y_true, y_pred)
61
+ metric_value = v.round >= 1000 ? v.round.to_s : "%.3g" % v
41
62
  else
42
- data.map { |v| v[target] }
63
+ metric_name = "accuracy"
64
+ metric_value = "%.1f%%" % (100 * Metrics.accuracy(y_true, y_pred)).round(1)
43
65
  end
66
+ str << "Validation %s: %s\n\n" % [metric_name, metric_value]
67
+ end
68
+
69
+ str << _summary(extended: extended)
70
+ str
71
+ end
72
+
73
+ # private
74
+ def self.extract_text_features(data, features)
75
+ # updates features object
76
+ vocabulary = {}
77
+ function_mapping = {}
78
+ derived_fields = {}
79
+ data.css("LocalTransformations DerivedField, TransformationDictionary DerivedField").each do |n|
80
+ name = n.attribute("name")&.value
81
+ field = n.css("FieldRef").attribute("field").value
82
+ value = n.css("Constant").text
44
83
 
45
- actual = prep_y(actual)
46
- estimated = predict(data)
84
+ field = field[10..-2] if field =~ /\Alowercase\(.+\)\z/
85
+ next if value.empty?
47
86
 
48
- self.class.metrics(actual, estimated)
87
+ (vocabulary[field] ||= []) << value
88
+
89
+ function_mapping[field] = n.css("Apply").attribute("function").value
90
+
91
+ derived_fields[name] = [field, value]
92
+ end
93
+
94
+ functions = {}
95
+ data.css("TransformationDictionary DefineFunction").each do |n|
96
+ name = n.attribute("name").value
97
+ text_index = n.css("TextIndex")
98
+ functions[name] = {
99
+ tokenizer: Regexp.new(text_index.attribute("wordSeparatorCharacterRE").value),
100
+ case_sensitive: text_index.attribute("isCaseSensitive")&.value == "true"
101
+ }
102
+ end
103
+
104
+ text_features = {}
105
+ function_mapping.each do |field, function|
106
+ text_features[field] = functions[function].merge(vocabulary: vocabulary[field])
107
+ features[field] = "text"
108
+ end
109
+
110
+ [text_features, derived_fields]
49
111
  end
50
112
 
51
113
  private
52
114
 
53
- def categorical?(v)
54
- !v.is_a?(Numeric)
115
+ def train(data, y = nil, target: nil, split: nil, validation_set: nil, verbose: nil, text_features: nil, early_stopping: nil)
116
+ data, @target = prep_data(data, y, target)
117
+ @target_type = Utils.column_type(data.label, @target)
118
+
119
+ if split.nil?
120
+ split = data.size >= 30
121
+ end
122
+
123
+ # cross validation
124
+ if split && !validation_set
125
+ split = {} if split == true
126
+ split = {column: split} unless split.is_a?(Hash)
127
+
128
+ split_p = 1 - (split[:validation_size] || 0.25)
129
+ if split[:column]
130
+ split_column = split[:column].to_s
131
+ times = data.columns.delete(split_column)
132
+ check_missing(times, split_column)
133
+ split_index = (times.size * split_p).round
134
+ split_time = split[:value] || times.sort[split_index]
135
+ train_idx, validation_idx = (0...data.size).to_a.partition { |i| times[i] < split_time }
136
+ else
137
+ if split[:shuffle] != false
138
+ rng = Random.new(0) # seed random number generator
139
+ train_idx, validation_idx = (0...data.size).to_a.partition { rng.rand < split_p }
140
+ else
141
+ split_index = (data.size * split_p).round
142
+ train_idx, validation_idx = (0...data.size).to_a.partition { |i| i < split_index }
143
+ end
144
+ end
145
+ end
146
+
147
+ # determine feature types
148
+ @features = {}
149
+ data.columns.each do |k, v|
150
+ @features[k] = Utils.column_type(v.compact, k)
151
+ end
152
+
153
+ # determine text features if not specified
154
+ if text_features.nil?
155
+ text_features = []
156
+
157
+ @features.each do |k, type|
158
+ next if type != "categorical"
159
+
160
+ values = data.columns[k].compact
161
+
162
+ next unless values.first.is_a?(String) # not boolean
163
+
164
+ values = values.reject(&:empty?)
165
+ count = values.count
166
+
167
+ # check if spaces
168
+ # two spaces is rough approximation for 3 words
169
+ # TODO make more performant
170
+ if values.count { |v| v.count(" ") >= 2 } > 0.5 * count
171
+ text_features << k
172
+ end
173
+ end
174
+ end
175
+
176
+ # prep text features
177
+ @text_features = {}
178
+ (text_features || {}).each do |k, v|
179
+ @features[k.to_s] = "text"
180
+
181
+ # same output as scikit-learn CountVectorizer
182
+ # except for max_features
183
+ @text_features[k.to_s] = {
184
+ tokenizer: /\W+/,
185
+ min_length: 2,
186
+ max_features: 100
187
+ }.merge(v || {})
188
+ end
189
+
190
+ if split && !validation_set
191
+ @train_set = data[train_idx]
192
+ validation_set = data[validation_idx]
193
+ else
194
+ @train_set = data.dup
195
+ if validation_set
196
+ validation_set = Eps::DataFrame.new(validation_set)
197
+ validation_set.label = validation_set.columns.delete(@target)
198
+ end
199
+ end
200
+
201
+ raise "No data in training set" if @train_set.empty?
202
+ raise "No data in validation set" if validation_set && validation_set.empty?
203
+
204
+ @validation_set = validation_set
205
+ @evaluator = _train(verbose: verbose, early_stopping: early_stopping)
206
+
207
+ # reset pmml
208
+ @pmml = nil
209
+
210
+ nil
55
211
  end
56
212
 
57
- def daru?(x)
58
- defined?(Daru) && x.is_a?(Daru::DataFrame)
213
+ def prep_data(data, y, target)
214
+ data = Eps::DataFrame.new(data)
215
+ target = (target || "target").to_s
216
+ y ||= data.columns.delete(target)
217
+ check_missing(y, target)
218
+ data.label = y.to_a
219
+ check_data(data)
220
+ [data, target]
59
221
  end
60
222
 
61
- def flip_target(target)
62
- target.is_a?(String) ? target.to_sym : target.to_s
223
+ def prep_text_features(train_set)
224
+ @text_encoders = {}
225
+ @text_features.each do |k, v|
226
+ # reset vocabulary
227
+ v.delete(:vocabulary)
228
+
229
+ # TODO determine max features automatically
230
+ # start based on number of rows
231
+ encoder = Eps::TextEncoder.new(v)
232
+ counts = encoder.fit(train_set.columns.delete(k))
233
+ encoder.vocabulary.each do |word|
234
+ train_set.columns[[k, word]] = [0] * counts.size
235
+ end
236
+ counts.each_with_index do |ci, i|
237
+ ci.each do |word, count|
238
+ word_key = [k, word]
239
+ train_set.columns[word_key][i] = 1 if train_set.columns.key?(word_key)
240
+ end
241
+ end
242
+ @text_encoders[k] = encoder
243
+
244
+ # update vocabulary
245
+ v[:vocabulary] = encoder.vocabulary
246
+ end
247
+
248
+ raise "No features left" if train_set.columns.empty?
63
249
  end
64
250
 
65
- def prep_y(y)
66
- y.each do |yi|
67
- raise "Target missing in data" if yi.nil?
251
+ def check_data(data)
252
+ raise "No data" if data.empty?
253
+ raise "Number of data points differs from target" if data.size != data.label.size
254
+ end
255
+
256
+ def check_missing(c, name)
257
+ raise ArgumentError, "Missing column: #{name}" if !c
258
+ raise ArgumentError, "Missing values in column #{name}" if c.any?(&:nil?)
259
+ end
260
+
261
+ def check_missing_value(df)
262
+ df.columns.each do |k, v|
263
+ check_missing(v, k)
68
264
  end
69
- y
70
265
  end
71
266
 
72
- # determine if target is a string or symbol
73
- def prep_target(target, data)
74
- if daru?(data)
75
- data.has_vector?(target) ? target : flip_target(target)
267
+ def display_field(k)
268
+ if k.is_a?(Array)
269
+ if @features[k.first] == "text"
270
+ "#{k.first}(#{k.last})"
271
+ else
272
+ k.join("=")
273
+ end
76
274
  else
77
- x = data[0] || {}
78
- x[target] ? target : flip_target(target)
275
+ k
276
+ end
277
+ end
278
+
279
+ # pmml
280
+
281
+ def build_pmml(data_fields)
282
+ Nokogiri::XML::Builder.new do |xml|
283
+ xml.PMML(version: "4.4", xmlns: "http://www.dmg.org/PMML-4_4", "xmlns:xsi" => "http://www.w3.org/2001/XMLSchema-instance") do
284
+ pmml_header(xml)
285
+ pmml_data_dictionary(xml, data_fields)
286
+ pmml_transformation_dictionary(xml)
287
+ yield xml
288
+ end
289
+ end
290
+ end
291
+
292
+ def pmml_header(xml)
293
+ xml.Header do
294
+ xml.Application(name: "Eps", version: Eps::VERSION)
295
+ # xml.Timestamp Time.now.utc.iso8601
296
+ end
297
+ end
298
+
299
+ def pmml_data_dictionary(xml, data_fields)
300
+ xml.DataDictionary do
301
+ data_fields.each do |k, vs|
302
+ case @features[k]
303
+ when "categorical", nil
304
+ xml.DataField(name: k, optype: "categorical", dataType: "string") do
305
+ vs.map(&:to_s).sort.each do |v|
306
+ xml.Value(value: v)
307
+ end
308
+ end
309
+ when "text"
310
+ xml.DataField(name: k, optype: "categorical", dataType: "string")
311
+ else
312
+ xml.DataField(name: k, optype: "continuous", dataType: "double")
313
+ end
314
+ end
315
+ end
316
+ end
317
+
318
+ def pmml_transformation_dictionary(xml)
319
+ if @text_features.any?
320
+ xml.TransformationDictionary do
321
+ @text_features.each do |k, text_options|
322
+ xml.DefineFunction(name: "#{k}Transform", optype: "continuous") do
323
+ xml.ParameterField(name: "text")
324
+ xml.ParameterField(name: "term")
325
+ xml.TextIndex(textField: "text", localTermWeights: "termFrequency", wordSeparatorCharacterRE: text_options[:tokenizer].source, isCaseSensitive: !!text_options[:case_sensitive]) do
326
+ xml.FieldRef(field: "term")
327
+ end
328
+ end
329
+ end
330
+ end
331
+ end
332
+ end
333
+
334
+ def pmml_local_transformations(xml)
335
+ if @text_features.any?
336
+ xml.LocalTransformations do
337
+ @text_features.each do |k, _|
338
+ @text_encoders[k].vocabulary.each do |v|
339
+ xml.DerivedField(name: display_field([k, v]), optype: "continuous", dataType: "integer") do
340
+ xml.Apply(function: "#{k}Transform") do
341
+ xml.FieldRef(field: k)
342
+ xml.Constant v
343
+ end
344
+ end
345
+ end
346
+ end
347
+ end
79
348
  end
80
349
  end
81
350
  end
@@ -0,0 +1,141 @@
1
module Eps
  # Small column-oriented table: `columns` maps a column name to an Array of
  # values, and `label` optionally holds the target values for the same rows.
  class DataFrame
    attr_reader :columns
    attr_accessor :label

    # Builds a frame from another DataFrame, a Daru::DataFrame (when Daru is
    # loaded), a Hash of column => values, or an Array of rows.
    #
    # Array rows may be Hashes (the keys of the first row become string column
    # names), Arrays (columns are named "x0", "x1", ...) or scalars (a single
    # "x0" column holding the data itself). Copying another DataFrame reuses
    # its column arrays and does not carry over its label.
    def initialize(data = [])
      @columns = {}

      if data.is_a?(Eps::DataFrame)
        data.columns.each { |k, v| @columns[k] = v }
      elsif daru?(data)
        data.to_h.each { |k, v| @columns[k.to_s] = v.to_a }
      elsif data.is_a?(Hash)
        data.each { |k, v| @columns[k.to_s] = v.to_a }
      elsif rows_present?(data)
        row = data[0]

        if row.is_a?(Hash)
          row.keys.each do |k|
            @columns[k.to_s] = data.map { |r| r[k] }
          end
        elsif row.is_a?(Array)
          row.size.times do |i|
            @columns["x#{i}"] = data.map { |r| r[i] }
          end
        else
          @columns["x0"] = data
        end
      end
    end

    # True when the frame has no rows.
    def empty?
      size == 0
    end

    # Number of rows, taken from the first column (0 without columns).
    def size
      @columns.any? ? columns.values.first.size : 0
    end

    # True when the frame has at least one column.
    def any?
      @columns.any?
    end

    # Yields each row as a Hash of column name => value and returns the
    # Array of block results (an empty Array for an empty frame).
    def map
      size.times.map do |i|
        yield Hash[@columns.map { |k, v| [k, v[i]] }]
      end
    end

    # Yields each row as an Array of values in column order and returns the
    # Array of block results (an empty Array for an empty frame).
    def map_rows
      size.times.map do |i|
        yield @columns.map { |_, v| v[i] }
      end
    end

    # Selects rows and, optionally, columns.
    #
    # rows may be an index, an Array of indexes or a Range (endless ranges
    # and negative end values are resolved against the row count). A lone
    # String or Array of Strings is treated as a column selection over all
    # rows. cols may be a name, an Array of names or a Range of names.
    #
    # Returns a new DataFrame (with the label sliced the same way), or the
    # plain Array of values when a single column name is requested.
    # Raises RuntimeError for an undefined column.
    def [](rows, cols = nil)
      if cols.nil? && (rows.is_a?(String) || (rows.is_a?(Array) && rows.first.is_a?(String)))
        cols = rows
        rows = 0..-1
      end

      if rows.is_a?(Range)
        if rows.end.nil?
          rows = Range.new(rows.begin, size - 1)
        elsif rows.end < 0
          rows = Range.new(rows.begin, size + rows.end, rows.exclude_end?)
        end
      end

      singular = false
      if cols
        if cols.is_a?(Range)
          cols = expand_column_range(cols)
        elsif !cols.is_a?(Array)
          singular = true
          cols = [cols]
        end
      else
        cols = columns.keys
      end

      df = Eps::DataFrame.new

      cols.each do |name|
        raise "Undefined column: #{name}" unless columns.include?(name)

        df.columns[name] = columns[name].values_at(*rows)
      end
      df.label = label.values_at(*rows) if label

      singular ? df.columns[cols[0]] : df
    end

    # Two frames are equal when they have the same columns in the same order
    # with equal values; labels are not compared. Anything that is not a
    # DataFrame is never equal.
    def ==(other)
      other.is_a?(Eps::DataFrame) && columns.keys == other.columns.keys && columns == other.columns
    end

    # Shallow copy: a new frame with its own columns Hash that shares the
    # column arrays and the label with the original.
    def dup
      df = Eps::DataFrame.new
      columns.each do |k, v|
        df.columns[k] = v
      end
      df.label = label
      df
    end

    private

    def daru?(x)
      defined?(Daru) && x.is_a?(Daru::DataFrame)
    end

    # Whether a row collection has at least one row. `any?` alone would treat
    # rows that are all nil or false (e.g. a boolean column) as no data.
    def rows_present?(data)
      data.respond_to?(:empty?) ? !data.empty? : data.any?
    end

    # Resolves a Range of column names into the list of names it spans, in
    # the direction given (a reversed range yields reversed names).
    def expand_column_range(cols)
      names = columns.keys

      start_index = names.index(cols.begin)
      raise "Undefined column: #{cols.begin}" unless start_index

      end_index = names.index(cols.end)
      raise "Undefined column: #{cols.end}" unless end_index

      reverse = false
      if start_index > end_index
        reverse = true
        start_index, end_index = end_index, start_index
      end

      selected = names[Range.new(start_index, end_index, cols.exclude_end?)]
      selected.reverse! if reverse
      selected
    end
  end
end