eps 0.2.1 → 0.3.0

Sign up to get free protection for your applications and access to all of the features.
data/lib/eps.rb CHANGED
@@ -1,18 +1,42 @@
1
1
  # dependencies
2
- require "matrix"
2
+ require "bigdecimal"
3
3
  require "json"
4
+ require "lightgbm"
5
+ require "matrix"
6
+ require "nokogiri"
4
7
 
5
8
  # modules
6
9
  require "eps/base"
7
10
  require "eps/base_estimator"
11
+ require "eps/data_frame"
12
+ require "eps/evaluators/linear_regression"
13
+ require "eps/evaluators/lightgbm"
14
+ require "eps/evaluators/naive_bayes"
15
+ require "eps/evaluators/node"
16
+ require "eps/label_encoder"
17
+ require "eps/lightgbm"
8
18
  require "eps/linear_regression"
19
+ require "eps/metrics"
9
20
  require "eps/model"
10
21
  require "eps/naive_bayes"
22
+ require "eps/statistics"
23
+ require "eps/text_encoder"
24
+ require "eps/utils"
11
25
  require "eps/version"
12
26
 
13
27
  module Eps
14
- def self.metrics(actual, estimated)
15
- Eps::Model.metrics(actual, estimated)
28
+ def self.metrics(y_true, y_pred)
29
+ if Utils.column_type(y_true, "actual") == "numeric"
30
+ {
31
+ rmse: Metrics.rmse(y_true, y_pred),
32
+ mae: Metrics.mae(y_true, y_pred),
33
+ me: Metrics.me(y_true, y_pred)
34
+ }
35
+ else
36
+ {
37
+ accuracy: Metrics.accuracy(y_true, y_pred)
38
+ }
39
+ end
16
40
  end
17
41
 
18
42
  # backwards compatibility
@@ -1,81 +1,350 @@
1
1
  module Eps
2
2
  class BaseEstimator
3
- def train(data, y, target: nil, **options)
4
- # TODO more performant conversion
5
- if daru?(data)
6
- x = data.dup
7
- x = x.delete_vector(target) if target
8
- else
9
- x = data.map(&:dup)
10
- x.each { |r| r.delete(target) } if target
11
- end
3
+ def initialize(data = nil, y = nil, **options)
4
+ train(data, y, **options) if data
5
+ end
12
6
 
13
- y = prep_y(y.to_a)
7
+ def predict(data)
8
+ singular = data.is_a?(Hash)
9
+ data = [data] if singular
14
10
 
15
- if x.size != y.size
16
- raise "Number of samples differs from target"
11
+ data = Eps::DataFrame.new(data)
12
+
13
+ @evaluator.features.each do |k, type|
14
+ values = data.columns[k]
15
+ raise ArgumentError, "Missing column: #{k}" if !values
16
+ column_type = Utils.column_type(values.compact, k) if values
17
+
18
+ if !column_type.nil?
19
+ if (type == "numeric" && column_type != "numeric") || (type != "numeric" && column_type != "categorical")
20
+ raise ArgumentError, "Bad type for column #{k}: Expected #{type} but got #{column_type}"
21
+ end
22
+ end
23
+ # TODO check for unknown values for categorical features
17
24
  end
18
25
 
19
- @x = x
20
- @y = y
21
- @target = target || "target"
26
+ predictions = @evaluator.predict(data)
27
+
28
+ singular ? predictions.first : predictions
22
29
  end
23
30
 
24
- def predict(x)
25
- singular = !(x.is_a?(Array) || daru?(x))
26
- x = [x] if singular
31
+ def evaluate(data, y = nil, target: nil)
32
+ data, target = prep_data(data, y, target || @target)
33
+ Eps.metrics(data.label, predict(data))
34
+ end
27
35
 
28
- pred = _predict(x)
36
+ def to_pmml
37
+ (@pmml ||= generate_pmml).to_xml
38
+ end
29
39
 
30
- singular ? pred[0] : pred
40
+ def self.load_pmml(data)
41
+ if data.is_a?(String)
42
+ data = Nokogiri::XML(data) { |config| config.strict }
43
+ end
44
+ model = new
45
+ model.instance_variable_set("@pmml", data) # cache data
46
+ model.instance_variable_set("@evaluator", yield(data))
47
+ model
31
48
  end
32
49
 
33
- def evaluate(data, y = nil, target: nil)
34
- target ||= @target
35
- raise ArgumentError, "missing target" if !target && !y
50
+ def summary(extended: false)
51
+ str = String.new("")
52
+
53
+ if @validation_set
54
+ y_true = @validation_set.label
55
+ y_pred = predict(@validation_set)
36
56
 
37
- actual = y
38
- actual ||=
39
- if daru?(data)
40
- data[target].to_a
57
+ case @target_type
58
+ when "numeric"
59
+ metric_name = "RMSE"
60
+ v = Metrics.rmse(y_true, y_pred)
61
+ metric_value = v.round >= 1000 ? v.round.to_s : "%.3g" % v
41
62
  else
42
- data.map { |v| v[target] }
63
+ metric_name = "accuracy"
64
+ metric_value = "%.1f%%" % (100 * Metrics.accuracy(y_true, y_pred)).round(1)
43
65
  end
66
+ str << "Validation %s: %s\n\n" % [metric_name, metric_value]
67
+ end
68
+
69
+ str << _summary(extended: extended)
70
+ str
71
+ end
72
+
73
+ # private
74
+ def self.extract_text_features(data, features)
75
+ # updates features object
76
+ vocabulary = {}
77
+ function_mapping = {}
78
+ derived_fields = {}
79
+ data.css("LocalTransformations DerivedField, TransformationDictionary DerivedField").each do |n|
80
+ name = n.attribute("name")&.value
81
+ field = n.css("FieldRef").attribute("field").value
82
+ value = n.css("Constant").text
44
83
 
45
- actual = prep_y(actual)
46
- estimated = predict(data)
84
+ field = field[10..-2] if field =~ /\Alowercase\(.+\)\z/
85
+ next if value.empty?
47
86
 
48
- self.class.metrics(actual, estimated)
87
+ (vocabulary[field] ||= []) << value
88
+
89
+ function_mapping[field] = n.css("Apply").attribute("function").value
90
+
91
+ derived_fields[name] = [field, value]
92
+ end
93
+
94
+ functions = {}
95
+ data.css("TransformationDictionary DefineFunction").each do |n|
96
+ name = n.attribute("name").value
97
+ text_index = n.css("TextIndex")
98
+ functions[name] = {
99
+ tokenizer: Regexp.new(text_index.attribute("wordSeparatorCharacterRE").value),
100
+ case_sensitive: text_index.attribute("isCaseSensitive")&.value == "true"
101
+ }
102
+ end
103
+
104
+ text_features = {}
105
+ function_mapping.each do |field, function|
106
+ text_features[field] = functions[function].merge(vocabulary: vocabulary[field])
107
+ features[field] = "text"
108
+ end
109
+
110
+ [text_features, derived_fields]
49
111
  end
50
112
 
51
113
  private
52
114
 
53
- def categorical?(v)
54
- !v.is_a?(Numeric)
115
+ def train(data, y = nil, target: nil, split: nil, validation_set: nil, verbose: nil, text_features: nil, early_stopping: nil)
116
+ data, @target = prep_data(data, y, target)
117
+ @target_type = Utils.column_type(data.label, @target)
118
+
119
+ if split.nil?
120
+ split = data.size >= 30
121
+ end
122
+
123
+ # cross validation
124
+ if split && !validation_set
125
+ split = {} if split == true
126
+ split = {column: split} unless split.is_a?(Hash)
127
+
128
+ split_p = 1 - (split[:validation_size] || 0.25)
129
+ if split[:column]
130
+ split_column = split[:column].to_s
131
+ times = data.columns.delete(split_column)
132
+ check_missing(times, split_column)
133
+ split_index = (times.size * split_p).round
134
+ split_time = split[:value] || times.sort[split_index]
135
+ train_idx, validation_idx = (0...data.size).to_a.partition { |i| times[i] < split_time }
136
+ else
137
+ if split[:shuffle] != false
138
+ rng = Random.new(0) # seed random number generator
139
+ train_idx, validation_idx = (0...data.size).to_a.partition { rng.rand < split_p }
140
+ else
141
+ split_index = (data.size * split_p).round
142
+ train_idx, validation_idx = (0...data.size).to_a.partition { |i| i < split_index }
143
+ end
144
+ end
145
+ end
146
+
147
+ # determine feature types
148
+ @features = {}
149
+ data.columns.each do |k, v|
150
+ @features[k] = Utils.column_type(v.compact, k)
151
+ end
152
+
153
+ # determine text features if not specified
154
+ if text_features.nil?
155
+ text_features = []
156
+
157
+ @features.each do |k, type|
158
+ next if type != "categorical"
159
+
160
+ values = data.columns[k].compact
161
+
162
+ next unless values.first.is_a?(String) # not boolean
163
+
164
+ values = values.reject(&:empty?)
165
+ count = values.count
166
+
167
+ # check if spaces
168
+ # two spaces is rough approximation for 3 words
169
+ # TODO make more performant
170
+ if values.count { |v| v.count(" ") >= 2 } > 0.5 * count
171
+ text_features << k
172
+ end
173
+ end
174
+ end
175
+
176
+ # prep text features
177
+ @text_features = {}
178
+ (text_features || {}).each do |k, v|
179
+ @features[k.to_s] = "text"
180
+
181
+ # same output as scikit-learn CountVectorizer
182
+ # except for max_features
183
+ @text_features[k.to_s] = {
184
+ tokenizer: /\W+/,
185
+ min_length: 2,
186
+ max_features: 100
187
+ }.merge(v || {})
188
+ end
189
+
190
+ if split && !validation_set
191
+ @train_set = data[train_idx]
192
+ validation_set = data[validation_idx]
193
+ else
194
+ @train_set = data.dup
195
+ if validation_set
196
+ validation_set = Eps::DataFrame.new(validation_set)
197
+ validation_set.label = validation_set.columns.delete(@target)
198
+ end
199
+ end
200
+
201
+ raise "No data in training set" if @train_set.empty?
202
+ raise "No data in validation set" if validation_set && validation_set.empty?
203
+
204
+ @validation_set = validation_set
205
+ @evaluator = _train(verbose: verbose, early_stopping: early_stopping)
206
+
207
+ # reset pmml
208
+ @pmml = nil
209
+
210
+ nil
55
211
  end
56
212
 
57
- def daru?(x)
58
- defined?(Daru) && x.is_a?(Daru::DataFrame)
213
+ def prep_data(data, y, target)
214
+ data = Eps::DataFrame.new(data)
215
+ target = (target || "target").to_s
216
+ y ||= data.columns.delete(target)
217
+ check_missing(y, target)
218
+ data.label = y.to_a
219
+ check_data(data)
220
+ [data, target]
59
221
  end
60
222
 
61
- def flip_target(target)
62
- target.is_a?(String) ? target.to_sym : target.to_s
223
+ def prep_text_features(train_set)
224
+ @text_encoders = {}
225
+ @text_features.each do |k, v|
226
+ # reset vocabulary
227
+ v.delete(:vocabulary)
228
+
229
+ # TODO determine max features automatically
230
+ # start based on number of rows
231
+ encoder = Eps::TextEncoder.new(v)
232
+ counts = encoder.fit(train_set.columns.delete(k))
233
+ encoder.vocabulary.each do |word|
234
+ train_set.columns[[k, word]] = [0] * counts.size
235
+ end
236
+ counts.each_with_index do |ci, i|
237
+ ci.each do |word, count|
238
+ word_key = [k, word]
239
+ train_set.columns[word_key][i] = 1 if train_set.columns.key?(word_key)
240
+ end
241
+ end
242
+ @text_encoders[k] = encoder
243
+
244
+ # update vocabulary
245
+ v[:vocabulary] = encoder.vocabulary
246
+ end
247
+
248
+ raise "No features left" if train_set.columns.empty?
63
249
  end
64
250
 
65
- def prep_y(y)
66
- y.each do |yi|
67
- raise "Target missing in data" if yi.nil?
251
+ def check_data(data)
252
+ raise "No data" if data.empty?
253
+ raise "Number of data points differs from target" if data.size != data.label.size
254
+ end
255
+
256
+ def check_missing(c, name)
257
+ raise ArgumentError, "Missing column: #{name}" if !c
258
+ raise ArgumentError, "Missing values in column #{name}" if c.any?(&:nil?)
259
+ end
260
+
261
+ def check_missing_value(df)
262
+ df.columns.each do |k, v|
263
+ check_missing(v, k)
68
264
  end
69
- y
70
265
  end
71
266
 
72
- # determine if target is a string or symbol
73
- def prep_target(target, data)
74
- if daru?(data)
75
- data.has_vector?(target) ? target : flip_target(target)
267
+ def display_field(k)
268
+ if k.is_a?(Array)
269
+ if @features[k.first] == "text"
270
+ "#{k.first}(#{k.last})"
271
+ else
272
+ k.join("=")
273
+ end
76
274
  else
77
- x = data[0] || {}
78
- x[target] ? target : flip_target(target)
275
+ k
276
+ end
277
+ end
278
+
279
+ # pmml
280
+
281
+ def build_pmml(data_fields)
282
+ Nokogiri::XML::Builder.new do |xml|
283
+ xml.PMML(version: "4.4", xmlns: "http://www.dmg.org/PMML-4_4", "xmlns:xsi" => "http://www.w3.org/2001/XMLSchema-instance") do
284
+ pmml_header(xml)
285
+ pmml_data_dictionary(xml, data_fields)
286
+ pmml_transformation_dictionary(xml)
287
+ yield xml
288
+ end
289
+ end
290
+ end
291
+
292
+ def pmml_header(xml)
293
+ xml.Header do
294
+ xml.Application(name: "Eps", version: Eps::VERSION)
295
+ # xml.Timestamp Time.now.utc.iso8601
296
+ end
297
+ end
298
+
299
+ def pmml_data_dictionary(xml, data_fields)
300
+ xml.DataDictionary do
301
+ data_fields.each do |k, vs|
302
+ case @features[k]
303
+ when "categorical", nil
304
+ xml.DataField(name: k, optype: "categorical", dataType: "string") do
305
+ vs.map(&:to_s).sort.each do |v|
306
+ xml.Value(value: v)
307
+ end
308
+ end
309
+ when "text"
310
+ xml.DataField(name: k, optype: "categorical", dataType: "string")
311
+ else
312
+ xml.DataField(name: k, optype: "continuous", dataType: "double")
313
+ end
314
+ end
315
+ end
316
+ end
317
+
318
+ def pmml_transformation_dictionary(xml)
319
+ if @text_features.any?
320
+ xml.TransformationDictionary do
321
+ @text_features.each do |k, text_options|
322
+ xml.DefineFunction(name: "#{k}Transform", optype: "continuous") do
323
+ xml.ParameterField(name: "text")
324
+ xml.ParameterField(name: "term")
325
+ xml.TextIndex(textField: "text", localTermWeights: "termFrequency", wordSeparatorCharacterRE: text_options[:tokenizer].source, isCaseSensitive: !!text_options[:case_sensitive]) do
326
+ xml.FieldRef(field: "term")
327
+ end
328
+ end
329
+ end
330
+ end
331
+ end
332
+ end
333
+
334
+ def pmml_local_transformations(xml)
335
+ if @text_features.any?
336
+ xml.LocalTransformations do
337
+ @text_features.each do |k, _|
338
+ @text_encoders[k].vocabulary.each do |v|
339
+ xml.DerivedField(name: display_field([k, v]), optype: "continuous", dataType: "integer") do
340
+ xml.Apply(function: "#{k}Transform") do
341
+ xml.FieldRef(field: k)
342
+ xml.Constant v
343
+ end
344
+ end
345
+ end
346
+ end
347
+ end
79
348
  end
80
349
  end
81
350
  end
@@ -0,0 +1,141 @@
1
+ module Eps
2
+ class DataFrame
3
+ attr_reader :columns
4
+ attr_accessor :label
5
+
6
+ def initialize(data = [])
7
+ @columns = {}
8
+
9
+ if data.is_a?(Eps::DataFrame)
10
+ data.columns.each do |k, v|
11
+ @columns[k] = v
12
+ end
13
+ elsif daru?(data)
14
+ data.to_h.each do |k, v|
15
+ @columns[k.to_s] = v.to_a
16
+ end
17
+ elsif data.is_a?(Hash)
18
+ data.each do |k, v|
19
+ @columns[k.to_s] = v.to_a
20
+ end
21
+ else
22
+ if data.any?
23
+ row = data[0]
24
+
25
+ if row.is_a?(Hash)
26
+ row.keys.each do |k|
27
+ @columns[k.to_s] = data.map { |r| r[k] }
28
+ end
29
+ elsif row.is_a?(Array)
30
+ row.size.times do |i|
31
+ @columns["x#{i}"] = data.map { |r| r[i] }
32
+ end
33
+ else
34
+ @columns["x0"] = data
35
+ end
36
+ end
37
+ end
38
+ end
39
+
40
+ def empty?
41
+ size == 0
42
+ end
43
+
44
+ def size
45
+ @columns.any? ? columns.values.first.size : 0
46
+ end
47
+
48
+ def any?
49
+ @columns.any?
50
+ end
51
+
52
+ def map
53
+ if @columns.any?
54
+ size.times.map do |i|
55
+ yield Hash[@columns.map { |k, v| [k, v[i]] }]
56
+ end
57
+ end
58
+ end
59
+
60
+ def map_rows
61
+ if @columns.any?
62
+ size.times.map do |i|
63
+ yield @columns.map { |_, v| v[i] }
64
+ end
65
+ end
66
+ end
67
+
68
+ def [](rows, cols = nil)
69
+ if cols.nil?
70
+ if rows.is_a?(String) || (rows.is_a?(Array) && rows.first.is_a?(String))
71
+ cols = rows
72
+ rows = 0..-1
73
+ end
74
+ end
75
+
76
+ if rows.is_a?(Range)
77
+ if rows.end.nil?
78
+ rows = Range.new(rows.begin, size - 1)
79
+ elsif rows.end < 0
80
+ rows = Range.new(rows.begin, size + rows.end, rows.exclude_end?)
81
+ end
82
+ end
83
+
84
+ if cols
85
+ if cols.is_a?(Range)
86
+ c = columns.keys
87
+
88
+ start_index = c.index(cols.begin)
89
+ raise "Undefined column: #{cols.begin}" unless start_index
90
+
91
+ end_index = c.index(cols.end)
92
+ raise "Undefined column: #{cols.end}" unless end_index
93
+
94
+ reverse = false
95
+ if start_index > end_index
96
+ reverse = true
97
+ start_index, end_index = end_index, start_index
98
+ end
99
+
100
+ cols = c[Range.new(start_index, end_index, cols.exclude_end?)]
101
+ cols.reverse! if reverse
102
+ elsif !cols.is_a?(Array)
103
+ singular = true
104
+ cols = [cols]
105
+ end
106
+ else
107
+ cols = columns.keys
108
+ end
109
+
110
+ df = Eps::DataFrame.new
111
+
112
+ cols.each do |c|
113
+ raise "Undefined column: #{c}" unless columns.include?(c)
114
+
115
+ df.columns[c] = columns[c].values_at(*rows)
116
+ end
117
+ df.label = label.values_at(*rows) if label
118
+
119
+ singular ? df.columns[cols[0]] : df
120
+ end
121
+
122
+ def ==(other)
123
+ columns.keys == other.columns.keys && columns == other.columns
124
+ end
125
+
126
+ def dup
127
+ df = Eps::DataFrame.new
128
+ columns.each do |k, v|
129
+ df.columns[k] = v
130
+ end
131
+ df.label = label
132
+ df
133
+ end
134
+
135
+ private
136
+
137
+ def daru?(x)
138
+ defined?(Daru) && x.is_a?(Daru::DataFrame)
139
+ end
140
+ end
141
+ end