eps 0.3.0 → 0.3.5

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,7 +1,7 @@
1
1
  module Eps
2
2
  class DataFrame
3
3
  attr_reader :columns
4
- attr_accessor :label
4
+ attr_accessor :label, :weight
5
5
 
6
6
  def initialize(data = [])
7
7
  @columns = {}
@@ -10,7 +10,7 @@ module Eps
10
10
  data.columns.each do |k, v|
11
11
  @columns[k] = v
12
12
  end
13
- elsif daru?(data)
13
+ elsif rover?(data) || daru?(data)
14
14
  data.to_h.each do |k, v|
15
15
  @columns[k.to_s] = v.to_a
16
16
  end
@@ -19,6 +19,8 @@ module Eps
19
19
  @columns[k.to_s] = v.to_a
20
20
  end
21
21
  else
22
+ data = data.to_a if numo?(data)
23
+
22
24
  if data.any?
23
25
  row = data[0]
24
26
 
@@ -78,6 +80,10 @@ module Eps
78
80
  rows = Range.new(rows.begin, size - 1)
79
81
  elsif rows.end < 0
80
82
  rows = Range.new(rows.begin, size + rows.end, rows.exclude_end?)
83
+ else
84
+ finish = rows.end
85
+ finish -= 1 if rows.exclude_end?
86
+ rows = Range.new(rows.begin, size - 1) if finish >= size - 1
81
87
  end
82
88
  end
83
89
 
@@ -115,6 +121,7 @@ module Eps
115
121
  df.columns[c] = columns[c].values_at(*rows)
116
122
  end
117
123
  df.label = label.values_at(*rows) if label
124
+ df.weight = weight.values_at(*rows) if weight
118
125
 
119
126
  singular ? df.columns[cols[0]] : df
120
127
  end
@@ -129,13 +136,22 @@ module Eps
129
136
  df.columns[k] = v
130
137
  end
131
138
  df.label = label
139
+ df.weight = weight
132
140
  df
133
141
  end
134
142
 
135
143
  private
136
144
 
145
+ def numo?(x)
146
+ defined?(Numo::NArray) && x.is_a?(Numo::NArray)
147
+ end
148
+
149
+ def rover?(x)
150
+ defined?(Rover::DataFrame) && x.is_a?(Rover::DataFrame)
151
+ end
152
+
137
153
  def daru?(x)
138
- defined?(Daru) && x.is_a?(Daru::DataFrame)
154
+ defined?(Daru::DataFrame) && x.is_a?(Daru::DataFrame)
139
155
  end
140
156
  end
141
157
  end
@@ -11,12 +11,14 @@ module Eps
11
11
  @text_features = text_features
12
12
  end
13
13
 
14
- def predict(data)
14
+ def predict(data, probabilities: false)
15
+ raise "Probabilities not supported" if probabilities && @objective == "regression"
16
+
15
17
  rows = data.map(&:to_h)
16
18
 
17
19
  # sparse matrix
18
20
  @text_features.each do |k, v|
19
- encoder = TextEncoder.new(v)
21
+ encoder = TextEncoder.new(**v)
20
22
 
21
23
  values = data.columns.delete(k)
22
24
  counts = encoder.transform(values)
@@ -38,7 +40,12 @@ module Eps
38
40
  when "regression"
39
41
  sum_trees(rows, @trees)
40
42
  when "binary"
41
- sum_trees(rows, @trees).map { |s| @labels[sigmoid(s) > 0.5 ? 1 : 0] }
43
+ prob = sum_trees(rows, @trees).map { |s| sigmoid(s) }
44
+ if probabilities
45
+ prob.map { |v| @labels.zip([1 - v, v]).to_h }
46
+ else
47
+ prob.map { |v| @labels[v > 0.5 ? 1 : 0] }
48
+ end
42
49
  else
43
50
  tree_scores = []
44
51
  num_trees = @trees.size / @labels.size
@@ -47,8 +54,14 @@ module Eps
47
54
  end
48
55
  data.size.times.map do |i|
49
56
  v = tree_scores.map { |s| s[i] }
50
- idx = v.map.with_index.max_by { |v2, _| v2 }.last
51
- @labels[idx]
57
+ if probabilities
58
+ exp = v.map { |vi| Math.exp(vi) }
59
+ sum = exp.sum
60
+ @labels.zip(exp.map { |e| e / sum }).to_h
61
+ else
62
+ idx = v.map.with_index.max_by { |v2, _| v2 }.last
63
+ @labels[idx]
64
+ end
52
65
  end
53
66
  end
54
67
  end
@@ -81,7 +94,7 @@ module Eps
81
94
  else
82
95
  case node.operator
83
96
  when "equal"
84
- v == node.value
97
+ v.to_s == node.value
85
98
  when "in"
86
99
  node.value.include?(v)
87
100
  when "greaterThan"
@@ -109,7 +122,7 @@ module Eps
109
122
  end
110
123
 
111
124
  def sigmoid(x)
112
- 1.0 / (1 + Math::E**(-x))
125
+ 1.0 / (1 + Math.exp(-x))
113
126
  end
114
127
  end
115
128
  end
@@ -9,8 +9,10 @@ module Eps
9
9
  @text_features = text_features || {}
10
10
  end
11
11
 
12
- def predict(x)
13
- intercept = @coefficients["_intercept"]
12
+ def predict(x, probabilities: false)
13
+ raise "Probabilities not supported" if probabilities
14
+
15
+ intercept = @coefficients["_intercept"] || 0.0
14
16
  scores = [intercept] * x.size
15
17
 
16
18
  @features.each do |k, type|
@@ -19,10 +21,11 @@ module Eps
19
21
  case type
20
22
  when "categorical"
21
23
  x.columns[k].each_with_index do |xv, i|
22
- scores[i] += @coefficients[[k, xv]].to_f
24
+ # TODO clean up
25
+ scores[i] += (@coefficients[[k, xv]] || @coefficients[[k, xv.to_s]]).to_f
23
26
  end
24
27
  when "text"
25
- encoder = TextEncoder.new(@text_features[k])
28
+ encoder = TextEncoder.new(**@text_features[k])
26
29
  counts = encoder.transform(x.columns[k])
27
30
  coef = {}
28
31
  @coefficients.each do |k2, v|
@@ -10,14 +10,15 @@ module Eps
10
10
  @legacy = legacy
11
11
  end
12
12
 
13
- def predict(x)
13
+ def predict(x, probabilities: false)
14
14
  probs = calculate_class_probabilities(x)
15
15
  probs.map do |xp|
16
- # convert probabilities
17
- # not needed when just returning label
18
- # sum = xp.values.map { |v| Math.exp(v) }.sum.to_f
19
- # p xp.map { |k, v| [k, Math.exp(v) / sum] }.to_h
20
- xp.sort_by { |k, v| [-v, k] }[0][0]
16
+ if probabilities
17
+ sum = xp.values.map { |v| Math.exp(v) }.sum.to_f
18
+ xp.map { |k, v| [k, Math.exp(v) / sum] }.to_h
19
+ else
20
+ xp.sort_by { |k, v| [-v, k] }[0][0]
21
+ end
21
22
  end
22
23
  end
23
24
 
@@ -38,7 +39,8 @@ module Eps
38
39
  case type
39
40
  when "categorical"
40
41
  x.columns[k].each_with_index do |xi, i|
41
- vc = probabilities[:conditional][k][xi]
42
+ # TODO clean this up
43
+ vc = probabilities[:conditional][k][xi] || probabilities[:conditional][k][xi.to_s]
42
44
 
43
45
  # unknown value if not vc
44
46
  if vc
@@ -24,9 +24,13 @@ module Eps
24
24
  if yi.nil?
25
25
  nil
26
26
  else
27
- v = @labels[yi.to_s]
28
- raise "Unknown label: #{yi}" unless v
29
- v
27
+ # use an additional label for unseen values
28
+ # this is only used during training for the LightGBM eval_set
29
+ # LightGBM ignores them (only uses seen categories for predictions)
30
+ # https://github.com/microsoft/LightGBM/issues/1936
31
+ # the evaluator also ignores them (to be consistent with LightGBM)
32
+ # but doesn't use this code
33
+ @labels[yi.to_s] || @labels.size
30
34
  end
31
35
  end
32
36
  end
@@ -1,39 +1,5 @@
1
- require "eps/pmml_generators/lightgbm"
2
-
3
1
  module Eps
4
2
  class LightGBM < BaseEstimator
5
- include PmmlGenerators::LightGBM
6
-
7
- def self.load_pmml(data)
8
- super do |data|
9
- objective = data.css("MiningModel").first.attribute("functionName").value
10
- if objective == "classification"
11
- labels = data.css("RegressionModel OutputField").map { |n| n.attribute("value").value }
12
- objective = labels.size > 2 ? "multiclass" : "binary"
13
- end
14
-
15
- features = {}
16
- text_features, derived_fields = extract_text_features(data, features)
17
- node = data.css("DataDictionary").first
18
- node.css("DataField")[1..-1].to_a.each do |node|
19
- features[node.attribute("name").value] =
20
- if node.attribute("optype").value == "categorical"
21
- "categorical"
22
- else
23
- "numeric"
24
- end
25
- end
26
-
27
- trees = []
28
- data.css("Segmentation TreeModel").each do |tree|
29
- node = find_nodes(tree.css("Node").first, derived_fields)
30
- trees << node
31
- end
32
-
33
- Evaluators::LightGBM.new(trees: trees, objective: objective, labels: labels, features: features, text_features: text_features)
34
- end
35
- end
36
-
37
3
  private
38
4
 
39
5
  def _summary(extended: false)
@@ -51,48 +17,16 @@ module Eps
51
17
  str
52
18
  end
53
19
 
54
- def self.find_nodes(xml, derived_fields)
55
- score = BigDecimal(xml.attribute("score").value).to_f
56
-
57
- elements = xml.elements
58
- xml_predicate = elements.first
59
-
60
- predicate =
61
- if xml_predicate.name == "True"
62
- nil
63
- elsif xml_predicate.name == "SimpleSetPredicate"
64
- operator = "in"
65
- value = xml_predicate.css("Array").text.scan(/"(.+?)(?<!\\)"|(\S+)/).flatten.compact.map { |v| v.gsub('\"', '"') }
66
- field = xml_predicate.attribute("field").value
67
- field = derived_fields[field] if derived_fields[field]
68
- {
69
- field: field,
70
- operator: operator,
71
- value: value
72
- }
73
- else
74
- operator = xml_predicate.attribute("operator").value
75
- value = xml_predicate.attribute("value").value
76
- value = BigDecimal(value).to_f if operator == "greaterThan"
77
- field = xml_predicate.attribute("field").value
78
- field = derived_fields[field] if derived_fields[field]
79
- {
80
- field: field,
81
- operator: operator,
82
- value: value
83
- }
84
- end
85
-
86
- children = elements[1..-1].map { |n| find_nodes(n, derived_fields) }
87
-
88
- Evaluators::Node.new(score: score, predicate: predicate, children: children)
89
- end
90
-
91
- def _train(verbose: nil, early_stopping: nil)
20
+ def _train(verbose: nil, early_stopping: nil, learning_rate: 0.1)
92
21
  train_set = @train_set
93
22
  validation_set = @validation_set.dup
94
23
  summary_label = train_set.label
95
24
 
25
+ # create check set
26
+ evaluator_set = validation_set || train_set
27
+ check_idx = 100.times.map { rand(evaluator_set.size) }.uniq
28
+ evaluator_set = evaluator_set[check_idx]
29
+
96
30
  # objective
97
31
  objective =
98
32
  if @target_type == "numeric"
@@ -126,7 +60,10 @@ module Eps
126
60
  prep_text_features(validation_set) if validation_set
127
61
 
128
62
  # create params
129
- params = {objective: objective}
63
+ params = {
64
+ objective: objective,
65
+ learning_rate: learning_rate
66
+ }
130
67
  params[:num_classes] = labels.size if objective == "multiclass"
131
68
  if train_set.size < 30
132
69
  params[:min_data_in_bin] = 1
@@ -135,8 +72,8 @@ module Eps
135
72
 
136
73
  # create datasets
137
74
  categorical_idx = @features.values.map.with_index.select { |type, _| type == "categorical" }.map(&:last)
138
- train_ds = ::LightGBM::Dataset.new(train_set.map_rows(&:to_a), label: train_set.label, categorical_feature: categorical_idx, params: params)
139
- validation_ds = ::LightGBM::Dataset.new(validation_set.map_rows(&:to_a), label: validation_set.label, categorical_feature: categorical_idx, params: params, reference: train_ds) if validation_set
75
+ train_ds = ::LightGBM::Dataset.new(train_set.map_rows(&:to_a), label: train_set.label, weight: train_set.weight, categorical_feature: categorical_idx, params: params)
76
+ validation_ds = ::LightGBM::Dataset.new(validation_set.map_rows(&:to_a), label: validation_set.label, weight: validation_set.weight, categorical_feature: categorical_idx, params: params, reference: train_ds) if validation_set
140
77
 
141
78
  # train
142
79
  valid_sets = [train_ds]
@@ -176,11 +113,39 @@ module Eps
176
113
  # reset pmml
177
114
  @pmml = nil
178
115
 
179
- Evaluators::LightGBM.new(trees: trees, objective: objective, labels: labels, features: @features, text_features: @text_features)
116
+ evaluator = Evaluators::LightGBM.new(trees: trees, objective: objective, labels: labels, features: @features, text_features: @text_features)
117
+ booster_set = validation_set ? validation_set[check_idx] : train_set[check_idx]
118
+ check_evaluator(objective, labels, booster, booster_set, evaluator, evaluator_set)
119
+ evaluator
180
120
  end
181
121
 
182
- def evaluator_class
183
- PmmlLoaders::LightGBM
122
+ # compare a subset of predictions to check for possible bugs in evaluator
123
+ # NOTE LightGBM must use double data type for prediction input for these to be consistent
124
+ def check_evaluator(objective, labels, booster, booster_set, evaluator, evaluator_set)
125
+ expected = @booster.predict(booster_set.map_rows(&:to_a))
126
+ if objective == "multiclass"
127
+ actual = evaluator.predict(evaluator_set, probabilities: true)
128
+ # just compare the first class's probability for now
129
+ expected.map! { |v| v.first }
130
+ actual.map! { |v| v.values.first }
131
+ elsif objective == "binary"
132
+ actual = evaluator.predict(evaluator_set, probabilities: true).map { |v| v.values.last }
133
+ else
134
+ actual = evaluator.predict(evaluator_set)
135
+ end
136
+
137
+ regression = objective == "regression" || objective == "binary"
138
+ bad_observations = []
139
+ expected.zip(actual).each_with_index do |(exp, act), i|
140
+ success = (act - exp).abs < 0.001
141
+ unless success
142
+ bad_observations << {expected: exp, actual: act, data_point: evaluator_set[i].map(&:itself).first}
143
+ end
144
+ end
145
+
146
+ if bad_observations.any?
147
+ raise "Bug detected in evaluator. Please report an issue. Bad data points: #{bad_observations.inspect}"
148
+ end
184
149
  end
185
150
 
186
151
  # for evaluator
@@ -1,40 +1,5 @@
1
1
  module Eps
2
2
  class LinearRegression < BaseEstimator
3
- # pmml
4
-
5
- def self.load_pmml(data)
6
- super do |data|
7
- # TODO more validation
8
- node = data.css("RegressionTable")
9
-
10
- coefficients = {
11
- "_intercept" => node.attribute("intercept").value.to_f
12
- }
13
-
14
- features = {}
15
-
16
- text_features, derived_fields = extract_text_features(data, features)
17
-
18
- node.css("NumericPredictor").each do |n|
19
- name = n.attribute("name").value
20
- if derived_fields[name]
21
- name = derived_fields[name]
22
- else
23
- features[name] = "numeric"
24
- end
25
- coefficients[name] = n.attribute("coefficient").value.to_f
26
- end
27
-
28
- node.css("CategoricalPredictor").each do |n|
29
- name = n.attribute("name").value
30
- coefficients[[name, n.attribute("value").value]] = n.attribute("coefficient").value.to_f
31
- features[name] = "categorical"
32
- end
33
-
34
- Evaluators::LinearRegression.new(coefficients: coefficients, features: features, text_features: text_features)
35
- end
36
- end
37
-
38
3
  def coefficients
39
4
  @evaluator.coefficients
40
5
  end
@@ -72,6 +37,7 @@ module Eps
72
37
  str
73
38
  end
74
39
 
40
+ # TODO use keyword arguments for gsl and intercept in 0.4.0
75
41
  def _train(**options)
76
42
  raise "Target must be numeric" if @target_type != "numeric"
77
43
  check_missing_value(@train_set)
@@ -84,33 +50,64 @@ module Eps
84
50
  end
85
51
 
86
52
  x = data.map_rows(&:to_a)
87
- data.size.times do |i|
88
- # add intercept
89
- x[i].unshift(1)
90
- end
91
53
 
92
- gsl = options.key?(:gsl) ? options[:gsl] : defined?(GSL)
54
+ gsl =
55
+ if options.key?(:gsl)
56
+ options[:gsl]
57
+ elsif defined?(GSL)
58
+ true
59
+ elsif defined?(GSLR)
60
+ :gslr
61
+ else
62
+ false
63
+ end
64
+
65
+ intercept = options.key?(:intercept) ? options[:intercept] : true
66
+ if intercept && gsl != :gslr
67
+ data.size.times do |i|
68
+ x[i].unshift(1)
69
+ end
70
+ end
93
71
 
94
72
  v3 =
95
- if gsl
73
+ if gsl == :gslr
74
+ model = GSLR::OLS.new(intercept: intercept)
75
+ model.fit(x, data.label, weight: data.weight)
76
+
77
+ @covariance = model.covariance
78
+
79
+ coefficients = model.coefficients.dup
80
+ coefficients.unshift(model.intercept) if intercept
81
+ coefficients
82
+ elsif gsl
96
83
  x = GSL::Matrix.alloc(*x)
97
84
  y = GSL::Vector.alloc(data.label)
98
- c, @covariance, _, _ = GSL::MultiFit::linear(x, y)
85
+ w = GSL::Vector.alloc(data.weight) if data.weight
86
+ c, @covariance, _, _ = w ? GSL::MultiFit.wlinear(x, w, y) : GSL::MultiFit.linear(x, y)
99
87
  c.to_a
100
88
  else
101
89
  x = Matrix.rows(x)
102
90
  y = Matrix.column_vector(data.label)
91
+
92
+ # weighted OLS
93
+ # http://www.real-statistics.com/multiple-regression/weighted-linear-regression/weighted-regression-basics/
94
+ w = Matrix.diagonal(*data.weight) if data.weight
95
+
103
96
  removed = []
104
97
 
105
98
  # https://statsmaths.github.io/stat612/lectures/lec13/lecture13.pdf
106
- # unforutnately, this method is unstable
99
+ # unfortunately, this method is unstable
107
100
  # haven't found an efficient way to do QR-factorization in Ruby
108
101
  # the extendmatrix gem has householder and givens (givens has bug)
109
102
  # but methods are too slow
110
103
  xt = x.t
104
+ xt *= w if w
111
105
  begin
112
106
  @xtxi = (xt * x).inverse
113
107
  rescue ExceptionForMatrix::ErrNotRegular
108
+ # matrix cannot be inverted
109
+ # https://en.wikipedia.org/wiki/Multicollinearity
110
+
114
111
  constant = {}
115
112
  (1...x.column_count).each do |i|
116
113
  constant[i] = constant?(x.column(i))
@@ -134,6 +131,7 @@ module Eps
134
131
  end
135
132
  x = Matrix.columns(vectors)
136
133
  xt = x.t
134
+ xt *= w if w
137
135
 
138
136
  # try again
139
137
  begin
@@ -144,6 +142,7 @@ module Eps
144
142
  end
145
143
  # huge performance boost
146
144
  # by multiplying xt * y first
145
+ # for weighted, w is already included in xt
147
146
  v2 = @xtxi * (xt * y)
148
147
 
149
148
  # convert to array
@@ -158,47 +157,14 @@ module Eps
158
157
  v2
159
158
  end
160
159
 
161
- @coefficient_names = ["_intercept"] + data.columns.keys
162
- @coefficients = Hash[@coefficient_names.zip(v3)]
163
- Evaluators::LinearRegression.new(coefficients: @coefficients, features: @features, text_features: @text_features)
164
- end
165
-
166
- def generate_pmml
167
- predictors = @coefficients.dup
168
- predictors.delete("_intercept")
169
-
170
- data_fields = {}
171
- @features.each do |k, type|
172
- if type == "categorical"
173
- data_fields[k] = predictors.keys.select { |k, v| k.is_a?(Array) && k.first == k }.map(&:last)
174
- else
175
- data_fields[k] = nil
176
- end
160
+ if @xtxi && @xtxi.each(:diagonal).any? { |v| v < 0 }
161
+ raise UnstableSolution, "GSL is needed to find a stable solution for this dataset"
177
162
  end
178
163
 
179
- build_pmml(data_fields) do |xml|
180
- xml.RegressionModel(functionName: "regression") do
181
- xml.MiningSchema do
182
- @features.each do |k, _|
183
- xml.MiningField(name: k)
184
- end
185
- end
186
- pmml_local_transformations(xml)
187
- xml.RegressionTable(intercept: @coefficients["_intercept"]) do
188
- predictors.each do |k, v|
189
- if k.is_a?(Array)
190
- if @features[k.first] == "text"
191
- xml.NumericPredictor(name: display_field(k), coefficient: v)
192
- else
193
- xml.CategoricalPredictor(name: k[0], value: k[1], coefficient: v)
194
- end
195
- else
196
- xml.NumericPredictor(name: k, coefficient: v)
197
- end
198
- end
199
- end
200
- end
201
- end
164
+ @coefficient_names = data.columns.keys
165
+ @coefficient_names.unshift("_intercept") if intercept
166
+ @coefficients = Hash[@coefficient_names.zip(v3)]
167
+ Evaluators::LinearRegression.new(coefficients: @coefficients, features: @features, text_features: @text_features)
202
168
  end
203
169
 
204
170
  def prep_x(x)
@@ -249,7 +215,11 @@ module Eps
249
215
 
250
216
  def diagonal
251
217
  @diagonal ||= begin
252
- if covariance.respond_to?(:each)
218
+ if covariance.is_a?(Array)
219
+ covariance.size.times.map do |i|
220
+ covariance[i][i]
221
+ end
222
+ elsif covariance.respond_to?(:each)
253
223
  d = covariance.each(:diagonal).to_a
254
224
  @removed.each do |i|
255
225
  d.insert(i, 0)