eps 0.3.0 → 0.3.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,7 +1,7 @@
1
1
  module Eps
2
2
  class DataFrame
3
3
  attr_reader :columns
4
- attr_accessor :label
4
+ attr_accessor :label, :weight
5
5
 
6
6
  def initialize(data = [])
7
7
  @columns = {}
@@ -10,7 +10,7 @@ module Eps
10
10
  data.columns.each do |k, v|
11
11
  @columns[k] = v
12
12
  end
13
- elsif daru?(data)
13
+ elsif rover?(data) || daru?(data)
14
14
  data.to_h.each do |k, v|
15
15
  @columns[k.to_s] = v.to_a
16
16
  end
@@ -19,6 +19,8 @@ module Eps
19
19
  @columns[k.to_s] = v.to_a
20
20
  end
21
21
  else
22
+ data = data.to_a if numo?(data)
23
+
22
24
  if data.any?
23
25
  row = data[0]
24
26
 
@@ -78,6 +80,10 @@ module Eps
78
80
  rows = Range.new(rows.begin, size - 1)
79
81
  elsif rows.end < 0
80
82
  rows = Range.new(rows.begin, size + rows.end, rows.exclude_end?)
83
+ else
84
+ finish = rows.end
85
+ finish -= 1 if rows.exclude_end?
86
+ rows = Range.new(rows.begin, size - 1) if finish >= size - 1
81
87
  end
82
88
  end
83
89
 
@@ -115,6 +121,7 @@ module Eps
115
121
  df.columns[c] = columns[c].values_at(*rows)
116
122
  end
117
123
  df.label = label.values_at(*rows) if label
124
+ df.weight = weight.values_at(*rows) if weight
118
125
 
119
126
  singular ? df.columns[cols[0]] : df
120
127
  end
@@ -129,13 +136,22 @@ module Eps
129
136
  df.columns[k] = v
130
137
  end
131
138
  df.label = label
139
+ df.weight = weight
132
140
  df
133
141
  end
134
142
 
135
143
  private
136
144
 
145
+ def numo?(x)
146
+ defined?(Numo::NArray) && x.is_a?(Numo::NArray)
147
+ end
148
+
149
+ def rover?(x)
150
+ defined?(Rover::DataFrame) && x.is_a?(Rover::DataFrame)
151
+ end
152
+
137
153
  def daru?(x)
138
- defined?(Daru) && x.is_a?(Daru::DataFrame)
154
+ defined?(Daru::DataFrame) && x.is_a?(Daru::DataFrame)
139
155
  end
140
156
  end
141
157
  end
@@ -11,12 +11,14 @@ module Eps
11
11
  @text_features = text_features
12
12
  end
13
13
 
14
- def predict(data)
14
+ def predict(data, probabilities: false)
15
+ raise "Probabilities not supported" if probabilities && @objective == "regression"
16
+
15
17
  rows = data.map(&:to_h)
16
18
 
17
19
  # sparse matrix
18
20
  @text_features.each do |k, v|
19
- encoder = TextEncoder.new(v)
21
+ encoder = TextEncoder.new(**v)
20
22
 
21
23
  values = data.columns.delete(k)
22
24
  counts = encoder.transform(values)
@@ -38,7 +40,12 @@ module Eps
38
40
  when "regression"
39
41
  sum_trees(rows, @trees)
40
42
  when "binary"
41
- sum_trees(rows, @trees).map { |s| @labels[sigmoid(s) > 0.5 ? 1 : 0] }
43
+ prob = sum_trees(rows, @trees).map { |s| sigmoid(s) }
44
+ if probabilities
45
+ prob.map { |v| @labels.zip([1 - v, v]).to_h }
46
+ else
47
+ prob.map { |v| @labels[v > 0.5 ? 1 : 0] }
48
+ end
42
49
  else
43
50
  tree_scores = []
44
51
  num_trees = @trees.size / @labels.size
@@ -47,8 +54,14 @@ module Eps
47
54
  end
48
55
  data.size.times.map do |i|
49
56
  v = tree_scores.map { |s| s[i] }
50
- idx = v.map.with_index.max_by { |v2, _| v2 }.last
51
- @labels[idx]
57
+ if probabilities
58
+ exp = v.map { |vi| Math.exp(vi) }
59
+ sum = exp.sum
60
+ @labels.zip(exp.map { |e| e / sum }).to_h
61
+ else
62
+ idx = v.map.with_index.max_by { |v2, _| v2 }.last
63
+ @labels[idx]
64
+ end
52
65
  end
53
66
  end
54
67
  end
@@ -81,7 +94,7 @@ module Eps
81
94
  else
82
95
  case node.operator
83
96
  when "equal"
84
- v == node.value
97
+ v.to_s == node.value
85
98
  when "in"
86
99
  node.value.include?(v)
87
100
  when "greaterThan"
@@ -109,7 +122,7 @@ module Eps
109
122
  end
110
123
 
111
124
  def sigmoid(x)
112
- 1.0 / (1 + Math::E**(-x))
125
+ 1.0 / (1 + Math.exp(-x))
113
126
  end
114
127
  end
115
128
  end
@@ -9,8 +9,10 @@ module Eps
9
9
  @text_features = text_features || {}
10
10
  end
11
11
 
12
- def predict(x)
13
- intercept = @coefficients["_intercept"]
12
+ def predict(x, probabilities: false)
13
+ raise "Probabilities not supported" if probabilities
14
+
15
+ intercept = @coefficients["_intercept"] || 0.0
14
16
  scores = [intercept] * x.size
15
17
 
16
18
  @features.each do |k, type|
@@ -19,10 +21,11 @@ module Eps
19
21
  case type
20
22
  when "categorical"
21
23
  x.columns[k].each_with_index do |xv, i|
22
- scores[i] += @coefficients[[k, xv]].to_f
24
+ # TODO clean up
25
+ scores[i] += (@coefficients[[k, xv]] || @coefficients[[k, xv.to_s]]).to_f
23
26
  end
24
27
  when "text"
25
- encoder = TextEncoder.new(@text_features[k])
28
+ encoder = TextEncoder.new(**@text_features[k])
26
29
  counts = encoder.transform(x.columns[k])
27
30
  coef = {}
28
31
  @coefficients.each do |k2, v|
@@ -10,14 +10,15 @@ module Eps
10
10
  @legacy = legacy
11
11
  end
12
12
 
13
- def predict(x)
13
+ def predict(x, probabilities: false)
14
14
  probs = calculate_class_probabilities(x)
15
15
  probs.map do |xp|
16
- # convert probabilities
17
- # not needed when just returning label
18
- # sum = xp.values.map { |v| Math.exp(v) }.sum.to_f
19
- # p xp.map { |k, v| [k, Math.exp(v) / sum] }.to_h
20
- xp.sort_by { |k, v| [-v, k] }[0][0]
16
+ if probabilities
17
+ sum = xp.values.map { |v| Math.exp(v) }.sum.to_f
18
+ xp.map { |k, v| [k, Math.exp(v) / sum] }.to_h
19
+ else
20
+ xp.sort_by { |k, v| [-v, k] }[0][0]
21
+ end
21
22
  end
22
23
  end
23
24
 
@@ -38,7 +39,8 @@ module Eps
38
39
  case type
39
40
  when "categorical"
40
41
  x.columns[k].each_with_index do |xi, i|
41
- vc = probabilities[:conditional][k][xi]
42
+ # TODO clean this up
43
+ vc = probabilities[:conditional][k][xi] || probabilities[:conditional][k][xi.to_s]
42
44
 
43
45
  # unknown value if not vc
44
46
  if vc
@@ -24,9 +24,13 @@ module Eps
24
24
  if yi.nil?
25
25
  nil
26
26
  else
27
- v = @labels[yi.to_s]
28
- raise "Unknown label: #{yi}" unless v
29
- v
27
+ # use an additional label for unseen values
28
+ # this is only used during training for the LightGBM eval_set
29
+ # LightGBM ignores them (only uses seen categories for predictions)
30
+ # https://github.com/microsoft/LightGBM/issues/1936
31
+ # the evaluator also ignores them (to be consistent with LightGBM)
32
+ # but doesn't use this code
33
+ @labels[yi.to_s] || @labels.size
30
34
  end
31
35
  end
32
36
  end
@@ -1,39 +1,5 @@
1
- require "eps/pmml_generators/lightgbm"
2
-
3
1
  module Eps
4
2
  class LightGBM < BaseEstimator
5
- include PmmlGenerators::LightGBM
6
-
7
- def self.load_pmml(data)
8
- super do |data|
9
- objective = data.css("MiningModel").first.attribute("functionName").value
10
- if objective == "classification"
11
- labels = data.css("RegressionModel OutputField").map { |n| n.attribute("value").value }
12
- objective = labels.size > 2 ? "multiclass" : "binary"
13
- end
14
-
15
- features = {}
16
- text_features, derived_fields = extract_text_features(data, features)
17
- node = data.css("DataDictionary").first
18
- node.css("DataField")[1..-1].to_a.each do |node|
19
- features[node.attribute("name").value] =
20
- if node.attribute("optype").value == "categorical"
21
- "categorical"
22
- else
23
- "numeric"
24
- end
25
- end
26
-
27
- trees = []
28
- data.css("Segmentation TreeModel").each do |tree|
29
- node = find_nodes(tree.css("Node").first, derived_fields)
30
- trees << node
31
- end
32
-
33
- Evaluators::LightGBM.new(trees: trees, objective: objective, labels: labels, features: features, text_features: text_features)
34
- end
35
- end
36
-
37
3
  private
38
4
 
39
5
  def _summary(extended: false)
@@ -51,48 +17,16 @@ module Eps
51
17
  str
52
18
  end
53
19
 
54
- def self.find_nodes(xml, derived_fields)
55
- score = BigDecimal(xml.attribute("score").value).to_f
56
-
57
- elements = xml.elements
58
- xml_predicate = elements.first
59
-
60
- predicate =
61
- if xml_predicate.name == "True"
62
- nil
63
- elsif xml_predicate.name == "SimpleSetPredicate"
64
- operator = "in"
65
- value = xml_predicate.css("Array").text.scan(/"(.+?)(?<!\\)"|(\S+)/).flatten.compact.map { |v| v.gsub('\"', '"') }
66
- field = xml_predicate.attribute("field").value
67
- field = derived_fields[field] if derived_fields[field]
68
- {
69
- field: field,
70
- operator: operator,
71
- value: value
72
- }
73
- else
74
- operator = xml_predicate.attribute("operator").value
75
- value = xml_predicate.attribute("value").value
76
- value = BigDecimal(value).to_f if operator == "greaterThan"
77
- field = xml_predicate.attribute("field").value
78
- field = derived_fields[field] if derived_fields[field]
79
- {
80
- field: field,
81
- operator: operator,
82
- value: value
83
- }
84
- end
85
-
86
- children = elements[1..-1].map { |n| find_nodes(n, derived_fields) }
87
-
88
- Evaluators::Node.new(score: score, predicate: predicate, children: children)
89
- end
90
-
91
- def _train(verbose: nil, early_stopping: nil)
20
+ def _train(verbose: nil, early_stopping: nil, learning_rate: 0.1)
92
21
  train_set = @train_set
93
22
  validation_set = @validation_set.dup
94
23
  summary_label = train_set.label
95
24
 
25
+ # create check set
26
+ evaluator_set = validation_set || train_set
27
+ check_idx = 100.times.map { rand(evaluator_set.size) }.uniq
28
+ evaluator_set = evaluator_set[check_idx]
29
+
96
30
  # objective
97
31
  objective =
98
32
  if @target_type == "numeric"
@@ -126,7 +60,10 @@ module Eps
126
60
  prep_text_features(validation_set) if validation_set
127
61
 
128
62
  # create params
129
- params = {objective: objective}
63
+ params = {
64
+ objective: objective,
65
+ learning_rate: learning_rate
66
+ }
130
67
  params[:num_classes] = labels.size if objective == "multiclass"
131
68
  if train_set.size < 30
132
69
  params[:min_data_in_bin] = 1
@@ -135,8 +72,8 @@ module Eps
135
72
 
136
73
  # create datasets
137
74
  categorical_idx = @features.values.map.with_index.select { |type, _| type == "categorical" }.map(&:last)
138
- train_ds = ::LightGBM::Dataset.new(train_set.map_rows(&:to_a), label: train_set.label, categorical_feature: categorical_idx, params: params)
139
- validation_ds = ::LightGBM::Dataset.new(validation_set.map_rows(&:to_a), label: validation_set.label, categorical_feature: categorical_idx, params: params, reference: train_ds) if validation_set
75
+ train_ds = ::LightGBM::Dataset.new(train_set.map_rows(&:to_a), label: train_set.label, weight: train_set.weight, categorical_feature: categorical_idx, params: params)
76
+ validation_ds = ::LightGBM::Dataset.new(validation_set.map_rows(&:to_a), label: validation_set.label, weight: validation_set.weight, categorical_feature: categorical_idx, params: params, reference: train_ds) if validation_set
140
77
 
141
78
  # train
142
79
  valid_sets = [train_ds]
@@ -176,11 +113,39 @@ module Eps
176
113
  # reset pmml
177
114
  @pmml = nil
178
115
 
179
- Evaluators::LightGBM.new(trees: trees, objective: objective, labels: labels, features: @features, text_features: @text_features)
116
+ evaluator = Evaluators::LightGBM.new(trees: trees, objective: objective, labels: labels, features: @features, text_features: @text_features)
117
+ booster_set = validation_set ? validation_set[check_idx] : train_set[check_idx]
118
+ check_evaluator(objective, labels, booster, booster_set, evaluator, evaluator_set)
119
+ evaluator
180
120
  end
181
121
 
182
- def evaluator_class
183
- PmmlLoaders::LightGBM
122
+ # compare a subset of predictions to check for possible bugs in evaluator
123
+ # NOTE LightGBM must use double data type for prediction input for these to be consistent
124
+ def check_evaluator(objective, labels, booster, booster_set, evaluator, evaluator_set)
125
+ expected = @booster.predict(booster_set.map_rows(&:to_a))
126
+ if objective == "multiclass"
127
+ actual = evaluator.predict(evaluator_set, probabilities: true)
128
+ # just compare first for now
129
+ expected.map! { |v| v.first }
130
+ actual.map! { |v| v.values.first }
131
+ elsif objective == "binary"
132
+ actual = evaluator.predict(evaluator_set, probabilities: true).map { |v| v.values.last }
133
+ else
134
+ actual = evaluator.predict(evaluator_set)
135
+ end
136
+
137
+ regression = objective == "regression" || objective == "binary"
138
+ bad_observations = []
139
+ expected.zip(actual).each_with_index do |(exp, act), i|
140
+ success = (act - exp).abs < 0.001
141
+ unless success
142
+ bad_observations << {expected: exp, actual: act, data_point: evaluator_set[i].map(&:itself).first}
143
+ end
144
+ end
145
+
146
+ if bad_observations.any?
147
+ raise "Bug detected in evaluator. Please report an issue. Bad data points: #{bad_observations.inspect}"
148
+ end
184
149
  end
185
150
 
186
151
  # for evaluator
@@ -1,40 +1,5 @@
1
1
  module Eps
2
2
  class LinearRegression < BaseEstimator
3
- # pmml
4
-
5
- def self.load_pmml(data)
6
- super do |data|
7
- # TODO more validation
8
- node = data.css("RegressionTable")
9
-
10
- coefficients = {
11
- "_intercept" => node.attribute("intercept").value.to_f
12
- }
13
-
14
- features = {}
15
-
16
- text_features, derived_fields = extract_text_features(data, features)
17
-
18
- node.css("NumericPredictor").each do |n|
19
- name = n.attribute("name").value
20
- if derived_fields[name]
21
- name = derived_fields[name]
22
- else
23
- features[name] = "numeric"
24
- end
25
- coefficients[name] = n.attribute("coefficient").value.to_f
26
- end
27
-
28
- node.css("CategoricalPredictor").each do |n|
29
- name = n.attribute("name").value
30
- coefficients[[name, n.attribute("value").value]] = n.attribute("coefficient").value.to_f
31
- features[name] = "categorical"
32
- end
33
-
34
- Evaluators::LinearRegression.new(coefficients: coefficients, features: features, text_features: text_features)
35
- end
36
- end
37
-
38
3
  def coefficients
39
4
  @evaluator.coefficients
40
5
  end
@@ -72,6 +37,7 @@ module Eps
72
37
  str
73
38
  end
74
39
 
40
+ # TODO use keyword arguments for gsl and intercept in 0.4.0
75
41
  def _train(**options)
76
42
  raise "Target must be numeric" if @target_type != "numeric"
77
43
  check_missing_value(@train_set)
@@ -84,33 +50,64 @@ module Eps
84
50
  end
85
51
 
86
52
  x = data.map_rows(&:to_a)
87
- data.size.times do |i|
88
- # add intercept
89
- x[i].unshift(1)
90
- end
91
53
 
92
- gsl = options.key?(:gsl) ? options[:gsl] : defined?(GSL)
54
+ gsl =
55
+ if options.key?(:gsl)
56
+ options[:gsl]
57
+ elsif defined?(GSL)
58
+ true
59
+ elsif defined?(GSLR)
60
+ :gslr
61
+ else
62
+ false
63
+ end
64
+
65
+ intercept = options.key?(:intercept) ? options[:intercept] : true
66
+ if intercept && gsl != :gslr
67
+ data.size.times do |i|
68
+ x[i].unshift(1)
69
+ end
70
+ end
93
71
 
94
72
  v3 =
95
- if gsl
73
+ if gsl == :gslr
74
+ model = GSLR::OLS.new(intercept: intercept)
75
+ model.fit(x, data.label, weight: data.weight)
76
+
77
+ @covariance = model.covariance
78
+
79
+ coefficients = model.coefficients.dup
80
+ coefficients.unshift(model.intercept) if intercept
81
+ coefficients
82
+ elsif gsl
96
83
  x = GSL::Matrix.alloc(*x)
97
84
  y = GSL::Vector.alloc(data.label)
98
- c, @covariance, _, _ = GSL::MultiFit::linear(x, y)
85
+ w = GSL::Vector.alloc(data.weight) if data.weight
86
+ c, @covariance, _, _ = w ? GSL::MultiFit.wlinear(x, w, y) : GSL::MultiFit.linear(x, y)
99
87
  c.to_a
100
88
  else
101
89
  x = Matrix.rows(x)
102
90
  y = Matrix.column_vector(data.label)
91
+
92
+ # weighted OLS
93
+ # http://www.real-statistics.com/multiple-regression/weighted-linear-regression/weighted-regression-basics/
94
+ w = Matrix.diagonal(*data.weight) if data.weight
95
+
103
96
  removed = []
104
97
 
105
98
  # https://statsmaths.github.io/stat612/lectures/lec13/lecture13.pdf
106
- # unforutnately, this method is unstable
99
+ # unfortunately, this method is unstable
107
100
  # haven't found an efficient way to do QR-factorization in Ruby
108
101
  # the extendmatrix gem has householder and givens (givens has bug)
109
102
  # but methods are too slow
110
103
  xt = x.t
104
+ xt *= w if w
111
105
  begin
112
106
  @xtxi = (xt * x).inverse
113
107
  rescue ExceptionForMatrix::ErrNotRegular
108
+ # matrix cannot be inverted
109
+ # https://en.wikipedia.org/wiki/Multicollinearity
110
+
114
111
  constant = {}
115
112
  (1...x.column_count).each do |i|
116
113
  constant[i] = constant?(x.column(i))
@@ -134,6 +131,7 @@ module Eps
134
131
  end
135
132
  x = Matrix.columns(vectors)
136
133
  xt = x.t
134
+ xt *= w if w
137
135
 
138
136
  # try again
139
137
  begin
@@ -144,6 +142,7 @@ module Eps
144
142
  end
145
143
  # huge performance boost
146
144
  # by multiplying xt * y first
145
+ # for weighted, w is already included in xt
147
146
  v2 = @xtxi * (xt * y)
148
147
 
149
148
  # convert to array
@@ -158,47 +157,14 @@ module Eps
158
157
  v2
159
158
  end
160
159
 
161
- @coefficient_names = ["_intercept"] + data.columns.keys
162
- @coefficients = Hash[@coefficient_names.zip(v3)]
163
- Evaluators::LinearRegression.new(coefficients: @coefficients, features: @features, text_features: @text_features)
164
- end
165
-
166
- def generate_pmml
167
- predictors = @coefficients.dup
168
- predictors.delete("_intercept")
169
-
170
- data_fields = {}
171
- @features.each do |k, type|
172
- if type == "categorical"
173
- data_fields[k] = predictors.keys.select { |k, v| k.is_a?(Array) && k.first == k }.map(&:last)
174
- else
175
- data_fields[k] = nil
176
- end
160
+ if @xtxi && @xtxi.each(:diagonal).any? { |v| v < 0 }
161
+ raise UnstableSolution, "GSL is needed to find a stable solution for this dataset"
177
162
  end
178
163
 
179
- build_pmml(data_fields) do |xml|
180
- xml.RegressionModel(functionName: "regression") do
181
- xml.MiningSchema do
182
- @features.each do |k, _|
183
- xml.MiningField(name: k)
184
- end
185
- end
186
- pmml_local_transformations(xml)
187
- xml.RegressionTable(intercept: @coefficients["_intercept"]) do
188
- predictors.each do |k, v|
189
- if k.is_a?(Array)
190
- if @features[k.first] == "text"
191
- xml.NumericPredictor(name: display_field(k), coefficient: v)
192
- else
193
- xml.CategoricalPredictor(name: k[0], value: k[1], coefficient: v)
194
- end
195
- else
196
- xml.NumericPredictor(name: k, coefficient: v)
197
- end
198
- end
199
- end
200
- end
201
- end
164
+ @coefficient_names = data.columns.keys
165
+ @coefficient_names.unshift("_intercept") if intercept
166
+ @coefficients = Hash[@coefficient_names.zip(v3)]
167
+ Evaluators::LinearRegression.new(coefficients: @coefficients, features: @features, text_features: @text_features)
202
168
  end
203
169
 
204
170
  def prep_x(x)
@@ -249,7 +215,11 @@ module Eps
249
215
 
250
216
  def diagonal
251
217
  @diagonal ||= begin
252
- if covariance.respond_to?(:each)
218
+ if covariance.is_a?(Array)
219
+ covariance.size.times.map do |i|
220
+ covariance[i][i]
221
+ end
222
+ elsif covariance.respond_to?(:each)
253
223
  d = covariance.each(:diagonal).to_a
254
224
  @removed.each do |i|
255
225
  d.insert(i, 0)