eps 0.3.0 → 0.3.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +31 -5
- data/README.md +77 -9
- data/lib/eps.rb +19 -10
- data/lib/eps/base_estimator.rb +63 -145
- data/lib/eps/data_frame.rb +19 -3
- data/lib/eps/evaluators/lightgbm.rb +20 -7
- data/lib/eps/evaluators/linear_regression.rb +7 -4
- data/lib/eps/evaluators/naive_bayes.rb +9 -7
- data/lib/eps/label_encoder.rb +7 -3
- data/lib/eps/lightgbm.rb +43 -78
- data/lib/eps/linear_regression.rb +53 -83
- data/lib/eps/metrics.rb +24 -12
- data/lib/eps/model.rb +6 -6
- data/lib/eps/naive_bayes.rb +3 -140
- data/lib/eps/pmml.rb +14 -0
- data/lib/eps/pmml/generator.rb +422 -0
- data/lib/eps/pmml/loader.rb +241 -0
- data/lib/eps/version.rb +1 -1
- metadata +36 -6
- data/lib/eps/pmml_generators/lightgbm.rb +0 -187
data/lib/eps/data_frame.rb
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
module Eps
|
2
2
|
class DataFrame
|
3
3
|
attr_reader :columns
|
4
|
-
attr_accessor :label
|
4
|
+
attr_accessor :label, :weight
|
5
5
|
|
6
6
|
def initialize(data = [])
|
7
7
|
@columns = {}
|
@@ -10,7 +10,7 @@ module Eps
|
|
10
10
|
data.columns.each do |k, v|
|
11
11
|
@columns[k] = v
|
12
12
|
end
|
13
|
-
elsif daru?(data)
|
13
|
+
elsif rover?(data) || daru?(data)
|
14
14
|
data.to_h.each do |k, v|
|
15
15
|
@columns[k.to_s] = v.to_a
|
16
16
|
end
|
@@ -19,6 +19,8 @@ module Eps
|
|
19
19
|
@columns[k.to_s] = v.to_a
|
20
20
|
end
|
21
21
|
else
|
22
|
+
data = data.to_a if numo?(data)
|
23
|
+
|
22
24
|
if data.any?
|
23
25
|
row = data[0]
|
24
26
|
|
@@ -78,6 +80,10 @@ module Eps
|
|
78
80
|
rows = Range.new(rows.begin, size - 1)
|
79
81
|
elsif rows.end < 0
|
80
82
|
rows = Range.new(rows.begin, size + rows.end, rows.exclude_end?)
|
83
|
+
else
|
84
|
+
finish = rows.end
|
85
|
+
finish -= 1 if rows.exclude_end?
|
86
|
+
rows = Range.new(rows.begin, size - 1) if finish >= size - 1
|
81
87
|
end
|
82
88
|
end
|
83
89
|
|
@@ -115,6 +121,7 @@ module Eps
|
|
115
121
|
df.columns[c] = columns[c].values_at(*rows)
|
116
122
|
end
|
117
123
|
df.label = label.values_at(*rows) if label
|
124
|
+
df.weight = weight.values_at(*rows) if weight
|
118
125
|
|
119
126
|
singular ? df.columns[cols[0]] : df
|
120
127
|
end
|
@@ -129,13 +136,22 @@ module Eps
|
|
129
136
|
df.columns[k] = v
|
130
137
|
end
|
131
138
|
df.label = label
|
139
|
+
df.weight = weight
|
132
140
|
df
|
133
141
|
end
|
134
142
|
|
135
143
|
private
|
136
144
|
|
145
|
+
def numo?(x)
|
146
|
+
defined?(Numo::NArray) && x.is_a?(Numo::NArray)
|
147
|
+
end
|
148
|
+
|
149
|
+
def rover?(x)
|
150
|
+
defined?(Rover::DataFrame) && x.is_a?(Rover::DataFrame)
|
151
|
+
end
|
152
|
+
|
137
153
|
def daru?(x)
|
138
|
-
defined?(Daru) && x.is_a?(Daru::DataFrame)
|
154
|
+
defined?(Daru::DataFrame) && x.is_a?(Daru::DataFrame)
|
139
155
|
end
|
140
156
|
end
|
141
157
|
end
|
@@ -11,12 +11,14 @@ module Eps
|
|
11
11
|
@text_features = text_features
|
12
12
|
end
|
13
13
|
|
14
|
-
def predict(data)
|
14
|
+
def predict(data, probabilities: false)
|
15
|
+
raise "Probabilities not supported" if probabilities && @objective == "regression"
|
16
|
+
|
15
17
|
rows = data.map(&:to_h)
|
16
18
|
|
17
19
|
# sparse matrix
|
18
20
|
@text_features.each do |k, v|
|
19
|
-
encoder = TextEncoder.new(v)
|
21
|
+
encoder = TextEncoder.new(**v)
|
20
22
|
|
21
23
|
values = data.columns.delete(k)
|
22
24
|
counts = encoder.transform(values)
|
@@ -38,7 +40,12 @@ module Eps
|
|
38
40
|
when "regression"
|
39
41
|
sum_trees(rows, @trees)
|
40
42
|
when "binary"
|
41
|
-
sum_trees(rows, @trees).map { |s|
|
43
|
+
prob = sum_trees(rows, @trees).map { |s| sigmoid(s) }
|
44
|
+
if probabilities
|
45
|
+
prob.map { |v| @labels.zip([1 - v, v]).to_h }
|
46
|
+
else
|
47
|
+
prob.map { |v| @labels[v > 0.5 ? 1 : 0] }
|
48
|
+
end
|
42
49
|
else
|
43
50
|
tree_scores = []
|
44
51
|
num_trees = @trees.size / @labels.size
|
@@ -47,8 +54,14 @@ module Eps
|
|
47
54
|
end
|
48
55
|
data.size.times.map do |i|
|
49
56
|
v = tree_scores.map { |s| s[i] }
|
50
|
-
|
51
|
-
|
57
|
+
if probabilities
|
58
|
+
exp = v.map { |vi| Math.exp(vi) }
|
59
|
+
sum = exp.sum
|
60
|
+
@labels.zip(exp.map { |e| e / sum }).to_h
|
61
|
+
else
|
62
|
+
idx = v.map.with_index.max_by { |v2, _| v2 }.last
|
63
|
+
@labels[idx]
|
64
|
+
end
|
52
65
|
end
|
53
66
|
end
|
54
67
|
end
|
@@ -81,7 +94,7 @@ module Eps
|
|
81
94
|
else
|
82
95
|
case node.operator
|
83
96
|
when "equal"
|
84
|
-
v == node.value
|
97
|
+
v.to_s == node.value
|
85
98
|
when "in"
|
86
99
|
node.value.include?(v)
|
87
100
|
when "greaterThan"
|
@@ -109,7 +122,7 @@ module Eps
|
|
109
122
|
end
|
110
123
|
|
111
124
|
def sigmoid(x)
|
112
|
-
1.0 / (1 + Math
|
125
|
+
1.0 / (1 + Math.exp(-x))
|
113
126
|
end
|
114
127
|
end
|
115
128
|
end
|
@@ -9,8 +9,10 @@ module Eps
|
|
9
9
|
@text_features = text_features || {}
|
10
10
|
end
|
11
11
|
|
12
|
-
def predict(x)
|
13
|
-
|
12
|
+
def predict(x, probabilities: false)
|
13
|
+
raise "Probabilities not supported" if probabilities
|
14
|
+
|
15
|
+
intercept = @coefficients["_intercept"] || 0.0
|
14
16
|
scores = [intercept] * x.size
|
15
17
|
|
16
18
|
@features.each do |k, type|
|
@@ -19,10 +21,11 @@ module Eps
|
|
19
21
|
case type
|
20
22
|
when "categorical"
|
21
23
|
x.columns[k].each_with_index do |xv, i|
|
22
|
-
|
24
|
+
# TODO clean up
|
25
|
+
scores[i] += (@coefficients[[k, xv]] || @coefficients[[k, xv.to_s]]).to_f
|
23
26
|
end
|
24
27
|
when "text"
|
25
|
-
encoder = TextEncoder.new(
|
28
|
+
encoder = TextEncoder.new(**@text_features[k])
|
26
29
|
counts = encoder.transform(x.columns[k])
|
27
30
|
coef = {}
|
28
31
|
@coefficients.each do |k2, v|
|
@@ -10,14 +10,15 @@ module Eps
|
|
10
10
|
@legacy = legacy
|
11
11
|
end
|
12
12
|
|
13
|
-
def predict(x)
|
13
|
+
def predict(x, probabilities: false)
|
14
14
|
probs = calculate_class_probabilities(x)
|
15
15
|
probs.map do |xp|
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
16
|
+
if probabilities
|
17
|
+
sum = xp.values.map { |v| Math.exp(v) }.sum.to_f
|
18
|
+
xp.map { |k, v| [k, Math.exp(v) / sum] }.to_h
|
19
|
+
else
|
20
|
+
xp.sort_by { |k, v| [-v, k] }[0][0]
|
21
|
+
end
|
21
22
|
end
|
22
23
|
end
|
23
24
|
|
@@ -38,7 +39,8 @@ module Eps
|
|
38
39
|
case type
|
39
40
|
when "categorical"
|
40
41
|
x.columns[k].each_with_index do |xi, i|
|
41
|
-
|
42
|
+
# TODO clean this up
|
43
|
+
vc = probabilities[:conditional][k][xi] || probabilities[:conditional][k][xi.to_s]
|
42
44
|
|
43
45
|
# unknown value if not vc
|
44
46
|
if vc
|
data/lib/eps/label_encoder.rb
CHANGED
@@ -24,9 +24,13 @@ module Eps
|
|
24
24
|
if yi.nil?
|
25
25
|
nil
|
26
26
|
else
|
27
|
-
|
28
|
-
|
29
|
-
|
27
|
+
# use an additional label for unseen values
|
28
|
+
# this is only used during training for the LightGBM eval_set
|
29
|
+
# LightGBM ignores them (only uses seen categories for predictions)
|
30
|
+
# https://github.com/microsoft/LightGBM/issues/1936
|
31
|
+
# the evaluator also ignores them (to be consistent with LightGBM)
|
32
|
+
# but doesn't use this code
|
33
|
+
@labels[yi.to_s] || @labels.size
|
30
34
|
end
|
31
35
|
end
|
32
36
|
end
|
data/lib/eps/lightgbm.rb
CHANGED
@@ -1,39 +1,5 @@
|
|
1
|
-
require "eps/pmml_generators/lightgbm"
|
2
|
-
|
3
1
|
module Eps
|
4
2
|
class LightGBM < BaseEstimator
|
5
|
-
include PmmlGenerators::LightGBM
|
6
|
-
|
7
|
-
def self.load_pmml(data)
|
8
|
-
super do |data|
|
9
|
-
objective = data.css("MiningModel").first.attribute("functionName").value
|
10
|
-
if objective == "classification"
|
11
|
-
labels = data.css("RegressionModel OutputField").map { |n| n.attribute("value").value }
|
12
|
-
objective = labels.size > 2 ? "multiclass" : "binary"
|
13
|
-
end
|
14
|
-
|
15
|
-
features = {}
|
16
|
-
text_features, derived_fields = extract_text_features(data, features)
|
17
|
-
node = data.css("DataDictionary").first
|
18
|
-
node.css("DataField")[1..-1].to_a.each do |node|
|
19
|
-
features[node.attribute("name").value] =
|
20
|
-
if node.attribute("optype").value == "categorical"
|
21
|
-
"categorical"
|
22
|
-
else
|
23
|
-
"numeric"
|
24
|
-
end
|
25
|
-
end
|
26
|
-
|
27
|
-
trees = []
|
28
|
-
data.css("Segmentation TreeModel").each do |tree|
|
29
|
-
node = find_nodes(tree.css("Node").first, derived_fields)
|
30
|
-
trees << node
|
31
|
-
end
|
32
|
-
|
33
|
-
Evaluators::LightGBM.new(trees: trees, objective: objective, labels: labels, features: features, text_features: text_features)
|
34
|
-
end
|
35
|
-
end
|
36
|
-
|
37
3
|
private
|
38
4
|
|
39
5
|
def _summary(extended: false)
|
@@ -51,48 +17,16 @@ module Eps
|
|
51
17
|
str
|
52
18
|
end
|
53
19
|
|
54
|
-
def
|
55
|
-
score = BigDecimal(xml.attribute("score").value).to_f
|
56
|
-
|
57
|
-
elements = xml.elements
|
58
|
-
xml_predicate = elements.first
|
59
|
-
|
60
|
-
predicate =
|
61
|
-
if xml_predicate.name == "True"
|
62
|
-
nil
|
63
|
-
elsif xml_predicate.name == "SimpleSetPredicate"
|
64
|
-
operator = "in"
|
65
|
-
value = xml_predicate.css("Array").text.scan(/"(.+?)(?<!\\)"|(\S+)/).flatten.compact.map { |v| v.gsub('\"', '"') }
|
66
|
-
field = xml_predicate.attribute("field").value
|
67
|
-
field = derived_fields[field] if derived_fields[field]
|
68
|
-
{
|
69
|
-
field: field,
|
70
|
-
operator: operator,
|
71
|
-
value: value
|
72
|
-
}
|
73
|
-
else
|
74
|
-
operator = xml_predicate.attribute("operator").value
|
75
|
-
value = xml_predicate.attribute("value").value
|
76
|
-
value = BigDecimal(value).to_f if operator == "greaterThan"
|
77
|
-
field = xml_predicate.attribute("field").value
|
78
|
-
field = derived_fields[field] if derived_fields[field]
|
79
|
-
{
|
80
|
-
field: field,
|
81
|
-
operator: operator,
|
82
|
-
value: value
|
83
|
-
}
|
84
|
-
end
|
85
|
-
|
86
|
-
children = elements[1..-1].map { |n| find_nodes(n, derived_fields) }
|
87
|
-
|
88
|
-
Evaluators::Node.new(score: score, predicate: predicate, children: children)
|
89
|
-
end
|
90
|
-
|
91
|
-
def _train(verbose: nil, early_stopping: nil)
|
20
|
+
def _train(verbose: nil, early_stopping: nil, learning_rate: 0.1)
|
92
21
|
train_set = @train_set
|
93
22
|
validation_set = @validation_set.dup
|
94
23
|
summary_label = train_set.label
|
95
24
|
|
25
|
+
# create check set
|
26
|
+
evaluator_set = validation_set || train_set
|
27
|
+
check_idx = 100.times.map { rand(evaluator_set.size) }.uniq
|
28
|
+
evaluator_set = evaluator_set[check_idx]
|
29
|
+
|
96
30
|
# objective
|
97
31
|
objective =
|
98
32
|
if @target_type == "numeric"
|
@@ -126,7 +60,10 @@ module Eps
|
|
126
60
|
prep_text_features(validation_set) if validation_set
|
127
61
|
|
128
62
|
# create params
|
129
|
-
params = {
|
63
|
+
params = {
|
64
|
+
objective: objective,
|
65
|
+
learning_rate: learning_rate
|
66
|
+
}
|
130
67
|
params[:num_classes] = labels.size if objective == "multiclass"
|
131
68
|
if train_set.size < 30
|
132
69
|
params[:min_data_in_bin] = 1
|
@@ -135,8 +72,8 @@ module Eps
|
|
135
72
|
|
136
73
|
# create datasets
|
137
74
|
categorical_idx = @features.values.map.with_index.select { |type, _| type == "categorical" }.map(&:last)
|
138
|
-
train_ds = ::LightGBM::Dataset.new(train_set.map_rows(&:to_a), label: train_set.label, categorical_feature: categorical_idx, params: params)
|
139
|
-
validation_ds = ::LightGBM::Dataset.new(validation_set.map_rows(&:to_a), label: validation_set.label, categorical_feature: categorical_idx, params: params, reference: train_ds) if validation_set
|
75
|
+
train_ds = ::LightGBM::Dataset.new(train_set.map_rows(&:to_a), label: train_set.label, weight: train_set.weight, categorical_feature: categorical_idx, params: params)
|
76
|
+
validation_ds = ::LightGBM::Dataset.new(validation_set.map_rows(&:to_a), label: validation_set.label, weight: validation_set.weight, categorical_feature: categorical_idx, params: params, reference: train_ds) if validation_set
|
140
77
|
|
141
78
|
# train
|
142
79
|
valid_sets = [train_ds]
|
@@ -176,11 +113,39 @@ module Eps
|
|
176
113
|
# reset pmml
|
177
114
|
@pmml = nil
|
178
115
|
|
179
|
-
Evaluators::LightGBM.new(trees: trees, objective: objective, labels: labels, features: @features, text_features: @text_features)
|
116
|
+
evaluator = Evaluators::LightGBM.new(trees: trees, objective: objective, labels: labels, features: @features, text_features: @text_features)
|
117
|
+
booster_set = validation_set ? validation_set[check_idx] : train_set[check_idx]
|
118
|
+
check_evaluator(objective, labels, booster, booster_set, evaluator, evaluator_set)
|
119
|
+
evaluator
|
180
120
|
end
|
181
121
|
|
182
|
-
|
183
|
-
|
122
|
+
# compare a subset of predictions to check for possible bugs in evaluator
|
123
|
+
# NOTE LightGBM must use double data type for prediction input for these to be consistent
|
124
|
+
def check_evaluator(objective, labels, booster, booster_set, evaluator, evaluator_set)
|
125
|
+
expected = @booster.predict(booster_set.map_rows(&:to_a))
|
126
|
+
if objective == "multiclass"
|
127
|
+
actual = evaluator.predict(evaluator_set, probabilities: true)
|
128
|
+
# just compare first for now
|
129
|
+
expected.map! { |v| v.first }
|
130
|
+
actual.map! { |v| v.values.first }
|
131
|
+
elsif objective == "binary"
|
132
|
+
actual = evaluator.predict(evaluator_set, probabilities: true).map { |v| v.values.last }
|
133
|
+
else
|
134
|
+
actual = evaluator.predict(evaluator_set)
|
135
|
+
end
|
136
|
+
|
137
|
+
regression = objective == "regression" || objective == "binary"
|
138
|
+
bad_observations = []
|
139
|
+
expected.zip(actual).each_with_index do |(exp, act), i|
|
140
|
+
success = (act - exp).abs < 0.001
|
141
|
+
unless success
|
142
|
+
bad_observations << {expected: exp, actual: act, data_point: evaluator_set[i].map(&:itself).first}
|
143
|
+
end
|
144
|
+
end
|
145
|
+
|
146
|
+
if bad_observations.any?
|
147
|
+
raise "Bug detected in evaluator. Please report an issue. Bad data points: #{bad_observations.inspect}"
|
148
|
+
end
|
184
149
|
end
|
185
150
|
|
186
151
|
# for evaluator
|
@@ -1,40 +1,5 @@
|
|
1
1
|
module Eps
|
2
2
|
class LinearRegression < BaseEstimator
|
3
|
-
# pmml
|
4
|
-
|
5
|
-
def self.load_pmml(data)
|
6
|
-
super do |data|
|
7
|
-
# TODO more validation
|
8
|
-
node = data.css("RegressionTable")
|
9
|
-
|
10
|
-
coefficients = {
|
11
|
-
"_intercept" => node.attribute("intercept").value.to_f
|
12
|
-
}
|
13
|
-
|
14
|
-
features = {}
|
15
|
-
|
16
|
-
text_features, derived_fields = extract_text_features(data, features)
|
17
|
-
|
18
|
-
node.css("NumericPredictor").each do |n|
|
19
|
-
name = n.attribute("name").value
|
20
|
-
if derived_fields[name]
|
21
|
-
name = derived_fields[name]
|
22
|
-
else
|
23
|
-
features[name] = "numeric"
|
24
|
-
end
|
25
|
-
coefficients[name] = n.attribute("coefficient").value.to_f
|
26
|
-
end
|
27
|
-
|
28
|
-
node.css("CategoricalPredictor").each do |n|
|
29
|
-
name = n.attribute("name").value
|
30
|
-
coefficients[[name, n.attribute("value").value]] = n.attribute("coefficient").value.to_f
|
31
|
-
features[name] = "categorical"
|
32
|
-
end
|
33
|
-
|
34
|
-
Evaluators::LinearRegression.new(coefficients: coefficients, features: features, text_features: text_features)
|
35
|
-
end
|
36
|
-
end
|
37
|
-
|
38
3
|
def coefficients
|
39
4
|
@evaluator.coefficients
|
40
5
|
end
|
@@ -72,6 +37,7 @@ module Eps
|
|
72
37
|
str
|
73
38
|
end
|
74
39
|
|
40
|
+
# TODO use keyword arguments for gsl and intercept in 0.4.0
|
75
41
|
def _train(**options)
|
76
42
|
raise "Target must be numeric" if @target_type != "numeric"
|
77
43
|
check_missing_value(@train_set)
|
@@ -84,33 +50,64 @@ module Eps
|
|
84
50
|
end
|
85
51
|
|
86
52
|
x = data.map_rows(&:to_a)
|
87
|
-
data.size.times do |i|
|
88
|
-
# add intercept
|
89
|
-
x[i].unshift(1)
|
90
|
-
end
|
91
53
|
|
92
|
-
gsl =
|
54
|
+
gsl =
|
55
|
+
if options.key?(:gsl)
|
56
|
+
options[:gsl]
|
57
|
+
elsif defined?(GSL)
|
58
|
+
true
|
59
|
+
elsif defined?(GSLR)
|
60
|
+
:gslr
|
61
|
+
else
|
62
|
+
false
|
63
|
+
end
|
64
|
+
|
65
|
+
intercept = options.key?(:intercept) ? options[:intercept] : true
|
66
|
+
if intercept && gsl != :gslr
|
67
|
+
data.size.times do |i|
|
68
|
+
x[i].unshift(1)
|
69
|
+
end
|
70
|
+
end
|
93
71
|
|
94
72
|
v3 =
|
95
|
-
if gsl
|
73
|
+
if gsl == :gslr
|
74
|
+
model = GSLR::OLS.new(intercept: intercept)
|
75
|
+
model.fit(x, data.label, weight: data.weight)
|
76
|
+
|
77
|
+
@covariance = model.covariance
|
78
|
+
|
79
|
+
coefficients = model.coefficients.dup
|
80
|
+
coefficients.unshift(model.intercept) if intercept
|
81
|
+
coefficients
|
82
|
+
elsif gsl
|
96
83
|
x = GSL::Matrix.alloc(*x)
|
97
84
|
y = GSL::Vector.alloc(data.label)
|
98
|
-
|
85
|
+
w = GSL::Vector.alloc(data.weight) if data.weight
|
86
|
+
c, @covariance, _, _ = w ? GSL::MultiFit.wlinear(x, w, y) : GSL::MultiFit.linear(x, y)
|
99
87
|
c.to_a
|
100
88
|
else
|
101
89
|
x = Matrix.rows(x)
|
102
90
|
y = Matrix.column_vector(data.label)
|
91
|
+
|
92
|
+
# weighted OLS
|
93
|
+
# http://www.real-statistics.com/multiple-regression/weighted-linear-regression/weighted-regression-basics/
|
94
|
+
w = Matrix.diagonal(*data.weight) if data.weight
|
95
|
+
|
103
96
|
removed = []
|
104
97
|
|
105
98
|
# https://statsmaths.github.io/stat612/lectures/lec13/lecture13.pdf
|
106
|
-
#
|
99
|
+
# unfortunately, this method is unstable
|
107
100
|
# haven't found an efficient way to do QR-factorization in Ruby
|
108
101
|
# the extendmatrix gem has householder and givens (givens has bug)
|
109
102
|
# but methods are too slow
|
110
103
|
xt = x.t
|
104
|
+
xt *= w if w
|
111
105
|
begin
|
112
106
|
@xtxi = (xt * x).inverse
|
113
107
|
rescue ExceptionForMatrix::ErrNotRegular
|
108
|
+
# matrix cannot be inverted
|
109
|
+
# https://en.wikipedia.org/wiki/Multicollinearity
|
110
|
+
|
114
111
|
constant = {}
|
115
112
|
(1...x.column_count).each do |i|
|
116
113
|
constant[i] = constant?(x.column(i))
|
@@ -134,6 +131,7 @@ module Eps
|
|
134
131
|
end
|
135
132
|
x = Matrix.columns(vectors)
|
136
133
|
xt = x.t
|
134
|
+
xt *= w if w
|
137
135
|
|
138
136
|
# try again
|
139
137
|
begin
|
@@ -144,6 +142,7 @@ module Eps
|
|
144
142
|
end
|
145
143
|
# huge performance boost
|
146
144
|
# by multiplying xt * y first
|
145
|
+
# for weighted, w is already included in wt
|
147
146
|
v2 = @xtxi * (xt * y)
|
148
147
|
|
149
148
|
# convert to array
|
@@ -158,47 +157,14 @@ module Eps
|
|
158
157
|
v2
|
159
158
|
end
|
160
159
|
|
161
|
-
@
|
162
|
-
|
163
|
-
Evaluators::LinearRegression.new(coefficients: @coefficients, features: @features, text_features: @text_features)
|
164
|
-
end
|
165
|
-
|
166
|
-
def generate_pmml
|
167
|
-
predictors = @coefficients.dup
|
168
|
-
predictors.delete("_intercept")
|
169
|
-
|
170
|
-
data_fields = {}
|
171
|
-
@features.each do |k, type|
|
172
|
-
if type == "categorical"
|
173
|
-
data_fields[k] = predictors.keys.select { |k, v| k.is_a?(Array) && k.first == k }.map(&:last)
|
174
|
-
else
|
175
|
-
data_fields[k] = nil
|
176
|
-
end
|
160
|
+
if @xtxi && @xtxi.each(:diagonal).any? { |v| v < 0 }
|
161
|
+
raise UnstableSolution, "GSL is needed to find a stable solution for this dataset"
|
177
162
|
end
|
178
163
|
|
179
|
-
|
180
|
-
|
181
|
-
|
182
|
-
|
183
|
-
xml.MiningField(name: k)
|
184
|
-
end
|
185
|
-
end
|
186
|
-
pmml_local_transformations(xml)
|
187
|
-
xml.RegressionTable(intercept: @coefficients["_intercept"]) do
|
188
|
-
predictors.each do |k, v|
|
189
|
-
if k.is_a?(Array)
|
190
|
-
if @features[k.first] == "text"
|
191
|
-
xml.NumericPredictor(name: display_field(k), coefficient: v)
|
192
|
-
else
|
193
|
-
xml.CategoricalPredictor(name: k[0], value: k[1], coefficient: v)
|
194
|
-
end
|
195
|
-
else
|
196
|
-
xml.NumericPredictor(name: k, coefficient: v)
|
197
|
-
end
|
198
|
-
end
|
199
|
-
end
|
200
|
-
end
|
201
|
-
end
|
164
|
+
@coefficient_names = data.columns.keys
|
165
|
+
@coefficient_names.unshift("_intercept") if intercept
|
166
|
+
@coefficients = Hash[@coefficient_names.zip(v3)]
|
167
|
+
Evaluators::LinearRegression.new(coefficients: @coefficients, features: @features, text_features: @text_features)
|
202
168
|
end
|
203
169
|
|
204
170
|
def prep_x(x)
|
@@ -249,7 +215,11 @@ module Eps
|
|
249
215
|
|
250
216
|
def diagonal
|
251
217
|
@diagonal ||= begin
|
252
|
-
if covariance.
|
218
|
+
if covariance.is_a?(Array)
|
219
|
+
covariance.size.times.map do |i|
|
220
|
+
covariance[i][i]
|
221
|
+
end
|
222
|
+
elsif covariance.respond_to?(:each)
|
253
223
|
d = covariance.each(:diagonal).to_a
|
254
224
|
@removed.each do |i|
|
255
225
|
d.insert(i, 0)
|