eps 0.3.0 → 0.3.5
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +31 -5
- data/README.md +77 -9
- data/lib/eps.rb +19 -10
- data/lib/eps/base_estimator.rb +63 -145
- data/lib/eps/data_frame.rb +19 -3
- data/lib/eps/evaluators/lightgbm.rb +20 -7
- data/lib/eps/evaluators/linear_regression.rb +7 -4
- data/lib/eps/evaluators/naive_bayes.rb +9 -7
- data/lib/eps/label_encoder.rb +7 -3
- data/lib/eps/lightgbm.rb +43 -78
- data/lib/eps/linear_regression.rb +53 -83
- data/lib/eps/metrics.rb +24 -12
- data/lib/eps/model.rb +6 -6
- data/lib/eps/naive_bayes.rb +3 -140
- data/lib/eps/pmml.rb +14 -0
- data/lib/eps/pmml/generator.rb +422 -0
- data/lib/eps/pmml/loader.rb +241 -0
- data/lib/eps/version.rb +1 -1
- metadata +36 -6
- data/lib/eps/pmml_generators/lightgbm.rb +0 -187
data/lib/eps/data_frame.rb
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
module Eps
|
2
2
|
class DataFrame
|
3
3
|
attr_reader :columns
|
4
|
-
attr_accessor :label
|
4
|
+
attr_accessor :label, :weight
|
5
5
|
|
6
6
|
def initialize(data = [])
|
7
7
|
@columns = {}
|
@@ -10,7 +10,7 @@ module Eps
|
|
10
10
|
data.columns.each do |k, v|
|
11
11
|
@columns[k] = v
|
12
12
|
end
|
13
|
-
elsif daru?(data)
|
13
|
+
elsif rover?(data) || daru?(data)
|
14
14
|
data.to_h.each do |k, v|
|
15
15
|
@columns[k.to_s] = v.to_a
|
16
16
|
end
|
@@ -19,6 +19,8 @@ module Eps
|
|
19
19
|
@columns[k.to_s] = v.to_a
|
20
20
|
end
|
21
21
|
else
|
22
|
+
data = data.to_a if numo?(data)
|
23
|
+
|
22
24
|
if data.any?
|
23
25
|
row = data[0]
|
24
26
|
|
@@ -78,6 +80,10 @@ module Eps
|
|
78
80
|
rows = Range.new(rows.begin, size - 1)
|
79
81
|
elsif rows.end < 0
|
80
82
|
rows = Range.new(rows.begin, size + rows.end, rows.exclude_end?)
|
83
|
+
else
|
84
|
+
finish = rows.end
|
85
|
+
finish -= 1 if rows.exclude_end?
|
86
|
+
rows = Range.new(rows.begin, size - 1) if finish >= size - 1
|
81
87
|
end
|
82
88
|
end
|
83
89
|
|
@@ -115,6 +121,7 @@ module Eps
|
|
115
121
|
df.columns[c] = columns[c].values_at(*rows)
|
116
122
|
end
|
117
123
|
df.label = label.values_at(*rows) if label
|
124
|
+
df.weight = weight.values_at(*rows) if weight
|
118
125
|
|
119
126
|
singular ? df.columns[cols[0]] : df
|
120
127
|
end
|
@@ -129,13 +136,22 @@ module Eps
|
|
129
136
|
df.columns[k] = v
|
130
137
|
end
|
131
138
|
df.label = label
|
139
|
+
df.weight = weight
|
132
140
|
df
|
133
141
|
end
|
134
142
|
|
135
143
|
private
|
136
144
|
|
145
|
+
def numo?(x)
|
146
|
+
defined?(Numo::NArray) && x.is_a?(Numo::NArray)
|
147
|
+
end
|
148
|
+
|
149
|
+
def rover?(x)
|
150
|
+
defined?(Rover::DataFrame) && x.is_a?(Rover::DataFrame)
|
151
|
+
end
|
152
|
+
|
137
153
|
def daru?(x)
|
138
|
-
defined?(Daru) && x.is_a?(Daru::DataFrame)
|
154
|
+
defined?(Daru::DataFrame) && x.is_a?(Daru::DataFrame)
|
139
155
|
end
|
140
156
|
end
|
141
157
|
end
|
@@ -11,12 +11,14 @@ module Eps
|
|
11
11
|
@text_features = text_features
|
12
12
|
end
|
13
13
|
|
14
|
-
def predict(data)
|
14
|
+
def predict(data, probabilities: false)
|
15
|
+
raise "Probabilities not supported" if probabilities && @objective == "regression"
|
16
|
+
|
15
17
|
rows = data.map(&:to_h)
|
16
18
|
|
17
19
|
# sparse matrix
|
18
20
|
@text_features.each do |k, v|
|
19
|
-
encoder = TextEncoder.new(v)
|
21
|
+
encoder = TextEncoder.new(**v)
|
20
22
|
|
21
23
|
values = data.columns.delete(k)
|
22
24
|
counts = encoder.transform(values)
|
@@ -38,7 +40,12 @@ module Eps
|
|
38
40
|
when "regression"
|
39
41
|
sum_trees(rows, @trees)
|
40
42
|
when "binary"
|
41
|
-
sum_trees(rows, @trees).map { |s|
|
43
|
+
prob = sum_trees(rows, @trees).map { |s| sigmoid(s) }
|
44
|
+
if probabilities
|
45
|
+
prob.map { |v| @labels.zip([1 - v, v]).to_h }
|
46
|
+
else
|
47
|
+
prob.map { |v| @labels[v > 0.5 ? 1 : 0] }
|
48
|
+
end
|
42
49
|
else
|
43
50
|
tree_scores = []
|
44
51
|
num_trees = @trees.size / @labels.size
|
@@ -47,8 +54,14 @@ module Eps
|
|
47
54
|
end
|
48
55
|
data.size.times.map do |i|
|
49
56
|
v = tree_scores.map { |s| s[i] }
|
50
|
-
|
51
|
-
|
57
|
+
if probabilities
|
58
|
+
exp = v.map { |vi| Math.exp(vi) }
|
59
|
+
sum = exp.sum
|
60
|
+
@labels.zip(exp.map { |e| e / sum }).to_h
|
61
|
+
else
|
62
|
+
idx = v.map.with_index.max_by { |v2, _| v2 }.last
|
63
|
+
@labels[idx]
|
64
|
+
end
|
52
65
|
end
|
53
66
|
end
|
54
67
|
end
|
@@ -81,7 +94,7 @@ module Eps
|
|
81
94
|
else
|
82
95
|
case node.operator
|
83
96
|
when "equal"
|
84
|
-
v == node.value
|
97
|
+
v.to_s == node.value
|
85
98
|
when "in"
|
86
99
|
node.value.include?(v)
|
87
100
|
when "greaterThan"
|
@@ -109,7 +122,7 @@ module Eps
|
|
109
122
|
end
|
110
123
|
|
111
124
|
def sigmoid(x)
|
112
|
-
1.0 / (1 + Math
|
125
|
+
1.0 / (1 + Math.exp(-x))
|
113
126
|
end
|
114
127
|
end
|
115
128
|
end
|
@@ -9,8 +9,10 @@ module Eps
|
|
9
9
|
@text_features = text_features || {}
|
10
10
|
end
|
11
11
|
|
12
|
-
def predict(x)
|
13
|
-
|
12
|
+
def predict(x, probabilities: false)
|
13
|
+
raise "Probabilities not supported" if probabilities
|
14
|
+
|
15
|
+
intercept = @coefficients["_intercept"] || 0.0
|
14
16
|
scores = [intercept] * x.size
|
15
17
|
|
16
18
|
@features.each do |k, type|
|
@@ -19,10 +21,11 @@ module Eps
|
|
19
21
|
case type
|
20
22
|
when "categorical"
|
21
23
|
x.columns[k].each_with_index do |xv, i|
|
22
|
-
|
24
|
+
# TODO clean up
|
25
|
+
scores[i] += (@coefficients[[k, xv]] || @coefficients[[k, xv.to_s]]).to_f
|
23
26
|
end
|
24
27
|
when "text"
|
25
|
-
encoder = TextEncoder.new(
|
28
|
+
encoder = TextEncoder.new(**@text_features[k])
|
26
29
|
counts = encoder.transform(x.columns[k])
|
27
30
|
coef = {}
|
28
31
|
@coefficients.each do |k2, v|
|
@@ -10,14 +10,15 @@ module Eps
|
|
10
10
|
@legacy = legacy
|
11
11
|
end
|
12
12
|
|
13
|
-
def predict(x)
|
13
|
+
def predict(x, probabilities: false)
|
14
14
|
probs = calculate_class_probabilities(x)
|
15
15
|
probs.map do |xp|
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
16
|
+
if probabilities
|
17
|
+
sum = xp.values.map { |v| Math.exp(v) }.sum.to_f
|
18
|
+
xp.map { |k, v| [k, Math.exp(v) / sum] }.to_h
|
19
|
+
else
|
20
|
+
xp.sort_by { |k, v| [-v, k] }[0][0]
|
21
|
+
end
|
21
22
|
end
|
22
23
|
end
|
23
24
|
|
@@ -38,7 +39,8 @@ module Eps
|
|
38
39
|
case type
|
39
40
|
when "categorical"
|
40
41
|
x.columns[k].each_with_index do |xi, i|
|
41
|
-
|
42
|
+
# TODO clean this up
|
43
|
+
vc = probabilities[:conditional][k][xi] || probabilities[:conditional][k][xi.to_s]
|
42
44
|
|
43
45
|
# unknown value if not vc
|
44
46
|
if vc
|
data/lib/eps/label_encoder.rb
CHANGED
@@ -24,9 +24,13 @@ module Eps
|
|
24
24
|
if yi.nil?
|
25
25
|
nil
|
26
26
|
else
|
27
|
-
|
28
|
-
|
29
|
-
|
27
|
+
# use an additional label for unseen values
|
28
|
+
# this is only used during training for the LightGBM eval_set
|
29
|
+
# LightGBM ignores them (only uses seen categories for predictions)
|
30
|
+
# https://github.com/microsoft/LightGBM/issues/1936
|
31
|
+
# the evaluator also ignores them (to be consistent with LightGBM)
|
32
|
+
# but doesn't use this code
|
33
|
+
@labels[yi.to_s] || @labels.size
|
30
34
|
end
|
31
35
|
end
|
32
36
|
end
|
data/lib/eps/lightgbm.rb
CHANGED
@@ -1,39 +1,5 @@
|
|
1
|
-
require "eps/pmml_generators/lightgbm"
|
2
|
-
|
3
1
|
module Eps
|
4
2
|
class LightGBM < BaseEstimator
|
5
|
-
include PmmlGenerators::LightGBM
|
6
|
-
|
7
|
-
def self.load_pmml(data)
|
8
|
-
super do |data|
|
9
|
-
objective = data.css("MiningModel").first.attribute("functionName").value
|
10
|
-
if objective == "classification"
|
11
|
-
labels = data.css("RegressionModel OutputField").map { |n| n.attribute("value").value }
|
12
|
-
objective = labels.size > 2 ? "multiclass" : "binary"
|
13
|
-
end
|
14
|
-
|
15
|
-
features = {}
|
16
|
-
text_features, derived_fields = extract_text_features(data, features)
|
17
|
-
node = data.css("DataDictionary").first
|
18
|
-
node.css("DataField")[1..-1].to_a.each do |node|
|
19
|
-
features[node.attribute("name").value] =
|
20
|
-
if node.attribute("optype").value == "categorical"
|
21
|
-
"categorical"
|
22
|
-
else
|
23
|
-
"numeric"
|
24
|
-
end
|
25
|
-
end
|
26
|
-
|
27
|
-
trees = []
|
28
|
-
data.css("Segmentation TreeModel").each do |tree|
|
29
|
-
node = find_nodes(tree.css("Node").first, derived_fields)
|
30
|
-
trees << node
|
31
|
-
end
|
32
|
-
|
33
|
-
Evaluators::LightGBM.new(trees: trees, objective: objective, labels: labels, features: features, text_features: text_features)
|
34
|
-
end
|
35
|
-
end
|
36
|
-
|
37
3
|
private
|
38
4
|
|
39
5
|
def _summary(extended: false)
|
@@ -51,48 +17,16 @@ module Eps
|
|
51
17
|
str
|
52
18
|
end
|
53
19
|
|
54
|
-
def
|
55
|
-
score = BigDecimal(xml.attribute("score").value).to_f
|
56
|
-
|
57
|
-
elements = xml.elements
|
58
|
-
xml_predicate = elements.first
|
59
|
-
|
60
|
-
predicate =
|
61
|
-
if xml_predicate.name == "True"
|
62
|
-
nil
|
63
|
-
elsif xml_predicate.name == "SimpleSetPredicate"
|
64
|
-
operator = "in"
|
65
|
-
value = xml_predicate.css("Array").text.scan(/"(.+?)(?<!\\)"|(\S+)/).flatten.compact.map { |v| v.gsub('\"', '"') }
|
66
|
-
field = xml_predicate.attribute("field").value
|
67
|
-
field = derived_fields[field] if derived_fields[field]
|
68
|
-
{
|
69
|
-
field: field,
|
70
|
-
operator: operator,
|
71
|
-
value: value
|
72
|
-
}
|
73
|
-
else
|
74
|
-
operator = xml_predicate.attribute("operator").value
|
75
|
-
value = xml_predicate.attribute("value").value
|
76
|
-
value = BigDecimal(value).to_f if operator == "greaterThan"
|
77
|
-
field = xml_predicate.attribute("field").value
|
78
|
-
field = derived_fields[field] if derived_fields[field]
|
79
|
-
{
|
80
|
-
field: field,
|
81
|
-
operator: operator,
|
82
|
-
value: value
|
83
|
-
}
|
84
|
-
end
|
85
|
-
|
86
|
-
children = elements[1..-1].map { |n| find_nodes(n, derived_fields) }
|
87
|
-
|
88
|
-
Evaluators::Node.new(score: score, predicate: predicate, children: children)
|
89
|
-
end
|
90
|
-
|
91
|
-
def _train(verbose: nil, early_stopping: nil)
|
20
|
+
def _train(verbose: nil, early_stopping: nil, learning_rate: 0.1)
|
92
21
|
train_set = @train_set
|
93
22
|
validation_set = @validation_set.dup
|
94
23
|
summary_label = train_set.label
|
95
24
|
|
25
|
+
# create check set
|
26
|
+
evaluator_set = validation_set || train_set
|
27
|
+
check_idx = 100.times.map { rand(evaluator_set.size) }.uniq
|
28
|
+
evaluator_set = evaluator_set[check_idx]
|
29
|
+
|
96
30
|
# objective
|
97
31
|
objective =
|
98
32
|
if @target_type == "numeric"
|
@@ -126,7 +60,10 @@ module Eps
|
|
126
60
|
prep_text_features(validation_set) if validation_set
|
127
61
|
|
128
62
|
# create params
|
129
|
-
params = {
|
63
|
+
params = {
|
64
|
+
objective: objective,
|
65
|
+
learning_rate: learning_rate
|
66
|
+
}
|
130
67
|
params[:num_classes] = labels.size if objective == "multiclass"
|
131
68
|
if train_set.size < 30
|
132
69
|
params[:min_data_in_bin] = 1
|
@@ -135,8 +72,8 @@ module Eps
|
|
135
72
|
|
136
73
|
# create datasets
|
137
74
|
categorical_idx = @features.values.map.with_index.select { |type, _| type == "categorical" }.map(&:last)
|
138
|
-
train_ds = ::LightGBM::Dataset.new(train_set.map_rows(&:to_a), label: train_set.label, categorical_feature: categorical_idx, params: params)
|
139
|
-
validation_ds = ::LightGBM::Dataset.new(validation_set.map_rows(&:to_a), label: validation_set.label, categorical_feature: categorical_idx, params: params, reference: train_ds) if validation_set
|
75
|
+
train_ds = ::LightGBM::Dataset.new(train_set.map_rows(&:to_a), label: train_set.label, weight: train_set.weight, categorical_feature: categorical_idx, params: params)
|
76
|
+
validation_ds = ::LightGBM::Dataset.new(validation_set.map_rows(&:to_a), label: validation_set.label, weight: validation_set.weight, categorical_feature: categorical_idx, params: params, reference: train_ds) if validation_set
|
140
77
|
|
141
78
|
# train
|
142
79
|
valid_sets = [train_ds]
|
@@ -176,11 +113,39 @@ module Eps
|
|
176
113
|
# reset pmml
|
177
114
|
@pmml = nil
|
178
115
|
|
179
|
-
Evaluators::LightGBM.new(trees: trees, objective: objective, labels: labels, features: @features, text_features: @text_features)
|
116
|
+
evaluator = Evaluators::LightGBM.new(trees: trees, objective: objective, labels: labels, features: @features, text_features: @text_features)
|
117
|
+
booster_set = validation_set ? validation_set[check_idx] : train_set[check_idx]
|
118
|
+
check_evaluator(objective, labels, booster, booster_set, evaluator, evaluator_set)
|
119
|
+
evaluator
|
180
120
|
end
|
181
121
|
|
182
|
-
|
183
|
-
|
122
|
+
# compare a subset of predictions to check for possible bugs in evaluator
|
123
|
+
# NOTE LightGBM must use double data type for prediction input for these to be consistent
|
124
|
+
def check_evaluator(objective, labels, booster, booster_set, evaluator, evaluator_set)
|
125
|
+
expected = @booster.predict(booster_set.map_rows(&:to_a))
|
126
|
+
if objective == "multiclass"
|
127
|
+
actual = evaluator.predict(evaluator_set, probabilities: true)
|
128
|
+
# just compare first for now
|
129
|
+
expected.map! { |v| v.first }
|
130
|
+
actual.map! { |v| v.values.first }
|
131
|
+
elsif objective == "binary"
|
132
|
+
actual = evaluator.predict(evaluator_set, probabilities: true).map { |v| v.values.last }
|
133
|
+
else
|
134
|
+
actual = evaluator.predict(evaluator_set)
|
135
|
+
end
|
136
|
+
|
137
|
+
regression = objective == "regression" || objective == "binary"
|
138
|
+
bad_observations = []
|
139
|
+
expected.zip(actual).each_with_index do |(exp, act), i|
|
140
|
+
success = (act - exp).abs < 0.001
|
141
|
+
unless success
|
142
|
+
bad_observations << {expected: exp, actual: act, data_point: evaluator_set[i].map(&:itself).first}
|
143
|
+
end
|
144
|
+
end
|
145
|
+
|
146
|
+
if bad_observations.any?
|
147
|
+
raise "Bug detected in evaluator. Please report an issue. Bad data points: #{bad_observations.inspect}"
|
148
|
+
end
|
184
149
|
end
|
185
150
|
|
186
151
|
# for evaluator
|
@@ -1,40 +1,5 @@
|
|
1
1
|
module Eps
|
2
2
|
class LinearRegression < BaseEstimator
|
3
|
-
# pmml
|
4
|
-
|
5
|
-
def self.load_pmml(data)
|
6
|
-
super do |data|
|
7
|
-
# TODO more validation
|
8
|
-
node = data.css("RegressionTable")
|
9
|
-
|
10
|
-
coefficients = {
|
11
|
-
"_intercept" => node.attribute("intercept").value.to_f
|
12
|
-
}
|
13
|
-
|
14
|
-
features = {}
|
15
|
-
|
16
|
-
text_features, derived_fields = extract_text_features(data, features)
|
17
|
-
|
18
|
-
node.css("NumericPredictor").each do |n|
|
19
|
-
name = n.attribute("name").value
|
20
|
-
if derived_fields[name]
|
21
|
-
name = derived_fields[name]
|
22
|
-
else
|
23
|
-
features[name] = "numeric"
|
24
|
-
end
|
25
|
-
coefficients[name] = n.attribute("coefficient").value.to_f
|
26
|
-
end
|
27
|
-
|
28
|
-
node.css("CategoricalPredictor").each do |n|
|
29
|
-
name = n.attribute("name").value
|
30
|
-
coefficients[[name, n.attribute("value").value]] = n.attribute("coefficient").value.to_f
|
31
|
-
features[name] = "categorical"
|
32
|
-
end
|
33
|
-
|
34
|
-
Evaluators::LinearRegression.new(coefficients: coefficients, features: features, text_features: text_features)
|
35
|
-
end
|
36
|
-
end
|
37
|
-
|
38
3
|
def coefficients
|
39
4
|
@evaluator.coefficients
|
40
5
|
end
|
@@ -72,6 +37,7 @@ module Eps
|
|
72
37
|
str
|
73
38
|
end
|
74
39
|
|
40
|
+
# TODO use keyword arguments for gsl and intercept in 0.4.0
|
75
41
|
def _train(**options)
|
76
42
|
raise "Target must be numeric" if @target_type != "numeric"
|
77
43
|
check_missing_value(@train_set)
|
@@ -84,33 +50,64 @@ module Eps
|
|
84
50
|
end
|
85
51
|
|
86
52
|
x = data.map_rows(&:to_a)
|
87
|
-
data.size.times do |i|
|
88
|
-
# add intercept
|
89
|
-
x[i].unshift(1)
|
90
|
-
end
|
91
53
|
|
92
|
-
gsl =
|
54
|
+
gsl =
|
55
|
+
if options.key?(:gsl)
|
56
|
+
options[:gsl]
|
57
|
+
elsif defined?(GSL)
|
58
|
+
true
|
59
|
+
elsif defined?(GSLR)
|
60
|
+
:gslr
|
61
|
+
else
|
62
|
+
false
|
63
|
+
end
|
64
|
+
|
65
|
+
intercept = options.key?(:intercept) ? options[:intercept] : true
|
66
|
+
if intercept && gsl != :gslr
|
67
|
+
data.size.times do |i|
|
68
|
+
x[i].unshift(1)
|
69
|
+
end
|
70
|
+
end
|
93
71
|
|
94
72
|
v3 =
|
95
|
-
if gsl
|
73
|
+
if gsl == :gslr
|
74
|
+
model = GSLR::OLS.new(intercept: intercept)
|
75
|
+
model.fit(x, data.label, weight: data.weight)
|
76
|
+
|
77
|
+
@covariance = model.covariance
|
78
|
+
|
79
|
+
coefficients = model.coefficients.dup
|
80
|
+
coefficients.unshift(model.intercept) if intercept
|
81
|
+
coefficients
|
82
|
+
elsif gsl
|
96
83
|
x = GSL::Matrix.alloc(*x)
|
97
84
|
y = GSL::Vector.alloc(data.label)
|
98
|
-
|
85
|
+
w = GSL::Vector.alloc(data.weight) if data.weight
|
86
|
+
c, @covariance, _, _ = w ? GSL::MultiFit.wlinear(x, w, y) : GSL::MultiFit.linear(x, y)
|
99
87
|
c.to_a
|
100
88
|
else
|
101
89
|
x = Matrix.rows(x)
|
102
90
|
y = Matrix.column_vector(data.label)
|
91
|
+
|
92
|
+
# weighted OLS
|
93
|
+
# http://www.real-statistics.com/multiple-regression/weighted-linear-regression/weighted-regression-basics/
|
94
|
+
w = Matrix.diagonal(*data.weight) if data.weight
|
95
|
+
|
103
96
|
removed = []
|
104
97
|
|
105
98
|
# https://statsmaths.github.io/stat612/lectures/lec13/lecture13.pdf
|
106
|
-
#
|
99
|
+
# unfortunately, this method is unstable
|
107
100
|
# haven't found an efficient way to do QR-factorization in Ruby
|
108
101
|
# the extendmatrix gem has householder and givens (givens has bug)
|
109
102
|
# but methods are too slow
|
110
103
|
xt = x.t
|
104
|
+
xt *= w if w
|
111
105
|
begin
|
112
106
|
@xtxi = (xt * x).inverse
|
113
107
|
rescue ExceptionForMatrix::ErrNotRegular
|
108
|
+
# matrix cannot be inverted
|
109
|
+
# https://en.wikipedia.org/wiki/Multicollinearity
|
110
|
+
|
114
111
|
constant = {}
|
115
112
|
(1...x.column_count).each do |i|
|
116
113
|
constant[i] = constant?(x.column(i))
|
@@ -134,6 +131,7 @@ module Eps
|
|
134
131
|
end
|
135
132
|
x = Matrix.columns(vectors)
|
136
133
|
xt = x.t
|
134
|
+
xt *= w if w
|
137
135
|
|
138
136
|
# try again
|
139
137
|
begin
|
@@ -144,6 +142,7 @@ module Eps
|
|
144
142
|
end
|
145
143
|
# huge performance boost
|
146
144
|
# by multiplying xt * y first
|
145
|
+
# for weighted, w is already included in xt
|
147
146
|
v2 = @xtxi * (xt * y)
|
148
147
|
|
149
148
|
# convert to array
|
@@ -158,47 +157,14 @@ module Eps
|
|
158
157
|
v2
|
159
158
|
end
|
160
159
|
|
161
|
-
@
|
162
|
-
|
163
|
-
Evaluators::LinearRegression.new(coefficients: @coefficients, features: @features, text_features: @text_features)
|
164
|
-
end
|
165
|
-
|
166
|
-
def generate_pmml
|
167
|
-
predictors = @coefficients.dup
|
168
|
-
predictors.delete("_intercept")
|
169
|
-
|
170
|
-
data_fields = {}
|
171
|
-
@features.each do |k, type|
|
172
|
-
if type == "categorical"
|
173
|
-
data_fields[k] = predictors.keys.select { |k, v| k.is_a?(Array) && k.first == k }.map(&:last)
|
174
|
-
else
|
175
|
-
data_fields[k] = nil
|
176
|
-
end
|
160
|
+
if @xtxi && @xtxi.each(:diagonal).any? { |v| v < 0 }
|
161
|
+
raise UnstableSolution, "GSL is needed to find a stable solution for this dataset"
|
177
162
|
end
|
178
163
|
|
179
|
-
|
180
|
-
|
181
|
-
|
182
|
-
|
183
|
-
xml.MiningField(name: k)
|
184
|
-
end
|
185
|
-
end
|
186
|
-
pmml_local_transformations(xml)
|
187
|
-
xml.RegressionTable(intercept: @coefficients["_intercept"]) do
|
188
|
-
predictors.each do |k, v|
|
189
|
-
if k.is_a?(Array)
|
190
|
-
if @features[k.first] == "text"
|
191
|
-
xml.NumericPredictor(name: display_field(k), coefficient: v)
|
192
|
-
else
|
193
|
-
xml.CategoricalPredictor(name: k[0], value: k[1], coefficient: v)
|
194
|
-
end
|
195
|
-
else
|
196
|
-
xml.NumericPredictor(name: k, coefficient: v)
|
197
|
-
end
|
198
|
-
end
|
199
|
-
end
|
200
|
-
end
|
201
|
-
end
|
164
|
+
@coefficient_names = data.columns.keys
|
165
|
+
@coefficient_names.unshift("_intercept") if intercept
|
166
|
+
@coefficients = Hash[@coefficient_names.zip(v3)]
|
167
|
+
Evaluators::LinearRegression.new(coefficients: @coefficients, features: @features, text_features: @text_features)
|
202
168
|
end
|
203
169
|
|
204
170
|
def prep_x(x)
|
@@ -249,7 +215,11 @@ module Eps
|
|
249
215
|
|
250
216
|
def diagonal
|
251
217
|
@diagonal ||= begin
|
252
|
-
if covariance.
|
218
|
+
if covariance.is_a?(Array)
|
219
|
+
covariance.size.times.map do |i|
|
220
|
+
covariance[i][i]
|
221
|
+
end
|
222
|
+
elsif covariance.respond_to?(:each)
|
253
223
|
d = covariance.each(:diagonal).to_a
|
254
224
|
@removed.each do |i|
|
255
225
|
d.insert(i, 0)
|