eps 0.2.1 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +14 -0
- data/LICENSE.txt +1 -1
- data/README.md +183 -243
- data/lib/eps.rb +27 -3
- data/lib/eps/base_estimator.rb +316 -47
- data/lib/eps/data_frame.rb +141 -0
- data/lib/eps/evaluators/lightgbm.rb +116 -0
- data/lib/eps/evaluators/linear_regression.rb +54 -0
- data/lib/eps/evaluators/naive_bayes.rb +95 -0
- data/lib/eps/evaluators/node.rb +26 -0
- data/lib/eps/label_encoder.rb +41 -0
- data/lib/eps/lightgbm.rb +237 -0
- data/lib/eps/linear_regression.rb +132 -386
- data/lib/eps/metrics.rb +46 -0
- data/lib/eps/model.rb +16 -58
- data/lib/eps/naive_bayes.rb +175 -164
- data/lib/eps/pmml_generators/lightgbm.rb +187 -0
- data/lib/eps/statistics.rb +79 -0
- data/lib/eps/text_encoder.rb +81 -0
- data/lib/eps/utils.rb +22 -0
- data/lib/eps/version.rb +1 -1
- metadata +33 -7
data/lib/eps/metrics.rb
ADDED
@@ -0,0 +1,46 @@
+module Eps
+  module Metrics
+    class << self
+      def rmse(y_true, y_pred)
+        check_size(y_true, y_pred)
+        Math.sqrt(mean(errors(y_true, y_pred).map { |v| v**2 }))
+      end
+
+      def mae(y_true, y_pred)
+        check_size(y_true, y_pred)
+        mean(errors(y_true, y_pred).map { |v| v.abs })
+      end
+
+      def me(y_true, y_pred)
+        check_size(y_true, y_pred)
+        mean(errors(y_true, y_pred))
+      end
+
+      def accuracy(y_true, y_pred)
+        check_size(y_true, y_pred)
+        y_true.zip(y_pred).count { |yt, yp| yt == yp } / y_true.size.to_f
+      end
+
+      # http://wiki.fast.ai/index.php/Log_Loss
+      def log_loss(y_true, y_pred, eps: 1e-15)
+        check_size(y_true, y_pred)
+        p = y_pred.map { |yp| yp.clamp(eps, 1 - eps) }
+        mean(y_true.zip(p).map { |yt, pi| yt == 1 ? -Math.log(pi) : -Math.log(1 - pi) })
+      end
+
+      private
+
+      def check_size(y_true, y_pred)
+        raise ArgumentError, "Different sizes" if y_true.size != y_pred.size
+      end
+
+      def mean(arr)
+        arr.sum / arr.size.to_f
+      end
+
+      def errors(y_true, y_pred)
+        y_true.zip(y_pred).map { |yt, yp| yt - yp }
+      end
+    end
+  end
+end
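The new Eps::Metrics module gives the metrics a shared home, replacing the per-estimator metrics methods (the legacy Model.metrics is removed in model.rb below). A minimal usage sketch, not from the package itself; the sample values are invented:

    require "eps"

    y_true = [3.0, 5.0, 2.5]
    y_pred = [2.5, 5.0, 4.0]

    Eps::Metrics.rmse(y_true, y_pred) # => ~0.913
    Eps::Metrics.mae(y_true, y_pred)  # => ~0.667
    Eps::Metrics.me(y_true, y_pred)   # => ~-0.333 (errors are y_true - y_pred)

    # classification: exact-match rate and binary log loss
    # (predictions are clamped to [eps, 1 - eps] before taking logs)
    Eps::Metrics.accuracy(["a", "b", "a"], ["a", "a", "a"]) # => ~0.667
    Eps::Metrics.log_loss([1, 0, 1], [0.9, 0.2, 0.7])       # => ~0.228

    # mismatched lengths raise ArgumentError ("Different sizes") via check_size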
data/lib/eps/model.rb
CHANGED
@@ -1,12 +1,10 @@
 module Eps
   class Model
-    def initialize(data = nil, y = nil,
-      @options = options
-
+    def initialize(data = nil, y = nil, estimator: nil, **options)
       if estimator
         @estimator = estimator
-      elsif data
-        train(data, y,
+      elsif data
+        train(data, y, **options)
       end
     end
@@ -14,12 +12,13 @@ module Eps
 
     def self.load_pmml(data)
       if data.is_a?(String)
-        require "nokogiri"
         data = Nokogiri::XML(data) { |config| config.strict }
       end
 
       estimator_class =
-        if data.css("
+        if data.css("Segmentation").any?
+          Eps::LightGBM
+        elsif data.css("RegressionModel").any?
           Eps::LinearRegression
         elsif data.css("NaiveBayesModel").any?
           Eps::NaiveBayes
@@ -30,55 +29,22 @@ module Eps
       new(estimator: estimator_class.load_pmml(data))
     end
 
-    # ruby - legacy
-
-    def self.load(data)
-      new(estimator: Eps::LinearRegression.load(data))
-    end
-
-    # json - legacy
-
-    def self.load_json(data)
-      new(estimator: Eps::LinearRegression.load_json(data))
-    end
-
-    def to_json
-      @estimator ? @estimator.to_json : super
-    end
-
-    # pfa - legacy
-
-    def self.load_pfa(data)
-      new(estimator: Eps::LinearRegression.load_pfa(data))
-    end
-
-    # metrics
-
-    def self.metrics(actual, estimated)
-      estimator_class =
-        if numeric?(actual)
-          Eps::LinearRegression
-        else
-          Eps::NaiveBayes
-        end
-
-      estimator_class.metrics(actual, estimated)
-    end
-
     private
 
-    def train(data, y = nil, target: nil)
-      y ||= daru?(data) ? data[target].to_a : data.map { |r| r[target] }
-
+    def train(data, y = nil, target: nil, algorithm: :lightgbm, **options)
       estimator_class =
-        if numeric?(y)
+        case algorithm
+        when :lightgbm
+          Eps::LightGBM
+        when :linear_regression
           Eps::LinearRegression
-        else
+        when :naive_bayes
           Eps::NaiveBayes
+        else
+          raise ArgumentError, "Unknown algorithm: #{algorithm}"
        end
 
-      @estimator = estimator_class.new(
-      @estimator.train(data, y, target: target)
+      @estimator = estimator_class.new(data, y, target: target, **options)
     end
 
     def respond_to_missing?(name, include_private = false)
@@ -90,19 +56,11 @@ module Eps
     end
 
     def method_missing(method, *args, &block)
-      if @estimator
+      if @estimator && @estimator.respond_to?(method)
         @estimator.public_send(method, *args, &block)
       else
         super
       end
     end
-
-    def self.numeric?(y)
-      y.first.is_a?(Numeric)
-    end
-
-    def daru?(x)
-      defined?(Daru) && x.is_a?(Daru::DataFrame)
-    end
   end
 end
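Training now routes through an explicit algorithm: option instead of inferring the estimator from the target type, and loading goes through PMML only. A minimal sketch of the 0.3.0 flow, with hypothetical rows and column names, and assuming to_pmml is still provided by the estimator as in 0.2:

    require "eps"

    data = [
      {bedrooms: 1, city: "A", price: 100_000},
      {bedrooms: 2, city: "A", price: 125_000},
      {bedrooms: 2, city: "B", price: 135_000},
      {bedrooms: 3, city: "B", price: 162_000}
    ]

    # algorithm: defaults to :lightgbm; :linear_regression and
    # :naive_bayes are the other accepted values, and anything
    # else raises ArgumentError
    model = Eps::Model.new(data, target: :price, algorithm: :linear_regression)
    model.predict(bedrooms: 2, city: "A")

    # load_pmml picks the estimator class from the markup:
    # Segmentation => LightGBM, RegressionModel => LinearRegression,
    # NaiveBayesModel => NaiveBayes
    model2 = Eps::Model.load_pmml(model.to_pmml)

Since the Ruby/JSON/PFA loaders (load, load_json, load_pfa) and Model.metrics are removed, models persisted from 0.2 need to be re-saved as PMML, and metrics computed through Eps::Metrics.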
data/lib/eps/naive_bayes.rb
CHANGED
@@ -2,227 +2,245 @@ module Eps
   class NaiveBayes < BaseEstimator
     attr_reader :probabilities
 
-    def
-      @
-      @target = target
+    def accuracy
+      Eps::Metrics.accuracy(@train_set.label, predict(@train_set))
     end
 
-
-      super
-
-      @y = @y.map { |yi| yi.to_s }
+    # pmml
 
-
-
+    def self.load_pmml(data)
+      super do |data|
+        # TODO more validation
+        node = data.css("NaiveBayesModel")
 
-
-
-
-      x.each_with_index do |xi, i|
-        xi[@target] = @y[i]
+        prior = {}
+        node.css("BayesOutput TargetValueCount").each do |n|
+          prior[n.attribute("value").value] = n.attribute("count").value.to_f
         end
-
-
-
-
-
-
-
-
-
-
-
-
+
+        legacy = false
+
+        conditional = {}
+        features = {}
+        node.css("BayesInput").each do |n|
+          prob = {}
+
+          # numeric
+          n.css("TargetValueStat").each do |n2|
+            n3 = n2.css("GaussianDistribution")
+            prob[n2.attribute("value").value] = {
+              mean: n3.attribute("mean").value.to_f,
+              stdev: Math.sqrt(n3.attribute("variance").value.to_f)
+            }
+          end
+
+          # detect bad format in Eps < 0.3
+          bad_format = n.css("PairCounts").map { |n2| n2.attribute("value").value } == prior.keys
+
+          # categorical
+          n.css("PairCounts").each do |n2|
+            if bad_format
+              n2.css("TargetValueCount").each do |n3|
+                prob[n3.attribute("value").value] ||= {}
+                prob[n3.attribute("value").value][n2.attribute("value").value] = BigDecimal(n3.attribute("count").value)
+              end
             else
-
+              boom = {}
+              n2.css("TargetValueCount").each do |n3|
+                boom[n3.attribute("value").value] = BigDecimal(n3.attribute("count").value)
+              end
+              prob[n2.attribute("value").value] = boom
+            end
+          end
+
+          if bad_format
+            legacy = true
+            prob.each do |k, v|
+              prior.keys.each do |k|
+                v[k] ||= 0.0
+              end
            end
          end
+
+          name = n.attribute("fieldName").value
+          conditional[name] = prob
+          features[name] = n.css("TargetValueStat").any? ? "numeric" : "categorical"
        end
-      end
 
-
-
-
-
+        target = node.css("BayesOutput").attribute("fieldName").value
+
+        probabilities = {
+          prior: prior,
+          conditional: conditional
+        }
+
+        # get derived fields
+        derived = {}
+        data.css("DerivedField").each do |n|
+          name = n.attribute("name").value
+          field = n.css("NormDiscrete").attribute("field").value
+          value = n.css("NormDiscrete").attribute("value").value
+          features.delete(name)
+          features[field] = "derived"
+          derived[field] ||= {}
+          derived[field][name] = value
+        end
+
+        Evaluators::NaiveBayes.new(probabilities: probabilities, features: features, derived: derived, legacy: legacy)
+      end
     end
 
+    private
+
     # TODO better summary
-    def
+    def _summary(extended: false)
       str = String.new("")
       probabilities[:prior].each do |k, v|
         str += "#{k}: #{v}\n"
       end
-      str += "\n"
-      str += "accuracy: %d%%\n" % [(100 * accuracy).round]
       str
     end
 
-    def
-
-
+    def _train(smoothing: 1, **options)
+      raise "Target must be strings" if @target_type != "categorical"
+      check_missing_value(@train_set)
+      check_missing_value(@validation_set) if @validation_set
 
-
+      data = @train_set
 
-
-
-
+      prep_text_features(data)
+
+      # convert boolean to strings
+      data.label = data.label.map(&:to_s)
+
+      indexes = {}
+      data.label.each_with_index do |yi, i|
+        (indexes[yi] ||= []) << i
+      end
+
+      grouped = {}
+      indexes.each do |k, v|
+        grouped[k] = data[v]
+      end
 
       prior = {}
-
-      prior[
+      grouped.sort_by { |k, _| k }.each do |k, v|
+        prior[k] = v.size
+      end
+      labels = prior.keys
+
+      target_counts = {}
+      labels.each do |k|
+        target_counts[k] = 0
       end
 
       conditional = {}
-
+
+      @features.each do |k, type|
         prob = {}
-
-
-
-
-
-        }
-
-
-
-
-
+
+        case type
+        when "text"
+          raise "Text features not supported yet for naive Bayes"
+        when "categorical"
+          groups = Hash.new { |hash, key| hash[key] = [] }
+          data.columns[k].each_with_index do |v, i|
+            groups[v] << i
+          end
+
+          groups.each do |group, indexes|
+            df = data[indexes]
+            prob[group] = group_count(df.label, target_counts.dup)
+          end
+
+          # smooth
+          if smoothing
+            labels.each do |label|
+              sum = prob.map { |k2, v2| v2[label] }.sum.to_f
+              prob.each do |k2, v|
+                v[label] = (v[label] + smoothing) * sum / (sum + (prob.size * smoothing))
+              end
+            end
+          end
+        else
+          labels.each do |group|
+            xs = grouped[group]
+
+            # TODO handle this case
+            next unless xs
+
+            values = xs.columns[k]
+            prob[group] = {mean: mean(values), stdev: stdev(values)}
          end
-          prob[n2.attribute("value").value] = boom
        end
-        conditional[n.attribute("fieldName").value] = prob
-      end
 
-
+        conditional[k] = prob
+      end
 
-      probabilities = {
+      @probabilities = {
         prior: prior,
         conditional: conditional
       }
 
-      new(probabilities: probabilities,
+      Evaluators::NaiveBayes.new(probabilities: probabilities, features: @features)
     end
 
-    def
+    def generate_pmml
       data_fields = {}
       data_fields[@target] = probabilities[:prior].keys
       probabilities[:conditional].each do |k, v|
-        if
+        if @features[k] == "categorical"
           data_fields[k] = v.keys
         else
           data_fields[k] = nil
         end
       end
 
-
-      xml.
-      xml.
-
-
-        if vs
-          xml.DataField(name: k, optype: "categorical", dataType: "string") do
-            vs.each do |v|
-              xml.Value(value: v)
-            end
-          end
-        else
-          xml.DataField(name: k, optype: "continuous", dataType: "double")
-        end
+      build_pmml(data_fields) do |xml|
+        xml.NaiveBayesModel(functionName: "classification", threshold: 0.001) do
+          xml.MiningSchema do
+            data_fields.each do |k, _|
+              xml.MiningField(name: k)
            end
          end
-      xml.
-
-
-
-
-
-
-
-
-          if !v.values[0][:mean]
-            v.each do |k2, v2|
-              xml.PairCounts(value: k2) do
-                xml.TargetValueCounts do
-                  v2.each do |k3, v3|
-                    xml.TargetValueCount(value: k3, count: v3)
-                  end
+          xml.BayesInputs do
+            probabilities[:conditional].each do |k, v|
+              xml.BayesInput(fieldName: k) do
+                if @features[k] == "categorical"
+                  v.sort_by { |k2, _| k2 }.each do |k2, v2|
+                    xml.PairCounts(value: k2) do
+                      xml.TargetValueCounts do
+                        v2.sort_by { |k2, _| k2 }.each do |k3, v3|
+                          xml.TargetValueCount(value: k3, count: v3)
                        end
                      end
                    end
-
-
-
-
-
-
+                  end
+                else
+                  xml.TargetValueStats do
+                    v.sort_by { |k2, _| k2 }.each do |k2, v2|
+                      xml.TargetValueStat(value: k2) do
+                        xml.GaussianDistribution(mean: v2[:mean], variance: v2[:stdev]**2)
                      end
                    end
                  end
                end
              end
            end
-          xml.BayesOutput(fieldName: "target") do
-            xml.TargetValueCounts do
-              probabilities[:prior].each do |k, v|
-                xml.TargetValueCount(value: k, count: v)
-              end
-            end
-          end
          end
-
-
-
-
-
-
-    def self.metrics(actual, estimated)
-      {
-        accuracy: actual.zip(estimated).count { |yi, yi2| yi == yi2 } / actual.size.to_f
-      }
-    end
-
-    private
-
-    def _predict(x)
-      x.map do |xi|
-        probs = calculate_class_probabilities(stringify_keys(xi))
-        # deterministic for equal probabilities
-        probs.sort_by { |k, v| [-v, k.to_s] }[0][0]
-      end
-    end
-
-    def calculate_class_probabilities(x)
-      prob = {}
-      probabilities[:prior].each do |c, cv|
-        prob[c] = cv.to_f / probabilities[:prior].values.sum
-        probabilities[:conditional].each do |k, v|
-          if !v[c][:mean]
-            # TODO compute ahead of time
-            p2 = v[c][x[k]].to_f / v[c].values.sum
-
-            # assign very small probability if probability is 0
-            # TODO use proper smoothing instead
-            if p2 == 0
-              p2 = 0.0001
+          xml.BayesOutput(fieldName: "target") do
+            xml.TargetValueCounts do
+              probabilities[:prior].sort_by { |k, _| k }.each do |k, v|
+                xml.TargetValueCount(value: k, count: v)
+              end
            end
-
-            prob[c] *= p2
-          else
-            prob[c] *= calculate_probability(x[k], v[c][:mean], v[c][:stdev])
          end
        end
      end
-      prob
    end
 
-    def
-
-      (1 / (Math.sqrt(2 * Math::PI) * stdev)) * exponent
-    end
-
-    def group_count(arr)
-      r = arr.inject(Hash.new(0)) { |h, e| h[e.to_s] += 1 ; h }
-      r.default = nil
-      r
+    def group_count(arr, start)
+      arr.inject(start) { |h, e| h[e] += 1; h }
     end
 
     def mean(arr)
@@ -230,17 +248,10 @@ module Eps
     end
 
     def stdev(arr)
+      return nil if arr.size <= 1
       m = mean(arr)
       sum = arr.inject(0) { |accum, i| accum + (i - m)**2 }
       Math.sqrt(sum / (arr.length - 1).to_f)
     end
-
-    def stringify_keys(h)
-      o = {}
-      h.each do |k, v|
-        o[k.to_s] = v
-      end
-      o
-    end
   end
 end
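For orientation, a toy classification run against the reworked class. The data and column names are hypothetical; smoothing defaults to 1 per the _train signature above, and string labels are enforced by the "Target must be strings" check:

    require "eps"

    data = [
      {weather: "sunny", temp: 30, play: "yes"},
      {weather: "sunny", temp: 25, play: "yes"},
      {weather: "rainy", temp: 15, play: "no"},
      {weather: "rainy", temp: 12, play: "no"}
    ]

    model = Eps::Model.new(data, target: :play, algorithm: :naive_bayes)
    model.predict(weather: "sunny", temp: 28) # "yes" is the expected class here
    model.accuracy # training accuracy, computed via Eps::Metrics.accuracy

Note the bad_format/legacy branch in load_pmml: PMML written by eps < 0.3 keyed PairCounts by target value rather than feature value, so the loader detects that shape and re-keys the counts, keeping older saved models loadable.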