eps 0.2.1 → 0.3.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +14 -0
- data/LICENSE.txt +1 -1
- data/README.md +183 -243
- data/lib/eps.rb +27 -3
- data/lib/eps/base_estimator.rb +316 -47
- data/lib/eps/data_frame.rb +141 -0
- data/lib/eps/evaluators/lightgbm.rb +116 -0
- data/lib/eps/evaluators/linear_regression.rb +54 -0
- data/lib/eps/evaluators/naive_bayes.rb +95 -0
- data/lib/eps/evaluators/node.rb +26 -0
- data/lib/eps/label_encoder.rb +41 -0
- data/lib/eps/lightgbm.rb +237 -0
- data/lib/eps/linear_regression.rb +132 -386
- data/lib/eps/metrics.rb +46 -0
- data/lib/eps/model.rb +16 -58
- data/lib/eps/naive_bayes.rb +175 -164
- data/lib/eps/pmml_generators/lightgbm.rb +187 -0
- data/lib/eps/statistics.rb +79 -0
- data/lib/eps/text_encoder.rb +81 -0
- data/lib/eps/utils.rb +22 -0
- data/lib/eps/version.rb +1 -1
- metadata +33 -7
data/lib/eps/metrics.rb
ADDED
@@ -0,0 +1,46 @@
|
|
1
|
+
module Eps
  # Evaluation metrics for comparing actual values with predictions.
  #
  # Every public method takes two equally sized collections and raises
  # ArgumentError ("Different sizes") when their sizes differ.
  module Metrics
    class << self
      # Root mean squared error.
      #
      # y_true - actual numeric values
      # y_pred - predicted numeric values
      def rmse(y_true, y_pred)
        check_size(y_true, y_pred)
        squared = errors(y_true, y_pred).map { |e| e**2 }
        Math.sqrt(mean(squared))
      end

      # Mean absolute error.
      def mae(y_true, y_pred)
        check_size(y_true, y_pred)
        mean(errors(y_true, y_pred).map(&:abs))
      end

      # Mean (signed) error: average of actual minus predicted.
      def me(y_true, y_pred)
        check_size(y_true, y_pred)
        mean(errors(y_true, y_pred))
      end

      # Fraction of positions where the prediction equals the actual value.
      def accuracy(y_true, y_pred)
        check_size(y_true, y_pred)
        hits = y_true.zip(y_pred).count { |yt, yp| yt == yp }
        hits / y_true.size.to_f
      end

      # Binary log loss.
      # http://wiki.fast.ai/index.php/Log_Loss
      #
      # y_true - actual labels; 1 (or true) marks the positive class,
      #          anything else is treated as the negative class
      # y_pred - predicted probabilities of the positive class
      # eps    - probabilities are clamped to [eps, 1 - eps] so the
      #          logarithm is never taken at 0
      def log_loss(y_true, y_pred, eps: 1e-15)
        check_size(y_true, y_pred)
        losses = y_true.zip(y_pred).map do |yt, yp|
          prob = yp.clamp(eps, 1 - eps)
          # accept boolean labels as well as 0/1 so true is not
          # silently scored as a negative example
          positive = yt == 1 || yt == true
          -Math.log(positive ? prob : 1 - prob)
        end
        mean(losses)
      end

      private

      # Raises ArgumentError unless both collections have the same size.
      def check_size(y_true, y_pred)
        raise ArgumentError, "Different sizes" if y_true.size != y_pred.size
      end

      # Arithmetic mean as a Float (NaN for an empty array, since 0 / 0.0).
      def mean(arr)
        arr.sum / arr.size.to_f
      end

      # Element-wise differences, actual minus predicted.
      def errors(y_true, y_pred)
        y_true.zip(y_pred).map { |yt, yp| yt - yp }
      end
    end
  end
end
|
data/lib/eps/model.rb
CHANGED
@@ -1,12 +1,10 @@
|
|
1
1
|
module Eps
|
2
2
|
class Model
|
3
|
-
def initialize(data = nil, y = nil,
|
4
|
-
@options = options
|
5
|
-
|
3
|
+
# Wraps an existing estimator, or trains a new one when data is given.
#
# data      - training data (handed to #train unchanged)
# y         - optional target values (handed to #train unchanged)
# estimator - an already-built estimator; when given, data is ignored
# options   - remaining keywords, forwarded to #train
def initialize(data = nil, y = nil, estimator: nil, **options)
  if estimator
    @estimator = estimator
    return
  end

  train(data, y, **options) if data
end
|
12
10
|
|
@@ -14,12 +12,13 @@ module Eps
|
|
14
12
|
|
15
13
|
def self.load_pmml(data)
|
16
14
|
if data.is_a?(String)
|
17
|
-
require "nokogiri"
|
18
15
|
data = Nokogiri::XML(data) { |config| config.strict }
|
19
16
|
end
|
20
17
|
|
21
18
|
estimator_class =
|
22
|
-
if data.css("
|
19
|
+
if data.css("Segmentation").any?
|
20
|
+
Eps::LightGBM
|
21
|
+
elsif data.css("RegressionModel").any?
|
23
22
|
Eps::LinearRegression
|
24
23
|
elsif data.css("NaiveBayesModel").any?
|
25
24
|
Eps::NaiveBayes
|
@@ -30,55 +29,22 @@ module Eps
|
|
30
29
|
new(estimator: estimator_class.load_pmml(data))
|
31
30
|
end
|
32
31
|
|
33
|
-
# ruby - legacy
|
34
|
-
|
35
|
-
def self.load(data)
|
36
|
-
new(estimator: Eps::LinearRegression.load(data))
|
37
|
-
end
|
38
|
-
|
39
|
-
# json - legacy
|
40
|
-
|
41
|
-
def self.load_json(data)
|
42
|
-
new(estimator: Eps::LinearRegression.load_json(data))
|
43
|
-
end
|
44
|
-
|
45
|
-
def to_json
|
46
|
-
@estimator ? @estimator.to_json : super
|
47
|
-
end
|
48
|
-
|
49
|
-
# pfa - legacy
|
50
|
-
|
51
|
-
def self.load_pfa(data)
|
52
|
-
new(estimator: Eps::LinearRegression.load_pfa(data))
|
53
|
-
end
|
54
|
-
|
55
|
-
# metrics
|
56
|
-
|
57
|
-
def self.metrics(actual, estimated)
|
58
|
-
estimator_class =
|
59
|
-
if numeric?(actual)
|
60
|
-
Eps::LinearRegression
|
61
|
-
else
|
62
|
-
Eps::NaiveBayes
|
63
|
-
end
|
64
|
-
|
65
|
-
estimator_class.metrics(actual, estimated)
|
66
|
-
end
|
67
|
-
|
68
32
|
private
|
69
33
|
|
70
|
-
def train(data, y = nil, target: nil)
|
71
|
-
y ||= daru?(data) ? data[target].to_a : data.map { |r| r[target] }
|
72
|
-
|
34
|
+
# Builds the estimator for the requested algorithm and stores it.
#
# algorithm - :lightgbm (default), :linear_regression or :naive_bayes
# options   - remaining keywords, forwarded to the estimator constructor
#
# Raises ArgumentError for any other algorithm value.
def train(data, y = nil, target: nil, algorithm: :lightgbm, **options)
  klass =
    case algorithm
    when :lightgbm then Eps::LightGBM
    when :linear_regression then Eps::LinearRegression
    when :naive_bayes then Eps::NaiveBayes
    else raise ArgumentError, "Unknown algorithm: #{algorithm}"
    end

  @estimator = klass.new(data, y, target: target, **options)
end
|
83
49
|
|
84
50
|
def respond_to_missing?(name, include_private = false)
|
@@ -90,19 +56,11 @@ module Eps
|
|
90
56
|
end
|
91
57
|
|
92
58
|
# Forwards unknown calls to the wrapped estimator when it responds to
# them; everything else falls through to the default behaviour.
def method_missing(method, *args, &block)
  delegatable = @estimator && @estimator.respond_to?(method)
  return super unless delegatable

  @estimator.public_send(method, *args, &block)
end
|
99
|
-
|
100
|
-
def self.numeric?(y)
|
101
|
-
y.first.is_a?(Numeric)
|
102
|
-
end
|
103
|
-
|
104
|
-
def daru?(x)
|
105
|
-
defined?(Daru) && x.is_a?(Daru::DataFrame)
|
106
|
-
end
|
107
65
|
end
|
108
66
|
end
|
data/lib/eps/naive_bayes.rb
CHANGED
@@ -2,227 +2,245 @@ module Eps
|
|
2
2
|
class NaiveBayes < BaseEstimator
|
3
3
|
attr_reader :probabilities
|
4
4
|
|
5
|
-
def
|
6
|
-
@
|
7
|
-
@target = target
|
5
|
+
# Accuracy of the model's predictions on its own training set.
def accuracy
  actual = @train_set.label
  predicted = predict(@train_set)
  Eps::Metrics.accuracy(actual, predicted)
end
|
9
8
|
|
10
|
-
|
11
|
-
super
|
12
|
-
|
13
|
-
@y = @y.map { |yi| yi.to_s }
|
9
|
+
# pmml
|
14
10
|
|
15
|
-
|
16
|
-
|
11
|
+
# Builds a naive Bayes evaluator from a PMML document.
#
# The block is handed to the parent implementation, which yields the
# document to parse (presumably already parsed XML - see BaseEstimator).
def self.load_pmml(data)
  super do |doc|
    # TODO more validation
    model = doc.css("NaiveBayesModel")

    # class label => count
    prior = {}
    model.css("BayesOutput TargetValueCount").each do |count_node|
      prior[count_node.attribute("value").value] = count_node.attribute("count").value.to_f
    end

    legacy = false
    conditional = {}
    features = {}

    model.css("BayesInput").each do |input|
      prob = {}
      stats = input.css("TargetValueStat")
      pair_counts = input.css("PairCounts")

      # numeric
      stats.each do |stat|
        gaussian = stat.css("GaussianDistribution")
        prob[stat.attribute("value").value] = {
          mean: gaussian.attribute("mean").value.to_f,
          stdev: Math.sqrt(gaussian.attribute("variance").value.to_f)
        }
      end

      # detect bad form in Eps < 0.3
      # (the PairCounts values are exactly the class labels, in order)
      bad_format = pair_counts.map { |pair| pair.attribute("value").value } == prior.keys

      # categorical
      pair_counts.each do |pair|
        outer = pair.attribute("value").value
        counts = pair.css("TargetValueCount")

        if bad_format
          # nesting is swapped in this form, so key by the inner value first
          counts.each do |count_node|
            inner = count_node.attribute("value").value
            (prob[inner] ||= {})[outer] = BigDecimal(count_node.attribute("count").value)
          end
        else
          prob[outer] = counts.each_with_object({}) do |count_node, by_inner|
            by_inner[count_node.attribute("value").value] = BigDecimal(count_node.attribute("count").value)
          end
        end
      end

      if bad_format
        legacy = true
        # make sure every entry has a count for every class label
        prob.each_value do |by_label|
          prior.each_key { |label| by_label[label] ||= 0.0 }
        end
      end

      name = input.attribute("fieldName").value
      conditional[name] = prob
      features[name] = stats.any? ? "numeric" : "categorical"
    end

    # NOTE(review): the target field name is looked up but never used;
    # the lookup is kept so any side effect (e.g. raising on a missing
    # BayesOutput/fieldName) is unchanged - confirm and remove if unneeded
    model.css("BayesOutput").attribute("fieldName").value

    probabilities = {
      prior: prior,
      conditional: conditional
    }

    # get derived fields
    derived = {}
    doc.css("DerivedField").each do |field_node|
      name = field_node.attribute("name").value
      norm = field_node.css("NormDiscrete")
      field = norm.attribute("field").value
      value = norm.attribute("value").value
      features.delete(name)
      features[field] = "derived"
      (derived[field] ||= {})[name] = value
    end

    Evaluators::NaiveBayes.new(probabilities: probabilities, features: features, derived: derived, legacy: legacy)
  end
end
|
48
92
|
|
93
|
+
private
|
94
|
+
|
49
95
|
# TODO better summary
|
50
|
-
def
|
96
|
+
# TODO better summary
# Returns one "label: count" line per entry of the prior.
# The extended flag is accepted but not used here.
def _summary(extended: false)
  probabilities[:prior].reduce(String.new("")) do |text, (label, count)|
    text + "#{label}: #{count}\n"
  end
end
|
59
103
|
|
60
|
-
def
|
61
|
-
|
62
|
-
|
104
|
+
# Fits the naive Bayes model on @train_set and returns its evaluator.
#
# smoothing - additive smoothing applied to categorical counts;
#             pass nil/false to skip smoothing
# options   - accepted for interface compatibility, not used here
#
# Raises if the target is not categorical or a text feature is present.
def _train(smoothing: 1, **options)
  raise "Target must be strings" if @target_type != "categorical"
  check_missing_value(@train_set)
  check_missing_value(@validation_set) if @validation_set

  data = @train_set
  prep_text_features(data)

  # convert boolean to strings
  data.label = data.label.map(&:to_s)

  # row positions per label, in order of first appearance
  rows_by_label = {}
  data.label.each_with_index do |label, row|
    (rows_by_label[label] ||= []) << row
  end

  # subset of the training data per label
  grouped = {}
  rows_by_label.each { |label, rows| grouped[label] = data[rows] }

  # label => number of rows, ordered by label
  prior = grouped.sort_by { |label, _| label }.map { |label, subset| [label, subset.size] }.to_h
  labels = prior.keys

  # zeroed counter template, copied for every categorical value
  target_counts = labels.map { |label| [label, 0] }.to_h

  conditional = {}

  @features.each do |name, type|
    prob = {}

    case type
    when "text"
      raise "Text features not supported yet for naive Bayes"
    when "categorical"
      rows_by_value = Hash.new { |hash, key| hash[key] = [] }
      data.columns[name].each_with_index do |value, row|
        rows_by_value[value] << row
      end

      rows_by_value.each do |value, rows|
        prob[value] = group_count(data[rows].label, target_counts.dup)
      end

      # smooth
      if smoothing
        labels.each do |label|
          total = prob.values.map { |counts| counts[label] }.sum.to_f
          prob.each_value do |counts|
            counts[label] = (counts[label] + smoothing) * total / (total + (prob.size * smoothing))
          end
        end
      end
    else
      labels.each do |label|
        subset = grouped[label]

        # TODO handle this case
        next unless subset

        column = subset.columns[name]
        prob[label] = {mean: mean(column), stdev: stdev(column)}
      end
    end

    conditional[name] = prob
  end

  @probabilities = {
    prior: prior,
    conditional: conditional
  }

  Evaluators::NaiveBayes.new(probabilities: probabilities, features: @features)
end
|
104
187
|
|
105
|
-
def
|
188
|
+
# Serializes the trained probabilities as a PMML NaiveBayesModel.
#
# Categorical features are written as PairCounts, all other features as
# Gaussian TargetValueStats. Entries are sorted by key so the output is
# deterministic. The document skeleton comes from build_pmml, which
# receives the field list and yields the XML builder.
#
# Returns whatever build_pmml returns.
def generate_pmml
  prior = probabilities[:prior]
  conditional = probabilities[:conditional]

  # field name => possible values (nil for non-categorical features)
  data_fields = {}
  data_fields[@target] = prior.keys
  conditional.each do |name, stats|
    data_fields[name] = @features[name] == "categorical" ? stats.keys : nil
  end

  build_pmml(data_fields) do |xml|
    xml.NaiveBayesModel(functionName: "classification", threshold: 0.001) do
      xml.MiningSchema do
        data_fields.each_key { |name| xml.MiningField(name: name) }
      end
      xml.BayesInputs do
        conditional.each do |name, stats|
          xml.BayesInput(fieldName: name) do
            if @features[name] == "categorical"
              stats.sort_by { |value, _| value }.each do |value, counts|
                xml.PairCounts(value: value) do
                  xml.TargetValueCounts do
                    counts.sort_by { |label, _| label }.each do |label, count|
                      xml.TargetValueCount(value: label, count: count)
                    end
                  end
                end
              end
            else
              xml.TargetValueStats do
                stats.sort_by { |label, _| label }.each do |label, dist|
                  xml.TargetValueStat(value: label) do
                    # NOTE(review): dist[:stdev] may be nil when a label has
                    # a single training row (see stdev) - confirm handling
                    xml.GaussianDistribution(mean: dist[:mean], variance: dist[:stdev]**2)
                  end
                end
              end
            end
          end
        end
      end
      # the output must name the same target field that is declared in
      # the data fields / mining schema above, not a fixed "target"
      xml.BayesOutput(fieldName: @target) do
        xml.TargetValueCounts do
          prior.sort_by { |label, _| label }.each do |label, count|
            xml.TargetValueCount(value: label, count: count)
          end
        end
      end
    end
  end
end
|
216
241
|
|
217
|
-
def
|
218
|
-
|
219
|
-
(1 / (Math.sqrt(2 * Math::PI) * stdev)) * exponent
|
220
|
-
end
|
221
|
-
|
222
|
-
def group_count(arr)
|
223
|
-
r = arr.inject(Hash.new(0)) { |h, e| h[e.to_s] += 1 ; h }
|
224
|
-
r.default = nil
|
225
|
-
r
|
242
|
+
# Tallies the elements of arr into the given counter hash.
#
# start - hash that already holds a numeric count for every element of
#         arr; it is updated in place and returned
def group_count(arr, start)
  arr.each_with_object(start) { |element, counts| counts[element] += 1 }
end
|
227
245
|
|
228
246
|
def mean(arr)
|
@@ -230,17 +248,10 @@ module Eps
|
|
230
248
|
end
|
231
249
|
|
232
250
|
# Sample standard deviation (divides by n - 1).
# Returns nil when there are fewer than two values.
def stdev(arr)
  return nil if arr.size <= 1

  center = mean(arr)
  squared_deviations = arr.reduce(0) { |total, value| total + (value - center)**2 }
  Math.sqrt(squared_deviations / (arr.length - 1).to_f)
end
|
237
|
-
|
238
|
-
def stringify_keys(h)
|
239
|
-
o = {}
|
240
|
-
h.each do |k, v|
|
241
|
-
o[k.to_s] = v
|
242
|
-
end
|
243
|
-
o
|
244
|
-
end
|
245
256
|
end
|
246
257
|
end
|