eps 0.2.1 → 0.3.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,46 @@
module Eps
  # Stateless evaluation metrics for comparing actual values (+y_true+)
  # against predictions (+y_pred+).
  #
  # Every public method takes two equally sized, non-empty, array-like
  # collections and returns a Float. All of them raise ArgumentError when the
  # sizes differ or when the inputs are empty (a mean over zero observations
  # is undefined and would otherwise surface as a silent NaN).
  module Metrics
    class << self
      # Root mean squared error.
      #
      # @param y_true [Array<Numeric>] actual values
      # @param y_pred [Array<Numeric>] predicted values
      # @return [Float] square root of the mean of the squared errors
      # @raise [ArgumentError] if the sizes differ or the inputs are empty
      def rmse(y_true, y_pred)
        check_size(y_true, y_pred)
        Math.sqrt(mean(errors(y_true, y_pred).map { |e| e**2 }))
      end

      # Mean absolute error.
      #
      # @param y_true [Array<Numeric>] actual values
      # @param y_pred [Array<Numeric>] predicted values
      # @return [Float] mean of the absolute errors
      # @raise [ArgumentError] if the sizes differ or the inputs are empty
      def mae(y_true, y_pred)
        check_size(y_true, y_pred)
        mean(errors(y_true, y_pred).map(&:abs))
      end

      # Mean (signed) error, computed as actual minus predicted.
      #
      # @param y_true [Array<Numeric>] actual values
      # @param y_pred [Array<Numeric>] predicted values
      # @return [Float] mean of the signed errors
      # @raise [ArgumentError] if the sizes differ or the inputs are empty
      def me(y_true, y_pred)
        check_size(y_true, y_pred)
        mean(errors(y_true, y_pred))
      end

      # Fraction of positions where the prediction equals the actual value
      # (compared with ==).
      #
      # @param y_true [Array] actual labels
      # @param y_pred [Array] predicted labels
      # @return [Float] value between 0.0 and 1.0
      # @raise [ArgumentError] if the sizes differ or the inputs are empty
      def accuracy(y_true, y_pred)
        check_size(y_true, y_pred)
        y_true.zip(y_pred).count { |yt, yp| yt == yp } / y_true.size.to_f
      end

      # Binary log loss.
      # http://wiki.fast.ai/index.php/Log_Loss
      #
      # A label equal to 1 is treated as the positive class; any other label
      # is treated as the negative class.
      #
      # @param y_true [Array] actual labels
      # @param y_pred [Array<Numeric>] predicted probabilities of the positive class
      # @param eps [Float] probabilities are clamped to [eps, 1 - eps] so that
      #   a prediction of exactly 0 or 1 never produces an infinite loss
      # @return [Float] mean negative log-likelihood
      # @raise [ArgumentError] if the sizes differ or the inputs are empty
      def log_loss(y_true, y_pred, eps: 1e-15)
        check_size(y_true, y_pred)
        losses = y_true.zip(y_pred).map do |yt, yp|
          prob = yp.clamp(eps, 1 - eps)
          yt == 1 ? -Math.log(prob) : -Math.log(1 - prob)
        end
        mean(losses)
      end

      private

      # Validates that both collections have the same, non-zero size.
      def check_size(y_true, y_pred)
        raise ArgumentError, "Different sizes" if y_true.size != y_pred.size
        # without this guard every metric would divide 0 by 0.0 and return NaN
        raise ArgumentError, "Empty input" if y_true.size.zero?
      end

      # Arithmetic mean as a Float.
      def mean(arr)
        arr.sum / arr.size.to_f
      end

      # Element-wise errors (actual minus predicted).
      def errors(y_true, y_pred)
        y_true.zip(y_pred).map { |yt, yp| yt - yp }
      end
    end
  end
end
data/lib/eps/model.rb CHANGED
@@ -1,12 +1,10 @@
1
1
  module Eps
2
2
  class Model
3
- def initialize(data = nil, y = nil, target: nil, estimator: nil, **options)
4
- @options = options
5
-
3
+ def initialize(data = nil, y = nil, estimator: nil, **options)
6
4
  if estimator
7
5
  @estimator = estimator
8
- elsif data # legacy
9
- train(data, y, target: target)
6
+ elsif data
7
+ train(data, y, **options)
10
8
  end
11
9
  end
12
10
 
@@ -14,12 +12,13 @@ module Eps
14
12
 
15
13
  def self.load_pmml(data)
16
14
  if data.is_a?(String)
17
- require "nokogiri"
18
15
  data = Nokogiri::XML(data) { |config| config.strict }
19
16
  end
20
17
 
21
18
  estimator_class =
22
- if data.css("RegressionModel").any?
19
+ if data.css("Segmentation").any?
20
+ Eps::LightGBM
21
+ elsif data.css("RegressionModel").any?
23
22
  Eps::LinearRegression
24
23
  elsif data.css("NaiveBayesModel").any?
25
24
  Eps::NaiveBayes
@@ -30,55 +29,22 @@ module Eps
30
29
  new(estimator: estimator_class.load_pmml(data))
31
30
  end
32
31
 
33
- # ruby - legacy
34
-
35
- def self.load(data)
36
- new(estimator: Eps::LinearRegression.load(data))
37
- end
38
-
39
- # json - legacy
40
-
41
- def self.load_json(data)
42
- new(estimator: Eps::LinearRegression.load_json(data))
43
- end
44
-
45
- def to_json
46
- @estimator ? @estimator.to_json : super
47
- end
48
-
49
- # pfa - legacy
50
-
51
- def self.load_pfa(data)
52
- new(estimator: Eps::LinearRegression.load_pfa(data))
53
- end
54
-
55
- # metrics
56
-
57
- def self.metrics(actual, estimated)
58
- estimator_class =
59
- if numeric?(actual)
60
- Eps::LinearRegression
61
- else
62
- Eps::NaiveBayes
63
- end
64
-
65
- estimator_class.metrics(actual, estimated)
66
- end
67
-
68
32
  private
69
33
 
70
- def train(data, y = nil, target: nil)
71
- y ||= daru?(data) ? data[target].to_a : data.map { |r| r[target] }
72
-
34
+ def train(data, y = nil, target: nil, algorithm: :lightgbm, **options)
73
35
  estimator_class =
74
- if self.class.numeric?(y)
36
+ case algorithm
37
+ when :lightgbm
38
+ Eps::LightGBM
39
+ when :linear_regression
75
40
  Eps::LinearRegression
76
- else
41
+ when :naive_bayes
77
42
  Eps::NaiveBayes
43
+ else
44
+ raise ArgumentError, "Unknown algorithm: #{algorithm}"
78
45
  end
79
46
 
80
- @estimator = estimator_class.new(**@options)
81
- @estimator.train(data, y, target: target)
47
+ @estimator = estimator_class.new(data, y, target: target, **options)
82
48
  end
83
49
 
84
50
  def respond_to_missing?(name, include_private = false)
@@ -90,19 +56,11 @@ module Eps
90
56
  end
91
57
 
92
58
  def method_missing(method, *args, &block)
93
- if @estimator
59
+ if @estimator && @estimator.respond_to?(method)
94
60
  @estimator.public_send(method, *args, &block)
95
61
  else
96
62
  super
97
63
  end
98
64
  end
99
-
100
- def self.numeric?(y)
101
- y.first.is_a?(Numeric)
102
- end
103
-
104
- def daru?(x)
105
- defined?(Daru) && x.is_a?(Daru::DataFrame)
106
- end
107
65
  end
108
66
  end
@@ -2,227 +2,245 @@ module Eps
2
2
  class NaiveBayes < BaseEstimator
3
3
  attr_reader :probabilities
4
4
 
5
- def initialize(probabilities: nil, target: nil)
6
- @probabilities = probabilities
7
- @target = target
5
+ def accuracy
6
+ Eps::Metrics.accuracy(@train_set.label, predict(@train_set))
8
7
  end
9
8
 
10
- def train(*args)
11
- super
12
-
13
- @y = @y.map { |yi| yi.to_s }
9
+ # pmml
14
10
 
15
- prior = group_count(@y)
16
- conditional = {}
11
+ def self.load_pmml(data)
12
+ super do |data|
13
+ # TODO more validation
14
+ node = data.css("NaiveBayesModel")
17
15
 
18
- if @x.any?
19
- keys = @x.first.keys
20
- x = @x.dup
21
- x.each_with_index do |xi, i|
22
- xi[@target] = @y[i]
16
+ prior = {}
17
+ node.css("BayesOutput TargetValueCount").each do |n|
18
+ prior[n.attribute("value").value] = n.attribute("count").value.to_f
23
19
  end
24
- keys.each do |k|
25
- conditional[k.to_s] = {}
26
- x.group_by { |xi| xi[@target] }.each do |group, xs|
27
- v = xs.map { |xi| xi[k] }
28
-
29
- if categorical?(v[0])
30
- # TODO apply smoothing
31
- # apply smoothing only to
32
- # 1. categorical features
33
- # 2. conditional probabilities
34
- # TODO more efficient count
35
- conditional[k.to_s][group] = group_count(v)
20
+
21
+ legacy = false
22
+
23
+ conditional = {}
24
+ features = {}
25
+ node.css("BayesInput").each do |n|
26
+ prob = {}
27
+
28
+ # numeric
29
+ n.css("TargetValueStat").each do |n2|
30
+ n3 = n2.css("GaussianDistribution")
31
+ prob[n2.attribute("value").value] = {
32
+ mean: n3.attribute("mean").value.to_f,
33
+ stdev: Math.sqrt(n3.attribute("variance").value.to_f)
34
+ }
35
+ end
36
+
37
+ # detect the bad format written by Eps < 0.3
38
+ bad_format = n.css("PairCounts").map { |n2| n2.attribute("value").value } == prior.keys
39
+
40
+ # categorical
41
+ n.css("PairCounts").each do |n2|
42
+ if bad_format
43
+ n2.css("TargetValueCount").each do |n3|
44
+ prob[n3.attribute("value").value] ||= {}
45
+ prob[n3.attribute("value").value][n2.attribute("value").value] = BigDecimal(n3.attribute("count").value)
46
+ end
36
47
  else
37
- conditional[k.to_s][group] = {mean: mean(v), stdev: stdev(v)}
48
+ boom = {}
49
+ n2.css("TargetValueCount").each do |n3|
50
+ boom[n3.attribute("value").value] = BigDecimal(n3.attribute("count").value)
51
+ end
52
+ prob[n2.attribute("value").value] = boom
53
+ end
54
+ end
55
+
56
+ if bad_format
57
+ legacy = true
58
+ prob.each do |k, v|
59
+ prior.keys.each do |k|
60
+ v[k] ||= 0.0
61
+ end
38
62
  end
39
63
  end
64
+
65
+ name = n.attribute("fieldName").value
66
+ conditional[name] = prob
67
+ features[name] = n.css("TargetValueStat").any? ? "numeric" : "categorical"
40
68
  end
41
- end
42
69
 
43
- @probabilities = {
44
- prior: prior,
45
- conditional: conditional
46
- }
70
+ target = node.css("BayesOutput").attribute("fieldName").value
71
+
72
+ probabilities = {
73
+ prior: prior,
74
+ conditional: conditional
75
+ }
76
+
77
+ # get derived fields
78
+ derived = {}
79
+ data.css("DerivedField").each do |n|
80
+ name = n.attribute("name").value
81
+ field = n.css("NormDiscrete").attribute("field").value
82
+ value = n.css("NormDiscrete").attribute("value").value
83
+ features.delete(name)
84
+ features[field] = "derived"
85
+ derived[field] ||= {}
86
+ derived[field][name] = value
87
+ end
88
+
89
+ Evaluators::NaiveBayes.new(probabilities: probabilities, features: features, derived: derived, legacy: legacy)
90
+ end
47
91
  end
48
92
 
93
+ private
94
+
49
95
  # TODO better summary
50
- def summary(extended: false)
96
+ def _summary(extended: false)
51
97
  str = String.new("")
52
98
  probabilities[:prior].each do |k, v|
53
99
  str += "#{k}: #{v}\n"
54
100
  end
55
- str += "\n"
56
- str += "accuracy: %d%%\n" % [(100 * accuracy).round]
57
101
  str
58
102
  end
59
103
 
60
- def accuracy
61
- self.class.metrics(predict(@x), @y)[:accuracy]
62
- end
104
+ def _train(smoothing: 1, **options)
105
+ raise "Target must be strings" if @target_type != "categorical"
106
+ check_missing_value(@train_set)
107
+ check_missing_value(@validation_set) if @validation_set
63
108
 
64
- # pmml
109
+ data = @train_set
65
110
 
66
- def self.load_pmml(data)
67
- # TODO more validation
68
- node = data.css("NaiveBayesModel")
111
+ prep_text_features(data)
112
+
113
+ # convert boolean to strings
114
+ data.label = data.label.map(&:to_s)
115
+
116
+ indexes = {}
117
+ data.label.each_with_index do |yi, i|
118
+ (indexes[yi] ||= []) << i
119
+ end
120
+
121
+ grouped = {}
122
+ indexes.each do |k, v|
123
+ grouped[k] = data[v]
124
+ end
69
125
 
70
126
  prior = {}
71
- node.css("BayesOutput TargetValueCount").each do |n|
72
- prior[n.attribute("value").value] = n.attribute("count").value.to_f
127
+ grouped.sort_by { |k, _| k }.each do |k, v|
128
+ prior[k] = v.size
129
+ end
130
+ labels = prior.keys
131
+
132
+ target_counts = {}
133
+ labels.each do |k|
134
+ target_counts[k] = 0
73
135
  end
74
136
 
75
137
  conditional = {}
76
- node.css("BayesInput").each do |n|
138
+
139
+ @features.each do |k, type|
77
140
  prob = {}
78
- n.css("TargetValueStat").each do |n2|
79
- n3 = n2.css("GaussianDistribution")
80
- prob[n2.attribute("value").value] = {
81
- mean: n3.attribute("mean").value.to_f,
82
- stdev: Math.sqrt(n3.attribute("variance").value.to_f)
83
- }
84
- end
85
- n.css("PairCounts").each do |n2|
86
- boom = {}
87
- n2.css("TargetValueCount").each do |n3|
88
- boom[n3.attribute("value").value] = n3.attribute("count").value.to_f
141
+
142
+ case type
143
+ when "text"
144
+ raise "Text features not supported yet for naive Bayes"
145
+ when "categorical"
146
+ groups = Hash.new { |hash, key| hash[key] = [] }
147
+ data.columns[k].each_with_index do |v, i|
148
+ groups[v] << i
149
+ end
150
+
151
+ groups.each do |group, indexes|
152
+ df = data[indexes]
153
+ prob[group] = group_count(df.label, target_counts.dup)
154
+ end
155
+
156
+ # smooth
157
+ if smoothing
158
+ labels.each do |label|
159
+ sum = prob.map { |k2, v2| v2[label] }.sum.to_f
160
+ prob.each do |k2, v|
161
+ v[label] = (v[label] + smoothing) * sum / (sum + (prob.size * smoothing))
162
+ end
163
+ end
164
+ end
165
+ else
166
+ labels.each do |group|
167
+ xs = grouped[group]
168
+
169
+ # TODO handle this case
170
+ next unless xs
171
+
172
+ values = xs.columns[k]
173
+ prob[group] = {mean: mean(values), stdev: stdev(values)}
89
174
  end
90
- prob[n2.attribute("value").value] = boom
91
175
  end
92
- conditional[n.attribute("fieldName").value] = prob
93
- end
94
176
 
95
- @target = node.css("BayesOutput").attribute("fieldName").value
177
+ conditional[k] = prob
178
+ end
96
179
 
97
- probabilities = {
180
+ @probabilities = {
98
181
  prior: prior,
99
182
  conditional: conditional
100
183
  }
101
184
 
102
- new(probabilities: probabilities, target: @target)
185
+ Evaluators::NaiveBayes.new(probabilities: probabilities, features: @features)
103
186
  end
104
187
 
105
- def to_pmml
188
+ def generate_pmml
106
189
  data_fields = {}
107
190
  data_fields[@target] = probabilities[:prior].keys
108
191
  probabilities[:conditional].each do |k, v|
109
- if !v.values[0][:mean]
192
+ if @features[k] == "categorical"
110
193
  data_fields[k] = v.keys
111
194
  else
112
195
  data_fields[k] = nil
113
196
  end
114
197
  end
115
198
 
116
- builder = Nokogiri::XML::Builder.new do |xml|
117
- xml.PMML(version: "4.3", xmlns: "http://www.dmg.org/PMML-4_3", "xmlns:xsi" => "http://www.w3.org/2001/XMLSchema-instance") do
118
- xml.Header
119
- xml.DataDictionary do
120
- data_fields.each do |k, vs|
121
- if vs
122
- xml.DataField(name: k, optype: "categorical", dataType: "string") do
123
- vs.each do |v|
124
- xml.Value(value: v)
125
- end
126
- end
127
- else
128
- xml.DataField(name: k, optype: "continuous", dataType: "double")
129
- end
199
+ build_pmml(data_fields) do |xml|
200
+ xml.NaiveBayesModel(functionName: "classification", threshold: 0.001) do
201
+ xml.MiningSchema do
202
+ data_fields.each do |k, _|
203
+ xml.MiningField(name: k)
130
204
  end
131
205
  end
132
- xml.NaiveBayesModel(functionName: "classification", threshold: 0.001) do
133
- xml.MiningSchema do
134
- data_fields.each do |k, _|
135
- xml.MiningField(name: k)
136
- end
137
- end
138
- xml.BayesInputs do
139
- probabilities[:conditional].each do |k, v|
140
- xml.BayesInput(fieldName: k) do
141
- if !v.values[0][:mean]
142
- v.each do |k2, v2|
143
- xml.PairCounts(value: k2) do
144
- xml.TargetValueCounts do
145
- v2.each do |k3, v3|
146
- xml.TargetValueCount(value: k3, count: v3)
147
- end
206
+ xml.BayesInputs do
207
+ probabilities[:conditional].each do |k, v|
208
+ xml.BayesInput(fieldName: k) do
209
+ if @features[k] == "categorical"
210
+ v.sort_by { |k2, _| k2 }.each do |k2, v2|
211
+ xml.PairCounts(value: k2) do
212
+ xml.TargetValueCounts do
213
+ v2.sort_by { |k2, _| k2 }.each do |k3, v3|
214
+ xml.TargetValueCount(value: k3, count: v3)
148
215
  end
149
216
  end
150
217
  end
151
- else
152
- xml.TargetValueStats do
153
- v.each do |k2, v2|
154
- xml.TargetValueStat(value: k2) do
155
- xml.GaussianDistribution(mean: v2[:mean], variance: v2[:stdev]**2)
156
- end
218
+ end
219
+ else
220
+ xml.TargetValueStats do
221
+ v.sort_by { |k2, _| k2 }.each do |k2, v2|
222
+ xml.TargetValueStat(value: k2) do
223
+ xml.GaussianDistribution(mean: v2[:mean], variance: v2[:stdev]**2)
157
224
  end
158
225
  end
159
226
  end
160
227
  end
161
228
  end
162
229
  end
163
- xml.BayesOutput(fieldName: "target") do
164
- xml.TargetValueCounts do
165
- probabilities[:prior].each do |k, v|
166
- xml.TargetValueCount(value: k, count: v)
167
- end
168
- end
169
- end
170
230
  end
171
- end
172
- end.to_xml
173
- end
174
-
175
- # metrics
176
-
177
- def self.metrics(actual, estimated)
178
- {
179
- accuracy: actual.zip(estimated).count { |yi, yi2| yi == yi2 } / actual.size.to_f
180
- }
181
- end
182
-
183
- private
184
-
185
- def _predict(x)
186
- x.map do |xi|
187
- probs = calculate_class_probabilities(stringify_keys(xi))
188
- # deterministic for equal probabilities
189
- probs.sort_by { |k, v| [-v, k.to_s] }[0][0]
190
- end
191
- end
192
-
193
- def calculate_class_probabilities(x)
194
- prob = {}
195
- probabilities[:prior].each do |c, cv|
196
- prob[c] = cv.to_f / probabilities[:prior].values.sum
197
- probabilities[:conditional].each do |k, v|
198
- if !v[c][:mean]
199
- # TODO compute ahead of time
200
- p2 = v[c][x[k]].to_f / v[c].values.sum
201
-
202
- # assign very small probability if probability is 0
203
- # TODO use proper smoothing instead
204
- if p2 == 0
205
- p2 = 0.0001
231
+ xml.BayesOutput(fieldName: "target") do
232
+ xml.TargetValueCounts do
233
+ probabilities[:prior].sort_by { |k, _| k }.each do |k, v|
234
+ xml.TargetValueCount(value: k, count: v)
235
+ end
206
236
  end
207
-
208
- prob[c] *= p2
209
- else
210
- prob[c] *= calculate_probability(x[k], v[c][:mean], v[c][:stdev])
211
237
  end
212
238
  end
213
239
  end
214
- prob
215
240
  end
216
241
 
217
- def calculate_probability(x, mean, stdev)
218
- exponent = Math.exp(-((x - mean)**2) / (2 * (stdev**2)))
219
- (1 / (Math.sqrt(2 * Math::PI) * stdev)) * exponent
220
- end
221
-
222
- def group_count(arr)
223
- r = arr.inject(Hash.new(0)) { |h, e| h[e.to_s] += 1 ; h }
224
- r.default = nil
225
- r
242
+ def group_count(arr, start)
243
+ arr.inject(start) { |h, e| h[e] += 1; h }
226
244
  end
227
245
 
228
246
  def mean(arr)
@@ -230,17 +248,10 @@ module Eps
230
248
  end
231
249
 
232
250
  def stdev(arr)
251
+ return nil if arr.size <= 1
233
252
  m = mean(arr)
234
253
  sum = arr.inject(0) { |accum, i| accum + (i - m)**2 }
235
254
  Math.sqrt(sum / (arr.length - 1).to_f)
236
255
  end
237
-
238
- def stringify_keys(h)
239
- o = {}
240
- h.each do |k, v|
241
- o[k.to_s] = v
242
- end
243
- o
244
- end
245
256
  end
246
257
  end