eps 0.2.1 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,46 @@
1
module Eps
  # Evaluation metrics comparing true values/labels with predictions.
  #
  # Every public method takes two equally sized collections (anything that
  # responds to +size+ and +zip+, e.g. Arrays) and raises ArgumentError
  # ("Different sizes") when their sizes differ.
  #
  # Each metric also accepts an optional +weight:+ collection holding one
  # non-negative weight per sample. When given, the plain average is replaced
  # by the weighted average <tt>sum(w_i * v_i) / sum(w_i)</tt>. Omitting it
  # (the default) gives exactly the unweighted result.
  #
  # Empty inputs are not rejected: the average of zero samples is 0 / 0.0,
  # so the metrics return Float::NAN in that case.
  module Metrics
    class << self
      # Root mean squared error.
      #
      # @param y_true [Array<Numeric>] observed values
      # @param y_pred [Array<Numeric>] predicted values
      # @param weight [Array<Numeric>, nil] optional per-sample weights
      # @return [Float]
      # @raise [ArgumentError] if the collections differ in size
      def rmse(y_true, y_pred, weight: nil)
        check_size(y_true, y_pred, weight)
        Math.sqrt(mean(errors(y_true, y_pred).map { |v| v**2 }, weight: weight))
      end

      # Mean absolute error.
      #
      # @param y_true [Array<Numeric>] observed values
      # @param y_pred [Array<Numeric>] predicted values
      # @param weight [Array<Numeric>, nil] optional per-sample weights
      # @return [Float]
      # @raise [ArgumentError] if the collections differ in size
      def mae(y_true, y_pred, weight: nil)
        check_size(y_true, y_pred, weight)
        mean(errors(y_true, y_pred).map { |v| v.abs }, weight: weight)
      end

      # Mean error (bias), computed as the average of <tt>y_true - y_pred</tt>.
      #
      # @param y_true [Array<Numeric>] observed values
      # @param y_pred [Array<Numeric>] predicted values
      # @param weight [Array<Numeric>, nil] optional per-sample weights
      # @return [Float]
      # @raise [ArgumentError] if the collections differ in size
      def me(y_true, y_pred, weight: nil)
        check_size(y_true, y_pred, weight)
        mean(errors(y_true, y_pred), weight: weight)
      end

      # Share of positions where the prediction equals (+==+) the true label.
      #
      # @param y_true [Array] observed labels
      # @param y_pred [Array] predicted labels
      # @param weight [Array<Numeric>, nil] optional per-sample weights
      # @return [Float] value between 0.0 and 1.0
      # @raise [ArgumentError] if the collections differ in size
      def accuracy(y_true, y_pred, weight: nil)
        check_size(y_true, y_pred, weight)
        pairs = y_true.zip(y_pred)
        return pairs.count { |yt, yp| yt == yp } / y_true.size.to_f unless weight

        # weighted: total weight of the correct samples over total weight
        correct = pairs.zip(weight).sum { |(yt, yp), w| yt == yp ? w : 0 }
        correct / weight.sum.to_f
      end

      # Binary log loss (cross-entropy).
      # http://wiki.fast.ai/index.php/Log_Loss
      #
      # A true label that is <tt>== 1</tt> counts as the positive class; any
      # other value counts as the negative class. Predictions are clamped to
      # <tt>[eps, 1 - eps]</tt> so that Math.log never receives 0.
      #
      # @param y_true [Array] observed labels
      # @param y_pred [Array<Numeric>] predicted probability of the positive class
      # @param eps [Float] clamping margin for the probabilities
      # @param weight [Array<Numeric>, nil] optional per-sample weights
      # @return [Float]
      # @raise [ArgumentError] if the collections differ in size
      def log_loss(y_true, y_pred, eps: 1e-15, weight: nil)
        check_size(y_true, y_pred, weight)
        p = y_pred.map { |yp| yp.clamp(eps, 1 - eps) }
        losses = y_true.zip(p).map { |yt, pi| yt == 1 ? -Math.log(pi) : -Math.log(1 - pi) }
        mean(losses, weight: weight)
      end

      private

      # Ensures y_true, y_pred and (when given) weight all have the same size.
      def check_size(y_true, y_pred, weight = nil)
        raise ArgumentError, "Different sizes" if y_true.size != y_pred.size
        raise ArgumentError, "Different sizes" if weight && weight.size != y_true.size
      end

      # Arithmetic mean of arr, or the weighted mean when weight is given.
      def mean(arr, weight: nil)
        return arr.sum / arr.size.to_f unless weight

        arr.zip(weight).sum { |v, w| v * w } / weight.sum.to_f
      end

      # Element-wise residuals y_true - y_pred.
      def errors(y_true, y_pred)
        y_true.zip(y_pred).map { |yt, yp| yt - yp }
      end
    end
  end
end
data/lib/eps/model.rb CHANGED
@@ -1,12 +1,10 @@
1
1
  module Eps
2
2
  class Model
3
- def initialize(data = nil, y = nil, target: nil, estimator: nil, **options)
4
- @options = options
5
-
3
+ def initialize(data = nil, y = nil, estimator: nil, **options)
6
4
  if estimator
7
5
  @estimator = estimator
8
- elsif data # legacy
9
- train(data, y, target: target)
6
+ elsif data
7
+ train(data, y, **options)
10
8
  end
11
9
  end
12
10
 
@@ -14,12 +12,13 @@ module Eps
14
12
 
15
13
  def self.load_pmml(data)
16
14
  if data.is_a?(String)
17
- require "nokogiri"
18
15
  data = Nokogiri::XML(data) { |config| config.strict }
19
16
  end
20
17
 
21
18
  estimator_class =
22
- if data.css("RegressionModel").any?
19
+ if data.css("Segmentation").any?
20
+ Eps::LightGBM
21
+ elsif data.css("RegressionModel").any?
23
22
  Eps::LinearRegression
24
23
  elsif data.css("NaiveBayesModel").any?
25
24
  Eps::NaiveBayes
@@ -30,55 +29,22 @@ module Eps
30
29
  new(estimator: estimator_class.load_pmml(data))
31
30
  end
32
31
 
33
- # ruby - legacy
34
-
35
- def self.load(data)
36
- new(estimator: Eps::LinearRegression.load(data))
37
- end
38
-
39
- # json - legacy
40
-
41
- def self.load_json(data)
42
- new(estimator: Eps::LinearRegression.load_json(data))
43
- end
44
-
45
- def to_json
46
- @estimator ? @estimator.to_json : super
47
- end
48
-
49
- # pfa - legacy
50
-
51
- def self.load_pfa(data)
52
- new(estimator: Eps::LinearRegression.load_pfa(data))
53
- end
54
-
55
- # metrics
56
-
57
- def self.metrics(actual, estimated)
58
- estimator_class =
59
- if numeric?(actual)
60
- Eps::LinearRegression
61
- else
62
- Eps::NaiveBayes
63
- end
64
-
65
- estimator_class.metrics(actual, estimated)
66
- end
67
-
68
32
  private
69
33
 
70
- def train(data, y = nil, target: nil)
71
- y ||= daru?(data) ? data[target].to_a : data.map { |r| r[target] }
72
-
34
+ def train(data, y = nil, target: nil, algorithm: :lightgbm, **options)
73
35
  estimator_class =
74
- if self.class.numeric?(y)
36
+ case algorithm
37
+ when :lightgbm
38
+ Eps::LightGBM
39
+ when :linear_regression
75
40
  Eps::LinearRegression
76
- else
41
+ when :naive_bayes
77
42
  Eps::NaiveBayes
43
+ else
44
+ raise ArgumentError, "Unknown algorithm: #{algorithm}"
78
45
  end
79
46
 
80
- @estimator = estimator_class.new(**@options)
81
- @estimator.train(data, y, target: target)
47
+ @estimator = estimator_class.new(data, y, target: target, **options)
82
48
  end
83
49
 
84
50
  def respond_to_missing?(name, include_private = false)
@@ -90,19 +56,11 @@ module Eps
90
56
  end
91
57
 
92
58
  def method_missing(method, *args, &block)
93
- if @estimator
59
+ if @estimator && @estimator.respond_to?(method)
94
60
  @estimator.public_send(method, *args, &block)
95
61
  else
96
62
  super
97
63
  end
98
64
  end
99
-
100
- def self.numeric?(y)
101
- y.first.is_a?(Numeric)
102
- end
103
-
104
- def daru?(x)
105
- defined?(Daru) && x.is_a?(Daru::DataFrame)
106
- end
107
65
  end
108
66
  end
@@ -2,227 +2,245 @@ module Eps
2
2
  class NaiveBayes < BaseEstimator
3
3
  attr_reader :probabilities
4
4
 
5
- def initialize(probabilities: nil, target: nil)
6
- @probabilities = probabilities
7
- @target = target
5
# Accuracy of this estimator on the data held in @train_set: the labels
# stored there are compared with the result of predicting on that same set.
def accuracy
  actual = @train_set.label
  predicted = predict(@train_set)
  Eps::Metrics.accuracy(actual, predicted)
end
9
8
 
10
- def train(*args)
11
- super
12
-
13
- @y = @y.map { |yi| yi.to_s }
9
+ # pmml
14
10
 
15
- prior = group_count(@y)
16
- conditional = {}
11
+ def self.load_pmml(data)
12
+ super do |data|
13
+ # TODO more validation
14
+ node = data.css("NaiveBayesModel")
17
15
 
18
- if @x.any?
19
- keys = @x.first.keys
20
- x = @x.dup
21
- x.each_with_index do |xi, i|
22
- xi[@target] = @y[i]
16
+ prior = {}
17
+ node.css("BayesOutput TargetValueCount").each do |n|
18
+ prior[n.attribute("value").value] = n.attribute("count").value.to_f
23
19
  end
24
- keys.each do |k|
25
- conditional[k.to_s] = {}
26
- x.group_by { |xi| xi[@target] }.each do |group, xs|
27
- v = xs.map { |xi| xi[k] }
28
-
29
- if categorical?(v[0])
30
- # TODO apply smoothing
31
- # apply smoothing only to
32
- # 1. categorical features
33
- # 2. conditional probabilities
34
- # TODO more efficient count
35
- conditional[k.to_s][group] = group_count(v)
20
+
21
+ legacy = false
22
+
23
+ conditional = {}
24
+ features = {}
25
+ node.css("BayesInput").each do |n|
26
+ prob = {}
27
+
28
+ # numeric
29
+ n.css("TargetValueStat").each do |n2|
30
+ n3 = n2.css("GaussianDistribution")
31
+ prob[n2.attribute("value").value] = {
32
+ mean: n3.attribute("mean").value.to_f,
33
+ stdev: Math.sqrt(n3.attribute("variance").value.to_f)
34
+ }
35
+ end
36
+
37
+ # detect bad form in Eps < 0.3
38
+ bad_format = n.css("PairCounts").map { |n2| n2.attribute("value").value } == prior.keys
39
+
40
+ # categorical
41
+ n.css("PairCounts").each do |n2|
42
+ if bad_format
43
+ n2.css("TargetValueCount").each do |n3|
44
+ prob[n3.attribute("value").value] ||= {}
45
+ prob[n3.attribute("value").value][n2.attribute("value").value] = BigDecimal(n3.attribute("count").value)
46
+ end
36
47
  else
37
- conditional[k.to_s][group] = {mean: mean(v), stdev: stdev(v)}
48
+ boom = {}
49
+ n2.css("TargetValueCount").each do |n3|
50
+ boom[n3.attribute("value").value] = BigDecimal(n3.attribute("count").value)
51
+ end
52
+ prob[n2.attribute("value").value] = boom
53
+ end
54
+ end
55
+
56
+ if bad_format
57
+ legacy = true
58
+ prob.each do |k, v|
59
+ prior.keys.each do |k|
60
+ v[k] ||= 0.0
61
+ end
38
62
  end
39
63
  end
64
+
65
+ name = n.attribute("fieldName").value
66
+ conditional[name] = prob
67
+ features[name] = n.css("TargetValueStat").any? ? "numeric" : "categorical"
40
68
  end
41
- end
42
69
 
43
- @probabilities = {
44
- prior: prior,
45
- conditional: conditional
46
- }
70
+ target = node.css("BayesOutput").attribute("fieldName").value
71
+
72
+ probabilities = {
73
+ prior: prior,
74
+ conditional: conditional
75
+ }
76
+
77
+ # get derived fields
78
+ derived = {}
79
+ data.css("DerivedField").each do |n|
80
+ name = n.attribute("name").value
81
+ field = n.css("NormDiscrete").attribute("field").value
82
+ value = n.css("NormDiscrete").attribute("value").value
83
+ features.delete(name)
84
+ features[field] = "derived"
85
+ derived[field] ||= {}
86
+ derived[field][name] = value
87
+ end
88
+
89
+ Evaluators::NaiveBayes.new(probabilities: probabilities, features: features, derived: derived, legacy: legacy)
90
+ end
47
91
  end
48
92
 
93
+ private
94
+
49
95
  # TODO better summary
50
- def summary(extended: false)
96
+ def _summary(extended: false)
51
97
  str = String.new("")
52
98
  probabilities[:prior].each do |k, v|
53
99
  str += "#{k}: #{v}\n"
54
100
  end
55
- str += "\n"
56
- str += "accuracy: %d%%\n" % [(100 * accuracy).round]
57
101
  str
58
102
  end
59
103
 
60
- def accuracy
61
- self.class.metrics(predict(@x), @y)[:accuracy]
62
- end
104
+ def _train(smoothing: 1, **options)
105
+ raise "Target must be strings" if @target_type != "categorical"
106
+ check_missing_value(@train_set)
107
+ check_missing_value(@validation_set) if @validation_set
63
108
 
64
- # pmml
109
+ data = @train_set
65
110
 
66
- def self.load_pmml(data)
67
- # TODO more validation
68
- node = data.css("NaiveBayesModel")
111
+ prep_text_features(data)
112
+
113
+ # convert boolean to strings
114
+ data.label = data.label.map(&:to_s)
115
+
116
+ indexes = {}
117
+ data.label.each_with_index do |yi, i|
118
+ (indexes[yi] ||= []) << i
119
+ end
120
+
121
+ grouped = {}
122
+ indexes.each do |k, v|
123
+ grouped[k] = data[v]
124
+ end
69
125
 
70
126
  prior = {}
71
- node.css("BayesOutput TargetValueCount").each do |n|
72
- prior[n.attribute("value").value] = n.attribute("count").value.to_f
127
+ grouped.sort_by { |k, _| k }.each do |k, v|
128
+ prior[k] = v.size
129
+ end
130
+ labels = prior.keys
131
+
132
+ target_counts = {}
133
+ labels.each do |k|
134
+ target_counts[k] = 0
73
135
  end
74
136
 
75
137
  conditional = {}
76
- node.css("BayesInput").each do |n|
138
+
139
+ @features.each do |k, type|
77
140
  prob = {}
78
- n.css("TargetValueStat").each do |n2|
79
- n3 = n2.css("GaussianDistribution")
80
- prob[n2.attribute("value").value] = {
81
- mean: n3.attribute("mean").value.to_f,
82
- stdev: Math.sqrt(n3.attribute("variance").value.to_f)
83
- }
84
- end
85
- n.css("PairCounts").each do |n2|
86
- boom = {}
87
- n2.css("TargetValueCount").each do |n3|
88
- boom[n3.attribute("value").value] = n3.attribute("count").value.to_f
141
+
142
+ case type
143
+ when "text"
144
+ raise "Text features not supported yet for naive Bayes"
145
+ when "categorical"
146
+ groups = Hash.new { |hash, key| hash[key] = [] }
147
+ data.columns[k].each_with_index do |v, i|
148
+ groups[v] << i
149
+ end
150
+
151
+ groups.each do |group, indexes|
152
+ df = data[indexes]
153
+ prob[group] = group_count(df.label, target_counts.dup)
154
+ end
155
+
156
+ # smooth
157
+ if smoothing
158
+ labels.each do |label|
159
+ sum = prob.map { |k2, v2| v2[label] }.sum.to_f
160
+ prob.each do |k2, v|
161
+ v[label] = (v[label] + smoothing) * sum / (sum + (prob.size * smoothing))
162
+ end
163
+ end
164
+ end
165
+ else
166
+ labels.each do |group|
167
+ xs = grouped[group]
168
+
169
+ # TODO handle this case
170
+ next unless xs
171
+
172
+ values = xs.columns[k]
173
+ prob[group] = {mean: mean(values), stdev: stdev(values)}
89
174
  end
90
- prob[n2.attribute("value").value] = boom
91
175
  end
92
- conditional[n.attribute("fieldName").value] = prob
93
- end
94
176
 
95
- @target = node.css("BayesOutput").attribute("fieldName").value
177
+ conditional[k] = prob
178
+ end
96
179
 
97
- probabilities = {
180
+ @probabilities = {
98
181
  prior: prior,
99
182
  conditional: conditional
100
183
  }
101
184
 
102
- new(probabilities: probabilities, target: @target)
185
+ Evaluators::NaiveBayes.new(probabilities: probabilities, features: @features)
103
186
  end
104
187
 
105
- def to_pmml
188
+ def generate_pmml
106
189
  data_fields = {}
107
190
  data_fields[@target] = probabilities[:prior].keys
108
191
  probabilities[:conditional].each do |k, v|
109
- if !v.values[0][:mean]
192
+ if @features[k] == "categorical"
110
193
  data_fields[k] = v.keys
111
194
  else
112
195
  data_fields[k] = nil
113
196
  end
114
197
  end
115
198
 
116
- builder = Nokogiri::XML::Builder.new do |xml|
117
- xml.PMML(version: "4.3", xmlns: "http://www.dmg.org/PMML-4_3", "xmlns:xsi" => "http://www.w3.org/2001/XMLSchema-instance") do
118
- xml.Header
119
- xml.DataDictionary do
120
- data_fields.each do |k, vs|
121
- if vs
122
- xml.DataField(name: k, optype: "categorical", dataType: "string") do
123
- vs.each do |v|
124
- xml.Value(value: v)
125
- end
126
- end
127
- else
128
- xml.DataField(name: k, optype: "continuous", dataType: "double")
129
- end
199
+ build_pmml(data_fields) do |xml|
200
+ xml.NaiveBayesModel(functionName: "classification", threshold: 0.001) do
201
+ xml.MiningSchema do
202
+ data_fields.each do |k, _|
203
+ xml.MiningField(name: k)
130
204
  end
131
205
  end
132
- xml.NaiveBayesModel(functionName: "classification", threshold: 0.001) do
133
- xml.MiningSchema do
134
- data_fields.each do |k, _|
135
- xml.MiningField(name: k)
136
- end
137
- end
138
- xml.BayesInputs do
139
- probabilities[:conditional].each do |k, v|
140
- xml.BayesInput(fieldName: k) do
141
- if !v.values[0][:mean]
142
- v.each do |k2, v2|
143
- xml.PairCounts(value: k2) do
144
- xml.TargetValueCounts do
145
- v2.each do |k3, v3|
146
- xml.TargetValueCount(value: k3, count: v3)
147
- end
206
+ xml.BayesInputs do
207
+ probabilities[:conditional].each do |k, v|
208
+ xml.BayesInput(fieldName: k) do
209
+ if @features[k] == "categorical"
210
+ v.sort_by { |k2, _| k2 }.each do |k2, v2|
211
+ xml.PairCounts(value: k2) do
212
+ xml.TargetValueCounts do
213
+ v2.sort_by { |k2, _| k2 }.each do |k3, v3|
214
+ xml.TargetValueCount(value: k3, count: v3)
148
215
  end
149
216
  end
150
217
  end
151
- else
152
- xml.TargetValueStats do
153
- v.each do |k2, v2|
154
- xml.TargetValueStat(value: k2) do
155
- xml.GaussianDistribution(mean: v2[:mean], variance: v2[:stdev]**2)
156
- end
218
+ end
219
+ else
220
+ xml.TargetValueStats do
221
+ v.sort_by { |k2, _| k2 }.each do |k2, v2|
222
+ xml.TargetValueStat(value: k2) do
223
+ xml.GaussianDistribution(mean: v2[:mean], variance: v2[:stdev]**2)
157
224
  end
158
225
  end
159
226
  end
160
227
  end
161
228
  end
162
229
  end
163
- xml.BayesOutput(fieldName: "target") do
164
- xml.TargetValueCounts do
165
- probabilities[:prior].each do |k, v|
166
- xml.TargetValueCount(value: k, count: v)
167
- end
168
- end
169
- end
170
230
  end
171
- end
172
- end.to_xml
173
- end
174
-
175
- # metrics
176
-
177
- def self.metrics(actual, estimated)
178
- {
179
- accuracy: actual.zip(estimated).count { |yi, yi2| yi == yi2 } / actual.size.to_f
180
- }
181
- end
182
-
183
- private
184
-
185
- def _predict(x)
186
- x.map do |xi|
187
- probs = calculate_class_probabilities(stringify_keys(xi))
188
- # deterministic for equal probabilities
189
- probs.sort_by { |k, v| [-v, k.to_s] }[0][0]
190
- end
191
- end
192
-
193
- def calculate_class_probabilities(x)
194
- prob = {}
195
- probabilities[:prior].each do |c, cv|
196
- prob[c] = cv.to_f / probabilities[:prior].values.sum
197
- probabilities[:conditional].each do |k, v|
198
- if !v[c][:mean]
199
- # TODO compute ahead of time
200
- p2 = v[c][x[k]].to_f / v[c].values.sum
201
-
202
- # assign very small probability if probability is 0
203
- # TODO use proper smoothing instead
204
- if p2 == 0
205
- p2 = 0.0001
231
+ xml.BayesOutput(fieldName: "target") do
232
+ xml.TargetValueCounts do
233
+ probabilities[:prior].sort_by { |k, _| k }.each do |k, v|
234
+ xml.TargetValueCount(value: k, count: v)
235
+ end
206
236
  end
207
-
208
- prob[c] *= p2
209
- else
210
- prob[c] *= calculate_probability(x[k], v[c][:mean], v[c][:stdev])
211
237
  end
212
238
  end
213
239
  end
214
- prob
215
240
  end
216
241
 
217
- def calculate_probability(x, mean, stdev)
218
- exponent = Math.exp(-((x - mean)**2) / (2 * (stdev**2)))
219
- (1 / (Math.sqrt(2 * Math::PI) * stdev)) * exponent
220
- end
221
-
222
- def group_count(arr)
223
- r = arr.inject(Hash.new(0)) { |h, e| h[e.to_s] += 1 ; h }
224
- r.default = nil
225
- r
242
# Tallies the elements of arr into the counter hash start.
# start is mutated in place and returned; every element of arr must
# already have a numeric entry in it (or start must supply a default).
def group_count(arr, start)
  arr.each { |label| start[label] += 1 }
  start
end
227
245
 
228
246
  def mean(arr)
@@ -230,17 +248,10 @@ module Eps
230
248
  end
231
249
 
232
250
  def stdev(arr)
251
+ return nil if arr.size <= 1
233
252
  m = mean(arr)
234
253
  sum = arr.inject(0) { |accum, i| accum + (i - m)**2 }
235
254
  Math.sqrt(sum / (arr.length - 1).to_f)
236
255
  end
237
-
238
- def stringify_keys(h)
239
- o = {}
240
- h.each do |k, v|
241
- o[k.to_s] = v
242
- end
243
- o
244
- end
245
256
  end
246
257
  end