eps 0.1.1 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +6 -0
- data/README.md +235 -84
- data/lib/eps.rb +9 -4
- data/lib/eps/base.rb +19 -0
- data/lib/eps/base_estimator.rb +84 -0
- data/lib/eps/linear_regression.rb +558 -0
- data/lib/eps/model.rb +108 -0
- data/lib/eps/naive_bayes.rb +240 -0
- data/lib/eps/version.rb +1 -1
- metadata +13 -18
- data/.gitignore +0 -9
- data/.travis.yml +0 -15
- data/Gemfile +0 -11
- data/Rakefile +0 -34
- data/eps.gemspec +0 -30
- data/guides/Modeling.md +0 -152
- data/lib/eps/base_regressor.rb +0 -232
- data/lib/eps/metrics.rb +0 -35
- data/lib/eps/regressor.rb +0 -314
data/lib/eps/model.rb
ADDED
@@ -0,0 +1,108 @@
|
|
1
|
+
module Eps
  # Facade over a concrete estimator (Eps::LinearRegression or
  # Eps::NaiveBayes). Any public method that Model does not define itself is
  # forwarded to the wrapped estimator via method_missing.
  class Model
    # data, y, target - legacy training inputs; when data is given and no
    #                   estimator is, the model is trained immediately.
    # estimator       - an already-built estimator to wrap as-is.
    # options         - remaining keywords; passed to the estimator class's
    #                   constructor when training.
    def initialize(data = nil, y = nil, target: nil, estimator: nil, **options)
      @options = options

      if estimator
        @estimator = estimator
      elsif data # legacy
        train(data, y, target: target)
      end
    end

    # pmml

    # Builds a Model from PMML. A String is parsed with Nokogiri in strict
    # mode; any other object is used as-is and must respond to #css.
    # Raises RuntimeError ("Unknown model") when the document contains neither
    # a RegressionModel nor a NaiveBayesModel element.
    def self.load_pmml(data)
      if data.is_a?(String)
        # required lazily so nokogiri is only needed when parsing a String
        require "nokogiri"
        data = Nokogiri::XML(data) { |config| config.strict }
      end

      estimator_class =
        if data.css("RegressionModel").any?
          Eps::LinearRegression
        elsif data.css("NaiveBayesModel").any?
          Eps::NaiveBayes
        else
          raise "Unknown model"
        end

      new(estimator: estimator_class.load_pmml(data))
    end

    # ruby - legacy

    # Wraps the estimator returned by Eps::LinearRegression.load.
    def self.load(data)
      new(estimator: Eps::LinearRegression.load(data))
    end

    # json - legacy

    # Wraps the estimator returned by Eps::LinearRegression.load_json.
    def self.load_json(data)
      new(estimator: Eps::LinearRegression.load_json(data))
    end

    # Returns the estimator's JSON when an estimator is present, otherwise
    # defers to the superclass implementation.
    #
    # Ruby's JSON generator calls to_json(state) on objects nested inside
    # arrays/hashes (and from JSON.generate), so the method must accept
    # arguments; they are forwarded to super but not to the estimator, whose
    # to_json is still called without arguments.
    def to_json(*args)
      @estimator ? @estimator.to_json : super
    end

    # pfa - legacy

    # Wraps the estimator returned by Eps::LinearRegression.load_pfa.
    def self.load_pfa(data)
      new(estimator: Eps::LinearRegression.load_pfa(data))
    end

    # metrics

    # Delegates to Eps::LinearRegression.metrics when the first element of
    # actual is Numeric, and to Eps::NaiveBayes.metrics otherwise.
    def self.metrics(actual, estimated)
      estimator_class =
        if numeric?(actual)
          Eps::LinearRegression
        else
          Eps::NaiveBayes
        end

      estimator_class.metrics(actual, estimated)
    end

    private

    # Picks the estimator class from the type of the first target value,
    # instantiates it with the options given to #initialize, and trains it.
    # When y is not supplied it is read from the target column/key of data.
    def train(data, y = nil, target: nil)
      y ||= daru?(data) ? data[target].to_a : data.map { |r| r[target] }

      estimator_class =
        if self.class.numeric?(y)
          Eps::LinearRegression
        else
          Eps::NaiveBayes
        end

      @estimator = estimator_class.new(**@options)
      @estimator.train(data, y, target: target)
    end

    # Reports the estimator's methods as our own so respond_to? agrees with
    # method_missing.
    def respond_to_missing?(name, include_private = false)
      if @estimator
        @estimator.respond_to?(name, include_private)
      else
        super
      end
    end

    # Forwards unknown calls to the estimator's public interface.
    def method_missing(method, *args, &block)
      if @estimator
        @estimator.public_send(method, *args, &block)
      else
        super
      end
    end

    # True when the first element of y is Numeric. Only the first element is
    # inspected. (`private` above does not apply to `def self.` methods, so
    # this remains publicly callable.)
    def self.numeric?(y)
      y.first.is_a?(Numeric)
    end

    # True when Daru is loaded and x is a Daru::DataFrame.
    def daru?(x)
      defined?(Daru) && x.is_a?(Daru::DataFrame)
    end
  end
end
|
@@ -0,0 +1,240 @@
|
|
1
|
+
module Eps
  # Naive Bayes classifier.
  #
  # The fitted model lives in #probabilities:
  #   prior:       { class => count }
  #   conditional: { feature => { outer_key => stats } }
  # where stats is either a { key => count } hash (categorical feature) or
  # { mean:, stdev: } (continuous feature). A feature is treated as
  # continuous when its stats hash has a :mean entry.
  class NaiveBayes < BaseEstimator
    attr_reader :probabilities

    # probabilities - a previously fitted probabilities hash (optional).
    # target        - name of the target field (optional).
    def initialize(probabilities: nil, target: nil)
      @probabilities = probabilities if probabilities
      @target = target if target
    end

    # Fits the model. Arguments are handled by the superclass, which is
    # expected to populate @x, @y and @target (NOTE(review): defined outside
    # this file - confirm). Class labels are stringified.
    def train(*args)
      super

      @y = @y.map { |yi| yi.to_s }

      prior = group_count(@y)
      conditional = {}

      if @x.any?
        keys = @x.first.keys
        # NOTE: dup is shallow, so the rows of @x themselves receive the
        # target key here (kept as in the original implementation).
        x = @x.dup
        x.each_with_index do |xi, i|
          xi[@target] = @y[i]
        end

        # the per-class grouping is the same for every feature: compute once
        groups = x.group_by { |xi| xi[@target] }

        keys.each do |k|
          conditional[k] = {}
          groups.each do |group, xs|
            v = xs.map { |xi| xi[k] }

            if categorical?(v[0])
              # TODO apply smoothing
              # apply smoothing only to
              # 1. categorical features
              # 2. conditional probabilities
              # TODO more efficient count
              conditional[k][group] = group_count(v)
            else
              conditional[k][group] = {mean: mean(v), stdev: stdev(v)}
            end
          end
        end
      end

      @probabilities = {
        prior: prior,
        conditional: conditional
      }
    end

    # TODO better summary
    #
    # Returns a text summary: one "class: count" line per prior entry followed
    # by the accuracy on the training data. The string is memoized; the
    # extended flag is currently unused.
    def summary(extended: false)
      @summary_str ||= begin
        str = String.new("")
        probabilities[:prior].each do |k, v|
          str << "#{k}: #{v}\n"
        end
        str << "\n"
        str << format("accuracy: %d%%\n", (100 * accuracy).round)
        str
      end
    end

    # Accuracy of the model on its own training data (@x / @y).
    def accuracy
      self.class.metrics(predict(@x), @y)[:accuracy]
    end

    # pmml

    # Builds an estimator from a parsed PMML document (an object responding
    # to #css, e.g. a Nokogiri document) containing a NaiveBayesModel.
    def self.load_pmml(data)
      # TODO more validation
      node = data.css("NaiveBayesModel")

      prior = {}
      node.css("BayesOutput TargetValueCount").each do |n|
        prior[n.attribute("value").value] = n.attribute("count").value.to_f
      end

      conditional = {}
      node.css("BayesInput").each do |n|
        prob = {}
        # continuous inputs: PMML stores variance, the model stores stdev
        n.css("TargetValueStat").each do |n2|
          n3 = n2.css("GaussianDistribution")
          prob[n2.attribute("value").value] = {
            mean: n3.attribute("mean").value.to_f,
            stdev: Math.sqrt(n3.attribute("variance").value.to_f)
          }
        end
        # categorical inputs
        n.css("PairCounts").each do |n2|
          counts = {}
          n2.css("TargetValueCount").each do |n3|
            counts[n3.attribute("value").value] = n3.attribute("count").value.to_f
          end
          prob[n2.attribute("value").value] = counts
        end
        conditional[n.attribute("fieldName").value] = prob
      end

      # a local, not an instance variable: inside a class method @target
      # would be stored on the class object and outlive this call
      target = node.css("BayesOutput").attribute("fieldName").value

      probabilities = {
        prior: prior,
        conditional: conditional
      }

      new(probabilities: probabilities, target: target)
    end

    # Serializes the fitted model to a PMML 4.3 XML string.
    def to_pmml
      # field name => list of values (categorical) or nil (continuous)
      data_fields = {}
      data_fields[@target] = probabilities[:prior].keys
      probabilities[:conditional].each do |k, v|
        data_fields[k] = v.values[0][:mean] ? nil : v.keys
      end

      builder = Nokogiri::XML::Builder.new do |xml|
        xml.PMML(version: "4.3", xmlns: "http://www.dmg.org/PMML-4_3", "xmlns:xsi" => "http://www.w3.org/2001/XMLSchema-instance") do
          xml.Header
          xml.DataDictionary do
            data_fields.each do |k, vs|
              if vs
                xml.DataField(name: k, optype: "categorical", dataType: "string") do
                  vs.each do |v|
                    xml.Value(value: v)
                  end
                end
              else
                xml.DataField(name: k, optype: "continuous", dataType: "double")
              end
            end
          end
          xml.NaiveBayesModel(functionName: "classification", threshold: 0.001) do
            xml.MiningSchema do
              data_fields.each do |k, _|
                xml.MiningField(name: k)
              end
            end
            xml.BayesInputs do
              probabilities[:conditional].each do |k, v|
                xml.BayesInput(fieldName: k) do
                  if v.values[0][:mean]
                    xml.TargetValueStats do
                      v.each do |k2, v2|
                        xml.TargetValueStat(value: k2) do
                          xml.GaussianDistribution(mean: v2[:mean], variance: v2[:stdev]**2)
                        end
                      end
                    end
                  else
                    v.each do |k2, v2|
                      xml.PairCounts(value: k2) do
                        xml.TargetValueCounts do
                          v2.each do |k3, v3|
                            xml.TargetValueCount(value: k3, count: v3)
                          end
                        end
                      end
                    end
                  end
                end
              end
            end
            # use the same field name as the DataDictionary / MiningSchema
            # entries above so the document is self-consistent and load_pmml
            # restores the original target name; "target" is the fallback
            # when no target name is set
            xml.BayesOutput(fieldName: @target || "target") do
              xml.TargetValueCounts do
                probabilities[:prior].each do |k, v|
                  xml.TargetValueCount(value: k, count: v)
                end
              end
            end
          end
        end
      end
      builder.to_xml
    end

    # metrics

    # Returns { accuracy: fraction of positions where actual and estimated
    # are equal }.
    def self.metrics(actual, estimated)
      {
        accuracy: actual.zip(estimated).count { |yi, yi2| yi == yi2 } / actual.size.to_f
      }
    end

    private

    # Predicts the most probable class for each row of x.
    def _predict(x)
      x.map do |xi|
        probs = calculate_class_probabilities(xi)
        # deterministic for equal probabilities: highest probability first,
        # ties broken by class name
        probs.min_by { |k, v| [-v, k.to_s] }.first
      end
    end

    # Returns { class => unnormalized probability } for a single row x.
    def calculate_class_probabilities(x)
      prob = {}
      total = probabilities[:prior].values.sum
      probabilities[:prior].each do |c, cv|
        prob[c] = cv.to_f / total
        probabilities[:conditional].each do |k, v|
          if v[c][:mean]
            prob[c] *= calculate_probability(x[k], v[c][:mean], v[c][:stdev])
          else
            # TODO compute ahead of time
            p2 = v[c][x[k]].to_f / v[c].values.sum

            # assign very small probability if probability is 0
            # TODO use proper smoothing instead
            if p2 == 0
              p2 = 0.0001
            end

            prob[c] *= p2
          end
        end
      end
      prob
    end

    # Gaussian probability density of x for the given mean and stdev.
    def calculate_probability(x, mean, stdev)
      exponent = Math.exp(-((x - mean)**2) / (2 * (stdev**2)))
      (1 / (Math.sqrt(2 * Math::PI) * stdev)) * exponent
    end

    # Counts occurrences of each element, keyed by its string form. The
    # returned hash has no default value (missing keys yield nil).
    def group_count(arr)
      r = arr.each_with_object(Hash.new(0)) { |e, h| h[e.to_s] += 1 }
      r.default = nil
      r
    end

    # Arithmetic mean as a Float.
    def mean(arr)
      arr.sum / arr.size.to_f
    end

    # Sample standard deviation (n - 1 denominator). A single-element array
    # divides zero by zero and therefore yields NaN.
    def stdev(arr)
      m = mean(arr)
      sum = arr.inject(0) { |accum, i| accum + (i - m)**2 }
      Math.sqrt(sum / (arr.length - 1).to_f)
    end
  end
end
|
data/lib/eps/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: eps
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.2.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Andrew Kane
|
8
8
|
autorequire:
|
9
|
-
bindir:
|
9
|
+
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2019-05-19 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -81,25 +81,20 @@ dependencies:
|
|
81
81
|
- !ruby/object:Gem::Version
|
82
82
|
version: '0'
|
83
83
|
description:
|
84
|
-
email:
|
85
|
-
- andrew@chartkick.com
|
84
|
+
email: andrew@chartkick.com
|
86
85
|
executables: []
|
87
86
|
extensions: []
|
88
87
|
extra_rdoc_files: []
|
89
88
|
files:
|
90
|
-
- ".gitignore"
|
91
|
-
- ".travis.yml"
|
92
89
|
- CHANGELOG.md
|
93
|
-
- Gemfile
|
94
90
|
- LICENSE.txt
|
95
91
|
- README.md
|
96
|
-
- Rakefile
|
97
|
-
- eps.gemspec
|
98
|
-
- guides/Modeling.md
|
99
92
|
- lib/eps.rb
|
100
|
-
- lib/eps/
|
101
|
-
- lib/eps/
|
102
|
-
- lib/eps/
|
93
|
+
- lib/eps/base.rb
|
94
|
+
- lib/eps/base_estimator.rb
|
95
|
+
- lib/eps/linear_regression.rb
|
96
|
+
- lib/eps/model.rb
|
97
|
+
- lib/eps/naive_bayes.rb
|
103
98
|
- lib/eps/version.rb
|
104
99
|
homepage: https://github.com/ankane/eps
|
105
100
|
licenses:
|
@@ -113,16 +108,16 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
113
108
|
requirements:
|
114
109
|
- - ">="
|
115
110
|
- !ruby/object:Gem::Version
|
116
|
-
version: '
|
111
|
+
version: '2.4'
|
117
112
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
118
113
|
requirements:
|
119
114
|
- - ">="
|
120
115
|
- !ruby/object:Gem::Version
|
121
116
|
version: '0'
|
122
117
|
requirements: []
|
123
|
-
|
124
|
-
rubygems_version: 2.7.7
|
118
|
+
rubygems_version: 3.0.3
|
125
119
|
signing_key:
|
126
120
|
specification_version: 4
|
127
|
-
summary:
|
121
|
+
summary: Machine learning for Ruby. Supports regression (linear regression) and classification
|
122
|
+
(naive Bayes)
|
128
123
|
test_files: []
|
data/.gitignore
DELETED
data/.travis.yml
DELETED
@@ -1,15 +0,0 @@
|
|
1
|
-
language: ruby
|
2
|
-
rvm: 2.5.1
|
3
|
-
sudo: required
|
4
|
-
before_install:
|
5
|
-
- gem install bundler
|
6
|
-
- sudo apt-get update
|
7
|
-
- sudo apt-get install -y libgsl0-dev
|
8
|
-
script: bundle exec rake test
|
9
|
-
env:
|
10
|
-
-
|
11
|
-
- GSL=t
|
12
|
-
notifications:
|
13
|
-
email:
|
14
|
-
on_success: never
|
15
|
-
on_failure: change
|