eps 0.1.1 → 0.2.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +6 -0
- data/README.md +235 -84
- data/lib/eps.rb +9 -4
- data/lib/eps/base.rb +19 -0
- data/lib/eps/base_estimator.rb +84 -0
- data/lib/eps/linear_regression.rb +558 -0
- data/lib/eps/model.rb +108 -0
- data/lib/eps/naive_bayes.rb +240 -0
- data/lib/eps/version.rb +1 -1
- metadata +13 -18
- data/.gitignore +0 -9
- data/.travis.yml +0 -15
- data/Gemfile +0 -11
- data/Rakefile +0 -34
- data/eps.gemspec +0 -30
- data/guides/Modeling.md +0 -152
- data/lib/eps/base_regressor.rb +0 -232
- data/lib/eps/metrics.rb +0 -35
- data/lib/eps/regressor.rb +0 -314
data/lib/eps/model.rb
ADDED
@@ -0,0 +1,108 @@
|
|
1
|
+
module Eps
  # Facade over the concrete estimators. Picks Eps::LinearRegression for
  # numeric targets and Eps::NaiveBayes otherwise, then forwards every
  # other call to the chosen estimator through method_missing.
  class Model
    # Either wraps a ready-made estimator, or (legacy path) trains one
    # immediately from +data+/+y+/+target+. Extra options are handed to the
    # estimator constructor when training.
    def initialize(data = nil, y = nil, target: nil, estimator: nil, **options)
      @options = options

      if estimator
        @estimator = estimator
      elsif data # legacy
        train(data, y, target: target)
      end
    end

    # pmml

    # Loads a model from a PMML string or a pre-parsed Nokogiri document.
    # Raises "Unknown model" for documents that describe neither a
    # RegressionModel nor a NaiveBayesModel.
    def self.load_pmml(data)
      doc = data
      if doc.is_a?(String)
        require "nokogiri"
        doc = Nokogiri::XML(doc) { |config| config.strict }
      end

      estimator_class =
        if doc.css("RegressionModel").any?
          Eps::LinearRegression
        elsif doc.css("NaiveBayesModel").any?
          Eps::NaiveBayes
        else
          raise "Unknown model"
        end

      new(estimator: estimator_class.load_pmml(doc))
    end

    # ruby - legacy

    def self.load(data)
      new(estimator: Eps::LinearRegression.load(data))
    end

    # json - legacy

    def self.load_json(data)
      new(estimator: Eps::LinearRegression.load_json(data))
    end

    def to_json
      return super unless @estimator
      @estimator.to_json
    end

    # pfa - legacy

    def self.load_pfa(data)
      new(estimator: Eps::LinearRegression.load_pfa(data))
    end

    # metrics

    # Dispatches to the estimator class matching the target type:
    # regression metrics for numeric labels, classification otherwise.
    def self.metrics(actual, estimated)
      estimator_class = numeric?(actual) ? Eps::LinearRegression : Eps::NaiveBayes
      estimator_class.metrics(actual, estimated)
    end

    private

    # Extracts the label column (Daru::DataFrame or array-of-hashes),
    # selects the estimator class by label type, then trains it.
    def train(data, y = nil, target: nil)
      y ||= daru?(data) ? data[target].to_a : data.map { |r| r[target] }

      estimator_class = self.class.numeric?(y) ? Eps::LinearRegression : Eps::NaiveBayes

      @estimator = estimator_class.new(**@options)
      @estimator.train(data, y, target: target)
    end

    def respond_to_missing?(name, include_private = false)
      return super unless @estimator
      @estimator.respond_to?(name, include_private)
    end

    # Forwards everything unknown to the wrapped estimator (public API only).
    def method_missing(method, *args, &block)
      return super unless @estimator
      @estimator.public_send(method, *args, &block)
    end

    # NOTE: declared after `private`, but `private` does not apply to
    # singleton methods, so this stays a public class method.
    def self.numeric?(y)
      y.first.is_a?(Numeric)
    end

    def daru?(x)
      defined?(Daru) && x.is_a?(Daru::DataFrame)
    end
  end
end
|
@@ -0,0 +1,240 @@
|
|
1
|
+
module Eps
  # Naive Bayes classifier. Categorical features are modeled with per-class
  # frequency counts; continuous features with a per-class Gaussian
  # (mean/stdev). Supports PMML 4.3 serialization via Nokogiri.
  class NaiveBayes < BaseEstimator
    # {prior: {label => count}, conditional: {feature => {label => stats}}}
    attr_reader :probabilities

    def initialize(probabilities: nil, target: nil)
      @probabilities = probabilities if probabilities
      @target = target if target
    end

    # Fits the prior and conditional probability tables from the training
    # data prepared by BaseEstimator#train (which sets @x, @y, @target).
    def train(*args)
      super

      # labels are always compared as strings
      @y = @y.map { |yi| yi.to_s }

      prior = group_count(@y)
      conditional = {}

      if @x.any?
        keys = @x.first.keys
        # Attach the label to a copy of each row. The previous shallow
        # `@x.dup` mutated the row hashes in place, leaking the target key
        # into the caller-owned training rows kept in @x.
        x = @x.map.with_index { |xi, i| xi.merge(@target => @y[i]) }
        keys.each do |k|
          conditional[k] = {}
          x.group_by { |xi| xi[@target] }.each do |group, xs|
            v = xs.map { |xi| xi[k] }

            if categorical?(v[0])
              # TODO apply smoothing
              # apply smoothing only to
              # 1. categorical features
              # 2. conditional probabilities
              # TODO more efficient count
              conditional[k][group] = group_count(v)
            else
              # NOTE(review): stdev is NaN for single-element groups
              # (sample stdev divides by n - 1) — confirm callers guard this
              conditional[k][group] = {mean: mean(v), stdev: stdev(v)}
            end
          end
        end
      end

      @probabilities = {
        prior: prior,
        conditional: conditional
      }
    end

    # TODO better summary
    # Human-readable summary: prior counts plus training-set accuracy.
    def summary(extended: false)
      @summary_str ||= begin
        str = String.new("")
        probabilities[:prior].each do |k, v|
          str += "#{k}: #{v}\n"
        end
        str += "\n"
        str += "accuracy: %d%%\n" % [(100 * accuracy).round]
        str
      end
    end

    # Accuracy of the model on its own training data.
    def accuracy
      self.class.metrics(predict(@x), @y)[:accuracy]
    end

    # pmml

    # Rebuilds a classifier from a parsed PMML document.
    def self.load_pmml(data)
      # TODO more validation
      node = data.css("NaiveBayesModel")

      prior = {}
      node.css("BayesOutput TargetValueCount").each do |n|
        prior[n.attribute("value").value] = n.attribute("count").value.to_f
      end

      conditional = {}
      node.css("BayesInput").each do |n|
        prob = {}
        n.css("TargetValueStat").each do |n2|
          n3 = n2.css("GaussianDistribution")
          prob[n2.attribute("value").value] = {
            mean: n3.attribute("mean").value.to_f,
            stdev: Math.sqrt(n3.attribute("variance").value.to_f)
          }
        end
        n.css("PairCounts").each do |n2|
          counts = {}
          n2.css("TargetValueCount").each do |n3|
            counts[n3.attribute("value").value] = n3.attribute("count").value.to_f
          end
          prob[n2.attribute("value").value] = counts
        end
        conditional[n.attribute("fieldName").value] = prob
      end

      # use a local rather than @target: assigning an ivar here would set
      # state on the class object itself, leaking between loads
      target = node.css("BayesOutput").attribute("fieldName").value

      probabilities = {
        prior: prior,
        conditional: conditional
      }

      new(probabilities: probabilities, target: target)
    end

    # Serializes the model as a PMML 4.3 NaiveBayesModel document.
    def to_pmml
      # nil marks a continuous feature; a list of values marks a categorical one
      data_fields = {}
      data_fields[@target] = probabilities[:prior].keys
      probabilities[:conditional].each do |k, v|
        if !v.values[0][:mean]
          data_fields[k] = v.keys
        else
          data_fields[k] = nil
        end
      end

      builder = Nokogiri::XML::Builder.new do |xml|
        xml.PMML(version: "4.3", xmlns: "http://www.dmg.org/PMML-4_3", "xmlns:xsi" => "http://www.w3.org/2001/XMLSchema-instance") do
          xml.Header
          xml.DataDictionary do
            data_fields.each do |k, vs|
              if vs
                xml.DataField(name: k, optype: "categorical", dataType: "string") do
                  vs.each do |v|
                    xml.Value(value: v)
                  end
                end
              else
                xml.DataField(name: k, optype: "continuous", dataType: "double")
              end
            end
          end
          xml.NaiveBayesModel(functionName: "classification", threshold: 0.001) do
            xml.MiningSchema do
              data_fields.each do |k, _|
                xml.MiningField(name: k)
              end
            end
            xml.BayesInputs do
              probabilities[:conditional].each do |k, v|
                xml.BayesInput(fieldName: k) do
                  if !v.values[0][:mean]
                    v.each do |k2, v2|
                      xml.PairCounts(value: k2) do
                        xml.TargetValueCounts do
                          v2.each do |k3, v3|
                            xml.TargetValueCount(value: k3, count: v3)
                          end
                        end
                      end
                    end
                  else
                    xml.TargetValueStats do
                      v.each do |k2, v2|
                        xml.TargetValueStat(value: k2) do
                          xml.GaussianDistribution(mean: v2[:mean], variance: v2[:stdev]**2)
                        end
                      end
                    end
                  end
                end
              end
            end
            # was hardcoded "target", which broke load_pmml round-trips for
            # any target column not literally named "target"
            xml.BayesOutput(fieldName: @target) do
              xml.TargetValueCounts do
                probabilities[:prior].each do |k, v|
                  xml.TargetValueCount(value: k, count: v)
                end
              end
            end
          end
        end
      end.to_xml
    end

    # metrics

    # Fraction of positions where actual and estimated labels agree.
    def self.metrics(actual, estimated)
      {
        accuracy: actual.zip(estimated).count { |yi, yi2| yi == yi2 } / actual.size.to_f
      }
    end

    private

    # Predicts the most probable class for each row; ties broken
    # deterministically by label string order.
    def _predict(x)
      x.map do |xi|
        probs = calculate_class_probabilities(xi)
        # deterministic for equal probabilities
        probs.sort_by { |k, v| [-v, k.to_s] }[0][0]
      end
    end

    # Unnormalized posterior P(class) * prod(P(feature|class)) per class.
    def calculate_class_probabilities(x)
      prob = {}
      # hoisted: the prior total is loop-invariant
      prior_total = probabilities[:prior].values.sum
      probabilities[:prior].each do |c, cv|
        prob[c] = cv.to_f / prior_total
        probabilities[:conditional].each do |k, v|
          if !v[c][:mean]
            # TODO compute ahead of time
            p2 = v[c][x[k]].to_f / v[c].values.sum

            # assign very small probability if probability is 0
            # TODO use proper smoothing instead
            if p2 == 0
              p2 = 0.0001
            end

            prob[c] *= p2
          else
            prob[c] *= calculate_probability(x[k], v[c][:mean], v[c][:stdev])
          end
        end
      end
      prob
    end

    # Gaussian probability density of x under N(mean, stdev^2).
    def calculate_probability(x, mean, stdev)
      exponent = Math.exp(-((x - mean)**2) / (2 * (stdev**2)))
      (1 / (Math.sqrt(2 * Math::PI) * stdev)) * exponent
    end

    # Frequency count keyed by the stringified element.
    def group_count(arr)
      r = arr.inject(Hash.new(0)) { |h, e| h[e.to_s] += 1 ; h }
      r.default = nil
      r
    end

    def mean(arr)
      arr.sum / arr.size.to_f
    end

    # Sample standard deviation (divides by n - 1).
    def stdev(arr)
      m = mean(arr)
      sum = arr.inject(0) { |accum, i| accum + (i - m)**2 }
      Math.sqrt(sum / (arr.length - 1).to_f)
    end
  end
end
|
data/lib/eps/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: eps
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.2.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Andrew Kane
|
8
8
|
autorequire:
|
9
|
-
bindir:
|
9
|
+
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2019-05-19 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -81,25 +81,20 @@ dependencies:
|
|
81
81
|
- !ruby/object:Gem::Version
|
82
82
|
version: '0'
|
83
83
|
description:
|
84
|
-
email:
|
85
|
-
- andrew@chartkick.com
|
84
|
+
email: andrew@chartkick.com
|
86
85
|
executables: []
|
87
86
|
extensions: []
|
88
87
|
extra_rdoc_files: []
|
89
88
|
files:
|
90
|
-
- ".gitignore"
|
91
|
-
- ".travis.yml"
|
92
89
|
- CHANGELOG.md
|
93
|
-
- Gemfile
|
94
90
|
- LICENSE.txt
|
95
91
|
- README.md
|
96
|
-
- Rakefile
|
97
|
-
- eps.gemspec
|
98
|
-
- guides/Modeling.md
|
99
92
|
- lib/eps.rb
|
100
|
-
- lib/eps/
|
101
|
-
- lib/eps/
|
102
|
-
- lib/eps/
|
93
|
+
- lib/eps/base.rb
|
94
|
+
- lib/eps/base_estimator.rb
|
95
|
+
- lib/eps/linear_regression.rb
|
96
|
+
- lib/eps/model.rb
|
97
|
+
- lib/eps/naive_bayes.rb
|
103
98
|
- lib/eps/version.rb
|
104
99
|
homepage: https://github.com/ankane/eps
|
105
100
|
licenses:
|
@@ -113,16 +108,16 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
113
108
|
requirements:
|
114
109
|
- - ">="
|
115
110
|
- !ruby/object:Gem::Version
|
116
|
-
version: '
|
111
|
+
version: '2.4'
|
117
112
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
118
113
|
requirements:
|
119
114
|
- - ">="
|
120
115
|
- !ruby/object:Gem::Version
|
121
116
|
version: '0'
|
122
117
|
requirements: []
|
123
|
-
|
124
|
-
rubygems_version: 2.7.7
|
118
|
+
rubygems_version: 3.0.3
|
125
119
|
signing_key:
|
126
120
|
specification_version: 4
|
127
|
-
summary:
|
121
|
+
summary: Machine learning for Ruby. Supports regression (linear regression) and classification
|
122
|
+
(naive Bayes)
|
128
123
|
test_files: []
|
data/.gitignore
DELETED
data/.travis.yml
DELETED
@@ -1,15 +0,0 @@
|
|
1
|
-
language: ruby
|
2
|
-
rvm: 2.5.1
|
3
|
-
sudo: required
|
4
|
-
before_install:
|
5
|
-
- gem install bundler
|
6
|
-
- sudo apt-get update
|
7
|
-
- sudo apt-get install -y libgsl0-dev
|
8
|
-
script: bundle exec rake test
|
9
|
-
env:
|
10
|
-
-
|
11
|
-
- GSL=t
|
12
|
-
notifications:
|
13
|
-
email:
|
14
|
-
on_success: never
|
15
|
-
on_failure: change
|