eps 0.1.1 → 0.2.0

Sign up to get free protection for your applications and to get access to all the features.
data/lib/eps/model.rb ADDED
@@ -0,0 +1,108 @@
1
module Eps
  # High-level entry point: wraps a concrete estimator (LinearRegression or
  # NaiveBayes), chosen from the data or a serialized model, and forwards
  # everything else to it via method_missing.
  class Model
    def initialize(data = nil, y = nil, target: nil, estimator: nil, **options)
      @options = options

      case
      when estimator
        @estimator = estimator
      when data
        # legacy: train straight from the constructor
        train(data, y, target: target)
      end
    end

    # pmml

    # Build a model from a PMML string or a pre-parsed Nokogiri document.
    def self.load_pmml(data)
      doc = data
      if doc.is_a?(String)
        require "nokogiri"
        doc = Nokogiri::XML(doc) { |config| config.strict }
      end

      # map PMML model element -> estimator implementation
      estimator_class =
        {
          "RegressionModel" => Eps::LinearRegression,
          "NaiveBayesModel" => Eps::NaiveBayes
        }.detect { |element, _| doc.css(element).any? }&.last

      raise "Unknown model" unless estimator_class

      new(estimator: estimator_class.load_pmml(doc))
    end

    # ruby - legacy

    def self.load(data)
      new(estimator: Eps::LinearRegression.load(data))
    end

    # json - legacy

    def self.load_json(data)
      new(estimator: Eps::LinearRegression.load_json(data))
    end

    def to_json
      return super unless @estimator
      @estimator.to_json
    end

    # pfa - legacy

    def self.load_pfa(data)
      new(estimator: Eps::LinearRegression.load_pfa(data))
    end

    # metrics

    # Dispatch to the right estimator's metrics based on the label type.
    def self.metrics(actual, estimated)
      estimator_class = numeric?(actual) ? Eps::LinearRegression : Eps::NaiveBayes
      estimator_class.metrics(actual, estimated)
    end

    private

    # Pick regression for numeric targets, naive Bayes otherwise, then train.
    def train(data, y = nil, target: nil)
      y ||= daru?(data) ? data[target].to_a : data.map { |row| row[target] }

      estimator_class = self.class.numeric?(y) ? Eps::LinearRegression : Eps::NaiveBayes

      @estimator = estimator_class.new(**@options)
      @estimator.train(data, y, target: target)
    end

    def respond_to_missing?(name, include_private = false)
      return super unless @estimator
      @estimator.respond_to?(name, include_private)
    end

    def method_missing(method, *args, &block)
      return super unless @estimator
      @estimator.public_send(method, *args, &block)
    end

    # Only the first element is inspected; assumes homogeneous labels.
    def self.numeric?(y)
      y.first.is_a?(Numeric)
    end

    def daru?(obj)
      defined?(Daru) && obj.is_a?(Daru::DataFrame)
    end
  end
end
@@ -0,0 +1,240 @@
1
module Eps
  # Naive Bayes classifier.
  #
  # Numeric features are modeled per class with a Gaussian (mean/stdev);
  # categorical features with raw value counts. All learned parameters live
  # in +probabilities+ ({prior: {...}, conditional: {...}}), which is exactly
  # what gets serialized to and from PMML.
  class NaiveBayes < BaseEstimator
    attr_reader :probabilities

    # probabilities - previously learned parameters (used by load_pmml)
    # target - name of the target field
    def initialize(probabilities: nil, target: nil)
      @probabilities = probabilities if probabilities
      @target = target if target
    end

    # Learn priors and per-feature conditional distributions from @x/@y
    # (populated by BaseEstimator#train via super).
    def train(*args)
      super

      # classification labels are always treated as strings
      @y = @y.map { |yi| yi.to_s }

      prior = group_count(@y)
      conditional = {}

      if @x.any?
        keys = @x.first.keys
        # NOTE(review): @x.dup is a shallow copy, so tagging rows below also
        # mutates the hashes held by @x — harmless for predict/accuracy here,
        # but confirm if BaseEstimator reuses @x elsewhere
        x = @x.dup
        x.each_with_index do |xi, i|
          xi[@target] = @y[i]
        end
        keys.each do |k|
          conditional[k] = {}
          x.group_by { |xi| xi[@target] }.each do |group, xs|
            v = xs.map { |xi| xi[k] }

            if categorical?(v[0])
              # TODO apply smoothing
              # apply smoothing only to
              # 1. categorical features
              # 2. conditional probabilities
              # TODO more efficient count
              conditional[k][group] = group_count(v)
            else
              conditional[k][group] = {mean: mean(v), stdev: stdev(v)}
            end
          end
        end
      end

      @probabilities = {
        prior: prior,
        conditional: conditional
      }
    end

    # TODO better summary
    # extended: accepted for interface parity with other estimators; unused.
    def summary(extended: false)
      @summary_str ||= begin
        str = String.new("")
        probabilities[:prior].each do |k, v|
          str += "#{k}: #{v}\n"
        end
        str += "\n"
        str += "accuracy: %d%%\n" % [(100 * accuracy).round]
        str
      end
    end

    # Fraction of training rows predicted correctly.
    def accuracy
      self.class.metrics(predict(@x), @y)[:accuracy]
    end

    # pmml

    # Rebuild a model from a parsed PMML document (Nokogiri).
    def self.load_pmml(data)
      # TODO more validation
      node = data.css("NaiveBayesModel")

      # class priors come from BayesOutput counts
      prior = {}
      node.css("BayesOutput TargetValueCount").each do |n|
        prior[n.attribute("value").value] = n.attribute("count").value.to_f
      end

      conditional = {}
      node.css("BayesInput").each do |n|
        prob = {}
        # Gaussian features: PMML stores variance, we store stdev
        n.css("TargetValueStat").each do |n2|
          n3 = n2.css("GaussianDistribution")
          prob[n2.attribute("value").value] = {
            mean: n3.attribute("mean").value.to_f,
            stdev: Math.sqrt(n3.attribute("variance").value.to_f)
          }
        end
        # categorical features: per-value, per-class counts
        n.css("PairCounts").each do |n2|
          counts = {}
          n2.css("TargetValueCount").each do |n3|
            counts[n3.attribute("value").value] = n3.attribute("count").value.to_f
          end
          prob[n2.attribute("value").value] = counts
        end
        conditional[n.attribute("fieldName").value] = prob
      end

      # use a local, not @target: this is a class method, so the previous
      # @target assignment created shared class-level state for no benefit
      target = node.css("BayesOutput").attribute("fieldName").value

      probabilities = {
        prior: prior,
        conditional: conditional
      }

      new(probabilities: probabilities, target: target)
    end

    # Serialize the model as PMML 4.3.
    def to_pmml
      # count-based (categorical) fields list their values in the
      # DataDictionary; Gaussian fields are declared continuous
      data_fields = {}
      data_fields[@target] = probabilities[:prior].keys
      probabilities[:conditional].each do |k, v|
        if !v.values[0][:mean]
          data_fields[k] = v.keys
        else
          data_fields[k] = nil
        end
      end

      Nokogiri::XML::Builder.new do |xml|
        xml.PMML(version: "4.3", xmlns: "http://www.dmg.org/PMML-4_3", "xmlns:xsi" => "http://www.w3.org/2001/XMLSchema-instance") do
          xml.Header
          xml.DataDictionary do
            data_fields.each do |k, vs|
              if vs
                xml.DataField(name: k, optype: "categorical", dataType: "string") do
                  vs.each do |v|
                    xml.Value(value: v)
                  end
                end
              else
                xml.DataField(name: k, optype: "continuous", dataType: "double")
              end
            end
          end
          xml.NaiveBayesModel(functionName: "classification", threshold: 0.001) do
            xml.MiningSchema do
              data_fields.each do |k, _|
                xml.MiningField(name: k)
              end
            end
            xml.BayesInputs do
              probabilities[:conditional].each do |k, v|
                xml.BayesInput(fieldName: k) do
                  if !v.values[0][:mean]
                    v.each do |k2, v2|
                      xml.PairCounts(value: k2) do
                        xml.TargetValueCounts do
                          v2.each do |k3, v3|
                            xml.TargetValueCount(value: k3, count: v3)
                          end
                        end
                      end
                    end
                  else
                    xml.TargetValueStats do
                      v.each do |k2, v2|
                        xml.TargetValueStat(value: k2) do
                          xml.GaussianDistribution(mean: v2[:mean], variance: v2[:stdev]**2)
                        end
                      end
                    end
                  end
                end
              end
            end
            # was hardcoded to "target", which broke PMML round-trips:
            # load_pmml reads the target name back from this attribute
            xml.BayesOutput(fieldName: @target) do
              xml.TargetValueCounts do
                probabilities[:prior].each do |k, v|
                  xml.TargetValueCount(value: k, count: v)
                end
              end
            end
          end
        end
      end.to_xml
    end

    # metrics

    # accuracy = matching pairs / total.
    # NOTE(review): NaN for empty input — confirm callers never pass [].
    def self.metrics(actual, estimated)
      {
        accuracy: actual.zip(estimated).count { |yi, yi2| yi == yi2 } / actual.size.to_f
      }
    end

    private

    # Pick the most probable class per row.
    def _predict(x)
      x.map do |xi|
        probs = calculate_class_probabilities(xi)
        # deterministic for equal probabilities
        probs.sort_by { |k, v| [-v, k.to_s] }[0][0]
      end
    end

    # Unnormalized P(class) * product of P(feature|class) for each class.
    def calculate_class_probabilities(x)
      prob = {}
      # loop-invariant: total prior count, hoisted out of the class loop
      prior_total = probabilities[:prior].values.sum
      probabilities[:prior].each do |c, cv|
        prob[c] = cv.to_f / prior_total
        probabilities[:conditional].each do |k, v|
          if !v[c][:mean]
            # categorical: relative frequency of this value within class c
            # TODO compute ahead of time
            p2 = v[c][x[k]].to_f / v[c].values.sum

            # assign very small probability if probability is 0
            # TODO use proper smoothing instead
            if p2 == 0
              p2 = 0.0001
            end

            prob[c] *= p2
          else
            prob[c] *= calculate_probability(x[k], v[c][:mean], v[c][:stdev])
          end
        end
      end
      prob
    end

    # Gaussian density at x.
    # NOTE(review): yields NaN/Infinity when stdev is 0 (constant feature
    # within a class) — confirm training data avoids this.
    def calculate_probability(x, mean, stdev)
      exponent = Math.exp(-((x - mean)**2) / (2 * (stdev**2)))
      (1 / (Math.sqrt(2 * Math::PI) * stdev)) * exponent
    end

    # Count occurrences keyed by string value; missing keys return nil.
    def group_count(arr)
      counts = arr.each_with_object(Hash.new(0)) { |e, h| h[e.to_s] += 1 }
      counts.default = nil
      counts
    end

    def mean(arr)
      arr.sum / arr.size.to_f
    end

    # Sample standard deviation (n - 1 denominator).
    # NOTE(review): NaN for single-element groups — confirm acceptable.
    def stdev(arr)
      m = mean(arr)
      sum = arr.inject(0) { |accum, i| accum + (i - m)**2 }
      Math.sqrt(sum / (arr.length - 1).to_f)
    end
  end
end
data/lib/eps/version.rb CHANGED
@@ -1,3 +1,3 @@
1
1
  module Eps
2
- VERSION = "0.1.1"
2
+ VERSION = "0.2.0"
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: eps
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.1
4
+ version: 0.2.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Andrew Kane
8
8
  autorequire:
9
- bindir: exe
9
+ bindir: bin
10
10
  cert_chain: []
11
- date: 2018-07-05 00:00:00.000000000 Z
11
+ date: 2019-05-19 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -81,25 +81,20 @@ dependencies:
81
81
  - !ruby/object:Gem::Version
82
82
  version: '0'
83
83
  description:
84
- email:
85
- - andrew@chartkick.com
84
+ email: andrew@chartkick.com
86
85
  executables: []
87
86
  extensions: []
88
87
  extra_rdoc_files: []
89
88
  files:
90
- - ".gitignore"
91
- - ".travis.yml"
92
89
  - CHANGELOG.md
93
- - Gemfile
94
90
  - LICENSE.txt
95
91
  - README.md
96
- - Rakefile
97
- - eps.gemspec
98
- - guides/Modeling.md
99
92
  - lib/eps.rb
100
- - lib/eps/base_regressor.rb
101
- - lib/eps/metrics.rb
102
- - lib/eps/regressor.rb
93
+ - lib/eps/base.rb
94
+ - lib/eps/base_estimator.rb
95
+ - lib/eps/linear_regression.rb
96
+ - lib/eps/model.rb
97
+ - lib/eps/naive_bayes.rb
103
98
  - lib/eps/version.rb
104
99
  homepage: https://github.com/ankane/eps
105
100
  licenses:
@@ -113,16 +108,16 @@ required_ruby_version: !ruby/object:Gem::Requirement
113
108
  requirements:
114
109
  - - ">="
115
110
  - !ruby/object:Gem::Version
116
- version: '0'
111
+ version: '2.4'
117
112
  required_rubygems_version: !ruby/object:Gem::Requirement
118
113
  requirements:
119
114
  - - ">="
120
115
  - !ruby/object:Gem::Version
121
116
  version: '0'
122
117
  requirements: []
123
- rubyforge_project:
124
- rubygems_version: 2.7.7
118
+ rubygems_version: 3.0.3
125
119
  signing_key:
126
120
  specification_version: 4
127
- summary: Linear regression for Ruby
121
+ summary: Machine learning for Ruby. Supports regression (linear regression) and classification
122
+ (naive Bayes)
128
123
  test_files: []
data/.gitignore DELETED
@@ -1,9 +0,0 @@
1
- /.bundle/
2
- /.yardoc
3
- /_yardoc/
4
- /coverage/
5
- /doc/
6
- /pkg/
7
- /spec/reports/
8
- /tmp/
9
- *.lock
data/.travis.yml DELETED
@@ -1,15 +0,0 @@
1
- language: ruby
2
- rvm: 2.5.1
3
- sudo: required
4
- before_install:
5
- - gem install bundler
6
- - sudo apt-get update
7
- - sudo apt-get install -y libgsl0-dev
8
- script: bundle exec rake test
9
- env:
10
- -
11
- - GSL=t
12
- notifications:
13
- email:
14
- on_success: never
15
- on_failure: change