eps 0.1.1 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/lib/eps/model.rb ADDED
@@ -0,0 +1,108 @@
1
module Eps
  # Thin facade in front of the concrete estimators.
  #
  # A Model either wraps an estimator handed to it (the `load_*` class
  # methods do this) or, in the legacy one-step form, trains one from `data`.
  # Every public message the model does not define itself is forwarded to the
  # wrapped estimator.
  class Model
    # data      - training rows; when given (and no estimator is), the model
    #             trains immediately (legacy form)
    # y         - optional target values; derived from `data` and `target`
    #             when omitted
    # target    - key/column in `data` that holds the target value
    # estimator - an already built estimator to wrap as-is
    # options   - kept and passed to the estimator's constructor when training
    def initialize(data = nil, y = nil, target: nil, estimator: nil, **options)
      @options = options

      if estimator
        @estimator = estimator
      elsif data # legacy
        train(data, y, target: target)
      end
    end

    # pmml

    # Builds a model from PMML. `data` may be an XML string (parsed strictly
    # with Nokogiri, which is loaded lazily) or an already parsed document.
    # Raises RuntimeError ("Unknown model") when the document contains neither
    # a RegressionModel nor a NaiveBayesModel element.
    def self.load_pmml(data)
      if data.is_a?(String)
        require "nokogiri"
        data = Nokogiri::XML(data) { |config| config.strict }
      end

      estimator_class =
        if data.css("RegressionModel").any?
          Eps::LinearRegression
        elsif data.css("NaiveBayesModel").any?
          Eps::NaiveBayes
        else
          raise "Unknown model"
        end

      new(estimator: estimator_class.load_pmml(data))
    end

    # ruby - legacy

    def self.load(data)
      new(estimator: Eps::LinearRegression.load(data))
    end

    # json - legacy

    def self.load_json(data)
      new(estimator: Eps::LinearRegression.load_json(data))
    end

    # Accepts (and forwards) optional arguments so the method can be called
    # with a generator state or options as well as with none.
    def to_json(*args)
      @estimator ? @estimator.to_json(*args) : super
    end

    # pfa - legacy

    def self.load_pfa(data)
      new(estimator: Eps::LinearRegression.load_pfa(data))
    end

    # metrics

    # Delegates to the regression metrics when `actual` looks numeric and to
    # the classification metrics otherwise.
    def self.metrics(actual, estimated)
      estimator_class = numeric?(actual) ? Eps::LinearRegression : Eps::NaiveBayes
      estimator_class.metrics(actual, estimated)
    end

    # True when the first element of `y` is a Numeric; an empty `y` counts as
    # non-numeric. Only the first element is inspected.
    #
    # Kept above `private` on purpose: `private` does not apply to singleton
    # methods, and `train` calls this through `self.class`.
    def self.numeric?(y)
      y.first.is_a?(Numeric)
    end

    private

    # Picks the estimator from the target values (numeric -> linear
    # regression, anything else -> naive Bayes), builds it with the stored
    # options and trains it.
    def train(data, y = nil, target: nil)
      y ||= daru?(data) ? data[target].to_a : data.map { |r| r[target] }

      estimator_class = self.class.numeric?(y) ? Eps::LinearRegression : Eps::NaiveBayes

      @estimator = estimator_class.new(**@options)
      @estimator.train(data, y, target: target)
    end

    # Only public estimator methods are reported: method_missing forwards with
    # public_send, so private estimator methods are never reachable through
    # the model even when include_private is true.
    def respond_to_missing?(name, include_private = false)
      if @estimator
        @estimator.respond_to?(name)
      else
        super
      end
    end

    # Forwards unknown messages to the wrapped estimator, if there is one.
    def method_missing(method, *args, &block)
      if @estimator
        @estimator.public_send(method, *args, &block)
      else
        super
      end
    end

    # Daru is optional; the constant is only consulted when it is defined.
    def daru?(x)
      defined?(Daru) && x.is_a?(Daru::DataFrame)
    end
  end
end
data/lib/eps/naive_bayes.rb ADDED
@@ -0,0 +1,240 @@
1
module Eps
  # Naive Bayes classifier.
  #
  # The fitted state lives in `probabilities`:
  #   prior:       {class => count}
  #   conditional: {feature => {class => {value => count}}}        (categorical)
  #                {feature => {class => {mean: .., stdev: ..}}}   (numeric)
  # Class labels and categorical values are stored as strings.
  class NaiveBayes < BaseEstimator
    attr_reader :probabilities

    def initialize(probabilities: nil, target: nil)
      @probabilities = probabilities if probabilities
      @target = target if target
    end

    # Fits the model. Arguments are handled by the superclass, which is
    # expected to leave the feature rows in @x and the labels in @y
    # (NOTE(review): inferred from how they are used below - confirm against
    # BaseEstimator#train).
    def train(*args)
      super

      # the cached summary describes the previous fit
      @summary_str = nil

      @y = @y.map { |yi| yi.to_s }

      prior = group_count(@y)
      conditional = {}

      if @x.any?
        keys = @x.first.keys

        # bucket the rows by label once, without writing the label into the
        # stored feature rows
        rows_by_class = {}
        @x.each_with_index do |xi, i|
          (rows_by_class[@y[i]] ||= []) << xi
        end

        keys.each do |k|
          conditional[k] = {}
          rows_by_class.each do |group, xs|
            v = xs.map { |xi| xi[k] }

            conditional[k][group] =
              if categorical?(v[0])
                # TODO apply smoothing
                # apply smoothing only to
                # 1. categorical features
                # 2. conditional probabilities
                # TODO more efficient count
                group_count(v)
              else
                {mean: mean(v), stdev: stdev(v)}
              end
          end
        end
      end

      @probabilities = {
        prior: prior,
        conditional: conditional
      }
    end

    # TODO better summary
    # Returns the class counts followed by the training accuracy. The string
    # is cached until the next call to train; `extended` is currently unused.
    def summary(extended: false)
      @summary_str ||= begin
        str = String.new("")
        probabilities[:prior].each do |k, v|
          str << "#{k}: #{v}\n"
        end
        str << "\n"
        str << format("accuracy: %d%%\n", (100 * accuracy).round)
        str
      end
    end

    # Accuracy of the model on its own training rows.
    def accuracy
      self.class.metrics(predict(@x), @y)[:accuracy]
    end

    # pmml

    # Builds an estimator from a parsed PMML document containing a
    # NaiveBayesModel element. Counts, means and variances are read with
    # to_f; the stored stdev is the square root of the PMML variance.
    def self.load_pmml(data)
      # TODO more validation
      node = data.css("NaiveBayesModel")

      prior = {}
      node.css("BayesOutput TargetValueCount").each do |n|
        prior[n.attribute("value").value] = n.attribute("count").value.to_f
      end

      conditional = {}
      node.css("BayesInput").each do |n|
        prob = {}
        n.css("TargetValueStat").each do |stat|
          dist = stat.css("GaussianDistribution")
          prob[stat.attribute("value").value] = {
            mean: dist.attribute("mean").value.to_f,
            stdev: Math.sqrt(dist.attribute("variance").value.to_f)
          }
        end
        n.css("PairCounts").each do |pair|
          counts = {}
          pair.css("TargetValueCount").each do |tvc|
            counts[tvc.attribute("value").value] = tvc.attribute("count").value.to_f
          end
          prob[pair.attribute("value").value] = counts
        end
        conditional[n.attribute("fieldName").value] = prob
      end

      # plain local: an instance variable here would live on the class object
      target = node.css("BayesOutput").attribute("fieldName").value

      probabilities = {
        prior: prior,
        conditional: conditional
      }

      new(probabilities: probabilities, target: target)
    end

    # Serializes the fitted model as a PMML 4.3 document (String).
    #
    # NOTE(review): for categorical features the outer PairCounts@value is the
    # class label and the inner TargetValueCount@value is the feature value,
    # because that is how `probabilities[:conditional]` is nested. PMML
    # normally nests these the other way round. load_pmml reads the same
    # layout back, so round-trips are consistent - confirm before exchanging
    # files with other PMML tools.
    def to_pmml
      # loaded lazily, like in Eps::Model.load_pmml
      require "nokogiri"

      data_fields = {}
      data_fields[@target] = probabilities[:prior].keys
      probabilities[:conditional].each do |k, v|
        # nil marks a continuous field, a list of values a categorical one
        data_fields[k] = gaussian_feature?(v) ? nil : v.keys
      end

      builder = Nokogiri::XML::Builder.new do |xml|
        xml.PMML(version: "4.3", xmlns: "http://www.dmg.org/PMML-4_3", "xmlns:xsi" => "http://www.w3.org/2001/XMLSchema-instance") do
          xml.Header
          xml.DataDictionary do
            data_fields.each do |k, vs|
              if vs
                xml.DataField(name: k, optype: "categorical", dataType: "string") do
                  vs.each do |v|
                    xml.Value(value: v)
                  end
                end
              else
                xml.DataField(name: k, optype: "continuous", dataType: "double")
              end
            end
          end
          xml.NaiveBayesModel(functionName: "classification", threshold: 0.001) do
            xml.MiningSchema do
              data_fields.each do |k, _|
                xml.MiningField(name: k)
              end
            end
            xml.BayesInputs do
              probabilities[:conditional].each do |k, v|
                xml.BayesInput(fieldName: k) do
                  if gaussian_feature?(v)
                    xml.TargetValueStats do
                      v.each do |k2, v2|
                        xml.TargetValueStat(value: k2) do
                          xml.GaussianDistribution(mean: v2[:mean], variance: v2[:stdev]**2)
                        end
                      end
                    end
                  else
                    v.each do |k2, v2|
                      xml.PairCounts(value: k2) do
                        xml.TargetValueCounts do
                          v2.each do |k3, v3|
                            xml.TargetValueCount(value: k3, count: v3)
                          end
                        end
                      end
                    end
                  end
                end
              end
            end
            # must name the same field as the DataDictionary / MiningSchema
            # entry for the target; load_pmml reads it back as the target
            xml.BayesOutput(fieldName: @target) do
              xml.TargetValueCounts do
                probabilities[:prior].each do |k, v|
                  xml.TargetValueCount(value: k, count: v)
                end
              end
            end
          end
        end
      end
      builder.to_xml
    end

    # metrics

    # Returns {accuracy: fraction of positions where both arrays agree}.
    # Pairs are taken position by position from `actual`; the result is NaN
    # for an empty `actual`.
    def self.metrics(actual, estimated)
      {
        accuracy: actual.zip(estimated).count { |yi, yi2| yi == yi2 } / actual.size.to_f
      }
    end

    private

    # Most probable class for every row of `x`.
    def _predict(x)
      x.map do |xi|
        probs = calculate_class_probabilities(xi)
        # deterministic for equal probabilities
        probs.sort_by { |k, v| [-v, k.to_s] }.first.first
      end
    end

    # Unnormalized score per class: prior share times the per-feature
    # likelihoods.
    def calculate_class_probabilities(x)
      prior_total = probabilities[:prior].values.sum

      prob = {}
      probabilities[:prior].each do |c, cv|
        prob[c] = cv.to_f / prior_total
        probabilities[:conditional].each do |k, v|
          if v[c][:mean]
            prob[c] *= calculate_probability(x[k], v[c][:mean], v[c][:stdev])
          else
            # TODO compute ahead of time
            # counts are keyed by the string form of the value (see
            # group_count), so look the value up the same way
            p2 = v[c][x[k].to_s].to_f / v[c].values.sum

            # assign very small probability if probability is 0
            # TODO use proper smoothing instead
            p2 = 0.0001 if p2 == 0

            prob[c] *= p2
          end
        end
      end
      prob
    end

    # Density of the normal distribution N(mean, stdev^2) at x.
    def calculate_probability(x, mean, stdev)
      exponent = Math.exp(-((x - mean)**2) / (2 * (stdev**2)))
      (1 / (Math.sqrt(2 * Math::PI) * stdev)) * exponent
    end

    # True when the per-class entries of a feature hold mean/stdev statistics
    # rather than value counts (decided from the first class only).
    def gaussian_feature?(stats)
      stats.values[0][:mean] ? true : false
    end

    # Occurrences of each element, keyed by its string form. The returned
    # hash answers nil (not 0) for unknown keys.
    def group_count(arr)
      counts = arr.each_with_object(Hash.new(0)) { |e, h| h[e.to_s] += 1 }
      counts.default = nil
      counts
    end

    def mean(arr)
      arr.sum / arr.size.to_f
    end

    # Sample standard deviation (n - 1 in the denominator); NaN for a single
    # element.
    def stdev(arr)
      m = mean(arr)
      squares = arr.inject(0) { |accum, i| accum + (i - m)**2 }
      Math.sqrt(squares / (arr.length - 1).to_f)
    end
  end
end
data/lib/eps/version.rb CHANGED
@@ -1,3 +1,3 @@
1
1
  module Eps
2
- VERSION = "0.1.1"
2
+ VERSION = "0.2.0"
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: eps
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.1
4
+ version: 0.2.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Andrew Kane
8
8
  autorequire:
9
- bindir: exe
9
+ bindir: bin
10
10
  cert_chain: []
11
- date: 2018-07-05 00:00:00.000000000 Z
11
+ date: 2019-05-19 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -81,25 +81,20 @@ dependencies:
81
81
  - !ruby/object:Gem::Version
82
82
  version: '0'
83
83
  description:
84
- email:
85
- - andrew@chartkick.com
84
+ email: andrew@chartkick.com
86
85
  executables: []
87
86
  extensions: []
88
87
  extra_rdoc_files: []
89
88
  files:
90
- - ".gitignore"
91
- - ".travis.yml"
92
89
  - CHANGELOG.md
93
- - Gemfile
94
90
  - LICENSE.txt
95
91
  - README.md
96
- - Rakefile
97
- - eps.gemspec
98
- - guides/Modeling.md
99
92
  - lib/eps.rb
100
- - lib/eps/base_regressor.rb
101
- - lib/eps/metrics.rb
102
- - lib/eps/regressor.rb
93
+ - lib/eps/base.rb
94
+ - lib/eps/base_estimator.rb
95
+ - lib/eps/linear_regression.rb
96
+ - lib/eps/model.rb
97
+ - lib/eps/naive_bayes.rb
103
98
  - lib/eps/version.rb
104
99
  homepage: https://github.com/ankane/eps
105
100
  licenses:
@@ -113,16 +108,16 @@ required_ruby_version: !ruby/object:Gem::Requirement
113
108
  requirements:
114
109
  - - ">="
115
110
  - !ruby/object:Gem::Version
116
- version: '0'
111
+ version: '2.4'
117
112
  required_rubygems_version: !ruby/object:Gem::Requirement
118
113
  requirements:
119
114
  - - ">="
120
115
  - !ruby/object:Gem::Version
121
116
  version: '0'
122
117
  requirements: []
123
- rubyforge_project:
124
- rubygems_version: 2.7.7
118
+ rubygems_version: 3.0.3
125
119
  signing_key:
126
120
  specification_version: 4
127
- summary: Linear regression for Ruby
121
+ summary: Machine learning for Ruby. Supports regression (linear regression) and classification
122
+ (naive Bayes)
128
123
  test_files: []
data/.gitignore DELETED
@@ -1,9 +0,0 @@
1
- /.bundle/
2
- /.yardoc
3
- /_yardoc/
4
- /coverage/
5
- /doc/
6
- /pkg/
7
- /spec/reports/
8
- /tmp/
9
- *.lock
data/.travis.yml DELETED
@@ -1,15 +0,0 @@
1
- language: ruby
2
- rvm: 2.5.1
3
- sudo: required
4
- before_install:
5
- - gem install bundler
6
- - sudo apt-get update
7
- - sudo apt-get install -y libgsl0-dev
8
- script: bundle exec rake test
9
- env:
10
- -
11
- - GSL=t
12
- notifications:
13
- email:
14
- on_success: never
15
- on_failure: change