eps 0.1.1 → 0.2.0
- checksums.yaml +4 -4
- data/CHANGELOG.md +6 -0
- data/README.md +235 -84
- data/lib/eps.rb +9 -4
- data/lib/eps/base.rb +19 -0
- data/lib/eps/base_estimator.rb +84 -0
- data/lib/eps/linear_regression.rb +558 -0
- data/lib/eps/model.rb +108 -0
- data/lib/eps/naive_bayes.rb +240 -0
- data/lib/eps/version.rb +1 -1
- metadata +13 -18
- data/.gitignore +0 -9
- data/.travis.yml +0 -15
- data/Gemfile +0 -11
- data/Rakefile +0 -34
- data/eps.gemspec +0 -30
- data/guides/Modeling.md +0 -152
- data/lib/eps/base_regressor.rb +0 -232
- data/lib/eps/metrics.rb +0 -35
- data/lib/eps/regressor.rb +0 -314
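
The headline change in 0.2.0 is the reorganization of the library around `Eps::Model` (new `model.rb`, `linear_regression.rb`, and `naive_bayes.rb` files) in place of the removed `Eps::Regressor`/`Eps::BaseRegressor` pair. As a rough orientation only, here is a minimal training sketch under the new layout, assuming `Eps::Model` accepts the same `data`/`target:` arguments as the `Eps::Regressor.new(data, target: :y)` call in the deleted Rakefile benchmark below; the dataset and field names are made up for illustration.

```ruby
require "eps"

# toy dataset in the same rows-of-hashes shape the old benchmark used
data = [
  {bedrooms: 1, bathrooms: 1, price: 100_000},
  {bedrooms: 2, bathrooms: 1, price: 125_000},
  {bedrooms: 2, bathrooms: 2, price: 135_000},
  {bedrooms: 3, bathrooms: 2, price: 162_000}
]

# assumption: Eps::Model keeps the data/target constructor of the old Eps::Regressor
model = Eps::Model.new(data, target: :price)
puts model.predict(bedrooms: 2, bathrooms: 1)
```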
data/Gemfile
DELETED
```diff
@@ -1,11 +0,0 @@
-source "https://rubygems.org"
-
-git_source(:github) {|repo_name| "https://github.com/#{repo_name}" }
-
-# Specify your gem's dependencies in eps.gemspec
-gemspec
-
-# remove when 0.2.1 released
-gem "daru", github: "sciruby/daru"
-
-gem "gsl" if ENV["GSL"]
```
data/Rakefile
DELETED
```diff
@@ -1,34 +0,0 @@
-require "bundler/gem_tasks"
-require "rake/testtask"
-
-Rake::TestTask.new(:test) do |t|
-  t.libs << "test"
-  t.libs << "lib"
-  t.test_files = FileList["test/**/*_test.rb"]
-  t.warning = false
-end
-
-task default: :test
-
-task :benchmark do
-  require "benchmark"
-  require "eps"
-  require "gsl" if ENV["GSL"]
-
-  data = []
-  10000.times do
-    row = {}
-    30.times do |i|
-      row[:"x#{i}"] = rand(100)
-    end
-    row[:y] = rand(100)
-    data << row
-  end
-
-  puts "Starting benchmark..."
-
-  time = Benchmark.realtime do
-    Eps::Regressor.new(data, target: :y)
-  end
-  p time.round(1)
-end
```
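
For reference, the same benchmark can be run outside Rake. This is a hypothetical standalone script, not part of the gem, but it follows the deleted task above line for line and shows how the `ENV["GSL"]` toggle works: GSL is only loaded when the environment variable is set, and the fit uses it when available.

```ruby
# standalone sketch of the deleted :benchmark task
# run as `ruby bench.rb` or `GSL=1 ruby bench.rb`
require "benchmark"
require "eps"
require "gsl" if ENV["GSL"]

# 10,000 rows with 30 random features plus a random target
data = Array.new(10_000) do
  row = Array.new(30) { |i| [:"x#{i}", rand(100)] }.to_h
  row[:y] = rand(100)
  row
end

time = Benchmark.realtime { Eps::Regressor.new(data, target: :y) }
puts "fit took #{time.round(1)}s (GSL #{defined?(GSL) ? "on" : "off"})"
```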
data/eps.gemspec
DELETED
```diff
@@ -1,30 +0,0 @@
-
-lib = File.expand_path("../lib", __FILE__)
-$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
-require "eps/version"
-
-Gem::Specification.new do |spec|
-  spec.name = "eps"
-  spec.version = Eps::VERSION
-  spec.authors = ["Andrew Kane"]
-  spec.email = ["andrew@chartkick.com"]
-
-  spec.summary = "Linear regression for Ruby"
-  spec.homepage = "https://github.com/ankane/eps"
-  spec.license = "MIT"
-
-  # Specify which files should be added to the gem when it is released.
-  # The `git ls-files -z` loads the files in the RubyGem that have been added into git.
-  spec.files = Dir.chdir(File.expand_path('..', __FILE__)) do
-    `git ls-files -z`.split("\x0").reject { |f| f.match(%r{^(test|spec|features)/}) }
-  end
-  spec.bindir = "exe"
-  spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
-  spec.require_paths = ["lib"]
-
-  spec.add_development_dependency "bundler"
-  spec.add_development_dependency "daru"
-  spec.add_development_dependency "minitest"
-  spec.add_development_dependency "nokogiri"
-  spec.add_development_dependency "rake"
-end
```
data/guides/Modeling.md
DELETED
```diff
@@ -1,152 +0,0 @@
-# Modeling
-
-- [R JSON](#r-json)
-- [R PMML](#r-pmml)
-- [R PFA](#r-pfa)
-- [Python JSON](#python-json)
-- [Python PMML](#python-pmml)
-- [Python PFA](#python-pfa)
-
-## R JSON
-
-Install the [jsonlite](https://cran.r-project.org/package=jsonlite) package
-
-```r
-install.packages("jsonlite")
-```
-
-And run:
-
-```r
-library(jsonlite)
-
-model <- lm(dist ~ speed, cars)
-data <- toJSON(list(coefficients=as.list(coef(model))), auto_unbox=TRUE)
-write(data, file="model.json")
-```
-
-## R PMML
-
-Install the [pmml](https://cran.r-project.org/package=pmml) package
-
-```r
-install.packages("pmml")
-```
-
-And run:
-
-```r
-library(pmml)
-
-model <- lm(dist ~ speed, cars)
-data <- toString(pmml(model))
-write(data, file="model.pmml")
-```
-
-## R PFA
-
-Install the [aurelius](https://cran.r-project.org/package=aurelius) package
-
-```r
-install.packages("aurelius")
-```
-
-And run:
-
-```r
-library(aurelius)
-
-model <- lm(dist ~ speed, cars)
-write_pfa(pfa(model), file="model.pfa")
-```
-
-## Python JSON
-
-Run:
-
-```python
-from sklearn import linear_model
-import pandas as pd
-import json
-
-x = [1, 2, 3, 4, 5]
-y = [5 * xi + 3 for xi in x]
-
-df = pd.DataFrame({'x': x, 'y': y})
-features = ['x']
-
-model = linear_model.LinearRegression()
-model.fit(df[features], df['y'])
-
-coefficients = {'_intercept': model.intercept_}
-for i, c in enumerate(model.coef_):
-    coefficients[features[i]] = c
-
-
-data = json.dumps({'coefficients': coefficients})
-
-with open('model.json', 'w') as f:
-    f.write(data)
-```
-
-## Python PMML
-
-Install the [scikit2pmml](https://github.com/vaclavcadek/scikit2pmml) package
-
-```sh
-pip install scikit2pmml
-```
-
-And run:
-
-```python
-from sklearn import linear_model
-from scikit2pmml import scikit2pmml
-
-x = [1, 2, 3, 5, 6]
-y = [5 * xi + 3 for xi in x]
-
-model = linear_model.LinearRegression()
-model.fit([[xi] for xi in x], y)
-
-scikit2pmml(estimator=model, file='model.pmml')
-```
-
-## Python PFA
-
-Install the [Titus](https://github.com/opendatagroup/hadrian) package and run:
-
-```python
-from sklearn import linear_model
-import titus.prettypfa
-import json
-
-x = [1, 2, 3, 5, 6]
-y = [5 * xi + 3 for xi in x]
-
-model = linear_model.LinearRegression()
-model.fit([[xi] for xi in x], y)
-
-def pfa(estimator):
-    pfaDocument = titus.prettypfa.jsonNode('''
-types:
-  Regression = record(Regression,
-                      const: double,
-                      coeff: array(double))
-input: array(double)
-output: double
-cells:
-  regression(Regression) = {const: 0.0, coeff: []}
-action:
-  model.reg.linear(input, regression)
-''')
-
-    pfaDocument["cells"]["regression"]["init"] = {"const": estimator.intercept_, "coeff": list(estimator.coef_)}
-
-    return pfaDocument
-
-data = json.dumps(pfa(model))
-
-with open('model.pfa', 'w') as f:
-    f.write(data)
-```
```
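
For context on the Ruby side, here is a minimal sketch of consuming one of these exported files with the 0.1.x loaders that this release removes (`load_json`, `load_pmml`, and `load_pfa` in `base_regressor.rb` below). The `model.json` path and the `speed` feature follow the `dist ~ speed` R examples in the deleted guide.

```ruby
require "eps"

# pick the loader that matches the exported format
model = Eps::BaseRegressor.load_json(File.read("model.json"))
# or: Eps::BaseRegressor.load_pmml(File.read("model.pmml"))
# or: Eps::BaseRegressor.load_pfa(File.read("model.pfa"))

# predict stopping distance for a given speed
puts model.predict(speed: 10)
```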
data/lib/eps/base_regressor.rb
DELETED
```diff
@@ -1,232 +0,0 @@
-module Eps
-  class BaseRegressor
-    attr_reader :coefficients
-
-    def initialize(coefficients:)
-      @coefficients = Hash[coefficients.map { |k, v| [k.to_sym, v] }]
-    end
-
-    def predict(x)
-      singular = !(x.is_a?(Array) || daru?(x))
-      x = [x] if singular
-      x, c = prep_x(x, train: false)
-      coef = c.map do |v|
-        # use 0 if coefficient does not exist
-        # this can happen for categorical features
-        # since only n-1 coefficients are stored
-        coefficients[v] || 0
-      end
-
-      x = Matrix.rows(x)
-      c = Matrix.column_vector(coef)
-      pred = matrix_arr(x * c)
-
-      singular ? pred[0] : pred
-    end
-
-    def evaluate(data, y = nil, target: nil)
-      raise ArgumentError, "missing target" if !target && !y
-
-      actual = y
-      actual ||=
-        if daru?(data)
-          data[target].to_a
-        else
-          data.map { |v| v[target] }
-        end
-
-      actual = prep_y(actual)
-      estimated = predict(data)
-      Eps.metrics(actual, estimated)
-    end
-
-    # ruby
-
-    def self.load(data)
-      BaseRegressor.new(Hash[data.map { |k, v| [k.to_sym, v] }])
-    end
-
-    def dump
-      {coefficients: coefficients}
-    end
-
-    # json
-
-    def self.load_json(data)
-      data = JSON.parse(data) if data.is_a?(String)
-      coefficients = data["coefficients"]
-
-      # for R models
-      if coefficients["(Intercept)"]
-        coefficients = coefficients.dup
-        coefficients["_intercept"] = coefficients.delete("(Intercept)")
-      end
-
-      BaseRegressor.new(coefficients: coefficients)
-    end
-
-    def to_json
-      JSON.generate(dump)
-    end
-
-    # pmml
-
-    def self.load_pmml(data)
-      if data.is_a?(String)
-        require "nokogiri"
-        data = Nokogiri::XML(data)
-      end
-
-      # TODO more validation
-      node = data.css("RegressionTable")
-      coefficients = {
-        _intercept: node.attribute("intercept").value.to_f
-      }
-      node.css("NumericPredictor").each do |n|
-        coefficients[n.attribute("name").value] = n.attribute("coefficient").value.to_f
-      end
-      node.css("CategoricalPredictor").each do |n|
-        coefficients["#{n.attribute("name").value}#{n.attribute("value").value}"] = n.attribute("coefficient").value.to_f
-      end
-      BaseRegressor.new(coefficients: coefficients)
-    end
-
-    # pfa
-
-    def self.load_pfa(data)
-      data = JSON.parse(data) if data.is_a?(String)
-      init = data["cells"].first[1]["init"]
-      names =
-        if data["input"]["fields"]
-          data["input"]["fields"].map { |f| f["name"] }
-        else
-          init["coeff"].map.with_index { |_, i| "x#{i}" }
-        end
-      coefficients = {
-        _intercept: init["const"]
-      }
-      init["coeff"].each_with_index do |c, i|
-        name = names[i]
-        # R can export coefficients with same name
-        raise "Coefficients with same name" if coefficients[name]
-        coefficients[name] = c
-      end
-      BaseRegressor.new(coefficients: coefficients)
-    end
-
-    protected
-
-    def daru?(x)
-      defined?(Daru) && x.is_a?(Daru::DataFrame)
-    end
-
-    def prep_x(x, train: true)
-      if daru?(x)
-        x = x.to_a[0]
-      else
-        x = x.map do |xi|
-          case xi
-          when Hash
-            xi
-          when Array
-            Hash[xi.map.with_index { |v, i| [:"x#{i}", v] }]
-          else
-            {x0: xi}
-          end
-        end
-      end
-
-      # if !train && x.any?
-      #   # check first row against coefficients
-      #   ckeys = coefficients.keys.map(&:to_s)
-      #   bad_keys = x[0].keys.map(&:to_s).reject { |k| ckeys.any? { |c| c.start_with?(k) } }
-      #   raise "Unknown keys: #{bad_keys.join(", ")}" if bad_keys.any?
-      # end
-
-      cache = {}
-      first_key = {}
-      i = 0
-      rows = []
-      x.each do |xi|
-        row = {}
-        xi.each do |k, v|
-          key = v.is_a?(String) ? [k.to_sym, v] : k.to_sym
-          v2 = v.is_a?(String) ? 1 : v
-
-          # TODO make more efficient
-          next if !train && !coefficients.key?(symbolize_coef(key))
-
-          raise "Missing data" if v2.nil?
-
-          unless cache[key]
-            cache[key] = i
-            first_key[k] ||= key if v.is_a?(String)
-            i += 1
-          end
-
-          row[key] = v2
-        end
-        rows << row
-      end
-
-      if train
-        # remove one degree of freedom
-        first_key.values.each do |v|
-          num = cache.delete(v)
-          cache.each do |k, v2|
-            cache[k] -= 1 if v2 > num
-          end
-        end
-      end
-
-      ret2 = []
-      rows.each do |row|
-        ret = [0] * cache.size
-        row.each do |k, v|
-          if cache[k]
-            ret[cache[k]] = v
-          end
-        end
-        ret2 << ([1] + ret)
-      end
-
-      # flatten keys
-      c = [:_intercept] + cache.sort_by { |_, v| v }.map { |k, _| symbolize_coef(k) }
-
-      if c.size != c.uniq.size
-        raise "Overlapping coefficients"
-      end
-
-      [ret2, c]
-    end
-
-    def symbolize_coef(k)
-      (k.is_a?(Array) ? k.join("") : k).to_sym
-    end
-
-    def matrix_arr(matrix)
-      matrix.to_a.map { |xi| xi[0].to_f }
-    end
-
-    # determine if target is a string or symbol
-    def prep_target(target, data)
-      if daru?(data)
-        data.has_vector?(target) ? target : flip_target(target)
-      else
-        x = data[0] || {}
-        x[target] ? target : flip_target(target)
-      end
-    end
-
-    def flip_target(target)
-      target.is_a?(String) ? target.to_sym : target.to_s
-    end
-
-    def prep_y(y)
-      y.each do |yi|
-        raise "Target missing in data" if yi.nil?
-      end
-      y.map(&:to_f)
-    end
-  end
-end
```
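
A quick usage sketch of the class above: `BaseRegressor.new` takes a coefficient hash with an `_intercept` entry plus one entry per feature (categorical features are flattened to `:"<name><value>"` keys by `prep_x`), and `predict` computes intercept plus dot product, returning a scalar for a single hash and an array for an array of hashes. The coefficient values here are made up for illustration.

```ruby
require "eps"

# dist = 3 + 5 * x0
model = Eps::BaseRegressor.new(coefficients: {_intercept: 3.0, x0: 5.0})

model.predict(x0: 2)               # => 13.0 (3 + 5 * 2)
model.predict([{x0: 1}, {x0: 4}])  # => [8.0, 23.0]
```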