eps 0.1.1 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +6 -0
- data/README.md +235 -84
- data/lib/eps.rb +9 -4
- data/lib/eps/base.rb +19 -0
- data/lib/eps/base_estimator.rb +84 -0
- data/lib/eps/linear_regression.rb +558 -0
- data/lib/eps/model.rb +108 -0
- data/lib/eps/naive_bayes.rb +240 -0
- data/lib/eps/version.rb +1 -1
- metadata +13 -18
- data/.gitignore +0 -9
- data/.travis.yml +0 -15
- data/Gemfile +0 -11
- data/Rakefile +0 -34
- data/eps.gemspec +0 -30
- data/guides/Modeling.md +0 -152
- data/lib/eps/base_regressor.rb +0 -232
- data/lib/eps/metrics.rb +0 -35
- data/lib/eps/regressor.rb +0 -314
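
The file list above shows the 0.1.x `Eps::Regressor`/`Eps::BaseRegressor` classes being replaced by `Eps::Model`, backed by `Eps::LinearRegression` and `Eps::NaiveBayes`. A minimal sketch of how the new entry point appears to be used, based on the added file names and the updated README rather than on code shown in this diff; the data and column names are illustrative only:

```ruby
require "eps"

# toy training data; columns and values are made up for illustration
data = [
  {bedrooms: 1, bathrooms: 1, price: 100000},
  {bedrooms: 2, bathrooms: 1, price: 125000},
  {bedrooms: 2, bathrooms: 2, price: 135000},
  {bedrooms: 3, bathrooms: 2, price: 162000}
]

# Eps::Model is assumed to replace the removed Eps::Regressor as the public interface
model = Eps::Model.new(data, target: :price)
puts model.predict(bedrooms: 2, bathrooms: 1)
```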
data/Gemfile
DELETED
@@ -1,11 +0,0 @@
-source "https://rubygems.org"
-
-git_source(:github) {|repo_name| "https://github.com/#{repo_name}" }
-
-# Specify your gem's dependencies in eps.gemspec
-gemspec
-
-# remove when 0.2.1 released
-gem "daru", github: "sciruby/daru"
-
-gem "gsl" if ENV["GSL"]
data/Rakefile
DELETED
@@ -1,34 +0,0 @@
-require "bundler/gem_tasks"
-require "rake/testtask"
-
-Rake::TestTask.new(:test) do |t|
-  t.libs << "test"
-  t.libs << "lib"
-  t.test_files = FileList["test/**/*_test.rb"]
-  t.warning = false
-end
-
-task default: :test
-
-task :benchmark do
-  require "benchmark"
-  require "eps"
-  require "gsl" if ENV["GSL"]
-
-  data = []
-  10000.times do
-    row = {}
-    30.times do |i|
-      row[:"x#{i}"] = rand(100)
-    end
-    row[:y] = rand(100)
-    data << row
-  end
-
-  puts "Starting benchmark..."
-
-  time = Benchmark.realtime do
-    Eps::Regressor.new(data, target: :y)
-  end
-  p time.round(1)
-end
data/eps.gemspec
DELETED
@@ -1,30 +0,0 @@
-
-lib = File.expand_path("../lib", __FILE__)
-$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
-require "eps/version"
-
-Gem::Specification.new do |spec|
-  spec.name = "eps"
-  spec.version = Eps::VERSION
-  spec.authors = ["Andrew Kane"]
-  spec.email = ["andrew@chartkick.com"]
-
-  spec.summary = "Linear regression for Ruby"
-  spec.homepage = "https://github.com/ankane/eps"
-  spec.license = "MIT"
-
-  # Specify which files should be added to the gem when it is released.
-  # The `git ls-files -z` loads the files in the RubyGem that have been added into git.
-  spec.files = Dir.chdir(File.expand_path('..', __FILE__)) do
-    `git ls-files -z`.split("\x0").reject { |f| f.match(%r{^(test|spec|features)/}) }
-  end
-  spec.bindir = "exe"
-  spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
-  spec.require_paths = ["lib"]
-
-  spec.add_development_dependency "bundler"
-  spec.add_development_dependency "daru"
-  spec.add_development_dependency "minitest"
-  spec.add_development_dependency "nokogiri"
-  spec.add_development_dependency "rake"
-end
data/guides/Modeling.md
DELETED
@@ -1,152 +0,0 @@
-# Modeling
-
-- [R JSON](#r-json)
-- [R PMML](#r-pmml)
-- [R PFA](#r-pfa)
-- [Python JSON](#python-json)
-- [Python PMML](#python-pmml)
-- [Python PFA](#python-pfa)
-
-## R JSON
-
-Install the [jsonlite](https://cran.r-project.org/package=jsonlite) package
-
-```r
-install.packages("jsonlite")
-```
-
-And run:
-
-```r
-library(jsonlite)
-
-model <- lm(dist ~ speed, cars)
-data <- toJSON(list(coefficients=as.list(coef(model))), auto_unbox=TRUE)
-write(data, file="model.json")
-```
-
-## R PMML
-
-Install the [pmml](https://cran.r-project.org/package=pmml) package
-
-```r
-install.packages("pmml")
-```
-
-And run:
-
-```r
-library(pmml)
-
-model <- lm(dist ~ speed, cars)
-data <- toString(pmml(model))
-write(data, file="model.pmml")
-```
-
-## R PFA
-
-Install the [aurelius](https://cran.r-project.org/package=aurelius) package
-
-```r
-install.packages("aurelius")
-```
-
-And run:
-
-```r
-library(aurelius)
-
-model <- lm(dist ~ speed, cars)
-write_pfa(pfa(model), file="model.pfa")
-```
-
-## Python JSON
-
-Run:
-
-```python
-from sklearn import linear_model
-import pandas as pd
-import json
-
-x = [1, 2, 3, 4, 5]
-y = [5 * xi + 3 for xi in x]
-
-df = pd.DataFrame({'x': x, 'y': y})
-features = ['x']
-
-model = linear_model.LinearRegression()
-model.fit(df[features], df['y'])
-
-coefficients = {'_intercept': model.intercept_}
-for i, c in enumerate(model.coef_):
-    coefficients[features[i]] = c
-
-
-data = json.dumps({'coefficients': coefficients})
-
-with open('model.json', 'w') as f:
-    f.write(data)
-```
-
-## Python PMML
-
-Install the [scikit2pmml](https://github.com/vaclavcadek/scikit2pmml) package
-
-```sh
-pip install scikit2pmml
-```
-
-And run:
-
-```python
-from sklearn import linear_model
-from scikit2pmml import scikit2pmml
-
-x = [1, 2, 3, 5, 6]
-y = [5 * xi + 3 for xi in x]
-
-model = linear_model.LinearRegression()
-model.fit([[xi] for xi in x], y)
-
-scikit2pmml(estimator=model, file='model.pmml')
-```
-
-## Python PFA
-
-Install the [Titus](https://github.com/opendatagroup/hadrian) package and run:
-
-```python
-from sklearn import linear_model
-import titus.prettypfa
-import json
-
-x = [1, 2, 3, 5, 6]
-y = [5 * xi + 3 for xi in x]
-
-model = linear_model.LinearRegression()
-model.fit([[xi] for xi in x], y)
-
-def pfa(estimator):
-    pfaDocument = titus.prettypfa.jsonNode('''
-types:
-  Regression = record(Regression,
-                      const: double,
-                      coeff: array(double))
-input: array(double)
-output: double
-cells:
-  regression(Regression) = {const: 0.0, coeff: []}
-action:
-  model.reg.linear(input, regression)
-''')
-
-    pfaDocument["cells"]["regression"]["init"] = {"const": estimator.intercept_, "coeff": list(estimator.coef_)}
-
-    return pfaDocument
-
-data = json.dumps(pfa(model))
-
-with open('model.pfa', 'w') as f:
-    f.write(data)
-```
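
The deleted guide above covers only the export side (producing `model.json`, `model.pmml`, or `model.pfa` from R or Python). For context, a minimal sketch of the matching import step in 0.1.1, using the `load_json` method from the `BaseRegressor` class removed in the next file; the file name and input value are illustrative assumptions:

```ruby
require "eps"

# load coefficients exported by the Python JSON snippet above (assumed file name)
model = Eps::BaseRegressor.load_json(File.read("model.json"))

# predict with the single feature "x" used in that snippet
puts model.predict(x: 2.0)
```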
data/lib/eps/base_regressor.rb
DELETED
@@ -1,232 +0,0 @@
-module Eps
-  class BaseRegressor
-    attr_reader :coefficients
-
-    def initialize(coefficients:)
-      @coefficients = Hash[coefficients.map { |k, v| [k.to_sym, v] }]
-    end
-
-    def predict(x)
-      singular = !(x.is_a?(Array) || daru?(x))
-      x = [x] if singular
-      x, c = prep_x(x, train: false)
-      coef = c.map do |v|
-        # use 0 if coefficient does not exist
-        # this can happen for categorical features
-        # since only n-1 coefficients are stored
-        coefficients[v] || 0
-      end
-
-      x = Matrix.rows(x)
-      c = Matrix.column_vector(coef)
-      pred = matrix_arr(x * c)
-
-      singular ? pred[0] : pred
-    end
-
-    def evaluate(data, y = nil, target: nil)
-      raise ArgumentError, "missing target" if !target && !y
-
-      actual = y
-      actual ||=
-        if daru?(data)
-          data[target].to_a
-        else
-          data.map { |v| v[target] }
-        end
-
-      actual = prep_y(actual)
-      estimated = predict(data)
-      Eps.metrics(actual, estimated)
-    end
-
-    # ruby
-
-    def self.load(data)
-      BaseRegressor.new(Hash[data.map { |k, v| [k.to_sym, v] }])
-    end
-
-    def dump
-      {coefficients: coefficients}
-    end
-
-    # json
-
-    def self.load_json(data)
-      data = JSON.parse(data) if data.is_a?(String)
-      coefficients = data["coefficients"]
-
-      # for R models
-      if coefficients["(Intercept)"]
-        coefficients = coefficients.dup
-        coefficients["_intercept"] = coefficients.delete("(Intercept)")
-      end
-
-      BaseRegressor.new(coefficients: coefficients)
-    end
-
-    def to_json
-      JSON.generate(dump)
-    end
-
-    # pmml
-
-    def self.load_pmml(data)
-      if data.is_a?(String)
-        require "nokogiri"
-        data = Nokogiri::XML(data)
-      end
-
-      # TODO more validation
-      node = data.css("RegressionTable")
-      coefficients = {
-        _intercept: node.attribute("intercept").value.to_f
-      }
-      node.css("NumericPredictor").each do |n|
-        coefficients[n.attribute("name").value] = n.attribute("coefficient").value.to_f
-      end
-      node.css("CategoricalPredictor").each do |n|
-        coefficients["#{n.attribute("name").value}#{n.attribute("value").value}"] = n.attribute("coefficient").value.to_f
-      end
-      BaseRegressor.new(coefficients: coefficients)
-    end
-
-    # pfa
-
-    def self.load_pfa(data)
-      data = JSON.parse(data) if data.is_a?(String)
-      init = data["cells"].first[1]["init"]
-      names =
-        if data["input"]["fields"]
-          data["input"]["fields"].map { |f| f["name"] }
-        else
-          init["coeff"].map.with_index { |_, i| "x#{i}" }
-        end
-      coefficients = {
-        _intercept: init["const"]
-      }
-      init["coeff"].each_with_index do |c, i|
-        name = names[i]
-        # R can export coefficients with same name
-        raise "Coefficients with same name" if coefficients[name]
-        coefficients[name] = c
-      end
-      BaseRegressor.new(coefficients: coefficients)
-    end
-
-    protected
-
-    def daru?(x)
-      defined?(Daru) && x.is_a?(Daru::DataFrame)
-    end
-
-    def prep_x(x, train: true)
-      if daru?(x)
-        x = x.to_a[0]
-      else
-        x = x.map do |xi|
-          case xi
-          when Hash
-            xi
-          when Array
-            Hash[xi.map.with_index { |v, i| [:"x#{i}", v] }]
-          else
-            {x0: xi}
-          end
-        end
-      end
-
-      # if !train && x.any?
-      #   # check first row against coefficients
-      #   ckeys = coefficients.keys.map(&:to_s)
-      #   bad_keys = x[0].keys.map(&:to_s).reject { |k| ckeys.any? { |c| c.start_with?(k) } }
-      #   raise "Unknown keys: #{bad_keys.join(", ")}" if bad_keys.any?
-      # end
-
-      cache = {}
-      first_key = {}
-      i = 0
-      rows = []
-      x.each do |xi|
-        row = {}
-        xi.each do |k, v|
-          key = v.is_a?(String) ? [k.to_sym, v] : k.to_sym
-          v2 = v.is_a?(String) ? 1 : v
-
-          # TODO make more efficient
-          next if !train && !coefficients.key?(symbolize_coef(key))
-
-          raise "Missing data" if v2.nil?
-
-          unless cache[key]
-            cache[key] = i
-            first_key[k] ||= key if v.is_a?(String)
-            i += 1
-          end
-
-          row[key] = v2
-        end
-        rows << row
-      end
-
-      if train
-        # remove one degree of freedom
-        first_key.values.each do |v|
-          num = cache.delete(v)
-          cache.each do |k, v2|
-            cache[k] -= 1 if v2 > num
-          end
-        end
-      end
-
-      ret2 = []
-      rows.each do |row|
-        ret = [0] * cache.size
-        row.each do |k, v|
-          if cache[k]
-            ret[cache[k]] = v
-          end
-        end
-        ret2 << ([1] + ret)
-      end
-
-      # flatten keys
-      c = [:_intercept] + cache.sort_by { |_, v| v }.map { |k, _| symbolize_coef(k) }
-
-      if c.size != c.uniq.size
-        raise "Overlapping coefficients"
-      end
-
-      [ret2, c]
-    end
-
-    def symbolize_coef(k)
-      (k.is_a?(Array) ? k.join("") : k).to_sym
-    end
-
-    def matrix_arr(matrix)
-      matrix.to_a.map { |xi| xi[0].to_f }
-    end
-
-    # determine if target is a string or symbol
-    def prep_target(target, data)
-      if daru?(data)
-        data.has_vector?(target) ? target : flip_target(target)
-      else
-        x = data[0] || {}
-        x[target] ? target : flip_target(target)
-      end
-    end
-
-    def flip_target(target)
-      target.is_a?(String) ? target.to_sym : target.to_s
-    end
-
-    def prep_y(y)
-      y.each do |yi|
-        raise "Target missing in data" if yi.nil?
-      end
-      y.map(&:to_f)
-    end
-  end
-end
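
For reference, a minimal sketch (not part of this diff) of the removed class used in isolation, relying only on methods visible in the deleted file above; the coefficient names and values are illustrative:

```ruby
require "eps"

# a hand-built model: intercept plus one numeric feature (illustrative values)
model = Eps::BaseRegressor.new(coefficients: {_intercept: 3.0, x0: 5.0})

model.predict(x0: 2)  # => 13.0 (1 * 3.0 + 2 * 5.0)
model.to_json         # => {"coefficients":{"_intercept":3.0,"x0":5.0}}
```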