eps 0.1.1 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/Gemfile DELETED
@@ -1,11 +0,0 @@
1
source "https://rubygems.org"

# Resolve `github: "owner/repo"` shorthands to HTTPS clone URLs
git_source(:github) do |repo_name|
  "https://github.com/#{repo_name}"
end

# Pull in the dependencies declared in eps.gemspec
gemspec

# Temporary GitHub source for daru (original note: remove when 0.2.1 released)
gem "daru", github: "sciruby/daru"

# GSL is only installed when the GSL environment variable is set
if ENV["GSL"]
  gem "gsl"
end
data/Rakefile DELETED
@@ -1,34 +0,0 @@
1
require "bundler/gem_tasks"
require "rake/testtask"

# `rake test` runs every test/**/*_test.rb file with warnings turned off
Rake::TestTask.new(:test) do |test_task|
  test_task.libs.push("test", "lib")
  test_task.test_files = FileList["test/**/*_test.rb"]
  test_task.warning = false
end

task default: :test

# `rake benchmark` times the construction of a regressor on random data
# (10000 rows, 30 integer features x0..x29 plus the target :y)
task :benchmark do
  require "benchmark"
  require "eps"
  require "gsl" if ENV["GSL"]

  data = Array.new(10000) do
    row = (0...30).each_with_object({}) do |i, features|
      features[:"x#{i}"] = rand(100)
    end
    row[:y] = rand(100)
    row
  end

  puts "Starting benchmark..."

  elapsed = Benchmark.realtime { Eps::Regressor.new(data, target: :y) }
  p elapsed.round(1)
end
data/eps.gemspec DELETED
@@ -1,30 +0,0 @@
1
-
2
# Make lib/ loadable so the version constant can be read below
lib_dir = File.expand_path("../lib", __FILE__)
$LOAD_PATH.unshift(lib_dir) unless $LOAD_PATH.include?(lib_dir)
require "eps/version"

Gem::Specification.new do |spec|
  spec.name          = "eps"
  spec.version       = Eps::VERSION
  spec.authors       = ["Andrew Kane"]
  spec.email         = ["andrew@chartkick.com"]

  spec.summary       = "Linear regression for Ruby"
  spec.homepage      = "https://github.com/ankane/eps"
  spec.license       = "MIT"

  # Package every file tracked by git, except tests/specs/features.
  spec.files = Dir.chdir(File.expand_path('..', __FILE__)) do
    tracked = `git ls-files -z`.split("\x0")
    tracked.reject { |path| path.match(%r{^(test|spec|features)/}) }
  end
  spec.bindir        = "exe"
  spec.executables   = spec.files.grep(%r{^exe/}) { |path| File.basename(path) }
  spec.require_paths = ["lib"]

  # Development-only dependencies, registered in the same order as before
  %w[bundler daru minitest nokogiri rake].each do |dep|
    spec.add_development_dependency dep
  end
end
data/guides/Modeling.md DELETED
@@ -1,152 +0,0 @@
1
- # Modeling
2
-
3
- - [R JSON](#r-json)
4
- - [R PMML](#r-pmml)
5
- - [R PFA](#r-pfa)
6
- - [Python JSON](#python-json)
7
- - [Python PMML](#python-pmml)
8
- - [Python PFA](#python-pfa)
9
-
10
- ## R JSON
11
-
12
- Install the [jsonlite](https://cran.r-project.org/package=jsonlite) package
13
-
14
- ```r
15
- install.packages("jsonlite")
16
- ```
17
-
18
- And run:
19
-
20
- ```r
21
- library(jsonlite)
22
-
23
- model <- lm(dist ~ speed, cars)
24
- data <- toJSON(list(coefficients=as.list(coef(model))), auto_unbox=TRUE)
25
- write(data, file="model.json")
26
- ```
27
-
28
- ## R PMML
29
-
30
- Install the [pmml](https://cran.r-project.org/package=pmml) package
31
-
32
- ```r
33
- install.packages("pmml")
34
- ```
35
-
36
- And run:
37
-
38
- ```r
39
- library(pmml)
40
-
41
- model <- lm(dist ~ speed, cars)
42
- data <- toString(pmml(model))
43
- write(data, file="model.pmml")
44
- ```
45
-
46
- ## R PFA
47
-
48
- Install the [aurelius](https://cran.r-project.org/package=aurelius) package
49
-
50
- ```r
51
- install.packages("aurelius")
52
- ```
53
-
54
- And run:
55
-
56
- ```r
57
- library(aurelius)
58
-
59
- model <- lm(dist ~ speed, cars)
60
- write_pfa(pfa(model), file="model.pfa")
61
- ```
62
-
63
- ## Python JSON
64
-
65
- Run:
66
-
67
- ```python
68
- from sklearn import linear_model
69
- import pandas as pd
70
- import json
71
-
72
- x = [1, 2, 3, 4, 5]
73
- y = [5 * xi + 3 for xi in x]
74
-
75
- df = pd.DataFrame({'x': x, 'y': y})
76
- features = ['x']
77
-
78
- model = linear_model.LinearRegression()
79
- model.fit(df[features], df['y'])
80
-
81
- coefficients = {'_intercept': model.intercept_}
82
- for i, c in enumerate(model.coef_):
83
- coefficients[features[i]] = c
84
-
85
-
86
- data = json.dumps({'coefficients': coefficients})
87
-
88
- with open('model.json', 'w') as f:
89
- f.write(data)
90
- ```
91
-
92
- ## Python PMML
93
-
94
- Install the [scikit2pmml](https://github.com/vaclavcadek/scikit2pmml) package
95
-
96
- ```sh
97
- pip install scikit2pmml
98
- ```
99
-
100
- And run:
101
-
102
- ```python
103
- from sklearn import linear_model
104
- from scikit2pmml import scikit2pmml
105
-
106
- x = [1, 2, 3, 5, 6]
107
- y = [5 * xi + 3 for xi in x]
108
-
109
- model = linear_model.LinearRegression()
110
- model.fit([[xi] for xi in x], y)
111
-
112
- scikit2pmml(estimator=model, file='model.pmml')
113
- ```
114
-
115
- ## Python PFA
116
-
117
- Install the [Titus](https://github.com/opendatagroup/hadrian) package and run:
118
-
119
- ```python
120
- from sklearn import linear_model
121
- import titus.prettypfa
122
- import json
123
-
124
- x = [1, 2, 3, 5, 6]
125
- y = [5 * xi + 3 for xi in x]
126
-
127
- model = linear_model.LinearRegression()
128
- model.fit([[xi] for xi in x], y)
129
-
130
- def pfa(estimator):
131
- pfaDocument = titus.prettypfa.jsonNode('''
132
- types:
133
- Regression = record(Regression,
134
- const: double,
135
- coeff: array(double))
136
- input: array(double)
137
- output: double
138
- cells:
139
- regression(Regression) = {const: 0.0, coeff: []}
140
- action:
141
- model.reg.linear(input, regression)
142
- ''')
143
-
144
- pfaDocument["cells"]["regression"]["init"] = {"const": estimator.intercept_, "coeff": list(estimator.coef_)}
145
-
146
- return pfaDocument
147
-
148
- data = json.dumps(pfa(model))
149
-
150
- with open('model.pfa', 'w') as f:
151
- f.write(data)
152
- ```
@@ -1,232 +0,0 @@
1
module Eps
  # Linear regression model defined entirely by a hash of coefficients.
  #
  # Coefficient names are stored as symbols. The intercept is kept under
  # :_intercept. A String-valued (categorical) feature is looked up under
  # the feature name joined with its value (see #symbolize_coef).
  class BaseRegressor
    attr_reader :coefficients

    # coefficients - Hash of name (String or Symbol) => coefficient value.
    #                Names are converted to symbols.
    def initialize(coefficients:)
      @coefficients = coefficients.map { |k, v| [k.to_sym, v] }.to_h
    end

    # Predicts the target for one observation or a batch.
    #
    # x - an Array of observations (Hash, Array or scalar each), a
    #     Daru::DataFrame, or a single non-Array observation.
    #
    # Returns an Array of Floats for Array/DataFrame input and a single
    # Float otherwise. An empty Array yields an empty Array.
    def predict(x)
      singular = !(x.is_a?(Array) || daru?(x))
      x = [x] if singular

      # nothing to predict for an empty batch; without this guard the
      # matrix product below raises a dimension mismatch
      return [] if x.is_a?(Array) && x.empty?

      x, c = prep_x(x, train: false)
      coef = c.map do |v|
        # use 0 if coefficient does not exist
        # this can happen for categorical features
        # since only n-1 coefficients are stored
        coefficients[v] || 0
      end

      x = Matrix.rows(x)
      c = Matrix.column_vector(coef)
      pred = matrix_arr(x * c)

      singular ? pred[0] : pred
    end

    # Compares predictions for data against the actual target values.
    #
    # data   - observations accepted by #predict
    # y      - optional Array of actual values
    # target - key/vector name used to read the actual values from data
    #          when y is not given
    #
    # Returns whatever Eps.metrics returns (defined outside this class).
    # Raises ArgumentError when neither y nor target is given.
    def evaluate(data, y = nil, target: nil)
      raise ArgumentError, "missing target" if !target && !y

      actual = y
      actual ||=
        if daru?(data)
          data[target].to_a
        else
          data.map { |v| v[target] }
        end

      actual = prep_y(actual)
      estimated = predict(data)
      Eps.metrics(actual, estimated)
    end

    # ruby

    # Rebuilds a model from the Hash produced by #dump. Keys may be
    # strings or symbols.
    def self.load(data)
      # initialize only accepts keywords, so the hash must be splatted
      # explicitly; an implicit hash-to-keywords conversion no longer
      # happens on Ruby 3+
      BaseRegressor.new(**data.map { |k, v| [k.to_sym, v] }.to_h)
    end

    # Returns a plain Hash representation: {coefficients: {...}}.
    def dump
      {coefficients: coefficients}
    end

    # json

    # Builds a model from a JSON String or an already parsed Hash with a
    # "coefficients" entry.
    def self.load_json(data)
      data = JSON.parse(data) if data.is_a?(String)
      coefficients = data["coefficients"]

      # for R models
      if coefficients["(Intercept)"]
        coefficients = coefficients.dup
        coefficients["_intercept"] = coefficients.delete("(Intercept)")
      end

      BaseRegressor.new(coefficients: coefficients)
    end

    # Serializes #dump as JSON. Accepts (and forwards) the optional
    # generator state so a model can also be nested inside another
    # structure passed to JSON.generate.
    def to_json(*args)
      dump.to_json(*args)
    end

    # pmml

    # Builds a model from a PMML String (parsed with Nokogiri, which is
    # required lazily) or from an already parsed document.
    def self.load_pmml(data)
      if data.is_a?(String)
        require "nokogiri"
        data = Nokogiri::XML(data)
      end

      # TODO more validation
      node = data.css("RegressionTable")
      coefficients = {
        _intercept: node.attribute("intercept").value.to_f
      }
      node.css("NumericPredictor").each do |n|
        coefficients[n.attribute("name").value] = n.attribute("coefficient").value.to_f
      end
      node.css("CategoricalPredictor").each do |n|
        coefficients["#{n.attribute("name").value}#{n.attribute("value").value}"] = n.attribute("coefficient").value.to_f
      end
      BaseRegressor.new(coefficients: coefficients)
    end

    # pfa

    # Builds a model from a PFA String or an already parsed Hash. The
    # first cell's "init" supplies "const" and "coeff"; names come from
    # the input record fields when present, otherwise x0, x1, ...
    #
    # Raises RuntimeError when two coefficients share a name.
    def self.load_pfa(data)
      data = JSON.parse(data) if data.is_a?(String)
      init = data["cells"].first[1]["init"]
      names =
        if data["input"]["fields"]
          data["input"]["fields"].map { |f| f["name"] }
        else
          init["coeff"].map.with_index { |_, i| "x#{i}" }
        end
      coefficients = {
        _intercept: init["const"]
      }
      init["coeff"].each_with_index do |c, i|
        name = names[i]
        # R can export coefficients with same name
        # (checked by key presence so the stored value does not matter)
        raise "Coefficients with same name" if coefficients.key?(name)
        coefficients[name] = c
      end
      BaseRegressor.new(coefficients: coefficients)
    end

    protected

    # True when Daru is loaded and x is a Daru::DataFrame.
    def daru?(x)
      defined?(Daru) && x.is_a?(Daru::DataFrame)
    end

    # Turns observations into a design matrix.
    #
    # x     - Array of Hash/Array/scalar observations or a Daru::DataFrame
    # train - when true, the first seen value of every String feature is
    #         dropped as the reference level; when false, features
    #         without a stored coefficient are skipped
    #
    # Returns [rows, names]: each row is [1, *values] and names is
    # [:_intercept, *feature names] in matching order.
    # Raises RuntimeError on nil values or overlapping coefficient names.
    def prep_x(x, train: true)
      if daru?(x)
        # presumably Daru::DataFrame#to_a returns [rows, index] - TODO confirm
        x = x.to_a[0]
      else
        x = x.map do |xi|
          case xi
          when Hash
            xi
          when Array
            Hash[xi.map.with_index { |v, i| [:"x#{i}", v] }]
          else
            {x0: xi}
          end
        end
      end

      # if !train && x.any?
      #   # check first row against coefficients
      #   ckeys = coefficients.keys.map(&:to_s)
      #   bad_keys = x[0].keys.map(&:to_s).reject { |k| ckeys.any? { |c| c.start_with?(k) } }
      #   raise "Unknown keys: #{bad_keys.join(", ")}" if bad_keys.any?
      # end

      cache = {}     # feature key => column index
      first_key = {} # String feature => first [feature, value] key seen
      i = 0
      rows = []
      x.each do |xi|
        row = {}
        xi.each do |k, v|
          # String values become indicator columns keyed by [feature, value]
          key = v.is_a?(String) ? [k.to_sym, v] : k.to_sym
          v2 = v.is_a?(String) ? 1 : v

          # TODO make more efficient
          next if !train && !coefficients.key?(symbolize_coef(key))

          raise "Missing data" if v2.nil?

          unless cache[key]
            cache[key] = i
            first_key[k] ||= key if v.is_a?(String)
            i += 1
          end

          row[key] = v2
        end
        rows << row
      end

      if train
        # remove one degree of freedom
        first_key.values.each do |v|
          num = cache.delete(v)
          cache.each do |k, v2|
            cache[k] -= 1 if v2 > num
          end
        end
      end

      ret2 = []
      rows.each do |row|
        ret = [0] * cache.size
        row.each do |k, v|
          if cache[k]
            ret[cache[k]] = v
          end
        end
        # leading 1 is the intercept column
        ret2 << ([1] + ret)
      end

      # flatten keys
      c = [:_intercept] + cache.sort_by { |_, v| v }.map { |k, _| symbolize_coef(k) }

      if c.size != c.uniq.size
        raise "Overlapping coefficients"
      end

      [ret2, c]
    end

    # Converts a feature key (Symbol/String, or [feature, value] pair) to
    # the Symbol used in the coefficients hash.
    def symbolize_coef(k)
      (k.is_a?(Array) ? k.join("") : k).to_sym
    end

    # First column of a matrix as an Array of Floats.
    def matrix_arr(matrix)
      matrix.to_a.map { |xi| xi[0].to_f }
    end

    # determine if target is a string or symbol
    def prep_target(target, data)
      if daru?(data)
        data.has_vector?(target) ? target : flip_target(target)
      else
        x = data[0] || {}
        x[target] ? target : flip_target(target)
      end
    end

    # String <-> Symbol counterpart of target.
    def flip_target(target)
      target.is_a?(String) ? target.to_sym : target.to_s
    end

    # Validates that no target value is nil and converts all to Float.
    def prep_y(y)
      y.each do |yi|
        raise "Target missing in data" if yi.nil?
      end
      y.map(&:to_f)
    end
  end
end