eps 0.1.1 → 0.2.0

Sign up to get free protection for your applications and to get access to all the features.
data/Gemfile DELETED
@@ -1,11 +0,0 @@
# Gems are fetched from RubyGems unless a different source is given below.
source "https://rubygems.org"

# Expands the `github: "owner/repo"` shorthand into an HTTPS repository URL.
git_source(:github) do |repo_name|
  "https://github.com/#{repo_name}"
end

# Specify your gem's dependencies in eps.gemspec
gemspec

# remove when 0.2.1 released
gem "daru", github: "sciruby/daru"

# The gsl gem is only added to the bundle when the GSL environment variable is set.
gem "gsl" if ENV["GSL"]
data/Rakefile DELETED
@@ -1,34 +0,0 @@
require "bundler/gem_tasks"
require "rake/testtask"

# `rake test`: runs every test/**/*_test.rb file with test/ and lib/ on the
# load path and Ruby warnings turned off.
Rake::TestTask.new(:test) do |t|
  t.libs.push("test", "lib")
  t.test_files = FileList["test/**/*_test.rb"]
  t.warning = false
end

task default: :test

# `rake benchmark`: times the construction of an Eps::Regressor on
# 10000 random rows with 30 numeric features (x0..x29) and a numeric target y.
task :benchmark do
  require "benchmark"
  require "eps"
  require "gsl" if ENV["GSL"]

  data = Array.new(10000) do
    row = {}
    30.times { |i| row[:"x#{i}"] = rand(100) }
    row[:y] = rand(100)
    row
  end

  puts "Starting benchmark..."

  time = Benchmark.realtime { Eps::Regressor.new(data, target: :y) }
  p time.round(1)
end
data/eps.gemspec DELETED
@@ -1,30 +0,0 @@

# Put this gem's lib/ directory on the load path so the version file can be required.
lib = File.expand_path("../lib", __FILE__)
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
require "eps/version"

Gem::Specification.new do |spec|
  spec.name          = "eps"
  spec.version       = Eps::VERSION
  spec.authors       = ["Andrew Kane"]
  spec.email         = ["andrew@chartkick.com"]

  spec.summary       = "Linear regression for Ruby"
  spec.homepage      = "https://github.com/ankane/eps"
  spec.license       = "MIT"

  # Specify which files should be added to the gem when it is released.
  # The `git ls-files -z` loads the files in the RubyGem that have been added into git.
  # Anything under test/, spec/ or features/ is left out of the package.
  spec.files = Dir.chdir(File.expand_path('..', __FILE__)) do
    tracked = `git ls-files -z`.split("\x0")
    tracked.reject { |f| f.match(%r{^(test|spec|features)/}) }
  end
  spec.bindir        = "exe"
  spec.executables   = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
  spec.require_paths = ["lib"]

  # Development-only dependencies, declared in the same order as before.
  %w[bundler daru minitest nokogiri rake].each do |dep|
    spec.add_development_dependency dep
  end
end
data/guides/Modeling.md DELETED
@@ -1,152 +0,0 @@
1
- # Modeling
2
-
3
- - [R JSON](#r-json)
4
- - [R PMML](#r-pmml)
5
- - [R PFA](#r-pfa)
6
- - [Python JSON](#python-json)
7
- - [Python PMML](#python-pmml)
8
- - [Python PFA](#python-pfa)
9
-
10
- ## R JSON
11
-
12
- Install the [jsonlite](https://cran.r-project.org/package=jsonlite) package
13
-
14
- ```r
15
- install.packages("jsonlite")
16
- ```
17
-
18
- And run:
19
-
20
- ```r
21
- library(jsonlite)
22
-
23
- model <- lm(dist ~ speed, cars)
24
- data <- toJSON(list(coefficients=as.list(coef(model))), auto_unbox=TRUE)
25
- write(data, file="model.json")
26
- ```
27
-
28
- ## R PMML
29
-
30
- Install the [pmml](https://cran.r-project.org/package=pmml) package
31
-
32
- ```r
33
- install.packages("pmml")
34
- ```
35
-
36
- And run:
37
-
38
- ```r
39
- library(pmml)
40
-
41
- model <- lm(dist ~ speed, cars)
42
- data <- toString(pmml(model))
43
- write(data, file="model.pmml")
44
- ```
45
-
46
- ## R PFA
47
-
48
- Install the [aurelius](https://cran.r-project.org/package=aurelius) package
49
-
50
- ```r
51
- install.packages("aurelius")
52
- ```
53
-
54
- And run:
55
-
56
- ```r
57
- library(aurelius)
58
-
59
- model <- lm(dist ~ speed, cars)
60
- write_pfa(pfa(model), file="model.pfa")
61
- ```
62
-
63
- ## Python JSON
64
-
65
- Run:
66
-
67
- ```python
68
- from sklearn import linear_model
69
- import pandas as pd
70
- import json
71
-
72
- x = [1, 2, 3, 4, 5]
73
- y = [5 * xi + 3 for xi in x]
74
-
75
- df = pd.DataFrame({'x': x, 'y': y})
76
- features = ['x']
77
-
78
- model = linear_model.LinearRegression()
79
- model.fit(df[features], df['y'])
80
-
81
- coefficients = {'_intercept': model.intercept_}
82
- for i, c in enumerate(model.coef_):
83
- coefficients[features[i]] = c
84
-
85
-
86
- data = json.dumps({'coefficients': coefficients})
87
-
88
- with open('model.json', 'w') as f:
89
- f.write(data)
90
- ```
91
-
92
- ## Python PMML
93
-
94
- Install the [scikit2pmml](https://github.com/vaclavcadek/scikit2pmml) package
95
-
96
- ```sh
97
- pip install scikit2pmml
98
- ```
99
-
100
- And run:
101
-
102
- ```python
103
- from sklearn import linear_model
104
- from scikit2pmml import scikit2pmml
105
-
106
- x = [1, 2, 3, 5, 6]
107
- y = [5 * xi + 3 for xi in x]
108
-
109
- model = linear_model.LinearRegression()
110
- model.fit([[xi] for xi in x], y)
111
-
112
- scikit2pmml(estimator=model, file='model.pmml')
113
- ```
114
-
115
- ## Python PFA
116
-
117
- Install the [Titus](https://github.com/opendatagroup/hadrian) package and run:
118
-
119
- ```python
120
- from sklearn import linear_model
121
- import titus.prettypfa
122
- import json
123
-
124
- x = [1, 2, 3, 5, 6]
125
- y = [5 * xi + 3 for xi in x]
126
-
127
- model = linear_model.LinearRegression()
128
- model.fit([[xi] for xi in x], y)
129
-
130
- def pfa(estimator):
131
- pfaDocument = titus.prettypfa.jsonNode('''
132
- types:
133
- Regression = record(Regression,
134
- const: double,
135
- coeff: array(double))
136
- input: array(double)
137
- output: double
138
- cells:
139
- regression(Regression) = {const: 0.0, coeff: []}
140
- action:
141
- model.reg.linear(input, regression)
142
- ''')
143
-
144
- pfaDocument["cells"]["regression"]["init"] = {"const": estimator.intercept_, "coeff": list(estimator.coef_)}
145
-
146
- return pfaDocument
147
-
148
- data = json.dumps(pfa(model))
149
-
150
- with open('model.pfa', 'w') as f:
151
- f.write(data)
152
- ```
@@ -1,232 +0,0 @@
module Eps
  # Linear regression model that is fully described by a hash of named
  # coefficients.
  #
  # Coefficient names are stored as symbols. The intercept lives under
  # +:_intercept+. A string-valued (categorical) feature is matched against a
  # coefficient whose name is the feature name joined with the value, e.g.
  # +color: "red"+ looks up +:colorred+.
  class BaseRegressor
    attr_reader :coefficients

    # @param coefficients [Hash] coefficient name (String or Symbol) => value;
    #   names are converted to symbols
    def initialize(coefficients:)
      @coefficients = coefficients.map { |k, v| [k.to_sym, v] }.to_h
    end

    # Predicts the target for one observation or for a collection of them.
    #
    # @param x [Hash, Numeric, Array, Daru::DataFrame] a single observation
    #   (anything that is not an Array or Daru data frame), or an Array /
    #   data frame of observations. Array elements may be Hashes, Arrays
    #   (features named x0, x1, ...) or scalars (feature x0).
    # @return [Float, Array<Float>] a single prediction for a single
    #   observation, otherwise one prediction per row ([] for no rows)
    # @raise [RuntimeError] "Missing data" when a used feature is nil
    def predict(x)
      singular = !(x.is_a?(Array) || daru?(x))
      x = [x] if singular
      rows, names = prep_x(x, train: false)

      # A design matrix without rows cannot be multiplied with the
      # coefficient vector (dimension mismatch), so answer directly.
      return [] if rows.empty?

      coef = names.map do |name|
        # use 0 if coefficient does not exist
        # this can happen for categorical features
        # since only n-1 coefficients are stored
        coefficients[name] || 0
      end

      pred = matrix_arr(Matrix.rows(rows) * Matrix.column_vector(coef))

      singular ? pred[0] : pred
    end

    # Compares predictions for +data+ with the actual target values.
    #
    # @param data [Array, Daru::DataFrame] observations
    # @param y [Array, nil] actual values; when nil they are read from
    #   +data+ using +target+
    # @param target [Object, nil] key (or data frame vector name) of the
    #   actual values inside +data+
    # @return whatever Eps.metrics returns for (actual, estimated)
    # @raise [ArgumentError] when neither +y+ nor +target+ is given
    # @raise [RuntimeError] "Target missing in data" when an actual value is nil
    def evaluate(data, y = nil, target: nil)
      raise ArgumentError, "missing target" if !target && !y

      actual = y
      actual ||=
        if daru?(data)
          data[target].to_a
        else
          data.map { |v| v[target] }
        end

      actual = prep_y(actual)
      estimated = predict(data)
      Eps.metrics(actual, estimated)
    end

    # ruby

    # Rebuilds a model from the hash produced by #dump.
    #
    # @param data [Hash] hash with a "coefficients" / :coefficients entry
    # @return [BaseRegressor]
    def self.load(data)
      # The initializer only takes keywords, so the symbol-keyed hash has to
      # be splatted explicitly (a positional hash is rejected on Ruby 3).
      BaseRegressor.new(**data.map { |k, v| [k.to_sym, v] }.to_h)
    end

    # @return [Hash] plain-Ruby representation accepted by .load
    def dump
      {coefficients: coefficients}
    end

    # json

    # Builds a model from JSON of the form {"coefficients": {...}}.
    #
    # @param data [String, Hash] JSON text or an already parsed hash
    # @return [BaseRegressor]
    def self.load_json(data)
      data = JSON.parse(data) if data.is_a?(String)
      coefficients = data["coefficients"]

      # for R models
      if coefficients["(Intercept)"]
        coefficients = coefficients.dup
        coefficients["_intercept"] = coefficients.delete("(Intercept)")
      end

      BaseRegressor.new(coefficients: coefficients)
    end

    # Serializes #dump as JSON. Extra arguments (such as the generator state
    # passed when the model is nested inside another object being
    # serialized) are accepted and ignored.
    #
    # @return [String]
    def to_json(*_args)
      JSON.generate(dump)
    end

    # pmml

    # Builds a model from the RegressionTable of a PMML document.
    #
    # @param data [String, Object] PMML text (parsed with Nokogiri, which is
    #   loaded on demand) or an already parsed document responding to #css
    # @return [BaseRegressor]
    def self.load_pmml(data)
      if data.is_a?(String)
        require "nokogiri"
        data = Nokogiri::XML(data)
      end

      # TODO more validation
      node = data.css("RegressionTable")
      coefficients = {
        _intercept: node.attribute("intercept").value.to_f
      }
      node.css("NumericPredictor").each do |n|
        coefficients[n.attribute("name").value] = n.attribute("coefficient").value.to_f
      end
      node.css("CategoricalPredictor").each do |n|
        # categorical coefficients are named <feature name><value>
        coefficients["#{n.attribute("name").value}#{n.attribute("value").value}"] = n.attribute("coefficient").value.to_f
      end
      BaseRegressor.new(coefficients: coefficients)
    end

    # pfa

    # Builds a model from a PFA document. The first cell's "init" entry
    # supplies "const" (intercept) and "coeff" (coefficients in input order).
    #
    # @param data [String, Hash] PFA JSON text or an already parsed hash
    # @return [BaseRegressor]
    # @raise [RuntimeError] when two coefficients share a name
    def self.load_pfa(data)
      data = JSON.parse(data) if data.is_a?(String)
      init = data["cells"].first[1]["init"]
      names =
        if data["input"]["fields"]
          data["input"]["fields"].map { |f| f["name"] }
        else
          # unnamed inputs are called x0, x1, ...
          init["coeff"].map.with_index { |_, i| "x#{i}" }
        end
      coefficients = {
        _intercept: init["const"]
      }
      init["coeff"].each_with_index do |c, i|
        name = names[i]
        # R can export coefficients with same name
        raise "Coefficients with same name" if coefficients.key?(name)
        coefficients[name] = c
      end
      BaseRegressor.new(coefficients: coefficients)
    end

    protected

    # True when Daru is loaded and +x+ is a Daru::DataFrame.
    def daru?(x)
      defined?(Daru) && x.is_a?(Daru::DataFrame)
    end

    # Converts observations into a numeric design matrix.
    #
    # @param x [Array, Daru::DataFrame] observations (Hashes, Arrays or scalars)
    # @param train [Boolean] when false, features without a matching entry
    #   in #coefficients are ignored; when true, the first value seen for
    #   each string-valued feature is dropped from the columns
    # @return [Array] two elements: the rows (each an Array starting with 1
    #   for the intercept) and the column names as symbols, starting with
    #   :_intercept
    # @raise [RuntimeError] "Missing data" for a nil feature value,
    #   "Overlapping coefficients" when two columns flatten to the same name
    def prep_x(x, train: true)
      if daru?(x)
        x = x.to_a[0]
      else
        x = x.map do |xi|
          case xi
          when Hash
            xi
          when Array
            Hash[xi.map.with_index { |v, i| [:"x#{i}", v] }]
          else
            {x0: xi}
          end
        end
      end

      # if !train && x.any?
      #   # check first row against coefficients
      #   ckeys = coefficients.keys.map(&:to_s)
      #   bad_keys = x[0].keys.map(&:to_s).reject { |k| ckeys.any? { |c| c.start_with?(k) } }
      #   raise "Unknown keys: #{bad_keys.join(", ")}" if bad_keys.any?
      # end

      cache = {}     # column key => column index (order of first appearance)
      first_key = {} # string-valued feature => first [feature, value] key seen
      rows = []
      x.each do |xi|
        row = {}
        xi.each do |k, v|
          categorical = v.is_a?(String)
          # a string value becomes an indicator column keyed by [feature, value]
          key = categorical ? [k.to_sym, v] : k.to_sym
          v2 = categorical ? 1 : v

          # TODO make more efficient
          next if !train && !coefficients.key?(symbolize_coef(key))

          raise "Missing data" if v2.nil?

          unless cache.key?(key)
            # nothing is removed from cache inside this loop, so its size is
            # the next free column index
            cache[key] = cache.size
            first_key[k] ||= key if categorical
          end

          row[key] = v2
        end
        rows << row
      end

      if train
        # remove one degree of freedom
        first_key.values.each do |v|
          num = cache.delete(v)
          # close the gap left by the removed column
          cache.each do |k, v2|
            cache[k] -= 1 if v2 > num
          end
        end
      end

      ret2 = rows.map do |row|
        ret = [0] * cache.size
        row.each do |k, v|
          ret[cache[k]] = v if cache[k]
        end
        [1] + ret
      end

      # flatten keys
      c = [:_intercept] + cache.sort_by { |_, v| v }.map { |k, _| symbolize_coef(k) }

      raise "Overlapping coefficients" if c.size != c.uniq.size

      [ret2, c]
    end

    # Flattens a column key into a coefficient name:
    # [:color, "red"] => :colorred, :x => :x.
    def symbolize_coef(k)
      (k.is_a?(Array) ? k.join("") : k).to_sym
    end

    # First column of +matrix+ as an Array of Floats.
    def matrix_arr(matrix)
      matrix.to_a.map { |xi| xi[0].to_f }
    end

    # determine if target is a string or symbol
    #
    # Returns +target+ when the data (a data frame vector, or the first row)
    # has it, otherwise the String/Symbol counterpart of +target+.
    def prep_target(target, data)
      if daru?(data)
        data.has_vector?(target) ? target : flip_target(target)
      else
        x = data[0] || {}
        x[target] ? target : flip_target(target)
      end
    end

    # String => Symbol, anything else => String.
    def flip_target(target)
      target.is_a?(String) ? target.to_sym : target.to_s
    end

    # Validates that no target value is nil and converts them all to Floats.
    def prep_y(y)
      y.each do |yi|
        raise "Target missing in data" if yi.nil?
      end
      y.map(&:to_f)
    end
  end
end