eps 0.1.1 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +6 -0
- data/README.md +235 -84
- data/lib/eps.rb +9 -4
- data/lib/eps/base.rb +19 -0
- data/lib/eps/base_estimator.rb +84 -0
- data/lib/eps/linear_regression.rb +558 -0
- data/lib/eps/model.rb +108 -0
- data/lib/eps/naive_bayes.rb +240 -0
- data/lib/eps/version.rb +1 -1
- metadata +13 -18
- data/.gitignore +0 -9
- data/.travis.yml +0 -15
- data/Gemfile +0 -11
- data/Rakefile +0 -34
- data/eps.gemspec +0 -30
- data/guides/Modeling.md +0 -152
- data/lib/eps/base_regressor.rb +0 -232
- data/lib/eps/metrics.rb +0 -35
- data/lib/eps/regressor.rb +0 -314
data/lib/eps/metrics.rb
DELETED
@@ -1,35 +0,0 @@
|
|
1
|
-
module Eps
  # Error metrics for a set of predictions.
  #
  # The per-sample errors are computed as actual - estimated and exposed via
  # #errors; #all returns the aggregate metrics derived from them.
  class Metrics
    attr_reader :errors

    # actual    - observed values (anything responding to to_a)
    # estimated - predicted values, one per observed value
    #
    # Raises ArgumentError when the two collections differ in length.
    # Without this check, zip would silently ignore surplus estimates, or
    # fail with NoMethodError on nil when estimates are missing.
    def initialize(actual, estimated)
      actual = actual.to_a
      estimated = estimated.to_a

      if actual.size != estimated.size
        raise ArgumentError, "Number of actual values differs from number of estimated values"
      end

      @errors = actual.zip(estimated).map { |yi, yi2| yi - yi2 }
    end

    # Returns a hash with the root mean squared error (:rmse), the mean
    # absolute error (:mae) and the mean error (:me).
    def all
      {
        rmse: rmse,
        mae: mae,
        me: me
      }
    end

    private

    # mean error (signed)
    def me
      mean(errors)
    end

    # mean absolute error
    def mae
      mean(errors.map(&:abs))
    end

    # root mean squared error
    def rmse
      Math.sqrt(mean(errors.map { |v| v**2 }))
    end

    # arithmetic mean as a Float (NaN for an empty array, since 0 / 0.0)
    def mean(arr)
      arr.inject(0, &:+) / arr.size.to_f
    end
  end
end
|
data/lib/eps/regressor.rb
DELETED
@@ -1,314 +0,0 @@
|
|
1
|
-
module Eps
|
2
|
-
class Regressor < BaseRegressor
|
3
|
-
# Builds and immediately fits the regressor.
#
# data   - training samples; either a Daru data frame (as decided by daru?)
#          or a collection of per-sample hashes
# y      - optional target values; when omitted they are pulled out of data
#          using target
# target - name of the target column (normalised through prep_target)
# gsl    - force (true) or disable (false) GSL; when nil, GSL is used if the
#          GSL constant is defined
#
# Raises ArgumentError when neither y nor target is given, and a
# RuntimeError when the number of samples and targets differ.
def initialize(data, y = nil, target: nil, gsl: nil)
  raise ArgumentError, "missing target" unless target || y

  target = prep_target(target, data) if target

  # TODO more performant conversion
  if daru?(data)
    y ||= data[target].to_a
    features = data.dup.delete_vector(target)
  else
    # work on copies so the caller's rows are not mutated by delete
    features = data.map(&:dup)
    y ||= features.map { |row| row.delete(target) }
  end

  @x = features
  @y = prep_y(y.to_a)

  raise "Number of samples differs from target" unless @x.size == @y.size

  @target = target
  @gsl = gsl.nil? ? defined?(GSL) : gsl

  # fit immediately
  coefficients
end
|
30
|
-
|
31
|
-
# Fits the linear model on first call and returns a hash mapping each
# coefficient name (as produced by prep_x) to its estimated value. The
# result is memoized.
#
# With GSL enabled the fit is delegated to GSL::MultiFit::linear. Otherwise
# the normal equations are solved with Ruby's Matrix class; if X'X is
# singular, constant and pairwise-dependent columns are dropped, the fit is
# retried, and the dropped coefficients are reported as 0. The dropped
# column indexes are stored in @removed.
#
# Raises a RuntimeError when there are too few samples, or when X'X is
# still singular after dropping columns.
def coefficients
  @coefficients ||= begin
    x, @coefficient_names = prep_x(@x)

    if x.size <= @coefficient_names.size
      raise "Number of samples must be at least two more than number of features"
    end

    v =
      if @gsl
        x = GSL::Matrix.alloc(*x)
        y = GSL::Vector.alloc(@y)
        c, @covariance, _, _ = GSL::MultiFit::linear(x, y)
        c.to_a
      else
        x = Matrix.rows(x)
        y = Matrix.column_vector(@y)
        removed = []

        # https://statsmaths.github.io/stat612/lectures/lec13/lecture13.pdf
        # unfortunately, this method is unstable
        # haven't found an efficient way to do QR-factorization in Ruby
        # the extendmatrix gem has householder and givens (givens has bug)
        # but methods are too slow
        xt = x.t
        begin
          @xtxi = (xt * x).inverse
        rescue ExceptionForMatrix::ErrNotRegular
          # column 0 is skipped here (presumably the intercept - confirm in prep_x)
          constant = {}
          (1...x.column_count).each do |i|
            constant[i] = constant?(x.column(i))
          end

          # remove constant columns
          removed = constant.select { |_, is_constant| is_constant }.keys

          # remove non-independent columns
          constant.reject { |_, is_constant| is_constant }.keys.combination(2) do |pair|
            if !x.column(pair[0]).independent?(x.column(pair[1]))
              removed << pair[1]
            end
          end

          # a column that depends on several earlier columns is reported once
          # per pair; without de-duplication the repeated delete_at below
          # drops an unrelated column and the repeated insert further down
          # shifts the remaining coefficients
          removed.uniq!

          vectors = x.column_vectors
          # delete in reverse so the remaining indexes stay the same
          removed.sort.reverse.each do |i|
            # @coefficient_names.delete_at(i)
            vectors.delete_at(i)
          end
          x = Matrix.columns(vectors)
          xt = x.t

          # try again
          begin
            @xtxi = (xt * x).inverse
          rescue ExceptionForMatrix::ErrNotRegular
            raise "Multiple solutions - GSL is needed to select one"
          end
        end
        # huge performance boost
        # by multiplying xt * y first
        v2 = matrix_arr(@xtxi * (xt * y))

        # add back removed (ascending, so each zero lands at its final index)
        removed.sort.each do |i|
          v2.insert(i, 0)
        end
        @removed = removed

        v2
      end

    Hash[@coefficient_names.zip(v)]
  end
end
|
106
|
-
|
107
|
-
# Evaluates the fitted model against data (and optional explicit targets y)
# by delegating to the superclass implementation, supplying the target the
# model was trained with.
def evaluate(data, y = nil)
  trained_target = @target
  super(data, y, target: trained_target)
end
|
110
|
-
|
111
|
-
# https://people.richland.edu/james/ictcm/2004/multiple.html
|
112
|
-
# Returns a formatted, human-readable table of the fit.
#
# extended - when true, also lists the standard error and t statistic of
#            every coefficient and appends the plain r2; otherwise only the
#            coefficient and its p-value are shown
#
# The text is memoized separately for each value of extended. (A single
# shared memo would keep returning whichever variant was requested first.)
def summary(extended: false)
  @summary_cache ||= {}
  @summary_cache[extended ? true : false] ||= begin
    str = String.new("")
    # name column is at least 15 characters wide
    len = [coefficients.keys.map(&:size).max, 15].max
    if extended
      str << "%-#{len}s %12s %12s %12s %12s\n" % ["", "coef", "stderr", "t", "p"]
    else
      str << "%-#{len}s %12s %12s\n" % ["", "coef", "p"]
    end
    coefficients.each do |k, v|
      if extended
        str << "%-#{len}s %12.2f %12.2f %12.2f %12.3f\n" % [k, v, std_err[k], t_value[k], p_value[k]]
      else
        str << "%-#{len}s %12.2f %12.3f\n" % [k, v, p_value[k]]
      end
    end
    str << "\n"
    str << "r2: %.3f\n" % [r2] if extended
    str << "adjusted r2: %.3f\n" % [adjusted_r2]
    str
  end
end
|
134
|
-
|
135
|
-
# Coefficient of determination: (sst - sse) / sst. Memoized.
def r2
  return @r2 if @r2

  total = sst
  @r2 = (total - sse) / total
end
|
138
|
-
|
139
|
-
# Adjusted coefficient of determination: (mst - mse) / mst. Memoized.
def adjusted_r2
  return @adjusted_r2 if @adjusted_r2

  total_mean_square = mst
  @adjusted_r2 = (total_mean_square - mse) / total_mean_square
end
|
142
|
-
|
143
|
-
private
|
144
|
-
|
145
|
-
# True when every element of arr equals its first element
# (vacuously true for an empty collection).
def constant?(arr)
  reference = arr[0]
  arr.all? { |value| value == reference }
end
|
148
|
-
|
149
|
-
# add epsilon for perfect fits
|
150
|
-
# consistent with GSL
|
151
|
-
# Hash of coefficient name => t statistic (coefficient divided by its
# standard error). Float::EPSILON is added to the denominator so a perfect
# fit (zero standard error) does not divide by zero. Memoized.
def t_value
  @t_value ||= coefficients.each_with_object({}) do |(name, estimate), stats|
    stats[name] = estimate / (std_err[name] + Float::EPSILON)
  end
end
|
154
|
-
|
155
|
-
# Hash of coefficient name => two-sided p-value, computed as
# 2 * (1 - P(|t|)) where P is the Student's t cumulative distribution
# evaluated by GSL when @gsl is set and by tdist_p otherwise. Memoized.
def p_value
  @p_value ||= coefficients.keys.each_with_object({}) do |name, result|
    magnitude = t_value[name].abs
    cumulative = @gsl ? GSL::Cdf.tdist_P(magnitude, degrees_of_freedom) : tdist_p(magnitude, degrees_of_freedom)
    result[name] = 2 * (1 - cumulative)
  end
end
|
169
|
-
|
170
|
-
# Hash of coefficient name => standard error, i.e. the square root of the
# matching entry returned by diagonal. Memoized.
def std_err
  return @std_err if @std_err

  names = @coefficient_names
  @std_err = names.zip(diagonal.map { |variance| Math.sqrt(variance) }).to_h
end
|
175
|
-
|
176
|
-
# Returns the diagonal of the covariance matrix as an array with one entry
# per coefficient. Memoized.
#
# When the covariance object responds to each (Ruby's Matrix does; whether a
# GSL matrix does is not visible here - confirm), a 0 is inserted for every
# column index recorded in @removed so the result lines up with the
# coefficient names.
def diagonal
  @diagonal ||= begin
    if covariance.respond_to?(:each)
      d = covariance.each(:diagonal).to_a
      # @removed is stored unsorted; insert in ascending order so that every
      # zero ends up at its final index. Array() tolerates @removed being
      # unset.
      Array(@removed).sort.each do |i|
        d.insert(i, 0)
      end
      d
    else
      covariance.diagonal.to_a
    end
  end
end
|
189
|
-
|
190
|
-
# Covariance of the coefficient estimates: mse scaled by @xtxi, the inverse
# of X'X stored during fitting. Memoized; a value already assigned to
# @covariance (as the GSL fit does) is returned as is.
def covariance
  return @covariance if @covariance

  @covariance = mse * @xtxi
end
|
193
|
-
|
194
|
-
# Mean of the training targets. Memoized.
def y_bar
  return @y_bar if @y_bar

  @y_bar = mean(@y)
end
|
197
|
-
|
198
|
-
# Predictions of the fitted model for the training samples. Memoized.
def y_hat
  return @y_hat if @y_hat

  @y_hat = predict(@x)
end
|
201
|
-
|
202
|
-
# total sum of squares
|
203
|
-
# Total sum of squares: squared deviations of the targets from their mean.
# Memoized.
def sst
  return @sst if @sst

  squared_deviations = @y.map { |observed| (observed - y_bar)**2 }
  @sst = sum(squared_deviations)
end
|
206
|
-
|
207
|
-
# sum of squared errors of prediction
|
208
|
-
# not to be confused with "explained sum of squares"
|
209
|
-
# Sum of squared prediction errors (targets minus fitted values). Memoized.
def sse
  return @sse if @sse

  squared_residuals = @y.zip(y_hat).map { |observed, fitted| (observed - fitted)**2 }
  @sse = sum(squared_residuals)
end
|
212
|
-
|
213
|
-
# Total mean square: sst divided by (number of targets - 1). Memoized.
def mst
  return @mst if @mst

  @mst = sst / (@y.size - 1)
end
|
216
|
-
|
217
|
-
# Mean squared error: sse divided by the degrees of freedom. Memoized.
def mse
  return @mse if @mse

  @mse = sse / degrees_of_freedom
end
|
220
|
-
|
221
|
-
# Residual degrees of freedom: number of targets minus number of
# coefficients.
def degrees_of_freedom
  sample_count = @y.size
  sample_count - coefficients.size
end
|
224
|
-
|
225
|
-
# Adds up the elements of arr, starting from 0 (so an empty arr gives 0).
def sum(arr)
  arr.reduce(0) { |total, value| total + value }
end
|
228
|
-
|
229
|
-
# Arithmetic mean of arr as a Float.
def mean(arr)
  count = arr.size.to_f
  sum(arr) / count
end
|
232
|
-
|
233
|
-
### Extracted from https://github.com/estebanz01/ruby-statistics
|
234
|
-
### The Ruby author is Esteban Zapata Rojas
|
235
|
-
###
|
236
|
-
### Originally extracted from https://codeplea.com/incomplete-beta-function-c
|
237
|
-
### This function is shared under zlib license and the author is Lewis Van Winkle
|
238
|
-
# Cumulative probability of Student's t distribution at value for the given
# degrees of freedom, evaluated through the regularized incomplete beta
# function with both shape parameters equal to degrees_of_freedom / 2.
def tdist_p(value, degrees_of_freedom)
  root = Math.sqrt(value * value + degrees_of_freedom)
  ratio = (value + root) / (2.0 * root)
  shape = degrees_of_freedom / 2.0

  incomplete_beta_function(ratio, shape, shape)
end
|
249
|
-
|
250
|
-
### Extracted from https://github.com/estebanz01/ruby-statistics
|
251
|
-
### The Ruby author is Esteban Zapata Rojas
|
252
|
-
###
|
253
|
-
### This implementation is an adaptation of the incomplete beta function made in C by
|
254
|
-
### Lewis Van Winkle, which released the code under the zlib license.
|
255
|
-
### The whole math behind this code is described in the following post: https://codeplea.com/incomplete-beta-function-c
|
256
|
-
# Regularized incomplete beta function I_x(alp, bet), evaluated with a
# continued fraction.
#
# Returns nil when x is negative or when the fraction has not converged
# after 501 iterations, and 1.0 when x is greater than 1.
def incomplete_beta_function(x, alp, bet)
  return if x < 0.0
  return 1.0 if x > 1.0

  # past this point the mirrored problem is evaluated instead
  if x > ((alp + 1.0) / (alp + bet + 2.0))
    return 1.0 - incomplete_beta_function(1.0 - x, bet, alp)
  end

  floor = 1.0E-50

  # To avoid overflow problems, the leading factor is computed from
  # logarithms (log-gamma) rather than from the beta function directly.
  log_beta = Math.lgamma(alp)[0] + Math.lgamma(bet)[0] - Math.lgamma(alp + bet)[0]
  prefactor = Math.exp(Math.log(x) * alp + Math.log(1.0 - x) * bet - log_beta) / alp.to_f

  product = 1.0
  c = 1.0
  d = 0.0

  # more iterations than the 200 of the implementation this was adapted from
  0.upto(500) do |step|
    half = step / 2

    term =
      if step == 0
        1.0
      elsif step % 2 == 0
        (half * (bet - half) * x) / ((alp + 2.0 * half - 1.0) * (alp + 2.0 * half))
      else
        upper = -((alp + half) * (alp + bet + half) * x)
        lower = ((alp + 2.0 * half) * (alp + 2.0 * half + 1.0))
        upper / lower
      end

    # clamp away from zero before inverting / dividing
    d = 1.0 + term * d
    d = floor if d.abs < floor
    d = 1.0 / d

    c = 1.0 + term / c
    c = floor if c.abs < floor

    delta = c * d
    product = product * delta

    # converged once the latest factor is numerically 1
    return prefactor * (product - 1.0) if (1.0 - delta).abs < 1.0E-10
  end

  nil
end
|
313
|
-
end
|
314
|
-
end
|