eps 0.1.1 → 0.2.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +6 -0
- data/README.md +235 -84
- data/lib/eps.rb +9 -4
- data/lib/eps/base.rb +19 -0
- data/lib/eps/base_estimator.rb +84 -0
- data/lib/eps/linear_regression.rb +558 -0
- data/lib/eps/model.rb +108 -0
- data/lib/eps/naive_bayes.rb +240 -0
- data/lib/eps/version.rb +1 -1
- metadata +13 -18
- data/.gitignore +0 -9
- data/.travis.yml +0 -15
- data/Gemfile +0 -11
- data/Rakefile +0 -34
- data/eps.gemspec +0 -30
- data/guides/Modeling.md +0 -152
- data/lib/eps/base_regressor.rb +0 -232
- data/lib/eps/metrics.rb +0 -35
- data/lib/eps/regressor.rb +0 -314
data/lib/eps/metrics.rb
DELETED
@@ -1,35 +0,0 @@
|
|
1
|
-
module Eps
  # Error metrics comparing observed values with estimated (predicted) values.
  #
  # Each error is computed as `actual - estimated`, element by element.
  class Metrics
    # @return [Array<Numeric>] per-sample errors (actual minus estimated)
    attr_reader :errors

    # @param actual [Array<Numeric>] observed values
    # @param estimated [Array<Numeric>] estimated values, one per observed value
    # @raise [ArgumentError] if the two collections differ in size
    def initialize(actual, estimated)
      actual = actual.to_a
      estimated = estimated.to_a

      # zip would otherwise silently drop extra estimates, or pad missing
      # ones with nil and fail later with an unrelated coercion error
      if actual.size != estimated.size
        raise ArgumentError, "Number of actual values differs from estimated"
      end

      @errors = actual.zip(estimated).map { |yi, yi2| yi - yi2 }
    end

    # @return [Hash{Symbol => Float}] the :rmse, :mae and :me metrics
    def all
      {
        rmse: rmse,
        mae: mae,
        me: me
      }
    end

    private

    # mean error
    def me
      mean(errors)
    end

    # mean absolute error
    def mae
      mean(errors.map(&:abs))
    end

    # root mean squared error
    def rmse
      Math.sqrt(mean(errors.map { |v| v**2 }))
    end

    # arithmetic mean as a Float (NaN for an empty array)
    def mean(arr)
      arr.inject(0, &:+) / arr.size.to_f
    end
  end
end
|
data/lib/eps/regressor.rb
DELETED
@@ -1,314 +0,0 @@
|
|
1
|
-
module Eps
  # Linear regression fitted by least squares.
  #
  # When GSL is enabled the fit is delegated to GSL::MultiFit.linear;
  # otherwise the normal equations are solved with Ruby's Matrix class.
  # The model is fitted as soon as the object is constructed.
  class Regressor < BaseRegressor
    # @param data a Daru data frame (as detected by daru?) or a collection of
    #   rows; rows are duplicated before the target is removed from them
    # @param y target values; extracted from data via target when omitted
    # @param target name of the target column in data
    # @param gsl [Boolean, nil] force GSL on/off; nil enables it when the GSL
    #   constant is defined
    # @raise [ArgumentError] if neither target nor y is given
    # @raise [RuntimeError] if the number of samples and targets differ
    def initialize(data, y = nil, target: nil, gsl: nil)
      raise ArgumentError, "missing target" if !target && !y

      target = prep_target(target, data) if target

      # TODO more performant conversion
      if daru?(data)
        y ||= data[target].to_a
        x = data.dup.delete_vector(target)
      else
        x = data.map(&:dup)
        y ||= x.map { |v| v.delete(target) }
      end

      @x = x
      @y = prep_y(y.to_a)

      if @x.size != @y.size
        raise "Number of samples differs from target"
      end

      @target = target
      @gsl = gsl.nil? ? defined?(GSL) : gsl

      # fit immediately
      coefficients
    end

    # Fits the model on first use and memoizes the result.
    #
    # @return [Hash] coefficient name => fitted value; columns dropped as
    #   constant or collinear (non-GSL path) get a coefficient of 0
    # @raise [RuntimeError] if there are too few samples, or the system is
    #   still singular after dropping redundant columns
    def coefficients
      @coefficients ||= begin
        x, @coefficient_names = prep_x(@x)

        if x.size <= @coefficient_names.size
          raise "Number of samples must be at least two more than number of features"
        end

        v = @gsl ? solve_with_gsl(x) : solve_with_matrix(x)

        Hash[@coefficient_names.zip(v)]
      end
    end

    # Evaluates the model, passing along the target chosen at construction.
    def evaluate(data, y = nil)
      super(data, y, target: @target)
    end

    # Builds a text table of the coefficients and their p-values.
    #
    # The result is memoized separately for each value of extended, so
    # switching between the short and extended forms returns the right table.
    #
    # https://people.richland.edu/james/ictcm/2004/multiple.html
    #
    # @param extended [Boolean] also include stderr, t and r2
    # @return [String]
    def summary(extended: false)
      @summaries ||= {}
      @summaries[extended ? true : false] ||= begin
        str = String.new("")
        len = [coefficients.keys.map(&:size).max, 15].max
        if extended
          str << "%-#{len}s %12s %12s %12s %12s\n" % ["", "coef", "stderr", "t", "p"]
        else
          str << "%-#{len}s %12s %12s\n" % ["", "coef", "p"]
        end
        coefficients.each do |k, v|
          if extended
            str << "%-#{len}s %12.2f %12.2f %12.2f %12.3f\n" % [k, v, std_err[k], t_value[k], p_value[k]]
          else
            str << "%-#{len}s %12.2f %12.3f\n" % [k, v, p_value[k]]
          end
        end
        str << "\n"
        str << "r2: %.3f\n" % [r2] if extended
        str << "adjusted r2: %.3f\n" % [adjusted_r2]
        str
      end
    end

    # coefficient of determination
    def r2
      @r2 ||= (sst - sse) / sst
    end

    # r2 computed from mean squares (mst and mse) rather than raw sums
    def adjusted_r2
      @adjusted_r2 ||= (mst - mse) / mst
    end

    private

    # Fits with GSL; also stores the covariance matrix GSL returns.
    def solve_with_gsl(rows)
      x = GSL::Matrix.alloc(*rows)
      y = GSL::Vector.alloc(@y)
      c, @covariance, _, _ = GSL::MultiFit::linear(x, y)
      c.to_a
    end

    # Fits by solving the normal equations with Ruby's Matrix class.
    # Sets @xtxi and @removed for later use by covariance and diagonal.
    def solve_with_matrix(rows)
      x = Matrix.rows(rows)
      y = Matrix.column_vector(@y)
      removed = []

      # https://statsmaths.github.io/stat612/lectures/lec13/lecture13.pdf
      # unfortunately, this method is unstable
      # haven't found an efficient way to do QR-factorization in Ruby
      # the extendmatrix gem has householder and givens (givens has bug)
      # but methods are too slow
      xt = x.t
      begin
        @xtxi = (xt * x).inverse
      rescue ExceptionForMatrix::ErrNotRegular
        removed = redundant_column_indexes(x)

        vectors = x.column_vectors
        # delete in reverse so the remaining indexes stay the same
        removed.reverse_each do |i|
          vectors.delete_at(i)
        end
        x = Matrix.columns(vectors)
        xt = x.t

        # try again
        begin
          @xtxi = (xt * x).inverse
        rescue ExceptionForMatrix::ErrNotRegular
          raise "Multiple solutions - GSL is needed to select one"
        end
      end

      # huge performance boost
      # by multiplying xt * y first
      v2 = matrix_arr(@xtxi * (xt * y))

      # add back removed (ascending order keeps positions aligned)
      removed.each do |i|
        v2.insert(i, 0)
      end
      @removed = removed

      v2
    end

    # Indexes of columns to drop before retrying the inversion: constant
    # columns, plus the second column of every dependent pair. Column 0 is
    # never examined. The result is sorted and free of duplicates, because a
    # column dependent on several others would otherwise be listed (and later
    # deleted / re-inserted) more than once.
    def redundant_column_indexes(x)
      constant = {}
      (1...x.column_count).each do |i|
        constant[i] = constant?(x.column(i))
      end

      # remove constant columns
      removed = constant.select { |_, v| v }.keys

      # remove non-independent columns
      constant.select { |_, v| !v }.keys.combination(2) do |c|
        if !x.column(c[0]).independent?(x.column(c[1]))
          removed << c[1]
        end
      end

      removed.uniq.sort
    end

    def constant?(arr)
      arr.all? { |x| x == arr[0] }
    end

    # add epsilon for perfect fits
    # consistent with GSL
    def t_value
      @t_value ||= Hash[coefficients.map { |k, v| [k, v / (std_err[k] + Float::EPSILON)] }]
    end

    # two-sided p-value of each coefficient from the t distribution
    def p_value
      @p_value ||= begin
        Hash[coefficients.map do |k, _|
          tp =
            if @gsl
              GSL::Cdf.tdist_P(t_value[k].abs, degrees_of_freedom)
            else
              tdist_p(t_value[k].abs, degrees_of_freedom)
            end

          [k, 2 * (1 - tp)]
        end]
      end
    end

    # standard error of each coefficient (square root of the covariance diagonal)
    def std_err
      @std_err ||= begin
        Hash[@coefficient_names.zip(diagonal.map { |v| Math.sqrt(v) })]
      end
    end

    # Diagonal of the covariance matrix, with zeros re-inserted at the
    # positions of removed columns when the covariance responds to #each.
    def diagonal
      @diagonal ||= begin
        if covariance.respond_to?(:each)
          d = covariance.each(:diagonal).to_a
          # @removed is sorted ascending, so each insert lands at its
          # original column position
          @removed.each do |i|
            d.insert(i, 0)
          end
          d
        else
          covariance.diagonal.to_a
        end
      end
    end

    def covariance
      @covariance ||= mse * @xtxi
    end

    def y_bar
      @y_bar ||= mean(@y)
    end

    def y_hat
      @y_hat ||= predict(@x)
    end

    # total sum of squares
    def sst
      @sst ||= sum(@y.map { |y| (y - y_bar)**2 })
    end

    # sum of squared errors of prediction
    # not to be confused with "explained sum of squares"
    def sse
      @sse ||= sum(@y.zip(y_hat).map { |y, yh| (y - yh)**2 })
    end

    def mst
      @mst ||= sst / (@y.size - 1)
    end

    def mse
      @mse ||= sse / degrees_of_freedom
    end

    def degrees_of_freedom
      @y.size - coefficients.size
    end

    def sum(arr)
      arr.inject(0, &:+)
    end

    def mean(arr)
      sum(arr) / arr.size.to_f
    end

    ### Extracted from https://github.com/estebanz01/ruby-statistics
    ### The Ruby author is Esteban Zapata Rojas
    ###
    ### Originally extracted from https://codeplea.com/incomplete-beta-function-c
    ### This function is shared under zlib license and the author is Lewis Van Winkle
    def tdist_p(value, degrees_of_freedom)
      upper = (value + Math.sqrt(value * value + degrees_of_freedom))
      lower = (2.0 * Math.sqrt(value * value + degrees_of_freedom))

      x = upper/lower

      alpha = degrees_of_freedom/2.0
      beta = degrees_of_freedom/2.0

      incomplete_beta_function(x, alpha, beta)
    end

    ### Extracted from https://github.com/estebanz01/ruby-statistics
    ### The Ruby author is Esteban Zapata Rojas
    ###
    ### This implementation is an adaptation of the incomplete beta function made in C by
    ### Lewis Van Winkle, which released the code under the zlib license.
    ### The whole math behind this code is described in the following post: https://codeplea.com/incomplete-beta-function-c
    ###
    ### Returns nil when x is negative or when the continued fraction does not
    ### converge within the iteration limit.
    def incomplete_beta_function(x, alp, bet)
      return if x < 0.0
      return 1.0 if x > 1.0

      tiny = 1.0E-50

      # use the symmetry relation when x is past this threshold
      if x > ((alp + 1.0)/(alp + bet + 2.0))
        return 1.0 - incomplete_beta_function(1.0 - x, bet, alp)
      end

      # To avoid overflow problems, the implementation applies the logarithm properties
      # to calculate in a faster and safer way the values.
      lbet_ab = (Math.lgamma(alp)[0] + Math.lgamma(bet)[0] - Math.lgamma(alp + bet)[0]).freeze
      front = (Math.exp(Math.log(x) * alp + Math.log(1.0 - x) * bet - lbet_ab) / alp.to_f).freeze

      # This is the non-log version of the left part of the formula (before the continuous fraction)
      # down_left = alp * self.beta_function(alp, bet)
      # upper_left = (x ** alp) * ((1.0 - x) ** bet)
      # front = upper_left/down_left

      f, c, d = 1.0, 1.0, 0.0

      returned_value = nil

      # Let's do more iterations than the proposed implementation (200 iters)
      (0..500).each do |number|
        m = number/2

        numerator = if number == 0
          1.0
        elsif number % 2 == 0
          (m * (bet - m) * x)/((alp + 2.0 * m - 1.0)* (alp + 2.0 * m))
        else
          top = -((alp + m) * (alp + bet + m) * x)
          down = ((alp + 2.0 * m) * (alp + 2.0 * m + 1.0))

          top/down
        end

        d = 1.0 + numerator * d
        d = tiny if d.abs < tiny
        d = 1.0 / d

        c = 1.0 + numerator / c
        c = tiny if c.abs < tiny

        cd = (c*d).freeze
        f = f * cd

        # stop once the latest factor no longer changes the product
        if (1.0 - cd).abs < 1.0E-10
          returned_value = front * (f - 1.0)
          break
        end
      end

      returned_value
    end
  end
end
|