eps 0.1.1 → 0.2.0

Sign up to get free protection for your applications and to get access to all the features.
data/lib/eps/metrics.rb DELETED
@@ -1,35 +0,0 @@
1
module Eps
  # Error metrics comparing actual values with estimated values.
  #
  # The per-sample errors are computed as actual - estimated, so a positive
  # mean error means the estimates are too low on average.
  class Metrics
    # Per-sample errors (actual - estimated), in input order.
    attr_reader :errors

    # @param actual    enumerable of observed numeric values
    # @param estimated enumerable of predicted numeric values, same length
    # @raise [ArgumentError] if the two inputs differ in length
    def initialize(actual, estimated)
      actual = actual.to_a
      estimated = estimated.to_a

      # Array#zip pads with nil or silently truncates when lengths differ,
      # which would either crash on nil or quietly ignore samples.
      if actual.size != estimated.size
        raise ArgumentError, "actual and estimated must be the same size"
      end

      @errors = actual.zip(estimated).map { |yi, yi2| yi - yi2 }
    end

    # @return [Hash] the metrics keyed by :rmse, :mae and :me
    def all
      {
        rmse: rmse,
        mae: mae,
        me: me
      }
    end

    private

    # Mean error (signed).
    def me
      mean(errors)
    end

    # Mean absolute error.
    def mae
      mean(errors.map(&:abs))
    end

    # Root mean squared error.
    def rmse
      Math.sqrt(mean(errors.map { |v| v**2 }))
    end

    # Arithmetic mean as a Float; evaluates to NaN for an empty array (0 / 0.0).
    def mean(arr)
      arr.inject(0, &:+) / arr.size.to_f
    end
  end
end
data/lib/eps/regressor.rb DELETED
@@ -1,314 +0,0 @@
1
module Eps
  # Linear regression fitted at construction time.
  #
  # When GSL is enabled the fit is delegated to GSL::MultiFit.linear;
  # otherwise the normal equations are solved with Ruby's Matrix class.
  #
  # NOTE(review): prep_target, prep_y, prep_x, daru?, matrix_arr and predict
  # are not defined in this class; presumably they come from BaseRegressor.
  class Regressor < BaseRegressor
    # @param data   a Daru data frame (as detected by daru?) or an enumerable
    #               of rows; each row is dup'd and, when y is not given, has
    #               the target removed from it via #delete
    # @param y      optional target values; extracted from data when omitted
    # @param target name of the target column in data
    # @param gsl    force GSL on/off; when nil, GSL is used if the GSL
    #               constant is defined
    # @raise [ArgumentError] if neither target nor y is given
    # @raise [RuntimeError]  if the number of samples and targets differ
    def initialize(data, y = nil, target: nil, gsl: nil)
      raise ArgumentError, "missing target" if !target && !y

      target = prep_target(target, data) if target

      # TODO more performant conversion
      if daru?(data)
        y ||= data[target].to_a
        x = data.dup.delete_vector(target)
      else
        x = data.map(&:dup)
        y ||= x.map { |v| v.delete(target) }
      end

      @x = x
      @y = prep_y(y.to_a)

      if @x.size != @y.size
        raise "Number of samples differs from target"
      end

      @target = target
      @gsl = gsl.nil? ? defined?(GSL) : gsl

      # fit immediately
      coefficients
    end

    # Fits the model on first call and memoizes the result.
    #
    # @return [Hash] coefficient name => fitted value; columns dropped by
    #   the Matrix fallback because they were constant or dependent get 0
    # @raise [RuntimeError] if there are too few samples, or if the Matrix
    #   fallback cannot find a unique solution
    def coefficients
      @coefficients ||= begin
        x, @coefficient_names = prep_x(@x)

        if x.size <= @coefficient_names.size
          raise "Number of samples must be at least two more than number of features"
        end

        values = @gsl ? solve_with_gsl(x) : solve_with_matrix(x)

        Hash[@coefficient_names.zip(values)]
      end
    end

    # Evaluates the model on data, passing along the target used for fitting.
    def evaluate(data, y = nil)
      super(data, y, target: @target)
    end

    # Text table of coefficients and p-values, followed by adjusted r2.
    # With extended: true it also lists stderr and t, plus r2.
    #
    # The two layouts are memoized separately so that alternating calls
    # return the layout that was asked for.
    #
    # https://people.richland.edu/james/ictcm/2004/multiple.html
    def summary(extended: false)
      @summary_str ||= {}
      @summary_str[extended ? :extended : :basic] ||= build_summary(extended)
    end

    # Coefficient of determination: (sst - sse) / sst.
    def r2
      @r2 ||= (sst - sse) / sst
    end

    # Adjusted r2: (mst - mse) / mst.
    def adjusted_r2
      @adjusted_r2 ||= (mst - mse) / mst
    end

    private

    # Fits with GSL and stores the covariance matrix it returns.
    def solve_with_gsl(rows)
      x = GSL::Matrix.alloc(*rows)
      y = GSL::Vector.alloc(@y)
      c, @covariance, _, _ = GSL::MultiFit::linear(x, y)
      c.to_a
    end

    # Fits by solving the normal equations: (X'X)^-1 X'y.
    #
    # https://statsmaths.github.io/stat612/lectures/lec13/lecture13.pdf
    # unfortunately, this method is unstable
    # haven't found an efficient way to do QR-factorization in Ruby
    # the extendmatrix gem has householder and givens (givens has bug)
    # but methods are too slow
    def solve_with_matrix(rows)
      x = Matrix.rows(rows)
      y = Matrix.column_vector(@y)
      removed = []

      xt = x.t
      begin
        @xtxi = (xt * x).inverse
      rescue ExceptionForMatrix::ErrNotRegular
        removed = dependent_column_indexes(x)

        vectors = x.column_vectors
        # delete in reverse so the remaining indexes stay the same
        removed.reverse_each do |i|
          # @coefficient_names.delete_at(i)
          vectors.delete_at(i)
        end
        x = Matrix.columns(vectors)
        xt = x.t

        # try again
        begin
          @xtxi = (xt * x).inverse
        rescue ExceptionForMatrix::ErrNotRegular
          raise "Multiple solutions - GSL is needed to select one"
        end
      end

      # huge performance boost
      # by multiplying xt * y first
      values = matrix_arr(@xtxi * (xt * y))

      # add back removed (ascending order keeps positions aligned)
      removed.each do |i|
        values.insert(i, 0)
      end
      @removed = removed

      values
    end

    # Indexes of columns to drop before retrying the inversion: constant
    # columns, plus the second column of every non-independent pair.
    # Column 0 is never examined (presumably the intercept - TODO confirm).
    #
    # @return [Array<Integer>] unique indexes in ascending order; a column
    #   can be flagged by several pairs, so duplicates are collapsed here
    def dependent_column_indexes(x)
      constant = {}
      (1...x.column_count).each do |i|
        constant[i] = constant?(x.column(i))
      end

      # remove constant columns
      removed = constant.select { |_, v| v }.keys

      # remove non-independent columns
      constant.reject { |_, v| v }.keys.combination(2) do |a, b|
        removed << b unless x.column(a).independent?(x.column(b))
      end

      removed.uniq.sort
    end

    # Renders the summary table for one layout (see #summary).
    def build_summary(extended)
      len = [coefficients.keys.map(&:size).max, 15].max
      str = String.new("")

      if extended
        str << "%-#{len}s %12s %12s %12s %12s\n" % ["", "coef", "stderr", "t", "p"]
      else
        str << "%-#{len}s %12s %12s\n" % ["", "coef", "p"]
      end

      coefficients.each do |k, v|
        if extended
          str << "%-#{len}s %12.2f %12.2f %12.2f %12.3f\n" % [k, v, std_err[k], t_value[k], p_value[k]]
        else
          str << "%-#{len}s %12.2f %12.3f\n" % [k, v, p_value[k]]
        end
      end

      str << "\n"
      str << "r2: %.3f\n" % [r2] if extended
      str << "adjusted r2: %.3f\n" % [adjusted_r2]
      str
    end

    # True when every element equals the first one.
    def constant?(arr)
      arr.all? { |x| x == arr[0] }
    end

    # t statistic per coefficient.
    # add epsilon for perfect fits
    # consistent with GSL
    def t_value
      @t_value ||= Hash[coefficients.map { |k, v| [k, v / (std_err[k] + Float::EPSILON)] }]
    end

    # Two-sided p-value per coefficient: 2 * (1 - P(|t|)).
    def p_value
      @p_value ||= begin
        Hash[coefficients.map do |k, _|
          tp =
            if @gsl
              GSL::Cdf.tdist_P(t_value[k].abs, degrees_of_freedom)
            else
              tdist_p(t_value[k].abs, degrees_of_freedom)
            end

          [k, 2 * (1 - tp)]
        end]
      end
    end

    # Standard error per coefficient: square root of the covariance diagonal.
    def std_err
      @std_err ||= begin
        Hash[@coefficient_names.zip(diagonal.map { |v| Math.sqrt(v) })]
      end
    end

    # Diagonal of the covariance matrix as an array. When the covariance
    # responds to #each, zeros are re-inserted at the positions of the
    # columns dropped during fitting; @removed is ascending, so every
    # insert lands at its final index.
    def diagonal
      @diagonal ||= begin
        if covariance.respond_to?(:each)
          d = covariance.each(:diagonal).to_a
          @removed.each do |i|
            d.insert(i, 0)
          end
          d
        else
          covariance.diagonal.to_a
        end
      end
    end

    # Covariance matrix; set directly by the GSL fit, otherwise mse * (X'X)^-1.
    def covariance
      @covariance ||= mse * @xtxi
    end

    # Mean of the target values.
    def y_bar
      @y_bar ||= mean(@y)
    end

    # Predictions for the training data.
    def y_hat
      @y_hat ||= predict(@x)
    end

    # total sum of squares
    def sst
      @sst ||= sum(@y.map { |y| (y - y_bar)**2 })
    end

    # sum of squared errors of prediction
    # not to be confused with "explained sum of squares"
    def sse
      @sse ||= sum(@y.zip(y_hat).map { |y, yh| (y - yh)**2 })
    end

    # sst divided by (number of samples - 1)
    def mst
      @mst ||= sst / (@y.size - 1)
    end

    # sse divided by the degrees of freedom
    def mse
      @mse ||= sse / degrees_of_freedom
    end

    # Number of samples minus number of coefficients.
    def degrees_of_freedom
      @y.size - coefficients.size
    end

    def sum(arr)
      arr.inject(0, &:+)
    end

    # Arithmetic mean as a Float.
    def mean(arr)
      sum(arr) / arr.size.to_f
    end

    ### Extracted from https://github.com/estebanz01/ruby-statistics
    ### The Ruby author is Esteban Zapata Rojas
    ###
    ### Originally extracted from https://codeplea.com/incomplete-beta-function-c
    ### This function is shared under zlib license and the author is Lewis Van Winkle
    #
    # Pure-Ruby counterpart of the GSL::Cdf.tdist_P call in p_value:
    # evaluates the incomplete beta function at
    # (t + sqrt(t^2 + dof)) / (2 * sqrt(t^2 + dof)) with both shape
    # parameters set to dof / 2.
    def tdist_p(value, degrees_of_freedom)
      root = Math.sqrt(value * value + degrees_of_freedom)
      upper = value + root
      lower = 2.0 * root

      x = upper / lower

      alpha = degrees_of_freedom / 2.0
      beta = degrees_of_freedom / 2.0

      incomplete_beta_function(x, alpha, beta)
    end

    ### Extracted from https://github.com/estebanz01/ruby-statistics
    ### The Ruby author is Esteban Zapata Rojas
    ###
    ### This implementation is an adaptation of the incomplete beta function made in C by
    ### Lewis Van Winkle, which released the code under the zlib license.
    ### The whole math behind this code is described in the following post: https://codeplea.com/incomplete-beta-function-c
    #
    # Returns nil when x < 0.0, and also when the continued fraction has not
    # converged after the 501 iterations below; returns 1.0 when x > 1.0.
    def incomplete_beta_function(x, alp, bet)
      return if x < 0.0
      return 1.0 if x > 1.0

      tiny = 1.0E-50

      # Evaluate the mirrored problem instead when x is past this threshold.
      if x > ((alp + 1.0) / (alp + bet + 2.0))
        return 1.0 - incomplete_beta_function(1.0 - x, bet, alp)
      end

      # To avoid overflow problems, the implementation applies the logarithm properties
      # to calculate in a faster and safer way the values.
      lbet_ab = Math.lgamma(alp)[0] + Math.lgamma(bet)[0] - Math.lgamma(alp + bet)[0]
      front = Math.exp(Math.log(x) * alp + Math.log(1.0 - x) * bet - lbet_ab) / alp.to_f

      # This is the non-log version of the left part of the formula (before the continuous fraction)
      # down_left = alp * self.beta_function(alp, bet)
      # upper_left = (x ** alp) * ((1.0 - x) ** bet)
      # front = upper_left/down_left

      f, c, d = 1.0, 1.0, 0.0

      returned_value = nil

      # Let's do more iterations than the proposed implementation (200 iters)
      (0..500).each do |number|
        # integer division: terms 2m and 2m + 1 share the same m
        m = number / 2

        numerator =
          if number == 0
            1.0
          elsif number % 2 == 0
            (m * (bet - m) * x) / ((alp + 2.0 * m - 1.0) * (alp + 2.0 * m))
          else
            top = -((alp + m) * (alp + bet + m) * x)
            down = ((alp + 2.0 * m) * (alp + 2.0 * m + 1.0))

            top / down
          end

        # clamp to tiny so the divisions below never divide by zero
        d = 1.0 + numerator * d
        d = tiny if d.abs < tiny
        d = 1.0 / d

        c = 1.0 + numerator / c
        c = tiny if c.abs < tiny

        cd = c * d
        f = f * cd

        # stop once the multiplicative update is within 1e-10 of 1
        if (1.0 - cd).abs < 1.0E-10
          returned_value = front * (f - 1.0)
          break
        end
      end

      returned_value
    end
  end
end