eps 0.1.1 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/lib/eps/metrics.rb DELETED
@@ -1,35 +0,0 @@
1
- module Eps
2
- class Metrics
3
- attr_reader :errors
4
-
5
- def initialize(actual, estimated)
6
- @errors = actual.zip(estimated).map { |yi, yi2| yi - yi2 }
7
- end
8
-
9
- def all
10
- {
11
- rmse: rmse,
12
- mae: mae,
13
- me: me
14
- }
15
- end
16
-
17
- private
18
-
19
- def me
20
- mean(errors)
21
- end
22
-
23
- def mae
24
- mean(errors.map { |v| v.abs })
25
- end
26
-
27
- def rmse
28
- Math.sqrt(mean(errors.map { |v| v**2 }))
29
- end
30
-
31
- def mean(arr)
32
- arr.inject(0, &:+) / arr.size.to_f
33
- end
34
- end
35
- end
data/lib/eps/regressor.rb DELETED
@@ -1,314 +0,0 @@
1
- module Eps
2
- class Regressor < BaseRegressor
3
- def initialize(data, y = nil, target: nil, gsl: nil)
4
- raise ArgumentError, "missing target" if !target && !y
5
-
6
- target = prep_target(target, data) if target
7
-
8
- # TODO more performant conversion
9
- if daru?(data)
10
- y ||= data[target].to_a
11
- x = data.dup.delete_vector(target)
12
- else
13
- x = data.map(&:dup)
14
- y ||= x.map { |v| v.delete(target) }
15
- end
16
-
17
- @x = x
18
- @y = prep_y(y.to_a)
19
-
20
- if @x.size != @y.size
21
- raise "Number of samples differs from target"
22
- end
23
-
24
- @target = target
25
- @gsl = gsl.nil? ? defined?(GSL) : gsl
26
-
27
- # fit immediately
28
- coefficients
29
- end
30
-
31
- def coefficients
32
- @coefficients ||= begin
33
- x, @coefficient_names = prep_x(@x)
34
-
35
- if x.size <= @coefficient_names.size
36
- raise "Number of samples must be at least two more than number of features"
37
- end
38
-
39
- v =
40
- if @gsl
41
- x = GSL::Matrix.alloc(*x)
42
- y = GSL::Vector.alloc(@y)
43
- c, @covariance, _, _ = GSL::MultiFit::linear(x, y)
44
- c.to_a
45
- else
46
- x = Matrix.rows(x)
47
- y = Matrix.column_vector(@y)
48
- removed = []
49
-
50
- # https://statsmaths.github.io/stat612/lectures/lec13/lecture13.pdf
51
- # unforutnately, this method is unstable
52
- # haven't found an efficient way to do QR-factorization in Ruby
53
- # the extendmatrix gem has householder and givens (givens has bug)
54
- # but methods are too slow
55
- xt = x.t
56
- begin
57
- @xtxi = (xt * x).inverse
58
- rescue ExceptionForMatrix::ErrNotRegular
59
- constant = {}
60
- (1...x.column_count).each do |i|
61
- constant[i] = constant?(x.column(i))
62
- end
63
-
64
- # remove constant columns
65
- removed = constant.select { |_, v| v }.keys
66
-
67
- # remove non-independent columns
68
- constant.select { |_, v| !v }.keys.combination(2) do |c|
69
- if !x.column(c[0]).independent?(x.column(c[1]))
70
- removed << c[1]
71
- end
72
- end
73
-
74
- vectors = x.column_vectors
75
- # delete in reverse of indexes stay the same
76
- removed.sort.reverse.each do |i|
77
- # @coefficient_names.delete_at(i)
78
- vectors.delete_at(i)
79
- end
80
- x = Matrix.columns(vectors)
81
- xt = x.t
82
-
83
- # try again
84
- begin
85
- @xtxi = (xt * x).inverse
86
- rescue ExceptionForMatrix::ErrNotRegular
87
- raise "Multiple solutions - GSL is needed to select one"
88
- end
89
- end
90
- # huge performance boost
91
- # by multiplying xt * y first
92
- v2 = matrix_arr(@xtxi * (xt * y))
93
-
94
- # add back removed
95
- removed.sort.each do |i|
96
- v2.insert(i, 0)
97
- end
98
- @removed = removed
99
-
100
- v2
101
- end
102
-
103
- Hash[@coefficient_names.zip(v)]
104
- end
105
- end
106
-
107
- def evaluate(data, y = nil)
108
- super(data, y, target: @target)
109
- end
110
-
111
- # https://people.richland.edu/james/ictcm/2004/multiple.html
112
- def summary(extended: false)
113
- @summary_str ||= begin
114
- str = String.new("")
115
- len = [coefficients.keys.map(&:size).max, 15].max
116
- if extended
117
- str += "%-#{len}s %12s %12s %12s %12s\n" % ["", "coef", "stderr", "t", "p"]
118
- else
119
- str += "%-#{len}s %12s %12s\n" % ["", "coef", "p"]
120
- end
121
- coefficients.each do |k, v|
122
- if extended
123
- str += "%-#{len}s %12.2f %12.2f %12.2f %12.3f\n" % [k, v, std_err[k], t_value[k], p_value[k]]
124
- else
125
- str += "%-#{len}s %12.2f %12.3f\n" % [k, v, p_value[k]]
126
- end
127
- end
128
- str += "\n"
129
- str += "r2: %.3f\n" % [r2] if extended
130
- str += "adjusted r2: %.3f\n" % [adjusted_r2]
131
- str
132
- end
133
- end
134
-
135
- def r2
136
- @r2 ||= (sst - sse) / sst
137
- end
138
-
139
- def adjusted_r2
140
- @adjusted_r2 ||= (mst - mse) / mst
141
- end
142
-
143
- private
144
-
145
- def constant?(arr)
146
- arr.all? { |x| x == arr[0] }
147
- end
148
-
149
- # add epsilon for perfect fits
150
- # consistent with GSL
151
- def t_value
152
- @t_value ||= Hash[coefficients.map { |k, v| [k, v / (std_err[k] + Float::EPSILON)] }]
153
- end
154
-
155
- def p_value
156
- @p_value ||= begin
157
- Hash[coefficients.map do |k, _|
158
- tp =
159
- if @gsl
160
- GSL::Cdf.tdist_P(t_value[k].abs, degrees_of_freedom)
161
- else
162
- tdist_p(t_value[k].abs, degrees_of_freedom)
163
- end
164
-
165
- [k, 2 * (1 - tp)]
166
- end]
167
- end
168
- end
169
-
170
- def std_err
171
- @std_err ||= begin
172
- Hash[@coefficient_names.zip(diagonal.map { |v| Math.sqrt(v) })]
173
- end
174
- end
175
-
176
- def diagonal
177
- @diagonal ||= begin
178
- if covariance.respond_to?(:each)
179
- d = covariance.each(:diagonal).to_a
180
- @removed.each do |i|
181
- d.insert(i, 0)
182
- end
183
- d
184
- else
185
- covariance.diagonal.to_a
186
- end
187
- end
188
- end
189
-
190
- def covariance
191
- @covariance ||= mse * @xtxi
192
- end
193
-
194
- def y_bar
195
- @y_bar ||= mean(@y)
196
- end
197
-
198
- def y_hat
199
- @y_hat ||= predict(@x)
200
- end
201
-
202
- # total sum of squares
203
- def sst
204
- @sst ||= sum(@y.map { |y| (y - y_bar)**2 })
205
- end
206
-
207
- # sum of squared errors of prediction
208
- # not to be confused with "explained sum of squares"
209
- def sse
210
- @sse ||= sum(@y.zip(y_hat).map { |y, yh| (y - yh)**2 })
211
- end
212
-
213
- def mst
214
- @mst ||= sst / (@y.size - 1)
215
- end
216
-
217
- def mse
218
- @mse ||= sse / degrees_of_freedom
219
- end
220
-
221
- def degrees_of_freedom
222
- @y.size - coefficients.size
223
- end
224
-
225
- def sum(arr)
226
- arr.inject(0, &:+)
227
- end
228
-
229
- def mean(arr)
230
- sum(arr) / arr.size.to_f
231
- end
232
-
233
- ### Extracted from https://github.com/estebanz01/ruby-statistics
234
- ### The Ruby author is Esteban Zapata Rojas
235
- ###
236
- ### Originally extracted from https://codeplea.com/incomplete-beta-function-c
237
- ### This function is shared under zlib license and the author is Lewis Van Winkle
238
- def tdist_p(value, degrees_of_freedom)
239
- upper = (value + Math.sqrt(value * value + degrees_of_freedom))
240
- lower = (2.0 * Math.sqrt(value * value + degrees_of_freedom))
241
-
242
- x = upper/lower
243
-
244
- alpha = degrees_of_freedom/2.0
245
- beta = degrees_of_freedom/2.0
246
-
247
- incomplete_beta_function(x, alpha, beta)
248
- end
249
-
250
- ### Extracted from https://github.com/estebanz01/ruby-statistics
251
- ### The Ruby author is Esteban Zapata Rojas
252
- ###
253
- ### This implementation is an adaptation of the incomplete beta function made in C by
254
- ### Lewis Van Winkle, which released the code under the zlib license.
255
- ### The whole math behind this code is described in the following post: https://codeplea.com/incomplete-beta-function-c
256
- def incomplete_beta_function(x, alp, bet)
257
- return if x < 0.0
258
- return 1.0 if x > 1.0
259
-
260
- tiny = 1.0E-50
261
-
262
- if x > ((alp + 1.0)/(alp + bet + 2.0))
263
- return 1.0 - incomplete_beta_function(1.0 - x, bet, alp)
264
- end
265
-
266
- # To avoid overflow problems, the implementation applies the logarithm properties
267
- # to calculate in a faster and safer way the values.
268
- lbet_ab = (Math.lgamma(alp)[0] + Math.lgamma(bet)[0] - Math.lgamma(alp + bet)[0]).freeze
269
- front = (Math.exp(Math.log(x) * alp + Math.log(1.0 - x) * bet - lbet_ab) / alp.to_f).freeze
270
-
271
- # This is the non-log version of the left part of the formula (before the continuous fraction)
272
- # down_left = alp * self.beta_function(alp, bet)
273
- # upper_left = (x ** alp) * ((1.0 - x) ** bet)
274
- # front = upper_left/down_left
275
-
276
- f, c, d = 1.0, 1.0, 0.0
277
-
278
- returned_value = nil
279
-
280
- # Let's do more iterations than the proposed implementation (200 iters)
281
- (0..500).each do |number|
282
- m = number/2
283
-
284
- numerator = if number == 0
285
- 1.0
286
- elsif number % 2 == 0
287
- (m * (bet - m) * x)/((alp + 2.0 * m - 1.0)* (alp + 2.0 * m))
288
- else
289
- top = -((alp + m) * (alp + bet + m) * x)
290
- down = ((alp + 2.0 * m) * (alp + 2.0 * m + 1.0))
291
-
292
- top/down
293
- end
294
-
295
- d = 1.0 + numerator * d
296
- d = tiny if d.abs < tiny
297
- d = 1.0 / d
298
-
299
- c = 1.0 + numerator / c
300
- c = tiny if c.abs < tiny
301
-
302
- cd = (c*d).freeze
303
- f = f * cd
304
-
305
- if (1.0 - cd).abs < 1.0E-10
306
- returned_value = front * (f - 1.0)
307
- break
308
- end
309
- end
310
-
311
- returned_value
312
- end
313
- end
314
- end