eps 0.2.1 → 0.3.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,28 +1,105 @@
1
1
  module Eps
2
2
  class LinearRegression < BaseEstimator
3
- def initialize(coefficients: nil, gsl: nil)
4
- @coefficients = Hash[coefficients.map { |k, v| [k.is_a?(Array) ? [k[0].to_sym, k[1]] : k.to_sym, v] }] if coefficients
5
- @gsl = gsl.nil? ? defined?(GSL) : gsl
3
+ # pmml
4
+
5
+ def self.load_pmml(data)
6
+ super do |data|
7
+ # TODO more validation
8
+ node = data.css("RegressionTable")
9
+
10
+ coefficients = {
11
+ "_intercept" => node.attribute("intercept").value.to_f
12
+ }
13
+
14
+ features = {}
15
+
16
+ text_features, derived_fields = extract_text_features(data, features)
17
+
18
+ node.css("NumericPredictor").each do |n|
19
+ name = n.attribute("name").value
20
+ if derived_fields[name]
21
+ name = derived_fields[name]
22
+ else
23
+ features[name] = "numeric"
24
+ end
25
+ coefficients[name] = n.attribute("coefficient").value.to_f
26
+ end
27
+
28
+ node.css("CategoricalPredictor").each do |n|
29
+ name = n.attribute("name").value
30
+ coefficients[[name, n.attribute("value").value]] = n.attribute("coefficient").value.to_f
31
+ features[name] = "categorical"
32
+ end
33
+
34
+ Evaluators::LinearRegression.new(coefficients: coefficients, features: features, text_features: text_features)
35
+ end
36
+ end
37
+
38
+ def coefficients
39
+ @evaluator.coefficients
40
+ end
41
+
42
+ def r2
43
+ @r2 ||= (sst - sse) / sst
44
+ end
45
+
46
+ def adjusted_r2
47
+ @adjusted_r2 ||= (mst - mse) / mst
48
+ end
49
+
50
+ private
51
+
52
+ # https://people.richland.edu/james/ictcm/2004/multiple.html
53
+ def _summary(extended: false)
54
+ coefficients = @coefficients
55
+ str = String.new("")
56
+ len = [coefficients.keys.map(&:size).max, 15].max
57
+ if extended
58
+ str += "%-#{len}s %12s %12s %12s %12s\n" % ["", "coef", "stderr", "t", "p"]
59
+ else
60
+ str += "%-#{len}s %12s %12s\n" % ["", "coef", "p"]
61
+ end
62
+ coefficients.each do |k, v|
63
+ if extended
64
+ str += "%-#{len}s %12.2f %12.2f %12.2f %12.3f\n" % [display_field(k), v, std_err[k], t_value[k], p_value[k]]
65
+ else
66
+ str += "%-#{len}s %12.2f %12.3f\n" % [display_field(k), v, p_value[k]]
67
+ end
68
+ end
69
+ str += "\n"
70
+ str += "r2: %.3f\n" % [r2] if extended
71
+ str += "adjusted r2: %.3f\n" % [adjusted_r2]
72
+ str
6
73
  end
7
74
 
8
- def train(*args)
9
- super
75
+ def _train(**options)
76
+ raise "Target must be numeric" if @target_type != "numeric"
77
+ check_missing_value(@train_set)
78
+ check_missing_value(@validation_set) if @validation_set
10
79
 
11
- x, @coefficient_names = prep_x(@x)
80
+ data = prep_x(@train_set)
12
81
 
13
- if x.size <= @coefficient_names.size
14
- raise "Number of samples must be at least two more than number of features"
82
+ if data.size < data.columns.size + 2
83
+ raise "Number of data points must be at least two more than number of features"
15
84
  end
16
85
 
86
+ x = data.map_rows(&:to_a)
87
+ data.size.times do |i|
88
+ # add intercept
89
+ x[i].unshift(1)
90
+ end
91
+
92
+ gsl = options.key?(:gsl) ? options[:gsl] : defined?(GSL)
93
+
17
94
  v3 =
18
- if @gsl
95
+ if gsl
19
96
  x = GSL::Matrix.alloc(*x)
20
- y = GSL::Vector.alloc(@y)
97
+ y = GSL::Vector.alloc(data.label)
21
98
  c, @covariance, _, _ = GSL::MultiFit::linear(x, y)
22
99
  c.to_a
23
100
  else
24
101
  x = Matrix.rows(x)
25
- y = Matrix.column_vector(@y)
102
+ y = Matrix.column_vector(data.label)
26
103
  removed = []
27
104
 
28
105
  # https://statsmaths.github.io/stat612/lectures/lec13/lecture13.pdf
@@ -67,7 +144,10 @@ module Eps
67
144
  end
68
145
  # huge performance boost
69
146
  # by multiplying xt * y first
70
- v2 = matrix_arr(@xtxi * (xt * y))
147
+ v2 = @xtxi * (xt * y)
148
+
149
+ # convert to array
150
+ v2 = v2.to_a.map { |xi| xi[0].to_f }
71
151
 
72
152
  # add back removed
73
153
  removed.sort.each do |i|
@@ -78,198 +158,62 @@ module Eps
78
158
  v2
79
159
  end
80
160
 
161
+ @coefficient_names = ["_intercept"] + data.columns.keys
81
162
  @coefficients = Hash[@coefficient_names.zip(v3)]
163
+ Evaluators::LinearRegression.new(coefficients: @coefficients, features: @features, text_features: @text_features)
82
164
  end
83
165
 
84
- # legacy
85
-
86
- def coefficients
87
- Hash[@coefficients.map { |k, v| [Array(k).join.to_sym, v] }]
88
- end
89
-
90
- # ruby
91
-
92
- def self.load(data)
93
- new(Hash[data.map { |k, v| [k.to_sym, v] }])
94
- end
95
-
96
- def dump
97
- {coefficients: coefficients}
98
- end
99
-
100
- # json
101
-
102
- def self.load_json(data)
103
- data = JSON.parse(data) if data.is_a?(String)
104
- coefficients = data["coefficients"]
105
-
106
- # for R models
107
- if coefficients["(Intercept)"]
108
- coefficients = coefficients.dup
109
- coefficients["_intercept"] = coefficients.delete("(Intercept)")
110
- end
111
-
112
- new(coefficients: coefficients)
113
- end
114
-
115
- def to_json
116
- JSON.generate(dump)
117
- end
118
-
119
- # pmml
120
-
121
- def self.load_pmml(data)
122
- # TODO more validation
123
- node = data.css("RegressionTable")
124
- coefficients = {
125
- _intercept: node.attribute("intercept").value.to_f
126
- }
127
- node.css("NumericPredictor").each do |n|
128
- coefficients[n.attribute("name").value] = n.attribute("coefficient").value.to_f
129
- end
130
- node.css("CategoricalPredictor").each do |n|
131
- coefficients[[n.attribute("name").value.to_sym, n.attribute("value").value]] = n.attribute("coefficient").value.to_f
132
- end
133
- new(coefficients: coefficients)
134
- end
135
-
136
- def to_pmml
137
- predictors = @coefficients.reject { |k| k == :_intercept }
166
+ def generate_pmml
167
+ predictors = @coefficients.dup
168
+ predictors.delete("_intercept")
138
169
 
139
170
  data_fields = {}
140
- predictors.each do |k, v|
141
- if k.is_a?(Array)
142
- (data_fields[k[0]] ||= []) << k[1]
171
+ @features.each do |k, type|
172
+ if type == "categorical"
173
+ data_fields[k] = predictors.keys.select { |k, v| k.is_a?(Array) && k.first == k }.map(&:last)
143
174
  else
144
175
  data_fields[k] = nil
145
176
  end
146
177
  end
147
178
 
148
- builder = Nokogiri::XML::Builder.new do |xml|
149
- xml.PMML(version: "4.3", xmlns: "http://www.dmg.org/PMML-4_3", "xmlns:xsi" => "http://www.w3.org/2001/XMLSchema-instance") do
150
- xml.Header
151
- xml.DataDictionary do
152
- data_fields.each do |k, vs|
153
- if vs
154
- xml.DataField(name: k, optype: "categorical", dataType: "string") do
155
- vs.each do |v|
156
- xml.Value(value: v)
157
- end
158
- end
159
- else
160
- xml.DataField(name: k, optype: "continuous", dataType: "double")
161
- end
179
+ build_pmml(data_fields) do |xml|
180
+ xml.RegressionModel(functionName: "regression") do
181
+ xml.MiningSchema do
182
+ @features.each do |k, _|
183
+ xml.MiningField(name: k)
162
184
  end
163
185
  end
164
- xml.RegressionModel(functionName: "regression") do
165
- xml.MiningSchema do
166
- data_fields.each do |k, _|
167
- xml.MiningField(name: k)
168
- end
169
- end
170
- xml.RegressionTable(intercept: @coefficients[:_intercept]) do
171
- predictors.each do |k, v|
172
- if k.is_a?(Array)
173
- xml.CategoricalPredictor(name: k[0], value: k[1], coefficient: v)
186
+ pmml_local_transformations(xml)
187
+ xml.RegressionTable(intercept: @coefficients["_intercept"]) do
188
+ predictors.each do |k, v|
189
+ if k.is_a?(Array)
190
+ if @features[k.first] == "text"
191
+ xml.NumericPredictor(name: display_field(k), coefficient: v)
174
192
  else
175
- xml.NumericPredictor(name: k, coefficient: v)
193
+ xml.CategoricalPredictor(name: k[0], value: k[1], coefficient: v)
176
194
  end
195
+ else
196
+ xml.NumericPredictor(name: k, coefficient: v)
177
197
  end
178
198
  end
179
199
  end
180
200
  end
181
- end.to_xml
182
- end
183
-
184
- # pfa
185
-
186
- def self.load_pfa(data)
187
- data = JSON.parse(data) if data.is_a?(String)
188
- init = data["cells"].first[1]["init"]
189
- names =
190
- if data["input"]["fields"]
191
- data["input"]["fields"].map { |f| f["name"] }
192
- else
193
- init["coeff"].map.with_index { |_, i| "x#{i}" }
194
- end
195
- coefficients = {
196
- _intercept: init["const"]
197
- }
198
- init["coeff"].each_with_index do |c, i|
199
- name = names[i]
200
- # R can export coefficients with same name
201
- raise "Coefficients with same name" if coefficients[name]
202
- coefficients[name] = c
203
201
  end
204
- new(coefficients: coefficients)
205
- end
206
-
207
- # metrics
208
-
209
- def self.metrics(actual, estimated)
210
- errors = actual.zip(estimated).map { |yi, yi2| yi - yi2 }
211
-
212
- {
213
- me: mean(errors),
214
- mae: mean(errors.map { |v| v.abs }),
215
- rmse: Math.sqrt(mean(errors.map { |v| v**2 }))
216
- }
217
202
  end
218
203
 
219
- # private
220
- def self.mean(arr)
221
- arr.inject(0, &:+) / arr.size.to_f
222
- end
223
-
224
- # https://people.richland.edu/james/ictcm/2004/multiple.html
225
- def summary(extended: false)
226
- coefficients = @coefficients
227
- str = String.new("")
228
- len = [coefficients.keys.map(&:size).max, 15].max
229
- if extended
230
- str += "%-#{len}s %12s %12s %12s %12s\n" % ["", "coef", "stderr", "t", "p"]
231
- else
232
- str += "%-#{len}s %12s %12s\n" % ["", "coef", "p"]
233
- end
234
- coefficients.each do |k, v|
235
- if extended
236
- str += "%-#{len}s %12.2f %12.2f %12.2f %12.3f\n" % [display_field(k), v, std_err[k], t_value[k], p_value[k]]
237
- else
238
- str += "%-#{len}s %12.2f %12.3f\n" % [display_field(k), v, p_value[k]]
204
+ def prep_x(x)
205
+ x = x.dup
206
+ @features.each do |k, type|
207
+ if type == "categorical"
208
+ values = x.columns.delete(k)
209
+ labels = values.uniq[1..-1]
210
+ labels.each do |label|
211
+ x.columns[[k, label]] = values.map { |v| v == label ? 1 : 0 }
212
+ end
239
213
  end
240
214
  end
241
- str += "\n"
242
- str += "r2: %.3f\n" % [r2] if extended
243
- str += "adjusted r2: %.3f\n" % [adjusted_r2]
244
- str
245
- end
246
-
247
- def r2
248
- @r2 ||= (sst - sse) / sst
249
- end
250
-
251
- def adjusted_r2
252
- @adjusted_r2 ||= (mst - mse) / mst
253
- end
254
-
255
- private
256
-
257
- def _predict(x)
258
- x, c = prep_x(x, train: false)
259
- coef = c.map do |v|
260
- # use 0 if coefficient does not exist
261
- # this can happen for categorical features
262
- # since only n-1 coefficients are stored
263
- @coefficients[v] || 0
264
- end
265
-
266
- x = Matrix.rows(x)
267
- c = Matrix.column_vector(coef)
268
- matrix_arr(x * c)
269
- end
270
-
271
- def display_field(k)
272
- k.is_a?(Array) ? k.join("") : k
215
+ prep_text_features(x)
216
+ x
273
217
  end
274
218
 
275
219
  def constant?(arr)
@@ -289,7 +233,7 @@ module Eps
289
233
  if @gsl
290
234
  GSL::Cdf.tdist_P(t_value[k].abs, degrees_of_freedom)
291
235
  else
292
- tdist_p(t_value[k].abs, degrees_of_freedom)
236
+ Eps::Statistics.tdist_p(t_value[k].abs, degrees_of_freedom)
293
237
  end
294
238
 
295
239
  [k, 2 * (1 - tp)]
@@ -322,26 +266,26 @@ module Eps
322
266
  end
323
267
 
324
268
  def y_bar
325
- @y_bar ||= mean(@y)
269
+ @y_bar ||= mean(@train_set.label)
326
270
  end
327
271
 
328
272
  def y_hat
329
- @y_hat ||= predict(@x)
273
+ @y_hat ||= predict(@train_set)
330
274
  end
331
275
 
332
276
  # total sum of squares
333
277
  def sst
334
- @sst ||= @y.map { |y| (y - y_bar)**2 }.sum
278
+ @sst ||= @train_set.label.map { |y| (y - y_bar)**2 }.sum
335
279
  end
336
280
 
337
281
  # sum of squared errors of prediction
338
282
  # not to be confused with "explained sum of squares"
339
283
  def sse
340
- @sse ||= @y.zip(y_hat).map { |y, yh| (y - yh)**2 }.sum
284
+ @sse ||= @train_set.label.zip(y_hat).map { |y, yh| (y - yh)**2 }.sum
341
285
  end
342
286
 
343
287
  def mst
344
- @mst ||= sst / (@y.size - 1)
288
+ @mst ||= sst / (@train_set.size - 1)
345
289
  end
346
290
 
347
291
  def mse
@@ -349,209 +293,11 @@ module Eps
349
293
  end
350
294
 
351
295
  def degrees_of_freedom
352
- @y.size - @coefficients.size
296
+ @train_set.size - @coefficients.size
353
297
  end
354
298
 
355
299
  def mean(arr)
356
300
  arr.sum / arr.size.to_f
357
301
  end
358
-
359
- ### Extracted from https://github.com/estebanz01/ruby-statistics
360
- ### The Ruby author is Esteban Zapata Rojas
361
- ###
362
- ### Originally extracted from https://codeplea.com/incomplete-beta-function-c
363
- ### This function is shared under zlib license and the author is Lewis Van Winkle
364
- def tdist_p(value, degrees_of_freedom)
365
- upper = (value + Math.sqrt(value * value + degrees_of_freedom))
366
- lower = (2.0 * Math.sqrt(value * value + degrees_of_freedom))
367
-
368
- x = upper/lower
369
-
370
- alpha = degrees_of_freedom/2.0
371
- beta = degrees_of_freedom/2.0
372
-
373
- incomplete_beta_function(x, alpha, beta)
374
- end
375
-
376
- ### Extracted from https://github.com/estebanz01/ruby-statistics
377
- ### The Ruby author is Esteban Zapata Rojas
378
- ###
379
- ### This implementation is an adaptation of the incomplete beta function made in C by
380
- ### Lewis Van Winkle, which released the code under the zlib license.
381
- ### The whole math behind this code is described in the following post: https://codeplea.com/incomplete-beta-function-c
382
- def incomplete_beta_function(x, alp, bet)
383
- return if x < 0.0
384
- return 1.0 if x > 1.0
385
-
386
- tiny = 1.0E-50
387
-
388
- if x > ((alp + 1.0)/(alp + bet + 2.0))
389
- return 1.0 - incomplete_beta_function(1.0 - x, bet, alp)
390
- end
391
-
392
- # To avoid overflow problems, the implementation applies the logarithm properties
393
- # to calculate in a faster and safer way the values.
394
- lbet_ab = (Math.lgamma(alp)[0] + Math.lgamma(bet)[0] - Math.lgamma(alp + bet)[0]).freeze
395
- front = (Math.exp(Math.log(x) * alp + Math.log(1.0 - x) * bet - lbet_ab) / alp.to_f).freeze
396
-
397
- # This is the non-log version of the left part of the formula (before the continuous fraction)
398
- # down_left = alp * self.beta_function(alp, bet)
399
- # upper_left = (x ** alp) * ((1.0 - x) ** bet)
400
- # front = upper_left/down_left
401
-
402
- f, c, d = 1.0, 1.0, 0.0
403
-
404
- returned_value = nil
405
-
406
- # Let's do more iterations than the proposed implementation (200 iters)
407
- (0..500).each do |number|
408
- m = number/2
409
-
410
- numerator = if number == 0
411
- 1.0
412
- elsif number % 2 == 0
413
- (m * (bet - m) * x)/((alp + 2.0 * m - 1.0)* (alp + 2.0 * m))
414
- else
415
- top = -((alp + m) * (alp + bet + m) * x)
416
- down = ((alp + 2.0 * m) * (alp + 2.0 * m + 1.0))
417
-
418
- top/down
419
- end
420
-
421
- d = 1.0 + numerator * d
422
- d = tiny if d.abs < tiny
423
- d = 1.0 / d
424
-
425
- c = 1.0 + numerator / c
426
- c = tiny if c.abs < tiny
427
-
428
- cd = (c*d).freeze
429
- f = f * cd
430
-
431
- if (1.0 - cd).abs < 1.0E-10
432
- returned_value = front * (f - 1.0)
433
- break
434
- end
435
- end
436
-
437
- returned_value
438
- end
439
-
440
- def prep_x(x, train: true)
441
- coefficients = @coefficients
442
-
443
- if daru?(x)
444
- x = x.to_a[0]
445
- else
446
- x = x.map do |xi|
447
- case xi
448
- when Hash
449
- xi
450
- when Array
451
- Hash[xi.map.with_index { |v, i| [:"x#{i}", v] }]
452
- else
453
- {x0: xi}
454
- end
455
- end
456
- end
457
-
458
- # get column types
459
- if train
460
- column_types = {}
461
- if x.any?
462
- row = x.first
463
- row.each do |k, v|
464
- column_types[k] = categorical?(v) ? "categorical" : "numeric"
465
- end
466
- end
467
- else
468
- # get column types for prediction
469
- column_types = {}
470
- coefficients.each do |k, v|
471
- next if k == :_intercept
472
- if k.is_a?(Array)
473
- column_types[k.first] = "categorical"
474
- else
475
- column_types[k] = "numeric"
476
- end
477
- end
478
- end
479
-
480
- # if !train && x.any?
481
- # # check first row against coefficients
482
- # ckeys = coefficients.keys.map(&:to_s)
483
- # bad_keys = x[0].keys.map(&:to_s).reject { |k| ckeys.any? { |c| c.start_with?(k) } }
484
- # raise "Unknown keys: #{bad_keys.join(", ")}" if bad_keys.any?
485
- # end
486
-
487
- supports_categorical = train || coefficients.any? { |k, _| k.is_a?(Array) }
488
-
489
- cache = {}
490
- first_key = {}
491
- i = 0
492
- rows = []
493
- x.each do |xi|
494
- row = {}
495
- xi.each do |k, v|
496
- categorical = column_types[k.to_sym] == "categorical" || (!supports_categorical && categorical?(v))
497
-
498
- key = categorical ? [k.to_sym, v.to_s] : k.to_sym
499
- v2 = categorical ? 1 : v
500
-
501
- # TODO make more efficient
502
- check_key = supports_categorical ? key : symbolize_coef(key)
503
- next if !train && !coefficients.key?(check_key)
504
-
505
- raise "Missing data" if v2.nil?
506
-
507
- unless cache[key]
508
- cache[key] = i
509
- first_key[k] ||= key if categorical
510
- i += 1
511
- end
512
-
513
- row[key] = v2
514
- end
515
- rows << row
516
- end
517
-
518
- if train
519
- # remove one degree of freedom
520
- first_key.values.each do |v|
521
- num = cache.delete(v)
522
- cache.each do |k, v2|
523
- cache[k] -= 1 if v2 > num
524
- end
525
- end
526
- end
527
-
528
- ret2 = []
529
- rows.each do |row|
530
- ret = [0] * cache.size
531
- row.each do |k, v|
532
- if cache[k]
533
- ret[cache[k]] = v
534
- end
535
- end
536
- ret2 << ([1] + ret)
537
- end
538
-
539
- # flatten keys
540
- c = [:_intercept] + cache.sort_by { |_, v| v }.map(&:first)
541
-
542
- unless supports_categorical
543
- c = c.map { |v| symbolize_coef(v) }
544
- end
545
-
546
- [ret2, c]
547
- end
548
-
549
- def symbolize_coef(k)
550
- (k.is_a?(Array) ? k.join("") : k).to_sym
551
- end
552
-
553
- def matrix_arr(matrix)
554
- matrix.to_a.map { |xi| xi[0].to_f }
555
- end
556
302
  end
557
303
  end