eps 0.2.1 → 0.3.0

This diff shows the changes between two publicly released versions of this package, as published to one of the supported registries. It is provided for informational purposes only and reflects the package contents exactly as they appear in the respective public registry.
@@ -1,28 +1,105 @@
1
1
  module Eps
2
2
  class LinearRegression < BaseEstimator
3
- def initialize(coefficients: nil, gsl: nil)
4
- @coefficients = Hash[coefficients.map { |k, v| [k.is_a?(Array) ? [k[0].to_sym, k[1]] : k.to_sym, v] }] if coefficients
5
- @gsl = gsl.nil? ? defined?(GSL) : gsl
3
+ # pmml
4
+
5
+ def self.load_pmml(data)
6
+ super do |data|
7
+ # TODO more validation
8
+ node = data.css("RegressionTable")
9
+
10
+ coefficients = {
11
+ "_intercept" => node.attribute("intercept").value.to_f
12
+ }
13
+
14
+ features = {}
15
+
16
+ text_features, derived_fields = extract_text_features(data, features)
17
+
18
+ node.css("NumericPredictor").each do |n|
19
+ name = n.attribute("name").value
20
+ if derived_fields[name]
21
+ name = derived_fields[name]
22
+ else
23
+ features[name] = "numeric"
24
+ end
25
+ coefficients[name] = n.attribute("coefficient").value.to_f
26
+ end
27
+
28
+ node.css("CategoricalPredictor").each do |n|
29
+ name = n.attribute("name").value
30
+ coefficients[[name, n.attribute("value").value]] = n.attribute("coefficient").value.to_f
31
+ features[name] = "categorical"
32
+ end
33
+
34
+ Evaluators::LinearRegression.new(coefficients: coefficients, features: features, text_features: text_features)
35
+ end
36
+ end
37
+
38
+ def coefficients
39
+ @evaluator.coefficients
40
+ end
41
+
42
+ def r2
43
+ @r2 ||= (sst - sse) / sst
44
+ end
45
+
46
+ def adjusted_r2
47
+ @adjusted_r2 ||= (mst - mse) / mst
48
+ end
49
+
50
+ private
51
+
52
+ # https://people.richland.edu/james/ictcm/2004/multiple.html
53
+ def _summary(extended: false)
54
+ coefficients = @coefficients
55
+ str = String.new("")
56
+ len = [coefficients.keys.map(&:size).max, 15].max
57
+ if extended
58
+ str += "%-#{len}s %12s %12s %12s %12s\n" % ["", "coef", "stderr", "t", "p"]
59
+ else
60
+ str += "%-#{len}s %12s %12s\n" % ["", "coef", "p"]
61
+ end
62
+ coefficients.each do |k, v|
63
+ if extended
64
+ str += "%-#{len}s %12.2f %12.2f %12.2f %12.3f\n" % [display_field(k), v, std_err[k], t_value[k], p_value[k]]
65
+ else
66
+ str += "%-#{len}s %12.2f %12.3f\n" % [display_field(k), v, p_value[k]]
67
+ end
68
+ end
69
+ str += "\n"
70
+ str += "r2: %.3f\n" % [r2] if extended
71
+ str += "adjusted r2: %.3f\n" % [adjusted_r2]
72
+ str
6
73
  end
7
74
 
8
- def train(*args)
9
- super
75
+ def _train(**options)
76
+ raise "Target must be numeric" if @target_type != "numeric"
77
+ check_missing_value(@train_set)
78
+ check_missing_value(@validation_set) if @validation_set
10
79
 
11
- x, @coefficient_names = prep_x(@x)
80
+ data = prep_x(@train_set)
12
81
 
13
- if x.size <= @coefficient_names.size
14
- raise "Number of samples must be at least two more than number of features"
82
+ if data.size < data.columns.size + 2
83
+ raise "Number of data points must be at least two more than number of features"
15
84
  end
16
85
 
86
+ x = data.map_rows(&:to_a)
87
+ data.size.times do |i|
88
+ # add intercept
89
+ x[i].unshift(1)
90
+ end
91
+
92
+ gsl = options.key?(:gsl) ? options[:gsl] : defined?(GSL)
93
+
17
94
  v3 =
18
- if @gsl
95
+ if gsl
19
96
  x = GSL::Matrix.alloc(*x)
20
- y = GSL::Vector.alloc(@y)
97
+ y = GSL::Vector.alloc(data.label)
21
98
  c, @covariance, _, _ = GSL::MultiFit::linear(x, y)
22
99
  c.to_a
23
100
  else
24
101
  x = Matrix.rows(x)
25
- y = Matrix.column_vector(@y)
102
+ y = Matrix.column_vector(data.label)
26
103
  removed = []
27
104
 
28
105
  # https://statsmaths.github.io/stat612/lectures/lec13/lecture13.pdf
@@ -67,7 +144,10 @@ module Eps
67
144
  end
68
145
  # huge performance boost
69
146
  # by multiplying xt * y first
70
- v2 = matrix_arr(@xtxi * (xt * y))
147
+ v2 = @xtxi * (xt * y)
148
+
149
+ # convert to array
150
+ v2 = v2.to_a.map { |xi| xi[0].to_f }
71
151
 
72
152
  # add back removed
73
153
  removed.sort.each do |i|
@@ -78,198 +158,62 @@ module Eps
78
158
  v2
79
159
  end
80
160
 
161
+ @coefficient_names = ["_intercept"] + data.columns.keys
81
162
  @coefficients = Hash[@coefficient_names.zip(v3)]
163
+ Evaluators::LinearRegression.new(coefficients: @coefficients, features: @features, text_features: @text_features)
82
164
  end
83
165
 
84
- # legacy
85
-
86
- def coefficients
87
- Hash[@coefficients.map { |k, v| [Array(k).join.to_sym, v] }]
88
- end
89
-
90
- # ruby
91
-
92
- def self.load(data)
93
- new(Hash[data.map { |k, v| [k.to_sym, v] }])
94
- end
95
-
96
- def dump
97
- {coefficients: coefficients}
98
- end
99
-
100
- # json
101
-
102
- def self.load_json(data)
103
- data = JSON.parse(data) if data.is_a?(String)
104
- coefficients = data["coefficients"]
105
-
106
- # for R models
107
- if coefficients["(Intercept)"]
108
- coefficients = coefficients.dup
109
- coefficients["_intercept"] = coefficients.delete("(Intercept)")
110
- end
111
-
112
- new(coefficients: coefficients)
113
- end
114
-
115
- def to_json
116
- JSON.generate(dump)
117
- end
118
-
119
- # pmml
120
-
121
- def self.load_pmml(data)
122
- # TODO more validation
123
- node = data.css("RegressionTable")
124
- coefficients = {
125
- _intercept: node.attribute("intercept").value.to_f
126
- }
127
- node.css("NumericPredictor").each do |n|
128
- coefficients[n.attribute("name").value] = n.attribute("coefficient").value.to_f
129
- end
130
- node.css("CategoricalPredictor").each do |n|
131
- coefficients[[n.attribute("name").value.to_sym, n.attribute("value").value]] = n.attribute("coefficient").value.to_f
132
- end
133
- new(coefficients: coefficients)
134
- end
135
-
136
- def to_pmml
137
- predictors = @coefficients.reject { |k| k == :_intercept }
166
+ def generate_pmml
167
+ predictors = @coefficients.dup
168
+ predictors.delete("_intercept")
138
169
 
139
170
  data_fields = {}
140
- predictors.each do |k, v|
141
- if k.is_a?(Array)
142
- (data_fields[k[0]] ||= []) << k[1]
171
+ @features.each do |k, type|
172
+ if type == "categorical"
173
+ data_fields[k] = predictors.keys.select { |k, v| k.is_a?(Array) && k.first == k }.map(&:last)
143
174
  else
144
175
  data_fields[k] = nil
145
176
  end
146
177
  end
147
178
 
148
- builder = Nokogiri::XML::Builder.new do |xml|
149
- xml.PMML(version: "4.3", xmlns: "http://www.dmg.org/PMML-4_3", "xmlns:xsi" => "http://www.w3.org/2001/XMLSchema-instance") do
150
- xml.Header
151
- xml.DataDictionary do
152
- data_fields.each do |k, vs|
153
- if vs
154
- xml.DataField(name: k, optype: "categorical", dataType: "string") do
155
- vs.each do |v|
156
- xml.Value(value: v)
157
- end
158
- end
159
- else
160
- xml.DataField(name: k, optype: "continuous", dataType: "double")
161
- end
179
+ build_pmml(data_fields) do |xml|
180
+ xml.RegressionModel(functionName: "regression") do
181
+ xml.MiningSchema do
182
+ @features.each do |k, _|
183
+ xml.MiningField(name: k)
162
184
  end
163
185
  end
164
- xml.RegressionModel(functionName: "regression") do
165
- xml.MiningSchema do
166
- data_fields.each do |k, _|
167
- xml.MiningField(name: k)
168
- end
169
- end
170
- xml.RegressionTable(intercept: @coefficients[:_intercept]) do
171
- predictors.each do |k, v|
172
- if k.is_a?(Array)
173
- xml.CategoricalPredictor(name: k[0], value: k[1], coefficient: v)
186
+ pmml_local_transformations(xml)
187
+ xml.RegressionTable(intercept: @coefficients["_intercept"]) do
188
+ predictors.each do |k, v|
189
+ if k.is_a?(Array)
190
+ if @features[k.first] == "text"
191
+ xml.NumericPredictor(name: display_field(k), coefficient: v)
174
192
  else
175
- xml.NumericPredictor(name: k, coefficient: v)
193
+ xml.CategoricalPredictor(name: k[0], value: k[1], coefficient: v)
176
194
  end
195
+ else
196
+ xml.NumericPredictor(name: k, coefficient: v)
177
197
  end
178
198
  end
179
199
  end
180
200
  end
181
- end.to_xml
182
- end
183
-
184
- # pfa
185
-
186
- def self.load_pfa(data)
187
- data = JSON.parse(data) if data.is_a?(String)
188
- init = data["cells"].first[1]["init"]
189
- names =
190
- if data["input"]["fields"]
191
- data["input"]["fields"].map { |f| f["name"] }
192
- else
193
- init["coeff"].map.with_index { |_, i| "x#{i}" }
194
- end
195
- coefficients = {
196
- _intercept: init["const"]
197
- }
198
- init["coeff"].each_with_index do |c, i|
199
- name = names[i]
200
- # R can export coefficients with same name
201
- raise "Coefficients with same name" if coefficients[name]
202
- coefficients[name] = c
203
201
  end
204
- new(coefficients: coefficients)
205
- end
206
-
207
- # metrics
208
-
209
- def self.metrics(actual, estimated)
210
- errors = actual.zip(estimated).map { |yi, yi2| yi - yi2 }
211
-
212
- {
213
- me: mean(errors),
214
- mae: mean(errors.map { |v| v.abs }),
215
- rmse: Math.sqrt(mean(errors.map { |v| v**2 }))
216
- }
217
202
  end
218
203
 
219
- # private
220
- def self.mean(arr)
221
- arr.inject(0, &:+) / arr.size.to_f
222
- end
223
-
224
- # https://people.richland.edu/james/ictcm/2004/multiple.html
225
- def summary(extended: false)
226
- coefficients = @coefficients
227
- str = String.new("")
228
- len = [coefficients.keys.map(&:size).max, 15].max
229
- if extended
230
- str += "%-#{len}s %12s %12s %12s %12s\n" % ["", "coef", "stderr", "t", "p"]
231
- else
232
- str += "%-#{len}s %12s %12s\n" % ["", "coef", "p"]
233
- end
234
- coefficients.each do |k, v|
235
- if extended
236
- str += "%-#{len}s %12.2f %12.2f %12.2f %12.3f\n" % [display_field(k), v, std_err[k], t_value[k], p_value[k]]
237
- else
238
- str += "%-#{len}s %12.2f %12.3f\n" % [display_field(k), v, p_value[k]]
204
+ def prep_x(x)
205
+ x = x.dup
206
+ @features.each do |k, type|
207
+ if type == "categorical"
208
+ values = x.columns.delete(k)
209
+ labels = values.uniq[1..-1]
210
+ labels.each do |label|
211
+ x.columns[[k, label]] = values.map { |v| v == label ? 1 : 0 }
212
+ end
239
213
  end
240
214
  end
241
- str += "\n"
242
- str += "r2: %.3f\n" % [r2] if extended
243
- str += "adjusted r2: %.3f\n" % [adjusted_r2]
244
- str
245
- end
246
-
247
- def r2
248
- @r2 ||= (sst - sse) / sst
249
- end
250
-
251
- def adjusted_r2
252
- @adjusted_r2 ||= (mst - mse) / mst
253
- end
254
-
255
- private
256
-
257
- def _predict(x)
258
- x, c = prep_x(x, train: false)
259
- coef = c.map do |v|
260
- # use 0 if coefficient does not exist
261
- # this can happen for categorical features
262
- # since only n-1 coefficients are stored
263
- @coefficients[v] || 0
264
- end
265
-
266
- x = Matrix.rows(x)
267
- c = Matrix.column_vector(coef)
268
- matrix_arr(x * c)
269
- end
270
-
271
- def display_field(k)
272
- k.is_a?(Array) ? k.join("") : k
215
+ prep_text_features(x)
216
+ x
273
217
  end
274
218
 
275
219
  def constant?(arr)
@@ -289,7 +233,7 @@ module Eps
289
233
  if @gsl
290
234
  GSL::Cdf.tdist_P(t_value[k].abs, degrees_of_freedom)
291
235
  else
292
- tdist_p(t_value[k].abs, degrees_of_freedom)
236
+ Eps::Statistics.tdist_p(t_value[k].abs, degrees_of_freedom)
293
237
  end
294
238
 
295
239
  [k, 2 * (1 - tp)]
@@ -322,26 +266,26 @@ module Eps
322
266
  end
323
267
 
324
268
  def y_bar
325
- @y_bar ||= mean(@y)
269
+ @y_bar ||= mean(@train_set.label)
326
270
  end
327
271
 
328
272
  def y_hat
329
- @y_hat ||= predict(@x)
273
+ @y_hat ||= predict(@train_set)
330
274
  end
331
275
 
332
276
  # total sum of squares
333
277
  def sst
334
- @sst ||= @y.map { |y| (y - y_bar)**2 }.sum
278
+ @sst ||= @train_set.label.map { |y| (y - y_bar)**2 }.sum
335
279
  end
336
280
 
337
281
  # sum of squared errors of prediction
338
282
  # not to be confused with "explained sum of squares"
339
283
  def sse
340
- @sse ||= @y.zip(y_hat).map { |y, yh| (y - yh)**2 }.sum
284
+ @sse ||= @train_set.label.zip(y_hat).map { |y, yh| (y - yh)**2 }.sum
341
285
  end
342
286
 
343
287
  def mst
344
- @mst ||= sst / (@y.size - 1)
288
+ @mst ||= sst / (@train_set.size - 1)
345
289
  end
346
290
 
347
291
  def mse
@@ -349,209 +293,11 @@ module Eps
349
293
  end
350
294
 
351
295
  def degrees_of_freedom
352
- @y.size - @coefficients.size
296
+ @train_set.size - @coefficients.size
353
297
  end
354
298
 
355
299
  def mean(arr)
356
300
  arr.sum / arr.size.to_f
357
301
  end
358
-
359
- ### Extracted from https://github.com/estebanz01/ruby-statistics
360
- ### The Ruby author is Esteban Zapata Rojas
361
- ###
362
- ### Originally extracted from https://codeplea.com/incomplete-beta-function-c
363
- ### This function is shared under zlib license and the author is Lewis Van Winkle
364
- def tdist_p(value, degrees_of_freedom)
365
- upper = (value + Math.sqrt(value * value + degrees_of_freedom))
366
- lower = (2.0 * Math.sqrt(value * value + degrees_of_freedom))
367
-
368
- x = upper/lower
369
-
370
- alpha = degrees_of_freedom/2.0
371
- beta = degrees_of_freedom/2.0
372
-
373
- incomplete_beta_function(x, alpha, beta)
374
- end
375
-
376
- ### Extracted from https://github.com/estebanz01/ruby-statistics
377
- ### The Ruby author is Esteban Zapata Rojas
378
- ###
379
- ### This implementation is an adaptation of the incomplete beta function made in C by
380
- ### Lewis Van Winkle, which released the code under the zlib license.
381
- ### The whole math behind this code is described in the following post: https://codeplea.com/incomplete-beta-function-c
382
- def incomplete_beta_function(x, alp, bet)
383
- return if x < 0.0
384
- return 1.0 if x > 1.0
385
-
386
- tiny = 1.0E-50
387
-
388
- if x > ((alp + 1.0)/(alp + bet + 2.0))
389
- return 1.0 - incomplete_beta_function(1.0 - x, bet, alp)
390
- end
391
-
392
- # To avoid overflow problems, the implementation applies the logarithm properties
393
- # to calculate in a faster and safer way the values.
394
- lbet_ab = (Math.lgamma(alp)[0] + Math.lgamma(bet)[0] - Math.lgamma(alp + bet)[0]).freeze
395
- front = (Math.exp(Math.log(x) * alp + Math.log(1.0 - x) * bet - lbet_ab) / alp.to_f).freeze
396
-
397
- # This is the non-log version of the left part of the formula (before the continuous fraction)
398
- # down_left = alp * self.beta_function(alp, bet)
399
- # upper_left = (x ** alp) * ((1.0 - x) ** bet)
400
- # front = upper_left/down_left
401
-
402
- f, c, d = 1.0, 1.0, 0.0
403
-
404
- returned_value = nil
405
-
406
- # Let's do more iterations than the proposed implementation (200 iters)
407
- (0..500).each do |number|
408
- m = number/2
409
-
410
- numerator = if number == 0
411
- 1.0
412
- elsif number % 2 == 0
413
- (m * (bet - m) * x)/((alp + 2.0 * m - 1.0)* (alp + 2.0 * m))
414
- else
415
- top = -((alp + m) * (alp + bet + m) * x)
416
- down = ((alp + 2.0 * m) * (alp + 2.0 * m + 1.0))
417
-
418
- top/down
419
- end
420
-
421
- d = 1.0 + numerator * d
422
- d = tiny if d.abs < tiny
423
- d = 1.0 / d
424
-
425
- c = 1.0 + numerator / c
426
- c = tiny if c.abs < tiny
427
-
428
- cd = (c*d).freeze
429
- f = f * cd
430
-
431
- if (1.0 - cd).abs < 1.0E-10
432
- returned_value = front * (f - 1.0)
433
- break
434
- end
435
- end
436
-
437
- returned_value
438
- end
439
-
440
- def prep_x(x, train: true)
441
- coefficients = @coefficients
442
-
443
- if daru?(x)
444
- x = x.to_a[0]
445
- else
446
- x = x.map do |xi|
447
- case xi
448
- when Hash
449
- xi
450
- when Array
451
- Hash[xi.map.with_index { |v, i| [:"x#{i}", v] }]
452
- else
453
- {x0: xi}
454
- end
455
- end
456
- end
457
-
458
- # get column types
459
- if train
460
- column_types = {}
461
- if x.any?
462
- row = x.first
463
- row.each do |k, v|
464
- column_types[k] = categorical?(v) ? "categorical" : "numeric"
465
- end
466
- end
467
- else
468
- # get column types for prediction
469
- column_types = {}
470
- coefficients.each do |k, v|
471
- next if k == :_intercept
472
- if k.is_a?(Array)
473
- column_types[k.first] = "categorical"
474
- else
475
- column_types[k] = "numeric"
476
- end
477
- end
478
- end
479
-
480
- # if !train && x.any?
481
- # # check first row against coefficients
482
- # ckeys = coefficients.keys.map(&:to_s)
483
- # bad_keys = x[0].keys.map(&:to_s).reject { |k| ckeys.any? { |c| c.start_with?(k) } }
484
- # raise "Unknown keys: #{bad_keys.join(", ")}" if bad_keys.any?
485
- # end
486
-
487
- supports_categorical = train || coefficients.any? { |k, _| k.is_a?(Array) }
488
-
489
- cache = {}
490
- first_key = {}
491
- i = 0
492
- rows = []
493
- x.each do |xi|
494
- row = {}
495
- xi.each do |k, v|
496
- categorical = column_types[k.to_sym] == "categorical" || (!supports_categorical && categorical?(v))
497
-
498
- key = categorical ? [k.to_sym, v.to_s] : k.to_sym
499
- v2 = categorical ? 1 : v
500
-
501
- # TODO make more efficient
502
- check_key = supports_categorical ? key : symbolize_coef(key)
503
- next if !train && !coefficients.key?(check_key)
504
-
505
- raise "Missing data" if v2.nil?
506
-
507
- unless cache[key]
508
- cache[key] = i
509
- first_key[k] ||= key if categorical
510
- i += 1
511
- end
512
-
513
- row[key] = v2
514
- end
515
- rows << row
516
- end
517
-
518
- if train
519
- # remove one degree of freedom
520
- first_key.values.each do |v|
521
- num = cache.delete(v)
522
- cache.each do |k, v2|
523
- cache[k] -= 1 if v2 > num
524
- end
525
- end
526
- end
527
-
528
- ret2 = []
529
- rows.each do |row|
530
- ret = [0] * cache.size
531
- row.each do |k, v|
532
- if cache[k]
533
- ret[cache[k]] = v
534
- end
535
- end
536
- ret2 << ([1] + ret)
537
- end
538
-
539
- # flatten keys
540
- c = [:_intercept] + cache.sort_by { |_, v| v }.map(&:first)
541
-
542
- unless supports_categorical
543
- c = c.map { |v| symbolize_coef(v) }
544
- end
545
-
546
- [ret2, c]
547
- end
548
-
549
- def symbolize_coef(k)
550
- (k.is_a?(Array) ? k.join("") : k).to_sym
551
- end
552
-
553
- def matrix_arr(matrix)
554
- matrix.to_a.map { |xi| xi[0].to_f }
555
- end
556
302
  end
557
303
  end