eps 0.3.3 → 0.3.4

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 8133bd3887423fb41421aa2a4270aa0c7fc75b741ea60a6c55fb97308f3ddea4
4
- data.tar.gz: '018283d9934459202f8395b4c3f4ba201894296e3cfa185bb884cc5b73981f0a'
3
+ metadata.gz: e1f8c1d5af8015ba568528bce2c928cf59c97bda5c4f91a6ace0af72a9a864da
4
+ data.tar.gz: 3cbb6dfca84687c833051147a0fcad16cd258dd09f48a93da61f051c2081f62c
5
5
  SHA512:
6
- metadata.gz: 2e1439f4a9a268a0434dc926a68822731db9267c746d4c76fa43a8debdbc49c25a502ff2051254fbf3453edb33141d35c02fa067afcddf2761e92b96e9d85751
7
- data.tar.gz: 0c87d327d5f8083349cc75ea6b6e725e15ad20ac2535dc37dbc6b7955e6eca4408db93140287e8a9bfa7cee9da11956eb018600851bb07072c9e4347978d89dc
6
+ metadata.gz: 83477bb53b14a04798ab85f2efc9d4bb3af3195bd100ca42d601d2dbb2bc7fa09e7886f414e5a1e3041128bd671e3d15e39e071fe8de9d8d19dacc4c7e702cfc
7
+ data.tar.gz: 1914eeb9509916e9b4eb530f56dab0bfa73683a9197e157c8030fc11c96efca983a0d0f257d2845d890c9772ccfb6550651fb9fba60d1671db914c51b3675fc8
data/CHANGELOG.md CHANGED
@@ -1,3 +1,7 @@
1
+ ## 0.3.4 (2020-04-05)
2
+
3
+ - Added `predict_probability` for classification
4
+
1
5
  ## 0.3.3 (2020-02-24)
2
6
 
3
7
  - Fixed errors and incorrect predictions with boolean columns
data/README.md CHANGED
@@ -4,7 +4,6 @@ Machine learning for Ruby
4
4
 
5
5
  - Build predictive models quickly and easily
6
6
  - Serve models built in Ruby, Python, R, and more
7
- - No prior knowledge of machine learning required :tada:
8
7
 
9
8
  Check out [this post](https://ankane.org/rails-meet-data-science) for more info on machine learning with Rails
10
9
 
@@ -379,6 +378,16 @@ By default, an intercept is included. Disable this with:
379
378
  Eps::Model.new(data, intercept: false)
380
379
  ```
381
380
 
381
+ ## Probability
382
+
383
+ To get the probability of each category for predictions with classification, use:
384
+
385
+ ```ruby
386
+ model.predict_probability(data)
387
+ ```
388
+
389
+ Naive Bayes is known to produce poor probability estimates, so stick with LightGBM if you need this.
390
+
382
391
  ## Validation Options
383
392
 
384
393
  Pass your own validation set with:
@@ -414,7 +423,7 @@ The database is another place you can store models. It’s good if you retrain m
414
423
  Create an ActiveRecord model to store the predictive model.
415
424
 
416
425
  ```sh
417
- rails g model Model key:string:uniq data:text
426
+ rails generate model Model key:string:uniq data:text
418
427
  ```
419
428
 
420
429
  Store the model with:
@@ -524,11 +533,11 @@ Everyone is encouraged to help improve this project. Here are a few ways you can
524
533
  - Write, clarify, or fix documentation
525
534
  - Suggest or add new features
526
535
 
527
- To get started with development and testing:
536
+ To get started with development:
528
537
 
529
538
  ```sh
530
539
  git clone https://github.com/ankane/eps.git
531
540
  cd eps
532
541
  bundle install
533
- rake test
542
+ bundle exec rake test
534
543
  ```
@@ -9,27 +9,11 @@ module Eps
9
9
  end
10
10
 
11
11
  def predict(data)
12
- singular = data.is_a?(Hash)
13
- data = [data] if singular
14
-
15
- data = Eps::DataFrame.new(data)
16
-
17
- @evaluator.features.each do |k, type|
18
- values = data.columns[k]
19
- raise ArgumentError, "Missing column: #{k}" if !values
20
- column_type = Utils.column_type(values.compact, k) if values
21
-
22
- if !column_type.nil?
23
- if (type == "numeric" && column_type != "numeric") || (type != "numeric" && column_type != "categorical")
24
- raise ArgumentError, "Bad type for column #{k}: Expected #{type} but got #{column_type}"
25
- end
26
- end
27
- # TODO check for unknown values for categorical features
28
- end
29
-
30
- predictions = @evaluator.predict(data)
12
+ _predict(data, false)
13
+ end
31
14
 
32
- singular ? predictions.first : predictions
15
+ def predict_probability(data)
16
+ _predict(data, true)
33
17
  end
34
18
 
35
19
  def evaluate(data, y = nil, target: nil, weight: nil)
@@ -75,6 +59,30 @@ module Eps
75
59
 
76
60
  private
77
61
 
62
+ def _predict(data, probabilities)
63
+ singular = data.is_a?(Hash)
64
+ data = [data] if singular
65
+
66
+ data = Eps::DataFrame.new(data)
67
+
68
+ @evaluator.features.each do |k, type|
69
+ values = data.columns[k]
70
+ raise ArgumentError, "Missing column: #{k}" if !values
71
+ column_type = Utils.column_type(values.compact, k) if values
72
+
73
+ if !column_type.nil?
74
+ if (type == "numeric" && column_type != "numeric") || (type != "numeric" && column_type != "categorical")
75
+ raise ArgumentError, "Bad type for column #{k}: Expected #{type} but got #{column_type}"
76
+ end
77
+ end
78
+ # TODO check for unknown values for categorical features
79
+ end
80
+
81
+ predictions = @evaluator.predict(data, probabilities: probabilities)
82
+
83
+ singular ? predictions.first : predictions
84
+ end
85
+
78
86
  def train(data, y = nil, target: nil, weight: nil, split: nil, validation_set: nil, verbose: nil, text_features: nil, early_stopping: nil)
79
87
  data, @target = prep_data(data, y, target, weight)
80
88
  @target_type = Utils.column_type(data.label, @target)
@@ -11,7 +11,9 @@ module Eps
11
11
  @text_features = text_features
12
12
  end
13
13
 
14
- def predict(data)
14
+ def predict(data, probabilities: false)
15
+ raise "Probabilities not supported" if probabilities && @objective == "regression"
16
+
15
17
  rows = data.map(&:to_h)
16
18
 
17
19
  # sparse matrix
@@ -38,7 +40,12 @@ module Eps
38
40
  when "regression"
39
41
  sum_trees(rows, @trees)
40
42
  when "binary"
41
- sum_trees(rows, @trees).map { |s| @labels[sigmoid(s) > 0.5 ? 1 : 0] }
43
+ prob = sum_trees(rows, @trees).map { |s| sigmoid(s) }
44
+ if probabilities
45
+ prob.map { |v| @labels.zip([1 - v, v]).to_h }
46
+ else
47
+ prob.map { |v| @labels[v > 0.5 ? 1 : 0] }
48
+ end
42
49
  else
43
50
  tree_scores = []
44
51
  num_trees = @trees.size / @labels.size
@@ -47,8 +54,14 @@ module Eps
47
54
  end
48
55
  data.size.times.map do |i|
49
56
  v = tree_scores.map { |s| s[i] }
50
- idx = v.map.with_index.max_by { |v2, _| v2 }.last
51
- @labels[idx]
57
+ if probabilities
58
+ exp = v.map { |vi| Math.exp(vi) }
59
+ sum = exp.sum
60
+ @labels.zip(exp.map { |e| e / sum }).to_h
61
+ else
62
+ idx = v.map.with_index.max_by { |v2, _| v2 }.last
63
+ @labels[idx]
64
+ end
52
65
  end
53
66
  end
54
67
  end
@@ -109,7 +122,7 @@ module Eps
109
122
  end
110
123
 
111
124
  def sigmoid(x)
112
- 1.0 / (1 + Math::E**(-x))
125
+ 1.0 / (1 + Math.exp(-x))
113
126
  end
114
127
  end
115
128
  end
@@ -9,7 +9,9 @@ module Eps
9
9
  @text_features = text_features || {}
10
10
  end
11
11
 
12
- def predict(x)
12
+ def predict(x, probabilities: false)
13
+ raise "Probabilities not supported" if probabilities
14
+
13
15
  intercept = @coefficients["_intercept"] || 0.0
14
16
  scores = [intercept] * x.size
15
17
 
@@ -10,14 +10,15 @@ module Eps
10
10
  @legacy = legacy
11
11
  end
12
12
 
13
- def predict(x)
13
+ def predict(x, probabilities: false)
14
14
  probs = calculate_class_probabilities(x)
15
15
  probs.map do |xp|
16
- # convert probabilities
17
- # not needed when just returning label
18
- # sum = xp.values.map { |v| Math.exp(v) }.sum.to_f
19
- # p xp.map { |k, v| [k, Math.exp(v) / sum] }.to_h
20
- xp.sort_by { |k, v| [-v, k] }[0][0]
16
+ if probabilities
17
+ sum = xp.values.map { |v| Math.exp(v) }.sum.to_f
18
+ xp.map { |k, v| [k, Math.exp(v) / sum] }.to_h
19
+ else
20
+ xp.sort_by { |k, v| [-v, k] }[0][0]
21
+ end
21
22
  end
22
23
  end
23
24
 
data/lib/eps/lightgbm.rb CHANGED
@@ -121,18 +121,20 @@ module Eps
121
121
  def check_evaluator(objective, labels, booster, booster_set, evaluator, evaluator_set)
122
122
  expected = @booster.predict(booster_set.map_rows(&:to_a))
123
123
  if objective == "multiclass"
124
- expected.map! do |v|
125
- labels[v.map.with_index.max_by { |v2, _| v2 }.last]
126
- end
124
+ actual = evaluator.predict(evaluator_set, probabilities: true)
125
+ # just compare first for now
126
+ expected.map! { |v| v.first }
127
+ actual.map! { |v| v.values.first }
127
128
  elsif objective == "binary"
128
- expected.map! { |v| labels[v >= 0.5 ? 1 : 0] }
129
+ actual = evaluator.predict(evaluator_set, probabilities: true).map { |v| v.values.last }
130
+ else
131
+ actual = evaluator.predict(evaluator_set)
129
132
  end
130
- actual = evaluator.predict(evaluator_set)
131
133
 
132
- regression = objective == "regression"
134
+ regression = objective == "regression" || objective == "binary"
133
135
  bad_observations = []
134
136
  expected.zip(actual).each_with_index do |(exp, act), i|
135
- success = regression ? (act - exp).abs < 0.001 : act == exp
137
+ success = (act - exp).abs < 0.001
136
138
  unless success
137
139
  bad_observations << {expected: exp, actual: act, data_point: evaluator_set[i].map(&:itself).first}
138
140
  end
data/lib/eps/version.rb CHANGED
@@ -1,3 +1,3 @@
1
1
  module Eps
2
- VERSION = "0.3.3"
2
+ VERSION = "0.3.4"
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: eps
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.3
4
+ version: 0.3.4
5
5
  platform: ruby
6
6
  authors:
7
7
  - Andrew Kane
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2020-02-25 00:00:00.000000000 Z
11
+ date: 2020-04-05 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: lightgbm