eps 0.3.3 → 0.3.4

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 8133bd3887423fb41421aa2a4270aa0c7fc75b741ea60a6c55fb97308f3ddea4
4
- data.tar.gz: '018283d9934459202f8395b4c3f4ba201894296e3cfa185bb884cc5b73981f0a'
3
+ metadata.gz: e1f8c1d5af8015ba568528bce2c928cf59c97bda5c4f91a6ace0af72a9a864da
4
+ data.tar.gz: 3cbb6dfca84687c833051147a0fcad16cd258dd09f48a93da61f051c2081f62c
5
5
  SHA512:
6
- metadata.gz: 2e1439f4a9a268a0434dc926a68822731db9267c746d4c76fa43a8debdbc49c25a502ff2051254fbf3453edb33141d35c02fa067afcddf2761e92b96e9d85751
7
- data.tar.gz: 0c87d327d5f8083349cc75ea6b6e725e15ad20ac2535dc37dbc6b7955e6eca4408db93140287e8a9bfa7cee9da11956eb018600851bb07072c9e4347978d89dc
6
+ metadata.gz: 83477bb53b14a04798ab85f2efc9d4bb3af3195bd100ca42d601d2dbb2bc7fa09e7886f414e5a1e3041128bd671e3d15e39e071fe8de9d8d19dacc4c7e702cfc
7
+ data.tar.gz: 1914eeb9509916e9b4eb530f56dab0bfa73683a9197e157c8030fc11c96efca983a0d0f257d2845d890c9772ccfb6550651fb9fba60d1671db914c51b3675fc8
data/CHANGELOG.md CHANGED
@@ -1,3 +1,7 @@
1
+ ## 0.3.4 (2020-04-05)
2
+
3
+ - Added `predict_probability` for classification
4
+
1
5
  ## 0.3.3 (2020-02-24)
2
6
 
3
7
  - Fixed errors and incorrect predictions with boolean columns
data/README.md CHANGED
@@ -4,7 +4,6 @@ Machine learning for Ruby
4
4
 
5
5
  - Build predictive models quickly and easily
6
6
  - Serve models built in Ruby, Python, R, and more
7
- - No prior knowledge of machine learning required :tada:
8
7
 
9
8
  Check out [this post](https://ankane.org/rails-meet-data-science) for more info on machine learning with Rails
10
9
 
@@ -379,6 +378,16 @@ By default, an intercept is included. Disable this with:
379
378
  Eps::Model.new(data, intercept: false)
380
379
  ```
381
380
 
381
+ ## Probability
382
+
383
+ To get the probability of each category for predictions with classification, use:
384
+
385
+ ```ruby
386
+ model.predict_probability(data)
387
+ ```
388
+
389
+ Naive Bayes is known to produce poor probability estimates, so stick with LightGBM if you need this.
390
+
382
391
  ## Validation Options
383
392
 
384
393
  Pass your own validation set with:
@@ -414,7 +423,7 @@ The database is another place you can store models. It’s good if you retrain m
414
423
  Create an ActiveRecord model to store the predictive model.
415
424
 
416
425
  ```sh
417
- rails g model Model key:string:uniq data:text
426
+ rails generate model Model key:string:uniq data:text
418
427
  ```
419
428
 
420
429
  Store the model with:
@@ -524,11 +533,11 @@ Everyone is encouraged to help improve this project. Here are a few ways you can
524
533
  - Write, clarify, or fix documentation
525
534
  - Suggest or add new features
526
535
 
527
- To get started with development and testing:
536
+ To get started with development:
528
537
 
529
538
  ```sh
530
539
  git clone https://github.com/ankane/eps.git
531
540
  cd eps
532
541
  bundle install
533
- rake test
542
+ bundle exec rake test
534
543
  ```
data/lib/eps/base_estimator.rb CHANGED
@@ -9,27 +9,11 @@ module Eps
9
9
  end
10
10
 
11
11
  def predict(data)
12
- singular = data.is_a?(Hash)
13
- data = [data] if singular
14
-
15
- data = Eps::DataFrame.new(data)
16
-
17
- @evaluator.features.each do |k, type|
18
- values = data.columns[k]
19
- raise ArgumentError, "Missing column: #{k}" if !values
20
- column_type = Utils.column_type(values.compact, k) if values
21
-
22
- if !column_type.nil?
23
- if (type == "numeric" && column_type != "numeric") || (type != "numeric" && column_type != "categorical")
24
- raise ArgumentError, "Bad type for column #{k}: Expected #{type} but got #{column_type}"
25
- end
26
- end
27
- # TODO check for unknown values for categorical features
28
- end
29
-
30
- predictions = @evaluator.predict(data)
12
+ _predict(data, false)
13
+ end
31
14
 
32
- singular ? predictions.first : predictions
15
+ def predict_probability(data)
16
+ _predict(data, true)
33
17
  end
34
18
 
35
19
  def evaluate(data, y = nil, target: nil, weight: nil)
@@ -75,6 +59,30 @@ module Eps
75
59
 
76
60
  private
77
61
 
62
+ def _predict(data, probabilities)
63
+ singular = data.is_a?(Hash)
64
+ data = [data] if singular
65
+
66
+ data = Eps::DataFrame.new(data)
67
+
68
+ @evaluator.features.each do |k, type|
69
+ values = data.columns[k]
70
+ raise ArgumentError, "Missing column: #{k}" if !values
71
+ column_type = Utils.column_type(values.compact, k) if values
72
+
73
+ if !column_type.nil?
74
+ if (type == "numeric" && column_type != "numeric") || (type != "numeric" && column_type != "categorical")
75
+ raise ArgumentError, "Bad type for column #{k}: Expected #{type} but got #{column_type}"
76
+ end
77
+ end
78
+ # TODO check for unknown values for categorical features
79
+ end
80
+
81
+ predictions = @evaluator.predict(data, probabilities: probabilities)
82
+
83
+ singular ? predictions.first : predictions
84
+ end
85
+
78
86
  def train(data, y = nil, target: nil, weight: nil, split: nil, validation_set: nil, verbose: nil, text_features: nil, early_stopping: nil)
79
87
  data, @target = prep_data(data, y, target, weight)
80
88
  @target_type = Utils.column_type(data.label, @target)
data/lib/eps/evaluators/lightgbm.rb CHANGED
@@ -11,7 +11,9 @@ module Eps
11
11
  @text_features = text_features
12
12
  end
13
13
 
14
- def predict(data)
14
+ def predict(data, probabilities: false)
15
+ raise "Probabilities not supported" if probabilities && @objective == "regression"
16
+
15
17
  rows = data.map(&:to_h)
16
18
 
17
19
  # sparse matrix
@@ -38,7 +40,12 @@ module Eps
38
40
  when "regression"
39
41
  sum_trees(rows, @trees)
40
42
  when "binary"
41
- sum_trees(rows, @trees).map { |s| @labels[sigmoid(s) > 0.5 ? 1 : 0] }
43
+ prob = sum_trees(rows, @trees).map { |s| sigmoid(s) }
44
+ if probabilities
45
+ prob.map { |v| @labels.zip([1 - v, v]).to_h }
46
+ else
47
+ prob.map { |v| @labels[v > 0.5 ? 1 : 0] }
48
+ end
42
49
  else
43
50
  tree_scores = []
44
51
  num_trees = @trees.size / @labels.size
@@ -47,8 +54,14 @@ module Eps
47
54
  end
48
55
  data.size.times.map do |i|
49
56
  v = tree_scores.map { |s| s[i] }
50
- idx = v.map.with_index.max_by { |v2, _| v2 }.last
51
- @labels[idx]
57
+ if probabilities
58
+ exp = v.map { |vi| Math.exp(vi) }
59
+ sum = exp.sum
60
+ @labels.zip(exp.map { |e| e / sum }).to_h
61
+ else
62
+ idx = v.map.with_index.max_by { |v2, _| v2 }.last
63
+ @labels[idx]
64
+ end
52
65
  end
53
66
  end
54
67
  end
@@ -109,7 +122,7 @@ module Eps
109
122
  end
110
123
 
111
124
  def sigmoid(x)
112
- 1.0 / (1 + Math::E**(-x))
125
+ 1.0 / (1 + Math.exp(-x))
113
126
  end
114
127
  end
115
128
  end
data/lib/eps/evaluators/linear_regression.rb CHANGED
@@ -9,7 +9,9 @@ module Eps
9
9
  @text_features = text_features || {}
10
10
  end
11
11
 
12
- def predict(x)
12
+ def predict(x, probabilities: false)
13
+ raise "Probabilities not supported" if probabilities
14
+
13
15
  intercept = @coefficients["_intercept"] || 0.0
14
16
  scores = [intercept] * x.size
15
17
 
data/lib/eps/evaluators/naive_bayes.rb CHANGED
@@ -10,14 +10,15 @@ module Eps
10
10
  @legacy = legacy
11
11
  end
12
12
 
13
- def predict(x)
13
+ def predict(x, probabilities: false)
14
14
  probs = calculate_class_probabilities(x)
15
15
  probs.map do |xp|
16
- # convert probabilities
17
- # not needed when just returning label
18
- # sum = xp.values.map { |v| Math.exp(v) }.sum.to_f
19
- # p xp.map { |k, v| [k, Math.exp(v) / sum] }.to_h
20
- xp.sort_by { |k, v| [-v, k] }[0][0]
16
+ if probabilities
17
+ sum = xp.values.map { |v| Math.exp(v) }.sum.to_f
18
+ xp.map { |k, v| [k, Math.exp(v) / sum] }.to_h
19
+ else
20
+ xp.sort_by { |k, v| [-v, k] }[0][0]
21
+ end
21
22
  end
22
23
  end
23
24
 
data/lib/eps/lightgbm.rb CHANGED
@@ -121,18 +121,20 @@ module Eps
121
121
  def check_evaluator(objective, labels, booster, booster_set, evaluator, evaluator_set)
122
122
  expected = @booster.predict(booster_set.map_rows(&:to_a))
123
123
  if objective == "multiclass"
124
- expected.map! do |v|
125
- labels[v.map.with_index.max_by { |v2, _| v2 }.last]
126
- end
124
+ actual = evaluator.predict(evaluator_set, probabilities: true)
125
+ # just compare first for now
126
+ expected.map! { |v| v.first }
127
+ actual.map! { |v| v.values.first }
127
128
  elsif objective == "binary"
128
- expected.map! { |v| labels[v >= 0.5 ? 1 : 0] }
129
+ actual = evaluator.predict(evaluator_set, probabilities: true).map { |v| v.values.last }
130
+ else
131
+ actual = evaluator.predict(evaluator_set)
129
132
  end
130
- actual = evaluator.predict(evaluator_set)
131
133
 
132
- regression = objective == "regression"
134
+ regression = objective == "regression" || objective == "binary"
133
135
  bad_observations = []
134
136
  expected.zip(actual).each_with_index do |(exp, act), i|
135
- success = regression ? (act - exp).abs < 0.001 : act == exp
137
+ success = (act - exp).abs < 0.001
136
138
  unless success
137
139
  bad_observations << {expected: exp, actual: act, data_point: evaluator_set[i].map(&:itself).first}
138
140
  end
data/lib/eps/version.rb CHANGED
@@ -1,3 +1,3 @@
1
1
  module Eps
2
- VERSION = "0.3.3"
2
+ VERSION = "0.3.4"
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: eps
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.3
4
+ version: 0.3.4
5
5
  platform: ruby
6
6
  authors:
7
7
  - Andrew Kane
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2020-02-25 00:00:00.000000000 Z
11
+ date: 2020-04-05 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: lightgbm