RubyGems - eps - Versions diffs - 0.3.3 → 0.3.4 - Mend

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (10)

checksums.yaml +4 -4
data/CHANGELOG.md +4 -0
data/README.md +13 -4
data/lib/eps/base_estimator.rb +28 -20
data/lib/eps/evaluators/lightgbm.rb +18 -5
data/lib/eps/evaluators/linear_regression.rb +3 -1
data/lib/eps/evaluators/naive_bayes.rb +7 -6
data/lib/eps/lightgbm.rb +9 -7
data/lib/eps/version.rb +1 -1
metadata +2 -2

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 8133bd3887423fb41421aa2a4270aa0c7fc75b741ea60a6c55fb97308f3ddea4
-  data.tar.gz: '018283d9934459202f8395b4c3f4ba201894296e3cfa185bb884cc5b73981f0a'
+  metadata.gz: e1f8c1d5af8015ba568528bce2c928cf59c97bda5c4f91a6ace0af72a9a864da
+  data.tar.gz: 3cbb6dfca84687c833051147a0fcad16cd258dd09f48a93da61f051c2081f62c
 SHA512:
-  metadata.gz: 2e1439f4a9a268a0434dc926a68822731db9267c746d4c76fa43a8debdbc49c25a502ff2051254fbf3453edb33141d35c02fa067afcddf2761e92b96e9d85751
-  data.tar.gz: 0c87d327d5f8083349cc75ea6b6e725e15ad20ac2535dc37dbc6b7955e6eca4408db93140287e8a9bfa7cee9da11956eb018600851bb07072c9e4347978d89dc
+  metadata.gz: 83477bb53b14a04798ab85f2efc9d4bb3af3195bd100ca42d601d2dbb2bc7fa09e7886f414e5a1e3041128bd671e3d15e39e071fe8de9d8d19dacc4c7e702cfc
+  data.tar.gz: 1914eeb9509916e9b4eb530f56dab0bfa73683a9197e157c8030fc11c96efca983a0d0f257d2845d890c9772ccfb6550651fb9fba60d1671db914c51b3675fc8

data/CHANGELOG.md CHANGED Viewed

@@ -1,3 +1,7 @@
+## 0.3.4 (2020-04-05)
+- Added `predict_probability` for classification
 ## 0.3.3 (2020-02-24)
 - Fixed errors and incorrect predictions with boolean columns

data/README.md CHANGED Viewed

@@ -4,7 +4,6 @@ Machine learning for Ruby
 - Build predictive models quickly and easily
 - Serve models built in Ruby, Python, R, and more
-- No prior knowledge of machine learning required :tada:
 Check out [this post](https://ankane.org/rails-meet-data-science) for more info on machine learning with Rails
@@ -379,6 +378,16 @@ By default, an intercept is included. Disable this with:
 Eps::Model.new(data, intercept: false)
 ```
+## Probability
+To get the probability of each category for predictions with classification, use:
+```ruby
+model.predict_probability(data)
+```
+Naive Bayes is known to produce poor probability estimates, so stick with LightGBM if you need this.
 ## Validation Options
 Pass your own validation set with:
@@ -414,7 +423,7 @@ The database is another place you can store models. It’s good if you retrain m
 Create an ActiveRecord model to store the predictive model.
 ```sh
-rails g model Model key:string:uniq data:text
+rails generate model Model key:string:uniq data:text
 ```
 Store the model with:
@@ -524,11 +533,11 @@ Everyone is encouraged to help improve this project. Here are a few ways you can
 - Write, clarify, or fix documentation
 - Suggest or add new features
-To get started with development and testing:
+To get started with development:
 ```sh
 git clone https://github.com/ankane/eps.git
 cd eps
 bundle install
-rake test
+bundle exec rake test
 ```

data/lib/eps/base_estimator.rb CHANGED Viewed

@@ -9,27 +9,11 @@ module Eps
     end
     def predict(data)
-      singular = data.is_a?(Hash)
-      data = [data] if singular
-      data = Eps::DataFrame.new(data)
-      @evaluator.features.each do |k, type|
-        values = data.columns[k]
-        raise ArgumentError, "Missing column: #{k}" if !values
-        column_type = Utils.column_type(values.compact, k) if values
-        if !column_type.nil?
-          if (type == "numeric" && column_type != "numeric") || (type != "numeric" && column_type != "categorical")
-            raise ArgumentError, "Bad type for column #{k}: Expected #{type} but got #{column_type}"
-          end
-        end
-        # TODO check for unknown values for categorical features
-      end
-      predictions = @evaluator.predict(data)
+      _predict(data, false)
+    end
-      singular ? predictions.first : predictions
+    def predict_probability(data)
+      _predict(data, true)
     end
     def evaluate(data, y = nil, target: nil, weight: nil)
@@ -75,6 +59,30 @@ module Eps
     private
+    def _predict(data, probabilities)
+      singular = data.is_a?(Hash)
+      data = [data] if singular
+      data = Eps::DataFrame.new(data)
+      @evaluator.features.each do |k, type|
+        values = data.columns[k]
+        raise ArgumentError, "Missing column: #{k}" if !values
+        column_type = Utils.column_type(values.compact, k) if values
+        if !column_type.nil?
+          if (type == "numeric" && column_type != "numeric") || (type != "numeric" && column_type != "categorical")
+            raise ArgumentError, "Bad type for column #{k}: Expected #{type} but got #{column_type}"
+          end
+        end
+        # TODO check for unknown values for categorical features
+      end
+      predictions = @evaluator.predict(data, probabilities: probabilities)
+      singular ? predictions.first : predictions
+    end
     def train(data, y = nil, target: nil, weight: nil, split: nil, validation_set: nil, verbose: nil, text_features: nil, early_stopping: nil)
       data, @target = prep_data(data, y, target, weight)
       @target_type = Utils.column_type(data.label, @target)

data/lib/eps/evaluators/lightgbm.rb CHANGED Viewed

@@ -11,7 +11,9 @@ module Eps
         @text_features = text_features
       end
-      def predict(data)
+      def predict(data, probabilities: false)
+        raise "Probabilities not supported" if probabilities && @objective == "regression"
         rows = data.map(&:to_h)
         # sparse matrix
@@ -38,7 +40,12 @@ module Eps
         when "regression"
           sum_trees(rows, @trees)
         when "binary"
-          sum_trees(rows, @trees).map { |s| @labels[sigmoid(s) > 0.5 ? 1 : 0] }
+          prob = sum_trees(rows, @trees).map { |s| sigmoid(s) }
+          if probabilities
+            prob.map { |v| @labels.zip([1 - v, v]).to_h }
+          else
+            prob.map { |v| @labels[v > 0.5 ? 1 : 0] }
+          end
         else
           tree_scores = []
           num_trees = @trees.size / @labels.size
@@ -47,8 +54,14 @@ module Eps
           end
           data.size.times.map do |i|
             v = tree_scores.map { |s| s[i] }
-            idx = v.map.with_index.max_by { |v2, _| v2 }.last
-            @labels[idx]
+            if probabilities
+              exp = v.map { |vi| Math.exp(vi) }
+              sum = exp.sum
+              @labels.zip(exp.map { |e| e / sum }).to_h
+            else
+              idx = v.map.with_index.max_by { |v2, _| v2 }.last
+              @labels[idx]
+            end
           end
         end
       end
@@ -109,7 +122,7 @@ module Eps
       end
       def sigmoid(x)
-        1.0 / (1 + Math::E**(-x))
+        1.0 / (1 + Math.exp(-x))
       end
     end
   end

data/lib/eps/evaluators/linear_regression.rb CHANGED Viewed

@@ -9,7 +9,9 @@ module Eps
         @text_features = text_features || {}
       end
-      def predict(x)
+      def predict(x, probabilities: false)
+        raise "Probabilities not supported" if probabilities
         intercept = @coefficients["_intercept"] || 0.0
         scores = [intercept] * x.size

data/lib/eps/evaluators/naive_bayes.rb CHANGED Viewed

@@ -10,14 +10,15 @@ module Eps
         @legacy = legacy
       end
-      def predict(x)
+      def predict(x, probabilities: false)
         probs = calculate_class_probabilities(x)
         probs.map do |xp|
-          # convert probabilities
-          # not needed when just returning label
-          # sum = xp.values.map { |v| Math.exp(v) }.sum.to_f
-          # p xp.map { |k, v| [k, Math.exp(v) / sum] }.to_h
-          xp.sort_by { |k, v| [-v, k] }[0][0]
+          if probabilities
+            sum = xp.values.map { |v| Math.exp(v) }.sum.to_f
+            xp.map { |k, v| [k, Math.exp(v) / sum] }.to_h
+          else
+            xp.sort_by { |k, v| [-v, k] }[0][0]
+          end
         end
       end

data/lib/eps/lightgbm.rb CHANGED Viewed

@@ -121,18 +121,20 @@ module Eps
     def check_evaluator(objective, labels, booster, booster_set, evaluator, evaluator_set)
       expected = @booster.predict(booster_set.map_rows(&:to_a))
       if objective == "multiclass"
-        expected.map! do |v|
-          labels[v.map.with_index.max_by { |v2, _| v2 }.last]
-        end
+        actual = evaluator.predict(evaluator_set, probabilities: true)
+        # just compare first for now
+        expected.map! { |v| v.first }
+        actual.map! { |v| v.values.first }
       elsif objective == "binary"
-        expected.map! { |v| labels[v >= 0.5 ? 1 : 0] }
+        actual = evaluator.predict(evaluator_set, probabilities: true).map { |v| v.values.last }
+      else
+        actual = evaluator.predict(evaluator_set)
       end
-      actual = evaluator.predict(evaluator_set)
-      regression = objective == "regression"
+      regression = objective == "regression" || objective == "binary"
       bad_observations = []
       expected.zip(actual).each_with_index do |(exp, act), i|
-        success = regression ? (act - exp).abs < 0.001 : act == exp
+        success = (act - exp).abs < 0.001
         unless success
           bad_observations << {expected: exp, actual: act, data_point: evaluator_set[i].map(&:itself).first}
         end

data/lib/eps/version.rb CHANGED Viewed

@@ -1,3 +1,3 @@
 module Eps
-  VERSION = "0.3.3"
+  VERSION = "0.3.4"
 end

metadata CHANGED Viewed

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: eps
 version: !ruby/object:Gem::Version
-  version: 0.3.3
+  version: 0.3.4
 platform: ruby
 authors:
 - Andrew Kane
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2020-02-25 00:00:00.000000000 Z
+date: 2020-04-05 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: lightgbm

eps 0.3.3 → 0.3.4