eps 0.3.0 → 0.3.5

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 3ca27ba2379d1cbfb6f3407ace5ad9dd5fcb71b08e48b8805ddda6483c026194
-  data.tar.gz: 91bb0beb50664dda5c2a42684414b1972e2bff91c3a993926639939c91272ccd
+  metadata.gz: 1369016c3cae228f169fe580b54fca3c0d240cda202fa7d03ecc7a4e156ee8c7
+  data.tar.gz: bf83ca424c509798d1a1436806b52cba0cfdbefecb8d827d5b17aec7b807b121
 SHA512:
-  metadata.gz: 648d8098928d0ed952ad4cf2195b3e2562db5a38249357b76eb39c0aa17d8f8f974936c4773b2395ae1b1197aedb6e47c8fd018675496f3f966ee2feebb1ed2d
-  data.tar.gz: aa48887027114d9b654f3564715586a1740b742fe7778602d8db770b4921cff8acfbf90baea3ae6092d7c3962f37763c630857d71fbcd573402dfb016159f0c2
+  metadata.gz: 2bf47d80a301eb546c348aaa71f847fa22ace5bed63d97a1f19eb14bc15388b056cd3f545ccf251b2bbf2afc485ef81e5559849ff7459e9dd9f88a71c7cbf83a
+  data.tar.gz: 82d65d84e95a6518cd132c2a42cdec20afd05c0013192941b59ee0edb524874d12b2dd9082dd89be1422872c88e827e031469e43b80336c48c7eab7ff4fe611e
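The digests above cover the two members of the packaged gem: a `.gem` file is a plain tar archive containing `metadata.gz` and `data.tar.gz`, and `checksums.yaml` records one digest per member per algorithm. A sketch of reproducing such digests locally, using stand-in files rather than a real downloaded gem:

```shell
# Stand-in files; in a real .gem these are the gzipped metadata and data members.
printf 'example metadata' > metadata.gz
printf 'example data' > data.tar.gz

# One digest per member, per algorithm, as in checksums.yaml above
sha256sum metadata.gz data.tar.gz
sha512sum metadata.gz data.tar.gz
```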
data/CHANGELOG.md CHANGED
@@ -1,4 +1,30 @@
-## 0.3.0
+## 0.3.5 (2020-06-10)
+
+- Added `learning_rate` option for LightGBM
+- Added support for Numo and Rover
+
+## 0.3.4 (2020-04-05)
+
+- Added `predict_probability` for classification
+
+## 0.3.3 (2020-02-24)
+
+- Fixed errors and incorrect predictions with boolean columns
+- Fixed deprecation warnings in Ruby 2.7
+
+## 0.3.2 (2019-12-08)
+
+- Added support for GSLR
+
+## 0.3.1 (2019-12-06)
+
+- Added `weight` option for LightGBM and linear regression
+- Added `intercept` option for linear regression
+- Added LightGBM evaluator safety check
+- Fixed `Unknown label` error for LightGBM
+- Fixed error message for unstable solutions with linear regression
+
+## 0.3.0 (2019-09-05)
 
 - Added support for LightGBM
 - Added text features
@@ -12,22 +38,22 @@ Breaking
 - Removed support for JSON and PFA formats
 - Added smoothing to naive Bayes
 
-## 0.2.1
+## 0.2.1 (2019-05-19)
 
 - Fixed error with `summary`
 - Fixed error with `predict` in `Eps::Base`
 - Fixed error with loaded classification models
 
-## 0.2.0
+## 0.2.0 (2019-05-19)
 
 - Added support for classification
 - Added `to_pmml` method
 - Added `Eps::Base`
 
-## 0.1.1
+## 0.1.1 (2018-07-05)
 
 - Huge performance boost
 
-## 0.1.0
+## 0.1.0 (2018-07-03)
 
 - First release
data/README.md CHANGED
@@ -4,7 +4,6 @@ Machine learning for Ruby
 
 - Build predictive models quickly and easily
 - Serve models built in Ruby, Python, R, and more
-- No prior knowledge of machine learning required :tada:
 
 Check out [this post](https://ankane.org/rails-meet-data-science) for more info on machine learning with Rails
 
@@ -314,7 +313,7 @@ y = [1, 2, 3]
 Eps::Model.new(x, y)
 ```
 
-Or pass arrays of arrays
+Data can be an array of arrays
 
 ```ruby
 x = [[1, 2], [2, 0], [3, 1]]
@@ -322,9 +321,22 @@ y = [1, 2, 3]
 Eps::Model.new(x, y)
 ```
 
-### Daru
+Or Numo arrays
 
-Eps works well with Daru data frames.
+```ruby
+x = Numo::NArray.cast([[1, 2], [2, 0], [3, 1]])
+y = Numo::NArray.cast([1, 2, 3])
+Eps::Model.new(x, y)
+```
+
+Or a Rover data frame
+
+```ruby
+df = Rover.read_csv("houses.csv")
+Eps::Model.new(df, target: "price")
+```
+
+Or a Daru data frame
 
 ```ruby
 df = Daru::DataFrame.from_csv("houses.csv")
@@ -353,9 +365,19 @@ Eps supports:
 - Linear Regression
 - Naive Bayes
 
+### LightGBM
+
+Pass the learning rate with:
+
+```ruby
+Eps::Model.new(data, learning_rate: 0.01)
+```
+
 ### Linear Regression
 
-To speed up training on large datasets with linear regression, [install GSL](https://www.gnu.org/software/gsl/). With Homebrew, you can use:
+#### Performance
+
+To speed up training on large datasets with linear regression, [install GSL](https://github.com/ankane/gslr#gsl-installation). With Homebrew, you can use:
 
 ```sh
 brew install gsl
@@ -364,11 +386,29 @@ brew install gsl
 
 Then, add this line to your application’s Gemfile:
 
 ```ruby
-gem 'gsl', group: :development
+gem 'gslr', group: :development
 ```
 
 It only needs to be available in environments used to build the model.
 
+#### Options
+
+By default, an intercept is included. Disable this with:
+
+```ruby
+Eps::Model.new(data, intercept: false)
+```
+
+## Probability
+
+To get the probability of each category for predictions with classification, use:
+
+```ruby
+model.predict_probability(data)
+```
+
+Naive Bayes is known to produce poor probability estimates, so stick with LightGBM if you need this.
+
 ## Validation Options
 
 Pass your own validation set with:
@@ -389,6 +429,12 @@ Specify the validation set size (the default is `0.25`, which is 25%)
 Eps::Model.new(data, split: {validation_size: 0.2})
 ```
 
+Disable the validation set completely with:
+
+```ruby
+Eps::Model.new(data, split: false)
+```
+
 ## Database Storage
 
 The database is another place you can store models. It’s good if you retrain models automatically.
@@ -398,7 +444,7 @@ The database is another place you can store models. It’s good if you retrain m
 Create an ActiveRecord model to store the predictive model.
 
 ```sh
-rails g model Model key:string:uniq data:text
+rails generate model Model key:string:uniq data:text
 ```
 
 Store the model with:
@@ -419,6 +465,28 @@ model = Eps::Model.load_pmml(data)
 
 You can use [IRuby](https://github.com/SciRuby/iruby) to run Eps in [Jupyter](https://jupyter.org/) notebooks. Here’s how to get [IRuby working with Rails](https://ankane.org/jupyter-rails).
 
+## Weights
+
+Specify a weight for each data point
+
+```ruby
+Eps::Model.new(data, weight: :weight)
+```
+
+You can also pass an array
+
+```ruby
+Eps::Model.new(data, weight: [1, 2, 3])
+```
+
+Weights are supported for metrics as well
+
+```ruby
+Eps.metrics(actual, predicted, weight: weight)
+```
+
+Reweighing is one method to [mitigate bias](http://aif360.mybluemix.net/) in training data
+
 ## Upgrading
 
 ## 0.3.0
@@ -486,11 +554,11 @@ Everyone is encouraged to help improve this project. Here are a few ways you can
 - Write, clarify, or fix documentation
 - Suggest or add new features
 
-To get started with development and testing:
+To get started with development:
 
 ```sh
 git clone https://github.com/ankane/eps.git
 cd eps
 bundle install
-rake test
+bundle exec rake test
 ```
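The split options documented in the README above hold out a fraction of rows for validation (default 25%; disabled with `split: false`). A minimal sketch of such a hold-out split, assuming a simple random shuffle; `train_validation_split` is a hypothetical helper, not part of Eps (the gem's actual splitter also supports time- and column-based splits):

```ruby
# Hold out a fraction of rows for validation (illustrative only).
def train_validation_split(rows, validation_size: 0.25, seed: 42)
  shuffled = rows.shuffle(random: Random.new(seed))
  n_valid = (rows.size * validation_size).round
  [shuffled.drop(n_valid), shuffled.take(n_valid)]
end

train, valid = train_validation_split((1..8).to_a)
puts "train=#{train.size} valid=#{valid.size}" # prints train=6 valid=2
```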
data/lib/eps.rb CHANGED
@@ -1,5 +1,4 @@
 # dependencies
-require "bigdecimal"
 require "json"
 require "lightgbm"
 require "matrix"
@@ -9,10 +8,6 @@ require "nokogiri"
 require "eps/base"
 require "eps/base_estimator"
 require "eps/data_frame"
-require "eps/evaluators/linear_regression"
-require "eps/evaluators/lightgbm"
-require "eps/evaluators/naive_bayes"
-require "eps/evaluators/node"
 require "eps/label_encoder"
 require "eps/lightgbm"
 require "eps/linear_regression"
@@ -24,17 +19,31 @@ require "eps/text_encoder"
 require "eps/utils"
 require "eps/version"
 
+# pmml
+require "eps/pmml"
+require "eps/pmml/generator"
+require "eps/pmml/loader"
+
+# evaluators
+require "eps/evaluators/linear_regression"
+require "eps/evaluators/lightgbm"
+require "eps/evaluators/naive_bayes"
+require "eps/evaluators/node"
+
 module Eps
-  def self.metrics(y_true, y_pred)
+  class Error < StandardError; end
+  class UnstableSolution < Error; end
+
+  def self.metrics(y_true, y_pred, weight: nil)
     if Utils.column_type(y_true, "actual") == "numeric"
       {
-        rmse: Metrics.rmse(y_true, y_pred),
-        mae: Metrics.mae(y_true, y_pred),
-        me: Metrics.me(y_true, y_pred)
+        rmse: Metrics.rmse(y_true, y_pred, weight: weight),
+        mae: Metrics.mae(y_true, y_pred, weight: weight),
+        me: Metrics.me(y_true, y_pred, weight: weight)
       }
     else
       {
-        accuracy: Metrics.accuracy(y_true, y_pred)
+        accuracy: Metrics.accuracy(y_true, y_pred, weight: weight)
       }
     end
   end
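The `weight:` keyword threaded through `Eps.metrics` above makes each point contribute to the error in proportion to its weight. A plain-Ruby sketch of the standard weighted formulas (an illustration of the idea, not necessarily the gem's exact `Metrics` implementation; `weighted_rmse`/`weighted_mae` are hypothetical names):

```ruby
# Weighted error metrics: each residual counts in proportion to its weight.
# With weight: nil, every point counts equally.
def weighted_rmse(y_true, y_pred, weight: nil)
  weight ||= [1] * y_true.size
  sq_err = y_true.zip(y_pred, weight).sum { |t, p, w| w * (t - p)**2 }
  Math.sqrt(sq_err / weight.sum.to_f)
end

def weighted_mae(y_true, y_pred, weight: nil)
  weight ||= [1] * y_true.size
  abs_err = y_true.zip(y_pred, weight).sum { |t, p, w| w * (t - p).abs }
  abs_err / weight.sum.to_f
end

p weighted_rmse([0, 0], [2, 0], weight: [1, 3]) # => 1.0
```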
data/lib/eps/base_estimator.rb CHANGED
@@ -1,53 +1,39 @@
 module Eps
   class BaseEstimator
     def initialize(data = nil, y = nil, **options)
+      @options = options.dup
+      @trained = false
+      # TODO better pattern - don't pass most options to train
       train(data, y, **options) if data
     end
 
     def predict(data)
-      singular = data.is_a?(Hash)
-      data = [data] if singular
-
-      data = Eps::DataFrame.new(data)
-
-      @evaluator.features.each do |k, type|
-        values = data.columns[k]
-        raise ArgumentError, "Missing column: #{k}" if !values
-        column_type = Utils.column_type(values.compact, k) if values
-
-        if !column_type.nil?
-          if (type == "numeric" && column_type != "numeric") || (type != "numeric" && column_type != "categorical")
-            raise ArgumentError, "Bad type for column #{k}: Expected #{type} but got #{column_type}"
-          end
-        end
-        # TODO check for unknown values for categorical features
-      end
-
-      predictions = @evaluator.predict(data)
+      _predict(data, false)
+    end
 
-      singular ? predictions.first : predictions
+    def predict_probability(data)
+      _predict(data, true)
     end
 
-    def evaluate(data, y = nil, target: nil)
-      data, target = prep_data(data, y, target || @target)
-      Eps.metrics(data.label, predict(data))
+    def evaluate(data, y = nil, target: nil, weight: nil)
+      data, target = prep_data(data, y, target || @target, weight)
+      Eps.metrics(data.label, predict(data), weight: data.weight)
     end
 
     def to_pmml
-      (@pmml ||= generate_pmml).to_xml
+      @pmml ||= PMML.generate(self)
     end
 
-    def self.load_pmml(data)
-      if data.is_a?(String)
-        data = Nokogiri::XML(data) { |config| config.strict }
-      end
+    def self.load_pmml(pmml)
       model = new
-      model.instance_variable_set("@pmml", data) # cache data
-      model.instance_variable_set("@evaluator", yield(data))
+      model.instance_variable_set("@evaluator", PMML.load(pmml))
+      model.instance_variable_set("@pmml", pmml.respond_to?(:to_xml) ? pmml.to_xml : pmml) # cache data
       model
     end
 
     def summary(extended: false)
+      raise "Summary not available for loaded models" unless @trained
+
       str = String.new("")
 
       if @validation_set
@@ -57,11 +43,11 @@ module Eps
       case @target_type
       when "numeric"
         metric_name = "RMSE"
-        v = Metrics.rmse(y_true, y_pred)
+        v = Metrics.rmse(y_true, y_pred, weight: @validation_set.weight)
         metric_value = v.round >= 1000 ? v.round.to_s : "%.3g" % v
       else
         metric_name = "accuracy"
-        metric_value = "%.1f%%" % (100 * Metrics.accuracy(y_true, y_pred)).round(1)
+        metric_value = "%.1f%%" % (100 * Metrics.accuracy(y_true, y_pred, weight: @validation_set.weight)).round(1)
       end
       str << "Validation %s: %s\n\n" % [metric_name, metric_value]
     end
@@ -70,50 +56,34 @@ module Eps
       str
     end
 
-    # private
-    def self.extract_text_features(data, features)
-      # updates features object
-      vocabulary = {}
-      function_mapping = {}
-      derived_fields = {}
-      data.css("LocalTransformations DerivedField, TransformationDictionary DerivedField").each do |n|
-        name = n.attribute("name")&.value
-        field = n.css("FieldRef").attribute("field").value
-        value = n.css("Constant").text
-
-        field = field[10..-2] if field =~ /\Alowercase\(.+\)\z/
-        next if value.empty?
+    private
 
-        (vocabulary[field] ||= []) << value
+    def _predict(data, probabilities)
+      singular = data.is_a?(Hash)
+      data = [data] if singular
 
-        function_mapping[field] = n.css("Apply").attribute("function").value
+      data = Eps::DataFrame.new(data)
 
-        derived_fields[name] = [field, value]
-      end
+      @evaluator.features.each do |k, type|
+        values = data.columns[k]
+        raise ArgumentError, "Missing column: #{k}" if !values
+        column_type = Utils.column_type(values.compact, k) if values
 
-      functions = {}
-      data.css("TransformationDictionary DefineFunction").each do |n|
-        name = n.attribute("name").value
-        text_index = n.css("TextIndex")
-        functions[name] = {
-          tokenizer: Regexp.new(text_index.attribute("wordSeparatorCharacterRE").value),
-          case_sensitive: text_index.attribute("isCaseSensitive")&.value == "true"
-        }
+        if !column_type.nil?
+          if (type == "numeric" && column_type != "numeric") || (type != "numeric" && column_type != "categorical")
+            raise ArgumentError, "Bad type for column #{k}: Expected #{type} but got #{column_type}"
+          end
+        end
+        # TODO check for unknown values for categorical features
       end
 
-      text_features = {}
-      function_mapping.each do |field, function|
-        text_features[field] = functions[function].merge(vocabulary: vocabulary[field])
-        features[field] = "text"
-      end
+      predictions = @evaluator.predict(data, probabilities: probabilities)
 
-      [text_features, derived_fields]
+      singular ? predictions.first : predictions
     end
 
-    private
-
-    def train(data, y = nil, target: nil, split: nil, validation_set: nil, verbose: nil, text_features: nil, early_stopping: nil)
-      data, @target = prep_data(data, y, target)
+    def train(data, y = nil, target: nil, weight: nil, split: nil, validation_set: nil, text_features: nil, **options)
+      data, @target = prep_data(data, y, target, weight)
       @target_type = Utils.column_type(data.label, @target)
 
       if split.nil?
@@ -121,6 +91,7 @@ module Eps
       end
 
       # cross validation
+      # TODO adjust based on weight
      if split && !validation_set
        split = {} if split == true
        split = {column: split} unless split.is_a?(Hash)
@@ -193,8 +164,9 @@ module Eps
       else
         @train_set = data.dup
         if validation_set
-          validation_set = Eps::DataFrame.new(validation_set)
-          validation_set.label = validation_set.columns.delete(@target)
+          raise "Target required for validation set" unless target
+          raise "Weight required for validation set" if data.weight && !weight
+          validation_set, _ = prep_data(validation_set, nil, @target, weight)
         end
       end
 
@@ -202,20 +174,37 @@ module Eps
       raise "No data in validation set" if validation_set && validation_set.empty?
 
       @validation_set = validation_set
-      @evaluator = _train(verbose: verbose, early_stopping: early_stopping)
+      @evaluator = _train(**options)
 
       # reset pmml
       @pmml = nil
 
+      @trained = true
+
       nil
     end
 
-    def prep_data(data, y, target)
+    def prep_data(data, y, target, weight)
       data = Eps::DataFrame.new(data)
+
+      # target
       target = (target || "target").to_s
       y ||= data.columns.delete(target)
       check_missing(y, target)
       data.label = y.to_a
+
+      # weight
+      if weight
+        weight =
+          if weight.respond_to?(:to_a)
+            weight.to_a
+          else
+            data.columns.delete(weight.to_s)
+          end
+        check_missing(weight, "weight")
+        data.weight = weight.to_a
+      end
+
       check_data(data)
       [data, target]
     end
@@ -228,7 +217,7 @@ module Eps
 
       # TODO determine max features automatically
       # start based on number of rows
-      encoder = Eps::TextEncoder.new(v)
+      encoder = Eps::TextEncoder.new(**v)
       counts = encoder.fit(train_set.columns.delete(k))
       encoder.vocabulary.each do |word|
         train_set.columns[[k, word]] = [0] * counts.size
@@ -251,11 +240,12 @@ module Eps
     def check_data(data)
       raise "No data" if data.empty?
       raise "Number of data points differs from target" if data.size != data.label.size
+      raise "Number of data points differs from weight" if data.weight && data.size != data.weight.size
     end
 
     def check_missing(c, name)
       raise ArgumentError, "Missing column: #{name}" if !c
-      raise ArgumentError, "Missing values in column #{name}" if c.any?(&:nil?)
+      raise ArgumentError, "Missing values in column #{name}" if c.to_a.any?(&:nil?)
     end
 
     def check_missing_value(df)
@@ -275,77 +265,5 @@ module Eps
       k
     end
-
-    # pmml
-
-    def build_pmml(data_fields)
-      Nokogiri::XML::Builder.new do |xml|
-        xml.PMML(version: "4.4", xmlns: "http://www.dmg.org/PMML-4_4", "xmlns:xsi" => "http://www.w3.org/2001/XMLSchema-instance") do
-          pmml_header(xml)
-          pmml_data_dictionary(xml, data_fields)
-          pmml_transformation_dictionary(xml)
-          yield xml
-        end
-      end
-    end
-
-    def pmml_header(xml)
-      xml.Header do
-        xml.Application(name: "Eps", version: Eps::VERSION)
-        # xml.Timestamp Time.now.utc.iso8601
-      end
-    end
-
-    def pmml_data_dictionary(xml, data_fields)
-      xml.DataDictionary do
-        data_fields.each do |k, vs|
-          case @features[k]
-          when "categorical", nil
-            xml.DataField(name: k, optype: "categorical", dataType: "string") do
-              vs.map(&:to_s).sort.each do |v|
-                xml.Value(value: v)
-              end
-            end
-          when "text"
-            xml.DataField(name: k, optype: "categorical", dataType: "string")
-          else
-            xml.DataField(name: k, optype: "continuous", dataType: "double")
-          end
-        end
-      end
-    end
-
-    def pmml_transformation_dictionary(xml)
-      if @text_features.any?
-        xml.TransformationDictionary do
-          @text_features.each do |k, text_options|
-            xml.DefineFunction(name: "#{k}Transform", optype: "continuous") do
-              xml.ParameterField(name: "text")
-              xml.ParameterField(name: "term")
-              xml.TextIndex(textField: "text", localTermWeights: "termFrequency", wordSeparatorCharacterRE: text_options[:tokenizer].source, isCaseSensitive: !!text_options[:case_sensitive]) do
-                xml.FieldRef(field: "term")
-              end
-            end
-          end
-        end
-      end
-    end
-
-    def pmml_local_transformations(xml)
-      if @text_features.any?
-        xml.LocalTransformations do
-          @text_features.each do |k, _|
-            @text_encoders[k].vocabulary.each do |v|
-              xml.DerivedField(name: display_field([k, v]), optype: "continuous", dataType: "integer") do
-                xml.Apply(function: "#{k}Transform") do
-                  xml.FieldRef(field: k)
-                  xml.Constant v
-                end
-              end
-            end
-          end
-        end
-      end
-    end
   end
 end
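The weight branch added to `prep_data` above accepts either an array-like object or a column name. A standalone mirror of that logic (`extract_weight` is a hypothetical helper for illustration; a plain Hash of column name => values stands in for `Eps::DataFrame`):

```ruby
# Mirrors the weight branch in prep_data: an array-like weight is used as-is;
# anything else is treated as a column name and removed from the feature columns.
def extract_weight(columns, weight)
  return nil unless weight

  weight =
    if weight.respond_to?(:to_a) # arrays, Numo vectors, etc.
      weight.to_a
    else                         # symbol or string column name
      columns.delete(weight.to_s)
    end
  raise ArgumentError, "Missing column: weight" if !weight
  raise ArgumentError, "Missing values in column weight" if weight.to_a.any?(&:nil?)
  weight.to_a
end

cols = { "weight" => [1, 2, 3], "x" => [4, 5, 6] }
p extract_weight(cols, :weight) # => [1, 2, 3]
p cols.keys                     # => ["x"]
```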