RubyGems - eps - Versions diffs - 0.3.0 → 0.3.5 - Mend

eps 0.3.0 → 0.3.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (21)

checksums.yaml +4 -4
data/CHANGELOG.md +31 -5
data/README.md +77 -9
data/lib/eps.rb +19 -10
data/lib/eps/base_estimator.rb +63 -145
data/lib/eps/data_frame.rb +19 -3
data/lib/eps/evaluators/lightgbm.rb +20 -7
data/lib/eps/evaluators/linear_regression.rb +7 -4
data/lib/eps/evaluators/naive_bayes.rb +9 -7
data/lib/eps/label_encoder.rb +7 -3
data/lib/eps/lightgbm.rb +43 -78
data/lib/eps/linear_regression.rb +53 -83
data/lib/eps/metrics.rb +24 -12
data/lib/eps/model.rb +6 -6
data/lib/eps/naive_bayes.rb +3 -140
data/lib/eps/pmml.rb +14 -0
data/lib/eps/pmml/generator.rb +422 -0
data/lib/eps/pmml/loader.rb +241 -0
data/lib/eps/version.rb +1 -1
metadata +36 -6
data/lib/eps/pmml_generators/lightgbm.rb +0 -187

data/lib/eps/data_frame.rb CHANGED

@@ -1,7 +1,7 @@
 module Eps
   class DataFrame
     attr_reader :columns
-    attr_accessor :label
+    attr_accessor :label, :weight
     def initialize(data = [])
       @columns = {}
@@ -10,7 +10,7 @@ module Eps
         data.columns.each do |k, v|
           @columns[k] = v
         end
-      elsif daru?(data)
+      elsif rover?(data) || daru?(data)
         data.to_h.each do |k, v|
           @columns[k.to_s] = v.to_a
         end
@@ -19,6 +19,8 @@ module Eps
           @columns[k.to_s] = v.to_a
         end
       else
+        data = data.to_a if numo?(data)
         if data.any?
           row = data[0]
@@ -78,6 +80,10 @@ module Eps
           rows = Range.new(rows.begin, size - 1)
         elsif rows.end < 0
           rows = Range.new(rows.begin, size + rows.end, rows.exclude_end?)
+        else
+          finish = rows.end
+          finish -= 1 if rows.exclude_end?
+          rows = Range.new(rows.begin, size - 1) if finish >= size - 1
         end
       end
@@ -115,6 +121,7 @@ module Eps
         df.columns[c] = columns[c].values_at(*rows)
       end
       df.label = label.values_at(*rows) if label
+      df.weight = weight.values_at(*rows) if weight
       singular ? df.columns[cols[0]] : df
     end
@@ -129,13 +136,22 @@ module Eps
         df.columns[k] = v
       end
       df.label = label
+      df.weight = weight
       df
     end
     private
+    def numo?(x)
+      defined?(Numo::NArray) && x.is_a?(Numo::NArray)
+    end
+    def rover?(x)
+      defined?(Rover::DataFrame) && x.is_a?(Rover::DataFrame)
+    end
     def daru?(x)
-      defined?(Daru) && x.is_a?(Daru::DataFrame)
+      defined?(Daru::DataFrame) && x.is_a?(Daru::DataFrame)
     end
   end
 end

data/lib/eps/evaluators/lightgbm.rb CHANGED

@@ -11,12 +11,14 @@ module Eps
         @text_features = text_features
       end
-      def predict(data)
+      def predict(data, probabilities: false)
+        raise "Probabilities not supported" if probabilities && @objective == "regression"
         rows = data.map(&:to_h)
         # sparse matrix
         @text_features.each do |k, v|
-          encoder = TextEncoder.new(v)
+          encoder = TextEncoder.new(**v)
           values = data.columns.delete(k)
           counts = encoder.transform(values)
@@ -38,7 +40,12 @@ module Eps
         when "regression"
           sum_trees(rows, @trees)
         when "binary"
-          sum_trees(rows, @trees).map { |s| @labels[sigmoid(s) > 0.5 ? 1 : 0] }
+          prob = sum_trees(rows, @trees).map { |s| sigmoid(s) }
+          if probabilities
+            prob.map { |v| @labels.zip([1 - v, v]).to_h }
+          else
+            prob.map { |v| @labels[v > 0.5 ? 1 : 0] }
+          end
         else
           tree_scores = []
           num_trees = @trees.size / @labels.size
@@ -47,8 +54,14 @@ module Eps
           end
           data.size.times.map do |i|
             v = tree_scores.map { |s| s[i] }
-            idx = v.map.with_index.max_by { |v2, _| v2 }.last
-            @labels[idx]
+            if probabilities
+              exp = v.map { |vi| Math.exp(vi) }
+              sum = exp.sum
+              @labels.zip(exp.map { |e| e / sum }).to_h
+            else
+              idx = v.map.with_index.max_by { |v2, _| v2 }.last
+              @labels[idx]
+            end
           end
         end
       end
@@ -81,7 +94,7 @@ module Eps
           else
             case node.operator
             when "equal"
-              v == node.value
+              v.to_s == node.value
             when "in"
               node.value.include?(v)
             when "greaterThan"
@@ -109,7 +122,7 @@ module Eps
       end
       def sigmoid(x)
-        1.0 / (1 + Math::E**(-x))
+        1.0 / (1 + Math.exp(-x))
       end
     end
   end

data/lib/eps/evaluators/linear_regression.rb CHANGED

@@ -9,8 +9,10 @@ module Eps
         @text_features = text_features || {}
       end
-      def predict(x)
-        intercept = @coefficients["_intercept"]
+      def predict(x, probabilities: false)
+        raise "Probabilities not supported" if probabilities
+        intercept = @coefficients["_intercept"] || 0.0
         scores = [intercept] * x.size
         @features.each do |k, type|
@@ -19,10 +21,11 @@ module Eps
           case type
           when "categorical"
             x.columns[k].each_with_index do |xv, i|
-              scores[i] += @coefficients[[k, xv]].to_f
+              # TODO clean up
+              scores[i] += (@coefficients[[k, xv]] || @coefficients[[k, xv.to_s]]).to_f
             end
           when "text"
-            encoder = TextEncoder.new(@text_features[k])
+            encoder = TextEncoder.new(**@text_features[k])
             counts = encoder.transform(x.columns[k])
             coef = {}
             @coefficients.each do |k2, v|

data/lib/eps/evaluators/naive_bayes.rb CHANGED

@@ -10,14 +10,15 @@ module Eps
         @legacy = legacy
       end
-      def predict(x)
+      def predict(x, probabilities: false)
         probs = calculate_class_probabilities(x)
         probs.map do |xp|
-          # convert probabilities
-          # not needed when just returning label
-          # sum = xp.values.map { |v| Math.exp(v) }.sum.to_f
-          # p xp.map { |k, v| [k, Math.exp(v) / sum] }.to_h
-          xp.sort_by { |k, v| [-v, k] }[0][0]
+          if probabilities
+            sum = xp.values.map { |v| Math.exp(v) }.sum.to_f
+            xp.map { |k, v| [k, Math.exp(v) / sum] }.to_h
+          else
+            xp.sort_by { |k, v| [-v, k] }[0][0]
+          end
         end
       end
@@ -38,7 +39,8 @@ module Eps
             case type
             when "categorical"
               x.columns[k].each_with_index do |xi, i|
-                vc = probabilities[:conditional][k][xi]
+                # TODO clean this up
+                vc = probabilities[:conditional][k][xi] || probabilities[:conditional][k][xi.to_s]
                 # unknown value if not vc
                 if vc

data/lib/eps/label_encoder.rb CHANGED

@@ -24,9 +24,13 @@ module Eps
         if yi.nil?
           nil
         else
-          v = @labels[yi.to_s]
-          raise "Unknown label: #{yi}" unless v
-          v
+          # use an additional label for unseen values
+          # this is only used during training for the LightGBM eval_set
+          # LightGBM ignores them (only uses seen categories for predictions)
+          # https://github.com/microsoft/LightGBM/issues/1936
+          # the evaluator also ignores them (to be consistent with LightGBM)
+          # but doesn't use this code
+          @labels[yi.to_s] || @labels.size
         end
       end
     end

data/lib/eps/lightgbm.rb CHANGED

@@ -1,39 +1,5 @@
-require "eps/pmml_generators/lightgbm"
 module Eps
   class LightGBM < BaseEstimator
-    include PmmlGenerators::LightGBM
-    def self.load_pmml(data)
-      super do |data|
-        objective = data.css("MiningModel").first.attribute("functionName").value
-        if objective == "classification"
-          labels = data.css("RegressionModel OutputField").map { |n| n.attribute("value").value }
-          objective = labels.size > 2 ? "multiclass" : "binary"
-        end
-        features = {}
-        text_features, derived_fields = extract_text_features(data, features)
-        node = data.css("DataDictionary").first
-        node.css("DataField")[1..-1].to_a.each do |node|
-          features[node.attribute("name").value] =
-            if node.attribute("optype").value == "categorical"
-              "categorical"
-            else
-              "numeric"
-            end
-        end
-        trees = []
-        data.css("Segmentation TreeModel").each do |tree|
-          node = find_nodes(tree.css("Node").first, derived_fields)
-          trees << node
-        end
-        Evaluators::LightGBM.new(trees: trees, objective: objective, labels: labels, features: features, text_features: text_features)
-      end
-    end
     private
     def _summary(extended: false)
@@ -51,48 +17,16 @@ module Eps
       str
     end
-    def self.find_nodes(xml, derived_fields)
-      score = BigDecimal(xml.attribute("score").value).to_f
-      elements = xml.elements
-      xml_predicate = elements.first
-      predicate =
-        if xml_predicate.name == "True"
-          nil
-        elsif xml_predicate.name == "SimpleSetPredicate"
-          operator = "in"
-          value = xml_predicate.css("Array").text.scan(/"(.+?)(?<!\\)"|(\S+)/).flatten.compact.map { |v| v.gsub('\"', '"') }
-          field = xml_predicate.attribute("field").value
-          field = derived_fields[field] if derived_fields[field]
-          {
-            field: field,
-            operator: operator,
-            value: value
-          }
-        else
-          operator = xml_predicate.attribute("operator").value
-          value = xml_predicate.attribute("value").value
-          value = BigDecimal(value).to_f if operator == "greaterThan"
-          field = xml_predicate.attribute("field").value
-          field = derived_fields[field] if derived_fields[field]
-          {
-            field: field,
-            operator: operator,
-            value: value
-          }
-        end
-      children = elements[1..-1].map { |n| find_nodes(n, derived_fields) }
-      Evaluators::Node.new(score: score, predicate: predicate, children: children)
-    end
-    def _train(verbose: nil, early_stopping: nil)
+    def _train(verbose: nil, early_stopping: nil, learning_rate: 0.1)
       train_set = @train_set
       validation_set = @validation_set.dup
       summary_label = train_set.label
+      # create check set
+      evaluator_set = validation_set || train_set
+      check_idx = 100.times.map { rand(evaluator_set.size) }.uniq
+      evaluator_set = evaluator_set[check_idx]
       # objective
       objective =
         if @target_type == "numeric"
@@ -126,7 +60,10 @@ module Eps
       prep_text_features(validation_set) if validation_set
       # create params
-      params = {objective: objective}
+      params = {
+        objective: objective,
+        learning_rate: learning_rate
+      }
       params[:num_classes] = labels.size if objective == "multiclass"
       if train_set.size < 30
         params[:min_data_in_bin] = 1
@@ -135,8 +72,8 @@ module Eps
       # create datasets
       categorical_idx = @features.values.map.with_index.select { |type, _| type == "categorical" }.map(&:last)
-      train_ds = ::LightGBM::Dataset.new(train_set.map_rows(&:to_a), label: train_set.label, categorical_feature: categorical_idx, params: params)
-      validation_ds = ::LightGBM::Dataset.new(validation_set.map_rows(&:to_a), label: validation_set.label, categorical_feature: categorical_idx, params: params, reference: train_ds) if validation_set
+      train_ds = ::LightGBM::Dataset.new(train_set.map_rows(&:to_a), label: train_set.label, weight: train_set.weight, categorical_feature: categorical_idx, params: params)
+      validation_ds = ::LightGBM::Dataset.new(validation_set.map_rows(&:to_a), label: validation_set.label, weight: validation_set.weight, categorical_feature: categorical_idx, params: params, reference: train_ds) if validation_set
       # train
       valid_sets = [train_ds]
@@ -176,11 +113,39 @@ module Eps
       # reset pmml
       @pmml = nil
-      Evaluators::LightGBM.new(trees: trees, objective: objective, labels: labels, features: @features, text_features: @text_features)
+      evaluator = Evaluators::LightGBM.new(trees: trees, objective: objective, labels: labels, features: @features, text_features: @text_features)
+      booster_set = validation_set ? validation_set[check_idx] : train_set[check_idx]
+      check_evaluator(objective, labels, booster, booster_set, evaluator, evaluator_set)
+      evaluator
     end
-    def evaluator_class
-      PmmlLoaders::LightGBM
+    # compare a subset of predictions to check for possible bugs in evaluator
+    # NOTE LightGBM must use double data type for prediction input for these to be consistent
+    def check_evaluator(objective, labels, booster, booster_set, evaluator, evaluator_set)
+      expected = @booster.predict(booster_set.map_rows(&:to_a))
+      if objective == "multiclass"
+        actual = evaluator.predict(evaluator_set, probabilities: true)
+        # just compare first for now
+        expected.map! { |v| v.first }
+        actual.map! { |v| v.values.first }
+      elsif objective == "binary"
+        actual = evaluator.predict(evaluator_set, probabilities: true).map { |v| v.values.last }
+      else
+        actual = evaluator.predict(evaluator_set)
+      end
+      regression = objective == "regression" || objective == "binary"
+      bad_observations = []
+      expected.zip(actual).each_with_index do |(exp, act), i|
+        success = (act - exp).abs < 0.001
+        unless success
+          bad_observations << {expected: exp, actual: act, data_point: evaluator_set[i].map(&:itself).first}
+        end
+      end
+      if bad_observations.any?
+        raise "Bug detected in evaluator. Please report an issue. Bad data points: #{bad_observations.inspect}"
+      end
     end
     # for evaluator

data/lib/eps/linear_regression.rb CHANGED

@@ -1,40 +1,5 @@
 module Eps
   class LinearRegression < BaseEstimator
-    # pmml
-    def self.load_pmml(data)
-      super do |data|
-        # TODO more validation
-        node = data.css("RegressionTable")
-        coefficients = {
-          "_intercept" => node.attribute("intercept").value.to_f
-        }
-        features = {}
-        text_features, derived_fields = extract_text_features(data, features)
-        node.css("NumericPredictor").each do |n|
-          name = n.attribute("name").value
-          if derived_fields[name]
-            name = derived_fields[name]
-          else
-            features[name] = "numeric"
-          end
-          coefficients[name] = n.attribute("coefficient").value.to_f
-        end
-        node.css("CategoricalPredictor").each do |n|
-          name = n.attribute("name").value
-          coefficients[[name, n.attribute("value").value]] = n.attribute("coefficient").value.to_f
-          features[name] = "categorical"
-        end
-        Evaluators::LinearRegression.new(coefficients: coefficients, features: features, text_features: text_features)
-      end
-    end
     def coefficients
       @evaluator.coefficients
     end
@@ -72,6 +37,7 @@ module Eps
       str
     end
+    # TODO use keyword arguments for gsl and intercept in 0.4.0
     def _train(**options)
       raise "Target must be numeric" if @target_type != "numeric"
       check_missing_value(@train_set)
@@ -84,33 +50,64 @@ module Eps
       end
       x = data.map_rows(&:to_a)
-      data.size.times do |i|
-        # add intercept
-        x[i].unshift(1)
-      end
-      gsl = options.key?(:gsl) ? options[:gsl] : defined?(GSL)
+      gsl =
+        if options.key?(:gsl)
+          options[:gsl]
+        elsif defined?(GSL)
+          true
+        elsif defined?(GSLR)
+          :gslr
+        else
+          false
+        end
+      intercept = options.key?(:intercept) ? options[:intercept] : true
+      if intercept && gsl != :gslr
+        data.size.times do |i|
+          x[i].unshift(1)
+        end
+      end
       v3 =
-        if gsl
+        if gsl == :gslr
+          model = GSLR::OLS.new(intercept: intercept)
+          model.fit(x, data.label, weight: data.weight)
+          @covariance = model.covariance
+          coefficients = model.coefficients.dup
+          coefficients.unshift(model.intercept) if intercept
+          coefficients
+        elsif gsl
           x = GSL::Matrix.alloc(*x)
           y = GSL::Vector.alloc(data.label)
-          c, @covariance, _, _ = GSL::MultiFit::linear(x, y)
+          w = GSL::Vector.alloc(data.weight) if data.weight
+          c, @covariance, _, _ = w ? GSL::MultiFit.wlinear(x, w, y) : GSL::MultiFit.linear(x, y)
           c.to_a
         else
           x = Matrix.rows(x)
           y = Matrix.column_vector(data.label)
+          # weighted OLS
+          # http://www.real-statistics.com/multiple-regression/weighted-linear-regression/weighted-regression-basics/
+          w = Matrix.diagonal(*data.weight) if data.weight
           removed = []
           # https://statsmaths.github.io/stat612/lectures/lec13/lecture13.pdf
-          # unforutnately, this method is unstable
+          # unfortunately, this method is unstable
           # haven't found an efficient way to do QR-factorization in Ruby
           # the extendmatrix gem has householder and givens (givens has bug)
           # but methods are too slow
           xt = x.t
+          xt *= w if w
           begin
             @xtxi = (xt * x).inverse
           rescue ExceptionForMatrix::ErrNotRegular
+            # matrix cannot be inverted
+            # https://en.wikipedia.org/wiki/Multicollinearity
             constant = {}
             (1...x.column_count).each do |i|
               constant[i] = constant?(x.column(i))
@@ -134,6 +131,7 @@ module Eps
             end
             x = Matrix.columns(vectors)
             xt = x.t
+            xt *= w if w
             # try again
             begin
@@ -144,6 +142,7 @@ module Eps
           end
           # huge performance boost
           # by multiplying xt * y first
+          # for weighted, w is already included in wt
           v2 = @xtxi * (xt * y)
           # convert to array
@@ -158,47 +157,14 @@ module Eps
           v2
         end
-      @coefficient_names = ["_intercept"] + data.columns.keys
-      @coefficients = Hash[@coefficient_names.zip(v3)]
-      Evaluators::LinearRegression.new(coefficients: @coefficients, features: @features, text_features: @text_features)
-    end
-    def generate_pmml
-      predictors = @coefficients.dup
-      predictors.delete("_intercept")
-      data_fields = {}
-      @features.each do |k, type|
-        if type == "categorical"
-          data_fields[k] = predictors.keys.select { |k, v| k.is_a?(Array) && k.first == k }.map(&:last)
-        else
-          data_fields[k] = nil
-        end
+      if @xtxi && @xtxi.each(:diagonal).any? { |v| v < 0 }
+        raise UnstableSolution, "GSL is needed to find a stable solution for this dataset"
       end
-      build_pmml(data_fields) do |xml|
-        xml.RegressionModel(functionName: "regression") do
-          xml.MiningSchema do
-            @features.each do |k, _|
-              xml.MiningField(name: k)
-            end
-          end
-          pmml_local_transformations(xml)
-          xml.RegressionTable(intercept: @coefficients["_intercept"]) do
-            predictors.each do |k, v|
-              if k.is_a?(Array)
-                if @features[k.first] == "text"
-                  xml.NumericPredictor(name: display_field(k), coefficient: v)
-                else
-                  xml.CategoricalPredictor(name: k[0], value: k[1], coefficient: v)
-                end
-              else
-                xml.NumericPredictor(name: k, coefficient: v)
-              end
-            end
-          end
-        end
-      end
+      @coefficient_names = data.columns.keys
+      @coefficient_names.unshift("_intercept") if intercept
+      @coefficients = Hash[@coefficient_names.zip(v3)]
+      Evaluators::LinearRegression.new(coefficients: @coefficients, features: @features, text_features: @text_features)
     end
     def prep_x(x)
@@ -249,7 +215,11 @@ module Eps
     def diagonal
       @diagonal ||= begin
-        if covariance.respond_to?(:each)
+        if covariance.is_a?(Array)
+          covariance.size.times.map do |i|
+            covariance[i][i]
+          end
+        elsif covariance.respond_to?(:each)
           d = covariance.each(:diagonal).to_a
           @removed.each do |i|
             d.insert(i, 0)