eps 0.3.0 → 0.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
lib/eps/lightgbm.rb
@@ -1,39 +1,5 @@
-require "eps/pmml_generators/lightgbm"
-
 module Eps
   class LightGBM < BaseEstimator
-    include PmmlGenerators::LightGBM
-
-    def self.load_pmml(data)
-      super do |data|
-        objective = data.css("MiningModel").first.attribute("functionName").value
-        if objective == "classification"
-          labels = data.css("RegressionModel OutputField").map { |n| n.attribute("value").value }
-          objective = labels.size > 2 ? "multiclass" : "binary"
-        end
-
-        features = {}
-        text_features, derived_fields = extract_text_features(data, features)
-        node = data.css("DataDictionary").first
-        node.css("DataField")[1..-1].to_a.each do |node|
-          features[node.attribute("name").value] =
-            if node.attribute("optype").value == "categorical"
-              "categorical"
-            else
-              "numeric"
-            end
-        end
-
-        trees = []
-        data.css("Segmentation TreeModel").each do |tree|
-          node = find_nodes(tree.css("Node").first, derived_fields)
-          trees << node
-        end
-
-        Evaluators::LightGBM.new(trees: trees, objective: objective, labels: labels, features: features, text_features: text_features)
-      end
-    end
-
     private
 
     def _summary(extended: false)
@@ -51,48 +17,16 @@ module Eps
       str
     end
 
-    def self.find_nodes(xml, derived_fields)
-      score = BigDecimal(xml.attribute("score").value).to_f
-
-      elements = xml.elements
-      xml_predicate = elements.first
-
-      predicate =
-        if xml_predicate.name == "True"
-          nil
-        elsif xml_predicate.name == "SimpleSetPredicate"
-          operator = "in"
-          value = xml_predicate.css("Array").text.scan(/"(.+?)(?<!\\)"|(\S+)/).flatten.compact.map { |v| v.gsub('\"', '"') }
-          field = xml_predicate.attribute("field").value
-          field = derived_fields[field] if derived_fields[field]
-          {
-            field: field,
-            operator: operator,
-            value: value
-          }
-        else
-          operator = xml_predicate.attribute("operator").value
-          value = xml_predicate.attribute("value").value
-          value = BigDecimal(value).to_f if operator == "greaterThan"
-          field = xml_predicate.attribute("field").value
-          field = derived_fields[field] if derived_fields[field]
-          {
-            field: field,
-            operator: operator,
-            value: value
-          }
-        end
-
-      children = elements[1..-1].map { |n| find_nodes(n, derived_fields) }
-
-      Evaluators::Node.new(score: score, predicate: predicate, children: children)
-    end
-
     def _train(verbose: nil, early_stopping: nil)
       train_set = @train_set
       validation_set = @validation_set.dup
       summary_label = train_set.label
 
+      # create check set
+      evaluator_set = validation_set || train_set
+      check_idx = 100.times.map { rand(evaluator_set.size) }.uniq
+      evaluator_set = evaluator_set[check_idx]
+
       # objective
       objective =
         if @target_type == "numeric"
@@ -135,8 +69,8 @@ module Eps
 
       # create datasets
       categorical_idx = @features.values.map.with_index.select { |type, _| type == "categorical" }.map(&:last)
-      train_ds = ::LightGBM::Dataset.new(train_set.map_rows(&:to_a), label: train_set.label, categorical_feature: categorical_idx, params: params)
-      validation_ds = ::LightGBM::Dataset.new(validation_set.map_rows(&:to_a), label: validation_set.label, categorical_feature: categorical_idx, params: params, reference: train_ds) if validation_set
+      train_ds = ::LightGBM::Dataset.new(train_set.map_rows(&:to_a), label: train_set.label, weight: train_set.weight, categorical_feature: categorical_idx, params: params)
+      validation_ds = ::LightGBM::Dataset.new(validation_set.map_rows(&:to_a), label: validation_set.label, weight: validation_set.weight, categorical_feature: categorical_idx, params: params, reference: train_ds) if validation_set
 
       # train
       valid_sets = [train_ds]
@@ -176,11 +110,37 @@ module Eps
       # reset pmml
       @pmml = nil
 
-      Evaluators::LightGBM.new(trees: trees, objective: objective, labels: labels, features: @features, text_features: @text_features)
+      evaluator = Evaluators::LightGBM.new(trees: trees, objective: objective, labels: labels, features: @features, text_features: @text_features)
+      booster_set = validation_set ? validation_set[check_idx] : train_set[check_idx]
+      check_evaluator(objective, labels, booster, booster_set, evaluator, evaluator_set)
+      evaluator
     end
 
-    def evaluator_class
-      PmmlLoaders::LightGBM
+    # compare a subset of predictions to check for possible bugs in evaluator
+    # NOTE LightGBM must use double data type for prediction input for these to be consistent
+    def check_evaluator(objective, labels, booster, booster_set, evaluator, evaluator_set)
+      expected = @booster.predict(booster_set.map_rows(&:to_a))
+      if objective == "multiclass"
+        expected.map! do |v|
+          labels[v.map.with_index.max_by { |v2, _| v2 }.last]
+        end
+      elsif objective == "binary"
+        expected.map! { |v| labels[v >= 0.5 ? 1 : 0] }
+      end
+      actual = evaluator.predict(evaluator_set)
+
+      regression = objective == "regression"
+      bad_observations = []
+      expected.zip(actual).each_with_index do |(exp, act), i|
+        success = regression ? (act - exp).abs < 0.001 : act == exp
+        unless success
+          bad_observations << {expected: exp, actual: act, data_point: evaluator_set[i].map(&:itself).first}
+        end
+      end
+
+      if bad_observations.any?
+        raise "Bug detected in evaluator. Please report an issue. Bad data points: #{bad_observations.inspect}"
+      end
     end
 
     # for evaluator
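Note on the LightGBM changes above: 0.3.1 threads sample weights through to `::LightGBM::Dataset` and adds a `check_evaluator` self-check that scores up to 100 randomly sampled rows with both the native booster and the pure-Ruby evaluator, raising if they disagree. A minimal standalone sketch of that comparison rule (hypothetical `expected`/`actual` arrays, not the gem's internals):

```ruby
# Regression predictions must agree within 0.001; classification labels must match exactly.
expected = [1.02, 2.50, 3.99]  # native booster predictions (hypothetical)
actual   = [1.02, 2.50, 4.80]  # pure-Ruby evaluator predictions (hypothetical)
regression = true

mismatches = (0...expected.size).select do |i|
  regression ? (actual[i] - expected[i]).abs >= 0.001 : actual[i] != expected[i]
end

raise "Bug detected in evaluator" if mismatches.any? # here index 2 trips the check
```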
lib/eps/linear_regression.rb
@@ -1,40 +1,5 @@
 module Eps
   class LinearRegression < BaseEstimator
-    # pmml
-
-    def self.load_pmml(data)
-      super do |data|
-        # TODO more validation
-        node = data.css("RegressionTable")
-
-        coefficients = {
-          "_intercept" => node.attribute("intercept").value.to_f
-        }
-
-        features = {}
-
-        text_features, derived_fields = extract_text_features(data, features)
-
-        node.css("NumericPredictor").each do |n|
-          name = n.attribute("name").value
-          if derived_fields[name]
-            name = derived_fields[name]
-          else
-            features[name] = "numeric"
-          end
-          coefficients[name] = n.attribute("coefficient").value.to_f
-        end
-
-        node.css("CategoricalPredictor").each do |n|
-          name = n.attribute("name").value
-          coefficients[[name, n.attribute("value").value]] = n.attribute("coefficient").value.to_f
-          features[name] = "categorical"
-        end
-
-        Evaluators::LinearRegression.new(coefficients: coefficients, features: features, text_features: text_features)
-      end
-    end
-
     def coefficients
       @evaluator.coefficients
     end
@@ -84,9 +49,12 @@ module Eps
       end
 
       x = data.map_rows(&:to_a)
-      data.size.times do |i|
-        # add intercept
-        x[i].unshift(1)
+
+      intercept = @options.key?(:intercept) ? @options[:intercept] : true
+      if intercept
+        data.size.times do |i|
+          x[i].unshift(1)
+        end
       end
 
       gsl = options.key?(:gsl) ? options[:gsl] : defined?(GSL)
@@ -95,22 +63,32 @@ module Eps
       if gsl
         x = GSL::Matrix.alloc(*x)
         y = GSL::Vector.alloc(data.label)
-        c, @covariance, _, _ = GSL::MultiFit::linear(x, y)
+        w = GSL::Vector.alloc(data.weight) if data.weight
+        c, @covariance, _, _ = w ? GSL::MultiFit.wlinear(x, w, y) : GSL::MultiFit.linear(x, y)
         c.to_a
       else
         x = Matrix.rows(x)
         y = Matrix.column_vector(data.label)
+
+        # weighted OLS
+        # http://www.real-statistics.com/multiple-regression/weighted-linear-regression/weighted-regression-basics/
+        w = Matrix.diagonal(*data.weight) if data.weight
+
         removed = []
 
         # https://statsmaths.github.io/stat612/lectures/lec13/lecture13.pdf
-        # unforutnately, this method is unstable
+        # unfortunately, this method is unstable
         # haven't found an efficient way to do QR-factorization in Ruby
         # the extendmatrix gem has householder and givens (givens has bug)
         # but methods are too slow
         xt = x.t
+        xt *= w if w
         begin
           @xtxi = (xt * x).inverse
         rescue ExceptionForMatrix::ErrNotRegular
+          # matrix cannot be inverted
+          # https://en.wikipedia.org/wiki/Multicollinearity
+
           constant = {}
           (1...x.column_count).each do |i|
             constant[i] = constant?(x.column(i))
@@ -134,6 +112,7 @@ module Eps
           end
           x = Matrix.columns(vectors)
           xt = x.t
+          xt *= w if w
 
           # try again
           begin
@@ -144,6 +123,7 @@ module Eps
         end
         # huge performance boost
         # by multiplying xt * y first
+        # for weighted, w is already included in wt
         v2 = @xtxi * (xt * y)
 
         # convert to array
@@ -158,47 +138,14 @@ module Eps
           v2
         end
 
-      @coefficient_names = ["_intercept"] + data.columns.keys
-      @coefficients = Hash[@coefficient_names.zip(v3)]
-      Evaluators::LinearRegression.new(coefficients: @coefficients, features: @features, text_features: @text_features)
-    end
-
-    def generate_pmml
-      predictors = @coefficients.dup
-      predictors.delete("_intercept")
-
-      data_fields = {}
-      @features.each do |k, type|
-        if type == "categorical"
-          data_fields[k] = predictors.keys.select { |k, v| k.is_a?(Array) && k.first == k }.map(&:last)
-        else
-          data_fields[k] = nil
-        end
+      if @xtxi && @xtxi.each(:diagonal).any? { |v| v < 0 }
+        raise UnstableSolution, "GSL is needed to find a stable solution for this dataset"
       end
 
-      build_pmml(data_fields) do |xml|
-        xml.RegressionModel(functionName: "regression") do
-          xml.MiningSchema do
-            @features.each do |k, _|
-              xml.MiningField(name: k)
-            end
-          end
-          pmml_local_transformations(xml)
-          xml.RegressionTable(intercept: @coefficients["_intercept"]) do
-            predictors.each do |k, v|
-              if k.is_a?(Array)
-                if @features[k.first] == "text"
-                  xml.NumericPredictor(name: display_field(k), coefficient: v)
-                else
-                  xml.CategoricalPredictor(name: k[0], value: k[1], coefficient: v)
-                end
-              else
-                xml.NumericPredictor(name: k, coefficient: v)
-              end
-            end
-          end
-        end
-      end
-    end
+      @coefficient_names = data.columns.keys
+      @coefficient_names.unshift("_intercept") if intercept
+      @coefficients = Hash[@coefficient_names.zip(v3)]
+      Evaluators::LinearRegression.new(coefficients: @coefficients, features: @features, text_features: @text_features)
     end
 
     def prep_x(x)
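The weighted branch above is standard weighted least squares: with a diagonal weight matrix W, the coefficients are beta = (X^T W X)^-1 X^T W y, which is why folding W into the transpose (`xt *= w`) is all the existing normal-equation code needs. A standalone sketch with stdlib `Matrix` and toy data (hypothetical values):

```ruby
require "matrix"

# Toy design matrix (intercept column prepended, as in the diff), targets, and row weights
x = Matrix[[1, 1.0], [1, 2.0], [1, 3.0]]
y = Matrix.column_vector([2.0, 4.1, 5.9])
w = Matrix.diagonal(2.0, 1.0, 1.0)

# Weighted normal equations: beta = (X^T W X)^-1 X^T W y
xt = x.t
xt *= w                              # same trick as the diff: fold W into X^T
beta = (xt * x).inverse * (xt * y)
p beta.to_a.flatten                  # => approximately [0.06, 1.96]
```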
lib/eps/metrics.rb
@@ -1,31 +1,39 @@
 module Eps
   module Metrics
     class << self
-      def rmse(y_true, y_pred)
+      def rmse(y_true, y_pred, weight: nil)
         check_size(y_true, y_pred)
-        Math.sqrt(mean(errors(y_true, y_pred).map { |v| v**2 }))
+        Math.sqrt(mean(errors(y_true, y_pred).map { |v| v**2 }, weight: weight))
       end
 
-      def mae(y_true, y_pred)
+      def mae(y_true, y_pred, weight: nil)
         check_size(y_true, y_pred)
-        mean(errors(y_true, y_pred).map { |v| v.abs })
+        mean(errors(y_true, y_pred).map { |v| v.abs }, weight: weight)
       end
 
-      def me(y_true, y_pred)
+      def me(y_true, y_pred, weight: nil)
         check_size(y_true, y_pred)
-        mean(errors(y_true, y_pred))
+        mean(errors(y_true, y_pred), weight: weight)
       end
 
-      def accuracy(y_true, y_pred)
+      def accuracy(y_true, y_pred, weight: nil)
         check_size(y_true, y_pred)
-        y_true.zip(y_pred).count { |yt, yp| yt == yp } / y_true.size.to_f
+        values = y_true.zip(y_pred).map { |yt, yp| yt == yp ? 1 : 0 }
+        if weight
+          values.each_with_index do |v, i|
+            values[i] *= weight[i]
+          end
+          values.sum / weight.sum.to_f
+        else
+          values.sum / y_true.size.to_f
+        end
       end
 
       # http://wiki.fast.ai/index.php/Log_Loss
-      def log_loss(y_true, y_pred, eps: 1e-15)
+      def log_loss(y_true, y_pred, eps: 1e-15, weight: nil)
         check_size(y_true, y_pred)
         p = y_pred.map { |yp| yp.clamp(eps, 1 - eps) }
-        mean(y_true.zip(p).map { |yt, pi| yt == 1 ? -Math.log(pi) : -Math.log(1 - pi) })
+        mean(y_true.zip(p).map { |yt, pi| yt == 1 ? -Math.log(pi) : -Math.log(1 - pi) }, weight: weight)
       end
 
       private
@@ -34,8 +42,12 @@ module Eps
         raise ArgumentError, "Different sizes" if y_true.size != y_pred.size
       end
 
-      def mean(arr)
-        arr.sum / arr.size.to_f
+      def mean(arr, weight: nil)
+        if weight
+          arr.map.with_index { |v, i| v * weight[i] }.sum / weight.sum.to_f
+        else
+          arr.sum / arr.size.to_f
+        end
       end
 
       def errors(y_true, y_pred)
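All of the metric changes above reduce to the same weighted mean: multiply each value by its weight and divide by the total weight instead of the count. A quick worked check (hypothetical numbers):

```ruby
arr    = [1.0, 2.0, 4.0]
weight = [1.0, 1.0, 2.0]

unweighted = arr.sum / arr.size.to_f                                           # => 2.33...
weighted   = arr.map.with_index { |v, i| v * weight[i] }.sum / weight.sum.to_f # => (1 + 2 + 8) / 4 = 2.75
p [unweighted, weighted]
```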
lib/eps/model.rb
@@ -17,11 +17,11 @@ module Eps
 
       estimator_class =
         if data.css("Segmentation").any?
-          Eps::LightGBM
+          LightGBM
         elsif data.css("RegressionModel").any?
-          Eps::LinearRegression
+          LinearRegression
         elsif data.css("NaiveBayesModel").any?
-          Eps::NaiveBayes
+          NaiveBayes
         else
           raise "Unknown model"
         end
@@ -35,11 +35,11 @@ module Eps
       estimator_class =
         case algorithm
         when :lightgbm
-          Eps::LightGBM
+          LightGBM
         when :linear_regression
-          Eps::LinearRegression
+          LinearRegression
         when :naive_bayes
-          Eps::NaiveBayes
+          NaiveBayes
         else
           raise ArgumentError, "Unknown algorithm: #{algorithm}"
         end
lib/eps/naive_bayes.rb
@@ -3,91 +3,7 @@ module Eps
     attr_reader :probabilities
 
     def accuracy
-      Eps::Metrics.accuracy(@train_set.label, predict(@train_set))
-    end
-
-    # pmml
-
-    def self.load_pmml(data)
-      super do |data|
-        # TODO more validation
-        node = data.css("NaiveBayesModel")
-
-        prior = {}
-        node.css("BayesOutput TargetValueCount").each do |n|
-          prior[n.attribute("value").value] = n.attribute("count").value.to_f
-        end
-
-        legacy = false
-
-        conditional = {}
-        features = {}
-        node.css("BayesInput").each do |n|
-          prob = {}
-
-          # numeric
-          n.css("TargetValueStat").each do |n2|
-            n3 = n2.css("GaussianDistribution")
-            prob[n2.attribute("value").value] = {
-              mean: n3.attribute("mean").value.to_f,
-              stdev: Math.sqrt(n3.attribute("variance").value.to_f)
-            }
-          end
-
-          # detect bad form in Eps < 0.3
-          bad_format = n.css("PairCounts").map { |n2| n2.attribute("value").value } == prior.keys
-
-          # categorical
-          n.css("PairCounts").each do |n2|
-            if bad_format
-              n2.css("TargetValueCount").each do |n3|
-                prob[n3.attribute("value").value] ||= {}
-                prob[n3.attribute("value").value][n2.attribute("value").value] = BigDecimal(n3.attribute("count").value)
-              end
-            else
-              boom = {}
-              n2.css("TargetValueCount").each do |n3|
-                boom[n3.attribute("value").value] = BigDecimal(n3.attribute("count").value)
-              end
-              prob[n2.attribute("value").value] = boom
-            end
-          end
-
-          if bad_format
-            legacy = true
-            prob.each do |k, v|
-              prior.keys.each do |k|
-                v[k] ||= 0.0
-              end
-            end
-          end
-
-          name = n.attribute("fieldName").value
-          conditional[name] = prob
-          features[name] = n.css("TargetValueStat").any? ? "numeric" : "categorical"
-        end
-
-        target = node.css("BayesOutput").attribute("fieldName").value
-
-        probabilities = {
-          prior: prior,
-          conditional: conditional
-        }
-
-        # get derived fields
-        derived = {}
-        data.css("DerivedField").each do |n|
-          name = n.attribute("name").value
-          field = n.css("NormDiscrete").attribute("field").value
-          value = n.css("NormDiscrete").attribute("value").value
-          features.delete(name)
-          features[field] = "derived"
-          derived[field] ||= {}
-          derived[field][name] = value
-        end
-
-        Evaluators::NaiveBayes.new(probabilities: probabilities, features: features, derived: derived, legacy: legacy)
-      end
+      Eps::Metrics.accuracy(@train_set.label, predict(@train_set), weight: @train_set.weight)
     end
 
     private
@@ -105,6 +21,7 @@ module Eps
       raise "Target must be strings" if @target_type != "categorical"
       check_missing_value(@train_set)
       check_missing_value(@validation_set) if @validation_set
+      raise ArgumentError, "weight not supported" if @train_set.weight
 
       data = @train_set
 
@@ -185,60 +102,6 @@ module Eps
       Evaluators::NaiveBayes.new(probabilities: probabilities, features: @features)
     end
 
-    def generate_pmml
-      data_fields = {}
-      data_fields[@target] = probabilities[:prior].keys
-      probabilities[:conditional].each do |k, v|
-        if @features[k] == "categorical"
-          data_fields[k] = v.keys
-        else
-          data_fields[k] = nil
-        end
-      end
-
-      build_pmml(data_fields) do |xml|
-        xml.NaiveBayesModel(functionName: "classification", threshold: 0.001) do
-          xml.MiningSchema do
-            data_fields.each do |k, _|
-              xml.MiningField(name: k)
-            end
-          end
-          xml.BayesInputs do
-            probabilities[:conditional].each do |k, v|
-              xml.BayesInput(fieldName: k) do
-                if @features[k] == "categorical"
-                  v.sort_by { |k2, _| k2 }.each do |k2, v2|
-                    xml.PairCounts(value: k2) do
-                      xml.TargetValueCounts do
-                        v2.sort_by { |k2, _| k2 }.each do |k3, v3|
-                          xml.TargetValueCount(value: k3, count: v3)
-                        end
-                      end
-                    end
-                  end
-                else
-                  xml.TargetValueStats do
-                    v.sort_by { |k2, _| k2 }.each do |k2, v2|
-                      xml.TargetValueStat(value: k2) do
-                        xml.GaussianDistribution(mean: v2[:mean], variance: v2[:stdev]**2)
-                      end
-                    end
-                  end
-                end
-              end
-            end
-          end
-          xml.BayesOutput(fieldName: "target") do
-            xml.TargetValueCounts do
-              probabilities[:prior].sort_by { |k, _| k }.each do |k, v|
-                xml.TargetValueCount(value: k, count: v)
-              end
-            end
-          end
-        end
-      end
-    end
-
     def group_count(arr, start)
       arr.inject(start) { |h, e| h[e] += 1; h }
     end
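Unlike the linear regression and LightGBM estimators, naive Bayes rejects weights outright: `_train` now raises as soon as the train set carries weights. Assuming weights are passed through the model constructor the same way as for the other estimators (a sketch, not a documented example from the gem):

```ruby
require "eps"

# Hypothetical rows; naive Bayes requires a categorical (string) target
data = [
  {color: "red",  size: 1, label: "a", weight: 2},
  {color: "blue", size: 2, label: "b", weight: 1}
]

Eps::Model.new(data, target: :label, weight: :weight, algorithm: :naive_bayes)
# => ArgumentError: weight not supported
```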