eps 0.2.1 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +14 -0
- data/LICENSE.txt +1 -1
- data/README.md +183 -243
- data/lib/eps.rb +27 -3
- data/lib/eps/base_estimator.rb +316 -47
- data/lib/eps/data_frame.rb +141 -0
- data/lib/eps/evaluators/lightgbm.rb +116 -0
- data/lib/eps/evaluators/linear_regression.rb +54 -0
- data/lib/eps/evaluators/naive_bayes.rb +95 -0
- data/lib/eps/evaluators/node.rb +26 -0
- data/lib/eps/label_encoder.rb +41 -0
- data/lib/eps/lightgbm.rb +237 -0
- data/lib/eps/linear_regression.rb +132 -386
- data/lib/eps/metrics.rb +46 -0
- data/lib/eps/model.rb +16 -58
- data/lib/eps/naive_bayes.rb +175 -164
- data/lib/eps/pmml_generators/lightgbm.rb +187 -0
- data/lib/eps/statistics.rb +79 -0
- data/lib/eps/text_encoder.rb +81 -0
- data/lib/eps/utils.rb +22 -0
- data/lib/eps/version.rb +1 -1
- metadata +33 -7
data/lib/eps/metrics.rb
ADDED
@@ -0,0 +1,46 @@
+module Eps
+  module Metrics
+    class << self
+      def rmse(y_true, y_pred)
+        check_size(y_true, y_pred)
+        Math.sqrt(mean(errors(y_true, y_pred).map { |v| v**2 }))
+      end
+
+      def mae(y_true, y_pred)
+        check_size(y_true, y_pred)
+        mean(errors(y_true, y_pred).map { |v| v.abs })
+      end
+
+      def me(y_true, y_pred)
+        check_size(y_true, y_pred)
+        mean(errors(y_true, y_pred))
+      end
+
+      def accuracy(y_true, y_pred)
+        check_size(y_true, y_pred)
+        y_true.zip(y_pred).count { |yt, yp| yt == yp } / y_true.size.to_f
+      end
+
+      # http://wiki.fast.ai/index.php/Log_Loss
+      def log_loss(y_true, y_pred, eps: 1e-15)
+        check_size(y_true, y_pred)
+        p = y_pred.map { |yp| yp.clamp(eps, 1 - eps) }
+        mean(y_true.zip(p).map { |yt, pi| yt == 1 ? -Math.log(pi) : -Math.log(1 - pi) })
+      end
+
+      private
+
+      def check_size(y_true, y_pred)
+        raise ArgumentError, "Different sizes" if y_true.size != y_pred.size
+      end
+
+      def mean(arr)
+        arr.sum / arr.size.to_f
+      end
+
+      def errors(y_true, y_pred)
+        y_true.zip(y_pred).map { |yt, yp| yt - yp }
+      end
+    end
+  end
+end
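The new Eps::Metrics module gives the metrics a shared home, replacing the per-estimator metrics methods (the legacy Model.metrics is removed in model.rb below). A minimal usage sketch, not from the package itself; the sample values are invented:

    require "eps"

    y_true = [3.0, 5.0, 2.5]
    y_pred = [2.5, 5.0, 4.0]

    Eps::Metrics.rmse(y_true, y_pred) # => ~0.913
    Eps::Metrics.mae(y_true, y_pred)  # => ~0.667
    Eps::Metrics.me(y_true, y_pred)   # => ~-0.333 (errors are y_true - y_pred)

    # classification: exact-match rate and binary log loss
    # (predictions are clamped to [eps, 1 - eps] before taking logs)
    Eps::Metrics.accuracy(["a", "b", "a"], ["a", "a", "a"]) # => ~0.667
    Eps::Metrics.log_loss([1, 0, 1], [0.9, 0.2, 0.7])       # => ~0.228

    # mismatched lengths raise ArgumentError ("Different sizes") via check_size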
data/lib/eps/model.rb
CHANGED
@@ -1,12 +1,10 @@
 module Eps
   class Model
-    def initialize(data = nil, y = nil,
-      @options = options
-
+    def initialize(data = nil, y = nil, estimator: nil, **options)
       if estimator
         @estimator = estimator
-      elsif data
-        train(data, y,
+      elsif data
+        train(data, y, **options)
       end
     end
@@ -14,12 +12,13 @@ module Eps
 
     def self.load_pmml(data)
       if data.is_a?(String)
-        require "nokogiri"
         data = Nokogiri::XML(data) { |config| config.strict }
       end
 
       estimator_class =
-        if data.css("
+        if data.css("Segmentation").any?
+          Eps::LightGBM
+        elsif data.css("RegressionModel").any?
           Eps::LinearRegression
         elsif data.css("NaiveBayesModel").any?
           Eps::NaiveBayes
@@ -30,55 +29,22 @@ module Eps
       new(estimator: estimator_class.load_pmml(data))
     end
 
-    # ruby - legacy
-
-    def self.load(data)
-      new(estimator: Eps::LinearRegression.load(data))
-    end
-
-    # json - legacy
-
-    def self.load_json(data)
-      new(estimator: Eps::LinearRegression.load_json(data))
-    end
-
-    def to_json
-      @estimator ? @estimator.to_json : super
-    end
-
-    # pfa - legacy
-
-    def self.load_pfa(data)
-      new(estimator: Eps::LinearRegression.load_pfa(data))
-    end
-
-    # metrics
-
-    def self.metrics(actual, estimated)
-      estimator_class =
-        if numeric?(actual)
-          Eps::LinearRegression
-        else
-          Eps::NaiveBayes
-        end
-
-      estimator_class.metrics(actual, estimated)
-    end
-
     private
 
-    def train(data, y = nil, target: nil)
-      y ||= daru?(data) ? data[target].to_a : data.map { |r| r[target] }
-
+    def train(data, y = nil, target: nil, algorithm: :lightgbm, **options)
       estimator_class =
-        if numeric?(y)
+        case algorithm
+        when :lightgbm
+          Eps::LightGBM
+        when :linear_regression
           Eps::LinearRegression
-        else
+        when :naive_bayes
           Eps::NaiveBayes
+        else
+          raise ArgumentError, "Unknown algorithm: #{algorithm}"
        end
 
-      @estimator = estimator_class.new(
-      @estimator.train(data, y, target: target)
+      @estimator = estimator_class.new(data, y, target: target, **options)
     end
 
     def respond_to_missing?(name, include_private = false)
@@ -90,19 +56,11 @@ module Eps
     end
 
     def method_missing(method, *args, &block)
-      if @estimator
+      if @estimator && @estimator.respond_to?(method)
         @estimator.public_send(method, *args, &block)
       else
         super
       end
     end
-
-    def self.numeric?(y)
-      y.first.is_a?(Numeric)
-    end
-
-    def daru?(x)
-      defined?(Daru) && x.is_a?(Daru::DataFrame)
-    end
   end
 end
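Training now routes through an explicit algorithm: option instead of inferring the estimator from the target type, and loading goes through PMML only. A minimal sketch of the 0.3.0 flow, with hypothetical rows and column names, and assuming to_pmml is still provided by the estimator as in 0.2:

    require "eps"

    data = [
      {bedrooms: 1, city: "A", price: 100_000},
      {bedrooms: 2, city: "A", price: 125_000},
      {bedrooms: 2, city: "B", price: 135_000},
      {bedrooms: 3, city: "B", price: 162_000}
    ]

    # algorithm: defaults to :lightgbm; :linear_regression and
    # :naive_bayes are the other accepted values, and anything
    # else raises ArgumentError
    model = Eps::Model.new(data, target: :price, algorithm: :linear_regression)
    model.predict(bedrooms: 2, city: "A")

    # load_pmml picks the estimator class from the markup:
    # Segmentation => LightGBM, RegressionModel => LinearRegression,
    # NaiveBayesModel => NaiveBayes
    model2 = Eps::Model.load_pmml(model.to_pmml)

Since the Ruby/JSON/PFA loaders (load, load_json, load_pfa) and Model.metrics are removed, models persisted from 0.2 need to be re-saved as PMML, and metrics computed through Eps::Metrics.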
data/lib/eps/naive_bayes.rb
CHANGED
@@ -2,227 +2,245 @@ module Eps
   class NaiveBayes < BaseEstimator
     attr_reader :probabilities
 
-    def
-      @
-      @target = target
+    def accuracy
+      Eps::Metrics.accuracy(@train_set.label, predict(@train_set))
     end
 
-
-      super
-
-      @y = @y.map { |yi| yi.to_s }
+    # pmml
 
-
-
+    def self.load_pmml(data)
+      super do |data|
+        # TODO more validation
+        node = data.css("NaiveBayesModel")
 
-
-
-
-      x.each_with_index do |xi, i|
-        xi[@target] = @y[i]
+        prior = {}
+        node.css("BayesOutput TargetValueCount").each do |n|
+          prior[n.attribute("value").value] = n.attribute("count").value.to_f
         end
-
-
-
-
-
-
-
-
-
-
-
-
+
+        legacy = false
+
+        conditional = {}
+        features = {}
+        node.css("BayesInput").each do |n|
+          prob = {}
+
+          # numeric
+          n.css("TargetValueStat").each do |n2|
+            n3 = n2.css("GaussianDistribution")
+            prob[n2.attribute("value").value] = {
+              mean: n3.attribute("mean").value.to_f,
+              stdev: Math.sqrt(n3.attribute("variance").value.to_f)
+            }
+          end
+
+          # detect bad format in Eps < 0.3
+          bad_format = n.css("PairCounts").map { |n2| n2.attribute("value").value } == prior.keys
+
+          # categorical
+          n.css("PairCounts").each do |n2|
+            if bad_format
+              n2.css("TargetValueCount").each do |n3|
+                prob[n3.attribute("value").value] ||= {}
+                prob[n3.attribute("value").value][n2.attribute("value").value] = BigDecimal(n3.attribute("count").value)
+              end
             else
-
+              boom = {}
+              n2.css("TargetValueCount").each do |n3|
+                boom[n3.attribute("value").value] = BigDecimal(n3.attribute("count").value)
+              end
+              prob[n2.attribute("value").value] = boom
+            end
+          end
+
+          if bad_format
+            legacy = true
+            prob.each do |k, v|
+              prior.keys.each do |k|
+                v[k] ||= 0.0
+              end
            end
          end
+
+          name = n.attribute("fieldName").value
+          conditional[name] = prob
+          features[name] = n.css("TargetValueStat").any? ? "numeric" : "categorical"
        end
-      end
 
-
-
-
-
+        target = node.css("BayesOutput").attribute("fieldName").value
+
+        probabilities = {
+          prior: prior,
+          conditional: conditional
+        }
+
+        # get derived fields
+        derived = {}
+        data.css("DerivedField").each do |n|
+          name = n.attribute("name").value
+          field = n.css("NormDiscrete").attribute("field").value
+          value = n.css("NormDiscrete").attribute("value").value
+          features.delete(name)
+          features[field] = "derived"
+          derived[field] ||= {}
+          derived[field][name] = value
+        end
+
+        Evaluators::NaiveBayes.new(probabilities: probabilities, features: features, derived: derived, legacy: legacy)
+      end
     end
 
+    private
+
     # TODO better summary
-    def
+    def _summary(extended: false)
       str = String.new("")
       probabilities[:prior].each do |k, v|
         str += "#{k}: #{v}\n"
       end
-      str += "\n"
-      str += "accuracy: %d%%\n" % [(100 * accuracy).round]
       str
     end
 
-    def
-
-
+    def _train(smoothing: 1, **options)
+      raise "Target must be strings" if @target_type != "categorical"
+      check_missing_value(@train_set)
+      check_missing_value(@validation_set) if @validation_set
 
-
+      data = @train_set
 
-
-
-
+      prep_text_features(data)
+
+      # convert boolean to strings
+      data.label = data.label.map(&:to_s)
+
+      indexes = {}
+      data.label.each_with_index do |yi, i|
+        (indexes[yi] ||= []) << i
+      end
+
+      grouped = {}
+      indexes.each do |k, v|
+        grouped[k] = data[v]
+      end
 
       prior = {}
-
-      prior[
+      grouped.sort_by { |k, _| k }.each do |k, v|
+        prior[k] = v.size
+      end
+      labels = prior.keys
+
+      target_counts = {}
+      labels.each do |k|
+        target_counts[k] = 0
       end
 
       conditional = {}
-
+
+      @features.each do |k, type|
         prob = {}
-
-
-
-
-
-        }
-
-
-
-
-
+
+        case type
+        when "text"
+          raise "Text features not supported yet for naive Bayes"
+        when "categorical"
+          groups = Hash.new { |hash, key| hash[key] = [] }
+          data.columns[k].each_with_index do |v, i|
+            groups[v] << i
+          end
+
+          groups.each do |group, indexes|
+            df = data[indexes]
+            prob[group] = group_count(df.label, target_counts.dup)
+          end
+
+          # smooth
+          if smoothing
+            labels.each do |label|
+              sum = prob.map { |k2, v2| v2[label] }.sum.to_f
+              prob.each do |k2, v|
+                v[label] = (v[label] + smoothing) * sum / (sum + (prob.size * smoothing))
+              end
+            end
+          end
+        else
+          labels.each do |group|
+            xs = grouped[group]
+
+            # TODO handle this case
+            next unless xs
+
+            values = xs.columns[k]
+            prob[group] = {mean: mean(values), stdev: stdev(values)}
          end
-          prob[n2.attribute("value").value] = boom
        end
-        conditional[n.attribute("fieldName").value] = prob
-      end
 
-
+        conditional[k] = prob
+      end
 
-      probabilities = {
+      @probabilities = {
         prior: prior,
         conditional: conditional
       }
 
-      new(probabilities: probabilities,
+      Evaluators::NaiveBayes.new(probabilities: probabilities, features: @features)
     end
 
-    def
+    def generate_pmml
       data_fields = {}
       data_fields[@target] = probabilities[:prior].keys
       probabilities[:conditional].each do |k, v|
-        if
+        if @features[k] == "categorical"
           data_fields[k] = v.keys
         else
           data_fields[k] = nil
         end
       end
 
-
-      xml.
-      xml.
-
-
-        if vs
-          xml.DataField(name: k, optype: "categorical", dataType: "string") do
-            vs.each do |v|
-              xml.Value(value: v)
-            end
-          end
-        else
-          xml.DataField(name: k, optype: "continuous", dataType: "double")
-        end
+      build_pmml(data_fields) do |xml|
+        xml.NaiveBayesModel(functionName: "classification", threshold: 0.001) do
+          xml.MiningSchema do
+            data_fields.each do |k, _|
+              xml.MiningField(name: k)
            end
          end
-      xml.
-
-
-
-
-
-
-
-
-          if !v.values[0][:mean]
-            v.each do |k2, v2|
-              xml.PairCounts(value: k2) do
-                xml.TargetValueCounts do
-                  v2.each do |k3, v3|
-                    xml.TargetValueCount(value: k3, count: v3)
-                  end
+          xml.BayesInputs do
+            probabilities[:conditional].each do |k, v|
+              xml.BayesInput(fieldName: k) do
+                if @features[k] == "categorical"
+                  v.sort_by { |k2, _| k2 }.each do |k2, v2|
+                    xml.PairCounts(value: k2) do
+                      xml.TargetValueCounts do
+                        v2.sort_by { |k2, _| k2 }.each do |k3, v3|
+                          xml.TargetValueCount(value: k3, count: v3)
                        end
                      end
                    end
-
-
-
-
-
-
+                  end
+                else
+                  xml.TargetValueStats do
+                    v.sort_by { |k2, _| k2 }.each do |k2, v2|
+                      xml.TargetValueStat(value: k2) do
+                        xml.GaussianDistribution(mean: v2[:mean], variance: v2[:stdev]**2)
                      end
                    end
                  end
                end
              end
            end
-          xml.BayesOutput(fieldName: "target") do
-            xml.TargetValueCounts do
-              probabilities[:prior].each do |k, v|
-                xml.TargetValueCount(value: k, count: v)
-              end
-            end
-          end
          end
-
-
-
-
-
-
-    def self.metrics(actual, estimated)
-      {
-        accuracy: actual.zip(estimated).count { |yi, yi2| yi == yi2 } / actual.size.to_f
-      }
-    end
-
-    private
-
-    def _predict(x)
-      x.map do |xi|
-        probs = calculate_class_probabilities(stringify_keys(xi))
-        # deterministic for equal probabilities
-        probs.sort_by { |k, v| [-v, k.to_s] }[0][0]
-      end
-    end
-
-    def calculate_class_probabilities(x)
-      prob = {}
-      probabilities[:prior].each do |c, cv|
-        prob[c] = cv.to_f / probabilities[:prior].values.sum
-        probabilities[:conditional].each do |k, v|
-          if !v[c][:mean]
-            # TODO compute ahead of time
-            p2 = v[c][x[k]].to_f / v[c].values.sum
-
-            # assign very small probability if probability is 0
-            # TODO use proper smoothing instead
-            if p2 == 0
-              p2 = 0.0001
+          xml.BayesOutput(fieldName: "target") do
+            xml.TargetValueCounts do
+              probabilities[:prior].sort_by { |k, _| k }.each do |k, v|
+                xml.TargetValueCount(value: k, count: v)
+              end
            end
-
-            prob[c] *= p2
-          else
-            prob[c] *= calculate_probability(x[k], v[c][:mean], v[c][:stdev])
          end
        end
      end
-      prob
    end
 
-    def
-
-      (1 / (Math.sqrt(2 * Math::PI) * stdev)) * exponent
-    end
-
-    def group_count(arr)
-      r = arr.inject(Hash.new(0)) { |h, e| h[e.to_s] += 1 ; h }
-      r.default = nil
-      r
+    def group_count(arr, start)
+      arr.inject(start) { |h, e| h[e] += 1; h }
     end
 
     def mean(arr)
@@ -230,17 +248,10 @@ module Eps
     end
 
     def stdev(arr)
+      return nil if arr.size <= 1
       m = mean(arr)
       sum = arr.inject(0) { |accum, i| accum + (i - m)**2 }
       Math.sqrt(sum / (arr.length - 1).to_f)
     end
-
-    def stringify_keys(h)
-      o = {}
-      h.each do |k, v|
-        o[k.to_s] = v
-      end
-      o
-    end
   end
 end
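For orientation, a toy classification run against the reworked class. The data and column names are hypothetical; smoothing defaults to 1 per the _train signature above, and string labels are enforced by the "Target must be strings" check:

    require "eps"

    data = [
      {weather: "sunny", temp: 30, play: "yes"},
      {weather: "sunny", temp: 25, play: "yes"},
      {weather: "rainy", temp: 15, play: "no"},
      {weather: "rainy", temp: 12, play: "no"}
    ]

    model = Eps::Model.new(data, target: :play, algorithm: :naive_bayes)
    model.predict(weather: "sunny", temp: 28) # "yes" is the expected class here
    model.accuracy # training accuracy, computed via Eps::Metrics.accuracy

Note the bad_format/legacy branch in load_pmml: PMML written by eps < 0.3 keyed PairCounts by target value rather than feature value, so the loader detects that shape and re-keys the counts, keeping older saved models loadable.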