eps 0.3.0 → 0.3.5

This diff shows the changes between publicly released versions of the package as published to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
@@ -1,31 +1,39 @@
 module Eps
   module Metrics
     class << self
-      def rmse(y_true, y_pred)
+      def rmse(y_true, y_pred, weight: nil)
         check_size(y_true, y_pred)
-        Math.sqrt(mean(errors(y_true, y_pred).map { |v| v**2 }))
+        Math.sqrt(mean(errors(y_true, y_pred).map { |v| v**2 }, weight: weight))
       end
 
-      def mae(y_true, y_pred)
+      def mae(y_true, y_pred, weight: nil)
         check_size(y_true, y_pred)
-        mean(errors(y_true, y_pred).map { |v| v.abs })
+        mean(errors(y_true, y_pred).map { |v| v.abs }, weight: weight)
       end
 
-      def me(y_true, y_pred)
+      def me(y_true, y_pred, weight: nil)
         check_size(y_true, y_pred)
-        mean(errors(y_true, y_pred))
+        mean(errors(y_true, y_pred), weight: weight)
       end
 
-      def accuracy(y_true, y_pred)
+      def accuracy(y_true, y_pred, weight: nil)
         check_size(y_true, y_pred)
-        y_true.zip(y_pred).count { |yt, yp| yt == yp } / y_true.size.to_f
+        values = y_true.zip(y_pred).map { |yt, yp| yt == yp ? 1 : 0 }
+        if weight
+          values.each_with_index do |v, i|
+            values[i] *= weight[i]
+          end
+          values.sum / weight.sum.to_f
+        else
+          values.sum / y_true.size.to_f
+        end
       end
 
       # http://wiki.fast.ai/index.php/Log_Loss
-      def log_loss(y_true, y_pred, eps: 1e-15)
+      def log_loss(y_true, y_pred, eps: 1e-15, weight: nil)
         check_size(y_true, y_pred)
         p = y_pred.map { |yp| yp.clamp(eps, 1 - eps) }
-        mean(y_true.zip(p).map { |yt, pi| yt == 1 ? -Math.log(pi) : -Math.log(1 - pi) })
+        mean(y_true.zip(p).map { |yt, pi| yt == 1 ? -Math.log(pi) : -Math.log(1 - pi) }, weight: weight)
       end
 
       private
@@ -34,8 +42,12 @@ module Eps
         raise ArgumentError, "Different sizes" if y_true.size != y_pred.size
       end
 
-      def mean(arr)
-        arr.sum / arr.size.to_f
+      def mean(arr, weight: nil)
+        if weight
+          arr.map.with_index { |v, i| v * weight[i] }.sum / weight.sum.to_f
+        else
+          arr.sum / arr.size.to_f
+        end
       end
 
       def errors(y_true, y_pred)
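
Every public metric above now takes an optional weight: array, which is forwarded to the weighted mean added at the bottom of the hunk. A minimal usage sketch (the numbers are illustrative, not from the diff):

    y_true = [1.0, 2.0, 3.0]
    y_pred = [1.5, 2.0, 2.5]
    Eps::Metrics.rmse(y_true, y_pred)                    # sqrt of the plain mean of squared errors
    Eps::Metrics.rmse(y_true, y_pred, weight: [1, 1, 2]) # sqrt of sum(w * e**2) / sum(w)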
@@ -17,11 +17,11 @@ module Eps
 
      estimator_class =
        if data.css("Segmentation").any?
-          Eps::LightGBM
+          LightGBM
        elsif data.css("RegressionModel").any?
-          Eps::LinearRegression
+          LinearRegression
        elsif data.css("NaiveBayesModel").any?
-          Eps::NaiveBayes
+          NaiveBayes
        else
          raise "Unknown model"
        end
@@ -35,11 +35,11 @@ module Eps
      estimator_class =
        case algorithm
        when :lightgbm
-          Eps::LightGBM
+          LightGBM
        when :linear_regression
-          Eps::LinearRegression
+          LinearRegression
        when :naive_bayes
-          Eps::NaiveBayes
+          NaiveBayes
        else
          raise ArgumentError, "Unknown algorithm: #{algorithm}"
        end
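
Both branches now reference the estimator constants relative to the enclosing Eps namespace (LightGBM instead of Eps::LightGBM); the resolved classes are unchanged. For orientation, a hedged sketch of how the second branch is typically reached through the gem's public API (the algorithm: option is existing Eps behavior, not introduced by this diff):

    # explicit algorithm selection maps onto the case statement above
    model = Eps::Model.new(houses, target: "price", algorithm: :linear_regression)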
@@ -3,91 +3,7 @@ module Eps
     attr_reader :probabilities
 
     def accuracy
-      Eps::Metrics.accuracy(@train_set.label, predict(@train_set))
-    end
-
-    # pmml
-
-    def self.load_pmml(data)
-      super do |data|
-        # TODO more validation
-        node = data.css("NaiveBayesModel")
-
-        prior = {}
-        node.css("BayesOutput TargetValueCount").each do |n|
-          prior[n.attribute("value").value] = n.attribute("count").value.to_f
-        end
-
-        legacy = false
-
-        conditional = {}
-        features = {}
-        node.css("BayesInput").each do |n|
-          prob = {}
-
-          # numeric
-          n.css("TargetValueStat").each do |n2|
-            n3 = n2.css("GaussianDistribution")
-            prob[n2.attribute("value").value] = {
-              mean: n3.attribute("mean").value.to_f,
-              stdev: Math.sqrt(n3.attribute("variance").value.to_f)
-            }
-          end
-
-          # detect bad form in Eps < 0.3
-          bad_format = n.css("PairCounts").map { |n2| n2.attribute("value").value } == prior.keys
-
-          # categorical
-          n.css("PairCounts").each do |n2|
-            if bad_format
-              n2.css("TargetValueCount").each do |n3|
-                prob[n3.attribute("value").value] ||= {}
-                prob[n3.attribute("value").value][n2.attribute("value").value] = BigDecimal(n3.attribute("count").value)
-              end
-            else
-              boom = {}
-              n2.css("TargetValueCount").each do |n3|
-                boom[n3.attribute("value").value] = BigDecimal(n3.attribute("count").value)
-              end
-              prob[n2.attribute("value").value] = boom
-            end
-          end
-
-          if bad_format
-            legacy = true
-            prob.each do |k, v|
-              prior.keys.each do |k|
-                v[k] ||= 0.0
-              end
-            end
-          end
-
-          name = n.attribute("fieldName").value
-          conditional[name] = prob
-          features[name] = n.css("TargetValueStat").any? ? "numeric" : "categorical"
-        end
-
-        target = node.css("BayesOutput").attribute("fieldName").value
-
-        probabilities = {
-          prior: prior,
-          conditional: conditional
-        }
-
-        # get derived fields
-        derived = {}
-        data.css("DerivedField").each do |n|
-          name = n.attribute("name").value
-          field = n.css("NormDiscrete").attribute("field").value
-          value = n.css("NormDiscrete").attribute("value").value
-          features.delete(name)
-          features[field] = "derived"
-          derived[field] ||= {}
-          derived[field][name] = value
-        end
-
-        Evaluators::NaiveBayes.new(probabilities: probabilities, features: features, derived: derived, legacy: legacy)
-      end
+      Eps::Metrics.accuracy(@train_set.label, predict(@train_set), weight: @train_set.weight)
     end
 
     private
@@ -101,10 +17,11 @@ module Eps
      str
    end
 
-    def _train(smoothing: 1, **options)
+    def _train(smoothing: 1)
      raise "Target must be strings" if @target_type != "categorical"
      check_missing_value(@train_set)
      check_missing_value(@validation_set) if @validation_set
+      raise ArgumentError, "weight not supported" if @train_set.weight
 
      data = @train_set
 
@@ -185,60 +102,6 @@ module Eps
      Evaluators::NaiveBayes.new(probabilities: probabilities, features: @features)
    end
 
-    def generate_pmml
-      data_fields = {}
-      data_fields[@target] = probabilities[:prior].keys
-      probabilities[:conditional].each do |k, v|
-        if @features[k] == "categorical"
-          data_fields[k] = v.keys
-        else
-          data_fields[k] = nil
-        end
-      end
-
-      build_pmml(data_fields) do |xml|
-        xml.NaiveBayesModel(functionName: "classification", threshold: 0.001) do
-          xml.MiningSchema do
-            data_fields.each do |k, _|
-              xml.MiningField(name: k)
-            end
-          end
-          xml.BayesInputs do
-            probabilities[:conditional].each do |k, v|
-              xml.BayesInput(fieldName: k) do
-                if @features[k] == "categorical"
-                  v.sort_by { |k2, _| k2 }.each do |k2, v2|
-                    xml.PairCounts(value: k2) do
-                      xml.TargetValueCounts do
-                        v2.sort_by { |k2, _| k2 }.each do |k3, v3|
-                          xml.TargetValueCount(value: k3, count: v3)
-                        end
-                      end
-                    end
-                  end
-                else
-                  xml.TargetValueStats do
-                    v.sort_by { |k2, _| k2 }.each do |k2, v2|
-                      xml.TargetValueStat(value: k2) do
-                        xml.GaussianDistribution(mean: v2[:mean], variance: v2[:stdev]**2)
-                      end
-                    end
-                  end
-                end
-              end
-            end
-          end
-          xml.BayesOutput(fieldName: "target") do
-            xml.TargetValueCounts do
-              probabilities[:prior].sort_by { |k, _| k }.each do |k, v|
-                xml.TargetValueCount(value: k, count: v)
-              end
-            end
-          end
-        end
-      end
-    end
-
    def group_count(arr, start)
      arr.inject(start) { |h, e| h[e] += 1; h }
    end
@@ -0,0 +1,14 @@
+# similar to Marshal/JSON/YAML interface
+module Eps
+  module PMML
+    class << self
+      def load(pmml)
+        Loader.new(pmml).load
+      end
+
+      def generate(model)
+        Generator.new(model).generate
+      end
+    end
+  end
+end
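
As its comment says, the new Eps::PMML module mirrors the Marshal/JSON/YAML load/generate pattern and simply delegates to PMML::Loader and PMML::Generator. A hedged sketch of the round trip (Loader is added elsewhere in this release and is not shown in this diff):

    pmml = Eps::PMML.generate(model)   # trained model -> PMML XML string via PMML::Generator
    evaluator = Eps::PMML.load(pmml)   # PMML XML -> evaluator via PMML::Loader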
@@ -0,0 +1,422 @@
+module Eps
+  module PMML
+    class Generator
+      attr_reader :model
+
+      def initialize(model)
+        @model = model
+      end
+
+      def generate
+        case @model
+        when LightGBM
+          lightgbm
+        when LinearRegression
+          linear_regression
+        when NaiveBayes
+          naive_bayes
+        else
+          raise "Unknown model"
+        end
+      end
+
+      private
+
+      def lightgbm
+        data_fields = {}
+        data_fields[target] = labels if labels
+        features.each_with_index do |(k, type), i|
+          # TODO remove zero importance features
+          if type == "categorical"
+            data_fields[k] = label_encoders[k].labels.keys
+          else
+            data_fields[k] = nil
+          end
+        end
+
+        build_pmml(data_fields) do |xml|
+          function_name = objective == "regression" ? "regression" : "classification"
+          xml.MiningModel(functionName: function_name, algorithmName: "LightGBM") do
+            xml.MiningSchema do
+              xml.MiningField(name: target, usageType: "target")
+              features.keys.each_with_index do |k, i|
+                # next if feature_importance[i] == 0
+                # TODO add importance, but need to handle text features
+                xml.MiningField(name: k) #, importance: feature_importance[i].to_f, missingValueTreatment: "asIs")
+              end
+            end
+            pmml_local_transformations(xml)
+
+            case objective
+            when "regression"
+              xml_segmentation(xml, trees)
+            when "binary"
+              xml.Segmentation(multipleModelMethod: "modelChain") do
+                xml.Segment(id: 1) do
+                  xml.True
+                  xml.MiningModel(functionName: "regression") do
+                    xml.MiningSchema do
+                      features.each do |k, _|
+                        xml.MiningField(name: k)
+                      end
+                    end
+                    xml.Output do
+                      xml.OutputField(name: "lgbmValue", optype: "continuous", dataType: "double", feature: "predictedValue", isFinalResult: false) do
+                        xml.Apply(function: "/") do
+                          xml.Constant(dataType: "double") do
+                            1.0
+                          end
+                          xml.Apply(function: "+") do
+                            xml.Constant(dataType: "double") do
+                              1.0
+                            end
+                            xml.Apply(function: "exp") do
+                              xml.Apply(function: "*") do
+                                xml.Constant(dataType: "double") do
+                                  -1.0
+                                end
+                                xml.FieldRef(field: "lgbmValue")
+                              end
+                            end
+                          end
+                        end
+                      end
+                    end
+                    xml_segmentation(xml, trees)
+                  end
+                end
+                xml.Segment(id: 2) do
+                  xml.True
+                  xml.RegressionModel(functionName: "classification", normalizationMethod: "none") do
+                    xml.MiningSchema do
+                      xml.MiningField(name: target, usageType: "target")
+                      xml.MiningField(name: "transformedLgbmValue")
+                    end
+                    xml.Output do
+                      labels.each do |label|
+                        xml.OutputField(name: "probability(#{label})", optype: "continuous", dataType: "double", feature: "probability", value: label)
+                      end
+                    end
+                    xml.RegressionTable(intercept: 0.0, targetCategory: labels.last) do
+                      xml.NumericPredictor(name: "transformedLgbmValue", coefficient: "1.0")
+                    end
+                    xml.RegressionTable(intercept: 0.0, targetCategory: labels.first)
+                  end
+                end
+              end
+            else # multiclass
+              xml.Segmentation(multipleModelMethod: "modelChain") do
+                n = trees.size / labels.size
+                trees.each_slice(n).each_with_index do |trees, idx|
+                  xml.Segment(id: idx + 1) do
+                    xml.True
+                    xml.MiningModel(functionName: "regression") do
+                      xml.MiningSchema do
+                        features.each do |k, _|
+                          xml.MiningField(name: k)
+                        end
+                      end
+                      xml.Output do
+                        xml.OutputField(name: "lgbmValue(#{labels[idx]})", optype: "continuous", dataType: "double", feature: "predictedValue", isFinalResult: false)
+                      end
+                      xml_segmentation(xml, trees)
+                    end
+                  end
+                end
+                xml.Segment(id: labels.size + 1) do
+                  xml.True
+                  xml.RegressionModel(functionName: "classification", normalizationMethod: "softmax") do
+                    xml.MiningSchema do
+                      xml.MiningField(name: target, usageType: "target")
+                      labels.each do |label|
+                        xml.MiningField(name: "lgbmValue(#{label})")
+                      end
+                    end
+                    xml.Output do
+                      labels.each do |label|
+                        xml.OutputField(name: "probability(#{label})", optype: "continuous", dataType: "double", feature: "probability", value: label)
+                      end
+                    end
+                    labels.each do |label|
+                      xml.RegressionTable(intercept: 0.0, targetCategory: label) do
+                        xml.NumericPredictor(name: "lgbmValue(#{label})", coefficient: "1.0")
+                      end
+                    end
+                  end
+                end
+              end
+            end
+          end
+        end
+      end
+
+      def linear_regression
+        predictors = model.instance_variable_get("@coefficients").dup
+        intercept = predictors.delete("_intercept") || 0.0
+
+        data_fields = {}
+        features.each do |k, type|
+          if type == "categorical"
+            data_fields[k] = predictors.keys.select { |k, v| k.is_a?(Array) && k.first == k }.map(&:last)
+          else
+            data_fields[k] = nil
+          end
+        end
+
+        build_pmml(data_fields) do |xml|
+          xml.RegressionModel(functionName: "regression") do
+            xml.MiningSchema do
+              features.each do |k, _|
+                xml.MiningField(name: k)
+              end
+            end
+            pmml_local_transformations(xml)
+            xml.RegressionTable(intercept: intercept) do
+              predictors.each do |k, v|
+                if k.is_a?(Array)
+                  if features[k.first] == "text"
+                    xml.NumericPredictor(name: display_field(k), coefficient: v)
+                  else
+                    xml.CategoricalPredictor(name: k[0], value: k[1], coefficient: v)
+                  end
+                else
+                  xml.NumericPredictor(name: k, coefficient: v)
+                end
+              end
+            end
+          end
+        end
+      end
+
+      def naive_bayes
+        data_fields = {}
+        data_fields[target] = probabilities[:prior].keys
+        probabilities[:conditional].each do |k, v|
+          if features[k] == "categorical"
+            data_fields[k] = v.keys
+          else
+            data_fields[k] = nil
+          end
+        end
+
+        build_pmml(data_fields) do |xml|
+          xml.NaiveBayesModel(functionName: "classification", threshold: 0.001) do
+            xml.MiningSchema do
+              data_fields.each do |k, _|
+                xml.MiningField(name: k)
+              end
+            end
+            xml.BayesInputs do
+              probabilities[:conditional].each do |k, v|
+                xml.BayesInput(fieldName: k) do
+                  if features[k] == "categorical"
+                    v.sort_by { |k2, _| k2.to_s }.each do |k2, v2|
+                      xml.PairCounts(value: k2) do
+                        xml.TargetValueCounts do
+                          v2.sort_by { |k2, _| k2.to_s }.each do |k3, v3|
+                            xml.TargetValueCount(value: k3, count: v3)
+                          end
+                        end
+                      end
+                    end
+                  else
+                    xml.TargetValueStats do
+                      v.sort_by { |k2, _| k2.to_s }.each do |k2, v2|
+                        xml.TargetValueStat(value: k2) do
+                          xml.GaussianDistribution(mean: v2[:mean], variance: v2[:stdev]**2)
+                        end
+                      end
+                    end
+                  end
+                end
+              end
+            end
+            xml.BayesOutput(fieldName: "target") do
+              xml.TargetValueCounts do
+                probabilities[:prior].sort_by { |k, _| k.to_s }.each do |k, v|
+                  xml.TargetValueCount(value: k, count: v)
+                end
+              end
+            end
+          end
+        end
+      end
+
+      def display_field(k)
+        if k.is_a?(Array)
+          if features[k.first] == "text"
+            "#{k.first}(#{k.last})"
+          else
+            k.join("=")
+          end
+        else
+          k
+        end
+      end
+
+      def xml_segmentation(xml, trees)
+        xml.Segmentation(multipleModelMethod: "sum") do
+          trees.each_with_index do |node, i|
+            xml.Segment(id: i + 1) do
+              xml.True
+              xml.TreeModel(functionName: "regression", missingValueStrategy: "none", noTrueChildStrategy: "returnLastPrediction", splitCharacteristic: "multiSplit") do
+                xml.MiningSchema do
+                  node_fields(node).uniq.each do |k|
+                    xml.MiningField(name: display_field(k))
+                  end
+                end
+                node_pmml(node, xml)
+              end
+            end
+          end
+        end
+      end
+
+      def node_fields(node)
+        fields = []
+        fields << node.field if node.predicate
+        node.children.each do |n|
+          fields.concat(node_fields(n))
+        end
+        fields
+      end
+
+      def node_pmml(node, xml)
+        xml.Node(score: node.score) do
+          if node.predicate.nil?
+            xml.True
+          elsif node.operator == "in"
+            xml.SimpleSetPredicate(field: display_field(node.field), booleanOperator: "isIn") do
+              xml.Array(type: "string") do
+                xml.text node.value.map { |v| escape_element(v) }.join(" ")
+              end
+            end
+          else
+            xml.SimplePredicate(field: display_field(node.field), operator: node.operator, value: node.value)
+          end
+          node.children.each do |n|
+            node_pmml(n, xml)
+          end
+        end
+      end
+
+      def escape_element(v)
+        "\"#{v.gsub("\"", "\\\"")}\""
+      end
+
+      def build_pmml(data_fields)
+        Nokogiri::XML::Builder.new do |xml|
+          xml.PMML(version: "4.4", xmlns: "http://www.dmg.org/PMML-4_4", "xmlns:xsi" => "http://www.w3.org/2001/XMLSchema-instance") do
+            pmml_header(xml)
+            pmml_data_dictionary(xml, data_fields)
+            pmml_transformation_dictionary(xml)
+            yield xml
+          end
+        end.to_xml
+      end
+
+      def pmml_header(xml)
+        xml.Header do
+          xml.Application(name: "Eps", version: Eps::VERSION)
+          # xml.Timestamp Time.now.utc.iso8601
+        end
+      end
+
+      def pmml_data_dictionary(xml, data_fields)
+        xml.DataDictionary do
+          data_fields.each do |k, vs|
+            case features[k]
+            when "categorical", nil
+              xml.DataField(name: k, optype: "categorical", dataType: "string") do
+                vs.map(&:to_s).sort.each do |v|
+                  xml.Value(value: v)
+                end
+              end
+            when "text"
+              xml.DataField(name: k, optype: "categorical", dataType: "string")
+            else
+              xml.DataField(name: k, optype: "continuous", dataType: "double")
+            end
+          end
+        end
+      end
+
+      def pmml_transformation_dictionary(xml)
+        if text_features.any?
+          xml.TransformationDictionary do
+            text_features.each do |k, text_options|
+              xml.DefineFunction(name: "#{k}Transform", optype: "continuous") do
+                xml.ParameterField(name: "text")
+                xml.ParameterField(name: "term")
+                xml.TextIndex(textField: "text", localTermWeights: "termFrequency", wordSeparatorCharacterRE: text_options[:tokenizer].source, isCaseSensitive: !!text_options[:case_sensitive]) do
+                  xml.FieldRef(field: "term")
+                end
+              end
+            end
+          end
+        end
+      end
+
+      def pmml_local_transformations(xml)
+        if text_features.any?
+          xml.LocalTransformations do
+            text_features.each do |k, _|
+              text_encoders[k].vocabulary.each do |v|
+                xml.DerivedField(name: display_field([k, v]), optype: "continuous", dataType: "integer") do
+                  xml.Apply(function: "#{k}Transform") do
+                    xml.FieldRef(field: k)
+                    xml.Constant v
+                  end
+                end
+              end
+            end
+          end
+        end
+      end
+
+      # TODO create instance methods on model for all of these features
+
+      def features
+        model.instance_variable_get("@features")
+      end
+
+      def text_features
+        model.instance_variable_get("@text_features")
+      end
+
+      def text_encoders
+        model.instance_variable_get("@text_encoders")
+      end
+
+      def feature_importance
+        model.instance_variable_get("@feature_importance")
+      end
+
+      def labels
+        model.instance_variable_get("@labels")
+      end
+
+      def trees
+        model.instance_variable_get("@trees")
+      end
+
+      def target
+        model.instance_variable_get("@target")
+      end
+
+      def label_encoders
+        model.instance_variable_get("@label_encoders")
+      end
+
+      def objective
+        model.instance_variable_get("@objective")
+      end
+
+      def probabilities
+        model.instance_variable_get("@probabilities")
+      end
+
+      # end TODO
+    end
+  end
+end
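
The generator still reads model state through instance_variable_get (flagged by its own TODO) and renders PMML 4.4 with Nokogiri::XML::Builder. A hedged end-to-end sketch using the gem's existing public API (to_pmml and load_pmml come from Eps::Model and are not part of this diff):

    model = Eps::Model.new(houses, target: "price")
    File.write("model.pmml", model.to_pmml)                 # presumably rendered via Eps::PMML::Generator in 0.3.5
    loaded = Eps::Model.load_pmml(File.read("model.pmml"))
    loaded.predict(houses.first)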