RubyGems - eps - Versions diffs - 0.3.0 → 0.3.1 - Mend

eps 0.3.0 → 0.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (19) hide show

checksums.yaml +4 -4
data/CHANGELOG.md +12 -5
data/README.md +34 -0
data/lib/eps.rb +19 -10
data/lib/eps/base_estimator.rb +35 -129
data/lib/eps/data_frame.rb +7 -1
data/lib/eps/evaluators/linear_regression.rb +1 -1
data/lib/eps/label_encoder.rb +7 -3
data/lib/eps/lightgbm.rb +36 -76
data/lib/eps/linear_regression.rb +26 -79
data/lib/eps/metrics.rb +24 -12
data/lib/eps/model.rb +6 -6
data/lib/eps/naive_bayes.rb +2 -139
data/lib/eps/pmml.rb +14 -0
data/lib/eps/pmml/generator.rb +422 -0
data/lib/eps/pmml/loader.rb +241 -0
data/lib/eps/version.rb +1 -1
metadata +7 -5
data/lib/eps/pmml_generators/lightgbm.rb +0 -187

data/lib/eps/pmml.rb ADDED

@@ -0,0 +1,14 @@
+# similar to Marshal/JSON/YAML interface
+module Eps
+  module PMML
+    class << self
+      def load(pmml)
+        Loader.new(pmml).load
+      end
+      def generate(model)
+        Generator.new(model).generate
+      end
+    end
+  end
+end

data/lib/eps/pmml/generator.rb ADDED

@@ -0,0 +1,422 @@
+module Eps
+  module PMML
+    class Generator
+      attr_reader :model
+      # Stores the model that #generate will serialize.
+      def initialize(model)
+        @model = model
+      end
+      def generate
+        case @model
+        when LightGBM
+          lightgbm
+        when LinearRegression
+          linear_regression
+        when NaiveBayes
+          naive_bayes
+        else
+          raise "Unknown model"
+        end
+      end
+      private
+      def lightgbm
+        data_fields = {}
+        data_fields[target] = labels if labels
+        features.each_with_index do |(k, type), i|
+          # TODO remove zero importance features
+          if type == "categorical"
+            data_fields[k] = label_encoders[k].labels.keys
+          else
+            data_fields[k] = nil
+          end
+        end
+        build_pmml(data_fields) do |xml|
+          function_name = objective == "regression" ? "regression" : "classification"
+          xml.MiningModel(functionName: function_name, algorithmName: "LightGBM") do
+            xml.MiningSchema do
+              xml.MiningField(name: target, usageType: "target")
+              features.keys.each_with_index do |k, i|
+                # next if feature_importance[i] == 0
+                # TODO add importance, but need to handle text features
+                xml.MiningField(name: k) #, importance: feature_importance[i].to_f, missingValueTreatment: "asIs")
+              end
+            end
+            pmml_local_transformations(xml)
+            case objective
+            when "regression"
+              xml_segmentation(xml, trees)
+            when "binary"
+              xml.Segmentation(multipleModelMethod: "modelChain") do
+                xml.Segment(id: 1) do
+                  xml.True
+                  xml.MiningModel(functionName: "regression") do
+                    xml.MiningSchema do
+                      features.each do |k, _|
+                        xml.MiningField(name: k)
+                      end
+                    end
+                    xml.Output do
+                      xml.OutputField(name: "lgbmValue", optype: "continuous", dataType: "double", feature: "predictedValue", isFinalResult: false) do
+                        xml.Apply(function: "/") do
+                          xml.Constant(dataType: "double") do
+                            1.0
+                          end
+                          xml.Apply(function: "+") do
+                            xml.Constant(dataType: "double") do
+                              1.0
+                            end
+                            xml.Apply(function: "exp") do
+                              xml.Apply(function: "*") do
+                                xml.Constant(dataType: "double") do
+                                  -1.0
+                                end
+                                xml.FieldRef(field: "lgbmValue")
+                              end
+                            end
+                          end
+                        end
+                      end
+                    end
+                    xml_segmentation(xml, trees)
+                  end
+                end
+                xml.Segment(id: 2) do
+                  xml.True
+                  xml.RegressionModel(functionName: "classification", normalizationMethod: "none") do
+                    xml.MiningSchema do
+                      xml.MiningField(name: target, usageType: "target")
+                      xml.MiningField(name: "transformedLgbmValue")
+                    end
+                    xml.Output do
+                      labels.each do |label|
+                        xml.OutputField(name: "probability(#{label})", optype: "continuous", dataType: "double", feature: "probability", value: label)
+                      end
+                    end
+                    xml.RegressionTable(intercept: 0.0, targetCategory: labels.last) do
+                      xml.NumericPredictor(name: "transformedLgbmValue", coefficient: "1.0")
+                    end
+                    xml.RegressionTable(intercept: 0.0, targetCategory: labels.first)
+                  end
+                end
+              end
+            else # multiclass
+              xml.Segmentation(multipleModelMethod: "modelChain") do
+                n = trees.size / labels.size
+                trees.each_slice(n).each_with_index do |trees, idx|
+                  xml.Segment(id: idx + 1) do
+                    xml.True
+                    xml.MiningModel(functionName: "regression") do
+                      xml.MiningSchema do
+                        features.each do |k, _|
+                          xml.MiningField(name: k)
+                        end
+                      end
+                      xml.Output do
+                        xml.OutputField(name: "lgbmValue(#{labels[idx]})", optype: "continuous", dataType: "double", feature: "predictedValue", isFinalResult: false)
+                      end
+                      xml_segmentation(xml, trees)
+                    end
+                  end
+                end
+                xml.Segment(id: labels.size + 1) do
+                  xml.True
+                  xml.RegressionModel(functionName: "classification", normalizationMethod: "softmax") do
+                    xml.MiningSchema do
+                      xml.MiningField(name: target, usageType: "target")
+                      labels.each do |label|
+                        xml.MiningField(name: "lgbmValue(#{label})")
+                      end
+                    end
+                    xml.Output do
+                      labels.each do |label|
+                        xml.OutputField(name: "probability(#{label})", optype: "continuous", dataType: "double", feature: "probability", value: label)
+                      end
+                    end
+                    labels.each do |label|
+                      xml.RegressionTable(intercept: 0.0, targetCategory: label) do
+                        xml.NumericPredictor(name: "lgbmValue(#{label})", coefficient: "1.0")
+                      end
+                    end
+                  end
+                end
+              end
+            end
+          end
+        end
+      end
+      def linear_regression
+        predictors = model.instance_variable_get("@coefficients").dup
+        intercept = predictors.delete("_intercept") || 0.0
+        data_fields = {}
+        features.each do |k, type|
+          if type == "categorical"
+            data_fields[k] = predictors.keys.select { |k, v| k.is_a?(Array) && k.first == k }.map(&:last)
+          else
+            data_fields[k] = nil
+          end
+        end
+        build_pmml(data_fields) do |xml|
+          xml.RegressionModel(functionName: "regression") do
+            xml.MiningSchema do
+              features.each do |k, _|
+                xml.MiningField(name: k)
+              end
+            end
+            pmml_local_transformations(xml)
+            xml.RegressionTable(intercept: intercept) do
+              predictors.each do |k, v|
+                if k.is_a?(Array)
+                  if features[k.first] == "text"
+                    xml.NumericPredictor(name: display_field(k), coefficient: v)
+                  else
+                    xml.CategoricalPredictor(name: k[0], value: k[1], coefficient: v)
+                  end
+                else
+                  xml.NumericPredictor(name: k, coefficient: v)
+                end
+              end
+            end
+          end
+        end
+      end
+      def naive_bayes
+        data_fields = {}
+        data_fields[target] = probabilities[:prior].keys
+        probabilities[:conditional].each do |k, v|
+          if features[k] == "categorical"
+            data_fields[k] = v.keys
+          else
+            data_fields[k] = nil
+          end
+        end
+        build_pmml(data_fields) do |xml|
+          xml.NaiveBayesModel(functionName: "classification", threshold: 0.001) do
+            xml.MiningSchema do
+              data_fields.each do |k, _|
+                xml.MiningField(name: k)
+              end
+            end
+            xml.BayesInputs do
+              probabilities[:conditional].each do |k, v|
+                xml.BayesInput(fieldName: k) do
+                  if features[k] == "categorical"
+                    v.sort_by { |k2, _| k2 }.each do |k2, v2|
+                      xml.PairCounts(value: k2) do
+                        xml.TargetValueCounts do
+                          v2.sort_by { |k2, _| k2 }.each do |k3, v3|
+                            xml.TargetValueCount(value: k3, count: v3)
+                          end
+                        end
+                      end
+                    end
+                  else
+                    xml.TargetValueStats do
+                      v.sort_by { |k2, _| k2 }.each do |k2, v2|
+                        xml.TargetValueStat(value: k2) do
+                          xml.GaussianDistribution(mean: v2[:mean], variance: v2[:stdev]**2)
+                        end
+                      end
+                    end
+                  end
+                end
+              end
+            end
+            xml.BayesOutput(fieldName: "target") do
+              xml.TargetValueCounts do
+                probabilities[:prior].sort_by { |k, _| k }.each do |k, v|
+                  xml.TargetValueCount(value: k, count: v)
+                end
+              end
+            end
+          end
+        end
+      end
+      def display_field(k)
+        if k.is_a?(Array)
+          if features[k.first] == "text"
+            "#{k.first}(#{k.last})"
+          else
+            k.join("=")
+          end
+        else
+          k
+        end
+      end
+      def xml_segmentation(xml, trees)
+        xml.Segmentation(multipleModelMethod: "sum") do
+          trees.each_with_index do |node, i|
+            xml.Segment(id: i + 1) do
+              xml.True
+              xml.TreeModel(functionName: "regression", missingValueStrategy: "none", noTrueChildStrategy: "returnLastPrediction", splitCharacteristic: "multiSplit") do
+                xml.MiningSchema do
+                  node_fields(node).uniq.each do |k|
+                    xml.MiningField(name: display_field(k))
+                  end
+                end
+                node_pmml(node, xml)
+              end
+            end
+          end
+        end
+      end
+      def node_fields(node)
+        fields = []
+        fields << node.field if node.predicate
+        node.children.each do |n|
+          fields.concat(node_fields(n))
+        end
+        fields
+      end
+      def node_pmml(node, xml)
+        xml.Node(score: node.score) do
+          if node.predicate.nil?
+            xml.True
+          elsif node.operator == "in"
+            xml.SimpleSetPredicate(field: display_field(node.field), booleanOperator: "isIn") do
+              xml.Array(type: "string") do
+                xml.text node.value.map { |v| escape_element(v) }.join(" ")
+              end
+            end
+          else
+            xml.SimplePredicate(field: display_field(node.field), operator: node.operator, value: node.value)
+          end
+          node.children.each do |n|
+            node_pmml(n, xml)
+          end
+        end
+      end
+      def escape_element(v)
+        "\"#{v.gsub("\"", "\\\"")}\""
+      end
+      def build_pmml(data_fields)
+        Nokogiri::XML::Builder.new do |xml|
+          xml.PMML(version: "4.4", xmlns: "http://www.dmg.org/PMML-4_4", "xmlns:xsi" => "http://www.w3.org/2001/XMLSchema-instance") do
+            pmml_header(xml)
+            pmml_data_dictionary(xml, data_fields)
+            pmml_transformation_dictionary(xml)
+            yield xml
+          end
+        end.to_xml
+      end
+      def pmml_header(xml)
+        xml.Header do
+          xml.Application(name: "Eps", version: Eps::VERSION)
+          # xml.Timestamp Time.now.utc.iso8601
+        end
+      end
+      def pmml_data_dictionary(xml, data_fields)
+        xml.DataDictionary do
+          data_fields.each do |k, vs|
+            case features[k]
+            when "categorical", nil
+              xml.DataField(name: k, optype: "categorical", dataType: "string") do
+                vs.map(&:to_s).sort.each do |v|
+                  xml.Value(value: v)
+                end
+              end
+            when "text"
+              xml.DataField(name: k, optype: "categorical", dataType: "string")
+            else
+              xml.DataField(name: k, optype: "continuous", dataType: "double")
+            end
+          end
+        end
+      end
+      def pmml_transformation_dictionary(xml)
+        if text_features.any?
+          xml.TransformationDictionary do
+            text_features.each do |k, text_options|
+              xml.DefineFunction(name: "#{k}Transform", optype: "continuous") do
+                xml.ParameterField(name: "text")
+                xml.ParameterField(name: "term")
+                xml.TextIndex(textField: "text", localTermWeights: "termFrequency", wordSeparatorCharacterRE: text_options[:tokenizer].source, isCaseSensitive: !!text_options[:case_sensitive]) do
+                  xml.FieldRef(field: "term")
+                end
+              end
+            end
+          end
+        end
+      end
+      def pmml_local_transformations(xml)
+        if text_features.any?
+          xml.LocalTransformations do
+            text_features.each do |k, _|
+              text_encoders[k].vocabulary.each do |v|
+                xml.DerivedField(name: display_field([k, v]), optype: "continuous", dataType: "integer") do
+                  xml.Apply(function: "#{k}Transform") do
+                    xml.FieldRef(field: k)
+                    xml.Constant v
+                  end
+                end
+              end
+            end
+          end
+        end
+      end
+      # TODO create instance methods on model for all of these features
+      def features
+        model.instance_variable_get("@features")
+      end
+      def text_features
+        model.instance_variable_get("@text_features")
+      end
+      def text_encoders
+        model.instance_variable_get("@text_encoders")
+      end
+      def feature_importance
+        model.instance_variable_get("@feature_importance")
+      end
+      def labels
+        model.instance_variable_get("@labels")
+      end
+      def trees
+        model.instance_variable_get("@trees")
+      end
+      def target
+        model.instance_variable_get("@target")
+      end
+      def label_encoders
+        model.instance_variable_get("@label_encoders")
+      end
+      def objective
+        model.instance_variable_get("@objective")
+      end
+      def probabilities
+        model.instance_variable_get("@probabilities")
+      end
+      # end TODO
+    end
+  end
+end

data/lib/eps/pmml/loader.rb ADDED

@@ -0,0 +1,241 @@
+module Eps
+  module PMML
+    class Loader
+      attr_reader :data
+      def initialize(pmml)
+        if pmml.is_a?(String)
+          pmml = Nokogiri::XML(pmml) { |config| config.strict }
+        end
+        @data = pmml
+      end
+      def load
+        if data.css("Segmentation").any?
+          lightgbm
+        elsif data.css("RegressionModel").any?
+          linear_regression
+        elsif data.css("NaiveBayesModel").any?
+          naive_bayes
+        else
+          raise "Unknown model"
+        end
+      end
+      private
+      def lightgbm
+        objective = data.css("MiningModel").first.attribute("functionName").value
+        if objective == "classification"
+          labels = data.css("RegressionModel OutputField").map { |n| n.attribute("value").value }
+          objective = labels.size > 2 ? "multiclass" : "binary"
+        end
+        features = {}
+        text_features, derived_fields = extract_text_features(data, features)
+        node = data.css("DataDictionary").first
+        node.css("DataField")[1..-1].to_a.each do |node|
+          features[node.attribute("name").value] =
+            if node.attribute("optype").value == "categorical"
+              "categorical"
+            else
+              "numeric"
+            end
+        end
+        trees = []
+        data.css("Segmentation TreeModel").each do |tree|
+          node = find_nodes(tree.css("Node").first, derived_fields)
+          trees << node
+        end
+        Evaluators::LightGBM.new(trees: trees, objective: objective, labels: labels, features: features, text_features: text_features)
+      end
+      def linear_regression
+        node = data.css("RegressionTable")
+        coefficients = {
+          "_intercept" => node.attribute("intercept").value.to_f
+        }
+        features = {}
+        text_features, derived_fields = extract_text_features(data, features)
+        node.css("NumericPredictor").each do |n|
+          name = n.attribute("name").value
+          if derived_fields[name]
+            name = derived_fields[name]
+          else
+            features[name] = "numeric"
+          end
+          coefficients[name] = n.attribute("coefficient").value.to_f
+        end
+        node.css("CategoricalPredictor").each do |n|
+          name = n.attribute("name").value
+          coefficients[[name, n.attribute("value").value]] = n.attribute("coefficient").value.to_f
+          features[name] = "categorical"
+        end
+        Evaluators::LinearRegression.new(coefficients: coefficients, features: features, text_features: text_features)
+      end
+      def naive_bayes
+        node = data.css("NaiveBayesModel")
+        prior = {}
+        node.css("BayesOutput TargetValueCount").each do |n|
+          prior[n.attribute("value").value] = n.attribute("count").value.to_f
+        end
+        legacy = false
+        conditional = {}
+        features = {}
+        node.css("BayesInput").each do |n|
+          prob = {}
+          # numeric
+          n.css("TargetValueStat").each do |n2|
+            n3 = n2.css("GaussianDistribution")
+            prob[n2.attribute("value").value] = {
+              mean: n3.attribute("mean").value.to_f,
+              stdev: Math.sqrt(n3.attribute("variance").value.to_f)
+            }
+          end
+          # detect bad form in Eps < 0.3
+          bad_format = n.css("PairCounts").map { |n2| n2.attribute("value").value } == prior.keys
+          # categorical
+          n.css("PairCounts").each do |n2|
+            if bad_format
+              n2.css("TargetValueCount").each do |n3|
+                prob[n3.attribute("value").value] ||= {}
+                prob[n3.attribute("value").value][n2.attribute("value").value] = n3.attribute("count").value.to_f
+              end
+            else
+              boom = {}
+              n2.css("TargetValueCount").each do |n3|
+                boom[n3.attribute("value").value] = n3.attribute("count").value.to_f
+              end
+              prob[n2.attribute("value").value] = boom
+            end
+          end
+          if bad_format
+            legacy = true
+            prob.each do |k, v|
+              prior.keys.each do |k|
+                v[k] ||= 0.0
+              end
+            end
+          end
+          name = n.attribute("fieldName").value
+          conditional[name] = prob
+          features[name] = n.css("TargetValueStat").any? ? "numeric" : "categorical"
+        end
+        target = node.css("BayesOutput").attribute("fieldName").value
+        probabilities = {
+          prior: prior,
+          conditional: conditional
+        }
+        # get derived fields
+        derived = {}
+        data.css("DerivedField").each do |n|
+          name = n.attribute("name").value
+          field = n.css("NormDiscrete").attribute("field").value
+          value = n.css("NormDiscrete").attribute("value").value
+          features.delete(name)
+          features[field] = "derived"
+          derived[field] ||= {}
+          derived[field][name] = value
+        end
+        Evaluators::NaiveBayes.new(probabilities: probabilities, features: features, derived: derived, legacy: legacy)
+      end
+      def extract_text_features(data, features)
+        # updates features object
+        vocabulary = {}
+        function_mapping = {}
+        derived_fields = {}
+        data.css("LocalTransformations DerivedField, TransformationDictionary DerivedField").each do |n|
+          name = n.attribute("name")&.value
+          field = n.css("FieldRef").attribute("field").value
+          value = n.css("Constant").text
+          field = field[10..-2] if field =~ /\Alowercase\(.+\)\z/
+          next if value.empty?
+          (vocabulary[field] ||= []) << value
+          function_mapping[field] = n.css("Apply").attribute("function").value
+          derived_fields[name] = [field, value]
+        end
+        functions = {}
+        data.css("TransformationDictionary DefineFunction").each do |n|
+          name = n.attribute("name").value
+          text_index = n.css("TextIndex")
+          functions[name] = {
+            tokenizer: Regexp.new(text_index.attribute("wordSeparatorCharacterRE").value),
+            case_sensitive: text_index.attribute("isCaseSensitive")&.value == "true"
+          }
+        end
+        text_features = {}
+        function_mapping.each do |field, function|
+          text_features[field] = functions[function].merge(vocabulary: vocabulary[field])
+          features[field] = "text"
+        end
+        [text_features, derived_fields]
+      end
+      def find_nodes(xml, derived_fields)
+        score = xml.attribute("score").value.to_f
+        elements = xml.elements
+        xml_predicate = elements.first
+        predicate =
+          if xml_predicate.name == "True"
+            nil
+          elsif xml_predicate.name == "SimpleSetPredicate"
+            operator = "in"
+            value = xml_predicate.css("Array").text.scan(/"(.+?)(?<!\\)"|(\S+)/).flatten.compact.map { |v| v.gsub('\"', '"') }
+            field = xml_predicate.attribute("field").value
+            field = derived_fields[field] if derived_fields[field]
+            {
+              field: field,
+              operator: operator,
+              value: value
+            }
+          else
+            operator = xml_predicate.attribute("operator").value
+            value = xml_predicate.attribute("value").value
+            value = value.to_f if operator == "greaterThan"
+            field = xml_predicate.attribute("field").value
+            field = derived_fields[field] if derived_fields[field]
+            {
+              field: field,
+              operator: operator,
+              value: value
+            }
+          end
+        children = elements[1..-1].map { |n| find_nodes(n, derived_fields) }
+        Evaluators::Node.new(score: score, predicate: predicate, children: children)
+      end
+    end
+  end
+end