eps 0.2.1 → 0.3.0

Sign up to get free protection for your applications and access to all of the features.
data/lib/eps.rb CHANGED
@@ -1,18 +1,42 @@
1
1
  # dependencies
2
- require "matrix"
2
+ require "bigdecimal"
3
3
  require "json"
4
+ require "lightgbm"
5
+ require "matrix"
6
+ require "nokogiri"
4
7
 
5
8
  # modules
6
9
  require "eps/base"
7
10
  require "eps/base_estimator"
11
+ require "eps/data_frame"
12
+ require "eps/evaluators/linear_regression"
13
+ require "eps/evaluators/lightgbm"
14
+ require "eps/evaluators/naive_bayes"
15
+ require "eps/evaluators/node"
16
+ require "eps/label_encoder"
17
+ require "eps/lightgbm"
8
18
  require "eps/linear_regression"
19
+ require "eps/metrics"
9
20
  require "eps/model"
10
21
  require "eps/naive_bayes"
22
+ require "eps/statistics"
23
+ require "eps/text_encoder"
24
+ require "eps/utils"
11
25
  require "eps/version"
12
26
 
13
27
  module Eps
14
- def self.metrics(actual, estimated)
15
- Eps::Model.metrics(actual, estimated)
28
+ def self.metrics(y_true, y_pred)
29
+ if Utils.column_type(y_true, "actual") == "numeric"
30
+ {
31
+ rmse: Metrics.rmse(y_true, y_pred),
32
+ mae: Metrics.mae(y_true, y_pred),
33
+ me: Metrics.me(y_true, y_pred)
34
+ }
35
+ else
36
+ {
37
+ accuracy: Metrics.accuracy(y_true, y_pred)
38
+ }
39
+ end
16
40
  end
17
41
 
18
42
  # backwards compatibility
@@ -1,81 +1,350 @@
1
1
  module Eps
2
2
  class BaseEstimator
3
- def train(data, y, target: nil, **options)
4
- # TODO more performant conversion
5
- if daru?(data)
6
- x = data.dup
7
- x = x.delete_vector(target) if target
8
- else
9
- x = data.map(&:dup)
10
- x.each { |r| r.delete(target) } if target
11
- end
3
+ def initialize(data = nil, y = nil, **options)
4
+ train(data, y, **options) if data
5
+ end
12
6
 
13
- y = prep_y(y.to_a)
7
+ def predict(data)
8
+ singular = data.is_a?(Hash)
9
+ data = [data] if singular
14
10
 
15
- if x.size != y.size
16
- raise "Number of samples differs from target"
11
+ data = Eps::DataFrame.new(data)
12
+
13
+ @evaluator.features.each do |k, type|
14
+ values = data.columns[k]
15
+ raise ArgumentError, "Missing column: #{k}" if !values
16
+ column_type = Utils.column_type(values.compact, k) if values
17
+
18
+ if !column_type.nil?
19
+ if (type == "numeric" && column_type != "numeric") || (type != "numeric" && column_type != "categorical")
20
+ raise ArgumentError, "Bad type for column #{k}: Expected #{type} but got #{column_type}"
21
+ end
22
+ end
23
+ # TODO check for unknown values for categorical features
17
24
  end
18
25
 
19
- @x = x
20
- @y = y
21
- @target = target || "target"
26
+ predictions = @evaluator.predict(data)
27
+
28
+ singular ? predictions.first : predictions
22
29
  end
23
30
 
24
- def predict(x)
25
- singular = !(x.is_a?(Array) || daru?(x))
26
- x = [x] if singular
31
+ def evaluate(data, y = nil, target: nil)
32
+ data, target = prep_data(data, y, target || @target)
33
+ Eps.metrics(data.label, predict(data))
34
+ end
27
35
 
28
- pred = _predict(x)
36
+ def to_pmml
37
+ (@pmml ||= generate_pmml).to_xml
38
+ end
29
39
 
30
- singular ? pred[0] : pred
40
+ def self.load_pmml(data)
41
+ if data.is_a?(String)
42
+ data = Nokogiri::XML(data) { |config| config.strict }
43
+ end
44
+ model = new
45
+ model.instance_variable_set("@pmml", data) # cache data
46
+ model.instance_variable_set("@evaluator", yield(data))
47
+ model
31
48
  end
32
49
 
33
- def evaluate(data, y = nil, target: nil)
34
- target ||= @target
35
- raise ArgumentError, "missing target" if !target && !y
50
+ def summary(extended: false)
51
+ str = String.new("")
52
+
53
+ if @validation_set
54
+ y_true = @validation_set.label
55
+ y_pred = predict(@validation_set)
36
56
 
37
- actual = y
38
- actual ||=
39
- if daru?(data)
40
- data[target].to_a
57
+ case @target_type
58
+ when "numeric"
59
+ metric_name = "RMSE"
60
+ v = Metrics.rmse(y_true, y_pred)
61
+ metric_value = v.round >= 1000 ? v.round.to_s : "%.3g" % v
41
62
  else
42
- data.map { |v| v[target] }
63
+ metric_name = "accuracy"
64
+ metric_value = "%.1f%%" % (100 * Metrics.accuracy(y_true, y_pred)).round(1)
43
65
  end
66
+ str << "Validation %s: %s\n\n" % [metric_name, metric_value]
67
+ end
68
+
69
+ str << _summary(extended: extended)
70
+ str
71
+ end
72
+
73
+ # private
74
+ def self.extract_text_features(data, features)
75
+ # updates features object
76
+ vocabulary = {}
77
+ function_mapping = {}
78
+ derived_fields = {}
79
+ data.css("LocalTransformations DerivedField, TransformationDictionary DerivedField").each do |n|
80
+ name = n.attribute("name")&.value
81
+ field = n.css("FieldRef").attribute("field").value
82
+ value = n.css("Constant").text
44
83
 
45
- actual = prep_y(actual)
46
- estimated = predict(data)
84
+ field = field[10..-2] if field =~ /\Alowercase\(.+\)\z/
85
+ next if value.empty?
47
86
 
48
- self.class.metrics(actual, estimated)
87
+ (vocabulary[field] ||= []) << value
88
+
89
+ function_mapping[field] = n.css("Apply").attribute("function").value
90
+
91
+ derived_fields[name] = [field, value]
92
+ end
93
+
94
+ functions = {}
95
+ data.css("TransformationDictionary DefineFunction").each do |n|
96
+ name = n.attribute("name").value
97
+ text_index = n.css("TextIndex")
98
+ functions[name] = {
99
+ tokenizer: Regexp.new(text_index.attribute("wordSeparatorCharacterRE").value),
100
+ case_sensitive: text_index.attribute("isCaseSensitive")&.value == "true"
101
+ }
102
+ end
103
+
104
+ text_features = {}
105
+ function_mapping.each do |field, function|
106
+ text_features[field] = functions[function].merge(vocabulary: vocabulary[field])
107
+ features[field] = "text"
108
+ end
109
+
110
+ [text_features, derived_fields]
49
111
  end
50
112
 
51
113
  private
52
114
 
53
- def categorical?(v)
54
- !v.is_a?(Numeric)
115
+ def train(data, y = nil, target: nil, split: nil, validation_set: nil, verbose: nil, text_features: nil, early_stopping: nil)
116
+ data, @target = prep_data(data, y, target)
117
+ @target_type = Utils.column_type(data.label, @target)
118
+
119
+ if split.nil?
120
+ split = data.size >= 30
121
+ end
122
+
123
+ # cross validation
124
+ if split && !validation_set
125
+ split = {} if split == true
126
+ split = {column: split} unless split.is_a?(Hash)
127
+
128
+ split_p = 1 - (split[:validation_size] || 0.25)
129
+ if split[:column]
130
+ split_column = split[:column].to_s
131
+ times = data.columns.delete(split_column)
132
+ check_missing(times, split_column)
133
+ split_index = (times.size * split_p).round
134
+ split_time = split[:value] || times.sort[split_index]
135
+ train_idx, validation_idx = (0...data.size).to_a.partition { |i| times[i] < split_time }
136
+ else
137
+ if split[:shuffle] != false
138
+ rng = Random.new(0) # seed random number generator
139
+ train_idx, validation_idx = (0...data.size).to_a.partition { rng.rand < split_p }
140
+ else
141
+ split_index = (data.size * split_p).round
142
+ train_idx, validation_idx = (0...data.size).to_a.partition { |i| i < split_index }
143
+ end
144
+ end
145
+ end
146
+
147
+ # determine feature types
148
+ @features = {}
149
+ data.columns.each do |k, v|
150
+ @features[k] = Utils.column_type(v.compact, k)
151
+ end
152
+
153
+ # determine text features if not specified
154
+ if text_features.nil?
155
+ text_features = []
156
+
157
+ @features.each do |k, type|
158
+ next if type != "categorical"
159
+
160
+ values = data.columns[k].compact
161
+
162
+ next unless values.first.is_a?(String) # not boolean
163
+
164
+ values = values.reject(&:empty?)
165
+ count = values.count
166
+
167
+ # check if spaces
168
+ # two spaces is rough approximation for 3 words
169
+ # TODO make more performant
170
+ if values.count { |v| v.count(" ") >= 2 } > 0.5 * count
171
+ text_features << k
172
+ end
173
+ end
174
+ end
175
+
176
+ # prep text features
177
+ @text_features = {}
178
+ (text_features || {}).each do |k, v|
179
+ @features[k.to_s] = "text"
180
+
181
+ # same output as scikit-learn CountVectorizer
182
+ # except for max_features
183
+ @text_features[k.to_s] = {
184
+ tokenizer: /\W+/,
185
+ min_length: 2,
186
+ max_features: 100
187
+ }.merge(v || {})
188
+ end
189
+
190
+ if split && !validation_set
191
+ @train_set = data[train_idx]
192
+ validation_set = data[validation_idx]
193
+ else
194
+ @train_set = data.dup
195
+ if validation_set
196
+ validation_set = Eps::DataFrame.new(validation_set)
197
+ validation_set.label = validation_set.columns.delete(@target)
198
+ end
199
+ end
200
+
201
+ raise "No data in training set" if @train_set.empty?
202
+ raise "No data in validation set" if validation_set && validation_set.empty?
203
+
204
+ @validation_set = validation_set
205
+ @evaluator = _train(verbose: verbose, early_stopping: early_stopping)
206
+
207
+ # reset pmml
208
+ @pmml = nil
209
+
210
+ nil
55
211
  end
56
212
 
57
- def daru?(x)
58
- defined?(Daru) && x.is_a?(Daru::DataFrame)
213
+ def prep_data(data, y, target)
214
+ data = Eps::DataFrame.new(data)
215
+ target = (target || "target").to_s
216
+ y ||= data.columns.delete(target)
217
+ check_missing(y, target)
218
+ data.label = y.to_a
219
+ check_data(data)
220
+ [data, target]
59
221
  end
60
222
 
61
- def flip_target(target)
62
- target.is_a?(String) ? target.to_sym : target.to_s
223
+ def prep_text_features(train_set)
224
+ @text_encoders = {}
225
+ @text_features.each do |k, v|
226
+ # reset vocabulary
227
+ v.delete(:vocabulary)
228
+
229
+ # TODO determine max features automatically
230
+ # start based on number of rows
231
+ encoder = Eps::TextEncoder.new(v)
232
+ counts = encoder.fit(train_set.columns.delete(k))
233
+ encoder.vocabulary.each do |word|
234
+ train_set.columns[[k, word]] = [0] * counts.size
235
+ end
236
+ counts.each_with_index do |ci, i|
237
+ ci.each do |word, count|
238
+ word_key = [k, word]
239
+ train_set.columns[word_key][i] = 1 if train_set.columns.key?(word_key)
240
+ end
241
+ end
242
+ @text_encoders[k] = encoder
243
+
244
+ # update vocabulary
245
+ v[:vocabulary] = encoder.vocabulary
246
+ end
247
+
248
+ raise "No features left" if train_set.columns.empty?
63
249
  end
64
250
 
65
- def prep_y(y)
66
- y.each do |yi|
67
- raise "Target missing in data" if yi.nil?
251
+ def check_data(data)
252
+ raise "No data" if data.empty?
253
+ raise "Number of data points differs from target" if data.size != data.label.size
254
+ end
255
+
256
+ def check_missing(c, name)
257
+ raise ArgumentError, "Missing column: #{name}" if !c
258
+ raise ArgumentError, "Missing values in column #{name}" if c.any?(&:nil?)
259
+ end
260
+
261
+ def check_missing_value(df)
262
+ df.columns.each do |k, v|
263
+ check_missing(v, k)
68
264
  end
69
- y
70
265
  end
71
266
 
72
- # determine if target is a string or symbol
73
- def prep_target(target, data)
74
- if daru?(data)
75
- data.has_vector?(target) ? target : flip_target(target)
267
+ def display_field(k)
268
+ if k.is_a?(Array)
269
+ if @features[k.first] == "text"
270
+ "#{k.first}(#{k.last})"
271
+ else
272
+ k.join("=")
273
+ end
76
274
  else
77
- x = data[0] || {}
78
- x[target] ? target : flip_target(target)
275
+ k
276
+ end
277
+ end
278
+
279
+ # pmml
280
+
281
+ def build_pmml(data_fields)
282
+ Nokogiri::XML::Builder.new do |xml|
283
+ xml.PMML(version: "4.4", xmlns: "http://www.dmg.org/PMML-4_4", "xmlns:xsi" => "http://www.w3.org/2001/XMLSchema-instance") do
284
+ pmml_header(xml)
285
+ pmml_data_dictionary(xml, data_fields)
286
+ pmml_transformation_dictionary(xml)
287
+ yield xml
288
+ end
289
+ end
290
+ end
291
+
292
+ def pmml_header(xml)
293
+ xml.Header do
294
+ xml.Application(name: "Eps", version: Eps::VERSION)
295
+ # xml.Timestamp Time.now.utc.iso8601
296
+ end
297
+ end
298
+
299
+ def pmml_data_dictionary(xml, data_fields)
300
+ xml.DataDictionary do
301
+ data_fields.each do |k, vs|
302
+ case @features[k]
303
+ when "categorical", nil
304
+ xml.DataField(name: k, optype: "categorical", dataType: "string") do
305
+ vs.map(&:to_s).sort.each do |v|
306
+ xml.Value(value: v)
307
+ end
308
+ end
309
+ when "text"
310
+ xml.DataField(name: k, optype: "categorical", dataType: "string")
311
+ else
312
+ xml.DataField(name: k, optype: "continuous", dataType: "double")
313
+ end
314
+ end
315
+ end
316
+ end
317
+
318
+ def pmml_transformation_dictionary(xml)
319
+ if @text_features.any?
320
+ xml.TransformationDictionary do
321
+ @text_features.each do |k, text_options|
322
+ xml.DefineFunction(name: "#{k}Transform", optype: "continuous") do
323
+ xml.ParameterField(name: "text")
324
+ xml.ParameterField(name: "term")
325
+ xml.TextIndex(textField: "text", localTermWeights: "termFrequency", wordSeparatorCharacterRE: text_options[:tokenizer].source, isCaseSensitive: !!text_options[:case_sensitive]) do
326
+ xml.FieldRef(field: "term")
327
+ end
328
+ end
329
+ end
330
+ end
331
+ end
332
+ end
333
+
334
+ def pmml_local_transformations(xml)
335
+ if @text_features.any?
336
+ xml.LocalTransformations do
337
+ @text_features.each do |k, _|
338
+ @text_encoders[k].vocabulary.each do |v|
339
+ xml.DerivedField(name: display_field([k, v]), optype: "continuous", dataType: "integer") do
340
+ xml.Apply(function: "#{k}Transform") do
341
+ xml.FieldRef(field: k)
342
+ xml.Constant v
343
+ end
344
+ end
345
+ end
346
+ end
347
+ end
79
348
  end
80
349
  end
81
350
  end
@@ -0,0 +1,141 @@
1
+ module Eps
2
+ class DataFrame
3
+ attr_reader :columns
4
+ attr_accessor :label
5
+
6
+ def initialize(data = [])
7
+ @columns = {}
8
+
9
+ if data.is_a?(Eps::DataFrame)
10
+ data.columns.each do |k, v|
11
+ @columns[k] = v
12
+ end
13
+ elsif daru?(data)
14
+ data.to_h.each do |k, v|
15
+ @columns[k.to_s] = v.to_a
16
+ end
17
+ elsif data.is_a?(Hash)
18
+ data.each do |k, v|
19
+ @columns[k.to_s] = v.to_a
20
+ end
21
+ else
22
+ if data.any?
23
+ row = data[0]
24
+
25
+ if row.is_a?(Hash)
26
+ row.keys.each do |k|
27
+ @columns[k.to_s] = data.map { |r| r[k] }
28
+ end
29
+ elsif row.is_a?(Array)
30
+ row.size.times do |i|
31
+ @columns["x#{i}"] = data.map { |r| r[i] }
32
+ end
33
+ else
34
+ @columns["x0"] = data
35
+ end
36
+ end
37
+ end
38
+ end
39
+
40
+ def empty?
41
+ size == 0
42
+ end
43
+
44
+ def size
45
+ @columns.any? ? columns.values.first.size : 0
46
+ end
47
+
48
+ def any?
49
+ @columns.any?
50
+ end
51
+
52
+ def map
53
+ if @columns.any?
54
+ size.times.map do |i|
55
+ yield Hash[@columns.map { |k, v| [k, v[i]] }]
56
+ end
57
+ end
58
+ end
59
+
60
+ def map_rows
61
+ if @columns.any?
62
+ size.times.map do |i|
63
+ yield @columns.map { |_, v| v[i] }
64
+ end
65
+ end
66
+ end
67
+
68
+ def [](rows, cols = nil)
69
+ if cols.nil?
70
+ if rows.is_a?(String) || (rows.is_a?(Array) && rows.first.is_a?(String))
71
+ cols = rows
72
+ rows = 0..-1
73
+ end
74
+ end
75
+
76
+ if rows.is_a?(Range)
77
+ if rows.end.nil?
78
+ rows = Range.new(rows.begin, size - 1)
79
+ elsif rows.end < 0
80
+ rows = Range.new(rows.begin, size + rows.end, rows.exclude_end?)
81
+ end
82
+ end
83
+
84
+ if cols
85
+ if cols.is_a?(Range)
86
+ c = columns.keys
87
+
88
+ start_index = c.index(cols.begin)
89
+ raise "Undefined column: #{cols.begin}" unless start_index
90
+
91
+ end_index = c.index(cols.end)
92
+ raise "Undefined column: #{cols.end}" unless end_index
93
+
94
+ reverse = false
95
+ if start_index > end_index
96
+ reverse = true
97
+ start_index, end_index = end_index, start_index
98
+ end
99
+
100
+ cols = c[Range.new(start_index, end_index, cols.exclude_end?)]
101
+ cols.reverse! if reverse
102
+ elsif !cols.is_a?(Array)
103
+ singular = true
104
+ cols = [cols]
105
+ end
106
+ else
107
+ cols = columns.keys
108
+ end
109
+
110
+ df = Eps::DataFrame.new
111
+
112
+ cols.each do |c|
113
+ raise "Undefined column: #{c}" unless columns.include?(c)
114
+
115
+ df.columns[c] = columns[c].values_at(*rows)
116
+ end
117
+ df.label = label.values_at(*rows) if label
118
+
119
+ singular ? df.columns[cols[0]] : df
120
+ end
121
+
122
+ def ==(other)
123
+ columns.keys == other.columns.keys && columns == other.columns
124
+ end
125
+
126
+ def dup
127
+ df = Eps::DataFrame.new
128
+ columns.each do |k, v|
129
+ df.columns[k] = v
130
+ end
131
+ df.label = label
132
+ df
133
+ end
134
+
135
+ private
136
+
137
+ def daru?(x)
138
+ defined?(Daru) && x.is_a?(Daru::DataFrame)
139
+ end
140
+ end
141
+ end