RubyGems - eps - Versions diffs - 0.1.1 → 0.2.0 - Mend

eps 0.1.1 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (20) hide show

checksums.yaml +4 -4
data/CHANGELOG.md +6 -0
data/README.md +235 -84
data/lib/eps.rb +9 -4
data/lib/eps/base.rb +19 -0
data/lib/eps/base_estimator.rb +84 -0
data/lib/eps/linear_regression.rb +558 -0
data/lib/eps/model.rb +108 -0
data/lib/eps/naive_bayes.rb +240 -0
data/lib/eps/version.rb +1 -1
metadata +13 -18
data/.gitignore +0 -9
data/.travis.yml +0 -15
data/Gemfile +0 -11
data/Rakefile +0 -34
data/eps.gemspec +0 -30
data/guides/Modeling.md +0 -152
data/lib/eps/base_regressor.rb +0 -232
data/lib/eps/metrics.rb +0 -35
data/lib/eps/regressor.rb +0 -314

data/lib/eps/model.rb ADDED Viewed

@@ -0,0 +1,108 @@
+module Eps
+  class Model
+    def initialize(data = nil, y = nil, target: nil, estimator: nil, **options)
+      @options = options
+      if estimator
+        @estimator = estimator
+      elsif data # legacy
+        train(data, y, target: target)
+      end
+    end
+    # pmml
+    def self.load_pmml(data)
+      if data.is_a?(String)
+        require "nokogiri"
+        data = Nokogiri::XML(data) { |config| config.strict }
+      end
+      estimator_class =
+        if data.css("RegressionModel").any?
+          Eps::LinearRegression
+        elsif data.css("NaiveBayesModel").any?
+          Eps::NaiveBayes
+        else
+          raise "Unknown model"
+        end
+      new(estimator: estimator_class.load_pmml(data))
+    end
+    # ruby - legacy
+    def self.load(data)
+      new(estimator: Eps::LinearRegression.load(data))
+    end
+    # json - legacy
+    def self.load_json(data)
+      new(estimator: Eps::LinearRegression.load_json(data))
+    end
+    def to_json
+      @estimator ? @estimator.to_json : super
+    end
+    # pfa - legacy
+    def self.load_pfa(data)
+      new(estimator: Eps::LinearRegression.load_pfa(data))
+    end
+    # metrics
+    def self.metrics(actual, estimated)
+      estimator_class =
+        if numeric?(actual)
+          Eps::LinearRegression
+        else
+          Eps::NaiveBayes
+        end
+      estimator_class.metrics(actual, estimated)
+    end
+    private
+    def train(data, y = nil, target: nil)
+      y ||= daru?(data) ? data[target].to_a : data.map { |r| r[target] }
+      estimator_class =
+        if self.class.numeric?(y)
+          Eps::LinearRegression
+        else
+          Eps::NaiveBayes
+        end
+      @estimator = estimator_class.new(**@options)
+      @estimator.train(data, y, target: target)
+    end
+    def respond_to_missing?(name, include_private = false)
+      if @estimator
+        @estimator.respond_to?(name, include_private)
+      else
+        super
+      end
+    end
+    def method_missing(method, *args, &block)
+      if @estimator
+        @estimator.public_send(method, *args, &block)
+      else
+        super
+      end
+    end
+    def self.numeric?(y)
+      y.first.is_a?(Numeric)
+    end
+    def daru?(x)
+      defined?(Daru) && x.is_a?(Daru::DataFrame)
+    end
+  end
+end

data/lib/eps/naive_bayes.rb ADDED Viewed

@@ -0,0 +1,240 @@
+module Eps
+  class NaiveBayes < BaseEstimator
+    attr_reader :probabilities
+    def initialize(probabilities: nil, target: nil)
+      @probabilities = probabilities if probabilities
+      @target = target if target
+    end
+    def train(*args)
+      super
+      @y = @y.map { |yi| yi.to_s }
+      prior = group_count(@y)
+      conditional = {}
+      if @x.any?
+        keys = @x.first.keys
+        x = @x.dup
+        x.each_with_index do |xi, i|
+          xi[@target] = @y[i]
+        end
+        keys.each do |k|
+          conditional[k] = {}
+          x.group_by { |xi| xi[@target] }.each do |group, xs|
+            v = xs.map { |xi| xi[k] }
+            if categorical?(v[0])
+              # TODO apply smoothing
+              # apply smoothing only to
+              # 1. categorical features
+              # 2. conditional probabilities
+              # TODO more efficient count
+              conditional[k][group] = group_count(v)
+            else
+              conditional[k][group] = {mean: mean(v), stdev: stdev(v)}
+            end
+          end
+        end
+      end
+      @probabilities = {
+        prior: prior,
+        conditional: conditional
+      }
+    end
+    # TODO better summary
+    def summary(extended: false)
+      @summary_str ||= begin
+        str = String.new("")
+        probabilities[:prior].each do |k, v|
+          str += "#{k}: #{v}\n"
+        end
+        str += "\n"
+        str += "accuracy: %d%%\n" % [(100 * accuracy).round]
+        str
+      end
+    end
+    def accuracy
+      self.class.metrics(predict(@x), @y)[:accuracy]
+    end
+    # pmml
+    def self.load_pmml(data)
+      # TODO more validation
+      node = data.css("NaiveBayesModel")
+      prior = {}
+      node.css("BayesOutput TargetValueCount").each do |n|
+        prior[n.attribute("value").value] = n.attribute("count").value.to_f
+      end
+      conditional = {}
+      node.css("BayesInput").each do |n|
+        prob = {}
+        n.css("TargetValueStat").each do |n2|
+          n3 = n2.css("GaussianDistribution")
+          prob[n2.attribute("value").value] = {
+            mean: n3.attribute("mean").value.to_f,
+            stdev: Math.sqrt(n3.attribute("variance").value.to_f)
+          }
+        end
+        n.css("PairCounts").each do |n2|
+          boom = {}
+          n2.css("TargetValueCount").each do |n3|
+            boom[n3.attribute("value").value] = n3.attribute("count").value.to_f
+          end
+          prob[n2.attribute("value").value] = boom
+        end
+        conditional[n.attribute("fieldName").value] = prob
+      end
+      @target = node.css("BayesOutput").attribute("fieldName").value
+      probabilities = {
+        prior: prior,
+        conditional: conditional
+      }
+      new(probabilities: probabilities, target: @target)
+    end
+    def to_pmml
+      data_fields = {}
+      data_fields[@target] = probabilities[:prior].keys
+      probabilities[:conditional].each do |k, v|
+        if !v.values[0][:mean]
+          data_fields[k] = v.keys
+        else
+          data_fields[k] = nil
+        end
+      end
+      builder = Nokogiri::XML::Builder.new do |xml|
+        xml.PMML(version: "4.3", xmlns: "http://www.dmg.org/PMML-4_3", "xmlns:xsi" => "http://www.w3.org/2001/XMLSchema-instance") do
+          xml.Header
+          xml.DataDictionary do
+            data_fields.each do |k, vs|
+              if vs
+                xml.DataField(name: k, optype: "categorical", dataType: "string") do
+                  vs.each do |v|
+                    xml.Value(value: v)
+                  end
+                end
+              else
+                xml.DataField(name: k, optype: "continuous", dataType: "double")
+              end
+            end
+          end
+          xml.NaiveBayesModel(functionName: "classification", threshold: 0.001) do
+            xml.MiningSchema do
+              data_fields.each do |k, _|
+                xml.MiningField(name: k)
+              end
+            end
+            xml.BayesInputs do
+              probabilities[:conditional].each do |k, v|
+                xml.BayesInput(fieldName: k) do
+                  if !v.values[0][:mean]
+                    v.each do |k2, v2|
+                      xml.PairCounts(value: k2) do
+                        xml.TargetValueCounts do
+                          v2.each do |k3, v3|
+                            xml.TargetValueCount(value: k3, count: v3)
+                          end
+                        end
+                      end
+                    end
+                  else
+                    xml.TargetValueStats do
+                      v.each do |k2, v2|
+                        xml.TargetValueStat(value: k2) do
+                          xml.GaussianDistribution(mean: v2[:mean], variance: v2[:stdev]**2)
+                        end
+                      end
+                    end
+                  end
+                end
+              end
+            end
+            xml.BayesOutput(fieldName: "target") do
+              xml.TargetValueCounts do
+                probabilities[:prior].each do |k, v|
+                  xml.TargetValueCount(value: k, count: v)
+                end
+              end
+            end
+          end
+        end
+      end.to_xml
+    end
+    # metrics
+    def self.metrics(actual, estimated)
+      {
+        accuracy: actual.zip(estimated).count { |yi, yi2| yi == yi2 } / actual.size.to_f
+      }
+    end
+    private
+    def _predict(x)
+      x.map do |xi|
+        probs = calculate_class_probabilities(xi)
+        # deterministic for equal probabilities
+        probs.sort_by { |k, v| [-v, k.to_s] }[0][0]
+      end
+    end
+    def calculate_class_probabilities(x)
+      prob = {}
+      probabilities[:prior].each do |c, cv|
+        prob[c] = cv.to_f / probabilities[:prior].values.sum
+        probabilities[:conditional].each do |k, v|
+          if !v[c][:mean]
+            # TODO compute ahead of time
+            p2 = v[c][x[k]].to_f / v[c].values.sum
+            # assign very small probability if probability is 0
+            # TODO use proper smoothing instead
+            if p2 == 0
+              p2 = 0.0001
+            end
+            prob[c] *= p2
+          else
+            prob[c] *= calculate_probability(x[k], v[c][:mean], v[c][:stdev])
+          end
+        end
+      end
+      prob
+    end
+    def calculate_probability(x, mean, stdev)
+      exponent = Math.exp(-((x - mean)**2) / (2 * (stdev**2)))
+      (1 / (Math.sqrt(2 * Math::PI) * stdev)) * exponent
+    end
+    def group_count(arr)
+      r = arr.inject(Hash.new(0)) { |h, e| h[e.to_s] += 1 ; h }
+      r.default = nil
+      r
+    end
+    def mean(arr)
+      arr.sum / arr.size.to_f
+    end
+    def stdev(arr)
+      m = mean(arr)
+      sum = arr.inject(0) { |accum, i| accum + (i - m)**2 }
+      Math.sqrt(sum / (arr.length - 1).to_f)
+    end
+  end
+end

data/lib/eps/version.rb CHANGED Viewed

@@ -1,3 +1,3 @@
 module Eps
-  VERSION = "0.1.1"
+  VERSION = "0.2.0"
 end

metadata CHANGED Viewed

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: eps
 version: !ruby/object:Gem::Version
-  version: 0.1.1
+  version: 0.2.0
 platform: ruby
 authors:
 - Andrew Kane
 autorequire:
-bindir: exe
+bindir: bin
 cert_chain: []
-date: 2018-07-05 00:00:00.000000000 Z
+date: 2019-05-19 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: bundler
@@ -81,25 +81,20 @@ dependencies:
       - !ruby/object:Gem::Version
         version: '0'
 description:
-email:
-- andrew@chartkick.com
+email: andrew@chartkick.com
 executables: []
 extensions: []
 extra_rdoc_files: []
 files:
-- ".gitignore"
-- ".travis.yml"
 - CHANGELOG.md
-- Gemfile
 - LICENSE.txt
 - README.md
-- Rakefile
-- eps.gemspec
-- guides/Modeling.md
 - lib/eps.rb
-- lib/eps/base_regressor.rb
-- lib/eps/metrics.rb
-- lib/eps/regressor.rb
+- lib/eps/base.rb
+- lib/eps/base_estimator.rb
+- lib/eps/linear_regression.rb
+- lib/eps/model.rb
+- lib/eps/naive_bayes.rb
 - lib/eps/version.rb
 homepage: https://github.com/ankane/eps
 licenses:
@@ -113,16 +108,16 @@ required_ruby_version: !ruby/object:Gem::Requirement
   requirements:
   - - ">="
     - !ruby/object:Gem::Version
-      version: '0'
+      version: '2.4'
 required_rubygems_version: !ruby/object:Gem::Requirement
   requirements:
   - - ">="
     - !ruby/object:Gem::Version
       version: '0'
 requirements: []
-rubyforge_project:
-rubygems_version: 2.7.7
+rubygems_version: 3.0.3
 signing_key:
 specification_version: 4
-summary: Linear regression for Ruby
+summary: Machine learning for Ruby. Supports regression (linear regression) and classification
+  (naive Bayes)
 test_files: []

data/.gitignore DELETED Viewed

@@ -1,9 +0,0 @@
-/.bundle/
-/.yardoc
-/_yardoc/
-/coverage/
-/doc/
-/pkg/
-/spec/reports/
-/tmp/
-*.lock

data/.travis.yml DELETED Viewed

@@ -1,15 +0,0 @@
-language: ruby
-rvm: 2.5.1
-sudo: required
-before_install:
-  - gem install bundler
-  - sudo apt-get update
-  - sudo apt-get install -y libgsl0-dev
-script: bundle exec rake test
-env:
-  -
-  - GSL=t
-notifications:
-  email:
-    on_success: never
-    on_failure: change