adaboost 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
+ ---
+ SHA1:
+   metadata.gz: 2144b2d6abb1c701c8dbf955673a9a5ffd96e00e
+   data.tar.gz: 74dbd52e82039ca79e2dcfa3ef8c2b2ac20bd3e7
+ SHA512:
+   metadata.gz: 035ca3856d5343afde2f968f43cc5de165f0f69fb1f8284c87fe2ba4525599221eb711ffeced58346ddbfbf2b89f640a240fbf336c3c2e367d6e6f99826a932b
+   data.tar.gz: d6f6adc39fa2327ea37e986f759976d83c9c0efda519f429af0d9325927fffe9dc8c540b84556ba890434fd59674b550191dd7d1f8df8ad9e39759795e697be8
data/lib/adaboost/adaboost.rb ADDED
@@ -0,0 +1,74 @@
+ module AdaBoost
+
+   class AdaBoost
+
+     attr_reader :weak_classifiers, :y_index
+
+     def initialize number_of_classifiers, y_index
+       @weak_classifiers = []
+       @weak_learner = WeakLearner.new y_index
+       @number_of_classifiers = number_of_classifiers
+       @weights = []
+       @y_index = y_index
+     end
+
+     def initialize_weights samples
+       samples_size = samples.size.to_f
+       negative_weight = 1 / samples_size
+       positive_weight = negative_weight
+       if Config::INCORPORATE_COST_SENSITIVE_LEARNING
+         analyzer = FeaturesAnalyzer.new @y_index
+         distribution = analyzer.analyze(samples).distribution
+         positive_rate = distribution.positive / samples_size
+         negative_rate = distribution.negative / samples_size
+         normalizing_constant = distribution.negative * positive_rate + distribution.positive * negative_rate
+         positive_weight = positive_rate / normalizing_constant.to_f
+         negative_weight = negative_rate / normalizing_constant.to_f
+       end
+       # Each negative sample receives the positive-rate-based weight and vice
+       # versa, so the minority class gets the larger share of initial weight.
+       samples.each_with_index do |sample, i|
+         y = sample[@y_index]
+         if y == -1
+           @weights[i] = positive_weight
+         else
+           @weights[i] = negative_weight
+         end
+       end
+     end
+
+     def update_weights weak_classifier, samples
+       sum = 0.0
+       samples.each_with_index do |sample, i|
+         y = sample[@y_index]
+         @weights[i] *= Math.exp(-weak_classifier.alpha * weak_classifier.classify(sample) * y)
+         sum += @weights[i]
+       end
+       @weights.each_with_index do |_, i|
+         @weights[i] /= sum
+       end
+     end
+
+     def train samples
+       if Config::OVER_SAMPLING_TRAINING_SET
+         resampler = Resampler.new @y_index
+         resampler.over_sample samples
+       end
+       initialize_weights samples
+       0.upto @number_of_classifiers - 1 do
+         weak_classifier = @weak_learner.generate_weak_classifier samples, @weights
+         weak_classifier.compute_alpha
+         update_weights weak_classifier, samples
+         @weak_classifiers << weak_classifier
+       end
+     end
+
+     def classify sample
+       score = 0.0
+       @weak_classifiers.each do |weak_classifier|
+         score += weak_classifier.classify_with_alpha sample
+       end
+       score
+     end
+   end
+ end
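
For orientation, here is a minimal usage sketch of the class above. The sample layout and values are invented for illustration: each sample is a plain array of numeric features with a +1/-1 label stored at y_index (index 2 here), and the toy data is deliberately not separable by a single threshold so that no stump reaches zero weighted error.

    require 'adaboost'

    # Invented toy data: [feature_0, feature_1, label], label at index 2.
    samples = [
      [0.2, 2.0,  1],
      [0.5, 1.9, -1],
      [1.0, 0.9,  1],
      [1.2, 1.5, -1],
      [1.5, 0.8, -1],
      [2.0, 0.3,  1],
    ]

    # Boost 5 decision stumps; 2 is the label's index within each sample.
    booster = AdaBoost::AdaBoost.new 5, 2
    booster.train samples

    # classify returns the signed ensemble score sum(alpha_t * h_t(x));
    # its sign is the predicted class.
    score = booster.classify [1.8, 0.4, nil]
    puts score > 0 ? '+1' : '-1'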
data/lib/adaboost/config.rb ADDED
@@ -0,0 +1,10 @@
+ module AdaBoost
+
+   module Config
+     NUMBER_OF_RANDOM_CLASSIFIERS = 100
+     INCORPORATE_COST_SENSITIVE_LEARNING = true
+     OVER_SAMPLING_TRAINING_SET = false
+     USE_RANDOM_WEAK_CLASSIFIERS = false
+     USE_THRESHOLD_CLASSIFICATION = true
+   end
+ end
data/lib/adaboost/contingency_table.rb ADDED
@@ -0,0 +1,182 @@
+ module AdaBoost
+
+   class ContingencyTable
+
+     def initialize
+       @table = [[0, 0], [0, 0]]
+     end
+
+     def true_positive
+       @table[1][1]
+     end
+
+     def false_positive
+       @table[0][1]
+     end
+
+     def true_negative
+       @table[0][0]
+     end
+
+     def false_negative
+       @table[1][0]
+     end
+
+     def add_prediction y, h
+       @table[class_to_index(y)][class_to_index(h)] += 1
+     end
+
+     def outcome_positive
+       true_positive + false_positive
+     end
+
+     def outcome_negative
+       true_negative + false_negative
+     end
+
+     def total_population
+       @table[0][0] + @table[0][1] + @table[1][0] + @table[1][1]
+     end
+
+     def predicted_condition_positive
+       true_positive + false_positive
+     end
+
+     def predicted_condition_negative
+       false_negative + true_negative
+     end
+
+     def condition_positive
+       true_positive + false_negative
+     end
+
+     def condition_negative
+       false_positive + true_negative
+     end
+
+     def prevalence
+       condition_positive / total_population.to_f
+     end
+
+     def true_positive_rate
+       true_positive / condition_positive.to_f
+     end
+
+     def recall
+       true_positive_rate
+     end
+
+     def sensitivity
+       true_positive_rate
+     end
+
+     def false_positive_rate
+       false_positive / condition_negative.to_f
+     end
+
+     def fall_out
+       false_positive_rate
+     end
+
+     def false_negative_rate
+       false_negative / condition_positive.to_f
+     end
+
+     def true_negative_rate
+       true_negative / condition_negative.to_f
+     end
+
+     def specificity
+       true_negative_rate
+     end
+
+     def accuracy
+       (true_positive + true_negative) / total_population.to_f
+     end
+
+     def positive_predictive_value
+       true_positive / outcome_positive.to_f
+     end
+
+     def precision
+       positive_predictive_value
+     end
+
+     def false_discovery_rate
+       false_positive / outcome_positive.to_f
+     end
+
+     def false_omission_rate
+       false_negative / outcome_negative.to_f
+     end
+
+     def negative_predictive_value
+       true_negative / outcome_negative.to_f
+     end
+
+     def positive_likelihood_ratio
+       true_positive_rate / false_positive_rate.to_f
+     end
+
+     def negative_likelihood_ratio
+       false_negative_rate / true_negative_rate.to_f
+     end
+
+     def diagnostic_odds_ratio
+       positive_likelihood_ratio / negative_likelihood_ratio.to_f
+     end
+
+     def to_s
+       "\nTotal population: %d\t \
+       \nCondition positive: %d\t \
+       \nCondition negative: %d\t \
+       \nPredicted Condition positive: %d\t \
+       \nPredicted Condition negative: %d\t \
+       \nTrue positive: %d\t \
+       \nTrue negative: %d\t \
+       \nFalse Negative: %d\t \
+       \nFalse Positive: %d\t \
+       \nPrevalence = Σ Condition positive / Σ Total population: %f\t \
+       \nTrue positive rate (TPR) = Σ True positive / Σ Condition positive: %f\t \
+       \nFalse positive rate (FPR) = Σ False positive / Σ Condition negative: %f\t \
+       \nFalse negative rate (FNR) = Σ False negative / Σ Condition positive: %f\t \
+       \nTrue negative rate (TNR) = Σ True negative / Σ Condition negative: %f\t \
+       \nAccuracy (ACC) = Σ True positive + Σ True negative / Σ Total population: %f\t \
+       \nPositive predictive value (PPV) = Σ True positive / Σ Test outcome positive: %f\t \
+       \nFalse discovery rate (FDR) = Σ False positive / Σ Test outcome positive: %f\t \
+       \nFalse omission rate (FOR) = Σ False negative / Σ Test outcome negative: %f\t \
+       \nNegative predictive value (NPV) = Σ True negative / Σ Test outcome negative: %f\t \
+       \nPositive likelihood ratio (LR+) = TPR / FPR: %f\t \
+       \nNegative likelihood ratio (LR−) = FNR / TNR: %f\t \
+       \nDiagnostic odds ratio (DOR) = LR+ / LR−: %f\t" %
+       [
+         total_population,
+         condition_positive,
+         condition_negative,
+         predicted_condition_positive,
+         predicted_condition_negative,
+         true_positive,
+         true_negative,
+         false_negative,
+         false_positive,
+         prevalence,
+         true_positive_rate,
+         false_positive_rate,
+         false_negative_rate,
+         true_negative_rate,
+         accuracy,
+         positive_predictive_value,
+         false_discovery_rate,
+         false_omission_rate,
+         negative_predictive_value,
+         positive_likelihood_ratio,
+         negative_likelihood_ratio,
+         diagnostic_odds_ratio
+       ]
+     end
+
+     def class_to_index k
+       k > 0 ? 1 : 0
+     end
+   end
+ end
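
A small worked example of the table above, with invented predictions; class_to_index folds any y > 0 into row/column 1, so labels are expected in {-1, +1}.

    table = AdaBoost::ContingencyTable.new

    # Pairs of (true label y, predicted label h), invented for illustration.
    [[1, 1], [1, -1], [-1, -1], [-1, 1], [1, 1]].each do |y, h|
      table.add_prediction y, h
    end

    table.true_positive  # => 2
    table.false_negative # => 1
    table.precision      # => 0.666... (2 TP / 3 predicted positive)
    table.recall         # => 0.666... (2 TP / 3 condition positive)
    table.accuracy       # => 0.6     (3 correct / 5 total)
    puts table           # prints the full to_s metric report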
data/lib/adaboost/evaluator.rb ADDED
@@ -0,0 +1,66 @@
+ module AdaBoost
+
+   class Evaluator
+
+     def initialize classifier
+       @classifier = classifier
+       @threshold = Float::MAX
+     end
+
+     def evaluate test_set
+       contingency_table = ContingencyTable.new
+       test_set.each do |sample|
+         y = sample[@classifier.y_index]
+         if Config::USE_THRESHOLD_CLASSIFICATION
+           h = classify_using_threshold sample
+         else
+           h = classify_normally sample
+         end
+         contingency_table.add_prediction y, h
+       end
+       contingency_table
+     end
+
+     def threshold
+       if @threshold == Float::MAX
+         @threshold = 0
+         @classifier.weak_classifiers.each do |weak_classifier|
+           @threshold += weak_classifier.alpha / 2.0
+         end
+       end
+       @threshold
+     end
+
+     def classify_using_threshold sample
+       score = 0.0
+       @classifier.weak_classifiers.each do |weak_classifier|
+         if sample[weak_classifier.feature_number] > weak_classifier.split
+           score += weak_classifier.alpha
+         end
+       end
+       score > threshold ? 1 : -1
+     end
+
+     def classify_normally sample
+       @classifier.classify(sample) > 0 ? 1 : -1
+     end
+
+     def used_feature_numbers unique = false
+       used_feature_numbers = []
+       @classifier.weak_classifiers.each do |weak_classifier|
+         used_feature_numbers << weak_classifier.feature_number
+       end
+       unique ? used_feature_numbers.uniq : used_feature_numbers
+     end
+
+     def feature_occurrences
+       used_numbers = used_feature_numbers
+       occurrences = {}
+       used_numbers.each do |number|
+         occurrences[number] = 0 if occurrences[number].nil?
+         occurrences[number] += 1
+       end
+       occurrences
+     end
+   end
+ end
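
A sketch of running the evaluator, reusing the `booster` and toy `samples` from the first sketch; evaluating on the training data is for illustration only, a real evaluation would use a held-out set.

    evaluator = AdaBoost::Evaluator.new booster  # booster from the earlier sketch
    table = evaluator.evaluate samples           # returns a ContingencyTable
    puts table.accuracy

    # Which features the ensemble actually thresholds, and how often:
    p evaluator.used_feature_numbers true  # e.g. [0, 1]
    p evaluator.feature_occurrences        # e.g. {0 => 3, 1 => 2}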
data/lib/adaboost/features_analyzer.rb ADDED
@@ -0,0 +1,82 @@
+ module AdaBoost
+
+   Analyze = Struct.new(:statistics, :distribution)
+   Distribution = Struct.new(:negative, :positive)
+   FeatureStatistic = Struct.new(:min, :max, :sum, :avg, :vrn, :std, :rng)
+   VariableRelations = Struct.new(:x, :y, :cov, :cor)
+
+   class FeaturesAnalyzer
+
+     def initialize y_index
+       @y_index = y_index
+     end
+
+     def analyze samples
+
+       statistics = []
+       distribution = Distribution.new 0, 0
+       number_of_samples = samples.size
+
+       if number_of_samples < 1
+         raise ArgumentError.new 'At least one sample is needed to analyze.'
+       end
+       number_of_features = @y_index
+       sample_size = samples[0].size
+       if number_of_features < 1 or sample_size < 2 or sample_size <= @y_index
+         raise ArgumentError.new 'At least 1 feature is needed to analyze.'
+       end
+       0.upto number_of_features - 1 do
+         statistics << FeatureStatistic.new(Float::MAX, -Float::MAX, 0, 0, 0, 0, 0)
+       end
+       samples.each do |sample|
+         y = sample[@y_index]
+         if y == -1
+           distribution.negative += 1
+         else
+           distribution.positive += 1
+         end
+         0.upto number_of_features - 1 do |i|
+           statistic = statistics[i]
+           feature_value = sample[i]
+           if feature_value < statistic.min
+             statistic.min = feature_value
+           end
+           if feature_value > statistic.max
+             statistic.max = feature_value
+           end
+           statistic.sum += feature_value
+         end
+       end
+       statistics.each do |statistic|
+         statistic.avg = statistic.sum / number_of_samples.to_f
+         statistic.rng = (statistic.max - statistic.min).abs
+       end
+       samples.each do |sample|
+         statistics.each_with_index do |statistic, i|
+           feature_value = sample[i]
+           statistic.vrn += (statistic.avg - feature_value) ** 2
+         end
+       end
+       statistics.each do |statistic|
+         statistic.vrn /= (number_of_samples - 1).to_f
+         statistic.std = Math.sqrt statistic.vrn
+       end
+       analyze = Analyze.new
+       analyze.statistics = statistics
+       analyze.distribution = distribution
+       analyze
+     end
+
+     def relations x, y, samples, statistics
+       sum = 0.0
+       samples.each do |sample|
+         x_value = sample[x].to_f
+         y_value = sample[y].to_f
+         sum += (x_value - statistics[x].avg) * (y_value - statistics[y].avg)
+       end
+       cov = sum / (samples.size - 1).to_f
+       cor = cov / (statistics[x].std * statistics[y].std).to_f
+       VariableRelations.new x, y, cov, cor
+     end
+   end
+ end
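
The analyzer can also be used standalone; a sketch with three invented samples (label at index 2, so two features are summarized):

    analyzer = AdaBoost::FeaturesAnalyzer.new 2
    result = analyzer.analyze [[0.2, 2.0, 1], [0.5, 1.9, -1], [1.0, 0.9, 1]]

    result.distribution.positive  # => 2
    result.distribution.negative  # => 1

    stat = result.statistics[0]   # min/max/sum/avg/vrn/std/rng for feature 0
    stat.min                      # => 0.2
    stat.avg                      # => 0.5666...
    stat.rng                      # => 0.8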
data/lib/adaboost/resampler.rb ADDED
@@ -0,0 +1,31 @@
+ module AdaBoost
+
+   class Resampler
+
+     def initialize y_index
+       @y_index = y_index
+     end
+
+     def over_sample samples
+       distribution = distribution samples
+       y0 = distribution.negative
+       y1 = distribution.positive
+       majority = y0 < y1 ? 1.0 : -1.0
+       difference = (y0 - y1).abs
+       samples.each do |sample|
+         if difference <= 0
+           break
+         end
+         if sample[@y_index] != majority
+           samples << sample
+           difference -= 1
+         end
+       end
+     end
+
+     def distribution instances
+       analyzer = FeaturesAnalyzer.new @y_index
+       analyzer.analyze(instances).distribution
+     end
+   end
+ end
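
over_sample mutates the array in place, appending (shallow) duplicates of minority-class rows until the classes balance. A sketch with an invented 3-vs-1 split, label at index 1:

    samples = [[0.1, 1], [0.2, -1], [0.3, -1], [0.4, -1]]
    AdaBoost::Resampler.new(1).over_sample samples
    samples.size  # => 6: the lone +1 row is appended twice to match the
                  #    three -1 rows (the copies reference the same array)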
data/lib/adaboost/weak_classifier.rb ADDED
@@ -0,0 +1,31 @@
+ module AdaBoost
+
+   class WeakClassifier
+
+     attr_accessor :error
+     attr_reader :feature_number, :split, :alpha
+
+     def initialize feature_number, split
+       @feature_number = feature_number
+       @split = split
+       @error = 0.0
+       @alpha = 0.0
+     end
+
+     def compute_alpha
+       @alpha = 0.5 * Math.log((1.0 - @error) / @error)
+     end
+
+     def classify sample
+       sample[@feature_number] > @split ? 1 : -1
+     end
+
+     def classify_with_alpha sample
+       classify(sample) * @alpha
+     end
+
+     def increase_error amount
+       @error += amount
+     end
+   end
+ end
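
compute_alpha is the standard AdaBoost vote weight α = ½·ln((1 − ε)/ε); a worked instance with an invented stump and error (note ε must lie strictly between 0 and 1 for the logarithm to be finite):

    stump = AdaBoost::WeakClassifier.new 0, 1.5  # threshold feature 0 at 1.5
    stump.increase_error 0.2                     # weighted error e = 0.2
    stump.compute_alpha                          # 0.5 * ln(0.8 / 0.2) ~= 0.693

    stump.classify [2.0, -1]             # => 1 (2.0 > 1.5)
    stump.classify_with_alpha [2.0, -1]  # => ~0.693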
data/lib/adaboost/weak_learner.rb ADDED
@@ -0,0 +1,87 @@
+ module AdaBoost
+
+   class WeakLearner
+
+     def initialize y_index
+       @y_index = y_index
+       @analyzer = FeaturesAnalyzer.new y_index
+       @classifiers_cache = []
+     end
+
+     def features_statistics samples
+       @analyzer.analyze(samples).statistics
+     end
+
+     def generate_weak_classifier samples, weights
+       number_of_samples = samples.size
+       if number_of_samples < 1
+         raise ArgumentError.new 'At least one sample is needed to generate.'
+       end
+       number_of_features = @y_index
+       sample_size = samples[0].size
+       if number_of_features < 1 or sample_size < 2 or sample_size <= @y_index
+         raise ArgumentError.new 'At least 1 feature is needed to generate.'
+       end
+       classifiers = []
+       if Config::USE_RANDOM_WEAK_CLASSIFIERS
+         classifiers = generate_random_classifiers samples, number_of_features
+       else
+         classifiers = generate_all_possible_classifiers samples, number_of_features
+       end
+       best_index = -1
+       best_error = Float::MAX
+       classifiers.each_with_index do |classifier, i|
+         classifier.error = 0.0
+         samples.each_with_index do |sample, j|
+           y = sample[@y_index]
+           if classifier.classify(sample).to_f != y
+             classifier.increase_error weights[j]
+           end
+         end
+         if classifier.error < best_error
+           best_error = classifier.error
+           best_index = i
+         end
+       end
+       best = classifiers[best_index]
+       if !Config::USE_RANDOM_WEAK_CLASSIFIERS
+         classifiers.delete_at best_index
+       end
+       best
+     end
+
+     def generate_random_classifiers samples, number_of_features
+       classifiers = []
+       statistics = features_statistics samples
+       0.upto Config::NUMBER_OF_RANDOM_CLASSIFIERS - 1 do
+         feature_number = rand number_of_features
+         info = statistics[feature_number]
+         split = rand * info.rng + info.min
+         classifiers << WeakClassifier.new(feature_number, split)
+       end
+       classifiers
+     end
+
+     def generate_all_possible_classifiers samples, number_of_features
+       if @classifiers_cache.size == 0
+         matrix = []
+         0.upto number_of_features - 1 do
+           matrix << []
+         end
+         samples.each do |sample|
+           0.upto number_of_features - 1 do |i|
+             sample_value = sample[i]
+             matrix[i] << sample_value
+           end
+         end
+         matrix.each_with_index do |entry, i|
+           entry = entry.uniq
+           entry.each do |uniq_value|
+             @classifiers_cache << WeakClassifier.new(i, uniq_value)
+           end
+         end
+       end
+       @classifiers_cache
+     end
+   end
+ end
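
The learner can also be driven directly, outside AdaBoost#train; a sketch reusing the toy `samples` from the first example, with uniform weights:

    learner = AdaBoost::WeakLearner.new 2
    weights = Array.new(samples.size, 1.0 / samples.size)

    stump = learner.generate_weak_classifier samples, weights
    stump.feature_number  # index of the feature the best stump thresholds
    stump.split           # the chosen threshold
    stump.error           # its weighted training error under `weights`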
data/lib/adaboost.rb ADDED
@@ -0,0 +1,11 @@
+ require 'adaboost/adaboost.rb'
+ require 'adaboost/config.rb'
+ require 'adaboost/contingency_table.rb'
+ require 'adaboost/evaluator.rb'
+ require 'adaboost/features_analyzer.rb'
+ require 'adaboost/resampler.rb'
+ require 'adaboost/weak_classifier.rb'
+ require 'adaboost/weak_learner.rb'
+
+ module AdaBoost
+ end
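
This entry file only collects the component requires, so a single require (after installing the gem) brings in the whole namespace:

    # gem install adaboost
    require 'adaboost'

    AdaBoost::Config::NUMBER_OF_RANDOM_CLASSIFIERS  # => 100
    AdaBoost.constants.sort
    # => [:AdaBoost, :Analyze, :Config, :ContingencyTable, :Distribution, ...]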
metadata ADDED
@@ -0,0 +1,52 @@
+ --- !ruby/object:Gem::Specification
+ name: adaboost
+ version: !ruby/object:Gem::Version
+   version: 0.0.1
+ platform: ruby
+ authors:
+ - Dalmir da Silva
+ autorequire:
+ bindir: bin
+ cert_chain: []
+ date: 2016-07-25 00:00:00.000000000 Z
+ dependencies: []
+ description: AdaBoost classifier!
+ email: dalmirdasilva@gmail.com
+ executables: []
+ extensions: []
+ extra_rdoc_files: []
+ files:
+ - lib/adaboost.rb
+ - lib/adaboost/adaboost.rb
+ - lib/adaboost/config.rb
+ - lib/adaboost/contingency_table.rb
+ - lib/adaboost/evaluator.rb
+ - lib/adaboost/features_analyzer.rb
+ - lib/adaboost/resampler.rb
+ - lib/adaboost/weak_classifier.rb
+ - lib/adaboost/weak_learner.rb
+ homepage: http://dalmirdasilva.com/adaboost-classifier
+ licenses:
+ - MIT
+ metadata: {}
+ post_install_message:
+ rdoc_options: []
+ require_paths:
+ - lib
+ required_ruby_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - ">="
+     - !ruby/object:Gem::Version
+       version: '0'
+ required_rubygems_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - ">="
+     - !ruby/object:Gem::Version
+       version: '0'
+ requirements: []
+ rubyforge_project:
+ rubygems_version: 2.5.1
+ signing_key:
+ specification_version: 4
+ summary: AdaBoost classifier!
+ test_files: []