RubyGems - idhja22 - Versions diffs - 0.14.4 → 1.1.0 - Mend

idhja22 0.14.4 → 1.1.0

Files changed (25) hide show

data/.gitignore +1 -1
data/bin/idhja22 +12 -3
data/idhja22.gemspec +2 -2
data/lib/idhja22/bayes.rb +50 -2
data/lib/idhja22/binary_classifier.rb +54 -0
data/lib/idhja22/config/default.rb +1 -0
data/lib/idhja22/dataset/errors.rb +1 -0
data/lib/idhja22/dataset.rb +22 -5
data/lib/idhja22/tree/node.rb +57 -14
data/lib/idhja22/tree.rb +3 -45
data/lib/idhja22/version.rb +1 -1
data/lib/idhja22.rb +1 -0
data/spec/bayes_spec.rb +114 -0
data/spec/{another_large_spec_data.csv → data/another_large_spec_data.csv} +0 -0
data/spec/data/evenly_split.csv +9 -0
data/spec/{large_spec_data.csv → data/large_spec_data.csv} +0 -0
data/spec/{spec_data.csv → data/spec_data.csv} +0 -0
data/spec/dataset_spec.rb +49 -2
data/spec/node/decision_node_spec.rb +205 -0
data/spec/node/leaf_node_spec.rb +53 -0
data/spec/spec_helper.rb +4 -0
data/spec/tree_spec.rb +4 -4
data/spec/version_spec.rb +1 -1
metadata +19 -14
data/spec/node_spec.rb +0 -97

data/.gitignore CHANGED Viewed

@@ -27,4 +27,4 @@ Gemfile.lock
 .DS_Store
 # data directory for storing csvs to run the program against
-data
+/data

data/bin/idhja22 CHANGED Viewed

@@ -4,13 +4,22 @@ require 'thor'
 require 'idhja22'
 class TrainAndValidate < Thor
-  desc "train_and_validate FILE", "train a tree for the given file and validate is against a validation set"
+  desc "train_and_validate_tree FILE", "train a tree for the given file and validate is against a validation set"
+  method_option :attributes, :type => :array
   method_option :"training-proportion", :type => :numeric, :default => 1.0, :aliases => 't'
-  def train_and_validate(filename)
-    t, v = Idhja22::Tree.train_and_validate_from_csv(filename, options[:"training-proportion"])
+  def train_and_validate_tree(filename)
+    t, v = Idhja22::Tree.train_and_validate_from_csv(filename, options)
     puts t.get_rules
     puts "Against validation set probability of successful classifiction: #{v}" if options[:"training-proportion"] < 1.0
   end
+  desc "train_and_validate_bayes FILE", "train a naive Bayesian classifier for the given file and validate is against a validation set"
+  method_option :attributes, :type => :array
+  method_option :"training-proportion", :type => :numeric, :default => 1.0, :aliases => 't'
+  def train_and_validate_bayes(filename)
+    t, v = Idhja22::Bayes.train_and_validate_from_csv(filename, options)
+    puts "Against validation set probability of successful classifiction: #{v}" if options[:"training-proportion"] < 1.0
+  end
 end
 TrainAndValidate.start

data/idhja22.gemspec CHANGED Viewed

@@ -7,8 +7,8 @@ Gem::Specification.new do |gem|
   gem.name          = "idhja22"
   gem.version       = Idhja22::VERSION
   gem.authors       = ["Henry Addison"]
-  gem.description   = %q{Decision Trees}
-  gem.summary       = %q{A gem for creating decision trees}
+  gem.description   = %q{Classifiers}
+  gem.summary       = %q{A gem for creating classifiers (decision trees and naive Bayes so far)}
   gem.homepage      = "https://github.com/henryaddison/idhja22"
   gem.files         = `git ls-files`.split($/)

data/lib/idhja22/bayes.rb CHANGED Viewed

@@ -1,5 +1,53 @@
 module Idhja22
-  class Bayes
+  class Bayes < BinaryClassifier
+    attr_accessor :conditional_probabilities, :prior_probabilities
+    class << self
+      def calculate_conditional_probabilities dataset, attribute_labels_to_use
+        conditional_probabilities = {}
+        attribute_labels_to_use.each do |attr_label|
+          conditional_probabilities[attr_label] = {}
+          dataset.partition_by_category.each do |cat, uniform_category_ds|
+            conditional_probabilities[attr_label][cat] = Hash.new(0)
+            partitioned_data = uniform_category_ds.partition(attr_label)
+            partitioned_data.each do |attr_value, uniform_value_ds|
+              conditional_probabilities[attr_label][cat][attr_value] = uniform_value_ds.size.to_f/uniform_category_ds.size.to_f
+            end
+          end
+        end
+        return conditional_probabilities
+      end
+      def calculate_priors dataset
+        output = Hash.new(0)
+        dataset.category_counts.each do |cat, count|
+          output[cat] = count.to_f/dataset.size.to_f
+        end
+        return output
+      end
+    end
+    def evaluate(query)
+      nb_values = {}
+      total_values = 0
+      prior_probabilities.each do |cat, prior_prob|
+        nb_value = prior_prob
+        conditional_probabilities.each do |attr_label, probs|
+          raise Idhja22::Dataset::Datum::UnknownAttributeValue, "Not seen value #{query[attr_label]} for attribute #{attr_label} in training." unless probs[cat].has_key? query[attr_label]
+          nb_value *= probs[cat][query[attr_label]]
+        end
+        total_values += nb_value
+        nb_values[cat] = nb_value
+      end
+      return nb_values['Y']/total_values
+    end
+    def train(dataset, attributes_to_use)
+      self.conditional_probabilities = self.class.calculate_conditional_probabilities(dataset, attributes_to_use)
+      self.prior_probabilities = self.class.calculate_priors(dataset)
+      return self
+    end
   end
 end

data/lib/idhja22/binary_classifier.rb ADDED Viewed

@@ -0,0 +1,54 @@
+module Idhja22
+  class BinaryClassifier
+    class << self
+      # Trains a classifier using the provided Dataset.
+      def train(dataset, opts = {})
+        attributes_to_use = (opts[:attributes] || dataset.attribute_labels)
+        classifier = new
+        classifier.train(dataset, attributes_to_use)
+        return classifier
+      end
+      # Takes a dataset and splits it randomly into training and validation data.
+      # Uses the training data to train a classifier whose perfomance then measured using the validation data.
+      # @param [Float] Proportion of dataset to use for training. The rest will be used to validate the resulting classifier.
+      def train_and_validate(dataset, opts = {})
+        opts[:"training-proportion"] ||= 0.5
+        training_set, validation_set = dataset.split(opts[:"training-proportion"])
+        tree = self.train(training_set, opts)
+        validation_value = tree.validate(validation_set)
+        return tree, validation_value
+      end
+      # see #train
+      # @note Takes a CSV filename rather than a Dataset
+      def train_from_csv(filename, opts={})
+        ds = Dataset.from_csv(filename)
+        train(ds, opts)
+      end
+      # see #train_and_validate
+      # @note Takes a CSV filename rather than a Dataset
+      def train_and_validate_from_csv(filename, opts={})
+        ds = Dataset.from_csv(filename)
+        train_and_validate(ds, opts)
+      end
+    end
+    def validate(ds)
+      output = 0
+      ds.data.each do |validation_point|
+        begin
+          prob = evaluate(validation_point)
+          output += (validation_point.category == 'Y' ? prob : 1.0 - prob)
+        rescue Idhja22::Dataset::Datum::UnknownAttributeValue
+          # if don't recognised the attribute value in the example, then assume the worst:
+          # will never classify this point correctly
+          # equivalent to output += 0 but no point running this
+        end
+      end
+      return output.to_f/ds.size.to_f
+    end
+  end
+end

data/lib/idhja22/config/default.rb CHANGED Viewed

@@ -2,4 +2,5 @@ Configuration.for('default') {
   default_probability 0.5
   termination_probability 0.95
   min_dataset_size 20
+  probability_delta 0.01
 }

data/lib/idhja22/dataset/errors.rb CHANGED Viewed

@@ -1,4 +1,5 @@
 module Idhja22
+  class IncompleteTree < StandardError; end
   class Dataset
     class BadData < ArgumentError; end
     class InsufficientData < BadData; end

data/lib/idhja22/dataset.rb CHANGED Viewed

@@ -17,13 +17,13 @@ module Idhja22
         category_label = labels.pop
         attribute_labels = labels
-        data = []
+        set = new([], attribute_labels, category_label)
         csv.each do |row|
           training_example = Example.new(row, attribute_labels, category_label)
-          data << training_example
+          set << training_example
         end
-        new(data, attribute_labels, category_label)
+        return set
       end
     end
@@ -36,8 +36,9 @@ module Idhja22
     def category_counts
       counts = Hash.new(0)
-      data.each do |d|
-        counts[d.category]+=1
+      split_data = partition_by_category
+      split_data.each do |cat, d|
+        counts[cat] = d.size
       end
       return counts
     end
@@ -66,5 +67,21 @@ module Idhja22
       return training_set, validation_set
     end
+    def <<(example)
+      raise Idhja22::Dataset::Datum::UnknownCategoryLabel unless example.category_label == self.category_label
+      raise Idhja22::Dataset::Datum::UnknownAttributeLabel unless example.attribute_labels == self.attribute_labels
+      self.data << example
+    end
+    def partition_by_category
+      output = Hash.new do |hash, key|
+        hash[key] = self.class.new([], attribute_labels, category_label)
+      end
+      self.data.each do |d|
+        output[d.category] << d
+      end
+      return output
+    end
   end
 end

data/lib/idhja22/tree/node.rb CHANGED Viewed

@@ -20,9 +20,7 @@ module Idhja22
           return Idhja22::LeafNode.new(dataset.probability, dataset.category_label)
         end
-        data_split, best_attribute = best_attribute(dataset, attributes_available)
-        node = Idhja22::DecisionNode.new(data_split, best_attribute, attributes_available-[best_attribute], depth, dataset.probability)
+        node = DecisionNode.build(dataset, attributes_available, depth)
         return node
       end
@@ -59,21 +57,34 @@ module Idhja22
   class DecisionNode < Node
     attr_reader :branches, :decision_attribute
-    def initialize(data_split, decision_attribute, attributes_available, depth, parent_probability)
-      @decision_attribute = decision_attribute
-      @branches = {}
-      data_split.each do |value, dataset|
-        node = Node.build_node(dataset, attributes_available, depth+1, parent_probability)
-        if(node.is_a?(DecisionNode) && node.branches.values.all? { |n| n.is_a?(LeafNode) })
-          probs = node.branches.values.collect(&:probability)
-          if(probs.max - probs.min < 0.01)
-            node = LeafNode.new(probs.max, dataset.category_label)
-          end
+    class << self
+      def build(dataset, attributes_available, depth)
+        data_split, best_attribute = best_attribute(dataset, attributes_available)
+        output_node = new(best_attribute)
+        data_split.each do |value, dataset|
+          node = Node.build_node(dataset, attributes_available-[best_attribute], depth+1, dataset.probability)
+          output_node.add_branch(value, node) if node && !(node.is_a?(DecisionNode) && node.branches.empty?)
         end
-        @branches[value] = node if node && !(node.is_a?(DecisionNode) && node.branches.empty?)
+        output_node.cleanup_children!
+        return output_node
       end
     end
+    def initialize(decision_attribute)
+      @decision_attribute = decision_attribute
+      @branches = {}
+    end
+    def add_branch(attr_value, node)
+      @branches[attr_value] = node
+    end
     def get_rules
       rules = []
       branches.each do |v,n|
@@ -104,6 +115,29 @@ module Idhja22
       raise Idhja22::Dataset::Datum::UnknownAttributeValue, "when looking at attribute labelled #{self.decision_attribute} could not find branch for value #{queried_value}" if branch.nil?
       branch.evaluate(query)
     end
+    def cleanup_children!
+      branches.each do |attr, child_node|
+        child_node.cleanup_children!
+        leaves = child_node.leaves
+        probs = leaves.collect(&:probability)
+        if(probs.max - probs.min < Idhja22.config.probability_delta)
+          new_node = LeafNode.new(probs.max, category_label)
+          add_branch(attr, new_node)
+        end
+      end
+    end
+    def leaves
+      raise Idhja22::IncompleteTree, "decision node with no branches" if branches.empty?
+      branches.values.flat_map do |child_node|
+        child_node.leaves
+      end
+    end
+    def category_label
+      leaves.first.category_label
+    end
   end
   class LeafNode < Node
@@ -125,5 +159,14 @@ module Idhja22
       raise Idhja22::Dataset::Datum::UnknownCategoryLabel, "expected category label for query is #{query.category_label} but node is using #{self.category_label}" unless query.category_label == self.category_label
       return probability
     end
+    def leaves
+      return [self]
+    end
+    # no-op method - a leaf node has no children by definition
+    def cleanup_children!
+    end
   end
 end

data/lib/idhja22/tree.rb CHANGED Viewed

@@ -2,42 +2,15 @@ require "idhja22/tree/node"
 module Idhja22
   # The main entry class for a training, viewing and evaluating a decision tree.
-  class Tree
+  class Tree < BinaryClassifier
     attr_accessor :root
     class << self
-      # Trains a Tree using the provided Dataset.
-      def train(dataset)
-        new(dataset, dataset.attribute_labels)
-      end
-      # Takes a dataset and splits it randomly into training and validation data.
-      # Uses the training data to train a tree whose perfomance then measured using the validation data.
-      # @param [Float] Proportion of dataset to use for training. The rest will be used to validate the resulting tree.
-      def train_and_validate(dataset, training_proportion=0.5)
-        training_set, validation_set = dataset.split(training_proportion)
-        tree = self.train(training_set)
-        validation_value = tree.validate(validation_set)
-        return tree, validation_value
-      end
-      # see #train
-      # @note Takes a CSV filename rather than a Dataset
-      def train_from_csv(filename)
-        ds = Dataset.from_csv(filename)
-        train(ds)
-      end
-      # see #train_and_validate
-      # @note Takes a CSV filename rather than a Dataset
-      def train_and_validate_from_csv(filename, training_proportion=0.5)
-        ds = Dataset.from_csv(filename)
-        train_and_validate(ds, training_proportion)
-      end
     end
-    def initialize(dataset, attributes_available)
+    def train(dataset, attributes_available)
       raise Idhja22::Dataset::InsufficientData, "require at least #{Idhja22.config.min_dataset_size} data points, only have #{dataset.size} in data set provided" if(dataset.size < Idhja22.config.min_dataset_size)
       @root = Node.build_node(dataset, attributes_available, 0)
+      return self
     end
     def get_rules
@@ -52,20 +25,5 @@ module Idhja22
     def evaluate query
       @root.evaluate(query)
     end
-    def validate(ds)
-      output = 0
-      ds.data.each do |validation_point|
-        begin
-          prob = evaluate(validation_point)
-          output += (validation_point.category == 'Y' ? prob : 1.0 - prob)
-        rescue Idhja22::Dataset::Datum::UnknownAttributeValue
-          # if don't recognised the attribute value in the example, then assume the worst:
-          # will never classify this point correctly
-          # equivalent to output += 0 but no point running this
-        end
-      end
-      return output.to_f/ds.size.to_f
-    end
   end
 end

data/lib/idhja22/version.rb CHANGED Viewed

@@ -1,3 +1,3 @@
 module Idhja22
-  VERSION = "0.14.4"
+  VERSION = "1.1.0"
 end

data/lib/idhja22.rb CHANGED Viewed

@@ -3,6 +3,7 @@ require 'idhja22/config/default'
 require "idhja22/version"
 require "idhja22/dataset"
+require "idhja22/binary_classifier"
 require "idhja22/tree"
 require "idhja22/bayes"

data/spec/bayes_spec.rb CHANGED Viewed

@@ -1,5 +1,119 @@
 require 'spec_helper'
 describe Idhja22::Bayes do
+  before(:all) do
+    @ds = Idhja22::Dataset.from_csv(File.join(data_dir,'large_spec_data.csv'))
+  end
+  describe '.train' do
+    it 'should train a classifier from a dataset' do
+      classifier = Idhja22::Bayes.train @ds, :attributes => %w{0}
+      cond_probs = classifier.conditional_probabilities
+      cond_probs.keys.should == ['0']
+      cond_probs['0'].keys.should == ['Y', 'N']
+      cond_probs['0']['Y']['a'].should == 5.0/6.0
+      cond_probs['0']['N']['a'].should == 0.75
+      cond_probs['0']['Y']['b'].should == 1.0/6.0
+      cond_probs['0']['N']['b'].should == 0.25
+      prior_probs = classifier.prior_probabilities
+      prior_probs['Y'].should == 0.6
+      prior_probs['N'].should == 0.4
+    end
+  end
+  describe '.calculate_conditional_probabilities' do
+    it 'should calculate the conditional probabilities of P(Cat|attr_val) from dataset for given attribute labels' do
+      cond_probs = Idhja22::Bayes.calculate_conditional_probabilities @ds, %w{0 2}
+      cond_probs.keys.should == ['0', '2']
+      cond_probs['0'].keys.should == ['Y','N']
+      cond_probs['2'].keys.should == ['Y','N']
+      cond_probs['0']['Y']['a'].should == 5.0/6.0
+      cond_probs['0']['N']['a'].should == 0.75
+      cond_probs['0']['Y']['b'].should == 1.0/6.0
+      cond_probs['0']['N']['b'].should == 0.25
+      cond_probs['2']['Y']['a'].should == 1.0
+      cond_probs['2']['N']['a'].should == 0.5
+      cond_probs['2']['Y']['b'].should == 0
+      cond_probs['2']['N']['b'].should == 0.5
+    end
+  end
+  describe '.calculate_priors' do
+    it 'should calculate the prior probabilities' do
+      prior_probs = Idhja22::Bayes.calculate_priors @ds
+      prior_probs['Y'].should == 0.6
+      prior_probs['N'].should == 0.4
+    end
+    context 'all single category' do
+      it 'should return 0 for other categories' do
+        uniform_ds = Idhja22::Dataset.new([Idhja22::Dataset::Example.new(['high', '20-30', 'vanilla', 'Y'], ['Confidence', 'Age group', 'fav ice cream'] , 'Loves Reading')], ['Confidence', 'Age group', 'fav ice cream'], 'Loves Reading')
+        prior_probs = Idhja22::Bayes.calculate_priors uniform_ds
+        prior_probs['Y'].should == 1.0
+        prior_probs['N'].should == 0
+      end
+    end
+  end
+  describe '#evaluate' do
+    before(:all) do
+      @bayes = Idhja22::Bayes.new
+      @bayes.conditional_probabilities = {
+        'age' => {
+          'Y' => {'young' => 0.98, 'old' => 0.02},
+          'N' => {'young' => 0.98, 'old' => 0.02}
+        },
+        'confidence' => {
+          'Y' => {'high' => 0.6, 'medium' => 0.3, 'low' => 0.1},
+          'N' => {'high' => 0.8, 'medium' => 0.15, 'low' => 0.05}
+        },
+        'fav ice cream' => {
+          'Y' => {'vanilla' => 0.75, 'strawberry' => 0.25},
+          'N' => {'vanilla' => 0.5, 'strawberry' => 0.6}
+        }
+      }
+      @bayes.prior_probabilities = {'Y' => 0.75, 'N' => 0.25}
+    end
+    context 'Y likely' do
+      it 'should return probability of being Y' do
+        query = Idhja22::Dataset::Datum.new(['high', 'young', 'vanilla', 'cheddar'], ['confidence', 'age', 'fav ice cream', 'fav cheese'], 'Loves Reading')
+        @bayes.evaluate(query).should be_within(0.00001).of(0.77143)
+      end
+    end
+    context 'N likely' do
+      it 'should return probability of being Y' do
+        query = Idhja22::Dataset::Datum.new(['high', 'young', 'strawberry', 'cheddar'], ['confidence', 'age', 'fav ice cream', 'fav cheese'], 'Loves Reading')
+        @bayes.evaluate(query).should be_within(0.00001).of(0.48387)
+      end
+    end
+    context 'unrecognised attribute value' do
+      it 'should throw an error' do
+        query = Idhja22::Dataset::Datum.new(['high', 'young', 'chocolate', 'cheddar'], ['confidence', 'age', 'fav ice cream', 'fav cheese'], 'Loves Reading')
+        expect { @bayes.evaluate(query) }.to raise_error(Idhja22::Dataset::Datum::UnknownAttributeValue)
+      end
+    end
+  end
+  describe '#validate' do
+    before(:all) do
+      @bayes = Idhja22::Bayes.train(@ds)
+    end
+    it 'should return the average probability that the tree gets the validation examples correct' do
+      vds = Idhja22::Dataset.new([], ['0', '1','2','3','4'],'C')
+      vds << Idhja22::Dataset::Example.new(['a','a','a','a','a','Y'],['0', '1','2','3','4'],'C')
+      vds << Idhja22::Dataset::Example.new(['a','a','a','a','a','N'],['0', '1','2','3','4'],'C')
+      @bayes.validate(vds).should == 0.5
+    end
+  end
 end

data/spec/{another_large_spec_data.csv → data/another_large_spec_data.csv} RENAMED Viewed

File without changes

data/spec/data/evenly_split.csv ADDED Viewed

@@ -0,0 +1,9 @@
+1,2,C
+a,a,Y
+a,a,N
+a,b,Y
+a,b,N
+b,a,Y
+b,a,N
+b,b,Y
+b,b,N

data/spec/{large_spec_data.csv → data/large_spec_data.csv} RENAMED Viewed

File without changes

data/spec/{spec_data.csv → data/spec_data.csv} RENAMED Viewed

File without changes

data/spec/dataset_spec.rb CHANGED Viewed

@@ -10,7 +10,7 @@ describe Idhja22::Dataset do
     describe 'from_csv' do
       before(:all) do
-        @ds = Idhja22::Dataset.from_csv(File.join(File.dirname(__FILE__),'spec_data.csv'))
+        @ds = Idhja22::Dataset.from_csv(File.join(data_dir,'spec_data.csv'))
       end
       it 'should extract labels' do
@@ -50,7 +50,7 @@ describe Idhja22::Dataset do
     context 'ready made' do
       before(:all) do
-        @ds = Idhja22::Dataset.from_csv(File.join(File.dirname(__FILE__),'large_spec_data.csv'))
+        @ds = Idhja22::Dataset.from_csv(File.join(data_dir,'large_spec_data.csv'))
       end
       describe '#partition' do
@@ -125,6 +125,53 @@ describe Idhja22::Dataset do
           vs.size.should == 3
         end
       end
+      describe '#partition_by_category' do
+        it 'should divide the data set into a set of all Ys and a set of all Ns' do
+          sets = @ds.partition_by_category
+          sets.length.should == 2
+          sets['Y'].data.collect(&:category).uniq.should == ['Y']
+          sets['N'].data.collect(&:category).uniq.should == ['N']
+        end
+      end
+      describe '#<<' do
+        it 'should all datum to list of data' do
+          added_datum = Idhja22::Dataset::Example.new(['a','b','c','d','e', 'Y'],['0','1','2','3','4'],'C')
+          expect { @ds << added_datum}.to change(@ds, :size)
+          @ds.data.last.should == added_datum
+        end
+        context 'mismatched category label' do
+          it 'should throw an error' do
+            added_datum = Idhja22::Dataset::Example.new(['a','b','c','d','e', 'Y'],['0','1','2','3','4'],'D')
+            expect { @ds << added_datum}.to raise_error(Idhja22::Dataset::Datum::UnknownCategoryLabel)
+          end
+        end
+        context 'mismatching attributes' do
+          context 'extra attribute' do
+            it 'should throw an error' do
+              added_datum = Idhja22::Dataset::Example.new(['a','b','c','d','e', 'f', 'Y'],['0','1','2','3','4', '5'],'C')
+              expect { @ds << added_datum}.to raise_error(Idhja22::Dataset::Datum::UnknownAttributeLabel)
+            end
+          end
+          context 'missing attribute' do
+            it 'should throw an error' do
+              added_datum = Idhja22::Dataset::Example.new(['a','b','c','d', 'Y'],['0','1','2','3'],'C')
+              expect { @ds << added_datum}.to raise_error(Idhja22::Dataset::Datum::UnknownAttributeLabel)
+            end
+          end
+          context 'different attribute' do
+            it 'should throw an error' do
+              added_datum = Idhja22::Dataset::Example.new(['a','b','c','d', 'e', 'Y'],['0','1','2','3','9'],'C')
+              expect { @ds << added_datum}.to raise_error(Idhja22::Dataset::Datum::UnknownAttributeLabel)
+            end
+          end
+        end
+      end
     end
   end
 end

data/spec/node/decision_node_spec.rb ADDED Viewed

@@ -0,0 +1,205 @@
+require 'spec_helper'
+describe Idhja22::DecisionNode do
+  before(:all) do
+    @ds = Idhja22::Dataset.from_csv(File.join(data_dir,'large_spec_data.csv'))
+    @simple_decision_node = Idhja22::DecisionNode.new('3')
+    l1 = Idhja22::LeafNode.new(0.75, 'C')
+    l2 = Idhja22::LeafNode.new(0.0, 'C')
+    @simple_decision_node.add_branch('a', l1)
+    @simple_decision_node.add_branch('b', l2)
+  end
+  describe('#get_rules') do
+    it 'should return a list of rules' do
+      @simple_decision_node.get_rules.should == ["3 == a and then chance of C = 0.75", "3 == b and then chance of C = 0.0"]
+    end
+  end
+  describe '#leaves' do
+    it 'should return a list of terminating values' do
+      @simple_decision_node.leaves.should == [Idhja22::LeafNode.new(0.75, 'C'), Idhja22::LeafNode.new(0.0, 'C')]
+    end
+    context 'a branch without a terminating leaf node' do
+      it 'should throw an error' do
+        decision_node = Idhja22::DecisionNode.new('a')
+        decision_node.add_branch('1', Idhja22::LeafNode.new(0.75, 'C'))
+        decision_node.add_branch('2', Idhja22::DecisionNode.new('b'))
+        expect { decision_node.leaves }.to raise_error(Idhja22::IncompleteTree)
+      end
+    end
+  end
+  describe(' == ') do
+    it 'should return false with different decision attributes' do
+      dn = Idhja22::DecisionNode.new('2')
+      diff_dn = Idhja22::DecisionNode.new('3')
+      dn.should_not == diff_dn
+    end
+    it 'should return false with different branches' do
+      dn1 = Idhja22::DecisionNode.new('2')
+      diff_dn = Idhja22::DecisionNode.new('2')
+      leaf = Idhja22::LeafNode.new(0.75, 'C')
+      dn1.add_branch('value', leaf)
+      dn1.should_not == diff_dn
+    end
+    it 'should return true if decision node and branches match' do
+      dn1 = Idhja22::DecisionNode.new('2')
+      dn2 = Idhja22::DecisionNode.new('2')
+      leaf = Idhja22::LeafNode.new(0.75, 'C')
+      dn1.add_branch('value', leaf)
+      dn2.add_branch('value', leaf)
+      dn1.should == dn2
+    end
+  end
+  describe 'category_label' do
+    it 'should return the category_label from the leaves' do
+      @simple_decision_node.category_label.should == 'C'
+    end
+    context 'incomplete node' do
+      it 'should throw an error' do
+        dn = Idhja22::DecisionNode.new('a')
+        expect { dn.category_label }.to raise_error(Idhja22::IncompleteTree)
+      end
+    end
+  end
+  describe 'evaluate' do
+    it 'should follow node to probability' do
+      query = Idhja22::Dataset::Datum.new(['a', 'a'], ['3', '4'], 'C')
+      @simple_decision_node.evaluate(query).should == 0.75
+      query = Idhja22::Dataset::Datum.new(['b', 'a'], ['3', '4'], 'C')
+      @simple_decision_node.evaluate(query).should == 0.0
+    end
+    context 'mismatching attribute label' do
+      it 'should raise an error' do
+        query = Idhja22::Dataset::Datum.new(['b', 'a'], ['1', '2'], 'C')
+        expect {@simple_decision_node.evaluate(query)}.to raise_error(Idhja22::Dataset::Datum::UnknownAttributeLabel)
+      end
+    end
+    context 'unknown attribute value' do
+      it 'should raise an error' do
+        query = Idhja22::Dataset::Datum.new(['c', 'a'], ['3', '4'], 'C')
+        expect {@simple_decision_node.evaluate(query)}.to raise_error(Idhja22::Dataset::Datum::UnknownAttributeValue)
+      end
+    end
+  end
+  describe('.build') do
+    it 'should build a decision node based on the provided data' do
+      node = Idhja22::DecisionNode.build(@ds, @ds.attribute_labels, 0)
+      node.decision_attribute.should == "2"
+      node.branches.keys.should == ['a','b']
+    end
+    it 'should cleanup matching tails' do
+      ds = Idhja22::Dataset.from_csv(File.join(data_dir,'evenly_split.csv'))
+      node = Idhja22::DecisionNode.build(ds, ds.attribute_labels, 0)
+      node.get_rules.should == ['1 == a and then chance of C = 0.5', '1 == b and then chance of C = 0.5']
+    end
+  end
+  describe '#add_branch' do
+    it 'should add a branch for the given attribute value' do
+      node = Idhja22::DecisionNode.new 'attribute_name'
+      branch_node = Idhja22::DecisionNode.new 'other_name'
+      node.add_branch('value', branch_node)
+      node.branches.keys.should == ['value']
+      node.branches['value'].should == branch_node
+    end
+  end
+  describe '#cleanup_children!' do
+    context 'with matching output at level below' do
+      before(:all) do
+        @dn = Idhja22::DecisionNode.new('a')
+        @dn_below = Idhja22::DecisionNode.new('b')
+        @dn_below.add_branch('1', Idhja22::LeafNode.new(0.505, 'Category'))
+        @dn_below.add_branch('2', Idhja22::LeafNode.new(0.50, 'Category'))
+        @dn.add_branch('1', @dn_below)
+      end
+      it 'should merge any subnodes with same output into a single leafnode' do
+        @dn.cleanup_children!
+        @dn.branches['1'].should == Idhja22::LeafNode.new(0.505, 'Category')
+      end
+    end
+    context 'with matching output at two levels below' do
+      before(:all) do
+        @dn = Idhja22::DecisionNode.new('a')
+        @dn_1_below = Idhja22::DecisionNode.new('b')
+        @dn.add_branch('1', @dn_1_below)
+        @dn_2_below = Idhja22::DecisionNode.new('c')
+        @dn_1_below.add_branch('1', @dn_2_below)
+        @dn_2_below.add_branch('1', Idhja22::LeafNode.new(0.50, 'Category'))
+        @dn_2_below.add_branch('2', Idhja22::LeafNode.new(0.50, 'Category'))
+      end
+      it 'should merge nodes recusively' do
+        @dn.cleanup_children!
+        @dn.branches['1'].should == Idhja22::LeafNode.new(0.50, 'Category')
+      end
+    end
+    context 'with diverging branches that match internally' do
+      before(:all) do
+        @dn = Idhja22::DecisionNode.new('a')
+        dn_1_below = Idhja22::DecisionNode.new('b')
+        @dn.add_branch('1', dn_1_below)
+        dn_2_below = Idhja22::DecisionNode.new('c')
+        dn_1_below.add_branch('1', dn_2_below)
+        dn_2_below.add_branch('1', Idhja22::LeafNode.new(0.50, 'Category'))
+        dn_2_below.add_branch('2', Idhja22::LeafNode.new(0.50, 'Category'))
+        dn_2_below = Idhja22::DecisionNode.new('d')
+        dn_1_below.add_branch('2', dn_2_below)
+        dn_2_below.add_branch('1', Idhja22::LeafNode.new(0.70, 'Category'))
+        dn_2_below.add_branch('2', Idhja22::LeafNode.new(0.70, 'Category'))
+      end
+      it 'should merge nodes recusively' do
+        @dn.cleanup_children!
+        @dn.branches['1'].branches['1'].should == Idhja22::LeafNode.new(0.50, 'Category')
+        @dn.branches['1'].branches['2'].should == Idhja22::LeafNode.new(0.70, 'Category')
+      end
+    end
+    context 'without matching output' do
+      before(:all) do
+        @dn = Idhja22::DecisionNode.new('a')
+        @dn_below = Idhja22::DecisionNode.new('b')
+        @dn_below.add_branch('1', Idhja22::LeafNode.new(0.2, 'Category'))
+        @dn_below.add_branch('2', Idhja22::LeafNode.new(0.70, 'Category'))
+        @dn.add_branch('1', @dn_below)
+      end
+      it 'should do nothing' do
+        saved_rules = @dn.get_rules
+        @dn.cleanup_children!
+        @dn.get_rules.should == saved_rules
+      end
+    end
+  end
+end

data/spec/node/leaf_node_spec.rb ADDED Viewed

@@ -0,0 +1,53 @@
+require 'spec_helper'
+describe Idhja22::LeafNode do
+  describe('.new') do
+    it 'should store probability and category label' do
+      l = Idhja22::LeafNode.new(0.75, 'label')
+      l.probability.should == 0.75
+      l.category_label.should == 'label'
+    end
+  end
+  describe('#get_rules') do
+    it 'should return the probability' do
+      l = Idhja22::LeafNode.new(0.75, 'pudding')
+      l.get_rules.should == ['then chance of pudding = 0.75']
+    end
+  end
+  describe(' == ') do
+    let(:l1) { Idhja22::LeafNode.new(0.75, 'pudding') }
+    let(:l2) { Idhja22::LeafNode.new(0.75, 'pudding') }
+    let(:diff_l1) { Idhja22::LeafNode.new(0.7, 'pudding') }
+    let(:diff_l2) { Idhja22::LeafNode.new(0.75, 'starter') }
+    it 'should compare attributes' do
+      l1.should == l2
+      l1.should_not == diff_l1
+      l1.should_not == diff_l2
+    end
+  end
+  describe 'evaluate' do
+    let(:leaf) { Idhja22::LeafNode.new(0.6, 'pudding') }
+    it 'should return probability' do
+      query = Idhja22::Dataset::Datum.new(['high', 'gusty'], ['temperature', 'windy'], 'pudding')
+      leaf.evaluate(query).should == 0.6
+    end
+    context 'mismatching category labels' do
+      it 'should raise error' do
+        query = Idhja22::Dataset::Datum.new(['high', 'gusty'], ['temperature', 'windy'], 'tennis')
+        expect {leaf.evaluate(query)}.to raise_error(Idhja22::Dataset::Datum::UnknownCategoryLabel)
+      end
+    end
+  end
+  describe '#leaves' do
+    it 'should return self' do
+      leaf = Idhja22::LeafNode.new(0.6, 'pudding')
+      leaf.leaves.should == [leaf]
+    end
+  end
+end

data/spec/spec_helper.rb CHANGED Viewed

@@ -15,6 +15,10 @@ Configuration.for('spec', Idhja22.config) {
 Idhja22.configure('spec')
+def data_dir
+  File.dirname(__FILE__) + '/data/'
+end
 RSpec.configure do |config|
 end

data/spec/tree_spec.rb CHANGED Viewed

@@ -2,7 +2,7 @@ require 'spec_helper'
 describe Idhja22::Tree do
   before(:all) do
-    @ds = Idhja22::Dataset.from_csv(File.join(File.dirname(__FILE__),'large_spec_data.csv'))
+    @ds = Idhja22::Dataset.from_csv(File.join(data_dir,'large_spec_data.csv'))
   end
@@ -29,7 +29,7 @@ describe Idhja22::Tree do
     it 'should compare root nodes' do
       tree1 = Idhja22::Tree.train(@ds)
       tree2 = Idhja22::Tree.train(@ds)
-      diff_ds = Idhja22::Dataset.from_csv(File.join(File.dirname(__FILE__),'another_large_spec_data.csv'))
+      diff_ds = Idhja22::Dataset.from_csv(File.join(data_dir,'another_large_spec_data.csv'))
       diff_tree = Idhja22::Tree.train(diff_ds)
       tree1.should == tree2
       tree1.should_not == diff_tree
@@ -39,7 +39,7 @@ describe Idhja22::Tree do
   describe('.train_from_csv') do
     it 'should make the same tree as the one from the dataset' do
       tree = Idhja22::Tree.train(@ds)
-      csv_tree = Idhja22::Tree.train_from_csv(File.join(File.dirname(__FILE__),'large_spec_data.csv'))
+      csv_tree = Idhja22::Tree.train_from_csv(File.join(data_dir,'large_spec_data.csv'))
       tree.should == csv_tree
     end
   end
@@ -85,7 +85,7 @@ describe Idhja22::Tree do
   describe('.train_and_validate_from_csv') do
     it 'should make the same tree as the one from the dataset' do
-      csv_tree, validation_value = Idhja22::Tree.train_and_validate_from_csv(File.join(File.dirname(__FILE__),'large_spec_data.csv'), 0.75)
+      csv_tree, validation_value = Idhja22::Tree.train_and_validate_from_csv(File.join(data_dir,'large_spec_data.csv'), :"training-proportion" => 0.75)
       csv_tree.is_a?(Idhja22::Tree).should be_true
       (0..1).include?(validation_value).should be_true
     end

data/spec/version_spec.rb CHANGED Viewed

@@ -3,7 +3,7 @@ require 'spec_helper'
 describe Idhja22 do
   describe 'VERSION' do
     it 'should be current version' do
-      Idhja22::VERSION.should == '0.14.4'
+      Idhja22::VERSION.should == '1.1.0'
     end
   end
 end

metadata CHANGED Viewed

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: idhja22
 version: !ruby/object:Gem::Version
-  version: 0.14.4
+  version: 1.1.0
   prerelease:
 platform: ruby
 authors:
@@ -9,7 +9,7 @@ authors:
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2012-12-18 00:00:00.000000000 Z
+date: 2012-12-20 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: rspec
@@ -123,7 +123,7 @@ dependencies:
     - - ! '>='
       - !ruby/object:Gem::Version
         version: '0'
-description: Decision Trees
+description: Classifiers
 email:
 executables:
 - idhja22
@@ -141,6 +141,7 @@ files:
 - idhja22.gemspec
 - lib/idhja22.rb
 - lib/idhja22/bayes.rb
+- lib/idhja22/binary_classifier.rb
 - lib/idhja22/config/default.rb
 - lib/idhja22/dataset.rb
 - lib/idhja22/dataset/datum.rb
@@ -149,13 +150,15 @@ files:
 - lib/idhja22/tree.rb
 - lib/idhja22/tree/node.rb
 - lib/idhja22/version.rb
-- spec/another_large_spec_data.csv
 - spec/bayes_spec.rb
+- spec/data/another_large_spec_data.csv
+- spec/data/evenly_split.csv
+- spec/data/large_spec_data.csv
+- spec/data/spec_data.csv
 - spec/dataset/example_spec.rb
 - spec/dataset_spec.rb
-- spec/large_spec_data.csv
-- spec/node_spec.rb
-- spec/spec_data.csv
+- spec/node/decision_node_spec.rb
+- spec/node/leaf_node_spec.rb
 - spec/spec_helper.rb
 - spec/tree_spec.rb
 - spec/version_spec.rb
@@ -173,7 +176,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
       version: '0'
       segments:
       - 0
-      hash: -803768552583374641
+      hash: 3479458333568153307
 required_rubygems_version: !ruby/object:Gem::Requirement
   none: false
   requirements:
@@ -182,21 +185,23 @@ required_rubygems_version: !ruby/object:Gem::Requirement
       version: '0'
       segments:
       - 0
-      hash: -803768552583374641
+      hash: 3479458333568153307
 requirements: []
 rubyforge_project:
 rubygems_version: 1.8.24
 signing_key:
 specification_version: 3
-summary: A gem for creating decision trees
+summary: A gem for creating classifiers (decision trees and naive Bayes so far)
 test_files:
-- spec/another_large_spec_data.csv
 - spec/bayes_spec.rb
+- spec/data/another_large_spec_data.csv
+- spec/data/evenly_split.csv
+- spec/data/large_spec_data.csv
+- spec/data/spec_data.csv
 - spec/dataset/example_spec.rb
 - spec/dataset_spec.rb
-- spec/large_spec_data.csv
-- spec/node_spec.rb
-- spec/spec_data.csv
+- spec/node/decision_node_spec.rb
+- spec/node/leaf_node_spec.rb
 - spec/spec_helper.rb
 - spec/tree_spec.rb
 - spec/version_spec.rb

data/spec/node_spec.rb DELETED Viewed

@@ -1,97 +0,0 @@
-require 'spec_helper'
-describe Idhja22::LeafNode do
-  describe('.new') do
-    it 'should store probability and category label' do
-      l = Idhja22::LeafNode.new(0.75, 'label')
-      l.probability.should == 0.75
-      l.category_label.should == 'label'
-    end
-  end
-  describe('#get_rules') do
-    it 'should return the probability' do
-      l = Idhja22::LeafNode.new(0.75, 'pudding')
-      l.get_rules.should == ['then chance of pudding = 0.75']
-    end
-  end
-  describe(' == ') do
-    let(:l1) { Idhja22::LeafNode.new(0.75, 'pudding') }
-    let(:l2) { Idhja22::LeafNode.new(0.75, 'pudding') }
-    let(:diff_l1) { Idhja22::LeafNode.new(0.7, 'pudding') }
-    let(:diff_l2) { Idhja22::LeafNode.new(0.75, 'starter') }
-    it 'should compare attributes' do
-      l1.should == l2
-      l1.should_not == diff_l1
-      l1.should_not == diff_l2
-    end
-  end
-  describe 'evaluate' do
-    let(:leaf) { Idhja22::LeafNode.new(0.6, 'pudding') }
-    it 'should return probability' do
-      query = Idhja22::Dataset::Datum.new(['high', 'gusty'], ['temperature', 'windy'], 'pudding')
-      leaf.evaluate(query).should == 0.6
-    end
-    context 'mismatching category labels' do
-      it 'should raise error' do
-        query = Idhja22::Dataset::Datum.new(['high', 'gusty'], ['temperature', 'windy'], 'tennis')
-        expect {leaf.evaluate(query)}.to raise_error(Idhja22::Dataset::Datum::UnknownCategoryLabel)
-      end
-    end
-  end
-end
-describe Idhja22::DecisionNode do
-  before(:all) do
-    @ds = Idhja22::Dataset.from_csv(File.join(File.dirname(__FILE__),'large_spec_data.csv'))
-  end
-  describe('#get_rules') do
-    it 'should return a list of rules' do
-      l = Idhja22::DecisionNode.new(@ds.partition('2'), '3', [], 0, 0.75)
-      l.get_rules.should == ["3 == a and then chance of C = 0.75", "3 == b and then chance of C = 0.0"]
-    end
-  end
-  describe(' == ') do
-    let(:dn1) { Idhja22::DecisionNode.new(@ds.partition('2'), '2', [], 0, 0.75) }
-    let(:dn2) { Idhja22::DecisionNode.new(@ds.partition('2'), '2', [], 0, 0.75) }
-    let(:diff_dn1) { Idhja22::DecisionNode.new(@ds.partition('0'), '2', [], 0, 0.75) }
-    let(:diff_dn2) { Idhja22::DecisionNode.new(@ds.partition('3'), '3', [], 0, 0.75) }
-    it 'should compare ' do
-      dn1.should == dn2
-      dn1.should_not == diff_dn1
-      dn1.should_not == diff_dn2
-    end
-  end
-  describe 'evaluate' do
-    let(:dn) { Idhja22::DecisionNode.new(@ds.partition('2'), '3', [], 0, 0.75) }
-    it 'should follow node to probability' do
-      query = Idhja22::Dataset::Datum.new(['a', 'a'], ['3', '4'], 'C')
-      dn.evaluate(query).should == 0.75
-      query = Idhja22::Dataset::Datum.new(['b', 'a'], ['3', '4'], 'C')
-      dn.evaluate(query).should == 0.0
-    end
-    context 'mismatching attribute label' do
-      it 'should raise an error' do
-        query = Idhja22::Dataset::Datum.new(['b', 'a'], ['1', '2'], 'C')
-        expect {dn.evaluate(query)}.to raise_error(Idhja22::Dataset::Datum::UnknownAttributeLabel)
-      end
-    end
-    context 'unknown attribute value' do
-      it 'should raise an error' do
-        query = Idhja22::Dataset::Datum.new(['c', 'a'], ['3', '4'], 'C')
-        expect {dn.evaluate(query)}.to raise_error(Idhja22::Dataset::Datum::UnknownAttributeValue)
-      end
-    end
-  end
-end