RubyGems - dwarf - Versions diffs - 0.0.4 → 0.0.5 - Mend

dwarf 0.0.4 → 0.0.5

Files changed (16) hide show

data/Gemfile.lock +5 -1
data/Rakefile +1 -0
data/TODO.taskpaper +22 -0
data/dwarf.gemspec +2 -0
data/lib/dwarf.rb +3 -1
data/lib/dwarf/classifier.rb +111 -66
data/lib/dwarf/example_management.rb +32 -0
data/lib/dwarf/information.rb +61 -0
data/lib/dwarf/version.rb +1 -1
data/spec/dwarf/classifier_spec.rb +197 -0
data/spec/dwarf/information_spec.rb +157 -0
data/spec/frawd.rb +105 -0
data/spec/spec_helper.rb +59 -0
data/specs.watchr +60 -0
metadata +39 -4
data/spec/classifier_spec.rb +0 -80

data/Gemfile.lock CHANGED

@@ -1,13 +1,14 @@
 PATH
   remote: .
   specs:
-    dwarf (0.0.3)
+    dwarf (0.0.4)
       rubytree (>= 0.8.1)
 GEM
   remote: http://rubygems.org/
   specs:
     diff-lcs (1.1.2)
+    faker (0.3.1)
     rspec (2.0.1)
       rspec-core (~> 2.0.1)
       rspec-expectations (~> 2.0.1)
@@ -19,6 +20,7 @@ GEM
       rspec-core (~> 2.0.1)
       rspec-expectations (~> 2.0.1)
     rubytree (0.8.1)
+    watchr (0.7)
 PLATFORMS
   ruby
@@ -26,5 +28,7 @@ PLATFORMS
 DEPENDENCIES
   bundler (>= 1.0.0)
   dwarf!
+  faker (>= 0.3.1)
   rspec (>= 2.0.1)
   rubytree (>= 0.8.1)
+  watchr (>= 0.7)

data/Rakefile CHANGED

@@ -5,3 +5,4 @@ Bundler::GemHelper.install_tasks
 RSpec::Core::RakeTask.new(:spec) do
 end
+task :default => :spec

data/TODO.taskpaper ADDED

@@ -0,0 +1,22 @@
+Dwarf 1.0 Features:
+  Find all instances with a given classification given a world. @alex @priority(3)
+- Create queries to find all instances of a given classification. @priority(1)
+- Make logic of a query for a given classification readable (as Ruby, or SQL, or ...) @priority(1)
+- Generate large worlds with consistent instances to test against. @muness @priority(2)
+- Handle nested features (e.g. example.engine.cylinders) @priority(1)
+  Handle messy data well (nil examples, examples with nil features, examples with nil subfeatures) @alex @priority(2)
+  Resolve weird behavior when all examples are missing some attribute. @alex @sam @priority(1)
+- When attribute.nil? bisects a heterogeneous group, we should probably split on that attribute. @priority(1)
+  Refactor information theory methods out to enable unit testing. @alex @priority(3)
+  Eliminate features which are unique across all examples @sam @alex @priority(2)
+- Treat hashes as nested features. @priority(1)
+Dwarf Nice To Haves:
+- meta-features based on type, e.g. parity(car.engine.cylinders) can be :even or :odd @priority(2)
+- Bayesian classification of text fields. @priority(1)
+- Junk uniquely identifying features (implicit in info gain calculations? add tests to verify!) @priority(3)
+- Modular feature enumeration and feature fetching code. (Don't rely on attributes and example.attribute to be your only duck type checks!) @priority(2)
+- A world-generation tool to create internally consistent data sets to measure dwarf's learning against. Maybe we can call it "frawd". @priority(1)

data/dwarf.gemspec CHANGED

@@ -17,6 +17,8 @@ Gem::Specification.new do |s|
   s.add_dependency "rubytree", ">= 0.8.1"
   s.add_development_dependency "bundler", ">= 1.0.0"
   s.add_development_dependency "rspec", ">= 2.0.1"
+  s.add_development_dependency "watchr", ">= 0.7"
+  s.add_development_dependency "faker", ">= 0.3.1"
   s.files        = `git ls-files`.split("\n")
   s.executables  = `git ls-files`.split("\n").map{|f| f =~ /^bin\/(.*)/ ? $1 : nil}.compact

data/lib/dwarf.rb CHANGED

@@ -1,4 +1,6 @@
 module Dwarf
-  require 'dwarf/classifier'
+  require 'dwarf/example_management'
+  require 'dwarf/information'
   require 'dwarf/tree_node'
+  require 'dwarf/classifier'
 end

data/lib/dwarf/classifier.rb CHANGED

@@ -1,12 +1,15 @@
 module Dwarf
   class Classifier
     attr_accessor :examples
     attr_accessor :example_attributes
     attr_accessor :classifier_logic
+    attr_reader :decision_tree
     def initialize()
       @examples, @example_attributes = {}, []
       @decision_tree = TreeNode.new("ROOT")
+      @nil_name = Object.new.to_s
     end
     def add_examples(example_hash)
@@ -17,7 +20,7 @@ module Dwarf
     def add_example(example_record, classification)
       @examples[example_record]=classification
-      @example_attributes |= example_record.attributes
+      @example_attributes |= example_record.attribute_names
     end
     def classify(example)
@@ -26,37 +29,120 @@ module Dwarf
     def learn!
       @decision_tree.examples = @examples.keys
+      converge_tree
+      self.classifier_logic = codify_tree(@decision_tree)
+      implement_classify
+    end
+    def find_by_classification(world, classification)
+      matches = []
+      world.each do |instance|
+        if classify(instance) == classification
+          matches << instance
+        end
+      end
+      matches
+    end
+    private
+    include ExampleManagement
+    def converge_tree
       pending = []
       pending.push @decision_tree
-      used_attributes = []
       until pending.empty?
         node = pending.pop
         if classification = homogenous_examples(node)
           node.classification = classification
         elsif no_valuable_attributes?(node) && node.parent
-          node.parent.classification= expected_value(node.examples)
+          if split_nil_children = check_nil_split(node)
+            split_nil_children.each {|child_node| pending.push(child_node)}
+          else
+            create_expected_value(node)
+          end
         elsif no_valuable_attributes?(node)
-          classifier_logic = expected_value(node.examples)
+          node.classification = expected_value(node.examples)
         elsif false #stub branch
           #C4.5 would also allow for previously unseen classifications
-          #dwarf's API dictates all classifications are known before learning
-          #starts
+          #dwarf needs to correctly handle a pre-existing tree when
+          #learn! is called
         else
-          infogains = {}
-          (@example_attributes-used_attributes).each do |example_attribute|
-            infogains[information_gain(node.examples,example_attribute)] = example_attribute
-          end
-          best_gain = infogains.keys.sort[0]
-          best_attribute = infogains[best_gain]
-          split(node,best_attribute).each {|child_node| pending.push(child_node)}
-          used_attributes << best_attribute
+          split_children = homogenize_children(node)
+          split_children.each {|child_node| pending.push(child_node)}
         end
       end
-      self.classifier_logic = codify_tree(@decision_tree)
-      implement_classify
     end
-    private
+    def check_nil_split(node)
+      infogains = {}
+      used_attributes = used_attributes(node)
+      (filtered_attributes-used_attributes).each do |example_attribute|
+        infogains[Information::unfiltered_information_gain(node.examples,example_attribute,@examples)] =
+          example_attribute
+      end
+      best_gain = infogains.keys.sort[0]
+      best_attribute = infogains[best_gain]
+      if best_gain > 0.0
+        return split(node, best_attribute)
+      end
+    end
+    def create_expected_value(node)
+      new_node = TreeNode.new(node.name)
+      expected_value = expected_value(node.examples)
+      new_node.classification = expected_value
+      parent = node.parent
+      parent.remove! node
+      parent << new_node
+      new_node << node
+    end
+    def used_attributes(node)
+      if node.parentage
+        node.parentage.map { |parent| parent.attribute }
+      else
+        []
+      end
+    end
+    def attribute_homogeneous?(example_subset, attribute)
+      invert_with_dups(attribute_map(example_subset, attribute)).keys.size == 1
+    end
+    def heterogeneous_attributes
+      @example_attributes.reject { |attr| attribute_homogeneous?(@examples.keys, attr) }
+    end
+    def attribute_clusters?(example_subset, attribute)
+      invert_with_dups(attribute_map(example_subset, attribute)).keys.size == example_subset.size
+    end
+    def clustering_attributes
+      @example_attributes.select {|attr| attribute_clusters?(@examples.keys, attr) }
+    end
+    def filtered_attributes
+      clustering_attributes | heterogeneous_attributes
+    end
+    def homogenize_children(node)
+      infogains = {}
+      used_attributes = used_attributes(node)
+      (filtered_attributes-used_attributes).each do |example_attribute|
+        infogains[Information::information_gain(node.examples,example_attribute,@examples)] =
+          example_attribute
+      end
+      best_gain = infogains.keys.sort[0]
+      best_attribute = infogains[best_gain]
+      return split(node,best_attribute)
+    end
     def implement_classify
       classify_impl = "def classify(example)\n#{self.classifier_logic}\nend"
@@ -85,6 +171,7 @@ module Dwarf
     def codify_literal(object)
       case object
+        when @nil_name then "nil"
         when Symbol then ":#{object}"
         when String then "\"#{object}\""
       else
@@ -97,6 +184,9 @@ module Dwarf
       example_subset = node.examples
       examples_inversion = invert_with_dups(attribute_map(example_subset,attribute))
       examples_inversion.each do |key, value|
+        if key.nil?
+          key = @nil_name
+        end
         child_node = TreeNode.new(key)
         child_node.examples = value
         node << child_node
@@ -106,21 +196,20 @@ module Dwarf
     end
     def expected_value(example_subset)
-      examples_inversion = invert_with_dups(classification_map(example_subset))
+      examples_inversion = invert_with_dups(classification_map(example_subset, @examples))
       occurrences = examples_inversion.merge(examples_inversion) { |key, value| value.length }
       occurrences.keys.sort { |key| occurrences[key] }[0]
     end
     def no_valuable_attributes?(node)
-      @example_attributes.map {|example_attribute|
-        information_gain(node.examples, example_attribute)}.each {|info_gain|
+      filtered_attributes.map {|example_attribute|
+        Information::information_gain(node.examples, example_attribute, @examples)}.each {|info_gain|
         return false if info_gain != 0}
       return true
     end
     def homogenous_examples(node)
-      classifications = classifications(node.examples)
+      classifications = filter_classifications(@examples, node.examples)
       if classifications.length == 1
         return classifications[0]
       else
@@ -128,49 +217,5 @@ module Dwarf
       end
     end
-    def entropy(example_subset)
-      set_size = example_subset.length.to_f
-      examples_inversion = invert_with_dups(classification_map(example_subset))
-      occurences = examples_inversion.merge(examples_inversion) { |key, value| value.length.to_f }
-      0.0 - classifications(example_subset).inject(0.0) do |sum, classification|
-        sum + ((occurences[classification]/set_size)* Math.log2((occurences[classification]/set_size)))
-      end
-    end
-    def information_gain(example_subset,attribute)
-      set_size = example_subset.length.to_f
-      examples_inversion = invert_with_dups(attribute_map(example_subset,attribute))
-      occurrences = examples_inversion.merge(examples_inversion) { |key, value| value.length }
-      entropy(example_subset) - attribute_values(example_subset,attribute).inject(0.0) do |sum, attribute_value|
-        sum + (occurrences[attribute_value]/set_size) * entropy(examples_inversion[attribute_value])
-      end
-    end
-    def classifications(example_subset)
-      example_subset.map {|example| @examples[example]}.compact
-    end
-    def classification_map(example_subset)
-      classification_map = {}
-      example_subset.each {|example| classification_map[example] = @examples[example]}
-      classification_map
-    end
-    def attribute_values(example_subset, attribute)
-      example_subset.map {|example| example.method(attribute.to_sym).call}.compact
-    end
-    def attribute_map(example_subset, attribute)
-      example_map = {}
-      example_subset.each {|example| example_map[example] = example.method(attribute.to_sym).call}
-      example_map
-    end
-    def invert_with_dups(hash)
-      inversion = {}
-      hash.values.each {|value| inversion[value] = []}
-      hash.keys.each {|key| inversion[hash[key]] << key}
-      inversion
-    end
   end
 end

data/lib/dwarf/example_management.rb ADDED

@@ -0,0 +1,32 @@
+module Dwarf
+  module ExampleManagement
+    def classification_map(example_subset, classifications)
+      classification_map = {}
+      example_subset.each {|example| classification_map[example] = classifications[example]}
+      classification_map
+    end
+    def invert_with_dups(hash)
+      inversion = { }
+      hash.values.each {|value| inversion[value] = []}
+      hash.keys.each {|key| inversion[hash[key]] << key}
+      inversion
+    end
+    def eval_attribute(example,attribute)
+      example.method(attribute.to_sym).call
+    end
+    def attribute_map(example_subset, attribute)
+      example_map = {}
+      example_subset.each {|example| example_map[example] = eval_attribute(example, attribute)}
+      example_map
+    end
+    def filter_classifications(classifications,example_subset)
+      example_subset.map {|example| classifications[example]}.uniq
+    end
+  end
+end

data/lib/dwarf/information.rb ADDED

@@ -0,0 +1,61 @@
+module Dwarf
+  module Information
+    class<< self
+      include ExampleManagement
+      def entropy(example_subset, classifications)
+        seen_classifications = filter_classifications(classifications, example_subset)
+        return 0.0 if seen_classifications.length == 1
+        set_size = example_subset.length.to_f
+        examples_inversion = invert_with_dups(classification_map(example_subset, classifications))
+        occurrences = occurrences(examples_inversion)
+        sum_over(seen_classifications) do |classification|
+          frequency = occurrences[classification]/set_size
+          - frequency *  Math.log(frequency,seen_classifications.length)
+        end
+      end
+      def information_gain(example_subset, attribute, classifications)
+        filtered_example_subset = filter_for_missing_attribute(example_subset, attribute)
+        unfiltered_information_gain(filtered_example_subset, attribute, classifications)
+      end
+      def unfiltered_information_gain(example_subset, attribute, classifications)
+        set_size = example_subset.length.to_f
+        examples_inversion = invert_with_dups(attribute_map(example_subset,attribute))
+        occurrences = occurrences(examples_inversion)
+        heterogeneous_entropy = entropy(example_subset, classifications)
+        seen_attribute_values = attribute_values(example_subset,attribute)
+        heterogeneous_entropy -
+          sum_over(seen_attribute_values) do |attribute_value|
+          frequency = occurrences[attribute_value]/set_size
+          frequency * entropy(examples_inversion[attribute_value], classifications)
+        end
+      end
+      private
+      def sum_over(collection)
+        collection.inject(0.0) do |sum, classification|
+          sum + yield(classification)
+        end
+      end
+      def occurrences(examples_inversion)
+        examples_inversion.merge(examples_inversion) { |key, value| value.length.to_f }
+      end
+      def filter_for_missing_attribute(example_subset, attribute)
+        example_subset.reject { |example| eval_attribute(example,attribute).nil? }
+      end
+      def attribute_values(example_subset, attribute)
+        example_subset.map {|example| eval_attribute(example, attribute)}.uniq
+      end
+    end
+  end
+end

data/lib/dwarf/version.rb CHANGED

@@ -1,3 +1,3 @@
 module Dwarf
-  VERSION = "0.0.4"
+  VERSION = "0.0.5"
 end

data/spec/dwarf/classifier_spec.rb ADDED

@@ -0,0 +1,197 @@
+require File.join(File.dirname(__FILE__), *%w[.. spec_helper.rb])
+describe Dwarf::Classifier do
+  before(:each) do
+    @classifier = Dwarf::Classifier.new()
+  end
+  def mock_car_examples
+    @example1 = FakeCar.new(:body_style => :boxy,
+                            :cylinders => 4,
+                            :transmission => :manual)
+    @example2 = FakeCar.new(:body_style => :swoopy,
+                            :cylinders => 6,
+                            :transmission => :manual)
+    @example3 = FakeCar.new(:body_style => :angry,
+                            :cylinders => 8,
+                            :transmission => :manual)
+    @example4 = FakeCar.new(:body_style => :swoopy,
+                            :cylinders => 8,
+                            :transmission => :manual)
+    @example5 = FakeCar.new(:body_style => nil,
+                            :cylinders => 6,
+                            :transmission => :manual)
+    @example6 = FakeCar.new(:body_style => :sleek,
+                            :cylinders => nil,
+                            :transmission => :manual)
+  end
+  context "add_example" do
+    it "accepts example classifications" do
+      @classifier.should respond_to(:add_example)
+    end
+    it "stores examples" do
+      @example3 = double('example3')
+      @example3.stub(:attribute_names) { [] }
+      @classifier.add_example(@example3, :irish)
+      @classifier.examples.should include(@example3)
+    end
+    it "enumerate example attributes" do
+      @example_with_attributes = double('attrs')
+      @example_with_attributes.stub(:attribute_names) { ["height", "branch_density"] }
+      @classifier.add_example(@example_with_attributes, :pine)
+      @classifier.example_attributes.should include("height", "branch_density")
+    end
+    it "gracefully accepts examples with nil attributes" do
+      @example_with_nil_attributes = double('nils')
+      @example_with_nil_attributes.stub(:attribute_names) { ["height", "branch_density"] }
+      @example_with_nil_attributes.stub(:height) { nil }
+      @example_with_nil_attributes.stub(:branch_density) { :high }
+      lambda {@classifier.add_example(@example_with_nil_attributes, :pine)}.should_not raise_exception
+    end
+  end
+  context "learn! and classify" do
+    it "only implements classify on the learning instance" do
+      @example = double('example3')
+      @example.stub(:attribute_names) { [] }
+      @class2 = Dwarf::Classifier.new()
+      @classifier.add_example(@example, :round)
+      @classifier.learn!
+      @classifier.classify(@example) == :round
+      @class2.classify(@example).should  == nil
+    end
+    context "frawd is dwarf backwards" do
+      before(:each) do
+         @frawd = Frawd.new(1,100)
+       end
+      it "is totally awesome" do
+         @frawd.training.each do |example, classification|
+          @classifier.add_example(example, classification)
+        end
+        @classifier.learn!
+        success = 0
+        @frawd.testing.each do |example, classification|
+          success += 1 if @classifier.classify(example) == classification
+        end
+        success.should == @frawd.testing.size
+      end
+    end
+    context "classifying cars" do
+      before(:each) do
+        mock_car_examples
+        @classifier.add_example(@example1, :japanese)
+        @classifier.add_example(@example2, :german)
+        @classifier.add_example(@example3, :american)
+      end
+      it "classifies in a trivial case" do
+        @classifier.learn!
+        @classifier.classify(@example1).should == :japanese
+        @classifier.classify(@example2).should == :german
+        @classifier.classify(@example3).should == :american
+      end
+      it "classifies when multiple predicates required" do
+        @classifier.add_example(@example4, :german)
+        @classifier.learn!
+        @classifier.classify(@example1).should == :japanese
+        @classifier.classify(@example2).should == :german
+        @classifier.classify(@example3).should == :american
+        @classifier.classify(@example4).should == :german
+      end
+      it "handles nils gracefully" do
+        @classifier.add_examples(@example4 => :german,
+                                 @example5 => :japanese,
+                                 @example6 => :japanese)
+        lambda{@classifier.learn!}.should_not raise_exception
+      end
+      it "handles nils correctly" do
+        @classifier.add_examples(@example4 => :german,
+                                 @example5 => :japanese,
+                                 @example6 => :japanese)
+        @classifier.learn!
+        @classifier.classify(@example1).should == :japanese
+        @classifier.classify(@example2).should == :german
+        @classifier.classify(@example3).should == :american
+        @classifier.classify(@example4).should == :german
+        @classifier.classify(@example5).should == :japanese
+        @classifier.classify(@example6).should == :japanese
+      end
+      it "handles a feature missing from all examples correctly" do
+        @classifier.add_examples(@example4 => :german,
+                                 @example5 => :japanese,
+                                 @example6 => :japanese)
+        @classifier.learn!
+        open = [@classifier.decision_tree]
+        until open.empty?
+          current = open.pop
+          current.attribute.should_not == "wheel_diameter"
+          current.children.each {|child| open.push child}
+        end
+      end
+      it "does not use a feature which is identical across all examples" do
+        @classifier.add_examples(@example4 => :german,
+                                 @example5 => :japanese,
+                                 @example6 => :japanese)
+        @classifier.learn!
+        open = [@classifier.decision_tree]
+        until open.empty?
+          current = open.pop
+          current.attribute.should_not == "transmission"
+          current.children.each {|child| open.push child}
+        end
+      end
+      it "does not use a feature unique to each example" do
+        @classifier.add_examples(@example4 => :german,
+                                 @example5 => :japanese,
+                                 @example6 => :japanese)
+        @classifier.learn!
+        open = [@classifier.decision_tree]
+        until open.empty?
+          current = open.pop
+          current.attribute.should_not == "vin"
+          current.children.each {|child| open.push child}
+        end
+      end
+    end
+  end
+  context "find_by_classification" do
+    it "returns sets of cars based on class" do
+      mock_car_examples
+      @classifier.add_examples(@example1 => :japanese,
+                               @example2 => :german,
+                               @example3 => :american,
+                               @example4 => :german)
+      @classifier.learn!
+      all_cars = [@example1, @example2, @example3, @example4]
+      japanese_cars = @classifier.find_by_classification(all_cars, :japanese)
+      japanese_cars.should == [@example1]
+    end
+  end
+end

data/spec/dwarf/information_spec.rb ADDED

@@ -0,0 +1,157 @@
+require File.join(File.dirname(__FILE__), *%w[.. spec_helper.rb])
+describe Dwarf::Information do
+  class Deck
+    def initialize()
+      @draw = (1..52).map{|v| v}
+      @draw.shuffle!
+      @discard = []
+    end
+    def sample
+      unless @draw.empty?
+        card = @draw.pop
+        @discard.push card
+        return card
+      else
+        @draw = @discard
+        @draw.shuffle
+        @discard = []
+        return self.sample
+      end
+    end
+  end
+  class Coin
+    def initialize(weighting)
+      @weighting = weighting
+      @faces = [:heads, :tails]
+    end
+    def attributes
+      "weighting"
+    end
+    def weighting
+      @weighting
+    end
+    def sample
+      case @weighting
+      when :fair then @faces.sample
+      when :heads then :heads
+      when :tails then :tails
+      end
+    end
+  end
+  context "entropy" do
+    it "calculates correctly for heads and tails" do
+      examples = []
+      classifications = {}
+      coin = Coin.new(:fair)
+      1000.times do
+        obj = Object.new
+        examples << obj
+        classifications[obj] = coin.sample
+      end
+      entropy = Dwarf::Information.entropy(examples, classifications)
+      entropy.should > 0.99
+      entropy.should <= 1.0
+    end
+    it "calculates correctly for 1d6" do
+      examples = []
+      classifications = {}
+      die = (1..6).map{|v| v}
+      1000.times do
+        obj = Object.new
+        examples << obj
+        classifications[obj] = die.sample
+      end
+      entropy = Dwarf::Information.entropy(examples, classifications)
+      entropy.should > 0.99
+      entropy.should <= 1.0
+    end
+    it "calculates correctly for a deck of cards" do
+      examples = []
+      classifications = {}
+      deck = Deck.new
+      1000.times do
+        obj = Object.new
+        examples << obj
+        classifications[obj] = deck.sample
+      end
+      entropy = Dwarf::Information.entropy(examples, classifications)
+      entropy.should > 0.99
+      entropy.should <= 1.0
+    end
+    it "calculates correctly with a weighted coin" do
+      examples = []
+      classifications = {}
+      1000.times do
+        obj = Object.new
+        examples << obj
+        classifications[obj] = (rand(100) == 99) ? :heads : :tails
+      end
+      entropy = Dwarf::Information.entropy(examples,classifications)
+      entropy.should < 0.101 #With a perfect 99:1 distribution, entropy should == 0.0807...
+      entropy.should >= 0.04
+    end
+    it "calculates correctly with a homogenous set" do
+      examples = []
+      classifications = {}
+      1000.times do
+        obj = Object.new
+        examples << obj
+        classifications[obj] = :heads
+      end
+      entropy = Dwarf::Information.entropy(examples,classifications)
+      entropy.should == 0.0
+    end
+  end
+  context "information_gain" do
+    it "calculates correctly splitting perfectly weighted coins" do
+      examples = []
+      classifications = {}
+      500.times do
+        coin = Coin.new(:heads)
+        examples << coin
+        classifications[coin] = coin.sample
+      end
+      500.times do
+        coin = Coin.new(:tails)
+        examples << coin
+        classifications[coin] = coin.sample
+      end
+      information_gain = Dwarf::Information.information_gain(examples, "weighting", classifications)
+      information_gain.should == 1.0
+    end
+    it "calculates worthless infogame for fair weighted coins" do
+      examples = []
+      classifications = {}
+      coin = Coin.new(:fair)
+        1000.times do
+        coin = Coin.new(:fair)
+          examples << coin
+        classifications[coin] = coin.sample
+      end
+      information_gain = Dwarf::Information.information_gain(examples, "weighting", classifications)
+      information_gain.should == 0.0
+    end
+  end
+end

data/spec/frawd.rb ADDED

@@ -0,0 +1,105 @@
+#require File.join(File.dirname(__FILE__), *%w[. spec_helper.rb])
+require 'rspec/mocks'
+require 'faker'
+require 'digest'
+class Frawd
+  attr_reader :rules
+  def initialize(depth = 10, sample_sizes = 1000)
+    @depth = depth
+    @sample_sizes = sample_sizes
+    initialize_attributes
+    @leaves = []
+    @rules = build_rules
+    @rules.each_leaf do |leaf|
+      @leaves << leaf
+    end
+  end
+  def types
+    [:enum, :number, :text]
+  end
+  def enums
+    unless @enums
+      @enums = [[:true, :false],
+                [:baz, :bar, :zot],
+                [:baz, :bar, :zot, :quux]]
+      (1..rand(10)).each do
+        @enums << Faker::Lorem.words(rand(10)).uniq.map(&:to_sym)
+      end
+    end
+    @enums
+  end
+  def classifications
+    @classifications ||= (1..rand(10)).map {|x| "classification#{x}".to_sym }
+  end
+  def initialize_attributes
+    @attributes = []
+    num_attributes = 10#rand(100)
+    (1..num_attributes).each do |number|
+      type = types.sample
+      values = enums.sample if type == :enum
+      @attributes << ["attribute#{number}", type, values]
+    end
+  end
+  def filtered_attributes
+    @attributes.select {|a| a[1] == :enum}
+  end
+  def build_rules(node = Dwarf::TreeNode.new("ROOT"), attributes = filtered_attributes)
+    parents = node.parentage || []
+    if (rand(@depth) < parents.length) || attributes.empty?
+      node.classification = classifications.sample
+    else
+      attribute = attributes.sample
+      node.attribute = attribute[0]
+      attribute[2].each do |value|
+        child = Dwarf::TreeNode.new(value.to_s)
+        node << child
+        build_rules(child,attributes-[attribute[0]])
+      end
+    end
+    node
+  end
+  def generate_example
+    node = @leaves.sample
+    example_classification = node.classification
+    example = RSpec::Mocks::Mock.new('example')
+    node.parentage.unshift(node).each_cons(2) do |child, parent|
+      example.stub!(parent.attribute.to_sym) { child.name }
+      example.stub!(:attribute_names) { @attributes.map {|a| a[0]} }
+    end
+    @attributes.each do |attribute|
+      unless example.respond_to? attribute[0].to_sym
+        val = case attribute[1]
+              when :enum then attribute[2].sample
+              when :number then rand((2**(0.size * 8 -2) -1))
+              when :text then Faker::Lorem.paragraphs
+              end
+        example.stub!(attribute[0].to_sym) { val }
+      end
+    end
+    [ example, example_classification ]
+  end
+  def generate_examples(count)
+    examples = Array.new(count)
+    (0...count).each { |index| examples[index] = generate_example }
+    examples
+  end
+  def training
+    @training ||= generate_examples(@sample_sizes)
+  end
+  def testing
+    @testing ||= generate_examples(@sample_sizes)
+  end
+end

data/spec/spec_helper.rb CHANGED

@@ -1 +1,60 @@
 require File.join(File.dirname(__FILE__), *%w[.. lib dwarf])
+require File.join(File.dirname(__FILE__), *%w[. frawd])
+# http://blog.jayfields.com/2007/04/ruby-assigning-instance-variables-in.html
+class Module
+  def initializer(*args, &block)
+    define_method :initialize do |*ctor_args|
+      ctor_named_args = (ctor_args.last.is_a?(Hash) ? ctor_args.pop : {})
+      (0..args.size).each do |index|
+        instance_variable_set("@#{args[index]}", ctor_args[index])
+      end
+      ctor_named_args.each_pair do |param_name, param_value|
+        instance_variable_set("@#{param_name}", param_value)
+      end
+    end
+  end
+end
+class FakeCar
+  initializer :body_style, :cylinders, :wheel_diameter, :transmission
+  attr_accessor :body_style, :cylinders, :wheel_diameter, :transmission
+  @@vin_counter = 0
+  def vin
+    @vin ||= @@vin_counter+=1
+  end
+  def attributes
+    ["body_style", "cylinders", "wheel_diameter", "transmission", "vin"]
+  end
+  alias_method :attribute_names, :attributes
+  def to_s
+    "#{body_style} with #{cylinders} cylinders"
+  end
+  def self.valid_body_styles
+    [:boxy, :swoopy, :angry, :boring]
+  end
+  def self.valid_cylinders
+    [4, 6, 8]
+  end
+  def self.fake
+    new(:body_style => valid_body_styles.sample,
+        :cylinders => valid_cylinders.sample)
+  end
+  def self.multiple_fakes(how_many=5)
+    array = []
+    how_many.times do
+      array << fake
+    end
+    array
+  end
+end

data/specs.watchr ADDED

@@ -0,0 +1,60 @@
+# Run me with:
+#
+# $ watchr specs.watchr
+# --------------------------------------------------
+# Convenience Methods
+# --------------------------------------------------
+# Returns the paths of all spec files under spec/, at any depth, relative to
+# the current working directory.
+def all_spec_files
+  Dir.glob('spec/**/*_spec.rb')
+end
+# Runs every spec file whose path contains thing_to_match (ignoring case),
+# or prints a notice when no spec file matches.
+#
+# thing_to_match - literal path fragment, such as the capture group of a
+#                  watch rule.
+def run_spec_matching(thing_to_match)
+  # Escape the fragment so characters like "+", "(" or "[" in a file name are
+  # matched literally rather than being parsed as regex syntax.
+  pattern = Regexp.new(Regexp.escape(thing_to_match.to_s), Regexp::IGNORECASE)
+  matches = all_spec_files.grep(pattern)
+  if matches.empty?
+    puts "Sorry, thanks for playing, but there were no matches for #{thing_to_match}"
+  else
+    run matches.join(' ')
+  end
+end
+# Announces and executes rspec for the given files, then clears the pending
+# interrupt flag.
+#
+# files_to_run - space-separated spec file paths, interpolated into the
+#                shell command as-is.
+def run(files_to_run)
+  puts("Running: #{files_to_run}")
+  command = "clear;rspec -cfs #{files_to_run}"
+  system(command)
+  no_int_for_you
+end
+# Runs the complete spec suite in a single rspec invocation.
+def run_all_specs
+  spec_list = all_spec_files.join(' ')
+  run(spec_list)
+end
+# --------------------------------------------------
+# Watchr Rules
+# --------------------------------------------------
+# A changed spec or lib file re-runs the specs whose path matches the
+# captured fragment.
+watch('^spec/(.*)_spec\.rb') do |m|
+  run_spec_matching(m[1])
+end
+watch('^lib/(.*)\.rb') do |m|
+  run_spec_matching(m[1])
+end
+# Changes to shared spec infrastructure re-run the whole suite.
+watch('^spec/spec_helper\.rb') do
+  run_all_specs
+end
+watch('^spec/frawd\.rb') do
+  run_all_specs
+end
+watch('^spec/support/.*\.rb') do
+  run_all_specs
+end
+# --------------------------------------------------
+# Signal Handling
+# --------------------------------------------------
+# Clears the "an INT was already received" flag, so the next INT is treated
+# as a first one by the INT trap handler. Called at the end of every run.
+def no_int_for_you
+  @sent_an_int = nil
+end
+# INT handling: the first INT warns, pauses briefly and re-runs the whole
+# suite; an INT received while the flag is still set shuts the process down.
+Signal.trap('INT') do
+  if @sent_an_int
+    puts " A second INT? Ok, I get the message. Shutting down now."
+    exit
+  end
+  puts " Did you just send me an INT? Ugh. I'll quit for real if you do it again."
+  @sent_an_int = true
+  # Short window in which a second INT can arrive before the suite restarts.
+  Kernel.sleep 1.5
+  run_all_specs
+end
+# vim:ft=ruby

metadata CHANGED

@@ -5,8 +5,8 @@ version: !ruby/object:Gem::Version
   segments:
   - 0
   - 0
-  - 4
-  version: 0.0.4
+  - 5
+  version: 0.0.5
 platform: ruby
 authors:
 - Alex Redington
@@ -14,7 +14,7 @@ autorequire:
 bindir: bin
 cert_chain: []
-date: 2010-10-22 00:00:00 -04:00
+date: 2010-11-05 00:00:00 -04:00
 default_executable:
 dependencies:
 - !ruby/object:Gem::Dependency
@@ -62,6 +62,35 @@ dependencies:
         version: 2.0.1
   type: :development
   version_requirements: *id003
+- !ruby/object:Gem::Dependency
+  name: watchr
+  prerelease: false
+  requirement: &id004 !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        segments:
+        - 0
+        - 7
+        version: "0.7"
+  type: :development
+  version_requirements: *id004
+- !ruby/object:Gem::Dependency
+  name: faker
+  prerelease: false
+  requirement: &id005 !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        segments:
+        - 0
+        - 3
+        - 1
+        version: 0.3.1
+  type: :development
+  version_requirements: *id005
 description: Dwarf is an implementation of decision tree learning algorithms targeted for use in the Rails 3 console environment for classifying ActiveRecord objects.
 email:
 - aredington@gmail.com
@@ -77,13 +106,19 @@ files:
 - Gemfile.lock
 - README.md
 - Rakefile
+- TODO.taskpaper
 - dwarf.gemspec
 - lib/dwarf.rb
 - lib/dwarf/classifier.rb
+- lib/dwarf/example_management.rb
+- lib/dwarf/information.rb
 - lib/dwarf/tree_node.rb
 - lib/dwarf/version.rb
-- spec/classifier_spec.rb
+- spec/dwarf/classifier_spec.rb
+- spec/dwarf/information_spec.rb
+- spec/frawd.rb
 - spec/spec_helper.rb
+- specs.watchr
 has_rdoc: true
 homepage: http://github.com/aredington/dwarf
 licenses: []

data/spec/classifier_spec.rb DELETED

@@ -1,80 +0,0 @@
-require File.join(File.dirname(__FILE__), *%w[spec_helper])
-describe Dwarf::Classifier do
-  before(:each) do
-    @classifier = Dwarf::Classifier.new()
-  end
-  it "accepts example classifications" do
-    @classifier.should respond_to(:add_example)
-  end
-  it "stores examples" do
-    @example3 = double('example3')
-    @example3.stub(:attributes) { [] }
-    @classifier.add_example(@example3, :irish)
-    @classifier.examples.should include(@example3)
-  end
-  it "only implements classify on the learning instance" do
-    @example = double('example3')
-    @example.stub(:attributes) { [] }
-    @class2 = Dwarf::Classifier.new()
-    @classifier.add_example(@example, :round)
-    @classifier.learn!
-    @classifier.classify(@example).should eq(:round)
-    @class2.classify(@example).should eq(nil)
-  end
-  context "classifying cars" do
-    def mock_car_examples
-      @example1 = double('example1')
-      @example1.stub(:body_style) { :boxy }
-      @example1.stub(:cylinders) { 4 }
-      @example1.stub(:attributes) { ["body_style", "cylinders"] }
-      @example2 = double('example2')
-      @example2.stub(:body_style) { :swoopy }
-      @example2.stub(:cylinders) { 6 }
-      @example2.stub(:attributes) { ["body_style", "cylinders"] }
-      @example3 = double('example3')
-      @example3.stub(:body_style) { :angry }
-      @example3.stub(:cylinders) { 8 }
-      @example3.stub(:attributes) { ["body_style", "cylinders"] }
-      @example4 = double('example4')
-      @example4.stub(:body_style) {:swoopy}
-      @example4.stub(:cylinders) {8}
-      @example4.stub(:attributes) { ["body_style", "cylinders"] }
-    end
-    it "enumerate example attributes" do
-      mock_car_examples
-      @classifier.add_example(@example1, :japanese)
-      @classifier.example_attributes.should include("body_style", "cylinders")
-    end
-    it "classifies in a trivial case" do
-      mock_car_examples
-      @classifier.add_example(@example1, :japanese)
-      @classifier.add_example(@example2, :german)
-      @classifier.add_example(@example3, :american)
-      @classifier.learn!
-      @classifier.classify(@example1).should eq(:japanese)
-      @classifier.classify(@example2).should eq(:german)
-      @classifier.classify(@example3).should eq(:american)
-    end
-    it "classifies when multiple predicates required" do
-      mock_car_examples
-      @classifier.add_examples(@example1 => :japanese, @example2 => :german, @example3 => :american, @example4 => :german)
-      @classifier.learn!
-      @classifier.classify(@example1).should eq(:japanese)
-      @classifier.classify(@example2).should eq(:german)
-      @classifier.classify(@example3).should eq(:american)
-      @classifier.classify(@example4).should eq(:german)
-    end
-  end
-end