RubyGems - dwarf - Versions diffs - 0.0.4 → 0.0.5 - Mend

dwarf 0.0.4 → 0.0.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (16) hide show

data/Gemfile.lock +5 -1
data/Rakefile +1 -0
data/TODO.taskpaper +22 -0
data/dwarf.gemspec +2 -0
data/lib/dwarf.rb +3 -1
data/lib/dwarf/classifier.rb +111 -66
data/lib/dwarf/example_management.rb +32 -0
data/lib/dwarf/information.rb +61 -0
data/lib/dwarf/version.rb +1 -1
data/spec/dwarf/classifier_spec.rb +197 -0
data/spec/dwarf/information_spec.rb +157 -0
data/spec/frawd.rb +105 -0
data/spec/spec_helper.rb +59 -0
data/specs.watchr +60 -0
metadata +39 -4
data/spec/classifier_spec.rb +0 -80

data/Gemfile.lock CHANGED

@@ -1,13 +1,14 @@
 PATH
   remote: .
   specs:
-    dwarf (0.0.3)
+    dwarf (0.0.4)
       rubytree (>= 0.8.1)
 GEM
   remote: http://rubygems.org/
   specs:
     diff-lcs (1.1.2)
+    faker (0.3.1)
     rspec (2.0.1)
       rspec-core (~> 2.0.1)
       rspec-expectations (~> 2.0.1)
@@ -19,6 +20,7 @@ GEM
       rspec-core (~> 2.0.1)
       rspec-expectations (~> 2.0.1)
     rubytree (0.8.1)
+    watchr (0.7)
 PLATFORMS
   ruby
@@ -26,5 +28,7 @@ PLATFORMS
 DEPENDENCIES
   bundler (>= 1.0.0)
   dwarf!
+  faker (>= 0.3.1)
   rspec (>= 2.0.1)
   rubytree (>= 0.8.1)
+  watchr (>= 0.7)

data/Rakefile CHANGED

@@ -5,3 +5,4 @@ Bundler::GemHelper.install_tasks
 RSpec::Core::RakeTask.new(:spec) do
 end
+task :default => :spec

data/TODO.taskpaper ADDED

@@ -0,0 +1,22 @@
+Dwarf 1.0 Features:
+  Find all instances with a given classification given a world. @alex @priority(3)
+- Create queries to find all instances of a given classification. @priority(1)
+- Make logic of a query for a given classification readable (as Ruby, or SQL, or ...) @priority(1)
+- Generate large worlds with consistent instances to test against. @muness @priority(2)
+- Handle nested features (e.g. example.engine.cylinders) @priority(1)
+  Handle messy data well (nil examples, examples with nil features, examples with nil subfeatures) @alex @priority(2)
+  Resolve weird behavior when all examples are missing some attribute. @alex @sam @priority(1)
+- When attribute.nil? bisects a heterogenous group, we should probably split on that attribute. @priority(1)
+  Refactor information theory methods out to enable unit testing. @alex @priority(3)
+  Eliminate features which are unique across all examples @sam @alex @priority(2)
+- Treat hashes as nested features. @priority(1)
+Dwarf Nice To Haves:
+- meta-features based on type, e.g. parity(car.engine.cylinders) can be :even or :odd @priority(2)
+- Bayesian classification of text fields. @priority(1)
+- Junk uniquely identifying features (implicit in info gain calculations? add tests to verify!) @priority(3)
+- Modular feature enumeration and feature fetching code. (Don't rely on attributes and example.attribute to be your only duck type checks!) @priority(2)
+- A world-generation tool to create internally consistent data sets to measure dwarf's learning against. Maybe we can call it "frawd". @priority(1)

data/dwarf.gemspec CHANGED

@@ -17,6 +17,8 @@ Gem::Specification.new do |s|
   s.add_dependency "rubytree", ">= 0.8.1"
   s.add_development_dependency "bundler", ">= 1.0.0"
   s.add_development_dependency "rspec", ">= 2.0.1"
+  s.add_development_dependency "watchr", ">= 0.7"
+  s.add_development_dependency "faker", ">= 0.3.1"
   s.files        = `git ls-files`.split("\n")
   s.executables  = `git ls-files`.split("\n").map{|f| f =~ /^bin\/(.*)/ ? $1 : nil}.compact

data/lib/dwarf.rb CHANGED

@@ -1,4 +1,6 @@
 module Dwarf
-  require 'dwarf/classifier'
+  require 'dwarf/example_management'
+  require 'dwarf/information'
   require 'dwarf/tree_node'
+  require 'dwarf/classifier'
 end

data/lib/dwarf/classifier.rb CHANGED

@@ -1,12 +1,15 @@
 module Dwarf
   class Classifier
     attr_accessor :examples
     attr_accessor :example_attributes
     attr_accessor :classifier_logic
+    attr_reader :decision_tree
     def initialize()
       @examples, @example_attributes = {}, []
       @decision_tree = TreeNode.new("ROOT")
+      @nil_name = Object.new.to_s
     end
     def add_examples(example_hash)
@@ -17,7 +20,7 @@ module Dwarf
     def add_example(example_record, classification)
       @examples[example_record]=classification
-      @example_attributes |= example_record.attributes
+      @example_attributes |= example_record.attribute_names
     end
     def classify(example)
@@ -26,37 +29,120 @@ module Dwarf
     def learn!
       @decision_tree.examples = @examples.keys
+      converge_tree
+      self.classifier_logic = codify_tree(@decision_tree)
+      implement_classify
+    end
+    # Collects, in iteration order, every instance of +world+ for which
+    # #classify returns +classification+.
+    def find_by_classification(world, classification)
+      matches = []
+      world.each { |instance| matches << instance if classify(instance) == classification }
+      matches
+    end
+    private
+    include ExampleManagement
+    def converge_tree
       pending = []
       pending.push @decision_tree
-      used_attributes = []
       until pending.empty?
         node = pending.pop
         if classification = homogenous_examples(node)
           node.classification = classification
         elsif no_valuable_attributes?(node) && node.parent
-          node.parent.classification= expected_value(node.examples)
+          if split_nil_children = check_nil_split(node)
+            split_nil_children.each {|child_node| pending.push(child_node)}
+          else
+            create_expected_value(node)
+          end
         elsif no_valuable_attributes?(node)
-          classifier_logic = expected_value(node.examples)
+          node.classification = expected_value(node.examples)
         elsif false #stub branch
           #C4.5 would also allow for previously unseen classifications
-          #dwarf's API dictates all classifications are known before learning
-          #starts
+          #dwarf needs to correctly handle a pre-existing tree when
+          #learn! is called
         else
-          infogains = {}
-          (@example_attributes-used_attributes).each do |example_attribute|
-            infogains[information_gain(node.examples,example_attribute)] = example_attribute
-          end
-          best_gain = infogains.keys.sort[0]
-          best_attribute = infogains[best_gain]
-          split(node,best_attribute).each {|child_node| pending.push(child_node)}
-          used_attributes << best_attribute
+          split_children = homogenize_children(node)
+          split_children.each {|child_node| pending.push(child_node)}
         end
       end
-      self.classifier_logic = codify_tree(@decision_tree)
-      implement_classify
     end
-    private
+    # Attempts to split +node+ using the unfiltered information gain, i.e.
+    # without first discarding examples whose attribute value is nil.
+    # Candidates are the filtered attributes not already used by an ancestor.
+    # Returns the result of #split when the selected gain is positive, and
+    # nil otherwise (including when there are no candidate attributes left).
+    def check_nil_split(node)
+      infogains = {}
+      used_attributes = used_attributes(node)
+      (filtered_attributes-used_attributes).each do |example_attribute|
+        infogains[Information::unfiltered_information_gain(node.examples,example_attribute,@examples)] =
+          example_attribute
+      end
+      # With no candidates best_gain would be nil and the comparison below
+      # would raise NoMethodError; report "no split" instead.
+      return nil if infogains.empty?
+      # NOTE(review): sort[0] selects the smallest gain, and keying the hash
+      # by gain collapses attributes with equal gain -- confirm both are intended.
+      best_gain = infogains.keys.sort[0]
+      best_attribute = infogains[best_gain]
+      if best_gain > 0.0
+        return split(node, best_attribute)
+      end
+    end
+    # Inserts a new node between +node+ and its parent. The new node takes
+    # +node+'s name and is classified with the expected value of +node+'s
+    # examples; +node+ itself is re-attached as a child of the new node.
+    def create_expected_value(node)
+      new_node = TreeNode.new(node.name)
+      expected_value = expected_value(node.examples)
+      new_node.classification = expected_value
+      parent = node.parent
+      # Detach node from its parent, then hang it beneath the replacement.
+      parent.remove! node
+      parent << new_node
+      new_node << node
+    end
+    # Lists the attribute recorded on each entry of +node.parentage+;
+    # returns an empty array when +node.parentage+ is nil or false.
+    def used_attributes(node)
+      ancestors = node.parentage
+      return [] unless ancestors
+      ancestors.map { |ancestor| ancestor.attribute }
+    end
+    # True when the examples in +example_subset+ yield exactly one distinct
+    # value for +attribute+.
+    def attribute_homogeneous?(example_subset, attribute)
+      invert_with_dups(attribute_map(example_subset, attribute)).keys.size == 1
+    end
+    # Known attributes that take more than one distinct value (or none)
+    # across all stored examples, i.e. those not attribute_homogeneous?.
+    def heterogeneous_attributes
+      @example_attributes.reject { |attr| attribute_homogeneous?(@examples.keys, attr) }
+    end
+    # True when the number of distinct values of +attribute+ equals the
+    # number of examples in +example_subset+ (every example has its own value).
+    def attribute_clusters?(example_subset, attribute)
+      invert_with_dups(attribute_map(example_subset, attribute)).keys.size == example_subset.size
+    end
+    # Known attributes for which attribute_clusters? holds over all stored
+    # examples.
+    def clustering_attributes
+      @example_attributes.select {|attr| attribute_clusters?(@examples.keys, attr) }
+    end
+    # Attributes considered when splitting: the union of the clustering and
+    # the heterogeneous attributes.
+    # NOTE(review): this is a union, so attributes unique to each example stay
+    # in the candidate list -- confirm a difference was not intended.
+    def filtered_attributes
+      clustering_attributes | heterogeneous_attributes
+    end
+    # Splits +node+ on an attribute chosen by information gain. Candidates are
+    # the filtered attributes not already used by an ancestor of +node+.
+    # Returns whatever #split returns for the chosen attribute.
+    def homogenize_children(node)
+      infogains = {}
+      used_attributes = used_attributes(node)
+      (filtered_attributes-used_attributes).each do |example_attribute|
+        infogains[Information::information_gain(node.examples,example_attribute,@examples)] =
+          example_attribute
+      end
+      # NOTE(review): sort[0] is the smallest gain, and attributes with equal
+      # gain overwrite each other in the hash -- confirm this selection rule.
+      best_gain = infogains.keys.sort[0]
+      best_attribute = infogains[best_gain]
+      return split(node,best_attribute)
+    end
     def implement_classify
       classify_impl = "def classify(example)\n#{self.classifier_logic}\nend"
@@ -85,6 +171,7 @@ module Dwarf
     def codify_literal(object)
       case object
+        when @nil_name then "nil"
         when Symbol then ":#{object}"
         when String then "\"#{object}\""
       else
@@ -97,6 +184,9 @@ module Dwarf
       example_subset = node.examples
       examples_inversion = invert_with_dups(attribute_map(example_subset,attribute))
       examples_inversion.each do |key, value|
+        if key.nil?
+          key = @nil_name
+        end
         child_node = TreeNode.new(key)
         child_node.examples = value
         node << child_node
@@ -106,21 +196,20 @@ module Dwarf
     end
     def expected_value(example_subset)
-      examples_inversion = invert_with_dups(classification_map(example_subset))
+      examples_inversion = invert_with_dups(classification_map(example_subset, @examples))
       occurrences = examples_inversion.merge(examples_inversion) { |key, value| value.length }
       occurrences.keys.sort { |key| occurrences[key] }[0]
     end
     def no_valuable_attributes?(node)
-      @example_attributes.map {|example_attribute|
-        information_gain(node.examples, example_attribute)}.each {|info_gain|
+      filtered_attributes.map {|example_attribute|
+        Information::information_gain(node.examples, example_attribute, @examples)}.each {|info_gain|
         return false if info_gain != 0}
       return true
     end
     def homogenous_examples(node)
-      classifications = classifications(node.examples)
+      classifications = filter_classifications(@examples, node.examples)
       if classifications.length == 1
         return classifications[0]
       else
@@ -128,49 +217,5 @@ module Dwarf
       end
     end
-    def entropy(example_subset)
-      set_size = example_subset.length.to_f
-      examples_inversion = invert_with_dups(classification_map(example_subset))
-      occurences = examples_inversion.merge(examples_inversion) { |key, value| value.length.to_f }
-      0.0 - classifications(example_subset).inject(0.0) do |sum, classification|
-        sum + ((occurences[classification]/set_size)* Math.log2((occurences[classification]/set_size)))
-      end
-    end
-    def information_gain(example_subset,attribute)
-      set_size = example_subset.length.to_f
-      examples_inversion = invert_with_dups(attribute_map(example_subset,attribute))
-      occurrences = examples_inversion.merge(examples_inversion) { |key, value| value.length }
-      entropy(example_subset) - attribute_values(example_subset,attribute).inject(0.0) do |sum, attribute_value|
-        sum + (occurrences[attribute_value]/set_size) * entropy(examples_inversion[attribute_value])
-      end
-    end
-    def classifications(example_subset)
-      example_subset.map {|example| @examples[example]}.compact
-    end
-    def classification_map(example_subset)
-      classification_map = {}
-      example_subset.each {|example| classification_map[example] = @examples[example]}
-      classification_map
-    end
-    def attribute_values(example_subset, attribute)
-      example_subset.map {|example| example.method(attribute.to_sym).call}.compact
-    end
-    def attribute_map(example_subset, attribute)
-      example_map = {}
-      example_subset.each {|example| example_map[example] = example.method(attribute.to_sym).call}
-      example_map
-    end
-    def invert_with_dups(hash)
-      inversion = {}
-      hash.values.each {|value| inversion[value] = []}
-      hash.keys.each {|key| inversion[hash[key]] << key}
-      inversion
-    end
   end
 end

data/lib/dwarf/example_management.rb ADDED

@@ -0,0 +1,32 @@
+module Dwarf
+  # Helpers for relating examples to their classifications and attribute
+  # values. Mixed into the classifier and the information-theory code.
+  module ExampleManagement
+    # Maps each example in +example_subset+ to +classifications[example]+.
+    def classification_map(example_subset, classifications)
+      example_subset.inject({}) do |mapping, example|
+        mapping[example] = classifications[example]
+        mapping
+      end
+    end
+    # Inverts +hash+ without losing duplicates: each distinct value maps to
+    # the array of keys that pointed at it, in the hash's key order.
+    def invert_with_dups(hash)
+      hash.keys.inject({}) do |inversion, key|
+        (inversion[hash[key]] ||= []) << key
+        inversion
+      end
+    end
+    # Calls the method named +attribute+ on +example+ and returns its result.
+    def eval_attribute(example,attribute)
+      reader = example.method(attribute.to_sym)
+      reader.call
+    end
+    # Maps each example in +example_subset+ to its value for +attribute+.
+    def attribute_map(example_subset, attribute)
+      example_subset.inject({}) do |mapping, example|
+        mapping[example] = eval_attribute(example, attribute)
+        mapping
+      end
+    end
+    # Distinct classifications (first-seen order) of the given examples.
+    def filter_classifications(classifications,example_subset)
+      seen = example_subset.map { |example| classifications[example] }
+      seen.uniq
+    end
+  end
+end

data/lib/dwarf/information.rb ADDED

@@ -0,0 +1,61 @@
+module Dwarf
+  # Information-theory helpers used to choose split attributes. All methods
+  # are defined on the module's singleton (call as Information.entropy etc.).
+  module Information
+    class<< self
+      include ExampleManagement
+      # Entropy of the classifications of +example_subset+, where
+      # +classifications+ maps example => classification. The logarithm base
+      # is the number of distinct classifications seen in the subset.
+      def entropy(example_subset, classifications)
+        seen_classifications = filter_classifications(classifications, example_subset)
+        # A single seen classification means no uncertainty; returning early
+        # also avoids taking a logarithm with base 1.
+        return 0.0 if seen_classifications.length == 1
+        set_size = example_subset.length.to_f
+        examples_inversion = invert_with_dups(classification_map(example_subset, classifications))
+        occurrences = occurrences(examples_inversion)
+        sum_over(seen_classifications) do |classification|
+          frequency = occurrences[classification]/set_size
+          - frequency *  Math.log(frequency,seen_classifications.length)
+        end
+      end
+      # Information gain of +attribute+ computed only over the examples whose
+      # value for +attribute+ is not nil.
+      def information_gain(example_subset, attribute, classifications)
+        filtered_example_subset = filter_for_missing_attribute(example_subset, attribute)
+        unfiltered_information_gain(filtered_example_subset, attribute, classifications)
+      end
+      # Entropy of +example_subset+ minus the frequency-weighted entropy of
+      # each group of examples sharing a value of +attribute+. Examples with
+      # a nil value form a group of their own here.
+      def unfiltered_information_gain(example_subset, attribute, classifications)
+        set_size = example_subset.length.to_f
+        examples_inversion = invert_with_dups(attribute_map(example_subset,attribute))
+        occurrences = occurrences(examples_inversion)
+        heterogeneous_entropy = entropy(example_subset, classifications)
+        seen_attribute_values = attribute_values(example_subset,attribute)
+        heterogeneous_entropy -
+          sum_over(seen_attribute_values) do |attribute_value|
+          frequency = occurrences[attribute_value]/set_size
+          frequency * entropy(examples_inversion[attribute_value], classifications)
+        end
+      end
+      private
+      # Sums the block's result over +collection+, starting from 0.0.
+      def sum_over(collection)
+        collection.inject(0.0) do |sum, classification|
+          sum + yield(classification)
+        end
+      end
+      # Replaces each array in an inverted hash with its length as a Float.
+      def occurrences(examples_inversion)
+        examples_inversion.merge(examples_inversion) { |key, value| value.length.to_f }
+      end
+      # Drops the examples whose value for +attribute+ is nil.
+      def filter_for_missing_attribute(example_subset, attribute)
+        example_subset.reject { |example| eval_attribute(example,attribute).nil? }
+      end
+      # Distinct values of +attribute+ among the examples (nil included).
+      def attribute_values(example_subset, attribute)
+        example_subset.map {|example| eval_attribute(example, attribute)}.uniq
+      end
+    end
+  end
+end

data/lib/dwarf/version.rb CHANGED

@@ -1,3 +1,3 @@
 module Dwarf
-  VERSION = "0.0.4"
+  VERSION = "0.0.5"
 end

data/spec/dwarf/classifier_spec.rb ADDED

@@ -0,0 +1,197 @@
+require File.join(File.dirname(__FILE__), *%w[.. spec_helper.rb])
+describe Dwarf::Classifier do
+  before(:each) do
+    @classifier = Dwarf::Classifier.new()
+  end
+  def mock_car_examples
+    @example1 = FakeCar.new(:body_style => :boxy,
+                            :cylinders => 4,
+                            :transmission => :manual)
+    @example2 = FakeCar.new(:body_style => :swoopy,
+                            :cylinders => 6,
+                            :transmission => :manual)
+    @example3 = FakeCar.new(:body_style => :angry,
+                            :cylinders => 8,
+                            :transmission => :manual)
+    @example4 = FakeCar.new(:body_style => :swoopy,
+                            :cylinders => 8,
+                            :transmission => :manual)
+    @example5 = FakeCar.new(:body_style => nil,
+                            :cylinders => 6,
+                            :transmission => :manual)
+    @example6 = FakeCar.new(:body_style => :sleek,
+                            :cylinders => nil,
+                            :transmission => :manual)
+  end
+  context "add_example" do
+    it "accepts example classifications" do
+      @classifier.should respond_to(:add_example)
+    end
+    it "stores examples" do
+      @example3 = double('example3')
+      @example3.stub(:attribute_names) { [] }
+      @classifier.add_example(@example3, :irish)
+      @classifier.examples.should include(@example3)
+    end
+    it "enumerate example attributes" do
+      @example_with_attributes = double('attrs')
+      @example_with_attributes.stub(:attribute_names) { ["height", "branch_density"] }
+      @classifier.add_example(@example_with_attributes, :pine)
+      @classifier.example_attributes.should include("height", "branch_density")
+    end
+    it "gracefully accepts examples with nil attributes" do
+      @example_with_nil_attributes = double('nils')
+      @example_with_nil_attributes.stub(:attribute_names) { ["height", "branch_density"] }
+      @example_with_nil_attributes.stub(:height) { nil }
+      @example_with_nil_attributes.stub(:branch_density) { :high }
+      lambda {@classifier.add_example(@example_with_nil_attributes, :pine)}.should_not raise_exception
+    end
+  end
+  context "learn! and classify" do
+    # learn! must define classify only on the classifier it was called on;
+    # a second, untrained classifier keeps returning nil.
+    it "only implements classify on the learning instance" do
+      @example = double('example3')
+      @example.stub(:attribute_names) { [] }
+      @class2 = Dwarf::Classifier.new()
+      @classifier.add_example(@example, :round)
+      @classifier.learn!
+      # Use an expectation here: a bare `==` comparison is evaluated and
+      # discarded, so it can never fail the spec.
+      @classifier.classify(@example).should == :round
+      @class2.classify(@example).should  == nil
+    end
+    context "frawd is dwarf backwards" do
+      before(:each) do
+         @frawd = Frawd.new(1,100)
+       end
+      it "is totally awesome" do
+         @frawd.training.each do |example, classification|
+          @classifier.add_example(example, classification)
+        end
+        @classifier.learn!
+        success = 0
+        @frawd.testing.each do |example, classification|
+          success += 1 if @classifier.classify(example) == classification
+        end
+        success.should == @frawd.testing.size
+      end
+    end
+    context "classifying cars" do
+      before(:each) do
+        mock_car_examples
+        @classifier.add_example(@example1, :japanese)
+        @classifier.add_example(@example2, :german)
+        @classifier.add_example(@example3, :american)
+      end
+      it "classifies in a trivial case" do
+        @classifier.learn!
+        @classifier.classify(@example1).should == :japanese
+        @classifier.classify(@example2).should == :german
+        @classifier.classify(@example3).should == :american
+      end
+      it "classifies when multiple predicates required" do
+        @classifier.add_example(@example4, :german)
+        @classifier.learn!
+        @classifier.classify(@example1).should == :japanese
+        @classifier.classify(@example2).should == :german
+        @classifier.classify(@example3).should == :american
+        @classifier.classify(@example4).should == :german
+      end
+      it "handles nils gracefully" do
+        @classifier.add_examples(@example4 => :german,
+                                 @example5 => :japanese,
+                                 @example6 => :japanese)
+        lambda{@classifier.learn!}.should_not raise_exception
+      end
+      it "handles nils correctly" do
+        @classifier.add_examples(@example4 => :german,
+                                 @example5 => :japanese,
+                                 @example6 => :japanese)
+        @classifier.learn!
+        @classifier.classify(@example1).should == :japanese
+        @classifier.classify(@example2).should == :german
+        @classifier.classify(@example3).should == :american
+        @classifier.classify(@example4).should == :german
+        @classifier.classify(@example5).should == :japanese
+        @classifier.classify(@example6).should == :japanese
+      end
+      it "handles a feature missing from all examples correctly" do
+        @classifier.add_examples(@example4 => :german,
+                                 @example5 => :japanese,
+                                 @example6 => :japanese)
+        @classifier.learn!
+        open = [@classifier.decision_tree]
+        until open.empty?
+          current = open.pop
+          current.attribute.should_not == "wheel_diameter"
+          current.children.each {|child| open.push child}
+        end
+      end
+      it "does not use a feature which is identical across all examples" do
+        @classifier.add_examples(@example4 => :german,
+                                 @example5 => :japanese,
+                                 @example6 => :japanese)
+        @classifier.learn!
+        open = [@classifier.decision_tree]
+        until open.empty?
+          current = open.pop
+          current.attribute.should_not == "transmission"
+          current.children.each {|child| open.push child}
+        end
+      end
+      it "does not use a feature unique to each example" do
+        @classifier.add_examples(@example4 => :german,
+                                 @example5 => :japanese,
+                                 @example6 => :japanese)
+        @classifier.learn!
+        open = [@classifier.decision_tree]
+        until open.empty?
+          current = open.pop
+          current.attribute.should_not == "vin"
+          current.children.each {|child| open.push child}
+        end
+      end
+    end
+  end
+  context "find_by_classification" do
+    it "returns sets of cars based on class" do
+      mock_car_examples
+      @classifier.add_examples(@example1 => :japanese,
+                               @example2 => :german,
+                               @example3 => :american,
+                               @example4 => :german)
+      @classifier.learn!
+      all_cars = [@example1, @example2, @example3, @example4]
+      japanese_cars = @classifier.find_by_classification(all_cars, :japanese)
+      japanese_cars.should == [@example1]
+    end
+  end
+end

data/spec/dwarf/information_spec.rb ADDED

@@ -0,0 +1,157 @@
+require File.join(File.dirname(__FILE__), *%w[.. spec_helper.rb])
+describe Dwarf::Information do
+  # A 52-card deck (cards are the integers 1..52) drawn without replacement.
+  # When the draw pile is exhausted the discard pile is reshuffled into it.
+  class Deck
+    def initialize()
+      @draw = (1..52).to_a
+      @draw.shuffle!
+      @discard = []
+    end
+    # Draws the next card, records it on the discard pile and returns it.
+    def sample
+      if @draw.empty?
+        # Recycle the discards. shuffle! is required: plain shuffle returns a
+        # new array and would leave the recycled pile in its old order.
+        @draw = @discard
+        @draw.shuffle!
+        @discard = []
+      end
+      card = @draw.pop
+      @discard.push card
+      card
+    end
+  end
+  # A coin whose +weighting+ decides how it lands: :fair picks a random
+  # face, :heads and :tails always land on that face.
+  class Coin
+    attr_reader :weighting
+    def initialize(weighting)
+      @weighting = weighting
+      @faces = [:heads, :tails]
+    end
+    def attributes
+      "weighting"
+    end
+    # Returns :heads or :tails according to the weighting; nil for any
+    # weighting other than :fair, :heads or :tails.
+    def sample
+      return @faces.sample if :fair == @weighting
+      return :heads if :heads == @weighting
+      :tails if :tails == @weighting
+    end
+  end
+  context "entropy" do
+    it "calculates correctly for heads and tails" do
+      examples = []
+      classifications = {}
+      coin = Coin.new(:fair)
+      1000.times do
+        obj = Object.new
+        examples << obj
+        classifications[obj] = coin.sample
+      end
+      entropy = Dwarf::Information.entropy(examples, classifications)
+      entropy.should > 0.99
+      entropy.should <= 1.0
+    end
+    it "calculates correctly for 1d6" do
+      examples = []
+      classifications = {}
+      die = (1..6).map{|v| v}
+      1000.times do
+        obj = Object.new
+        examples << obj
+        classifications[obj] = die.sample
+      end
+      entropy = Dwarf::Information.entropy(examples, classifications)
+      entropy.should > 0.99
+      entropy.should <= 1.0
+    end
+    it "calculates correctly for a deck of cards" do
+      examples = []
+      classifications = {}
+      deck = Deck.new
+      1000.times do
+        obj = Object.new
+        examples << obj
+        classifications[obj] = deck.sample
+      end
+      entropy = Dwarf::Information.entropy(examples, classifications)
+      entropy.should > 0.99
+      entropy.should <= 1.0
+    end
+    it "calculates correctly with a weighted coin" do
+      examples = []
+      classifications = {}
+      1000.times do
+        obj = Object.new
+        examples << obj
+        classifications[obj] = (rand(100) == 99) ? :heads : :tails
+      end
+      entropy = Dwarf::Information.entropy(examples,classifications)
+      entropy.should < 0.101 #With a perfect 99:1 distribution, entropy should == 0.0807...
+      entropy.should >= 0.04
+    end
+    it "calculates correctly with a homogenous set" do
+      examples = []
+      classifications = {}
+      1000.times do
+        obj = Object.new
+        examples << obj
+        classifications[obj] = :heads
+      end
+      entropy = Dwarf::Information.entropy(examples,classifications)
+      entropy.should == 0.0
+    end
+  end
+  context "information_gain" do
+    it "calculates correctly splitting perfectly weighted coins" do
+      examples = []
+      classifications = {}
+      500.times do
+        coin = Coin.new(:heads)
+        examples << coin
+        classifications[coin] = coin.sample
+      end
+      500.times do
+        coin = Coin.new(:tails)
+        examples << coin
+        classifications[coin] = coin.sample
+      end
+      information_gain = Dwarf::Information.information_gain(examples, "weighting", classifications)
+      information_gain.should == 1.0
+    end
+    it "calculates worthless infogame for fair weighted coins" do
+      examples = []
+      classifications = {}
+      coin = Coin.new(:fair)
+        1000.times do
+        coin = Coin.new(:fair)
+          examples << coin
+        classifications[coin] = coin.sample
+      end
+      information_gain = Dwarf::Information.information_gain(examples, "weighting", classifications)
+      information_gain.should == 0.0
+    end
+  end
+end

data/spec/frawd.rb ADDED

@@ -0,0 +1,105 @@
+#require File.join(File.dirname(__FILE__), *%w[. spec_helper.rb])
+require 'rspec/mocks'
+require 'faker'
+require 'digest'
+# Generates random "worlds" for exercising the classifier: a random rule tree
+# over the enum attributes, plus mock examples whose values on the path to a
+# randomly chosen leaf follow that tree.
+class Frawd
+  attr_reader :rules
+  # depth        - bound passed to rand when deciding whether a node becomes
+  #                a leaf (see build_rules).
+  # sample_sizes - number of examples produced by #training and by #testing.
+  def initialize(depth = 10, sample_sizes = 1000)
+    @depth = depth
+    @sample_sizes = sample_sizes
+    initialize_attributes
+    @leaves = []
+    @rules = build_rules
+    @rules.each_leaf do |leaf|
+      @leaves << leaf
+    end
+  end
+  # Kinds of attribute a generated example can carry.
+  def types
+    [:enum, :number, :text]
+  end
+  # Candidate value sets for enum attributes: three fixed sets plus a random
+  # number of sets built from Faker words. Memoized.
+  def enums
+    unless @enums
+      @enums = [[:true, :false],
+                [:baz, :bar, :zot],
+                [:baz, :bar, :zot, :quux]]
+      (1..rand(10)).each do
+        @enums << Faker::Lorem.words(rand(10)).uniq.map(&:to_sym)
+      end
+    end
+    @enums
+  end
+  # Memoized list of classification symbols (:classification1, ...).
+  # NOTE(review): rand(10) can be 0, which leaves this list empty and makes
+  # classifications.sample nil -- confirm whether that is acceptable.
+  def classifications
+    @classifications ||= (1..rand(10)).map {|x| "classification#{x}".to_sym }
+  end
+  # Builds @attributes as [name, type, values] triples; values is only set
+  # for :enum attributes.
+  def initialize_attributes
+    @attributes = []
+    num_attributes = 10#rand(100)
+    (1..num_attributes).each do |number|
+      type = types.sample
+      values = enums.sample if type == :enum
+      @attributes << ["attribute#{number}", type, values]
+    end
+  end
+  # The :enum attribute triples; only these are used to build rules.
+  def filtered_attributes
+    @attributes.select {|a| a[1] == :enum}
+  end
+  # Recursively grows the rule tree below +node+. A node becomes a leaf with
+  # a random classification when rand(@depth) is below its number of
+  # ancestors or no attributes remain; otherwise it splits on a random
+  # attribute, with one child per value of that attribute.
+  def build_rules(node = Dwarf::TreeNode.new("ROOT"), attributes = filtered_attributes)
+    parents = node.parentage || []
+    if (rand(@depth) < parents.length) || attributes.empty?
+      node.classification = classifications.sample
+    else
+      attribute = attributes.sample
+      node.attribute = attribute[0]
+      attribute[2].each do |value|
+        child = Dwarf::TreeNode.new(value.to_s)
+        node << child
+        # Remove the whole [name, type, values] triple; subtracting only the
+        # name would match nothing and let descendants reuse the attribute.
+        build_rules(child,attributes-[attribute])
+      end
+    end
+    node
+  end
+  # Returns [example, classification] for a randomly chosen leaf. The example
+  # is a mock whose attributes on the path to the leaf follow the rules and
+  # whose remaining attributes are random.
+  def generate_example
+    node = @leaves.sample
+    example_classification = node.classification
+    example = RSpec::Mocks::Mock.new('example')
+    # Stub unconditionally so the example answers attribute_names even when
+    # the chosen leaf has no ancestors.
+    example.stub!(:attribute_names) { @attributes.map {|a| a[0]} }
+    # Walk leaf -> root; each (child, parent) pair fixes the value of the
+    # attribute the parent splits on. A nil parentage is treated as "no
+    # ancestors", as build_rules does.
+    (node.parentage || []).unshift(node).each_cons(2) do |child, parent|
+      example.stub!(parent.attribute.to_sym) { child.name }
+    end
+    @attributes.each do |attribute|
+      unless example.respond_to? attribute[0].to_sym
+        val = case attribute[1]
+              when :enum then attribute[2].sample
+              when :number then rand((2**(0.size * 8 -2) -1))
+              when :text then Faker::Lorem.paragraphs
+              end
+        example.stub!(attribute[0].to_sym) { val }
+      end
+    end
+    [ example, example_classification ]
+  end
+  # Array of +count+ [example, classification] pairs.
+  def generate_examples(count)
+    examples = Array.new(count)
+    (0...count).each { |index| examples[index] = generate_example }
+    examples
+  end
+  # Memoized training set of @sample_sizes examples.
+  def training
+    @training ||= generate_examples(@sample_sizes)
+  end
+  # Memoized testing set of @sample_sizes examples.
+  def testing
+    @testing ||= generate_examples(@sample_sizes)
+  end
+end

data/spec/spec_helper.rb CHANGED

@@ -1 +1,60 @@
 require File.join(File.dirname(__FILE__), *%w[.. lib dwarf])
+require File.join(File.dirname(__FILE__), *%w[. frawd])
+# http://blog.jayfields.com/2007/04/ruby-assigning-instance-variables-in.html
+class Module
+  # Defines an +initialize+ on the receiving class that assigns instance
+  # variables from its arguments. Positional arguments are matched to +args+
+  # by index (names without a matching argument are set to nil); a trailing
+  # Hash is treated as named arguments and assigned by key afterwards, so it
+  # overrides the positional values.
+  def initializer(*args, &block)
+    define_method :initialize do |*ctor_args|
+      ctor_named_args = (ctor_args.last.is_a?(Hash) ? ctor_args.pop : {})
+      # Exclusive range: args[args.size] is nil and would produce the
+      # invalid instance variable name "@".
+      (0...args.size).each do |index|
+        instance_variable_set("@#{args[index]}", ctor_args[index])
+      end
+      ctor_named_args.each_pair do |param_name, param_value|
+        instance_variable_set("@#{param_name}", param_value)
+      end
+    end
+  end
+end
+# Spec fixture: a car with four settable attributes and a sequential +vin+
+# drawn from a counter shared by all cars.
+class FakeCar
+  initializer :body_style, :cylinders, :wheel_diameter, :transmission
+  attr_accessor :body_style, :cylinders, :wheel_diameter, :transmission
+  @@vin_counter = 0
+  # Assigns the next counter value on first use and returns it thereafter.
+  def vin
+    return @vin if @vin
+    @@vin_counter += 1
+    @vin = @@vin_counter
+  end
+  # Names of the attributes exposed to the classifier.
+  def attributes
+    %w[body_style cylinders wheel_diameter transmission vin]
+  end
+  alias_method :attribute_names, :attributes
+  def to_s
+    "#{body_style} with #{cylinders} cylinders"
+  end
+  class << self
+    def valid_body_styles
+      [:boxy, :swoopy, :angry, :boring]
+    end
+    def valid_cylinders
+      [4, 6, 8]
+    end
+    # A car with a random valid body style and cylinder count.
+    def fake
+      new(:body_style => valid_body_styles.sample,
+          :cylinders => valid_cylinders.sample)
+    end
+    # An array of +how_many+ random cars.
+    def multiple_fakes(how_many=5)
+      (1..how_many).map { fake }
+    end
+  end
+end

data/specs.watchr ADDED

@@ -0,0 +1,60 @@
+# Run me with:
+#
+# $ watchr specs.watchr
+# --------------------------------------------------
+# Convenience Methods
+# --------------------------------------------------
+def all_spec_files
+  Dir['spec/**/*_spec.rb']
+end
+def run_spec_matching(thing_to_match)
+  matches = all_spec_files.grep(/#{thing_to_match}/i)
+  if matches.empty?
+    puts "Sorry, thanks for playing, but there were no matches for #{thing_to_match}"
+  else
+    run matches.join(' ')
+  end
+end
+def run(files_to_run)
+  puts("Running: #{files_to_run}")
+  system("clear;rspec -cfs #{files_to_run}")
+  no_int_for_you
+end
+def run_all_specs
+  run(all_spec_files.join(' '))
+end
+# --------------------------------------------------
+# Watchr Rules
+# --------------------------------------------------
+watch('^spec/(.*)_spec\.rb') { |m| run_spec_matching(m[1]) }
+watch('^lib/(.*)\.rb') { |m| run_spec_matching(m[1]) }
+watch('^spec/spec_helper\.rb') { run_all_specs }
+watch('^spec/frawd\.rb') { run_all_specs }
+watch('^spec/support/.*\.rb') { run_all_specs }
+# --------------------------------------------------
+# Signal Handling
+# --------------------------------------------------
+# Clears the "an INT was already received" flag, so the next INT is treated
+# as a first one again by the INT trap handler.
+def no_int_for_you
+  @sent_an_int = nil
+end
+Signal.trap 'INT' do
+  if @sent_an_int then
+    puts " A second INT? Ok, I get the message. Shutting down now."
+    exit
+  else
+    puts " Did you just send me an INT? Ugh. I'll quit for real if you do it again."
+    @sent_an_int = true
+    Kernel.sleep 1.5
+    run_all_specs
+  end
+end
+# vim:ft=ruby

metadata CHANGED

@@ -5,8 +5,8 @@ version: !ruby/object:Gem::Version
   segments:
   - 0
   - 0
-  - 4
-  version: 0.0.4
+  - 5
+  version: 0.0.5
 platform: ruby
 authors:
 - Alex Redington
@@ -14,7 +14,7 @@ autorequire:
 bindir: bin
 cert_chain: []
-date: 2010-10-22 00:00:00 -04:00
+date: 2010-11-05 00:00:00 -04:00
 default_executable:
 dependencies:
 - !ruby/object:Gem::Dependency
@@ -62,6 +62,35 @@ dependencies:
         version: 2.0.1
   type: :development
   version_requirements: *id003
+- !ruby/object:Gem::Dependency
+  name: watchr
+  prerelease: false
+  requirement: &id004 !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        segments:
+        - 0
+        - 7
+        version: "0.7"
+  type: :development
+  version_requirements: *id004
+- !ruby/object:Gem::Dependency
+  name: faker
+  prerelease: false
+  requirement: &id005 !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        segments:
+        - 0
+        - 3
+        - 1
+        version: 0.3.1
+  type: :development
+  version_requirements: *id005
 description: Dwarf is an implementation of decision tree learning algorithms targeted for use in the Rails 3 console environment for classifying ActiveRecord objects.
 email:
 - aredington@gmail.com
@@ -77,13 +106,19 @@ files:
 - Gemfile.lock
 - README.md
 - Rakefile
+- TODO.taskpaper
 - dwarf.gemspec
 - lib/dwarf.rb
 - lib/dwarf/classifier.rb
+- lib/dwarf/example_management.rb
+- lib/dwarf/information.rb
 - lib/dwarf/tree_node.rb
 - lib/dwarf/version.rb
-- spec/classifier_spec.rb
+- spec/dwarf/classifier_spec.rb
+- spec/dwarf/information_spec.rb
+- spec/frawd.rb
 - spec/spec_helper.rb
+- specs.watchr
 has_rdoc: true
 homepage: http://github.com/aredington/dwarf
 licenses: []

data/spec/classifier_spec.rb DELETED

@@ -1,80 +0,0 @@
-require File.join(File.dirname(__FILE__), *%w[spec_helper])
-describe Dwarf::Classifier do
-  before(:each) do
-    @classifier = Dwarf::Classifier.new()
-  end
-  it "accepts example classifications" do
-    @classifier.should respond_to(:add_example)
-  end
-  it "stores examples" do
-    @example3 = double('example3')
-    @example3.stub(:attributes) { [] }
-    @classifier.add_example(@example3, :irish)
-    @classifier.examples.should include(@example3)
-  end
-  it "only implements classify on the learning instance" do
-    @example = double('example3')
-    @example.stub(:attributes) { [] }
-    @class2 = Dwarf::Classifier.new()
-    @classifier.add_example(@example, :round)
-    @classifier.learn!
-    @classifier.classify(@example).should eq(:round)
-    @class2.classify(@example).should eq(nil)
-  end
-  context "classifying cars" do
-    def mock_car_examples
-      @example1 = double('example1')
-      @example1.stub(:body_style) { :boxy }
-      @example1.stub(:cylinders) { 4 }
-      @example1.stub(:attributes) { ["body_style", "cylinders"] }
-      @example2 = double('example2')
-      @example2.stub(:body_style) { :swoopy }
-      @example2.stub(:cylinders) { 6 }
-      @example2.stub(:attributes) { ["body_style", "cylinders"] }
-      @example3 = double('example3')
-      @example3.stub(:body_style) { :angry }
-      @example3.stub(:cylinders) { 8 }
-      @example3.stub(:attributes) { ["body_style", "cylinders"] }
-      @example4 = double('example4')
-      @example4.stub(:body_style) {:swoopy}
-      @example4.stub(:cylinders) {8}
-      @example4.stub(:attributes) { ["body_style", "cylinders"] }
-    end
-    it "enumerate example attributes" do
-      mock_car_examples
-      @classifier.add_example(@example1, :japanese)
-      @classifier.example_attributes.should include("body_style", "cylinders")
-    end
-    it "classifies in a trivial case" do
-      mock_car_examples
-      @classifier.add_example(@example1, :japanese)
-      @classifier.add_example(@example2, :german)
-      @classifier.add_example(@example3, :american)
-      @classifier.learn!
-      @classifier.classify(@example1).should eq(:japanese)
-      @classifier.classify(@example2).should eq(:german)
-      @classifier.classify(@example3).should eq(:american)
-    end
-    it "classifies when multiple predicates required" do
-      mock_car_examples
-      @classifier.add_examples(@example1 => :japanese, @example2 => :german, @example3 => :american, @example4 => :german)
-      @classifier.learn!
-      @classifier.classify(@example1).should eq(:japanese)
-      @classifier.classify(@example2).should eq(:german)
-      @classifier.classify(@example3).should eq(:american)
-      @classifier.classify(@example4).should eq(:german)
-    end
-  end
-end