RubyGems - idhja22 - Versions diffs - 0.14.2 - Mend

idhja22 0.14.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (26) hide show

data/.gitignore +30 -0
data/.travis.yml +3 -0
data/Gemfile +4 -0
data/LICENSE.txt +22 -0
data/README.md +39 -0
data/Rakefile +11 -0
data/bin/idhja22 +16 -0
data/idhja22.gemspec +23 -0
data/lib/idhja22/dataset/datum.rb +40 -0
data/lib/idhja22/dataset/errors.rb +13 -0
data/lib/idhja22/dataset/tree_methods.rb +27 -0
data/lib/idhja22/dataset.rb +70 -0
data/lib/idhja22/node.rb +77 -0
data/lib/idhja22/tree.rb +110 -0
data/lib/idhja22/version.rb +3 -0
data/lib/idhja22.rb +10 -0
data/spec/another_large_spec_data.csv +11 -0
data/spec/dataset/example_spec.rb +59 -0
data/spec/dataset_spec.rb +130 -0
data/spec/large_spec_data.csv +11 -0
data/spec/node_spec.rb +97 -0
data/spec/spec_data.csv +4 -0
data/spec/spec_helper.rb +20 -0
data/spec/tree_spec.rb +93 -0
data/spec/version_spec.rb +9 -0
metadata +149 -0

data/.gitignore ADDED Viewed

@@ -0,0 +1,30 @@
+*.gem
+*.rbc
+.bundle
+.config
+coverage
+InstalledFiles
+lib/bundler/man
+pkg
+rdoc
+spec/reports
+test/tmp
+test/version_tmp
+tmp
+# YARD artifacts
+.yardoc
+_yardoc
+doc/
+# RVM
+.rvmrc
+#bundler artifacts
+Gemfile.lock
+#OS X files
+.DS_Store
+# data directory for storing csvs to run the program against
+data

data/.travis.yml ADDED Viewed

@@ -0,0 +1,3 @@
+language: ruby
+rvm:
+  - "1.9.3"

data/Gemfile ADDED Viewed

@@ -0,0 +1,4 @@
+source 'https://rubygems.org'
+# Specify your gem's dependencies in idhja22.gemspec
+gemspec

data/LICENSE.txt ADDED Viewed

@@ -0,0 +1,22 @@
+Copyright (c) 2012 Henry Addison
+MIT License
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+The above copyright notice and this permission notice shall be
+included in all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

data/README.md ADDED Viewed

@@ -0,0 +1,39 @@
+# Idhja22
+[![Build Status](https://travis-ci.org/henryaddison/idhja22.png?branch=master)](https://travis-ci.org/henryaddison/idhja22)
+Mostly my attempt at writing a gem.
+Used for training a binary classifying tree (target values should be Y or N). Leaf nodes are a probability of Y rather than a Y or N.
+## Installation
+Add this line to your application's Gemfile:
+    gem 'idhja22'
+And then execute:
+    $ bundle
+Or install it yourself as:
+    $ gem install idhja22
+## Usage
+Simplest usage is to have a CSV of training data. The final column is treated as the target category value of each entry; the other columns are the attributes of each datum. The first row is used for the attribute and target category labels.
+    > tree = Idhja22::Tree.train_from_csv('/path/to/data.csv')
+To print out the rules produced by the tree:
+    > puts tree.get_rules
+## Contributing
+1. Fork it
+2. Create your feature branch (`git checkout -b my-new-feature`)
+3. Commit your changes (`git commit -am 'Add some feature'`)
+4. Push to the branch (`git push origin my-new-feature`)
+5. Create new Pull Request

data/Rakefile ADDED Viewed

@@ -0,0 +1,11 @@
+require "bundler/gem_tasks"
+require 'rspec/core/rake_task'
+RSpec::Core::RakeTask.new('spec')
+desc "Run specs with SimpleCov"
+RSpec::Core::RakeTask.new('coverage') do |t|
+  ENV['COVERAGE'] = "true"
+end
+task :default => :spec

data/bin/idhja22 ADDED Viewed

@@ -0,0 +1,16 @@
+#!/usr/bin/env ruby
+require 'thor'
+require 'idhja22'
+class TrainAndValidate < Thor
+  desc "train_and_validate FILE", "train a tree for the given file and validate is against a validation set"
+  method_option :"training-proportion", :type => :numeric, :default => 1.0, :aliases => 't'
+  def train_and_validate(filename)
+    t, v = Idhja22::Tree.train_and_validate_from_csv(filename, options[:"training-proportion"])
+    puts t.get_rules
+    puts "Against validation set probability of successful classifiction: #{v}" if options[:"training-proportion"] < 1.0
+  end
+end
+TrainAndValidate.start

data/idhja22.gemspec ADDED Viewed

@@ -0,0 +1,23 @@
+# -*- encoding: utf-8 -*-
+lib = File.expand_path('../lib', __FILE__)
+$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
+require 'idhja22/version'
+Gem::Specification.new do |gem|
+  gem.name          = "idhja22"
+  gem.version       = Idhja22::VERSION
+  gem.authors       = ["Henry Addison"]
+  gem.description   = %q{Decision Trees}
+  gem.summary       = %q{A different take on decision trees}
+  gem.homepage      = "https://github.com/henryaddison/idhja22"
+  gem.files         = `git ls-files`.split($/)
+  gem.executables   = gem.files.grep(%r{^bin/}).map{ |f| File.basename(f) }
+  gem.test_files    = gem.files.grep(%r{^(test|spec|features)/})
+  gem.require_paths = ["lib"]
+  gem.add_development_dependency "rspec", "~>2.10"
+  gem.add_development_dependency "rake"
+  gem.add_development_dependency 'debugger'
+  gem.add_development_dependency 'simplecov'
+end

data/lib/idhja22/dataset/datum.rb ADDED Viewed

@@ -0,0 +1,40 @@
+module Idhja22
+  class Dataset
+    class Datum
+      attr_accessor :attributes, :category_label, :attribute_labels
+      def initialize(row, attr_labels, category_label)
+        self.category_label = category_label
+        raise NonUniqueAttributeLabels, "repeated attributes in #{attr_labels}" unless attr_labels == attr_labels.uniq
+        self.attribute_labels = attr_labels
+        self.attributes = row
+      end
+      def to_a
+        attributes
+      end
+      def [](attr_label)
+        if index = @attribute_labels.index(attr_label)
+          self.attributes[index]
+        else
+          raise UnknownAttributeLabel, "unknown attribute label #{attr_label} in labels #{@attribute_labels.join(', ')}"
+        end
+      end
+    end
+    class Example < Datum
+      attr_accessor :category
+      def initialize(row, attr_labels, category_label)
+        super
+        self.category = self.attributes.pop
+        raise UnknownCategoryValue, "Unrecognised category: #{@category} - should be Y or N" unless ['Y', 'N'].include?(@category)
+      end
+      def to_a
+        super+[category]
+      end
+    end
+  end
+end

data/lib/idhja22/dataset/errors.rb ADDED Viewed

@@ -0,0 +1,13 @@
+module Idhja22
+  class Dataset
+    class BadData < ArgumentError; end
+    class InsufficientData < BadData; end
+    class NonUniqueAttributeLabels < BadData; end
+    class Datum
+      class UnknownAttributeLabel < BadData; end
+      class UnknownAttributeValue < BadData; end
+      class UnknownCategoryLabel < BadData; end
+      class UnknownCategoryValue < BadData; end
+    end
+  end
+end

data/lib/idhja22/dataset/tree_methods.rb ADDED Viewed

@@ -0,0 +1,27 @@
+module Idhja22
+  class Dataset
+    module TreeMethods
+      def partition(attr_label)
+        groups = Hash.new([])
+        data.each do |datum|
+          groups[datum[attr_label]] += [datum]
+        end
+        output = Hash.new
+        groups.each do |value, data|
+          output[value] = Dataset.new(data, attribute_labels, category_label)
+        end
+        return output
+      end
+      def entropy
+        total = self.size
+        return 1.0 if total < Idhja22::MIN_DATASET_SIZE
+        category_counts.values.inject(0.0) { |ent, count|  prop = count.to_f/total.to_f; ent-prop*Math.log(prop,2)  }
+      end
+      def terminating?
+        probability > Idhja22::TERMINATION_PROBABILITY || probability < 1-Idhja22::TERMINATION_PROBABILITY
+      end
+    end
+  end
+end

data/lib/idhja22/dataset.rb ADDED Viewed

@@ -0,0 +1,70 @@
+require "idhja22/dataset/errors"
+require "idhja22/dataset/tree_methods"
+require "idhja22/dataset/datum"
+require 'csv'
+module Idhja22
+  class Dataset
+    attr_reader :category_label, :attribute_labels, :data
+    include Idhja22::Dataset::TreeMethods
+    class << self
+      def from_csv(filename)
+        csv = CSV.read(filename)
+        labels = csv.shift
+        category_label = labels.pop
+        attribute_labels = labels
+        data = []
+        csv.each do |row|
+          training_example = Example.new(row, attribute_labels, category_label)
+          data << training_example
+        end
+        new(data, attribute_labels, category_label)
+      end
+    end
+    def initialize(data, attr_labels, category_label)
+      @category_label = category_label
+      raise NonUniqueAttributeLabels, "repeated attributes in #{attr_labels}" unless attr_labels == attr_labels.uniq
+      @attribute_labels = attr_labels
+      @data = data
+    end
+    def category_counts
+      counts = Hash.new(0)
+      data.each do |d|
+        counts[d.category]+=1
+      end
+      return counts
+    end
+    def size
+      return data.size
+    end
+    def empty?
+      return data.empty?
+    end
+    def probability
+      category_counts['Y'].to_f/size.to_f
+    end
+    def split(training_proportion)
+      shuffled_data = data.shuffle
+      cutoff_point = (training_proportion.to_f*size).to_i
+      training_data = shuffled_data[0...cutoff_point]
+      validation_data = shuffled_data[cutoff_point...size]
+      training_set = self.class.new(training_data, attribute_labels, category_label)
+      validation_set = self.class.new(validation_data, attribute_labels, category_label)
+      return training_set, validation_set
+    end
+  end
+end

data/lib/idhja22/node.rb ADDED Viewed

@@ -0,0 +1,77 @@
+module Idhja22
+  class Node
+    def ==(other)
+      return self.class == other.class
+    end
+  end
+  class DecisionNode < Node
+    attr_reader :branches, :decision_attribute
+    def initialize(data_split, decision_attribute, attributes_available, depth, parent_probability)
+      @decision_attribute = decision_attribute
+      @branches = {}
+      data_split.each do |value, dataset|
+        node = Tree.build_node(dataset, attributes_available, depth+1, parent_probability)
+        if(node.is_a?(DecisionNode) && node.branches.values.all? { |n| n.is_a?(LeafNode) })
+          probs = node.branches.values.collect(&:probability)
+          if(probs.max - probs.min < 0.01)
+            node = LeafNode.new(probs.max, dataset.category_label)
+          end
+        end
+        @branches[value] = node if node && !(node.is_a?(DecisionNode) && node.branches.empty?)
+      end
+    end
+    def get_rules
+      rules = []
+      branches.each do |v,n|
+        current_rule = "#{decision_attribute} == #{v}"
+        sub_rules = n.get_rules
+        sub_rules.each do |r|
+          rules << "#{current_rule} and #{r}"
+        end
+      end
+      return rules
+    end
+    def ==(other)
+      return false unless super
+      return false unless self.decision_attribute == other.decision_attribute
+      return false unless self.branches.length == other.branches.length
+      self.branches.each do |attr_value, node|
+        return false unless other.branches.has_key?(attr_value)
+        return false unless node == other.branches[attr_value]
+      end
+      return true
+    end
+    def evaluate(query)
+      queried_value = query[self.decision_attribute]
+      branch = self.branches[queried_value]
+      raise Idhja22::Dataset::Datum::UnknownAttributeValue, "when looking at attribute labelled #{self.decision_attribute} could not find branch for value #{queried_value}" if branch.nil?
+      branch.evaluate(query)
+    end
+  end
+  class LeafNode < Node
+    attr_reader :probability, :category_label
+    def initialize(probability, category_label)
+      @probability = probability
+      @category_label = category_label
+    end
+    def get_rules
+      ["then chance of #{category_label} = #{probability.round(2)}"]
+    end
+    def ==(other)
+      return super && self.probability == other.probability && self.category_label == other.category_label
+    end
+    def evaluate(query)
+      raise Idhja22::Dataset::Datum::UnknownCategoryLabel, "expected category label for query is #{query.category_label} but node is using #{self.category_label}" unless query.category_label == self.category_label
+      return probability
+    end
+  end
+end

data/lib/idhja22/tree.rb ADDED Viewed

@@ -0,0 +1,110 @@
+module Idhja22
+  class Tree
+    attr_accessor :root
+    class << self
+      def train(dataset)
+        new(dataset, dataset.attribute_labels)
+      end
+      def train_and_validate(dataset, training_proportion=0.5)
+        training_set, validation_set = dataset.split(training_proportion)
+        tree = self.train(training_set)
+        validation_value = tree.validate(validation_set)
+        return tree, validation_value
+      end
+      def train_from_csv(filename)
+        ds = Dataset.from_csv(filename)
+        train(ds)
+      end
+      def train_and_validate_from_csv(filename, training_proportion=0.5)
+        ds = Dataset.from_csv(filename)
+        train_and_validate(ds, training_proportion)
+      end
+      def build_node(dataset, attributes_available, depth, parent_probability = nil)
+        if(dataset.size < Idhja22::MIN_DATASET_SIZE)
+          return Idhja22::LeafNode.new(probability_guess(parent_probability, depth), dataset.category_label)
+        end
+        #if successful termination - create and return a leaf node
+        if(dataset.terminating? && depth > 0) # don't terminate without splitting the data at least once
+          return Idhja22::LeafNode.new(dataset.probability, dataset.category_label)
+        end
+        if(depth >= 3) # don't let trees get too long
+          return Idhja22::LeafNode.new(dataset.probability, dataset.category_label)
+        end
+        #if we have no more attributes left to split the dataset on, then return a leafnode
+        if(attributes_available.empty?)
+          return Idhja22::LeafNode.new(dataset.probability, dataset.category_label)
+        end
+        data_split , best_attribute = best_attribute(dataset, attributes_available)
+        node = Idhja22::DecisionNode.new(data_split, best_attribute, attributes_available-[best_attribute], depth, dataset.probability)
+        return node
+      end
+      private
+      def best_attribute(dataset, attributes_available)
+        data_split = best_attribute = nil
+        igain = - Float::INFINITY
+        attributes_available.each do |attr_label|
+          possible_split = dataset.partition(attr_label)
+          possible_igain = dataset.entropy
+          possible_split.each do |value, ds|
+            possible_igain -= (ds.size.to_f/dataset.size.to_f)*ds.entropy
+          end
+          if(possible_igain > igain)
+            igain = possible_igain
+            data_split = possible_split
+            best_attribute = attr_label
+          end
+        end
+        return data_split, best_attribute
+      end
+      def probability_guess(parent_probability, depth)
+        return (parent_probability + (Idhja22::DEFAULT_PROBABILITY-parent_probability)/2**depth)
+      end
+    end
+    def initialize(dataset, attributes_available)
+      raise Idhja22::Dataset::InsufficientData, "require at least #{Idhja22::MIN_DATASET_SIZE} data points, only have #{dataset.size} in data set provided" if(dataset.size < Idhja22::MIN_DATASET_SIZE)
+      @root = self.class.build_node(dataset, attributes_available, 0)
+    end
+    def get_rules
+      rules = root.get_rules
+      "if " + rules.join("\nelsif ")
+    end
+    def ==(other)
+      return self.root == other.root
+    end
+    def evaluate query
+      @root.evaluate(query)
+    end
+    def validate(ds)
+      output = 0
+      ds.data.each do |validation_point|
+        begin
+          prob = evaluate(validation_point)
+          output += (validation_point.category == 'Y' ? prob : 1.0 - prob)
+        rescue Idhja22::Dataset::Datum::UnknownAttributeValue
+          # if don't recognised the attribute value in the example, then assume the worst:
+          # will never classify this point correctly
+          # equivalent to output += 0 but no point running this
+        end
+      end
+      return output.to_f/ds.size.to_f
+    end
+  end
+end

data/lib/idhja22/version.rb ADDED Viewed

@@ -0,0 +1,3 @@
+module Idhja22
+  # Version of the gem; read by idhja22.gemspec and checked in spec/version_spec.rb.
+  VERSION = "0.14.2"
+end

data/lib/idhja22.rb ADDED Viewed

@@ -0,0 +1,10 @@
+require "idhja22/version"
+require "idhja22/dataset"
+require "idhja22/tree"
+require "idhja22/node"
+# Tuning constants shared by the dataset and tree code.
+module Idhja22
+  # Value that Tree's probability guess for undersized datasets is pulled towards.
+  DEFAULT_PROBABILITY = 0.5
+  # A dataset is terminating when its probability of Y is above this value
+  # or below 1 minus this value.
+  TERMINATION_PROBABILITY = 0.95
+  # Fewest data points needed to train a tree; smaller datasets also get an
+  # entropy of 1.0 and a guessed leaf probability.
+  MIN_DATASET_SIZE = 20
+end

data/spec/another_large_spec_data.csv ADDED Viewed

@@ -0,0 +1,11 @@
+0,1,2,3,4,C
+a,a,a,a,a,Y
+a,a,b,b,a,N
+a,a,a,c,a,Y
+b,a,a,a,a,Y
+b,a,b,c,a,N
+a,a,a,a,a,Y
+a,a,a,a,a,Y
+a,a,a,a,a,Y
+a,a,a,a,b,N
+a,a,a,a,b,Y

data/spec/dataset/example_spec.rb ADDED Viewed

@@ -0,0 +1,59 @@
+require 'spec_helper'
+describe Idhja22::Dataset::Example do
+  before(:all) do
+    @datum = Idhja22::Dataset::Example.new(['high', '20-30', 'vanilla','Y'], ['confidence', 'age', 'fav ice cream'], 'likes')
+  end
+  describe 'new' do
+    it 'should extract attributes' do
+      @datum.attributes.should == ['high', '20-30', 'vanilla']
+      @datum.attribute_labels.should == ['confidence', 'age', 'fav ice cream']
+    end
+    it 'should extract category' do
+      @datum.category.should == 'Y'
+      @datum.category_label.should == 'likes'
+    end
+    context 'with non-unique attribute labels' do
+      it 'should throw an exception' do
+        expect do
+          Idhja22::Dataset::Example.new(['high', '20-30', 'vanilla','Y'], ['confidence', 'age', 'age'], 'likes')
+        end.to raise_error(Idhja22::Dataset::NonUniqueAttributeLabels)
+      end
+    end
+    context 'unexpected label' do
+      it 'should raise an exception' do
+        expect do
+          Idhja22::Dataset::Example.new(['high', '20-30', 'vanilla','H'], ['confidence', 'age', 'fav ice cream'], 'likes')
+        end.to raise_error(Idhja22::Dataset::Example::UnknownCategoryValue)
+      end
+    end
+  end
+  describe 'to_a' do
+    it 'should list the data in an array format' do
+      @datum.to_a.should == ['high', '20-30', 'vanilla','Y']
+    end
+  end
+  describe '[]' do
+    context 'known attribute' do
+      it 'should map attribute label to value' do
+        @datum['age'].should == '20-30'
+      end
+    end
+    context 'unknown attribute' do
+      it 'should throw an exception' do
+        expect do
+          @datum['madeup']
+        end.to raise_error(Idhja22::Dataset::Datum::UnknownAttributeLabel)
+      end
+    end
+  end
+end

data/spec/dataset_spec.rb ADDED Viewed

@@ -0,0 +1,130 @@
+require 'spec_helper'
+describe Idhja22::Dataset do
+  context 'initialization' do
+    def check_labels(obj, exp_attr_labels, exp_cat_label)
+      obj.attribute_labels.should == exp_attr_labels
+      obj.category_label.should == exp_cat_label
+    end
+    describe 'from_csv' do
+      before(:all) do
+        @ds = Idhja22::Dataset.from_csv(File.join(File.dirname(__FILE__),'spec_data.csv'))
+      end
+      it 'should extract labels' do
+        check_labels(@ds, ['Weather', 'Temperature', 'Wind'], 'Plays')
+      end
+      it 'should extract data' do
+        @ds.data.length.should == 3
+        @ds.data.collect(&:attributes).should == [['sunny', 'hot', 'light'], ['sunny', 'cold', 'medium'], ['raining', 'cold', 'high']]
+        @ds.data.collect(&:category).should == ['Y', 'Y','N']
+      end
+    end
+    describe 'new' do
+      before(:all) do
+        @ds = Idhja22::Dataset.new([Idhja22::Dataset::Example.new(['high', '20-30', 'vanilla', 'Y'], ['Confidence', 'Age group', 'fav ice cream'] , 'Loves Reading')], ['Confidence', 'Age group', 'fav ice cream'], 'Loves Reading')
+      end
+      it 'should extract labels' do
+        check_labels(@ds, ['Confidence', 'Age group', 'fav ice cream'], 'Loves Reading')
+      end
+      it 'should extract data' do
+        @ds.data.length.should == 1
+        @ds.data.first.attributes.should == ['high', '20-30', 'vanilla']
+        @ds.data.first.category.should == 'Y'
+      end
+      context 'with repeated attribute labels' do
+        it 'should throw an error' do
+          expect do
+            Idhja22::Dataset.new([Idhja22::Dataset::Example.new(['high', '20-30', 'vanilla', 'Y'], ['Confidence', 'Age group', 'Confidence'] , 'Loves Reading')], ['Confidence', 'Age group', 'Confidence'], 'Loves Reading')
+          end.to raise_error(Idhja22::Dataset::NonUniqueAttributeLabels)
+        end
+      end
+    end
+    context 'ready made' do
+      before(:all) do
+        @ds = Idhja22::Dataset.from_csv(File.join(File.dirname(__FILE__),'large_spec_data.csv'))
+      end
+      describe '#partition' do
+        it 'should split the data set based on the values of an given attribute index' do
+          new_sets = @ds.partition('0')
+          new_sets.length.should == 2
+          new_sets.each do |value, dset|
+            dset.data.collect { |d| d.attributes[0] }.uniq.should == [value]
+          end
+        end
+        it 'should preserve the data other than splitting it' do
+          new_sets = @ds.partition('3')
+          new_sets.length.should == 3
+          new_sets['a'].attribute_labels.should == @ds.attribute_labels
+          new_sets['a'].category_label.should == @ds.category_label
+          new_sets['a'].data.collect(&:to_a).should == [%w{a a a a a Y}, %w{b a a a a Y}, %w{a a a a a Y}, %w{a a a a a Y}, %w{a a a a a Y}, %w{a a a a b N}, %w{a a a a b N}]
+        end
+        it 'should produce one item when the values are all the same' do
+          @ds.partition('1').length.should == 1
+        end
+      end
+      describe 'category_counts' do
+        it 'should count the number of entries in each category' do
+          @ds.category_counts.should == {'Y' => 6, 'N' => 4}
+        end
+      end
+      describe '#entropy' do
+        it 'should calculate entropy of set' do
+          @ds.entropy.should be_within(0.000001).of(0.970951)
+        end
+        context 'with little data' do
+          it 'should return 1' do
+            ds = Idhja22::Dataset.new([Idhja22::Dataset::Example.new(['high', '20-30', 'vanilla', 'Y'], ['Confidence', 'Age group', 'fav ice cream'] , 'Loves Reading')], ['Confidence', 'Age group', 'fav ice cream'], 'Loves Reading')
+            ds.entropy.should == 1.0
+          end
+        end
+      end
+      describe '#size' do
+        it 'should calculate size of dataset' do
+          @ds.size.should == 10
+        end
+      end
+      describe '#empty?' do
+        it 'should calculate size of dataset' do
+          @ds.empty?.should be_false
+        end
+      end
+      describe '#probability' do
+        it 'should return probabilty category is Y' do
+          @ds.probability.should be_within(0.0001).of(0.6)
+        end
+      end
+      describe '#split' do
+        it 'should split into a training and validation set according to the given proportion' do
+          ts, vs = @ds.split(0.5)
+          ts.size.should == 5
+          vs.size.should == 5
+          ts, vs = @ds.split(0.75)
+          ts.size.should == 7
+          vs.size.should == 3
+        end
+      end
+    end
+  end
+end

data/spec/large_spec_data.csv ADDED Viewed

@@ -0,0 +1,11 @@
+0,1,2,3,4,C
+a,a,a,a,a,Y
+a,a,b,b,a,N
+a,a,a,c,a,Y
+b,a,a,a,a,Y
+b,a,b,c,a,N
+a,a,a,a,a,Y
+a,a,a,a,a,Y
+a,a,a,a,a,Y
+a,a,a,a,b,N
+a,a,a,a,b,N

data/spec/node_spec.rb ADDED Viewed

@@ -0,0 +1,97 @@
+require 'spec_helper'
+describe Idhja22::LeafNode do
+  describe('.new') do
+    it 'should store probability and category label' do
+      l = Idhja22::LeafNode.new(0.75, 'label')
+      l.probability.should == 0.75
+      l.category_label.should == 'label'
+    end
+  end
+  describe('#get_rules') do
+    it 'should return the probability' do
+      l = Idhja22::LeafNode.new(0.75, 'pudding')
+      l.get_rules.should == ['then chance of pudding = 0.75']
+    end
+  end
+  describe(' == ') do
+    let(:l1) { Idhja22::LeafNode.new(0.75, 'pudding') }
+    let(:l2) { Idhja22::LeafNode.new(0.75, 'pudding') }
+    let(:diff_l1) { Idhja22::LeafNode.new(0.7, 'pudding') }
+    let(:diff_l2) { Idhja22::LeafNode.new(0.75, 'starter') }
+    it 'should compare attributes' do
+      l1.should == l2
+      l1.should_not == diff_l1
+      l1.should_not == diff_l2
+    end
+  end
+  describe 'evaluate' do
+    let(:leaf) { Idhja22::LeafNode.new(0.6, 'pudding') }
+    it 'should return probability' do
+      query = Idhja22::Dataset::Datum.new(['high', 'gusty'], ['temperature', 'windy'], 'pudding')
+      leaf.evaluate(query).should == 0.6
+    end
+    context 'mismatching category labels' do
+      it 'should raise error' do
+        query = Idhja22::Dataset::Datum.new(['high', 'gusty'], ['temperature', 'windy'], 'tennis')
+        expect {leaf.evaluate(query)}.to raise_error(Idhja22::Dataset::Datum::UnknownCategoryLabel)
+      end
+    end
+  end
+end
+describe Idhja22::DecisionNode do
+  before(:all) do
+    @ds = Idhja22::Dataset.from_csv(File.join(File.dirname(__FILE__),'large_spec_data.csv'))
+  end
+  describe('#get_rules') do
+    it 'should return a list of rules' do
+      l = Idhja22::DecisionNode.new(@ds.partition('2'), '3', [], 0, 0.75)
+      l.get_rules.should == ["3 == a and then chance of C = 0.75", "3 == b and then chance of C = 0.0"]
+    end
+  end
+  describe(' == ') do
+    let(:dn1) { Idhja22::DecisionNode.new(@ds.partition('2'), '2', [], 0, 0.75) }
+    let(:dn2) { Idhja22::DecisionNode.new(@ds.partition('2'), '2', [], 0, 0.75) }
+    let(:diff_dn1) { Idhja22::DecisionNode.new(@ds.partition('0'), '2', [], 0, 0.75) }
+    let(:diff_dn2) { Idhja22::DecisionNode.new(@ds.partition('3'), '3', [], 0, 0.75) }
+    it 'should compare ' do
+      dn1.should == dn2
+      dn1.should_not == diff_dn1
+      dn1.should_not == diff_dn2
+    end
+  end
+  describe 'evaluate' do
+    let(:dn) { Idhja22::DecisionNode.new(@ds.partition('2'), '3', [], 0, 0.75) }
+    it 'should follow node to probability' do
+      query = Idhja22::Dataset::Datum.new(['a', 'a'], ['3', '4'], 'C')
+      dn.evaluate(query).should == 0.75
+      query = Idhja22::Dataset::Datum.new(['b', 'a'], ['3', '4'], 'C')
+      dn.evaluate(query).should == 0.0
+    end
+    context 'mismatching attribute label' do
+      it 'should raise an error' do
+        query = Idhja22::Dataset::Datum.new(['b', 'a'], ['1', '2'], 'C')
+        expect {dn.evaluate(query)}.to raise_error(Idhja22::Dataset::Datum::UnknownAttributeLabel)
+      end
+    end
+    context 'unknown attribute value' do
+      it 'should raise an error' do
+        query = Idhja22::Dataset::Datum.new(['c', 'a'], ['3', '4'], 'C')
+        expect {dn.evaluate(query)}.to raise_error(Idhja22::Dataset::Datum::UnknownAttributeValue)
+      end
+    end
+  end
+end

data/spec/spec_data.csv ADDED Viewed

@@ -0,0 +1,4 @@
+Weather,Temperature,Wind,Plays
+sunny,hot,light,Y
+sunny,cold,medium,Y
+raining,cold,high,N

data/spec/spec_helper.rb ADDED Viewed

@@ -0,0 +1,20 @@
+# Coverage is only measured when the COVERAGE environment variable is set.
+if ENV['COVERAGE']
+  require 'simplecov'
+  SimpleCov.start do
+    add_filter '/spec/'
+  end
+end
+# Make the gem's lib directory loadable from the specs.
+$: << File.dirname(__FILE__) + '/../lib'
+require 'idhja22'
+require 'ruby-debug'
+# Reassigns the minimum dataset size to 2 for the specs; the large CSV
+# fixtures hold 10 data rows each.
+module Idhja22
+  MIN_DATASET_SIZE = 2
+end
+RSpec.configure do |config|
+end

data/spec/tree_spec.rb ADDED Viewed

@@ -0,0 +1,93 @@
+require 'spec_helper'
+describe Idhja22::Tree do
+  before(:all) do
+    @ds = Idhja22::Dataset.from_csv(File.join(File.dirname(__FILE__),'large_spec_data.csv'))
+  end
+  describe('.train') do
+    it 'should make a tree' do
+      tree = Idhja22::Tree.train(@ds)
+    end
+    context 'with insufficient data' do
+      it 'should throw exception' do
+        ds = Idhja22::Dataset.new([Idhja22::Dataset::Datum.new(['high', '20-30', 'Vanilla', 'Y'], ['Confidence', 'Age group', 'Fav ice cream'] , 'Loves Reading')], ['Confidence', 'Age group', 'Fav ice cream'], 'Loves Reading')
+        expect { Idhja22::Tree.train(ds) }.to raise_error(Idhja22::Dataset::InsufficientData)
+      end
+    end
+  end
+  describe('#get_rules') do
+    it 'should list the rules of the tree' do
+      Idhja22::Tree.train(@ds).get_rules.should == "if 2 == a and 4 == a and then chance of C = 1.0\nelsif 2 == a and 4 == b and then chance of C = 0.0\nelsif 2 == b and then chance of C = 0.0"
+    end
+  end
+  describe(' == ') do
+    it 'should compare root nodes' do
+      tree1 = Idhja22::Tree.train(@ds)
+      tree2 = Idhja22::Tree.train(@ds)
+      diff_ds = Idhja22::Dataset.from_csv(File.join(File.dirname(__FILE__),'another_large_spec_data.csv'))
+      diff_tree = Idhja22::Tree.train(diff_ds)
+      tree1.should == tree2
+      tree1.should_not == diff_tree
+    end
+  end
+  describe('.train_from_csv') do
+    it 'should make the same tree as the one from the dataset' do
+      tree = Idhja22::Tree.train(@ds)
+      csv_tree = Idhja22::Tree.train_from_csv(File.join(File.dirname(__FILE__),'large_spec_data.csv'))
+      tree.should == csv_tree
+    end
+  end
+  describe('#evaluate') do
+    it 'should return the probabilty at the leaf of the tree' do
+      tree = Idhja22::Tree.train(@ds)
+      query = Idhja22::Dataset::Datum.new(['z','z','a','z','a'],['0', '1','2','3','4'],'C')
+      tree.evaluate(query).should == 1.0
+    end
+  end
+  describe '#validate' do
+    before(:all) do
+      @tree = Idhja22::Tree.train(@ds)
+    end
+    it 'should return the average probability that the tree gets the validation examples correct' do
+      vps = [Idhja22::Dataset::Example.new(['z','z','a','z','a','Y'],['0', '1','2','3','4'],'C')]
+      vps << Idhja22::Dataset::Example.new(['z','z','a','z','a','N'],['0', '1','2','3','4'],'C')
+      @tree.validate(Idhja22::Dataset.new(vps, ['0', '1','2','3','4'],'C')).should == 0.5
+    end
+    context 'against a data point with an unrecognised attribute value' do
+      before(:all) do
+        validation_point = Idhja22::Dataset::Example.new(['z','z','o','z','a','Y'],['0', '1','2','3','4'],'C')
+        @vps = Idhja22::Dataset.new([validation_point], ['0', '1','2','3','4'],'C')
+      end
+      it 'should treat a validation example as one it will never get right' do
+        @tree.validate(@vps).should == 0.0
+      end
+    end
+  end
+  describe '.train_and_validate' do
+    it 'should return a tree and the validation result' do
+      tree, value = Idhja22::Tree.train_and_validate(@ds)
+      tree.is_a?(Idhja22::Tree).should be_true
+      (0..1).include?(value).should be_true
+    end
+  end
+  describe('.train_and_validate_from_csv') do
+    it 'should make the same tree as the one from the dataset' do
+      csv_tree, validation_value = Idhja22::Tree.train_and_validate_from_csv(File.join(File.dirname(__FILE__),'large_spec_data.csv'), 0.75)
+      csv_tree.is_a?(Idhja22::Tree).should be_true
+      (0..1).include?(validation_value).should be_true
+    end
+  end
+end

data/spec/version_spec.rb ADDED Viewed

@@ -0,0 +1,9 @@
+require 'spec_helper'
+describe Idhja22 do
+  describe 'VERSION' do
+    it 'should be current version' do
+      Idhja22::VERSION.should == '0.14.2'
+    end
+  end
+end

metadata ADDED Viewed

@@ -0,0 +1,149 @@
+--- !ruby/object:Gem::Specification
+name: idhja22
+version: !ruby/object:Gem::Version
+  version: 0.14.2
+  prerelease:
+platform: ruby
+authors:
+- Henry Addison
+autorequire:
+bindir: bin
+cert_chain: []
+date: 2012-12-17 00:00:00.000000000 Z
+dependencies:
+- !ruby/object:Gem::Dependency
+  name: rspec
+  requirement: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ~>
+      - !ruby/object:Gem::Version
+        version: '2.10'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ~>
+      - !ruby/object:Gem::Version
+        version: '2.10'
+- !ruby/object:Gem::Dependency
+  name: rake
+  requirement: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ! '>='
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ! '>='
+      - !ruby/object:Gem::Version
+        version: '0'
+- !ruby/object:Gem::Dependency
+  name: debugger
+  requirement: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ! '>='
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ! '>='
+      - !ruby/object:Gem::Version
+        version: '0'
+- !ruby/object:Gem::Dependency
+  name: simplecov
+  requirement: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ! '>='
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ! '>='
+      - !ruby/object:Gem::Version
+        version: '0'
+description: Decision Trees
+email:
+executables:
+- idhja22
+extensions: []
+extra_rdoc_files: []
+files:
+- .gitignore
+- .travis.yml
+- Gemfile
+- LICENSE.txt
+- README.md
+- Rakefile
+- bin/idhja22
+- idhja22.gemspec
+- lib/idhja22.rb
+- lib/idhja22/dataset.rb
+- lib/idhja22/dataset/datum.rb
+- lib/idhja22/dataset/errors.rb
+- lib/idhja22/dataset/tree_methods.rb
+- lib/idhja22/node.rb
+- lib/idhja22/tree.rb
+- lib/idhja22/version.rb
+- spec/another_large_spec_data.csv
+- spec/dataset/example_spec.rb
+- spec/dataset_spec.rb
+- spec/large_spec_data.csv
+- spec/node_spec.rb
+- spec/spec_data.csv
+- spec/spec_helper.rb
+- spec/tree_spec.rb
+- spec/version_spec.rb
+homepage: https://github.com/henryaddison/idhja22
+licenses: []
+post_install_message:
+rdoc_options: []
+require_paths:
+- lib
+required_ruby_version: !ruby/object:Gem::Requirement
+  none: false
+  requirements:
+  - - ! '>='
+    - !ruby/object:Gem::Version
+      version: '0'
+      segments:
+      - 0
+      hash: -4104544286961851710
+required_rubygems_version: !ruby/object:Gem::Requirement
+  none: false
+  requirements:
+  - - ! '>='
+    - !ruby/object:Gem::Version
+      version: '0'
+      segments:
+      - 0
+      hash: -4104544286961851710
+requirements: []
+rubyforge_project:
+rubygems_version: 1.8.24
+signing_key:
+specification_version: 3
+summary: A different take on decision trees
+test_files:
+- spec/another_large_spec_data.csv
+- spec/dataset/example_spec.rb
+- spec/dataset_spec.rb
+- spec/large_spec_data.csv
+- spec/node_spec.rb
+- spec/spec_data.csv
+- spec/spec_helper.rb
+- spec/tree_spec.rb
+- spec/version_spec.rb