idhja22 0.14.2 → 0.14.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.yardopts +1 -0
- data/idhja22.gemspec +3 -1
- data/lib/idhja22/node.rb +53 -1
- data/lib/idhja22/tree.rb +10 -51
- data/lib/idhja22/version.rb +1 -1
- data/spec/version_spec.rb +1 -1
- metadata +38 -4
    
        data/.yardopts
    ADDED
    
    | @@ -0,0 +1 @@ | |
| 1 | 
            +
            --no-private  lib/**/*.rb - LICENSE.txt
         | 
    
        data/idhja22.gemspec
    CHANGED
    
    | @@ -8,7 +8,7 @@ Gem::Specification.new do |gem| | |
| 8 8 | 
             
              gem.version       = Idhja22::VERSION
         | 
| 9 9 | 
             
              gem.authors       = ["Henry Addison"]
         | 
| 10 10 | 
             
              gem.description   = %q{Decision Trees}
         | 
| 11 | 
            -
              gem.summary       = %q{A  | 
| 11 | 
            +
              gem.summary       = %q{A gem for creating decision trees}
         | 
| 12 12 | 
             
              gem.homepage      = "https://github.com/henryaddison/idhja22"
         | 
| 13 13 |  | 
| 14 14 | 
             
              gem.files         = `git ls-files`.split($/)
         | 
| @@ -20,4 +20,6 @@ Gem::Specification.new do |gem| | |
| 20 20 | 
             
              gem.add_development_dependency "rake"
         | 
| 21 21 | 
             
              gem.add_development_dependency 'debugger'
         | 
| 22 22 | 
             
              gem.add_development_dependency 'simplecov'
         | 
| 23 | 
            +
              gem.add_development_dependency 'yard'
         | 
| 24 | 
            +
              gem.add_development_dependency 'redcarpet'
         | 
| 23 25 | 
             
            end
         | 
    
        data/lib/idhja22/node.rb
    CHANGED
    
    | @@ -1,5 +1,57 @@ | |
| 1 1 | 
             
            module Idhja22
         | 
| 2 2 | 
             
              class Node
         | 
| 3 | 
            +
                class << self
         | 
| 4 | 
            +
                  def build_node(dataset, attributes_available, depth, parent_probability = nil)
         | 
| 5 | 
            +
                    if(dataset.size < Idhja22::MIN_DATASET_SIZE)
         | 
| 6 | 
            +
                      return Idhja22::LeafNode.new(probability_guess(parent_probability, depth), dataset.category_label)
         | 
| 7 | 
            +
                    end
         | 
| 8 | 
            +
             | 
| 9 | 
            +
                    #if successful termination - create and return a leaf node
         | 
| 10 | 
            +
                    if(dataset.terminating? && depth > 0) # don't terminate without splitting the data at least once
         | 
| 11 | 
            +
                      return Idhja22::LeafNode.new(dataset.probability, dataset.category_label)
         | 
| 12 | 
            +
                    end
         | 
| 13 | 
            +
             | 
| 14 | 
            +
                    if(depth >= 3) # don't let trees get too long
         | 
| 15 | 
            +
                      return Idhja22::LeafNode.new(dataset.probability, dataset.category_label)
         | 
| 16 | 
            +
                    end
         | 
| 17 | 
            +
             | 
| 18 | 
            +
                    #if we have no more attributes left to split the dataset on, then return a leafnode
         | 
| 19 | 
            +
                    if(attributes_available.empty?)
         | 
| 20 | 
            +
                      return Idhja22::LeafNode.new(dataset.probability, dataset.category_label)
         | 
| 21 | 
            +
                    end
         | 
| 22 | 
            +
             | 
| 23 | 
            +
                    data_split, best_attribute = best_attribute(dataset, attributes_available)
         | 
| 24 | 
            +
             | 
| 25 | 
            +
                    node = Idhja22::DecisionNode.new(data_split, best_attribute, attributes_available-[best_attribute], depth, dataset.probability)
         | 
| 26 | 
            +
             | 
| 27 | 
            +
                    return node
         | 
| 28 | 
            +
                  end
         | 
| 29 | 
            +
             | 
| 30 | 
            +
                  private
         | 
| 31 | 
            +
                  def best_attribute(dataset, attributes_available)
         | 
| 32 | 
            +
                    data_split = best_attribute = nil
         | 
| 33 | 
            +
                    igain = - Float::INFINITY
         | 
| 34 | 
            +
             | 
| 35 | 
            +
                    attributes_available.each do |attr_label|
         | 
| 36 | 
            +
                      possible_split = dataset.partition(attr_label)
         | 
| 37 | 
            +
                      possible_igain = dataset.entropy
         | 
| 38 | 
            +
                      possible_split.each do |value, ds|
         | 
| 39 | 
            +
                        possible_igain -= (ds.size.to_f/dataset.size.to_f)*ds.entropy
         | 
| 40 | 
            +
                      end
         | 
| 41 | 
            +
                      if(possible_igain > igain)
         | 
| 42 | 
            +
                        igain = possible_igain
         | 
| 43 | 
            +
                        data_split = possible_split
         | 
| 44 | 
            +
                        best_attribute = attr_label
         | 
| 45 | 
            +
                      end
         | 
| 46 | 
            +
                    end
         | 
| 47 | 
            +
                    return data_split, best_attribute
         | 
| 48 | 
            +
                  end
         | 
| 49 | 
            +
             | 
| 50 | 
            +
                  def probability_guess(parent_probability, depth)
         | 
| 51 | 
            +
                    return (parent_probability + (Idhja22::DEFAULT_PROBABILITY-parent_probability)/2**depth)
         | 
| 52 | 
            +
                  end
         | 
| 53 | 
            +
                end
         | 
| 54 | 
            +
             | 
| 3 55 | 
             
                def ==(other)
         | 
| 4 56 | 
             
                  return self.class == other.class
         | 
| 5 57 | 
             
                end
         | 
| @@ -11,7 +63,7 @@ module Idhja22 | |
| 11 63 | 
             
                  @decision_attribute = decision_attribute
         | 
| 12 64 | 
             
                  @branches = {}
         | 
| 13 65 | 
             
                  data_split.each do |value, dataset|
         | 
| 14 | 
            -
                    node =  | 
| 66 | 
            +
                    node = Node.build_node(dataset, attributes_available, depth+1, parent_probability)
         | 
| 15 67 | 
             
                    if(node.is_a?(DecisionNode) && node.branches.values.all? { |n| n.is_a?(LeafNode) })
         | 
| 16 68 | 
             
                      probs = node.branches.values.collect(&:probability)
         | 
| 17 69 | 
             
                      if(probs.max - probs.min < 0.01)
         | 
    
        data/lib/idhja22/tree.rb
    CHANGED
    
    | @@ -1,11 +1,16 @@ | |
| 1 1 | 
             
            module Idhja22
         | 
| 2 | 
            +
              # The main entry class for training, viewing and evaluating a decision tree.
         | 
| 2 3 | 
             
              class Tree
         | 
| 3 4 | 
             
                attr_accessor :root
         | 
| 4 5 | 
             
                class << self
         | 
| 6 | 
            +
                  # Trains a Tree using the provided Dataset.
         | 
| 5 7 | 
             
                  def train(dataset)
         | 
| 6 8 | 
             
                    new(dataset, dataset.attribute_labels)
         | 
| 7 9 | 
             
                  end
         | 
| 8 10 |  | 
| 11 | 
            +
                  # Takes a dataset and splits it randomly into training and validation data. 
         | 
| 12 | 
            +
                  # Uses the training data to train a tree whose performance is then measured using the validation data.
         | 
| 13 | 
            +
                  # @param training_proportion [Float] Proportion of dataset to use for training. The rest will be used to validate the resulting tree.
         | 
| 9 14 | 
             
                  def train_and_validate(dataset, training_proportion=0.5)
         | 
| 10 15 | 
             
                    training_set, validation_set = dataset.split(training_proportion)
         | 
| 11 16 | 
             
                    tree = self.train(training_set)
         | 
| @@ -13,70 +18,24 @@ module Idhja22 | |
| 13 18 | 
             
                    return tree, validation_value
         | 
| 14 19 | 
             
                  end
         | 
| 15 20 |  | 
| 21 | 
            +
                  # see #train
         | 
| 22 | 
            +
                  # @note Takes a CSV filename rather than a Dataset
         | 
| 16 23 | 
             
                  def train_from_csv(filename)
         | 
| 17 24 | 
             
                    ds = Dataset.from_csv(filename)
         | 
| 18 25 | 
             
                    train(ds)
         | 
| 19 26 | 
             
                  end
         | 
| 20 27 |  | 
| 28 | 
            +
                  # see #train_and_validate
         | 
| 29 | 
            +
                  # @note Takes a CSV filename rather than a Dataset
         | 
| 21 30 | 
             
                  def train_and_validate_from_csv(filename, training_proportion=0.5)
         | 
| 22 31 | 
             
                    ds = Dataset.from_csv(filename)
         | 
| 23 32 | 
             
                    train_and_validate(ds, training_proportion)
         | 
| 24 33 | 
             
                  end
         | 
| 25 | 
            -
             | 
| 26 | 
            -
                  def build_node(dataset, attributes_available, depth, parent_probability = nil)
         | 
| 27 | 
            -
                    if(dataset.size < Idhja22::MIN_DATASET_SIZE)
         | 
| 28 | 
            -
                      return Idhja22::LeafNode.new(probability_guess(parent_probability, depth), dataset.category_label)
         | 
| 29 | 
            -
                    end
         | 
| 30 | 
            -
             | 
| 31 | 
            -
                    #if successful termination - create and return a leaf node
         | 
| 32 | 
            -
                    if(dataset.terminating? && depth > 0) # don't terminate without splitting the data at least once
         | 
| 33 | 
            -
                      return Idhja22::LeafNode.new(dataset.probability, dataset.category_label)
         | 
| 34 | 
            -
                    end
         | 
| 35 | 
            -
             | 
| 36 | 
            -
                    if(depth >= 3) # don't let trees get too long
         | 
| 37 | 
            -
                      return Idhja22::LeafNode.new(dataset.probability, dataset.category_label)
         | 
| 38 | 
            -
                    end
         | 
| 39 | 
            -
             | 
| 40 | 
            -
                    #if we have no more attributes left to split the dataset on, then return a leafnode
         | 
| 41 | 
            -
                    if(attributes_available.empty?)
         | 
| 42 | 
            -
                      return Idhja22::LeafNode.new(dataset.probability, dataset.category_label)
         | 
| 43 | 
            -
                    end
         | 
| 44 | 
            -
             | 
| 45 | 
            -
                    data_split , best_attribute = best_attribute(dataset, attributes_available)
         | 
| 46 | 
            -
             | 
| 47 | 
            -
                    node = Idhja22::DecisionNode.new(data_split, best_attribute, attributes_available-[best_attribute], depth, dataset.probability)
         | 
| 48 | 
            -
             | 
| 49 | 
            -
                    return node
         | 
| 50 | 
            -
                  end
         | 
| 51 | 
            -
             | 
| 52 | 
            -
                  private
         | 
| 53 | 
            -
                  def best_attribute(dataset, attributes_available)
         | 
| 54 | 
            -
                    data_split = best_attribute = nil
         | 
| 55 | 
            -
                    igain = - Float::INFINITY
         | 
| 56 | 
            -
             | 
| 57 | 
            -
                    attributes_available.each do |attr_label|
         | 
| 58 | 
            -
                      possible_split = dataset.partition(attr_label)
         | 
| 59 | 
            -
                      possible_igain = dataset.entropy
         | 
| 60 | 
            -
                      possible_split.each do |value, ds|
         | 
| 61 | 
            -
                        possible_igain -= (ds.size.to_f/dataset.size.to_f)*ds.entropy
         | 
| 62 | 
            -
                      end
         | 
| 63 | 
            -
                      if(possible_igain > igain)
         | 
| 64 | 
            -
                        igain = possible_igain
         | 
| 65 | 
            -
                        data_split = possible_split
         | 
| 66 | 
            -
                        best_attribute = attr_label
         | 
| 67 | 
            -
                      end
         | 
| 68 | 
            -
                    end
         | 
| 69 | 
            -
                    return data_split, best_attribute
         | 
| 70 | 
            -
                  end
         | 
| 71 | 
            -
             | 
| 72 | 
            -
                  def probability_guess(parent_probability, depth)
         | 
| 73 | 
            -
                    return (parent_probability + (Idhja22::DEFAULT_PROBABILITY-parent_probability)/2**depth)
         | 
| 74 | 
            -
                  end
         | 
| 75 34 | 
             
                end
         | 
| 76 35 |  | 
| 77 36 | 
             
                def initialize(dataset, attributes_available)
         | 
| 78 37 | 
             
                  raise Idhja22::Dataset::InsufficientData, "require at least #{Idhja22::MIN_DATASET_SIZE} data points, only have #{dataset.size} in data set provided" if(dataset.size < Idhja22::MIN_DATASET_SIZE)
         | 
| 79 | 
            -
                  @root =  | 
| 38 | 
            +
                  @root = Node.build_node(dataset, attributes_available, 0)
         | 
| 80 39 | 
             
                end
         | 
| 81 40 |  | 
| 82 41 | 
             
                def get_rules
         | 
    
        data/lib/idhja22/version.rb
    CHANGED
    
    
    
        data/spec/version_spec.rb
    CHANGED
    
    
    
        metadata
    CHANGED
    
    | @@ -1,7 +1,7 @@ | |
| 1 1 | 
             
            --- !ruby/object:Gem::Specification
         | 
| 2 2 | 
             
            name: idhja22
         | 
| 3 3 | 
             
            version: !ruby/object:Gem::Version
         | 
| 4 | 
            -
              version: 0.14. | 
| 4 | 
            +
              version: 0.14.3
         | 
| 5 5 | 
             
              prerelease: 
         | 
| 6 6 | 
             
            platform: ruby
         | 
| 7 7 | 
             
            authors:
         | 
| @@ -75,6 +75,38 @@ dependencies: | |
| 75 75 | 
             
                - - ! '>='
         | 
| 76 76 | 
             
                  - !ruby/object:Gem::Version
         | 
| 77 77 | 
             
                    version: '0'
         | 
| 78 | 
            +
            - !ruby/object:Gem::Dependency
         | 
| 79 | 
            +
              name: yard
         | 
| 80 | 
            +
              requirement: !ruby/object:Gem::Requirement
         | 
| 81 | 
            +
                none: false
         | 
| 82 | 
            +
                requirements:
         | 
| 83 | 
            +
                - - ! '>='
         | 
| 84 | 
            +
                  - !ruby/object:Gem::Version
         | 
| 85 | 
            +
                    version: '0'
         | 
| 86 | 
            +
              type: :development
         | 
| 87 | 
            +
              prerelease: false
         | 
| 88 | 
            +
              version_requirements: !ruby/object:Gem::Requirement
         | 
| 89 | 
            +
                none: false
         | 
| 90 | 
            +
                requirements:
         | 
| 91 | 
            +
                - - ! '>='
         | 
| 92 | 
            +
                  - !ruby/object:Gem::Version
         | 
| 93 | 
            +
                    version: '0'
         | 
| 94 | 
            +
            - !ruby/object:Gem::Dependency
         | 
| 95 | 
            +
              name: redcarpet
         | 
| 96 | 
            +
              requirement: !ruby/object:Gem::Requirement
         | 
| 97 | 
            +
                none: false
         | 
| 98 | 
            +
                requirements:
         | 
| 99 | 
            +
                - - ! '>='
         | 
| 100 | 
            +
                  - !ruby/object:Gem::Version
         | 
| 101 | 
            +
                    version: '0'
         | 
| 102 | 
            +
              type: :development
         | 
| 103 | 
            +
              prerelease: false
         | 
| 104 | 
            +
              version_requirements: !ruby/object:Gem::Requirement
         | 
| 105 | 
            +
                none: false
         | 
| 106 | 
            +
                requirements:
         | 
| 107 | 
            +
                - - ! '>='
         | 
| 108 | 
            +
                  - !ruby/object:Gem::Version
         | 
| 109 | 
            +
                    version: '0'
         | 
| 78 110 | 
             
            description: Decision Trees
         | 
| 79 111 | 
             
            email: 
         | 
| 80 112 | 
             
            executables:
         | 
| @@ -84,6 +116,7 @@ extra_rdoc_files: [] | |
| 84 116 | 
             
            files:
         | 
| 85 117 | 
             
            - .gitignore
         | 
| 86 118 | 
             
            - .travis.yml
         | 
| 119 | 
            +
            - .yardopts
         | 
| 87 120 | 
             
            - Gemfile
         | 
| 88 121 | 
             
            - LICENSE.txt
         | 
| 89 122 | 
             
            - README.md
         | 
| @@ -121,7 +154,7 @@ required_ruby_version: !ruby/object:Gem::Requirement | |
| 121 154 | 
             
                  version: '0'
         | 
| 122 155 | 
             
                  segments:
         | 
| 123 156 | 
             
                  - 0
         | 
| 124 | 
            -
                  hash:  | 
| 157 | 
            +
                  hash: 2323453043414878291
         | 
| 125 158 | 
             
            required_rubygems_version: !ruby/object:Gem::Requirement
         | 
| 126 159 | 
             
              none: false
         | 
| 127 160 | 
             
              requirements:
         | 
| @@ -130,13 +163,13 @@ required_rubygems_version: !ruby/object:Gem::Requirement | |
| 130 163 | 
             
                  version: '0'
         | 
| 131 164 | 
             
                  segments:
         | 
| 132 165 | 
             
                  - 0
         | 
| 133 | 
            -
                  hash:  | 
| 166 | 
            +
                  hash: 2323453043414878291
         | 
| 134 167 | 
             
            requirements: []
         | 
| 135 168 | 
             
            rubyforge_project: 
         | 
| 136 169 | 
             
            rubygems_version: 1.8.24
         | 
| 137 170 | 
             
            signing_key: 
         | 
| 138 171 | 
             
            specification_version: 3
         | 
| 139 | 
            -
            summary: A  | 
| 172 | 
            +
            summary: A gem for creating decision trees
         | 
| 140 173 | 
             
            test_files:
         | 
| 141 174 | 
             
            - spec/another_large_spec_data.csv
         | 
| 142 175 | 
             
            - spec/dataset/example_spec.rb
         | 
| @@ -147,3 +180,4 @@ test_files: | |
| 147 180 | 
             
            - spec/spec_helper.rb
         | 
| 148 181 | 
             
            - spec/tree_spec.rb
         | 
| 149 182 | 
             
            - spec/version_spec.rb
         | 
| 183 | 
            +
            has_rdoc: 
         |