idhja22 0.14.2 → 0.14.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/.yardopts ADDED
@@ -0,0 +1 @@
1
+ --no-private lib/**/*.rb - LICENSE.txt
data/idhja22.gemspec CHANGED
@@ -8,7 +8,7 @@ Gem::Specification.new do |gem|
8
8
  gem.version = Idhja22::VERSION
9
9
  gem.authors = ["Henry Addison"]
10
10
  gem.description = %q{Decision Trees}
11
- gem.summary = %q{A different take on decision trees}
11
+ gem.summary = %q{A gem for creating decision trees}
12
12
  gem.homepage = "https://github.com/henryaddison/idhja22"
13
13
 
14
14
  gem.files = `git ls-files`.split($/)
@@ -20,4 +20,6 @@ Gem::Specification.new do |gem|
20
20
  gem.add_development_dependency "rake"
21
21
  gem.add_development_dependency 'debugger'
22
22
  gem.add_development_dependency 'simplecov'
23
+ gem.add_development_dependency 'yard'
24
+ gem.add_development_dependency 'redcarpet'
23
25
  end
data/lib/idhja22/node.rb CHANGED
@@ -1,5 +1,57 @@
1
1
  module Idhja22
2
2
  class Node
3
+ class << self
4
+ def build_node(dataset, attributes_available, depth, parent_probability = nil)
5
+ if(dataset.size < Idhja22::MIN_DATASET_SIZE)
6
+ return Idhja22::LeafNode.new(probability_guess(parent_probability, depth), dataset.category_label)
7
+ end
8
+
9
+ #if successful termination - create and return a leaf node
10
+ if(dataset.terminating? && depth > 0) # don't terminate without splitting the data at least once
11
+ return Idhja22::LeafNode.new(dataset.probability, dataset.category_label)
12
+ end
13
+
14
+ if(depth >= 3) # don't let trees get too long
15
+ return Idhja22::LeafNode.new(dataset.probability, dataset.category_label)
16
+ end
17
+
18
+ #if we have no more attributes left to split the dataset on, then return a leafnode
19
+ if(attributes_available.empty?)
20
+ return Idhja22::LeafNode.new(dataset.probability, dataset.category_label)
21
+ end
22
+
23
+ data_split, best_attribute = best_attribute(dataset, attributes_available)
24
+
25
+ node = Idhja22::DecisionNode.new(data_split, best_attribute, attributes_available-[best_attribute], depth, dataset.probability)
26
+
27
+ return node
28
+ end
29
+
30
+ private
31
+ def best_attribute(dataset, attributes_available)
32
+ data_split = best_attribute = nil
33
+ igain = - Float::INFINITY
34
+
35
+ attributes_available.each do |attr_label|
36
+ possible_split = dataset.partition(attr_label)
37
+ possible_igain = dataset.entropy
38
+ possible_split.each do |value, ds|
39
+ possible_igain -= (ds.size.to_f/dataset.size.to_f)*ds.entropy
40
+ end
41
+ if(possible_igain > igain)
42
+ igain = possible_igain
43
+ data_split = possible_split
44
+ best_attribute = attr_label
45
+ end
46
+ end
47
+ return data_split, best_attribute
48
+ end
49
+
50
+ def probability_guess(parent_probability, depth)
51
+ return (parent_probability + (Idhja22::DEFAULT_PROBABILITY-parent_probability)/2**depth)
52
+ end
53
+ end
54
+
3
55
  def ==(other)
4
56
  return self.class == other.class
5
57
  end
@@ -11,7 +63,7 @@ module Idhja22
11
63
  @decision_attribute = decision_attribute
12
64
  @branches = {}
13
65
  data_split.each do |value, dataset|
14
- node = Tree.build_node(dataset, attributes_available, depth+1, parent_probability)
66
+ node = Node.build_node(dataset, attributes_available, depth+1, parent_probability)
15
67
  if(node.is_a?(DecisionNode) && node.branches.values.all? { |n| n.is_a?(LeafNode) })
16
68
  probs = node.branches.values.collect(&:probability)
17
69
  if(probs.max - probs.min < 0.01)
data/lib/idhja22/tree.rb CHANGED
@@ -1,11 +1,16 @@
1
1
  module Idhja22
2
+ # The main entry class for training, viewing and evaluating a decision tree.
2
3
  class Tree
3
4
  attr_accessor :root
4
5
  class << self
6
+ # Trains a Tree using the provided Dataset.
5
7
  def train(dataset)
6
8
  new(dataset, dataset.attribute_labels)
7
9
  end
8
10
 
11
+ # Takes a dataset and splits it randomly into training and validation data.
12
+ # Uses the training data to train a tree whose performance is then measured using the validation data.
13
+ # @param training_proportion [Float] Proportion of dataset to use for training. The rest will be used to validate the resulting tree.
9
14
  def train_and_validate(dataset, training_proportion=0.5)
10
15
  training_set, validation_set = dataset.split(training_proportion)
11
16
  tree = self.train(training_set)
@@ -13,70 +18,24 @@ module Idhja22
13
18
  return tree, validation_value
14
19
  end
15
20
 
21
+ # see #train
22
+ # @note Takes a CSV filename rather than a Dataset
16
23
  def train_from_csv(filename)
17
24
  ds = Dataset.from_csv(filename)
18
25
  train(ds)
19
26
  end
20
27
 
28
+ # see #train_and_validate
29
+ # @note Takes a CSV filename rather than a Dataset
21
30
  def train_and_validate_from_csv(filename, training_proportion=0.5)
22
31
  ds = Dataset.from_csv(filename)
23
32
  train_and_validate(ds, training_proportion)
24
33
  end
25
-
26
- def build_node(dataset, attributes_available, depth, parent_probability = nil)
27
- if(dataset.size < Idhja22::MIN_DATASET_SIZE)
28
- return Idhja22::LeafNode.new(probability_guess(parent_probability, depth), dataset.category_label)
29
- end
30
-
31
- #if successful termination - create and return a leaf node
32
- if(dataset.terminating? && depth > 0) # don't terminate without splitting the data at least once
33
- return Idhja22::LeafNode.new(dataset.probability, dataset.category_label)
34
- end
35
-
36
- if(depth >= 3) # don't let trees get too long
37
- return Idhja22::LeafNode.new(dataset.probability, dataset.category_label)
38
- end
39
-
40
- #if we have no more attributes left to split the dataset on, then return a leafnode
41
- if(attributes_available.empty?)
42
- return Idhja22::LeafNode.new(dataset.probability, dataset.category_label)
43
- end
44
-
45
- data_split , best_attribute = best_attribute(dataset, attributes_available)
46
-
47
- node = Idhja22::DecisionNode.new(data_split, best_attribute, attributes_available-[best_attribute], depth, dataset.probability)
48
-
49
- return node
50
- end
51
-
52
- private
53
- def best_attribute(dataset, attributes_available)
54
- data_split = best_attribute = nil
55
- igain = - Float::INFINITY
56
-
57
- attributes_available.each do |attr_label|
58
- possible_split = dataset.partition(attr_label)
59
- possible_igain = dataset.entropy
60
- possible_split.each do |value, ds|
61
- possible_igain -= (ds.size.to_f/dataset.size.to_f)*ds.entropy
62
- end
63
- if(possible_igain > igain)
64
- igain = possible_igain
65
- data_split = possible_split
66
- best_attribute = attr_label
67
- end
68
- end
69
- return data_split, best_attribute
70
- end
71
-
72
- def probability_guess(parent_probability, depth)
73
- return (parent_probability + (Idhja22::DEFAULT_PROBABILITY-parent_probability)/2**depth)
74
- end
75
34
  end
76
35
 
77
36
  def initialize(dataset, attributes_available)
78
37
  raise Idhja22::Dataset::InsufficientData, "require at least #{Idhja22::MIN_DATASET_SIZE} data points, only have #{dataset.size} in data set provided" if(dataset.size < Idhja22::MIN_DATASET_SIZE)
79
- @root = self.class.build_node(dataset, attributes_available, 0)
38
+ @root = Node.build_node(dataset, attributes_available, 0)
80
39
  end
81
40
 
82
41
  def get_rules
@@ -1,3 +1,3 @@
1
1
  module Idhja22
2
- VERSION = "0.14.2"
2
+ VERSION = "0.14.3"
3
3
  end
data/spec/version_spec.rb CHANGED
@@ -3,7 +3,7 @@ require 'spec_helper'
3
3
  describe Idhja22 do
4
4
  describe 'VERSION' do
5
5
  it 'should be current version' do
6
- Idhja22::VERSION.should == '0.14.2'
6
+ Idhja22::VERSION.should == '0.14.3'
7
7
  end
8
8
  end
9
9
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: idhja22
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.14.2
4
+ version: 0.14.3
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -75,6 +75,38 @@ dependencies:
75
75
  - - ! '>='
76
76
  - !ruby/object:Gem::Version
77
77
  version: '0'
78
+ - !ruby/object:Gem::Dependency
79
+ name: yard
80
+ requirement: !ruby/object:Gem::Requirement
81
+ none: false
82
+ requirements:
83
+ - - ! '>='
84
+ - !ruby/object:Gem::Version
85
+ version: '0'
86
+ type: :development
87
+ prerelease: false
88
+ version_requirements: !ruby/object:Gem::Requirement
89
+ none: false
90
+ requirements:
91
+ - - ! '>='
92
+ - !ruby/object:Gem::Version
93
+ version: '0'
94
+ - !ruby/object:Gem::Dependency
95
+ name: redcarpet
96
+ requirement: !ruby/object:Gem::Requirement
97
+ none: false
98
+ requirements:
99
+ - - ! '>='
100
+ - !ruby/object:Gem::Version
101
+ version: '0'
102
+ type: :development
103
+ prerelease: false
104
+ version_requirements: !ruby/object:Gem::Requirement
105
+ none: false
106
+ requirements:
107
+ - - ! '>='
108
+ - !ruby/object:Gem::Version
109
+ version: '0'
78
110
  description: Decision Trees
79
111
  email:
80
112
  executables:
@@ -84,6 +116,7 @@ extra_rdoc_files: []
84
116
  files:
85
117
  - .gitignore
86
118
  - .travis.yml
119
+ - .yardopts
87
120
  - Gemfile
88
121
  - LICENSE.txt
89
122
  - README.md
@@ -121,7 +154,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
121
154
  version: '0'
122
155
  segments:
123
156
  - 0
124
- hash: -4104544286961851710
157
+ hash: 2323453043414878291
125
158
  required_rubygems_version: !ruby/object:Gem::Requirement
126
159
  none: false
127
160
  requirements:
@@ -130,13 +163,13 @@ required_rubygems_version: !ruby/object:Gem::Requirement
130
163
  version: '0'
131
164
  segments:
132
165
  - 0
133
- hash: -4104544286961851710
166
+ hash: 2323453043414878291
134
167
  requirements: []
135
168
  rubyforge_project:
136
169
  rubygems_version: 1.8.24
137
170
  signing_key:
138
171
  specification_version: 3
139
- summary: A different take on decision trees
172
+ summary: A gem for creating decision trees
140
173
  test_files:
141
174
  - spec/another_large_spec_data.csv
142
175
  - spec/dataset/example_spec.rb
@@ -147,3 +180,4 @@ test_files:
147
180
  - spec/spec_helper.rb
148
181
  - spec/tree_spec.rb
149
182
  - spec/version_spec.rb
183
+ has_rdoc: