idhja22 0.14.2 → 0.14.3

Sign up to get free protection for your applications and to get access to all the features.
data/.yardopts ADDED
@@ -0,0 +1 @@
1
+ --no-private lib/**/*.rb - LICENSE.txt
data/idhja22.gemspec CHANGED
@@ -8,7 +8,7 @@ Gem::Specification.new do |gem|
8
8
  gem.version = Idhja22::VERSION
9
9
  gem.authors = ["Henry Addison"]
10
10
  gem.description = %q{Decision Trees}
11
- gem.summary = %q{A different take on decision trees}
11
+ gem.summary = %q{A gem for creating decision trees}
12
12
  gem.homepage = "https://github.com/henryaddison/idhja22"
13
13
 
14
14
  gem.files = `git ls-files`.split($/)
@@ -20,4 +20,6 @@ Gem::Specification.new do |gem|
20
20
  gem.add_development_dependency "rake"
21
21
  gem.add_development_dependency 'debugger'
22
22
  gem.add_development_dependency 'simplecov'
23
+ gem.add_development_dependency 'yard'
24
+ gem.add_development_dependency 'redcarpet'
23
25
  end
data/lib/idhja22/node.rb CHANGED
@@ -1,5 +1,57 @@
1
1
  module Idhja22
2
2
  class Node
3
+ class << self
4
+ def build_node(dataset, attributes_available, depth, parent_probability = nil)
5
+ if(dataset.size < Idhja22::MIN_DATASET_SIZE)
6
+ return Idhja22::LeafNode.new(probability_guess(parent_probability, depth), dataset.category_label)
7
+ end
8
+
9
+ #if successful termination - create and return a leaf node
10
+ if(dataset.terminating? && depth > 0) # don't terminate without splitting the data at least once
11
+ return Idhja22::LeafNode.new(dataset.probability, dataset.category_label)
12
+ end
13
+
14
+ if(depth >= 3) # don't let trees get too long
15
+ return Idhja22::LeafNode.new(dataset.probability, dataset.category_label)
16
+ end
17
+
18
+ #if we have no more attributes left to split the dataset on, then return a leafnode
19
+ if(attributes_available.empty?)
20
+ return Idhja22::LeafNode.new(dataset.probability, dataset.category_label)
21
+ end
22
+
23
+ data_split, best_attribute = best_attribute(dataset, attributes_available)
24
+
25
+ node = Idhja22::DecisionNode.new(data_split, best_attribute, attributes_available-[best_attribute], depth, dataset.probability)
26
+
27
+ return node
28
+ end
29
+
30
+ private
31
+ def best_attribute(dataset, attributes_available)
32
+ data_split = best_attribute = nil
33
+ igain = - Float::INFINITY
34
+
35
+ attributes_available.each do |attr_label|
36
+ possible_split = dataset.partition(attr_label)
37
+ possible_igain = dataset.entropy
38
+ possible_split.each do |value, ds|
39
+ possible_igain -= (ds.size.to_f/dataset.size.to_f)*ds.entropy
40
+ end
41
+ if(possible_igain > igain)
42
+ igain = possible_igain
43
+ data_split = possible_split
44
+ best_attribute = attr_label
45
+ end
46
+ end
47
+ return data_split, best_attribute
48
+ end
49
+
50
+ def probability_guess(parent_probability, depth)
51
+ return (parent_probability + (Idhja22::DEFAULT_PROBABILITY-parent_probability)/2**depth)
52
+ end
53
+ end
54
+
3
55
  def ==(other)
4
56
  return self.class == other.class
5
57
  end
@@ -11,7 +63,7 @@ module Idhja22
11
63
  @decision_attribute = decision_attribute
12
64
  @branches = {}
13
65
  data_split.each do |value, dataset|
14
- node = Tree.build_node(dataset, attributes_available, depth+1, parent_probability)
66
+ node = Node.build_node(dataset, attributes_available, depth+1, parent_probability)
15
67
  if(node.is_a?(DecisionNode) && node.branches.values.all? { |n| n.is_a?(LeafNode) })
16
68
  probs = node.branches.values.collect(&:probability)
17
69
  if(probs.max - probs.min < 0.01)
data/lib/idhja22/tree.rb CHANGED
@@ -1,11 +1,16 @@
1
1
  module Idhja22
2
+ # The main entry class for training, viewing and evaluating a decision tree.
2
3
  class Tree
3
4
  attr_accessor :root
4
5
  class << self
6
+ # Trains a Tree using the provided Dataset.
5
7
  def train(dataset)
6
8
  new(dataset, dataset.attribute_labels)
7
9
  end
8
10
 
11
+ # Takes a dataset and splits it randomly into training and validation data.
12
+ # Uses the training data to train a tree whose performance is then measured using the validation data.
13
+ # @param training_proportion [Float] Proportion of dataset to use for training. The rest will be used to validate the resulting tree.
9
14
  def train_and_validate(dataset, training_proportion=0.5)
10
15
  training_set, validation_set = dataset.split(training_proportion)
11
16
  tree = self.train(training_set)
@@ -13,70 +18,24 @@ module Idhja22
13
18
  return tree, validation_value
14
19
  end
15
20
 
21
+ # @see .train
22
+ # @note Takes a CSV filename rather than a Dataset
16
23
  def train_from_csv(filename)
17
24
  ds = Dataset.from_csv(filename)
18
25
  train(ds)
19
26
  end
20
27
 
28
+ # @see .train_and_validate
29
+ # @note Takes a CSV filename rather than a Dataset
21
30
  def train_and_validate_from_csv(filename, training_proportion=0.5)
22
31
  ds = Dataset.from_csv(filename)
23
32
  train_and_validate(ds, training_proportion)
24
33
  end
25
-
26
- def build_node(dataset, attributes_available, depth, parent_probability = nil)
27
- if(dataset.size < Idhja22::MIN_DATASET_SIZE)
28
- return Idhja22::LeafNode.new(probability_guess(parent_probability, depth), dataset.category_label)
29
- end
30
-
31
- #if successful termination - create and return a leaf node
32
- if(dataset.terminating? && depth > 0) # don't terminate without splitting the data at least once
33
- return Idhja22::LeafNode.new(dataset.probability, dataset.category_label)
34
- end
35
-
36
- if(depth >= 3) # don't let trees get too long
37
- return Idhja22::LeafNode.new(dataset.probability, dataset.category_label)
38
- end
39
-
40
- #if we have no more attributes left to split the dataset on, then return a leafnode
41
- if(attributes_available.empty?)
42
- return Idhja22::LeafNode.new(dataset.probability, dataset.category_label)
43
- end
44
-
45
- data_split , best_attribute = best_attribute(dataset, attributes_available)
46
-
47
- node = Idhja22::DecisionNode.new(data_split, best_attribute, attributes_available-[best_attribute], depth, dataset.probability)
48
-
49
- return node
50
- end
51
-
52
- private
53
- def best_attribute(dataset, attributes_available)
54
- data_split = best_attribute = nil
55
- igain = - Float::INFINITY
56
-
57
- attributes_available.each do |attr_label|
58
- possible_split = dataset.partition(attr_label)
59
- possible_igain = dataset.entropy
60
- possible_split.each do |value, ds|
61
- possible_igain -= (ds.size.to_f/dataset.size.to_f)*ds.entropy
62
- end
63
- if(possible_igain > igain)
64
- igain = possible_igain
65
- data_split = possible_split
66
- best_attribute = attr_label
67
- end
68
- end
69
- return data_split, best_attribute
70
- end
71
-
72
- def probability_guess(parent_probability, depth)
73
- return (parent_probability + (Idhja22::DEFAULT_PROBABILITY-parent_probability)/2**depth)
74
- end
75
34
  end
76
35
 
77
36
  def initialize(dataset, attributes_available)
78
37
  raise Idhja22::Dataset::InsufficientData, "require at least #{Idhja22::MIN_DATASET_SIZE} data points, only have #{dataset.size} in data set provided" if(dataset.size < Idhja22::MIN_DATASET_SIZE)
79
- @root = self.class.build_node(dataset, attributes_available, 0)
38
+ @root = Node.build_node(dataset, attributes_available, 0)
80
39
  end
81
40
 
82
41
  def get_rules
@@ -1,3 +1,3 @@
1
1
  module Idhja22
2
- VERSION = "0.14.2"
2
+ VERSION = "0.14.3"
3
3
  end
data/spec/version_spec.rb CHANGED
@@ -3,7 +3,7 @@ require 'spec_helper'
3
3
  describe Idhja22 do
4
4
  describe 'VERSION' do
5
5
  it 'should be current version' do
6
- Idhja22::VERSION.should == '0.14.2'
6
+ Idhja22::VERSION.should == '0.14.3'
7
7
  end
8
8
  end
9
9
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: idhja22
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.14.2
4
+ version: 0.14.3
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -75,6 +75,38 @@ dependencies:
75
75
  - - ! '>='
76
76
  - !ruby/object:Gem::Version
77
77
  version: '0'
78
+ - !ruby/object:Gem::Dependency
79
+ name: yard
80
+ requirement: !ruby/object:Gem::Requirement
81
+ none: false
82
+ requirements:
83
+ - - ! '>='
84
+ - !ruby/object:Gem::Version
85
+ version: '0'
86
+ type: :development
87
+ prerelease: false
88
+ version_requirements: !ruby/object:Gem::Requirement
89
+ none: false
90
+ requirements:
91
+ - - ! '>='
92
+ - !ruby/object:Gem::Version
93
+ version: '0'
94
+ - !ruby/object:Gem::Dependency
95
+ name: redcarpet
96
+ requirement: !ruby/object:Gem::Requirement
97
+ none: false
98
+ requirements:
99
+ - - ! '>='
100
+ - !ruby/object:Gem::Version
101
+ version: '0'
102
+ type: :development
103
+ prerelease: false
104
+ version_requirements: !ruby/object:Gem::Requirement
105
+ none: false
106
+ requirements:
107
+ - - ! '>='
108
+ - !ruby/object:Gem::Version
109
+ version: '0'
78
110
  description: Decision Trees
79
111
  email:
80
112
  executables:
@@ -84,6 +116,7 @@ extra_rdoc_files: []
84
116
  files:
85
117
  - .gitignore
86
118
  - .travis.yml
119
+ - .yardopts
87
120
  - Gemfile
88
121
  - LICENSE.txt
89
122
  - README.md
@@ -121,7 +154,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
121
154
  version: '0'
122
155
  segments:
123
156
  - 0
124
- hash: -4104544286961851710
157
+ hash: 2323453043414878291
125
158
  required_rubygems_version: !ruby/object:Gem::Requirement
126
159
  none: false
127
160
  requirements:
@@ -130,13 +163,13 @@ required_rubygems_version: !ruby/object:Gem::Requirement
130
163
  version: '0'
131
164
  segments:
132
165
  - 0
133
- hash: -4104544286961851710
166
+ hash: 2323453043414878291
134
167
  requirements: []
135
168
  rubyforge_project:
136
169
  rubygems_version: 1.8.24
137
170
  signing_key:
138
171
  specification_version: 3
139
- summary: A different take on decision trees
172
+ summary: A gem for creating decision trees
140
173
  test_files:
141
174
  - spec/another_large_spec_data.csv
142
175
  - spec/dataset/example_spec.rb
@@ -147,3 +180,4 @@ test_files:
147
180
  - spec/spec_helper.rb
148
181
  - spec/tree_spec.rb
149
182
  - spec/version_spec.rb
183
+ has_rdoc: