idhja22 0.14.2 → 0.14.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.yardopts +1 -0
- data/idhja22.gemspec +3 -1
- data/lib/idhja22/node.rb +53 -1
- data/lib/idhja22/tree.rb +10 -51
- data/lib/idhja22/version.rb +1 -1
- data/spec/version_spec.rb +1 -1
- metadata +38 -4
data/.yardopts
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
--no-private lib/**/*.rb - LICENSE.txt
|
data/idhja22.gemspec
CHANGED
@@ -8,7 +8,7 @@ Gem::Specification.new do |gem|
|
|
8
8
|
gem.version = Idhja22::VERSION
|
9
9
|
gem.authors = ["Henry Addison"]
|
10
10
|
gem.description = %q{Decision Trees}
|
11
|
-
gem.summary = %q{A
|
11
|
+
gem.summary = %q{A gem for creating decision trees}
|
12
12
|
gem.homepage = "https://github.com/henryaddison/idhja22"
|
13
13
|
|
14
14
|
gem.files = `git ls-files`.split($/)
|
@@ -20,4 +20,6 @@ Gem::Specification.new do |gem|
|
|
20
20
|
gem.add_development_dependency "rake"
|
21
21
|
gem.add_development_dependency 'debugger'
|
22
22
|
gem.add_development_dependency 'simplecov'
|
23
|
+
gem.add_development_dependency 'yard'
|
24
|
+
gem.add_development_dependency 'redcarpet'
|
23
25
|
end
|
data/lib/idhja22/node.rb
CHANGED
@@ -1,5 +1,57 @@
|
|
1
1
|
module Idhja22
|
2
2
|
class Node
|
3
|
+
class << self
|
4
|
+
def build_node(dataset, attributes_available, depth, parent_probability = nil)
|
5
|
+
if(dataset.size < Idhja22::MIN_DATASET_SIZE)
|
6
|
+
return Idhja22::LeafNode.new(probability_guess(parent_probability, depth), dataset.category_label)
|
7
|
+
end
|
8
|
+
|
9
|
+
#if successful termination - create and return a leaf node
|
10
|
+
if(dataset.terminating? && depth > 0) # don't terminate without splitting the data at least once
|
11
|
+
return Idhja22::LeafNode.new(dataset.probability, dataset.category_label)
|
12
|
+
end
|
13
|
+
|
14
|
+
if(depth >= 3) # don't let trees get too long
|
15
|
+
return Idhja22::LeafNode.new(dataset.probability, dataset.category_label)
|
16
|
+
end
|
17
|
+
|
18
|
+
#if we have no more attributes left to split the dataset on, then return a leafnode
|
19
|
+
if(attributes_available.empty?)
|
20
|
+
return Idhja22::LeafNode.new(dataset.probability, dataset.category_label)
|
21
|
+
end
|
22
|
+
|
23
|
+
data_split, best_attribute = best_attribute(dataset, attributes_available)
|
24
|
+
|
25
|
+
node = Idhja22::DecisionNode.new(data_split, best_attribute, attributes_available-[best_attribute], depth, dataset.probability)
|
26
|
+
|
27
|
+
return node
|
28
|
+
end
|
29
|
+
|
30
|
+
private
|
31
|
+
def best_attribute(dataset, attributes_available)
|
32
|
+
data_split = best_attribute = nil
|
33
|
+
igain = - Float::INFINITY
|
34
|
+
|
35
|
+
attributes_available.each do |attr_label|
|
36
|
+
possible_split = dataset.partition(attr_label)
|
37
|
+
possible_igain = dataset.entropy
|
38
|
+
possible_split.each do |value, ds|
|
39
|
+
possible_igain -= (ds.size.to_f/dataset.size.to_f)*ds.entropy
|
40
|
+
end
|
41
|
+
if(possible_igain > igain)
|
42
|
+
igain = possible_igain
|
43
|
+
data_split = possible_split
|
44
|
+
best_attribute = attr_label
|
45
|
+
end
|
46
|
+
end
|
47
|
+
return data_split, best_attribute
|
48
|
+
end
|
49
|
+
|
50
|
+
def probability_guess(parent_probability, depth)
|
51
|
+
return (parent_probability + (Idhja22::DEFAULT_PROBABILITY-parent_probability)/2**depth)
|
52
|
+
end
|
53
|
+
end
|
54
|
+
|
3
55
|
def ==(other)
|
4
56
|
return self.class == other.class
|
5
57
|
end
|
@@ -11,7 +63,7 @@ module Idhja22
|
|
11
63
|
@decision_attribute = decision_attribute
|
12
64
|
@branches = {}
|
13
65
|
data_split.each do |value, dataset|
|
14
|
-
node =
|
66
|
+
node = Node.build_node(dataset, attributes_available, depth+1, parent_probability)
|
15
67
|
if(node.is_a?(DecisionNode) && node.branches.values.all? { |n| n.is_a?(LeafNode) })
|
16
68
|
probs = node.branches.values.collect(&:probability)
|
17
69
|
if(probs.max - probs.min < 0.01)
|
data/lib/idhja22/tree.rb
CHANGED
@@ -1,11 +1,16 @@
|
|
1
1
|
module Idhja22
|
2
|
+
# The main entry class for training, viewing and evaluating a decision tree.
|
2
3
|
class Tree
|
3
4
|
attr_accessor :root
|
4
5
|
class << self
|
6
|
+
# Trains a Tree using the provided Dataset.
|
5
7
|
def train(dataset)
|
6
8
|
new(dataset, dataset.attribute_labels)
|
7
9
|
end
|
8
10
|
|
11
|
+
# Takes a dataset and splits it randomly into training and validation data.
|
12
|
+
# Uses the training data to train a tree whose performance is then measured using the validation data.
|
13
|
+
# @param [Float] training_proportion Proportion of dataset to use for training. The rest will be used to validate the resulting tree.
|
9
14
|
def train_and_validate(dataset, training_proportion=0.5)
|
10
15
|
training_set, validation_set = dataset.split(training_proportion)
|
11
16
|
tree = self.train(training_set)
|
@@ -13,70 +18,24 @@ module Idhja22
|
|
13
18
|
return tree, validation_value
|
14
19
|
end
|
15
20
|
|
21
|
+
# see #train
|
22
|
+
# @note Takes a CSV filename rather than a Dataset
|
16
23
|
def train_from_csv(filename)
|
17
24
|
ds = Dataset.from_csv(filename)
|
18
25
|
train(ds)
|
19
26
|
end
|
20
27
|
|
28
|
+
# see #train_and_validate
|
29
|
+
# @note Takes a CSV filename rather than a Dataset
|
21
30
|
def train_and_validate_from_csv(filename, training_proportion=0.5)
|
22
31
|
ds = Dataset.from_csv(filename)
|
23
32
|
train_and_validate(ds, training_proportion)
|
24
33
|
end
|
25
|
-
|
26
|
-
def build_node(dataset, attributes_available, depth, parent_probability = nil)
|
27
|
-
if(dataset.size < Idhja22::MIN_DATASET_SIZE)
|
28
|
-
return Idhja22::LeafNode.new(probability_guess(parent_probability, depth), dataset.category_label)
|
29
|
-
end
|
30
|
-
|
31
|
-
#if successful termination - create and return a leaf node
|
32
|
-
if(dataset.terminating? && depth > 0) # don't terminate without splitting the data at least once
|
33
|
-
return Idhja22::LeafNode.new(dataset.probability, dataset.category_label)
|
34
|
-
end
|
35
|
-
|
36
|
-
if(depth >= 3) # don't let trees get too long
|
37
|
-
return Idhja22::LeafNode.new(dataset.probability, dataset.category_label)
|
38
|
-
end
|
39
|
-
|
40
|
-
#if we have no more attributes left to split the dataset on, then return a leafnode
|
41
|
-
if(attributes_available.empty?)
|
42
|
-
return Idhja22::LeafNode.new(dataset.probability, dataset.category_label)
|
43
|
-
end
|
44
|
-
|
45
|
-
data_split , best_attribute = best_attribute(dataset, attributes_available)
|
46
|
-
|
47
|
-
node = Idhja22::DecisionNode.new(data_split, best_attribute, attributes_available-[best_attribute], depth, dataset.probability)
|
48
|
-
|
49
|
-
return node
|
50
|
-
end
|
51
|
-
|
52
|
-
private
|
53
|
-
def best_attribute(dataset, attributes_available)
|
54
|
-
data_split = best_attribute = nil
|
55
|
-
igain = - Float::INFINITY
|
56
|
-
|
57
|
-
attributes_available.each do |attr_label|
|
58
|
-
possible_split = dataset.partition(attr_label)
|
59
|
-
possible_igain = dataset.entropy
|
60
|
-
possible_split.each do |value, ds|
|
61
|
-
possible_igain -= (ds.size.to_f/dataset.size.to_f)*ds.entropy
|
62
|
-
end
|
63
|
-
if(possible_igain > igain)
|
64
|
-
igain = possible_igain
|
65
|
-
data_split = possible_split
|
66
|
-
best_attribute = attr_label
|
67
|
-
end
|
68
|
-
end
|
69
|
-
return data_split, best_attribute
|
70
|
-
end
|
71
|
-
|
72
|
-
def probability_guess(parent_probability, depth)
|
73
|
-
return (parent_probability + (Idhja22::DEFAULT_PROBABILITY-parent_probability)/2**depth)
|
74
|
-
end
|
75
34
|
end
|
76
35
|
|
77
36
|
def initialize(dataset, attributes_available)
|
78
37
|
raise Idhja22::Dataset::InsufficientData, "require at least #{Idhja22::MIN_DATASET_SIZE} data points, only have #{dataset.size} in data set provided" if(dataset.size < Idhja22::MIN_DATASET_SIZE)
|
79
|
-
@root =
|
38
|
+
@root = Node.build_node(dataset, attributes_available, 0)
|
80
39
|
end
|
81
40
|
|
82
41
|
def get_rules
|
data/lib/idhja22/version.rb
CHANGED
data/spec/version_spec.rb
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: idhja22
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.14.
|
4
|
+
version: 0.14.3
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -75,6 +75,38 @@ dependencies:
|
|
75
75
|
- - ! '>='
|
76
76
|
- !ruby/object:Gem::Version
|
77
77
|
version: '0'
|
78
|
+
- !ruby/object:Gem::Dependency
|
79
|
+
name: yard
|
80
|
+
requirement: !ruby/object:Gem::Requirement
|
81
|
+
none: false
|
82
|
+
requirements:
|
83
|
+
- - ! '>='
|
84
|
+
- !ruby/object:Gem::Version
|
85
|
+
version: '0'
|
86
|
+
type: :development
|
87
|
+
prerelease: false
|
88
|
+
version_requirements: !ruby/object:Gem::Requirement
|
89
|
+
none: false
|
90
|
+
requirements:
|
91
|
+
- - ! '>='
|
92
|
+
- !ruby/object:Gem::Version
|
93
|
+
version: '0'
|
94
|
+
- !ruby/object:Gem::Dependency
|
95
|
+
name: redcarpet
|
96
|
+
requirement: !ruby/object:Gem::Requirement
|
97
|
+
none: false
|
98
|
+
requirements:
|
99
|
+
- - ! '>='
|
100
|
+
- !ruby/object:Gem::Version
|
101
|
+
version: '0'
|
102
|
+
type: :development
|
103
|
+
prerelease: false
|
104
|
+
version_requirements: !ruby/object:Gem::Requirement
|
105
|
+
none: false
|
106
|
+
requirements:
|
107
|
+
- - ! '>='
|
108
|
+
- !ruby/object:Gem::Version
|
109
|
+
version: '0'
|
78
110
|
description: Decision Trees
|
79
111
|
email:
|
80
112
|
executables:
|
@@ -84,6 +116,7 @@ extra_rdoc_files: []
|
|
84
116
|
files:
|
85
117
|
- .gitignore
|
86
118
|
- .travis.yml
|
119
|
+
- .yardopts
|
87
120
|
- Gemfile
|
88
121
|
- LICENSE.txt
|
89
122
|
- README.md
|
@@ -121,7 +154,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
121
154
|
version: '0'
|
122
155
|
segments:
|
123
156
|
- 0
|
124
|
-
hash:
|
157
|
+
hash: 2323453043414878291
|
125
158
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
126
159
|
none: false
|
127
160
|
requirements:
|
@@ -130,13 +163,13 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
130
163
|
version: '0'
|
131
164
|
segments:
|
132
165
|
- 0
|
133
|
-
hash:
|
166
|
+
hash: 2323453043414878291
|
134
167
|
requirements: []
|
135
168
|
rubyforge_project:
|
136
169
|
rubygems_version: 1.8.24
|
137
170
|
signing_key:
|
138
171
|
specification_version: 3
|
139
|
-
summary: A
|
172
|
+
summary: A gem for creating decision trees
|
140
173
|
test_files:
|
141
174
|
- spec/another_large_spec_data.csv
|
142
175
|
- spec/dataset/example_spec.rb
|
@@ -147,3 +180,4 @@ test_files:
|
|
147
180
|
- spec/spec_helper.rb
|
148
181
|
- spec/tree_spec.rb
|
149
182
|
- spec/version_spec.rb
|
183
|
+
has_rdoc:
|