idhja22 0.14.2 → 0.14.3
Sign up to get free protection for your applications and to get access to all the features.
- data/.yardopts +1 -0
- data/idhja22.gemspec +3 -1
- data/lib/idhja22/node.rb +53 -1
- data/lib/idhja22/tree.rb +10 -51
- data/lib/idhja22/version.rb +1 -1
- data/spec/version_spec.rb +1 -1
- metadata +38 -4
data/.yardopts
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
--no-private lib/**/*.rb - LICENSE.txt
|
data/idhja22.gemspec
CHANGED
@@ -8,7 +8,7 @@ Gem::Specification.new do |gem|
|
|
8
8
|
gem.version = Idhja22::VERSION
|
9
9
|
gem.authors = ["Henry Addison"]
|
10
10
|
gem.description = %q{Decision Trees}
|
11
|
-
gem.summary = %q{A
|
11
|
+
gem.summary = %q{A gem for creating decision trees}
|
12
12
|
gem.homepage = "https://github.com/henryaddison/idhja22"
|
13
13
|
|
14
14
|
gem.files = `git ls-files`.split($/)
|
@@ -20,4 +20,6 @@ Gem::Specification.new do |gem|
|
|
20
20
|
gem.add_development_dependency "rake"
|
21
21
|
gem.add_development_dependency 'debugger'
|
22
22
|
gem.add_development_dependency 'simplecov'
|
23
|
+
gem.add_development_dependency 'yard'
|
24
|
+
gem.add_development_dependency 'redcarpet'
|
23
25
|
end
|
data/lib/idhja22/node.rb
CHANGED
@@ -1,5 +1,57 @@
|
|
1
1
|
module Idhja22
|
2
2
|
class Node
|
3
|
+
class << self
|
4
|
+
def build_node(dataset, attributes_available, depth, parent_probability = nil)
|
5
|
+
if(dataset.size < Idhja22::MIN_DATASET_SIZE)
|
6
|
+
return Idhja22::LeafNode.new(probability_guess(parent_probability, depth), dataset.category_label)
|
7
|
+
end
|
8
|
+
|
9
|
+
#if successful termination - create and return a leaf node
|
10
|
+
if(dataset.terminating? && depth > 0) # don't terminate without splitting the data at least once
|
11
|
+
return Idhja22::LeafNode.new(dataset.probability, dataset.category_label)
|
12
|
+
end
|
13
|
+
|
14
|
+
if(depth >= 3) # don't let trees get too long
|
15
|
+
return Idhja22::LeafNode.new(dataset.probability, dataset.category_label)
|
16
|
+
end
|
17
|
+
|
18
|
+
#if we have no more attributes left to split the dataset on, then return a leafnode
|
19
|
+
if(attributes_available.empty?)
|
20
|
+
return Idhja22::LeafNode.new(dataset.probability, dataset.category_label)
|
21
|
+
end
|
22
|
+
|
23
|
+
data_split, best_attribute = best_attribute(dataset, attributes_available)
|
24
|
+
|
25
|
+
node = Idhja22::DecisionNode.new(data_split, best_attribute, attributes_available-[best_attribute], depth, dataset.probability)
|
26
|
+
|
27
|
+
return node
|
28
|
+
end
|
29
|
+
|
30
|
+
private
|
31
|
+
def best_attribute(dataset, attributes_available)
|
32
|
+
data_split = best_attribute = nil
|
33
|
+
igain = - Float::INFINITY
|
34
|
+
|
35
|
+
attributes_available.each do |attr_label|
|
36
|
+
possible_split = dataset.partition(attr_label)
|
37
|
+
possible_igain = dataset.entropy
|
38
|
+
possible_split.each do |value, ds|
|
39
|
+
possible_igain -= (ds.size.to_f/dataset.size.to_f)*ds.entropy
|
40
|
+
end
|
41
|
+
if(possible_igain > igain)
|
42
|
+
igain = possible_igain
|
43
|
+
data_split = possible_split
|
44
|
+
best_attribute = attr_label
|
45
|
+
end
|
46
|
+
end
|
47
|
+
return data_split, best_attribute
|
48
|
+
end
|
49
|
+
|
50
|
+
def probability_guess(parent_probability, depth)
|
51
|
+
return (parent_probability + (Idhja22::DEFAULT_PROBABILITY-parent_probability)/2**depth)
|
52
|
+
end
|
53
|
+
end
|
54
|
+
|
3
55
|
def ==(other)
|
4
56
|
return self.class == other.class
|
5
57
|
end
|
@@ -11,7 +63,7 @@ module Idhja22
|
|
11
63
|
@decision_attribute = decision_attribute
|
12
64
|
@branches = {}
|
13
65
|
data_split.each do |value, dataset|
|
14
|
-
node =
|
66
|
+
node = Node.build_node(dataset, attributes_available, depth+1, parent_probability)
|
15
67
|
if(node.is_a?(DecisionNode) && node.branches.values.all? { |n| n.is_a?(LeafNode) })
|
16
68
|
probs = node.branches.values.collect(&:probability)
|
17
69
|
if(probs.max - probs.min < 0.01)
|
data/lib/idhja22/tree.rb
CHANGED
@@ -1,11 +1,16 @@
|
|
1
1
|
module Idhja22
|
2
|
+
# The main entry class for training, viewing and evaluating a decision tree.
|
2
3
|
class Tree
|
3
4
|
attr_accessor :root
|
4
5
|
class << self
|
6
|
+
# Trains a Tree using the provided Dataset.
|
5
7
|
def train(dataset)
|
6
8
|
new(dataset, dataset.attribute_labels)
|
7
9
|
end
|
8
10
|
|
11
|
+
# Takes a dataset and splits it randomly into training and validation data.
|
12
|
+
# Uses the training data to train a tree whose performance is then measured using the validation data.
|
13
|
+
# @param [Float] training_proportion Proportion of dataset to use for training. The rest will be used to validate the resulting tree.
|
9
14
|
def train_and_validate(dataset, training_proportion=0.5)
|
10
15
|
training_set, validation_set = dataset.split(training_proportion)
|
11
16
|
tree = self.train(training_set)
|
@@ -13,70 +18,24 @@ module Idhja22
|
|
13
18
|
return tree, validation_value
|
14
19
|
end
|
15
20
|
|
21
|
+
# see #train
|
22
|
+
# @note Takes a CSV filename rather than a Dataset
|
16
23
|
def train_from_csv(filename)
|
17
24
|
ds = Dataset.from_csv(filename)
|
18
25
|
train(ds)
|
19
26
|
end
|
20
27
|
|
28
|
+
# see #train_and_validate
|
29
|
+
# @note Takes a CSV filename rather than a Dataset
|
21
30
|
def train_and_validate_from_csv(filename, training_proportion=0.5)
|
22
31
|
ds = Dataset.from_csv(filename)
|
23
32
|
train_and_validate(ds, training_proportion)
|
24
33
|
end
|
25
|
-
|
26
|
-
def build_node(dataset, attributes_available, depth, parent_probability = nil)
|
27
|
-
if(dataset.size < Idhja22::MIN_DATASET_SIZE)
|
28
|
-
return Idhja22::LeafNode.new(probability_guess(parent_probability, depth), dataset.category_label)
|
29
|
-
end
|
30
|
-
|
31
|
-
#if successful termination - create and return a leaf node
|
32
|
-
if(dataset.terminating? && depth > 0) # don't terminate without splitting the data at least once
|
33
|
-
return Idhja22::LeafNode.new(dataset.probability, dataset.category_label)
|
34
|
-
end
|
35
|
-
|
36
|
-
if(depth >= 3) # don't let trees get too long
|
37
|
-
return Idhja22::LeafNode.new(dataset.probability, dataset.category_label)
|
38
|
-
end
|
39
|
-
|
40
|
-
#if we have no more attributes left to split the dataset on, then return a leafnode
|
41
|
-
if(attributes_available.empty?)
|
42
|
-
return Idhja22::LeafNode.new(dataset.probability, dataset.category_label)
|
43
|
-
end
|
44
|
-
|
45
|
-
data_split , best_attribute = best_attribute(dataset, attributes_available)
|
46
|
-
|
47
|
-
node = Idhja22::DecisionNode.new(data_split, best_attribute, attributes_available-[best_attribute], depth, dataset.probability)
|
48
|
-
|
49
|
-
return node
|
50
|
-
end
|
51
|
-
|
52
|
-
private
|
53
|
-
def best_attribute(dataset, attributes_available)
|
54
|
-
data_split = best_attribute = nil
|
55
|
-
igain = - Float::INFINITY
|
56
|
-
|
57
|
-
attributes_available.each do |attr_label|
|
58
|
-
possible_split = dataset.partition(attr_label)
|
59
|
-
possible_igain = dataset.entropy
|
60
|
-
possible_split.each do |value, ds|
|
61
|
-
possible_igain -= (ds.size.to_f/dataset.size.to_f)*ds.entropy
|
62
|
-
end
|
63
|
-
if(possible_igain > igain)
|
64
|
-
igain = possible_igain
|
65
|
-
data_split = possible_split
|
66
|
-
best_attribute = attr_label
|
67
|
-
end
|
68
|
-
end
|
69
|
-
return data_split, best_attribute
|
70
|
-
end
|
71
|
-
|
72
|
-
def probability_guess(parent_probability, depth)
|
73
|
-
return (parent_probability + (Idhja22::DEFAULT_PROBABILITY-parent_probability)/2**depth)
|
74
|
-
end
|
75
34
|
end
|
76
35
|
|
77
36
|
def initialize(dataset, attributes_available)
|
78
37
|
raise Idhja22::Dataset::InsufficientData, "require at least #{Idhja22::MIN_DATASET_SIZE} data points, only have #{dataset.size} in data set provided" if(dataset.size < Idhja22::MIN_DATASET_SIZE)
|
79
|
-
@root =
|
38
|
+
@root = Node.build_node(dataset, attributes_available, 0)
|
80
39
|
end
|
81
40
|
|
82
41
|
def get_rules
|
data/lib/idhja22/version.rb
CHANGED
data/spec/version_spec.rb
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: idhja22
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.14.2
|
4
|
+
version: 0.14.3
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -75,6 +75,38 @@ dependencies:
|
|
75
75
|
- - ! '>='
|
76
76
|
- !ruby/object:Gem::Version
|
77
77
|
version: '0'
|
78
|
+
- !ruby/object:Gem::Dependency
|
79
|
+
name: yard
|
80
|
+
requirement: !ruby/object:Gem::Requirement
|
81
|
+
none: false
|
82
|
+
requirements:
|
83
|
+
- - ! '>='
|
84
|
+
- !ruby/object:Gem::Version
|
85
|
+
version: '0'
|
86
|
+
type: :development
|
87
|
+
prerelease: false
|
88
|
+
version_requirements: !ruby/object:Gem::Requirement
|
89
|
+
none: false
|
90
|
+
requirements:
|
91
|
+
- - ! '>='
|
92
|
+
- !ruby/object:Gem::Version
|
93
|
+
version: '0'
|
94
|
+
- !ruby/object:Gem::Dependency
|
95
|
+
name: redcarpet
|
96
|
+
requirement: !ruby/object:Gem::Requirement
|
97
|
+
none: false
|
98
|
+
requirements:
|
99
|
+
- - ! '>='
|
100
|
+
- !ruby/object:Gem::Version
|
101
|
+
version: '0'
|
102
|
+
type: :development
|
103
|
+
prerelease: false
|
104
|
+
version_requirements: !ruby/object:Gem::Requirement
|
105
|
+
none: false
|
106
|
+
requirements:
|
107
|
+
- - ! '>='
|
108
|
+
- !ruby/object:Gem::Version
|
109
|
+
version: '0'
|
78
110
|
description: Decision Trees
|
79
111
|
email:
|
80
112
|
executables:
|
@@ -84,6 +116,7 @@ extra_rdoc_files: []
|
|
84
116
|
files:
|
85
117
|
- .gitignore
|
86
118
|
- .travis.yml
|
119
|
+
- .yardopts
|
87
120
|
- Gemfile
|
88
121
|
- LICENSE.txt
|
89
122
|
- README.md
|
@@ -121,7 +154,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
121
154
|
version: '0'
|
122
155
|
segments:
|
123
156
|
- 0
|
124
|
-
hash:
|
157
|
+
hash: 2323453043414878291
|
125
158
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
126
159
|
none: false
|
127
160
|
requirements:
|
@@ -130,13 +163,13 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
130
163
|
version: '0'
|
131
164
|
segments:
|
132
165
|
- 0
|
133
|
-
hash:
|
166
|
+
hash: 2323453043414878291
|
134
167
|
requirements: []
|
135
168
|
rubyforge_project:
|
136
169
|
rubygems_version: 1.8.24
|
137
170
|
signing_key:
|
138
171
|
specification_version: 3
|
139
|
-
summary: A
|
172
|
+
summary: A gem for creating decision trees
|
140
173
|
test_files:
|
141
174
|
- spec/another_large_spec_data.csv
|
142
175
|
- spec/dataset/example_spec.rb
|
@@ -147,3 +180,4 @@ test_files:
|
|
147
180
|
- spec/spec_helper.rb
|
148
181
|
- spec/tree_spec.rb
|
149
182
|
- spec/version_spec.rb
|
183
|
+
has_rdoc:
|