baobab 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- data/lib/baobab.rb +4 -0
- data/lib/baobab/dataset.rb +74 -0
- data/lib/baobab/node.rb +189 -0
- data/lib/baobab/shannon.rb +13 -0
- data/lib/baobab/tree.rb +64 -0
- metadata +59 -0
data/lib/baobab/dataset.rb
ADDED
@@ -0,0 +1,74 @@
|
|
1
|
+
|
2
|
+
# Represents a dataset or subset thereof.
# Is an array of hashes where all hashes contain the same keys.
class Dataset < Array

  # Receives an array of hashes. All hashes must contain the same keys
  # (use #validate to check that invariant).
  def initialize data
    data.each do |row|
      self << row
    end
  end

  # Loads a dataset from a JSON file containing an array of objects.
  def self.from_json filename
    text = File.read(filename)
    self.new JSON.parse(text)
  end

  # Returns the column names minus the class variable.
  def attribute_names class_var
    self.column_names.reject { |name| name == class_var }
  end

  # Returns an array of the attribute names in the dataset.
  # It's empty on an empty set. (Fixed: the previous version read
  # self[0].keys unconditionally and raised NoMethodError on nil for an
  # empty dataset, contradicting its own documentation.)
  def column_names
    self.empty? ? [] : self[0].keys
  end

  # Returns an array of the distinct values of an attribute in the dataset.
  # Careful: it's empty on an empty set.
  def column_values attribute
    self.map { |row| row[attribute] }.uniq
  end

  # Gets a subset with the given {attribute => value} conditions. Keys must
  # be of the same type as in the dataset (be careful with symbols).
  # An empty conditions hash returns a copy of the whole dataset.
  def subset conditions
    rows = self.select do |row|
      conditions.all? { |var, val| row[var] == val }
    end
    Dataset.new rows
  end

  # Shannon entropy of the class variable's value distribution.
  def entropy class_var
    probabilities = self.column_values(class_var).map do |class_val|
      self.probability(class_var, class_val)
    end
    Shannon.entropy(*probabilities)
  end

  # Evaluates the probability that var be val in this dataset.
  # Returns 0 on an empty dataset. Can also be used for subsets.
  def probability var, val
    return 0 if self.count.zero?
    self.count { |r| r[var] == val }.fdiv(self.count)
  end

  # Raises unless the dataset is non-empty and every row has exactly the
  # same keys as the first row. Returns nil on success.
  def validate
    raise 'Dataset is empty' if self.empty?
    self.reduce(self[0].keys) do |memo, row|
      raise 'Dataset is inconsistent' unless memo == row.keys
      memo
    end
    nil
  end
end
|
data/lib/baobab/node.rb
ADDED
@@ -0,0 +1,189 @@
|
|
1
|
+
|
2
|
+
# A single node of an ID3 decision tree. Non-leaf nodes hold the attribute
# chosen for the next split (next_attribute); leaf nodes carry the class
# variable as their variable and the predicted class value as their value.
class DecisionTreeNode

  # A list of child nodes, one per value of next_attribute.
  attr_accessor :children

  # The variable on which this operates, a string (nil for the root).
  attr_reader :variable

  # The value for the variable, a string (nil for the root).
  attr_reader :value

  # The decision tree this node belongs to.
  attr_reader :tree

  # Entropy of the class variable in this node's subset.
  attr_reader :entropy

  # The attribute chosen to split on next (set by build_subtree/try_finish).
  attr_accessor :next_attribute

  # Accumulates the conditions down the tree (hash of attribute => value).
  attr_reader :conditions

  # The conditional probability that variable=value.
  # NOTE(review): @probability is never assigned anywhere in this class, so
  # this reader always returns nil — confirm whether it is still needed.
  attr_reader :probability

  def initialize tree, parent, variable, value, entropy, conditions=nil
    @tree = tree
    @parent = parent
    @variable = variable
    @value = value
    @entropy = entropy
    @conditions = conditions ? conditions : {}
    @children = Array.new
    # Memoisation slot for #subset; cleared by #clear.
    @subset = nil
  end

  # "variable => value (entropy)", or "ROOT (entropy)" for the root node.
  def to_s
    label = @variable ? "#{@variable} => #{@value}" : "ROOT"
    "#{label} (#{@entropy.round(3)})"
  end

  # Lazily computes and memoises the subset of the tree's dataset that
  # matches this node's accumulated conditions.
  def subset
    @subset ||= @tree.dataset.subset(self.full_conditions)
  end

  # Drops the memoised subset so it is recomputed on next access.
  def clear
    @subset = nil
  end

  # Attribute names not yet used on the path down to this node, excluding
  # the class variable itself.
  def pending_attrs
    @tree.dataset.column_names.reject do |name|
      name == @tree.class_var || @conditions.include?(name)
    end
  end

  # The accumulated parent conditions plus this node's own
  # attribute => value pair (the root has no variable and adds nothing).
  def full_conditions
    @variable ? conditions.merge(@variable => @value) : conditions
  end

  # Expands this node: picks the pending attribute with the highest
  # information gain, creates one child per value of that attribute and,
  # when recursive, keeps building the subtree downwards.
  def build_subtree recursive=true
    return if self.try_finish
    entropies = self.entropies
    # Information gain of splitting on each pending attribute.
    inf_gain = entropies.each.with_object({}) do |(attr, ent), gains|
      gains[attr] = @entropy - ent
    end
    max_attr, _max_gain = inf_gain.max_by { |_attr, gain| gain }
    self.next_attribute = max_attr
    # Note: the original passed these as "tree=@tree, parent=self, ..."
    # which are plain local assignments, not keyword arguments; the call
    # has always been positional, so make that explicit.
    @children += self.tree.dataset.column_values(max_attr).map do |val|
      DecisionTreeNode.new(@tree, self, max_attr, val,
                           entropies[max_attr], self.full_conditions)
    end
    @children.each(&:build_subtree) if recursive
  end

  # Checks whether any of three stopping conditions holds:
  # - All elements in the subset belong to the same class value: a leaf with
  #   this value is created.
  # - The subset is empty: a leaf with the parent node's most common class
  #   value is created.
  # - There are no more attributes to select: a leaf with the most common
  #   class value is created.
  # When one applies, appends the leaf and returns truthy; otherwise false.
  def try_finish
    val = (
      self.try_finish_single_value_class or
      self.try_finish_empty_subset or
      self.try_finish_no_more_attributes
    )
    if val
      @next_attribute = @tree.class_var
      self.children << DecisionTreeNode.new(
        @tree, self, @tree.class_var, val, 0, self.full_conditions
      )
    else
      false
    end
  end

  # If all class values in this node's subset are the same, returns that
  # value; else nil.
  # Fixed off-by-one: the original sliced 1...-1, which excluded the LAST
  # row, so a subset whose final row disagreed with the rest was still
  # reported as single-valued.
  def try_finish_single_value_class
    return nil if self.subset.empty?
    v0 = self.subset[0][@tree.class_var]
    self.subset.slice(1..-1).reduce(v0) do |memo, row|
      # `return` inside the block exits the whole method with nil.
      return nil if memo != row[@tree.class_var]
      memo
    end
  end

  # If there are no more attributes, returns the most common class value;
  # else, nil.
  def try_finish_no_more_attributes
    self.pending_attrs.empty? ? self.most_common_value : nil
  end

  # If the subset is empty, returns the most common value in the parent
  # node's subset; else, nil.
  def try_finish_empty_subset
    self.subset.empty? ? @parent.most_common_value : nil
  end

  # Returns the most common class value in this node's subset. Counters are
  # initialised for every class value of the WHOLE dataset, so values absent
  # from the subset count as zero.
  def most_common_value
    class_var = @tree.class_var
    counts = @tree.dataset.column_values(class_var).each.with_object({}) do |val, o|
      o[val] = 0
    end
    self.subset.each do |row|
      counts[row[class_var]] += 1
    end
    counts.max_by { |_val, count| count }[0]
  end

  # Returns a hash of {attribute => conditional entropy} given that we
  # divide this node's subset on each pending attribute: the entropy of each
  # value-subset weighted by the probability of that value.
  def entropies
    self.pending_attrs.each.with_object({}) do |attr, result|
      values = @tree.dataset.column_values(attr)
      probabilities = values.each.with_object({}) do |v, probs|
        probs[v] = subset.probability(attr, v)
      end
      value_entropies = values.each.with_object({}) do |v, ents|
        ents[v] = subset.subset(attr => v).entropy(self.tree.class_var)
      end
      result[attr] = values.reduce(0) do |memo, v|
        memo + value_entropies[v] * probabilities[v]
      end
    end
  end
end
|
data/lib/baobab/shannon.rb
ADDED
@@ -0,0 +1,13 @@
|
|
1
|
+
|
2
|
+
# Utility functions for computing Shannon entropy.
module Shannon

  # Computes the Shannon entropy H = sum(-p * log2(p)) over the given
  # probabilities. Zero probabilities contribute nothing (the limit of
  # p*log2(p) as p -> 0 is 0), so they are skipped rather than fed to log2.
  def self.entropy *probabilities
    probabilities.reduce(0) do |memo, p|
      # Bug fix: the original returned literal 0 for a zero probability,
      # which RESET the accumulator and discarded every term summed so far.
      # Skipping the term means yielding memo unchanged.
      if p.zero? then memo else memo + self.entropy_term(p) end
    end
  end

  # A single term -p * log2(p) of the entropy sum.
  # Assumes 0 < probability <= 1; log2 of 0 would be -Infinity.
  def self.entropy_term probability
    - probability * Math::log2(probability)
  end
end
|
data/lib/baobab/tree.rb
ADDED
@@ -0,0 +1,64 @@
|
|
1
|
+
require 'json'
|
2
|
+
require 'set'
|
3
|
+
|
4
|
+
# An ID3 decision tree built eagerly from a Dataset for one class variable.
class DecisionTree

  # The first decision tree node.
  attr_reader :root

  # The class variable of interest in this decision tree.
  attr_reader :class_var

  # The underlying dataset.
  attr_reader :dataset

  # Computes the dataset's entropy for class_var, creates the root node and
  # recursively expands it.
  def initialize dataset, class_var
    @dataset = dataset
    @class_var = class_var
    entropy = dataset.entropy class_var
    # The original wrote "parent=nil, attribute=nil, value=nil" here; those
    # are local assignments, not keyword arguments — the call is positional.
    @root = DecisionTreeNode.new(self, nil, nil, nil, entropy)
    @root.build_subtree
  end

  # Prints the decision tree depth-first with the respective entropy values,
  # indented two spaces per level.
  def to_s
    s = ""
    stack = [[0, @root]]
    while stack.any?
      level, node = stack.pop
      node.children.each do |child|
        stack << [level + 1, child]
      end
      s << "#{' ' * (level * 2)}#{node}\n"
    end
    s
  end

  # Receives a hash of attributes and their values (all attributes must be
  # defined; order does not matter). Returns the predicted class value.
  # Raises RuntimeError when the query keys don't match the dataset's
  # attributes, when no branch matches a value, or when the tree is
  # malformed (previously those last two cases looped forever or crashed
  # with NoMethodError).
  def query values
    # Compare as sets so callers may pass the attributes in any order;
    # the original array comparison rejected valid but reordered queries.
    unless Set.new(values.keys) == Set.new(@dataset.attribute_names(@class_var))
      raise "Query does not fit all variables"
    end
    node = @root
    while node.variable != @class_var
      attr = node.next_attribute
      # A non-leaf node without a split attribute would loop forever.
      raise "Malformed tree: undecided node #{node}" unless attr
      if node.children.count > 1
        val = values[attr]
        child = node.children.find { |c| c.value == val }
        raise "No branch for #{attr} => #{val}" if child.nil?
        node = child
      else
        node = node.children[0]
      end
    end
    node.value
  end

end
|
metadata
ADDED
@@ -0,0 +1,59 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: baobab
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.1.0
|
5
|
+
prerelease:
|
6
|
+
platform: ruby
|
7
|
+
authors:
|
8
|
+
- Johnny E. Lee Othon
|
9
|
+
autorequire:
|
10
|
+
bindir: bin
|
11
|
+
cert_chain: []
|
12
|
+
date: 2015-03-22 00:00:00.000000000 Z
|
13
|
+
dependencies: []
|
14
|
+
description: ! "# baobab\n\nAn implementation of the ID3 (Iterative Dichotomiser 3)
|
15
|
+
in Ruby\n\n## How to run the tests\n\n```\nrake tests\n```\n\n## Coming soon\n\nI
|
16
|
+
promise I'll make this an installable gem. One of these days.\n\n## Sources of the
|
17
|
+
datasets\n\nThe weather dataset has been adapted from the `weather.nominal.arff`
|
18
|
+
that comes shipped with [Weka](http://www.cs.waikato.ac.nz/ml/weka/).\n\nThe transportation
|
19
|
+
dataset was taken from the example data in [https://www.youtube.com/watch?v=wL9aogTuZw8](https://www.youtube.com/watch?v=wL9aogTuZw8).\n\nThe
|
20
|
+
breast cancer dataset is adapted from the `breast-cancer.arff` file that comes shipped
|
21
|
+
with Weka. It should be attributed to:\n\nMatjaz Zwitter & Milan Soklic (physicians)\nInstitute
|
22
|
+
of Oncology \nUniversity Medical Center\nLjubljana, Yugoslavia\nDonors: Ming Tan
|
23
|
+
and Jeff Schlimmer (Jeffrey.Schlimmer@a.gp.cs.cmu.edu)\nDate: 11 July 1988\n"
|
24
|
+
email: jleeothon@gmail.com
|
25
|
+
executables: []
|
26
|
+
extensions: []
|
27
|
+
extra_rdoc_files: []
|
28
|
+
files:
|
29
|
+
- lib/baobab.rb
|
30
|
+
- lib/baobab/dataset.rb
|
31
|
+
- lib/baobab/node.rb
|
32
|
+
- lib/baobab/shannon.rb
|
33
|
+
- lib/baobab/tree.rb
|
34
|
+
homepage: https://github.com/jleeothon/baobab
|
35
|
+
licenses:
|
36
|
+
- MIT
|
37
|
+
post_install_message:
|
38
|
+
rdoc_options: []
|
39
|
+
require_paths:
|
40
|
+
- lib
|
41
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
42
|
+
none: false
|
43
|
+
requirements:
|
44
|
+
- - ! '>='
|
45
|
+
- !ruby/object:Gem::Version
|
46
|
+
version: '0'
|
47
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
48
|
+
none: false
|
49
|
+
requirements:
|
50
|
+
- - ! '>='
|
51
|
+
- !ruby/object:Gem::Version
|
52
|
+
version: '0'
|
53
|
+
requirements: []
|
54
|
+
rubyforge_project:
|
55
|
+
rubygems_version: 1.8.23
|
56
|
+
signing_key:
|
57
|
+
specification_version: 3
|
58
|
+
summary: ID3 decision trees for machine learning in Ruby
|
59
|
+
test_files: []
|