baobab 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/lib/baobab.rb +4 -0
- data/lib/baobab/dataset.rb +74 -0
- data/lib/baobab/node.rb +189 -0
- data/lib/baobab/shannon.rb +13 -0
- data/lib/baobab/tree.rb +64 -0
- metadata +59 -0
data/lib/baobab.rb
ADDED
@@ -0,0 +1,74 @@
|
|
1
|
+
|
2
|
+
# Represents a dataset or subset thereof.
# Is an array of hashes where all hashes contain the same keys.
class Dataset < Array

  # Receives an array of hashes. All hashes must contain the same keys.
  def initialize data
    data.each { |row| self << row }
  end

  # Builds a Dataset from a JSON file containing an array of objects.
  def self.from_json filename
    text = File.read(filename)
    self.new JSON.parse(text)
  end

  # Returns the column names minus the class variable.
  def attribute_names class_var
    column_names.reject { |name| name == class_var }
  end

  # Returns an array of the column names in the dataset.
  # Returns [] on an empty dataset (previously raised NoMethodError on nil).
  def column_names
    empty? ? [] : self[0].keys
  end

  # Returns the unique values of an attribute in the dataset.
  # Careful: it's empty on an empty set.
  def column_values attribute
    map { |row| row[attribute] }.uniq
  end

  # Gets a subset with given conditions. Keys must be of the same type as
  # in the dataset (be careful with symbols).
  def subset conditions
    rows = select do |row|
      conditions.all? { |var, val| row[var] == val }
    end
    Dataset.new rows
  end

  # Shannon entropy (in bits) of the class variable over this dataset.
  def entropy class_var
    probabilities = column_values(class_var).map do |class_val|
      probability(class_var, class_val)
    end
    Shannon.entropy(*probabilities)
  end

  # Evaluates the probability that var be val in this dataset.
  # Can also be used for subsets. Returns 0 on an empty dataset.
  def probability var, val
    return 0 if count.zero?
    count { |r| r[var] == val }.fdiv(count)
  end

  # Raises unless the dataset is non-empty and every row has the same keys.
  # Returns nil on success.
  def validate
    raise 'Dataset is empty' if empty?
    keys = self[0].keys
    each do |row|
      raise 'Dataset is inconsistent' unless row.keys == keys
    end
    nil
  end
end
|
data/lib/baobab/node.rb
ADDED
@@ -0,0 +1,189 @@
|
|
1
|
+
|
2
|
+
# A node in an ID3 decision tree. Internal nodes split on an attribute;
# leaf nodes carry a value of the tree's class variable.
class DecisionTreeNode

  # A list of child nodes
  attr_accessor :children

  # The variable on which this operates, a string (nil on the root)
  attr_reader :variable

  # The value for the variable, a string
  attr_reader :value

  # The decision tree this node belongs to
  attr_reader :tree

  # Entropy of the class variable over this node's subset
  attr_reader :entropy

  # The attribute the children split on (set by build_subtree / try_finish)
  attr_accessor :next_attribute

  # Accumulates the selection conditions down the tree (ancestors only;
  # this node's own variable => value is added by full_conditions)
  attr_reader :conditions

  # The conditional probability that variable=value
  attr_reader :probability

  def initialize tree, parent, variable, value, entropy, conditions=nil
    @tree = tree
    @parent = parent
    @variable = variable
    @value = value
    @entropy = entropy
    @conditions = conditions ? conditions : {}
    @children = []
    @subset = nil
  end

  def to_s
    s = @variable ? "#{@variable} => #{@value}" : "ROOT"
    s += " (#{@entropy.round(3)})"
  end

  # Memoized subset of the tree's dataset matching full_conditions.
  def subset
    @subset ||= @tree.dataset.subset(self.full_conditions)
  end

  # Drops the memoized subset.
  def clear
    @subset = nil
  end

  # Attribute names not yet used on the path to this node, excluding the
  # class variable.
  def pending_attrs
    @tree.dataset.column_names.reject do |name|
      name == @tree.class_var || @conditions.include?(name)
    end
  end

  # All the parent nodes' conditions plus this node's own attribute and
  # its value.
  def full_conditions
    @variable ? conditions.merge({@variable => @value}) : conditions
  end

  # Expands this node: if no stop condition holds, picks the attribute with
  # the highest information gain and creates one child per attribute value.
  def build_subtree recursive=true
    return if self.try_finish
    ents = self.entropies
    gains = ents.each.with_object({}) do |(attr, ent), acc|
      acc[attr] = @entropy - ent
    end
    best_attr, _gain = gains.max_by { |_attr, gain| gain }
    self.next_attribute = best_attr
    @children += @tree.dataset.column_values(best_attr).map do |val|
      DecisionTreeNode.new(
        @tree, self, best_attr, val, ents[best_attr], self.full_conditions
      )
    end
    @children.each { |c| c.build_subtree } if recursive
  end

  # Checks whether any of three is true:
  # - All elements in the subset belong to the same class value: a leaf with
  #   this value is created.
  # - There are no more rows in the dataset: a leaf with the most common
  #   class value of the parent node is created.
  # - There are no more attributes to be selected: a leaf with the most
  #   common class value is created.
  # Returns a truthy value when a leaf child was added, false otherwise.
  def try_finish
    val = (
      self.try_finish_single_value_class or
      self.try_finish_empty_subset or
      self.try_finish_no_more_attributes
    )
    return false unless val
    @next_attribute = @tree.class_var
    self.children << DecisionTreeNode.new(
      @tree, self, @tree.class_var, val, 0, self.full_conditions
    )
  end

  # If all class values in the subset are the same, returns that value;
  # else, nil. Checks every row — the original `slice(1...-1)` skipped the
  # last row, so a differing final row was silently ignored.
  def try_finish_single_value_class
    return nil if subset.empty?
    first = subset[0][@tree.class_var]
    subset.all? { |row| row[@tree.class_var] == first } ? first : nil
  end

  # If there are no more attributes, returns the most common class value;
  # else, nil.
  def try_finish_no_more_attributes
    pending_attrs.empty? ? most_common_value : nil
  end

  # If the subset is empty, returns the most common class value in the
  # parent node's subset; else, nil.
  def try_finish_empty_subset
    subset.empty? ? @parent.most_common_value : nil
  end

  # Returns the most common class value in this node's subset.
  def most_common_value
    class_var = @tree.class_var
    counts = @tree.dataset.column_values(class_var).each.with_object({}) do |val, acc|
      acc[val] = 0
    end
    subset.each { |row| counts[row[class_var]] += 1 }
    counts.max_by { |_val, c| c }[0]
  end

  # Returns a hash of {attribute => expected entropy} given that we divide
  # the subset on that attribute.
  def entropies
    pending_attrs.each.with_object({}) do |attr, out|
      values = @tree.dataset.column_values(attr)
      probs = values.each.with_object({}) do |v, acc|
        acc[v] = subset.probability(attr, v)
      end
      ents = values.each.with_object({}) do |v, acc|
        acc[v] = subset.subset({attr => v}).entropy(@tree.class_var)
      end
      out[attr] = values.reduce(0) do |memo, v|
        memo + ents[v] * probs[v]
      end
    end
  end
end
|
@@ -0,0 +1,13 @@
|
|
1
|
+
|
2
|
+
# Helpers for computing Shannon entropy.
module Shannon

  # Shannon entropy (in bits) of a discrete distribution given as a list of
  # probabilities. Zero probabilities contribute nothing to the sum.
  # Fixes a bug where a zero probability reset the running sum to 0,
  # discarding every previously accumulated term.
  def self.entropy *probabilities
    probabilities.reduce(0) do |memo, p|
      p.zero? ? memo : memo + entropy_term(p)
    end
  end

  # The single term -p * log2(p) of the entropy sum.
  def self.entropy_term probability
    - probability * Math::log2(probability)
  end
end
|
data/lib/baobab/tree.rb
ADDED
@@ -0,0 +1,64 @@
|
|
1
|
+
require 'json'
|
2
|
+
require 'set'
|
3
|
+
|
4
|
+
# An ID3 decision tree built over a Dataset for a given class variable.
class DecisionTree

  # The first decision tree node
  attr_reader :root

  # The class variable of interest in this decision tree.
  attr_reader :class_var

  # The underlying dataset
  attr_reader :dataset

  def initialize dataset, class_var
    @dataset = dataset
    @class_var = class_var
    entropy = dataset.entropy class_var
    @root = DecisionTreeNode.new(self, nil, nil, nil, entropy)
    @root.build_subtree
  end

  # Prints the decision tree depth-first with the respective entropy values.
  def to_s
    s = ""
    stack = [[0, @root]]
    until stack.empty?
      level, node = stack.pop
      node.children.each { |c| stack << [level + 1, c] }
      s << "#{'  ' * level}#{node}\n"
    end
    s
  end

  # Receives attributes and their values (they must be all defined).
  # Returns the value of the predicted class value.
  # Fix: the attribute check no longer depends on hash key ordering —
  # any order of the (complete) attribute set is accepted.
  def query values
    expected = @dataset.attribute_names(@class_var)
    unless values.keys.sort == expected.sort
      raise "Query does not fit all variables"
    end
    node = @root
    # Walk down the tree until we reach a leaf carrying the class variable.
    # NOTE(review): loops forever if a non-leaf node lacks next_attribute —
    # same as the original; build_subtree always sets it on finished trees.
    while node.variable != @class_var
      if node.next_attribute
        if node.children.count > 1
          val = values[node.next_attribute]
          node = node.children.find { |child| child.value == val }
        else
          node = node.children[0]
        end
      end
    end
    node.value
  end

end
|
metadata
ADDED
@@ -0,0 +1,59 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: baobab
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.1.0
|
5
|
+
prerelease:
|
6
|
+
platform: ruby
|
7
|
+
authors:
|
8
|
+
- Johnny E. Lee Othon
|
9
|
+
autorequire:
|
10
|
+
bindir: bin
|
11
|
+
cert_chain: []
|
12
|
+
date: 2015-03-22 00:00:00.000000000 Z
|
13
|
+
dependencies: []
|
14
|
+
description: ! "# baobab\n\nAn implementation of the ID3 (Iterative Dichotomiser 3)
|
15
|
+
in Ruby\n\n## How to run the tests\n\n```\nrake tests\n```\n\n## Coming soon\n\nI
|
16
|
+
promise I'll make this an installable gem. One of these days.\n\n## Sources of the
|
17
|
+
datasets\n\nThe weather dataset has been adapted from the `weather.nominal.arff`
|
18
|
+
that comes shipped with [Weka](http://www.cs.waikato.ac.nz/ml/weka/).\n\nThe transportation
|
19
|
+
dataset was taken from the example data in [https://www.youtube.com/watch?v=wL9aogTuZw8](https://www.youtube.com/watch?v=wL9aogTuZw8).\n\nThe
|
20
|
+
breast cancer dataset is adapted from the `breast-cancer.arff` file that comes shipped
|
21
|
+
with Weka. It should be attributed to:\n\nMatjaz Zwitter & Milan Soklic (physicians)\nInstitute
|
22
|
+
of Oncology \nUniversity Medical Center\nLjubljana, Yugoslavia\nDonors: Ming Tan
|
23
|
+
and Jeff Schlimmer (Jeffrey.Schlimmer@a.gp.cs.cmu.edu)\nDate: 11 July 1988\n"
|
24
|
+
email: jleeothon@gmail.com
|
25
|
+
executables: []
|
26
|
+
extensions: []
|
27
|
+
extra_rdoc_files: []
|
28
|
+
files:
|
29
|
+
- lib/baobab.rb
|
30
|
+
- lib/baobab/dataset.rb
|
31
|
+
- lib/baobab/node.rb
|
32
|
+
- lib/baobab/shannon.rb
|
33
|
+
- lib/baobab/tree.rb
|
34
|
+
homepage: https://github.com/jleeothon/baobab
|
35
|
+
licenses:
|
36
|
+
- MIT
|
37
|
+
post_install_message:
|
38
|
+
rdoc_options: []
|
39
|
+
require_paths:
|
40
|
+
- lib
|
41
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
42
|
+
none: false
|
43
|
+
requirements:
|
44
|
+
- - ! '>='
|
45
|
+
- !ruby/object:Gem::Version
|
46
|
+
version: '0'
|
47
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
48
|
+
none: false
|
49
|
+
requirements:
|
50
|
+
- - ! '>='
|
51
|
+
- !ruby/object:Gem::Version
|
52
|
+
version: '0'
|
53
|
+
requirements: []
|
54
|
+
rubyforge_project:
|
55
|
+
rubygems_version: 1.8.23
|
56
|
+
signing_key:
|
57
|
+
specification_version: 3
|
58
|
+
summary: ID3 decision trees for machine learning in Ruby
|
59
|
+
test_files: []
|