baobab 0.1.0

lib/baobab.rb ADDED
@@ -0,0 +1,4 @@
+ require 'baobab/dataset'
+ require 'baobab/node'
+ require 'baobab/shannon'
+ require 'baobab/tree'
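`lib/baobab.rb` is the entry point: it only loads the other four files. A minimal sketch of loading the library from a checkout of the repository, since the gem is not yet published; the relative `lib` path assumes the script sits at the repository root:

```ruby
# Sketch: put the checkout's lib/ directory on the load path, then require
# the entry point, which pulls in Dataset, DecisionTreeNode, Shannon and
# DecisionTree (path is an assumption, adjust to your layout).
$LOAD_PATH.unshift File.expand_path('../lib', __FILE__)
require 'baobab'
```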
lib/baobab/dataset.rb ADDED
@@ -0,0 +1,79 @@
+ require 'json'
+
+ # Represents a dataset or a subset thereof.
+ # It is an array of hashes where all hashes contain the same keys.
+ class Dataset < Array
+
+   # Receives an array of hashes. All hashes must contain the same keys.
+   def initialize data
+     data.each do |row|
+       self << row
+     end
+   end
+
+   # Builds a Dataset from a JSON file containing an array of objects.
+   def self.from_json filename
+     text = File.read(filename)
+     self.new JSON.parse(text)
+   end
+
+   # Returns the column names except the class variable.
+   def attribute_names class_var
+     self.column_names.reject{|name| name == class_var}
+   end
+
+   # Returns an array of the attribute names in the dataset.
+   # Careful: it is empty on an empty set.
+   def column_names
+     self.empty? ? [] : self[0].keys
+   end
+
+   # Returns an array of the distinct values of an attribute in the dataset.
+   # Careful: it is empty on an empty set.
+   def column_values attribute
+     self.map{|row| row[attribute]}.uniq
+   end
+
+   # Returns a subset matching the given conditions. Keys must be of the
+   # same type as in the dataset (be careful with symbols).
+   def subset conditions
+     rows = self.select do |row|
+       conditions.reduce(true) do |memo, (var, val)|
+         memo and row[var] == val
+       end
+     end
+     Dataset.new rows
+   end
+
+   # Returns the Shannon entropy of class_var over this dataset.
+   def entropy class_var
+     class_vals = self.column_values(class_var)
+     probabilities = class_vals.map do |class_val|
+       self.probability(class_var, class_val)
+     end
+     Shannon::entropy(*probabilities)
+   end
+
+   # Evaluates the probability that var be val in this dataset.
+   # Can also be used for subsets.
+   def probability var, val
+     if self.count.zero?
+       0
+     else
+       self.count{|r| r[var] == val}.fdiv(self.count)
+     end
+   end
+
+   # Raises if the dataset is empty or if its rows do not share the same keys.
+   def validate
+     raise 'Dataset is empty' if self.empty?
+     self.reduce(self[0].keys) do |memo, row|
+       if memo == row.keys then
+         memo
+       else
+         raise 'Dataset is inconsistent'
+       end
+     end
+     return nil
+   end
+ end
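A short sketch of how `Dataset#column_values`, `#probability`, `#entropy` and `#subset` compose. The rows, attribute names and values here are made up for illustration, and the library is assumed to be on the load path as in the sketch above:

```ruby
require 'baobab'

# Hypothetical toy data: two attributes plus the class variable 'play'.
rows = [
  {'windy' => 'yes', 'outlook' => 'rain',  'play' => 'no'},
  {'windy' => 'no',  'outlook' => 'sunny', 'play' => 'yes'},
  {'windy' => 'yes', 'outlook' => 'sunny', 'play' => 'yes'},
  {'windy' => 'no',  'outlook' => 'rain',  'play' => 'no'}
]
data = Dataset.new(rows)
data.validate                             # raises if the rows are inconsistent

data.column_values('outlook')             # => ["rain", "sunny"]
data.probability('play', 'yes')           # => 0.5
data.entropy('play')                      # => 1.0 (an even 2/4 class split)
data.subset('outlook' => 'sunny').count   # => 2
```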
lib/baobab/node.rb ADDED
@@ -0,0 +1,192 @@
+
+ # A node of the decision tree. The root represents no decision; every other
+ # node represents the condition variable == value.
+ class DecisionTreeNode
+
+   # A list of child nodes
+   attr_accessor :children
+
+   # The variable on which this node decides, a string
+   attr_reader :variable
+
+   # The value for the variable, a string
+   attr_reader :value
+
+   # The decision tree this node belongs to
+   attr_reader :tree
+
+   attr_reader :entropy
+
+   attr_accessor :next_attribute
+
+   # Accumulates the conditions down the tree
+   attr_reader :conditions
+
+   # The conditional probability that variable == value
+   attr_reader :probability
+
+   def initialize tree, parent, variable, value, entropy, conditions=nil
+     @tree = tree
+     @parent = parent
+     @variable = variable
+     @value = value
+     @entropy = entropy
+     @conditions = conditions ? conditions : {}
+     @children = Array.new
+     @subset = nil
+   end
+
+   def to_s
+     s = @variable ? "#{@variable} => #{@value}" : "ROOT"
+     s += " (#{@entropy.round(3)})"
+   end
+
+   # The subset of the dataset matching this node's full conditions (memoized).
+   def subset
+     unless @subset.nil?
+       @subset
+     else
+       @subset = @tree.dataset.subset(self.full_conditions)
+     end
+   end
+
+   def clear
+     @subset = nil
+   end
+
+   # The attributes that have not been decided on yet along this branch.
+   def pending_attrs
+     @tree.dataset.column_names.reject do |name|
+       @tree.class_var == name or @conditions.include? name
+     end
+   end
+
+   # Returns the accumulated conditions of the ancestor nodes plus this
+   # node's own attribute and value.
+   def full_conditions
+     if @variable
+       conditions.merge({@variable => @value})
+     else
+       conditions
+     end
+   end
+
+   # Chooses the attribute with the highest information gain, creates one
+   # child per value of that attribute and, if recursive, builds their
+   # subtrees in turn.
+   def build_subtree recursive=true
+     if self.try_finish
+       return
+     end
+     entropies = self.entropies
+     inf_gain = entropies.each.with_object({}) do |(a, e), o|
+       o[a] = @entropy - e
+     end
+     max_attr, _max_gain = inf_gain.max_by{|attr, gain| gain}
+     self.next_attribute = max_attr
+     @children += self.tree.dataset.column_values(max_attr).map do |value|
+       conditions = self.full_conditions
+       DecisionTreeNode.new(
+         tree=@tree, parent=self,
+         attribute=max_attr, value=value,
+         entropy=entropies[max_attr], conditions=conditions
+       )
+     end
+     if recursive
+       @children.each do |c|
+         c.build_subtree
+       end
+     end
+   end
+
+   # Checks whether any of three conditions is true:
+   # - All elements in the subset belong to the same class value: a leaf with
+   #   this value is created.
+   # - There are no more attributes to be selected: a leaf with the most
+   #   common class value is created.
+   # - There are no more rows in the dataset: a leaf with the most common
+   #   class value of the parent node is created.
+   def try_finish
+     val = (
+       self.try_finish_single_value_class or
+       self.try_finish_empty_subset or
+       self.try_finish_no_more_attributes
+     )
+     if val
+       @next_attribute = @tree.class_var
+       self.children << DecisionTreeNode.new(
+         tree=@tree, parent=self,
+         variable=@tree.class_var, value=val,
+         entropy=0, conditions=self.full_conditions
+       )
+     else
+       false
+     end
+   end
+
+   # If all class values are the same, returns that value; else, nil.
+   def try_finish_single_value_class
+     if self.subset.any?
+       v0 = self.subset[0][@tree.class_var]
+       self.subset.drop(1).reduce(v0) do |memo, row|
+         if memo != row[@tree.class_var]
+           return nil
+         else
+           memo
+         end
+       end
+     else
+       nil
+     end
+   end
+
+   # If there are no more attributes, returns the most common class value;
+   # else, nil.
+   def try_finish_no_more_attributes
+     if self.pending_attrs.empty? then
+       self.most_common_value
+     else
+       nil
+     end
+   end
+
+   # If the subset is empty, returns the most common value in the parent
+   # node's subset.
+   def try_finish_empty_subset
+     if self.subset.empty?
+       @parent.most_common_value
+     else
+       nil
+     end
+   end
+
+   # Returns the most common class value in this node's subset.
+   def most_common_value
+     class_var = @tree.class_var
+     class_values = @tree.dataset.column_values(class_var)
+     count = class_values.each.with_object({}) do |val, o|
+       o[val] = 0
+     end
+     self.subset.each.with_object(count) do |row, o|
+       o[row[class_var]] += 1
+     end
+     count.max_by{|value, c| c}[0]
+   end
+
+   # Returns a hash of {attribute => entropy} where entropy is the expected
+   # entropy of the class variable after dividing the subset on attribute.
+   def entropies
+     self.pending_attrs.each.with_object({}) do |a, o|
+       values = @tree.dataset.column_values(a)
+       val_probabilities = values.each.with_object({}) do |v, h|
+         h[v] = subset.probability a, v
+       end
+       val_entropies = values.each.with_object({}) do |v, h|
+         h[v] = subset.subset({a => v}).entropy(self.tree.class_var)
+       end
+       o[a] = values.reduce(0) do |memo, v|
+         memo + val_entropies[v] * val_probabilities[v]
+       end
+     end
+   end
+ end
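`build_subtree` splits on the attribute with the highest information gain: the node's entropy minus the expected entropy after the split, which is what `entropies` returns per attribute. A worked sketch of that arithmetic using the `Shannon` module directly; the counts are the classic textbook weather figures, used purely for illustration and not read from the gem's bundled dataset:

```ruby
require 'baobab'

# Parent node: 14 rows with a 9 "yes" / 5 "no" class split.
parent = Shannon.entropy(9.0/14, 5.0/14)            # ~ 0.940

# Splitting on outlook: sunny has 5 rows (2 yes / 3 no), overcast has 4
# rows (all yes), rainy has 5 rows (3 yes / 2 no).
h_sunny    = Shannon.entropy(2.0/5, 3.0/5)          # ~ 0.971
h_overcast = Shannon.entropy(4.0/4)                 # 0 (a pure branch)
h_rainy    = Shannon.entropy(3.0/5, 2.0/5)          # ~ 0.971

# This is what DecisionTreeNode#entropies stores per attribute: the branch
# entropies weighted by how likely each branch is.
expected = (5.0/14) * h_sunny + (4.0/14) * h_overcast + (5.0/14) * h_rainy

gain = parent - expected                            # ~ 0.247
```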
lib/baobab/shannon.rb ADDED
@@ -0,0 +1,15 @@
+
+ # Computes Shannon entropy from a list of probabilities.
+ module Shannon
+
+   # H = -sum(p * log2(p)); zero probabilities contribute nothing.
+   def self.entropy *probabilities
+     probabilities.reduce(0) do |memo, p|
+       if p.zero? then memo else memo + self.entropy_term(p) end
+     end
+   end
+
+   def self.entropy_term probability
+     - probability * Math::log2(probability)
+   end
+ end
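A few sanity checks for the module, assuming the library is on the load path; the probabilities are arbitrary examples:

```ruby
require 'baobab'

Shannon.entropy(0.5, 0.5)        # => 1.0  (a fair coin carries one bit)
Shannon.entropy(0.5, 0.5, 0.0)   # => 1.0  (zero probabilities add nothing)
Shannon.entropy(0.25, 0.75)      # ~ 0.811 (a biased coin carries less)
```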
lib/baobab/tree.rb ADDED
@@ -0,0 +1,65 @@
+ require 'json'
+ require 'set'
+
+ class DecisionTree
+
+   # The first decision tree node
+   attr_reader :root
+
+   # The class variable of interest in this decision tree.
+   attr_reader :class_var
+
+   # The underlying dataset
+   attr_reader :dataset
+
+   # Builds the whole tree for the given dataset and class variable.
+   def initialize dataset, class_var
+     @dataset = dataset
+     @class_var = class_var
+     entropy = dataset.entropy class_var
+     @root = DecisionTreeNode.new(
+       self, parent=nil,
+       attribute=nil, value=nil,
+       entropy
+     )
+     @root.build_subtree
+   end
+
+   # Renders the tree depth-first with the respective entropy values.
+   def to_s
+     s = ""
+     nodes = [[0, @root]]
+     while nodes.any?
+       l, n = nodes.last
+       nodes = nodes.slice(0...-1)
+       n.children.each do |c|
+         nodes << [l + 1, c]
+       end
+       s = s + "#{' ' * (l * 2)}#{n.to_s}\n"
+     end
+     return s
+   end
+
+   # Receives a hash of attributes and their values (all attributes must be
+   # defined). Returns the predicted class value.
+   def query values
+     if Set.new(values.keys) != Set.new(@dataset.attribute_names(@class_var))
+       raise "Query does not fit all variables"
+     end
+     node = @root
+     while node.variable != @class_var
+       if node.next_attribute
+         if node.children.count > 1
+           val = values[node.next_attribute]
+           node = node.children.select do |child|
+             child.value == val
+           end[0]
+         else
+           node = node.children[0]
+         end
+       end
+     end
+     node.value
+   end
+
+ end
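Putting the pieces together: a sketch that builds a tree on a made-up dataset and queries it. The attribute names, rows and the expected prediction are illustrative only, not taken from the bundled datasets:

```ruby
require 'baobab'

# Hypothetical toy dataset; 'windy' perfectly separates the class 'play'.
data = Dataset.new([
  {'outlook' => 'sunny', 'windy' => 'no',  'play' => 'yes'},
  {'outlook' => 'sunny', 'windy' => 'yes', 'play' => 'no'},
  {'outlook' => 'rain',  'windy' => 'no',  'play' => 'yes'},
  {'outlook' => 'rain',  'windy' => 'yes', 'play' => 'no'}
])

tree = DecisionTree.new(data, 'play')
puts tree    # depth-first dump of the nodes with their entropies

# Every attribute must be given a value in a query.
tree.query('outlook' => 'rain', 'windy' => 'no')    # => "yes"
```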
metadata ADDED
@@ -0,0 +1,59 @@
+ --- !ruby/object:Gem::Specification
+ name: baobab
+ version: !ruby/object:Gem::Version
+   version: 0.1.0
+   prerelease:
+ platform: ruby
+ authors:
+ - Johnny E. Lee Othon
+ autorequire:
+ bindir: bin
+ cert_chain: []
+ date: 2015-03-22 00:00:00.000000000 Z
+ dependencies: []
+ description: ! "# baobab\n\nAn implementation of the ID3 (Iterative Dichotomiser 3)
+   in Ruby\n\n## How to run the tests\n\n```\nrake tests\n```\n\n## Coming soon\n\nI
+   promise I'll make this an installable gem. One of these days.\n\n## Sources of the
+   datasets\n\nThe weather dataset has been adapted from the `weather.nominal.arff`
+   that comes shipped with [Weka](http://www.cs.waikato.ac.nz/ml/weka/).\n\nThe transportation
+   dataset was taken from the example data in [https://www.youtube.com/watch?v=wL9aogTuZw8](https://www.youtube.com/watch?v=wL9aogTuZw8).\n\nThe
+   breast cancer dataset is adapted from the `breast-cancer.arff` file that comes shipped
+   with Weka. It should be attributed to:\n\nMatjaz Zwitter & Milan Soklic (physicians)\nInstitute
+   of Oncology \nUniversity Medical Center\nLjubljana, Yugoslavia\nDonors: Ming Tan
+   and Jeff Schlimmer (Jeffrey.Schlimmer@a.gp.cs.cmu.edu)\nDate: 11 July 1988\n"
+ email: jleeothon@gmail.com
+ executables: []
+ extensions: []
+ extra_rdoc_files: []
+ files:
+ - lib/baobab.rb
+ - lib/baobab/dataset.rb
+ - lib/baobab/node.rb
+ - lib/baobab/shannon.rb
+ - lib/baobab/tree.rb
+ homepage: https://github.com/jleeothon/baobab
+ licenses:
+ - MIT
+ post_install_message:
+ rdoc_options: []
+ require_paths:
+ - lib
+ required_ruby_version: !ruby/object:Gem::Requirement
+   none: false
+   requirements:
+   - - ! '>='
+     - !ruby/object:Gem::Version
+       version: '0'
+ required_rubygems_version: !ruby/object:Gem::Requirement
+   none: false
+   requirements:
+   - - ! '>='
+     - !ruby/object:Gem::Version
+       version: '0'
+ requirements: []
+ rubyforge_project:
+ rubygems_version: 1.8.23
+ signing_key:
+ specification_version: 3
+ summary: ID3 decision trees for machine learning in Ruby
+ test_files: []