baobab 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,4 @@
1
+ require 'baobab/dataset'
2
+ require 'baobab/node'
3
+ require 'baobab/shannon'
4
+ require 'baobab/tree'
# Represents a dataset or subset thereof.
# Is an array of hashes where all hashes contain the same keys.
class Dataset < Array

  # Receives an array of hashes. All hashes must contain the same keys.
  def initialize data
    data.each do |row|
      self << row
    end
  end

  # Builds a Dataset from a JSON file containing an array of objects.
  # Raises whatever File.read / JSON.parse raise on bad input.
  def self.from_json filename
    text = File.read(filename)
    self.new JSON.parse(text)
  end

  # Returns the column names minus the class variable.
  def attribute_names class_var
    self.column_names.reject{|name| name == class_var}
  end

  # Returns an array of the attribute names in the dataset.
  # Returns [] on an empty set (previously raised NoMethodError on nil).
  def column_names
    self.empty? ? [] : self[0].keys
  end

  # Returns an array of the unique values of an attribute in the dataset.
  # Careful: it's empty on an empty set.
  def column_values attribute
    self.map{|row| row[attribute]}.uniq
  end

  # Gets a subset with given conditions ({var => val, ...}). Keys must be of
  # the same type as in the dataset (be careful with symbols).
  def subset conditions
    rows = self.select do |row|
      conditions.all?{|var, val| row[var] == val}
    end
    Dataset.new rows
  end

  # Shannon entropy of the class variable's distribution in this dataset.
  def entropy class_var
    class_vals = self.column_values(class_var)
    probabilities = class_vals.map do |class_val|
      self.probability(class_var, class_val)
    end
    Shannon::entropy *probabilities
  end

  # Evaluates the probability that var be val in this dataset.
  # Can also be used for subsets. Returns 0 on an empty set.
  def probability var, val
    return 0 if self.count.zero?
    self.count{|r| r[var] == val}.fdiv(self.count)
  end

  # Raises if the dataset is empty or the rows do not all share the same
  # keys; returns nil otherwise.
  def validate
    raise 'Dataset is empty' if self.empty?
    keys = self[0].keys
    self.each do |row|
      raise 'Dataset is inconsistent' unless row.keys == keys
    end
    nil
  end
end
# A node of an ID3 decision tree. Interior nodes carry the attribute/value
# pair they were split on; leaf nodes carry the class variable and the
# predicted class value.
class DecisionTreeNode

  # A list of child nodes
  attr_accessor :children

  # The variable on which this node operates, a string (nil for the root)
  attr_reader :variable

  # The value for the variable, a string
  attr_reader :value

  # The decision tree this node belongs to
  attr_reader :tree

  # The entropy associated with this node
  attr_reader :entropy

  # The attribute this node's children split on
  attr_accessor :next_attribute

  # Accumulates the conditions down the tree
  attr_reader :conditions

  # The conditional probability that variable=value
  attr_reader :probability

  def initialize tree, parent, variable, value, entropy, conditions=nil
    @tree = tree
    @parent = parent
    @variable = variable
    @value = value
    @entropy = entropy
    @conditions = conditions ? conditions : {}
    @children = Array.new
    @subset = nil
  end

  def to_s
    s = @variable ? "#{@variable} => #{@value}" : "ROOT"
    s += " (#{@entropy.round(3)})"
  end

  # Lazily computes and caches the subset selected by full_conditions.
  def subset
    @subset ||= @tree.dataset.subset(self.full_conditions)
  end

  # Drops the cached subset.
  def clear
    @subset = nil
  end

  # Attributes not yet split on along the path from the root, excluding the
  # class variable itself.
  def pending_attrs
    @tree.dataset.column_names.reject do |name|
      @tree.class_var == name || @conditions.include?(name)
    end
  end

  # The accumulated parent conditions plus this node's own variable => value
  # pair (the root contributes nothing).
  def full_conditions
    @variable ? conditions.merge({@variable => @value}) : conditions
  end

  # Recursively builds the subtree: picks the pending attribute with maximum
  # information gain and creates one child per value of that attribute.
  def build_subtree recursive=true
    return if self.try_finish
    entropies = self.entropies
    # Information gain of splitting on each pending attribute.
    inf_gain = entropies.each.with_object({}) do |(attr, e), gains|
      gains[attr] = @entropy - e
    end
    max_attr, _max_gain = inf_gain.max_by{|_attr, gain| gain}
    self.next_attribute = max_attr
    # NOTE: the constructor takes positional arguments
    # (tree, parent, variable, value, entropy, conditions).
    @children += self.tree.dataset.column_values(max_attr).map do |value|
      DecisionTreeNode.new(
        @tree, self, max_attr, value,
        entropies[max_attr], self.full_conditions
      )
    end
    @children.each(&:build_subtree) if recursive
  end

  # Checks whether any of three is true:
  # - All elements in the subset belong to the same class value: a leaf with
  #   this value is created.
  # - There are no more attributes to be selected: a leaf with the most common
  #   class value is selected
  # - There are no more rows in the dataset: a leaf with the most common class
  #   value in the parent node is created.
  # Returns a truthy value when a leaf was created, false otherwise.
  def try_finish
    val = (
      self.try_finish_single_value_class ||
      self.try_finish_empty_subset ||
      self.try_finish_no_more_attributes
    )
    if val
      @next_attribute = @tree.class_var
      self.children << DecisionTreeNode.new(
        @tree, self, @tree.class_var, val,
        0, self.full_conditions
      )
    else
      false
    end
  end

  # If all class values are the same, returns that value; else, nil.
  def try_finish_single_value_class
    return nil if self.subset.empty?
    first_val = self.subset[0][@tree.class_var]
    # Check every remaining row. The original used slice(1...-1), which
    # skipped the last row and could mislabel e.g. a two-row subset with
    # two different class values as single-valued.
    self.subset.drop(1).each do |row|
      return nil if row[@tree.class_var] != first_val
    end
    first_val
  end

  # If there are no more attributes, returns the most common class value;
  # else, nil.
  def try_finish_no_more_attributes
    self.pending_attrs.empty? ? self.most_common_value : nil
  end

  # If the subset is empty, returns the most common value in the parent
  # node's subset; else, nil.
  def try_finish_empty_subset
    self.subset.empty? ? @parent.most_common_value : nil
  end

  # Returns the most common class value in this node's subset.
  def most_common_value
    class_var = @tree.class_var
    counts = @tree.dataset.column_values(class_var).each.with_object({}) do |val, o|
      o[val] = 0
    end
    self.subset.each do |row|
      counts[row[class_var]] += 1
    end
    counts.max_by{|_val, count| count}[0]
  end

  # Returns a hash of {attribute => expected entropy} given that we divide
  # the subset on that attribute (the probability-weighted sum of the
  # entropies of the resulting sub-subsets).
  def entropies
    self.pending_attrs.each.with_object({}) do |attr, result|
      values = @tree.dataset.column_values(attr)
      probabilities = values.each.with_object({}) do |v, acc|
        acc[v] = subset.probability attr, v
      end
      sub_entropies = values.each.with_object({}) do |v, acc|
        acc[v] = subset.subset({attr => v}).entropy(self.tree.class_var)
      end
      result[attr] = values.reduce(0) do |memo, v|
        memo + sub_entropies[v] * probabilities[v]
      end
    end
  end
end
# Shannon entropy helpers.
module Shannon

  # Computes the Shannon entropy (in bits) of a probability distribution.
  # Zero probabilities contribute nothing (the limit of -p*log2(p) as p->0
  # is 0). The original returned 0 from the reduce step on a zero
  # probability, which wrongly discarded the accumulated sum.
  def self.entropy *probabilities
    probabilities.reduce(0) do |memo, p|
      p.zero? ? memo : memo + self.entropy_term(p)
    end
  end

  # A single -p*log2(p) term of the entropy sum.
  def self.entropy_term probability
    - probability * Math::log2(probability)
  end
end
require 'json'
require 'set'

# An ID3 decision tree built from a Dataset for one class variable.
class DecisionTree

  # The first decision tree node
  attr_reader :root

  # The class variable of interest in this decision tree.
  attr_reader :class_var

  # The underlying dataset
  attr_reader :dataset

  def initialize dataset, class_var
    @dataset = dataset
    @class_var = class_var
    entropy = dataset.entropy class_var
    # Root node: no variable/value pair and no accumulated conditions.
    # DecisionTreeNode takes positional args (tree, parent, variable, value,
    # entropy, conditions).
    @root = DecisionTreeNode.new(self, nil, nil, nil, entropy)
    @root.build_subtree
  end

  # Prints the decision tree depth-first with the respective entropy values,
  # indented two spaces per level.
  def to_s
    s = ""
    stack = [[0, @root]]
    while stack.any?
      level, node = stack.pop
      node.children.each do |child|
        stack << [level + 1, child]
      end
      s << "#{'  ' * level}#{node}\n"
    end
    s
  end

  # Receives a hash of attributes and their values (they must all be
  # defined). Returns the predicted class value.
  def query values
    # Compare the names as unordered sets: the original compared key arrays
    # with !=, which rejected valid queries given in a different key order.
    if Set.new(values.keys) != Set.new(@dataset.attribute_names(@class_var))
      raise "Query does not fit all variables"
    end
    node = @root
    # Walk down until we reach a leaf (a node labelled with the class var).
    # NOTE(review): assumes every interior node has next_attribute set and a
    # matching child for the queried value — confirm against build_subtree.
    while node.variable != @class_var
      if node.next_attribute
        if node.children.count > 1
          val = values[node.next_attribute]
          node = node.children.find{|child| child.value == val}
        else
          node = node.children[0]
        end
      end
    end
    node.value
  end

end
metadata ADDED
@@ -0,0 +1,59 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: baobab
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - Johnny E. Lee Othon
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2015-03-22 00:00:00.000000000 Z
13
+ dependencies: []
14
+ description: ! "# baobab\n\nAn implementation of the ID3 (Iterative Dichotomiser 3)
15
+ in Ruby\n\n## How to run the tests\n\n```\nrake tests\n```\n\n## Coming soon\n\nI
16
+ promise I'll make this an installable gem. One of these days.\n\n## Sources of the
17
+ datasets\n\nThe weather dataset has been adapted from the `weather.nominal.arff`
18
+ that comes shipped with [Weka](http://www.cs.waikato.ac.nz/ml/weka/).\n\nThe transportation
19
+ dataset was taken from the example data in [https://www.youtube.com/watch?v=wL9aogTuZw8](https://www.youtube.com/watch?v=wL9aogTuZw8).\n\nThe
20
+ breast cancer dataset is adapted from the `breast-cancer.arff` file that comes shipped
21
+ with Weka. It should be attributed to:\n\nMatjaz Zwitter & Milan Soklic (physicians)\nInstitute
22
+ of Oncology \nUniversity Medical Center\nLjubljana, Yugoslavia\nDonors: Ming Tan
23
+ and Jeff Schlimmer (Jeffrey.Schlimmer@a.gp.cs.cmu.edu)\nDate: 11 July 1988\n"
24
+ email: jleeothon@gmail.com
25
+ executables: []
26
+ extensions: []
27
+ extra_rdoc_files: []
28
+ files:
29
+ - lib/baobab.rb
30
+ - lib/baobab/dataset.rb
31
+ - lib/baobab/node.rb
32
+ - lib/baobab/shannon.rb
33
+ - lib/baobab/tree.rb
34
+ homepage: https://github.com/jleeothon/baobab
35
+ licenses:
36
+ - MIT
37
+ post_install_message:
38
+ rdoc_options: []
39
+ require_paths:
40
+ - lib
41
+ required_ruby_version: !ruby/object:Gem::Requirement
42
+ none: false
43
+ requirements:
44
+ - - ! '>='
45
+ - !ruby/object:Gem::Version
46
+ version: '0'
47
+ required_rubygems_version: !ruby/object:Gem::Requirement
48
+ none: false
49
+ requirements:
50
+ - - ! '>='
51
+ - !ruby/object:Gem::Version
52
+ version: '0'
53
+ requirements: []
54
+ rubyforge_project:
55
+ rubygems_version: 1.8.23
56
+ signing_key:
57
+ specification_version: 3
58
+ summary: ID3 decision trees for machine learning in Ruby
59
+ test_files: []