baobab 0.1.0 → 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: bc9a6568b4998bd54bb07030d90bbebcc6979d72
4
+ data.tar.gz: a392e91b05f89b6dc055249e9b5256e3fc98a93b
5
+ SHA512:
6
+ metadata.gz: cddc12fb7be6d7b5a73f3e659f22df27b52bdd1f75f53e9bddc9da2aa85a0d7499111f4e1b61459d071e489b68017e06306a5287b7aae3a6c84741dafaff8b34
7
+ data.tar.gz: f6d28f686b71791eed288ccdd6c6e9d6cfba3632ddbaabc0bd162fc5672f613d77c05150814c9d5a1b53854c04f7ab1b52ebc484536a111b8b7d7da78af20021
@@ -1,74 +1,77 @@
1
+ module Baobab
1
2
 
2
- # Represents a dataset or subset thereof.
3
- # Is an array of hashes where all hashes contain the same keys.
4
- class Dataset < Array
3
+ # Represents a dataset or subset thereof.
4
+ # Is an array of hashes where all hashes contain the same keys.
5
+ class Dataset < Array
5
6
 
6
- # Receives an array of hashes. All hashes must contain the same keys.
7
- def initialize data
8
- data.each do |row|
9
- self << row
10
- end
11
- end
7
+ # Receives an array of hashes. All hashes must contain the same keys.
8
+ def initialize data
9
+ data.each do |row|
10
+ self << row
11
+ end
12
+ end
12
13
 
13
- def self.from_json filename
14
- text = File.read(filename)
15
- self.new JSON.parse(text)
16
- end
14
+ def self.from_json filename
15
+ text = File.read(filename)
16
+ self.new JSON.parse(text)
17
+ end
17
18
 
18
- def attribute_names class_var
19
- self.column_names.reject{|name| name == class_var}
20
- end
19
+ def attribute_names class_var
20
+ self.column_names.reject{|name| name == class_var}
21
+ end
21
22
 
22
- # Returns an array of the attribute names in the dataset
23
- # Careful: it's empty on an empty set.
24
- def column_names
25
- self[0].keys
26
- end
23
+ # Returns an array of the attribute names in the dataset
24
+ # Careful: it's empty on an empty set.
25
+ def column_names
26
+ self[0].keys
27
+ end
27
28
 
28
- # Returns an array of the values of an attribute in the dataset.
29
- # Careful: it's empty on an empty set.
30
- def column_values attribute
31
- self.map{|row| row[attribute]}.to_a.uniq
32
- end
29
+ # Returns an array of the values of an attribute in the dataset.
30
+ # Careful: it's empty on an empty set.
31
+ def column_values attribute
32
+ self.map{|row| row[attribute]}.to_a.uniq
33
+ end
33
34
 
34
- # Gets a subset with given conditions. Keys must be of the same type as
35
- # in the dataset (be careful with symbols).
36
- def subset conditions
37
- rows = self.select do |row|
38
- conditions.reduce(true) do |memo, (var, val)|
39
- memo and row[var] == val
40
- end
41
- end
42
- Dataset.new rows
43
- end
35
+ # Gets a subset with given conditions. Keys must be of the same type as
36
+ # in the dataset (be careful with symbols).
37
+ def subset conditions
38
+ rows = self.select do |row|
39
+ conditions.reduce(true) do |memo, (var, val)|
40
+ memo and row[var] == val
41
+ end
42
+ end
43
+ Dataset.new rows
44
+ end
44
45
 
45
- def entropy class_var
46
- class_vals = self.column_values(class_var)
47
- probabilities = class_vals.map do |class_val|
48
- self.probability(class_var, class_val)
49
- end
50
- Shannon::entropy *probabilities
51
- end
46
+ def entropy class_var
47
+ class_vals = self.column_values(class_var)
48
+ probabilities = class_vals.map do |class_val|
49
+ self.probability(class_var, class_val)
50
+ end
51
+ Shannon::entropy *probabilities
52
+ end
52
53
 
53
- # Evaluates the probability that var be val in this dataset.
54
- # Can also be used for subsets.
55
- def probability var, val
56
- unless self.count.zero?
57
- self.count{|r| r[var] == val}.fdiv(self.count)
58
- else
59
- 0
60
- end
61
- end
54
+ # Evaluates the probability that var be val in this dataset.
55
+ # Can also be used for subsets.
56
+ def probability var, val
57
+ unless self.count.zero?
58
+ self.count{|r| r[var] == val}.fdiv(self.count)
59
+ else
60
+ 0
61
+ end
62
+ end
63
+
64
+ def validate
65
+ raise 'Dataset is empty' if self.empty?
66
+ self.reduce(self[0].keys) do |memo, row|
67
+ if memo == row.keys then
68
+ memo
69
+ else
70
+ raise 'Dataset is inconsistent'
71
+ end
72
+ end
73
+ return nil
74
+ end
75
+ end
62
76
 
63
- def validate
64
- raise 'Dataset is empty' if self.empty?
65
- self.reduce(self[0].keys) do |memo, row|
66
- if memo == row.keys then
67
- memo
68
- else
69
- raise 'Dataset is inconsistent'
70
- end
71
- end
72
- return nil
73
- end
74
77
  end
@@ -1,189 +1,192 @@
1
+ module Baobab
2
+
3
+ class DecisionTreeNode
4
+
5
+ # A list of nodes
6
+ attr_accessor :children
7
+
8
+ # The variable on which this operates, a string
9
+ attr_reader :variable
10
+
11
+ # The value for the variable, a string
12
+ attr_reader :value
13
+
14
+ # The decision tree
15
+ attr_reader :tree
16
+
17
+ attr_reader :entropy
18
+
19
+ attr_accessor :next_attribute
20
+
21
+ # Accumulates the conditions down the tree
22
+ attr_reader :conditions
23
+
24
+ # The conditional probability that variable=value
25
+ attr_reader :probability
26
+
27
+ def initialize tree, parent, variable, value, entropy, conditions=nil
28
+ @tree = tree
29
+ @parent = parent
30
+ @variable = variable
31
+ @value = value
32
+ @entropy = entropy
33
+ @conditions = conditions ? conditions : {}
34
+ @children = Array.new
35
+ @subset = nil
36
+ end
37
+
38
+ def to_s
39
+ s = @variable ? "#{@variable} => #{@value}" : "ROOT"
40
+ s += " (#{@entropy.round(3)})"
41
+ end
42
+
43
+ def subset
44
+ unless @subset.nil?
45
+ @subset
46
+ else
47
+ @subset = @tree.dataset.subset(self.full_conditions)
48
+ end
49
+ end
50
+
51
+ def clear
52
+ @subset = nil
53
+ end
54
+
55
+ def pending_attrs
56
+ @tree.dataset.column_names.reject do |name|
57
+ @tree.class_var == name or @conditions.include? name
58
+ end
59
+ end
60
+
61
+ # Returns a subset selected on all the parent node's conditions plus this
62
+ # node's attribute and its value.
63
+ def full_conditions
64
+ if @variable
65
+ conditions.merge({@variable => @value})
66
+ else
67
+ conditions
68
+ end
69
+ end
70
+
71
+ def build_subtree recursive=true
72
+ if self.try_finish
73
+ return
74
+ end
75
+ subset = self.subset
76
+ subset_count = self.subset.count
77
+ entropies = self.entropies
78
+ inf_gain = entropies.each.with_object({}) do |(a, e), o|
79
+ o[a] = @entropy - e
80
+ end
81
+ max_attr, max_gain = inf_gain.max_by{|v, g| g}
82
+ self.next_attribute = max_attr
83
+ @children += self.tree.dataset.column_values(max_attr).map do |value|
84
+ conditions = self.full_conditions
85
+ DecisionTreeNode.new(
86
+ tree=@tree, parent=self,
87
+ attribute=max_attr, value=value,
88
+ entropy=entropies[max_attr], conditions=conditions
89
+ )
90
+ end
91
+ if recursive
92
+ @children.each do |c|
93
+ c.build_subtree
94
+ end
95
+ end
96
+ end
97
+
98
+ # Checks whether any of three is true:
99
+ # - All elements in the subset belong to the same class value: a leaf with
100
+ # this value is created.
101
+ # - There are no more attributes to be selected: a leaf with the most common
102
+ # class value is selected
103
+ # - There are no more rows in the dataset: a leaf with the most common class
104
+ # value in the parent node is created.
105
+ def try_finish
106
+ var = self.tree.class_var
107
+ val = (
108
+ self.try_finish_single_value_class or
109
+ self.try_finish_empty_subset or
110
+ self.try_finish_no_more_attributes
111
+ )
112
+ if val
113
+ @next_attribute = @tree.class_var
114
+ self.children << DecisionTreeNode.new(
115
+ tree=@tree, parent=self,
116
+ variable=@tree.class_var, value=val,
117
+ entropy=0, conditions=self.full_conditions
118
+ )
119
+ else
120
+ false
121
+ end
122
+ end
123
+
124
+ # If all class values are the same, returns that value; else, nil.
125
+ def try_finish_single_value_class
126
+ if self.subset.any?
127
+ v0 = self.subset[0][@tree.class_var]
128
+ self.subset.slice(1...-1).reduce(v0) do |memo, row|
129
+ if memo != row[@tree.class_var]
130
+ return nil
131
+ else
132
+ memo
133
+ end
134
+ end
135
+ else
136
+ nil
137
+ end
138
+
139
+ end
140
+
141
+ # If there are not more attributes, returns the most common class value;
142
+ # else, nil.
143
+ def try_finish_no_more_attributes
144
+ if self.pending_attrs.empty? then
145
+ self.most_common_value
146
+ else
147
+ nil
148
+ end
149
+ end
150
+
151
+ # If the subset is empty, returns the most common value in the parent
152
+ # node's subset.
153
+ def try_finish_empty_subset
154
+ if self.subset.empty?
155
+ @parent.most_common_value
156
+ else
157
+ nil
158
+ end
159
+ end
160
+
161
+ # Returns the most common class value in the dataset.
162
+ def most_common_value
163
+ class_var = @tree.class_var
164
+ class_values = @tree.dataset.column_values(class_var)
165
+ count = class_values.each.with_object({}) do |val, o|
166
+ o[val] = 0
167
+ end
168
+ self.subset.each.with_object(count) do |row, o|
169
+ count[row[class_var]] += 1
170
+ end
171
+ count.max_by{|v, c| c}[0]
172
+ end
173
+
174
+ # Returns a hash of {attribute, entropy} given that we divide the dataset
175
+ # on attribute.
176
+ def entropies
177
+ self.pending_attrs.each.with_object({}) do |a, o|
178
+ values = @tree.dataset.column_values(a)
179
+ val_probabilities = values.each.with_object({}) do |v, o|
180
+ o[v] = subset.probability a, v
181
+ end
182
+ val_entropies = values.each.with_object({}) do |v, o|
183
+ o[v] = subset.subset({a => v}).entropy(self.tree.class_var)
184
+ end
185
+ o[a] = values.reduce(0) do |memo, v|
186
+ memo + val_entropies[v] * val_probabilities[v]
187
+ end
188
+ end
189
+ end
190
+ end
1
191
 
2
- class DecisionTreeNode
3
-
4
- # A list of nodes
5
- attr_accessor :children
6
-
7
- # The variable on which this operates, a string
8
- attr_reader :variable
9
-
10
- # The value for the variable, a string
11
- attr_reader :value
12
-
13
- # The decision tree
14
- attr_reader :tree
15
-
16
- attr_reader :entropy
17
-
18
- attr_accessor :next_attribute
19
-
20
- # Accumulates the conditions down the tree
21
- attr_reader :conditions
22
-
23
- # The conditional probability that variable=value
24
- attr_reader :probability
25
-
26
- def initialize tree, parent, variable, value, entropy, conditions=nil
27
- @tree = tree
28
- @parent = parent
29
- @variable = variable
30
- @value = value
31
- @entropy = entropy
32
- @conditions = conditions ? conditions : {}
33
- @children = Array.new
34
- @subset = nil
35
- end
36
-
37
- def to_s
38
- s = @variable ? "#{@variable} => #{@value}" : "ROOT"
39
- s += " (#{@entropy.round(3)})"
40
- end
41
-
42
- def subset
43
- unless @subset.nil?
44
- @subset
45
- else
46
- @subset = @tree.dataset.subset(self.full_conditions)
47
- end
48
- end
49
-
50
- def clear
51
- @subset = nil
52
- end
53
-
54
- def pending_attrs
55
- @tree.dataset.column_names.reject do |name|
56
- @tree.class_var == name or @conditions.include? name
57
- end
58
- end
59
-
60
- # Returns a subset selected on all the parent node's conditions plus this
61
- # node's attribute and its value.
62
- def full_conditions
63
- if @variable
64
- conditions.merge({@variable => @value})
65
- else
66
- conditions
67
- end
68
- end
69
-
70
- def build_subtree recursive=true
71
- if self.try_finish
72
- return
73
- end
74
- subset = self.subset
75
- subset_count = self.subset.count
76
- entropies = self.entropies
77
- inf_gain = entropies.each.with_object({}) do |(a, e), o|
78
- o[a] = @entropy - e
79
- end
80
- max_attr, max_gain = inf_gain.max_by{|v, g| g}
81
- self.next_attribute = max_attr
82
- @children += self.tree.dataset.column_values(max_attr).map do |value|
83
- conditions = self.full_conditions
84
- DecisionTreeNode.new(
85
- tree=@tree, parent=self,
86
- attribute=max_attr, value=value,
87
- entropy=entropies[max_attr], conditions=conditions
88
- )
89
- end
90
- if recursive
91
- @children.each do |c|
92
- c.build_subtree
93
- end
94
- end
95
- end
96
-
97
- # Checks whether any of three is true:
98
- # - All elements in the subset belong to the same class value: a leaf with
99
- # this value is created.
100
- # - There are no more attributes to be selected: a leaf with the most common
101
- # class value is selected
102
- # - There are no more rows in the dataset: a leaf with the most common class
103
- # value in the parent node is created.
104
- def try_finish
105
- var = self.tree.class_var
106
- val = (
107
- self.try_finish_single_value_class or
108
- self.try_finish_empty_subset or
109
- self.try_finish_no_more_attributes
110
- )
111
- if val
112
- @next_attribute = @tree.class_var
113
- self.children << DecisionTreeNode.new(
114
- tree=@tree, parent=self,
115
- variable=@tree.class_var, value=val,
116
- entropy=0, conditions=self.full_conditions
117
- )
118
- else
119
- false
120
- end
121
- end
122
-
123
- # If all class values are the same, returns that value; else, nil.
124
- def try_finish_single_value_class
125
- if self.subset.any?
126
- v0 = self.subset[0][@tree.class_var]
127
- self.subset.slice(1...-1).reduce(v0) do |memo, row|
128
- if memo != row[@tree.class_var]
129
- return nil
130
- else
131
- memo
132
- end
133
- end
134
- else
135
- nil
136
- end
137
-
138
- end
139
-
140
- # If there are not more attributes, returns the most common class value;
141
- # else, nil.
142
- def try_finish_no_more_attributes
143
- if self.pending_attrs.empty? then
144
- self.most_common_value
145
- else
146
- nil
147
- end
148
- end
149
-
150
- # If the subset is empty, returns the most common value in the parent
151
- # node's subset.
152
- def try_finish_empty_subset
153
- if self.subset.empty?
154
- @parent.most_common_value
155
- else
156
- nil
157
- end
158
- end
159
-
160
- # Returns the most common class value in the dataset.
161
- def most_common_value
162
- class_var = @tree.class_var
163
- class_values = @tree.dataset.column_values(class_var)
164
- count = class_values.each.with_object({}) do |val, o|
165
- o[val] = 0
166
- end
167
- self.subset.each.with_object(count) do |row, o|
168
- count[row[class_var]] += 1
169
- end
170
- count.max_by{|v, c| c}[0]
171
- end
172
-
173
- # Returns a hash of {attribute, entropy} given that we divide the dataset
174
- # on attribute.
175
- def entropies
176
- self.pending_attrs.each.with_object({}) do |a, o|
177
- values = @tree.dataset.column_values(a)
178
- val_probabilities = values.each.with_object({}) do |v, o|
179
- o[v] = subset.probability a, v
180
- end
181
- val_entropies = values.each.with_object({}) do |v, o|
182
- o[v] = subset.subset({a => v}).entropy(self.tree.class_var)
183
- end
184
- o[a] = values.reduce(0) do |memo, v|
185
- memo + val_entropies[v] * val_probabilities[v]
186
- end
187
- end
188
- end
189
192
  end
@@ -1,13 +1,16 @@
1
+ module Baobab
1
2
 
2
- module Shannon
3
+ module Shannon
3
4
 
4
- def self.entropy *probabilities
5
- probabilities.reduce(0) do |memo, p|
6
- if p.zero? then 0 else memo + self.entropy_term(p) end
7
- end
8
- end
5
+ def self.entropy *probabilities
6
+ probabilities.reduce(0) do |memo, p|
7
+ if p.zero? then 0 else memo + self.entropy_term(p) end
8
+ end
9
+ end
10
+
11
+ def self.entropy_term probability
12
+ - probability * Math::log2(probability)
13
+ end
14
+ end
9
15
 
10
- def self.entropy_term probability
11
- - probability * Math::log2(probability)
12
- end
13
16
  end
@@ -1,64 +1,68 @@
1
1
  require 'json'
2
2
  require 'set'
3
3
 
4
- class DecisionTree
4
+ module Baobab
5
5
 
6
- # The first decision tree node
7
- attr_reader :root
6
+ class DecisionTree
8
7
 
9
- # The class variable of interest in this decision tree.
10
- attr_reader :class_var
8
+ # The first decision tree node
9
+ attr_reader :root
11
10
 
12
- # The underlying dataset
13
- attr_reader :dataset
11
+ # The class variable of interest in this decision tree.
12
+ attr_reader :class_var
14
13
 
15
- def initialize dataset, class_var
16
- @dataset = dataset
17
- @class_var = class_var
18
- entropy = dataset.entropy class_var
19
- @root = DecisionTreeNode.new(
20
- self, parent=nil,
21
- attribute=nil, value=nil,
22
- entropy
23
- )
24
- @root.build_subtree
25
- end
14
+ # The underlying dataset
15
+ attr_reader :dataset
26
16
 
27
- # Prints the decision depth-first with the respective entropy values.
28
- def to_s
29
- s = ""
30
- nodes = [[0, @root]]
31
- while nodes.any?
32
- l, n = nodes.last
33
- nodes = nodes.slice(0...-1)
34
- n.children.each do |c|
35
- nodes << [l + 1, c]
36
- end
37
- s = s + "#{' ' * (l * 2)}#{n.to_s}\n"
38
- end
39
- return s
40
- end
17
+ def initialize dataset, class_var
18
+ @dataset = dataset
19
+ @class_var = class_var
20
+ entropy = dataset.entropy class_var
21
+ @root = DecisionTreeNode.new(
22
+ self, parent=nil,
23
+ attribute=nil, value=nil,
24
+ entropy
25
+ )
26
+ @root.build_subtree
27
+ end
41
28
 
42
- # Receives attributes and their values (they must be all defined).
43
- # Returns the value of the predicted class value.
44
- def query values
45
- if values.keys != @dataset.attribute_names(@class_var)
46
- raise "Query does not fit all variables"
47
- end
48
- node = @root
49
- while node.variable != @class_var
50
- if node.next_attribute
51
- if node.children.count > 1
52
- val = values[node.next_attribute]
53
- node = node.children.select do |child|
54
- child.value == val
55
- end[0]
56
- else
57
- node = node.children[0]
58
- end
59
- end
60
- end
61
- node.value
62
- end
29
+ # Prints the decision depth-first with the respective entropy values.
30
+ def to_s
31
+ s = ""
32
+ nodes = [[0, @root]]
33
+ while nodes.any?
34
+ l, n = nodes.last
35
+ nodes = nodes.slice(0...-1)
36
+ n.children.each do |c|
37
+ nodes << [l + 1, c]
38
+ end
39
+ s = s + "#{' ' * (l * 2)}#{n.to_s}\n"
40
+ end
41
+ return s
42
+ end
43
+
44
+ # Receives attributes and their values (they must be all defined).
45
+ # Returns the value of the predicted class value.
46
+ def query values
47
+ if values.keys != @dataset.attribute_names(@class_var)
48
+ raise "Query does not fit all variables"
49
+ end
50
+ node = @root
51
+ while node.variable != @class_var
52
+ if node.next_attribute
53
+ if node.children.count > 1
54
+ val = values[node.next_attribute]
55
+ node = node.children.select do |child|
56
+ child.value == val
57
+ end[0]
58
+ else
59
+ node = node.children[0]
60
+ end
61
+ end
62
+ end
63
+ node.value
64
+ end
65
+
66
+ end
63
67
 
64
68
  end
metadata CHANGED
@@ -1,8 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: baobab
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.0
5
- prerelease:
4
+ version: 0.1.1
6
5
  platform: ruby
7
6
  authors:
8
7
  - Johnny E. Lee Othon
@@ -11,16 +10,7 @@ bindir: bin
11
10
  cert_chain: []
12
11
  date: 2015-03-22 00:00:00.000000000 Z
13
12
  dependencies: []
14
- description: ! "# baobab\n\nAn implementation of the ID3 (Iterative Dichotomiser 3)
15
- in Ruby\n\n## How to run the tests\n\n```\nrake tests\n```\n\n## Coming soon\n\nI
16
- promise I'll make this an installable gem. One of these days.\n\n## Sources of the
17
- datasets\n\nThe weather dataset has been adapted from the `weather.nominal.arff`
18
- that comes shipped with [Weka](http://www.cs.waikato.ac.nz/ml/weka/).\n\nThe transportation
19
- dataset was taken from the example data in [https://www.youtube.com/watch?v=wL9aogTuZw8](https://www.youtube.com/watch?v=wL9aogTuZw8).\n\nThe
20
- breast cancer dataset is adapted from the `breast-cancer.arff` file that comes shipped
21
- with Weka. It should be attributed to:\n\nMatjaz Zwitter & Milan Soklic (physicians)\nInstitute
22
- of Oncology \nUniversity Medical Center\nLjubljana, Yugoslavia\nDonors: Ming Tan
23
- and Jeff Schlimmer (Jeffrey.Schlimmer@a.gp.cs.cmu.edu)\nDate: 11 July 1988\n"
13
+ description:
24
14
  email: jleeothon@gmail.com
25
15
  executables: []
26
16
  extensions: []
@@ -34,26 +24,25 @@ files:
34
24
  homepage: https://github.com/jleeothon/baobab
35
25
  licenses:
36
26
  - MIT
27
+ metadata: {}
37
28
  post_install_message:
38
29
  rdoc_options: []
39
30
  require_paths:
40
31
  - lib
41
32
  required_ruby_version: !ruby/object:Gem::Requirement
42
- none: false
43
33
  requirements:
44
- - - ! '>='
34
+ - - ">="
45
35
  - !ruby/object:Gem::Version
46
36
  version: '0'
47
37
  required_rubygems_version: !ruby/object:Gem::Requirement
48
- none: false
49
38
  requirements:
50
- - - ! '>='
39
+ - - ">="
51
40
  - !ruby/object:Gem::Version
52
41
  version: '0'
53
42
  requirements: []
54
43
  rubyforge_project:
55
- rubygems_version: 1.8.23
44
+ rubygems_version: 2.4.6
56
45
  signing_key:
57
- specification_version: 3
46
+ specification_version: 4
58
47
  summary: ID3 decision trees for machine learning in Ruby
59
48
  test_files: []