baobab 0.1.0 → 0.1.1

@@ -0,0 +1,7 @@
+ ---
+ SHA1:
+   metadata.gz: bc9a6568b4998bd54bb07030d90bbebcc6979d72
+   data.tar.gz: a392e91b05f89b6dc055249e9b5256e3fc98a93b
+ SHA512:
+   metadata.gz: cddc12fb7be6d7b5a73f3e659f22df27b52bdd1f75f53e9bddc9da2aa85a0d7499111f4e1b61459d071e489b68017e06306a5287b7aae3a6c84741dafaff8b34
+   data.tar.gz: f6d28f686b71791eed288ccdd6c6e9d6cfba3632ddbaabc0bd162fc5672f613d77c05150814c9d5a1b53854c04f7ab1b52ebc484536a111b8b7d7da78af20021
@@ -1,74 +1,77 @@
+ module Baobab

- # Represents a dataset or subset thereof.
- # Is an array of hashes where all hashes contain the same keys.
- class Dataset < Array
+   # Represents a dataset or subset thereof.
+   # Is an array of hashes where all hashes contain the same keys.
+   class Dataset < Array

-   # Receives an array of hashes. All hashes must contain the same keys.
-   def initialize data
-     data.each do |row|
-       self << row
-     end
-   end
+     # Receives an array of hashes. All hashes must contain the same keys.
+     def initialize data
+       data.each do |row|
+         self << row
+       end
+     end

-   def self.from_json filename
-     text = File.read(filename)
-     self.new JSON.parse(text)
-   end
+     def self.from_json filename
+       text = File.read(filename)
+       self.new JSON.parse(text)
+     end

-   def attribute_names class_var
-     self.column_names.reject{|name| name == class_var}
-   end
+     def attribute_names class_var
+       self.column_names.reject{|name| name == class_var}
+     end

-   # Returns an array of the attribute names in the dataset
-   # Careful: it's empty on an empty set.
-   def column_names
-     self[0].keys
-   end
+     # Returns an array of the attribute names in the dataset
+     # Careful: it's empty on an empty set.
+     def column_names
+       self[0].keys
+     end

-   # Returns an array of the values of an attribute in the dataset.
-   # Careful: it's empty on an empty set.
-   def column_values attribute
-     self.map{|row| row[attribute]}.to_a.uniq
-   end
+     # Returns an array of the values of an attribute in the dataset.
+     # Careful: it's empty on an empty set.
+     def column_values attribute
+       self.map{|row| row[attribute]}.to_a.uniq
+     end

-   # Gets a subset with given conditions. Keys must be of the same type as
-   # in the dataset (be careful with symbols).
-   def subset conditions
-     rows = self.select do |row|
-       conditions.reduce(true) do |memo, (var, val)|
-         memo and row[var] == val
-       end
-     end
-     Dataset.new rows
-   end
+     # Gets a subset with given conditions. Keys must be of the same type as
+     # in the dataset (be careful with symbols).
+     def subset conditions
+       rows = self.select do |row|
+         conditions.reduce(true) do |memo, (var, val)|
+           memo and row[var] == val
+         end
+       end
+       Dataset.new rows
+     end

-   def entropy class_var
-     class_vals = self.column_values(class_var)
-     probabilities = class_vals.map do |class_val|
-       self.probability(class_var, class_val)
-     end
-     Shannon::entropy *probabilities
-   end
+     def entropy class_var
+       class_vals = self.column_values(class_var)
+       probabilities = class_vals.map do |class_val|
+         self.probability(class_var, class_val)
+       end
+       Shannon::entropy *probabilities
+     end

-   # Evaluates the probability that var be val in this dataset.
-   # Can also be used for subsets.
-   def probability var, val
-     unless self.count.zero?
-       self.count{|r| r[var] == val}.fdiv(self.count)
-     else
-       0
-     end
-   end
+     # Evaluates the probability that var be val in this dataset.
+     # Can also be used for subsets.
+     def probability var, val
+       unless self.count.zero?
+         self.count{|r| r[var] == val}.fdiv(self.count)
+       else
+         0
+       end
+     end
+
+     def validate
+       raise 'Dataset is empty' if self.empty?
+       self.reduce(self[0].keys) do |memo, row|
+         if memo == row.keys then
+           memo
+         else
+           raise 'Dataset is inconsistent'
+         end
+       end
+       return nil
+     end
+   end

-   def validate
-     raise 'Dataset is empty' if self.empty?
-     self.reduce(self[0].keys) do |memo, row|
-       if memo == row.keys then
-         memo
-       else
-         raise 'Dataset is inconsistent'
-       end
-     end
-     return nil
-   end
 end
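The `Dataset` class shown above is the gem's data container: an `Array` of hashes with identical keys, plus helpers for filtering, probabilities, and entropy. A minimal usage sketch with hypothetical rows; it assumes the gem's files are loaded (e.g. via `require 'baobab'`) and uses the `Baobab` namespace introduced in 0.1.1:

```ruby
require 'baobab'  # assumption: this require loads the gem's classes

rows = [
  {'outlook' => 'sunny', 'play' => 'no'},
  {'outlook' => 'rainy', 'play' => 'yes'},
  {'outlook' => 'rainy', 'play' => 'yes'},
]
data = Baobab::Dataset.new(rows)

data.column_names                   # => ["outlook", "play"]
data.column_values('outlook')       # => ["sunny", "rainy"]
data.probability('play', 'yes')     # => 0.6666666666666666
data.subset('outlook' => 'rainy')   # => a Dataset holding the two rainy rows
data.entropy('play')                # ~0.918 bits for this 2/3 vs 1/3 split
```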
@@ -1,189 +1,192 @@
+ module Baobab
+
+   class DecisionTreeNode
+
+     # A list of nodes
+     attr_accessor :children
+
+     # The variable on which this operates, a string
+     attr_reader :variable
+
+     # The value for the variable, a string
+     attr_reader :value
+
+     # The decision tree
+     attr_reader :tree
+
+     attr_reader :entropy
+
+     attr_accessor :next_attribute
+
+     # Accumulates the conditions down the tree
+     attr_reader :conditions
+
+     # The conditional probability that variable=value
+     attr_reader :probability
+
+     def initialize tree, parent, variable, value, entropy, conditions=nil
+       @tree = tree
+       @parent = parent
+       @variable = variable
+       @value = value
+       @entropy = entropy
+       @conditions = conditions ? conditions : {}
+       @children = Array.new
+       @subset = nil
+     end
+
+     def to_s
+       s = @variable ? "#{@variable} => #{@value}" : "ROOT"
+       s += " (#{@entropy.round(3)})"
+     end
+
+     def subset
+       unless @subset.nil?
+         @subset
+       else
+         @subset = @tree.dataset.subset(self.full_conditions)
+       end
+     end
+
+     def clear
+       @subset = nil
+     end
+
+     def pending_attrs
+       @tree.dataset.column_names.reject do |name|
+         @tree.class_var == name or @conditions.include? name
+       end
+     end
+
+     # Returns a subset selected on all the parent node's conditions plus this
+     # node's attribute and its value.
+     def full_conditions
+       if @variable
+         conditions.merge({@variable => @value})
+       else
+         conditions
+       end
+     end
+
+     def build_subtree recursive=true
+       if self.try_finish
+         return
+       end
+       subset = self.subset
+       subset_count = self.subset.count
+       entropies = self.entropies
+       inf_gain = entropies.each.with_object({}) do |(a, e), o|
+         o[a] = @entropy - e
+       end
+       max_attr, max_gain = inf_gain.max_by{|v, g| g}
+       self.next_attribute = max_attr
+       @children += self.tree.dataset.column_values(max_attr).map do |value|
+         conditions = self.full_conditions
+         DecisionTreeNode.new(
+           tree=@tree, parent=self,
+           attribute=max_attr, value=value,
+           entropy=entropies[max_attr], conditions=conditions
+         )
+       end
+       if recursive
+         @children.each do |c|
+           c.build_subtree
+         end
+       end
+     end
+
+     # Checks whether any of three is true:
+     # - All elements in the subset belong to the same class value: a leaf with
+     # this value is created.
+     # - There are no more attributes to be selected: a leaf with the most common
+     # class value is selected
+     # - There are no more rows in the dataset: a leaf with the most common class
+     # value in the parent node is created.
+     def try_finish
+       var = self.tree.class_var
+       val = (
+         self.try_finish_single_value_class or
+         self.try_finish_empty_subset or
+         self.try_finish_no_more_attributes
+       )
+       if val
+         @next_attribute = @tree.class_var
+         self.children << DecisionTreeNode.new(
+           tree=@tree, parent=self,
+           variable=@tree.class_var, value=val,
+           entropy=0, conditions=self.full_conditions
+         )
+       else
+         false
+       end
+     end
+
+     # If all class values are the same, returns that value; else, nil.
+     def try_finish_single_value_class
+       if self.subset.any?
+         v0 = self.subset[0][@tree.class_var]
+         self.subset.slice(1...-1).reduce(v0) do |memo, row|
+           if memo != row[@tree.class_var]
+             return nil
+           else
+             memo
+           end
+         end
+       else
+         nil
+       end
+
+     end
+
+     # If there are not more attributes, returns the most common class value;
+     # else, nil.
+     def try_finish_no_more_attributes
+       if self.pending_attrs.empty? then
+         self.most_common_value
+       else
+         nil
+       end
+     end
+
+     # If the subset is empty, returns the most common value in the parent
+     # node's subset.
+     def try_finish_empty_subset
+       if self.subset.empty?
+         @parent.most_common_value
+       else
+         nil
+       end
+     end
+
+     # Returns the most common class value in the dataset.
+     def most_common_value
+       class_var = @tree.class_var
+       class_values = @tree.dataset.column_values(class_var)
+       count = class_values.each.with_object({}) do |val, o|
+         o[val] = 0
+       end
+       self.subset.each.with_object(count) do |row, o|
+         count[row[class_var]] += 1
+       end
+       count.max_by{|v, c| c}[0]
+     end
+
+     # Returns a hash of {attribute, entropy} given that we divide the dataset
+     # on attribute.
+     def entropies
+       self.pending_attrs.each.with_object({}) do |a, o|
+         values = @tree.dataset.column_values(a)
+         val_probabilities = values.each.with_object({}) do |v, o|
+           o[v] = subset.probability a, v
+         end
+         val_entropies = values.each.with_object({}) do |v, o|
+           o[v] = subset.subset({a => v}).entropy(self.tree.class_var)
+         end
+         o[a] = values.reduce(0) do |memo, v|
+           memo + val_entropies[v] * val_probabilities[v]
+         end
+       end
+     end
+   end

- class DecisionTreeNode
-
-   # A list of nodes
-   attr_accessor :children
-
-   # The variable on which this operates, a string
-   attr_reader :variable
-
-   # The value for the variable, a string
-   attr_reader :value
-
-   # The decision tree
-   attr_reader :tree
-
-   attr_reader :entropy
-
-   attr_accessor :next_attribute
-
-   # Accumulates the conditions down the tree
-   attr_reader :conditions
-
-   # The conditional probability that variable=value
-   attr_reader :probability
-
-   def initialize tree, parent, variable, value, entropy, conditions=nil
-     @tree = tree
-     @parent = parent
-     @variable = variable
-     @value = value
-     @entropy = entropy
-     @conditions = conditions ? conditions : {}
-     @children = Array.new
-     @subset = nil
-   end
-
-   def to_s
-     s = @variable ? "#{@variable} => #{@value}" : "ROOT"
-     s += " (#{@entropy.round(3)})"
-   end
-
-   def subset
-     unless @subset.nil?
-       @subset
-     else
-       @subset = @tree.dataset.subset(self.full_conditions)
-     end
-   end
-
-   def clear
-     @subset = nil
-   end
-
-   def pending_attrs
-     @tree.dataset.column_names.reject do |name|
-       @tree.class_var == name or @conditions.include? name
-     end
-   end
-
-   # Returns a subset selected on all the parent node's conditions plus this
-   # node's attribute and its value.
-   def full_conditions
-     if @variable
-       conditions.merge({@variable => @value})
-     else
-       conditions
-     end
-   end
-
-   def build_subtree recursive=true
-     if self.try_finish
-       return
-     end
-     subset = self.subset
-     subset_count = self.subset.count
-     entropies = self.entropies
-     inf_gain = entropies.each.with_object({}) do |(a, e), o|
-       o[a] = @entropy - e
-     end
-     max_attr, max_gain = inf_gain.max_by{|v, g| g}
-     self.next_attribute = max_attr
-     @children += self.tree.dataset.column_values(max_attr).map do |value|
-       conditions = self.full_conditions
-       DecisionTreeNode.new(
-         tree=@tree, parent=self,
-         attribute=max_attr, value=value,
-         entropy=entropies[max_attr], conditions=conditions
-       )
-     end
-     if recursive
-       @children.each do |c|
-         c.build_subtree
-       end
-     end
-   end
-
-   # Checks whether any of three is true:
-   # - All elements in the subset belong to the same class value: a leaf with
-   # this value is created.
-   # - There are no more attributes to be selected: a leaf with the most common
-   # class value is selected
-   # - There are no more rows in the dataset: a leaf with the most common class
-   # value in the parent node is created.
-   def try_finish
-     var = self.tree.class_var
-     val = (
-       self.try_finish_single_value_class or
-       self.try_finish_empty_subset or
-       self.try_finish_no_more_attributes
-     )
-     if val
-       @next_attribute = @tree.class_var
-       self.children << DecisionTreeNode.new(
-         tree=@tree, parent=self,
-         variable=@tree.class_var, value=val,
-         entropy=0, conditions=self.full_conditions
-       )
-     else
-       false
-     end
-   end
-
-   # If all class values are the same, returns that value; else, nil.
-   def try_finish_single_value_class
-     if self.subset.any?
-       v0 = self.subset[0][@tree.class_var]
-       self.subset.slice(1...-1).reduce(v0) do |memo, row|
-         if memo != row[@tree.class_var]
-           return nil
-         else
-           memo
-         end
-       end
-     else
-       nil
-     end
-
-   end
-
-   # If there are not more attributes, returns the most common class value;
-   # else, nil.
-   def try_finish_no_more_attributes
-     if self.pending_attrs.empty? then
-       self.most_common_value
-     else
-       nil
-     end
-   end
-
-   # If the subset is empty, returns the most common value in the parent
-   # node's subset.
-   def try_finish_empty_subset
-     if self.subset.empty?
-       @parent.most_common_value
-     else
-       nil
-     end
-   end
-
-   # Returns the most common class value in the dataset.
-   def most_common_value
-     class_var = @tree.class_var
-     class_values = @tree.dataset.column_values(class_var)
-     count = class_values.each.with_object({}) do |val, o|
-       o[val] = 0
-     end
-     self.subset.each.with_object(count) do |row, o|
-       count[row[class_var]] += 1
-     end
-     count.max_by{|v, c| c}[0]
-   end
-
-   # Returns a hash of {attribute, entropy} given that we divide the dataset
-   # on attribute.
-   def entropies
-     self.pending_attrs.each.with_object({}) do |a, o|
-       values = @tree.dataset.column_values(a)
-       val_probabilities = values.each.with_object({}) do |v, o|
-         o[v] = subset.probability a, v
-       end
-       val_entropies = values.each.with_object({}) do |v, o|
-         o[v] = subset.subset({a => v}).entropy(self.tree.class_var)
-       end
-       o[a] = values.reduce(0) do |memo, v|
-         memo + val_entropies[v] * val_probabilities[v]
-       end
-     end
-   end
 end
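Taken together, `entropies` and `build_subtree` implement the ID3 split criterion: for every attribute not yet fixed by a condition, compute the expected entropy of the class variable after splitting on that attribute, and keep the attribute with the largest information gain (the node's entropy minus that expected entropy). The same computation can be spelled out by hand with the `Dataset` API; a hedged sketch, where the helper name and its arguments are illustrative and not part of the gem:

```ruby
# Information gain obtained by splitting `data` on `attr`, written out by hand.
# Mirrors what DecisionTreeNode#entropies and #build_subtree do internally.
def information_gain data, attr, class_var
  expected = data.column_values(attr).reduce(0) do |sum, v|
    p = data.probability(attr, v)                        # weight |S_v| / |S|
    sum + p * data.subset(attr => v).entropy(class_var)  # weighted child entropy
  end
  data.entropy(class_var) - expected                     # H(S) - sum_v p_v * H(S_v)
end
```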
@@ -1,13 +1,16 @@
+ module Baobab

- module Shannon
+   module Shannon

-   def self.entropy *probabilities
-     probabilities.reduce(0) do |memo, p|
-       if p.zero? then 0 else memo + self.entropy_term(p) end
-     end
-   end
+     def self.entropy *probabilities
+       probabilities.reduce(0) do |memo, p|
+         if p.zero? then 0 else memo + self.entropy_term(p) end
+       end
+     end
+
+     def self.entropy_term probability
+       - probability * Math::log2(probability)
+     end
+   end

-   def self.entropy_term probability
-     - probability * Math::log2(probability)
-   end
 end
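`Shannon.entropy` accumulates the term -p·log2(p) for each probability passed to it (zero probabilities are special-cased to avoid log2(0)), so a fair coin comes out at exactly one bit. A quick sketch of the values it produces; the numbers are standard entropy results, not taken from the gem's tests:

```ruby
Baobab::Shannon.entropy(0.5, 0.5)    # => 1.0  (one bit for a fair coin)
Baobab::Shannon.entropy(1.0)         # => 0.0  (a certain outcome carries no information)
Baobab::Shannon.entropy_term(0.25)   # => 0.5  (-0.25 * log2(0.25))
```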
@@ -1,64 +1,68 @@
  require 'json'
  require 'set'

- class DecisionTree
+ module Baobab

-   # The first decision tree node
-   attr_reader :root
+   class DecisionTree

-   # The class variable of interest in this decision tree.
-   attr_reader :class_var
+     # The first decision tree node
+     attr_reader :root

-   # The underlying dataset
-   attr_reader :dataset
+     # The class variable of interest in this decision tree.
+     attr_reader :class_var

-   def initialize dataset, class_var
-     @dataset = dataset
-     @class_var = class_var
-     entropy = dataset.entropy class_var
-     @root = DecisionTreeNode.new(
-       self, parent=nil,
-       attribute=nil, value=nil,
-       entropy
-     )
-     @root.build_subtree
-   end
+     # The underlying dataset
+     attr_reader :dataset

-   # Prints the decision depth-first with the respective entropy values.
-   def to_s
-     s = ""
-     nodes = [[0, @root]]
-     while nodes.any?
-       l, n = nodes.last
-       nodes = nodes.slice(0...-1)
-       n.children.each do |c|
-         nodes << [l + 1, c]
-       end
-       s = s + "#{' ' * (l * 2)}#{n.to_s}\n"
-     end
-     return s
-   end
+     def initialize dataset, class_var
+       @dataset = dataset
+       @class_var = class_var
+       entropy = dataset.entropy class_var
+       @root = DecisionTreeNode.new(
+         self, parent=nil,
+         attribute=nil, value=nil,
+         entropy
+       )
+       @root.build_subtree
+     end

-   # Receives attributes and their values (they must be all defined).
-   # Returns the value of the predicted class value.
-   def query values
-     if values.keys != @dataset.attribute_names(@class_var)
-       raise "Query does not fit all variables"
-     end
-     node = @root
-     while node.variable != @class_var
-       if node.next_attribute
-         if node.children.count > 1
-           val = values[node.next_attribute]
-           node = node.children.select do |child|
-             child.value == val
-           end[0]
-         else
-           node = node.children[0]
-         end
-       end
-     end
-     node.value
-   end
+     # Prints the decision depth-first with the respective entropy values.
+     def to_s
+       s = ""
+       nodes = [[0, @root]]
+       while nodes.any?
+         l, n = nodes.last
+         nodes = nodes.slice(0...-1)
+         n.children.each do |c|
+           nodes << [l + 1, c]
+         end
+         s = s + "#{' ' * (l * 2)}#{n.to_s}\n"
+       end
+       return s
+     end
+
+     # Receives attributes and their values (they must be all defined).
+     # Returns the value of the predicted class value.
+     def query values
+       if values.keys != @dataset.attribute_names(@class_var)
+         raise "Query does not fit all variables"
+       end
+       node = @root
+       while node.variable != @class_var
+         if node.next_attribute
+           if node.children.count > 1
+             val = values[node.next_attribute]
+             node = node.children.select do |child|
+               child.value == val
+             end[0]
+           else
+             node = node.children[0]
+           end
+         end
+       end
+       node.value
+     end
+
+   end

 end
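`DecisionTree.new` builds the whole tree eagerly from a `Dataset` and the name of the class variable, and `query` walks the tree down to a leaf. An end-to-end sketch on hypothetical weather-style rows, under the same loading assumptions as above; note that `query` expects a hash whose keys match `attribute_names` exactly:

```ruby
rows = [
  {'outlook' => 'sunny', 'windy' => 'false', 'play' => 'no'},
  {'outlook' => 'sunny', 'windy' => 'true',  'play' => 'no'},
  {'outlook' => 'rainy', 'windy' => 'false', 'play' => 'yes'},
  {'outlook' => 'rainy', 'windy' => 'true',  'play' => 'no'},
]
tree = Baobab::DecisionTree.new(Baobab::Dataset.new(rows), 'play')

puts tree   # depth-first dump of the nodes with their entropies
tree.query('outlook' => 'rainy', 'windy' => 'false')   # => "yes" on this toy data
```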
metadata CHANGED
@@ -1,8 +1,7 @@
  --- !ruby/object:Gem::Specification
  name: baobab
  version: !ruby/object:Gem::Version
-   version: 0.1.0
-   prerelease:
+   version: 0.1.1
  platform: ruby
  authors:
  - Johnny E. Lee Othon
@@ -11,16 +10,7 @@ bindir: bin
  cert_chain: []
  date: 2015-03-22 00:00:00.000000000 Z
  dependencies: []
- description: ! "# baobab\n\nAn implementation of the ID3 (Iterative Dichotomiser 3)
-   in Ruby\n\n## How to run the tests\n\n```\nrake tests\n```\n\n## Coming soon\n\nI
-   promise I'll make this an installable gem. One of these days.\n\n## Sources of the
-   datasets\n\nThe weather dataset has been adapted from the `weather.nominal.arff`
-   that comes shipped with [Weka](http://www.cs.waikato.ac.nz/ml/weka/).\n\nThe transportation
-   dataset was taken from the example data in [https://www.youtube.com/watch?v=wL9aogTuZw8](https://www.youtube.com/watch?v=wL9aogTuZw8).\n\nThe
-   breast cancer dataset is adapted from the `breast-cancer.arff` file that comes shipped
-   with Weka. It should be attributed to:\n\nMatjaz Zwitter & Milan Soklic (physicians)\nInstitute
-   of Oncology \nUniversity Medical Center\nLjubljana, Yugoslavia\nDonors: Ming Tan
-   and Jeff Schlimmer (Jeffrey.Schlimmer@a.gp.cs.cmu.edu)\nDate: 11 July 1988\n"
+ description:
  email: jleeothon@gmail.com
  executables: []
  extensions: []
@@ -34,26 +24,25 @@ files:
  homepage: https://github.com/jleeothon/baobab
  licenses:
  - MIT
+ metadata: {}
  post_install_message:
  rdoc_options: []
  require_paths:
  - lib
  required_ruby_version: !ruby/object:Gem::Requirement
-   none: false
    requirements:
-   - - ! '>='
+   - - ">="
      - !ruby/object:Gem::Version
        version: '0'
  required_rubygems_version: !ruby/object:Gem::Requirement
-   none: false
    requirements:
-   - - ! '>='
+   - - ">="
      - !ruby/object:Gem::Version
        version: '0'
  requirements: []
  rubyforge_project:
- rubygems_version: 1.8.23
+ rubygems_version: 2.4.6
  signing_key:
- specification_version: 3
+ specification_version: 4
  summary: ID3 decision trees for machine learning in Ruby
  test_files: []