ai4ruby 1.11

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (79)
  1. data/README.rdoc +47 -0
  2. data/examples/classifiers/id3_data.csv +121 -0
  3. data/examples/classifiers/id3_example.rb +29 -0
  4. data/examples/classifiers/naive_bayes_data.csv +11 -0
  5. data/examples/classifiers/naive_bayes_example.rb +16 -0
  6. data/examples/classifiers/results.txt +31 -0
  7. data/examples/genetic_algorithm/genetic_algorithm_example.rb +37 -0
  8. data/examples/genetic_algorithm/travel_cost.csv +16 -0
  9. data/examples/neural_network/backpropagation_example.rb +67 -0
  10. data/examples/neural_network/patterns_with_base_noise.rb +68 -0
  11. data/examples/neural_network/patterns_with_noise.rb +66 -0
  12. data/examples/neural_network/training_patterns.rb +68 -0
  13. data/examples/neural_network/xor_example.rb +35 -0
  14. data/examples/som/som_data.rb +156 -0
  15. data/examples/som/som_multi_node_example.rb +22 -0
  16. data/examples/som/som_single_example.rb +24 -0
  17. data/lib/ai4r.rb +33 -0
  18. data/lib/ai4r/classifiers/classifier.rb +62 -0
  19. data/lib/ai4r/classifiers/hyperpipes.rb +118 -0
  20. data/lib/ai4r/classifiers/ib1.rb +121 -0
  21. data/lib/ai4r/classifiers/id3.rb +326 -0
  22. data/lib/ai4r/classifiers/multilayer_perceptron.rb +135 -0
  23. data/lib/ai4r/classifiers/naive_bayes.rb +259 -0
  24. data/lib/ai4r/classifiers/one_r.rb +110 -0
  25. data/lib/ai4r/classifiers/prism.rb +197 -0
  26. data/lib/ai4r/classifiers/zero_r.rb +73 -0
  27. data/lib/ai4r/clusterers/average_linkage.rb +59 -0
  28. data/lib/ai4r/clusterers/bisecting_k_means.rb +93 -0
  29. data/lib/ai4r/clusterers/centroid_linkage.rb +66 -0
  30. data/lib/ai4r/clusterers/clusterer.rb +61 -0
  31. data/lib/ai4r/clusterers/complete_linkage.rb +67 -0
  32. data/lib/ai4r/clusterers/diana.rb +139 -0
  33. data/lib/ai4r/clusterers/k_means.rb +126 -0
  34. data/lib/ai4r/clusterers/median_linkage.rb +61 -0
  35. data/lib/ai4r/clusterers/single_linkage.rb +194 -0
  36. data/lib/ai4r/clusterers/ward_linkage.rb +64 -0
  37. data/lib/ai4r/clusterers/ward_linkage_hierarchical.rb +31 -0
  38. data/lib/ai4r/clusterers/weighted_average_linkage.rb +61 -0
  39. data/lib/ai4r/data/data_set.rb +266 -0
  40. data/lib/ai4r/data/parameterizable.rb +64 -0
  41. data/lib/ai4r/data/proximity.rb +100 -0
  42. data/lib/ai4r/data/statistics.rb +77 -0
  43. data/lib/ai4r/experiment/classifier_evaluator.rb +95 -0
  44. data/lib/ai4r/genetic_algorithm/genetic_algorithm.rb +270 -0
  45. data/lib/ai4r/neural_network/backpropagation.rb +326 -0
  46. data/lib/ai4r/neural_network/hopfield.rb +149 -0
  47. data/lib/ai4r/som/layer.rb +68 -0
  48. data/lib/ai4r/som/node.rb +96 -0
  49. data/lib/ai4r/som/som.rb +155 -0
  50. data/lib/ai4r/som/two_phase_layer.rb +90 -0
  51. data/test/classifiers/hyperpipes_test.rb +84 -0
  52. data/test/classifiers/ib1_test.rb +78 -0
  53. data/test/classifiers/id3_test.rb +208 -0
  54. data/test/classifiers/multilayer_perceptron_test.rb +79 -0
  55. data/test/classifiers/naive_bayes_test.rb +43 -0
  56. data/test/classifiers/one_r_test.rb +62 -0
  57. data/test/classifiers/prism_test.rb +85 -0
  58. data/test/classifiers/zero_r_test.rb +49 -0
  59. data/test/clusterers/average_linkage_test.rb +51 -0
  60. data/test/clusterers/bisecting_k_means_test.rb +66 -0
  61. data/test/clusterers/centroid_linkage_test.rb +53 -0
  62. data/test/clusterers/complete_linkage_test.rb +57 -0
  63. data/test/clusterers/diana_test.rb +69 -0
  64. data/test/clusterers/k_means_test.rb +100 -0
  65. data/test/clusterers/median_linkage_test.rb +53 -0
  66. data/test/clusterers/single_linkage_test.rb +122 -0
  67. data/test/clusterers/ward_linkage_hierarchical_test.rb +61 -0
  68. data/test/clusterers/ward_linkage_test.rb +53 -0
  69. data/test/clusterers/weighted_average_linkage_test.rb +53 -0
  70. data/test/data/data_set_test.rb +96 -0
  71. data/test/data/proximity_test.rb +81 -0
  72. data/test/data/statistics_test.rb +65 -0
  73. data/test/experiment/classifier_evaluator_test.rb +76 -0
  74. data/test/genetic_algorithm/chromosome_test.rb +58 -0
  75. data/test/genetic_algorithm/genetic_algorithm_test.rb +81 -0
  76. data/test/neural_network/backpropagation_test.rb +82 -0
  77. data/test/neural_network/hopfield_test.rb +72 -0
  78. data/test/som/som_test.rb +97 -0
  79. metadata +168 -0
@@ -0,0 +1,118 @@
1
+ # Author:: Sergio Fierens (Implementation only)
2
+ # License:: MPL 1.1
3
+ # Project:: ai4r
4
+ # Url:: http://ai4r.rubyforge.org/
5
+ #
6
+ # You can redistribute it and/or modify it under the terms of
7
+ # the Mozilla Public License version 1.1 as published by the
8
+ # Mozilla Foundation at http://www.mozilla.org/MPL/MPL-1.1.txt
9
+
10
+ require 'set'
11
+ require File.dirname(__FILE__) + '/../data/data_set'
12
+ require File.dirname(__FILE__) + '/../classifiers/classifier'
13
+
14
+ module Ai4r
15
+ module Classifiers
16
+
17
+ include Ai4r::Data
18
+
19
+ # = Introduction
20
+ #
21
+ # A fast classifier algorithm, created by Lucio de Souza Coelho
22
+ # and Len Trigg.
23
+ class Hyperpipes < Classifier
24
+
25
+ attr_reader :data_set, :pipes
26
+
27
+ # Build a new Hyperpipes classifier. You must provide a DataSet instance
28
+ # as parameter. The last attribute of each item is considered as
29
+ # the item class.
30
+ def build(data_set)
31
+ data_set.check_not_empty
32
+ @data_set = data_set
33
+ @domains = data_set.build_domains
34
+
35
+ @pipes = {}
36
+ @domains.last.each {|cat| @pipes[cat] = build_pipe(@data_set)}
37
+ @data_set.data_items.each {|item| update_pipe(@pipes[item.last], item) }
38
+
39
+ return self
40
+ end
41
+
42
+ # You can evaluate new data, predicting its class.
43
+ # e.g.
44
+ # classifier.eval(['New York', '<30', 'F']) # => 'Y'
45
+ def eval(data)
46
+ votes = Hash.new {0}
47
+ @pipes.each do |category, pipe|
48
+ pipe.each_with_index do |bounds, i|
49
+ if data[i].is_a? Numeric
50
+ votes[category]+=1 if data[i]>=bounds[:min] && data[i]<=bounds[:max]
51
+ else
52
+ votes[category]+=1 if bounds[data[i]]
53
+ end
54
+ end
55
+ end
56
+ return votes.to_a.max {|x, y| x.last <=> y.last}.first
57
+ end
58
+
59
+ # This method returns the generated rules in ruby code.
60
+ # e.g.
61
+ #
62
+ # classifier.get_rules
63
+ # # => if age_range == '<30' then marketing_target = 'Y'
64
+ # elsif age_range == '[30-50)' then marketing_target = 'N'
65
+ # elsif age_range == '[50-80]' then marketing_target = 'N'
66
+ # end
67
+ #
68
+ # It is a nice way to inspect induction results, and also to execute them:
69
+ # marketing_target = nil
70
+ # eval classifier.get_rules
71
+ # puts marketing_target
72
+ # # => 'Y'
73
+ def get_rules
74
+ rules = []
75
+ rules << "votes = Hash.new {0}"
76
+ data = @data_set.data_items.first
77
+ labels = @data_set.data_labels.collect {|l| l.to_s}
78
+ @pipes.each do |category, pipe|
79
+ pipe.each_with_index do |bounds, i|
80
+ rule = "votes['#{category}'] += 1 "
81
+ if data[i].is_a? Numeric
82
+ rule += "if #{labels[i]} >= #{bounds[:min]} && #{labels[i]} <= #{bounds[:max]}"
83
+ else
84
+ rule += "if #{bounds.inspect}[#{labels[i]}]"
85
+ end
86
+ rules << rule
87
+ end
88
+ end
89
+ rules << "#{labels.last} = votes.to_a.max {|x, y| x.last <=> y.last}.first"
90
+ return rules.join("\n")
91
+ end
92
+
93
+ protected
94
+
95
+ def build_pipe(data_set)
96
+ data_set.data_items.first[0...-1].collect do |att|
97
+ if att.is_a? Numeric
98
+ {:min=>1.0/0, :max=>-1.0/0}
99
+ else
100
+ Hash.new(false)
101
+ end
102
+ end
103
+ end
104
+
105
+ def update_pipe(pipe, data_item)
106
+ data_item[0...-1].each_with_index do |att, i|
107
+ if att.is_a? Numeric
108
+ pipe[i][:min] = att if att < pipe[i][:min]
109
+ pipe[i][:max] = att if att > pipe[i][:max]
110
+ else
111
+ pipe[i][att] = true
112
+ end
113
+ end
114
+ end
115
+
116
+ end
117
+ end
118
+ end
@@ -0,0 +1,121 @@
1
+ # Author:: Sergio Fierens (Implementation only)
2
+ # License:: MPL 1.1
3
+ # Project:: ai4r
4
+ # Url:: http://ai4r.rubyforge.org/
5
+ #
6
+ # You can redistribute it and/or modify it under the terms of
7
+ # the Mozilla Public License version 1.1 as published by the
8
+ # Mozilla Foundation at http://www.mozilla.org/MPL/MPL-1.1.txt
9
+
10
+ require 'set'
11
+ require File.dirname(__FILE__) + '/../data/data_set'
12
+ require File.dirname(__FILE__) + '/../classifiers/classifier'
13
+
14
+ module Ai4r
15
+ module Classifiers
16
+
17
+ # = Introduction
18
+ #
19
+ # IB1 algorithm implementation.
20
+ # IB1 is the simplest instance-based learning (IBL) algorithm.
21
+ #
22
+ # D. Aha, D. Kibler (1991). Instance-based learning algorithms.
23
+ # Machine Learning. 6:37-66.
24
+ #
25
+ # IBI is identical to the nearest neighbor algorithm except that
26
+ # it normalizes its attributes' ranges, processes instances
27
+ # incrementally, and has a simple policy for tolerating missing values
28
+ class IB1 < Classifier
29
+
30
+ attr_reader :data_set
31
+
32
+ # Build a new IB1 classifier. You must provide a DataSet instance
33
+ # as parameter. The last attribute of each item is considered as
34
+ # the item class.
35
+ def build(data_set)
36
+ data_set.check_not_empty
37
+ @data_set = data_set
38
+ @min_values = Array.new(data_set.data_labels.length)
39
+ @max_values = Array.new(data_set.data_labels.length)
40
+ data_set.data_items.each { |data_item| update_min_max(data_item[0...-1]) }
41
+ return self
42
+ end
43
+
44
+ # You can evaluate new data, predicting its class.
45
+ # e.g.
46
+ # classifier.eval(['New York', '<30', 'F']) # => 'Y'
47
+ def eval(data)
48
+ update_min_max(data)
49
+ min_distance = 1.0/0
50
+ klass = nil
51
+ @data_set.data_items.each do |train_item|
52
+ d = distance(data, train_item)
53
+ if d < min_distance
54
+ min_distance = d
55
+ klass = train_item.last
56
+ end
57
+ end
58
+ return klass
59
+ end
60
+
61
+ protected
62
+
63
+ # We keep in the state the min and max value of each attribute,
64
+ # to provide normalized distances between to values of a numeric attribute
65
+ def update_min_max(atts)
66
+ atts.each_with_index do |att, i|
67
+ if att && att.is_a?(Numeric)
68
+ @min_values[i] = att if @min_values[i].nil? || @min_values[i] > att
69
+ @max_values[i] = att if @max_values[i].nil? || @max_values[i] < att
70
+ end
71
+ end
72
+ end
73
+
74
+ # Normalized distance between 2 instances
75
+ #
76
+ #
77
+ # Returns sum of
78
+ # * squared difference between normalized numeric att values
79
+ # * 1 for nominal atts which differs or one is missing
80
+ # * 1 if both atts are missing
81
+ # * normalized numeric att value if other att value is missing and > 0.5
82
+ # * 1.0-normalized numeric att value if other att value is missing and < 0.5
83
+ def distance(a, b)
84
+ d = 0
85
+ a.each_with_index do |att_a, i|
86
+ att_b = b[i]
87
+ if att_a.nil?
88
+ if att_b.is_a? Numeric
89
+ diff = norm(att_b, i)
90
+ diff = 1.0 - diff if diff < 0.5
91
+ else
92
+ diff = 1
93
+ end
94
+ elsif att_a.is_a? Numeric
95
+ if att_b.is_a? Numeric
96
+ diff = norm(att_a, i) - norm(att_b, i);
97
+ else
98
+ diff = norm(att_a, i)
99
+ diff = 1.0 - diff if diff < 0.5
100
+ end
101
+ elsif att_a != att_b
102
+ diff = 1
103
+ else
104
+ diff = 0
105
+ end
106
+ d += diff * diff
107
+ end
108
+ return d
109
+ end
110
+
111
+ # Returns normalized value att
112
+ #
113
+ # index is the index of the attribute in the instance.
114
+ def norm(att, index)
115
+ return 0 if @min_values[index].nil?
116
+ return 1.0*(att - @min_values[index]) / (@max_values[index] -@min_values[index]);
117
+ end
118
+
119
+ end
120
+ end
121
+ end
@@ -0,0 +1,326 @@
1
+ # Author:: Sergio Fierens (Implementation, Quinlan is
2
+ # the creator of the algorithm)
3
+ # License:: MPL 1.1
4
+ # Project:: ai4r
5
+ # Url:: http://ai4r.rubyforge.org/
6
+ #
7
+ # You can redistribute it and/or modify it under the terms of
8
+ # the Mozilla Public License version 1.1 as published by the
9
+ # Mozilla Foundation at http://www.mozilla.org/MPL/MPL-1.1.txt
10
+
11
+ require File.dirname(__FILE__) + '/../data/data_set'
12
+ require File.dirname(__FILE__) + '/../classifiers/classifier'
13
+
14
+ module Ai4r
15
+
16
+ module Classifiers
17
+
18
+ # = Introduction
19
+ # This is an implementation of the ID3 algorithm (Quinlan)
20
+ # Given a set of preclassified examples, it builds a top-down
21
+ # induction of decision tree, biased by the information gain and
22
+ # entropy measure.
23
+ #
24
+ # * http://en.wikipedia.org/wiki/Decision_tree
25
+ # * http://en.wikipedia.org/wiki/ID3_algorithm
26
+ #
27
+ # = How to use it
28
+ #
29
+ # DATA_LABELS = [ 'city', 'age_range', 'gender', 'marketing_target' ]
30
+ #
31
+ # DATA_ITEMS = [
32
+ # ['New York', '<30', 'M', 'Y'],
33
+ # ['Chicago', '<30', 'M', 'Y'],
34
+ # ['Chicago', '<30', 'F', 'Y'],
35
+ # ['New York', '<30', 'M', 'Y'],
36
+ # ['New York', '<30', 'M', 'Y'],
37
+ # ['Chicago', '[30-50)', 'M', 'Y'],
38
+ # ['New York', '[30-50)', 'F', 'N'],
39
+ # ['Chicago', '[30-50)', 'F', 'Y'],
40
+ # ['New York', '[30-50)', 'F', 'N'],
41
+ # ['Chicago', '[50-80]', 'M', 'N'],
42
+ # ['New York', '[50-80]', 'F', 'N'],
43
+ # ['New York', '[50-80]', 'M', 'N'],
44
+ # ['Chicago', '[50-80]', 'M', 'N'],
45
+ # ['New York', '[50-80]', 'F', 'N'],
46
+ # ['Chicago', '>80', 'F', 'Y']
47
+ # ]
48
+ #
49
+ # data_set = DataSet.new(:data_items=>DATA_SET, :data_labels=>DATA_LABELS)
50
+ # id3 = Ai4r::Classifiers::ID3.new.build(data_set)
51
+ #
52
+ # id3.get_rules
53
+ # # => if age_range=='<30' then marketing_target='Y'
54
+ # elsif age_range=='[30-50)' and city=='Chicago' then marketing_target='Y'
55
+ # elsif age_range=='[30-50)' and city=='New York' then marketing_target='N'
56
+ # elsif age_range=='[50-80]' then marketing_target='N'
57
+ # elsif age_range=='>80' then marketing_target='Y'
58
+ # else raise 'There was not enough information during training to do a proper induction for this data element' end
59
+ #
60
+ # id3.eval(['New York', '<30', 'M'])
61
+ # # => 'Y'
62
+ #
63
+ # = A better way to load the data
64
+ #
65
+ # In the real life you will use lot more data training examples, with more
66
+ # attributes. Consider moving your data to an external CSV (comma separate
67
+ # values) file.
68
+ #
69
+ # data_file = "#{File.dirname(__FILE__)}/data_set.csv"
70
+ # data_set = DataSet.load_csv_with_labels data_file
71
+ # id3 = Ai4r::Classifiers::ID3.new.build(data_set)
72
+ #
73
+ # = A nice tip for data evaluation
74
+ #
75
+ # id3 = Ai4r::Classifiers::ID3.new.build(data_set)
76
+ #
77
+ # age_range = '<30'
78
+ # marketing_target = nil
79
+ # eval id3.get_rules
80
+ # puts marketing_target
81
+ # # => 'Y'
82
+ #
83
+ # = More about ID3 and decision trees
84
+ #
85
+ # * http://en.wikipedia.org/wiki/Decision_tree
86
+ # * http://en.wikipedia.org/wiki/ID3_algorithm
87
+ #
88
+ # = About the project
89
+ # Author:: Sergio Fierens
90
+ # License:: MPL 1.1
91
+ # Url:: http://ai4r.rubyforge.org/
92
+ class ID3 < Classifier
93
+
94
+ attr_reader :data_set
95
+
96
+ # Create a new ID3 classifier. You must provide a DataSet instance
97
+ # as parameter. The last attribute of each item is considered as the
98
+ # item class.
99
+ def build(data_set)
100
+ data_set.check_not_empty
101
+ @data_set = data_set
102
+ preprocess_data(@data_set.data_items)
103
+ return self
104
+ end
105
+
106
+ # You can evaluate new data, predicting its category.
107
+ # e.g.
108
+ # id3.eval(['New York', '<30', 'F']) # => 'Y'
109
+ def eval(data)
110
+ @tree.value(data) if @tree
111
+ end
112
+
113
+ # This method returns the generated rules in ruby code.
114
+ # e.g.
115
+ #
116
+ # id3.get_rules
117
+ # # => if age_range=='<30' then marketing_target='Y'
118
+ # elsif age_range=='[30-50)' and city=='Chicago' then marketing_target='Y'
119
+ # elsif age_range=='[30-50)' and city=='New York' then marketing_target='N'
120
+ # elsif age_range=='[50-80]' then marketing_target='N'
121
+ # elsif age_range=='>80' then marketing_target='Y'
122
+ # else raise 'There was not enough information during training to do a proper induction for this data element' end
123
+ #
124
+ # It is a nice way to inspect induction results, and also to execute them:
125
+ # age_range = '<30'
126
+ # marketing_target = nil
127
+ # eval id3.get_rules
128
+ # puts marketing_target
129
+ # # => 'Y'
130
+ def get_rules
131
+ #return "Empty ID3 tree" if !@tree
132
+ rules = @tree.get_rules
133
+ rules = rules.collect do |rule|
134
+ "#{rule[0..-2].join(' and ')} then #{rule.last}"
135
+ end
136
+ return "if #{rules.join("\nelsif ")}\nelse raise 'There was not enough information during training to do a proper induction for this data element' end"
137
+ end
138
+
139
+ private
140
+ def preprocess_data(data_examples)
141
+ @tree = build_node(data_examples)
142
+ end
143
+
144
+ private
145
+ def build_node(data_examples, flag_att = [])
146
+ return ErrorNode.new if data_examples.length == 0
147
+ domain = domain(data_examples)
148
+ return CategoryNode.new(@data_set.data_labels.last, domain.last[0]) if domain.last.length == 1
149
+ min_entropy_index = min_entropy_index(data_examples, domain, flag_att)
150
+ flag_att << min_entropy_index
151
+ split_data_examples = split_data_examples(data_examples, domain, min_entropy_index)
152
+ return CategoryNode.new(@data_set.data_labels.last, most_freq(data_examples, domain)) if split_data_examples.length == 1
153
+ nodes = split_data_examples.collect do |partial_data_examples|
154
+ build_node(partial_data_examples, flag_att)
155
+ end
156
+ return EvaluationNode.new(@data_set.data_labels, min_entropy_index, domain[min_entropy_index], nodes)
157
+ end
158
+
159
+ private
160
+ def self.sum(values)
161
+ values.inject( 0 ) { |sum,x| sum+x }
162
+ end
163
+
164
+ private
165
+ def self.log2(z)
166
+ return 0.0 if z == 0
167
+ Math.log(z)/LOG2
168
+ end
169
+
170
+ private
171
+ def most_freq(examples, domain)
172
+ freqs = []
173
+ domain.last.length.times { freqs << 0}
174
+ examples.each do |example|
175
+ cat_index = domain.last.index(example.last)
176
+ freq = freqs[cat_index] + 1
177
+ freqs[cat_index] = freq
178
+ end
179
+ max_freq = freqs.max
180
+ max_freq_index = freqs.index(max_freq)
181
+ domain.last[max_freq_index]
182
+ end
183
+
184
+ private
185
+ def split_data_examples(data_examples, domain, att_index)
186
+ data_examples_array = []
187
+ att_value_examples = {}
188
+ data_examples.each do |example|
189
+ example_set = att_value_examples[example[att_index]]
190
+ example_set = [] if !example_set
191
+ example_set << example
192
+ att_value_examples.store(example[att_index], example_set)
193
+ end
194
+ att_value_examples.each_pair do |att_value, example_set|
195
+ att_value_index = domain[att_index].index(att_value)
196
+ data_examples_array[att_value_index] = example_set
197
+ end
198
+ return data_examples_array
199
+ end
200
+
201
+ private
202
+ def min_entropy_index(data_examples, domain, flag_att=[])
203
+ min_entropy = nil
204
+ min_index = 0
205
+ domain[0..-2].each_index do |index|
206
+ freq_grid = freq_grid(index, data_examples, domain)
207
+ entropy = entropy(freq_grid, data_examples.length)
208
+ if (!min_entropy || entropy < min_entropy) && !flag_att.include?(index)
209
+ min_entropy = entropy
210
+ min_index = index
211
+ end
212
+ end
213
+ return min_index
214
+ end
215
+
216
+ private
217
+ def domain(data_examples)
218
+ #return build_domains(data_examples)
219
+ domain = []
220
+ @data_set.data_labels.length.times { domain << [] }
221
+ data_examples.each do |data|
222
+ data.each_index do |i|
223
+ domain[i] << data[i] if i<domain.length && !domain[i].include?(data[i])
224
+ end
225
+ end
226
+ return domain
227
+ end
228
+
229
+ private
230
+ def freq_grid(att_index, data_examples, domain)
231
+ #Initialize empty grid
232
+ grid_element = []
233
+ domain.last.length.times { grid_element << 0}
234
+ grid = []
235
+ domain[att_index].length.times { grid << grid_element.clone }
236
+ #Fill frecuency with grid
237
+ data_examples.each do |example|
238
+ att_val = example[att_index]
239
+ att_val_index = domain[att_index].index(att_val)
240
+ category = example.last
241
+ category_index = domain.last.index(category)
242
+ freq = grid[att_val_index][category_index] + 1
243
+ grid[att_val_index][category_index] = freq
244
+ end
245
+ return grid
246
+ end
247
+
248
+ private
249
+ def entropy(freq_grid, total_examples)
250
+ #Calc entropy of each element
251
+ entropy = 0
252
+ freq_grid.each do |att_freq|
253
+ att_total_freq = ID3.sum(att_freq)
254
+ partial_entropy = 0
255
+ if att_total_freq != 0
256
+ att_freq.each do |freq|
257
+ prop = freq.to_f/att_total_freq
258
+ partial_entropy += (-1*prop*ID3.log2(prop))
259
+ end
260
+ end
261
+ entropy += (att_total_freq.to_f/total_examples) * partial_entropy
262
+ end
263
+ return entropy
264
+ end
265
+
266
+ private
267
+ LOG2 = Math.log(2)
268
+ end
269
+
270
+ class EvaluationNode #:nodoc: all
271
+
272
+ attr_reader :index, :values, :nodes
273
+
274
+ def initialize(data_labels, index, values, nodes)
275
+ @index = index
276
+ @values = values
277
+ @nodes = nodes
278
+ @data_labels = data_labels
279
+ end
280
+
281
+ def value(data)
282
+ value = data[@index]
283
+ return rule_not_found if !@values.include?(value)
284
+ return nodes[@values.index(value)].value(data)
285
+ end
286
+
287
+ def get_rules
288
+ rule_set = []
289
+ @nodes.each_index do |child_node_index|
290
+ my_rule = "#{@data_labels[@index]}=='#{@values[child_node_index]}'"
291
+ child_node = @nodes[child_node_index]
292
+ child_node_rules = child_node.get_rules
293
+ child_node_rules.each do |child_rule|
294
+ child_rule.unshift(my_rule)
295
+ end
296
+ rule_set += child_node_rules
297
+ end
298
+ return rule_set
299
+ end
300
+
301
+ end
302
+
303
+ class CategoryNode #:nodoc: all
304
+ def initialize(label, value)
305
+ @label = label
306
+ @value = value
307
+ end
308
+ def value(data)
309
+ return @value
310
+ end
311
+ def get_rules
312
+ return [["#{@label}='#{@value}'"]]
313
+ end
314
+ end
315
+
316
+ class ErrorNode #:nodoc: all
317
+ def value(data)
318
+ raise "There was not enough information during training to do a proper induction for this data element."
319
+ end
320
+ def get_rules
321
+ return []
322
+ end
323
+ end
324
+
325
+ end
326
+ end