ai4ruby 1.11

Sign up to get free protection for your applications and to get access to all the features.
Files changed (79) hide show
  1. data/README.rdoc +47 -0
  2. data/examples/classifiers/id3_data.csv +121 -0
  3. data/examples/classifiers/id3_example.rb +29 -0
  4. data/examples/classifiers/naive_bayes_data.csv +11 -0
  5. data/examples/classifiers/naive_bayes_example.rb +16 -0
  6. data/examples/classifiers/results.txt +31 -0
  7. data/examples/genetic_algorithm/genetic_algorithm_example.rb +37 -0
  8. data/examples/genetic_algorithm/travel_cost.csv +16 -0
  9. data/examples/neural_network/backpropagation_example.rb +67 -0
  10. data/examples/neural_network/patterns_with_base_noise.rb +68 -0
  11. data/examples/neural_network/patterns_with_noise.rb +66 -0
  12. data/examples/neural_network/training_patterns.rb +68 -0
  13. data/examples/neural_network/xor_example.rb +35 -0
  14. data/examples/som/som_data.rb +156 -0
  15. data/examples/som/som_multi_node_example.rb +22 -0
  16. data/examples/som/som_single_example.rb +24 -0
  17. data/lib/ai4r.rb +33 -0
  18. data/lib/ai4r/classifiers/classifier.rb +62 -0
  19. data/lib/ai4r/classifiers/hyperpipes.rb +118 -0
  20. data/lib/ai4r/classifiers/ib1.rb +121 -0
  21. data/lib/ai4r/classifiers/id3.rb +326 -0
  22. data/lib/ai4r/classifiers/multilayer_perceptron.rb +135 -0
  23. data/lib/ai4r/classifiers/naive_bayes.rb +259 -0
  24. data/lib/ai4r/classifiers/one_r.rb +110 -0
  25. data/lib/ai4r/classifiers/prism.rb +197 -0
  26. data/lib/ai4r/classifiers/zero_r.rb +73 -0
  27. data/lib/ai4r/clusterers/average_linkage.rb +59 -0
  28. data/lib/ai4r/clusterers/bisecting_k_means.rb +93 -0
  29. data/lib/ai4r/clusterers/centroid_linkage.rb +66 -0
  30. data/lib/ai4r/clusterers/clusterer.rb +61 -0
  31. data/lib/ai4r/clusterers/complete_linkage.rb +67 -0
  32. data/lib/ai4r/clusterers/diana.rb +139 -0
  33. data/lib/ai4r/clusterers/k_means.rb +126 -0
  34. data/lib/ai4r/clusterers/median_linkage.rb +61 -0
  35. data/lib/ai4r/clusterers/single_linkage.rb +194 -0
  36. data/lib/ai4r/clusterers/ward_linkage.rb +64 -0
  37. data/lib/ai4r/clusterers/ward_linkage_hierarchical.rb +31 -0
  38. data/lib/ai4r/clusterers/weighted_average_linkage.rb +61 -0
  39. data/lib/ai4r/data/data_set.rb +266 -0
  40. data/lib/ai4r/data/parameterizable.rb +64 -0
  41. data/lib/ai4r/data/proximity.rb +100 -0
  42. data/lib/ai4r/data/statistics.rb +77 -0
  43. data/lib/ai4r/experiment/classifier_evaluator.rb +95 -0
  44. data/lib/ai4r/genetic_algorithm/genetic_algorithm.rb +270 -0
  45. data/lib/ai4r/neural_network/backpropagation.rb +326 -0
  46. data/lib/ai4r/neural_network/hopfield.rb +149 -0
  47. data/lib/ai4r/som/layer.rb +68 -0
  48. data/lib/ai4r/som/node.rb +96 -0
  49. data/lib/ai4r/som/som.rb +155 -0
  50. data/lib/ai4r/som/two_phase_layer.rb +90 -0
  51. data/test/classifiers/hyperpipes_test.rb +84 -0
  52. data/test/classifiers/ib1_test.rb +78 -0
  53. data/test/classifiers/id3_test.rb +208 -0
  54. data/test/classifiers/multilayer_perceptron_test.rb +79 -0
  55. data/test/classifiers/naive_bayes_test.rb +43 -0
  56. data/test/classifiers/one_r_test.rb +62 -0
  57. data/test/classifiers/prism_test.rb +85 -0
  58. data/test/classifiers/zero_r_test.rb +49 -0
  59. data/test/clusterers/average_linkage_test.rb +51 -0
  60. data/test/clusterers/bisecting_k_means_test.rb +66 -0
  61. data/test/clusterers/centroid_linkage_test.rb +53 -0
  62. data/test/clusterers/complete_linkage_test.rb +57 -0
  63. data/test/clusterers/diana_test.rb +69 -0
  64. data/test/clusterers/k_means_test.rb +100 -0
  65. data/test/clusterers/median_linkage_test.rb +53 -0
  66. data/test/clusterers/single_linkage_test.rb +122 -0
  67. data/test/clusterers/ward_linkage_hierarchical_test.rb +61 -0
  68. data/test/clusterers/ward_linkage_test.rb +53 -0
  69. data/test/clusterers/weighted_average_linkage_test.rb +53 -0
  70. data/test/data/data_set_test.rb +96 -0
  71. data/test/data/proximity_test.rb +81 -0
  72. data/test/data/statistics_test.rb +65 -0
  73. data/test/experiment/classifier_evaluator_test.rb +76 -0
  74. data/test/genetic_algorithm/chromosome_test.rb +58 -0
  75. data/test/genetic_algorithm/genetic_algorithm_test.rb +81 -0
  76. data/test/neural_network/backpropagation_test.rb +82 -0
  77. data/test/neural_network/hopfield_test.rb +72 -0
  78. data/test/som/som_test.rb +97 -0
  79. metadata +168 -0
@@ -0,0 +1,118 @@
1
+ # Author:: Sergio Fierens (Implementation only)
2
+ # License:: MPL 1.1
3
+ # Project:: ai4r
4
+ # Url:: http://ai4r.rubyforge.org/
5
+ #
6
+ # You can redistribute it and/or modify it under the terms of
7
+ # the Mozilla Public License version 1.1 as published by the
8
+ # Mozilla Foundation at http://www.mozilla.org/MPL/MPL-1.1.txt
9
+
10
+ require 'set'
11
+ require File.dirname(__FILE__) + '/../data/data_set'
12
+ require File.dirname(__FILE__) + '/../classifiers/classifier'
13
+
14
+ module Ai4r
15
+ module Classifiers
16
+
17
+ include Ai4r::Data
18
+
19
+ # = Introduction
20
+ #
21
+ # A fast classifier algorithm, created by Lucio de Souza Coelho
22
+ # and Len Trigg.
23
# A fast, simple classification algorithm created by Lucio de Souza
# Coelho and Len Trigg. For every category it builds a "pipe": one entry
# per attribute holding either numeric bounds or the set of nominal
# values observed for that category. Evaluation counts, per category,
# how many attributes of the instance fall inside that category's pipe,
# and predicts the category with the most votes.
class Hyperpipes < Classifier

  attr_reader :data_set, :pipes

  # Train the classifier with a DataSet instance. The last attribute of
  # each data item is taken as the item class.
  # Returns self.
  def build(data_set)
    data_set.check_not_empty
    @data_set = data_set
    @domains = data_set.build_domains

    @pipes = {}
    @domains.last.each { |category| @pipes[category] = build_pipe(@data_set) }
    @data_set.data_items.each { |item| update_pipe(@pipes[item.last], item) }

    self
  end

  # Predict the class of an unseen instance.
  # e.g.
  #   classifier.eval(['New York', '<30', 'F'])  # => 'Y'
  def eval(data)
    votes = Hash.new(0)
    @pipes.each do |category, pipe|
      pipe.each_with_index do |bounds, index|
        value = data[index]
        if value.is_a?(Numeric)
          votes[category] += 1 if value >= bounds[:min] && value <= bounds[:max]
        else
          votes[category] += 1 if bounds[value]
        end
      end
    end
    # The category collecting the most votes wins.
    votes.to_a.max { |a, b| a.last <=> b.last }.first
  end

  # Return the induced model as executable ruby code, e.g.
  #
  #   classifier.get_rules
  #   # => votes = Hash.new {0}
  #   #    votes['Y'] += 1 if age_range == '<30' ...
  #   #    marketing_target = votes.to_a.max {|x, y| x.last <=> y.last}.first
  #
  # It is a nice way to inspect induction results, and also to execute
  # them with Kernel#eval.
  def get_rules
    rules = []
    rules << "votes = Hash.new {0}"
    # The first data item is used only to detect each attribute's type.
    data = @data_set.data_items.first
    labels = @data_set.data_labels.collect { |label| label.to_s }
    @pipes.each do |category, pipe|
      pipe.each_with_index do |bounds, index|
        rule = "votes['#{category}'] += 1 "
        rule += if data[index].is_a?(Numeric)
          "if #{labels[index]} >= #{bounds[:min]} && #{labels[index]} <= #{bounds[:max]}"
        else
          "if #{bounds.inspect}[#{labels[index]}]"
        end
        rules << rule
      end
    end
    rules << "#{labels.last} = votes.to_a.max {|x, y| x.last <=> y.last}.first"
    rules.join("\n")
  end

  protected

  # Build one empty pipe entry per attribute (class column excluded):
  # numeric attributes start with an empty [min, max] interval, nominal
  # attributes with a membership hash defaulting to false.
  def build_pipe(data_set)
    data_set.data_items.first[0...-1].collect do |attribute|
      if attribute.is_a?(Numeric)
        { :min => 1.0/0, :max => -1.0/0 }
      else
        Hash.new(false)
      end
    end
  end

  # Widen the pipe so it covers data_item: stretch numeric bounds and
  # record nominal values as seen.
  def update_pipe(pipe, data_item)
    data_item[0...-1].each_with_index do |attribute, index|
      entry = pipe[index]
      if attribute.is_a?(Numeric)
        entry[:min] = attribute if attribute < entry[:min]
        entry[:max] = attribute if attribute > entry[:max]
      else
        entry[attribute] = true
      end
    end
  end

end
117
+ end
118
+ end
@@ -0,0 +1,121 @@
1
+ # Author:: Sergio Fierens (Implementation only)
2
+ # License:: MPL 1.1
3
+ # Project:: ai4r
4
+ # Url:: http://ai4r.rubyforge.org/
5
+ #
6
+ # You can redistribute it and/or modify it under the terms of
7
+ # the Mozilla Public License version 1.1 as published by the
8
+ # Mozilla Foundation at http://www.mozilla.org/MPL/MPL-1.1.txt
9
+
10
+ require 'set'
11
+ require File.dirname(__FILE__) + '/../data/data_set'
12
+ require File.dirname(__FILE__) + '/../classifiers/classifier'
13
+
14
+ module Ai4r
15
+ module Classifiers
16
+
17
+ # = Introduction
18
+ #
19
+ # IB1 algorithm implementation.
20
+ # IB1 is the simplest instance-based learning (IBL) algorithm.
21
+ #
22
+ # D. Aha, D. Kibler (1991). Instance-based learning algorithms.
23
+ # Machine Learning. 6:37-66.
24
+ #
25
+ # IBI is identical to the nearest neighbor algorithm except that
26
+ # it normalizes its attributes' ranges, processes instances
27
+ # incrementally, and has a simple policy for tolerating missing values
28
# = Introduction
#
# IB1 algorithm implementation.
# IB1 is the simplest instance-based learning (IBL) algorithm.
#
# D. Aha, D. Kibler (1991). Instance-based learning algorithms.
# Machine Learning. 6:37-66.
#
# IB1 is identical to the nearest neighbor algorithm except that
# it normalizes its attributes' ranges, processes instances
# incrementally, and has a simple policy for tolerating missing values
class IB1 < Classifier

  attr_reader :data_set

  # Build a new IB1 classifier. You must provide a DataSet instance
  # as parameter. The last attribute of each item is considered as
  # the item class.
  def build(data_set)
    data_set.check_not_empty
    @data_set = data_set
    # Per-attribute observed minima/maxima, used to normalize numeric
    # attribute values to [0, 1] when computing distances.
    @min_values = Array.new(data_set.data_labels.length)
    @max_values = Array.new(data_set.data_labels.length)
    data_set.data_items.each { |data_item| update_min_max(data_item[0...-1]) }
    return self
  end

  # You can evaluate new data, predicting its class.
  # e.g.
  #   classifier.eval(['New York', '<30', 'F'])  # => 'Y'
  #
  # Note: evaluation is incremental by design — the evaluated instance
  # updates the stored attribute ranges used for normalization.
  def eval(data)
    update_min_max(data)
    min_distance = 1.0/0
    klass = nil
    # Nearest-neighbor scan: keep the class of the closest training item.
    @data_set.data_items.each do |train_item|
      d = distance(data, train_item)
      if d < min_distance
        min_distance = d
        klass = train_item.last
      end
    end
    return klass
  end

  protected

  # We keep in the state the min and max value of each attribute,
  # to provide normalized distances between two values of a numeric
  # attribute. Missing (nil) and non-numeric values are ignored.
  def update_min_max(atts)
    atts.each_with_index do |att, i|
      if att && att.is_a?(Numeric)
        @min_values[i] = att if @min_values[i].nil? || @min_values[i] > att
        @max_values[i] = att if @max_values[i].nil? || @max_values[i] < att
      end
    end
  end

  # Normalized distance between 2 instances.
  #
  # Returns the sum of squared per-attribute differences, where each
  # difference is:
  # * difference between normalized numeric att values
  # * 1 for nominal atts which differ, or when exactly one att is missing
  # * 1 if both atts are missing
  # * max(normalized value, 1.0 - normalized value) when a numeric att
  #   is compared against a missing one
  def distance(a, b)
    d = 0
    a.each_with_index do |att_a, i|
      att_b = b[i]
      if att_a.nil?
        if att_b.is_a? Numeric
          diff = norm(att_b, i)
          diff = 1.0 - diff if diff < 0.5
        else
          diff = 1
        end
      elsif att_a.is_a? Numeric
        if att_b.is_a? Numeric
          diff = norm(att_a, i) - norm(att_b, i)
        else
          diff = norm(att_a, i)
          diff = 1.0 - diff if diff < 0.5
        end
      elsif att_a != att_b
        diff = 1
      else
        diff = 0
      end
      d += diff * diff
    end
    return d
  end

  # Returns the value of att normalized to [0, 1] using the observed
  # range of the attribute at the given index.
  #
  # Returns 0 when no numeric value has been seen for this attribute,
  # or when all seen values are equal (zero range). The original code
  # divided by zero in the degenerate case, yielding NaN distances that
  # silently broke nearest-neighbor comparisons.
  def norm(att, index)
    return 0 if @min_values[index].nil?
    range = @max_values[index] - @min_values[index]
    return 0 if range == 0
    return (att - @min_values[index]).to_f / range
  end

end
120
+ end
121
+ end
@@ -0,0 +1,326 @@
1
+ # Author:: Sergio Fierens (Implementation, Quinlan is
2
+ # the creator of the algorithm)
3
+ # License:: MPL 1.1
4
+ # Project:: ai4r
5
+ # Url:: http://ai4r.rubyforge.org/
6
+ #
7
+ # You can redistribute it and/or modify it under the terms of
8
+ # the Mozilla Public License version 1.1 as published by the
9
+ # Mozilla Foundation at http://www.mozilla.org/MPL/MPL-1.1.txt
10
+
11
+ require File.dirname(__FILE__) + '/../data/data_set'
12
+ require File.dirname(__FILE__) + '/../classifiers/classifier'
13
+
14
+ module Ai4r
15
+
16
+ module Classifiers
17
+
18
+ # = Introduction
19
+ # This is an implementation of the ID3 algorithm (Quinlan)
20
+ # Given a set of preclassified examples, it builds a top-down
21
+ # induction of decision tree, biased by the information gain and
22
+ # entropy measure.
23
+ #
24
+ # * http://en.wikipedia.org/wiki/Decision_tree
25
+ # * http://en.wikipedia.org/wiki/ID3_algorithm
26
+ #
27
+ # = How to use it
28
+ #
29
+ # DATA_LABELS = [ 'city', 'age_range', 'gender', 'marketing_target' ]
30
+ #
31
+ # DATA_ITEMS = [
32
+ # ['New York', '<30', 'M', 'Y'],
33
+ # ['Chicago', '<30', 'M', 'Y'],
34
+ # ['Chicago', '<30', 'F', 'Y'],
35
+ # ['New York', '<30', 'M', 'Y'],
36
+ # ['New York', '<30', 'M', 'Y'],
37
+ # ['Chicago', '[30-50)', 'M', 'Y'],
38
+ # ['New York', '[30-50)', 'F', 'N'],
39
+ # ['Chicago', '[30-50)', 'F', 'Y'],
40
+ # ['New York', '[30-50)', 'F', 'N'],
41
+ # ['Chicago', '[50-80]', 'M', 'N'],
42
+ # ['New York', '[50-80]', 'F', 'N'],
43
+ # ['New York', '[50-80]', 'M', 'N'],
44
+ # ['Chicago', '[50-80]', 'M', 'N'],
45
+ # ['New York', '[50-80]', 'F', 'N'],
46
+ # ['Chicago', '>80', 'F', 'Y']
47
+ # ]
48
+ #
49
+ # data_set = DataSet.new(:data_items=>DATA_SET, :data_labels=>DATA_LABELS)
50
+ # id3 = Ai4r::Classifiers::ID3.new.build(data_set)
51
+ #
52
+ # id3.get_rules
53
+ # # => if age_range=='<30' then marketing_target='Y'
54
+ # elsif age_range=='[30-50)' and city=='Chicago' then marketing_target='Y'
55
+ # elsif age_range=='[30-50)' and city=='New York' then marketing_target='N'
56
+ # elsif age_range=='[50-80]' then marketing_target='N'
57
+ # elsif age_range=='>80' then marketing_target='Y'
58
+ # else raise 'There was not enough information during training to do a proper induction for this data element' end
59
+ #
60
+ # id3.eval(['New York', '<30', 'M'])
61
+ # # => 'Y'
62
+ #
63
+ # = A better way to load the data
64
+ #
65
+ # In the real life you will use lot more data training examples, with more
66
+ # attributes. Consider moving your data to an external CSV (comma separate
67
+ # values) file.
68
+ #
69
+ # data_file = "#{File.dirname(__FILE__)}/data_set.csv"
70
+ # data_set = DataSet.load_csv_with_labels data_file
71
+ # id3 = Ai4r::Classifiers::ID3.new.build(data_set)
72
+ #
73
+ # = A nice tip for data evaluation
74
+ #
75
+ # id3 = Ai4r::Classifiers::ID3.new.build(data_set)
76
+ #
77
+ # age_range = '<30'
78
+ # marketing_target = nil
79
+ # eval id3.get_rules
80
+ # puts marketing_target
81
+ # # => 'Y'
82
+ #
83
+ # = More about ID3 and decision trees
84
+ #
85
+ # * http://en.wikipedia.org/wiki/Decision_tree
86
+ # * http://en.wikipedia.org/wiki/ID3_algorithm
87
+ #
88
+ # = About the project
89
+ # Author:: Sergio Fierens
90
+ # License:: MPL 1.1
91
+ # Url:: http://ai4r.rubyforge.org/
92
# = Introduction
# Implementation of Quinlan's ID3 algorithm. Given a set of
# preclassified examples, it builds a top-down decision tree, choosing
# at each node the attribute with the lowest entropy (i.e. the highest
# information gain).
#
# * http://en.wikipedia.org/wiki/Decision_tree
# * http://en.wikipedia.org/wiki/ID3_algorithm
#
# = How to use it
#
#   data_set = DataSet.new(:data_items => DATA_ITEMS, :data_labels => DATA_LABELS)
#   id3 = Ai4r::Classifiers::ID3.new.build(data_set)
#   id3.eval(['New York', '<30', 'M'])  # => 'Y'
#   id3.get_rules  # => induced rules as executable ruby code
#
# For larger training sets, consider loading the data from a CSV file:
#
#   data_set = DataSet.load_csv_with_labels "#{File.dirname(__FILE__)}/data_set.csv"
#   id3 = Ai4r::Classifiers::ID3.new.build(data_set)
#
# = About the project
# Author:: Sergio Fierens
# License:: MPL 1.1
# Url:: http://ai4r.rubyforge.org/
class ID3 < Classifier

  attr_reader :data_set

  # log(2), cached for the repeated base-2 logarithms in entropy.
  LOG2 = Math.log(2)

  # Create a new ID3 classifier. You must provide a DataSet instance
  # as parameter. The last attribute of each item is considered as the
  # item class. Returns self.
  def build(data_set)
    data_set.check_not_empty
    @data_set = data_set
    preprocess_data(@data_set.data_items)
    return self
  end

  # You can evaluate new data, predicting its category.
  # e.g.
  #   id3.eval(['New York', '<30', 'F'])  # => 'Y'
  def eval(data)
    @tree.value(data) if @tree
  end

  # This method returns the generated rules in ruby code, e.g.
  #
  #   id3.get_rules
  #   # => if age_range=='<30' then marketing_target='Y'
  #   #    elsif ... else raise '...' end
  #
  # It is a nice way to inspect induction results, and also to execute
  # them with Kernel#eval.
  def get_rules
    rules = @tree.get_rules.collect do |rule|
      "#{rule[0..-2].join(' and ')} then #{rule.last}"
    end
    return "if #{rules.join("\nelsif ")}\nelse raise 'There was not enough information during training to do a proper induction for this data element' end"
  end

  private

  # Entry point of the induction: builds the whole decision tree.
  def preprocess_data(data_examples)
    @tree = build_node(data_examples)
  end

  # Recursively build a tree node for the given examples. flag_att
  # accumulates the indexes of the attributes already used for
  # splitting (note: the array is shared and mutated across sibling
  # branches, exactly as in the original implementation).
  def build_node(data_examples, flag_att = [])
    return ErrorNode.new if data_examples.length == 0
    domain = domain(data_examples)
    # All remaining examples share a single category: emit a leaf.
    return CategoryNode.new(@data_set.data_labels.last, domain.last[0]) if domain.last.length == 1
    min_entropy_index = min_entropy_index(data_examples, domain, flag_att)
    flag_att << min_entropy_index
    partitions = split_data_examples(data_examples, domain, min_entropy_index)
    # Splitting separated nothing: fall back to the most frequent category.
    return CategoryNode.new(@data_set.data_labels.last, most_freq(data_examples, domain)) if partitions.length == 1
    children = partitions.collect { |subset| build_node(subset, flag_att) }
    return EvaluationNode.new(@data_set.data_labels, min_entropy_index, domain[min_entropy_index], children)
  end

  # Sum of an array of numbers.
  def self.sum(values)
    values.inject(0) { |acc, v| acc + v }
  end

  # Base-2 logarithm, with log2(0) defined as 0 for entropy purposes.
  def self.log2(z)
    return 0.0 if z == 0
    Math.log(z) / LOG2
  end

  # The category appearing most often among the examples (ties resolved
  # in favor of the value listed first in the category domain).
  def most_freq(examples, domain)
    counters = Array.new(domain.last.length, 0)
    examples.each do |example|
      counters[domain.last.index(example.last)] += 1
    end
    domain.last[counters.index(counters.max)]
  end

  # Partition examples by their value of the attribute at att_index.
  # The result array is aligned with domain[att_index].
  def split_data_examples(data_examples, domain, att_index)
    grouped = {}
    data_examples.each do |example|
      (grouped[example[att_index]] ||= []) << example
    end
    partitions = []
    grouped.each_pair do |att_value, subset|
      partitions[domain[att_index].index(att_value)] = subset
    end
    return partitions
  end

  # Index of the attribute with minimum entropy, skipping attributes
  # already used on this path (flag_att).
  def min_entropy_index(data_examples, domain, flag_att = [])
    min_entropy = nil
    min_index = 0
    domain[0..-2].each_index do |index|
      current = entropy(freq_grid(index, data_examples, domain), data_examples.length)
      if (!min_entropy || current < min_entropy) && !flag_att.include?(index)
        min_entropy = current
        min_index = index
      end
    end
    return min_index
  end

  # Per-attribute list of the distinct values seen in data_examples;
  # the last entry is the category domain.
  def domain(data_examples)
    domain = Array.new(@data_set.data_labels.length) { [] }
    data_examples.each do |data|
      data.each_index do |i|
        domain[i] << data[i] if i < domain.length && !domain[i].include?(data[i])
      end
    end
    return domain
  end

  # Frequency grid for the attribute at att_index: rows follow
  # domain[att_index], columns follow the category domain; each cell
  # counts the examples with that (attribute value, category) pair.
  def freq_grid(att_index, data_examples, domain)
    grid = Array.new(domain[att_index].length) { Array.new(domain.last.length, 0) }
    data_examples.each do |example|
      row = domain[att_index].index(example[att_index])
      col = domain.last.index(example.last)
      grid[row][col] += 1
    end
    return grid
  end

  # Weighted entropy of the split described by freq_grid.
  def entropy(freq_grid, total_examples)
    total = 0
    freq_grid.each do |att_freq|
      att_total = ID3.sum(att_freq)
      partial = 0
      if att_total != 0
        att_freq.each do |freq|
          prop = freq.to_f / att_total
          partial += (-1 * prop * ID3.log2(prop))
        end
      end
      total += (att_total.to_f / total_examples) * partial
    end
    return total
  end

end
269
+
270
# Internal decision-tree node that tests one attribute and delegates
# evaluation to the child node matching the attribute's value.
class EvaluationNode #:nodoc: all

  attr_reader :index, :values, :nodes

  # data_labels:: attribute names (used when printing rules)
  # index:: index of the attribute this node tests
  # values:: attribute values, aligned with nodes
  # nodes:: child nodes, one per value
  def initialize(data_labels, index, values, nodes)
    @index = index
    @values = values
    @nodes = nodes
    @data_labels = data_labels
  end

  # Evaluate data by following the child that matches data[@index].
  def value(data)
    value = data[@index]
    # Fix: the original called the undefined method rule_not_found here,
    # so an attribute value unseen during training raised NameError.
    # Raise the same descriptive error used by ErrorNode instead.
    unless @values.include?(value)
      raise "There was not enough information during training to do a proper induction for this data element."
    end
    return nodes[@values.index(value)].value(data)
  end

  # Collect the rules of all children, prefixing each with this node's
  # attribute test.
  def get_rules
    rule_set = []
    @nodes.each_index do |child_node_index|
      my_rule = "#{@data_labels[@index]}=='#{@values[child_node_index]}'"
      child_node = @nodes[child_node_index]
      child_node_rules = child_node.get_rules
      child_node_rules.each do |child_rule|
        child_rule.unshift(my_rule)
      end
      rule_set += child_node_rules
    end
    return rule_set
  end

end
302
+
303
# Leaf node of the decision tree: always yields a fixed category.
class CategoryNode #:nodoc: all

  # label:: name of the class attribute
  # value:: the category this leaf predicts
  def initialize(label, value)
    @label = label
    @value = value
  end

  # The prediction is constant, whatever the input data.
  def value(data)
    @value
  end

  # A leaf contributes a single one-condition rule.
  def get_rules
    [["#{@label}='#{@value}'"]]
  end

end
315
+
316
# Tree node reached when the training data gave no information for a
# branch: evaluation fails, and no rules are produced.
class ErrorNode #:nodoc: all

  # Always raises: there is no induced knowledge for this branch.
  def value(data)
    raise "There was not enough information during training to do a proper induction for this data element."
  end

  # An error branch contributes no rules.
  def get_rules
    []
  end

end
324
+
325
+ end
326
+ end