ai4r 1.1 → 1.2
Sign up to get free protection for your applications and to get access to all the features.
- data/README.rdoc +21 -20
- data/examples/decision_trees/id3_example.rb +3 -2
- data/examples/genetic_algorithm/genetic_algorithm_example.rb +6 -6
- data/examples/neural_network/backpropagation_example.rb +2 -2
- data/lib/ai4r/classifiers/classifier_helper.rb +54 -0
- data/lib/ai4r/classifiers/id3.rb +356 -0
- data/lib/ai4r/classifiers/one_r.rb +148 -0
- data/lib/ai4r/classifiers/prism.rb +231 -0
- data/lib/ai4r/classifiers/zero_r.rb +104 -0
- data/lib/ai4r/genetic_algorithm/genetic_algorithm.rb +272 -0
- data/lib/ai4r/neural_network/backpropagation.rb +271 -0
- data/site/build/tmp/locationmap.xml +14 -14
- data/site/build/tmp/output.xmap +23 -23
- data/site/build/tmp/pluginlist2fetchbuild.xml +144 -144
- data/site/build/tmp/plugins-1.xml +0 -11
- data/site/build/tmp/plugins-2.xml +54 -0
- data/site/build/tmp/projfilters.properties +41 -41
- data/site/build/webapp/WEB-INF/logs/core.log +681 -788
- data/site/build/webapp/WEB-INF/logs/error.log +281 -248
- data/site/build/webapp/WEB-INF/logs/sitemap.log +1015 -0
- data/site/src/documentation/content/xdocs/forum.html +9 -0
- data/site/src/documentation/content/xdocs/geneticAlgorithms.xml +82 -68
- data/site/src/documentation/content/xdocs/index.xml +47 -18
- data/site/src/documentation/content/xdocs/machineLearning.xml +10 -9
- data/site/src/documentation/content/xdocs/neuralNetworks.xml +60 -36
- data/site/src/documentation/content/xdocs/site.xml +8 -5
- data/site/src/documentation/content/xdocs/svn.xml +11 -1
- data/site/src/documentation/resources/images/Thumbs.db +0 -0
- data/site/src/documentation/resources/images/ai4r-logo.png +0 -0
- data/site/src/documentation/resources/images/genetic_algorithms_example.png +0 -0
- data/site/src/documentation/resources/images/jadeferret.png +0 -0
- data/site/src/documentation/resources/images/neural_network_example.png +0 -0
- data/site/src/documentation/resources/images/sub-dir/Thumbs.db +0 -0
- data/site/src/documentation/skinconf.xml +18 -18
- data/test/classifiers/id3_test.rb +206 -0
- data/test/classifiers/one_r_test.rb +62 -0
- data/test/classifiers/prism_test.rb +83 -0
- data/test/classifiers/zero_r_test.rb +48 -0
- data/test/genetic_algorithm/chromosome_test.rb +41 -38
- data/test/genetic_algorithm/genetic_algorithm_test.rb +64 -61
- data/test/neural_network/backpropagation_test.rb +20 -18
- metadata +109 -199
- data/lib/decision_tree/id3.rb +0 -354
- data/lib/genetic_algorithm/genetic_algorithm.rb +0 -268
- data/lib/neural_network/backpropagation.rb +0 -264
- data/site/build/site/en/broken-links.xml +0 -2
- data/site/build/site/en/downloads.html +0 -187
- data/site/build/site/en/downloads.pdf +0 -151
- data/site/build/site/en/geneticAlgorithms.html +0 -564
- data/site/build/site/en/geneticAlgorithms.pdf +0 -911
- data/site/build/site/en/images/ai4r-logo.png +0 -0
- data/site/build/site/en/images/built-with-forrest-button.png +0 -0
- data/site/build/site/en/images/c.png +0 -0
- data/site/build/site/en/images/c_wbn.png +0 -0
- data/site/build/site/en/images/c_wn.png +0 -0
- data/site/build/site/en/images/ero.gif +0 -0
- data/site/build/site/en/images/europe2.png +0 -0
- data/site/build/site/en/images/europe3.png +0 -0
- data/site/build/site/en/images/fitness.png +0 -0
- data/site/build/site/en/images/instruction_arrow.png +0 -0
- data/site/build/site/en/images/my_email.png +0 -0
- data/site/build/site/en/images/rubyforge.png +0 -0
- data/site/build/site/en/images/s.png +0 -0
- data/site/build/site/en/images/s_wbn.png +0 -0
- data/site/build/site/en/images/s_wn.png +0 -0
- data/site/build/site/en/images/sigmoid.png +0 -0
- data/site/build/site/en/images/t.png +0 -0
- data/site/build/site/en/images/t_wbn.png +0 -0
- data/site/build/site/en/images/t_wn.png +0 -0
- data/site/build/site/en/index.html +0 -258
- data/site/build/site/en/index.pdf +0 -306
- data/site/build/site/en/linkmap.html +0 -231
- data/site/build/site/en/linkmap.pdf +0 -94
- data/site/build/site/en/locationmap.xml +0 -72
- data/site/build/site/en/machineLearning.html +0 -325
- data/site/build/site/en/machineLearning.pdf +0 -337
- data/site/build/site/en/neuralNetworks.html +0 -446
- data/site/build/site/en/neuralNetworks.pdf +0 -604
- data/site/build/site/en/skin/CommonMessages_de.xml +0 -23
- data/site/build/site/en/skin/CommonMessages_en_US.xml +0 -23
- data/site/build/site/en/skin/CommonMessages_es.xml +0 -23
- data/site/build/site/en/skin/CommonMessages_fr.xml +0 -23
- data/site/build/site/en/skin/basic.css +0 -166
- data/site/build/site/en/skin/breadcrumbs-optimized.js +0 -90
- data/site/build/site/en/skin/breadcrumbs.js +0 -237
- data/site/build/site/en/skin/fontsize.js +0 -166
- data/site/build/site/en/skin/getBlank.js +0 -40
- data/site/build/site/en/skin/getMenu.js +0 -45
- data/site/build/site/en/skin/images/README.txt +0 -1
- data/site/build/site/en/skin/images/add.jpg +0 -0
- data/site/build/site/en/skin/images/built-with-forrest-button.png +0 -0
- data/site/build/site/en/skin/images/chapter.gif +0 -0
- data/site/build/site/en/skin/images/chapter_open.gif +0 -0
- data/site/build/site/en/skin/images/current.gif +0 -0
- data/site/build/site/en/skin/images/error.png +0 -0
- data/site/build/site/en/skin/images/external-link.gif +0 -0
- data/site/build/site/en/skin/images/fix.jpg +0 -0
- data/site/build/site/en/skin/images/forrest-credit-logo.png +0 -0
- data/site/build/site/en/skin/images/hack.jpg +0 -0
- data/site/build/site/en/skin/images/header_white_line.gif +0 -0
- data/site/build/site/en/skin/images/info.png +0 -0
- data/site/build/site/en/skin/images/instruction_arrow.png +0 -0
- data/site/build/site/en/skin/images/label.gif +0 -0
- data/site/build/site/en/skin/images/page.gif +0 -0
- data/site/build/site/en/skin/images/pdfdoc.gif +0 -0
- data/site/build/site/en/skin/images/poddoc.png +0 -0
- data/site/build/site/en/skin/images/printer.gif +0 -0
- data/site/build/site/en/skin/images/rc-b-l-15-1body-2menu-3menu.png +0 -0
- data/site/build/site/en/skin/images/rc-b-r-15-1body-2menu-3menu.png +0 -0
- data/site/build/site/en/skin/images/rc-b-r-5-1header-2tab-selected-3tab-selected.png +0 -0
- data/site/build/site/en/skin/images/rc-t-l-5-1header-2searchbox-3searchbox.png +0 -0
- data/site/build/site/en/skin/images/rc-t-l-5-1header-2tab-selected-3tab-selected.png +0 -0
- data/site/build/site/en/skin/images/rc-t-l-5-1header-2tab-unselected-3tab-unselected.png +0 -0
- data/site/build/site/en/skin/images/rc-t-r-15-1body-2menu-3menu.png +0 -0
- data/site/build/site/en/skin/images/rc-t-r-5-1header-2searchbox-3searchbox.png +0 -0
- data/site/build/site/en/skin/images/rc-t-r-5-1header-2tab-selected-3tab-selected.png +0 -0
- data/site/build/site/en/skin/images/rc-t-r-5-1header-2tab-unselected-3tab-unselected.png +0 -0
- data/site/build/site/en/skin/images/remove.jpg +0 -0
- data/site/build/site/en/skin/images/rss.png +0 -0
- data/site/build/site/en/skin/images/spacer.gif +0 -0
- data/site/build/site/en/skin/images/success.png +0 -0
- data/site/build/site/en/skin/images/txtdoc.png +0 -0
- data/site/build/site/en/skin/images/update.jpg +0 -0
- data/site/build/site/en/skin/images/valid-html401.png +0 -0
- data/site/build/site/en/skin/images/vcss.png +0 -0
- data/site/build/site/en/skin/images/warning.png +0 -0
- data/site/build/site/en/skin/images/xmldoc.gif +0 -0
- data/site/build/site/en/skin/menu.js +0 -48
- data/site/build/site/en/skin/note.txt +0 -50
- data/site/build/site/en/skin/print.css +0 -54
- data/site/build/site/en/skin/profile.css +0 -163
- data/site/build/site/en/skin/prototype.js +0 -1257
- data/site/build/site/en/skin/screen.css +0 -587
- data/site/build/site/en/svn.html +0 -223
- data/site/build/site/en/svn.pdf +0 -239
- data/site/build/site/en/wholesite.pdf +0 -1686
- data/site/build/tmp/brokenlinks.xml +0 -2
- data/site/build/tmp/cocoon-work/cache-dir/cocoon-ehcache-1.data +0 -0
- data/site/build/tmp/cocoon-work/cache-dir/cocoon-ehcache-1.index +0 -0
- data/test/decision_tree/id3_test.rb +0 -209
data/lib/decision_tree/id3.rb
DELETED
@@ -1,354 +0,0 @@
|
|
1
|
-
|
2
|
-
# Decision tree learning, used in data mining and machine learning,
|
3
|
-
# uses a decision tree as a predictive model which maps observations about
|
4
|
-
# an item to conclusions about the item's target value.
|
5
|
-
#
|
6
|
-
# In this module you will find an implementation of the ID3 algorithm (Quinlan)
|
7
|
-
#
|
8
|
-
# * http://en.wikipedia.org/wiki/Decision_tree
|
9
|
-
# * http://en.wikipedia.org/wiki/ID3_algorithm
|
10
|
-
#
|
11
|
-
# Author:: Sergio Fierens
|
12
|
-
# License:: MPL 1.1
|
13
|
-
# Project:: ai4r
|
14
|
-
# Url:: http://ai4r.rubyforge.org/
|
15
|
-
#
|
16
|
-
# You can redistribute it and/or modify it under the terms of
|
17
|
-
# the Mozilla Public License version 1.1 as published by the
|
18
|
-
# Mozilla Foundation at http://www.mozilla.org/MPL/MPL-1.1.txt
|
19
|
-
|
20
|
-
module DecisionTree
|
21
|
-
|
22
|
-
# = Introduction
|
23
|
-
# This is an implementation of the ID3 algorithm (Quinlan)
|
24
|
-
# Given a set of preclassified examples, it builds a top-down
|
25
|
-
# induction of decision tree, biased by the information gain and
|
26
|
-
# entropy measure.
|
27
|
-
#
|
28
|
-
# = How to use it
|
29
|
-
#
|
30
|
-
# DATA_LABELS = [ 'city', 'age_range', 'gender', 'marketing_target' ]
|
31
|
-
#
|
32
|
-
# DATA_SET = [ ['New York', '<30', 'M', 'Y'],
|
33
|
-
# ['Chicago', '<30', 'M', 'Y'],
|
34
|
-
# ['Chicago', '<30', 'F', 'Y'],
|
35
|
-
# ['New York', '<30', 'M', 'Y'],
|
36
|
-
# ['New York', '<30', 'M', 'Y'],
|
37
|
-
# ['Chicago', '[30-50)', 'M', 'Y'],
|
38
|
-
# ['New York', '[30-50)', 'F', 'N'],
|
39
|
-
# ['Chicago', '[30-50)', 'F', 'Y'],
|
40
|
-
# ['New York', '[30-50)', 'F', 'N'],
|
41
|
-
# ['Chicago', '[50-80]', 'M', 'N'],
|
42
|
-
# ['New York', '[50-80]', 'F', 'N'],
|
43
|
-
# ['New York', '[50-80]', 'M', 'N'],
|
44
|
-
# ['Chicago', '[50-80]', 'M', 'N'],
|
45
|
-
# ['New York', '[50-80]', 'F', 'N'],
|
46
|
-
# ['Chicago', '>80', 'F', 'Y']
|
47
|
-
# ]
|
48
|
-
#
|
49
|
-
# id3 = DecisionTree::ID3.new(DATA_SET, DATA_LABELS)
|
50
|
-
#
|
51
|
-
# id3.to_s
|
52
|
-
# # => if age_range=='<30' then marketing_target='Y'
|
53
|
-
# elsif age_range=='[30-50)' and city=='Chicago' then marketing_target='Y'
|
54
|
-
# elsif age_range=='[30-50)' and city=='New York' then marketing_target='N'
|
55
|
-
# elsif age_range=='[50-80]' then marketing_target='N'
|
56
|
-
# elsif age_range=='>80' then marketing_target='Y'
|
57
|
-
# else raise 'There was not enough information during training to do a proper induction for this data element' end
|
58
|
-
#
|
59
|
-
# id3.eval(['New York', '<30', 'M'])
|
60
|
-
# # => 'Y'
|
61
|
-
#
|
62
|
-
# = A better way to load the data
|
63
|
-
#
|
64
|
-
# In real life you will use a lot more training examples, with more
|
65
|
-
# attributes. Consider moving your data to an external CSV (comma separated
|
66
|
-
# values) file.
|
67
|
-
#
|
68
|
-
# data_set = []
|
69
|
-
# CSV::Reader.parse(File.open("#{File.dirname(__FILE__)}/data_set.csv", 'r')) do |row|
|
70
|
-
# data_set << row
|
71
|
-
# end
|
72
|
-
# data_labels = data_set.shift
|
73
|
-
#
|
74
|
-
# id3 = DecisionTree::ID3.new(data_set, data_labels)
|
75
|
-
#
|
76
|
-
# = A nice tip for data evaluation
|
77
|
-
#
|
78
|
-
# id3 = DecisionTree::ID3.new(DATA_SET, DATA_LABELS)
|
79
|
-
# age_range = '<30'
|
80
|
-
# marketing_target = nil
|
81
|
-
# eval id3.to_s
|
82
|
-
# puts marketing_target
|
83
|
-
# # => 'Y'
|
84
|
-
# = More about ID3 and decision trees
|
85
|
-
#
|
86
|
-
# * http://en.wikipedia.org/wiki/Decision_tree
|
87
|
-
# * http://en.wikipedia.org/wiki/ID3_algorithm
|
88
|
-
#
|
89
|
-
# = About the project
|
90
|
-
# Author:: Sergio Fierens
|
91
|
-
# License:: MPL 1.1
|
92
|
-
|
93
|
-
class ID3
|
94
|
-
attr_reader :data_labels
|
95
|
-
# Create a new decision tree. If your data is classified with N attributes
|
96
|
-
# and M examples, then your data examples must have the following format:
|
97
|
-
#
|
98
|
-
# [ [ATT1_VAL1, ATT2_VAL1, ATT3_VAL1, ... , ATTN_VAL1, CATEGORY_VAL1],
|
99
|
-
# [ATT1_VAL2, ATT2_VAL2, ATT3_VAL2, ... , ATTN_VAL2, CATEGORY_VAL2],
|
100
|
-
# ...
|
101
|
-
# [ATT1_VALM, ATT2_VALM, ATT3_VALM, ... , ATTN_VALM, CATEGORY_VALM],
|
102
|
-
# ]
|
103
|
-
#
|
104
|
-
# e.g.
|
105
|
-
# [ ['New York', '<30', 'M', 'Y'],
|
106
|
-
# ['Chicago', '<30', 'M', 'Y'],
|
107
|
-
# ['Chicago', '<30', 'F', 'Y'],
|
108
|
-
# ['New York', '<30', 'M', 'Y'],
|
109
|
-
# ['New York', '<30', 'M', 'Y'],
|
110
|
-
# ['Chicago', '[30-50)', 'M', 'Y'],
|
111
|
-
# ['New York', '[30-50)', 'F', 'N'],
|
112
|
-
# ['Chicago', '[30-50)', 'F', 'Y'],
|
113
|
-
# ['New York', '[30-50)', 'F', 'N'],
|
114
|
-
# ['Chicago', '[50-80]', 'M', 'N'],
|
115
|
-
# ['New York', '[50-80]', 'F', 'N'],
|
116
|
-
# ['New York', '[50-80]', 'M', 'N'],
|
117
|
-
# ['Chicago', '[50-80]', 'M', 'N'],
|
118
|
-
# ['New York', '[50-80]', 'F', 'N'],
|
119
|
-
# ['Chicago', '>80', 'F', 'Y']
|
120
|
-
# ]
|
121
|
-
#
|
122
|
-
# Data labels must have the following format:
|
123
|
-
# [ 'city', 'age_range', 'gender', 'marketing_target' ]
|
124
|
-
#
|
125
|
-
# If you do not provide labels for your data, the following labels will
|
126
|
-
# be created by default:
|
127
|
-
# [ 'ATTRIBUTE_1', 'ATTRIBUTE_2', 'ATTRIBUTE_3', 'CATEGORY' ]
|
128
|
-
#
|
129
|
-
def initialize(data_examples, data_labels=nil)
|
130
|
-
raise "Examples data set must not be empty." if !data_examples || data_examples.empty?
|
131
|
-
if !data_labels
|
132
|
-
data_labels = []
|
133
|
-
data_examples[0][0..-2].each_index do |i|
|
134
|
-
data_labels[i] = "ATTRIBUTE_#{i+1}"
|
135
|
-
end
|
136
|
-
data_labels[data_labels.length]="CATEGORY"
|
137
|
-
end
|
138
|
-
@data_labels = data_labels
|
139
|
-
preprocess_data(data_examples)
|
140
|
-
end
|
141
|
-
|
142
|
-
# You can evaluate new data, predicting its category.
|
143
|
-
# e.g.
|
144
|
-
# id3.eval(['New York', '<30', 'F']) # => 'Y'
|
145
|
-
def eval(data)
|
146
|
-
@tree.value(data)
|
147
|
-
end
|
148
|
-
|
149
|
-
# This method returns the generated rules in ruby code.
|
150
|
-
# e.g.
|
151
|
-
#
|
152
|
-
# id3.to_s
|
153
|
-
# # => if age_range=='<30' then marketing_target='Y'
|
154
|
-
# elsif age_range=='[30-50)' and city=='Chicago' then marketing_target='Y'
|
155
|
-
# elsif age_range=='[30-50)' and city=='New York' then marketing_target='N'
|
156
|
-
# elsif age_range=='[50-80]' then marketing_target='N'
|
157
|
-
# elsif age_range=='>80' then marketing_target='Y'
|
158
|
-
# else raise 'There was not enough information during training to do a proper induction for this data element' end
|
159
|
-
#
|
160
|
-
# It is a nice way to inspect induction results, and also to execute them:
|
161
|
-
# age_range = '<30'
|
162
|
-
# marketing_target = nil
|
163
|
-
# eval id3.to_s
|
164
|
-
# puts marketing_target
|
165
|
-
# # => 'Y'
|
166
|
-
def to_s
|
167
|
-
rules = @tree.get_rules
|
168
|
-
rules = rules.collect do |rule|
|
169
|
-
"#{rule[0..-2].join(' and ')} then #{rule.last}"
|
170
|
-
end
|
171
|
-
return "if #{rules.join("\nelsif ")}\nelse raise 'There was not enough information during training to do a proper induction for this data element' end"
|
172
|
-
end
|
173
|
-
|
174
|
-
private
|
175
|
-
def preprocess_data(data_examples)
|
176
|
-
@tree = build_node(data_examples)
|
177
|
-
end
|
178
|
-
|
179
|
-
private
|
180
|
-
def build_node(data_examples, flag_att = [])
|
181
|
-
return ErrorNode.new if data_examples.length == 0
|
182
|
-
domain = domain(data_examples)
|
183
|
-
return CategoryNode.new(@data_labels.last, domain.last[0]) if domain.last.length == 1
|
184
|
-
min_entropy_index = min_entropy_index(data_examples, domain, flag_att)
|
185
|
-
flag_att << min_entropy_index
|
186
|
-
split_data_examples = split_data_examples(data_examples, domain, min_entropy_index)
|
187
|
-
return CategoryNode.new(@data_labels.last, most_freq(data_examples, domain)) if split_data_examples.length == 1
|
188
|
-
nodes = split_data_examples.collect do |partial_data_examples|
|
189
|
-
build_node(partial_data_examples, flag_att)
|
190
|
-
end
|
191
|
-
return EvaluationNode.new(@data_labels, min_entropy_index, domain[min_entropy_index], nodes)
|
192
|
-
end
|
193
|
-
|
194
|
-
private
|
195
|
-
def self.sum(values)
|
196
|
-
values.inject( 0 ) { |sum,x| sum+x }
|
197
|
-
end
|
198
|
-
|
199
|
-
private
|
200
|
-
def self.log2(z)
|
201
|
-
return 0.0 if z == 0
|
202
|
-
Math.log(z)/LOG2
|
203
|
-
end
|
204
|
-
|
205
|
-
private
|
206
|
-
def most_freq(examples, domain)
|
207
|
-
freqs = []
|
208
|
-
domain.last.length.times { freqs << 0}
|
209
|
-
examples.each do |example|
|
210
|
-
cat_index = domain.last.index(example.last)
|
211
|
-
freq = freqs[cat_index] + 1
|
212
|
-
freqs[cat_index] = freq
|
213
|
-
end
|
214
|
-
max_freq = freqs.max
|
215
|
-
max_freq_index = freqs.index(max_freq)
|
216
|
-
domain.last[max_freq_index]
|
217
|
-
end
|
218
|
-
|
219
|
-
private
|
220
|
-
def split_data_examples(data_examples, domain, att_index)
|
221
|
-
data_examples_array = []
|
222
|
-
att_value_examples = {}
|
223
|
-
data_examples.each do |example|
|
224
|
-
example_set = att_value_examples[example[att_index]]
|
225
|
-
example_set = [] if !example_set
|
226
|
-
example_set << example
|
227
|
-
att_value_examples.store(example[att_index], example_set)
|
228
|
-
end
|
229
|
-
att_value_examples.each_pair do |att_value, example_set|
|
230
|
-
att_value_index = domain[att_index].index(att_value)
|
231
|
-
data_examples_array[att_value_index] = example_set
|
232
|
-
end
|
233
|
-
return data_examples_array
|
234
|
-
end
|
235
|
-
|
236
|
-
private
|
237
|
-
def min_entropy_index(data_examples, domain, flag_att=[])
|
238
|
-
min_entropy = nil
|
239
|
-
min_index = 0
|
240
|
-
domain[0..-2].each_index do |index|
|
241
|
-
freq_grid = freq_grid(index, data_examples, domain)
|
242
|
-
entropy = entropy(freq_grid, data_examples.length)
|
243
|
-
if (!min_entropy || entropy < min_entropy) && !flag_att.include?(index)
|
244
|
-
min_entropy = entropy
|
245
|
-
min_index = index
|
246
|
-
end
|
247
|
-
end
|
248
|
-
return min_index
|
249
|
-
end
|
250
|
-
|
251
|
-
private
|
252
|
-
def domain(data_examples)
|
253
|
-
domain = []
|
254
|
-
@data_labels.length.times { domain << [] }
|
255
|
-
data_examples.each do |data|
|
256
|
-
data.each_index do |i|
|
257
|
-
domain[i] << data[i] if i<domain.length && !domain[i].include?(data[i])
|
258
|
-
end
|
259
|
-
end
|
260
|
-
return domain
|
261
|
-
end
|
262
|
-
|
263
|
-
private
|
264
|
-
def freq_grid(att_index, data_examples, domain)
|
265
|
-
#Initialize empty grid
|
266
|
-
grid_element = []
|
267
|
-
domain.last.length.times { grid_element << 0}
|
268
|
-
grid = []
|
269
|
-
domain[att_index].length.times { grid << grid_element.clone }
|
270
|
-
#Fill grid with frequencies
|
271
|
-
data_examples.each do |example|
|
272
|
-
att_val = example[att_index]
|
273
|
-
att_val_index = domain[att_index].index(att_val)
|
274
|
-
category = example.last
|
275
|
-
category_index = domain.last.index(category)
|
276
|
-
freq = grid[att_val_index][category_index] + 1
|
277
|
-
grid[att_val_index][category_index] = freq
|
278
|
-
end
|
279
|
-
return grid
|
280
|
-
end
|
281
|
-
|
282
|
-
private
|
283
|
-
def entropy(freq_grid, total_examples)
|
284
|
-
#Calc entropy of each element
|
285
|
-
entropy = 0
|
286
|
-
freq_grid.each do |att_freq|
|
287
|
-
att_total_freq = ID3.sum(att_freq)
|
288
|
-
partial_entropy = 0
|
289
|
-
if att_total_freq != 0
|
290
|
-
att_freq.each do |freq|
|
291
|
-
prop = freq.to_f/att_total_freq
|
292
|
-
partial_entropy += (-1*prop*ID3.log2(prop))
|
293
|
-
end
|
294
|
-
end
|
295
|
-
entropy += (att_total_freq.to_f/total_examples) * partial_entropy
|
296
|
-
end
|
297
|
-
return entropy
|
298
|
-
end
|
299
|
-
|
300
|
-
private
|
301
|
-
LOG2 = Math.log(2)
|
302
|
-
end
|
303
|
-
|
304
|
-
class EvaluationNode
|
305
|
-
attr_reader :index, :values, :nodes
|
306
|
-
def initialize(data_labels, index, values, nodes)
|
307
|
-
@index = index
|
308
|
-
@values = values
|
309
|
-
@nodes = nodes
|
310
|
-
@data_labels = data_labels
|
311
|
-
end
|
312
|
-
def value(data)
|
313
|
-
value = data[@index]
|
314
|
-
return rule_not_found if !@values.include?(value)
|
315
|
-
return nodes[@values.index(value)].value(data)
|
316
|
-
end
|
317
|
-
def get_rules
|
318
|
-
rule_set = []
|
319
|
-
@nodes.each_index do |child_node_index|
|
320
|
-
my_rule = "#{@data_labels[@index]}=='#{@values[child_node_index]}'"
|
321
|
-
child_node = @nodes[child_node_index]
|
322
|
-
child_node_rules = child_node.get_rules
|
323
|
-
child_node_rules.each do |child_rule|
|
324
|
-
child_rule.unshift(my_rule)
|
325
|
-
end
|
326
|
-
rule_set += child_node_rules
|
327
|
-
end
|
328
|
-
return rule_set
|
329
|
-
end
|
330
|
-
end
|
331
|
-
|
332
|
-
class CategoryNode
|
333
|
-
def initialize(label, value)
|
334
|
-
@label = label
|
335
|
-
@value = value
|
336
|
-
end
|
337
|
-
def value(data)
|
338
|
-
return @value
|
339
|
-
end
|
340
|
-
def get_rules
|
341
|
-
return [["#{@label}='#{@value}'"]]
|
342
|
-
end
|
343
|
-
end
|
344
|
-
|
345
|
-
class ErrorNode
|
346
|
-
def value(data)
|
347
|
-
raise "There was not enough information during training to do a proper induction for this data element."
|
348
|
-
end
|
349
|
-
def get_rules
|
350
|
-
return []
|
351
|
-
end
|
352
|
-
end
|
353
|
-
|
354
|
-
end
|
@@ -1,268 +0,0 @@
|
|
1
|
-
#
|
2
|
-
# The GeneticAlgorithm module implements the GeneticSearch and Chromosome
|
3
|
-
# classes. The GeneticSearch is a generic class, and can be used to solve
|
4
|
-
# any kind of problems. The GeneticSearch class performs a stochastic search
|
5
|
-
# of the solution of a given problem.
|
6
|
-
#
|
7
|
-
# The Chromosome is "problem specific". Ai4r built-in Chromosome class was
|
8
|
-
# designed to model the Travelling salesman problem. If you want to solve other
|
9
|
-
# type of problem, you will have to modify the Chromosome class, by overwriting
|
10
|
-
# its fitness, reproduce, and mutate functions, to model your specific problem.
|
11
|
-
#
|
12
|
-
# Author:: Sergio Fierens
|
13
|
-
# License:: MPL 1.1
|
14
|
-
# Project:: ai4r
|
15
|
-
# Url:: http://ai4r.rubyforge.org/
|
16
|
-
#
|
17
|
-
# You can redistribute it and/or modify it under the terms of
|
18
|
-
# the Mozilla Public License version 1.1 as published by the
|
19
|
-
# Mozilla Foundation at http://www.mozilla.org/MPL/MPL-1.1.txt
|
20
|
-
|
21
|
-
module GeneticAlgorithm
|
22
|
-
|
23
|
-
# This class is used to automatically:
|
24
|
-
#
|
25
|
-
# 1. Choose initial population
|
26
|
-
# 2. Evaluate the fitness of each individual in the population
|
27
|
-
# 3. Repeat
|
28
|
-
# 1. Select best-ranking individuals to reproduce
|
29
|
-
# 2. Breed new generation through crossover and mutation (genetic operations) and give birth to offspring
|
30
|
-
# 3. Evaluate the individual fitnesses of the offspring
|
31
|
-
# 4. Replace worst ranked part of population with offspring
|
32
|
-
# 4. Until termination
|
33
|
-
#
|
34
|
-
# If you want to customize the algorithm, you must modify any of the following classes:
|
35
|
-
# - Chromosome
|
36
|
-
# - Population
|
37
|
-
class GeneticSearch
|
38
|
-
|
39
|
-
attr_accessor :population
|
40
|
-
|
41
|
-
|
42
|
-
def initialize(initial_population_size, generations)
|
43
|
-
@population_size = initial_population_size
|
44
|
-
@max_generation = generations
|
45
|
-
@generation = 0
|
46
|
-
end
|
47
|
-
|
48
|
-
# 1. Choose initial population
|
49
|
-
# 2. Evaluate the fitness of each individual in the population
|
50
|
-
# 3. Repeat
|
51
|
-
# 1. Select best-ranking individuals to reproduce
|
52
|
-
# 2. Breed new generation through crossover and mutation (genetic operations) and give birth to offspring
|
53
|
-
# 3. Evaluate the individual fitnesses of the offspring
|
54
|
-
# 4. Replace worst ranked part of population with offspring
|
55
|
-
# 4. Until termination
|
56
|
-
# 5. Return the best chromosome
|
57
|
-
def run
|
58
|
-
generate_initial_population #Generate initial population
|
59
|
-
@max_generation.times do
|
60
|
-
selected_to_breed = selection #Evaluates current population
|
61
|
-
offsprings = reproduction selected_to_breed #Generate the population for this new generation
|
62
|
-
replace_worst_ranked offsprings
|
63
|
-
end
|
64
|
-
return best_chromosome
|
65
|
-
end
|
66
|
-
|
67
|
-
|
68
|
-
def generate_initial_population
|
69
|
-
@population = []
|
70
|
-
@population_size.times do
|
71
|
-
population << Chromosome.seed
|
72
|
-
end
|
73
|
-
end
|
74
|
-
|
75
|
-
# Select best-ranking individuals to reproduce
|
76
|
-
#
|
77
|
-
# Selection is the stage of a genetic algorithm in which individual
|
78
|
-
# genomes are chosen from a population for later breeding.
|
79
|
-
# There are several generic selection algorithms, such as
|
80
|
-
# tournament selection and roulette wheel selection. We implemented the
|
81
|
-
# latter.
|
82
|
-
#
|
83
|
-
# Steps:
|
84
|
-
#
|
85
|
-
# 1. The fitness function is evaluated for each individual, providing fitness values
|
86
|
-
# 2. The population is sorted by descending fitness values.
|
87
|
-
# 3. The fitness values are then normalized. (Highest fitness gets 1, lowest fitness gets 0). The normalized value is stored in the "normalized_fitness" attribute of the chromosomes.
|
88
|
-
# 4. A random number R is chosen. R is between 0 and the accumulated normalized value (all the normalized fitness values added together).
|
89
|
-
# 5. The selected individual is the first one whose accumulated normalized value (its normalized value plus the normalized values of the chromosomes prior to it) is greater than or equal to R.
|
90
|
-
# 6. We repeat steps 4 and 5, 2/3 times the population size.
|
91
|
-
def selection
|
92
|
-
@population.sort! { |a, b| b.fitness <=> a.fitness}
|
93
|
-
best_fitness = @population[0].fitness
|
94
|
-
worst_fitness = @population.last.fitness
|
95
|
-
acum_fitness = 0
|
96
|
-
if best_fitness-worst_fitness > 0
|
97
|
-
@population.each do |chromosome|
|
98
|
-
chromosome.normalized_fitness = (chromosome.fitness - worst_fitness)/(best_fitness-worst_fitness)
|
99
|
-
acum_fitness += chromosome.normalized_fitness
|
100
|
-
end
|
101
|
-
else
|
102
|
-
@population.each { |chromosome| chromosome.normalized_fitness = 1}
|
103
|
-
end
|
104
|
-
selected_to_breed = []
|
105
|
-
((2*@population_size)/3).times do
|
106
|
-
selected_to_breed << select_random_individual(acum_fitness)
|
107
|
-
end
|
108
|
-
selected_to_breed
|
109
|
-
end
|
110
|
-
|
111
|
-
# We combine each pair of selected chromosome using the method
|
112
|
-
# Chromosome.reproduce
|
113
|
-
#
|
114
|
-
# The reproduction will also call the Chromosome.mutate method with
|
115
|
-
# each member of the population. You should implement Chromosome.mutate
|
116
|
-
# to only change (mutate) randomly. E.g. You could effectively change the
|
117
|
-
# chromosome only if
|
118
|
-
# rand < ((1 - chromosome.normalized_fitness) * 0.4)
|
119
|
-
def reproduction(selected_to_breed)
|
120
|
-
offsprings = []
|
121
|
-
0.upto(selected_to_breed.length/2-1) do |i|
|
122
|
-
offsprings << Chromosome.reproduce(selected_to_breed[2*i], selected_to_breed[2*i+1])
|
123
|
-
end
|
124
|
-
@population.each do |individual|
|
125
|
-
Chromosome.mutate(individual)
|
126
|
-
end
|
127
|
-
return offsprings
|
128
|
-
end
|
129
|
-
|
130
|
-
# Replace worst ranked part of population with offspring
|
131
|
-
def replace_worst_ranked(offsprings)
|
132
|
-
size = offsprings.length
|
133
|
-
@population = @population [0..((-1*size)-1)] + offsprings
|
134
|
-
end
|
135
|
-
|
136
|
-
# Select the best chromosome in the population
|
137
|
-
def best_chromosome
|
138
|
-
the_best = @population[0]
|
139
|
-
@population.each do |chromosome|
|
140
|
-
the_best = chromosome if chromosome.fitness > the_best.fitness
|
141
|
-
end
|
142
|
-
return the_best
|
143
|
-
end
|
144
|
-
|
145
|
-
private
|
146
|
-
def select_random_individual(acum_fitness)
|
147
|
-
select_random_target = acum_fitness * rand
|
148
|
-
local_acum = 0
|
149
|
-
@population.each do |chromosome|
|
150
|
-
local_acum += chromosome.normalized_fitness
|
151
|
-
return chromosome if local_acum >= select_random_target
|
152
|
-
end
|
153
|
-
end
|
154
|
-
|
155
|
-
end
|
156
|
-
|
157
|
-
# A Chromosome is a representation of an individual solution for a specific
|
158
|
-
# problem. You will have to redefine your Chromosome representation for each
|
159
|
-
# particular problem, along with its fitness, mutate, reproduce, and seed
|
160
|
-
# functions.
|
161
|
-
class Chromosome
|
162
|
-
|
163
|
-
attr_accessor :data
|
164
|
-
attr_accessor :normalized_fitness
|
165
|
-
|
166
|
-
def initialize(data)
|
167
|
-
@data = data
|
168
|
-
end
|
169
|
-
|
170
|
-
# The fitness function quantifies the optimality of a solution
|
171
|
-
# (that is, a chromosome) in a genetic algorithm so that that particular
|
172
|
-
# chromosome may be ranked against all the other chromosomes.
|
173
|
-
#
|
174
|
-
# Optimal chromosomes, or at least chromosomes which are more optimal,
|
175
|
-
# are allowed to breed and mix their datasets by any of several techniques,
|
176
|
-
# producing a new generation that will (hopefully) be even better.
|
177
|
-
def fitness
|
178
|
-
return @fitness if @fitness
|
179
|
-
last_token = @data[0]
|
180
|
-
cost = 0
|
181
|
-
@data[1..-1].each do |token|
|
182
|
-
cost += @@costs[last_token][token]
|
183
|
-
last_token = token
|
184
|
-
end
|
185
|
-
@fitness = -1 * cost
|
186
|
-
return @fitness
|
187
|
-
end
|
188
|
-
|
189
|
-
# mutation is a function used to maintain genetic diversity from one
|
190
|
-
# generation of a population of chromosomes to the next. It is analogous
|
191
|
-
# to biological mutation.
|
192
|
-
#
|
193
|
-
# The purpose of mutation in GAs is to allow the
|
194
|
-
# algorithm to avoid local minima by preventing the population of
|
195
|
-
# chromosomes from becoming too similar to each other, thus slowing or even
|
196
|
-
# stopping evolution.
|
197
|
-
#
|
198
|
-
# Calling the mutate function will "probably" slightly change a chromosome
|
199
|
-
# randomly.
|
200
|
-
#
|
201
|
-
# This implementation of "mutation" will (probably) reverse the
|
202
|
-
# order of 2 consecutive random nodes
|
203
|
-
# (e.g. from [ 0, 1, 2, 4] to [0, 2, 1, 4]) if:
|
204
|
-
# rand < ((1 - chromosome.normalized_fitness) * 0.3)
|
205
|
-
def self.mutate(chromosome)
|
206
|
-
if chromosome.normalized_fitness && rand < ((1 - chromosome.normalized_fitness) * 0.3)
|
207
|
-
data = chromosome.data
|
208
|
-
index = rand(data.length-1)
|
209
|
-
data[index], data[index+1] = data[index+1], data[index]
|
210
|
-
chromosome.data = data
|
211
|
-
@fitness = nil
|
212
|
-
end
|
213
|
-
end
|
214
|
-
|
215
|
-
# Reproduction is used to vary the programming of a chromosome or
|
216
|
-
# chromosomes from one generation to the next. There are several ways to
|
217
|
-
# combine two chromosomes: One-point crossover, Two-point crossover,
|
218
|
-
# "Cut and splice", edge recombination, and more.
|
219
|
-
#
|
220
|
-
# The method is usually dependent on the problem domain.
|
221
|
-
# In this case, we have implemented edge recombination, which is the
|
222
|
-
# most used reproduction algorithm for the Travelling salesman problem.
|
223
|
-
def self.reproduce(a, b)
|
224
|
-
data_size = @@costs[0].length
|
225
|
-
available = []
|
226
|
-
0.upto(data_size-1) { |n| available << n }
|
227
|
-
token = a.data[0]
|
228
|
-
spawn = [token]
|
229
|
-
available.delete(token)
|
230
|
-
while available.length > 0 do
|
231
|
-
#Select next
|
232
|
-
if token != b.data.last && available.include?(b.data[b.data.index(token)+1])
|
233
|
-
next_token = b.data[b.data.index(token)+1]
|
234
|
-
elsif token != a.data.last && available.include?(a.data[a.data.index(token)+1])
|
235
|
-
next_token = a.data[a.data.index(token)+1]
|
236
|
-
else
|
237
|
-
next_token = available[rand(available.length)]
|
238
|
-
end
|
239
|
-
#Add to spawn
|
240
|
-
token = next_token
|
241
|
-
available.delete(token)
|
242
|
-
spawn << next_token
|
243
|
-
a, b = b, a if rand < 0.4
|
244
|
-
end
|
245
|
-
return Chromosome.new(spawn)
|
246
|
-
end
|
247
|
-
|
248
|
-
# Initializes an individual solution (chromosome) for the initial
|
249
|
-
# population. Usually the chromosome is generated randomly, but you can
|
250
|
-
# use some problem domain knowledge, to generate better initial solutions.
|
251
|
-
def self.seed
|
252
|
-
data_size = @@costs[0].length
|
253
|
-
available = []
|
254
|
-
0.upto(data_size-1) { |n| available << n }
|
255
|
-
seed = []
|
256
|
-
while available.length > 0 do
|
257
|
-
index = rand(available.length)
|
258
|
-
seed << available.delete_at(index)
|
259
|
-
end
|
260
|
-
return Chromosome.new(seed)
|
261
|
-
end
|
262
|
-
|
263
|
-
def self.set_cost_matrix(costs)
|
264
|
-
@@costs = costs
|
265
|
-
end
|
266
|
-
end
|
267
|
-
|
268
|
-
end
|