ai4r 1.2 → 1.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.rdoc +12 -25
- data/examples/decision_trees/id3_example.rb +6 -9
- data/examples/decision_trees/results.txt +2 -0
- data/examples/genetic_algorithm/genetic_algorithm_example.rb +11 -13
- data/examples/neural_network/xor_example.rb +25 -0
- data/lib/ai4r.rb +10 -0
- data/lib/ai4r/classifiers/classifier.rb +46 -0
- data/lib/ai4r/classifiers/id3.rb +27 -58
- data/lib/ai4r/classifiers/one_r.rb +19 -58
- data/lib/ai4r/classifiers/prism.rb +21 -57
- data/lib/ai4r/classifiers/zero_r.rb +16 -48
- data/lib/ai4r/clusterers/bisecting_k_means.rb +115 -0
- data/lib/ai4r/clusterers/clusterer.rb +55 -0
- data/lib/ai4r/clusterers/k_means.rb +164 -0
- data/lib/ai4r/data/data_set.rb +250 -0
- data/lib/ai4r/genetic_algorithm/genetic_algorithm.rb +19 -19
- data/lib/ai4r/neural_network/backpropagation.rb +23 -24
- data/site/build/site/en/broken-links.xml +2 -0
- data/site/build/site/en/downloads.html +200 -0
- data/site/build/site/en/downloads.pdf +151 -0
- data/site/build/site/en/forum.html +197 -0
- data/site/build/site/en/forum.pdf +151 -0
- data/site/build/site/en/geneticAlgorithms.html +591 -0
- data/site/build/site/en/geneticAlgorithms.pdf +934 -0
- data/site/build/site/en/images/ai4r-logo.png +0 -0
- data/site/build/site/en/images/built-with-forrest-button.png +0 -0
- data/site/build/site/en/images/c.png +0 -0
- data/site/build/site/en/images/c_wbn.png +0 -0
- data/site/build/site/en/images/c_wn.png +0 -0
- data/site/build/site/en/images/ero.gif +0 -0
- data/site/build/site/en/images/europe2.png +0 -0
- data/site/build/site/en/images/europe3.png +0 -0
- data/site/build/site/en/images/fitness.png +0 -0
- data/site/build/site/en/images/genetic_algorithms_example.png +0 -0
- data/site/build/site/en/images/instruction_arrow.png +0 -0
- data/site/build/site/en/images/jadeferret.png +0 -0
- data/site/build/site/en/images/my_email.png +0 -0
- data/site/build/site/en/images/neural_network_example.png +0 -0
- data/site/build/site/en/images/rubyforge.png +0 -0
- data/site/build/site/en/images/s.png +0 -0
- data/site/build/site/en/images/s_wbn.png +0 -0
- data/site/build/site/en/images/s_wn.png +0 -0
- data/site/build/site/en/images/sigmoid.png +0 -0
- data/site/build/site/en/images/t.png +0 -0
- data/site/build/site/en/images/t_wbn.png +0 -0
- data/site/build/site/en/images/t_wn.png +0 -0
- data/site/build/site/en/index.html +336 -0
- data/site/build/site/en/index.pdf +508 -0
- data/site/build/site/en/linkmap.html +263 -0
- data/site/build/site/en/linkmap.pdf +94 -0
- data/site/build/site/en/locationmap.xml +72 -0
- data/site/build/site/en/machineLearning.html +339 -0
- data/site/build/site/en/machineLearning.pdf +337 -0
- data/site/build/site/en/neuralNetworks.html +484 -0
- data/site/build/site/en/neuralNetworks.pdf +604 -0
- data/site/build/site/en/skin/CommonMessages_de.xml +23 -0
- data/site/build/site/en/skin/CommonMessages_en_US.xml +23 -0
- data/site/build/site/en/skin/CommonMessages_es.xml +23 -0
- data/site/build/site/en/skin/CommonMessages_fr.xml +23 -0
- data/site/build/site/en/skin/basic.css +166 -0
- data/site/build/site/en/skin/breadcrumbs-optimized.js +90 -0
- data/site/build/site/en/skin/breadcrumbs.js +237 -0
- data/site/build/site/en/skin/fontsize.js +166 -0
- data/site/build/site/en/skin/getBlank.js +40 -0
- data/site/build/site/en/skin/getMenu.js +45 -0
- data/site/build/site/en/skin/images/README.txt +1 -0
- data/site/build/site/en/skin/images/add.jpg +0 -0
- data/site/build/site/en/skin/images/built-with-forrest-button.png +0 -0
- data/site/build/site/en/skin/images/chapter.gif +0 -0
- data/site/build/site/en/skin/images/chapter_open.gif +0 -0
- data/site/build/site/en/skin/images/current.gif +0 -0
- data/site/build/site/en/skin/images/error.png +0 -0
- data/site/build/site/en/skin/images/external-link.gif +0 -0
- data/site/build/site/en/skin/images/fix.jpg +0 -0
- data/site/build/site/en/skin/images/forrest-credit-logo.png +0 -0
- data/site/build/site/en/skin/images/hack.jpg +0 -0
- data/site/build/site/en/skin/images/header_white_line.gif +0 -0
- data/site/build/site/en/skin/images/info.png +0 -0
- data/site/build/site/en/skin/images/instruction_arrow.png +0 -0
- data/site/build/site/en/skin/images/label.gif +0 -0
- data/site/build/site/en/skin/images/page.gif +0 -0
- data/site/build/site/en/skin/images/pdfdoc.gif +0 -0
- data/site/build/site/en/skin/images/poddoc.png +0 -0
- data/site/build/site/en/skin/images/printer.gif +0 -0
- data/site/build/site/en/skin/images/rc-b-l-15-1body-2menu-3menu.png +0 -0
- data/site/build/site/en/skin/images/rc-b-r-15-1body-2menu-3menu.png +0 -0
- data/site/build/site/en/skin/images/rc-b-r-5-1header-2tab-selected-3tab-selected.png +0 -0
- data/site/build/site/en/skin/images/rc-t-l-5-1header-2searchbox-3searchbox.png +0 -0
- data/site/build/site/en/skin/images/rc-t-l-5-1header-2tab-selected-3tab-selected.png +0 -0
- data/site/build/site/en/skin/images/rc-t-l-5-1header-2tab-unselected-3tab-unselected.png +0 -0
- data/site/build/site/en/skin/images/rc-t-r-15-1body-2menu-3menu.png +0 -0
- data/site/build/site/en/skin/images/rc-t-r-5-1header-2searchbox-3searchbox.png +0 -0
- data/site/build/site/en/skin/images/rc-t-r-5-1header-2tab-selected-3tab-selected.png +0 -0
- data/site/build/site/en/skin/images/rc-t-r-5-1header-2tab-unselected-3tab-unselected.png +0 -0
- data/site/build/site/en/skin/images/remove.jpg +0 -0
- data/site/build/site/en/skin/images/rss.png +0 -0
- data/site/build/site/en/skin/images/spacer.gif +0 -0
- data/site/build/site/en/skin/images/success.png +0 -0
- data/site/build/site/en/skin/images/txtdoc.png +0 -0
- data/site/build/site/en/skin/images/update.jpg +0 -0
- data/site/build/site/en/skin/images/valid-html401.png +0 -0
- data/site/build/site/en/skin/images/vcss.png +0 -0
- data/site/build/site/en/skin/images/warning.png +0 -0
- data/site/build/site/en/skin/images/xmldoc.gif +0 -0
- data/site/build/site/en/skin/menu.js +48 -0
- data/site/build/site/en/skin/note.txt +50 -0
- data/site/build/site/en/skin/print.css +54 -0
- data/site/build/site/en/skin/profile.css +163 -0
- data/site/build/site/en/skin/prototype.js +1257 -0
- data/site/build/site/en/skin/screen.css +587 -0
- data/site/build/site/en/svn.html +252 -0
- data/site/build/site/en/svn.pdf +306 -0
- data/site/build/site/en/wholesite.pdf +1915 -0
- data/site/build/tmp/brokenlinks.xml +2 -0
- data/site/build/tmp/cocoon-work/cache-dir/cocoon-ehcache-1.data +0 -0
- data/site/build/tmp/cocoon-work/cache-dir/cocoon-ehcache-1.index +0 -0
- data/site/build/tmp/locationmap.xml +14 -14
- data/site/build/tmp/output.xmap +23 -23
- data/site/build/tmp/pluginlist2fetchbuild.xml +144 -144
- data/site/build/tmp/projfilters.properties +41 -41
- data/site/build/webapp/WEB-INF/logs/core.log +593 -679
- data/site/build/webapp/WEB-INF/logs/error.log +362 -279
- data/site/build/webapp/WEB-INF/logs/sitemap.log +368 -1015
- data/site/src/documentation/content/xdocs/index.xml +18 -10
- data/site/src/documentation/content/xdocs/machineLearning.xml +4 -3
- data/site/src/documentation/content/xdocs/site.xml +2 -1
- data/site/src/documentation/resources/images/sigmoid.png +0 -0
- data/test/classifiers/id3_test.rb +45 -44
- data/test/classifiers/one_r_test.rb +19 -17
- data/test/classifiers/prism_test.rb +22 -20
- data/test/classifiers/zero_r_test.rb +15 -12
- data/test/clusterers/bisecting_k_means_test.rb +59 -0
- data/test/clusterers/k_means_test.rb +93 -0
- data/test/data/data_set_test.rb +92 -0
- metadata +252 -128
- data/lib/ai4r/classifiers/classifier_helper.rb +0 -54
- data/site/src/documentation/content/xdocs/forum.html +0 -9
- data/site/src/documentation/resources/images/Thumbs.db +0 -0
- data/site/src/documentation/resources/images/sub-dir/Thumbs.db +0 -0
|
@@ -11,7 +11,8 @@
|
|
|
11
11
|
# J. Cendrowska (1987). PRISM: An algorithm for inducing modular rules.
|
|
12
12
|
# International Journal of Man-Machine Studies. 27(4):349-370.
|
|
13
13
|
|
|
14
|
-
require File.dirname(__FILE__) + '/
|
|
14
|
+
require File.dirname(__FILE__) + '/../data/data_set'
|
|
15
|
+
require File.dirname(__FILE__) + '/../classifiers/classifier'
|
|
15
16
|
|
|
16
17
|
module Ai4r
|
|
17
18
|
module Classifiers
|
|
@@ -23,50 +24,17 @@ module Ai4r
|
|
|
23
24
|
#
|
|
24
25
|
# J. Cendrowska (1987). PRISM: An algorithm for inducing modular rules.
|
|
25
26
|
# International Journal of Man-Machine Studies. 27(4):349-370.
|
|
26
|
-
class Prism
|
|
27
|
+
class Prism < Classifier
|
|
27
28
|
|
|
28
|
-
|
|
29
|
-
include ClassifierHelper
|
|
29
|
+
attr_reader :data_set, :rules
|
|
30
30
|
|
|
31
|
-
# Build a new Prism classifier.
|
|
32
|
-
#
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
# ]
|
|
39
|
-
#
|
|
40
|
-
# e.g.
|
|
41
|
-
# [ ['New York', '<30', 'M', 'Y'],
|
|
42
|
-
# ['Chicago', '<30', 'M', 'Y'],
|
|
43
|
-
# ['Chicago', '<30', 'F', 'Y'],
|
|
44
|
-
# ['New York', '<30', 'M', 'Y'],
|
|
45
|
-
# ['New York', '<30', 'M', 'Y'],
|
|
46
|
-
# ['Chicago', '[30-50)', 'M', 'Y'],
|
|
47
|
-
# ['New York', '[30-50)', 'F', 'N'],
|
|
48
|
-
# ['Chicago', '[30-50)', 'F', 'Y'],
|
|
49
|
-
# ['New York', '[30-50)', 'F', 'N'],
|
|
50
|
-
# ['Chicago', '[50-80]', 'M', 'N'],
|
|
51
|
-
# ['New York', '[50-80]', 'F', 'N'],
|
|
52
|
-
# ['New York', '[50-80]', 'M', 'N'],
|
|
53
|
-
# ['Chicago', '[50-80]', 'M', 'N'],
|
|
54
|
-
# ['New York', '[50-80]', 'F', 'N'],
|
|
55
|
-
# ['Chicago', '>80', 'F', 'Y']
|
|
56
|
-
# ]
|
|
57
|
-
#
|
|
58
|
-
# Data labels must have the following format:
|
|
59
|
-
# [ 'city', 'age_range', 'gender', 'marketing_target' ]
|
|
60
|
-
#
|
|
61
|
-
# If you do not provide labels for you data, the following labels will
|
|
62
|
-
# be created by default:
|
|
63
|
-
# [ 'attribute_1', 'attribute_2', 'attribute_3', 'class_value' ]
|
|
64
|
-
#
|
|
65
|
-
def build(data_examples, data_labels=nil)
|
|
66
|
-
check_data_examples(data_examples)
|
|
67
|
-
@data_labels = (data_labels) ? data_labels : default_data_labels(data_examples)
|
|
68
|
-
domains = build_domains(data_examples)
|
|
69
|
-
instances = data_examples.collect {|data| data }
|
|
31
|
+
# Build a new Prism classifier. You must provide a DataSet instance
|
|
32
|
+
# as parameter.
|
|
33
|
+
def build(data_set)
|
|
34
|
+
data_set.check_not_empty
|
|
35
|
+
@data_set = data_set
|
|
36
|
+
domains = @data_set.build_domains
|
|
37
|
+
instances = @data_set.data_items.collect {|data| data }
|
|
70
38
|
@rules = []
|
|
71
39
|
domains.last.each do |class_value|
|
|
72
40
|
while(has_class_value(instances, class_value))
|
|
@@ -91,7 +59,7 @@ module Ai4r
|
|
|
91
59
|
# This method returns the generated rules in ruby code.
|
|
92
60
|
# e.g.
|
|
93
61
|
#
|
|
94
|
-
# classifier.
|
|
62
|
+
# classifier.get_rules
|
|
95
63
|
# # => if age_range == '<30' then marketing_target = 'Y'
|
|
96
64
|
# elsif age_range == '>80' then marketing_target = 'Y'
|
|
97
65
|
# elsif city == 'Chicago' and age_range == '[30-50)' then marketing_target = 'Y'
|
|
@@ -101,10 +69,10 @@ module Ai4r
|
|
|
101
69
|
# It is a nice way to inspect induction results, and also to execute them:
|
|
102
70
|
# age_range = '[30-50)'
|
|
103
71
|
# city = 'New York'
|
|
104
|
-
# eval(classifier.
|
|
72
|
+
# eval(classifier.get_rules)
|
|
105
73
|
# puts marketing_target
|
|
106
74
|
# 'Y'
|
|
107
|
-
def
|
|
75
|
+
def get_rules
|
|
108
76
|
out = "if #{join_terms(@rules.first)} then #{then_clause(@rules.first)}"
|
|
109
77
|
@rules[1...-1].each do |rule|
|
|
110
78
|
out += "\nelsif #{join_terms(rule)} then #{then_clause(rule)}"
|
|
@@ -116,6 +84,10 @@ module Ai4r
|
|
|
116
84
|
|
|
117
85
|
protected
|
|
118
86
|
|
|
87
|
+
def get_attr_value(data, attr)
|
|
88
|
+
data[@data_set.get_index(attr)]
|
|
89
|
+
end
|
|
90
|
+
|
|
119
91
|
def has_class_value(instances, class_value)
|
|
120
92
|
instances.each { |data| return true if data.last == class_value}
|
|
121
93
|
return false
|
|
@@ -131,23 +103,15 @@ module Ai4r
|
|
|
131
103
|
|
|
132
104
|
def matches_conditions(data, conditions)
|
|
133
105
|
conditions.each_pair do |attr_label, attr_value|
|
|
134
|
-
return false if data
|
|
106
|
+
return false if get_attr_value(data, attr_label) != attr_value
|
|
135
107
|
end
|
|
136
108
|
return true
|
|
137
109
|
end
|
|
138
110
|
|
|
139
|
-
def get_attr_index(attr_label)
|
|
140
|
-
return @data_labels.index(attr_label)
|
|
141
|
-
end
|
|
142
|
-
|
|
143
|
-
def get_attr_value(data, attr_label)
|
|
144
|
-
return data[get_attr_index(attr_label)]
|
|
145
|
-
end
|
|
146
|
-
|
|
147
111
|
def build_rule(class_value, instances)
|
|
148
112
|
rule = {:class_value => class_value, :conditions => {}}
|
|
149
113
|
rule_instances = instances.collect {|data| data }
|
|
150
|
-
attributes = @data_labels[0...-1].collect {|label| label }
|
|
114
|
+
attributes = @data_set.data_labels[0...-1].collect {|label| label }
|
|
151
115
|
until(is_perfect(instances, rule) || attributes.empty?)
|
|
152
116
|
freq_table = build_freq_table(rule_instances, attributes, class_value)
|
|
153
117
|
condition = get_condition(freq_table)
|
|
@@ -223,7 +187,7 @@ module Ai4r
|
|
|
223
187
|
end
|
|
224
188
|
|
|
225
189
|
def then_clause(rule)
|
|
226
|
-
"#{@data_labels.last} = '#{rule[:class_value]}'"
|
|
190
|
+
"#{@data_set.data_labels.last} = '#{rule[:class_value]}'"
|
|
227
191
|
end
|
|
228
192
|
|
|
229
193
|
end
|
|
@@ -7,10 +7,12 @@
|
|
|
7
7
|
# the Mozilla Public License version 1.1 as published by the
|
|
8
8
|
# Mozilla Foundation at http://www.mozilla.org/MPL/MPL-1.1.txt
|
|
9
9
|
|
|
10
|
-
require File.dirname(__FILE__) + '/
|
|
10
|
+
require File.dirname(__FILE__) + '/../data/data_set.rb'
|
|
11
|
+
require File.dirname(__FILE__) + '/../classifiers/classifier'
|
|
11
12
|
|
|
12
13
|
module Ai4r
|
|
13
14
|
module Classifiers
|
|
15
|
+
|
|
14
16
|
# = Introduction
|
|
15
17
|
#
|
|
16
18
|
# The idea behind the ZeroR classifier is to identify the
|
|
@@ -18,53 +20,19 @@ module Ai4r
|
|
|
18
20
|
# It always returns that value when evaluating an instance.
|
|
19
21
|
# It is frequently used as a baseline for evaluating other machine learning
|
|
20
22
|
# algorithms.
|
|
21
|
-
class ZeroR
|
|
23
|
+
class ZeroR < Classifier
|
|
22
24
|
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
include ClassifierHelper
|
|
25
|
+
attr_reader :data_set, :class_value
|
|
26
26
|
|
|
27
|
-
# Build a new ZeroR classifier.
|
|
28
|
-
#
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
# ...
|
|
33
|
-
# [ATTM1_VALM, ATT2_VALM, ATT3_VALM, ... , ATTN_VALM, CLASS_VALM],
|
|
34
|
-
# ]
|
|
35
|
-
#
|
|
36
|
-
# e.g.
|
|
37
|
-
# [ ['New York', '<30', 'M', 'Y'],
|
|
38
|
-
# ['Chicago', '<30', 'M', 'Y'],
|
|
39
|
-
# ['Chicago', '<30', 'F', 'Y'],
|
|
40
|
-
# ['New York', '<30', 'M', 'Y'],
|
|
41
|
-
# ['New York', '<30', 'M', 'Y'],
|
|
42
|
-
# ['Chicago', '[30-50)', 'M', 'Y'],
|
|
43
|
-
# ['New York', '[30-50)', 'F', 'N'],
|
|
44
|
-
# ['Chicago', '[30-50)', 'F', 'Y'],
|
|
45
|
-
# ['New York', '[30-50)', 'F', 'N'],
|
|
46
|
-
# ['Chicago', '[50-80]', 'M', 'N'],
|
|
47
|
-
# ['New York', '[50-80]', 'F', 'N'],
|
|
48
|
-
# ['New York', '[50-80]', 'M', 'N'],
|
|
49
|
-
# ['Chicago', '[50-80]', 'M', 'N'],
|
|
50
|
-
# ['New York', '[50-80]', 'F', 'N'],
|
|
51
|
-
# ['Chicago', '>80', 'F', 'Y']
|
|
52
|
-
# ]
|
|
53
|
-
#
|
|
54
|
-
# Data labels must have the following format:
|
|
55
|
-
# [ 'city', 'age_range', 'gender', 'marketing_target' ]
|
|
56
|
-
#
|
|
57
|
-
# If you do not provide labels for you data, the following labels will
|
|
58
|
-
# be created by default:
|
|
59
|
-
# [ 'attribute_1', 'attribute_2', 'attribute_3', 'class_value' ]
|
|
60
|
-
#
|
|
61
|
-
def build(data_examples, data_labels=nil)
|
|
62
|
-
check_data_examples(data_examples)
|
|
63
|
-
@data_labels = (data_labels) ? data_labels : default_data_labels(data_examples)
|
|
27
|
+
# Build a new ZeroR classifier. You must provide a DataSet instance
|
|
28
|
+
# as parameter.
|
|
29
|
+
def build(data_set)
|
|
30
|
+
data_set.check_not_empty
|
|
31
|
+
@data_set = data_set
|
|
64
32
|
frequence = {}
|
|
65
33
|
max_freq = 0
|
|
66
|
-
@class_value
|
|
67
|
-
|
|
34
|
+
@class_value = nil
|
|
35
|
+
@data_set.data_items.each do |example|
|
|
68
36
|
class_value = example.last
|
|
69
37
|
class_frequency = frequence[class_value]
|
|
70
38
|
class_frequency = (class_frequency) ? class_frequency+1 : 1
|
|
@@ -86,16 +54,16 @@ module Ai4r
|
|
|
86
54
|
# This method returns the generated rules in ruby code.
|
|
87
55
|
# e.g.
|
|
88
56
|
#
|
|
89
|
-
# classifier.
|
|
57
|
+
# classifier.get_rules
|
|
90
58
|
# # => marketing_target='Y'
|
|
91
59
|
#
|
|
92
60
|
# It is a nice way to inspect induction results, and also to execute them:
|
|
93
61
|
# marketing_target = nil
|
|
94
|
-
# eval classifier.
|
|
62
|
+
# eval classifier.get_rules
|
|
95
63
|
# puts marketing_target
|
|
96
64
|
# # => 'Y'
|
|
97
|
-
def
|
|
98
|
-
return "#{@data_labels.last} = '#{@class_value}'"
|
|
65
|
+
def get_rules
|
|
66
|
+
return "#{@data_set.data_labels.last} = '#{@class_value}'"
|
|
99
67
|
end
|
|
100
68
|
|
|
101
69
|
end
|
|
@@ -0,0 +1,115 @@
|
|
|
1
|
+
# Author:: Sergio Fierens (implementation)
|
|
2
|
+
# License:: MPL 1.1
|
|
3
|
+
# Project:: ai4r
|
|
4
|
+
# Url:: http://ai4r.rubyforge.org/
|
|
5
|
+
#
|
|
6
|
+
# You can redistribute it and/or modify it under the terms of
|
|
7
|
+
# the Mozilla Public License version 1.1 as published by the
|
|
8
|
+
# Mozilla Foundation at http://www.mozilla.org/MPL/MPL-1.1.txt
|
|
9
|
+
|
|
10
|
+
require "set"
|
|
11
|
+
require File.dirname(__FILE__) + '/../data/data_set'
|
|
12
|
+
require File.dirname(__FILE__) + '/../clusterers/k_means'
|
|
13
|
+
|
|
14
|
+
module Ai4r
|
|
15
|
+
module Clusterers
|
|
16
|
+
|
|
17
|
+
# The Bisecting k-means algorithm is a variation of the "k-means" algorithm,
|
|
18
|
+
# somewhat less sensitive to the initial choice of centroids than the
|
|
19
|
+
# original.
|
|
20
|
+
#
|
|
21
|
+
# More about K Means algorithm:
|
|
22
|
+
# http://en.wikipedia.org/wiki/K-means_algorithm
|
|
23
|
+
class BisectingKMeans < KMeans

  attr_reader :data_set, :number_of_clusters, :clusters, :centroids
  attr_accessor :max_iterations, :distance_function, :refine

  # Creates a clusterer with the refinement pass enabled, which is the
  # default advertised by get_parameters_info.
  #
  # The original method was misspelled "intialize", so Ruby never ran it
  # and @refine stayed nil (refinement silently disabled).
  def initialize
    super()
    @refine = true
  end

  # Build a new clusterer, using data examples found in data_set.
  # Items will be clustered in "number_of_clusters" different
  # clusters.
  #
  # Starting from a single cluster holding the whole data set, the
  # biggest cluster is repeatedly split in two by a fresh KMeans
  # configured with this instance's parameters. When refine is truthy,
  # KMeans#build is then run with the bisected centroids as the initial
  # centroids (see calc_initial_centroids below).
  #
  # Fewer clusters than requested are produced when the biggest cluster
  # cannot be split; number_of_clusters is updated to the real count.
  #
  # Returns self.
  def build(data_set, number_of_clusters)
    @data_set = data_set
    @number_of_clusters = number_of_clusters

    @clusters = [@data_set]
    @centroids = [@data_set.get_mean_or_mode]
    while @clusters.length < @number_of_clusters
      biggest_cluster_index = find_biggest_cluster_index(@clusters)
      bisector = KMeans.new.
        set_parameters(get_parameters).
        build(@clusters[biggest_cluster_index], 2)
      halves = bisector.clusters
      # KMeans yields fewer than two clusters when it cannot find two
      # distinct items (and none at all when max_iterations is 0).
      # Replacing one cluster by one cluster would loop forever, so stop.
      break if halves.nil? || halves.length < 2
      @clusters.delete_at(biggest_cluster_index)
      @centroids.delete_at(biggest_cluster_index)
      @clusters.concat(halves)
      @centroids.concat(bisector.centroids)
    end

    # Keep the advertised count in sync with what was really produced, so
    # the refinement pass allocates exactly one cluster per centroid.
    @number_of_clusters = @clusters.length

    super(@data_set, @number_of_clusters) if @refine

    self
  end

  # Get info on what can be parameterized on this clusterer algorithm.
  # It returns a hash with the following format:
  # { :param_name => "Info on the parameter" }
  def get_parameters_info
    { :max_iterations => "Maximum number of iterations used to bisect a " +
        "cluster. By default it is uncapped.",
      :distance_function => "Custom implementation of distance function. " +
        "It must be a closure receiving two data items and return the " +
        "distance bewteen them. By default, this algorithm uses " +
        "ecuclidean distance of numeric attributes to the power of 2.",
      :refine => "Boolean value. True by default. It will run the " +
        "classic K Means algorithm, using as initial centroids the " +
        "result of the bisecting approach."
    }
  end

  # Set parameters on this clusterer instance.
  # You must provide a hash with the following format:
  # { :param_name => parameter_value }
  #
  # Use get_parameters_info to know what parameters are accepted.
  # Keys other than :refine are handled by the superclass.
  # Returns self, so calls can be chained.
  def set_parameters(parameters)
    super
    @refine = parameters[:refine] if parameters.has_key?(:refine)
    self
  end

  # Get parameter values on this clusterer instance.
  # Returns a hash with the following format:
  # { :param_name => parameter_value }
  def get_parameters
    super.merge(:refine => @refine)
  end

  protected

  # Overrides the random seeding of KMeans: the refinement pass starts
  # from the centroids already produced by the bisecting loop.
  def calc_initial_centroids
    @centroids # Use existing centroids
  end

  # Returns the index of the cluster holding the most data items.
  # Ties go to the lowest index; 0 is returned when every cluster is
  # empty or when clusters itself is empty.
  def find_biggest_cluster_index(clusters)
    max_index = 0
    max_length = 0
    clusters.each_with_index do |cluster, cluster_index|
      length = cluster.data_items.length
      if max_length < length
        max_length = length
        max_index = cluster_index
      end
    end
    max_index
  end

end
|
|
114
|
+
end
|
|
115
|
+
end
|
|
@@ -0,0 +1,55 @@
|
|
|
1
|
+
# Author:: Sergio Fierens
|
|
2
|
+
# License:: MPL 1.1
|
|
3
|
+
# Project:: ai4r
|
|
4
|
+
# Url:: http://ai4r.rubyforge.org/
|
|
5
|
+
#
|
|
6
|
+
# You can redistribute it and/or modify it under the terms of
|
|
7
|
+
# the Mozilla Public License version 1.1 as published by the
|
|
8
|
+
# Mozilla Foundation at http://www.mozilla.org/MPL/MPL-1.1.txt
|
|
9
|
+
|
|
10
|
+
module Ai4r
|
|
11
|
+
module Clusterers
|
|
12
|
+
|
|
13
|
+
# The purpose of this class is to define a common API for Clusterers.
|
|
14
|
+
# All methods in this class, including eval, must be implemented in
|
|
15
|
+
# subclasses.
|
|
16
|
+
class Clusterer

  # Build a new clusterer, using data examples found in data_set.
  # Data items will be clustered in "number_of_clusters" different
  # clusters.
  #
  # Raises NotImplementedError unless overridden by the subclass.
  def build(data_set, number_of_clusters)
    raise NotImplementedError, "#{self.class}#build is not implemented"
  end

  # Classifies the given data item, returning the cluster it belongs to.
  #
  # Raises NotImplementedError unless overridden by the subclass.
  def eval(data_item)
    raise NotImplementedError, "#{self.class}#eval is not implemented"
  end

  # Get info on what can be parameterized on this clusterer.
  # It returns a hash with the following format:
  # { :param_name => "Info on the parameter" }
  #
  # Raises NotImplementedError unless overridden by the subclass.
  def get_parameters_info
    raise NotImplementedError,
          "#{self.class}#get_parameters_info is not implemented"
  end

  # Set parameter values on this clusterer instance.
  # You must provide a hash with the following format:
  # { :param_name => parameter_value }
  #
  # Raises NotImplementedError unless overridden by the subclass.
  def set_parameters(parameters)
    raise NotImplementedError,
          "#{self.class}#set_parameters is not implemented"
  end

  # Get parameter values on this clusterer instance.
  # Returns a hash with the following format:
  # { :param_name => parameter_value }
  #
  # Raises NotImplementedError unless overridden by the subclass.
  def get_parameters
    raise NotImplementedError,
          "#{self.class}#get_parameters is not implemented"
  end

end
|
|
53
|
+
|
|
54
|
+
end
|
|
55
|
+
end
|
|
@@ -0,0 +1,164 @@
|
|
|
1
|
+
# Author:: Sergio Fierens (implementation)
|
|
2
|
+
# License:: MPL 1.1
|
|
3
|
+
# Project:: ai4r
|
|
4
|
+
# Url:: http://ai4r.rubyforge.org/
|
|
5
|
+
#
|
|
6
|
+
# You can redistribute it and/or modify it under the terms of
|
|
7
|
+
# the Mozilla Public License version 1.1 as published by the
|
|
8
|
+
# Mozilla Foundation at http://www.mozilla.org/MPL/MPL-1.1.txt
|
|
9
|
+
|
|
10
|
+
require "set"
|
|
11
|
+
require File.dirname(__FILE__) + '/../data/data_set'
|
|
12
|
+
require File.dirname(__FILE__) + '/../clusterers/clusterer'
|
|
13
|
+
|
|
14
|
+
module Ai4r
|
|
15
|
+
module Clusterers
|
|
16
|
+
|
|
17
|
+
# The k-means algorithm is an algorithm to cluster n objects
|
|
18
|
+
# based on attributes into k partitions, with k < n.
|
|
19
|
+
#
|
|
20
|
+
# More about K Means algorithm:
|
|
21
|
+
# http://en.wikipedia.org/wiki/K-means_algorithm
|
|
22
|
+
class KMeans < Clusterer

  attr_reader :data_set, :number_of_clusters
  attr_reader :clusters, :centroids, :iterations
  attr_accessor :max_iterations
  attr_accessor :distance_function

  # Build a new clusterer, using data examples found in data_set.
  # Items will be clustered in "number_of_clusters" different
  # clusters.
  #
  # Alternates cluster assignment and centroid recomputation until the
  # centroids stop changing or max_iterations (when set) is reached.
  # Returns self.
  def build(data_set, number_of_clusters)
    @data_set = data_set
    @number_of_clusters = number_of_clusters
    @iterations = 0
    # Forget the centroids of any previous build on this instance;
    # otherwise the stop test could succeed before the first iteration
    # and leave the clusters of the previous run in place.
    @old_centroids = nil

    calc_initial_centroids
    until stop_criteria_met
      calculate_membership_clusters
      recompute_centroids
    end

    self
  end

  # Classifies the given data item, returning the cluster index it belongs
  # to (0-based): the index of the centroid at the smallest distance.
  # Ties go to the lowest index.
  def eval(data_item)
    distances = @centroids.collect { |centroid| distance(data_item, centroid) }
    get_min_index(distances)
  end

  # Get info on what can be parameterized on this clusterer algorithm.
  # It returns a hash with the following format:
  # { :param_name => "Info on the parameter" }
  def get_parameters_info
    { :max_iterations => "Maximum number of iterations to build the " +
        "clusterer. By default it is uncapped.",
      :distance_function => "Custom implementation of distance function. " +
        "It must be a closure receiving two data items and return the " +
        "distance bewteen them. By default, this algorithm uses " +
        "ecuclidean distance of numeric attributes to the power of 2."
    }
  end

  # Set parameters on this clusterer instance.
  # You must provide a hash with the following format:
  # { :param_name => parameter_value }
  #
  # Use get_parameters_info to know what parameters are accepted.
  # Only keys present in the hash are applied; other keys are ignored.
  # Returns self, so calls can be chained.
  def set_parameters(parameters)
    if parameters.has_key?(:max_iterations)
      @max_iterations = parameters[:max_iterations]
    end
    if parameters.has_key?(:distance_function)
      @distance_function = parameters[:distance_function]
    end
    self
  end

  # Get parameter values on this clusterer instance.
  # Returns a hash with the following format:
  # { :param_name => parameter_value }
  def get_parameters
    { :max_iterations => @max_iterations,
      :distance_function => @distance_function }
  end

  # This function calculates the distance between 2 different
  # instances. By default, it returns the euclidean distance to the
  # power of 2.
  # You can provide a more convenient distance implementation:
  #
  # 1- Overwriting this method
  #
  # 2- Providing a closure to the :distance_function parameter
  def distance(a, b)
    return @distance_function.call(a, b) if @distance_function
    euclidean_distance(a, b)
  end

  protected

  # Sum of squared differences over the positions where both items hold
  # a Numeric; every other position is ignored. Positions are taken
  # from a, so b is only read at a's indexes.
  def euclidean_distance(a, b)
    dist = 0.0
    a.each_index do |index|
      if a[index].is_a?(Numeric) && b[index].is_a?(Numeric)
        diff = a[index] - b[index]
        dist += diff * diff
      end
    end
    dist
  end

  # Picks random data items as initial centroids, never taking two equal
  # items. When the data set holds fewer distinct items than requested,
  # @number_of_clusters is lowered to the number actually found.
  def calc_initial_centroids
    @centroids = []
    items = @data_set.data_items
    tried_indexes = Set.new
    while @centroids.length < @number_of_clusters &&
        tried_indexes.length < items.length
      random_index = rand(items.length)
      # Set#add? returns nil when the index was already tried.
      next unless tried_indexes.add?(random_index)
      candidate = items[random_index]
      @centroids << candidate unless @centroids.include?(candidate)
    end
    @number_of_clusters = @centroids.length
  end

  # True once the centroids did not change during the last iteration, or
  # once the optional iteration cap has been reached.
  def stop_criteria_met
    @old_centroids == @centroids ||
      (@max_iterations && (@max_iterations <= @iterations))
  end

  # Rebuilds @clusters from scratch: one empty DataSet per cluster
  # (sharing the labels of the source data set), then every data item is
  # appended to the cluster of its nearest centroid.
  def calculate_membership_clusters
    @clusters = Array.new(@number_of_clusters) do
      Ai4r::Data::DataSet.new :data_labels => @data_set.data_labels
    end
    @data_set.data_items.each do |data_item|
      @clusters[eval(data_item)] << data_item
    end
  end

  # Remembers the current centroids, replaces them by the
  # get_mean_or_mode of each cluster and counts one more iteration.
  def recompute_centroids
    @old_centroids = @centroids
    @centroids = @clusters.collect { |cluster| cluster.get_mean_or_mode }
    @iterations += 1
  end

  # Returns the index of the smallest element of array. Ties go to the
  # lowest index; 0 is returned for an empty array.
  def get_min_index(array)
    min = array.first
    index = 0
    array.each_with_index do |x, i|
      if x < min
        min = x
        index = i
      end
    end
    index
  end

end
|
|
163
|
+
end
|
|
164
|
+
end
|