ai4r 1.2 → 1.3
Sign up to get free protection for your applications and to get access to all the features.
- data/README.rdoc +12 -25
- data/examples/decision_trees/id3_example.rb +6 -9
- data/examples/decision_trees/results.txt +2 -0
- data/examples/genetic_algorithm/genetic_algorithm_example.rb +11 -13
- data/examples/neural_network/xor_example.rb +25 -0
- data/lib/ai4r.rb +10 -0
- data/lib/ai4r/classifiers/classifier.rb +46 -0
- data/lib/ai4r/classifiers/id3.rb +27 -58
- data/lib/ai4r/classifiers/one_r.rb +19 -58
- data/lib/ai4r/classifiers/prism.rb +21 -57
- data/lib/ai4r/classifiers/zero_r.rb +16 -48
- data/lib/ai4r/clusterers/bisecting_k_means.rb +115 -0
- data/lib/ai4r/clusterers/clusterer.rb +55 -0
- data/lib/ai4r/clusterers/k_means.rb +164 -0
- data/lib/ai4r/data/data_set.rb +250 -0
- data/lib/ai4r/genetic_algorithm/genetic_algorithm.rb +19 -19
- data/lib/ai4r/neural_network/backpropagation.rb +23 -24
- data/site/build/site/en/broken-links.xml +2 -0
- data/site/build/site/en/downloads.html +200 -0
- data/site/build/site/en/downloads.pdf +151 -0
- data/site/build/site/en/forum.html +197 -0
- data/site/build/site/en/forum.pdf +151 -0
- data/site/build/site/en/geneticAlgorithms.html +591 -0
- data/site/build/site/en/geneticAlgorithms.pdf +934 -0
- data/site/build/site/en/images/ai4r-logo.png +0 -0
- data/site/build/site/en/images/built-with-forrest-button.png +0 -0
- data/site/build/site/en/images/c.png +0 -0
- data/site/build/site/en/images/c_wbn.png +0 -0
- data/site/build/site/en/images/c_wn.png +0 -0
- data/site/build/site/en/images/ero.gif +0 -0
- data/site/build/site/en/images/europe2.png +0 -0
- data/site/build/site/en/images/europe3.png +0 -0
- data/site/build/site/en/images/fitness.png +0 -0
- data/site/build/site/en/images/genetic_algorithms_example.png +0 -0
- data/site/build/site/en/images/instruction_arrow.png +0 -0
- data/site/build/site/en/images/jadeferret.png +0 -0
- data/site/build/site/en/images/my_email.png +0 -0
- data/site/build/site/en/images/neural_network_example.png +0 -0
- data/site/build/site/en/images/rubyforge.png +0 -0
- data/site/build/site/en/images/s.png +0 -0
- data/site/build/site/en/images/s_wbn.png +0 -0
- data/site/build/site/en/images/s_wn.png +0 -0
- data/site/build/site/en/images/sigmoid.png +0 -0
- data/site/build/site/en/images/t.png +0 -0
- data/site/build/site/en/images/t_wbn.png +0 -0
- data/site/build/site/en/images/t_wn.png +0 -0
- data/site/build/site/en/index.html +336 -0
- data/site/build/site/en/index.pdf +508 -0
- data/site/build/site/en/linkmap.html +263 -0
- data/site/build/site/en/linkmap.pdf +94 -0
- data/site/build/site/en/locationmap.xml +72 -0
- data/site/build/site/en/machineLearning.html +339 -0
- data/site/build/site/en/machineLearning.pdf +337 -0
- data/site/build/site/en/neuralNetworks.html +484 -0
- data/site/build/site/en/neuralNetworks.pdf +604 -0
- data/site/build/site/en/skin/CommonMessages_de.xml +23 -0
- data/site/build/site/en/skin/CommonMessages_en_US.xml +23 -0
- data/site/build/site/en/skin/CommonMessages_es.xml +23 -0
- data/site/build/site/en/skin/CommonMessages_fr.xml +23 -0
- data/site/build/site/en/skin/basic.css +166 -0
- data/site/build/site/en/skin/breadcrumbs-optimized.js +90 -0
- data/site/build/site/en/skin/breadcrumbs.js +237 -0
- data/site/build/site/en/skin/fontsize.js +166 -0
- data/site/build/site/en/skin/getBlank.js +40 -0
- data/site/build/site/en/skin/getMenu.js +45 -0
- data/site/build/site/en/skin/images/README.txt +1 -0
- data/site/build/site/en/skin/images/add.jpg +0 -0
- data/site/build/site/en/skin/images/built-with-forrest-button.png +0 -0
- data/site/build/site/en/skin/images/chapter.gif +0 -0
- data/site/build/site/en/skin/images/chapter_open.gif +0 -0
- data/site/build/site/en/skin/images/current.gif +0 -0
- data/site/build/site/en/skin/images/error.png +0 -0
- data/site/build/site/en/skin/images/external-link.gif +0 -0
- data/site/build/site/en/skin/images/fix.jpg +0 -0
- data/site/build/site/en/skin/images/forrest-credit-logo.png +0 -0
- data/site/build/site/en/skin/images/hack.jpg +0 -0
- data/site/build/site/en/skin/images/header_white_line.gif +0 -0
- data/site/build/site/en/skin/images/info.png +0 -0
- data/site/build/site/en/skin/images/instruction_arrow.png +0 -0
- data/site/build/site/en/skin/images/label.gif +0 -0
- data/site/build/site/en/skin/images/page.gif +0 -0
- data/site/build/site/en/skin/images/pdfdoc.gif +0 -0
- data/site/build/site/en/skin/images/poddoc.png +0 -0
- data/site/build/site/en/skin/images/printer.gif +0 -0
- data/site/build/site/en/skin/images/rc-b-l-15-1body-2menu-3menu.png +0 -0
- data/site/build/site/en/skin/images/rc-b-r-15-1body-2menu-3menu.png +0 -0
- data/site/build/site/en/skin/images/rc-b-r-5-1header-2tab-selected-3tab-selected.png +0 -0
- data/site/build/site/en/skin/images/rc-t-l-5-1header-2searchbox-3searchbox.png +0 -0
- data/site/build/site/en/skin/images/rc-t-l-5-1header-2tab-selected-3tab-selected.png +0 -0
- data/site/build/site/en/skin/images/rc-t-l-5-1header-2tab-unselected-3tab-unselected.png +0 -0
- data/site/build/site/en/skin/images/rc-t-r-15-1body-2menu-3menu.png +0 -0
- data/site/build/site/en/skin/images/rc-t-r-5-1header-2searchbox-3searchbox.png +0 -0
- data/site/build/site/en/skin/images/rc-t-r-5-1header-2tab-selected-3tab-selected.png +0 -0
- data/site/build/site/en/skin/images/rc-t-r-5-1header-2tab-unselected-3tab-unselected.png +0 -0
- data/site/build/site/en/skin/images/remove.jpg +0 -0
- data/site/build/site/en/skin/images/rss.png +0 -0
- data/site/build/site/en/skin/images/spacer.gif +0 -0
- data/site/build/site/en/skin/images/success.png +0 -0
- data/site/build/site/en/skin/images/txtdoc.png +0 -0
- data/site/build/site/en/skin/images/update.jpg +0 -0
- data/site/build/site/en/skin/images/valid-html401.png +0 -0
- data/site/build/site/en/skin/images/vcss.png +0 -0
- data/site/build/site/en/skin/images/warning.png +0 -0
- data/site/build/site/en/skin/images/xmldoc.gif +0 -0
- data/site/build/site/en/skin/menu.js +48 -0
- data/site/build/site/en/skin/note.txt +50 -0
- data/site/build/site/en/skin/print.css +54 -0
- data/site/build/site/en/skin/profile.css +163 -0
- data/site/build/site/en/skin/prototype.js +1257 -0
- data/site/build/site/en/skin/screen.css +587 -0
- data/site/build/site/en/svn.html +252 -0
- data/site/build/site/en/svn.pdf +306 -0
- data/site/build/site/en/wholesite.pdf +1915 -0
- data/site/build/tmp/brokenlinks.xml +2 -0
- data/site/build/tmp/cocoon-work/cache-dir/cocoon-ehcache-1.data +0 -0
- data/site/build/tmp/cocoon-work/cache-dir/cocoon-ehcache-1.index +0 -0
- data/site/build/tmp/locationmap.xml +14 -14
- data/site/build/tmp/output.xmap +23 -23
- data/site/build/tmp/pluginlist2fetchbuild.xml +144 -144
- data/site/build/tmp/projfilters.properties +41 -41
- data/site/build/webapp/WEB-INF/logs/core.log +593 -679
- data/site/build/webapp/WEB-INF/logs/error.log +362 -279
- data/site/build/webapp/WEB-INF/logs/sitemap.log +368 -1015
- data/site/src/documentation/content/xdocs/index.xml +18 -10
- data/site/src/documentation/content/xdocs/machineLearning.xml +4 -3
- data/site/src/documentation/content/xdocs/site.xml +2 -1
- data/site/src/documentation/resources/images/sigmoid.png +0 -0
- data/test/classifiers/id3_test.rb +45 -44
- data/test/classifiers/one_r_test.rb +19 -17
- data/test/classifiers/prism_test.rb +22 -20
- data/test/classifiers/zero_r_test.rb +15 -12
- data/test/clusterers/bisecting_k_means_test.rb +59 -0
- data/test/clusterers/k_means_test.rb +93 -0
- data/test/data/data_set_test.rb +92 -0
- metadata +252 -128
- data/lib/ai4r/classifiers/classifier_helper.rb +0 -54
- data/site/src/documentation/content/xdocs/forum.html +0 -9
- data/site/src/documentation/resources/images/Thumbs.db +0 -0
- data/site/src/documentation/resources/images/sub-dir/Thumbs.db +0 -0
@@ -11,7 +11,8 @@
|
|
11
11
|
# J. Cendrowska (1987). PRISM: An algorithm for inducing modular rules.
|
12
12
|
# International Journal of Man-Machine Studies. 27(4):349-370.
|
13
13
|
|
14
|
-
require File.dirname(__FILE__) + '/
|
14
|
+
require File.dirname(__FILE__) + '/../data/data_set'
|
15
|
+
require File.dirname(__FILE__) + '/../classifiers/classifier'
|
15
16
|
|
16
17
|
module Ai4r
|
17
18
|
module Classifiers
|
@@ -23,50 +24,17 @@ module Ai4r
|
|
23
24
|
#
|
24
25
|
# J. Cendrowska (1987). PRISM: An algorithm for inducing modular rules.
|
25
26
|
# International Journal of Man-Machine Studies. 27(4):349-370.
|
26
|
-
class Prism
|
27
|
+
class Prism < Classifier
|
27
28
|
|
28
|
-
|
29
|
-
include ClassifierHelper
|
29
|
+
attr_reader :data_set, :rules
|
30
30
|
|
31
|
-
# Build a new Prism classifier.
|
32
|
-
#
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
# ]
|
39
|
-
#
|
40
|
-
# e.g.
|
41
|
-
# [ ['New York', '<30', 'M', 'Y'],
|
42
|
-
# ['Chicago', '<30', 'M', 'Y'],
|
43
|
-
# ['Chicago', '<30', 'F', 'Y'],
|
44
|
-
# ['New York', '<30', 'M', 'Y'],
|
45
|
-
# ['New York', '<30', 'M', 'Y'],
|
46
|
-
# ['Chicago', '[30-50)', 'M', 'Y'],
|
47
|
-
# ['New York', '[30-50)', 'F', 'N'],
|
48
|
-
# ['Chicago', '[30-50)', 'F', 'Y'],
|
49
|
-
# ['New York', '[30-50)', 'F', 'N'],
|
50
|
-
# ['Chicago', '[50-80]', 'M', 'N'],
|
51
|
-
# ['New York', '[50-80]', 'F', 'N'],
|
52
|
-
# ['New York', '[50-80]', 'M', 'N'],
|
53
|
-
# ['Chicago', '[50-80]', 'M', 'N'],
|
54
|
-
# ['New York', '[50-80]', 'F', 'N'],
|
55
|
-
# ['Chicago', '>80', 'F', 'Y']
|
56
|
-
# ]
|
57
|
-
#
|
58
|
-
# Data labels must have the following format:
|
59
|
-
# [ 'city', 'age_range', 'gender', 'marketing_target' ]
|
60
|
-
#
|
61
|
-
# If you do not provide labels for you data, the following labels will
|
62
|
-
# be created by default:
|
63
|
-
# [ 'attribute_1', 'attribute_2', 'attribute_3', 'class_value' ]
|
64
|
-
#
|
65
|
-
def build(data_examples, data_labels=nil)
|
66
|
-
check_data_examples(data_examples)
|
67
|
-
@data_labels = (data_labels) ? data_labels : default_data_labels(data_examples)
|
68
|
-
domains = build_domains(data_examples)
|
69
|
-
instances = data_examples.collect {|data| data }
|
31
|
+
# Build a new Prism classifier. You must provide a DataSet instance
|
32
|
+
# as parameter.
|
33
|
+
def build(data_set)
|
34
|
+
data_set.check_not_empty
|
35
|
+
@data_set = data_set
|
36
|
+
domains = @data_set.build_domains
|
37
|
+
instances = @data_set.data_items.collect {|data| data }
|
70
38
|
@rules = []
|
71
39
|
domains.last.each do |class_value|
|
72
40
|
while(has_class_value(instances, class_value))
|
@@ -91,7 +59,7 @@ module Ai4r
|
|
91
59
|
# This method returns the generated rules in ruby code.
|
92
60
|
# e.g.
|
93
61
|
#
|
94
|
-
# classifier.
|
62
|
+
# classifier.get_rules
|
95
63
|
# # => if age_range == '<30' then marketing_target = 'Y'
|
96
64
|
# elsif age_range == '>80' then marketing_target = 'Y'
|
97
65
|
# elsif city == 'Chicago' and age_range == '[30-50)' then marketing_target = 'Y'
|
@@ -101,10 +69,10 @@ module Ai4r
|
|
101
69
|
# It is a nice way to inspect induction results, and also to execute them:
|
102
70
|
# age_range = '[30-50)'
|
103
71
|
# city = 'New York'
|
104
|
-
# eval(classifier.
|
72
|
+
# eval(classifier.get_rules)
|
105
73
|
# puts marketing_target
|
106
74
|
# 'Y'
|
107
|
-
def
|
75
|
+
def get_rules
|
108
76
|
out = "if #{join_terms(@rules.first)} then #{then_clause(@rules.first)}"
|
109
77
|
@rules[1...-1].each do |rule|
|
110
78
|
out += "\nelsif #{join_terms(rule)} then #{then_clause(rule)}"
|
@@ -116,6 +84,10 @@ module Ai4r
|
|
116
84
|
|
117
85
|
protected
|
118
86
|
|
87
|
+
def get_attr_value(data, attr)
|
88
|
+
data[@data_set.get_index(attr)]
|
89
|
+
end
|
90
|
+
|
119
91
|
def has_class_value(instances, class_value)
|
120
92
|
instances.each { |data| return true if data.last == class_value}
|
121
93
|
return false
|
@@ -131,23 +103,15 @@ module Ai4r
|
|
131
103
|
|
132
104
|
def matches_conditions(data, conditions)
|
133
105
|
conditions.each_pair do |attr_label, attr_value|
|
134
|
-
return false if data
|
106
|
+
return false if get_attr_value(data, attr_label) != attr_value
|
135
107
|
end
|
136
108
|
return true
|
137
109
|
end
|
138
110
|
|
139
|
-
def get_attr_index(attr_label)
|
140
|
-
return @data_labels.index(attr_label)
|
141
|
-
end
|
142
|
-
|
143
|
-
def get_attr_value(data, attr_label)
|
144
|
-
return data[get_attr_index(attr_label)]
|
145
|
-
end
|
146
|
-
|
147
111
|
def build_rule(class_value, instances)
|
148
112
|
rule = {:class_value => class_value, :conditions => {}}
|
149
113
|
rule_instances = instances.collect {|data| data }
|
150
|
-
attributes = @data_labels[0...-1].collect {|label| label }
|
114
|
+
attributes = @data_set.data_labels[0...-1].collect {|label| label }
|
151
115
|
until(is_perfect(instances, rule) || attributes.empty?)
|
152
116
|
freq_table = build_freq_table(rule_instances, attributes, class_value)
|
153
117
|
condition = get_condition(freq_table)
|
@@ -223,7 +187,7 @@ module Ai4r
|
|
223
187
|
end
|
224
188
|
|
225
189
|
def then_clause(rule)
|
226
|
-
"#{@data_labels.last} = '#{rule[:class_value]}'"
|
190
|
+
"#{@data_set.data_labels.last} = '#{rule[:class_value]}'"
|
227
191
|
end
|
228
192
|
|
229
193
|
end
|
@@ -7,10 +7,12 @@
|
|
7
7
|
# the Mozilla Public License version 1.1 as published by the
|
8
8
|
# Mozilla Foundation at http://www.mozilla.org/MPL/MPL-1.1.txt
|
9
9
|
|
10
|
-
require File.dirname(__FILE__) + '/
|
10
|
+
require File.dirname(__FILE__) + '/../data/data_set.rb'
|
11
|
+
require File.dirname(__FILE__) + '/../classifiers/classifier'
|
11
12
|
|
12
13
|
module Ai4r
|
13
14
|
module Classifiers
|
15
|
+
|
14
16
|
# = Introduction
|
15
17
|
#
|
16
18
|
# The idea behind the ZeroR classifier is to identify the
|
@@ -18,53 +20,19 @@ module Ai4r
|
|
18
20
|
# It always returns that value when evaluating an instance.
|
19
21
|
# It is frequently used as a baseline for evaluating other machine learning
|
20
22
|
# algorithms.
|
21
|
-
class ZeroR
|
23
|
+
class ZeroR < Classifier
|
22
24
|
|
23
|
-
|
24
|
-
|
25
|
-
include ClassifierHelper
|
25
|
+
attr_reader :data_set, :class_value
|
26
26
|
|
27
|
-
# Build a new ZeroR classifier.
|
28
|
-
#
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
# ...
|
33
|
-
# [ATTM1_VALM, ATT2_VALM, ATT3_VALM, ... , ATTN_VALM, CLASS_VALM],
|
34
|
-
# ]
|
35
|
-
#
|
36
|
-
# e.g.
|
37
|
-
# [ ['New York', '<30', 'M', 'Y'],
|
38
|
-
# ['Chicago', '<30', 'M', 'Y'],
|
39
|
-
# ['Chicago', '<30', 'F', 'Y'],
|
40
|
-
# ['New York', '<30', 'M', 'Y'],
|
41
|
-
# ['New York', '<30', 'M', 'Y'],
|
42
|
-
# ['Chicago', '[30-50)', 'M', 'Y'],
|
43
|
-
# ['New York', '[30-50)', 'F', 'N'],
|
44
|
-
# ['Chicago', '[30-50)', 'F', 'Y'],
|
45
|
-
# ['New York', '[30-50)', 'F', 'N'],
|
46
|
-
# ['Chicago', '[50-80]', 'M', 'N'],
|
47
|
-
# ['New York', '[50-80]', 'F', 'N'],
|
48
|
-
# ['New York', '[50-80]', 'M', 'N'],
|
49
|
-
# ['Chicago', '[50-80]', 'M', 'N'],
|
50
|
-
# ['New York', '[50-80]', 'F', 'N'],
|
51
|
-
# ['Chicago', '>80', 'F', 'Y']
|
52
|
-
# ]
|
53
|
-
#
|
54
|
-
# Data labels must have the following format:
|
55
|
-
# [ 'city', 'age_range', 'gender', 'marketing_target' ]
|
56
|
-
#
|
57
|
-
# If you do not provide labels for you data, the following labels will
|
58
|
-
# be created by default:
|
59
|
-
# [ 'attribute_1', 'attribute_2', 'attribute_3', 'class_value' ]
|
60
|
-
#
|
61
|
-
def build(data_examples, data_labels=nil)
|
62
|
-
check_data_examples(data_examples)
|
63
|
-
@data_labels = (data_labels) ? data_labels : default_data_labels(data_examples)
|
27
|
+
# Build a new ZeroR classifier. You must provide a DataSet instance
|
28
|
+
# as parameter.
|
29
|
+
def build(data_set)
|
30
|
+
data_set.check_not_empty
|
31
|
+
@data_set = data_set
|
64
32
|
frequence = {}
|
65
33
|
max_freq = 0
|
66
|
-
@class_value
|
67
|
-
|
34
|
+
@class_value = nil
|
35
|
+
@data_set.data_items.each do |example|
|
68
36
|
class_value = example.last
|
69
37
|
class_frequency = frequence[class_value]
|
70
38
|
class_frequency = (class_frequency) ? class_frequency+1 : 1
|
@@ -86,16 +54,16 @@ module Ai4r
|
|
86
54
|
# This method returns the generated rules in ruby code.
|
87
55
|
# e.g.
|
88
56
|
#
|
89
|
-
# classifier.
|
57
|
+
# classifier.get_rules
|
90
58
|
# # => marketing_target='Y'
|
91
59
|
#
|
92
60
|
# It is a nice way to inspect induction results, and also to execute them:
|
93
61
|
# marketing_target = nil
|
94
|
-
# eval classifier.
|
62
|
+
# eval classifier.get_rules
|
95
63
|
# puts marketing_target
|
96
64
|
# # => 'Y'
|
97
|
-
def
|
98
|
-
return "#{@data_labels.last} = '#{@class_value}'"
|
65
|
+
def get_rules
|
66
|
+
return "#{@data_set.data_labels.last} = '#{@class_value}'"
|
99
67
|
end
|
100
68
|
|
101
69
|
end
|
@@ -0,0 +1,115 @@
|
|
1
|
+
# Author:: Sergio Fierens (implementation)
|
2
|
+
# License:: MPL 1.1
|
3
|
+
# Project:: ai4r
|
4
|
+
# Url:: http://ai4r.rubyforge.org/
|
5
|
+
#
|
6
|
+
# You can redistribute it and/or modify it under the terms of
|
7
|
+
# the Mozilla Public License version 1.1 as published by the
|
8
|
+
# Mozilla Foundation at http://www.mozilla.org/MPL/MPL-1.1.txt
|
9
|
+
|
10
|
+
require "set"
|
11
|
+
require File.dirname(__FILE__) + '/../data/data_set'
|
12
|
+
require File.dirname(__FILE__) + '/../clusterers/k_means'
|
13
|
+
|
14
|
+
module Ai4r
|
15
|
+
module Clusterers
|
16
|
+
|
17
|
+
# The Bisecting k-means algorithm is a variation of the "k-means" algorithm,
|
18
|
+
# somewhat less sensitive to the initial selection of centroids than the
|
19
|
+
# original.
|
20
|
+
#
|
21
|
+
# More about K Means algorithm:
|
22
|
+
# http://en.wikipedia.org/wiki/K-means_algorithm
|
23
|
+
class BisectingKMeans < KMeans
|
24
|
+
|
25
|
+
attr_reader :data_set, :number_of_clusters, :clusters, :centroids
|
26
|
+
attr_accessor :max_iterations, :distance_function, :refine
|
27
|
+
|
28
|
+
def intialize
|
29
|
+
@refine = true
|
30
|
+
end
|
31
|
+
|
32
|
+
# Build a new clusterer, using data examples found in data_set.
|
33
|
+
# Items will be clustered in "number_of_clusters" different
|
34
|
+
# clusters.
|
35
|
+
def build(data_set, number_of_clusters)
|
36
|
+
@data_set = data_set
|
37
|
+
@number_of_clusters = number_of_clusters
|
38
|
+
|
39
|
+
@clusters = [@data_set]
|
40
|
+
@centroids = [@data_set.get_mean_or_mode]
|
41
|
+
while @clusters.length < @number_of_clusters
|
42
|
+
biggest_cluster_index = find_biggest_cluster_index(@clusters)
|
43
|
+
clusterer = KMeans.new.
|
44
|
+
set_parameters(get_parameters).
|
45
|
+
build(@clusters[biggest_cluster_index], 2)
|
46
|
+
@clusters.delete_at(biggest_cluster_index)
|
47
|
+
@centroids.delete_at(biggest_cluster_index)
|
48
|
+
@clusters.concat(clusterer.clusters)
|
49
|
+
@centroids.concat(clusterer.centroids)
|
50
|
+
end
|
51
|
+
|
52
|
+
super if @refine
|
53
|
+
|
54
|
+
return self
|
55
|
+
end
|
56
|
+
|
57
|
+
# Get info on what can be parameterized on this clusterer algorithm.
|
58
|
+
# It returns a hash with the following format:
|
59
|
+
# { :param_name => "Info on the parameter" }
|
60
|
+
def get_parameters_info
|
61
|
+
{ :max_iterations => "Maximum number of iterations used to bisect a " +
|
62
|
+
"cluster. By default it is uncapped.",
|
63
|
+
:distance_function => "Custom implementation of distance function. " +
|
64
|
+
"It must be a closure receiving two data items and return the " +
|
65
|
+
"distance bewteen them. By default, this algorithm uses " +
|
66
|
+
"ecuclidean distance of numeric attributes to the power of 2.",
|
67
|
+
:refine => "Boolean value. True by default. It will run the " +
|
68
|
+
"classic K Means algorithm, using as initial centroids the " +
|
69
|
+
"result of the bisecting approach."
|
70
|
+
}
|
71
|
+
end
|
72
|
+
|
73
|
+
# Set parameters on this clusterer instance.
|
74
|
+
# You must provide a hash with the following format:
|
75
|
+
# { :param_name => parameter_value }
|
76
|
+
#
|
77
|
+
# Use get_parameters_info to know what parameters are accepted.
|
78
|
+
def set_parameters(parameters)
|
79
|
+
super
|
80
|
+
if parameters.has_key?(:refine)
|
81
|
+
@refine = parameters[:refine]
|
82
|
+
end
|
83
|
+
return self
|
84
|
+
end
|
85
|
+
|
86
|
+
# Get parameter values on this clusterer instance.
|
87
|
+
# Returns a hash with the following format:
|
88
|
+
# { :param_name => parameter_value }
|
89
|
+
def get_parameters
|
90
|
+
params = super
|
91
|
+
params[:refine] = @refine
|
92
|
+
return params
|
93
|
+
end
|
94
|
+
|
95
|
+
protected
|
96
|
+
def calc_initial_centroids
|
97
|
+
@centroids # Use existing centroids
|
98
|
+
end
|
99
|
+
|
100
|
+
def find_biggest_cluster_index(clusters)
|
101
|
+
max_index = 0
|
102
|
+
max_length = 0
|
103
|
+
clusters.each_index do |cluster_index|
|
104
|
+
cluster = clusters[cluster_index]
|
105
|
+
if max_length < cluster.data_items.length
|
106
|
+
max_length = cluster.data_items.length
|
107
|
+
max_index = cluster_index
|
108
|
+
end
|
109
|
+
end
|
110
|
+
return max_index
|
111
|
+
end
|
112
|
+
|
113
|
+
end
|
114
|
+
end
|
115
|
+
end
|
@@ -0,0 +1,55 @@
|
|
1
|
+
# Author:: Sergio Fierens
|
2
|
+
# License:: MPL 1.1
|
3
|
+
# Project:: ai4r
|
4
|
+
# Url:: http://ai4r.rubyforge.org/
|
5
|
+
#
|
6
|
+
# You can redistribute it and/or modify it under the terms of
|
7
|
+
# the Mozilla Public License version 1.1 as published by the
|
8
|
+
# Mozilla Foundation at http://www.mozilla.org/MPL/MPL-1.1.txt
|
9
|
+
|
10
|
+
module Ai4r
|
11
|
+
module Clusterers
|
12
|
+
|
13
|
+
# The purpose of this class is to define a common API for Clusterers.
|
14
|
+
# All methods in this class (other than eval) must be implemented in
|
15
|
+
# subclasses.
|
16
|
+
class Clusterer
|
17
|
+
|
18
|
+
# Build a new clusterer, using data examples found in data_set.
|
19
|
+
# Data items will be clustered in "number_of_clusters" different
|
20
|
+
# clusters.
|
21
|
+
def build(data_set, number_of_clusters)
|
22
|
+
raise NotImplementedError
|
23
|
+
end
|
24
|
+
|
25
|
+
# Classifies the given data item, returning the cluster it belongs to.
|
26
|
+
def eval(data_item)
|
27
|
+
raise NotImplementedError
|
28
|
+
end
|
29
|
+
|
30
|
+
# Get info on what can be parameterized on this clusterer.
|
31
|
+
# It returns a hash with the following format:
|
32
|
+
# { :param_name => "Info on the parameter" }
|
33
|
+
def get_parameters_info
|
34
|
+
raise NotImplementedError
|
35
|
+
end
|
36
|
+
|
37
|
+
# Set parameter values on this clusterer instance.
|
38
|
+
# You must provide a hash with the following format:
|
39
|
+
# { :param_name => parameter_value }
|
40
|
+
def set_parameters(parameters)
|
41
|
+
raise NotImplementedError
|
42
|
+
end
|
43
|
+
|
44
|
+
# Get parameter values on this clusterer instance.
|
45
|
+
# Returns a hash with the following format:
|
46
|
+
# { :param_name => parameter_value }
|
47
|
+
def get_parameters
|
48
|
+
raise NotImplementedError
|
49
|
+
end
|
50
|
+
|
51
|
+
|
52
|
+
end
|
53
|
+
|
54
|
+
end
|
55
|
+
end
|
@@ -0,0 +1,164 @@
|
|
1
|
+
# Author:: Sergio Fierens (implementation)
|
2
|
+
# License:: MPL 1.1
|
3
|
+
# Project:: ai4r
|
4
|
+
# Url:: http://ai4r.rubyforge.org/
|
5
|
+
#
|
6
|
+
# You can redistribute it and/or modify it under the terms of
|
7
|
+
# the Mozilla Public License version 1.1 as published by the
|
8
|
+
# Mozilla Foundation at http://www.mozilla.org/MPL/MPL-1.1.txt
|
9
|
+
|
10
|
+
require "set"
|
11
|
+
require File.dirname(__FILE__) + '/../data/data_set'
|
12
|
+
require File.dirname(__FILE__) + '/../clusterers/clusterer'
|
13
|
+
|
14
|
+
module Ai4r
|
15
|
+
module Clusterers
|
16
|
+
|
17
|
+
# The k-means algorithm is an algorithm to cluster n objects
|
18
|
+
# based on attributes into k partitions, with k < n.
|
19
|
+
#
|
20
|
+
# More about K Means algorithm:
|
21
|
+
# http://en.wikipedia.org/wiki/K-means_algorithm
|
22
|
+
class KMeans < Clusterer
|
23
|
+
|
24
|
+
attr_reader :data_set, :number_of_clusters
|
25
|
+
attr_reader :clusters, :centroids, :iterations
|
26
|
+
attr_accessor :max_iterations
|
27
|
+
attr_accessor :distance_function
|
28
|
+
|
29
|
+
# Build a new clusterer, using data examples found in data_set.
|
30
|
+
# Items will be clustered in "number_of_clusters" different
|
31
|
+
# clusters.
|
32
|
+
def build(data_set, number_of_clusters)
|
33
|
+
@data_set = data_set
|
34
|
+
@number_of_clusters = number_of_clusters
|
35
|
+
@iterations = 0
|
36
|
+
|
37
|
+
calc_initial_centroids
|
38
|
+
while(not stop_criteria_met)
|
39
|
+
calculate_membership_clusters
|
40
|
+
recompute_centroids
|
41
|
+
end
|
42
|
+
|
43
|
+
return self
|
44
|
+
end
|
45
|
+
|
46
|
+
# Classifies the given data item, returning the cluster index it belongs
|
47
|
+
# to (0-based).
|
48
|
+
def eval(data_item)
|
49
|
+
get_min_index(@centroids.collect {|centroid|
|
50
|
+
distance(data_item, centroid)})
|
51
|
+
end
|
52
|
+
|
53
|
+
# Get info on what can be parameterized on this clusterer algorithm.
|
54
|
+
# It returns a hash with the following format:
|
55
|
+
# { :param_name => "Info on the parameter" }
|
56
|
+
def get_parameters_info
|
57
|
+
{ :max_iterations => "Maximum number of iterations to build the " +
|
58
|
+
"clusterer. By default it is uncapped.",
|
59
|
+
:distance_function => "Custom implementation of distance function. " +
|
60
|
+
"It must be a closure receiving two data items and return the " +
|
61
|
+
"distance bewteen them. By default, this algorithm uses " +
|
62
|
+
"ecuclidean distance of numeric attributes to the power of 2."
|
63
|
+
}
|
64
|
+
end
|
65
|
+
|
66
|
+
# Set parameters on this clusterer instance.
|
67
|
+
# You must provide a hash with the following format:
|
68
|
+
# { :param_name => parameter_value }
|
69
|
+
#
|
70
|
+
# Use get_parameters_info to know what parameters are accepted.
|
71
|
+
def set_parameters(parameters)
|
72
|
+
if parameters.has_key?(:max_iterations)
|
73
|
+
@max_iterations = parameters[:max_iterations]
|
74
|
+
end
|
75
|
+
if parameters.has_key?(:distance_function)
|
76
|
+
@distance_function = parameters[:distance_function]
|
77
|
+
end
|
78
|
+
return self
|
79
|
+
end
|
80
|
+
|
81
|
+
# Get parameter values on this clusterer instance.
|
82
|
+
# Returns a hash with the following format:
|
83
|
+
# { :param_name => parameter_value }
|
84
|
+
def get_parameters
|
85
|
+
{ :max_iterations => @max_iterations,
|
86
|
+
:distance_function => @distance_function }
|
87
|
+
end
|
88
|
+
|
89
|
+
# This function calculates the distance between 2 different
|
90
|
+
# instances. By default, it returns the euclidean distance to the
|
91
|
+
# power of 2.
|
92
|
+
# You can provide a more convenient distance implementation:
|
93
|
+
#
|
94
|
+
# 1- Overwriting this method
|
95
|
+
#
|
96
|
+
# 2- Providing a closure to the :distance_function parameter
|
97
|
+
def distance(a, b)
|
98
|
+
return @distance_function.call(a, b) if @distance_function
|
99
|
+
return euclidean_distance(a, b)
|
100
|
+
end
|
101
|
+
|
102
|
+
protected
|
103
|
+
def euclidean_distance(a, b)
|
104
|
+
dist = 0.0
|
105
|
+
a.each_index do |index|
|
106
|
+
if a[index].is_a?(Numeric) && b[index].is_a?(Numeric)
|
107
|
+
dist = dist + ((a[index]-b[index])*(a[index]-b[index]))
|
108
|
+
end
|
109
|
+
end
|
110
|
+
return dist
|
111
|
+
end
|
112
|
+
|
113
|
+
def calc_initial_centroids
|
114
|
+
@centroids = []
|
115
|
+
tried_indexes = []
|
116
|
+
while @centroids.length < @number_of_clusters &&
|
117
|
+
tried_indexes.length < @data_set.data_items.length
|
118
|
+
random_index = rand(@data_set.data_items.length)
|
119
|
+
if !tried_indexes.include?(random_index)
|
120
|
+
tried_indexes << random_index
|
121
|
+
if !@centroids.include? @data_set.data_items[random_index]
|
122
|
+
@centroids << @data_set.data_items[random_index]
|
123
|
+
end
|
124
|
+
end
|
125
|
+
end
|
126
|
+
@number_of_clusters = @centroids.length
|
127
|
+
end
|
128
|
+
|
129
|
+
def stop_criteria_met
|
130
|
+
@old_centroids == @centroids ||
|
131
|
+
(@max_iterations && (@max_iterations <= @iterations))
|
132
|
+
end
|
133
|
+
|
134
|
+
def calculate_membership_clusters
|
135
|
+
@clusters = Array.new(@number_of_clusters) do
|
136
|
+
Ai4r::Data::DataSet.new :data_labels => @data_set.data_labels
|
137
|
+
end
|
138
|
+
@data_set.data_items.each do |data_item|
|
139
|
+
@clusters[eval(data_item)] << data_item
|
140
|
+
end
|
141
|
+
end
|
142
|
+
|
143
|
+
def recompute_centroids
|
144
|
+
@old_centroids = @centroids
|
145
|
+
@centroids = @clusters.collect { |cluster| cluster.get_mean_or_mode }
|
146
|
+
@iterations += 1
|
147
|
+
end
|
148
|
+
|
149
|
+
def get_min_index(array)
|
150
|
+
min = array.first
|
151
|
+
index = 0
|
152
|
+
array.each_index do |i|
|
153
|
+
x = array[i]
|
154
|
+
if x < min
|
155
|
+
min = x
|
156
|
+
index = i
|
157
|
+
end
|
158
|
+
end
|
159
|
+
return index
|
160
|
+
end
|
161
|
+
|
162
|
+
end
|
163
|
+
end
|
164
|
+
end
|