nirvdrum-ai4r 1.9.1
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +1 -0
- data/.rakeTasks +7 -0
- data/README.rdoc +56 -0
- data/Rakefile.rb +42 -0
- data/VERSION +1 -0
- data/ai4r.gemspec +221 -0
- data/change_log +49 -0
- data/examples/classifiers/id3_data.csv +121 -0
- data/examples/classifiers/id3_example.rb +29 -0
- data/examples/classifiers/naive_bayes_data.csv +11 -0
- data/examples/classifiers/naive_bayes_example.rb +16 -0
- data/examples/classifiers/results.txt +31 -0
- data/examples/genetic_algorithm/genetic_algorithm_example.rb +37 -0
- data/examples/genetic_algorithm/travel_cost.csv +16 -0
- data/examples/neural_network/backpropagation_example.rb +67 -0
- data/examples/neural_network/patterns_with_base_noise.rb +68 -0
- data/examples/neural_network/patterns_with_noise.rb +66 -0
- data/examples/neural_network/training_patterns.rb +68 -0
- data/examples/neural_network/xor_example.rb +35 -0
- data/examples/som/som_data.rb +156 -0
- data/examples/som/som_multi_node_example.rb +22 -0
- data/examples/som/som_single_example.rb +24 -0
- data/lib/ai4r.rb +32 -0
- data/lib/ai4r/classifiers/classifier.rb +59 -0
- data/lib/ai4r/classifiers/hyperpipes.rb +118 -0
- data/lib/ai4r/classifiers/id3.rb +326 -0
- data/lib/ai4r/classifiers/multilayer_perceptron.rb +135 -0
- data/lib/ai4r/classifiers/naive_bayes.rb +259 -0
- data/lib/ai4r/classifiers/one_r.rb +110 -0
- data/lib/ai4r/classifiers/prism.rb +197 -0
- data/lib/ai4r/classifiers/zero_r.rb +73 -0
- data/lib/ai4r/clusterers/average_linkage.rb +59 -0
- data/lib/ai4r/clusterers/bisecting_k_means.rb +93 -0
- data/lib/ai4r/clusterers/centroid_linkage.rb +66 -0
- data/lib/ai4r/clusterers/clusterer.rb +61 -0
- data/lib/ai4r/clusterers/complete_linkage.rb +67 -0
- data/lib/ai4r/clusterers/diana.rb +139 -0
- data/lib/ai4r/clusterers/k_means.rb +126 -0
- data/lib/ai4r/clusterers/median_linkage.rb +61 -0
- data/lib/ai4r/clusterers/single_linkage.rb +194 -0
- data/lib/ai4r/clusterers/ward_linkage.rb +64 -0
- data/lib/ai4r/clusterers/weighted_average_linkage.rb +61 -0
- data/lib/ai4r/data/data_set.rb +266 -0
- data/lib/ai4r/data/parameterizable.rb +64 -0
- data/lib/ai4r/data/proximity.rb +100 -0
- data/lib/ai4r/data/statistics.rb +77 -0
- data/lib/ai4r/experiment/classifier_evaluator.rb +95 -0
- data/lib/ai4r/genetic_algorithm/genetic_algorithm.rb +270 -0
- data/lib/ai4r/neural_network/backpropagation.rb +293 -0
- data/lib/ai4r/neural_network/hopfield.rb +149 -0
- data/lib/ai4r/som/layer.rb +68 -0
- data/lib/ai4r/som/node.rb +96 -0
- data/lib/ai4r/som/som.rb +155 -0
- data/lib/ai4r/som/two_phase_layer.rb +90 -0
- data/site/forrest.properties +152 -0
- data/site/forrest.properties.dispatcher.properties +25 -0
- data/site/forrest.properties.xml +29 -0
- data/site/src/documentation/README.txt +7 -0
- data/site/src/documentation/classes/CatalogManager.properties +62 -0
- data/site/src/documentation/content/locationmap.xml +72 -0
- data/site/src/documentation/content/xdocs/downloads.html +9 -0
- data/site/src/documentation/content/xdocs/geneticAlgorithms.xml +294 -0
- data/site/src/documentation/content/xdocs/index.xml +155 -0
- data/site/src/documentation/content/xdocs/machineLearning.xml +131 -0
- data/site/src/documentation/content/xdocs/neuralNetworks.xml +270 -0
- data/site/src/documentation/content/xdocs/site.xml +54 -0
- data/site/src/documentation/content/xdocs/sourceCode.xml +43 -0
- data/site/src/documentation/content/xdocs/tabs.xml +35 -0
- data/site/src/documentation/resources/images/ai4r-logo.png +0 -0
- data/site/src/documentation/resources/images/c.png +0 -0
- data/site/src/documentation/resources/images/c_wbn.png +0 -0
- data/site/src/documentation/resources/images/c_wn.png +0 -0
- data/site/src/documentation/resources/images/ellipse-2.svg +30 -0
- data/site/src/documentation/resources/images/ero.gif +0 -0
- data/site/src/documentation/resources/images/europe2.png +0 -0
- data/site/src/documentation/resources/images/europe3.png +0 -0
- data/site/src/documentation/resources/images/fitness.png +0 -0
- data/site/src/documentation/resources/images/genetic_algorithms_example.png +0 -0
- data/site/src/documentation/resources/images/icon-a.png +0 -0
- data/site/src/documentation/resources/images/icon-b.png +0 -0
- data/site/src/documentation/resources/images/icon.png +0 -0
- data/site/src/documentation/resources/images/jadeferret.png +0 -0
- data/site/src/documentation/resources/images/my_email.png +0 -0
- data/site/src/documentation/resources/images/neural_network_example.png +0 -0
- data/site/src/documentation/resources/images/project-logo.png +0 -0
- data/site/src/documentation/resources/images/rubyforge.png +0 -0
- data/site/src/documentation/resources/images/s.png +0 -0
- data/site/src/documentation/resources/images/s_wbn.png +0 -0
- data/site/src/documentation/resources/images/s_wn.png +0 -0
- data/site/src/documentation/resources/images/sigmoid.png +0 -0
- data/site/src/documentation/resources/images/sub-dir/icon-c.png +0 -0
- data/site/src/documentation/resources/images/t.png +0 -0
- data/site/src/documentation/resources/images/t_wbn.png +0 -0
- data/site/src/documentation/resources/images/t_wn.png +0 -0
- data/site/src/documentation/resources/schema/catalog.xcat +29 -0
- data/site/src/documentation/resources/schema/hello-v10.dtd +51 -0
- data/site/src/documentation/resources/schema/symbols-project-v10.ent +26 -0
- data/site/src/documentation/resources/stylesheets/hello2document.xsl +33 -0
- data/site/src/documentation/sitemap.xmap +66 -0
- data/site/src/documentation/skinconf.xml +418 -0
- data/site/src/documentation/translations/langcode.xml +29 -0
- data/site/src/documentation/translations/languages_de.xml +24 -0
- data/site/src/documentation/translations/languages_en.xml +24 -0
- data/site/src/documentation/translations/languages_es.xml +22 -0
- data/site/src/documentation/translations/languages_fr.xml +24 -0
- data/site/src/documentation/translations/languages_nl.xml +24 -0
- data/site/src/documentation/translations/menu.xml +33 -0
- data/site/src/documentation/translations/menu_af.xml +33 -0
- data/site/src/documentation/translations/menu_de.xml +33 -0
- data/site/src/documentation/translations/menu_es.xml +33 -0
- data/site/src/documentation/translations/menu_fr.xml +33 -0
- data/site/src/documentation/translations/menu_it.xml +33 -0
- data/site/src/documentation/translations/menu_nl.xml +33 -0
- data/site/src/documentation/translations/menu_no.xml +33 -0
- data/site/src/documentation/translations/menu_ru.xml +33 -0
- data/site/src/documentation/translations/menu_sk.xml +33 -0
- data/site/src/documentation/translations/tabs.xml +22 -0
- data/site/src/documentation/translations/tabs_de.xml +22 -0
- data/site/src/documentation/translations/tabs_es.xml +22 -0
- data/site/src/documentation/translations/tabs_fr.xml +22 -0
- data/site/src/documentation/translations/tabs_nl.xml +22 -0
- data/test/classifiers/hyperpipes_test.rb +84 -0
- data/test/classifiers/id3_test.rb +208 -0
- data/test/classifiers/multilayer_perceptron_test.rb +79 -0
- data/test/classifiers/naive_bayes_test.rb +43 -0
- data/test/classifiers/one_r_test.rb +62 -0
- data/test/classifiers/prism_test.rb +85 -0
- data/test/classifiers/zero_r_test.rb +50 -0
- data/test/clusterers/average_linkage_test.rb +51 -0
- data/test/clusterers/bisecting_k_means_test.rb +66 -0
- data/test/clusterers/centroid_linkage_test.rb +53 -0
- data/test/clusterers/complete_linkage_test.rb +57 -0
- data/test/clusterers/diana_test.rb +69 -0
- data/test/clusterers/k_means_test.rb +100 -0
- data/test/clusterers/median_linkage_test.rb +53 -0
- data/test/clusterers/single_linkage_test.rb +122 -0
- data/test/clusterers/ward_linkage_test.rb +53 -0
- data/test/clusterers/weighted_average_linkage_test.rb +53 -0
- data/test/data/data_set.csv +121 -0
- data/test/data/data_set_test.rb +96 -0
- data/test/data/proximity_test.rb +81 -0
- data/test/data/statistics_data_set.csv +5 -0
- data/test/data/statistics_test.rb +65 -0
- data/test/experiment/classifier_evaluator_test.rb +76 -0
- data/test/genetic_algorithm/chromosome_test.rb +58 -0
- data/test/genetic_algorithm/genetic_algorithm_test.rb +81 -0
- data/test/neural_network/backpropagation_test.rb +69 -0
- data/test/neural_network/hopfield_test.rb +72 -0
- data/test/som/som_test.rb +97 -0
- metadata +238 -0
@@ -0,0 +1,197 @@
|
|
1
|
+
# Author:: Sergio Fierens (Implementation only, Cendrowska is
|
2
|
+
# the creator of the algorithm)
|
3
|
+
# License:: MPL 1.1
|
4
|
+
# Project:: ai4r
|
5
|
+
# Url:: http://ai4r.rubyforge.org/
|
6
|
+
#
|
7
|
+
# You can redistribute it and/or modify it under the terms of
|
8
|
+
# the Mozilla Public License version 1.1 as published by the
|
9
|
+
# Mozilla Foundation at http://www.mozilla.org/MPL/MPL-1.1.txt
|
10
|
+
#
|
11
|
+
# J. Cendrowska (1987). PRISM: An algorithm for inducing modular rules.
|
12
|
+
# International Journal of Man-Machine Studies. 27(4):349-370.
|
13
|
+
|
14
|
+
require File.dirname(__FILE__) + '/../data/data_set'
|
15
|
+
require File.dirname(__FILE__) + '/../classifiers/classifier'
|
16
|
+
|
17
|
+
module Ai4r
|
18
|
+
module Classifiers
|
19
|
+
|
20
|
+
# = Introduction
|
21
|
+
# This is an implementation of the PRISM algorithm (Cendrowska, 1987)
|
22
|
+
# Given a set of preclassified examples, it builds a set of rules
|
23
|
+
# to predict the class of other instances.
|
24
|
+
#
|
25
|
+
# J. Cendrowska (1987). PRISM: An algorithm for inducing modular rules.
|
26
|
+
# International Journal of Man-Machine Studies. 27(4):349-370.
|
27
|
+
class Prism < Classifier
|
28
|
+
|
29
|
+
attr_reader :data_set, :rules
|
30
|
+
|
31
|
+
# Build a new Prism classifier. You must provide a DataSet instance
|
32
|
+
# as parameter. The last attribute of each item is considered as
|
33
|
+
# the item class.
|
34
|
+
# Induces the ordered rule list from +data_set+ and stores it in @rules.
# Class values are processed in the order given by the last domain; for each
# one, rules are added until no still-uncovered instance carries that class.
# Every instance covered by a new rule is dropped before the next rule is built.
# Returns self.
def build(data_set)
  data_set.check_not_empty
  @data_set = data_set
  class_values = @data_set.build_domains.last
  pending = @data_set.data_items.collect { |item| item }
  @rules = []
  class_values.each do |class_value|
    while has_class_value(pending, class_value)
      rule = build_rule(class_value, pending)
      @rules << rule
      pending = pending.reject { |item| matches_conditions(item, rule[:conditions]) }
    end
  end
  self
end
|
49
|
+
|
50
|
+
# You can evaluate new data, predicting its class.
|
51
|
+
# e.g.
|
52
|
+
# classifier.eval(['New York', '<30', 'F']) # => 'Y'
|
53
|
+
# Predicts the class of +instace+: the class value of the first rule, in
# induction order, whose conditions all match. Returns nil when no rule matches.
def eval(instace)
  winner = @rules.find { |rule| matches_conditions(instace, rule[:conditions]) }
  winner.nil? ? nil : winner[:class_value]
end
|
59
|
+
|
60
|
+
# This method returns the generated rules in ruby code.
|
61
|
+
# e.g.
|
62
|
+
#
|
63
|
+
# classifier.get_rules
|
64
|
+
# # => if age_range == '<30' then marketing_target = 'Y'
|
65
|
+
# elsif age_range == '>80' then marketing_target = 'Y'
|
66
|
+
# elsif city == 'Chicago' and age_range == '[30-50)' then marketing_target = 'Y'
|
67
|
+
# else marketing_target = 'N'
|
68
|
+
# end
|
69
|
+
#
|
70
|
+
# It is a nice way to inspect induction results, and also to execute them:
|
71
|
+
# age_range = '[30-50)'
|
72
|
+
# city = 'New York'
|
73
|
+
# eval(classifier.get_rules)
|
74
|
+
# puts marketing_target
|
75
|
+
# 'Y'
|
76
|
+
# Renders the induced rules as a Ruby if/elsif/else expression (one clause
# per line). The last of several rules is emitted as the +else+ branch.
#
# Edge cases:
# * no rules (nil or empty rule list): returns an empty string instead of
#   failing on a nil rule;
# * a single rule without conditions: returns just the assignment, because
#   an +if+ with an empty condition would not be valid Ruby.
def get_rules
  rules = @rules || []
  return "" if rules.empty?

  first = rules.first
  return then_clause(first) if rules.size == 1 && first[:conditions].empty?

  lines = ["if #{join_terms(first)} then #{then_clause(first)}"]
  # Middle rules only; for a single rule this slice is empty.
  rules[1...-1].each do |rule|
    lines << "elsif #{join_terms(rule)} then #{then_clause(rule)}"
  end
  lines << "else #{then_clause(rules.last)}" if rules.size > 1
  lines << "end"
  lines.join("\n")
end
|
85
|
+
|
86
|
+
protected
|
87
|
+
|
88
|
+
# Looks up the value of attribute +attr+ in the item +data+, using the index
# that the data set reports for that attribute.
def get_attr_value(data, attr)
  position = @data_set.get_index(attr)
  data[position]
end
|
91
|
+
|
92
|
+
# True when at least one item in +instances+ has +class_value+ as its last
# element (the class attribute); false otherwise, including for an empty list.
def has_class_value(instances, class_value)
  instances.any? { |data| data.last == class_value }
end
|
96
|
+
|
97
|
+
# A rule is perfect when it covers no instance of another class: returns
# false as soon as an item matches the rule's conditions while its last
# element differs from the rule's class value, true otherwise.
def is_perfect(instances, rule)
  class_value = rule[:class_value]
  conditions = rule[:conditions]
  instances.none? do |data|
    data.last != class_value && matches_conditions(data, conditions)
  end
end
|
104
|
+
|
105
|
+
# True when, for every attribute label => value pair in +conditions+, the
# item +data+ holds exactly that value. An empty condition set matches
# every item.
def matches_conditions(data, conditions)
  conditions.all? do |attr_label, attr_value|
    get_attr_value(data, attr_label) == attr_value
  end
end
|
111
|
+
|
112
|
+
# Builds one rule for +class_value+ by adding one attribute condition at a
# time (the one with the best p/t ratio in the frequency table) until the
# rule covers no instance of another class, or no attribute is left to test.
#
# Each attribute is removed from the candidate list once it has been used.
# Without that, the list could never become empty and contradictory data
# (identical attribute values with different classes) kept selecting the
# same condition forever.
#
# Returns {:class_value => ..., :conditions => {label => value, ...}}.
def build_rule(class_value, instances)
  rule = { :class_value => class_value, :conditions => {} }
  rule_instances = instances.dup
  attributes = @data_set.data_labels[0...-1]
  until is_perfect(instances, rule) || attributes.empty?
    freq_table = build_freq_table(rule_instances, attributes, class_value)
    condition = get_condition(freq_table)
    # Nothing left to specialise on (no candidate condition was produced).
    break if condition.nil?
    rule[:conditions].merge!(condition)
    attributes -= condition.keys
    rule_instances = rule_instances.select do |data|
      matches_conditions(data, condition)
    end
  end
  rule
end
|
126
|
+
|
127
|
+
# Returns a structure with the following format:
|
128
|
+
# => {attr1_label => { :attr1_value1 => [p, t], attr1_value2 => [p, t], ... },
|
129
|
+
# attr2_label => { :attr2_value1 => [p, t], attr2_value2 => [p, t], ... },
|
130
|
+
# ...
|
131
|
+
# }
|
132
|
+
# where p is the number of instances classified as class_value
|
133
|
+
# with that attribute value, and t is the total number of instances with
|
134
|
+
# that attribute value
|
135
|
+
# Counts, for every attribute in +attributes+ and every value seen in
# +rule_instances+, a pair [p, t]: t is the number of instances holding that
# value, p the number of those whose class (last element) is +class_value+.
#
# Returns {attr_label => {attr_value => [p, t], ...}, ...}. Attributes only
# appear when at least one instance was scanned. The inner hashes answer
# [0, 0] for unseen values; that default array is shared, so it is only read
# here and every stored pair is a fresh array.
def build_freq_table(rule_instances, attributes, class_value)
  freq_table = {}
  rule_instances.each do |data|
    hit = data.last == class_value ? 1 : 0
    attributes.each do |attr_label|
      attr_freqs = (freq_table[attr_label] ||= Hash.new([0, 0]))
      # Look the value up once; it is both the read key and the write key.
      attr_value = get_attr_value(data, attr_label)
      hits, total = attr_freqs[attr_value]
      attr_freqs[attr_value] = [hits + hit, total + 1]
    end
  end
  freq_table
end
|
148
|
+
|
149
|
+
# returns a single conditional term: {attrN_label => attrN_valueM}
|
150
|
+
# selecting the attribute with higher pt ratio
|
151
|
+
# (occurrences of attribute value classified as class_value /
|
152
|
+
# occurrences of attribute value)
|
153
|
+
# Scans every [p, t] pair of +freq_table+ and returns the single-term
# condition {attr_label => attr_value} whose pair better_pt ranks highest;
# the earliest such pair wins ties. Returns nil when no pair qualifies
# (for instance, when the table is empty).
def get_condition(freq_table)
  leading_pt = [0, 0]
  chosen = nil
  freq_table.each do |attr_label, attr_freqs|
    attr_freqs.each do |attr_value, pt|
      next unless better_pt(pt, leading_pt)
      chosen = { attr_label => attr_value }
      leading_pt = pt
    end
  end
  chosen
end
|
166
|
+
|
167
|
+
# pt = [p, t]
|
168
|
+
# p = occurrences of attribute value with instance classified as class_value
|
169
|
+
# t = occurrences of attribute value
|
170
|
+
# a pt is better if:
|
171
|
+
# 1- its ratio is higher
|
172
|
+
# 2- its ratio is equal, and has a higher p
|
173
|
+
# Decides whether the pair +pt+ = [p, t] beats +best_pt+.
# A pair with t == 0 never wins; any other pair beats a best with t == 0.
# Otherwise the ratios p/t are compared by cross-multiplication (no
# division), and on equal ratios the pair with the larger p wins.
def better_pt(pt, best_pt)
  return false if pt[1] == 0
  return true if best_pt[1] == 0

  case (pt[0] * best_pt[1]) <=> (best_pt[0] * pt[1])
  when 1 then true
  when 0 then pt[0] > best_pt[0]
  else false
  end
end
|
181
|
+
|
182
|
+
# Renders the conditions of +rule+ as Ruby source: one "label == 'value'"
# term per condition, joined with " and ". Returns an empty string when the
# rule has no conditions.
def join_terms(rule)
  rule[:conditions].map do |attr_label, attr_value|
    "#{attr_label} == '#{attr_value}'"
  end.join(" and ")
end
|
189
|
+
|
190
|
+
# Renders the consequent of +rule+ as a Ruby assignment: the last data label
# (the class attribute) receives the rule's class value as a quoted string.
def then_clause(rule)
  class_label = @data_set.data_labels.last
  "#{class_label} = '#{rule[:class_value]}'"
end
|
193
|
+
|
194
|
+
end
|
195
|
+
end
|
196
|
+
end
|
197
|
+
|
@@ -0,0 +1,73 @@
|
|
1
|
+
# Author:: Sergio Fierens (Implementation only)
|
2
|
+
# License:: MPL 1.1
|
3
|
+
# Project:: ai4r
|
4
|
+
# Url:: http://ai4r.rubyforge.org/
|
5
|
+
#
|
6
|
+
# You can redistribute it and/or modify it under the terms of
|
7
|
+
# the Mozilla Public License version 1.1 as published by the
|
8
|
+
# Mozilla Foundation at http://www.mozilla.org/MPL/MPL-1.1.txt
|
9
|
+
|
10
|
+
require File.dirname(__FILE__) + '/../data/data_set.rb'
|
11
|
+
require File.dirname(__FILE__) + '/../classifiers/classifier'
|
12
|
+
|
13
|
+
module Ai4r
|
14
|
+
module Classifiers
|
15
|
+
|
16
|
+
# = Introduction
|
17
|
+
#
|
18
|
+
# The idea behind the ZeroR classifier is to identify the
|
19
|
+
# most common class value in the training set.
|
20
|
+
# It always returns that value when evaluating an instance.
|
21
|
+
# It is frequently used as a baseline for evaluating other machine learning
|
22
|
+
# algorithms.
|
23
|
+
class ZeroR < Classifier
|
24
|
+
|
25
|
+
attr_reader :data_set, :class_value
|
26
|
+
|
27
|
+
# Build a new ZeroR classifier. You must provide a DataSet instance
|
28
|
+
# as parameter. The last attribute of each item is considered as
|
29
|
+
# the item class.
|
30
|
+
# Scans the training items once, counting how often each class value (last
# element of an item) occurs, and remembers the most frequent one in
# @class_value. On ties, the class that reaches the highest count first
# during the scan is kept. Returns self.
def build(data_set)
  data_set.check_not_empty
  @data_set = data_set
  @class_value = nil
  counts = Hash.new(0)
  highest = 0
  @data_set.data_items.each do |example|
    label = example.last
    seen = (counts[label] += 1)
    next unless highest < seen
    highest = seen
    @class_value = label
  end
  self
end
|
47
|
+
|
48
|
+
# You can evaluate new data, predicting its class.
|
49
|
+
# e.g.
|
50
|
+
# classifier.eval(['New York', '<30', 'F']) # => 'Y'
|
51
|
+
# Returns the class value stored by build; the +data+ argument is not
# inspected.
def eval(data)
  return @class_value
end
|
54
|
+
|
55
|
+
# This method returns the generated rules in ruby code.
|
56
|
+
# e.g.
|
57
|
+
#
|
58
|
+
# classifier.get_rules
|
59
|
+
# # => marketing_target='Y'
|
60
|
+
#
|
61
|
+
# It is a nice way to inspect induction results, and also to execute them:
|
62
|
+
# marketing_target = nil
|
63
|
+
# eval classifier.get_rules
|
64
|
+
# puts marketing_target
|
65
|
+
# # => 'Y'
|
66
|
+
# Renders the classifier as a single Ruby assignment: the last data label
# (the class attribute) receives the stored class value as a quoted string.
def get_rules
  class_label = @data_set.data_labels.last
  "#{class_label} = '#{@class_value}'"
end
|
69
|
+
|
70
|
+
end
|
71
|
+
|
72
|
+
end
|
73
|
+
end
|
@@ -0,0 +1,59 @@
|
|
1
|
+
# Author:: Sergio Fierens (implementation)
|
2
|
+
# License:: MPL 1.1
|
3
|
+
# Project:: ai4r
|
4
|
+
# Url:: http://ai4r.rubyforge.org/
|
5
|
+
#
|
6
|
+
# You can redistribute it and/or modify it under the terms of
|
7
|
+
# the Mozilla Public License version 1.1 as published by the
|
8
|
+
# Mozilla Foundation at http://www.mozilla.org/MPL/MPL-1.1.txt
|
9
|
+
|
10
|
+
require File.dirname(__FILE__) + '/../data/data_set'
|
11
|
+
require File.dirname(__FILE__) + '/../clusterers/single_linkage'
|
12
|
+
|
13
|
+
module Ai4r
|
14
|
+
module Clusterers
|
15
|
+
|
16
|
+
# Implementation of a Hierarchical clusterer with group average
|
17
|
+
# linkage, AKA unweighted pair group method average or UPGMA (Everitt
|
18
|
+
# et al., 2001 ; Jain and Dubes, 1988 ; Sokal and Michener, 1958).
|
19
|
+
# Hierarchical clusterers create one cluster per element, and then
|
20
|
+
# progressively merge clusters, until the required number of clusters
|
21
|
+
# is reached.
|
22
|
+
# With average linkage, the distance between a cluster cx and
|
23
|
+
# cluster (ci U cj) is the average distance between cx and ci, and
|
24
|
+
# cx and cj.
|
25
|
+
#
|
26
|
+
# D(cx, (ci U cj)) = (D(cx, ci) + D(cx, cj)) / 2
|
27
|
+
class AverageLinkage < SingleLinkage
|
28
|
+
|
29
|
+
parameters_info :distance_function =>
|
30
|
+
"Custom implementation of distance function. " +
|
31
|
+
"It must be a closure receiving two data items and return the " +
|
32
|
+
"distance bewteen them. By default, this algorithm uses " +
|
33
|
+
"ecuclidean distance of numeric attributes to the power of 2."
|
34
|
+
|
35
|
+
# Build a new clusterer, using data examples found in data_set.
|
36
|
+
# Items will be clustered in "number_of_clusters" different
|
37
|
+
# clusters.
|
38
|
+
# Delegates unchanged to the parent class's build with the same two arguments.
def build(data_set, number_of_clusters)
  super(data_set, number_of_clusters)
end
|
41
|
+
|
42
|
+
# This algorithm does not allow classification of new data items
|
43
|
+
# once it has been built. Rebuild the cluster including your data element.
|
44
|
+
# Not supported by this clusterer: always raises a RuntimeError with an
# explanatory message. (+raise+ must be lower case; the capitalised form is
# an undefined method and surfaced as NoMethodError instead.)
def eval(data_item)
  raise "Eval of new data is not supported by this algorithm."
end
|
47
|
+
|
48
|
+
protected
|
49
|
+
|
50
|
+
# return distance between cluster cx and cluster (ci U cj),
|
51
|
+
# using average linkage
|
52
|
+
# Distance between cluster cx and the merged cluster (ci U cj): the plain
# mean of the two stored distances D(cx, ci) and D(cx, cj).
# Divides by 2.0 so that Integer distances (e.g. from a custom distance
# function) are not truncated by integer division.
# NOTE(review): UPGMA is usually defined with cluster-size weights,
# (ni*D(cx, ci) + nj*D(cx, cj)) / (ni + nj); the unweighted mean used here
# follows the formula in this class's header comment -- confirm which one
# is intended.
def linkage_distance(cx, ci, cj)
  (read_distance_matrix(cx, ci) + read_distance_matrix(cx, cj)) / 2.0
end
|
56
|
+
|
57
|
+
end
|
58
|
+
end
|
59
|
+
end
|
@@ -0,0 +1,93 @@
|
|
1
|
+
# Author:: Sergio Fierens (implementation)
|
2
|
+
# License:: MPL 1.1
|
3
|
+
# Project:: ai4r
|
4
|
+
# Url:: http://ai4r.rubyforge.org/
|
5
|
+
#
|
6
|
+
# You can redistribute it and/or modify it under the terms of
|
7
|
+
# the Mozilla Public License version 1.1 as published by the
|
8
|
+
# Mozilla Foundation at http://www.mozilla.org/MPL/MPL-1.1.txt
|
9
|
+
|
10
|
+
require File.dirname(__FILE__) + '/../data/data_set'
|
11
|
+
require File.dirname(__FILE__) + '/../clusterers/k_means'
|
12
|
+
|
13
|
+
module Ai4r
|
14
|
+
module Clusterers
|
15
|
+
|
16
|
+
# The Bisecting k-means algorithm is a variation of the "k-means" algorithm,
|
17
|
+
# somewhat less sensitive to the initial selection of centroids than the
|
18
|
+
# original.
|
19
|
+
#
|
20
|
+
# More about K Means algorithm:
|
21
|
+
# http://en.wikipedia.org/wiki/K-means_algorithm
|
22
|
+
class BisectingKMeans < KMeans
|
23
|
+
|
24
|
+
attr_reader :data_set, :number_of_clusters, :clusters, :centroids
|
25
|
+
attr_accessor :max_iterations, :distance_function, :refine
|
26
|
+
|
27
|
+
parameters_info :max_iterations => "Maximum number of iterations to " +
|
28
|
+
"build the clusterer. By default it is uncapped.",
|
29
|
+
:distance_function => "Custom implementation of distance function. " +
|
30
|
+
"It must be a closure receiving two data items and return the " +
|
31
|
+
"distance bewteen them. By default, this algorithm uses " +
|
32
|
+
"ecuclidean distance of numeric attributes to the power of 2.",
|
33
|
+
:centroid_function => "Custom implementation to calculate the " +
|
34
|
+
"centroid of a cluster. It must be a closure receiving an array of " +
|
35
|
+
"data sets, and return an array of data items, representing the " +
|
36
|
+
"centroids of for each data set. " +
|
37
|
+
"By default, this algorithm returns a data items using the mode "+
|
38
|
+
"or mean of each attribute on each data set.",
|
39
|
+
:refine => "Boolean value. True by default. It will run the " +
|
40
|
+
"classic K Means algorithm, using as initial centroids the " +
|
41
|
+
"result of the bisecting approach."
|
42
|
+
|
43
|
+
|
44
|
+
# Constructor. Enables the final refinement pass by default, as the
# :refine parameter description states. The method was previously spelled
# "intialize", so it was never invoked by +new+ and @refine stayed nil.
# Calls super first so any parent set-up still runs.
def initialize
  super
  @refine = true
end
|
47
|
+
|
48
|
+
# Build a new clusterer, using data examples found in data_set.
|
49
|
+
# Items will be clustered in "number_of_clusters" different
|
50
|
+
# clusters.
|
51
|
+
# Starts with the whole data set as a single cluster, then repeatedly
# replaces the cluster with the most items by the two clusters a fresh KMeans
# (configured with this object's parameters) finds in it, until
# number_of_clusters clusters exist. Centroids are kept in step with the
# clusters. When @refine is set, the parent build is run afterwards with the
# same arguments. Returns self.
def build(data_set, number_of_clusters)
  @data_set = data_set
  @number_of_clusters = number_of_clusters
  @clusters = [@data_set]
  @centroids = [@data_set.get_mean_or_mode]

  until @clusters.length >= @number_of_clusters
    target = find_biggest_cluster_index(@clusters)
    bisection = KMeans.new.set_parameters(get_parameters).build(@clusters[target], 2)
    # Drop the split cluster and its centroid, then append the two halves.
    @clusters.delete_at(target)
    @centroids.delete_at(target)
    @clusters.concat(bisection.clusters)
    @centroids.concat(bisection.centroids)
  end

  super if @refine
  self
end
|
72
|
+
|
73
|
+
protected
|
74
|
+
# Returns the centroids already held in @centroids, unchanged.
def calc_initial_centroids
  return @centroids
end
|
77
|
+
|
78
|
+
# Returns the index of the cluster holding the most data items; the earliest
# such cluster wins ties. Returns 0 when +clusters+ is empty or every cluster
# has no items.
def find_biggest_cluster_index(clusters)
  winner = 0
  largest = 0
  clusters.each_with_index do |cluster, position|
    size = cluster.data_items.length
    next unless largest < size
    largest = size
    winner = position
  end
  winner
end
|
90
|
+
|
91
|
+
end
|
92
|
+
end
|
93
|
+
end
|
@@ -0,0 +1,66 @@
|
|
1
|
+
# Author:: Sergio Fierens (implementation)
|
2
|
+
# License:: MPL 1.1
|
3
|
+
# Project:: ai4r
|
4
|
+
# Url:: http://ai4r.rubyforge.org/
|
5
|
+
#
|
6
|
+
# You can redistribute it and/or modify it under the terms of
|
7
|
+
# the Mozilla Public License version 1.1 as published by the
|
8
|
+
# Mozilla Foundation at http://www.mozilla.org/MPL/MPL-1.1.txt
|
9
|
+
|
10
|
+
require File.dirname(__FILE__) + '/../data/data_set'
|
11
|
+
require File.dirname(__FILE__) + '/../clusterers/single_linkage'
|
12
|
+
|
13
|
+
module Ai4r
|
14
|
+
module Clusterers
|
15
|
+
|
16
|
+
# Implementation of an Agglomerative Hierarchical clusterer with
|
17
|
+
# centroid linkage algorithm, aka unweighted pair group method
|
18
|
+
# centroid (UPGMC) (Everitt et al., 2001 ; Jain and Dubes, 1988 ;
|
19
|
+
# Sokal and Michener, 1958 )
|
20
|
+
# Hierarchical clusterers create one cluster per element, and then
|
21
|
+
# progressively merge clusters, until the required number of clusters
|
22
|
+
# is reached.
|
23
|
+
# The distance between clusters is the squared euclidean distance
|
24
|
+
# between their centroids.
|
25
|
+
#
|
26
|
+
# D(cx, (ci U cj)) = | mx - mij |^2
|
27
|
+
# D(cx, (ci U cj)) = (ni/(ni+nj))*D(cx, ci) +
|
28
|
+
# (nj/(ni+nj))*D(cx, cj) -
|
29
|
+
# (ni*nj/(ni+nj)^2)*D(ci, cj)
|
30
|
+
class CentroidLinkage < SingleLinkage
|
31
|
+
|
32
|
+
parameters_info :distance_function =>
|
33
|
+
"Custom implementation of distance function. " +
|
34
|
+
"It must be a closure receiving two data items and return the " +
|
35
|
+
"distance bewteen them. By default, this algorithm uses " +
|
36
|
+
"ecuclidean distance of numeric attributes to the power of 2."
|
37
|
+
|
38
|
+
# Build a new clusterer, using data examples found in data_set.
|
39
|
+
# Items will be clustered in "number_of_clusters" different
|
40
|
+
# clusters.
|
41
|
+
# Delegates unchanged to the parent class's build with the same two arguments.
def build(data_set, number_of_clusters)
  super(data_set, number_of_clusters)
end
|
44
|
+
|
45
|
+
# This algorithm does not allow classification of new data items
|
46
|
+
# once it has been built. Rebuild the cluster including your data element.
|
47
|
+
# Not supported by this clusterer: always raises a RuntimeError with an
# explanatory message. (+raise+ must be lower case; the capitalised form is
# an undefined method and surfaced as NoMethodError instead.)
def eval(data_item)
  raise "Eval of new data is not supported by this algorithm."
end
|
50
|
+
|
51
|
+
protected
|
52
|
+
|
53
|
+
# return distance between cluster cx and cluster (ci U cj),
|
54
|
+
# using centroid linkage
|
55
|
+
# Distance between cluster cx and the merged cluster (ci U cj) under centroid
# linkage, computed from the stored pairwise distances and the sizes ni, nj
# of the clusters indexed by ci and cj:
#   (ni*D(cx, ci) + nj*D(cx, cj) - ni*nj*D(ci, cj)/(ni+nj)) / (ni+nj)
# The 1.0 factor keeps the last term in floating point.
def linkage_distance(cx, ci, cj)
  ni = @index_clusters[ci].length
  nj = @index_clusters[cj].length
  merged_size = ni + nj
  d_xi = read_distance_matrix(cx, ci)
  d_xj = read_distance_matrix(cx, cj)
  d_ij = read_distance_matrix(ci, cj)
  (ni * d_xi + nj * d_xj - 1.0 * ni * nj * d_ij / merged_size) / merged_size
end
|
62
|
+
|
63
|
+
end
|
64
|
+
end
|
65
|
+
end
|
66
|
+
|