ai4ruby 1.11
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.rdoc +47 -0
- data/examples/classifiers/id3_data.csv +121 -0
- data/examples/classifiers/id3_example.rb +29 -0
- data/examples/classifiers/naive_bayes_data.csv +11 -0
- data/examples/classifiers/naive_bayes_example.rb +16 -0
- data/examples/classifiers/results.txt +31 -0
- data/examples/genetic_algorithm/genetic_algorithm_example.rb +37 -0
- data/examples/genetic_algorithm/travel_cost.csv +16 -0
- data/examples/neural_network/backpropagation_example.rb +67 -0
- data/examples/neural_network/patterns_with_base_noise.rb +68 -0
- data/examples/neural_network/patterns_with_noise.rb +66 -0
- data/examples/neural_network/training_patterns.rb +68 -0
- data/examples/neural_network/xor_example.rb +35 -0
- data/examples/som/som_data.rb +156 -0
- data/examples/som/som_multi_node_example.rb +22 -0
- data/examples/som/som_single_example.rb +24 -0
- data/lib/ai4r.rb +33 -0
- data/lib/ai4r/classifiers/classifier.rb +62 -0
- data/lib/ai4r/classifiers/hyperpipes.rb +118 -0
- data/lib/ai4r/classifiers/ib1.rb +121 -0
- data/lib/ai4r/classifiers/id3.rb +326 -0
- data/lib/ai4r/classifiers/multilayer_perceptron.rb +135 -0
- data/lib/ai4r/classifiers/naive_bayes.rb +259 -0
- data/lib/ai4r/classifiers/one_r.rb +110 -0
- data/lib/ai4r/classifiers/prism.rb +197 -0
- data/lib/ai4r/classifiers/zero_r.rb +73 -0
- data/lib/ai4r/clusterers/average_linkage.rb +59 -0
- data/lib/ai4r/clusterers/bisecting_k_means.rb +93 -0
- data/lib/ai4r/clusterers/centroid_linkage.rb +66 -0
- data/lib/ai4r/clusterers/clusterer.rb +61 -0
- data/lib/ai4r/clusterers/complete_linkage.rb +67 -0
- data/lib/ai4r/clusterers/diana.rb +139 -0
- data/lib/ai4r/clusterers/k_means.rb +126 -0
- data/lib/ai4r/clusterers/median_linkage.rb +61 -0
- data/lib/ai4r/clusterers/single_linkage.rb +194 -0
- data/lib/ai4r/clusterers/ward_linkage.rb +64 -0
- data/lib/ai4r/clusterers/ward_linkage_hierarchical.rb +31 -0
- data/lib/ai4r/clusterers/weighted_average_linkage.rb +61 -0
- data/lib/ai4r/data/data_set.rb +266 -0
- data/lib/ai4r/data/parameterizable.rb +64 -0
- data/lib/ai4r/data/proximity.rb +100 -0
- data/lib/ai4r/data/statistics.rb +77 -0
- data/lib/ai4r/experiment/classifier_evaluator.rb +95 -0
- data/lib/ai4r/genetic_algorithm/genetic_algorithm.rb +270 -0
- data/lib/ai4r/neural_network/backpropagation.rb +326 -0
- data/lib/ai4r/neural_network/hopfield.rb +149 -0
- data/lib/ai4r/som/layer.rb +68 -0
- data/lib/ai4r/som/node.rb +96 -0
- data/lib/ai4r/som/som.rb +155 -0
- data/lib/ai4r/som/two_phase_layer.rb +90 -0
- data/test/classifiers/hyperpipes_test.rb +84 -0
- data/test/classifiers/ib1_test.rb +78 -0
- data/test/classifiers/id3_test.rb +208 -0
- data/test/classifiers/multilayer_perceptron_test.rb +79 -0
- data/test/classifiers/naive_bayes_test.rb +43 -0
- data/test/classifiers/one_r_test.rb +62 -0
- data/test/classifiers/prism_test.rb +85 -0
- data/test/classifiers/zero_r_test.rb +49 -0
- data/test/clusterers/average_linkage_test.rb +51 -0
- data/test/clusterers/bisecting_k_means_test.rb +66 -0
- data/test/clusterers/centroid_linkage_test.rb +53 -0
- data/test/clusterers/complete_linkage_test.rb +57 -0
- data/test/clusterers/diana_test.rb +69 -0
- data/test/clusterers/k_means_test.rb +100 -0
- data/test/clusterers/median_linkage_test.rb +53 -0
- data/test/clusterers/single_linkage_test.rb +122 -0
- data/test/clusterers/ward_linkage_hierarchical_test.rb +61 -0
- data/test/clusterers/ward_linkage_test.rb +53 -0
- data/test/clusterers/weighted_average_linkage_test.rb +53 -0
- data/test/data/data_set_test.rb +96 -0
- data/test/data/proximity_test.rb +81 -0
- data/test/data/statistics_test.rb +65 -0
- data/test/experiment/classifier_evaluator_test.rb +76 -0
- data/test/genetic_algorithm/chromosome_test.rb +58 -0
- data/test/genetic_algorithm/genetic_algorithm_test.rb +81 -0
- data/test/neural_network/backpropagation_test.rb +82 -0
- data/test/neural_network/hopfield_test.rb +72 -0
- data/test/som/som_test.rb +97 -0
- metadata +168 -0
# Author:: Sergio Fierens (Implementation only, Cendrowska is
# the creator of the algorithm)
# License:: MPL 1.1
# Project:: ai4r
# Url:: http://ai4r.rubyforge.org/
#
# You can redistribute it and/or modify it under the terms of
# the Mozilla Public License version 1.1 as published by the
# Mozilla Foundation at http://www.mozilla.org/MPL/MPL-1.1.txt
#
# J. Cendrowska (1987). PRISM: An algorithm for inducing modular rules.
# International Journal of Man-Machine Studies. 27(4):349-370.

require File.dirname(__FILE__) + '/../data/data_set'
require File.dirname(__FILE__) + '/../classifiers/classifier'

module Ai4r
  module Classifiers

    # = Introduction
    # This is an implementation of the PRISM algorithm (Cendrowska, 1987).
    # Given a set of preclassified examples, it builds a set of rules
    # to predict the class of other instances.
    #
    # J. Cendrowska (1987). PRISM: An algorithm for inducing modular rules.
    # International Journal of Man-Machine Studies. 27(4):349-370.
    class Prism < Classifier

      attr_reader :data_set, :rules

      # Build a new Prism classifier. You must provide a DataSet instance
      # as parameter. The last attribute of each item is considered as
      # the item class.
      #
      # For each class value, rules are induced one at a time; instances
      # covered by a new rule are removed before inducing the next one.
      def build(data_set)
        data_set.check_not_empty
        @data_set = data_set
        domains = @data_set.build_domains
        instances = @data_set.data_items.collect { |data| data }
        @rules = []
        domains.last.each do |class_value|
          # Keep inducing rules for this class until no uncovered
          # instance of this class remains.
          while has_class_value(instances, class_value)
            rule = build_rule(class_value, instances)
            @rules << rule
            instances = instances.reject { |data| matches_conditions(data, rule[:conditions]) }
          end
        end
        return self
      end

      # You can evaluate new data, predicting its class.
      # Returns the class of the first rule whose conditions match,
      # or nil if no rule matches.
      # e.g.
      #   classifier.eval(['New York', '<30', 'F']) # => 'Y'
      def eval(instance)
        @rules.each do |rule|
          return rule[:class_value] if matches_conditions(instance, rule[:conditions])
        end
        return nil
      end

      # This method returns the generated rules in ruby code.
      # e.g.
      #
      #   classifier.get_rules
      #   # =>  if age_range == '<30' then marketing_target = 'Y'
      #         elsif age_range == '>80' then marketing_target = 'Y'
      #         elsif city == 'Chicago' and age_range == '[30-50)' then marketing_target = 'Y'
      #         else marketing_target = 'N'
      #         end
      #
      # It is a nice way to inspect induction results, and also to execute them:
      #   age_range = '[30-50)'
      #   city = 'New York'
      #   eval(classifier.get_rules)
      #   puts marketing_target
      #     'Y'
      def get_rules
        out = "if #{join_terms(@rules.first)} then #{then_clause(@rules.first)}"
        @rules[1...-1].each do |rule|
          out += "\nelsif #{join_terms(rule)} then #{then_clause(rule)}"
        end
        # NOTE: the last rule is emitted as an unconditional "else" branch;
        # its conditions are intentionally dropped so it acts as the default.
        out += "\nelse #{then_clause(@rules.last)}" if @rules.size > 1
        out += "\nend"
        return out
      end

      protected

      # Value of attribute +attr+ (a data label) within item +data+.
      def get_attr_value(data, attr)
        data[@data_set.get_index(attr)]
      end

      # true if at least one instance is classified as class_value.
      def has_class_value(instances, class_value)
        instances.any? { |data| data.last == class_value }
      end

      # A rule is perfect when it covers no instance of a different class.
      def is_perfect(instances, rule)
        class_value = rule[:class_value]
        instances.none? do |data|
          data.last != class_value && matches_conditions(data, rule[:conditions])
        end
      end

      # true if +data+ satisfies every {attr_label => attr_value} condition.
      def matches_conditions(data, conditions)
        conditions.all? do |attr_label, attr_value|
          get_attr_value(data, attr_label) == attr_value
        end
      end

      # Induce one rule for class_value: greedily add the best condition
      # (highest p/t ratio) until the rule is perfect or no attribute is left.
      def build_rule(class_value, instances)
        rule = {:class_value => class_value, :conditions => {}}
        rule_instances = instances.collect { |data| data }
        attributes = @data_set.data_labels[0...-1].collect { |label| label }
        until is_perfect(instances, rule) || attributes.empty?
          freq_table = build_freq_table(rule_instances, attributes, class_value)
          condition = get_condition(freq_table)
          rule[:conditions].merge!(condition)
          # Narrow the working set to instances covered by the new condition.
          rule_instances = rule_instances.select do |data|
            matches_conditions(data, condition)
          end
        end
        return rule
      end

      # Returns a structure with the following format:
      # => {attr1_label => { :attr1_value1 => [p, t], attr1_value2 => [p, t], ... },
      #     attr2_label => { :attr2_value1 => [p, t], attr2_value2 => [p, t], ... },
      #     ...
      #    }
      # where p is the number of instances classified as class_value
      # with that attribute value, and t is the total number of instances with
      # that attribute value
      def build_freq_table(rule_instances, attributes, class_value)
        freq_table = Hash.new
        rule_instances.each do |data|
          attributes.each do |attr_label|
            # [0, 0] is a safe shared default: pt is reassigned, never mutated.
            attr_freqs = freq_table[attr_label] || Hash.new([0, 0])
            pt = attr_freqs[get_attr_value(data, attr_label)]
            pt = [(data.last == class_value) ? pt[0] + 1 : pt[0], pt[1] + 1]
            attr_freqs[get_attr_value(data, attr_label)] = pt
            freq_table[attr_label] = attr_freqs
          end
        end
        return freq_table
      end

      # returns a single conditional term: {attrN_label => attrN_valueM}
      # selecting the attribute with higher pt ratio
      # (occurrences of attribute value classified as class_value /
      # occurrences of attribute value)
      def get_condition(freq_table)
        best_pt = [0, 0]
        condition = nil
        freq_table.each do |attr_label, attr_freqs|
          attr_freqs.each do |attr_value, pt|
            if better_pt(pt, best_pt)
              condition = { attr_label => attr_value }
              best_pt = pt
            end
          end
        end
        return condition
      end

      # pt = [p, t]
      # p = occurrences of attribute value with instance classified as class_value
      # t = occurrences of attribute value
      # a pt is better if:
      # 1- its ratio is higher
      # 2- its ratio is equal, and has a higher p
      def better_pt(pt, best_pt)
        return false if pt[1] == 0
        return true if best_pt[1] == 0
        # Compare p/t ratios by cross-multiplication to avoid float division.
        a = pt[0] * best_pt[1]
        b = best_pt[0] * pt[1]
        return true if a > b || (a == b && pt[0] > best_pt[0])
        return false
      end

      # Render a rule's conditions as "attr == 'value' and attr == 'value'".
      def join_terms(rule)
        terms = []
        rule[:conditions].each do |attr_label, attr_value|
          terms << "#{attr_label} == '#{attr_value}'"
        end
        "#{terms.join(" and ")}"
      end

      # Render the assignment of the class attribute for a rule.
      def then_clause(rule)
        "#{@data_set.data_labels.last} = '#{rule[:class_value]}'"
      end

    end
  end
end
# Author:: Sergio Fierens (Implementation only)
# License:: MPL 1.1
# Project:: ai4r
# Url:: http://ai4r.rubyforge.org/
#
# You can redistribute it and/or modify it under the terms of
# the Mozilla Public License version 1.1 as published by the
# Mozilla Foundation at http://www.mozilla.org/MPL/MPL-1.1.txt

require File.dirname(__FILE__) + '/../data/data_set.rb'
require File.dirname(__FILE__) + '/../classifiers/classifier'

module Ai4r
  module Classifiers

    # = Introduction
    #
    # ZeroR simply memorizes the most common class value in the
    # training set and predicts it for every instance it evaluates.
    # It is frequently used as a baseline for evaluating other machine
    # learning algorithms.
    class ZeroR < Classifier

      attr_reader :data_set, :class_value

      # Build a new ZeroR classifier. You must provide a DataSet instance
      # as parameter. The last attribute of each item is considered as
      # the item class.
      #
      # Ties are broken in favor of the class value that first reaches
      # the maximum frequency while scanning the data items in order.
      def build(data_set)
        data_set.check_not_empty
        @data_set = data_set
        counts = {}
        best_count = 0
        @class_value = nil
        @data_set.data_items.each do |item|
          label = item.last
          counts[label] = (counts[label] || 0) + 1
          next unless counts[label] > best_count
          best_count = counts[label]
          @class_value = label
        end
        self
      end

      # You can evaluate new data, predicting its class.
      # e.g.
      #   classifier.eval(['New York', '<30', 'F']) # => 'Y'
      def eval(data)
        @class_value
      end

      # This method returns the generated rules in ruby code.
      # e.g.
      #
      #   classifier.get_rules
      #   # =>  marketing_target='Y'
      #
      # It is a nice way to inspect induction results, and also to execute them:
      #   marketing_target = nil
      #   eval classifier.get_rules
      #   puts marketing_target
      #   # =>  'Y'
      def get_rules
        "#{@data_set.data_labels.last} = '#{@class_value}'"
      end

    end

  end
end
# Author:: Sergio Fierens (implementation)
# License:: MPL 1.1
# Project:: ai4r
# Url:: http://ai4r.rubyforge.org/
#
# You can redistribute it and/or modify it under the terms of
# the Mozilla Public License version 1.1 as published by the
# Mozilla Foundation at http://www.mozilla.org/MPL/MPL-1.1.txt

require File.dirname(__FILE__) + '/../data/data_set'
require File.dirname(__FILE__) + '/../clusterers/single_linkage'

module Ai4r
  module Clusterers

    # Implementation of a Hierarchical clusterer with group average
    # linkage, AKA unweighted pair group method average or UPGMA (Everitt
    # et al., 2001 ; Jain and Dubes, 1988 ; Sokal and Michener, 1958).
    # Hierarchical clusterers create one cluster per element, and then
    # progressively merge clusters, until the required number of clusters
    # is reached.
    # With average linkage, the distance between a cluster cx and
    # cluster (ci U cj) is the average of the distances between cx and ci,
    # and cx and cj:
    #
    #   D(cx, (ci U cj)) = (D(cx, ci) + D(cx, cj)) / 2
    class AverageLinkage < SingleLinkage

      parameters_info :distance_function =>
          "Custom implementation of distance function. " +
          "It must be a closure receiving two data items and return the " +
          "distance bewteen them. By default, this algorithm uses " +
          "ecuclidean distance of numeric attributes to the power of 2."

      # Build a new clusterer, using data examples found in data_set.
      # Items will be clustered in "number_of_clusters" different
      # clusters.
      def build(data_set, number_of_clusters)
        super
      end

      # This algorithm does not allow classification of new data items
      # once it has been built. Rebuild the cluster including your data element.
      #
      # FIX: was "Raise ..." (capital R), which called an undefined method
      # and produced a NoMethodError instead of the intended exception.
      def eval(data_item)
        raise "Eval of new data is not supported by this algorithm."
      end

      protected

      # return distance between cluster cx and cluster (ci U cj),
      # using average linkage
      def linkage_distance(cx, ci, cj)
        (read_distance_matrix(cx, ci) +
          read_distance_matrix(cx, cj)) / 2
      end

    end
  end
end
# Author:: Sergio Fierens (implementation)
# License:: MPL 1.1
# Project:: ai4r
# Url:: http://ai4r.rubyforge.org/
#
# You can redistribute it and/or modify it under the terms of
# the Mozilla Public License version 1.1 as published by the
# Mozilla Foundation at http://www.mozilla.org/MPL/MPL-1.1.txt

require File.dirname(__FILE__) + '/../data/data_set'
require File.dirname(__FILE__) + '/../clusterers/k_means'

module Ai4r
  module Clusterers

    # The Bisecting k-means algorithm is a variation of the "k-means" algorithm,
    # somewhat less sensible to the initial election of centroids than the
    # original.
    #
    # More about K Means algorithm:
    # http://en.wikipedia.org/wiki/K-means_algorithm
    class BisectingKMeans < KMeans

      attr_reader :data_set, :number_of_clusters, :clusters, :centroids
      attr_accessor :max_iterations, :distance_function, :refine

      parameters_info :max_iterations => "Maximum number of iterations to " +
          "build the clusterer. By default it is uncapped.",
        :distance_function => "Custom implementation of distance function. " +
          "It must be a closure receiving two data items and return the " +
          "distance bewteen them. By default, this algorithm uses " +
          "ecuclidean distance of numeric attributes to the power of 2.",
        :centroid_function => "Custom implementation to calculate the " +
          "centroid of a cluster. It must be a closure receiving an array of " +
          "data sets, and return an array of data items, representing the " +
          "centroids of for each data set. " +
          "By default, this algorithm returns a data items using the mode "+
          "or mean of each attribute on each data set.",
        :refine => "Boolean value. True by default. It will run the " +
          "classic K Means algorithm, using as initial centroids the " +
          "result of the bisecting approach."

      # FIX: this constructor was misspelled "intialize", so it was never
      # invoked and @refine stayed nil — the documented "true by default"
      # refine step never ran.
      def initialize
        @refine = true
      end

      # Build a new clusterer, using data examples found in data_set.
      # Items will be clustered in "number_of_clusters" different
      # clusters.
      #
      # Repeatedly bisects the biggest cluster with a 2-means run until
      # the requested number of clusters is reached; optionally refines
      # the result with a full K Means pass (see @refine).
      def build(data_set, number_of_clusters)
        @data_set = data_set
        @number_of_clusters = number_of_clusters

        @clusters = [@data_set]
        @centroids = [@data_set.get_mean_or_mode]
        while @clusters.length < @number_of_clusters
          biggest_cluster_index = find_biggest_cluster_index(@clusters)
          clusterer = KMeans.new.
            set_parameters(get_parameters).
            build(@clusters[biggest_cluster_index], 2)
          @clusters.delete_at(biggest_cluster_index)
          @centroids.delete_at(biggest_cluster_index)
          @clusters.concat(clusterer.clusters)
          @centroids.concat(clusterer.centroids)
        end

        super if @refine

        return self
      end

      protected

      # Seed the refine pass (classic K Means) with the centroids
      # produced by the bisecting phase.
      def calc_initial_centroids
        @centroids # Use existing centroids
      end

      # Index of the cluster holding the most data items.
      def find_biggest_cluster_index(clusters)
        max_index = 0
        max_length = 0
        clusters.each_index do |cluster_index|
          cluster = clusters[cluster_index]
          if max_length < cluster.data_items.length
            max_length = cluster.data_items.length
            max_index = cluster_index
          end
        end
        return max_index
      end

    end
  end
end
# Author:: Sergio Fierens (implementation)
# License:: MPL 1.1
# Project:: ai4r
# Url:: http://ai4r.rubyforge.org/
#
# You can redistribute it and/or modify it under the terms of
# the Mozilla Public License version 1.1 as published by the
# Mozilla Foundation at http://www.mozilla.org/MPL/MPL-1.1.txt

require File.dirname(__FILE__) + '/../data/data_set'
require File.dirname(__FILE__) + '/../clusterers/single_linkage'

module Ai4r
  module Clusterers

    # Implementation of an Agglomerative Hierarchical clusterer with
    # centroid linkage algorithm, aka unweighted pair group method
    # centroid (UPGMC) (Everitt et al., 2001 ; Jain and Dubes, 1988 ;
    # Sokal and Michener, 1958 )
    # Hierarchical clusterers create one cluster per element, and then
    # progressively merge clusters, until the required number of clusters
    # is reached.
    # The distance between clusters is the squared euclidean distance
    # between their centroids.
    #
    #   D(cx, (ci U cj)) = | mx - mij |^2
    #   D(cx, (ci U cj)) = (ni/(ni+nj))*D(cx, ci) +
    #                      (nj/(ni+nj))*D(cx, cj) -
    #                      (ni*nj/(ni+nj)^2)*D(ci, cj)
    class CentroidLinkage < SingleLinkage

      parameters_info :distance_function =>
          "Custom implementation of distance function. " +
          "It must be a closure receiving two data items and return the " +
          "distance bewteen them. By default, this algorithm uses " +
          "ecuclidean distance of numeric attributes to the power of 2."

      # Build a new clusterer, using data examples found in data_set.
      # Items will be clustered in "number_of_clusters" different
      # clusters.
      def build(data_set, number_of_clusters)
        super
      end

      # This algorithm does not allow classification of new data items
      # once it has been built. Rebuild the cluster including your data element.
      #
      # FIX: was "Raise ..." (capital R), which called an undefined method
      # and produced a NoMethodError instead of the intended exception.
      def eval(data_item)
        raise "Eval of new data is not supported by this algorithm."
      end

      protected

      # return distance between cluster cx and cluster (ci U cj),
      # using centroid linkage (Lance-Williams update with centroid
      # coefficients; ni, nj are the merged clusters' sizes).
      def linkage_distance(cx, ci, cj)
        ni = @index_clusters[ci].length
        nj = @index_clusters[cj].length
        ( ni * read_distance_matrix(cx, ci) +
          nj * read_distance_matrix(cx, cj) -
          1.0 * ni * nj * read_distance_matrix(ci, cj) / (ni + nj)) / (ni + nj)
      end

    end
  end
end