ai4ruby 1.11
Sign up to get free protection for your applications and to get access to all the features.
- data/README.rdoc +47 -0
- data/examples/classifiers/id3_data.csv +121 -0
- data/examples/classifiers/id3_example.rb +29 -0
- data/examples/classifiers/naive_bayes_data.csv +11 -0
- data/examples/classifiers/naive_bayes_example.rb +16 -0
- data/examples/classifiers/results.txt +31 -0
- data/examples/genetic_algorithm/genetic_algorithm_example.rb +37 -0
- data/examples/genetic_algorithm/travel_cost.csv +16 -0
- data/examples/neural_network/backpropagation_example.rb +67 -0
- data/examples/neural_network/patterns_with_base_noise.rb +68 -0
- data/examples/neural_network/patterns_with_noise.rb +66 -0
- data/examples/neural_network/training_patterns.rb +68 -0
- data/examples/neural_network/xor_example.rb +35 -0
- data/examples/som/som_data.rb +156 -0
- data/examples/som/som_multi_node_example.rb +22 -0
- data/examples/som/som_single_example.rb +24 -0
- data/lib/ai4r.rb +33 -0
- data/lib/ai4r/classifiers/classifier.rb +62 -0
- data/lib/ai4r/classifiers/hyperpipes.rb +118 -0
- data/lib/ai4r/classifiers/ib1.rb +121 -0
- data/lib/ai4r/classifiers/id3.rb +326 -0
- data/lib/ai4r/classifiers/multilayer_perceptron.rb +135 -0
- data/lib/ai4r/classifiers/naive_bayes.rb +259 -0
- data/lib/ai4r/classifiers/one_r.rb +110 -0
- data/lib/ai4r/classifiers/prism.rb +197 -0
- data/lib/ai4r/classifiers/zero_r.rb +73 -0
- data/lib/ai4r/clusterers/average_linkage.rb +59 -0
- data/lib/ai4r/clusterers/bisecting_k_means.rb +93 -0
- data/lib/ai4r/clusterers/centroid_linkage.rb +66 -0
- data/lib/ai4r/clusterers/clusterer.rb +61 -0
- data/lib/ai4r/clusterers/complete_linkage.rb +67 -0
- data/lib/ai4r/clusterers/diana.rb +139 -0
- data/lib/ai4r/clusterers/k_means.rb +126 -0
- data/lib/ai4r/clusterers/median_linkage.rb +61 -0
- data/lib/ai4r/clusterers/single_linkage.rb +194 -0
- data/lib/ai4r/clusterers/ward_linkage.rb +64 -0
- data/lib/ai4r/clusterers/ward_linkage_hierarchical.rb +31 -0
- data/lib/ai4r/clusterers/weighted_average_linkage.rb +61 -0
- data/lib/ai4r/data/data_set.rb +266 -0
- data/lib/ai4r/data/parameterizable.rb +64 -0
- data/lib/ai4r/data/proximity.rb +100 -0
- data/lib/ai4r/data/statistics.rb +77 -0
- data/lib/ai4r/experiment/classifier_evaluator.rb +95 -0
- data/lib/ai4r/genetic_algorithm/genetic_algorithm.rb +270 -0
- data/lib/ai4r/neural_network/backpropagation.rb +326 -0
- data/lib/ai4r/neural_network/hopfield.rb +149 -0
- data/lib/ai4r/som/layer.rb +68 -0
- data/lib/ai4r/som/node.rb +96 -0
- data/lib/ai4r/som/som.rb +155 -0
- data/lib/ai4r/som/two_phase_layer.rb +90 -0
- data/test/classifiers/hyperpipes_test.rb +84 -0
- data/test/classifiers/ib1_test.rb +78 -0
- data/test/classifiers/id3_test.rb +208 -0
- data/test/classifiers/multilayer_perceptron_test.rb +79 -0
- data/test/classifiers/naive_bayes_test.rb +43 -0
- data/test/classifiers/one_r_test.rb +62 -0
- data/test/classifiers/prism_test.rb +85 -0
- data/test/classifiers/zero_r_test.rb +49 -0
- data/test/clusterers/average_linkage_test.rb +51 -0
- data/test/clusterers/bisecting_k_means_test.rb +66 -0
- data/test/clusterers/centroid_linkage_test.rb +53 -0
- data/test/clusterers/complete_linkage_test.rb +57 -0
- data/test/clusterers/diana_test.rb +69 -0
- data/test/clusterers/k_means_test.rb +100 -0
- data/test/clusterers/median_linkage_test.rb +53 -0
- data/test/clusterers/single_linkage_test.rb +122 -0
- data/test/clusterers/ward_linkage_hierarchical_test.rb +61 -0
- data/test/clusterers/ward_linkage_test.rb +53 -0
- data/test/clusterers/weighted_average_linkage_test.rb +53 -0
- data/test/data/data_set_test.rb +96 -0
- data/test/data/proximity_test.rb +81 -0
- data/test/data/statistics_test.rb +65 -0
- data/test/experiment/classifier_evaluator_test.rb +76 -0
- data/test/genetic_algorithm/chromosome_test.rb +58 -0
- data/test/genetic_algorithm/genetic_algorithm_test.rb +81 -0
- data/test/neural_network/backpropagation_test.rb +82 -0
- data/test/neural_network/hopfield_test.rb +72 -0
- data/test/som/som_test.rb +97 -0
- metadata +168 -0
@@ -0,0 +1,61 @@
|
|
1
|
+
# Author:: Sergio Fierens
|
2
|
+
# License:: MPL 1.1
|
3
|
+
# Project:: ai4r
|
4
|
+
# Url:: http://ai4r.rubyforge.org/
|
5
|
+
#
|
6
|
+
# You can redistribute it and/or modify it under the terms of
|
7
|
+
# the Mozilla Public License version 1.1 as published by the
|
8
|
+
# Mozilla Foundation at http://www.mozilla.org/MPL/MPL-1.1.txt
|
9
|
+
|
10
|
+
require File.dirname(__FILE__) + '/../data/parameterizable'
|
11
|
+
|
12
|
+
module Ai4r
|
13
|
+
module Clusterers
|
14
|
+
|
15
|
+
# The purpose of this class is to define a common API for Clusterers.
|
16
|
+
# All methods in this class (other than eval) must be implemented in
|
17
|
+
# subclasses.
|
18
|
+
class Clusterer
|
19
|
+
|
20
|
+
include Ai4r::Data::Parameterizable
|
21
|
+
|
22
|
+
# Abstract hook: build the clusterer from the examples in data_set,
# grouping the data items into "number_of_clusters" clusters.
#
# This base implementation always raises NotImplementedError.
def build(data_set, number_of_clusters)
  raise(NotImplementedError)
end
|
28
|
+
|
29
|
+
# Abstract hook: classify data_item and return the cluster it belongs to.
#
# This base implementation always raises NotImplementedError.
def eval(data_item)
  raise(NotImplementedError)
end
|
33
|
+
|
34
|
+
protected
|
35
|
+
# Useful as a default distance function for clustering algorithms.
#
# Returns the SQUARED Euclidean distance between a and b (no square root
# is taken). Positions are compared index by index, following the indexes
# of a; a position only contributes when both values are Numeric, any
# other position is skipped.
#
# a:: data item (array of attribute values)
# b:: data item compared position by position against a
#
# Returns a Float; 0.0 when no position is numeric in both items.
def euclidean_distance(a, b)
  a.each_index.inject(0.0) do |dist, index|
    x = a[index]
    y = b[index]
    # Non-numeric (or missing) attributes do not take part in the distance.
    next dist unless x.is_a?(Numeric) && y.is_a?(Numeric)
    diff = x - y
    dist + (diff * diff)
  end
end
|
45
|
+
|
46
|
+
# Returns the index of the smallest element of array.
#
# Elements are compared with "<", so on ties the first occurrence wins.
# An empty array yields 0.
def get_min_index(array)
  min_index = 0
  array.each_with_index do |value, i|
    min_index = i if value < array[min_index]
  end
  min_index
end
|
58
|
+
|
59
|
+
end
|
60
|
+
end
|
61
|
+
end
|
@@ -0,0 +1,67 @@
|
|
1
|
+
# Author:: Sergio Fierens (implementation)
|
2
|
+
# License:: MPL 1.1
|
3
|
+
# Project:: ai4r
|
4
|
+
# Url:: http://ai4r.rubyforge.org/
|
5
|
+
#
|
6
|
+
# You can redistribute it and/or modify it under the terms of
|
7
|
+
# the Mozilla Public License version 1.1 as published by the
|
8
|
+
# Mozilla Foundation at http://www.mozilla.org/MPL/MPL-1.1.txt
|
9
|
+
|
10
|
+
require File.dirname(__FILE__) + '/../data/data_set'
|
11
|
+
require File.dirname(__FILE__) + '/../clusterers/single_linkage'
|
12
|
+
|
13
|
+
module Ai4r
|
14
|
+
module Clusterers
|
15
|
+
|
16
|
+
# Implementation of a Hierarchical clusterer with complete linkage (Everitt
|
17
|
+
# et al., 2001 ; Jain and Dubes, 1988 ; Sorensen, 1948 ).
|
18
|
+
# Hierarchical clusterers create one cluster per element, and then
|
19
|
+
# progressively merge clusters, until the required number of clusters
|
20
|
+
# is reached.
|
21
|
+
# With complete linkage, the distance between two clusters is computed as
|
22
|
+
# the maximum distance between elements of each cluster.
|
23
|
+
#
|
24
|
+
# D(cx, (ci U cj)) = max(D(cx, ci), D(cx, cj))
|
25
|
+
class CompleteLinkage < SingleLinkage
|
26
|
+
|
27
|
+
parameters_info :distance_function =>
|
28
|
+
"Custom implementation of distance function. " +
|
29
|
+
"It must be a closure receiving two data items and return the " +
|
30
|
+
"distance bewteen them. By default, this algorithm uses " +
|
31
|
+
"ecuclidean distance of numeric attributes to the power of 2."
|
32
|
+
|
33
|
+
|
34
|
+
# Build a new clusterer from the examples found in data_set, grouping
# the items into "number_of_clusters" clusters.
#
# The work is delegated unchanged to the superclass implementation.
def build(data_set, number_of_clusters)
  super(data_set, number_of_clusters)
end
|
40
|
+
|
41
|
+
# Classifies the given data item, returning the (0-based) index of the
# cluster it belongs to.
#
# The work is delegated unchanged to the superclass implementation.
def eval(data_item)
  super(data_item)
end
|
46
|
+
|
47
|
+
protected
|
48
|
+
|
49
|
+
# Complete linkage: the distance between cluster cx and the merged
# cluster (ci U cj) is the larger of the stored distances cx-ci and cx-cj.
def linkage_distance(cx, ci, cj)
  distance_to_ci = read_distance_matrix(cx, ci)
  distance_to_cj = read_distance_matrix(cx, cj)
  [distance_to_ci, distance_to_cj].max
end
|
55
|
+
|
56
|
+
# Distance between a single data item and a cluster, using complete
# linkage: the largest distance (as given by @distance_function) between
# data_item and any item in cluster.data_items.
#
# Returns 0 when the cluster has no items (or no distance exceeds 0).
def distance_between_item_and_cluster(data_item, cluster)
  cluster.data_items.inject(0) do |max_dist, another_item|
    dist = @distance_function.call(data_item, another_item)
    dist > max_dist ? dist : max_dist
  end
end
|
64
|
+
|
65
|
+
end
|
66
|
+
end
|
67
|
+
end
|
@@ -0,0 +1,139 @@
|
|
1
|
+
# Author:: Sergio Fierens (implementation)
|
2
|
+
# License:: MPL 1.1
|
3
|
+
# Project:: ai4r
|
4
|
+
# Url:: http://ai4r.rubyforge.org/
|
5
|
+
#
|
6
|
+
# You can redistribute it and/or modify it under the terms of
|
7
|
+
# the Mozilla Public License version 1.1 as published by the
|
8
|
+
# Mozilla Foundation at http://www.mozilla.org/MPL/MPL-1.1.txt
|
9
|
+
|
10
|
+
require File.dirname(__FILE__) + '/../data/data_set'
|
11
|
+
require File.dirname(__FILE__) + '/../data/proximity'
|
12
|
+
require File.dirname(__FILE__) + '/../clusterers/clusterer'
|
13
|
+
|
14
|
+
module Ai4r
|
15
|
+
module Clusterers
|
16
|
+
|
17
|
+
# DIANA (Divisive ANAlysis) (Kaufman and Rousseeuw, 1990;
|
18
|
+
# Macnaughton - Smith et al. 1964) is a Divisive Hierarchical
|
19
|
+
# Clusterer. It begins with only one cluster with all data items,
|
20
|
+
# and divides the clusters until the desired clusters number is reached.
|
21
|
+
class Diana < Clusterer
|
22
|
+
|
23
|
+
attr_reader :data_set, :number_of_clusters, :clusters
|
24
|
+
|
25
|
+
parameters_info :distance_function =>
|
26
|
+
"Custom implementation of distance function. " +
|
27
|
+
"It must be a closure receiving two data items and return the " +
|
28
|
+
"distance bewteen them. By default, this algorithm uses " +
|
29
|
+
"ecuclidean distance of numeric attributes to the power of 2."
|
30
|
+
|
31
|
+
# Installs the default distance function: the numeric attributes of each
# item are selected (non-numeric ones are dropped) and handed to
# Ai4r::Data::Proximity.squared_euclidean_distance.
def initialize
  @distance_function = lambda do |a, b|
    numeric_a = a.select { |attribute| attribute.is_a?(Numeric) }
    numeric_b = b.select { |attribute| attribute.is_a?(Numeric) }
    Ai4r::Data::Proximity.squared_euclidean_distance(numeric_a, numeric_b)
  end
end
|
38
|
+
|
39
|
+
# Build a new clusterer, using divisive analysis (DIANA algorithm).
#
# Starts with a single cluster taken from data_set[0..-1] and keeps
# splitting until there are number_of_clusters clusters. On each round:
#
# 1. the cluster with the largest diameter is chosen;
# 2. a splinter cluster is seeded from it (see init_splinter_cluster);
# 3. items keep moving from the chosen cluster to the splinter cluster
#    for as long as max_distance_difference reports a non-negative
#    difference.
#
# Returns self.
def build(data_set, number_of_clusters)
  @data_set = data_set
  @number_of_clusters = number_of_clusters
  @clusters = [@data_set[0..-1]]

  while @clusters.length < @number_of_clusters
    cluster_to_split = @clusters[max_diameter_cluster(@clusters)]
    splinter_cluster = init_splinter_cluster(cluster_to_split)
    loop do
      dist_diff, index = max_distance_difference(cluster_to_split, splinter_cluster)
      # A negative difference means no remaining item is closer to the
      # splinter group than to its current cluster.
      break if dist_diff < 0
      splinter_cluster << cluster_to_split.data_items[index]
      cluster_to_split.data_items.delete_at(index)
    end
    @clusters << splinter_cluster
  end

  self
end
|
60
|
+
|
61
|
+
# Classifies the given data item, returning the (0-based) index of the
# cluster it belongs to: the cluster whose items have the smallest mean
# distance to data_item.
def eval(data_item)
  mean_distances = @clusters.map do |cluster|
    distance_sum(data_item, cluster) / cluster.data_items.length
  end
  get_min_index(mean_distances)
end
|
68
|
+
|
69
|
+
protected
|
70
|
+
|
71
|
+
# Returns the INDEX (within clusters) of the cluster with the largest
# diameter, as computed by cluster_diameter.
#
# On ties the first such cluster wins. Index 0 is returned when clusters
# is empty or when no diameter is greater than 0.
def max_diameter_cluster(clusters)
  max_index = 0
  max_diameter = 0
  clusters.each_with_index do |cluster, index|
    diameter = cluster_diameter(cluster)
    max_index, max_diameter = index, diameter if diameter > max_diameter
  end
  max_index
end
|
84
|
+
|
85
|
+
# Diameter of a cluster: the largest distance (per @distance_function)
# between any two of its items.
#
# Each unordered pair is evaluated once, calling the distance function
# with the later item first and the earlier item second.
# Returns 0 for clusters with fewer than two items.
def cluster_diameter(cluster)
  items = cluster.data_items
  diameter = 0
  items.each_with_index do |item_a, item_a_pos|
    item_a_pos.times do |item_b_pos|
      distance = @distance_function.call(item_a, items[item_b_pos])
      diameter = distance if distance > diameter
    end
  end
  diameter
end
|
96
|
+
|
97
|
+
# Create the splinter cluster from the item with the max summed distance
# to the items of cluster_to_split (first such item on ties; index 0 when
# no sum exceeds 0.0).
#
# That item is removed from cluster_to_split.data_items.
# Returns cluster_to_split[max_index] -- presumably a new cluster holding
# just that item; confirm against the cluster's [] implementation.
def init_splinter_cluster(cluster_to_split)
  max_sum = 0.0
  max_index = 0
  cluster_to_split.data_items.each_with_index do |item, index|
    sum = distance_sum(item, cluster_to_split)
    max_sum, max_index = sum, index if sum > max_sum
  end
  # Take the splinter BEFORE deleting, while max_index still points at it.
  splinter_cluster = cluster_to_split[max_index]
  cluster_to_split.data_items.delete_at(max_index)
  splinter_cluster
end
|
111
|
+
|
112
|
+
# For every item of cluster_to_split, computes its average distance to
# the rest of that cluster minus its average distance to the items of
# splinter_cluster, and returns the largest such difference together
# with the index of the item: [max_diff, max_diff_index].
#
# A positive value means that the item is closer to the splinter group
# than to its current cluster.
# When no item yields a comparable difference (e.g. cluster_to_split is
# empty), the result is [-Infinity, 0].
def max_distance_difference(cluster_to_split, splinter_cluster)
  # Both divisors are constant for the whole scan.
  other_items_count = cluster_to_split.data_items.length - 1
  splinter_size = splinter_cluster.data_items.length
  max_diff = -Float::INFINITY
  max_diff_index = 0
  cluster_to_split.data_items.each_with_index do |item, index|
    dist_a = distance_sum(item, cluster_to_split) / other_items_count
    dist_b = distance_sum(item, splinter_cluster) / splinter_size
    dist_diff = dist_a - dist_b
    max_diff, max_diff_index = dist_diff, index if dist_diff > max_diff
  end
  [max_diff, max_diff_index]
end
|
129
|
+
|
130
|
+
# Adds up the distances (per @distance_function) between item_a and
# every item in cluster.data_items. Returns 0.0 for an empty cluster.
def distance_sum(item_a, cluster)
  total = 0.0
  cluster.data_items.each do |item_b|
    total += @distance_function.call(item_a, item_b)
  end
  total
end
|
136
|
+
|
137
|
+
end
|
138
|
+
end
|
139
|
+
end
|
@@ -0,0 +1,126 @@
|
|
1
|
+
# Author:: Sergio Fierens (implementation)
|
2
|
+
# License:: MPL 1.1
|
3
|
+
# Project:: ai4r
|
4
|
+
# Url:: http://ai4r.rubyforge.org/
|
5
|
+
#
|
6
|
+
# You can redistribute it and/or modify it under the terms of
|
7
|
+
# the Mozilla Public License version 1.1 as published by the
|
8
|
+
# Mozilla Foundation at http://www.mozilla.org/MPL/MPL-1.1.txt
|
9
|
+
|
10
|
+
require File.dirname(__FILE__) + '/../data/data_set'
|
11
|
+
require File.dirname(__FILE__) + '/../clusterers/clusterer'
|
12
|
+
|
13
|
+
module Ai4r
|
14
|
+
module Clusterers
|
15
|
+
|
16
|
+
# The k-means algorithm is an algorithm to cluster n objects
|
17
|
+
# based on attributes into k partitions, with k < n.
|
18
|
+
#
|
19
|
+
# More about K Means algorithm:
|
20
|
+
# http://en.wikipedia.org/wiki/K-means_algorithm
|
21
|
+
class KMeans < Clusterer
|
22
|
+
|
23
|
+
attr_reader :data_set, :number_of_clusters
|
24
|
+
attr_reader :clusters, :centroids, :iterations
|
25
|
+
|
26
|
+
parameters_info :max_iterations => "Maximum number of iterations to " +
|
27
|
+
"build the clusterer. By default it is uncapped.",
|
28
|
+
:distance_function => "Custom implementation of distance function. " +
|
29
|
+
"It must be a closure receiving two data items and return the " +
|
30
|
+
"distance bewteen them. By default, this algorithm uses " +
|
31
|
+
"ecuclidean distance of numeric attributes to the power of 2.",
|
32
|
+
:centroid_function => "Custom implementation to calculate the " +
|
33
|
+
"centroid of a cluster. It must be a closure receiving an array of " +
|
34
|
+
"data sets, and return an array of data items, representing the " +
|
35
|
+
"centroids of for each data set. " +
|
36
|
+
"By default, this algorithm returns a data items using the mode "+
|
37
|
+
"or mean of each attribute on each data set."
|
38
|
+
|
39
|
+
# Sets the defaults: no custom distance function, no iteration cap, no
# previous centroids, and a centroid function that maps every data set
# to the result of its get_mean_or_mode.
def initialize
  @distance_function = @max_iterations = @old_centroids = nil
  @centroid_function = lambda do |data_sets|
    data_sets.map { |data_set| data_set.get_mean_or_mode }
  end
end
|
47
|
+
|
48
|
+
|
49
|
+
# Build a new clusterer, using data examples found in data_set.
# Items will be clustered in "number_of_clusters" different
# clusters.
#
# Picks the initial centroids, then alternates between assigning items
# to clusters and recomputing centroids until stop_criteria_met is true.
#
# Returns self.
def build(data_set, number_of_clusters)
  @data_set = data_set
  @number_of_clusters = number_of_clusters
  @iterations = 0
  # Forget centroids left over from a previous build on this instance;
  # otherwise they could equal the new seeds and end the loop before any
  # membership is computed.
  @old_centroids = nil

  calc_initial_centroids
  until stop_criteria_met
    calculate_membership_clusters
    recompute_centroids
  end

  self
end
|
65
|
+
|
66
|
+
# Classifies the given data item, returning the (0-based) index of the
# cluster it belongs to, i.e. the index of the closest centroid
# according to #distance.
def eval(data_item)
  distances = @centroids.map { |centroid| distance(data_item, centroid) }
  get_min_index(distances)
end
|
72
|
+
|
73
|
+
# Calculates the distance between 2 different instances. By default it
# returns the euclidean distance to the power of 2.
# You can provide a more convenient distance implementation by:
#
# 1- Overwriting this method
#
# 2- Providing a closure to the :distance_function parameter
def distance(a, b)
  if @distance_function
    @distance_function.call(a, b)
  else
    euclidean_distance(a, b)
  end
end
|
85
|
+
|
86
|
+
protected
|
87
|
+
|
88
|
+
# Picks the initial centroids by drawing random item indexes from the
# data set. An index is only considered once, and an item equal to an
# already chosen centroid is not added again. Drawing stops when enough
# centroids were found or every index has been tried.
#
# @number_of_clusters is then lowered to the number of centroids
# actually found.
def calc_initial_centroids
  @centroids = []
  tried_indexes = []
  while @centroids.length < @number_of_clusters &&
        tried_indexes.length < @data_set.data_items.length
    random_index = rand(@data_set.data_items.length)
    next if tried_indexes.include?(random_index)
    tried_indexes << random_index
    candidate = @data_set.data_items[random_index]
    @centroids << candidate unless @centroids.include?(candidate)
  end
  @number_of_clusters = @centroids.length
end
|
103
|
+
|
104
|
+
# Truthy when the centroids did not change in the last iteration, or
# when a @max_iterations cap is set and has been reached.
def stop_criteria_met
  centroids_unchanged = (@old_centroids == @centroids)
  return centroids_unchanged if centroids_unchanged
  @max_iterations && (@max_iterations <= @iterations)
end
|
108
|
+
|
109
|
+
# Rebuilds @clusters: one fresh DataSet per cluster (sharing the labels
# of the source data set), then appends every data item to the cluster
# selected for it by #eval.
def calculate_membership_clusters
  @clusters = Array.new(@number_of_clusters) do
    Ai4r::Data::DataSet.new(:data_labels => @data_set.data_labels)
  end
  @data_set.data_items.each do |data_item|
    cluster_index = eval(data_item)
    @clusters[cluster_index] << data_item
  end
end
|
117
|
+
|
118
|
+
# Counts one more iteration, remembers the current centroids as
# @old_centroids and replaces @centroids with the result of applying
# @centroid_function to the current clusters.
def recompute_centroids
  @iterations += 1
  @old_centroids = @centroids
  @centroids = @centroid_function.call(@clusters)
end
|
123
|
+
|
124
|
+
end
|
125
|
+
end
|
126
|
+
end
|
@@ -0,0 +1,61 @@
|
|
1
|
+
# Author:: Sergio Fierens (implementation)
|
2
|
+
# License:: MPL 1.1
|
3
|
+
# Project:: ai4r
|
4
|
+
# Url:: http://ai4r.rubyforge.org/
|
5
|
+
#
|
6
|
+
# You can redistribute it and/or modify it under the terms of
|
7
|
+
# the Mozilla Public License version 1.1 as published by the
|
8
|
+
# Mozilla Foundation at http://www.mozilla.org/MPL/MPL-1.1.txt
|
9
|
+
|
10
|
+
require File.dirname(__FILE__) + '/../data/data_set'
|
11
|
+
require File.dirname(__FILE__) + '/../clusterers/single_linkage'
|
12
|
+
|
13
|
+
module Ai4r
|
14
|
+
module Clusterers
|
15
|
+
|
16
|
+
# Implementation of an Agglomerative Hierarchical clusterer with
|
17
|
+
# median linkage algorithm, aka weighted pair group method centroid
|
18
|
+
# or WPGMC (Everitt et al., 2001 ; Gower, 1967 ; Jain and Dubes, 1988 ).
|
19
|
+
# Hierarchical clusterers create one cluster per element, and then
|
20
|
+
# progressively merge clusters, until the required number of clusters
|
21
|
+
# is reached.
|
22
|
+
# Similar to centroid linkages, but using fix weight:
|
23
|
+
#
|
24
|
+
# D(cx, (ci U cj)) = (1/2)*D(cx, ci) +
|
25
|
+
# (1/2)*D(cx, cj) -
|
26
|
+
# (1/4)*D(ci, cj)
|
27
|
+
class MedianLinkage < SingleLinkage
|
28
|
+
|
29
|
+
parameters_info :distance_function =>
|
30
|
+
"Custom implementation of distance function. " +
|
31
|
+
"It must be a closure receiving two data items and return the " +
|
32
|
+
"distance bewteen them. By default, this algorithm uses " +
|
33
|
+
"ecuclidean distance of numeric attributes to the power of 2."
|
34
|
+
|
35
|
+
# Build a new clusterer from the examples found in data_set, grouping
# the items into "number_of_clusters" clusters.
#
# The work is delegated unchanged to the superclass implementation.
def build(data_set, number_of_clusters)
  super(data_set, number_of_clusters)
end
|
41
|
+
|
42
|
+
# This algorithm does not allow classification of new data items
# once it has been built. Rebuild the clusterer including your data
# element instead.
#
# Always raises RuntimeError.
def eval(data_item)
  raise "Eval of new data is not supported by this algorithm."
end
|
47
|
+
|
48
|
+
protected
|
49
|
+
|
50
|
+
# Median linkage: distance between cluster cx and the merged cluster
# (ci U cj), computed from the stored distances as
#
#   (1/2)*D(cx, ci) + (1/2)*D(cx, cj) - (1/4)*D(ci, cj)
def linkage_distance(cx, ci, cj)
  half_cx_ci = 0.5 * read_distance_matrix(cx, ci)
  half_cx_cj = 0.5 * read_distance_matrix(cx, cj)
  quarter_ci_cj = 0.25 * read_distance_matrix(ci, cj)
  half_cx_ci + half_cx_cj - quarter_ci_cj
end
|
57
|
+
|
58
|
+
end
|
59
|
+
end
|
60
|
+
end
|
61
|
+
|