ai4ruby 1.11
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.rdoc +47 -0
- data/examples/classifiers/id3_data.csv +121 -0
- data/examples/classifiers/id3_example.rb +29 -0
- data/examples/classifiers/naive_bayes_data.csv +11 -0
- data/examples/classifiers/naive_bayes_example.rb +16 -0
- data/examples/classifiers/results.txt +31 -0
- data/examples/genetic_algorithm/genetic_algorithm_example.rb +37 -0
- data/examples/genetic_algorithm/travel_cost.csv +16 -0
- data/examples/neural_network/backpropagation_example.rb +67 -0
- data/examples/neural_network/patterns_with_base_noise.rb +68 -0
- data/examples/neural_network/patterns_with_noise.rb +66 -0
- data/examples/neural_network/training_patterns.rb +68 -0
- data/examples/neural_network/xor_example.rb +35 -0
- data/examples/som/som_data.rb +156 -0
- data/examples/som/som_multi_node_example.rb +22 -0
- data/examples/som/som_single_example.rb +24 -0
- data/lib/ai4r.rb +33 -0
- data/lib/ai4r/classifiers/classifier.rb +62 -0
- data/lib/ai4r/classifiers/hyperpipes.rb +118 -0
- data/lib/ai4r/classifiers/ib1.rb +121 -0
- data/lib/ai4r/classifiers/id3.rb +326 -0
- data/lib/ai4r/classifiers/multilayer_perceptron.rb +135 -0
- data/lib/ai4r/classifiers/naive_bayes.rb +259 -0
- data/lib/ai4r/classifiers/one_r.rb +110 -0
- data/lib/ai4r/classifiers/prism.rb +197 -0
- data/lib/ai4r/classifiers/zero_r.rb +73 -0
- data/lib/ai4r/clusterers/average_linkage.rb +59 -0
- data/lib/ai4r/clusterers/bisecting_k_means.rb +93 -0
- data/lib/ai4r/clusterers/centroid_linkage.rb +66 -0
- data/lib/ai4r/clusterers/clusterer.rb +61 -0
- data/lib/ai4r/clusterers/complete_linkage.rb +67 -0
- data/lib/ai4r/clusterers/diana.rb +139 -0
- data/lib/ai4r/clusterers/k_means.rb +126 -0
- data/lib/ai4r/clusterers/median_linkage.rb +61 -0
- data/lib/ai4r/clusterers/single_linkage.rb +194 -0
- data/lib/ai4r/clusterers/ward_linkage.rb +64 -0
- data/lib/ai4r/clusterers/ward_linkage_hierarchical.rb +31 -0
- data/lib/ai4r/clusterers/weighted_average_linkage.rb +61 -0
- data/lib/ai4r/data/data_set.rb +266 -0
- data/lib/ai4r/data/parameterizable.rb +64 -0
- data/lib/ai4r/data/proximity.rb +100 -0
- data/lib/ai4r/data/statistics.rb +77 -0
- data/lib/ai4r/experiment/classifier_evaluator.rb +95 -0
- data/lib/ai4r/genetic_algorithm/genetic_algorithm.rb +270 -0
- data/lib/ai4r/neural_network/backpropagation.rb +326 -0
- data/lib/ai4r/neural_network/hopfield.rb +149 -0
- data/lib/ai4r/som/layer.rb +68 -0
- data/lib/ai4r/som/node.rb +96 -0
- data/lib/ai4r/som/som.rb +155 -0
- data/lib/ai4r/som/two_phase_layer.rb +90 -0
- data/test/classifiers/hyperpipes_test.rb +84 -0
- data/test/classifiers/ib1_test.rb +78 -0
- data/test/classifiers/id3_test.rb +208 -0
- data/test/classifiers/multilayer_perceptron_test.rb +79 -0
- data/test/classifiers/naive_bayes_test.rb +43 -0
- data/test/classifiers/one_r_test.rb +62 -0
- data/test/classifiers/prism_test.rb +85 -0
- data/test/classifiers/zero_r_test.rb +49 -0
- data/test/clusterers/average_linkage_test.rb +51 -0
- data/test/clusterers/bisecting_k_means_test.rb +66 -0
- data/test/clusterers/centroid_linkage_test.rb +53 -0
- data/test/clusterers/complete_linkage_test.rb +57 -0
- data/test/clusterers/diana_test.rb +69 -0
- data/test/clusterers/k_means_test.rb +100 -0
- data/test/clusterers/median_linkage_test.rb +53 -0
- data/test/clusterers/single_linkage_test.rb +122 -0
- data/test/clusterers/ward_linkage_hierarchical_test.rb +61 -0
- data/test/clusterers/ward_linkage_test.rb +53 -0
- data/test/clusterers/weighted_average_linkage_test.rb +53 -0
- data/test/data/data_set_test.rb +96 -0
- data/test/data/proximity_test.rb +81 -0
- data/test/data/statistics_test.rb +65 -0
- data/test/experiment/classifier_evaluator_test.rb +76 -0
- data/test/genetic_algorithm/chromosome_test.rb +58 -0
- data/test/genetic_algorithm/genetic_algorithm_test.rb +81 -0
- data/test/neural_network/backpropagation_test.rb +82 -0
- data/test/neural_network/hopfield_test.rb +72 -0
- data/test/som/som_test.rb +97 -0
- metadata +168 -0
@@ -0,0 +1,61 @@
|
|
1
|
+
# Author:: Sergio Fierens
|
2
|
+
# License:: MPL 1.1
|
3
|
+
# Project:: ai4r
|
4
|
+
# Url:: http://ai4r.rubyforge.org/
|
5
|
+
#
|
6
|
+
# You can redistribute it and/or modify it under the terms of
|
7
|
+
# the Mozilla Public License version 1.1 as published by the
|
8
|
+
# Mozilla Foundation at http://www.mozilla.org/MPL/MPL-1.1.txt
|
9
|
+
|
10
|
+
require File.dirname(__FILE__) + '/../data/parameterizable'
|
11
|
+
|
12
|
+
module Ai4r
|
13
|
+
module Clusterers
|
14
|
+
|
15
|
+
# The purpose of this class is to define a common API for Clusterers.
|
16
|
+
# All methods in this class (other than eval) must be implemented in
|
17
|
+
# subclasses.
|
18
|
+
class Clusterer
|
19
|
+
|
20
|
+
include Ai4r::Data::Parameterizable
|
21
|
+
|
22
|
+
# Build a new clusterer, using data examples found in data_set.
|
23
|
+
# Data items will be clustered in "number_of_clusters" different
|
24
|
+
# clusters.
|
25
|
+
def build(data_set, number_of_clusters)
  # Abstract hook: this base implementation only signals that the concrete
  # clusterer has not provided its own build.
  #
  # data_set::           the examples to cluster (unused here).
  # number_of_clusters:: how many clusters are requested (unused here).
  #
  # Raises NotImplementedError, now with a message naming the class that
  # lacks the implementation. NotImplementedError descends from ScriptError,
  # so a bare `rescue` does not catch it.
  raise NotImplementedError, "#{self.class} must implement #build"
end
|
28
|
+
|
29
|
+
# Classifies the given data item, returning the cluster it belongs to.
|
30
|
+
def eval(data_item)
  # Abstract hook: this base implementation only signals that the concrete
  # clusterer has not provided its own eval.
  #
  # data_item:: the item to classify (unused here).
  #
  # Raises NotImplementedError, now with a message naming the class that
  # lacks the implementation. NotImplementedError descends from ScriptError,
  # so a bare `rescue` does not catch it.
  raise NotImplementedError, "#{self.class} must implement #eval"
end
|
33
|
+
|
34
|
+
protected
|
35
|
+
# Useful as a default distance function for clustering algorithms
|
36
|
+
def euclidean_distance(a, b)
  # Adds up the squared difference at every position where both items hold
  # a Numeric; any other position is skipped. No square root is taken, so
  # the value returned is the squared distance, accumulated from 0.0.
  a.each_with_index.inject(0.0) do |total, (left, position)|
    next total unless left.is_a?(Numeric) && b[position].is_a?(Numeric)
    delta = left - b[position]
    total + (delta * delta)
  end
end
|
45
|
+
|
46
|
+
def get_min_index(array)
  # Returns the position of the smallest element. Only a strictly smaller
  # value replaces the current candidate, so the first of several equal
  # minima wins; an empty array yields 0.
  smallest = array.first
  position = 0
  array.each_with_index do |value, i|
    next unless value < smallest
    smallest = value
    position = i
  end
  position
end
|
58
|
+
|
59
|
+
end
|
60
|
+
end
|
61
|
+
end
|
@@ -0,0 +1,67 @@
|
|
1
|
+
# Author:: Sergio Fierens (implementation)
|
2
|
+
# License:: MPL 1.1
|
3
|
+
# Project:: ai4r
|
4
|
+
# Url:: http://ai4r.rubyforge.org/
|
5
|
+
#
|
6
|
+
# You can redistribute it and/or modify it under the terms of
|
7
|
+
# the Mozilla Public License version 1.1 as published by the
|
8
|
+
# Mozilla Foundation at http://www.mozilla.org/MPL/MPL-1.1.txt
|
9
|
+
|
10
|
+
require File.dirname(__FILE__) + '/../data/data_set'
|
11
|
+
require File.dirname(__FILE__) + '/../clusterers/single_linkage'
|
12
|
+
|
13
|
+
module Ai4r
|
14
|
+
module Clusterers
|
15
|
+
|
16
|
+
# Implementation of a Hierarchical clusterer with complete linkage (Everitt
|
17
|
+
# et al., 2001 ; Jain and Dubes, 1988 ; Sorensen, 1948 ).
|
18
|
+
# Hierarchical clusterers create one cluster per element, and then
|
19
|
+
# progressively merge clusters, until the required number of clusters
|
20
|
+
# is reached.
|
21
|
+
# With complete linkage, the distance between two clusters is computed as
|
22
|
+
# the maximum distance between elements of each cluster.
|
23
|
+
#
|
24
|
+
# D(cx, (ci U cj)) = max(D(cx, ci), D(cx, cj))
|
25
|
+
class CompleteLinkage < SingleLinkage
|
26
|
+
|
27
|
+
parameters_info :distance_function =>
|
28
|
+
"Custom implementation of distance function. " +
|
29
|
+
"It must be a closure receiving two data items and return the " +
|
30
|
+
"distance bewteen them. By default, this algorithm uses " +
|
31
|
+
"ecuclidean distance of numeric attributes to the power of 2."
|
32
|
+
|
33
|
+
|
34
|
+
# Build a new clusterer, using data examples found in data_set.
|
35
|
+
# Items will be clustered in "number_of_clusters" different
|
36
|
+
# clusters.
|
37
|
+
def build(data_set, number_of_clusters)
  # Nothing is done here beyond forwarding both arguments, unchanged, to
  # the inherited implementation.
  super(data_set, number_of_clusters)
end
|
40
|
+
|
41
|
+
# Classifies the given data item, returning the cluster index it belongs
|
42
|
+
# to (0-based).
|
43
|
+
def eval(data_item)
  # Forwards the item, unchanged, to the inherited implementation and
  # returns whatever it returns.
  super(data_item)
end
|
46
|
+
|
47
|
+
protected
|
48
|
+
|
49
|
+
# return distance between cluster cx and new cluster (ci U cj),
|
50
|
+
# using complete linkage
|
51
|
+
def linkage_distance(cx, ci, cj)
  # Complete linkage: the distance from cx to the merged cluster is the
  # larger of the two values read_distance_matrix gives for (cx, ci) and
  # (cx, cj).
  to_ci = read_distance_matrix(cx, ci)
  to_cj = read_distance_matrix(cx, cj)
  [to_ci, to_cj].max
end
|
55
|
+
|
56
|
+
def distance_between_item_and_cluster(data_item, cluster)
  # The distance from an item to a cluster is the largest distance between
  # the item and any member of the cluster. The search starts from 0, which
  # is also the result when no member is farther than that.
  cluster.data_items.inject(0) do |farthest, member|
    candidate = @distance_function.call(data_item, member)
    candidate > farthest ? candidate : farthest
  end
end
|
64
|
+
|
65
|
+
end
|
66
|
+
end
|
67
|
+
end
|
@@ -0,0 +1,139 @@
|
|
1
|
+
# Author:: Sergio Fierens (implementation)
|
2
|
+
# License:: MPL 1.1
|
3
|
+
# Project:: ai4r
|
4
|
+
# Url:: http://ai4r.rubyforge.org/
|
5
|
+
#
|
6
|
+
# You can redistribute it and/or modify it under the terms of
|
7
|
+
# the Mozilla Public License version 1.1 as published by the
|
8
|
+
# Mozilla Foundation at http://www.mozilla.org/MPL/MPL-1.1.txt
|
9
|
+
|
10
|
+
require File.dirname(__FILE__) + '/../data/data_set'
|
11
|
+
require File.dirname(__FILE__) + '/../data/proximity'
|
12
|
+
require File.dirname(__FILE__) + '/../clusterers/clusterer'
|
13
|
+
|
14
|
+
module Ai4r
|
15
|
+
module Clusterers
|
16
|
+
|
17
|
+
# DIANA (Divisive ANAlysis) (Kaufman and Rousseeuw, 1990;
|
18
|
+
# Macnaughton - Smith et al. 1964) is a Divisive Hierarchical
|
19
|
+
# Clusterer. It begins with only one cluster with all data items,
|
20
|
+
# and divides the clusters until the desired clusters number is reached.
|
21
|
+
class Diana < Clusterer
|
22
|
+
|
23
|
+
attr_reader :data_set, :number_of_clusters, :clusters
|
24
|
+
|
25
|
+
parameters_info :distance_function =>
|
26
|
+
"Custom implementation of distance function. " +
|
27
|
+
"It must be a closure receiving two data items and return the " +
|
28
|
+
"distance bewteen them. By default, this algorithm uses " +
|
29
|
+
"ecuclidean distance of numeric attributes to the power of 2."
|
30
|
+
|
31
|
+
def initialize
  # Installs the default metric: every attribute that is not a Numeric is
  # dropped from both items, and what remains is handed to
  # Proximity.squared_euclidean_distance.
  numeric_only = lambda do |item|
    item.select { |attribute| attribute.is_a?(Numeric) }
  end
  @distance_function = lambda do |a, b|
    Ai4r::Data::Proximity.squared_euclidean_distance(
      numeric_only.call(a), numeric_only.call(b))
  end
end
|
38
|
+
|
39
|
+
# Build a new clusterer, using divisive analysis (DIANA algorithm)
|
40
|
+
def build(data_set, number_of_clusters)
  # Divisive clustering: begin with one cluster obtained from
  # @data_set[0..-1] (presumably a copy of the whole set -- confirm against
  # DataSet#[]), then repeatedly split the cluster with the largest diameter
  # until the requested number of clusters exists. Returns self.
  @data_set = data_set
  @number_of_clusters = number_of_clusters
  @clusters = [@data_set[0..-1]]

  while @clusters.length < @number_of_clusters
    widest = @clusters[max_diameter_cluster(@clusters)]
    splinter = init_splinter_cluster(widest)
    # Keep moving the best candidate into the splinter group for as long as
    # its distance difference is not negative.
    loop do
      gain, position = max_distance_difference(widest, splinter)
      break if gain < 0
      splinter << widest.data_items[position]
      widest.data_items.delete_at(position)
    end
    @clusters << splinter
  end

  self
end
|
60
|
+
|
61
|
+
# Classifies the given data item, returning the cluster index it belongs
|
62
|
+
# to (0-based).
|
63
|
+
def eval(data_item)
  # Scores every cluster by the summed distance from the item to its
  # members divided by the member count, then returns the position of the
  # lowest score.
  mean_distances = @clusters.map do |cluster|
    distance_sum(data_item, cluster) / cluster.data_items.length
  end
  get_min_index(mean_distances)
end
|
68
|
+
|
69
|
+
protected
|
70
|
+
|
71
|
+
# return the cluster with max diameter
|
72
|
+
def max_diameter_cluster(clusters)
  # Walks the clusters in order, carrying the [index, diameter] of the
  # widest one seen so far. Only a strictly larger diameter replaces it, so
  # ties keep the earliest index and 0 is returned when no diameter exceeds
  # 0.
  winner = clusters.each_with_index.inject([0, 0]) do |(best_index, best_width), (cluster, index)|
    width = cluster_diameter(cluster)
    width > best_width ? [index, width] : [best_index, best_width]
  end
  winner.first
end
|
84
|
+
|
85
|
+
# Max distance between 2 items in a cluster
|
86
|
+
def cluster_diameter(cluster)
  # Measures every item against each item that precedes it, so each
  # unordered pair is evaluated once (later item first, earlier item
  # second). The largest distance above the starting value 0 is returned.
  members = cluster.data_items
  widest = 0
  members.each_with_index do |later, position|
    members.first(position).each do |earlier|
      span = @distance_function.call(later, earlier)
      widest = span if span > widest
    end
  end
  widest
end
|
96
|
+
|
97
|
+
# Create a cluster with the item with max distance
|
98
|
+
# to the rest of the cluster's items.
|
99
|
+
# That item is removed from the initial cluster.
|
100
|
+
def init_splinter_cluster(cluster_to_split)
  # Finds the item whose summed distance to the cluster is largest (the
  # first item when no sum exceeds 0.0), obtains a new cluster for it via
  # cluster_to_split[index], removes it from the original cluster and
  # returns the new cluster.
  farthest_sum = 0.0
  farthest_position = 0
  cluster_to_split.data_items.each_with_index do |item, position|
    total = distance_sum(item, cluster_to_split)
    next unless total > farthest_sum
    farthest_sum = total
    farthest_position = position
  end
  seed_cluster = cluster_to_split[farthest_position]
  cluster_to_split.data_items.delete_at(farthest_position)
  seed_cluster
end
|
111
|
+
|
112
|
+
# Return the max average distance between any item of
|
113
|
+
# cluster_to_split and the rest of items in that cluster,
|
114
|
+
# minus the average distance with the items of splinter_cluster,
|
115
|
+
# and the index of the item.
|
116
|
+
# A positive value means that the item is closer to the
|
117
|
+
# splinter group than to its current cluster.
|
118
|
+
def max_distance_difference(cluster_to_split, splinter_cluster)
  # For each item still in cluster_to_split, computes
  #   (summed distance to its own cluster / (cluster size - 1))
  #   minus (summed distance to the splinter cluster / splinter size).
  # Returns [largest difference, index of that item]; the pair is
  # [-Infinity, 0] when no difference compares greater than -Infinity.
  best_diff = -1.0 / 0
  best_index = 0
  cluster_to_split.data_items.each_with_index do |item, index|
    own_mean = distance_sum(item, cluster_to_split) /
               (cluster_to_split.data_items.length - 1)
    splinter_mean = distance_sum(item, splinter_cluster) /
                    splinter_cluster.data_items.length
    diff = own_mean - splinter_mean
    next unless diff > best_diff
    best_diff = diff
    best_index = index
  end
  [best_diff, best_index]
end
|
129
|
+
|
130
|
+
# Sum up the distance between an item and all the items in a cluster
|
131
|
+
def distance_sum(item_a, cluster)
  # Adds the distance from item_a to each member of the cluster, starting
  # from 0.0 (which is what an empty cluster yields).
  total = 0.0
  cluster.data_items.each do |item_b|
    total += @distance_function.call(item_a, item_b)
  end
  total
end
|
136
|
+
|
137
|
+
end
|
138
|
+
end
|
139
|
+
end
|
@@ -0,0 +1,126 @@
|
|
1
|
+
# Author:: Sergio Fierens (implementation)
|
2
|
+
# License:: MPL 1.1
|
3
|
+
# Project:: ai4r
|
4
|
+
# Url:: http://ai4r.rubyforge.org/
|
5
|
+
#
|
6
|
+
# You can redistribute it and/or modify it under the terms of
|
7
|
+
# the Mozilla Public License version 1.1 as published by the
|
8
|
+
# Mozilla Foundation at http://www.mozilla.org/MPL/MPL-1.1.txt
|
9
|
+
|
10
|
+
require File.dirname(__FILE__) + '/../data/data_set'
|
11
|
+
require File.dirname(__FILE__) + '/../clusterers/clusterer'
|
12
|
+
|
13
|
+
module Ai4r
|
14
|
+
module Clusterers
|
15
|
+
|
16
|
+
# The k-means algorithm is an algorithm to cluster n objects
|
17
|
+
# based on attributes into k partitions, with k < n.
|
18
|
+
#
|
19
|
+
# More about K Means algorithm:
|
20
|
+
# http://en.wikipedia.org/wiki/K-means_algorithm
|
21
|
+
class KMeans < Clusterer
|
22
|
+
|
23
|
+
attr_reader :data_set, :number_of_clusters
|
24
|
+
attr_reader :clusters, :centroids, :iterations
|
25
|
+
|
26
|
+
parameters_info :max_iterations => "Maximum number of iterations to " +
|
27
|
+
"build the clusterer. By default it is uncapped.",
|
28
|
+
:distance_function => "Custom implementation of distance function. " +
|
29
|
+
"It must be a closure receiving two data items and return the " +
|
30
|
+
"distance bewteen them. By default, this algorithm uses " +
|
31
|
+
"ecuclidean distance of numeric attributes to the power of 2.",
|
32
|
+
:centroid_function => "Custom implementation to calculate the " +
|
33
|
+
"centroid of a cluster. It must be a closure receiving an array of " +
|
34
|
+
"data sets, and return an array of data items, representing the " +
|
35
|
+
"centroids of for each data set. " +
|
36
|
+
"By default, this algorithm returns a data items using the mode "+
|
37
|
+
"or mean of each attribute on each data set."
|
38
|
+
|
39
|
+
def initialize
  # Starts with no custom metric, no iteration cap and no remembered
  # centroids.
  @distance_function, @max_iterations, @old_centroids = nil, nil, nil
  # Default centroid function: one entry per data set, each being whatever
  # that data set's get_mean_or_mode returns.
  @centroid_function = lambda do |data_sets|
    data_sets.map { |set| set.get_mean_or_mode }
  end
end
|
47
|
+
|
48
|
+
|
49
|
+
# Build a new clusterer, using data examples found in data_set.
|
50
|
+
# Items will be clustered in "number_of_clusters" different
|
51
|
+
# clusters.
|
52
|
+
def build(data_set, number_of_clusters)
  # Runs k-means on data_set: picks initial centroids, then alternates
  # between assigning items to clusters and recomputing centroids until
  # stop_criteria_met reports completion.
  #
  # data_set::           the data set whose items are clustered.
  # number_of_clusters:: requested cluster count (calc_initial_centroids
  #                      may lower it).
  #
  # Returns self.
  @data_set = data_set
  @number_of_clusters = number_of_clusters
  @iterations = 0
  # Forget the centroids remembered by an earlier build on this instance.
  # Otherwise the stop criterion could already hold against the fresh
  # initial centroids, and the loop would end before any cluster
  # membership is computed for the new data.
  @old_centroids = nil

  calc_initial_centroids
  until stop_criteria_met
    calculate_membership_clusters
    recompute_centroids
  end

  self
end
|
65
|
+
|
66
|
+
# Classifies the given data item, returning the cluster index it belongs
|
67
|
+
# to (0-based).
|
68
|
+
def eval(data_item)
  # Measures the item against every centroid and returns the position of
  # the smallest distance.
  distances = @centroids.map { |centroid| distance(data_item, centroid) }
  get_min_index(distances)
end
|
72
|
+
|
73
|
+
# This function calculates the distance between 2 different
|
74
|
+
# instances. By default, it returns the euclidean distance to the
|
75
|
+
# power of 2.
|
76
|
+
# You can provide a more convenient distance implementation:
|
77
|
+
#
|
78
|
+
# 1- Overwriting this method
|
79
|
+
#
|
80
|
+
# 2- Providing a closure to the :distance_function parameter
|
81
|
+
def distance(a, b)
  # A custom @distance_function, when one is set, takes precedence;
  # otherwise the inherited euclidean_distance helper is used.
  if @distance_function
    @distance_function.call(a, b)
  else
    euclidean_distance(a, b)
  end
end
|
85
|
+
|
86
|
+
protected
|
87
|
+
|
88
|
+
def calc_initial_centroids
  # Draws random item positions, never processing the same position twice,
  # and adopts each drawn item as a centroid unless an equal item has
  # already been adopted. Stops once enough centroids exist or every
  # position has been tried; @number_of_clusters is then set to the number
  # of centroids actually found.
  @centroids = []
  visited = []
  until @centroids.length >= @number_of_clusters ||
        visited.length >= @data_set.data_items.length
    pick = rand(@data_set.data_items.length)
    next if visited.include?(pick)
    visited << pick
    candidate = @data_set.data_items[pick]
    @centroids << candidate unless @centroids.include?(candidate)
  end
  @number_of_clusters = @centroids.length
end
|
103
|
+
|
104
|
+
def stop_criteria_met
  # Done when the centroids did not change since the previous round...
  unchanged = (@old_centroids == @centroids)
  return unchanged if unchanged
  # ...or when an iteration cap is set and has been reached. With no cap
  # this evaluates to nil.
  @max_iterations && (@max_iterations <= @iterations)
end
|
108
|
+
|
109
|
+
def calculate_membership_clusters
  # Rebuilds @clusters as one fresh DataSet per cluster, each created with
  # the source data labels, then appends every item to the cluster whose
  # index eval returns for it.
  @clusters = Array.new(@number_of_clusters) do
    Ai4r::Data::DataSet.new(:data_labels => @data_set.data_labels)
  end
  @data_set.data_items.each do |data_item|
    owner = @clusters[eval(data_item)]
    owner << data_item
  end
end
|
117
|
+
|
118
|
+
def recompute_centroids
  # Remember the current centroids so the next stop check can compare
  # against them, count this round, then ask @centroid_function for the
  # centroids of the current clusters.
  @old_centroids = @centroids
  @iterations = @iterations + 1
  refreshed = @centroid_function.call(@clusters)
  @centroids = refreshed
end
|
123
|
+
|
124
|
+
end
|
125
|
+
end
|
126
|
+
end
|
@@ -0,0 +1,61 @@
|
|
1
|
+
# Author:: Sergio Fierens (implementation)
|
2
|
+
# License:: MPL 1.1
|
3
|
+
# Project:: ai4r
|
4
|
+
# Url:: http://ai4r.rubyforge.org/
|
5
|
+
#
|
6
|
+
# You can redistribute it and/or modify it under the terms of
|
7
|
+
# the Mozilla Public License version 1.1 as published by the
|
8
|
+
# Mozilla Foundation at http://www.mozilla.org/MPL/MPL-1.1.txt
|
9
|
+
|
10
|
+
require File.dirname(__FILE__) + '/../data/data_set'
|
11
|
+
require File.dirname(__FILE__) + '/../clusterers/single_linkage'
|
12
|
+
|
13
|
+
module Ai4r
|
14
|
+
module Clusterers
|
15
|
+
|
16
|
+
# Implementation of an Agglomerative Hierarchical clusterer with
|
17
|
+
# median linkage algorithm, aka weighted pair group method centroid
|
18
|
+
# or WPGMC (Everitt et al., 2001 ; Gower, 1967 ; Jain and Dubes, 1988 ).
|
19
|
+
# Hierarchical clusterers create one cluster per element, and then
|
20
|
+
# progressively merge clusters, until the required number of clusters
|
21
|
+
# is reached.
|
22
|
+
# Similar to centroid linkage, but using fixed weights:
|
23
|
+
#
|
24
|
+
# D(cx, (ci U cj)) = (1/2)*D(cx, ci) +
|
25
|
+
# (1/2)*D(cx, cj) -
|
26
|
+
# (1/4)*D(ci, cj)
|
27
|
+
class MedianLinkage < SingleLinkage
|
28
|
+
|
29
|
+
parameters_info :distance_function =>
|
30
|
+
"Custom implementation of distance function. " +
|
31
|
+
"It must be a closure receiving two data items and return the " +
|
32
|
+
"distance bewteen them. By default, this algorithm uses " +
|
33
|
+
"ecuclidean distance of numeric attributes to the power of 2."
|
34
|
+
|
35
|
+
# Build a new clusterer, using data examples found in data_set.
|
36
|
+
# Items will be clustered in "number_of_clusters" different
|
37
|
+
# clusters.
|
38
|
+
def build(data_set, number_of_clusters)
  # Nothing is done here beyond forwarding both arguments, unchanged, to
  # the inherited implementation.
  super(data_set, number_of_clusters)
end
|
41
|
+
|
42
|
+
# This algorithm does not allow classification of new data items
|
43
|
+
# once it has been built. Rebuild the cluster including your data element.
|
44
|
+
def eval(data_item)
  # Classifying a new item is deliberately unsupported for this clusterer.
  #
  # data_item:: ignored.
  #
  # Always raises RuntimeError with the message below. The keyword must be
  # lowercase `raise`: the capitalised form is looked up as a method named
  # Raise, which is not defined, so callers received a NoMethodError
  # instead of the intended message.
  raise "Eval of new data is not supported by this algorithm."
end
|
47
|
+
|
48
|
+
protected
|
49
|
+
|
50
|
+
# return distance between cluster cx and cluster (ci U cj),
|
51
|
+
# using median linkage
|
52
|
+
def linkage_distance(cx, ci, cj)
  # Median linkage update: half of each distance from cx to the two merged
  # clusters, minus a quarter of the distance between those two clusters,
  # all taken from read_distance_matrix.
  to_ci = read_distance_matrix(cx, ci)
  to_cj = read_distance_matrix(cx, cj)
  between = read_distance_matrix(ci, cj)
  (0.5 * to_ci) + (0.5 * to_cj) - (0.25 * between)
end
|
57
|
+
|
58
|
+
end
|
59
|
+
end
|
60
|
+
end
|
61
|
+
|