ai4r 1.5 → 1.6
Sign up to get free protection for your applications and to get access to all the features.
- data/examples/clusterers/simple_website_clustering.rb +47 -0
- data/lib/ai4r.rb +7 -0
- data/lib/ai4r/classifiers/hyperpipes.rb +118 -0
- data/lib/ai4r/clusterers/average_linkage.rb +22 -23
- data/lib/ai4r/clusterers/centroid_linkage.rb +66 -0
- data/lib/ai4r/clusterers/complete_linkage.rb +17 -12
- data/lib/ai4r/clusterers/diana.rb +139 -0
- data/lib/ai4r/clusterers/median_linkage.rb +61 -0
- data/lib/ai4r/clusterers/single_linkage.rb +57 -42
- data/lib/ai4r/clusterers/ward_linkage.rb +64 -0
- data/lib/ai4r/clusterers/weighted_average_linkage.rb +61 -0
- data/lib/ai4r/data/constants.rb +18 -0
- data/lib/ai4r/data/data_set.rb +5 -3
- data/lib/ai4r/data/proximity.rb +18 -0
- data/test/clusterers/average_linkage_test.rb +14 -11
- data/test/clusterers/bisecting_k_means_test.rb +9 -0
- data/test/clusterers/centroid_linkage_test.rb +50 -0
- data/test/clusterers/complete_linkage_test.rb +14 -5
- data/test/clusterers/diana_test.rb +69 -0
- data/test/clusterers/k_means_test.rb +9 -0
- data/test/clusterers/median_linkage_test.rb +50 -0
- data/test/clusterers/single_linkage_test.rb +15 -6
- data/test/clusterers/ward_linkage_test.rb +50 -0
- data/test/clusterers/weighted_average_linkage_test.rb +50 -0
- data/test/data/data_set_test.rb +14 -0
- data/test/data/proximity_test.rb +10 -0
- metadata +87 -298
- data/site/build/site/en/broken-links.xml +0 -2
- data/site/build/site/en/build/tmp/build-info.xml +0 -5
- data/site/build/site/en/build/tmp/plugins-1.xml +0 -212
- data/site/build/site/en/build/tmp/plugins-2.xml +0 -252
- data/site/build/site/en/build/tmp/projfilters.properties +0 -41
- data/site/build/site/en/downloads.html +0 -200
- data/site/build/site/en/downloads.pdf +0 -151
- data/site/build/site/en/geneticAlgorithms.html +0 -591
- data/site/build/site/en/geneticAlgorithms.pdf +0 -934
- data/site/build/site/en/images/ai4r-logo.png +0 -0
- data/site/build/site/en/images/built-with-forrest-button.png +0 -0
- data/site/build/site/en/images/c.png +0 -0
- data/site/build/site/en/images/c_wbn.png +0 -0
- data/site/build/site/en/images/c_wn.png +0 -0
- data/site/build/site/en/images/ero.gif +0 -0
- data/site/build/site/en/images/europe2.png +0 -0
- data/site/build/site/en/images/europe3.png +0 -0
- data/site/build/site/en/images/fitness.png +0 -0
- data/site/build/site/en/images/genetic_algorithms_example.png +0 -0
- data/site/build/site/en/images/instruction_arrow.png +0 -0
- data/site/build/site/en/images/jadeferret.png +0 -0
- data/site/build/site/en/images/my_email.png +0 -0
- data/site/build/site/en/images/neural_network_example.png +0 -0
- data/site/build/site/en/images/rubyforge.png +0 -0
- data/site/build/site/en/images/s.png +0 -0
- data/site/build/site/en/images/s_wbn.png +0 -0
- data/site/build/site/en/images/s_wn.png +0 -0
- data/site/build/site/en/images/sigmoid.png +0 -0
- data/site/build/site/en/images/t.png +0 -0
- data/site/build/site/en/images/t_wbn.png +0 -0
- data/site/build/site/en/images/t_wn.png +0 -0
- data/site/build/site/en/index.html +0 -390
- data/site/build/site/en/index.pdf +0 -657
- data/site/build/site/en/linkmap.html +0 -261
- data/site/build/site/en/linkmap.pdf +0 -94
- data/site/build/site/en/locationmap.xml +0 -72
- data/site/build/site/en/machineLearning.html +0 -340
- data/site/build/site/en/machineLearning.pdf +0 -337
- data/site/build/site/en/neuralNetworks.html +0 -521
- data/site/build/site/en/neuralNetworks.pdf +0 -671
- data/site/build/site/en/skin/CommonMessages_de.xml +0 -23
- data/site/build/site/en/skin/CommonMessages_en_US.xml +0 -23
- data/site/build/site/en/skin/CommonMessages_es.xml +0 -23
- data/site/build/site/en/skin/CommonMessages_fr.xml +0 -23
- data/site/build/site/en/skin/basic.css +0 -166
- data/site/build/site/en/skin/breadcrumbs-optimized.js +0 -90
- data/site/build/site/en/skin/breadcrumbs.js +0 -237
- data/site/build/site/en/skin/fontsize.js +0 -166
- data/site/build/site/en/skin/getBlank.js +0 -40
- data/site/build/site/en/skin/getMenu.js +0 -45
- data/site/build/site/en/skin/images/README.txt +0 -1
- data/site/build/site/en/skin/images/add.jpg +0 -0
- data/site/build/site/en/skin/images/built-with-forrest-button.png +0 -0
- data/site/build/site/en/skin/images/chapter.gif +0 -0
- data/site/build/site/en/skin/images/chapter_open.gif +0 -0
- data/site/build/site/en/skin/images/current.gif +0 -0
- data/site/build/site/en/skin/images/error.png +0 -0
- data/site/build/site/en/skin/images/external-link.gif +0 -0
- data/site/build/site/en/skin/images/fix.jpg +0 -0
- data/site/build/site/en/skin/images/forrest-credit-logo.png +0 -0
- data/site/build/site/en/skin/images/hack.jpg +0 -0
- data/site/build/site/en/skin/images/header_white_line.gif +0 -0
- data/site/build/site/en/skin/images/info.png +0 -0
- data/site/build/site/en/skin/images/instruction_arrow.png +0 -0
- data/site/build/site/en/skin/images/label.gif +0 -0
- data/site/build/site/en/skin/images/page.gif +0 -0
- data/site/build/site/en/skin/images/pdfdoc.gif +0 -0
- data/site/build/site/en/skin/images/poddoc.png +0 -0
- data/site/build/site/en/skin/images/printer.gif +0 -0
- data/site/build/site/en/skin/images/rc-b-l-15-1body-2menu-3menu.png +0 -0
- data/site/build/site/en/skin/images/rc-b-r-15-1body-2menu-3menu.png +0 -0
- data/site/build/site/en/skin/images/rc-b-r-5-1header-2tab-selected-3tab-selected.png +0 -0
- data/site/build/site/en/skin/images/rc-t-l-5-1header-2searchbox-3searchbox.png +0 -0
- data/site/build/site/en/skin/images/rc-t-l-5-1header-2tab-selected-3tab-selected.png +0 -0
- data/site/build/site/en/skin/images/rc-t-l-5-1header-2tab-unselected-3tab-unselected.png +0 -0
- data/site/build/site/en/skin/images/rc-t-r-15-1body-2menu-3menu.png +0 -0
- data/site/build/site/en/skin/images/rc-t-r-5-1header-2searchbox-3searchbox.png +0 -0
- data/site/build/site/en/skin/images/rc-t-r-5-1header-2tab-selected-3tab-selected.png +0 -0
- data/site/build/site/en/skin/images/rc-t-r-5-1header-2tab-unselected-3tab-unselected.png +0 -0
- data/site/build/site/en/skin/images/remove.jpg +0 -0
- data/site/build/site/en/skin/images/rss.png +0 -0
- data/site/build/site/en/skin/images/spacer.gif +0 -0
- data/site/build/site/en/skin/images/success.png +0 -0
- data/site/build/site/en/skin/images/txtdoc.png +0 -0
- data/site/build/site/en/skin/images/update.jpg +0 -0
- data/site/build/site/en/skin/images/valid-html401.png +0 -0
- data/site/build/site/en/skin/images/vcss.png +0 -0
- data/site/build/site/en/skin/images/warning.png +0 -0
- data/site/build/site/en/skin/images/xmldoc.gif +0 -0
- data/site/build/site/en/skin/menu.js +0 -48
- data/site/build/site/en/skin/note.txt +0 -50
- data/site/build/site/en/skin/print.css +0 -54
- data/site/build/site/en/skin/profile.css +0 -163
- data/site/build/site/en/skin/prototype.js +0 -1257
- data/site/build/site/en/skin/screen.css +0 -587
- data/site/build/site/en/sourceCode.html +0 -244
- data/site/build/site/en/sourceCode.pdf +0 -278
- data/site/build/site/en/svn.html +0 -244
- data/site/build/site/en/svn.pdf +0 -278
- data/site/build/tmp/brokenlinks.xml +0 -2
- data/site/build/tmp/build-info.xml +0 -5
- data/site/build/tmp/cocoon-work/cache-dir/cocoon-ehcache-1.data +0 -0
- data/site/build/tmp/cocoon-work/cache-dir/cocoon-ehcache-1.index +0 -0
- data/site/build/tmp/input.xmap +0 -32
- data/site/build/tmp/internal.xmap +0 -32
- data/site/build/tmp/locationmap.xml +0 -29
- data/site/build/tmp/output.xmap +0 -38
- data/site/build/tmp/pluginlist2fetchbuild.xml +0 -144
- data/site/build/tmp/plugins-1.xml +0 -201
- data/site/build/tmp/plugins-2.xml +0 -401
- data/site/build/tmp/projfilters.properties +0 -41
- data/site/build/tmp/resources.xmap +0 -32
- data/site/build/webapp/WEB-INF/logs/access.log +0 -0
- data/site/build/webapp/WEB-INF/logs/core.log +0 -775
- data/site/build/webapp/WEB-INF/logs/debug.log +0 -0
- data/site/build/webapp/WEB-INF/logs/error.log +0 -213
- data/site/build/webapp/WEB-INF/logs/flow.log +0 -0
- data/site/build/webapp/WEB-INF/logs/idgen.log +0 -0
- data/site/build/webapp/WEB-INF/logs/linkrewriter.log +0 -0
- data/site/build/webapp/WEB-INF/logs/locationmap.log +0 -0
- data/site/build/webapp/WEB-INF/logs/sitemap.log +0 -0
- data/site/build/webapp/WEB-INF/logs/xmlform.log +0 -0
- data/site/forrest.properties +0 -152
- data/site/forrest.properties.dispatcher.properties +0 -25
- data/site/forrest.properties.xml +0 -29
- data/site/src/documentation/README.txt +0 -7
- data/site/src/documentation/classes/CatalogManager.properties +0 -62
- data/site/src/documentation/content/locationmap.xml +0 -72
- data/site/src/documentation/content/xdocs/downloads.html +0 -9
- data/site/src/documentation/content/xdocs/geneticAlgorithms.xml +0 -294
- data/site/src/documentation/content/xdocs/index.xml +0 -129
- data/site/src/documentation/content/xdocs/machineLearning.xml +0 -131
- data/site/src/documentation/content/xdocs/neuralNetworks.xml +0 -270
- data/site/src/documentation/content/xdocs/site.xml +0 -54
- data/site/src/documentation/content/xdocs/sourceCode.xml +0 -43
- data/site/src/documentation/content/xdocs/tabs.xml +0 -35
- data/site/src/documentation/resources/images/ai4r-logo.png +0 -0
- data/site/src/documentation/resources/images/c.png +0 -0
- data/site/src/documentation/resources/images/c_wbn.png +0 -0
- data/site/src/documentation/resources/images/c_wn.png +0 -0
- data/site/src/documentation/resources/images/ellipse-2.svg +0 -30
- data/site/src/documentation/resources/images/ero.gif +0 -0
- data/site/src/documentation/resources/images/europe2.png +0 -0
- data/site/src/documentation/resources/images/europe3.png +0 -0
- data/site/src/documentation/resources/images/fitness.png +0 -0
- data/site/src/documentation/resources/images/genetic_algorithms_example.png +0 -0
- data/site/src/documentation/resources/images/icon-a.png +0 -0
- data/site/src/documentation/resources/images/icon-b.png +0 -0
- data/site/src/documentation/resources/images/icon.png +0 -0
- data/site/src/documentation/resources/images/jadeferret.png +0 -0
- data/site/src/documentation/resources/images/my_email.png +0 -0
- data/site/src/documentation/resources/images/neural_network_example.png +0 -0
- data/site/src/documentation/resources/images/project-logo.png +0 -0
- data/site/src/documentation/resources/images/rubyforge.png +0 -0
- data/site/src/documentation/resources/images/s.png +0 -0
- data/site/src/documentation/resources/images/s_wbn.png +0 -0
- data/site/src/documentation/resources/images/s_wn.png +0 -0
- data/site/src/documentation/resources/images/sigmoid.png +0 -0
- data/site/src/documentation/resources/images/sub-dir/icon-c.png +0 -0
- data/site/src/documentation/resources/images/t.png +0 -0
- data/site/src/documentation/resources/images/t_wbn.png +0 -0
- data/site/src/documentation/resources/images/t_wn.png +0 -0
- data/site/src/documentation/resources/schema/catalog.xcat +0 -29
- data/site/src/documentation/resources/schema/hello-v10.dtd +0 -51
- data/site/src/documentation/resources/schema/symbols-project-v10.ent +0 -26
- data/site/src/documentation/resources/stylesheets/hello2document.xsl +0 -33
- data/site/src/documentation/sitemap.xmap +0 -66
- data/site/src/documentation/skinconf.xml +0 -418
- data/site/src/documentation/translations/langcode.xml +0 -29
- data/site/src/documentation/translations/languages_de.xml +0 -24
- data/site/src/documentation/translations/languages_en.xml +0 -24
- data/site/src/documentation/translations/languages_es.xml +0 -22
- data/site/src/documentation/translations/languages_fr.xml +0 -24
- data/site/src/documentation/translations/languages_nl.xml +0 -24
- data/site/src/documentation/translations/menu.xml +0 -33
- data/site/src/documentation/translations/menu_af.xml +0 -33
- data/site/src/documentation/translations/menu_de.xml +0 -33
- data/site/src/documentation/translations/menu_es.xml +0 -33
- data/site/src/documentation/translations/menu_fr.xml +0 -33
- data/site/src/documentation/translations/menu_it.xml +0 -33
- data/site/src/documentation/translations/menu_nl.xml +0 -33
- data/site/src/documentation/translations/menu_no.xml +0 -33
- data/site/src/documentation/translations/menu_ru.xml +0 -33
- data/site/src/documentation/translations/menu_sk.xml +0 -33
- data/site/src/documentation/translations/tabs.xml +0 -22
- data/site/src/documentation/translations/tabs_de.xml +0 -22
- data/site/src/documentation/translations/tabs_es.xml +0 -22
- data/site/src/documentation/translations/tabs_fr.xml +0 -22
- data/site/src/documentation/translations/tabs_nl.xml +0 -22
@@ -0,0 +1,61 @@
|
|
1
|
+
# Author:: Sergio Fierens (implementation)
|
2
|
+
# License:: MPL 1.1
|
3
|
+
# Project:: ai4r
|
4
|
+
# Url:: http://ai4r.rubyforge.org/
|
5
|
+
#
|
6
|
+
# You can redistribute it and/or modify it under the terms of
|
7
|
+
# the Mozilla Public License version 1.1 as published by the
|
8
|
+
# Mozilla Foundation at http://www.mozilla.org/MPL/MPL-1.1.txt
|
9
|
+
|
10
|
+
require File.dirname(__FILE__) + '/../data/data_set'
|
11
|
+
require File.dirname(__FILE__) + '/../clusterers/single_linkage'
|
12
|
+
|
13
|
+
module Ai4r
|
14
|
+
module Clusterers
|
15
|
+
|
16
|
+
# Implementation of an Agglomerative Hierarchical clusterer with
|
17
|
+
# median linkage algorithm, aka weighted pair group method centroid
|
18
|
+
# or WPGMC (Everitt et al., 2001 ; Gower, 1967 ; Jain and Dubes, 1988 ).
|
19
|
+
# Hierarchical clusterers create one cluster per element, and then
|
20
|
+
# progressively merge clusters, until the required number of clusters
|
21
|
+
# is reached.
|
22
|
+
# Similar to centroid linkage, but using fixed weights:
|
23
|
+
#
|
24
|
+
# D(cx, (ci U cj)) = (1/2)*D(cx, ci) +
|
25
|
+
# (1/2)*D(cx, cj) -
|
26
|
+
# (1/4)*D(ci, cj)
|
27
|
+
class MedianLinkage < SingleLinkage
|
28
|
+
|
29
|
+
parameters_info :distance_function =>
|
30
|
+
"Custom implementation of distance function. " +
|
31
|
+
"It must be a closure receiving two data items and return the " +
|
32
|
+
"distance bewteen them. By default, this algorithm uses " +
|
33
|
+
"ecuclidean distance of numeric attributes to the power of 2."
|
34
|
+
|
35
|
+
# Build a new clusterer, using data examples found in data_set.
|
36
|
+
# Items will be clustered in "number_of_clusters" different
|
37
|
+
# clusters.
|
38
|
+
def build(data_set, number_of_clusters)
|
39
|
+
super
|
40
|
+
end
|
41
|
+
|
42
|
+
# This algorithm does not allow classification of new data items
|
43
|
+
# once it has been built. Rebuild the cluster including your data element.
|
44
|
+
# Always fails: this clusterer cannot classify items added after build.
#
# data_item:: ignored.
# Raises RuntimeError with an explanatory message.
def eval(data_item)
  # Kernel#raise is lower-case; the capitalised `Raise` is an undefined
  # method and surfaced as NoMethodError instead of the intended error.
  raise "Eval of new data is not supported by this algorithm."
end
|
47
|
+
|
48
|
+
protected
|
49
|
+
|
50
|
+
# return distance between cluster cx and cluster (ci U cj),
|
51
|
+
# using median linkage
|
52
|
+
def linkage_distance(cx, ci, cj)
|
53
|
+
( 0.5 * read_distance_matrix(cx, ci) +
|
54
|
+
0.5 * read_distance_matrix(cx, cj) -
|
55
|
+
0.25 * read_distance_matrix(ci, cj))
|
56
|
+
end
|
57
|
+
|
58
|
+
end
|
59
|
+
end
|
60
|
+
end
|
61
|
+
|
@@ -8,17 +8,21 @@
|
|
8
8
|
# Mozilla Foundation at http://www.mozilla.org/MPL/MPL-1.1.txt
|
9
9
|
|
10
10
|
require File.dirname(__FILE__) + '/../data/data_set'
|
11
|
+
require File.dirname(__FILE__) + '/../data/proximity'
|
11
12
|
require File.dirname(__FILE__) + '/../clusterers/clusterer'
|
12
13
|
|
13
14
|
module Ai4r
|
14
15
|
module Clusterers
|
15
16
|
|
16
|
-
# Implementation of a Hierarchical clusterer with single linkage
|
17
|
+
# Implementation of a Hierarchical clusterer with single linkage (Everitt et
|
18
|
+
# al., 2001 ; Johnson, 1967 ; Jain and Dubes, 1988 ; Sneath, 1957 )
|
17
19
|
# Hierarchical clusterers create one cluster per element, and then
|
18
20
|
# progressively merge clusters, until the required number of clusters
|
19
21
|
# is reached.
|
20
22
|
# With single linkage, the distance between two clusters is computed as the
|
21
23
|
# distance between the two closest elements in the two clusters.
|
24
|
+
#
|
25
|
+
# D(cx, (ci U cj)) = min(D(cx, ci), D(cx, cj))
|
22
26
|
class SingleLinkage < Clusterer
|
23
27
|
|
24
28
|
attr_reader :data_set, :number_of_clusters, :clusters
|
@@ -30,7 +34,11 @@ module Ai4r
|
|
30
34
|
"ecuclidean distance of numeric attributes to the power of 2."
|
31
35
|
|
32
36
|
def initialize
|
33
|
-
@distance_function =
|
37
|
+
@distance_function = lambda do |a,b|
|
38
|
+
Ai4r::Data::Proximity.squared_euclidean_distance(
|
39
|
+
a.select {|att_a| att_a.is_a? Numeric} ,
|
40
|
+
b.select {|att_b| att_b.is_a? Numeric})
|
41
|
+
end
|
34
42
|
end
|
35
43
|
|
36
44
|
# Build a new clusterer, using data examples found in data_set.
|
@@ -40,13 +48,14 @@ module Ai4r
|
|
40
48
|
@data_set = data_set
|
41
49
|
@number_of_clusters = number_of_clusters
|
42
50
|
|
43
|
-
index_clusters = create_initial_index_clusters
|
51
|
+
@index_clusters = create_initial_index_clusters
|
44
52
|
create_distance_matrix(data_set)
|
45
|
-
while index_clusters.length > @number_of_clusters
|
46
|
-
|
47
|
-
|
53
|
+
while @index_clusters.length > @number_of_clusters
|
54
|
+
ci, cj = get_closest_clusters(@index_clusters)
|
55
|
+
update_distance_matrix(ci, cj)
|
56
|
+
merge_clusters(ci, cj, @index_clusters)
|
48
57
|
end
|
49
|
-
@clusters = build_clusters_from_index_clusters index_clusters
|
58
|
+
@clusters = build_clusters_from_index_clusters @index_clusters
|
50
59
|
|
51
60
|
return self
|
52
61
|
end
|
@@ -58,19 +67,6 @@ module Ai4r
|
|
58
67
|
distance_between_item_and_cluster(data_item, cluster)})
|
59
68
|
end
|
60
69
|
|
61
|
-
# This function calculates the distance between 2 different
|
62
|
-
# instances. By default, it returns the euclidean distance to the
|
63
|
-
# power of 2.
|
64
|
-
# You can provide a more convinient distance implementation:
|
65
|
-
#
|
66
|
-
# 1- Overwriting this method
|
67
|
-
#
|
68
|
-
# 2- Providing a closure to the :distance_function parameter
|
69
|
-
def distance(a, b)
|
70
|
-
return @distance_function.call(a, b) if @distance_function
|
71
|
-
return euclidean_distance(a, b)
|
72
|
-
end
|
73
|
-
|
74
70
|
protected
|
75
71
|
|
76
72
|
# returns [ [0], [1], [2], ... , [n-1] ]
|
@@ -95,7 +91,7 @@ module Ai4r
|
|
95
91
|
data_set.data_items.each_with_index do |a, i|
|
96
92
|
i.times do |j|
|
97
93
|
b = data_set.data_items[j]
|
98
|
-
@distance_matrix[i-1][j] =
|
94
|
+
@distance_matrix[i-1][j] = @distance_function.call(a, b)
|
99
95
|
end
|
100
96
|
end
|
101
97
|
end
|
@@ -108,14 +104,46 @@ module Ai4r
|
|
108
104
|
return @distance_matrix[index_a-1][index_b]
|
109
105
|
end
|
110
106
|
|
111
|
-
#
|
107
|
+
# ci and cj are the indexes of the clusters that are going to
|
108
|
+
# be merged. We need to remove distances from/to ci and cj,
|
109
|
+
# and add distances from/to new cluster (ci U cj)
|
110
|
+
def update_distance_matrix(ci, cj)
|
111
|
+
ci, cj = cj, ci if cj > ci
|
112
|
+
distances_to_new_cluster = Array.new
|
113
|
+
(@distance_matrix.length+1).times do |cx|
|
114
|
+
if cx!= ci && cx!=cj
|
115
|
+
distances_to_new_cluster << linkage_distance(cx, ci, cj)
|
116
|
+
end
|
117
|
+
end
|
118
|
+
if cj==0 && ci==1
|
119
|
+
@distance_matrix.delete_at(1)
|
120
|
+
@distance_matrix.delete_at(0)
|
121
|
+
elsif cj==0
|
122
|
+
@distance_matrix.delete_at(ci-1)
|
123
|
+
@distance_matrix.delete_at(0)
|
124
|
+
else
|
125
|
+
@distance_matrix.delete_at(ci-1)
|
126
|
+
@distance_matrix.delete_at(cj-1)
|
127
|
+
end
|
128
|
+
@distance_matrix.each do |d|
|
129
|
+
d.delete_at(ci)
|
130
|
+
d.delete_at(cj)
|
131
|
+
end
|
132
|
+
@distance_matrix << distances_to_new_cluster
|
133
|
+
end
|
134
|
+
|
135
|
+
# return distance between cluster cx and new cluster (ci U cj),
|
136
|
+
# using single linkage
|
137
|
+
def linkage_distance(cx, ci, cj)
|
138
|
+
[read_distance_matrix(cx, ci),
|
139
|
+
read_distance_matrix(cx, cj)].min
|
140
|
+
end
|
141
|
+
|
112
142
|
# cluster_a and cluster_b are removed from index_cluster,
|
113
143
|
# and a new cluster with all members of cluster_a and cluster_b
|
114
144
|
# is added.
|
115
|
-
# It
|
116
|
-
def merge_clusters(
|
117
|
-
index_a = clusters_to_merge.first
|
118
|
-
index_b = clusters_to_merge.last
|
145
|
+
# It modifies index clusters array.
|
146
|
+
def merge_clusters(index_a, index_b, index_clusters)
|
119
147
|
index_a, index_b = index_b, index_a if index_b > index_a
|
120
148
|
new_index_cluster = index_clusters[index_a] +
|
121
149
|
index_clusters[index_b]
|
@@ -140,10 +168,9 @@ module Ai4r
|
|
140
168
|
def get_closest_clusters(index_clusters)
|
141
169
|
min_distance = 1.0/0
|
142
170
|
closest_clusters = [1, 0]
|
143
|
-
index_clusters.
|
171
|
+
index_clusters.each_index do |index_a|
|
144
172
|
index_a.times do |index_b|
|
145
|
-
|
146
|
-
cluster_distance = calc_index_clusters_distance(cluster_a, cluster_b)
|
173
|
+
cluster_distance = read_distance_matrix(index_a, index_b)
|
147
174
|
if cluster_distance < min_distance
|
148
175
|
closest_clusters = [index_a, index_b]
|
149
176
|
min_distance = cluster_distance
|
@@ -153,22 +180,10 @@ module Ai4r
|
|
153
180
|
return closest_clusters
|
154
181
|
end
|
155
182
|
|
156
|
-
# Calculate cluster distance using the single linkage method
|
157
|
-
def calc_index_clusters_distance(cluster_a, cluster_b)
|
158
|
-
min_dist = 1.0/0
|
159
|
-
cluster_a.each do |index_a|
|
160
|
-
cluster_b.each do |index_b|
|
161
|
-
dist = read_distance_matrix(index_a, index_b)
|
162
|
-
min_dist = dist if dist < min_dist
|
163
|
-
end
|
164
|
-
end
|
165
|
-
return min_dist
|
166
|
-
end
|
167
|
-
|
168
183
|
def distance_between_item_and_cluster(data_item, cluster)
|
169
184
|
min_dist = 1.0/0
|
170
185
|
cluster.data_items.each do |another_item|
|
171
|
-
dist =
|
186
|
+
dist = @distance_function.call(data_item, another_item)
|
172
187
|
min_dist = dist if dist < min_dist
|
173
188
|
end
|
174
189
|
return min_dist
|
@@ -0,0 +1,64 @@
|
|
1
|
+
# Author:: Sergio Fierens (implementation)
|
2
|
+
# License:: MPL 1.1
|
3
|
+
# Project:: ai4r
|
4
|
+
# Url:: http://ai4r.rubyforge.org/
|
5
|
+
#
|
6
|
+
# You can redistribute it and/or modify it under the terms of
|
7
|
+
# the Mozilla Public License version 1.1 as published by the
|
8
|
+
# Mozilla Foundation at http://www.mozilla.org/MPL/MPL-1.1.txt
|
9
|
+
|
10
|
+
require File.dirname(__FILE__) + '/../data/data_set'
|
11
|
+
require File.dirname(__FILE__) + '/../clusterers/single_linkage'
|
12
|
+
|
13
|
+
module Ai4r
|
14
|
+
module Clusterers
|
15
|
+
|
16
|
+
# Implementation of an Agglomerative Hierarchical clusterer with
|
17
|
+
# Ward's method linkage algorithm, aka the minimum variance method (Everitt
|
18
|
+
# et al., 2001 ; Jain and Dubes, 1988 ; Ward, 1963 ).
|
19
|
+
# Hierarchical clusterers create one cluster per element, and then
|
20
|
+
# progressively merge clusters, until the required number of clusters
|
21
|
+
# is reached.
|
22
|
+
# The objective of this method is to minimize the variance.
|
23
|
+
#
|
24
|
+
# D(cx, (ci U cj)) = ((ni+nx)/(ni+nj+nx))*D(cx, ci) +
|
25
|
+
# ((nj+nx)/(ni+nj+nx))*D(cx, cj) -
|
26
|
+
# (nx/(ni+nj+nx))*D(ci, cj)
|
27
|
+
class WardLinkage < SingleLinkage
|
28
|
+
|
29
|
+
parameters_info :distance_function =>
|
30
|
+
"Custom implementation of distance function. " +
|
31
|
+
"It must be a closure receiving two data items and return the " +
|
32
|
+
"distance bewteen them. By default, this algorithm uses " +
|
33
|
+
"ecuclidean distance of numeric attributes to the power of 2."
|
34
|
+
|
35
|
+
# Build a new clusterer, using data examples found in data_set.
|
36
|
+
# Items will be clustered in "number_of_clusters" different
|
37
|
+
# clusters.
|
38
|
+
def build(data_set, number_of_clusters)
|
39
|
+
super
|
40
|
+
end
|
41
|
+
|
42
|
+
# This algorithm does not allow classification of new data items
|
43
|
+
# once it has been built. Rebuild the cluster including your data element.
|
44
|
+
# Always fails: this clusterer cannot classify items added after build.
#
# data_item:: ignored.
# Raises RuntimeError with an explanatory message.
def eval(data_item)
  # Kernel#raise is lower-case; the capitalised `Raise` is an undefined
  # method and surfaced as NoMethodError instead of the intended error.
  raise "Eval of new data is not supported by this algorithm."
end
|
47
|
+
|
48
|
+
protected
|
49
|
+
|
50
|
+
# return distance between cluster cx and cluster (ci U cj),
|
51
|
+
# using ward's method linkage
|
52
|
+
# Distance between cluster cx and the cluster obtained by merging ci and
# cj, using the Lance-Williams recurrence for Ward's method:
#
#   D(cx, (ci U cj)) = ( (ni+nx)*D(cx, ci) + (nj+nx)*D(cx, cj)
#                        - nx*D(ci, cj) ) / (ni+nj+nx)
#
# cx, ci, cj:: cluster indexes; cluster sizes are read from
#              @index_clusters and pairwise distances from
#              read_distance_matrix.
# Returns the updated distance as a Float.
def linkage_distance(cx, ci, cj)
  ni = @index_clusters[ci].length
  nj = @index_clusters[cj].length
  nx = @index_clusters[cx].length
  # All three terms share the (ni+nj+nx) denominator. The last term used
  # to be divided by (ni+nj)**2, which does not match Ward's recurrence.
  weighted_sum = (ni + nx) * read_distance_matrix(cx, ci) +
                 (nj + nx) * read_distance_matrix(cx, cj) -
                 nx * read_distance_matrix(ci, cj)
  weighted_sum.to_f / (ni + nj + nx)
end
|
60
|
+
|
61
|
+
end
|
62
|
+
end
|
63
|
+
end
|
64
|
+
|
@@ -0,0 +1,61 @@
|
|
1
|
+
# Author:: Sergio Fierens (implementation)
|
2
|
+
# License:: MPL 1.1
|
3
|
+
# Project:: ai4r
|
4
|
+
# Url:: http://ai4r.rubyforge.org/
|
5
|
+
#
|
6
|
+
# You can redistribute it and/or modify it under the terms of
|
7
|
+
# the Mozilla Public License version 1.1 as published by the
|
8
|
+
# Mozilla Foundation at http://www.mozilla.org/MPL/MPL-1.1.txt
|
9
|
+
|
10
|
+
require File.dirname(__FILE__) + '/../data/data_set'
|
11
|
+
require File.dirname(__FILE__) + '/../clusterers/single_linkage'
|
12
|
+
|
13
|
+
module Ai4r
|
14
|
+
module Clusterers
|
15
|
+
|
16
|
+
# Implementation of an Agglomerative Hierarchical clusterer with
|
17
|
+
# weighted average linkage algorithm, aka weighted pair group method
|
18
|
+
# average or WPGMA (Jain and Dubes, 1988 ; McQuitty, 1966 )
|
19
|
+
# Hierarchical clusterers create one cluster per element, and then
|
20
|
+
# progressively merge clusters, until the required number of clusters
|
21
|
+
# is reached.
|
22
|
+
# Similar to AverageLinkage, but the distances between clusters are
|
23
|
+
# weighted based on the number of data items in each of them.
|
24
|
+
#
|
25
|
+
# D(cx, (ci U cj)) = ( ni * D(cx, ci) + nj * D(cx, cj)) / (ni + nj)
|
26
|
+
class WeightedAverageLinkage < SingleLinkage
|
27
|
+
|
28
|
+
parameters_info :distance_function =>
|
29
|
+
"Custom implementation of distance function. " +
|
30
|
+
"It must be a closure receiving two data items and return the " +
|
31
|
+
"distance bewteen them. By default, this algorithm uses " +
|
32
|
+
"ecuclidean distance of numeric attributes to the power of 2."
|
33
|
+
|
34
|
+
# Build a new clusterer, using data examples found in data_set.
|
35
|
+
# Items will be clustered in "number_of_clusters" different
|
36
|
+
# clusters.
|
37
|
+
def build(data_set, number_of_clusters)
|
38
|
+
super
|
39
|
+
end
|
40
|
+
|
41
|
+
# This algorithm does not allow classification of new data items
|
42
|
+
# once it has been built. Rebuild the cluster including your data element.
|
43
|
+
# Always fails: this clusterer cannot classify items added after build.
#
# data_item:: ignored.
# Raises RuntimeError with an explanatory message.
def eval(data_item)
  # Kernel#raise is lower-case; the capitalised `Raise` is an undefined
  # method and surfaced as NoMethodError instead of the intended error.
  raise "Eval of new data is not supported by this algorithm."
end
|
46
|
+
|
47
|
+
protected
|
48
|
+
|
49
|
+
# return distance between cluster cx and cluster (ci U cj),
|
50
|
+
# using weighted average linkage
|
51
|
+
def linkage_distance(cx, ci, cj)
|
52
|
+
ni = @index_clusters[ci].length
|
53
|
+
nj = @index_clusters[cj].length
|
54
|
+
(1.0 * ni * read_distance_matrix(cx, ci)+
|
55
|
+
nj * read_distance_matrix(cx, cj))/(ni+nj)
|
56
|
+
end
|
57
|
+
|
58
|
+
end
|
59
|
+
end
|
60
|
+
end
|
61
|
+
|
@@ -0,0 +1,18 @@
|
|
1
|
+
# Author:: Sergio Fierens
|
2
|
+
# License:: MPL 1.1
|
3
|
+
# Project:: ai4r
|
4
|
+
# Url:: http://ai4r.rubyforge.org/
|
5
|
+
#
|
6
|
+
# You can redistribute it and/or modify it under the terms of
|
7
|
+
# the Mozilla Public License version 1.1 as published by the
|
8
|
+
# Mozilla Foundation at http://www.mozilla.org/MPL/MPL-1.1.txt
|
9
|
+
|
10
|
+
module Ai4r
  module Data

    # Floating point positive infinity, obtained as 1.0/0.
    # NOTE(review): the name is misspelled; it is kept unchanged here on
    # the assumption that other code may already reference it.
    POSITIVE_INFINTY = 1.0/0

    # Floating point negative infinity, obtained as -1.0/0.
    # The misspelled name is kept for the same reason as above.
    NEGATIVE_INFINTY = -1.0/0

    # Correctly spelled aliases for the two constants above.
    POSITIVE_INFINITY = POSITIVE_INFINTY
    NEGATIVE_INFINITY = NEGATIVE_INFINTY

  end
end
|
data/lib/ai4r/data/data_set.rb
CHANGED
@@ -41,7 +41,9 @@ module Ai4r
|
|
41
41
|
# Retrieve a new DataSet, with the item(s) selected by the provided
|
42
42
|
# index. You can specify an index range, too.
|
43
43
|
# Build a new DataSet from the item(s) selected by index.
#
# index:: an Integer position, whose single item is wrapped in a list,
#         or any other value passed straight to @data_items[] (for
#         example a Range).
# Returns a new DataSet built with the selected items and this set's
# @data_labels.
def [](index)
  # Integer rather than Fixnum: Fixnum was deprecated in Ruby 2.4 and
  # removed in 3.2, where referencing it raises NameError.
  selected_items = index.is_a?(Integer) ? [@data_items[index]] : @data_items[index]
  DataSet.new(:data_items => selected_items,
              :data_labels => @data_labels)
end
|
47
49
|
|
@@ -173,7 +175,7 @@ module Ai4r
|
|
173
175
|
|
174
176
|
# Add a data item to the data set
|
175
177
|
def << data_item
|
176
|
-
if data_item.nil? || !data_item.is_a?(
|
178
|
+
if data_item.nil? || !data_item.is_a?(Enumerable) || data_item.empty?
|
177
179
|
raise ArgumentError,"Data must not be an non empty array."
|
178
180
|
elsif @data_items.empty?
|
179
181
|
set_data_items([data_item])
|
@@ -205,7 +207,7 @@ module Ai4r
|
|
205
207
|
def check_data_items(data_items)
|
206
208
|
if !data_items || data_items.empty?
|
207
209
|
raise ArgumentError,"Examples data set must not be empty."
|
208
|
-
elsif !data_items.first.is_a?(
|
210
|
+
elsif !data_items.first.is_a?(Enumerable)
|
209
211
|
raise ArgumentError,"Unkown format for example data."
|
210
212
|
end
|
211
213
|
attributes_num = data_items.first.length
|
data/lib/ai4r/data/proximity.rb
CHANGED
@@ -74,6 +74,24 @@ module Ai4r
|
|
74
74
|
return count
|
75
75
|
end
|
76
76
|
|
77
|
+
# The "Simple matching" distance between two attribute sets is given
|
78
|
+
# by the number of values present on both vectors.
|
79
|
+
# If sets a and b have lengths da and db then:
|
80
|
+
#
|
81
|
+
# S = 2/(da + db) * Number of values present on both sets
|
82
|
+
# D = 1.0/S - 1
|
83
|
+
#
|
84
|
+
# Some considerations:
|
85
|
+
# * a and b must not include repeated items
|
86
|
+
# * all attributes are treated equally
|
87
|
+
# * disjoint, non-empty sets are infinitely distant (D = Infinity)
|
88
|
+
# Simple matching distance between two attribute sets.
#
#   S = 2 * (number of items of a also found in b) / (a.length + b.length)
#   D = 1.0/S - 1
#
# a, b:: collections; a must respond to each and length, b to include?
#        and length. Repeated items are assumed absent, since each
#        match in a counts separately.
# Returns a Float: 0.0 when every item matches and the lengths are
# equal, Infinity when nothing matches. Two empty collections are
# treated as identical and return 0.0 (the 0.0/0 division used to
# produce NaN).
def self.simple_matching_distance(a,b)
  total_length = a.length + b.length
  return 0.0 if total_length == 0

  shared = 0
  a.each { |item| shared += 1 if b.include?(item) }
  similarity = 2.0 * shared / total_length
  1.0 / similarity - 1
end
|
94
|
+
|
77
95
|
end
|
78
96
|
|
79
97
|
end
|
@@ -1,9 +1,18 @@
|
|
1
|
+
# Author:: Sergio Fierens (implementation)
|
2
|
+
# License:: MPL 1.1
|
3
|
+
# Project:: ai4r
|
4
|
+
# Url:: http://ai4r.rubyforge.org/
|
5
|
+
#
|
6
|
+
# You can redistribute it and/or modify it under the terms of
|
7
|
+
# the Mozilla Public License version 1.1 as published by the
|
8
|
+
# Mozilla Foundation at http://www.mozilla.org/MPL/MPL-1.1.txt
|
9
|
+
|
1
10
|
require 'test/unit'
|
2
11
|
require File.dirname(__FILE__) + '/../../lib/ai4r/clusterers/average_linkage'
|
3
12
|
|
4
13
|
class Ai4r::Clusterers::AverageLinkage < Ai4r::Clusterers::SingleLinkage
|
5
14
|
attr_accessor :data_set, :number_of_clusters, :clusters, :distance_matrix
|
6
|
-
public :
|
15
|
+
public :linkage_distance
|
7
16
|
public :distance_between_item_and_cluster
|
8
17
|
end
|
9
18
|
|
@@ -28,18 +37,12 @@ class AverageLinkageTest < Test::Unit::TestCase
|
|
28
37
|
[49.0, 49.0, 26.0, 5.0, 25.0, 49.0, 4.0, 29.0, 37.0, 5.0],
|
29
38
|
[2.0, 72.0, 65.0, 50.0, 52.0, 2.0, 65.0, 10.0, 74.0, 50.0, 37.0]]
|
30
39
|
|
31
|
-
|
32
|
-
clusterer = AverageLinkage.new
|
40
|
+
def test_linkage_distance
|
41
|
+
clusterer = Ai4r::Clusterers::AverageLinkage.new
|
33
42
|
clusterer.distance_matrix = @@expected_distance_matrix
|
34
|
-
assert_equal
|
35
|
-
assert_equal
|
43
|
+
assert_equal 93.5, clusterer.linkage_distance(0,1,2)
|
44
|
+
assert_equal 37.5, clusterer.linkage_distance(4,2,5)
|
36
45
|
end
|
37
46
|
|
38
|
-
def test_distance_between_item_and_cluster
|
39
|
-
clusterer = AverageLinkage.new
|
40
|
-
assert_equal 20.0, clusterer.distance_between_item_and_cluster([1,2],
|
41
|
-
DataSet.new(:data_items => [[3,4],[5,6]]))
|
42
|
-
end
|
43
|
-
|
44
47
|
end
|
45
48
|
|