ai4r 1.5 → 1.6
Sign up to get free protection for your applications and to get access to all the features.
- data/examples/clusterers/simple_website_clustering.rb +47 -0
- data/lib/ai4r.rb +7 -0
- data/lib/ai4r/classifiers/hyperpipes.rb +118 -0
- data/lib/ai4r/clusterers/average_linkage.rb +22 -23
- data/lib/ai4r/clusterers/centroid_linkage.rb +66 -0
- data/lib/ai4r/clusterers/complete_linkage.rb +17 -12
- data/lib/ai4r/clusterers/diana.rb +139 -0
- data/lib/ai4r/clusterers/median_linkage.rb +61 -0
- data/lib/ai4r/clusterers/single_linkage.rb +57 -42
- data/lib/ai4r/clusterers/ward_linkage.rb +64 -0
- data/lib/ai4r/clusterers/weighted_average_linkage.rb +61 -0
- data/lib/ai4r/data/constants.rb +18 -0
- data/lib/ai4r/data/data_set.rb +5 -3
- data/lib/ai4r/data/proximity.rb +18 -0
- data/test/clusterers/average_linkage_test.rb +14 -11
- data/test/clusterers/bisecting_k_means_test.rb +9 -0
- data/test/clusterers/centroid_linkage_test.rb +50 -0
- data/test/clusterers/complete_linkage_test.rb +14 -5
- data/test/clusterers/diana_test.rb +69 -0
- data/test/clusterers/k_means_test.rb +9 -0
- data/test/clusterers/median_linkage_test.rb +50 -0
- data/test/clusterers/single_linkage_test.rb +15 -6
- data/test/clusterers/ward_linkage_test.rb +50 -0
- data/test/clusterers/weighted_average_linkage_test.rb +50 -0
- data/test/data/data_set_test.rb +14 -0
- data/test/data/proximity_test.rb +10 -0
- metadata +87 -298
- data/site/build/site/en/broken-links.xml +0 -2
- data/site/build/site/en/build/tmp/build-info.xml +0 -5
- data/site/build/site/en/build/tmp/plugins-1.xml +0 -212
- data/site/build/site/en/build/tmp/plugins-2.xml +0 -252
- data/site/build/site/en/build/tmp/projfilters.properties +0 -41
- data/site/build/site/en/downloads.html +0 -200
- data/site/build/site/en/downloads.pdf +0 -151
- data/site/build/site/en/geneticAlgorithms.html +0 -591
- data/site/build/site/en/geneticAlgorithms.pdf +0 -934
- data/site/build/site/en/images/ai4r-logo.png +0 -0
- data/site/build/site/en/images/built-with-forrest-button.png +0 -0
- data/site/build/site/en/images/c.png +0 -0
- data/site/build/site/en/images/c_wbn.png +0 -0
- data/site/build/site/en/images/c_wn.png +0 -0
- data/site/build/site/en/images/ero.gif +0 -0
- data/site/build/site/en/images/europe2.png +0 -0
- data/site/build/site/en/images/europe3.png +0 -0
- data/site/build/site/en/images/fitness.png +0 -0
- data/site/build/site/en/images/genetic_algorithms_example.png +0 -0
- data/site/build/site/en/images/instruction_arrow.png +0 -0
- data/site/build/site/en/images/jadeferret.png +0 -0
- data/site/build/site/en/images/my_email.png +0 -0
- data/site/build/site/en/images/neural_network_example.png +0 -0
- data/site/build/site/en/images/rubyforge.png +0 -0
- data/site/build/site/en/images/s.png +0 -0
- data/site/build/site/en/images/s_wbn.png +0 -0
- data/site/build/site/en/images/s_wn.png +0 -0
- data/site/build/site/en/images/sigmoid.png +0 -0
- data/site/build/site/en/images/t.png +0 -0
- data/site/build/site/en/images/t_wbn.png +0 -0
- data/site/build/site/en/images/t_wn.png +0 -0
- data/site/build/site/en/index.html +0 -390
- data/site/build/site/en/index.pdf +0 -657
- data/site/build/site/en/linkmap.html +0 -261
- data/site/build/site/en/linkmap.pdf +0 -94
- data/site/build/site/en/locationmap.xml +0 -72
- data/site/build/site/en/machineLearning.html +0 -340
- data/site/build/site/en/machineLearning.pdf +0 -337
- data/site/build/site/en/neuralNetworks.html +0 -521
- data/site/build/site/en/neuralNetworks.pdf +0 -671
- data/site/build/site/en/skin/CommonMessages_de.xml +0 -23
- data/site/build/site/en/skin/CommonMessages_en_US.xml +0 -23
- data/site/build/site/en/skin/CommonMessages_es.xml +0 -23
- data/site/build/site/en/skin/CommonMessages_fr.xml +0 -23
- data/site/build/site/en/skin/basic.css +0 -166
- data/site/build/site/en/skin/breadcrumbs-optimized.js +0 -90
- data/site/build/site/en/skin/breadcrumbs.js +0 -237
- data/site/build/site/en/skin/fontsize.js +0 -166
- data/site/build/site/en/skin/getBlank.js +0 -40
- data/site/build/site/en/skin/getMenu.js +0 -45
- data/site/build/site/en/skin/images/README.txt +0 -1
- data/site/build/site/en/skin/images/add.jpg +0 -0
- data/site/build/site/en/skin/images/built-with-forrest-button.png +0 -0
- data/site/build/site/en/skin/images/chapter.gif +0 -0
- data/site/build/site/en/skin/images/chapter_open.gif +0 -0
- data/site/build/site/en/skin/images/current.gif +0 -0
- data/site/build/site/en/skin/images/error.png +0 -0
- data/site/build/site/en/skin/images/external-link.gif +0 -0
- data/site/build/site/en/skin/images/fix.jpg +0 -0
- data/site/build/site/en/skin/images/forrest-credit-logo.png +0 -0
- data/site/build/site/en/skin/images/hack.jpg +0 -0
- data/site/build/site/en/skin/images/header_white_line.gif +0 -0
- data/site/build/site/en/skin/images/info.png +0 -0
- data/site/build/site/en/skin/images/instruction_arrow.png +0 -0
- data/site/build/site/en/skin/images/label.gif +0 -0
- data/site/build/site/en/skin/images/page.gif +0 -0
- data/site/build/site/en/skin/images/pdfdoc.gif +0 -0
- data/site/build/site/en/skin/images/poddoc.png +0 -0
- data/site/build/site/en/skin/images/printer.gif +0 -0
- data/site/build/site/en/skin/images/rc-b-l-15-1body-2menu-3menu.png +0 -0
- data/site/build/site/en/skin/images/rc-b-r-15-1body-2menu-3menu.png +0 -0
- data/site/build/site/en/skin/images/rc-b-r-5-1header-2tab-selected-3tab-selected.png +0 -0
- data/site/build/site/en/skin/images/rc-t-l-5-1header-2searchbox-3searchbox.png +0 -0
- data/site/build/site/en/skin/images/rc-t-l-5-1header-2tab-selected-3tab-selected.png +0 -0
- data/site/build/site/en/skin/images/rc-t-l-5-1header-2tab-unselected-3tab-unselected.png +0 -0
- data/site/build/site/en/skin/images/rc-t-r-15-1body-2menu-3menu.png +0 -0
- data/site/build/site/en/skin/images/rc-t-r-5-1header-2searchbox-3searchbox.png +0 -0
- data/site/build/site/en/skin/images/rc-t-r-5-1header-2tab-selected-3tab-selected.png +0 -0
- data/site/build/site/en/skin/images/rc-t-r-5-1header-2tab-unselected-3tab-unselected.png +0 -0
- data/site/build/site/en/skin/images/remove.jpg +0 -0
- data/site/build/site/en/skin/images/rss.png +0 -0
- data/site/build/site/en/skin/images/spacer.gif +0 -0
- data/site/build/site/en/skin/images/success.png +0 -0
- data/site/build/site/en/skin/images/txtdoc.png +0 -0
- data/site/build/site/en/skin/images/update.jpg +0 -0
- data/site/build/site/en/skin/images/valid-html401.png +0 -0
- data/site/build/site/en/skin/images/vcss.png +0 -0
- data/site/build/site/en/skin/images/warning.png +0 -0
- data/site/build/site/en/skin/images/xmldoc.gif +0 -0
- data/site/build/site/en/skin/menu.js +0 -48
- data/site/build/site/en/skin/note.txt +0 -50
- data/site/build/site/en/skin/print.css +0 -54
- data/site/build/site/en/skin/profile.css +0 -163
- data/site/build/site/en/skin/prototype.js +0 -1257
- data/site/build/site/en/skin/screen.css +0 -587
- data/site/build/site/en/sourceCode.html +0 -244
- data/site/build/site/en/sourceCode.pdf +0 -278
- data/site/build/site/en/svn.html +0 -244
- data/site/build/site/en/svn.pdf +0 -278
- data/site/build/tmp/brokenlinks.xml +0 -2
- data/site/build/tmp/build-info.xml +0 -5
- data/site/build/tmp/cocoon-work/cache-dir/cocoon-ehcache-1.data +0 -0
- data/site/build/tmp/cocoon-work/cache-dir/cocoon-ehcache-1.index +0 -0
- data/site/build/tmp/input.xmap +0 -32
- data/site/build/tmp/internal.xmap +0 -32
- data/site/build/tmp/locationmap.xml +0 -29
- data/site/build/tmp/output.xmap +0 -38
- data/site/build/tmp/pluginlist2fetchbuild.xml +0 -144
- data/site/build/tmp/plugins-1.xml +0 -201
- data/site/build/tmp/plugins-2.xml +0 -401
- data/site/build/tmp/projfilters.properties +0 -41
- data/site/build/tmp/resources.xmap +0 -32
- data/site/build/webapp/WEB-INF/logs/access.log +0 -0
- data/site/build/webapp/WEB-INF/logs/core.log +0 -775
- data/site/build/webapp/WEB-INF/logs/debug.log +0 -0
- data/site/build/webapp/WEB-INF/logs/error.log +0 -213
- data/site/build/webapp/WEB-INF/logs/flow.log +0 -0
- data/site/build/webapp/WEB-INF/logs/idgen.log +0 -0
- data/site/build/webapp/WEB-INF/logs/linkrewriter.log +0 -0
- data/site/build/webapp/WEB-INF/logs/locationmap.log +0 -0
- data/site/build/webapp/WEB-INF/logs/sitemap.log +0 -0
- data/site/build/webapp/WEB-INF/logs/xmlform.log +0 -0
- data/site/forrest.properties +0 -152
- data/site/forrest.properties.dispatcher.properties +0 -25
- data/site/forrest.properties.xml +0 -29
- data/site/src/documentation/README.txt +0 -7
- data/site/src/documentation/classes/CatalogManager.properties +0 -62
- data/site/src/documentation/content/locationmap.xml +0 -72
- data/site/src/documentation/content/xdocs/downloads.html +0 -9
- data/site/src/documentation/content/xdocs/geneticAlgorithms.xml +0 -294
- data/site/src/documentation/content/xdocs/index.xml +0 -129
- data/site/src/documentation/content/xdocs/machineLearning.xml +0 -131
- data/site/src/documentation/content/xdocs/neuralNetworks.xml +0 -270
- data/site/src/documentation/content/xdocs/site.xml +0 -54
- data/site/src/documentation/content/xdocs/sourceCode.xml +0 -43
- data/site/src/documentation/content/xdocs/tabs.xml +0 -35
- data/site/src/documentation/resources/images/ai4r-logo.png +0 -0
- data/site/src/documentation/resources/images/c.png +0 -0
- data/site/src/documentation/resources/images/c_wbn.png +0 -0
- data/site/src/documentation/resources/images/c_wn.png +0 -0
- data/site/src/documentation/resources/images/ellipse-2.svg +0 -30
- data/site/src/documentation/resources/images/ero.gif +0 -0
- data/site/src/documentation/resources/images/europe2.png +0 -0
- data/site/src/documentation/resources/images/europe3.png +0 -0
- data/site/src/documentation/resources/images/fitness.png +0 -0
- data/site/src/documentation/resources/images/genetic_algorithms_example.png +0 -0
- data/site/src/documentation/resources/images/icon-a.png +0 -0
- data/site/src/documentation/resources/images/icon-b.png +0 -0
- data/site/src/documentation/resources/images/icon.png +0 -0
- data/site/src/documentation/resources/images/jadeferret.png +0 -0
- data/site/src/documentation/resources/images/my_email.png +0 -0
- data/site/src/documentation/resources/images/neural_network_example.png +0 -0
- data/site/src/documentation/resources/images/project-logo.png +0 -0
- data/site/src/documentation/resources/images/rubyforge.png +0 -0
- data/site/src/documentation/resources/images/s.png +0 -0
- data/site/src/documentation/resources/images/s_wbn.png +0 -0
- data/site/src/documentation/resources/images/s_wn.png +0 -0
- data/site/src/documentation/resources/images/sigmoid.png +0 -0
- data/site/src/documentation/resources/images/sub-dir/icon-c.png +0 -0
- data/site/src/documentation/resources/images/t.png +0 -0
- data/site/src/documentation/resources/images/t_wbn.png +0 -0
- data/site/src/documentation/resources/images/t_wn.png +0 -0
- data/site/src/documentation/resources/schema/catalog.xcat +0 -29
- data/site/src/documentation/resources/schema/hello-v10.dtd +0 -51
- data/site/src/documentation/resources/schema/symbols-project-v10.ent +0 -26
- data/site/src/documentation/resources/stylesheets/hello2document.xsl +0 -33
- data/site/src/documentation/sitemap.xmap +0 -66
- data/site/src/documentation/skinconf.xml +0 -418
- data/site/src/documentation/translations/langcode.xml +0 -29
- data/site/src/documentation/translations/languages_de.xml +0 -24
- data/site/src/documentation/translations/languages_en.xml +0 -24
- data/site/src/documentation/translations/languages_es.xml +0 -22
- data/site/src/documentation/translations/languages_fr.xml +0 -24
- data/site/src/documentation/translations/languages_nl.xml +0 -24
- data/site/src/documentation/translations/menu.xml +0 -33
- data/site/src/documentation/translations/menu_af.xml +0 -33
- data/site/src/documentation/translations/menu_de.xml +0 -33
- data/site/src/documentation/translations/menu_es.xml +0 -33
- data/site/src/documentation/translations/menu_fr.xml +0 -33
- data/site/src/documentation/translations/menu_it.xml +0 -33
- data/site/src/documentation/translations/menu_nl.xml +0 -33
- data/site/src/documentation/translations/menu_no.xml +0 -33
- data/site/src/documentation/translations/menu_ru.xml +0 -33
- data/site/src/documentation/translations/menu_sk.xml +0 -33
- data/site/src/documentation/translations/tabs.xml +0 -22
- data/site/src/documentation/translations/tabs_de.xml +0 -22
- data/site/src/documentation/translations/tabs_es.xml +0 -22
- data/site/src/documentation/translations/tabs_fr.xml +0 -22
- data/site/src/documentation/translations/tabs_nl.xml +0 -22
@@ -0,0 +1,47 @@
|
|
1
|
+
require File.dirname(__FILE__) + '/google_search'
|
2
|
+
require File.dirname(__FILE__) + '/build_keywords'
|
3
|
+
require File.dirname(__FILE__) + '/../../lib/ai4r/clusterers/average_linkage'
|
4
|
+
require 'rubygems'
|
5
|
+
require 'hpricot'
|
6
|
+
require 'net/http'
|
7
|
+
require 'benchmark'
|
8
|
+
|
9
|
+
# Host names of the web sites whose home pages are fetched and clustered.
SITES_TO_CLASSIFY = %w[
  www.foxnews.com
  www.usatoday.com
  scm.jadeferret.com
  www.accurev.com
  www.lastminute.com
  subversion.tigris.org
  news.yahoo.com
  news.bbc.co.uk
  www.orbitz.com
]
|
14
|
+
|
15
|
+
# Return array of keywords for the site
|
16
|
+
# Fetch the "/" page of +site+ over HTTP and return its meta keywords.
#
# site:: host name of the web site (e.g. "www.example.com").
#
# Returns an array with the comma-separated entries of the
# <meta name="keywords"> element, stripped and downcased. Returns an
# empty array when the page has no such element or the element has no
# "content" attribute, instead of failing with NoMethodError on nil.
def get_keywords(site)
  response = Net::HTTP.get_response(site, "/")
  # Select the first meta keywords element (nil when the page has none)
  meta = Hpricot(response.body).search("meta[@name='keywords']")[0]
  content = meta && meta.attributes["content"]
  return [] unless content
  # Keywords are comma separated; remove surrounding white space
  content.split(",").collect { |k| k.strip.downcase }
end
|
24
|
+
|
25
|
+
# Get keywords data for each website
|
26
|
+
# A site is described by its host name and the keywords found on it.
Site = Struct.new("Site", :name, :keywords)

# One Site per configured host; the keywords are downloaded here.
sites = SITES_TO_CLASSIFY.collect { |site_name| Site.new(site_name, get_keywords(site_name)) }

# The Site structs are the data items; their member names are the labels.
data_set = Ai4r::Data::DataSet.new(:data_items => sites, :data_labels => Site.members)
|
32
|
+
|
33
|
+
# The distance between sites depends on the keywords collected from internet
|
34
|
+
# Closure used as distance function by the clusterer. Both arguments are
# Site structs, so their keyword lists are read through #keywords
# (the struct has no #keyword member).
keywords_distance_function = lambda do |x, y|
  Ai4r::Data::Proximity.simple_matching(x.keywords, y.keywords)
end
|
37
|
+
|
38
|
+
# Create the clusters
|
39
|
+
# Average linkage clusterer using the keyword based distance, asked for 3 clusters
clusterer = Ai4r::Clusterers::AverageLinkage.new
clusterer.distance_function = keywords_distance_function
clusterer.build(data_set, 3)

# Print results: one line with the site names of each cluster,
# followed by a separator line
clusterer.clusters.each do |cluster|
  site_names = cluster.data_items.collect { |item| item.name }
  puts site_names.join(", ")
  puts "============"
end
|
data/lib/ai4r.rb
CHANGED
@@ -1,6 +1,7 @@
|
|
1
1
|
# Data
|
2
2
|
require "ai4r/data/data_set"
|
3
3
|
require "ai4r/data/statistics"
|
4
|
+
require "ai4r/data/proximity"
|
4
5
|
require "ai4r/data/parameterizable"
|
5
6
|
# Clusterers
|
6
7
|
require "ai4r/clusterers/clusterer"
|
@@ -9,12 +10,18 @@ require "ai4r/clusterers/bisecting_k_means"
|
|
9
10
|
require "ai4r/clusterers/single_linkage"
|
10
11
|
require "ai4r/clusterers/complete_linkage"
|
11
12
|
require "ai4r/clusterers/average_linkage"
|
13
|
+
require "ai4r/clusterers/weighted_average_linkage"
|
14
|
+
require "ai4r/clusterers/centroid_linkage"
|
15
|
+
require "ai4r/clusterers/median_linkage"
|
16
|
+
require "ai4r/clusterers/ward_linkage"
|
17
|
+
require "ai4r/clusterers/diana"
|
12
18
|
# Classifiers
|
13
19
|
require "ai4r/classifiers/classifier"
|
14
20
|
require "ai4r/classifiers/id3"
|
15
21
|
require "ai4r/classifiers/prism"
|
16
22
|
require "ai4r/classifiers/one_r"
|
17
23
|
require "ai4r/classifiers/zero_r"
|
24
|
+
require "ai4r/classifiers/hyperpipes"
|
18
25
|
# Neural networks
|
19
26
|
require "ai4r/neural_network/backpropagation"
|
20
27
|
# Genetic Algorithms
|
@@ -0,0 +1,118 @@
|
|
1
|
+
# Author:: Sergio Fierens (Implementation only)
|
2
|
+
# License:: MPL 1.1
|
3
|
+
# Project:: ai4r
|
4
|
+
# Url:: http://ai4r.rubyforge.org/
|
5
|
+
#
|
6
|
+
# You can redistribute it and/or modify it under the terms of
|
7
|
+
# the Mozilla Public License version 1.1 as published by the
|
8
|
+
# Mozilla Foundation at http://www.mozilla.org/MPL/MPL-1.1.txt
|
9
|
+
|
10
|
+
require 'set'
|
11
|
+
require File.dirname(__FILE__) + '/../data/constants'
|
12
|
+
require File.dirname(__FILE__) + '/../data/data_set'
|
13
|
+
require File.dirname(__FILE__) + '/../classifiers/classifier'
|
14
|
+
|
15
|
+
module Ai4r
|
16
|
+
module Classifiers
|
17
|
+
|
18
|
+
include Ai4r::Data
|
19
|
+
|
20
|
+
# = Introduction
|
21
|
+
#
|
22
|
+
# A fast classifier algorithm, created by Lucio de Souza Coelho
|
23
|
+
# and Len Trigg.
|
24
|
+
class Hyperpipes < Classifier
|
25
|
+
|
26
|
+
attr_reader :data_set, :pipes
|
27
|
+
|
28
|
+
# Build a new Hyperpipes classifier. You must provide a DataSet instance
|
29
|
+
# as parameter. The last attribute of each item is considered as
|
30
|
+
# the item class.
|
31
|
+
# Build a new Hyperpipes classifier from +data_set+.
# The last attribute of each data item is considered the item class.
#
# One pipe is created per class value found in the last domain, and
# every data item is then used to update the pipe of its own class.
#
# Returns self.
def build(data_set)
  @data_set = data_set
  @domains = data_set.build_domains

  @pipes = {}
  # build_pipe reads data_items, so it must receive the data set itself
  # (not the domains array).
  @domains.last.each { |cat| @pipes[cat] = build_pipe(@data_set) }
  @data_set.data_items.each { |item| update_pipe(@pipes[item.last], item) }

  return self
end
|
41
|
+
|
42
|
+
# You can evaluate new data, predicting its class.
|
43
|
+
# e.g.
|
44
|
+
# classifier.eval(['New York', '<30', 'F']) # => 'Y'
|
45
|
+
# Predict the class of +data+ (an array of attribute values, in the
# same order as the attributes of the training data items).
#
# Each category gets one vote per attribute that falls inside its pipe:
# numeric values must lie within the inclusive [min, max] bounds
# observed for that category, other values must have been seen for it.
# The category with most votes is returned. Every category starts with
# zero votes, so a category is still returned when nothing matched.
# Returns nil only when the classifier has no pipes.
def eval(data)
  votes = {}
  @pipes.each do |category, pipe|
    votes[category] = 0
    pipe.each_with_index do |bounds, i|
      if data[i].is_a? Numeric
        # Inclusive comparison: the observed min and max belong to the pipe
        votes[category] += 1 if data[i] >= bounds[:min] && data[i] <= bounds[:max]
      else
        votes[category] += 1 if bounds[data[i]]
      end
    end
  end
  winner = votes.to_a.max {|x, y| x.last <=> y.last}
  return winner && winner.first
end

# This method returns the generated rules as ruby code, one statement
# per line. The code expects one local variable per attribute, named
# after the data set labels, and evaluates to the winning category.
# e.g.
#
#   classifier.get_rules
#     # =>  votes = Hash.new {0}
#           votes['Y'] = 0
#           votes['Y'] += 1 if age >= 18 && age <= 30
#           votes['Y'] += 1 if {"New York"=>true}[city]
#           ...
#           votes.to_a.max {|x, y| x.last <=> y.last}.first
#
# The numeric comparisons are inclusive and every category is
# initialized to zero votes, matching the behaviour of eval.
def get_rules
  rules = []
  rules << "votes = Hash.new {0}"
  # The first data item tells which attributes are numeric
  data = @data_set.data_items.first
  labels = @data_set.data_labels.collect {|l| l.to_s}
  @pipes.each do |category, pipe|
    rules << "votes['#{category}'] = 0"
    pipe.each_with_index do |bounds, i|
      rule = "votes['#{category}'] += 1 "
      if data[i].is_a? Numeric
        rule += "if #{labels[i]} >= #{bounds[:min]} && #{labels[i]} <= #{bounds[:max]}"
      else
        # Look up the value of the attribute variable, not its name
        rule += "if #{bounds.inspect}[#{labels[i]}]"
      end
      rules << rule
    end
  end
  rules << "votes.to_a.max {|x, y| x.last <=> y.last}.first"
  # Double quotes: a real line break, not a literal backslash-n
  return rules.join("\n")
end
|
92
|
+
|
93
|
+
protected
|
94
|
+
|
95
|
+
# Create an empty pipe: one entry per attribute (the last attribute,
# the class, is left out). The first data item decides the kind of
# each entry: a :min/:max hash with inverted infinite bounds for
# numeric attributes, and a hash defaulting to false for the others.
def build_pipe(data_set)
  sample_attributes = data_set.data_items.first[0...-1]
  sample_attributes.collect do |attribute|
    next Hash.new(false) unless attribute.is_a? Numeric
    {:min=>POSITIVE_INFINITY, :max=>NEGATIVE_INFINITY}
  end
end
|
104
|
+
|
105
|
+
# Widen +pipe+ so that it contains +data_item+. The last attribute
# (the class) is ignored.
#
# For numeric attributes the :min / :max bounds of the matching pipe
# entry are extended; for any other attribute the value is flagged as
# seen in the matching pipe entry.
def update_pipe(pipe, data_item)
  data_item[0...-1].each_with_index do |att, i|
    # att is the attribute value itself, not a collection
    if att.is_a? Numeric
      pipe[i][:min] = att if att < pipe[i][:min]
      pipe[i][:max] = att if att > pipe[i][:max]
    else
      pipe[i][att] = true
    end
  end
end
|
115
|
+
|
116
|
+
end
|
117
|
+
end
|
118
|
+
end
|
@@ -13,14 +13,25 @@ require File.dirname(__FILE__) + '/../clusterers/single_linkage'
|
|
13
13
|
module Ai4r
|
14
14
|
module Clusterers
|
15
15
|
|
16
|
-
# Implementation of a Hierarchical clusterer with
|
16
|
+
# Implementation of a Hierarchical clusterer with group average
|
17
|
+
# linkage, AKA unweighted pair group method average or UPGMA (Everitt
|
18
|
+
# et al., 2001 ; Jain and Dubes, 1988 ; Sokal and Michener, 1958).
|
17
19
|
# Hierarchical clusterers create one cluster per element, and then
|
18
20
|
# progressively merge clusters, until the required number of clusters
|
19
21
|
# is reached.
|
20
|
-
# With average linkage, the distance between
|
21
|
-
# the average distance between
|
22
|
+
# With average linkage, the distance between a cluster cx and
|
23
|
+
# cluster (ci U cj) is the average distance between cx and ci, and
|
24
|
+
# cx and cj.
|
25
|
+
#
|
26
|
+
# D(cx, (ci U cj)) = (D(cx, ci) + D(cx, cj)) / 2
|
22
27
|
class AverageLinkage < SingleLinkage
|
23
28
|
|
29
|
+
parameters_info :distance_function =>
|
30
|
+
"Custom implementation of distance function. " +
|
31
|
+
"It must be a closure receiving two data items and return the " +
|
32
|
+
"distance bewteen them. By default, this algorithm uses " +
|
33
|
+
"ecuclidean distance of numeric attributes to the power of 2."
|
34
|
+
|
24
35
|
# Build a new clusterer, using data examples found in data_set.
|
25
36
|
# Items will be clustered in "number_of_clusters" different
|
26
37
|
# clusters.
|
@@ -28,31 +39,19 @@ module Ai4r
|
|
28
39
|
super
|
29
40
|
end
|
30
41
|
|
31
|
-
#
|
32
|
-
#
|
42
|
+
# This algorithm does not allow classification of new data items
|
43
|
+
# once it has been built. Rebuild the cluster including your data element.
|
33
44
|
def eval(data_item)
  # Always raises RuntimeError. Kernel#raise is lowercase; "Raise" is
  # an undefined method and would fail with NoMethodError instead.
  raise "Eval of new data is not supported by this algorithm."
end
|
36
47
|
|
37
48
|
protected
|
38
49
|
|
39
|
-
#
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
dist_sum += read_distance_matrix(index_a, index_b)
|
45
|
-
end
|
46
|
-
end
|
47
|
-
return dist_sum/(cluster_a.length*cluster_b.length)
|
48
|
-
end
|
49
|
-
|
50
|
-
def distance_between_item_and_cluster(data_item, cluster)
|
51
|
-
dist_sum = 0.0
|
52
|
-
cluster.data_items.each do |another_item|
|
53
|
-
dist_sum += distance(data_item, another_item)
|
54
|
-
end
|
55
|
-
return dist_sum/cluster.data_items.length
|
50
|
+
# return distance between cluster cx and cluster (ci U cj),
|
51
|
+
# using average linkage
|
52
|
+
# Return the distance between cluster cx and the merged cluster
# (ci U cj): the mean of the stored distances cx-ci and cx-cj.
#
# The division is done in floating point, so the mean is not truncated
# when the distance function returns integers.
#
# NOTE(review): UPGMA is usually defined with the two distances
# weighted by the sizes of ci and cj; this unweighted mean matches the
# formula documented on the class. Confirm which one is intended.
def linkage_distance(cx, ci, cj)
  (read_distance_matrix(cx, ci) +
    read_distance_matrix(cx, cj)) / 2.0
end
|
57
56
|
|
58
57
|
end
|
@@ -0,0 +1,66 @@
|
|
1
|
+
# Author:: Sergio Fierens (implementation)
|
2
|
+
# License:: MPL 1.1
|
3
|
+
# Project:: ai4r
|
4
|
+
# Url:: http://ai4r.rubyforge.org/
|
5
|
+
#
|
6
|
+
# You can redistribute it and/or modify it under the terms of
|
7
|
+
# the Mozilla Public License version 1.1 as published by the
|
8
|
+
# Mozilla Foundation at http://www.mozilla.org/MPL/MPL-1.1.txt
|
9
|
+
|
10
|
+
require File.dirname(__FILE__) + '/../data/data_set'
|
11
|
+
require File.dirname(__FILE__) + '/../clusterers/single_linkage'
|
12
|
+
|
13
|
+
module Ai4r
|
14
|
+
module Clusterers
|
15
|
+
|
16
|
+
# Implementation of an Agglomerative Hierarchical clusterer with
|
17
|
+
# centroid linkage algorithm, aka unweighted pair group method
|
18
|
+
# centroid (UPGMC) (Everitt et al., 2001 ; Jain and Dubes, 1988 ;
|
19
|
+
# Sokal and Michener, 1958 )
|
20
|
+
# Hierarchical clusterers create one cluster per element, and then
|
21
|
+
# progressively merge clusters, until the required number of clusters
|
22
|
+
# is reached.
|
23
|
+
# The distance between clusters is the squared euclidean distance
|
24
|
+
# between their centroids.
|
25
|
+
#
|
26
|
+
# D(cx, (ci U cj)) = | mx - mij |^2
|
27
|
+
# D(cx, (ci U cj)) = (ni/(ni+nj))*D(cx, ci) +
|
28
|
+
# (nj/(ni+nj))*D(cx, cj) -
|
29
|
+
# (ni*nj/(ni+nj)^2)*D(ci, cj)
|
30
|
+
class CentroidLinkage < SingleLinkage
|
31
|
+
|
32
|
+
parameters_info :distance_function =>
|
33
|
+
"Custom implementation of distance function. " +
|
34
|
+
"It must be a closure receiving two data items and return the " +
|
35
|
+
"distance bewteen them. By default, this algorithm uses " +
|
36
|
+
"ecuclidean distance of numeric attributes to the power of 2."
|
37
|
+
|
38
|
+
# Build a new clusterer, using data examples found in data_set.
|
39
|
+
# Items will be clustered in "number_of_clusters" different
|
40
|
+
# clusters.
|
41
|
+
def build(data_set, number_of_clusters)
  # Nothing specific to do here: hand both arguments to the parent class
  super(data_set, number_of_clusters)
end
|
44
|
+
|
45
|
+
# This algorithm does not allow classification of new data items
|
46
|
+
# once it has been built. Rebuild the cluster including your data element.
|
47
|
+
def eval(data_item)
  # Always raises RuntimeError. Kernel#raise is lowercase; "Raise" is
  # an undefined method and would fail with NoMethodError instead.
  raise "Eval of new data is not supported by this algorithm."
end
|
50
|
+
|
51
|
+
protected
|
52
|
+
|
53
|
+
# return distance between cluster cx and cluster (ci U cj),
|
54
|
+
# using centroid linkage
|
55
|
+
# Distance between cluster cx and the merged cluster (ci U cj), using
# centroid linkage: the stored distances cx-ci and cx-cj weighted by
# the sizes of ci and cj, minus a correction based on the distance
# ci-cj, all divided by the size of the merged cluster.
def linkage_distance(cx, ci, cj)
  size_i = @index_clusters[ci].length
  size_j = @index_clusters[cj].length
  merged_size = size_i + size_j
  weighted_to_ci = size_i * read_distance_matrix(cx, ci)
  weighted_to_cj = size_j * read_distance_matrix(cx, cj)
  # The 1.0 factor forces floating point arithmetic
  correction = 1.0 * size_i * size_j * read_distance_matrix(ci, cj) / merged_size
  (weighted_to_ci + weighted_to_cj - correction) / merged_size
end
|
62
|
+
|
63
|
+
end
|
64
|
+
end
|
65
|
+
end
|
66
|
+
|
@@ -13,14 +13,24 @@ require File.dirname(__FILE__) + '/../clusterers/single_linkage'
|
|
13
13
|
module Ai4r
|
14
14
|
module Clusterers
|
15
15
|
|
16
|
-
# Implementation of a Hierarchical clusterer with complete linkage
|
16
|
+
# Implementation of a Hierarchical clusterer with complete linkage (Everitt
|
17
|
+
# et al., 2001 ; Jain and Dubes, 1988 ; Sorensen, 1948 ).
|
17
18
|
# Hierarchical clusterers create one cluster per element, and then
|
18
19
|
# progressively merge clusters, until the required number of clusters
|
19
20
|
# is reached.
|
20
21
|
# With complete linkage, the distance between two clusters is computed as
|
21
22
|
# the maximum distance between elements of each cluster.
|
23
|
+
#
|
24
|
+
# D(cx, (ci U cj)) = max(D(cx, ci), D(cx, cj))
|
22
25
|
class CompleteLinkage < SingleLinkage
|
23
26
|
|
27
|
+
parameters_info :distance_function =>
|
28
|
+
"Custom implementation of distance function. " +
|
29
|
+
"It must be a closure receiving two data items and return the " +
|
30
|
+
"distance bewteen them. By default, this algorithm uses " +
|
31
|
+
"ecuclidean distance of numeric attributes to the power of 2."
|
32
|
+
|
33
|
+
|
24
34
|
# Build a new clusterer, using data examples found in data_set.
|
25
35
|
# Items will be clustered in "number_of_clusters" different
|
26
36
|
# clusters.
|
@@ -36,22 +46,17 @@ module Ai4r
|
|
36
46
|
|
37
47
|
protected
|
38
48
|
|
39
|
-
#
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
dist = read_distance_matrix(index_a, index_b)
|
45
|
-
max_dist = dist if dist > max_dist
|
46
|
-
end
|
47
|
-
end
|
48
|
-
return max_dist
|
49
|
+
# return distance between cluster cx and new cluster (ci U cj),
|
50
|
+
# using complete linkage
|
51
|
+
# Distance between cluster cx and the merged cluster (ci U cj), using
# complete linkage: the larger of the stored distances cx-ci and cx-cj.
def linkage_distance(cx, ci, cj)
  distance_to_ci = read_distance_matrix(cx, ci)
  distance_to_cj = read_distance_matrix(cx, cj)
  [distance_to_ci, distance_to_cj].max
end
|
50
55
|
|
51
56
|
def distance_between_item_and_cluster(data_item, cluster)
|
52
57
|
max_dist = 0
|
53
58
|
cluster.data_items.each do |another_item|
|
54
|
-
dist =
|
59
|
+
dist = @distance_function.call(data_item, another_item)
|
55
60
|
max_dist = dist if dist > max_dist
|
56
61
|
end
|
57
62
|
return max_dist
|
@@ -0,0 +1,139 @@
|
|
1
|
+
# Author:: Sergio Fierens (implementation)
|
2
|
+
# License:: MPL 1.1
|
3
|
+
# Project:: ai4r
|
4
|
+
# Url:: http://ai4r.rubyforge.org/
|
5
|
+
#
|
6
|
+
# You can redistribute it and/or modify it under the terms of
|
7
|
+
# the Mozilla Public License version 1.1 as published by the
|
8
|
+
# Mozilla Foundation at http://www.mozilla.org/MPL/MPL-1.1.txt
|
9
|
+
|
10
|
+
require File.dirname(__FILE__) + '/../data/data_set'
|
11
|
+
require File.dirname(__FILE__) + '/../data/proximity'
|
12
|
+
require File.dirname(__FILE__) + '/../clusterers/clusterer'
|
13
|
+
|
14
|
+
module Ai4r
|
15
|
+
module Clusterers
|
16
|
+
|
17
|
+
# DIANA (Divisive ANAlysis) (Kaufman and Rousseeuw, 1990;
|
18
|
+
# Macnaughton - Smith et al. 1964) is a Divisive Hierarchical
|
19
|
+
# Clusterer. It begins with only one cluster with all data items,
|
20
|
+
# and divides the clusters until the desired clusters number is reached.
|
21
|
+
class Diana < Clusterer
|
22
|
+
|
23
|
+
attr_reader :data_set, :number_of_clusters, :clusters
|
24
|
+
|
25
|
+
parameters_info :distance_function =>
|
26
|
+
"Custom implementation of distance function. " +
|
27
|
+
"It must be a closure receiving two data items and return the " +
|
28
|
+
"distance bewteen them. By default, this algorithm uses " +
|
29
|
+
"ecuclidean distance of numeric attributes to the power of 2."
|
30
|
+
|
31
|
+
# Set up the default distance function: the squared euclidean distance
# computed over the numeric attributes of both items only.
def initialize
  numeric_attributes = lambda do |item|
    item.select {|attribute| attribute.is_a? Numeric}
  end
  @distance_function = lambda do |a,b|
    Ai4r::Data::Proximity.squared_euclidean_distance(
      numeric_attributes.call(a), numeric_attributes.call(b))
  end
end
|
38
|
+
|
39
|
+
# Build a new clusterer, using divisive analysis (DIANA algorithm)
|
40
|
+
# Build the clusters by divisive analysis (DIANA algorithm).
#
# Starts with a single cluster holding a copy of the whole data set.
# While there are fewer clusters than requested, the cluster with the
# largest diameter is split: a splinter cluster is started from it, and
# items keep moving to the splinter while the best distance difference
# is not negative. Returns self.
def build(data_set, number_of_clusters)
  @data_set = data_set
  @number_of_clusters = number_of_clusters
  @clusters = [@data_set[0..-1]]

  until @clusters.length >= @number_of_clusters
    target = @clusters[max_diameter_cluster(@clusters)]
    splinter = init_splinter_cluster(target)
    gain, position = max_distance_difference(target, splinter)
    until gain < 0
      splinter << target.data_items[position]
      target.data_items.delete_at(position)
      gain, position = max_distance_difference(target, splinter)
    end
    @clusters << splinter
  end

  self
end
|
60
|
+
|
61
|
+
# Classifies the given data item, returning the cluster index it belongs
|
62
|
+
# to (0-based).
|
63
|
+
# Return the index (0-based) of the cluster whose items have the
# smallest average distance to data_item.
def eval(data_item)
  average_distances = @clusters.collect do |cluster|
    distance_sum(data_item, cluster) / cluster.data_items.length
  end
  get_min_index(average_distances)
end
|
68
|
+
|
69
|
+
protected
|
70
|
+
|
71
|
+
# return the cluster with max diameter
|
72
|
+
# Return the index of the cluster with the largest diameter.
# The first cluster reaching the largest value wins; index 0 is
# returned when no cluster has a diameter above zero.
def max_diameter_cluster(clusters)
  widest_index = 0
  widest = 0
  clusters.each_with_index do |cluster, index|
    candidate = cluster_diameter(cluster)
    next unless candidate > widest
    widest_index = index
    widest = candidate
  end
  widest_index
end
|
84
|
+
|
85
|
+
# Max distance between 2 items in a cluster
|
86
|
+
# Diameter of a cluster: the largest distance between any two of its
# items (0 when the cluster has fewer than two items).
def cluster_diameter(cluster)
  widest = 0
  cluster.data_items.each_with_index do |item_a, position_a|
    # Only pair item_a with the items that come before it
    (0...position_a).each do |position_b|
      gap = @distance_function.call(item_a, cluster.data_items[position_b])
      widest = gap if gap > widest
    end
  end
  widest
end
|
96
|
+
|
97
|
+
# Create a cluster with the item with max distance
|
98
|
+
# to the rest of the cluster's items.
|
99
|
+
# That item is removed from the initial cluster.
|
100
|
+
# Start a splinter cluster with the item of cluster_to_split whose
# summed distance to the items of that cluster is the largest
# (the first item when no sum is above zero).
# The chosen item is removed from cluster_to_split.
def init_splinter_cluster(cluster_to_split)
  farthest_index = 0
  largest_sum = 0.0
  cluster_to_split.data_items.each_with_index do |item, index|
    total = distance_sum(item, cluster_to_split)
    if total > largest_sum
      largest_sum = total
      farthest_index = index
    end
  end
  # Take the new cluster before deleting the item from the old one
  seed_cluster = cluster_to_split[farthest_index]
  cluster_to_split.data_items.delete_at(farthest_index)
  seed_cluster
end
|
111
|
+
|
112
|
+
# Return the max average distance between any item of
|
113
|
+
# cluster_to_split and the rest of items in that cluster,
|
114
|
+
# minus the average distance with the items of splinter_cluster,
|
115
|
+
# and the index of the item.
|
116
|
+
# A positive value means that the item is closer to the
|
117
|
+
# splinter group than to its current cluster.
|
118
|
+
# For every item of cluster_to_split, compute its average distance to
# the other items of that cluster minus its average distance to the
# items of splinter_cluster. Returns the largest difference found and
# the index of its item (negative infinity and 0 when no item
# qualifies).
def max_distance_difference(cluster_to_split, splinter_cluster)
  best_diff = -1.0/0
  best_index = 0
  cluster_to_split.data_items.each_with_index do |item, index|
    # The item itself contributes 0 to the sum, hence length-1
    own_average = distance_sum(item, cluster_to_split) /
      (cluster_to_split.data_items.length-1)
    splinter_average = distance_sum(item, splinter_cluster) /
      (splinter_cluster.data_items.length)
    difference = own_average - splinter_average
    if difference > best_diff
      best_diff = difference
      best_index = index
    end
  end
  [best_diff, best_index]
end
|
129
|
+
|
130
|
+
# Sum up the distance between an item and all the items in a cluster
|
131
|
+
# Add up the distances between item_a and every item of the cluster.
# Returns 0.0 for a cluster without items.
def distance_sum(item_a, cluster)
  total = 0.0
  cluster.data_items.each do |item_b|
    total = total + @distance_function.call(item_a, item_b)
  end
  total
end
|
136
|
+
|
137
|
+
end
|
138
|
+
end
|
139
|
+
end
|