ai4r 1.5 → 1.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/examples/clusterers/simple_website_clustering.rb +47 -0
- data/lib/ai4r.rb +7 -0
- data/lib/ai4r/classifiers/hyperpipes.rb +118 -0
- data/lib/ai4r/clusterers/average_linkage.rb +22 -23
- data/lib/ai4r/clusterers/centroid_linkage.rb +66 -0
- data/lib/ai4r/clusterers/complete_linkage.rb +17 -12
- data/lib/ai4r/clusterers/diana.rb +139 -0
- data/lib/ai4r/clusterers/median_linkage.rb +61 -0
- data/lib/ai4r/clusterers/single_linkage.rb +57 -42
- data/lib/ai4r/clusterers/ward_linkage.rb +64 -0
- data/lib/ai4r/clusterers/weighted_average_linkage.rb +61 -0
- data/lib/ai4r/data/constants.rb +18 -0
- data/lib/ai4r/data/data_set.rb +5 -3
- data/lib/ai4r/data/proximity.rb +18 -0
- data/test/clusterers/average_linkage_test.rb +14 -11
- data/test/clusterers/bisecting_k_means_test.rb +9 -0
- data/test/clusterers/centroid_linkage_test.rb +50 -0
- data/test/clusterers/complete_linkage_test.rb +14 -5
- data/test/clusterers/diana_test.rb +69 -0
- data/test/clusterers/k_means_test.rb +9 -0
- data/test/clusterers/median_linkage_test.rb +50 -0
- data/test/clusterers/single_linkage_test.rb +15 -6
- data/test/clusterers/ward_linkage_test.rb +50 -0
- data/test/clusterers/weighted_average_linkage_test.rb +50 -0
- data/test/data/data_set_test.rb +14 -0
- data/test/data/proximity_test.rb +10 -0
- metadata +87 -298
- data/site/build/site/en/broken-links.xml +0 -2
- data/site/build/site/en/build/tmp/build-info.xml +0 -5
- data/site/build/site/en/build/tmp/plugins-1.xml +0 -212
- data/site/build/site/en/build/tmp/plugins-2.xml +0 -252
- data/site/build/site/en/build/tmp/projfilters.properties +0 -41
- data/site/build/site/en/downloads.html +0 -200
- data/site/build/site/en/downloads.pdf +0 -151
- data/site/build/site/en/geneticAlgorithms.html +0 -591
- data/site/build/site/en/geneticAlgorithms.pdf +0 -934
- data/site/build/site/en/images/ai4r-logo.png +0 -0
- data/site/build/site/en/images/built-with-forrest-button.png +0 -0
- data/site/build/site/en/images/c.png +0 -0
- data/site/build/site/en/images/c_wbn.png +0 -0
- data/site/build/site/en/images/c_wn.png +0 -0
- data/site/build/site/en/images/ero.gif +0 -0
- data/site/build/site/en/images/europe2.png +0 -0
- data/site/build/site/en/images/europe3.png +0 -0
- data/site/build/site/en/images/fitness.png +0 -0
- data/site/build/site/en/images/genetic_algorithms_example.png +0 -0
- data/site/build/site/en/images/instruction_arrow.png +0 -0
- data/site/build/site/en/images/jadeferret.png +0 -0
- data/site/build/site/en/images/my_email.png +0 -0
- data/site/build/site/en/images/neural_network_example.png +0 -0
- data/site/build/site/en/images/rubyforge.png +0 -0
- data/site/build/site/en/images/s.png +0 -0
- data/site/build/site/en/images/s_wbn.png +0 -0
- data/site/build/site/en/images/s_wn.png +0 -0
- data/site/build/site/en/images/sigmoid.png +0 -0
- data/site/build/site/en/images/t.png +0 -0
- data/site/build/site/en/images/t_wbn.png +0 -0
- data/site/build/site/en/images/t_wn.png +0 -0
- data/site/build/site/en/index.html +0 -390
- data/site/build/site/en/index.pdf +0 -657
- data/site/build/site/en/linkmap.html +0 -261
- data/site/build/site/en/linkmap.pdf +0 -94
- data/site/build/site/en/locationmap.xml +0 -72
- data/site/build/site/en/machineLearning.html +0 -340
- data/site/build/site/en/machineLearning.pdf +0 -337
- data/site/build/site/en/neuralNetworks.html +0 -521
- data/site/build/site/en/neuralNetworks.pdf +0 -671
- data/site/build/site/en/skin/CommonMessages_de.xml +0 -23
- data/site/build/site/en/skin/CommonMessages_en_US.xml +0 -23
- data/site/build/site/en/skin/CommonMessages_es.xml +0 -23
- data/site/build/site/en/skin/CommonMessages_fr.xml +0 -23
- data/site/build/site/en/skin/basic.css +0 -166
- data/site/build/site/en/skin/breadcrumbs-optimized.js +0 -90
- data/site/build/site/en/skin/breadcrumbs.js +0 -237
- data/site/build/site/en/skin/fontsize.js +0 -166
- data/site/build/site/en/skin/getBlank.js +0 -40
- data/site/build/site/en/skin/getMenu.js +0 -45
- data/site/build/site/en/skin/images/README.txt +0 -1
- data/site/build/site/en/skin/images/add.jpg +0 -0
- data/site/build/site/en/skin/images/built-with-forrest-button.png +0 -0
- data/site/build/site/en/skin/images/chapter.gif +0 -0
- data/site/build/site/en/skin/images/chapter_open.gif +0 -0
- data/site/build/site/en/skin/images/current.gif +0 -0
- data/site/build/site/en/skin/images/error.png +0 -0
- data/site/build/site/en/skin/images/external-link.gif +0 -0
- data/site/build/site/en/skin/images/fix.jpg +0 -0
- data/site/build/site/en/skin/images/forrest-credit-logo.png +0 -0
- data/site/build/site/en/skin/images/hack.jpg +0 -0
- data/site/build/site/en/skin/images/header_white_line.gif +0 -0
- data/site/build/site/en/skin/images/info.png +0 -0
- data/site/build/site/en/skin/images/instruction_arrow.png +0 -0
- data/site/build/site/en/skin/images/label.gif +0 -0
- data/site/build/site/en/skin/images/page.gif +0 -0
- data/site/build/site/en/skin/images/pdfdoc.gif +0 -0
- data/site/build/site/en/skin/images/poddoc.png +0 -0
- data/site/build/site/en/skin/images/printer.gif +0 -0
- data/site/build/site/en/skin/images/rc-b-l-15-1body-2menu-3menu.png +0 -0
- data/site/build/site/en/skin/images/rc-b-r-15-1body-2menu-3menu.png +0 -0
- data/site/build/site/en/skin/images/rc-b-r-5-1header-2tab-selected-3tab-selected.png +0 -0
- data/site/build/site/en/skin/images/rc-t-l-5-1header-2searchbox-3searchbox.png +0 -0
- data/site/build/site/en/skin/images/rc-t-l-5-1header-2tab-selected-3tab-selected.png +0 -0
- data/site/build/site/en/skin/images/rc-t-l-5-1header-2tab-unselected-3tab-unselected.png +0 -0
- data/site/build/site/en/skin/images/rc-t-r-15-1body-2menu-3menu.png +0 -0
- data/site/build/site/en/skin/images/rc-t-r-5-1header-2searchbox-3searchbox.png +0 -0
- data/site/build/site/en/skin/images/rc-t-r-5-1header-2tab-selected-3tab-selected.png +0 -0
- data/site/build/site/en/skin/images/rc-t-r-5-1header-2tab-unselected-3tab-unselected.png +0 -0
- data/site/build/site/en/skin/images/remove.jpg +0 -0
- data/site/build/site/en/skin/images/rss.png +0 -0
- data/site/build/site/en/skin/images/spacer.gif +0 -0
- data/site/build/site/en/skin/images/success.png +0 -0
- data/site/build/site/en/skin/images/txtdoc.png +0 -0
- data/site/build/site/en/skin/images/update.jpg +0 -0
- data/site/build/site/en/skin/images/valid-html401.png +0 -0
- data/site/build/site/en/skin/images/vcss.png +0 -0
- data/site/build/site/en/skin/images/warning.png +0 -0
- data/site/build/site/en/skin/images/xmldoc.gif +0 -0
- data/site/build/site/en/skin/menu.js +0 -48
- data/site/build/site/en/skin/note.txt +0 -50
- data/site/build/site/en/skin/print.css +0 -54
- data/site/build/site/en/skin/profile.css +0 -163
- data/site/build/site/en/skin/prototype.js +0 -1257
- data/site/build/site/en/skin/screen.css +0 -587
- data/site/build/site/en/sourceCode.html +0 -244
- data/site/build/site/en/sourceCode.pdf +0 -278
- data/site/build/site/en/svn.html +0 -244
- data/site/build/site/en/svn.pdf +0 -278
- data/site/build/tmp/brokenlinks.xml +0 -2
- data/site/build/tmp/build-info.xml +0 -5
- data/site/build/tmp/cocoon-work/cache-dir/cocoon-ehcache-1.data +0 -0
- data/site/build/tmp/cocoon-work/cache-dir/cocoon-ehcache-1.index +0 -0
- data/site/build/tmp/input.xmap +0 -32
- data/site/build/tmp/internal.xmap +0 -32
- data/site/build/tmp/locationmap.xml +0 -29
- data/site/build/tmp/output.xmap +0 -38
- data/site/build/tmp/pluginlist2fetchbuild.xml +0 -144
- data/site/build/tmp/plugins-1.xml +0 -201
- data/site/build/tmp/plugins-2.xml +0 -401
- data/site/build/tmp/projfilters.properties +0 -41
- data/site/build/tmp/resources.xmap +0 -32
- data/site/build/webapp/WEB-INF/logs/access.log +0 -0
- data/site/build/webapp/WEB-INF/logs/core.log +0 -775
- data/site/build/webapp/WEB-INF/logs/debug.log +0 -0
- data/site/build/webapp/WEB-INF/logs/error.log +0 -213
- data/site/build/webapp/WEB-INF/logs/flow.log +0 -0
- data/site/build/webapp/WEB-INF/logs/idgen.log +0 -0
- data/site/build/webapp/WEB-INF/logs/linkrewriter.log +0 -0
- data/site/build/webapp/WEB-INF/logs/locationmap.log +0 -0
- data/site/build/webapp/WEB-INF/logs/sitemap.log +0 -0
- data/site/build/webapp/WEB-INF/logs/xmlform.log +0 -0
- data/site/forrest.properties +0 -152
- data/site/forrest.properties.dispatcher.properties +0 -25
- data/site/forrest.properties.xml +0 -29
- data/site/src/documentation/README.txt +0 -7
- data/site/src/documentation/classes/CatalogManager.properties +0 -62
- data/site/src/documentation/content/locationmap.xml +0 -72
- data/site/src/documentation/content/xdocs/downloads.html +0 -9
- data/site/src/documentation/content/xdocs/geneticAlgorithms.xml +0 -294
- data/site/src/documentation/content/xdocs/index.xml +0 -129
- data/site/src/documentation/content/xdocs/machineLearning.xml +0 -131
- data/site/src/documentation/content/xdocs/neuralNetworks.xml +0 -270
- data/site/src/documentation/content/xdocs/site.xml +0 -54
- data/site/src/documentation/content/xdocs/sourceCode.xml +0 -43
- data/site/src/documentation/content/xdocs/tabs.xml +0 -35
- data/site/src/documentation/resources/images/ai4r-logo.png +0 -0
- data/site/src/documentation/resources/images/c.png +0 -0
- data/site/src/documentation/resources/images/c_wbn.png +0 -0
- data/site/src/documentation/resources/images/c_wn.png +0 -0
- data/site/src/documentation/resources/images/ellipse-2.svg +0 -30
- data/site/src/documentation/resources/images/ero.gif +0 -0
- data/site/src/documentation/resources/images/europe2.png +0 -0
- data/site/src/documentation/resources/images/europe3.png +0 -0
- data/site/src/documentation/resources/images/fitness.png +0 -0
- data/site/src/documentation/resources/images/genetic_algorithms_example.png +0 -0
- data/site/src/documentation/resources/images/icon-a.png +0 -0
- data/site/src/documentation/resources/images/icon-b.png +0 -0
- data/site/src/documentation/resources/images/icon.png +0 -0
- data/site/src/documentation/resources/images/jadeferret.png +0 -0
- data/site/src/documentation/resources/images/my_email.png +0 -0
- data/site/src/documentation/resources/images/neural_network_example.png +0 -0
- data/site/src/documentation/resources/images/project-logo.png +0 -0
- data/site/src/documentation/resources/images/rubyforge.png +0 -0
- data/site/src/documentation/resources/images/s.png +0 -0
- data/site/src/documentation/resources/images/s_wbn.png +0 -0
- data/site/src/documentation/resources/images/s_wn.png +0 -0
- data/site/src/documentation/resources/images/sigmoid.png +0 -0
- data/site/src/documentation/resources/images/sub-dir/icon-c.png +0 -0
- data/site/src/documentation/resources/images/t.png +0 -0
- data/site/src/documentation/resources/images/t_wbn.png +0 -0
- data/site/src/documentation/resources/images/t_wn.png +0 -0
- data/site/src/documentation/resources/schema/catalog.xcat +0 -29
- data/site/src/documentation/resources/schema/hello-v10.dtd +0 -51
- data/site/src/documentation/resources/schema/symbols-project-v10.ent +0 -26
- data/site/src/documentation/resources/stylesheets/hello2document.xsl +0 -33
- data/site/src/documentation/sitemap.xmap +0 -66
- data/site/src/documentation/skinconf.xml +0 -418
- data/site/src/documentation/translations/langcode.xml +0 -29
- data/site/src/documentation/translations/languages_de.xml +0 -24
- data/site/src/documentation/translations/languages_en.xml +0 -24
- data/site/src/documentation/translations/languages_es.xml +0 -22
- data/site/src/documentation/translations/languages_fr.xml +0 -24
- data/site/src/documentation/translations/languages_nl.xml +0 -24
- data/site/src/documentation/translations/menu.xml +0 -33
- data/site/src/documentation/translations/menu_af.xml +0 -33
- data/site/src/documentation/translations/menu_de.xml +0 -33
- data/site/src/documentation/translations/menu_es.xml +0 -33
- data/site/src/documentation/translations/menu_fr.xml +0 -33
- data/site/src/documentation/translations/menu_it.xml +0 -33
- data/site/src/documentation/translations/menu_nl.xml +0 -33
- data/site/src/documentation/translations/menu_no.xml +0 -33
- data/site/src/documentation/translations/menu_ru.xml +0 -33
- data/site/src/documentation/translations/menu_sk.xml +0 -33
- data/site/src/documentation/translations/tabs.xml +0 -22
- data/site/src/documentation/translations/tabs_de.xml +0 -22
- data/site/src/documentation/translations/tabs_es.xml +0 -22
- data/site/src/documentation/translations/tabs_fr.xml +0 -22
- data/site/src/documentation/translations/tabs_nl.xml +0 -22
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
require File.dirname(__FILE__) + '/google_search'
|
|
2
|
+
require File.dirname(__FILE__) + '/build_keywords'
|
|
3
|
+
require File.dirname(__FILE__) + '/../../lib/ai4r/clusterers/average_linkage'
|
|
4
|
+
require 'rubygems'
|
|
5
|
+
require 'hpricot'
|
|
6
|
+
require 'net/http'
|
|
7
|
+
require 'benchmark'
|
|
8
|
+
|
|
9
|
+
SITES_TO_CLASSIFY = [
|
|
10
|
+
"www.foxnews.com", "www.usatoday.com", "scm.jadeferret.com",
|
|
11
|
+
"www.accurev.com", "www.lastminute.com", "subversion.tigris.org",
|
|
12
|
+
"news.yahoo.com", "news.bbc.co.uk", "www.orbitz.com"
|
|
13
|
+
]
|
|
14
|
+
|
|
15
|
+
# Fetch the root page ("/") of +site+ over HTTP and return the entries of
# its <meta name="keywords"> tag as an array of stripped, lower-cased strings.
# The content attribute is split on commas.
def get_keywords(site)
  page = Net::HTTP.get_response(site, "/")
  # Only the first keywords meta element is used.
  keywords_tag = Hpricot(page.body).search("meta[@name='keywords']")[0]
  raw_keywords = keywords_tag.attributes["content"]
  raw_keywords.split(",").collect { |keyword| keyword.strip.downcase }
end
|
|
24
|
+
|
|
25
|
+
# Get keywords data for each website
|
|
26
|
+
Site = Struct.new("Site", :name, :keywords)
|
|
27
|
+
sites = SITES_TO_CLASSIFY.collect do |site_name|
|
|
28
|
+
Site.new(site_name, get_keywords(site_name))
|
|
29
|
+
end
|
|
30
|
+
data_set = Ai4r::Data::DataSet.new(:data_items => sites,
|
|
31
|
+
:data_labels => Site.members)
|
|
32
|
+
|
|
33
|
+
# The distance between two sites is computed from the keyword lists
# collected for each of them. Both arguments are Site structs, so the
# keyword list is read through the `keywords` member on each side.
# NOTE(review): confirm that Ai4r::Data::Proximity exposes the measure under
# the name `simple_matching`; its definition is not visible from this script.
keywords_distance_function = lambda do |x,y|
  Ai4r::Data::Proximity.simple_matching(x.keywords, y.keywords)
end
|
|
37
|
+
|
|
38
|
+
# Create the clusters
|
|
39
|
+
clusterer = Ai4r::Clusterers::AverageLinkage.new
|
|
40
|
+
clusterer.distance_function = keywords_distance_function
|
|
41
|
+
clusterer.build(data_set, 3)
|
|
42
|
+
|
|
43
|
+
# Print results
|
|
44
|
+
clusterer.clusters.each do |cluster|
|
|
45
|
+
puts cluster.data_items.collect {|item| item.name}.join(", ")
|
|
46
|
+
puts "============"
|
|
47
|
+
end
|
data/lib/ai4r.rb
CHANGED
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
# Data
|
|
2
2
|
require "ai4r/data/data_set"
|
|
3
3
|
require "ai4r/data/statistics"
|
|
4
|
+
require "ai4r/data/proximity"
|
|
4
5
|
require "ai4r/data/parameterizable"
|
|
5
6
|
# Clusterers
|
|
6
7
|
require "ai4r/clusterers/clusterer"
|
|
@@ -9,12 +10,18 @@ require "ai4r/clusterers/bisecting_k_means"
|
|
|
9
10
|
require "ai4r/clusterers/single_linkage"
|
|
10
11
|
require "ai4r/clusterers/complete_linkage"
|
|
11
12
|
require "ai4r/clusterers/average_linkage"
|
|
13
|
+
require "ai4r/clusterers/weighted_average_linkage"
|
|
14
|
+
require "ai4r/clusterers/centroid_linkage"
|
|
15
|
+
require "ai4r/clusterers/median_linkage"
|
|
16
|
+
require "ai4r/clusterers/ward_linkage"
|
|
17
|
+
require "ai4r/clusterers/diana"
|
|
12
18
|
# Classifiers
|
|
13
19
|
require "ai4r/classifiers/classifier"
|
|
14
20
|
require "ai4r/classifiers/id3"
|
|
15
21
|
require "ai4r/classifiers/prism"
|
|
16
22
|
require "ai4r/classifiers/one_r"
|
|
17
23
|
require "ai4r/classifiers/zero_r"
|
|
24
|
+
require "ai4r/classifiers/hyperpipes"
|
|
18
25
|
# Neural networks
|
|
19
26
|
require "ai4r/neural_network/backpropagation"
|
|
20
27
|
# Genetic Algorithms
|
|
@@ -0,0 +1,118 @@
|
|
|
1
|
+
# Author:: Sergio Fierens (Implementation only)
|
|
2
|
+
# License:: MPL 1.1
|
|
3
|
+
# Project:: ai4r
|
|
4
|
+
# Url:: http://ai4r.rubyforge.org/
|
|
5
|
+
#
|
|
6
|
+
# You can redistribute it and/or modify it under the terms of
|
|
7
|
+
# the Mozilla Public License version 1.1 as published by the
|
|
8
|
+
# Mozilla Foundation at http://www.mozilla.org/MPL/MPL-1.1.txt
|
|
9
|
+
|
|
10
|
+
require 'set'
|
|
11
|
+
require File.dirname(__FILE__) + '/../data/constants'
|
|
12
|
+
require File.dirname(__FILE__) + '/../data/data_set'
|
|
13
|
+
require File.dirname(__FILE__) + '/../classifiers/classifier'
|
|
14
|
+
|
|
15
|
+
module Ai4r
|
|
16
|
+
module Classifiers
|
|
17
|
+
|
|
18
|
+
include Ai4r::Data
|
|
19
|
+
|
|
20
|
+
# = Introduction
|
|
21
|
+
#
|
|
22
|
+
# A fast classifier algorithm, created by Lucio de Souza Coelho
|
|
23
|
+
# and Len Trigg.
|
|
24
|
+
class Hyperpipes < Classifier
|
|
25
|
+
|
|
26
|
+
attr_reader :data_set, :pipes
|
|
27
|
+
|
|
28
|
+
# Build a new Hyperpipes classifier. You must provide a DataSet instance
# as parameter. The last attribute of each item is considered as
# the item class.
#
# One pipe is created for every value found in the last domain returned by
# data_set.build_domains, and every data item then updates the pipe of its
# own class (item.last).
#
# Returns self.
def build(data_set)
  @data_set = data_set
  @domains = data_set.build_domains

  @pipes = {}
  # build_pipe reads data_set.data_items, so it must receive the data set
  # itself rather than the domains array.
  @domains.last.each { |cat| @pipes[cat] = build_pipe(@data_set) }
  @data_set.data_items.each { |item| update_pipe(@pipes[item.last], item) }

  return self
end
|
|
41
|
+
|
|
42
|
+
# You can evaluate new data, predicting its class.
# e.g.
#   classifier.eval(['New York', '<30', 'F'])  # => 'Y'
#
# Every pipe (one per category) gets one vote for each attribute of +data+
# that it contains: a numeric value must lie inside the learned
# [:min, :max] interval (bounds included, so values seen during training
# always match), any other value must have been registered in the pipe.
# The category with most votes is returned, or nil when no pipe received
# a single vote.
def eval(data)
  votes = Hash.new {0}
  @pipes.each do |category, pipe|
    pipe.each_with_index do |bounds, i|
      if data[i].is_a? Numeric
        votes[category]+=1 if data[i]>=bounds[:min] && data[i]<=bounds[:max]
      else
        votes[category]+=1 if bounds[data[i]]
      end
    end
  end
  # max returns nil for an empty vote table; do not call first on it.
  winner = votes.to_a.max {|x, y| x.last <=> y.last}
  return winner && winner.first
end
|
|
58
|
+
|
|
59
|
+
# This method returns the generated rules in ruby code, one statement per
# line. The code expects one local variable per attribute, named after the
# corresponding data label, and evaluates to the winning category.
# e.g.
#
#   classifier.get_rules
#     # => votes = Hash.new {0}
#          votes['Y'] += 1 if age >= 18 && age <= 30
#          votes['Y'] += 1 if {"New York"=>true}[city]
#          ...
#          votes.to_a.max {|x, y| x.last <=> y.last}.first
#
# It is a nice way to inspect induction results, and also to execute them:
#   age, city = 25, 'New York'
#   marketing_target = eval classifier.get_rules
#   puts marketing_target
#     # => 'Y'
def get_rules
  rules = []
  rules << "votes = Hash.new {0}"
  # The first data item is only used to tell numeric attributes from others.
  data = @data_set.data_items.first
  labels = @data_set.data_labels.collect {|l| l.to_s}
  @pipes.each do |category, pipe|
    pipe.each_with_index do |bounds, i|
      rule = "votes['#{category}'] += 1 "
      if data[i].is_a? Numeric
        rule += "if #{labels[i]} >= #{bounds[:min]} && #{labels[i]} <= #{bounds[:max]}"
      else
        # Index the value table with the attribute variable, not with the
        # label text itself.
        rule += "if #{bounds.inspect}[#{labels[i]}]"
      end
      rules << rule
    end
  end
  rules << "votes.to_a.max {|x, y| x.last <=> y.last}.first"
  # Double quotes: the statements must be separated by real newlines.
  return rules.join("\n")
end
|
|
92
|
+
|
|
93
|
+
protected
|
|
94
|
+
|
|
95
|
+
# Create an empty pipe shaped after the first item of +data_set+, ignoring
# its last attribute (the class). Numeric attributes get an inverted
# {:min, :max} interval, so that the first observed value replaces both
# bounds; every other attribute gets a hash answering false for values that
# were never registered.
def build_pipe(data_set)
  sample_attributes = data_set.data_items.first[0...-1]
  sample_attributes.collect do |att|
    next Hash.new(false) unless att.is_a? Numeric
    {:min=>POSITIVE_INFINITY, :max=>NEGATIVE_INFINITY}
  end
end
|
|
104
|
+
|
|
105
|
+
# Widen +pipe+ so that it contains +data_item+. The last attribute of the
# item (its class) is skipped. For a numeric attribute the [:min, :max]
# interval at the same position is extended when needed; any other value is
# registered as seen in the table at that position.
def update_pipe(pipe, data_item)
  data_item[0...-1].each_with_index do |att, i|
    # att is a single attribute value, so it is tested directly.
    if att.is_a? Numeric
      pipe[i][:min] = att if att < pipe[i][:min]
      pipe[i][:max] = att if att > pipe[i][:max]
    else
      pipe[i][att] = true
    end
  end
end
|
|
115
|
+
|
|
116
|
+
end
|
|
117
|
+
end
|
|
118
|
+
end
|
|
@@ -13,14 +13,25 @@ require File.dirname(__FILE__) + '/../clusterers/single_linkage'
|
|
|
13
13
|
module Ai4r
|
|
14
14
|
module Clusterers
|
|
15
15
|
|
|
16
|
-
# Implementation of a Hierarchical clusterer with
|
|
16
|
+
# Implementation of a Hierarchical clusterer with group average
|
|
17
|
+
# linkage, AKA unweighted pair group method average or UPGMA (Everitt
|
|
18
|
+
# et al., 2001 ; Jain and Dubes, 1988 ; Sokal and Michener, 1958).
|
|
17
19
|
# Hierarchical clusterers create one cluster per element, and then
|
|
18
20
|
# progressively merge clusters, until the required number of clusters
|
|
19
21
|
# is reached.
|
|
20
|
-
# With average linkage, the distance between
|
|
21
|
-
# the average distance between
|
|
22
|
+
# With average linkage, the distance between a cluster cx and
|
|
23
|
+
# cluster (ci U cj) is the average of the distances between cx and ci, and
|
|
24
|
+
# cx and cj.
|
|
25
|
+
#
|
|
26
|
+
# D(cx, (ci U cj)) = (D(cx, ci) + D(cx, cj)) / 2
|
|
22
27
|
class AverageLinkage < SingleLinkage
|
|
23
28
|
|
|
29
|
+
parameters_info :distance_function =>
|
|
30
|
+
"Custom implementation of distance function. " +
|
|
31
|
+
"It must be a closure receiving two data items and return the " +
|
|
32
|
+
"distance bewteen them. By default, this algorithm uses " +
|
|
33
|
+
"ecuclidean distance of numeric attributes to the power of 2."
|
|
34
|
+
|
|
24
35
|
# Build a new clusterer, using data examples found in data_set.
|
|
25
36
|
# Items will be clustered in "number_of_clusters" different
|
|
26
37
|
# clusters.
|
|
@@ -28,31 +39,19 @@ module Ai4r
|
|
|
28
39
|
super
|
|
29
40
|
end
|
|
30
41
|
|
|
31
|
-
#
|
|
32
|
-
#
|
|
42
|
+
# This algorithm does not allow classification of new data items
# once it has been built. Rebuild the cluster including your data element.
#
# Always raises a RuntimeError.
def eval(data_item)
  raise "Eval of new data is not supported by this algorithm."
end
|
|
36
47
|
|
|
37
48
|
protected
|
|
38
49
|
|
|
39
|
-
#
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
dist_sum += read_distance_matrix(index_a, index_b)
|
|
45
|
-
end
|
|
46
|
-
end
|
|
47
|
-
return dist_sum/(cluster_a.length*cluster_b.length)
|
|
48
|
-
end
|
|
49
|
-
|
|
50
|
-
def distance_between_item_and_cluster(data_item, cluster)
|
|
51
|
-
dist_sum = 0.0
|
|
52
|
-
cluster.data_items.each do |another_item|
|
|
53
|
-
dist_sum += distance(data_item, another_item)
|
|
54
|
-
end
|
|
55
|
-
return dist_sum/cluster.data_items.length
|
|
50
|
+
# Return the distance between cluster cx and cluster (ci U cj),
# using average linkage: the mean of the stored distances cx-ci and cx-cj.
# The division is done in floating point so that integer distances
# (e.g. from a custom distance function) are not truncated.
# NOTE(review): both distances get the same weight whatever the sizes of ci
# and cj are - confirm this is the intended definition of group average.
def linkage_distance(cx, ci, cj)
  (read_distance_matrix(cx, ci)+
    read_distance_matrix(cx, cj))/2.0
end
|
|
57
56
|
|
|
58
57
|
end
|
|
@@ -0,0 +1,66 @@
|
|
|
1
|
+
# Author:: Sergio Fierens (implementation)
|
|
2
|
+
# License:: MPL 1.1
|
|
3
|
+
# Project:: ai4r
|
|
4
|
+
# Url:: http://ai4r.rubyforge.org/
|
|
5
|
+
#
|
|
6
|
+
# You can redistribute it and/or modify it under the terms of
|
|
7
|
+
# the Mozilla Public License version 1.1 as published by the
|
|
8
|
+
# Mozilla Foundation at http://www.mozilla.org/MPL/MPL-1.1.txt
|
|
9
|
+
|
|
10
|
+
require File.dirname(__FILE__) + '/../data/data_set'
|
|
11
|
+
require File.dirname(__FILE__) + '/../clusterers/single_linkage'
|
|
12
|
+
|
|
13
|
+
module Ai4r
|
|
14
|
+
module Clusterers
|
|
15
|
+
|
|
16
|
+
# Implementation of an Agglomerative Hierarchical clusterer with
|
|
17
|
+
# centroid linkage algorithm, aka unweighted pair group method
|
|
18
|
+
# centroid (UPGMC) (Everitt et al., 2001 ; Jain and Dubes, 1988 ;
|
|
19
|
+
# Sokal and Michener, 1958 )
|
|
20
|
+
# Hierarchical clusterers create one cluster per element, and then
|
|
21
|
+
# progressively merge clusters, until the required number of clusters
|
|
22
|
+
# is reached.
|
|
23
|
+
# The distance between clusters is the squared euclidean distance
|
|
24
|
+
# between their centroids.
|
|
25
|
+
#
|
|
26
|
+
# D(cx, (ci U cj)) = | mx - mij |^2
|
|
27
|
+
# D(cx, (ci U cj)) = (ni/(ni+nj))*D(cx, ci) +
|
|
28
|
+
# (nj/(ni+nj))*D(cx, cj) -
|
|
29
|
+
# (ni*nj/(ni+nj)^2)*D(ci, cj)
|
|
30
|
+
class CentroidLinkage < SingleLinkage
|
|
31
|
+
|
|
32
|
+
parameters_info :distance_function =>
|
|
33
|
+
"Custom implementation of distance function. " +
|
|
34
|
+
"It must be a closure receiving two data items and return the " +
|
|
35
|
+
"distance bewteen them. By default, this algorithm uses " +
|
|
36
|
+
"ecuclidean distance of numeric attributes to the power of 2."
|
|
37
|
+
|
|
38
|
+
# Build a new clusterer, using data examples found in data_set.
|
|
39
|
+
# Items will be clustered in "number_of_clusters" different
|
|
40
|
+
# clusters.
|
|
41
|
+
def build(data_set, number_of_clusters)
|
|
42
|
+
super
|
|
43
|
+
end
|
|
44
|
+
|
|
45
|
+
# This algorithm does not allow classification of new data items
# once it has been built. Rebuild the cluster including your data element.
#
# Always raises a RuntimeError.
def eval(data_item)
  raise "Eval of new data is not supported by this algorithm."
end
|
|
50
|
+
|
|
51
|
+
protected
|
|
52
|
+
|
|
53
|
+
# Distance between cluster cx and the merged cluster (ci U cj) with
# centroid linkage. The stored distances cx-ci and cx-cj are weighted by
# the sizes of ci and cj, and the ci-cj distance is subtracted with weight
# ni*nj/(ni+nj); the result is divided by (ni+nj).
def linkage_distance(cx, ci, cj)
  size_i = @index_clusters[ci].length
  size_j = @index_clusters[cj].length
  merged_size = size_i + size_j
  weighted_sum = size_i * read_distance_matrix(cx, ci) +
    size_j * read_distance_matrix(cx, cj)
  # The 1.0 factor forces floating point arithmetic.
  correction = 1.0 * size_i * size_j * read_distance_matrix(ci, cj) / merged_size
  (weighted_sum - correction) / merged_size
end
|
|
62
|
+
|
|
63
|
+
end
|
|
64
|
+
end
|
|
65
|
+
end
|
|
66
|
+
|
|
@@ -13,14 +13,24 @@ require File.dirname(__FILE__) + '/../clusterers/single_linkage'
|
|
|
13
13
|
module Ai4r
|
|
14
14
|
module Clusterers
|
|
15
15
|
|
|
16
|
-
# Implementation of a Hierarchical clusterer with complete linkage
|
|
16
|
+
# Implementation of a Hierarchical clusterer with complete linkage (Everitt
|
|
17
|
+
# et al., 2001 ; Jain and Dubes, 1988 ; Sorensen, 1948 ).
|
|
17
18
|
# Hierarchical clusterers create one cluster per element, and then
|
|
18
19
|
# progressively merge clusters, until the required number of clusters
|
|
19
20
|
# is reached.
|
|
20
21
|
# With complete linkage, the distance between two clusters is computed as
|
|
21
22
|
# the maximum distance between elements of each cluster.
|
|
23
|
+
#
|
|
24
|
+
# D(cx, (ci U cj)) = max(D(cx, ci), D(cx, cj))
|
|
22
25
|
class CompleteLinkage < SingleLinkage
|
|
23
26
|
|
|
27
|
+
parameters_info :distance_function =>
|
|
28
|
+
"Custom implementation of distance function. " +
|
|
29
|
+
"It must be a closure receiving two data items and return the " +
|
|
30
|
+
"distance bewteen them. By default, this algorithm uses " +
|
|
31
|
+
"ecuclidean distance of numeric attributes to the power of 2."
|
|
32
|
+
|
|
33
|
+
|
|
24
34
|
# Build a new clusterer, using data examples found in data_set.
|
|
25
35
|
# Items will be clustered in "number_of_clusters" different
|
|
26
36
|
# clusters.
|
|
@@ -36,22 +46,17 @@ module Ai4r
|
|
|
36
46
|
|
|
37
47
|
protected
|
|
38
48
|
|
|
39
|
-
#
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
dist = read_distance_matrix(index_a, index_b)
|
|
45
|
-
max_dist = dist if dist > max_dist
|
|
46
|
-
end
|
|
47
|
-
end
|
|
48
|
-
return max_dist
|
|
49
|
+
# Distance between cluster cx and the new cluster (ci U cj) with complete
# linkage: the larger of the stored distances cx-ci and cx-cj.
def linkage_distance(cx, ci, cj)
  candidates = [ci, cj].collect { |merged| read_distance_matrix(cx, merged) }
  candidates.max
end
|
|
50
55
|
|
|
51
56
|
def distance_between_item_and_cluster(data_item, cluster)
|
|
52
57
|
max_dist = 0
|
|
53
58
|
cluster.data_items.each do |another_item|
|
|
54
|
-
dist =
|
|
59
|
+
dist = @distance_function.call(data_item, another_item)
|
|
55
60
|
max_dist = dist if dist > max_dist
|
|
56
61
|
end
|
|
57
62
|
return max_dist
|
|
@@ -0,0 +1,139 @@
|
|
|
1
|
+
# Author:: Sergio Fierens (implementation)
|
|
2
|
+
# License:: MPL 1.1
|
|
3
|
+
# Project:: ai4r
|
|
4
|
+
# Url:: http://ai4r.rubyforge.org/
|
|
5
|
+
#
|
|
6
|
+
# You can redistribute it and/or modify it under the terms of
|
|
7
|
+
# the Mozilla Public License version 1.1 as published by the
|
|
8
|
+
# Mozilla Foundation at http://www.mozilla.org/MPL/MPL-1.1.txt
|
|
9
|
+
|
|
10
|
+
require File.dirname(__FILE__) + '/../data/data_set'
|
|
11
|
+
require File.dirname(__FILE__) + '/../data/proximity'
|
|
12
|
+
require File.dirname(__FILE__) + '/../clusterers/clusterer'
|
|
13
|
+
|
|
14
|
+
module Ai4r
|
|
15
|
+
module Clusterers
|
|
16
|
+
|
|
17
|
+
# DIANA (Divisive ANAlysis) (Kaufman and Rousseeuw, 1990;
|
|
18
|
+
# Macnaughton - Smith et al. 1964) is a Divisive Hierarchical
|
|
19
|
+
# Clusterer. It begins with only one cluster with all data items,
|
|
20
|
+
# and divides the clusters until the desired clusters number is reached.
|
|
21
|
+
class Diana < Clusterer
|
|
22
|
+
|
|
23
|
+
attr_reader :data_set, :number_of_clusters, :clusters
|
|
24
|
+
|
|
25
|
+
parameters_info :distance_function =>
|
|
26
|
+
"Custom implementation of distance function. " +
|
|
27
|
+
"It must be a closure receiving two data items and return the " +
|
|
28
|
+
"distance bewteen them. By default, this algorithm uses " +
|
|
29
|
+
"ecuclidean distance of numeric attributes to the power of 2."
|
|
30
|
+
|
|
31
|
+
def initialize
|
|
32
|
+
@distance_function = lambda do |a,b|
|
|
33
|
+
Ai4r::Data::Proximity.squared_euclidean_distance(
|
|
34
|
+
a.select {|att_a| att_a.is_a? Numeric} ,
|
|
35
|
+
b.select {|att_b| att_b.is_a? Numeric})
|
|
36
|
+
end
|
|
37
|
+
end
|
|
38
|
+
|
|
39
|
+
# Build a new clusterer, using divisive analysis (DIANA algorithm).
# Starts from a single cluster holding a slice of the whole data set. Until
# number_of_clusters clusters exist, the cluster chosen by
# max_diameter_cluster is split: a splinter cluster is initialised from it,
# and items keep moving to the splinter while max_distance_difference
# reports a non-negative difference.
#
# Returns self.
def build(data_set, number_of_clusters)
  @data_set = data_set
  @number_of_clusters = number_of_clusters
  @clusters = [@data_set[0..-1]]

  until @clusters.length >= @number_of_clusters
    source_cluster = @clusters[max_diameter_cluster(@clusters)]
    splinter_cluster = init_splinter_cluster(source_cluster)
    while true
      gain, position = max_distance_difference(source_cluster, splinter_cluster)
      # A negative difference means no remaining item prefers the splinter.
      break if gain < 0
      splinter_cluster << source_cluster.data_items[position]
      source_cluster.data_items.delete_at(position)
    end
    @clusters << splinter_cluster
  end

  return self
end
|
|
60
|
+
|
|
61
|
+
# Classifies the given data item, returning the cluster index it belongs
# to (0-based). For every cluster the summed distance to its items is
# divided by the number of items; get_min_index then selects the result
# from those per-cluster averages.
def eval(data_item)
  mean_distances = @clusters.collect do |cluster|
    distance_sum(data_item, cluster) / cluster.data_items.length
  end
  get_min_index(mean_distances)
end
|
|
68
|
+
|
|
69
|
+
protected
|
|
70
|
+
|
|
71
|
+
# Return the index (within +clusters+) of the cluster with the largest
# diameter. The first cluster wins ties, and index 0 is returned when no
# cluster has a diameter above zero.
def max_diameter_cluster(clusters)
  widest_index = 0
  widest = 0
  clusters.each_index do |position|
    candidate = cluster_diameter(clusters[position])
    next unless candidate > widest
    widest_index = position
    widest = candidate
  end
  return widest_index
end
|
|
84
|
+
|
|
85
|
+
# Largest distance between any two items of +cluster+, as measured by
# @distance_function. Each pair is evaluated once, with the later item as
# first argument. Returns 0 for clusters with fewer than two items.
def cluster_diameter(cluster)
  items = cluster.data_items
  widest = 0
  (1...items.length).each do |later|
    (0...later).each do |earlier|
      gap = @distance_function.call(items[later], items[earlier])
      widest = gap if gap > widest
    end
  end
  return widest
end
|
|
96
|
+
|
|
97
|
+
# Create a cluster with the item that has the max summed distance
# to the items of cluster_to_split (the first such item on ties).
# That item is removed from cluster_to_split, and the new cluster
# holding it is returned.
def init_splinter_cluster(cluster_to_split)
  farthest_sum = 0.0
  farthest_index = 0
  members = cluster_to_split.data_items
  members.each_index do |position|
    total = distance_sum(members[position], cluster_to_split)
    next unless total > farthest_sum
    farthest_sum = total
    farthest_index = position
  end
  # Take the slice before deleting, while the index is still valid.
  splinter_cluster = cluster_to_split[farthest_index]
  cluster_to_split.data_items.delete_at(farthest_index)
  return splinter_cluster
end
|
|
111
|
+
|
|
112
|
+
# For every item of cluster_to_split, compute its average distance to the
# other items of that cluster minus its average distance to the items of
# splinter_cluster. Returns the largest such difference together with the
# index of the item it belongs to.
# A positive value means that the item is closer to the
# splinter group than to its current cluster.
def max_distance_difference(cluster_to_split, splinter_cluster)
  best_gain = -1.0/0
  best_position = 0
  remaining = cluster_to_split.data_items
  remaining.each_index do |position|
    candidate = remaining[position]
    # The item itself contributes a zero distance, hence length-1.
    stay_cost = distance_sum(candidate, cluster_to_split) / (cluster_to_split.data_items.length-1)
    move_cost = distance_sum(candidate, splinter_cluster) / (splinter_cluster.data_items.length)
    gain = stay_cost - move_cost
    if gain > best_gain
      best_gain = gain
      best_position = position
    end
  end
  return best_gain, best_position
end
|
|
129
|
+
|
|
130
|
+
# Add up, as a float, the @distance_function distance between item_a and
# every item of +cluster+. Returns 0.0 for a cluster without items.
def distance_sum(item_a, cluster)
  total = 0.0
  cluster.data_items.each do |item_b|
    total += @distance_function.call(item_a, item_b)
  end
  total
end
|
|
136
|
+
|
|
137
|
+
end
|
|
138
|
+
end
|
|
139
|
+
end
|