RubyGems - ai4r - Versions diffs - 1.5 → 1.6 - Mend

ai4r 1.5 → 1.6

Files changed (216) hide show

data/examples/clusterers/simple_website_clustering.rb +47 -0
data/lib/ai4r.rb +7 -0
data/lib/ai4r/classifiers/hyperpipes.rb +118 -0
data/lib/ai4r/clusterers/average_linkage.rb +22 -23
data/lib/ai4r/clusterers/centroid_linkage.rb +66 -0
data/lib/ai4r/clusterers/complete_linkage.rb +17 -12
data/lib/ai4r/clusterers/diana.rb +139 -0
data/lib/ai4r/clusterers/median_linkage.rb +61 -0
data/lib/ai4r/clusterers/single_linkage.rb +57 -42
data/lib/ai4r/clusterers/ward_linkage.rb +64 -0
data/lib/ai4r/clusterers/weighted_average_linkage.rb +61 -0
data/lib/ai4r/data/constants.rb +18 -0
data/lib/ai4r/data/data_set.rb +5 -3
data/lib/ai4r/data/proximity.rb +18 -0
data/test/clusterers/average_linkage_test.rb +14 -11
data/test/clusterers/bisecting_k_means_test.rb +9 -0
data/test/clusterers/centroid_linkage_test.rb +50 -0
data/test/clusterers/complete_linkage_test.rb +14 -5
data/test/clusterers/diana_test.rb +69 -0
data/test/clusterers/k_means_test.rb +9 -0
data/test/clusterers/median_linkage_test.rb +50 -0
data/test/clusterers/single_linkage_test.rb +15 -6
data/test/clusterers/ward_linkage_test.rb +50 -0
data/test/clusterers/weighted_average_linkage_test.rb +50 -0
data/test/data/data_set_test.rb +14 -0
data/test/data/proximity_test.rb +10 -0
metadata +87 -298
data/site/build/site/en/broken-links.xml +0 -2
data/site/build/site/en/build/tmp/build-info.xml +0 -5
data/site/build/site/en/build/tmp/plugins-1.xml +0 -212
data/site/build/site/en/build/tmp/plugins-2.xml +0 -252
data/site/build/site/en/build/tmp/projfilters.properties +0 -41
data/site/build/site/en/downloads.html +0 -200
data/site/build/site/en/downloads.pdf +0 -151
data/site/build/site/en/geneticAlgorithms.html +0 -591
data/site/build/site/en/geneticAlgorithms.pdf +0 -934
data/site/build/site/en/images/ai4r-logo.png +0 -0
data/site/build/site/en/images/built-with-forrest-button.png +0 -0
data/site/build/site/en/images/c.png +0 -0
data/site/build/site/en/images/c_wbn.png +0 -0
data/site/build/site/en/images/c_wn.png +0 -0
data/site/build/site/en/images/ero.gif +0 -0
data/site/build/site/en/images/europe2.png +0 -0
data/site/build/site/en/images/europe3.png +0 -0
data/site/build/site/en/images/fitness.png +0 -0
data/site/build/site/en/images/genetic_algorithms_example.png +0 -0
data/site/build/site/en/images/instruction_arrow.png +0 -0
data/site/build/site/en/images/jadeferret.png +0 -0
data/site/build/site/en/images/my_email.png +0 -0
data/site/build/site/en/images/neural_network_example.png +0 -0
data/site/build/site/en/images/rubyforge.png +0 -0
data/site/build/site/en/images/s.png +0 -0
data/site/build/site/en/images/s_wbn.png +0 -0
data/site/build/site/en/images/s_wn.png +0 -0
data/site/build/site/en/images/sigmoid.png +0 -0
data/site/build/site/en/images/t.png +0 -0
data/site/build/site/en/images/t_wbn.png +0 -0
data/site/build/site/en/images/t_wn.png +0 -0
data/site/build/site/en/index.html +0 -390
data/site/build/site/en/index.pdf +0 -657
data/site/build/site/en/linkmap.html +0 -261
data/site/build/site/en/linkmap.pdf +0 -94
data/site/build/site/en/locationmap.xml +0 -72
data/site/build/site/en/machineLearning.html +0 -340
data/site/build/site/en/machineLearning.pdf +0 -337
data/site/build/site/en/neuralNetworks.html +0 -521
data/site/build/site/en/neuralNetworks.pdf +0 -671
data/site/build/site/en/skin/CommonMessages_de.xml +0 -23
data/site/build/site/en/skin/CommonMessages_en_US.xml +0 -23
data/site/build/site/en/skin/CommonMessages_es.xml +0 -23
data/site/build/site/en/skin/CommonMessages_fr.xml +0 -23
data/site/build/site/en/skin/basic.css +0 -166
data/site/build/site/en/skin/breadcrumbs-optimized.js +0 -90
data/site/build/site/en/skin/breadcrumbs.js +0 -237
data/site/build/site/en/skin/fontsize.js +0 -166
data/site/build/site/en/skin/getBlank.js +0 -40
data/site/build/site/en/skin/getMenu.js +0 -45
data/site/build/site/en/skin/images/README.txt +0 -1
data/site/build/site/en/skin/images/add.jpg +0 -0
data/site/build/site/en/skin/images/built-with-forrest-button.png +0 -0
data/site/build/site/en/skin/images/chapter.gif +0 -0
data/site/build/site/en/skin/images/chapter_open.gif +0 -0
data/site/build/site/en/skin/images/current.gif +0 -0
data/site/build/site/en/skin/images/error.png +0 -0
data/site/build/site/en/skin/images/external-link.gif +0 -0
data/site/build/site/en/skin/images/fix.jpg +0 -0
data/site/build/site/en/skin/images/forrest-credit-logo.png +0 -0
data/site/build/site/en/skin/images/hack.jpg +0 -0
data/site/build/site/en/skin/images/header_white_line.gif +0 -0
data/site/build/site/en/skin/images/info.png +0 -0
data/site/build/site/en/skin/images/instruction_arrow.png +0 -0
data/site/build/site/en/skin/images/label.gif +0 -0
data/site/build/site/en/skin/images/page.gif +0 -0
data/site/build/site/en/skin/images/pdfdoc.gif +0 -0
data/site/build/site/en/skin/images/poddoc.png +0 -0
data/site/build/site/en/skin/images/printer.gif +0 -0
data/site/build/site/en/skin/images/rc-b-l-15-1body-2menu-3menu.png +0 -0
data/site/build/site/en/skin/images/rc-b-r-15-1body-2menu-3menu.png +0 -0
data/site/build/site/en/skin/images/rc-b-r-5-1header-2tab-selected-3tab-selected.png +0 -0
data/site/build/site/en/skin/images/rc-t-l-5-1header-2searchbox-3searchbox.png +0 -0
data/site/build/site/en/skin/images/rc-t-l-5-1header-2tab-selected-3tab-selected.png +0 -0
data/site/build/site/en/skin/images/rc-t-l-5-1header-2tab-unselected-3tab-unselected.png +0 -0
data/site/build/site/en/skin/images/rc-t-r-15-1body-2menu-3menu.png +0 -0
data/site/build/site/en/skin/images/rc-t-r-5-1header-2searchbox-3searchbox.png +0 -0
data/site/build/site/en/skin/images/rc-t-r-5-1header-2tab-selected-3tab-selected.png +0 -0
data/site/build/site/en/skin/images/rc-t-r-5-1header-2tab-unselected-3tab-unselected.png +0 -0
data/site/build/site/en/skin/images/remove.jpg +0 -0
data/site/build/site/en/skin/images/rss.png +0 -0
data/site/build/site/en/skin/images/spacer.gif +0 -0
data/site/build/site/en/skin/images/success.png +0 -0
data/site/build/site/en/skin/images/txtdoc.png +0 -0
data/site/build/site/en/skin/images/update.jpg +0 -0
data/site/build/site/en/skin/images/valid-html401.png +0 -0
data/site/build/site/en/skin/images/vcss.png +0 -0
data/site/build/site/en/skin/images/warning.png +0 -0
data/site/build/site/en/skin/images/xmldoc.gif +0 -0
data/site/build/site/en/skin/menu.js +0 -48
data/site/build/site/en/skin/note.txt +0 -50
data/site/build/site/en/skin/print.css +0 -54
data/site/build/site/en/skin/profile.css +0 -163
data/site/build/site/en/skin/prototype.js +0 -1257
data/site/build/site/en/skin/screen.css +0 -587
data/site/build/site/en/sourceCode.html +0 -244
data/site/build/site/en/sourceCode.pdf +0 -278
data/site/build/site/en/svn.html +0 -244
data/site/build/site/en/svn.pdf +0 -278
data/site/build/tmp/brokenlinks.xml +0 -2
data/site/build/tmp/build-info.xml +0 -5
data/site/build/tmp/cocoon-work/cache-dir/cocoon-ehcache-1.data +0 -0
data/site/build/tmp/cocoon-work/cache-dir/cocoon-ehcache-1.index +0 -0
data/site/build/tmp/input.xmap +0 -32
data/site/build/tmp/internal.xmap +0 -32
data/site/build/tmp/locationmap.xml +0 -29
data/site/build/tmp/output.xmap +0 -38
data/site/build/tmp/pluginlist2fetchbuild.xml +0 -144
data/site/build/tmp/plugins-1.xml +0 -201
data/site/build/tmp/plugins-2.xml +0 -401
data/site/build/tmp/projfilters.properties +0 -41
data/site/build/tmp/resources.xmap +0 -32
data/site/build/webapp/WEB-INF/logs/access.log +0 -0
data/site/build/webapp/WEB-INF/logs/core.log +0 -775
data/site/build/webapp/WEB-INF/logs/debug.log +0 -0
data/site/build/webapp/WEB-INF/logs/error.log +0 -213
data/site/build/webapp/WEB-INF/logs/flow.log +0 -0
data/site/build/webapp/WEB-INF/logs/idgen.log +0 -0
data/site/build/webapp/WEB-INF/logs/linkrewriter.log +0 -0
data/site/build/webapp/WEB-INF/logs/locationmap.log +0 -0
data/site/build/webapp/WEB-INF/logs/sitemap.log +0 -0
data/site/build/webapp/WEB-INF/logs/xmlform.log +0 -0
data/site/forrest.properties +0 -152
data/site/forrest.properties.dispatcher.properties +0 -25
data/site/forrest.properties.xml +0 -29
data/site/src/documentation/README.txt +0 -7
data/site/src/documentation/classes/CatalogManager.properties +0 -62
data/site/src/documentation/content/locationmap.xml +0 -72
data/site/src/documentation/content/xdocs/downloads.html +0 -9
data/site/src/documentation/content/xdocs/geneticAlgorithms.xml +0 -294
data/site/src/documentation/content/xdocs/index.xml +0 -129
data/site/src/documentation/content/xdocs/machineLearning.xml +0 -131
data/site/src/documentation/content/xdocs/neuralNetworks.xml +0 -270
data/site/src/documentation/content/xdocs/site.xml +0 -54
data/site/src/documentation/content/xdocs/sourceCode.xml +0 -43
data/site/src/documentation/content/xdocs/tabs.xml +0 -35
data/site/src/documentation/resources/images/ai4r-logo.png +0 -0
data/site/src/documentation/resources/images/c.png +0 -0
data/site/src/documentation/resources/images/c_wbn.png +0 -0
data/site/src/documentation/resources/images/c_wn.png +0 -0
data/site/src/documentation/resources/images/ellipse-2.svg +0 -30
data/site/src/documentation/resources/images/ero.gif +0 -0
data/site/src/documentation/resources/images/europe2.png +0 -0
data/site/src/documentation/resources/images/europe3.png +0 -0
data/site/src/documentation/resources/images/fitness.png +0 -0
data/site/src/documentation/resources/images/genetic_algorithms_example.png +0 -0
data/site/src/documentation/resources/images/icon-a.png +0 -0
data/site/src/documentation/resources/images/icon-b.png +0 -0
data/site/src/documentation/resources/images/icon.png +0 -0
data/site/src/documentation/resources/images/jadeferret.png +0 -0
data/site/src/documentation/resources/images/my_email.png +0 -0
data/site/src/documentation/resources/images/neural_network_example.png +0 -0
data/site/src/documentation/resources/images/project-logo.png +0 -0
data/site/src/documentation/resources/images/rubyforge.png +0 -0
data/site/src/documentation/resources/images/s.png +0 -0
data/site/src/documentation/resources/images/s_wbn.png +0 -0
data/site/src/documentation/resources/images/s_wn.png +0 -0
data/site/src/documentation/resources/images/sigmoid.png +0 -0
data/site/src/documentation/resources/images/sub-dir/icon-c.png +0 -0
data/site/src/documentation/resources/images/t.png +0 -0
data/site/src/documentation/resources/images/t_wbn.png +0 -0
data/site/src/documentation/resources/images/t_wn.png +0 -0
data/site/src/documentation/resources/schema/catalog.xcat +0 -29
data/site/src/documentation/resources/schema/hello-v10.dtd +0 -51
data/site/src/documentation/resources/schema/symbols-project-v10.ent +0 -26
data/site/src/documentation/resources/stylesheets/hello2document.xsl +0 -33
data/site/src/documentation/sitemap.xmap +0 -66
data/site/src/documentation/skinconf.xml +0 -418
data/site/src/documentation/translations/langcode.xml +0 -29
data/site/src/documentation/translations/languages_de.xml +0 -24
data/site/src/documentation/translations/languages_en.xml +0 -24
data/site/src/documentation/translations/languages_es.xml +0 -22
data/site/src/documentation/translations/languages_fr.xml +0 -24
data/site/src/documentation/translations/languages_nl.xml +0 -24
data/site/src/documentation/translations/menu.xml +0 -33
data/site/src/documentation/translations/menu_af.xml +0 -33
data/site/src/documentation/translations/menu_de.xml +0 -33
data/site/src/documentation/translations/menu_es.xml +0 -33
data/site/src/documentation/translations/menu_fr.xml +0 -33
data/site/src/documentation/translations/menu_it.xml +0 -33
data/site/src/documentation/translations/menu_nl.xml +0 -33
data/site/src/documentation/translations/menu_no.xml +0 -33
data/site/src/documentation/translations/menu_ru.xml +0 -33
data/site/src/documentation/translations/menu_sk.xml +0 -33
data/site/src/documentation/translations/tabs.xml +0 -22
data/site/src/documentation/translations/tabs_de.xml +0 -22
data/site/src/documentation/translations/tabs_es.xml +0 -22
data/site/src/documentation/translations/tabs_fr.xml +0 -22
data/site/src/documentation/translations/tabs_nl.xml +0 -22

data/examples/clusterers/simple_website_clustering.rb ADDED

@@ -0,0 +1,47 @@
+require File.dirname(__FILE__) + '/google_search'
+require File.dirname(__FILE__) + '/build_keywords'
+require File.dirname(__FILE__) + '/../../lib/ai4r/clusterers/average_linkage'
+require 'rubygems'
+require 'hpricot'
+require 'net/http'
+require 'benchmark'
+SITES_TO_CLASSIFY = [
+  "www.foxnews.com", "www.usatoday.com", "scm.jadeferret.com",
+  "www.accurev.com", "www.lastminute.com", "subversion.tigris.org",
+  "news.yahoo.com", "news.bbc.co.uk", "www.orbitz.com"
+]
+# Return array of keywords for the site
+def get_keywords(site)
+  response = Net::HTTP.get_response(site, "/")
+  Hpricot(response.body).
+    search("meta[@name='keywords']")[0]. #Select meta keywords element
+    attributes["content"].               #Select its content
+    split(",").                          #Keywords are coma separated
+    collect{ |k| k.strip.downcase }      #Remove start and end white spaces
+end
+# Get keywords data for each website
+Site = Struct.new("Site", :name, :keywords)
+sites = SITES_TO_CLASSIFY.collect do |site_name|
+  Site.new(site_name, get_keywords(site_name))
+end
+data_set = Ai4r::Data::DataSet.new(:data_items => sites,
+  :data_labels => Site.members)
+# The distance between sites depends on the keywords collected from internet
+keywords_distance_function = lambda do |x,y|
+  return Ai4r::Data::Proximity.simple_matching(x.keyword, y.keywords)
+end
+# Create the clusters
+clusterer = Ai4r::Clusterers::AverageLinkage.new
+clusterer.distance_function = keywords_distance_function
+clusterer.build(data_set, 3)
+# Print results
+clusterer.clusters.each do |cluster|
+  puts cluster.data_items.collect {|item| item.name}.join(", ")
+  puts "============"
+end

data/lib/ai4r.rb CHANGED

@@ -1,6 +1,7 @@
 # Data
 require "ai4r/data/data_set"
 require "ai4r/data/statistics"
+require "ai4r/data/proximity"
 require "ai4r/data/parameterizable"
 # Clusterers
 require "ai4r/clusterers/clusterer"
@@ -9,12 +10,18 @@ require "ai4r/clusterers/bisecting_k_means"
 require "ai4r/clusterers/single_linkage"
 require "ai4r/clusterers/complete_linkage"
 require "ai4r/clusterers/average_linkage"
+require "ai4r/clusterers/weighted_average_linkage"
+require "ai4r/clusterers/centroid_linkage"
+require "ai4r/clusterers/median_linkage"
+require "ai4r/clusterers/ward_linkage"
+require "ai4r/clusterers/diana"
 # Classifiers
 require "ai4r/classifiers/classifier"
 require "ai4r/classifiers/id3"
 require "ai4r/classifiers/prism"
 require "ai4r/classifiers/one_r"
 require "ai4r/classifiers/zero_r"
+require "ai4r/classifiers/hyperpipes"
 # Neural networks
 require "ai4r/neural_network/backpropagation"
 # Genetic Algorithms

data/lib/ai4r/classifiers/hyperpipes.rb ADDED

@@ -0,0 +1,118 @@
+# Author::    Sergio Fierens (Implementation only)
+# License::   MPL 1.1
+# Project::   ai4r
+# Url::       http://ai4r.rubyforge.org/
+#
+# You can redistribute it and/or modify it under the terms of
+# the Mozilla Public License version 1.1  as published by the
+# Mozilla Foundation at http://www.mozilla.org/MPL/MPL-1.1.txt
+require 'set'
+require File.dirname(__FILE__) + '/../data/constants'
+require File.dirname(__FILE__) + '/../data/data_set'
+require File.dirname(__FILE__) + '/../classifiers/classifier'
+module Ai4r
+  module Classifiers
+    include Ai4r::Data
+    # = Introduction
+    #
+    # A fast classifier algorithm, created by Lucio de Souza Coelho
+    # and Len Trigg.
+    class Hyperpipes < Classifier
+      attr_reader :data_set, :pipes
+      # Build a new Hyperpipes classifier. You must provide a DataSet instance
+      # as parameter. The last attribute of each item is considered as
+      # the item class.
+      def build(data_set)
+        @data_set = data_set
+        @domains = data_set.build_domains
+        @pipes = {}
+        @domains.last.each {|cat| @pipes[cat] = build_pipe(@domains)}
+        @data_set.data_item.each {|item| update_pipe(@pipes[item.last], item) }
+        return self
+      end
+      # You can evaluate new data, predicting its class.
+      # e.g.
+      #   classifier.eval(['New York',  '<30', 'F'])  # => 'Y'
+      def eval(data)
+        votes = Hash.new {0}
+        @pipes.each do |category, pipe|
+          pipe.each_with_index do |bounds, i|
+            if data[i].is_a? Numeric
+              votes[category]+=1 if data[i]>bounds[:min] && data[i]<bounds[:max]
+            else
+              votes[category]+=1 if bounds[data[i]]
+            end
+          end
+        end
+        return votes.to_a.max {|x, y| x.last <=> y.last}.first
+      end
+      # This method returns the generated rules in ruby code.
+      # e.g.
+      #
+      #   classifier.get_rules
+      #     # =>  if age_range == '<30' then marketing_target = 'Y'
+      #           elsif age_range == '[30-50)' then marketing_target = 'N'
+      #           elsif age_range == '[50-80]' then marketing_target = 'N'
+      #           end
+      #
+      # It is a nice way to inspect induction results, and also to execute them:
+      #     marketing_target = nil
+      #     eval classifier.get_rules
+      #     puts marketing_target
+      #       # =>  'Y'
+      def get_rules
+        rules = []
+        rules << "votes = Hash.new {0}"
+        data = @data_set.data_items.first
+        labels = @data_set.data_labels.collect {|l| l.to_s}
+        @pipes.each do |category, pipe|
+          pipe.each_with_index do |bounds, i|
+            rule = "votes['#{category}'] += 1 "
+            if data[i].is_a? Numeric
+              rule += "if #{labels[i]} > #{bounds[:min]} && #{labels[i]} < #{bounds[:max]}"
+            else
+              rule += "if #{bounds.inspect}['#{labels[i]}']"
+            end
+            rules << rule
+          end
+        end
+        rules << "votes.to_a.max {|x, y| x.last <=> y.last}.first"
+        return rules.join('\n')
+      end
+      protected
+      def build_pipe(data_set)
+        data_set.data_items.first[0...-1].collect do |att|
+          if att.is_a? Numeric
+            {:min=>POSITIVE_INFINITY, :max=>NEGATIVE_INFINITY}
+          else
+            Hash.new(false)
+          end
+        end
+      end
+      def update_pipe(pipe, data_item)
+        data_item[0...-1].each_with_index do |att, i|
+          if att.first.is_a? Numeric
+            pipe[i][:min] = att if att < pipe[i][:min]
+            pipe[i][:max] = att if att > pipe[i][:max]
+          else
+            pipe[i][att] = true
+          end
+        end
+      end
+    end
+  end
+end

data/lib/ai4r/clusterers/average_linkage.rb CHANGED

@@ -13,14 +13,25 @@ require File.dirname(__FILE__) + '/../clusterers/single_linkage'
 module Ai4r
   module Clusterers
-    # Implementation of a Hierarchical clusterer with complete linkage.
+    # Implementation of a Hierarchical clusterer with group average
+    # linkage, AKA unweighted pair group method average or UPGMA (Everitt
+    # et al., 2001 ; Jain and Dubes, 1988 ; Sokal and Michener, 1958).
     # Hierarchical clusteres create one cluster per element, and then
     # progressively merge clusters, until the required number of clusters
     # is reached.
-    # With average linkage, the distance between two clusters is computed as
-    # the average distance between elements of each cluster.
+    # With average linkage, the distance between a cluster cx and the
+    # merged cluster (ci U cj) is the average of the distances between
+    # cx and ci, and cx and cj.
+    #
+    #   D(cx, (ci U cj)) = (D(cx, ci) + D(cx, cj)) / 2
     class AverageLinkage < SingleLinkage
+      parameters_info :distance_function =>
+          "Custom implementation of distance function. " +
+          "It must be a closure receiving two data items and return the " +
+          "distance bewteen them. By default, this algorithm uses " +
+          "ecuclidean distance of numeric attributes to the power of 2."
       # Build a new clusterer, using data examples found in data_set.
       # Items will be clustered in "number_of_clusters" different
       # clusters.
@@ -28,31 +39,19 @@ module Ai4r
         super
       end
-      # Classifies the given data item, returning the cluster index it belongs
-      # to (0-based).
+      # This algorithm does not allow classification of new data items
+      # once it has been built. Rebuild the clusterer including your data
+      # element. Always raises a RuntimeError.
       def eval(data_item)
-        super
+        raise "Eval of new data is not supported by this algorithm."
       end
       protected
-      # Calculate cluster distance using the average linkage method
-      def calc_index_clusters_distance(cluster_a, cluster_b)
-        dist_sum = 0.0
-        cluster_a.each do |index_a|
-          cluster_b.each do |index_b|
-            dist_sum += read_distance_matrix(index_a, index_b)
-            end
-        end
-        return dist_sum/(cluster_a.length*cluster_b.length)
-      end
-      def distance_between_item_and_cluster(data_item, cluster)
-        dist_sum = 0.0
-        cluster.data_items.each do |another_item|
-          dist_sum += distance(data_item, another_item)
-        end
-        return dist_sum/cluster.data_items.length
+      # Return the distance between cluster cx and the merged cluster
+      # (ci U cj): the mean of the stored distances cx-ci and cx-cj.
+      # Divides by 2.0 so that integer distances (e.g. from a custom
+      # distance function) are not truncated.
+      # NOTE(review): this mean gives ci and cj equal weight regardless of
+      # their sizes - confirm this is the intended average linkage variant.
+      def linkage_distance(cx, ci, cj)
+        (read_distance_matrix(cx, ci)+
+          read_distance_matrix(cx, cj))/2.0
       end
     end

data/lib/ai4r/clusterers/centroid_linkage.rb ADDED

@@ -0,0 +1,66 @@
+# Author::    Sergio Fierens (implementation)
+# License::   MPL 1.1
+# Project::   ai4r
+# Url::       http://ai4r.rubyforge.org/
+#
+# You can redistribute it and/or modify it under the terms of
+# the Mozilla Public License version 1.1  as published by the
+# Mozilla Foundation at http://www.mozilla.org/MPL/MPL-1.1.txt
+require File.dirname(__FILE__) + '/../data/data_set'
+require File.dirname(__FILE__) + '/../clusterers/single_linkage'
+module Ai4r
+  module Clusterers
+    # Implementation of an Agglomerative Hierarchical clusterer with
+    # centroid linkage algorithm, aka unweighted pair group method
+    # centroid (UPGMC) (Everitt et al., 2001 ; Jain and Dubes, 1988 ;
+    # Sokal and Michener, 1958 )
+    # Hierarchical clusteres create one cluster per element, and then
+    # progressively merge clusters, until the required number of clusters
+    # is reached.
+    # The distance between clusters is the squared euclidean distance
+    # between their centroids.
+    #
+    #   D(cx, (ci U cj)) = | mx - mij |^2
+    #   D(cx, (ci U cj)) =  (ni/(ni+nj))*D(cx, ci) +
+    #                       (nj/(ni+nj))*D(cx, cj) -
+    #                       (ni*nj/(ni+nj)^2)*D(ci, cj)
+    class CentroidLinkage < SingleLinkage
+    parameters_info :distance_function =>
+          "Custom implementation of distance function. " +
+          "It must be a closure receiving two data items and return the " +
+          "distance bewteen them. By default, this algorithm uses " +
+          "ecuclidean distance of numeric attributes to the power of 2."
+      # Build a new clusterer, using data examples found in data_set.
+      # Items will be clustered in "number_of_clusters" different
+      # clusters.
+      def build(data_set, number_of_clusters)
+        super
+      end
+      # This algorithms does not allow classification of new data items
+      # once it has been built. Rebuild the cluster including you data element.
+      def eval(data_item)
+        Raise "Eval of new data is not supported by this algorithm."
+      end
+      protected
+      # return distance between cluster cx and cluster (ci U cj),
+      # using centroid linkage
+      def linkage_distance(cx, ci, cj)
+        ni = @index_clusters[ci].length
+        nj = @index_clusters[cj].length
+        ( ni * read_distance_matrix(cx, ci) +
+          nj * read_distance_matrix(cx, cj) -
+         1.0 * ni * nj * read_distance_matrix(ci, cj) / (ni+nj)) / (ni+nj)
+      end
+    end
+  end
+end

data/lib/ai4r/clusterers/complete_linkage.rb CHANGED

@@ -13,14 +13,24 @@ require File.dirname(__FILE__) + '/../clusterers/single_linkage'
 module Ai4r
   module Clusterers
-    # Implementation of a Hierarchical clusterer with complete linkage.
+    # Implementation of a Hierarchical clusterer with complete linkage (Everitt
+    # et al., 2001 ; Jain and Dubes, 1988 ; Sorensen, 1948 ).
     # Hierarchical clusteres create one cluster per element, and then
     # progressively merge clusters, until the required number of clusters
     # is reached.
     # With complete linkage, the distance between two clusters is computed as
     # the maximum distance between elements of each cluster.
+    #
+    #   D(cx, (ci U cj)) = max(D(cx, ci), D(cx, cj))
     class CompleteLinkage < SingleLinkage
+      parameters_info :distance_function =>
+          "Custom implementation of distance function. " +
+          "It must be a closure receiving two data items and return the " +
+          "distance bewteen them. By default, this algorithm uses " +
+          "ecuclidean distance of numeric attributes to the power of 2."
       # Build a new clusterer, using data examples found in data_set.
       # Items will be clustered in "number_of_clusters" different
       # clusters.
@@ -36,22 +46,17 @@ module Ai4r
       protected
-      # Calculate cluster distance using the complete linkage method
-      def calc_index_clusters_distance(cluster_a, cluster_b)
-        max_dist = 0
-        cluster_a.each do |index_a|
-          cluster_b.each do |index_b|
-            dist = read_distance_matrix(index_a, index_b)
-            max_dist = dist if dist > max_dist
-          end
-        end
-        return max_dist
+      # Complete linkage: the distance from cluster cx to the merged
+      # cluster (ci U cj) is the larger of the two stored distances
+      # cx-ci and cx-cj.
+      def linkage_distance(cx, ci, cj)
+        distance_to_ci = read_distance_matrix(cx, ci)
+        distance_to_cj = read_distance_matrix(cx, cj)
+        [distance_to_ci, distance_to_cj].max
       end
       def distance_between_item_and_cluster(data_item, cluster)
         max_dist = 0
         cluster.data_items.each do |another_item|
-          dist = distance(data_item, another_item)
+          dist = @distance_function.call(data_item, another_item)
           max_dist = dist if dist > max_dist
         end
         return max_dist

data/lib/ai4r/clusterers/diana.rb ADDED

@@ -0,0 +1,139 @@
+# Author::    Sergio Fierens (implementation)
+# License::   MPL 1.1
+# Project::   ai4r
+# Url::       http://ai4r.rubyforge.org/
+#
+# You can redistribute it and/or modify it under the terms of
+# the Mozilla Public License version 1.1  as published by the
+# Mozilla Foundation at http://www.mozilla.org/MPL/MPL-1.1.txt
+require File.dirname(__FILE__) + '/../data/data_set'
+require File.dirname(__FILE__) + '/../data/proximity'
+require File.dirname(__FILE__) + '/../clusterers/clusterer'
+module Ai4r
+  module Clusterers
+    # DIANA (Divisive ANAlysis) (Kaufman and Rousseeuw, 1990;
+    # Macnaughton - Smith et al. 1964) is a Divisive Hierarchical
+    # Clusterer. It begins with only one cluster with all data items,
+    # and divides the clusters until the desired clusters number is reached.
+    class Diana < Clusterer
+      attr_reader :data_set, :number_of_clusters, :clusters
+      parameters_info :distance_function =>
+          "Custom implementation of distance function. " +
+          "It must be a closure receiving two data items and return the " +
+          "distance bewteen them. By default, this algorithm uses " +
+          "ecuclidean distance of numeric attributes to the power of 2."
+      def initialize
+        @distance_function = lambda do |a,b|
+            Ai4r::Data::Proximity.squared_euclidean_distance(
+              a.select {|att_a| att_a.is_a? Numeric} ,
+              b.select {|att_b| att_b.is_a? Numeric})
+          end
+      end
+      # Build a new clusterer, using divisive analysis (DIANA algorithm)
+      def build(data_set, number_of_clusters)
+        @data_set = data_set
+        @number_of_clusters = number_of_clusters
+        @clusters = [@data_set[0..-1]]
+        while(@clusters.length < @number_of_clusters)
+          cluster_index_to_split = max_diameter_cluster(@clusters)
+          cluster_to_split = @clusters[cluster_index_to_split]
+          splinter_cluster = init_splinter_cluster(cluster_to_split)
+          while true
+            dist_diff, index = max_distance_difference(cluster_to_split, splinter_cluster)
+            break if dist_diff < 0
+            splinter_cluster << cluster_to_split.data_items[index]
+            cluster_to_split.data_items.delete_at(index)
+          end
+          @clusters << splinter_cluster
+        end
+        return self
+      end
+      # Classifies the given data item, returning the cluster index it belongs
+      # to (0-based).
+      def eval(data_item)
+        get_min_index(@clusters.collect do |cluster|
+          distance_sum(data_item, cluster) / cluster.data_items.length
+          end)
+      end
+      protected
+      # return the cluster with max diameter
+      def max_diameter_cluster(clusters)
+        max_index = 0
+        max_diameter = 0
+        clusters.each_with_index do |cluster, index|
+          diameter = cluster_diameter(cluster)
+          if diameter > max_diameter
+            max_index = index
+            max_diameter = diameter
+          end
+        end
+        return max_index
+      end
+      # Max distance between 2 items in a cluster
+      def cluster_diameter(cluster)
+        diameter = 0
+        cluster.data_items.each_with_index do |item_a, item_a_pos|
+          item_a_pos.times do |item_b_pos|
+            d = @distance_function.call(item_a, cluster.data_items[item_b_pos])
+            diameter = d if d > diameter
+          end
+        end
+        return diameter
+      end
+      # Create a cluster with the item with mx distance
+      # to the rest of the cluster's items.
+      # That item is removed from the initial cluster.
+      def init_splinter_cluster(cluster_to_split)
+        max = 0.0
+        max_index = 0
+        cluster_to_split.data_items.each_with_index do |item, index|
+          sum = distance_sum(item, cluster_to_split)
+          max, max_index = sum, index if sum > max
+        end
+        splinter_cluster = cluster_to_split[max_index]
+        cluster_to_split.data_items.delete_at(max_index)
+        return splinter_cluster
+      end
+      # Return the max average distance between any item of
+      # cluster_to_split and the rest of items in that cluster,
+      # minus the average distance with the items of splinter_cluster,
+      # and the index of the item.
+      # A positive value means that the items is closer to the
+      # splinter group than to its current cluster.
+      def max_distance_difference(cluster_to_split, splinter_cluster)
+        max_diff = -1.0/0
+        max_diff_index = 0
+        cluster_to_split.data_items.each_with_index do |item, index|
+          dist_a = distance_sum(item, cluster_to_split) / (cluster_to_split.data_items.length-1)
+          dist_b = distance_sum(item, splinter_cluster) / (splinter_cluster.data_items.length)
+          dist_diff = dist_a - dist_b
+          max_diff, max_diff_index = dist_diff, index if dist_diff > max_diff
+        end
+        return max_diff, max_diff_index
+      end
+      # Sum up the distance between an item and all the items in a cluster
+      def distance_sum(item_a, cluster)
+        cluster.data_items.inject(0.0) do |sum, item_b|
+          sum + @distance_function.call(item_a, item_b)
+        end
+      end
+    end
+  end
+end