RubyGems - ai4r - Versions diffs - 1.2 → 1.3 - Mend

ai4r 1.2 → 1.3

Files changed (139) hide show

data/README.rdoc +12 -25
data/examples/decision_trees/id3_example.rb +6 -9
data/examples/decision_trees/results.txt +2 -0
data/examples/genetic_algorithm/genetic_algorithm_example.rb +11 -13
data/examples/neural_network/xor_example.rb +25 -0
data/lib/ai4r.rb +10 -0
data/lib/ai4r/classifiers/classifier.rb +46 -0
data/lib/ai4r/classifiers/id3.rb +27 -58
data/lib/ai4r/classifiers/one_r.rb +19 -58
data/lib/ai4r/classifiers/prism.rb +21 -57
data/lib/ai4r/classifiers/zero_r.rb +16 -48
data/lib/ai4r/clusterers/bisecting_k_means.rb +115 -0
data/lib/ai4r/clusterers/clusterer.rb +55 -0
data/lib/ai4r/clusterers/k_means.rb +164 -0
data/lib/ai4r/data/data_set.rb +250 -0
data/lib/ai4r/genetic_algorithm/genetic_algorithm.rb +19 -19
data/lib/ai4r/neural_network/backpropagation.rb +23 -24
data/site/build/site/en/broken-links.xml +2 -0
data/site/build/site/en/downloads.html +200 -0
data/site/build/site/en/downloads.pdf +151 -0
data/site/build/site/en/forum.html +197 -0
data/site/build/site/en/forum.pdf +151 -0
data/site/build/site/en/geneticAlgorithms.html +591 -0
data/site/build/site/en/geneticAlgorithms.pdf +934 -0
data/site/build/site/en/images/ai4r-logo.png +0 -0
data/site/build/site/en/images/built-with-forrest-button.png +0 -0
data/site/build/site/en/images/c.png +0 -0
data/site/build/site/en/images/c_wbn.png +0 -0
data/site/build/site/en/images/c_wn.png +0 -0
data/site/build/site/en/images/ero.gif +0 -0
data/site/build/site/en/images/europe2.png +0 -0
data/site/build/site/en/images/europe3.png +0 -0
data/site/build/site/en/images/fitness.png +0 -0
data/site/build/site/en/images/genetic_algorithms_example.png +0 -0
data/site/build/site/en/images/instruction_arrow.png +0 -0
data/site/build/site/en/images/jadeferret.png +0 -0
data/site/build/site/en/images/my_email.png +0 -0
data/site/build/site/en/images/neural_network_example.png +0 -0
data/site/build/site/en/images/rubyforge.png +0 -0
data/site/build/site/en/images/s.png +0 -0
data/site/build/site/en/images/s_wbn.png +0 -0
data/site/build/site/en/images/s_wn.png +0 -0
data/site/build/site/en/images/sigmoid.png +0 -0
data/site/build/site/en/images/t.png +0 -0
data/site/build/site/en/images/t_wbn.png +0 -0
data/site/build/site/en/images/t_wn.png +0 -0
data/site/build/site/en/index.html +336 -0
data/site/build/site/en/index.pdf +508 -0
data/site/build/site/en/linkmap.html +263 -0
data/site/build/site/en/linkmap.pdf +94 -0
data/site/build/site/en/locationmap.xml +72 -0
data/site/build/site/en/machineLearning.html +339 -0
data/site/build/site/en/machineLearning.pdf +337 -0
data/site/build/site/en/neuralNetworks.html +484 -0
data/site/build/site/en/neuralNetworks.pdf +604 -0
data/site/build/site/en/skin/CommonMessages_de.xml +23 -0
data/site/build/site/en/skin/CommonMessages_en_US.xml +23 -0
data/site/build/site/en/skin/CommonMessages_es.xml +23 -0
data/site/build/site/en/skin/CommonMessages_fr.xml +23 -0
data/site/build/site/en/skin/basic.css +166 -0
data/site/build/site/en/skin/breadcrumbs-optimized.js +90 -0
data/site/build/site/en/skin/breadcrumbs.js +237 -0
data/site/build/site/en/skin/fontsize.js +166 -0
data/site/build/site/en/skin/getBlank.js +40 -0
data/site/build/site/en/skin/getMenu.js +45 -0
data/site/build/site/en/skin/images/README.txt +1 -0
data/site/build/site/en/skin/images/add.jpg +0 -0
data/site/build/site/en/skin/images/built-with-forrest-button.png +0 -0
data/site/build/site/en/skin/images/chapter.gif +0 -0
data/site/build/site/en/skin/images/chapter_open.gif +0 -0
data/site/build/site/en/skin/images/current.gif +0 -0
data/site/build/site/en/skin/images/error.png +0 -0
data/site/build/site/en/skin/images/external-link.gif +0 -0
data/site/build/site/en/skin/images/fix.jpg +0 -0
data/site/build/site/en/skin/images/forrest-credit-logo.png +0 -0
data/site/build/site/en/skin/images/hack.jpg +0 -0
data/site/build/site/en/skin/images/header_white_line.gif +0 -0
data/site/build/site/en/skin/images/info.png +0 -0
data/site/build/site/en/skin/images/instruction_arrow.png +0 -0
data/site/build/site/en/skin/images/label.gif +0 -0
data/site/build/site/en/skin/images/page.gif +0 -0
data/site/build/site/en/skin/images/pdfdoc.gif +0 -0
data/site/build/site/en/skin/images/poddoc.png +0 -0
data/site/build/site/en/skin/images/printer.gif +0 -0
data/site/build/site/en/skin/images/rc-b-l-15-1body-2menu-3menu.png +0 -0
data/site/build/site/en/skin/images/rc-b-r-15-1body-2menu-3menu.png +0 -0
data/site/build/site/en/skin/images/rc-b-r-5-1header-2tab-selected-3tab-selected.png +0 -0
data/site/build/site/en/skin/images/rc-t-l-5-1header-2searchbox-3searchbox.png +0 -0
data/site/build/site/en/skin/images/rc-t-l-5-1header-2tab-selected-3tab-selected.png +0 -0
data/site/build/site/en/skin/images/rc-t-l-5-1header-2tab-unselected-3tab-unselected.png +0 -0
data/site/build/site/en/skin/images/rc-t-r-15-1body-2menu-3menu.png +0 -0
data/site/build/site/en/skin/images/rc-t-r-5-1header-2searchbox-3searchbox.png +0 -0
data/site/build/site/en/skin/images/rc-t-r-5-1header-2tab-selected-3tab-selected.png +0 -0
data/site/build/site/en/skin/images/rc-t-r-5-1header-2tab-unselected-3tab-unselected.png +0 -0
data/site/build/site/en/skin/images/remove.jpg +0 -0
data/site/build/site/en/skin/images/rss.png +0 -0
data/site/build/site/en/skin/images/spacer.gif +0 -0
data/site/build/site/en/skin/images/success.png +0 -0
data/site/build/site/en/skin/images/txtdoc.png +0 -0
data/site/build/site/en/skin/images/update.jpg +0 -0
data/site/build/site/en/skin/images/valid-html401.png +0 -0
data/site/build/site/en/skin/images/vcss.png +0 -0
data/site/build/site/en/skin/images/warning.png +0 -0
data/site/build/site/en/skin/images/xmldoc.gif +0 -0
data/site/build/site/en/skin/menu.js +48 -0
data/site/build/site/en/skin/note.txt +50 -0
data/site/build/site/en/skin/print.css +54 -0
data/site/build/site/en/skin/profile.css +163 -0
data/site/build/site/en/skin/prototype.js +1257 -0
data/site/build/site/en/skin/screen.css +587 -0
data/site/build/site/en/svn.html +252 -0
data/site/build/site/en/svn.pdf +306 -0
data/site/build/site/en/wholesite.pdf +1915 -0
data/site/build/tmp/brokenlinks.xml +2 -0
data/site/build/tmp/cocoon-work/cache-dir/cocoon-ehcache-1.data +0 -0
data/site/build/tmp/cocoon-work/cache-dir/cocoon-ehcache-1.index +0 -0
data/site/build/tmp/locationmap.xml +14 -14
data/site/build/tmp/output.xmap +23 -23
data/site/build/tmp/pluginlist2fetchbuild.xml +144 -144
data/site/build/tmp/projfilters.properties +41 -41
data/site/build/webapp/WEB-INF/logs/core.log +593 -679
data/site/build/webapp/WEB-INF/logs/error.log +362 -279
data/site/build/webapp/WEB-INF/logs/sitemap.log +368 -1015
data/site/src/documentation/content/xdocs/index.xml +18 -10
data/site/src/documentation/content/xdocs/machineLearning.xml +4 -3
data/site/src/documentation/content/xdocs/site.xml +2 -1
data/site/src/documentation/resources/images/sigmoid.png +0 -0
data/test/classifiers/id3_test.rb +45 -44
data/test/classifiers/one_r_test.rb +19 -17
data/test/classifiers/prism_test.rb +22 -20
data/test/classifiers/zero_r_test.rb +15 -12
data/test/clusterers/bisecting_k_means_test.rb +59 -0
data/test/clusterers/k_means_test.rb +93 -0
data/test/data/data_set_test.rb +92 -0
metadata +252 -128
data/lib/ai4r/classifiers/classifier_helper.rb +0 -54
data/site/src/documentation/content/xdocs/forum.html +0 -9
data/site/src/documentation/resources/images/Thumbs.db +0 -0
data/site/src/documentation/resources/images/sub-dir/Thumbs.db +0 -0

data/lib/ai4r/classifiers/prism.rb CHANGED

@@ -11,7 +11,8 @@
 # J. Cendrowska (1987). PRISM: An algorithm for inducing modular rules.
 # International Journal of Man-Machine Studies. 27(4):349-370.
-require File.dirname(__FILE__) + '/classifier_helper'
+require File.dirname(__FILE__) + '/../data/data_set'
+require File.dirname(__FILE__) + '/../classifiers/classifier'
 module Ai4r
   module Classifiers
@@ -23,50 +24,17 @@ module Ai4r
     #
     # J. Cendrowska (1987). PRISM: An algorithm for inducing modular rules.
     # International Journal of Man-Machine Studies. 27(4):349-370.
-    class Prism
+    class Prism < Classifier
-      attr_accessor :data_labels, :rules
-      include ClassifierHelper
+      attr_reader :data_set, :rules
-      # Build a new Prism classifier. If your data is classified with N attributed
-      # and M examples, then your data examples must have the following format:
-      #
-      #     [   [ATT1_VAL1, ATT2_VAL1, ATT3_VAL1, ... , ATTN_VAL1,  CLASS_VAL1],
-      #         [ATT1_VAL2, ATT2_VAL2, ATT3_VAL2, ... , ATTN_VAL2,  CLASS_VAL2],
-      #         ...
-      #         [ATTM1_VALM, ATT2_VALM, ATT3_VALM, ... , ATTN_VALM, CLASS_VALM],
-      #     ]
-      #
-      # e.g.
-      #     [   ['New York',  '<30',      'M', 'Y'],
-      #          ['Chicago',     '<30',      'M', 'Y'],
-      #          ['Chicago',     '<30',      'F', 'Y'],
-      #          ['New York',  '<30',      'M', 'Y'],
-      #          ['New York',  '<30',      'M', 'Y'],
-      #          ['Chicago',     '[30-50)',  'M', 'Y'],
-      #          ['New York',  '[30-50)',  'F', 'N'],
-      #          ['Chicago',     '[30-50)',  'F', 'Y'],
-      #          ['New York',  '[30-50)',  'F', 'N'],
-      #          ['Chicago',     '[50-80]', 'M', 'N'],
-      #          ['New York',  '[50-80]', 'F', 'N'],
-      #          ['New York',  '[50-80]', 'M', 'N'],
-      #          ['Chicago',     '[50-80]', 'M', 'N'],
-      #          ['New York',  '[50-80]', 'F', 'N'],
-      #          ['Chicago',     '>80',      'F', 'Y']
-      #        ]
-      #
-      # Data labels must have the following format:
-      #     [ 'city', 'age_range', 'gender', 'marketing_target'  ]
-      #
-      # If you do not provide labels for you data, the following labels will
-      # be created by default:
-      #     [ 'attribute_1', 'attribute_2', 'attribute_3', 'class_value'  ]
-      #
-      def build(data_examples, data_labels=nil)
-        check_data_examples(data_examples)
-        @data_labels = (data_labels) ? data_labels : default_data_labels(data_examples)
-        domains = build_domains(data_examples)
-        instances = data_examples.collect {|data| data }
+      # Build a new Prism classifier. You must provide a DataSet instance
+      # as parameter.
+      def build(data_set)
+        data_set.check_not_empty
+        @data_set = data_set
+        domains = @data_set.build_domains
+        instances = @data_set.data_items.collect {|data| data }
         @rules = []
         domains.last.each do |class_value|
           while(has_class_value(instances, class_value))
@@ -91,7 +59,7 @@ module Ai4r
       # This method returns the generated rules in ruby code.
       # e.g.
       #
-      #   classifier.to_s
+      #   classifier.get_rules
       #     # => if age_range == '<30' then marketing_target = 'Y'
       #    elsif age_range == '>80' then marketing_target = 'Y'
       #    elsif city == 'Chicago' and age_range == '[30-50)' then marketing_target = 'Y'
@@ -101,10 +69,10 @@ module Ai4r
       # It is a nice way to inspect induction results, and also to execute them:
       #        age_range = '[30-50)'
       #        city = 'New York'
-      #        eval(classifier.to_s)
+      #        eval(classifier.get_rules)
       #        puts marketing_target
       #         'Y'
-      def to_s
+      def get_rules
         out = "if #{join_terms(@rules.first)} then #{then_clause(@rules.first)}"
         @rules[1...-1].each do |rule|
           out += "\nelsif #{join_terms(rule)} then #{then_clause(rule)}"
@@ -116,6 +84,10 @@ module Ai4r
       protected
+      def get_attr_value(data, attr)
+        data[@data_set.get_index(attr)]
+      end
       def has_class_value(instances, class_value)
         instances.each { |data| return true if data.last == class_value}
         return false
@@ -131,23 +103,15 @@ module Ai4r
       def matches_conditions(data, conditions)
         conditions.each_pair do |attr_label, attr_value|
-          return false if data[get_attr_index(attr_label)] != attr_value
+          return false if get_attr_value(data, attr_label) != attr_value
         end
         return true
       end
-      def get_attr_index(attr_label)
-        return @data_labels.index(attr_label)
-      end
-      def get_attr_value(data, attr_label)
-        return data[get_attr_index(attr_label)]
-      end
       def build_rule(class_value, instances)
         rule = {:class_value => class_value, :conditions => {}}
         rule_instances = instances.collect {|data| data }
-        attributes = @data_labels[0...-1].collect {|label| label }
+        attributes = @data_set.data_labels[0...-1].collect {|label| label }
         until(is_perfect(instances, rule) || attributes.empty?)
           freq_table = build_freq_table(rule_instances, attributes, class_value)
           condition = get_condition(freq_table)
@@ -223,7 +187,7 @@ module Ai4r
       end
       def then_clause(rule)
-        "#{@data_labels.last} = '#{rule[:class_value]}'"
+        "#{@data_set.data_labels.last} = '#{rule[:class_value]}'"
       end
     end

data/lib/ai4r/classifiers/zero_r.rb CHANGED

@@ -7,10 +7,12 @@
 # the Mozilla Public License version 1.1  as published by the
 # Mozilla Foundation at http://www.mozilla.org/MPL/MPL-1.1.txt
-require File.dirname(__FILE__) + '/classifier_helper'
+require File.dirname(__FILE__) + '/../data/data_set.rb'
+require File.dirname(__FILE__) + '/../classifiers/classifier'
 module Ai4r
   module Classifiers
     # = Introduction
     #
     # The idea behind the ZeroR classifier is to identify the
@@ -18,53 +20,19 @@ module Ai4r
     # It always returns that value when evaluating an instance.
     # It is frequently used as a baseline for evaluating other machine learning
     # algorithms.
-    class ZeroR
+    class ZeroR < Classifier
-      attr_accessor :data_labels, :class_value
-      include ClassifierHelper
+      attr_reader :data_set, :class_value
-      # Build a new ZeroR classifier. If your data is classified with N attributed
-      # and M examples, then your data examples must have the following format:
-      #
-      #     [   [ATT1_VAL1, ATT2_VAL1, ATT3_VAL1, ... , ATTN_VAL1,  CLASS_VAL1],
-      #         [ATT1_VAL2, ATT2_VAL2, ATT3_VAL2, ... , ATTN_VAL2,  CLASS_VAL2],
-      #         ...
-      #         [ATTM1_VALM, ATT2_VALM, ATT3_VALM, ... , ATTN_VALM, CLASS_VALM],
-      #     ]
-      #
-      # e.g.
-      #     [   ['New York',  '<30',      'M', 'Y'],
-      #          ['Chicago',     '<30',      'M', 'Y'],
-      #          ['Chicago',     '<30',      'F', 'Y'],
-      #          ['New York',  '<30',      'M', 'Y'],
-      #          ['New York',  '<30',      'M', 'Y'],
-      #          ['Chicago',     '[30-50)',  'M', 'Y'],
-      #          ['New York',  '[30-50)',  'F', 'N'],
-      #          ['Chicago',     '[30-50)',  'F', 'Y'],
-      #          ['New York',  '[30-50)',  'F', 'N'],
-      #          ['Chicago',     '[50-80]', 'M', 'N'],
-      #          ['New York',  '[50-80]', 'F', 'N'],
-      #          ['New York',  '[50-80]', 'M', 'N'],
-      #          ['Chicago',     '[50-80]', 'M', 'N'],
-      #          ['New York',  '[50-80]', 'F', 'N'],
-      #          ['Chicago',     '>80',      'F', 'Y']
-      #        ]
-      #
-      # Data labels must have the following format:
-      #     [ 'city', 'age_range', 'gender', 'marketing_target'  ]
-      #
-      # If you do not provide labels for you data, the following labels will
-      # be created by default:
-      #     [ 'attribute_1', 'attribute_2', 'attribute_3', 'class_value'  ]
-      #
-      def build(data_examples, data_labels=nil)
-        check_data_examples(data_examples)
-        @data_labels = (data_labels) ? data_labels : default_data_labels(data_examples)
+      # Build a new ZeroR classifier. You must provide a DataSet instance
+      # as parameter.
+      def build(data_set)
+        data_set.check_not_empty
+        @data_set = data_set
         frequence = {}
         max_freq = 0
-        @class_value
-        data_examples.each do |example|
+        @class_value = nil
+        @data_set.data_items.each do |example|
           class_value = example.last
           class_frequency = frequence[class_value]
           class_frequency = (class_frequency) ? class_frequency+1 : 1
@@ -86,16 +54,16 @@ module Ai4r
       # This method returns the generated rules in ruby code.
       # e.g.
       #
-      #   classifier.to_s
+      #   classifier.get_rules
       #     # =>  marketing_target='Y'
       #
       # It is a nice way to inspect induction results, and also to execute them:
       #     marketing_target = nil
-      #     eval classifier.to_s
+      #     eval classifier.get_rules
       #     puts marketing_target
       #       # =>  'Y'
-      def to_s
-        return "#{@data_labels.last} = '#{@class_value}'"
+      def get_rules
+        return "#{@data_set.data_labels.last} = '#{@class_value}'"
       end
     end

data/lib/ai4r/clusterers/bisecting_k_means.rb ADDED

@@ -0,0 +1,115 @@
+# Author::    Sergio Fierens (implementation)
+# License::   MPL 1.1
+# Project::   ai4r
+# Url::       http://ai4r.rubyforge.org/
+#
+# You can redistribute it and/or modify it under the terms of
+# the Mozilla Public License version 1.1  as published by the
+# Mozilla Foundation at http://www.mozilla.org/MPL/MPL-1.1.txt
+require "set"
+require File.dirname(__FILE__) + '/../data/data_set'
+require File.dirname(__FILE__) + '/../clusterers/k_means'
+module Ai4r
+  module Clusterers
+    # The Bisecting k-means algorithm is a variation of the "k-means" algorithm,
+    # somewhat less sensitive to the initial choice of centroids than the
+    # original.
+    #
+    # More about K Means algorithm:
+    # http://en.wikipedia.org/wiki/K-means_algorithm
+    class BisectingKMeans < KMeans
+      attr_reader :data_set, :number_of_clusters, :clusters, :centroids
+      attr_accessor :max_iterations, :distance_function, :refine
+      def intialize
+        @refine = true
+      end
+      # Build a new clusterer, using data examples found in data_set.
+      # Items will be clustered in "number_of_clusters" different
+      # clusters.
+      def build(data_set, number_of_clusters)
+        @data_set = data_set
+        @number_of_clusters = number_of_clusters
+        @clusters = [@data_set]
+        @centroids = [@data_set.get_mean_or_mode]
+        while @clusters.length < @number_of_clusters
+          biggest_cluster_index = find_biggest_cluster_index(@clusters)
+          clusterer = KMeans.new.
+            set_parameters(get_parameters).
+            build(@clusters[biggest_cluster_index], 2)
+          @clusters.delete_at(biggest_cluster_index)
+          @centroids.delete_at(biggest_cluster_index)
+          @clusters.concat(clusterer.clusters)
+          @centroids.concat(clusterer.centroids)
+        end
+        super if @refine
+        return self
+      end
+      # Get info on what can be parameterized on this clusterer algorithm.
+      # It returns a hash with the following format:
+      # { :param_name => "Info on the parameter" }
+      def get_parameters_info
+        { :max_iterations => "Maximum number of iterations used to bisect a " +
+          "cluster. By default it is uncapped.",
+          :distance_function => "Custom implementation of distance function. " +
+            "It must be a closure receiving two data items and return the " +
+            "distance bewteen them. By default, this algorithm uses " +
+            "ecuclidean distance of numeric attributes to the power of 2.",
+          :refine => "Boolean value. True by default. It will run the " +
+            "classic K Means algorithm, using as initial centroids the " +
+            "result of the bisecting approach."
+          }
+      end
+      # Set parameters on this clusterer instance.
+      # You must provide a hash with the folowing format:
+      # { :param_name => parameter_value }
+      #
+      # Use get_parameters_info to know what parameters are accepted.
+      def set_parameters(parameters)
+        super
+        if parameters.has_key?(:refine)
+          @refine = parameters[:refine]
+        end
+        return self
+      end
+      # Get parameter values on this clusterer instance.
+      # Returns a hash with the folowing format:
+      # { :param_name => parameter_value }
+      def get_parameters
+        params = super
+        params[:refine] = @refine
+        return params
+      end
+      protected
+      def calc_initial_centroids
+        @centroids # Use existing centroids
+      end
+      def find_biggest_cluster_index(clusters)
+        max_index = 0
+        max_length = 0
+        clusters.each_index do |cluster_index|
+          cluster = clusters[cluster_index]
+          if max_length < cluster.data_items.length
+            max_length = cluster.data_items.length
+            max_index = cluster_index
+          end
+        end
+        return max_index
+      end
+    end
+  end
+end

data/lib/ai4r/clusterers/clusterer.rb ADDED

@@ -0,0 +1,55 @@
+# Author::    Sergio Fierens
+# License::   MPL 1.1
+# Project::   ai4r
+# Url::       http://ai4r.rubyforge.org/
+#
+# You can redistribute it and/or modify it under the terms of
+# the Mozilla Public License version 1.1  as published by the
+# Mozilla Foundation at http://www.mozilla.org/MPL/MPL-1.1.txt
+module Ai4r
+  module Clusterers
+    # The purpose of this class is to define a common API for Clusterers.
+    # All methods in this class (other than eval) must be implemented in
+    # subclasses.
+    class Clusterer
+      # Builds the clusterer from the data examples found in data_set,
+      # grouping its data items into "number_of_clusters" different clusters.
+      # This base implementation always raises NotImplementedError.
+      def build(data_set, number_of_clusters)
+        raise NotImplementedError
+      end
+      # Classifies data_item, returning its cluster. Raises NotImplementedError here.
+      def eval(data_item)
+        raise NotImplementedError
+      end
+      # Get info on what can be parameterized on this clusterer, as a hash:
+      # { :param_name => "Info on the parameter" }
+      # This base implementation always raises NotImplementedError.
+      def get_parameters_info
+        raise NotImplementedError
+      end
+      # Set parameter values on this clusterer instance from a hash:
+      # { :param_name => parameter_value }
+      # This base implementation always raises NotImplementedError.
+      def set_parameters(parameters)
+        raise NotImplementedError
+      end
+      # Get parameter values on this clusterer instance, as a hash:
+      # { :param_name => parameter_value }
+      # This base implementation always raises NotImplementedError.
+      def get_parameters
+        raise NotImplementedError
+      end
+    end
+  end
+end

data/lib/ai4r/clusterers/k_means.rb ADDED

@@ -0,0 +1,164 @@
+# Author::    Sergio Fierens (implementation)
+# License::   MPL 1.1
+# Project::   ai4r
+# Url::       http://ai4r.rubyforge.org/
+#
+# You can redistribute it and/or modify it under the terms of
+# the Mozilla Public License version 1.1  as published by the
+# Mozilla Foundation at http://www.mozilla.org/MPL/MPL-1.1.txt
+require "set"
+require File.dirname(__FILE__) + '/../data/data_set'
+require File.dirname(__FILE__) + '/../clusterers/clusterer'
+module Ai4r
+  module Clusterers
+    # The k-means algorithm is an algorithm to cluster n objects
+    # based on attributes into k partitions, with k < n.
+    #
+    # More about K Means algorithm:
+    # http://en.wikipedia.org/wiki/K-means_algorithm
+    class KMeans < Clusterer
+      attr_reader :data_set, :number_of_clusters
+      attr_reader :clusters, :centroids, :iterations
+      attr_accessor :max_iterations
+      attr_accessor :distance_function
+      # Build a new clusterer, using data examples found in data_set.
+      # Items will be clustered in "number_of_clusters" different
+      # clusters.
+      def build(data_set, number_of_clusters)
+        @data_set = data_set
+        @number_of_clusters = number_of_clusters
+        @iterations = 0
+        calc_initial_centroids
+        while(not stop_criteria_met)
+          calculate_membership_clusters
+          recompute_centroids
+        end
+        return self
+      end
+      # Classifies the given data item, returning the cluster index it belongs
+      # to (0-based).
+      def eval(data_item)
+        get_min_index(@centroids.collect {|centroid|
+            distance(data_item, centroid)})
+      end
+      # Get info on what can be parameterized on this clusterer algorithm.
+      # It returns a hash with the following format:
+      # { :param_name => "Info on the parameter" }
+      def get_parameters_info
+        { :max_iterations => "Maximum number of iterations to build the " +
+          "clusterer. By default it is uncapped.",
+          :distance_function => "Custom implementation of distance function. " +
+            "It must be a closure receiving two data items and return the " +
+            "distance bewteen them. By default, this algorithm uses " +
+            "ecuclidean distance of numeric attributes to the power of 2."
+          }
+      end
+      # Set parameters on this clusterer instance.
+      # You must provide a hash with the folowing format:
+      # { :param_name => parameter_value }
+      #
+      # Use get_parameters_info to know what parameters are accepted.
+      def set_parameters(parameters)
+        if parameters.has_key?(:max_iterations)
+          @max_iterations = parameters[:max_iterations]
+        end
+        if parameters.has_key?(:distance_function)
+          @distance_function = parameters[:distance_function]
+        end
+        return self
+      end
+      # Get parameter values on this clusterer instance.
+      # Returns a hash with the folowing format:
+      # { :param_name => parameter_value }
+      def get_parameters
+        { :max_iterations => @max_iterations,
+          :distance_function => @distance_function }
+      end
+      # This function calculates the distance between 2 different
+      # instances. By default, it returns the euclidean distance to the
+      # power of 2.
+      # You can provide a more convinient distance implementation:
+      #
+      # 1- Overwriting this method
+      #
+      # 2- Providing a closure to the :distance_function parameter
+      def distance(a, b)
+        return @distance_function.call(a, b) if @distance_function
+        return euclidean_distance(a, b)
+      end
+      protected
+      def euclidean_distance(a, b)
+        dist = 0.0
+        a.each_index do |index|
+          if a[index].is_a?(Numeric) && b[index].is_a?(Numeric)
+            dist = dist + ((a[index]-b[index])*(a[index]-b[index]))
+          end
+        end
+        return dist
+      end
+      def calc_initial_centroids
+        @centroids = []
+        tried_indexes = []
+        while @centroids.length < @number_of_clusters &&
+            tried_indexes.length < @data_set.data_items.length
+          random_index = rand(@data_set.data_items.length)
+          if !tried_indexes.include?(random_index)
+            tried_indexes << random_index
+            if !@centroids.include? @data_set.data_items[random_index]
+              @centroids << @data_set.data_items[random_index]
+            end
+          end
+        end
+        @number_of_clusters = @centroids.length
+      end
+      def stop_criteria_met
+        @old_centroids == @centroids ||
+          (@max_iterations && (@max_iterations <= @iterations))
+      end
+      def calculate_membership_clusters
+        @clusters = Array.new(@number_of_clusters) do
+          Ai4r::Data::DataSet.new :data_labels => @data_set.data_labels
+        end
+        @data_set.data_items.each do |data_item|
+          @clusters[eval(data_item)] << data_item
+        end
+      end
+      def recompute_centroids
+        @old_centroids = @centroids
+        @centroids = @clusters.collect { |cluster| cluster.get_mean_or_mode }
+        @iterations += 1
+      end
+      def get_min_index(array)
+        min = array.first
+        index = 0
+        array.each_index do |i|
+          x = array[i]
+          if x < min
+            min = x
+            index = i
+          end
+        end
+        return index
+      end
+    end
+  end
+end