RubyGems - ai4r - Versions diffs - 1.12 → 1.13 - Mend

ai4r 1.12 → 1.13

Files changed (34) hide show

data/README.rdoc +7 -12
data/examples/classifiers/simple_linear_regression_example.csv +159 -0
data/examples/classifiers/simple_linear_regression_example.rb +15 -0
data/examples/clusterers/clusterer_example.rb +56 -0
data/examples/neural_network/backpropagation_example.rb +2 -1
data/lib/ai4r.rb +3 -1
data/lib/ai4r/classifiers/id3.rb +6 -2
data/lib/ai4r/classifiers/multilayer_perceptron.rb +1 -1
data/lib/ai4r/classifiers/naive_bayes.rb +24 -21
data/lib/ai4r/classifiers/simple_linear_regression.rb +118 -0
data/lib/ai4r/clusterers/average_linkage.rb +3 -3
data/lib/ai4r/clusterers/bisecting_k_means.rb +2 -2
data/lib/ai4r/clusterers/centroid_linkage.rb +3 -3
data/lib/ai4r/clusterers/clusterer.rb +0 -11
data/lib/ai4r/clusterers/complete_linkage.rb +3 -3
data/lib/ai4r/clusterers/diana.rb +2 -2
data/lib/ai4r/clusterers/k_means.rb +123 -21
data/lib/ai4r/clusterers/median_linkage.rb +3 -3
data/lib/ai4r/clusterers/single_linkage.rb +4 -4
data/lib/ai4r/clusterers/ward_linkage.rb +4 -4
data/lib/ai4r/clusterers/ward_linkage_hierarchical.rb +48 -0
data/lib/ai4r/clusterers/weighted_average_linkage.rb +3 -3
data/lib/ai4r/data/data_set.rb +12 -3
data/lib/ai4r/data/proximity.rb +22 -0
data/lib/ai4r/neural_network/backpropagation.rb +26 -15
data/test/classifiers/id3_test.rb +12 -0
data/test/classifiers/multilayer_perceptron_test.rb +1 -1
data/test/classifiers/naive_bayes_test.rb +18 -18
data/test/classifiers/simple_linear_regression_test.rb +37 -0
data/test/clusterers/k_means_test.rb +75 -8
data/test/clusterers/ward_linkage_hierarchical_test.rb +81 -0
data/test/data/data_set_test.rb +8 -0
data/test/data/proximity_test.rb +7 -1
metadata +96 -55

data/lib/ai4r/classifiers/simple_linear_regression.rb ADDED

@@ -0,0 +1,118 @@
+# Author::    Malav Bhavsar
+# License::   MPL 1.1
+# Project::   ai4r
+# Url::       http://ai4r.org/
+#
+# You can redistribute it and/or modify it under the terms of
+# the Mozilla Public License version 1.1  as published by the
+# Mozilla Foundation at http://www.mozilla.org/MPL/MPL-1.1.txt
+require File.dirname(__FILE__) + '/../data/data_set'
+require File.dirname(__FILE__) + '/classifier'
+module Ai4r
+  module Classifiers
+    # = Introduction
+    #
+    # This is an implementation of a Simple Linear Regression Classifier.
+    #
+    # For further details regarding Simple Linear Regression have a look at this link:
+    # http://en.wikipedia.org/wiki/Simple_linear_regression
+    #
+    #
+    # = How to use it
+    #
+    #   data = DataSet.new.parse_csv_with_labels "autoPrice.csv"
+    #   c = SimpleLinearRegression.new.
+    #     build data
+    #   c.eval([1,158,105.8,192.7,71.4,55.7,2844,136,3.19,3.4,8.5,110,5500,19,25])
+    #
+    class SimpleLinearRegression < Classifier
+      # attribute::       label of the predictor attribute chosen by #build (nil before training)
+      # attribute_index:: position of that attribute inside a data item
+      # slope::           fitted slope of the regression line
+      # intercept::       fitted intercept of the regression line
+      attr_reader :attribute, :attribute_index, :slope, :intercept
+      # Creates an untrained model: no attribute selected, slope and
+      # intercept both zero.
+      def initialize
+        @attribute = nil
+        @attribute_index = 0
+        @slope = 0
+        @intercept = 0
+      end
+      # Predicts the numeric target value for one data item using the line
+      # fitted by #build. Only the value at +attribute_index+ is read.
+      # e.g.
+      #   c.eval([1,158,105.8,192.7,71.4,55.7,2844,136,3.19,3.4,8.5,110,5500,19,25])
+      #     => 11876.96774193548
+      def eval(data)
+        @intercept + @slope * data[@attribute_index]
+      end
+      # Fits one least-squares line per candidate attribute and keeps the one
+      # with the smallest residual sum of squares.
+      #
+      # The last attribute of the data set is treated as the target (y);
+      # every other attribute is a candidate predictor (x).
+      #
+      # Parameter data has to be a non-empty instance of DataSet.
+      # Raises RuntimeError when data is not a DataSet, when it is empty, or
+      # when every candidate attribute is constant (no line can be fitted).
+      # Returns self so calls can be chained.
+      def build(data)
+        raise "Error instance must be passed" unless data.is_a?(DataSet)
+        raise "Data should not be empty" if data.data_items.length == 0
+        target_index = data.num_attributes - 1
+        # Computed once: the per-attribute means do not change while fitting.
+        means = data.get_mean_or_mode
+        y_mean = means[target_index]
+        min_msq = Float::MAX
+        chosen = nil
+        chosen_slope = nil
+        chosen_intercept = nil
+        data.data_labels.each do |attr_name|
+          attr_index = data.get_index attr_name
+          next if attr_index == target_index
+          x_mean = means[attr_index]
+          # Float accumulators so all-integer data never hits integer division.
+          sum_xy_diff = 0.0
+          sum_x_diff_squared = 0.0
+          sum_y_diff_squared = 0.0
+          data.data_items.each do |instance|
+            x_diff = instance[attr_index] - x_mean
+            # The y deviation must come from the target column, not from the
+            # predictor column being evaluated.
+            y_diff = instance[target_index] - y_mean
+            sum_xy_diff += x_diff * y_diff
+            sum_x_diff_squared += x_diff * x_diff
+            sum_y_diff_squared += y_diff * y_diff
+          end
+          # A constant attribute has no variance and cannot define a slope.
+          next if sum_x_diff_squared == 0
+          slope = sum_xy_diff / sum_x_diff_squared
+          intercept = y_mean - slope * x_mean
+          # Residual sum of squares of the fitted line: Syy - slope * Sxy.
+          msq = sum_y_diff_squared - slope * sum_xy_diff
+          next unless msq < min_msq
+          min_msq = msq
+          chosen = attr_index
+          chosen_slope = slope
+          chosen_intercept = intercept
+        end
+        raise "no useful attribute found" if chosen.nil?
+        @attribute = data.data_labels[chosen]
+        @attribute_index = chosen
+        @slope = chosen_slope
+        @intercept = chosen_intercept
+        self
+      end
+    end
+  end
+end

data/lib/ai4r/clusterers/average_linkage.rb CHANGED

@@ -16,7 +16,7 @@ module Ai4r
     # Implementation of a Hierarchical clusterer with group average
     # linkage, AKA unweighted pair group method average or UPGMA (Everitt
     # et al., 2001 ; Jain and Dubes, 1988 ; Sokal and Michener, 1958).
-    # Hierarchical clusteres create one cluster per element, and then
+    # Hierarchical clusterers create one cluster per element, and then
     # progressively merge clusters, until the required number of clusters
     # is reached.
     # With average linkage, the distance between a clusters cx and
@@ -29,8 +29,8 @@ module Ai4r
       parameters_info :distance_function =>
           "Custom implementation of distance function. " +
           "It must be a closure receiving two data items and return the " +
-          "distance bewteen them. By default, this algorithm uses " +
-          "ecuclidean distance of numeric attributes to the power of 2."
+          "distance between them. By default, this algorithm uses " +
+          "euclidean distance of numeric attributes to the power of 2."
       # Build a new clusterer, using data examples found in data_set.
       # Items will be clustered in "number_of_clusters" different

data/lib/ai4r/clusterers/bisecting_k_means.rb CHANGED

@@ -28,8 +28,8 @@ module Ai4r
         "build the clusterer. By default it is uncapped.",
         :distance_function => "Custom implementation of distance function. " +
           "It must be a closure receiving two data items and return the " +
-          "distance bewteen them. By default, this algorithm uses " +
-          "ecuclidean distance of numeric attributes to the power of 2.",
+          "distance between them. By default, this algorithm uses " +
+          "euclidean distance of numeric attributes to the power of 2.",
         :centroid_function => "Custom implementation to calculate the " +
           "centroid of a cluster. It must be a closure receiving an array of " +
           "data sets, and return an array of data items, representing the " +

data/lib/ai4r/clusterers/centroid_linkage.rb CHANGED

@@ -17,7 +17,7 @@ module Ai4r
     # centroid linkage algorithm, aka unweighted pair group method
     # centroid (UPGMC) (Everitt et al., 2001 ; Jain and Dubes, 1988 ;
     # Sokal and Michener, 1958 )
-    # Hierarchical clusteres create one cluster per element, and then
+    # Hierarchical clusterers create one cluster per element, and then
     # progressively merge clusters, until the required number of clusters
     # is reached.
     # The distance between clusters is the squared euclidean distance
@@ -32,8 +32,8 @@ module Ai4r
     parameters_info :distance_function =>
           "Custom implementation of distance function. " +
           "It must be a closure receiving two data items and return the " +
-          "distance bewteen them. By default, this algorithm uses " +
-          "ecuclidean distance of numeric attributes to the power of 2."
+          "distance between them. By default, this algorithm uses " +
+          "euclidean distance of numeric attributes to the power of 2."
       # Build a new clusterer, using data examples found in data_set.
       # Items will be clustered in "number_of_clusters" different

data/lib/ai4r/clusterers/clusterer.rb CHANGED

@@ -32,17 +32,6 @@ module Ai4r
       end
       protected
-      # Usefull as a defult distance function for clustering algorithms
-      def euclidean_distance(a, b)
-        dist = 0.0
-        a.each_index do |index|
-          if a[index].is_a?(Numeric) && b[index].is_a?(Numeric)
-            dist = dist + ((a[index]-b[index])*(a[index]-b[index]))
-          end
-        end
-        return dist
-      end
       def get_min_index(array)
         min = array.first
         index = 0

data/lib/ai4r/clusterers/complete_linkage.rb CHANGED

@@ -15,7 +15,7 @@ module Ai4r
     # Implementation of a Hierarchical clusterer with complete linkage (Everitt
     # et al., 2001 ; Jain and Dubes, 1988 ; Sorensen, 1948 ).
-    # Hierarchical clusteres create one cluster per element, and then
+    # Hierarchical clusterers create one cluster per element, and then
     # progressively merge clusters, until the required number of clusters
     # is reached.
     # With complete linkage, the distance between two clusters is computed as
@@ -27,8 +27,8 @@ module Ai4r
       parameters_info :distance_function =>
           "Custom implementation of distance function. " +
           "It must be a closure receiving two data items and return the " +
-          "distance bewteen them. By default, this algorithm uses " +
-          "ecuclidean distance of numeric attributes to the power of 2."
+          "distance between them. By default, this algorithm uses " +
+          "euclidean distance of numeric attributes to the power of 2."
       # Build a new clusterer, using data examples found in data_set.

data/lib/ai4r/clusterers/diana.rb CHANGED

@@ -25,8 +25,8 @@ module Ai4r
       parameters_info :distance_function =>
           "Custom implementation of distance function. " +
           "It must be a closure receiving two data items and return the " +
-          "distance bewteen them. By default, this algorithm uses " +
-          "ecuclidean distance of numeric attributes to the power of 2."
+          "distance between them. By default, this algorithm uses " +
+          "euclidean distance of numeric attributes to the power of 2."
       def initialize
         @distance_function = lambda do |a,b|

data/lib/ai4r/clusterers/k_means.rb CHANGED

@@ -8,6 +8,7 @@
 # Mozilla Foundation at http://www.mozilla.org/MPL/MPL-1.1.txt
 require File.dirname(__FILE__) + '/../data/data_set'
+require File.dirname(__FILE__) + '/../data/proximity'
 require File.dirname(__FILE__) + '/../clusterers/clusterer'
 module Ai4r
@@ -27,22 +28,31 @@ module Ai4r
         "build the clusterer. By default it is uncapped.",
         :distance_function => "Custom implementation of distance function. " +
           "It must be a closure receiving two data items and return the " +
-          "distance bewteen them. By default, this algorithm uses " +
-          "ecuclidean distance of numeric attributes to the power of 2.",
+          "distance between them. By default, this algorithm uses " +
+          "euclidean distance of numeric attributes to the power of 2.",
         :centroid_function => "Custom implementation to calculate the " +
           "centroid of a cluster. It must be a closure receiving an array of " +
           "data sets, and return an array of data items, representing the " +
           "centroids of for each data set. " +
           "By default, this algorithm returns a data items using the mode "+
-          "or mean of each attribute on each data set."
+          "or mean of each attribute on each data set.",
+        :centroid_indices => "Indices of data items (indexed from 0) to be " +
+          "the initial centroids.  Otherwise, the initial centroids will be " +
+          "assigned randomly from the data set.",
+        :on_empty => "Action to take if a cluster becomes empty, with values " +
+          "'eliminate' (the default action, eliminate the empty cluster), " +
+          "'terminate' (terminate with error), 'random' (relocate the " +
+          "empty cluster to a random point), 'outlier' (relocate the " +
+          "empty cluster to the point furthest from its centroid)."
       def initialize
         @distance_function = nil
         @max_iterations = nil
-        @old_centroids = nil
         @centroid_function = lambda do |data_sets|
           data_sets.collect{ |data_set| data_set.get_mean_or_mode}
         end
+        @centroid_indices = []
+        @on_empty = 'eliminate' # default if none specified
       end
@@ -52,6 +62,8 @@ module Ai4r
       def build(data_set, number_of_clusters)
         @data_set = data_set
         @number_of_clusters = number_of_clusters
+        raise ArgumentError, 'Length of centroid indices array differs from the specified number of clusters' unless @centroid_indices.empty? || @centroid_indices.length == @number_of_clusters
+        raise ArgumentError, 'Invalid value for on_empty' unless @on_empty == 'eliminate' || @on_empty == 'terminate' || @on_empty == 'random' || @on_empty == 'outlier'
         @iterations = 0
         calc_initial_centroids
@@ -73,32 +85,27 @@ module Ai4r
       # This function calculates the distance between 2 different
       # instances. By default, it returns the euclidean distance to the
       # power of 2.
-      # You can provide a more convinient distance implementation:
+      # You can provide a more convenient distance implementation:
       #
       # 1- Overwriting this method
       #
       # 2- Providing a closure to the :distance_function parameter
       def distance(a, b)
         return @distance_function.call(a, b) if @distance_function
-        return euclidean_distance(a, b)
+        return Ai4r::Data::Proximity.squared_euclidean_distance(
+                 a.select {|att_a| att_a.is_a? Numeric} ,
+                 b.select {|att_b| att_b.is_a? Numeric})
       end
       protected
       def calc_initial_centroids
-        @centroids = []
-        tried_indexes = []
-        while @centroids.length < @number_of_clusters &&
-            tried_indexes.length < @data_set.data_items.length
-          random_index = rand(@data_set.data_items.length)
-          if !tried_indexes.include?(random_index)
-            tried_indexes << random_index
-            if !@centroids.include? @data_set.data_items[random_index]
-              @centroids << @data_set.data_items[random_index]
-            end
-          end
+        @centroids, @old_centroids = [], nil
+        if @centroid_indices.empty?
+          populate_centroids('random')
+        else
+          populate_centroids('indices')
         end
-        @number_of_clusters = @centroids.length
       end
       def stop_criteria_met
@@ -110,9 +117,14 @@ module Ai4r
         @clusters = Array.new(@number_of_clusters) do
           Ai4r::Data::DataSet.new :data_labels => @data_set.data_labels
         end
-        @data_set.data_items.each do |data_item|
-          @clusters[eval(data_item)] << data_item
+        @cluster_indices = Array.new(@number_of_clusters) {[]}
+        @data_set.data_items.each_with_index do |data_item, data_index|
+          c = eval(data_item)
+          @clusters[c] << data_item
+          @cluster_indices[c] << data_index if @on_empty == 'outlier'
         end
+        manage_empty_clusters if has_empty_cluster?
       end
       def recompute_centroids
@@ -120,7 +132,97 @@ module Ai4r
         @iterations += 1
         @centroids = @centroid_function.call(@clusters)
       end
+      # Appends data items to @centroids according to populate_method, never
+      # adding the same data index or an equal data item twice, and finally
+      # sets @number_of_clusters to the resulting number of centroids.
+      #
+      # populate_method::    'random', 'indices' or 'outlier' (any other value
+      #                      adds nothing)
+      # number_of_clusters:: upper bound on @centroids.length for the 'random'
+      #                      and 'outlier' strategies; defaults to the current
+      #                      @number_of_clusters
+      #
+      # Raises ArgumentError in 'indices' mode when an entry of
+      # @centroid_indices is not an Integer within the data set bounds.
+      def populate_centroids(populate_method, number_of_clusters=@number_of_clusters)
+        # Data indexes already considered during this call.
+        tried_indexes = []
+        case populate_method
+        when 'random' # for initial assignment (without the :centroid_indices option) and for reassignment of empty cluster centroids (with :on_empty option 'random')
+          # Stop once enough centroids exist or every data index has been tried.
+          while @centroids.length < number_of_clusters &&
+              tried_indexes.length < @data_set.data_items.length
+            random_index = rand(@data_set.data_items.length)
+            if !tried_indexes.include?(random_index)
+              tried_indexes << random_index
+              if !@centroids.include? @data_set.data_items[random_index]
+                @centroids << @data_set.data_items[random_index]
+              end
+            end
+          end
+        when 'indices' # for initial assignment only (with the :centroid_indices option)
+          # number_of_clusters is not consulted here: every valid, distinct
+          # index in @centroid_indices contributes a centroid.
+          @centroid_indices.each do |index|
+            raise ArgumentError, "Invalid centroid index #{index}" unless (index.is_a? Integer) && index >=0 && index < @data_set.data_items.length
+            if !tried_indexes.include?(index)
+              tried_indexes << index
+              if !@centroids.include? @data_set.data_items[index]
+                @centroids << @data_set.data_items[index]
+              end
+            end
+          end
+        when 'outlier' # for reassignment of empty cluster centroids only (with :on_empty option 'outlier')
+          sorted_data_indices = sort_data_indices_by_dist_to_centroid
+          i = sorted_data_indices.length - 1 # the last item is the furthest from its centroid
+          # Walk from the furthest item towards the nearest one.
+          while @centroids.length < number_of_clusters &&
+              tried_indexes.length < @data_set.data_items.length
+            outlier_index = sorted_data_indices[i]
+            if !tried_indexes.include?(outlier_index)
+              tried_indexes << outlier_index
+              if !@centroids.include? @data_set.data_items[outlier_index]
+                @centroids << @data_set.data_items[outlier_index]
+              end
+            end
+            # Leave the loop once the nearest item (position 0) has been handled.
+            i > 0 ? i -= 1 : break
+          end
+        end
+        # Duplicates may have been skipped, so the final count can be lower
+        # than the number requested.
+        @number_of_clusters = @centroids.length
+      end
+       # Sort cluster points by distance to assigned centroid.  Utilizes @cluster_indices.
+       # Returns indices, sorted in order from the nearest to furthest.
+       def sort_data_indices_by_dist_to_centroid
+         sorted_data_indices = []
+         h = {}
+         @clusters.each_with_index do |cluster, c|
+           centroid = @centroids[c]
+           cluster.data_items.each_with_index do |data_item, i|
+             dist_to_centroid = distance(data_item, centroid)
+             data_index = @cluster_indices[c][i]
+             h[data_index] = dist_to_centroid
+           end
+         end
+         # sort hash of {index => dist to centroid} by dist to centroid (ascending) and then return an array of only the indices
+         sorted_data_indices = h.sort_by{|k,v| v}.collect{|a,b| a}
+       end
+      def has_empty_cluster?
+        found_empty = false
+        @number_of_clusters.times do |c|
+          found_empty = true if @clusters[c].data_items.empty?
+        end
+        found_empty
+      end
+      # Applies the configured on_empty policy to the current clustering.
+      #
+      # 'terminate':: returns immediately, leaving the empty clusters in place
+      # 'eliminate':: removes the empty clusters and returns
+      # other values:: removes the empty clusters, then calls
+      #                populate_centroids with the policy name and the cluster
+      #                count from before the removal, and recomputes cluster
+      #                membership
+      def manage_empty_clusters
+        return if self.on_empty == 'terminate' # Do nothing to terminate with error. (The empty cluster will be assigned a nil centroid, and then calculating the distance from this centroid to another point will raise an exception.)
+        # Remember the count before elimination shrinks @number_of_clusters.
+        initial_number_of_clusters = @number_of_clusters
+        eliminate_empty_clusters
+        return if self.on_empty == 'eliminate'
+        populate_centroids(self.on_empty, initial_number_of_clusters) # Add initial_number_of_clusters - @number_of_clusters
+        # Reassign every data item against the refreshed centroid list.
+        calculate_membership_clusters
+      end
+      def eliminate_empty_clusters
+        old_clusters, old_centroids, old_cluster_indices = @clusters, @centroids, @cluster_indices
+        @clusters, @centroids, @cluster_indices = [], [], []
+        @number_of_clusters.times do |i|
+          if !old_clusters[i].data_items.empty?
+            @clusters << old_clusters[i]
+            @cluster_indices << old_cluster_indices[i]
+            @centroids << old_centroids[i]
+          end
+        end
+        @number_of_clusters = @centroids.length
+      end
     end
   end
 end

data/lib/ai4r/clusterers/median_linkage.rb CHANGED

@@ -16,7 +16,7 @@ module Ai4r
     # Implementation of an Agglomerative Hierarchical clusterer with
     # median linkage algorithm, aka weighted pair group method centroid
     # or WPGMC (Everitt et al., 2001 ; Gower, 1967 ; Jain and Dubes, 1988 ).
-    # Hierarchical clusteres create one cluster per element, and then
+    # Hierarchical clusterer create one cluster per element, and then
     # progressively merge clusters, until the required number of clusters
     # is reached.
     # Similar to centroid linkages, but using fix weight:
@@ -29,8 +29,8 @@ module Ai4r
     parameters_info :distance_function =>
           "Custom implementation of distance function. " +
           "It must be a closure receiving two data items and return the " +
-          "distance bewteen them. By default, this algorithm uses " +
-          "ecuclidean distance of numeric attributes to the power of 2."
+          "distance between them. By default, this algorithm uses " +
+          "euclidean distance of numeric attributes to the power of 2."
       # Build a new clusterer, using data examples found in data_set.
       # Items will be clustered in "number_of_clusters" different