RubyGems - ai4r - Versions diffs - 1.4 → 1.5 - Mend

ai4r 1.4 → 1.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (62) hide show

data/README.rdoc +24 -3
data/examples/decision_trees/id3_example.rb +1 -1
data/examples/genetic_algorithm/genetic_algorithm_example.rb +1 -1
data/lib/ai4r.rb +11 -0
data/lib/ai4r/classifiers/classifier.rb +2 -0
data/lib/ai4r/classifiers/id3.rb +3 -2
data/lib/ai4r/classifiers/multilayer_perceptron.rb +135 -0
data/lib/ai4r/classifiers/one_r.rb +2 -1
data/lib/ai4r/classifiers/prism.rb +2 -1
data/lib/ai4r/classifiers/zero_r.rb +2 -1
data/lib/ai4r/clusterers/average_linkage.rb +60 -0
data/lib/ai4r/clusterers/bisecting_k_means.rb +17 -39
data/lib/ai4r/clusterers/clusterer.rb +25 -0
data/lib/ai4r/clusterers/complete_linkage.rb +62 -0
data/lib/ai4r/clusterers/k_means.rb +18 -25
data/lib/ai4r/clusterers/single_linkage.rb +179 -0
data/lib/ai4r/data/data_set.rb +33 -41
data/lib/ai4r/data/proximity.rb +82 -0
data/lib/ai4r/data/statistics.rb +77 -0
data/lib/ai4r/experiment/classifier_evaluator.rb +95 -0
data/lib/ai4r/genetic_algorithm/genetic_algorithm.rb +2 -4
data/site/build/site/en/build/tmp/build-info.xml +5 -0
data/site/build/site/en/build/tmp/plugins-1.xml +212 -0
data/site/build/site/en/build/tmp/plugins-2.xml +252 -0
data/site/build/site/en/build/tmp/projfilters.properties +41 -0
data/site/build/site/en/downloads.html +1 -1
data/site/build/site/en/geneticAlgorithms.html +1 -1
data/site/build/site/en/index.html +44 -7
data/site/build/site/en/index.pdf +278 -155
data/site/build/site/en/linkmap.html +2 -2
data/site/build/site/en/linkmap.pdf +12 -12
data/site/build/site/en/machineLearning.html +1 -1
data/site/build/site/en/neuralNetworks.html +1 -1
data/site/build/site/en/sourceCode.html +244 -0
data/site/build/site/en/sourceCode.pdf +278 -0
data/site/build/site/en/svn.html +34 -42
data/site/build/site/en/svn.pdf +86 -114
data/site/build/tmp/cocoon-work/cache-dir/cocoon-ehcache-1.data +0 -0
data/site/build/tmp/cocoon-work/cache-dir/cocoon-ehcache-1.index +0 -0
data/site/build/tmp/projfilters.properties +1 -1
data/site/build/webapp/WEB-INF/logs/core.log +628 -629
data/site/build/webapp/WEB-INF/logs/error.log +213 -213
data/site/src/documentation/content/xdocs/index.xml +20 -1
data/site/src/documentation/content/xdocs/site.xml +1 -1
data/site/src/documentation/content/xdocs/sourceCode.xml +43 -0
data/site/src/documentation/resources/images/sigmoid.png +0 -0
data/test/classifiers/id3_test.rb +0 -1
data/test/classifiers/multilayer_perceptron_test.rb +79 -0
data/test/classifiers/one_r_test.rb +0 -2
data/test/classifiers/prism_test.rb +0 -2
data/test/classifiers/zero_r_test.rb +0 -2
data/test/clusterers/average_linkage_test.rb +45 -0
data/test/clusterers/bisecting_k_means_test.rb +0 -2
data/test/clusterers/complete_linkage_test.rb +45 -0
data/test/clusterers/k_means_test.rb +0 -2
data/test/clusterers/single_linkage_test.rb +113 -0
data/test/data/data_set_test.rb +3 -15
data/test/data/proximity_test.rb +71 -0
data/test/data/statistics_test.rb +65 -0
data/test/experiment/classifier_evaluator_test.rb +76 -0
metadata +27 -6
data/site/src/documentation/content/xdocs/svn.xml +0 -41

data/lib/ai4r/clusterers/k_means.rb CHANGED

@@ -7,7 +7,6 @@
 # the Mozilla Public License version 1.1  as published by the
 # Mozilla Foundation at http://www.mozilla.org/MPL/MPL-1.1.txt
-require "set"
 require File.dirname(__FILE__) + '/../data/data_set'
 require File.dirname(__FILE__) + '/../clusterers/clusterer'
@@ -29,7 +28,23 @@ module Ai4r
         :distance_function => "Custom implementation of distance function. " +
           "It must be a closure receiving two data items and return the " +
           "distance bewteen them. By default, this algorithm uses " +
-          "ecuclidean distance of numeric attributes to the power of 2."
+          "ecuclidean distance of numeric attributes to the power of 2.",
+        :centroid_function => "Custom implementation to calculate the " +
+          "centroid of a cluster. It must be a closure receiving an array of " +
+          "data sets, and return an array of data items, representing the " +
+          "centroids of for each data set. " +
+          "By default, this algorithm returns a data items using the mode "+
+          "or mean of each attribute on each data set."
+      def initialize
+        @distance_function = nil
+        @max_iterations = nil
+        @old_centroids = nil
+        @centroid_function = lambda do |data_sets|
+          data_sets.collect{ |data_set| data_set.get_mean_or_mode}
+        end
+      end
       # Build a new clusterer, using data examples found in data_set.
       # Items will be clustered in "number_of_clusters" different
@@ -69,15 +84,6 @@ module Ai4r
       end
       protected
-      def euclidean_distance(a, b)
-        dist = 0.0
-        a.each_index do |index|
-          if a[index].is_a?(Numeric) && b[index].is_a?(Numeric)
-            dist = dist + ((a[index]-b[index])*(a[index]-b[index]))
-          end
-        end
-        return dist
-      end
       def calc_initial_centroids
         @centroids = []
@@ -111,21 +117,8 @@ module Ai4r
       def recompute_centroids
         @old_centroids = @centroids
-        @centroids = @clusters.collect { |cluster| cluster.get_mean_or_mode }
         @iterations += 1
-      end
-      def get_min_index(array)
-        min = array.first
-        index = 0
-        array.each_index do |i|
-          x = array[i]
-          if x < min
-            min = x
-            index = i
-          end
-        end
-        return index
+        @centroids = @centroid_function.call(@clusters)
       end
     end

data/lib/ai4r/clusterers/single_linkage.rb ADDED

@@ -0,0 +1,179 @@
+# Author::    Sergio Fierens (implementation)
+# License::   MPL 1.1
+# Project::   ai4r
+# Url::       http://ai4r.rubyforge.org/
+#
+# You can redistribute it and/or modify it under the terms of
+# the Mozilla Public License version 1.1  as published by the
+# Mozilla Foundation at http://www.mozilla.org/MPL/MPL-1.1.txt
+require File.dirname(__FILE__) + '/../data/data_set'
+require File.dirname(__FILE__) + '/../clusterers/clusterer'
+module Ai4r
+  module Clusterers
+    # Implementation of a Hierarchical clusterer with single linkage.
+    # Hierarchical clusterers create one cluster per element, and then
+    # progressively merge clusters, until the required number of clusters
+    # is reached.
+    # With single linkage, the distance between two clusters is computed as the
+    # distance between the two closest elements in the two clusters.
+    class SingleLinkage < Clusterer
+      attr_reader :data_set, :number_of_clusters, :clusters
+      parameters_info :distance_function =>
+          "Custom implementation of distance function. " +
+          "It must be a closure receiving two data items and return the " +
+          "distance bewteen them. By default, this algorithm uses " +
+          "ecuclidean distance of numeric attributes to the power of 2."
+      def initialize
+        @distance_function = nil
+      end
+      # Build a new clusterer, using data examples found in data_set.
+      # Items will be clustered in "number_of_clusters" different
+      # clusters.
+      def build(data_set, number_of_clusters)
+        @data_set = data_set
+        @number_of_clusters = number_of_clusters
+        index_clusters = create_initial_index_clusters
+        create_distance_matrix(data_set)
+        while index_clusters.length > @number_of_clusters
+          clusters_to_merge = get_closest_clusters(index_clusters)
+          index_clusters = merge_clusters(clusters_to_merge, index_clusters)
+        end
+        @clusters = build_clusters_from_index_clusters index_clusters
+        return self
+      end
+      # Classifies the given data item, returning the cluster index it belongs
+      # to (0-based).
+      def eval(data_item)
+        get_min_index(@clusters.collect {|cluster|
+            distance_between_item_and_cluster(data_item, cluster)})
+      end
+      # This function calculates the distance between 2 different
+      # instances. By default, it returns the euclidean distance to the
+      # power of 2.
+      # You can provide a more convenient distance implementation:
+      #
+      # 1- Overwriting this method
+      #
+      # 2- Providing a closure to the :distance_function parameter
+      def distance(a, b)
+        return @distance_function.call(a, b) if @distance_function
+        return euclidean_distance(a, b)
+      end
+      protected
+      # returns [ [0], [1], [2], ... , [n-1] ]
+      # where n is the number of data items in the data set
+      def create_initial_index_clusters
+        index_clusters = []
+        @data_set.data_items.length.times {|i| index_clusters << [i]}
+        return index_clusters
+      end
+      # Create a partial distance matrix:
+      #   [
+      #     [d(1,0)],
+      #     [d(2,0)], [d(2,1)],
+      #     [d(3,0)], [d(3,1)], [d(3,2)],
+      #     ...
+      #     [d(n-1,0)], [d(n-1,1)], [d(n-1,2)], ... , [d(n-1,n-2)]
+      #   ]
+      # where n is the number of data items in the data set
+      def create_distance_matrix(data_set)
+        @distance_matrix = Array.new(data_set.data_items.length-1) {|index| Array.new(index+1)}
+        data_set.data_items.each_with_index do |a, i|
+          i.times do |j|
+            b = data_set.data_items[j]
+            @distance_matrix[i-1][j] = distance(a, b)
+          end
+        end
+      end
+      # Returns the distance between element data_item[index_a] and
+      # data_item[index_b] using the distance matrix
+      def read_distance_matrix(index_a, index_b)
+        return 0 if index_a == index_b
+        index_a, index_b = index_b, index_a if index_b > index_a
+        return @distance_matrix[index_a-1][index_b]
+      end
+      # clusters_to_merge = [index_cluster_a, index_cluster_b].
+      # cluster_a and cluster_b are removed from index_cluster,
+      # and a new cluster with all members of cluster_a and cluster_b
+      # is added.
+      # It returns the new clusters array.
+      def merge_clusters(clusters_to_merge, index_clusters)
+        index_a = clusters_to_merge.first
+        index_b = clusters_to_merge.last
+        index_a, index_b = index_b, index_a if index_b > index_a
+        new_index_cluster = index_clusters[index_a] +
+          index_clusters[index_b]
+        index_clusters.delete_at index_a
+        index_clusters.delete_at index_b
+        index_clusters << new_index_cluster
+        return index_clusters
+      end
+      # Given an array with clusters of data_items indexes,
+      # it returns an array of data_items clusters
+      def build_clusters_from_index_clusters(index_clusters)
+        @distance_matrix = nil
+        return index_clusters.collect do |index_cluster|
+          Ai4r::Data::DataSet.new(:data_labels => @data_set.data_labels,
+            :data_items => index_cluster.collect {|i| @data_set.data_items[i]})
+        end
+      end
+      # Returns an array with the indexes of the two closest
+      # clusters => [index_cluster_a, index_cluster_b]
+      def get_closest_clusters(index_clusters)
+        min_distance = 1.0/0
+        closest_clusters = [1, 0]
+        index_clusters.each_with_index do |cluster_a, index_a|
+          index_a.times do |index_b|
+            cluster_b = index_clusters[index_b]
+            cluster_distance = calc_index_clusters_distance(cluster_a, cluster_b)
+            if cluster_distance < min_distance
+              closest_clusters = [index_a, index_b]
+              min_distance = cluster_distance
+            end
+          end
+        end
+        return closest_clusters
+      end
+      # Calculate cluster distance using the single linkage method
+      def calc_index_clusters_distance(cluster_a, cluster_b)
+        min_dist = 1.0/0
+        cluster_a.each do |index_a|
+          cluster_b.each do |index_b|
+            dist = read_distance_matrix(index_a, index_b)
+            min_dist = dist if dist < min_dist
+          end
+        end
+        return min_dist
+      end
+      def distance_between_item_and_cluster(data_item, cluster)
+        min_dist = 1.0/0
+        cluster.data_items.each do |another_item|
+          dist = distance(data_item, another_item)
+          min_dist = dist if dist < min_dist
+        end
+        return min_dist
+      end
+    end
+  end
+end

data/lib/ai4r/data/data_set.rb CHANGED

@@ -9,11 +9,19 @@
 require 'csv'
 require 'set'
+require File.dirname(__FILE__) + '/statistics'
 module Ai4r
   module Data
+    # A data set is a collection of N data items. Each data item is
+    # described by a set of attributes, represented as an array.
+    # Optionally, you can assign a label to the attributes, using
+    # the data_labels property.
     class DataSet
+      @@number_regex = /(((\b[0-9]+)?\.)?\b[0-9]+([eE][-+]?[0-9]+)?\b)/
       attr_reader :data_labels, :data_items
       # Create a new DataSet. By default, empty.
@@ -24,7 +32,7 @@ module Ai4r
       # If you provide data items, but no data labels, the data set will
       # use the default data label values (see set_data_labels)
       def initialize(options = {})
-        @data_labels = options[:data_labels] || []
+        @data_labels = []
         @data_items = options[:data_items] || []
         set_data_labels(options[:data_labels]) if options[:data_labels]
         set_data_items(options[:data_items]) if options[:data_items]
@@ -38,7 +46,7 @@ module Ai4r
       end
       # Load data items from csv file
-      def load_data_from_csv(filepath)
+      def load_csv(filepath)
         items = []
         CSV::Reader.parse(File.open(filepath, 'r')) do |row|
           items << row
@@ -47,12 +55,21 @@ module Ai4r
       end
       # Load data items from csv file. The first row is used as data labels.
-      def load_data_and_labels_from_csv(filepath)
-        load_data_from_csv(filepath)
+      def load_csv_with_labels(filepath)
+        load_csv(filepath)
         @data_labels = @data_items.shift
         return self
       end
+      # Same as load_csv, but it will try to convert cell contents as numbers.
+      def parse_csv(filepath)
+        items = []
+        CSV::Reader.parse(File.open(filepath, 'r')) do |row|
+          items << row.collect{|x| (x.match(@@number_regex)) ? x.to_f : x.data }
+        end
+        set_data_items(items)
+      end
       # Set data labels.
       # Data labels must have the following format:
       #     [ 'city', 'age_range', 'gender', 'marketing_target'  ]
@@ -144,7 +161,7 @@ module Ai4r
       #   get_index("gender")
       #   => 2
       def get_index(attr)
-        return (attr.is_a?(String)) ? @data_labels.index(attr) : attr
+        return (attr.is_a?(Fixnum) || attr.is_a?(Range)) ? attr : @data_labels.index(attr)
       end
       # Raise an exception if there is no data item.
@@ -168,44 +185,19 @@ module Ai4r
           @data_items << data_item
         end
       end
-      def get_attribute_mean(attribute)
-        index = get_index(attribute)
-        mean = 0.0
-        @data_items.each { |data_item| mean += data_item[index] }
-        mean /= @data_items.length
-        return mean
-      end
-      def get_attribute_mode(attribute)
-        index = get_index(attribute)
-        domain = build_domain(attribute)
-        count = {}
-        domain.each {|value| count[value]=0}
-        @data_items.each { |data_item| count[data_item[index]] += 1 }
-        max_count = 0
-        mode = nil
-        count.each_pair do |value, value_count|
-          if value_count > max_count
-            mode = value
-            max_count = value_count
-          end
-        end
-        return mode
-      end
-      def get_attribute_mean_or_mode(attribute)
-        index = get_index(attribute)
-        if @data_items.first[index].is_a?(Numeric)
-          return get_attribute_mean(attribute)
-        else
-          return get_attribute_mode(attribute)
-        end
-      end
+      # Returns an array with the mean value of numeric attributes, and
+      # the most frequent value of non numeric attributes
       def get_mean_or_mode
         mean = []
-        num_attributes.times {|i| mean[i] = get_attribute_mean_or_mode(i) }
+        num_attributes.times do |i|
+          mean[i] =
+            if @data_items.first[i].is_a?(Numeric)
+              Statistics.mean(self, i)
+            else
+              Statistics.mode(self, i)
+            end
+        end
         return mean
       end

data/lib/ai4r/data/proximity.rb ADDED

@@ -0,0 +1,82 @@
+# Author::    Sergio Fierens
+# License::   MPL 1.1
+# Project::   ai4r
+# Url::       http://ai4r.rubyforge.org/
+#
+# You can redistribute it and/or modify it under the terms of
+# the Mozilla Public License version 1.1  as published by the
+# Mozilla Foundation at http://www.mozilla.org/MPL/MPL-1.1.txt
+module Ai4r
+  module Data
+    # This module provides classical distance functions
+    module Proximity
+      # This is a computationally faster replacement for Euclidean distance.
+      # Parameters a and b are vectors with continuous attributes.
+      def self.squared_euclidean_distance(a, b)
+        sum = 0.0
+        a.each_with_index do |item_a, i|
+          item_b = b[i]
+          sum += (item_a - item_b)**2
+        end
+        return sum
+      end
+      # Euclidean distance, or L2 norm.
+      # Parameters a and b are vectors with continuous attributes.
+      # Euclidean distance tends to form hyperspherical
+      # clusters(Clustering, Xu and Wunsch, 2009).
+      # Translations and rotations do not cause a
+      # distortion in distance relation (Duda et al, 2001)
+      # If attributes are measured with different units,
+      # attributes with larger values and variance will
+      # dominate the metric.
+      def self.euclidean_distance(a, b)
+        Math.sqrt(squared_euclidean_distance(a, b))
+      end
+      # city block, Manhattan distance, or L1 norm.
+      # Parameters a and b are vectors with continuous attributes.
+      def self.manhattan_distance(a, b)
+        sum = 0.0
+        a.each_with_index do |item_a, i|
+          item_b = b[i]
+          sum += (item_a - item_b).abs
+        end
+        return sum
+      end
+      # Sup distance, or L-infinity norm.
+      # Parameters a and b are vectors with continuous attributes.
+      def self.sup_distance(a, b)
+        distance = 0.0
+        a.each_with_index do |item_a, i|
+          item_b = b[i]
+          diff = (item_a - item_b).abs
+          distance = diff if diff > distance
+        end
+        return distance
+      end
+      # The Hamming distance between two attribute vectors of equal
+      # length is the number of positions at which the corresponding
+      # attribute values differ.
+      # This distance function is frequently used with binary attributes,
+      # though it can be used with other discrete attributes.
+      def self.hamming_distance(a,b)
+        count = 0
+        a.each_index do |i|
+          count += 1 if a[i] != b[i]
+        end
+        return count
+      end
+    end
+  end
+end