RubyGems - ai4ruby - Versions diffs - 1.11 - Mend

ai4ruby 1.11

Files changed (79) hide show

data/README.rdoc +47 -0
data/examples/classifiers/id3_data.csv +121 -0
data/examples/classifiers/id3_example.rb +29 -0
data/examples/classifiers/naive_bayes_data.csv +11 -0
data/examples/classifiers/naive_bayes_example.rb +16 -0
data/examples/classifiers/results.txt +31 -0
data/examples/genetic_algorithm/genetic_algorithm_example.rb +37 -0
data/examples/genetic_algorithm/travel_cost.csv +16 -0
data/examples/neural_network/backpropagation_example.rb +67 -0
data/examples/neural_network/patterns_with_base_noise.rb +68 -0
data/examples/neural_network/patterns_with_noise.rb +66 -0
data/examples/neural_network/training_patterns.rb +68 -0
data/examples/neural_network/xor_example.rb +35 -0
data/examples/som/som_data.rb +156 -0
data/examples/som/som_multi_node_example.rb +22 -0
data/examples/som/som_single_example.rb +24 -0
data/lib/ai4r.rb +33 -0
data/lib/ai4r/classifiers/classifier.rb +62 -0
data/lib/ai4r/classifiers/hyperpipes.rb +118 -0
data/lib/ai4r/classifiers/ib1.rb +121 -0
data/lib/ai4r/classifiers/id3.rb +326 -0
data/lib/ai4r/classifiers/multilayer_perceptron.rb +135 -0
data/lib/ai4r/classifiers/naive_bayes.rb +259 -0
data/lib/ai4r/classifiers/one_r.rb +110 -0
data/lib/ai4r/classifiers/prism.rb +197 -0
data/lib/ai4r/classifiers/zero_r.rb +73 -0
data/lib/ai4r/clusterers/average_linkage.rb +59 -0
data/lib/ai4r/clusterers/bisecting_k_means.rb +93 -0
data/lib/ai4r/clusterers/centroid_linkage.rb +66 -0
data/lib/ai4r/clusterers/clusterer.rb +61 -0
data/lib/ai4r/clusterers/complete_linkage.rb +67 -0
data/lib/ai4r/clusterers/diana.rb +139 -0
data/lib/ai4r/clusterers/k_means.rb +126 -0
data/lib/ai4r/clusterers/median_linkage.rb +61 -0
data/lib/ai4r/clusterers/single_linkage.rb +194 -0
data/lib/ai4r/clusterers/ward_linkage.rb +64 -0
data/lib/ai4r/clusterers/ward_linkage_hierarchical.rb +31 -0
data/lib/ai4r/clusterers/weighted_average_linkage.rb +61 -0
data/lib/ai4r/data/data_set.rb +266 -0
data/lib/ai4r/data/parameterizable.rb +64 -0
data/lib/ai4r/data/proximity.rb +100 -0
data/lib/ai4r/data/statistics.rb +77 -0
data/lib/ai4r/experiment/classifier_evaluator.rb +95 -0
data/lib/ai4r/genetic_algorithm/genetic_algorithm.rb +270 -0
data/lib/ai4r/neural_network/backpropagation.rb +326 -0
data/lib/ai4r/neural_network/hopfield.rb +149 -0
data/lib/ai4r/som/layer.rb +68 -0
data/lib/ai4r/som/node.rb +96 -0
data/lib/ai4r/som/som.rb +155 -0
data/lib/ai4r/som/two_phase_layer.rb +90 -0
data/test/classifiers/hyperpipes_test.rb +84 -0
data/test/classifiers/ib1_test.rb +78 -0
data/test/classifiers/id3_test.rb +208 -0
data/test/classifiers/multilayer_perceptron_test.rb +79 -0
data/test/classifiers/naive_bayes_test.rb +43 -0
data/test/classifiers/one_r_test.rb +62 -0
data/test/classifiers/prism_test.rb +85 -0
data/test/classifiers/zero_r_test.rb +49 -0
data/test/clusterers/average_linkage_test.rb +51 -0
data/test/clusterers/bisecting_k_means_test.rb +66 -0
data/test/clusterers/centroid_linkage_test.rb +53 -0
data/test/clusterers/complete_linkage_test.rb +57 -0
data/test/clusterers/diana_test.rb +69 -0
data/test/clusterers/k_means_test.rb +100 -0
data/test/clusterers/median_linkage_test.rb +53 -0
data/test/clusterers/single_linkage_test.rb +122 -0
data/test/clusterers/ward_linkage_hierarchical_test.rb +61 -0
data/test/clusterers/ward_linkage_test.rb +53 -0
data/test/clusterers/weighted_average_linkage_test.rb +53 -0
data/test/data/data_set_test.rb +96 -0
data/test/data/proximity_test.rb +81 -0
data/test/data/statistics_test.rb +65 -0
data/test/experiment/classifier_evaluator_test.rb +76 -0
data/test/genetic_algorithm/chromosome_test.rb +58 -0
data/test/genetic_algorithm/genetic_algorithm_test.rb +81 -0
data/test/neural_network/backpropagation_test.rb +82 -0
data/test/neural_network/hopfield_test.rb +72 -0
data/test/som/som_test.rb +97 -0
metadata +168 -0

data/lib/ai4r/clusterers/clusterer.rb ADDED

@@ -0,0 +1,61 @@
+# Author::    Sergio Fierens
+# License::   MPL 1.1
+# Project::   ai4r
+# Url::       http://ai4r.rubyforge.org/
+#
+# You can redistribute it and/or modify it under the terms of
+# the Mozilla Public License version 1.1  as published by the
+# Mozilla Foundation at http://www.mozilla.org/MPL/MPL-1.1.txt
+require File.dirname(__FILE__) + '/../data/parameterizable'
+module Ai4r
+  module Clusterers
+    # The purpose of this class is to define a common API for Clusterers.
+    # All methods in this class (other than eval) must be implemented in
+    # subclasses.
+    class Clusterer
+      include Ai4r::Data::Parameterizable
+      # Build a new clusterer, using data examples found in data_set.
+      # Data items will be clustered in "number_of_clusters" different
+      # clusters.
+      def build(data_set, number_of_clusters)
+        raise NotImplementedError
+      end
+      # Classifies the given data item, returning the cluster it belongs to.
+      def eval(data_item)
+        raise NotImplementedError
+      end
+      protected
+      # Usefull as a defult distance function for clustering algorithms
+      def euclidean_distance(a, b)
+        dist = 0.0
+        a.each_index do |index|
+          if a[index].is_a?(Numeric) && b[index].is_a?(Numeric)
+            dist = dist + ((a[index]-b[index])*(a[index]-b[index]))
+          end
+        end
+        return dist
+      end
+      def get_min_index(array)
+        min = array.first
+        index = 0
+        array.each_index do |i|
+          x = array[i]
+          if x < min
+            min = x
+            index = i
+          end
+        end
+        return index
+      end
+    end
+  end
+end

data/lib/ai4r/clusterers/complete_linkage.rb ADDED

@@ -0,0 +1,67 @@
+# Author::    Sergio Fierens (implementation)
+# License::   MPL 1.1
+# Project::   ai4r
+# Url::       http://ai4r.rubyforge.org/
+#
+# You can redistribute it and/or modify it under the terms of
+# the Mozilla Public License version 1.1  as published by the
+# Mozilla Foundation at http://www.mozilla.org/MPL/MPL-1.1.txt
+require File.dirname(__FILE__) + '/../data/data_set'
+require File.dirname(__FILE__) + '/../clusterers/single_linkage'
+module Ai4r
+  module Clusterers
+    # Implementation of a Hierarchical clusterer with complete linkage (Everitt
+    # et al., 2001 ; Jain and Dubes, 1988 ; Sorensen, 1948 ).
+    # Hierarchical clusterers create one cluster per element, and then
+    # progressively merge clusters, until the required number of clusters
+    # is reached.
+    # With complete linkage, the distance between two clusters is computed as
+    # the maximum distance between elements of each cluster.
+    #
+    #   D(cx, (ci U cj)) = max(D(cx, ci), D(cx, cj))
+    class CompleteLinkage < SingleLinkage
+      parameters_info :distance_function =>
+          "Custom implementation of distance function. " +
+          "It must be a closure receiving two data items and return the " +
+          "distance bewteen them. By default, this algorithm uses " +
+          "ecuclidean distance of numeric attributes to the power of 2."
+      # Build a new clusterer, using data examples found in data_set.
+      # Items will be clustered in "number_of_clusters" different
+      # clusters.
+      def build(data_set, number_of_clusters)
+        super
+      end
+      # Classifies the given data item, returning the cluster index it belongs
+      # to (0-based).
+      def eval(data_item)
+        super
+      end
+      protected
+      # return distance between cluster cx and new cluster (ci U cj),
+      # using complete linkage
+      def linkage_distance(cx, ci, cj)
+        [read_distance_matrix(cx, ci),
+          read_distance_matrix(cx, cj)].max
+      end
+      def distance_between_item_and_cluster(data_item, cluster)
+        max_dist = 0
+        cluster.data_items.each do |another_item|
+          dist = @distance_function.call(data_item, another_item)
+          max_dist = dist if dist > max_dist
+        end
+        return max_dist
+      end
+    end
+  end
+end

data/lib/ai4r/clusterers/diana.rb ADDED

@@ -0,0 +1,139 @@
+# Author::    Sergio Fierens (implementation)
+# License::   MPL 1.1
+# Project::   ai4r
+# Url::       http://ai4r.rubyforge.org/
+#
+# You can redistribute it and/or modify it under the terms of
+# the Mozilla Public License version 1.1  as published by the
+# Mozilla Foundation at http://www.mozilla.org/MPL/MPL-1.1.txt
+require File.dirname(__FILE__) + '/../data/data_set'
+require File.dirname(__FILE__) + '/../data/proximity'
+require File.dirname(__FILE__) + '/../clusterers/clusterer'
+module Ai4r
+  module Clusterers
+    # DIANA (Divisive ANAlysis) (Kaufman and Rousseeuw, 1990;
+    # Macnaughton - Smith et al. 1964) is a Divisive Hierarchical
+    # Clusterer. It begins with only one cluster with all data items,
+    # and divides the clusters until the desired number of clusters is reached.
+    class Diana < Clusterer
+      attr_reader :data_set, :number_of_clusters, :clusters
+      parameters_info :distance_function =>
+          "Custom implementation of distance function. " +
+          "It must be a closure receiving two data items and return the " +
+          "distance bewteen them. By default, this algorithm uses " +
+          "ecuclidean distance of numeric attributes to the power of 2."
+      def initialize
+        @distance_function = lambda do |a,b|
+            Ai4r::Data::Proximity.squared_euclidean_distance(
+              a.select {|att_a| att_a.is_a? Numeric} ,
+              b.select {|att_b| att_b.is_a? Numeric})
+          end
+      end
+      # Build a new clusterer, using divisive analysis (DIANA algorithm)
+      def build(data_set, number_of_clusters)
+        @data_set = data_set
+        @number_of_clusters = number_of_clusters
+        @clusters = [@data_set[0..-1]]
+        while(@clusters.length < @number_of_clusters)
+          cluster_index_to_split = max_diameter_cluster(@clusters)
+          cluster_to_split = @clusters[cluster_index_to_split]
+          splinter_cluster = init_splinter_cluster(cluster_to_split)
+          while true
+            dist_diff, index = max_distance_difference(cluster_to_split, splinter_cluster)
+            break if dist_diff < 0
+            splinter_cluster << cluster_to_split.data_items[index]
+            cluster_to_split.data_items.delete_at(index)
+          end
+          @clusters << splinter_cluster
+        end
+        return self
+      end
+      # Classifies the given data item, returning the cluster index it belongs
+      # to (0-based).
+      def eval(data_item)
+        get_min_index(@clusters.collect do |cluster|
+          distance_sum(data_item, cluster) / cluster.data_items.length
+          end)
+      end
+      protected
+      # return the cluster with max diameter
+      def max_diameter_cluster(clusters)
+        max_index = 0
+        max_diameter = 0
+        clusters.each_with_index do |cluster, index|
+          diameter = cluster_diameter(cluster)
+          if diameter > max_diameter
+            max_index = index
+            max_diameter = diameter
+          end
+        end
+        return max_index
+      end
+      # Max distance between 2 items in a cluster
+      def cluster_diameter(cluster)
+        diameter = 0
+        cluster.data_items.each_with_index do |item_a, item_a_pos|
+          item_a_pos.times do |item_b_pos|
+            d = @distance_function.call(item_a, cluster.data_items[item_b_pos])
+            diameter = d if d > diameter
+          end
+        end
+        return diameter
+      end
+      # Create a cluster with the item with mx distance
+      # to the rest of the cluster's items.
+      # That item is removed from the initial cluster.
+      def init_splinter_cluster(cluster_to_split)
+        max = 0.0
+        max_index = 0
+        cluster_to_split.data_items.each_with_index do |item, index|
+          sum = distance_sum(item, cluster_to_split)
+          max, max_index = sum, index if sum > max
+        end
+        splinter_cluster = cluster_to_split[max_index]
+        cluster_to_split.data_items.delete_at(max_index)
+        return splinter_cluster
+      end
+      # Return the max average distance between any item of
+      # cluster_to_split and the rest of items in that cluster,
+      # minus the average distance with the items of splinter_cluster,
+      # and the index of the item.
+      # A positive value means that the items is closer to the
+      # splinter group than to its current cluster.
+      def max_distance_difference(cluster_to_split, splinter_cluster)
+        max_diff = -1.0/0
+        max_diff_index = 0
+        cluster_to_split.data_items.each_with_index do |item, index|
+          dist_a = distance_sum(item, cluster_to_split) / (cluster_to_split.data_items.length-1)
+          dist_b = distance_sum(item, splinter_cluster) / (splinter_cluster.data_items.length)
+          dist_diff = dist_a - dist_b
+          max_diff, max_diff_index = dist_diff, index if dist_diff > max_diff
+        end
+        return max_diff, max_diff_index
+      end
+      # Sum up the distance between an item and all the items in a cluster
+      def distance_sum(item_a, cluster)
+        cluster.data_items.inject(0.0) do |sum, item_b|
+          sum + @distance_function.call(item_a, item_b)
+        end
+      end
+    end
+  end
+end

data/lib/ai4r/clusterers/k_means.rb ADDED

@@ -0,0 +1,126 @@
+# Author::    Sergio Fierens (implementation)
+# License::   MPL 1.1
+# Project::   ai4r
+# Url::       http://ai4r.rubyforge.org/
+#
+# You can redistribute it and/or modify it under the terms of
+# the Mozilla Public License version 1.1  as published by the
+# Mozilla Foundation at http://www.mozilla.org/MPL/MPL-1.1.txt
+require File.dirname(__FILE__) + '/../data/data_set'
+require File.dirname(__FILE__) + '/../clusterers/clusterer'
+module Ai4r
+  module Clusterers
+    # The k-means algorithm is an algorithm to cluster n objects
+    # based on attributes into k partitions, with k < n.
+    #
+    # More about K Means algorithm:
+    # http://en.wikipedia.org/wiki/K-means_algorithm
+    class KMeans < Clusterer
+      attr_reader :data_set, :number_of_clusters
+      attr_reader :clusters, :centroids, :iterations
+      parameters_info :max_iterations => "Maximum number of iterations to " +
+        "build the clusterer. By default it is uncapped.",
+        :distance_function => "Custom implementation of distance function. " +
+          "It must be a closure receiving two data items and return the " +
+          "distance bewteen them. By default, this algorithm uses " +
+          "ecuclidean distance of numeric attributes to the power of 2.",
+        :centroid_function => "Custom implementation to calculate the " +
+          "centroid of a cluster. It must be a closure receiving an array of " +
+          "data sets, and return an array of data items, representing the " +
+          "centroids of for each data set. " +
+          "By default, this algorithm returns a data items using the mode "+
+          "or mean of each attribute on each data set."
+      def initialize
+        @distance_function = nil
+        @max_iterations = nil
+        @old_centroids = nil
+        @centroid_function = lambda do |data_sets|
+          data_sets.collect{ |data_set| data_set.get_mean_or_mode}
+        end
+      end
+      # Build a new clusterer, using data examples found in data_set.
+      # Items will be clustered in "number_of_clusters" different
+      # clusters.
+      def build(data_set, number_of_clusters)
+        @data_set = data_set
+        @number_of_clusters = number_of_clusters
+        @iterations = 0
+        calc_initial_centroids
+        while(not stop_criteria_met)
+          calculate_membership_clusters
+          recompute_centroids
+        end
+        return self
+      end
+      # Classifies the given data item, returning the cluster index it belongs
+      # to (0-based).
+      def eval(data_item)
+        get_min_index(@centroids.collect {|centroid|
+            distance(data_item, centroid)})
+      end
+      # This function calculates the distance between 2 different
+      # instances. By default, it returns the euclidean distance to the
+      # power of 2.
+      # You can provide a more convinient distance implementation:
+      #
+      # 1- Overwriting this method
+      #
+      # 2- Providing a closure to the :distance_function parameter
+      def distance(a, b)
+        return @distance_function.call(a, b) if @distance_function
+        return euclidean_distance(a, b)
+      end
+      protected
+      def calc_initial_centroids
+        @centroids = []
+        tried_indexes = []
+        while @centroids.length < @number_of_clusters &&
+            tried_indexes.length < @data_set.data_items.length
+          random_index = rand(@data_set.data_items.length)
+          if !tried_indexes.include?(random_index)
+            tried_indexes << random_index
+            if !@centroids.include? @data_set.data_items[random_index]
+              @centroids << @data_set.data_items[random_index]
+            end
+          end
+        end
+        @number_of_clusters = @centroids.length
+      end
+      def stop_criteria_met
+        @old_centroids == @centroids ||
+          (@max_iterations && (@max_iterations <= @iterations))
+      end
+      def calculate_membership_clusters
+        @clusters = Array.new(@number_of_clusters) do
+          Ai4r::Data::DataSet.new :data_labels => @data_set.data_labels
+        end
+        @data_set.data_items.each do |data_item|
+          @clusters[eval(data_item)] << data_item
+        end
+      end
+      def recompute_centroids
+        @old_centroids = @centroids
+        @iterations += 1
+        @centroids = @centroid_function.call(@clusters)
+      end
+    end
+  end
+end

data/lib/ai4r/clusterers/median_linkage.rb ADDED

@@ -0,0 +1,61 @@
+# Author::    Sergio Fierens (implementation)
+# License::   MPL 1.1
+# Project::   ai4r
+# Url::       http://ai4r.rubyforge.org/
+#
+# You can redistribute it and/or modify it under the terms of
+# the Mozilla Public License version 1.1  as published by the
+# Mozilla Foundation at http://www.mozilla.org/MPL/MPL-1.1.txt
+require File.dirname(__FILE__) + '/../data/data_set'
+require File.dirname(__FILE__) + '/../clusterers/single_linkage'
+module Ai4r
+  module Clusterers
+    # Implementation of an Agglomerative Hierarchical clusterer with
+    # median linkage algorithm, aka weighted pair group method centroid
+    # or WPGMC (Everitt et al., 2001 ; Gower, 1967 ; Jain and Dubes, 1988 ).
+    # Hierarchical clusterers create one cluster per element, and then
+    # progressively merge clusters, until the required number of clusters
+    # is reached.
+    # Similar to centroid linkage, but using fixed weights:
+    #
+    #   D(cx, (ci U cj)) =  (1/2)*D(cx, ci) +
+    #                       (1/2)*D(cx, cj) -
+    #                       (1/4)*D(ci, cj)
+    class MedianLinkage < SingleLinkage
+    parameters_info :distance_function =>
+          "Custom implementation of distance function. " +
+          "It must be a closure receiving two data items and return the " +
+          "distance bewteen them. By default, this algorithm uses " +
+          "ecuclidean distance of numeric attributes to the power of 2."
+      # Build a new clusterer, using data examples found in data_set.
+      # Items will be clustered in "number_of_clusters" different
+      # clusters.
+      def build(data_set, number_of_clusters)
+        super
+      end
+      # This algorithms does not allow classification of new data items
+      # once it has been built. Rebuild the cluster including you data element.
+      def eval(data_item)
+        Raise "Eval of new data is not supported by this algorithm."
+      end
+      protected
+      # return distance between cluster cx and cluster (ci U cj),
+      # using median linkage
+      def linkage_distance(cx, ci, cj)
+        ( 0.5  * read_distance_matrix(cx, ci) +
+          0.5  * read_distance_matrix(cx, cj) -
+          0.25 * read_distance_matrix(ci, cj))
+      end
+    end
+  end
+end