RubyGems - ai4r - Versions diffs - 1.13 → 2.0 - Mend

ai4r 1.13 → 2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (129)

checksums.yaml +7 -0
data/README.md +174 -0
data/examples/classifiers/hyperpipes_data.csv +14 -0
data/examples/classifiers/hyperpipes_example.rb +22 -0
data/examples/classifiers/ib1_example.rb +12 -0
data/examples/classifiers/id3_example.rb +15 -10
data/examples/classifiers/id3_graphviz_example.rb +17 -0
data/examples/classifiers/logistic_regression_example.rb +11 -0
data/examples/classifiers/naive_bayes_attributes_example.rb +13 -0
data/examples/classifiers/naive_bayes_example.rb +12 -13
data/examples/classifiers/one_r_example.rb +27 -0
data/examples/classifiers/parameter_tutorial.rb +29 -0
data/examples/classifiers/prism_nominal_example.rb +15 -0
data/examples/classifiers/prism_numeric_example.rb +21 -0
data/examples/classifiers/simple_linear_regression_example.rb +14 -11
data/examples/classifiers/zero_and_one_r_example.rb +34 -0
data/examples/classifiers/zero_one_r_data.csv +8 -0
data/examples/clusterers/clusterer_example.rb +40 -34
data/examples/clusterers/dbscan_example.rb +17 -0
data/examples/clusterers/dendrogram_example.rb +17 -0
data/examples/clusterers/hierarchical_dendrogram_example.rb +20 -0
data/examples/clusterers/kmeans_custom_example.rb +26 -0
data/examples/genetic_algorithm/bitstring_example.rb +41 -0
data/examples/genetic_algorithm/genetic_algorithm_example.rb +26 -18
data/examples/genetic_algorithm/kmeans_seed_tuning.rb +45 -0
data/examples/neural_network/backpropagation_example.rb +48 -48
data/examples/neural_network/hopfield_example.rb +45 -0
data/examples/neural_network/patterns_with_base_noise.rb +39 -39
data/examples/neural_network/patterns_with_noise.rb +41 -39
data/examples/neural_network/train_epochs_callback.rb +25 -0
data/examples/neural_network/training_patterns.rb +39 -39
data/examples/neural_network/transformer_text_classification.rb +78 -0
data/examples/neural_network/xor_example.rb +23 -22
data/examples/reinforcement/q_learning_example.rb +10 -0
data/examples/som/som_data.rb +155 -152
data/examples/som/som_multi_node_example.rb +12 -13
data/examples/som/som_single_example.rb +12 -15
data/examples/transformer/decode_classifier_example.rb +68 -0
data/examples/transformer/deterministic_example.rb +10 -0
data/examples/transformer/seq2seq_example.rb +16 -0
data/lib/ai4r/classifiers/classifier.rb +24 -16
data/lib/ai4r/classifiers/gradient_boosting.rb +64 -0
data/lib/ai4r/classifiers/hyperpipes.rb +119 -43
data/lib/ai4r/classifiers/ib1.rb +122 -32
data/lib/ai4r/classifiers/id3.rb +524 -145
data/lib/ai4r/classifiers/logistic_regression.rb +96 -0
data/lib/ai4r/classifiers/multilayer_perceptron.rb +75 -59
data/lib/ai4r/classifiers/naive_bayes.rb +95 -34
data/lib/ai4r/classifiers/one_r.rb +112 -44
data/lib/ai4r/classifiers/prism.rb +167 -76
data/lib/ai4r/classifiers/random_forest.rb +72 -0
data/lib/ai4r/classifiers/simple_linear_regression.rb +83 -58
data/lib/ai4r/classifiers/support_vector_machine.rb +91 -0
data/lib/ai4r/classifiers/votes.rb +57 -0
data/lib/ai4r/classifiers/zero_r.rb +71 -30
data/lib/ai4r/clusterers/average_linkage.rb +46 -27
data/lib/ai4r/clusterers/bisecting_k_means.rb +50 -44
data/lib/ai4r/clusterers/centroid_linkage.rb +52 -36
data/lib/ai4r/clusterers/cluster_tree.rb +50 -0
data/lib/ai4r/clusterers/clusterer.rb +29 -14
data/lib/ai4r/clusterers/complete_linkage.rb +42 -31
data/lib/ai4r/clusterers/dbscan.rb +134 -0
data/lib/ai4r/clusterers/diana.rb +75 -49
data/lib/ai4r/clusterers/k_means.rb +270 -135
data/lib/ai4r/clusterers/median_linkage.rb +49 -33
data/lib/ai4r/clusterers/single_linkage.rb +196 -88
data/lib/ai4r/clusterers/ward_linkage.rb +51 -35
data/lib/ai4r/clusterers/ward_linkage_hierarchical.rb +25 -10
data/lib/ai4r/clusterers/weighted_average_linkage.rb +48 -32
data/lib/ai4r/data/data_set.rb +223 -103
data/lib/ai4r/data/parameterizable.rb +31 -25
data/lib/ai4r/data/proximity.rb +62 -62
data/lib/ai4r/data/statistics.rb +46 -35
data/lib/ai4r/experiment/classifier_evaluator.rb +84 -32
data/lib/ai4r/experiment/split.rb +39 -0
data/lib/ai4r/genetic_algorithm/chromosome_base.rb +43 -0
data/lib/ai4r/genetic_algorithm/genetic_algorithm.rb +92 -170
data/lib/ai4r/genetic_algorithm/tsp_chromosome.rb +83 -0
data/lib/ai4r/hmm/hidden_markov_model.rb +134 -0
data/lib/ai4r/neural_network/activation_functions.rb +37 -0
data/lib/ai4r/neural_network/backpropagation.rb +399 -134
data/lib/ai4r/neural_network/hopfield.rb +175 -58
data/lib/ai4r/neural_network/transformer.rb +194 -0
data/lib/ai4r/neural_network/weight_initializations.rb +40 -0
data/lib/ai4r/reinforcement/policy_iteration.rb +66 -0
data/lib/ai4r/reinforcement/q_learning.rb +51 -0
data/lib/ai4r/search/a_star.rb +76 -0
data/lib/ai4r/search/bfs.rb +50 -0
data/lib/ai4r/search/dfs.rb +50 -0
data/lib/ai4r/search/mcts.rb +118 -0
data/lib/ai4r/search.rb +12 -0
data/lib/ai4r/som/distance_metrics.rb +29 -0
data/lib/ai4r/som/layer.rb +28 -17
data/lib/ai4r/som/node.rb +61 -32
data/lib/ai4r/som/som.rb +158 -41
data/lib/ai4r/som/two_phase_layer.rb +21 -25
data/lib/ai4r/version.rb +3 -0
data/lib/ai4r.rb +57 -28
metadata +79 -109
data/README.rdoc +0 -39
data/test/classifiers/hyperpipes_test.rb +0 -84
data/test/classifiers/ib1_test.rb +0 -78
data/test/classifiers/id3_test.rb +0 -220
data/test/classifiers/multilayer_perceptron_test.rb +0 -79
data/test/classifiers/naive_bayes_test.rb +0 -43
data/test/classifiers/one_r_test.rb +0 -62
data/test/classifiers/prism_test.rb +0 -85
data/test/classifiers/simple_linear_regression_test.rb +0 -37
data/test/classifiers/zero_r_test.rb +0 -50
data/test/clusterers/average_linkage_test.rb +0 -51
data/test/clusterers/bisecting_k_means_test.rb +0 -66
data/test/clusterers/centroid_linkage_test.rb +0 -53
data/test/clusterers/complete_linkage_test.rb +0 -57
data/test/clusterers/diana_test.rb +0 -69
data/test/clusterers/k_means_test.rb +0 -167
data/test/clusterers/median_linkage_test.rb +0 -53
data/test/clusterers/single_linkage_test.rb +0 -122
data/test/clusterers/ward_linkage_hierarchical_test.rb +0 -81
data/test/clusterers/ward_linkage_test.rb +0 -53
data/test/clusterers/weighted_average_linkage_test.rb +0 -53
data/test/data/data_set_test.rb +0 -104
data/test/data/proximity_test.rb +0 -87
data/test/data/statistics_test.rb +0 -65
data/test/experiment/classifier_evaluator_test.rb +0 -76
data/test/genetic_algorithm/chromosome_test.rb +0 -57
data/test/genetic_algorithm/genetic_algorithm_test.rb +0 -81
data/test/neural_network/backpropagation_test.rb +0 -82
data/test/neural_network/hopfield_test.rb +0 -72
data/test/som/som_test.rb +0 -97

data/lib/ai4r/clusterers/k_means.rb CHANGED

@@ -1,228 +1,363 @@
+# frozen_string_literal: true
 # Author::    Sergio Fierens (implementation)
 # License::   MPL 1.1
 # Project::   ai4r
-# Url::       http://ai4r.org/
+# Url::       https://github.com/SergioFierens/ai4r
 #
-# You can redistribute it and/or modify it under the terms of
-# the Mozilla Public License version 1.1  as published by the
+# You can redistribute it and/or modify it under the terms of
+# the Mozilla Public License version 1.1  as published by the
 # Mozilla Foundation at http://www.mozilla.org/MPL/MPL-1.1.txt
-require File.dirname(__FILE__) + '/../data/data_set'
-require File.dirname(__FILE__) + '/../data/proximity'
-require File.dirname(__FILE__) + '/../clusterers/clusterer'
+require_relative '../data/data_set'
+require_relative '../data/proximity'
+require_relative '../clusterers/clusterer'
 module Ai4r
   module Clusterers
-    # The k-means algorithm is an algorithm to cluster n objects
+    # The k-means algorithm is an algorithm to cluster n objects
     # based on attributes into k partitions, with k < n.
-    #
+    #
     # More about K Means algorithm:
-    # http://en.wikipedia.org/wiki/K-means_algorithm
+    # http://en.wikipedia.org/wiki/K-means_algorithm
     class KMeans < Clusterer
-      attr_reader :data_set, :number_of_clusters
-      attr_reader :clusters, :centroids, :iterations
-      parameters_info :max_iterations => "Maximum number of iterations to " +
-        "build the clusterer. By default it is uncapped.",
-        :distance_function => "Custom implementation of distance function. " +
-          "It must be a closure receiving two data items and return the " +
-          "distance between them. By default, this algorithm uses " +
-          "euclidean distance of numeric attributes to the power of 2.",
-        :centroid_function => "Custom implementation to calculate the " +
-          "centroid of a cluster. It must be a closure receiving an array of " +
-          "data sets, and return an array of data items, representing the " +
-          "centroids of for each data set. " +
-          "By default, this algorithm returns a data items using the mode "+
-          "or mean of each attribute on each data set.",
-        :centroid_indices => "Indices of data items (indexed from 0) to be " +
-          "the initial centroids.  Otherwise, the initial centroids will be " +
-          "assigned randomly from the data set.",
-        :on_empty => "Action to take if a cluster becomes empty, with values " +
-          "'eliminate' (the default action, eliminate the empty cluster), " +
-          "'terminate' (terminate with error), 'random' (relocate the " +
-          "empty cluster to a random point), 'outlier' (relocate the " +
-          "empty cluster to the point furthest from its centroid)."
+      attr_reader :data_set, :number_of_clusters, :clusters, :centroids, :iterations, :history
+      parameters_info(
+        max_iterations: 'Maximum number of iterations to build the clusterer. By default it is uncapped.',
+        distance_function: 'Custom implementation of distance function. ' \
+                           'It must be a closure receiving two data items and return the ' \
+                           'distance between them. By default, this algorithm uses ' \
+                           'euclidean distance of numeric attributes to the power of 2.',
+        centroid_function: 'Custom implementation to calculate the ' \
+                           'centroid of a cluster. It must be a closure receiving an array of ' \
+                           'data sets, and return an array of data items, representing the ' \
+                           'centroids of for each data set. ' \
+                           'By default, this algorithm returns a data items using the mode ' \
+                           'or mean of each attribute on each data set.',
+        centroid_indices: 'Indices of data items (indexed from 0) to be ' \
+                          'the initial centroids.  Otherwise, the initial centroids will be ' \
+                          'assigned randomly from the data set.',
+        on_empty: 'Action to take if a cluster becomes empty, with values ' \
+                  "'eliminate' (the default action, eliminate the empty cluster), " \
+                  "'terminate' (terminate with error), 'random' (relocate the " \
+                  "empty cluster to a random point), 'outlier' (relocate the " \
+                  'empty cluster to the point furthest from its centroid).',
+        random_seed: "Seed value used to initialize Ruby's random number " \
+                     'generator when selecting random centroids.',
+        init_method: 'Strategy to initialize centroids. Available values: ' \
+                     ':random (default) and :kmeans_plus_plus.',
+        restarts: 'Number of random initializations to perform. ' \
+                  'The best run (lowest SSE) will be kept.',
+        track_history: 'Keep centroids and assignments for each iteration ' \
+                       'when building the clusterer.'
+      )
+      # @return [Object]
       def initialize
+        super()
         @distance_function = nil
         @max_iterations = nil
-        @centroid_function = lambda do |data_sets|
-          data_sets.collect{ |data_set| data_set.get_mean_or_mode}
+        @centroid_function = lambda do |data_sets|
+          data_sets.collect(&:get_mean_or_mode)
         end
         @centroid_indices = []
         @on_empty = 'eliminate' # default if none specified
+        @random_seed = nil
+        @rng = nil
+        @init_method = :random
+        @restarts = 1
+        @track_history = false
       end
       # Build a new clusterer, using data examples found in data_set.
       # Items will be clustered in "number_of_clusters" different
       # clusters.
+      # @param data_set [Object]
+      # @param number_of_clusters [Object]
+      # @return [Object]
       def build(data_set, number_of_clusters)
         @data_set = data_set
         @number_of_clusters = number_of_clusters
-        raise ArgumentError, 'Length of centroid indices array differs from the specified number of clusters' unless @centroid_indices.empty? || @centroid_indices.length == @number_of_clusters
-        raise ArgumentError, 'Invalid value for on_empty' unless @on_empty == 'eliminate' || @on_empty == 'terminate' || @on_empty == 'random' || @on_empty == 'outlier'
-        @iterations = 0
-        calc_initial_centroids
-        while(not stop_criteria_met)
-          calculate_membership_clusters
-          recompute_centroids
+        raise ArgumentError, 'Number of clusters larger than data items' if @number_of_clusters > @data_set.data_items.length
+        unless @centroid_indices.empty? || @centroid_indices.length == @number_of_clusters
+          raise ArgumentError,
+                'Length of centroid indices array differs from the specified number of clusters'
+        end
+        unless @on_empty == 'eliminate' || @on_empty == 'terminate' || @on_empty == 'random' || @on_empty == 'outlier'
+          raise ArgumentError,
+                'Invalid value for on_empty'
+        end
+        seed_base = @random_seed
+        best_sse = nil
+        best_centroids = nil
+        best_clusters = nil
+        best_iterations = nil
+        (@restarts || 1).times do |i|
+          @random_seed = seed_base.nil? ? nil : seed_base + i
+          @rng = @random_seed.nil? ? Random.new : Random.new(@random_seed)
+          @iterations = 0
+          @history = [] if @track_history
+          calc_initial_centroids
+          until stop_criteria_met
+            calculate_membership_clusters
+            if @track_history
+              @history << {
+                centroids: @centroids.collect(&:dup),
+                assignments: @assignments.dup
+              }
+            end
+            recompute_centroids
+          end
+          current_sse = sse
+          next unless best_sse.nil? || current_sse < best_sse
+          best_sse = current_sse
+          best_centroids = Marshal.load(Marshal.dump(@centroids))
+          best_clusters = Marshal.load(Marshal.dump(@clusters))
+          best_iterations = @iterations
         end
-        return self
+        @random_seed = seed_base
+        @rng = @random_seed.nil? ? Random.new : Random.new(@random_seed)
+        @centroids = best_centroids
+        @clusters = best_clusters
+        @iterations = best_iterations
+        self
       end
-      # Classifies the given data item, returning the cluster index it belongs
+      # Classifies the given data item, returning the cluster index it belongs
       # to (0-based).
+      # @param data_item [Object]
+      # @return [Object]
       def eval(data_item)
-        get_min_index(@centroids.collect {|centroid|
-            distance(data_item, centroid)})
+        get_min_index(@centroids.collect do |centroid|
+          distance(data_item, centroid)
+        end)
       end
+      # Sum of squared distances of all points to their respective centroids.
+      # It can be used as a measure of cluster compactness (SSE).
+      # @return [Object]
+      def sse
+        sum = 0.0
+        @clusters.each_with_index do |cluster, i|
+          centroid = @centroids[i]
+          cluster.data_items.each do |item|
+            sum += distance(item, centroid)
+          end
+        end
+        sum
+      end
       # This function calculates the distance between 2 different
-      # instances. By default, it returns the euclidean distance to the
+      # instances. By default, it returns the euclidean distance to the
       # power of 2.
       # You can provide a more convenient distance implementation:
-      #
+      #
       # 1- Overwriting this method
-      #
+      #
       # 2- Providing a closure to the :distance_function parameter
+      # @param a [Object]
+      # @param b [Object]
+      # @return [Object]
       def distance(a, b)
         return @distance_function.call(a, b) if @distance_function
-        return Ai4r::Data::Proximity.squared_euclidean_distance(
-                 a.select {|att_a| att_a.is_a? Numeric} ,
-                 b.select {|att_b| att_b.is_a? Numeric})
+        Ai4r::Data::Proximity.squared_euclidean_distance(
+          a.select { |att_a| att_a.is_a? Numeric },
+          b.select { |att_b| att_b.is_a? Numeric }
+        )
       end
-      protected
+      protected
+      # @return [Object]
       def calc_initial_centroids
-        @centroids, @old_centroids = [], nil
+        @centroids = []
+        @old_centroids = nil
         if @centroid_indices.empty?
-          populate_centroids('random')
+          if @init_method == :kmeans_plus_plus
+            kmeans_plus_plus_init
+          else
+            populate_centroids('random')
+          end
         else
           populate_centroids('indices')
         end
       end
+      # @return [Object]
       def stop_criteria_met
-        @old_centroids == @centroids ||
+        @old_centroids == @centroids ||
           (@max_iterations && (@max_iterations <= @iterations))
       end
+      # @return [Object]
       def calculate_membership_clusters
-        @clusters = Array.new(@number_of_clusters) do
-          Ai4r::Data::DataSet.new :data_labels => @data_set.data_labels
+        @clusters = Array.new(@number_of_clusters) do
+          Ai4r::Data::DataSet.new data_labels: @data_set.data_labels
         end
-        @cluster_indices = Array.new(@number_of_clusters) {[]}
+        @cluster_indices = Array.new(@number_of_clusters) { [] }
+        @assignments = Array.new(@data_set.data_items.length)
         @data_set.data_items.each_with_index do |data_item, data_index|
           c = eval(data_item)
           @clusters[c] << data_item
           @cluster_indices[c] << data_index if @on_empty == 'outlier'
+          @assignments[data_index] = c
         end
-        manage_empty_clusters if has_empty_cluster?
+        manage_empty_clusters if empty_cluster?
       end
+      # @return [Object]
       def recompute_centroids
         @old_centroids = @centroids
         @iterations += 1
-        @centroids = @centroid_function.call(@clusters)
+        @centroids = @centroid_function.call(@clusters)
       end
-      def populate_centroids(populate_method, number_of_clusters=@number_of_clusters)
+      # @return [Object]
+      def kmeans_plus_plus_init
+        chosen_indices = []
+        first_index = (0...@data_set.data_items.length).to_a.sample(random: @rng)
+        return if first_index.nil?
+        @centroids << @data_set.data_items[first_index]
+        chosen_indices << first_index
+        while @centroids.length < @number_of_clusters &&
+              chosen_indices.length < @data_set.data_items.length
+          distances = []
+          total = 0.0
+          @data_set.data_items.each_with_index do |item, index|
+            next if chosen_indices.include?(index)
+            min_dist = @centroids.map { |c| distance(item, c) }.min
+            distances << [index, min_dist]
+            total += min_dist
+          end
+          break if distances.empty?
+          r = @rng.rand * total
+          cumulative = 0.0
+          chosen = distances.find do |_idx, dist|
+            cumulative += dist
+            cumulative >= r
+          end
+          chosen_indices << chosen[0]
+          @centroids << @data_set.data_items[chosen[0]]
+        end
+        @number_of_clusters = @centroids.length
+      end
+      # @param populate_method [Object]
+      # @param number_of_clusters [Object]
+      # @return [Object]
+      def populate_centroids(populate_method, number_of_clusters = @number_of_clusters)
         tried_indexes = []
         case populate_method
         when 'random' # for initial assignment (without the :centroid_indices option) and for reassignment of empty cluster centroids (with :on_empty option 'random')
-          while @centroids.length < number_of_clusters &&
-              tried_indexes.length < @data_set.data_items.length
-            random_index = rand(@data_set.data_items.length)
-            if !tried_indexes.include?(random_index)
-              tried_indexes << random_index
-              if !@centroids.include? @data_set.data_items[random_index]
-                @centroids << @data_set.data_items[random_index]
-              end
-            end
+          while @centroids.length < number_of_clusters &&
+                tried_indexes.length < @data_set.data_items.length
+            random_index = (0...@data_set.data_items.length).to_a.sample(random: @rng)
+            next if tried_indexes.include?(random_index)
+            tried_indexes << random_index
+            @centroids << @data_set.data_items[random_index] unless @centroids.include? @data_set.data_items[random_index]
           end
         when 'indices' # for initial assignment only (with the :centroid_indices option)
           @centroid_indices.each do |index|
-            raise ArgumentError, "Invalid centroid index #{index}" unless (index.is_a? Integer) && index >=0 && index < @data_set.data_items.length
-            if !tried_indexes.include?(index)
-              tried_indexes << index
-              if !@centroids.include? @data_set.data_items[index]
-                @centroids << @data_set.data_items[index]
-              end
+            unless (index.is_a? Integer) && index >= 0 && index < @data_set.data_items.length
+              raise ArgumentError,
+                    "Invalid centroid index #{index}"
             end
+            next if tried_indexes.include?(index)
+            tried_indexes << index
+            @centroids << @data_set.data_items[index] unless @centroids.include? @data_set.data_items[index]
           end
         when 'outlier' # for reassignment of empty cluster centroids only (with :on_empty option 'outlier')
           sorted_data_indices = sort_data_indices_by_dist_to_centroid
           i = sorted_data_indices.length - 1 # the last item is the furthest from its centroid
-          while @centroids.length < number_of_clusters &&
-              tried_indexes.length < @data_set.data_items.length
-            outlier_index = sorted_data_indices[i]
-            if !tried_indexes.include?(outlier_index)
+          while @centroids.length < number_of_clusters &&
+                tried_indexes.length < @data_set.data_items.length
+            outlier_index = sorted_data_indices[i]
+            unless tried_indexes.include?(outlier_index)
               tried_indexes << outlier_index
-              if !@centroids.include? @data_set.data_items[outlier_index]
-                @centroids << @data_set.data_items[outlier_index]
-              end
+              @centroids << @data_set.data_items[outlier_index] unless @centroids.include? @data_set.data_items[outlier_index]
             end
-            i > 0 ? i -= 1 : break
+            i.positive? ? i -= 1 : break
           end
-        end
+        end
         @number_of_clusters = @centroids.length
-      end
-       # Sort cluster points by distance to assigned centroid.  Utilizes @cluster_indices.
-       # Returns indices, sorted in order from the nearest to furthest.
-       def sort_data_indices_by_dist_to_centroid
-         sorted_data_indices = []
-         h = {}
-         @clusters.each_with_index do |cluster, c|
-           centroid = @centroids[c]
-           cluster.data_items.each_with_index do |data_item, i|
-             dist_to_centroid = distance(data_item, centroid)
-             data_index = @cluster_indices[c][i]
-             h[data_index] = dist_to_centroid
-           end
-         end
-         # sort hash of {index => dist to centroid} by dist to centroid (ascending) and then return an array of only the indices
-         sorted_data_indices = h.sort_by{|k,v| v}.collect{|a,b| a}
-       end
-      def has_empty_cluster?
+      end
+      # Sort cluster points by distance to assigned centroid.  Utilizes @cluster_indices.
+      # Returns indices, sorted in order from the nearest to furthest.
+      # @return [Object]
+      def sort_data_indices_by_dist_to_centroid
+        h = {}
+        @clusters.each_with_index do |cluster, c|
+          centroid = @centroids[c]
+          cluster.data_items.each_with_index do |data_item, i|
+            dist_to_centroid = distance(data_item, centroid)
+            data_index = @cluster_indices[c][i]
+            h[data_index] = dist_to_centroid
+          end
+        end
+        # sort hash of {index => dist to centroid} by dist to centroid (ascending) and then return an array of only the indices
+        h.sort_by { |_k, v| v }.collect { |a, _b| a }
+      end
+      # @return [Object]
+      def empty_cluster?
         found_empty = false
         @number_of_clusters.times do |c|
           found_empty = true if @clusters[c].data_items.empty?
         end
         found_empty
       end
+      # @return [Object]
       def manage_empty_clusters
-        return if self.on_empty == 'terminate' # Do nothing to terminate with error. (The empty cluster will be assigned a nil centroid, and then calculating the distance from this centroid to another point will raise an exception.)
+        # Do nothing to terminate with error. (The empty cluster will be assigned a nil centroid, and then calculating the distance from this centroid to another point will raise an exception.)
+        return if on_empty == 'terminate'
         initial_number_of_clusters = @number_of_clusters
         eliminate_empty_clusters
-        return if self.on_empty == 'eliminate'
-        populate_centroids(self.on_empty, initial_number_of_clusters) # Add initial_number_of_clusters - @number_of_clusters
-        calculate_membership_clusters
+        return if on_empty == 'eliminate'
+        populate_centroids(on_empty, initial_number_of_clusters) # Add initial_number_of_clusters - @number_of_clusters
+        calculate_membership_clusters
       end
+      # @return [Object]
       def eliminate_empty_clusters
-        old_clusters, old_centroids, old_cluster_indices = @clusters, @centroids, @cluster_indices
-        @clusters, @centroids, @cluster_indices = [], [], []
+        old_clusters = @clusters
+        old_centroids = @centroids
+        old_cluster_indices = @cluster_indices
+        old_assignments = @assignments
+        @clusters = []
+        @centroids = []
+        @cluster_indices = []
+        remap = {}
+        new_index = 0
         @number_of_clusters.times do |i|
-          if !old_clusters[i].data_items.empty?
-            @clusters << old_clusters[i]
-            @cluster_indices << old_cluster_indices[i]
-            @centroids << old_centroids[i]
-          end
+          next if old_clusters[i].data_items.empty?
+          remap[i] = new_index
+          @clusters << old_clusters[i]
+          @cluster_indices << old_cluster_indices[i]
+          @centroids << old_centroids[i]
+          new_index += 1
         end
         @number_of_clusters = @centroids.length
+        @assignments = old_assignments.map { |c| remap[c] }
       end
     end
   end
 end

data/lib/ai4r/clusterers/median_linkage.rb CHANGED

@@ -1,61 +1,77 @@
+# frozen_string_literal: true
 # Author::    Sergio Fierens (implementation)
 # License::   MPL 1.1
 # Project::   ai4r
-# Url::       http://www.ai4r.org/
+# Url::       https://github.com/SergioFierens/ai4r
 #
-# You can redistribute it and/or modify it under the terms of
-# the Mozilla Public License version 1.1  as published by the
+# You can redistribute it and/or modify it under the terms of
+# the Mozilla Public License version 1.1  as published by the
 # Mozilla Foundation at http://www.mozilla.org/MPL/MPL-1.1.txt
-require File.dirname(__FILE__) + '/../data/data_set'
-require File.dirname(__FILE__) + '/../clusterers/single_linkage'
+require_relative '../data/data_set'
+require_relative '../clusterers/single_linkage'
+require_relative '../clusterers/cluster_tree'
 module Ai4r
   module Clusterers
-    # Implementation of an Agglomerative Hierarchical clusterer with
-    # median linkage algorithm, aka weighted pair group method centroid
+    # Implementation of an Agglomerative Hierarchical clusterer with
+    # median linkage algorithm, aka weighted pair group method centroid
     # or WPGMC (Everitt et al., 2001 ; Gower, 1967 ; Jain and Dubes, 1988 ).
-    # Hierarchical clusterer create one cluster per element, and then
+    # Hierarchical clusterer create one cluster per element, and then
     # progressively merge clusters, until the required number of clusters
     # is reached.
-    # Similar to centroid linkages, but using fix weight:
-    #
-    #   D(cx, (ci U cj)) =  (1/2)*D(cx, ci) +
-    #                       (1/2)*D(cx, cj) -
+    # Similar to centroid linkages, but using fix weight:
+    #
+    #   D(cx, (ci U cj)) =  (1/2)*D(cx, ci) +
+    #                       (1/2)*D(cx, cj) -
     #                       (1/4)*D(ci, cj)
     class MedianLinkage < SingleLinkage
-    parameters_info :distance_function =>
-          "Custom implementation of distance function. " +
-          "It must be a closure receiving two data items and return the " +
-          "distance between them. By default, this algorithm uses " +
-          "euclidean distance of numeric attributes to the power of 2."
+      include ClusterTree
+      parameters_info distance_function:
+            'Custom implementation of distance function. ' \
+            'It must be a closure receiving two data items and return the ' \
+            'distance between them. By default, this algorithm uses ' \
+            'euclidean distance of numeric attributes to the power of 2.'
       # Build a new clusterer, using data examples found in data_set.
       # Items will be clustered in "number_of_clusters" different
       # clusters.
-      def build(data_set, number_of_clusters)
+      # @param data_set [Object]
+      # @param number_of_clusters [Object]
+      # @param *options [Object]
+      # @return [Object]
+      def build(data_set, number_of_clusters = 1, **options)
         super
       end
-      # This algorithms does not allow classification of new data items
+      # This algorithms does not allow classification of new data items
       # once it has been built. Rebuild the cluster including you data element.
-      def eval(data_item)
-        Raise "Eval of new data is not supported by this algorithm."
+      # @param _data_item [Object]
+      # @return [Object]
+      def eval(_data_item)
+        raise NotImplementedError, 'Eval of new data is not supported by this algorithm.'
       end
+      # @return [Object]
+      def supports_eval?
+        false
+      end
       protected
       # return distance between cluster cx and cluster (ci U cj),
       # using median linkage
-      def linkage_distance(cx, ci, cj)
-        ( 0.5  * read_distance_matrix(cx, ci) +
-          0.5  * read_distance_matrix(cx, cj) -
-          0.25 * read_distance_matrix(ci, cj))
+      # @param cx [Object]
+      # @param ci [Object]
+      # @param cj [Object]
+      # @return [Object]
+      def linkage_distance(cluster_x, cluster_i, cluster_j)
+        ((0.5 * read_distance_matrix(cluster_x, cluster_i)) +
+          (0.5  * read_distance_matrix(cluster_x, cluster_j)) -
+          (0.25 * read_distance_matrix(cluster_i, cluster_j)))
       end
     end
   end
 end