ai4r 1.13 → 2.0

This diff shows the changes between two publicly released versions of the package, as published to a supported public registry. It is provided for informational purposes only.
Files changed (129)
  1. checksums.yaml +7 -0
  2. data/README.md +174 -0
  3. data/examples/classifiers/hyperpipes_data.csv +14 -0
  4. data/examples/classifiers/hyperpipes_example.rb +22 -0
  5. data/examples/classifiers/ib1_example.rb +12 -0
  6. data/examples/classifiers/id3_example.rb +15 -10
  7. data/examples/classifiers/id3_graphviz_example.rb +17 -0
  8. data/examples/classifiers/logistic_regression_example.rb +11 -0
  9. data/examples/classifiers/naive_bayes_attributes_example.rb +13 -0
  10. data/examples/classifiers/naive_bayes_example.rb +12 -13
  11. data/examples/classifiers/one_r_example.rb +27 -0
  12. data/examples/classifiers/parameter_tutorial.rb +29 -0
  13. data/examples/classifiers/prism_nominal_example.rb +15 -0
  14. data/examples/classifiers/prism_numeric_example.rb +21 -0
  15. data/examples/classifiers/simple_linear_regression_example.rb +14 -11
  16. data/examples/classifiers/zero_and_one_r_example.rb +34 -0
  17. data/examples/classifiers/zero_one_r_data.csv +8 -0
  18. data/examples/clusterers/clusterer_example.rb +40 -34
  19. data/examples/clusterers/dbscan_example.rb +17 -0
  20. data/examples/clusterers/dendrogram_example.rb +17 -0
  21. data/examples/clusterers/hierarchical_dendrogram_example.rb +20 -0
  22. data/examples/clusterers/kmeans_custom_example.rb +26 -0
  23. data/examples/genetic_algorithm/bitstring_example.rb +41 -0
  24. data/examples/genetic_algorithm/genetic_algorithm_example.rb +26 -18
  25. data/examples/genetic_algorithm/kmeans_seed_tuning.rb +45 -0
  26. data/examples/neural_network/backpropagation_example.rb +48 -48
  27. data/examples/neural_network/hopfield_example.rb +45 -0
  28. data/examples/neural_network/patterns_with_base_noise.rb +39 -39
  29. data/examples/neural_network/patterns_with_noise.rb +41 -39
  30. data/examples/neural_network/train_epochs_callback.rb +25 -0
  31. data/examples/neural_network/training_patterns.rb +39 -39
  32. data/examples/neural_network/transformer_text_classification.rb +78 -0
  33. data/examples/neural_network/xor_example.rb +23 -22
  34. data/examples/reinforcement/q_learning_example.rb +10 -0
  35. data/examples/som/som_data.rb +155 -152
  36. data/examples/som/som_multi_node_example.rb +12 -13
  37. data/examples/som/som_single_example.rb +12 -15
  38. data/examples/transformer/decode_classifier_example.rb +68 -0
  39. data/examples/transformer/deterministic_example.rb +10 -0
  40. data/examples/transformer/seq2seq_example.rb +16 -0
  41. data/lib/ai4r/classifiers/classifier.rb +24 -16
  42. data/lib/ai4r/classifiers/gradient_boosting.rb +64 -0
  43. data/lib/ai4r/classifiers/hyperpipes.rb +119 -43
  44. data/lib/ai4r/classifiers/ib1.rb +122 -32
  45. data/lib/ai4r/classifiers/id3.rb +524 -145
  46. data/lib/ai4r/classifiers/logistic_regression.rb +96 -0
  47. data/lib/ai4r/classifiers/multilayer_perceptron.rb +75 -59
  48. data/lib/ai4r/classifiers/naive_bayes.rb +95 -34
  49. data/lib/ai4r/classifiers/one_r.rb +112 -44
  50. data/lib/ai4r/classifiers/prism.rb +167 -76
  51. data/lib/ai4r/classifiers/random_forest.rb +72 -0
  52. data/lib/ai4r/classifiers/simple_linear_regression.rb +83 -58
  53. data/lib/ai4r/classifiers/support_vector_machine.rb +91 -0
  54. data/lib/ai4r/classifiers/votes.rb +57 -0
  55. data/lib/ai4r/classifiers/zero_r.rb +71 -30
  56. data/lib/ai4r/clusterers/average_linkage.rb +46 -27
  57. data/lib/ai4r/clusterers/bisecting_k_means.rb +50 -44
  58. data/lib/ai4r/clusterers/centroid_linkage.rb +52 -36
  59. data/lib/ai4r/clusterers/cluster_tree.rb +50 -0
  60. data/lib/ai4r/clusterers/clusterer.rb +29 -14
  61. data/lib/ai4r/clusterers/complete_linkage.rb +42 -31
  62. data/lib/ai4r/clusterers/dbscan.rb +134 -0
  63. data/lib/ai4r/clusterers/diana.rb +75 -49
  64. data/lib/ai4r/clusterers/k_means.rb +270 -135
  65. data/lib/ai4r/clusterers/median_linkage.rb +49 -33
  66. data/lib/ai4r/clusterers/single_linkage.rb +196 -88
  67. data/lib/ai4r/clusterers/ward_linkage.rb +51 -35
  68. data/lib/ai4r/clusterers/ward_linkage_hierarchical.rb +25 -10
  69. data/lib/ai4r/clusterers/weighted_average_linkage.rb +48 -32
  70. data/lib/ai4r/data/data_set.rb +223 -103
  71. data/lib/ai4r/data/parameterizable.rb +31 -25
  72. data/lib/ai4r/data/proximity.rb +62 -62
  73. data/lib/ai4r/data/statistics.rb +46 -35
  74. data/lib/ai4r/experiment/classifier_evaluator.rb +84 -32
  75. data/lib/ai4r/experiment/split.rb +39 -0
  76. data/lib/ai4r/genetic_algorithm/chromosome_base.rb +43 -0
  77. data/lib/ai4r/genetic_algorithm/genetic_algorithm.rb +92 -170
  78. data/lib/ai4r/genetic_algorithm/tsp_chromosome.rb +83 -0
  79. data/lib/ai4r/hmm/hidden_markov_model.rb +134 -0
  80. data/lib/ai4r/neural_network/activation_functions.rb +37 -0
  81. data/lib/ai4r/neural_network/backpropagation.rb +399 -134
  82. data/lib/ai4r/neural_network/hopfield.rb +175 -58
  83. data/lib/ai4r/neural_network/transformer.rb +194 -0
  84. data/lib/ai4r/neural_network/weight_initializations.rb +40 -0
  85. data/lib/ai4r/reinforcement/policy_iteration.rb +66 -0
  86. data/lib/ai4r/reinforcement/q_learning.rb +51 -0
  87. data/lib/ai4r/search/a_star.rb +76 -0
  88. data/lib/ai4r/search/bfs.rb +50 -0
  89. data/lib/ai4r/search/dfs.rb +50 -0
  90. data/lib/ai4r/search/mcts.rb +118 -0
  91. data/lib/ai4r/search.rb +12 -0
  92. data/lib/ai4r/som/distance_metrics.rb +29 -0
  93. data/lib/ai4r/som/layer.rb +28 -17
  94. data/lib/ai4r/som/node.rb +61 -32
  95. data/lib/ai4r/som/som.rb +158 -41
  96. data/lib/ai4r/som/two_phase_layer.rb +21 -25
  97. data/lib/ai4r/version.rb +3 -0
  98. data/lib/ai4r.rb +57 -28
  99. metadata +79 -109
  100. data/README.rdoc +0 -39
  101. data/test/classifiers/hyperpipes_test.rb +0 -84
  102. data/test/classifiers/ib1_test.rb +0 -78
  103. data/test/classifiers/id3_test.rb +0 -220
  104. data/test/classifiers/multilayer_perceptron_test.rb +0 -79
  105. data/test/classifiers/naive_bayes_test.rb +0 -43
  106. data/test/classifiers/one_r_test.rb +0 -62
  107. data/test/classifiers/prism_test.rb +0 -85
  108. data/test/classifiers/simple_linear_regression_test.rb +0 -37
  109. data/test/classifiers/zero_r_test.rb +0 -50
  110. data/test/clusterers/average_linkage_test.rb +0 -51
  111. data/test/clusterers/bisecting_k_means_test.rb +0 -66
  112. data/test/clusterers/centroid_linkage_test.rb +0 -53
  113. data/test/clusterers/complete_linkage_test.rb +0 -57
  114. data/test/clusterers/diana_test.rb +0 -69
  115. data/test/clusterers/k_means_test.rb +0 -167
  116. data/test/clusterers/median_linkage_test.rb +0 -53
  117. data/test/clusterers/single_linkage_test.rb +0 -122
  118. data/test/clusterers/ward_linkage_hierarchical_test.rb +0 -81
  119. data/test/clusterers/ward_linkage_test.rb +0 -53
  120. data/test/clusterers/weighted_average_linkage_test.rb +0 -53
  121. data/test/data/data_set_test.rb +0 -104
  122. data/test/data/proximity_test.rb +0 -87
  123. data/test/data/statistics_test.rb +0 -65
  124. data/test/experiment/classifier_evaluator_test.rb +0 -76
  125. data/test/genetic_algorithm/chromosome_test.rb +0 -57
  126. data/test/genetic_algorithm/genetic_algorithm_test.rb +0 -81
  127. data/test/neural_network/backpropagation_test.rb +0 -82
  128. data/test/neural_network/hopfield_test.rb +0 -72
  129. data/test/som/som_test.rb +0 -97
data/lib/ai4r/clusterers/bisecting_k_means.rb
@@ -1,80 +1,87 @@
+# frozen_string_literal: true
+
 # Author:: Sergio Fierens (implementation)
 # License:: MPL 1.1
 # Project:: ai4r
-# Url:: http://www.ai4r.org/
+# Url:: https://github.com/SergioFierens/ai4r
 #
-# You can redistribute it and/or modify it under the terms of
-# the Mozilla Public License version 1.1 as published by the
+# You can redistribute it and/or modify it under the terms of
+# the Mozilla Public License version 1.1 as published by the
 # Mozilla Foundation at http://www.mozilla.org/MPL/MPL-1.1.txt
 
-require File.dirname(__FILE__) + '/../data/data_set'
-require File.dirname(__FILE__) + '/../clusterers/k_means'
+require_relative '../data/data_set'
+require_relative '../clusterers/k_means'
 
 module Ai4r
   module Clusterers
-
     # The Bisecting k-means algorithm is a variation of the "k-means" algorithm,
     # somewhat less sensitive to the initial election of centroids than the
-    # original.
-    #
+    # original.
+    #
     # More about K Means algorithm:
-    # http://en.wikipedia.org/wiki/K-means_algorithm
+    # http://en.wikipedia.org/wiki/K-means_algorithm
     class BisectingKMeans < KMeans
-
       attr_reader :data_set, :number_of_clusters, :clusters, :centroids
-      attr_accessor :max_iterations, :distance_function, :refine
-
-      parameters_info :max_iterations => "Maximum number of iterations to " +
-        "build the clusterer. By default it is uncapped.",
-        :distance_function => "Custom implementation of distance function. " +
-        "It must be a closure receiving two data items and return the " +
-        "distance between them. By default, this algorithm uses " +
-        "euclidean distance of numeric attributes to the power of 2.",
-        :centroid_function => "Custom implementation to calculate the " +
-        "centroid of a cluster. It must be a closure receiving an array of " +
-        "data sets, and return an array of data items, representing the " +
-        "centroids of for each data set. " +
-        "By default, this algorithm returns a data items using the mode "+
-        "or mean of each attribute on each data set.",
-        :refine => "Boolean value. True by default. It will run the " +
-        "classic K Means algorithm, using as initial centroids the " +
-        "result of the bisecting approach."
-
-
-      def intialize
+
+      parameters_info max_iterations: 'Maximum number of iterations to ' \
+                                      'build the clusterer. By default it is uncapped.',
+                      distance_function: 'Custom implementation of distance function. ' \
+                                         'It must be a closure receiving two data items and return the ' \
+                                         'distance between them. By default, this algorithm uses ' \
+                                         'euclidean distance of numeric attributes to the power of 2.',
+                      centroid_function: 'Custom implementation to calculate the ' \
+                                         'centroid of a cluster. It must be a closure receiving an array of ' \
+                                         'data sets, and return an array of data items, representing the ' \
+                                         'centroids of for each data set. ' \
+                                         'By default, this algorithm returns a data items using the mode ' \
+                                         'or mean of each attribute on each data set.',
+                      refine: 'Boolean value. True by default. It will run the ' \
+                              'classic K Means algorithm, using as initial centroids the ' \
+                              'result of the bisecting approach.'
+
+      # @return [Object]
+      def initialize
+        super
         @refine = true
       end
-
+
       # Build a new clusterer, using data examples found in data_set.
       # Items will be clustered in "number_of_clusters" different
       # clusters.
+      # @param data_set [Object]
+      # @param number_of_clusters [Object]
+      # @return [Object]
       def build(data_set, number_of_clusters)
         @data_set = data_set
         @number_of_clusters = number_of_clusters
-
+
         @clusters = [@data_set]
         @centroids = [@data_set.get_mean_or_mode]
         while @clusters.length < @number_of_clusters
           biggest_cluster_index = find_biggest_cluster_index(@clusters)
-          clusterer = KMeans.new.
-            set_parameters(get_parameters).
-            build(@clusters[biggest_cluster_index], 2)
+          clusterer = KMeans.new
+                      .set_parameters(get_parameters)
+                      .build(@clusters[biggest_cluster_index], 2)
           @clusters.delete_at(biggest_cluster_index)
           @centroids.delete_at(biggest_cluster_index)
           @clusters.concat(clusterer.clusters)
           @centroids.concat(clusterer.centroids)
         end
-
+
         super if @refine
-
-        return self
-      end
-
-      protected
+
+        self
+      end
+
+      protected
+
+      # @return [Object]
       def calc_initial_centroids
         @centroids # Use existing centroids
       end
-
+
+      # @param clusters [Object]
+      # @return [Object]
       def find_biggest_cluster_index(clusters)
         max_index = 0
         max_length = 0
@@ -85,9 +92,8 @@ module Ai4r
             max_index = cluster_index
           end
         end
-        return max_index
+        max_index
       end
-
     end
   end
 end
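
For orientation, a minimal usage sketch of the refactored BisectingKMeans API above; the data values and parameter choices are illustrative, not taken from the package.

  require 'ai4r'

  # Six 2-D points forming three obvious groups (made-up sample data).
  data = Ai4r::Data::DataSet.new(
    data_labels: %w[x y],
    data_items: [[1, 1], [1, 2], [9, 8], [8, 9], [20, 21], [21, 20]]
  )

  clusterer = Ai4r::Clusterers::BisectingKMeans.new
  # Keyword-style parameters, as declared by parameters_info in 2.0.
  clusterer.set_parameters(refine: true, max_iterations: 50)
  clusterer.build(data, 3)

  puts clusterer.centroids.inspect # one centroid per cluster
  puts clusterer.eval([19, 22])    # index of the closest cluster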
data/lib/ai4r/clusterers/centroid_linkage.rb
@@ -1,66 +1,82 @@
+# frozen_string_literal: true
+
 # Author:: Sergio Fierens (implementation)
 # License:: MPL 1.1
 # Project:: ai4r
-# Url:: http://ai4r.org/
+# Url:: https://github.com/SergioFierens/ai4r
 #
-# You can redistribute it and/or modify it under the terms of
-# the Mozilla Public License version 1.1 as published by the
+# You can redistribute it and/or modify it under the terms of
+# the Mozilla Public License version 1.1 as published by the
 # Mozilla Foundation at http://www.mozilla.org/MPL/MPL-1.1.txt
 
-require File.dirname(__FILE__) + '/../data/data_set'
-require File.dirname(__FILE__) + '/../clusterers/single_linkage'
+require_relative '../data/data_set'
+require_relative '../clusterers/single_linkage'
+require_relative '../clusterers/cluster_tree'
 
 module Ai4r
   module Clusterers
-
-    # Implementation of an Agglomerative Hierarchical clusterer with
-    # centroid linkage algorithm, aka unweighted pair group method
+    # Implementation of an Agglomerative Hierarchical clusterer with
+    # centroid linkage algorithm, aka unweighted pair group method
     # centroid (UPGMC) (Everitt et al., 2001 ; Jain and Dubes, 1988 ;
     # Sokal and Michener, 1958 )
-    # Hierarchical clusterer create one cluster per element, and then
+    # Hierarchical clusterer create one cluster per element, and then
     # progressively merge clusters, until the required number of clusters
     # is reached.
-    # The distance between clusters is the squared euclidean distance
-    # between their centroids.
-    #
+    # The distance between clusters is the squared euclidean distance
+    # between their centroids.
+    #
     # D(cx, (ci U cj)) = | mx - mij |^2
-    # D(cx, (ci U cj)) = (ni/(ni+nj))*D(cx, ci) +
-    #                    (nj/(ni+nj))*D(cx, cj) -
+    # D(cx, (ci U cj)) = (ni/(ni+nj))*D(cx, ci) +
+    #                    (nj/(ni+nj))*D(cx, cj) -
     #                    (ni*nj/(ni+nj)^2)*D(ci, cj)
     class CentroidLinkage < SingleLinkage
-
-      parameters_info :distance_function =>
-        "Custom implementation of distance function. " +
-        "It must be a closure receiving two data items and return the " +
-        "distance between them. By default, this algorithm uses " +
-        "euclidean distance of numeric attributes to the power of 2."
-
+      include ClusterTree
+
+      parameters_info distance_function:
+                        'Custom implementation of distance function. ' \
+                        'It must be a closure receiving two data items and return the ' \
+                        'distance between them. By default, this algorithm uses ' \
+                        'euclidean distance of numeric attributes to the power of 2.'
+
       # Build a new clusterer, using data examples found in data_set.
       # Items will be clustered in "number_of_clusters" different
       # clusters.
-      def build(data_set, number_of_clusters)
+      # @param data_set [Object]
+      # @param number_of_clusters [Object]
+      # @param *options [Object]
+      # @return [Object]
+      def build(data_set, number_of_clusters = 1, **options)
        super
       end
-
-      # This algorithms does not allow classification of new data items
+
+      # This algorithms does not allow classification of new data items
       # once it has been built. Rebuild the cluster including you data element.
-      def eval(data_item)
-        Raise "Eval of new data is not supported by this algorithm."
+      # @param _data_item [Object]
+      # @return [Object]
+      def eval(_data_item)
+        raise NotImplementedError, 'Eval of new data is not supported by this algorithm.'
       end
-
+
+      # @return [Object]
+      def supports_eval?
+        false
+      end
+
       protected
-
+
       # return distance between cluster cx and cluster (ci U cj),
       # using centroid linkage
-      def linkage_distance(cx, ci, cj)
-        ni = @index_clusters[ci].length
-        nj = @index_clusters[cj].length
-        ( ni * read_distance_matrix(cx, ci) +
-          nj * read_distance_matrix(cx, cj) -
-          1.0 * ni * nj * read_distance_matrix(ci, cj) / (ni+nj)) / (ni+nj)
+      # @param cx [Object]
+      # @param ci [Object]
+      # @param cj [Object]
+      # @return [Object]
+      def linkage_distance(cluster_x, cluster_i, cluster_j)
+        ni = @index_clusters[cluster_i].length
+        nj = @index_clusters[cluster_j].length
+        ((ni * read_distance_matrix(cluster_x, cluster_i)) +
+         (nj * read_distance_matrix(cluster_x, cluster_j)) -
+         (1.0 * ni * nj * read_distance_matrix(cluster_i, cluster_j) / (ni + nj))) / (ni + nj)
       end
-
     end
   end
 end
-
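
A rough sketch of driving the updated CentroidLinkage (sample data is made up): build now defaults number_of_clusters to 1, and eval raises NotImplementedError instead of the old broken Raise call.

  require 'ai4r'

  data = Ai4r::Data::DataSet.new(
    data_labels: %w[x y],
    data_items: [[0, 0], [0, 1], [5, 5], [5, 6], [9, 9]]
  )

  clusterer = Ai4r::Clusterers::CentroidLinkage.new.build(data, 2)
  clusterer.clusters.each_with_index do |cluster, i|
    puts "cluster #{i}: #{cluster.data_items.inspect}"
  end

  # Dendrogram-style clusterers cannot classify unseen items.
  puts clusterer.supports_eval? # => false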
data/lib/ai4r/clusterers/cluster_tree.rb
@@ -0,0 +1,50 @@
+# frozen_string_literal: true
+
+module Ai4r
+  module Clusterers
+    # Mixin to capture merge steps during agglomerative clustering.
+    # Stores intermediate clusters in +cluster_tree+. Optional +depth+
+    # limits how many last merges are recorded.
+    module ClusterTree
+      attr_reader :cluster_tree
+
+      # @param depth [Object]
+      # @param args [Object]
+      # @return [Object]
+      def initialize(depth = nil, *args)
+        @cluster_tree = []
+        @depth = depth
+        @merges_so_far = 0
+        super(*args)
+      end
+
+      # @param data_set [Object]
+      # @param number_of_clusters [Object]
+      # @param *options [Object]
+      # @return [Object]
+      def build(data_set, number_of_clusters = 1, **options)
+        @total_merges = data_set.data_items.length - number_of_clusters
+        super
+        @cluster_tree << clusters
+        @cluster_tree.reverse!
+        self
+      end
+
+      protected
+
+      # @param index_a [Object]
+      # @param index_b [Object]
+      # @param index_clusters [Object]
+      # @return [Object]
+      def merge_clusters(index_a, index_b, index_clusters)
+        if @depth.nil? || @merges_so_far > @total_merges - @depth
+          stored_distance_matrix = @distance_matrix.dup
+          @cluster_tree << build_clusters_from_index_clusters(index_clusters)
+          @distance_matrix = stored_distance_matrix
+        end
+        @merges_so_far += 1
+        super
+      end
+    end
+  end
+end
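
A sketch of what the ClusterTree mixin records, assuming made-up data and using CentroidLinkage (shown above, which includes the mixin): after build, cluster_tree[0] should hold the final clustering and later entries walk back toward the early, smaller merges.

  require 'ai4r'

  data = Ai4r::Data::DataSet.new(
    data_labels: %w[x y],
    data_items: [[0, 0], [0, 1], [4, 4], [4, 5], [9, 9]]
  )

  # Every merge step is captured; pass a depth to the constructor to keep
  # only the last few merges (depth handling as defined by the mixin above).
  clusterer = Ai4r::Clusterers::CentroidLinkage.new.build(data, 2)
  clusterer.cluster_tree.each_with_index do |step, i|
    sizes = step.map { |cluster| cluster.data_items.length }
    puts "step #{i}: cluster sizes #{sizes.inspect}"
  end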
data/lib/ai4r/clusterers/clusterer.rb
@@ -1,37 +1,53 @@
+# frozen_string_literal: true
+
 # Author:: Sergio Fierens
 # License:: MPL 1.1
 # Project:: ai4r
-# Url:: http://ai4r.org/
+# Url:: https://github.com/SergioFierens/ai4r
 #
-# You can redistribute it and/or modify it under the terms of
-# the Mozilla Public License version 1.1 as published by the
+# You can redistribute it and/or modify it under the terms of
+# the Mozilla Public License version 1.1 as published by the
 # Mozilla Foundation at http://www.mozilla.org/MPL/MPL-1.1.txt
 
-require File.dirname(__FILE__) + '/../data/parameterizable'
+require_relative '../data/parameterizable'
 
 module Ai4r
   module Clusterers
-
     # The purpose of this class is to define a common API for Clusterers.
-    # All methods in this class (other than eval) must be implemented in
-    # subclasses.
+    # All methods in this class (other than eval) must be implemented in
+    # subclasses.
     class Clusterer
-
       include Ai4r::Data::Parameterizable
-
+
       # Build a new clusterer, using data examples found in data_set.
       # Data items will be clustered in "number_of_clusters" different
       # clusters.
+      # @param data_set [Object]
+      # @param number_of_clusters [Object]
+      # @return [Object]
       def build(data_set, number_of_clusters)
         raise NotImplementedError
       end
-
+
       # Classifies the given data item, returning the cluster it belongs to.
+      # @param data_item [Object]
+      # @return [Object]
      def eval(data_item)
        raise NotImplementedError
      end
-
-      protected
+
+      # Returns +true+ if this clusterer supports evaluating new data items
+      # with {#eval}. Hierarchical algorithms that only build a dendrogram
+      # will override this method to return +false+.
+      # @return [Object]
+      def supports_eval?
+        true
+      end
+
+      protected
+
+      # @param array [Object]
+      # @return [Object]
       def get_min_index(array)
         min = array.first
         index = 0
@@ -42,9 +58,8 @@ module Ai4r
            index = i
          end
        end
-        return index
+        index
      end
-
    end
  end
 end
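
The new supports_eval? hook makes the base-class contract explicit. A small sketch (made-up data) of how calling code can branch on it instead of rescuing NotImplementedError:

  require 'ai4r'

  data = Ai4r::Data::DataSet.new(
    data_labels: %w[x y],
    data_items: [[0, 0], [1, 1], [8, 8], [9, 9]]
  )

  [Ai4r::Clusterers::KMeans, Ai4r::Clusterers::CentroidLinkage].each do |klass|
    clusterer = klass.new.build(data, 2)
    if clusterer.supports_eval?
      puts "#{klass.name}: [1, 2] falls in cluster #{clusterer.eval([1, 2])}"
    else
      puts "#{klass.name}: builds a dendrogram only, eval unsupported"
    end
  end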
data/lib/ai4r/clusterers/complete_linkage.rb
@@ -1,67 +1,78 @@
+# frozen_string_literal: true
+
 # Author:: Sergio Fierens (implementation)
 # License:: MPL 1.1
 # Project:: ai4r
-# Url:: http://ai4r.org/
+# Url:: https://github.com/SergioFierens/ai4r
 #
-# You can redistribute it and/or modify it under the terms of
-# the Mozilla Public License version 1.1 as published by the
+# You can redistribute it and/or modify it under the terms of
+# the Mozilla Public License version 1.1 as published by the
 # Mozilla Foundation at http://www.mozilla.org/MPL/MPL-1.1.txt
 
-require File.dirname(__FILE__) + '/../data/data_set'
-require File.dirname(__FILE__) + '/../clusterers/single_linkage'
+require_relative '../data/data_set'
+require_relative '../clusterers/single_linkage'
+require_relative '../clusterers/cluster_tree'
 
 module Ai4r
   module Clusterers
-
-    # Implementation of a Hierarchical clusterer with complete linkage (Everitt
+    # Implementation of a Hierarchical clusterer with complete linkage (Everitt
     # et al., 2001 ; Jain and Dubes, 1988 ; Sorensen, 1948 ).
-    # Hierarchical clusterer create one cluster per element, and then
+    # Hierarchical clusterer create one cluster per element, and then
     # progressively merge clusters, until the required number of clusters
     # is reached.
-    # With complete linkage, the distance between two clusters is computed as
+    # With complete linkage, the distance between two clusters is computed as
     # the maximum distance between elements of each cluster.
     #
     # D(cx, (ci U cj) = max(D(cx, ci), D(cx, cj))
     class CompleteLinkage < SingleLinkage
-
-      parameters_info :distance_function =>
-        "Custom implementation of distance function. " +
-        "It must be a closure receiving two data items and return the " +
-        "distance between them. By default, this algorithm uses " +
-        "euclidean distance of numeric attributes to the power of 2."
-
-
+      include ClusterTree
+
+      parameters_info distance_function:
+                        'Custom implementation of distance function. ' \
+                        'It must be a closure receiving two data items and return the ' \
+                        'distance between them. By default, this algorithm uses ' \
+                        'euclidean distance of numeric attributes to the power of 2.'
+
       # Build a new clusterer, using data examples found in data_set.
       # Items will be clustered in "number_of_clusters" different
       # clusters.
-      def build(data_set, number_of_clusters)
+      # @param data_set [Object]
+      # @param number_of_clusters [Object]
+      # @param *options [Object]
+      # @return [Object]
+      def build(data_set, number_of_clusters = 1, **options)
         super
       end
-
-      # Classifies the given data item, returning the cluster index it belongs
+
+      # Classifies the given data item, returning the cluster index it belongs
       # to (0-based).
-      def eval(data_item)
-        super
-      end
-
+      # @param data_item [Object]
+      # @return [Object]
+
       protected
-
+
       # return distance between cluster cx and new cluster (ci U cj),
       # using complete linkage
-      def linkage_distance(cx, ci, cj)
-        [read_distance_matrix(cx, ci),
-         read_distance_matrix(cx, cj)].max
+      # @param cx [Object]
+      # @param ci [Object]
+      # @param cj [Object]
+      # @return [Object]
+      def linkage_distance(cluster_x, cluster_i, cluster_j)
+        [read_distance_matrix(cluster_x, cluster_i),
+         read_distance_matrix(cluster_x, cluster_j)].max
       end
-
+
+      # @param data_item [Object]
+      # @param cluster [Object]
+      # @return [Object]
       def distance_between_item_and_cluster(data_item, cluster)
         max_dist = 0
         cluster.data_items.each do |another_item|
           dist = @distance_function.call(data_item, another_item)
           max_dist = dist if dist > max_dist
         end
-        return max_dist
+        max_dist
       end
-
     end
   end
 end
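
A sketch of plugging a custom closure into the distance_function parameter documented above, here Manhattan distance purely as an illustration (data values are made up):

  require 'ai4r'

  manhattan = ->(a, b) { a.zip(b).sum { |x, y| (x - y).abs } }

  data = Ai4r::Data::DataSet.new(
    data_labels: %w[x y],
    data_items: [[0, 0], [0, 2], [6, 6], [7, 6]]
  )

  clusterer = Ai4r::Clusterers::CompleteLinkage.new
  clusterer.set_parameters(distance_function: manhattan)
  clusterer.build(data, 2)
  puts clusterer.clusters.map(&:data_items).inspect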
data/lib/ai4r/clusterers/dbscan.rb
@@ -0,0 +1,134 @@
+# frozen_string_literal: true
+
+# Author:: Gwénaël Rault (implementation)
+# License:: AGPL-3.0
+# Project:: ai4r
+# Url:: https://github.com/SergioFierens/ai4r
+
+require_relative '../data/data_set'
+require_relative '../data/proximity'
+require_relative '../clusterers/clusterer'
+
+module Ai4r
+  module Clusterers
+    # More about DBSCAN algorithm:
+    # https://en.wikipedia.org/wiki/DBSCAN
+    class DBSCAN < Clusterer
+      attr_reader :data_set, :number_of_clusters, :clusters, :cluster_indices, :labels
+
+      parameters_info epsilon: 'Squared radius used with squared Euclidean distance.',
+                      min_points: 'Minimum neighbours excluding the point itself required to form a cluster.',
+                      distance_function: 'Optional closure computing distance; defaults to squared Euclidean.'
+
+      def initialize
+        super()
+        @distance_function = nil
+        @epsilon = nil
+        @min_points = 5
+        @clusters = []
+        @cluster_indices = []
+      end
+
+      # Build a new clusterer using data from +data_set+.
+      # An optional +number_of_clusters+ argument is ignored and present only to
+      # keep a consistent interface with other clusterers.
+      #
+      # @param data_set [Ai4r::Data::DataSet]
+      # @param number_of_clusters [Integer, nil]
+      # @return [DBSCAN]
+      def build(data_set, _number_of_clusters = nil)
+        @data_set = data_set
+        @clusters = []
+        @cluster_indices = []
+        @labels = Array.new(data_set.data_items.size)
+        @number_of_clusters = 0
+
+        raise ArgumentError, 'epsilon must be defined' if @epsilon.nil?
+
+        # Detect if the neighborhood of the current item
+        # is dense enough
+        data_set.data_items.each_with_index do |data_item, data_index|
+          next unless @labels[data_index].nil?
+
+          neighbors = range_query(data_item) - [data_index]
+          if neighbors.size < @min_points
+            @labels[data_index] = :noise
+          else
+            @number_of_clusters += 1
+            @labels[data_index] = @number_of_clusters
+            ds = Ai4r::Data::DataSet.new(data_labels: @data_set.data_labels)
+            ds << data_item
+            @clusters.push(ds)
+            @cluster_indices.push([data_index])
+            extend_cluster(neighbors, @number_of_clusters)
+          end
+        end
+
+        raise 'number_of_clusters must be positive' if !@clusters.empty? && @number_of_clusters <= 0
+
+        valid_labels = (1..@number_of_clusters).to_a << :noise
+        raise 'labels must be cluster ids or :noise' unless @labels.all? { |l| valid_labels.include?(l) }
+
+        self
+      end
+
+      # This algorithm cannot classify new data items once it has been built.
+      # Rebuild the cluster with your new data item instead.
+      # @param _data_item [Object]
+      # @return [Object]
+      def eval(_data_item)
+        raise NotImplementedError, 'Eval of new data is not supported by this algorithm.'
+      end
+
+      # @return [Object]
+      def supports_eval?
+        false
+      end
+
+      def distance(a, b)
+        return @distance_function.call(a, b) if @distance_function
+
+        Ai4r::Data::Proximity.squared_euclidean_distance(
+          a.select { |att_a| att_a.is_a? Numeric },
+          b.select { |att_b| att_b.is_a? Numeric }
+        )
+      end
+
+      protected
+
+      # Scan the data set and return the indices of all points
+      # belonging to the neighborhood of the current item
+      def range_query(evaluated_data_item)
+        neighbors = []
+        @data_set.data_items.each_with_index do |data_item, data_index|
+          neighbors << data_index if distance(evaluated_data_item, data_item) <= @epsilon
+        end
+        neighbors
+      end
+
+      # Expand the cluster by visiting neighbours of the current point.
+      # Skip neighbours already assigned to another cluster.
+      # If a neighbour was previously labeled as noise, assign it to the current
+      # cluster.
+      def extend_cluster(neighbors, current_cluster)
+        while neighbors.any?
+          data_index = neighbors.shift
+          if @labels[data_index] == :noise
+            @labels[data_index] = current_cluster
+            @clusters.last << @data_set.data_items[data_index]
+            @cluster_indices.last << data_index
+          elsif @labels[data_index].nil?
+            @labels[data_index] = current_cluster
+            @clusters.last << @data_set.data_items[data_index]
+            @cluster_indices.last << data_index
+            new_neighbors = range_query(@data_set.data_items[data_index]) - [data_index]
+            if new_neighbors.size >= @min_points
+              neighbors.concat(new_neighbors)
+              neighbors.uniq!
+            end
+          end
+        end
+      end
+    end
+  end
+end
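
Finally, a usage sketch for the new DBSCAN clusterer (sample data and parameter values are illustrative): epsilon is a squared radius because the default metric is squared Euclidean, and labels ends up holding 1-based cluster ids or :noise.

  require 'ai4r'

  data = Ai4r::Data::DataSet.new(
    data_labels: %w[x y],
    data_items: [[1, 1], [1, 2], [2, 1], [2, 2], [8, 8], [8, 9], [9, 8], [50, 50]]
  )

  dbscan = Ai4r::Clusterers::DBSCAN.new
  dbscan.set_parameters(epsilon: 4, min_points: 2) # radius 2, at least 2 neighbours
  dbscan.build(data)                               # cluster count is not preset

  puts dbscan.number_of_clusters # discovered from density, e.g. 2 here
  puts dbscan.labels.inspect     # e.g. [1, 1, 1, 1, 2, 2, 2, :noise]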