RubyGems - rumale - Versions diffs - 0.14.3 → 0.14.4 - Mend

rumale 0.14.3 → 0.14.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (8) hide show

checksums.yaml +4 -4
data/CHANGELOG.md +9 -0
data/lib/rumale.rb +1 -0
data/lib/rumale/nearest_neighbors/k_neighbors_classifier.rb +57 -17
data/lib/rumale/nearest_neighbors/k_neighbors_regressor.rb +44 -14
data/lib/rumale/nearest_neighbors/vp_tree.rb +132 -0
data/lib/rumale/version.rb +1 -1
metadata +3 -2

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: d1df6dee93147a75173bc099cd68dd116e7729c1
-  data.tar.gz: da85a19ca4964ee95cf026a69f2610b0c5d2b92c
+  metadata.gz: 94deb528b418e7c6e86c0a4b0c3969406cea827c
+  data.tar.gz: ce85f0e6182230b9db81c4cff86685fba6220748
 SHA512:
-  metadata.gz: 893ae704bf217de39ee1b4ccbb0601ffea3804252c992c5b4d79f9dc68171d6a7f0be8d6218af315cd540331bc6b91627d8c990cd53c10cf3d13e5a5123481c0
-  data.tar.gz: 5faf0ce1a7f38974a996534b0817557fea033bf0aea5a867f1fd2460db7153a09bc4ab7ddc233791d73be7c220be9da714830b8dcfbc20a9b366164ef48ea617
+  metadata.gz: 00e59b4343e6f393431f5625a825c409d392d823529dbc92ebd3ded86029c7432546ccd474a9c11253dc39d489f35397c8eab46fab8be3498dbe354a2c94b7c6
+  data.tar.gz: 166505de18d18e77eecb8b259f4dfd883fe4687c514b4586d361049d2d3977758b0c53aeea8e26c8da401ad8b3cf0000d11e21c0badc1990728777064cd2c0c7

data/CHANGELOG.md CHANGED Viewed

@@ -1,3 +1,12 @@
+# 0.14.4
+- Add metric parameter that specifies distance metric to
+[KNeighborsClassifier](https://yoshoku.github.io/rumale/doc/Rumale/NearestNeighbors/KNeighborsClassifier.html) and
+[KNeighborsRegressor](https://yoshoku.github.io/rumale/doc/Rumale/NearestNeighbors/KNeighborsRegressor.html).
+- Add algorithm parameter that specifies nearest neighbor search algorithm to
+[KNeighborsClassifier](https://yoshoku.github.io/rumale/doc/Rumale/NearestNeighbors/KNeighborsClassifier.html) and
+[KNeighborsRegressor](https://yoshoku.github.io/rumale/doc/Rumale/NearestNeighbors/KNeighborsRegressor.html).
+- Add nearest neighbor search class with [vantage point tree](https://yoshoku.github.io/rumale/doc/Rumale/NearestNeighbors/VPTree.html).
 # 0.14.3
 - Fix documents of GradientBoosting, RandomForest, and ExtraTrees.
 - Refactor gaussian mixture clustering with Rubocop.

data/lib/rumale.rb CHANGED Viewed

@@ -40,6 +40,7 @@ require 'rumale/polynomial_model/base_factorization_machine'
 require 'rumale/polynomial_model/factorization_machine_classifier'
 require 'rumale/polynomial_model/factorization_machine_regressor'
 require 'rumale/multiclass/one_vs_rest_classifier'
+require 'rumale/nearest_neighbors/vp_tree'
 require 'rumale/nearest_neighbors/k_neighbors_classifier'
 require 'rumale/nearest_neighbors/k_neighbors_regressor'
 require 'rumale/naive_bayes/naive_bayes'

data/lib/rumale/nearest_neighbors/k_neighbors_classifier.rb CHANGED Viewed

@@ -20,11 +20,13 @@ module Rumale
       include Base::Classifier
       # Return the prototypes for the nearest neighbor classifier.
-      # @return [Numo::DFloat] (shape: [n_samples, n_features])
+      # If the metric is 'precomputed', that returns nil.
+      # If the algorithm is 'vptree', that returns Rumale::NearestNeighbors::VPTree.
+      # @return [Numo::DFloat] (shape: [n_training_samples, n_features])
       attr_reader :prototypes
       # Return the labels of the prototypes
-      # @return [Numo::Int32] (size: n_samples)
+      # @return [Numo::Int32] (size: n_training_samples)
       attr_reader :labels
       # Return the class labels.
@@ -34,11 +36,21 @@ module Rumale
       # Create a new classifier with the nearest neighbor rule.
       #
       # @param n_neighbors [Integer] The number of neighbors.
-      def initialize(n_neighbors: 5)
+      # @param algorithm [String] The algorithm used for finding the nearest neighbors.
+      #   If algorithm is 'brute', brute-force search will be used.
+      #   If algorithm is 'vptree', vantage point tree will be used.
+      #   This parameter is ignored when metric parameter is 'precomputed'.
+      # @param metric [String] The metric to calculate the distances.
+      #   If metric is 'euclidean', Euclidean distance is calculated for distance between points.
+      #   If metric is 'precomputed', the fit and predict methods expect to be given a distance matrix.
+      def initialize(n_neighbors: 5, algorithm: 'brute', metric: 'euclidean')
         check_params_numeric(n_neighbors: n_neighbors)
         check_params_positive(n_neighbors: n_neighbors)
+        check_params_string(algorith: algorithm, metric: metric)
         @params = {}
         @params[:n_neighbors] = n_neighbors
+        @params[:algorithm] = algorithm == 'vptree' ? 'vptree' : 'brute'
+        @params[:metric] = metric == 'precomputed' ? 'precomputed' : 'euclidean'
         @prototypes = nil
         @labels = nil
         @classes = nil
@@ -46,14 +58,22 @@ module Rumale
       # Fit the model with given training data.
       #
-      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for fitting the model.
-      # @param y [Numo::Int32] (shape: [n_samples]) The labels to be used for fitting the model.
+      # @param x [Numo::DFloat] (shape: [n_training_samples, n_features]) The training data to be used for fitting the model.
+      #   If the metric is 'precomputed', x must be a square distance matrix (shape: [n_training_samples, n_training_samples]).
+      # @param y [Numo::Int32] (shape: [n_training_samples]) The labels to be used for fitting the model.
       # @return [KNeighborsClassifier] The learned classifier itself.
       def fit(x, y)
         x = check_convert_sample_array(x)
         y = check_convert_label_array(y)
         check_sample_label_size(x, y)
-        @prototypes = Numo::DFloat.asarray(x.to_a)
+        raise ArgumentError, 'Expect the input distance matrix to be square.' if @params[:metric] == 'precomputed' && x.shape[0] != x.shape[1]
+        @prototypes = if @params[:metric] == 'euclidean'
+                        if @params[:algorithm] == 'vptree'
+                          VPTree.new(x)
+                        else
+                          x.dup
+                        end
+                      end
         @labels = Numo::Int32.asarray(y.to_a)
         @classes = Numo::Int32.asarray(y.to_a.uniq.sort)
         self
@@ -61,30 +81,50 @@ module Rumale
       # Calculate confidence scores for samples.
       #
-      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to compute the scores.
-      # @return [Numo::DFloat] (shape: [n_samples, n_classes]) Confidence scores per sample for each class.
+      # @param x [Numo::DFloat] (shape: [n_testing_samples, n_features]) The samples to compute the scores.
+      #   If the metric is 'precomputed', x must be a distance matrix (shape: [n_testing_samples, n_training_samples]).
+      # @return [Numo::DFloat] (shape: [n_testing_samples, n_classes]) Confidence scores per sample for each class.
       def decision_function(x)
         x = check_convert_sample_array(x)
-        distance_matrix = PairwiseMetric.euclidean_distance(x, @prototypes)
-        n_samples, n_prototypes = distance_matrix.shape
-        n_classes = @classes.size
+        if @params[:metric] == 'precomputed' && x.shape[1] != @labels.size
+          raise ArgumentError, 'Expect the size input matrix to be n_testing_samples-by-n_training_samples.'
+        end
+        n_prototypes = @labels.size
         n_neighbors = [@params[:n_neighbors], n_prototypes].min
+        n_samples = x.shape[0]
+        n_classes = @classes.size
         scores = Numo::DFloat.zeros(n_samples, n_classes)
-        n_samples.times do |m|
-          neighbor_ids = distance_matrix[m, true].to_a.each_with_index.sort.map(&:last)[0...n_neighbors]
-          neighbor_ids.each { |n| scores[m, @classes.to_a.index(@labels[n])] += 1.0 }
+        if @params[:metric] == 'euclidean' && @params[:algorithm] == 'vptree'
+          neighbor_ids, = @prototypes.query(x, n_neighbors)
+          n_samples.times do |m|
+            neighbor_ids[m, true].each { |n| scores[m, @classes.to_a.index(@labels[n])] += 1.0 }
+          end
+        else
+          distance_matrix = @params[:metric] == 'precomputed' ? x : PairwiseMetric.euclidean_distance(x, @prototypes)
+          n_samples.times do |m|
+            neighbor_ids = distance_matrix[m, true].to_a.each_with_index.sort.map(&:last)[0...n_neighbors]
+            neighbor_ids.each { |n| scores[m, @classes.to_a.index(@labels[n])] += 1.0 }
+          end
         end
         scores
       end
       # Predict class labels for samples.
       #
-      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the labels.
-      # @return [Numo::Int32] (shape: [n_samples]) Predicted class label per sample.
+      # @param x [Numo::DFloat] (shape: [n_testing_samples, n_features]) The samples to predict the labels.
+      #   If the metric is 'precomputed', x must be a distance matrix (shape: [n_testing_samples, n_training_samples]).
+      # @return [Numo::Int32] (shape: [n_testing_samples]) Predicted class label per sample.
       def predict(x)
         x = check_convert_sample_array(x)
-        n_samples = x.shape.first
+        if @params[:metric] == 'precomputed' && x.shape[1] != @labels.size
+          raise ArgumentError, 'Expect the size input matrix to be n_samples-by-n_training_samples.'
+        end
         decision_values = decision_function(x)
+        n_samples = x.shape[0]
         Numo::Int32.asarray(Array.new(n_samples) { |n| @classes[decision_values[n, true].max_index] })
       end

data/lib/rumale/nearest_neighbors/k_neighbors_regressor.rb CHANGED Viewed

@@ -19,55 +19,85 @@ module Rumale
       include Base::Regressor
       # Return the prototypes for the nearest neighbor regressor.
-      # @return [Numo::DFloat] (shape: [n_samples, n_features])
+      # If the metric is 'precomputed', that returns nil.
+      # If the algorithm is 'vptree', that returns Rumale::NearestNeighbors::VPTree.
+      # @return [Numo::DFloat] (shape: [n_training_samples, n_features])
       attr_reader :prototypes
       # Return the values of the prototypes
-      # @return [Numo::DFloat] (shape: [n_samples, n_outputs])
+      # @return [Numo::DFloat] (shape: [n_training_samples, n_outputs])
       attr_reader :values
       # Create a new regressor with the nearest neighbor rule.
       #
       # @param n_neighbors [Integer] The number of neighbors.
-      def initialize(n_neighbors: 5)
+      # @param algorithm [String] The algorithm used for finding the nearest neighbors.
+      #   If algorithm is 'brute', brute-force search will be used.
+      #   If algorithm is 'vptree', vantage point tree will be used.
+      #   This parameter is ignored when metric parameter is 'precomputed'.
+      # @param metric [String] The metric to calculate the distances.
+      #   If metric is 'euclidean', Euclidean distance is calculated for distance between points.
+      #   If metric is 'precomputed', the fit and predict methods expect to be given a distance matrix.
+      def initialize(n_neighbors: 5, algorithm: 'brute', metric: 'euclidean')
         check_params_numeric(n_neighbors: n_neighbors)
         check_params_positive(n_neighbors: n_neighbors)
+        check_params_string(algorith: algorithm, metric: metric)
         @params = {}
         @params[:n_neighbors] = n_neighbors
+        @params[:algorithm] = algorithm == 'vptree' ? 'vptree' : 'brute'
+        @params[:metric] = metric == 'precomputed' ? 'precomputed' : 'euclidean'
         @prototypes = nil
         @values = nil
       end
       # Fit the model with given training data.
       #
-      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for fitting the model.
-      # @param y [Numo::DFloat] (shape: [n_samples, n_outputs]) The target values to be used for fitting the model.
+      # @param x [Numo::DFloat] (shape: [n_training_samples, n_features]) The training data to be used for fitting the model.
+      #   If the metric is 'precomputed', x must be a square distance matrix (shape: [n_training_samples, n_training_samples]).
+      # @param y [Numo::DFloat] (shape: [n_training_samples, n_outputs]) The target values to be used for fitting the model.
       # @return [KNeighborsRegressor] The learned regressor itself.
       def fit(x, y)
         x = check_convert_sample_array(x)
         y = check_convert_tvalue_array(y)
         check_sample_tvalue_size(x, y)
-        @prototypes = x.dup
+        raise ArgumentError, 'Expect the input distance matrix to be square.' if @params[:metric] == 'precomputed' && x.shape[0] != x.shape[1]
+        @prototypes = if @params[:metric] == 'euclidean'
+                        if @params[:algorithm] == 'vptree'
+                          VPTree.new(x)
+                        else
+                          x.dup
+                        end
+                      end
         @values = y.dup
         self
       end
       # Predict values for samples.
       #
-      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the values.
-      # @return [Numo::DFloat] (shape: [n_samples, n_outputs]) Predicted values per sample.
+      # @param x [Numo::DFloat] (shape: [n_testing_samples, n_features]) The samples to predict the values.
+      #   If the metric is 'precomputed', x must be a distance matrix (shape: [n_testing_samples, n_training_samples]).
+      # @return [Numo::DFloat] (shape: [n_testing_samples, n_outputs]) Predicted values per sample.
       def predict(x)
         x = check_convert_sample_array(x)
+        if @params[:metric] == 'precomputed' && x.shape[1] != @values.shape[0]
+          raise ArgumentError, 'Expect the size input matrix to be n_testing_samples-by-n_training_samples.'
+        end
         # Initialize some variables.
-        n_samples, = x.shape
+        n_samples = x.shape[0]
         n_prototypes, n_outputs = @values.shape
         n_neighbors = [@params[:n_neighbors], n_prototypes].min
-        # Calculate distance matrix.
-        distance_matrix = PairwiseMetric.euclidean_distance(x, @prototypes)
         # Predict values for the given samples.
-        predicted_values = Array.new(n_samples) do |n|
-          neighbor_ids = distance_matrix[n, true].to_a.each_with_index.sort.map(&:last)[0...n_neighbors]
-          n_outputs.nil? ? @values[neighbor_ids].mean : @values[neighbor_ids, true].mean(0).to_a
+        if @params[:metric] == 'euclidean' && @params[:algorithm] == 'vptree'
+          neighbor_ids, = @prototypes.query(x, n_neighbors)
+          predicted_values = Array.new(n_samples) do |n|
+            n_outputs.nil? ? @values[neighbor_ids[n, true]].mean : @values[neighbor_ids[n, true], true].mean(0).to_a
+          end
+        else
+          distance_matrix = @params[:metric] == 'precomputed' ? x : PairwiseMetric.euclidean_distance(x, @prototypes)
+          predicted_values = Array.new(n_samples) do |n|
+            neighbor_ids = distance_matrix[n, true].to_a.each_with_index.sort.map(&:last)[0...n_neighbors]
+            n_outputs.nil? ? @values[neighbor_ids].mean : @values[neighbor_ids, true].mean(0).to_a
+          end
         end
         Numo::DFloat[*predicted_values]
       end

data/lib/rumale/nearest_neighbors/vp_tree.rb ADDED Viewed

@@ -0,0 +1,132 @@
+# frozen_string_literal: true
+require 'rumale/validation'
+require 'rumale/pairwise_metric'
+require 'rumale/base/base_estimator'
+module Rumale
+  module NearestNeighbors
+    # VPTree is a class that implements the nearest neighbor searcher based on vantage point tree.
+    # This implementation, unlike the paper, does not perform random sampling with vantage point selection.
+    # This class is used internally for k-nearest neighbor estimators.
+    #
+    # *Reference*
+    # P N. Yianilos, "Data Structures and Algorithms for Nearest Neighbor Search in General Metric Spaces," Proc. SODA'93, pp. 311--321, 1993.
+    class VPTree
+      include Validation
+      include Base::BaseEstimator
+      # Return the training data.
+      # @return [Numo::DFloat] (shape: [n_samples, n_features])
+      attr_reader :data
+      # Create a search index with vantage point tree algorithm.
+      #
+      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The data used to generate the search index.
+      # @param min_samples_leaf [Integer] The minimum number of samples at a leaf node.
+      def initialize(x, min_samples_leaf: 1)
+        check_params_numeric(min_samples_leaf: min_samples_leaf)
+        check_params_positive(min_samples_leaf: min_samples_leaf)
+        @params = {}
+        @params[:min_samples_leaf] = min_samples_leaf
+        @data = x
+        @tree = build_tree(Numo::Int32.cast([*0...@data.shape[0]]))
+      end
+      # Search k-nearest neighbors of given query point.
+      #
+      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to be query points.
+      # @param k [Integer] The number of nearest neighbors to retrieve for each query point.
+      # @return [Array<Numo::Int32, Numo::DFloat>] The indices and distances of retrieved k-nearest neighbors.
+      def query(x, k = 1)
+        x = check_convert_sample_array(x)
+        check_params_numeric(k: k)
+        check_params_positive(k: k)
+        n_samples = x.shape[0]
+        rel_ids = []
+        rel_dists = []
+        n_samples.times do |n|
+          q = x[n, true]
+          rel_node = search(q, @tree, k)
+          dist_arr = calc_distances(q, @data[rel_node.sample_ids, true])
+          rank_ids = dist_arr.sort_index[0...k]
+          rel_ids.push(rel_node.sample_ids[rank_ids].dup)
+          rel_dists.push(dist_arr[rank_ids].dup)
+        end
+        [Numo::Int32.cast(rel_ids), Numo::DFloat.cast(rel_dists)]
+      end
+      private
+      Node = Struct.new(:sample_ids, :n_samples, :vantage_point_id, :threshold, :left, :right) do
+        def leaf?
+          vantage_point_id.nil?
+        end
+      end
+      private_constant :Node
+      def search(q, node, k, tau = Float::INFINITY)
+        return node if node.leaf?
+        dist = Math.sqrt(((q - @data[node.vantage_point_id, true])**2).sum)
+        tau = dist if dist < tau
+        # :nocov:
+        if dist < node.threshold
+          if dist - tau <= node.threshold
+            node.left.n_samples < k ? node : search(q, node.left, k, tau)
+          elsif dist + tau >= node.threshold
+            node.right.n_samples < k ? node : search(q, node.right, k, tau)
+          else
+            node
+          end
+        else
+          if dist + tau >= node.threshold
+            node.right.n_samples < k ? node : search(q, node.right, k, tau)
+          elsif dist - tau <= node.threshold
+            node.left.n_samples < k ? node : search(q, node.left, k, tau)
+          else
+            node
+          end
+        end
+        # :nocov:
+      end
+      def build_tree(sample_ids)
+        n_samples = sample_ids.size
+        node = Node.new
+        node.n_samples = n_samples
+        node.sample_ids = sample_ids
+        return node if n_samples <= @params[:min_samples_leaf]
+        vantage_point_id = select_vantage_point_id(sample_ids)
+        distance_arr = calc_distances(@data[vantage_point_id, true], @data[sample_ids, true])
+        threshold = distance_arr.median
+        left_flgs = distance_arr.lt(threshold)
+        right_flgs = distance_arr.ge(threshold)
+        return node if left_flgs.count < @params[:min_samples_leaf] || right_flgs.count < @params[:min_samples_leaf]
+        node.left = build_tree(sample_ids[left_flgs])
+        node.right = build_tree(sample_ids[right_flgs])
+        node.vantage_point_id = vantage_point_id
+        node.threshold = threshold
+        node
+      end
+      def select_vantage_point_id(sample_ids)
+        dist_mat = Rumale::PairwiseMetric.euclidean_distance(@data[sample_ids, true])
+        means = dist_mat.mean(0)
+        vars = ((dist_mat - means)**2).mean(0)
+        sample_ids[vars.max_index]
+      end
+      def calc_distances(q, x)
+        Rumale::PairwiseMetric.euclidean_distance(q.expand_dims(0), x).flatten.dup
+      end
+    end
+  end
+end

data/lib/rumale/version.rb CHANGED Viewed

@@ -3,5 +3,5 @@
 # Rumale is a machine learning library in Ruby.
 module Rumale
   # The version of Rumale you are using.
-  VERSION = '0.14.3'
+  VERSION = '0.14.4'
 end

metadata CHANGED Viewed

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: rumale
 version: !ruby/object:Gem::Version
-  version: 0.14.3
+  version: 0.14.4
 platform: ruby
 authors:
 - yoshoku
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2019-12-16 00:00:00.000000000 Z
+date: 2019-12-23 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: numo-narray
@@ -230,6 +230,7 @@ files:
 - lib/rumale/naive_bayes/naive_bayes.rb
 - lib/rumale/nearest_neighbors/k_neighbors_classifier.rb
 - lib/rumale/nearest_neighbors/k_neighbors_regressor.rb
+- lib/rumale/nearest_neighbors/vp_tree.rb
 - lib/rumale/neural_network/base_mlp.rb
 - lib/rumale/neural_network/mlp_classifier.rb
 - lib/rumale/neural_network/mlp_regressor.rb