RubyGems - rumale - Versions diffs - 0.13.0 → 0.13.1 - Mend

rumale 0.13.0 → 0.13.1

Files changed (14) hide show

checksums.yaml +4 -4
data/CHANGELOG.md +8 -0
data/README.md +2 -2
data/lib/rumale.rb +1 -0
data/lib/rumale/clustering/dbscan.rb +25 -13
data/lib/rumale/clustering/k_medoids.rb +2 -2
data/lib/rumale/clustering/snn.rb +76 -0
data/lib/rumale/decomposition/pca.rb +2 -1
data/lib/rumale/linear_model/linear_regression.rb +54 -15
data/lib/rumale/linear_model/ridge.rb +57 -17
data/lib/rumale/pairwise_metric.rb +18 -5
data/lib/rumale/version.rb +1 -1
data/rumale.gemspec +1 -1
metadata +4 -3

checksums.yaml CHANGED

@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: cdd9f655f3a1f95edc68476dd614deac7718ef65
-  data.tar.gz: f979c070a55fc8fb2afce0c04b1fa26bf92aa2af
+  metadata.gz: ce88d7170fd676377227427a0be90f8bdb1a9c97
+  data.tar.gz: 04f0d07e6d098768eda726fc82f864420678e427
 SHA512:
-  metadata.gz: 94a6aed271a2f0da786544aaef2dc81b62c357f7dbd552c5ebbdd6a67c39b3584b9b139a6ea8be269a030ca2c7ec0f5852ccc3990492d3f4aabf55ab46172d9a
-  data.tar.gz: b89de3059991cecfe92d492193287777cd8278715ee55e52868005a8df5489005f02ed862b8a782111e0cc8f76da67b1965b690e5a5876c7877025d7657f6be3
+  metadata.gz: 203444f0e7d833946f67c2ee922e02a48b7174c20eac84480e190f8749e150e0c5ed18e3d7b7d30480e565483b5a5b51d1990cced7e09b5db027d8c508fa4313
+  data.tar.gz: e608c97fc0d29c018c778f9cc96cd53b0edff927c5631bd3b0cb606ee93f4e8c647ed2c76e7835b49f1933c0e5aeccb1ffbda4fe9aec59a2689f7bde4a28e103

data/CHANGELOG.md CHANGED

@@ -1,3 +1,11 @@
+# 0.13.1
+- Add class for Shared Nearest Neighbor clustering.
+- Add function for calculation of manhattan distance to Rumale::PairwiseMetric.
+- Add metric parameter that specifies distance metric to Rumale::Clustering::DBSCAN.
+- Add the solver parameter that specifies the optimization algorithm to Rumale::LinearModel::LinearRegression.
+- Add the solver parameter that specifies the optimization algorithm to Rumale::LinearModel::Ridge.
+- Fix bug that the ndim of NArray of 1-dimensional principal components is not 1.
 # 0.13.0
 - Introduce [Numo::Linalg](https://github.com/ruby-numo/numo-linalg) to use linear algebra algorithms on the optimization.
 - Add the solver parameter that specifies the optimization algorithm to Rumale::Decomposition::PCA.

data/README.md CHANGED

@@ -6,14 +6,14 @@
 [![Coverage Status](https://coveralls.io/repos/github/yoshoku/rumale/badge.svg?branch=master)](https://coveralls.io/github/yoshoku/rumale?branch=master)
 [![Gem Version](https://badge.fury.io/rb/rumale.svg)](https://badge.fury.io/rb/rumale)
 [![BSD 2-Clause License](https://img.shields.io/badge/License-BSD%202--Clause-orange.svg)](https://github.com/yoshoku/rumale/blob/master/LICENSE.txt)
-[![Documentation](http://img.shields.io/badge/docs-rdoc.info-blue.svg)](https://www.rubydoc.info/gems/rumale/0.13.0)
+[![Documentation](http://img.shields.io/badge/docs-rdoc.info-blue.svg)](https://www.rubydoc.info/gems/rumale/0.13.1)
 Rumale (**Ru**by **ma**chine **le**arning) is a machine learning library in Ruby.
 Rumale provides machine learning algorithms with interfaces similar to Scikit-Learn in Python.
 Rumale supports Linear / Kernel Support Vector Machine,
 Logistic Regression, Linear Regression, Ridge, Lasso, Factorization Machine,
 Naive Bayes, Decision Tree, AdaBoost, Gradient Tree Boosting, Random Forest, Extra-Trees, K-nearest neighbor classifier,
-K-Means, K-Medoids, Gaussian Mixture Model, DBSCAN, Power Iteration Clustering,
+K-Means, K-Medoids, Gaussian Mixture Model, DBSCAN, SNN, Power Iteration Clustering,
 Multidimensional Scaling, t-SNE, Principal Component Analysis, and Non-negative Matrix Factorization.
 This project was formerly known as "SVMKit".

data/lib/rumale.rb CHANGED

@@ -60,6 +60,7 @@ require 'rumale/clustering/k_means'
 require 'rumale/clustering/k_medoids'
 require 'rumale/clustering/gaussian_mixture'
 require 'rumale/clustering/dbscan'
+require 'rumale/clustering/snn'
 require 'rumale/clustering/power_iteration'
 require 'rumale/decomposition/pca'
 require 'rumale/decomposition/nmf'

data/lib/rumale/clustering/dbscan.rb CHANGED

@@ -7,7 +7,6 @@ require 'rumale/pairwise_metric'
 module Rumale
   module Clustering
     # DBSCAN is a class that implements DBSCAN cluster analysis.
-    # The current implementation uses the Euclidean distance for analyzing the clusters.
     #
     # @example
     #   analyzer = Rumale::Clustering::DBSCAN.new(eps: 0.5, min_samples: 5)
@@ -31,12 +30,17 @@ module Rumale
       #
       # @param eps [Float] The radius of neighborhood.
       # @param min_samples [Integer] The number of neighbor samples to be used for the criterion whether a point is a core point.
-      def initialize(eps: 0.5, min_samples: 5)
+      # @param metric [String] The metric to calculate the distances.
+      #   If metric is 'euclidean', Euclidean distance is calculated for distance between points.
+      #   If metric is 'precomputed', the fit and fit_predict methods expect to be given a distance matrix.
+      def initialize(eps: 0.5, min_samples: 5, metric: 'euclidean')
         check_params_float(eps: eps)
         check_params_integer(min_samples: min_samples)
+        check_params_string(metric: metric)
         @params = {}
         @params[:eps] = eps
         @params[:min_samples] = min_samples
+        @params[:metric] = metric == 'precomputed' ? 'precomputed' : 'euclidean'
         @core_sample_ids = nil
         @labels = nil
       end
@@ -46,19 +50,23 @@ module Rumale
       # @overload fit(x) -> DBSCAN
       #
       # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for cluster analysis.
+      #   If the metric is 'precomputed', x must be a square distance matrix (shape: [n_samples, n_samples]).
       # @return [DBSCAN] The learned cluster analyzer itself.
       def fit(x, _y = nil)
         check_sample_array(x)
+        raise ArgumentError, 'Expect the input distance matrix to be square.' if @params[:metric] == 'precomputed' && x.shape[0] != x.shape[1]
         partial_fit(x)
         self
       end
       # Analyze clusters and assign samples to clusters.
       #
-      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for cluster analysis.
+      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to be used for cluster analysis.
+      #   If the metric is 'precomputed', x must be a square distance matrix (shape: [n_samples, n_samples]).
       # @return [Numo::Int32] (shape: [n_samples]) Predicted cluster label per sample.
       def fit_predict(x)
         check_sample_array(x)
+        raise ArgumentError, 'Expect the input distance matrix to be square.' if @params[:metric] == 'precomputed' && x.shape[0] != x.shape[1]
         partial_fit(x)
         labels
       end
@@ -84,19 +92,24 @@ module Rumale
       def partial_fit(x)
         cluster_id = 0
-        n_samples  = x.shape[0]
+        metric_mat = calc_pairwise_metrics(x)
+        n_samples = metric_mat.shape[0]
         @core_sample_ids = []
         @labels = Numo::Int32.zeros(n_samples) - 2
-        n_samples.times do |q|
-          next if @labels[q] >= -1
-          cluster_id += 1 if expand_cluster(x, q, cluster_id)
+        n_samples.times do |query_id|
+          next if @labels[query_id] >= -1
+          cluster_id += 1 if expand_cluster(metric_mat, query_id, cluster_id)
         end
         @core_sample_ids = Numo::Int32[*@core_sample_ids.flatten]
         nil
       end
-      def expand_cluster(x, query_id, cluster_id)
-        target_ids = region_query(x[query_id, true], x)
+      def calc_pairwise_metrics(x)
+        @params[:metric] == 'precomputed' ? x : Rumale::PairwiseMetric.euclidean_distance(x)
+      end
+      def expand_cluster(metric_mat, query_id, cluster_id)
+        target_ids = region_query(metric_mat[query_id, true])
         if target_ids.size < @params[:min_samples]
           @labels[query_id] = -1
           false
@@ -105,7 +118,7 @@ module Rumale
           @core_sample_ids.push(target_ids.dup)
           target_ids.delete(query_id)
           while (m = target_ids.shift)
-            neighbor_ids = region_query(x[m, true], x)
+            neighbor_ids = region_query(metric_mat[m, true])
             next if neighbor_ids.size < @params[:min_samples]
             neighbor_ids.each do |n|
               target_ids.push(n) if @labels[n] < -1
@@ -116,9 +129,8 @@ module Rumale
         end
       end
-      def region_query(query, targets)
-        distance_arr = PairwiseMetric.euclidean_distance(query.expand_dims(0), targets)[0, true]
-        distance_arr.lt(@params[:eps]).where.to_a
+      def region_query(metric_arr)
+        metric_arr.lt(@params[:eps]).where.to_a
       end
     end
   end

data/lib/rumale/clustering/k_medoids.rb CHANGED

@@ -29,8 +29,8 @@ module Rumale
       # Create a new cluster analyzer with K-Medoids method.
       #
       # @param n_clusters [Integer] The number of clusters.
-      # @param metric [String] The metric to calculate the distances in original space.
-      #   If metric is 'euclidean', Euclidean distance is calculated for distance in original space.
+      # @param metric [String] The metric to calculate the distances.
+      #   If metric is 'euclidean', Euclidean distance is calculated for distance between points.
       #   If metric is 'precomputed', the fit and fit_transform methods expect to be given a distance matrix.
       # @param init [String] The initialization method for centroids ('random' or 'k-means++').
       # @param max_iter [Integer] The maximum number of iterations.

data/lib/rumale/clustering/snn.rb ADDED

@@ -0,0 +1,76 @@
+# frozen_string_literal: true
+require 'rumale/pairwise_metric'
+require 'rumale/clustering/dbscan'
+module Rumale
+  module Clustering
+    # SNN is a class that implements Shared Nearest Neighbor cluster analysis.
+    # The SNN method is a variation of DBSCAN that uses similarity based on k-nearest neighbors as a metric.
+    #
+    # @example
+    #   analyzer = Rumale::Clustering::SNN.new(n_neighbors: 10, eps: 5, min_samples: 5)
+    #   cluster_labels = analyzer.fit_predict(samples)
+    #
+    # *Reference*
+    # - L. Ertoz, M. Steinbach, and V. Kumar, "Finding Clusters of Different Sizes, Shapes, and Densities in Noisy, High Dimensional Data," Proc. SDM'03, pp. 47--58, 2003.
+    # - M E. Houle, H-P. Kriegel, P. Kroger, E. Schubert, and A. Zimek, "Can Shared-Neighbor Distances Defeat the Curse of Dimensionality?," Proc. SSDBM'10, pp. 482--500, 2010.
+    class SNN < DBSCAN
+      # Create a new cluster analyzer with Shared Nearest Neighbor method.
+      #
+      # @param n_neighbors [Integer] The number of neighbors to be used for finding k-nearest neighbors.
+      # @param eps [Integer] The threshold value for finding connected components based on similarity.
+      # @param min_samples [Integer] The number of neighbor samples to be used for the criterion whether a point is a core point.
+      # @param metric [String] The metric to calculate the distances.
+      #   If metric is 'euclidean', Euclidean distance is calculated for distance between points.
+      #   If metric is 'precomputed', the fit and fit_predict methods expect to be given a distance matrix.
+      def initialize(n_neighbors: 10, eps: 5, min_samples: 5, metric: 'euclidean')
+        check_params_integer(n_neighbors: n_neighbors, min_samples: min_samples)
+        check_params_string(metric: metric)
+        @params = {}
+        @params[:n_neighbors] = n_neighbors
+        @params[:eps] = eps
+        @params[:min_samples] = min_samples
+        @params[:metric] = metric == 'precomputed' ? 'precomputed' : 'euclidean'
+        @core_sample_ids = nil
+        @labels = nil
+      end
+      # Analyze clusters with given training data.
+      #
+      # @overload fit(x) -> SNN
+      #   @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for cluster analysis.
+      #     If the metric is 'precomputed', x must be a square distance matrix (shape: [n_samples, n_samples]).
+      # @return [SNN] The learned cluster analyzer itself.
+      def fit(x, _y = nil)
+        super
+      end
+      # Analyze clusters and assign samples to clusters.
+      #
+      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to be used for cluster analysis.
+      #   If the metric is 'precomputed', x must be a square distance matrix (shape: [n_samples, n_samples]).
+      # @return [Numo::Int32] (shape: [n_samples]) Predicted cluster label per sample.
+      def fit_predict(x)
+        super
+      end
+      private
+      def calc_pairwise_metrics(x)
+        distance_mat = @params[:metric] == 'precomputed' ? x : Rumale::PairwiseMetric.euclidean_distance(x)
+        n_samples = distance_mat.shape[0]
+        adjacency_mat = Numo::DFloat.zeros(n_samples, n_samples)
+        n_samples.times do |n|
+          neighbor_ids = distance_mat[n, true].sort_index[0...@params[:n_neighbors]]
+          adjacency_mat[n, neighbor_ids] = 1
+        end
+        adjacency_mat.dot(adjacency_mat.transpose)
+      end
+      def region_query(similarity_arr)
+        similarity_arr.gt(@params[:eps]).where.to_a
+      end
+    end
+  end
+end

data/lib/rumale/decomposition/pca.rb CHANGED

@@ -80,7 +80,8 @@ module Rumale
         covariance_mat = centered_x.transpose.dot(centered_x) / (n_samples - 1)
         if @params[:solver] == 'evd' && enable_linalg?
           _, evecs = Numo::Linalg.eigh(covariance_mat, vals_range: (n_features - @params[:n_components])...n_features)
-          @components = evecs.reverse(1).transpose.dup
+          comps = evecs.reverse(1).transpose
+          @components = @params[:n_components] == 1 ? comps[0, true].dup : comps.dup
         else
           @params[:n_components].times do
             comp_vec = Rumale::Utils.rand_uniform(n_features, sub_rng)

data/lib/rumale/linear_model/linear_regression.rb CHANGED

@@ -6,7 +6,7 @@ require 'rumale/base/regressor'
 module Rumale
   module LinearModel
     # LinearRegression is a class that implements ordinary least square linear regression
-    # with mini-batch stochastic gradient descent optimization.
+    # with mini-batch stochastic gradient descent optimization or singular value decomposition.
     #
     # @example
     #   estimator =
@@ -14,6 +14,11 @@ module Rumale
     #   estimator.fit(training_samples, training_values)
     #   results = estimator.predict(testing_samples)
     #
+    #   # If Numo::Linalg is installed, you can specify 'svd' for the solver option.
+    #   require 'numo/linalg/autoloader'
+    #   estimator = Rumale::LinearModel::LinearRegression.new(solver: 'svd')
+    #   estimator.fit(training_samples, training_values)
+    #   results = estimator.predict(testing_samples)
     class LinearRegression < BaseLinearModel
       include Base::Regressor
@@ -34,23 +39,32 @@ module Rumale
       # @param fit_bias [Boolean] The flag indicating whether to fit the bias term.
       # @param bias_scale [Float] The scale of the bias term.
       # @param max_iter [Integer] The maximum number of iterations.
+      #   If solver = 'svd', this parameter is ignored.
       # @param batch_size [Integer] The size of the mini batches.
+      #   If solver = 'svd', this parameter is ignored.
       # @param optimizer [Optimizer] The optimizer to calculate adaptive learning rate.
       #   If nil is given, Nadam is used.
+      #   If solver = 'svd', this parameter is ignored.
+      # @param solver [String] The algorithm to calculate weights. ('sgd' or 'svd').
+      #   'sgd' uses the stochastic gradient descent optimization.
+      #   'svd' performs singular value decomposition of samples.
       # @param n_jobs [Integer] The number of jobs for running the fit method in parallel.
       #   If nil is given, the method does not execute in parallel.
       #   If zero or less is given, it becomes equal to the number of processors.
       #   This parameter is ignored if the Parallel gem is not loaded.
       # @param random_seed [Integer] The seed value using to initialize the random generator.
       def initialize(fit_bias: false, bias_scale: 1.0, max_iter: 1000, batch_size: 10, optimizer: nil,
-                     n_jobs: nil, random_seed: nil)
+                     solver: 'sgd', n_jobs: nil, random_seed: nil)
         check_params_float(bias_scale: bias_scale)
         check_params_integer(max_iter: max_iter, batch_size: batch_size)
         check_params_boolean(fit_bias: fit_bias)
+        check_params_string(solver: solver)
         check_params_type_or_nil(Integer, n_jobs: n_jobs, random_seed: random_seed)
         check_params_positive(max_iter: max_iter, batch_size: batch_size)
         keywd_args = method(:initialize).parameters.map { |_t, arg| [arg, binding.local_variable_get(arg)] }.to_h.merge(reg_param: 0.0)
+        keywd_args.delete(:solver)
         super(keywd_args)
+        @params[:solver] = solver != 'svd' ? 'sgd' : 'svd'
       end
       # Fit the model with given training data.
@@ -63,20 +77,10 @@ module Rumale
         check_tvalue_array(y)
         check_sample_tvalue_size(x, y)
-        n_outputs = y.shape[1].nil? ? 1 : y.shape[1]
-        n_features = x.shape[1]
-        if n_outputs > 1
-          @weight_vec = Numo::DFloat.zeros(n_outputs, n_features)
-          @bias_term = Numo::DFloat.zeros(n_outputs)
-          if enable_parallel?
-            models = parallel_map(n_outputs) { |n| partial_fit(x, y[true, n]) }
-            n_outputs.times { |n| @weight_vec[n, true], @bias_term[n] = models[n] }
-          else
-            n_outputs.times { |n| @weight_vec[n, true], @bias_term[n] = partial_fit(x, y[true, n]) }
-          end
+        if @params[:solver] == 'svd' && enable_linalg?
+          fit_svd(x, y)
         else
-          @weight_vec, @bias_term = partial_fit(x, y)
+          fit_sgd(x, y)
         end
         self
@@ -112,6 +116,41 @@ module Rumale
       private
+      def fit_svd(x, y)
+        samples = @params[:fit_bias] ? expand_feature(x) : x
+        s, u, vt = Numo::Linalg.svd(samples, driver: 'sdd', job: 'S')
+        d = (s / s**2).diag
+        w = vt.transpose.dot(d).dot(u.transpose).dot(y)
+        is_single_target_vals = y.shape[1].nil?
+        if @params[:fit_bias]
+          @weight_vec = is_single_target_vals ? w[0...-1].dup : w[0...-1, true].dup
+          @bias_term = is_single_target_vals ? w[-1] : w[-1, true].dup
+        else
+          @weight_vec = w.dup
+          @bias_term = is_single_target_vals ? 0 : Numo::DFloat.zeros(y.shape[1])
+        end
+      end
+      def fit_sgd(x, y)
+        n_outputs = y.shape[1].nil? ? 1 : y.shape[1]
+        n_features = x.shape[1]
+        if n_outputs > 1
+          @weight_vec = Numo::DFloat.zeros(n_outputs, n_features)
+          @bias_term = Numo::DFloat.zeros(n_outputs)
+          if enable_parallel?
+            models = parallel_map(n_outputs) { |n| partial_fit(x, y[true, n]) }
+            n_outputs.times { |n| @weight_vec[n, true], @bias_term[n] = models[n] }
+          else
+            n_outputs.times { |n| @weight_vec[n, true], @bias_term[n] = partial_fit(x, y[true, n]) }
+          end
+        else
+          @weight_vec, @bias_term = partial_fit(x, y)
+        end
+      end
       def calc_loss_gradient(x, y, weight)
         2.0 * (x.dot(weight) - y)
       end

data/lib/rumale/linear_model/ridge.rb CHANGED

@@ -6,7 +6,7 @@ require 'rumale/base/regressor'
 module Rumale
   module LinearModel
     # Ridge is a class that implements Ridge Regression
-    # with mini-batch stochastic gradient descent optimization.
+    # with mini-batch stochastic gradient descent optimization or singular value decomposition.
     #
     # @example
     #   estimator =
@@ -14,6 +14,11 @@ module Rumale
     #   estimator.fit(training_samples, training_values)
     #   results = estimator.predict(testing_samples)
     #
+    #   # If Numo::Linalg is installed, you can specify 'svd' for the solver option.
+    #   require 'numo/linalg/autoloader'
+    #   estimator = Rumale::LinearModel::Ridge.new(reg_param: 0.1, solver: 'svd')
+    #   estimator.fit(training_samples, training_values)
+    #   results = estimator.predict(testing_samples)
     class Ridge < BaseLinearModel
       include Base::Regressor
@@ -35,22 +40,32 @@ module Rumale
       # @param fit_bias [Boolean] The flag indicating whether to fit the bias term.
       # @param bias_scale [Float] The scale of the bias term.
       # @param max_iter [Integer] The maximum number of iterations.
+      #   If solver = 'svd', this parameter is ignored.
       # @param batch_size [Integer] The size of the mini batches.
+      #   If solver = 'svd', this parameter is ignored.
       # @param optimizer [Optimizer] The optimizer to calculate adaptive learning rate.
       #   If nil is given, Nadam is used.
+      #   If solver = 'svd', this parameter is ignored.
+      # @param solver [String] The algorithm to calculate weights. ('sgd' or 'svd').
+      #   'sgd' uses the stochastic gradient descent optimization.
+      #   'svd' performs singular value decomposition of samples.
       # @param n_jobs [Integer] The number of jobs for running the fit method in parallel.
       #   If nil is given, the method does not execute in parallel.
       #   If zero or less is given, it becomes equal to the number of processors.
-      #   This parameter is ignored if the Parallel gem is not loaded.
+      #   This parameter is ignored if the Parallel gem is not loaded or the solver is 'svd'.
       # @param random_seed [Integer] The seed value using to initialize the random generator.
       def initialize(reg_param: 1.0, fit_bias: false, bias_scale: 1.0, max_iter: 1000, batch_size: 10, optimizer: nil,
-                     n_jobs: nil, random_seed: nil)
+                     solver: 'sgd', n_jobs: nil, random_seed: nil)
         check_params_float(reg_param: reg_param, bias_scale: bias_scale)
         check_params_integer(max_iter: max_iter, batch_size: batch_size)
         check_params_boolean(fit_bias: fit_bias)
+        check_params_string(solver: solver)
         check_params_type_or_nil(Integer, n_jobs: n_jobs, random_seed: random_seed)
         check_params_positive(reg_param: reg_param, max_iter: max_iter, batch_size: batch_size)
-        super
+        keywd_args = method(:initialize).parameters.map { |_t, arg| [arg, binding.local_variable_get(arg)] }.to_h
+        keywd_args.delete(:solver)
+        super(keywd_args)
+        @params[:solver] = solver != 'svd' ? 'sgd' : 'svd'
       end
       # Fit the model with given training data.
@@ -63,20 +78,10 @@ module Rumale
         check_tvalue_array(y)
         check_sample_tvalue_size(x, y)
-        n_outputs = y.shape[1].nil? ? 1 : y.shape[1]
-        n_features = x.shape[1]
-        if n_outputs > 1
-          @weight_vec = Numo::DFloat.zeros(n_outputs, n_features)
-          @bias_term = Numo::DFloat.zeros(n_outputs)
-          if enable_parallel?
-            models = parallel_map(n_outputs) { |n| partial_fit(x, y[true, n]) }
-            n_outputs.times { |n| @weight_vec[n, true], @bias_term[n] = models[n] }
-          else
-            n_outputs.times { |n| @weight_vec[n, true], @bias_term[n] = partial_fit(x, y[true, n]) }
-          end
+        if @params[:solver] == 'svd' && enable_linalg?
+          fit_svd(x, y)
         else
-          @weight_vec, @bias_term = partial_fit(x, y)
+          fit_sgd(x, y)
         end
         self
@@ -112,6 +117,41 @@ module Rumale
       private
+      def fit_svd(x, y)
+        samples = @params[:fit_bias] ? expand_feature(x) : x
+        s, u, vt = Numo::Linalg.svd(samples, driver: 'sdd', job: 'S')
+        d = (s / (s**2 + @params[:reg_param])).diag
+        w = vt.transpose.dot(d).dot(u.transpose).dot(y)
+        is_single_target_vals = y.shape[1].nil?
+        if @params[:fit_bias]
+          @weight_vec = is_single_target_vals ? w[0...-1].dup : w[0...-1, true].dup
+          @bias_term = is_single_target_vals ? w[-1] : w[-1, true].dup
+        else
+          @weight_vec = w.dup
+          @bias_term = is_single_target_vals ? 0 : Numo::DFloat.zeros(y.shape[1])
+        end
+      end
+      def fit_sgd(x, y)
+        n_outputs = y.shape[1].nil? ? 1 : y.shape[1]
+        n_features = x.shape[1]
+        if n_outputs > 1
+          @weight_vec = Numo::DFloat.zeros(n_outputs, n_features)
+          @bias_term = Numo::DFloat.zeros(n_outputs)
+          if enable_parallel?
+            models = parallel_map(n_outputs) { |n| partial_fit(x, y[true, n]) }
+            n_outputs.times { |n| @weight_vec[n, true], @bias_term[n] = models[n] }
+          else
+            n_outputs.times { |n| @weight_vec[n, true], @bias_term[n] = partial_fit(x, y[true, n]) }
+          end
+        else
+          @weight_vec, @bias_term = partial_fit(x, y)
+        end
+      end
       def calc_loss_gradient(x, y, weight)
         2.0 * (x.dot(weight) - y)
       end

data/lib/rumale/pairwise_metric.rb CHANGED

@@ -18,6 +18,24 @@ module Rumale
         Numo::NMath.sqrt(squared_error(x, y).abs)
       end
+      # Calculate the pairwise manhattan distances between x and y.
+      #
+      # @param x [Numo::DFloat] (shape: [n_samples_x, n_features])
+      # @param y [Numo::DFloat] (shape: [n_samples_y, n_features])
+      # @return [Numo::DFloat] (shape: [n_samples_x, n_samples_x] or [n_samples_x, n_samples_y] if y is given)
+      def manhattan_distance(x, y = nil)
+        y = x if y.nil?
+        Rumale::Validation.check_sample_array(x)
+        Rumale::Validation.check_sample_array(y)
+        n_samples_x = x.shape[0]
+        n_samples_y = y.shape[0]
+        distance_mat = Numo::DFloat.zeros(n_samples_x, n_samples_y)
+        n_samples_x.times do |n|
+          distance_mat[n, true] = (y - x[n, true]).abs.sum(axis: 1)
+        end
+        distance_mat
+      end
       # Calculate the pairwise squared errors between x and y.
       #
       # @param x [Numo::DFloat] (shape: [n_samples_x, n_features])
@@ -27,11 +45,6 @@ module Rumale
         y = x if y.nil?
         Rumale::Validation.check_sample_array(x)
         Rumale::Validation.check_sample_array(y)
-        # sum_x_vec = (x**2).sum(1)
-        # sum_y_vec = (y**2).sum(1)
-        # dot_xy_mat = x.dot(y.transpose)
-        # dot_xy_mat * -2.0 + sum_x_vec.tile(y.shape[0], 1).transpose + sum_y_vec.tile(x.shape[0], 1)
-        #
         n_features = x.shape[1]
         one_vec = Numo::DFloat.ones(n_features).expand_dims(1)
         sum_x_vec = (x**2).dot(one_vec)

data/lib/rumale/version.rb CHANGED

@@ -3,5 +3,5 @@
 # Rumale is a machine learning library in Ruby.
 module Rumale
   # The version of Rumale you are using.
-  VERSION = '0.13.0'
+  VERSION = '0.13.1'
 end

data/rumale.gemspec CHANGED

@@ -19,7 +19,7 @@ Gem::Specification.new do |spec|
     Rumale currently supports Linear / Kernel Support Vector Machine,
     Logistic Regression, Linear Regression, Ridge, Lasso, Factorization Machine,
     Naive Bayes, Decision Tree, AdaBoost, Gradient Tree Boosting, Random Forest, Extra-Trees, K-nearest neighbor algorithm,
-    K-Means, K-Medoids, Gaussian Mixture Model, DBSCAN, Power Iteration Clustering,
+    K-Means, K-Medoids, Gaussian Mixture Model, DBSCAN, SNN, Power Iteration Clustering,
     Multidimensional Scaling, t-SNE, Principal Component Analysis, and Non-negative Matrix Factorization.
   MSG
   spec.homepage      = 'https://github.com/yoshoku/rumale'

metadata CHANGED

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: rumale
 version: !ruby/object:Gem::Version
-  version: 0.13.0
+  version: 0.13.1
 platform: ruby
 authors:
 - yoshoku
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2019-08-24 00:00:00.000000000 Z
+date: 2019-09-01 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: numo-narray
@@ -128,7 +128,7 @@ description: |
   Rumale currently supports Linear / Kernel Support Vector Machine,
   Logistic Regression, Linear Regression, Ridge, Lasso, Factorization Machine,
   Naive Bayes, Decision Tree, AdaBoost, Gradient Tree Boosting, Random Forest, Extra-Trees, K-nearest neighbor algorithm,
-  K-Means, K-Medoids, Gaussian Mixture Model, DBSCAN, Power Iteration Clustering,
+  K-Means, K-Medoids, Gaussian Mixture Model, DBSCAN, SNN, Power Iteration Clustering,
   Multidimensional Scaling, t-SNE, Principal Component Analysis, and Non-negative Matrix Factorization.
 email:
 - yoshoku@outlook.com
@@ -166,6 +166,7 @@ files:
 - lib/rumale/clustering/k_means.rb
 - lib/rumale/clustering/k_medoids.rb
 - lib/rumale/clustering/power_iteration.rb
+- lib/rumale/clustering/snn.rb
 - lib/rumale/dataset.rb
 - lib/rumale/decomposition/nmf.rb
 - lib/rumale/decomposition/pca.rb