rumale 0.20.0 → 0.22.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.github/workflows/build.yml +23 -0
- data/.rubocop.yml +15 -95
- data/CHANGELOG.md +28 -0
- data/Gemfile +4 -2
- data/README.md +5 -2
- data/lib/rumale.rb +3 -0
- data/lib/rumale/clustering/hdbscan.rb +2 -2
- data/lib/rumale/clustering/snn.rb +1 -1
- data/lib/rumale/dataset.rb +1 -1
- data/lib/rumale/decomposition/nmf.rb +2 -2
- data/lib/rumale/ensemble/random_forest_classifier.rb +1 -1
- data/lib/rumale/ensemble/random_forest_regressor.rb +1 -1
- data/lib/rumale/evaluation_measure/roc_auc.rb +3 -0
- data/lib/rumale/feature_extraction/feature_hasher.rb +1 -1
- data/lib/rumale/feature_extraction/hash_vectorizer.rb +1 -1
- data/lib/rumale/linear_model/base_sgd.rb +1 -1
- data/lib/rumale/linear_model/elastic_net.rb +2 -2
- data/lib/rumale/linear_model/lasso.rb +2 -2
- data/lib/rumale/linear_model/linear_regression.rb +2 -2
- data/lib/rumale/linear_model/logistic_regression.rb +123 -35
- data/lib/rumale/linear_model/ridge.rb +2 -2
- data/lib/rumale/linear_model/svc.rb +2 -2
- data/lib/rumale/linear_model/svr.rb +2 -2
- data/lib/rumale/manifold/tsne.rb +1 -1
- data/lib/rumale/metric_learning/neighbourhood_component_analysis.rb +13 -45
- data/lib/rumale/model_selection/group_k_fold.rb +93 -0
- data/lib/rumale/model_selection/group_shuffle_split.rb +115 -0
- data/lib/rumale/model_selection/shuffle_split.rb +4 -4
- data/lib/rumale/model_selection/stratified_k_fold.rb +1 -1
- data/lib/rumale/model_selection/stratified_shuffle_split.rb +13 -9
- data/lib/rumale/model_selection/time_series_split.rb +91 -0
- data/lib/rumale/pipeline/pipeline.rb +1 -1
- data/lib/rumale/probabilistic_output.rb +1 -1
- data/lib/rumale/tree/base_decision_tree.rb +2 -9
- data/lib/rumale/tree/gradient_tree_regressor.rb +3 -10
- data/lib/rumale/version.rb +1 -1
- data/rumale.gemspec +1 -0
- metadata +21 -4
- data/.coveralls.yml +0 -1
data/lib/rumale/linear_model/ridge.rb
CHANGED
@@ -10,7 +10,7 @@ module Rumale
     #
     # @example
     #   estimator =
-    #     Rumale::LinearModel::Ridge.new(reg_param: 0.1, max_iter:
+    #     Rumale::LinearModel::Ridge.new(reg_param: 0.1, max_iter: 1000, batch_size: 20, random_seed: 1)
     #   estimator.fit(training_samples, traininig_values)
     #   results = estimator.predict(testing_samples)
     #
@@ -70,7 +70,7 @@ module Rumale
     # @param random_seed [Integer] The seed value using to initialize the random generator.
     def initialize(learning_rate: 0.01, decay: nil, momentum: 0.9,
                    reg_param: 1.0, fit_bias: true, bias_scale: 1.0,
-                   max_iter:
+                   max_iter: 1000, batch_size: 50, tol: 1e-4,
                    solver: 'auto',
                    n_jobs: nil, verbose: false, random_seed: nil)
      check_params_numeric(learning_rate: learning_rate, momentum: momentum,
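For orientation, a minimal usage sketch under the new defaults; this is not from the package, the data is synthetic, and the parameter choices are illustrative. With 0.22.0, max_iter: 1000, batch_size: 50, and tol: 1e-4 are the initializer defaults for the SGD solver, so the doc example above no longer has to spell out every keyword:

    require 'rumale'

    # Synthetic placeholder data: 100 samples, 4 features.
    x = Numo::DFloat.new(100, 4).rand
    y = x.dot(Numo::DFloat[0.5, -1.0, 2.0, 0.1])

    # max_iter, batch_size, and tol fall back to the new defaults (1000, 50, 1e-4).
    estimator = Rumale::LinearModel::Ridge.new(reg_param: 0.1, solver: 'sgd', random_seed: 1)
    estimator.fit(x, y)
    results = estimator.predict(x)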
data/lib/rumale/linear_model/svc.rb
CHANGED
@@ -17,7 +17,7 @@ module Rumale
     #
     # @example
     #   estimator =
-    #     Rumale::LinearModel::SVC.new(reg_param: 1.0, max_iter:
+    #     Rumale::LinearModel::SVC.new(reg_param: 1.0, max_iter: 1000, batch_size: 50, random_seed: 1)
     #   estimator.fit(training_samples, traininig_labels)
     #   results = estimator.predict(testing_samples)
     #
@@ -74,7 +74,7 @@ module Rumale
     def initialize(learning_rate: 0.01, decay: nil, momentum: 0.9,
                    penalty: 'l2', reg_param: 1.0, l1_ratio: 0.5,
                    fit_bias: true, bias_scale: 1.0,
-                   max_iter:
+                   max_iter: 1000, batch_size: 50, tol: 1e-4,
                    probability: false,
                    n_jobs: nil, verbose: false, random_seed: nil)
      check_params_numeric(learning_rate: learning_rate, momentum: momentum,
data/lib/rumale/linear_model/svr.rb
CHANGED
@@ -14,7 +14,7 @@ module Rumale
     #
     # @example
     #   estimator =
-    #     Rumale::LinearModel::SVR.new(reg_param: 1.0, epsilon: 0.1, max_iter:
+    #     Rumale::LinearModel::SVR.new(reg_param: 1.0, epsilon: 0.1, max_iter: 1000, batch_size: 50, random_seed: 1)
     #   estimator.fit(training_samples, traininig_target_values)
     #   results = estimator.predict(testing_samples)
     #
@@ -68,7 +68,7 @@ module Rumale
                    penalty: 'l2', reg_param: 1.0, l1_ratio: 0.5,
                    fit_bias: true, bias_scale: 1.0,
                    epsilon: 0.1,
-                   max_iter:
+                   max_iter: 1000, batch_size: 50, tol: 1e-4,
                    n_jobs: nil, verbose: false, random_seed: nil)
      check_params_numeric(learning_rate: learning_rate, momentum: momentum,
                           reg_param: reg_param, bias_scale: bias_scale, epsilon: epsilon,
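The same max_iter: 1000, batch_size: 50, tol: 1e-4 defaults now apply to SVC and SVR. The diff truncates the old default away, so the sketch below uses a placeholder value; an application upgrading from 0.20.0 that depends on the previous iteration budget should pin max_iter explicitly instead of relying on the default:

    require 'rumale'

    # max_iter: 200 is a placeholder, not the documented 0.20.0 default;
    # substitute whatever value your code relied on before the upgrade.
    svc = Rumale::LinearModel::SVC.new(reg_param: 1.0, max_iter: 200, random_seed: 1)
    svr = Rumale::LinearModel::SVR.new(reg_param: 1.0, epsilon: 0.1, max_iter: 200, random_seed: 1)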
data/lib/rumale/manifold/tsne.rb
CHANGED
@@ -102,7 +102,7 @@ module Rumale
         break if terminate?(hi_prob_mat, lo_prob_mat)

         a = hi_prob_mat * lo_prob_mat
-        b = lo_prob_mat
+        b = lo_prob_mat**2
         y = (b.dot(one_vec) * y + (a - b).dot(y)) / a.dot(one_vec)
         lo_prob_mat = t_distributed_probability_matrix(y)
         @n_iter = t + 1
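This one-line fix squares the low-dimensional probability matrix when building b for the fixed-point update; the public API is unchanged. A minimal, hypothetical usage sketch (synthetic data, illustrative parameters):

    require 'rumale'

    x = Numo::DFloat.new(200, 10).rand  # placeholder samples
    tsne = Rumale::Manifold::TSNE.new(n_components: 2, perplexity: 30.0, random_seed: 1)
    z = tsne.fit_transform(x)           # 200x2 embedding, computed with the corrected update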
data/lib/rumale/metric_learning/neighbourhood_component_analysis.rb
CHANGED
@@ -2,13 +2,13 @@

 require 'rumale/base/base_estimator'
 require 'rumale/base/transformer'
+require 'lbfgsb'

 module Rumale
   module MetricLearning
     # NeighbourhoodComponentAnalysis is a class that implements Neighbourhood Component Analysis.
     #
     # @example
-    #   require 'mopti'
     #   require 'rumale'
     #
     #   transformer = Rumale::MetricLearning::NeighbourhoodComponentAnalysis.new
@@ -39,7 +39,9 @@ module Rumale
     # @param init [String] The initialization method for components ('random' or 'pca').
     # @param max_iter [Integer] The maximum number of iterations.
     # @param tol [Float] The tolerance of termination criterion.
+    #   This value is given as tol / Lbfgsb::DBL_EPSILON to the factr argument of Lbfgsb.minimize method.
     # @param verbose [Boolean] The flag indicating whether to output loss during iteration.
+    #   If true is given, 'iterate.dat' file is generated by lbfgsb.rb.
     # @param random_seed [Integer] The seed value using to initialize the random generator.
     def initialize(n_components: nil, init: 'random', max_iter: 100, tol: 1e-6, verbose: false, random_seed: nil)
       check_params_numeric_or_nil(n_components: n_components, random_seed: random_seed)
@@ -65,8 +67,6 @@ module Rumale
     # @param y [Numo::Int32] (shape: [n_samples]) The labels to be used for fitting the model.
     # @return [NeighbourhoodComponentAnalysis] The learned classifier itself.
     def fit(x, y)
-      raise 'NeighbourhoodComponentAnalysis#fit requires Mopti but that is not loaded.' unless enable_mopti?
-
       x = check_convert_sample_array(x)
       y = check_convert_label_array(y)
       check_sample_label_size(x, y)
@@ -102,17 +102,9 @@ module Rumale

     private

-    def enable_mopti?
-      if defined?(Mopti).nil?
-        warn('NeighbourhoodComponentAnalysis#fit requires Mopti but that is not loaded. You should intall and load mopti gem in advance.')
-        return false
-      end
-      true
-    end
-
     def init_components(x, n_features, n_components)
       if @params[:init] == 'pca'
-        pca = Rumale::Decomposition::PCA.new(n_components: n_components
+        pca = Rumale::Decomposition::PCA.new(n_components: n_components)
         pca.fit(x).components.flatten.dup
       else
         Rumale::Utils.rand_normal([n_features, n_components], @rng.dup).flatten.dup
@@ -127,28 +119,18 @@ module Rumale
       res[:x] = comp_init
       res[:n_iter] = 0
       # perform optimization.
-
-
-        x_init: comp_init, args: [x, y],
-
+      verbose = @params[:verbose] ? 1 : -1
+      res = Lbfgsb.minimize(
+        fnc: method(:nca_fnc), jcb: true, x_init: comp_init, args: [x, y],
+        maxiter: @params[:max_iter], factr: @params[:tol] / Lbfgsb::DBL_EPSILON, verbose: verbose
       )
-      fold = 0.0
-      dold = 0.0
-      optimizer.each do |prm|
-        res = prm
-        puts "[NeighbourhoodComponentAnalysis] The value of objective function after #{res[:n_iter]} epochs: #{x.shape[0] - res[:fnc]}" if @params[:verbose]
-        break if (fold - res[:fnc]).abs <= @params[:tol] && (dold - res[:jcb]).abs <= @params[:tol]
-
-        fold = res[:fnc]
-        dold = res[:jcb]
-      end
       # return the results.
       n_iter = res[:n_iter]
       comps = n_components == 1 ? res[:x].dup : res[:x].reshape(n_components, n_features)
       [comps, n_iter]
     end

-    def
+    def nca_fnc(w, x, y)
       # initialize some variables.
       n_samples, n_features = x.shape
       n_components = w.size / n_features
@@ -157,32 +139,18 @@ module Rumale
       z = x.dot(w.transpose)
       # calculate probability matrix.
       prob_mat = probability_matrix(z)
-      # calculate loss.
+      # calculate loss and gradient.
       # NOTE:
       # NCA attempts to maximize its objective function.
       # For the minization algorithm, the objective function value is subtracted from the maixmum value (n_samples).
       mask_mat = y.expand_dims(1).eq(y)
       masked_prob_mat = prob_mat * mask_mat
-      n_samples - masked_prob_mat.sum
-    end
-
-    def nca_dloss(w, x, y)
-      # initialize some variables.
-      n_features = x.shape[1]
-      n_components = w.size / n_features
-      # projection.
-      w = w.reshape(n_components, n_features)
-      z = x.dot(w.transpose)
-      # calculate probability matrix.
-      prob_mat = probability_matrix(z)
-      # calculate gradient.
-      mask_mat = y.expand_dims(1).eq(y)
-      masked_prob_mat = prob_mat * mask_mat
+      loss = n_samples - masked_prob_mat.sum
       weighted_prob_mat = masked_prob_mat - prob_mat * masked_prob_mat.sum(1).expand_dims(1)
       weighted_prob_mat += weighted_prob_mat.transpose
       weighted_prob_mat[weighted_prob_mat.diag_indices] = -weighted_prob_mat.sum(0)
-      gradient = 2 * z.transpose.dot(weighted_prob_mat).dot(x)
-
+      gradient = -2 * z.transpose.dot(weighted_prob_mat).dot(x)
+      [loss, gradient.flatten.dup]
     end

     def probability_matrix(z)
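Taken together, these hunks drop the optional Mopti-based optimizer in favor of lbfgsb.rb, which is now required unconditionally (and added as a dependency, matching the +1 in rumale.gemspec), and merge nca_loss/nca_dloss into a single nca_fnc callback that returns [loss, gradient] as Lbfgsb.minimize expects. Callers no longer require 'mopti' themselves; a hedged sketch with toy data:

    require 'rumale'  # lbfgsb is loaded by the library itself now

    x = Numo::DFloat.new(100, 5).rand  # placeholder samples
    y = Numo::Int32.new(100).rand(2)   # placeholder binary labels
    nca = Rumale::MetricLearning::NeighbourhoodComponentAnalysis.new(n_components: 2, random_seed: 1)
    z = nca.fit_transform(x, y)        # 100x2 metric-learned projection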
data/lib/rumale/model_selection/group_k_fold.rb
ADDED
@@ -0,0 +1,93 @@
+# frozen_string_literal: true
+
+require 'rumale/base/splitter'
+require 'rumale/preprocessing/label_encoder'
+
+module Rumale
+  module ModelSelection
+    # GroupKFold is a class that generates the set of data indices for K-fold cross-validation.
+    # The data points belonging to the same group do not be split into different folds.
+    # The number of groups should be greater than or equal to the number of splits.
+    #
+    # @example
+    #   cv = Rumale::ModelSelection::GroupKFold.new(n_splits: 3)
+    #   x = Numo::DFloat.new(8, 2).rand
+    #   groups = Numo::Int32[1, 1, 1, 2, 2, 3, 3, 3]
+    #   cv.split(x, nil, groups).each do |train_ids, test_ids|
+    #     puts '---'
+    #     pp train_ids
+    #     pp test_ids
+    #   end
+    #
+    #   # ---
+    #   # [0, 1, 2, 3, 4]
+    #   # [5, 6, 7]
+    #   # ---
+    #   # [3, 4, 5, 6, 7]
+    #   # [0, 1, 2]
+    #   # ---
+    #   # [0, 1, 2, 5, 6, 7]
+    #   # [3, 4]
+    #
+    class GroupKFold
+      include Base::Splitter
+
+      # Return the number of folds.
+      # @return [Integer]
+      attr_reader :n_splits
+
+      # Create a new data splitter for grouped K-fold cross validation.
+      #
+      # @param n_splits [Integer] The number of folds.
+      def initialize(n_splits: 5)
+        check_params_numeric(n_splits: n_splits)
+        @n_splits = n_splits
+      end
+
+      # Generate data indices for grouped K-fold cross validation.
+      #
+      # @overload split(x, y, groups) -> Array
+      #   @param x [Numo::DFloat] (shape: [n_samples, n_features])
+      #     The dataset to be used to generate data indices for grouped K-fold cross validation.
+      #   @param y [Numo::Int32] (shape: [n_samples])
+      #     This argument exists to unify the interface between the K-fold methods, it is not used in the method.
+      #   @param groups [Numo::Int32] (shape: [n_samples])
+      #     The group labels to be used to generate data indices for grouped K-fold cross validation.
+      # @return [Array] The set of data indices for constructing the training and testing dataset in each fold.
+      def split(x, _y, groups)
+        x = check_convert_sample_array(x)
+        groups = check_convert_label_array(groups)
+        check_sample_label_size(x, groups)
+
+        encoder = Rumale::Preprocessing::LabelEncoder.new
+        groups = encoder.fit_transform(groups)
+        n_groups = encoder.classes.size
+
+        raise ArgumentError, 'The number of groups should be greater than or equal to the number of splits.' if n_groups < @n_splits
+
+        n_samples_per_group = groups.bincount
+        group_ids = n_samples_per_group.sort_index.reverse
+        n_samples_per_group = n_samples_per_group[group_ids]
+
+        n_samples_per_fold = Numo::Int32.zeros(@n_splits)
+        group_to_fold = Numo::Int32.zeros(n_groups)
+
+        n_samples_per_group.each_with_index do |weight, id|
+          min_sample_fold_id = n_samples_per_fold.min_index
+          n_samples_per_fold[min_sample_fold_id] += weight
+          group_to_fold[group_ids[id]] = min_sample_fold_id
+        end
+
+        n_samples = x.shape[0]
+        sample_ids = Array(0...n_samples)
+        fold_ids = group_to_fold[groups]
+
+        Array.new(@n_splits) do |fid|
+          test_ids = fold_ids.eq(fid).where.to_a
+          train_ids = sample_ids - test_ids
+          [train_ids, test_ids]
+        end
+      end
+    end
+  end
+end
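The fold assignment in split is a greedy balancing pass: groups are processed largest first, and each one lands on the fold with the fewest samples so far. The standalone plain-Ruby rendering below (toy sizes, invented names, not part of the package) mirrors that loop for easier inspection:

    group_sizes = { 0 => 3, 1 => 2, 2 => 3 } # group label => sample count
    n_splits = 2
    fold_sizes = Array.new(n_splits, 0)
    group_to_fold = {}
    group_sizes.sort_by { |_g, size| -size }.each do |g, size|
      fid = fold_sizes.index(fold_sizes.min) # the currently smallest fold
      fold_sizes[fid] += size
      group_to_fold[g] = fid
    end
    p group_to_fold # => {0=>0, 2=>1, 1=>0}; the folds end up with 5 and 3 samples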
data/lib/rumale/model_selection/group_shuffle_split.rb
ADDED
@@ -0,0 +1,115 @@
+# frozen_string_literal: true
+
+require 'rumale/base/splitter'
+
+module Rumale
+  module ModelSelection
+    # GroupShuffleSplit is a class that generates the set of data indices
+    # for random permutation cross-validation by randomly selecting group labels.
+    #
+    # @example
+    #   cv = Rumale::ModelSelection::GroupShuffleSplit.new(n_splits: 2, test_size: 0.2, random_seed: 1)
+    #   x = Numo::DFloat.new(8, 2).rand
+    #   groups = Numo::Int32[1, 1, 1, 2, 2, 3, 3, 3]
+    #   cv.split(x, nil, groups).each do |train_ids, test_ids|
+    #     puts '---'
+    #     pp train_ids
+    #     pp test_ids
+    #   end
+    #
+    #   # ---
+    #   # [0, 1, 2, 5, 6, 7]
+    #   # [3, 4]
+    #   # ---
+    #   # [3, 4, 5, 6, 7]
+    #   # [0, 1, 2]
+    #
+    class GroupShuffleSplit
+      include Base::Splitter
+
+      # Return the number of folds.
+      # @return [Integer]
+      attr_reader :n_splits
+
+      # Return the random generator for shuffling the dataset.
+      # @return [Random]
+      attr_reader :rng
+
+      # Create a new data splitter for random permutation cross validation with given group labels.
+      #
+      # @param n_splits [Integer] The number of folds.
+      # @param test_size [Float] The ratio of number of groups for test data.
+      # @param train_size [Float/Nil] The ratio of number of groups for train data.
+      # @param random_seed [Integer] The seed value using to initialize the random generator.
+      def initialize(n_splits: 5, test_size: 0.2, train_size: nil, random_seed: nil)
+        check_params_numeric(n_splits: n_splits, test_size: test_size)
+        check_params_numeric_or_nil(train_size: train_size, random_seed: random_seed)
+        check_params_positive(n_splits: n_splits)
+        check_params_positive(test_size: test_size)
+        check_params_positive(train_size: train_size) unless train_size.nil?
+        @n_splits = n_splits
+        @test_size = test_size
+        @train_size = train_size
+        @random_seed = random_seed
+        @random_seed ||= srand
+        @rng = Random.new(@random_seed)
+      end
+
+      # Generate train and test data indices by randomly selecting group labels.
+      #
+      # @overload split(x, y, groups) -> Array
+      #   @param x [Numo::DFloat] (shape: [n_samples, n_features])
+      #     The dataset to be used to generate data indices for random permutation cross validation.
+      #   @param y [Numo::Int32] (shape: [n_samples])
+      #     This argument exists to unify the interface between the K-fold methods, it is not used in the method.
+      #   @param groups [Numo::Int32] (shape: [n_samples])
+      #     The group labels to be used to generate data indices for random permutation cross validation.
+      # @return [Array] The set of data indices for constructing the training and testing dataset in each fold.
+      def split(x, _y, groups)
+        x = check_convert_sample_array(x)
+        groups = check_convert_label_array(groups)
+        check_sample_label_size(x, groups)
+
+        classes = groups.to_a.uniq.sort
+        n_groups = classes.size
+        n_test_groups = (@test_size * n_groups).ceil.to_i
+        n_train_groups = @train_size.nil? ? n_groups - n_test_groups : (@train_size * n_groups).floor.to_i
+
+        unless n_test_groups.between?(1, n_groups)
+          raise RangeError,
+                'The number of groups in test split must be not less than 1 and not more than the number of groups.'
+        end
+        unless n_train_groups.between?(1, n_groups)
+          raise RangeError,
+                'The number of groups in train split must be not less than 1 and not more than the number of groups.'
+        end
+        if (n_test_groups + n_train_groups) > n_groups
+          raise RangeError,
+                'The total number of groups in test split and train split must be not more than the number of groups.'
+        end
+
+        sub_rng = @rng.dup
+
+        Array.new(@n_splits) do
+          test_group_ids = classes.sample(n_test_groups, random: sub_rng)
+          train_group_ids = if @train_size.nil?
+                              classes - test_group_ids
+                            else
+                              (classes - test_group_ids).sample(n_train_groups, random: sub_rng)
+                            end
+          test_ids = in1d(groups, test_group_ids).where.to_a
+          train_ids = in1d(groups, train_group_ids).where.to_a
+          [train_ids, test_ids]
+        end
+      end
+
+      private
+
+      def in1d(a, b)
+        res = Numo::Bit.zeros(a.shape[0])
+        b.each { |v| res |= a.eq(v) }
+        res
+      end
+    end
+  end
+end
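The private in1d helper emulates NumPy's in1d for Numo arrays by OR-ing together one element-wise equality mask per candidate value. A quick standalone check of its semantics (values invented):

    require 'numo/narray'

    a = Numo::Int32[1, 1, 2, 3, 3] # group label per sample
    b = [1, 3]                     # selected group labels
    res = Numo::Bit.zeros(a.shape[0])
    b.each { |v| res |= a.eq(v) }
    p res.where.to_a # => [0, 1, 3, 4], the sample indices whose group is in b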
data/lib/rumale/model_selection/shuffle_split.rb
CHANGED
@@ -54,19 +54,19 @@ module Rumale
       x = check_convert_sample_array(x)
       # Initialize and check some variables.
       n_samples = x.shape[0]
-      n_test_samples = (@test_size * n_samples).to_i
-      n_train_samples = @train_size.nil? ? n_samples - n_test_samples : (@train_size * n_samples).to_i
+      n_test_samples = (@test_size * n_samples).ceil.to_i
+      n_train_samples = @train_size.nil? ? n_samples - n_test_samples : (@train_size * n_samples).floor.to_i
       unless @n_splits.between?(1, n_samples)
         raise ArgumentError,
               'The value of n_splits must be not less than 1 and not more than the number of samples.'
       end
       unless n_test_samples.between?(1, n_samples)
         raise RangeError,
-              'The number of
+              'The number of samples in test split must be not less than 1 and not more than the number of samples.'
       end
       unless n_train_samples.between?(1, n_samples)
         raise RangeError,
-              'The number of
+              'The number of samples in train split must be not less than 1 and not more than the number of samples.'
       end
       if (n_test_samples + n_train_samples) > n_samples
         raise RangeError,
data/lib/rumale/model_selection/stratified_k_fold.rb
CHANGED
@@ -30,7 +30,7 @@ module Rumale
     # @return [Random]
     attr_reader :rng

-    # Create a new data splitter for K-fold cross validation.
+    # Create a new data splitter for stratified K-fold cross validation.
     #
     # @param n_splits [Integer] The number of folds.
     # @param shuffle [Boolean] The flag indicating whether to shuffle the dataset.
data/lib/rumale/model_selection/stratified_shuffle_split.rb
CHANGED
@@ -66,15 +66,15 @@
         raise ArgumentError,
               'The value of n_splits must be not less than 1 and not more than the number of samples in each class.'
       end
-      unless enough_data_size_each_class?(y, @test_size)
+      unless enough_data_size_each_class?(y, @test_size, 'test')
         raise RangeError,
-              'The number of
+              'The number of samples in test split must be not less than 1 and not more than the number of samples in each class.'
       end
-      unless enough_data_size_each_class?(y, train_sz)
+      unless enough_data_size_each_class?(y, train_sz, 'train')
         raise RangeError,
-              'The number of
+              'The number of samples in train split must be not less than 1 and not more than the number of samples in each class.'
       end
-      unless enough_data_size_each_class?(y, train_sz + @test_size)
+      unless enough_data_size_each_class?(y, train_sz + @test_size, 'train')
         raise RangeError,
               'The total number of samples in test split and train split must be not more than the number of samples in each class.'
       end
@@ -85,12 +85,12 @@
       test_ids = []
       sample_ids_each_class.each do |sample_ids|
         n_samples = sample_ids.size
-        n_test_samples = (@test_size * n_samples).to_i
-        n_train_samples = (train_sz * n_samples).to_i
+        n_test_samples = (@test_size * n_samples).ceil.to_i
         test_ids += sample_ids.sample(n_test_samples, random: sub_rng)
         train_ids += if @train_size.nil?
                        sample_ids - test_ids
                      else
+                       n_train_samples = (train_sz * n_samples).floor.to_i
                        (sample_ids - test_ids).sample(n_train_samples, random: sub_rng)
                      end
       end
@@ -104,9 +104,13 @@
       y.to_a.uniq.map { |label| y.eq(label).where.size }.all? { |n_samples| @n_splits.between?(1, n_samples) }
     end

-    def enough_data_size_each_class?(y, data_size)
+    def enough_data_size_each_class?(y, data_size, data_type)
       y.to_a.uniq.map { |label| y.eq(label).where.size }.all? do |n_samples|
-
+        if data_type == 'test'
+          (data_size * n_samples).ceil.to_i.between?(1, n_samples)
+        else
+          (data_size * n_samples).floor.to_i.between?(1, n_samples)
+        end
       end
     end
   end
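End to end, the per-class rounding above means each class contributes ceil(test_size * n_class) samples to the test split and, when train_size is given, floor(train_size * n_class) to the train split. A hedged sketch with toy labels sized so every check passes:

    require 'rumale'

    x = Numo::DFloat.new(10, 2).rand
    y = Numo::Int32[0, 0, 0, 0, 0, 1, 1, 1, 1, 1]
    cv = Rumale::ModelSelection::StratifiedShuffleSplit.new(n_splits: 3, test_size: 0.2, random_seed: 1)
    cv.split(x, y).each do |train_ids, test_ids|
      pp [train_ids.size, test_ids.size] # => [8, 2]; ceil(0.2 * 5) = 1 test sample per class
    end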