RubyGems - rumale - Versions diffs - 0.23.3 → 0.24.0 - Mend

rumale 0.23.3 → 0.24.0

Files changed (142) hide show

checksums.yaml +4 -4
data/LICENSE.txt +5 -1
data/README.md +3 -288
data/lib/rumale/version.rb +1 -1
data/lib/rumale.rb +20 -131
metadata +252 -150
data/CHANGELOG.md +0 -643
data/CODE_OF_CONDUCT.md +0 -74
data/ext/rumale/extconf.rb +0 -37
data/ext/rumale/rumaleext.c +0 -545
data/ext/rumale/rumaleext.h +0 -12
data/lib/rumale/base/base_estimator.rb +0 -49
data/lib/rumale/base/classifier.rb +0 -36
data/lib/rumale/base/cluster_analyzer.rb +0 -31
data/lib/rumale/base/evaluator.rb +0 -17
data/lib/rumale/base/regressor.rb +0 -36
data/lib/rumale/base/splitter.rb +0 -21
data/lib/rumale/base/transformer.rb +0 -22
data/lib/rumale/clustering/dbscan.rb +0 -123
data/lib/rumale/clustering/gaussian_mixture.rb +0 -218
data/lib/rumale/clustering/hdbscan.rb +0 -291
data/lib/rumale/clustering/k_means.rb +0 -122
data/lib/rumale/clustering/k_medoids.rb +0 -141
data/lib/rumale/clustering/mini_batch_k_means.rb +0 -139
data/lib/rumale/clustering/power_iteration.rb +0 -127
data/lib/rumale/clustering/single_linkage.rb +0 -203
data/lib/rumale/clustering/snn.rb +0 -76
data/lib/rumale/clustering/spectral_clustering.rb +0 -115
data/lib/rumale/dataset.rb +0 -246
data/lib/rumale/decomposition/factor_analysis.rb +0 -150
data/lib/rumale/decomposition/fast_ica.rb +0 -188
data/lib/rumale/decomposition/nmf.rb +0 -124
data/lib/rumale/decomposition/pca.rb +0 -159
data/lib/rumale/ensemble/ada_boost_classifier.rb +0 -179
data/lib/rumale/ensemble/ada_boost_regressor.rb +0 -160
data/lib/rumale/ensemble/extra_trees_classifier.rb +0 -139
data/lib/rumale/ensemble/extra_trees_regressor.rb +0 -125
data/lib/rumale/ensemble/gradient_boosting_classifier.rb +0 -306
data/lib/rumale/ensemble/gradient_boosting_regressor.rb +0 -237
data/lib/rumale/ensemble/random_forest_classifier.rb +0 -189
data/lib/rumale/ensemble/random_forest_regressor.rb +0 -153
data/lib/rumale/ensemble/stacking_classifier.rb +0 -215
data/lib/rumale/ensemble/stacking_regressor.rb +0 -163
data/lib/rumale/ensemble/voting_classifier.rb +0 -126
data/lib/rumale/ensemble/voting_regressor.rb +0 -82
data/lib/rumale/evaluation_measure/accuracy.rb +0 -29
data/lib/rumale/evaluation_measure/adjusted_rand_score.rb +0 -74
data/lib/rumale/evaluation_measure/calinski_harabasz_score.rb +0 -56
data/lib/rumale/evaluation_measure/davies_bouldin_score.rb +0 -53
data/lib/rumale/evaluation_measure/explained_variance_score.rb +0 -39
data/lib/rumale/evaluation_measure/f_score.rb +0 -50
data/lib/rumale/evaluation_measure/function.rb +0 -147
data/lib/rumale/evaluation_measure/log_loss.rb +0 -45
data/lib/rumale/evaluation_measure/mean_absolute_error.rb +0 -29
data/lib/rumale/evaluation_measure/mean_squared_error.rb +0 -29
data/lib/rumale/evaluation_measure/mean_squared_log_error.rb +0 -29
data/lib/rumale/evaluation_measure/median_absolute_error.rb +0 -30
data/lib/rumale/evaluation_measure/mutual_information.rb +0 -49
data/lib/rumale/evaluation_measure/normalized_mutual_information.rb +0 -53
data/lib/rumale/evaluation_measure/precision.rb +0 -50
data/lib/rumale/evaluation_measure/precision_recall.rb +0 -96
data/lib/rumale/evaluation_measure/purity.rb +0 -40
data/lib/rumale/evaluation_measure/r2_score.rb +0 -43
data/lib/rumale/evaluation_measure/recall.rb +0 -50
data/lib/rumale/evaluation_measure/roc_auc.rb +0 -130
data/lib/rumale/evaluation_measure/silhouette_score.rb +0 -82
data/lib/rumale/feature_extraction/feature_hasher.rb +0 -110
data/lib/rumale/feature_extraction/hash_vectorizer.rb +0 -155
data/lib/rumale/feature_extraction/tfidf_transformer.rb +0 -113
data/lib/rumale/kernel_approximation/nystroem.rb +0 -126
data/lib/rumale/kernel_approximation/rbf.rb +0 -102
data/lib/rumale/kernel_machine/kernel_fda.rb +0 -120
data/lib/rumale/kernel_machine/kernel_pca.rb +0 -97
data/lib/rumale/kernel_machine/kernel_ridge.rb +0 -82
data/lib/rumale/kernel_machine/kernel_ridge_classifier.rb +0 -92
data/lib/rumale/kernel_machine/kernel_svc.rb +0 -193
data/lib/rumale/linear_model/base_sgd.rb +0 -285
data/lib/rumale/linear_model/elastic_net.rb +0 -119
data/lib/rumale/linear_model/lasso.rb +0 -115
data/lib/rumale/linear_model/linear_regression.rb +0 -201
data/lib/rumale/linear_model/logistic_regression.rb +0 -275
data/lib/rumale/linear_model/nnls.rb +0 -137
data/lib/rumale/linear_model/ridge.rb +0 -209
data/lib/rumale/linear_model/svc.rb +0 -213
data/lib/rumale/linear_model/svr.rb +0 -132
data/lib/rumale/manifold/mds.rb +0 -155
data/lib/rumale/manifold/tsne.rb +0 -222
data/lib/rumale/metric_learning/fisher_discriminant_analysis.rb +0 -113
data/lib/rumale/metric_learning/mlkr.rb +0 -161
data/lib/rumale/metric_learning/neighbourhood_component_analysis.rb +0 -167
data/lib/rumale/model_selection/cross_validation.rb +0 -125
data/lib/rumale/model_selection/function.rb +0 -42
data/lib/rumale/model_selection/grid_search_cv.rb +0 -225
data/lib/rumale/model_selection/group_k_fold.rb +0 -93
data/lib/rumale/model_selection/group_shuffle_split.rb +0 -115
data/lib/rumale/model_selection/k_fold.rb +0 -81
data/lib/rumale/model_selection/shuffle_split.rb +0 -90
data/lib/rumale/model_selection/stratified_k_fold.rb +0 -99
data/lib/rumale/model_selection/stratified_shuffle_split.rb +0 -118
data/lib/rumale/model_selection/time_series_split.rb +0 -91
data/lib/rumale/multiclass/one_vs_rest_classifier.rb +0 -83
data/lib/rumale/naive_bayes/base_naive_bayes.rb +0 -47
data/lib/rumale/naive_bayes/bernoulli_nb.rb +0 -82
data/lib/rumale/naive_bayes/complement_nb.rb +0 -85
data/lib/rumale/naive_bayes/gaussian_nb.rb +0 -69
data/lib/rumale/naive_bayes/multinomial_nb.rb +0 -74
data/lib/rumale/naive_bayes/negation_nb.rb +0 -71
data/lib/rumale/nearest_neighbors/k_neighbors_classifier.rb +0 -133
data/lib/rumale/nearest_neighbors/k_neighbors_regressor.rb +0 -108
data/lib/rumale/nearest_neighbors/vp_tree.rb +0 -132
data/lib/rumale/neural_network/adam.rb +0 -56
data/lib/rumale/neural_network/base_mlp.rb +0 -248
data/lib/rumale/neural_network/mlp_classifier.rb +0 -120
data/lib/rumale/neural_network/mlp_regressor.rb +0 -90
data/lib/rumale/pairwise_metric.rb +0 -152
data/lib/rumale/pipeline/feature_union.rb +0 -69
data/lib/rumale/pipeline/pipeline.rb +0 -175
data/lib/rumale/preprocessing/bin_discretizer.rb +0 -93
data/lib/rumale/preprocessing/binarizer.rb +0 -60
data/lib/rumale/preprocessing/kernel_calculator.rb +0 -92
data/lib/rumale/preprocessing/l1_normalizer.rb +0 -62
data/lib/rumale/preprocessing/l2_normalizer.rb +0 -63
data/lib/rumale/preprocessing/label_binarizer.rb +0 -89
data/lib/rumale/preprocessing/label_encoder.rb +0 -79
data/lib/rumale/preprocessing/max_abs_scaler.rb +0 -61
data/lib/rumale/preprocessing/max_normalizer.rb +0 -62
data/lib/rumale/preprocessing/min_max_scaler.rb +0 -76
data/lib/rumale/preprocessing/one_hot_encoder.rb +0 -100
data/lib/rumale/preprocessing/ordinal_encoder.rb +0 -109
data/lib/rumale/preprocessing/polynomial_features.rb +0 -109
data/lib/rumale/preprocessing/standard_scaler.rb +0 -71
data/lib/rumale/probabilistic_output.rb +0 -114
data/lib/rumale/tree/base_decision_tree.rb +0 -150
data/lib/rumale/tree/decision_tree_classifier.rb +0 -150
data/lib/rumale/tree/decision_tree_regressor.rb +0 -116
data/lib/rumale/tree/extra_tree_classifier.rb +0 -107
data/lib/rumale/tree/extra_tree_regressor.rb +0 -94
data/lib/rumale/tree/gradient_tree_regressor.rb +0 -202
data/lib/rumale/tree/node.rb +0 -39
data/lib/rumale/utils.rb +0 -42
data/lib/rumale/validation.rb +0 -128
data/lib/rumale/values.rb +0 -13

data/lib/rumale/metric_learning/neighbourhood_component_analysis.rb DELETED Viewed

@@ -1,167 +0,0 @@
-# frozen_string_literal: true
-require 'rumale/base/base_estimator'
-require 'rumale/base/transformer'
-require 'rumale/utils'
-require 'rumale/pairwise_metric'
-require 'lbfgsb'
-module Rumale
-  module MetricLearning
-    # NeighbourhoodComponentAnalysis is a class that implements Neighbourhood Component Analysis.
-    #
-    # @example
-    #   require 'rumale'
-    #
-    #   transformer = Rumale::MetricLearning::NeighbourhoodComponentAnalysis.new
-    #   transformer.fit(training_samples, training_labels)
-    #   low_samples = transformer.transform(testing_samples)
-    #
-    # *Reference*
-    # - Goldberger, J., Roweis, S., Hinton, G., and Salakhutdinov, R., "Neighbourhood Component Analysis," Advances in NIPS'17, pp. 513--520, 2005.
-    class NeighbourhoodComponentAnalysis
-      include Base::BaseEstimator
-      include Base::Transformer
-      # Returns the neighbourhood components.
-      # @return [Numo::DFloat] (shape: [n_components, n_features])
-      attr_reader :components
-      # Return the number of iterations run for optimization
-      # @return [Integer]
-      attr_reader :n_iter
-      # Return the random generator.
-      # @return [Random]
-      attr_reader :rng
-      # Create a new transformer with NeighbourhoodComponentAnalysis.
-      #
-      # @param n_components [Integer] The number of components.
-      # @param init [String] The initialization method for components ('random' or 'pca').
-      # @param max_iter [Integer] The maximum number of iterations.
-      # @param tol [Float] The tolerance of termination criterion.
-      #   This value is given as tol / Lbfgsb::DBL_EPSILON to the factr argument of Lbfgsb.minimize method.
-      # @param verbose [Boolean] The flag indicating whether to output loss during iteration.
-      #   If true is given, 'iterate.dat' file is generated by lbfgsb.rb.
-      # @param random_seed [Integer] The seed value used to initialize the random generator.
-      def initialize(n_components: nil, init: 'random', max_iter: 100, tol: 1e-6, verbose: false, random_seed: nil)
-        check_params_numeric_or_nil(n_components: n_components, random_seed: random_seed)
-        check_params_numeric(max_iter: max_iter, tol: tol)
-        check_params_string(init: init)
-        check_params_boolean(verbose: verbose)
-        @params = {}
-        @params[:n_components] = n_components
-        @params[:init] = init
-        @params[:max_iter] = max_iter
-        @params[:tol] = tol
-        @params[:verbose] = verbose
-        @params[:random_seed] = random_seed
-        @params[:random_seed] ||= srand
-        @components = nil
-        @n_iter = nil
-        @rng = Random.new(@params[:random_seed])
-      end
-      # Fit the model with given training data.
-      #
-      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for fitting the model.
-      # @param y [Numo::Int32] (shape: [n_samples]) The labels to be used for fitting the model.
-      # @return [NeighbourhoodComponentAnalysis] The learned transformer itself.
-      def fit(x, y)
-        x = check_convert_sample_array(x)
-        y = check_convert_label_array(y)
-        check_sample_label_size(x, y)
-        n_features = x.shape[1]
-        n_components = if @params[:n_components].nil?
-                         n_features
-                       else
-                         [n_features, @params[:n_components]].min
-                       end
-        @components, @n_iter = optimize_components(x, y, n_features, n_components)
-        self
-      end
-      # Fit the model with training data, and then transform them with the learned model.
-      #
-      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for fitting the model.
-      # @param y [Numo::Int32] (shape: [n_samples]) The labels to be used for fitting the model.
-      # @return [Numo::DFloat] (shape: [n_samples, n_components]) The transformed data
-      def fit_transform(x, y)
-        x = check_convert_sample_array(x)
-        y = check_convert_label_array(y)
-        fit(x, y).transform(x)
-      end
-      # Transform the given data with the learned model.
-      #
-      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The data to be transformed with the learned model.
-      # @return [Numo::DFloat] (shape: [n_samples, n_components]) The transformed data.
-      def transform(x)
-        x = check_convert_sample_array(x)
-        x.dot(@components.transpose)
-      end
-      private
-      def init_components(x, n_features, n_components)
-        if @params[:init] == 'pca'
-          pca = Rumale::Decomposition::PCA.new(n_components: n_components)
-          pca.fit(x).components.flatten.dup
-        else
-          Rumale::Utils.rand_normal([n_features, n_components], @rng.dup).flatten.dup
-        end
-      end
-      def optimize_components(x, y, n_features, n_components)
-        # initialize components.
-        comp_init = init_components(x, n_features, n_components)
-        # initialize optimization results.
-        res = {}
-        res[:x] = comp_init
-        res[:n_iter] = 0
-        # perform optimization.
-        verbose = @params[:verbose] ? 1 : -1
-        res = Lbfgsb.minimize(
-          fnc: method(:nca_fnc), jcb: true, x_init: comp_init, args: [x, y],
-          maxiter: @params[:max_iter], factr: @params[:tol] / Lbfgsb::DBL_EPSILON, verbose: verbose
-        )
-        # return the results.
-        n_iter = res[:n_iter]
-        comps = n_components == 1 ? res[:x].dup : res[:x].reshape(n_components, n_features)
-        [comps, n_iter]
-      end
-      def nca_fnc(w, x, y)
-        # recover the problem sizes; n_components is derived from the length of the flat vector w.
-        n_samples, n_features = x.shape
-        n_components = w.size / n_features
-        # reshape w into a matrix and project the samples with it.
-        w = w.reshape(n_components, n_features)
-        z = x.dot(w.transpose)
-        # calculate the probability matrix of the projected samples.
-        prob_mat = probability_matrix(z)
-        # calculate loss and gradient; mask_mat marks the pairs of samples that share a label.
-        # NOTE:
-        # NCA attempts to maximize its objective function.
-        # For the minimization algorithm, the objective function value is subtracted from the maximum value (n_samples).
-        mask_mat = y.expand_dims(1).eq(y)
-        masked_prob_mat = prob_mat * mask_mat
-        loss = n_samples - masked_prob_mat.sum
-        sum_probs = masked_prob_mat.sum(1)
-        weight_mat = (sum_probs.expand_dims(1) * prob_mat - masked_prob_mat)
-        weight_mat += weight_mat.transpose
-        weight_mat = weight_mat.sum(0).diag - weight_mat
-        gradient = -2 * z.transpose.dot(weight_mat).dot(x)
-        [loss, gradient.flatten.dup]
-      end
-      def probability_matrix(z)
-        prob_mat = Numo::NMath.exp(-Rumale::PairwiseMetric.squared_error(z))
-        prob_mat[prob_mat.diag_indices] = 0.0
-        prob_mat /= prob_mat.sum(1).expand_dims(1)
-        prob_mat
-      end
-    end
-  end
-end

data/lib/rumale/model_selection/cross_validation.rb DELETED Viewed

@@ -1,125 +0,0 @@
-# frozen_string_literal: true
-require 'rumale/validation'
-require 'rumale/base/base_estimator'
-require 'rumale/base/classifier'
-require 'rumale/base/regressor'
-require 'rumale/base/splitter'
-require 'rumale/base/evaluator'
-require 'rumale/evaluation_measure/log_loss'
-module Rumale
-  # This module consists of the classes for model validation techniques.
-  module ModelSelection
-    # CrossValidation is a class that evaluates a given classifier with cross-validation method.
-    #
-    # @example
-    #   svc = Rumale::LinearModel::SVC.new
-    #   kf = Rumale::ModelSelection::StratifiedKFold.new(n_splits: 5)
-    #   cv = Rumale::ModelSelection::CrossValidation.new(estimator: svc, splitter: kf)
-    #   report = cv.perform(samples, labels)
-    #   mean_test_score = report[:test_score].inject(:+) / kf.n_splits
-    #
-    class CrossValidation
-      include Validation
-      # Return the classifier of which performance is evaluated.
-      # @return [Classifier]
-      attr_reader :estimator
-      # Return the splitter that divides dataset.
-      # @return [Splitter]
-      attr_reader :splitter
-      # Return the evaluator that calculates score.
-      # @return [Evaluator]
-      attr_reader :evaluator
-      # Return the flag indicating whether to calculate the score of training dataset.
-      # @return [Boolean]
-      attr_reader :return_train_score
-      # Create a new evaluator with cross-validation method.
-      #
-      # @param estimator [Classifier] The classifier of which performance is evaluated.
-      # @param splitter [Splitter] The splitter that divides dataset to training and testing dataset.
-      # @param evaluator [Evaluator] The evaluator that calculates score of estimator results.
-      # @param return_train_score [Boolean] The flag indicating whether to calculate the score of training dataset.
-      def initialize(estimator: nil, splitter: nil, evaluator: nil, return_train_score: false)
-        check_params_type(Rumale::Base::BaseEstimator, estimator: estimator)
-        check_params_type(Rumale::Base::Splitter, splitter: splitter)
-        check_params_type_or_nil(Rumale::Base::Evaluator, evaluator: evaluator)
-        check_params_boolean(return_train_score: return_train_score)
-        @estimator = estimator
-        @splitter = splitter
-        @evaluator = evaluator
-        @return_train_score = return_train_score
-      end
-      # Perform the evaluation of given classifier with cross-validation method.
-      #
-      # @param x [Numo::DFloat] (shape: [n_samples, n_features])
-      #   The dataset to be used to evaluate the estimator.
-      # @param y [Numo::Int32 / Numo::DFloat] (shape: [n_samples] / [n_samples, n_outputs])
-      #   The labels to be used to evaluate the classifier / The target values to be used to evaluate the regressor.
-      # @return [Hash] The report summarizing the results of cross-validation.
-      #   * :fit_time (Array<Float>) The calculation times of fitting the estimator for each split.
-      #   * :test_score (Array<Float>) The scores of testing dataset for each split.
-      #   * :train_score (Array<Float>) The scores of training dataset for each split. This option is nil if
-      #     the return_train_score is false.
-      def perform(x, y)
-        x = check_convert_sample_array(x)
-        case @estimator
-        when Rumale::Base::Classifier
-          y = check_convert_label_array(y)
-          check_sample_label_size(x, y)
-        when Rumale::Base::Regressor
-          y = check_convert_tvalue_array(y)
-          check_sample_tvalue_size(x, y)
-        else
-          y = Numo::NArray.asarray(y)
-        end
-        # Initialize the report of cross validation.
-        report = { test_score: [], train_score: nil, fit_time: [] }
-        report[:train_score] = [] if @return_train_score
-        # Evaluate the estimator on each split.
-        @splitter.split(x, y).each do |train_ids, test_ids|
-          # Split dataset into training and testing dataset.
-          feature_ids = !kernel_machine? || train_ids
-          train_x = x[train_ids, feature_ids]
-          train_y = y.shape[1].nil? ? y[train_ids] : y[train_ids, true]
-          test_x = x[test_ids, feature_ids]
-          test_y = y.shape[1].nil? ? y[test_ids] : y[test_ids, true]
-          # Fit the estimator.
-          start_time = Time.now.to_i
-          @estimator.fit(train_x, train_y)
-          # Calculate scores and prepare the report.
-          report[:fit_time].push(Time.now.to_i - start_time)
-          if @evaluator.nil?
-            report[:test_score].push(@estimator.score(test_x, test_y))
-            report[:train_score].push(@estimator.score(train_x, train_y)) if @return_train_score
-          elsif log_loss?
-            report[:test_score].push(@evaluator.score(test_y, @estimator.predict_proba(test_x)))
-            report[:train_score].push(@evaluator.score(train_y, @estimator.predict_proba(train_x))) if @return_train_score
-          else
-            report[:test_score].push(@evaluator.score(test_y, @estimator.predict(test_x)))
-            report[:train_score].push(@evaluator.score(train_y, @estimator.predict(train_x))) if @return_train_score
-          end
-        end
-        report
-      end
-      private
-      def kernel_machine?
-        class_name = @estimator.class.to_s
-        class_name = @estimator.params[:estimator].class.to_s if class_name.include?('Multiclass')
-        class_name.include?('KernelMachine')
-      end
-      def log_loss?
-        @evaluator.is_a?(Rumale::EvaluationMeasure::LogLoss)
-      end
-    end
-  end
-end

data/lib/rumale/model_selection/function.rb DELETED Viewed

@@ -1,42 +0,0 @@
-# frozen_string_literal: true
-require 'rumale/model_selection/shuffle_split'
-require 'rumale/model_selection/stratified_shuffle_split'
-module Rumale
-  module ModelSelection
-    module_function
-    # Split randomly data set into test and train data.
-    #
-    # @example
-    #   x_train, x_test, y_train, y_test = Rumale::ModelSelection.train_test_split(x, y, test_size: 0.2, stratify: true, random_seed: 1)
-    #
-    # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The dataset to be used to generate data indices.
-    # @param y [Numo::Int32] (shape: [n_samples]) The labels to be used to generate data indices for stratified random permutation.
-    #   If stratify = false, this parameter is ignored.
-    # @param test_size [Float] The ratio of number of samples for test data.
-    # @param train_size [Float] The ratio of number of samples for train data.
-    #   If nil is given, it sets to 1 - test_size.
-    # @param stratify [Boolean] The flag indicating whether to perform stratify split.
-    # @param random_seed [Integer] The seed value using to initialize the random generator.
-    # @return [Array<Numo::NArray>] The set of training and testing data.
-    def train_test_split(x, y = nil, test_size: 0.1, train_size: nil, stratify: false, random_seed: nil)
-      splitter = if stratify
-                   Rumale::ModelSelection::StratifiedShuffleSplit.new(
-                     n_splits: 1, test_size: test_size, train_size: train_size, random_seed: random_seed
-                   )
-                 else
-                   Rumale::ModelSelection::ShuffleSplit.new(
-                     n_splits: 1, test_size: test_size, train_size: train_size, random_seed: random_seed
-                   )
-                 end
-      train_ids, test_ids = splitter.split(x, y).first
-      x_train = x[train_ids, true].dup
-      y_train = y[train_ids].dup
-      x_test = x[test_ids, true].dup
-      y_test = y[test_ids].dup
-      [x_train, x_test, y_train, y_test]
-    end
-  end
-end

data/lib/rumale/model_selection/grid_search_cv.rb DELETED Viewed

@@ -1,225 +0,0 @@
-# frozen_string_literal: true
-require 'rumale/validation'
-require 'rumale/base/base_estimator'
-require 'rumale/base/evaluator'
-require 'rumale/base/splitter'
-require 'rumale/pipeline/pipeline'
-module Rumale
-  module ModelSelection
-    # GridSearchCV is a class that performs hyperparameter optimization with grid search method.
-    #
-    # @example
-    #   rfc = Rumale::Ensemble::RandomForestClassifier.new(random_seed: 1)
-    #   pg = { n_estimators: [5, 10], max_depth: [3, 5], max_leaf_nodes: [15, 31] }
-    #   kf = Rumale::ModelSelection::StratifiedKFold.new(n_splits: 5)
-    #   gs = Rumale::ModelSelection::GridSearchCV.new(estimator: rfc, param_grid: pg, splitter: kf)
-    #   gs.fit(samples, labels)
-    #   p gs.cv_results
-    #   p gs.best_params
-    #
-    # @example
-    #   rbf = Rumale::KernelApproximation::RBF.new(random_seed: 1)
-    #   svc = Rumale::LinearModel::SVC.new(random_seed: 1)
-    #   pipe = Rumale::Pipeline::Pipeline.new(steps: { rbf: rbf, svc: svc })
-    #   pg = { rbf__gamma: [32.0, 1.0], rbf__n_components: [4, 128], svc__reg_param: [16.0, 0.1] }
-    #   kf = Rumale::ModelSelection::StratifiedKFold.new(n_splits: 5)
-    #   gs = Rumale::ModelSelection::GridSearchCV.new(estimator: pipe, param_grid: pg, splitter: kf)
-    #   gs.fit(samples, labels)
-    #   p gs.cv_results
-    #   p gs.best_params
-    #
-    class GridSearchCV
-      include Base::BaseEstimator
-      include Validation
-      # Return the result of cross validation for each parameter.
-      # @return [Hash]
-      attr_reader :cv_results
-      # Return the score of the estimator learned with the best parameter.
-      # @return [Float]
-      attr_reader :best_score
-      # Return the best parameter set.
-      # @return [Hash]
-      attr_reader :best_params
-      # Return the index of the best parameter.
-      # @return [Integer]
-      attr_reader :best_index
-      # Return the estimator learned with the best parameter.
-      # @return [Estimator]
-      attr_reader :best_estimator
-      # Create a new grid search method.
-      #
-      # @param estimator [Classifier/Regressor] The estimator to be searched for optimal parameters with grid search method.
-      # @param param_grid [Array<Hash>] The parameter sets, represented as an array of hashes that
-      #   consist of parameter names as keys and arrays of parameter values as values.
-      # @param splitter [Splitter] The splitter that divides dataset to training and testing dataset on cross validation.
-      # @param evaluator [Evaluator] The evaluator that calculates score of estimator results on cross validation.
-      #   If nil is given, the score method of estimator is used for evaluation.
-      # @param greater_is_better [Boolean] The flag that indicates whether the estimator is better as
-      #   evaluation score is larger.
-      def initialize(estimator: nil, param_grid: nil, splitter: nil, evaluator: nil, greater_is_better: true)
-        check_params_type(Rumale::Base::BaseEstimator, estimator: estimator)
-        check_params_type(Rumale::Base::Splitter, splitter: splitter)
-        check_params_type_or_nil(Rumale::Base::Evaluator, evaluator: evaluator)
-        check_params_boolean(greater_is_better: greater_is_better)
-        @params = {}
-        @params[:param_grid] = valid_param_grid(param_grid)
-        @params[:estimator] = Marshal.load(Marshal.dump(estimator))
-        @params[:splitter] = Marshal.load(Marshal.dump(splitter))
-        @params[:evaluator] = Marshal.load(Marshal.dump(evaluator))
-        @params[:greater_is_better] = greater_is_better
-        @cv_results = nil
-        @best_score = nil
-        @best_params = nil
-        @best_index = nil
-        @best_estimator = nil
-      end
-      # Fit the model with given training data and all sets of parameters.
-      #
-      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for fitting the model.
-      # @param y [Numo::NArray] (shape: [n_samples, n_outputs]) The target values or labels to be used for fitting the model.
-      # @return [GridSearchCV] The learned estimator with grid search.
-      def fit(x, y)
-        x = check_convert_sample_array(x)
-        init_attrs
-        param_combinations.each do |prm_set|
-          prm_set.each do |prms|
-            report = perform_cross_validation(x, y, prms)
-            store_cv_result(prms, report)
-          end
-        end
-        find_best_params
-        @best_estimator = configurated_estimator(@best_params)
-        @best_estimator.fit(x, y)
-        self
-      end
-      # Call the decision_function method of learned estimator with the best parameter.
-      #
-      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to compute the scores.
-      # @return [Numo::DFloat] (shape: [n_samples]) Confidence score per sample.
-      def decision_function(x)
-        x = check_convert_sample_array(x)
-        @best_estimator.decision_function(x)
-      end
-      # Call the predict method of learned estimator with the best parameter.
-      #
-      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to obtain prediction result.
-      # @return [Numo::NArray] Predicted results.
-      def predict(x)
-        x = check_convert_sample_array(x)
-        @best_estimator.predict(x)
-      end
-      # Call the predict_log_proba method of learned estimator with the best parameter.
-      #
-      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the log-probabilities.
-      # @return [Numo::DFloat] (shape: [n_samples, n_classes]) Predicted log-probability of each class per sample.
-      def predict_log_proba(x)
-        x = check_convert_sample_array(x)
-        @best_estimator.predict_log_proba(x)
-      end
-      # Call the predict_proba method of learned estimator with the best parameter.
-      #
-      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the probabilities.
-      # @return [Numo::DFloat] (shape: [n_samples, n_classes]) Predicted probability of each class per sample.
-      def predict_proba(x)
-        x = check_convert_sample_array(x)
-        @best_estimator.predict_proba(x)
-      end
-      # Call the score method of learned estimator with the best parameter.
-      #
-      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) Testing data.
-      # @param y [Numo::NArray] (shape: [n_samples, n_outputs]) True target values or labels for testing data.
-      # @return [Float] The score of estimator.
-      def score(x, y)
-        x = check_convert_sample_array(x)
-        @best_estimator.score(x, y)
-      end
-      private
-      def valid_param_grid(grid)
-        raise TypeError, 'Expect class of param_grid to be Hash or Array' unless grid.is_a?(Hash) || grid.is_a?(Array)
-        grid = [grid] if grid.is_a?(Hash)
-        grid.each do |h|
-          raise TypeError, 'Expect class of elements in param_grid to be Hash' unless h.is_a?(Hash)
-          raise TypeError, 'Expect class of parameter values in param_grid to be Array' unless h.values.all?(Array)
-        end
-        grid
-      end
-      def param_combinations
-        @param_combinations ||= @params[:param_grid].map do |prm|
-          x = prm.sort.to_h.map { |k, v| [k].product(v) }
-          x[0].product(*x[1...x.size]).map(&:to_h)
-        end
-      end
-      def perform_cross_validation(x, y, prms)
-        est = configurated_estimator(prms)
-        cv = CrossValidation.new(estimator: est, splitter: @params[:splitter],
-                                 evaluator: @params[:evaluator], return_train_score: true)
-        cv.perform(x, y)
-      end
-      def configurated_estimator(prms)
-        estimator = Marshal.load(Marshal.dump(@params[:estimator]))
-        if @params[:estimator].is_a?(Rumale::Pipeline::Pipeline)
-          prms.each do |k, v|
-            est_name, prm_name = k.to_s.split('__')
-            estimator.steps[est_name.to_sym].params[prm_name.to_sym] = v
-          end
-        else
-          prms.each { |k, v| estimator.params[k] = v }
-        end
-        estimator
-      end
-      def init_attrs
-        @cv_results = %i[mean_test_score std_test_score
-                         mean_train_score std_train_score
-                         mean_fit_time std_fit_time params].map { |v| [v, []] }.to_h
-        @best_score = nil
-        @best_params = nil
-        @best_index = nil
-        @best_estimator = nil
-      end
-      def store_cv_result(prms, report)
-        test_scores = Numo::DFloat[*report[:test_score]]
-        train_scores = Numo::DFloat[*report[:train_score]]
-        fit_times = Numo::DFloat[*report[:fit_time]]
-        @cv_results[:mean_test_score].push(test_scores.mean)
-        @cv_results[:std_test_score].push(test_scores.stddev)
-        @cv_results[:mean_train_score].push(train_scores.mean)
-        @cv_results[:std_train_score].push(train_scores.stddev)
-        @cv_results[:mean_fit_time].push(fit_times.mean)
-        @cv_results[:std_fit_time].push(fit_times.stddev)
-        @cv_results[:params].push(prms)
-      end
-      def find_best_params
-        @best_score = @params[:greater_is_better] ? @cv_results[:mean_test_score].max : @cv_results[:mean_test_score].min
-        @best_index = @cv_results[:mean_test_score].index(@best_score)
-        @best_params = @cv_results[:params][@best_index]
-      end
-    end
-  end
-end

data/lib/rumale/model_selection/group_k_fold.rb DELETED Viewed

@@ -1,93 +0,0 @@
-# frozen_string_literal: true
-require 'rumale/base/splitter'
-require 'rumale/preprocessing/label_encoder'
-module Rumale
-  module ModelSelection
-    # GroupKFold is a class that generates the set of data indices for K-fold cross-validation.
-    # Data points belonging to the same group are never split across different folds.
-    # The number of groups should be greater than or equal to the number of splits.
-    #
-    # @example
-    #   cv = Rumale::ModelSelection::GroupKFold.new(n_splits: 3)
-    #   x = Numo::DFloat.new(8, 2).rand
-    #   groups = Numo::Int32[1, 1, 1, 2, 2, 3, 3, 3]
-    #   cv.split(x, nil, groups).each do |train_ids, test_ids|
-    #     puts '---'
-    #     pp train_ids
-    #     pp test_ids
-    #   end
-    #
-    #   # ---
-    #   # [0, 1, 2, 3, 4]
-    #   # [5, 6, 7]
-    #   # ---
-    #   # [3, 4, 5, 6, 7]
-    #   # [0, 1, 2]
-    #   # ---
-    #   # [0, 1, 2, 5, 6, 7]
-    #   # [3, 4]
-    #
-    class GroupKFold
-      include Base::Splitter
-      # Return the number of folds.
-      # @return [Integer]
-      attr_reader :n_splits
-      # Create a new data splitter for grouped K-fold cross validation.
-      #
-      # @param n_splits [Integer] The number of folds.
-      def initialize(n_splits: 5)
-        check_params_numeric(n_splits: n_splits)
-        @n_splits = n_splits
-      end
-      # Generate data indices for grouped K-fold cross validation.
-      #
-      # @overload split(x, y, groups) -> Array
-      #   @param x [Numo::DFloat] (shape: [n_samples, n_features])
-      #     The dataset to be used to generate data indices for grouped K-fold cross validation.
-      #   @param y [Numo::Int32] (shape: [n_samples])
-      #     This argument exists to unify the interface between the K-fold methods, it is not used in the method.
-      #   @param groups [Numo::Int32] (shape: [n_samples])
-      #     The group labels to be used to generate data indices for grouped K-fold cross validation.
-      # @return [Array] The set of data indices for constructing the training and testing dataset in each fold.
-      def split(x, _y, groups)
-        x = check_convert_sample_array(x)
-        groups = check_convert_label_array(groups)
-        check_sample_label_size(x, groups)
-        encoder = Rumale::Preprocessing::LabelEncoder.new
-        groups = encoder.fit_transform(groups)
-        n_groups = encoder.classes.size
-        raise ArgumentError, 'The number of groups should be greater than or equal to the number of splits.' if n_groups < @n_splits
-        n_samples_per_group = groups.bincount
-        group_ids = n_samples_per_group.sort_index.reverse
-        n_samples_per_group = n_samples_per_group[group_ids]
-        n_samples_per_fold = Numo::Int32.zeros(@n_splits)
-        group_to_fold = Numo::Int32.zeros(n_groups)
-        n_samples_per_group.each_with_index do |weight, id|
-          min_sample_fold_id = n_samples_per_fold.min_index
-          n_samples_per_fold[min_sample_fold_id] += weight
-          group_to_fold[group_ids[id]] = min_sample_fold_id
-        end
-        n_samples = x.shape[0]
-        sample_ids = Array(0...n_samples)
-        fold_ids = group_to_fold[groups]
-        Array.new(@n_splits) do |fid|
-          test_ids = fold_ids.eq(fid).where.to_a
-          train_ids = sample_ids - test_ids
-          [train_ids, test_ids]
-        end
-      end
-    end
-  end
-end