RubyGems - rumale - Versions diffs - 0.18.7 → 0.20.0 - Mend

rumale 0.18.7 → 0.20.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (42) hide show

checksums.yaml +4 -4
data/.rubocop.yml +66 -1
data/CHANGELOG.md +46 -0
data/Gemfile +2 -0
data/README.md +5 -36
data/lib/rumale.rb +5 -10
data/lib/rumale/clustering/hdbscan.rb +1 -1
data/lib/rumale/clustering/k_means.rb +1 -1
data/lib/rumale/clustering/k_medoids.rb +1 -1
data/lib/rumale/clustering/mini_batch_k_means.rb +139 -0
data/lib/rumale/dataset.rb +3 -3
data/lib/rumale/decomposition/pca.rb +23 -5
data/lib/rumale/feature_extraction/feature_hasher.rb +14 -1
data/lib/rumale/feature_extraction/tfidf_transformer.rb +113 -0
data/lib/rumale/kernel_approximation/nystroem.rb +1 -1
data/lib/rumale/kernel_machine/kernel_svc.rb +1 -1
data/lib/rumale/linear_model/base_sgd.rb +1 -1
data/lib/rumale/metric_learning/neighbourhood_component_analysis.rb +13 -1
data/lib/rumale/model_selection/cross_validation.rb +3 -2
data/lib/rumale/model_selection/k_fold.rb +1 -1
data/lib/rumale/model_selection/shuffle_split.rb +1 -1
data/lib/rumale/multiclass/one_vs_rest_classifier.rb +2 -2
data/lib/rumale/nearest_neighbors/vp_tree.rb +1 -1
data/lib/rumale/neural_network/adam.rb +1 -1
data/lib/rumale/neural_network/base_mlp.rb +1 -1
data/lib/rumale/preprocessing/binarizer.rb +60 -0
data/lib/rumale/preprocessing/l1_normalizer.rb +62 -0
data/lib/rumale/preprocessing/l2_normalizer.rb +2 -1
data/lib/rumale/preprocessing/max_normalizer.rb +62 -0
data/lib/rumale/version.rb +1 -1
data/rumale.gemspec +1 -3
metadata +11 -44
data/lib/rumale/linear_model/base_linear_model.rb +0 -101
data/lib/rumale/optimizer/ada_grad.rb +0 -39
data/lib/rumale/optimizer/adam.rb +0 -53
data/lib/rumale/optimizer/nadam.rb +0 -62
data/lib/rumale/optimizer/rmsprop.rb +0 -47
data/lib/rumale/optimizer/sgd.rb +0 -43
data/lib/rumale/optimizer/yellow_fin.rb +0 -101
data/lib/rumale/polynomial_model/base_factorization_machine.rb +0 -121
data/lib/rumale/polynomial_model/factorization_machine_classifier.rb +0 -215
data/lib/rumale/polynomial_model/factorization_machine_regressor.rb +0 -129

checksums.yaml CHANGED

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 5e3069531e5acbdaab178769d20684a0fa260e29f6c39a645632b903fff8cce0
-  data.tar.gz: d39c7e61a20b1bce23ccbb9d809bb06f1babce101f509c78c6da2a64e95f180f
+  metadata.gz: 358515f8785eb3de2e6571a957ca76cece6b774bb022c1a0951c92d44ab422b4
+  data.tar.gz: '0289b7eb382cd3300845412af0fd43626f4f827bb719083c879b574e3ab37eb0'
 SHA512:
-  metadata.gz: eb9077b26d63f153eefd4c68ea57083e12b6a465d06864da8b24a3f9d2aff907b8de350ade5c96e8e9ec28997424839b91ac884d787a7bff7c2a44d212addd81
-  data.tar.gz: 7d94c6d80e16ed405f87a7c777b4922863e42922a5b33046df4bc42d9daa5f5243ebb6c5492cb2d20120215cdce9a5c4b0ac3156012bf38cf91e7717b2c51c22
+  metadata.gz: f03fc0f27f99ed4acea3fb7d7bf34017c1dbf923b20dabc9a78d6d44f0b151bc9dc78ba24d122f81607a43fd1852e398a603b75b87656a2f79109f87c0db0d98
+  data.tar.gz: 69f6b8892f6bfb4c43706513245c3fba687dcb6a347c1c5185a70d5e45a024b2848a019bfae48726e1f49212878e8d6d67c811ec5f4a990fdbb3a2841efdfe9b

data/.rubocop.yml CHANGED

@@ -24,6 +24,15 @@ Style/HashTransformKeys:
 Style/HashTransformValues:
   Enabled: true
+Lint/DeprecatedOpenSSLConstant:
+  Enabled: true
+Lint/DuplicateElsifCondition:
+  Enabled: true
+Lint/MixedRegexpCaptureTypes:
+  Enabled: true
 Lint/RaiseException:
   Enabled: true
@@ -34,7 +43,6 @@ Layout/LineLength:
   Max: 145
   IgnoredPatterns: ['(\A|\s)#']
 Metrics/ModuleLength:
   Max: 200
@@ -70,15 +78,48 @@ Naming/MethodParameterName:
 Naming/ConstantName:
   Enabled: false
+Style/AccessorGrouping:
+  Enabled: true
+Style/ArrayCoercion:
+  Enabled: true
+Style/BisectedAttrAccessor:
+  Enabled: true
+Style/CaseLikeIf:
+  Enabled: true
 Style/ExponentialNotation:
   Enabled: true
 Style/FormatStringToken:
   Enabled: false
+Style/HashAsLastArrayItem:
+  Enabled: true
+Style/HashLikeCase:
+  Enabled: true
 Style/NumericLiterals:
   Enabled: false
+Style/RedundantAssignment:
+  Enabled: true
+Style/RedundantFetchBlock:
+  Enabled: true
+Style/RedundantFileExtensionInRequire:
+  Enabled: true
+Style/RedundantRegexpCharacterClass:
+  Enabled: true
+Style/RedundantRegexpEscape:
+  Enabled: true
 Style/SlicingWithRange:
   Enabled: true
@@ -91,6 +132,30 @@ Layout/EmptyLinesAroundAttributeAccessor:
 Layout/SpaceAroundMethodCallOperator:
   Enabled: true
+Performance/AncestorsInclude:
+  Enabled: true
+Performance/BigDecimalWithNumericArgument:
+  Enabled: true
+Performance/RedundantSortBlock:
+  Enabled: true
+Performance/RedundantStringChars:
+  Enabled: true
+Performance/ReverseFirst:
+  Enabled: true
+Performance/SortReverse:
+  Enabled: true
+Performance/Squeeze:
+  Enabled: true
+Performance/StringInclude:
+  Enabled: true
 RSpec/MultipleExpectations:
   Enabled: false

data/CHANGELOG.md CHANGED

@@ -1,3 +1,49 @@
+# 0.20.0
+## Breaking changes
+- Delete deprecated estimators such as PolynomialModel, Optimizer, and BaseLinearModel.
+# 0.19.3
+- Add preprocessing class for [Binarizer](https://yoshoku.github.io/rumale/doc/Rumale/Preprocessing/Binarizer.html)
+- Add preprocessing class for [MaxNormalizer](https://yoshoku.github.io/rumale/doc/Rumale/Preprocessing/MaxNormalizer.html)
+- Refactor some codes with Rubocop.
+# 0.19.2
+- Fix L2Normalizer to avoid zero divide.
+- Add preprocessing class for [L1Normalizer](https://yoshoku.github.io/rumale/doc/Rumale/Preprocessing/L1Normalizer.html).
+- Add transformer class for [TfidfTransformer](https://yoshoku.github.io/rumale/doc/Rumale/FeatureExtraction/TfidfTransformer.html).
+# 0.19.1
+- Add cluster analysis class for [mini-batch K-Means](https://yoshoku.github.io/rumale/doc/Rumale/Clustering/MiniBatchKMeans.html).
+- Fix some typos.
+# 0.19.0
+## Breaking changes
+- Change mmh3 and mopti gem to non-runtime dependent library.
+  - The mmh3 gem is used in [FeatureHasher](https://yoshoku.github.io/rumale/doc/Rumale/FeatureExtraction/FeatureHasher.html).
+  You only need to require mmh3 gem when using FeatureHasher.
+    ```ruby
+    require 'mmh3'
+    require 'rumale'
+    encoder = Rumale::FeatureExtraction::FeatureHasher.new
+    ```
+  - The mopti gem is used in [NeighbourhoodComponentAnalysis](https://yoshoku.github.io/rumale/doc/Rumale/MetricLearning/NeighbourhoodComponentAnalysis.html).
+  You only need to require mopti gem when using NeighbourhoodComponentAnalysis.
+    ```ruby
+    require 'mopti'
+    require 'rumale'
+    transformer = Rumale::MetricLearning::NeighbourhoodComponentAnalysis.new
+    ```
+- Change the default value of solver parameter on [PCA](https://yoshoku.github.io/rumale/doc/Rumale/Decomposition/PCA.html) to 'auto'.
+If Numo::Linalg is loaded, 'evd' is selected for the solver, otherwise 'fpt' is selected.
+- Deprecate [PolynomialModel](https://yoshoku.github.io/rumale/doc/Rumale/PolynomialModel.html), [Optimizer](https://yoshoku.github.io/rumale/doc/Rumale/Optimizer.html), and the estimators contained in them. They will be deleted in version 0.20.0.
+  - Many machine learning libraries do not contain factorization machine algorithms, they are provided by another compatible library.
+  In addition, there are no plans to implement estimators in PolynomialModel.
+  Thus, the author decided to deprecate PolynomialModel.
+  - Currently, the Optimizer classes are only used by PolynomialModel estimators.
+  Therefore, they have been deprecated together with PolynomialModel.
 # 0.18.7
 - Fix to convert target_name to string array in [classification_report method](https://yoshoku.github.io/rumale/doc/Rumale/EvaluationMeasure.html#classification_report-class_method).
 - Refactor some codes with Rubocop.

data/Gemfile CHANGED

@@ -4,6 +4,8 @@ source 'https://rubygems.org'
 gemspec
 gem 'coveralls', '~> 0.8'
+gem 'mmh3', '>= 1.0'
+gem 'mopti', '>= 0.1.0'
 gem 'numo-linalg', '>= 0.1.4'
 gem 'parallel', '>= 1.17.0'
 gem 'rake', '~> 12.0'

data/README.md CHANGED

@@ -11,7 +11,7 @@
 Rumale (**Ru**by **ma**chine **le**arning) is a machine learning library in Ruby.
 Rumale provides machine learning algorithms with interfaces similar to Scikit-Learn in Python.
 Rumale supports Support Vector Machine,
-Logistic Regression, Ridge, Lasso, Factorization Machine,
+Logistic Regression, Ridge, Lasso,
 Multi-layer Perceptron,
 Naive Bayes, Decision Tree, Gradient Tree Boosting, Random Forest,
 K-Means, Gaussian Mixture Model, DBSCAN, Spectral Clustering,
@@ -42,39 +42,7 @@ Or install it yourself as:
 ## Usage
-### Example 1. XOR data
-First, let's classify simple xor data.
-```ruby
-require 'rumale'
-# Prepare XOR data.
-samples = [[0, 0], [0, 1], [1, 0], [1, 1]]
-labels = [0, 1, 1, 0]
-# Train classifier with nearest neighbor rule.
-estimator = Rumale::NearestNeighbors::KNeighborsClassifier.new(n_neighbors: 1)
-estimator.fit(samples, labels)
-# Predict labels.
-p labels
-p estimator.predict(samples)
-```
-Execution of the above script result in the following.
-```ruby
-[0, 1, 1, 0]
-Numo::Int32#shape=[4]
-[0, 1, 1, 0]
-```
-The basic usage of Rumale is to first train the model with the fit method
-and then estimate with the predict method.
-In addition, Rumale recommends using arrays such as feature vectors and labels with
-[Numo::NArray](https://github.com/ruby-numo/numo-narray).
-### Example 2. Pendigits dataset classification
+### Example 1. Pendigits dataset classification
 Rumale provides function loading libsvm format dataset file.
 We start by downloading the pendigits dataset from LIBSVM Data web site.
@@ -137,7 +105,7 @@ $ ruby test.rb
 Accuracy: 98.7%
 ```
-### Example 3. Cross-validation
+### Example 2. Cross-validation
 ```ruby
 require 'rumale'
@@ -168,7 +136,7 @@ $ ruby cross_validation.rb
 5-CV mean log-loss: 0.355
 ```
-### Example 4. Pipeline
+### Example 3. Pipeline
 ```ruby
 require 'rumale'
@@ -203,6 +171,7 @@ $ ruby pipeline.rb
 ## Speed up
 ### Numo::Linalg
+Rumale uses [Numo::NArray](https://github.com/ruby-numo/numo-narray) for typed arrays.
 Loading the [Numo::Linalg](https://github.com/ruby-numo/numo-linalg) allows to perform matrix product of Numo::NArray using BLAS libraries.
 For example, using the [OpenBLAS](https://github.com/xianyi/OpenBLAS) speeds up many estimators in Rumale.

data/lib/rumale.rb CHANGED

@@ -18,17 +18,10 @@ require 'rumale/base/cluster_analyzer'
 require 'rumale/base/transformer'
 require 'rumale/base/splitter'
 require 'rumale/base/evaluator'
-require 'rumale/optimizer/sgd'
-require 'rumale/optimizer/ada_grad'
-require 'rumale/optimizer/rmsprop'
-require 'rumale/optimizer/adam'
-require 'rumale/optimizer/nadam'
-require 'rumale/optimizer/yellow_fin'
 require 'rumale/pipeline/pipeline'
 require 'rumale/pipeline/feature_union'
 require 'rumale/kernel_approximation/rbf'
 require 'rumale/kernel_approximation/nystroem'
-require 'rumale/linear_model/base_linear_model'
 require 'rumale/linear_model/base_sgd'
 require 'rumale/linear_model/svc'
 require 'rumale/linear_model/svr'
@@ -41,9 +34,6 @@ require 'rumale/kernel_machine/kernel_svc'
 require 'rumale/kernel_machine/kernel_pca'
 require 'rumale/kernel_machine/kernel_fda'
 require 'rumale/kernel_machine/kernel_ridge'
-require 'rumale/polynomial_model/base_factorization_machine'
-require 'rumale/polynomial_model/factorization_machine_classifier'
-require 'rumale/polynomial_model/factorization_machine_regressor'
 require 'rumale/multiclass/one_vs_rest_classifier'
 require 'rumale/nearest_neighbors/vp_tree'
 require 'rumale/nearest_neighbors/k_neighbors_classifier'
@@ -70,6 +60,7 @@ require 'rumale/ensemble/random_forest_regressor'
 require 'rumale/ensemble/extra_trees_classifier'
 require 'rumale/ensemble/extra_trees_regressor'
 require 'rumale/clustering/k_means'
+require 'rumale/clustering/mini_batch_k_means'
 require 'rumale/clustering/k_medoids'
 require 'rumale/clustering/gaussian_mixture'
 require 'rumale/clustering/dbscan'
@@ -92,7 +83,10 @@ require 'rumale/neural_network/mlp_regressor'
 require 'rumale/neural_network/mlp_classifier'
 require 'rumale/feature_extraction/hash_vectorizer'
 require 'rumale/feature_extraction/feature_hasher'
+require 'rumale/feature_extraction/tfidf_transformer'
 require 'rumale/preprocessing/l2_normalizer'
+require 'rumale/preprocessing/l1_normalizer'
+require 'rumale/preprocessing/max_normalizer'
 require 'rumale/preprocessing/min_max_scaler'
 require 'rumale/preprocessing/max_abs_scaler'
 require 'rumale/preprocessing/standard_scaler'
@@ -101,6 +95,7 @@ require 'rumale/preprocessing/label_binarizer'
 require 'rumale/preprocessing/label_encoder'
 require 'rumale/preprocessing/one_hot_encoder'
 require 'rumale/preprocessing/ordinal_encoder'
+require 'rumale/preprocessing/binarizer'
 require 'rumale/preprocessing/polynomial_features'
 require 'rumale/model_selection/k_fold'
 require 'rumale/model_selection/stratified_k_fold'

data/lib/rumale/clustering/hdbscan.rb CHANGED

@@ -232,7 +232,7 @@ module Rumale
       end
       def flatten(tree, stabilities)
-        node_ids = stabilities.keys.sort { |a, b| b <=> a }.slice(0, stabilities.size - 1)
+        node_ids = stabilities.keys.sort.reverse.slice(0, stabilities.size - 1)
         cluster_tree = tree.select { |edge| edge.n_elements > 1 }
         is_cluster = node_ids.each_with_object({}) { |n_id, h| h[n_id] = true }

data/lib/rumale/clustering/k_means.rb CHANGED

@@ -103,7 +103,7 @@ module Rumale
         # random initialize
         n_samples = x.shape[0]
         sub_rng = @rng.dup
-        rand_id = [*0...n_samples].sample(@params[:n_clusters], random: sub_rng)
+        rand_id = Array(0...n_samples).sample(@params[:n_clusters], random: sub_rng)
         @cluster_centers = x[rand_id, true].dup
         return unless @params[:init] == 'k-means++'

data/lib/rumale/clustering/k_medoids.rb CHANGED

@@ -124,7 +124,7 @@ module Rumale
         # random initialize
         n_samples = distance_mat.shape[0]
         sub_rng = @rng.dup
-        @medoid_ids = Numo::Int32.asarray([*0...n_samples].sample(@params[:n_clusters], random: sub_rng))
+        @medoid_ids = Numo::Int32.asarray(Array(0...n_samples).sample(@params[:n_clusters], random: sub_rng))
         return unless @params[:init] == 'k-means++'
         # k-means++ initialize

data/lib/rumale/clustering/mini_batch_k_means.rb ADDED

@@ -0,0 +1,139 @@
+# frozen_string_literal: true
+require 'rumale/base/base_estimator'
+require 'rumale/base/cluster_analyzer'
+require 'rumale/pairwise_metric'
+module Rumale
+  module Clustering
+    # MiniBatchKMeans is a class that implements K-Means cluster analysis
+    # with mini-batch stochastic gradient descent (SGD).
+    #
+    # @example
+    #   analyzer = Rumale::Clustering::MiniBatchKMeans.new(n_clusters: 10, max_iter: 50, batch_size: 50, random_seed: 1)
+    #   cluster_labels = analyzer.fit_predict(samples)
+    #
+    # *Reference*
+    # - Sculley, D., "Web-scale k-means clustering," Proc. WWW'10, pp. 1177--1178, 2010.
+    class MiniBatchKMeans
+      include Base::BaseEstimator
+      include Base::ClusterAnalyzer
+      # Return the centroids.
+      # @return [Numo::DFloat] (shape: [n_clusters, n_features])
+      attr_reader :cluster_centers
+      # Return the random generator.
+      # @return [Random]
+      attr_reader :rng
+      # Create a new cluster analyzer with K-Means method with mini-batch SGD.
+      #
+      # @param n_clusters [Integer] The number of clusters.
+      # @param init [String] The initialization method for centroids ('random' or 'k-means++').
+      # @param max_iter [Integer] The maximum number of iterations.
+      # @param batch_size [Integer] The size of the mini batches.
+      # @param tol [Float] The tolerance of termination criterion.
+      # @param random_seed [Integer] The seed value used to initialize the random generator.
+      def initialize(n_clusters: 8, init: 'k-means++', max_iter: 100, batch_size: 100, tol: 1.0e-4, random_seed: nil)
+        check_params_numeric(n_clusters: n_clusters, max_iter: max_iter, batch_size: batch_size, tol: tol)
+        check_params_string(init: init)
+        check_params_numeric_or_nil(random_seed: random_seed)
+        check_params_positive(n_clusters: n_clusters, max_iter: max_iter)
+        @params = {}
+        @params[:n_clusters] = n_clusters
+        @params[:init] = init == 'random' ? 'random' : 'k-means++'
+        @params[:max_iter] = max_iter
+        @params[:batch_size] = batch_size
+        @params[:tol] = tol
+        @params[:random_seed] = random_seed
+        @params[:random_seed] ||= srand
+        @cluster_centers = nil
+        @rng = Random.new(@params[:random_seed])
+      end
+      # Analyze clusters with given training data.
+      #
+      # @overload fit(x) -> MiniBatchKMeans
+      #
+      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for cluster analysis.
+      # @return [MiniBatchKMeans] The learned cluster analyzer itself.
+      def fit(x, _y = nil)
+        x = check_convert_sample_array(x)
+        # initialization.
+        n_samples = x.shape[0]
+        update_counter = Numo::Int32.zeros(@params[:n_clusters])
+        sub_rng = @rng.dup
+        init_cluster_centers(x, sub_rng)
+        # optimization with mini-batch sgd.
+        @params[:max_iter].times do |_t|
+          sample_ids = Array(0...n_samples).shuffle(random: sub_rng)
+          old_centers = @cluster_centers.dup
+          until (subset_ids = sample_ids.shift(@params[:batch_size])).empty?
+            # sub sampling
+            sub_x = x[subset_ids, true]
+            # assign nearest centroids
+            cluster_labels = assign_cluster(sub_x)
+            # update centroids
+            @params[:n_clusters].times do |c|
+              assigned_bits = cluster_labels.eq(c)
+              next unless assigned_bits.count.positive?
+              update_counter[c] += 1
+              learning_rate = 1.fdiv(update_counter[c])
+              update = sub_x[assigned_bits.where, true].mean(axis: 0)
+              @cluster_centers[c, true] = (1 - learning_rate) * @cluster_centers[c, true] + learning_rate * update
+            end
+          end
+          error = Numo::NMath.sqrt(((old_centers - @cluster_centers)**2).sum(axis: 1)).mean
+          break if error <= @params[:tol]
+        end
+        self
+      end
+      # Predict cluster labels for samples.
+      #
+      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the cluster label.
+      # @return [Numo::Int32] (shape: [n_samples]) Predicted cluster label per sample.
+      def predict(x)
+        x = check_convert_sample_array(x)
+        assign_cluster(x)
+      end
+      # Analyze clusters and assign samples to clusters.
+      #
+      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for cluster analysis.
+      # @return [Numo::Int32] (shape: [n_samples]) Predicted cluster label per sample.
+      def fit_predict(x)
+        x = check_convert_sample_array(x)
+        fit(x)
+        predict(x)
+      end
+      private
+      def assign_cluster(x)
+        distance_matrix = PairwiseMetric.euclidean_distance(x, @cluster_centers)
+        distance_matrix.min_index(axis: 1) - Numo::Int32[*0.step(distance_matrix.size - 1, @cluster_centers.shape[0])]
+      end
+      def init_cluster_centers(x, sub_rng)
+        # random initialize
+        n_samples = x.shape[0]
+        rand_id = Array(0...n_samples).sample(@params[:n_clusters], random: sub_rng)
+        @cluster_centers = x[rand_id, true].dup
+        return unless @params[:init] == 'k-means++'
+        # k-means++ initialize
+        (1...@params[:n_clusters]).each do |n|
+          distance_matrix = PairwiseMetric.euclidean_distance(x, @cluster_centers[0...n, true])
+          min_distances = distance_matrix.flatten[distance_matrix.min_index(axis: 1)]
+          probs = min_distances**2 / (min_distances**2).sum
+          cum_probs = probs.cumsum
+          selected_id = cum_probs.gt(sub_rng.rand).where.to_a.first
+          @cluster_centers[n, true] = x[selected_id, true].dup
+        end
+      end
+    end
+  end
+end