RubyGems - rumale - Versions diffs - 0.19.1 → 0.19.2 - Mend

rumale 0.19.1 → 0.19.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (8) hide show

checksums.yaml +4 -4
data/CHANGELOG.md +5 -0
data/lib/rumale.rb +2 -0
data/lib/rumale/feature_extraction/tfidf_transformer.rb +113 -0
data/lib/rumale/preprocessing/l1_normalizer.rb +62 -0
data/lib/rumale/preprocessing/l2_normalizer.rb +2 -1
data/lib/rumale/version.rb +1 -1
metadata +4 -2

checksums.yaml CHANGED

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: f49170105721cfebcae9f1a424e9a858650d78225541a8cb63b0ad4c70734988
-  data.tar.gz: ecc35086328eee1066252e75b8cd638256039e93beebc0bce5714493fe72570b
+  metadata.gz: 1bff2e1e6182aa954be00ed107ed1bd81220298f89514b4b31304f8890ff27c4
+  data.tar.gz: '09b185f468baf9dbec6280fa6c06984c95919308f1d2247277bf30348ed392bc'
 SHA512:
-  metadata.gz: 68f432bb34ff6c8e467a91d7c7e3aa07e816c2dd8807defc9e4e82e7a720c925062dbd27c8a7ec3294ecef2d71041baead2510edaf03a1eee210dc811eede22d
-  data.tar.gz: 5854eacc12de6c3cdcdbab0f9b4e73fc64d1be0533732348da6b4d6dcb0be9f115e2415501b05148fd021fa844ac0c25adc1bb858432a02ca6fe19d30a3538c7
+  metadata.gz: 6d8f1fcaffcd6714c6156fc615d87e6b6950e82ab40fc7434cfc5a014d6c08eb0170ee7c45d8fed978c2a52f839b1ce647fd6e088cbab2ea45e517b34c88407a
+  data.tar.gz: b255ae4c24cdc91ebad59f79ee5a58c5d2a5ffa79bda0ac221e3a33bd824d2fd94e5cd83f3a06e54a2dc537a074276cea5a71651deeee2a304d23e963ff92c9d

data/CHANGELOG.md CHANGED

@@ -1,3 +1,8 @@
+# 0.19.2
+- Fix L2Normalizer to avoid zero divide.
+- Add preprocessing class for [L1Normalizer](https://yoshoku.github.io/rumale/doc/Rumale/Preprocessing/L1Normalizer.html).
+- Add transformer class for [TfidfTransformer](https://yoshoku.github.io/rumale/doc/Rumale/FeatureExtraction/TfidfTransformer.html).
 # 0.19.1
 - Add cluster analysis class for [mini-batch K-Means](https://yoshoku.github.io/rumale/doc/Rumale/Clustering/MiniBatchKMeans.html).
 - Fix some typos.

data/lib/rumale.rb CHANGED

@@ -93,7 +93,9 @@ require 'rumale/neural_network/mlp_regressor'
 require 'rumale/neural_network/mlp_classifier'
 require 'rumale/feature_extraction/hash_vectorizer'
 require 'rumale/feature_extraction/feature_hasher'
+require 'rumale/feature_extraction/tfidf_transformer'
 require 'rumale/preprocessing/l2_normalizer'
+require 'rumale/preprocessing/l1_normalizer'
 require 'rumale/preprocessing/min_max_scaler'
 require 'rumale/preprocessing/max_abs_scaler'
 require 'rumale/preprocessing/standard_scaler'

data/lib/rumale/feature_extraction/tfidf_transformer.rb ADDED

@@ -0,0 +1,113 @@
+# frozen_string_literal: true
+require 'rumale/base/base_estimator'
+require 'rumale/base/transformer'
+require 'rumale/preprocessing/l1_normalizer'
+require 'rumale/preprocessing/l2_normalizer'
+module Rumale
+  module FeatureExtraction
+    # Transform a sample matrix of term frequencies (tf) into a normalized tf-idf (term frequency-inverse document frequency) representation.
+    #
+    # @example
+    #   encoder = Rumale::FeatureExtraction::HashVectorizer.new
+    #   x = encoder.fit_transform([
+    #     { foo: 1, bar: 2 },
+    #     { foo: 3, baz: 1 }
+    #   ])
+    #
+    #   # > pp x
+    #   # Numo::DFloat#shape=[2,3]
+    #   # [[2, 0, 1],
+    #   #  [0, 1, 3]]
+    #
+    #   transformer = Rumale::FeatureExtraction::TfidfTransformer.new
+    #   x_tfidf = transformer.fit_transform(x)
+    #
+    #   # > pp x_tfidf
+    #   # Numo::DFloat#shape=[2,3]
+    #   # [[0.959056, 0, 0.283217],
+    #   #  [0, 0.491506, 0.870874]]
+    #
+    # *Reference*
+    # - Manning, C D., Raghavan, P., and Schutze, H., "Introduction to Information Retrieval," Cambridge University Press., 2008.
+    class TfidfTransformer
+      include Base::BaseEstimator
+      include Base::Transformer
+      # Return the vector consisting of the inverse document frequency of each feature (nil until fit is called with use_idf enabled).
+      # @return [Numo::DFloat] (shape: [n_features])
+      attr_reader :idf
+      # Create a new transformer for converting tf vectors to tf-idf vectors.
+      #
+      # @param norm [String] The normalization method to be used ('l1', 'l2', or 'none'). Any value other than 'l1' or 'l2' leaves the output unnormalized.
+      # @param use_idf [Boolean] The flag indicating whether to use inverse document frequency weighting.
+      # @param smooth_idf [Boolean] The flag indicating whether to apply idf smoothing by log((n_samples + 1) / (df + 1)) + 1.
+      # @param sublinear_tf [Boolean] The flag indicating whether to perform sublinear tf scaling by 1 + log(tf).
+      def initialize(norm: 'l2', use_idf: true, smooth_idf: false, sublinear_tf: false)
+        check_params_string(norm: norm)
+        check_params_boolean(use_idf: use_idf, smooth_idf: smooth_idf, sublinear_tf: sublinear_tf)
+        @params = {}
+        @params[:norm] = norm
+        @params[:use_idf] = use_idf
+        @params[:smooth_idf] = smooth_idf
+        @params[:sublinear_tf] = sublinear_tf
+        @idf = nil
+      end
+      # Calculate the inverse document frequency used for weighting. Returns self without computing anything when use_idf is false.
+      #
+      # @overload fit(x) -> TfidfTransformer
+      #
+      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to calculate the idf values.
+      # @return [TfidfTransformer]
+      def fit(x, _y = nil)
+        return self unless @params[:use_idf]
+        x = check_convert_sample_array(x)
+        n_samples = x.shape[0]
+        df = x.class.cast(x.gt(0.0).count(0))
+        if @params[:smooth_idf]
+          df += 1
+          n_samples += 1
+        end
+        @idf = Numo::NMath.log(n_samples / df) + 1
+        self
+      end
+      # Calculate the idf values, and then transform the samples to the tf-idf representation.
+      #
+      # @overload fit_transform(x) -> Numo::DFloat
+      #
+      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to calculate idf and be transformed to tf-idf representation.
+      # @return [Numo::DFloat] The transformed samples.
+      def fit_transform(x, _y = nil)
+        fit(x).transform(x)
+      end
+      # Transform the given samples to the tf-idf representation.
+      #
+      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to be transformed.
+      # @return [Numo::DFloat] The transformed samples.
+      def transform(x)
+        x = check_convert_sample_array(x)
+        z = x.dup
+        z[z.ne(0)] = Numo::NMath.log(z[z.ne(0)]) + 1 if @params[:sublinear_tf]
+        z *= @idf if @params[:use_idf]
+        case @params[:norm]
+        when 'l2'
+          z = Rumale::Preprocessing::L2Normalizer.new.fit_transform(z)
+        when 'l1'
+          z = Rumale::Preprocessing::L1Normalizer.new.fit_transform(z)
+        end
+        z
+      end
+    end
+  end
+end

data/lib/rumale/preprocessing/l1_normalizer.rb ADDED

@@ -0,0 +1,62 @@
+# frozen_string_literal: true
+require 'rumale/base/base_estimator'
+require 'rumale/base/transformer'
+module Rumale
+  module Preprocessing
+    # Normalize samples to unit L1-norm.
+    #
+    # @example
+    #   normalizer = Rumale::Preprocessing::L1Normalizer.new
+    #   new_samples = normalizer.fit_transform(samples)
+    class L1Normalizer
+      include Base::BaseEstimator
+      include Base::Transformer
+      # Return the vector consisting of the L1-norm of each sample.
+      # @return [Numo::DFloat] (shape: [n_samples])
+      attr_reader :norm_vec # :nodoc:
+      # Create a new normalizer for normalizing samples to unit L1-norm.
+      def initialize
+        @params = {}
+        @norm_vec = nil
+      end
+      # Calculate the L1-norm of each sample. Norms equal to zero are replaced with 1 so that dividing by them leaves all-zero samples unchanged.
+      #
+      # @overload fit(x) -> L1Normalizer
+      #
+      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to calculate L1-norms.
+      # @return [L1Normalizer]
+      def fit(x, _y = nil)
+        x = check_convert_sample_array(x)
+        @norm_vec = x.abs.sum(1)
+        @norm_vec[@norm_vec.eq(0)] = 1
+        self
+      end
+      # Calculate the L1-norm of each sample, and then normalize the samples to unit L1-norm.
+      #
+      # @overload fit_transform(x) -> Numo::DFloat
+      #
+      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to calculate L1-norms.
+      # @return [Numo::DFloat] The normalized samples.
+      def fit_transform(x, _y = nil)
+        x = check_convert_sample_array(x)
+        fit(x)
+        x / @norm_vec.expand_dims(1)
+      end
+      # Calculate the L1-norm of each sample, and then normalize the samples to unit L1-norm.
+      # This method calls the fit_transform method. This method exists for the Pipeline class.
+      #
+      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to calculate L1-norms.
+      # @return [Numo::DFloat] The normalized samples.
+      def transform(x)
+        fit_transform(x)
+      end
+    end
+  end
+end

data/lib/rumale/preprocessing/l2_normalizer.rb CHANGED

@@ -34,6 +34,7 @@ module Rumale
       def fit(x, _y = nil)
         x = check_convert_sample_array(x)
         @norm_vec = Numo::NMath.sqrt((x**2).sum(1))
+        @norm_vec[@norm_vec.eq(0)] = 1
         self
       end
@@ -46,7 +47,7 @@ module Rumale
       def fit_transform(x, _y = nil)
         x = check_convert_sample_array(x)
         fit(x)
-        x / @norm_vec.tile(x.shape[1], 1).transpose
+        x / @norm_vec.expand_dims(1)
       end
       # Calculate L2-norms of each sample, and then normalize samples to unit L2-norm.

data/lib/rumale/version.rb CHANGED

@@ -3,5 +3,5 @@
 # Rumale is a machine learning library in Ruby.
 module Rumale
   # The version of Rumale you are using.
-  VERSION = '0.19.1'
+  VERSION = '0.19.2'
 end

metadata CHANGED

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: rumale
 version: !ruby/object:Gem::Version
-  version: 0.19.1
+  version: 0.19.2
 platform: ruby
 authors:
 - yoshoku
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2020-06-06 00:00:00.000000000 Z
+date: 2020-06-20 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: numo-narray
@@ -113,6 +113,7 @@ files:
 - lib/rumale/evaluation_measure/silhouette_score.rb
 - lib/rumale/feature_extraction/feature_hasher.rb
 - lib/rumale/feature_extraction/hash_vectorizer.rb
+- lib/rumale/feature_extraction/tfidf_transformer.rb
 - lib/rumale/kernel_approximation/nystroem.rb
 - lib/rumale/kernel_approximation/rbf.rb
 - lib/rumale/kernel_machine/kernel_fda.rb
@@ -166,6 +167,7 @@ files:
 - lib/rumale/polynomial_model/factorization_machine_classifier.rb
 - lib/rumale/polynomial_model/factorization_machine_regressor.rb
 - lib/rumale/preprocessing/bin_discretizer.rb
+- lib/rumale/preprocessing/l1_normalizer.rb
 - lib/rumale/preprocessing/l2_normalizer.rb
 - lib/rumale/preprocessing/label_binarizer.rb
 - lib/rumale/preprocessing/label_encoder.rb