RubyGems - rumale - Versions diffs - 0.19.1 → 0.19.2 - Mend

rumale 0.19.1 → 0.19.2

Files changed (8) hide show

checksums.yaml +4 -4
data/CHANGELOG.md +5 -0
data/lib/rumale.rb +2 -0
data/lib/rumale/feature_extraction/tfidf_transformer.rb +113 -0
data/lib/rumale/preprocessing/l1_normalizer.rb +62 -0
data/lib/rumale/preprocessing/l2_normalizer.rb +2 -1
data/lib/rumale/version.rb +1 -1
metadata +4 -2

checksums.yaml CHANGED

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: f49170105721cfebcae9f1a424e9a858650d78225541a8cb63b0ad4c70734988
-  data.tar.gz: ecc35086328eee1066252e75b8cd638256039e93beebc0bce5714493fe72570b
+  metadata.gz: 1bff2e1e6182aa954be00ed107ed1bd81220298f89514b4b31304f8890ff27c4
+  data.tar.gz: '09b185f468baf9dbec6280fa6c06984c95919308f1d2247277bf30348ed392bc'
 SHA512:
-  metadata.gz: 68f432bb34ff6c8e467a91d7c7e3aa07e816c2dd8807defc9e4e82e7a720c925062dbd27c8a7ec3294ecef2d71041baead2510edaf03a1eee210dc811eede22d
-  data.tar.gz: 5854eacc12de6c3cdcdbab0f9b4e73fc64d1be0533732348da6b4d6dcb0be9f115e2415501b05148fd021fa844ac0c25adc1bb858432a02ca6fe19d30a3538c7
+  metadata.gz: 6d8f1fcaffcd6714c6156fc615d87e6b6950e82ab40fc7434cfc5a014d6c08eb0170ee7c45d8fed978c2a52f839b1ce647fd6e088cbab2ea45e517b34c88407a
+  data.tar.gz: b255ae4c24cdc91ebad59f79ee5a58c5d2a5ffa79bda0ac221e3a33bd824d2fd94e5cd83f3a06e54a2dc537a074276cea5a71651deeee2a304d23e963ff92c9d

data/CHANGELOG.md CHANGED

@@ -1,3 +1,8 @@
+# 0.19.2
+- Fix L2Normalizer to avoid zero divide.
+- Add preprocessing class for [L1Normalizer](https://yoshoku.github.io/rumale/doc/Rumale/Preprocessing/L1Normalizer.html).
+- Add transformer class for [TfidfTransformer](https://yoshoku.github.io/rumale/doc/Rumale/FeatureExtraction/TfidfTransformer.html).
 # 0.19.1
 - Add cluster analysis class for [mini-batch K-Means](https://yoshoku.github.io/rumale/doc/Rumale/Clustering/MiniBatchKMeans.html).
 - Fix some typos.

data/lib/rumale.rb CHANGED

@@ -93,7 +93,9 @@ require 'rumale/neural_network/mlp_regressor'
 require 'rumale/neural_network/mlp_classifier'
 require 'rumale/feature_extraction/hash_vectorizer'
 require 'rumale/feature_extraction/feature_hasher'
+require 'rumale/feature_extraction/tfidf_transformer'
 require 'rumale/preprocessing/l2_normalizer'
+require 'rumale/preprocessing/l1_normalizer'
 require 'rumale/preprocessing/min_max_scaler'
 require 'rumale/preprocessing/max_abs_scaler'
 require 'rumale/preprocessing/standard_scaler'

data/lib/rumale/feature_extraction/tfidf_transformer.rb ADDED

@@ -0,0 +1,113 @@
+# frozen_string_literal: true
+require 'rumale/base/base_estimator'
+require 'rumale/base/transformer'
+require 'rumale/preprocessing/l1_normalizer'
+require 'rumale/preprocessing/l2_normalizer'
+module Rumale
+  module FeatureExtraction
+    # Transform sample matrix with term frequency (tf) to a normalized tf-idf (inverse document frequency) representation.
+    #
+    # @example
+    #   encoder = Rumale::FeatureExtraction::HashVectorizer.new
+    #   x = encoder.fit_transform([
+    #     { foo: 1, bar: 2 },
+    #     { foo: 3, baz: 1 }
+    #   ])
+    #
+    #   # > pp x
+    #   # Numo::DFloat#shape=[2,3]
+    #   # [[2, 0, 1],
+    #   #  [0, 1, 3]]
+    #
+    #   transformer = Rumale::FeatureExtraction::TfidfTransformer.new
+    #   x_tfidf = transformer.fit_transform(x)
+    #
+    #   # > pp x_tfidf
+    #   # Numo::DFloat#shape=[2,3]
+    #   # [[0.959056, 0, 0.283217],
+    #   #  [0, 0.491506, 0.870874]]
+    #
+    # *Reference*
+    # - Manning, C D., Raghavan, P., and Schutze, H., "Introduction to Information Retrieval," Cambridge University Press., 2008.
+    class TfidfTransformer
+      include Base::BaseEstimator
+      include Base::Transformer
+      # Return the vector consisting of the inverse document frequency of each feature.
+      # @return [Numo::DFloat] (shape: [n_features])
+      attr_reader :idf
+      # Create a new transformer for converting tf vectors to tf-idf vectors.
+      #
+      # @param norm [String] The normalization method to be used ('l1', 'l2' and 'none').
+      # @param use_idf [Boolean] The flag indicating whether to use inverse document frequency weighting.
+      # @param smooth_idf [Boolean] The flag indicating whether to apply idf smoothing by log((n_samples + 1) / (df + 1)) + 1.
+      # @param sublinear_tf [Boolean] The flag indicating whether to perform sublinear tf scaling by 1 + log(tf).
+      def initialize(norm: 'l2', use_idf: true, smooth_idf: false, sublinear_tf: false)
+        check_params_string(norm: norm)
+        check_params_boolean(use_idf: use_idf, smooth_idf: smooth_idf, sublinear_tf: sublinear_tf)
+        @params = {}
+        @params[:norm] = norm
+        @params[:use_idf] = use_idf
+        @params[:smooth_idf] = smooth_idf
+        @params[:sublinear_tf] = sublinear_tf
+        @idf = nil
+      end
+      # Calculate the inverse document frequency for weighting.
+      #
+      # @overload fit(x) -> TfidfTransformer
+      #
+      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to calculate the idf values.
+      # @return [TfidfTransformer]
+      def fit(x, _y = nil)
+        return self unless @params[:use_idf]
+        x = check_convert_sample_array(x)
+        n_samples = x.shape[0]
+        df = x.class.cast(x.gt(0.0).count(0))
+        if @params[:smooth_idf]
+          df += 1
+          n_samples += 1
+        end
+        @idf = Numo::NMath.log(n_samples / df) + 1
+        self
+      end
+      # Calculate the idf values, and then transform samples to the tf-idf representation.
+      #
+      # @overload fit_transform(x) -> Numo::DFloat
+      #
+      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to calculate idf and be transformed to tf-idf representation.
+      # @return [Numo::DFloat] The transformed samples.
+      def fit_transform(x, _y = nil)
+        fit(x).transform(x)
+      end
+      # Perform transforming the given samples to the tf-idf representation.
+      #
+      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to be transformed.
+      # @return [Numo::DFloat] The transformed samples.
+      def transform(x)
+        x = check_convert_sample_array(x)
+        z = x.dup
+        z[z.ne(0)] = Numo::NMath.log(z[z.ne(0)]) + 1 if @params[:sublinear_tf]
+        z *= @idf if @params[:use_idf]
+        case @params[:norm]
+        when 'l2'
+          z = Rumale::Preprocessing::L2Normalizer.new.fit_transform(z)
+        when 'l1'
+          z = Rumale::Preprocessing::L1Normalizer.new.fit_transform(z)
+        end
+        z
+      end
+    end
+  end
+end

data/lib/rumale/preprocessing/l1_normalizer.rb ADDED

@@ -0,0 +1,62 @@
+# frozen_string_literal: true
+require 'rumale/base/base_estimator'
+require 'rumale/base/transformer'
+module Rumale
+  module Preprocessing
+    # Normalize samples to unit L1-norm.
+    #
+    # @example
+    #   normalizer = Rumale::Preprocessing::L1Normalizer.new
+    #   new_samples = normalizer.fit_transform(samples)
+    class L1Normalizer
+      include Base::BaseEstimator
+      include Base::Transformer
+      # Return the vector consisting of the L1-norm of each sample.
+      # @return [Numo::DFloat] (shape: [n_samples])
+      attr_reader :norm_vec # :nodoc:
+      # Create a new normalizer for normalizing to unit L1-norm.
+      def initialize
+        @params = {}
+        @norm_vec = nil
+      end
+      # Calculate L1-norms of each sample.
+      #
+      # @overload fit(x) -> L1Normalizer
+      #
+      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to calculate L1-norms.
+      # @return [L1Normalizer]
+      def fit(x, _y = nil)
+        x = check_convert_sample_array(x)
+        @norm_vec = x.abs.sum(1)
+        @norm_vec[@norm_vec.eq(0)] = 1
+        self
+      end
+      # Calculate L1-norms of each sample, and then normalize samples to unit L1-norm.
+      #
+      # @overload fit_transform(x) -> Numo::DFloat
+      #
+      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to calculate L1-norms.
+      # @return [Numo::DFloat] The normalized samples.
+      def fit_transform(x, _y = nil)
+        x = check_convert_sample_array(x)
+        fit(x)
+        x / @norm_vec.expand_dims(1)
+      end
+      # Calculate L1-norms of each sample, and then normalize samples to unit L1-norm.
+      # This method calls the fit_transform method. This method exists for the Pipeline class.
+      #
+      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to calculate L1-norms.
+      # @return [Numo::DFloat] The normalized samples.
+      def transform(x)
+        fit_transform(x)
+      end
+    end
+  end
+end

data/lib/rumale/preprocessing/l2_normalizer.rb CHANGED

@@ -34,6 +34,7 @@ module Rumale
       def fit(x, _y = nil)
         x = check_convert_sample_array(x)
         @norm_vec = Numo::NMath.sqrt((x**2).sum(1))
+        @norm_vec[@norm_vec.eq(0)] = 1
         self
       end
@@ -46,7 +47,7 @@ module Rumale
       def fit_transform(x, _y = nil)
         x = check_convert_sample_array(x)
         fit(x)
-        x / @norm_vec.tile(x.shape[1], 1).transpose
+        x / @norm_vec.expand_dims(1)
       end
       # Calculate L2-norms of each sample, and then normalize samples to unit L2-norm.

data/lib/rumale/version.rb CHANGED

@@ -3,5 +3,5 @@
 # Rumale is a machine learning library in Ruby.
 module Rumale
   # The version of Rumale you are using.
-  VERSION = '0.19.1'
+  VERSION = '0.19.2'
 end

metadata CHANGED

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: rumale
 version: !ruby/object:Gem::Version
-  version: 0.19.1
+  version: 0.19.2
 platform: ruby
 authors:
 - yoshoku
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2020-06-06 00:00:00.000000000 Z
+date: 2020-06-20 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: numo-narray
@@ -113,6 +113,7 @@ files:
 - lib/rumale/evaluation_measure/silhouette_score.rb
 - lib/rumale/feature_extraction/feature_hasher.rb
 - lib/rumale/feature_extraction/hash_vectorizer.rb
+- lib/rumale/feature_extraction/tfidf_transformer.rb
 - lib/rumale/kernel_approximation/nystroem.rb
 - lib/rumale/kernel_approximation/rbf.rb
 - lib/rumale/kernel_machine/kernel_fda.rb
@@ -166,6 +167,7 @@ files:
 - lib/rumale/polynomial_model/factorization_machine_classifier.rb
 - lib/rumale/polynomial_model/factorization_machine_regressor.rb
 - lib/rumale/preprocessing/bin_discretizer.rb
+- lib/rumale/preprocessing/l1_normalizer.rb
 - lib/rumale/preprocessing/l2_normalizer.rb
 - lib/rumale/preprocessing/label_binarizer.rb
 - lib/rumale/preprocessing/label_encoder.rb