rumale 0.19.1 → 0.19.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: f49170105721cfebcae9f1a424e9a858650d78225541a8cb63b0ad4c70734988
4
- data.tar.gz: ecc35086328eee1066252e75b8cd638256039e93beebc0bce5714493fe72570b
3
+ metadata.gz: 1bff2e1e6182aa954be00ed107ed1bd81220298f89514b4b31304f8890ff27c4
4
+ data.tar.gz: '09b185f468baf9dbec6280fa6c06984c95919308f1d2247277bf30348ed392bc'
5
5
  SHA512:
6
- metadata.gz: 68f432bb34ff6c8e467a91d7c7e3aa07e816c2dd8807defc9e4e82e7a720c925062dbd27c8a7ec3294ecef2d71041baead2510edaf03a1eee210dc811eede22d
7
- data.tar.gz: 5854eacc12de6c3cdcdbab0f9b4e73fc64d1be0533732348da6b4d6dcb0be9f115e2415501b05148fd021fa844ac0c25adc1bb858432a02ca6fe19d30a3538c7
6
+ metadata.gz: 6d8f1fcaffcd6714c6156fc615d87e6b6950e82ab40fc7434cfc5a014d6c08eb0170ee7c45d8fed978c2a52f839b1ce647fd6e088cbab2ea45e517b34c88407a
7
+ data.tar.gz: b255ae4c24cdc91ebad59f79ee5a58c5d2a5ffa79bda0ac221e3a33bd824d2fd94e5cd83f3a06e54a2dc537a074276cea5a71651deeee2a304d23e963ff92c9d
@@ -1,3 +1,8 @@
1
+ # 0.19.2
2
+ - Fix L2Normalizer to avoid zero divide.
3
+ - Add preprocessing class for [L1Normalizer](https://yoshoku.github.io/rumale/doc/Rumale/Preprocessing/L1Normalizer.html).
4
+ - Add transformer class for [TfidfTransformer](https://yoshoku.github.io/rumale/doc/Rumale/FeatureExtraction/TfidfTransformer.html).
5
+
1
6
  # 0.19.1
2
7
  - Add cluster analysis class for [mini-batch K-Means](https://yoshoku.github.io/rumale/doc/Rumale/Clustering/MiniBatchKMeans.html).
3
8
  - Fix some typos.
@@ -93,7 +93,9 @@ require 'rumale/neural_network/mlp_regressor'
93
93
  require 'rumale/neural_network/mlp_classifier'
94
94
  require 'rumale/feature_extraction/hash_vectorizer'
95
95
  require 'rumale/feature_extraction/feature_hasher'
96
+ require 'rumale/feature_extraction/tfidf_transformer'
96
97
  require 'rumale/preprocessing/l2_normalizer'
98
+ require 'rumale/preprocessing/l1_normalizer'
97
99
  require 'rumale/preprocessing/min_max_scaler'
98
100
  require 'rumale/preprocessing/max_abs_scaler'
99
101
  require 'rumale/preprocessing/standard_scaler'
@@ -0,0 +1,113 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'rumale/base/base_estimator'
4
+ require 'rumale/base/transformer'
5
+ require 'rumale/preprocessing/l1_normalizer'
6
+ require 'rumale/preprocessing/l2_normalizer'
7
+
8
+ module Rumale
9
+ module FeatureExtraction
10
+ # Transform sample matrix with term frequency (tf) to a normalized tf-idf (inverse document frequency) representation.
11
+ #
12
+ # @example
13
+ # encoder = Rumale::FeatureExtraction::HashVectorizer.new
14
+ # x = encoder.fit_transform([
15
+ # { foo: 1, bar: 2 },
16
+ # { foo: 3, baz: 1 }
17
+ # ])
18
+ #
19
+ # # > pp x
20
+ # # Numo::DFloat#shape=[2,3]
21
+ # # [[2, 0, 1],
22
+ # # [0, 1, 3]]
23
+ #
24
+ # transformer = Rumale::FeatureExtraction::TfidfTransformer.new
25
+ # x_tfidf = transformer.fit_transform(x)
26
+ #
27
+ # # > pp x_tfidf
28
+ # # Numo::DFloat#shape=[2,3]
29
+ # # [[0.959056, 0, 0.283217],
30
+ # # [0, 0.491506, 0.870874]]
31
+ #
32
+ # *Reference*
33
+ # - Manning, C D., Raghavan, P., and Schutze, H., "Introduction to Information Retrieval," Cambridge University Press., 2008.
34
+ class TfidfTransformer
35
+ include Base::BaseEstimator
36
+ include Base::Transformer
37
+
38
+ # Return the vector consisting of the inverse document frequency of each feature.
39
+ # @return [Numo::DFloat] (shape: [n_features])
40
+ attr_reader :idf
41
+
42
+ # Create a new transformer for converting tf vectors to tf-idf vectors.
43
+ #
44
+ # @param norm [String] The normalization method to be used ('l1', 'l2' and 'none').
45
+ # @param use_idf [Boolean] The flag indicating whether to use inverse document frequency weighting.
46
+ # @param smooth_idf [Boolean] The flag indicating whether to apply idf smoothing by log((n_samples + 1) / (df + 1)) + 1.
47
+ # @param sublinear_tf [Boolean] The flag indicating whether to perform sublinear tf scaling by 1 + log(tf).
48
+ def initialize(norm: 'l2', use_idf: true, smooth_idf: false, sublinear_tf: false)
49
+ check_params_string(norm: norm)
50
+ check_params_boolean(use_idf: use_idf, smooth_idf: smooth_idf, sublinear_tf: sublinear_tf)
51
+ @params = {}
52
+ @params[:norm] = norm
53
+ @params[:use_idf] = use_idf
54
+ @params[:smooth_idf] = smooth_idf
55
+ @params[:sublinear_tf] = sublinear_tf
56
+ @idf = nil
57
+ end
58
+
59
+ # Calculate the inverse document frequency for weighting.
60
+ #
61
+ # @overload fit(x) -> TfidfTransformer
62
+ #
63
+ # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to calculate the idf values.
64
+ # @return [TfidfTransformer]
65
+ def fit(x, _y = nil)
66
+ return self unless @params[:use_idf]
67
+
68
+ x = check_convert_sample_array(x)
69
+
70
+ n_samples = x.shape[0]
71
+ df = x.class.cast(x.gt(0.0).count(0))
72
+
73
+ if @params[:smooth_idf]
74
+ df += 1
75
+ n_samples += 1
76
+ end
77
+
78
+ @idf = Numo::NMath.log(n_samples / df) + 1
79
+
80
+ self
81
+ end
82
+
83
+ # Calculate the idf values, and then transform samples to the tf-idf representation.
84
+ #
85
+ # @overload fit_transform(x) -> Numo::DFloat
86
+ #
87
+ # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to calculate idf and be transformed to tf-idf representation.
88
+ # @return [Numo::DFloat] The transformed samples.
89
+ def fit_transform(x, _y = nil)
90
+ fit(x).transform(x)
91
+ end
92
+
93
+ # Perform transforming the given samples to the tf-idf representation.
94
+ #
95
+ # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to be transformed.
96
+ # @return [Numo::DFloat] The transformed samples.
97
+ def transform(x)
98
+ x = check_convert_sample_array(x)
99
+ z = x.dup
100
+
101
+ z[z.ne(0)] = Numo::NMath.log(z[z.ne(0)]) + 1 if @params[:sublinear_tf]
102
+ z *= @idf if @params[:use_idf]
103
+ case @params[:norm]
104
+ when 'l2'
105
+ z = Rumale::Preprocessing::L2Normalizer.new.fit_transform(z)
106
+ when 'l1'
107
+ z = Rumale::Preprocessing::L1Normalizer.new.fit_transform(z)
108
+ end
109
+ z
110
+ end
111
+ end
112
+ end
113
+ end
@@ -0,0 +1,62 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'rumale/base/base_estimator'
4
+ require 'rumale/base/transformer'
5
+
6
+ module Rumale
7
+ module Preprocessing
8
+ # Normalize samples to unit L1-norm.
9
+ #
10
+ # @example
11
+ # normalizer = Rumale::Preprocessing::L1Normalizer.new
12
+ # new_samples = normalizer.fit_transform(samples)
13
+ class L1Normalizer
14
+ include Base::BaseEstimator
15
+ include Base::Transformer
16
+
17
+ # Return the vector consisting of the L1-norm of each sample.
18
+ # @return [Numo::DFloat] (shape: [n_samples])
19
+ attr_reader :norm_vec # :nodoc:
20
+
21
+ # Create a new normalizer for normalizing to L1-norm.
22
+ def initialize
23
+ @params = {}
24
+ @norm_vec = nil
25
+ end
26
+
27
+ # Calculate L1-norms of each sample.
28
+ #
29
+ # @overload fit(x) -> L1Normalizer
30
+ #
31
+ # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to calculate L1-norms.
32
+ # @return [L1Normalizer]
33
+ def fit(x, _y = nil)
34
+ x = check_convert_sample_array(x)
35
+ @norm_vec = x.abs.sum(1)
36
+ @norm_vec[@norm_vec.eq(0)] = 1
37
+ self
38
+ end
39
+
40
+ # Calculate L1-norms of each sample, and then normalize samples to L1-norm.
41
+ #
42
+ # @overload fit_transform(x) -> Numo::DFloat
43
+ #
44
+ # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to calculate L1-norms.
45
+ # @return [Numo::DFloat] The normalized samples.
46
+ def fit_transform(x, _y = nil)
47
+ x = check_convert_sample_array(x)
48
+ fit(x)
49
+ x / @norm_vec.expand_dims(1)
50
+ end
51
+
52
+ # Calculate L1-norms of each sample, and then normalize samples to L1-norm.
53
+ # This method calls the fit_transform method. This method exists for the Pipeline class.
54
+ #
55
+ # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to calculate L1-norms.
56
+ # @return [Numo::DFloat] The normalized samples.
57
+ def transform(x)
58
+ fit_transform(x)
59
+ end
60
+ end
61
+ end
62
+ end
@@ -34,6 +34,7 @@ module Rumale
34
34
  def fit(x, _y = nil)
35
35
  x = check_convert_sample_array(x)
36
36
  @norm_vec = Numo::NMath.sqrt((x**2).sum(1))
37
+ @norm_vec[@norm_vec.eq(0)] = 1
37
38
  self
38
39
  end
39
40
 
@@ -46,7 +47,7 @@ module Rumale
46
47
  def fit_transform(x, _y = nil)
47
48
  x = check_convert_sample_array(x)
48
49
  fit(x)
49
- x / @norm_vec.tile(x.shape[1], 1).transpose
50
+ x / @norm_vec.expand_dims(1)
50
51
  end
51
52
 
52
53
  # Calculate L2-norms of each sample, and then normalize samples to unit L2-norm.
@@ -3,5 +3,5 @@
3
3
  # Rumale is a machine learning library in Ruby.
4
4
  module Rumale
5
5
  # The version of Rumale you are using.
6
- VERSION = '0.19.1'
6
+ VERSION = '0.19.2'
7
7
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: rumale
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.19.1
4
+ version: 0.19.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - yoshoku
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2020-06-06 00:00:00.000000000 Z
11
+ date: 2020-06-20 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: numo-narray
@@ -113,6 +113,7 @@ files:
113
113
  - lib/rumale/evaluation_measure/silhouette_score.rb
114
114
  - lib/rumale/feature_extraction/feature_hasher.rb
115
115
  - lib/rumale/feature_extraction/hash_vectorizer.rb
116
+ - lib/rumale/feature_extraction/tfidf_transformer.rb
116
117
  - lib/rumale/kernel_approximation/nystroem.rb
117
118
  - lib/rumale/kernel_approximation/rbf.rb
118
119
  - lib/rumale/kernel_machine/kernel_fda.rb
@@ -166,6 +167,7 @@ files:
166
167
  - lib/rumale/polynomial_model/factorization_machine_classifier.rb
167
168
  - lib/rumale/polynomial_model/factorization_machine_regressor.rb
168
169
  - lib/rumale/preprocessing/bin_discretizer.rb
170
+ - lib/rumale/preprocessing/l1_normalizer.rb
169
171
  - lib/rumale/preprocessing/l2_normalizer.rb
170
172
  - lib/rumale/preprocessing/label_binarizer.rb
171
173
  - lib/rumale/preprocessing/label_encoder.rb