rumale 0.19.1 → 0.19.2

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: f49170105721cfebcae9f1a424e9a858650d78225541a8cb63b0ad4c70734988
4
- data.tar.gz: ecc35086328eee1066252e75b8cd638256039e93beebc0bce5714493fe72570b
3
+ metadata.gz: 1bff2e1e6182aa954be00ed107ed1bd81220298f89514b4b31304f8890ff27c4
4
+ data.tar.gz: '09b185f468baf9dbec6280fa6c06984c95919308f1d2247277bf30348ed392bc'
5
5
  SHA512:
6
- metadata.gz: 68f432bb34ff6c8e467a91d7c7e3aa07e816c2dd8807defc9e4e82e7a720c925062dbd27c8a7ec3294ecef2d71041baead2510edaf03a1eee210dc811eede22d
7
- data.tar.gz: 5854eacc12de6c3cdcdbab0f9b4e73fc64d1be0533732348da6b4d6dcb0be9f115e2415501b05148fd021fa844ac0c25adc1bb858432a02ca6fe19d30a3538c7
6
+ metadata.gz: 6d8f1fcaffcd6714c6156fc615d87e6b6950e82ab40fc7434cfc5a014d6c08eb0170ee7c45d8fed978c2a52f839b1ce647fd6e088cbab2ea45e517b34c88407a
7
+ data.tar.gz: b255ae4c24cdc91ebad59f79ee5a58c5d2a5ffa79bda0ac221e3a33bd824d2fd94e5cd83f3a06e54a2dc537a074276cea5a71651deeee2a304d23e963ff92c9d
@@ -1,3 +1,8 @@
1
+ # 0.19.2
2
+ - Fix L2Normalizer to avoid zero divide.
3
+ - Add preprocessing class for [L1Normalizer](https://yoshoku.github.io/rumale/doc/Rumale/Preprocessing/L1Normalizer.html).
4
+ - Add transformer class for [TfidfTransformer](https://yoshoku.github.io/rumale/doc/Rumale/FeatureExtraction/TfidfTransformer.html).
5
+
1
6
  # 0.19.1
2
7
  - Add cluster analysis class for [mini-batch K-Means](https://yoshoku.github.io/rumale/doc/Rumale/Clustering/MiniBatchKMeans.html).
3
8
  - Fix some typos.
@@ -93,7 +93,9 @@ require 'rumale/neural_network/mlp_regressor'
93
93
  require 'rumale/neural_network/mlp_classifier'
94
94
  require 'rumale/feature_extraction/hash_vectorizer'
95
95
  require 'rumale/feature_extraction/feature_hasher'
96
+ require 'rumale/feature_extraction/tfidf_transformer'
96
97
  require 'rumale/preprocessing/l2_normalizer'
98
+ require 'rumale/preprocessing/l1_normalizer'
97
99
  require 'rumale/preprocessing/min_max_scaler'
98
100
  require 'rumale/preprocessing/max_abs_scaler'
99
101
  require 'rumale/preprocessing/standard_scaler'
@@ -0,0 +1,113 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'rumale/base/base_estimator'
4
+ require 'rumale/base/transformer'
5
+ require 'rumale/preprocessing/l1_normalizer'
6
+ require 'rumale/preprocessing/l2_normalizer'
7
+
8
+ module Rumale
9
+ module FeatureExtraction
10
+ # Transform sample matrix with term frequency (tf) to a normalized tf-idf (inverse document frequency) representation.
11
+ #
12
+ # @example
13
+ # encoder = Rumale::FeatureExtraction::HashVectorizer.new
14
+ # x = encoder.fit_transform([
15
+ # { foo: 1, bar: 2 },
16
+ # { foo: 3, baz: 1 }
17
+ # ])
18
+ #
19
+ # # > pp x
20
+ # # Numo::DFloat#shape=[2,3]
21
+ # # [[2, 0, 1],
22
+ # # [0, 1, 3]]
23
+ #
24
+ # transformer = Rumale::FeatureExtraction::TfidfTransformer.new
25
+ # x_tfidf = transformer.fit_transform(x)
26
+ #
27
+ # # > pp x_tfidf
28
+ # # Numo::DFloat#shape=[2,3]
29
+ # # [[0.959056, 0, 0.283217],
30
+ # # [0, 0.491506, 0.870874]]
31
+ #
32
+ # *Reference*
33
+ # - Manning, C D., Raghavan, P., and Schutze, H., "Introduction to Information Retrieval," Cambridge University Press., 2008.
34
+ class TfidfTransformer
35
+ include Base::BaseEstimator
36
+ include Base::Transformer
37
+
38
+ # Return the vector consisting of the inverse document frequency of each feature.
39
+ # @return [Numo::DFloat] (shape: [n_features])
40
+ attr_reader :idf
41
+
42
+ # Create a new transformer for converting tf vectors to tf-idf vectors.
43
+ #
44
+ # @param norm [String] The normalization method to be used ('l1', 'l2' and 'none').
45
+ # @param use_idf [Boolean] The flag indicating whether to use inverse document frequency weighting.
46
+ # @param smooth_idf [Boolean] The flag indicating whether to apply idf smoothing by log((n_samples + 1) / (df + 1)) + 1.
47
+ # @param sublinear_tf [Boolean] The flag indicating whether to perform sublinear tf scaling by 1 + log(tf).
48
+ def initialize(norm: 'l2', use_idf: true, smooth_idf: false, sublinear_tf: false)
49
+ check_params_string(norm: norm)
50
+ check_params_boolean(use_idf: use_idf, smooth_idf: smooth_idf, sublinear_tf: sublinear_tf)
51
+ @params = {}
52
+ @params[:norm] = norm
53
+ @params[:use_idf] = use_idf
54
+ @params[:smooth_idf] = smooth_idf
55
+ @params[:sublinear_tf] = sublinear_tf
56
+ @idf = nil
57
+ end
58
+
59
+ # Calculate the inverse document frequency for weighting.
60
+ #
61
+ # @overload fit(x) -> TfidfTransformer
62
+ #
63
+ # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to calculate the idf values.
64
+ # @return [TfidfTransformer]
65
+ def fit(x, _y = nil)
66
+ return self unless @params[:use_idf]
67
+
68
+ x = check_convert_sample_array(x)
69
+
70
+ n_samples = x.shape[0]
71
+ df = x.class.cast(x.gt(0.0).count(0))
72
+
73
+ if @params[:smooth_idf]
74
+ df += 1
75
+ n_samples += 1
76
+ end
77
+
78
+ @idf = Numo::NMath.log(n_samples / df) + 1
79
+
80
+ self
81
+ end
82
+
83
+ # Calculate the idf values, and then transform samples to the tf-idf representation.
84
+ #
85
+ # @overload fit_transform(x) -> Numo::DFloat
86
+ #
87
+ # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to calculate idf and be transformed to tf-idf representation.
88
+ # @return [Numo::DFloat] The transformed samples.
89
+ def fit_transform(x, _y = nil)
90
+ fit(x).transform(x)
91
+ end
92
+
93
+ # Transform the given samples to the tf-idf representation.
94
+ #
95
+ # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to be transformed.
96
+ # @return [Numo::DFloat] The transformed samples.
97
+ def transform(x)
98
+ x = check_convert_sample_array(x)
99
+ z = x.dup
100
+
101
+ z[z.ne(0)] = Numo::NMath.log(z[z.ne(0)]) + 1 if @params[:sublinear_tf]
102
+ z *= @idf if @params[:use_idf]
103
+ case @params[:norm]
104
+ when 'l2'
105
+ z = Rumale::Preprocessing::L2Normalizer.new.fit_transform(z)
106
+ when 'l1'
107
+ z = Rumale::Preprocessing::L1Normalizer.new.fit_transform(z)
108
+ end
109
+ z
110
+ end
111
+ end
112
+ end
113
+ end
@@ -0,0 +1,62 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'rumale/base/base_estimator'
4
+ require 'rumale/base/transformer'
5
+
6
+ module Rumale
7
+ module Preprocessing
8
+ # Normalize samples to unit L1-norm.
9
+ #
10
+ # @example
11
+ # normalizer = Rumale::Preprocessing::L1Normalizer.new
12
+ # new_samples = normalizer.fit_transform(samples)
13
+ class L1Normalizer
14
+ include Base::BaseEstimator
15
+ include Base::Transformer
16
+
17
+ # Return the vector consisting of the L1-norm of each sample.
18
+ # @return [Numo::DFloat] (shape: [n_samples])
19
+ attr_reader :norm_vec # :nodoc:
20
+
21
+ # Create a new normalizer for normalizing to unit L1-norm.
22
+ def initialize
23
+ @params = {}
24
+ @norm_vec = nil
25
+ end
26
+
27
+ # Calculate L1-norms of each sample.
28
+ #
29
+ # @overload fit(x) -> L1Normalizer
30
+ #
31
+ # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to calculate L1-norms.
32
+ # @return [L1Normalizer]
33
+ def fit(x, _y = nil)
34
+ x = check_convert_sample_array(x)
35
+ @norm_vec = x.abs.sum(1)
36
+ @norm_vec[@norm_vec.eq(0)] = 1
37
+ self
38
+ end
39
+
40
+ # Calculate L1-norms of each sample, and then normalize samples to unit L1-norm.
41
+ #
42
+ # @overload fit_transform(x) -> Numo::DFloat
43
+ #
44
+ # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to calculate L1-norms.
45
+ # @return [Numo::DFloat] The normalized samples.
46
+ def fit_transform(x, _y = nil)
47
+ x = check_convert_sample_array(x)
48
+ fit(x)
49
+ x / @norm_vec.expand_dims(1)
50
+ end
51
+
52
+ # Calculate L1-norms of each sample, and then normalize samples to unit L1-norm.
53
+ # This method calls the fit_transform method. This method exists for the Pipeline class.
54
+ #
55
+ # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to calculate L1-norms.
56
+ # @return [Numo::DFloat] The normalized samples.
57
+ def transform(x)
58
+ fit_transform(x)
59
+ end
60
+ end
61
+ end
62
+ end
@@ -34,6 +34,7 @@ module Rumale
34
34
  def fit(x, _y = nil)
35
35
  x = check_convert_sample_array(x)
36
36
  @norm_vec = Numo::NMath.sqrt((x**2).sum(1))
37
+ @norm_vec[@norm_vec.eq(0)] = 1
37
38
  self
38
39
  end
39
40
 
@@ -46,7 +47,7 @@ module Rumale
46
47
  def fit_transform(x, _y = nil)
47
48
  x = check_convert_sample_array(x)
48
49
  fit(x)
49
- x / @norm_vec.tile(x.shape[1], 1).transpose
50
+ x / @norm_vec.expand_dims(1)
50
51
  end
51
52
 
52
53
  # Calculate L2-norms of each sample, and then normalize samples to unit L2-norm.
@@ -3,5 +3,5 @@
3
3
  # Rumale is a machine learning library in Ruby.
4
4
  module Rumale
5
5
  # The version of Rumale you are using.
6
- VERSION = '0.19.1'
6
+ VERSION = '0.19.2'
7
7
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: rumale
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.19.1
4
+ version: 0.19.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - yoshoku
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2020-06-06 00:00:00.000000000 Z
11
+ date: 2020-06-20 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: numo-narray
@@ -113,6 +113,7 @@ files:
113
113
  - lib/rumale/evaluation_measure/silhouette_score.rb
114
114
  - lib/rumale/feature_extraction/feature_hasher.rb
115
115
  - lib/rumale/feature_extraction/hash_vectorizer.rb
116
+ - lib/rumale/feature_extraction/tfidf_transformer.rb
116
117
  - lib/rumale/kernel_approximation/nystroem.rb
117
118
  - lib/rumale/kernel_approximation/rbf.rb
118
119
  - lib/rumale/kernel_machine/kernel_fda.rb
@@ -166,6 +167,7 @@ files:
166
167
  - lib/rumale/polynomial_model/factorization_machine_classifier.rb
167
168
  - lib/rumale/polynomial_model/factorization_machine_regressor.rb
168
169
  - lib/rumale/preprocessing/bin_discretizer.rb
170
+ - lib/rumale/preprocessing/l1_normalizer.rb
169
171
  - lib/rumale/preprocessing/l2_normalizer.rb
170
172
  - lib/rumale/preprocessing/label_binarizer.rb
171
173
  - lib/rumale/preprocessing/label_encoder.rb