rumale 0.19.1 → 0.20.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47) hide show
  1. checksums.yaml +4 -4
  2. data/.rubocop.yml +5 -29
  3. data/CHANGELOG.md +28 -0
  4. data/lib/rumale.rb +7 -10
  5. data/lib/rumale/clustering/hdbscan.rb +3 -3
  6. data/lib/rumale/clustering/k_means.rb +1 -1
  7. data/lib/rumale/clustering/k_medoids.rb +1 -1
  8. data/lib/rumale/clustering/mini_batch_k_means.rb +2 -2
  9. data/lib/rumale/dataset.rb +4 -4
  10. data/lib/rumale/decomposition/nmf.rb +2 -2
  11. data/lib/rumale/ensemble/random_forest_classifier.rb +1 -1
  12. data/lib/rumale/ensemble/random_forest_regressor.rb +1 -1
  13. data/lib/rumale/feature_extraction/feature_hasher.rb +1 -1
  14. data/lib/rumale/feature_extraction/hash_vectorizer.rb +1 -1
  15. data/lib/rumale/feature_extraction/tfidf_transformer.rb +113 -0
  16. data/lib/rumale/kernel_approximation/nystroem.rb +1 -1
  17. data/lib/rumale/kernel_machine/kernel_svc.rb +1 -1
  18. data/lib/rumale/linear_model/base_sgd.rb +1 -1
  19. data/lib/rumale/manifold/tsne.rb +1 -1
  20. data/lib/rumale/model_selection/cross_validation.rb +3 -2
  21. data/lib/rumale/model_selection/group_k_fold.rb +93 -0
  22. data/lib/rumale/model_selection/group_shuffle_split.rb +115 -0
  23. data/lib/rumale/model_selection/k_fold.rb +1 -1
  24. data/lib/rumale/model_selection/shuffle_split.rb +5 -5
  25. data/lib/rumale/model_selection/stratified_k_fold.rb +1 -1
  26. data/lib/rumale/model_selection/stratified_shuffle_split.rb +13 -9
  27. data/lib/rumale/model_selection/time_series_split.rb +91 -0
  28. data/lib/rumale/multiclass/one_vs_rest_classifier.rb +2 -2
  29. data/lib/rumale/nearest_neighbors/vp_tree.rb +1 -1
  30. data/lib/rumale/neural_network/base_mlp.rb +1 -1
  31. data/lib/rumale/preprocessing/binarizer.rb +60 -0
  32. data/lib/rumale/preprocessing/l1_normalizer.rb +62 -0
  33. data/lib/rumale/preprocessing/l2_normalizer.rb +2 -1
  34. data/lib/rumale/preprocessing/max_normalizer.rb +62 -0
  35. data/lib/rumale/probabilistic_output.rb +1 -1
  36. data/lib/rumale/version.rb +1 -1
  37. metadata +9 -12
  38. data/lib/rumale/linear_model/base_linear_model.rb +0 -102
  39. data/lib/rumale/optimizer/ada_grad.rb +0 -42
  40. data/lib/rumale/optimizer/adam.rb +0 -56
  41. data/lib/rumale/optimizer/nadam.rb +0 -67
  42. data/lib/rumale/optimizer/rmsprop.rb +0 -50
  43. data/lib/rumale/optimizer/sgd.rb +0 -46
  44. data/lib/rumale/optimizer/yellow_fin.rb +0 -104
  45. data/lib/rumale/polynomial_model/base_factorization_machine.rb +0 -125
  46. data/lib/rumale/polynomial_model/factorization_machine_classifier.rb +0 -220
  47. data/lib/rumale/polynomial_model/factorization_machine_regressor.rb +0 -134
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: f49170105721cfebcae9f1a424e9a858650d78225541a8cb63b0ad4c70734988
4
- data.tar.gz: ecc35086328eee1066252e75b8cd638256039e93beebc0bce5714493fe72570b
3
+ metadata.gz: 5d8c93acbf38fbd07e5df224010abbdd4269a6ce3bbf8112a0eba652a606785d
4
+ data.tar.gz: e7cb00a802420854835c92f011425f3054bfcc1052bf7b3664da1f95834ef435
5
5
  SHA512:
6
- metadata.gz: 68f432bb34ff6c8e467a91d7c7e3aa07e816c2dd8807defc9e4e82e7a720c925062dbd27c8a7ec3294ecef2d71041baead2510edaf03a1eee210dc811eede22d
7
- data.tar.gz: 5854eacc12de6c3cdcdbab0f9b4e73fc64d1be0533732348da6b4d6dcb0be9f115e2415501b05148fd021fa844ac0c25adc1bb858432a02ca6fe19d30a3538c7
6
+ metadata.gz: f95fdd89b84dad02e516ee0479b1cddfb101cb96de897b6e7fa3fba546272a243cff5cfe954cb51942ec1ab23cf3028b183db86b52fab00a35d15be7eee5bf92
7
+ data.tar.gz: e5f6235e88dd47b9002a2154cabd2c1e64afb6cbb5b0745b411c7e5559351e925c9db8ec332724e301b83215662b3582e79a9e997f0338846514b234dabf1fc3
@@ -3,6 +3,7 @@ require:
3
3
  - rubocop-rspec
4
4
 
5
5
  AllCops:
6
+ NewCops: enable
6
7
  TargetRubyVersion: 2.5
7
8
  DisplayCopNames: true
8
9
  DisplayStyleGuide: true
@@ -15,25 +16,12 @@ AllCops:
15
16
  Style/Documentation:
16
17
  Enabled: false
17
18
 
18
- Style/HashEachMethods:
19
- Enabled: true
20
-
21
- Style/HashTransformKeys:
22
- Enabled: true
23
-
24
- Style/HashTransformValues:
25
- Enabled: true
26
-
27
- Lint/RaiseException:
28
- Enabled: true
29
-
30
- Lint/StructNewOverride:
31
- Enabled: true
32
-
33
19
  Layout/LineLength:
34
20
  Max: 145
35
21
  IgnoredPatterns: ['(\A|\s)#']
36
22
 
23
+ Lint/MissingSuper:
24
+ Enabled: false
37
25
 
38
26
  Metrics/ModuleLength:
39
27
  Max: 200
@@ -70,26 +58,14 @@ Naming/MethodParameterName:
70
58
  Naming/ConstantName:
71
59
  Enabled: false
72
60
 
73
- Style/ExponentialNotation:
74
- Enabled: true
75
-
76
61
  Style/FormatStringToken:
77
62
  Enabled: false
78
63
 
79
64
  Style/NumericLiterals:
80
65
  Enabled: false
81
66
 
82
- Style/SlicingWithRange:
83
- Enabled: true
84
-
85
- Layout/EmptyLineAfterGuardClause:
86
- Enabled: true
87
-
88
- Layout/EmptyLinesAroundAttributeAccessor:
89
- Enabled: true
90
-
91
- Layout/SpaceAroundMethodCallOperator:
92
- Enabled: true
67
+ Style/StringConcatenation:
68
+ Enabled: false
93
69
 
94
70
  RSpec/MultipleExpectations:
95
71
  Enabled: false
@@ -1,8 +1,36 @@
1
+ # 0.20.2
2
+ - Add cross-validator class for time-series data.
3
+ - [TimeSeriesSplit](https://yoshoku.github.io/rumale/doc/Rumale/ModelSelection/TimeSeriesSplit.html)
4
+
5
+ # 0.20.1
6
+ - Add cross-validator classes that split data according to group labels.
7
+ - [GroupKFold](https://yoshoku.github.io/rumale/doc/Rumale/ModelSelection/GroupKFold.html)
8
+ - [GroupShuffleSplit](https://yoshoku.github.io/rumale/doc/Rumale/ModelSelection/GroupShuffleSplit.html)
9
+ - Fix the handling of fractions in the number of samples on shuffle split cross-validator classes.
10
+ - [ShuffleSplit](https://yoshoku.github.io/rumale/doc/Rumale/ModelSelection/ShuffleSplit.html)
11
+ - [StratifiedShuffleSplit](https://yoshoku.github.io/rumale/doc/Rumale/ModelSelection/StratifiedShuffleSplit.html)
12
+ - Refactor some code with Rubocop.
13
+
14
+ # 0.20.0
15
+ ## Breaking changes
16
+ - Delete deprecated estimators such as PolynomialModel, Optimizer, and BaseLinearModel.
17
+
18
+ # 0.19.3
19
+ - Add preprocessing class for [Binarizer](https://yoshoku.github.io/rumale/doc/Rumale/Preprocessing/Binarizer.html)
20
+ - Add preprocessing class for [MaxNormalizer](https://yoshoku.github.io/rumale/doc/Rumale/Preprocessing/MaxNormalizer.html)
21
+ - Refactor some code with Rubocop.
22
+
23
+ # 0.19.2
24
+ - Fix L2Normalizer to avoid division by zero.
25
+ - Add preprocessing class for [L1Normalizer](https://yoshoku.github.io/rumale/doc/Rumale/Preprocessing/L1Normalizer.html).
26
+ - Add transformer class for [TfidfTransformer](https://yoshoku.github.io/rumale/doc/Rumale/FeatureExtraction/TfidfTransformer.html).
27
+
1
28
  # 0.19.1
2
29
  - Add cluster analysis class for [mini-batch K-Means](https://yoshoku.github.io/rumale/doc/Rumale/Clustering/MiniBatchKMeans.html).
3
30
  - Fix some typos.
4
31
 
5
32
  # 0.19.0
33
+ ## Breaking changes
6
34
  - Change mmh3 and mopti gem to non-runtime dependent library.
7
35
  - The mmh3 gem is used in [FeatureHasher](https://yoshoku.github.io/rumale/doc/Rumale/FeatureExtraction/FeatureHasher.html).
8
36
  You only need to require mmh3 gem when using FeatureHasher.
@@ -18,17 +18,10 @@ require 'rumale/base/cluster_analyzer'
18
18
  require 'rumale/base/transformer'
19
19
  require 'rumale/base/splitter'
20
20
  require 'rumale/base/evaluator'
21
- require 'rumale/optimizer/sgd'
22
- require 'rumale/optimizer/ada_grad'
23
- require 'rumale/optimizer/rmsprop'
24
- require 'rumale/optimizer/adam'
25
- require 'rumale/optimizer/nadam'
26
- require 'rumale/optimizer/yellow_fin'
27
21
  require 'rumale/pipeline/pipeline'
28
22
  require 'rumale/pipeline/feature_union'
29
23
  require 'rumale/kernel_approximation/rbf'
30
24
  require 'rumale/kernel_approximation/nystroem'
31
- require 'rumale/linear_model/base_linear_model'
32
25
  require 'rumale/linear_model/base_sgd'
33
26
  require 'rumale/linear_model/svc'
34
27
  require 'rumale/linear_model/svr'
@@ -41,9 +34,6 @@ require 'rumale/kernel_machine/kernel_svc'
41
34
  require 'rumale/kernel_machine/kernel_pca'
42
35
  require 'rumale/kernel_machine/kernel_fda'
43
36
  require 'rumale/kernel_machine/kernel_ridge'
44
- require 'rumale/polynomial_model/base_factorization_machine'
45
- require 'rumale/polynomial_model/factorization_machine_classifier'
46
- require 'rumale/polynomial_model/factorization_machine_regressor'
47
37
  require 'rumale/multiclass/one_vs_rest_classifier'
48
38
  require 'rumale/nearest_neighbors/vp_tree'
49
39
  require 'rumale/nearest_neighbors/k_neighbors_classifier'
@@ -93,7 +83,10 @@ require 'rumale/neural_network/mlp_regressor'
93
83
  require 'rumale/neural_network/mlp_classifier'
94
84
  require 'rumale/feature_extraction/hash_vectorizer'
95
85
  require 'rumale/feature_extraction/feature_hasher'
86
+ require 'rumale/feature_extraction/tfidf_transformer'
96
87
  require 'rumale/preprocessing/l2_normalizer'
88
+ require 'rumale/preprocessing/l1_normalizer'
89
+ require 'rumale/preprocessing/max_normalizer'
97
90
  require 'rumale/preprocessing/min_max_scaler'
98
91
  require 'rumale/preprocessing/max_abs_scaler'
99
92
  require 'rumale/preprocessing/standard_scaler'
@@ -102,11 +95,15 @@ require 'rumale/preprocessing/label_binarizer'
102
95
  require 'rumale/preprocessing/label_encoder'
103
96
  require 'rumale/preprocessing/one_hot_encoder'
104
97
  require 'rumale/preprocessing/ordinal_encoder'
98
+ require 'rumale/preprocessing/binarizer'
105
99
  require 'rumale/preprocessing/polynomial_features'
106
100
  require 'rumale/model_selection/k_fold'
101
+ require 'rumale/model_selection/group_k_fold'
107
102
  require 'rumale/model_selection/stratified_k_fold'
108
103
  require 'rumale/model_selection/shuffle_split'
104
+ require 'rumale/model_selection/group_shuffle_split'
109
105
  require 'rumale/model_selection/stratified_shuffle_split'
106
+ require 'rumale/model_selection/time_series_split'
110
107
  require 'rumale/model_selection/cross_validation'
111
108
  require 'rumale/model_selection/grid_search_cv'
112
109
  require 'rumale/model_selection/function'
@@ -136,7 +136,7 @@ module Rumale
136
136
  res
137
137
  end
138
138
 
139
- # rubocop:disable Metrics/AbcSize, Metrics/MethodLength, Metrics/PerceivedComplexity
139
+ # rubocop:disable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/MethodLength, Metrics/PerceivedComplexity
140
140
  def condense_tree(hierarchy, min_cluster_size)
141
141
  n_edges = hierarchy.size
142
142
  root = 2 * n_edges
@@ -232,7 +232,7 @@ module Rumale
232
232
  end
233
233
 
234
234
  def flatten(tree, stabilities)
235
- node_ids = stabilities.keys.sort { |a, b| b <=> a }.slice(0, stabilities.size - 1)
235
+ node_ids = stabilities.keys.sort.reverse.slice(0, stabilities.size - 1)
236
236
 
237
237
  cluster_tree = tree.select { |edge| edge.n_elements > 1 }
238
238
  is_cluster = node_ids.each_with_object({}) { |n_id, h| h[n_id] = true }
@@ -265,7 +265,7 @@ module Rumale
265
265
  end
266
266
  res
267
267
  end
268
- # rubocop:enable Metrics/AbcSize, Metrics/MethodLength, Metrics/PerceivedComplexity
268
+ # rubocop:enable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/MethodLength, Metrics/PerceivedComplexity
269
269
  end
270
270
  end
271
271
  end
@@ -103,7 +103,7 @@ module Rumale
103
103
  # random initialize
104
104
  n_samples = x.shape[0]
105
105
  sub_rng = @rng.dup
106
- rand_id = [*0...n_samples].sample(@params[:n_clusters], random: sub_rng)
106
+ rand_id = Array(0...n_samples).sample(@params[:n_clusters], random: sub_rng)
107
107
  @cluster_centers = x[rand_id, true].dup
108
108
  return unless @params[:init] == 'k-means++'
109
109
 
@@ -124,7 +124,7 @@ module Rumale
124
124
  # random initialize
125
125
  n_samples = distance_mat.shape[0]
126
126
  sub_rng = @rng.dup
127
- @medoid_ids = Numo::Int32.asarray([*0...n_samples].sample(@params[:n_clusters], random: sub_rng))
127
+ @medoid_ids = Numo::Int32.asarray(Array(0...n_samples).sample(@params[:n_clusters], random: sub_rng))
128
128
  return unless @params[:init] == 'k-means++'
129
129
 
130
130
  # k-means++ initialize
@@ -67,7 +67,7 @@ module Rumale
67
67
  init_cluster_centers(x, sub_rng)
68
68
  # optimization with mini-batch sgd.
69
69
  @params[:max_iter].times do |_t|
70
- sample_ids = [*0...n_samples].shuffle(random: sub_rng)
70
+ sample_ids = Array(0...n_samples).shuffle(random: sub_rng)
71
71
  old_centers = @cluster_centers.dup
72
72
  until (subset_ids = sample_ids.shift(@params[:batch_size])).empty?
73
73
  # sub sampling
@@ -120,7 +120,7 @@ module Rumale
120
120
  def init_cluster_centers(x, sub_rng)
121
121
  # random initialize
122
122
  n_samples = x.shape[0]
123
- rand_id = [*0...n_samples].sample(@params[:n_clusters], random: sub_rng)
123
+ rand_id = Array(0...n_samples).sample(@params[:n_clusters], random: sub_rng)
124
124
  @cluster_centers = x[rand_id, true].dup
125
125
  return unless @params[:init] == 'k-means++'
126
126
 
@@ -81,7 +81,7 @@ module Rumale
81
81
  y = Numo::Int32.hstack([Numo::Int32.zeros(n_samples_out), Numo::Int32.ones(n_samples_in)])
82
82
  # shuffle data indices.
83
83
  if shuffle
84
- rand_ids = [*0...n_samples].shuffle(random: rng.dup)
84
+ rand_ids = Array(0...n_samples).shuffle(random: rng.dup)
85
85
  x = x[rand_ids, true].dup
86
86
  y = y[rand_ids].dup
87
87
  end
@@ -118,7 +118,7 @@ module Rumale
118
118
  y = Numo::Int32.hstack([Numo::Int32.zeros(n_samples_out), Numo::Int32.ones(n_samples_in)])
119
119
  # shuffle data indices.
120
120
  if shuffle
121
- rand_ids = [*0...n_samples].shuffle(random: rng.dup)
121
+ rand_ids = Array(0...n_samples).shuffle(random: rng.dup)
122
122
  x = x[rand_ids, true].dup
123
123
  y = y[rand_ids].dup
124
124
  end
@@ -173,7 +173,7 @@ module Rumale
173
173
  end
174
174
  # shuffle data.
175
175
  if shuffle
176
- rand_ids = [*0...n_samples].shuffle(random: rng.dup)
176
+ rand_ids = Array(0...n_samples).shuffle(random: rng.dup)
177
177
  x = x[rand_ids, true].dup
178
178
  y = y[rand_ids].dup
179
179
  end
@@ -225,7 +225,7 @@ module Rumale
225
225
  line = dump_label(label, label_type.to_s)
226
226
  ftvec.to_a.each_with_index do |val, n|
227
227
  idx = n + (zero_based == false ? 1 : 0)
228
- line += format(" %d:#{value_type}", idx, val) if val != 0.0
228
+ line += format(" %d:#{value_type}", idx, val) if val != 0
229
229
  end
230
230
  line
231
231
  end
@@ -77,7 +77,7 @@ module Rumale
77
77
  # @return [Numo::DFloat] (shape: [n_samples, n_components]) The transformed data.
78
78
  def transform(x)
79
79
  x = check_convert_sample_array(x)
80
- partial_fit(x, false)
80
+ partial_fit(x, update_comps: false)
81
81
  end
82
82
 
83
83
  # Inverse transform the given transformed data with the learned model.
@@ -91,7 +91,7 @@ module Rumale
91
91
 
92
92
  private
93
93
 
94
- def partial_fit(x, update_comps = true)
94
+ def partial_fit(x, update_comps: true)
95
95
  # initialize some variables.
96
96
  n_samples, n_features = x.shape
97
97
  scale = Math.sqrt(x.mean / @params[:n_components])
@@ -85,7 +85,7 @@ module Rumale
85
85
  # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for fitting the model.
86
86
  # @param y [Numo::Int32] (shape: [n_samples]) The labels to be used for fitting the model.
87
87
  # @return [RandomForestClassifier] The learned classifier itself.
88
- def fit(x, y)
88
+ def fit(x, y) # rubocop:disable Metrics/AbcSize
89
89
  x = check_convert_sample_array(x)
90
90
  y = check_convert_label_array(y)
91
91
  check_sample_label_size(x, y)
@@ -79,7 +79,7 @@ module Rumale
79
79
  # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for fitting the model.
80
80
  # @param y [Numo::DFloat] (shape: [n_samples, n_outputs]) The target values to be used for fitting the model.
81
81
  # @return [RandomForestRegressor] The learned regressor itself.
82
- def fit(x, y)
82
+ def fit(x, y) # rubocop:disable Metrics/AbcSize
83
83
  x = check_convert_sample_array(x)
84
84
  y = check_convert_tvalue_array(y)
85
85
  check_sample_tvalue_size(x, y)
@@ -67,7 +67,7 @@ module Rumale
67
67
  def transform(x)
68
68
  raise 'FeatureHasher#transform requires Mmh3 but that is not loaded.' unless enable_mmh3?
69
69
 
70
- x = [x] unless x.is_a?(Array)
70
+ x = [x] unless x.is_a?(Array) # rubocop:disable Style/ArrayCoercion
71
71
  n_samples = x.size
72
72
 
73
73
  z = Numo::DFloat.zeros(n_samples, n_features)
@@ -99,7 +99,7 @@ module Rumale
99
99
  # @param x [Array<Hash>] (shape: [n_samples]) The array of hash consisting of feature names and values.
100
100
  # @return [Numo::DFloat] (shape: [n_samples, n_features]) The encoded sample array.
101
101
  def transform(x)
102
- x = [x] unless x.is_a?(Array)
102
+ x = [x] unless x.is_a?(Array) # rubocop:disable Style/ArrayCoercion
103
103
  n_samples = x.size
104
104
  n_features = @vocabulary.size
105
105
  z = Numo::DFloat.zeros(n_samples, n_features)
@@ -0,0 +1,113 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'rumale/base/base_estimator'
4
+ require 'rumale/base/transformer'
5
+ require 'rumale/preprocessing/l1_normalizer'
6
+ require 'rumale/preprocessing/l2_normalizer'
7
+
8
+ module Rumale
9
+ module FeatureExtraction
10
+ # Transform sample matrix with term frequency (tf) to a normalized tf-idf (inverse document frequency) representation.
11
+ #
12
+ # @example
13
+ # encoder = Rumale::FeatureExtraction::HashVectorizer.new
14
+ # x = encoder.fit_transform([
15
+ # { foo: 1, bar: 2 },
16
+ # { foo: 3, baz: 1 }
17
+ # ])
18
+ #
19
+ # # > pp x
20
+ # # Numo::DFloat#shape=[2,3]
21
+ # # [[2, 0, 1],
22
+ # # [0, 1, 3]]
23
+ #
24
+ # transformer = Rumale::FeatureExtraction::TfidfTransformer.new
25
+ # x_tfidf = transformer.fit_transform(x)
26
+ #
27
+ # # > pp x_tfidf
28
+ # # Numo::DFloat#shape=[2,3]
29
+ # # [[0.959056, 0, 0.283217],
30
+ # # [0, 0.491506, 0.870874]]
31
+ #
32
+ # *Reference*
33
+ # - Manning, C D., Raghavan, P., and Schutze, H., "Introduction to Information Retrieval," Cambridge University Press., 2008.
34
+ class TfidfTransformer
35
+ include Base::BaseEstimator
36
+ include Base::Transformer
37
+
38
+ # Return the vector consisting of inverse document frequency values.
39
+ # @return [Numo::DFloat] (shape: [n_features])
40
+ attr_reader :idf
41
+
42
+ # Create a new transformer for converting tf vectors to tf-idf vectors.
43
+ #
44
+ # @param norm [String] The normalization method to be used ('l1', 'l2' and 'none').
45
+ # @param use_idf [Boolean] The flag indicating whether to use inverse document frequency weighting.
46
+ # @param smooth_idf [Boolean] The flag indicating whether to apply idf smoothing by log((n_samples + 1) / (df + 1)) + 1.
47
+ # @param sublinear_tf [Boolean] The flag indicating whether to perform sublinear tf scaling by 1 + log(tf).
48
+ def initialize(norm: 'l2', use_idf: true, smooth_idf: false, sublinear_tf: false)
49
+ check_params_string(norm: norm)
50
+ check_params_boolean(use_idf: use_idf, smooth_idf: smooth_idf, sublinear_tf: sublinear_tf)
51
+ @params = {}
52
+ @params[:norm] = norm
53
+ @params[:use_idf] = use_idf
54
+ @params[:smooth_idf] = smooth_idf
55
+ @params[:sublinear_tf] = sublinear_tf
56
+ @idf = nil
57
+ end
58
+
59
+ # Calculate the inverse document frequency for weighting.
60
+ #
61
+ # @overload fit(x) -> TfidfTransformer
62
+ #
63
+ # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to calculate the idf values.
64
+ # @return [TfidfTransformer]
65
+ def fit(x, _y = nil)
66
+ return self unless @params[:use_idf]
67
+
68
+ x = check_convert_sample_array(x)
69
+
70
+ n_samples = x.shape[0]
71
+ df = x.class.cast(x.gt(0.0).count(0))
72
+
73
+ if @params[:smooth_idf]
74
+ df += 1
75
+ n_samples += 1
76
+ end
77
+
78
+ @idf = Numo::NMath.log(n_samples / df) + 1
79
+
80
+ self
81
+ end
82
+
83
+ # Calculate the idf values, and then transform samples to the tf-idf representation.
84
+ #
85
+ # @overload fit_transform(x) -> Numo::DFloat
86
+ #
87
+ # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to calculate idf and be transformed to tf-idf representation.
88
+ # @return [Numo::DFloat] The transformed samples.
89
+ def fit_transform(x, _y = nil)
90
+ fit(x).transform(x)
91
+ end
92
+
93
+ # Perform transforming the given samples to the tf-idf representation.
94
+ #
95
+ # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to be transformed.
96
+ # @return [Numo::DFloat] The transformed samples.
97
+ def transform(x)
98
+ x = check_convert_sample_array(x)
99
+ z = x.dup
100
+
101
+ z[z.ne(0)] = Numo::NMath.log(z[z.ne(0)]) + 1 if @params[:sublinear_tf]
102
+ z *= @idf if @params[:use_idf]
103
+ case @params[:norm]
104
+ when 'l2'
105
+ z = Rumale::Preprocessing::L2Normalizer.new.fit_transform(z)
106
+ when 'l1'
107
+ z = Rumale::Preprocessing::L1Normalizer.new.fit_transform(z)
108
+ end
109
+ z
110
+ end
111
+ end
112
+ end
113
+ end
@@ -69,7 +69,7 @@ module Rumale
69
69
  n_components = [1, [@params[:n_components], n_samples].min].max
70
70
 
71
71
  # random sampling.
72
- @component_indices = Numo::Int32.cast([*0...n_samples].shuffle(random: sub_rng)[0...n_components])
72
+ @component_indices = Numo::Int32.cast(Array(0...n_samples).shuffle(random: sub_rng)[0...n_components])
73
73
  @components = x[@component_indices, true]
74
74
 
75
75
  # calculate normalizing factor.