rumale 0.19.1 → 0.20.2

Sign up to get free protection for your applications and to get access to all the features.
Files changed (47) hide show
  1. checksums.yaml +4 -4
  2. data/.rubocop.yml +5 -29
  3. data/CHANGELOG.md +28 -0
  4. data/lib/rumale.rb +7 -10
  5. data/lib/rumale/clustering/hdbscan.rb +3 -3
  6. data/lib/rumale/clustering/k_means.rb +1 -1
  7. data/lib/rumale/clustering/k_medoids.rb +1 -1
  8. data/lib/rumale/clustering/mini_batch_k_means.rb +2 -2
  9. data/lib/rumale/dataset.rb +4 -4
  10. data/lib/rumale/decomposition/nmf.rb +2 -2
  11. data/lib/rumale/ensemble/random_forest_classifier.rb +1 -1
  12. data/lib/rumale/ensemble/random_forest_regressor.rb +1 -1
  13. data/lib/rumale/feature_extraction/feature_hasher.rb +1 -1
  14. data/lib/rumale/feature_extraction/hash_vectorizer.rb +1 -1
  15. data/lib/rumale/feature_extraction/tfidf_transformer.rb +113 -0
  16. data/lib/rumale/kernel_approximation/nystroem.rb +1 -1
  17. data/lib/rumale/kernel_machine/kernel_svc.rb +1 -1
  18. data/lib/rumale/linear_model/base_sgd.rb +1 -1
  19. data/lib/rumale/manifold/tsne.rb +1 -1
  20. data/lib/rumale/model_selection/cross_validation.rb +3 -2
  21. data/lib/rumale/model_selection/group_k_fold.rb +93 -0
  22. data/lib/rumale/model_selection/group_shuffle_split.rb +115 -0
  23. data/lib/rumale/model_selection/k_fold.rb +1 -1
  24. data/lib/rumale/model_selection/shuffle_split.rb +5 -5
  25. data/lib/rumale/model_selection/stratified_k_fold.rb +1 -1
  26. data/lib/rumale/model_selection/stratified_shuffle_split.rb +13 -9
  27. data/lib/rumale/model_selection/time_series_split.rb +91 -0
  28. data/lib/rumale/multiclass/one_vs_rest_classifier.rb +2 -2
  29. data/lib/rumale/nearest_neighbors/vp_tree.rb +1 -1
  30. data/lib/rumale/neural_network/base_mlp.rb +1 -1
  31. data/lib/rumale/preprocessing/binarizer.rb +60 -0
  32. data/lib/rumale/preprocessing/l1_normalizer.rb +62 -0
  33. data/lib/rumale/preprocessing/l2_normalizer.rb +2 -1
  34. data/lib/rumale/preprocessing/max_normalizer.rb +62 -0
  35. data/lib/rumale/probabilistic_output.rb +1 -1
  36. data/lib/rumale/version.rb +1 -1
  37. metadata +9 -12
  38. data/lib/rumale/linear_model/base_linear_model.rb +0 -102
  39. data/lib/rumale/optimizer/ada_grad.rb +0 -42
  40. data/lib/rumale/optimizer/adam.rb +0 -56
  41. data/lib/rumale/optimizer/nadam.rb +0 -67
  42. data/lib/rumale/optimizer/rmsprop.rb +0 -50
  43. data/lib/rumale/optimizer/sgd.rb +0 -46
  44. data/lib/rumale/optimizer/yellow_fin.rb +0 -104
  45. data/lib/rumale/polynomial_model/base_factorization_machine.rb +0 -125
  46. data/lib/rumale/polynomial_model/factorization_machine_classifier.rb +0 -220
  47. data/lib/rumale/polynomial_model/factorization_machine_regressor.rb +0 -134
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: f49170105721cfebcae9f1a424e9a858650d78225541a8cb63b0ad4c70734988
4
- data.tar.gz: ecc35086328eee1066252e75b8cd638256039e93beebc0bce5714493fe72570b
3
+ metadata.gz: 5d8c93acbf38fbd07e5df224010abbdd4269a6ce3bbf8112a0eba652a606785d
4
+ data.tar.gz: e7cb00a802420854835c92f011425f3054bfcc1052bf7b3664da1f95834ef435
5
5
  SHA512:
6
- metadata.gz: 68f432bb34ff6c8e467a91d7c7e3aa07e816c2dd8807defc9e4e82e7a720c925062dbd27c8a7ec3294ecef2d71041baead2510edaf03a1eee210dc811eede22d
7
- data.tar.gz: 5854eacc12de6c3cdcdbab0f9b4e73fc64d1be0533732348da6b4d6dcb0be9f115e2415501b05148fd021fa844ac0c25adc1bb858432a02ca6fe19d30a3538c7
6
+ metadata.gz: f95fdd89b84dad02e516ee0479b1cddfb101cb96de897b6e7fa3fba546272a243cff5cfe954cb51942ec1ab23cf3028b183db86b52fab00a35d15be7eee5bf92
7
+ data.tar.gz: e5f6235e88dd47b9002a2154cabd2c1e64afb6cbb5b0745b411c7e5559351e925c9db8ec332724e301b83215662b3582e79a9e997f0338846514b234dabf1fc3
@@ -3,6 +3,7 @@ require:
3
3
  - rubocop-rspec
4
4
 
5
5
  AllCops:
6
+ NewCops: enable
6
7
  TargetRubyVersion: 2.5
7
8
  DisplayCopNames: true
8
9
  DisplayStyleGuide: true
@@ -15,25 +16,12 @@ AllCops:
15
16
  Style/Documentation:
16
17
  Enabled: false
17
18
 
18
- Style/HashEachMethods:
19
- Enabled: true
20
-
21
- Style/HashTransformKeys:
22
- Enabled: true
23
-
24
- Style/HashTransformValues:
25
- Enabled: true
26
-
27
- Lint/RaiseException:
28
- Enabled: true
29
-
30
- Lint/StructNewOverride:
31
- Enabled: true
32
-
33
19
  Layout/LineLength:
34
20
  Max: 145
35
21
  IgnoredPatterns: ['(\A|\s)#']
36
22
 
23
+ Lint/MissingSuper:
24
+ Enabled: false
37
25
 
38
26
  Metrics/ModuleLength:
39
27
  Max: 200
@@ -70,26 +58,14 @@ Naming/MethodParameterName:
70
58
  Naming/ConstantName:
71
59
  Enabled: false
72
60
 
73
- Style/ExponentialNotation:
74
- Enabled: true
75
-
76
61
  Style/FormatStringToken:
77
62
  Enabled: false
78
63
 
79
64
  Style/NumericLiterals:
80
65
  Enabled: false
81
66
 
82
- Style/SlicingWithRange:
83
- Enabled: true
84
-
85
- Layout/EmptyLineAfterGuardClause:
86
- Enabled: true
87
-
88
- Layout/EmptyLinesAroundAttributeAccessor:
89
- Enabled: true
90
-
91
- Layout/SpaceAroundMethodCallOperator:
92
- Enabled: true
67
+ Style/StringConcatenation:
68
+ Enabled: false
93
69
 
94
70
  RSpec/MultipleExpectations:
95
71
  Enabled: false
@@ -1,8 +1,36 @@
1
+ # 0.20.2
2
+ - Add cross-validator class for time-series data.
3
+ - [TimeSeriesSplit](https://yoshoku.github.io/rumale/doc/Rumale/ModelSelection/TimeSeriesSplit.html)
4
+
5
+ # 0.20.1
6
+ - Add cross-validator classes that split data according to group labels.
7
+ - [GroupKFold](https://yoshoku.github.io/rumale/doc/Rumale/ModelSelection/GroupKFold.html)
8
+ - [GroupShuffleSplit](https://yoshoku.github.io/rumale/doc/Rumale/ModelSelection/GroupShuffleSplit.html)
9
+ - Fix fraction treating of the number of samples on shuffle split cross-validator classes.
10
+ - [ShuffleSplit](https://yoshoku.github.io/rumale/doc/Rumale/ModelSelection/ShuffleSplit.html)
11
+ - [StratifiedShuffleSplit](https://yoshoku.github.io/rumale/doc/Rumale/ModelSelection/StratifiedShuffleSplit.html)
12
+ - Refactor some code with RuboCop.
13
+
14
+ # 0.20.0
15
+ ## Breaking changes
16
+ - Delete deprecated estimators such as PolynomialModel, Optimizer, and BaseLinearModel.
17
+
18
+ # 0.19.3
19
+ - Add preprocessing class for [Binarizer](https://yoshoku.github.io/rumale/doc/Rumale/Preprocessing/Binarizer.html)
20
+ - Add preprocessing class for [MaxNormalizer](https://yoshoku.github.io/rumale/doc/Rumale/Preprocessing/MaxNormalizer.html)
21
+ - Refactor some code with RuboCop.
22
+
23
+ # 0.19.2
24
+ - Fix L2Normalizer to avoid zero divide.
25
+ - Add preprocessing class for [L1Normalizer](https://yoshoku.github.io/rumale/doc/Rumale/Preprocessing/L1Normalizer.html).
26
+ - Add transformer class for [TfidfTransformer](https://yoshoku.github.io/rumale/doc/Rumale/FeatureExtraction/TfidfTransformer.html).
27
+
1
28
  # 0.19.1
2
29
  - Add cluster analysis class for [mini-batch K-Means](https://yoshoku.github.io/rumale/doc/Rumale/Clustering/MiniBatchKMeans.html).
3
30
  - Fix some typos.
4
31
 
5
32
  # 0.19.0
33
+ ## Breaking changes
6
34
  - Change mmh3 and mopti gem to non-runtime dependent library.
7
35
  - The mmh3 gem is used in [FeatureHasher](https://yoshoku.github.io/rumale/doc/Rumale/FeatureExtraction/FeatureHasher.html).
8
36
  You only need to require mmh3 gem when using FeatureHasher.
@@ -18,17 +18,10 @@ require 'rumale/base/cluster_analyzer'
18
18
  require 'rumale/base/transformer'
19
19
  require 'rumale/base/splitter'
20
20
  require 'rumale/base/evaluator'
21
- require 'rumale/optimizer/sgd'
22
- require 'rumale/optimizer/ada_grad'
23
- require 'rumale/optimizer/rmsprop'
24
- require 'rumale/optimizer/adam'
25
- require 'rumale/optimizer/nadam'
26
- require 'rumale/optimizer/yellow_fin'
27
21
  require 'rumale/pipeline/pipeline'
28
22
  require 'rumale/pipeline/feature_union'
29
23
  require 'rumale/kernel_approximation/rbf'
30
24
  require 'rumale/kernel_approximation/nystroem'
31
- require 'rumale/linear_model/base_linear_model'
32
25
  require 'rumale/linear_model/base_sgd'
33
26
  require 'rumale/linear_model/svc'
34
27
  require 'rumale/linear_model/svr'
@@ -41,9 +34,6 @@ require 'rumale/kernel_machine/kernel_svc'
41
34
  require 'rumale/kernel_machine/kernel_pca'
42
35
  require 'rumale/kernel_machine/kernel_fda'
43
36
  require 'rumale/kernel_machine/kernel_ridge'
44
- require 'rumale/polynomial_model/base_factorization_machine'
45
- require 'rumale/polynomial_model/factorization_machine_classifier'
46
- require 'rumale/polynomial_model/factorization_machine_regressor'
47
37
  require 'rumale/multiclass/one_vs_rest_classifier'
48
38
  require 'rumale/nearest_neighbors/vp_tree'
49
39
  require 'rumale/nearest_neighbors/k_neighbors_classifier'
@@ -93,7 +83,10 @@ require 'rumale/neural_network/mlp_regressor'
93
83
  require 'rumale/neural_network/mlp_classifier'
94
84
  require 'rumale/feature_extraction/hash_vectorizer'
95
85
  require 'rumale/feature_extraction/feature_hasher'
86
+ require 'rumale/feature_extraction/tfidf_transformer'
96
87
  require 'rumale/preprocessing/l2_normalizer'
88
+ require 'rumale/preprocessing/l1_normalizer'
89
+ require 'rumale/preprocessing/max_normalizer'
97
90
  require 'rumale/preprocessing/min_max_scaler'
98
91
  require 'rumale/preprocessing/max_abs_scaler'
99
92
  require 'rumale/preprocessing/standard_scaler'
@@ -102,11 +95,15 @@ require 'rumale/preprocessing/label_binarizer'
102
95
  require 'rumale/preprocessing/label_encoder'
103
96
  require 'rumale/preprocessing/one_hot_encoder'
104
97
  require 'rumale/preprocessing/ordinal_encoder'
98
+ require 'rumale/preprocessing/binarizer'
105
99
  require 'rumale/preprocessing/polynomial_features'
106
100
  require 'rumale/model_selection/k_fold'
101
+ require 'rumale/model_selection/group_k_fold'
107
102
  require 'rumale/model_selection/stratified_k_fold'
108
103
  require 'rumale/model_selection/shuffle_split'
104
+ require 'rumale/model_selection/group_shuffle_split'
109
105
  require 'rumale/model_selection/stratified_shuffle_split'
106
+ require 'rumale/model_selection/time_series_split'
110
107
  require 'rumale/model_selection/cross_validation'
111
108
  require 'rumale/model_selection/grid_search_cv'
112
109
  require 'rumale/model_selection/function'
@@ -136,7 +136,7 @@ module Rumale
136
136
  res
137
137
  end
138
138
 
139
- # rubocop:disable Metrics/AbcSize, Metrics/MethodLength, Metrics/PerceivedComplexity
139
+ # rubocop:disable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/MethodLength, Metrics/PerceivedComplexity
140
140
  def condense_tree(hierarchy, min_cluster_size)
141
141
  n_edges = hierarchy.size
142
142
  root = 2 * n_edges
@@ -232,7 +232,7 @@ module Rumale
232
232
  end
233
233
 
234
234
  def flatten(tree, stabilities)
235
- node_ids = stabilities.keys.sort { |a, b| b <=> a }.slice(0, stabilities.size - 1)
235
+ node_ids = stabilities.keys.sort.reverse.slice(0, stabilities.size - 1)
236
236
 
237
237
  cluster_tree = tree.select { |edge| edge.n_elements > 1 }
238
238
  is_cluster = node_ids.each_with_object({}) { |n_id, h| h[n_id] = true }
@@ -265,7 +265,7 @@ module Rumale
265
265
  end
266
266
  res
267
267
  end
268
- # rubocop:enable Metrics/AbcSize, Metrics/MethodLength, Metrics/PerceivedComplexity
268
+ # rubocop:enable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/MethodLength, Metrics/PerceivedComplexity
269
269
  end
270
270
  end
271
271
  end
@@ -103,7 +103,7 @@ module Rumale
103
103
  # random initialize
104
104
  n_samples = x.shape[0]
105
105
  sub_rng = @rng.dup
106
- rand_id = [*0...n_samples].sample(@params[:n_clusters], random: sub_rng)
106
+ rand_id = Array(0...n_samples).sample(@params[:n_clusters], random: sub_rng)
107
107
  @cluster_centers = x[rand_id, true].dup
108
108
  return unless @params[:init] == 'k-means++'
109
109
 
@@ -124,7 +124,7 @@ module Rumale
124
124
  # random initialize
125
125
  n_samples = distance_mat.shape[0]
126
126
  sub_rng = @rng.dup
127
- @medoid_ids = Numo::Int32.asarray([*0...n_samples].sample(@params[:n_clusters], random: sub_rng))
127
+ @medoid_ids = Numo::Int32.asarray(Array(0...n_samples).sample(@params[:n_clusters], random: sub_rng))
128
128
  return unless @params[:init] == 'k-means++'
129
129
 
130
130
  # k-means++ initialize
@@ -67,7 +67,7 @@ module Rumale
67
67
  init_cluster_centers(x, sub_rng)
68
68
  # optimization with mini-batch sgd.
69
69
  @params[:max_iter].times do |_t|
70
- sample_ids = [*0...n_samples].shuffle(random: sub_rng)
70
+ sample_ids = Array(0...n_samples).shuffle(random: sub_rng)
71
71
  old_centers = @cluster_centers.dup
72
72
  until (subset_ids = sample_ids.shift(@params[:batch_size])).empty?
73
73
  # sub sampling
@@ -120,7 +120,7 @@ module Rumale
120
120
  def init_cluster_centers(x, sub_rng)
121
121
  # random initialize
122
122
  n_samples = x.shape[0]
123
- rand_id = [*0...n_samples].sample(@params[:n_clusters], random: sub_rng)
123
+ rand_id = Array(0...n_samples).sample(@params[:n_clusters], random: sub_rng)
124
124
  @cluster_centers = x[rand_id, true].dup
125
125
  return unless @params[:init] == 'k-means++'
126
126
 
@@ -81,7 +81,7 @@ module Rumale
81
81
  y = Numo::Int32.hstack([Numo::Int32.zeros(n_samples_out), Numo::Int32.ones(n_samples_in)])
82
82
  # shuffle data indices.
83
83
  if shuffle
84
- rand_ids = [*0...n_samples].shuffle(random: rng.dup)
84
+ rand_ids = Array(0...n_samples).shuffle(random: rng.dup)
85
85
  x = x[rand_ids, true].dup
86
86
  y = y[rand_ids].dup
87
87
  end
@@ -118,7 +118,7 @@ module Rumale
118
118
  y = Numo::Int32.hstack([Numo::Int32.zeros(n_samples_out), Numo::Int32.ones(n_samples_in)])
119
119
  # shuffle data indices.
120
120
  if shuffle
121
- rand_ids = [*0...n_samples].shuffle(random: rng.dup)
121
+ rand_ids = Array(0...n_samples).shuffle(random: rng.dup)
122
122
  x = x[rand_ids, true].dup
123
123
  y = y[rand_ids].dup
124
124
  end
@@ -173,7 +173,7 @@ module Rumale
173
173
  end
174
174
  # shuffle data.
175
175
  if shuffle
176
- rand_ids = [*0...n_samples].shuffle(random: rng.dup)
176
+ rand_ids = Array(0...n_samples).shuffle(random: rng.dup)
177
177
  x = x[rand_ids, true].dup
178
178
  y = y[rand_ids].dup
179
179
  end
@@ -225,7 +225,7 @@ module Rumale
225
225
  line = dump_label(label, label_type.to_s)
226
226
  ftvec.to_a.each_with_index do |val, n|
227
227
  idx = n + (zero_based == false ? 1 : 0)
228
- line += format(" %d:#{value_type}", idx, val) if val != 0.0
228
+ line += format(" %d:#{value_type}", idx, val) if val != 0
229
229
  end
230
230
  line
231
231
  end
@@ -77,7 +77,7 @@ module Rumale
77
77
  # @return [Numo::DFloat] (shape: [n_samples, n_components]) The transformed data.
78
78
  def transform(x)
79
79
  x = check_convert_sample_array(x)
80
- partial_fit(x, false)
80
+ partial_fit(x, update_comps: false)
81
81
  end
82
82
 
83
83
  # Inverse transform the given transformed data with the learned model.
@@ -91,7 +91,7 @@ module Rumale
91
91
 
92
92
  private
93
93
 
94
- def partial_fit(x, update_comps = true)
94
+ def partial_fit(x, update_comps: true)
95
95
  # initialize some variables.
96
96
  n_samples, n_features = x.shape
97
97
  scale = Math.sqrt(x.mean / @params[:n_components])
@@ -85,7 +85,7 @@ module Rumale
85
85
  # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for fitting the model.
86
86
  # @param y [Numo::Int32] (shape: [n_samples]) The labels to be used for fitting the model.
87
87
  # @return [RandomForestClassifier] The learned classifier itself.
88
- def fit(x, y)
88
+ def fit(x, y) # rubocop:disable Metrics/AbcSize
89
89
  x = check_convert_sample_array(x)
90
90
  y = check_convert_label_array(y)
91
91
  check_sample_label_size(x, y)
@@ -79,7 +79,7 @@ module Rumale
79
79
  # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for fitting the model.
80
80
  # @param y [Numo::DFloat] (shape: [n_samples, n_outputs]) The target values to be used for fitting the model.
81
81
  # @return [RandomForestRegressor] The learned regressor itself.
82
- def fit(x, y)
82
+ def fit(x, y) # rubocop:disable Metrics/AbcSize
83
83
  x = check_convert_sample_array(x)
84
84
  y = check_convert_tvalue_array(y)
85
85
  check_sample_tvalue_size(x, y)
@@ -67,7 +67,7 @@ module Rumale
67
67
  def transform(x)
68
68
  raise 'FeatureHasher#transform requires Mmh3 but that is not loaded.' unless enable_mmh3?
69
69
 
70
- x = [x] unless x.is_a?(Array)
70
+ x = [x] unless x.is_a?(Array) # rubocop:disable Style/ArrayCoercion
71
71
  n_samples = x.size
72
72
 
73
73
  z = Numo::DFloat.zeros(n_samples, n_features)
@@ -99,7 +99,7 @@ module Rumale
99
99
  # @param x [Array<Hash>] (shape: [n_samples]) The array of hash consisting of feature names and values.
100
100
  # @return [Numo::DFloat] (shape: [n_samples, n_features]) The encoded sample array.
101
101
  def transform(x)
102
- x = [x] unless x.is_a?(Array)
102
+ x = [x] unless x.is_a?(Array) # rubocop:disable Style/ArrayCoercion
103
103
  n_samples = x.size
104
104
  n_features = @vocabulary.size
105
105
  z = Numo::DFloat.zeros(n_samples, n_features)
@@ -0,0 +1,113 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'rumale/base/base_estimator'
4
+ require 'rumale/base/transformer'
5
+ require 'rumale/preprocessing/l1_normalizer'
6
+ require 'rumale/preprocessing/l2_normalizer'
7
+
8
+ module Rumale
9
+ module FeatureExtraction
10
+ # Transform sample matrix with term frequency (tf) to a normalized tf-idf (inverse document frequency) representation.
11
+ #
12
+ # @example
13
+ # encoder = Rumale::FeatureExtraction::HashVectorizer.new
14
+ # x = encoder.fit_transform([
15
+ # { foo: 1, bar: 2 },
16
+ # { foo: 3, baz: 1 }
17
+ # ])
18
+ #
19
+ # # > pp x
20
+ # # Numo::DFloat#shape=[2,3]
21
+ # # [[2, 0, 1],
22
+ # # [0, 1, 3]]
23
+ #
24
+ # transformer = Rumale::FeatureExtraction::TfidfTransformer.new
25
+ # x_tfidf = transformer.fit_transform(x)
26
+ #
27
+ # # > pp x_tfidf
28
+ # # Numo::DFloat#shape=[2,3]
29
+ # # [[0.959056, 0, 0.283217],
30
+ # # [0, 0.491506, 0.870874]]
31
+ #
32
+ # *Reference*
33
+ # - Manning, C D., Raghavan, P., and Schutze, H., "Introduction to Information Retrieval," Cambridge University Press., 2008.
34
+ class TfidfTransformer
35
+ include Base::BaseEstimator
36
+ include Base::Transformer
37
+
38
+ # Return the vector consisting of inverse document frequency.
39
+ # @return [Numo::DFloat] (shape: [n_features])
40
+ attr_reader :idf
41
+
42
+ # Create a new transformer for converting tf vectors to tf-idf vectors.
43
+ #
44
+ # @param norm [String] The normalization method to be used ('l1', 'l2' and 'none').
45
+ # @param use_idf [Boolean] The flag indicating whether to use inverse document frequency weighting.
46
+ # @param smooth_idf [Boolean] The flag indicating whether to apply idf smoothing by log((n_samples + 1) / (df + 1)) + 1.
47
+ # @param sublinear_tf [Boolean] The flag indicating whether to perform sublinear tf scaling by 1 + log(tf).
48
+ def initialize(norm: 'l2', use_idf: true, smooth_idf: false, sublinear_tf: false)
49
+ check_params_string(norm: norm)
50
+ check_params_boolean(use_idf: use_idf, smooth_idf: smooth_idf, sublinear_tf: sublinear_tf)
51
+ @params = {}
52
+ @params[:norm] = norm
53
+ @params[:use_idf] = use_idf
54
+ @params[:smooth_idf] = smooth_idf
55
+ @params[:sublinear_tf] = sublinear_tf
56
+ @idf = nil
57
+ end
58
+
59
+ # Calculate the inverse document frequency for weighting.
60
+ #
61
+ # @overload fit(x) -> TfidfTransformer
62
+ #
63
+ # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to calculate the idf values.
64
+ # @return [TfidfTransformer]
65
+ def fit(x, _y = nil)
66
+ return self unless @params[:use_idf]
67
+
68
+ x = check_convert_sample_array(x)
69
+
70
+ n_samples = x.shape[0]
71
+ df = x.class.cast(x.gt(0.0).count(0))
72
+
73
+ if @params[:smooth_idf]
74
+ df += 1
75
+ n_samples += 1
76
+ end
77
+
78
+ @idf = Numo::NMath.log(n_samples / df) + 1
79
+
80
+ self
81
+ end
82
+
83
+ # Calculate the idf values, and then transform samples to the tf-idf representation.
84
+ #
85
+ # @overload fit_transform(x) -> Numo::DFloat
86
+ #
87
+ # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to calculate idf and be transformed to tf-idf representation.
88
+ # @return [Numo::DFloat] The transformed samples.
89
+ def fit_transform(x, _y = nil)
90
+ fit(x).transform(x)
91
+ end
92
+
93
+ # Perform transforming the given samples to the tf-idf representation.
94
+ #
95
+ # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to be transformed.
96
+ # @return [Numo::DFloat] The transformed samples.
97
+ def transform(x)
98
+ x = check_convert_sample_array(x)
99
+ z = x.dup
100
+
101
+ z[z.ne(0)] = Numo::NMath.log(z[z.ne(0)]) + 1 if @params[:sublinear_tf]
102
+ z *= @idf if @params[:use_idf]
103
+ case @params[:norm]
104
+ when 'l2'
105
+ z = Rumale::Preprocessing::L2Normalizer.new.fit_transform(z)
106
+ when 'l1'
107
+ z = Rumale::Preprocessing::L1Normalizer.new.fit_transform(z)
108
+ end
109
+ z
110
+ end
111
+ end
112
+ end
113
+ end
@@ -69,7 +69,7 @@ module Rumale
69
69
  n_components = [1, [@params[:n_components], n_samples].min].max
70
70
 
71
71
  # random sampling.
72
- @component_indices = Numo::Int32.cast([*0...n_samples].shuffle(random: sub_rng)[0...n_components])
72
+ @component_indices = Numo::Int32.cast(Array(0...n_samples).shuffle(random: sub_rng)[0...n_components])
73
73
  @components = x[@component_indices, true]
74
74
 
75
75
  # calculate normalizing factor.