rumale 0.19.1 → 0.20.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.rubocop.yml +5 -29
- data/CHANGELOG.md +28 -0
- data/lib/rumale.rb +7 -10
- data/lib/rumale/clustering/hdbscan.rb +3 -3
- data/lib/rumale/clustering/k_means.rb +1 -1
- data/lib/rumale/clustering/k_medoids.rb +1 -1
- data/lib/rumale/clustering/mini_batch_k_means.rb +2 -2
- data/lib/rumale/dataset.rb +4 -4
- data/lib/rumale/decomposition/nmf.rb +2 -2
- data/lib/rumale/ensemble/random_forest_classifier.rb +1 -1
- data/lib/rumale/ensemble/random_forest_regressor.rb +1 -1
- data/lib/rumale/feature_extraction/feature_hasher.rb +1 -1
- data/lib/rumale/feature_extraction/hash_vectorizer.rb +1 -1
- data/lib/rumale/feature_extraction/tfidf_transformer.rb +113 -0
- data/lib/rumale/kernel_approximation/nystroem.rb +1 -1
- data/lib/rumale/kernel_machine/kernel_svc.rb +1 -1
- data/lib/rumale/linear_model/base_sgd.rb +1 -1
- data/lib/rumale/manifold/tsne.rb +1 -1
- data/lib/rumale/model_selection/cross_validation.rb +3 -2
- data/lib/rumale/model_selection/group_k_fold.rb +93 -0
- data/lib/rumale/model_selection/group_shuffle_split.rb +115 -0
- data/lib/rumale/model_selection/k_fold.rb +1 -1
- data/lib/rumale/model_selection/shuffle_split.rb +5 -5
- data/lib/rumale/model_selection/stratified_k_fold.rb +1 -1
- data/lib/rumale/model_selection/stratified_shuffle_split.rb +13 -9
- data/lib/rumale/model_selection/time_series_split.rb +91 -0
- data/lib/rumale/multiclass/one_vs_rest_classifier.rb +2 -2
- data/lib/rumale/nearest_neighbors/vp_tree.rb +1 -1
- data/lib/rumale/neural_network/base_mlp.rb +1 -1
- data/lib/rumale/preprocessing/binarizer.rb +60 -0
- data/lib/rumale/preprocessing/l1_normalizer.rb +62 -0
- data/lib/rumale/preprocessing/l2_normalizer.rb +2 -1
- data/lib/rumale/preprocessing/max_normalizer.rb +62 -0
- data/lib/rumale/probabilistic_output.rb +1 -1
- data/lib/rumale/version.rb +1 -1
- metadata +9 -12
- data/lib/rumale/linear_model/base_linear_model.rb +0 -102
- data/lib/rumale/optimizer/ada_grad.rb +0 -42
- data/lib/rumale/optimizer/adam.rb +0 -56
- data/lib/rumale/optimizer/nadam.rb +0 -67
- data/lib/rumale/optimizer/rmsprop.rb +0 -50
- data/lib/rumale/optimizer/sgd.rb +0 -46
- data/lib/rumale/optimizer/yellow_fin.rb +0 -104
- data/lib/rumale/polynomial_model/base_factorization_machine.rb +0 -125
- data/lib/rumale/polynomial_model/factorization_machine_classifier.rb +0 -220
- data/lib/rumale/polynomial_model/factorization_machine_regressor.rb +0 -134
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 5d8c93acbf38fbd07e5df224010abbdd4269a6ce3bbf8112a0eba652a606785d
|
4
|
+
data.tar.gz: e7cb00a802420854835c92f011425f3054bfcc1052bf7b3664da1f95834ef435
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: f95fdd89b84dad02e516ee0479b1cddfb101cb96de897b6e7fa3fba546272a243cff5cfe954cb51942ec1ab23cf3028b183db86b52fab00a35d15be7eee5bf92
|
7
|
+
data.tar.gz: e5f6235e88dd47b9002a2154cabd2c1e64afb6cbb5b0745b411c7e5559351e925c9db8ec332724e301b83215662b3582e79a9e997f0338846514b234dabf1fc3
|
data/.rubocop.yml
CHANGED
@@ -3,6 +3,7 @@ require:
|
|
3
3
|
- rubocop-rspec
|
4
4
|
|
5
5
|
AllCops:
|
6
|
+
NewCops: enable
|
6
7
|
TargetRubyVersion: 2.5
|
7
8
|
DisplayCopNames: true
|
8
9
|
DisplayStyleGuide: true
|
@@ -15,25 +16,12 @@ AllCops:
|
|
15
16
|
Style/Documentation:
|
16
17
|
Enabled: false
|
17
18
|
|
18
|
-
Style/HashEachMethods:
|
19
|
-
Enabled: true
|
20
|
-
|
21
|
-
Style/HashTransformKeys:
|
22
|
-
Enabled: true
|
23
|
-
|
24
|
-
Style/HashTransformValues:
|
25
|
-
Enabled: true
|
26
|
-
|
27
|
-
Lint/RaiseException:
|
28
|
-
Enabled: true
|
29
|
-
|
30
|
-
Lint/StructNewOverride:
|
31
|
-
Enabled: true
|
32
|
-
|
33
19
|
Layout/LineLength:
|
34
20
|
Max: 145
|
35
21
|
IgnoredPatterns: ['(\A|\s)#']
|
36
22
|
|
23
|
+
Lint/MissingSuper:
|
24
|
+
Enabled: false
|
37
25
|
|
38
26
|
Metrics/ModuleLength:
|
39
27
|
Max: 200
|
@@ -70,26 +58,14 @@ Naming/MethodParameterName:
|
|
70
58
|
Naming/ConstantName:
|
71
59
|
Enabled: false
|
72
60
|
|
73
|
-
Style/ExponentialNotation:
|
74
|
-
Enabled: true
|
75
|
-
|
76
61
|
Style/FormatStringToken:
|
77
62
|
Enabled: false
|
78
63
|
|
79
64
|
Style/NumericLiterals:
|
80
65
|
Enabled: false
|
81
66
|
|
82
|
-
Style/
|
83
|
-
Enabled:
|
84
|
-
|
85
|
-
Layout/EmptyLineAfterGuardClause:
|
86
|
-
Enabled: true
|
87
|
-
|
88
|
-
Layout/EmptyLinesAroundAttributeAccessor:
|
89
|
-
Enabled: true
|
90
|
-
|
91
|
-
Layout/SpaceAroundMethodCallOperator:
|
92
|
-
Enabled: true
|
67
|
+
Style/StringConcatenation:
|
68
|
+
Enabled: false
|
93
69
|
|
94
70
|
RSpec/MultipleExpectations:
|
95
71
|
Enabled: false
|
data/CHANGELOG.md
CHANGED
@@ -1,8 +1,36 @@
|
|
1
|
+
# 0.20.2
|
2
|
+
- Add cross-validator class for time-series data.
|
3
|
+
- [TimeSeriesSplit](https://yoshoku.github.io/rumale/doc/Rumale/ModelSelection/TimeSeriesSplit.html)
|
4
|
+
|
5
|
+
# 0.20.1
|
6
|
+
- Add cross-validator classes that split data according to group labels.
|
7
|
+
- [GroupKFold](https://yoshoku.github.io/rumale/doc/Rumale/ModelSelection/GroupKFold.html)
|
8
|
+
- [GroupShuffleSplit](https://yoshoku.github.io/rumale/doc/Rumale/ModelSelection/GroupShuffleSplit.html)
|
9
|
+
- Fix the handling of fractional numbers of samples in the shuffle split cross-validator classes.
|
10
|
+
- [ShuffleSplit](https://yoshoku.github.io/rumale/doc/Rumale/ModelSelection/ShuffleSplit.html)
|
11
|
+
- [StratifiedShuffleSplit](https://yoshoku.github.io/rumale/doc/Rumale/ModelSelection/StratifiedShuffleSplit.html)
|
12
|
+
- Refactor some code with RuboCop.
|
13
|
+
|
14
|
+
# 0.20.0
|
15
|
+
## Breaking changes
|
16
|
+
- Delete deprecated estimators such as PolynomialModel, Optimizer, and BaseLinearModel.
|
17
|
+
|
18
|
+
# 0.19.3
|
19
|
+
- Add preprocessing class for [Binarizer](https://yoshoku.github.io/rumale/doc/Rumale/Preprocessing/Binarizer.html)
|
20
|
+
- Add preprocessing class for [MaxNormalizer](https://yoshoku.github.io/rumale/doc/Rumale/Preprocessing/MaxNormalizer.html)
|
21
|
+
- Refactor some code with RuboCop.
|
22
|
+
|
23
|
+
# 0.19.2
|
24
|
+
- Fix L2Normalizer to avoid zero divide.
|
25
|
+
- Add preprocessing class for [L1Normalizer](https://yoshoku.github.io/rumale/doc/Rumale/Preprocessing/L1Normalizer.html).
|
26
|
+
- Add transformer class for [TfidfTransformer](https://yoshoku.github.io/rumale/doc/Rumale/FeatureExtraction/TfidfTransformer.html).
|
27
|
+
|
1
28
|
# 0.19.1
|
2
29
|
- Add cluster analysis class for [mini-batch K-Means](https://yoshoku.github.io/rumale/doc/Rumale/Clustering/MiniBatchKMeans.html).
|
3
30
|
- Fix some typos.
|
4
31
|
|
5
32
|
# 0.19.0
|
33
|
+
## Breaking changes
|
6
34
|
- Change mmh3 and mopti gem to non-runtime dependent library.
|
7
35
|
- The mmh3 gem is used in [FeatureHasher](https://yoshoku.github.io/rumale/doc/Rumale/FeatureExtraction/FeatureHasher.html).
|
8
36
|
You only need to require mmh3 gem when using FeatureHasher.
|
data/lib/rumale.rb
CHANGED
@@ -18,17 +18,10 @@ require 'rumale/base/cluster_analyzer'
|
|
18
18
|
require 'rumale/base/transformer'
|
19
19
|
require 'rumale/base/splitter'
|
20
20
|
require 'rumale/base/evaluator'
|
21
|
-
require 'rumale/optimizer/sgd'
|
22
|
-
require 'rumale/optimizer/ada_grad'
|
23
|
-
require 'rumale/optimizer/rmsprop'
|
24
|
-
require 'rumale/optimizer/adam'
|
25
|
-
require 'rumale/optimizer/nadam'
|
26
|
-
require 'rumale/optimizer/yellow_fin'
|
27
21
|
require 'rumale/pipeline/pipeline'
|
28
22
|
require 'rumale/pipeline/feature_union'
|
29
23
|
require 'rumale/kernel_approximation/rbf'
|
30
24
|
require 'rumale/kernel_approximation/nystroem'
|
31
|
-
require 'rumale/linear_model/base_linear_model'
|
32
25
|
require 'rumale/linear_model/base_sgd'
|
33
26
|
require 'rumale/linear_model/svc'
|
34
27
|
require 'rumale/linear_model/svr'
|
@@ -41,9 +34,6 @@ require 'rumale/kernel_machine/kernel_svc'
|
|
41
34
|
require 'rumale/kernel_machine/kernel_pca'
|
42
35
|
require 'rumale/kernel_machine/kernel_fda'
|
43
36
|
require 'rumale/kernel_machine/kernel_ridge'
|
44
|
-
require 'rumale/polynomial_model/base_factorization_machine'
|
45
|
-
require 'rumale/polynomial_model/factorization_machine_classifier'
|
46
|
-
require 'rumale/polynomial_model/factorization_machine_regressor'
|
47
37
|
require 'rumale/multiclass/one_vs_rest_classifier'
|
48
38
|
require 'rumale/nearest_neighbors/vp_tree'
|
49
39
|
require 'rumale/nearest_neighbors/k_neighbors_classifier'
|
@@ -93,7 +83,10 @@ require 'rumale/neural_network/mlp_regressor'
|
|
93
83
|
require 'rumale/neural_network/mlp_classifier'
|
94
84
|
require 'rumale/feature_extraction/hash_vectorizer'
|
95
85
|
require 'rumale/feature_extraction/feature_hasher'
|
86
|
+
require 'rumale/feature_extraction/tfidf_transformer'
|
96
87
|
require 'rumale/preprocessing/l2_normalizer'
|
88
|
+
require 'rumale/preprocessing/l1_normalizer'
|
89
|
+
require 'rumale/preprocessing/max_normalizer'
|
97
90
|
require 'rumale/preprocessing/min_max_scaler'
|
98
91
|
require 'rumale/preprocessing/max_abs_scaler'
|
99
92
|
require 'rumale/preprocessing/standard_scaler'
|
@@ -102,11 +95,15 @@ require 'rumale/preprocessing/label_binarizer'
|
|
102
95
|
require 'rumale/preprocessing/label_encoder'
|
103
96
|
require 'rumale/preprocessing/one_hot_encoder'
|
104
97
|
require 'rumale/preprocessing/ordinal_encoder'
|
98
|
+
require 'rumale/preprocessing/binarizer'
|
105
99
|
require 'rumale/preprocessing/polynomial_features'
|
106
100
|
require 'rumale/model_selection/k_fold'
|
101
|
+
require 'rumale/model_selection/group_k_fold'
|
107
102
|
require 'rumale/model_selection/stratified_k_fold'
|
108
103
|
require 'rumale/model_selection/shuffle_split'
|
104
|
+
require 'rumale/model_selection/group_shuffle_split'
|
109
105
|
require 'rumale/model_selection/stratified_shuffle_split'
|
106
|
+
require 'rumale/model_selection/time_series_split'
|
110
107
|
require 'rumale/model_selection/cross_validation'
|
111
108
|
require 'rumale/model_selection/grid_search_cv'
|
112
109
|
require 'rumale/model_selection/function'
|
@@ -136,7 +136,7 @@ module Rumale
|
|
136
136
|
res
|
137
137
|
end
|
138
138
|
|
139
|
-
# rubocop:disable Metrics/AbcSize, Metrics/MethodLength, Metrics/PerceivedComplexity
|
139
|
+
# rubocop:disable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/MethodLength, Metrics/PerceivedComplexity
|
140
140
|
def condense_tree(hierarchy, min_cluster_size)
|
141
141
|
n_edges = hierarchy.size
|
142
142
|
root = 2 * n_edges
|
@@ -232,7 +232,7 @@ module Rumale
|
|
232
232
|
end
|
233
233
|
|
234
234
|
def flatten(tree, stabilities)
|
235
|
-
node_ids = stabilities.keys.sort
|
235
|
+
node_ids = stabilities.keys.sort.reverse.slice(0, stabilities.size - 1)
|
236
236
|
|
237
237
|
cluster_tree = tree.select { |edge| edge.n_elements > 1 }
|
238
238
|
is_cluster = node_ids.each_with_object({}) { |n_id, h| h[n_id] = true }
|
@@ -265,7 +265,7 @@ module Rumale
|
|
265
265
|
end
|
266
266
|
res
|
267
267
|
end
|
268
|
-
# rubocop:enable Metrics/AbcSize, Metrics/MethodLength, Metrics/PerceivedComplexity
|
268
|
+
# rubocop:enable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/MethodLength, Metrics/PerceivedComplexity
|
269
269
|
end
|
270
270
|
end
|
271
271
|
end
|
@@ -103,7 +103,7 @@ module Rumale
|
|
103
103
|
# random initialize
|
104
104
|
n_samples = x.shape[0]
|
105
105
|
sub_rng = @rng.dup
|
106
|
-
rand_id =
|
106
|
+
rand_id = Array(0...n_samples).sample(@params[:n_clusters], random: sub_rng)
|
107
107
|
@cluster_centers = x[rand_id, true].dup
|
108
108
|
return unless @params[:init] == 'k-means++'
|
109
109
|
|
@@ -124,7 +124,7 @@ module Rumale
|
|
124
124
|
# random initialize
|
125
125
|
n_samples = distance_mat.shape[0]
|
126
126
|
sub_rng = @rng.dup
|
127
|
-
@medoid_ids = Numo::Int32.asarray(
|
127
|
+
@medoid_ids = Numo::Int32.asarray(Array(0...n_samples).sample(@params[:n_clusters], random: sub_rng))
|
128
128
|
return unless @params[:init] == 'k-means++'
|
129
129
|
|
130
130
|
# k-means++ initialize
|
@@ -67,7 +67,7 @@ module Rumale
|
|
67
67
|
init_cluster_centers(x, sub_rng)
|
68
68
|
# optimization with mini-batch sgd.
|
69
69
|
@params[:max_iter].times do |_t|
|
70
|
-
sample_ids =
|
70
|
+
sample_ids = Array(0...n_samples).shuffle(random: sub_rng)
|
71
71
|
old_centers = @cluster_centers.dup
|
72
72
|
until (subset_ids = sample_ids.shift(@params[:batch_size])).empty?
|
73
73
|
# sub sampling
|
@@ -120,7 +120,7 @@ module Rumale
|
|
120
120
|
def init_cluster_centers(x, sub_rng)
|
121
121
|
# random initialize
|
122
122
|
n_samples = x.shape[0]
|
123
|
-
rand_id =
|
123
|
+
rand_id = Array(0...n_samples).sample(@params[:n_clusters], random: sub_rng)
|
124
124
|
@cluster_centers = x[rand_id, true].dup
|
125
125
|
return unless @params[:init] == 'k-means++'
|
126
126
|
|
data/lib/rumale/dataset.rb
CHANGED
@@ -81,7 +81,7 @@ module Rumale
|
|
81
81
|
y = Numo::Int32.hstack([Numo::Int32.zeros(n_samples_out), Numo::Int32.ones(n_samples_in)])
|
82
82
|
# shuffle data indices.
|
83
83
|
if shuffle
|
84
|
-
rand_ids =
|
84
|
+
rand_ids = Array(0...n_samples).shuffle(random: rng.dup)
|
85
85
|
x = x[rand_ids, true].dup
|
86
86
|
y = y[rand_ids].dup
|
87
87
|
end
|
@@ -118,7 +118,7 @@ module Rumale
|
|
118
118
|
y = Numo::Int32.hstack([Numo::Int32.zeros(n_samples_out), Numo::Int32.ones(n_samples_in)])
|
119
119
|
# shuffle data indices.
|
120
120
|
if shuffle
|
121
|
-
rand_ids =
|
121
|
+
rand_ids = Array(0...n_samples).shuffle(random: rng.dup)
|
122
122
|
x = x[rand_ids, true].dup
|
123
123
|
y = y[rand_ids].dup
|
124
124
|
end
|
@@ -173,7 +173,7 @@ module Rumale
|
|
173
173
|
end
|
174
174
|
# shuffle data.
|
175
175
|
if shuffle
|
176
|
-
rand_ids =
|
176
|
+
rand_ids = Array(0...n_samples).shuffle(random: rng.dup)
|
177
177
|
x = x[rand_ids, true].dup
|
178
178
|
y = y[rand_ids].dup
|
179
179
|
end
|
@@ -225,7 +225,7 @@ module Rumale
|
|
225
225
|
line = dump_label(label, label_type.to_s)
|
226
226
|
ftvec.to_a.each_with_index do |val, n|
|
227
227
|
idx = n + (zero_based == false ? 1 : 0)
|
228
|
-
line += format(" %d:#{value_type}", idx, val) if val != 0
|
228
|
+
line += format(" %d:#{value_type}", idx, val) if val != 0
|
229
229
|
end
|
230
230
|
line
|
231
231
|
end
|
@@ -77,7 +77,7 @@ module Rumale
|
|
77
77
|
# @return [Numo::DFloat] (shape: [n_samples, n_components]) The transformed data.
|
78
78
|
def transform(x)
|
79
79
|
x = check_convert_sample_array(x)
|
80
|
-
partial_fit(x, false)
|
80
|
+
partial_fit(x, update_comps: false)
|
81
81
|
end
|
82
82
|
|
83
83
|
# Inverse transform the given transformed data with the learned model.
|
@@ -91,7 +91,7 @@ module Rumale
|
|
91
91
|
|
92
92
|
private
|
93
93
|
|
94
|
-
def partial_fit(x, update_comps
|
94
|
+
def partial_fit(x, update_comps: true)
|
95
95
|
# initialize some variables.
|
96
96
|
n_samples, n_features = x.shape
|
97
97
|
scale = Math.sqrt(x.mean / @params[:n_components])
|
@@ -85,7 +85,7 @@ module Rumale
|
|
85
85
|
# @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for fitting the model.
|
86
86
|
# @param y [Numo::Int32] (shape: [n_samples]) The labels to be used for fitting the model.
|
87
87
|
# @return [RandomForestClassifier] The learned classifier itself.
|
88
|
-
def fit(x, y)
|
88
|
+
def fit(x, y) # rubocop:disable Metrics/AbcSize
|
89
89
|
x = check_convert_sample_array(x)
|
90
90
|
y = check_convert_label_array(y)
|
91
91
|
check_sample_label_size(x, y)
|
@@ -79,7 +79,7 @@ module Rumale
|
|
79
79
|
# @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for fitting the model.
|
80
80
|
# @param y [Numo::DFloat] (shape: [n_samples, n_outputs]) The target values to be used for fitting the model.
|
81
81
|
# @return [RandomForestRegressor] The learned regressor itself.
|
82
|
-
def fit(x, y)
|
82
|
+
def fit(x, y) # rubocop:disable Metrics/AbcSize
|
83
83
|
x = check_convert_sample_array(x)
|
84
84
|
y = check_convert_tvalue_array(y)
|
85
85
|
check_sample_tvalue_size(x, y)
|
@@ -67,7 +67,7 @@ module Rumale
|
|
67
67
|
def transform(x)
|
68
68
|
raise 'FeatureHasher#transform requires Mmh3 but that is not loaded.' unless enable_mmh3?
|
69
69
|
|
70
|
-
x = [x] unless x.is_a?(Array)
|
70
|
+
x = [x] unless x.is_a?(Array) # rubocop:disable Style/ArrayCoercion
|
71
71
|
n_samples = x.size
|
72
72
|
|
73
73
|
z = Numo::DFloat.zeros(n_samples, n_features)
|
@@ -99,7 +99,7 @@ module Rumale
|
|
99
99
|
# @param x [Array<Hash>] (shape: [n_samples]) The array of hash consisting of feature names and values.
|
100
100
|
# @return [Numo::DFloat] (shape: [n_samples, n_features]) The encoded sample array.
|
101
101
|
def transform(x)
|
102
|
-
x = [x] unless x.is_a?(Array)
|
102
|
+
x = [x] unless x.is_a?(Array) # rubocop:disable Style/ArrayCoercion
|
103
103
|
n_samples = x.size
|
104
104
|
n_features = @vocabulary.size
|
105
105
|
z = Numo::DFloat.zeros(n_samples, n_features)
|
@@ -0,0 +1,113 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'rumale/base/base_estimator'
|
4
|
+
require 'rumale/base/transformer'
|
5
|
+
require 'rumale/preprocessing/l1_normalizer'
|
6
|
+
require 'rumale/preprocessing/l2_normalizer'
|
7
|
+
|
8
|
+
module Rumale
|
9
|
+
module FeatureExtraction
|
10
|
+
# Transform sample matrix with term frequency (tf) to a normalized tf-idf (inverse document frequency) representation.
|
11
|
+
#
|
12
|
+
# @example
|
13
|
+
# encoder = Rumale::FeatureExtraction::HashVectorizer.new
|
14
|
+
# x = encoder.fit_transform([
|
15
|
+
# { foo: 1, bar: 2 },
|
16
|
+
# { foo: 3, baz: 1 }
|
17
|
+
# ])
|
18
|
+
#
|
19
|
+
# # > pp x
|
20
|
+
# # Numo::DFloat#shape=[2,3]
|
21
|
+
# # [[2, 0, 1],
|
22
|
+
# # [0, 1, 3]]
|
23
|
+
#
|
24
|
+
# transformer = Rumale::FeatureExtraction::TfidfTransformer.new
|
25
|
+
# x_tfidf = transformer.fit_transform(x)
|
26
|
+
#
|
27
|
+
# # > pp x_tfidf
|
28
|
+
# # Numo::DFloat#shape=[2,3]
|
29
|
+
# # [[0.959056, 0, 0.283217],
|
30
|
+
# # [0, 0.491506, 0.870874]]
|
31
|
+
#
|
32
|
+
# *Reference*
|
33
|
+
# - Manning, C D., Raghavan, P., and Schutze, H., "Introduction to Information Retrieval," Cambridge University Press., 2008.
|
34
|
+
class TfidfTransformer
|
35
|
+
include Base::BaseEstimator
|
36
|
+
include Base::Transformer
|
37
|
+
|
38
|
+
# Return the vector consisting of inverse document frequencies.
|
39
|
+
# @return [Numo::DFloat] (shape: [n_features])
|
40
|
+
attr_reader :idf
|
41
|
+
|
42
|
+
# Create a new transformer for converting tf vectors to tf-idf vectors.
|
43
|
+
#
|
44
|
+
# @param norm [String] The normalization method to be used ('l1', 'l2', or 'none').
|
45
|
+
# @param use_idf [Boolean] The flag indicating whether to use inverse document frequency weighting.
|
46
|
+
# @param smooth_idf [Boolean] The flag indicating whether to apply idf smoothing by log((n_samples + 1) / (df + 1)) + 1.
|
47
|
+
# @param sublinear_tf [Boolean] The flag indicating whether to perform sublinear tf scaling by 1 + log(tf).
|
48
|
+
def initialize(norm: 'l2', use_idf: true, smooth_idf: false, sublinear_tf: false)
|
49
|
+
check_params_string(norm: norm)
|
50
|
+
check_params_boolean(use_idf: use_idf, smooth_idf: smooth_idf, sublinear_tf: sublinear_tf)
|
51
|
+
@params = {}
|
52
|
+
@params[:norm] = norm
|
53
|
+
@params[:use_idf] = use_idf
|
54
|
+
@params[:smooth_idf] = smooth_idf
|
55
|
+
@params[:sublinear_tf] = sublinear_tf
|
56
|
+
@idf = nil
|
57
|
+
end
|
58
|
+
|
59
|
+
# Calculate the inverse document frequency for weighting.
|
60
|
+
#
|
61
|
+
# @overload fit(x) -> TfidfTransformer
|
62
|
+
#
|
63
|
+
# @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to calculate the idf values.
|
64
|
+
# @return [TfidfTransformer]
|
65
|
+
def fit(x, _y = nil)
|
66
|
+
return self unless @params[:use_idf]
|
67
|
+
|
68
|
+
x = check_convert_sample_array(x)
|
69
|
+
|
70
|
+
n_samples = x.shape[0]
|
71
|
+
df = x.class.cast(x.gt(0.0).count(0))
|
72
|
+
|
73
|
+
if @params[:smooth_idf]
|
74
|
+
df += 1
|
75
|
+
n_samples += 1
|
76
|
+
end
|
77
|
+
|
78
|
+
@idf = Numo::NMath.log(n_samples / df) + 1
|
79
|
+
|
80
|
+
self
|
81
|
+
end
|
82
|
+
|
83
|
+
# Calculate the idf values, and then transform samples to the tf-idf representation.
|
84
|
+
#
|
85
|
+
# @overload fit_transform(x) -> Numo::DFloat
|
86
|
+
#
|
87
|
+
# @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to calculate idf and be transformed to tf-idf representation.
|
88
|
+
# @return [Numo::DFloat] The transformed samples.
|
89
|
+
def fit_transform(x, _y = nil)
|
90
|
+
fit(x).transform(x)
|
91
|
+
end
|
92
|
+
|
93
|
+
# Perform transforming the given samples to the tf-idf representation.
|
94
|
+
#
|
95
|
+
# @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to be transformed.
|
96
|
+
# @return [Numo::DFloat] The transformed samples.
|
97
|
+
def transform(x)
|
98
|
+
x = check_convert_sample_array(x)
|
99
|
+
z = x.dup
|
100
|
+
|
101
|
+
z[z.ne(0)] = Numo::NMath.log(z[z.ne(0)]) + 1 if @params[:sublinear_tf]
|
102
|
+
z *= @idf if @params[:use_idf]
|
103
|
+
case @params[:norm]
|
104
|
+
when 'l2'
|
105
|
+
z = Rumale::Preprocessing::L2Normalizer.new.fit_transform(z)
|
106
|
+
when 'l1'
|
107
|
+
z = Rumale::Preprocessing::L1Normalizer.new.fit_transform(z)
|
108
|
+
end
|
109
|
+
z
|
110
|
+
end
|
111
|
+
end
|
112
|
+
end
|
113
|
+
end
|
@@ -69,7 +69,7 @@ module Rumale
|
|
69
69
|
n_components = [1, [@params[:n_components], n_samples].min].max
|
70
70
|
|
71
71
|
# random sampling.
|
72
|
-
@component_indices = Numo::Int32.cast(
|
72
|
+
@component_indices = Numo::Int32.cast(Array(0...n_samples).shuffle(random: sub_rng)[0...n_components])
|
73
73
|
@components = x[@component_indices, true]
|
74
74
|
|
75
75
|
# calculate normalizing factor.
|