rumale 0.19.1 → 0.20.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.rubocop.yml +5 -29
- data/CHANGELOG.md +28 -0
- data/lib/rumale.rb +7 -10
- data/lib/rumale/clustering/hdbscan.rb +3 -3
- data/lib/rumale/clustering/k_means.rb +1 -1
- data/lib/rumale/clustering/k_medoids.rb +1 -1
- data/lib/rumale/clustering/mini_batch_k_means.rb +2 -2
- data/lib/rumale/dataset.rb +4 -4
- data/lib/rumale/decomposition/nmf.rb +2 -2
- data/lib/rumale/ensemble/random_forest_classifier.rb +1 -1
- data/lib/rumale/ensemble/random_forest_regressor.rb +1 -1
- data/lib/rumale/feature_extraction/feature_hasher.rb +1 -1
- data/lib/rumale/feature_extraction/hash_vectorizer.rb +1 -1
- data/lib/rumale/feature_extraction/tfidf_transformer.rb +113 -0
- data/lib/rumale/kernel_approximation/nystroem.rb +1 -1
- data/lib/rumale/kernel_machine/kernel_svc.rb +1 -1
- data/lib/rumale/linear_model/base_sgd.rb +1 -1
- data/lib/rumale/manifold/tsne.rb +1 -1
- data/lib/rumale/model_selection/cross_validation.rb +3 -2
- data/lib/rumale/model_selection/group_k_fold.rb +93 -0
- data/lib/rumale/model_selection/group_shuffle_split.rb +115 -0
- data/lib/rumale/model_selection/k_fold.rb +1 -1
- data/lib/rumale/model_selection/shuffle_split.rb +5 -5
- data/lib/rumale/model_selection/stratified_k_fold.rb +1 -1
- data/lib/rumale/model_selection/stratified_shuffle_split.rb +13 -9
- data/lib/rumale/model_selection/time_series_split.rb +91 -0
- data/lib/rumale/multiclass/one_vs_rest_classifier.rb +2 -2
- data/lib/rumale/nearest_neighbors/vp_tree.rb +1 -1
- data/lib/rumale/neural_network/base_mlp.rb +1 -1
- data/lib/rumale/preprocessing/binarizer.rb +60 -0
- data/lib/rumale/preprocessing/l1_normalizer.rb +62 -0
- data/lib/rumale/preprocessing/l2_normalizer.rb +2 -1
- data/lib/rumale/preprocessing/max_normalizer.rb +62 -0
- data/lib/rumale/probabilistic_output.rb +1 -1
- data/lib/rumale/version.rb +1 -1
- metadata +9 -12
- data/lib/rumale/linear_model/base_linear_model.rb +0 -102
- data/lib/rumale/optimizer/ada_grad.rb +0 -42
- data/lib/rumale/optimizer/adam.rb +0 -56
- data/lib/rumale/optimizer/nadam.rb +0 -67
- data/lib/rumale/optimizer/rmsprop.rb +0 -50
- data/lib/rumale/optimizer/sgd.rb +0 -46
- data/lib/rumale/optimizer/yellow_fin.rb +0 -104
- data/lib/rumale/polynomial_model/base_factorization_machine.rb +0 -125
- data/lib/rumale/polynomial_model/factorization_machine_classifier.rb +0 -220
- data/lib/rumale/polynomial_model/factorization_machine_regressor.rb +0 -134
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 5d8c93acbf38fbd07e5df224010abbdd4269a6ce3bbf8112a0eba652a606785d
|
|
4
|
+
data.tar.gz: e7cb00a802420854835c92f011425f3054bfcc1052bf7b3664da1f95834ef435
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: f95fdd89b84dad02e516ee0479b1cddfb101cb96de897b6e7fa3fba546272a243cff5cfe954cb51942ec1ab23cf3028b183db86b52fab00a35d15be7eee5bf92
|
|
7
|
+
data.tar.gz: e5f6235e88dd47b9002a2154cabd2c1e64afb6cbb5b0745b411c7e5559351e925c9db8ec332724e301b83215662b3582e79a9e997f0338846514b234dabf1fc3
|
data/.rubocop.yml
CHANGED
|
@@ -3,6 +3,7 @@ require:
|
|
|
3
3
|
- rubocop-rspec
|
|
4
4
|
|
|
5
5
|
AllCops:
|
|
6
|
+
NewCops: enable
|
|
6
7
|
TargetRubyVersion: 2.5
|
|
7
8
|
DisplayCopNames: true
|
|
8
9
|
DisplayStyleGuide: true
|
|
@@ -15,25 +16,12 @@ AllCops:
|
|
|
15
16
|
Style/Documentation:
|
|
16
17
|
Enabled: false
|
|
17
18
|
|
|
18
|
-
Style/HashEachMethods:
|
|
19
|
-
Enabled: true
|
|
20
|
-
|
|
21
|
-
Style/HashTransformKeys:
|
|
22
|
-
Enabled: true
|
|
23
|
-
|
|
24
|
-
Style/HashTransformValues:
|
|
25
|
-
Enabled: true
|
|
26
|
-
|
|
27
|
-
Lint/RaiseException:
|
|
28
|
-
Enabled: true
|
|
29
|
-
|
|
30
|
-
Lint/StructNewOverride:
|
|
31
|
-
Enabled: true
|
|
32
|
-
|
|
33
19
|
Layout/LineLength:
|
|
34
20
|
Max: 145
|
|
35
21
|
IgnoredPatterns: ['(\A|\s)#']
|
|
36
22
|
|
|
23
|
+
Lint/MissingSuper:
|
|
24
|
+
Enabled: false
|
|
37
25
|
|
|
38
26
|
Metrics/ModuleLength:
|
|
39
27
|
Max: 200
|
|
@@ -70,26 +58,14 @@ Naming/MethodParameterName:
|
|
|
70
58
|
Naming/ConstantName:
|
|
71
59
|
Enabled: false
|
|
72
60
|
|
|
73
|
-
Style/ExponentialNotation:
|
|
74
|
-
Enabled: true
|
|
75
|
-
|
|
76
61
|
Style/FormatStringToken:
|
|
77
62
|
Enabled: false
|
|
78
63
|
|
|
79
64
|
Style/NumericLiterals:
|
|
80
65
|
Enabled: false
|
|
81
66
|
|
|
82
|
-
Style/
|
|
83
|
-
Enabled:
|
|
84
|
-
|
|
85
|
-
Layout/EmptyLineAfterGuardClause:
|
|
86
|
-
Enabled: true
|
|
87
|
-
|
|
88
|
-
Layout/EmptyLinesAroundAttributeAccessor:
|
|
89
|
-
Enabled: true
|
|
90
|
-
|
|
91
|
-
Layout/SpaceAroundMethodCallOperator:
|
|
92
|
-
Enabled: true
|
|
67
|
+
Style/StringConcatenation:
|
|
68
|
+
Enabled: false
|
|
93
69
|
|
|
94
70
|
RSpec/MultipleExpectations:
|
|
95
71
|
Enabled: false
|
data/CHANGELOG.md
CHANGED
|
@@ -1,8 +1,36 @@
|
|
|
1
|
+
# 0.20.2
|
|
2
|
+
- Add cross-validator class for time-series data.
|
|
3
|
+
- [TimeSeriesSplit](https://yoshoku.github.io/rumale/doc/Rumale/ModelSelection/TimeSeriesSplit.html)
|
|
4
|
+
|
|
5
|
+
# 0.20.1
|
|
6
|
+
- Add cross-validator classes that split data according to group labels.
|
|
7
|
+
- [GroupKFold](https://yoshoku.github.io/rumale/doc/Rumale/ModelSelection/GroupKFold.html)
|
|
8
|
+
- [GroupShuffleSplit](https://yoshoku.github.io/rumale/doc/Rumale/ModelSelection/GroupShuffleSplit.html)
|
|
9
|
+
- Fix fraction treating of the number of samples on shuffle split cross-validator classes.
|
|
10
|
+
- [ShuffleSplit](https://yoshoku.github.io/rumale/doc/Rumale/ModelSelection/ShuffleSplit.html)
|
|
11
|
+
- [StratifiedShuffleSplit](https://yoshoku.github.io/rumale/doc/Rumale/ModelSelection/StratifiedShuffleSplit.html)
|
|
12
|
+
- Refactor some codes with Rubocop.
|
|
13
|
+
|
|
14
|
+
# 0.20.0
|
|
15
|
+
## Breaking changes
|
|
16
|
+
- Delete deprecated estimators such as PolynomialModel, Optimizer, and BaseLinearModel.
|
|
17
|
+
|
|
18
|
+
# 0.19.3
|
|
19
|
+
- Add preprocessing class for [Binarizer](https://yoshoku.github.io/rumale/doc/Rumale/Preprocessing/Binarizer.html)
|
|
20
|
+
- Add preprocessing class for [MaxNormalizer](https://yoshoku.github.io/rumale/doc/Rumale/Preprocessing/MaxNormalizer.html)
|
|
21
|
+
- Refactor some codes with Rubocop.
|
|
22
|
+
|
|
23
|
+
# 0.19.2
|
|
24
|
+
- Fix L2Normalizer to avoid zero divide.
|
|
25
|
+
- Add preprocessing class for [L1Normalizer](https://yoshoku.github.io/rumale/doc/Rumale/Preprocessing/L1Normalizer.html).
|
|
26
|
+
- Add transformer class for [TfidfTransformer](https://yoshoku.github.io/rumale/doc/Rumale/FeatureExtraction/TfidfTransformer.html).
|
|
27
|
+
|
|
1
28
|
# 0.19.1
|
|
2
29
|
- Add cluster analysis class for [mini-batch K-Means](https://yoshoku.github.io/rumale/doc/Rumale/Clustering/MiniBatchKMeans.html).
|
|
3
30
|
- Fix some typos.
|
|
4
31
|
|
|
5
32
|
# 0.19.0
|
|
33
|
+
## Breaking changes
|
|
6
34
|
- Change mmh3 and mopti gem to non-runtime dependent library.
|
|
7
35
|
- The mmh3 gem is used in [FeatureHasher](https://yoshoku.github.io/rumale/doc/Rumale/FeatureExtraction/FeatureHasher.html).
|
|
8
36
|
You only need to require mmh3 gem when using FeatureHasher.
|
data/lib/rumale.rb
CHANGED
|
@@ -18,17 +18,10 @@ require 'rumale/base/cluster_analyzer'
|
|
|
18
18
|
require 'rumale/base/transformer'
|
|
19
19
|
require 'rumale/base/splitter'
|
|
20
20
|
require 'rumale/base/evaluator'
|
|
21
|
-
require 'rumale/optimizer/sgd'
|
|
22
|
-
require 'rumale/optimizer/ada_grad'
|
|
23
|
-
require 'rumale/optimizer/rmsprop'
|
|
24
|
-
require 'rumale/optimizer/adam'
|
|
25
|
-
require 'rumale/optimizer/nadam'
|
|
26
|
-
require 'rumale/optimizer/yellow_fin'
|
|
27
21
|
require 'rumale/pipeline/pipeline'
|
|
28
22
|
require 'rumale/pipeline/feature_union'
|
|
29
23
|
require 'rumale/kernel_approximation/rbf'
|
|
30
24
|
require 'rumale/kernel_approximation/nystroem'
|
|
31
|
-
require 'rumale/linear_model/base_linear_model'
|
|
32
25
|
require 'rumale/linear_model/base_sgd'
|
|
33
26
|
require 'rumale/linear_model/svc'
|
|
34
27
|
require 'rumale/linear_model/svr'
|
|
@@ -41,9 +34,6 @@ require 'rumale/kernel_machine/kernel_svc'
|
|
|
41
34
|
require 'rumale/kernel_machine/kernel_pca'
|
|
42
35
|
require 'rumale/kernel_machine/kernel_fda'
|
|
43
36
|
require 'rumale/kernel_machine/kernel_ridge'
|
|
44
|
-
require 'rumale/polynomial_model/base_factorization_machine'
|
|
45
|
-
require 'rumale/polynomial_model/factorization_machine_classifier'
|
|
46
|
-
require 'rumale/polynomial_model/factorization_machine_regressor'
|
|
47
37
|
require 'rumale/multiclass/one_vs_rest_classifier'
|
|
48
38
|
require 'rumale/nearest_neighbors/vp_tree'
|
|
49
39
|
require 'rumale/nearest_neighbors/k_neighbors_classifier'
|
|
@@ -93,7 +83,10 @@ require 'rumale/neural_network/mlp_regressor'
|
|
|
93
83
|
require 'rumale/neural_network/mlp_classifier'
|
|
94
84
|
require 'rumale/feature_extraction/hash_vectorizer'
|
|
95
85
|
require 'rumale/feature_extraction/feature_hasher'
|
|
86
|
+
require 'rumale/feature_extraction/tfidf_transformer'
|
|
96
87
|
require 'rumale/preprocessing/l2_normalizer'
|
|
88
|
+
require 'rumale/preprocessing/l1_normalizer'
|
|
89
|
+
require 'rumale/preprocessing/max_normalizer'
|
|
97
90
|
require 'rumale/preprocessing/min_max_scaler'
|
|
98
91
|
require 'rumale/preprocessing/max_abs_scaler'
|
|
99
92
|
require 'rumale/preprocessing/standard_scaler'
|
|
@@ -102,11 +95,15 @@ require 'rumale/preprocessing/label_binarizer'
|
|
|
102
95
|
require 'rumale/preprocessing/label_encoder'
|
|
103
96
|
require 'rumale/preprocessing/one_hot_encoder'
|
|
104
97
|
require 'rumale/preprocessing/ordinal_encoder'
|
|
98
|
+
require 'rumale/preprocessing/binarizer'
|
|
105
99
|
require 'rumale/preprocessing/polynomial_features'
|
|
106
100
|
require 'rumale/model_selection/k_fold'
|
|
101
|
+
require 'rumale/model_selection/group_k_fold'
|
|
107
102
|
require 'rumale/model_selection/stratified_k_fold'
|
|
108
103
|
require 'rumale/model_selection/shuffle_split'
|
|
104
|
+
require 'rumale/model_selection/group_shuffle_split'
|
|
109
105
|
require 'rumale/model_selection/stratified_shuffle_split'
|
|
106
|
+
require 'rumale/model_selection/time_series_split'
|
|
110
107
|
require 'rumale/model_selection/cross_validation'
|
|
111
108
|
require 'rumale/model_selection/grid_search_cv'
|
|
112
109
|
require 'rumale/model_selection/function'
|
|
@@ -136,7 +136,7 @@ module Rumale
|
|
|
136
136
|
res
|
|
137
137
|
end
|
|
138
138
|
|
|
139
|
-
# rubocop:disable Metrics/AbcSize, Metrics/MethodLength, Metrics/PerceivedComplexity
|
|
139
|
+
# rubocop:disable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/MethodLength, Metrics/PerceivedComplexity
|
|
140
140
|
def condense_tree(hierarchy, min_cluster_size)
|
|
141
141
|
n_edges = hierarchy.size
|
|
142
142
|
root = 2 * n_edges
|
|
@@ -232,7 +232,7 @@ module Rumale
|
|
|
232
232
|
end
|
|
233
233
|
|
|
234
234
|
def flatten(tree, stabilities)
|
|
235
|
-
node_ids = stabilities.keys.sort
|
|
235
|
+
node_ids = stabilities.keys.sort.reverse.slice(0, stabilities.size - 1)
|
|
236
236
|
|
|
237
237
|
cluster_tree = tree.select { |edge| edge.n_elements > 1 }
|
|
238
238
|
is_cluster = node_ids.each_with_object({}) { |n_id, h| h[n_id] = true }
|
|
@@ -265,7 +265,7 @@ module Rumale
|
|
|
265
265
|
end
|
|
266
266
|
res
|
|
267
267
|
end
|
|
268
|
-
# rubocop:enable Metrics/AbcSize, Metrics/MethodLength, Metrics/PerceivedComplexity
|
|
268
|
+
# rubocop:enable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/MethodLength, Metrics/PerceivedComplexity
|
|
269
269
|
end
|
|
270
270
|
end
|
|
271
271
|
end
|
|
@@ -103,7 +103,7 @@ module Rumale
|
|
|
103
103
|
# random initialize
|
|
104
104
|
n_samples = x.shape[0]
|
|
105
105
|
sub_rng = @rng.dup
|
|
106
|
-
rand_id =
|
|
106
|
+
rand_id = Array(0...n_samples).sample(@params[:n_clusters], random: sub_rng)
|
|
107
107
|
@cluster_centers = x[rand_id, true].dup
|
|
108
108
|
return unless @params[:init] == 'k-means++'
|
|
109
109
|
|
|
@@ -124,7 +124,7 @@ module Rumale
|
|
|
124
124
|
# random initialize
|
|
125
125
|
n_samples = distance_mat.shape[0]
|
|
126
126
|
sub_rng = @rng.dup
|
|
127
|
-
@medoid_ids = Numo::Int32.asarray(
|
|
127
|
+
@medoid_ids = Numo::Int32.asarray(Array(0...n_samples).sample(@params[:n_clusters], random: sub_rng))
|
|
128
128
|
return unless @params[:init] == 'k-means++'
|
|
129
129
|
|
|
130
130
|
# k-means++ initialize
|
|
@@ -67,7 +67,7 @@ module Rumale
|
|
|
67
67
|
init_cluster_centers(x, sub_rng)
|
|
68
68
|
# optimization with mini-batch sgd.
|
|
69
69
|
@params[:max_iter].times do |_t|
|
|
70
|
-
sample_ids =
|
|
70
|
+
sample_ids = Array(0...n_samples).shuffle(random: sub_rng)
|
|
71
71
|
old_centers = @cluster_centers.dup
|
|
72
72
|
until (subset_ids = sample_ids.shift(@params[:batch_size])).empty?
|
|
73
73
|
# sub sampling
|
|
@@ -120,7 +120,7 @@ module Rumale
|
|
|
120
120
|
def init_cluster_centers(x, sub_rng)
|
|
121
121
|
# random initialize
|
|
122
122
|
n_samples = x.shape[0]
|
|
123
|
-
rand_id =
|
|
123
|
+
rand_id = Array(0...n_samples).sample(@params[:n_clusters], random: sub_rng)
|
|
124
124
|
@cluster_centers = x[rand_id, true].dup
|
|
125
125
|
return unless @params[:init] == 'k-means++'
|
|
126
126
|
|
data/lib/rumale/dataset.rb
CHANGED
|
@@ -81,7 +81,7 @@ module Rumale
|
|
|
81
81
|
y = Numo::Int32.hstack([Numo::Int32.zeros(n_samples_out), Numo::Int32.ones(n_samples_in)])
|
|
82
82
|
# shuffle data indices.
|
|
83
83
|
if shuffle
|
|
84
|
-
rand_ids =
|
|
84
|
+
rand_ids = Array(0...n_samples).shuffle(random: rng.dup)
|
|
85
85
|
x = x[rand_ids, true].dup
|
|
86
86
|
y = y[rand_ids].dup
|
|
87
87
|
end
|
|
@@ -118,7 +118,7 @@ module Rumale
|
|
|
118
118
|
y = Numo::Int32.hstack([Numo::Int32.zeros(n_samples_out), Numo::Int32.ones(n_samples_in)])
|
|
119
119
|
# shuffle data indices.
|
|
120
120
|
if shuffle
|
|
121
|
-
rand_ids =
|
|
121
|
+
rand_ids = Array(0...n_samples).shuffle(random: rng.dup)
|
|
122
122
|
x = x[rand_ids, true].dup
|
|
123
123
|
y = y[rand_ids].dup
|
|
124
124
|
end
|
|
@@ -173,7 +173,7 @@ module Rumale
|
|
|
173
173
|
end
|
|
174
174
|
# shuffle data.
|
|
175
175
|
if shuffle
|
|
176
|
-
rand_ids =
|
|
176
|
+
rand_ids = Array(0...n_samples).shuffle(random: rng.dup)
|
|
177
177
|
x = x[rand_ids, true].dup
|
|
178
178
|
y = y[rand_ids].dup
|
|
179
179
|
end
|
|
@@ -225,7 +225,7 @@ module Rumale
|
|
|
225
225
|
line = dump_label(label, label_type.to_s)
|
|
226
226
|
ftvec.to_a.each_with_index do |val, n|
|
|
227
227
|
idx = n + (zero_based == false ? 1 : 0)
|
|
228
|
-
line += format(" %d:#{value_type}", idx, val) if val != 0
|
|
228
|
+
line += format(" %d:#{value_type}", idx, val) if val != 0
|
|
229
229
|
end
|
|
230
230
|
line
|
|
231
231
|
end
|
|
@@ -77,7 +77,7 @@ module Rumale
|
|
|
77
77
|
# @return [Numo::DFloat] (shape: [n_samples, n_components]) The transformed data.
|
|
78
78
|
def transform(x)
|
|
79
79
|
x = check_convert_sample_array(x)
|
|
80
|
-
partial_fit(x, false)
|
|
80
|
+
partial_fit(x, update_comps: false)
|
|
81
81
|
end
|
|
82
82
|
|
|
83
83
|
# Inverse transform the given transformed data with the learned model.
|
|
@@ -91,7 +91,7 @@ module Rumale
|
|
|
91
91
|
|
|
92
92
|
private
|
|
93
93
|
|
|
94
|
-
def partial_fit(x, update_comps
|
|
94
|
+
def partial_fit(x, update_comps: true)
|
|
95
95
|
# initialize some variables.
|
|
96
96
|
n_samples, n_features = x.shape
|
|
97
97
|
scale = Math.sqrt(x.mean / @params[:n_components])
|
|
@@ -85,7 +85,7 @@ module Rumale
|
|
|
85
85
|
# @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for fitting the model.
|
|
86
86
|
# @param y [Numo::Int32] (shape: [n_samples]) The labels to be used for fitting the model.
|
|
87
87
|
# @return [RandomForestClassifier] The learned classifier itself.
|
|
88
|
-
def fit(x, y)
|
|
88
|
+
def fit(x, y) # rubocop:disable Metrics/AbcSize
|
|
89
89
|
x = check_convert_sample_array(x)
|
|
90
90
|
y = check_convert_label_array(y)
|
|
91
91
|
check_sample_label_size(x, y)
|
|
@@ -79,7 +79,7 @@ module Rumale
|
|
|
79
79
|
# @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for fitting the model.
|
|
80
80
|
# @param y [Numo::DFloat] (shape: [n_samples, n_outputs]) The target values to be used for fitting the model.
|
|
81
81
|
# @return [RandomForestRegressor] The learned regressor itself.
|
|
82
|
-
def fit(x, y)
|
|
82
|
+
def fit(x, y) # rubocop:disable Metrics/AbcSize
|
|
83
83
|
x = check_convert_sample_array(x)
|
|
84
84
|
y = check_convert_tvalue_array(y)
|
|
85
85
|
check_sample_tvalue_size(x, y)
|
|
@@ -67,7 +67,7 @@ module Rumale
|
|
|
67
67
|
def transform(x)
|
|
68
68
|
raise 'FeatureHasher#transform requires Mmh3 but that is not loaded.' unless enable_mmh3?
|
|
69
69
|
|
|
70
|
-
x = [x] unless x.is_a?(Array)
|
|
70
|
+
x = [x] unless x.is_a?(Array) # rubocop:disable Style/ArrayCoercion
|
|
71
71
|
n_samples = x.size
|
|
72
72
|
|
|
73
73
|
z = Numo::DFloat.zeros(n_samples, n_features)
|
|
@@ -99,7 +99,7 @@ module Rumale
|
|
|
99
99
|
# @param x [Array<Hash>] (shape: [n_samples]) The array of hash consisting of feature names and values.
|
|
100
100
|
# @return [Numo::DFloat] (shape: [n_samples, n_features]) The encoded sample array.
|
|
101
101
|
def transform(x)
|
|
102
|
-
x = [x] unless x.is_a?(Array)
|
|
102
|
+
x = [x] unless x.is_a?(Array) # rubocop:disable Style/ArrayCoercion
|
|
103
103
|
n_samples = x.size
|
|
104
104
|
n_features = @vocabulary.size
|
|
105
105
|
z = Numo::DFloat.zeros(n_samples, n_features)
|
|
@@ -0,0 +1,113 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'rumale/base/base_estimator'
|
|
4
|
+
require 'rumale/base/transformer'
|
|
5
|
+
require 'rumale/preprocessing/l1_normalizer'
|
|
6
|
+
require 'rumale/preprocessing/l2_normalizer'
|
|
7
|
+
|
|
8
|
+
module Rumale
|
|
9
|
+
module FeatureExtraction
|
|
10
|
+
# Transform sample matrix with term frequency (tf) to a normalized tf-idf (inverse document frequency) representation.
|
|
11
|
+
#
|
|
12
|
+
# @example
|
|
13
|
+
# encoder = Rumale::FeatureExtraction::HashVectorizer.new
|
|
14
|
+
# x = encoder.fit_transform([
|
|
15
|
+
# { foo: 1, bar: 2 },
|
|
16
|
+
# { foo: 3, baz: 1 }
|
|
17
|
+
# ])
|
|
18
|
+
#
|
|
19
|
+
# # > pp x
|
|
20
|
+
# # Numo::DFloat#shape=[2,3]
|
|
21
|
+
# # [[2, 0, 1],
|
|
22
|
+
# # [0, 1, 3]]
|
|
23
|
+
#
|
|
24
|
+
# transformer = Rumale::FeatureExtraction::TfidfTransformer.new
|
|
25
|
+
# x_tfidf = transformer.fit_transform(x)
|
|
26
|
+
#
|
|
27
|
+
# # > pp x_tfidf
|
|
28
|
+
# # Numo::DFloat#shape=[2,3]
|
|
29
|
+
# # [[0.959056, 0, 0.283217],
|
|
30
|
+
# # [0, 0.491506, 0.870874]]
|
|
31
|
+
#
|
|
32
|
+
# *Reference*
|
|
33
|
+
# - Manning, C D., Raghavan, P., and Schutze, H., "Introduction to Information Retrieval," Cambridge University Press., 2008.
|
|
34
|
+
class TfidfTransformer
|
|
35
|
+
include Base::BaseEstimator
|
|
36
|
+
include Base::Transformer
|
|
37
|
+
|
|
38
|
+
# Return the vector consisting of inverse document frequencies.
|
|
39
|
+
# @return [Numo::DFloat] (shape: [n_features])
|
|
40
|
+
attr_reader :idf
|
|
41
|
+
|
|
42
|
+
# Create a new transformer for converting tf vectors to tf-idf vectors.
|
|
43
|
+
#
|
|
44
|
+
# @param norm [String] The normalization method to be used ('l1', 'l2' and 'none').
|
|
45
|
+
# @param use_idf [Boolean] The flag indicating whether to use inverse document frequency weighting.
|
|
46
|
+
# @param smooth_idf [Boolean] The flag indicating whether to apply idf smoothing by log((n_samples + 1) / (df + 1)) + 1.
|
|
47
|
+
# @param sublinear_tf [Boolean] The flag indicating whether to perform sublinear tf scaling by 1 + log(tf).
|
|
48
|
+
def initialize(norm: 'l2', use_idf: true, smooth_idf: false, sublinear_tf: false)
|
|
49
|
+
check_params_string(norm: norm)
|
|
50
|
+
check_params_boolean(use_idf: use_idf, smooth_idf: smooth_idf, sublinear_tf: sublinear_tf)
|
|
51
|
+
@params = {}
|
|
52
|
+
@params[:norm] = norm
|
|
53
|
+
@params[:use_idf] = use_idf
|
|
54
|
+
@params[:smooth_idf] = smooth_idf
|
|
55
|
+
@params[:sublinear_tf] = sublinear_tf
|
|
56
|
+
@idf = nil
|
|
57
|
+
end
|
|
58
|
+
|
|
59
|
+
# Calculate the inverse document frequency for weighting.
|
|
60
|
+
#
|
|
61
|
+
# @overload fit(x) -> TfidfTransformer
|
|
62
|
+
#
|
|
63
|
+
# @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to calculate the idf values.
|
|
64
|
+
# @return [TfidfTransformer]
|
|
65
|
+
def fit(x, _y = nil)
|
|
66
|
+
return self unless @params[:use_idf]
|
|
67
|
+
|
|
68
|
+
x = check_convert_sample_array(x)
|
|
69
|
+
|
|
70
|
+
n_samples = x.shape[0]
|
|
71
|
+
df = x.class.cast(x.gt(0.0).count(0))
|
|
72
|
+
|
|
73
|
+
if @params[:smooth_idf]
|
|
74
|
+
df += 1
|
|
75
|
+
n_samples += 1
|
|
76
|
+
end
|
|
77
|
+
|
|
78
|
+
@idf = Numo::NMath.log(n_samples / df) + 1
|
|
79
|
+
|
|
80
|
+
self
|
|
81
|
+
end
|
|
82
|
+
|
|
83
|
+
# Calculate the idf values, and then transform samples to the tf-idf representation.
|
|
84
|
+
#
|
|
85
|
+
# @overload fit_transform(x) -> Numo::DFloat
|
|
86
|
+
#
|
|
87
|
+
# @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to calculate idf and be transformed to tf-idf representation.
|
|
88
|
+
# @return [Numo::DFloat] The transformed samples.
|
|
89
|
+
def fit_transform(x, _y = nil)
|
|
90
|
+
fit(x).transform(x)
|
|
91
|
+
end
|
|
92
|
+
|
|
93
|
+
# Perform transforming the given samples to the tf-idf representation.
|
|
94
|
+
#
|
|
95
|
+
# @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to be transformed.
|
|
96
|
+
# @return [Numo::DFloat] The transformed samples.
|
|
97
|
+
def transform(x)
|
|
98
|
+
x = check_convert_sample_array(x)
|
|
99
|
+
z = x.dup
|
|
100
|
+
|
|
101
|
+
z[z.ne(0)] = Numo::NMath.log(z[z.ne(0)]) + 1 if @params[:sublinear_tf]
|
|
102
|
+
z *= @idf if @params[:use_idf]
|
|
103
|
+
case @params[:norm]
|
|
104
|
+
when 'l2'
|
|
105
|
+
z = Rumale::Preprocessing::L2Normalizer.new.fit_transform(z)
|
|
106
|
+
when 'l1'
|
|
107
|
+
z = Rumale::Preprocessing::L1Normalizer.new.fit_transform(z)
|
|
108
|
+
end
|
|
109
|
+
z
|
|
110
|
+
end
|
|
111
|
+
end
|
|
112
|
+
end
|
|
113
|
+
end
|
|
@@ -69,7 +69,7 @@ module Rumale
|
|
|
69
69
|
n_components = [1, [@params[:n_components], n_samples].min].max
|
|
70
70
|
|
|
71
71
|
# random sampling.
|
|
72
|
-
@component_indices = Numo::Int32.cast(
|
|
72
|
+
@component_indices = Numo::Int32.cast(Array(0...n_samples).shuffle(random: sub_rng)[0...n_components])
|
|
73
73
|
@components = x[@component_indices, true]
|
|
74
74
|
|
|
75
75
|
# calculate normalizing factor.
|