rumale 0.23.3 → 0.24.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/LICENSE.txt +5 -1
- data/README.md +3 -288
- data/lib/rumale/version.rb +1 -1
- data/lib/rumale.rb +20 -131
- metadata +252 -150
- data/CHANGELOG.md +0 -643
- data/CODE_OF_CONDUCT.md +0 -74
- data/ext/rumale/extconf.rb +0 -37
- data/ext/rumale/rumaleext.c +0 -545
- data/ext/rumale/rumaleext.h +0 -12
- data/lib/rumale/base/base_estimator.rb +0 -49
- data/lib/rumale/base/classifier.rb +0 -36
- data/lib/rumale/base/cluster_analyzer.rb +0 -31
- data/lib/rumale/base/evaluator.rb +0 -17
- data/lib/rumale/base/regressor.rb +0 -36
- data/lib/rumale/base/splitter.rb +0 -21
- data/lib/rumale/base/transformer.rb +0 -22
- data/lib/rumale/clustering/dbscan.rb +0 -123
- data/lib/rumale/clustering/gaussian_mixture.rb +0 -218
- data/lib/rumale/clustering/hdbscan.rb +0 -291
- data/lib/rumale/clustering/k_means.rb +0 -122
- data/lib/rumale/clustering/k_medoids.rb +0 -141
- data/lib/rumale/clustering/mini_batch_k_means.rb +0 -139
- data/lib/rumale/clustering/power_iteration.rb +0 -127
- data/lib/rumale/clustering/single_linkage.rb +0 -203
- data/lib/rumale/clustering/snn.rb +0 -76
- data/lib/rumale/clustering/spectral_clustering.rb +0 -115
- data/lib/rumale/dataset.rb +0 -246
- data/lib/rumale/decomposition/factor_analysis.rb +0 -150
- data/lib/rumale/decomposition/fast_ica.rb +0 -188
- data/lib/rumale/decomposition/nmf.rb +0 -124
- data/lib/rumale/decomposition/pca.rb +0 -159
- data/lib/rumale/ensemble/ada_boost_classifier.rb +0 -179
- data/lib/rumale/ensemble/ada_boost_regressor.rb +0 -160
- data/lib/rumale/ensemble/extra_trees_classifier.rb +0 -139
- data/lib/rumale/ensemble/extra_trees_regressor.rb +0 -125
- data/lib/rumale/ensemble/gradient_boosting_classifier.rb +0 -306
- data/lib/rumale/ensemble/gradient_boosting_regressor.rb +0 -237
- data/lib/rumale/ensemble/random_forest_classifier.rb +0 -189
- data/lib/rumale/ensemble/random_forest_regressor.rb +0 -153
- data/lib/rumale/ensemble/stacking_classifier.rb +0 -215
- data/lib/rumale/ensemble/stacking_regressor.rb +0 -163
- data/lib/rumale/ensemble/voting_classifier.rb +0 -126
- data/lib/rumale/ensemble/voting_regressor.rb +0 -82
- data/lib/rumale/evaluation_measure/accuracy.rb +0 -29
- data/lib/rumale/evaluation_measure/adjusted_rand_score.rb +0 -74
- data/lib/rumale/evaluation_measure/calinski_harabasz_score.rb +0 -56
- data/lib/rumale/evaluation_measure/davies_bouldin_score.rb +0 -53
- data/lib/rumale/evaluation_measure/explained_variance_score.rb +0 -39
- data/lib/rumale/evaluation_measure/f_score.rb +0 -50
- data/lib/rumale/evaluation_measure/function.rb +0 -147
- data/lib/rumale/evaluation_measure/log_loss.rb +0 -45
- data/lib/rumale/evaluation_measure/mean_absolute_error.rb +0 -29
- data/lib/rumale/evaluation_measure/mean_squared_error.rb +0 -29
- data/lib/rumale/evaluation_measure/mean_squared_log_error.rb +0 -29
- data/lib/rumale/evaluation_measure/median_absolute_error.rb +0 -30
- data/lib/rumale/evaluation_measure/mutual_information.rb +0 -49
- data/lib/rumale/evaluation_measure/normalized_mutual_information.rb +0 -53
- data/lib/rumale/evaluation_measure/precision.rb +0 -50
- data/lib/rumale/evaluation_measure/precision_recall.rb +0 -96
- data/lib/rumale/evaluation_measure/purity.rb +0 -40
- data/lib/rumale/evaluation_measure/r2_score.rb +0 -43
- data/lib/rumale/evaluation_measure/recall.rb +0 -50
- data/lib/rumale/evaluation_measure/roc_auc.rb +0 -130
- data/lib/rumale/evaluation_measure/silhouette_score.rb +0 -82
- data/lib/rumale/feature_extraction/feature_hasher.rb +0 -110
- data/lib/rumale/feature_extraction/hash_vectorizer.rb +0 -155
- data/lib/rumale/feature_extraction/tfidf_transformer.rb +0 -113
- data/lib/rumale/kernel_approximation/nystroem.rb +0 -126
- data/lib/rumale/kernel_approximation/rbf.rb +0 -102
- data/lib/rumale/kernel_machine/kernel_fda.rb +0 -120
- data/lib/rumale/kernel_machine/kernel_pca.rb +0 -97
- data/lib/rumale/kernel_machine/kernel_ridge.rb +0 -82
- data/lib/rumale/kernel_machine/kernel_ridge_classifier.rb +0 -92
- data/lib/rumale/kernel_machine/kernel_svc.rb +0 -193
- data/lib/rumale/linear_model/base_sgd.rb +0 -285
- data/lib/rumale/linear_model/elastic_net.rb +0 -119
- data/lib/rumale/linear_model/lasso.rb +0 -115
- data/lib/rumale/linear_model/linear_regression.rb +0 -201
- data/lib/rumale/linear_model/logistic_regression.rb +0 -275
- data/lib/rumale/linear_model/nnls.rb +0 -137
- data/lib/rumale/linear_model/ridge.rb +0 -209
- data/lib/rumale/linear_model/svc.rb +0 -213
- data/lib/rumale/linear_model/svr.rb +0 -132
- data/lib/rumale/manifold/mds.rb +0 -155
- data/lib/rumale/manifold/tsne.rb +0 -222
- data/lib/rumale/metric_learning/fisher_discriminant_analysis.rb +0 -113
- data/lib/rumale/metric_learning/mlkr.rb +0 -161
- data/lib/rumale/metric_learning/neighbourhood_component_analysis.rb +0 -167
- data/lib/rumale/model_selection/cross_validation.rb +0 -125
- data/lib/rumale/model_selection/function.rb +0 -42
- data/lib/rumale/model_selection/grid_search_cv.rb +0 -225
- data/lib/rumale/model_selection/group_k_fold.rb +0 -93
- data/lib/rumale/model_selection/group_shuffle_split.rb +0 -115
- data/lib/rumale/model_selection/k_fold.rb +0 -81
- data/lib/rumale/model_selection/shuffle_split.rb +0 -90
- data/lib/rumale/model_selection/stratified_k_fold.rb +0 -99
- data/lib/rumale/model_selection/stratified_shuffle_split.rb +0 -118
- data/lib/rumale/model_selection/time_series_split.rb +0 -91
- data/lib/rumale/multiclass/one_vs_rest_classifier.rb +0 -83
- data/lib/rumale/naive_bayes/base_naive_bayes.rb +0 -47
- data/lib/rumale/naive_bayes/bernoulli_nb.rb +0 -82
- data/lib/rumale/naive_bayes/complement_nb.rb +0 -85
- data/lib/rumale/naive_bayes/gaussian_nb.rb +0 -69
- data/lib/rumale/naive_bayes/multinomial_nb.rb +0 -74
- data/lib/rumale/naive_bayes/negation_nb.rb +0 -71
- data/lib/rumale/nearest_neighbors/k_neighbors_classifier.rb +0 -133
- data/lib/rumale/nearest_neighbors/k_neighbors_regressor.rb +0 -108
- data/lib/rumale/nearest_neighbors/vp_tree.rb +0 -132
- data/lib/rumale/neural_network/adam.rb +0 -56
- data/lib/rumale/neural_network/base_mlp.rb +0 -248
- data/lib/rumale/neural_network/mlp_classifier.rb +0 -120
- data/lib/rumale/neural_network/mlp_regressor.rb +0 -90
- data/lib/rumale/pairwise_metric.rb +0 -152
- data/lib/rumale/pipeline/feature_union.rb +0 -69
- data/lib/rumale/pipeline/pipeline.rb +0 -175
- data/lib/rumale/preprocessing/bin_discretizer.rb +0 -93
- data/lib/rumale/preprocessing/binarizer.rb +0 -60
- data/lib/rumale/preprocessing/kernel_calculator.rb +0 -92
- data/lib/rumale/preprocessing/l1_normalizer.rb +0 -62
- data/lib/rumale/preprocessing/l2_normalizer.rb +0 -63
- data/lib/rumale/preprocessing/label_binarizer.rb +0 -89
- data/lib/rumale/preprocessing/label_encoder.rb +0 -79
- data/lib/rumale/preprocessing/max_abs_scaler.rb +0 -61
- data/lib/rumale/preprocessing/max_normalizer.rb +0 -62
- data/lib/rumale/preprocessing/min_max_scaler.rb +0 -76
- data/lib/rumale/preprocessing/one_hot_encoder.rb +0 -100
- data/lib/rumale/preprocessing/ordinal_encoder.rb +0 -109
- data/lib/rumale/preprocessing/polynomial_features.rb +0 -109
- data/lib/rumale/preprocessing/standard_scaler.rb +0 -71
- data/lib/rumale/probabilistic_output.rb +0 -114
- data/lib/rumale/tree/base_decision_tree.rb +0 -150
- data/lib/rumale/tree/decision_tree_classifier.rb +0 -150
- data/lib/rumale/tree/decision_tree_regressor.rb +0 -116
- data/lib/rumale/tree/extra_tree_classifier.rb +0 -107
- data/lib/rumale/tree/extra_tree_regressor.rb +0 -94
- data/lib/rumale/tree/gradient_tree_regressor.rb +0 -202
- data/lib/rumale/tree/node.rb +0 -39
- data/lib/rumale/utils.rb +0 -42
- data/lib/rumale/validation.rb +0 -128
- data/lib/rumale/values.rb +0 -13
data/lib/rumale/dataset.rb
DELETED
|
@@ -1,246 +0,0 @@
|
|
|
1
|
-
# frozen_string_literal: true
|
|
2
|
-
|
|
3
|
-
require 'csv'
|
|
4
|
-
require 'rumale/validation'
|
|
5
|
-
require 'rumale/utils'
|
|
6
|
-
require 'rumale/preprocessing/min_max_scaler'
|
|
7
|
-
|
|
8
|
-
module Rumale
|
|
9
|
-
# Module for loading and saving a dataset file.
|
|
10
|
-
module Dataset
|
|
11
|
-
class << self
|
|
12
|
-
# Load a dataset with the libsvm file format into Numo::NArray.
|
|
13
|
-
#
|
|
14
|
-
# @param filename [String] A path to a dataset file.
|
|
15
|
-
# @param n_features [Integer/Nil] The number of features of data to load.
|
|
16
|
-
# If nil is given, it will be detected automatically from given file.
|
|
17
|
-
# @param zero_based [Boolean] Whether the column index starts from 0 (true) or 1 (false).
|
|
18
|
-
# @param dtype [Numo::NArray] Data type of Numo::NArray for features to be loaded.
|
|
19
|
-
#
|
|
20
|
-
# @return [Array<Numo::NArray>]
|
|
21
|
-
# Returns array containing the (n_samples x n_features) matrix for feature vectors
|
|
22
|
-
# and (n_samples) vector for labels or target values.
|
|
23
|
-
def load_libsvm_file(filename, n_features: nil, zero_based: false, dtype: Numo::DFloat)
|
|
24
|
-
ftvecs = []
|
|
25
|
-
labels = []
|
|
26
|
-
n_features_detected = 0
|
|
27
|
-
CSV.foreach(filename, col_sep: "\s", headers: false) do |line|
|
|
28
|
-
label, ftvec, max_idx = parse_libsvm_line(line, zero_based)
|
|
29
|
-
labels.push(label)
|
|
30
|
-
ftvecs.push(ftvec)
|
|
31
|
-
n_features_detected = max_idx if n_features_detected < max_idx
|
|
32
|
-
end
|
|
33
|
-
n_features ||= n_features_detected
|
|
34
|
-
n_features = [n_features, n_features_detected].max
|
|
35
|
-
[convert_to_matrix(ftvecs, n_features, dtype), Numo::NArray.asarray(labels)]
|
|
36
|
-
end
|
|
37
|
-
|
|
38
|
-
# Dump the dataset with the libsvm file format.
|
|
39
|
-
#
|
|
40
|
-
# @param data [Numo::NArray] (shape: [n_samples, n_features]) matrix consisting of feature vectors.
|
|
41
|
-
# @param labels [Numo::NArray] (shape: [n_samples]) matrix consisting of labels or target values.
|
|
42
|
-
# @param filename [String] A path to the output libsvm file.
|
|
43
|
-
# @param zero_based [Boolean] Whether the column index starts from 0 (true) or 1 (false).
|
|
44
|
-
def dump_libsvm_file(data, labels, filename, zero_based: false)
|
|
45
|
-
n_samples = [data.shape[0], labels.shape[0]].min
|
|
46
|
-
single_label = labels.shape[1].nil?
|
|
47
|
-
label_type = detect_dtype(labels)
|
|
48
|
-
value_type = detect_dtype(data)
|
|
49
|
-
File.open(filename, 'w') do |file|
|
|
50
|
-
n_samples.times do |n|
|
|
51
|
-
label = single_label ? labels[n] : labels[n, true].to_a
|
|
52
|
-
file.puts(dump_libsvm_line(label, data[n, true],
|
|
53
|
-
label_type, value_type, zero_based))
|
|
54
|
-
end
|
|
55
|
-
end
|
|
56
|
-
end
|
|
57
|
-
|
|
58
|
-
# Generate a two-dimensional data set consisting of an inner circle and an outer circle.
|
|
59
|
-
#
|
|
60
|
-
# @param n_samples [Integer] The number of samples.
|
|
61
|
-
# @param shuffle [Boolean] The flag indicating whether to shuffle the dataset
|
|
62
|
-
# @param noise [Float] The standard deviaion of gaussian noise added to the data.
|
|
63
|
-
# If nil is given, no noise is added.
|
|
64
|
-
# @param factor [Float] The scale factor between inner and outer circles. The interval of factor is (0, 1).
|
|
65
|
-
# @param random_seed [Integer] The seed value using to initialize the random generator.
|
|
66
|
-
def make_circles(n_samples, shuffle: true, noise: nil, factor: 0.8, random_seed: nil)
|
|
67
|
-
Rumale::Validation.check_params_numeric(n_samples: n_samples, factor: factor)
|
|
68
|
-
Rumale::Validation.check_params_boolean(shuffle: shuffle)
|
|
69
|
-
Rumale::Validation.check_params_numeric_or_nil(noise: noise, random_seed: random_seed)
|
|
70
|
-
raise ArgumentError, 'The number of samples must be more than 2.' if n_samples <= 1
|
|
71
|
-
raise RangeError, 'The interval of factor is (0, 1).' if factor <= 0 || factor >= 1
|
|
72
|
-
|
|
73
|
-
# initialize some variables.
|
|
74
|
-
rs = random_seed
|
|
75
|
-
rs ||= srand
|
|
76
|
-
rng = Random.new(rs)
|
|
77
|
-
n_samples_out = n_samples.fdiv(2).to_i
|
|
78
|
-
n_samples_in = n_samples - n_samples_out
|
|
79
|
-
# make two circles.
|
|
80
|
-
linsp_out = Numo::DFloat.linspace(0, 2 * Math::PI, n_samples_out)
|
|
81
|
-
linsp_in = Numo::DFloat.linspace(0, 2 * Math::PI, n_samples_in)
|
|
82
|
-
circle_out = Numo::DFloat[Numo::NMath.cos(linsp_out), Numo::NMath.sin(linsp_out)].transpose
|
|
83
|
-
circle_in = Numo::DFloat[Numo::NMath.cos(linsp_in), Numo::NMath.sin(linsp_in)].transpose
|
|
84
|
-
x = Numo::DFloat.vstack([circle_out, factor * circle_in])
|
|
85
|
-
y = Numo::Int32.hstack([Numo::Int32.zeros(n_samples_out), Numo::Int32.ones(n_samples_in)])
|
|
86
|
-
# shuffle data indices.
|
|
87
|
-
if shuffle
|
|
88
|
-
rand_ids = Array(0...n_samples).shuffle(random: rng.dup)
|
|
89
|
-
x = x[rand_ids, true].dup
|
|
90
|
-
y = y[rand_ids].dup
|
|
91
|
-
end
|
|
92
|
-
# add gaussian noise.
|
|
93
|
-
x += Rumale::Utils.rand_normal(x.shape, rng.dup, 0.0, noise) unless noise.nil?
|
|
94
|
-
[x, y]
|
|
95
|
-
end
|
|
96
|
-
|
|
97
|
-
# Generate a two-dimensional data set consisting of two half circles shifted.
|
|
98
|
-
#
|
|
99
|
-
# @param n_samples [Integer] The number of samples.
|
|
100
|
-
# @param shuffle [Boolean] The flag indicating whether to shuffle the dataset
|
|
101
|
-
# @param noise [Float] The standard deviaion of gaussian noise added to the data.
|
|
102
|
-
# If nil is given, no noise is added.
|
|
103
|
-
# @param random_seed [Integer] The seed value using to initialize the random generator.
|
|
104
|
-
def make_moons(n_samples, shuffle: true, noise: nil, random_seed: nil)
|
|
105
|
-
Rumale::Validation.check_params_numeric(n_samples: n_samples)
|
|
106
|
-
Rumale::Validation.check_params_boolean(shuffle: shuffle)
|
|
107
|
-
Rumale::Validation.check_params_numeric_or_nil(noise: noise, random_seed: random_seed)
|
|
108
|
-
raise ArgumentError, 'The number of samples must be more than 2.' if n_samples <= 1
|
|
109
|
-
|
|
110
|
-
# initialize some variables.
|
|
111
|
-
rs = random_seed
|
|
112
|
-
rs ||= srand
|
|
113
|
-
rng = Random.new(rs)
|
|
114
|
-
n_samples_out = n_samples.fdiv(2).to_i
|
|
115
|
-
n_samples_in = n_samples - n_samples_out
|
|
116
|
-
# make two half circles.
|
|
117
|
-
linsp_out = Numo::DFloat.linspace(0, Math::PI, n_samples_out)
|
|
118
|
-
linsp_in = Numo::DFloat.linspace(0, Math::PI, n_samples_in)
|
|
119
|
-
circle_out = Numo::DFloat[Numo::NMath.cos(linsp_out), Numo::NMath.sin(linsp_out)].transpose
|
|
120
|
-
circle_in = Numo::DFloat[1 - Numo::NMath.cos(linsp_in), 1 - Numo::NMath.sin(linsp_in) - 0.5].transpose
|
|
121
|
-
x = Numo::DFloat.vstack([circle_out, circle_in])
|
|
122
|
-
y = Numo::Int32.hstack([Numo::Int32.zeros(n_samples_out), Numo::Int32.ones(n_samples_in)])
|
|
123
|
-
# shuffle data indices.
|
|
124
|
-
if shuffle
|
|
125
|
-
rand_ids = Array(0...n_samples).shuffle(random: rng.dup)
|
|
126
|
-
x = x[rand_ids, true].dup
|
|
127
|
-
y = y[rand_ids].dup
|
|
128
|
-
end
|
|
129
|
-
# add gaussian noise.
|
|
130
|
-
x += Rumale::Utils.rand_normal(x.shape, rng.dup, 0.0, noise) unless noise.nil?
|
|
131
|
-
[x, y]
|
|
132
|
-
end
|
|
133
|
-
|
|
134
|
-
# Generate Gaussian blobs.
|
|
135
|
-
#
|
|
136
|
-
# @param n_samples [Integer] The total number of samples.
|
|
137
|
-
# @param n_features [Integer] The number of features.
|
|
138
|
-
# If "centers" parameter is given as a Numo::DFloat array, this parameter is ignored.
|
|
139
|
-
# @param centers [Integer/Numo::DFloat/Nil] The number of cluster centroids or the fixed cluster centroids.
|
|
140
|
-
# If nil is given, the number of cluster centroids is set to 3.
|
|
141
|
-
# @param cluster_std [Float] The standard deviation of the clusters.
|
|
142
|
-
# @param center_box [Array] The bounding box for each cluster centroids.
|
|
143
|
-
# If "centers" parameter is given as a Numo::DFloat array, this parameter is ignored.
|
|
144
|
-
# @param shuffle [Boolean] The flag indicating whether to shuffle the dataset
|
|
145
|
-
# @param random_seed [Integer] The seed value using to initialize the random generator.
|
|
146
|
-
def make_blobs(n_samples = 1000, n_features = 2,
|
|
147
|
-
centers: nil, cluster_std: 1.0, center_box: [-10, 10], shuffle: true, random_seed: nil)
|
|
148
|
-
Rumale::Validation.check_params_numeric(n_samples: n_samples, n_features: n_features, cluster_std: cluster_std)
|
|
149
|
-
Rumale::Validation.check_params_type(Array, center_box: center_box)
|
|
150
|
-
Rumale::Validation.check_params_boolean(shuffle: shuffle)
|
|
151
|
-
Rumale::Validation.check_params_numeric_or_nil(random_seed: random_seed)
|
|
152
|
-
# initialize rng.
|
|
153
|
-
rs = random_seed
|
|
154
|
-
rs ||= srand
|
|
155
|
-
rng = Random.new(rs)
|
|
156
|
-
# initialize centers.
|
|
157
|
-
if centers.is_a?(Numo::DFloat)
|
|
158
|
-
n_centers = centers.shape[0]
|
|
159
|
-
n_features = centers.shape[1]
|
|
160
|
-
else
|
|
161
|
-
n_centers = centers.is_a?(Integer) ? centers : 3
|
|
162
|
-
center_min = center_box.first
|
|
163
|
-
center_max = center_box.last
|
|
164
|
-
centers = Rumale::Utils.rand_uniform([n_centers, n_features], rng)
|
|
165
|
-
normalizer = Rumale::Preprocessing::MinMaxScaler.new(feature_range: [center_min, center_max])
|
|
166
|
-
centers = normalizer.fit_transform(centers)
|
|
167
|
-
end
|
|
168
|
-
# generate blobs.
|
|
169
|
-
sz_cluster = [n_samples / n_centers] * n_centers
|
|
170
|
-
(n_samples % n_centers).times { |n| sz_cluster[n] += 1 }
|
|
171
|
-
x = Rumale::Utils.rand_normal([sz_cluster[0], n_features], rng, 0.0, cluster_std) + centers[0, true]
|
|
172
|
-
y = Numo::Int32.zeros(sz_cluster[0])
|
|
173
|
-
(1...n_centers).each do |n|
|
|
174
|
-
c = Rumale::Utils.rand_normal([sz_cluster[n], n_features], rng, 0.0, cluster_std) + centers[n, true]
|
|
175
|
-
x = Numo::DFloat.vstack([x, c])
|
|
176
|
-
y = y.concatenate(Numo::Int32.zeros(sz_cluster[n]) + n)
|
|
177
|
-
end
|
|
178
|
-
# shuffle data.
|
|
179
|
-
if shuffle
|
|
180
|
-
rand_ids = Array(0...n_samples).shuffle(random: rng.dup)
|
|
181
|
-
x = x[rand_ids, true].dup
|
|
182
|
-
y = y[rand_ids].dup
|
|
183
|
-
end
|
|
184
|
-
[x, y]
|
|
185
|
-
end
|
|
186
|
-
|
|
187
|
-
private
|
|
188
|
-
|
|
189
|
-
def parse_libsvm_line(line, zero_based)
|
|
190
|
-
label = parse_label(line.shift)
|
|
191
|
-
adj_idx = zero_based == false ? 1 : 0
|
|
192
|
-
max_idx = -1
|
|
193
|
-
ftvec = []
|
|
194
|
-
while (el = line.shift)
|
|
195
|
-
idx, val = el.split(':')
|
|
196
|
-
idx = idx.to_i - adj_idx
|
|
197
|
-
val = val.to_i.to_s == val ? val.to_i : val.to_f
|
|
198
|
-
max_idx = idx if max_idx < idx
|
|
199
|
-
ftvec.push([idx, val])
|
|
200
|
-
end
|
|
201
|
-
[label, ftvec, max_idx]
|
|
202
|
-
end
|
|
203
|
-
|
|
204
|
-
def parse_label(label)
|
|
205
|
-
lbl_arr = label.split(',').map { |lbl| lbl.to_i.to_s == lbl ? lbl.to_i : lbl.to_f }
|
|
206
|
-
lbl_arr.size > 1 ? lbl_arr : lbl_arr[0]
|
|
207
|
-
end
|
|
208
|
-
|
|
209
|
-
def convert_to_matrix(data, n_features, dtype)
|
|
210
|
-
mat = []
|
|
211
|
-
data.each do |ft|
|
|
212
|
-
vec = Array.new(n_features) { 0 }
|
|
213
|
-
ft.each { |el| vec[el[0]] = el[1] }
|
|
214
|
-
mat.push(vec)
|
|
215
|
-
end
|
|
216
|
-
dtype.asarray(mat)
|
|
217
|
-
end
|
|
218
|
-
|
|
219
|
-
def detect_dtype(data)
|
|
220
|
-
arr_type_str = Numo::NArray.array_type(data).to_s
|
|
221
|
-
type = '%s'
|
|
222
|
-
type = '%d' if ['Numo::Int8', 'Numo::Int16', 'Numo::Int32', 'Numo::Int64'].include?(arr_type_str)
|
|
223
|
-
type = '%d' if ['Numo::UInt8', 'Numo::UInt16', 'Numo::UInt32', 'Numo::UInt64'].include?(arr_type_str)
|
|
224
|
-
type = '%.10g' if ['Numo::SFloat', 'Numo::DFloat'].include?(arr_type_str)
|
|
225
|
-
type
|
|
226
|
-
end
|
|
227
|
-
|
|
228
|
-
def dump_libsvm_line(label, ftvec, label_type, value_type, zero_based)
|
|
229
|
-
line = dump_label(label, label_type.to_s)
|
|
230
|
-
ftvec.to_a.each_with_index do |val, n|
|
|
231
|
-
idx = n + (zero_based == false ? 1 : 0)
|
|
232
|
-
line += format(" %d:#{value_type}", idx, val) if val != 0
|
|
233
|
-
end
|
|
234
|
-
line
|
|
235
|
-
end
|
|
236
|
-
|
|
237
|
-
def dump_label(label, label_type_str)
|
|
238
|
-
if label.is_a?(Array)
|
|
239
|
-
label.map { |lbl| format(label_type_str, lbl) }.join(',')
|
|
240
|
-
else
|
|
241
|
-
format(label_type_str, label)
|
|
242
|
-
end
|
|
243
|
-
end
|
|
244
|
-
end
|
|
245
|
-
end
|
|
246
|
-
end
|
|
@@ -1,150 +0,0 @@
|
|
|
1
|
-
# frozen_string_literal: true
|
|
2
|
-
|
|
3
|
-
require 'rumale/base/base_estimator'
|
|
4
|
-
require 'rumale/base/transformer'
|
|
5
|
-
require 'rumale/utils'
|
|
6
|
-
|
|
7
|
-
module Rumale
|
|
8
|
-
module Decomposition
|
|
9
|
-
# FactorAnalysis is a class that implements fator analysis with EM algorithm.
|
|
10
|
-
#
|
|
11
|
-
# @example
|
|
12
|
-
# require 'numo/linalg/autoloader'
|
|
13
|
-
# decomposer = Rumale::Decomposition::FactorAnalysis.new(n_components: 2)
|
|
14
|
-
# representaion = decomposer.fit_transform(samples)
|
|
15
|
-
#
|
|
16
|
-
# *Reference*
|
|
17
|
-
# - Barber, D., "Bayesian Reasoning and Machine Learning," Cambridge University Press, 2012.
|
|
18
|
-
class FactorAnalysis
|
|
19
|
-
include Base::BaseEstimator
|
|
20
|
-
include Base::Transformer
|
|
21
|
-
|
|
22
|
-
# Returns the mean vector.
|
|
23
|
-
# @return [Numo::DFloat] (shape: [n_features])
|
|
24
|
-
attr_reader :mean
|
|
25
|
-
|
|
26
|
-
# Returns the estimated noise variance for each feature.
|
|
27
|
-
# @return [Numo::DFloat] (shape: [n_features])
|
|
28
|
-
attr_reader :noise_variance
|
|
29
|
-
|
|
30
|
-
# Returns the components with maximum variance.
|
|
31
|
-
# @return [Numo::DFloat] (shape: [n_components, n_features])
|
|
32
|
-
attr_reader :components
|
|
33
|
-
|
|
34
|
-
# Returns the log likelihood at each iteration.
|
|
35
|
-
# @return [Numo::DFloat] (shape: [n_iter])
|
|
36
|
-
attr_reader :loglike
|
|
37
|
-
|
|
38
|
-
# Return the number of iterations run for optimization
|
|
39
|
-
# @return [Integer]
|
|
40
|
-
attr_reader :n_iter
|
|
41
|
-
|
|
42
|
-
# Create a new transformer with factor analysis.
|
|
43
|
-
#
|
|
44
|
-
# @param n_components [Integer] The number of components (dimensionality of latent space).
|
|
45
|
-
# @param max_iter [Integer] The maximum number of iterations.
|
|
46
|
-
# @param tol [Float/Nil] The tolerance of termination criterion for EM algorithm.
|
|
47
|
-
# If nil is given, iterate EM steps up to the maximum number of iterations.
|
|
48
|
-
def initialize(n_components: 2, max_iter: 100, tol: 1e-8)
|
|
49
|
-
check_params_numeric(n_components: n_components, max_iter: max_iter)
|
|
50
|
-
check_params_numeric_or_nil(tol: tol)
|
|
51
|
-
check_params_positive(n_components: n_components, max_iter: max_iter)
|
|
52
|
-
@params = {}
|
|
53
|
-
@params[:n_components] = n_components
|
|
54
|
-
@params[:max_iter] = max_iter
|
|
55
|
-
@params[:tol] = tol
|
|
56
|
-
@mean = nil
|
|
57
|
-
@noise_variance = nil
|
|
58
|
-
@components = nil
|
|
59
|
-
@loglike = nil
|
|
60
|
-
@n_iter = nil
|
|
61
|
-
end
|
|
62
|
-
|
|
63
|
-
# Fit the model with given training data.
|
|
64
|
-
#
|
|
65
|
-
# @overload fit(x) -> FactorAnalysis
|
|
66
|
-
# @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for fitting the model.
|
|
67
|
-
# @return [FactorAnalysis] The learned transformer itself.
|
|
68
|
-
def fit(x, _y = nil)
|
|
69
|
-
raise 'FactorAnalysis#fit requires Numo::Linalg but that is not loaded.' unless enable_linalg?
|
|
70
|
-
|
|
71
|
-
# initialize some variables.
|
|
72
|
-
n_samples, n_features = x.shape
|
|
73
|
-
@mean = x.mean(0)
|
|
74
|
-
centered_x = x - @mean
|
|
75
|
-
cov_mat = centered_x.transpose.dot(centered_x) / n_samples
|
|
76
|
-
sample_vars = x.var(0)
|
|
77
|
-
sqrt_n_samples = Math.sqrt(n_samples)
|
|
78
|
-
@noise_variance = Numo::DFloat.ones(n_features)
|
|
79
|
-
|
|
80
|
-
# run optimization.
|
|
81
|
-
old_loglike = 0.0
|
|
82
|
-
@n_iter = 0
|
|
83
|
-
@loglike = [] unless @params[:tol].nil?
|
|
84
|
-
@params[:max_iter].times do |t|
|
|
85
|
-
@n_iter = t + 1
|
|
86
|
-
sqrt_noise_variance = Numo::NMath.sqrt(@noise_variance)
|
|
87
|
-
scaled_x = centered_x / (sqrt_noise_variance * sqrt_n_samples + 1e-12)
|
|
88
|
-
s, u = truncate_svd(scaled_x, @params[:n_components])
|
|
89
|
-
scaler = Numo::NMath.sqrt(Numo::DFloat.maximum(s**2 - 1.0, 0.0))
|
|
90
|
-
@components = (sqrt_noise_variance.diag.dot(u) * scaler).transpose.dup
|
|
91
|
-
@noise_variance = Numo::DFloat.maximum(sample_vars - @components.transpose.dot(@components).diagonal, 1e-12)
|
|
92
|
-
next if @params[:tol].nil?
|
|
93
|
-
|
|
94
|
-
new_loglike = log_likelihood(cov_mat, @components, @noise_variance)
|
|
95
|
-
@loglike.push(new_loglike)
|
|
96
|
-
break if (old_loglike - new_loglike).abs <= @params[:tol]
|
|
97
|
-
|
|
98
|
-
old_loglike = new_loglike
|
|
99
|
-
end
|
|
100
|
-
|
|
101
|
-
@loglike = Numo::DFloat.cast(@loglike) unless @params[:tol].nil?
|
|
102
|
-
@components = @components[0, true].dup if @params[:n_components] == 1
|
|
103
|
-
self
|
|
104
|
-
end
|
|
105
|
-
|
|
106
|
-
# Fit the model with training data, and then transform them with the learned model.
|
|
107
|
-
#
|
|
108
|
-
# @overload fit_transform(x) -> Numo::DFloat
|
|
109
|
-
# @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for fitting the model.
|
|
110
|
-
# @return [Numo::DFloat] (shape: [n_samples, n_components]) The transformed data
|
|
111
|
-
def fit_transform(x, _y = nil)
|
|
112
|
-
x = check_convert_sample_array(x)
|
|
113
|
-
raise 'FactorAnalysis#fit_transform requires Numo::Linalg but that is not loaded.' unless enable_linalg?
|
|
114
|
-
|
|
115
|
-
fit(x).transform(x)
|
|
116
|
-
end
|
|
117
|
-
|
|
118
|
-
# Transform the given data with the learned model.
|
|
119
|
-
#
|
|
120
|
-
# @param x [Numo::DFloat] (shape: [n_samples, n_features]) The data to be transformed with the learned model.
|
|
121
|
-
# @return [Numo::DFloat] (shape: [n_samples, n_components]) The transformed data.
|
|
122
|
-
def transform(x)
|
|
123
|
-
x = check_convert_sample_array(x)
|
|
124
|
-
raise 'FactorAnalysis#transform requires Numo::Linalg but that is not loaded.' unless enable_linalg?
|
|
125
|
-
|
|
126
|
-
factors = @params[:n_components] == 1 ? @components.expand_dims(0) : @components
|
|
127
|
-
centered_x = x - @mean
|
|
128
|
-
beta = Numo::Linalg.inv(Numo::DFloat.eye(factors.shape[0]) + (factors / @noise_variance).dot(factors.transpose))
|
|
129
|
-
z = centered_x.dot((beta.dot(factors) / @noise_variance).transpose)
|
|
130
|
-
@params[:n_components] == 1 ? z[true, 0].dup : z
|
|
131
|
-
end
|
|
132
|
-
|
|
133
|
-
private
|
|
134
|
-
|
|
135
|
-
def log_likelihood(cov_mat, factors, noise_vars)
|
|
136
|
-
n_samples = noise_vars.size
|
|
137
|
-
fact_cov_mat = factors.transpose.dot(factors) + noise_vars.diag
|
|
138
|
-
n_samples.fdiv(2) * Math.log(Numo::Linalg.det(fact_cov_mat)) + Numo::Linalg.inv(fact_cov_mat).dot(cov_mat).trace
|
|
139
|
-
end
|
|
140
|
-
|
|
141
|
-
def truncate_svd(x, k)
|
|
142
|
-
m = x.shape[1]
|
|
143
|
-
eig_vals, eig_vecs = Numo::Linalg.eigh(x.transpose.dot(x), vals_range: (m - k)...m)
|
|
144
|
-
s = Numo::NMath.sqrt(eig_vals.reverse.dup)
|
|
145
|
-
u = eig_vecs.reverse(1).dup
|
|
146
|
-
[s, u]
|
|
147
|
-
end
|
|
148
|
-
end
|
|
149
|
-
end
|
|
150
|
-
end
|
|
@@ -1,188 +0,0 @@
|
|
|
1
|
-
# frozen_string_literal: true
|
|
2
|
-
|
|
3
|
-
require 'rumale/base/base_estimator'
|
|
4
|
-
require 'rumale/base/transformer'
|
|
5
|
-
|
|
6
|
-
module Rumale
|
|
7
|
-
module Decomposition
|
|
8
|
-
# FastICA is a class that implments Fast Independent Component Analaysis.
|
|
9
|
-
#
|
|
10
|
-
# @example
|
|
11
|
-
# require 'numo/linalg/autoloader'
|
|
12
|
-
#
|
|
13
|
-
# transformer = Rumale::Decomposition::FastICA.new(n_components: 2, random_seed: 1)
|
|
14
|
-
# source_data = transformer.fit_transform(observed_data)
|
|
15
|
-
#
|
|
16
|
-
# *Reference*
|
|
17
|
-
# - Hyvarinen, A., "Fast and Robust Fixed-Point Algorithms for Independent Component Analysis," IEEE Trans. Neural Networks, Vol. 10 (3), pp. 626--634, 1999.
|
|
18
|
-
# - Hyvarinen, A., and Oja, E., "Independent Component Analysis: Algorithms and Applications," Neural Networks, Vol. 13 (4-5), pp. 411--430, 2000.
|
|
19
|
-
class FastICA
|
|
20
|
-
include Base::BaseEstimator
|
|
21
|
-
include Base::Transformer
|
|
22
|
-
|
|
23
|
-
# Returns the unmixing matrix.
|
|
24
|
-
# @return [Numo::DFloat] (shape: [n_components, n_features])
|
|
25
|
-
attr_reader :components
|
|
26
|
-
|
|
27
|
-
# Returns the mixing matrix.
|
|
28
|
-
# @return [Numo::DFloat] (shape: [n_features, n_components])
|
|
29
|
-
attr_reader :mixing
|
|
30
|
-
|
|
31
|
-
# Returns the number of iterations when converged.
|
|
32
|
-
# @return [Integer]
|
|
33
|
-
attr_reader :n_iter
|
|
34
|
-
|
|
35
|
-
# Return the random generator.
|
|
36
|
-
# @return [Random]
|
|
37
|
-
attr_reader :rng
|
|
38
|
-
|
|
39
|
-
# Create a new transformer with FastICA.
|
|
40
|
-
#
|
|
41
|
-
# @param n_components [Integer] The number of independent components.
|
|
42
|
-
# @param whiten [Boolean] The flag indicating whether to perform whitening.
|
|
43
|
-
# @param fun [String] The type of contrast function ('logcosh', 'exp', or 'cube').
|
|
44
|
-
# @param alpha [Float] The parameter of contrast function for 'logcosh' and 'exp'.
|
|
45
|
-
# If fun = 'cube', this parameter is ignored.
|
|
46
|
-
# @param max_iter [Integer] The maximum number of iterations.
|
|
47
|
-
# @param tol [Float] The tolerance of termination criterion.
|
|
48
|
-
# @param random_seed [Integer] The seed value using to initialize the random generator.
|
|
49
|
-
def initialize(n_components: 2, whiten: true, fun: 'logcosh', alpha: 1.0, max_iter: 200, tol: 1e-4, random_seed: nil)
|
|
50
|
-
check_params_numeric(n_components: n_components, max_iter: max_iter, alpha: alpha, tol: tol)
|
|
51
|
-
check_params_boolean(whiten: whiten)
|
|
52
|
-
check_params_string(fun: fun)
|
|
53
|
-
check_params_numeric_or_nil(random_seed: random_seed)
|
|
54
|
-
check_params_positive(n_components: n_components, max_iter: max_iter, tol: tol)
|
|
55
|
-
@params = {}
|
|
56
|
-
@params[:n_components] = n_components
|
|
57
|
-
@params[:whiten] = whiten
|
|
58
|
-
@params[:fun] = fun
|
|
59
|
-
@params[:alpha] = alpha
|
|
60
|
-
@params[:max_iter] = max_iter
|
|
61
|
-
@params[:tol] = tol
|
|
62
|
-
@params[:random_seed] = random_seed
|
|
63
|
-
@params[:random_seed] ||= srand
|
|
64
|
-
@components = nil
|
|
65
|
-
@mixing = nil
|
|
66
|
-
@n_iter = nil
|
|
67
|
-
@mean = nil
|
|
68
|
-
@rng = Random.new(@params[:random_seed])
|
|
69
|
-
end
|
|
70
|
-
|
|
71
|
-
# Fit the model with given training data.
|
|
72
|
-
#
|
|
73
|
-
# @overload fit(x) -> FastICA
|
|
74
|
-
# @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for fitting the model.
|
|
75
|
-
# @return [FastICA] The learned transformer itself.
|
|
76
|
-
def fit(x, _y = nil)
|
|
77
|
-
x = check_convert_sample_array(x)
|
|
78
|
-
raise 'FastICA#fit requires Numo::Linalg but that is not loaded.' unless enable_linalg?
|
|
79
|
-
|
|
80
|
-
@mean, whiten_mat = whitening(x, @params[:n_components]) if @params[:whiten]
|
|
81
|
-
wx = @params[:whiten] ? (x - @mean).dot(whiten_mat.transpose) : x
|
|
82
|
-
unmixing, @n_iter = ica(wx, @params[:fun], @params[:max_iter], @params[:tol], @rng.dup)
|
|
83
|
-
@components = @params[:whiten] ? unmixing.dot(whiten_mat) : unmixing
|
|
84
|
-
@mixing = Numo::Linalg.pinv(@components).dup
|
|
85
|
-
if @params[:n_components] == 1
|
|
86
|
-
@components = @components.flatten.dup
|
|
87
|
-
@mixing = @mixing.flatten.dup
|
|
88
|
-
end
|
|
89
|
-
self
|
|
90
|
-
end
|
|
91
|
-
|
|
92
|
-
# Fit the model with training data, and then transform them with the learned model.
|
|
93
|
-
#
|
|
94
|
-
# @overload fit_transform(x) -> Numo::DFloat
|
|
95
|
-
# @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for fitting the model.
|
|
96
|
-
# @return [Numo::DFloat] (shape: [n_samples, n_components]) The transformed data
|
|
97
|
-
def fit_transform(x, _y = nil)
|
|
98
|
-
x = check_convert_sample_array(x)
|
|
99
|
-
raise 'FastICA#fit_transform requires Numo::Linalg but that is not loaded.' unless enable_linalg?
|
|
100
|
-
|
|
101
|
-
fit(x).transform(x)
|
|
102
|
-
end
|
|
103
|
-
|
|
104
|
-
# Transform the given data with the learned model.
|
|
105
|
-
#
|
|
106
|
-
# @param x [Numo::DFloat] (shape: [n_samples, n_features]) The data to be transformed with the learned model.
|
|
107
|
-
# @return [Numo::DFloat] (shape: [n_samples, n_components]) The transformed data.
|
|
108
|
-
def transform(x)
|
|
109
|
-
x = check_convert_sample_array(x)
|
|
110
|
-
cx = @params[:whiten] ? (x - @mean) : x
|
|
111
|
-
cx.dot(@components.transpose)
|
|
112
|
-
end
|
|
113
|
-
|
|
114
|
-
# Inverse transform the given transformed data with the learned model.
|
|
115
|
-
#
|
|
116
|
-
# @param z [Numo::DFloat] (shape: [n_samples, n_components]) The source data reconstructed to the mixed data.
|
|
117
|
-
# @return [Numo::DFloat] (shape: [n_samples, n_featuress]) The mixed data.
|
|
118
|
-
def inverse_transform(z)
|
|
119
|
-
z = check_convert_sample_array(z)
|
|
120
|
-
m = @mixing.shape[1].nil? ? @mixing.expand_dims(0).transpose : @mixing
|
|
121
|
-
x = z.dot(m.transpose)
|
|
122
|
-
x += @mean if @params[:whiten]
|
|
123
|
-
x
|
|
124
|
-
end
|
|
125
|
-
|
|
126
|
-
private
|
|
127
|
-
|
|
128
|
-
def whitening(x, n_components)
|
|
129
|
-
n_samples, n_features = x.shape
|
|
130
|
-
mean_vec = x.mean(0)
|
|
131
|
-
centered_x = x - mean_vec
|
|
132
|
-
covar_mat = centered_x.transpose.dot(centered_x) / n_samples
|
|
133
|
-
eig_vals, eig_vecs = Numo::Linalg.eigh(covar_mat, vals_range: (n_features - n_components)...n_features)
|
|
134
|
-
[mean_vec, (eig_vecs.reverse(1).dup * (1 / Numo::NMath.sqrt(eig_vals.reverse.dup))).transpose.dup]
|
|
135
|
-
end
|
|
136
|
-
|
|
137
|
-
def ica(x, fun, max_iter, tol, sub_rng)
|
|
138
|
-
n_samples, n_components = x.shape
|
|
139
|
-
w = decorrelation(Rumale::Utils.rand_normal([n_components, n_components], sub_rng))
|
|
140
|
-
n_iters = 0
|
|
141
|
-
max_iter.times do |t|
|
|
142
|
-
n_iters = t + 1
|
|
143
|
-
gx, ggx = gradient(x.dot(w.transpose), fun)
|
|
144
|
-
new_w = decorrelation(gx.transpose.dot(x) / n_samples - w * ggx / n_samples)
|
|
145
|
-
err = (new_w - w).abs.max
|
|
146
|
-
w = new_w
|
|
147
|
-
break if err <= tol
|
|
148
|
-
end
|
|
149
|
-
[w, n_iters]
|
|
150
|
-
end
|
|
151
|
-
|
|
152
|
-
def decorrelation(w)
|
|
153
|
-
eig_vals, eig_vecs = Numo::Linalg.eigh(w.dot(w.transpose))
|
|
154
|
-
decorr_mat = (eig_vecs * (1 / Numo::NMath.sqrt(eig_vals))).dot(eig_vecs.transpose)
|
|
155
|
-
decorr_mat.dot(w)
|
|
156
|
-
end
|
|
157
|
-
|
|
158
|
-
def gradient(x, func)
|
|
159
|
-
case func
|
|
160
|
-
when 'exp'
|
|
161
|
-
grad_exp(x, @params[:alpha])
|
|
162
|
-
when 'cube'
|
|
163
|
-
grad_cube(x)
|
|
164
|
-
else
|
|
165
|
-
grad_logcosh(x, @params[:alpha])
|
|
166
|
-
end
|
|
167
|
-
end
|
|
168
|
-
|
|
169
|
-
def grad_logcosh(x, alpha)
|
|
170
|
-
gx = Numo::NMath.tanh(alpha * x)
|
|
171
|
-
ggx = (alpha * (1 - gx**2)).sum(0)
|
|
172
|
-
[gx, ggx]
|
|
173
|
-
end
|
|
174
|
-
|
|
175
|
-
def grad_exp(x, alpha)
|
|
176
|
-
squared_x = x**2
|
|
177
|
-
exp_x = Numo::NMath.exp(-0.5 * alpha * squared_x)
|
|
178
|
-
gx = exp_x * x
|
|
179
|
-
ggx = (exp_x * (1 - alpha * squared_x)).sum(0)
|
|
180
|
-
[gx, ggx]
|
|
181
|
-
end
|
|
182
|
-
|
|
183
|
-
def grad_cube(x)
|
|
184
|
-
[x**3, (3 * x**2).sum(0)]
|
|
185
|
-
end
|
|
186
|
-
end
|
|
187
|
-
end
|
|
188
|
-
end
|