rumale 0.23.3 → 0.24.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (142) hide show
  1. checksums.yaml +4 -4
  2. data/LICENSE.txt +5 -1
  3. data/README.md +3 -288
  4. data/lib/rumale/version.rb +1 -1
  5. data/lib/rumale.rb +20 -131
  6. metadata +252 -150
  7. data/CHANGELOG.md +0 -643
  8. data/CODE_OF_CONDUCT.md +0 -74
  9. data/ext/rumale/extconf.rb +0 -37
  10. data/ext/rumale/rumaleext.c +0 -545
  11. data/ext/rumale/rumaleext.h +0 -12
  12. data/lib/rumale/base/base_estimator.rb +0 -49
  13. data/lib/rumale/base/classifier.rb +0 -36
  14. data/lib/rumale/base/cluster_analyzer.rb +0 -31
  15. data/lib/rumale/base/evaluator.rb +0 -17
  16. data/lib/rumale/base/regressor.rb +0 -36
  17. data/lib/rumale/base/splitter.rb +0 -21
  18. data/lib/rumale/base/transformer.rb +0 -22
  19. data/lib/rumale/clustering/dbscan.rb +0 -123
  20. data/lib/rumale/clustering/gaussian_mixture.rb +0 -218
  21. data/lib/rumale/clustering/hdbscan.rb +0 -291
  22. data/lib/rumale/clustering/k_means.rb +0 -122
  23. data/lib/rumale/clustering/k_medoids.rb +0 -141
  24. data/lib/rumale/clustering/mini_batch_k_means.rb +0 -139
  25. data/lib/rumale/clustering/power_iteration.rb +0 -127
  26. data/lib/rumale/clustering/single_linkage.rb +0 -203
  27. data/lib/rumale/clustering/snn.rb +0 -76
  28. data/lib/rumale/clustering/spectral_clustering.rb +0 -115
  29. data/lib/rumale/dataset.rb +0 -246
  30. data/lib/rumale/decomposition/factor_analysis.rb +0 -150
  31. data/lib/rumale/decomposition/fast_ica.rb +0 -188
  32. data/lib/rumale/decomposition/nmf.rb +0 -124
  33. data/lib/rumale/decomposition/pca.rb +0 -159
  34. data/lib/rumale/ensemble/ada_boost_classifier.rb +0 -179
  35. data/lib/rumale/ensemble/ada_boost_regressor.rb +0 -160
  36. data/lib/rumale/ensemble/extra_trees_classifier.rb +0 -139
  37. data/lib/rumale/ensemble/extra_trees_regressor.rb +0 -125
  38. data/lib/rumale/ensemble/gradient_boosting_classifier.rb +0 -306
  39. data/lib/rumale/ensemble/gradient_boosting_regressor.rb +0 -237
  40. data/lib/rumale/ensemble/random_forest_classifier.rb +0 -189
  41. data/lib/rumale/ensemble/random_forest_regressor.rb +0 -153
  42. data/lib/rumale/ensemble/stacking_classifier.rb +0 -215
  43. data/lib/rumale/ensemble/stacking_regressor.rb +0 -163
  44. data/lib/rumale/ensemble/voting_classifier.rb +0 -126
  45. data/lib/rumale/ensemble/voting_regressor.rb +0 -82
  46. data/lib/rumale/evaluation_measure/accuracy.rb +0 -29
  47. data/lib/rumale/evaluation_measure/adjusted_rand_score.rb +0 -74
  48. data/lib/rumale/evaluation_measure/calinski_harabasz_score.rb +0 -56
  49. data/lib/rumale/evaluation_measure/davies_bouldin_score.rb +0 -53
  50. data/lib/rumale/evaluation_measure/explained_variance_score.rb +0 -39
  51. data/lib/rumale/evaluation_measure/f_score.rb +0 -50
  52. data/lib/rumale/evaluation_measure/function.rb +0 -147
  53. data/lib/rumale/evaluation_measure/log_loss.rb +0 -45
  54. data/lib/rumale/evaluation_measure/mean_absolute_error.rb +0 -29
  55. data/lib/rumale/evaluation_measure/mean_squared_error.rb +0 -29
  56. data/lib/rumale/evaluation_measure/mean_squared_log_error.rb +0 -29
  57. data/lib/rumale/evaluation_measure/median_absolute_error.rb +0 -30
  58. data/lib/rumale/evaluation_measure/mutual_information.rb +0 -49
  59. data/lib/rumale/evaluation_measure/normalized_mutual_information.rb +0 -53
  60. data/lib/rumale/evaluation_measure/precision.rb +0 -50
  61. data/lib/rumale/evaluation_measure/precision_recall.rb +0 -96
  62. data/lib/rumale/evaluation_measure/purity.rb +0 -40
  63. data/lib/rumale/evaluation_measure/r2_score.rb +0 -43
  64. data/lib/rumale/evaluation_measure/recall.rb +0 -50
  65. data/lib/rumale/evaluation_measure/roc_auc.rb +0 -130
  66. data/lib/rumale/evaluation_measure/silhouette_score.rb +0 -82
  67. data/lib/rumale/feature_extraction/feature_hasher.rb +0 -110
  68. data/lib/rumale/feature_extraction/hash_vectorizer.rb +0 -155
  69. data/lib/rumale/feature_extraction/tfidf_transformer.rb +0 -113
  70. data/lib/rumale/kernel_approximation/nystroem.rb +0 -126
  71. data/lib/rumale/kernel_approximation/rbf.rb +0 -102
  72. data/lib/rumale/kernel_machine/kernel_fda.rb +0 -120
  73. data/lib/rumale/kernel_machine/kernel_pca.rb +0 -97
  74. data/lib/rumale/kernel_machine/kernel_ridge.rb +0 -82
  75. data/lib/rumale/kernel_machine/kernel_ridge_classifier.rb +0 -92
  76. data/lib/rumale/kernel_machine/kernel_svc.rb +0 -193
  77. data/lib/rumale/linear_model/base_sgd.rb +0 -285
  78. data/lib/rumale/linear_model/elastic_net.rb +0 -119
  79. data/lib/rumale/linear_model/lasso.rb +0 -115
  80. data/lib/rumale/linear_model/linear_regression.rb +0 -201
  81. data/lib/rumale/linear_model/logistic_regression.rb +0 -275
  82. data/lib/rumale/linear_model/nnls.rb +0 -137
  83. data/lib/rumale/linear_model/ridge.rb +0 -209
  84. data/lib/rumale/linear_model/svc.rb +0 -213
  85. data/lib/rumale/linear_model/svr.rb +0 -132
  86. data/lib/rumale/manifold/mds.rb +0 -155
  87. data/lib/rumale/manifold/tsne.rb +0 -222
  88. data/lib/rumale/metric_learning/fisher_discriminant_analysis.rb +0 -113
  89. data/lib/rumale/metric_learning/mlkr.rb +0 -161
  90. data/lib/rumale/metric_learning/neighbourhood_component_analysis.rb +0 -167
  91. data/lib/rumale/model_selection/cross_validation.rb +0 -125
  92. data/lib/rumale/model_selection/function.rb +0 -42
  93. data/lib/rumale/model_selection/grid_search_cv.rb +0 -225
  94. data/lib/rumale/model_selection/group_k_fold.rb +0 -93
  95. data/lib/rumale/model_selection/group_shuffle_split.rb +0 -115
  96. data/lib/rumale/model_selection/k_fold.rb +0 -81
  97. data/lib/rumale/model_selection/shuffle_split.rb +0 -90
  98. data/lib/rumale/model_selection/stratified_k_fold.rb +0 -99
  99. data/lib/rumale/model_selection/stratified_shuffle_split.rb +0 -118
  100. data/lib/rumale/model_selection/time_series_split.rb +0 -91
  101. data/lib/rumale/multiclass/one_vs_rest_classifier.rb +0 -83
  102. data/lib/rumale/naive_bayes/base_naive_bayes.rb +0 -47
  103. data/lib/rumale/naive_bayes/bernoulli_nb.rb +0 -82
  104. data/lib/rumale/naive_bayes/complement_nb.rb +0 -85
  105. data/lib/rumale/naive_bayes/gaussian_nb.rb +0 -69
  106. data/lib/rumale/naive_bayes/multinomial_nb.rb +0 -74
  107. data/lib/rumale/naive_bayes/negation_nb.rb +0 -71
  108. data/lib/rumale/nearest_neighbors/k_neighbors_classifier.rb +0 -133
  109. data/lib/rumale/nearest_neighbors/k_neighbors_regressor.rb +0 -108
  110. data/lib/rumale/nearest_neighbors/vp_tree.rb +0 -132
  111. data/lib/rumale/neural_network/adam.rb +0 -56
  112. data/lib/rumale/neural_network/base_mlp.rb +0 -248
  113. data/lib/rumale/neural_network/mlp_classifier.rb +0 -120
  114. data/lib/rumale/neural_network/mlp_regressor.rb +0 -90
  115. data/lib/rumale/pairwise_metric.rb +0 -152
  116. data/lib/rumale/pipeline/feature_union.rb +0 -69
  117. data/lib/rumale/pipeline/pipeline.rb +0 -175
  118. data/lib/rumale/preprocessing/bin_discretizer.rb +0 -93
  119. data/lib/rumale/preprocessing/binarizer.rb +0 -60
  120. data/lib/rumale/preprocessing/kernel_calculator.rb +0 -92
  121. data/lib/rumale/preprocessing/l1_normalizer.rb +0 -62
  122. data/lib/rumale/preprocessing/l2_normalizer.rb +0 -63
  123. data/lib/rumale/preprocessing/label_binarizer.rb +0 -89
  124. data/lib/rumale/preprocessing/label_encoder.rb +0 -79
  125. data/lib/rumale/preprocessing/max_abs_scaler.rb +0 -61
  126. data/lib/rumale/preprocessing/max_normalizer.rb +0 -62
  127. data/lib/rumale/preprocessing/min_max_scaler.rb +0 -76
  128. data/lib/rumale/preprocessing/one_hot_encoder.rb +0 -100
  129. data/lib/rumale/preprocessing/ordinal_encoder.rb +0 -109
  130. data/lib/rumale/preprocessing/polynomial_features.rb +0 -109
  131. data/lib/rumale/preprocessing/standard_scaler.rb +0 -71
  132. data/lib/rumale/probabilistic_output.rb +0 -114
  133. data/lib/rumale/tree/base_decision_tree.rb +0 -150
  134. data/lib/rumale/tree/decision_tree_classifier.rb +0 -150
  135. data/lib/rumale/tree/decision_tree_regressor.rb +0 -116
  136. data/lib/rumale/tree/extra_tree_classifier.rb +0 -107
  137. data/lib/rumale/tree/extra_tree_regressor.rb +0 -94
  138. data/lib/rumale/tree/gradient_tree_regressor.rb +0 -202
  139. data/lib/rumale/tree/node.rb +0 -39
  140. data/lib/rumale/utils.rb +0 -42
  141. data/lib/rumale/validation.rb +0 -128
  142. data/lib/rumale/values.rb +0 -13
@@ -1,246 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- require 'csv'
4
- require 'rumale/validation'
5
- require 'rumale/utils'
6
- require 'rumale/preprocessing/min_max_scaler'
7
-
8
- module Rumale
9
- # Module for loading and saving a dataset file.
10
- module Dataset
11
- class << self
12
# Load a dataset with the libsvm file format into Numo::NArray.
#
# @param filename [String] A path to a dataset file.
# @param n_features [Integer/Nil] The number of features of data to load.
#   If nil is given, it will be detected automatically from given file.
#   A value smaller than the detected number of features is replaced by the detected one.
# @param zero_based [Boolean] Whether the column index starts from 0 (true) or 1 (false).
# @param dtype [Numo::NArray] Data type of Numo::NArray for features to be loaded.
#
# @return [Array<Numo::NArray>]
#   Returns array containing the (n_samples x n_features) matrix for feature vectors
#   and (n_samples) vector for labels or target values.
def load_libsvm_file(filename, n_features: nil, zero_based: false, dtype: Numo::DFloat)
  ftvecs = []
  labels = []
  n_features_detected = 0
  CSV.foreach(filename, col_sep: "\s", headers: false) do |line|
    label, ftvec, max_idx = parse_libsvm_line(line, zero_based)
    labels.push(label)
    ftvecs.push(ftvec)
    # max_idx is the largest zero-based column index of the line, so the line needs
    # max_idx + 1 columns. Using max_idx itself made the rows one element too short,
    # except for the rows that happened to contain the largest index.
    n_columns = max_idx + 1
    n_features_detected = n_columns if n_features_detected < n_columns
  end
  n_features = [n_features || n_features_detected, n_features_detected].max
  [convert_to_matrix(ftvecs, n_features, dtype), Numo::NArray.asarray(labels)]
end
37
-
38
# Write feature vectors and their labels to a file in the libsvm format.
#
# @param data [Numo::NArray] (shape: [n_samples, n_features]) matrix consisting of feature vectors.
# @param labels [Numo::NArray] (shape: [n_samples]) matrix consisting of labels or target values.
#   A two-dimensional array is written as comma-separated labels per line.
# @param filename [String] A path to the output libsvm file.
# @param zero_based [Boolean] Whether the column index starts from 0 (true) or 1 (false).
def dump_libsvm_file(data, labels, filename, zero_based: false)
  # Only as many rows as both arrays provide are written.
  n_rows = [data.shape[0], labels.shape[0]].min
  multi_label = !labels.shape[1].nil?
  label_format = detect_dtype(labels)
  value_format = detect_dtype(data)
  File.open(filename, 'w') do |out|
    n_rows.times do |row|
      label = multi_label ? labels[row, true].to_a : labels[row]
      out.puts(dump_libsvm_line(label, data[row, true], label_format, value_format, zero_based))
    end
  end
end
57
-
58
# Generate a two-dimensional data set consisting of an inner circle and an outer circle.
#
# @param n_samples [Integer] The number of samples. Must be greater than 1.
# @param shuffle [Boolean] The flag indicating whether to shuffle the dataset
# @param noise [Float] The standard deviation of gaussian noise added to the data.
#   If nil is given, no noise is added.
# @param factor [Float] The scale factor between inner and outer circles. The interval of factor is (0, 1).
# @param random_seed [Integer] The seed value using to initialize the random generator.
# @return [Array<Numo::NArray>] The two-column Numo::DFloat sample matrix and the Numo::Int32
#   label vector (0 for the outer circle, 1 for the inner circle).
# @raise [ArgumentError] If n_samples is 1 or less.
# @raise [RangeError] If factor is outside the open interval (0, 1).
def make_circles(n_samples, shuffle: true, noise: nil, factor: 0.8, random_seed: nil)
  Rumale::Validation.check_params_numeric(n_samples: n_samples, factor: factor)
  Rumale::Validation.check_params_boolean(shuffle: shuffle)
  Rumale::Validation.check_params_numeric_or_nil(noise: noise, random_seed: random_seed)
  # The guard only rejects n_samples <= 1, so the message names the bound that is really enforced.
  raise ArgumentError, 'The number of samples must be more than 1.' if n_samples <= 1
  raise RangeError, 'The interval of factor is (0, 1).' if factor <= 0 || factor >= 1

  # initialize some variables.
  rs = random_seed
  rs ||= srand
  rng = Random.new(rs)
  n_samples_out = n_samples.fdiv(2).to_i
  n_samples_in = n_samples - n_samples_out
  # make two circles.
  linsp_out = Numo::DFloat.linspace(0, 2 * Math::PI, n_samples_out)
  linsp_in = Numo::DFloat.linspace(0, 2 * Math::PI, n_samples_in)
  circle_out = Numo::DFloat[Numo::NMath.cos(linsp_out), Numo::NMath.sin(linsp_out)].transpose
  circle_in = Numo::DFloat[Numo::NMath.cos(linsp_in), Numo::NMath.sin(linsp_in)].transpose
  x = Numo::DFloat.vstack([circle_out, factor * circle_in])
  y = Numo::Int32.hstack([Numo::Int32.zeros(n_samples_out), Numo::Int32.ones(n_samples_in)])
  # shuffle data indices.
  if shuffle
    rand_ids = Array(0...n_samples).shuffle(random: rng.dup)
    x = x[rand_ids, true].dup
    y = y[rand_ids].dup
  end
  # add gaussian noise.
  x += Rumale::Utils.rand_normal(x.shape, rng.dup, 0.0, noise) unless noise.nil?
  [x, y]
end
96
-
97
# Generate a two-dimensional data set consisting of two half circles shifted.
#
# @param n_samples [Integer] The number of samples. Must be greater than 1.
# @param shuffle [Boolean] The flag indicating whether to shuffle the dataset
# @param noise [Float] The standard deviation of gaussian noise added to the data.
#   If nil is given, no noise is added.
# @param random_seed [Integer] The seed value using to initialize the random generator.
# @return [Array<Numo::NArray>] The two-column Numo::DFloat sample matrix and the Numo::Int32
#   label vector (0 for the first half circle, 1 for the shifted one).
# @raise [ArgumentError] If n_samples is 1 or less.
def make_moons(n_samples, shuffle: true, noise: nil, random_seed: nil)
  Rumale::Validation.check_params_numeric(n_samples: n_samples)
  Rumale::Validation.check_params_boolean(shuffle: shuffle)
  Rumale::Validation.check_params_numeric_or_nil(noise: noise, random_seed: random_seed)
  # The guard only rejects n_samples <= 1, so the message names the bound that is really enforced.
  raise ArgumentError, 'The number of samples must be more than 1.' if n_samples <= 1

  # initialize some variables.
  rs = random_seed
  rs ||= srand
  rng = Random.new(rs)
  n_samples_out = n_samples.fdiv(2).to_i
  n_samples_in = n_samples - n_samples_out
  # make two half circles.
  linsp_out = Numo::DFloat.linspace(0, Math::PI, n_samples_out)
  linsp_in = Numo::DFloat.linspace(0, Math::PI, n_samples_in)
  circle_out = Numo::DFloat[Numo::NMath.cos(linsp_out), Numo::NMath.sin(linsp_out)].transpose
  circle_in = Numo::DFloat[1 - Numo::NMath.cos(linsp_in), 1 - Numo::NMath.sin(linsp_in) - 0.5].transpose
  x = Numo::DFloat.vstack([circle_out, circle_in])
  y = Numo::Int32.hstack([Numo::Int32.zeros(n_samples_out), Numo::Int32.ones(n_samples_in)])
  # shuffle data indices.
  if shuffle
    rand_ids = Array(0...n_samples).shuffle(random: rng.dup)
    x = x[rand_ids, true].dup
    y = y[rand_ids].dup
  end
  # add gaussian noise.
  x += Rumale::Utils.rand_normal(x.shape, rng.dup, 0.0, noise) unless noise.nil?
  [x, y]
end
133
-
134
# Generate Gaussian blobs.
#
# @param n_samples [Integer] The total number of samples.
# @param n_features [Integer] The number of features.
#   If "centers" parameter is given as a Numo::DFloat array, this parameter is ignored.
# @param centers [Integer/Numo::DFloat/Nil] The number of cluster centroids or the fixed cluster centroids.
#   If nil is given, the number of cluster centroids is set to 3.
# @param cluster_std [Float] The standard deviation of the clusters.
# @param center_box [Array] The bounding box for each cluster centroids.
#   If "centers" parameter is given as a Numo::DFloat array, this parameter is ignored.
# @param shuffle [Boolean] The flag indicating whether to shuffle the dataset
# @param random_seed [Integer] The seed value using to initialize the random generator.
# @return [Array<Numo::NArray>] The sample matrix and the Numo::Int32 vector of cluster indices.
def make_blobs(n_samples = 1000, n_features = 2,
               centers: nil, cluster_std: 1.0, center_box: [-10, 10], shuffle: true, random_seed: nil)
  Rumale::Validation.check_params_numeric(n_samples: n_samples, n_features: n_features, cluster_std: cluster_std)
  Rumale::Validation.check_params_type(Array, center_box: center_box)
  Rumale::Validation.check_params_boolean(shuffle: shuffle)
  Rumale::Validation.check_params_numeric_or_nil(random_seed: random_seed)
  # srand is consulted only when no seed was supplied.
  rng = Random.new(random_seed || srand)
  # Either take the given centroids as they are, or draw them and rescale into center_box.
  if centers.is_a?(Numo::DFloat)
    n_centers, n_features = centers.shape
  else
    n_centers = centers.is_a?(Integer) ? centers : 3
    lower = center_box.first
    upper = center_box.last
    drawn = Rumale::Utils.rand_uniform([n_centers, n_features], rng)
    centers = Rumale::Preprocessing::MinMaxScaler.new(feature_range: [lower, upper]).fit_transform(drawn)
  end
  # Split n_samples as evenly as possible; the first clusters absorb the remainder.
  sz_cluster = [n_samples / n_centers] * n_centers
  (n_samples % n_centers).times { |n| sz_cluster[n] += 1 }
  x = nil
  y = nil
  sz_cluster.each_with_index do |size, n|
    blob = Rumale::Utils.rand_normal([size, n_features], rng, 0.0, cluster_std) + centers[n, true]
    if n.zero?
      x = blob
      y = Numo::Int32.zeros(size)
    else
      x = Numo::DFloat.vstack([x, blob])
      y = y.concatenate(Numo::Int32.zeros(size) + n)
    end
  end
  return [x, y] unless shuffle

  # Apply one random permutation to samples and labels alike.
  rand_ids = Array(0...n_samples).shuffle(random: rng.dup)
  [x[rand_ids, true].dup, y[rand_ids].dup]
end
186
-
187
- private
188
-
189
# Split one tokenized libsvm line into its label and its sparse feature entries.
#
# @param line [Array<String, nil>] The tokens of one line: the label followed by "index:value" tokens.
#   The array is emptied by this method.
# @param zero_based [Boolean] Whether the column index in the file starts from 0 (true) or 1 (false).
# @return [Array] The parsed label, the list of [zero-based index, value] pairs,
#   and the largest zero-based index found (-1 when the line holds no feature).
def parse_libsvm_line(line, zero_based)
  label = parse_label(line.shift)
  adj_idx = zero_based == false ? 1 : 0
  max_idx = -1
  ftvec = []
  # Repeated or trailing separators produce nil/empty tokens. They are skipped rather than
  # treated as the end of the line, which used to drop every feature behind the gap.
  until line.empty?
    el = line.shift
    next if el.nil? || el.empty?

    idx, val = el.split(':')
    idx = idx.to_i - adj_idx
    # Keep integers as Integer; everything else is read as Float.
    val = val.to_i.to_s == val ? val.to_i : val.to_f
    max_idx = idx if max_idx < idx
    ftvec.push([idx, val])
  end
  [label, ftvec, max_idx]
end
203
-
204
# Convert the label token of a libsvm line into numeric form.
#
# @param label [String] A single label or several labels separated by commas.
# @return [Integer, Float, Array<Integer, Float>] The numeric label, or an array of them
#   when the token holds more than one label.
def parse_label(label)
  lbl_arr = label.split(',').map do |lbl|
    # Match signed integers explicitly: the to_i.to_s round trip rejected "+1", so the
    # common "+1"/"-1" class labels were read as 1.0 and -1.
    lbl.match?(/\A[+-]?\d+\z/) ? lbl.to_i : lbl.to_f
  end
  lbl_arr.size > 1 ? lbl_arr : lbl_arr[0]
end
208
-
209
# Expand sparse rows into dense rows and hand them to the requested array type.
#
# @param data [Array<Array>] One list of [index, value] pairs per sample.
# @param n_features [Integer] The initial length of every dense row; unset positions hold 0.
# @param dtype [#asarray] The object whose asarray method builds the resulting matrix.
# @return [Object] Whatever dtype.asarray returns for the dense rows.
def convert_to_matrix(data, n_features, dtype)
  dense_rows = data.map do |pairs|
    pairs.each_with_object(Array.new(n_features, 0)) { |(idx, val), row| row[idx] = val }
  end
  dtype.asarray(dense_rows)
end
218
-
219
# Choose the format directive used to print the elements of the given data.
#
# @param data [Object] The data whose array type is looked up with Numo::NArray.array_type.
# @return [String] '%d' for integer array types, '%.10g' for SFloat/DFloat, '%s' otherwise.
def detect_dtype(data)
  case Numo::NArray.array_type(data).to_s
  when 'Numo::Int8', 'Numo::Int16', 'Numo::Int32', 'Numo::Int64',
       'Numo::UInt8', 'Numo::UInt16', 'Numo::UInt32', 'Numo::UInt64'
    '%d'
  when 'Numo::SFloat', 'Numo::DFloat'
    '%.10g'
  else
    '%s'
  end
end
227
-
228
# Build one libsvm line: the label followed by " index:value" for every non-zero feature.
#
# @param label [Object] The label (or array of labels) passed on to dump_label.
# @param ftvec [#to_a] The feature values of one sample.
# @param label_type [#to_s] The format directive for the label.
# @param value_type [String] The format directive for the feature values.
# @param zero_based [Boolean] Whether written indices start from 0 (true) or 1 (false).
# @return [String] The formatted line without a trailing newline.
def dump_libsvm_line(label, ftvec, label_type, value_type, zero_based)
  head = dump_label(label, label_type.to_s)
  offset = zero_based == false ? 1 : 0
  # Zero entries are left out, which keeps the output sparse.
  kept = ftvec.to_a.each_with_index.reject { |val, _pos| val == 0 }
  head + kept.map { |val, pos| format(" %d:#{value_type}", pos + offset, val) }.join
end
236
-
237
# Format a label, or a list of labels joined with commas, using the given directive.
#
# @param label [Object, Array] A single label or an array of labels.
# @param label_type_str [String] The format directive applied to each label.
# @return [String] The formatted label text.
def dump_label(label, label_type_str)
  return format(label_type_str, label) unless label.is_a?(Array)

  label.map { |lbl| format(label_type_str, lbl) }.join(',')
end
244
- end
245
- end
246
- end
@@ -1,150 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- require 'rumale/base/base_estimator'
4
- require 'rumale/base/transformer'
5
- require 'rumale/utils'
6
-
7
- module Rumale
8
- module Decomposition
9
- # FactorAnalysis is a class that implements factor analysis with the EM algorithm.
10
- #
11
- # @example
12
- # require 'numo/linalg/autoloader'
13
- # decomposer = Rumale::Decomposition::FactorAnalysis.new(n_components: 2)
14
- # representation = decomposer.fit_transform(samples)
15
- #
16
- # *Reference*
17
- # - Barber, D., "Bayesian Reasoning and Machine Learning," Cambridge University Press, 2012.
18
- class FactorAnalysis
19
- include Base::BaseEstimator
20
- include Base::Transformer
21
-
22
- # Returns the mean vector.
23
- # @return [Numo::DFloat] (shape: [n_features])
24
- attr_reader :mean
25
-
26
- # Returns the estimated noise variance for each feature.
27
- # @return [Numo::DFloat] (shape: [n_features])
28
- attr_reader :noise_variance
29
-
30
- # Returns the components with maximum variance.
31
- # @return [Numo::DFloat] (shape: [n_components, n_features])
32
- attr_reader :components
33
-
34
- # Returns the log likelihood at each iteration.
35
- # @return [Numo::DFloat] (shape: [n_iter])
36
- attr_reader :loglike
37
-
38
- # Return the number of iterations run for optimization
39
- # @return [Integer]
40
- attr_reader :n_iter
41
-
42
# Create a new transformer with factor analysis.
#
# @param n_components [Integer] The number of components (dimensionality of latent space).
# @param max_iter [Integer] The maximum number of iterations.
# @param tol [Float/Nil] The tolerance of termination criterion for EM algorithm.
#   If nil is given, iterate EM steps up to the maximum number of iterations.
def initialize(n_components: 2, max_iter: 100, tol: 1e-8)
  check_params_numeric(n_components: n_components, max_iter: max_iter)
  check_params_numeric_or_nil(tol: tol)
  check_params_positive(n_components: n_components, max_iter: max_iter)
  @params = { n_components: n_components, max_iter: max_iter, tol: tol }
  # Learned state; everything stays nil until #fit is called.
  @mean = nil
  @noise_variance = nil
  @components = nil
  @loglike = nil
  @n_iter = nil
end
62
-
63
# Fit the model with given training data.
#
# @overload fit(x) -> FactorAnalysis
#   @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for fitting the model.
#   @return [FactorAnalysis] The learned transformer itself.
# @raise [RuntimeError] If Numo::Linalg is not loaded.
def fit(x, _y = nil)
  # Run the same input check/conversion that #fit_transform and #transform apply,
  # so that calling #fit directly treats its argument the same way.
  x = check_convert_sample_array(x)
  raise 'FactorAnalysis#fit requires Numo::Linalg but that is not loaded.' unless enable_linalg?

  # initialize some variables.
  n_samples, n_features = x.shape
  @mean = x.mean(0)
  centered_x = x - @mean
  cov_mat = centered_x.transpose.dot(centered_x) / n_samples
  sample_vars = x.var(0)
  sqrt_n_samples = Math.sqrt(n_samples)
  @noise_variance = Numo::DFloat.ones(n_features)

  # run optimization.
  old_loglike = 0.0
  @n_iter = 0
  # The log likelihood is tracked only when a tolerance is given.
  @loglike = [] unless @params[:tol].nil?
  @params[:max_iter].times do |t|
    @n_iter = t + 1
    sqrt_noise_variance = Numo::NMath.sqrt(@noise_variance)
    # 1e-12 keeps the division away from zero.
    scaled_x = centered_x / (sqrt_noise_variance * sqrt_n_samples + 1e-12)
    s, u = truncate_svd(scaled_x, @params[:n_components])
    scaler = Numo::NMath.sqrt(Numo::DFloat.maximum(s**2 - 1.0, 0.0))
    @components = (sqrt_noise_variance.diag.dot(u) * scaler).transpose.dup
    # The noise variance is floored at 1e-12.
    @noise_variance = Numo::DFloat.maximum(sample_vars - @components.transpose.dot(@components).diagonal, 1e-12)
    next if @params[:tol].nil?

    new_loglike = log_likelihood(cov_mat, @components, @noise_variance)
    @loglike.push(new_loglike)
    break if (old_loglike - new_loglike).abs <= @params[:tol]

    old_loglike = new_loglike
  end

  @loglike = Numo::DFloat.cast(@loglike) unless @params[:tol].nil?
  # A single component is stored as a vector instead of a one-row matrix.
  @components = @components[0, true].dup if @params[:n_components] == 1
  self
end
105
-
106
# Learn the model from the training data and return the transformed training data.
#
# @overload fit_transform(x) -> Numo::DFloat
#   @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for fitting the model.
#   @return [Numo::DFloat] (shape: [n_samples, n_components]) The transformed data
def fit_transform(x, _y = nil)
  samples = check_convert_sample_array(x)
  unless enable_linalg?
    raise 'FactorAnalysis#fit_transform requires Numo::Linalg but that is not loaded.'
  end

  fitted = fit(samples)
  fitted.transform(samples)
end
117
-
118
# Project the given data with the learned model.
#
# @param x [Numo::DFloat] (shape: [n_samples, n_features]) The data to be transformed with the learned model.
# @return [Numo::DFloat] (shape: [n_samples, n_components]) The transformed data.
def transform(x)
  x = check_convert_sample_array(x)
  raise 'FactorAnalysis#transform requires Numo::Linalg but that is not loaded.' unless enable_linalg?

  single_component = @params[:n_components] == 1
  # A single component is stored as a vector; give it back its leading axis.
  loadings = single_component ? @components.expand_dims(0) : @components
  centered = x - @mean
  gram = (loadings / @noise_variance).dot(loadings.transpose)
  beta = Numo::Linalg.inv(Numo::DFloat.eye(loadings.shape[0]) + gram)
  projection = beta.dot(loadings) / @noise_variance
  z = centered.dot(projection.transpose)
  single_component ? z[true, 0].dup : z
end
132
-
133
- private
134
-
135
# Compute the value that #fit compares between iterations to decide on termination.
#
# @param cov_mat [Numo::DFloat] The covariance matrix of the centered training data.
# @param factors [Numo::DFloat] The current components.
# @param noise_vars [Numo::DFloat] The current noise variance of each feature.
# @return [Float] Half the length of noise_vars times the log determinant of the model
#   covariance, plus the trace of its inverse multiplied by cov_mat.
def log_likelihood(cov_mat, factors, noise_vars)
  model_cov = factors.transpose.dot(factors) + noise_vars.diag
  # NOTE(review): the scale factor is the length of noise_vars, not a sample count - confirm intent.
  log_det_term = noise_vars.size.fdiv(2) * Math.log(Numo::Linalg.det(model_cov))
  trace_term = Numo::Linalg.inv(model_cov).dot(cov_mat).trace
  log_det_term + trace_term
end
140
-
141
# Obtain the k leading singular values and right singular vectors of x
# from an eigendecomposition of x^T x.
#
# @param x [Numo::DFloat] The matrix to decompose.
# @param k [Integer] The number of leading values/vectors to keep.
# @return [Array<Numo::DFloat>] The square roots of the selected eigenvalues and the matching
#   eigenvectors (as columns), both in reversed order of what eigh returns.
def truncate_svd(x, k)
  n_cols = x.shape[1]
  gram = x.transpose.dot(x)
  eig_vals, eig_vecs = Numo::Linalg.eigh(gram, vals_range: (n_cols - k)...n_cols)
  [Numo::NMath.sqrt(eig_vals.reverse.dup), eig_vecs.reverse(1).dup]
end
148
- end
149
- end
150
- end
@@ -1,188 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- require 'rumale/base/base_estimator'
4
- require 'rumale/base/transformer'
5
-
6
- module Rumale
7
- module Decomposition
8
- # FastICA is a class that implements Fast Independent Component Analysis.
9
- #
10
- # @example
11
- # require 'numo/linalg/autoloader'
12
- #
13
- # transformer = Rumale::Decomposition::FastICA.new(n_components: 2, random_seed: 1)
14
- # source_data = transformer.fit_transform(observed_data)
15
- #
16
- # *Reference*
17
- # - Hyvarinen, A., "Fast and Robust Fixed-Point Algorithms for Independent Component Analysis," IEEE Trans. Neural Networks, Vol. 10 (3), pp. 626--634, 1999.
18
- # - Hyvarinen, A., and Oja, E., "Independent Component Analysis: Algorithms and Applications," Neural Networks, Vol. 13 (4-5), pp. 411--430, 2000.
19
- class FastICA
20
- include Base::BaseEstimator
21
- include Base::Transformer
22
-
23
- # Returns the unmixing matrix.
24
- # @return [Numo::DFloat] (shape: [n_components, n_features])
25
- attr_reader :components
26
-
27
- # Returns the mixing matrix.
28
- # @return [Numo::DFloat] (shape: [n_features, n_components])
29
- attr_reader :mixing
30
-
31
- # Returns the number of iterations when converged.
32
- # @return [Integer]
33
- attr_reader :n_iter
34
-
35
- # Return the random generator.
36
- # @return [Random]
37
- attr_reader :rng
38
-
39
# Create a new transformer with FastICA.
#
# @param n_components [Integer] The number of independent components.
# @param whiten [Boolean] The flag indicating whether to perform whitening.
# @param fun [String] The type of contrast function ('logcosh', 'exp', or 'cube').
# @param alpha [Float] The parameter of contrast function for 'logcosh' and 'exp'.
#   If fun = 'cube', this parameter is ignored.
# @param max_iter [Integer] The maximum number of iterations.
# @param tol [Float] The tolerance of termination criterion.
# @param random_seed [Integer] The seed value using to initialize the random generator.
def initialize(n_components: 2, whiten: true, fun: 'logcosh', alpha: 1.0, max_iter: 200, tol: 1e-4, random_seed: nil)
  check_params_numeric(n_components: n_components, max_iter: max_iter, alpha: alpha, tol: tol)
  check_params_boolean(whiten: whiten)
  check_params_string(fun: fun)
  check_params_numeric_or_nil(random_seed: random_seed)
  check_params_positive(n_components: n_components, max_iter: max_iter, tol: tol)
  @params = {
    n_components: n_components,
    whiten: whiten,
    fun: fun,
    alpha: alpha,
    max_iter: max_iter,
    tol: tol,
    # srand is consulted only when no seed was supplied.
    random_seed: random_seed || srand
  }
  # Learned state; stays nil until #fit is called.
  @components = nil
  @mixing = nil
  @n_iter = nil
  @mean = nil
  @rng = Random.new(@params[:random_seed])
end
70
-
71
# Learn the unmixing and mixing matrices from the given training data.
#
# @overload fit(x) -> FastICA
#   @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for fitting the model.
#   @return [FastICA] The learned transformer itself.
def fit(x, _y = nil)
  x = check_convert_sample_array(x)
  raise 'FastICA#fit requires Numo::Linalg but that is not loaded.' unless enable_linalg?

  use_whitening = @params[:whiten]
  if use_whitening
    @mean, whiten_mat = whitening(x, @params[:n_components])
    wx = (x - @mean).dot(whiten_mat.transpose)
  else
    wx = x
  end
  # A copy of the generator is handed over, so the state of @rng is left untouched.
  unmixing, @n_iter = ica(wx, @params[:fun], @params[:max_iter], @params[:tol], @rng.dup)
  @components = use_whitening ? unmixing.dot(whiten_mat) : unmixing
  @mixing = Numo::Linalg.pinv(@components).dup
  # With a single component both matrices are stored as vectors.
  if @params[:n_components] == 1
    @components = @components.flatten.dup
    @mixing = @mixing.flatten.dup
  end
  self
end
91
-
92
# Learn the model from the training data and return the transformed training data.
#
# @overload fit_transform(x) -> Numo::DFloat
#   @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for fitting the model.
#   @return [Numo::DFloat] (shape: [n_samples, n_components]) The transformed data
def fit_transform(x, _y = nil)
  samples = check_convert_sample_array(x)
  unless enable_linalg?
    raise 'FastICA#fit_transform requires Numo::Linalg but that is not loaded.'
  end

  fitted = fit(samples)
  fitted.transform(samples)
end
103
-
104
# Apply the learned unmixing matrix to the given data.
#
# @param x [Numo::DFloat] (shape: [n_samples, n_features]) The data to be transformed with the learned model.
# @return [Numo::DFloat] (shape: [n_samples, n_components]) The transformed data.
def transform(x)
  samples = check_convert_sample_array(x)
  # The mean is subtracted only when the model was set up with whitening.
  samples -= @mean if @params[:whiten]
  samples.dot(@components.transpose)
end
113
-
114
# Map transformed data back to the space of the mixed data with the learned mixing matrix.
#
# @param z [Numo::DFloat] (shape: [n_samples, n_components]) The source data reconstructed to the mixed data.
# @return [Numo::DFloat] (shape: [n_samples, n_features]) The mixed data.
def inverse_transform(z)
  sources = check_convert_sample_array(z)
  # A mixing vector (no second axis) is turned into a one-column matrix first.
  mixing_mat = @mixing.shape[1].nil? ? @mixing.expand_dims(0).transpose : @mixing
  mixed = sources.dot(mixing_mat.transpose)
  @params[:whiten] ? mixed + @mean : mixed
end
125
-
126
- private
127
-
128
# Compute the mean vector and the whitening matrix of the given data.
#
# @param x [Numo::DFloat] (shape: [n_samples, n_features]) The data to analyze.
# @param n_components [Integer] The number of largest eigenpairs of the covariance matrix to use.
# @return [Array<Numo::DFloat>] The mean vector, and the matrix whose rows are the selected
#   eigenvectors divided by the square root of their eigenvalues.
def whitening(x, n_components)
  n_samples, n_features = x.shape
  mean_vec = x.mean(0)
  centered = x - mean_vec
  covar_mat = centered.transpose.dot(centered) / n_samples
  selected = (n_features - n_components)...n_features
  eig_vals, eig_vecs = Numo::Linalg.eigh(covar_mat, vals_range: selected)
  # Both results of eigh are reversed before use.
  inv_scale = 1 / Numo::NMath.sqrt(eig_vals.reverse.dup)
  whiten_mat = (eig_vecs.reverse(1).dup * inv_scale).transpose.dup
  [mean_vec, whiten_mat]
end
136
-
137
# Iterate the fixed-point update of the unmixing matrix until it changes by at most tol
# or max_iter iterations have been run.
#
# @param x [Numo::DFloat] (shape: [n_samples, n_components]) The input data.
# @param fun [String] The name of the contrast function handed to #gradient.
# @param max_iter [Integer] The maximum number of iterations.
# @param tol [Float] The largest absolute element-wise change that counts as converged.
# @param sub_rng [Random] The generator used to draw the initial matrix.
# @return [Array] The unmixing matrix and the number of iterations that were run.
def ica(x, fun, max_iter, tol, sub_rng)
  n_samples, n_components = x.shape
  w = decorrelation(Rumale::Utils.rand_normal([n_components, n_components], sub_rng))
  n_iters = 0
  max_iter.times do |t|
    n_iters = t + 1
    gx, ggx = gradient(x.dot(w.transpose), fun)
    prev_w = w
    # NOTE(review): ggx has one entry per component and is broadcast along the last axis of w,
    # i.e. it scales the columns of w. The usual FastICA update scales row i by entry i -
    # confirm whether ggx.expand_dims(1) is intended here.
    w = decorrelation(gx.transpose.dot(x) / n_samples - prev_w * ggx / n_samples)
    break if (w - prev_w).abs.max <= tol
  end
  [w, n_iters]
end
151
-
152
# Decorrelate the rows of w by multiplying it with the inverse square root of w w^T.
#
# @param w [Numo::DFloat] The matrix to decorrelate.
# @return [Numo::DFloat] The decorrelated matrix.
def decorrelation(w)
  eig_vals, eig_vecs = Numo::Linalg.eigh(w.dot(w.transpose))
  inv_sqrt_vals = 1 / Numo::NMath.sqrt(eig_vals)
  (eig_vecs * inv_sqrt_vals).dot(eig_vecs.transpose).dot(w)
end
157
-
158
# Dispatch to the gradient routine of the selected contrast function.
# Any name other than 'exp' or 'cube' falls back to the logcosh routine.
#
# @param x [Numo::DFloat] The projected data.
# @param func [String] The name of the contrast function.
# @return [Array] The pair returned by the selected routine.
def gradient(x, func)
  return grad_exp(x, @params[:alpha]) if func == 'exp'
  return grad_cube(x) if func == 'cube'

  grad_logcosh(x, @params[:alpha])
end
168
-
169
# Evaluate tanh(alpha * x) and the sum over the first axis of alpha * (1 - tanh(alpha * x)^2).
#
# @param x [Numo::DFloat] The projected data.
# @param alpha [Float] The parameter of the contrast function.
# @return [Array<Numo::DFloat>] The element-wise values and their derivative summed over axis 0.
def grad_logcosh(x, alpha)
  tanh_ax = Numo::NMath.tanh(alpha * x)
  [tanh_ax, (alpha * (1 - tanh_ax**2)).sum(0)]
end
174
-
175
# Evaluate x * exp(-alpha * x^2 / 2) and the sum over the first axis of
# exp(-alpha * x^2 / 2) * (1 - alpha * x^2).
#
# @param x [Numo::DFloat] The projected data.
# @param alpha [Float] The parameter of the contrast function.
# @return [Array<Numo::DFloat>] The element-wise values and their derivative summed over axis 0.
def grad_exp(x, alpha)
  x_sq = x**2
  envelope = Numo::NMath.exp(-0.5 * alpha * x_sq)
  [envelope * x, (envelope * (1 - alpha * x_sq)).sum(0)]
end
182
-
183
# Evaluate x^3 and the sum over the first axis of 3 * x^2.
#
# @param x [Numo::DFloat] The projected data.
# @return [Array<Numo::DFloat>] The element-wise values and their derivative summed over axis 0.
def grad_cube(x)
  cubed = x**3
  slope_sum = (3 * x**2).sum(0)
  [cubed, slope_sum]
end
186
- end
187
- end
188
- end