rumale 0.23.3 → 0.24.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (142) hide show
  1. checksums.yaml +4 -4
  2. data/LICENSE.txt +5 -1
  3. data/README.md +3 -288
  4. data/lib/rumale/version.rb +1 -1
  5. data/lib/rumale.rb +20 -131
  6. metadata +252 -150
  7. data/CHANGELOG.md +0 -643
  8. data/CODE_OF_CONDUCT.md +0 -74
  9. data/ext/rumale/extconf.rb +0 -37
  10. data/ext/rumale/rumaleext.c +0 -545
  11. data/ext/rumale/rumaleext.h +0 -12
  12. data/lib/rumale/base/base_estimator.rb +0 -49
  13. data/lib/rumale/base/classifier.rb +0 -36
  14. data/lib/rumale/base/cluster_analyzer.rb +0 -31
  15. data/lib/rumale/base/evaluator.rb +0 -17
  16. data/lib/rumale/base/regressor.rb +0 -36
  17. data/lib/rumale/base/splitter.rb +0 -21
  18. data/lib/rumale/base/transformer.rb +0 -22
  19. data/lib/rumale/clustering/dbscan.rb +0 -123
  20. data/lib/rumale/clustering/gaussian_mixture.rb +0 -218
  21. data/lib/rumale/clustering/hdbscan.rb +0 -291
  22. data/lib/rumale/clustering/k_means.rb +0 -122
  23. data/lib/rumale/clustering/k_medoids.rb +0 -141
  24. data/lib/rumale/clustering/mini_batch_k_means.rb +0 -139
  25. data/lib/rumale/clustering/power_iteration.rb +0 -127
  26. data/lib/rumale/clustering/single_linkage.rb +0 -203
  27. data/lib/rumale/clustering/snn.rb +0 -76
  28. data/lib/rumale/clustering/spectral_clustering.rb +0 -115
  29. data/lib/rumale/dataset.rb +0 -246
  30. data/lib/rumale/decomposition/factor_analysis.rb +0 -150
  31. data/lib/rumale/decomposition/fast_ica.rb +0 -188
  32. data/lib/rumale/decomposition/nmf.rb +0 -124
  33. data/lib/rumale/decomposition/pca.rb +0 -159
  34. data/lib/rumale/ensemble/ada_boost_classifier.rb +0 -179
  35. data/lib/rumale/ensemble/ada_boost_regressor.rb +0 -160
  36. data/lib/rumale/ensemble/extra_trees_classifier.rb +0 -139
  37. data/lib/rumale/ensemble/extra_trees_regressor.rb +0 -125
  38. data/lib/rumale/ensemble/gradient_boosting_classifier.rb +0 -306
  39. data/lib/rumale/ensemble/gradient_boosting_regressor.rb +0 -237
  40. data/lib/rumale/ensemble/random_forest_classifier.rb +0 -189
  41. data/lib/rumale/ensemble/random_forest_regressor.rb +0 -153
  42. data/lib/rumale/ensemble/stacking_classifier.rb +0 -215
  43. data/lib/rumale/ensemble/stacking_regressor.rb +0 -163
  44. data/lib/rumale/ensemble/voting_classifier.rb +0 -126
  45. data/lib/rumale/ensemble/voting_regressor.rb +0 -82
  46. data/lib/rumale/evaluation_measure/accuracy.rb +0 -29
  47. data/lib/rumale/evaluation_measure/adjusted_rand_score.rb +0 -74
  48. data/lib/rumale/evaluation_measure/calinski_harabasz_score.rb +0 -56
  49. data/lib/rumale/evaluation_measure/davies_bouldin_score.rb +0 -53
  50. data/lib/rumale/evaluation_measure/explained_variance_score.rb +0 -39
  51. data/lib/rumale/evaluation_measure/f_score.rb +0 -50
  52. data/lib/rumale/evaluation_measure/function.rb +0 -147
  53. data/lib/rumale/evaluation_measure/log_loss.rb +0 -45
  54. data/lib/rumale/evaluation_measure/mean_absolute_error.rb +0 -29
  55. data/lib/rumale/evaluation_measure/mean_squared_error.rb +0 -29
  56. data/lib/rumale/evaluation_measure/mean_squared_log_error.rb +0 -29
  57. data/lib/rumale/evaluation_measure/median_absolute_error.rb +0 -30
  58. data/lib/rumale/evaluation_measure/mutual_information.rb +0 -49
  59. data/lib/rumale/evaluation_measure/normalized_mutual_information.rb +0 -53
  60. data/lib/rumale/evaluation_measure/precision.rb +0 -50
  61. data/lib/rumale/evaluation_measure/precision_recall.rb +0 -96
  62. data/lib/rumale/evaluation_measure/purity.rb +0 -40
  63. data/lib/rumale/evaluation_measure/r2_score.rb +0 -43
  64. data/lib/rumale/evaluation_measure/recall.rb +0 -50
  65. data/lib/rumale/evaluation_measure/roc_auc.rb +0 -130
  66. data/lib/rumale/evaluation_measure/silhouette_score.rb +0 -82
  67. data/lib/rumale/feature_extraction/feature_hasher.rb +0 -110
  68. data/lib/rumale/feature_extraction/hash_vectorizer.rb +0 -155
  69. data/lib/rumale/feature_extraction/tfidf_transformer.rb +0 -113
  70. data/lib/rumale/kernel_approximation/nystroem.rb +0 -126
  71. data/lib/rumale/kernel_approximation/rbf.rb +0 -102
  72. data/lib/rumale/kernel_machine/kernel_fda.rb +0 -120
  73. data/lib/rumale/kernel_machine/kernel_pca.rb +0 -97
  74. data/lib/rumale/kernel_machine/kernel_ridge.rb +0 -82
  75. data/lib/rumale/kernel_machine/kernel_ridge_classifier.rb +0 -92
  76. data/lib/rumale/kernel_machine/kernel_svc.rb +0 -193
  77. data/lib/rumale/linear_model/base_sgd.rb +0 -285
  78. data/lib/rumale/linear_model/elastic_net.rb +0 -119
  79. data/lib/rumale/linear_model/lasso.rb +0 -115
  80. data/lib/rumale/linear_model/linear_regression.rb +0 -201
  81. data/lib/rumale/linear_model/logistic_regression.rb +0 -275
  82. data/lib/rumale/linear_model/nnls.rb +0 -137
  83. data/lib/rumale/linear_model/ridge.rb +0 -209
  84. data/lib/rumale/linear_model/svc.rb +0 -213
  85. data/lib/rumale/linear_model/svr.rb +0 -132
  86. data/lib/rumale/manifold/mds.rb +0 -155
  87. data/lib/rumale/manifold/tsne.rb +0 -222
  88. data/lib/rumale/metric_learning/fisher_discriminant_analysis.rb +0 -113
  89. data/lib/rumale/metric_learning/mlkr.rb +0 -161
  90. data/lib/rumale/metric_learning/neighbourhood_component_analysis.rb +0 -167
  91. data/lib/rumale/model_selection/cross_validation.rb +0 -125
  92. data/lib/rumale/model_selection/function.rb +0 -42
  93. data/lib/rumale/model_selection/grid_search_cv.rb +0 -225
  94. data/lib/rumale/model_selection/group_k_fold.rb +0 -93
  95. data/lib/rumale/model_selection/group_shuffle_split.rb +0 -115
  96. data/lib/rumale/model_selection/k_fold.rb +0 -81
  97. data/lib/rumale/model_selection/shuffle_split.rb +0 -90
  98. data/lib/rumale/model_selection/stratified_k_fold.rb +0 -99
  99. data/lib/rumale/model_selection/stratified_shuffle_split.rb +0 -118
  100. data/lib/rumale/model_selection/time_series_split.rb +0 -91
  101. data/lib/rumale/multiclass/one_vs_rest_classifier.rb +0 -83
  102. data/lib/rumale/naive_bayes/base_naive_bayes.rb +0 -47
  103. data/lib/rumale/naive_bayes/bernoulli_nb.rb +0 -82
  104. data/lib/rumale/naive_bayes/complement_nb.rb +0 -85
  105. data/lib/rumale/naive_bayes/gaussian_nb.rb +0 -69
  106. data/lib/rumale/naive_bayes/multinomial_nb.rb +0 -74
  107. data/lib/rumale/naive_bayes/negation_nb.rb +0 -71
  108. data/lib/rumale/nearest_neighbors/k_neighbors_classifier.rb +0 -133
  109. data/lib/rumale/nearest_neighbors/k_neighbors_regressor.rb +0 -108
  110. data/lib/rumale/nearest_neighbors/vp_tree.rb +0 -132
  111. data/lib/rumale/neural_network/adam.rb +0 -56
  112. data/lib/rumale/neural_network/base_mlp.rb +0 -248
  113. data/lib/rumale/neural_network/mlp_classifier.rb +0 -120
  114. data/lib/rumale/neural_network/mlp_regressor.rb +0 -90
  115. data/lib/rumale/pairwise_metric.rb +0 -152
  116. data/lib/rumale/pipeline/feature_union.rb +0 -69
  117. data/lib/rumale/pipeline/pipeline.rb +0 -175
  118. data/lib/rumale/preprocessing/bin_discretizer.rb +0 -93
  119. data/lib/rumale/preprocessing/binarizer.rb +0 -60
  120. data/lib/rumale/preprocessing/kernel_calculator.rb +0 -92
  121. data/lib/rumale/preprocessing/l1_normalizer.rb +0 -62
  122. data/lib/rumale/preprocessing/l2_normalizer.rb +0 -63
  123. data/lib/rumale/preprocessing/label_binarizer.rb +0 -89
  124. data/lib/rumale/preprocessing/label_encoder.rb +0 -79
  125. data/lib/rumale/preprocessing/max_abs_scaler.rb +0 -61
  126. data/lib/rumale/preprocessing/max_normalizer.rb +0 -62
  127. data/lib/rumale/preprocessing/min_max_scaler.rb +0 -76
  128. data/lib/rumale/preprocessing/one_hot_encoder.rb +0 -100
  129. data/lib/rumale/preprocessing/ordinal_encoder.rb +0 -109
  130. data/lib/rumale/preprocessing/polynomial_features.rb +0 -109
  131. data/lib/rumale/preprocessing/standard_scaler.rb +0 -71
  132. data/lib/rumale/probabilistic_output.rb +0 -114
  133. data/lib/rumale/tree/base_decision_tree.rb +0 -150
  134. data/lib/rumale/tree/decision_tree_classifier.rb +0 -150
  135. data/lib/rumale/tree/decision_tree_regressor.rb +0 -116
  136. data/lib/rumale/tree/extra_tree_classifier.rb +0 -107
  137. data/lib/rumale/tree/extra_tree_regressor.rb +0 -94
  138. data/lib/rumale/tree/gradient_tree_regressor.rb +0 -202
  139. data/lib/rumale/tree/node.rb +0 -39
  140. data/lib/rumale/utils.rb +0 -42
  141. data/lib/rumale/validation.rb +0 -128
  142. data/lib/rumale/values.rb +0 -13
@@ -1,291 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- require 'rumale/base/base_estimator'
4
- require 'rumale/base/cluster_analyzer'
5
- require 'rumale/pairwise_metric'
6
- require 'rumale/clustering/single_linkage'
7
-
8
- module Rumale
9
- module Clustering
10
- # HDBSCAN is a class that implements HDBSCAN cluster analysis.
11
- #
12
- # @example
13
- # analyzer = Rumale::Clustering::HDBSCAN.new(min_samples: 5)
14
- # cluster_labels = analyzer.fit_predict(samples)
15
- #
16
- # *Reference*
17
- # - Campello, R. J. G. B., Moulavi, D., Zimek, A., and Sander, J., "Hierarchical Density Estimates for Data Clustering, Visualization, and Outlier Detection," TKDD, Vol. 10 (1), pp. 5:1--5:51, 2015.
18
- # - Campello, R. J. G. B., Moulavi, D., and Sander, J., "Density-Based Clustering Based on Hierarchical Density Estimates," Proc. PAKDD'13, pp. 160--172, 2013.
19
- # - Lelis, L., and Sander, J., "Semi-Supervised Density-Based Clustering," Proc. ICDM'09, pp. 842--847, 2009.
20
- class HDBSCAN
21
- include Base::BaseEstimator
22
- include Base::ClusterAnalyzer
23
-
24
- # Return the cluster labels. The negative cluster label indicates that the point is noise.
25
- # @return [Numo::Int32] (shape: [n_samples])
26
- attr_reader :labels
27
-
28
- # Create a new cluster analyzer with HDBSCAN algorithm.
29
- #
30
- # @param min_samples [Integer] The number of neighbor samples to be used for the criterion whether a point is a core point.
31
- # @param min_cluster_size [Integer/Nil] The minimum size of cluster. If nil is given, it is set equal to min_samples.
32
- # @param metric [String] The metric to calculate the distances.
33
- # If metric is 'euclidean', Euclidean distance is calculated for distance between points.
34
- # If metric is 'precomputed', the fit and fit_predict methods expect to be given a distance matrix.
35
- def initialize(min_samples: 10, min_cluster_size: nil, metric: 'euclidean') # Validate the arguments and record them in @params.
36
- check_params_numeric(min_samples: min_samples) # check_params_* helpers are defined outside this class; presumably they raise on invalid input -- TODO confirm
37
- check_params_numeric_or_nil(min_cluster_size: min_cluster_size)
38
- check_params_string(metric: metric)
39
- check_params_positive(min_samples: min_samples) # only min_samples is checked for positivity; min_cluster_size is not
40
- @params = {}
41
- @params[:min_samples] = min_samples
42
- @params[:min_cluster_size] = min_cluster_size || min_samples # nil falls back to min_samples
43
- @params[:metric] = metric == 'precomputed' ? 'precomputed' : 'euclidean' # any string other than 'precomputed' is stored as 'euclidean'
44
- @labels = nil # assigned by fit_predict
45
- end
46
-
47
- # Analyze clusters with the given training data.
48
- #
49
- # @overload fit(x) -> HDBSCAN
50
- #
51
- # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for cluster analysis.
52
- # If the metric is 'precomputed', x must be a square distance matrix (shape: [n_samples, n_samples]).
53
- # @return [HDBSCAN] The learned cluster analyzer itself.
54
- def fit(x, _y = nil) # _y is accepted but never read
55
- x = check_convert_sample_array(x)
56
- raise ArgumentError, 'Expect the input distance matrix to be square.' if @params[:metric] == 'precomputed' && x.shape[0] != x.shape[1]
57
-
58
- fit_predict(x) # stores the labels in @labels; the return value is discarded here (fit_predict repeats the conversion and the square check)
59
- self
60
- end
61
-
62
- # Analyze clusters and assign samples to clusters.
63
- #
64
- # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to be used for cluster analysis.
65
- # If the metric is 'precomputed', x must be a square distance matrix (shape: [n_samples, n_samples]).
66
- # @return [Numo::Int32] (shape: [n_samples]) Predicted cluster label per sample.
67
- def fit_predict(x)
68
- x = check_convert_sample_array(x)
69
- raise ArgumentError, 'Expect the input distance matrix to be square.' if @params[:metric] == 'precomputed' && x.shape[0] != x.shape[1]
70
-
71
- distance_mat = @params[:metric] == 'precomputed' ? x : Rumale::PairwiseMetric.euclidean_distance(x) # a precomputed matrix is used as-is; otherwise pairwise Euclidean distances of x are computed
72
- @labels = partial_fit(distance_mat) # the assignment is the last expression, so the labels are also the return value
73
- end
74
-
75
- private
76
-
77
- # @!visibility private
78
- class UnionFind # Disjoint-set forest over the ids 0...n; #flatten uses it to group points under cluster labels.
79
- def initialize(n)
80
- @parent = Numo::Int32.new(n).seq # 0, 1, ..., n - 1: every id starts as its own root
81
- @rank = Numo::Int32.zeros(n) # rank per root, consulted by union to decide which root is attached to which
82
- end
83
-
84
- # @!visibility private
85
- def union(x, y) # Merge the sets containing x and y; always returns nil.
86
- x_root = find(x)
87
- y_root = find(y)
88
-
89
- return if x_root == y_root # already in the same set
90
-
91
- # :nocov:
92
- if @rank[x_root] < @rank[y_root] # union by rank: the lower-ranked root is attached under the other
93
- @parent[x_root] = y_root
94
- else
95
- @parent[y_root] = x_root
96
- @rank[x_root] += 1 if @rank[x_root] == @rank[y_root] # equal ranks: the surviving root's rank grows by one
97
- end
98
- # :nocov:
99
-
100
- nil
101
- end
102
-
103
- # @!visibility private
104
- def find(x) # Return the root of the set containing x.
105
- @parent[x] = find(@parent[x]) if @parent[x] != x # recursive lookup that re-points x directly at its root (path compression)
106
- @parent[x]
107
- end
108
- end
109
-
110
- # @!visibility private
111
- class Node # Plain value holder for one edge of the condensed tree built by #condense_tree.
112
- # @!visibility private
113
- attr_reader :x, :y, :weight, :n_elements
114
-
115
- # @!visibility private
116
- def initialize(x:, y:, weight:, n_elements: 0) # condense_tree passes x: parent label, y: child label or point id, weight: density, n_elements: child size
117
- @x = x
118
- @y = y
119
- @weight = weight
120
- @n_elements = n_elements
121
- end
122
-
123
- # @!visibility private
124
- def ==(other) # Field-by-field equality; other must respond to x, y, weight and n_elements.
125
- # :nocov:
126
- x == other.x && y == other.y && weight == other.weight && n_elements == other.n_elements
127
- # :nocov:
128
- end
129
- end
130
-
131
- private_constant :UnionFind, :Node
132
-
133
- def partial_fit(distance_mat) # Run the whole pipeline on a distance matrix and return the per-sample labels produced by flatten.
134
- mr_distance_mat = mutual_reachability_distances(distance_mat, @params[:min_samples])
135
- hierarchy = Rumale::Clustering::SingleLinkage.new(n_clusters: 1, metric: 'precomputed').fit(mr_distance_mat).hierarchy # hierarchy produced by SingleLinkage from the mutual reachability distances
136
- tree = condense_tree(hierarchy, @params[:min_cluster_size])
137
- stabilities = cluster_stability(tree) # note: cluster_stability sorts tree in place
138
- flatten(tree, stabilities)
139
- end
140
-
141
- def mutual_reachability_distances(distance_mat, min_samples) # Element-wise max of d(i, j) and the core distances of i and j.
142
- core_distances = distance_mat.sort(axis: 1)[true, min_samples + 1] # per row: the value at 0-based column min_samples + 1 after an ascending sort. NOTE(review): the index is not range-checked, so the matrix needs more than min_samples + 1 columns -- confirm the intended offset
143
- Numo::DFloat.maximum(core_distances.expand_dims(1), Numo::DFloat.maximum(core_distances, distance_mat)) # inner max broadcasts core distances across columns, outer max (column vector) across rows
144
- end
145
-
146
- def breadth_first_search_hierarchy(hierarchy, root) # Return root followed by every node id below it, level by level.
147
- n_edges = hierarchy.size
148
- n_points = n_edges + 1
149
- to_process = [root]
150
- res = []
151
- while to_process.any?
152
- res.concat(to_process)
153
- to_process = to_process.select { |n| n >= n_points }.map { |n| n - n_points } # ids >= n_points are merge nodes; convert them to indices into hierarchy (smaller ids are not expanded)
154
- to_process = to_process.map { |n| [hierarchy[n].x, hierarchy[n].y] }.flatten if to_process.any? # next level: both children of each merge node
155
- end
156
- res
157
- end
158
-
159
- # rubocop:disable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/MethodLength, Metrics/PerceivedComplexity
160
- def condense_tree(hierarchy, min_cluster_size) # Collapse the merge hierarchy into an Array of Node edges (parent label -> child label or point id).
161
- n_edges = hierarchy.size
162
- root = 2 * n_edges # id of the last merge: ids below n_points are points, merge i has id n_points + i
163
- n_points = n_edges + 1
164
- next_label = n_points + 1 # new cluster labels are handed out above n_points, which is the root's label
165
-
166
- node_ids = breadth_first_search_hierarchy(hierarchy, root)
167
-
168
- relabel = Numo::Int32.zeros(root + 1) # hierarchy node id -> condensed cluster label
169
- relabel[root] = n_points
170
- res = []
171
- visited = {}
172
-
173
- node_ids.each do |n_id|
174
- next if visited[n_id] || n_id < n_points # skip points and nodes already absorbed by an ancestor below
175
-
176
- edge = hierarchy[n_id - n_points]
177
-
178
- density = edge.weight > 0.0 ? 1.fdiv(edge.weight) : Float::INFINITY # inverse of the merge distance; non-positive weights map to infinity
179
- n_x_elements = edge.x >= n_points ? hierarchy[edge.x - n_points].n_elements : 1 # a single point counts as one element
180
- n_y_elements = edge.y >= n_points ? hierarchy[edge.y - n_points].n_elements : 1
181
-
182
- if n_x_elements >= min_cluster_size && n_y_elements >= min_cluster_size # both children are large enough: each becomes a new cluster label
183
- relabel[edge.x] = next_label
184
- res.push(Node.new(x: relabel[n_id], y: relabel[edge.x], weight: density, n_elements: n_x_elements))
185
- next_label += 1
186
- relabel[edge.y] = next_label
187
- res.push(Node.new(x: relabel[n_id], y: relabel[edge.y], weight: density, n_elements: n_y_elements))
188
- next_label += 1
189
- elsif n_x_elements < min_cluster_size && n_y_elements < min_cluster_size # both too small: every point under either child is attached to the current label
190
- breadth_first_search_hierarchy(hierarchy, edge.x).each do |sn_id|
191
- res.push(Node.new(x: relabel[n_id], y: sn_id, weight: density, n_elements: 1)) if sn_id < n_points
192
- visited[sn_id] = true
193
- end
194
- breadth_first_search_hierarchy(hierarchy, edge.y).each do |sn_id|
195
- res.push(Node.new(x: relabel[n_id], y: sn_id, weight: density, n_elements: 1)) if sn_id < n_points
196
- visited[sn_id] = true
197
- end
198
- elsif n_x_elements < min_cluster_size # only x is too small: y inherits the current label, x's points are attached to it
199
- relabel[edge.y] = relabel[n_id]
200
- breadth_first_search_hierarchy(hierarchy, edge.x).each do |sn_id|
201
- res.push(Node.new(x: relabel[n_id], y: sn_id, weight: density, n_elements: 1)) if sn_id < n_points
202
- visited[sn_id] = true
203
- end
204
- elsif n_y_elements < min_cluster_size # mirror case: x inherits the current label, y's points are attached to it
205
- relabel[edge.x] = relabel[n_id]
206
- breadth_first_search_hierarchy(hierarchy, edge.y).each do |sn_id|
207
- res.push(Node.new(x: relabel[n_id], y: sn_id, weight: density, n_elements: 1)) if sn_id < n_points
208
- visited[sn_id] = true
209
- end
210
- end
211
- end
212
- res
213
- end
214
-
215
- def cluster_stability(tree) # Return a Hash mapping each parent label (edge.x) to its accumulated stability.
216
- tree.sort! { |a, b| a.weight <=> b.weight } # NOTE: reorders the caller's array in place, ascending by density
217
-
218
- root = tree.map(&:x).min # smallest parent label
219
- child_max = tree.map(&:y).max
220
- child_max = root if child_max < root
221
- densities = Numo::DFloat.zeros(child_max + 1) + Float::INFINITY # one slot per id, initialised to infinity
222
-
223
- current = tree[0].y
224
- density_min = tree[0].weight
225
- tree.each do |edge| # single pass over the sorted edges, tracking a running minimum density per run
226
- if edge.x == current
227
- density_min = [density_min, edge.weight].min
228
- else
229
- densities[current] = density_min
230
- current = edge.y
231
- density_min = edge.weight
232
- end
233
- end
234
-
235
- densities[current] = density_min if current != tree[0].y
236
- densities[root] = 0.0 # the root's reference density is fixed at zero
237
-
238
- tree.each_with_object({}) do |edge, stab|
239
- stab[edge.x] ||= 0.0
240
- stab[edge.x] += (edge.weight - densities[edge.x]) * edge.n_elements # (edge density - parent's recorded density) weighted by the child's size
241
- end
242
- end
243
-
244
- def breadth_first_search_tree(tree, root) # Return root followed by all labels reachable from it through x -> y edges, level by level.
245
- to_process = [root]
246
- res = []
247
- while to_process.any?
248
- res.concat(to_process)
249
- to_process = tree.select { |v| to_process.include?(v.x) }.map(&:y) # children of the current level; scans every edge on each pass
250
- end
251
- res
252
- end
253
-
254
- def flatten(tree, stabilities) # Select clusters by stability and return a Numo::Int32 of per-point labels (-1 where no selected cluster applies).
255
- node_ids = stabilities.keys.sort.reverse.slice(0, stabilities.size - 1) # labels in descending order, dropping the smallest one (the root label)
256
-
257
- cluster_tree = tree.select { |edge| edge.n_elements > 1 } # keep only edges whose child has more than one element
258
- is_cluster = node_ids.each_with_object({}) { |n_id, h| h[n_id] = true } # every candidate starts out selected
259
-
260
- node_ids.each do |n_id|
261
- children = cluster_tree.select { |node| node.x == n_id }.map(&:y)
262
- subtree_stability = children.inject(0.0) { |sum, c_id| sum + stabilities[c_id] }
263
- if subtree_stability > stabilities[n_id] # children are jointly more stable: deselect this node and carry their total upward (mutates stabilities)
264
- is_cluster[n_id] = false
265
- stabilities[n_id] = subtree_stability
266
- else # this node wins: deselect everything below it
267
- breadth_first_search_tree(cluster_tree, n_id).each do |sn_id|
268
- is_cluster[sn_id] = false if sn_id != n_id
269
- end
270
- end
271
- end
272
-
273
- cluster_label_map = {}
274
- is_cluster.select { |_k, v| v == true }.keys.uniq.sort.each_with_index { |n_idx, c_idx| cluster_label_map[n_idx] = c_idx } # selected labels -> consecutive ids starting at 0, in ascending label order
275
-
276
- parent_arr = tree.map(&:x)
277
- uf = UnionFind.new(parent_arr.max + 1)
278
- tree.each { |edge| uf.union(edge.x, edge.y) if cluster_label_map[edge.y].nil? } # merge a child into its parent unless the child is itself a selected cluster
279
-
280
- root = parent_arr.min # smallest parent label; the loop below treats 0...root as the point ids
281
- res = Numo::Int32.zeros(root)
282
- root.times do |n|
283
- cluster = uf.find(n)
284
- res[n] = cluster < root ? -1 : cluster_label_map[cluster] || -1 # -1 when the representative is a point id or an unselected label
285
- end
286
- res
287
- end
288
- # rubocop:enable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/MethodLength, Metrics/PerceivedComplexity
289
- end
290
- end
291
- end
@@ -1,122 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- require 'rumale/base/base_estimator'
4
- require 'rumale/base/cluster_analyzer'
5
- require 'rumale/pairwise_metric'
6
-
7
- module Rumale
8
- # This module consists of classes that implement cluster analysis methods.
9
- module Clustering
10
- # KMeans is a class that implements K-Means cluster analysis.
11
- # The current implementation uses the Euclidean distance for analyzing the clusters.
12
- #
13
- # @example
14
- # analyzer = Rumale::Clustering::KMeans.new(n_clusters: 10, max_iter: 50)
15
- # cluster_labels = analyzer.fit_predict(samples)
16
- #
17
- # *Reference*
18
- # - Arthur, D., and Vassilvitskii, S., "k-means++: the advantages of careful seeding," Proc. SODA'07, pp. 1027--1035, 2007.
19
- class KMeans
20
- include Base::BaseEstimator
21
- include Base::ClusterAnalyzer
22
-
23
- # Return the centroids.
24
- # @return [Numo::DFloat] (shape: [n_clusters, n_features])
25
- attr_reader :cluster_centers
26
-
27
- # Return the random generator.
28
- # @return [Random]
29
- attr_reader :rng
30
-
31
- # Create a new cluster analyzer with K-Means method.
32
- #
33
- # @param n_clusters [Integer] The number of clusters.
34
- # @param init [String] The initialization method for centroids ('random' or 'k-means++').
35
- # @param max_iter [Integer] The maximum number of iterations.
36
- # @param tol [Float] The tolerance of termination criterion.
37
- # @param random_seed [Integer] The seed value used to initialize the random generator.
38
- def initialize(n_clusters: 8, init: 'k-means++', max_iter: 50, tol: 1.0e-4, random_seed: nil) # Validate the arguments, record them in @params and build the random generator.
39
- check_params_numeric(n_clusters: n_clusters, max_iter: max_iter, tol: tol) # check_params_* helpers are defined outside this class; presumably they raise on invalid input -- TODO confirm
40
- check_params_string(init: init)
41
- check_params_numeric_or_nil(random_seed: random_seed)
42
- check_params_positive(n_clusters: n_clusters, max_iter: max_iter)
43
- @params = {}
44
- @params[:n_clusters] = n_clusters
45
- @params[:init] = init == 'random' ? 'random' : 'k-means++' # any string other than 'random' is stored as 'k-means++'
46
- @params[:max_iter] = max_iter
47
- @params[:tol] = tol
48
- @params[:random_seed] = random_seed
49
- @params[:random_seed] ||= srand # no seed given: Kernel#srand reseeds the global generator and returns the previous seed, which is stored as this estimator's seed
50
- @cluster_centers = nil # assigned by fit
51
- @rng = Random.new(@params[:random_seed])
52
- end
53
-
54
- # Analyze clusters with the given training data.
55
- #
56
- # @overload fit(x) -> KMeans
57
- #
58
- # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for cluster analysis.
59
- # @return [KMeans] The learned cluster analyzer itself.
60
- def fit(x, _y = nil) # _y is accepted but never read
61
- x = check_convert_sample_array(x)
62
- init_cluster_centers(x)
63
- @params[:max_iter].times do |_t|
64
- cluster_labels = assign_cluster(x) # nearest-center index per sample
65
- old_centers = @cluster_centers.dup # snapshot for the convergence test below
66
- @params[:n_clusters].times do |n|
67
- assigned_bits = cluster_labels.eq(n)
68
- @cluster_centers[n, true] = x[assigned_bits.where, true].mean(axis: 0) if assigned_bits.count.positive? # a cluster with no samples keeps its previous center
69
- end
70
- error = Numo::NMath.sqrt(((old_centers - @cluster_centers)**2).sum(axis: 1)).mean # mean Euclidean shift of the centers in this iteration
71
- break if error <= @params[:tol]
72
- end
73
- self
74
- end
75
-
76
- # Predict cluster labels for samples.
77
- #
78
- # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the cluster label.
79
- # @return [Numo::Int32] (shape: [n_samples]) Predicted cluster label per sample.
80
- def predict(x) # Uses @cluster_centers, which is nil until fit has run.
81
- x = check_convert_sample_array(x)
82
- assign_cluster(x)
83
- end
84
-
85
- # Analyze clusters and assign samples to clusters.
86
- #
87
- # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for cluster analysis.
88
- # @return [Numo::Int32] (shape: [n_samples]) Predicted cluster label per sample.
89
- def fit_predict(x) # Fit on x, then label the same x with the learned centers.
90
- x = check_convert_sample_array(x)
91
- fit(x)
92
- predict(x)
93
- end
94
-
95
- private
96
-
97
- def assign_cluster(x) # Return the index of the nearest center for every row of x.
98
- distance_matrix = PairwiseMetric.euclidean_distance(x, @cluster_centers)
99
- distance_matrix.min_index(axis: 1) - Numo::Int32[*0.step(distance_matrix.size - 1, @cluster_centers.shape[0])] # min_index is taken to return positions in the flattened matrix (see Numo docs); subtracting each row's start offset (0, n_clusters, 2 * n_clusters, ...) leaves the column, i.e. the cluster index
100
- end
101
-
102
- def init_cluster_centers(x) # Fill @cluster_centers with rows of x chosen at random or by k-means++ seeding.
103
- # Random initialization: start from n_clusters randomly sampled rows of x.
104
- n_samples = x.shape[0]
105
- sub_rng = @rng.dup # work on a copy so that @rng itself is not advanced
106
- rand_id = Array(0...n_samples).sample(@params[:n_clusters], random: sub_rng) # distinct row indices
107
- @cluster_centers = x[rand_id, true].dup
108
- return unless @params[:init] == 'k-means++'
109
-
110
- # k-means++ initialization: keep center 0 from above and re-draw centers 1..n_clusters-1 one at a time.
111
- (1...@params[:n_clusters]).each do |n|
112
- distance_matrix = PairwiseMetric.euclidean_distance(x, @cluster_centers[0...n, true]) # distances to the n centers chosen so far
113
- min_distances = distance_matrix.flatten[distance_matrix.min_index(axis: 1)] # per-sample distance to its closest chosen center (flat indices into the flattened matrix)
114
- probs = min_distances**2 / (min_distances**2).sum # selection probability proportional to the squared distance
115
- cum_probs = probs.cumsum
116
- selected_id = cum_probs.gt(sub_rng.rand).where.to_a.first # first index whose cumulative probability exceeds a uniform draw
117
- @cluster_centers[n, true] = x[selected_id, true].dup
118
- end
119
- end
120
- end
121
- end
122
- end
@@ -1,141 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- require 'rumale/base/base_estimator'
4
- require 'rumale/base/cluster_analyzer'
5
- require 'rumale/pairwise_metric'
6
-
7
- module Rumale
8
- module Clustering
9
- # KMedoids is a class that implements K-Medoids cluster analysis.
10
- #
11
- # @example
12
- # analyzer = Rumale::Clustering::KMedoids.new(n_clusters: 10, max_iter: 50)
13
- # cluster_labels = analyzer.fit_predict(samples)
14
- #
15
- # *Reference*
16
- # - Arthur, D., and Vassilvitskii, S., "k-means++: the advantages of careful seeding," Proc. SODA'07, pp. 1027--1035, 2007.
17
- class KMedoids
18
- include Base::BaseEstimator
19
- include Base::ClusterAnalyzer
20
-
21
- # Return the indices of medoids.
22
- # @return [Numo::Int32] (shape: [n_clusters])
23
- attr_reader :medoid_ids
24
-
25
- # Return the random generator.
26
- # @return [Random]
27
- attr_reader :rng
28
-
29
- # Create a new cluster analyzer with K-Medoids method.
30
- #
31
- # @param n_clusters [Integer] The number of clusters.
32
- # @param metric [String] The metric to calculate the distances.
33
- # If metric is 'euclidean', Euclidean distance is calculated for distance between points.
34
- # If metric is 'precomputed', the fit and fit_predict methods expect to be given a distance matrix.
35
- # @param init [String] The initialization method for medoids ('random' or 'k-means++').
36
- # @param max_iter [Integer] The maximum number of iterations.
37
- # @param tol [Float] The tolerance of termination criterion.
38
- # @param random_seed [Integer] The seed value used to initialize the random generator.
39
- def initialize(n_clusters: 8, metric: 'euclidean', init: 'k-means++', max_iter: 50, tol: 1.0e-4, random_seed: nil) # Validate the arguments, record them in @params and build the random generator.
40
- check_params_numeric(n_clusters: n_clusters, max_iter: max_iter, tol: tol) # check_params_* helpers are defined outside this class; presumably they raise on invalid input -- TODO confirm
41
- check_params_string(metric: metric, init: init)
42
- check_params_numeric_or_nil(random_seed: random_seed)
43
- check_params_positive(n_clusters: n_clusters, max_iter: max_iter)
44
- @params = {}
45
- @params[:n_clusters] = n_clusters
46
- @params[:metric] = metric == 'precomputed' ? 'precomputed' : 'euclidean' # any string other than 'precomputed' is stored as 'euclidean'
47
- @params[:init] = init == 'random' ? 'random' : 'k-means++' # any string other than 'random' is stored as 'k-means++'
48
- @params[:max_iter] = max_iter
49
- @params[:tol] = tol
50
- @params[:random_seed] = random_seed
51
- @params[:random_seed] ||= srand # no seed given: Kernel#srand reseeds the global generator and returns the previous seed, which is stored as this estimator's seed
52
- @medoid_ids = nil # assigned by fit
53
- @cluster_centers = nil # assigned by fit only when the metric is 'euclidean'
54
- @rng = Random.new(@params[:random_seed])
55
- end
56
-
57
- # Analyze clusters with the given training data.
58
- #
59
- # @overload fit(x) -> KMedoids
60
- #
61
- # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for fitting the model.
62
- # If the metric is 'precomputed', x must be a square distance matrix (shape: [n_samples, n_samples]).
63
- # @return [KMedoids] The learned cluster analyzer itself.
64
- def fit(x, _not_used = nil) # _not_used is accepted but never read
65
- x = check_convert_sample_array(x)
66
- raise ArgumentError, 'Expect the input distance matrix to be square.' if @params[:metric] == 'precomputed' && x.shape[0] != x.shape[1]
67
-
68
- # Initialize the distance matrix, the medoids and the starting error.
69
- distance_mat = @params[:metric] == 'precomputed' ? x : Rumale::PairwiseMetric.euclidean_distance(x)
70
- init_cluster_centers(distance_mat)
71
- error = distance_mat[true, @medoid_ids].mean # mean over all sample-to-medoid distances (every medoid column, not only the nearest)
72
- @params[:max_iter].times do |_t|
73
- cluster_labels = assign_cluster(distance_mat[true, @medoid_ids])
74
- @params[:n_clusters].times do |n|
75
- assigned_ids = cluster_labels.eq(n).where
76
- @medoid_ids[n] = assigned_ids[distance_mat[assigned_ids, assigned_ids].sum(axis: 1).min_index] # new medoid: the member with the smallest summed distance to its cluster's members. NOTE(review): no guard for a cluster with no members -- confirm that cannot occur
77
- end
78
- new_error = distance_mat[true, @medoid_ids].mean
79
- break if (error - new_error).abs <= @params[:tol] # stop once the error changes by no more than tol
80
-
81
- error = new_error
82
- end
83
- @cluster_centers = x[@medoid_ids, true].dup if @params[:metric] == 'euclidean' # medoid coordinates are kept so predict can compute distances; not available for 'precomputed'
84
- self
85
- end
86
-
87
- # Predict cluster labels for samples.
88
- #
89
- # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the cluster label.
90
- # If the metric is 'precomputed', x must be distances between samples and medoids (shape: [n_samples, n_clusters]).
91
- # @return [Numo::Int32] (shape: [n_samples]) Predicted cluster label per sample.
92
- def predict(x) # Raises ArgumentError when the metric is 'precomputed' and x does not have one column per medoid.
93
- x = check_convert_sample_array(x)
94
- distance_mat = @params[:metric] == 'precomputed' ? x : Rumale::PairwiseMetric.euclidean_distance(x, @cluster_centers) # precomputed input is already sample-to-medoid distances
95
- if @params[:metric] == 'precomputed' && distance_mat.shape[1] != @medoid_ids.size
96
- raise ArgumentError, 'Expect the size input matrix to be n_samples-by-n_clusters.'
97
- end
98
-
99
- assign_cluster(distance_mat)
100
- end
101
-
102
- # Analyze clusters and assign samples to clusters.
103
- #
104
- # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for cluster analysis.
105
- # If the metric is 'precomputed', x must be a square distance matrix (shape: [n_samples, n_samples]).
106
- # @return [Numo::Int32] (shape: [n_samples]) Predicted cluster label per sample.
107
- def fit_predict(x) # Fit on x, then label the same samples.
108
- x = check_convert_sample_array(x)
109
- fit(x)
110
- if @params[:metric] == 'precomputed'
111
- predict(x[true, @medoid_ids]) # predict expects sample-to-medoid distances, so only the medoid columns are passed
112
- else
113
- predict(x)
114
- end
115
- end
116
-
117
- private
118
-
119
- def assign_cluster(distances_to_medoids) # Return the index of the nearest medoid column for every row.
120
- distances_to_medoids.min_index(axis: 1) - Numo::Int32[*0.step(distances_to_medoids.size - 1, @params[:n_clusters])] # min_index is taken to return positions in the flattened matrix (see Numo docs); subtracting each row's start offset (0, n_clusters, 2 * n_clusters, ...) leaves the column, i.e. the cluster index
121
- end
122
-
123
- def init_cluster_centers(distance_mat) # Fill @medoid_ids with sample indices chosen at random or by k-means++ seeding.
124
- # Random initialization: start from n_clusters randomly sampled indices.
125
- n_samples = distance_mat.shape[0]
126
- sub_rng = @rng.dup # work on a copy so that @rng itself is not advanced
127
- @medoid_ids = Numo::Int32.asarray(Array(0...n_samples).sample(@params[:n_clusters], random: sub_rng)) # distinct sample indices
128
- return unless @params[:init] == 'k-means++'
129
-
130
- # k-means++ initialization: keep medoid 0 from above and re-draw medoids 1..n_clusters-1 one at a time.
131
- (1...@params[:n_clusters]).each do |n|
132
- distances = distance_mat[true, @medoid_ids[0...n]] # distances to the n medoids chosen so far
133
- min_distances = distances.flatten[distances.min_index(axis: 1)] # per-sample distance to its closest chosen medoid (flat indices into the flattened matrix)
134
- probs = min_distances**2 / (min_distances**2).sum # selection probability proportional to the squared distance
135
- cum_probs = probs.cumsum
136
- @medoid_ids[n] = cum_probs.gt(sub_rng.rand).where.to_a.first # first index whose cumulative probability exceeds a uniform draw
137
- end
138
- end
139
- end
140
- end
141
- end
@@ -1,139 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- require 'rumale/base/base_estimator'
4
- require 'rumale/base/cluster_analyzer'
5
- require 'rumale/pairwise_metric'
6
-
7
module Rumale
  module Clustering
    # MiniBatchKMeans is a class that implements K-Means cluster analysis
    # with mini-batch stochastic gradient descent (SGD).
    #
    # @example
    #   analyzer = Rumale::Clustering::MiniBatchKMeans.new(n_clusters: 10, max_iter: 50, batch_size: 50, random_seed: 1)
    #   cluster_labels = analyzer.fit_predict(samples)
    #
    # *Reference*
    # - Sculley, D., "Web-scale k-means clustering," Proc. WWW'10, pp. 1177--1178, 2010.
    class MiniBatchKMeans
      include Base::BaseEstimator
      include Base::ClusterAnalyzer

      # Return the centroids.
      # @return [Numo::DFloat] (shape: [n_clusters, n_features])
      attr_reader :cluster_centers

      # Return the random generator.
      # @return [Random]
      attr_reader :rng

      # Create a new cluster analyzer with K-Means method with mini-batch SGD.
      #
      # @param n_clusters [Integer] The number of clusters.
      # @param init [String] The initialization method for centroids ('random' or 'k-means++').
      #   Any value other than 'random' selects 'k-means++'.
      # @param max_iter [Integer] The maximum number of iterations.
      # @param batch_size [Integer] The size of the mini batches.
      # @param tol [Float] The tolerance of termination criterion.
      # @param random_seed [Integer] The seed value using to initialize the random generator.
      def initialize(n_clusters: 8, init: 'k-means++', max_iter: 100, batch_size: 100, tol: 1.0e-4, random_seed: nil)
        check_params_numeric(n_clusters: n_clusters, max_iter: max_iter, batch_size: batch_size, tol: tol)
        check_params_string(init: init)
        check_params_numeric_or_nil(random_seed: random_seed)
        # batch_size goes through the same sign validation as the other counts, so an
        # invalid value is reported at construction instead of surfacing inside #fit.
        check_params_positive(n_clusters: n_clusters, max_iter: max_iter, batch_size: batch_size)
        @params = {}
        @params[:n_clusters] = n_clusters
        @params[:init] = init == 'random' ? 'random' : 'k-means++'
        @params[:max_iter] = max_iter
        @params[:batch_size] = batch_size
        @params[:tol] = tol
        @params[:random_seed] = random_seed
        # Without an explicit seed, fall back to the value returned by Kernel#srand.
        @params[:random_seed] ||= srand
        @cluster_centers = nil
        @rng = Random.new(@params[:random_seed])
      end

      # Analyze clusters with given training data.
      #
      # @overload fit(x) -> MiniBatchKMeans
      #
      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for cluster analysis.
      # @return [MiniBatchKMeans] The learned cluster analyzer itself.
      def fit(x, _y = nil)
        x = check_convert_sample_array(x)
        # initialization.
        n_samples = x.shape[0]
        # Number of mini-batch updates applied to each centroid; it sets that centroid's learning rate.
        update_counter = Numo::Int32.zeros(@params[:n_clusters])
        # Draw from a copy so that fitting does not advance @rng itself.
        sub_rng = @rng.dup
        init_cluster_centers(x, sub_rng)
        # optimization with mini-batch sgd.
        @params[:max_iter].times do
          sample_ids = Array(0...n_samples).shuffle(random: sub_rng)
          old_centers = @cluster_centers.dup
          # Consume the shuffled ids one mini batch at a time until none are left.
          until (subset_ids = sample_ids.shift(@params[:batch_size])).empty?
            # sub sampling
            sub_x = x[subset_ids, true]
            # assign nearest centroids
            cluster_labels = assign_cluster(sub_x)
            # update centroids
            @params[:n_clusters].times do |c|
              assigned_bits = cluster_labels.eq(c)
              next unless assigned_bits.count.positive?

              update_counter[c] += 1
              learning_rate = 1.fdiv(update_counter[c])
              update = sub_x[assigned_bits.where, true].mean(axis: 0)
              # Move the centroid toward the mean of the batch samples assigned to it.
              @cluster_centers[c, true] = (1 - learning_rate) * @cluster_centers[c, true] + learning_rate * update
            end
          end
          # Stop once the mean Euclidean shift of the centroids over this pass is within tol.
          error = Numo::NMath.sqrt(((old_centers - @cluster_centers)**2).sum(axis: 1)).mean
          break if error <= @params[:tol]
        end
        self
      end

      # Predict cluster labels for samples.
      #
      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the cluster label.
      # @return [Numo::Int32] (shape: [n_samples]) Predicted cluster label per sample.
      def predict(x)
        x = check_convert_sample_array(x)
        assign_cluster(x)
      end

      # Analyze clusters and assign samples to clusters.
      #
      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for cluster analysis.
      # @return [Numo::Int32] (shape: [n_samples]) Predicted cluster label per sample.
      def fit_predict(x)
        x = check_convert_sample_array(x)
        fit(x)
        predict(x)
      end

      private

      # Label each sample with the row index of its nearest centroid in @cluster_centers.
      def assign_cluster(x)
        distance_matrix = PairwiseMetric.euclidean_distance(x, @cluster_centers)
        # min_index(axis: 1) reports positions in the flattened matrix, so the starting
        # offset of each row is subtracted to turn them back into column indices.
        # The offsets are converted from an Array instead of being splatted into
        # Numo::Int32[...]: a splat passes one argument per sample, which can exhaust
        # the VM stack on large inputs.
        row_offsets = 0.step(distance_matrix.size - 1, @cluster_centers.shape[0]).to_a
        distance_matrix.min_index(axis: 1) - Numo::Int32.asarray(row_offsets)
      end

      # Choose the initial centroids from the rows of x, drawing randomness from sub_rng.
      def init_cluster_centers(x, sub_rng)
        # random initialize
        n_samples = x.shape[0]
        rand_id = Array(0...n_samples).sample(@params[:n_clusters], random: sub_rng)
        @cluster_centers = x[rand_id, true].dup
        return unless @params[:init] == 'k-means++'

        # k-means++ initialize: keep the first centroid and re-draw each later one with
        # probability proportional to the squared distance to its nearest chosen centroid.
        (1...@params[:n_clusters]).each do |n|
          distance_matrix = PairwiseMetric.euclidean_distance(x, @cluster_centers[0...n, true])
          min_distances = distance_matrix.flatten[distance_matrix.min_index(axis: 1)]
          squared = min_distances**2
          cum_probs = (squared / squared.sum).cumsum
          # NOTE(review): `first` is nil when no cumulative probability exceeds the drawn
          # value (e.g. every distance is zero) - confirm whether a fallback is wanted.
          selected_id = cum_probs.gt(sub_rng.rand).where.to_a.first
          @cluster_centers[n, true] = x[selected_id, true].dup
        end
      end
    end
  end
end