rumale 0.23.3 → 0.24.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (142) hide show
  1. checksums.yaml +4 -4
  2. data/LICENSE.txt +5 -1
  3. data/README.md +3 -288
  4. data/lib/rumale/version.rb +1 -1
  5. data/lib/rumale.rb +20 -131
  6. metadata +252 -150
  7. data/CHANGELOG.md +0 -643
  8. data/CODE_OF_CONDUCT.md +0 -74
  9. data/ext/rumale/extconf.rb +0 -37
  10. data/ext/rumale/rumaleext.c +0 -545
  11. data/ext/rumale/rumaleext.h +0 -12
  12. data/lib/rumale/base/base_estimator.rb +0 -49
  13. data/lib/rumale/base/classifier.rb +0 -36
  14. data/lib/rumale/base/cluster_analyzer.rb +0 -31
  15. data/lib/rumale/base/evaluator.rb +0 -17
  16. data/lib/rumale/base/regressor.rb +0 -36
  17. data/lib/rumale/base/splitter.rb +0 -21
  18. data/lib/rumale/base/transformer.rb +0 -22
  19. data/lib/rumale/clustering/dbscan.rb +0 -123
  20. data/lib/rumale/clustering/gaussian_mixture.rb +0 -218
  21. data/lib/rumale/clustering/hdbscan.rb +0 -291
  22. data/lib/rumale/clustering/k_means.rb +0 -122
  23. data/lib/rumale/clustering/k_medoids.rb +0 -141
  24. data/lib/rumale/clustering/mini_batch_k_means.rb +0 -139
  25. data/lib/rumale/clustering/power_iteration.rb +0 -127
  26. data/lib/rumale/clustering/single_linkage.rb +0 -203
  27. data/lib/rumale/clustering/snn.rb +0 -76
  28. data/lib/rumale/clustering/spectral_clustering.rb +0 -115
  29. data/lib/rumale/dataset.rb +0 -246
  30. data/lib/rumale/decomposition/factor_analysis.rb +0 -150
  31. data/lib/rumale/decomposition/fast_ica.rb +0 -188
  32. data/lib/rumale/decomposition/nmf.rb +0 -124
  33. data/lib/rumale/decomposition/pca.rb +0 -159
  34. data/lib/rumale/ensemble/ada_boost_classifier.rb +0 -179
  35. data/lib/rumale/ensemble/ada_boost_regressor.rb +0 -160
  36. data/lib/rumale/ensemble/extra_trees_classifier.rb +0 -139
  37. data/lib/rumale/ensemble/extra_trees_regressor.rb +0 -125
  38. data/lib/rumale/ensemble/gradient_boosting_classifier.rb +0 -306
  39. data/lib/rumale/ensemble/gradient_boosting_regressor.rb +0 -237
  40. data/lib/rumale/ensemble/random_forest_classifier.rb +0 -189
  41. data/lib/rumale/ensemble/random_forest_regressor.rb +0 -153
  42. data/lib/rumale/ensemble/stacking_classifier.rb +0 -215
  43. data/lib/rumale/ensemble/stacking_regressor.rb +0 -163
  44. data/lib/rumale/ensemble/voting_classifier.rb +0 -126
  45. data/lib/rumale/ensemble/voting_regressor.rb +0 -82
  46. data/lib/rumale/evaluation_measure/accuracy.rb +0 -29
  47. data/lib/rumale/evaluation_measure/adjusted_rand_score.rb +0 -74
  48. data/lib/rumale/evaluation_measure/calinski_harabasz_score.rb +0 -56
  49. data/lib/rumale/evaluation_measure/davies_bouldin_score.rb +0 -53
  50. data/lib/rumale/evaluation_measure/explained_variance_score.rb +0 -39
  51. data/lib/rumale/evaluation_measure/f_score.rb +0 -50
  52. data/lib/rumale/evaluation_measure/function.rb +0 -147
  53. data/lib/rumale/evaluation_measure/log_loss.rb +0 -45
  54. data/lib/rumale/evaluation_measure/mean_absolute_error.rb +0 -29
  55. data/lib/rumale/evaluation_measure/mean_squared_error.rb +0 -29
  56. data/lib/rumale/evaluation_measure/mean_squared_log_error.rb +0 -29
  57. data/lib/rumale/evaluation_measure/median_absolute_error.rb +0 -30
  58. data/lib/rumale/evaluation_measure/mutual_information.rb +0 -49
  59. data/lib/rumale/evaluation_measure/normalized_mutual_information.rb +0 -53
  60. data/lib/rumale/evaluation_measure/precision.rb +0 -50
  61. data/lib/rumale/evaluation_measure/precision_recall.rb +0 -96
  62. data/lib/rumale/evaluation_measure/purity.rb +0 -40
  63. data/lib/rumale/evaluation_measure/r2_score.rb +0 -43
  64. data/lib/rumale/evaluation_measure/recall.rb +0 -50
  65. data/lib/rumale/evaluation_measure/roc_auc.rb +0 -130
  66. data/lib/rumale/evaluation_measure/silhouette_score.rb +0 -82
  67. data/lib/rumale/feature_extraction/feature_hasher.rb +0 -110
  68. data/lib/rumale/feature_extraction/hash_vectorizer.rb +0 -155
  69. data/lib/rumale/feature_extraction/tfidf_transformer.rb +0 -113
  70. data/lib/rumale/kernel_approximation/nystroem.rb +0 -126
  71. data/lib/rumale/kernel_approximation/rbf.rb +0 -102
  72. data/lib/rumale/kernel_machine/kernel_fda.rb +0 -120
  73. data/lib/rumale/kernel_machine/kernel_pca.rb +0 -97
  74. data/lib/rumale/kernel_machine/kernel_ridge.rb +0 -82
  75. data/lib/rumale/kernel_machine/kernel_ridge_classifier.rb +0 -92
  76. data/lib/rumale/kernel_machine/kernel_svc.rb +0 -193
  77. data/lib/rumale/linear_model/base_sgd.rb +0 -285
  78. data/lib/rumale/linear_model/elastic_net.rb +0 -119
  79. data/lib/rumale/linear_model/lasso.rb +0 -115
  80. data/lib/rumale/linear_model/linear_regression.rb +0 -201
  81. data/lib/rumale/linear_model/logistic_regression.rb +0 -275
  82. data/lib/rumale/linear_model/nnls.rb +0 -137
  83. data/lib/rumale/linear_model/ridge.rb +0 -209
  84. data/lib/rumale/linear_model/svc.rb +0 -213
  85. data/lib/rumale/linear_model/svr.rb +0 -132
  86. data/lib/rumale/manifold/mds.rb +0 -155
  87. data/lib/rumale/manifold/tsne.rb +0 -222
  88. data/lib/rumale/metric_learning/fisher_discriminant_analysis.rb +0 -113
  89. data/lib/rumale/metric_learning/mlkr.rb +0 -161
  90. data/lib/rumale/metric_learning/neighbourhood_component_analysis.rb +0 -167
  91. data/lib/rumale/model_selection/cross_validation.rb +0 -125
  92. data/lib/rumale/model_selection/function.rb +0 -42
  93. data/lib/rumale/model_selection/grid_search_cv.rb +0 -225
  94. data/lib/rumale/model_selection/group_k_fold.rb +0 -93
  95. data/lib/rumale/model_selection/group_shuffle_split.rb +0 -115
  96. data/lib/rumale/model_selection/k_fold.rb +0 -81
  97. data/lib/rumale/model_selection/shuffle_split.rb +0 -90
  98. data/lib/rumale/model_selection/stratified_k_fold.rb +0 -99
  99. data/lib/rumale/model_selection/stratified_shuffle_split.rb +0 -118
  100. data/lib/rumale/model_selection/time_series_split.rb +0 -91
  101. data/lib/rumale/multiclass/one_vs_rest_classifier.rb +0 -83
  102. data/lib/rumale/naive_bayes/base_naive_bayes.rb +0 -47
  103. data/lib/rumale/naive_bayes/bernoulli_nb.rb +0 -82
  104. data/lib/rumale/naive_bayes/complement_nb.rb +0 -85
  105. data/lib/rumale/naive_bayes/gaussian_nb.rb +0 -69
  106. data/lib/rumale/naive_bayes/multinomial_nb.rb +0 -74
  107. data/lib/rumale/naive_bayes/negation_nb.rb +0 -71
  108. data/lib/rumale/nearest_neighbors/k_neighbors_classifier.rb +0 -133
  109. data/lib/rumale/nearest_neighbors/k_neighbors_regressor.rb +0 -108
  110. data/lib/rumale/nearest_neighbors/vp_tree.rb +0 -132
  111. data/lib/rumale/neural_network/adam.rb +0 -56
  112. data/lib/rumale/neural_network/base_mlp.rb +0 -248
  113. data/lib/rumale/neural_network/mlp_classifier.rb +0 -120
  114. data/lib/rumale/neural_network/mlp_regressor.rb +0 -90
  115. data/lib/rumale/pairwise_metric.rb +0 -152
  116. data/lib/rumale/pipeline/feature_union.rb +0 -69
  117. data/lib/rumale/pipeline/pipeline.rb +0 -175
  118. data/lib/rumale/preprocessing/bin_discretizer.rb +0 -93
  119. data/lib/rumale/preprocessing/binarizer.rb +0 -60
  120. data/lib/rumale/preprocessing/kernel_calculator.rb +0 -92
  121. data/lib/rumale/preprocessing/l1_normalizer.rb +0 -62
  122. data/lib/rumale/preprocessing/l2_normalizer.rb +0 -63
  123. data/lib/rumale/preprocessing/label_binarizer.rb +0 -89
  124. data/lib/rumale/preprocessing/label_encoder.rb +0 -79
  125. data/lib/rumale/preprocessing/max_abs_scaler.rb +0 -61
  126. data/lib/rumale/preprocessing/max_normalizer.rb +0 -62
  127. data/lib/rumale/preprocessing/min_max_scaler.rb +0 -76
  128. data/lib/rumale/preprocessing/one_hot_encoder.rb +0 -100
  129. data/lib/rumale/preprocessing/ordinal_encoder.rb +0 -109
  130. data/lib/rumale/preprocessing/polynomial_features.rb +0 -109
  131. data/lib/rumale/preprocessing/standard_scaler.rb +0 -71
  132. data/lib/rumale/probabilistic_output.rb +0 -114
  133. data/lib/rumale/tree/base_decision_tree.rb +0 -150
  134. data/lib/rumale/tree/decision_tree_classifier.rb +0 -150
  135. data/lib/rumale/tree/decision_tree_regressor.rb +0 -116
  136. data/lib/rumale/tree/extra_tree_classifier.rb +0 -107
  137. data/lib/rumale/tree/extra_tree_regressor.rb +0 -94
  138. data/lib/rumale/tree/gradient_tree_regressor.rb +0 -202
  139. data/lib/rumale/tree/node.rb +0 -39
  140. data/lib/rumale/utils.rb +0 -42
  141. data/lib/rumale/validation.rb +0 -128
  142. data/lib/rumale/values.rb +0 -13
@@ -1,127 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- require 'rumale/base/base_estimator'
4
- require 'rumale/base/cluster_analyzer'
5
- require 'rumale/pairwise_metric'
6
-
7
module Rumale
  module Clustering
    # PowerIteration is a class that implements power iteration clustering.
    #
    # @example
    #   analyzer = Rumale::Clustering::PowerIteration.new(n_clusters: 10, gamma: 8.0, max_iter: 1000)
    #   cluster_labels = analyzer.fit_predict(samples)
    #
    # *Reference*
    # - Lin, F., and Cohen, W W., "Power Iteration Clustering," Proc. ICML'10, pp. 655--662, 2010.
    class PowerIteration
      include Base::BaseEstimator
      include Base::ClusterAnalyzer

      # Return the data in embedded space.
      # @return [Numo::DFloat] (shape: [n_samples])
      attr_reader :embedding

      # Return the cluster labels.
      # @return [Numo::Int32] (shape: [n_samples])
      attr_reader :labels

      # Return the number of iterations run for optimization
      # @return [Integer]
      attr_reader :n_iter

      # Create a new cluster analyzer with power iteration clustering.
      #
      # @param n_clusters [Integer] The number of clusters.
      # @param affinity [String] The representation of affinity matrix ('rbf' or 'precomputed').
      # @param gamma [Float] The parameter of rbf kernel, if nil it is 1 / n_features.
      #   If affinity = 'precomputed', this parameter is ignored.
      # @param init [String] The initialization method for centroids of K-Means clustering ('random' or 'k-means++').
      #   Any value other than 'random' falls back to 'k-means++'.
      # @param max_iter [Integer] The maximum number of iterations.
      # @param tol [Float] The tolerance of termination criterion.
      # @param eps [Float] A small value close to zero to avoid zero division error.
      #   NOTE(review): the value is stored in the parameter hash, but no method of this class reads it.
      # @param random_seed [Integer] The seed value using to initialize the random generator.
      def initialize(n_clusters: 8, affinity: 'rbf', gamma: nil, init: 'k-means++', max_iter: 1000, tol: 1.0e-8, eps: 1.0e-5, random_seed: nil)
        check_params_numeric(n_clusters: n_clusters, max_iter: max_iter, tol: tol, eps: eps)
        check_params_numeric_or_nil(gamma: gamma, random_seed: random_seed)
        check_params_string(affinity: affinity, init: init)
        check_params_positive(n_clusters: n_clusters, max_iter: max_iter, tol: tol, eps: eps)
        @params = {}
        @params[:n_clusters] = n_clusters
        @params[:affinity] = affinity
        @params[:gamma] = gamma
        @params[:init] = init == 'random' ? 'random' : 'k-means++'
        @params[:max_iter] = max_iter
        @params[:tol] = tol
        @params[:eps] = eps
        @params[:random_seed] = random_seed
        # Fall back to a generated seed so that the K-Means step is always given a concrete value.
        @params[:random_seed] ||= srand
        @embedding = nil
        @labels = nil
        @n_iter = nil
      end

      # Analysis clusters with given training data.
      #
      # @overload fit(x) -> PowerIteration
      #
      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for cluster analysis.
      #   If the affinity is 'precomputed', x must be a square affinity matrix (shape: [n_samples, n_samples]).
      # @raise [ArgumentError] If the affinity is 'precomputed' and x is not square.
      # @return [PowerIteration] The learned cluster analyzer itself.
      def fit(x, _y = nil)
        x = check_convert_sample_array(x)
        raise ArgumentError, 'Expect the input affinity matrix to be square.' if @params[:affinity] == 'precomputed' && x.shape[0] != x.shape[1]

        fit_predict(x)
        self
      end

      # Analysis clusters and assign samples to clusters.
      #
      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for cluster analysis.
      #   If the affinity is 'precomputed', x must be a square affinity matrix (shape: [n_samples, n_samples]).
      # @raise [ArgumentError] If the affinity is 'precomputed' and x is not square.
      # @return [Numo::Int32] (shape: [n_samples]) Predicted cluster label per sample.
      def fit_predict(x)
        x = check_convert_sample_array(x)
        raise ArgumentError, 'Expect the input affinity matrix to be square.' if @params[:affinity] == 'precomputed' && x.shape[0] != x.shape[1]

        # The constructor stores the option under :affinity, so that is the key to test here.
        # A precomputed matrix is copied because embedded_space overwrites its diagonal in place.
        affinity_mat = @params[:affinity] == 'precomputed' ? x.dup : Rumale::PairwiseMetric.rbf_kernel(x, nil, @params[:gamma])
        # The tolerance handed to the iteration is tol divided by the number of samples.
        @embedding, @n_iter = embedded_space(affinity_mat, @params[:max_iter], @params[:tol].fdiv(affinity_mat.shape[0]))
        @labels = line_kmeans_clustering(@embedding)
      end

      private

      # Run the power iteration on the row-normalized affinity matrix.
      #
      # The diagonal of affinity_mat is set to zero in place. The vector is initialized with the
      # normalized degrees and repeatedly multiplied by the normalized matrix and rescaled to unit
      # L1 norm. The loop stops when the largest change between successive update deltas is at most
      # tol, or after max_iter iterations. When the stopping test fires, the vector from before the
      # final multiplication is the one returned.
      #
      # @return [Array] The pair [embedded vector, number of iterations executed].
      def embedded_space(affinity_mat, max_iter, tol)
        affinity_mat[affinity_mat.diag_indices] = 0.0

        degrees = affinity_mat.sum(axis: 1)
        normalized_affinity_mat = (1.0 / degrees).diag.dot(affinity_mat)

        iters = 0
        embedded_line = degrees / degrees.sum
        n_samples = embedded_line.shape[0]
        error = Numo::DFloat.ones(n_samples)
        max_iter.times do |t|
          iters = t + 1
          new_embedded_line = normalized_affinity_mat.dot(embedded_line)
          new_embedded_line /= new_embedded_line.abs.sum
          new_error = (new_embedded_line - embedded_line).abs
          break if (new_error - error).abs.max <= tol

          embedded_line = new_embedded_line
          error = new_error
        end

        [embedded_line, iters]
      end

      # Cluster the one-dimensional embedding with K-Means, treating it as a single-feature sample matrix.
      def line_kmeans_clustering(vec)
        Rumale::Clustering::KMeans.new(
          n_clusters: @params[:n_clusters], init: @params[:init],
          max_iter: @params[:max_iter], tol: @params[:tol], random_seed: @params[:random_seed]
        ).fit_predict(vec.expand_dims(1))
      end
    end
  end
end
@@ -1,203 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- require 'rumale/base/base_estimator'
4
- require 'rumale/base/cluster_analyzer'
5
- require 'rumale/pairwise_metric'
6
-
7
module Rumale
  module Clustering
    # SingleLinkage is a class that implements hierarchical cluster analysis with single linkage method.
    # This class is used internally for HDBSCAN.
    #
    # @example
    #   analyzer = Rumale::Clustering::SingleLinkage.new(n_clusters: 2)
    #   cluster_labels = analyzer.fit_predict(samples)
    #
    # *Reference*
    # - Mullner, D., "Modern hierarchical, agglomerative clustering algorithms," arXiv:1109.2378, 2011.
    class SingleLinkage
      include Base::BaseEstimator
      include Base::ClusterAnalyzer

      # Return the cluster labels.
      # @return [Numo::Int32] (shape: [n_samples])
      attr_reader :labels

      # Return the hierarchical structure.
      # @return [Array<SingleLinkage::Node>] (shape: [n_samples - 1])
      attr_reader :hierarchy

      # Create a new cluster analyzer with single linkage algorithm.
      #
      # @param n_clusters [Integer] The number of clusters.
      # @param metric [String] The metric to calculate the distances.
      #   If metric is 'euclidean', Euclidean distance is calculated for distance between points.
      #   If metric is 'precomputed', the fit and fit_predict methods expect to be given a distance matrix.
      #   Any value other than 'precomputed' falls back to 'euclidean'.
      def initialize(n_clusters: 2, metric: 'euclidean')
        check_params_numeric(n_clusters: n_clusters)
        check_params_string(metric: metric)
        @params = {}
        @params[:n_clusters] = n_clusters
        @params[:metric] = metric == 'precomputed' ? 'precomputed' : 'euclidean'
        @labels = nil
        @hierarchy = nil
      end

      # Analysis clusters with given training data.
      #
      # @overload fit(x) -> SingleLinkage
      #
      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for cluster analysis.
      #   If the metric is 'precomputed', x must be a square distance matrix (shape: [n_samples, n_samples]).
      # @raise [ArgumentError] If the metric is 'precomputed' and x is not square.
      # @return [SingleLinkage] The learned cluster analyzer itself.
      def fit(x, _y = nil)
        x = check_convert_sample_array(x)
        raise ArgumentError, 'Expect the input distance matrix to be square.' if @params[:metric] == 'precomputed' && x.shape[0] != x.shape[1]

        fit_predict(x)
        self
      end

      # Analysis clusters and assign samples to clusters.
      #
      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to be used for cluster analysis.
      #   If the metric is 'precomputed', x must be a square distance matrix (shape: [n_samples, n_samples]).
      # @raise [ArgumentError] If the metric is 'precomputed' and x is not square.
      # @return [Numo::Int32] (shape: [n_samples]) Predicted cluster label per sample.
      def fit_predict(x)
        x = check_convert_sample_array(x)
        raise ArgumentError, 'Expect the input distance matrix to be square.' if @params[:metric] == 'precomputed' && x.shape[0] != x.shape[1]

        distance_mat = @params[:metric] == 'precomputed' ? x : Rumale::PairwiseMetric.euclidean_distance(x)
        @labels = partial_fit(distance_mat)
      end

      private

      # Disjoint-set forest over the n samples plus the n - 1 clusters created by merging.
      # A parent value of -1 marks a root. Every union allocates a fresh label, starting at n.
      # @!visibility private
      class UnionFind
        def initialize(n)
          @parent = Numo::Int32.zeros(2 * n - 1) - 1
          @size = Numo::Int32.hstack([Numo::Int32.ones(n), Numo::Int32.zeros(n - 1)])
          @next_label = n
        end

        # Merge the trees rooted at x and y under a newly allocated label.
        # Returns the number of elements in the merged tree.
        # @!visibility private
        def union(x, y)
          size = @size[x] + @size[y]
          @parent[x] = @next_label
          @parent[y] = @next_label
          @size[@next_label] = size
          @next_label += 1
          size
        end

        # Return the root of the tree containing x, compressing the walked path.
        # @!visibility private
        def find(x)
          root = x
          root = @parent[root] while @parent[root] != -1
          # Re-point every node on the path straight at the root. The loop is skipped when x
          # is already the root, so the -1 root marker is never used as an array index.
          until x == root
            succ = @parent[x]
            @parent[x] = root
            x = succ
          end
          root
        end
      end

      # A weighted edge (x, y). In the hierarchy, n_elements holds the size of the merged cluster.
      # @!visibility private
      class Node
        # @!visibility private
        attr_reader :x, :y, :weight, :n_elements

        # @!visibility private
        def initialize(x:, y:, weight:, n_elements: 0)
          @x = x
          @y = y
          @weight = weight
          @n_elements = n_elements
        end

        # @!visibility private
        def ==(other)
          x == other.x && y == other.y && weight == other.weight && n_elements == other.n_elements
        end
      end

      private_constant :UnionFind, :Node

      # Build the merge hierarchy from the distance matrix and cut it into n_clusters labels.
      def partial_fit(distance_mat)
        mst = minimum_spanning_tree(distance_mat)
        @hierarchy = single_linkage_hierarchy(mst)
        flatten(@hierarchy, @params[:n_clusters])
      end

      # Grow a spanning tree from node 0 over the complete graph given as a distance matrix.
      # Each step drops the current node from the candidates, keeps for every remaining candidate
      # the smallest distance seen so far, and moves on to the closest candidate.
      # Returns the edges sorted by ascending weight.
      def minimum_spanning_tree(complete_graph)
        n_samples = complete_graph.shape[0]
        n_edges = n_samples - 1
        curr_weights = Numo::DFloat.zeros(n_samples) + Float::INFINITY
        curr_labels = Numo::Int32.new(n_samples).seq
        next_node = 0
        mst = Array.new(n_edges) do
          curr_node = next_node
          target = curr_labels.ne(curr_node)
          curr_labels = curr_labels[target]
          curr_weights = Numo::DFloat.minimum(curr_weights[target], complete_graph[curr_node, curr_labels])
          next_node = curr_labels[curr_weights.min_index]
          weight = curr_weights.min
          Node.new(x: curr_node, y: next_node, weight: weight)
        end
        mst.sort! { |a, b| a.weight <=> b.weight }
      end

      # Convert the weight-sorted spanning tree edges into merge records.
      # Each record holds the two merged cluster ids (smaller id first), the edge weight,
      # and the size of the resulting cluster.
      def single_linkage_hierarchy(mst)
        n_edges = mst.size
        n_nodes = n_edges + 1
        uf = UnionFind.new(n_nodes)
        Array.new(n_edges) do |n|
          x_root = uf.find(mst[n].x)
          y_root = uf.find(mst[n].y)
          x_root, y_root = y_root, x_root unless x_root < y_root
          weight = mst[n].weight
          n_samples = uf.union(x_root, y_root)
          Node.new(x: x_root, y: y_root, weight: weight, n_elements: n_samples)
        end
      end

      # Collect the sample ids below start_node. Ids smaller than n_samples are samples;
      # an id of n_samples or more refers to the merge record at index (id - n_samples).
      def descedent_ids(hierarchy_, start_node)
        n_samples = hierarchy_.size + 1
        return [start_node] if start_node < n_samples

        res = []
        stack = [start_node]
        until stack.empty?
          idx = stack.pop
          if idx < n_samples
            res.push(idx)
          else
            stack.push(hierarchy_[idx - n_samples].x)
            stack.push(hierarchy_[idx - n_samples].y)
          end
        end
        res
      end

      # Cut the hierarchy into flat cluster labels by repeatedly splitting the cluster with the largest id.
      def flatten(hierarchy_, n_clusters)
        n_samples = hierarchy_.size + 1
        # An empty hierarchy (a single sample) has no merge to split, so everything is cluster 0.
        return Numo::Int32.zeros(n_samples) if n_clusters < 2 || hierarchy_.empty?

        # Ids are stored negated so that an ascending sort places the largest id first.
        # The starting id is one past the larger child of the final merge.
        nodes = [-([hierarchy_[-1].x, hierarchy_[-1].y].max + 1)]
        (n_clusters - 1).times do
          children = hierarchy_[-nodes[0] - n_samples]
          nodes.push(-children.x)
          nodes.push(-children.y)
          # After sorting, the first entry is the cluster that was just split; drop it.
          nodes.sort!.shift
        end
        res = Numo::Int32.zeros(n_samples)
        nodes.each_with_index { |sid, cluster_id| res[descedent_ids(hierarchy_, -sid)] = cluster_id }
        res
      end
    end
  end
end
@@ -1,76 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- require 'rumale/pairwise_metric'
4
- require 'rumale/clustering/dbscan'
5
-
6
module Rumale
  module Clustering
    # SNN is a class that implements Shared Nearest Neighbor cluster analysis.
    # The SNN method is a variation of DBSCAN that uses similarity based on k-nearest neighbors as a metric.
    #
    # @example
    #   analyzer = Rumale::Clustering::SNN.new(n_neighbors: 10, eps: 5, min_samples: 5)
    #   cluster_labels = analyzer.fit_predict(samples)
    #
    # *Reference*
    # - Ertoz, L., Steinbach, M., and Kumar, V., "Finding Clusters of Different Sizes, Shapes, and Densities in Noisy, High Dimensional Data," Proc. SDM'03, pp. 47--58, 2003.
    # - Houle, M E., Kriegel, H-P., Kroger, P., Schubert, E., and Zimek, A., "Can Shared-Neighbor Distances Defeat the Curse of Dimensionality?," Proc. SSDBM'10, pp. 482--500, 2010.
    class SNN < DBSCAN
      # Create a new cluster analyzer with Shared Nearest Neighbor method.
      #
      # @param n_neighbors [Integer] The number of neighbors to be used for finding k-nearest neighbors.
      # @param eps [Integer] The threshold value for finding connected components based on similarity.
      # @param min_samples [Integer] The number of neighbor samples to be used for the criterion whether a point is a core point.
      # @param metric [String] The metric to calculate the distances.
      #   If metric is 'euclidean', Euclidean distance is calculated for distance between points.
      #   If metric is 'precomputed', the fit and fit_predict methods expect to be given a distance matrix.
      #   Any value other than 'precomputed' falls back to 'euclidean'.
      def initialize(n_neighbors: 10, eps: 5, min_samples: 5, metric: 'euclidean')
        # eps is compared against similarity counts in region_query, so it is type-checked with the other numeric parameters.
        check_params_numeric(n_neighbors: n_neighbors, eps: eps, min_samples: min_samples)
        check_params_string(metric: metric)
        @params = {}
        @params[:n_neighbors] = n_neighbors
        @params[:eps] = eps
        @params[:min_samples] = min_samples
        @params[:metric] = metric == 'precomputed' ? 'precomputed' : 'euclidean'
        @core_sample_ids = nil
        @labels = nil
      end

      # Analysis clusters with given training data.
      # Delegates to the DBSCAN implementation.
      #
      # @overload fit(x) -> SNN
      #   @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for cluster analysis.
      #     If the metric is 'precomputed', x must be a square distance matrix (shape: [n_samples, n_samples]).
      #   @return [SNN] The learned cluster analyzer itself.
      def fit(x, _y = nil)
        super
      end

      # Analysis clusters and assign samples to clusters.
      # Delegates to the DBSCAN implementation; redefined here so that the documentation refers to SNN.
      #
      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to be used for cluster analysis.
      #   If the metric is 'precomputed', x must be a square distance matrix (shape: [n_samples, n_samples]).
      # @return [Numo::Int32] (shape: [n_samples]) Predicted cluster label per sample.
      def fit_predict(x) # rubocop:disable Lint/UselessMethodDefinition
        super
      end

      private

      # Build the shared-nearest-neighbor similarity matrix.
      # Row n of the adjacency matrix marks the n_neighbors samples with the smallest distance to sample n.
      # The product of that matrix with its transpose then holds, for every pair of samples,
      # the number of marked neighbors the two have in common.
      # NOTE(review): presumably called by the DBSCAN fitting logic in place of a distance matrix - confirm against DBSCAN.
      def calc_pairwise_metrics(x)
        distance_mat = @params[:metric] == 'precomputed' ? x : Rumale::PairwiseMetric.euclidean_distance(x)
        n_samples = distance_mat.shape[0]
        adjacency_mat = Numo::DFloat.zeros(n_samples, n_samples)
        n_samples.times do |n|
          neighbor_ids = distance_mat[n, true].sort_index[0...@params[:n_neighbors]]
          adjacency_mat[n, neighbor_ids] = 1
        end
        adjacency_mat.dot(adjacency_mat.transpose)
      end

      # Return the indices whose similarity is strictly greater than eps.
      def region_query(similarity_arr)
        similarity_arr.gt(@params[:eps]).where.to_a
      end
    end
  end
end
@@ -1,115 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- require 'rumale/base/base_estimator'
4
- require 'rumale/base/cluster_analyzer'
5
- require 'rumale/pairwise_metric'
6
- require 'rumale/preprocessing/l2_normalizer'
7
-
8
module Rumale
  module Clustering
    # SpectralClustering is a class that implements the normalized spectral clustering.
    #
    # @example
    #   require 'numo/linalg/autoloader'
    #
    #   analyzer = Rumale::Clustering::SpectralClustering.new(n_clusters: 10, gamma: 8.0)
    #   cluster_labels = analyzer.fit_predict(samples)
    #
    # *Reference*
    # - Ng, A Y., Jordan, M I., and Weiss, Y., "On Spectral Clustering: Analysis and an algorithm," Proc. NIPS'01, pp. 849--856, 2001.
    # - von Luxburg, U., "A tutorial on spectral clustering," Statistics and Computing, Vol. 17 (4), pp. 395--416, 2007.
    class SpectralClustering
      include Base::BaseEstimator
      include Base::ClusterAnalyzer

      # Return the data in embedded space.
      # @return [Numo::DFloat] (shape: [n_samples, n_clusters])
      attr_reader :embedding

      # Return the cluster labels.
      # @return [Numo::Int32] (shape: [n_samples])
      attr_reader :labels

      # Create a new cluster analyzer with normalized spectral clustering.
      #
      # @param n_clusters [Integer] The number of clusters.
      # @param affinity [String] The representation of affinity matrix ('rbf' or 'precomputed').
      #   If affinity = 'rbf', the class performs the normalized spectral clustering with the fully connected graph weighted by rbf kernel.
      # @param gamma [Float] The parameter of rbf kernel, if nil it is 1 / n_features.
      #   If affinity = 'precomputed', this parameter is ignored.
      # @param init [String] The initialization method for centroids of K-Means clustering ('random' or 'k-means++').
      #   Any value other than 'random' falls back to 'k-means++'.
      # @param max_iter [Integer] The maximum number of iterations for K-Means clustering.
      # @param tol [Float] The tolerance of termination criterion for K-Means clustering.
      # @param random_seed [Integer] The seed value using to initialize the random generator.
      def initialize(n_clusters: 2, affinity: 'rbf', gamma: nil, init: 'k-means++', max_iter: 10, tol: 1.0e-8, random_seed: nil)
        check_params_numeric(n_clusters: n_clusters, max_iter: max_iter, tol: tol)
        check_params_numeric_or_nil(gamma: gamma, random_seed: random_seed)
        check_params_string(affinity: affinity, init: init)
        check_params_positive(n_clusters: n_clusters, max_iter: max_iter, tol: tol)
        @params = {}
        @params[:n_clusters] = n_clusters
        @params[:affinity] = affinity
        @params[:gamma] = gamma
        @params[:init] = init == 'random' ? 'random' : 'k-means++'
        @params[:max_iter] = max_iter
        @params[:tol] = tol
        @params[:random_seed] = random_seed
        # Fall back to a generated seed so that the K-Means step is always given a concrete value.
        @params[:random_seed] ||= srand
        @embedding = nil
        @labels = nil
      end

      # Analysis clusters with given training data.
      # To execute this method, Numo::Linalg must be loaded.
      #
      # @overload fit(x) -> SpectralClustering
      #   @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for cluster analysis.
      #     If the affinity is 'precomputed', x must be a square affinity matrix (shape: [n_samples, n_samples]).
      #   @raise [ArgumentError] If the affinity is 'precomputed' and x is not square.
      #   @raise [RuntimeError] If Numo::Linalg is not loaded.
      #   @return [SpectralClustering] The learned cluster analyzer itself.
      def fit(x, _y = nil)
        x = check_convert_sample_array(x)
        raise ArgumentError, 'Expect the input affinity matrix to be square.' if @params[:affinity] == 'precomputed' && x.shape[0] != x.shape[1]
        raise 'SpectralClustering#fit requires Numo::Linalg but that is not loaded.' unless enable_linalg?

        fit_predict(x)
        self
      end

      # Analysis clusters and assign samples to clusters.
      # To execute this method, Numo::Linalg must be loaded.
      #
      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for cluster analysis.
      #   If the affinity is 'precomputed', x must be a square affinity matrix (shape: [n_samples, n_samples]).
      # @raise [ArgumentError] If the affinity is 'precomputed' and x is not square.
      # @raise [RuntimeError] If Numo::Linalg is not loaded.
      # @return [Numo::Int32] (shape: [n_samples]) Predicted cluster label per sample.
      def fit_predict(x)
        x = check_convert_sample_array(x)
        raise ArgumentError, 'Expect the input affinity matrix to be square.' if @params[:affinity] == 'precomputed' && x.shape[0] != x.shape[1]
        raise 'SpectralClustering#fit_predict requires Numo::Linalg but that is not loaded.' unless enable_linalg?

        # The constructor stores the option under :affinity, so that is the key to test here.
        # A precomputed matrix is copied because embedded_space overwrites its diagonal in place.
        affinity_mat = @params[:affinity] == 'precomputed' ? x.dup : Rumale::PairwiseMetric.rbf_kernel(x, nil, @params[:gamma])
        @embedding = embedded_space(affinity_mat, @params[:n_clusters])
        normalized_embedding = Rumale::Preprocessing::L2Normalizer.new.fit_transform(@embedding)
        @labels = kmeans_clustering(normalized_embedding)
      end

      private

      # Compute the spectral embedding of the affinity matrix.
      # The diagonal of affinity_mat is set to zero in place, then the matrix is scaled on both
      # sides by the inverse square root of its row sums. The eigenvectors selected by the last
      # n_clusters indices of the eigenvalue range are returned with their column order reversed.
      def embedded_space(affinity_mat, n_clusters)
        affinity_mat[affinity_mat.diag_indices] = 0.0
        degrees = 1.0 / Numo::NMath.sqrt(affinity_mat.sum(axis: 1))
        laplacian_mat = degrees.diag.dot(affinity_mat).dot(degrees.diag)

        n_samples = affinity_mat.shape[0]
        _, eig_vecs = Numo::Linalg.eigh(laplacian_mat, vals_range: (n_samples - n_clusters)...n_samples)
        eig_vecs.reverse(1).dup
      end

      # Cluster the rows of the normalized embedding with K-Means.
      def kmeans_clustering(x)
        Rumale::Clustering::KMeans.new(
          n_clusters: @params[:n_clusters], init: @params[:init],
          max_iter: @params[:max_iter], tol: @params[:tol], random_seed: @params[:random_seed]
        ).fit_predict(x)
      end
    end
  end
end
- end