clusterkit 0.1.0.pre.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (49)
  1. checksums.yaml +7 -0
  2. data/.rspec +3 -0
  3. data/.simplecov +47 -0
  4. data/CHANGELOG.md +35 -0
  5. data/CLAUDE.md +226 -0
  6. data/Cargo.toml +8 -0
  7. data/Gemfile +17 -0
  8. data/IMPLEMENTATION_NOTES.md +143 -0
  9. data/LICENSE.txt +21 -0
  10. data/PYTHON_COMPARISON.md +183 -0
  11. data/README.md +499 -0
  12. data/Rakefile +245 -0
  13. data/clusterkit.gemspec +45 -0
  14. data/docs/KNOWN_ISSUES.md +130 -0
  15. data/docs/RUST_ERROR_HANDLING.md +164 -0
  16. data/docs/TEST_FIXTURES.md +170 -0
  17. data/docs/UMAP_EXPLAINED.md +362 -0
  18. data/docs/UMAP_TROUBLESHOOTING.md +284 -0
  19. data/docs/VERBOSE_OUTPUT.md +84 -0
  20. data/examples/hdbscan_example.rb +147 -0
  21. data/examples/optimal_kmeans_example.rb +96 -0
  22. data/examples/pca_example.rb +114 -0
  23. data/examples/reproducible_umap.rb +99 -0
  24. data/examples/verbose_control.rb +43 -0
  25. data/ext/clusterkit/Cargo.toml +25 -0
  26. data/ext/clusterkit/extconf.rb +4 -0
  27. data/ext/clusterkit/src/clustering/hdbscan_wrapper.rs +115 -0
  28. data/ext/clusterkit/src/clustering.rs +267 -0
  29. data/ext/clusterkit/src/embedder.rs +413 -0
  30. data/ext/clusterkit/src/lib.rs +22 -0
  31. data/ext/clusterkit/src/svd.rs +112 -0
  32. data/ext/clusterkit/src/tests.rs +16 -0
  33. data/ext/clusterkit/src/utils.rs +33 -0
  34. data/lib/clusterkit/clustering/hdbscan.rb +177 -0
  35. data/lib/clusterkit/clustering.rb +213 -0
  36. data/lib/clusterkit/clusterkit.rb +9 -0
  37. data/lib/clusterkit/configuration.rb +24 -0
  38. data/lib/clusterkit/dimensionality/pca.rb +251 -0
  39. data/lib/clusterkit/dimensionality/svd.rb +144 -0
  40. data/lib/clusterkit/dimensionality/umap.rb +311 -0
  41. data/lib/clusterkit/dimensionality.rb +29 -0
  42. data/lib/clusterkit/hdbscan_api_design.rb +142 -0
  43. data/lib/clusterkit/preprocessing.rb +106 -0
  44. data/lib/clusterkit/silence.rb +42 -0
  45. data/lib/clusterkit/utils.rb +51 -0
  46. data/lib/clusterkit/version.rb +5 -0
  47. data/lib/clusterkit.rb +93 -0
  48. data/lib/tasks/visualize.rake +641 -0
  49. metadata +194 -0
data/lib/clusterkit/dimensionality/svd.rb
@@ -0,0 +1,144 @@
+ # frozen_string_literal: true
+
+ require_relative '../clusterkit'
+
+ module ClusterKit
+   module Dimensionality
+     # Singular Value Decomposition
+     # Decomposes a matrix into U, S, V^T components
+     class SVD
+       attr_reader :n_components, :n_iter, :random_seed
+       attr_reader :u, :s, :vt
+
+       # Initialize a new SVD instance
+       # @param n_components [Integer] Number of components to compute
+       # @param n_iter [Integer] Number of iterations for randomized algorithm (default: 2)
+       # @param random_seed [Integer, nil] Random seed for reproducibility
+       def initialize(n_components: nil, n_iter: 2, random_seed: nil)
+         @n_components = n_components
+         @n_iter = n_iter
+         @random_seed = random_seed
+         @fitted = false
+       end
+
+       # Fit the model and transform data in one step
+       # @param data [Array<Array<Numeric>>] Input data
+       # @return [Array] Returns [U, S, Vt] matrices
+       def fit_transform(data)
+         validate_input(data)
+
+         # Store reference to original data for transform detection
+         @original_data_id = data.object_id
+
+         # Determine n_components if not set
+         n_comp = @n_components || [data.size, data.first.size].min
+
+         # Call the Rust implementation
+         @u, @s, @vt = self.class.randomized_svd(data, n_comp, n_iter: @n_iter)
+         @fitted = true
+
+         [@u, @s, @vt]
+       end
+
+       # Fit the model to data
+       # @param data [Array<Array<Numeric>>] Input data
+       # @return [self]
+       def fit(data)
+         fit_transform(data)
+         self
+       end
+
+       # Get the U matrix (left singular vectors)
+       # @return [Array<Array<Float>>] U matrix
+       def components_u
+         raise RuntimeError, "Model must be fitted first" unless fitted?
+         @u
+       end
+
+       # Get the singular values
+       # @return [Array<Float>] Singular values
+       def singular_values
+         raise RuntimeError, "Model must be fitted first" unless fitted?
+         @s
+       end
+
+       # Get the V^T matrix (right singular vectors, transposed)
+       # @return [Array<Array<Float>>] V^T matrix
+       def components_vt
+         raise RuntimeError, "Model must be fitted first" unless fitted?
+         @vt
+       end
+
+       # Check if the model has been fitted
+       # @return [Boolean]
+       def fitted?
+         @fitted
+       end
+
+       # Transform data using fitted SVD (project onto components)
+       # @param data [Array<Array<Numeric>>] Data to transform
+       # @return [Array<Array<Float>>] Transformed data (U * S)
+       def transform(data)
+         raise RuntimeError, "Model must be fitted first" unless fitted?
+         validate_input(data)
+
+         # For SVD, transform typically means projecting onto the components
+         # This is equivalent to data * V (or data * V^T.T)
+         # But for dimensionality reduction, we usually want U * S
+         # which is already computed in fit_transform
+
+         # If transforming new data, we'd need to project it
+         # For now, return U * S for the fitted data
+         if data.object_id == @original_data_id
+           # Same data that was fitted - return U * S
+           @u.map.with_index do |row, i|
+             row.map.with_index { |val, j| val * @s[j] }
+           end
+         else
+           # New data - would need proper projection
+           raise NotImplementedError, "Transform for new data not yet implemented"
+         end
+       end
+
+       # Inverse transform (reconstruct from components)
+       # @param transformed_data [Array<Array<Float>>] Transformed data
+       # @return [Array<Array<Float>>] Reconstructed data
+       def inverse_transform(transformed_data)
+         raise RuntimeError, "Model must be fitted first" unless fitted?
+
+         # Reconstruction: (U * S) * V^T
+         # transformed_data should be U * S
+         # We multiply by V^T to reconstruct
+
+         result = []
+         transformed_data.each do |row|
+           reconstructed = Array.new(@vt.first.size, 0.0)
+           row.each_with_index do |val, i|
+             @vt[i].each_with_index do |v, j|
+               reconstructed[j] += val * v
+             end
+           end
+           result << reconstructed
+         end
+         result
+       end
+
+       # Class method for randomized SVD (kept for compatibility)
+       # @param matrix [Array<Array<Numeric>>] Input matrix
+       # @param k [Integer] Number of components
+       # @param n_iter [Integer] Number of iterations
+       # @return [Array] Returns [U, S, Vt]
+       def self.randomized_svd(matrix, k, n_iter: 2)
+         ::ClusterKit::SVD.randomized_svd_rust(matrix, k, n_iter)
+       end
+
+       private
+
+       def validate_input(data)
+         raise ArgumentError, "Input must be an array" unless data.is_a?(Array)
+         raise ArgumentError, "Input cannot be empty" if data.empty?
+         raise ArgumentError, "Input must be a 2D array" unless data.first.is_a?(Array)
+       end
+     end
+   end
+ end
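
A minimal usage sketch of the SVD class above, assuming the gem is installed and required via its top-level entry point (data/lib/clusterkit.rb in the file list); the 4x3 input matrix is invented for illustration:

require 'clusterkit'

# Hypothetical 4x3 input; any non-empty 2D numeric array works
data = [
  [1.0, 2.0, 3.0],
  [4.0, 5.0, 6.0],
  [7.0, 8.0, 9.1],
  [10.0, 11.0, 12.2]
]

svd = ClusterKit::Dimensionality::SVD.new(n_components: 2)
u, s, vt = svd.fit_transform(data)        # U is 4x2, S has 2 values, Vt is 2x3

reduced = svd.transform(data)             # U * S, for the same object that was fitted
restored = svd.inverse_transform(reduced) # (U * S) * V^T, approximately the original matrix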
data/lib/clusterkit/dimensionality/umap.rb
@@ -0,0 +1,311 @@
+ # frozen_string_literal: true
+
+ require 'fileutils'
+ require 'json'
+ require_relative '../configuration'
+ require_relative '../silence'
+
+ module ClusterKit
+   module Dimensionality
+     class UMAP
+       attr_reader :n_components, :n_neighbors, :random_seed, :nb_grad_batch, :nb_sampling_by_edge
+
+       # Initialize a new UMAP instance
+       # @param n_components [Integer] Target number of dimensions (default: 2)
+       # @param n_neighbors [Integer] Number of neighbors for manifold approximation (default: 15)
+       # @param random_seed [Integer, nil] Random seed for reproducibility (default: nil)
+       # @param nb_grad_batch [Integer] Number of gradient descent batches (default: 10)
+       #   Controls training iterations - lower = faster but less accurate
+       # @param nb_sampling_by_edge [Integer] Number of negative samples per edge (default: 8)
+       #   Controls sampling quality - lower = faster but less accurate
+       def initialize(n_components: 2, n_neighbors: 15, random_seed: nil,
+                      nb_grad_batch: 10, nb_sampling_by_edge: 8)
+         @n_components = n_components
+         @n_neighbors = n_neighbors
+         @random_seed = random_seed
+         @nb_grad_batch = nb_grad_batch
+         @nb_sampling_by_edge = nb_sampling_by_edge
+         @fitted = false
+         # Don't create RustUMAP yet - it is created in fit/fit_transform with adjusted parameters
+         @rust_umap = nil
+       end
+
+       # Fit the model to the data (training)
+       # @param data [Array<Array<Numeric>>] Training data as 2D array
+       # @return [self] Returns self for method chaining
+       # @note UMAP's training process inherently produces embeddings. Since the
+       #   underlying Rust implementation doesn't separate training from
+       #   transformation, we call fit_transform but discard the embeddings.
+       #   Use fit_transform if you need both training and the transformed data.
+       def fit(data)
+         validate_input(data)
+
+         # Always recreate RustUMAP for fit to ensure a fresh fit
+         @rust_umap = nil
+         create_rust_umap_with_adjusted_params(data)
+
+         # UMAP doesn't separate training from transformation internally,
+         # so we call fit_transform but discard the result
+         begin
+           Silence.maybe_silence do
+             @rust_umap.fit_transform(data)
+           end
+           @fitted = true
+           self
+         rescue StandardError => e
+           handle_umap_error(e, data)
+         rescue Exception => e
+           # Handle fatal errors that aren't StandardError (a bare rescue would not catch them)
+           handle_umap_error(RuntimeError.new(e.message), data)
+         end
+       end
+
+       # Transform data using the fitted model
+       # @param data [Array<Array<Numeric>>] Data to transform
+       # @return [Array<Array<Float>>] Transformed data in reduced dimensions
+       # @raise [RuntimeError] If model hasn't been fitted yet
+       def transform(data)
+         raise RuntimeError, "Model must be fitted before transform. Call fit or fit_transform first." unless fitted?
+         validate_input(data, check_min_samples: false)
+         Silence.maybe_silence do
+           @rust_umap.transform(data)
+         end
+       end
+
+       # Fit the model and transform the data in one step
+       # @param data [Array<Array<Numeric>>] Training data as 2D array
+       # @return [Array<Array<Float>>] Transformed data in reduced dimensions
+       def fit_transform(data)
+         validate_input(data)
+
+         # Always recreate RustUMAP for fit_transform to ensure a fresh fit
+         @rust_umap = nil
+         create_rust_umap_with_adjusted_params(data)
+
+         begin
+           result = Silence.maybe_silence do
+             @rust_umap.fit_transform(data)
+           end
+           @fitted = true
+           result
+         rescue StandardError => e
+           handle_umap_error(e, data)
+         rescue Exception => e
+           # Handle fatal errors that aren't StandardError (a bare rescue would not catch them)
+           handle_umap_error(RuntimeError.new(e.message), data)
+         end
+       end
+
+       # Check if the model has been fitted
+       # @return [Boolean] true if model is fitted, false otherwise
+       def fitted?
+         @fitted
+       end
+
+       # Save the fitted model to a file
+       # @param path [String] Path where to save the model
+       # @raise [RuntimeError] If model hasn't been fitted yet
+       def save(path)
+         raise RuntimeError, "No model to save. Call fit or fit_transform first." unless fitted?
+
+         # Ensure directory exists
+         dir = File.dirname(path)
+         FileUtils.mkdir_p(dir) unless dir == '.' || dir == '/'
+
+         @rust_umap.save_model(path)
+       end
+
+       # Load a fitted model from a file
+       # @param path [String] Path to the saved model
+       # @return [UMAP] A new UMAP instance with the loaded model
+       # @raise [ArgumentError] If file doesn't exist
+       def self.load(path)
+         raise ArgumentError, "File not found: #{path}" unless File.exist?(path)
+
+         # Load the Rust model
+         rust_umap = ::ClusterKit::RustUMAP.load_model(path)
+
+         # Create a new UMAP instance with the loaded model
+         instance = allocate
+         instance.instance_variable_set(:@rust_umap, rust_umap)
+         instance.instance_variable_set(:@fitted, true)
+         # The model file should contain these parameters, but we can't read them back yet
+         instance.instance_variable_set(:@n_components, nil)
+         instance.instance_variable_set(:@n_neighbors, nil)
+         instance.instance_variable_set(:@random_seed, nil)
+
+         instance
+       end
+
+       # Export transformed data to JSON (utility method for caching)
+       # @param data [Array<Array<Float>>] Transformed data to export
+       # @param path [String] Path where to save the data
+       def self.export_data(data, path)
+         File.write(path, JSON.pretty_generate(data))
+       end
+
+       # Import transformed data from JSON (utility method for caching)
+       # @param path [String] Path to the saved data
+       # @return [Array<Array<Float>>] The loaded data
+       def self.import_data(path)
+         JSON.parse(File.read(path))
+       end
+
+       private
+
+       def handle_umap_error(error, data)
+         error_msg = error.message
+         n_samples = data.size
+
+         case error_msg
+         when /isolated point/i, /graph will not be connected/i
+           raise ::ClusterKit::IsolatedPointError, <<~MSG
+             UMAP found isolated points in your data that are too far from other points.
+
+             This typically happens when:
+             • Your data contains outliers that are very different from other points
+             • You're using random data without inherent structure
+             • The n_neighbors parameter (#{@n_neighbors}) is too high for your data distribution
+
+             Solutions:
+             1. Reduce n_neighbors (try 5 or even 3): UMAP.new(n_neighbors: 5)
+             2. Remove outliers from your data before applying UMAP
+             3. Ensure your data has some structure (not purely random)
+             4. For small datasets (< 50 points), consider using PCA instead
+
+             Your data: #{n_samples} samples, #{data.first&.size || 0} dimensions
+           MSG
+
+         when /assertion failed.*box_size/i
+           raise ::ClusterKit::ConvergenceError, <<~MSG
+             UMAP failed to converge due to numerical instability in your data.
+
+             This typically happens when:
+             • Data points are too spread out or have extreme values
+             • The scale of different features varies wildly
+             • There are duplicate or nearly-duplicate points
+
+             Solutions:
+             1. Normalize your data first: ClusterKit::Preprocessing.normalize(data)
+             2. Use a smaller n_neighbors value: UMAP.new(n_neighbors: 5)
+             3. Check for and remove duplicate points
+             4. Scale your data to a reasonable range (e.g., 0-1 or -1 to 1)
+
+             Your data: #{n_samples} samples, #{data.first&.size || 0} dimensions
+           MSG
+
+         when /n_neighbors.*larger than/i, /too many neighbors/i
+           raise ::ClusterKit::InvalidParameterError, <<~MSG
+             The n_neighbors parameter (#{@n_neighbors}) is too large for your dataset size (#{n_samples}).
+
+             UMAP needs n_neighbors to be less than the number of samples.
+             Suggested value: #{[5, (n_samples * 0.1).to_i].max}
+
+             This should have been auto-adjusted. If you're seeing this error, please report it.
+           MSG
+
+         else
+           # For unknown errors, still provide some guidance
+           raise ::ClusterKit::Error, <<~MSG
+             UMAP encountered an error: #{error_msg}
+
+             Common solutions:
+             1. Try reducing n_neighbors (current: #{@n_neighbors})
+             2. Normalize your data first
+             3. Check for NaN or infinite values in your data
+             4. Ensure you have at least 10 data points
+
+             If this persists, consider using PCA for dimensionality reduction instead.
+           MSG
+         end
+       end
+
+       def validate_input(data, check_min_samples: true)
+         raise ArgumentError, "Input must be an array" unless data.is_a?(Array)
+         raise ArgumentError, "Input cannot be empty" if data.empty?
+
+         first_row = data.first
+         raise ArgumentError, "Input must be a 2D array (array of arrays)" unless first_row.is_a?(Array)
+
+         row_length = first_row.length
+         min_val = Float::INFINITY
+         max_val = -Float::INFINITY
+
+         # First validate data structure and types
+         data.each_with_index do |row, i|
+           unless row.is_a?(Array)
+             raise ArgumentError, "Row #{i} is not an array"
+           end
+
+           if row.length != row_length
+             raise ArgumentError, "All rows must have the same length (row #{i} has #{row.length} elements, expected #{row_length})"
+           end
+
+           row.each_with_index do |val, j|
+             unless val.is_a?(Numeric)
+               raise ArgumentError, "Element at position [#{i}, #{j}] is not numeric"
+             end
+
+             # Only check for NaN/Infinite on floats
+             if val.is_a?(Float) && (val.nan? || val.infinite?)
+               raise ArgumentError, "Element at position [#{i}, #{j}] is NaN or Infinite"
+             end
+
+             # Track data range
+             val_f = val.to_f
+             min_val = val_f if val_f < min_val
+             max_val = val_f if val_f > max_val
+           end
+         end
+
+         # Check for sufficient data points after validating structure (only for fit operations)
+         if check_min_samples && data.size < 10
+           raise ::ClusterKit::InsufficientDataError, <<~MSG
+             UMAP requires at least 10 data points, but only #{data.size} provided.
+
+             For small datasets, consider:
+             1. Using PCA instead: ClusterKit::Dimensionality::PCA.new(n_components: 2)
+             2. Collecting more data points
+             3. Using simpler visualization methods
+           MSG
+         end
+
+         # Check for extreme data ranges that might cause numerical issues
+         data_range = max_val - min_val
+         if data_range > 1000
+           warn "WARNING: Large data range detected (#{data_range.round(2)}). Consider normalizing your data to prevent numerical instability."
+         end
+       end
+
+       def create_rust_umap_with_adjusted_params(data)
+         # Only create if not already created
+         return if @rust_umap
+
+         n_samples = data.size
+
+         # Automatically adjust n_neighbors if it's too high for the dataset;
+         # n_neighbors must be less than n_samples.
+         # Use a reasonable default: min(15, n_samples / 4) but at least 2
+         max_neighbors = [n_samples - 1, 2].max # At least 2, but less than n_samples
+         suggested_neighbors = [[15, n_samples / 4].min.to_i, 2].max
+
+         adjusted_n_neighbors = @n_neighbors
+         if @n_neighbors > max_neighbors
+           adjusted_n_neighbors = [suggested_neighbors, max_neighbors].min
+
+           if ::ClusterKit.configuration.verbose
+             warn "UMAP: Adjusted n_neighbors from #{@n_neighbors} to #{adjusted_n_neighbors} for dataset with #{n_samples} samples"
+           end
+         end
+
+         @rust_umap = ::ClusterKit::RustUMAP.new({
+           n_components: @n_components,
+           n_neighbors: adjusted_n_neighbors,
+           random_seed: @random_seed,
+           nb_grad_batch: @nb_grad_batch,
+           nb_sampling_by_edge: @nb_sampling_by_edge
+         })
+       end
+     end
+   end
+ end
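
A short sketch of the UMAP lifecycle defined above (fit_transform, save, load, transform). The random 50x20 input and the model path are invented for illustration; as the error messages above note, purely random data can trigger IsolatedPointError, which the rescue below handles:

require 'clusterkit'

data = Array.new(50) { Array.new(20) { rand } }  # hypothetical 50 points in 20 dimensions

umap = ClusterKit::Dimensionality::UMAP.new(n_components: 2, n_neighbors: 5, random_seed: 42)
begin
  embedding = umap.fit_transform(data)           # 50 rows of [x, y]
  umap.save('tmp/umap_model.bin')                # persist the fitted Rust model

  reloaded = ClusterKit::Dimensionality::UMAP.load('tmp/umap_model.bin')
  more = reloaded.transform(Array.new(5) { Array.new(20) { rand } })
rescue ClusterKit::IsolatedPointError, ClusterKit::ConvergenceError => e
  warn e.message  # both messages suggest remedies (smaller n_neighbors, normalization)
end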
data/lib/clusterkit/dimensionality.rb
@@ -0,0 +1,29 @@
+ # frozen_string_literal: true
+
+ module ClusterKit
+   # Module for dimensionality reduction algorithms
+   module Dimensionality
+     # Load classes eagerly - autoload doesn't work well with these require_relative dependencies
+     require_relative "dimensionality/umap"
+     require_relative "dimensionality/pca"
+     require_relative "dimensionality/svd"
+
+     # Module-level evaluation methods
+
+     # Calculate reconstruction error for a dimensionality reduction
+     # @param original_data [Array<Array<Numeric>>] Original high-dimensional data
+     # @param reconstructed_data [Array<Array<Numeric>>] Reconstructed data
+     # @return [Float] Mean squared reconstruction error
+     def self.reconstruction_error(original_data, reconstructed_data)
+       raise ArgumentError, "Data sizes don't match" if original_data.size != reconstructed_data.size
+
+       total_error = 0.0
+       original_data.zip(reconstructed_data).each do |orig, recon|
+         error = orig.zip(recon).map { |o, r| (o - r) ** 2 }.sum
+         total_error += error
+       end
+
+       total_error / original_data.size
+     end
+   end
+ end
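
reconstruction_error pairs naturally with the SVD round trip shown earlier; a sketch, reusing the same invented 4x3 matrix:

data = [[1.0, 2.0, 3.0], [4.0, 5.0, 6.0], [7.0, 8.0, 9.1], [10.0, 11.0, 12.2]]

svd = ClusterKit::Dimensionality::SVD.new(n_components: 2)
svd.fit(data)
restored = svd.inverse_transform(svd.transform(data))

# Mean squared error between the original points and their rank-2 reconstruction;
# close to zero when two components capture most of the matrix's structure
mse = ClusterKit::Dimensionality.reconstruction_error(data, restored)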
data/lib/clusterkit/hdbscan_api_design.rb
@@ -0,0 +1,142 @@
+ # API Design for HDBSCAN to match KMeans pattern
+
+ module ClusterKit
+   module Clustering
+
+     # HDBSCAN clustering algorithm - matching KMeans API pattern
+     class HDBSCAN
+       attr_reader :min_samples, :min_cluster_size, :labels, :probabilities,
+                   :outlier_scores, :cluster_persistence
+
+       # Initialize HDBSCAN clusterer (matches KMeans pattern)
+       # @param min_samples [Integer] Min neighborhood size for core points (default: 5)
+       # @param min_cluster_size [Integer] Minimum size of clusters (default: 5)
+       # @param metric [String] Distance metric (default: 'euclidean')
+       def initialize(min_samples: 5, min_cluster_size: 5, metric: 'euclidean')
+         raise ArgumentError, "min_samples must be positive" unless min_samples > 0
+         raise ArgumentError, "min_cluster_size must be positive" unless min_cluster_size > 0
+         @min_samples = min_samples
+         @min_cluster_size = min_cluster_size
+         @metric = metric
+         @fitted = false
+       end
+
+       # Fit the HDBSCAN model (matches KMeans.fit)
+       # @param data [Array] 2D array of data points
+       # @return [self] Returns self for method chaining
+       def fit(data)
+         validate_data(data)
+
+         # Call Rust implementation (hdbscan crate)
+         result = Clustering.hdbscan_rust(data, @min_samples, @min_cluster_size, @metric)
+
+         @labels = result[:labels]
+         @probabilities = result[:probabilities]
+         @outlier_scores = result[:outlier_scores]
+         @cluster_persistence = result[:cluster_persistence]
+         @fitted = true
+
+         self
+       end
+
+       # HDBSCAN doesn't support predict for new points (unlike KMeans),
+       # but we keep the method for API consistency
+       # @param data [Array] 2D array of data points
+       # @raise [NotImplementedError] Always raised; HDBSCAN cannot assign new points to clusters
+       def predict(data)
+         raise NotImplementedError, "HDBSCAN does not support prediction on new data. " \
+                                    "Use approximate_predict for approximate membership"
+       end
+
+       # Fit the model and return labels (matches KMeans.fit_predict)
+       # @param data [Array] 2D array of data points
+       # @return [Array] Cluster labels (-1 for noise)
+       def fit_predict(data)
+         fit(data)
+         @labels
+       end
+
+       # Check if model has been fitted (matches KMeans.fitted?)
+       # @return [Boolean] True if fitted
+       def fitted?
+         @fitted
+       end
+
+       # Get number of clusters found (similar to KMeans.k but discovered)
+       # @return [Integer] Number of clusters (excluding noise)
+       def n_clusters
+         return 0 unless fitted?
+         @labels.max + 1 rescue 0
+       end
+
+       # Get noise ratio (HDBSCAN-specific but follows naming pattern)
+       # @return [Float] Fraction of points labeled as noise
+       def noise_ratio
+         return 0.0 unless fitted?
+         @labels.count(-1).to_f / @labels.length
+       end
+
+       private
+
+       def validate_data(data)
+         # Exact same validation as KMeans for consistency
+         raise ArgumentError, "Data must be an array" unless data.is_a?(Array)
+         raise ArgumentError, "Data cannot be empty" if data.empty?
+         raise ArgumentError, "Data must be 2D array" unless data.first.is_a?(Array)
+
+         row_length = data.first.length
+         unless data.all? { |row| row.is_a?(Array) && row.length == row_length }
+           raise ArgumentError, "All rows must have the same length"
+         end
+
+         data.each_with_index do |row, i|
+           row.each_with_index do |val, j|
+             unless val.is_a?(Numeric)
+               raise ArgumentError, "Element at position [#{i}, #{j}] is not numeric"
+             end
+           end
+         end
+       end
+     end
+
+     # Module-level convenience methods (matching KMeans pattern)
+     class << self
+       # Perform HDBSCAN clustering (matches Clustering.kmeans signature)
+       # @param data [Array] 2D array of data points
+       # @param min_samples [Integer] Min neighborhood size for core points
+       # @param min_cluster_size [Integer] Minimum size of clusters
+       # @return [Hash] Result hash with :labels, :probabilities, :outlier_scores, :n_clusters, :noise_ratio
+       def hdbscan(data, min_samples: 5, min_cluster_size: 5)
+         clusterer = HDBSCAN.new(min_samples: min_samples, min_cluster_size: min_cluster_size)
+         clusterer.fit(data)
+         {
+           labels: clusterer.labels,
+           probabilities: clusterer.probabilities,
+           outlier_scores: clusterer.outlier_scores,
+           n_clusters: clusterer.n_clusters,
+           noise_ratio: clusterer.noise_ratio
+         }
+       end
+     end
+   end
+ end
+
+ # Usage comparison:
+
+ # KMeans usage:
+ kmeans = ClusterKit::Clustering::KMeans.new(k: 3)
+ kmeans.fit(data)
+ labels = kmeans.labels
+ # or
+ labels = kmeans.fit_predict(data)
+
+ # HDBSCAN usage (identical pattern):
+ hdbscan = ClusterKit::Clustering::HDBSCAN.new(min_samples: 5, min_cluster_size: 5)
+ hdbscan.fit(data)
+ labels = hdbscan.labels
+ # or
+ labels = hdbscan.fit_predict(data)
+
+ # Module-level convenience (both follow same pattern):
+ result = ClusterKit::Clustering.kmeans(data, 3)
+ result = ClusterKit::Clustering.hdbscan(data, min_samples: 5)
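
Continuing the usage comparison above, a sketch of consuming the module-level result hash, where label -1 marks noise (`data` is assumed to be a 2D numeric array, as elsewhere in this file):

result = ClusterKit::Clustering.hdbscan(data, min_samples: 5, min_cluster_size: 5)

# Group row indices by cluster label; the -1 bucket collects noise points
by_cluster = result[:labels].each_index.group_by { |i| result[:labels][i] }
noise_indices = by_cluster.delete(-1) || []

puts "#{result[:n_clusters]} clusters, #{(result[:noise_ratio] * 100).round(1)}% noise"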