clusterkit 0.3.0-x86_64-linux

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (59) hide show
  1. checksums.yaml +7 -0
  2. data/.rspec +3 -0
  3. data/.simplecov +47 -0
  4. data/CHANGELOG.md +35 -0
  5. data/CLAUDE.md +226 -0
  6. data/Cargo.lock +3228 -0
  7. data/Cargo.toml +8 -0
  8. data/Gemfile +17 -0
  9. data/IMPLEMENTATION_NOTES.md +143 -0
  10. data/LICENSE.txt +21 -0
  11. data/PYTHON_COMPARISON.md +183 -0
  12. data/README.md +744 -0
  13. data/Rakefile +259 -0
  14. data/docs/KNOWN_ISSUES.md +130 -0
  15. data/docs/RUST_ERROR_HANDLING.md +164 -0
  16. data/docs/TEST_FIXTURES.md +170 -0
  17. data/docs/UMAP_EXPLAINED.md +362 -0
  18. data/docs/UMAP_TROUBLESHOOTING.md +284 -0
  19. data/docs/VERBOSE_OUTPUT.md +84 -0
  20. data/docs/assets/clusterkit-wide.png +0 -0
  21. data/docs/assets/clusterkit.png +0 -0
  22. data/docs/assets/visualization.png +0 -0
  23. data/examples/hdbscan_example.rb +147 -0
  24. data/examples/optimal_kmeans_example.rb +96 -0
  25. data/examples/pca_example.rb +114 -0
  26. data/examples/reproducible_umap.rb +99 -0
  27. data/examples/verbose_control.rb +43 -0
  28. data/ext/clusterkit/Cargo.toml +26 -0
  29. data/ext/clusterkit/extconf.rb +23 -0
  30. data/ext/clusterkit/src/clustering/hdbscan_wrapper.rs +80 -0
  31. data/ext/clusterkit/src/clustering.rs +221 -0
  32. data/ext/clusterkit/src/embedder.rs +349 -0
  33. data/ext/clusterkit/src/hnsw.rs +579 -0
  34. data/ext/clusterkit/src/lib.rs +24 -0
  35. data/ext/clusterkit/src/svd.rs +89 -0
  36. data/ext/clusterkit/src/tests.rs +16 -0
  37. data/ext/clusterkit/src/utils.rs +183 -0
  38. data/lib/clusterkit/3.1/clusterkit.so +0 -0
  39. data/lib/clusterkit/3.2/clusterkit.so +0 -0
  40. data/lib/clusterkit/3.3/clusterkit.so +0 -0
  41. data/lib/clusterkit/3.4/clusterkit.so +0 -0
  42. data/lib/clusterkit/clustering/hdbscan.rb +164 -0
  43. data/lib/clusterkit/clustering.rb +194 -0
  44. data/lib/clusterkit/clusterkit.rb +14 -0
  45. data/lib/clusterkit/configuration.rb +24 -0
  46. data/lib/clusterkit/data_validator.rb +132 -0
  47. data/lib/clusterkit/dimensionality/pca.rb +251 -0
  48. data/lib/clusterkit/dimensionality/svd.rb +175 -0
  49. data/lib/clusterkit/dimensionality/umap.rb +282 -0
  50. data/lib/clusterkit/dimensionality.rb +29 -0
  51. data/lib/clusterkit/hdbscan_api_design.rb +142 -0
  52. data/lib/clusterkit/hnsw.rb +251 -0
  53. data/lib/clusterkit/preprocessing.rb +106 -0
  54. data/lib/clusterkit/silence.rb +42 -0
  55. data/lib/clusterkit/utils.rb +51 -0
  56. data/lib/clusterkit/version.rb +5 -0
  57. data/lib/clusterkit.rb +105 -0
  58. data/lib/tasks/visualize.rake +641 -0
  59. metadata +220 -0
@@ -0,0 +1,282 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'fileutils'
4
+ require 'json'
5
+ require_relative '../configuration'
6
+ require_relative '../silence'
7
+ require_relative '../data_validator'
8
+
9
+ module ClusterKit
10
+ module Dimensionality
11
+ class UMAP
12
+ attr_reader :n_components, :n_neighbors, :random_seed, :nb_grad_batch, :nb_sampling_by_edge
13
+
14
+ # Initialize a new UMAP instance
15
+ # @param n_components [Integer] Target number of dimensions (default: 2)
16
+ # @param n_neighbors [Integer] Number of neighbors for manifold approximation (default: 15)
17
+ # @param random_seed [Integer, nil] Random seed for reproducibility (default: nil)
18
+ # @param nb_grad_batch [Integer] Number of gradient descent batches (default: 10)
19
+ # Controls training iterations - lower = faster but less accurate
20
+ # @param nb_sampling_by_edge [Integer] Number of negative samples per edge (default: 8)
21
+ # Controls sampling quality - lower = faster but less accurate
22
def initialize(n_components: 2, n_neighbors: 15, random_seed: nil,
               nb_grad_batch: 10, nb_sampling_by_edge: 8)
  # Only the hyperparameters are recorded here. The native RustUMAP object is
  # built later by fit/fit_transform, once the dataset size is known and
  # n_neighbors can be adjusted to it.
  @rust_umap = nil
  @fitted = false

  @n_components, @n_neighbors, @random_seed = n_components, n_neighbors, random_seed
  @nb_grad_batch, @nb_sampling_by_edge = nb_grad_batch, nb_sampling_by_edge
end
33
+
34
+ # Fit the model to the data (training)
35
+ # @param data [Array<Array<Numeric>>] Training data as 2D array
36
+ # @return [self] Returns self for method chaining
37
+ # @note UMAP's training process inherently produces embeddings. Since the
38
+ # underlying Rust implementation doesn't separate training from
39
+ # transformation, we call fit_transform but discard the embeddings.
40
+ # Use fit_transform if you need both training and the transformed data.
41
def fit(data)
  # Trains the model on +data+ and returns self. The underlying object only
  # offers fit_transform, so the embedding it produces is discarded here.
  validate_input(data)

  # Start from a clean slate: a refit must not reuse the previous native
  # object, and a failed refit must not leave the instance flagged as fitted.
  @rust_umap = nil
  @fitted = false
  create_rust_umap_with_adjusted_params(data)

  begin
    Silence.maybe_silence { @rust_umap.fit_transform(data) }
    @fitted = true
    self
  rescue StandardError => e
    # A single StandardError clause is sufficient: a bare `rescue` is the same
    # thing, so a second clause after this one could never run.
    handle_umap_error(e, data)
  end
end
63
+
64
+ # Transform data using the fitted model
65
+ # @param data [Array<Array<Numeric>>] Data to transform
66
+ # @return [Array<Array<Float>>] Transformed data in reduced dimensions
67
+ # @raise [RuntimeError] If model hasn't been fitted yet
68
def transform(data)
  # Projects +data+ with the already-fitted native model. The minimum-sample
  # check is skipped for this call.
  unless fitted?
    raise RuntimeError, "Model must be fitted before transform. Call fit or fit_transform first."
  end

  validate_input(data, check_min_samples: false)
  Silence.maybe_silence { @rust_umap.transform(data) }
end
75
+
76
+ # Fit the model and transform the data in one step
77
+ # @param data [Array<Array<Numeric>>] Training data as 2D array
78
+ # @return [Array<Array<Float>>] Transformed data in reduced dimensions
79
def fit_transform(data)
  # Trains the model on +data+ and returns the resulting embedding.
  validate_input(data)

  # Start from a clean slate: a refit must not reuse the previous native
  # object, and a failed refit must not leave the instance flagged as fitted.
  @rust_umap = nil
  @fitted = false
  create_rust_umap_with_adjusted_params(data)

  begin
    embedding = Silence.maybe_silence { @rust_umap.fit_transform(data) }
    @fitted = true
    embedding
  rescue StandardError => e
    # A single StandardError clause is sufficient: a bare `rescue` is the same
    # thing, so a second clause after this one could never run.
    handle_umap_error(e, data)
  end
end
99
+
100
+ # Check if the model has been fitted
101
+ # @return [Boolean] true if model is fitted, false otherwise
102
def fitted?
  # Flag maintained by fit / fit_transform / load_model.
  @fitted
end
105
+
106
+ # Save the fitted model to a file
107
+ # @param path [String] Path where to save the model
108
+ # @raise [RuntimeError] If model hasn't been fitted yet
109
def save_model(path)
  # Persists the fitted native model at +path+, creating the parent directory
  # first when the path has one.
  raise RuntimeError, "No model to save. Call fit or fit_transform first." unless fitted?

  parent = File.dirname(path)
  FileUtils.mkdir_p(parent) unless ['.', '/'].include?(parent)

  @rust_umap.save_model(path)
end
118
+
119
+ # Load a fitted model from a file
120
+ # @param path [String] Path to the saved model
121
+ # @return [UMAP] A new UMAP instance with the loaded model
122
+ # @raise [ArgumentError] If file doesn't exist
123
def self.load_model(path)
  # Builds a fitted UMAP wrapper around a native model read from +path+.
  # Raises ArgumentError when the file does not exist.
  raise ArgumentError, "File not found: #{path}" unless File.exist?(path)

  # RustUMAP is looked up via const_get on ClusterKit, as elsewhere in this class.
  rust_umap = ::ClusterKit.const_get(:RustUMAP).load_model(path)

  # `allocate` bypasses initialize, so every instance variable behind an
  # attr_reader is assigned explicitly. The hyperparameters are not recovered
  # from the loaded model here, hence nil.
  instance = allocate
  {
    :@rust_umap => rust_umap,
    :@fitted => true,
    :@n_components => nil,
    :@n_neighbors => nil,
    :@random_seed => nil,
    :@nb_grad_batch => nil,
    :@nb_sampling_by_edge => nil
  }.each { |ivar, value| instance.instance_variable_set(ivar, value) }

  instance
end
140
+
141
+ # Save transformed data to JSON file
142
+ # @param data [Array<Array<Float>>] Transformed data to save
143
+ # @param path [String] Path where to save the data
144
def self.save_data(data, path)
  # Writes +data+ as pretty-printed JSON, creating the parent directory when
  # the path is not relative to the current directory.
  parent = File.dirname(path)
  FileUtils.mkdir_p(parent) if parent != '.'
  File.write(path, JSON.pretty_generate(data))
end
148
+
149
+ # Load transformed data from JSON file
150
+ # @param path [String] Path to the saved data
151
+ # @return [Array<Array<Float>>] The loaded data
152
+ # @raise [ArgumentError] If file doesn't exist
153
def self.load_data(path)
  # Parses and returns the JSON document stored at +path+.
  return JSON.parse(File.read(path)) if File.exist?(path)

  raise ArgumentError, "File not found: #{path}"
end
157
+
158
+ private
159
+
160
def handle_umap_error(error, data)
  # Re-raises +error+ as a ClusterKit error carrying remediation advice. The
  # category is chosen by pattern-matching the original message; every branch
  # raises, so this method never returns.
  detail = error.message
  sample_count = data.size

  if detail.match?(/isolated point/i) || detail.match?(/graph will not be connected/i)
    raise ::ClusterKit::IsolatedPointError, <<~MSG
      UMAP found isolated points in your data that are too far from other points.

      This typically happens when:
      • Your data contains outliers that are very different from other points
      • You're using random data without inherent structure
      • The n_neighbors parameter (#{@n_neighbors}) is too high for your data distribution

      Solutions:
      1. Reduce n_neighbors (try 5 or even 3): UMAP.new(n_neighbors: 5)
      2. Remove outliers from your data before applying UMAP
      3. Ensure your data has some structure (not purely random)
      4. For small datasets (< 50 points), consider using PCA instead

      Your data: #{sample_count} samples, #{data.first&.size || 0} dimensions
    MSG
  elsif detail.match?(/assertion failed.*box_size/i)
    raise ::ClusterKit::ConvergenceError, <<~MSG
      UMAP failed to converge due to numerical instability in your data.

      This typically happens when:
      • Data points are too spread out or have extreme values
      • The scale of different features varies wildly
      • There are duplicate or nearly-duplicate points

      Solutions:
      1. Normalize your data first: ClusterKit::Preprocessing.normalize(data)
      2. Use a smaller n_neighbors value: UMAP.new(n_neighbors: 5)
      3. Check for and remove duplicate points
      4. Scale your data to a reasonable range (e.g., 0-1 or -1 to 1)

      Your data: #{sample_count} samples, #{data.first&.size || 0} dimensions
    MSG
  elsif detail.match?(/n_neighbors.*larger than/i) || detail.match?(/too many neighbors/i)
    raise ::ClusterKit::InvalidParameterError, <<~MSG
      The n_neighbors parameter (#{@n_neighbors}) is too large for your dataset size (#{sample_count}).

      UMAP needs n_neighbors to be less than the number of samples.
      Suggested value: #{[5, (sample_count * 0.1).to_i].max}

      This should have been auto-adjusted. If you're seeing this error, please report it.
    MSG
  else
    # Unrecognised failure: surface the raw message plus generic guidance.
    raise ::ClusterKit::Error, <<~MSG
      UMAP encountered an error: #{detail}

      Common solutions:
      1. Try reducing n_neighbors (current: #{@n_neighbors})
      2. Normalize your data first
      3. Check for NaN or infinite values in your data
      4. Ensure you have at least 10 data points

      If this persists, consider using PCA for dimensionality reduction instead.
    MSG
  end
end
226
+
227
def validate_input(data, check_min_samples: true)
  # Runs the shared validator, then the UMAP-specific checks: an optional
  # minimum of 10 samples and a stderr warning for very wide value ranges.
  DataValidator.validate_standard(data)

  too_few = check_min_samples && data.size < 10
  if too_few
    raise ::ClusterKit::InsufficientDataError, <<~MSG
      UMAP requires at least 10 data points, but only #{data.size} provided.

      For small datasets, consider:
      1. Using PCA instead: ClusterKit::Dimensionality::PCA.new(n_components: 2)
      2. Collecting more data points
      3. Using simpler visualization methods
    MSG
  end

  # A large spread is only reported, never rejected.
  stats = DataValidator.data_statistics(data)
  return unless stats[:data_range] > 1000

  warn "WARNING: Large data range detected (#{stats[:data_range].round(2)}). Consider normalizing your data to prevent numerical instability."
end
249
+
250
def create_rust_umap_with_adjusted_params(data)
  # Builds the native RustUMAP object unless one already exists, lowering
  # n_neighbors when the configured value does not fit the dataset size.
  return if @rust_umap

  sample_count = data.size

  # Upper bound is n_samples - 1 (never below 2). The fallback used when the
  # bound is exceeded is min(15, n_samples / 4), also never below 2.
  ceiling = [sample_count - 1, 2].max
  fallback = [[15, sample_count / 4].min.to_i, 2].max

  needs_adjustment = @n_neighbors > ceiling
  neighbors = needs_adjustment ? [fallback, ceiling].min : @n_neighbors

  if needs_adjustment && ::ClusterKit.configuration.verbose
    warn "UMAP: Adjusted n_neighbors from #{@n_neighbors} to #{neighbors} for dataset with #{sample_count} samples"
  end

  # RustUMAP is looked up via const_get and receives one positional options hash.
  native_class = ::ClusterKit.const_get(:RustUMAP)
  @rust_umap = native_class.new({
    n_components: @n_components,
    n_neighbors: neighbors,
    random_seed: @random_seed,
    nb_grad_batch: @nb_grad_batch,
    nb_sampling_by_edge: @nb_sampling_by_edge
  })
end
280
+ end
281
+ end
282
+ end
@@ -0,0 +1,29 @@
1
+ # frozen_string_literal: true
2
+
3
+ module ClusterKit
4
+ # Module for dimensionality reduction algorithms
5
+ module Dimensionality
6
+ # Load classes - can't use autoload with require issues
7
+ require_relative "dimensionality/umap"
8
+ require_relative "dimensionality/pca"
9
+ require_relative "dimensionality/svd"
10
+
11
+ # Module-level evaluation methods
12
+
13
+ # Calculate reconstruction error for a dimensionality reduction
14
+ # @param original_data [Array<Array<Numeric>>] Original high-dimensional data
15
+ # @param reconstructed_data [Array<Array<Numeric>>] Reconstructed data
16
+ # @return [Float] Mean squared reconstruction error
17
def self.reconstruction_error(original_data, reconstructed_data)
  # Returns the squared Euclidean distance between each original row and its
  # reconstruction, averaged over rows (not over individual elements).
  #
  # Raises ArgumentError when the two datasets differ in row count or when any
  # pair of rows differs in length. Returns 0.0 for empty input instead of the
  # NaN that dividing by a zero row count would give.
  raise ArgumentError, "Data sizes don't match" if original_data.size != reconstructed_data.size
  return 0.0 if original_data.empty?

  total_error = original_data.each_with_index.sum(0.0) do |orig, i|
    recon = reconstructed_data[i]
    # zip would pad a short row with nil or silently drop the tail of a long one.
    raise ArgumentError, "Row #{i} sizes don't match" if orig.size != recon.size

    orig.zip(recon).sum { |o, r| (o - r)**2 }
  end

  total_error / original_data.size
end
28
+ end
29
+ end
@@ -0,0 +1,142 @@
1
+ # API Design for HDBSCAN to match KMeans pattern
2
+
3
+ module ClusterKit
4
+ module Clustering
5
+
6
+ # HDBSCAN clustering algorithm - matching KMeans API pattern
7
+ class HDBSCAN
8
+ attr_reader :min_samples, :min_cluster_size, :labels, :probabilities,
9
+ :outlier_scores, :cluster_persistence
10
+
11
+ # Initialize HDBSCAN clusterer (matches KMeans pattern)
12
+ # @param min_samples [Integer] Min neighborhood size for core points (default: 5)
13
+ # @param min_cluster_size [Integer] Minimum size of clusters (default: 5)
14
+ # @param metric [String] Distance metric (default: 'euclidean')
15
def initialize(min_samples: 5, min_cluster_size: 5, metric: 'euclidean')
  # Both size parameters must be strictly positive; nothing is stored when
  # either check fails.
  raise ArgumentError, "min_samples must be positive" unless min_samples > 0
  raise ArgumentError, "min_cluster_size must be positive" unless min_cluster_size > 0

  @fitted = false
  @metric = metric
  @min_samples, @min_cluster_size = min_samples, min_cluster_size
end
23
+
24
+ # Fit the HDBSCAN model (matches KMeans.fit)
25
+ # @param data [Array] 2D array of data points
26
+ # @return [self] Returns self for method chaining
27
def fit(data)
  # Validates +data+, runs the native clustering and copies each field of the
  # result into the matching instance variable. Returns self for chaining.
  validate_data(data)

  outcome = Clustering.hdbscan_rust(data, @min_samples, @min_cluster_size, @metric)

  %i[labels probabilities outlier_scores cluster_persistence].each do |field|
    instance_variable_set("@#{field}", outcome[field])
  end
  @fitted = true

  self
end
41
+
42
+ # HDBSCAN doesn't support predict for new points (unlike KMeans)
43
+ # But we keep the method for API consistency
44
+ # @param data [Array] 2D array of data points
45
+ # @return [void] Never returns normally; always raises NotImplementedError
46
def predict(data)
  # Present only for API parity with KMeans: +data+ is ignored and the call
  # always raises.
  message = "HDBSCAN does not support prediction on new data. " \
            "Use approximate_predict for approximate membership"
  raise NotImplementedError, message
end
50
+
51
+ # Fit the model and return labels (matches KMeans.fit_predict)
52
+ # @param data [Array] 2D array of data points
53
+ # @return [Array] Cluster labels (-1 for noise)
54
def fit_predict(data)
  # Fit first, then return whatever labels the fit stored.
  fit(data).then { @labels }
end
58
+
59
+ # Check if model has been fitted (matches KMeans.fitted?)
60
+ # @return [Boolean] True if fitted
61
def fitted?
  # False after initialize; set to true at the end of fit.
  @fitted
end
64
+
65
+ # Get number of clusters found (similar to KMeans.k but discovered)
66
+ # @return [Integer] Number of clusters (excluding noise)
67
def n_clusters
  # Number of clusters found, derived as the highest label plus one; this
  # assumes cluster labels are numbered from 0 and noise is -1. Returns 0
  # before fitting and when there are no labels or only noise labels.
  #
  # The empty/nil case is handled explicitly rather than with an inline
  # `rescue` modifier, which would hide unrelated errors as well.
  return 0 unless fitted?

  highest = Array(@labels).max
  highest ? [highest + 1, 0].max : 0
end
71
+
72
+ # Get noise ratio (HDBSCAN-specific but follows naming pattern)
73
+ # @return [Float] Fraction of points labeled as noise
74
def noise_ratio
  # Fraction of points labelled -1 (noise). Returns 0.0 before fitting and
  # when there are no labels, where the division would otherwise yield NaN.
  return 0.0 unless fitted?

  labels = Array(@labels)
  return 0.0 if labels.empty?

  labels.count(-1).to_f / labels.length
end
78
+
79
+ private
80
+
81
def validate_data(data)
  # Requires a non-empty, rectangular 2D array of Numeric values and raises
  # ArgumentError naming the first violation found.
  raise ArgumentError, "Data must be an array" unless data.is_a?(Array)
  raise ArgumentError, "Data cannot be empty" if data.empty?
  raise ArgumentError, "Data must be 2D array" unless data.first.is_a?(Array)

  width = data.first.length
  ragged = data.any? { |row| !row.is_a?(Array) || row.length != width }
  raise ArgumentError, "All rows must have the same length" if ragged

  data.each_with_index do |row, i|
    j = row.index { |val| !val.is_a?(Numeric) }
    raise ArgumentError, "Element at position [#{i}, #{j}] is not numeric" if j
  end
end
100
+ end
101
+
102
+ # Module-level convenience methods (matching KMeans pattern)
103
+ class << self
104
+ # Perform HDBSCAN clustering (matches Clustering.kmeans signature)
105
+ # @param data [Array] 2D array of data points
106
+ # @param min_samples [Integer] Min neighborhood size for core points
107
+ # @param min_cluster_size [Integer] Minimum size of clusters
108
+ # @return [Hash] Result hash with :labels, :probabilities, :outlier_scores
109
def hdbscan(data, min_samples: 5, min_cluster_size: 5)
  # One-shot helper: fits a fresh HDBSCAN on +data+ and returns a hash with
  # the labels, probabilities, outlier scores, cluster count and noise ratio.
  model = HDBSCAN.new(min_samples: min_samples, min_cluster_size: min_cluster_size)
  model.fit(data)

  %i[labels probabilities outlier_scores n_clusters noise_ratio].to_h do |field|
    [field, model.public_send(field)]
  end
end
120
+ end
121
+ end
122
+ end
123
+
124
+ # Usage comparison:
125
+
126
+ # KMeans usage:
127
# NOTE: the snippets below are illustrative only. They are kept as comments so
# that loading this file never executes them (`data` is not defined at file
# scope, so running them would raise NameError).
#
#   kmeans = ClusterKit::Clustering::KMeans.new(k: 3)
#   kmeans.fit(data)
#   labels = kmeans.labels
#   # or
#   labels = kmeans.fit_predict(data)
#
# HDBSCAN usage (identical pattern):
#
#   hdbscan = ClusterKit::Clustering::HDBSCAN.new(min_samples: 5, min_cluster_size: 5)
#   hdbscan.fit(data)
#   labels = hdbscan.labels
#   # or
#   labels = hdbscan.fit_predict(data)
#
# Module-level convenience (both follow same pattern):
#
#   result = ClusterKit::Clustering.kmeans(data, 3)
#   result = ClusterKit::Clustering.hdbscan(data, min_samples: 5)
@@ -0,0 +1,251 @@
1
+ # frozen_string_literal: true
2
+
3
+ module ClusterKit
4
+ # HNSW (Hierarchical Navigable Small World) index for fast approximate nearest neighbor search
5
+ #
6
+ # @example Basic usage
7
+ # index = ClusterKit::HNSW.new(dim: 128, space: :euclidean)
8
+ # index.add_batch(vectors, labels: labels)
9
+ # neighbors = index.search(query_vector, k: 10)
10
+ #
11
+ # @example With metadata
12
+ # index = ClusterKit::HNSW.new(dim: 768, space: :cosine)
13
+ # index.add_item(vector, label: "doc_1", metadata: { title: "Introduction", date: "2024-01-01" })
14
+ # results = index.search_with_metadata(query, k: 5)
15
+ # # => [{ label: "doc_1", distance: 0.23, metadata: { title: "...", date: "..." } }, ...]
16
+ class HNSW
17
+ # Note: The actual HNSW class is defined in Rust (ext/clusterkit/src/hnsw.rs)
18
+ # This Ruby file adds additional convenience methods and documentation.
19
+ # The Rust implementation provides these core methods:
20
+ # - new(kwargs) - constructor
21
+ # - add_item(vector, kwargs) - add single item
22
+ # - add_batch(vectors, kwargs) - add multiple items
23
+ # - search(query, kwargs) - search for neighbors
24
+ # - search_with_metadata(query, kwargs) - search with metadata
25
+ # - size() - get number of items
26
+ # - config() - get configuration
27
+ # - stats() - get statistics
28
+ # - set_ef(ef) - set search quality parameter
29
+ # - save(path) - save to file
30
+
31
+ # Initialize is actually handled by the Rust code
32
+ # This documentation is for reference
33
+ #
34
+ # @param dim [Integer] Dimension of vectors (required)
35
+ # @param space [Symbol] Distance metric: :euclidean, :cosine, or :inner_product (default: :euclidean)
36
+ # @param max_elements [Integer] Maximum number of elements (default: 10_000)
37
+ # @param m [Integer] Number of bi-directional links (default: 16)
38
+ # @param ef_construction [Integer] Size of dynamic candidate list (default: 200)
39
+ # @param random_seed [Integer, nil] Random seed for reproducible builds (default: nil)
40
+ # @param dynamic_list [Boolean] Allow index to grow dynamically (not yet implemented)
41
+
42
+ # Fit the index with training data (alias for add_batch)
43
+ #
44
+ # @param data [Array<Array>, Numo::NArray] Training vectors
45
+ # @param labels [Array, nil] Optional labels for vectors
46
+ # @return [self]
47
def fit(data, labels: nil)
  # sklearn-style alias: bulk-inserts +data+ via add_batch and returns the index.
  tap { add_batch(data, labels: labels) }
end
51
+
52
+ # Fit the index and return self (sklearn-style name kept for interface compatibility; no transformed data is produced)
53
+ #
54
+ # @param data [Array<Array>, Numo::NArray] Training vectors
55
+ # @return [self]
56
def fit_transform(data)
  # Despite the name, nothing is transformed: the vectors are indexed with
  # fit and the index itself is returned.
  tap { fit(data) }
end
60
+
61
+ # Add a vector using the << operator
62
+ #
63
+ # @param vector [Array, Numo::NArray] Vector to add
64
+ # @return [self]
65
def <<(vector)
  # Appends one vector with no label or metadata (empty options hash) and
  # returns the index so calls can be chained.
  tap { add_item(vector, {}) }
end
69
+
70
+ # Alias for search that always includes distances
71
+ #
72
+ # @param query [Array, Numo::NArray] Query vector
73
+ # @param k [Integer] Number of neighbors
74
+ # @param ef [Integer, nil] Search parameter (higher = better quality, slower)
75
+ # @return [Array<Array>] Array of [indices, distances]
76
def knn_query(query, k: 10, ef: nil)
  # Same as search, with distances always requested.
  options = { k: k, ef: ef, include_distances: true }
  search(query, **options)
end
79
+
80
+ # Batch search for multiple queries
81
+ #
82
+ # @param queries [Array<Array>, Numo::NArray] Multiple query vectors
83
+ # @param k [Integer] Number of neighbors per query
84
+ # @param parallel [Boolean] Process queries in parallel
85
+ # @return [Array<Array>] Results for each query
86
def batch_search(queries, k: 10, parallel: true)
  # Runs search for every query and returns the results in query order.
  #
  # With parallel: true and more than one query, the optional `parallel` gem
  # is used when it can be loaded; otherwise the queries run sequentially.
  queries = ensure_array(queries)

  use_parallel = parallel && queries.size > 1
  if use_parallel
    begin
      # Optional dependency, loaded lazily. Only a failure of this require
      # triggers the sequential fallback; a LoadError raised while searching
      # is not swallowed and retried.
      require 'parallel'
    rescue LoadError
      use_parallel = false
    end
  end

  if use_parallel
    Parallel.map(queries) { |query| search(query, k: k) }
  else
    queries.map { |query| search(query, k: k) }
  end
end
99
+
100
+ # Range search - find all points within a given radius
101
+ #
102
+ # @param query [Array, Numo::NArray] Query vector
103
+ # @param radius [Float] Search radius
104
+ # @param limit [Integer, nil] Maximum number of results
105
+ # @return [Array<Hash>] Results within radius
106
def range_search(query, radius:, limit: nil)
  # Fetches up to +limit+ nearest candidates (the whole index when no limit
  # is given), then keeps those whose distance is within +radius+.
  candidate_count = [limit || size, size].min
  hits = search_with_metadata(query, k: candidate_count)

  within = hits.select { |hit| hit[:distance] <= radius }
  within.take(limit || hits.size)
end
117
+
118
+ # Check if index is empty
119
+ # @return [Boolean]
120
def empty?
  # True when the index reports a size of zero.
  size.zero?
end
123
+
124
+ # Clear all elements from the index
125
+ #
126
+ # @return [self]
127
def clear!
  # Placeholder: clearing would need the index to be recreated, which is not
  # done here, so the call always raises.
  raise NotImplementedError, "Clear not yet implemented"
end
131
+
132
+ # Check if a label exists in the index
133
+ #
134
+ # @param label [String, Integer] Label to check
135
+ # @return [Boolean]
136
def include?(label)
  # NOTE(review): stub. No lookup is performed here, so this answers false
  # for every label, including labels that are in the index.
  false
end
141
+
142
+ # Get recall rate for a test set
143
+ #
144
+ # @param test_queries [Array<Array>] Query vectors
145
+ # @param ground_truth [Array<Array>] True nearest neighbors for each query
146
+ # @param k [Integer] Number of neighbors to evaluate
147
+ # @return [Float] Recall rate (0.0 to 1.0)
148
def recall(test_queries, ground_truth, k: 10)
  # Recall@k: of the first k ground-truth neighbours of each query, the share
  # that also appears in search(query, k: k), pooled over all queries.
  # Returns 0.0 when there is nothing to evaluate.
  #
  # Raises ArgumentError when +ground_truth+ has fewer entries than there are
  # queries, instead of failing later on a nil entry.
  test_queries = ensure_array(test_queries)

  if ground_truth.size < test_queries.size
    raise ArgumentError,
          "ground_truth has #{ground_truth.size} entries for #{test_queries.size} queries"
  end

  # Lazy require kept from the original implementation.
  require 'set'
  total_correct = 0
  total_possible = 0

  test_queries.each_with_index do |query, i|
    predicted = Set.new(search(query, k: k))
    actual = Set.new(ground_truth[i].take(k))

    total_correct += (predicted & actual).size
    # actual holds at most k elements, so its size is the achievable maximum.
    total_possible += actual.size
  end

  total_possible > 0 ? total_correct.to_f / total_possible : 0.0
end
165
+
166
+ # Load an index from file
167
+ # Note: This uses Box::leak internally to work around hnsw_rs lifetime constraints
168
+ # This causes a small memory leak - the HnswIo struct won't be freed until program exit
169
+ #
170
+ # @param path [String] File path to load from
171
+ # @return [HNSW] New HNSW instance loaded from file
172
+ # (The actual implementation is in Rust)
173
+
174
+ # Create an index from embeddings produced by UMAP or other dimensionality reduction
175
+ #
176
+ # @param embeddings [Array<Array>, Numo::NArray] Embedding vectors
177
+ # @param kwargs [Hash] Additional options for HNSW initialization
178
+ # @return [HNSW] New HNSW instance
179
def self.from_embedding(embeddings, **kwargs)
  # Builds an index sized to the embedding vectors and inserts them all.
  # The dimension is taken from the first vector; +kwargs+ are forwarded to
  # the constructor. Raises ArgumentError for an empty embedding set, where
  # no dimension can be inferred.
  embeddings = ensure_array(embeddings)
  raise ArgumentError, "Embeddings cannot be empty" if embeddings.empty?

  dim = embeddings.first.size
  index = new(dim: dim, **kwargs)
  index.fit(embeddings)
  index
end
187
+
188
+ # Builder pattern for creating HNSW indices
189
class Builder
  # Fluent builder: each setter stores one constructor option and returns the
  # builder; #build passes the collected options to HNSW.new as keywords.

  # Setter name => HNSW.new keyword it fills in.
  {
    space: :space,
    dimensions: :dim,
    max_elements: :max_elements,
    m_parameter: :m,
    ef_construction: :ef_construction,
    seed: :random_seed
  }.each do |setter, option|
    define_method(setter) do |value|
      @config[option] = value
      self
    end
  end

  def initialize
    @config = {}
  end

  def build
    HNSW.new(**@config)
  end
end
228
+
229
+ private
230
+
231
+ # Ensure input is a proper array format
232
def ensure_array(data)
  # Arrays pass through untouched; anything else must respond to to_a.
  return data if data.is_a?(Array)
  raise(ArgumentError, "Data must be convertible to Array") unless data.respond_to?(:to_a)

  data.to_a
end
240
+
241
+ # Class method to make it available to class methods
242
def self.ensure_array(data)
  # Class-level twin of the instance helper, used by from_embedding. A bare
  # `private` does not apply to `def self.` methods, so this one stays public.
  return data if data.is_a?(Array)
  raise(ArgumentError, "Data must be convertible to Array") unless data.respond_to?(:to_a)

  data.to_a
end
250
+ end
251
+ end