clusterkit 0.3.0-arm64-darwin
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.rspec +3 -0
- data/.simplecov +47 -0
- data/CHANGELOG.md +35 -0
- data/CLAUDE.md +226 -0
- data/Cargo.lock +3228 -0
- data/Cargo.toml +8 -0
- data/Gemfile +17 -0
- data/IMPLEMENTATION_NOTES.md +143 -0
- data/LICENSE.txt +21 -0
- data/PYTHON_COMPARISON.md +183 -0
- data/README.md +744 -0
- data/Rakefile +259 -0
- data/docs/KNOWN_ISSUES.md +130 -0
- data/docs/RUST_ERROR_HANDLING.md +164 -0
- data/docs/TEST_FIXTURES.md +170 -0
- data/docs/UMAP_EXPLAINED.md +362 -0
- data/docs/UMAP_TROUBLESHOOTING.md +284 -0
- data/docs/VERBOSE_OUTPUT.md +84 -0
- data/docs/assets/clusterkit-wide.png +0 -0
- data/docs/assets/clusterkit.png +0 -0
- data/docs/assets/visualization.png +0 -0
- data/examples/hdbscan_example.rb +147 -0
- data/examples/optimal_kmeans_example.rb +96 -0
- data/examples/pca_example.rb +114 -0
- data/examples/reproducible_umap.rb +99 -0
- data/examples/verbose_control.rb +43 -0
- data/ext/clusterkit/Cargo.toml +26 -0
- data/ext/clusterkit/extconf.rb +23 -0
- data/ext/clusterkit/src/clustering/hdbscan_wrapper.rs +80 -0
- data/ext/clusterkit/src/clustering.rs +221 -0
- data/ext/clusterkit/src/embedder.rs +349 -0
- data/ext/clusterkit/src/hnsw.rs +579 -0
- data/ext/clusterkit/src/lib.rs +24 -0
- data/ext/clusterkit/src/svd.rs +89 -0
- data/ext/clusterkit/src/tests.rs +16 -0
- data/ext/clusterkit/src/utils.rs +183 -0
- data/lib/clusterkit/3.1/clusterkit.bundle +0 -0
- data/lib/clusterkit/3.2/clusterkit.bundle +0 -0
- data/lib/clusterkit/3.3/clusterkit.bundle +0 -0
- data/lib/clusterkit/3.4/clusterkit.bundle +0 -0
- data/lib/clusterkit/clustering/hdbscan.rb +164 -0
- data/lib/clusterkit/clustering.rb +194 -0
- data/lib/clusterkit/clusterkit.rb +14 -0
- data/lib/clusterkit/configuration.rb +24 -0
- data/lib/clusterkit/data_validator.rb +132 -0
- data/lib/clusterkit/dimensionality/pca.rb +251 -0
- data/lib/clusterkit/dimensionality/svd.rb +175 -0
- data/lib/clusterkit/dimensionality/umap.rb +282 -0
- data/lib/clusterkit/dimensionality.rb +29 -0
- data/lib/clusterkit/hdbscan_api_design.rb +142 -0
- data/lib/clusterkit/hnsw.rb +251 -0
- data/lib/clusterkit/preprocessing.rb +106 -0
- data/lib/clusterkit/silence.rb +42 -0
- data/lib/clusterkit/utils.rb +51 -0
- data/lib/clusterkit/version.rb +5 -0
- data/lib/clusterkit.rb +105 -0
- data/lib/tasks/visualize.rake +641 -0
- metadata +214 -0
|
@@ -0,0 +1,282 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'fileutils'
|
|
4
|
+
require 'json'
|
|
5
|
+
require_relative '../configuration'
|
|
6
|
+
require_relative '../silence'
|
|
7
|
+
require_relative '../data_validator'
|
|
8
|
+
|
|
9
|
+
module ClusterKit
|
|
10
|
+
module Dimensionality
|
|
11
|
+
class UMAP
  attr_reader :n_components, :n_neighbors, :random_seed, :nb_grad_batch, :nb_sampling_by_edge

  # Initialize a new UMAP instance
  # @param n_components [Integer] Target number of dimensions (default: 2)
  # @param n_neighbors [Integer] Number of neighbors for manifold approximation (default: 15)
  # @param random_seed [Integer, nil] Random seed for reproducibility (default: nil)
  # @param nb_grad_batch [Integer] Number of gradient descent batches (default: 10)
  #   Controls training iterations - lower = faster but less accurate
  # @param nb_sampling_by_edge [Integer] Number of negative samples per edge (default: 8)
  #   Controls sampling quality - lower = faster but less accurate
  def initialize(n_components: 2, n_neighbors: 15, random_seed: nil,
                 nb_grad_batch: 10, nb_sampling_by_edge: 8)
    @n_components = n_components
    @n_neighbors = n_neighbors
    @random_seed = random_seed
    @nb_grad_batch = nb_grad_batch
    @nb_sampling_by_edge = nb_sampling_by_edge
    @fitted = false
    # The RustUMAP object is created lazily in fit/fit_transform, once the
    # dataset size is known and n_neighbors can be adjusted to it.
    @rust_umap = nil
  end

  # Fit the model to the data (training)
  # @param data [Array<Array<Numeric>>] Training data as 2D array
  # @return [self] Returns self for method chaining
  # @raise [ClusterKit::Error] (or a subclass) if the underlying fit fails
  # @note The underlying Rust object only exposes fit_transform, so the
  #   embedding is computed here and discarded. Use fit_transform if you
  #   need both training and the transformed data.
  def fit(data)
    train(data)
    self
  end

  # Transform data using the fitted model
  # @param data [Array<Array<Numeric>>] Data to transform
  # @return [Array<Array<Float>>] Transformed data in reduced dimensions
  # @raise [RuntimeError] If model hasn't been fitted yet
  def transform(data)
    raise RuntimeError, "Model must be fitted before transform. Call fit or fit_transform first." unless fitted?

    # The 10-sample minimum only applies to training data.
    validate_input(data, check_min_samples: false)
    Silence.maybe_silence do
      @rust_umap.transform(data)
    end
  end

  # Fit the model and transform the data in one step
  # @param data [Array<Array<Numeric>>] Training data as 2D array
  # @return [Array<Array<Float>>] Transformed data in reduced dimensions
  # @raise [ClusterKit::Error] (or a subclass) if the underlying fit fails
  def fit_transform(data)
    train(data)
  end

  # Check if the model has been fitted
  # @return [Boolean] true if model is fitted, false otherwise
  def fitted?
    @fitted
  end

  # Save the fitted model to a file
  # @param path [String] Path where to save the model
  # @raise [RuntimeError] If model hasn't been fitted yet
  def save_model(path)
    raise RuntimeError, "No model to save. Call fit or fit_transform first." unless fitted?

    # Ensure directory exists
    dir = File.dirname(path)
    FileUtils.mkdir_p(dir) unless dir == '.' || dir == '/'

    @rust_umap.save_model(path)
  end

  # Load a fitted model from a file
  # @param path [String] Path to the saved model
  # @return [UMAP] A new UMAP instance with the loaded model
  # @raise [ArgumentError] If file doesn't exist
  def self.load_model(path)
    raise ArgumentError, "File not found: #{path}" unless File.exist?(path)

    # Load the Rust model (access private constant)
    rust_umap = ::ClusterKit.const_get(:RustUMAP).load_model(path)

    # Build the wrapper without running initialize: the hyperparameters used
    # for training are not available from here, so every reader reports nil.
    instance = allocate
    instance.instance_variable_set(:@rust_umap, rust_umap)
    instance.instance_variable_set(:@fitted, true)
    %i[@n_components @n_neighbors @random_seed @nb_grad_batch @nb_sampling_by_edge].each do |ivar|
      instance.instance_variable_set(ivar, nil)
    end

    instance
  end

  # Save transformed data to JSON file
  # @param data [Array<Array<Float>>] Transformed data to save
  # @param path [String] Path where to save the data
  def self.save_data(data, path)
    # Same directory handling as save_model.
    dir = File.dirname(path)
    FileUtils.mkdir_p(dir) unless dir == '.' || dir == '/'
    File.write(path, JSON.pretty_generate(data))
  end

  # Load transformed data from JSON file
  # @param path [String] Path to the saved data
  # @return [Array<Array<Float>>] The loaded data
  # @raise [ArgumentError] If file doesn't exist
  def self.load_data(path)
    raise ArgumentError, "File not found: #{path}" unless File.exist?(path)

    JSON.parse(File.read(path))
  end

  private

  # Shared training routine behind fit and fit_transform.
  # Validates the input, builds a fresh RustUMAP, runs fit_transform on it and
  # marks the model as fitted.
  # @param data [Array<Array<Numeric>>] Training data as 2D array
  # @return [Object] Whatever the RustUMAP fit_transform call returns (the embedding)
  def train(data)
    validate_input(data)

    # Always start from a fresh RustUMAP. The fitted flag is cleared as well so
    # that a failed re-fit cannot leave the wrapper claiming to be fitted while
    # holding an untrained object.
    @fitted = false
    @rust_umap = nil
    create_rust_umap_with_adjusted_params(data)

    embedding = begin
      Silence.maybe_silence do
        @rust_umap.fit_transform(data)
      end
    rescue StandardError => e
      # handle_umap_error always raises, so execution never continues past here.
      handle_umap_error(e, data)
    end

    @fitted = true
    embedding
  end

  # Translate an error raised during fitting into a ClusterKit error with
  # troubleshooting guidance. Always raises.
  # @param error [StandardError] The error raised by the fit call
  # @param data [Array<Array<Numeric>>] The data that was being fitted
  # @raise [ClusterKit::IsolatedPointError, ClusterKit::ConvergenceError,
  #   ClusterKit::InvalidParameterError, ClusterKit::Error]
  def handle_umap_error(error, data)
    error_msg = error.message
    n_samples = data.size

    case error_msg
    when /isolated point/i, /graph will not be connected/i
      raise ::ClusterKit::IsolatedPointError, <<~MSG
        UMAP found isolated points in your data that are too far from other points.

        This typically happens when:
        • Your data contains outliers that are very different from other points
        • You're using random data without inherent structure
        • The n_neighbors parameter (#{@n_neighbors}) is too high for your data distribution

        Solutions:
        1. Reduce n_neighbors (try 5 or even 3): UMAP.new(n_neighbors: 5)
        2. Remove outliers from your data before applying UMAP
        3. Ensure your data has some structure (not purely random)
        4. For small datasets (< 50 points), consider using PCA instead

        Your data: #{n_samples} samples, #{data.first&.size || 0} dimensions
      MSG

    when /assertion failed.*box_size/i
      raise ::ClusterKit::ConvergenceError, <<~MSG
        UMAP failed to converge due to numerical instability in your data.

        This typically happens when:
        • Data points are too spread out or have extreme values
        • The scale of different features varies wildly
        • There are duplicate or nearly-duplicate points

        Solutions:
        1. Normalize your data first: ClusterKit::Preprocessing.normalize(data)
        2. Use a smaller n_neighbors value: UMAP.new(n_neighbors: 5)
        3. Check for and remove duplicate points
        4. Scale your data to a reasonable range (e.g., 0-1 or -1 to 1)

        Your data: #{n_samples} samples, #{data.first&.size || 0} dimensions
      MSG

    when /n_neighbors.*larger than/i, /too many neighbors/i
      raise ::ClusterKit::InvalidParameterError, <<~MSG
        The n_neighbors parameter (#{@n_neighbors}) is too large for your dataset size (#{n_samples}).

        UMAP needs n_neighbors to be less than the number of samples.
        Suggested value: #{[5, (n_samples * 0.1).to_i].max}

        This should have been auto-adjusted. If you're seeing this error, please report it.
      MSG

    else
      # For unknown errors, still provide some guidance
      raise ::ClusterKit::Error, <<~MSG
        UMAP encountered an error: #{error_msg}

        Common solutions:
        1. Try reducing n_neighbors (current: #{@n_neighbors})
        2. Normalize your data first
        3. Check for NaN or infinite values in your data
        4. Ensure you have at least 10 data points

        If this persists, consider using PCA for dimensionality reduction instead.
      MSG
    end
  end

  # Validate input data before fitting or transforming.
  # @param data [Array<Array<Numeric>>] Data to validate
  # @param check_min_samples [Boolean] Enforce the 10-sample minimum (training only)
  # @raise [ClusterKit::InsufficientDataError] If fewer than 10 samples are given
  #   and check_min_samples is true
  def validate_input(data, check_min_samples: true)
    # Use shared validation for common checks
    DataValidator.validate_standard(data)

    # UMAP-specific validations
    if check_min_samples && data.size < 10
      raise ::ClusterKit::InsufficientDataError, <<~MSG
        UMAP requires at least 10 data points, but only #{data.size} provided.

        For small datasets, consider:
        1. Using PCA instead: ClusterKit::Dimensionality::PCA.new(n_components: 2)
        2. Collecting more data points
        3. Using simpler visualization methods
      MSG
    end

    # Check for extreme data ranges that might cause numerical issues
    stats = DataValidator.data_statistics(data)
    if stats[:data_range] > 1000
      warn "WARNING: Large data range detected (#{stats[:data_range].round(2)}). Consider normalizing your data to prevent numerical instability."
    end
  end

  # Build @rust_umap, lowering n_neighbors when it is too large for the dataset.
  # Does nothing if @rust_umap already exists.
  # @param data [Array<Array<Numeric>>] Training data (only its size is used)
  def create_rust_umap_with_adjusted_params(data)
    # Only create if not already created
    return if @rust_umap

    n_samples = data.size

    # n_neighbors must stay below n_samples. When the requested value is too
    # high, fall back to min(15, n_samples / 4), but never below 2.
    max_neighbors = [n_samples - 1, 2].max # At least 2, but less than n_samples
    suggested_neighbors = [[15, n_samples / 4].min.to_i, 2].max

    adjusted_n_neighbors = @n_neighbors
    if @n_neighbors > max_neighbors
      adjusted_n_neighbors = [suggested_neighbors, max_neighbors].min

      if ::ClusterKit.configuration.verbose
        warn "UMAP: Adjusted n_neighbors from #{@n_neighbors} to #{adjusted_n_neighbors} for dataset with #{n_samples} samples"
      end
    end

    # Access the private constant from inside the module
    @rust_umap = ::ClusterKit.const_get(:RustUMAP).new({
      n_components: @n_components,
      n_neighbors: adjusted_n_neighbors,
      random_seed: @random_seed,
      nb_grad_batch: @nb_grad_batch,
      nb_sampling_by_edge: @nb_sampling_by_edge
    })
  end
end
|
|
281
|
+
end
|
|
282
|
+
end
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module ClusterKit
|
|
4
|
+
# Module for dimensionality reduction algorithms
|
|
5
|
+
module Dimensionality
|
|
6
|
+
# Load classes - can't use autoload with require issues
|
|
7
|
+
require_relative "dimensionality/umap"
|
|
8
|
+
require_relative "dimensionality/pca"
|
|
9
|
+
require_relative "dimensionality/svd"
|
|
10
|
+
|
|
11
|
+
# Module-level evaluation methods
|
|
12
|
+
|
|
13
|
+
# Calculate reconstruction error for a dimensionality reduction.
#
# The result is the sum of squared element-wise differences over all rows,
# divided by the number of rows (not by the number of elements).
#
# @param original_data [Array<Array<Numeric>>] Original high-dimensional data
# @param reconstructed_data [Array<Array<Numeric>>] Reconstructed data
# @return [Float] Mean (per row) squared reconstruction error; NaN when both
#   inputs are empty, since there are no rows to average over
# @raise [ArgumentError] If the number of rows differs, or if any pair of
#   rows has a different number of elements
def self.reconstruction_error(original_data, reconstructed_data)
  raise ArgumentError, "Data sizes don't match" if original_data.size != reconstructed_data.size

  total_error = 0.0
  original_data.each_with_index do |orig, row|
    recon = reconstructed_data[row]
    # zip would silently drop extra reconstructed values (or pad with nil),
    # so mismatched rows are rejected explicitly.
    if orig.size != recon.size
      raise ArgumentError, "Row #{row} sizes don't match (#{orig.size} vs #{recon.size})"
    end

    total_error += orig.zip(recon).sum { |o, r| (o - r)**2 }
  end

  total_error / original_data.size
end
|
|
28
|
+
end
|
|
29
|
+
end
|
|
@@ -0,0 +1,142 @@
|
|
|
1
|
+
# API Design for HDBSCAN to match KMeans pattern
|
|
2
|
+
|
|
3
|
+
module ClusterKit
|
|
4
|
+
module Clustering
|
|
5
|
+
|
|
6
|
+
# HDBSCAN clustering algorithm - matching KMeans API pattern
class HDBSCAN
  attr_reader :min_samples, :min_cluster_size, :labels, :probabilities,
              :outlier_scores, :cluster_persistence

  # Initialize HDBSCAN clusterer (matches KMeans pattern)
  # @param min_samples [Integer] Min neighborhood size for core points (default: 5)
  # @param min_cluster_size [Integer] Minimum size of clusters (default: 5)
  # @param metric [String] Distance metric (default: 'euclidean')
  # @raise [ArgumentError] If min_samples or min_cluster_size is not positive
  def initialize(min_samples: 5, min_cluster_size: 5, metric: 'euclidean')
    raise ArgumentError, "min_samples must be positive" unless min_samples > 0
    raise ArgumentError, "min_cluster_size must be positive" unless min_cluster_size > 0

    @min_samples = min_samples
    @min_cluster_size = min_cluster_size
    @metric = metric
    @fitted = false
  end

  # Fit the HDBSCAN model (matches KMeans.fit)
  # @param data [Array] 2D array of data points
  # @return [self] Returns self for method chaining
  # @raise [ArgumentError] If data is not a non-empty, rectangular 2D array of numerics
  def fit(data)
    validate_data(data)

    # Call Rust implementation (hdbscan crate)
    result = Clustering.hdbscan_rust(data, @min_samples, @min_cluster_size, @metric)

    @labels = result[:labels]
    @probabilities = result[:probabilities]
    @outlier_scores = result[:outlier_scores]
    @cluster_persistence = result[:cluster_persistence]
    @fitted = true

    self
  end

  # HDBSCAN doesn't support predict for new points (unlike KMeans)
  # But we keep the method for API consistency
  # @param data [Array] 2D array of data points (unused)
  # @raise [NotImplementedError] Always
  def predict(data)
    raise NotImplementedError, "HDBSCAN does not support prediction on new data. " \
                               "Use approximate_predict for approximate membership"
  end

  # Fit the model and return labels (matches KMeans.fit_predict)
  # @param data [Array] 2D array of data points
  # @return [Array] Cluster labels (-1 for noise)
  def fit_predict(data)
    fit(data)
    @labels
  end

  # Check if model has been fitted (matches KMeans.fitted?)
  # @return [Boolean] True if fitted
  def fitted?
    @fitted
  end

  # Get number of clusters found (similar to KMeans.k but discovered)
  #
  # Computed as the highest label plus one, so an all-noise labelling
  # (every label -1) yields 0.
  # @return [Integer] Number of clusters (excluding noise); 0 when the model
  #   is not fitted or has no labels
  def n_clusters
    return 0 unless fitted?

    # Explicit nil/empty handling instead of a blanket inline rescue.
    highest_label = @labels&.max
    highest_label ? highest_label + 1 : 0
  end

  # Get noise ratio (HDBSCAN-specific but follows naming pattern)
  # @return [Float] Fraction of points labeled as noise (-1); 0.0 when the
  #   model is not fitted or has no labels
  def noise_ratio
    return 0.0 unless fitted?
    # Avoid 0.0 / 0 (NaN) when there are no labels at all.
    return 0.0 if @labels.nil? || @labels.empty?

    @labels.count(-1).to_f / @labels.length
  end

  private

  # Check that data is a non-empty 2D array with equal-length, all-numeric rows.
  # @param data [Object] Candidate input
  # @raise [ArgumentError] Describing the first violation found
  def validate_data(data)
    # Exact same validation as KMeans for consistency
    raise ArgumentError, "Data must be an array" unless data.is_a?(Array)
    raise ArgumentError, "Data cannot be empty" if data.empty?
    raise ArgumentError, "Data must be 2D array" unless data.first.is_a?(Array)

    row_length = data.first.length
    unless data.all? { |row| row.is_a?(Array) && row.length == row_length }
      raise ArgumentError, "All rows must have the same length"
    end

    data.each_with_index do |row, i|
      row.each_with_index do |val, j|
        next if val.is_a?(Numeric)

        raise ArgumentError, "Element at position [#{i}, #{j}] is not numeric"
      end
    end
  end
end
|
|
101
|
+
|
|
102
|
+
# Module-level convenience methods (matching KMeans pattern)
class << self
  # One-shot HDBSCAN clustering (mirrors the Clustering.kmeans convenience call).
  #
  # Builds an HDBSCAN clusterer with the given settings, fits it on +data+ and
  # collects the clusterer's results into a plain hash.
  #
  # @param data [Array] 2D array of data points
  # @param min_samples [Integer] Min neighborhood size for core points
  # @param min_cluster_size [Integer] Minimum size of clusters
  # @return [Hash] Hash with :labels, :probabilities, :outlier_scores,
  #   :n_clusters and :noise_ratio
  def hdbscan(data, min_samples: 5, min_cluster_size: 5)
    model = HDBSCAN.new(min_samples: min_samples, min_cluster_size: min_cluster_size)
    model.fit(data)

    # Each key doubles as the name of the reader queried on the fitted model.
    %i[labels probabilities outlier_scores n_clusters noise_ratio].to_h do |field|
      [field, model.public_send(field)]
    end
  end
end
|
|
121
|
+
end
|
|
122
|
+
end
|
|
123
|
+
|
|
124
|
+
# Usage comparison (illustrative only).
#
# These examples are kept as comments: as live top-level code they referenced
# an undefined `data` variable, so loading this file raised NameError.
#
# KMeans usage:
#   kmeans = ClusterKit::Clustering::KMeans.new(k: 3)
#   kmeans.fit(data)
#   labels = kmeans.labels
#   # or
#   labels = kmeans.fit_predict(data)
#
# HDBSCAN usage (identical pattern):
#   hdbscan = ClusterKit::Clustering::HDBSCAN.new(min_samples: 5, min_cluster_size: 5)
#   hdbscan.fit(data)
#   labels = hdbscan.labels
#   # or
#   labels = hdbscan.fit_predict(data)
#
# Module-level convenience (both follow same pattern):
#   result = ClusterKit::Clustering.kmeans(data, 3)
#   result = ClusterKit::Clustering.hdbscan(data, min_samples: 5)
|
|
@@ -0,0 +1,251 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module ClusterKit
|
|
4
|
+
# HNSW (Hierarchical Navigable Small World) index for fast approximate nearest neighbor search
#
# @example Basic usage
#   index = ClusterKit::HNSW.new(dim: 128, space: :euclidean)
#   index.add_batch(vectors, labels: labels)
#   neighbors = index.search(query_vector, k: 10)
#
# @example With metadata
#   index = ClusterKit::HNSW.new(dim: 768, space: :cosine)
#   index.add_item(vector, label: "doc_1", metadata: { title: "Introduction", date: "2024-01-01" })
#   results = index.search_with_metadata(query, k: 5)
#   # => [{ label: "doc_1", distance: 0.23, metadata: { title: "...", date: "..." } }, ...]
class HNSW
  # Note: The actual HNSW class is defined in Rust (ext/clusterkit/src/hnsw.rs)
  # This Ruby file adds additional convenience methods and documentation.
  # The Rust implementation provides these core methods:
  # - new(kwargs) - constructor
  # - add_item(vector, kwargs) - add single item
  # - add_batch(vectors, kwargs) - add multiple items
  # - search(query, kwargs) - search for neighbors
  # - search_with_metadata(query, kwargs) - search with metadata
  # - size() - get number of items
  # - config() - get configuration
  # - stats() - get statistics
  # - set_ef(ef) - set search quality parameter
  # - save(path) - save to file

  # Initialize is actually handled by the Rust code
  # This documentation is for reference
  #
  # @param dim [Integer] Dimension of vectors (required)
  # @param space [Symbol] Distance metric: :euclidean, :cosine, or :inner_product (default: :euclidean)
  # @param max_elements [Integer] Maximum number of elements (default: 10_000)
  # @param m [Integer] Number of bi-directional links (default: 16)
  # @param ef_construction [Integer] Size of dynamic candidate list (default: 200)
  # @param random_seed [Integer, nil] Random seed for reproducible builds (default: nil)
  # @param dynamic_list [Boolean] Allow index to grow dynamically (not yet implemented)

  # Fit the index with training data (alias for add_batch)
  #
  # @param data [Array<Array>, Numo::NArray] Training vectors
  # @param labels [Array, nil] Optional labels for vectors
  # @return [self]
  def fit(data, labels: nil)
    add_batch(data, labels: labels)
    self
  end

  # Fit the index (for compatibility with sklearn-like interface).
  #
  # Unlike sklearn's fit_transform this does not return transformed data:
  # it adds the vectors via fit and returns the index itself.
  #
  # @param data [Array<Array>, Numo::NArray] Training vectors
  # @return [self]
  def fit_transform(data)
    fit(data)
    self
  end

  # Add a vector using the << operator
  #
  # @param vector [Array, Numo::NArray] Vector to add
  # @return [self]
  def <<(vector)
    add_item(vector, {})
    self
  end

  # Alias for search that always includes distances
  #
  # @param query [Array, Numo::NArray] Query vector
  # @param k [Integer] Number of neighbors
  # @param ef [Integer, nil] Search parameter (higher = better quality, slower)
  # @return [Array<Array>] Array of [indices, distances]
  def knn_query(query, k: 10, ef: nil)
    search(query, k: k, ef: ef, include_distances: true)
  end

  # Batch search for multiple queries
  #
  # When +parallel+ is true and there is more than one query, the optional
  # `parallel` gem is used if it can be loaded; otherwise the queries are
  # processed sequentially.
  #
  # @param queries [Array<Array>, Numo::NArray] Multiple query vectors
  # @param k [Integer] Number of neighbors per query
  # @param parallel [Boolean] Process queries in parallel
  # @return [Array<Array>] Results for each query
  def batch_search(queries, k: 10, parallel: true)
    queries = ensure_array(queries)
    sequential = -> { queries.map { |query| search(query, k: k) } }

    return sequential.call unless parallel && queries.size > 1

    # Only the require is guarded: a LoadError raised while searching must not
    # trigger a second, sequential pass over all queries.
    begin
      require 'parallel'
    rescue LoadError
      # Parallel gem not available, fall back to sequential
      return sequential.call
    end

    Parallel.map(queries) { |query| search(query, k: k) }
  end

  # Range search - find all points within a given radius
  #
  # Fetches up to +limit+ candidates (or every item when no limit is given)
  # via search_with_metadata and keeps those whose :distance is <= radius.
  #
  # @param query [Array, Numo::NArray] Query vector
  # @param radius [Float] Search radius
  # @param limit [Integer, nil] Maximum number of results
  # @return [Array<Hash>] Results within radius
  def range_search(query, radius:, limit: nil)
    # Never ask for more candidates than the index holds.
    k = [limit || size, size].min
    # Empty index or a non-positive limit: nothing can match, skip the query.
    return [] unless k.positive?

    within_radius = search_with_metadata(query, k: k).select { |r| r[:distance] <= radius }
    limit ? within_radius.take(limit) : within_radius
  end

  # Check if index is empty
  # @return [Boolean]
  def empty?
    size == 0
  end

  # Clear all elements from the index
  #
  # @raise [NotImplementedError] Always; clearing would require recreating the index
  def clear!
    # Would need to recreate the index
    raise NotImplementedError, "Clear not yet implemented"
  end

  # Check if a label exists in the index
  #
  # FIXME: not implemented yet - this always returns false, even for labels
  # that are present. A real lookup would need to be implemented in Rust.
  #
  # @param label [String, Integer] Label to check (currently ignored)
  # @return [Boolean] Always false for now
  def include?(label)
    false
  end

  # Get recall rate for a test set
  #
  # For each query the result of search(query, k: k) is compared, as a set,
  # with the first k entries of the matching ground-truth row.
  #
  # @param test_queries [Array<Array>] Query vectors
  # @param ground_truth [Array<Array>] True nearest neighbors for each query
  # @param k [Integer] Number of neighbors to evaluate
  # @return [Float] Recall rate (0.0 to 1.0)
  # @raise [ArgumentError] If ground_truth has fewer rows than test_queries
  def recall(test_queries, ground_truth, k: 10)
    test_queries = ensure_array(test_queries)

    if ground_truth.size < test_queries.size
      raise ArgumentError,
            "ground_truth has #{ground_truth.size} entries but #{test_queries.size} queries were given"
    end

    require 'set'
    total_correct = 0
    total_possible = 0

    test_queries.each_with_index do |query, i|
      predicted = Set.new(search(query, k: k))
      actual = Set.new(ground_truth[i].take(k))

      total_correct += (predicted & actual).size
      # A query with fewer than k true neighbors can contribute at most that many hits.
      total_possible += [k, actual.size].min
    end

    total_possible > 0 ? total_correct.to_f / total_possible : 0.0
  end

  # Load an index from file
  # Note: This uses Box::leak internally to work around hnsw_rs lifetime constraints
  # This causes a small memory leak - the HnswIo struct won't be freed until program exit
  #
  # @param path [String] File path to load from
  # @return [HNSW] New HNSW instance loaded from file
  # (The actual implementation is in Rust)

  # Create an index from embeddings produced by UMAP or other dimensionality reduction
  #
  # The index dimension is taken from the size of the first embedding.
  #
  # @param embeddings [Array<Array>, Numo::NArray] Embedding vectors
  # @param kwargs [Hash] Additional options for HNSW initialization
  # @return [HNSW] New HNSW instance
  # @raise [ArgumentError] If embeddings is empty or not convertible to an Array
  def self.from_embedding(embeddings, **kwargs)
    embeddings = ensure_array(embeddings)
    raise ArgumentError, "Embeddings cannot be empty" if embeddings.empty?

    dim = embeddings.first.size
    index = new(dim: dim, **kwargs)
    index.fit(embeddings)
    index
  end

  # Convert input to an Array, for use by class methods and instance methods.
  #
  # This stays public: a bare `private` does not apply to `def self.` methods,
  # so it was always callable from outside.
  #
  # @param data [Array, #to_a] Input data
  # @return [Array] data itself if already an Array, otherwise data.to_a
  # @raise [ArgumentError] If data does not respond to to_a
  def self.ensure_array(data)
    case data
    when Array
      data
    else
      data.respond_to?(:to_a) ? data.to_a : raise(ArgumentError, "Data must be convertible to Array")
    end
  end

  # Builder pattern for creating HNSW indices
  #
  # Every setter stores one constructor option and returns the builder so
  # calls can be chained; build passes the collected options to HNSW.new.
  class Builder
    def initialize
      @config = {}
    end

    # @param type [Symbol] Distance metric, stored as :space
    # @return [self]
    def space(type)
      @config[:space] = type
      self
    end

    # @param dim [Integer] Vector dimension, stored as :dim
    # @return [self]
    def dimensions(dim)
      @config[:dim] = dim
      self
    end

    # @param n [Integer] Stored as :max_elements
    # @return [self]
    def max_elements(n)
      @config[:max_elements] = n
      self
    end

    # @param m [Integer] Stored as :m
    # @return [self]
    def m_parameter(m)
      @config[:m] = m
      self
    end

    # @param ef [Integer] Stored as :ef_construction
    # @return [self]
    def ef_construction(ef)
      @config[:ef_construction] = ef
      self
    end

    # @param seed [Integer, nil] Stored as :random_seed
    # @return [self]
    def seed(seed)
      @config[:random_seed] = seed
      self
    end

    # @return [HNSW] Index created from the collected options
    def build
      HNSW.new(**@config)
    end
  end

  private

  # Ensure input is a proper array format (instance-side wrapper around
  # HNSW.ensure_array so both entry points share one implementation).
  def ensure_array(data)
    self.class.ensure_array(data)
  end
end
|
|
251
|
+
end
|