clusterkit 0.1.0.pre.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.rspec +3 -0
- data/.simplecov +47 -0
- data/CHANGELOG.md +35 -0
- data/CLAUDE.md +226 -0
- data/Cargo.toml +8 -0
- data/Gemfile +17 -0
- data/IMPLEMENTATION_NOTES.md +143 -0
- data/LICENSE.txt +21 -0
- data/PYTHON_COMPARISON.md +183 -0
- data/README.md +499 -0
- data/Rakefile +245 -0
- data/clusterkit.gemspec +45 -0
- data/docs/KNOWN_ISSUES.md +130 -0
- data/docs/RUST_ERROR_HANDLING.md +164 -0
- data/docs/TEST_FIXTURES.md +170 -0
- data/docs/UMAP_EXPLAINED.md +362 -0
- data/docs/UMAP_TROUBLESHOOTING.md +284 -0
- data/docs/VERBOSE_OUTPUT.md +84 -0
- data/examples/hdbscan_example.rb +147 -0
- data/examples/optimal_kmeans_example.rb +96 -0
- data/examples/pca_example.rb +114 -0
- data/examples/reproducible_umap.rb +99 -0
- data/examples/verbose_control.rb +43 -0
- data/ext/clusterkit/Cargo.toml +25 -0
- data/ext/clusterkit/extconf.rb +4 -0
- data/ext/clusterkit/src/clustering/hdbscan_wrapper.rs +115 -0
- data/ext/clusterkit/src/clustering.rs +267 -0
- data/ext/clusterkit/src/embedder.rs +413 -0
- data/ext/clusterkit/src/lib.rs +22 -0
- data/ext/clusterkit/src/svd.rs +112 -0
- data/ext/clusterkit/src/tests.rs +16 -0
- data/ext/clusterkit/src/utils.rs +33 -0
- data/lib/clusterkit/clustering/hdbscan.rb +177 -0
- data/lib/clusterkit/clustering.rb +213 -0
- data/lib/clusterkit/clusterkit.rb +9 -0
- data/lib/clusterkit/configuration.rb +24 -0
- data/lib/clusterkit/dimensionality/pca.rb +251 -0
- data/lib/clusterkit/dimensionality/svd.rb +144 -0
- data/lib/clusterkit/dimensionality/umap.rb +311 -0
- data/lib/clusterkit/dimensionality.rb +29 -0
- data/lib/clusterkit/hdbscan_api_design.rb +142 -0
- data/lib/clusterkit/preprocessing.rb +106 -0
- data/lib/clusterkit/silence.rb +42 -0
- data/lib/clusterkit/utils.rb +51 -0
- data/lib/clusterkit/version.rb +5 -0
- data/lib/clusterkit.rb +93 -0
- data/lib/tasks/visualize.rake +641 -0
- metadata +194 -0
data/lib/clusterkit/dimensionality/svd.rb
@@ -0,0 +1,144 @@
+# frozen_string_literal: true
+
+require_relative '../clusterkit'
+
+module ClusterKit
+  module Dimensionality
+    # Singular Value Decomposition
+    # Decomposes a matrix into U, S, V^T components
+    class SVD
+      attr_reader :n_components, :n_iter, :random_seed
+      attr_reader :u, :s, :vt
+
+      # Initialize a new SVD instance
+      # @param n_components [Integer] Number of components to compute
+      # @param n_iter [Integer] Number of iterations for randomized algorithm (default: 2)
+      # @param random_seed [Integer, nil] Random seed for reproducibility
+      def initialize(n_components: nil, n_iter: 2, random_seed: nil)
+        @n_components = n_components
+        @n_iter = n_iter
+        @random_seed = random_seed
+        @fitted = false
+      end
+
+      # Fit the model and transform data in one step
+      # @param data [Array<Array<Numeric>>] Input data
+      # @return [Array] Returns [U, S, Vt] matrices
+      def fit_transform(data)
+        validate_input(data)
+
+        # Store reference to original data for transform detection
+        @original_data_id = data.object_id
+
+        # Determine n_components if not set
+        n_comp = @n_components || [data.size, data.first.size].min
+
+        # Call the Rust implementation
+        @u, @s, @vt = self.class.randomized_svd(data, n_comp, n_iter: @n_iter)
+        @fitted = true
+
+        [@u, @s, @vt]
+      end
+
+      # Fit the model to data
+      # @param data [Array<Array<Numeric>>] Input data
+      # @return [self]
+      def fit(data)
+        fit_transform(data)
+        self
+      end
+
+      # Get the U matrix (left singular vectors)
+      # @return [Array<Array<Float>>] U matrix
+      def components_u
+        raise RuntimeError, "Model must be fitted first" unless fitted?
+        @u
+      end
+
+      # Get the singular values
+      # @return [Array<Float>] Singular values
+      def singular_values
+        raise RuntimeError, "Model must be fitted first" unless fitted?
+        @s
+      end
+
+      # Get the V^T matrix (right singular vectors, transposed)
+      # @return [Array<Array<Float>>] V^T matrix
+      def components_vt
+        raise RuntimeError, "Model must be fitted first" unless fitted?
+        @vt
+      end
+
+      # Check if the model has been fitted
+      # @return [Boolean]
+      def fitted?
+        @fitted
+      end
+
+      # Transform data using fitted SVD (project onto components)
+      # @param data [Array<Array<Numeric>>] Data to transform
+      # @return [Array<Array<Float>>] Transformed data (U * S)
+      def transform(data)
+        raise RuntimeError, "Model must be fitted first" unless fitted?
+        validate_input(data)
+
+        # For SVD, transform typically means projecting onto the components.
+        # This is equivalent to data * V (or data * V^T.T), but for
+        # dimensionality reduction we usually want U * S,
+        # which is already computed in fit_transform.
+
+        # If transforming new data, we'd need to project it.
+        # For now, return U * S for the fitted data.
+        if data.object_id == @original_data_id
+          # Same data that was fitted - return U * S
+          @u.map.with_index do |row, i|
+            row.map.with_index { |val, j| val * @s[j] }
+          end
+        else
+          # New data - would need proper projection
+          raise NotImplementedError, "Transform for new data not yet implemented"
+        end
+      end
+
+      # Inverse transform (reconstruct from components)
+      # @param transformed_data [Array<Array<Float>>] Transformed data
+      # @return [Array<Array<Float>>] Reconstructed data
+      def inverse_transform(transformed_data)
+        raise RuntimeError, "Model must be fitted first" unless fitted?
+
+        # Reconstruction: (U * S) * V^T
+        # transformed_data should be U * S;
+        # we multiply by V^T to reconstruct.
+
+        result = []
+        transformed_data.each do |row|
+          reconstructed = Array.new(@vt.first.size, 0.0)
+          row.each_with_index do |val, i|
+            @vt[i].each_with_index do |v, j|
+              reconstructed[j] += val * v
+            end
+          end
+          result << reconstructed
+        end
+        result
+      end
+
+      # Class method for randomized SVD (kept for compatibility)
+      # @param matrix [Array<Array<Numeric>>] Input matrix
+      # @param k [Integer] Number of components
+      # @param n_iter [Integer] Number of iterations
+      # @return [Array] Returns [U, S, Vt]
+      def self.randomized_svd(matrix, k, n_iter: 2)
+        ::ClusterKit::SVD.randomized_svd_rust(matrix, k, n_iter)
+      end
+
+      private
+
+      def validate_input(data)
+        raise ArgumentError, "Input must be an array" unless data.is_a?(Array)
+        raise ArgumentError, "Input cannot be empty" if data.empty?
+        raise ArgumentError, "Input must be a 2D array" unless data.first.is_a?(Array)
+      end
+    end
+  end
+end
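The class follows the familiar fit / fit_transform pattern. A minimal usage sketch based on the API above (the 4x3 matrix is invented for illustration; note that transform only accepts the very array object that was fitted, and raises NotImplementedError for anything else):

  require 'clusterkit'

  data = [
    [1.0, 0.0, 0.0],
    [0.0, 2.0, 0.0],
    [0.0, 0.0, 3.0],
    [1.0, 1.0, 1.0]
  ]

  svd = ClusterKit::Dimensionality::SVD.new(n_components: 2)
  u, s, vt = svd.fit_transform(data)          # left vectors, singular values, V^T

  reduced  = svd.transform(data)              # U * S; same object as fitted, so allowed
  restored = svd.inverse_transform(reduced)   # (U * S) * V^T, approximate reconstruction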
data/lib/clusterkit/dimensionality/umap.rb
@@ -0,0 +1,311 @@
+# frozen_string_literal: true
+
+require 'fileutils'
+require 'json'
+require_relative '../configuration'
+require_relative '../silence'
+
+module ClusterKit
+  module Dimensionality
+    class UMAP
+      attr_reader :n_components, :n_neighbors, :random_seed, :nb_grad_batch, :nb_sampling_by_edge
+
+      # Initialize a new UMAP instance
+      # @param n_components [Integer] Target number of dimensions (default: 2)
+      # @param n_neighbors [Integer] Number of neighbors for manifold approximation (default: 15)
+      # @param random_seed [Integer, nil] Random seed for reproducibility (default: nil)
+      # @param nb_grad_batch [Integer] Number of gradient descent batches (default: 10)
+      #   Controls training iterations - lower = faster but less accurate
+      # @param nb_sampling_by_edge [Integer] Number of negative samples per edge (default: 8)
+      #   Controls sampling quality - lower = faster but less accurate
+      def initialize(n_components: 2, n_neighbors: 15, random_seed: nil,
+                     nb_grad_batch: 10, nb_sampling_by_edge: 8)
+        @n_components = n_components
+        @n_neighbors = n_neighbors
+        @random_seed = random_seed
+        @nb_grad_batch = nb_grad_batch
+        @nb_sampling_by_edge = nb_sampling_by_edge
+        @fitted = false
+        # Don't create RustUMAP yet - it will be created in fit/fit_transform with adjusted parameters
+        @rust_umap = nil
+      end
+
+      # Fit the model to the data (training)
+      # @param data [Array<Array<Numeric>>] Training data as 2D array
+      # @return [self] Returns self for method chaining
+      # @note UMAP's training process inherently produces embeddings. Since the
+      #   underlying Rust implementation doesn't separate training from
+      #   transformation, we call fit_transform but discard the embeddings.
+      #   Use fit_transform if you need both training and the transformed data.
+      def fit(data)
+        validate_input(data)
+
+        # Always recreate RustUMAP for fit to ensure a fresh fit
+        @rust_umap = nil
+        create_rust_umap_with_adjusted_params(data)
+
+        # UMAP doesn't separate training from transformation internally,
+        # so we call fit_transform but discard the result
+        begin
+          Silence.maybe_silence do
+            @rust_umap.fit_transform(data)
+          end
+          @fitted = true
+          self
+        rescue StandardError => e
+          handle_umap_error(e, data)
+        rescue Exception => e
+          # Handle fatal errors that aren't StandardError
+          handle_umap_error(RuntimeError.new(e.message), data)
+        end
+      end
+
+      # Transform data using the fitted model
+      # @param data [Array<Array<Numeric>>] Data to transform
+      # @return [Array<Array<Float>>] Transformed data in reduced dimensions
+      # @raise [RuntimeError] If model hasn't been fitted yet
+      def transform(data)
+        raise RuntimeError, "Model must be fitted before transform. Call fit or fit_transform first." unless fitted?
+        validate_input(data, check_min_samples: false)
+        Silence.maybe_silence do
+          @rust_umap.transform(data)
+        end
+      end
+
+      # Fit the model and transform the data in one step
+      # @param data [Array<Array<Numeric>>] Training data as 2D array
+      # @return [Array<Array<Float>>] Transformed data in reduced dimensions
+      def fit_transform(data)
+        validate_input(data)
+
+        # Always recreate RustUMAP for fit_transform to ensure a fresh fit
+        @rust_umap = nil
+        create_rust_umap_with_adjusted_params(data)
+
+        begin
+          result = Silence.maybe_silence do
+            @rust_umap.fit_transform(data)
+          end
+          @fitted = true
+          result
+        rescue StandardError => e
+          handle_umap_error(e, data)
+        rescue Exception => e
+          # Handle fatal errors that aren't StandardError
+          handle_umap_error(RuntimeError.new(e.message), data)
+        end
+      end
+
+      # Check if the model has been fitted
+      # @return [Boolean] true if model is fitted, false otherwise
+      def fitted?
+        @fitted
+      end
+
+      # Save the fitted model to a file
+      # @param path [String] Path where to save the model
+      # @raise [RuntimeError] If model hasn't been fitted yet
+      def save(path)
+        raise RuntimeError, "No model to save. Call fit or fit_transform first." unless fitted?
+
+        # Ensure directory exists
+        dir = File.dirname(path)
+        FileUtils.mkdir_p(dir) unless dir == '.' || dir == '/'
+
+        @rust_umap.save_model(path)
+      end
+
+      # Load a fitted model from a file
+      # @param path [String] Path to the saved model
+      # @return [UMAP] A new UMAP instance with the loaded model
+      # @raise [ArgumentError] If file doesn't exist
+      def self.load(path)
+        raise ArgumentError, "File not found: #{path}" unless File.exist?(path)
+
+        # Load the Rust model
+        rust_umap = ::ClusterKit::RustUMAP.load_model(path)
+
+        # Create a new UMAP instance with the loaded model
+        instance = allocate
+        instance.instance_variable_set(:@rust_umap, rust_umap)
+        instance.instance_variable_set(:@fitted, true)
+        # The model file should contain these parameters, but for now we don't have access
+        instance.instance_variable_set(:@n_components, nil)
+        instance.instance_variable_set(:@n_neighbors, nil)
+        instance.instance_variable_set(:@random_seed, nil)
+
+        instance
+      end
+
+      # Export transformed data to JSON (utility method for caching)
+      # @param data [Array<Array<Float>>] Transformed data to export
+      # @param path [String] Path where to save the data
+      def self.export_data(data, path)
+        File.write(path, JSON.pretty_generate(data))
+      end
+
+      # Import transformed data from JSON (utility method for caching)
+      # @param path [String] Path to the saved data
+      # @return [Array<Array<Float>>] The loaded data
+      def self.import_data(path)
+        JSON.parse(File.read(path))
+      end
+
+      private
+
+      def handle_umap_error(error, data)
+        error_msg = error.message
+        n_samples = data.size
+
+        case error_msg
+        when /isolated point/i, /graph will not be connected/i
+          raise ::ClusterKit::IsolatedPointError, <<~MSG
+            UMAP found isolated points in your data that are too far from other points.
+
+            This typically happens when:
+            • Your data contains outliers that are very different from other points
+            • You're using random data without inherent structure
+            • The n_neighbors parameter (#{@n_neighbors}) is too high for your data distribution
+
+            Solutions:
+            1. Reduce n_neighbors (try 5 or even 3): UMAP.new(n_neighbors: 5)
+            2. Remove outliers from your data before applying UMAP
+            3. Ensure your data has some structure (not purely random)
+            4. For small datasets (< 50 points), consider using PCA instead
+
+            Your data: #{n_samples} samples, #{data.first&.size || 0} dimensions
+          MSG
+
+        when /assertion failed.*box_size/i
+          raise ::ClusterKit::ConvergenceError, <<~MSG
+            UMAP failed to converge due to numerical instability in your data.
+
+            This typically happens when:
+            • Data points are too spread out or have extreme values
+            • The scale of different features varies wildly
+            • There are duplicate or nearly-duplicate points
+
+            Solutions:
+            1. Normalize your data first: ClusterKit::Preprocessing.normalize(data)
+            2. Use a smaller n_neighbors value: UMAP.new(n_neighbors: 5)
+            3. Check for and remove duplicate points
+            4. Scale your data to a reasonable range (e.g., 0-1 or -1 to 1)
+
+            Your data: #{n_samples} samples, #{data.first&.size || 0} dimensions
+          MSG
+
+        when /n_neighbors.*larger than/i, /too many neighbors/i
+          raise ::ClusterKit::InvalidParameterError, <<~MSG
+            The n_neighbors parameter (#{@n_neighbors}) is too large for your dataset size (#{n_samples}).
+
+            UMAP needs n_neighbors to be less than the number of samples.
+            Suggested value: #{[5, (n_samples * 0.1).to_i].max}
+
+            This should have been auto-adjusted. If you're seeing this error, please report it.
+          MSG
+
+        else
+          # For unknown errors, still provide some guidance
+          raise ::ClusterKit::Error, <<~MSG
+            UMAP encountered an error: #{error_msg}
+
+            Common solutions:
+            1. Try reducing n_neighbors (current: #{@n_neighbors})
+            2. Normalize your data first
+            3. Check for NaN or infinite values in your data
+            4. Ensure you have at least 10 data points
+
+            If this persists, consider using PCA for dimensionality reduction instead.
+          MSG
+        end
+      end
+
+      def validate_input(data, check_min_samples: true)
+        raise ArgumentError, "Input must be an array" unless data.is_a?(Array)
+        raise ArgumentError, "Input cannot be empty" if data.empty?
+
+        first_row = data.first
+        raise ArgumentError, "Input must be a 2D array (array of arrays)" unless first_row.is_a?(Array)
+
+        row_length = first_row.length
+        min_val = Float::INFINITY
+        max_val = -Float::INFINITY
+
+        # First validate data structure and types
+        data.each_with_index do |row, i|
+          unless row.is_a?(Array)
+            raise ArgumentError, "Row #{i} is not an array"
+          end
+
+          if row.length != row_length
+            raise ArgumentError, "All rows must have the same length (row #{i} has #{row.length} elements, expected #{row_length})"
+          end
+
+          row.each_with_index do |val, j|
+            unless val.is_a?(Numeric)
+              raise ArgumentError, "Element at position [#{i}, #{j}] is not numeric"
+            end
+
+            # Only check for NaN/Infinite on floats
+            if val.is_a?(Float) && (val.nan? || val.infinite?)
+              raise ArgumentError, "Element at position [#{i}, #{j}] is NaN or Infinite"
+            end
+
+            # Track data range
+            val_f = val.to_f
+            min_val = val_f if val_f < min_val
+            max_val = val_f if val_f > max_val
+          end
+        end
+
+        # Check for sufficient data points after validating structure (only for fit operations)
+        if check_min_samples && data.size < 10
+          raise ::ClusterKit::InsufficientDataError, <<~MSG
+            UMAP requires at least 10 data points, but only #{data.size} provided.
+
+            For small datasets, consider:
+            1. Using PCA instead: ClusterKit::Dimensionality::PCA.new(n_components: 2)
+            2. Collecting more data points
+            3. Using simpler visualization methods
+          MSG
+        end
+
+        # Check for extreme data ranges that might cause numerical issues
+        data_range = max_val - min_val
+        if data_range > 1000
+          warn "WARNING: Large data range detected (#{data_range.round(2)}). Consider normalizing your data to prevent numerical instability."
+        end
+      end
+
+      def create_rust_umap_with_adjusted_params(data)
+        # Only create if not already created
+        return if @rust_umap
+
+        n_samples = data.size
+
+        # Automatically adjust n_neighbors if it's too high for the dataset:
+        # n_neighbors should be less than n_samples.
+        # Use a reasonable default: min(15, n_samples / 4) but at least 2
+        max_neighbors = [n_samples - 1, 2].max # At least 2, but less than n_samples
+        suggested_neighbors = [[15, n_samples / 4].min.to_i, 2].max
+
+        adjusted_n_neighbors = @n_neighbors
+        if @n_neighbors > max_neighbors
+          adjusted_n_neighbors = [suggested_neighbors, max_neighbors].min
+
+          if ::ClusterKit.configuration.verbose
+            warn "UMAP: Adjusted n_neighbors from #{@n_neighbors} to #{adjusted_n_neighbors} for dataset with #{n_samples} samples"
+          end
+        end
+
+        @rust_umap = ::ClusterKit::RustUMAP.new({
+          n_components: @n_components,
+          n_neighbors: adjusted_n_neighbors,
+          random_seed: @random_seed,
+          nb_grad_batch: @nb_grad_batch,
+          nb_sampling_by_edge: @nb_sampling_by_edge
+        })
+      end
+    end
+  end
+end
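Put together, a typical round trip looks like the sketch below. The data is invented (two loose clusters, so the neighbor graph stays connected; purely random points can trigger the IsolatedPointError path described above), and the model path is an arbitrary example:

  require 'clusterkit'

  # Two loose clusters of 50 points each in 20 dimensions.
  data = Array.new(100) do |i|
    offset = i < 50 ? 0.0 : 5.0
    Array.new(20) { offset + rand }
  end

  umap = ClusterKit::Dimensionality::UMAP.new(n_components: 2, n_neighbors: 15, random_seed: 42)
  embedding = umap.fit_transform(data)              # 100 rows of 2 floats

  umap.save('tmp/umap.model')                       # persists the underlying Rust model
  loaded = ClusterKit::Dimensionality::UMAP.load('tmp/umap.model')
  more   = loaded.transform(data.first(5))          # transform skips the 10-sample minimum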
data/lib/clusterkit/dimensionality.rb
@@ -0,0 +1,29 @@
+# frozen_string_literal: true
+
+module ClusterKit
+  # Module for dimensionality reduction algorithms
+  module Dimensionality
+    # Load classes eagerly - autoload doesn't play well with the require ordering here
+    require_relative "dimensionality/umap"
+    require_relative "dimensionality/pca"
+    require_relative "dimensionality/svd"
+
+    # Module-level evaluation methods
+
+    # Calculate reconstruction error for a dimensionality reduction
+    # @param original_data [Array<Array<Numeric>>] Original high-dimensional data
+    # @param reconstructed_data [Array<Array<Numeric>>] Reconstructed data
+    # @return [Float] Mean squared reconstruction error
+    def self.reconstruction_error(original_data, reconstructed_data)
+      raise ArgumentError, "Data sizes don't match" if original_data.size != reconstructed_data.size
+
+      total_error = 0.0
+      original_data.zip(reconstructed_data).each do |orig, recon|
+        error = orig.zip(recon).map { |o, r| (o - r) ** 2 }.sum
+        total_error += error
+      end
+
+      total_error / original_data.size
+    end
+  end
+end
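reconstruction_error pairs naturally with the SVD round trip shown earlier: it sums the squared differences per row and averages over rows. A sketch reusing the 4x3 data array from the SVD example above:

  svd = ClusterKit::Dimensionality::SVD.new(n_components: 2)
  svd.fit(data)
  restored = svd.inverse_transform(svd.transform(data))

  mse = ClusterKit::Dimensionality.reconstruction_error(data, restored)
  puts format('mean squared reconstruction error: %.6f', mse)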
data/lib/clusterkit/hdbscan_api_design.rb
@@ -0,0 +1,142 @@
+# API Design for HDBSCAN to match KMeans pattern
+
+module ClusterKit
+  module Clustering
+
+    # HDBSCAN clustering algorithm - matching KMeans API pattern
+    class HDBSCAN
+      attr_reader :min_samples, :min_cluster_size, :labels, :probabilities,
+                  :outlier_scores, :cluster_persistence
+
+      # Initialize HDBSCAN clusterer (matches KMeans pattern)
+      # @param min_samples [Integer] Min neighborhood size for core points (default: 5)
+      # @param min_cluster_size [Integer] Minimum size of clusters (default: 5)
+      # @param metric [String] Distance metric (default: 'euclidean')
+      def initialize(min_samples: 5, min_cluster_size: 5, metric: 'euclidean')
+        raise ArgumentError, "min_samples must be positive" unless min_samples > 0
+        raise ArgumentError, "min_cluster_size must be positive" unless min_cluster_size > 0
+        @min_samples = min_samples
+        @min_cluster_size = min_cluster_size
+        @metric = metric
+        @fitted = false
+      end
+
+      # Fit the HDBSCAN model (matches KMeans.fit)
+      # @param data [Array] 2D array of data points
+      # @return [self] Returns self for method chaining
+      def fit(data)
+        validate_data(data)
+
+        # Call Rust implementation (hdbscan crate)
+        result = Clustering.hdbscan_rust(data, @min_samples, @min_cluster_size, @metric)
+
+        @labels = result[:labels]
+        @probabilities = result[:probabilities]
+        @outlier_scores = result[:outlier_scores]
+        @cluster_persistence = result[:cluster_persistence]
+        @fitted = true
+
+        self
+      end
+
+      # HDBSCAN doesn't support predict for new points (unlike KMeans),
+      # but we keep the method for API consistency
+      # @param data [Array] 2D array of data points
+      # @return [Array] Returns nil or raises
+      def predict(data)
+        raise NotImplementedError, "HDBSCAN does not support prediction on new data. " \
+                                   "Use approximate_predict for approximate membership"
+      end
+
+      # Fit the model and return labels (matches KMeans.fit_predict)
+      # @param data [Array] 2D array of data points
+      # @return [Array] Cluster labels (-1 for noise)
+      def fit_predict(data)
+        fit(data)
+        @labels
+      end
+
+      # Check if model has been fitted (matches KMeans.fitted?)
+      # @return [Boolean] True if fitted
+      def fitted?
+        @fitted
+      end
+
+      # Get number of clusters found (similar to KMeans.k, but discovered)
+      # @return [Integer] Number of clusters (excluding noise)
+      def n_clusters
+        return 0 unless fitted?
+        @labels.max + 1 rescue 0
+      end
+
+      # Get noise ratio (HDBSCAN-specific but follows naming pattern)
+      # @return [Float] Fraction of points labeled as noise
+      def noise_ratio
+        return 0.0 unless fitted?
+        @labels.count(-1).to_f / @labels.length
+      end
+
+      private
+
+      def validate_data(data)
+        # Exact same validation as KMeans for consistency
+        raise ArgumentError, "Data must be an array" unless data.is_a?(Array)
+        raise ArgumentError, "Data cannot be empty" if data.empty?
+        raise ArgumentError, "Data must be 2D array" unless data.first.is_a?(Array)
+
+        row_length = data.first.length
+        unless data.all? { |row| row.is_a?(Array) && row.length == row_length }
+          raise ArgumentError, "All rows must have the same length"
+        end
+
+        data.each_with_index do |row, i|
+          row.each_with_index do |val, j|
+            unless val.is_a?(Numeric)
+              raise ArgumentError, "Element at position [#{i}, #{j}] is not numeric"
+            end
+          end
+        end
+      end
+    end
+
+    # Module-level convenience methods (matching KMeans pattern)
+    class << self
+      # Perform HDBSCAN clustering (matches Clustering.kmeans signature)
+      # @param data [Array] 2D array of data points
+      # @param min_samples [Integer] Min neighborhood size for core points
+      # @param min_cluster_size [Integer] Minimum size of clusters
+      # @return [Hash] Result hash with :labels, :probabilities, :outlier_scores
+      def hdbscan(data, min_samples: 5, min_cluster_size: 5)
+        clusterer = HDBSCAN.new(min_samples: min_samples, min_cluster_size: min_cluster_size)
+        clusterer.fit(data)
+        {
+          labels: clusterer.labels,
+          probabilities: clusterer.probabilities,
+          outlier_scores: clusterer.outlier_scores,
+          n_clusters: clusterer.n_clusters,
+          noise_ratio: clusterer.noise_ratio
+        }
+      end
+    end
+  end
+end
+
+# Usage comparison (examples only; `data` is not defined in this file):
+
+# KMeans usage:
+#   kmeans = ClusterKit::Clustering::KMeans.new(k: 3)
+#   kmeans.fit(data)
+#   labels = kmeans.labels
+#   # or
+#   labels = kmeans.fit_predict(data)
+
+# HDBSCAN usage (identical pattern):
+#   hdbscan = ClusterKit::Clustering::HDBSCAN.new(min_samples: 5, min_cluster_size: 5)
+#   hdbscan.fit(data)
+#   labels = hdbscan.labels
+#   # or
+#   labels = hdbscan.fit_predict(data)
+
+# Module-level convenience (both follow same pattern):
+#   result = ClusterKit::Clustering.kmeans(data, 3)
+#   result = ClusterKit::Clustering.hdbscan(data, min_samples: 5)