clusterkit 0.1.0.pre.1 → 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +25 -0
- data/lib/clusterkit/dimensionality/umap.rb +68 -64
- data/lib/clusterkit/version.rb +1 -1
- data/lib/clusterkit.rb +6 -8
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 4df777f3b01fea2f411cde6233b2abd5c1ab01903117ee8f377a1ae67bb6510b
|
4
|
+
data.tar.gz: cbf92ed9e86d14a3959b81f348e69f4eb2fa3bcd92d86149c4872ba9a197f7c6
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: bae2d9fac87d5cb27458ae8ace71edffe754216ea5b9af088c98fecfc4489dc3f572ca45e7d926ed658516dac2ab18b6bcf04fc138daf48a01d17569f5698521
|
7
|
+
data.tar.gz: 560c59116a016c60dae79ed3817158005551c8a5e6a1d7d7b47a67cdb87a0c566d07db7771bcad0507e21111d380b1b4795bd85a5e75d5e00ce1229a181704b9
|
data/README.md
CHANGED
@@ -38,6 +38,22 @@ This gem would not be possible without these foundational libraries. Please cons
|
|
38
38
|
- Comparison of different algorithms
|
39
39
|
- Built-in rake tasks for quick experimentation
|
40
40
|
|
41
|
+
## API Structure
|
42
|
+
|
43
|
+
ClusterKit organizes its functionality into clear modules:
|
44
|
+
|
45
|
+
- **`ClusterKit::Dimensionality`** - All dimensionality reduction algorithms
|
46
|
+
- `ClusterKit::Dimensionality::UMAP` - UMAP implementation
|
47
|
+
- `ClusterKit::Dimensionality::PCA` - PCA implementation
|
48
|
+
- `ClusterKit::Dimensionality::SVD` - SVD implementation
|
49
|
+
- **`ClusterKit::Clustering`** - All clustering algorithms
|
50
|
+
- `ClusterKit::Clustering::KMeans` - K-means clustering
|
51
|
+
- `ClusterKit::Clustering::HDBSCAN` - HDBSCAN clustering
|
52
|
+
- **`ClusterKit::Utils`** - Utility functions
|
53
|
+
- **`ClusterKit::Preprocessing`** - Data preprocessing tools
|
54
|
+
|
55
|
+
All user-facing classes are in these modules. Implementation details are kept private.
|
56
|
+
|
41
57
|
## Installation
|
42
58
|
|
43
59
|
Add this line to your application's Gemfile:
|
@@ -222,6 +238,15 @@ test_data = all_data[150..-1] # Last 50 samples for testing
|
|
222
238
|
umap.fit(training_data)
|
223
239
|
test_embedded = umap.transform(test_data)
|
224
240
|
|
241
|
+
# Save and load fitted models
|
242
|
+
umap.save_model("umap_model.bin") # Save the fitted model
|
243
|
+
loaded_umap = ClusterKit::Dimensionality::UMAP.load_model("umap_model.bin") # Load it later
|
244
|
+
new_data_embedded = loaded_umap.transform(new_data) # Use loaded model for new data
|
245
|
+
|
246
|
+
# Save and load transformed data (useful for caching results)
|
247
|
+
ClusterKit::Dimensionality::UMAP.save_data(embedded, "embeddings.json")
|
248
|
+
cached_embeddings = ClusterKit::Dimensionality::UMAP.load_data("embeddings.json")
|
249
|
+
|
225
250
|
# Note: The library automatically adjusts n_neighbors if it's too large for your dataset
|
226
251
|
```
|
227
252
|
|
@@ -9,7 +9,7 @@ module ClusterKit
|
|
9
9
|
module Dimensionality
|
10
10
|
class UMAP
|
11
11
|
attr_reader :n_components, :n_neighbors, :random_seed, :nb_grad_batch, :nb_sampling_by_edge
|
12
|
-
|
12
|
+
|
13
13
|
# Initialize a new UMAP instance
|
14
14
|
# @param n_components [Integer] Target number of dimensions (default: 2)
|
15
15
|
# @param n_neighbors [Integer] Number of neighbors for manifold approximation (default: 15)
|
@@ -18,7 +18,7 @@ module ClusterKit
|
|
18
18
|
# Controls training iterations - lower = faster but less accurate
|
19
19
|
# @param nb_sampling_by_edge [Integer] Number of negative samples per edge (default: 8)
|
20
20
|
# Controls sampling quality - lower = faster but less accurate
|
21
|
-
def initialize(n_components: 2, n_neighbors: 15, random_seed: nil,
|
21
|
+
def initialize(n_components: 2, n_neighbors: 15, random_seed: nil,
|
22
22
|
nb_grad_batch: 10, nb_sampling_by_edge: 8)
|
23
23
|
@n_components = n_components
|
24
24
|
@n_neighbors = n_neighbors
|
@@ -29,21 +29,21 @@ module ClusterKit
|
|
29
29
|
# Don't create RustUMAP yet - will be created in fit/fit_transform with adjusted parameters
|
30
30
|
@rust_umap = nil
|
31
31
|
end
|
32
|
-
|
32
|
+
|
33
33
|
# Fit the model to the data (training)
|
34
34
|
# @param data [Array<Array<Numeric>>] Training data as 2D array
|
35
35
|
# @return [self] Returns self for method chaining
|
36
36
|
# @note UMAP's training process inherently produces embeddings. Since the
|
37
|
-
# underlying Rust implementation doesn't separate training from
|
37
|
+
# underlying Rust implementation doesn't separate training from
|
38
38
|
# transformation, we call fit_transform but discard the embeddings.
|
39
39
|
# Use fit_transform if you need both training and the transformed data.
|
40
40
|
def fit(data)
|
41
41
|
validate_input(data)
|
42
|
-
|
42
|
+
|
43
43
|
# Always recreate RustUMAP for fit to ensure fresh fit
|
44
44
|
@rust_umap = nil
|
45
45
|
create_rust_umap_with_adjusted_params(data)
|
46
|
-
|
46
|
+
|
47
47
|
# UMAP doesn't separate training from transformation internally,
|
48
48
|
# so we call fit_transform but discard the result
|
49
49
|
begin
|
@@ -59,7 +59,7 @@ module ClusterKit
|
|
59
59
|
handle_umap_error(RuntimeError.new(e.message), data)
|
60
60
|
end
|
61
61
|
end
|
62
|
-
|
62
|
+
|
63
63
|
# Transform data using the fitted model
|
64
64
|
# @param data [Array<Array<Numeric>>] Data to transform
|
65
65
|
# @return [Array<Array<Float>>] Transformed data in reduced dimensions
|
@@ -71,17 +71,17 @@ module ClusterKit
|
|
71
71
|
@rust_umap.transform(data)
|
72
72
|
end
|
73
73
|
end
|
74
|
-
|
74
|
+
|
75
75
|
# Fit the model and transform the data in one step
|
76
76
|
# @param data [Array<Array<Numeric>>] Training data as 2D array
|
77
77
|
# @return [Array<Array<Float>>] Transformed data in reduced dimensions
|
78
78
|
def fit_transform(data)
|
79
79
|
validate_input(data)
|
80
|
-
|
80
|
+
|
81
81
|
# Always recreate RustUMAP for fit_transform to ensure fresh fit
|
82
82
|
@rust_umap = nil
|
83
83
|
create_rust_umap_with_adjusted_params(data)
|
84
|
-
|
84
|
+
|
85
85
|
begin
|
86
86
|
result = Silence.maybe_silence do
|
87
87
|
@rust_umap.fit_transform(data)
|
@@ -95,36 +95,36 @@ module ClusterKit
|
|
95
95
|
handle_umap_error(RuntimeError.new(e.message), data)
|
96
96
|
end
|
97
97
|
end
|
98
|
-
|
98
|
+
|
99
99
|
# Check if the model has been fitted
|
100
100
|
# @return [Boolean] true if model is fitted, false otherwise
|
101
101
|
def fitted?
|
102
102
|
@fitted
|
103
103
|
end
|
104
|
-
|
104
|
+
|
105
105
|
# Save the fitted model to a file
|
106
106
|
# @param path [String] Path where to save the model
|
107
107
|
# @raise [RuntimeError] If model hasn't been fitted yet
|
108
|
-
def
|
108
|
+
def save_model(path)
|
109
109
|
raise RuntimeError, "No model to save. Call fit or fit_transform first." unless fitted?
|
110
|
-
|
110
|
+
|
111
111
|
# Ensure directory exists
|
112
112
|
dir = File.dirname(path)
|
113
113
|
FileUtils.mkdir_p(dir) unless dir == '.' || dir == '/'
|
114
|
-
|
114
|
+
|
115
115
|
@rust_umap.save_model(path)
|
116
116
|
end
|
117
|
-
|
117
|
+
|
118
118
|
# Load a fitted model from a file
|
119
119
|
# @param path [String] Path to the saved model
|
120
120
|
# @return [UMAP] A new UMAP instance with the loaded model
|
121
121
|
# @raise [ArgumentError] If file doesn't exist
|
122
|
-
def self.
|
122
|
+
def self.load_model(path)
|
123
123
|
raise ArgumentError, "File not found: #{path}" unless File.exist?(path)
|
124
|
-
|
125
|
-
# Load the Rust model
|
126
|
-
rust_umap = ::ClusterKit
|
127
|
-
|
124
|
+
|
125
|
+
# Load the Rust model (access private constant)
|
126
|
+
rust_umap = ::ClusterKit.const_get(:RustUMAP).load_model(path)
|
127
|
+
|
128
128
|
# Create a new UMAP instance with the loaded model
|
129
129
|
instance = allocate
|
130
130
|
instance.instance_variable_set(:@rust_umap, rust_umap)
|
@@ -133,172 +133,176 @@ module ClusterKit
|
|
133
133
|
instance.instance_variable_set(:@n_components, nil)
|
134
134
|
instance.instance_variable_set(:@n_neighbors, nil)
|
135
135
|
instance.instance_variable_set(:@random_seed, nil)
|
136
|
-
|
136
|
+
|
137
137
|
instance
|
138
138
|
end
|
139
|
-
|
140
|
-
#
|
141
|
-
# @param data [Array<Array<Float>>] Transformed data to
|
139
|
+
|
140
|
+
# Save transformed data to JSON file
|
141
|
+
# @param data [Array<Array<Float>>] Transformed data to save
|
142
142
|
# @param path [String] Path where to save the data
|
143
|
-
def self.
|
143
|
+
def self.save_data(data, path)
|
144
|
+
FileUtils.mkdir_p(File.dirname(path)) unless File.dirname(path) == '.'
|
144
145
|
File.write(path, JSON.pretty_generate(data))
|
145
146
|
end
|
146
|
-
|
147
|
-
#
|
147
|
+
|
148
|
+
# Load transformed data from JSON file
|
148
149
|
# @param path [String] Path to the saved data
|
149
150
|
# @return [Array<Array<Float>>] The loaded data
|
150
|
-
|
151
|
+
# @raise [ArgumentError] If file doesn't exist
|
152
|
+
def self.load_data(path)
|
153
|
+
raise ArgumentError, "File not found: #{path}" unless File.exist?(path)
|
151
154
|
JSON.parse(File.read(path))
|
152
155
|
end
|
153
|
-
|
156
|
+
|
154
157
|
private
|
155
|
-
|
158
|
+
|
156
159
|
def handle_umap_error(error, data)
|
157
160
|
error_msg = error.message
|
158
161
|
n_samples = data.size
|
159
|
-
|
162
|
+
|
160
163
|
case error_msg
|
161
164
|
when /isolated point/i, /graph will not be connected/i
|
162
165
|
raise ::ClusterKit::IsolatedPointError, <<~MSG
|
163
166
|
UMAP found isolated points in your data that are too far from other points.
|
164
|
-
|
167
|
+
|
165
168
|
This typically happens when:
|
166
169
|
• Your data contains outliers that are very different from other points
|
167
170
|
• You're using random data without inherent structure
|
168
171
|
• The n_neighbors parameter (#{@n_neighbors}) is too high for your data distribution
|
169
|
-
|
172
|
+
|
170
173
|
Solutions:
|
171
174
|
1. Reduce n_neighbors (try 5 or even 3): UMAP.new(n_neighbors: 5)
|
172
175
|
2. Remove outliers from your data before applying UMAP
|
173
176
|
3. Ensure your data has some structure (not purely random)
|
174
177
|
4. For small datasets (< 50 points), consider using PCA instead
|
175
|
-
|
178
|
+
|
176
179
|
Your data: #{n_samples} samples, #{data.first&.size || 0} dimensions
|
177
180
|
MSG
|
178
|
-
|
181
|
+
|
179
182
|
when /assertion failed.*box_size/i
|
180
183
|
raise ::ClusterKit::ConvergenceError, <<~MSG
|
181
184
|
UMAP failed to converge due to numerical instability in your data.
|
182
|
-
|
185
|
+
|
183
186
|
This typically happens when:
|
184
187
|
• Data points are too spread out or have extreme values
|
185
188
|
• The scale of different features varies wildly
|
186
189
|
• There are duplicate or nearly-duplicate points
|
187
|
-
|
190
|
+
|
188
191
|
Solutions:
|
189
192
|
1. Normalize your data first: ClusterKit::Preprocessing.normalize(data)
|
190
|
-
2. Use a smaller n_neighbors value: UMAP.new(n_neighbors: 5)
|
193
|
+
2. Use a smaller n_neighbors value: UMAP.new(n_neighbors: 5)
|
191
194
|
3. Check for and remove duplicate points
|
192
195
|
4. Scale your data to a reasonable range (e.g., 0-1 or -1 to 1)
|
193
|
-
|
196
|
+
|
194
197
|
Your data: #{n_samples} samples, #{data.first&.size || 0} dimensions
|
195
198
|
MSG
|
196
|
-
|
199
|
+
|
197
200
|
when /n_neighbors.*larger than/i, /too many neighbors/i
|
198
201
|
raise ::ClusterKit::InvalidParameterError, <<~MSG
|
199
202
|
The n_neighbors parameter (#{@n_neighbors}) is too large for your dataset size (#{n_samples}).
|
200
|
-
|
203
|
+
|
201
204
|
UMAP needs n_neighbors to be less than the number of samples.
|
202
205
|
Suggested value: #{[5, (n_samples * 0.1).to_i].max}
|
203
|
-
|
206
|
+
|
204
207
|
This should have been auto-adjusted. If you're seeing this error, please report it.
|
205
208
|
MSG
|
206
|
-
|
209
|
+
|
207
210
|
else
|
208
211
|
# For unknown errors, still provide some guidance
|
209
212
|
raise ::ClusterKit::Error, <<~MSG
|
210
213
|
UMAP encountered an error: #{error_msg}
|
211
|
-
|
214
|
+
|
212
215
|
Common solutions:
|
213
216
|
1. Try reducing n_neighbors (current: #{@n_neighbors})
|
214
217
|
2. Normalize your data first
|
215
218
|
3. Check for NaN or infinite values in your data
|
216
219
|
4. Ensure you have at least 10 data points
|
217
|
-
|
220
|
+
|
218
221
|
If this persists, consider using PCA for dimensionality reduction instead.
|
219
222
|
MSG
|
220
223
|
end
|
221
224
|
end
|
222
|
-
|
225
|
+
|
223
226
|
def validate_input(data, check_min_samples: true)
|
224
227
|
raise ArgumentError, "Input must be an array" unless data.is_a?(Array)
|
225
228
|
raise ArgumentError, "Input cannot be empty" if data.empty?
|
226
|
-
|
229
|
+
|
227
230
|
first_row = data.first
|
228
231
|
raise ArgumentError, "Input must be a 2D array (array of arrays)" unless first_row.is_a?(Array)
|
229
|
-
|
232
|
+
|
230
233
|
row_length = first_row.length
|
231
234
|
min_val = Float::INFINITY
|
232
235
|
max_val = -Float::INFINITY
|
233
|
-
|
236
|
+
|
234
237
|
# First validate data structure and types
|
235
238
|
data.each_with_index do |row, i|
|
236
239
|
unless row.is_a?(Array)
|
237
240
|
raise ArgumentError, "Row #{i} is not an array"
|
238
241
|
end
|
239
|
-
|
242
|
+
|
240
243
|
if row.length != row_length
|
241
244
|
raise ArgumentError, "All rows must have the same length (row #{i} has #{row.length} elements, expected #{row_length})"
|
242
245
|
end
|
243
|
-
|
246
|
+
|
244
247
|
row.each_with_index do |val, j|
|
245
248
|
unless val.is_a?(Numeric)
|
246
249
|
raise ArgumentError, "Element at position [#{i}, #{j}] is not numeric"
|
247
250
|
end
|
248
|
-
|
251
|
+
|
249
252
|
# Only check for NaN/Infinite on floats
|
250
253
|
if val.is_a?(Float) && (val.nan? || val.infinite?)
|
251
254
|
raise ArgumentError, "Element at position [#{i}, #{j}] is NaN or Infinite"
|
252
255
|
end
|
253
|
-
|
256
|
+
|
254
257
|
# Track data range
|
255
258
|
val_f = val.to_f
|
256
259
|
min_val = val_f if val_f < min_val
|
257
260
|
max_val = val_f if val_f > max_val
|
258
261
|
end
|
259
262
|
end
|
260
|
-
|
263
|
+
|
261
264
|
# Check for sufficient data points after validating structure (only for fit operations)
|
262
265
|
if check_min_samples && data.size < 10
|
263
266
|
raise ::ClusterKit::InsufficientDataError, <<~MSG
|
264
267
|
UMAP requires at least 10 data points, but only #{data.size} provided.
|
265
|
-
|
268
|
+
|
266
269
|
For small datasets, consider:
|
267
270
|
1. Using PCA instead: ClusterKit::Dimensionality::PCA.new(n_components: 2)
|
268
271
|
2. Collecting more data points
|
269
272
|
3. Using simpler visualization methods
|
270
273
|
MSG
|
271
274
|
end
|
272
|
-
|
275
|
+
|
273
276
|
# Check for extreme data ranges that might cause numerical issues
|
274
277
|
data_range = max_val - min_val
|
275
278
|
if data_range > 1000
|
276
279
|
warn "WARNING: Large data range detected (#{data_range.round(2)}). Consider normalizing your data to prevent numerical instability."
|
277
280
|
end
|
278
281
|
end
|
279
|
-
|
282
|
+
|
280
283
|
def create_rust_umap_with_adjusted_params(data)
|
281
284
|
# Only create if not already created
|
282
285
|
return if @rust_umap
|
283
|
-
|
286
|
+
|
284
287
|
n_samples = data.size
|
285
|
-
|
288
|
+
|
286
289
|
# Automatically adjust n_neighbors if it's too high for the dataset
|
287
290
|
# n_neighbors should be less than n_samples
|
288
291
|
# Use a reasonable default: min(15, n_samples / 4) but at least 2
|
289
292
|
max_neighbors = [n_samples - 1, 2].max # At least 2, but less than n_samples
|
290
293
|
suggested_neighbors = [[15, n_samples / 4].min.to_i, 2].max
|
291
|
-
|
294
|
+
|
292
295
|
adjusted_n_neighbors = @n_neighbors
|
293
296
|
if @n_neighbors > max_neighbors
|
294
297
|
adjusted_n_neighbors = [suggested_neighbors, max_neighbors].min
|
295
|
-
|
298
|
+
|
296
299
|
if ::ClusterKit.configuration.verbose
|
297
300
|
warn "UMAP: Adjusted n_neighbors from #{@n_neighbors} to #{adjusted_n_neighbors} for dataset with #{n_samples} samples"
|
298
301
|
end
|
299
302
|
end
|
300
|
-
|
301
|
-
|
303
|
+
|
304
|
+
# Access the private constant from inside the module
|
305
|
+
@rust_umap = ::ClusterKit.const_get(:RustUMAP).new({
|
302
306
|
n_components: @n_components,
|
303
307
|
n_neighbors: adjusted_n_neighbors,
|
304
308
|
random_seed: @random_seed,
|
data/lib/clusterkit/version.rb
CHANGED
data/lib/clusterkit.rb
CHANGED
@@ -21,20 +21,18 @@ module ClusterKit
|
|
21
21
|
class DisconnectedGraphError < DataError; end
|
22
22
|
class InsufficientDataError < DataError; end
|
23
23
|
|
24
|
-
# Load modules - can't use autoload with require_relative path issues
|
25
|
-
require_relative "clusterkit/dimensionality"
|
26
|
-
require_relative "clusterkit/clustering"
|
27
|
-
|
28
24
|
# Autoload utilities
|
29
25
|
autoload :Utils, "clusterkit/utils"
|
30
26
|
autoload :Preprocessing, "clusterkit/preprocessing"
|
31
27
|
autoload :Silence, "clusterkit/silence"
|
32
28
|
|
33
|
-
# Load the extension
|
34
|
-
require_relative "clusterkit/
|
35
|
-
|
36
|
-
# Now load the modules that depend on the extension
|
29
|
+
# Load modules that depend on the extension
|
30
|
+
require_relative "clusterkit/dimensionality"
|
37
31
|
require_relative "clusterkit/clustering"
|
32
|
+
|
33
|
+
# Make RustUMAP private - it's an implementation detail
|
34
|
+
# Users should use Dimensionality::UMAP instead
|
35
|
+
private_constant :RustUMAP if const_defined?(:RustUMAP)
|
38
36
|
|
39
37
|
class << self
|
40
38
|
# Quick UMAP embedding
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: clusterkit
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.0.pre.1
|
4
|
+
version: 0.1.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Chris Petersen
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2025-08-
|
11
|
+
date: 2025-08-21 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: csv
|