clusterkit 0.1.0.pre.1 → 0.1.0.pre.2

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: fd025da9b7f5c97e370d05fb1062484cb99b0aaaa4a7c310eb27df78336c91b4
4
- data.tar.gz: 7665cf847930bc47cb04adc1f466ba09ea33c14da5f242434b08493047945f91
3
+ metadata.gz: 502f5f9d65ce255e4f075ca506ec0a20fddc27f14cce2df67947055e3b92801e
4
+ data.tar.gz: e1f26d070737d8fb4e15eb5ed9231eda7bd7e30473e883837ddfd7c780d54645
5
5
  SHA512:
6
- metadata.gz: a8db6d4738ad99a20887aef90398d4163bdd8bc47bbdb8dda74496adc105602051999e50d2bc6003b4981d16b8121151d71cf55618691262b4a092f3c46a2545
7
- data.tar.gz: 1a6e43a00d19d7fdaf35be6deffa5215e994a1f013b140aa23ac9bbb9d982facde94d710a894cba39e1fbd39bccd7c020b86a99e9d32d2aa256f717844623e5e
6
+ metadata.gz: bccf375389b9c0b98a1426470830b69675b3f7be15ae691366433eaa313270980e71d6232812fb1a2b823a5f4a1370fd9d62e1d54b0b150c491dda19d0c41d63
7
+ data.tar.gz: 5478910b6c42c50aed1f9e8184638a4e0a3903c34136c724f25c2a531385d89f24d599c4b649b6e569a5e043dd2d9a36dfbce5d796d8e42369613831e38dbe0e
data/README.md CHANGED
@@ -38,6 +38,22 @@ This gem would not be possible without these foundational libraries. Please cons
38
38
  - Comparison of different algorithms
39
39
  - Built-in rake tasks for quick experimentation
40
40
 
41
+ ## API Structure
42
+
43
+ ClusterKit organizes its functionality into clear modules:
44
+
45
+ - **`ClusterKit::Dimensionality`** - All dimensionality reduction algorithms
46
+ - `ClusterKit::Dimensionality::UMAP` - UMAP implementation
47
+ - `ClusterKit::Dimensionality::PCA` - PCA implementation
48
+ - `ClusterKit::Dimensionality::SVD` - SVD implementation
49
+ - **`ClusterKit::Clustering`** - All clustering algorithms
50
+ - `ClusterKit::Clustering::KMeans` - K-means clustering
51
+ - `ClusterKit::Clustering::HDBSCAN` - HDBSCAN clustering
52
+ - **`ClusterKit::Utils`** - Utility functions
53
+ - **`ClusterKit::Preprocessing`** - Data preprocessing tools
54
+
55
+ All user-facing classes are in these modules. Implementation details are kept private.
56
+
41
57
  ## Installation
42
58
 
43
59
  Add this line to your application's Gemfile:
@@ -222,6 +238,15 @@ test_data = all_data[150..-1] # Last 50 samples for testing
222
238
  umap.fit(training_data)
223
239
  test_embedded = umap.transform(test_data)
224
240
 
241
+ # Save and load fitted models
242
+ umap.save_model("umap_model.bin") # Save the fitted model
243
+ loaded_umap = ClusterKit::Dimensionality::UMAP.load_model("umap_model.bin") # Load it later
244
+ new_data_embedded = loaded_umap.transform(new_data) # Use loaded model for new data
245
+
246
+ # Save and load transformed data (useful for caching results)
247
+ ClusterKit::Dimensionality::UMAP.save_data(embedded, "embeddings.json")
248
+ cached_embeddings = ClusterKit::Dimensionality::UMAP.load_data("embeddings.json")
249
+
225
250
  # Note: The library automatically adjusts n_neighbors if it's too large for your dataset
226
251
  ```
227
252
 
@@ -9,7 +9,7 @@ module ClusterKit
9
9
  module Dimensionality
10
10
  class UMAP
11
11
  attr_reader :n_components, :n_neighbors, :random_seed, :nb_grad_batch, :nb_sampling_by_edge
12
-
12
+
13
13
  # Initialize a new UMAP instance
14
14
  # @param n_components [Integer] Target number of dimensions (default: 2)
15
15
  # @param n_neighbors [Integer] Number of neighbors for manifold approximation (default: 15)
@@ -18,7 +18,7 @@ module ClusterKit
18
18
  # Controls training iterations - lower = faster but less accurate
19
19
  # @param nb_sampling_by_edge [Integer] Number of negative samples per edge (default: 8)
20
20
  # Controls sampling quality - lower = faster but less accurate
21
- def initialize(n_components: 2, n_neighbors: 15, random_seed: nil,
21
+ def initialize(n_components: 2, n_neighbors: 15, random_seed: nil,
22
22
  nb_grad_batch: 10, nb_sampling_by_edge: 8)
23
23
  @n_components = n_components
24
24
  @n_neighbors = n_neighbors
@@ -29,21 +29,21 @@ module ClusterKit
29
29
  # Don't create RustUMAP yet - will be created in fit/fit_transform with adjusted parameters
30
30
  @rust_umap = nil
31
31
  end
32
-
32
+
33
33
  # Fit the model to the data (training)
34
34
  # @param data [Array<Array<Numeric>>] Training data as 2D array
35
35
  # @return [self] Returns self for method chaining
36
36
  # @note UMAP's training process inherently produces embeddings. Since the
37
- # underlying Rust implementation doesn't separate training from
37
+ # underlying Rust implementation doesn't separate training from
38
38
  # transformation, we call fit_transform but discard the embeddings.
39
39
  # Use fit_transform if you need both training and the transformed data.
40
40
  def fit(data)
41
41
  validate_input(data)
42
-
42
+
43
43
  # Always recreate RustUMAP for fit to ensure fresh fit
44
44
  @rust_umap = nil
45
45
  create_rust_umap_with_adjusted_params(data)
46
-
46
+
47
47
  # UMAP doesn't separate training from transformation internally,
48
48
  # so we call fit_transform but discard the result
49
49
  begin
@@ -59,7 +59,7 @@ module ClusterKit
59
59
  handle_umap_error(RuntimeError.new(e.message), data)
60
60
  end
61
61
  end
62
-
62
+
63
63
  # Transform data using the fitted model
64
64
  # @param data [Array<Array<Numeric>>] Data to transform
65
65
  # @return [Array<Array<Float>>] Transformed data in reduced dimensions
@@ -71,17 +71,17 @@ module ClusterKit
71
71
  @rust_umap.transform(data)
72
72
  end
73
73
  end
74
-
74
+
75
75
  # Fit the model and transform the data in one step
76
76
  # @param data [Array<Array<Numeric>>] Training data as 2D array
77
77
  # @return [Array<Array<Float>>] Transformed data in reduced dimensions
78
78
  def fit_transform(data)
79
79
  validate_input(data)
80
-
80
+
81
81
  # Always recreate RustUMAP for fit_transform to ensure fresh fit
82
82
  @rust_umap = nil
83
83
  create_rust_umap_with_adjusted_params(data)
84
-
84
+
85
85
  begin
86
86
  result = Silence.maybe_silence do
87
87
  @rust_umap.fit_transform(data)
@@ -95,36 +95,36 @@ module ClusterKit
95
95
  handle_umap_error(RuntimeError.new(e.message), data)
96
96
  end
97
97
  end
98
-
98
+
99
99
  # Check if the model has been fitted
100
100
  # @return [Boolean] true if model is fitted, false otherwise
101
101
  def fitted?
102
102
  @fitted
103
103
  end
104
-
104
+
105
105
  # Save the fitted model to a file
106
106
  # @param path [String] Path where to save the model
107
107
  # @raise [RuntimeError] If model hasn't been fitted yet
108
- def save(path)
108
+ def save_model(path)
109
109
  raise RuntimeError, "No model to save. Call fit or fit_transform first." unless fitted?
110
-
110
+
111
111
  # Ensure directory exists
112
112
  dir = File.dirname(path)
113
113
  FileUtils.mkdir_p(dir) unless dir == '.' || dir == '/'
114
-
114
+
115
115
  @rust_umap.save_model(path)
116
116
  end
117
-
117
+
118
118
  # Load a fitted model from a file
119
119
  # @param path [String] Path to the saved model
120
120
  # @return [UMAP] A new UMAP instance with the loaded model
121
121
  # @raise [ArgumentError] If file doesn't exist
122
- def self.load(path)
122
+ def self.load_model(path)
123
123
  raise ArgumentError, "File not found: #{path}" unless File.exist?(path)
124
-
125
- # Load the Rust model
126
- rust_umap = ::ClusterKit::RustUMAP.load_model(path)
127
-
124
+
125
+ # Load the Rust model (access private constant)
126
+ rust_umap = ::ClusterKit.const_get(:RustUMAP).load_model(path)
127
+
128
128
  # Create a new UMAP instance with the loaded model
129
129
  instance = allocate
130
130
  instance.instance_variable_set(:@rust_umap, rust_umap)
@@ -133,172 +133,176 @@ module ClusterKit
133
133
  instance.instance_variable_set(:@n_components, nil)
134
134
  instance.instance_variable_set(:@n_neighbors, nil)
135
135
  instance.instance_variable_set(:@random_seed, nil)
136
-
136
+
137
137
  instance
138
138
  end
139
-
140
- # Export transformed data to JSON (utility method for caching)
141
- # @param data [Array<Array<Float>>] Transformed data to export
139
+
140
+ # Save transformed data to JSON file
141
+ # @param data [Array<Array<Float>>] Transformed data to save
142
142
  # @param path [String] Path where to save the data
143
- def self.export_data(data, path)
143
+ def self.save_data(data, path)
144
+ FileUtils.mkdir_p(File.dirname(path)) unless File.dirname(path) == '.'
144
145
  File.write(path, JSON.pretty_generate(data))
145
146
  end
146
-
147
- # Import transformed data from JSON (utility method for caching)
147
+
148
+ # Load transformed data from JSON file
148
149
  # @param path [String] Path to the saved data
149
150
  # @return [Array<Array<Float>>] The loaded data
150
- def self.import_data(path)
151
+ # @raise [ArgumentError] If file doesn't exist
152
+ def self.load_data(path)
153
+ raise ArgumentError, "File not found: #{path}" unless File.exist?(path)
151
154
  JSON.parse(File.read(path))
152
155
  end
153
-
156
+
154
157
  private
155
-
158
+
156
159
  def handle_umap_error(error, data)
157
160
  error_msg = error.message
158
161
  n_samples = data.size
159
-
162
+
160
163
  case error_msg
161
164
  when /isolated point/i, /graph will not be connected/i
162
165
  raise ::ClusterKit::IsolatedPointError, <<~MSG
163
166
  UMAP found isolated points in your data that are too far from other points.
164
-
167
+
165
168
  This typically happens when:
166
169
  • Your data contains outliers that are very different from other points
167
170
  • You're using random data without inherent structure
168
171
  • The n_neighbors parameter (#{@n_neighbors}) is too high for your data distribution
169
-
172
+
170
173
  Solutions:
171
174
  1. Reduce n_neighbors (try 5 or even 3): UMAP.new(n_neighbors: 5)
172
175
  2. Remove outliers from your data before applying UMAP
173
176
  3. Ensure your data has some structure (not purely random)
174
177
  4. For small datasets (< 50 points), consider using PCA instead
175
-
178
+
176
179
  Your data: #{n_samples} samples, #{data.first&.size || 0} dimensions
177
180
  MSG
178
-
181
+
179
182
  when /assertion failed.*box_size/i
180
183
  raise ::ClusterKit::ConvergenceError, <<~MSG
181
184
  UMAP failed to converge due to numerical instability in your data.
182
-
185
+
183
186
  This typically happens when:
184
187
  • Data points are too spread out or have extreme values
185
188
  • The scale of different features varies wildly
186
189
  • There are duplicate or nearly-duplicate points
187
-
190
+
188
191
  Solutions:
189
192
  1. Normalize your data first: ClusterKit::Preprocessing.normalize(data)
190
- 2. Use a smaller n_neighbors value: UMAP.new(n_neighbors: 5)
193
+ 2. Use a smaller n_neighbors value: UMAP.new(n_neighbors: 5)
191
194
  3. Check for and remove duplicate points
192
195
  4. Scale your data to a reasonable range (e.g., 0-1 or -1 to 1)
193
-
196
+
194
197
  Your data: #{n_samples} samples, #{data.first&.size || 0} dimensions
195
198
  MSG
196
-
199
+
197
200
  when /n_neighbors.*larger than/i, /too many neighbors/i
198
201
  raise ::ClusterKit::InvalidParameterError, <<~MSG
199
202
  The n_neighbors parameter (#{@n_neighbors}) is too large for your dataset size (#{n_samples}).
200
-
203
+
201
204
  UMAP needs n_neighbors to be less than the number of samples.
202
205
  Suggested value: #{[5, (n_samples * 0.1).to_i].max}
203
-
206
+
204
207
  This should have been auto-adjusted. If you're seeing this error, please report it.
205
208
  MSG
206
-
209
+
207
210
  else
208
211
  # For unknown errors, still provide some guidance
209
212
  raise ::ClusterKit::Error, <<~MSG
210
213
  UMAP encountered an error: #{error_msg}
211
-
214
+
212
215
  Common solutions:
213
216
  1. Try reducing n_neighbors (current: #{@n_neighbors})
214
217
  2. Normalize your data first
215
218
  3. Check for NaN or infinite values in your data
216
219
  4. Ensure you have at least 10 data points
217
-
220
+
218
221
  If this persists, consider using PCA for dimensionality reduction instead.
219
222
  MSG
220
223
  end
221
224
  end
222
-
225
+
223
226
  def validate_input(data, check_min_samples: true)
224
227
  raise ArgumentError, "Input must be an array" unless data.is_a?(Array)
225
228
  raise ArgumentError, "Input cannot be empty" if data.empty?
226
-
229
+
227
230
  first_row = data.first
228
231
  raise ArgumentError, "Input must be a 2D array (array of arrays)" unless first_row.is_a?(Array)
229
-
232
+
230
233
  row_length = first_row.length
231
234
  min_val = Float::INFINITY
232
235
  max_val = -Float::INFINITY
233
-
236
+
234
237
  # First validate data structure and types
235
238
  data.each_with_index do |row, i|
236
239
  unless row.is_a?(Array)
237
240
  raise ArgumentError, "Row #{i} is not an array"
238
241
  end
239
-
242
+
240
243
  if row.length != row_length
241
244
  raise ArgumentError, "All rows must have the same length (row #{i} has #{row.length} elements, expected #{row_length})"
242
245
  end
243
-
246
+
244
247
  row.each_with_index do |val, j|
245
248
  unless val.is_a?(Numeric)
246
249
  raise ArgumentError, "Element at position [#{i}, #{j}] is not numeric"
247
250
  end
248
-
251
+
249
252
  # Only check for NaN/Infinite on floats
250
253
  if val.is_a?(Float) && (val.nan? || val.infinite?)
251
254
  raise ArgumentError, "Element at position [#{i}, #{j}] is NaN or Infinite"
252
255
  end
253
-
256
+
254
257
  # Track data range
255
258
  val_f = val.to_f
256
259
  min_val = val_f if val_f < min_val
257
260
  max_val = val_f if val_f > max_val
258
261
  end
259
262
  end
260
-
263
+
261
264
  # Check for sufficient data points after validating structure (only for fit operations)
262
265
  if check_min_samples && data.size < 10
263
266
  raise ::ClusterKit::InsufficientDataError, <<~MSG
264
267
  UMAP requires at least 10 data points, but only #{data.size} provided.
265
-
268
+
266
269
  For small datasets, consider:
267
270
  1. Using PCA instead: ClusterKit::Dimensionality::PCA.new(n_components: 2)
268
271
  2. Collecting more data points
269
272
  3. Using simpler visualization methods
270
273
  MSG
271
274
  end
272
-
275
+
273
276
  # Check for extreme data ranges that might cause numerical issues
274
277
  data_range = max_val - min_val
275
278
  if data_range > 1000
276
279
  warn "WARNING: Large data range detected (#{data_range.round(2)}). Consider normalizing your data to prevent numerical instability."
277
280
  end
278
281
  end
279
-
282
+
280
283
  def create_rust_umap_with_adjusted_params(data)
281
284
  # Only create if not already created
282
285
  return if @rust_umap
283
-
286
+
284
287
  n_samples = data.size
285
-
288
+
286
289
  # Automatically adjust n_neighbors if it's too high for the dataset
287
290
  # n_neighbors should be less than n_samples
288
291
  # Use a reasonable default: min(15, n_samples / 4) but at least 2
289
292
  max_neighbors = [n_samples - 1, 2].max # At least 2, but less than n_samples
290
293
  suggested_neighbors = [[15, n_samples / 4].min.to_i, 2].max
291
-
294
+
292
295
  adjusted_n_neighbors = @n_neighbors
293
296
  if @n_neighbors > max_neighbors
294
297
  adjusted_n_neighbors = [suggested_neighbors, max_neighbors].min
295
-
298
+
296
299
  if ::ClusterKit.configuration.verbose
297
300
  warn "UMAP: Adjusted n_neighbors from #{@n_neighbors} to #{adjusted_n_neighbors} for dataset with #{n_samples} samples"
298
301
  end
299
302
  end
300
-
301
- @rust_umap = ::ClusterKit::RustUMAP.new({
303
+
304
+ # Access the private constant from inside the module
305
+ @rust_umap = ::ClusterKit.const_get(:RustUMAP).new({
302
306
  n_components: @n_components,
303
307
  n_neighbors: adjusted_n_neighbors,
304
308
  random_seed: @random_seed,
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module ClusterKit
4
- VERSION = "0.1.0.pre.1"
4
+ VERSION = "0.1.0.pre.2"
5
5
  end
data/lib/clusterkit.rb CHANGED
@@ -21,20 +21,18 @@ module ClusterKit
21
21
  class DisconnectedGraphError < DataError; end
22
22
  class InsufficientDataError < DataError; end
23
23
 
24
- # Load modules - can't use autoload with require_relative path issues
25
- require_relative "clusterkit/dimensionality"
26
- require_relative "clusterkit/clustering"
27
-
28
24
  # Autoload utilities
29
25
  autoload :Utils, "clusterkit/utils"
30
26
  autoload :Preprocessing, "clusterkit/preprocessing"
31
27
  autoload :Silence, "clusterkit/silence"
32
28
 
33
- # Load the extension first
34
- require_relative "clusterkit/clusterkit"
35
-
36
- # Now load the modules that depend on the extension
29
+ # Load modules that depend on the extension
30
+ require_relative "clusterkit/dimensionality"
37
31
  require_relative "clusterkit/clustering"
32
+
33
+ # Make RustUMAP private - it's an implementation detail
34
+ # Users should use Dimensionality::UMAP instead
35
+ private_constant :RustUMAP if const_defined?(:RustUMAP)
38
36
 
39
37
  class << self
40
38
  # Quick UMAP embedding
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: clusterkit
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.0.pre.1
4
+ version: 0.1.0.pre.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Chris Petersen