clusterkit 0.1.0 → 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Cargo.lock +3236 -0
- data/README.md +227 -7
- data/docs/KNOWN_ISSUES.md +5 -5
- data/docs/RUST_ERROR_HANDLING.md +6 -6
- data/docs/assets/clusterkit-wide.png +0 -0
- data/docs/assets/clusterkit.png +0 -0
- data/docs/assets/visualization.png +0 -0
- data/ext/clusterkit/Cargo.toml +5 -4
- data/ext/clusterkit/extconf.rb +9 -1
- data/ext/clusterkit/src/clustering/hdbscan_wrapper.rs +27 -62
- data/ext/clusterkit/src/clustering.rs +68 -114
- data/ext/clusterkit/src/embedder.rs +48 -131
- data/ext/clusterkit/src/hnsw.rs +579 -0
- data/ext/clusterkit/src/lib.rs +7 -5
- data/ext/clusterkit/src/svd.rs +35 -58
- data/ext/clusterkit/src/utils.rs +159 -9
- data/lib/clusterkit/clustering/hdbscan.rb +4 -17
- data/lib/clusterkit/clustering.rb +4 -23
- data/lib/clusterkit/data_validator.rb +132 -0
- data/lib/clusterkit/dimensionality/pca.rb +12 -12
- data/lib/clusterkit/dimensionality/svd.rb +47 -16
- data/lib/clusterkit/dimensionality/umap.rb +7 -40
- data/lib/clusterkit/hnsw.rb +251 -0
- data/lib/clusterkit/version.rb +1 -1
- data/lib/clusterkit.rb +2 -1
- metadata +40 -20
- data/clusterkit.gemspec +0 -45
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
3
|
require_relative '../clusterkit'
|
|
4
|
+
require_relative '../data_validator'
|
|
4
5
|
|
|
5
6
|
module ClusterKit
|
|
6
7
|
module Dimensionality
|
|
@@ -8,7 +9,7 @@ module ClusterKit
|
|
|
8
9
|
# Decomposes a matrix into U, S, V^T components
|
|
9
10
|
class SVD
|
|
10
11
|
attr_reader :n_components, :n_iter, :random_seed
|
|
11
|
-
attr_reader :u, :s, :vt
|
|
12
|
+
attr_reader :u, :s, :vt, :n_features
|
|
12
13
|
|
|
13
14
|
# Initialize a new SVD instance
|
|
14
15
|
# @param n_components [Integer] Number of components to compute
|
|
@@ -27,7 +28,8 @@ module ClusterKit
|
|
|
27
28
|
def fit_transform(data)
|
|
28
29
|
validate_input(data)
|
|
29
30
|
|
|
30
|
-
# Store
|
|
31
|
+
# Store data characteristics for later transform operations
|
|
32
|
+
@n_features = data.first.size
|
|
31
33
|
@original_data_id = data.object_id
|
|
32
34
|
|
|
33
35
|
# Determine n_components if not set
|
|
@@ -77,26 +79,21 @@ module ClusterKit
|
|
|
77
79
|
|
|
78
80
|
# Transform data using fitted SVD (project onto components)
#
# Two paths, chosen by object identity (not value equality) against the
# object_id recorded by fit_transform:
# * the very same object that was fitted -> U * S, reusing the stored factors
# * any other object (including an equal-valued copy) -> data × V, computed
#   by transform_new_data from the stored V^T
#
# @param data [Array<Array<Numeric>>] Data to transform
# @return [Array<Array<Float>>] Transformed data projected onto SVD components
# @raise [RuntimeError] if the model has not been fitted
# @raise [ArgumentError] if validate_transform_input rejects the data
#   (e.g. feature count differs from the fitted data)
def transform(data)
  raise RuntimeError, "Model must be fitted first" unless fitted?

  validate_transform_input(data)

  # New data - project onto the components: data × V (V is the transpose of @vt)
  return transform_new_data(data) unless data.object_id == @original_data_id

  # Same data that was fitted - return U * S:
  # scale column j of every row of U by the j-th singular value
  @u.map do |row|
    row.map.with_index { |val, j| val * @s[j] }
  end
end
|
|
102
99
|
|
|
@@ -135,9 +132,43 @@ module ClusterKit
|
|
|
135
132
|
private
|
|
136
133
|
|
|
137
134
|
# Validate input passed to fit_transform.
# Delegates entirely to DataValidator.validate_standard, with finite-value
# checking switched off (check_finite: false).
#
# @param data [Object] input forwarded unchanged to DataValidator.validate_standard
def validate_input(data)
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
|
|
135
|
+
DataValidator.validate_standard(data, check_finite: false)
|
|
136
|
+
end
|
|
137
|
+
|
|
138
|
+
# Validate data handed to #transform.
# Runs the same DataValidator.validate_standard check used when fitting
# (finite-value checking off), then requires the first row to have as many
# features as the data the model was fitted with.
#
# @param data [Array<Array<Numeric>>] data about to be transformed
# @return [nil]
# @raise [ArgumentError] if the feature count differs from @n_features
def validate_transform_input(data)
  DataValidator.validate_standard(data, check_finite: false)

  # Check feature count matches training data
  feature_count = data.first.size
  return if feature_count == @n_features

  raise ArgumentError, "New data has #{feature_count} features, but model was fitted with #{@n_features} features"
end
|
|
146
|
+
|
|
147
|
+
# Transform new data by projecting onto V components
# Mathematical operation: new_data × V
#
# @vt holds V^T (shape: n_components × n_features), so row c of @vt is
# column c of V. Each output coordinate is therefore the dot product of a
# sample with one row of @vt; no explicit transpose is needed.
#
# @param data [Array<Array<Numeric>>] samples to project
# @return [Array<Array<Float>>] one row per sample, one value per row of @vt
def transform_new_data(data)
  data.map do |sample|
    @vt.map do |component|
      # Plain running total in feature order, starting from 0.0
      # (kept as reduce rather than Array#sum so float results stay exactly
      # what a simple accumulation loop produces)
      sample.each_with_index.reduce(0.0) do |acc, (value, feat_idx)|
        acc + value * component[feat_idx]
      end
    end
  end
end
|
|
142
173
|
end
|
|
143
174
|
end
|
|
@@ -4,6 +4,7 @@ require 'fileutils'
|
|
|
4
4
|
require 'json'
|
|
5
5
|
require_relative '../configuration'
|
|
6
6
|
require_relative '../silence'
|
|
7
|
+
require_relative '../data_validator'
|
|
7
8
|
|
|
8
9
|
module ClusterKit
|
|
9
10
|
module Dimensionality
|
|
@@ -224,44 +225,10 @@ module ClusterKit
|
|
|
224
225
|
end
|
|
225
226
|
|
|
226
227
|
def validate_input(data, check_min_samples: true)
|
|
227
|
-
|
|
228
|
-
|
|
228
|
+
# Use shared validation for common checks
|
|
229
|
+
DataValidator.validate_standard(data)
|
|
229
230
|
|
|
230
|
-
|
|
231
|
-
raise ArgumentError, "Input must be a 2D array (array of arrays)" unless first_row.is_a?(Array)
|
|
232
|
-
|
|
233
|
-
row_length = first_row.length
|
|
234
|
-
min_val = Float::INFINITY
|
|
235
|
-
max_val = -Float::INFINITY
|
|
236
|
-
|
|
237
|
-
# First validate data structure and types
|
|
238
|
-
data.each_with_index do |row, i|
|
|
239
|
-
unless row.is_a?(Array)
|
|
240
|
-
raise ArgumentError, "Row #{i} is not an array"
|
|
241
|
-
end
|
|
242
|
-
|
|
243
|
-
if row.length != row_length
|
|
244
|
-
raise ArgumentError, "All rows must have the same length (row #{i} has #{row.length} elements, expected #{row_length})"
|
|
245
|
-
end
|
|
246
|
-
|
|
247
|
-
row.each_with_index do |val, j|
|
|
248
|
-
unless val.is_a?(Numeric)
|
|
249
|
-
raise ArgumentError, "Element at position [#{i}, #{j}] is not numeric"
|
|
250
|
-
end
|
|
251
|
-
|
|
252
|
-
# Only check for NaN/Infinite on floats
|
|
253
|
-
if val.is_a?(Float) && (val.nan? || val.infinite?)
|
|
254
|
-
raise ArgumentError, "Element at position [#{i}, #{j}] is NaN or Infinite"
|
|
255
|
-
end
|
|
256
|
-
|
|
257
|
-
# Track data range
|
|
258
|
-
val_f = val.to_f
|
|
259
|
-
min_val = val_f if val_f < min_val
|
|
260
|
-
max_val = val_f if val_f > max_val
|
|
261
|
-
end
|
|
262
|
-
end
|
|
263
|
-
|
|
264
|
-
# Check for sufficient data points after validating structure (only for fit operations)
|
|
231
|
+
# UMAP-specific validations
|
|
265
232
|
if check_min_samples && data.size < 10
|
|
266
233
|
raise ::ClusterKit::InsufficientDataError, <<~MSG
|
|
267
234
|
UMAP requires at least 10 data points, but only #{data.size} provided.
|
|
@@ -274,9 +241,9 @@ module ClusterKit
|
|
|
274
241
|
end
|
|
275
242
|
|
|
276
243
|
# Check for extreme data ranges that might cause numerical issues
|
|
277
|
-
|
|
278
|
-
if data_range > 1000
|
|
279
|
-
warn "WARNING: Large data range detected (#{data_range.round(2)}). Consider normalizing your data to prevent numerical instability."
|
|
244
|
+
stats = DataValidator.data_statistics(data)
|
|
245
|
+
if stats[:data_range] > 1000
|
|
246
|
+
warn "WARNING: Large data range detected (#{stats[:data_range].round(2)}). Consider normalizing your data to prevent numerical instability."
|
|
280
247
|
end
|
|
281
248
|
end
|
|
282
249
|
|
|
@@ -0,0 +1,251 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module ClusterKit
  # HNSW (Hierarchical Navigable Small World) index for fast approximate nearest neighbor search
  #
  # @example Basic usage
  #   index = ClusterKit::HNSW.new(dim: 128, space: :euclidean)
  #   index.add_batch(vectors, labels: labels)
  #   neighbors = index.search(query_vector, k: 10)
  #
  # @example With metadata
  #   index = ClusterKit::HNSW.new(dim: 768, space: :cosine)
  #   index.add_item(vector, label: "doc_1", metadata: { title: "Introduction", date: "2024-01-01" })
  #   results = index.search_with_metadata(query, k: 5)
  #   # => [{ label: "doc_1", distance: 0.23, metadata: { title: "...", date: "..." } }, ...]
  class HNSW
    # Note: The actual HNSW class is defined in Rust (ext/clusterkit/src/hnsw.rs)
    # This Ruby file adds additional convenience methods and documentation.
    # The Rust implementation provides these core methods:
    # - new(kwargs) - constructor
    # - add_item(vector, kwargs) - add single item
    # - add_batch(vectors, kwargs) - add multiple items
    # - search(query, kwargs) - search for neighbors
    # - search_with_metadata(query, kwargs) - search with metadata
    # - size() - get number of items
    # - config() - get configuration
    # - stats() - get statistics
    # - set_ef(ef) - set search quality parameter
    # - save(path) - save to file

    # Initialize is actually handled by the Rust code
    # This documentation is for reference
    #
    # @param dim [Integer] Dimension of vectors (required)
    # @param space [Symbol] Distance metric: :euclidean, :cosine, or :inner_product (default: :euclidean)
    # @param max_elements [Integer] Maximum number of elements (default: 10_000)
    # @param m [Integer] Number of bi-directional links (default: 16)
    # @param ef_construction [Integer] Size of dynamic candidate list (default: 200)
    # @param random_seed [Integer, nil] Random seed for reproducible builds (default: nil)
    # @param dynamic_list [Boolean] Allow index to grow dynamically (not yet implemented)

    # Fit the index with training data (alias for add_batch)
    #
    # @param data [Array<Array>, Numo::NArray] Training vectors
    # @param labels [Array, nil] Optional labels for vectors
    # @return [self]
    def fit(data, labels: nil)
      add_batch(data, labels: labels)
      self
    end

    # Fit the index and return it (sklearn-like method name kept for
    # interface compatibility). An index has no transformed representation
    # of its input, so unlike the dimensionality reducers this returns the
    # index itself, not transformed data.
    #
    # @param data [Array<Array>, Numo::NArray] Training vectors
    # @return [self]
    def fit_transform(data)
      fit(data)
    end

    # Add a vector using the << operator
    #
    # @param vector [Array, Numo::NArray] Vector to add
    # @return [self]
    def <<(vector)
      # The options are passed as an explicit (empty) positional hash,
      # matching the add_item(vector, kwargs) signature listed above.
      add_item(vector, {})
      self
    end

    # Alias for search that always includes distances
    #
    # @param query [Array, Numo::NArray] Query vector
    # @param k [Integer] Number of neighbors
    # @param ef [Integer, nil] Search parameter (higher = better quality, slower)
    # @return [Array<Array>] Array of [indices, distances]
    def knn_query(query, k: 10, ef: nil)
      search(query, k: k, ef: ef, include_distances: true)
    end

    # Batch search for multiple queries
    #
    # Uses the optional `parallel` gem when requested and installed; otherwise
    # the queries are searched one after another. Only a failure to load the
    # gem triggers the sequential fallback - an error raised while searching
    # is propagated instead of being retried sequentially.
    #
    # @param queries [Array<Array>, Numo::NArray] Multiple query vectors
    # @param k [Integer] Number of neighbors per query
    # @param parallel [Boolean] Process queries in parallel
    # @return [Array<Array>] Results for each query
    def batch_search(queries, k: 10, parallel: true)
      queries = ensure_array(queries)

      if parallel && queries.size > 1 && parallel_gem_available?
        Parallel.map(queries) { |query| search(query, k: k) }
      else
        queries.map { |query| search(query, k: k) }
      end
    end

    # Range search - find all points within a given radius
    #
    # Implemented on top of a k-nearest search: up to `limit` candidates (or
    # every item when no limit is given) are fetched and then filtered by
    # distance.
    #
    # @param query [Array, Numo::NArray] Query vector
    # @param radius [Float] Search radius
    # @param limit [Integer, nil] Maximum number of results
    # @return [Array<Hash>] Results within radius
    def range_search(query, radius:, limit: nil)
      # Never ask for more candidates than the index holds
      k = [limit || size, size].min
      # Nothing can match (empty index or limit of 0); skip the k = 0 search
      return [] if k.zero?

      within_radius = search_with_metadata(query, k: k).select { |r| r[:distance] <= radius }
      limit ? within_radius.take(limit) : within_radius
    end

    # Check if index is empty
    # @return [Boolean]
    def empty?
      size.zero?
    end

    # Clear all elements from the index
    #
    # @return [self]
    # @raise [NotImplementedError] always - clearing would require recreating the index
    def clear!
      # Would need to recreate the index
      raise NotImplementedError, "Clear not yet implemented"
    end

    # Check if a label exists in the index
    #
    # Placeholder: label lookup would need to be implemented in Rust. Until
    # then this returns false for every label, including labels that were
    # added, so the result must not be relied on.
    #
    # @param label [String, Integer] Label to check
    # @return [Boolean] currently always false
    def include?(label)
      false
    end

    # Get recall rate for a test set
    #
    # For each query the top-k search results are compared (as sets) with the
    # first k ground-truth neighbors; recall is the number of hits divided by
    # the number of distinct ground-truth neighbors considered.
    #
    # @param test_queries [Array<Array>] Query vectors
    # @param ground_truth [Array<Array>] True nearest neighbors for each query
    # @param k [Integer] Number of neighbors to evaluate
    # @return [Float] Recall rate (0.0 to 1.0)
    # @raise [ArgumentError] if a query has no ground-truth entry
    def recall(test_queries, ground_truth, k: 10)
      test_queries = ensure_array(test_queries)

      require 'set'
      total_correct = 0
      total_possible = 0

      test_queries.each_with_index do |query, i|
        truth = ground_truth[i]
        raise ArgumentError, "ground_truth has no entry for query #{i}" if truth.nil?

        predicted = Set.new(search(query, k: k))
        actual = Set.new(truth.take(k))

        total_correct += (predicted & actual).size
        # actual already holds at most k entries, so its size is the maximum achievable
        total_possible += actual.size
      end

      total_possible > 0 ? total_correct.to_f / total_possible : 0.0
    end

    # Load an index from file
    # Note: This uses Box::leak internally to work around hnsw_rs lifetime constraints
    # This causes a small memory leak - the HnswIo struct won't be freed until program exit
    #
    # @param path [String] File path to load from
    # @return [HNSW] New HNSW instance loaded from file
    # (The actual implementation is in Rust)

    # Create an index from embeddings produced by UMAP or other dimensionality reduction
    #
    # The vector dimension is taken from the first embedding.
    #
    # @param embeddings [Array<Array>, Numo::NArray] Embedding vectors
    # @param kwargs [Hash] Additional options for HNSW initialization
    # @return [HNSW] New HNSW instance
    # @raise [ArgumentError] if embeddings is empty or cannot be converted to an Array
    def self.from_embedding(embeddings, **kwargs)
      embeddings = ensure_array(embeddings)
      # Without at least one vector the dimension cannot be inferred
      raise ArgumentError, "embeddings must not be empty" if embeddings.empty?

      dim = embeddings.first.size
      index = new(dim: dim, **kwargs)
      index.fit(embeddings)
      index
    end

    # Builder pattern for creating HNSW indices
    #
    # Every setter stores one constructor option and returns the builder so
    # calls can be chained; #build passes the collected options to HNSW.new.
    class Builder
      def initialize
        @config = {}
      end

      # @param type [Symbol] distance metric (stored as :space)
      # @return [self]
      def space(type)
        @config[:space] = type
        self
      end

      # @param dim [Integer] vector dimension (stored as :dim)
      # @return [self]
      def dimensions(dim)
        @config[:dim] = dim
        self
      end

      # @param n [Integer] maximum number of elements (stored as :max_elements)
      # @return [self]
      def max_elements(n)
        @config[:max_elements] = n
        self
      end

      # @param m [Integer] number of bi-directional links (stored as :m)
      # @return [self]
      def m_parameter(m)
        @config[:m] = m
        self
      end

      # @param ef [Integer] size of dynamic candidate list (stored as :ef_construction)
      # @return [self]
      def ef_construction(ef)
        @config[:ef_construction] = ef
        self
      end

      # @param seed [Integer, nil] random seed (stored as :random_seed)
      # @return [self]
      def seed(seed)
        @config[:random_seed] = seed
        self
      end

      # @return [HNSW] index created from the collected options
      def build
        HNSW.new(**@config)
      end
    end

    # Class-level array coercion, usable from class methods such as
    # from_embedding. Defined above the `private` marker because `private`
    # does not change the visibility of `def self.` methods anyway.
    #
    # @param data [Array, #to_a] input data
    # @return [Array] data itself when already an Array, otherwise data.to_a
    # @raise [ArgumentError] if data does not respond to to_a
    def self.ensure_array(data)
      case data
      when Array
        data
      else
        data.respond_to?(:to_a) ? data.to_a : raise(ArgumentError, "Data must be convertible to Array")
      end
    end

    private

    # Ensure input is a proper array format (instance-level entry point;
    # shares the class-level implementation so the two cannot drift apart)
    def ensure_array(data)
      self.class.ensure_array(data)
    end

    # Try to load the optional `parallel` gem.
    #
    # @return [Boolean] true when the gem could be required, false otherwise
    def parallel_gem_available?
      require 'parallel'
      true
    rescue LoadError
      # Parallel gem not available, caller falls back to sequential
      false
    end
  end
end
|
data/lib/clusterkit/version.rb
CHANGED
data/lib/clusterkit.rb
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
3
|
require_relative "clusterkit/version"
|
|
4
|
-
|
|
4
|
+
require "clusterkit/clusterkit"
|
|
5
5
|
require_relative "clusterkit/configuration"
|
|
6
6
|
|
|
7
7
|
# Main module for ClusterKit gem
|
|
@@ -29,6 +29,7 @@ module ClusterKit
|
|
|
29
29
|
# Load modules that depend on the extension
|
|
30
30
|
require_relative "clusterkit/dimensionality"
|
|
31
31
|
require_relative "clusterkit/clustering"
|
|
32
|
+
require_relative "clusterkit/hnsw"
|
|
32
33
|
|
|
33
34
|
# Make RustUMAP private - it's an implementation detail
|
|
34
35
|
# Users should use Dimensionality::UMAP instead
|
metadata
CHANGED
|
@@ -1,17 +1,31 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: clusterkit
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.1.
|
|
4
|
+
version: 0.1.1
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Chris Petersen
|
|
8
8
|
autorequire:
|
|
9
9
|
bindir: exe
|
|
10
10
|
cert_chain: []
|
|
11
|
-
date:
|
|
11
|
+
date: 2026-03-24 00:00:00.000000000 Z
|
|
12
12
|
dependencies:
|
|
13
13
|
- !ruby/object:Gem::Dependency
|
|
14
|
-
name:
|
|
14
|
+
name: rb_sys
|
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
|
16
|
+
requirements:
|
|
17
|
+
- - "~>"
|
|
18
|
+
- !ruby/object:Gem::Version
|
|
19
|
+
version: '0.9'
|
|
20
|
+
type: :runtime
|
|
21
|
+
prerelease: false
|
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
23
|
+
requirements:
|
|
24
|
+
- - "~>"
|
|
25
|
+
- !ruby/object:Gem::Version
|
|
26
|
+
version: '0.9'
|
|
27
|
+
- !ruby/object:Gem::Dependency
|
|
28
|
+
name: benchmark
|
|
15
29
|
requirement: !ruby/object:Gem::Requirement
|
|
16
30
|
requirements:
|
|
17
31
|
- - ">="
|
|
@@ -25,47 +39,47 @@ dependencies:
|
|
|
25
39
|
- !ruby/object:Gem::Version
|
|
26
40
|
version: '0'
|
|
27
41
|
- !ruby/object:Gem::Dependency
|
|
28
|
-
name:
|
|
42
|
+
name: csv
|
|
29
43
|
requirement: !ruby/object:Gem::Requirement
|
|
30
44
|
requirements:
|
|
31
|
-
- - "
|
|
45
|
+
- - ">="
|
|
32
46
|
- !ruby/object:Gem::Version
|
|
33
|
-
version: '
|
|
47
|
+
version: '0'
|
|
34
48
|
type: :development
|
|
35
49
|
prerelease: false
|
|
36
50
|
version_requirements: !ruby/object:Gem::Requirement
|
|
37
51
|
requirements:
|
|
38
|
-
- - "
|
|
52
|
+
- - ">="
|
|
39
53
|
- !ruby/object:Gem::Version
|
|
40
|
-
version: '
|
|
54
|
+
version: '0'
|
|
41
55
|
- !ruby/object:Gem::Dependency
|
|
42
|
-
name: rake
|
|
56
|
+
name: rake
|
|
43
57
|
requirement: !ruby/object:Gem::Requirement
|
|
44
58
|
requirements:
|
|
45
59
|
- - "~>"
|
|
46
60
|
- !ruby/object:Gem::Version
|
|
47
|
-
version: '
|
|
61
|
+
version: '13.0'
|
|
48
62
|
type: :development
|
|
49
63
|
prerelease: false
|
|
50
64
|
version_requirements: !ruby/object:Gem::Requirement
|
|
51
65
|
requirements:
|
|
52
66
|
- - "~>"
|
|
53
67
|
- !ruby/object:Gem::Version
|
|
54
|
-
version: '
|
|
68
|
+
version: '13.0'
|
|
55
69
|
- !ruby/object:Gem::Dependency
|
|
56
|
-
name:
|
|
70
|
+
name: rake-compiler
|
|
57
71
|
requirement: !ruby/object:Gem::Requirement
|
|
58
72
|
requirements:
|
|
59
73
|
- - "~>"
|
|
60
74
|
- !ruby/object:Gem::Version
|
|
61
|
-
version: '
|
|
75
|
+
version: '1.2'
|
|
62
76
|
type: :development
|
|
63
77
|
prerelease: false
|
|
64
78
|
version_requirements: !ruby/object:Gem::Requirement
|
|
65
79
|
requirements:
|
|
66
80
|
- - "~>"
|
|
67
81
|
- !ruby/object:Gem::Version
|
|
68
|
-
version: '
|
|
82
|
+
version: '1.2'
|
|
69
83
|
- !ruby/object:Gem::Dependency
|
|
70
84
|
name: rspec
|
|
71
85
|
requirement: !ruby/object:Gem::Requirement
|
|
@@ -122,6 +136,7 @@ files:
|
|
|
122
136
|
- ".simplecov"
|
|
123
137
|
- CHANGELOG.md
|
|
124
138
|
- CLAUDE.md
|
|
139
|
+
- Cargo.lock
|
|
125
140
|
- Cargo.toml
|
|
126
141
|
- Gemfile
|
|
127
142
|
- IMPLEMENTATION_NOTES.md
|
|
@@ -129,13 +144,15 @@ files:
|
|
|
129
144
|
- PYTHON_COMPARISON.md
|
|
130
145
|
- README.md
|
|
131
146
|
- Rakefile
|
|
132
|
-
- clusterkit.gemspec
|
|
133
147
|
- docs/KNOWN_ISSUES.md
|
|
134
148
|
- docs/RUST_ERROR_HANDLING.md
|
|
135
149
|
- docs/TEST_FIXTURES.md
|
|
136
150
|
- docs/UMAP_EXPLAINED.md
|
|
137
151
|
- docs/UMAP_TROUBLESHOOTING.md
|
|
138
152
|
- docs/VERBOSE_OUTPUT.md
|
|
153
|
+
- docs/assets/clusterkit-wide.png
|
|
154
|
+
- docs/assets/clusterkit.png
|
|
155
|
+
- docs/assets/visualization.png
|
|
139
156
|
- examples/hdbscan_example.rb
|
|
140
157
|
- examples/optimal_kmeans_example.rb
|
|
141
158
|
- examples/pca_example.rb
|
|
@@ -146,6 +163,7 @@ files:
|
|
|
146
163
|
- ext/clusterkit/src/clustering.rs
|
|
147
164
|
- ext/clusterkit/src/clustering/hdbscan_wrapper.rs
|
|
148
165
|
- ext/clusterkit/src/embedder.rs
|
|
166
|
+
- ext/clusterkit/src/hnsw.rs
|
|
149
167
|
- ext/clusterkit/src/lib.rs
|
|
150
168
|
- ext/clusterkit/src/svd.rs
|
|
151
169
|
- ext/clusterkit/src/tests.rs
|
|
@@ -155,23 +173,25 @@ files:
|
|
|
155
173
|
- lib/clusterkit/clustering/hdbscan.rb
|
|
156
174
|
- lib/clusterkit/clusterkit.rb
|
|
157
175
|
- lib/clusterkit/configuration.rb
|
|
176
|
+
- lib/clusterkit/data_validator.rb
|
|
158
177
|
- lib/clusterkit/dimensionality.rb
|
|
159
178
|
- lib/clusterkit/dimensionality/pca.rb
|
|
160
179
|
- lib/clusterkit/dimensionality/svd.rb
|
|
161
180
|
- lib/clusterkit/dimensionality/umap.rb
|
|
162
181
|
- lib/clusterkit/hdbscan_api_design.rb
|
|
182
|
+
- lib/clusterkit/hnsw.rb
|
|
163
183
|
- lib/clusterkit/preprocessing.rb
|
|
164
184
|
- lib/clusterkit/silence.rb
|
|
165
185
|
- lib/clusterkit/utils.rb
|
|
166
186
|
- lib/clusterkit/version.rb
|
|
167
187
|
- lib/tasks/visualize.rake
|
|
168
|
-
homepage: https://github.com/
|
|
188
|
+
homepage: https://github.com/scientist-labs/clusterkit
|
|
169
189
|
licenses:
|
|
170
190
|
- MIT
|
|
171
191
|
metadata:
|
|
172
|
-
homepage_uri: https://github.com/
|
|
173
|
-
source_code_uri: https://github.com/
|
|
174
|
-
changelog_uri: https://github.com/
|
|
192
|
+
homepage_uri: https://github.com/scientist-labs/clusterkit
|
|
193
|
+
source_code_uri: https://github.com/scientist-labs/clusterkit
|
|
194
|
+
changelog_uri: https://github.com/scientist-labs/clusterkit/blob/main/CHANGELOG.md
|
|
175
195
|
post_install_message:
|
|
176
196
|
rdoc_options: []
|
|
177
197
|
require_paths:
|
|
@@ -187,7 +207,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
|
187
207
|
- !ruby/object:Gem::Version
|
|
188
208
|
version: '0'
|
|
189
209
|
requirements: []
|
|
190
|
-
rubygems_version: 3.5.
|
|
210
|
+
rubygems_version: 3.5.22
|
|
191
211
|
signing_key:
|
|
192
212
|
specification_version: 4
|
|
193
213
|
summary: High-performance clustering and dimensionality reduction for Ruby
|
data/clusterkit.gemspec
DELETED
|
@@ -1,45 +0,0 @@
|
|
|
1
|
-
require_relative "lib/clusterkit/version"
|
|
2
|
-
|
|
3
|
-
Gem::Specification.new do |spec|
|
|
4
|
-
spec.name = "clusterkit"
|
|
5
|
-
spec.version = ClusterKit::VERSION
|
|
6
|
-
spec.authors = ["Chris Petersen"]
|
|
7
|
-
spec.email = ["chris@petersen.io"]
|
|
8
|
-
|
|
9
|
-
spec.summary = "High-performance clustering and dimensionality reduction for Ruby"
|
|
10
|
-
spec.description = "A comprehensive clustering toolkit for Ruby, providing UMAP, PCA, K-means, HDBSCAN and more. Built on top of annembed and hdbscan Rust crates for blazing-fast performance."
|
|
11
|
-
spec.homepage = "https://github.com/cpetersen/clusterkit"
|
|
12
|
-
spec.license = "MIT"
|
|
13
|
-
spec.required_ruby_version = ">= 2.7.0"
|
|
14
|
-
|
|
15
|
-
spec.metadata["homepage_uri"] = spec.homepage
|
|
16
|
-
spec.metadata["source_code_uri"] = spec.homepage
|
|
17
|
-
spec.metadata["changelog_uri"] = "#{spec.homepage}/blob/main/CHANGELOG.md"
|
|
18
|
-
|
|
19
|
-
# Specify which files should be added to the gem when it is released.
|
|
20
|
-
spec.files = Dir.chdir(__dir__) do
|
|
21
|
-
`git ls-files -z`.split("\x0").reject do |f|
|
|
22
|
-
(f == __FILE__) || f.match(%r{\A(?:(?:bin|test|spec|features)/|\.(?:git|travis|circleci)|appveyor)})
|
|
23
|
-
end + Dir["ext/**/*.rs", "ext/**/*.toml"]
|
|
24
|
-
end
|
|
25
|
-
spec.bindir = "exe"
|
|
26
|
-
spec.executables = spec.files.grep(%r{\Aexe/}) { |f| File.basename(f) }
|
|
27
|
-
spec.require_paths = ["lib"]
|
|
28
|
-
spec.extensions = ["ext/clusterkit/extconf.rb"]
|
|
29
|
-
|
|
30
|
-
# Runtime dependencies
|
|
31
|
-
# Numo is optional but recommended for better performance
|
|
32
|
-
# spec.add_dependency "numo-narray", "~> 0.9"
|
|
33
|
-
|
|
34
|
-
# Development dependencies
|
|
35
|
-
spec.add_development_dependency "csv"
|
|
36
|
-
spec.add_development_dependency "rake", "~> 13.0"
|
|
37
|
-
spec.add_development_dependency "rake-compiler", "~> 1.2"
|
|
38
|
-
spec.add_development_dependency "rb_sys", "~> 0.9"
|
|
39
|
-
spec.add_development_dependency "rspec", "~> 3.0"
|
|
40
|
-
spec.add_development_dependency "simplecov", "~> 0.22"
|
|
41
|
-
spec.add_development_dependency "yard", "~> 0.9"
|
|
42
|
-
|
|
43
|
-
# For more information and examples about making a new gem, check out our
|
|
44
|
-
# guide at: https://bundler.io/guides/creating_gem.html
|
|
45
|
-
end
|