clusterkit 0.3.0-arm64-darwin

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (59) hide show
  1. checksums.yaml +7 -0
  2. data/.rspec +3 -0
  3. data/.simplecov +47 -0
  4. data/CHANGELOG.md +35 -0
  5. data/CLAUDE.md +226 -0
  6. data/Cargo.lock +3228 -0
  7. data/Cargo.toml +8 -0
  8. data/Gemfile +17 -0
  9. data/IMPLEMENTATION_NOTES.md +143 -0
  10. data/LICENSE.txt +21 -0
  11. data/PYTHON_COMPARISON.md +183 -0
  12. data/README.md +744 -0
  13. data/Rakefile +259 -0
  14. data/docs/KNOWN_ISSUES.md +130 -0
  15. data/docs/RUST_ERROR_HANDLING.md +164 -0
  16. data/docs/TEST_FIXTURES.md +170 -0
  17. data/docs/UMAP_EXPLAINED.md +362 -0
  18. data/docs/UMAP_TROUBLESHOOTING.md +284 -0
  19. data/docs/VERBOSE_OUTPUT.md +84 -0
  20. data/docs/assets/clusterkit-wide.png +0 -0
  21. data/docs/assets/clusterkit.png +0 -0
  22. data/docs/assets/visualization.png +0 -0
  23. data/examples/hdbscan_example.rb +147 -0
  24. data/examples/optimal_kmeans_example.rb +96 -0
  25. data/examples/pca_example.rb +114 -0
  26. data/examples/reproducible_umap.rb +99 -0
  27. data/examples/verbose_control.rb +43 -0
  28. data/ext/clusterkit/Cargo.toml +26 -0
  29. data/ext/clusterkit/extconf.rb +23 -0
  30. data/ext/clusterkit/src/clustering/hdbscan_wrapper.rs +80 -0
  31. data/ext/clusterkit/src/clustering.rs +221 -0
  32. data/ext/clusterkit/src/embedder.rs +349 -0
  33. data/ext/clusterkit/src/hnsw.rs +579 -0
  34. data/ext/clusterkit/src/lib.rs +24 -0
  35. data/ext/clusterkit/src/svd.rs +89 -0
  36. data/ext/clusterkit/src/tests.rs +16 -0
  37. data/ext/clusterkit/src/utils.rs +183 -0
  38. data/lib/clusterkit/3.1/clusterkit.bundle +0 -0
  39. data/lib/clusterkit/3.2/clusterkit.bundle +0 -0
  40. data/lib/clusterkit/3.3/clusterkit.bundle +0 -0
  41. data/lib/clusterkit/3.4/clusterkit.bundle +0 -0
  42. data/lib/clusterkit/clustering/hdbscan.rb +164 -0
  43. data/lib/clusterkit/clustering.rb +194 -0
  44. data/lib/clusterkit/clusterkit.rb +14 -0
  45. data/lib/clusterkit/configuration.rb +24 -0
  46. data/lib/clusterkit/data_validator.rb +132 -0
  47. data/lib/clusterkit/dimensionality/pca.rb +251 -0
  48. data/lib/clusterkit/dimensionality/svd.rb +175 -0
  49. data/lib/clusterkit/dimensionality/umap.rb +282 -0
  50. data/lib/clusterkit/dimensionality.rb +29 -0
  51. data/lib/clusterkit/hdbscan_api_design.rb +142 -0
  52. data/lib/clusterkit/hnsw.rb +251 -0
  53. data/lib/clusterkit/preprocessing.rb +106 -0
  54. data/lib/clusterkit/silence.rb +42 -0
  55. data/lib/clusterkit/utils.rb +51 -0
  56. data/lib/clusterkit/version.rb +5 -0
  57. data/lib/clusterkit.rb +105 -0
  58. data/lib/tasks/visualize.rake +641 -0
  59. metadata +214 -0
@@ -0,0 +1,132 @@
1
+ # frozen_string_literal: true
2
+
3
module ClusterKit
  # Shared data validation methods for all algorithms.
  #
  # Every validator raises ArgumentError describing the first problem it
  # finds; valid input passes through without raising.
  module DataValidator
    class << self
      # Validate basic data structure and types.
      # @param data [Array] Data to validate
      # @raise [ArgumentError] If data is not a non-empty array whose first
      #   element is itself an array
      def validate_basic_structure(data)
        validate_outer_shape(
          data,
          not_array: "Input must be an array",
          empty: "Input cannot be empty",
          not_2d: "Input must be a 2D array (array of arrays)"
        )
      end

      # Validate row consistency (all rows are arrays of the same length).
      #
      # The expected length is taken from the first row only after that row
      # has been confirmed to be an array, so malformed input always fails
      # with ArgumentError rather than NoMethodError. Empty input has no rows
      # to compare and passes.
      # @param data [Array] 2D array to validate
      # @raise [ArgumentError] If a row is not an array or rows differ in length
      def validate_row_consistency(data)
        expected_length = nil

        data.each_with_index do |row, i|
          raise ArgumentError, "Row #{i} is not an array" unless row.is_a?(Array)

          expected_length ||= row.length
          next if row.length == expected_length

          raise ArgumentError, "All rows must have the same length (row #{i} has #{row.length} elements, expected #{expected_length})"
        end
      end

      # Validate that all elements are numeric.
      # @param data [Array] 2D array to validate
      # @raise [ArgumentError] If any element is not a Numeric
      def validate_numeric_types(data)
        data.each_with_index do |row, i|
          row.each_with_index do |val, j|
            next if val.is_a?(Numeric)

            raise ArgumentError, "Element at position [#{i}, #{j}] is not numeric"
          end
        end
      end

      # Validate finite values (no NaN or Infinite).
      # Only Float elements are inspected; other values are skipped.
      # @param data [Array] 2D array to validate
      # @raise [ArgumentError] If any float is NaN or Infinite
      def validate_finite_values(data)
        data.each_with_index do |row, i|
          row.each_with_index do |val, j|
            next unless val.is_a?(Float) && (val.nan? || val.infinite?)

            raise ArgumentError, "Element at position [#{i}, #{j}] is NaN or Infinite"
          end
        end
      end

      # Standard validation for most algorithms.
      # @param data [Array] 2D array to validate
      # @param check_finite [Boolean] Whether to check for NaN/Infinite values
      # @raise [ArgumentError] If data is invalid
      def validate_standard(data, check_finite: true)
        validate_basic_structure(data)
        validate_row_consistency(data)
        validate_numeric_types(data)
        validate_finite_values(data) if check_finite
      end

      # Validation for clustering algorithms (KMeans, HDBSCAN) with specific error messages.
      # @param data [Array] 2D array to validate
      # @param check_finite [Boolean] Whether to check for NaN/Infinite values
      # @raise [ArgumentError] If data is invalid
      def validate_clustering(data, check_finite: false)
        validate_data_shape(data)
        validate_row_consistency(data)
        validate_numeric_types(data)
        validate_finite_values(data) if check_finite
      end

      # Validation for PCA: same checks and messages as clustering, never
      # checking for NaN/Infinite values.
      # @param data [Array] 2D array to validate
      # @raise [ArgumentError] If data is invalid
      def validate_pca(data)
        validate_data_shape(data)
        validate_row_consistency(data)
        validate_numeric_types(data)
      end

      # Get data statistics for warnings/error context.
      #
      # Values are compared after #to_f. NaN never compares as smaller or
      # larger, so NaN entries do not influence the result. When no value
      # updates the running minimum/maximum (all rows empty, or every value
      # NaN) the range is reported as 0.0 and min/max as nil instead of the
      # internal +/-Infinity sentinels.
      # @param data [Array] 2D array
      # @return [Hash] :n_samples, :n_features, :data_range and, for
      #   non-empty data, :min_value and :max_value
      def data_statistics(data)
        return { n_samples: 0, n_features: 0, data_range: 0.0 } if data.empty?

        min_val = Float::INFINITY
        max_val = -Float::INFINITY

        data.each do |row|
          row.each do |val|
            val_f = val.to_f
            min_val = val_f if val_f < min_val
            max_val = val_f if val_f > max_val
          end
        end

        if min_val > max_val
          # Sentinels untouched: there was nothing comparable in the data.
          min_val = nil
          max_val = nil
          data_range = 0.0
        else
          data_range = max_val - min_val
        end

        {
          n_samples: data.size,
          n_features: data.first&.size || 0,
          data_range: data_range,
          min_value: min_val,
          max_value: max_val
        }
      end

      private

      # Outer-shape check shared by the public validators; only the error
      # messages differ between them.
      def validate_outer_shape(data, not_array:, empty:, not_2d:)
        raise ArgumentError, not_array unless data.is_a?(Array)
        raise ArgumentError, empty if data.empty?
        raise ArgumentError, not_2d unless data.first.is_a?(Array)
      end

      # Outer-shape check using the "Data ..." wording of the clustering and
      # PCA validators.
      def validate_data_shape(data)
        validate_outer_shape(
          data,
          not_array: "Data must be an array",
          empty: "Data cannot be empty",
          not_2d: "Data must be 2D array"
        )
      end
    end
  end
end
@@ -0,0 +1,251 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative '../clusterkit'
4
+ require_relative 'svd'
5
+ require_relative '../data_validator'
6
+
7
module ClusterKit
  module Dimensionality
    # Principal Component Analysis using SVD.
    # PCA is a linear dimensionality reduction technique that finds
    # the directions of maximum variance in the data.
    class PCA
      # explained_variance and explained_variance_ratio have explicit readers
      # further down (they refuse to answer before fitting), so they are not
      # generated here.
      attr_reader :n_components, :components, :mean

      # Initialize PCA
      # @param n_components [Integer] Number of principal components to keep
      def initialize(n_components: 2)
        @n_components = n_components
        @fitted = false
      end

      # Fit the PCA model
      # @param data [Array] 2D array of data points (n_samples x n_features)
      # @return [self] Returns self for method chaining
      # @raise [ArgumentError] If data is malformed or has fewer samples or
      #   features than n_components
      def fit(data)
        fit_model(data)
        self
      end

      # Transform data using the fitted PCA model.
      #
      # Any number of rows is accepted (including a single sample); only the
      # number of features has to match the data the model was fitted on.
      # @param data [Array] 2D array of data points
      # @return [Array] Transformed data in principal component space
      # @raise [RuntimeError] If the model has not been fitted
      # @raise [ArgumentError] If data is malformed or its feature count
      #   differs from the fitted data
      def transform(data)
        raise RuntimeError, "Model must be fitted before transform" unless fitted?
        validate_transform_data(data)

        # Result = (data - mean) x components.T
        project_data(center_data(data, @mean), @components)
      end

      # Fit the model and transform the data in one step.
      #
      # The result is U scaled column-wise by the singular values returned
      # from the SVD of the centered data.
      # @param data [Array] 2D array of data points
      # @return [Array] Transformed data
      # @raise [ArgumentError] Same conditions as #fit
      def fit_transform(data)
        u, s = fit_model(data)
        u.map { |row| row.each_with_index.map { |val, i| val * s[i] } }
      end

      # Inverse transform - reconstruct data from principal components.
      #
      # Rows may carry fewer values than there are components (only the
      # leading components are then used), but never more.
      # @param data [Array] Transformed data in PC space
      # @return [Array] Reconstructed data in original space
      # @raise [RuntimeError] If the model has not been fitted
      # @raise [ArgumentError] If a row has more values than there are components
      def inverse_transform(data)
        raise RuntimeError, "Model must be fitted before inverse_transform" unless fitted?

        data.map do |sample|
          if sample.size > @components.size
            raise ArgumentError, "Data has #{sample.size} components, but model has #{@components.size} components"
          end

          # Reconstruct: sample x components + mean
          reconstructed = Array.new(@mean.size, 0.0)
          sample.each_with_index do |value, i|
            @components[i].each_with_index do |comp_val, j|
              reconstructed[j] += value * comp_val
            end
          end

          reconstructed.zip(@mean).map { |r, m| r + m }
        end
      end

      # Get the amount of variance explained by each component
      # @return [Array] Explained variance for each component
      # @raise [RuntimeError] If the model has not been fitted
      def explained_variance
        raise RuntimeError, "Model must be fitted first" unless fitted?
        @explained_variance
      end

      # Get the fraction of total variance explained by each component
      # @return [Array] Explained variance ratio for each component
      # @raise [RuntimeError] If the model has not been fitted
      def explained_variance_ratio
        raise RuntimeError, "Model must be fitted first" unless fitted?
        @explained_variance_ratio
      end

      # Get cumulative explained variance ratio
      # @return [Array] Running sum of the explained variance ratios
      # @raise [RuntimeError] If the model has not been fitted
      def cumulative_explained_variance_ratio
        raise RuntimeError, "Model must be fitted first" unless fitted?

        running = 0.0
        @explained_variance_ratio.map { |ratio| running += ratio }
      end

      # Check if model has been fitted
      # @return [Boolean] True if fitted
      def fitted?
        @fitted
      end

      private

      # Shared fitting pipeline for #fit and #fit_transform: validate, center,
      # decompose, and store components plus variance statistics.
      # @return [Array] [u, s] from the SVD, for callers that need U * S
      def fit_model(data)
        validate_data(data)

        @mean = calculate_mean(data)
        centered_data = center_data(data, @mean)

        u, s, vt = perform_svd(centered_data)
        @components = vt # rows are components, columns are features
        @singular_values = s

        # Sample variance divides by n_samples - 1. With a single sample that
        # would be 0, so fall back to 1: the centered data is all zeros then
        # and the statistics come out as 0.0 instead of NaN.
        dof = [data.size - 1, 1].max.to_f
        @explained_variance = s.map { |val| (val**2) / dof }

        total_variance = calculate_total_variance(centered_data, dof)
        @explained_variance_ratio = @explained_variance.map do |var|
          # Zero total variance (constant data) has nothing to apportion.
          total_variance.zero? ? 0.0 : var / total_variance
        end

        @fitted = true
        [u, s]
      end

      # Validation for fitting: structure plus PCA-specific size limits.
      def validate_data(data)
        # Use shared validation for common checks
        DataValidator.validate_pca(data)

        if data.size < @n_components
          raise ArgumentError, "n_components (#{@n_components}) cannot be larger than n_samples (#{data.size})"
        end

        if data.first.size < @n_components
          raise ArgumentError, "n_components (#{@n_components}) cannot be larger than n_features (#{data.first.size})"
        end
      end

      # Validation for #transform: structure plus matching feature count.
      # The sample-count limit of #validate_data applies to fitting only.
      def validate_transform_data(data)
        DataValidator.validate_pca(data)

        n_features = @mean.size
        return if data.first.size == n_features

        raise ArgumentError, "Data has #{data.first.size} features, but model was fitted with #{n_features} features"
      end

      # Per-feature mean, accumulated row by row.
      def calculate_mean(data)
        sums = Array.new(data.first.size, 0.0)
        data.each do |row|
          row.each_with_index { |val, i| sums[i] += val }
        end

        n_samples = data.size.to_f
        sums.map { |sum| sum / n_samples }
      end

      # Subtract the per-feature mean from every row.
      def center_data(data, mean)
        data.map { |row| row.zip(mean).map { |val, m| val - m } }
      end

      # Sum of squared centered values divided by the degrees of freedom.
      def calculate_total_variance(centered_data, dof)
        sum_of_squares = centered_data.reduce(0.0) do |acc, row|
          row.reduce(acc) { |inner, val| inner + val**2 }
        end

        sum_of_squares / dof
      end

      # Matrix multiplication: centered_data x components.T
      def project_data(centered_data, components)
        centered_data.map do |sample|
          components.map do |component|
            sample.each_with_index.reduce(0.0) { |acc, (val, j)| acc + val * component[j] }
          end
        end
      end

      # Shared SVD computation for both fit and fit_transform
      # Ensures both methods use identical SVD invocation and parameters
      def perform_svd(centered_data)
        SVD.randomized_svd(centered_data, @n_components, n_iter: 5)
      end
    end

    # Module-level convenience method
    # @param data [Array] 2D array of data points
    # @param n_components [Integer] Number of components
    # @return [Array] Transformed data
    def self.pca(data, n_components: 2)
      PCA.new(n_components: n_components).fit_transform(data)
    end
  end
end
@@ -0,0 +1,175 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative '../clusterkit'
4
+ require_relative '../data_validator'
5
+
6
module ClusterKit
  module Dimensionality
    # Singular Value Decomposition
    # Decomposes a matrix into U, S, V^T components
    class SVD
      attr_reader :n_components, :n_iter, :random_seed
      attr_reader :u, :s, :vt, :n_features

      # Initialize a new SVD instance
      # @param n_components [Integer] Number of components to compute
      # @param n_iter [Integer] Number of iterations for randomized algorithm (default: 2)
      # @param random_seed [Integer, nil] Random seed for reproducibility.
      #   NOTE(review): the seed is stored and exposed but is not passed to
      #   the SVD call anywhere in this class - confirm whether the backend
      #   picks it up some other way.
      def initialize(n_components: nil, n_iter: 2, random_seed: nil)
        @n_components = n_components
        @n_iter = n_iter
        @random_seed = random_seed
        @fitted = false
      end

      # Fit the model and transform data in one step.
      #
      # Instance state is only updated after the decomposition has returned,
      # so a failing refit leaves a previously fitted model intact.
      # @param data [Array<Array<Numeric>>] Input data
      # @return [Array] Returns [U, S, Vt] matrices
      def fit_transform(data)
        validate_input(data)

        # Determine n_components if not set
        n_comp = @n_components || [data.size, data.first.size].min

        # Call the Rust implementation
        u, s, vt = self.class.randomized_svd(data, n_comp, n_iter: @n_iter)

        @u = u
        @s = s
        @vt = vt
        # Data characteristics used by later transform operations
        @n_features = data.first.size
        @original_data_id = data.object_id
        @fitted = true

        [@u, @s, @vt]
      end

      # Fit the model to data
      # @param data [Array<Array<Numeric>>] Input data
      # @return [self]
      def fit(data)
        fit_transform(data)
        self
      end

      # Get the U matrix (left singular vectors)
      # @return [Array<Array<Float>>] U matrix
      # @raise [RuntimeError] If the model has not been fitted
      def components_u
        raise RuntimeError, "Model must be fitted first" unless fitted?
        @u
      end

      # Get the singular values
      # @return [Array<Float>] Singular values
      # @raise [RuntimeError] If the model has not been fitted
      def singular_values
        raise RuntimeError, "Model must be fitted first" unless fitted?
        @s
      end

      # Get the V^T matrix (right singular vectors, transposed)
      # @return [Array<Array<Float>>] V^T matrix
      # @raise [RuntimeError] If the model has not been fitted
      def components_vt
        raise RuntimeError, "Model must be fitted first" unless fitted?
        @vt
      end

      # Check if the model has been fitted
      # @return [Boolean]
      def fitted?
        @fitted
      end

      # Transform data using fitted SVD (project onto components).
      #
      # When handed the very same Array object that was fitted, the stored
      # U * S is returned; any other object is projected onto the rows of
      # V^T. The identity test uses object_id only, so a fitted array that was
      # mutated in place afterwards still takes the U * S path.
      # @param data [Array<Array<Numeric>>] Data to transform
      # @return [Array<Array<Float>>] Transformed data projected onto SVD components
      # @raise [RuntimeError] If the model has not been fitted
      # @raise [ArgumentError] If data is malformed or its feature count
      #   differs from the fitted data
      def transform(data)
        raise RuntimeError, "Model must be fitted first" unless fitted?
        validate_transform_input(data)

        return scaled_left_vectors if data.object_id == @original_data_id

        transform_new_data(data)
      end

      # Inverse transform (reconstruct from components).
      #
      # Each row is treated as coefficients for the rows of V^T. Rows may be
      # shorter than the number of components but not longer.
      # @param transformed_data [Array<Array<Float>>] Transformed data
      # @return [Array<Array<Float>>] Reconstructed data
      # @raise [RuntimeError] If the model has not been fitted
      # @raise [ArgumentError] If a row has more values than there are components
      def inverse_transform(transformed_data)
        raise RuntimeError, "Model must be fitted first" unless fitted?

        transformed_data.map do |row|
          if row.size > @vt.size
            raise ArgumentError, "Transformed data has #{row.size} components, but model has #{@vt.size} components"
          end

          # Reconstruction: (U * S) * V^T, one row at a time
          reconstructed = Array.new(@vt.first.size, 0.0)
          row.each_with_index do |val, i|
            @vt[i].each_with_index { |v, j| reconstructed[j] += val * v }
          end
          reconstructed
        end
      end

      # Class method for randomized SVD (kept for compatibility)
      # @param matrix [Array<Array<Numeric>>] Input matrix
      # @param k [Integer] Number of components
      # @param n_iter [Integer] Number of iterations
      # @return [Array] Returns [U, S, Vt]
      def self.randomized_svd(matrix, k, n_iter: 2)
        ::ClusterKit::SVD.randomized_svd_rust(matrix, k, n_iter)
      end

      private

      # Structural validation; NaN/Infinite values are not rejected here.
      def validate_input(data)
        DataValidator.validate_standard(data, check_finite: false)
      end

      # Same structural validation as fitting, plus a feature-count match.
      def validate_transform_input(data)
        validate_input(data)

        return if data.first.size == @n_features

        raise ArgumentError, "New data has #{data.first.size} features, but model was fitted with #{@n_features} features"
      end

      # U with each column scaled by its singular value (U * S).
      def scaled_left_vectors
        @u.map { |row| row.each_with_index.map { |val, j| val * @s[j] } }
      end

      # Transform new data by projecting onto V components.
      # Mathematical operation: new_data x V. Only V^T is stored
      # (one row per component), and a column of V is a row of V^T, so each
      # output value is the dot product of a sample with one row of @vt.
      def transform_new_data(data)
        data.map do |sample|
          @vt.map do |vt_row|
            sample.each_with_index.reduce(0.0) { |acc, (val, feat_idx)| acc + val * vt_row[feat_idx] }
          end
        end
      end
    end
  end
end