clusterkit 0.1.0.pre.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (49)
  1. checksums.yaml +7 -0
  2. data/.rspec +3 -0
  3. data/.simplecov +47 -0
  4. data/CHANGELOG.md +35 -0
  5. data/CLAUDE.md +226 -0
  6. data/Cargo.toml +8 -0
  7. data/Gemfile +17 -0
  8. data/IMPLEMENTATION_NOTES.md +143 -0
  9. data/LICENSE.txt +21 -0
  10. data/PYTHON_COMPARISON.md +183 -0
  11. data/README.md +499 -0
  12. data/Rakefile +245 -0
  13. data/clusterkit.gemspec +45 -0
  14. data/docs/KNOWN_ISSUES.md +130 -0
  15. data/docs/RUST_ERROR_HANDLING.md +164 -0
  16. data/docs/TEST_FIXTURES.md +170 -0
  17. data/docs/UMAP_EXPLAINED.md +362 -0
  18. data/docs/UMAP_TROUBLESHOOTING.md +284 -0
  19. data/docs/VERBOSE_OUTPUT.md +84 -0
  20. data/examples/hdbscan_example.rb +147 -0
  21. data/examples/optimal_kmeans_example.rb +96 -0
  22. data/examples/pca_example.rb +114 -0
  23. data/examples/reproducible_umap.rb +99 -0
  24. data/examples/verbose_control.rb +43 -0
  25. data/ext/clusterkit/Cargo.toml +25 -0
  26. data/ext/clusterkit/extconf.rb +4 -0
  27. data/ext/clusterkit/src/clustering/hdbscan_wrapper.rs +115 -0
  28. data/ext/clusterkit/src/clustering.rs +267 -0
  29. data/ext/clusterkit/src/embedder.rs +413 -0
  30. data/ext/clusterkit/src/lib.rs +22 -0
  31. data/ext/clusterkit/src/svd.rs +112 -0
  32. data/ext/clusterkit/src/tests.rs +16 -0
  33. data/ext/clusterkit/src/utils.rs +33 -0
  34. data/lib/clusterkit/clustering/hdbscan.rb +177 -0
  35. data/lib/clusterkit/clustering.rb +213 -0
  36. data/lib/clusterkit/clusterkit.rb +9 -0
  37. data/lib/clusterkit/configuration.rb +24 -0
  38. data/lib/clusterkit/dimensionality/pca.rb +251 -0
  39. data/lib/clusterkit/dimensionality/svd.rb +144 -0
  40. data/lib/clusterkit/dimensionality/umap.rb +311 -0
  41. data/lib/clusterkit/dimensionality.rb +29 -0
  42. data/lib/clusterkit/hdbscan_api_design.rb +142 -0
  43. data/lib/clusterkit/preprocessing.rb +106 -0
  44. data/lib/clusterkit/silence.rb +42 -0
  45. data/lib/clusterkit/utils.rb +51 -0
  46. data/lib/clusterkit/version.rb +5 -0
  47. data/lib/clusterkit.rb +93 -0
  48. data/lib/tasks/visualize.rake +641 -0
  49. metadata +194 -0
data/lib/clusterkit/clustering/hdbscan.rb
@@ -0,0 +1,177 @@
+ # frozen_string_literal: true
+
+ module ClusterKit
+   module Clustering
+     # HDBSCAN clustering algorithm - matching KMeans API pattern
+     class HDBSCAN
+       attr_reader :min_samples, :min_cluster_size, :metric, :labels, :probabilities,
+                   :outlier_scores, :cluster_persistence
+
+       # Initialize HDBSCAN clusterer (matches KMeans pattern)
+       # @param min_samples [Integer] Min neighborhood size for core points (default: 5)
+       # @param min_cluster_size [Integer] Minimum size of clusters (default: 5)
+       # @param metric [String] Distance metric (default: 'euclidean')
+       def initialize(min_samples: 5, min_cluster_size: 5, metric: 'euclidean')
+         raise ArgumentError, "min_samples must be positive" unless min_samples > 0
+         raise ArgumentError, "min_cluster_size must be positive" unless min_cluster_size > 0
+
+         valid_metrics = ['euclidean', 'l2', 'manhattan', 'l1', 'cosine']
+         unless valid_metrics.include?(metric)
+           raise ArgumentError, "metric must be one of: #{valid_metrics.join(', ')}"
+         end
+
+         @min_samples = min_samples
+         @min_cluster_size = min_cluster_size
+         @metric = metric
+         @fitted = false
+       end
+
+       # Fit the HDBSCAN model (matches KMeans.fit)
+       # @param data [Array] 2D array of data points
+       # @return [self] Returns self for method chaining
+       def fit(data)
+         validate_data(data)
+
+         # Call Rust implementation (hdbscan crate)
+         result = Clustering.hdbscan_rust(data, @min_samples, @min_cluster_size, @metric)
+
+         @labels = result["labels"]
+         @probabilities = result["probabilities"]
+         @outlier_scores = result["outlier_scores"]
+         @cluster_persistence = result["cluster_persistence"]
+         @fitted = true
+
+         self
+       end
+
+       # HDBSCAN doesn't support predict for new points (unlike KMeans),
+       # but we keep the method for API consistency
+       # @param data [Array] 2D array of data points
+       # @raise [NotImplementedError] Always raised
+       def predict(data)
+         raise NotImplementedError, "HDBSCAN does not support prediction on new data. " \
+                                    "Use approximate_predict for approximate membership"
+       end
+
+       # Fit the model and return labels (matches KMeans.fit_predict)
+       # @param data [Array] 2D array of data points
+       # @return [Array] Cluster labels (-1 for noise)
+       def fit_predict(data)
+         fit(data)
+         @labels
+       end
+
+       # Check if model has been fitted (matches KMeans.fitted?)
+       # @return [Boolean] True if fitted
+       def fitted?
+         @fitted
+       end
+
+       # Get number of clusters found (similar to KMeans.k, but discovered)
+       # @return [Integer] Number of clusters (excluding noise)
+       def n_clusters
+         return 0 unless fitted?
+         # Count unique labels excluding -1 (noise)
+         unique_labels = @labels.uniq.reject { |l| l == -1 }
+         unique_labels.length
+       end
+
+       # Get noise ratio (HDBSCAN-specific but follows naming pattern)
+       # @return [Float] Fraction of points labeled as noise
+       def noise_ratio
+         return 0.0 unless fitted?
+         @labels.count(-1).to_f / @labels.length
+       end
+
+       # Get the number of noise points
+       # @return [Integer] Number of points labeled as noise
+       def n_noise_points
+         return 0 unless fitted?
+         @labels.count(-1)
+       end
+
+       # Get indices of noise points
+       # @return [Array<Integer>] Indices of points labeled as noise
+       def noise_indices
+         return [] unless fitted?
+         @labels.each_with_index.select { |label, _| label == -1 }.map { |_, idx| idx }
+       end
+
+       # Get indices of points in each cluster
+       # @return [Hash<Integer, Array<Integer>>] Mapping of cluster label to point indices
+       def cluster_indices
+         return {} unless fitted?
+
+         result = {}
+         @labels.each_with_index do |label, idx|
+           next if label == -1 # Skip noise points
+           result[label] ||= []
+           result[label] << idx
+         end
+         result
+       end
+
+       # Get summary statistics
+       # @return [Hash] Summary of clustering results
+       def summary
+         return {} unless fitted?
+
+         {
+           n_clusters: n_clusters,
+           n_noise_points: n_noise_points,
+           noise_ratio: noise_ratio,
+           cluster_sizes: cluster_indices.transform_values(&:length),
+           cluster_persistence: @cluster_persistence
+         }
+       end
+
+       private
+
+       def validate_data(data)
+         # Exact same validation as KMeans for consistency
+         raise ArgumentError, "Data must be an array" unless data.is_a?(Array)
+         raise ArgumentError, "Data cannot be empty" if data.empty?
+         raise ArgumentError, "Data must be 2D array" unless data.first.is_a?(Array)
+
+         row_length = data.first.length
+         unless data.all? { |row| row.is_a?(Array) && row.length == row_length }
+           raise ArgumentError, "All rows must have the same length"
+         end
+
+         data.each_with_index do |row, i|
+           row.each_with_index do |val, j|
+             unless val.is_a?(Numeric)
+               raise ArgumentError, "Element at position [#{i}, #{j}] is not numeric"
+             end
+           end
+         end
+       end
+     end
+
+     # Module-level convenience methods (matching KMeans pattern)
+     class << self
+       # Perform HDBSCAN clustering (matches Clustering.kmeans signature pattern)
+       # @param data [Array] 2D array of data points
+       # @param min_samples [Integer] Min neighborhood size for core points
+       # @param min_cluster_size [Integer] Minimum size of clusters
+       # @param metric [String] Distance metric
+       # @return [Hash] Result hash with :labels, :probabilities, :outlier_scores
+       def hdbscan(data, min_samples: 5, min_cluster_size: 5, metric: 'euclidean')
+         clusterer = HDBSCAN.new(
+           min_samples: min_samples,
+           min_cluster_size: min_cluster_size,
+           metric: metric
+         )
+         clusterer.fit(data)
+         {
+           labels: clusterer.labels,
+           probabilities: clusterer.probabilities,
+           outlier_scores: clusterer.outlier_scores,
+           n_clusters: clusterer.n_clusters,
+           noise_ratio: clusterer.noise_ratio,
+           cluster_persistence: clusterer.cluster_persistence || {}
+         }
+       end
+     end
+   end
+ end
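
A minimal usage sketch of the HDBSCAN API above (assumes the gem and its compiled extension are installed; the sample data is illustrative):

    require 'clusterkit'

    # Two tight groups plus one far-away point, which should come back as noise (-1).
    data = Array.new(20) { [rand * 0.1, rand * 0.1] } +
           Array.new(20) { [1.0 + rand * 0.1, 1.0 + rand * 0.1] } +
           [[5.0, 5.0]]

    clusterer = ClusterKit::Clustering::HDBSCAN.new(min_samples: 3, min_cluster_size: 5)
    labels = clusterer.fit_predict(data)  # Array of labels; -1 marks noise

    clusterer.n_clusters   # discovered cluster count (noise excluded)
    clusterer.noise_ratio  # fraction of points labeled -1
    clusterer.summary      # hash of aggregate stats, as defined above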
data/lib/clusterkit/clustering.rb
@@ -0,0 +1,213 @@
+ # frozen_string_literal: true
+
+ require_relative 'clusterkit'
+ require_relative 'clustering/hdbscan'
+
+ module ClusterKit
+   # Module for clustering algorithms
+   module Clustering
+     # K-means clustering algorithm
+     class KMeans
+       attr_reader :k, :max_iter, :centroids, :labels, :inertia
+
+       # Initialize K-means clusterer
+       # @param k [Integer] Number of clusters
+       # @param max_iter [Integer] Maximum iterations (default: 300)
+       # @param random_seed [Integer] Random seed for reproducibility (optional)
+       def initialize(k:, max_iter: 300, random_seed: nil)
+         raise ArgumentError, "k must be positive" unless k > 0
+         @k = k
+         @max_iter = max_iter
+         @random_seed = random_seed
+         @fitted = false
+       end
+
+       # Fit the K-means model
+       # @param data [Array] 2D array of data points
+       # @return [self] Returns self for method chaining
+       def fit(data)
+         validate_data(data)
+
+         # Set random seed if provided
+         srand(@random_seed) if @random_seed
+
+         # Call Rust implementation
+         @labels, @centroids, @inertia = Clustering.kmeans_rust(data, @k, @max_iter)
+         @fitted = true
+
+         self
+       end
+
+       # Predict cluster labels for new data
+       # @param data [Array] 2D array of data points
+       # @return [Array] Cluster labels
+       def predict(data)
+         raise RuntimeError, "Model must be fitted before predict" unless fitted?
+         validate_data(data)
+
+         Clustering.kmeans_predict_rust(data, @centroids)
+       end
+
+       # Fit the model and return labels
+       # @param data [Array] 2D array of data points
+       # @return [Array] Cluster labels
+       def fit_predict(data)
+         fit(data)
+         @labels
+       end
+
+       # Check if model has been fitted
+       # @return [Boolean] True if fitted
+       def fitted?
+         @fitted
+       end
+
+       # Get cluster centers
+       # @return [Array] 2D array of cluster centers
+       def cluster_centers
+         @centroids
+       end
+
+       # Get the sum of squared distances of samples to their closest cluster center
+       # @return [Float] Inertia value
+       def inertia
+         @inertia
+       end
+
+       # Class methods for K-means specific utilities
+       class << self
+         # Find optimal number of clusters using elbow method
+         # @param data [Array] 2D array of data points
+         # @param k_range [Range] Range of k values to try
+         # @param max_iter [Integer] Maximum iterations per k
+         # @return [Hash] Mapping of k to inertia values
+         def elbow_method(data, k_range: 2..10, max_iter: 300)
+           results = {}
+
+           k_range.each do |k|
+             kmeans = new(k: k, max_iter: max_iter)
+             kmeans.fit(data)
+             results[k] = kmeans.inertia
+           end
+
+           results
+         end
+
+         # Detect optimal k from elbow method results
+         # @param elbow_results [Hash] Mapping of k to inertia values (from elbow_method)
+         # @param fallback_k [Integer] Default k to return if detection fails (default: 3)
+         # @return [Integer] Optimal number of clusters
+         def detect_optimal_k(elbow_results, fallback_k: 3)
+           return fallback_k if elbow_results.nil? || elbow_results.empty?
+
+           k_values = elbow_results.keys.sort
+           return k_values.first if k_values.size == 1
+
+           # Find the k with the largest drop in inertia
+           max_drop = 0
+           optimal_k = k_values.first
+
+           k_values.each_cons(2) do |k1, k2|
+             drop = elbow_results[k1] - elbow_results[k2]
+             if drop > max_drop
+               max_drop = drop
+               optimal_k = k2 # Use k after the drop
+             end
+           end
+
+           optimal_k
+         end
+
+         # Find optimal k and return it
+         # @param data [Array] 2D array of data points
+         # @param k_range [Range] Range of k values to try (default: 2..10)
+         # @param max_iter [Integer] Maximum iterations (default: 300)
+         # @return [Integer] Optimal number of clusters
+         def optimal_k(data, k_range: 2..10, max_iter: 300)
+           elbow_results = elbow_method(data, k_range: k_range, max_iter: max_iter)
+           detect_optimal_k(elbow_results)
+         end
+       end
+
+       private
+
+       def validate_data(data)
+         raise ArgumentError, "Data must be an array" unless data.is_a?(Array)
+         raise ArgumentError, "Data cannot be empty" if data.empty?
+         raise ArgumentError, "Data must be 2D array" unless data.first.is_a?(Array)
+
+         # Check all rows have same length
+         row_length = data.first.length
+         unless data.all? { |row| row.is_a?(Array) && row.length == row_length }
+           raise ArgumentError, "All rows must have the same length"
+         end
+
+         # Check all values are numeric
+         data.each_with_index do |row, i|
+           row.each_with_index do |val, j|
+             unless val.is_a?(Numeric)
+               raise ArgumentError, "Element at position [#{i}, #{j}] is not numeric"
+             end
+           end
+         end
+       end
+     end
+
+     # Module-level methods for cross-algorithm functionality
+     class << self
+       # Calculate silhouette score for any clustering result
+       # @param data [Array] 2D array of data points
+       # @param labels [Array] Cluster labels
+       # @return [Float] Mean silhouette coefficient
+       def silhouette_score(data, labels)
+         n_samples = data.size
+         unique_labels = labels.uniq
+
+         return 0.0 if unique_labels.size == 1
+
+         silhouette_values = []
+
+         data.each_with_index do |point, i|
+           cluster_label = labels[i]
+
+           # Calculate mean intra-cluster distance
+           same_cluster_indices = labels.each_index.select { |j| labels[j] == cluster_label && j != i }
+           if same_cluster_indices.empty?
+             silhouette_values << 0.0
+             next
+           end
+
+           a = same_cluster_indices.sum { |j| euclidean_distance(point, data[j]) } / same_cluster_indices.size.to_f
+
+           # Calculate mean nearest-cluster distance
+           b = Float::INFINITY
+           unique_labels.each do |other_label|
+             next if other_label == cluster_label
+
+             other_cluster_indices = labels.each_index.select { |j| labels[j] == other_label }
+             next if other_cluster_indices.empty?
+
+             mean_dist = other_cluster_indices.sum { |j| euclidean_distance(point, data[j]) } / other_cluster_indices.size.to_f
+             b = mean_dist if mean_dist < b
+           end
+
+           # Calculate silhouette value for this point
+           if a == 0.0 && b == 0.0
+             s = 0.0 # When all points are identical
+           else
+             s = (b - a) / [a, b].max
+           end
+           silhouette_values << s
+         end
+
+         silhouette_values.sum / silhouette_values.size.to_f
+       end
+
+       private
+
+       def euclidean_distance(a, b)
+         Math.sqrt(a.zip(b).sum { |x, y| (x - y) ** 2 })
+       end
+     end
+   end
+ end
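
A minimal sketch of the KMeans workflow above, combining the elbow-based optimal_k helper with the module-level silhouette_score (random data for illustration):

    require 'clusterkit'

    data = Array.new(50) { [rand, rand] }  # illustrative 2-D points

    # Pick k via the elbow heuristic defined above, then fit with a fixed seed.
    k = ClusterKit::Clustering::KMeans.optimal_k(data, k_range: 2..6)
    kmeans = ClusterKit::Clustering::KMeans.new(k: k, random_seed: 42)
    labels = kmeans.fit_predict(data)

    # Score the clustering with the cross-algorithm silhouette helper.
    score = ClusterKit::Clustering.silhouette_score(data, labels)
    puts "k=#{k}, silhouette=#{score.round(3)}"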
data/lib/clusterkit/clusterkit.rb
@@ -0,0 +1,9 @@
+ # frozen_string_literal: true
+
+ begin
+   # Try to load the compiled extension (.bundle on macOS)
+   require_relative "clusterkit.bundle"
+ rescue LoadError
+   # If that fails, try the .so extension (Linux)
+   require_relative "clusterkit.so"
+ end
data/lib/clusterkit/configuration.rb
@@ -0,0 +1,24 @@
+ # frozen_string_literal: true
+
+ module ClusterKit
+   class << self
+     attr_accessor :configuration
+   end
+
+   def self.configure
+     self.configuration ||= Configuration.new
+     yield(configuration) if block_given?
+   end
+
+   class Configuration
+     attr_accessor :verbose
+
+     def initialize
+       # Default to quiet unless explicitly set or a debug env var is present
+       @verbose = ENV['CLUSTERKIT_VERBOSE'] == 'true' || ENV['DEBUG'] == 'true'
+     end
+   end
+ end
+
+ # Initialize default configuration
+ ClusterKit.configure
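
A sketch of how the configuration block above is meant to be used (equivalent to setting CLUSTERKIT_VERBOSE=true in the environment):

    require 'clusterkit'

    # Turn on verbose output for this process.
    ClusterKit.configure do |config|
      config.verbose = true
    end

    ClusterKit.configuration.verbose  # => true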
data/lib/clusterkit/dimensionality/pca.rb
@@ -0,0 +1,251 @@
+ # frozen_string_literal: true
+
+ require_relative '../clusterkit'
+ require_relative 'svd'
+
+ module ClusterKit
+   module Dimensionality
+     # Principal Component Analysis using SVD
+     # PCA is a linear dimensionality reduction technique that finds
+     # the directions of maximum variance in the data
+     class PCA
+       attr_reader :n_components, :components, :explained_variance, :explained_variance_ratio, :mean
+
+       # Initialize PCA
+       # @param n_components [Integer] Number of principal components to keep
+       def initialize(n_components: 2)
+         @n_components = n_components
+         @fitted = false
+       end
+
+       # Fit the PCA model
+       # @param data [Array] 2D array of data points (n_samples × n_features)
+       # @return [self] Returns self for method chaining
+       def fit(data)
+         validate_data(data)
+
+         # Center the data (subtract mean from each feature)
+         @mean = calculate_mean(data)
+         centered_data = center_data(data, @mean)
+
+         # Perform SVD on centered data
+         # U contains the transformed data, S contains singular values, VT contains components
+         u, s, vt = ClusterKit.svd(centered_data, @n_components, n_iter: 5)
+
+         # Store the principal components (eigenvectors)
+         @components = vt # Shape: (n_components, n_features)
+
+         # Store singular values for consistency
+         @singular_values = s
+
+         # Calculate explained variance (eigenvalues)
+         n_samples = data.size.to_f
+         @explained_variance = s.map { |val| (val ** 2) / (n_samples - 1) }
+
+         # Calculate explained variance ratio
+         total_variance = calculate_total_variance(centered_data, n_samples)
+         @explained_variance_ratio = @explained_variance.map { |var| var / total_variance }
+
+         @fitted = true
+         self
+       end
+
+       # Transform data using the fitted PCA model
+       # @param data [Array] 2D array of data points
+       # @return [Array] Transformed data in principal component space
+       def transform(data)
+         raise RuntimeError, "Model must be fitted before transform" unless fitted?
+         validate_data(data)
+
+         # Center the data using the stored mean
+         centered_data = center_data(data, @mean)
+
+         # Project onto principal components
+         # Result = centered_data × components.T
+         project_data(centered_data, @components)
+       end
+
+       # Fit the model and transform the data in one step
+       # @param data [Array] 2D array of data points
+       # @return [Array] Transformed data
+       def fit_transform(data)
+         validate_data(data)
+
+         # Center the data (subtract mean from each feature)
+         @mean = calculate_mean(data)
+         centered_data = center_data(data, @mean)
+
+         # Perform SVD on centered data
+         u, s, vt = SVD.randomized_svd(centered_data, @n_components, n_iter: 5)
+
+         # Store the principal components (eigenvectors)
+         @components = vt
+
+         # Store singular values for later use
+         @singular_values = s
+
+         # Calculate explained variance (eigenvalues)
+         n_samples = data.size.to_f
+         @explained_variance = s.map { |val| (val ** 2) / (n_samples - 1) }
+
+         # Calculate explained variance ratio
+         total_variance = calculate_total_variance(centered_data, n_samples)
+         @explained_variance_ratio = @explained_variance.map { |var| var / total_variance }
+
+         @fitted = true
+
+         # For PCA, the transformed data is U * S
+         # Scale U by singular values
+         transformed = []
+         u.each do |row|
+           scaled_row = row.each_with_index.map { |val, i| val * s[i] }
+           transformed << scaled_row
+         end
+         transformed
+       end
+
+       # Inverse transform - reconstruct data from principal components
+       # @param data [Array] Transformed data in PC space
+       # @return [Array] Reconstructed data in original space
+       def inverse_transform(data)
+         raise RuntimeError, "Model must be fitted before inverse_transform" unless fitted?
+
+         # Reconstruct: data × components + mean
+         reconstructed = []
+         data.each do |sample|
+           reconstructed_sample = Array.new(@mean.size, 0.0)
+
+           sample.each_with_index do |value, i|
+             @components[i].each_with_index do |comp_val, j|
+               reconstructed_sample[j] += value * comp_val
+             end
+           end
+
+           # Add back the mean
+           reconstructed_sample = reconstructed_sample.zip(@mean).map { |r, m| r + m }
+           reconstructed << reconstructed_sample
+         end
+
+         reconstructed
+       end
+
+       # Get the amount of variance explained by each component
+       # @return [Array] Explained variance for each component
+       def explained_variance
+         raise RuntimeError, "Model must be fitted first" unless fitted?
+         @explained_variance
+       end
+
+       # Get the percentage of variance explained by each component
+       # @return [Array] Explained variance ratio for each component
+       def explained_variance_ratio
+         raise RuntimeError, "Model must be fitted first" unless fitted?
+         @explained_variance_ratio
+       end
+
+       # Get cumulative explained variance ratio
+       # @return [Array] Cumulative sum of explained variance ratios
+       def cumulative_explained_variance_ratio
+         raise RuntimeError, "Model must be fitted first" unless fitted?
+
+         cumsum = []
+         sum = 0.0
+         @explained_variance_ratio.each do |ratio|
+           sum += ratio
+           cumsum << sum
+         end
+         cumsum
+       end
+
+       # Check if model has been fitted
+       # @return [Boolean] True if fitted
+       def fitted?
+         @fitted
+       end
+
+       private
+
+       def validate_data(data)
+         raise ArgumentError, "Data must be an array" unless data.is_a?(Array)
+         raise ArgumentError, "Data cannot be empty" if data.empty?
+         raise ArgumentError, "Data must be 2D array" unless data.first.is_a?(Array)
+
+         # Check all rows have same length
+         row_length = data.first.length
+         unless data.all? { |row| row.is_a?(Array) && row.length == row_length }
+           raise ArgumentError, "All rows must have the same length"
+         end
+
+         # Check we have enough samples for n_components
+         if data.size < @n_components
+           raise ArgumentError, "n_components (#{@n_components}) cannot be larger than n_samples (#{data.size})"
+         end
+
+         if data.first.size < @n_components
+           raise ArgumentError, "n_components (#{@n_components}) cannot be larger than n_features (#{data.first.size})"
+         end
+       end
+
+       def calculate_mean(data)
+         n_features = data.first.size
+         mean = Array.new(n_features, 0.0)
+
+         data.each do |row|
+           row.each_with_index do |val, i|
+             mean[i] += val
+           end
+         end
+
+         mean.map { |sum| sum / data.size.to_f }
+       end
+
+       def center_data(data, mean)
+         data.map do |row|
+           row.zip(mean).map { |val, m| val - m }
+         end
+       end
+
+       def calculate_total_variance(centered_data, n_samples)
+         total_var = 0.0
+
+         centered_data.each do |row|
+           row.each do |val|
+             total_var += val ** 2
+           end
+         end
+
+         total_var / (n_samples - 1)
+       end
+
+       def project_data(centered_data, components)
+         # Matrix multiplication: centered_data × components.T
+         transformed = []
+
+         centered_data.each do |sample|
+           projected = Array.new(@n_components, 0.0)
+
+           components.each_with_index do |component, i|
+             dot_product = 0.0
+             sample.each_with_index do |val, j|
+               dot_product += val * component[j]
+             end
+             projected[i] = dot_product
+           end
+
+           transformed << projected
+         end
+
+         transformed
+       end
+     end
+
+     # Module-level convenience method
+     # @param data [Array] 2D array of data points
+     # @param n_components [Integer] Number of components
+     # @return [Array] Transformed data
+     def self.pca(data, n_components: 2)
+       pca = PCA.new(n_components: n_components)
+       pca.fit_transform(data)
+     end
+   end
+ end
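
A minimal sketch of the PCA class above, assuming the native SVD extension is available; the 3-D input data is illustrative:

    require 'clusterkit'

    data = Array.new(100) { [rand, 10 * rand, 0.1 * rand] }  # illustrative 3-D points

    pca = ClusterKit::Dimensionality::PCA.new(n_components: 2)
    reduced = pca.fit_transform(data)          # n_samples × 2

    pca.explained_variance_ratio               # per-component variance share
    pca.cumulative_explained_variance_ratio    # running total

    # Reconstruction back to the original space is lossy when components are dropped.
    restored = pca.inverse_transform(reduced)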