clusterkit 0.1.0.pre.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.rspec +3 -0
- data/.simplecov +47 -0
- data/CHANGELOG.md +35 -0
- data/CLAUDE.md +226 -0
- data/Cargo.toml +8 -0
- data/Gemfile +17 -0
- data/IMPLEMENTATION_NOTES.md +143 -0
- data/LICENSE.txt +21 -0
- data/PYTHON_COMPARISON.md +183 -0
- data/README.md +499 -0
- data/Rakefile +245 -0
- data/clusterkit.gemspec +45 -0
- data/docs/KNOWN_ISSUES.md +130 -0
- data/docs/RUST_ERROR_HANDLING.md +164 -0
- data/docs/TEST_FIXTURES.md +170 -0
- data/docs/UMAP_EXPLAINED.md +362 -0
- data/docs/UMAP_TROUBLESHOOTING.md +284 -0
- data/docs/VERBOSE_OUTPUT.md +84 -0
- data/examples/hdbscan_example.rb +147 -0
- data/examples/optimal_kmeans_example.rb +96 -0
- data/examples/pca_example.rb +114 -0
- data/examples/reproducible_umap.rb +99 -0
- data/examples/verbose_control.rb +43 -0
- data/ext/clusterkit/Cargo.toml +25 -0
- data/ext/clusterkit/extconf.rb +4 -0
- data/ext/clusterkit/src/clustering/hdbscan_wrapper.rs +115 -0
- data/ext/clusterkit/src/clustering.rs +267 -0
- data/ext/clusterkit/src/embedder.rs +413 -0
- data/ext/clusterkit/src/lib.rs +22 -0
- data/ext/clusterkit/src/svd.rs +112 -0
- data/ext/clusterkit/src/tests.rs +16 -0
- data/ext/clusterkit/src/utils.rs +33 -0
- data/lib/clusterkit/clustering/hdbscan.rb +177 -0
- data/lib/clusterkit/clustering.rb +213 -0
- data/lib/clusterkit/clusterkit.rb +9 -0
- data/lib/clusterkit/configuration.rb +24 -0
- data/lib/clusterkit/dimensionality/pca.rb +251 -0
- data/lib/clusterkit/dimensionality/svd.rb +144 -0
- data/lib/clusterkit/dimensionality/umap.rb +311 -0
- data/lib/clusterkit/dimensionality.rb +29 -0
- data/lib/clusterkit/hdbscan_api_design.rb +142 -0
- data/lib/clusterkit/preprocessing.rb +106 -0
- data/lib/clusterkit/silence.rb +42 -0
- data/lib/clusterkit/utils.rb +51 -0
- data/lib/clusterkit/version.rb +5 -0
- data/lib/clusterkit.rb +93 -0
- data/lib/tasks/visualize.rake +641 -0
- metadata +194 -0
data/lib/clusterkit/clustering/hdbscan.rb
@@ -0,0 +1,177 @@
# frozen_string_literal: true

module ClusterKit
  module Clustering
    # HDBSCAN clustering algorithm - matching KMeans API pattern
    class HDBSCAN
      attr_reader :min_samples, :min_cluster_size, :metric, :labels, :probabilities,
                  :outlier_scores, :cluster_persistence

      # Initialize HDBSCAN clusterer (matches KMeans pattern)
      # @param min_samples [Integer] Min neighborhood size for core points (default: 5)
      # @param min_cluster_size [Integer] Minimum size of clusters (default: 5)
      # @param metric [String] Distance metric (default: 'euclidean')
      def initialize(min_samples: 5, min_cluster_size: 5, metric: 'euclidean')
        raise ArgumentError, "min_samples must be positive" unless min_samples > 0
        raise ArgumentError, "min_cluster_size must be positive" unless min_cluster_size > 0

        valid_metrics = ['euclidean', 'l2', 'manhattan', 'l1', 'cosine']
        unless valid_metrics.include?(metric)
          raise ArgumentError, "metric must be one of: #{valid_metrics.join(', ')}"
        end

        @min_samples = min_samples
        @min_cluster_size = min_cluster_size
        @metric = metric
        @fitted = false
      end

      # Fit the HDBSCAN model (matches KMeans.fit)
      # @param data [Array] 2D array of data points
      # @return [self] Returns self for method chaining
      def fit(data)
        validate_data(data)

        # Call Rust implementation (hdbscan crate)
        result = Clustering.hdbscan_rust(data, @min_samples, @min_cluster_size, @metric)

        @labels = result["labels"]
        @probabilities = result["probabilities"]
        @outlier_scores = result["outlier_scores"]
        @cluster_persistence = result["cluster_persistence"]
        @fitted = true

        self
      end

      # HDBSCAN doesn't support predict for new points (unlike KMeans)
      # But we keep the method for API consistency
      # @param data [Array] 2D array of data points
      # @return [Array] Returns nil or raises
      def predict(data)
        raise NotImplementedError, "HDBSCAN does not support prediction on new data. " \
                                   "Use approximate_predict for approximate membership"
      end

      # Fit the model and return labels (matches KMeans.fit_predict)
      # @param data [Array] 2D array of data points
      # @return [Array] Cluster labels (-1 for noise)
      def fit_predict(data)
        fit(data)
        @labels
      end

      # Check if model has been fitted (matches KMeans.fitted?)
      # @return [Boolean] True if fitted
      def fitted?
        @fitted
      end

      # Get number of clusters found (similar to KMeans.k but discovered)
      # @return [Integer] Number of clusters (excluding noise)
      def n_clusters
        return 0 unless fitted?
        # Count unique labels excluding -1 (noise)
        unique_labels = @labels.uniq.reject { |l| l == -1 }
        unique_labels.length
      end

      # Get noise ratio (HDBSCAN-specific but follows naming pattern)
      # @return [Float] Fraction of points labeled as noise
      def noise_ratio
        return 0.0 unless fitted?
        @labels.count(-1).to_f / @labels.length
      end

      # Get the number of noise points
      # @return [Integer] Number of points labeled as noise
      def n_noise_points
        return 0 unless fitted?
        @labels.count(-1)
      end

      # Get indices of noise points
      # @return [Array<Integer>] Indices of points labeled as noise
      def noise_indices
        return [] unless fitted?
        @labels.each_with_index.select { |label, _| label == -1 }.map { |_, idx| idx }
      end

      # Get indices of points in each cluster
      # @return [Hash<Integer, Array<Integer>>] Mapping of cluster label to point indices
      def cluster_indices
        return {} unless fitted?

        result = {}
        @labels.each_with_index do |label, idx|
          next if label == -1 # Skip noise points
          result[label] ||= []
          result[label] << idx
        end
        result
      end

      # Get summary statistics
      # @return [Hash] Summary of clustering results
      def summary
        return {} unless fitted?

        {
          n_clusters: n_clusters,
          n_noise_points: n_noise_points,
          noise_ratio: noise_ratio,
          cluster_sizes: cluster_indices.transform_values(&:length),
          cluster_persistence: @cluster_persistence
        }
      end

      private

      def validate_data(data)
        # Exact same validation as KMeans for consistency
        raise ArgumentError, "Data must be an array" unless data.is_a?(Array)
        raise ArgumentError, "Data cannot be empty" if data.empty?
        raise ArgumentError, "Data must be 2D array" unless data.first.is_a?(Array)

        row_length = data.first.length
        unless data.all? { |row| row.is_a?(Array) && row.length == row_length }
          raise ArgumentError, "All rows must have the same length"
        end

        data.each_with_index do |row, i|
          row.each_with_index do |val, j|
            unless val.is_a?(Numeric)
              raise ArgumentError, "Element at position [#{i}, #{j}] is not numeric"
            end
          end
        end
      end
    end

    # Module-level convenience methods (matching KMeans pattern)
    class << self
      # Perform HDBSCAN clustering (matches Clustering.kmeans signature pattern)
      # @param data [Array] 2D array of data points
      # @param min_samples [Integer] Min neighborhood size for core points
      # @param min_cluster_size [Integer] Minimum size of clusters
      # @param metric [String] Distance metric
      # @return [Hash] Result hash with :labels, :probabilities, :outlier_scores
      def hdbscan(data, min_samples: 5, min_cluster_size: 5, metric: 'euclidean')
        clusterer = HDBSCAN.new(
          min_samples: min_samples,
          min_cluster_size: min_cluster_size,
          metric: metric
        )
        clusterer.fit(data)
        {
          labels: clusterer.labels,
          probabilities: clusterer.probabilities,
          outlier_scores: clusterer.outlier_scores,
          n_clusters: clusterer.n_clusters,
          noise_ratio: clusterer.noise_ratio,
          cluster_persistence: clusterer.cluster_persistence || {}
        }
      end
    end
  end
end
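For orientation, a minimal usage sketch of the HDBSCAN API added above. The sample data is hypothetical and the sketch assumes the gem is installed with its native extension built; exact labels depend on the underlying hdbscan crate.

require 'clusterkit'

# Two tight groups plus one obvious outlier (hypothetical toy data)
data = [[0.0, 0.1], [0.1, 0.0], [0.2, 0.1],
        [5.0, 5.1], [5.1, 5.0], [5.2, 4.9],
        [20.0, 20.0]]

clusterer = ClusterKit::Clustering::HDBSCAN.new(min_samples: 2, min_cluster_size: 2)
labels = clusterer.fit_predict(data)   # e.g. [0, 0, 0, 1, 1, 1, -1]; -1 marks noise

clusterer.n_clusters      # clusters discovered (noise excluded)
clusterer.noise_indices   # indices of points labeled -1
clusterer.summary         # { n_clusters:, noise_ratio:, cluster_sizes:, ... }

# Or the module-level convenience wrapper, which returns a result hash
result = ClusterKit::Clustering.hdbscan(data, min_samples: 2, min_cluster_size: 2)
result[:labels]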
data/lib/clusterkit/clustering.rb
@@ -0,0 +1,213 @@
# frozen_string_literal: true

require_relative 'clusterkit'
require_relative 'clustering/hdbscan'

module ClusterKit
  # Module for clustering algorithms
  module Clustering
    # K-means clustering algorithm
    class KMeans
      attr_reader :k, :max_iter, :centroids, :labels, :inertia

      # Initialize K-means clusterer
      # @param k [Integer] Number of clusters
      # @param max_iter [Integer] Maximum iterations (default: 300)
      # @param random_seed [Integer] Random seed for reproducibility (optional)
      def initialize(k:, max_iter: 300, random_seed: nil)
        raise ArgumentError, "k must be positive" unless k > 0
        @k = k
        @max_iter = max_iter
        @random_seed = random_seed
        @fitted = false
      end

      # Fit the K-means model
      # @param data [Array] 2D array of data points
      # @return [self] Returns self for method chaining
      def fit(data)
        validate_data(data)

        # Set random seed if provided
        srand(@random_seed) if @random_seed

        # Call Rust implementation
        @labels, @centroids, @inertia = Clustering.kmeans_rust(data, @k, @max_iter)
        @fitted = true

        self
      end

      # Predict cluster labels for new data
      # @param data [Array] 2D array of data points
      # @return [Array] Cluster labels
      def predict(data)
        raise RuntimeError, "Model must be fitted before predict" unless fitted?
        validate_data(data)

        Clustering.kmeans_predict_rust(data, @centroids)
      end

      # Fit the model and return labels
      # @param data [Array] 2D array of data points
      # @return [Array] Cluster labels
      def fit_predict(data)
        fit(data)
        @labels
      end

      # Check if model has been fitted
      # @return [Boolean] True if fitted
      def fitted?
        @fitted
      end

      # Get cluster centers
      # @return [Array] 2D array of cluster centers
      def cluster_centers
        @centroids
      end

      # Get the sum of squared distances of samples to their closest cluster center
      # @return [Float] Inertia value
      def inertia
        @inertia
      end

      # Class methods for K-means specific utilities
      class << self
        # Find optimal number of clusters using elbow method
        # @param data [Array] 2D array of data points
        # @param k_range [Range] Range of k values to try
        # @param max_iter [Integer] Maximum iterations per k
        # @return [Hash] Mapping of k to inertia values
        def elbow_method(data, k_range: 2..10, max_iter: 300)
          results = {}

          k_range.each do |k|
            kmeans = new(k: k, max_iter: max_iter)
            kmeans.fit(data)
            results[k] = kmeans.inertia
          end

          results
        end

        # Detect optimal k from elbow method results
        # @param elbow_results [Hash] Mapping of k to inertia values (from elbow_method)
        # @param fallback_k [Integer] Default k to return if detection fails (default: 3)
        # @return [Integer] Optimal number of clusters
        def detect_optimal_k(elbow_results, fallback_k: 3)
          return fallback_k if elbow_results.nil? || elbow_results.empty?

          k_values = elbow_results.keys.sort
          return k_values.first if k_values.size == 1

          # Find the k with the largest drop in inertia
          max_drop = 0
          optimal_k = k_values.first

          k_values.each_cons(2) do |k1, k2|
            drop = elbow_results[k1] - elbow_results[k2]
            if drop > max_drop
              max_drop = drop
              optimal_k = k2 # Use k after the drop
            end
          end

          optimal_k
        end

        # Find optimal k and return it
        # @param data [Array] 2D array of data points
        # @param k_range [Range] Range of k values to try (default: 2..10)
        # @param max_iter [Integer] Maximum iterations (default: 300)
        # @return [Integer] Optimal number of clusters
        def optimal_k(data, k_range: 2..10, max_iter: 300)
          elbow_results = elbow_method(data, k_range: k_range, max_iter: max_iter)
          detect_optimal_k(elbow_results)
        end
      end

      private

      def validate_data(data)
        raise ArgumentError, "Data must be an array" unless data.is_a?(Array)
        raise ArgumentError, "Data cannot be empty" if data.empty?
        raise ArgumentError, "Data must be 2D array" unless data.first.is_a?(Array)

        # Check all rows have same length
        row_length = data.first.length
        unless data.all? { |row| row.is_a?(Array) && row.length == row_length }
          raise ArgumentError, "All rows must have the same length"
        end

        # Check all values are numeric
        data.each_with_index do |row, i|
          row.each_with_index do |val, j|
            unless val.is_a?(Numeric)
              raise ArgumentError, "Element at position [#{i}, #{j}] is not numeric"
            end
          end
        end
      end
    end

    # Module-level methods for cross-algorithm functionality
    class << self
      # Calculate silhouette score for any clustering result
      # @param data [Array] 2D array of data points
      # @param labels [Array] Cluster labels
      # @return [Float] Mean silhouette coefficient
      def silhouette_score(data, labels)
        n_samples = data.size
        unique_labels = labels.uniq

        return 0.0 if unique_labels.size == 1

        silhouette_values = []

        data.each_with_index do |point, i|
          cluster_label = labels[i]

          # Calculate mean intra-cluster distance
          same_cluster_indices = labels.each_index.select { |j| labels[j] == cluster_label && j != i }
          if same_cluster_indices.empty?
            silhouette_values << 0.0
            next
          end

          a = same_cluster_indices.sum { |j| euclidean_distance(point, data[j]) } / same_cluster_indices.size.to_f

          # Calculate mean nearest-cluster distance
          b = Float::INFINITY
          unique_labels.each do |other_label|
            next if other_label == cluster_label

            other_cluster_indices = labels.each_index.select { |j| labels[j] == other_label }
            next if other_cluster_indices.empty?

            mean_dist = other_cluster_indices.sum { |j| euclidean_distance(point, data[j]) } / other_cluster_indices.size.to_f
            b = mean_dist if mean_dist < b
          end

          # Calculate silhouette value for this point
          if a == 0.0 && b == 0.0
            s = 0.0 # When all points are identical
          else
            s = (b - a) / [a, b].max
          end
          silhouette_values << s
        end

        silhouette_values.sum / silhouette_values.size.to_f
      end

      private

      def euclidean_distance(a, b)
        Math.sqrt(a.zip(b).sum { |x, y| (x - y) ** 2 })
      end
    end
  end
end
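A short usage sketch of the KMeans class and the module-level helpers above; the data is a hypothetical toy set and the sketch assumes the native extension is available.

require 'clusterkit'

data = [[1.0, 2.0], [1.1, 1.9], [0.9, 2.1],   # hypothetical toy data
        [8.0, 8.0], [8.2, 7.9], [7.9, 8.1]]

kmeans = ClusterKit::Clustering::KMeans.new(k: 2, random_seed: 42)
labels = kmeans.fit_predict(data)
kmeans.cluster_centers   # 2D array of centroids
kmeans.inertia           # sum of squared distances to the closest centroid

# Choose k with the elbow heuristic defined above
best_k = ClusterKit::Clustering::KMeans.optimal_k(data, k_range: 2..4)

# Score any labeling (from KMeans or HDBSCAN) with the shared silhouette metric
ClusterKit::Clustering.silhouette_score(data, labels)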
data/lib/clusterkit/configuration.rb
@@ -0,0 +1,24 @@
# frozen_string_literal: true

module ClusterKit
  class << self
    attr_accessor :configuration
  end

  def self.configure
    self.configuration ||= Configuration.new
    yield(configuration) if block_given?
  end

  class Configuration
    attr_accessor :verbose

    def initialize
      # Default to quiet unless explicitly set or debug env var is present
      @verbose = ENV['CLUSTERKIT_VERBOSE'] == 'true' || ENV['DEBUG'] == 'true'
    end
  end
end

# Initialize default configuration
ClusterKit.configure
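A brief sketch of how this configuration hook is meant to be used, based only on the accessor and environment-variable defaults shown above.

# Opt in to verbose output at runtime...
ClusterKit.configure do |config|
  config.verbose = true
end

ClusterKit.configuration.verbose  # => true

# ...or via environment variables before the library is loaded:
#   CLUSTERKIT_VERBOSE=true ruby my_script.rb
#   DEBUG=true ruby my_script.rb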
data/lib/clusterkit/dimensionality/pca.rb
@@ -0,0 +1,251 @@
# frozen_string_literal: true

require_relative '../clusterkit'
require_relative 'svd'

module ClusterKit
  module Dimensionality
    # Principal Component Analysis using SVD
    # PCA is a linear dimensionality reduction technique that finds
    # the directions of maximum variance in the data
    class PCA
      attr_reader :n_components, :components, :explained_variance, :explained_variance_ratio, :mean

      # Initialize PCA
      # @param n_components [Integer] Number of principal components to keep
      def initialize(n_components: 2)
        @n_components = n_components
        @fitted = false
      end

      # Fit the PCA model
      # @param data [Array] 2D array of data points (n_samples × n_features)
      # @return [self] Returns self for method chaining
      def fit(data)
        validate_data(data)

        # Center the data (subtract mean from each feature)
        @mean = calculate_mean(data)
        centered_data = center_data(data, @mean)

        # Perform SVD on centered data
        # U contains the transformed data, S contains singular values, VT contains components
        u, s, vt = ClusterKit.svd(centered_data, @n_components, n_iter: 5)

        # Store the principal components (eigenvectors)
        @components = vt # Shape: (n_components, n_features)

        # Store singular values for consistency
        @singular_values = s

        # Calculate explained variance (eigenvalues)
        n_samples = data.size.to_f
        @explained_variance = s.map { |val| (val ** 2) / (n_samples - 1) }

        # Calculate explained variance ratio
        total_variance = calculate_total_variance(centered_data, n_samples)
        @explained_variance_ratio = @explained_variance.map { |var| var / total_variance }

        @fitted = true
        self
      end

      # Transform data using the fitted PCA model
      # @param data [Array] 2D array of data points
      # @return [Array] Transformed data in principal component space
      def transform(data)
        raise RuntimeError, "Model must be fitted before transform" unless fitted?
        validate_data(data)

        # Center the data using the stored mean
        centered_data = center_data(data, @mean)

        # Project onto principal components
        # Result = centered_data × components.T
        project_data(centered_data, @components)
      end

      # Fit the model and transform the data in one step
      # @param data [Array] 2D array of data points
      # @return [Array] Transformed data
      def fit_transform(data)
        validate_data(data)

        # Center the data (subtract mean from each feature)
        @mean = calculate_mean(data)
        centered_data = center_data(data, @mean)

        # Perform SVD on centered data
        u, s, vt = SVD.randomized_svd(centered_data, @n_components, n_iter: 5)

        # Store the principal components (eigenvectors)
        @components = vt

        # Store singular values for later use
        @singular_values = s

        # Calculate explained variance (eigenvalues)
        n_samples = data.size.to_f
        @explained_variance = s.map { |val| (val ** 2) / (n_samples - 1) }

        # Calculate explained variance ratio
        total_variance = calculate_total_variance(centered_data, n_samples)
        @explained_variance_ratio = @explained_variance.map { |var| var / total_variance }

        @fitted = true

        # For PCA, the transformed data is U * S
        # Scale U by singular values
        transformed = []
        u.each do |row|
          scaled_row = row.each_with_index.map { |val, i| val * s[i] }
          transformed << scaled_row
        end
        transformed
      end

      # Inverse transform - reconstruct data from principal components
      # @param data [Array] Transformed data in PC space
      # @return [Array] Reconstructed data in original space
      def inverse_transform(data)
        raise RuntimeError, "Model must be fitted before inverse_transform" unless fitted?

        # Reconstruct: data × components + mean
        reconstructed = []
        data.each do |sample|
          reconstructed_sample = Array.new(@mean.size, 0.0)

          sample.each_with_index do |value, i|
            @components[i].each_with_index do |comp_val, j|
              reconstructed_sample[j] += value * comp_val
            end
          end

          # Add back the mean
          reconstructed_sample = reconstructed_sample.zip(@mean).map { |r, m| r + m }
          reconstructed << reconstructed_sample
        end

        reconstructed
      end

      # Get the amount of variance explained by each component
      # @return [Array] Explained variance for each component
      def explained_variance
        raise RuntimeError, "Model must be fitted first" unless fitted?
        @explained_variance
      end

      # Get the percentage of variance explained by each component
      # @return [Array] Explained variance ratio for each component
      def explained_variance_ratio
        raise RuntimeError, "Model must be fitted first" unless fitted?
        @explained_variance_ratio
      end

      # Get cumulative explained variance ratio
      # @return [Array] Cumulative sum of explained variance ratios
      def cumulative_explained_variance_ratio
        raise RuntimeError, "Model must be fitted first" unless fitted?

        cumsum = []
        sum = 0.0
        @explained_variance_ratio.each do |ratio|
          sum += ratio
          cumsum << sum
        end
        cumsum
      end

      # Check if model has been fitted
      # @return [Boolean] True if fitted
      def fitted?
        @fitted
      end

      private

      def validate_data(data)
        raise ArgumentError, "Data must be an array" unless data.is_a?(Array)
        raise ArgumentError, "Data cannot be empty" if data.empty?
        raise ArgumentError, "Data must be 2D array" unless data.first.is_a?(Array)

        # Check all rows have same length
        row_length = data.first.length
        unless data.all? { |row| row.is_a?(Array) && row.length == row_length }
          raise ArgumentError, "All rows must have the same length"
        end

        # Check we have enough samples for n_components
        if data.size < @n_components
          raise ArgumentError, "n_components (#{@n_components}) cannot be larger than n_samples (#{data.size})"
        end

        if data.first.size < @n_components
          raise ArgumentError, "n_components (#{@n_components}) cannot be larger than n_features (#{data.first.size})"
        end
      end

      def calculate_mean(data)
        n_features = data.first.size
        mean = Array.new(n_features, 0.0)

        data.each do |row|
          row.each_with_index do |val, i|
            mean[i] += val
          end
        end

        mean.map { |sum| sum / data.size.to_f }
      end

      def center_data(data, mean)
        data.map do |row|
          row.zip(mean).map { |val, m| val - m }
        end
      end

      def calculate_total_variance(centered_data, n_samples)
        total_var = 0.0

        centered_data.each do |row|
          row.each do |val|
            total_var += val ** 2
          end
        end

        total_var / (n_samples - 1)
      end

      def project_data(centered_data, components)
        # Matrix multiplication: centered_data × components.T
        transformed = []

        centered_data.each do |sample|
          projected = Array.new(@n_components, 0.0)

          components.each_with_index do |component, i|
            dot_product = 0.0
            sample.each_with_index do |val, j|
              dot_product += val * component[j]
            end
            projected[i] = dot_product
          end

          transformed << projected
        end

        transformed
      end
    end

    # Module-level convenience method
    # @param data [Array] 2D array of data points
    # @param n_components [Integer] Number of components
    # @return [Array] Transformed data
    def self.pca(data, n_components: 2)
      pca = PCA.new(n_components: n_components)
      pca.fit_transform(data)
    end
  end
end
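Finally, a minimal usage sketch of the PCA class above; the 3-feature data is hypothetical and the sketch assumes the SVD-backed native extension is built.

require 'clusterkit'

# Hypothetical 3-feature data reduced to 2 principal components
data = [[2.5, 2.4, 0.5], [0.5, 0.7, 1.1], [2.2, 2.9, 0.4],
        [1.9, 2.2, 0.6], [3.1, 3.0, 0.3], [2.3, 2.7, 0.5]]

pca = ClusterKit::Dimensionality::PCA.new(n_components: 2)
reduced = pca.fit_transform(data)           # n_samples x 2 array (U scaled by S)
pca.explained_variance_ratio                # share of variance per component
pca.cumulative_explained_variance_ratio     # running total, useful for picking n_components

# Project new points with the fitted model, or map back to the original space
projected = pca.transform([[2.0, 2.1, 0.5]])
approx    = pca.inverse_transform(projected)

# One-shot convenience method
ClusterKit::Dimensionality.pca(data, n_components: 2)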