clusterkit 0.3.0-aarch64-linux
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.rspec +3 -0
- data/.simplecov +47 -0
- data/CHANGELOG.md +35 -0
- data/CLAUDE.md +226 -0
- data/Cargo.lock +3228 -0
- data/Cargo.toml +8 -0
- data/Gemfile +17 -0
- data/IMPLEMENTATION_NOTES.md +143 -0
- data/LICENSE.txt +21 -0
- data/PYTHON_COMPARISON.md +183 -0
- data/README.md +744 -0
- data/Rakefile +259 -0
- data/docs/KNOWN_ISSUES.md +130 -0
- data/docs/RUST_ERROR_HANDLING.md +164 -0
- data/docs/TEST_FIXTURES.md +170 -0
- data/docs/UMAP_EXPLAINED.md +362 -0
- data/docs/UMAP_TROUBLESHOOTING.md +284 -0
- data/docs/VERBOSE_OUTPUT.md +84 -0
- data/docs/assets/clusterkit-wide.png +0 -0
- data/docs/assets/clusterkit.png +0 -0
- data/docs/assets/visualization.png +0 -0
- data/examples/hdbscan_example.rb +147 -0
- data/examples/optimal_kmeans_example.rb +96 -0
- data/examples/pca_example.rb +114 -0
- data/examples/reproducible_umap.rb +99 -0
- data/examples/verbose_control.rb +43 -0
- data/ext/clusterkit/Cargo.toml +26 -0
- data/ext/clusterkit/extconf.rb +23 -0
- data/ext/clusterkit/src/clustering/hdbscan_wrapper.rs +80 -0
- data/ext/clusterkit/src/clustering.rs +221 -0
- data/ext/clusterkit/src/embedder.rs +349 -0
- data/ext/clusterkit/src/hnsw.rs +579 -0
- data/ext/clusterkit/src/lib.rs +24 -0
- data/ext/clusterkit/src/svd.rs +89 -0
- data/ext/clusterkit/src/tests.rs +16 -0
- data/ext/clusterkit/src/utils.rs +183 -0
- data/lib/clusterkit/3.1/clusterkit.so +0 -0
- data/lib/clusterkit/3.2/clusterkit.so +0 -0
- data/lib/clusterkit/3.3/clusterkit.so +0 -0
- data/lib/clusterkit/3.4/clusterkit.so +0 -0
- data/lib/clusterkit/clustering/hdbscan.rb +164 -0
- data/lib/clusterkit/clustering.rb +194 -0
- data/lib/clusterkit/clusterkit.rb +14 -0
- data/lib/clusterkit/configuration.rb +24 -0
- data/lib/clusterkit/data_validator.rb +132 -0
- data/lib/clusterkit/dimensionality/pca.rb +251 -0
- data/lib/clusterkit/dimensionality/svd.rb +175 -0
- data/lib/clusterkit/dimensionality/umap.rb +282 -0
- data/lib/clusterkit/dimensionality.rb +29 -0
- data/lib/clusterkit/hdbscan_api_design.rb +142 -0
- data/lib/clusterkit/hnsw.rb +251 -0
- data/lib/clusterkit/preprocessing.rb +106 -0
- data/lib/clusterkit/silence.rb +42 -0
- data/lib/clusterkit/utils.rb +51 -0
- data/lib/clusterkit/version.rb +5 -0
- data/lib/clusterkit.rb +105 -0
- data/lib/tasks/visualize.rake +641 -0
- metadata +220 -0
|
@@ -0,0 +1,183 @@
|
|
|
1
|
+
use magnus::{function, prelude::*, Error, Value, RArray, TryConvert, Float, Integer, Ruby};
|
|
2
|
+
use ndarray::Array2;
|
|
3
|
+
|
|
4
|
+
pub fn init(parent: &magnus::RModule) -> Result<(), Error> {
|
|
5
|
+
let utils_module = parent.define_module("Utils")?;
|
|
6
|
+
|
|
7
|
+
utils_module.define_singleton_method(
|
|
8
|
+
"estimate_intrinsic_dimension_rust",
|
|
9
|
+
function!(estimate_intrinsic_dimension, 2),
|
|
10
|
+
)?;
|
|
11
|
+
|
|
12
|
+
utils_module.define_singleton_method(
|
|
13
|
+
"estimate_hubness_rust",
|
|
14
|
+
function!(estimate_hubness, 1),
|
|
15
|
+
)?;
|
|
16
|
+
|
|
17
|
+
Ok(())
|
|
18
|
+
}
|
|
19
|
+
|
|
20
|
+
fn estimate_intrinsic_dimension(_data: Value, _k_neighbors: usize) -> Result<f64, Error> {
|
|
21
|
+
let ruby = Ruby::get().unwrap();
|
|
22
|
+
Err(Error::new(
|
|
23
|
+
ruby.exception_not_imp_error(),
|
|
24
|
+
"Dimension estimation not implemented yet",
|
|
25
|
+
))
|
|
26
|
+
}
|
|
27
|
+
|
|
28
|
+
fn estimate_hubness(_data: Value) -> Result<Value, Error> {
|
|
29
|
+
let ruby = Ruby::get().unwrap();
|
|
30
|
+
Err(Error::new(
|
|
31
|
+
ruby.exception_not_imp_error(),
|
|
32
|
+
"Hubness estimation not implemented yet",
|
|
33
|
+
))
|
|
34
|
+
}
|
|
35
|
+
|
|
36
|
+
/// Convert Ruby 2D array to ndarray Array2<f64>
|
|
37
|
+
/// Handles validation and provides consistent error messages
|
|
38
|
+
pub fn ruby_array_to_ndarray2(data: Value) -> Result<Array2<f64>, Error> {
|
|
39
|
+
let ruby = Ruby::get().unwrap();
|
|
40
|
+
let rarray: RArray = TryConvert::try_convert(data)?;
|
|
41
|
+
let n_samples = rarray.len();
|
|
42
|
+
|
|
43
|
+
if n_samples == 0 {
|
|
44
|
+
return Err(Error::new(
|
|
45
|
+
ruby.exception_arg_error(),
|
|
46
|
+
"Data cannot be empty",
|
|
47
|
+
));
|
|
48
|
+
}
|
|
49
|
+
|
|
50
|
+
// Get dimensions from first row
|
|
51
|
+
let first_row: RArray = rarray.entry::<RArray>(0)?;
|
|
52
|
+
let n_features = first_row.len();
|
|
53
|
+
|
|
54
|
+
if n_features == 0 {
|
|
55
|
+
return Err(Error::new(
|
|
56
|
+
ruby.exception_arg_error(),
|
|
57
|
+
"Data rows cannot be empty",
|
|
58
|
+
));
|
|
59
|
+
}
|
|
60
|
+
|
|
61
|
+
// Create ndarray and populate
|
|
62
|
+
let mut data_array = Array2::<f64>::zeros((n_samples, n_features));
|
|
63
|
+
for i in 0..n_samples {
|
|
64
|
+
let row: RArray = rarray.entry(i as isize)?;
|
|
65
|
+
|
|
66
|
+
// Validate row length consistency
|
|
67
|
+
if row.len() != n_features {
|
|
68
|
+
return Err(Error::new(
|
|
69
|
+
ruby.exception_arg_error(),
|
|
70
|
+
format!("Row {} has {} elements, expected {}", i, row.len(), n_features),
|
|
71
|
+
));
|
|
72
|
+
}
|
|
73
|
+
|
|
74
|
+
for j in 0..n_features {
|
|
75
|
+
let val: f64 = row.entry(j as isize)?;
|
|
76
|
+
data_array[[i, j]] = val;
|
|
77
|
+
}
|
|
78
|
+
}
|
|
79
|
+
|
|
80
|
+
Ok(data_array)
|
|
81
|
+
}
|
|
82
|
+
|
|
83
|
+
/// Convert Ruby 2D array to Vec<Vec<f64>>
|
|
84
|
+
/// Handles validation and provides consistent error messages
|
|
85
|
+
pub fn ruby_array_to_vec_vec_f64(data: Value) -> Result<Vec<Vec<f64>>, Error> {
|
|
86
|
+
let ruby = Ruby::get().unwrap();
|
|
87
|
+
let rarray: RArray = TryConvert::try_convert(data)?;
|
|
88
|
+
let n_samples = rarray.len();
|
|
89
|
+
|
|
90
|
+
if n_samples == 0 {
|
|
91
|
+
return Err(Error::new(
|
|
92
|
+
ruby.exception_arg_error(),
|
|
93
|
+
"Data cannot be empty",
|
|
94
|
+
));
|
|
95
|
+
}
|
|
96
|
+
|
|
97
|
+
let mut data_vec: Vec<Vec<f64>> = Vec::with_capacity(n_samples);
|
|
98
|
+
let mut expected_features: Option<usize> = None;
|
|
99
|
+
|
|
100
|
+
for i in 0..n_samples {
|
|
101
|
+
let row: RArray = rarray.entry(i as isize)?;
|
|
102
|
+
let n_features = row.len();
|
|
103
|
+
|
|
104
|
+
// Check row length consistency
|
|
105
|
+
match expected_features {
|
|
106
|
+
Some(expected) => {
|
|
107
|
+
if n_features != expected {
|
|
108
|
+
return Err(Error::new(
|
|
109
|
+
ruby.exception_arg_error(),
|
|
110
|
+
format!("Row {} has {} elements, expected {}", i, n_features, expected),
|
|
111
|
+
));
|
|
112
|
+
}
|
|
113
|
+
}
|
|
114
|
+
None => expected_features = Some(n_features),
|
|
115
|
+
}
|
|
116
|
+
|
|
117
|
+
let mut row_vec: Vec<f64> = Vec::with_capacity(n_features);
|
|
118
|
+
for j in 0..n_features {
|
|
119
|
+
let val: f64 = row.entry(j as isize)?;
|
|
120
|
+
row_vec.push(val);
|
|
121
|
+
}
|
|
122
|
+
data_vec.push(row_vec);
|
|
123
|
+
}
|
|
124
|
+
|
|
125
|
+
Ok(data_vec)
|
|
126
|
+
}
|
|
127
|
+
|
|
128
|
+
/// Convert Ruby 2D array to Vec<Vec<f32>>
|
|
129
|
+
/// For algorithms that require f32 precision (like UMAP)
|
|
130
|
+
pub fn ruby_array_to_vec_vec_f32(data: Value) -> Result<Vec<Vec<f32>>, Error> {
|
|
131
|
+
let ruby = Ruby::get().unwrap();
|
|
132
|
+
let rarray: RArray = TryConvert::try_convert(data)?;
|
|
133
|
+
let array_len = rarray.len();
|
|
134
|
+
|
|
135
|
+
if array_len == 0 {
|
|
136
|
+
return Err(Error::new(
|
|
137
|
+
ruby.exception_arg_error(),
|
|
138
|
+
"Input data cannot be empty",
|
|
139
|
+
));
|
|
140
|
+
}
|
|
141
|
+
|
|
142
|
+
let mut rust_data: Vec<Vec<f32>> = Vec::with_capacity(array_len);
|
|
143
|
+
|
|
144
|
+
for i in 0..array_len {
|
|
145
|
+
let row = rarray.entry::<Value>(i as isize)?;
|
|
146
|
+
let row_array = RArray::try_convert(row).map_err(|_| {
|
|
147
|
+
Error::new(
|
|
148
|
+
ruby.exception_type_error(),
|
|
149
|
+
"Expected array of arrays (2D array)",
|
|
150
|
+
)
|
|
151
|
+
})?;
|
|
152
|
+
|
|
153
|
+
let mut rust_row: Vec<f32> = Vec::new();
|
|
154
|
+
let row_len = row_array.len();
|
|
155
|
+
|
|
156
|
+
for j in 0..row_len {
|
|
157
|
+
let val = row_array.entry::<Value>(j as isize)?;
|
|
158
|
+
let float_val = if let Ok(f) = Float::try_convert(val) {
|
|
159
|
+
f.to_f64() as f32
|
|
160
|
+
} else if let Ok(i) = Integer::try_convert(val) {
|
|
161
|
+
i.to_i64()? as f32
|
|
162
|
+
} else {
|
|
163
|
+
return Err(Error::new(
|
|
164
|
+
ruby.exception_type_error(),
|
|
165
|
+
"All values must be numeric",
|
|
166
|
+
));
|
|
167
|
+
};
|
|
168
|
+
rust_row.push(float_val);
|
|
169
|
+
}
|
|
170
|
+
|
|
171
|
+
// Validate row length consistency
|
|
172
|
+
if !rust_data.is_empty() && rust_row.len() != rust_data[0].len() {
|
|
173
|
+
return Err(Error::new(
|
|
174
|
+
ruby.exception_arg_error(),
|
|
175
|
+
"All rows must have the same length",
|
|
176
|
+
));
|
|
177
|
+
}
|
|
178
|
+
|
|
179
|
+
rust_data.push(rust_row);
|
|
180
|
+
}
|
|
181
|
+
|
|
182
|
+
Ok(rust_data)
|
|
183
|
+
}
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
@@ -0,0 +1,164 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require_relative '../data_validator'
|
|
4
|
+
|
|
5
|
+
module ClusterKit
|
|
6
|
+
module Clustering
|
|
7
|
+
# HDBSCAN clusterer whose public surface mirrors KMeans (fit / fit_predict /
# fitted?), plus helpers for inspecting noise points and cluster membership.
# Points labelled -1 are treated as noise throughout.
class HDBSCAN
  attr_reader :min_samples, :min_cluster_size, :metric, :labels, :probabilities,
              :outlier_scores, :cluster_persistence

  # Build an unfitted clusterer.
  # @param min_samples [Integer] Min neighborhood size for core points (default: 5)
  # @param min_cluster_size [Integer] Minimum size of clusters (default: 5)
  # @param metric [String] Distance metric (default: 'euclidean')
  # @raise [ArgumentError] if either size is not greater than zero, or the
  #   metric is not one of the accepted names
  def initialize(min_samples: 5, min_cluster_size: 5, metric: 'euclidean')
    raise ArgumentError, "min_samples must be positive" unless min_samples > 0
    raise ArgumentError, "min_cluster_size must be positive" unless min_cluster_size > 0

    accepted = %w[euclidean l2 manhattan l1 cosine]
    raise ArgumentError, "metric must be one of: #{accepted.join(', ')}" unless accepted.include?(metric)

    @min_samples = min_samples
    @min_cluster_size = min_cluster_size
    @metric = metric
    @fitted = false
  end

  # Run the native HDBSCAN routine and store its outputs on this instance.
  # @param data [Array] 2D array of data points
  # @return [self] so calls can be chained
  def fit(data)
    validate_data(data)

    native = Clustering.hdbscan_rust(data, @min_samples, @min_cluster_size, @metric)

    @labels = native["labels"]
    @probabilities = native["probabilities"]
    @outlier_scores = native["outlier_scores"]
    @cluster_persistence = native["cluster_persistence"]
    @fitted = true

    self
  end

  # Present only for API symmetry with KMeans; it never returns.
  # @param data [Array] 2D array of data points (unused)
  # @raise [NotImplementedError] always
  def predict(data)
    message = "HDBSCAN does not support prediction on new data. " \
              "Use approximate_predict for approximate membership"
    raise NotImplementedError, message
  end

  # Fit on +data+ and hand back the resulting labels.
  # @param data [Array] 2D array of data points
  # @return [Array] Cluster labels (-1 for noise)
  def fit_predict(data)
    fit(data)
    @labels
  end

  # @return [Boolean] whether #fit has completed on this instance
  def fitted?
    @fitted
  end

  # Number of distinct non-noise labels.
  # @return [Integer] 0 before fitting
  def n_clusters
    return 0 unless fitted?

    @labels.uniq.count { |label| label != -1 }
  end

  # Share of points labelled as noise.
  # @return [Float] 0.0 before fitting
  def noise_ratio
    return 0.0 unless fitted?

    @labels.count(-1).fdiv(@labels.length)
  end

  # How many points are labelled as noise.
  # @return [Integer] 0 before fitting
  def n_noise_points
    return 0 unless fitted?

    @labels.count(-1)
  end

  # Positions of the noise points within the fitted data.
  # @return [Array<Integer>] empty before fitting
  def noise_indices
    return [] unless fitted?

    @labels.each_index.select { |idx| @labels[idx] == -1 }
  end

  # Point positions grouped by cluster label; noise points are left out.
  # @return [Hash<Integer, Array<Integer>>] empty before fitting
  def cluster_indices
    return {} unless fitted?

    @labels.each_with_index.with_object({}) do |(label, idx), groups|
      next if label == -1

      (groups[label] ||= []) << idx
    end
  end

  # Headline numbers for the fitted model.
  # @return [Hash] empty before fitting; otherwise :n_clusters,
  #   :n_noise_points, :noise_ratio, :cluster_sizes and :cluster_persistence
  def summary
    return {} unless fitted?

    sizes = cluster_indices.transform_values(&:length)
    {
      n_clusters: n_clusters,
      n_noise_points: n_noise_points,
      noise_ratio: noise_ratio,
      cluster_sizes: sizes,
      cluster_persistence: @cluster_persistence
    }
  end

  private

  # Delegates to the shared validator, with the finite-value check disabled
  # (the same call KMeans makes).
  def validate_data(data)
    DataValidator.validate_clustering(data, check_finite: false)
  end
end
|
|
137
|
+
|
|
138
|
+
# Module-level convenience methods (matching KMeans pattern)
class << self
  # One-shot HDBSCAN: builds a clusterer with the given options, fits it on
  # +data+, and packages the outcome as a plain hash.
  # @param data [Array] 2D array of data points
  # @param min_samples [Integer] Min neighborhood size for core points
  # @param min_cluster_size [Integer] Minimum size of clusters
  # @param metric [String] Distance metric
  # @return [Hash] with keys :labels, :probabilities, :outlier_scores,
  #   :n_clusters, :noise_ratio and :cluster_persistence (an empty hash when
  #   the clusterer reports none)
  def hdbscan(data, min_samples: 5, min_cluster_size: 5, metric: 'euclidean')
    model = HDBSCAN.new(min_samples: min_samples, min_cluster_size: min_cluster_size, metric: metric)
    model.fit(data)

    persistence = model.cluster_persistence || {}
    {
      labels: model.labels,
      probabilities: model.probabilities,
      outlier_scores: model.outlier_scores,
      n_clusters: model.n_clusters,
      noise_ratio: model.noise_ratio,
      cluster_persistence: persistence
    }
  end
end
|
|
163
|
+
end
|
|
164
|
+
end
|
|
@@ -0,0 +1,194 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require_relative 'clusterkit'
|
|
4
|
+
require_relative 'clustering/hdbscan'
|
|
5
|
+
require_relative 'data_validator'
|
|
6
|
+
|
|
7
|
+
module ClusterKit
|
|
8
|
+
# Module for clustering algorithms
|
|
9
|
+
module Clustering
|
|
10
|
+
# K-means clustering backed by the native `Clustering.kmeans_rust` routine.
class KMeans
  # `inertia` is the sum of squared distances of samples to their closest
  # cluster center. It is exposed through attr_reader only; a second explicit
  # `def inertia` would merely redefine the same reader.
  attr_reader :k, :max_iter, :centroids, :labels, :inertia

  # Build an unfitted clusterer.
  # @param k [Integer] Number of clusters
  # @param max_iter [Integer] Maximum iterations (default: 300)
  # @param random_seed [Integer] Random seed for reproducibility (optional)
  # @raise [ArgumentError] if k is not greater than zero
  def initialize(k:, max_iter: 300, random_seed: nil)
    raise ArgumentError, "k must be positive" unless k > 0

    @k = k
    @max_iter = max_iter
    @random_seed = random_seed
    @fitted = false
  end

  # Fit the model, storing labels, centroids and inertia from the native call.
  # @param data [Array] 2D array of data points
  # @return [self] Returns self for method chaining
  def fit(data)
    validate_data(data)

    @labels, @centroids, @inertia = Clustering.kmeans_rust(data, @k, @max_iter, @random_seed)
    @fitted = true

    self
  end

  # Assign new points to the centroids found by #fit.
  # @param data [Array] 2D array of data points
  # @return [Array] Cluster labels
  # @raise [RuntimeError] if the model has not been fitted
  def predict(data)
    raise RuntimeError, "Model must be fitted before predict" unless fitted?

    validate_data(data)
    Clustering.kmeans_predict_rust(data, @centroids)
  end

  # Fit the model and return the labels of the training points.
  # @param data [Array] 2D array of data points
  # @return [Array] Cluster labels
  def fit_predict(data)
    fit(data)
    @labels
  end

  # @return [Boolean] True once #fit has completed
  def fitted?
    @fitted
  end

  # Alias-style accessor for the fitted centroids.
  # @return [Array] 2D array of cluster centers (nil before fitting)
  def cluster_centers
    @centroids
  end

  # Class methods for K-means specific utilities
  class << self
    # Fit one model per k and record its inertia (elbow method).
    # @param data [Array] 2D array of data points
    # @param k_range [Range] Range of k values to try
    # @param max_iter [Integer] Maximum iterations per k
    # @param random_seed [Integer, nil] Seed handed to every model so the
    #   sweep can be reproduced; nil (the default) keeps the previous
    #   unseeded behavior
    # @return [Hash] Mapping of k to inertia values
    def elbow_method(data, k_range: 2..10, max_iter: 300, random_seed: nil)
      k_range.each_with_object({}) do |k, inertia_by_k|
        model = new(k: k, max_iter: max_iter, random_seed: random_seed)
        inertia_by_k[k] = model.fit(data).inertia
      end
    end

    # Pick the k that follows the largest single drop in inertia.
    # @param elbow_results [Hash] Mapping of k to inertia values (from elbow_method)
    # @param fallback_k [Integer] Returned when elbow_results is nil or empty (default: 3)
    # @return [Integer] Chosen number of clusters; the smallest k when no
    #   consecutive pair shows a positive drop
    def detect_optimal_k(elbow_results, fallback_k: 3)
      return fallback_k if elbow_results.nil? || elbow_results.empty?

      k_values = elbow_results.keys.sort
      return k_values.first if k_values.size == 1

      max_drop = 0
      optimal_k = k_values.first

      k_values.each_cons(2) do |k1, k2|
        drop = elbow_results[k1] - elbow_results[k2]
        next unless drop > max_drop

        max_drop = drop
        optimal_k = k2 # the k reached after the drop
      end

      optimal_k
    end

    # Run the elbow sweep and return the detected k.
    # @param data [Array] 2D array of data points
    # @param k_range [Range] Range of k values to try (default: 2..10)
    # @param max_iter [Integer] Maximum iterations (default: 300)
    # @param random_seed [Integer, nil] Seed forwarded to elbow_method (default: nil)
    # @return [Integer] Optimal number of clusters
    def optimal_k(data, k_range: 2..10, max_iter: 300, random_seed: nil)
      elbow_results = elbow_method(data, k_range: k_range, max_iter: max_iter, random_seed: random_seed)
      detect_optimal_k(elbow_results)
    end
  end

  private

  # Delegates to the shared validator with the finite-value check disabled.
  def validate_data(data)
    DataValidator.validate_clustering(data, check_finite: false)
  end
end
|
|
136
|
+
|
|
137
|
+
# Module-level methods for cross-algorithm functionality
class << self
  # Mean silhouette coefficient of a labelling, using Euclidean distance.
  #
  # For each point, `a` is the mean distance to the other members of its own
  # cluster and `b` is the smallest mean distance to the members of any other
  # cluster; the point's value is (b - a) / max(a, b). A point that is alone
  # in its cluster contributes 0.0, as does a point for which a and b are
  # both zero.
  #
  # @param data [Array] 2D array of data points
  # @param labels [Array] Cluster labels, one per data point
  # @return [Float] Mean silhouette coefficient; 0.0 when there are fewer
  #   than two distinct labels (including empty input)
  # @raise [ArgumentError] if data and labels differ in length
  def silhouette_score(data, labels)
    unless data.size == labels.size
      raise ArgumentError,
            "data and labels must have the same length (got #{data.size} and #{labels.size})"
    end

    # Group point indices by label once instead of rescanning the label
    # array for every point and every candidate cluster.
    members_by_label = labels.each_index.group_by { |idx| labels[idx] }

    # With zero or one cluster the coefficient is undefined; report 0.0
    # rather than dividing by an empty count.
    return 0.0 if members_by_label.size < 2

    silhouette_values = data.each_with_index.map do |point, i|
      own_label = labels[i]
      peers = members_by_label[own_label].reject { |j| j == i }
      next 0.0 if peers.empty?

      # Mean intra-cluster distance.
      a = peers.sum { |j| euclidean_distance(point, data[j]) } / peers.size.to_f

      # Mean distance to the nearest other cluster.
      b = Float::INFINITY
      members_by_label.each do |other_label, other_members|
        next if other_label == own_label

        mean_dist = other_members.sum { |j| euclidean_distance(point, data[j]) } / other_members.size.to_f
        b = mean_dist if mean_dist < b
      end

      if a == 0.0 && b == 0.0
        0.0 # all compared points coincide
      else
        (b - a) / [a, b].max
      end
    end

    silhouette_values.sum / silhouette_values.size.to_f
  end

  private

  # Straight-line distance between two equal-length numeric vectors.
  def euclidean_distance(a, b)
    Math.sqrt(a.zip(b).sum { |x, y| (x - y) ** 2 })
  end
end
|
|
193
|
+
end
|
|
194
|
+
end
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
# Load the compiled Rust extension. Precompiled (platform) gems install it into a
|
|
4
|
+
# Ruby-ABI-versioned subdir (lib/clusterkit/<major.minor>/clusterkit.{so,bundle}) so a
|
|
5
|
+
# single fat gem can carry a binary per Ruby version; source/dev builds place it flat at
|
|
6
|
+
# lib/clusterkit/clusterkit.{so,bundle}. Try the versioned path first, fall back to the
|
|
7
|
+
# flat one. Resolution goes through $LOAD_PATH (`require`, never `require_relative`)
|
|
8
|
+
# because RubyGems installs native extensions outside the gem's lib/ dir.
|
|
9
|
+
# Prefer the binary built for the running Ruby's major.minor version; if that
# feature cannot be loaded, fall back to the flat (source/dev build) location.
begin
  abi_version = RUBY_VERSION[/\d+\.\d+/]
  require "clusterkit/#{abi_version}/clusterkit"
rescue LoadError
  require "clusterkit/clusterkit"
end
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module ClusterKit
  # Holds process-wide settings for ClusterKit.
  class Configuration
    attr_accessor :verbose

    # Verbose output is off unless CLUSTERKIT_VERBOSE or DEBUG is set to the
    # exact string 'true' in the environment.
    def initialize
      @verbose = %w[CLUSTERKIT_VERBOSE DEBUG].any? { |name| ENV[name] == 'true' }
    end
  end

  class << self
    attr_accessor :configuration

    # Creates the shared Configuration on first use, then yields it to the
    # block when one is given.
    # @return [Object, nil] the block's result, or nil without a block
    def configure
      self.configuration ||= Configuration.new
      yield(configuration) if block_given?
    end
  end
end

# Make sure a default configuration exists as soon as this file is loaded.
ClusterKit.configure
|