clusterkit 0.1.0.pre.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.rspec +3 -0
- data/.simplecov +47 -0
- data/CHANGELOG.md +35 -0
- data/CLAUDE.md +226 -0
- data/Cargo.toml +8 -0
- data/Gemfile +17 -0
- data/IMPLEMENTATION_NOTES.md +143 -0
- data/LICENSE.txt +21 -0
- data/PYTHON_COMPARISON.md +183 -0
- data/README.md +499 -0
- data/Rakefile +245 -0
- data/clusterkit.gemspec +45 -0
- data/docs/KNOWN_ISSUES.md +130 -0
- data/docs/RUST_ERROR_HANDLING.md +164 -0
- data/docs/TEST_FIXTURES.md +170 -0
- data/docs/UMAP_EXPLAINED.md +362 -0
- data/docs/UMAP_TROUBLESHOOTING.md +284 -0
- data/docs/VERBOSE_OUTPUT.md +84 -0
- data/examples/hdbscan_example.rb +147 -0
- data/examples/optimal_kmeans_example.rb +96 -0
- data/examples/pca_example.rb +114 -0
- data/examples/reproducible_umap.rb +99 -0
- data/examples/verbose_control.rb +43 -0
- data/ext/clusterkit/Cargo.toml +25 -0
- data/ext/clusterkit/extconf.rb +4 -0
- data/ext/clusterkit/src/clustering/hdbscan_wrapper.rs +115 -0
- data/ext/clusterkit/src/clustering.rs +267 -0
- data/ext/clusterkit/src/embedder.rs +413 -0
- data/ext/clusterkit/src/lib.rs +22 -0
- data/ext/clusterkit/src/svd.rs +112 -0
- data/ext/clusterkit/src/tests.rs +16 -0
- data/ext/clusterkit/src/utils.rs +33 -0
- data/lib/clusterkit/clustering/hdbscan.rb +177 -0
- data/lib/clusterkit/clustering.rb +213 -0
- data/lib/clusterkit/clusterkit.rb +9 -0
- data/lib/clusterkit/configuration.rb +24 -0
- data/lib/clusterkit/dimensionality/pca.rb +251 -0
- data/lib/clusterkit/dimensionality/svd.rb +144 -0
- data/lib/clusterkit/dimensionality/umap.rb +311 -0
- data/lib/clusterkit/dimensionality.rb +29 -0
- data/lib/clusterkit/hdbscan_api_design.rb +142 -0
- data/lib/clusterkit/preprocessing.rb +106 -0
- data/lib/clusterkit/silence.rb +42 -0
- data/lib/clusterkit/utils.rb +51 -0
- data/lib/clusterkit/version.rb +5 -0
- data/lib/clusterkit.rb +93 -0
- data/lib/tasks/visualize.rake +641 -0
- metadata +194 -0
@@ -0,0 +1,641 @@
|
|
1
|
+
namespace :clusterkit do
|
2
|
+
# Rake task clusterkit:visualize[output_file,dataset,clustering]
#
# Builds a synthetic dataset, projects it with UMAP (2D and 20D), PCA and
# SVD, clusters the projections with K-means and/or HDBSCAN, computes
# silhouette scores, and writes a single HTML report.
#
# Task arguments (all optional):
#   output_file - path of the HTML file to write
#                 (default 'clusterkit_visualization.html')
#   dataset     - 'swiss' or 'iris'; any other value selects the clustered
#                 dataset from generate_clustered_data
#   clustering  - 'kmeans', 'hdbscan' or 'both' (default 'both')
#
# Raises ArgumentError when clustering is not one of the three values above;
# previously such a value produced a report without any plots.
desc "Generate interactive visualization comparing dimensionality reduction and clustering methods"
task :visualize, [:output_file, :dataset, :clustering] do |_t, args|
  require 'bundler/setup'
  require 'clusterkit'
  require 'json'

  output_file = args[:output_file] || 'clusterkit_visualization.html'
  dataset_type = args[:dataset] || 'clusters'
  clustering_method = args[:clustering] || 'both' # 'kmeans', 'hdbscan', or 'both'

  unless %w[kmeans hdbscan both].include?(clustering_method)
    raise ArgumentError,
          "Unknown clustering method #{clustering_method.inspect} (expected 'kmeans', 'hdbscan' or 'both')"
  end

  run_kmeans = clustering_method == 'kmeans' || clustering_method == 'both'
  run_hdbscan = clustering_method == 'hdbscan' || clustering_method == 'both'

  puts "Generating visualization with dataset: #{dataset_type}, clustering: #{clustering_method}"

  # Generate dataset based on type
  data, true_labels, dataset_name = case dataset_type
                                    when 'swiss'
                                      generate_swiss_roll
                                    when 'iris'
                                      generate_iris_like_data
                                    else
                                      generate_clustered_data
                                    end

  puts "Generated #{data.size} points in #{data.first.size} dimensions"

  # Reduce dimensions
  print "Running UMAP..."
  umap = ClusterKit::Dimensionality::UMAP.new(n_components: 2, n_neighbors: 15, random_seed: 42)
  umap_data = umap.fit_transform(data)
  puts " done"

  # Create 20D UMAP for HDBSCAN (better for density-based clustering)
  print "Running UMAP to 20D for HDBSCAN..."
  umap_20d = ClusterKit::Dimensionality::UMAP.new(n_components: 20, n_neighbors: 15, random_seed: 42)
  umap_data_20d = umap_20d.fit_transform(data)
  puts " done"

  print "Running PCA..."
  pca = ClusterKit::Dimensionality::PCA.new(n_components: 2)
  pca_data = pca.fit_transform(data)
  variance_explained = pca.cumulative_explained_variance_ratio[-1]
  puts " done (explained variance: #{(variance_explained * 100).round(1)}%)"

  print "Running SVD..."
  # Only the first element of the returned triple is plotted.
  svd_data, _singular_values, _vt = ClusterKit.svd(data, 2, n_iter: 5)
  puts " done"

  # Initialize clustering results
  clustering_results = {}
  metrics = {
    pca_variance_explained: variance_explained
  }

  # Perform K-means clustering if requested
  if run_kmeans
    print "Clustering with K-means..."

    # Find optimal k using elbow method
    elbow_results = ClusterKit::Clustering::KMeans.elbow_method(umap_data, k_range: 2..6)

    # Use library method to detect optimal k
    optimal_k = ClusterKit::Clustering::KMeans.detect_optimal_k(elbow_results)

    puts "\n  Elbow method results:"
    elbow_results.sort.each do |k, inertia|
      puts "    k=#{k}: #{inertia.round(2)}"
    end
    puts "  Detected optimal k: #{optimal_k}"

    kmeans_umap = ClusterKit::Clustering::KMeans.new(k: optimal_k, random_seed: 42)
    kmeans_labels_umap = kmeans_umap.fit_predict(umap_data)

    kmeans_pca = ClusterKit::Clustering::KMeans.new(k: optimal_k, random_seed: 42)
    kmeans_labels_pca = kmeans_pca.fit_predict(pca_data)

    kmeans_svd = ClusterKit::Clustering::KMeans.new(k: optimal_k, random_seed: 42)
    kmeans_labels_svd = kmeans_svd.fit_predict(svd_data)

    clustering_results[:kmeans] = {
      labels_umap: kmeans_labels_umap,
      labels_pca: kmeans_labels_pca,
      labels_svd: kmeans_labels_svd,
      optimal_k: optimal_k,
      elbow_results: elbow_results
    }

    metrics[:kmeans] = {
      silhouette_umap: ClusterKit::Clustering.silhouette_score(umap_data, kmeans_labels_umap),
      silhouette_pca: ClusterKit::Clustering.silhouette_score(pca_data, kmeans_labels_pca),
      silhouette_svd: ClusterKit::Clustering.silhouette_score(svd_data, kmeans_labels_svd),
      optimal_k: optimal_k,
      # generate_additional_plots looks the elbow data up under metrics[:kmeans];
      # without this entry the elbow chart is never drawn.
      elbow_results: elbow_results
    }

    puts " done"
  end

  # Perform HDBSCAN clustering if requested
  if run_hdbscan
    print "Clustering with HDBSCAN..."

    # Every HDBSCAN run in this task uses the same parameters.
    new_hdbscan = lambda do
      ClusterKit::Clustering::HDBSCAN.new(
        min_samples: 5,
        min_cluster_size: 10
      )
    end

    # Silhouette over the non-noise points only (label -1 marks noise).
    # Falls back to 0.0 when fewer than two distinct clusters remain.
    noise_free_silhouette = lambda do |points, labels|
      kept = points.zip(labels).reject { |_point, label| label == -1 }
      kept_labels = kept.map(&:last)
      if kept_labels.uniq.size > 1
        ClusterKit::Clustering.silhouette_score(kept.map(&:first), kept_labels)
      else
        0.0
      end
    end

    # HDBSCAN on 20D UMAP (better for density-based clustering)
    hdbscan = new_hdbscan.call
    hdbscan_labels_20d = hdbscan.fit_predict(umap_data_20d)

    # For visualization consistency, also cluster the 2D projections
    hdbscan_labels_umap = new_hdbscan.call.fit_predict(umap_data)
    hdbscan_labels_pca = new_hdbscan.call.fit_predict(pca_data)
    hdbscan_labels_svd = new_hdbscan.call.fit_predict(svd_data)

    puts "\n  HDBSCAN results (20D):"
    puts "    Clusters found: #{hdbscan.n_clusters}"
    puts "    Noise points: #{hdbscan.n_noise_points} (#{(hdbscan.noise_ratio * 100).round(1)}%)"

    clustering_results[:hdbscan] = {
      labels_umap: hdbscan_labels_umap,
      labels_pca: hdbscan_labels_pca,
      labels_svd: hdbscan_labels_svd,
      labels_20d: hdbscan_labels_20d, # The main HDBSCAN result
      n_clusters: hdbscan.n_clusters,
      n_noise: hdbscan.n_noise_points,
      noise_ratio: hdbscan.noise_ratio
    }

    metrics[:hdbscan] = {
      silhouette_umap: noise_free_silhouette.call(umap_data, hdbscan_labels_umap),
      silhouette_pca: noise_free_silhouette.call(pca_data, hdbscan_labels_pca),
      silhouette_svd: noise_free_silhouette.call(svd_data, hdbscan_labels_svd),
      n_clusters: hdbscan.n_clusters,
      noise_ratio: hdbscan.noise_ratio
    }

    puts " done"
  end

  # Generate HTML
  html = generate_visualization_html(
    data: data,
    umap_data: umap_data,
    pca_data: pca_data,
    svd_data: svd_data,
    true_labels: true_labels,
    clustering_results: clustering_results,
    dataset_name: dataset_name,
    metrics: metrics,
    clustering_method: clustering_method
  )

  File.write(output_file, html)
  puts "\nVisualization saved to: #{output_file}"
  puts "Open in browser: open #{output_file}"
end
|
208
|
+
|
209
|
+
# Builds a synthetic dataset of tight clusters plus scattered noise points.
#
# Each cluster gets a random centre whose coordinates are shifted by
# 0.3 * cluster index; its points are the centre plus uniform jitter of at
# most 0.05 per feature. Afterwards (n_points_per_cluster * 0.2).to_i extra
# points are drawn uniformly from [-1, 1) per feature and labelled -1.
#
# @param n_points_per_cluster [Integer] points generated for every cluster
# @param n_features [Integer] dimensionality of each point
# @param n_clusters [Integer] number of clusters
# @param seed [Integer, nil] when given, all random draws come from a private
#   Random seeded with it, so the dataset is reproducible; when nil the
#   process-wide generator is used, exactly as before this option existed
# @return [Array] three elements: the points (Array of Float arrays), the
#   labels (Array of Integer, -1 for noise) and the dataset display name
def generate_clustered_data(n_points_per_cluster: 50, n_features: 50, n_clusters: 3, seed: nil)
  rng = seed.nil? ? Random : Random.new(seed)
  data = []
  labels = []

  n_clusters.times do |cluster_id|
    # Keep values smaller and normalized to avoid UMAP issues
    center = Array.new(n_features) { (rng.rand - 0.5) * 0.3 + cluster_id * 0.3 }

    n_points_per_cluster.times do
      data << center.map { |c| c + (rng.rand - 0.5) * 0.1 }
      labels << cluster_id
    end
  end

  # Add some noise points for HDBSCAN testing
  (n_points_per_cluster * 0.2).to_i.times do
    data << Array.new(n_features) { rng.rand * 2 - 1 } # Random noise
    labels << -1 # Mark as noise
  end

  [data, labels, "Gaussian Clusters with Noise"]
end
|
233
|
+
|
234
|
+
# Builds a 50-dimensional "Swiss roll" style dataset.
#
# For point i the angle is t = 0.5 * PI * (1 + 2 * i / n_points). The first
# three features are the scaled roll coordinates (x, y, z); the next 10 are
# noisy linear mixes of x and y; the last 37 are small uniform random values.
# The label is the integer part of t / PI, which is 0 or 1 for the t range
# used here.
#
# @param n_points [Integer] number of points to generate
# @param seed [Integer, nil] when given, all random draws come from a private
#   Random seeded with it, so the dataset is reproducible; when nil the
#   process-wide generator is used, exactly as before this option existed
# @return [Array] three elements: the points (Array of Float arrays), the
#   labels (Array of Integer) and the dataset display name
def generate_swiss_roll(n_points: 150, seed: nil)
  rng = seed.nil? ? Random : Random.new(seed)
  data = []
  labels = []

  n_points.times do |i|
    t = 0.5 * Math::PI * (1 + 2 * i.to_f / n_points)
    height = rng.rand

    x = t * Math.cos(t) * 0.1
    y = height * 0.1
    z = t * Math.sin(t) * 0.1

    point = [x, y, z]

    # Add correlated features
    10.times do |j|
      point << x * Math.sin(j) + y * Math.cos(j) + (rng.rand - 0.5) * 0.01
    end

    # Add random features
    37.times do
      point << rng.rand * 0.01
    end

    data << point
    labels << (t / (3 * Math::PI) * 3).to_i
  end

  [data, labels, "Swiss Roll"]
end
|
264
|
+
|
265
|
+
# Builds a 150-point, 50-dimensional dataset loosely modelled on Iris.
#
# Three "species" of 50 points each are drawn around fixed mean values for
# four base measurements. Every point is then widened to 50 features: the 4
# base values, their 16 scaled pairwise products, 8 scaled sin/cos terms, and
# uniform random padding for the remainder.
#
# @param seed [Integer, nil] when given, all random draws come from a private
#   Random seeded with it, so the dataset is reproducible; when nil the
#   process-wide generator is used, exactly as before this option existed
# @return [Array] three elements: the points (Array of Float arrays), the
#   labels (species index 0..2) and the dataset display name
def generate_iris_like_data(seed: nil)
  rng = seed.nil? ? Random : Random.new(seed)
  data = []
  labels = []

  species_params = [
    { sepal_length: 0.5, sepal_width: 0.34, petal_length: 0.15, petal_width: 0.02 },
    { sepal_length: 0.59, sepal_width: 0.28, petal_length: 0.43, petal_width: 0.13 },
    { sepal_length: 0.65, sepal_width: 0.30, petal_length: 0.55, petal_width: 0.20 }
  ]

  species_params.each_with_index do |params, species_id|
    50.times do
      features = [
        params[:sepal_length] + (rng.rand - 0.5) * 0.08,
        params[:sepal_width] + (rng.rand - 0.5) * 0.06,
        params[:petal_length] + (rng.rand - 0.5) * 0.08,
        params[:petal_width] + (rng.rand - 0.5) * 0.04
      ]

      # Expand to 50 dimensions
      expanded = features.dup

      # Pairwise products, including each feature with itself
      features.each do |f1|
        features.each do |f2|
          expanded << f1 * f2 * 0.01
        end
      end

      features.each_with_index do |f, i|
        expanded << Math.sin(f) * 0.01 * (i + 1)
        expanded << Math.cos(f) * 0.01 * (i + 1)
      end

      # Pad with low-amplitude random values up to 50 features
      expanded << rng.rand * 0.01 while expanded.length < 50

      data << expanded[0...50]
      labels << species_id
    end
  end

  [data, labels, "Iris-like Dataset"]
end
|
309
|
+
|
310
|
+
# Assembles the complete HTML report page as a single string.
#
# The plot section is delegated to generate_comparison_plots,
# generate_kmeans_plots or generate_hdbscan_plots depending on
# clustering_method ('both', 'kmeans' or 'hdbscan'); any other value leaves
# the plot section empty. The metric cards come from generate_metrics_html
# and the trailing section from generate_additional_plots.
#
# NOTE(review): the page loads Plotly through the "plotly-latest" CDN alias;
# Plotly's documentation reportedly no longer updates that alias - confirm
# and consider pinning an explicit version.
#
# @return [String] the HTML document
def generate_visualization_html(data:, umap_data:, pca_data:, svd_data:, true_labels:,
                                clustering_results:, dataset_name:, metrics:, clustering_method:)
  # Pick the plot layout matching the requested clustering method
  plots_html =
    case clustering_method
    when 'both'
      # K-means and HDBSCAN next to each other
      generate_comparison_plots(
        umap_data, pca_data, svd_data, true_labels,
        clustering_results[:kmeans], clustering_results[:hdbscan]
      )
    when 'kmeans'
      generate_kmeans_plots(umap_data, pca_data, svd_data, true_labels, clustering_results[:kmeans])
    when 'hdbscan'
      generate_hdbscan_plots(umap_data, pca_data, svd_data, true_labels, clustering_results[:hdbscan])
    else
      ""
    end

  metrics_html = generate_metrics_html(metrics, clustering_method)

  # Layout numbers depend only on whether both methods are shown
  side_by_side = clustering_method == 'both'
  plot_columns = side_by_side ? 3 : 2
  page_width = side_by_side ? 1800 : 1400
  metric_columns = side_by_side ? 6 : 4

  # Noise points carry the label -1 and are not counted as a cluster
  true_cluster_count = true_labels.reject { |l| l == -1 }.uniq.size

  <<~HTML
    <!DOCTYPE html>
    <html>
    <head>
    <title>ClusterKit Visualization - #{dataset_name}</title>
    <script src="https://cdn.plot.ly/plotly-latest.min.js"></script>
    <style>
    body {
    font-family: Arial, sans-serif;
    margin: 20px;
    background: #f5f5f5;
    }
    h1 {
    color: #333;
    text-align: center;
    }
    .container {
    display: grid;
    grid-template-columns: repeat(#{plot_columns}, 1fr);
    gap: 20px;
    max-width: #{page_width}px;
    margin: 0 auto;
    }
    .plot {
    background: white;
    border-radius: 8px;
    padding: 10px;
    box-shadow: 0 2px 4px rgba(0,0,0,0.1);
    }
    .stats {
    background: white;
    border-radius: 8px;
    padding: 20px;
    margin: 20px auto;
    max-width: #{page_width}px;
    box-shadow: 0 2px 4px rgba(0,0,0,0.1);
    }
    table {
    width: 100%;
    border-collapse: collapse;
    }
    th, td {
    padding: 10px;
    text-align: left;
    border-bottom: 1px solid #ddd;
    }
    th {
    background-color: #f8f8f8;
    font-weight: bold;
    }
    .metrics {
    display: grid;
    grid-template-columns: repeat(#{metric_columns}, 1fr);
    gap: 20px;
    margin-top: 20px;
    }
    .metric-card {
    background: #f8f8f8;
    padding: 15px;
    border-radius: 5px;
    text-align: center;
    }
    .metric-value {
    font-size: 24px;
    font-weight: bold;
    color: #333;
    }
    .metric-label {
    color: #666;
    margin-top: 5px;
    font-size: 12px;
    }
    .noise-point {
    opacity: 0.3;
    }
    </style>
    </head>
    <body>
    <h1>Dimensionality Reduction & Clustering Analysis</h1>
    <h2 style="text-align: center; color: #666;">Dataset: #{dataset_name} | Method: #{clustering_method.capitalize}</h2>

    <div class="stats">
    <h2>Dataset Information</h2>
    <table>
    <tr>
    <th>Property</th>
    <th>Value</th>
    </tr>
    <tr>
    <td>Original Dimensions</td>
    <td>#{data.first.size}</td>
    </tr>
    <tr>
    <td>Number of Points</td>
    <td>#{data.size}</td>
    </tr>
    <tr>
    <td>True Number of Clusters</td>
    <td>#{true_cluster_count}</td>
    </tr>
    </table>

    #{metrics_html}
    </div>

    <div class="container">
    #{plots_html}
    </div>

    #{generate_additional_plots(metrics, clustering_method)}

    </body>
    </html>
  HTML
end
|
451
|
+
|
452
|
+
# Renders the row of metric cards shown under the dataset table.
#
# K-means cards (optimal k, UMAP silhouette, PCA silhouette) are included for
# 'kmeans' and 'both'; HDBSCAN cards (clusters found, noise ratio, UMAP
# silhouette) for 'hdbscan' and 'both'. The PCA explained-variance card is
# always appended last.
#
# @param metrics [Hash] read at :kmeans, :hdbscan and :pca_variance_explained
# @param clustering_method [String] 'kmeans', 'hdbscan' or 'both'
# @return [String] a <div class="metrics"> element containing the cards
def generate_metrics_html(metrics, clustering_method)
  # Each entry is [displayed value, label markup]
  cards = []

  if %w[kmeans both].include?(clustering_method)
    km = metrics[:kmeans]
    cards << [km[:optimal_k], 'K-means<br>Optimal K']
    cards << [km[:silhouette_umap].round(3), 'K-means UMAP<br>Silhouette']
    cards << [km[:silhouette_pca].round(3), 'K-means PCA<br>Silhouette']
  end

  if %w[hdbscan both].include?(clustering_method)
    hd = metrics[:hdbscan]
    cards << [hd[:n_clusters], 'HDBSCAN<br>Clusters Found']
    cards << ["#{(hd[:noise_ratio] * 100).round(1)}%", 'HDBSCAN<br>Noise Ratio']
    cards << [hd[:silhouette_umap].round(3), 'HDBSCAN UMAP<br>Silhouette']
  end

  cards << ["#{(metrics[:pca_variance_explained] * 100).round(1)}%", 'PCA Variance<br>Explained']

  rendered = cards.map do |value, label|
    <<~HTML
      <div class="metric-card">
      <div class="metric-value">#{value}</div>
      <div class="metric-label">#{label}</div>
      </div>
    HTML
  end

  "<div class=\"metrics\">#{rendered.join}</div>"
end
|
501
|
+
|
502
|
+
# Produces the nine plots for the 'both' layout: true labels, K-means labels
# and HDBSCAN labels, each shown on the UMAP, PCA and SVD projections.
# HDBSCAN plots are created with the noise flag set.
#
# @return [String] the plot snippets from create_plot_div joined by newlines
def generate_comparison_plots(umap_data, pca_data, svd_data, true_labels, kmeans_results, hdbscan_results)
  plot_args = [
    # Row 1: True labels
    ['true-umap', umap_data, true_labels, 'UMAP - True Labels', 'UMAP'],
    ['true-pca', pca_data, true_labels, 'PCA - True Labels', 'PC'],
    ['true-svd', svd_data, true_labels, 'SVD - True Labels', 'Component'],
    # Row 2: K-means
    ['kmeans-umap', umap_data, kmeans_results[:labels_umap], 'UMAP - K-means', 'UMAP'],
    ['kmeans-pca', pca_data, kmeans_results[:labels_pca], 'PCA - K-means', 'PC'],
    ['kmeans-svd', svd_data, kmeans_results[:labels_svd], 'SVD - K-means', 'Component'],
    # Row 3: HDBSCAN
    ['hdbscan-umap', umap_data, hdbscan_results[:labels_umap], 'UMAP - HDBSCAN', 'UMAP', true],
    ['hdbscan-pca', pca_data, hdbscan_results[:labels_pca], 'PCA - HDBSCAN', 'PC', true],
    ['hdbscan-svd', svd_data, hdbscan_results[:labels_svd], 'SVD - HDBSCAN', 'Component', true]
  ]

  plot_args.map { |plot| create_plot_div(*plot) }.join("\n")
end
|
522
|
+
|
523
|
+
# Produces the six plots for the K-means-only layout: true labels next to
# K-means labels for UMAP and PCA, followed by the same pair for SVD.
#
# @return [String] the plot snippets from create_plot_div joined by newlines
def generate_kmeans_plots(umap_data, pca_data, svd_data, true_labels, kmeans_results)
  plot_args = [
    # Row 1: True labels
    ['true-umap', umap_data, true_labels, 'UMAP - True Labels', 'UMAP'],
    ['true-pca', pca_data, true_labels, 'PCA - True Labels', 'PC'],
    # Row 2: K-means
    ['kmeans-umap', umap_data, kmeans_results[:labels_umap], 'UMAP - K-means', 'UMAP'],
    ['kmeans-pca', pca_data, kmeans_results[:labels_pca], 'PCA - K-means', 'PC'],
    # Row 3: SVD
    ['true-svd', svd_data, true_labels, 'SVD - True Labels', 'Component'],
    ['kmeans-svd', svd_data, kmeans_results[:labels_svd], 'SVD - K-means', 'Component']
  ]

  plot_args.map { |plot| create_plot_div(*plot) }.join("\n")
end
|
540
|
+
|
541
|
+
# Produces the six plots for the HDBSCAN-only layout: true labels next to
# HDBSCAN labels for UMAP and PCA, followed by the same pair for SVD.
# HDBSCAN plots are created with the noise flag set.
#
# @return [String] the plot snippets from create_plot_div joined by newlines
def generate_hdbscan_plots(umap_data, pca_data, svd_data, true_labels, hdbscan_results)
  plot_args = [
    # Row 1: True labels
    ['true-umap', umap_data, true_labels, 'UMAP - True Labels', 'UMAP'],
    ['true-pca', pca_data, true_labels, 'PCA - True Labels', 'PC'],
    # Row 2: HDBSCAN
    ['hdbscan-umap', umap_data, hdbscan_results[:labels_umap], 'UMAP - HDBSCAN', 'UMAP', true],
    ['hdbscan-pca', pca_data, hdbscan_results[:labels_pca], 'PCA - HDBSCAN', 'PC', true],
    # Row 3: SVD
    ['true-svd', svd_data, true_labels, 'SVD - True Labels', 'Component'],
    ['hdbscan-svd', svd_data, hdbscan_results[:labels_svd], 'SVD - HDBSCAN', 'Component', true]
  ]

  plot_args.map { |plot| create_plot_div(*plot) }.join("\n")
end
|
558
|
+
|
559
|
+
# Builds the HTML for one Plotly scatter plot: a container <div> plus the
# <script> that draws into it.
#
# The element id, plot title and axis titles are written into the script as
# JSON string literals, so quotes or backslashes in them cannot break the
# generated JavaScript (they were previously spliced into single-quoted
# literals unescaped).
#
# @param id [String] DOM id of the plot container
# @param data [Array] points; elements 0 and 1 of each point become x and y
# @param labels [Array] one label per point, used as the marker colour value
# @param title [String] plot title
# @param axis_prefix [String] axis titles become "<prefix> 1" and "<prefix> 2"
# @param has_noise [Boolean] when true, points labelled -1 are drawn gray,
#   smaller and more transparent than the rest
# @return [String] HTML snippet
# Relies on the json library being loaded (the visualize task requires it).
def create_plot_div(id, data, labels, title, axis_prefix, has_noise = false)
  # JSON string literals are valid JavaScript string literals; "</" is split
  # so the text can never terminate the surrounding <script> element.
  js_string = ->(text) { text.to_s.to_json.gsub('</') { '<\/' } }

  # Handle noise points specially for HDBSCAN
  colors = has_noise ? labels.map { |l| l == -1 ? 'gray' : l } : labels

  marker_props =
    if has_noise
      # Make noise points smaller and semi-transparent
      sizes = labels.map { |l| l == -1 ? 5 : 8 }
      opacities = labels.map { |l| l == -1 ? 0.3 : 0.8 }
      "size: [#{sizes.join(',')}], opacity: [#{opacities.join(',')}],"
    else
      "size: 8,"
    end

  <<~HTML
    <div class="plot" id="#{id}"></div>
    <script>
    Plotly.newPlot(#{js_string.call(id)}, [{
    x: #{data.map { |p| p[0] }.to_json},
    y: #{data.map { |p| p[1] }.to_json},
    mode: 'markers',
    marker: {
    color: #{colors.to_json},
    #{marker_props}
    colorscale: 'Viridis',
    showscale: false
    },
    type: 'scatter'
    }], {
    title: #{js_string.call(title)},
    xaxis: { title: #{js_string.call("#{axis_prefix} 1")} },
    yaxis: { title: #{js_string.call("#{axis_prefix} 2")} },
    height: 400
    });
    </script>
  HTML
end
|
599
|
+
|
600
|
+
# Renders the optional K-means elbow chart section.
#
# The section is produced only when clustering_method is 'kmeans' or 'both'
# and metrics[:kmeans][:elbow_results] is present; otherwise an empty string
# is returned. The elbow data is emitted sorted by k, and the chart is
# annotated at metrics[:kmeans][:optimal_k].
#
# @param metrics [Hash] read at [:kmeans][:elbow_results] and [:kmeans][:optimal_k]
# @param clustering_method [String]
# @return [String] HTML snippet, or "" when there is nothing to show
# Relies on the json library being loaded (the visualize task requires it).
def generate_additional_plots(metrics, clustering_method)
  return "" unless %w[kmeans both].include?(clustering_method)

  kmeans_metrics = metrics[:kmeans]
  return "" unless kmeans_metrics && kmeans_metrics[:elbow_results]

  elbow_json = kmeans_metrics[:elbow_results].to_a.sort.to_h.to_json
  best_k = kmeans_metrics[:optimal_k]

  <<~HTML
    <div class="stats">
    <h2>K-means Elbow Method Results</h2>
    <div id="elbow-plot" style="height: 400px;"></div>
    </div>
    <script>
    const elbowData = #{elbow_json};
    Plotly.newPlot('elbow-plot', [{
    x: Object.keys(elbowData),
    y: Object.values(elbowData),
    mode: 'lines+markers',
    marker: { size: 10 },
    line: { width: 2 }
    }], {
    title: 'Elbow Method - Optimal K Selection',
    xaxis: { title: 'Number of Clusters (k)' },
    yaxis: { title: 'Inertia' },
    height: 400,
    annotations: [{
    x: #{best_k},
    y: elbowData[#{best_k}],
    text: 'Optimal K',
    showarrow: true,
    arrowhead: 7,
    ax: 0,
    ay: -40
    }]
    });
    </script>
  HTML
end
|
641
|
+
end
|