clusterkit 0.1.0.pre.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (49) hide show
  1. checksums.yaml +7 -0
  2. data/.rspec +3 -0
  3. data/.simplecov +47 -0
  4. data/CHANGELOG.md +35 -0
  5. data/CLAUDE.md +226 -0
  6. data/Cargo.toml +8 -0
  7. data/Gemfile +17 -0
  8. data/IMPLEMENTATION_NOTES.md +143 -0
  9. data/LICENSE.txt +21 -0
  10. data/PYTHON_COMPARISON.md +183 -0
  11. data/README.md +499 -0
  12. data/Rakefile +245 -0
  13. data/clusterkit.gemspec +45 -0
  14. data/docs/KNOWN_ISSUES.md +130 -0
  15. data/docs/RUST_ERROR_HANDLING.md +164 -0
  16. data/docs/TEST_FIXTURES.md +170 -0
  17. data/docs/UMAP_EXPLAINED.md +362 -0
  18. data/docs/UMAP_TROUBLESHOOTING.md +284 -0
  19. data/docs/VERBOSE_OUTPUT.md +84 -0
  20. data/examples/hdbscan_example.rb +147 -0
  21. data/examples/optimal_kmeans_example.rb +96 -0
  22. data/examples/pca_example.rb +114 -0
  23. data/examples/reproducible_umap.rb +99 -0
  24. data/examples/verbose_control.rb +43 -0
  25. data/ext/clusterkit/Cargo.toml +25 -0
  26. data/ext/clusterkit/extconf.rb +4 -0
  27. data/ext/clusterkit/src/clustering/hdbscan_wrapper.rs +115 -0
  28. data/ext/clusterkit/src/clustering.rs +267 -0
  29. data/ext/clusterkit/src/embedder.rs +413 -0
  30. data/ext/clusterkit/src/lib.rs +22 -0
  31. data/ext/clusterkit/src/svd.rs +112 -0
  32. data/ext/clusterkit/src/tests.rs +16 -0
  33. data/ext/clusterkit/src/utils.rs +33 -0
  34. data/lib/clusterkit/clustering/hdbscan.rb +177 -0
  35. data/lib/clusterkit/clustering.rb +213 -0
  36. data/lib/clusterkit/clusterkit.rb +9 -0
  37. data/lib/clusterkit/configuration.rb +24 -0
  38. data/lib/clusterkit/dimensionality/pca.rb +251 -0
  39. data/lib/clusterkit/dimensionality/svd.rb +144 -0
  40. data/lib/clusterkit/dimensionality/umap.rb +311 -0
  41. data/lib/clusterkit/dimensionality.rb +29 -0
  42. data/lib/clusterkit/hdbscan_api_design.rb +142 -0
  43. data/lib/clusterkit/preprocessing.rb +106 -0
  44. data/lib/clusterkit/silence.rb +42 -0
  45. data/lib/clusterkit/utils.rb +51 -0
  46. data/lib/clusterkit/version.rb +5 -0
  47. data/lib/clusterkit.rb +93 -0
  48. data/lib/tasks/visualize.rake +641 -0
  49. metadata +194 -0
@@ -0,0 +1,641 @@
1
+ namespace :clusterkit do
2
+ desc "Generate interactive visualization comparing dimensionality reduction and clustering methods"
3
# Builds a standalone HTML report that compares UMAP, PCA and SVD projections
# of a synthetic data set, coloured by the true labels and by the labels found
# with K-means and/or HDBSCAN.
#
# Rake arguments (all optional):
#   output_file - path of the HTML file to write
#                 (default 'clusterkit_visualization.html')
#   dataset     - 'swiss', 'iris', or anything else for the clustered data set
#                 (default 'clusters')
#   clustering  - 'kmeans', 'hdbscan', or 'both' (default 'both')
#
# Raises ArgumentError when `clustering` is not one of the three known values.
task :visualize, [:output_file, :dataset, :clustering] do |_t, args|
  require 'bundler/setup'
  require 'clusterkit'
  require 'json'

  output_file = args[:output_file] || 'clusterkit_visualization.html'
  dataset_type = args[:dataset] || 'clusters'
  clustering_method = args[:clustering] || 'both' # 'kmeans', 'hdbscan', or 'both'

  # Any other value would skip both clustering branches below and write a page
  # that contains no plots at all, so fail early instead.
  unless %w[kmeans hdbscan both].include?(clustering_method)
    raise ArgumentError,
          "Unknown clustering method '#{clustering_method}' (expected 'kmeans', 'hdbscan', or 'both')"
  end

  run_kmeans = %w[kmeans both].include?(clustering_method)
  run_hdbscan = %w[hdbscan both].include?(clustering_method)

  puts "Generating visualization with dataset: #{dataset_type}, clustering: #{clustering_method}"

  # Generate dataset based on type; unknown names fall back to the clustered set
  data, true_labels, dataset_name = case dataset_type
                                    when 'swiss' then generate_swiss_roll
                                    when 'iris' then generate_iris_like_data
                                    else generate_clustered_data
                                    end

  puts "Generated #{data.size} points in #{data.first.size} dimensions"

  # Reduce dimensions
  print "Running UMAP..."
  umap = ClusterKit::Dimensionality::UMAP.new(n_components: 2, n_neighbors: 15, random_seed: 42)
  umap_data = umap.fit_transform(data)
  puts " done"

  # Create 20D UMAP for HDBSCAN (better for density-based clustering)
  print "Running UMAP to 20D for HDBSCAN..."
  umap_20d = ClusterKit::Dimensionality::UMAP.new(n_components: 20, n_neighbors: 15, random_seed: 42)
  umap_data_20d = umap_20d.fit_transform(data)
  puts " done"

  print "Running PCA..."
  pca = ClusterKit::Dimensionality::PCA.new(n_components: 2)
  pca_data = pca.fit_transform(data)
  variance_explained = pca.cumulative_explained_variance_ratio[-1]
  puts " done (explained variance: #{(variance_explained * 100).round(1)}%)"

  print "Running SVD..."
  # Only the first returned matrix is plotted; the other two values are unused here.
  u, _s, _vt = ClusterKit.svd(data, 2, n_iter: 5)
  svd_data = u
  puts " done"

  # The three 2D projections every clustering method is run against.
  projections = { umap: umap_data, pca: pca_data, svd: svd_data }

  # Initialize clustering results
  clustering_results = {}
  metrics = {
    pca_variance_explained: variance_explained
  }

  # Perform K-means clustering if requested
  if run_kmeans
    print "Clustering with K-means..."

    # Find optimal k using elbow method
    elbow_results = ClusterKit::Clustering::KMeans.elbow_method(umap_data, k_range: 2..6)

    # Use library method to detect optimal k
    optimal_k = ClusterKit::Clustering::KMeans.detect_optimal_k(elbow_results)

    puts "\n Elbow method results:"
    elbow_results.sort.each do |k, inertia|
      puts " k=#{k}: #{inertia.round(2)}"
    end
    puts " Detected optimal k: #{optimal_k}"

    kmeans_labels = projections.transform_values do |points|
      ClusterKit::Clustering::KMeans.new(k: optimal_k, random_seed: 42).fit_predict(points)
    end

    # Calculate K-means metrics
    kmeans_silhouettes = projections.map do |name, points|
      [name, ClusterKit::Clustering.silhouette_score(points, kmeans_labels[name])]
    end.to_h

    clustering_results[:kmeans] = {
      labels_umap: kmeans_labels[:umap],
      labels_pca: kmeans_labels[:pca],
      labels_svd: kmeans_labels[:svd],
      optimal_k: optimal_k,
      elbow_results: elbow_results
    }

    metrics[:kmeans] = {
      silhouette_umap: kmeans_silhouettes[:umap],
      silhouette_pca: kmeans_silhouettes[:pca],
      silhouette_svd: kmeans_silhouettes[:svd],
      optimal_k: optimal_k,
      # generate_additional_plots reads the elbow curve from `metrics`, so it
      # has to be stored here as well or the elbow chart is never rendered.
      elbow_results: elbow_results
    }

    puts " done"
  end

  # Perform HDBSCAN clustering if requested
  if run_hdbscan
    print "Clustering with HDBSCAN..."

    new_hdbscan = lambda do
      ClusterKit::Clustering::HDBSCAN.new(min_samples: 5, min_cluster_size: 10)
    end

    # Silhouette over the non-noise points only; 0.0 when fewer than two
    # clusters remain after dropping the points labelled -1.
    noise_free_silhouette = lambda do |points, labels|
      keep = labels.map { |label| label != -1 }
      kept_points = points.select.with_index { |_, i| keep[i] }
      kept_labels = labels.select.with_index { |_, i| keep[i] }
      next 0.0 unless kept_labels.uniq.size > 1

      ClusterKit::Clustering.silhouette_score(kept_points, kept_labels)
    end

    # HDBSCAN on 20D UMAP (better for density-based clustering)
    hdbscan = new_hdbscan.call
    hdbscan_labels_20d = hdbscan.fit_predict(umap_data_20d)

    # For visualization consistency, also cluster the 2D projections
    hdbscan_labels = projections.transform_values do |points|
      new_hdbscan.call.fit_predict(points)
    end

    puts "\n HDBSCAN results (20D):"
    puts " Clusters found: #{hdbscan.n_clusters}"
    puts " Noise points: #{hdbscan.n_noise_points} (#{(hdbscan.noise_ratio * 100).round(1)}%)"

    # Calculate HDBSCAN metrics (excluding noise for silhouette)
    hdbscan_silhouettes = projections.map do |name, points|
      [name, noise_free_silhouette.call(points, hdbscan_labels[name])]
    end.to_h

    clustering_results[:hdbscan] = {
      labels_umap: hdbscan_labels[:umap],
      labels_pca: hdbscan_labels[:pca],
      labels_svd: hdbscan_labels[:svd],
      labels_20d: hdbscan_labels_20d, # The main HDBSCAN result
      n_clusters: hdbscan.n_clusters,
      n_noise: hdbscan.n_noise_points,
      noise_ratio: hdbscan.noise_ratio
    }

    metrics[:hdbscan] = {
      silhouette_umap: hdbscan_silhouettes[:umap],
      silhouette_pca: hdbscan_silhouettes[:pca],
      silhouette_svd: hdbscan_silhouettes[:svd],
      n_clusters: hdbscan.n_clusters,
      noise_ratio: hdbscan.noise_ratio
    }

    puts " done"
  end

  # Generate HTML
  html = generate_visualization_html(
    data: data,
    umap_data: umap_data,
    pca_data: pca_data,
    svd_data: svd_data,
    true_labels: true_labels,
    clustering_results: clustering_results,
    dataset_name: dataset_name,
    metrics: metrics,
    clustering_method: clustering_method
  )

  File.write(output_file, html)
  puts "\nVisualization saved to: #{output_file}"
  puts "Open in browser: open #{output_file}"
end
208
+
209
# Builds a synthetic data set of compact clusters plus uniformly scattered
# noise points.
#
# Each cluster centre is drawn per feature around `cluster_id * 0.3`, and every
# point is its centre plus a small per-feature jitter. Afterwards
# `(n_points_per_cluster * 0.2).to_i` noise points with features in [-1, 1)
# are appended and labelled -1.
#
# @param n_points_per_cluster [Integer] points generated for every cluster
# @param n_features [Integer] dimensionality of every point
# @param n_clusters [Integer] number of clusters
# @param seed [Integer, nil] when given, a dedicated generator seeded with this
#   value is used so the output is reproducible; when nil the process-wide
#   default generator is used (the same stream as Kernel#rand / Kernel#srand)
# @return [Array] `[data, labels, dataset_name]` where `data` is an array of
#   points, `labels` the matching cluster ids (-1 for noise)
def generate_clustered_data(n_points_per_cluster: 50, n_features: 50, n_clusters: 3, seed: nil)
  rng = seed.nil? ? Random : Random.new(seed)
  data = []
  labels = []

  n_clusters.times do |cluster_id|
    # Keep values smaller and normalized to avoid UMAP issues
    center = Array.new(n_features) { (rng.rand - 0.5) * 0.3 + cluster_id * 0.3 }

    n_points_per_cluster.times do
      data << center.map { |c| c + (rng.rand - 0.5) * 0.1 }
      labels << cluster_id
    end
  end

  # Add some noise points for HDBSCAN testing
  noise_count = (n_points_per_cluster * 0.2).to_i
  noise_count.times do
    data << Array.new(n_features) { rng.rand * 2 - 1 } # Random noise
    labels << -1 # Mark as noise
  end

  [data, labels, "Gaussian Clusters with Noise"]
end
233
+
234
# Builds a 50-dimensional data set whose first three coordinates trace a
# spiral-shaped sheet ("swiss roll").
#
# Every point consists of the 3 spiral coordinates, 10 features that are noisy
# linear combinations of x and y, and 37 small random filler features.
#
# The label is `(t / (3 * Math::PI) * 3).to_i`; because `t` only runs over
# [0.5 * PI, 1.5 * PI) here, the labels are always 0 or 1.
#
# @param n_points [Integer] number of points to generate
# @param seed [Integer, nil] when given, a dedicated generator seeded with this
#   value is used so the output is reproducible; when nil the process-wide
#   default generator is used (the same stream as Kernel#rand / Kernel#srand)
# @return [Array] `[data, labels, dataset_name]`
def generate_swiss_roll(n_points: 150, seed: nil)
  rng = seed.nil? ? Random : Random.new(seed)
  data = []
  labels = []

  n_points.times do |i|
    t = 0.5 * Math::PI * (1 + 2 * i.to_f / n_points)
    height = rng.rand

    x = t * Math.cos(t) * 0.1
    y = height * 0.1
    z = t * Math.sin(t) * 0.1

    # Add correlated features
    correlated = Array.new(10) do |j|
      x * Math.sin(j) + y * Math.cos(j) + (rng.rand - 0.5) * 0.01
    end

    # Add random features
    filler = Array.new(37) { rng.rand * 0.01 }

    data << [x, y, z, *correlated, *filler]
    labels << (t / (3 * Math::PI) * 3).to_i
  end

  [data, labels, "Swiss Roll"]
end
264
+
265
# Builds a 150-point, 50-dimensional data set with three classes of 50 points.
#
# Each class has four base measurements (sepal/petal length and width) that
# are jittered per sample. The four values are then expanded with their 16
# pairwise products, 8 scaled sine/cosine terms, and small random filler
# values until every point has 50 features.
#
# @param seed [Integer, nil] when given, a dedicated generator seeded with this
#   value is used so the output is reproducible; when nil the process-wide
#   default generator is used (the same stream as Kernel#rand / Kernel#srand)
# @return [Array] `[data, labels, dataset_name]` with labels 0, 1 and 2
def generate_iris_like_data(seed: nil)
  rng = seed.nil? ? Random : Random.new(seed)
  data = []
  labels = []

  species_params = [
    { sepal_length: 0.5, sepal_width: 0.34, petal_length: 0.15, petal_width: 0.02 },
    { sepal_length: 0.59, sepal_width: 0.28, petal_length: 0.43, petal_width: 0.13 },
    { sepal_length: 0.65, sepal_width: 0.30, petal_length: 0.55, petal_width: 0.20 }
  ]

  species_params.each_with_index do |params, species_id|
    50.times do
      features = [
        params[:sepal_length] + (rng.rand - 0.5) * 0.08,
        params[:sepal_width] + (rng.rand - 0.5) * 0.06,
        params[:petal_length] + (rng.rand - 0.5) * 0.08,
        params[:petal_width] + (rng.rand - 0.5) * 0.04
      ]

      # Expand to 50 dimensions
      expanded = features.dup

      features.each do |f1|
        features.each { |f2| expanded << f1 * f2 * 0.01 }
      end

      features.each_with_index do |f, i|
        expanded << Math.sin(f) * 0.01 * (i + 1)
        expanded << Math.cos(f) * 0.01 * (i + 1)
      end

      expanded << rng.rand * 0.01 while expanded.length < 50

      data << expanded[0...50]
      labels << species_id
    end
  end

  [data, labels, "Iris-like Dataset"]
end
309
+
310
# Assembles the complete HTML document for the visualization report.
#
# The plot section is delegated to generate_comparison_plots,
# generate_kmeans_plots or generate_hdbscan_plots depending on
# `clustering_method`; any other value leaves the plot section empty. The
# metric cards come from generate_metrics_html and the trailing section from
# generate_additional_plots.
#
# @param data [Array<Array>] original points; only its size and the size of
#   its first element are shown
# @param umap_data [Array] 2D UMAP projection handed to the plot helpers
# @param pca_data [Array] 2D PCA projection handed to the plot helpers
# @param svd_data [Array] 2D SVD projection handed to the plot helpers
# @param true_labels [Array<Integer>] ground-truth labels; -1 is not counted
#   as a cluster
# @param clustering_results [Hash] results keyed by :kmeans and/or :hdbscan
# @param dataset_name [String] shown in the title and heading; it is
#   interpolated as-is, so it must not contain HTML special characters
# @param metrics [Hash] passed through to the metrics helpers
# @param clustering_method [String] 'kmeans', 'hdbscan' or 'both'
# @return [String] the HTML document
def generate_visualization_html(data:, umap_data:, pca_data:, svd_data:, true_labels:,
                                clustering_results:, dataset_name:, metrics:, clustering_method:)
  # Prepare plots based on clustering method
  plots_html =
    case clustering_method
    when 'both'
      # Show both K-means and HDBSCAN side by side
      generate_comparison_plots(
        umap_data, pca_data, svd_data, true_labels,
        clustering_results[:kmeans], clustering_results[:hdbscan]
      )
    when 'kmeans'
      generate_kmeans_plots(
        umap_data, pca_data, svd_data, true_labels,
        clustering_results[:kmeans]
      )
    when 'hdbscan'
      generate_hdbscan_plots(
        umap_data, pca_data, svd_data, true_labels,
        clustering_results[:hdbscan]
      )
    else
      ""
    end

  # Generate metrics HTML
  metrics_html = generate_metrics_html(metrics, clustering_method)

  side_by_side = clustering_method == 'both'
  plot_columns = side_by_side ? 3 : 2
  page_width = side_by_side ? 1800 : 1400
  true_cluster_count = true_labels.reject { |l| l == -1 }.uniq.size

  # Notes on the markup below:
  # * Plotly is pinned to 1.58.5, the release the "plotly-latest" CDN alias is
  #   frozen at, so the page keeps loading exactly the same library version.
  # * The metrics grid uses auto-fit because the number of metric cards differs
  #   per clustering method (a fixed column count left a lone wrapped card).
  <<~HTML
    <!DOCTYPE html>
    <html>
    <head>
      <title>ClusterKit Visualization - #{dataset_name}</title>
      <script src="https://cdn.plot.ly/plotly-1.58.5.min.js"></script>
      <style>
        body {
          font-family: Arial, sans-serif;
          margin: 20px;
          background: #f5f5f5;
        }
        h1 {
          color: #333;
          text-align: center;
        }
        .container {
          display: grid;
          grid-template-columns: repeat(#{plot_columns}, 1fr);
          gap: 20px;
          max-width: #{page_width}px;
          margin: 0 auto;
        }
        .plot {
          background: white;
          border-radius: 8px;
          padding: 10px;
          box-shadow: 0 2px 4px rgba(0,0,0,0.1);
        }
        .stats {
          background: white;
          border-radius: 8px;
          padding: 20px;
          margin: 20px auto;
          max-width: #{page_width}px;
          box-shadow: 0 2px 4px rgba(0,0,0,0.1);
        }
        table {
          width: 100%;
          border-collapse: collapse;
        }
        th, td {
          padding: 10px;
          text-align: left;
          border-bottom: 1px solid #ddd;
        }
        th {
          background-color: #f8f8f8;
          font-weight: bold;
        }
        .metrics {
          display: grid;
          grid-template-columns: repeat(auto-fit, minmax(140px, 1fr));
          gap: 20px;
          margin-top: 20px;
        }
        .metric-card {
          background: #f8f8f8;
          padding: 15px;
          border-radius: 5px;
          text-align: center;
        }
        .metric-value {
          font-size: 24px;
          font-weight: bold;
          color: #333;
        }
        .metric-label {
          color: #666;
          margin-top: 5px;
          font-size: 12px;
        }
        .noise-point {
          opacity: 0.3;
        }
      </style>
    </head>
    <body>
      <h1>Dimensionality Reduction &amp; Clustering Analysis</h1>
      <h2 style="text-align: center; color: #666;">Dataset: #{dataset_name} | Method: #{clustering_method.capitalize}</h2>

      <div class="stats">
        <h2>Dataset Information</h2>
        <table>
          <tr>
            <th>Property</th>
            <th>Value</th>
          </tr>
          <tr>
            <td>Original Dimensions</td>
            <td>#{data.first.size}</td>
          </tr>
          <tr>
            <td>Number of Points</td>
            <td>#{data.size}</td>
          </tr>
          <tr>
            <td>True Number of Clusters</td>
            <td>#{true_cluster_count}</td>
          </tr>
        </table>

        #{metrics_html}
      </div>

      <div class="container">
        #{plots_html}
      </div>

      #{generate_additional_plots(metrics, clustering_method)}

    </body>
    </html>
  HTML
end
451
+
452
# Renders the row of metric cards shown under the dataset table.
#
# K-means cards (optimal k, UMAP and PCA silhouette) are emitted for 'kmeans'
# and 'both'; HDBSCAN cards (cluster count, noise ratio, UMAP silhouette) for
# 'hdbscan' and 'both'. The PCA explained-variance card is always emitted.
#
# @param metrics [Hash] expects :pca_variance_explained plus :kmeans and/or
#   :hdbscan sub-hashes for the methods being displayed
# @param clustering_method [String] 'kmeans', 'hdbscan' or 'both'
# @return [String] a `<div class="metrics">` fragment
def generate_metrics_html(metrics, clustering_method)
  show_kmeans = %w[kmeans both].include?(clustering_method)
  show_hdbscan = %w[hdbscan both].include?(clustering_method)

  fragments = ['<div class="metrics">']

  if show_kmeans
    km = metrics[:kmeans]
    fragments << <<~HTML
      <div class="metric-card">
      <div class="metric-value">#{km[:optimal_k]}</div>
      <div class="metric-label">K-means<br>Optimal K</div>
      </div>
      <div class="metric-card">
      <div class="metric-value">#{km[:silhouette_umap].round(3)}</div>
      <div class="metric-label">K-means UMAP<br>Silhouette</div>
      </div>
      <div class="metric-card">
      <div class="metric-value">#{km[:silhouette_pca].round(3)}</div>
      <div class="metric-label">K-means PCA<br>Silhouette</div>
      </div>
    HTML
  end

  if show_hdbscan
    hd = metrics[:hdbscan]
    noise_percent = (hd[:noise_ratio] * 100).round(1)
    fragments << <<~HTML
      <div class="metric-card">
      <div class="metric-value">#{hd[:n_clusters]}</div>
      <div class="metric-label">HDBSCAN<br>Clusters Found</div>
      </div>
      <div class="metric-card">
      <div class="metric-value">#{noise_percent}%</div>
      <div class="metric-label">HDBSCAN<br>Noise Ratio</div>
      </div>
      <div class="metric-card">
      <div class="metric-value">#{hd[:silhouette_umap].round(3)}</div>
      <div class="metric-label">HDBSCAN UMAP<br>Silhouette</div>
      </div>
    HTML
  end

  variance_percent = (metrics[:pca_variance_explained] * 100).round(1)
  fragments << <<~HTML
    <div class="metric-card">
    <div class="metric-value">#{variance_percent}%</div>
    <div class="metric-label">PCA Variance<br>Explained</div>
    </div>
  HTML

  fragments << '</div>'
  fragments.join
end
501
+
502
# Builds the 3x3 plot grid used when both clustering methods are shown:
# one row each for the true labels, the K-means labels and the HDBSCAN labels,
# with UMAP, PCA and SVD as columns.
#
# @param umap_data [Array] 2D UMAP projection
# @param pca_data [Array] 2D PCA projection
# @param svd_data [Array] 2D SVD projection
# @param true_labels [Array] ground-truth labels
# @param kmeans_results [Hash] needs :labels_umap, :labels_pca, :labels_svd
# @param hdbscan_results [Hash] needs :labels_umap, :labels_pca, :labels_svd
# @return [String] the nine plot fragments joined by newlines
def generate_comparison_plots(umap_data, pca_data, svd_data, true_labels, kmeans_results, hdbscan_results)
  # [id suffix, points, display name, axis prefix, key into the result hashes]
  columns = [
    ['umap', umap_data, 'UMAP', 'UMAP', :labels_umap],
    ['pca', pca_data, 'PCA', 'PC', :labels_pca],
    ['svd', svd_data, 'SVD', 'Component', :labels_svd]
  ]

  truth_row = columns.map do |suffix, points, name, axis, _key|
    create_plot_div("true-#{suffix}", points, true_labels, "#{name} - True Labels", axis)
  end

  kmeans_row = columns.map do |suffix, points, name, axis, key|
    create_plot_div("kmeans-#{suffix}", points, kmeans_results[key], "#{name} - K-means", axis)
  end

  # HDBSCAN can label points as noise, so its plots request noise styling.
  hdbscan_row = columns.map do |suffix, points, name, axis, key|
    create_plot_div("hdbscan-#{suffix}", points, hdbscan_results[key], "#{name} - HDBSCAN", axis, true)
  end

  (truth_row + kmeans_row + hdbscan_row).join("\n")
end
522
+
523
# Builds the two-column plot grid for the K-means-only report: true labels and
# K-means labels for UMAP and PCA, followed by a row comparing true and
# K-means labels on the SVD projection.
#
# @param umap_data [Array] 2D UMAP projection
# @param pca_data [Array] 2D PCA projection
# @param svd_data [Array] 2D SVD projection
# @param true_labels [Array] ground-truth labels
# @param kmeans_results [Hash] needs :labels_umap, :labels_pca, :labels_svd
# @return [String] the six plot fragments joined by newlines
def generate_kmeans_plots(umap_data, pca_data, svd_data, true_labels, kmeans_results)
  # One entry per plot, in display order (row by row).
  specs = [
    ['true-umap', umap_data, true_labels, 'UMAP - True Labels', 'UMAP'],
    ['true-pca', pca_data, true_labels, 'PCA - True Labels', 'PC'],
    ['kmeans-umap', umap_data, kmeans_results[:labels_umap], 'UMAP - K-means', 'UMAP'],
    ['kmeans-pca', pca_data, kmeans_results[:labels_pca], 'PCA - K-means', 'PC'],
    ['true-svd', svd_data, true_labels, 'SVD - True Labels', 'Component'],
    ['kmeans-svd', svd_data, kmeans_results[:labels_svd], 'SVD - K-means', 'Component']
  ]

  specs.map { |spec| create_plot_div(*spec) }.join("\n")
end
540
+
541
# Builds the two-column plot grid for the HDBSCAN-only report: true labels and
# HDBSCAN labels for UMAP and PCA, followed by a row comparing true and
# HDBSCAN labels on the SVD projection. HDBSCAN plots request noise styling.
#
# @param umap_data [Array] 2D UMAP projection
# @param pca_data [Array] 2D PCA projection
# @param svd_data [Array] 2D SVD projection
# @param true_labels [Array] ground-truth labels
# @param hdbscan_results [Hash] needs :labels_umap, :labels_pca, :labels_svd
# @return [String] the six plot fragments joined by newlines
def generate_hdbscan_plots(umap_data, pca_data, svd_data, true_labels, hdbscan_results)
  # One entry per plot, in display order; a trailing `true` enables noise styling.
  specs = [
    ['true-umap', umap_data, true_labels, 'UMAP - True Labels', 'UMAP'],
    ['true-pca', pca_data, true_labels, 'PCA - True Labels', 'PC'],
    ['hdbscan-umap', umap_data, hdbscan_results[:labels_umap], 'UMAP - HDBSCAN', 'UMAP', true],
    ['hdbscan-pca', pca_data, hdbscan_results[:labels_pca], 'PCA - HDBSCAN', 'PC', true],
    ['true-svd', svd_data, true_labels, 'SVD - True Labels', 'Component'],
    ['hdbscan-svd', svd_data, hdbscan_results[:labels_svd], 'SVD - HDBSCAN', 'Component', true]
  ]

  specs.map { |spec| create_plot_div(*spec) }.join("\n")
end
558
+
559
# Renders one Plotly scatter plot: a container div plus the script that draws
# into it.
#
# The trace and layout are built as Ruby hashes and serialized with `to_json`,
# so titles and axis labels containing quotes or other special characters
# cannot break the generated JavaScript.
#
# @param id [String] DOM id of the plot container (inserted into the HTML
#   attribute as-is)
# @param data [Array<Array>] points; element 0 is plotted on x, element 1 on y
# @param labels [Array] one label per point, used as marker colour
# @param title [String] plot title
# @param axis_prefix [String] axis titles become "<prefix> 1" and "<prefix> 2"
# @param has_noise [Boolean] when true, points labelled -1 are drawn gray,
#   smaller (5 instead of 8) and more transparent (0.3 instead of 0.8)
# @return [String] HTML fragment
def create_plot_div(id, data, labels, title, axis_prefix, has_noise = false)
  # Serialize for embedding inside <script>; "</" is escaped so a string value
  # can never terminate the script element early.
  to_js = ->(value) { value.to_json.gsub('</') { '<\/' } }

  marker = {}
  if has_noise
    # Handle noise points specially for HDBSCAN
    marker[:color] = labels.map { |l| l == -1 ? 'gray' : l }
    # Make noise points smaller and semi-transparent
    marker[:size] = labels.map { |l| l == -1 ? 5 : 8 }
    marker[:opacity] = labels.map { |l| l == -1 ? 0.3 : 0.8 }
  else
    marker[:color] = labels
    marker[:size] = 8
  end
  marker[:colorscale] = 'Viridis'
  marker[:showscale] = false

  trace = {
    x: data.map { |p| p[0] },
    y: data.map { |p| p[1] },
    mode: 'markers',
    marker: marker,
    type: 'scatter'
  }

  layout = {
    title: title,
    xaxis: { title: "#{axis_prefix} 1" },
    yaxis: { title: "#{axis_prefix} 2" },
    height: 400
  }

  <<~HTML
    <div class="plot" id="#{id}"></div>
    <script>
      Plotly.newPlot(#{to_js.call(id)}, [#{to_js.call(trace)}], #{to_js.call(layout)});
    </script>
  HTML
end
599
+
600
# Renders the optional elbow-method chart shown below the plot grid.
#
# A chart is produced only when `clustering_method` is 'kmeans' or 'both' and
# `metrics[:kmeans][:elbow_results]` is present; otherwise an empty string is
# returned.
#
# @param metrics [Hash] may contain a :kmeans hash with :elbow_results
#   (k => inertia) and :optimal_k
# @param clustering_method [String] 'kmeans', 'hdbscan' or 'both'
# @return [String] HTML fragment, or "" when there is nothing to show
def generate_additional_plots(metrics, clustering_method)
  return '' unless %w[kmeans both].include?(clustering_method)

  kmeans_metrics = metrics[:kmeans]
  elbow_data = kmeans_metrics && kmeans_metrics[:elbow_results]
  return '' unless elbow_data

  # Sorted by k so the line is drawn left to right.
  elbow_json = elbow_data.to_a.sort.to_h.to_json
  optimal_k = kmeans_metrics[:optimal_k]

  <<~HTML
    <div class="stats">
    <h2>K-means Elbow Method Results</h2>
    <div id="elbow-plot" style="height: 400px;"></div>
    </div>
    <script>
    const elbowData = #{elbow_json};
    Plotly.newPlot('elbow-plot', [{
    x: Object.keys(elbowData),
    y: Object.values(elbowData),
    mode: 'lines+markers',
    marker: { size: 10 },
    line: { width: 2 }
    }], {
    title: 'Elbow Method - Optimal K Selection',
    xaxis: { title: 'Number of Clusters (k)' },
    yaxis: { title: 'Inertia' },
    height: 400,
    annotations: [{
    x: #{optimal_k},
    y: elbowData[#{optimal_k}],
    text: 'Optimal K',
    showarrow: true,
    arrowhead: 7,
    ax: 0,
    ay: -40
    }]
    });
    </script>
  HTML
end
641
+ end