bayesian-sparse-gmm 0.2.2__tar.gz → 0.3.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46) hide show
  1. {bayesian_sparse_gmm-0.2.2 → bayesian_sparse_gmm-0.3.0}/.gitignore +2 -1
  2. {bayesian_sparse_gmm-0.2.2 → bayesian_sparse_gmm-0.3.0}/PKG-INFO +33 -2
  3. {bayesian_sparse_gmm-0.2.2 → bayesian_sparse_gmm-0.3.0}/README.md +32 -1
  4. bayesian_sparse_gmm-0.3.0/benchmarks/benchmark.py +78 -0
  5. bayesian_sparse_gmm-0.3.0/benchmarks/benchmark_stress_test.py +182 -0
  6. bayesian_sparse_gmm-0.3.0/benchmarks/evaluate.py +862 -0
  7. bayesian_sparse_gmm-0.3.0/benchmarks/run_olivetti_only.py +9 -0
  8. bayesian_sparse_gmm-0.3.0/benchmarks/tune_svi.py +37 -0
  9. {bayesian_sparse_gmm-0.2.2 → bayesian_sparse_gmm-0.3.0}/src/bayesian_sparse_gmm/__init__.py +1 -1
  10. {bayesian_sparse_gmm-0.2.2 → bayesian_sparse_gmm-0.3.0}/src/bayesian_sparse_gmm/backends/_base.py +14 -4
  11. {bayesian_sparse_gmm-0.2.2 → bayesian_sparse_gmm-0.3.0}/src/bayesian_sparse_gmm/backends/_cuda.py +13 -1
  12. {bayesian_sparse_gmm-0.2.2 → bayesian_sparse_gmm-0.3.0}/src/bayesian_sparse_gmm/backends/_numba.py +59 -0
  13. {bayesian_sparse_gmm-0.2.2 → bayesian_sparse_gmm-0.3.0}/src/bayesian_sparse_gmm/backends/_numpy.py +16 -6
  14. {bayesian_sparse_gmm-0.2.2 → bayesian_sparse_gmm-0.3.0}/src/bayesian_sparse_gmm/config.py +19 -4
  15. {bayesian_sparse_gmm-0.2.2 → bayesian_sparse_gmm-0.3.0}/src/bayesian_sparse_gmm/model.py +138 -71
  16. bayesian_sparse_gmm-0.3.0/src/bayesian_sparse_gmm/state.py +39 -0
  17. bayesian_sparse_gmm-0.3.0/src/bayesian_sparse_gmm/svi.py +270 -0
  18. {bayesian_sparse_gmm-0.2.2 → bayesian_sparse_gmm-0.3.0}/tests/test_model.py +27 -0
  19. bayesian_sparse_gmm-0.2.2/evaluate.py +0 -503
  20. bayesian_sparse_gmm-0.2.2/src/bayesian_sparse_gmm/state.py +0 -18
  21. {bayesian_sparse_gmm-0.2.2 → bayesian_sparse_gmm-0.3.0}/.flake8 +0 -0
  22. {bayesian_sparse_gmm-0.2.2 → bayesian_sparse_gmm-0.3.0}/.github/workflows/release.yml +0 -0
  23. {bayesian_sparse_gmm-0.2.2 → bayesian_sparse_gmm-0.3.0}/LICENSE +0 -0
  24. {bayesian_sparse_gmm-0.2.2 → bayesian_sparse_gmm-0.3.0}/pyproject.toml +0 -0
  25. {bayesian_sparse_gmm-0.2.2 → bayesian_sparse_gmm-0.3.0}/src/bayesian_sparse_gmm/backends/__init__.py +0 -0
  26. {bayesian_sparse_gmm-0.2.2 → bayesian_sparse_gmm-0.3.0}/src/bayesian_sparse_gmm/diagnostics.py +0 -0
  27. {bayesian_sparse_gmm-0.2.2 → bayesian_sparse_gmm-0.3.0}/src/bayesian_sparse_gmm/postprocessing.py +0 -0
  28. {bayesian_sparse_gmm-0.2.2 → bayesian_sparse_gmm-0.3.0}/src/bayesian_sparse_gmm/sampler.py +0 -0
  29. {bayesian_sparse_gmm-0.2.2 → bayesian_sparse_gmm-0.3.0}/src/bayesian_sparse_gmm/urn.py +0 -0
  30. {bayesian_sparse_gmm-0.2.2 → bayesian_sparse_gmm-0.3.0}/src/bayesian_sparse_gmm/utils.py +0 -0
  31. {bayesian_sparse_gmm-0.2.2 → bayesian_sparse_gmm-0.3.0}/tests/__init__.py +0 -0
  32. {bayesian_sparse_gmm-0.2.2 → bayesian_sparse_gmm-0.3.0}/tests/test_backends.py +0 -0
  33. {bayesian_sparse_gmm-0.2.2 → bayesian_sparse_gmm-0.3.0}/tests/test_cuda.py +0 -0
  34. {bayesian_sparse_gmm-0.2.2 → bayesian_sparse_gmm-0.3.0}/tests/test_diagnostics.py +0 -0
  35. {bayesian_sparse_gmm-0.2.2 → bayesian_sparse_gmm-0.3.0}/tests/test_identity_cov.py +0 -0
  36. {bayesian_sparse_gmm-0.2.2 → bayesian_sparse_gmm-0.3.0}/tests/test_laplace.py +0 -0
  37. {bayesian_sparse_gmm-0.2.2 → bayesian_sparse_gmm-0.3.0}/tests/test_numba.py +0 -0
  38. {bayesian_sparse_gmm-0.2.2 → bayesian_sparse_gmm-0.3.0}/tests/test_phase1.py +0 -0
  39. {bayesian_sparse_gmm-0.2.2 → bayesian_sparse_gmm-0.3.0}/tests/test_sampler.py +0 -0
  40. {bayesian_sparse_gmm-0.2.2 → bayesian_sparse_gmm-0.3.0}/tests/test_synthetic.py +0 -0
  41. {bayesian_sparse_gmm-0.2.2 → bayesian_sparse_gmm-0.3.0}/tests/test_theta_update.py +0 -0
  42. {bayesian_sparse_gmm-0.2.2 → bayesian_sparse_gmm-0.3.0}/tests/test_urn_weight.py +0 -0
  43. {bayesian_sparse_gmm-0.2.2 → bayesian_sparse_gmm-0.3.0}/tests/test_variable_k.py +0 -0
  44. {bayesian_sparse_gmm-0.2.2 → bayesian_sparse_gmm-0.3.0}/tests/test_xi.py +0 -0
  45. {bayesian_sparse_gmm-0.2.2 → bayesian_sparse_gmm-0.3.0}/tests/test_xi_update.py +0 -0
  46. {bayesian_sparse_gmm-0.2.2 → bayesian_sparse_gmm-0.3.0}/uv.lock +0 -0
@@ -142,4 +142,5 @@ cython_debug/
142
142
  /visualize/
143
143
  # /tests/
144
144
  /docs/
145
- /.benchmarks/
145
+ /.benchmarks/
146
+ /benchmarks/visualize/
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: bayesian-sparse-gmm
3
- Version: 0.2.2
3
+ Version: 0.3.0
4
4
  Summary: Bayesian Sparse Gaussian Mixture Model implementation in Python
5
5
  Author-email: Nam Nam <nampvh4436@gmail.com>
6
6
  License: MIT
@@ -26,6 +26,11 @@ Description-Content-Type: text/markdown
26
26
 
27
27
  # Bayesian Sparse GMM
28
28
 
29
+ ![Python](https://img.shields.io/badge/python-3.9%20--%203.12-blue?logo=python&style=flat-square)
30
+ ![CUDA](https://img.shields.io/badge/CUDA-Accelerated-76B900?logo=nvidia&style=flat-square)
31
+ ![CuPy](https://img.shields.io/badge/CuPy-Supported-7F22FE?style=flat-square)
32
+ ![Numba](https://img.shields.io/badge/Numba-Accelerated-FE7A15?style=flat-square)
33
+
29
34
  Bayesian Sparse Gaussian Mixture Model (GMM) implementation in Python.
30
35
 
31
36
  ## Installation
@@ -79,6 +84,13 @@ print(f"Feature inclusion probabilities: {model.feature_probabilities_.round(3)}
79
84
  labels = model.predict(X)
80
85
  ```
81
86
 
87
+ ## Optimization Methods
88
+
89
+ The model supports two optimization architectures via the `optimizer` parameter:
90
+
91
+ 1. **Gibbs Sampling (MCMC)** (`optimizer="default"`): The original implementation. It is asymptotically exact: given enough iterations, its samples converge to the true posterior distribution. It explores the full posterior but requires processing the entire dataset per iteration. Best for smaller datasets or when exact posterior distributions are required.
92
+ 2. **Stochastic Variational Inference (SVI)** (`optimizer="svi"`): A highly scalable approach using Coordinate Ascent Variational Inference (CAVI) with Natural Gradients. It processes data in mini-batches, making it orders of magnitude faster and capable of scaling to extremely large datasets ($N \gg 10,000$).
93
+
82
94
  ## GPU / CUDA Acceleration
83
95
 
84
96
  The model supports three backends, selected via the `backend` parameter:
@@ -164,14 +176,29 @@ Understanding the key hyperparameters is crucial for fine-tuning the model's spa
164
176
 
165
177
  | Parameter | Type | Default | Description |
166
178
  |-----------|------|---------|-------------|
179
+ | `optimizer` | `str` | `"default"` | Choose `"default"` for exact MCMC (Gibbs Sampling) or `"svi"` for scalable Stochastic Variational Inference (mini-batch). |
167
180
  | `K_max` | `int` | `15` | The maximum possible number of clusters. The algorithm will automatically find the active number of clusters $K \le K_{max}$. Should be set safely higher than the expected number of true clusters. |
168
181
  | `lambda_0` | `float` | `1000.0` | **Spike rate** of the Spike-and-Slab LASSO prior. A larger value aggressively forces non-informative (noise) features closer to zero. Must satisfy `lambda_0 >> lambda_1`. |
169
182
  | `lambda_1` | `float` | `0.1` | **Slab rate**. A smaller value allows informative features to deviate freely from zero to capture the cluster structure. |
170
183
  | `alpha` | `float` | `1.0` | Dirichlet concentration parameter for the cluster weight prior. Controls the prior belief over the distribution of cluster sizes. |
171
184
  | `theta` | `float` | `0.1` | Prior probability of a feature being included in the active set (the slab component). Smaller values induce stronger sparsity (fewer features selected). |
185
+
186
+ ### MCMC Parameters (`optimizer="default"`)
187
+
188
+ | Parameter | Type | Default | Description |
189
+ |-----------|------|---------|-------------|
172
190
  | `burn_in` | `int` | `500` | Number of initial MCMC iterations discarded to allow the Markov chain to converge to the stationary distribution. |
173
191
  | `n_iter` | `int` | `1000` | Total number of MCMC iterations. The number of samples used for posterior inference is `n_iter - burn_in`. |
174
192
 
193
+ ### SVI Parameters (`optimizer="svi"`)
194
+
195
+ | Parameter | Type | Default | Description |
196
+ |-----------|------|---------|-------------|
197
+ | `epochs` | `int` | `100` | Total number of passes over the dataset during variational inference. |
198
+ | `batch_size` | `int` | `256` | Mini-batch size for SVI updates. |
199
+ | `delay_rho` | `float` | `1.0` | Learning rate delay parameter ($\tau_0$) to stabilize early iterations. |
200
+ | `forgetting_rate`| `float`| `0.75` | Forgetting rate ($\kappa \in (0.5, 1.0]$) controlling the learning rate decay $\rho_t = (t + \tau_0)^{-\kappa}$. |
201
+
175
202
  *Tip: For extremely high-dimensional datasets with heavy noise, tuning `lambda_0` to be larger and `theta` to be smaller will encourage more aggressive feature selection.*
176
203
 
177
204
  ## Reference
@@ -187,4 +214,8 @@ Understanding the key hyperparameters is crucial for fine-tuning the model's spa
187
214
  pages = {1--50},
188
215
  url = {http://jmlr.org/papers/v26/23-0142.html}
189
216
  }
190
- ```
217
+ ```
218
+
219
+ ## Contributors
220
+
221
+ * **Nam Nam** ([@Neeze](https://github.com/Neeze)) - Developer of the SVI (Stochastic Variational Inference) optimizer, GPU/CUDA acceleration, and benchmarking suite.
@@ -1,5 +1,10 @@
1
1
  # Bayesian Sparse GMM
2
2
 
3
+ ![Python](https://img.shields.io/badge/python-3.9%20--%203.12-blue?logo=python&style=flat-square)
4
+ ![CUDA](https://img.shields.io/badge/CUDA-Accelerated-76B900?logo=nvidia&style=flat-square)
5
+ ![CuPy](https://img.shields.io/badge/CuPy-Supported-7F22FE?style=flat-square)
6
+ ![Numba](https://img.shields.io/badge/Numba-Accelerated-FE7A15?style=flat-square)
7
+
3
8
  Bayesian Sparse Gaussian Mixture Model (GMM) implementation in Python.
4
9
 
5
10
  ## Installation
@@ -53,6 +58,13 @@ print(f"Feature inclusion probabilities: {model.feature_probabilities_.round(3)}
53
58
  labels = model.predict(X)
54
59
  ```
55
60
 
61
+ ## Optimization Methods
62
+
63
+ The model supports two optimization architectures via the `optimizer` parameter:
64
+
65
+ 1. **Gibbs Sampling (MCMC)** (`optimizer="default"`): The original implementation. It is asymptotically exact: given enough iterations, its samples converge to the true posterior distribution. It explores the full posterior but requires processing the entire dataset per iteration. Best for smaller datasets or when exact posterior distributions are required.
66
+ 2. **Stochastic Variational Inference (SVI)** (`optimizer="svi"`): A highly scalable approach using Coordinate Ascent Variational Inference (CAVI) with Natural Gradients. It processes data in mini-batches, making it orders of magnitude faster and capable of scaling to extremely large datasets ($N \gg 10,000$).
67
+
56
68
  ## GPU / CUDA Acceleration
57
69
 
58
70
  The model supports three backends, selected via the `backend` parameter:
@@ -138,14 +150,29 @@ Understanding the key hyperparameters is crucial for fine-tuning the model's spa
138
150
 
139
151
  | Parameter | Type | Default | Description |
140
152
  |-----------|------|---------|-------------|
153
+ | `optimizer` | `str` | `"default"` | Choose `"default"` for exact MCMC (Gibbs Sampling) or `"svi"` for scalable Stochastic Variational Inference (mini-batch). |
141
154
  | `K_max` | `int` | `15` | The maximum possible number of clusters. The algorithm will automatically find the active number of clusters $K \le K_{max}$. Should be set safely higher than the expected number of true clusters. |
142
155
  | `lambda_0` | `float` | `1000.0` | **Spike rate** of the Spike-and-Slab LASSO prior. A larger value aggressively forces non-informative (noise) features closer to zero. Must satisfy `lambda_0 >> lambda_1`. |
143
156
  | `lambda_1` | `float` | `0.1` | **Slab rate**. A smaller value allows informative features to deviate freely from zero to capture the cluster structure. |
144
157
  | `alpha` | `float` | `1.0` | Dirichlet concentration parameter for the cluster weight prior. Controls the prior belief over the distribution of cluster sizes. |
145
158
  | `theta` | `float` | `0.1` | Prior probability of a feature being included in the active set (the slab component). Smaller values induce stronger sparsity (fewer features selected). |
159
+
160
+ ### MCMC Parameters (`optimizer="default"`)
161
+
162
+ | Parameter | Type | Default | Description |
163
+ |-----------|------|---------|-------------|
146
164
  | `burn_in` | `int` | `500` | Number of initial MCMC iterations discarded to allow the Markov chain to converge to the stationary distribution. |
147
165
  | `n_iter` | `int` | `1000` | Total number of MCMC iterations. The number of samples used for posterior inference is `n_iter - burn_in`. |
148
166
 
167
+ ### SVI Parameters (`optimizer="svi"`)
168
+
169
+ | Parameter | Type | Default | Description |
170
+ |-----------|------|---------|-------------|
171
+ | `epochs` | `int` | `100` | Total number of passes over the dataset during variational inference. |
172
+ | `batch_size` | `int` | `256` | Mini-batch size for SVI updates. |
173
+ | `delay_rho` | `float` | `1.0` | Learning rate delay parameter ($\tau_0$) to stabilize early iterations. |
174
+ | `forgetting_rate`| `float`| `0.75` | Forgetting rate ($\kappa \in (0.5, 1.0]$) controlling the learning rate decay $\rho_t = (t + \tau_0)^{-\kappa}$. |
175
+
149
176
  *Tip: For extremely high-dimensional datasets with heavy noise, tuning `lambda_0` to be larger and `theta` to be smaller will encourage more aggressive feature selection.*
150
177
 
151
178
  ## Reference
@@ -161,4 +188,8 @@ Understanding the key hyperparameters is crucial for fine-tuning the model's spa
161
188
  pages = {1--50},
162
189
  url = {http://jmlr.org/papers/v26/23-0142.html}
163
190
  }
164
- ```
191
+ ```
192
+
193
+ ## Contributors
194
+
195
+ * **Nam Nam** ([@Neeze](https://github.com/Neeze)) - Developer of the SVI (Stochastic Variational Inference) optimizer, GPU/CUDA acceleration, and benchmarking suite.
@@ -0,0 +1,78 @@
1
+ import time
2
+ import numpy as np
3
+ from sklearn.preprocessing import StandardScaler
4
+ from bayesian_sparse_gmm.model import BayesianSparseGMM
5
+
6
def run_benchmark(n=5000, p=50, K_true=5, signal_features=10, backend="cuda"):
    """Time the MCMC and SVI optimizers of BayesianSparseGMM on synthetic data.

    Synthetic data: ``K_true`` unit-variance Gaussian clusters in ``p``
    dimensions whose means are non-zero only in the first
    ``signal_features`` columns. Each cluster receives ``n // K_true``
    samples, so the dataset holds ``(n // K_true) * K_true`` rows. Rows are
    shuffled and standardized before fitting.

    Args:
        n: Requested total number of samples.
        p: Number of features.
        K_true: Number of generating clusters.
        signal_features: Number of leading columns that carry cluster signal.
        backend: Value forwarded to ``BayesianSparseGMM(backend=...)``.

    Returns:
        None. Timings, cluster counts and selected-feature counts are printed.
    """
    banner = "======================================================="
    print(f"\n{banner}")
    print(f"BENCHMARK: SVI vs MCMC (n={n}, p={p}, K={K_true})")
    print(f"Backend: {backend.upper()}")
    print(f"{banner}\n")

    # Generate synthetic sparse data: only the first `signal_features`
    # columns have cluster-specific means, the rest are pure noise.
    rng = np.random.default_rng(42)
    means = np.zeros((K_true, p))
    means[:, :signal_features] = rng.normal(0, 3.0, size=(K_true, signal_features))

    n_per_k = n // K_true
    X_parts = [rng.normal(means[k], 1.0, size=(n_per_k, p)) for k in range(K_true)]
    X_raw = np.vstack(X_parts)
    y = np.repeat(np.arange(K_true), n_per_k)

    # Shuffle so mini-batches are not ordered by cluster.
    shuf = rng.permutation(len(y))
    X_raw, y = X_raw[shuf], y[shuf]

    # Standardize
    X = StandardScaler().fit_transform(X_raw)

    # --- MCMC (Default) ---
    print("Running MCMC (Gibbs Sampling) ...")
    # perf_counter() is monotonic and high-resolution; time.time() can jump
    # when the system clock is adjusted, which would corrupt the measurement.
    t0 = time.perf_counter()
    gmm_mcmc = BayesianSparseGMM(
        K_max=10,
        optimizer="default",
        n_iter=100,  # Small number for benchmarking
        burn_in=20,
        backend=backend,
        random_state=42,
        use_identity_covariance=True,
    )
    gmm_mcmc.fit(X)
    t_mcmc = time.perf_counter() - t0

    # --- SVI ---
    print("Running SVI (Natural Gradients) ...")
    t0 = time.perf_counter()
    gmm_svi = BayesianSparseGMM(
        K_max=10,
        optimizer="svi",
        epochs=10,  # 10 passes over the dataset
        batch_size=256,
        backend=backend,
        random_state=42,
        use_identity_covariance=True,
    )
    gmm_svi.fit(X)
    t_svi = time.perf_counter() - t0

    print("\n--- RESULTS ---")
    print(f"MCMC Time: {t_mcmc:.2f} seconds")
    print(f"SVI Time: {t_svi:.2f} seconds")
    print(f"Speedup: {t_mcmc/t_svi:.2f}x")

    print(f"\nMCMC Found Clusters: {gmm_mcmc.K_hat_}")
    print(f"SVI Found Clusters: {gmm_svi.K_hat_}")

    n_sel_mcmc = len(gmm_mcmc.selected_features_)
    n_sel_svi = len(gmm_svi.selected_features_)
    print(f"MCMC Selected Features: {n_sel_mcmc}/{p}")
    print(f"SVI Selected Features: {n_sel_svi}/{p}")
71
+
72
if __name__ == "__main__":
    # argparse is only needed when this file is executed as a script.
    import argparse

    cli = argparse.ArgumentParser()
    cli.add_argument("--backend", type=str, default="cuda")
    chosen_backend = cli.parse_args().backend

    # Script run uses a larger problem than the function defaults.
    run_benchmark(
        n=10000,
        p=100,
        K_true=10,
        signal_features=15,
        backend=chosen_backend,
    )
@@ -0,0 +1,182 @@
1
+ import time
2
+ import os
3
+ import numpy as np
4
+ import matplotlib.pyplot as plt
5
+ from sklearn.cluster import DBSCAN
6
+ from sklearn.metrics import adjusted_rand_score, adjusted_mutual_info_score, v_measure_score
7
+ from sklearn.datasets import make_moons
8
+ from sklearn.preprocessing import StandardScaler
9
+
10
+ from bayesian_sparse_gmm.model import BayesianSparseGMM
11
+
12
def generate_complex_arcs(n_samples=10000, n_features=1024, noise_level=0.15, random_state=42):
    """Generate four overlapping arc-shaped clusters embedded in high dimensions.

    Two ``make_moons`` pairs are drawn; the second pair is rotated, shifted
    to overlap the first, and relabelled to clusters 2 and 3. The 2-D points
    and three derived products/squares form five signal columns; every other
    column is Gaussian noise. Columns and rows are then shuffled and the
    matrix is standardized.

    Args:
        n_samples: Total number of points. Odd values are supported; the
            second pair of moons receives the extra point.
        n_features: Total number of columns; must be at least 5 so the five
            signal columns fit.
        noise_level: ``noise`` argument forwarded to ``make_moons``.
        random_state: Integer seed. The first moons pair and the NumPy
            generator use it directly, the second pair uses
            ``random_state + 1``. The default reproduces the original
            hard-coded seeds (42 and 43).

    Returns:
        Tuple ``(X, y, X_2d, feature_shuf)`` where ``X`` is the standardized
        ``(n_samples, n_features)`` matrix, ``y`` holds labels 0-3, ``X_2d``
        holds the unscaled 2-D coordinates in the same row order as ``X``,
        and ``feature_shuf`` is the column permutation: column ``j`` of ``X``
        comes from pre-shuffle column ``feature_shuf[j]``.

    Raises:
        ValueError: If ``n_features`` is smaller than 5.
    """
    n_signal = 5
    if n_features < n_signal:
        raise ValueError(
            f"n_features must be at least {n_signal}, got {n_features}"
        )

    rng = np.random.default_rng(random_state)

    # Split so the two pairs always add up to n_samples; a plain
    # n_samples // 2 for both pairs loses one row when n_samples is odd and
    # then fails on assignment into the (n_samples, n_features) matrix.
    n_first = n_samples // 2
    n_second = n_samples - n_first

    # Pair 1: standard moons (labels 0 and 1)
    X1, y1 = make_moons(n_samples=n_first, noise=noise_level, random_state=random_state)

    # Pair 2: rotated and shifted moons (labels 2 and 3)
    X2, y2 = make_moons(n_samples=n_second, noise=noise_level, random_state=random_state + 1)
    y2 = y2 + 2

    # R is the counter-clockwise rotation matrix for 60 degrees, but the
    # points are row vectors multiplied on the right, which applies the
    # transpose of R: the moons are rotated 60 degrees clockwise.
    theta = np.radians(60)
    c, s = np.cos(theta), np.sin(theta)
    R = np.array([[c, -s], [s, c]])
    # Shift both coordinates by 0.3 so the pair overlaps with Pair 1.
    X2 = X2 @ R + 0.3

    X_2d = np.vstack((X1, X2))
    y = np.concatenate((y1, y2))

    # Embed in high-dimensional space: two raw coordinates plus three
    # derived non-linear signal features.
    X = np.zeros((n_samples, n_features))
    X[:, 0] = X_2d[:, 0]
    X[:, 1] = X_2d[:, 1]
    X[:, 2] = X_2d[:, 0] * X_2d[:, 1]
    X[:, 3] = X_2d[:, 0] ** 2
    X[:, 4] = X_2d[:, 1] ** 2

    # Remaining columns are pure noise.
    X[:, n_signal:] = rng.normal(0, 1.5, size=(n_samples, n_features - n_signal))

    # Randomly permute the features so the signal is not just in the first 5 dims
    feature_shuf = rng.permutation(n_features)
    X = X[:, feature_shuf]

    # Shuffle samples, keeping X, y and X_2d aligned.
    sample_shuf = rng.permutation(n_samples)
    X, y, X_2d = X[sample_shuf], y[sample_shuf], X_2d[sample_shuf]

    # Scale
    X = StandardScaler().fit_transform(X)

    return X, y, X_2d, feature_shuf
66
+
67
def run_stress_test_benchmark(backend="cuda"):
    """Compare DBSCAN with BayesianSparseGMM (SVI) on the arc-shaped stress dataset.

    Generates the dataset with ``generate_complex_arcs``, fits both models,
    prints timings, cluster counts, ARI/AMI/V-measure scores and the signal
    features recovered by the GMM, and saves a three-panel scatter plot to
    ``./visualize/stress_test_arcs.png`` (relative to the working directory).

    Args:
        backend: Value forwarded to ``BayesianSparseGMM(backend=...)``.

    Returns:
        None.
    """
    n_samples = 10000
    n_features = 1024
    banner = "======================================================="

    print(f"\n{banner}")
    print("STRESS TEST BENCHMARK: ARC-SHAPED CLUSTERS")
    print(f"n={n_samples}, p={n_features}, backend={backend.upper()}")
    print(f"{banner}\n")

    print("Generating complex dataset...")
    X, y, X_2d, feature_shuf = generate_complex_arcs(n_samples, n_features)

    # Positions of the five original signal columns after the feature
    # shuffle. Converted to plain ints so they print without NumPy scalar
    # wrappers.
    signal_indices = [int(np.where(feature_shuf == i)[0][0]) for i in range(5)]
    print(f"Signal feature indices: {signal_indices}")

    # --- DBSCAN ---
    # In 1024 dimensions, distance values are large.
    # For N(0,1) variables, expected Euclidean distance is roughly sqrt(2*1024) ~ 45
    # We set a somewhat large eps and min_samples to see how it handles it.
    print("\nRunning DBSCAN ...")
    # perf_counter() is monotonic; time.time() can jump with clock changes.
    t0 = time.perf_counter()
    dbscan = DBSCAN(eps=40.0, min_samples=10)
    dbscan.fit(X)
    t_dbscan = time.perf_counter() - t0

    # --- BSGMM (SVI) ---
    print("\nRunning Bayesian Sparse GMM (SVI) ...")
    t0 = time.perf_counter()
    gmm = BayesianSparseGMM(
        K_max=10,
        optimizer="svi",
        epochs=100,
        batch_size=512,
        lambda_0=1000.0,
        lambda_1=0.1,
        theta=0.5,
        backend=backend,
        random_state=42,
        use_identity_covariance=True,
        verbose=1,
    )
    gmm.fit(X)
    t_bsgmm = time.perf_counter() - t0

    # --- Results ---
    db_labels = dbscan.labels_
    ari_db = adjusted_rand_score(y, db_labels)
    ami_db = adjusted_mutual_info_score(y, db_labels)
    v_db = v_measure_score(y, db_labels)

    ari_gmm = adjusted_rand_score(y, gmm.labels_)
    ami_gmm = adjusted_mutual_info_score(y, gmm.labels_)
    v_gmm = v_measure_score(y, gmm.labels_)

    print("\n--- RESULTS ---")
    print(f"DBSCAN Time: {t_dbscan:.2f} seconds")
    print(f"BSGMM Time: {t_bsgmm:.2f} seconds")
    print(f"Speedup: {t_dbscan/t_bsgmm:.2f}x (Note: DBSCAN doesn't scale well with dimensions)")

    # DBSCAN marks noise with the label -1; it is not a cluster, so it is
    # excluded from the cluster count and reported separately.
    noise_mask = db_labels == -1
    n_noise = int(np.sum(noise_mask))
    n_db_clusters = len(np.unique(db_labels[~noise_mask]))
    print(f"\nDBSCAN Clusters found: {n_db_clusters} (Noise points: {n_noise})")
    print(f"BSGMM Clusters found: {gmm.K_hat_}")

    print(f"\nDBSCAN - ARI: {ari_db:.4f} | AMI: {ami_db:.4f} | V: {v_db:.4f}")
    print(f"BSGMM - ARI: {ari_gmm:.4f} | AMI: {ami_gmm:.4f} | V: {v_gmm:.4f}")

    n_sel = len(gmm.selected_features_)
    print(f"\nBSGMM Selected Features: {n_sel}/{n_features}")

    # Check if BSGMM found the true signal features
    true_sig = set(signal_indices)
    sel = set(gmm.selected_features_)
    found = true_sig.intersection(sel)
    print(f"Signal features found: {found} (out of {true_sig})")

    # --- Visualization ---
    os.makedirs("./visualize", exist_ok=True)

    fig, axes = plt.subplots(1, 3, figsize=(18, 5))
    fig.suptitle(
        f"Stress Test: Mixed Arcs (n={n_samples}, p={n_features})",
        fontsize=14,
        fontweight='bold',
    )

    # Ground Truth
    pal = plt.cm.tab10(np.linspace(0, 0.9, 10))
    for k in np.unique(y):
        m = y == k
        axes[0].scatter(X_2d[m, 0], X_2d[m, 1], c=[pal[k % 10]], s=5, alpha=0.5)
    axes[0].set_title("Ground Truth (2D Projection)")

    # DBSCAN (noise points are drawn in black)
    unique_db = np.unique(db_labels)
    pal_db = plt.cm.tab20(np.linspace(0, 1, max(len(unique_db), 2)))
    for idx, k in enumerate(unique_db):
        m = db_labels == k
        color = 'k' if k == -1 else pal_db[idx % 20]
        axes[1].scatter(X_2d[m, 0], X_2d[m, 1], c=[color], s=5, alpha=0.5)
    axes[1].set_title(f"DBSCAN (ARI={ari_db:.3f})")

    # BSGMM
    unique_gmm = np.unique(gmm.labels_)
    pal_gmm = plt.cm.tab20(np.linspace(0, 1, max(len(unique_gmm), 2)))
    for idx, k in enumerate(unique_gmm):
        m = gmm.labels_ == k
        axes[2].scatter(X_2d[m, 0], X_2d[m, 1], c=[pal_gmm[idx % 20]], s=5, alpha=0.5)
    axes[2].set_title(f"BSGMM SVI (ARI={ari_gmm:.3f}, {n_sel} feats)")

    plt.tight_layout()
    plt.savefig("./visualize/stress_test_arcs.png", dpi=150, bbox_inches='tight')
    plt.close(fig)
    print("Saved './visualize/stress_test_arcs.png'")
175
+
176
if __name__ == "__main__":
    # argparse is only needed when this file is executed as a script.
    import argparse

    cli = argparse.ArgumentParser()
    cli.add_argument("--backend", type=str, default="cuda")
    chosen_backend = cli.parse_args().backend

    run_stress_test_benchmark(backend=chosen_backend)