bayesian-sparse-gmm 0.2.2__tar.gz → 0.3.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {bayesian_sparse_gmm-0.2.2 → bayesian_sparse_gmm-0.3.0}/.gitignore +2 -1
- {bayesian_sparse_gmm-0.2.2 → bayesian_sparse_gmm-0.3.0}/PKG-INFO +33 -2
- {bayesian_sparse_gmm-0.2.2 → bayesian_sparse_gmm-0.3.0}/README.md +32 -1
- bayesian_sparse_gmm-0.3.0/benchmarks/benchmark.py +78 -0
- bayesian_sparse_gmm-0.3.0/benchmarks/benchmark_stress_test.py +182 -0
- bayesian_sparse_gmm-0.3.0/benchmarks/evaluate.py +862 -0
- bayesian_sparse_gmm-0.3.0/benchmarks/run_olivetti_only.py +9 -0
- bayesian_sparse_gmm-0.3.0/benchmarks/tune_svi.py +37 -0
- {bayesian_sparse_gmm-0.2.2 → bayesian_sparse_gmm-0.3.0}/src/bayesian_sparse_gmm/__init__.py +1 -1
- {bayesian_sparse_gmm-0.2.2 → bayesian_sparse_gmm-0.3.0}/src/bayesian_sparse_gmm/backends/_base.py +14 -4
- {bayesian_sparse_gmm-0.2.2 → bayesian_sparse_gmm-0.3.0}/src/bayesian_sparse_gmm/backends/_cuda.py +13 -1
- {bayesian_sparse_gmm-0.2.2 → bayesian_sparse_gmm-0.3.0}/src/bayesian_sparse_gmm/backends/_numba.py +59 -0
- {bayesian_sparse_gmm-0.2.2 → bayesian_sparse_gmm-0.3.0}/src/bayesian_sparse_gmm/backends/_numpy.py +16 -6
- {bayesian_sparse_gmm-0.2.2 → bayesian_sparse_gmm-0.3.0}/src/bayesian_sparse_gmm/config.py +19 -4
- {bayesian_sparse_gmm-0.2.2 → bayesian_sparse_gmm-0.3.0}/src/bayesian_sparse_gmm/model.py +138 -71
- bayesian_sparse_gmm-0.3.0/src/bayesian_sparse_gmm/state.py +39 -0
- bayesian_sparse_gmm-0.3.0/src/bayesian_sparse_gmm/svi.py +270 -0
- {bayesian_sparse_gmm-0.2.2 → bayesian_sparse_gmm-0.3.0}/tests/test_model.py +27 -0
- bayesian_sparse_gmm-0.2.2/evaluate.py +0 -503
- bayesian_sparse_gmm-0.2.2/src/bayesian_sparse_gmm/state.py +0 -18
- {bayesian_sparse_gmm-0.2.2 → bayesian_sparse_gmm-0.3.0}/.flake8 +0 -0
- {bayesian_sparse_gmm-0.2.2 → bayesian_sparse_gmm-0.3.0}/.github/workflows/release.yml +0 -0
- {bayesian_sparse_gmm-0.2.2 → bayesian_sparse_gmm-0.3.0}/LICENSE +0 -0
- {bayesian_sparse_gmm-0.2.2 → bayesian_sparse_gmm-0.3.0}/pyproject.toml +0 -0
- {bayesian_sparse_gmm-0.2.2 → bayesian_sparse_gmm-0.3.0}/src/bayesian_sparse_gmm/backends/__init__.py +0 -0
- {bayesian_sparse_gmm-0.2.2 → bayesian_sparse_gmm-0.3.0}/src/bayesian_sparse_gmm/diagnostics.py +0 -0
- {bayesian_sparse_gmm-0.2.2 → bayesian_sparse_gmm-0.3.0}/src/bayesian_sparse_gmm/postprocessing.py +0 -0
- {bayesian_sparse_gmm-0.2.2 → bayesian_sparse_gmm-0.3.0}/src/bayesian_sparse_gmm/sampler.py +0 -0
- {bayesian_sparse_gmm-0.2.2 → bayesian_sparse_gmm-0.3.0}/src/bayesian_sparse_gmm/urn.py +0 -0
- {bayesian_sparse_gmm-0.2.2 → bayesian_sparse_gmm-0.3.0}/src/bayesian_sparse_gmm/utils.py +0 -0
- {bayesian_sparse_gmm-0.2.2 → bayesian_sparse_gmm-0.3.0}/tests/__init__.py +0 -0
- {bayesian_sparse_gmm-0.2.2 → bayesian_sparse_gmm-0.3.0}/tests/test_backends.py +0 -0
- {bayesian_sparse_gmm-0.2.2 → bayesian_sparse_gmm-0.3.0}/tests/test_cuda.py +0 -0
- {bayesian_sparse_gmm-0.2.2 → bayesian_sparse_gmm-0.3.0}/tests/test_diagnostics.py +0 -0
- {bayesian_sparse_gmm-0.2.2 → bayesian_sparse_gmm-0.3.0}/tests/test_identity_cov.py +0 -0
- {bayesian_sparse_gmm-0.2.2 → bayesian_sparse_gmm-0.3.0}/tests/test_laplace.py +0 -0
- {bayesian_sparse_gmm-0.2.2 → bayesian_sparse_gmm-0.3.0}/tests/test_numba.py +0 -0
- {bayesian_sparse_gmm-0.2.2 → bayesian_sparse_gmm-0.3.0}/tests/test_phase1.py +0 -0
- {bayesian_sparse_gmm-0.2.2 → bayesian_sparse_gmm-0.3.0}/tests/test_sampler.py +0 -0
- {bayesian_sparse_gmm-0.2.2 → bayesian_sparse_gmm-0.3.0}/tests/test_synthetic.py +0 -0
- {bayesian_sparse_gmm-0.2.2 → bayesian_sparse_gmm-0.3.0}/tests/test_theta_update.py +0 -0
- {bayesian_sparse_gmm-0.2.2 → bayesian_sparse_gmm-0.3.0}/tests/test_urn_weight.py +0 -0
- {bayesian_sparse_gmm-0.2.2 → bayesian_sparse_gmm-0.3.0}/tests/test_variable_k.py +0 -0
- {bayesian_sparse_gmm-0.2.2 → bayesian_sparse_gmm-0.3.0}/tests/test_xi.py +0 -0
- {bayesian_sparse_gmm-0.2.2 → bayesian_sparse_gmm-0.3.0}/tests/test_xi_update.py +0 -0
- {bayesian_sparse_gmm-0.2.2 → bayesian_sparse_gmm-0.3.0}/uv.lock +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: bayesian-sparse-gmm
|
|
3
|
-
Version: 0.2.2
|
|
3
|
+
Version: 0.3.0
|
|
4
4
|
Summary: Bayesian Sparse Gaussian Mixture Model implementation in Python
|
|
5
5
|
Author-email: Nam Nam <nampvh4436@gmail.com>
|
|
6
6
|
License: MIT
|
|
@@ -26,6 +26,11 @@ Description-Content-Type: text/markdown
|
|
|
26
26
|
|
|
27
27
|
# Bayesian Sparse GMM
|
|
28
28
|
|
|
29
|
+

|
|
30
|
+

|
|
31
|
+

|
|
32
|
+

|
|
33
|
+
|
|
29
34
|
Bayesian Sparse Gaussian Mixture Model (GMM) implementation in Python.
|
|
30
35
|
|
|
31
36
|
## Installation
|
|
@@ -79,6 +84,13 @@ print(f"Feature inclusion probabilities: {model.feature_probabilities_.round(3)}
|
|
|
79
84
|
labels = model.predict(X)
|
|
80
85
|
```
|
|
81
86
|
|
|
87
|
+
## Optimization Methods
|
|
88
|
+
|
|
89
|
+
The model supports two optimization architectures via the `optimizer` parameter:
|
|
90
|
+
|
|
91
|
+
1. **Gibbs Sampling (MCMC)** (`optimizer="default"`): The original, mathematically exact implementation. It explores the full posterior distribution but requires processing the entire dataset per iteration. Best for smaller datasets or when exact posterior distributions are required.
|
|
92
|
+
2. **Stochastic Variational Inference (SVI)** (`optimizer="svi"`): A highly scalable approach using Coordinate Ascent Variational Inference (CAVI) with Natural Gradients. It processes data in mini-batches, making it orders of magnitude faster than Gibbs sampling and capable of scaling to extremely large datasets ($N \gg 10{,}000$).
|
|
93
|
+
|
|
82
94
|
## GPU / CUDA Acceleration
|
|
83
95
|
|
|
84
96
|
The model supports three backends, selected via the `backend` parameter:
|
|
@@ -164,14 +176,29 @@ Understanding the key hyperparameters is crucial for fine-tuning the model's spa
|
|
|
164
176
|
|
|
165
177
|
| Parameter | Type | Default | Description |
|
|
166
178
|
|-----------|------|---------|-------------|
|
|
179
|
+
| `optimizer` | `str` | `"default"` | Choose `"default"` for exact MCMC (Gibbs Sampling) or `"svi"` for scalable Stochastic Variational Inference (mini-batch). |
|
|
167
180
|
| `K_max` | `int` | `15` | The maximum possible number of clusters. The algorithm will automatically find the active number of clusters $K \le K_{max}$. Should be set safely higher than the expected number of true clusters. |
|
|
168
181
|
| `lambda_0` | `float` | `1000.0` | **Spike rate** of the Spike-and-Slab LASSO prior. A larger value aggressively forces non-informative (noise) features closer to zero. Must satisfy `lambda_0 >> lambda_1`. |
|
|
169
182
|
| `lambda_1` | `float` | `0.1` | **Slab rate**. A smaller value allows informative features to deviate freely from zero to capture the cluster structure. |
|
|
170
183
|
| `alpha` | `float` | `1.0` | Dirichlet concentration parameter for the cluster weight prior. Controls the prior belief over the distribution of cluster sizes. |
|
|
171
184
|
| `theta` | `float` | `0.1` | Prior probability of a feature being included in the active set (the slab component). Smaller values induce stronger sparsity (fewer features selected). |
|
|
185
|
+
|
|
186
|
+
### MCMC Parameters (`optimizer="default"`)
|
|
187
|
+
|
|
188
|
+
| Parameter | Type | Default | Description |
|
|
189
|
+
|-----------|------|---------|-------------|
|
|
172
190
|
| `burn_in` | `int` | `500` | Number of initial MCMC iterations discarded to allow the Markov chain to converge to the stationary distribution. |
|
|
173
191
|
| `n_iter` | `int` | `1000` | Total number of MCMC iterations. The number of samples used for posterior inference is `n_iter - burn_in`. |
|
|
174
192
|
|
|
193
|
+
### SVI Parameters (`optimizer="svi"`)
|
|
194
|
+
|
|
195
|
+
| Parameter | Type | Default | Description |
|
|
196
|
+
|-----------|------|---------|-------------|
|
|
197
|
+
| `epochs` | `int` | `100` | Total number of passes over the dataset during variational inference. |
|
|
198
|
+
| `batch_size` | `int` | `256` | Mini-batch size for SVI updates. |
|
|
199
|
+
| `delay_rho` | `float` | `1.0` | Learning rate delay parameter ($\tau_0$) to stabilize early iterations. |
|
|
200
|
+
| `forgetting_rate` | `float` | `0.75` | Forgetting rate ($\kappa \in (0.5, 1.0]$) controlling the learning rate decay $\rho_t = (t + \tau_0)^{-\kappa}$. |
|
|
201
|
+
|
|
175
202
|
*Tip: For extremely high-dimensional datasets with heavy noise, tuning `lambda_0` to be larger and `theta` to be smaller will encourage more aggressive feature selection.*
|
|
176
203
|
|
|
177
204
|
## Reference
|
|
@@ -187,4 +214,8 @@ Understanding the key hyperparameters is crucial for fine-tuning the model's spa
|
|
|
187
214
|
pages = {1--50},
|
|
188
215
|
url = {http://jmlr.org/papers/v26/23-0142.html}
|
|
189
216
|
}
|
|
190
|
-
```
|
|
217
|
+
```
|
|
218
|
+
|
|
219
|
+
## Contributors
|
|
220
|
+
|
|
221
|
+
* **Nam Nam** ([@Neeze](https://github.com/Neeze)) - Developer of the SVI (Stochastic Variational Inference) optimizer, GPU/CUDA acceleration, and benchmarking suite.
|
|
@@ -1,5 +1,10 @@
|
|
|
1
1
|
# Bayesian Sparse GMM
|
|
2
2
|
|
|
3
|
+

|
|
4
|
+

|
|
5
|
+

|
|
6
|
+

|
|
7
|
+
|
|
3
8
|
Bayesian Sparse Gaussian Mixture Model (GMM) implementation in Python.
|
|
4
9
|
|
|
5
10
|
## Installation
|
|
@@ -53,6 +58,13 @@ print(f"Feature inclusion probabilities: {model.feature_probabilities_.round(3)}
|
|
|
53
58
|
labels = model.predict(X)
|
|
54
59
|
```
|
|
55
60
|
|
|
61
|
+
## Optimization Methods
|
|
62
|
+
|
|
63
|
+
The model supports two optimization architectures via the `optimizer` parameter:
|
|
64
|
+
|
|
65
|
+
1. **Gibbs Sampling (MCMC)** (`optimizer="default"`): The original, mathematically exact implementation. It explores the full posterior distribution but requires processing the entire dataset per iteration. Best for smaller datasets or when exact posterior distributions are required.
|
|
66
|
+
2. **Stochastic Variational Inference (SVI)** (`optimizer="svi"`): A highly scalable approach using Coordinate Ascent Variational Inference (CAVI) with Natural Gradients. It processes data in mini-batches, making it orders of magnitude faster than Gibbs sampling and capable of scaling to extremely large datasets ($N \gg 10{,}000$).
|
|
67
|
+
|
|
56
68
|
## GPU / CUDA Acceleration
|
|
57
69
|
|
|
58
70
|
The model supports three backends, selected via the `backend` parameter:
|
|
@@ -138,14 +150,29 @@ Understanding the key hyperparameters is crucial for fine-tuning the model's spa
|
|
|
138
150
|
|
|
139
151
|
| Parameter | Type | Default | Description |
|
|
140
152
|
|-----------|------|---------|-------------|
|
|
153
|
+
| `optimizer` | `str` | `"default"` | Choose `"default"` for exact MCMC (Gibbs Sampling) or `"svi"` for scalable Stochastic Variational Inference (mini-batch). |
|
|
141
154
|
| `K_max` | `int` | `15` | The maximum possible number of clusters. The algorithm will automatically find the active number of clusters $K \le K_{max}$. Should be set safely higher than the expected number of true clusters. |
|
|
142
155
|
| `lambda_0` | `float` | `1000.0` | **Spike rate** of the Spike-and-Slab LASSO prior. A larger value aggressively forces non-informative (noise) features closer to zero. Must satisfy `lambda_0 >> lambda_1`. |
|
|
143
156
|
| `lambda_1` | `float` | `0.1` | **Slab rate**. A smaller value allows informative features to deviate freely from zero to capture the cluster structure. |
|
|
144
157
|
| `alpha` | `float` | `1.0` | Dirichlet concentration parameter for the cluster weight prior. Controls the prior belief over the distribution of cluster sizes. |
|
|
145
158
|
| `theta` | `float` | `0.1` | Prior probability of a feature being included in the active set (the slab component). Smaller values induce stronger sparsity (fewer features selected). |
|
|
159
|
+
|
|
160
|
+
### MCMC Parameters (`optimizer="default"`)
|
|
161
|
+
|
|
162
|
+
| Parameter | Type | Default | Description |
|
|
163
|
+
|-----------|------|---------|-------------|
|
|
146
164
|
| `burn_in` | `int` | `500` | Number of initial MCMC iterations discarded to allow the Markov chain to converge to the stationary distribution. |
|
|
147
165
|
| `n_iter` | `int` | `1000` | Total number of MCMC iterations. The number of samples used for posterior inference is `n_iter - burn_in`. |
|
|
148
166
|
|
|
167
|
+
### SVI Parameters (`optimizer="svi"`)
|
|
168
|
+
|
|
169
|
+
| Parameter | Type | Default | Description |
|
|
170
|
+
|-----------|------|---------|-------------|
|
|
171
|
+
| `epochs` | `int` | `100` | Total number of passes over the dataset during variational inference. |
|
|
172
|
+
| `batch_size` | `int` | `256` | Mini-batch size for SVI updates. |
|
|
173
|
+
| `delay_rho` | `float` | `1.0` | Learning rate delay parameter ($\tau_0$) to stabilize early iterations. |
|
|
174
|
+
| `forgetting_rate` | `float` | `0.75` | Forgetting rate ($\kappa \in (0.5, 1.0]$) controlling the learning rate decay $\rho_t = (t + \tau_0)^{-\kappa}$. |
|
|
175
|
+
|
|
149
176
|
*Tip: For extremely high-dimensional datasets with heavy noise, tuning `lambda_0` to be larger and `theta` to be smaller will encourage more aggressive feature selection.*
|
|
150
177
|
|
|
151
178
|
## Reference
|
|
@@ -161,4 +188,8 @@ Understanding the key hyperparameters is crucial for fine-tuning the model's spa
|
|
|
161
188
|
pages = {1--50},
|
|
162
189
|
url = {http://jmlr.org/papers/v26/23-0142.html}
|
|
163
190
|
}
|
|
164
|
-
```
|
|
191
|
+
```
|
|
192
|
+
|
|
193
|
+
## Contributors
|
|
194
|
+
|
|
195
|
+
* **Nam Nam** ([@Neeze](https://github.com/Neeze)) - Developer of the SVI (Stochastic Variational Inference) optimizer, GPU/CUDA acceleration, and benchmarking suite.
|
|
@@ -0,0 +1,78 @@
|
|
|
1
|
+
import time
|
|
2
|
+
import numpy as np
|
|
3
|
+
from sklearn.preprocessing import StandardScaler
|
|
4
|
+
from bayesian_sparse_gmm.model import BayesianSparseGMM
|
|
5
|
+
|
|
6
|
+
def run_benchmark(n=5000, p=50, K_true=5, signal_features=10, backend="cuda"):
    """Time the Gibbs (MCMC) and SVI optimizers on one synthetic sparse dataset.

    Draws ``K_true`` equally sized unit-variance Gaussian clusters in ``p``
    dimensions whose means differ only in the first ``signal_features``
    coordinates, shuffles and standardizes the rows, fits
    ``BayesianSparseGMM`` once per optimizer, and prints the elapsed time,
    the number of clusters found and the number of selected features.

    Args:
        n: Requested sample count. Each cluster receives ``n // K_true`` rows,
            so the generated dataset has ``K_true * (n // K_true)`` rows.
        p: Number of features.
        K_true: Number of generating clusters.
        signal_features: Number of leading features with non-zero cluster means.
        backend: Backend name forwarded unchanged to ``BayesianSparseGMM``.

    Raises:
        ValueError: If ``K_true < 1``, ``n < K_true`` or ``signal_features``
            is outside ``[0, p]``.
    """
    # Fail early with a readable message instead of a ZeroDivisionError or a
    # NumPy broadcasting error further down.
    if K_true < 1 or n < K_true:
        raise ValueError("K_true must be >= 1 and n must be >= K_true")
    if not 0 <= signal_features <= p:
        raise ValueError("signal_features must be between 0 and p")

    print("\n=======================================================")
    print(f"BENCHMARK: SVI vs MCMC (n={n}, p={p}, K={K_true})")
    print(f"Backend: {backend.upper()}")
    print("=======================================================\n")

    # Generate synthetic sparse data: only the leading columns carry signal.
    rng = np.random.default_rng(42)
    means = np.zeros((K_true, p))
    means[:, :signal_features] = rng.normal(0, 3.0, size=(K_true, signal_features))

    n_per_k = n // K_true
    X_raw = np.vstack(
        [rng.normal(means[k], 1.0, size=(n_per_k, p)) for k in range(K_true)]
    )

    # Shuffle rows so clusters are not contiguous. Ground-truth labels are not
    # needed here because the benchmark only reports timing and model sizes.
    X_raw = X_raw[rng.permutation(len(X_raw))]

    # Standardize
    X = StandardScaler().fit_transform(X_raw)

    # Settings shared by both optimizers so the comparison is like-for-like.
    shared = dict(
        K_max=10,
        backend=backend,
        random_state=42,
        use_identity_covariance=True,
    )

    # --- MCMC (Default) ---
    print("Running MCMC (Gibbs Sampling) ...")
    # perf_counter is a monotonic high-resolution clock meant for measuring
    # intervals; time.time() can jump when the system clock is adjusted.
    t0 = time.perf_counter()
    gmm_mcmc = BayesianSparseGMM(
        optimizer="default",
        n_iter=100,  # Small number for benchmarking
        burn_in=20,
        **shared,
    )
    gmm_mcmc.fit(X)
    t_mcmc = time.perf_counter() - t0

    # --- SVI ---
    print("Running SVI (Natural Gradients) ...")
    t0 = time.perf_counter()
    gmm_svi = BayesianSparseGMM(
        optimizer="svi",
        epochs=10,  # 10 passes over the dataset
        batch_size=256,
        **shared,
    )
    gmm_svi.fit(X)
    t_svi = time.perf_counter() - t0

    # Guard the ratio so a zero elapsed time cannot raise ZeroDivisionError.
    speedup = t_mcmc / t_svi if t_svi > 0 else float("inf")

    print("\n--- RESULTS ---")
    print(f"MCMC Time: {t_mcmc:.2f} seconds")
    print(f"SVI Time:  {t_svi:.2f} seconds")
    print(f"Speedup:   {speedup:.2f}x")

    print(f"\nMCMC Found Clusters: {gmm_mcmc.K_hat_}")
    print(f"SVI Found Clusters:  {gmm_svi.K_hat_}")

    n_sel_mcmc = len(gmm_mcmc.selected_features_)
    n_sel_svi = len(gmm_svi.selected_features_)
    print(f"MCMC Selected Features: {n_sel_mcmc}/{p}")
    print(f"SVI Selected Features:  {n_sel_svi}/{p}")
|
|
71
|
+
|
|
72
|
+
if __name__ == "__main__":
    # Script entry point: the compute backend is the only command-line
    # option; the problem size of the comparison is fixed below.
    import argparse

    cli = argparse.ArgumentParser()
    cli.add_argument("--backend", type=str, default="cuda")
    options = cli.parse_args()

    run_benchmark(
        n=10000,
        p=100,
        K_true=10,
        signal_features=15,
        backend=options.backend,
    )
|
|
@@ -0,0 +1,182 @@
|
|
|
1
|
+
import time
|
|
2
|
+
import os
|
|
3
|
+
import numpy as np
|
|
4
|
+
import matplotlib.pyplot as plt
|
|
5
|
+
from sklearn.cluster import DBSCAN
|
|
6
|
+
from sklearn.metrics import adjusted_rand_score, adjusted_mutual_info_score, v_measure_score
|
|
7
|
+
from sklearn.datasets import make_moons
|
|
8
|
+
from sklearn.preprocessing import StandardScaler
|
|
9
|
+
|
|
10
|
+
from bayesian_sparse_gmm.model import BayesianSparseGMM
|
|
11
|
+
|
|
12
|
+
def generate_complex_arcs(n_samples=10000, n_features=1024, noise_level=0.15):
    """Generate four overlapping arc-shaped clusters embedded in high dimensions.

    Two ``make_moons`` pairs are drawn; the second pair is rotated and shifted
    so that it overlaps the first. The 2-D coordinates plus three derived
    features (product and squares) form five signal columns; the remaining
    ``n_features - 5`` columns are Gaussian noise. Columns and rows are then
    shuffled and the matrix is standardized.

    Args:
        n_samples: Total number of rows. Odd values are supported: the second
            pair of moons receives the extra row.
        n_features: Total number of columns; must be at least 5.
        noise_level: ``noise`` argument forwarded to ``make_moons``.

    Returns:
        tuple: ``(X, y, X_2d, feature_shuf)`` where ``X`` is the standardized
        ``(n_samples, n_features)`` matrix, ``y`` holds labels 0-3, ``X_2d``
        holds the unscaled 2-D coordinates in the same row order as ``X``, and
        ``feature_shuf[j]`` is the pre-shuffle column index stored at column
        ``j`` of ``X`` (pre-shuffle columns 0-4 are the signal features).

    Raises:
        ValueError: If ``n_features`` is smaller than the five signal columns.
    """
    n_signal = 5  # x, y, x*y, x**2, y**2
    if n_features < n_signal:
        raise ValueError(f"n_features must be at least {n_signal}")

    rng = np.random.default_rng(42)

    # Create 4 arcs (two pairs of moons) that intersect. Splitting as
    # half / remainder keeps the total at exactly n_samples for odd values;
    # with two equal halves the 2-D block would be one row short.
    n_first = n_samples // 2
    n_second = n_samples - n_first

    # Pair 1: standard moons (labels 0 and 1)
    X1, y1 = make_moons(n_samples=n_first, noise=noise_level, random_state=42)

    # Pair 2: rotated and shifted moons (labels 2 and 3)
    X2, y2 = make_moons(n_samples=n_second, noise=noise_level, random_state=43)
    y2 += 2

    # R is the usual counter-clockwise rotation matrix for column vectors.
    # The points are row vectors multiplied on the right, which applies R.T,
    # so they are turned by 60 degrees clockwise.
    angle = np.radians(60)
    c, s = np.cos(angle), np.sin(angle)
    R = np.array([[c, -s], [s, c]])
    X2 = X2 @ R

    # Shift both coordinates to overlap with pair 1
    X2 += 0.3

    X_2d = np.vstack((X1, X2))
    y = np.concatenate((y1, y2))

    # Embed in high-dimensional space: raw coordinates first, then a few
    # derived non-linear signal features.
    X = np.zeros((n_samples, n_features))
    X[:, 0] = X_2d[:, 0]
    X[:, 1] = X_2d[:, 1]
    X[:, 2] = X_2d[:, 0] * X_2d[:, 1]
    X[:, 3] = X_2d[:, 0] ** 2
    X[:, 4] = X_2d[:, 1] ** 2

    # Fill every remaining column with pure noise
    X[:, n_signal:] = rng.normal(0, 1.5, size=(n_samples, n_features - n_signal))

    # Randomly permute the features so the signal is not just in the first 5 dims
    feature_shuf = rng.permutation(n_features)
    X = X[:, feature_shuf]

    # Shuffle samples, keeping X, y and X_2d aligned
    sample_shuf = rng.permutation(n_samples)
    X, y, X_2d = X[sample_shuf], y[sample_shuf], X_2d[sample_shuf]

    # Scale
    X = StandardScaler().fit_transform(X)

    return X, y, X_2d, feature_shuf
|
|
66
|
+
|
|
67
|
+
def run_stress_test_benchmark(backend="cuda"):
    """Compare DBSCAN with SVI-fitted BayesianSparseGMM on the arcs dataset.

    Generates the dataset with ``generate_complex_arcs``, fits both models,
    prints timings, cluster counts, ARI/AMI/V-measure against the true labels
    and the recovered signal features, then writes a three-panel scatter plot
    to ``./visualize/stress_test_arcs.png`` (relative to the working directory).

    Args:
        backend: Backend name forwarded unchanged to ``BayesianSparseGMM``.
    """
    n_samples = 10000
    n_features = 1024

    print("\n=======================================================")
    print("STRESS TEST BENCHMARK: ARC-SHAPED CLUSTERS")
    print(f"n={n_samples}, p={n_features}, backend={backend.upper()}")
    print("=======================================================\n")

    print("Generating complex dataset...")
    X, y, X_2d, feature_shuf = generate_complex_arcs(n_samples, n_features)

    # feature_shuf[j] is the pre-shuffle column now stored at position j, so
    # the inverse permutation (argsort) maps pre-shuffle columns 0..4 -- the
    # signal features -- to their current positions. Plain ints keep the
    # printed list and sets free of NumPy scalar reprs.
    signal_indices = [int(j) for j in np.argsort(feature_shuf)[:5]]
    print(f"Signal feature indices: {signal_indices}")

    # --- DBSCAN ---
    # In 1024 dimensions, distance values are large.
    # For N(0,1) variables, expected Euclidean distance is roughly sqrt(2*1024) ~ 45
    # We set a somewhat large eps and min_samples to see how it handles it.
    print("\nRunning DBSCAN ...")
    # perf_counter is a monotonic high-resolution clock meant for intervals.
    t0 = time.perf_counter()
    dbscan = DBSCAN(eps=40.0, min_samples=10)
    dbscan.fit(X)
    t_dbscan = time.perf_counter() - t0

    # --- BSGMM (SVI) ---
    print("\nRunning Bayesian Sparse GMM (SVI) ...")
    t0 = time.perf_counter()
    gmm = BayesianSparseGMM(
        K_max=10,
        optimizer="svi",
        epochs=100,
        batch_size=512,
        lambda_0=1000.0,
        lambda_1=0.1,
        theta=0.5,
        backend=backend,
        random_state=42,
        use_identity_covariance=True,
        verbose=1,
    )
    gmm.fit(X)
    t_bsgmm = time.perf_counter() - t0

    # --- Results ---
    db_labels = dbscan.labels_
    gmm_labels = gmm.labels_

    ari_db = adjusted_rand_score(y, db_labels)
    ami_db = adjusted_mutual_info_score(y, db_labels)
    v_db = v_measure_score(y, db_labels)

    ari_gmm = adjusted_rand_score(y, gmm_labels)
    ami_gmm = adjusted_mutual_info_score(y, gmm_labels)
    v_gmm = v_measure_score(y, gmm_labels)

    # Guard the ratio so a zero elapsed time cannot raise ZeroDivisionError.
    speedup = t_dbscan / t_bsgmm if t_bsgmm > 0 else float("inf")

    print("\n--- RESULTS ---")
    print(f"DBSCAN Time: {t_dbscan:.2f} seconds")
    print(f"BSGMM Time:  {t_bsgmm:.2f} seconds")
    print(f"Speedup:     {speedup:.2f}x (Note: DBSCAN doesn't scale well with dimensions)")

    # Label -1 marks noise points, not a cluster, so it is excluded from the
    # cluster count and reported separately.
    noise_mask = db_labels == -1
    n_db_noise = int(np.sum(noise_mask))
    n_db_clusters = len(np.unique(db_labels[~noise_mask]))
    print(f"\nDBSCAN Clusters found: {n_db_clusters} (Noise points: {n_db_noise})")
    print(f"BSGMM Clusters found:  {gmm.K_hat_}")

    print(f"\nDBSCAN - ARI: {ari_db:.4f} | AMI: {ami_db:.4f} | V: {v_db:.4f}")
    print(f"BSGMM  - ARI: {ari_gmm:.4f} | AMI: {ami_gmm:.4f} | V: {v_gmm:.4f}")

    n_sel = len(gmm.selected_features_)
    print(f"\nBSGMM Selected Features: {n_sel}/{n_features}")

    # Check if BSGMM found the true signal features
    true_sig = set(signal_indices)
    sel = {int(j) for j in gmm.selected_features_}
    found = true_sig.intersection(sel)
    print(f"Signal features found: {found} (out of {true_sig})")

    # --- Visualization ---
    os.makedirs("./visualize", exist_ok=True)

    fig, axes = plt.subplots(1, 3, figsize=(18, 5))
    fig.suptitle(
        f"Stress Test: Mixed Arcs (n={n_samples}, p={n_features})",
        fontsize=14,
        fontweight='bold',
    )

    # Ground Truth
    pal = plt.cm.tab10(np.linspace(0, 0.9, 10))
    for k in np.unique(y):
        m = y == k
        axes[0].scatter(X_2d[m, 0], X_2d[m, 1], c=[pal[k % 10]], s=5, alpha=0.5)
    axes[0].set_title("Ground Truth (2D Projection)")

    # DBSCAN (noise points drawn in black)
    unique_db = np.unique(db_labels)
    pal_db = plt.cm.tab20(np.linspace(0, 1, max(len(unique_db), 2)))
    for idx, k in enumerate(unique_db):
        m = db_labels == k
        color = 'k' if k == -1 else pal_db[idx % 20]
        axes[1].scatter(X_2d[m, 0], X_2d[m, 1], c=[color], s=5, alpha=0.5)
    axes[1].set_title(f"DBSCAN (ARI={ari_db:.3f})")

    # BSGMM
    unique_gmm = np.unique(gmm_labels)
    pal_gmm = plt.cm.tab20(np.linspace(0, 1, max(len(unique_gmm), 2)))
    for idx, k in enumerate(unique_gmm):
        m = gmm_labels == k
        axes[2].scatter(X_2d[m, 0], X_2d[m, 1], c=[pal_gmm[idx % 20]], s=5, alpha=0.5)
    axes[2].set_title(f"BSGMM SVI (ARI={ari_gmm:.3f}, {n_sel} feats)")

    fig.tight_layout()
    fig.savefig("./visualize/stress_test_arcs.png", dpi=150, bbox_inches='tight')
    # Close this specific figure rather than whichever one is current.
    plt.close(fig)
    print("Saved './visualize/stress_test_arcs.png'")
|
|
175
|
+
|
|
176
|
+
if __name__ == "__main__":
    # Script entry point: the compute backend is the only command-line
    # option; everything else about the stress test is fixed in the function.
    import argparse

    cli = argparse.ArgumentParser()
    cli.add_argument("--backend", type=str, default="cuda")
    options = cli.parse_args()

    run_stress_test_benchmark(backend=options.backend)
|