bayesian-sparse-gmm 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,5 @@
1
+ from .model import BayesianSparseGMM
2
+
3
+ __version__ = "0.1.0"
4
+
5
+ __all__ = ["BayesianSparseGMM"]
@@ -0,0 +1,52 @@
1
+ import warnings
2
+
3
+ from ._base import ComputeBackend
4
+ from ._numpy import NumpyBackend
5
+
6
+ try:
7
+ from ._numba import NumbaBackend
8
+
9
+ NUMBA_AVAILABLE = True
10
+ except ImportError:
11
+ NUMBA_AVAILABLE = False
12
+
13
+ from ._cuda import CUPY_AVAILABLE, CUDABackend
14
+
15
+
16
def select_backend(preference: str = "auto", use_cuda: bool = False) -> ComputeBackend:
    """Return a compute backend instance matching ``preference``.

    ``preference`` is compared case-insensitively and must be one of
    ``"numpy"``, ``"numba"``, ``"cuda"`` or ``"auto"``:

    * ``"numpy"`` always yields a ``NumpyBackend``.
    * ``"numba"`` yields ``NumbaBackend(use_cuda=use_cuda)`` or, when Numba
      could not be imported, warns and yields a ``NumpyBackend``.
    * ``"cuda"`` prefers ``CUDABackend`` (CuPy), then
      ``NumbaBackend(use_cuda=True)``, and otherwise warns and yields a
      ``NumpyBackend``.
    * ``"auto"`` behaves like ``"cuda"`` when ``use_cuda`` is true (without
      the warning), then falls back to the CPU Numba backend and finally to
      NumPy.

    Raises
    ------
    ValueError
        If ``preference`` is none of the names above.
    """
    choice = preference.lower()

    if choice not in ("numpy", "cuda", "numba", "auto"):
        raise ValueError(f"Unknown backend preference: {choice}")

    if choice == "numpy":
        return NumpyBackend()

    if choice == "numba":
        if not NUMBA_AVAILABLE:
            warnings.warn(
                "Numba backend requested but Numba is not available. Falling back to NumPy."
            )
            return NumpyBackend()
        return NumbaBackend(use_cuda=use_cuda)

    # Only "cuda" and "auto" remain; both try the GPU backends in the same order.
    if choice == "cuda" or use_cuda:
        if CUPY_AVAILABLE:
            return CUDABackend()
        if NUMBA_AVAILABLE:
            return NumbaBackend(use_cuda=True)

    if choice == "cuda":
        warnings.warn(
            "CUDA backend requested but neither CuPy nor Numba CUDA is available. Falling back to NumPy."
        )
        return NumpyBackend()

    # "auto" without a usable GPU path: best CPU backend available.
    return NumbaBackend(use_cuda=False) if NUMBA_AVAILABLE else NumpyBackend()
@@ -0,0 +1,46 @@
1
+ from abc import ABC, abstractmethod
2
+
3
+ import numpy as np
4
+
5
+
6
class ComputeBackend(ABC):
    """Interface shared by every compute backend.

    A backend supplies the four numerical kernels declared below. All four
    are abstract, so a subclass must override each of them before it can be
    instantiated.
    """

    @abstractmethod
    def compute_cluster_log_probs(
        self,
        X: np.ndarray,
        mu: np.ndarray,
        log_w: np.ndarray,
        sigma2: np.ndarray,
    ) -> np.ndarray:
        """Step 2: log probability of every cluster assignment for every sample.

        Returns an array of shape (n, K_max).
        """

    @abstractmethod
    def compute_sufficient_stats(
        self,
        X: np.ndarray,
        z: np.ndarray,
        K_max: int,
    ) -> tuple[np.ndarray, np.ndarray]:
        """Step 4: per-cluster sizes and per-cluster feature sums.

        Returns the pair ``(n_k, sum_x)``.
        """

    @abstractmethod
    def sample_cluster_means(
        self,
        sum_x: np.ndarray,
        n_k: np.ndarray,
        tau2: np.ndarray,
        sigma2: np.ndarray,
        rng: np.random.Generator,
    ) -> np.ndarray:
        """Step 4b: draw the cluster means mu[k, j] from their Normal conditional."""

    @abstractmethod
    def sample_inverse_gaussian(
        self,
        mu_abs: np.ndarray,
        lam: np.ndarray,
        rng: np.random.Generator,
    ) -> np.ndarray:
        """Step 4a: draw tau^2 through an Inverse Gaussian sample."""
@@ -0,0 +1,106 @@
1
+ import numpy as np
2
+
3
+ from ._base import ComputeBackend
4
+
5
+ try:
6
+ import cupy as cp
7
+
8
+ CUPY_AVAILABLE = True
9
+ except ImportError:
10
+ CUPY_AVAILABLE = False
11
+ cp = None
12
+
13
+
14
class CUDABackend(ComputeBackend):
    """CuPy-based GPU accelerated compute backend.

    Each method copies its NumPy inputs to the GPU with ``cp.asarray``, does
    the arithmetic there and hands NumPy arrays back through ``cp.asnumpy``.
    Random numbers are always drawn on the host from the supplied ``rng``.

    ``compute_cluster_log_probs`` and ``sample_cluster_means`` accept the
    per-feature variance ``sigma2`` declared by ``ComputeBackend`` and used by
    the other backends. Leaving ``sigma2`` as ``None`` treats every feature as
    having unit variance, which is what this class computed before it accepted
    the argument.
    """

    def __init__(self):
        if not CUPY_AVAILABLE:
            raise ImportError(
                "CuPy is not installed. Please install cupy to use CUDABackend."
            )

    def compute_cluster_log_probs(
        self,
        X: np.ndarray,
        mu: np.ndarray,
        log_w: np.ndarray,
        sigma2: "np.ndarray | None" = None,
    ) -> np.ndarray:
        """Step 2: Compute log probability of cluster assignment for all samples and clusters on GPU.

        Returns ``log_w[k] - 0.5 * sum_j (X[i, j] - mu[k, j])**2 / sigma2[j]``
        as an array of shape (n, K_max).
        """
        X_gpu = cp.asarray(X)
        mu_gpu = cp.asarray(mu)
        log_w_gpu = cp.asarray(log_w)

        if sigma2 is not None:
            # Dividing both operands by the per-feature standard deviation turns
            # the plain squared distance below into the variance-scaled one.
            std_gpu = cp.sqrt(cp.asarray(sigma2))
            X_gpu = X_gpu / std_gpu
            mu_gpu = mu_gpu / std_gpu

        # ||x - m||^2 expanded as ||x||^2 - 2 x.m + ||m||^2 so the cross term is one GEMM.
        x_sq = cp.sum(X_gpu**2, axis=1, keepdims=True)
        mu_sq = cp.sum(mu_gpu**2, axis=1, keepdims=True).T
        dist = x_sq - 2.0 * cp.dot(X_gpu, mu_gpu.T) + mu_sq
        # The expansion can go slightly negative through rounding.
        dist = cp.maximum(dist, 0.0)

        log_probs = log_w_gpu - 0.5 * dist
        return cp.asnumpy(log_probs)

    def compute_sufficient_stats(
        self, X: np.ndarray, z: np.ndarray, K_max: int
    ) -> tuple[np.ndarray, np.ndarray]:
        """Step 4: Compute cluster sizes and feature sums per cluster on GPU.

        Returns ``(n_k, sum_x)`` with ``n_k`` of length at least ``K_max`` and
        ``sum_x`` of shape (K_max, X.shape[1]).
        """
        # cupyx is distributed as part of CuPy; scatter_add is documented there,
        # while the top-level cp.scatter_add spelling is a deprecated alias.
        import cupyx

        X_gpu = cp.asarray(X)
        z_gpu = cp.asarray(z)

        n_k = cp.bincount(z_gpu, minlength=K_max)
        sum_x = cp.zeros((K_max, X.shape[1]), dtype=X_gpu.dtype)
        # Accumulates row i of X into row z[i] of sum_x, duplicates included.
        cupyx.scatter_add(sum_x, z_gpu, X_gpu)

        return cp.asnumpy(n_k), cp.asnumpy(sum_x)

    def sample_cluster_means(
        self,
        sum_x: np.ndarray,
        n_k: np.ndarray,
        tau2: np.ndarray,
        sigma2: "np.ndarray | None" = None,
        rng: "np.random.Generator | None" = None,
    ) -> np.ndarray:
        """Step 4b: Sample cluster means mu[k,j] ~ Normal on GPU.

        The posterior variance is ``1 / (n_k / sigma2 + 1 / tau2)`` and the
        posterior mean is that variance times ``sum_x / sigma2``.

        Raises
        ------
        TypeError
            If no random generator is supplied.
        """
        # Accept the former four-argument call (sum_x, n_k, tau2, rng), where the
        # generator arrives in the slot now occupied by sigma2.
        if rng is None and isinstance(sigma2, np.random.Generator):
            sigma2, rng = None, sigma2
        if rng is None:
            raise TypeError("sample_cluster_means() requires an 'rng' argument")

        sum_x_gpu = cp.asarray(sum_x)
        n_k_gpu = cp.asarray(n_k)[:, cp.newaxis]
        tau2_gpu = cp.asarray(tau2)

        noise = rng.normal(size=sum_x.shape)
        noise_gpu = cp.asarray(noise)

        if sigma2 is None:
            precision = n_k_gpu + 1.0 / tau2_gpu
            weighted_sum = sum_x_gpu
        else:
            sigma2_gpu = cp.asarray(sigma2)[cp.newaxis, :]
            precision = n_k_gpu / sigma2_gpu + 1.0 / tau2_gpu
            weighted_sum = sum_x_gpu / sigma2_gpu

        post_var = 1.0 / precision
        post_mean = post_var * weighted_sum
        mu_gpu = post_mean + cp.sqrt(post_var) * noise_gpu

        return cp.asnumpy(mu_gpu)

    def sample_inverse_gaussian(
        self, mu_abs: np.ndarray, lam: np.ndarray, rng: np.random.Generator
    ) -> np.ndarray:
        """Step 4a: Sample tau^2 via Inverse Gaussian on GPU using stable formula.

        Draws ``1 / tau^2`` from an Inverse Gaussian with mean
        ``lam / (mu_abs + 1e-10)`` (capped at 1e5) and shape ``lam**2``, then
        returns its reciprocal.
        """
        mu_abs_gpu = cp.asarray(mu_abs)
        lam_gpu = cp.asarray(lam)

        # Host-side randomness: a squared standard normal and a uniform per entry.
        y_noise = rng.normal(size=mu_abs.shape) ** 2
        u_noise = rng.uniform(size=mu_abs.shape)

        y_gpu = cp.asarray(y_noise)
        u_gpu = cp.asarray(u_noise)

        inv_mean = lam_gpu / (mu_abs_gpu + 1e-10)
        inv_mean = cp.minimum(inv_mean, 1e5)
        shape = lam_gpu**2
        inv_mean2 = inv_mean**2

        term = cp.sqrt(
            cp.maximum(0.0, 4.0 * inv_mean * shape * y_gpu + inv_mean2 * (y_gpu**2))
        )
        # Candidate root written as a quotient, so no difference of large terms is taken.
        x1 = inv_mean / (
            1.0 + (inv_mean * y_gpu) / (2.0 * shape) + term / (2.0 * shape)
        )

        # Accept x1 with probability mean / (mean + x1); otherwise use mean^2 / x1.
        cond = inv_mean / (inv_mean + x1 + 1e-15)
        mask = u_gpu <= cond

        res = cp.empty_like(inv_mean)
        res[mask] = x1[mask]
        res[~mask] = inv_mean2[~mask] / (x1[~mask] + 1e-15)

        tau2_gpu = 1.0 / res
        return cp.asnumpy(tau2_gpu)
@@ -0,0 +1,274 @@
1
+ import warnings
2
+
3
+ import numpy as np
4
+ from numba import cuda, njit, prange
5
+
6
+ from ._base import ComputeBackend
7
+
8
+ # =====================================================================
9
+ # Numba CPU parallel kernels
10
+ # =====================================================================
11
+
12
+
13
@njit(parallel=True, fastmath=True, cache=True)
def _compute_cluster_log_probs_numba(X, mu, log_w, sigma2):
    """Return ``log_w[k] - 0.5 * sum_j (X[i, j] - mu[k, j])**2 / sigma2[j]``.

    The result has one row per row of ``X`` and one column per row of ``mu``;
    rows are processed in parallel.
    """
    n_samples, n_features = X.shape
    n_clusters = mu.shape[0]
    out = np.empty((n_samples, n_clusters), dtype=X.dtype)
    for row in prange(n_samples):
        for c in range(n_clusters):
            acc = 0.0
            for f in range(n_features):
                delta = X[row, f] - mu[c, f]
                acc += delta * delta / sigma2[f]
            out[row, c] = log_w[c] - 0.5 * acc
    return out
26
+
27
+
28
@njit(parallel=True, fastmath=True, cache=True)
def _compute_sufficient_stats_numba(X, z, K_max):
    """Count the samples assigned to each cluster and sum their features.

    Returns ``(n_k, sum_x)``: ``n_k`` is an int64 vector of length ``K_max``
    and ``sum_x`` has shape (K_max, p) with the dtype of ``X``.
    """
    n_samples, n_features = X.shape

    # Cluster sizes are accumulated serially.
    counts = np.zeros(K_max, dtype=np.int64)
    for row in range(n_samples):
        counts[z[row]] += 1

    # Feature sums are parallelised over columns, so each parallel iteration
    # writes to its own column of ``totals``.
    totals = np.zeros((K_max, n_features), dtype=X.dtype)
    for f in prange(n_features):
        for row in range(n_samples):
            label = z[row]
            totals[label, f] += X[row, f]

    return counts, totals
42
+
43
+
44
@njit(parallel=True, fastmath=True, cache=True)
def _sample_cluster_means_numba(sum_x, n_k, tau2, sigma2, noise):
    """Turn pre-drawn standard-normal ``noise`` into samples of the cluster means.

    For each entry the variance is ``1 / (n_k[k] / sigma2[j] + 1 / tau2[k, j])``
    and the mean is that variance times ``sum_x[k, j] / sigma2[j]``.
    """
    n_clusters, n_features = sum_x.shape
    out = np.empty((n_clusters, n_features), dtype=sum_x.dtype)
    for c in prange(n_clusters):
        count = n_k[c]
        for f in range(n_features):
            variance = 1.0 / (count / sigma2[f] + 1.0 / tau2[c, f])
            center = variance * (sum_x[c, f] / sigma2[f])
            out[c, f] = center + np.sqrt(variance) * noise[c, f]
    return out
55
+
56
+
57
@njit(parallel=True, fastmath=True, cache=True)
def _sample_inverse_gaussian_numba(mu_abs, lam, y_noise, u_noise):
    """Sample tau^2 entry by entry from pre-drawn noise.

    ``lam`` is indexed as ``lam[0, j]``. ``y_noise`` supplies the squared
    normal draws and ``u_noise`` the uniform draws used to pick between the
    two candidate roots. The returned array holds the reciprocal of the
    Inverse Gaussian draw.
    """
    n_clusters, n_features = mu_abs.shape
    out = np.empty((n_clusters, n_features), dtype=mu_abs.dtype)
    for c in prange(n_clusters):
        for f in range(n_features):
            ig_mean = lam[0, f] / (mu_abs[c, f] + 1e-10)
            # Cap the mean so the products below stay finite for tiny |mu|.
            if ig_mean > 1e5:
                ig_mean = 1e5
            ig_shape = lam[0, f] ** 2
            ig_mean_sq = ig_mean**2
            chi = y_noise[c, f]
            root = np.sqrt(
                np.maximum(0.0, 4.0 * ig_mean * ig_shape * chi + ig_mean_sq * (chi**2))
            )
            first = ig_mean / (
                1.0 + (ig_mean * chi) / (2.0 * ig_shape) + root / (2.0 * ig_shape)
            )
            draw = u_noise[c, f]

            accept = ig_mean / (ig_mean + first + 1e-15)
            if draw <= accept:
                precision = first
            else:
                precision = ig_mean_sq / (first + 1e-15)
            out[c, f] = 1.0 / precision
    return out
80
+
81
+
82
+ # =====================================================================
83
+ # Numba CUDA GPU kernels
84
+ # =====================================================================
85
+
86
+
87
@cuda.jit
def _compute_cluster_log_probs_cuda(X, mu, log_w, sigma2, out):
    """CUDA kernel: one thread fills one row of ``out``.

    Writes ``log_w[k] - 0.5 * sum_j (X[i, j] - mu[k, j])**2 / sigma2[j]`` into
    ``out[i, k]`` for the row ``i`` given by the thread's 1-D grid index.
    """
    row = cuda.grid(1)
    n_samples = X.shape[0]
    n_clusters = mu.shape[0]
    n_features = X.shape[1]
    # Threads past the last row have nothing to do.
    if row < n_samples:
        for c in range(n_clusters):
            acc = 0.0
            for f in range(n_features):
                delta = X[row, f] - mu[c, f]
                acc += delta * delta / sigma2[f]
            out[row, c] = log_w[c] - 0.5 * acc
100
+
101
+
102
@cuda.jit
def _compute_sufficient_stats_cuda(X, z, n_k, sum_x):
    """CUDA kernel: one thread adds one sample into its cluster's totals.

    ``n_k`` and ``sum_x`` are updated with atomic adds, so they must hold the
    starting totals (zeros in the caller) before the launch.
    """
    row = cuda.grid(1)
    n_samples = X.shape[0]
    n_features = X.shape[1]
    if row < n_samples:
        label = z[row]
        # Several threads may target the same cluster, hence the atomics.
        cuda.atomic.add(n_k, label, 1)
        for f in range(n_features):
            cuda.atomic.add(sum_x, (label, f), X[row, f])
112
+
113
+
114
@cuda.jit
def _sample_cluster_means_cuda(sum_x, n_k, tau2, sigma2, noise, out):
    """CUDA kernel: one thread samples all feature means of one cluster.

    Uses the same formulas as the CPU kernel: variance
    ``1 / (n_k[k] / sigma2[j] + 1 / tau2[k, j])`` and mean equal to that
    variance times ``sum_x[k, j] / sigma2[j]``.
    """
    c = cuda.grid(1)
    n_clusters = sum_x.shape[0]
    n_features = sum_x.shape[1]
    if c < n_clusters:
        count = n_k[c]
        for f in range(n_features):
            variance = 1.0 / (count / sigma2[f] + 1.0 / tau2[c, f])
            center = variance * (sum_x[c, f] / sigma2[f])
            out[c, f] = center + np.sqrt(variance) * noise[c, f]
125
+
126
+
127
@cuda.jit
def _sample_inverse_gaussian_cuda(mu_abs, lam, y_noise, u_noise, out_tau2):
    """CUDA kernel: one thread samples tau^2 for every feature of one cluster.

    Mirrors the CPU kernel: ``lam`` is read as ``lam[0, j]``, the Inverse
    Gaussian mean is capped at 1e5, and the reciprocal of the draw is written
    to ``out_tau2[k, j]``.
    """
    c = cuda.grid(1)
    n_clusters = mu_abs.shape[0]
    n_features = mu_abs.shape[1]
    if c < n_clusters:
        for f in range(n_features):
            ig_mean = lam[0, f] / (mu_abs[c, f] + 1e-10)
            if ig_mean > 1e5:
                ig_mean = 1e5
            ig_shape = lam[0, f] ** 2
            ig_mean_sq = ig_mean**2
            chi = y_noise[c, f]
            root = np.sqrt(
                np.maximum(0.0, 4.0 * ig_mean * ig_shape * chi + ig_mean_sq * (chi**2))
            )
            first = ig_mean / (
                1.0 + (ig_mean * chi) / (2.0 * ig_shape) + root / (2.0 * ig_shape)
            )
            draw = u_noise[c, f]

            accept = ig_mean / (ig_mean + first + 1e-15)
            if draw <= accept:
                precision = first
            else:
                precision = ig_mean_sq / (first + 1e-15)
            out_tau2[c, f] = 1.0 / precision
150
+
151
+
152
+ # =====================================================================
153
+ # Numba backend class wrapper
154
+ # =====================================================================
155
+
156
+
157
class NumbaBackend(ComputeBackend):
    """Numba-accelerated compute backend supporting CPU parallel and GPU CUDA.

    With ``use_cuda=False`` every method forwards to the ``@njit`` kernels of
    this module. With ``use_cuda=True`` inputs are copied to the device, the
    matching ``@cuda.jit`` kernel is launched, and the result is copied back.
    Random numbers are always drawn on the host from the supplied ``rng``.

    Parameters
    ----------
    use_cuda : bool, default=False
        Request the CUDA kernels. If ``cuda.is_available()`` is false a warning
        is issued and the instance uses the CPU kernels instead.
    """

    def __init__(self, use_cuda: bool = False):
        self.use_cuda = use_cuda
        if use_cuda and not cuda.is_available():
            warnings.warn(
                "CUDA is requested but Numba CUDA is not available. Falling back to CPU multi-core."
            )
            self.use_cuda = False

    @staticmethod
    def _grid_size(n_items: int, threads_per_block: int) -> int:
        """Return the number of blocks needed to give every item one thread."""
        return (n_items + threads_per_block - 1) // threads_per_block

    def compute_cluster_log_probs(
        self, X: np.ndarray, mu: np.ndarray, log_w: np.ndarray, sigma2: np.ndarray
    ) -> np.ndarray:
        """Step 2: log probability of each cluster for each sample, shape (n, K_max)."""
        if not self.use_cuda:
            return _compute_cluster_log_probs_numba(X, mu, log_w, sigma2)

        n = X.shape[0]
        K_max = mu.shape[0]

        d_X = cuda.to_device(X)
        d_mu = cuda.to_device(mu)
        d_log_w = cuda.to_device(log_w)
        d_sigma2 = cuda.to_device(sigma2)
        # The kernel writes every element, so the output is allocated directly
        # on the device rather than uploaded from an uninitialised host array.
        d_out = cuda.device_array((n, K_max), dtype=X.dtype)

        threads_per_block = 256
        blocks_per_grid = self._grid_size(n, threads_per_block)
        _compute_cluster_log_probs_cuda[blocks_per_grid, threads_per_block](
            d_X, d_mu, d_log_w, d_sigma2, d_out
        )

        return d_out.copy_to_host()

    def compute_sufficient_stats(
        self, X: np.ndarray, z: np.ndarray, K_max: int
    ) -> tuple[np.ndarray, np.ndarray]:
        """Step 4: cluster sizes ``n_k`` and per-cluster feature sums ``sum_x``."""
        if not self.use_cuda:
            return _compute_sufficient_stats_numba(X, z, K_max)

        n, p = X.shape
        # The kernel accumulates with atomic adds, so the zero-initialised
        # totals really do have to be uploaded here.
        n_k = np.zeros(K_max, dtype=np.int64)
        sum_x = np.zeros((K_max, p), dtype=X.dtype)

        d_X = cuda.to_device(X)
        d_z = cuda.to_device(z)
        d_n_k = cuda.to_device(n_k)
        d_sum_x = cuda.to_device(sum_x)

        threads_per_block = 256
        blocks_per_grid = self._grid_size(n, threads_per_block)
        _compute_sufficient_stats_cuda[blocks_per_grid, threads_per_block](
            d_X, d_z, d_n_k, d_sum_x
        )

        d_n_k.copy_to_host(n_k)
        d_sum_x.copy_to_host(sum_x)
        return n_k, sum_x

    def sample_cluster_means(
        self,
        sum_x: np.ndarray,
        n_k: np.ndarray,
        tau2: np.ndarray,
        sigma2: np.ndarray,
        rng: np.random.Generator,
    ) -> np.ndarray:
        """Step 4b: sample the cluster means; returns an array shaped like ``sum_x``."""
        # Noise is drawn before branching so both paths consume the generator identically.
        noise = rng.normal(size=sum_x.shape)
        if not self.use_cuda:
            return _sample_cluster_means_numba(sum_x, n_k, tau2, sigma2, noise)

        K_max, p = sum_x.shape

        d_sum_x = cuda.to_device(sum_x)
        d_n_k = cuda.to_device(n_k)
        d_tau2 = cuda.to_device(tau2)
        d_sigma2 = cuda.to_device(sigma2)
        d_noise = cuda.to_device(noise)
        d_out = cuda.device_array((K_max, p), dtype=sum_x.dtype)

        threads_per_block = 64
        blocks_per_grid = self._grid_size(K_max, threads_per_block)
        _sample_cluster_means_cuda[blocks_per_grid, threads_per_block](
            d_sum_x, d_n_k, d_tau2, d_sigma2, d_noise, d_out
        )

        return d_out.copy_to_host()

    def sample_inverse_gaussian(
        self, mu_abs: np.ndarray, lam: np.ndarray, rng: np.random.Generator
    ) -> np.ndarray:
        """Step 4a: sample tau^2; returns an array shaped like ``mu_abs``."""
        y_noise = rng.normal(size=mu_abs.shape) ** 2
        u_noise = rng.uniform(size=mu_abs.shape)
        if not self.use_cuda:
            return _sample_inverse_gaussian_numba(mu_abs, lam, y_noise, u_noise)

        K_max, p = mu_abs.shape

        d_mu_abs = cuda.to_device(mu_abs)
        d_lam = cuda.to_device(lam)
        d_y_noise = cuda.to_device(y_noise)
        d_u_noise = cuda.to_device(u_noise)
        d_out_tau2 = cuda.device_array((K_max, p), dtype=mu_abs.dtype)

        threads_per_block = 64
        blocks_per_grid = self._grid_size(K_max, threads_per_block)
        _sample_inverse_gaussian_cuda[blocks_per_grid, threads_per_block](
            d_mu_abs, d_lam, d_y_noise, d_u_noise, d_out_tau2
        )

        return d_out_tau2.copy_to_host()
@@ -0,0 +1,53 @@
1
+ import numpy as np
2
+
3
+ from ..utils import sample_inverse_gaussian as utils_sample_inverse_gaussian
4
+ from ._base import ComputeBackend
5
+
6
+
7
class NumpyBackend(ComputeBackend):
    """Pure NumPy implementation of the compute kernels for the Bayesian Sparse GMM."""

    def compute_cluster_log_probs(
        self, X: np.ndarray, mu: np.ndarray, log_w: np.ndarray, sigma2: np.ndarray
    ) -> np.ndarray:
        """Step 2: log probability of each cluster for each sample.

        Both ``X`` and ``mu`` are divided by ``sqrt(sigma2)`` so the
        variance-scaled squared distance can be expanded as
        ``||x||^2 - 2 x.m + ||m||^2`` with a single matrix product.
        """
        scale = np.sqrt(sigma2)
        samples = X / scale
        centers = mu / scale

        sample_norms = np.sum(samples**2, axis=1, keepdims=True)
        center_norms = np.sum(centers**2, axis=1, keepdims=True).T
        cross = np.dot(samples, centers.T)

        # Rounding in the expansion can produce tiny negative distances.
        sq_dist = np.maximum(sample_norms - 2.0 * cross + center_norms, 0.0)
        return log_w - 0.5 * sq_dist

    def compute_sufficient_stats(
        self, X: np.ndarray, z: np.ndarray, K_max: int
    ) -> tuple[np.ndarray, np.ndarray]:
        """Step 4: cluster sizes and per-cluster feature sums, as ``(n_k, sum_x)``."""
        totals = np.zeros((K_max, X.shape[1]), dtype=X.dtype)
        # Unbuffered add: rows sharing a label are all accumulated.
        np.add.at(totals, z, X)
        counts = np.bincount(z, minlength=K_max)
        return counts, totals

    def sample_cluster_means(
        self,
        sum_x: np.ndarray,
        n_k: np.ndarray,
        tau2: np.ndarray,
        sigma2: np.ndarray,
        rng: np.random.Generator,
    ) -> np.ndarray:
        """Step 4b: draw mu[k, j] from a Normal with the conditional mean and variance."""
        sigma2_row = sigma2[np.newaxis, :]
        counts_col = n_k[:, np.newaxis]

        variance = 1.0 / (counts_col / sigma2_row + 1.0 / tau2)
        center = variance * (sum_x / sigma2_row)
        return rng.normal(loc=center, scale=np.sqrt(variance))

    def sample_inverse_gaussian(
        self, mu_abs: np.ndarray, lam: np.ndarray, rng: np.random.Generator
    ) -> np.ndarray:
        """Step 4a: sample tau^2 as the reciprocal of an Inverse Gaussian draw.

        The draw itself is delegated to the package-level helper with mean
        ``lam / (mu_abs + 1e-10)`` and shape ``lam**2``.
        """
        ig_mean = lam / (mu_abs + 1e-10)
        ig_shape = lam**2
        precision = utils_sample_inverse_gaussian(ig_mean, ig_shape, rng)
        return 1.0 / precision
@@ -0,0 +1,29 @@
1
+ from dataclasses import dataclass
2
+ from typing import Optional
3
+
4
+
5
@dataclass(frozen=True)
class HyperParams:
    """Hyperparameters for the Bayesian GMM.

    Frozen dataclass: instances are immutable once created. The defaults
    match those of ``BayesianSparseGMM.__init__``.
    """

    # Spike prior parameter; the estimator docstring describes it as a large
    # value for sparse features.
    lambda_0: float = 1000.0
    # Slab prior parameter; described there as a small value for active features.
    lambda_1: float = 0.1
    # Dirichlet prior parameter for the mixing weights.
    alpha: float = 0.01
    # Prior probability of a feature being informative (slab).
    theta: float = 0.1
    # NOTE(review): presumably the two hyperparameters of the prior on the
    # per-feature variance sigma2 -- confirm against the sampler.
    a_sigma: float = 1.0
    b_sigma: float = 1.0
15
+
16
+
17
@dataclass(frozen=True)
class SamplerConfig:
    """Configuration options for the Gibbs sampler.

    Frozen dataclass: instances are immutable once created. The counting
    options are range-checked on construction so that an impossible setting
    is reported where it is made rather than later inside the sampler.

    Raises
    ------
    ValueError
        If ``K_max``, ``n_iter`` or ``thinning`` is smaller than 1, or if
        ``burn_in`` is negative.
    """

    # Maximum number of clusters.
    K_max: int = 15
    # Number of Gibbs sampler iterations.
    n_iter: int = 2000
    # Number of burn-in iterations to discard.
    burn_in: int = 500
    # Thinning interval for MCMC samples.
    thinning: int = 1
    warm_up_iters: int = 50
    # Backend name handed to ``select_backend``.
    backend: str = "auto"
    n_jobs: int = -1
    # Seed for the random number generator; None leaves it unseeded.
    random_state: Optional[int] = None
    # Progress reporting interval.
    verbose: int = 0

    def __post_init__(self) -> None:
        # Only fields whose lower bound is unambiguous are checked; n_jobs and
        # warm_up_iters are left alone because their conventions are defined
        # by the code that consumes them.
        lower_bounds = (
            ("K_max", 1),
            ("n_iter", 1),
            ("burn_in", 0),
            ("thinning", 1),
        )
        for name, minimum in lower_bounds:
            value = getattr(self, name)
            if value < minimum:
                raise ValueError(f"{name} must be >= {minimum}, got {value!r}")
@@ -0,0 +1,111 @@
1
+ import numpy as np
2
+
3
+
4
def _prepare_chains(trace: np.ndarray) -> tuple[np.ndarray, tuple]:
    """Reshape ``trace`` to (M, N, D) and return the per-draw parameter shape.

    Accepted layouts:

    * 1-D ``(samples,)`` -> ``(1, samples, 1)``, parameter shape ``()``
    * 2-D ``(samples, D)`` -> ``(1, samples, D)``, parameter shape ``(D,)``
    * 3-D ``(samples, K, P)`` -> ``(1, samples, K*P)``, parameter shape ``(K, P)``
    * 4-D ``(chains, samples, K, P)`` -> ``(chains, samples, K*P)``,
      parameter shape ``(K, P)``

    The second element of the result is the shape callers reshape their flat
    per-parameter output back to; it is not the shape of ``trace``.

    Raises
    ------
    ValueError
        If ``trace`` has any other number of dimensions.
    """
    # 1D: (samples,) -> (1, samples, 1)
    if trace.ndim == 1:
        return trace[np.newaxis, :, np.newaxis], ()

    # 2D: (samples, D) -> (1, samples, D)
    if trace.ndim == 2:
        return trace[np.newaxis, :, :], (trace.shape[1],)

    # 3D: (samples, K, P) -> (1, samples, K*P)
    if trace.ndim == 3:
        samples, k, p = trace.shape
        return trace.reshape(1, samples, k * p), (k, p)

    # 4D: (chains, samples, K, P) -> (chains, samples, K*P)
    if trace.ndim == 4:
        chains, samples, k, p = trace.shape
        return trace.reshape(chains, samples, k * p), (k, p)

    raise ValueError(f"Unsupported trace dimensions: {trace.ndim}")


def gelman_rubin(trace: np.ndarray) -> np.ndarray:
    """Compute Gelman-Rubin R-hat statistic for trace variables.

    Supports single or multiple chains. For single chain inputs, split-R-hat
    is computed: the chain is cut into two halves that are treated as two
    chains (an odd trailing draw is dropped).

    Special cases:

    * fewer than 4 draws per chain: 1.0 is returned for every parameter;
    * zero within-chain variance with identical chain means: 1.0;
    * zero within-chain variance with differing chain means: ``inf``. The
      chains sit at different constant values, which is the limit of
      ``sqrt(var / W)`` as ``W`` goes to 0 and must not be reported as
      converged.

    Returns
    -------
    A scalar for 1-D input, otherwise an array with the parameter shape
    reported by ``_prepare_chains``.
    """
    chains, param_shape = _prepare_chains(trace)
    n_chains, n_draws, n_params = chains.shape

    if n_draws < 4:
        # Too few samples to split/estimate
        rhat_flat = np.ones(n_params)
        return rhat_flat.reshape(param_shape) if param_shape else rhat_flat[0]

    if n_chains < 2:
        # Split-R-hat: first half and second half become two chains.
        half = n_draws // 2
        chains = np.concatenate(
            [chains[:, :half], chains[:, half : 2 * half]], axis=0
        )
        n_chains, n_draws, n_params = chains.shape

    # Means per chain
    chain_means = np.mean(chains, axis=1)
    overall_mean = np.mean(chain_means, axis=0)

    # Within-chain variance
    W = np.mean(np.var(chains, axis=1, ddof=1), axis=0)

    # Between-chain variance
    B = (n_draws / (n_chains - 1.0)) * np.sum(
        (chain_means - overall_mean) ** 2, axis=0
    )

    # Estimated marginal variance
    var_theta = ((n_draws - 1.0) / n_draws) * W + (1.0 / n_draws) * B

    rhat_flat = np.empty_like(var_theta)
    zero_w = W == 0.0
    rhat_flat[~zero_w] = np.sqrt(var_theta[~zero_w] / W[~zero_w])

    # With W == 0 every chain is constant. Whether the constants agree is read
    # off the chain means directly rather than from B, which can pick up
    # rounding noise through the averaged overall mean.
    means_differ = np.max(chain_means, axis=0) != np.min(chain_means, axis=0)
    rhat_flat[zero_w] = np.where(means_differ[zero_w], np.inf, 1.0)

    if param_shape:
        return rhat_flat.reshape(param_shape)
    return rhat_flat[0]
74
+
75
+
76
def effective_sample_size(trace: np.ndarray) -> np.ndarray:
    """Compute Effective Sample Size (ESS) for MCMC chains.

    For every parameter the autocorrelation of each chain is estimated from
    the full autocorrelation sequence, averaged over chains, and summed over
    consecutive lag pairs ``(1, 2), (3, 4), ...`` until a pair sum turns
    negative. The estimate is ``M * N / (1 + 2 * sum)``.

    A chain with zero variance contributes an all-zero autocorrelation.

    Returns
    -------
    A scalar for 1-D input, otherwise an array with the parameter shape
    reported by ``_prepare_chains``.
    """
    chains, param_shape = _prepare_chains(trace)
    n_chains, n_draws, n_params = chains.shape

    ess_flat = np.empty(n_params)

    for d in range(n_params):
        per_chain = []
        for series in chains[:, :, d]:
            variance = np.var(series)
            if variance == 0.0:
                per_chain.append(np.zeros(n_draws))
                continue

            centered = series - np.mean(series)
            full = np.correlate(centered, centered, mode="full")
            # Keep lags 0..N-1 and normalise so that lag 0 equals 1.
            per_chain.append(full[n_draws - 1 :] / (n_draws * variance))

        avg_rho = np.mean(per_chain, axis=0)

        tail = 0.0
        lag = 1
        while lag < n_draws - 1:
            pair = avg_rho[lag] + avg_rho[lag + 1]
            if pair < 0:
                break
            tail += pair
            lag += 2

        ess_flat[d] = (n_chains * n_draws) / (1.0 + 2.0 * tail)

    return ess_flat.reshape(param_shape) if param_shape else ess_flat[0]
@@ -0,0 +1,203 @@
1
+ from typing import Any, Dict, Optional
2
+
3
+ import numpy as np
4
+ from sklearn.base import BaseEstimator, ClusterMixin
5
+ from sklearn.utils.validation import check_array, check_is_fitted
6
+
7
+ from .backends import select_backend
8
+ from .config import HyperParams, SamplerConfig
9
+ from .sampler import GibbsSampler
10
+ from .utils import log_sum_exp
11
+
12
+
13
class BayesianSparseGMM(BaseEstimator, ClusterMixin):
    """Bayesian Sparse Gaussian Mixture Model for high-dimensional clustering.

    Parameters
    ----------
    K_max : int, default=15
        Maximum number of clusters.
    n_iter : int, default=2000
        Number of Gibbs sampler iterations.
    burn_in : int, default=500
        Number of burn-in iterations to discard.
    thinning : int, default=1
        Thinning interval for MCMC samples.
    warm_up_iters : int, default=50
        Forwarded unchanged to ``SamplerConfig.warm_up_iters``.
    lambda_0 : float, default=1000.0
        Spike prior parameter (large value for sparse features).
    lambda_1 : float, default=0.1
        Slab prior parameter (small value for active features).
    alpha : float, default=0.01
        Dirichlet prior parameter for mixing weights.
    theta : float, default=0.1
        Prior probability of a feature being informative (Slab).
    a_sigma : float, default=1.0
        Forwarded unchanged to ``HyperParams.a_sigma``.
    b_sigma : float, default=1.0
        Forwarded unchanged to ``HyperParams.b_sigma``.
    backend : str, default='auto'
        Computation backend: 'numpy', 'numba', 'cuda', or 'auto'.
    n_jobs : int, default=-1
        Number of parallel jobs (for Numba backend).
    random_state : int, optional
        Seed for the random number generator.
    verbose : int, default=0
        Progress reporting interval.

    Attributes
    ----------
    backend_ : ComputeBackend
        Backend instance chosen by ``select_backend``.
    states_ : list
        Label-aligned sampler states returned by the Gibbs sampler.
    w_, means_ : ndarray
        Averages of the mixing weights and cluster means over ``states_``.
    feature_probabilities_2d_ : ndarray
        Average of the ``gamma`` indicators over ``states_``.
    feature_probabilities_ : ndarray
        Per-feature maximum of ``feature_probabilities_2d_`` over the clusters
        that appear in ``labels_``.
    selected_features_ : ndarray
        Indices of features whose ``feature_probabilities_`` exceeds 0.5.
    labels_ : ndarray
        Most frequent cluster assignment of each training sample.
    """

    def __init__(
        self,
        K_max: int = 15,
        n_iter: int = 2000,
        burn_in: int = 500,
        thinning: int = 1,
        warm_up_iters: int = 50,
        lambda_0: float = 1000.0,
        lambda_1: float = 0.1,
        alpha: float = 0.01,
        theta: float = 0.1,
        a_sigma: float = 1.0,
        b_sigma: float = 1.0,
        backend: str = "auto",
        n_jobs: int = -1,
        random_state: Optional[int] = None,
        verbose: int = 0,
    ):
        self.K_max = K_max
        self.n_iter = n_iter
        self.burn_in = burn_in
        self.thinning = thinning
        self.warm_up_iters = warm_up_iters
        self.lambda_0 = lambda_0
        self.lambda_1 = lambda_1
        self.alpha = alpha
        self.theta = theta
        self.a_sigma = a_sigma
        self.b_sigma = b_sigma
        self.backend = backend
        self.n_jobs = n_jobs
        self.random_state = random_state
        self.verbose = verbose

    def fit(self, X: np.ndarray, y: Any = None) -> "BayesianSparseGMM":
        """Fit the GMM model using Gibbs sampling.

        Raises
        ------
        ValueError
            If the sampler returns no states, so nothing can be summarised.
        """
        X = check_array(X, dtype=[np.float64, np.float32])

        config = SamplerConfig(
            K_max=self.K_max,
            n_iter=self.n_iter,
            burn_in=self.burn_in,
            thinning=self.thinning,
            warm_up_iters=self.warm_up_iters,
            backend=self.backend,
            n_jobs=self.n_jobs,
            random_state=self.random_state,
            verbose=self.verbose,
        )
        hyperparams = HyperParams(
            lambda_0=self.lambda_0,
            lambda_1=self.lambda_1,
            alpha=self.alpha,
            theta=self.theta,
            a_sigma=self.a_sigma,
            b_sigma=self.b_sigma,
        )

        self.backend_ = select_backend(config.backend)
        sampler = GibbsSampler(config, hyperparams, self.backend_)

        states = sampler.run(X, seed=self.random_state)
        if len(states) == 0:
            raise ValueError(
                "The Gibbs sampler returned no samples; "
                "check n_iter, burn_in and thinning."
            )

        from .postprocessing import align_labels

        self.states_ = align_labels(states)

        self.w_ = np.mean([state.w for state in self.states_], axis=0)
        self.means_ = np.mean([state.mu for state in self.states_], axis=0)

        # Consolidate local gamma states
        self.feature_probabilities_2d_ = np.mean(
            [state.gamma for state in self.states_], axis=0
        )

        # Final label assignment based on mode over samples
        z_samples = np.array([state.z for state in self.states_])
        labels = np.empty(X.shape[0], dtype=int)
        for i in range(X.shape[0]):
            labels[i] = np.argmax(np.bincount(z_samples[:, i]))
        self.labels_ = labels

        # Compute 1D feature probabilities for backward compatibility based on active clusters
        active_clusters = np.unique(self.labels_)
        if len(active_clusters) > 0:
            self.feature_probabilities_ = np.max(
                self.feature_probabilities_2d_[active_clusters], axis=0
            )
        else:
            self.feature_probabilities_ = np.max(self.feature_probabilities_2d_, axis=0)
        self.selected_features_ = np.where(self.feature_probabilities_ > 0.5)[0]

        return self

    def _state_log_probs(self, X: np.ndarray, state: Any) -> np.ndarray:
        """Return the backend's per-cluster log probabilities of ``X`` for one state.

        Weights below ``1 / (2 * n_train)`` are replaced by 1e-300 before the
        logarithm. ``n_train`` is the number of training samples, read from
        ``labels_``, so the floor does not move with the size of ``X``: with
        the batch size in its place, a single-sample prediction would floor
        every weight below 0.5.
        """
        threshold = 1.0 / (2.0 * self.labels_.shape[0])
        w_safe = np.where(state.w < threshold, 1e-300, state.w)
        return self.backend_.compute_cluster_log_probs(
            X, state.mu, np.log(w_safe), state.sigma2
        )

    def predict_proba(self, X: np.ndarray) -> np.ndarray:
        """Predict posterior probability of each cluster for each sample.

        The probabilities are computed per stored state with a max-shifted
        softmax and then averaged over states.
        """
        X = check_array(X, dtype=[np.float64, np.float32])
        check_is_fitted(self, "states_")

        all_probs = []
        for state in self.states_:
            log_probs = self._state_log_probs(X, state)

            # Subtract the row maximum before exponentiating to avoid overflow.
            max_log = np.max(log_probs, axis=1, keepdims=True)
            probs = np.exp(log_probs - max_log)
            probs /= np.sum(probs, axis=1, keepdims=True)
            all_probs.append(probs)

        return np.mean(all_probs, axis=0)

    def predict(self, X: np.ndarray) -> np.ndarray:
        """Predict cluster index for each sample."""
        return np.argmax(self.predict_proba(X), axis=1)

    def score(self, X: np.ndarray, y: Any = None) -> float:
        """Compute the average GMM log-likelihood of the dataset.

        The per-sample log-likelihood is averaged over samples for each stored
        state, and those averages are then averaged over states.
        """
        X = check_array(X, dtype=[np.float64, np.float32])
        check_is_fitted(self, "states_")

        p = X.shape[1]

        log_liks = []
        for state in self.states_:
            log_probs = self._state_log_probs(X, state)

            # Normalising constant of the diagonal Gaussian, which the backend omits.
            const = -0.5 * p * np.log(2.0 * np.pi) - 0.5 * np.sum(np.log(state.sigma2))
            sample_log_lik = log_sum_exp(log_probs, axis=1) + const
            log_liks.append(np.mean(sample_log_lik))

        return float(np.mean(log_liks))

    @property
    def n_clusters_(self) -> int:
        """Number of active clusters."""
        check_is_fitted(self, "labels_")
        return len(np.unique(self.labels_))

    @property
    def trace_(self) -> Dict[str, np.ndarray]:
        """Full trace of MCMC samples."""
        check_is_fitted(self, "states_")
        return {
            "z": np.array([state.z for state in self.states_]),
            "w": np.array([state.w for state in self.states_]),
            "mu": np.array([state.mu for state in self.states_]),
            "gamma": np.array([state.gamma for state in self.states_]),
            "theta": np.array([state.theta for state in self.states_]),
            "sigma2": np.array([state.sigma2 for state in self.states_]),
        }
@@ -0,0 +1,46 @@
1
+ from typing import List
2
+
3
+ import numpy as np
4
+ from scipy.optimize import linear_sum_assignment
5
+
6
+ from .state import SamplerState
7
+
8
+
9
def align_labels(states: List[SamplerState]) -> List[SamplerState]:
    """Align cluster labels across MCMC iterations to solve label switching.

    The reference is the state whose mixing weights have the largest standard
    deviation. Every state (the reference included) is matched to it with the
    Hungarian algorithm (linear sum assignment) on the squared Euclidean
    distance between cluster means, and its ``mu``, ``w``, ``tau2``, ``gamma``
    and ``z`` are relabelled accordingly.

    The states are modified in place and the same list is returned. An empty
    list is returned unchanged.
    """
    if not states:
        return states

    weight_spread = [np.std(s.w) for s in states]
    reference_mu = states[int(np.argmax(weight_spread))].mu

    for state in states:
        delta = state.mu[:, np.newaxis, :] - reference_mu[np.newaxis, :, :]
        # rows[i] is a cluster of this state, cols[i] the reference cluster it maps to.
        rows, cols = linear_sum_assignment(np.sum(delta**2, axis=2))

        relabelled = {}
        for field in ("mu", "w", "tau2", "gamma"):
            current = getattr(state, field)
            moved = np.empty_like(current)
            moved[cols] = current[rows]
            relabelled[field] = moved

        # Masks are taken on the untouched state.z, so a chain of swaps cannot
        # relabel a sample twice.
        moved_z = np.empty_like(state.z)
        for old_label, new_label in zip(rows, cols):
            moved_z[state.z == old_label] = new_label

        for field, moved in relabelled.items():
            setattr(state, field, moved)
        state.z = moved_z

    return states
@@ -0,0 +1,167 @@
1
+ from typing import List, Optional
2
+
3
+ import numpy as np
4
+ from tqdm import tqdm
5
+
6
+ from .backends._base import ComputeBackend
7
+ from .config import HyperParams, SamplerConfig
8
+ from .state import SamplerState
9
+
10
+
11
class GibbsSampler:
    """Gibbs sampler orchestrator for the Bayesian Sparse GMM.

    Holds the run configuration, the prior hyper-parameters and the compute
    backend. ``initialize`` builds the starting state, ``sample_step``
    performs one full sweep and ``run`` iterates the sweep, collecting the
    thinned post-burn-in states.
    """

    def __init__(
        self, config: SamplerConfig, hyperparams: HyperParams, backend: ComputeBackend
    ):
        """Store the configuration, hyper-parameters and compute backend."""
        self.config = config
        self.hyperparams = hyperparams
        self.backend = backend

    def initialize(self, X: np.ndarray, rng: np.random.Generator) -> SamplerState:
        """Build the starting state from K-Means++ or, failing that, at random.

        Args:
            X: Data matrix; its shape is unpacked as ``(n, p)``.
            rng: Generator used to seed K-Means and for the random fallback.

        Returns:
            A ``SamplerState`` at iteration 0 with uniform weights, all
            inclusion indicators set to 1, unit ``tau2`` and ``sigma2``, and
            ``theta`` taken from the hyper-parameters.

        Warns:
            RuntimeWarning: If K-Means could not be imported or fitted and the
                random initialization was used instead.
        """
        n, p = X.shape
        K_max = self.config.K_max

        try:
            from sklearn.cluster import KMeans

            kmeans = KMeans(
                n_clusters=K_max,
                init="k-means++",
                n_init=1,
                random_state=rng.integers(0, 2**31 - 1),
            )
            kmeans.fit(X)
            z = kmeans.labels_
            mu = kmeans.cluster_centers_.copy()
        except Exception as exc:
            # The best-effort fallback is deliberate, so the exception stays
            # broad, but the failure is reported instead of being swallowed:
            # a silently random start hides problems with the input data.
            import warnings

            warnings.warn(
                f"K-Means initialization failed ({exc!r}); "
                "falling back to random initialization.",
                RuntimeWarning,
                stacklevel=2,
            )
            z = rng.choice(K_max, size=n)
            mu = rng.normal(size=(K_max, p))

        w = np.ones(K_max) / K_max
        gamma = np.ones((K_max, p), dtype=np.int32)
        theta = self.hyperparams.theta
        tau2 = np.ones((K_max, p))
        sigma2 = np.ones(p)

        return SamplerState(
            z=z,
            w=w,
            mu=mu,
            gamma=gamma,
            theta=theta,
            tau2=tau2,
            sigma2=sigma2,
            iteration=0,
        )

    def sample_step(
        self, X: np.ndarray, state: SamplerState, rng: np.random.Generator
    ) -> SamplerState:
        """Execute one complete Gibbs sampling iteration.

        Args:
            X: Data matrix; its shape is unpacked as ``(n, p)``.
            state: State produced by the previous iteration (not modified).
            rng: Generator supplying all random draws of this sweep.

        Returns:
            A new ``SamplerState`` whose ``iteration`` is ``state.iteration + 1``.
            ``theta`` is carried over unchanged.
        """
        K_max = self.config.K_max
        n, p = X.shape
        hp = self.hyperparams

        # STEP 1: Update cluster assignments (z) using the previous mixing weights.
        # Weights below 1 / (2n) are replaced by 1e-300 before taking the log,
        # so those components receive a negligible log-weight.
        threshold = 1.0 / (2.0 * n)
        w_safe = np.where(state.w < threshold, 1e-300, state.w)
        log_w = np.log(w_safe)
        log_probs = self.backend.compute_cluster_log_probs(
            X, state.mu, log_w, state.sigma2
        )
        # Subtract the row maximum before exponentiating to avoid overflow.
        max_log = np.max(log_probs, axis=1, keepdims=True)
        probs = np.exp(log_probs - max_log)
        probs /= np.sum(probs, axis=1, keepdims=True)

        # Draw one label per row: count the cumulative probabilities lying
        # strictly below a uniform draw, then clip to guard against rounding.
        cumsum = np.cumsum(probs, axis=1)
        u = rng.uniform(size=(n, 1))
        z = np.sum(cumsum < u, axis=1)
        z = np.clip(z, 0, K_max - 1)

        # STEP 2: Update cluster mixing weights (w) using the new cluster assignments
        n_k = np.bincount(z, minlength=K_max)
        w = rng.dirichlet(hp.alpha + n_k)

        # STEP 3: Update feature inclusion indicators (gamma) using active cluster mask
        active_mask = n_k > 0
        if not active_mask.any():
            # No occupied cluster: treat every cluster as active.
            active_mask = np.ones(K_max, dtype=bool)

        gamma = np.zeros((K_max, p), dtype=np.int32)
        if state.iteration < self.config.warm_up_iters:
            # During warm-up, force gamma to 1 for all active clusters to allow means to migrate
            gamma[active_mask] = 1
        else:
            # Local feature selection: sample gamma_kj independently for active clusters
            active_mu = state.mu[active_mask]
            log_laplace_slab = (
                np.log(hp.lambda_1) - np.log(2.0)
            ) - hp.lambda_1 * np.abs(active_mu)
            log_laplace_spike = (
                np.log(hp.lambda_0) - np.log(2.0)
            ) - hp.lambda_0 * np.abs(active_mu)

            # Keep theta strictly inside (0, 1) so both logs stay finite.
            safe_theta = np.clip(state.theta, 1e-15, 1.0 - 1e-15)
            log_P_slab = np.log(safe_theta) + log_laplace_slab
            log_P_spike = np.log(1.0 - safe_theta) + log_laplace_spike

            # Normalise the two-way posterior in a numerically stable way.
            max_log = np.maximum(log_P_slab, log_P_spike)
            prob_slab = np.exp(log_P_slab - max_log)
            prob_spike = np.exp(log_P_spike - max_log)
            p_slab = prob_slab / (prob_slab + prob_spike)

            gamma[active_mask] = rng.binomial(1, p_slab)

        # STEP 3b: Update feature-specific variances (sigma2)
        # sigma2_j ~ Inverse-Gamma(a_sigma + N / 2, b_sigma + 0.5 * sum_i (X_ij - mu_{Z_i, j})^2)
        # The residuals use the previous means with the new assignments.
        residuals_sq = (X - state.mu[z]) ** 2
        sum_residuals_sq = np.sum(residuals_sq, axis=0)

        shape_sig = hp.a_sigma + 0.5 * n
        scale_sig = 1.0 / (hp.b_sigma + 0.5 * sum_residuals_sq)

        # Inverse-Gamma draw obtained as the reciprocal of a Gamma draw; the
        # floor avoids dividing by an underflowed sample.
        gamma_sig_sample = rng.gamma(shape_sig, scale_sig)
        sigma2 = 1.0 / np.maximum(gamma_sig_sample, 1e-15)

        # STEP 4: Update cluster means (mu) and auxiliary variables (tau2)
        # The penalty is lambda_1 where gamma == 1 and lambda_0 elsewhere.
        lam = np.where(gamma == 1, hp.lambda_1, hp.lambda_0)
        tau2 = self.backend.sample_inverse_gaussian(np.abs(state.mu), lam, rng)

        # NOTE(review): the backend's count output presumably equals n_k
        # above; confirm against the backend implementation.
        n_k_new, sum_x = self.backend.compute_sufficient_stats(X, z, K_max)
        mu = self.backend.sample_cluster_means(sum_x, n_k_new, tau2, sigma2, rng)

        return SamplerState(
            z=z,
            w=w,
            mu=mu,
            gamma=gamma,
            theta=state.theta,
            tau2=tau2,
            sigma2=sigma2,
            iteration=state.iteration + 1,
        )

    def run(self, X: np.ndarray, seed: Optional[int] = None) -> List[SamplerState]:
        """Run the Gibbs sampler chain and return thinned post-burn-in states.

        Args:
            X: Data matrix passed to ``initialize`` and to every sweep.
            seed: Seed for ``numpy.random.default_rng``.

        Returns:
            The states of iterations ``i > burn_in`` for which
            ``(i - burn_in)`` is a multiple of ``thinning``, in chain order.
            ``thinning`` must be non-zero because it is used as a modulus.
        """
        rng = np.random.default_rng(seed)
        state = self.initialize(X, rng)

        n_iter = self.config.n_iter
        burn_in = self.config.burn_in
        thinning = self.config.thinning
        verbose = self.config.verbose

        states = []
        iterator = tqdm(range(1, n_iter + 1), disable=not verbose, desc="Gibbs Sampler")
        for i in iterator:
            state = self.sample_step(X, state, rng)

            if i > burn_in and (i - burn_in) % thinning == 0:
                states.append(state)

        return states
@@ -0,0 +1,17 @@
1
+ from dataclasses import dataclass
2
+
3
+ import numpy as np
4
+
5
+
6
@dataclass
class SamplerState:
    """Snapshot of every sampled quantity at one MCMC iteration.

    Shapes noted below are those produced by ``GibbsSampler.initialize``,
    with ``n`` samples, ``p`` features and ``K_max`` components.
    """

    # Cluster label of each sample, length n.
    z: np.ndarray
    # Mixing weights, length K_max.
    w: np.ndarray
    # Cluster means, shape (K_max, p).
    mu: np.ndarray
    # Feature inclusion indicators (0 or 1), shape (K_max, p).
    gamma: np.ndarray
    # Prior inclusion probability used when sampling gamma.
    theta: float
    # Auxiliary variables sampled together with the means, shape (K_max, p).
    tau2: np.ndarray
    # Feature-specific variances, length p.
    sigma2: np.ndarray
    # Number of Gibbs sweeps performed so far (0 for the initial state).
    iteration: int
@@ -0,0 +1,30 @@
1
+ import numpy as np
2
+
3
+
4
def log_sum_exp(x: np.ndarray, axis: int = -1, keepdims: bool = False) -> np.ndarray:
    """Compute ``log(sum(exp(x)))`` along ``axis`` in a numerically stable way.

    Args:
        x: Input values (log-domain).
        axis: Axis along which the sum is taken.
        keepdims: If true, the reduced axis is kept with length one.

    Returns:
        The log-sum-exp of ``x`` along ``axis``. A slice consisting only of
        ``-inf`` yields ``-inf`` and a slice containing ``+inf`` yields
        ``+inf``.
    """
    x = np.asarray(x)
    x_max = np.max(x, axis=axis, keepdims=True)
    # Shifting by an infinite maximum would evaluate ``inf - inf = nan``, so
    # only finite maxima are used as the shift; infinite ones fall back to 0.
    shift = np.where(np.isfinite(x_max), x_max, 0.0)
    # log(0) = -inf is the intended result for an all ``-inf`` slice.
    with np.errstate(divide="ignore"):
        res = shift + np.log(np.sum(np.exp(x - shift), axis=axis, keepdims=True))
    if not keepdims:
        res = np.squeeze(res, axis=axis)
    return res
11
+
12
+
13
def sample_inverse_gaussian(
    mean: np.ndarray, shape: np.ndarray, rng: np.random.Generator
) -> np.ndarray:
    """Draw Inverse Gaussian variates with the method of Michael et al. (1976).

    Args:
        mean: Mean parameter(s); values above ``1e5`` are clamped to ``1e5``.
        shape: Shape parameter(s), combined elementwise with ``mean``.
        rng: Generator supplying one normal and one uniform draw per element.

    Returns:
        Array of samples with the shape of ``mean``.
    """
    mu = np.minimum(mean, 1e5)
    chi2 = rng.normal(size=mu.shape) ** 2
    mu_sq = mu**2

    # Smaller root of the transformation's quadratic, written as
    # mu / (larger root / mu) so that no cancelling subtraction is needed.
    root = np.sqrt(np.maximum(0.0, 4.0 * mu * shape * chi2 + mu_sq * (chi2**2)))
    small_root = mu / (1.0 + (mu * chi2) / (2.0 * shape) + root / (2.0 * shape))

    # Keep the smaller root with probability mu / (mu + root); otherwise use
    # the other root mu^2 / root. The 1e-15 terms guard against division by 0.
    u = rng.uniform(size=mu.shape)
    keep_small = u <= mu / (mu + small_root + 1e-15)
    draws = np.where(keep_small, small_root, mu_sq / (small_root + 1e-15))
    return draws.astype(mu.dtype, copy=False)
@@ -0,0 +1,108 @@
1
+ Metadata-Version: 2.4
2
+ Name: bayesian-sparse-gmm
3
+ Version: 0.1.0
4
+ Summary: Bayesian Sparse Gaussian Mixture Model implementation in Python
5
+ Author-email: Nam Nam <nampvh4436@gmail.com>
6
+ License: MIT
7
+ Classifier: License :: OSI Approved :: MIT License
8
+ Classifier: Operating System :: OS Independent
9
+ Classifier: Programming Language :: Python :: 3
10
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
11
+ Requires-Python: >=3.9
12
+ Requires-Dist: numpy>=1.20.0
13
+ Requires-Dist: scikit-learn>=1.0.0
14
+ Requires-Dist: scipy>=1.7.0
15
+ Requires-Dist: tqdm>=4.60.0
16
+ Provides-Extra: dev
17
+ Requires-Dist: black>=22.0.0; extra == 'dev'
18
+ Requires-Dist: flake8>=4.0.0; extra == 'dev'
19
+ Requires-Dist: isort>=5.10.0; extra == 'dev'
20
+ Requires-Dist: matplotlib>=3.5.0; extra == 'dev'
21
+ Requires-Dist: pytest>=7.0.0; extra == 'dev'
22
+ Description-Content-Type: text/markdown
23
+
24
+ # Bayesian Sparse GMM
25
+
26
+ Bayesian Sparse Gaussian Mixture Model (GMM) implementation in Python.
27
+
28
+ This model places a sparsity-inducing prior (e.g., a Dirichlet distribution with parameter $\alpha_0 < 1$) on the mixture component weights, so that the number of active components is determined automatically by pruning unused ones.
29
+
30
+ ## Installation
31
+
32
+ To install the latest release:
33
+
34
+ ```bash
35
+ pip install bayesian-sparse-gmm
36
+ ```
37
+
38
+ Or for development (editable mode):
39
+
40
+ ```bash
41
+ git clone https://github.com/Coalyx/bayesian-sparse-gmm.git
42
+ cd bayesian-sparse-gmm
43
+ pip install -e .
44
+ ```
45
+
46
+ ## Quick Start
47
+
48
+ ```python
49
+ import numpy as np
50
+ from sklearn.datasets import make_blobs
51
+ from sklearn.preprocessing import StandardScaler
52
+ from bayesian_sparse_gmm import BayesianSparseGMM
53
+
54
+ # Append noise dimensions to true clusters to verify that the model successfully performs feature selection.
55
+ rng = np.random.default_rng(42)
56
+ X_clean, _ = make_blobs(n_samples=200, centers=3, n_features=2, cluster_std=0.5, random_state=42)
57
+ X_noise = rng.normal(loc=0.0, scale=1.0, size=(200, 8))
58
+ X = np.hstack([X_clean, X_noise])
59
+
60
+ # Standardize features to satisfy the zero-mean assumptions in the prior structure.
61
+ X = StandardScaler().fit_transform(X)
62
+
63
+ model = BayesianSparseGMM(
64
+ K_max=5,
65
+ n_iter=300,
66
+ burn_in=100,
67
+ lambda_0=10.0,
68
+ lambda_1=0.05,
69
+ random_state=42,
70
+ verbose=0
71
+ )
72
+ model.fit(X)
73
+
74
+ print(f"Number of active clusters: {model.n_clusters_}")
75
+ print(f"Selected informative features: {model.selected_features_}")
76
+ print(f"Feature inclusion probabilities: {model.feature_probabilities_.round(3)}")
77
+
78
+ labels = model.predict(X)
79
+ ```
80
+
81
+ ## Development and Testing
82
+
83
+ Install development dependencies:
84
+
85
+ ```bash
86
+ pip install -e ".[dev]"
87
+ ```
88
+
89
+ Run tests using `pytest`:
90
+
91
+ ```bash
92
+ pytest
93
+ ```
94
+
95
+ ## Reference
96
+
97
+ ```bib
98
+ @article{JMLR:v26:23-0142,
99
+ author = {Dapeng Yao and Fangzheng Xie and Yanxun Xu},
100
+ title = {Bayesian Sparse Gaussian Mixture Model for Clustering in High Dimensions},
101
+ journal = {Journal of Machine Learning Research},
102
+ year = {2025},
103
+ volume = {26},
104
+ number = {21},
105
+ pages = {1--50},
106
+ url = {http://jmlr.org/papers/v26/23-0142.html}
107
+ }
108
+ ```
@@ -0,0 +1,16 @@
1
+ bayesian_sparse_gmm/__init__.py,sha256=zp-sMET8SlJ0Yief18xABg6ZhKk5sJklPidSeDTmVBM,93
2
+ bayesian_sparse_gmm/config.py,sha256=fotJjoKIOLjyhaUfBU4aNVv6xpUjDGsKJLjbgXweOhY,634
3
+ bayesian_sparse_gmm/diagnostics.py,sha256=Sq0RfGaNxMPD_53myuCWSqdtbz26XaHiUHJAcEj_yYU,3401
4
+ bayesian_sparse_gmm/model.py,sha256=i_skapVvvAE4TLNPb41noNi3OrZ_x2RymV8DTY_krL0,7271
5
+ bayesian_sparse_gmm/postprocessing.py,sha256=ZBAP-70nk2BHUXQ7LNJ-I0O3rGfvdq0ByFhcehrBUc8,1434
6
+ bayesian_sparse_gmm/sampler.py,sha256=wWakTN8Ju_SA13s5ievtmzTmDopApuEBLjuQ-Il3gr0,5915
7
+ bayesian_sparse_gmm/state.py,sha256=qV-s174UGYH-1kUt6wnidd7MP54pBthqfPfqQTfEQ5s,297
8
+ bayesian_sparse_gmm/utils.py,sha256=pL-0ZdV29a5DxqkvaBbPyry3MmanqHNMfYUn_jiZmck,1044
9
+ bayesian_sparse_gmm/backends/__init__.py,sha256=VUsyddYY88DyAPJOI0mS7GiHjIfzVbnQmLAf0sj7JRI,1552
10
+ bayesian_sparse_gmm/backends/_base.py,sha256=E7InzD5jQcYzVl-AJ9IW2337U_D3TN82VtunTn81H1Y,1254
11
+ bayesian_sparse_gmm/backends/_cuda.py,sha256=FHPO9seyFVIDHt7pcM3xGmMeYiOI18i8UsBwS-Jh55U,3313
12
+ bayesian_sparse_gmm/backends/_numba.py,sha256=jACpeg3YNK4feDlkF0OfJ_EAOYiaWIRckeXr--lwnsc,9037
13
+ bayesian_sparse_gmm/backends/_numpy.py,sha256=tztz3TESWDQWZinGa6qcj4hsoCz6PpFHiEMaLiLEz-0,2129
14
+ bayesian_sparse_gmm-0.1.0.dist-info/METADATA,sha256=xjh1Z0UPVYQEom3ylQdo7jVfzuiFrm4E46M_m1tjHGA,2958
15
+ bayesian_sparse_gmm-0.1.0.dist-info/WHEEL,sha256=mffPy8wBnZQn2VnJUU5jE99KsxaSfiyMHV9Yt0aLVxs,87
16
+ bayesian_sparse_gmm-0.1.0.dist-info/RECORD,,
@@ -0,0 +1,4 @@
1
+ Wheel-Version: 1.0
2
+ Generator: hatchling 1.30.1
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any