ummd 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
ummd/__init__.py ADDED
@@ -0,0 +1,10 @@
1
+ from .ummd import MMD, kernel_matrix, calc_MMD, perm_MMD, perm_uMMD, generate_ummd_input
2
+
3
+ __all__ = [
4
+ "MMD",
5
+ "kernel_matrix",
6
+ "calc_MMD",
7
+ "perm_MMD",
8
+ "perm_uMMD",
9
+ "generate_ummd_input",
10
+ ]
ummd/py.typed ADDED
File without changes
ummd/ummd.py ADDED
@@ -0,0 +1,466 @@
1
+ """UMMD: a space and time efficient Maximum Mean Discrepancy two-sample test implementation for data with repeated sample values.
2
+
3
+ Maximum Mean Discrepancy (MMD) is a kernel-based test for whether two samples
4
+ are drawn from the same distribution. The naive kernel matrix costs O(N^2) in
5
+ time and memory; this implementation collapses repeated observations and works
6
+ over the U unique values instead, giving O(U^2), which can be a huge improvement
7
+ for data with many repeated values. Significance is assessed by permutation, with optional
8
+ testing over multiple RBF bandwidths aggregated via the Cauchy combination test.
9
+
10
+ Main entry point
11
+ ----------------
12
+ MMD : run the (unique) MMD two-sample test and return statistics and p-values.
13
+
14
+
15
+ Example
16
+ -------
17
+ >>> import numpy as np
18
+ >>> from ummd import MMD
19
+ >>> rng = np.random.default_rng(0)
20
+ >>> x = rng.integers(2, 7, size=200)
21
+ >>> y = rng.integers(-5, 2, size=200)
22
+ >>> res = MMD(x, y, n_permutations=999, bandwidths=5, cauchy_weighting='centered')
23
+ >>> res["p-value"]
24
+
25
+ References
26
+ ----------
27
+ Gretton et al. (2012), A Kernel Two-Sample Test.
28
+ Schrab et al. (2023), MMD Aggregated Two-Sample Test.
29
+ Liu and Xie (2019), Cauchy Combination Test.
30
+ """
31
+
32
+ import time
33
+ from scipy.spatial.distance import cdist, pdist
34
+ import numpy as np
35
+ import functools
36
+
37
+
38
def timer(func):
    """Decorate ``func`` so the duration of its most recent call is recorded.

    After every call that returns normally, the elapsed time in seconds
    (measured with ``time.perf_counter``) is stored on the returned wrapper
    as the attribute ``time_taken``. The attribute does not exist until the
    first call has completed, and it is not updated when ``func`` raises.

    Parameters
    ----------
    func : callable
        Function to wrap.

    Returns
    -------
    callable
        Wrapper with the same metadata as ``func`` (via ``functools.wraps``).
    """

    @functools.wraps(func)
    def timed(*args, **kwargs):
        began = time.perf_counter()
        outcome = func(*args, **kwargs)
        # Only reached when ``func`` returned without raising.
        timed.time_taken = time.perf_counter() - began
        return outcome

    return timed
48
+
49
+
50
@timer
def kernel_matrix(x, y, gammas):
    """Compute the RBF (Gaussian) kernel matrix between two samples.

    One kernel matrix is produced per bandwidth, using squared Euclidean distance
    with gamma = 1/(2*sigma**2), i.e. k(a, b) = exp(-gamma * ||a - b||**2).

    Parameters
    ----------
    x : np.ndarray, shape (m, d)
        First sample with ``m`` observations and ``d`` dimensions.
    y : np.ndarray, shape (n, d)
        Second sample with ``n`` observations and ``d`` dimensions.
    gammas : np.ndarray, shape (b,)
        1-D array of RBF kernel precisions, one per bandwidth.

    Returns
    -------
    np.ndarray, shape (b, m, n)
        Kernel matrices for each bandwidth, where ``b`` is the number of bandwidths,
        ``m`` is the number of observations in ``x``, and ``n`` the number in ``y``.

    Raises
    ------
    AssertionError
        If ``gammas`` is not a 1D ``np.ndarray``.
    """
    # Raised explicitly instead of through an ``assert`` statement: asserts are
    # removed when Python runs with -O, which would let a malformed ``gammas``
    # reach the broadcasting below. The exception type is kept for callers.
    if not (isinstance(gammas, np.ndarray) and gammas.ndim == 1):
        raise AssertionError("Gammas must be a 1D array of bandwidths.")

    D = cdist(x, y, metric="sqeuclidean")  # [m, n]
    # Broadcast every gamma against the single distance matrix.
    K = np.exp(-gammas[:, None, None] * D[None, :, :])  # [bandwidths, m, n]
    return K
85
+
86
+
87
@timer
def calc_MMD(K: np.ndarray, s: np.ndarray) -> np.ndarray:
    """Calculate the biased MMD statistic as the quadratic form ``s K s`` per bandwidth.

    Parameters
    ----------
    K : np.ndarray, shape (b, N, N)
        Square kernel matrix per bandwidth over the ``N`` points being weighted.
    s : np.ndarray, shape (N,)
        Signed weight vector aligned with the rows/columns of ``K``: positive
        weights for the first sample (1/m per observation) and negative weights
        for the second sample (-1/n per observation).

    Returns
    -------
    res : np.ndarray, shape (b,)
        MMD values for each tested bandwidth.
    """
    # ``s @ K`` broadcasts over the bandwidth axis -> [b, N]; the second product
    # contracts the remaining axis -> [b]. ``.T`` is a no-op for a 1-D ``s``.
    return s @ K @ s.T
104
+
105
+
106
def perm_MMD(K, s, rng, n_permutations=999):
    """Evaluate the biased MMD statistic under random relabellings of the samples.

    Each permutation shuffles the signed weight vector ``s`` independently, which
    is equivalent to reassigning observations to the two groups, and then
    evaluates the quadratic form ``s K s`` for every bandwidth.

    Parameters
    ----------
    K : np.ndarray, shape (b, N, N)
        Kernel matrix per bandwidth over the ``N`` pooled observations.
    s : np.ndarray, shape (N,)
        Signed weight vector (1/m for the first sample, -1/n for the second).
    rng : np.random.Generator
        Random number generator used to shuffle the weights.
    n_permutations : int, optional
        Number of permutations to perform (default is 999).

    Returns
    -------
    res : np.ndarray, shape (n_permutations, b)
        Permuted MMD values, one row per permutation and one column per bandwidth.
    """
    # One copy of the weight vector per permutation, each row shuffled on its own.
    stacked = np.tile(s, (n_permutations, 1))  # [permutations, N]
    shuffled = rng.permuted(stacked, axis=1)

    # Row-wise quadratic form: (S K) * S summed over the last axis.
    projected = shuffled @ K  # [bandwidths, permutations, N]
    stats = np.sum(projected * shuffled, 2)  # [bandwidths, permutations]
    return np.swapaxes(stats, 0, 1)  # [permutations, bandwidths]
131
+
132
+
133
def perm_uMMD(K, x_idx, y_idx, rng, n_permutations=0):
    """Evaluate the biased MMD statistic under permutation using unique values only.

    The pooled index vector is shuffled once per permutation; the first ``m``
    entries of each shuffle form the permuted first sample. Counting how often
    each unique value lands in that sample (and, by subtraction, in the second)
    gives a signed weight per unique value, so the statistic is a quadratic form
    over the ``u`` unique values rather than over all observations.

    Parameters
    ----------
    K : np.ndarray, shape (b, u, u)
        Kernel matrix per bandwidth over the ``u`` unique values.
    x_idx : np.ndarray, shape (m,)
        Unique-value index of every observation in the first sample.
    y_idx : np.ndarray, shape (n,)
        Unique-value index of every observation in the second sample.
    rng : np.random.Generator
        Random number generator used to shuffle the pooled indexes.
    n_permutations : int, optional
        Number of permutations to perform (default is 0).

    Returns
    -------
    res : np.ndarray, shape (n_permutations, b)
        Permuted MMD values, one row per permutation and one column per bandwidth.
    """
    m, n = len(x_idx), len(y_idx)
    u = K.shape[-1]
    pooled = np.concatenate((x_idx, y_idx))

    # Shuffle the pooled labels independently for every permutation and keep the
    # first m columns as the permuted first sample.
    shuffled = rng.permuted(np.tile(pooled, (n_permutations, 1)), axis=1)
    x_draws = shuffled[:, :m]  # [permutations, m]

    # np.bincount is 1-D only, so every permutation row is shifted into its own
    # block of u bins, counted in one pass, and folded back to [permutations, u].
    row_shift = u * np.arange(n_permutations)[:, None]  # [permutations, 1]
    flat_counts = np.bincount(
        (x_draws + row_shift).ravel(), minlength=n_permutations * u
    )
    x_counts = flat_counts.reshape(n_permutations, u)

    # Whatever was not drawn into the first sample belongs to the second.
    pooled_counts = np.bincount(pooled, minlength=u)  # [u, ]
    y_counts = pooled_counts - x_counts  # [permutations, u]

    weights = x_counts / m - y_counts / n  # [permutations, u]
    stats = np.sum((weights @ K) * weights, 2)  # [bandwidths, permutations]
    return np.swapaxes(stats, 0, 1)  # [permutations, bandwidths]
184
+
185
+
186
def get_bandwidths(xy, n=10):
    """Generate bandwidths for the RBF kernel based on the pairwise distances of the pooled sample.

    Generate a geometric grid of ``n`` sigma length-scales running from half the
    smallest non-zero pairwise Euclidean distance to twice the largest one.
    See Schrab et al. (2023) MMD Aggregated Two-Sample Test for motivation of this formula.

    Parameters
    ----------
    xy : np.ndarray, shape (m + n, d)
        Pooled samples from both distributions. Duplicate rows are tolerated;
        the zero distances they produce are ignored.
    n : int, optional
        Number of bandwidths to generate (default is 10). With ``n=1`` the
        single bandwidth returned is the smallest scale of the grid.

    Returns
    -------
    sigmas : np.ndarray, shape (n,)
        Sigma length-scales in increasing order.

    Raises
    ------
    ValueError
        If ``xy`` does not contain at least two distinct points.
    """
    D = pdist(xy, "euclidean")
    # Zero distances (duplicate rows) would make the smallest scale 0 and the
    # ratio below a division by zero, so only positive distances define the grid.
    D = D[D > 0]
    if D.size == 0:
        raise ValueError(
            "At least two distinct samples are required to derive bandwidths."
        )

    lambda_min, lambda_max = D.min(), D.max()
    # Exponents 0..1 in n equal steps; n == 1 is special-cased to avoid 0/0.
    t = np.arange(n) / (n - 1) if n != 1 else np.zeros(1)
    sigmas = (lambda_min / 2) * ((2 * lambda_max) / (lambda_min / 2)) ** t
    return sigmas
210
+
211
+
212
+ def cauchy_combination(p_vals, weight_distribution="uniform"):
213
+ """Combine p-values across bandwidths using the Cauchy combination method.
214
+
215
+ Follows the formula ``T = sum(w_i * tan((0.5 - p_i) * pi))`` where ``w_i`` are the weights for each p-value and ``p_i`` are the individual p-values.
216
+ See Liu and Xie (2019) Cauchy Combination Test... for more details.
217
+
218
+ Parameters
219
+ ----------
220
+ p_vals : np.ndarray, shape (b,)
221
+ Array of p-values to combine, where ``b`` is the number of bandwidths.
222
+ weight_distribution : str or None, optional
223
+ Method for weighting p-values in the combination. Options are:
224
+ - "uniform": Equal weights for all p-values (default).
225
+ - "left": More weight on smaller p-values.
226
+ - "right": More weight on larger p-values.
227
+ - "centered": More weight on p-values near 0.5.
228
+ - None: No combination, return NaN for the combined p-value.
229
+
230
+ Returns
231
+ -------
232
+ cauchy_p : float
233
+ Combined p-value from the Cauchy combination method.
234
+
235
+ Raises
236
+ ------
237
+ ValueError
238
+ If an invalid weight distribution is provided.
239
+ """
240
+
241
+ p_vals = np.clip(
242
+ p_vals, 1e-30, 1 - 1e-30
243
+ ) # Avoid extreme p-values that can cause numerical issues
244
+
245
+ def norm(x):
246
+ return x / np.sum(x)
247
+
248
+ match weight_distribution:
249
+ case "uniform":
250
+ w = norm(np.ones(len(p_vals)))
251
+ case "left":
252
+ w = norm(1 / np.arange(1, len(p_vals) + 1))
253
+ case "right":
254
+ w = norm(1 / np.arange(len(p_vals), 0, -1))
255
+ case "centered":
256
+ mid = (len(p_vals) - 1) / 2
257
+ w = norm(np.exp(-0.5 * ((np.arange(len(p_vals)) - mid) / (mid / 2)) ** 2))
258
+ case None:
259
+ return np.nan # No combination, return NaN for the combined p-value
260
+ case _:
261
+ raise ValueError(
262
+ "Invalid weight distribution. Must be one of ['uniform', 'left', 'right', 'centered', None]."
263
+ )
264
+
265
+ # Cauchy combination formula
266
+ T = np.sum(w * np.tan((0.5 - p_vals) * np.pi))
267
+ cauchy_p = 0.5 - (np.arctan(T) / np.pi)
268
+ return cauchy_p
269
+
270
+
271
@timer
def generate_ummd_input(x, y):
    """Collapse two samples into their shared unique values plus per-sample index vectors.

    Parameters
    ----------
    x : np.ndarray, shape (m, d)
        First sample with ``m`` observations and ``d`` dimensions.
    y : np.ndarray, shape (n, d)
        Second sample with ``n`` observations and ``d`` dimensions.

    Returns
    -------
    unique_values : np.ndarray, shape (u, d)
        Unique rows of the pooled sample, as ordered by ``np.unique``.
    x_idx : np.ndarray, shape (m,)
        For every observation of ``x``, the row of ``unique_values`` it equals.
    y_idx : np.ndarray, shape (n,)
        For every observation of ``y``, the row of ``unique_values`` it equals.
    """
    m = len(x)
    pooled = np.concatenate((x, y), axis=0)
    unique_values, inverse = np.unique(pooled, axis=0, return_inverse=True)
    # ``inverse`` follows the pooled order, so the first m entries belong to x.
    return unique_values, inverse[:m], inverse[m:]
297
+
298
+
299
def _resolve_bandwidths(bandwidths, xy):
    """Turn the user-facing ``bandwidths`` argument into a 1-D array of sigma length-scales."""
    if isinstance(bandwidths, np.ndarray):
        if bandwidths.ndim != 1:
            raise ValueError("Bandwidths array must be 1D.")
        return bandwidths

    wants_median = bandwidths is None or bandwidths == "median"
    if not wants_median and not isinstance(bandwidths, (int, np.integer)):
        raise ValueError("Bandwidths must be None, 'median', an int, or a 1D np.array.")

    pooled_unique = np.unique(xy, axis=0)
    if wants_median or bandwidths <= 1:
        # Median heuristic; also the fallback when a grid of <= 1 points is requested.
        return np.array([np.median(pdist(pooled_unique, metric="euclidean"))])
    return get_bandwidths(pooled_unique, n=bandwidths)


@timer
def MMD(
    x,
    y,
    unique=True,
    bandwidths="median",
    n_permutations=0,
    perm_batch_size=999,
    cauchy_weighting="uniform",
    seed=11,
):
    """Calculate the MMD of two samples and, optionally, permutation p-values.

    Maximum Mean Discrepancy (MMD) is a kernel-based distance between distributions.
    The biased statistic is ``MMD^2 = mean(K_xx) + mean(K_yy) - 2 * mean(K_xy)``,
    where ``K_xx`` and ``K_yy`` hold the kernel values between all pairs within X and
    within Y respectively, and ``K_xy`` holds the kernel values between each value of X
    and each value of Y. The kernel matrix requires O(N^2) time and space per bandwidth,
    which the unique value optimisation reduces to O(U^2), where U is the number of
    unique values across both samples.

    Parameters
    ----------
    x : array-like, shape (m,) or (m, d)
        First sample with ``m`` observations and ``d`` dimensions. 1-D input is
        treated as ``d = 1``.
    y : array-like, shape (n,) or (n, d)
        Second sample with ``n`` observations and ``d`` dimensions.
    unique : bool
        Whether to use the unique value optimisation, which can be much faster for
        discrete data with many repeated values. Default: True.
    bandwidths : str or int or np.ndarray, shape (b,)
        Kernel bandwidths as sigma length-scales (same units as the data). One of:
        - "median" or None: median pairwise Euclidean distance of the pooled unique sample (default).
        - int: generate that many bandwidths spanning the pooled pairwise distances
          (see get_bandwidths); values <= 1 fall back to the median bandwidth.
        - 1-D np.ndarray: the sigma values to test.
        Each sigma is converted internally to an RBF gamma via gamma = 1 / (2 * sigma**2).
    n_permutations : int
        Number of permutations used to approximate the p-value. Default: 0 (no p-values).
    perm_batch_size : int
        Number of permutations to calculate in each batch. Must be >= 1 when
        ``n_permutations > 0``. Default: 999.
    cauchy_weighting : str or None
        Method for weighting p-values across bandwidths in the Cauchy combination. If None,
        p-values per bandwidth are returned without aggregation. Weighting options:
        - "centered": Highest weight on bandwidths in the middle of the list, decreasing towards the ends.
        - "uniform": Equal weight on p-values across all bandwidths (default).
        - "left": More weight on bandwidths earlier in the list.
        - "right": More weight on bandwidths later in the list.
        - None: No Cauchy aggregation.
    seed : int
        Random seed for reproducibility. Default: 11.

    Returns
    -------
    res : dict
        Dictionary of MMD results with keys:
        - bandwidths: bandwidths used in the RBF kernel.
        - n_permutations: number of permutations used to approximate the p-value.
        - biased_MMD: MMD statistic per bandwidth.
        - p-values_per_bandwidth: permutation-derived p-value for each bandwidth
          (rounded to 6 decimals), or None when ``n_permutations`` is 0.
        - cauchy_method: method used for the Cauchy combination.
        - p-value: Cauchy-combined p-value; None when ``n_permutations`` is 0, when
          only one bandwidth is tested, or when ``cauchy_weighting`` is None.

    Raises
    ------
    ValueError
        If the bandwidths parameter is invalid.
        If the cauchy_weighting parameter is invalid (checked only when p-values are combined).
        If ``perm_batch_size`` is smaller than 1 while permutations are requested.
    """
    # Accept any array-like input; ndarrays pass through unchanged.
    x = np.asarray(x)
    y = np.asarray(y)

    # Promote 1-D samples to a single-feature 2-D layout.
    if x.ndim == 1:
        x = x[:, None]
    if y.ndim == 1:
        y = y[:, None]

    m = len(x)
    n = len(y)

    xy = np.concatenate((x, y), axis=0)  # [(m + n), d]

    bandwidths = _resolve_bandwidths(bandwidths, xy)

    # Convert bandwidths (sigma) to gammas
    gammas = 1.0 / (2.0 * bandwidths**2)

    # Calculate MMD
    if unique:
        unique_values, x_idx, y_idx = generate_ummd_input(x, y)  # [u, d], [m, ], [n, ]
        u = len(unique_values)
        K = kernel_matrix(unique_values, unique_values, gammas)  # [bandwidths, u, u]
        # Signed weight per unique value: its frequency in x minus its frequency in y.
        s_x = np.bincount(x_idx, minlength=u) / m  # [u, ]
        s_y = np.bincount(y_idx, minlength=u) / n  # [u, ]
        s = s_x - s_y  # [u, ]
    else:
        K = kernel_matrix(xy, xy, gammas)  # [bandwidths, (m + n), (m + n)]

        s_x = np.ones(m) / m  # [m, ]
        s_y = np.ones(n) / n * -1  # [n, ]
        s = np.concatenate((s_x, s_y))  # [(m + n), ]

    # Define results output dictionary
    res = {
        "bandwidths": bandwidths,
        "n_permutations": n_permutations,
        "biased_MMD": None,
        "p-values_per_bandwidth": None,
        "cauchy_method": cauchy_weighting,
        "p-value": None,
    }

    obs = calc_MMD(K, s)  # [bandwidths, ]
    res["biased_MMD"] = obs

    if n_permutations > 0:
        # A non-positive batch size would either fail inside np.arange (0) or
        # produce no batches at all (negative), leaving ``perms`` uninitialised
        # and the p-values meaningless.
        if perm_batch_size < 1:
            raise ValueError("perm_batch_size must be a positive integer.")

        # NOTE: p-values will not be identical for a given seed and n_permutations between unique=True and unique=False.
        # The unique and brute-force paths sample the same permutation null but realize different draws at a given seed,
        # so p-values differ by O(1/sqrt(B)) Monte-Carlo error (independent of repeats); they converge with increasing n_permutations.

        batches = np.arange(0, n_permutations, perm_batch_size)
        rng = np.random.default_rng(seed)
        perms = np.empty(
            (n_permutations, len(bandwidths))
        )  # [permutations, bandwidths]

        # Batch permutations to bound peak memory; the final batch may be shorter.
        for batch_start in batches:
            n_batch = min(perm_batch_size, n_permutations - batch_start)
            if unique:
                perms[batch_start : batch_start + n_batch] = perm_uMMD(
                    K, x_idx, y_idx, rng=rng, n_permutations=n_batch
                )  # [batch_size, bandwidths]
            else:
                perms[batch_start : batch_start + n_batch] = perm_MMD(
                    K, s, rng=rng, n_permutations=n_batch
                )  # [batch_size, bandwidths]

        # Rounding before the comparison keeps floating-point noise from breaking
        # ties; the +1 terms count the observed statistic as one of the draws.
        p_values = (np.sum(perms.round(10) >= obs.round(10), axis=0) + 1) / (
            n_permutations + 1
        )  # [bandwidths, ]
        res["p-values_per_bandwidth"] = p_values.round(6)

        # Cauchy combination of p-values across bandwidths
        if cauchy_weighting is not None and len(bandwidths) > 1:
            if isinstance(cauchy_weighting, str) and cauchy_weighting not in [
                "uniform",
                "left",
                "right",
                "centered",
            ]:
                raise ValueError(
                    "Invalid cauchy weighting method. Must be one of ['uniform', 'left', 'right', 'centered']."
                )

            res["p-value"] = cauchy_combination(
                p_values, weight_distribution=cauchy_weighting
            )

    return res
@@ -0,0 +1,73 @@
1
+ Metadata-Version: 2.4
2
+ Name: ummd
3
+ Version: 0.1.0
4
+ Summary: Efficient Maximum Mean Discrepancy two-sample testing for data with duplicate observations, scaling with unique values rather than sample size.
5
+ Project-URL: Homepage, https://github.com/pshdrugdiscoveryai/ummd
6
+ Project-URL: Repository, https://github.com/pshdrugdiscoveryai/ummd
7
+ Project-URL: Issues, https://github.com/pshdrugdiscoveryai/ummd/issues
8
+ Author-email: Morgan Thomas <morgan.thomas@ed.ac.uk>
9
+ License-Expression: MIT
10
+ License-File: LICENSE
11
+ Keywords: hypothesis-testing,kernel-methods,maximum-mean-discrepancy,mmd,statistics,two-sample-test
12
+ Classifier: Development Status :: 3 - Alpha
13
+ Classifier: Intended Audience :: Science/Research
14
+ Classifier: Operating System :: OS Independent
15
+ Classifier: Programming Language :: Python :: 3
16
+ Classifier: Programming Language :: Python :: 3.12
17
+ Classifier: Topic :: Scientific/Engineering :: Mathematics
18
+ Classifier: Typing :: Typed
19
+ Requires-Python: >=3.12
20
+ Requires-Dist: numpy>=2.4.6
21
+ Requires-Dist: scipy>=1.17.1
22
+ Provides-Extra: dev
23
+ Requires-Dist: ipykernel>=7.2.0; extra == 'dev'
24
+ Requires-Dist: ipywidgets>=8.0; extra == 'dev'
25
+ Requires-Dist: matplotlib>=3.10.9; extra == 'dev'
26
+ Requires-Dist: memory-profiler>=0.61.0; extra == 'dev'
27
+ Requires-Dist: pytest>=8.0; extra == 'dev'
28
+ Requires-Dist: ruff>=0.15.16; extra == 'dev'
29
+ Requires-Dist: seaborn>=0.13.2; extra == 'dev'
30
+ Requires-Dist: tqdm>=4.67.3; extra == 'dev'
31
+ Description-Content-Type: text/markdown
32
+
33
+ # Unique Maximum Mean Discrepancy (uMMD)
34
+
35
+ An efficient implementation of the Maximum Mean Discrepancy two-sample test for datasets with duplicate observations via count-weighting of unique values. This implementation scales with unique data values rather than sample size.
36
+
37
+ ## Installation
38
+
39
+ ```bash
40
+ pip install ummd
41
+ ```
42
+
43
+ ## Quick start
44
+
45
+ ```python
46
+ import numpy as np
47
+ from ummd import MMD
48
+
49
+ rng = np.random.default_rng(0)
50
+ x = rng.integers(0, 10, size=500) # sample from one distribution
51
+ y = rng.integers(2, 12, size=500) # sample from a shifted distribution
52
+
53
+ result = MMD(x, y, unique=True, bandwidths=10, n_permutations=999)
54
+
55
+ print(result["biased_MMD"]) # MMD statistic per bandwidth
56
+ # [ 0.04408069 0.053788 0.06124013 0.06328209 0.06290089 0.0602459 0.04713144 0.02831863 0.01431563 0.0066321 ]
57
+
58
+ print(result["p-value"]) # combined p-value across bandwidths
59
+ # 0.001
60
+ ```
61
+
62
+ ## Interpreting the result
63
+
64
+ MMD returns a dictionary with:
65
+
66
+ - `biased_MMD`: the MMD statistic for each tested bandwidth
67
+ - `p-values_per_bandwidth`: permutation p-value for each bandwidth
68
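+ - `p-value`: a single Cauchy-combined p-value across the bandwidths (`None` when `n_permutations=0`, when only one bandwidth is tested, or when `cauchy_weighting=None`)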
69
+ - `bandwidths`: the kernel bandwidths actually used
70
+
71
+ ## Why uMMD
72
+
73
+ A standard MMD test builds an `N x N` kernel matrix, so cost grows with sample size. When your data has many repeated values (counts, categories, discretised measurements), uMMD instead works over the `u` unique values, where `u << N`, giving the same test statistic at a fraction of the cost.
@@ -0,0 +1,7 @@
1
+ ummd/__init__.py,sha256=5zdo3S6rJf9UKQwzd73p07xEvYksHdcjGHJelXUw1Ds,212
2
+ ummd/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
3
+ ummd/ummd.py,sha256=WzPzznxABfX60TquMC5gLH3R-Rhu5AS5xQUZrBMmTJI,17402
4
+ ummd-0.1.0.dist-info/METADATA,sha256=mR7rWx52WzqutD6br0lO1knZAz7x9Y_vCa1IWQ4lprA,2939
5
+ ummd-0.1.0.dist-info/WHEEL,sha256=mffPy8wBnZQn2VnJUU5jE99KsxaSfiyMHV9Yt0aLVxs,87
6
+ ummd-0.1.0.dist-info/licenses/LICENSE,sha256=xg6gE-FxOCmHvvXT5fXGug28klSdzpjW56AEccOgOX4,1095
7
+ ummd-0.1.0.dist-info/RECORD,,
@@ -0,0 +1,4 @@
1
+ Wheel-Version: 1.0
2
+ Generator: hatchling 1.30.1
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Pandemic Science Hub Drug Discovery AI
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.