PyPI - ummd - Versions diffs - 0.1.0__tar.gz - Mend

ummd 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (8) hide show

ummd-0.1.0/.gitignore ADDED Viewed

@@ -0,0 +1,221 @@
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[codz]
+*$py.class
+# C extensions
+*.so
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+# PyInstaller
+#   Usually these files are written by a python script from a template
+#   before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py.cover
+.hypothesis/
+.pytest_cache/
+cover/
+# Translations
+*.mo
+*.pot
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+# Flask stuff:
+instance/
+.webassets-cache
+# Scrapy stuff:
+.scrapy
+# Sphinx documentation
+docs/_build/
+# PyBuilder
+.pybuilder/
+target/
+# Jupyter Notebook
+.ipynb_checkpoints
+# IPython
+profile_default/
+ipython_config.py
+# pyenv
+#   For a library or package, you might want to ignore these files since the code is
+#   intended to run in multiple environments; otherwise, check them in:
+# .python-version
+# pipenv
+#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+#   However, in case of collaboration, if having platform-specific dependencies or dependencies
+#   having no cross-platform support, pipenv may install dependencies that don't work, or not
+#   install all needed dependencies.
+# Pipfile.lock
+# UV
+#   Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
+#   This is especially recommended for binary packages to ensure reproducibility, and is more
+#   commonly ignored for libraries.
+# uv.lock
+# poetry
+#   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+#   This is especially recommended for binary packages to ensure reproducibility, and is more
+#   commonly ignored for libraries.
+#   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+# poetry.lock
+# poetry.toml
+# pdm
+#   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+#   pdm recommends including project-wide configuration in pdm.toml, but excluding .pdm-python.
+#   https://pdm-project.org/en/latest/usage/project/#working-with-version-control
+# pdm.lock
+# pdm.toml
+.pdm-python
+.pdm-build/
+# pixi
+#   Similar to Pipfile.lock, it is generally recommended to include pixi.lock in version control.
+# pixi.lock
+#   Pixi creates a virtual environment in the .pixi directory, just like venv module creates one
+#   in the .venv directory. It is recommended not to include this directory in version control.
+.pixi
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+__pypackages__/
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+# Redis
+*.rdb
+*.aof
+*.pid
+# RabbitMQ
+mnesia/
+rabbitmq/
+rabbitmq-data/
+# ActiveMQ
+activemq-data/
+# SageMath parsed files
+*.sage.py
+# Environments
+.env
+.envrc
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+# Spyder project settings
+.spyderproject
+.spyproject
+# Rope project settings
+.ropeproject
+# mkdocs documentation
+/site
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+# Pyre type checker
+.pyre/
+# pytype static type analyzer
+.pytype/
+# Cython debug symbols
+cython_debug/
+# PyCharm
+#   JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+#   be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+#   and can be added to the global gitignore or merged into this file.  For a more nuclear
+#   option (not recommended) you can uncomment the following to ignore the entire idea folder.
+# .idea/
+# Abstra
+#   Abstra is an AI-powered process automation framework.
+#   Ignore directories containing user credentials, local state, and settings.
+#   Learn more at https://abstra.io/docs
+.abstra/
+# Visual Studio Code
+#   Visual Studio Code specific template is maintained in a separate VisualStudioCode.gitignore
+#   that can be found at https://github.com/github/gitignore/blob/main/Global/VisualStudioCode.gitignore
+#   and can be added to the global gitignore or merged into this file. However, if you prefer,
+#   you could uncomment the following to ignore the entire vscode folder
+# .vscode/
+# Temporary file for partial code execution
+tempCodeRunnerFile.py
+# Ruff stuff:
+.ruff_cache/
+# PyPI configuration file
+.pypirc
+# Marimo
+marimo/_static/
+marimo/_lsp/
+__marimo__/
+# Streamlit
+.streamlit/secrets.toml
+# Custom
+.claude

ummd-0.1.0/LICENSE ADDED Viewed

@@ -0,0 +1,21 @@
+MIT License
+Copyright (c) 2026 Pandemic Science Hub Drug Discovery AI
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.

ummd-0.1.0/PKG-INFO ADDED Viewed

@@ -0,0 +1,73 @@
+Metadata-Version: 2.4
+Name: ummd
+Version: 0.1.0
+Summary: Efficient Maximum Mean Discrepancy two-sample testing for data with duplicate observations, scaling with unique values rather than sample size.
+Project-URL: Homepage, https://github.com/pshdrugdiscoveryai/ummd
+Project-URL: Repository, https://github.com/pshdrugdiscoveryai/ummd
+Project-URL: Issues, https://github.com/pshdrugdiscoveryai/ummd/issues
+Author-email: Morgan Thomas <morgan.thomas@ed.ac.uk>
+License-Expression: MIT
+License-File: LICENSE
+Keywords: hypothesis-testing,kernel-methods,maximum-mean-discrepancy,mmd,statistics,two-sample-test
+Classifier: Development Status :: 3 - Alpha
+Classifier: Intended Audience :: Science/Research
+Classifier: Operating System :: OS Independent
+Classifier: Programming Language :: Python :: 3
+Classifier: Programming Language :: Python :: 3.12
+Classifier: Topic :: Scientific/Engineering :: Mathematics
+Classifier: Typing :: Typed
+Requires-Python: >=3.12
+Requires-Dist: numpy>=2.4.6
+Requires-Dist: scipy>=1.17.1
+Provides-Extra: dev
+Requires-Dist: ipykernel>=7.2.0; extra == 'dev'
+Requires-Dist: ipywidgets>=8.0; extra == 'dev'
+Requires-Dist: matplotlib>=3.10.9; extra == 'dev'
+Requires-Dist: memory-profiler>=0.61.0; extra == 'dev'
+Requires-Dist: pytest>=8.0; extra == 'dev'
+Requires-Dist: ruff>=0.15.16; extra == 'dev'
+Requires-Dist: seaborn>=0.13.2; extra == 'dev'
+Requires-Dist: tqdm>=4.67.3; extra == 'dev'
+Description-Content-Type: text/markdown
+# Unique Maximum Mean Discrepancy (uMMD)
+An efficient implementation of the Maximum Mean Discrepancy two-sample test for datasets with duplicate observations via count-weighting of unique values. This implementation scales with unique data values rather than sample size.
+## Installation
+```bash
+pip install ummd
+```
+## Quick start
+```python
+import numpy as np
+from ummd import MMD
+rng = np.random.default_rng(0)
+x = rng.integers(0, 10, size=500)   # sample from one distribution
+y = rng.integers(2, 12, size=500)   # sample from a shifted distribution
+result = MMD(x, y, unique=True, bandwidths=10, n_permutations=999)
+print(result["biased_MMD"])   # MMD statistic per bandwidth
+# [ 0.04408069 0.053788   0.06124013 0.06328209 0.06290089 0.0602459 0.04713144 0.02831863 0.01431563 0.0066321 ]
+print(result["p-value"])    # combined p-value across bandwidths
+# 0.001
+```
+## Interpreting the result
+MMD returns a dictionary with:
+- `biased_MMD`: the MMD statistic for each tested bandwidth
+- `p-values_per_bandwidth`: permutation p-value for each bandwidth
+- `p-value`: a single Cauchy-combined p-value across the bandwidths (`None` when no permutations are run, only one bandwidth is tested, or `cauchy_weighting=None`)
+- `bandwidths`: the kernel bandwidths actually used
+## Why uMMD
+A standard MMD test builds an `n x n` kernel matrix over all `n` pooled observations, so cost grows with sample size. When your data has many repeated values (counts, categories, discretised measurements), uMMD instead works over the `u` unique values, where `u << n`, giving the same test at a fraction of the cost.

ummd-0.1.0/README.md ADDED Viewed

@@ -0,0 +1,41 @@
+# Unique Maximum Mean Discrepancy (uMMD)
+An efficient implementation of the Maximum Mean Discrepancy two-sample test for datasets with duplicate observations via count-weighting of unique values. This implementation scales with unique data values rather than sample size.
+## Installation
+```bash
+pip install ummd
+```
+## Quick start
+```python
+import numpy as np
+from ummd import MMD
+rng = np.random.default_rng(0)
+x = rng.integers(0, 10, size=500)   # sample from one distribution
+y = rng.integers(2, 12, size=500)   # sample from a shifted distribution
+result = MMD(x, y, unique=True, bandwidths=10, n_permutations=999)
+print(result["biased_MMD"])   # MMD statistic per bandwidth
+# [ 0.04408069 0.053788   0.06124013 0.06328209 0.06290089 0.0602459 0.04713144 0.02831863 0.01431563 0.0066321 ]
+print(result["p-value"])    # combined p-value across bandwidths
+# 0.001
+```
+## Interpreting the result
+MMD returns a dictionary with:
+- `biased_MMD`: the MMD statistic for each tested bandwidth
+- `p-values_per_bandwidth`: permutation p-value for each bandwidth
+- `p-value`: a single Cauchy-combined p-value across the bandwidths (`None` when no permutations are run, only one bandwidth is tested, or `cauchy_weighting=None`)
+- `bandwidths`: the kernel bandwidths actually used
+## Why uMMD
+A standard MMD test builds an `n x n` kernel matrix over all `n` pooled observations, so cost grows with sample size. When your data has many repeated values (counts, categories, discretised measurements), uMMD instead works over the `u` unique values, where `u << n`, giving the same test at a fraction of the cost.

ummd-0.1.0/pyproject.toml ADDED Viewed

@@ -0,0 +1,49 @@
+[project]
+name = "ummd"
+version = "0.1.0"
+description = "Efficient Maximum Mean Discrepancy two-sample testing for data with duplicate observations, scaling with unique values rather than sample size."
+authors = [
+    { name = "Morgan Thomas", email = "morgan.thomas@ed.ac.uk" },
+]
+license = "MIT"
+license-files = ["LICENSE"]
+keywords = ["statistics", "hypothesis-testing", "mmd", "maximum-mean-discrepancy", "two-sample-test", "kernel-methods"]
+readme = "README.md"
+requires-python = ">=3.12"
+classifiers = [
+    "Development Status :: 3 - Alpha",
+    "Intended Audience :: Science/Research",
+    "Programming Language :: Python :: 3",
+    "Programming Language :: Python :: 3.12",
+    "Operating System :: OS Independent",
+    "Topic :: Scientific/Engineering :: Mathematics",
+    "Typing :: Typed",
+]
+dependencies = [
+    "numpy>=2.4.6",
+    "scipy>=1.17.1",
+]
+[project.urls]
+Homepage = "https://github.com/pshdrugdiscoveryai/ummd"
+Repository = "https://github.com/pshdrugdiscoveryai/ummd"
+Issues = "https://github.com/pshdrugdiscoveryai/ummd/issues"
+[project.optional-dependencies]
+dev = [
+    "ipykernel>=7.2.0",
+    "matplotlib>=3.10.9",
+    "pytest>=8.0",
+    "seaborn>=0.13.2",
+    "memory-profiler>=0.61.0",
+    "tqdm>=4.67.3",
+    "ipywidgets>=8.0",
+    "ruff>=0.15.16",
+]
+[build-system]
+requires = ["hatchling"]
+build-backend = "hatchling.build"
+[tool.hatch.build.targets.sdist]
+include = ["src/ummd", "README.md", "LICENSE", "pyproject.toml"]

ummd-0.1.0/src/ummd/__init__.py ADDED Viewed

@@ -0,0 +1,10 @@
+from .ummd import MMD, kernel_matrix, calc_MMD, perm_MMD, perm_uMMD, generate_ummd_input
+# Names exported by ``from ummd import *``; this is the package's public API.
+__all__ = [
+    "MMD",
+    "kernel_matrix",
+    "calc_MMD",
+    "perm_MMD",
+    "perm_uMMD",
+    "generate_ummd_input",
+]

ummd-0.1.0/src/ummd/py.typed ADDED Viewed

File without changes

ummd-0.1.0/src/ummd/ummd.py ADDED Viewed

@@ -0,0 +1,466 @@
+"""UMMD: a space and time efficient Maximum Mean Discrepancy two-sample test implementation for data with repeated sample values.
+Maximum Mean Discrepancy (MMD) is a kernel-based test for whether two samples
+are drawn from the same distribution. The naive kernel matrix costs O(N^2) in
+time and memory; this implementation collapses repeated observations and works
+over the U unique values instead, giving O(U^2), which can be a huge improvement
+for data with many repeated values. Significance is assessed by permutation, with optional
+testing over multiple RBF bandwidths aggregated via the Cauchy combination test.
+Main entry point
+----------------
+MMD : run the (unique) MMD two-sample test and return statistics and p-values.
+Example
+-------
+>>> import numpy as np
+>>> from ummd import MMD
+>>> rng = np.random.default_rng(0)
+>>> x = rng.integers(2, 7, size=200)
+>>> y = rng.integers(-5, 2, size=200)
+>>> res = MMD(x, y, n_permutations=999, bandwidths=5, cauchy_weighting='centered')
+>>> res["p-value"]
+References
+----------
+Gretton et al. (2012), A Kernel Two-Sample Test.
+Schrab et al. (2023), MMD Aggregated Two-Sample Test.
+Liu and Xie (2019), Cauchy Combination Test.
+"""
+import time
+from scipy.spatial.distance import cdist, pdist
+import numpy as np
+import functools
+def timer(func):
+    """Wrap ``func`` so the wall-clock duration of its most recent call is recorded.
+    The elapsed time, measured with ``time.perf_counter``, is stored in seconds on the
+    returned wrapper as the attribute ``time_taken``. The attribute is only assigned
+    once a call has returned normally; a call that raises leaves it untouched.
+    Parameters
+    ----------
+    func : callable
+        Function to time.
+    Returns
+    -------
+    callable
+        Wrapper that forwards all arguments to ``func`` and returns its result,
+        carrying ``func``'s metadata via ``functools.wraps``.
+    """
+    @functools.wraps(func)
+    def timed(*args, **kwargs):
+        began = time.perf_counter()
+        outcome = func(*args, **kwargs)
+        # Only reached when ``func`` did not raise.
+        timed.time_taken = time.perf_counter() - began
+        return outcome
+    return timed
+@timer
+def kernel_matrix(x, y, gammas):
+    """Evaluate the RBF (Gaussian) kernel between every row of ``x`` and every row of ``y``.
+    The squared Euclidean distance matrix is computed once and reused for all
+    bandwidths: ``k(a, b) = exp(-gamma * ||a - b||**2)`` with ``gamma = 1 / (2 * sigma**2)``.
+    Parameters
+    ----------
+    x : np.ndarray, shape (m, d)
+        First set of ``m`` samples with ``d`` dimensions.
+    y : np.ndarray, shape (n, d)
+        Second set of ``n`` samples with ``d`` dimensions.
+    gammas : np.ndarray, shape (b,)
+        1-D array of RBF kernel precisions, one per bandwidth.
+    Returns
+    -------
+    np.ndarray, shape (b, m, n)
+        One kernel matrix per entry of ``gammas``.
+    Raises
+    ------
+    AssertionError
+        If ``gammas`` is not a 1-D ``np.ndarray`` (not checked when Python runs with ``-O``).
+    """
+    gammas_ok = isinstance(gammas, np.ndarray) and gammas.ndim == 1
+    assert gammas_ok, "Gammas must be a 1D array of bandwidths."
+    sq_dists = cdist(x, y, metric="sqeuclidean")  # [m, n]
+    # Broadcast every precision against the single shared distance matrix.
+    exponent = -gammas[:, np.newaxis, np.newaxis] * sq_dists[np.newaxis, :, :]  # [bandwidths, m, n]
+    return np.exp(exponent)
+@timer
+def calc_MMD(K: np.ndarray, s: np.ndarray) -> np.ndarray:
+    """Calculate the biased MMD statistic as the quadratic form ``s @ K @ s`` for each bandwidth.
+    Parameters
+    ----------
+    K : np.ndarray, shape (b, N, N)
+        Square kernel matrices, one per bandwidth. ``N`` must match the length of ``s``:
+        the ``m + n`` pooled samples, or the ``u`` unique values when duplicate
+        observations have been folded into the weights.
+    s : np.ndarray, shape (N,)
+        Signed weight vector: ``+1/m`` per sample from ``x`` and ``-1/n`` per sample from ``y``
+        (or the per-unique-value difference of those relative frequencies).
+    Returns
+    -------
+    np.ndarray, shape (b,)
+        MMD value for each tested bandwidth.
+    """
+    # (N,) @ (b, N, N) broadcasts over the bandwidth axis.
+    weighted = s @ K  # [bandwidths, N]
+    # ``s.T`` is a no-op for the documented 1-D ``s``; kept so other inputs behave as before.
+    return weighted @ s.T  # [bandwidths, ]
+def perm_MMD(K, s, rng, n_permutations=999):
+    """Evaluate the biased MMD statistic under random relabellings of the pooled sample.
+    Each permutation shuffles the signed weight vector ``s``, which reassigns the
+    ``x`` / ``y`` labels, and evaluates the quadratic form ``s_perm @ K @ s_perm``.
+    Parameters
+    ----------
+    K : np.ndarray, shape (b, N, N)
+        Kernel matrices of the pooled sample, one per bandwidth, where ``N = m + n``.
+    s : np.ndarray, shape (N,)
+        Signed weight vector: ``1/m`` for samples from ``x`` and ``-1/n`` for samples from ``y``.
+    rng : np.random.Generator
+        Random number generator used to draw the permutations.
+    n_permutations : int, optional
+        Number of permutations to perform (default is 999).
+    Returns
+    -------
+    np.ndarray, shape (n_permutations, b)
+        Permuted MMD values, one row per permutation and one column per bandwidth.
+    """
+    stacked = np.repeat(s[np.newaxis, :], repeats=n_permutations, axis=0)  # [permutations, N]
+    # ``permuted`` with axis=1 shuffles every row independently of the others.
+    shuffled = rng.permuted(stacked, axis=1)
+    projected = shuffled @ K  # [bandwidths, permutations, N]
+    quad_forms = (projected * shuffled).sum(axis=2)  # [bandwidths, permutations]
+    return quad_forms.T  # [permutations, bandwidths]
+def perm_uMMD(K, x_idx, y_idx, rng, n_permutations=0):
+    """Evaluate the biased MMD statistic under random relabellings, working over unique values.
+    The pooled index vector is shuffled once per permutation; the first ``m`` entries of
+    each shuffle form the permuted ``x`` sample. Counting how often every unique value
+    occurs in that sample gives a weight vector of length ``u``, so the quadratic form
+    is taken against the ``u x u`` kernel rather than an ``(m + n) x (m + n)`` one.
+    Parameters
+    ----------
+    K : np.ndarray, shape (b, u, u)
+        Kernel matrices of the unique values, one per bandwidth.
+    x_idx : np.ndarray, shape (m,)
+        For each sample of the first distribution, its index into the unique values.
+    y_idx : np.ndarray, shape (n,)
+        For each sample of the second distribution, its index into the unique values.
+    rng : np.random.Generator
+        Random number generator used to draw the permutations.
+    n_permutations : int, optional
+        Number of permutations to perform (default is 0).
+    Returns
+    -------
+    np.ndarray, shape (n_permutations, b)
+        Permuted MMD values, one row per permutation and one column per bandwidth.
+    """
+    pooled_idx = np.concatenate((x_idx, y_idx))
+    m, n = len(x_idx), len(y_idx)
+    u = K.shape[-1]
+    stacked = np.repeat(
+        pooled_idx[np.newaxis, :], repeats=n_permutations, axis=0
+    )  # [permutations, m + n]
+    shuffled = rng.permuted(stacked, axis=1)
+    x_draws = shuffled[:, :m]  # [permutations, m]
+    # Row-wise bincount in one call: shift the indexes of permutation p by p * u so every
+    # permutation counts into its own block of u bins, then fold the flat counts back.
+    row_offsets = u * np.arange(n_permutations)[:, None]  # [permutations, 1]
+    x_counts = np.bincount(
+        (x_draws + row_offsets).ravel(),  # [permutations * m, ]
+        minlength=n_permutations * u,
+    ).reshape(n_permutations, u)  # [permutations, u]
+    # The pooled counts never change, so the permuted y counts are whatever x did not take.
+    pooled_counts = np.bincount(pooled_idx, minlength=u)  # [u, ]
+    y_counts = pooled_counts - x_counts  # [permutations, u]
+    weights = x_counts / m - y_counts / n  # [permutations, u]
+    quad_forms = ((weights @ K) * weights).sum(axis=2)  # [bandwidths, permutations]
+    return quad_forms.T  # [permutations, bandwidths]
+def get_bandwidths(xy, n=10):
+    """Generate bandwidths for the RBF kernel based on the pairwise distances of the pooled sample.
+    Builds a geometric grid of ``n`` sigma length-scales running from half the smallest
+    non-zero pairwise Euclidean distance to twice the largest one.
+    See Schrab et al. (2023) MMD Aggregated Two-Sample Test for motivation of this formula.
+    Parameters
+    ----------
+    xy : np.ndarray, shape (m + n, d)
+        Pooled samples from both distributions.
+    n : int, optional
+        Number of bandwidths to generate (default is 10). With ``n == 1`` the lower end
+        of the grid is returned.
+    Returns
+    -------
+    sigmas : np.ndarray, shape (n,)
+        Sigma length-scales in increasing order.
+    Raises
+    ------
+    ValueError
+        If ``n`` is smaller than 1, or if ``xy`` holds fewer than two distinct samples.
+    """
+    if n < 1:
+        raise ValueError("n must be at least 1.")
+    distances = pdist(xy, "euclidean")
+    # Duplicate rows contribute zero distances; a zero lower end would make the
+    # ratio below infinite and the whole grid NaN, so only positive distances count.
+    distances = distances[distances > 0]
+    if distances.size == 0:
+        raise ValueError(
+            "At least two distinct samples are required to generate bandwidths."
+        )
+    lower = distances.min() / 2
+    upper = 2 * distances.max()
+    # Position of each grid point in [0, 1]; a single point cannot be spread by n - 1.
+    t = np.arange(n) / (n - 1) if n > 1 else np.zeros(1)
+    sigmas = lower * (upper / lower) ** t
+    return sigmas
+def cauchy_combination(p_vals, weight_distribution="uniform"):
+    """Combine p-values across bandwidths using the Cauchy combination method.
+    Follows the formula ``T = sum(w_i * tan((0.5 - p_i) * pi))`` where ``w_i`` are the weights
+    (normalised to sum to 1) and ``p_i`` are the individual p-values; the combined p-value is
+    ``0.5 - arctan(T) / pi``. See Liu and Xie (2019) Cauchy Combination Test for more details.
+    Parameters
+    ----------
+    p_vals : array-like, shape (b,)
+        P-values to combine, where ``b`` is the number of bandwidths.
+    weight_distribution : str or None, optional
+        How weight is spread over the *positions* of ``p_vals``. Options are:
+        - "uniform": Equal weights for all p-values (default).
+        - "left": Weight ``1 / rank`` counted from the first entry, favouring early entries.
+        - "right": Weight ``1 / rank`` counted from the last entry, favouring late entries.
+        - "centered": Gaussian-shaped weights peaking at the middle position.
+        - None: No combination, return NaN for the combined p-value.
+    Returns
+    -------
+    cauchy_p : float
+        Combined p-value from the Cauchy combination method.
+    Raises
+    ------
+    ValueError
+        If an invalid weight distribution is provided, or if ``p_vals`` is empty.
+    """
+    if weight_distribution is None:
+        return np.nan  # No combination, return NaN for the combined p-value
+    p_vals = np.atleast_1d(np.asarray(p_vals, dtype=float))
+    k = len(p_vals)
+    if k == 0:
+        raise ValueError("p_vals must contain at least one p-value.")
+    # Keep p strictly inside (0, 1). The upper bound is the largest float below 1,
+    # because ``1 - 1e-30`` rounds to exactly 1.0 in double precision.
+    p_vals = np.clip(p_vals, 1e-30, np.nextafter(1.0, 0.0))
+    match weight_distribution:
+        case "uniform":
+            w = np.ones(k)
+        case "left":
+            w = 1 / np.arange(1, k + 1)
+        case "right":
+            w = 1 / np.arange(k, 0, -1)
+        case "centered":
+            mid = (k - 1) / 2
+            if mid == 0:
+                # A single p-value has no spread to weight; avoid dividing by mid / 2 == 0.
+                w = np.ones(k)
+            else:
+                w = np.exp(-0.5 * ((np.arange(k) - mid) / (mid / 2)) ** 2)
+        case _:
+            raise ValueError(
+                "Invalid weight distribution. Must be one of ['uniform', 'left', 'right', 'centered', None]."
+            )
+    w = w / np.sum(w)
+    # Cauchy combination formula
+    T = np.sum(w * np.tan((0.5 - p_vals) * np.pi))
+    cauchy_p = 0.5 - (np.arctan(T) / np.pi)
+    return cauchy_p
+@timer
+def generate_ummd_input(x, y):
+    """Collapse two samples into their shared unique values plus per-sample index vectors.
+    The samples are pooled along the first axis and de-duplicated row-wise; every original
+    observation is then described by the position of its row in the unique-value array.
+    Parameters
+    ----------
+    x : np.ndarray, shape (m, d)
+        First distribution with ``m`` samples and ``d`` dimensions.
+    y : np.ndarray, shape (n, d)
+        Second distribution with ``n`` samples and ``d`` dimensions.
+    Returns
+    -------
+    unique_values : np.ndarray, shape (u, d)
+        Unique rows of the pooled sample.
+    x_idx : np.ndarray, shape (m,)
+        Row of ``unique_values`` that each sample of ``x`` maps to.
+    y_idx : np.ndarray, shape (n,)
+        Row of ``unique_values`` that each sample of ``y`` maps to.
+    """
+    pooled = np.concatenate((x, y), axis=0)
+    unique_values, inverse = np.unique(pooled, axis=0, return_inverse=True)
+    # ``inverse`` follows the pooled order, so the first len(x) entries belong to x.
+    split_at = len(x)
+    return unique_values, inverse[:split_at], inverse[split_at:]
+def _median_bandwidth(xy):
+    """Return the median pairwise Euclidean distance between the distinct rows of ``xy`` as a length-1 array."""
+    return np.array([np.median(pdist(np.unique(xy, axis=0), metric="euclidean"))])
+def _resolve_bandwidths(bandwidths, xy):
+    """Turn the ``bandwidths`` argument of ``MMD`` into a 1-D array of sigma length-scales."""
+    # Arrays are tested first: comparing an array with "median" would be element-wise.
+    if isinstance(bandwidths, np.ndarray):
+        if bandwidths.ndim != 1:
+            raise ValueError("Bandwidths array must be 1D.")
+        return bandwidths
+    if bandwidths is None or bandwidths == "median":
+        return _median_bandwidth(xy)
+    if isinstance(bandwidths, (int, np.integer)):
+        if bandwidths <= 1:
+            # A grid needs at least two points; fall back to the median heuristic.
+            return _median_bandwidth(xy)
+        return get_bandwidths(np.unique(xy, axis=0), n=bandwidths)
+    raise ValueError("Bandwidths must be None, 'median', an int, or a 1D np.array.")
+@timer
+def MMD(
+    x,
+    y,
+    unique=True,
+    bandwidths="median",
+    n_permutations=0,
+    perm_batch_size=999,
+    cauchy_weighting="uniform",
+    seed=11,
+):
+    """Calculate the MMD of two distributions.
+    Maximum Mean Discrepancy (MMD) is a kernel-based distance measure between distributions.
+    The biased statistic computed here is ``MMD^2 = mean(K_x) + mean(K_y) - 2 * mean(K_xy)``
+    where ``K_x`` and ``K_y`` are the kernel matrices within X and within Y respectively,
+    and ``K_xy`` is the cross-kernel matrix between each value of X and each value of Y.
+    The kernel matrix itself requires O(N^2) time and space complexity per bandwidth, which can be reduced to O(U^2)
+    where U is the number of unique values across both distributions with the unique value optimisation.
+    Parameters
+    ----------
+    x : array-like, shape (m, d) or (m,)
+        First distribution with ``m`` samples and ``d`` dimensions. 1-D input is treated as ``d = 1``.
+    y : array-like, shape (n, d) or (n,)
+        Second distribution with ``n`` samples and ``d`` dimensions. 1-D input is treated as ``d = 1``.
+    unique : bool
+        Whether to use the unique value optimisation, which can be much faster for discrete data with many repeated values. Default: True.
+    bandwidths : str or int or np.ndarray, shape (b,)
+        Kernel bandwidths as sigma length-scales (same units as the data). One of:
+        - "median" or None: median pairwise Euclidean distance of the pooled unique sample (default).
+        - int: generate that many bandwidths spanning the pooled pairwise distances (see get_bandwidths);
+          values of 1 or less fall back to the median.
+        - 1-D np.array: the sigma values to test.
+        Each sigma is converted internally to an RBF gamma via gamma = 1 / (2 * sigma**2).
+    n_permutations : int
+        Number of permutations to approximate the p-value. Default: 0 (no p-values).
+    perm_batch_size : int
+        Number of permutations to calculate in each batch. Default: 999.
+    cauchy_weighting: str or None
+        Method for weighting p-values across bandwidths in the cauchy combination. If None,
+        p-values per bandwidth are returned without aggregation. Weighting options:
+            - "centered": Highest weight on bandwidths near the median, decreasing towards the extremes.
+            - "uniform": Equal weight on p-values across all bandwidths (default).
+            - "left": More weight on smaller bandwidths.
+            - "right": More weight on larger bandwidths.
+            - None: No Cauchy aggregation.
+    seed : int
+        Random seed for reproducibility. Default: 11.
+    Returns
+    -------
+    res : dict
+        Dictionary of MMD results with keys:
+            - bandwidths: bandwidths used in the RBF kernel.
+            - n_permutations: number of permutations used to approximate p-value.
+            - biased_MMD: MMD statistic per bandwidth.
+            - p-values_per_bandwidth: permutation derived p-values for each bandwidth tested,
+              or None when ``n_permutations`` is 0.
+            - cauchy_method: method used for Cauchy combination.
+            - p-value: Cauchy combined p-value; None unless permutations were run, more than
+              one bandwidth was tested and ``cauchy_weighting`` is not None.
+    Raises
+    ------
+    ValueError
+        If bandwidths parameter is invalid.
+        If cauchy_weighting parameter is invalid (only checked when p-values are combined).
+        If perm_batch_size is smaller than 1 while permutations are requested.
+    """
+    x = np.asarray(x)
+    y = np.asarray(y)
+    # Promote 1-D samples to a single-feature 2-D array
+    if x.ndim == 1:
+        x = x[:, None]
+    if y.ndim == 1:
+        y = y[:, None]
+    m = len(x)
+    n = len(y)
+    xy = np.concatenate((x, y), axis=0)  # [(m + n), d]
+    bandwidths = _resolve_bandwidths(bandwidths, xy)
+    # Convert bandwidths to gammas
+    gammas = 1.0 / (2.0 * bandwidths**2)
+    # Validate the permutation settings before any expensive work is done.
+    run_permutations = n_permutations > 0
+    combine = run_permutations and cauchy_weighting is not None and len(bandwidths) > 1
+    if run_permutations and perm_batch_size < 1:
+        # A non-positive batch size would skip the loop and leave ``perms`` uninitialised.
+        raise ValueError("perm_batch_size must be a positive integer.")
+    if (
+        combine
+        and isinstance(cauchy_weighting, str)
+        and cauchy_weighting not in ["uniform", "left", "right", "centered"]
+    ):
+        raise ValueError(
+            "Invalid cauchy weighting method. Must be one of ['uniform', 'left', 'right', 'centered']."
+        )
+    # Calculate MMD
+    if unique:
+        unique_values, x_idx, y_idx = generate_ummd_input(x, y)  # [u, d], [m, ], [n, ]
+        u = len(unique_values)
+        K = kernel_matrix(unique_values, unique_values, gammas)  # [bandwidths, u, u]
+        s_x = np.bincount(x_idx, minlength=u) / m  # [u, ]
+        s_y = np.bincount(y_idx, minlength=u) / n  # [u, ]
+        s = s_x - s_y  # [u, ]
+    else:
+        K = kernel_matrix(xy, xy, gammas)  # [bandwidths, (m + n), (m + n)]
+        s_x = np.ones(m) / m  # [m, ]
+        s_y = np.ones(n) / n * -1  # [n, ]
+        s = np.concatenate((s_x, s_y))  # [(m + n), ]
+    # Define results output dictionary
+    res = {
+        "bandwidths": bandwidths,
+        "n_permutations": n_permutations,
+        "biased_MMD": None,
+        "p-values_per_bandwidth": None,
+        "cauchy_method": cauchy_weighting,
+        "p-value": None,
+    }
+    obs = calc_MMD(K, s)  # [bandwidths, ]
+    res["biased_MMD"] = obs
+    if run_permutations:
+        # NOTE: p-values will not be identical for a given seed and n_permutations between unique=True and unique=False.
+        # The unique and brute-force paths sample the same permutation null but realize different draws at a given seed,
+        # so p-values differ by O(1/√B) Monte-Carlo error (independent of repeats); they converge with increasing n_permutations.
+        rng = np.random.default_rng(seed)
+        perms = np.empty(
+            (n_permutations, len(bandwidths))
+        )  # [permutations, bandwidths]
+        # Batch permutations to bound the size of the intermediate arrays
+        for batch_start in range(0, n_permutations, perm_batch_size):
+            n_batch = min(perm_batch_size, n_permutations - batch_start)
+            if unique:
+                batch = perm_uMMD(K, x_idx, y_idx, rng=rng, n_permutations=n_batch)
+            else:
+                batch = perm_MMD(K, s, rng=rng, n_permutations=n_batch)
+            perms[batch_start : batch_start + n_batch] = batch  # [batch_size, bandwidths]
+        # Rounding makes ties between the observed and permuted statistics robust to float noise;
+        # the +1 terms count the observed statistic as one of the permutations.
+        p_values = (np.sum(perms.round(10) >= obs.round(10), axis=0) + 1) / (
+            n_permutations + 1
+        )  # [bandwidths, ]
+        res["p-values_per_bandwidth"] = p_values.round(6)
+        # cauchy combination of p-values across bandwidths
+        if combine:
+            res["p-value"] = cauchy_combination(
+                p_values, weight_distribution=cauchy_weighting
+            )
+    return res