PyPI - fastpercentile - Versions diffs - 0.1.0__tar.gz - Mend

fastpercentile 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (13) hide show

fastpercentile-0.1.0/LICENSE +21 -0
fastpercentile-0.1.0/PKG-INFO +118 -0
fastpercentile-0.1.0/README.md +73 -0
fastpercentile-0.1.0/fastpercentile/__init__.py +3 -0
fastpercentile-0.1.0/fastpercentile/core.py +340 -0
fastpercentile-0.1.0/fastpercentile.egg-info/PKG-INFO +118 -0
fastpercentile-0.1.0/fastpercentile.egg-info/SOURCES.txt +11 -0
fastpercentile-0.1.0/fastpercentile.egg-info/dependency_links.txt +1 -0
fastpercentile-0.1.0/fastpercentile.egg-info/requires.txt +7 -0
fastpercentile-0.1.0/fastpercentile.egg-info/top_level.txt +1 -0
fastpercentile-0.1.0/pyproject.toml +39 -0
fastpercentile-0.1.0/setup.cfg +4 -0
fastpercentile-0.1.0/tests/test_percentile.py +143 -0

fastpercentile-0.1.0/LICENSE ADDED Viewed

@@ -0,0 +1,21 @@
+MIT License
+Copyright (c) 2026 Jasper Phelps
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.

fastpercentile-0.1.0/PKG-INFO ADDED Viewed

@@ -0,0 +1,118 @@
+Metadata-Version: 2.4
+Name: fastpercentile
+Version: 0.1.0
+Summary: Very fast percentile calculation for small-integer dtypes via parallel histogram
+Author-email: Jasper Phelps <jasper.s.phelps@gmail.com>
+License: MIT License
+        Copyright (c) 2026 Jasper Phelps
+        Permission is hereby granted, free of charge, to any person obtaining a copy
+        of this software and associated documentation files (the "Software"), to deal
+        in the Software without restriction, including without limitation the rights
+        to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+        copies of the Software, and to permit persons to whom the Software is
+        furnished to do so, subject to the following conditions:
+        The above copyright notice and this permission notice shall be included in all
+        copies or substantial portions of the Software.
+        THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+        IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+        FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+        AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+        LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+        OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+        SOFTWARE.
+Project-URL: Repository, https://github.com/jasper-tms/fastpercentile
+Keywords: percentile,quantile,histogram,numba,numpy,fast
+Classifier: Programming Language :: Python :: 3
+Classifier: License :: OSI Approved :: MIT License
+Classifier: Operating System :: OS Independent
+Classifier: Development Status :: 4 - Beta
+Classifier: Topic :: Scientific/Engineering
+Requires-Python: >=3.9
+Description-Content-Type: text/markdown
+License-File: LICENSE
+Requires-Dist: numpy
+Requires-Dist: numba
+Provides-Extra: dev
+Requires-Dist: pytest; extra == "dev"
+Requires-Dist: pytest-cov; extra == "dev"
+Requires-Dist: pynrrd; extra == "dev"
+Dynamic: license-file
+# fastpercentile: Memory-bandwidth-bound percentile for small-integer arrays
+[![Tests](https://github.com/jasper-tms/fastpercentile/actions/workflows/tests.yml/badge.svg)](https://github.com/jasper-tms/fastpercentile/actions/workflows/tests.yml)
+[![PyPI version](https://img.shields.io/pypi/v/fastpercentile)](https://pypi.org/project/fastpercentile/)
+[![License](https://img.shields.io/github/license/jasper-tms/fastpercentile)](https://github.com/jasper-tms/fastpercentile/blob/main/LICENSE)
+`np.percentile` is O(n) but in practice has a brutal constant factor — on a 1.7-billion-element `uint16` volume it takes ~22 s, vs ~0.15 s for `np.max`. There is no good reason for percentile to be so much slower than max: both can be done in a single pass over the array.
+For small-integer dtypes (`int8`, `uint8`, `int16`, `uint16`) the data only takes one of at most 65 536 distinct values, so a single parallel pass into a histogram captures everything you need to compute any percentile. After the histogram is built, walking the cumulative count to find the bin holding each requested rank costs essentially nothing.
+This package implements that, in a few hundred lines of `numba`. On a 32-thread workstation it runs at DRAM bandwidth — about as fast as `np.max`, and ~300× faster than `np.percentile`:
+```
+np.max         : 0.148 s
+fastpercentile : 0.072 s   <- four percentiles in one pass
+np.percentile  : 22.2  s
+```
+Auxiliary memory is ~16 MB regardless of input size (32 threads × one 65 536-bin local table each, plus a final reduced histogram), so it adds no measurable RAM pressure on top of the input.
+### Usage
+```python
+import numpy as np
+import fastpercentile
+arr = np.random.randint(0, 65536, size=(305, 96, 69, 846), dtype=np.uint16)
+# A scalar percentile
+p99 = fastpercentile.percentile(arr, 99)
+# Multiple percentiles in a single pass over the data
+p1, p50, p99, p99_9 = fastpercentile.percentile(arr, [1, 50, 99, 99.9])
+# Or just grab the histogram if you want to do something else with it
+hist = fastpercentile.histogram(arr)  # length 65536 for uint16
+```
+Results match `numpy.percentile(arr, q)` with the default `'linear'` interpolation method (typically exact for integer inputs).
+### Supported dtypes
+`int8`, `uint8`, `int16`, `uint16`. Floats and 32/64-bit integers are not supported because a direct histogram is not feasible for them — for those, use `numpy.percentile` (or `numpy.nanpercentile` if the data contains NaNs).
+### Memory layout
+A 4D array loaded by `pynrrd` is typically Fortran-contiguous. `fastpercentile` walks raw memory, so it does not care about C vs F order — both are handled as a no-copy view. Arbitrarily-strided arrays fall back to a copy.
+### Installation
+**Option 1:** `pip install` from PyPI:
+    pip install fastpercentile
+**Option 2:** `pip install` directly from GitHub:
+    pip install git+https://github.com/jasper-tms/fastpercentile.git
+**Option 3:** First `git clone` this repo and then `pip install` it from your clone:
+    cd ~/repos
+    git clone https://github.com/jasper-tms/fastpercentile.git
+    cd fastpercentile
+    pip install '.[dev]'
+### Notes on threading
+`fastpercentile` uses every logical core on the machine by default (via `numba.get_num_threads()`). To limit it for a particular call, pass `n_threads=N`; to set it globally, use `numba.set_num_threads(N)` or the `NUMBA_NUM_THREADS` environment variable. On most systems the workload saturates DRAM bandwidth around `nproc / 2` threads, so reserving a few cores for the rest of the machine costs little throughput.

fastpercentile-0.1.0/README.md ADDED Viewed

@@ -0,0 +1,73 @@
+# fastpercentile: Memory-bandwidth-bound percentile for small-integer arrays
+[![Tests](https://github.com/jasper-tms/fastpercentile/actions/workflows/tests.yml/badge.svg)](https://github.com/jasper-tms/fastpercentile/actions/workflows/tests.yml)
+[![PyPI version](https://img.shields.io/pypi/v/fastpercentile)](https://pypi.org/project/fastpercentile/)
+[![License](https://img.shields.io/github/license/jasper-tms/fastpercentile)](https://github.com/jasper-tms/fastpercentile/blob/main/LICENSE)
+`np.percentile` is O(n) but in practice has a brutal constant factor — on a 1.7-billion-element `uint16` volume it takes ~22 s, vs ~0.15 s for `np.max`. There is no good reason for percentile to be so much slower than max: both can be done in a single pass over the array.
+For small-integer dtypes (`int8`, `uint8`, `int16`, `uint16`) the data only takes one of at most 65 536 distinct values, so a single parallel pass into a histogram captures everything you need to compute any percentile. After the histogram is built, walking the cumulative count to find the bin holding each requested rank costs essentially nothing.
+This package implements that, in a few hundred lines of `numba`. On a 32-thread workstation it runs at DRAM bandwidth — about as fast as `np.max`, and ~300× faster than `np.percentile`:
+```
+np.max         : 0.148 s
+fastpercentile : 0.072 s   <- four percentiles in one pass
+np.percentile  : 22.2  s
+```
+Auxiliary memory is ~16 MB regardless of input size (32 threads × one 65 536-bin local table each, plus a final reduced histogram), so it adds no measurable RAM pressure on top of the input.
+### Usage
+```python
+import numpy as np
+import fastpercentile
+arr = np.random.randint(0, 65536, size=(305, 96, 69, 846), dtype=np.uint16)
+# A scalar percentile
+p99 = fastpercentile.percentile(arr, 99)
+# Multiple percentiles in a single pass over the data
+p1, p50, p99, p99_9 = fastpercentile.percentile(arr, [1, 50, 99, 99.9])
+# Or just grab the histogram if you want to do something else with it
+hist = fastpercentile.histogram(arr)  # length 65536 for uint16
+```
+Results match `numpy.percentile(arr, q)` with the default `'linear'` interpolation method (typically exact for integer inputs).
+### Supported dtypes
+`int8`, `uint8`, `int16`, `uint16`. Floats and 32/64-bit integers are not supported because a direct histogram is not feasible for them — for those, use `numpy.percentile` (or `numpy.nanpercentile` if the data contains NaNs).
+### Memory layout
+A 4D array loaded by `pynrrd` is typically Fortran-contiguous. `fastpercentile` walks raw memory, so it does not care about C vs F order — both are handled as a no-copy view. Arbitrarily-strided arrays fall back to a copy.
+### Installation
+**Option 1:** `pip install` from PyPI:
+    pip install fastpercentile
+**Option 2:** `pip install` directly from GitHub:
+    pip install git+https://github.com/jasper-tms/fastpercentile.git
+**Option 3:** First `git clone` this repo and then `pip install` it from your clone:
+    cd ~/repos
+    git clone https://github.com/jasper-tms/fastpercentile.git
+    cd fastpercentile
+    pip install '.[dev]'
+### Notes on threading
+`fastpercentile` uses every logical core on the machine by default (via `numba.get_num_threads()`). To limit it for a particular call, pass `n_threads=N`; to set it globally, use `numba.set_num_threads(N)` or the `NUMBA_NUM_THREADS` environment variable. On most systems the workload saturates DRAM bandwidth around `nproc / 2` threads, so reserving a few cores for the rest of the machine costs little throughput.

fastpercentile-0.1.0/fastpercentile/__init__.py ADDED Viewed

@@ -0,0 +1,3 @@
+#!/usr/bin/env python3
+from .core import *

fastpercentile-0.1.0/fastpercentile/core.py ADDED Viewed

@@ -0,0 +1,340 @@
+#!/usr/bin/env python3
+"""
+Memory-bandwidth-bound percentile for small-integer arrays.
+For `int8`, `uint8`, `int16`, and `uint16` inputs of any shape this
+module computes one or more percentiles in a single parallel pass
+over the data with results matching `numpy.percentile` (default
+'linear' / method 7 interpolation).
+The idea
+--------
+For an N-element array with at most B distinct values (B = 256 for
+8-bit, 65536 for 16-bit), a single linear scan into a B-bin
+histogram captures all the information we need to find any rank.
+This is the same shape of work as `np.max` -- one pass, branchless
+inner loop -- so it runs at memory bandwidth.  After the scan we
+walk the cumulative histogram to locate the bins containing each
+requested rank; that walk is O(B + n_percentiles) which is
+negligible.
+Public API
+----------
+`percentile(arr, q, n_threads=None)`
+    Compute one or more percentiles.  Mirrors `numpy.percentile`.
+`histogram(arr, n_threads=None)`
+    Build a parallel histogram of `arr`.
+`warmup()`
+    Trigger JIT compilation for all four supported dtypes so the
+    first real call has no compile latency.
+"""
+import numpy as np
+from numba import njit, prange, get_num_threads
+from typing import Sequence, Union
+__all__ = ['percentile', 'histogram', 'warmup']
+# --------------------------------------------------------------- #
+# Parallel histograms.  Each thread fills a private table; we then
+# reduce.  Bin index for signed types is `value + offset` so that
+# the binning is monotonic in the original value.
+# --------------------------------------------------------------- #
+@njit(cache=True, parallel=True, boundscheck=False)
+def _hist_u8(arr: np.ndarray,
+             n_threads: int) -> np.ndarray:
+    """
+    Parallel histogram for uint8.
+    """
+    n_bins = 256
+    n = arr.shape[0]
+    local = np.zeros((n_threads, n_bins), dtype=np.int64)
+    chunk = (n + n_threads - 1) // n_threads
+    for t in prange(n_threads):
+        start = t * chunk
+        end = start + chunk
+        if end > n:
+            end = n
+        loc = local[t]
+        for i in range(start, end):
+            loc[arr[i]] += 1
+    out = np.zeros(n_bins, dtype=np.int64)
+    for t in range(n_threads):
+        for b in range(n_bins):
+            out[b] += local[t, b]
+    return out
+@njit(cache=True, parallel=True, boundscheck=False)
+def _hist_i8(arr: np.ndarray,
+             n_threads: int) -> np.ndarray:
+    """
+    Parallel histogram for int8 (offset = 128).
+    """
+    n_bins = 256
+    n = arr.shape[0]
+    local = np.zeros((n_threads, n_bins), dtype=np.int64)
+    chunk = (n + n_threads - 1) // n_threads
+    for t in prange(n_threads):
+        start = t * chunk
+        end = start + chunk
+        if end > n:
+            end = n
+        loc = local[t]
+        for i in range(start, end):
+            loc[arr[i] + 128] += 1
+    out = np.zeros(n_bins, dtype=np.int64)
+    for t in range(n_threads):
+        for b in range(n_bins):
+            out[b] += local[t, b]
+    return out
+@njit(cache=True, parallel=True, boundscheck=False)
+def _hist_u16(arr: np.ndarray,
+              n_threads: int) -> np.ndarray:
+    """
+    Parallel histogram for uint16.
+    """
+    n_bins = 65536
+    n = arr.shape[0]
+    local = np.zeros((n_threads, n_bins), dtype=np.int64)
+    chunk = (n + n_threads - 1) // n_threads
+    for t in prange(n_threads):
+        start = t * chunk
+        end = start + chunk
+        if end > n:
+            end = n
+        loc = local[t]
+        for i in range(start, end):
+            loc[arr[i]] += 1
+    out = np.zeros(n_bins, dtype=np.int64)
+    for t in range(n_threads):
+        for b in range(n_bins):
+            out[b] += local[t, b]
+    return out
+@njit(cache=True, parallel=True, boundscheck=False)
+def _hist_i16(arr: np.ndarray,
+              n_threads: int) -> np.ndarray:
+    """
+    Parallel histogram for int16 (offset = 32768).
+    """
+    n_bins = 65536
+    n = arr.shape[0]
+    local = np.zeros((n_threads, n_bins), dtype=np.int64)
+    chunk = (n + n_threads - 1) // n_threads
+    for t in prange(n_threads):
+        start = t * chunk
+        end = start + chunk
+        if end > n:
+            end = n
+        loc = local[t]
+        for i in range(start, end):
+            loc[arr[i] + 32768] += 1
+    out = np.zeros(n_bins, dtype=np.int64)
+    for t in range(n_threads):
+        for b in range(n_bins):
+            out[b] += local[t, b]
+    return out
+# --------------------------------------------------------------- #
+# Rank-walking from a finished histogram.  Caller must sort the
+# percentiles ascending so we can scan the cumulative count once.
+# --------------------------------------------------------------- #
+@njit(cache=True, boundscheck=False)
+def _ranks_from_hist(hist: np.ndarray,
+                     ranks_lo: np.ndarray,
+                     ranks_hi: np.ndarray,
+                     fracs: np.ndarray,
+                     offset: int) -> np.ndarray:
+    """
+    Walk the cumulative histogram to find the bin holding each
+    integer rank, then linearly interpolate.
+    Parameters
+    ----------
+    hist : int64 array
+        Histogram counts.
+    ranks_lo, ranks_hi : int64 arrays of equal length
+        0-indexed integer ranks bracketing each query.  Both must
+        be sorted ascending.
+    fracs : float64 array
+        Fractional position between `ranks_lo` and `ranks_hi`.
+    offset : int
+        Subtracted from bin indices to recover original values.
+    Returns
+    -------
+    float64 array of interpolated percentile values.
+    """
+    n_q = ranks_lo.shape[0]
+    n_bins = hist.shape[0]
+    out = np.empty(n_q, dtype=np.float64)
+    cum = np.int64(0)
+    bin_idx = 0
+    for q in range(n_q):
+        target_lo = ranks_lo[q]
+        while bin_idx < n_bins and cum + hist[bin_idx] <= target_lo:
+            cum += hist[bin_idx]
+            bin_idx += 1
+        bin_lo = bin_idx
+        target_hi = ranks_hi[q]
+        cum_hi = cum
+        bin_idx_hi = bin_idx
+        while bin_idx_hi < n_bins and cum_hi + hist[bin_idx_hi] <= target_hi:
+            cum_hi += hist[bin_idx_hi]
+            bin_idx_hi += 1
+        bin_hi = bin_idx_hi
+        val_lo = float(bin_lo - offset)
+        val_hi = float(bin_hi - offset)
+        out[q] = val_lo + fracs[q] * (val_hi - val_lo)
+    return out
+# --------------------------------------------------------------- #
+# Public entry points.
+# --------------------------------------------------------------- #
+# Maps each supported dtype to a `(histogram kernel, offset)` pair.
+# `offset` is what the kernel adds to a value to obtain its bin
+# index (0 for unsigned dtypes), and what must be subtracted from a
+# bin index to recover the original value.
+_DTYPE_DISPATCH = {
+    np.dtype('uint8'):  (_hist_u8, 0),
+    np.dtype('int8'):   (_hist_i8, 128),
+    np.dtype('uint16'): (_hist_u16, 0),
+    np.dtype('int16'):  (_hist_i16, 32768),
+}
+def _as_flat_view(arr: np.ndarray) -> np.ndarray:
+    """
+    Return a 1D view of `arr` without copying when possible.
+    Order does not matter for histograms, so for either C- or
+    F-contiguous input we just walk the raw memory.  For
+    arbitrarily-strided input we fall back to a copy.
+    """
+    if arr.flags.c_contiguous:
+        return arr.reshape(-1)
+    if arr.flags.f_contiguous:
+        return arr.ravel(order='F')
+    return np.ascontiguousarray(arr).ravel()
+def histogram(arr: np.ndarray,
+              n_threads: Union[int, None] = None) -> np.ndarray:
+    """
+    Build a parallel histogram of `arr`.
+    Parameters
+    ----------
+    arr : np.ndarray of int8/uint8/int16/uint16
+        Any shape; treated as a flat sequence of values.
+    n_threads : int, optional
+        Number of parallel histogram threads.  Defaults to
+        `numba.get_num_threads()`.
+    Returns
+    -------
+    np.ndarray of int64
+        Length 256 for 8-bit input, 65536 for 16-bit input.  The
+        bin index for value `v` is `v + offset`, where offset is 0
+        for unsigned dtypes and `2 ** (bits - 1)` for signed
+        dtypes.
+    """
+    if n_threads is None:
+        n_threads = get_num_threads()
+    arr_flat = _as_flat_view(arr)
+    try:
+        hist_fn, _ = _DTYPE_DISPATCH[arr_flat.dtype]
+    except KeyError:
+        raise TypeError(
+            'fastpercentile only supports int8/uint8/int16/uint16, '
+            'got ' + str(arr_flat.dtype))
+    return hist_fn(arr_flat, n_threads)
+def percentile(arr: np.ndarray,
+               q: Union[float, Sequence[float], np.ndarray],
+               n_threads: Union[int, None] = None
+               ) -> Union[float, np.ndarray]:
+    """
+    Compute one or more percentiles of `arr` via a parallel
+    histogram and a cumulative-count walk.
+    Matches `numpy.percentile(arr, q)` with the default 'linear'
+    interpolation method to within float rounding (typically exact
+    for integer inputs).
+    Parameters
+    ----------
+    arr : np.ndarray of int8/uint8/int16/uint16
+        Any shape; treated as a flat sequence of values.
+    q : float or sequence of floats in [0, 100]
+        Percentile(s) to compute.
+    n_threads : int, optional
+        Number of parallel histogram threads.  Defaults to
+        `numba.get_num_threads()`.
+    Returns
+    -------
+    float or np.ndarray of float64
+        Scalar if `q` is scalar, else an array shaped like
+        `np.atleast_1d(q)`.
+    """
+    if n_threads is None:
+        n_threads = get_num_threads()
+    arr_flat = _as_flat_view(arr)
+    n_total = arr_flat.size
+    if n_total == 0:
+        raise ValueError('percentile of empty array is undefined')
+    try:
+        hist_fn, offset = _DTYPE_DISPATCH[arr_flat.dtype]
+    except KeyError:
+        raise TypeError(
+            'fastpercentile only supports int8/uint8/int16/uint16, '
+            'got ' + str(arr_flat.dtype))
+    hist = hist_fn(arr_flat, n_threads)
+    q_in = np.asarray(q, dtype=np.float64)
+    q_was_scalar = (q_in.ndim == 0)
+    q_arr = np.atleast_1d(q_in)
+    if (q_arr < 0).any() or (q_arr > 100).any():
+        raise ValueError('percentiles must lie in [0, 100]')
+    # Sort percentiles ascending so we can scan the cumulative
+    # histogram once; remember the inverse permutation.
+    order = np.argsort(q_arr, kind='stable')
+    q_sorted = q_arr[order]
+    exact_rank = (n_total - 1) * q_sorted / 100.0
+    ranks_lo = np.floor(exact_rank).astype(np.int64)
+    ranks_hi = np.minimum(ranks_lo + 1, n_total - 1)
+    fracs = exact_rank - ranks_lo
+    sorted_out = _ranks_from_hist(hist, ranks_lo, ranks_hi, fracs, offset)
+    out = np.empty_like(sorted_out)
+    out[order] = sorted_out
+    if q_was_scalar:
+        return float(out[0])
+    return out
+def warmup() -> None:
+    """
+    Trigger JIT compilation for all four dtype paths so the first
+    real call has no compile latency.
+    """
+    for dtype in (np.uint8, np.int8, np.uint16, np.int16):
+        tiny = np.zeros(8, dtype=dtype)
+        percentile(tiny, [0.0, 50.0, 100.0])

fastpercentile-0.1.0/fastpercentile.egg-info/PKG-INFO ADDED Viewed

@@ -0,0 +1,118 @@
+Metadata-Version: 2.4
+Name: fastpercentile
+Version: 0.1.0
+Summary: Very fast percentile calculation for small-integer dtypes via parallel histogram
+Author-email: Jasper Phelps <jasper.s.phelps@gmail.com>
+License: MIT License
+        Copyright (c) 2026 Jasper Phelps
+        Permission is hereby granted, free of charge, to any person obtaining a copy
+        of this software and associated documentation files (the "Software"), to deal
+        in the Software without restriction, including without limitation the rights
+        to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+        copies of the Software, and to permit persons to whom the Software is
+        furnished to do so, subject to the following conditions:
+        The above copyright notice and this permission notice shall be included in all
+        copies or substantial portions of the Software.
+        THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+        IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+        FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+        AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+        LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+        OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+        SOFTWARE.
+Project-URL: Repository, https://github.com/jasper-tms/fastpercentile
+Keywords: percentile,quantile,histogram,numba,numpy,fast
+Classifier: Programming Language :: Python :: 3
+Classifier: License :: OSI Approved :: MIT License
+Classifier: Operating System :: OS Independent
+Classifier: Development Status :: 4 - Beta
+Classifier: Topic :: Scientific/Engineering
+Requires-Python: >=3.9
+Description-Content-Type: text/markdown
+License-File: LICENSE
+Requires-Dist: numpy
+Requires-Dist: numba
+Provides-Extra: dev
+Requires-Dist: pytest; extra == "dev"
+Requires-Dist: pytest-cov; extra == "dev"
+Requires-Dist: pynrrd; extra == "dev"
+Dynamic: license-file
+# fastpercentile: Memory-bandwidth-bound percentile for small-integer arrays
+[![Tests](https://github.com/jasper-tms/fastpercentile/actions/workflows/tests.yml/badge.svg)](https://github.com/jasper-tms/fastpercentile/actions/workflows/tests.yml)
+[![PyPI version](https://img.shields.io/pypi/v/fastpercentile)](https://pypi.org/project/fastpercentile/)
+[![License](https://img.shields.io/github/license/jasper-tms/fastpercentile)](https://github.com/jasper-tms/fastpercentile/blob/main/LICENSE)
+`np.percentile` is O(n) but in practice has a brutal constant factor — on a 1.7-billion-element `uint16` volume it takes ~22 s, vs ~0.15 s for `np.max`. There is no good reason for percentile to be so much slower than max: both can be done in a single pass over the array.
+For small-integer dtypes (`int8`, `uint8`, `int16`, `uint16`) the data only takes one of at most 65 536 distinct values, so a single parallel pass into a histogram captures everything you need to compute any percentile. After the histogram is built, walking the cumulative count to find the bin holding each requested rank costs essentially nothing.
+This package implements that, in a few hundred lines of `numba`. On a 32-thread workstation it runs at DRAM bandwidth — about as fast as `np.max`, and ~300× faster than `np.percentile`:
+```
+np.max         : 0.148 s
+fastpercentile : 0.072 s   <- four percentiles in one pass
+np.percentile  : 22.2  s
+```
+Auxiliary memory is ~16 MB regardless of input size (32 threads × one 65 536-bin local table each, plus a final reduced histogram), so it adds no measurable RAM pressure on top of the input.
+### Usage
+```python
+import numpy as np
+import fastpercentile
+arr = np.random.randint(0, 65536, size=(305, 96, 69, 846), dtype=np.uint16)
+# A scalar percentile
+p99 = fastpercentile.percentile(arr, 99)
+# Multiple percentiles in a single pass over the data
+p1, p50, p99, p99_9 = fastpercentile.percentile(arr, [1, 50, 99, 99.9])
+# Or just grab the histogram if you want to do something else with it
+hist = fastpercentile.histogram(arr)  # length 65536 for uint16
+```
+Results match `numpy.percentile(arr, q)` with the default `'linear'` interpolation method (typically exact for integer inputs).
+### Supported dtypes
+`int8`, `uint8`, `int16`, `uint16`. Floats and 32/64-bit integers are not supported because a direct histogram is not feasible for them — for those, use `numpy.percentile` (or `numpy.nanpercentile` if the data contains NaNs).
+### Memory layout
+A 4D array loaded by `pynrrd` is typically Fortran-contiguous. `fastpercentile` walks raw memory, so it does not care about C vs F order — both are handled as a no-copy view. Arbitrarily-strided arrays fall back to a copy.
+### Installation
+**Option 1:** `pip install` from PyPI:
+    pip install fastpercentile
+**Option 2:** `pip install` directly from GitHub:
+    pip install git+https://github.com/jasper-tms/fastpercentile.git
+**Option 3:** First `git clone` this repo and then `pip install` it from your clone:
+    cd ~/repos
+    git clone https://github.com/jasper-tms/fastpercentile.git
+    cd fastpercentile
+    pip install '.[dev]'
+### Notes on threading
+`fastpercentile` uses every logical core on the machine by default (via `numba.get_num_threads()`). To limit it for a particular call, pass `n_threads=N`; to set it globally, use `numba.set_num_threads(N)` or the `NUMBA_NUM_THREADS` environment variable. On most systems the workload saturates DRAM bandwidth around `nproc / 2` threads, so reserving a few cores for the rest of the machine costs little throughput.

fastpercentile-0.1.0/fastpercentile.egg-info/SOURCES.txt ADDED Viewed

@@ -0,0 +1,11 @@
+LICENSE
+README.md
+pyproject.toml
+fastpercentile/__init__.py
+fastpercentile/core.py
+fastpercentile.egg-info/PKG-INFO
+fastpercentile.egg-info/SOURCES.txt
+fastpercentile.egg-info/dependency_links.txt
+fastpercentile.egg-info/requires.txt
+fastpercentile.egg-info/top_level.txt
+tests/test_percentile.py

fastpercentile-0.1.0/fastpercentile.egg-info/dependency_links.txt ADDED Viewed

	@@ -0,0 +1 @@
1	+

fastpercentile-0.1.0/fastpercentile.egg-info/requires.txt ADDED Viewed

@@ -0,0 +1,7 @@
+numpy
+numba
+[dev]
+pytest
+pytest-cov
+pynrrd

fastpercentile-0.1.0/fastpercentile.egg-info/top_level.txt ADDED Viewed

	@@ -0,0 +1 @@
1	+ fastpercentile

fastpercentile-0.1.0/pyproject.toml ADDED Viewed

@@ -0,0 +1,39 @@
+[build-system]
+requires = ['setuptools', 'wheel']
+build-backend = 'setuptools.build_meta'
+[project]
+name = 'fastpercentile'
+version = '0.1.0'
+description = 'Very fast percentile calculation for small-integer dtypes via parallel histogram'
+readme.file = 'README.md'
+readme.content-type = 'text/markdown'
+requires-python = '>=3.9'
+license = {file = 'LICENSE'}
+keywords = ['percentile', 'quantile', 'histogram', 'numba', 'numpy', 'fast']
+authors = [{name = 'Jasper Phelps', email = 'jasper.s.phelps@gmail.com'}]
+classifiers = [
+    'Programming Language :: Python :: 3',
+    'License :: OSI Approved :: MIT License',
+    'Operating System :: OS Independent',
+    'Development Status :: 4 - Beta',
+    'Topic :: Scientific/Engineering',
+]
+urls = {Repository = 'https://github.com/jasper-tms/fastpercentile'}
+dependencies = [
+    'numpy',
+    'numba',
+]
+[project.optional-dependencies]
+dev = [
+    'pytest',
+    'pytest-cov',
+    'pynrrd',
+]
+[tool.pytest.ini_options]
+testpaths = ['tests']
+[tool.setuptools.packages.find]
+include = ['fastpercentile']

fastpercentile-0.1.0/setup.cfg ADDED Viewed

@@ -0,0 +1,4 @@
+[egg_info]
+tag_build =
+tag_date = 0

fastpercentile-0.1.0/tests/test_percentile.py ADDED Viewed

@@ -0,0 +1,143 @@
+#!/usr/bin/env python3
+"""
+Correctness tests for `fastpercentile.percentile` and
+`fastpercentile.histogram`.
+"""
+import numpy as np
+import pytest
+import fastpercentile
+from conftest import DEFAULT_QS
+def test_matches_numpy_1d(warm_jit, random_1d):
+    """
+    Multi-percentile call should match `np.percentile` exactly on
+    integer inputs.
+    """
+    expected = np.percentile(random_1d, DEFAULT_QS)
+    got = fastpercentile.percentile(random_1d, DEFAULT_QS)
+    assert isinstance(got, np.ndarray)
+    assert got.shape == expected.shape
+    assert np.max(np.abs(expected - got)) < 1e-9
+def test_scalar_q_returns_scalar(warm_jit, random_1d):
+    """
+    Passing a scalar `q` should return a Python float, matching
+    numpy's behaviour.
+    """
+    got = fastpercentile.percentile(random_1d, 50)
+    assert isinstance(got, float)
+    expected = float(np.percentile(random_1d, 50))
+    assert abs(got - expected) < 1e-9
+def test_unsorted_q_returns_in_input_order(warm_jit, random_1d):
+    """
+    The returned array should be in the order of the input `q`,
+    not the sorted order we use internally.
+    """
+    qs = [99.0, 1.0, 50.0]
+    expected = np.percentile(random_1d, qs)
+    got = fastpercentile.percentile(random_1d, qs)
+    assert np.allclose(expected, got, atol=1e-9)
+def test_endpoints():
+    """
+    Percentile 0 should equal the min, 100 should equal the max.
+    """
+    arr = np.arange(1000, dtype=np.uint16)
+    lo, hi = fastpercentile.percentile(arr, [0, 100])
+    assert lo == 0.0
+    assert hi == 999.0
+def test_handles_fortran_contiguous():
+    """
+    Fortran-contiguous input must not trigger a multi-GB ravel
+    copy; we just walk the raw memory.
+    """
+    arr = np.asfortranarray(
+        np.random.default_rng(1).integers(
+            0, 65536, size=(20, 30, 40), dtype=np.uint16))
+    assert arr.flags.f_contiguous and not arr.flags.c_contiguous
+    expected = np.percentile(arr, [25, 50, 75])
+    got = fastpercentile.percentile(arr, [25, 50, 75])
+    assert np.allclose(expected, got, atol=1e-9)
+def test_handles_strided_view():
+    """
+    A non-contiguous slice should still produce the right answer
+    (via fallback copy).
+    """
+    arr = np.random.default_rng(2).integers(
+        0, 65536, size=(100, 100), dtype=np.uint16)
+    sliced = arr[::2, ::3]
+    assert not sliced.flags.c_contiguous
+    assert not sliced.flags.f_contiguous
+    expected = np.percentile(sliced, [10, 90])
+    got = fastpercentile.percentile(sliced, [10, 90])
+    assert np.allclose(expected, got, atol=1e-9)
+def test_unsupported_dtype_raises():
+    """
+    Floats and 32/64-bit ints should error explicitly rather than
+    silently producing garbage.
+    """
+    with pytest.raises(TypeError, match='int8/uint8/int16/uint16'):
+        fastpercentile.percentile(np.zeros(10, dtype=np.float32), 50)
+    with pytest.raises(TypeError):
+        fastpercentile.percentile(np.zeros(10, dtype=np.uint32), 50)
+def test_q_out_of_range_raises():
+    """
+    Percentiles outside [0, 100] should error.
+    """
+    arr = np.arange(100, dtype=np.uint8)
+    with pytest.raises(ValueError):
+        fastpercentile.percentile(arr, [-1, 50])
+    with pytest.raises(ValueError):
+        fastpercentile.percentile(arr, [50, 101])
+def test_empty_raises():
+    """
+    Percentile of an empty array is undefined.
+    """
+    with pytest.raises(ValueError):
+        fastpercentile.percentile(np.array([], dtype=np.uint8), 50)
+def test_histogram_counts_match_bincount(random_1d, dtype):
+    """
+    `histogram` should produce counts identical to `np.bincount`,
+    after accounting for the signed-dtype offset.
+    """
+    info = np.iinfo(dtype)
+    h = fastpercentile.histogram(random_1d)
+    expected_n_bins = 2 ** (info.bits)
+    assert h.shape == (expected_n_bins,)
+    if dtype.kind == 'u':
+        ref = np.bincount(random_1d.astype(np.int64),
+                          minlength=expected_n_bins)
+    else:
+        offset = 2 ** (info.bits - 1)
+        ref = np.bincount(random_1d.astype(np.int64) + offset,
+                          minlength=expected_n_bins)
+    assert np.array_equal(h, ref)
+    assert h.sum() == random_1d.size
+def test_n_threads_argument(random_1d):
+    """
+    Restricting `n_threads` should produce identical results.
+    """
+    expected = fastpercentile.percentile(random_1d, DEFAULT_QS)
+    got = fastpercentile.percentile(random_1d, DEFAULT_QS, n_threads=1)
+    assert np.allclose(expected, got, atol=1e-9)