fastpercentile 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Jasper Phelps
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,118 @@
1
+ Metadata-Version: 2.4
2
+ Name: fastpercentile
3
+ Version: 0.1.0
4
+ Summary: Very fast percentile calculation for small-integer dtypes via parallel histogram
5
+ Author-email: Jasper Phelps <jasper.s.phelps@gmail.com>
6
+ License: MIT License
7
+
8
+ Copyright (c) 2026 Jasper Phelps
9
+
10
+ Permission is hereby granted, free of charge, to any person obtaining a copy
11
+ of this software and associated documentation files (the "Software"), to deal
12
+ in the Software without restriction, including without limitation the rights
13
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
14
+ copies of the Software, and to permit persons to whom the Software is
15
+ furnished to do so, subject to the following conditions:
16
+
17
+ The above copyright notice and this permission notice shall be included in all
18
+ copies or substantial portions of the Software.
19
+
20
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
23
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
25
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
26
+ SOFTWARE.
27
+
28
+ Project-URL: Repository, https://github.com/jasper-tms/fastpercentile
29
+ Keywords: percentile,quantile,histogram,numba,numpy,fast
30
+ Classifier: Programming Language :: Python :: 3
31
+ Classifier: License :: OSI Approved :: MIT License
32
+ Classifier: Operating System :: OS Independent
33
+ Classifier: Development Status :: 4 - Beta
34
+ Classifier: Topic :: Scientific/Engineering
35
+ Requires-Python: >=3.9
36
+ Description-Content-Type: text/markdown
37
+ License-File: LICENSE
38
+ Requires-Dist: numpy
39
+ Requires-Dist: numba
40
+ Provides-Extra: dev
41
+ Requires-Dist: pytest; extra == "dev"
42
+ Requires-Dist: pytest-cov; extra == "dev"
43
+ Requires-Dist: pynrrd; extra == "dev"
44
+ Dynamic: license-file
45
+
46
+ # fastpercentile: Memory-bandwidth-bound percentile for small-integer arrays
47
+
48
+ [![Tests](https://github.com/jasper-tms/fastpercentile/actions/workflows/tests.yml/badge.svg)](https://github.com/jasper-tms/fastpercentile/actions/workflows/tests.yml)
49
+ [![PyPI version](https://img.shields.io/pypi/v/fastpercentile)](https://pypi.org/project/fastpercentile/)
50
+ [![License](https://img.shields.io/github/license/jasper-tms/fastpercentile)](https://github.com/jasper-tms/fastpercentile/blob/main/LICENSE)
51
+
52
+ `np.percentile` is O(n) but in practice has a brutal constant factor — on a 1.7-billion-element `uint16` volume it takes ~22 s, vs ~0.15 s for `np.max`. There is no good reason for percentile to be so much slower than max: both can be done in a single pass over the array.
53
+
54
+ For small-integer dtypes (`int8`, `uint8`, `int16`, `uint16`) the data only takes one of at most 65 536 distinct values, so a single parallel pass into a histogram captures everything you need to compute any percentile. After the histogram is built, walking the cumulative count to find the bin holding each requested rank costs essentially nothing.
55
+
56
+ This package implements that, in a few hundred lines of `numba`. On a 32-thread workstation it runs at DRAM bandwidth — about as fast as `np.max`, and ~300× faster than `np.percentile`:
57
+
58
+ ```
59
+ np.max : 0.148 s
60
+ fastpercentile : 0.072 s <- four percentiles in one pass
61
+ np.percentile : 22.2 s
62
+ ```
63
+
64
+ Auxiliary memory is ~16 MB regardless of input size (32 threads × one 65 536-bin local table each, plus a final reduced histogram), so it adds no measurable RAM pressure on top of the input.
65
+
66
+
67
+ ### Usage
68
+
69
+ ```python
70
+ import numpy as np
71
+ import fastpercentile
72
+
73
+ arr = np.random.randint(0, 65536, size=(305, 96, 69, 846), dtype=np.uint16)
74
+
75
+ # A scalar percentile
76
+ p99 = fastpercentile.percentile(arr, 99)
77
+
78
+ # Multiple percentiles in a single pass over the data
79
+ p1, p50, p99, p99_9 = fastpercentile.percentile(arr, [1, 50, 99, 99.9])
80
+
81
+ # Or just grab the histogram if you want to do something else with it
82
+ hist = fastpercentile.histogram(arr) # length 65536 for uint16
83
+ ```
84
+
85
+ Results match `numpy.percentile(arr, q)` with the default `'linear'` interpolation method (typically exact for integer inputs).
86
+
87
+
88
+ ### Supported dtypes
89
+
90
+ `int8`, `uint8`, `int16`, `uint16`. Floats and 32/64-bit integers are not supported because a direct histogram is not feasible for them — for those, use `numpy.percentile` (or `numpy.nanpercentile` if the data contains NaNs).
91
+
92
+
93
+ ### Memory layout
94
+
95
+ A 4D array loaded by `pynrrd` is typically Fortran-contiguous. `fastpercentile` walks raw memory, so it does not care about C vs F order — both are handled as a no-copy view. Arbitrarily-strided arrays fall back to a copy.
96
+
97
+
98
+ ### Installation
99
+
100
+ **Option 1:** `pip install` from PyPI:
101
+
102
+ pip install fastpercentile
103
+
104
+ **Option 2:** `pip install` directly from GitHub:
105
+
106
+ pip install git+https://github.com/jasper-tms/fastpercentile.git
107
+
108
+ **Option 3:** First `git clone` this repo and then `pip install` it from your clone:
109
+
110
+ cd ~/repos
111
+ git clone https://github.com/jasper-tms/fastpercentile.git
112
+ cd fastpercentile
113
+ pip install '.[dev]'
114
+
115
+
116
+ ### Notes on threading
117
+
118
+ `fastpercentile` uses every logical core on the machine by default (via `numba.get_num_threads()`). To limit it for a particular call, pass `n_threads=N`; to set it globally, use `numba.set_num_threads(N)` or the `NUMBA_NUM_THREADS` environment variable. On most systems the workload saturates DRAM bandwidth around `nproc / 2` threads, so reserving a few cores for the rest of the machine costs little throughput.
@@ -0,0 +1,73 @@
1
+ # fastpercentile: Memory-bandwidth-bound percentile for small-integer arrays
2
+
3
+ [![Tests](https://github.com/jasper-tms/fastpercentile/actions/workflows/tests.yml/badge.svg)](https://github.com/jasper-tms/fastpercentile/actions/workflows/tests.yml)
4
+ [![PyPI version](https://img.shields.io/pypi/v/fastpercentile)](https://pypi.org/project/fastpercentile/)
5
+ [![License](https://img.shields.io/github/license/jasper-tms/fastpercentile)](https://github.com/jasper-tms/fastpercentile/blob/main/LICENSE)
6
+
7
+ `np.percentile` is O(n) but in practice has a brutal constant factor — on a 1.7-billion-element `uint16` volume it takes ~22 s, vs ~0.15 s for `np.max`. There is no good reason for percentile to be so much slower than max: both can be done in a single pass over the array.
8
+
9
+ For small-integer dtypes (`int8`, `uint8`, `int16`, `uint16`) the data only takes one of at most 65 536 distinct values, so a single parallel pass into a histogram captures everything you need to compute any percentile. After the histogram is built, walking the cumulative count to find the bin holding each requested rank costs essentially nothing.
10
+
11
+ This package implements that, in a few hundred lines of `numba`. On a 32-thread workstation it runs at DRAM bandwidth — about as fast as `np.max`, and ~300× faster than `np.percentile`:
12
+
13
+ ```
14
+ np.max : 0.148 s
15
+ fastpercentile : 0.072 s <- four percentiles in one pass
16
+ np.percentile : 22.2 s
17
+ ```
18
+
19
+ Auxiliary memory is ~16 MB regardless of input size (32 threads × one 65 536-bin local table each, plus a final reduced histogram), so it adds no measurable RAM pressure on top of the input.
20
+
21
+
22
+ ### Usage
23
+
24
+ ```python
25
+ import numpy as np
26
+ import fastpercentile
27
+
28
+ arr = np.random.randint(0, 65536, size=(305, 96, 69, 846), dtype=np.uint16)
29
+
30
+ # A scalar percentile
31
+ p99 = fastpercentile.percentile(arr, 99)
32
+
33
+ # Multiple percentiles in a single pass over the data
34
+ p1, p50, p99, p99_9 = fastpercentile.percentile(arr, [1, 50, 99, 99.9])
35
+
36
+ # Or just grab the histogram if you want to do something else with it
37
+ hist = fastpercentile.histogram(arr) # length 65536 for uint16
38
+ ```
39
+
40
+ Results match `numpy.percentile(arr, q)` with the default `'linear'` interpolation method (typically exact for integer inputs).
41
+
42
+
43
+ ### Supported dtypes
44
+
45
+ `int8`, `uint8`, `int16`, `uint16`. Floats and 32/64-bit integers are not supported because a direct histogram is not feasible for them — for those, use `numpy.percentile` (or `numpy.nanpercentile` if the data contains NaNs).
46
+
47
+
48
+ ### Memory layout
49
+
50
+ A 4D array loaded by `pynrrd` is typically Fortran-contiguous. `fastpercentile` walks raw memory, so it does not care about C vs F order — both are handled as a no-copy view. Arbitrarily-strided arrays fall back to a copy.
51
+
52
+
53
+ ### Installation
54
+
55
+ **Option 1:** `pip install` from PyPI:
56
+
57
+ pip install fastpercentile
58
+
59
+ **Option 2:** `pip install` directly from GitHub:
60
+
61
+ pip install git+https://github.com/jasper-tms/fastpercentile.git
62
+
63
+ **Option 3:** First `git clone` this repo and then `pip install` it from your clone:
64
+
65
+ cd ~/repos
66
+ git clone https://github.com/jasper-tms/fastpercentile.git
67
+ cd fastpercentile
68
+ pip install '.[dev]'
69
+
70
+
71
+ ### Notes on threading
72
+
73
+ `fastpercentile` uses every logical core on the machine by default (via `numba.get_num_threads()`). To limit it for a particular call, pass `n_threads=N`; to set it globally, use `numba.set_num_threads(N)` or the `NUMBA_NUM_THREADS` environment variable. On most systems the workload saturates DRAM bandwidth around `nproc / 2` threads, so reserving a few cores for the rest of the machine costs little throughput.
@@ -0,0 +1,3 @@
1
+ #!/usr/bin/env python3
2
+
3
+ from .core import *
@@ -0,0 +1,340 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Memory-bandwidth-bound percentile for small-integer arrays.
4
+
5
+ For `int8`, `uint8`, `int16`, and `uint16` inputs of any shape this
6
+ module computes one or more percentiles in a single parallel pass
7
+ over the data with results matching `numpy.percentile` (default
8
+ 'linear' / method 7 interpolation).
9
+
10
+ The idea
11
+ --------
12
+ For an N-element array with at most B distinct values (B = 256 for
13
+ 8-bit, 65536 for 16-bit), a single linear scan into a B-bin
14
+ histogram captures all the information we need to find any rank.
15
+ This is the same shape of work as `np.max` -- one pass, branchless
16
+ inner loop -- so it runs at memory bandwidth. After the scan we
17
+ walk the cumulative histogram to locate the bins containing each
18
+ requested rank; that walk is O(B + n_percentiles) which is
19
+ negligible.
20
+
21
+ Public API
22
+ ----------
23
+ `percentile(arr, q, n_threads=None)`
24
+ Compute one or more percentiles. Mirrors `numpy.percentile`.
25
+
26
+ `histogram(arr, n_threads=None)`
27
+ Build a parallel histogram of `arr`.
+
+ `warmup()`
+ Trigger JIT compilation for all four supported dtypes.
28
+ """
29
+ import numpy as np
30
+ from numba import njit, prange, get_num_threads
31
+ from typing import Sequence, Union
32
+
33
+ __all__ = ['percentile', 'histogram', 'warmup']
34
+
35
+
36
+ # --------------------------------------------------------------- #
37
+ # Parallel histograms. Each thread fills a private table; we then
38
+ # reduce. Bin index for signed types is `value + offset` so that
39
+ # the binning is monotonic in the original value.
40
+ # --------------------------------------------------------------- #
41
+
42
+
43
@njit(cache=True, parallel=True, boundscheck=False)
def _hist_u8(arr: np.ndarray,
             n_threads: int) -> np.ndarray:
    """
    Count how often each uint8 value occurs in a flat array.

    The input is cut into `n_threads` contiguous slices of equal
    nominal length. Each `prange` iteration tallies its slice into
    its own row of a scratch table, and the rows are summed once
    the parallel loop has finished.

    Parameters
    ----------
    arr : 1D uint8 array
        Values to count.
    n_threads : int
        Number of slices (and scratch rows) to use.

    Returns
    -------
    int64 array of length 256
        Entry `v` holds the number of elements equal to `v`.
    """
    num_bins = 256
    length = arr.shape[0]
    # Ceiling division, so the slices together cover every element.
    per_worker = (length + n_threads - 1) // n_threads
    partial = np.zeros((n_threads, num_bins), dtype=np.int64)
    for worker in prange(n_threads):
        first = worker * per_worker
        # The trailing slices may be short, or empty when first >= length.
        last = min(first + per_worker, length)
        counts = partial[worker]
        for pos in range(first, last):
            counts[arr[pos]] += 1
    total = np.zeros(num_bins, dtype=np.int64)
    for worker in range(n_threads):
        for value in range(num_bins):
            total[value] += partial[worker, value]
    return total
66
+
67
+
68
@njit(cache=True, parallel=True, boundscheck=False)
def _hist_i8(arr: np.ndarray,
             n_threads: int) -> np.ndarray:
    """
    Count how often each int8 value occurs in a flat array.

    Works like the unsigned variant, except that every value is
    shifted by 128 before being used as a bin index, so bin order
    follows value order.

    Parameters
    ----------
    arr : 1D int8 array
        Values to count.
    n_threads : int
        Number of slices (and scratch rows) to use.

    Returns
    -------
    int64 array of length 256
        Entry `v + 128` holds the number of elements equal to `v`.
    """
    num_bins = 256
    length = arr.shape[0]
    # Ceiling division, so the slices together cover every element.
    per_worker = (length + n_threads - 1) // n_threads
    partial = np.zeros((n_threads, num_bins), dtype=np.int64)
    for worker in prange(n_threads):
        first = worker * per_worker
        # The trailing slices may be short, or empty when first >= length.
        last = min(first + per_worker, length)
        counts = partial[worker]
        for pos in range(first, last):
            counts[arr[pos] + 128] += 1
    total = np.zeros(num_bins, dtype=np.int64)
    for worker in range(n_threads):
        for value in range(num_bins):
            total[value] += partial[worker, value]
    return total
91
+
92
+
93
@njit(cache=True, parallel=True, boundscheck=False)
def _hist_u16(arr: np.ndarray,
              n_threads: int) -> np.ndarray:
    """
    Count how often each uint16 value occurs in a flat array.

    The input is cut into `n_threads` contiguous slices of equal
    nominal length. Each `prange` iteration tallies its slice into
    its own row of a scratch table, and the rows are summed once
    the parallel loop has finished.

    Parameters
    ----------
    arr : 1D uint16 array
        Values to count.
    n_threads : int
        Number of slices (and scratch rows) to use.

    Returns
    -------
    int64 array of length 65536
        Entry `v` holds the number of elements equal to `v`.
    """
    num_bins = 65536
    length = arr.shape[0]
    # Ceiling division, so the slices together cover every element.
    per_worker = (length + n_threads - 1) // n_threads
    partial = np.zeros((n_threads, num_bins), dtype=np.int64)
    for worker in prange(n_threads):
        first = worker * per_worker
        # The trailing slices may be short, or empty when first >= length.
        last = min(first + per_worker, length)
        counts = partial[worker]
        for pos in range(first, last):
            counts[arr[pos]] += 1
    total = np.zeros(num_bins, dtype=np.int64)
    for worker in range(n_threads):
        for value in range(num_bins):
            total[value] += partial[worker, value]
    return total
116
+
117
+
118
@njit(cache=True, parallel=True, boundscheck=False)
def _hist_i16(arr: np.ndarray,
              n_threads: int) -> np.ndarray:
    """
    Count how often each int16 value occurs in a flat array.

    Works like the unsigned variant, except that every value is
    shifted by 32768 before being used as a bin index, so bin order
    follows value order.

    Parameters
    ----------
    arr : 1D int16 array
        Values to count.
    n_threads : int
        Number of slices (and scratch rows) to use.

    Returns
    -------
    int64 array of length 65536
        Entry `v + 32768` holds the number of elements equal to `v`.
    """
    num_bins = 65536
    length = arr.shape[0]
    # Ceiling division, so the slices together cover every element.
    per_worker = (length + n_threads - 1) // n_threads
    partial = np.zeros((n_threads, num_bins), dtype=np.int64)
    for worker in prange(n_threads):
        first = worker * per_worker
        # The trailing slices may be short, or empty when first >= length.
        last = min(first + per_worker, length)
        counts = partial[worker]
        for pos in range(first, last):
            counts[arr[pos] + 32768] += 1
    total = np.zeros(num_bins, dtype=np.int64)
    for worker in range(n_threads):
        for value in range(num_bins):
            total[value] += partial[worker, value]
    return total
141
+
142
+
143
+ # --------------------------------------------------------------- #
144
+ # Rank-walking from a finished histogram. Caller must sort the
145
+ # percentiles ascending so we can scan the cumulative count once.
146
+ # --------------------------------------------------------------- #
147
+
148
+
149
@njit(cache=True, boundscheck=False)
def _ranks_from_hist(hist: np.ndarray,
                     ranks_lo: np.ndarray,
                     ranks_hi: np.ndarray,
                     fracs: np.ndarray,
                     offset: int) -> np.ndarray:
    """
    Turn bracketing integer ranks into interpolated values by
    scanning the running total of a histogram.

    Parameters
    ----------
    hist : int64 array
        Histogram counts, one entry per bin.
    ranks_lo, ranks_hi : int64 arrays of equal length
        0-indexed ranks just below and just above each query.
        Both must be in ascending order, because the scan position
        for the lower rank is carried from one query to the next.
    fracs : float64 array
        Weight of the upper value in the interpolation, one entry
        per query.
    offset : int
        Amount subtracted from a bin index to get back the value
        that bin represents.

    Returns
    -------
    float64 array
        `lower + frac * (upper - lower)` for each query, where
        `lower` and `upper` are the values found at the two ranks.
    """
    total_bins = hist.shape[0]
    n_queries = ranks_lo.shape[0]
    result = np.empty(n_queries, dtype=np.float64)

    # `seen` is the number of elements in bins strictly below `cursor`.
    seen = np.int64(0)
    cursor = 0
    for k in range(n_queries):
        # Move the shared cursor forward to the bin holding the lower rank.
        lower_rank = ranks_lo[k]
        while cursor < total_bins and seen + hist[cursor] <= lower_rank:
            seen += hist[cursor]
            cursor += 1

        # Look ahead for the upper rank on scratch copies, so the shared
        # cursor still points at the lower bin for the next query.
        upper_rank = ranks_hi[k]
        seen_ahead = seen
        cursor_ahead = cursor
        while (cursor_ahead < total_bins
               and seen_ahead + hist[cursor_ahead] <= upper_rank):
            seen_ahead += hist[cursor_ahead]
            cursor_ahead += 1

        lower_value = float(cursor - offset)
        upper_value = float(cursor_ahead - offset)
        result[k] = lower_value + fracs[k] * (upper_value - lower_value)
    return result
200
+
201
+
202
+ # --------------------------------------------------------------- #
203
+ # Public entry points.
204
+ # --------------------------------------------------------------- #
205
+
206
+
207
# Supported dtype -> (histogram kernel, offset added to a value to get its
# bin index). The offset is 0 for unsigned dtypes and half the value range
# for signed ones.
_DTYPE_DISPATCH = {
    np.dtype(np.uint8): (_hist_u8, 0),
    np.dtype(np.int8): (_hist_i8, 128),
    np.dtype(np.uint16): (_hist_u16, 0),
    np.dtype(np.int16): (_hist_i16, 32768),
}
213
+
214
+
215
+ def _as_flat_view(arr: np.ndarray) -> np.ndarray:
216
+ """
217
+ Return a 1D view of `arr` without copying when possible.
218
+
219
+ Order does not matter for histograms, so for either C- or
220
+ F-contiguous input we just walk the raw memory. For
221
+ arbitrarily-strided input we fall back to a copy.
222
+ """
223
+ if arr.flags.c_contiguous:
224
+ return arr.reshape(-1)
225
+ if arr.flags.f_contiguous:
226
+ return arr.ravel(order='F')
227
+ return np.ascontiguousarray(arr).ravel()
228
+
229
+
230
def histogram(arr: np.ndarray,
              n_threads: Union[int, None] = None) -> np.ndarray:
    """
    Build a parallel histogram of `arr`.

    Parameters
    ----------
    arr : np.ndarray of int8/uint8/int16/uint16
        Any shape; treated as a flat sequence of values. Other
        array-likes are converted with `np.asarray` first.
    n_threads : int, optional
        Number of parallel histogram threads. Defaults to
        `numba.get_num_threads()`.

    Returns
    -------
    np.ndarray of int64
        Length 256 for 8-bit input, 65536 for 16-bit input. The
        bin index for value `v` is `v + offset`, where offset is 0
        for unsigned dtypes and `2 ** (bits - 1)` for signed
        dtypes.

    Raises
    ------
    ValueError
        If `n_threads` is less than 1.
    TypeError
        If the dtype of `arr` is not one of the supported ones.
    """
    if n_threads is None:
        n_threads = get_num_threads()
    elif n_threads < 1:
        # Fail here with a clear message instead of inside the kernel.
        raise ValueError('n_threads must be at least 1')

    arr = np.asarray(arr)
    # Check the dtype before flattening: flattening a strided array copies
    # it, which is wasted work if the dtype is about to be rejected.
    try:
        hist_fn, _ = _DTYPE_DISPATCH[arr.dtype]
    except KeyError:
        raise TypeError(
            'fastpercentile only supports int8/uint8/int16/uint16, '
            'got ' + str(arr.dtype)) from None
    return hist_fn(_as_flat_view(arr), n_threads)
261
+
262
+
263
def percentile(arr: np.ndarray,
               q: Union[float, Sequence[float], np.ndarray],
               n_threads: Union[int, None] = None
               ) -> Union[float, np.ndarray]:
    """
    Compute one or more percentiles of `arr` via a parallel
    histogram and a cumulative-count walk.

    Matches `numpy.percentile(arr, q)` with the default 'linear'
    interpolation method to within float rounding (typically exact
    for integer inputs).

    Parameters
    ----------
    arr : np.ndarray of int8/uint8/int16/uint16
        Any shape; treated as a flat sequence of values. Other
        array-likes are converted with `np.asarray` first.
    q : float or array-like of floats in [0, 100]
        Percentile(s) to compute. May have any number of
        dimensions.
    n_threads : int, optional
        Number of parallel histogram threads. Defaults to
        `numba.get_num_threads()`.

    Returns
    -------
    float or np.ndarray of float64
        Python float if `q` is scalar, else an array with the same
        shape as `q`, in the same order as `q`.

    Raises
    ------
    ValueError
        If `n_threads` is less than 1, if `arr` is empty, or if any
        entry of `q` is NaN or lies outside [0, 100].
    TypeError
        If the dtype of `arr` is not one of the supported ones.
    """
    if n_threads is None:
        n_threads = get_num_threads()
    elif n_threads < 1:
        # Fail here with a clear message instead of inside the kernel.
        raise ValueError('n_threads must be at least 1')

    # Run all the cheap checks before touching the data, so bad input never
    # pays for a copy or a full histogram pass.
    arr = np.asarray(arr)
    n_total = arr.size
    if n_total == 0:
        raise ValueError('percentile of empty array is undefined')

    try:
        hist_fn, offset = _DTYPE_DISPATCH[arr.dtype]
    except KeyError:
        raise TypeError(
            'fastpercentile only supports int8/uint8/int16/uint16, '
            'got ' + str(arr.dtype)) from None

    q_in = np.asarray(q, dtype=np.float64)
    # Work on a flat copy/view of q so any input shape is handled, and
    # restore the shape at the end.
    q_flat = q_in.reshape(-1)
    # Phrased as "not all in range" so NaN, which fails every comparison,
    # is rejected instead of slipping through.
    if not np.all((q_flat >= 0) & (q_flat <= 100)):
        raise ValueError('percentiles must lie in [0, 100]')

    hist = hist_fn(_as_flat_view(arr), n_threads)

    # Sort percentiles ascending so we can scan the cumulative
    # histogram once; `order` lets us undo the sort afterwards.
    order = np.argsort(q_flat, kind='stable')
    q_sorted = q_flat[order]

    # Fractional 0-indexed rank of each percentile, then the two integer
    # ranks around it. The upper rank is clamped for q == 100.
    exact_rank = (n_total - 1) * q_sorted / 100.0
    ranks_lo = np.floor(exact_rank).astype(np.int64)
    ranks_hi = np.minimum(ranks_lo + 1, n_total - 1)
    fracs = exact_rank - ranks_lo

    sorted_out = _ranks_from_hist(hist, ranks_lo, ranks_hi, fracs, offset)

    out = np.empty_like(sorted_out)
    out[order] = sorted_out
    if q_in.ndim == 0:
        return float(out[0])
    return out.reshape(q_in.shape)
331
+
332
+
333
def warmup() -> None:
    """
    Run `percentile` once on a tiny all-zero array of each
    supported dtype, so that JIT compilation happens here rather
    than on the first real call.
    """
    probe_qs = [0.0, 50.0, 100.0]
    for kind in (np.uint8, np.int8, np.uint16, np.int16):
        percentile(np.zeros(8, dtype=kind), probe_qs)
@@ -0,0 +1,118 @@
1
+ Metadata-Version: 2.4
2
+ Name: fastpercentile
3
+ Version: 0.1.0
4
+ Summary: Very fast percentile calculation for small-integer dtypes via parallel histogram
5
+ Author-email: Jasper Phelps <jasper.s.phelps@gmail.com>
6
+ License: MIT License
7
+
8
+ Copyright (c) 2026 Jasper Phelps
9
+
10
+ Permission is hereby granted, free of charge, to any person obtaining a copy
11
+ of this software and associated documentation files (the "Software"), to deal
12
+ in the Software without restriction, including without limitation the rights
13
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
14
+ copies of the Software, and to permit persons to whom the Software is
15
+ furnished to do so, subject to the following conditions:
16
+
17
+ The above copyright notice and this permission notice shall be included in all
18
+ copies or substantial portions of the Software.
19
+
20
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
23
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
25
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
26
+ SOFTWARE.
27
+
28
+ Project-URL: Repository, https://github.com/jasper-tms/fastpercentile
29
+ Keywords: percentile,quantile,histogram,numba,numpy,fast
30
+ Classifier: Programming Language :: Python :: 3
31
+ Classifier: License :: OSI Approved :: MIT License
32
+ Classifier: Operating System :: OS Independent
33
+ Classifier: Development Status :: 4 - Beta
34
+ Classifier: Topic :: Scientific/Engineering
35
+ Requires-Python: >=3.9
36
+ Description-Content-Type: text/markdown
37
+ License-File: LICENSE
38
+ Requires-Dist: numpy
39
+ Requires-Dist: numba
40
+ Provides-Extra: dev
41
+ Requires-Dist: pytest; extra == "dev"
42
+ Requires-Dist: pytest-cov; extra == "dev"
43
+ Requires-Dist: pynrrd; extra == "dev"
44
+ Dynamic: license-file
45
+
46
+ # fastpercentile: Memory-bandwidth-bound percentile for small-integer arrays
47
+
48
+ [![Tests](https://github.com/jasper-tms/fastpercentile/actions/workflows/tests.yml/badge.svg)](https://github.com/jasper-tms/fastpercentile/actions/workflows/tests.yml)
49
+ [![PyPI version](https://img.shields.io/pypi/v/fastpercentile)](https://pypi.org/project/fastpercentile/)
50
+ [![License](https://img.shields.io/github/license/jasper-tms/fastpercentile)](https://github.com/jasper-tms/fastpercentile/blob/main/LICENSE)
51
+
52
+ `np.percentile` is O(n) but in practice has a brutal constant factor — on a 1.7-billion-element `uint16` volume it takes ~22 s, vs ~0.15 s for `np.max`. There is no good reason for percentile to be so much slower than max: both can be done in a single pass over the array.
53
+
54
+ For small-integer dtypes (`int8`, `uint8`, `int16`, `uint16`) the data only takes one of at most 65 536 distinct values, so a single parallel pass into a histogram captures everything you need to compute any percentile. After the histogram is built, walking the cumulative count to find the bin holding each requested rank costs essentially nothing.
55
+
56
+ This package implements that, in a few hundred lines of `numba`. On a 32-thread workstation it runs at DRAM bandwidth — about as fast as `np.max`, and ~300× faster than `np.percentile`:
57
+
58
+ ```
59
+ np.max : 0.148 s
60
+ fastpercentile : 0.072 s <- four percentiles in one pass
61
+ np.percentile : 22.2 s
62
+ ```
63
+
64
+ Auxiliary memory is ~16 MB regardless of input size (32 threads × one 65 536-bin local table each, plus a final reduced histogram), so it adds no measurable RAM pressure on top of the input.
65
+
66
+
67
+ ### Usage
68
+
69
+ ```python
70
+ import numpy as np
71
+ import fastpercentile
72
+
73
+ arr = np.random.randint(0, 65536, size=(305, 96, 69, 846), dtype=np.uint16)
74
+
75
+ # A scalar percentile
76
+ p99 = fastpercentile.percentile(arr, 99)
77
+
78
+ # Multiple percentiles in a single pass over the data
79
+ p1, p50, p99, p99_9 = fastpercentile.percentile(arr, [1, 50, 99, 99.9])
80
+
81
+ # Or just grab the histogram if you want to do something else with it
82
+ hist = fastpercentile.histogram(arr) # length 65536 for uint16
83
+ ```
84
+
85
+ Results match `numpy.percentile(arr, q)` with the default `'linear'` interpolation method (typically exact for integer inputs).
86
+
87
+
88
+ ### Supported dtypes
89
+
90
+ `int8`, `uint8`, `int16`, `uint16`. Floats and 32/64-bit integers are not supported because a direct histogram is not feasible for them — for those, use `numpy.percentile` or `bottleneck.nanpercentile`.
91
+
92
+
93
+ ### Memory layout
94
+
95
+ A 4D array loaded by `pynrrd` is typically Fortran-contiguous. `fastpercentile` walks raw memory, so it does not care about C vs F order — both are handled as a no-copy view. Arbitrarily-strided arrays fall back to a copy.
96
+
97
+
98
+ ### Installation
99
+
100
+ **Option 1:** `pip install` from PyPI:
101
+
102
+ pip install fastpercentile
103
+
104
+ **Option 2:** `pip install` directly from GitHub:
105
+
106
+ pip install git+https://github.com/jasper-tms/fastpercentile.git
107
+
108
+ **Option 3:** First `git clone` this repo and then `pip install` it from your clone:
109
+
110
+ cd ~/repos
111
+ git clone https://github.com/jasper-tms/fastpercentile.git
112
+ cd fastpercentile
113
+ pip install '.[dev]'
114
+
115
+
116
+ ### Notes on threading
117
+
118
+ `fastpercentile` uses every logical core on the machine by default (via `numba.get_num_threads()`). To limit it for a particular call, pass `n_threads=N`; to set it globally, use `numba.set_num_threads(N)` or the `NUMBA_NUM_THREADS` environment variable. On most systems the workload saturates DRAM bandwidth around `nproc / 2` threads, so reserving a few cores for the rest of the machine costs little throughput.
@@ -0,0 +1,11 @@
1
+ LICENSE
2
+ README.md
3
+ pyproject.toml
4
+ fastpercentile/__init__.py
5
+ fastpercentile/core.py
6
+ fastpercentile.egg-info/PKG-INFO
7
+ fastpercentile.egg-info/SOURCES.txt
8
+ fastpercentile.egg-info/dependency_links.txt
9
+ fastpercentile.egg-info/requires.txt
10
+ fastpercentile.egg-info/top_level.txt
11
+ tests/test_percentile.py
@@ -0,0 +1,7 @@
1
+ numpy
2
+ numba
3
+
4
+ [dev]
5
+ pytest
6
+ pytest-cov
7
+ pynrrd
@@ -0,0 +1 @@
1
+ fastpercentile
@@ -0,0 +1,39 @@
1
+ [build-system]
2
+ requires = ['setuptools', 'wheel']
3
+ build-backend = 'setuptools.build_meta'
4
+
5
+ [project]
6
+ name = 'fastpercentile'
7
+ version = '0.1.0'
8
+ description = 'Very fast percentile calculation for small-integer dtypes via parallel histogram'
9
+ readme.file = 'README.md'
10
+ readme.content-type = 'text/markdown'
11
+ requires-python = '>=3.9'
12
+ license = {file = 'LICENSE'}
13
+ keywords = ['percentile', 'quantile', 'histogram', 'numba', 'numpy', 'fast']
14
+ authors = [{name = 'Jasper Phelps', email = 'jasper.s.phelps@gmail.com'}]
15
+ classifiers = [
16
+ 'Programming Language :: Python :: 3',
17
+ 'License :: OSI Approved :: MIT License',
18
+ 'Operating System :: OS Independent',
19
+ 'Development Status :: 4 - Beta',
20
+ 'Topic :: Scientific/Engineering',
21
+ ]
22
+ urls = {Repository = 'https://github.com/jasper-tms/fastpercentile'}
23
+ dependencies = [
24
+ 'numpy',
25
+ 'numba',
26
+ ]
27
+
28
+ [project.optional-dependencies]
29
+ dev = [
30
+ 'pytest',
31
+ 'pytest-cov',
32
+ 'pynrrd',
33
+ ]
34
+
35
+ [tool.pytest.ini_options]
36
+ testpaths = ['tests']
37
+
38
+ [tool.setuptools.packages.find]
39
+ include = ['fastpercentile']
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
@@ -0,0 +1,143 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Correctness tests for `fastpercentile.percentile` and
4
+ `fastpercentile.histogram`.
5
+ """
6
+ import numpy as np
7
+ import pytest
8
+
9
+ import fastpercentile
10
+
11
+ from conftest import DEFAULT_QS
12
+
13
+
14
def test_matches_numpy_1d(warm_jit, random_1d):
    """
    A multi-percentile call returns an ndarray with the same shape
    as `np.percentile`'s result and agrees with it to within 1e-9.
    """
    actual = fastpercentile.percentile(random_1d, DEFAULT_QS)
    reference = np.percentile(random_1d, DEFAULT_QS)
    assert isinstance(actual, np.ndarray)
    assert actual.shape == reference.shape
    worst_error = np.max(np.abs(reference - actual))
    assert worst_error < 1e-9
24
+
25
+
26
def test_scalar_q_returns_scalar(warm_jit, random_1d):
    """
    A scalar `q` yields a `float` instance whose value agrees with
    `np.percentile` to within 1e-9.
    """
    result = fastpercentile.percentile(random_1d, 50)
    assert isinstance(result, float)
    reference = float(np.percentile(random_1d, 50))
    assert abs(result - reference) < 1e-9
35
+
36
+
37
def test_unsorted_q_returns_in_input_order(warm_jit, random_1d):
    """
    Results come back in the order the percentiles were given,
    even though they are sorted internally.
    """
    shuffled_qs = [99.0, 1.0, 50.0]
    reference = np.percentile(random_1d, shuffled_qs)
    actual = fastpercentile.percentile(random_1d, shuffled_qs)
    assert np.allclose(reference, actual, atol=1e-9)
46
+
47
+
48
def test_endpoints():
    """
    The 0th percentile is the minimum and the 100th is the maximum.
    """
    values = np.arange(1000, dtype=np.uint16)
    smallest, largest = fastpercentile.percentile(values, [0, 100])
    assert smallest == 0.0
    assert largest == 999.0
56
+
57
+
58
def test_handles_fortran_contiguous():
    """
    Fortran-contiguous input gives the same percentiles as
    `np.percentile`. This is the layout the package is meant to
    flatten without a ravel copy.
    """
    rng = np.random.default_rng(1)
    c_ordered = rng.integers(0, 65536, size=(20, 30, 40), dtype=np.uint16)
    volume = np.asfortranarray(c_ordered)
    assert volume.flags.f_contiguous and not volume.flags.c_contiguous
    reference = np.percentile(volume, [25, 50, 75])
    actual = fastpercentile.percentile(volume, [25, 50, 75])
    assert np.allclose(reference, actual, atol=1e-9)
70
+
71
+
72
def test_handles_strided_view():
    """
    A slice that is neither C- nor F-contiguous still gives the
    same percentiles as `np.percentile`.
    """
    rng = np.random.default_rng(2)
    grid = rng.integers(0, 65536, size=(100, 100), dtype=np.uint16)
    view = grid[::2, ::3]
    assert not view.flags.c_contiguous
    assert not view.flags.f_contiguous
    reference = np.percentile(view, [10, 90])
    actual = fastpercentile.percentile(view, [10, 90])
    assert np.allclose(reference, actual, atol=1e-9)
85
+
86
+
87
def test_unsupported_dtype_raises():
    """
    float32 and uint32 input is rejected with a TypeError. For
    float32 the message must name the supported dtypes.
    """
    float_input = np.zeros(10, dtype=np.float32)
    with pytest.raises(TypeError, match='int8/uint8/int16/uint16'):
        fastpercentile.percentile(float_input, 50)

    wide_int_input = np.zeros(10, dtype=np.uint32)
    with pytest.raises(TypeError):
        fastpercentile.percentile(wide_int_input, 50)
96
+
97
+
98
def test_q_out_of_range_raises():
    """
    A percentile below 0 or above 100 is rejected with a
    ValueError.
    """
    data = np.arange(100, dtype=np.uint8)
    for bad_qs in ([-1, 50], [50, 101]):
        with pytest.raises(ValueError):
            fastpercentile.percentile(data, bad_qs)
107
+
108
+
109
def test_empty_raises():
    """
    Asking for a percentile of a zero-length array raises a
    ValueError.
    """
    nothing = np.array([], dtype=np.uint8)
    with pytest.raises(ValueError):
        fastpercentile.percentile(nothing, 50)
115
+
116
+
117
def test_histogram_counts_match_bincount(random_1d, dtype):
    """
    `histogram` has one bin per representable value and its counts
    equal `np.bincount` of the values, shifted by half the range
    for signed dtypes.
    """
    bits = np.iinfo(dtype).bits
    counts = fastpercentile.histogram(random_1d)
    n_bins = 2 ** bits
    assert counts.shape == (n_bins,)

    widened = random_1d.astype(np.int64)
    if dtype.kind != 'u':
        # Shift signed values so the smallest one lands in bin 0.
        widened = widened + 2 ** (bits - 1)
    reference = np.bincount(widened, minlength=n_bins)
    assert np.array_equal(counts, reference)
    assert counts.sum() == random_1d.size
135
+
136
+
137
def test_n_threads_argument(random_1d):
    """
    A call with `n_threads=1` gives the same percentiles as a call
    with the default thread count.
    """
    default_threads = fastpercentile.percentile(random_1d, DEFAULT_QS)
    single_thread = fastpercentile.percentile(
        random_1d, DEFAULT_QS, n_threads=1)
    assert np.allclose(default_threads, single_thread, atol=1e-9)