views-frames 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- views_frames/__init__.py +41 -0
- views_frames/_typing.py +24 -0
- views_frames/_validation.py +114 -0
- views_frames/conformance/__init__.py +123 -0
- views_frames/feature_frame.py +188 -0
- views_frames/index.py +309 -0
- views_frames/io/__init__.py +13 -0
- views_frames/io/arrow.py +103 -0
- views_frames/io/npz.py +59 -0
- views_frames/metadata.py +36 -0
- views_frames/prediction_frame.py +143 -0
- views_frames/protocols.py +82 -0
- views_frames/py.typed +0 -0
- views_frames/spatial_level.py +41 -0
- views_frames/target_frame.py +138 -0
- views_frames-1.0.0.dist-info/METADATA +624 -0
- views_frames-1.0.0.dist-info/RECORD +27 -0
- views_frames-1.0.0.dist-info/WHEEL +4 -0
- views_frames-1.0.0.dist-info/licenses/LICENSE +21 -0
- views_frames_summarize/__init__.py +29 -0
- views_frames_summarize/_common.py +68 -0
- views_frames_summarize/aggregate.py +83 -0
- views_frames_summarize/collapse.py +37 -0
- views_frames_summarize/conformance.py +40 -0
- views_frames_summarize/interval.py +62 -0
- views_frames_summarize/point.py +104 -0
- views_frames_summarize/py.typed +0 -0
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Simon Polichinel von der Maase
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
"""views_frames_summarize — posterior / sample-axis summarization over frames.
|
|
2
|
+
|
|
3
|
+
A sibling package to `views_frames` (ADR-017): it operates on frames and owns the
|
|
4
|
+
volatile statistics the leaf must not. Depends on `views_frames` + numpy only;
|
|
5
|
+
never the reverse (enforced by ``tests/test_import_enforcement.py``).
|
|
6
|
+
|
|
7
|
+
Conventions (ADR-017): point estimates (mean/median/MAP, generic ``collapse``)
|
|
8
|
+
return a `(N, …, 1)` **frame**; interval estimates (HDI, quantiles) return numpy
|
|
9
|
+
arrays **aligned to the input frame's index** (the caller holds the index).
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
from __future__ import annotations
|
|
13
|
+
|
|
14
|
+
from views_frames_summarize.aggregate import (
|
|
15
|
+
aggregate_distributions,
|
|
16
|
+
aggregate_distributions_arrays,
|
|
17
|
+
)
|
|
18
|
+
from views_frames_summarize.collapse import collapse
|
|
19
|
+
from views_frames_summarize.interval import hdi, quantiles
|
|
20
|
+
from views_frames_summarize.point import map_estimate
|
|
21
|
+
|
|
22
|
+
__all__ = [
|
|
23
|
+
"aggregate_distributions",
|
|
24
|
+
"aggregate_distributions_arrays",
|
|
25
|
+
"collapse",
|
|
26
|
+
"hdi",
|
|
27
|
+
"map_estimate",
|
|
28
|
+
"quantiles",
|
|
29
|
+
]
|
|
@@ -0,0 +1,68 @@
|
|
|
1
|
+
"""Shared helpers for the summarize package.
|
|
2
|
+
|
|
3
|
+
Rebuilds a frame of the same concrete type with new values, preserving the index
|
|
4
|
+
and metadata — the structural plumbing every reducer needs.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
from collections.abc import Callable
|
|
10
|
+
|
|
11
|
+
import numpy as np
|
|
12
|
+
from numpy.typing import NDArray
|
|
13
|
+
|
|
14
|
+
from views_frames import (
|
|
15
|
+
FeatureFrame,
|
|
16
|
+
PredictionFrame,
|
|
17
|
+
SpatioTemporalIndex,
|
|
18
|
+
TargetFrame,
|
|
19
|
+
)
|
|
20
|
+
|
|
21
|
+
AnyFrame = PredictionFrame | FeatureFrame | TargetFrame
|
|
22
|
+
|
|
23
|
+
# Default row-block size for the memory-bounded estimators. Blocking caps the peak
|
|
24
|
+
# memory of an estimator at ``O(block * …)`` regardless of row count, so the
|
|
25
|
+
# full-grid reduction path stays well under the #181 OOM (register C-22, C-25).
|
|
26
|
+
ROW_BLOCK = 1 << 16
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def block_apply(
    values: NDArray[np.float32],
    block_rows: int,
    fn: Callable[[NDArray[np.float32]], NDArray[np.float32]],
) -> NDArray[np.float32]:
    """Apply ``fn`` to row-blocks of ``values`` (over axis 0), concatenating results.

    ``fn`` maps a ``(block, …)`` slice to a ``(block, …)`` result (same axis-0
    length). Inputs with at most ``block_rows`` rows are passed to ``fn`` in a
    single call; larger inputs are sliced into consecutive chunks of
    ``block_rows`` rows (the last chunk may be shorter) and the per-chunk
    results are concatenated along axis 0.

    Args:
        values: Array whose leading axis is split into blocks.
        block_rows: Maximum number of rows handed to ``fn`` per call.
        fn: Row-block transformation.

    Returns:
        ``fn(values)`` for the single-shot path, otherwise the concatenation of
        the per-block results.

    Raises:
        ValueError: ``values`` has more rows than ``block_rows`` and
            ``block_rows`` is not a positive integer.
    """
    n = values.shape[0]
    if n <= block_rows:
        return fn(values)
    # Reaching this point with a non-positive block size would either make
    # ``range`` reject a zero step or produce no blocks at all (and a confusing
    # ``np.concatenate`` failure), so fail early with an explicit message.
    if block_rows < 1:
        raise ValueError(
            f"block_rows must be a positive integer, got {block_rows!r}"
        )
    parts = [
        fn(values[start : start + block_rows]) for start in range(0, n, block_rows)
    ]
    return np.concatenate(parts, axis=0)
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
def rebuild(
    frame: AnyFrame,
    values: NDArray[np.float32],
    index: SpatioTemporalIndex | None = None,
) -> AnyFrame:
    """Construct a new frame of ``frame``'s concrete class around ``values``.

    The source frame's metadata is carried over, and a `FeatureFrame` also keeps
    its `feature_names`. Validation of ``values`` is left to the frame
    constructor that is invoked.

    Args:
        frame: Template frame; supplies the concrete type, the metadata and the
            default index.
        values: Values for the new frame.
        index: Index for the new frame. When omitted, ``frame.index`` is reused;
            pass one to rebuild at a different index (e.g. after cross-level
            aggregation).

    Returns:
        A new frame of the same concrete type as ``frame``.

    Raises:
        TypeError: ``frame`` is not one of the supported frame classes.
    """
    if index is None:
        index = frame.index

    # The checks are mutually exclusive branches, tried in a fixed order.
    if isinstance(frame, FeatureFrame):
        rebuilt = FeatureFrame(values, index, frame.feature_names, frame.metadata)
    elif isinstance(frame, PredictionFrame):
        rebuilt = PredictionFrame(values, index, frame.metadata)
    elif isinstance(frame, TargetFrame):
        rebuilt = TargetFrame(values, index, frame.metadata)
    else:
        raise TypeError(f"unsupported frame type: {type(frame).__name__}")
    return rebuilt
|
|
@@ -0,0 +1,83 @@
|
|
|
1
|
+
"""Conservation-correct cross-level aggregation of sample distributions (ADR-017).
|
|
2
|
+
|
|
3
|
+
Sum the per-cell sample arrays across the cells of each coarser unit **preserving the
|
|
4
|
+
sample index** (joint sampling), so the aggregated uncertainty is correct —
|
|
5
|
+
``HDI(sum) != sum(HDI)`` (the faoapi C-70 concern). The ``(time, unit) ->
|
|
6
|
+
target_unit`` mapping is **injected** by the caller (the same map the leaf's
|
|
7
|
+
``cross_level_align`` takes — time-varying, register C-20); no geography is
|
|
8
|
+
embedded here.
|
|
9
|
+
|
|
10
|
+
The mapping may be a Python ``dict`` (``aggregate_distributions``) or parallel
|
|
11
|
+
arrays (``aggregate_distributions_arrays``) — the columnar form avoids building a
|
|
12
|
+
~10.5M-key dict at grid scale (register C-26).
|
|
13
|
+
"""
|
|
14
|
+
|
|
15
|
+
from __future__ import annotations
|
|
16
|
+
|
|
17
|
+
from collections.abc import Mapping
|
|
18
|
+
from typing import Any
|
|
19
|
+
|
|
20
|
+
import numpy as np
|
|
21
|
+
from numpy.typing import NDArray
|
|
22
|
+
|
|
23
|
+
from views_frames import SpatialLevel, SpatioTemporalIndex
|
|
24
|
+
from views_frames_summarize._common import AnyFrame, rebuild
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def aggregate_distributions(
    frame: AnyFrame,
    mapping: Mapping[tuple[int, int], int],
    target_level: SpatialLevel,
) -> AnyFrame:
    """Sum a frame's sample distributions up to the coarser ``target_level``.

    The frame's index is first re-expressed at ``target_level`` by the leaf's
    ``cross_level_align`` using the caller-supplied ``(time, unit) ->
    target_unit`` ``mapping``. Rows that land in the same ``(time,
    target_unit)`` group then have their sample arrays added **element-wise**
    (joint sampling), so the time coordinate is kept and only the spatial unit
    is coarsened.

    Args:
        frame: Frame whose rows are aggregated.
        mapping: ``(time, unit) -> target_unit`` lookup injected by the caller.
        target_level: Spatial level of the aggregated frame.

    Returns:
        A frame of the same type as ``frame``, indexed at ``target_level``.

    Raises:
        ValueError: ``mapping`` is missing/empty, is not keyed by ``(time, unit)``
            pairs, or a row's ``(time, unit)`` has no entry (inherited from
            ``cross_level_align`` — the leaf never guesses a mapping).
    """
    return _aggregate_to(
        frame,
        frame.index.cross_level_align(mapping, target_level),
        target_level,
    )
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def aggregate_distributions_arrays(
    frame: AnyFrame,
    map_keys: NDArray[np.integer[Any]],
    map_vals: NDArray[np.integer[Any]],
    target_level: SpatialLevel,
) -> AnyFrame:
    """Array-based variant of ``aggregate_distributions``.

    Same semantics as the ``dict`` form, but the ``(time, unit) ->
    target_unit`` mapping arrives as two parallel arrays so a producer with a
    grid-scale, time-varying mapping does not have to build a huge Python
    ``dict`` first (register C-26). The index remapping itself is done by the
    leaf's ``cross_level_align_arrays``.

    Args:
        frame: Frame whose rows are aggregated.
        map_keys: ``(M, 2)`` array of mapping keys.
        map_vals: ``(M,)`` array of target units, parallel to ``map_keys``.
        target_level: Spatial level of the aggregated frame.

    Returns:
        A frame of the same type as ``frame``, indexed at ``target_level``.
    """
    return _aggregate_to(
        frame,
        frame.index.cross_level_align_arrays(map_keys, map_vals, target_level),
        target_level,
    )
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
def _aggregate_to(
    frame: AnyFrame, remapped: SpatioTemporalIndex, target_level: SpatialLevel
) -> AnyFrame:
    """Add up the rows of ``frame`` that share a ``(time, unit)`` pair in ``remapped``.

    Args:
        frame: Frame supplying the values (and the template for the result).
        remapped: Index giving, row by row, the ``(time, target_unit)`` group of
            each row of ``frame``.
        target_level: Spatial level recorded on the aggregated index.

    Returns:
        A frame of the same type as ``frame`` with one row per distinct
        ``(time, unit)`` pair, ordered as ``np.unique`` orders the pairs.
    """
    time_col = remapped.time.astype(np.int64)
    unit_col = remapped.unit.astype(np.int64)
    pairs = np.stack([time_col, unit_col], axis=1)

    groups, row_to_group = np.unique(pairs, axis=0, return_inverse=True)
    # Force a flat 1-D row -> group lookup regardless of the shape numpy
    # hands back for the inverse when ``axis`` is given.
    row_to_group = np.asarray(row_to_group).reshape(-1)

    trailing_shape = frame.values.shape[1:]
    summed = np.zeros((groups.shape[0], *trailing_shape), dtype=np.float32)
    # Unbuffered scatter-add: every source row is accumulated into its group,
    # including when several rows target the same group.
    np.add.at(summed, row_to_group, frame.values)

    return rebuild(
        frame,
        summed,
        SpatioTemporalIndex(
            time=np.asarray(groups[:, 0], dtype=np.int64),
            unit=np.asarray(groups[:, 1], dtype=np.int64),
            level=target_level,
        ),
    )
|
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
"""`collapse` — the generic sample-axis fold (a point estimate over the samples).
|
|
2
|
+
|
|
3
|
+
The statistic is **injected** by the caller (e.g. ``np.mean``, ``np.median``); this
|
|
4
|
+
package owns the *mechanism* (reduce the trailing axis, rebuild a valid frame), not
|
|
5
|
+
a menu of statistics. This is the operation that was removed from the leaf (ADR-017).
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
from collections.abc import Callable
|
|
11
|
+
from typing import Any
|
|
12
|
+
|
|
13
|
+
import numpy as np
|
|
14
|
+
|
|
15
|
+
from views_frames_summarize._common import AnyFrame, rebuild
|
|
16
|
+
|
|
17
|
+
# A reducer is applied as ``reducer(values, axis=-1)`` and reduces the sample axis.
|
|
18
|
+
Reducer = Callable[..., Any]
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def collapse(frame: AnyFrame, reducer: Reducer) -> AnyFrame:
    """Fold the trailing sample axis of ``frame`` with ``reducer``.

    The statistic is supplied by the caller and invoked as
    ``reducer(frame.values, axis=-1)``, so any numpy-style reduction fits
    (``np.mean``, ``np.median``, ``np.max`` …). The reduced values are cast to
    float32 and given back an explicit trailing axis of length 1, e.g.
    `(N, S) → (N, 1)` or `(N, F, S) → (N, F, 1)`.

    Args:
        frame: A frame with a trailing sample axis.
        reducer: A callable taking ``(values, axis=-1)`` and reducing that axis.

    Returns:
        A new frame of the same type with the sample axis collapsed to size 1.
    """
    folded = reducer(frame.values, axis=-1)
    point = np.asarray(folded, dtype=np.float32)
    # Re-append the (now singleton) sample axis before rebuilding the frame.
    return rebuild(frame, point[..., None])
|
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
"""Conformance checks for the summarize package (ADR-016/017).
|
|
2
|
+
|
|
3
|
+
A consumer can re-run these against its own frame factories to confirm the
|
|
4
|
+
summarizers behave: point estimates return same-type `(N, …, 1)` frames; interval
|
|
5
|
+
estimates return arrays aligned to the input frame's rows.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
import numpy as np
|
|
11
|
+
|
|
12
|
+
from views_frames_summarize._common import AnyFrame
|
|
13
|
+
from views_frames_summarize.collapse import collapse
|
|
14
|
+
from views_frames_summarize.interval import hdi, quantiles
|
|
15
|
+
from views_frames_summarize.point import map_estimate
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def assert_summarizer_contract(frame: AnyFrame) -> None:
    """Assert the summarizers behave on ``frame``.

    Runs each summarizer once on ``frame`` and checks its output contract:

    * ``collapse`` and ``map_estimate`` (point estimates) return a frame of the
      same type as ``frame`` with the same number of rows and a trailing axis of
      size 1;
    * ``hdi`` returns an array with one leading entry per row and a trailing
      axis of size 2 (lower, upper);
    * ``quantiles`` returns an array with one leading entry per row and one
      trailing column per requested quantile.

    Args:
        frame: The frame to exercise the summarizers on.

    Raises:
        AssertionError: a summarizer violates its output contract.
    """
    n = frame.n_rows

    point = collapse(frame, np.mean)
    assert type(point) is type(frame), "collapse must return the same frame type"
    assert point.values.shape[-1] == 1, "collapse must reduce the sample axis to 1"
    assert point.n_rows == n, "collapse must preserve rows"

    mode = map_estimate(frame)
    # Same-type guarantee applies to every point estimate, not only ``collapse``.
    assert type(mode) is type(frame), "map_estimate must return the same frame type"
    assert mode.values.shape[-1] == 1 and mode.n_rows == n, "map_estimate → (N,…,1)"

    lo_hi = hdi(frame, mass=0.9)
    assert lo_hi.shape[0] == n, "hdi must be aligned to the frame's rows"
    assert lo_hi.shape[-1] == 2, "hdi must produce (lower, upper)"

    qs = quantiles(frame, [0.1, 0.5, 0.9])
    assert qs.shape[0] == n, "quantiles must be aligned to the frame's rows"
    assert qs.shape[-1] == 3, "quantiles must produce one column per quantile"
|
|
@@ -0,0 +1,62 @@
|
|
|
1
|
+
"""Interval estimates over the sample axis (ADR-017).
|
|
2
|
+
|
|
3
|
+
Return numpy arrays **aligned to the input frame's index** (the caller holds the
|
|
4
|
+
index): `hdi` → `(N, …, 2)` lower/upper; `quantiles` → `(N, …, len(qs))`.
|
|
5
|
+
|
|
6
|
+
Both reduce the **trailing** sample axis and are vectorized (no per-row Python
|
|
7
|
+
loop). They run in **row-blocks** (`block_rows`) so peak memory is bounded by one
|
|
8
|
+
block's working set rather than a full-grid sorted copy — the same discipline as
|
|
9
|
+
`map_estimate`, so the whole reduction family stays under the #181 OOM (register
|
|
10
|
+
C-25).
|
|
11
|
+
"""
|
|
12
|
+
|
|
13
|
+
from __future__ import annotations
|
|
14
|
+
|
|
15
|
+
from collections.abc import Sequence
|
|
16
|
+
|
|
17
|
+
import numpy as np
|
|
18
|
+
from numpy.typing import NDArray
|
|
19
|
+
|
|
20
|
+
from views_frames_summarize._common import ROW_BLOCK, AnyFrame, block_apply
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def hdi(
    frame: AnyFrame, mass: float = 0.9, *, block_rows: int = ROW_BLOCK
) -> NDArray[np.float32]:
    """Per-row highest-density interval over the sample axis → `(N, …, 2)`.

    Empirical HDI: with ``S`` samples and ``k = floor(mass * S)``, the samples
    are sorted along the trailing axis and the narrowest window running from
    sorted sample ``i`` to sorted sample ``i + k`` is returned as
    ``(lower, upper)``. Ties are resolved in favour of the lowest ``i``. The
    computation is vectorized over the trailing axis and runs in row-blocks of
    ``block_rows``.

    Edge cases:

    * ``k < 1`` (``mass`` below ``1 / S``): the degenerate interval
      ``(min, min)`` is returned.
    * ``mass == 1.0``: ``k`` is clamped to ``S - 1`` so the full sample range
      ``(min, max)`` is returned.

    Args:
        frame: A frame with a trailing sample axis.
        mass: Fraction of the samples the interval should span; at most 1.
        block_rows: Number of rows processed per block.

    Returns:
        A float32 array aligned to the frame's rows with a trailing axis of
        size 2 holding ``(lower, upper)``.

    Raises:
        ValueError: ``mass`` is greater than 1 (or is NaN).
    """
    # ``not mass <= 1.0`` also rejects NaN, which compares false to everything.
    if not mass <= 1.0:
        raise ValueError(f"mass must be at most 1.0, got {mass!r}")

    s = frame.values.shape[-1]
    # Clamp to the widest possible window: with k == S there is no candidate
    # start position left and ``argmin`` would be asked to reduce an empty axis.
    k = min(int(np.floor(mass * s)), s - 1)

    def _hdi_block(vals: NDArray[np.float32]) -> NDArray[np.float32]:
        srt = np.sort(vals, axis=-1)
        if k < 1:
            lower = srt[..., 0]
            return np.stack([lower, lower], axis=-1)
        # One candidate window per start position i, of width
        # srt[i + k] - srt[i]; the narrowest candidate is the HDI.
        # ``argmin`` returns the first minimum.
        widths = srt[..., k:] - srt[..., : s - k]
        i = np.argmin(widths, axis=-1)
        lower = np.take_along_axis(srt, i[..., np.newaxis], axis=-1)[..., 0]
        upper = np.take_along_axis(srt, (i + k)[..., np.newaxis], axis=-1)[..., 0]
        return np.stack([lower, upper], axis=-1)

    out = block_apply(frame.values, block_rows, _hdi_block)
    return np.asarray(out, dtype=np.float32)
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
def quantiles(
    frame: AnyFrame, qs: Sequence[float], *, block_rows: int = ROW_BLOCK
) -> NDArray[np.float32]:
    """Per-row quantiles over the sample axis → `(N, …, len(qs))`, index-aligned.

    Args:
        frame: A frame with a trailing sample axis.
        qs: Quantile levels, forwarded to ``np.quantile`` as float64.
        block_rows: Number of rows processed per block.

    Returns:
        A float32 array aligned to the frame's rows, with one trailing column
        per entry of ``qs`` (in the same order).
    """
    levels = np.asarray(qs, dtype=np.float64)

    def _per_block(chunk: NDArray[np.float32]) -> NDArray[np.float32]:
        # ``np.quantile`` puts the quantile axis first; move it to the end so
        # the leading axis stays aligned with the rows.
        per_level = np.asarray(
            np.quantile(chunk, levels, axis=-1), dtype=np.float32
        )
        return np.moveaxis(per_level, 0, -1)

    return np.asarray(
        block_apply(frame.values, block_rows, _per_block), dtype=np.float32
    )
|
|
@@ -0,0 +1,104 @@
|
|
|
1
|
+
"""Point estimates over the sample axis (ADR-017) — return a `(N, …, 1)` frame.
|
|
2
|
+
|
|
3
|
+
`map_estimate` is the maximum-a-posteriori estimate as faoapi/reporting compute it:
|
|
4
|
+
the empirical density peak (histogram), with a zero-mass→0 rule for the
|
|
5
|
+
zero-inflated conflict distributions. The mechanism reduces the **trailing** axis;
|
|
6
|
+
the leaf guarantees that axis is the sample axis (ADR-012).
|
|
7
|
+
|
|
8
|
+
The histogram is computed **batched in row-blocks** (no per-row Python loop) so
|
|
9
|
+
it scales to the full grid (register C-22). Blocking caps peak memory at
|
|
10
|
+
``O(block * bins)`` regardless of row count — a whole-grid batch would allocate a
|
|
11
|
+
``rows × bins`` counts matrix and re-introduce the #181 OOM. The batched binning
|
|
12
|
+
reproduces ``numpy.histogram``'s uniform-bin **counts** and breaks ties on the
|
|
13
|
+
integer counts (lowest-index), so the selected bin is **deterministic and
|
|
14
|
+
identical on every numpy version** (register C-24); the bin centre matches the
|
|
15
|
+
per-row reference to float32 precision (proven by `test_summarize_scale.py`).
|
|
16
|
+
"""
|
|
17
|
+
|
|
18
|
+
from __future__ import annotations
|
|
19
|
+
|
|
20
|
+
import numpy as np
|
|
21
|
+
from numpy.typing import NDArray
|
|
22
|
+
|
|
23
|
+
from views_frames_summarize._common import ROW_BLOCK, AnyFrame, rebuild
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def map_estimate(
    frame: AnyFrame,
    *,
    bins: int = 100,
    zero_mass_threshold: float = 0.3,
    block_rows: int = ROW_BLOCK,
) -> AnyFrame:
    """Per-row MAP estimate over the sample axis → a `(N, …, 1)` frame.

    For each row: if a fraction ``>= zero_mass_threshold`` of the samples is ~0
    (``np.isclose`` to ``0.0`` with ``atol=1e-8``) the MAP is ``0.0``; otherwise
    it is the centre of the densest histogram bin. All leading axes are
    flattened into rows, processed in blocks of ``block_rows`` rows to bound
    peak memory (register C-22), and reshaped back afterwards.

    Args:
        frame: A frame with a trailing sample axis.
        bins: Number of uniform histogram bins per row; must be positive.
        zero_mass_threshold: Minimum fraction of ~0 samples for the estimate to
            be forced to ``0.0``.
        block_rows: Number of flattened rows processed per block; must be
            positive.

    Returns:
        A new frame of the same type with the sample axis collapsed to size 1.

    Raises:
        ValueError: ``bins`` or ``block_rows`` is not a positive integer.
    """
    if bins < 1:
        raise ValueError(f"bins must be a positive integer, got {bins!r}")
    # A negative step would make the block loop below run zero times and hand
    # back the uninitialised ``np.empty`` buffer as if it were a result.
    if block_rows < 1:
        raise ValueError(
            f"block_rows must be a positive integer, got {block_rows!r}"
        )

    values = frame.values
    lead = values.shape[:-1]
    s = values.shape[-1]
    # Bin in the input dtype, exactly as the v0.2.0 per-row np.histogram did —
    # upcasting to float64 would shift the bin edges and pick a different mode.
    flat = np.ascontiguousarray(values).reshape(-1, s)

    result = np.empty(flat.shape[0], dtype=np.float32)
    for start in range(0, flat.shape[0], block_rows):
        block = flat[start : start + block_rows]
        centers = _batched_map(block, bins)
        mass_at_zero = np.mean(np.isclose(block, 0.0, atol=1e-8), axis=1)
        result[start : start + block_rows] = np.where(
            mass_at_zero >= zero_mass_threshold, 0.0, centers
        )

    reduced = result.reshape(lead)[..., np.newaxis]
    return rebuild(frame, reduced)
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
def _batched_map(flat: NDArray[np.float32], bins: int) -> NDArray[np.float32]:
    """Return, for every row of ``flat`` ``(M, S)``, the centre of its fullest bin.

    Each row gets its own ``bins`` uniform bins spanning that row's
    ``[min, max]``. The binning follows the steps of ``numpy.histogram``'s
    uniform-bin path (edges from ``linspace`` in the input dtype, index from
    the scaled offset, then a correction against the actual edges), applied to
    all rows at once.

    Args:
        flat: Two-dimensional block of samples, one distribution per row.
        bins: Number of uniform bins per row.

    Returns:
        A float32 array of length ``M`` with the midpoint of the bin holding
        the most samples in each row (lowest bin index on ties).
    """
    n_rows = flat.shape[0]
    bin_dtype = flat.dtype

    row_min = flat.min(axis=1)
    row_max = flat.max(axis=1)
    # Constant rows have zero width; widen them to (v - 0.5, v + 0.5) the way
    # numpy does so the division below stays well-defined.
    is_constant = row_min == row_max
    pad = np.array(0.5, dtype=bin_dtype)
    lo_edge = np.where(is_constant, row_min - pad, row_min)
    hi_edge = np.where(is_constant, row_max + pad, row_max)
    width = hi_edge - lo_edge

    # One row of ``bins + 1`` edges per input row, kept in the input dtype.
    edges = np.linspace(lo_edge, hi_edge, bins + 1, axis=1).astype(bin_dtype)

    # First guess at the bin index from the scaled offset into the row's range.
    scaled = ((flat - lo_edge[:, None]) / width[:, None]) * bins
    bin_idx = scaled.astype(np.intp)
    # The row maximum lands exactly on the last edge; it belongs to the last bin.
    bin_idx[bin_idx == bins] = bins - 1
    # Rounding correction: step down if the sample is left of its bin's left
    # edge, then step up if it is at/after the right edge (except in the last
    # bin, which is closed on the right). The order of these steps matters.
    left_edge = np.take_along_axis(edges, bin_idx, axis=1)
    bin_idx[flat < left_edge] -= 1
    right_edge = np.take_along_axis(edges, bin_idx + 1, axis=1)
    bin_idx[(flat >= right_edge) & (bin_idx != bins - 1)] += 1

    # Single ``bincount`` for the whole block: shift row r's indices into the
    # private range [r * bins, (r + 1) * bins).
    row_base = np.arange(n_rows)[:, None] * bins
    counts = np.bincount(
        (bin_idx + row_base).ravel(), minlength=n_rows * bins
    ).reshape(n_rows, bins)

    # Pick the winner on the **integer counts** rather than on a
    # ``counts / width`` density: the bins are uniform so both agree except on
    # ties, where float bin widths can differ by ~1 ulp across numpy versions
    # and flip the argmax (register C-24). Integer ``argmax`` returns the
    # lowest index among equal counts.
    peak = np.argmax(counts, axis=1)

    peak_lo = np.take_along_axis(edges, peak[:, None], axis=1)[:, 0]
    peak_hi = np.take_along_axis(edges, (peak + 1)[:, None], axis=1)[:, 0]
    return np.asarray((peak_lo + peak_hi) / 2.0, dtype=np.float32)
|
|
File without changes
|