views-frames 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- views_frames/__init__.py +41 -0
- views_frames/_typing.py +24 -0
- views_frames/_validation.py +114 -0
- views_frames/conformance/__init__.py +123 -0
- views_frames/feature_frame.py +188 -0
- views_frames/index.py +309 -0
- views_frames/io/__init__.py +13 -0
- views_frames/io/arrow.py +103 -0
- views_frames/io/npz.py +59 -0
- views_frames/metadata.py +36 -0
- views_frames/prediction_frame.py +143 -0
- views_frames/protocols.py +82 -0
- views_frames/py.typed +0 -0
- views_frames/spatial_level.py +41 -0
- views_frames/target_frame.py +138 -0
- views_frames-1.0.0.dist-info/METADATA +624 -0
- views_frames-1.0.0.dist-info/RECORD +27 -0
- views_frames-1.0.0.dist-info/WHEEL +4 -0
- views_frames-1.0.0.dist-info/licenses/LICENSE +21 -0
- views_frames_summarize/__init__.py +29 -0
- views_frames_summarize/_common.py +68 -0
- views_frames_summarize/aggregate.py +83 -0
- views_frames_summarize/collapse.py +37 -0
- views_frames_summarize/conformance.py +40 -0
- views_frames_summarize/interval.py +62 -0
- views_frames_summarize/point.py +104 -0
- views_frames_summarize/py.typed +0 -0
views_frames/index.py
ADDED
|
@@ -0,0 +1,309 @@
|
|
|
1
|
+
"""`SpatioTemporalIndex` — the genuinely-reused alignment primitive.
|
|
2
|
+
|
|
3
|
+
`{time, unit, level}` integer arrays plus **same-level** pure-numpy alignment
|
|
4
|
+
(intersect / reindex / is_superset_of / argsort / searchsorted). Cross-level
|
|
5
|
+
(cm↔pgm) alignment is exposed via `cross_level_align`, whose mapping is
|
|
6
|
+
**injected by the consumer** and never embedded or fetched here (ADR-014,
|
|
7
|
+
register C-14).
|
|
8
|
+
|
|
9
|
+
The same-level join is the pure-numpy unwrap of the proven
|
|
10
|
+
`pd.Index.get_indexer` pattern in `views-faoapi/.../data/handlers.py`.
|
|
11
|
+
"""
|
|
12
|
+
|
|
13
|
+
from __future__ import annotations
|
|
14
|
+
|
|
15
|
+
from collections.abc import Mapping
|
|
16
|
+
|
|
17
|
+
import numpy as np
|
|
18
|
+
from numpy.typing import NDArray
|
|
19
|
+
|
|
20
|
+
from views_frames._typing import IntArray
|
|
21
|
+
from views_frames._validation import validate_identifiers
|
|
22
|
+
from views_frames.spatial_level import SpatialLevel
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
class SpatioTemporalIndex:
    """An immutable ``{time, unit, level}`` row index with same-level alignment.

    **Row-uniqueness stance (register C-21).** A frame *may* contain duplicate
    ``(time, unit)`` rows — ``cross_level_align`` deliberately produces them (many
    pgm cells map to one country, to be summed by ``aggregate_distributions``), so
    uniqueness is **not** a global invariant and is **not** validated at
    construction. The **same-level joins** (``searchsorted``/``reindex``/
    ``intersect``/``is_superset_of``), however, assume one row per ``(time, unit)``
    and give undefined results on duplicates. A consumer that needs that guarantee
    should check :meth:`has_unique_rows` before joining; the check is never run
    implicitly.

    **Equality and hashing.** Two indexes are equal when their levels match and
    their ``time``/``unit`` arrays are element-wise equal (``np.array_equal``, so
    the integer dtype does not matter). The hash is computed from the
    int64-normalised keys so that equal indexes always hash alike.
    """

    def __init__(
        self,
        time: IntArray,
        unit: IntArray,
        level: SpatialLevel,
    ) -> None:
        """Validate and store the identifier arrays and the spatial level.

        Args:
            time: The time identifier per row.
            unit: The unit identifier per row.
            level: The spatial level shared by all rows.

        Raises:
            TypeError: ``level`` is not a ``SpatialLevel``.

        Any further validation of ``time``/``unit`` is delegated to
        ``validate_identifiers`` and fails with whatever that helper raises.
        """
        if not isinstance(level, SpatialLevel):
            raise TypeError(
                f"level must be a SpatialLevel, got {type(level).__name__}"
            )
        # The expected row count is taken from ``time``; -1 is handed to the
        # validator when ``time`` is not an ndarray with at least one axis.
        is_array = isinstance(time, np.ndarray) and time.ndim >= 1
        n = int(time.shape[0]) if is_array else -1
        validate_identifiers({"time": time, "unit": unit}, n_rows=n)
        # Store read-only so the value object cannot be mutated in place.
        # NOTE: ``np.ascontiguousarray`` returns the input object itself when it
        # is already a C-contiguous ndarray, so clearing the write flag below
        # also makes the caller's own array read-only in that case.
        self._time = np.ascontiguousarray(time)
        self._unit = np.ascontiguousarray(unit)
        self._time.setflags(write=False)
        self._unit.setflags(write=False)
        self._level = level

    # ---- core surface -------------------------------------------------------

    @property
    def time(self) -> IntArray:
        """The time identifier array (read-only)."""
        return self._time

    @property
    def unit(self) -> IntArray:
        """The unit identifier array (read-only)."""
        return self._unit

    @property
    def level(self) -> SpatialLevel:
        """The spatial level (cm/pgm) of these rows."""
        return self._level

    @property
    def n_rows(self) -> int:
        """Number of rows (the first axis length)."""
        return int(self._time.shape[0])

    @property
    def identifiers(self) -> dict[str, IntArray]:
        """The integer identifier arrays, keyed by name (``time``, ``unit``)."""
        return {"time": self._time, "unit": self._unit}

    def __len__(self) -> int:
        """Number of rows; same as :attr:`n_rows`."""
        return self.n_rows

    def __eq__(self, other: object) -> bool:
        """Value equality: same level and element-wise equal ``time``/``unit``."""
        if not isinstance(other, SpatioTemporalIndex):
            return NotImplemented
        return (
            self._level == other._level
            and np.array_equal(self._time, other._time)
            and np.array_equal(self._unit, other._unit)
        )

    def __hash__(self) -> int:
        """Hash consistent with :meth:`__eq__`.

        ``__eq__`` compares values irrespective of integer dtype, so hashing the
        raw buffers would give equal indexes (e.g. int32 vs int64) different
        hashes. The int64-normalised key array is hashed instead.
        """
        return hash((self._level, self._keys().tobytes()))

    # ---- internal key representation ---------------------------------------

    def _keys(self) -> NDArray[np.int64]:
        """A contiguous ``(N, 2)`` int64 ``(time, unit)`` key array."""
        # ``copy=False`` skips a redundant copy when the arrays are already
        # int64; ``np.stack`` allocates the fresh (N, 2) result either way.
        return np.ascontiguousarray(
            np.stack(
                [
                    self._time.astype(np.int64, copy=False),
                    self._unit.astype(np.int64, copy=False),
                ],
                axis=1,
            )
        )

    @staticmethod
    def _row_view(keys: NDArray[np.int64]) -> NDArray[np.void]:
        """View each ``(time, unit)`` row as a single void scalar for set ops."""
        return np.ascontiguousarray(keys).view(
            np.dtype((np.void, keys.dtype.itemsize * keys.shape[1]))
        ).reshape(-1)

    @staticmethod
    def _match_rows(
        haystack: NDArray[np.void], needles: NDArray[np.void]
    ) -> tuple[NDArray[np.intp], NDArray[np.bool_]]:
        """Locate each ``needles`` row inside ``haystack``.

        Returns ``(positions, found)``: ``positions[i]`` is an index into
        ``haystack`` that is only meaningful where ``found[i]`` is true. An
        empty ``haystack`` matches nothing (all ``found`` false) instead of
        indexing into an empty array.
        """
        if haystack.shape[0] == 0:
            return (
                np.zeros(needles.shape[0], dtype=np.intp),
                np.zeros(needles.shape[0], dtype=np.bool_),
            )
        order = np.argsort(haystack, kind="stable")
        sorted_rows = haystack[order]
        # searchsorted may return len(sorted_rows) for a needle past the end;
        # clip so the equality probe below stays in bounds.
        pos = np.clip(
            np.searchsorted(sorted_rows, needles), 0, len(sorted_rows) - 1
        )
        found = sorted_rows[pos] == needles
        return order[pos], found

    def _require_same_level(self, other: SpatioTemporalIndex) -> None:
        """Raise ``ValueError`` unless ``other`` has the same spatial level."""
        if self._level != other._level:
            raise ValueError(
                "same-level operation requires equal SpatialLevel; "
                f"got {self._level} and {other._level}. Use cross_level_align."
            )

    # ---- same-level alignment ----------------------------------------------

    def argsort(self) -> NDArray[np.intp]:
        """Positions that sort the rows by ``(time, unit)`` (time-major)."""
        # lexsort treats the LAST key as primary, hence (unit, time).
        return np.asarray(np.lexsort((self._unit, self._time)), dtype=np.intp)

    def searchsorted(self, other: SpatioTemporalIndex) -> NDArray[np.intp]:
        """For each row of ``other``, its position in ``self`` (-1 if absent).

        The pure-numpy analogue of ``pd.Index.get_indexer``: a same-level join.
        An empty ``self`` yields -1 for every row of ``other``.

        Raises:
            ValueError: ``other`` is at a different spatial level.
        """
        self._require_same_level(other)
        positions, found = self._match_rows(
            self._row_view(self._keys()), self._row_view(other._keys())
        )
        return np.where(found, positions, -1).astype(np.intp)

    def reindex(self, other: SpatioTemporalIndex) -> NDArray[np.intp]:
        """Alias of :meth:`searchsorted` — positions to align ``self`` to ``other``."""
        return self.searchsorted(other)

    def is_superset_of(self, other: SpatioTemporalIndex) -> bool:
        """True iff every row of ``other`` is present in ``self`` (same level).

        Raises:
            ValueError: ``other`` is at a different spatial level.
        """
        self._require_same_level(other)
        self_rows = self._row_view(self._keys())
        other_rows = self._row_view(other._keys())
        return bool(np.isin(other_rows, self_rows).all())

    def intersect(self, other: SpatioTemporalIndex) -> SpatioTemporalIndex:
        """A new index of the rows present in **both** ``self`` and ``other``.

        Rows come back in the order ``np.intersect1d`` produces over the byte
        views of the keys, which need not be the input order.

        Raises:
            ValueError: ``other`` is at a different spatial level.
        """
        self._require_same_level(other)
        common = np.intersect1d(
            self._row_view(self._keys()), self._row_view(other._keys())
        )
        # Undo the void view: every 16-byte row is one (time, unit) int64 pair.
        keys = common.view(np.int64).reshape(-1, 2)
        return SpatioTemporalIndex(
            time=keys[:, 0].copy(), unit=keys[:, 1].copy(), level=self._level
        )

    def has_unique_rows(self) -> bool:
        """True iff every ``(time, unit)`` row is unique (register C-21).

        Duplicates are **allowed** in a frame, but the same-level joins
        (``searchsorted``/``reindex``/``intersect``/``is_superset_of``) assume
        uniqueness. Call this before joining if the caller cannot otherwise
        guarantee it. ``O(n log n)``; not run by default.
        """
        rows = self._row_view(self._keys())
        return bool(len(np.unique(rows)) == rows.shape[0])

    def select(
        self, indexer: IntArray | NDArray[np.bool_]
    ) -> SpatioTemporalIndex:
        """A new index of the rows at integer positions **or** a boolean mask.

        The row-selection primitive the frame-level ``select``/``reindex`` build on:
        ``indexer`` is applied to ``time`` and ``unit`` by numpy fancy indexing
        (so an integer position array reorders/repeats, a boolean mask filters).
        """
        return SpatioTemporalIndex(
            time=self._time[indexer], unit=self._unit[indexer], level=self._level
        )

    # ---- cross-level alignment (ADR-014) -----------------------------------

    def cross_level_align(
        self,
        mapping: Mapping[tuple[int, int], int],
        target_level: SpatialLevel,
    ) -> SpatioTemporalIndex:
        """Remap each row's ``unit`` to ``target_level`` using an injected mapping.

        The cross-level (cm↔pgm) join needs an external, **time-varying**
        ``(time, unit) -> target_unit`` mapping (e.g. ``(month_id, priogrid_id) ->
        country_id``): a cell's country assignment changes by month, so the key is
        ``(time, unit)``, not ``unit`` alone (ADR-014; register C-20). The leaf owns
        this **operation**; the **mapping is supplied by the caller** and is never
        embedded or fetched here. Time is preserved.

        The remap is vectorized — ``(time, unit)`` keys are viewed as void scalars
        and matched with a single ``searchsorted`` against the sorted mapping keys —
        so it scales to the full grid (no per-row Python loop; register C-22).

        Args:
            mapping: A ``{(time, unit): target_unit}`` mapping injected by the
                consumer, keyed by the ``(time, unit)`` pair.
            target_level: The ``SpatialLevel`` of the produced index.

        Raises:
            ValueError: ``mapping`` is missing/empty, is not keyed by ``(time,
                unit)`` pairs, or a row's ``(time, unit)`` has no entry in
                ``mapping`` (the leaf never guesses a mapping).
            TypeError: ``target_level`` is not a ``SpatialLevel``.
        """
        if not isinstance(target_level, SpatialLevel):
            got = type(target_level).__name__
            raise TypeError(f"target_level must be a SpatialLevel, got {got}")
        if mapping is None or len(mapping) == 0:
            raise ValueError(
                "cross_level_align requires an injected (time, unit)->target_unit "
                "mapping; the leaf never embeds or fetches it (ADR-014)."
            )
        map_keys = np.array(list(mapping.keys()), dtype=np.int64)
        if map_keys.ndim != 2 or map_keys.shape[1] != 2:
            raise ValueError(
                "cross_level_align mapping must be keyed by (time, unit) pairs "
                "(register C-20); got keys that are not 2-tuples."
            )
        map_vals = np.array(list(mapping.values()), dtype=self._unit.dtype)
        return self._remap(map_keys, map_vals, target_level)

    def cross_level_align_arrays(
        self,
        map_keys: IntArray,
        map_vals: IntArray,
        target_level: SpatialLevel,
    ) -> SpatioTemporalIndex:
        """Columnar form of :meth:`cross_level_align` for grid-scale mappings.

        Identical semantics, but the ``(time, unit) -> target_unit`` mapping is
        injected as **parallel arrays** — ``map_keys`` of shape ``(M, 2)`` and
        ``map_vals`` of shape ``(M,)`` — rather than a Python ``dict``. At full-grid
        scale building and materializing a ~10.5M-key dict is the dominant cost
        (~30× slower, ~10× the memory of the columnar form; register C-26); a
        producer that already holds the mapping columnar passes it straight through.

        Raises:
            ValueError: ``map_keys`` is not ``(M, 2)``, ``map_vals`` is not length
                ``M``, the mapping is empty, or a row's ``(time, unit)`` is absent.
            TypeError: ``target_level`` is not a ``SpatialLevel``.
        """
        if not isinstance(target_level, SpatialLevel):
            got = type(target_level).__name__
            raise TypeError(f"target_level must be a SpatialLevel, got {got}")
        keys = np.ascontiguousarray(map_keys, dtype=np.int64)
        vals = np.asarray(map_vals)
        if keys.ndim != 2 or keys.shape[1] != 2:
            raise ValueError(
                "cross_level_align_arrays map_keys must be an (M, 2) array of "
                "(time, unit) rows (register C-20/C-26)."
            )
        if vals.shape != (keys.shape[0],):
            raise ValueError(
                "cross_level_align_arrays map_vals must be a length-M array "
                "aligned to map_keys."
            )
        if keys.shape[0] == 0:
            raise ValueError(
                "cross_level_align_arrays requires a non-empty mapping; the leaf "
                "never embeds or fetches it (ADR-014)."
            )
        return self._remap(keys, vals, target_level)

    def _remap(
        self, map_keys: IntArray, map_vals: IntArray, target_level: SpatialLevel
    ) -> SpatioTemporalIndex:
        """The vectorized ``(time, unit) -> target`` remap shared by both entries.

        ``map_keys`` is coerced to a contiguous int64 ``(M, 2)`` so the void-view
        keys match ``self``'s; a single ``searchsorted`` does the lookup; a missing
        ``(time, unit)`` fails loud with ``ValueError`` naming the first such row.
        The result keeps ``self``'s time values and ``unit`` dtype.
        """
        keys = np.ascontiguousarray(map_keys, dtype=np.int64)
        positions, found = self._match_rows(
            self._row_view(keys), self._row_view(self._keys())
        )
        if not bool(found.all()):
            # argmax over the negated mask gives the first unmatched row.
            miss = int(np.argmax(~found))
            t, u = int(self._time[miss]), int(self._unit[miss])
            raise ValueError(
                f"(time, unit) ({t}, {u}) has no entry in the injected mapping"
            )
        mapped = np.asarray(map_vals[positions], dtype=self._unit.dtype)
        return SpatioTemporalIndex(
            time=self._time.copy(), unit=mapped, level=target_level
        )
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
"""Serialization adapters — frame ↔ bytes *format*, never *transport* (ADR-009).
|
|
2
|
+
|
|
3
|
+
Two scalable formats; list-in-cell object-dtype is banned (README §7):
|
|
4
|
+
|
|
5
|
+
- `npz` — native ``values.npy`` + ``identifiers.npz`` (mmap-capable).
|
|
6
|
+
- `arrow` — flat-columnar parquet (the scalable interchange format).
|
|
7
|
+
|
|
8
|
+
`pyarrow` is imported only inside this subpackage, never in the core frames.
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
from __future__ import annotations
|
|
12
|
+
|
|
13
|
+
__all__: list[str] = []
|
views_frames/io/arrow.py
ADDED
|
@@ -0,0 +1,103 @@
|
|
|
1
|
+
"""Flat-columnar (parquet) serialization — the scalable interchange format.
|
|
2
|
+
|
|
3
|
+
One scalar cell per ``(time, unit, sample)`` (features become columns for a 3-D
|
|
4
|
+
feature frame); the scalable replacement for the banned list-in-cell encoding
|
|
5
|
+
(README §7). This is the **only** module permitted to import ``pyarrow`` (the
|
|
6
|
+
optional ``[arrow]`` extra). Operates on a frame's state dict (register C-09).
|
|
7
|
+
|
|
8
|
+
The reconstruction shape (``n_features`` / ``n_samples``) and the header (level,
|
|
9
|
+
metadata, feature_names) ride in the parquet schema key-value metadata so the
|
|
10
|
+
round-trip is exact.
|
|
11
|
+
"""
|
|
12
|
+
|
|
13
|
+
from __future__ import annotations
|
|
14
|
+
|
|
15
|
+
import json
|
|
16
|
+
from pathlib import Path
|
|
17
|
+
from typing import Any
|
|
18
|
+
|
|
19
|
+
import numpy as np
|
|
20
|
+
import pyarrow as pa
|
|
21
|
+
import pyarrow.parquet as pq
|
|
22
|
+
from numpy.typing import NDArray
|
|
23
|
+
|
|
24
|
+
from views_frames._typing import IntArray
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def save(
    path: Path | str,
    *,
    values: NDArray[np.float32],
    time: IntArray,
    unit: IntArray,
    level: str,
    metadata: dict[str, Any],
    feature_names: list[str] | None = None,
) -> None:
    """Write a frame's state as flat-columnar parquet (one scalar cell per row).

    A 2-D ``values`` is unpacked as ``(n_rows, n_samples)`` and stored in one
    ``value`` column; a 3-D ``values`` is unpacked as ``(n_rows, n_features,
    n_samples)`` and stored as one ``f{i}`` column per feature. ``time`` and
    ``unit`` are repeated once per sample and a ``sample`` column carries the
    sample position. The reconstruction header is JSON-encoded into the schema
    metadata under the ``views_frames`` key.

    Raises:
        ValueError: ``values`` is neither 2-D nor 3-D.
    """
    rank = values.ndim
    if rank == 2:
        n_rows, n_samples = values.shape
        n_features = 0
    elif rank == 3:
        n_rows, n_features, n_samples = values.shape
    else:
        raise ValueError(f"unsupported values.ndim={values.ndim}")

    flat_len = n_rows * n_samples
    columns: dict[str, NDArray[Any]] = {
        # each identifier is repeated once per sample ...
        "time": np.repeat(time, n_samples),
        "unit": np.repeat(unit, n_samples),
        # ... and the sample position cycles 0..n_samples-1 within every row
        "sample": np.tile(np.arange(n_samples, dtype=np.int32), n_rows),
    }
    if rank == 2:
        columns["value"] = values.reshape(flat_len)
    else:
        for feature in range(n_features):
            per_feature = np.ascontiguousarray(values[:, feature, :])
            columns[f"f{feature}"] = per_feature.reshape(flat_len)

    # Everything load() needs to rebuild the original array shape and header.
    header = {
        "level": level,
        "metadata": metadata,
        "feature_names": feature_names,
        "n_features": n_features,
        "n_samples": n_samples,
        "ndim": int(rank),
    }
    schema_metadata = {"views_frames": json.dumps(header)}
    table = pa.table(columns).replace_schema_metadata(schema_metadata)
    pq.write_table(table, str(path))
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
def load(path: Path | str) -> dict[str, Any]:
    """Read a flat-columnar parquet frame state written by :func:`save`.

    The header is decoded from the ``views_frames`` entry of the parquet schema
    metadata. ``time`` and ``unit`` are recovered by taking the first of every
    ``n_samples`` consecutive cells; ``values`` is rebuilt as ``(n, n_samples)``
    or ``(n, n_features, n_samples)`` float32 according to the stored ``ndim``.

    Returns:
        A state dict with keys ``values``, ``time``, ``unit``, ``level``,
        ``metadata`` and ``feature_names``.
    """
    table = pq.read_table(str(path))
    schema_metadata = table.schema.metadata or {}
    header = json.loads(schema_metadata[b"views_frames"].decode())
    n_samples = int(header["n_samples"])
    rank = int(header["ndim"])

    flat_time = table.column("time").to_numpy()
    flat_unit = table.column("unit").to_numpy()
    n_rows = flat_time.shape[0] // n_samples
    grid = (n_rows, n_samples)
    # identifiers were repeated per sample on write; column 0 undoes that
    row_time = flat_time.reshape(grid)[:, 0]
    row_unit = flat_unit.reshape(grid)[:, 0]

    def as_grid(column_name: str) -> NDArray[Any]:
        """Fetch one flat value column and fold it back to (rows, samples)."""
        return table.column(column_name).to_numpy().reshape(grid)

    if rank == 2:
        values = as_grid("value").astype(np.float32)
    else:
        n_features = int(header["n_features"])
        per_feature = [as_grid(f"f{feature}") for feature in range(n_features)]
        values = np.stack(per_feature, axis=1).astype(np.float32)

    state: dict[str, Any] = {
        "values": values,
        "time": np.ascontiguousarray(row_time),
        "unit": np.ascontiguousarray(row_unit),
        "level": header["level"],
    }
    state["metadata"] = header.get("metadata", {})
    state["feature_names"] = header.get("feature_names")
    return state
|
views_frames/io/npz.py
ADDED
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
"""Native serialization: ``values.npy`` + ``identifiers.npz`` (+ JSON header).
|
|
2
|
+
|
|
3
|
+
Operates on a frame's **state dict** — it carries no per-frame schema (register
|
|
4
|
+
C-09); each frame maps its fields to/from the state. The ``mmap`` path returns a
|
|
5
|
+
read-only memmap and preserves the subclass so peak RAM stays the working set
|
|
6
|
+
(register C-07, README §7) — the proven ``PredictionFrame`` idiom.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
import json
|
|
12
|
+
from pathlib import Path
|
|
13
|
+
from typing import Any, Literal
|
|
14
|
+
|
|
15
|
+
import numpy as np
|
|
16
|
+
from numpy.typing import NDArray
|
|
17
|
+
|
|
18
|
+
from views_frames._typing import IntArray
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def save(
    directory: Path | str,
    *,
    values: NDArray[np.float32],
    time: IntArray,
    unit: IntArray,
    level: str,
    metadata: dict[str, Any],
    feature_names: list[str] | None = None,
) -> None:
    """Write a frame's state (npy values + npz identifiers + json header).

    Creates ``directory`` (and any missing parents) and writes three files into
    it: ``values.npy``, ``identifiers.npz`` (arrays ``time`` and ``unit``) and
    ``header.json``. The header always holds ``level`` and ``metadata``;
    ``feature_names`` is included only when it is not ``None``. Header values
    that JSON cannot encode natively are written via ``str``.
    """
    target = Path(directory)
    target.mkdir(parents=True, exist_ok=True)

    np.save(target / "values.npy", values)
    np.savez(target / "identifiers.npz", time=time, unit=unit)

    optional = {} if feature_names is None else {"feature_names": feature_names}
    header: dict[str, Any] = {"level": level, "metadata": metadata, **optional}
    # sort_keys makes the header text independent of insertion order
    (target / "header.json").write_text(
        json.dumps(header, sort_keys=True, default=str)
    )
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def load(directory: Path | str, *, mmap: bool = False) -> dict[str, Any]:
    """Read a frame's state; ``mmap=True`` returns ``values`` as a read-only memmap.

    Reads ``values.npy``, ``identifiers.npz`` and ``header.json`` from
    ``directory``. Only ``values`` is affected by ``mmap``; the identifier
    arrays are always read into memory.

    Returns:
        A state dict with keys ``values``, ``time``, ``unit``, ``level``,
        ``metadata`` (``{}`` when the header has none) and ``feature_names``
        (``None`` when the header has none).
    """
    root = Path(directory)
    mmap_mode: Literal["r"] | None = None
    if mmap:
        mmap_mode = "r"
    values = np.load(root / "values.npy", mmap_mode=mmap_mode)

    # pull both arrays out before the archive handle is closed
    with np.load(root / "identifiers.npz") as archive:
        time, unit = archive["time"], archive["unit"]

    header = json.loads((root / "header.json").read_text())
    state: dict[str, Any] = {
        "values": values,
        "time": time,
        "unit": unit,
        "level": header["level"],
    }
    state["metadata"] = header.get("metadata", {})
    state["feature_names"] = header.get("feature_names")
    return state
|
views_frames/metadata.py
ADDED
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
"""`FrameMetadata` — the typed, optional-extensible provenance header (ADR-013).
|
|
2
|
+
|
|
3
|
+
Not a free-form dict: a frozen dataclass with all-optional, validated fields, so
|
|
4
|
+
adding a field is a MINOR change and consumers cannot diverge on key names (the
|
|
5
|
+
store-side cause of reporting's C-48). It is the typed home for run/eval identity.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
from collections.abc import Mapping
|
|
11
|
+
from dataclasses import dataclass, fields
|
|
12
|
+
from typing import Any
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
@dataclass(frozen=True)
class FrameMetadata:
    """Optional provenance carried by a frame. All fields default to ``None``.

    Instances are frozen; ``to_dict``/``from_dict`` convert to and from a plain
    mapping that contains only the fields that are set.
    """

    # Field meanings are inferred from their names only — confirm against ADR-013.
    model: str | None = None
    run_type: str | None = None
    timestamp: int | None = None
    seed: int | None = None

    def to_dict(self) -> dict[str, Any]:
        """Serialize to a plain dict, omitting unset (``None``) fields.

        Keys appear in field-declaration order; falsy values other than
        ``None`` (such as ``0``) are kept.
        """
        populated: dict[str, Any] = {}
        for spec in fields(self):
            value = getattr(self, spec.name)
            if value is not None:
                populated[spec.name] = value
        return populated

    @classmethod
    def from_dict(cls, data: Mapping[str, Any]) -> FrameMetadata:
        """Reconstruct from a dict, ignoring unknown keys (forward-compatible).

        Keys that do not name a declared field are dropped; fields missing
        from ``data`` keep their ``None`` default.
        """
        accepted = {spec.name for spec in fields(cls)}
        kwargs: dict[str, Any] = {}
        for key, value in data.items():
            if key in accepted:
                kwargs[key] = value
        return cls(**kwargs)
|
|
@@ -0,0 +1,143 @@
|
|
|
1
|
+
"""`PredictionFrame` — model outputs (ŷ samples): ``y_pred (N, S)`` float32.
|
|
2
|
+
|
|
3
|
+
A sibling frame (no shared base; ADR-011 Option C). Relocated from
|
|
4
|
+
views-pipeline-core and rewritten **numpy-only** — the original imports pandas
|
|
5
|
+
(``pd.isna``); here identifier validation is the integer-dtype check in
|
|
6
|
+
``_validation`` (register C-17). The sample axis is always explicit (`S >= 1`;
|
|
7
|
+
ADR-012).
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
from __future__ import annotations
|
|
11
|
+
|
|
12
|
+
from pathlib import Path
|
|
13
|
+
|
|
14
|
+
import numpy as np
|
|
15
|
+
from numpy.typing import NDArray
|
|
16
|
+
|
|
17
|
+
from views_frames._typing import IntArray
|
|
18
|
+
from views_frames._validation import coerce_values, validate_values
|
|
19
|
+
from views_frames.index import SpatioTemporalIndex
|
|
20
|
+
from views_frames.io import npz
|
|
21
|
+
from views_frames.metadata import FrameMetadata
|
|
22
|
+
from views_frames.spatial_level import SpatialLevel
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
class PredictionFrame:
    """Immutable model-output frame: ``(N, S)`` float32 + a spatiotemporal index.

    Holds the prediction values, the row index whose length must match the
    first axis of the values, and a ``FrameMetadata`` header.
    """

    def __init__(
        self,
        y_pred: object,
        index: SpatioTemporalIndex,
        metadata: FrameMetadata | None = None,
    ) -> None:
        """Coerce and validate ``y_pred``, then bind it to ``index``.

        Args:
            y_pred: The prediction values; passed through ``coerce_values`` and
                ``validate_values`` before the shape checks below.
            index: The row index; must have as many rows as ``y_pred``.
            metadata: Provenance header; an empty ``FrameMetadata`` when omitted.

        Raises:
            ValueError: The coerced values are not 2-D, or their row count
                differs from ``index.n_rows``.
        """
        arr = coerce_values(y_pred)
        validate_values(arr)
        if arr.ndim != 2:
            raise ValueError(
                f"PredictionFrame y_pred must be 2D (N, S), got ndim={arr.ndim}"
            )
        if arr.shape[0] != index.n_rows:
            raise ValueError(
                f"y_pred has {arr.shape[0]} rows but index has {index.n_rows}"
            )
        self._values = arr
        self._index = index
        self._metadata = FrameMetadata() if metadata is None else metadata

    # ---- core surface -------------------------------------------------------

    @property
    def values(self) -> NDArray[np.float32]:
        """The ``(N, S)`` float32 value array."""
        return self._values

    @property
    def index(self) -> SpatioTemporalIndex:
        """The spatiotemporal row index."""
        return self._index

    @property
    def metadata(self) -> FrameMetadata:
        """The typed provenance header."""
        return self._metadata

    @property
    def n_rows(self) -> int:
        """Number of rows ``N`` (first axis of the values)."""
        n_rows, *_ = self._values.shape
        return int(n_rows)

    @property
    def identifiers(self) -> dict[str, IntArray]:
        """The integer identifier arrays from the index."""
        return self._index.identifiers

    @property
    def sample_count(self) -> int:
        """Size of the trailing sample axis ``S``."""
        *_, n_samples = self._values.shape
        return int(n_samples)

    @property
    def is_sample(self) -> bool:
        """True iff ``sample_count > 1``."""
        return self.sample_count > 1

    # ---- operations ---------------------------------------------------------

    def with_metadata(self, metadata: FrameMetadata) -> PredictionFrame:
        """Return a new frame with replaced metadata, **sharing** the values buffer.

        The constructor is bypassed, so nothing is re-coerced or re-validated
        and the values array and index are the very same objects.
        """
        clone = PredictionFrame.__new__(PredictionFrame)
        clone._metadata = metadata
        clone._index = self._index
        clone._values = self._values
        return clone

    def select(self, indexer: IntArray | NDArray[np.bool_]) -> PredictionFrame:
        """A new frame of the rows at integer positions **or** a boolean mask.

        Rows are selected by numpy fancy indexing — an integer array reorders or
        repeats, a boolean mask filters. Metadata is preserved; the selection
        **copies** (the result does not share the buffer). An empty selection
        yields an empty frame.
        """
        picked_values = self._values[indexer]
        picked_index = self._index.select(indexer)
        return PredictionFrame(picked_values, picked_index, self._metadata)

    def reindex(self, other: SpatioTemporalIndex) -> PredictionFrame:
        """Align this frame to ``other``'s rows, returning a new frame.

        Fails loud unless this frame's index is a **superset** of ``other`` (so
        every target row is present). The frame-level companion to the index's
        ``reindex``/``searchsorted``, which return positions rather than a frame.

        Raises:
            ValueError: Some row of ``other`` is absent from this frame's index.
        """
        if self._index.is_superset_of(other):
            return self.select(self._index.searchsorted(other))
        raise ValueError(
            "reindex requires this frame's index to be a superset of `other`; "
            "some target rows are absent"
        )

    # ---- persistence --------------------------------------------------------

    def save(self, directory: Path | str) -> None:
        """Serialize to ``directory`` (npy + npz + header)."""
        row_index = self._index
        npz.save(
            directory,
            values=self._values,
            time=row_index.time,
            unit=row_index.unit,
            level=row_index.level.value,
            metadata=self._metadata.to_dict(),
        )

    @classmethod
    def load(cls, directory: Path | str, mmap: bool = False) -> PredictionFrame:
        """Deserialize a frame from ``directory``; ``mmap`` propagates.

        The frame is rebuilt through the regular constructor, so the loaded
        state goes through the same coercion and validation as fresh input.
        """
        state = npz.load(directory, mmap=mmap)
        row_index = SpatioTemporalIndex(
            time=state["time"],
            unit=state["unit"],
            level=SpatialLevel(state["level"]),
        )
        header = FrameMetadata.from_dict(state["metadata"])
        return cls(state["values"], row_index, header)
|