pyvista-zstd 0.2.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pyvista_zstd/__init__.py +31 -0
- pyvista_zstd/_version.py +24 -0
- pyvista_zstd/append.py +553 -0
- pyvista_zstd/pyvista_zstd.py +1864 -0
- pyvista_zstd-0.2.4.dist-info/METADATA +340 -0
- pyvista_zstd-0.2.4.dist-info/RECORD +10 -0
- pyvista_zstd-0.2.4.dist-info/WHEEL +5 -0
- pyvista_zstd-0.2.4.dist-info/entry_points.txt +7 -0
- pyvista_zstd-0.2.4.dist-info/licenses/LICENSE +21 -0
- pyvista_zstd-0.2.4.dist-info/top_level.txt +1 -0
pyvista_zstd/__init__.py
ADDED
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
"""VTK zstandard compression library."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from importlib.metadata import PackageNotFoundError
|
|
6
|
+
from importlib.metadata import version
|
|
7
|
+
|
|
8
|
+
try:
|
|
9
|
+
__version__ = version("pyvista-zstd")
|
|
10
|
+
except PackageNotFoundError: # pragma: no cover
|
|
11
|
+
__version__ = "unknown"
|
|
12
|
+
|
|
13
|
+
from pyvista_zstd.append import AppendReader
|
|
14
|
+
from pyvista_zstd.append import append_arrays
|
|
15
|
+
from pyvista_zstd.append import read_array
|
|
16
|
+
from pyvista_zstd.pyvista_zstd import FILE_VERSION
|
|
17
|
+
from pyvista_zstd.pyvista_zstd import Reader
|
|
18
|
+
from pyvista_zstd.pyvista_zstd import Writer
|
|
19
|
+
from pyvista_zstd.pyvista_zstd import read
|
|
20
|
+
from pyvista_zstd.pyvista_zstd import write
|
|
21
|
+
|
|
22
|
+
__all__ = [
|
|
23
|
+
"FILE_VERSION",
|
|
24
|
+
"AppendReader",
|
|
25
|
+
"Reader",
|
|
26
|
+
"Writer",
|
|
27
|
+
"append_arrays",
|
|
28
|
+
"read",
|
|
29
|
+
"read_array",
|
|
30
|
+
"write",
|
|
31
|
+
]
|
pyvista_zstd/_version.py
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
# file generated by vcs-versioning
|
|
2
|
+
# don't change, don't track in version control
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
__all__ = [
|
|
6
|
+
"__version__",
|
|
7
|
+
"__version_tuple__",
|
|
8
|
+
"version",
|
|
9
|
+
"version_tuple",
|
|
10
|
+
"__commit_id__",
|
|
11
|
+
"commit_id",
|
|
12
|
+
]
|
|
13
|
+
|
|
14
|
+
version: str
|
|
15
|
+
__version__: str
|
|
16
|
+
__version_tuple__: tuple[int | str, ...]
|
|
17
|
+
version_tuple: tuple[int | str, ...]
|
|
18
|
+
commit_id: str | None
|
|
19
|
+
__commit_id__: str | None
|
|
20
|
+
|
|
21
|
+
__version__ = version = '0.2.4'
|
|
22
|
+
__version_tuple__ = version_tuple = (0, 2, 4)
|
|
23
|
+
|
|
24
|
+
__commit_id__ = commit_id = None
|
pyvista_zstd/append.py
ADDED
|
@@ -0,0 +1,553 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Incremental append + partial/columnar read for the ``.pv`` container.
|
|
3
|
+
|
|
4
|
+
Motivation
|
|
5
|
+
----------
|
|
6
|
+
The upstream :class:`pyvista_zstd.Writer` serialises a whole dataset in
|
|
7
|
+
one shot: every array is independently zstd-compressed into a pair of
|
|
8
|
+
frames (``[packed-metadata-frame][raw-array-frame]``), the frames are
|
|
9
|
+
concatenated, and a fixed-width footer table plus a trailing
|
|
10
|
+
``<Q>`` frame-count is written at the very end of the file.
|
|
11
|
+
|
|
12
|
+
Because the footer lives at the *end* and every body frame is addressed
|
|
13
|
+
by an absolute cumulative-compressed-byte offset, a new block can be
|
|
14
|
+
appended by:
|
|
15
|
+
|
|
16
|
+
1. truncating the file at the start of the existing footer
|
|
17
|
+
(``frame_ends[-1]``), leaving every previously-committed compressed
|
|
18
|
+
frame byte-for-byte untouched,
|
|
19
|
+
2. writing the new block's compressed frames after it,
|
|
20
|
+
3. writing a *new* footer that covers the old frames (offsets unchanged)
|
|
21
|
+
plus the new ones, then the new ``<Q>`` count.
|
|
22
|
+
|
|
23
|
+
This is what :func:`append_arrays` does. No previously-written
|
|
24
|
+
compressed frame is ever re-read or re-compressed.
|
|
25
|
+
|
|
26
|
+
On-disk layout (unchanged from upstream)
|
|
27
|
+
-----------------------------------------
|
|
28
|
+
::
|
|
29
|
+
|
|
30
|
+
[frame 0 ........ frame N-1] # body: 2 frames per array
|
|
31
|
+
[<QQ> x N: (cum_compressed_end, decompressed_size)] # footer table
|
|
32
|
+
[<Q>: N] # trailing frame count
|
|
33
|
+
|
|
34
|
+
Appended arrays are recorded three ways so they round-trip through the
|
|
35
|
+
*unmodified* upstream reader:
|
|
36
|
+
|
|
37
|
+
* their two frames are added to the body and footer,
|
|
38
|
+
* their names are added to the file-metadata ``frame_names`` list, and
|
|
39
|
+
* they are registered as ``field_data`` arrays on the root dataset's
|
|
40
|
+
:class:`~pyvista_zstd.pyvista_zstd.DataSetMetadata` (under the
|
|
41
|
+
``…__field_data`` suffix) so :py:attr:`Reader.available_field_arrays`
|
|
42
|
+
and :meth:`Reader.read` surface them without any reader change.
|
|
43
|
+
|
|
44
|
+
Crash safety
|
|
45
|
+
------------
|
|
46
|
+
The append is staged into a sibling temp file (a byte-copy of the body
|
|
47
|
+
prefix + new frames + new footer) and then :func:`os.replace`-d over the
|
|
48
|
+
original. ``os.replace`` is atomic on POSIX and Windows, so an
|
|
49
|
+
interrupted append either leaves the original fully intact or completes;
|
|
50
|
+
it can never leave a half-written footer that destroys committed blocks.
|
|
51
|
+
|
|
52
|
+
Partial / columnar read
|
|
53
|
+
-----------------------
|
|
54
|
+
:func:`read_array` (and :class:`AppendReader`) decompress exactly the
|
|
55
|
+
two frames of one named block — never the rest of the file — so a single
|
|
56
|
+
array can be loaded back without touching any other block.
|
|
57
|
+
"""
|
|
58
|
+
|
|
59
|
+
from __future__ import annotations
|
|
60
|
+
|
|
61
|
+
import json
|
|
62
|
+
import os
|
|
63
|
+
from pathlib import Path
|
|
64
|
+
import struct
|
|
65
|
+
from typing import IO
|
|
66
|
+
from typing import TYPE_CHECKING
|
|
67
|
+
|
|
68
|
+
import numpy as np
|
|
69
|
+
import zstandard as zstd
|
|
70
|
+
|
|
71
|
+
from pyvista_zstd.pyvista_zstd import _FILTER_NONE
|
|
72
|
+
from pyvista_zstd.pyvista_zstd import _FILTER_SHUFFLE
|
|
73
|
+
from pyvista_zstd.pyvista_zstd import DS_METADATA_KEY
|
|
74
|
+
from pyvista_zstd.pyvista_zstd import FIELD_DATA_SUFFIX
|
|
75
|
+
from pyvista_zstd.pyvista_zstd import FILE_METADATA_KEY
|
|
76
|
+
from pyvista_zstd.pyvista_zstd import FILE_VERSION
|
|
77
|
+
from pyvista_zstd.pyvista_zstd import MULTIBLOCK_METADATA_KEY
|
|
78
|
+
from pyvista_zstd.pyvista_zstd import SUPPORTED_READ_SUFFIXES
|
|
79
|
+
from pyvista_zstd.pyvista_zstd import UID_N_CHAR
|
|
80
|
+
from pyvista_zstd.pyvista_zstd import ArrayInfo
|
|
81
|
+
from pyvista_zstd.pyvista_zstd import DataSetMetadata
|
|
82
|
+
from pyvista_zstd.pyvista_zstd import ZstdFileMetadata
|
|
83
|
+
from pyvista_zstd.pyvista_zstd import _pack_array_metadata
|
|
84
|
+
from pyvista_zstd.pyvista_zstd import _reconstruct_array
|
|
85
|
+
from pyvista_zstd.pyvista_zstd import _resolve_shuffle
|
|
86
|
+
from pyvista_zstd.pyvista_zstd import _shuffle_bytes
|
|
87
|
+
|
|
88
|
+
if TYPE_CHECKING: # pragma: no cover - typing only
|
|
89
|
+
from collections.abc import Iterable
|
|
90
|
+
|
|
91
|
+
from numpy.typing import NDArray
|
|
92
|
+
|
|
93
|
+
from pyvista_zstd.pyvista_zstd import ShuffleSpec
|
|
94
|
+
|
|
95
|
+
__all__ = [
|
|
96
|
+
"AppendReader",
|
|
97
|
+
"append_arrays",
|
|
98
|
+
"read_array",
|
|
99
|
+
]
|
|
100
|
+
|
|
101
|
+
# Mirror the upstream Reader's sanity bound on the trailing frame count.
|
|
102
|
+
_MAX_FRAMES = 1_000_000
|
|
103
|
+
|
|
104
|
+
|
|
105
|
+
def _read_footer(f: IO[bytes]) -> tuple[int, list[tuple[int, int]]]:
|
|
106
|
+
"""
|
|
107
|
+
Return ``(num_frames, frame_meta)`` from an open ``"rb"`` handle.
|
|
108
|
+
|
|
109
|
+
``frame_meta[i]`` is ``(cumulative_compressed_end, decompressed_size)``
|
|
110
|
+
exactly as written by :meth:`pyvista_zstd.Writer.write`.
|
|
111
|
+
"""
|
|
112
|
+
f.seek(-8, os.SEEK_END)
|
|
113
|
+
num_frames = struct.unpack("<Q", f.read(8))[0]
|
|
114
|
+
if num_frames == 0 or num_frames > _MAX_FRAMES:
|
|
115
|
+
msg = "Bad number of frames. File may be corrupted."
|
|
116
|
+
raise RuntimeError(msg)
|
|
117
|
+
f.seek(-(8 + num_frames * UID_N_CHAR), os.SEEK_END)
|
|
118
|
+
raw = f.read(num_frames * UID_N_CHAR)
|
|
119
|
+
frame_meta = [struct.unpack("<QQ", raw[i * UID_N_CHAR : (i + 1) * UID_N_CHAR]) for i in range(num_frames)]
|
|
120
|
+
return num_frames, frame_meta
|
|
121
|
+
|
|
122
|
+
|
|
123
|
+
def _frame_bounds(frame_meta: list[tuple[int, int]]) -> tuple[list[int], list[int]]:
|
|
124
|
+
"""Return per-frame ``(starts, ends)`` of compressed byte ranges."""
|
|
125
|
+
ends = [end for end, _ in frame_meta]
|
|
126
|
+
starts = [0, *ends[:-1]]
|
|
127
|
+
return starts, ends
|
|
128
|
+
|
|
129
|
+
|
|
130
|
+
def _decompress_two_blobs(
    meta_blob: bytes,
    data_blob: bytes,
    decompressed_sizes: tuple[int, int],
) -> tuple[str, NDArray]:
    """
    Rebuild one array from its metadata frame and its raw-data frame.

    Both frames are decompressed in a single batched call, with the expected
    output sizes passed as a packed ``<QQ>`` buffer, and the two resulting
    segments are handed to ``_reconstruct_array``.
    """
    packed_sizes = struct.pack("<QQ", *decompressed_sizes)
    decompressor = zstd.ZstdDecompressor()
    result = decompressor.multi_decompress_to_buffer(
        [meta_blob, data_blob],
        decompressed_sizes=packed_sizes,
        threads=0,
    )
    meta_segment = result[0]
    data_segment = result[1]
    return _reconstruct_array(meta_segment, data_segment)
|
|
143
|
+
|
|
144
|
+
|
|
145
|
+
def _read_frame_pair(f: IO[bytes], starts: list[int], ends: list[int], ai: int) -> tuple[bytes, bytes]:
|
|
146
|
+
"""Seek-read the two compressed frames of array index ``ai`` from ``f``."""
|
|
147
|
+
mi, di = ai * 2, ai * 2 + 1
|
|
148
|
+
f.seek(starts[mi])
|
|
149
|
+
meta_blob = f.read(ends[mi] - starts[mi])
|
|
150
|
+
f.seek(starts[di])
|
|
151
|
+
data_blob = f.read(ends[di] - starts[di])
|
|
152
|
+
return meta_blob, data_blob
|
|
153
|
+
|
|
154
|
+
|
|
155
|
+
def _load_file_meta_and_root_ds(
    f: IO[bytes],
    frame_meta: list[tuple[int, int]],
) -> tuple[ZstdFileMetadata, str, DataSetMetadata]:
    """
    Decode the file-metadata array and the root dataset-metadata array.

    ``f`` is an open seekable ``"rb"`` handle. Only the frame pairs of those
    two metadata arrays are read and decompressed; no other frame is touched.

    Returns
    -------
    tuple[ZstdFileMetadata, str, DataSetMetadata]
        ``(file_meta, root_ds_id, root_ds_meta)``, where ``root_ds_id`` is the
        first ``UID_N_CHAR`` characters of the root metadata frame name.

    Raises
    ------
    NotImplementedError
        If any frame name ends with ``MULTIBLOCK_METADATA_KEY``.
    RuntimeError
        If no frame name ends with ``DS_METADATA_KEY``.

    """
    frame_starts, frame_ends = _frame_bounds(frame_meta)

    def _load(array_idx: int, sizes: tuple[int, int]) -> NDArray:
        # Read one array's frame pair and return only the decoded array.
        meta_blob, data_blob = _read_frame_pair(f, frame_starts, frame_ends, array_idx)
        return _decompress_two_blobs(meta_blob, data_blob, sizes)[1]

    # The file-metadata array is located positionally: it is always last.
    last_array_idx = len(frame_meta) // 2 - 1
    file_meta_arr = _load(last_array_idx, (frame_meta[-2][1], frame_meta[-1][1]))
    file_meta = ZstdFileMetadata.from_json(file_meta_arr.tobytes().decode("utf-8"))

    names = file_meta.frame_names

    # The multiblock key also ends with the dataset key, so it must be ruled
    # out first; such a file has no single root dataset to append to.
    for candidate in names:
        if candidate.endswith(MULTIBLOCK_METADATA_KEY):
            msg = "appending to MultiBlock .pv files is not supported."
            raise NotImplementedError(msg)

    # The first frame whose name carries the dataset-metadata key is the root.
    root_idx = next(
        (idx for idx, candidate in enumerate(names) if candidate.endswith(DS_METADATA_KEY)),
        None,
    )
    if root_idx is None:  # pragma: no cover - malformed file
        msg = "No dataset metadata frame found; cannot append."
        raise RuntimeError(msg)

    root_ds_id = names[root_idx][:UID_N_CHAR]
    root_sizes = (frame_meta[root_idx * 2][1], frame_meta[root_idx * 2 + 1][1])
    root_ds_meta = DataSetMetadata.from_array(_load(root_idx, root_sizes))
    return file_meta, root_ds_id, root_ds_meta
|
|
200
|
+
|
|
201
|
+
|
|
202
|
+
def _compress_frames(payloads: list[bytes], *, level: int) -> list[bytes]:
    """
    Compress every payload into its own zstd frame.

    The payloads are compressed in one batched call at ``level`` and returned
    as independent ``bytes`` objects, in the same order as ``payloads``.
    """
    compressor = zstd.ZstdCompressor(level=level, threads=0)
    collection = compressor.multi_compress_to_buffer(payloads, threads=0)
    frames: list[bytes] = []
    for idx in range(len(collection)):
        frames.append(collection[idx].tobytes())
    return frames
|
|
207
|
+
|
|
208
|
+
|
|
209
|
+
def append_arrays(  # noqa: C901, PLR0912, PLR0915
    filename: Path | str,
    arrays: dict[str, NDArray],
    *,
    level: int | None = None,
    shuffle: ShuffleSpec = False,
) -> None:
    """
    Append named arrays to an existing ``.pv`` file.

    Each array is stored as an independently-zstd-compressed
    ``field_data`` block. Previously-written array frames are **never**
    decompressed or re-compressed: their compressed bytes are copied
    verbatim into a sibling temp file. Only the footer and the
    file/dataset metadata frames are regenerated, and the staged file is
    then moved over the original with :meth:`pathlib.Path.replace`.

    Parameters
    ----------
    filename : pathlib.Path | str
        Path to an existing ``.pv`` file (written by
        :func:`pyvista_zstd.write` or a previous :func:`append_arrays`).
    arrays : dict[str, numpy.ndarray]
        Mapping of ``name -> array``. Names must not already exist as a
        field array in the file. An empty mapping is a no-op.
    level : int, optional
        zstd compression level for the new blocks. Defaults to the
        file's recorded ``compression_level`` so appended blocks match
        the original.
    shuffle : {"auto", True, False}, default: False
        Optionally apply the reversible byte-shuffle pre-filter to the
        appended blocks (see :func:`pyvista_zstd.write`). Disabled by
        default; ``"auto"`` shuffles a multibyte floating-point array only
        when a quick trial compression shows it shrinks the data. When any
        appended block is shuffled the file version is raised to at least
        ``FILE_VERSION``; already-written frames are untouched and keep
        their own per-block encoding.

    Raises
    ------
    ValueError
        If ``filename`` does not end in one of ``SUPPORTED_READ_SUFFIXES``,
        or if a name in ``arrays`` already exists as a field array.
    NotImplementedError
        If the file is a MultiBlock file (raised while loading metadata).
    RuntimeError
        If the footer or the frame-name table is malformed, or the source
        is truncated while kept frames are being copied.

    Notes
    -----
    The appended blocks become visible as ``field_data`` arrays:
    :meth:`pyvista_zstd.Reader.read` returns them in ``grid.field_data``
    and :py:attr:`pyvista_zstd.Reader.available_field_arrays` lists them.
    Use :func:`read_array` / :class:`AppendReader` to read one block back
    without decompressing the rest of the file.

    """
    path = Path(filename)
    if path.suffix not in SUPPORTED_READ_SUFFIXES:
        msg = f"Filename must end in one of {SUPPORTED_READ_SUFFIXES}, not '{path.suffix}'"
        raise ValueError(msg)
    # Nothing to append: leave the file completely untouched.
    if not arrays:
        return

    with path.open("rb") as f:
        _num_frames, frame_meta = _read_footer(f)
        file_meta, ds_id, ds_meta = _load_file_meta_and_root_ds(f, frame_meta)

    level = file_meta.compression_level if level is None else int(level)

    # Validate names against the existing field arrays.
    existing_fields = set(ds_meta.field_data_keys)
    for name in arrays:
        if name in existing_fields:
            msg = (
                f"field array {name!r} already exists in {path.name}; append_arrays does not overwrite existing blocks."
            )
            raise ValueError(msg)

    # ------------------------------------------------------------------
    # The file-metadata frame and the root dataset-metadata frame must be
    # rewritten (they grow), so we drop their *old* frames and re-emit
    # them at the tail. Their old compressed bytes are simply not copied
    # into the new body prefix.
    #
    # frame_names ordering invariant (upstream): per array there are two
    # frames; the file-metadata array is always last. The root
    # __ds_metadata array sits somewhere in the body. We rebuild the
    # frame_names list with the new field arrays inserted *before* the
    # ds-metadata + file-metadata tail so the upstream reader's
    # name->index mapping stays consistent.
    # ------------------------------------------------------------------
    starts, ends = _frame_bounds(frame_meta)
    frame_names = list(file_meta.frame_names)

    # Upstream invariant: there are ``n_arrays = num_frames // 2`` arrays
    # (2 frames each), but ``frame_names`` records only the first
    # ``n_arrays - 1`` of them — the trailing file-metadata array's name
    # is deliberately *omitted* (``Writer.write`` snapshots ``frame_names``
    # before appending the file-metadata array). The reader recovers the
    # file-metadata frame positionally (last two frames), so we MUST keep
    # this exclusion to stay byte/reader-compatible.
    n_arrays = len(frame_meta) // 2
    if len(frame_names) != n_arrays - 1:  # pragma: no cover - malformed file
        msg = f"frame_names length {len(frame_names)} != n_arrays-1 {n_arrays - 1}; cannot append."
        raise RuntimeError(msg)
    file_meta_array_idx = n_arrays - 1  # positional; name not in frame_names

    # Index of the root ds-metadata array (in array units).
    # A match is guaranteed here: _load_file_meta_and_root_ds above already
    # raised if no frame name ends with DS_METADATA_KEY.
    root_ds_array_idx = next(i for i, n in enumerate(frame_names) if n.endswith(DS_METADATA_KEY))

    # Keep all body arrays EXCEPT the root ds-meta and the file-meta,
    # whose compressed bytes we will re-emit. Everything else is copied
    # verbatim (byte-for-byte) from the original body.
    rewritten = {root_ds_array_idx, file_meta_array_idx}
    kept_array_indices = [i for i in range(n_arrays) if i not in rewritten]

    # New body = concatenation of kept compressed frame-pairs, in order
    # (copied verbatim by offset, never decompressed), followed by new
    # field-array frame-pairs, then the regenerated ds-meta frame-pair
    # and file-meta frame-pair. We record per-frame ``(orig_start,
    # orig_end, decompressed_size)`` for the kept frames so they can be
    # streamed straight from the source file into the temp file.
    kept_frame_specs: list[tuple[int, int, int]] = []  # (start, end, decomp)
    new_frame_names: list[str] = []
    for ai in kept_array_indices:
        mi, di = ai * 2, ai * 2 + 1
        kept_frame_specs.append((starts[mi], ends[mi], frame_meta[mi][1]))
        kept_frame_specs.append((starts[di], ends[di], frame_meta[di][1]))
        new_frame_names.append(frame_names[ai])

    # New field-data arrays.
    new_field_info: dict[str, ArrayInfo] = {}
    new_payloads: list[bytes] = []
    new_payload_decomp: list[int] = []
    new_payload_names: list[str] = []
    any_shuffled = False
    for name, raw in arrays.items():
        # NOTE(review): numpy documents ascontiguousarray as returning
        # ndim >= 1, so a 0-d input would be stored with shape (1,) —
        # confirm whether that is acceptable for the round trip.
        arr = np.ascontiguousarray(raw)
        # Frame names are namespaced by the root dataset id and tagged with
        # the field-data suffix.
        frame_name = f"{ds_id}{name}{FIELD_DATA_SUFFIX}"
        filter_id = _FILTER_SHUFFLE if _resolve_shuffle(arr, shuffle, level) else _FILTER_NONE
        any_shuffled = any_shuffled or filter_id != _FILTER_NONE
        meta_payload = _pack_array_metadata(frame_name, arr, filter_id)
        if filter_id == _FILTER_SHUFFLE:
            data_payload = _shuffle_bytes(arr).tobytes()
        else:
            data_payload = arr.ravel().view(np.uint8).tobytes()
        # Two payloads per array, metadata first, matching the on-disk
        # frame-pair order.
        new_payloads.append(meta_payload)
        new_payloads.append(data_payload)
        new_payload_decomp.append(len(meta_payload))
        new_payload_decomp.append(len(data_payload))
        new_payload_names.append(frame_name)
        new_field_info[name] = ArrayInfo(shape=tuple(int(s) for s in arr.shape), dtype=str(arr.dtype))

    # Regenerate root dataset metadata with merged field_data_keys.
    merged_field_keys = dict(ds_meta.field_data_keys)
    merged_field_keys.update(new_field_info)
    ds_meta_new = _with_field_keys(ds_meta, merged_field_keys)
    ds_meta_arr = ds_meta_new.to_array()
    ds_meta_frame_name = f"{ds_id}{DS_METADATA_KEY}"
    ds_meta_payloads = [
        _pack_array_metadata(ds_meta_frame_name, ds_meta_arr),
        ds_meta_arr.ravel().view(np.uint8).tobytes(),
    ]

    # Regenerate file metadata. Final on-disk array order is:
    #   [kept arrays ...][new field arrays ...][ds-meta][file-meta]
    # ``frame_names`` records every array EXCEPT the trailing file-meta
    # (upstream invariant — see above), so it ends at ds-meta.
    final_frame_names = [
        *new_frame_names,
        *new_payload_names,
        ds_meta_frame_name,
    ]
    # Promote the file version if any appended block uses a byte-filter, so an
    # older reader cleanly refuses it. Kept frames retain their own per-block
    # encoding (recorded in their metadata frames), so a previously-unfiltered
    # file gaining a shuffled block stays internally consistent.
    new_version = max(file_meta.file_version, FILE_VERSION) if any_shuffled else file_meta.file_version
    # The recorded compression_level stays the file's original value even when
    # an explicit ``level`` was used for the appended blocks.
    file_meta_new = ZstdFileMetadata(
        frame_names=final_frame_names,
        compression_level=file_meta.compression_level,
        compression=file_meta.compression,
        file_version=new_version,
    )
    file_meta_arr = file_meta_new.to_array()
    file_meta_payloads = [
        _pack_array_metadata(FILE_METADATA_KEY, file_meta_arr),
        file_meta_arr.ravel().view(np.uint8).tobytes(),
    ]

    # Compress the regenerated/new payloads (kept frames are reused as-is).
    to_compress = [*new_payloads, *ds_meta_payloads, *file_meta_payloads]
    to_compress_decomp = [
        *new_payload_decomp,
        len(ds_meta_payloads[0]),
        len(ds_meta_payloads[1]),
        len(file_meta_payloads[0]),
        len(file_meta_payloads[1]),
    ]
    compressed = _compress_frames(to_compress, level=level)

    # Stream-assemble the new file: kept frames copied verbatim from the
    # source by offset, then the freshly-compressed new/metadata frames,
    # then the regenerated footer. Cumulative compressed offsets are
    # tracked as we write so the footer table stays exact.
    tmp_path = path.with_suffix(path.suffix + ".append.tmp")
    copy_chunk = 8 * 1024 * 1024
    try:
        with path.open("rb") as src, tmp_path.open("wb") as out:
            offset = 0
            footer_entries: list[tuple[int, int]] = []

            # 1) kept frames — verbatim offset copy, no decompression.
            for start, end, dsz in kept_frame_specs:
                src.seek(start)
                remaining = end - start
                # Copy in bounded chunks so a large frame is never held in
                # memory all at once.
                while remaining > 0:
                    chunk = src.read(min(copy_chunk, remaining))
                    if not chunk:  # pragma: no cover - truncated source
                        msg = "Source .pv truncated while copying kept frames."
                        raise RuntimeError(msg)
                    out.write(chunk)
                    remaining -= len(chunk)
                # Offsets are recomputed for the new layout because the old
                # ds-meta frames are no longer in the body.
                offset += end - start
                footer_entries.append((offset, dsz))

            # 2) new + regenerated-metadata frames.
            for part, dsz in zip(compressed, to_compress_decomp, strict=True):
                out.write(part)
                offset += len(part)
                footer_entries.append((offset, dsz))

            # 3) footer table + trailing frame count.
            out.writelines(struct.pack("<QQ", off, dsz) for off, dsz in footer_entries)
            out.write(struct.pack("<Q", len(footer_entries)))
            out.flush()
            os.fsync(out.fileno())
        # Atomic commit: replace the original only once the staged file is
        # fully written + fsync'd. ``Path.replace`` is atomic on POSIX and
        # Windows, so an interrupted append cannot corrupt committed blocks.
        tmp_path.replace(path)
    finally:
        # After a successful replace the temp path no longer exists, so this
        # only removes a leftover staged file when something failed.
        if tmp_path.exists():  # pragma: no cover - cleanup on failure
            tmp_path.unlink()
|
|
444
|
+
|
|
445
|
+
|
|
446
|
+
def _with_field_keys(meta: DataSetMetadata, field_keys: dict[str, ArrayInfo]) -> DataSetMetadata:
    """
    Build a new ``DataSetMetadata`` whose ``field_data_keys`` is ``field_keys``.

    The metadata is round-tripped through its JSON form: it is serialised,
    the ``field_data_keys`` entry is overwritten with a plain
    ``{"shape": [...], "dtype": ...}`` mapping per key, and the result is
    parsed back with :meth:`DataSetMetadata.from_json`. ``meta`` itself is
    not modified.
    """
    payload = json.loads(meta.to_json())
    serialised_keys = {}
    for key, info in field_keys.items():
        serialised_keys[key] = {"shape": list(info.shape), "dtype": info.dtype}
    payload["field_data_keys"] = serialised_keys
    compact_json = json.dumps(payload, separators=(",", ":"))
    return DataSetMetadata.from_json(compact_json)
|
|
457
|
+
|
|
458
|
+
|
|
459
|
+
def read_array(filename: Path | str, name: str) -> NDArray:
    """
    Load one field array from a ``.pv`` file by name.

    Convenience wrapper that builds an :class:`AppendReader` for
    ``filename`` and returns the result of its ``read_array(name)``.

    Parameters
    ----------
    filename : pathlib.Path | str
        Path to a ``.pv`` file.
    name : str
        Field-array name (the key passed to :func:`append_arrays`, or any
        field-data key present in the file).

    Returns
    -------
    numpy.ndarray
        The stored array.

    """
    reader = AppendReader(filename)
    return reader.read_array(name)
|
|
482
|
+
|
|
483
|
+
|
|
484
|
+
class AppendReader:
    """
    Partial reader for ``.pv`` field-data blocks.

    Opens a ``.pv`` file and exposes its field arrays for *individual*
    read-back, decompressing only the requested block's two frames. The
    footer, the file metadata and the root dataset metadata are read once
    on construction; the file is reopened for each later read call.

    Parameters
    ----------
    filename : pathlib.Path | str
        Path to a ``.pv`` file.

    Examples
    --------
    >>> from pyvista_zstd.append import AppendReader
    >>> r = AppendReader("data.pv")  # doctest: +SKIP
    >>> r.field_array_names  # doctest: +SKIP
    ['col_0000', 'col_0001']
    >>> col = r.read_array("col_0001")  # doctest: +SKIP

    """

    def __init__(self, filename: Path | str) -> None:
        """
        Open ``filename`` and read its footer + metadata frames.

        Raises
        ------
        ValueError
            If the suffix is not one of ``SUPPORTED_READ_SUFFIXES``.

        """
        self._path = Path(filename)
        if self._path.suffix not in SUPPORTED_READ_SUFFIXES:
            msg = f"Filename must end in one of {SUPPORTED_READ_SUFFIXES}, not '{self._path.suffix}'"
            raise ValueError(msg)
        with self._path.open("rb") as f:
            self._num_frames, self._frame_meta = _read_footer(f)
            self._file_meta, self._ds_id, self._ds_meta = _load_file_meta_and_root_ds(f, self._frame_meta)
        self._starts, self._ends = _frame_bounds(self._frame_meta)
        # name -> array index
        self._name_to_idx = {n: i for i, n in enumerate(self._file_meta.frame_names)}

    @property
    def field_array_names(self) -> list[str]:
        """Names of field-data arrays available for partial read."""
        return list(self._ds_meta.field_data_keys)

    @property
    def field_array_info(self) -> dict[str, ArrayInfo]:
        """Mapping ``name -> ArrayInfo(shape, dtype)`` for field arrays."""
        return dict(self._ds_meta.field_data_keys)

    def __contains__(self, name: str) -> bool:
        """Return whether field array ``name`` is present."""
        return name in self._ds_meta.field_data_keys

    def _array_index(self, name: str) -> int:
        """Return the array index of field array ``name`` or raise ``KeyError``."""
        if name not in self._ds_meta.field_data_keys:
            msg = f"field array {name!r} not found; available: {sorted(self._ds_meta.field_data_keys)}"
            raise KeyError(msg)
        frame_name = f"{self._ds_id}{name}{FIELD_DATA_SUFFIX}"
        ai = self._name_to_idx.get(frame_name)
        if ai is None:  # pragma: no cover - metadata/name desync
            msg = f"frame for field array {name!r} not found in frame index."
            raise KeyError(msg)
        return ai

    def _read_at(self, f: IO[bytes], ai: int) -> NDArray:
        """Decompress array index ``ai`` from the already-open handle ``f``."""
        mi, di = ai * 2, ai * 2 + 1
        meta_blob, data_blob = _read_frame_pair(f, self._starts, self._ends, ai)
        sizes = (self._frame_meta[mi][1], self._frame_meta[di][1])
        _, arr = _decompress_two_blobs(meta_blob, data_blob, sizes)
        return arr

    def read_array(self, name: str) -> NDArray:
        """
        Decompress and return the single field array ``name``.

        Raises
        ------
        KeyError
            If ``name`` is not a field array of the file.

        """
        # Resolve the name first so an unknown name fails before any I/O.
        ai = self._array_index(name)
        with self._path.open("rb") as f:
            return self._read_at(f, ai)

    def read_arrays(self, names: Iterable[str]) -> dict[str, NDArray]:
        """
        Decompress and return several field arrays by name.

        All names are validated up front and the file is opened once for
        the whole batch instead of once per array.

        Raises
        ------
        KeyError
            If any of ``names`` is not a field array of the file.

        """
        indices = {n: self._array_index(n) for n in names}
        # With no names there is nothing to read, so the file is not opened.
        if not indices:
            return {}
        with self._path.open("rb") as f:
            return {n: self._read_at(f, ai) for n, ai in indices.items()}
|