lsst-pipe-base 29.2025.3900-py3-none-any.whl → 29.2025.4100-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lsst/pipe/base/_task_metadata.py +15 -0
- lsst/pipe/base/dot_tools.py +14 -152
- lsst/pipe/base/exec_fixup_data_id.py +17 -44
- lsst/pipe/base/execution_graph_fixup.py +49 -18
- lsst/pipe/base/graph/_versionDeserializers.py +6 -5
- lsst/pipe/base/graph/graph.py +30 -10
- lsst/pipe/base/graph/graphSummary.py +30 -0
- lsst/pipe/base/graph_walker.py +119 -0
- lsst/pipe/base/log_capture.py +5 -2
- lsst/pipe/base/mermaid_tools.py +11 -64
- lsst/pipe/base/mp_graph_executor.py +298 -236
- lsst/pipe/base/pipeline_graph/io.py +1 -1
- lsst/pipe/base/quantum_graph/__init__.py +32 -0
- lsst/pipe/base/quantum_graph/_common.py +632 -0
- lsst/pipe/base/quantum_graph/_multiblock.py +808 -0
- lsst/pipe/base/quantum_graph/_predicted.py +1950 -0
- lsst/pipe/base/quantum_graph/visualization.py +302 -0
- lsst/pipe/base/quantum_graph_builder.py +292 -34
- lsst/pipe/base/quantum_graph_executor.py +2 -1
- lsst/pipe/base/quantum_provenance_graph.py +16 -7
- lsst/pipe/base/quantum_reports.py +45 -0
- lsst/pipe/base/separable_pipeline_executor.py +126 -15
- lsst/pipe/base/simple_pipeline_executor.py +44 -43
- lsst/pipe/base/single_quantum_executor.py +1 -40
- lsst/pipe/base/tests/mocks/__init__.py +1 -1
- lsst/pipe/base/tests/mocks/_pipeline_task.py +16 -1
- lsst/pipe/base/tests/mocks/{_in_memory_repo.py → _repo.py} +324 -45
- lsst/pipe/base/tests/mocks/_storage_class.py +51 -0
- lsst/pipe/base/tests/simpleQGraph.py +11 -5
- lsst/pipe/base/version.py +1 -1
- {lsst_pipe_base-29.2025.3900.dist-info → lsst_pipe_base-29.2025.4100.dist-info}/METADATA +2 -1
- {lsst_pipe_base-29.2025.3900.dist-info → lsst_pipe_base-29.2025.4100.dist-info}/RECORD +40 -34
- {lsst_pipe_base-29.2025.3900.dist-info → lsst_pipe_base-29.2025.4100.dist-info}/WHEEL +0 -0
- {lsst_pipe_base-29.2025.3900.dist-info → lsst_pipe_base-29.2025.4100.dist-info}/entry_points.txt +0 -0
- {lsst_pipe_base-29.2025.3900.dist-info → lsst_pipe_base-29.2025.4100.dist-info}/licenses/COPYRIGHT +0 -0
- {lsst_pipe_base-29.2025.3900.dist-info → lsst_pipe_base-29.2025.4100.dist-info}/licenses/LICENSE +0 -0
- {lsst_pipe_base-29.2025.3900.dist-info → lsst_pipe_base-29.2025.4100.dist-info}/licenses/bsd_license.txt +0 -0
- {lsst_pipe_base-29.2025.3900.dist-info → lsst_pipe_base-29.2025.4100.dist-info}/licenses/gpl-v3.0.txt +0 -0
- {lsst_pipe_base-29.2025.3900.dist-info → lsst_pipe_base-29.2025.4100.dist-info}/top_level.txt +0 -0
- {lsst_pipe_base-29.2025.3900.dist-info → lsst_pipe_base-29.2025.4100.dist-info}/zip-safe +0 -0
lsst/pipe/base/quantum_graph/_multiblock.py

@@ -0,0 +1,808 @@
# This file is part of pipe_base.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This software is dual licensed under the GNU General Public License and also
# under a 3-clause BSD license. Recipients may choose which of these licenses
# to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
# respectively. If you choose the GPL option then the following text applies
# (but note that there is still no warranty even if you opt for BSD instead):
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

from __future__ import annotations

__all__ = (
    "Address",
    "AddressReader",
    "AddressRow",
    "AddressWriter",
    "Compressor",
    "Decompressor",
    "InvalidQuantumGraphFileError",
    "MultiblockReader",
    "MultiblockWriter",
)

import dataclasses
import logging
import uuid
from collections.abc import Iterator
from contextlib import contextmanager
from io import BufferedReader, BytesIO
from operator import attrgetter
from typing import IO, TYPE_CHECKING, Protocol, TypeAlias, TypeVar

import pydantic

if TYPE_CHECKING:
    import zipfile


_LOG = logging.getLogger(__name__)


_T = TypeVar("_T", bound=pydantic.BaseModel)


UUID_int: TypeAlias = int

MAX_UUID_INT: UUID_int = 2**128


DEFAULT_PAGE_SIZE: int = 5_000_000
"""Default page size for reading chunks of quantum graph files.

This is intended to be large enough to avoid any possibility of individual
reads suffering from per-seek overheads, especially in network file access,
while still being small enough to only minimally slow down tiny reads of
individual quanta (especially for execution).
"""


class Compressor(Protocol):
    """A protocol for objects with a `compress` method that takes and returns
    `bytes`.
    """

    def compress(self, data: bytes) -> bytes:
        """Compress the given data.

        Parameters
        ----------
        data : `bytes`
            Uncompressed data.

        Returns
        -------
        compressed : `bytes`
            Compressed data.
        """
        ...


class Decompressor(Protocol):
    """A protocol for objects with a `decompress` method that takes and returns
    `bytes`.
    """

    def decompress(self, data: bytes) -> bytes:
        """Decompress the given data.

        Parameters
        ----------
        data : `bytes`
            Compressed data.

        Returns
        -------
        decompressed : `bytes`
            Uncompressed data.
        """
        ...


class InvalidQuantumGraphFileError(RuntimeError):
    """An exception raised when a quantum graph file has internal
    inconsistencies or does not actually appear to be a quantum graph file.
    """


@dataclasses.dataclass(slots=True)
class Address:
    """Struct that holds an address into a multi-block file."""

    offset: int = 0
    """Byte offset for the block."""

    size: int = 0
    """Size of the block.

    This always includes the size of the tiny header that records the block
    size. That header does not include the size of the header, so these sizes
    differ by the ``int_size`` used to write the multi-block file.

    A size of zero is used (with, by convention, an offset of zero) to indicate
    an absent block.
    """

    def __str__(self) -> str:
        return f"{self.offset:06}[{self.size:06}]"


@dataclasses.dataclass(slots=True)
class AddressRow:
    """The in-memory representation of a single row in an address file."""

    key: uuid.UUID
    """Universally unique identifier for this row."""

    index: int
    """Monotonically increasing integer ID; unique within this file only."""

    addresses: list[Address] = dataclasses.field(default_factory=list)
    """Offsets and sizes into multi-block files."""

    def write(self, stream: IO[bytes], int_size: int) -> None:
        """Write this address row to a file-like object.

        Parameters
        ----------
        stream : `typing.IO` [ `bytes` ]
            Binary file-like object.
        int_size : `int`
            Number of bytes to use for all integers.
        """
        stream.write(self.key.bytes)
        stream.write(self.index.to_bytes(int_size))
        for address in self.addresses:
            stream.write(address.offset.to_bytes(int_size))
            stream.write(address.size.to_bytes(int_size))

    @classmethod
    def read(cls, stream: IO[bytes], n_addresses: int, int_size: int) -> AddressRow:
        """Read this address row from a file-like object.

        Parameters
        ----------
        stream : `typing.IO` [ `bytes` ]
            Binary file-like object.
        n_addresses : `int`
            Number of addresses included in each row.
        int_size : `int`
            Number of bytes to use for all integers.
        """
        key = uuid.UUID(int=int.from_bytes(stream.read(16)))
        index = int.from_bytes(stream.read(int_size))
        row = AddressRow(key, index)
        for _ in range(n_addresses):
            offset = int.from_bytes(stream.read(int_size))
            size = int.from_bytes(stream.read(int_size))
            row.addresses.append(Address(offset, size))
        return row

    def __str__(self) -> str:
        return f"{self.key} {self.index:06} {' '.join(str(a) for a in self.addresses)}"


@dataclasses.dataclass
class AddressWriter:
    """A helper object for writing address files for multi-block files."""

    indices: dict[uuid.UUID, int] = dataclasses.field(default_factory=dict)
    """Mapping from UUID to internal integer ID.

    The internal integer ID must always correspond to the index into the
    sorted list of all UUIDs, but this `dict` need not be sorted itself.
    """

    addresses: list[dict[uuid.UUID, Address]] = dataclasses.field(default_factory=list)
    """Addresses to store with each UUID.

    Every key in one of these dictionaries must have an entry in `indices`.
    The converse is not true.
    """

    def write(self, stream: IO[bytes], int_size: int) -> None:
        """Write all addresses to a file-like object.

        Parameters
        ----------
        stream : `typing.IO` [ `bytes` ]
            Binary file-like object.
        int_size : `int`
            Number of bytes to use for all integers.
        """
        for n, address_map in enumerate(self.addresses):
            if not self.indices.keys() >= address_map.keys():
                raise AssertionError(
                    f"Logic bug in quantum graph I/O: address map {n} of {len(self.addresses)} has IDs "
                    f"{address_map.keys() - self.indices.keys()} not in the index map."
                )
        stream.write(int_size.to_bytes(1))
        stream.write(len(self.indices).to_bytes(int_size))
        stream.write(len(self.addresses).to_bytes(int_size))
        empty_address = Address()
        for key in sorted(self.indices.keys(), key=attrgetter("int")):
            row = AddressRow(key, self.indices[key], [m.get(key, empty_address) for m in self.addresses])
            _LOG.debug("Wrote address %s.", row)
            row.write(stream, int_size)

    def write_to_zip(self, zf: zipfile.ZipFile, name: str, int_size: int) -> None:
        """Write all addresses to a file in a zip archive.

        Parameters
        ----------
        zf : `zipfile.ZipFile`
            Zip archive to add the file to.
        name : `str`
            Base name for the address file; an extension will be added.
        int_size : `int`
            Number of bytes to use for all integers.
        """
        with zf.open(f"{name}.addr", mode="w") as stream:
            self.write(stream, int_size=int_size)


@dataclasses.dataclass
class AddressPage:
    """A page of addresses in the `AddressReader`."""

    file_offset: int
    """Offset in bytes to this page from the beginning of the file."""

    begin: int
    """Index of the first row in this page."""

    n_rows: int
    """Number of rows in this page."""

    read: bool = False
    """Whether this page has already been read."""

    @property
    def end(self) -> int:
        """One past the last row index in this page."""
        return self.begin + self.n_rows


@dataclasses.dataclass
class PageBounds:
    """A page index and the UUID interval that page covers."""

    page_index: int
    """Index into the page array."""

    uuid_int_begin: UUID_int
    """Integer representation of the smallest UUID in this page."""

    uuid_int_end: UUID_int
    """One larger than the integer representation of the largest UUID in this
    page.
    """

    def __str__(self) -> str:
        return f"{self.page_index} [{self.uuid_int_begin:x}:{self.uuid_int_end:x}]"


@dataclasses.dataclass
class AddressReader:
    """A helper object for reading address files for multi-block files."""

    stream: IO[bytes]
    """Stream to read from."""

    int_size: int
    """Size of each integer in bytes."""

    n_rows: int
    """Number of rows in the file."""

    n_addresses: int
    """Number of addresses in each row."""

    rows_per_page: int
    """Number of addresses in each page."""

    rows: dict[uuid.UUID, AddressRow] = dataclasses.field(default_factory=dict)
    """Rows that have already been read."""

    rows_by_index: dict[int, AddressRow] = dataclasses.field(default_factory=dict)
    """Rows that have already been read, keyed by integer index."""

    pages: list[AddressPage] = dataclasses.field(default_factory=list)
    page_bounds: dict[int, PageBounds] = dataclasses.field(default_factory=dict)
    """Mapping from page index to page boundary information."""

    @classmethod
    def from_stream(
        cls, stream: IO[bytes], *, page_size: int, n_addresses: int, int_size: int
    ) -> AddressReader:
        """Construct from a stream by reading the header.

        Parameters
        ----------
        stream : `typing.IO` [ `bytes` ]
            File-like object to read from.
        page_size : `int`
            Approximate number of bytes to read at a time when searching for an
            address.
        n_addresses : `int`
            Number of addresses to expect per row. This is checked against
            the size embedded in the file.
        int_size : `int`
            Number of bytes to use for all integers. This is checked against
            the size embedded in the file.
        """
        header_size = cls.compute_header_size(int_size)
        row_size = cls.compute_row_size(int_size, n_addresses)
        # Read the raw header page.
        header_page_data = stream.read(header_size)
        if len(header_page_data) < header_size:
            raise InvalidQuantumGraphFileError("Address file unexpectedly truncated.")
        # Interpret the raw header data and initialize the reader instance.
        header_page_stream = BytesIO(header_page_data)
        file_int_size = int.from_bytes(header_page_stream.read(1))
        if file_int_size != int_size:
            raise InvalidQuantumGraphFileError(
                f"int size in address file ({file_int_size}) does not match int size in header ({int_size})."
            )
        n_rows = int.from_bytes(header_page_stream.read(int_size))
        file_n_addresses = int.from_bytes(header_page_stream.read(int_size))
        if file_n_addresses != n_addresses:
            raise InvalidQuantumGraphFileError(
                f"Incorrect number of addresses per row: expected {n_addresses}, got {file_n_addresses}."
            )
        rows_per_page = max(page_size // row_size, 1)
        # Construct an instance.
        self = cls(stream, int_size, n_rows, n_addresses, rows_per_page=rows_per_page)
        # Calculate positions of each page of rows.
        row_index = 0
        file_offset = header_size
        while row_index < n_rows:
            self.pages.append(AddressPage(file_offset=file_offset, begin=row_index, n_rows=rows_per_page))
            row_index += rows_per_page
            file_offset += rows_per_page * row_size
        if row_index != n_rows:
            # Last page was too big.
            self.pages[-1].n_rows -= row_index - n_rows
        assert sum(p.n_rows for p in self.pages) == n_rows, "Bad logic setting page row counts."
        return self

    @classmethod
    @contextmanager
    def open_in_zip(
        cls,
        zf: zipfile.ZipFile,
        name: str,
        *,
        page_size: int,
        n_addresses: int,
        int_size: int,
    ) -> Iterator[AddressReader]:
        """Make a reader for an address file in a zip archive.

        Parameters
        ----------
        zf : `zipfile.ZipFile`
            Zip archive to read the file from.
        name : `str`
            Base name for the address file; an extension will be added.
        page_size : `int`
            Approximate number of bytes to read at a time when searching for an
            address.
        n_addresses : `int`
            Number of addresses to expect per row. This is checked against
            the size embedded in the file.
        int_size : `int`
            Number of bytes to use for all integers. This is checked against
            the size embedded in the file.

        Returns
        -------
        reader : `contextlib.AbstractContextManager` [ `AddressReader` ]
            Context manager that returns a reader when entered.
        """
        with zf.open(f"{name}.addr", mode="r") as stream:
            yield cls.from_stream(stream, page_size=page_size, n_addresses=n_addresses, int_size=int_size)

    @staticmethod
    def compute_header_size(int_size: int) -> int:
        """Return the size (in bytes) of the header of an address file.

        Parameters
        ----------
        int_size : `int`
            Size of each integer in bytes.

        Returns
        -------
        size : `int`
            Size of the header in bytes.
        """
        return (
            1  # int_size
            + int_size  # number of rows
            + int_size  # number of addresses in each row
        )

    @staticmethod
    def compute_row_size(int_size: int, n_addresses: int) -> int:
        """Return the size (in bytes) of each row of an address file.

        Parameters
        ----------
        int_size : `int`
            Size of each integer in bytes.
        n_addresses : `int`
            Number of addresses in each row.

        Returns
        -------
        size : `int`
            Size of each row in bytes.
        """
        return (
            16  # uuid
            + int_size
            * (
                1  # index
                + 2 * n_addresses
            )
        )

    @property
    def row_size(self) -> int:
        """The size (in bytes) of each row of this address file."""
        return self.compute_row_size(self.int_size, self.n_addresses)

    def read_all(self) -> dict[uuid.UUID, AddressRow]:
        """Read all addresses in the file.

        Returns
        -------
        rows : `dict` [ `uuid.UUID`, `AddressRow` ]
            Mapping of loaded address rows, keyed by UUID.
        """
        # Skip any pages from the beginning that have already been read; this
        # nicely handles both the case where we already read everything (or
        # there was nothing to read) while giving us a page with a file offset
        # to start from.
        for page in self.pages:
            if not page.read:
                break
        else:
            return self.rows
        # Read the entire rest of the file into memory.
        self.stream.seek(page.file_offset)
        data = self.stream.read()
        buffer = BytesIO(data)
        # Shortcut out if we've already read everything, but don't bother
        # optimizing previous partial reads.
        while len(self.rows) < self.n_rows:
            self._read_row(buffer)
        # Delete all pages; they don't matter anymore, and that's easier than
        # updating them to reflect the reads we've done.
        self.pages.clear()
        return self.rows

    def find(self, key: uuid.UUID) -> AddressRow:
        """Read the row for the given UUID.

        Parameters
        ----------
        key : `uuid.UUID`
            UUID to find.

        Returns
        -------
        row : `AddressRow`
            Addresses for the given UUID.
        """
        match key:
            case uuid.UUID():
                return self._find_uuid(key)
            case _:
                raise TypeError(f"Invalid argument: {key}.")

    def _find_uuid(self, target: uuid.UUID) -> AddressRow:
        if (row := self.rows.get(target)) is not None:
            return row
        if self.n_rows == 0 or not self.pages:
            raise LookupError(f"Address for {target} not found.")

        # Use a binary search to find the page containing the target UUID.
        left = 0
        right = len(self.pages) - 1
        while left <= right:
            mid = left + ((right - left) // 2)
            self._read_page(mid)
            if (row := self.rows.get(target)) is not None:
                return row
            bounds = self.page_bounds[mid]
            if target.int < bounds.uuid_int_begin:
                right = mid - 1
            elif target.int > bounds.uuid_int_end:
                left = mid + 1
            else:
                # Should have been on this page, but it wasn't.
                raise LookupError(f"Address for {target} not found.")

        # Ran out of pages to search.
        raise LookupError(f"Address for {target} not found.")

    def _read_page(self, page_index: int, page_stream: BytesIO | None = None) -> bool:
        page = self.pages[page_index]
        if page.read:
            return False
        if page_stream is None:
            self.stream.seek(page.file_offset)
            page_stream = BytesIO(self.stream.read(page.n_rows * self.row_size))
        row = self._read_row(page_stream)
        uuid_int_begin = row.key.int
        for _ in range(1, page.n_rows):
            row = self._read_row(page_stream)
        uuid_int_end = row.key.int + 1  # Python's loop scoping rules are actually useful here!
        page.read = True
        bounds = PageBounds(page_index=page_index, uuid_int_begin=uuid_int_begin, uuid_int_end=uuid_int_end)
        self.page_bounds[page_index] = bounds
        _LOG.debug("Read page %s with rows [%s:%s].", bounds, page.begin, page.end)
        return True

    def _read_row(self, page_stream: BytesIO) -> AddressRow:
        row = AddressRow.read(page_stream, self.n_addresses, self.int_size)
        self.rows[row.key] = row
        self.rows_by_index[row.index] = row
        _LOG.debug("Read address row %s.", row)
        return row


@dataclasses.dataclass
class MultiblockWriter:
    """A helper object for writing multi-block files."""

    stream: IO[bytes]
    """A binary file-like object to write to."""

    int_size: int
    """Number of bytes to use for all integers."""

    file_size: int = 0
    """Running size of the full file."""

    addresses: dict[uuid.UUID, Address] = dataclasses.field(default_factory=dict)
    """Running map of all addresses added to the file so far.

    When the multi-block file is fully written, this is appended to the
    `AddressWriter.addresses` to write the corresponding address file.
    """

    @classmethod
    @contextmanager
    def open_in_zip(cls, zf: zipfile.ZipFile, name: str, int_size: int) -> Iterator[MultiblockWriter]:
        """Open a writer for a file in a zip archive.

        Parameters
        ----------
        zf : `zipfile.ZipFile`
            Zip archive to add the file to.
        name : `str`
            Base name for the multi-block file; an extension will be added.
        int_size : `int`
            Number of bytes to use for all integers.

        Returns
        -------
        writer : `contextlib.AbstractContextManager` [ `MultiblockWriter` ]
            Context manager that returns a writer when entered.
        """
        with zf.open(f"{name}.mb", mode="w", force_zip64=True) as stream:
            yield MultiblockWriter(stream, int_size)

    def write_bytes(self, id: uuid.UUID, data: bytes) -> Address:
        """Write raw bytes to the multi-block file.

        Parameters
        ----------
        id : `uuid.UUID`
            Unique ID of the object described by this block.
        data : `bytes`
            Data to store directly.

        Returns
        -------
        address : `Address`
            Address of the bytes just written.
        """
        self.stream.write(len(data).to_bytes(self.int_size))
        self.stream.write(data)
        block_size = len(data) + self.int_size
        address = Address(offset=self.file_size, size=block_size)
        self.file_size += block_size
        self.addresses[id] = address
        return address

    def write_model(self, id: uuid.UUID, model: pydantic.BaseModel, compressor: Compressor) -> Address:
        """Write raw bytes to the multi-block file.

        Parameters
        ----------
        id : `uuid.UUID`
            Unique ID of the object described by this block.
        model : `pydantic.BaseModel`
            Model to convert to JSON and compress.
        compressor : `Compressor`
            Object with a `compress` method that takes and returns `bytes`.

        Returns
        -------
        address : `Address`
            Address of the bytes just written.
        """
        json_data = model.model_dump_json().encode()
        compressed_data = compressor.compress(json_data)
        return self.write_bytes(id, compressed_data)


@dataclasses.dataclass
class MultiblockReader:
    """A helper object for reader multi-block files."""

    stream: IO[bytes]
    """A binary file-like object to read from."""

    int_size: int
    """Number of bytes to use for all integers."""

    @classmethod
    @contextmanager
    def open_in_zip(cls, zf: zipfile.ZipFile, name: str, *, int_size: int) -> Iterator[MultiblockReader]:
        """Open a reader for a file in a zip archive.

        Parameters
        ----------
        zf : `zipfile.ZipFile`
            Zip archive to read the file from.
        name : `str`
            Base name for the multi-block file; an extension will be added.
        int_size : `int`
            Number of bytes to use for all integers.

        Returns
        -------
        reader : `contextlib.AbstractContextManager` [ `MultiblockReader` ]
            Context manager that returns a reader when entered.
        """
        with zf.open(f"{name}.mb", mode="r") as stream:
            yield MultiblockReader(stream, int_size)

    @classmethod
    def read_all_bytes_in_zip(
        cls, zf: zipfile.ZipFile, name: str, *, int_size: int, page_size: int
    ) -> Iterator[bytes]:
        """Iterate over all of the byte blocks in a file in a zip archive.

        Parameters
        ----------
        zf : `zipfile.ZipFile`
            Zip archive to read the file from.
        name : `str`
            Base name for the multi-block file; an extension will be added.
        int_size : `int`
            Number of bytes to use for all integers.
        page_size : `int`
            Approximate number of bytes to read at a time.

        Returns
        -------
        byte_iter : `~collections.abc.Iterator` [ `bytes` ]
            Iterator over blocks.
        """
        with zf.open(f"{name}.mb", mode="r") as zf_stream:
            # The standard library typing of IO[bytes] tiers isn't consistent.
            buffered_stream = BufferedReader(zf_stream)  # type: ignore[type-var]
            size_data = buffered_stream.read(int_size)
            while size_data:
                internal_size = int.from_bytes(size_data)
                yield buffered_stream.read(internal_size)
                size_data = buffered_stream.read(int_size)

    @classmethod
    def read_all_models_in_zip(
        cls,
        zf: zipfile.ZipFile,
        name: str,
        model_type: type[_T],
        decompressor: Decompressor,
        *,
        int_size: int,
        page_size: int,
    ) -> Iterator[_T]:
        """Iterate over all of the models in a file in a zip archive.

        Parameters
        ----------
        zf : `zipfile.ZipFile`
            Zip archive to read the file from.
        name : `str`
            Base name for the multi-block file; an extension will be added.
        model_type : `type` [ `pydantic.BaseModel` ]
            Pydantic model to validate JSON with.
        decompressor : `Decompressor`
            Object with a `decompress` method that takes and returns `bytes`.
        int_size : `int`
            Number of bytes to use for all integers.
        page_size : `int`
            Approximate number of bytes to read at a time.

        Returns
        -------
        model_iter : `~collections.abc.Iterator` [ `pydantic.BaseModel` ]
            Iterator over model instances.
        """
        for compressed_data in cls.read_all_bytes_in_zip(zf, name, int_size=int_size, page_size=page_size):
            json_data = decompressor.decompress(compressed_data)
            yield model_type.model_validate_json(json_data)

    def read_bytes(self, address: Address) -> bytes | None:
        """Read raw bytes from the multi-block file.

        Parameters
        ----------
        address : `Address`
            Offset and size of the data to read.

        Returns
        -------
        data : `bytes` or `None`
            Data read directly, or `None` if the address has zero size.
        """
        if not address.size:
            return None
        self.stream.seek(address.offset)
        data = self.stream.read(address.size)
        internal_size = int.from_bytes(data[: self.int_size])
        data = data[self.int_size :]
        if len(data) != internal_size:
            raise InvalidQuantumGraphFileError(
                f"Internal size {internal_size} does not match loaded data size {len(data)}."
            )
        return data

    def read_model(self, address: Address, model_type: type[_T], decompressor: Decompressor) -> _T | None:
        """Read a single compressed JSON block.

        Parameters
        ----------
        address : `Address`
            Size and offset of the block.
        model_type : `type` [ `pydantic.BaseModel` ]
            Pydantic model to validate JSON with.
        decompressor : `Decompressor`
            Object with a `decompress` method that takes and returns `bytes`.

        Returns
        -------
        model : `pydantic.BaseModel`
            Validated model.
        """
        compressed_data = self.read_bytes(address)
        if compressed_data is None:
            return None
        json_data = decompressor.decompress(compressed_data)
        return model_type.model_validate_json(json_data)