lsst-pipe-base 29.2025.3900__py3-none-any.whl → 29.2025.4100__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40)
  1. lsst/pipe/base/_task_metadata.py +15 -0
  2. lsst/pipe/base/dot_tools.py +14 -152
  3. lsst/pipe/base/exec_fixup_data_id.py +17 -44
  4. lsst/pipe/base/execution_graph_fixup.py +49 -18
  5. lsst/pipe/base/graph/_versionDeserializers.py +6 -5
  6. lsst/pipe/base/graph/graph.py +30 -10
  7. lsst/pipe/base/graph/graphSummary.py +30 -0
  8. lsst/pipe/base/graph_walker.py +119 -0
  9. lsst/pipe/base/log_capture.py +5 -2
  10. lsst/pipe/base/mermaid_tools.py +11 -64
  11. lsst/pipe/base/mp_graph_executor.py +298 -236
  12. lsst/pipe/base/pipeline_graph/io.py +1 -1
  13. lsst/pipe/base/quantum_graph/__init__.py +32 -0
  14. lsst/pipe/base/quantum_graph/_common.py +632 -0
  15. lsst/pipe/base/quantum_graph/_multiblock.py +808 -0
  16. lsst/pipe/base/quantum_graph/_predicted.py +1950 -0
  17. lsst/pipe/base/quantum_graph/visualization.py +302 -0
  18. lsst/pipe/base/quantum_graph_builder.py +292 -34
  19. lsst/pipe/base/quantum_graph_executor.py +2 -1
  20. lsst/pipe/base/quantum_provenance_graph.py +16 -7
  21. lsst/pipe/base/quantum_reports.py +45 -0
  22. lsst/pipe/base/separable_pipeline_executor.py +126 -15
  23. lsst/pipe/base/simple_pipeline_executor.py +44 -43
  24. lsst/pipe/base/single_quantum_executor.py +1 -40
  25. lsst/pipe/base/tests/mocks/__init__.py +1 -1
  26. lsst/pipe/base/tests/mocks/_pipeline_task.py +16 -1
  27. lsst/pipe/base/tests/mocks/{_in_memory_repo.py → _repo.py} +324 -45
  28. lsst/pipe/base/tests/mocks/_storage_class.py +51 -0
  29. lsst/pipe/base/tests/simpleQGraph.py +11 -5
  30. lsst/pipe/base/version.py +1 -1
  31. {lsst_pipe_base-29.2025.3900.dist-info → lsst_pipe_base-29.2025.4100.dist-info}/METADATA +2 -1
  32. {lsst_pipe_base-29.2025.3900.dist-info → lsst_pipe_base-29.2025.4100.dist-info}/RECORD +40 -34
  33. {lsst_pipe_base-29.2025.3900.dist-info → lsst_pipe_base-29.2025.4100.dist-info}/WHEEL +0 -0
  34. {lsst_pipe_base-29.2025.3900.dist-info → lsst_pipe_base-29.2025.4100.dist-info}/entry_points.txt +0 -0
  35. {lsst_pipe_base-29.2025.3900.dist-info → lsst_pipe_base-29.2025.4100.dist-info}/licenses/COPYRIGHT +0 -0
  36. {lsst_pipe_base-29.2025.3900.dist-info → lsst_pipe_base-29.2025.4100.dist-info}/licenses/LICENSE +0 -0
  37. {lsst_pipe_base-29.2025.3900.dist-info → lsst_pipe_base-29.2025.4100.dist-info}/licenses/bsd_license.txt +0 -0
  38. {lsst_pipe_base-29.2025.3900.dist-info → lsst_pipe_base-29.2025.4100.dist-info}/licenses/gpl-v3.0.txt +0 -0
  39. {lsst_pipe_base-29.2025.3900.dist-info → lsst_pipe_base-29.2025.4100.dist-info}/top_level.txt +0 -0
  40. {lsst_pipe_base-29.2025.3900.dist-info → lsst_pipe_base-29.2025.4100.dist-info}/zip-safe +0 -0
lsst/pipe/base/quantum_graph/_multiblock.py (new file)
@@ -0,0 +1,808 @@
+# This file is part of pipe_base.
+#
+# Developed for the LSST Data Management System.
+# This product includes software developed by the LSST Project
+# (http://www.lsst.org).
+# See the COPYRIGHT file at the top-level directory of this distribution
+# for details of code ownership.
+#
+# This software is dual licensed under the GNU General Public License and also
+# under a 3-clause BSD license. Recipients may choose which of these licenses
+# to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
+# respectively. If you choose the GPL option then the following text applies
+# (but note that there is still no warranty even if you opt for BSD instead):
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+from __future__ import annotations
+
+__all__ = (
+    "Address",
+    "AddressReader",
+    "AddressRow",
+    "AddressWriter",
+    "Compressor",
+    "Decompressor",
+    "InvalidQuantumGraphFileError",
+    "MultiblockReader",
+    "MultiblockWriter",
+)
+
+import dataclasses
+import logging
+import uuid
+from collections.abc import Iterator
+from contextlib import contextmanager
+from io import BufferedReader, BytesIO
+from operator import attrgetter
+from typing import IO, TYPE_CHECKING, Protocol, TypeAlias, TypeVar
+
+import pydantic
+
+if TYPE_CHECKING:
+    import zipfile
+
+
+_LOG = logging.getLogger(__name__)
+
+
+_T = TypeVar("_T", bound=pydantic.BaseModel)
+
+
+UUID_int: TypeAlias = int
+
+MAX_UUID_INT: UUID_int = 2**128
+
+
+DEFAULT_PAGE_SIZE: int = 5_000_000
+"""Default page size for reading chunks of quantum graph files.
+
+This is intended to be large enough to avoid any possibility of individual
+reads suffering from per-seek overheads, especially in network file access,
+while still being small enough to only minimally slow down tiny reads of
+individual quanta (especially for execution).
+"""
+
+
+class Compressor(Protocol):
+    """A protocol for objects with a `compress` method that takes and returns
+    `bytes`.
+    """
+
+    def compress(self, data: bytes) -> bytes:
+        """Compress the given data.
+
+        Parameters
+        ----------
+        data : `bytes`
+            Uncompressed data.
+
+        Returns
+        -------
+        compressed : `bytes`
+            Compressed data.
+        """
+        ...
+
+
+class Decompressor(Protocol):
+    """A protocol for objects with a `decompress` method that takes and returns
+    `bytes`.
+    """
+
+    def decompress(self, data: bytes) -> bytes:
+        """Decompress the given data.
+
+        Parameters
+        ----------
+        data : `bytes`
+            Compressed data.
+
+        Returns
+        -------
+        decompressed : `bytes`
+            Uncompressed data.
+        """
+        ...
+
+
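These two protocols are purely structural: anything exposing `compress`/`decompress` methods of the right shape will do. As a minimal sketch (the codec actually used by the quantum graph writer is not shown in this diff), the standard-library `zlib` module already satisfies both, since its module-level functions take and return `bytes`:

```python
import zlib

# zlib.compress()/zlib.decompress() take and return bytes, so the module object
# itself can be passed wherever a Compressor or Decompressor is expected.
data = zlib.compress(b"payload")
assert zlib.decompress(data) == b"payload"
```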
+class InvalidQuantumGraphFileError(RuntimeError):
+    """An exception raised when a quantum graph file has internal
+    inconsistencies or does not actually appear to be a quantum graph file.
+    """
+
+
+@dataclasses.dataclass(slots=True)
+class Address:
+    """Struct that holds an address into a multi-block file."""
+
+    offset: int = 0
+    """Byte offset for the block."""
+
+    size: int = 0
+    """Size of the block.
+
+    This always includes the size of the tiny header that records the block
+    size. The size recorded in that header does not include the header itself,
+    so the two sizes differ by the ``int_size`` used to write the multi-block
+    file.
+
+    A size of zero is used (with, by convention, an offset of zero) to indicate
+    an absent block.
+    """
+
+    def __str__(self) -> str:
+        return f"{self.offset:06}[{self.size:06}]"
+
+
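To make that bookkeeping concrete, here is a tiny illustration with hypothetical numbers (``int_size = 8`` and a 100-byte payload are not values taken from the real format):

```python
from lsst.pipe.base.quantum_graph._multiblock import Address

int_size = 8          # illustrative value
payload = b"x" * 100  # the (already compressed) block body

# Address.size covers the length header plus the payload; the header on disk
# records only len(payload), i.e. 100.
address = Address(offset=0, size=len(payload) + int_size)
assert address.size == 108
```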
+@dataclasses.dataclass(slots=True)
+class AddressRow:
+    """The in-memory representation of a single row in an address file."""
+
+    key: uuid.UUID
+    """Universally unique identifier for this row."""
+
+    index: int
+    """Monotonically increasing integer ID; unique within this file only."""
+
+    addresses: list[Address] = dataclasses.field(default_factory=list)
+    """Offsets and sizes into multi-block files."""
+
+    def write(self, stream: IO[bytes], int_size: int) -> None:
+        """Write this address row to a file-like object.
+
+        Parameters
+        ----------
+        stream : `typing.IO` [ `bytes` ]
+            Binary file-like object.
+        int_size : `int`
+            Number of bytes to use for all integers.
+        """
+        stream.write(self.key.bytes)
+        stream.write(self.index.to_bytes(int_size))
+        for address in self.addresses:
+            stream.write(address.offset.to_bytes(int_size))
+            stream.write(address.size.to_bytes(int_size))
+
+    @classmethod
+    def read(cls, stream: IO[bytes], n_addresses: int, int_size: int) -> AddressRow:
+        """Read an address row from a file-like object.
+
+        Parameters
+        ----------
+        stream : `typing.IO` [ `bytes` ]
+            Binary file-like object.
+        n_addresses : `int`
+            Number of addresses included in each row.
+        int_size : `int`
+            Number of bytes to use for all integers.
+        """
+        key = uuid.UUID(int=int.from_bytes(stream.read(16)))
+        index = int.from_bytes(stream.read(int_size))
+        row = AddressRow(key, index)
+        for _ in range(n_addresses):
+            offset = int.from_bytes(stream.read(int_size))
+            size = int.from_bytes(stream.read(int_size))
+            row.addresses.append(Address(offset, size))
+        return row
+
+    def __str__(self) -> str:
+        return f"{self.key} {self.index:06} {' '.join(str(a) for a in self.addresses)}"
+
+
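A minimal round-trip sketch of the row layout above: each row is a 16-byte UUID, one integer index, and an ``(offset, size)`` pair per address. Here ``int_size = 8`` is an illustrative choice; in a real file the value is recorded in the address-file header.

```python
import uuid
from io import BytesIO

from lsst.pipe.base.quantum_graph._multiblock import Address, AddressRow

int_size = 8
row = AddressRow(
    key=uuid.uuid4(),
    index=0,
    addresses=[Address(offset=0, size=42), Address()],  # second block absent
)

buffer = BytesIO()
row.write(buffer, int_size)
# 16-byte UUID + one index + an (offset, size) pair per address:
assert buffer.tell() == 16 + int_size * (1 + 2 * len(row.addresses))

buffer.seek(0)
round_tripped = AddressRow.read(buffer, n_addresses=2, int_size=int_size)
assert round_tripped.key == row.key and round_tripped.addresses == row.addresses
```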
+@dataclasses.dataclass
+class AddressWriter:
+    """A helper object for writing address files for multi-block files."""
+
+    indices: dict[uuid.UUID, int] = dataclasses.field(default_factory=dict)
+    """Mapping from UUID to internal integer ID.
+
+    The internal integer ID must always correspond to the index into the
+    sorted list of all UUIDs, but this `dict` need not be sorted itself.
+    """
+
+    addresses: list[dict[uuid.UUID, Address]] = dataclasses.field(default_factory=list)
+    """Addresses to store with each UUID.
+
+    Every key in one of these dictionaries must have an entry in `indices`.
+    The converse is not true.
+    """
+
+    def write(self, stream: IO[bytes], int_size: int) -> None:
+        """Write all addresses to a file-like object.
+
+        Parameters
+        ----------
+        stream : `typing.IO` [ `bytes` ]
+            Binary file-like object.
+        int_size : `int`
+            Number of bytes to use for all integers.
+        """
+        for n, address_map in enumerate(self.addresses):
+            if not self.indices.keys() >= address_map.keys():
+                raise AssertionError(
+                    f"Logic bug in quantum graph I/O: address map {n} of {len(self.addresses)} has IDs "
+                    f"{address_map.keys() - self.indices.keys()} not in the index map."
+                )
+        stream.write(int_size.to_bytes(1))
+        stream.write(len(self.indices).to_bytes(int_size))
+        stream.write(len(self.addresses).to_bytes(int_size))
+        empty_address = Address()
+        for key in sorted(self.indices.keys(), key=attrgetter("int")):
+            row = AddressRow(key, self.indices[key], [m.get(key, empty_address) for m in self.addresses])
+            _LOG.debug("Wrote address %s.", row)
+            row.write(stream, int_size)
+
+    def write_to_zip(self, zf: zipfile.ZipFile, name: str, int_size: int) -> None:
+        """Write all addresses to a file in a zip archive.
+
+        Parameters
+        ----------
+        zf : `zipfile.ZipFile`
+            Zip archive to add the file to.
+        name : `str`
+            Base name for the address file; an extension will be added.
+        int_size : `int`
+            Number of bytes to use for all integers.
+        """
+        with zf.open(f"{name}.addr", mode="w") as stream:
+            self.write(stream, int_size=int_size)
+
+
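The invariant documented on `indices` (each integer ID is the UUID's position in UUID-sorted order) can be satisfied as sketched below; filling `addresses` from one or more `MultiblockWriter.addresses` maps is shown further down. All names here are illustrative.

```python
import uuid
from operator import attrgetter

from lsst.pipe.base.quantum_graph._multiblock import AddressWriter

keys = [uuid.uuid4() for _ in range(4)]
writer = AddressWriter(
    indices={key: i for i, key in enumerate(sorted(keys, key=attrgetter("int")))}
)
# writer.addresses is appended to later (one dict per multi-block file) before
# calling writer.write_to_zip(zf, "some_name", int_size).
```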
+@dataclasses.dataclass
+class AddressPage:
+    """A page of addresses in the `AddressReader`."""
+
+    file_offset: int
+    """Offset in bytes to this page from the beginning of the file."""
+
+    begin: int
+    """Index of the first row in this page."""
+
+    n_rows: int
+    """Number of rows in this page."""
+
+    read: bool = False
+    """Whether this page has already been read."""
+
+    @property
+    def end(self) -> int:
+        """One past the last row index in this page."""
+        return self.begin + self.n_rows
+
+
+@dataclasses.dataclass
+class PageBounds:
+    """A page index and the UUID interval that page covers."""
+
+    page_index: int
+    """Index into the page array."""
+
+    uuid_int_begin: UUID_int
+    """Integer representation of the smallest UUID in this page."""
+
+    uuid_int_end: UUID_int
+    """One larger than the integer representation of the largest UUID in this
+    page.
+    """
+
+    def __str__(self) -> str:
+        return f"{self.page_index} [{self.uuid_int_begin:x}:{self.uuid_int_end:x}]"
+
+
+@dataclasses.dataclass
+class AddressReader:
+    """A helper object for reading address files for multi-block files."""
+
+    stream: IO[bytes]
+    """Stream to read from."""
+
+    int_size: int
+    """Size of each integer in bytes."""
+
+    n_rows: int
+    """Number of rows in the file."""
+
+    n_addresses: int
+    """Number of addresses in each row."""
+
+    rows_per_page: int
+    """Number of rows in each page."""
+
+    rows: dict[uuid.UUID, AddressRow] = dataclasses.field(default_factory=dict)
+    """Rows that have already been read."""
+
+    rows_by_index: dict[int, AddressRow] = dataclasses.field(default_factory=dict)
+    """Rows that have already been read, keyed by integer index."""
+
+    pages: list[AddressPage] = dataclasses.field(default_factory=list)
+    page_bounds: dict[int, PageBounds] = dataclasses.field(default_factory=dict)
+    """Mapping from page index to page boundary information."""
+
+    @classmethod
+    def from_stream(
+        cls, stream: IO[bytes], *, page_size: int, n_addresses: int, int_size: int
+    ) -> AddressReader:
+        """Construct from a stream by reading the header.
+
+        Parameters
+        ----------
+        stream : `typing.IO` [ `bytes` ]
+            File-like object to read from.
+        page_size : `int`
+            Approximate number of bytes to read at a time when searching for an
+            address.
+        n_addresses : `int`
+            Number of addresses to expect per row. This is checked against
+            the size embedded in the file.
+        int_size : `int`
+            Number of bytes to use for all integers. This is checked against
+            the size embedded in the file.
+        """
+        header_size = cls.compute_header_size(int_size)
+        row_size = cls.compute_row_size(int_size, n_addresses)
+        # Read the raw header page.
+        header_page_data = stream.read(header_size)
+        if len(header_page_data) < header_size:
+            raise InvalidQuantumGraphFileError("Address file unexpectedly truncated.")
+        # Interpret the raw header data and initialize the reader instance.
+        header_page_stream = BytesIO(header_page_data)
+        file_int_size = int.from_bytes(header_page_stream.read(1))
+        if file_int_size != int_size:
+            raise InvalidQuantumGraphFileError(
+                f"int size in address file ({file_int_size}) does not match int size in header ({int_size})."
+            )
+        n_rows = int.from_bytes(header_page_stream.read(int_size))
+        file_n_addresses = int.from_bytes(header_page_stream.read(int_size))
+        if file_n_addresses != n_addresses:
+            raise InvalidQuantumGraphFileError(
+                f"Incorrect number of addresses per row: expected {n_addresses}, got {file_n_addresses}."
+            )
+        rows_per_page = max(page_size // row_size, 1)
+        # Construct an instance.
+        self = cls(stream, int_size, n_rows, n_addresses, rows_per_page=rows_per_page)
+        # Calculate positions of each page of rows.
+        row_index = 0
+        file_offset = header_size
+        while row_index < n_rows:
+            self.pages.append(AddressPage(file_offset=file_offset, begin=row_index, n_rows=rows_per_page))
+            row_index += rows_per_page
+            file_offset += rows_per_page * row_size
+        if row_index != n_rows:
+            # Last page was too big.
+            self.pages[-1].n_rows -= row_index - n_rows
+        assert sum(p.n_rows for p in self.pages) == n_rows, "Bad logic setting page row counts."
+        return self
+
+    @classmethod
+    @contextmanager
+    def open_in_zip(
+        cls,
+        zf: zipfile.ZipFile,
+        name: str,
+        *,
+        page_size: int,
+        n_addresses: int,
+        int_size: int,
+    ) -> Iterator[AddressReader]:
+        """Make a reader for an address file in a zip archive.
+
+        Parameters
+        ----------
+        zf : `zipfile.ZipFile`
+            Zip archive to read the file from.
+        name : `str`
+            Base name for the address file; an extension will be added.
+        page_size : `int`
+            Approximate number of bytes to read at a time when searching for an
+            address.
+        n_addresses : `int`
+            Number of addresses to expect per row. This is checked against
+            the size embedded in the file.
+        int_size : `int`
+            Number of bytes to use for all integers. This is checked against
+            the size embedded in the file.
+
+        Returns
+        -------
+        reader : `contextlib.AbstractContextManager` [ `AddressReader` ]
+            Context manager that returns a reader when entered.
+        """
+        with zf.open(f"{name}.addr", mode="r") as stream:
+            yield cls.from_stream(stream, page_size=page_size, n_addresses=n_addresses, int_size=int_size)
+
+    @staticmethod
+    def compute_header_size(int_size: int) -> int:
+        """Return the size (in bytes) of the header of an address file.
+
+        Parameters
+        ----------
+        int_size : `int`
+            Size of each integer in bytes.
+
+        Returns
+        -------
+        size : `int`
+            Size of the header in bytes.
+        """
+        return (
+            1  # int_size
+            + int_size  # number of rows
+            + int_size  # number of addresses in each row
+        )
+
+    @staticmethod
+    def compute_row_size(int_size: int, n_addresses: int) -> int:
+        """Return the size (in bytes) of each row of an address file.
+
+        Parameters
+        ----------
+        int_size : `int`
+            Size of each integer in bytes.
+        n_addresses : `int`
+            Number of addresses in each row.
+
+        Returns
+        -------
+        size : `int`
+            Size of each row in bytes.
+        """
+        return (
+            16  # uuid
+            + int_size
+            * (
+                1  # index
+                + 2 * n_addresses
+            )
+        )
+
+    @property
+    def row_size(self) -> int:
+        """The size (in bytes) of each row of this address file."""
+        return self.compute_row_size(self.int_size, self.n_addresses)
+
+    def read_all(self) -> dict[uuid.UUID, AddressRow]:
+        """Read all addresses in the file.
+
+        Returns
+        -------
+        rows : `dict` [ `uuid.UUID`, `AddressRow` ]
+            Mapping of loaded address rows, keyed by UUID.
+        """
+        # Skip any pages from the beginning that have already been read; this
+        # nicely handles the case where we already read everything (or there
+        # was nothing to read) while giving us a page with a file offset to
+        # start from.
+        for page in self.pages:
+            if not page.read:
+                break
+        else:
+            return self.rows
+        # Read the entire rest of the file into memory.
+        self.stream.seek(page.file_offset)
+        data = self.stream.read()
+        buffer = BytesIO(data)
+        # Shortcut out if we've already read everything, but don't bother
+        # optimizing previous partial reads.
+        while len(self.rows) < self.n_rows:
+            self._read_row(buffer)
+        # Delete all pages; they don't matter anymore, and that's easier than
+        # updating them to reflect the reads we've done.
+        self.pages.clear()
+        return self.rows
+
+    def find(self, key: uuid.UUID) -> AddressRow:
+        """Read the row for the given UUID.
+
+        Parameters
+        ----------
+        key : `uuid.UUID`
+            UUID to find.
+
+        Returns
+        -------
+        row : `AddressRow`
+            Addresses for the given UUID.
+        """
+        match key:
+            case uuid.UUID():
+                return self._find_uuid(key)
+            case _:
+                raise TypeError(f"Invalid argument: {key}.")
+
+    def _find_uuid(self, target: uuid.UUID) -> AddressRow:
+        if (row := self.rows.get(target)) is not None:
+            return row
+        if self.n_rows == 0 or not self.pages:
+            raise LookupError(f"Address for {target} not found.")
+
+        # Use a binary search to find the page containing the target UUID.
+        left = 0
+        right = len(self.pages) - 1
+        while left <= right:
+            mid = left + ((right - left) // 2)
+            self._read_page(mid)
+            if (row := self.rows.get(target)) is not None:
+                return row
+            bounds = self.page_bounds[mid]
+            if target.int < bounds.uuid_int_begin:
+                right = mid - 1
+            elif target.int > bounds.uuid_int_end:
+                left = mid + 1
+            else:
+                # Should have been on this page, but it wasn't.
+                raise LookupError(f"Address for {target} not found.")
+
+        # Ran out of pages to search.
+        raise LookupError(f"Address for {target} not found.")
+
+    def _read_page(self, page_index: int, page_stream: BytesIO | None = None) -> bool:
+        page = self.pages[page_index]
+        if page.read:
+            return False
+        if page_stream is None:
+            self.stream.seek(page.file_offset)
+            page_stream = BytesIO(self.stream.read(page.n_rows * self.row_size))
+        row = self._read_row(page_stream)
+        uuid_int_begin = row.key.int
+        for _ in range(1, page.n_rows):
+            row = self._read_row(page_stream)
+        uuid_int_end = row.key.int + 1  # Python's loop scoping rules are actually useful here!
+        page.read = True
+        bounds = PageBounds(page_index=page_index, uuid_int_begin=uuid_int_begin, uuid_int_end=uuid_int_end)
+        self.page_bounds[page_index] = bounds
+        _LOG.debug("Read page %s with rows [%s:%s].", bounds, page.begin, page.end)
+        return True
+
+    def _read_row(self, page_stream: BytesIO) -> AddressRow:
+        row = AddressRow.read(page_stream, self.n_addresses, self.int_size)
+        self.rows[row.key] = row
+        self.rows_by_index[row.index] = row
+        _LOG.debug("Read address row %s.", row)
+        return row
+
+
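A read-side sketch of a point lookup. The archive name, member name, `n_addresses`, and `int_size` below are hypothetical and must match whatever the writer used. `find` reads at most a few `page_size`-sized pages via the binary search above, and raises `LookupError` if the UUID is absent.

```python
import uuid
import zipfile

from lsst.pipe.base.quantum_graph._multiblock import DEFAULT_PAGE_SIZE, AddressReader

some_id = uuid.UUID("00000000-0000-0000-0000-000000000000")  # placeholder UUID

with zipfile.ZipFile("graph.zip") as zf:  # hypothetical file layout
    with AddressReader.open_in_zip(
        zf, "quanta", page_size=DEFAULT_PAGE_SIZE, n_addresses=2, int_size=8
    ) as reader:
        row = reader.find(some_id)
        print(row.addresses)
```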
+@dataclasses.dataclass
+class MultiblockWriter:
+    """A helper object for writing multi-block files."""
+
+    stream: IO[bytes]
+    """A binary file-like object to write to."""
+
+    int_size: int
+    """Number of bytes to use for all integers."""
+
+    file_size: int = 0
+    """Running size of the full file."""
+
+    addresses: dict[uuid.UUID, Address] = dataclasses.field(default_factory=dict)
+    """Running map of all addresses added to the file so far.
+
+    When the multi-block file is fully written, this is appended to the
+    `AddressWriter.addresses` to write the corresponding address file.
+    """
+
+    @classmethod
+    @contextmanager
+    def open_in_zip(cls, zf: zipfile.ZipFile, name: str, int_size: int) -> Iterator[MultiblockWriter]:
+        """Open a writer for a file in a zip archive.
+
+        Parameters
+        ----------
+        zf : `zipfile.ZipFile`
+            Zip archive to add the file to.
+        name : `str`
+            Base name for the multi-block file; an extension will be added.
+        int_size : `int`
+            Number of bytes to use for all integers.
+
+        Returns
+        -------
+        writer : `contextlib.AbstractContextManager` [ `MultiblockWriter` ]
+            Context manager that returns a writer when entered.
+        """
+        with zf.open(f"{name}.mb", mode="w", force_zip64=True) as stream:
+            yield MultiblockWriter(stream, int_size)
+
+    def write_bytes(self, id: uuid.UUID, data: bytes) -> Address:
+        """Write raw bytes to the multi-block file.
+
+        Parameters
+        ----------
+        id : `uuid.UUID`
+            Unique ID of the object described by this block.
+        data : `bytes`
+            Data to store directly.
+
+        Returns
+        -------
+        address : `Address`
+            Address of the bytes just written.
+        """
+        self.stream.write(len(data).to_bytes(self.int_size))
+        self.stream.write(data)
+        block_size = len(data) + self.int_size
+        address = Address(offset=self.file_size, size=block_size)
+        self.file_size += block_size
+        self.addresses[id] = address
+        return address
+
+    def write_model(self, id: uuid.UUID, model: pydantic.BaseModel, compressor: Compressor) -> Address:
+        """Write a model to the multi-block file as compressed JSON.
+
+        Parameters
+        ----------
+        id : `uuid.UUID`
+            Unique ID of the object described by this block.
+        model : `pydantic.BaseModel`
+            Model to convert to JSON and compress.
+        compressor : `Compressor`
+            Object with a `compress` method that takes and returns `bytes`.
+
+        Returns
+        -------
+        address : `Address`
+            Address of the bytes just written.
+        """
+        json_data = model.model_dump_json().encode()
+        compressed_data = compressor.compress(json_data)
+        return self.write_bytes(id, compressed_data)
+
+
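Putting the writer-side pieces together. Everything here is illustrative (the `Thing` model, the member name ``"things"``, ``int_size = 8``, and `zlib` as the compressor); the real quantum graph format defines its own models, member names, and codec.

```python
import uuid
import zipfile
import zlib

import pydantic

from lsst.pipe.base.quantum_graph._multiblock import AddressWriter, MultiblockWriter


class Thing(pydantic.BaseModel):  # stand-in payload model
    name: str


things = {uuid.uuid4(): Thing(name=f"thing-{i}") for i in range(3)}
int_size = 8

with zipfile.ZipFile("graph.zip", mode="w") as zf:
    with MultiblockWriter.open_in_zip(zf, "things", int_size) as mb_writer:
        for key, thing in things.items():
            mb_writer.write_model(key, thing, compressor=zlib)
    # Pair the data file with its address file so readers can seek to one block.
    address_writer = AddressWriter(
        indices={key: i for i, key in enumerate(sorted(things, key=lambda k: k.int))}
    )
    address_writer.addresses.append(mb_writer.addresses)
    address_writer.write_to_zip(zf, "things", int_size)
```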
+@dataclasses.dataclass
+class MultiblockReader:
+    """A helper object for reading multi-block files."""
+
+    stream: IO[bytes]
+    """A binary file-like object to read from."""
+
+    int_size: int
+    """Number of bytes to use for all integers."""
+
+    @classmethod
+    @contextmanager
+    def open_in_zip(cls, zf: zipfile.ZipFile, name: str, *, int_size: int) -> Iterator[MultiblockReader]:
+        """Open a reader for a file in a zip archive.
+
+        Parameters
+        ----------
+        zf : `zipfile.ZipFile`
+            Zip archive to read the file from.
+        name : `str`
+            Base name for the multi-block file; an extension will be added.
+        int_size : `int`
+            Number of bytes to use for all integers.
+
+        Returns
+        -------
+        reader : `contextlib.AbstractContextManager` [ `MultiblockReader` ]
+            Context manager that returns a reader when entered.
+        """
+        with zf.open(f"{name}.mb", mode="r") as stream:
+            yield MultiblockReader(stream, int_size)
+
+    @classmethod
+    def read_all_bytes_in_zip(
+        cls, zf: zipfile.ZipFile, name: str, *, int_size: int, page_size: int
+    ) -> Iterator[bytes]:
+        """Iterate over all of the byte blocks in a file in a zip archive.
+
+        Parameters
+        ----------
+        zf : `zipfile.ZipFile`
+            Zip archive to read the file from.
+        name : `str`
+            Base name for the multi-block file; an extension will be added.
+        int_size : `int`
+            Number of bytes to use for all integers.
+        page_size : `int`
+            Approximate number of bytes to read at a time.
+
+        Returns
+        -------
+        byte_iter : `~collections.abc.Iterator` [ `bytes` ]
+            Iterator over blocks.
+        """
+        with zf.open(f"{name}.mb", mode="r") as zf_stream:
+            # The standard library typing of IO[bytes] tiers isn't consistent.
+            buffered_stream = BufferedReader(zf_stream)  # type: ignore[type-var]
+            size_data = buffered_stream.read(int_size)
+            while size_data:
+                internal_size = int.from_bytes(size_data)
+                yield buffered_stream.read(internal_size)
+                size_data = buffered_stream.read(int_size)
+
+    @classmethod
+    def read_all_models_in_zip(
+        cls,
+        zf: zipfile.ZipFile,
+        name: str,
+        model_type: type[_T],
+        decompressor: Decompressor,
+        *,
+        int_size: int,
+        page_size: int,
+    ) -> Iterator[_T]:
+        """Iterate over all of the models in a file in a zip archive.
+
+        Parameters
+        ----------
+        zf : `zipfile.ZipFile`
+            Zip archive to read the file from.
+        name : `str`
+            Base name for the multi-block file; an extension will be added.
+        model_type : `type` [ `pydantic.BaseModel` ]
+            Pydantic model to validate JSON with.
+        decompressor : `Decompressor`
+            Object with a `decompress` method that takes and returns `bytes`.
+        int_size : `int`
+            Number of bytes to use for all integers.
+        page_size : `int`
+            Approximate number of bytes to read at a time.
+
+        Returns
+        -------
+        model_iter : `~collections.abc.Iterator` [ `pydantic.BaseModel` ]
+            Iterator over model instances.
+        """
+        for compressed_data in cls.read_all_bytes_in_zip(zf, name, int_size=int_size, page_size=page_size):
+            json_data = decompressor.decompress(compressed_data)
+            yield model_type.model_validate_json(json_data)
+
+    def read_bytes(self, address: Address) -> bytes | None:
+        """Read raw bytes from the multi-block file.
+
+        Parameters
+        ----------
+        address : `Address`
+            Offset and size of the data to read.
+
+        Returns
+        -------
+        data : `bytes` or `None`
+            Data read directly, or `None` if the address has zero size.
+        """
+        if not address.size:
+            return None
+        self.stream.seek(address.offset)
+        data = self.stream.read(address.size)
+        internal_size = int.from_bytes(data[: self.int_size])
+        data = data[self.int_size :]
+        if len(data) != internal_size:
+            raise InvalidQuantumGraphFileError(
+                f"Internal size {internal_size} does not match loaded data size {len(data)}."
+            )
+        return data
+
+    def read_model(self, address: Address, model_type: type[_T], decompressor: Decompressor) -> _T | None:
+        """Read a single compressed JSON block.
+
+        Parameters
+        ----------
+        address : `Address`
+            Size and offset of the block.
+        model_type : `type` [ `pydantic.BaseModel` ]
+            Pydantic model to validate JSON with.
+        decompressor : `Decompressor`
+            Object with a `decompress` method that takes and returns `bytes`.
+
+        Returns
+        -------
+        model : `pydantic.BaseModel` or `None`
+            Validated model, or `None` if the address has zero size.
+        """
+        compressed_data = self.read_bytes(address)
+        if compressed_data is None:
+            return None
+        json_data = decompressor.decompress(compressed_data)
+        return model_type.model_validate_json(json_data)
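
And the corresponding read-back, continuing the hypothetical write-side sketch above (same `Thing` model, `things` mapping, member name, and `zlib` codec). The address file gives random access to a single block; `read_all_models_in_zip` streams every block without consulting the addresses at all.

```python
import zipfile
import zlib

from lsst.pipe.base.quantum_graph._multiblock import (
    DEFAULT_PAGE_SIZE,
    AddressReader,
    MultiblockReader,
)

some_id = next(iter(things))  # from the write-side sketch above
int_size = 8                  # must match what the writer used

with zipfile.ZipFile("graph.zip") as zf:
    with AddressReader.open_in_zip(
        zf, "things", page_size=DEFAULT_PAGE_SIZE, n_addresses=1, int_size=int_size
    ) as addr_reader:
        address = addr_reader.find(some_id).addresses[0]
    with MultiblockReader.open_in_zip(zf, "things", int_size=int_size) as mb_reader:
        thing = mb_reader.read_model(address, Thing, decompressor=zlib)

    # Or iterate over every stored model in file order:
    for other in MultiblockReader.read_all_models_in_zip(
        zf, "things", Thing, zlib, int_size=int_size, page_size=DEFAULT_PAGE_SIZE
    ):
        print(other)
```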