lsst-pipe-base 29.2025.4100-py3-none-any.whl → 29.2025.4300-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (33)
  1. lsst/pipe/base/_status.py +1 -1
  2. lsst/pipe/base/cli/cmd/__init__.py +2 -2
  3. lsst/pipe/base/cli/cmd/commands.py +116 -1
  4. lsst/pipe/base/graph_walker.py +8 -4
  5. lsst/pipe/base/pipeline_graph/_pipeline_graph.py +30 -5
  6. lsst/pipe/base/quantum_graph/__init__.py +1 -0
  7. lsst/pipe/base/quantum_graph/_common.py +2 -1
  8. lsst/pipe/base/quantum_graph/_multiblock.py +41 -7
  9. lsst/pipe/base/quantum_graph/_predicted.py +62 -5
  10. lsst/pipe/base/quantum_graph/_provenance.py +1209 -0
  11. lsst/pipe/base/quantum_graph/aggregator/__init__.py +143 -0
  12. lsst/pipe/base/quantum_graph/aggregator/_communicators.py +981 -0
  13. lsst/pipe/base/quantum_graph/aggregator/_config.py +139 -0
  14. lsst/pipe/base/quantum_graph/aggregator/_ingester.py +312 -0
  15. lsst/pipe/base/quantum_graph/aggregator/_progress.py +208 -0
  16. lsst/pipe/base/quantum_graph/aggregator/_scanner.py +371 -0
  17. lsst/pipe/base/quantum_graph/aggregator/_structs.py +167 -0
  18. lsst/pipe/base/quantum_graph/aggregator/_supervisor.py +225 -0
  19. lsst/pipe/base/quantum_graph/aggregator/_writer.py +593 -0
  20. lsst/pipe/base/resource_usage.py +183 -0
  21. lsst/pipe/base/simple_pipeline_executor.py +4 -1
  22. lsst/pipe/base/tests/util.py +31 -0
  23. lsst/pipe/base/version.py +1 -1
  24. {lsst_pipe_base-29.2025.4100.dist-info → lsst_pipe_base-29.2025.4300.dist-info}/METADATA +1 -1
  25. {lsst_pipe_base-29.2025.4100.dist-info → lsst_pipe_base-29.2025.4300.dist-info}/RECORD +33 -22
  26. {lsst_pipe_base-29.2025.4100.dist-info → lsst_pipe_base-29.2025.4300.dist-info}/WHEEL +0 -0
  27. {lsst_pipe_base-29.2025.4100.dist-info → lsst_pipe_base-29.2025.4300.dist-info}/entry_points.txt +0 -0
  28. {lsst_pipe_base-29.2025.4100.dist-info → lsst_pipe_base-29.2025.4300.dist-info}/licenses/COPYRIGHT +0 -0
  29. {lsst_pipe_base-29.2025.4100.dist-info → lsst_pipe_base-29.2025.4300.dist-info}/licenses/LICENSE +0 -0
  30. {lsst_pipe_base-29.2025.4100.dist-info → lsst_pipe_base-29.2025.4300.dist-info}/licenses/bsd_license.txt +0 -0
  31. {lsst_pipe_base-29.2025.4100.dist-info → lsst_pipe_base-29.2025.4300.dist-info}/licenses/gpl-v3.0.txt +0 -0
  32. {lsst_pipe_base-29.2025.4100.dist-info → lsst_pipe_base-29.2025.4300.dist-info}/top_level.txt +0 -0
  33. {lsst_pipe_base-29.2025.4100.dist-info → lsst_pipe_base-29.2025.4300.dist-info}/zip-safe +0 -0
lsst/pipe/base/quantum_graph/aggregator/_writer.py
@@ -0,0 +1,593 @@
+ # This file is part of pipe_base.
+ #
+ # Developed for the LSST Data Management System.
+ # This product includes software developed by the LSST Project
+ # (http://www.lsst.org).
+ # See the COPYRIGHT file at the top-level directory of this distribution
+ # for details of code ownership.
+ #
+ # This software is dual licensed under the GNU General Public License and also
+ # under a 3-clause BSD license. Recipients may choose which of these licenses
+ # to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
+ # respectively. If you choose the GPL option then the following text applies
+ # (but note that there is still no warranty even if you opt for BSD instead):
+ #
+ # This program is free software: you can redistribute it and/or modify
+ # it under the terms of the GNU General Public License as published by
+ # the Free Software Foundation, either version 3 of the License, or
+ # (at your option) any later version.
+ #
+ # This program is distributed in the hope that it will be useful,
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ # GNU General Public License for more details.
+ #
+ # You should have received a copy of the GNU General Public License
+ # along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+ from __future__ import annotations
+
+ __all__ = ("Writer",)
+
+ import dataclasses
+ import enum
+ import itertools
+ import logging
+ import operator
+ import uuid
+ from typing import TypeVar
+
+ import networkx
+ import zstandard
+
+ from lsst.utils.packages import Packages
+
+ from ... import automatic_connection_constants as acc
+ from ...pipeline_graph import TaskImportMode
+ from .._common import BaseQuantumGraphWriter
+ from .._multiblock import Compressor, MultiblockWriter
+ from .._predicted import PredictedDatasetModel, PredictedQuantumGraphComponents, PredictedQuantumGraphReader
+ from .._provenance import (
+     DATASET_ADDRESS_INDEX,
+     DATASET_MB_NAME,
+     LOG_ADDRESS_INDEX,
+     LOG_MB_NAME,
+     METADATA_ADDRESS_INDEX,
+     METADATA_MB_NAME,
+     QUANTUM_ADDRESS_INDEX,
+     QUANTUM_MB_NAME,
+     ProvenanceDatasetModel,
+     ProvenanceInitQuantaModel,
+     ProvenanceInitQuantumModel,
+     ProvenanceQuantumModel,
+ )
+ from ._communicators import WriterCommunicator
+ from ._structs import ScanResult
+
+
+ class _CompressionState(enum.Enum):
+     """Enumeration of the possible states of compression in `_ScanData`."""
+
+     NOT_COMPRESSED = enum.auto()
+     """Nothing is compressed."""
+
+     LOG_AND_METADATA_COMPRESSED = enum.auto()
+     """Only the logs and metadata are compressed."""
+
+     ALL_COMPRESSED = enum.auto()
+     """All `bytes` are compressed."""
+
+
+ @dataclasses.dataclass
+ class _ScanData:
+     """Information from a quantum scan that has been partially processed for
+     writing.
+     """
+
+     quantum_id: uuid.UUID
+     """Unique ID of the quantum."""
+
+     log_id: uuid.UUID
+     """Unique ID of the log dataset."""
+
+     metadata_id: uuid.UUID
+     """Unique ID of the metadata dataset."""
+
+     quantum: bytes = b""
+     """Possibly-compressed JSON representation of the quantum provenance."""
+
+     datasets: dict[uuid.UUID, bytes] = dataclasses.field(default_factory=dict)
+     """Possibly-compressed JSON representation of output dataset provenance."""
+
+     log: bytes = b""
+     """Possibly-compressed log dataset content."""
+
+     metadata: bytes = b""
+     """Possibly-compressed metadata dataset content."""
+
+     compression: _CompressionState = _CompressionState.NOT_COMPRESSED
+     """Which data is compressed, if any."""
+
+     def compress(self, compressor: Compressor) -> None:
+         """Compress all data in place, if it isn't already compressed.
+
+         Parameters
+         ----------
+         compressor : `Compressor`
+             Object that can compress `bytes`.
+         """
+         if self.compression is _CompressionState.NOT_COMPRESSED:
+             self.metadata = compressor.compress(self.metadata)
+             self.log = compressor.compress(self.log)
+             self.compression = _CompressionState.LOG_AND_METADATA_COMPRESSED
+         if self.compression is _CompressionState.LOG_AND_METADATA_COMPRESSED:
+             self.quantum = compressor.compress(self.quantum)
+             for key in self.datasets.keys():
+                 self.datasets[key] = compressor.compress(self.datasets[key])
+             self.compression = _CompressionState.ALL_COMPRESSED
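
The two `if` statements above make `compress` both staged and idempotent: scanners may hand over logs and metadata that are already compressed (the `LOG_AND_METADATA_COMPRESSED` state), so only whatever is still raw gets compressed, and a second call is a no-op. A minimal standalone sketch of the same staged-state pattern, using `zstandard` directly instead of this package's `Compressor` protocol:

    import enum
    import zstandard

    class State(enum.Enum):
        RAW = enum.auto()
        PARTIAL = enum.auto()
        DONE = enum.auto()

    state = State.RAW
    log, quantum = b'{"message": "..."}', b'{"quantum_id": "..."}'
    compressor = zstandard.ZstdCompressor(level=3)
    if state is State.RAW:
        log = compressor.compress(log)  # stage 1: logs/metadata
        state = State.PARTIAL
    if state is State.PARTIAL:
        quantum = compressor.compress(quantum)  # stage 2: everything else
        state = State.DONE
    # Falling through both branches from RAW compresses everything exactly once.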
+
+
+ @dataclasses.dataclass
+ class _DataWriters:
+     """A struct of low-level writer objects for the main components of a
+     provenance quantum graph.
+
+     Parameters
+     ----------
+     comms : `WriterCommunicator`
+         Communicator helper object for the writer.
+     predicted : `.PredictedQuantumGraphComponents`
+         Components of the predicted graph.
+     indices : `dict` [ `uuid.UUID`, `int` ]
+         Mapping from UUID to internal integer ID, including both quanta and
+         datasets.
+     compressor : `Compressor`
+         Object that can compress `bytes`.
+     cdict_data : `bytes` or `None`, optional
+         Bytes representation of the compression dictionary used by the
+         compressor.
+     """
+
+     def __init__(
+         self,
+         comms: WriterCommunicator,
+         predicted: PredictedQuantumGraphComponents,
+         indices: dict[uuid.UUID, int],
+         compressor: Compressor,
+         cdict_data: bytes | None = None,
+     ) -> None:
+         assert comms.config.output_path is not None
+         header = predicted.header.model_copy()
+         header.graph_type = "provenance"
+         self.graph = comms.enter(
+             BaseQuantumGraphWriter.open(
+                 comms.config.output_path,
+                 header,
+                 predicted.pipeline_graph,
+                 indices,
+                 address_filename="nodes",
+                 compressor=compressor,
+                 cdict_data=cdict_data,
+             ),
+             on_close="Finishing writing provenance quantum graph.",
+             is_progress_log=True,
+         )
+         self.graph.address_writer.addresses = [{}, {}, {}, {}]
+         self.logs = comms.enter(
+             MultiblockWriter.open_in_zip(self.graph.zf, LOG_MB_NAME, header.int_size, use_tempfile=True),
+             on_close="Copying logs into zip archive.",
+             is_progress_log=True,
+         )
+         self.graph.address_writer.addresses[LOG_ADDRESS_INDEX] = self.logs.addresses
+         self.metadata = comms.enter(
+             MultiblockWriter.open_in_zip(self.graph.zf, METADATA_MB_NAME, header.int_size, use_tempfile=True),
+             on_close="Copying metadata into zip archive.",
+             is_progress_log=True,
+         )
+         self.graph.address_writer.addresses[METADATA_ADDRESS_INDEX] = self.metadata.addresses
+         self.datasets = comms.enter(
+             MultiblockWriter.open_in_zip(self.graph.zf, DATASET_MB_NAME, header.int_size, use_tempfile=True),
+             on_close="Copying dataset provenance into zip archive.",
+             is_progress_log=True,
+         )
+         self.graph.address_writer.addresses[DATASET_ADDRESS_INDEX] = self.datasets.addresses
+         self.quanta = comms.enter(
+             MultiblockWriter.open_in_zip(self.graph.zf, QUANTUM_MB_NAME, header.int_size, use_tempfile=True),
+             on_close="Copying quantum provenance into zip archive.",
+             is_progress_log=True,
+         )
+         self.graph.address_writer.addresses[QUANTUM_ADDRESS_INDEX] = self.quanta.addresses
+
+     graph: BaseQuantumGraphWriter
+     """The parent graph writer."""
+
+     datasets: MultiblockWriter
+     """A writer for dataset provenance."""
+
+     quanta: MultiblockWriter
+     """A writer for quantum provenance."""
+
+     metadata: MultiblockWriter
+     """A writer for metadata content."""
+
+     logs: MultiblockWriter
+     """A writer for log content."""
+
+     @property
+     def compressor(self) -> Compressor:
+         """Object that should be used to compress all JSON blocks."""
+         return self.graph.compressor
+
+
+ @dataclasses.dataclass
+ class Writer:
+     """A helper class for the provenance aggregator that actually writes the
+     provenance quantum graph file.
+     """
+
+     predicted_path: str
+     """Path to the predicted quantum graph."""
+
+     comms: WriterCommunicator
+     """Communicator object for this worker."""
+
+     predicted: PredictedQuantumGraphComponents = dataclasses.field(init=False)
+     """Components of the predicted quantum graph."""
+
+     existing_init_outputs: dict[uuid.UUID, set[uuid.UUID]] = dataclasses.field(default_factory=dict)
+     """Mapping that tracks which init-outputs exist.
+
+     This mapping is updated as scanners inform the writer about init-output
+     existence, since we want to write that provenance information out only at
+     the end.
+     """
+
+     indices: dict[uuid.UUID, int] = dataclasses.field(default_factory=dict)
+     """Mapping from UUID to internal integer ID, including both quanta and
+     datasets.
+
+     This is fully initialized at construction.
+     """
+
+     output_dataset_ids: set[uuid.UUID] = dataclasses.field(default_factory=set)
+     """The IDs of all datasets that are produced by this graph.
+
+     This is fully initialized at construction.
+     """
+
+     overall_inputs: dict[uuid.UUID, PredictedDatasetModel] = dataclasses.field(default_factory=dict)
+     """All datasets that are not produced by any quantum in this graph."""
+
+     xgraph: networkx.DiGraph = dataclasses.field(default_factory=networkx.DiGraph)
+     """A bipartite NetworkX graph linking datasets to quanta and quanta to
+     datasets.
+
+     This is fully initialized at construction. There are no node or edge
+     attributes in this graph; we only need it to store adjacency information
+     with datasets as well as with quanta.
+     """
+
+     pending_compression_training: list[_ScanData] = dataclasses.field(default_factory=list)
+     """Partially processed quantum scans that are being accumulated in order to
+     build a compression dictionary.
+     """
+
+     def __post_init__(self) -> None:
+         assert self.comms.config.output_path is not None, "Writer should not be used if writing is disabled."
+         self.comms.log.info("Reading predicted quantum graph.")
+         with PredictedQuantumGraphReader.open(
+             self.predicted_path, import_mode=TaskImportMode.DO_NOT_IMPORT
+         ) as reader:
+             self.comms.check_for_cancel()
+             reader.read_init_quanta()
+             self.comms.check_for_cancel()
+             reader.read_quantum_datasets()
+             self.predicted = reader.components
+         for predicted_init_quantum in self.predicted.init_quanta.root:
+             self.existing_init_outputs[predicted_init_quantum.quantum_id] = set()
+         self.comms.check_for_cancel()
+         self.comms.log.info("Generating integer indexes and identifying outputs.")
+         self._populate_indices_and_outputs()
+         self.comms.check_for_cancel()
+         self._populate_xgraph_and_inputs()
+         self.comms.check_for_cancel()
+         self.comms.log_progress(
+             # We add one here for 'packages', which we do ingest but don't
+             # record provenance for.
+             logging.INFO,
+             f"Graph has {len(self.output_dataset_ids) + 1} predicted output dataset(s).",
+         )
+
+     def _populate_indices_and_outputs(self) -> None:
+         all_uuids = set(self.predicted.quantum_indices.keys())
+         for quantum in self.comms.periodically_check_for_cancel(
+             itertools.chain(
+                 self.predicted.init_quanta.root,
+                 self.predicted.quantum_datasets.values(),
+             )
+         ):
+             if not quantum.task_label:
+                 # Skip the 'packages' producer quantum.
+                 continue
+             all_uuids.update(quantum.iter_input_dataset_ids())
+             self.output_dataset_ids.update(quantum.iter_output_dataset_ids())
+         all_uuids.update(self.output_dataset_ids)
+         self.indices = {
+             node_id: node_index
+             for node_index, node_id in self.comms.periodically_check_for_cancel(
+                 enumerate(sorted(all_uuids, key=operator.attrgetter("int")))
+             )
+         }
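
In other words, a node's integer index is simply the rank of its UUID when all quantum and dataset UUIDs are sorted by their 128-bit integer value. A self-contained illustration with hypothetical UUIDs:

    import operator
    import uuid

    all_uuids = {uuid.uuid4() for _ in range(5)}
    indices = {
        node_id: node_index
        for node_index, node_id in enumerate(sorted(all_uuids, key=operator.attrgetter("int")))
    }
    # Indices are dense and deterministic for a given set of UUIDs.
    assert sorted(indices.values()) == list(range(len(all_uuids)))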
+
+     def _populate_xgraph_and_inputs(self) -> None:
+         for predicted_quantum in self.comms.periodically_check_for_cancel(
+             itertools.chain(
+                 self.predicted.init_quanta.root,
+                 self.predicted.quantum_datasets.values(),
+             )
+         ):
+             if not predicted_quantum.task_label:
+                 # Skip the 'packages' producer quantum.
+                 continue
+             quantum_index = self.indices[predicted_quantum.quantum_id]
+             for predicted_input in itertools.chain.from_iterable(predicted_quantum.inputs.values()):
+                 self.xgraph.add_edge(self.indices[predicted_input.dataset_id], quantum_index)
+                 if predicted_input.dataset_id not in self.output_dataset_ids:
+                     self.overall_inputs.setdefault(predicted_input.dataset_id, predicted_input)
+             for predicted_output in itertools.chain.from_iterable(predicted_quantum.outputs.values()):
+                 self.xgraph.add_edge(quantum_index, self.indices[predicted_output.dataset_id])
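
Because the graph alternates dataset and quantum nodes, plain `successors`/`predecessors` calls answer the provenance questions the writer needs: the successors of a dataset index are its consuming quanta, and its (at most one) predecessor is its producer. A toy sketch with hypothetical integer node IDs:

    import networkx

    xgraph = networkx.DiGraph()
    xgraph.add_edge(0, 1)  # dataset 0 is an input of quantum 1
    xgraph.add_edge(1, 2)  # quantum 1 produces dataset 2

    assert list(xgraph.successors(0)) == [1]    # consumers of dataset 0
    assert list(xgraph.predecessors(2)) == [1]  # producer of dataset 2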
+
+     @staticmethod
+     def run(predicted_path: str, comms: WriterCommunicator) -> None:
+         """Run the writer.
+
+         Parameters
+         ----------
+         predicted_path : `str`
+             Path to the predicted quantum graph.
+         comms : `WriterCommunicator`
+             Communicator for the writer.
+
+         Notes
+         -----
+         This method is designed to run as the ``target`` in
+         `WorkerContext.make_worker`.
+         """
+         with comms:
+             writer = Writer(predicted_path, comms)
+             writer.loop()
+
+     def loop(self) -> None:
+         """Run the main loop for the writer."""
+         data_writers: _DataWriters | None = None
+         if not self.comms.config.zstd_dict_size:
+             data_writers = self.make_data_writers()
+         self.comms.log.info("Polling for write requests from scanners.")
+         for request in self.comms.poll():
+             if data_writers is None:
+                 self.pending_compression_training.extend(self.make_scan_data(request))
+                 if len(self.pending_compression_training) >= self.comms.config.zstd_dict_n_inputs:
+                     data_writers = self.make_data_writers()
+             else:
+                 for scan_data in self.make_scan_data(request):
+                     self.write_scan_data(scan_data, data_writers)
+         if data_writers is None:
+             data_writers = self.make_data_writers()
+         self.write_init_outputs(data_writers)
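
`loop` thus runs a two-phase protocol: while `data_writers` is still `None`, scans are only buffered; once `zstd_dict_n_inputs` scans have accumulated (or immediately, if dictionary training is disabled), the writers are opened, the buffer is flushed, and every later scan is written as it arrives. An abstracted sketch of that control flow, with plain lists standing in for the real scan/write machinery:

    def writer_loop(requests: list[bytes], n_training_inputs: int) -> list[bytes]:
        written: list[bytes] = []
        buffered: list[bytes] | None = [] if n_training_inputs else None
        for item in requests:
            if buffered is not None:
                buffered.append(item)  # phase 1: accumulate training samples
                if len(buffered) >= n_training_inputs:
                    written.extend(buffered)  # train dict, open writers, flush
                    buffered = None
            else:
                written.append(item)  # phase 2: write immediately
        if buffered is not None:
            written.extend(buffered)  # threshold never reached: flush at the end
        return written

    assert writer_loop([b"a", b"b", b"c"], 2) == [b"a", b"b", b"c"]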
+
+     def make_data_writers(self) -> _DataWriters:
+         """Make a compression dictionary, open the low-level writers, and
+         write any accumulated scans that were needed to make the compression
+         dictionary.
+
+         Returns
+         -------
+         data_writers : `_DataWriters`
+             Low-level writers struct.
+         """
+         cdict = self.make_compression_dictionary()
+         self.comms.send_compression_dict(cdict.as_bytes())
+         assert self.comms.config.output_path is not None
+         self.comms.log.info("Opening output files.")
+         data_writers = _DataWriters(
+             self.comms,
+             self.predicted,
+             self.indices,
+             compressor=zstandard.ZstdCompressor(self.comms.config.zstd_level, cdict),
+             cdict_data=cdict.as_bytes(),
+         )
+         self.comms.check_for_cancel()
+         self.comms.log.info("Compressing and writing queued scan requests.")
+         for scan_data in self.pending_compression_training:
+             self.write_scan_data(scan_data, data_writers)
+         del self.pending_compression_training
+         self.comms.check_for_cancel()
+         self.write_overall_inputs(data_writers)
+         self.write_packages(data_writers)
+         self.comms.log.info("Returning to write request loop.")
+         return data_writers
+
+     def make_compression_dictionary(self) -> zstandard.ZstdCompressionDict:
+         """Make the compression dictionary.
+
+         Returns
+         -------
+         cdict : `zstandard.ZstdCompressionDict`
+             The compression dictionary.
+         """
+         if (
+             not self.comms.config.zstd_dict_size
+             or len(self.pending_compression_training) < self.comms.config.zstd_dict_n_inputs
+         ):
+             self.comms.log.info("Making compressor with no dictionary.")
+             return zstandard.ZstdCompressionDict(b"")
+         self.comms.log.info("Training compression dictionary.")
+         training_inputs: list[bytes] = []
+         # We start the dictionary training with *predicted* quantum dataset
+         # models, since those have almost all of the same attributes as the
+         # provenance quantum and dataset models, and we can get a nice random
+         # sample from just the first N, since they're ordered by UUID. We
+         # chop out the datastore records since those don't appear in the
+         # provenance graph.
+         for predicted_quantum in self.predicted.quantum_datasets.values():
+             if len(training_inputs) == self.comms.config.zstd_dict_n_inputs:
+                 break
+             predicted_quantum.datastore_records.clear()
+             training_inputs.append(predicted_quantum.model_dump_json().encode())
+         # Add the provenance quanta, metadata, and logs we've accumulated.
+         for scan_data in self.pending_compression_training:
+             assert scan_data.compression is _CompressionState.NOT_COMPRESSED
+             training_inputs.append(scan_data.quantum)
+             training_inputs.append(scan_data.metadata)
+             training_inputs.append(scan_data.log)
+         return zstandard.train_dictionary(self.comms.config.zstd_dict_size, training_inputs)
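
For context, `zstandard.train_dictionary` takes a target dictionary size in bytes plus a list of sample payloads, and returns a `ZstdCompressionDict` that is then shared by the compressor here and, via `cdict_data`, by readers of the finished file. A minimal usage sketch with synthetic samples (training can fail on corpora that are too small or too uniform, which is one reason the method above falls back to an empty dictionary):

    import zstandard

    samples = [f'{{"task": "task_{i % 8}", "quantum": {i}, "status": "ok"}}'.encode() for i in range(2048)]
    cdict = zstandard.train_dictionary(4096, samples)
    compressor = zstandard.ZstdCompressor(level=10, dict_data=cdict)
    blob = compressor.compress(samples[0])
    # Decompression requires the same dictionary bytes stored alongside the data.
    restored = zstandard.ZstdDecompressor(dict_data=cdict).decompress(blob)
    assert restored == samples[0]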
+
+     def write_init_outputs(self, data_writers: _DataWriters) -> None:
+         """Write provenance for init-output datasets and init-quanta.
+
+         Parameters
+         ----------
+         data_writers : `_DataWriters`
+             Low-level writers struct.
+         """
+         self.comms.log.info("Writing init outputs.")
+         init_quanta = ProvenanceInitQuantaModel()
+         for predicted_init_quantum in self.predicted.init_quanta.root:
+             if not predicted_init_quantum.task_label:
+                 # Skip the 'packages' producer quantum.
+                 continue
+             existing_outputs = self.existing_init_outputs[predicted_init_quantum.quantum_id]
+             for predicted_output in itertools.chain.from_iterable(predicted_init_quantum.outputs.values()):
+                 dataset_index = self.indices[predicted_output.dataset_id]
+                 provenance_output = ProvenanceDatasetModel.from_predicted(
+                     predicted_output,
+                     producer=self.indices[predicted_init_quantum.quantum_id],
+                     consumers=self.xgraph.successors(dataset_index),
+                 )
+                 provenance_output.exists = predicted_output.dataset_id in existing_outputs
+                 data_writers.datasets.write_model(
+                     provenance_output.dataset_id, provenance_output, data_writers.compressor
+                 )
+             init_quanta.root.append(
+                 ProvenanceInitQuantumModel.from_predicted(predicted_init_quantum, self.indices)
+             )
+         data_writers.graph.write_single_model("init_quanta", init_quanta)
+
+     def write_overall_inputs(self, data_writers: _DataWriters) -> None:
+         """Write provenance for overall-input datasets.
+
+         Parameters
+         ----------
+         data_writers : `_DataWriters`
+             Low-level writers struct.
+         """
+         self.comms.log.info("Writing overall inputs.")
+         for predicted_input in self.comms.periodically_check_for_cancel(self.overall_inputs.values()):
+             if predicted_input.dataset_id not in data_writers.datasets.addresses:
+                 dataset_index = self.indices[predicted_input.dataset_id]
+                 data_writers.datasets.write_model(
+                     predicted_input.dataset_id,
+                     ProvenanceDatasetModel.from_predicted(
+                         predicted_input,
+                         producer=None,
+                         consumers=self.xgraph.successors(dataset_index),
+                     ),
+                     data_writers.compressor,
+                 )
+         del self.overall_inputs
+
+     @staticmethod
+     def write_packages(data_writers: _DataWriters) -> None:
+         """Write package version information to the provenance graph.
+
+         Parameters
+         ----------
+         data_writers : `_DataWriters`
+             Low-level writers struct.
+         """
+         packages = Packages.fromSystem(include_all=True)
+         data = packages.toBytes("json")
+         data_writers.graph.write_single_block("packages", data)
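
`Packages` here comes from `lsst.utils.packages` (imported at the top of the file) and snapshots the versions of the packages in the running environment; the JSON bytes written into the graph can be rehydrated later. A small sketch of the round trip, assuming `lsst.utils` is available and that `fromBytes` accepts the same format name as `toBytes`:

    from lsst.utils.packages import Packages

    packages = Packages.fromSystem(include_all=True)
    data = packages.toBytes("json")
    # Readers of the provenance graph can reconstruct the same mapping.
    restored = Packages.fromBytes(data, "json")
    assert restored == packages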
+
+     def make_scan_data(self, request: ScanResult) -> list[_ScanData]:
+         """Process a `ScanResult` into `_ScanData`.
+
+         Parameters
+         ----------
+         request : `ScanResult`
+             Result of a quantum scan.
+
+         Returns
+         -------
+         data : `list` [ `_ScanData` ]
+             A zero- or single-element list of `_ScanData` to write or save for
+             compression-dict training. A zero-element list is returned if the
+             scan actually represents an init quantum.
+         """
+         if (existing_init_outputs := self.existing_init_outputs.get(request.quantum_id)) is not None:
+             self.comms.log.debug("Handling init-output scan for %s.", request.quantum_id)
+             existing_init_outputs.update(request.existing_outputs)
+             self.comms.report_write()
+             return []
+         self.comms.log.debug("Handling quantum scan for %s.", request.quantum_id)
+         predicted_quantum = self.predicted.quantum_datasets[request.quantum_id]
+         quantum_index = self.indices[predicted_quantum.quantum_id]
+         (metadata_output,) = predicted_quantum.outputs[acc.METADATA_OUTPUT_CONNECTION_NAME]
+         (log_output,) = predicted_quantum.outputs[acc.LOG_OUTPUT_CONNECTION_NAME]
+         data = _ScanData(
+             request.quantum_id,
+             metadata_id=metadata_output.dataset_id,
+             log_id=log_output.dataset_id,
+             compression=(
+                 _CompressionState.LOG_AND_METADATA_COMPRESSED
+                 if request.is_compressed
+                 else _CompressionState.NOT_COMPRESSED
+             ),
+         )
+         for predicted_output in itertools.chain.from_iterable(predicted_quantum.outputs.values()):
+             dataset_index = self.indices[predicted_output.dataset_id]
+             provenance_output = ProvenanceDatasetModel.from_predicted(
+                 predicted_output,
+                 producer=quantum_index,
+                 consumers=self.xgraph.successors(dataset_index),
+             )
+             provenance_output.exists = provenance_output.dataset_id in request.existing_outputs
+             data.datasets[provenance_output.dataset_id] = provenance_output.model_dump_json().encode()
+         provenance_quantum = ProvenanceQuantumModel.from_predicted(predicted_quantum, self.indices)
+         provenance_quantum.status = request.get_run_status()
+         provenance_quantum.caveats = request.caveats
+         provenance_quantum.exception = request.exception
+         provenance_quantum.resource_usage = request.resource_usage
+         data.quantum = provenance_quantum.model_dump_json().encode()
+         data.metadata = request.metadata
+         data.log = request.log
+         return [data]
+
+     def write_scan_data(self, scan_data: _ScanData, data_writers: _DataWriters) -> None:
+         """Write scan data to the provenance graph.
+
+         Parameters
+         ----------
+         scan_data : `_ScanData`
+             Preprocessed information to write.
+         data_writers : `_DataWriters`
+             Low-level writers struct.
+         """
+         self.comms.log.debug("Writing quantum %s.", scan_data.quantum_id)
+         scan_data.compress(data_writers.compressor)
+         data_writers.quanta.write_bytes(scan_data.quantum_id, scan_data.quantum)
+         for dataset_id, dataset_data in scan_data.datasets.items():
+             data_writers.datasets.write_bytes(dataset_id, dataset_data)
+         if scan_data.metadata:
+             address = data_writers.metadata.write_bytes(scan_data.quantum_id, scan_data.metadata)
+             data_writers.metadata.addresses[scan_data.metadata_id] = address
+         if scan_data.log:
+             address = data_writers.logs.write_bytes(scan_data.quantum_id, scan_data.log)
+             data_writers.logs.addresses[scan_data.log_id] = address
+         # We shouldn't need this predicted quantum anymore; delete it in the
+         # hope that this will free up some memory.
+         del self.predicted.quantum_datasets[scan_data.quantum_id]
+         self.comms.report_write()
+
+
+ _T = TypeVar("_T")