lsst-pipe-base 29.2025.4800__py3-none-any.whl → 30.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lsst/pipe/base/_instrument.py +6 -5
- lsst/pipe/base/caching_limited_butler.py +3 -0
- lsst/pipe/base/log_capture.py +39 -79
- lsst/pipe/base/log_on_close.py +79 -0
- lsst/pipe/base/mp_graph_executor.py +51 -15
- lsst/pipe/base/quantum_graph/_common.py +4 -3
- lsst/pipe/base/quantum_graph/_multiblock.py +6 -16
- lsst/pipe/base/quantum_graph/_predicted.py +106 -12
- lsst/pipe/base/quantum_graph/_provenance.py +657 -6
- lsst/pipe/base/quantum_graph/aggregator/_communicators.py +18 -50
- lsst/pipe/base/quantum_graph/aggregator/_ingester.py +14 -3
- lsst/pipe/base/quantum_graph/aggregator/_scanner.py +49 -232
- lsst/pipe/base/quantum_graph/aggregator/_structs.py +3 -113
- lsst/pipe/base/quantum_graph/aggregator/_supervisor.py +10 -5
- lsst/pipe/base/quantum_graph/aggregator/_writer.py +31 -348
- lsst/pipe/base/quantum_graph/formatter.py +101 -0
- lsst/pipe/base/quantum_graph_builder.py +12 -1
- lsst/pipe/base/quantum_graph_executor.py +116 -13
- lsst/pipe/base/quantum_graph_skeleton.py +1 -7
- lsst/pipe/base/script/register_instrument.py +4 -4
- lsst/pipe/base/script/retrieve_artifacts_for_quanta.py +5 -6
- lsst/pipe/base/script/transfer_from_graph.py +42 -42
- lsst/pipe/base/script/zip_from_graph.py +7 -8
- lsst/pipe/base/separable_pipeline_executor.py +18 -2
- lsst/pipe/base/simple_pipeline_executor.py +4 -3
- lsst/pipe/base/single_quantum_executor.py +70 -34
- lsst/pipe/base/tests/mocks/_repo.py +44 -16
- lsst/pipe/base/tests/simpleQGraph.py +43 -35
- lsst/pipe/base/version.py +1 -1
- {lsst_pipe_base-29.2025.4800.dist-info → lsst_pipe_base-30.0.0.dist-info}/METADATA +1 -1
- {lsst_pipe_base-29.2025.4800.dist-info → lsst_pipe_base-30.0.0.dist-info}/RECORD +39 -37
- {lsst_pipe_base-29.2025.4800.dist-info → lsst_pipe_base-30.0.0.dist-info}/WHEEL +1 -1
- {lsst_pipe_base-29.2025.4800.dist-info → lsst_pipe_base-30.0.0.dist-info}/entry_points.txt +0 -0
- {lsst_pipe_base-29.2025.4800.dist-info → lsst_pipe_base-30.0.0.dist-info}/licenses/COPYRIGHT +0 -0
- {lsst_pipe_base-29.2025.4800.dist-info → lsst_pipe_base-30.0.0.dist-info}/licenses/LICENSE +0 -0
- {lsst_pipe_base-29.2025.4800.dist-info → lsst_pipe_base-30.0.0.dist-info}/licenses/bsd_license.txt +0 -0
- {lsst_pipe_base-29.2025.4800.dist-info → lsst_pipe_base-30.0.0.dist-info}/licenses/gpl-v3.0.txt +0 -0
- {lsst_pipe_base-29.2025.4800.dist-info → lsst_pipe_base-30.0.0.dist-info}/top_level.txt +0 -0
- {lsst_pipe_base-29.2025.4800.dist-info → lsst_pipe_base-30.0.0.dist-info}/zip-safe +0 -0
|
@@ -27,68 +27,16 @@
|
|
|
27
27
|
|
|
28
28
|
from __future__ import annotations
|
|
29
29
|
|
|
30
|
-
__all__ = (
|
|
31
|
-
"InProgressScan",
|
|
32
|
-
"IngestRequest",
|
|
33
|
-
"ScanReport",
|
|
34
|
-
"ScanStatus",
|
|
35
|
-
"WriteRequest",
|
|
36
|
-
)
|
|
30
|
+
__all__ = ("IngestRequest", "ScanReport")
|
|
37
31
|
|
|
38
32
|
import dataclasses
|
|
39
|
-
import enum
|
|
40
33
|
import uuid
|
|
41
34
|
|
|
42
35
|
from lsst.daf.butler.datastore.record_data import DatastoreRecordData
|
|
43
36
|
|
|
44
37
|
from .._common import DatastoreName
|
|
45
38
|
from .._predicted import PredictedDatasetModel
|
|
46
|
-
from .._provenance import (
|
|
47
|
-
ProvenanceLogRecordsModel,
|
|
48
|
-
ProvenanceQuantumAttemptModel,
|
|
49
|
-
ProvenanceTaskMetadataModel,
|
|
50
|
-
)
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
class ScanStatus(enum.Enum):
|
|
54
|
-
"""Status enum for quantum scanning.
|
|
55
|
-
|
|
56
|
-
Note that this records the status for the *scanning* which is distinct
|
|
57
|
-
from the status of the quantum's execution.
|
|
58
|
-
"""
|
|
59
|
-
|
|
60
|
-
INCOMPLETE = enum.auto()
|
|
61
|
-
"""The quantum is not necessarily done running, and cannot be scanned
|
|
62
|
-
conclusively yet.
|
|
63
|
-
"""
|
|
64
|
-
|
|
65
|
-
ABANDONED = enum.auto()
|
|
66
|
-
"""The quantum's execution appears to have failed but we cannot rule out
|
|
67
|
-
the possibility that it could be recovered, but we've also waited long
|
|
68
|
-
enough (according to `ScannerTimeConfigDict.retry_timeout`) that it's time
|
|
69
|
-
to stop trying for now.
|
|
70
|
-
|
|
71
|
-
This state means a later run with `ScannerConfig.assume_complete` is
|
|
72
|
-
required.
|
|
73
|
-
"""
|
|
74
|
-
|
|
75
|
-
SUCCESSFUL = enum.auto()
|
|
76
|
-
"""The quantum was conclusively scanned and was executed successfully,
|
|
77
|
-
unblocking scans for downstream quanta.
|
|
78
|
-
"""
|
|
79
|
-
|
|
80
|
-
FAILED = enum.auto()
|
|
81
|
-
"""The quantum was conclusively scanned and failed execution, blocking
|
|
82
|
-
scans for downstream quanta.
|
|
83
|
-
"""
|
|
84
|
-
|
|
85
|
-
BLOCKED = enum.auto()
|
|
86
|
-
"""A quantum upstream of this one failed."""
|
|
87
|
-
|
|
88
|
-
INIT = enum.auto()
|
|
89
|
-
"""Init quanta need special handling, because they don't have logs and
|
|
90
|
-
metadata.
|
|
91
|
-
"""
|
|
39
|
+
from .._provenance import ProvenanceQuantumScanStatus
|
|
92
40
|
|
|
93
41
|
|
|
94
42
|
@dataclasses.dataclass
|
|
@@ -98,7 +46,7 @@ class ScanReport:
|
|
|
98
46
|
quantum_id: uuid.UUID
|
|
99
47
|
"""Unique ID of the quantum."""
|
|
100
48
|
|
|
101
|
-
status: ScanStatus
|
|
49
|
+
status: ProvenanceQuantumScanStatus
|
|
102
50
|
"""Combined status of the scan and the execution of the quantum."""
|
|
103
51
|
|
|
104
52
|
|
|
@@ -117,61 +65,3 @@ class IngestRequest:
|
|
|
117
65
|
|
|
118
66
|
def __bool__(self) -> bool:
|
|
119
67
|
return bool(self.datasets or self.records)
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
@dataclasses.dataclass
|
|
123
|
-
class InProgressScan:
|
|
124
|
-
"""A struct that represents a quantum that is being scanned."""
|
|
125
|
-
|
|
126
|
-
quantum_id: uuid.UUID
|
|
127
|
-
"""Unique ID for the quantum."""
|
|
128
|
-
|
|
129
|
-
status: ScanStatus
|
|
130
|
-
"""Combined status for the scan and the execution of the quantum."""
|
|
131
|
-
|
|
132
|
-
attempts: list[ProvenanceQuantumAttemptModel] = dataclasses.field(default_factory=list)
|
|
133
|
-
"""Provenance information about each attempt to run the quantum."""
|
|
134
|
-
|
|
135
|
-
outputs: dict[uuid.UUID, bool] = dataclasses.field(default_factory=dict)
|
|
136
|
-
"""Unique IDs of the output datasets mapped to whether they were actually
|
|
137
|
-
produced.
|
|
138
|
-
"""
|
|
139
|
-
|
|
140
|
-
metadata: ProvenanceTaskMetadataModel = dataclasses.field(default_factory=ProvenanceTaskMetadataModel)
|
|
141
|
-
"""Task metadata information for each attempt.
|
|
142
|
-
"""
|
|
143
|
-
|
|
144
|
-
logs: ProvenanceLogRecordsModel = dataclasses.field(default_factory=ProvenanceLogRecordsModel)
|
|
145
|
-
"""Log records for each attempt.
|
|
146
|
-
"""
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
@dataclasses.dataclass
|
|
150
|
-
class WriteRequest:
|
|
151
|
-
"""A struct that represents a request to write provenance for a quantum."""
|
|
152
|
-
|
|
153
|
-
quantum_id: uuid.UUID
|
|
154
|
-
"""Unique ID for the quantum."""
|
|
155
|
-
|
|
156
|
-
status: ScanStatus
|
|
157
|
-
"""Combined status for the scan and the execution of the quantum."""
|
|
158
|
-
|
|
159
|
-
existing_outputs: set[uuid.UUID] = dataclasses.field(default_factory=set)
|
|
160
|
-
"""Unique IDs of the output datasets that were actually written."""
|
|
161
|
-
|
|
162
|
-
quantum: bytes = b""
|
|
163
|
-
"""Serialized quantum provenance model.
|
|
164
|
-
|
|
165
|
-
This may be empty for quanta that had no attempts.
|
|
166
|
-
"""
|
|
167
|
-
|
|
168
|
-
metadata: bytes = b""
|
|
169
|
-
"""Serialized task metadata."""
|
|
170
|
-
|
|
171
|
-
logs: bytes = b""
|
|
172
|
-
"""Serialized logs."""
|
|
173
|
-
|
|
174
|
-
is_compressed: bool = False
|
|
175
|
-
"""Whether the `quantum`, `metadata`, and `log` attributes are
|
|
176
|
-
compressed.
|
|
177
|
-
"""
|
|
@@ -42,6 +42,7 @@ from lsst.utils.usage import get_peak_mem_usage
|
|
|
42
42
|
from ...graph_walker import GraphWalker
|
|
43
43
|
from ...pipeline_graph import TaskImportMode
|
|
44
44
|
from .._predicted import PredictedQuantumGraphComponents, PredictedQuantumGraphReader
|
|
45
|
+
from .._provenance import ProvenanceQuantumScanData, ProvenanceQuantumScanStatus
|
|
45
46
|
from ._communicators import (
|
|
46
47
|
IngesterCommunicator,
|
|
47
48
|
ScannerCommunicator,
|
|
@@ -54,7 +55,7 @@ from ._communicators import (
|
|
|
54
55
|
from ._config import AggregatorConfig
|
|
55
56
|
from ._ingester import Ingester
|
|
56
57
|
from ._scanner import Scanner
|
|
57
|
-
from ._structs import ScanReport
|
|
58
|
+
from ._structs import ScanReport
|
|
58
59
|
from ._writer import Writer
|
|
59
60
|
|
|
60
61
|
|
|
@@ -126,18 +127,22 @@ class Supervisor:
|
|
|
126
127
|
Information about the scan.
|
|
127
128
|
"""
|
|
128
129
|
match scan_report.status:
|
|
129
|
-
case ScanStatus.SUCCESSFUL | ScanStatus.INIT:
|
|
130
|
+
case ProvenanceQuantumScanStatus.SUCCESSFUL | ProvenanceQuantumScanStatus.INIT:
|
|
130
131
|
self.comms.log.debug("Scan complete for %s: quantum succeeded.", scan_report.quantum_id)
|
|
131
132
|
self.walker.finish(scan_report.quantum_id)
|
|
132
|
-
case ScanStatus.FAILED:
|
|
133
|
+
case ProvenanceQuantumScanStatus.FAILED:
|
|
133
134
|
self.comms.log.debug("Scan complete for %s: quantum failed.", scan_report.quantum_id)
|
|
134
135
|
blocked_quanta = self.walker.fail(scan_report.quantum_id)
|
|
135
136
|
for blocked_quantum_id in blocked_quanta:
|
|
136
137
|
if self.comms.config.output_path is not None:
|
|
137
|
-
self.comms.request_write(
|
|
138
|
+
self.comms.request_write(
|
|
139
|
+
ProvenanceQuantumScanData(
|
|
140
|
+
blocked_quantum_id, status=ProvenanceQuantumScanStatus.BLOCKED
|
|
141
|
+
)
|
|
142
|
+
)
|
|
138
143
|
self.comms.progress.scans.update(1)
|
|
139
144
|
self.comms.progress.quantum_ingests.update(len(blocked_quanta))
|
|
140
|
-
case ScanStatus.ABANDONED:
|
|
145
|
+
case ProvenanceQuantumScanStatus.ABANDONED:
|
|
141
146
|
self.comms.log.debug("Abandoning scan for %s: quantum has not succeeded (yet).")
|
|
142
147
|
self.walker.fail(scan_report.quantum_id)
|
|
143
148
|
self.n_abandoned += 1
|
|
@@ -30,130 +30,14 @@ from __future__ import annotations
|
|
|
30
30
|
__all__ = ("Writer",)
|
|
31
31
|
|
|
32
32
|
import dataclasses
|
|
33
|
-
import itertools
|
|
34
|
-
import logging
|
|
35
|
-
import operator
|
|
36
|
-
import uuid
|
|
37
|
-
from typing import TypeVar
|
|
38
33
|
|
|
39
|
-
import networkx
|
|
40
34
|
import zstandard
|
|
41
35
|
|
|
42
|
-
from
|
|
43
|
-
|
|
44
|
-
from ... import automatic_connection_constants as acc
|
|
36
|
+
from ...log_on_close import LogOnClose
|
|
45
37
|
from ...pipeline_graph import TaskImportMode
|
|
46
|
-
from ..
|
|
47
|
-
from ..
|
|
48
|
-
from .._predicted import PredictedDatasetModel, PredictedQuantumGraphComponents, PredictedQuantumGraphReader
|
|
49
|
-
from .._provenance import (
|
|
50
|
-
DATASET_ADDRESS_INDEX,
|
|
51
|
-
DATASET_MB_NAME,
|
|
52
|
-
LOG_ADDRESS_INDEX,
|
|
53
|
-
LOG_MB_NAME,
|
|
54
|
-
METADATA_ADDRESS_INDEX,
|
|
55
|
-
METADATA_MB_NAME,
|
|
56
|
-
QUANTUM_ADDRESS_INDEX,
|
|
57
|
-
QUANTUM_MB_NAME,
|
|
58
|
-
ProvenanceDatasetModel,
|
|
59
|
-
ProvenanceInitQuantaModel,
|
|
60
|
-
ProvenanceInitQuantumModel,
|
|
61
|
-
ProvenanceQuantumModel,
|
|
62
|
-
)
|
|
38
|
+
from .._predicted import PredictedQuantumGraphComponents, PredictedQuantumGraphReader
|
|
39
|
+
from .._provenance import ProvenanceQuantumGraphWriter, ProvenanceQuantumScanData
|
|
63
40
|
from ._communicators import WriterCommunicator
|
|
64
|
-
from ._structs import WriteRequest
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
@dataclasses.dataclass
|
|
68
|
-
class _DataWriters:
|
|
69
|
-
"""A struct of low-level writer objects for the main components of a
|
|
70
|
-
provenance quantum graph.
|
|
71
|
-
|
|
72
|
-
Parameters
|
|
73
|
-
----------
|
|
74
|
-
comms : `WriterCommunicator`
|
|
75
|
-
Communicator helper object for the writer.
|
|
76
|
-
predicted : `.PredictedQuantumGraphComponents`
|
|
77
|
-
Components of the predicted graph.
|
|
78
|
-
indices : `dict` [ `uuid.UUID`, `int` ]
|
|
79
|
-
Mapping from UUID to internal integer ID, including both quanta and
|
|
80
|
-
datasets.
|
|
81
|
-
compressor : `Compressor`
|
|
82
|
-
Object that can compress `bytes`.
|
|
83
|
-
cdict_data : `bytes` or `None`, optional
|
|
84
|
-
Bytes representation of the compression dictionary used by the
|
|
85
|
-
compressor.
|
|
86
|
-
"""
|
|
87
|
-
|
|
88
|
-
def __init__(
|
|
89
|
-
self,
|
|
90
|
-
comms: WriterCommunicator,
|
|
91
|
-
predicted: PredictedQuantumGraphComponents,
|
|
92
|
-
indices: dict[uuid.UUID, int],
|
|
93
|
-
compressor: Compressor,
|
|
94
|
-
cdict_data: bytes | None = None,
|
|
95
|
-
) -> None:
|
|
96
|
-
assert comms.config.output_path is not None
|
|
97
|
-
header = predicted.header.model_copy()
|
|
98
|
-
header.graph_type = "provenance"
|
|
99
|
-
self.graph = comms.enter(
|
|
100
|
-
BaseQuantumGraphWriter.open(
|
|
101
|
-
comms.config.output_path,
|
|
102
|
-
header,
|
|
103
|
-
predicted.pipeline_graph,
|
|
104
|
-
indices,
|
|
105
|
-
address_filename="nodes",
|
|
106
|
-
compressor=compressor,
|
|
107
|
-
cdict_data=cdict_data,
|
|
108
|
-
),
|
|
109
|
-
on_close="Finishing writing provenance quantum graph.",
|
|
110
|
-
is_progress_log=True,
|
|
111
|
-
)
|
|
112
|
-
self.graph.address_writer.addresses = [{}, {}, {}, {}]
|
|
113
|
-
self.logs = comms.enter(
|
|
114
|
-
MultiblockWriter.open_in_zip(self.graph.zf, LOG_MB_NAME, header.int_size, use_tempfile=True),
|
|
115
|
-
on_close="Copying logs into zip archive.",
|
|
116
|
-
is_progress_log=True,
|
|
117
|
-
)
|
|
118
|
-
self.graph.address_writer.addresses[LOG_ADDRESS_INDEX] = self.logs.addresses
|
|
119
|
-
self.metadata = comms.enter(
|
|
120
|
-
MultiblockWriter.open_in_zip(self.graph.zf, METADATA_MB_NAME, header.int_size, use_tempfile=True),
|
|
121
|
-
on_close="Copying metadata into zip archive.",
|
|
122
|
-
is_progress_log=True,
|
|
123
|
-
)
|
|
124
|
-
self.graph.address_writer.addresses[METADATA_ADDRESS_INDEX] = self.metadata.addresses
|
|
125
|
-
self.datasets = comms.enter(
|
|
126
|
-
MultiblockWriter.open_in_zip(self.graph.zf, DATASET_MB_NAME, header.int_size, use_tempfile=True),
|
|
127
|
-
on_close="Copying dataset provenance into zip archive.",
|
|
128
|
-
is_progress_log=True,
|
|
129
|
-
)
|
|
130
|
-
self.graph.address_writer.addresses[DATASET_ADDRESS_INDEX] = self.datasets.addresses
|
|
131
|
-
self.quanta = comms.enter(
|
|
132
|
-
MultiblockWriter.open_in_zip(self.graph.zf, QUANTUM_MB_NAME, header.int_size, use_tempfile=True),
|
|
133
|
-
on_close="Copying quantum provenance into zip archive.",
|
|
134
|
-
is_progress_log=True,
|
|
135
|
-
)
|
|
136
|
-
self.graph.address_writer.addresses[QUANTUM_ADDRESS_INDEX] = self.quanta.addresses
|
|
137
|
-
|
|
138
|
-
graph: BaseQuantumGraphWriter
|
|
139
|
-
"""The parent graph writer."""
|
|
140
|
-
|
|
141
|
-
datasets: MultiblockWriter
|
|
142
|
-
"""A writer for dataset provenance."""
|
|
143
|
-
|
|
144
|
-
quanta: MultiblockWriter
|
|
145
|
-
"""A writer for quantum provenance."""
|
|
146
|
-
|
|
147
|
-
metadata: MultiblockWriter
|
|
148
|
-
"""A writer for metadata content."""
|
|
149
|
-
|
|
150
|
-
logs: MultiblockWriter
|
|
151
|
-
"""A writer for log content."""
|
|
152
|
-
|
|
153
|
-
@property
|
|
154
|
-
def compressor(self) -> Compressor:
|
|
155
|
-
"""Object that should be used to compress all JSON blocks."""
|
|
156
|
-
return self.graph.compressor
|
|
157
41
|
|
|
158
42
|
|
|
159
43
|
@dataclasses.dataclass
|
|
@@ -171,40 +55,7 @@ class Writer:
|
|
|
171
55
|
predicted: PredictedQuantumGraphComponents = dataclasses.field(init=False)
|
|
172
56
|
"""Components of the predicted quantum graph."""
|
|
173
57
|
|
|
174
|
-
|
|
175
|
-
"""Mapping that tracks which init-outputs exist.
|
|
176
|
-
|
|
177
|
-
This mapping is updated as scanners inform the writer about init-output
|
|
178
|
-
existence, since we want to write that provenance information out only at
|
|
179
|
-
the end.
|
|
180
|
-
"""
|
|
181
|
-
|
|
182
|
-
indices: dict[uuid.UUID, int] = dataclasses.field(default_factory=dict)
|
|
183
|
-
"""Mapping from UUID to internal integer ID, including both quanta and
|
|
184
|
-
datasets.
|
|
185
|
-
|
|
186
|
-
This is fully initialized at construction.
|
|
187
|
-
"""
|
|
188
|
-
|
|
189
|
-
output_dataset_ids: set[uuid.UUID] = dataclasses.field(default_factory=set)
|
|
190
|
-
"""The IDs of all datasets that are produced by this graph.
|
|
191
|
-
|
|
192
|
-
This is fully initialized at construction.
|
|
193
|
-
"""
|
|
194
|
-
|
|
195
|
-
overall_inputs: dict[uuid.UUID, PredictedDatasetModel] = dataclasses.field(default_factory=dict)
|
|
196
|
-
"""All datasets that are not produced by any quantum in this graph."""
|
|
197
|
-
|
|
198
|
-
xgraph: networkx.DiGraph = dataclasses.field(default_factory=networkx.DiGraph)
|
|
199
|
-
"""A bipartite NetworkX graph linking datasets to quanta and quanta to
|
|
200
|
-
datasets.
|
|
201
|
-
|
|
202
|
-
This is fully initialized at construction. There are no node or edge
|
|
203
|
-
attributes in this graph; we only need it to store adjacency information
|
|
204
|
-
with datasets as well as with quanta.
|
|
205
|
-
"""
|
|
206
|
-
|
|
207
|
-
pending_compression_training: list[WriteRequest] = dataclasses.field(default_factory=list)
|
|
58
|
+
pending_compression_training: list[ProvenanceQuantumScanData] = dataclasses.field(default_factory=list)
|
|
208
59
|
"""Unprocessed quantum scans that are being accumulated in order to
|
|
209
60
|
build a compression dictionary.
|
|
210
61
|
"""
|
|
@@ -220,58 +71,6 @@ class Writer:
|
|
|
220
71
|
self.comms.check_for_cancel()
|
|
221
72
|
reader.read_quantum_datasets()
|
|
222
73
|
self.predicted = reader.components
|
|
223
|
-
for predicted_init_quantum in self.predicted.init_quanta.root:
|
|
224
|
-
self.existing_init_outputs[predicted_init_quantum.quantum_id] = set()
|
|
225
|
-
self.comms.check_for_cancel()
|
|
226
|
-
self.comms.log.info("Generating integer indexes and identifying outputs.")
|
|
227
|
-
self._populate_indices_and_outputs()
|
|
228
|
-
self.comms.check_for_cancel()
|
|
229
|
-
self._populate_xgraph_and_inputs()
|
|
230
|
-
self.comms.check_for_cancel()
|
|
231
|
-
self.comms.log_progress(
|
|
232
|
-
# We add one here for 'packages', which we do ingest but don't
|
|
233
|
-
# record provenance for.
|
|
234
|
-
logging.INFO,
|
|
235
|
-
f"Graph has {len(self.output_dataset_ids) + 1} predicted output dataset(s).",
|
|
236
|
-
)
|
|
237
|
-
|
|
238
|
-
def _populate_indices_and_outputs(self) -> None:
|
|
239
|
-
all_uuids = set(self.predicted.quantum_datasets.keys())
|
|
240
|
-
for quantum in self.comms.periodically_check_for_cancel(
|
|
241
|
-
itertools.chain(
|
|
242
|
-
self.predicted.init_quanta.root,
|
|
243
|
-
self.predicted.quantum_datasets.values(),
|
|
244
|
-
)
|
|
245
|
-
):
|
|
246
|
-
if not quantum.task_label:
|
|
247
|
-
# Skip the 'packages' producer quantum.
|
|
248
|
-
continue
|
|
249
|
-
all_uuids.update(quantum.iter_input_dataset_ids())
|
|
250
|
-
self.output_dataset_ids.update(quantum.iter_output_dataset_ids())
|
|
251
|
-
all_uuids.update(self.output_dataset_ids)
|
|
252
|
-
self.indices = {
|
|
253
|
-
node_id: node_index
|
|
254
|
-
for node_index, node_id in self.comms.periodically_check_for_cancel(
|
|
255
|
-
enumerate(sorted(all_uuids, key=operator.attrgetter("int")))
|
|
256
|
-
)
|
|
257
|
-
}
|
|
258
|
-
|
|
259
|
-
def _populate_xgraph_and_inputs(self) -> None:
|
|
260
|
-
for predicted_quantum in self.comms.periodically_check_for_cancel(
|
|
261
|
-
itertools.chain(
|
|
262
|
-
self.predicted.init_quanta.root,
|
|
263
|
-
self.predicted.quantum_datasets.values(),
|
|
264
|
-
)
|
|
265
|
-
):
|
|
266
|
-
if not predicted_quantum.task_label:
|
|
267
|
-
# Skip the 'packages' producer quantum.
|
|
268
|
-
continue
|
|
269
|
-
for predicted_input in itertools.chain.from_iterable(predicted_quantum.inputs.values()):
|
|
270
|
-
self.xgraph.add_edge(predicted_input.dataset_id, predicted_quantum.quantum_id)
|
|
271
|
-
if predicted_input.dataset_id not in self.output_dataset_ids:
|
|
272
|
-
self.overall_inputs.setdefault(predicted_input.dataset_id, predicted_input)
|
|
273
|
-
for predicted_output in itertools.chain.from_iterable(predicted_quantum.outputs.values()):
|
|
274
|
-
self.xgraph.add_edge(predicted_quantum.quantum_id, predicted_output.dataset_id)
|
|
275
74
|
|
|
276
75
|
@staticmethod
|
|
277
76
|
def run(predicted_path: str, comms: WriterCommunicator) -> None:
|
|
@@ -295,52 +94,59 @@ class Writer:
|
|
|
295
94
|
|
|
296
95
|
def loop(self) -> None:
|
|
297
96
|
"""Run the main loop for the writer."""
|
|
298
|
-
|
|
97
|
+
qg_writer: ProvenanceQuantumGraphWriter | None = None
|
|
299
98
|
if not self.comms.config.zstd_dict_size:
|
|
300
|
-
|
|
99
|
+
qg_writer = self.make_qg_writer()
|
|
301
100
|
self.comms.log.info("Polling for write requests from scanners.")
|
|
302
101
|
for request in self.comms.poll():
|
|
303
|
-
if
|
|
102
|
+
if qg_writer is None:
|
|
304
103
|
self.pending_compression_training.append(request)
|
|
305
104
|
if len(self.pending_compression_training) >= self.comms.config.zstd_dict_n_inputs:
|
|
306
|
-
|
|
105
|
+
qg_writer = self.make_qg_writer()
|
|
307
106
|
else:
|
|
308
|
-
|
|
309
|
-
|
|
310
|
-
|
|
311
|
-
|
|
107
|
+
qg_writer.write_scan_data(request)
|
|
108
|
+
self.comms.report_write()
|
|
109
|
+
if qg_writer is None:
|
|
110
|
+
qg_writer = self.make_qg_writer()
|
|
111
|
+
self.comms.log.info("Writing init outputs.")
|
|
112
|
+
qg_writer.write_init_outputs(assume_existence=False)
|
|
312
113
|
|
|
313
|
-
def
|
|
114
|
+
def make_qg_writer(self) -> ProvenanceQuantumGraphWriter:
|
|
314
115
|
"""Make a compression dictionary, open the low-level writers, and
|
|
315
116
|
write any accumulated scans that were needed to make the compression
|
|
316
117
|
dictionary.
|
|
317
118
|
|
|
318
119
|
Returns
|
|
319
120
|
-------
|
|
320
|
-
|
|
121
|
+
qg_writer : `ProvenanceQuantumGraphWriter`
|
|
321
122
|
Low-level writers struct.
|
|
322
123
|
"""
|
|
323
124
|
cdict = self.make_compression_dictionary()
|
|
324
125
|
self.comms.send_compression_dict(cdict.as_bytes())
|
|
325
126
|
assert self.comms.config.output_path is not None
|
|
326
|
-
self.comms.log.info("Opening output files.")
|
|
327
|
-
|
|
328
|
-
self.comms,
|
|
329
|
-
self.
|
|
330
|
-
self.
|
|
331
|
-
|
|
127
|
+
self.comms.log.info("Opening output files and processing predicted graph.")
|
|
128
|
+
qg_writer = ProvenanceQuantumGraphWriter(
|
|
129
|
+
self.comms.config.output_path,
|
|
130
|
+
exit_stack=self.comms.exit_stack,
|
|
131
|
+
log_on_close=LogOnClose(self.comms.log_progress),
|
|
132
|
+
predicted=self.predicted,
|
|
133
|
+
zstd_level=self.comms.config.zstd_level,
|
|
332
134
|
cdict_data=cdict.as_bytes(),
|
|
135
|
+
loop_wrapper=self.comms.periodically_check_for_cancel,
|
|
136
|
+
log=self.comms.log,
|
|
333
137
|
)
|
|
334
138
|
self.comms.check_for_cancel()
|
|
335
139
|
self.comms.log.info("Compressing and writing queued scan requests.")
|
|
336
140
|
for request in self.pending_compression_training:
|
|
337
|
-
|
|
141
|
+
qg_writer.write_scan_data(request)
|
|
142
|
+
self.comms.report_write()
|
|
338
143
|
del self.pending_compression_training
|
|
339
144
|
self.comms.check_for_cancel()
|
|
340
|
-
self.
|
|
341
|
-
self.
|
|
145
|
+
self.comms.log.info("Writing overall inputs.")
|
|
146
|
+
qg_writer.write_overall_inputs(self.comms.periodically_check_for_cancel)
|
|
147
|
+
qg_writer.write_packages()
|
|
342
148
|
self.comms.log.info("Returning to write request loop.")
|
|
343
|
-
return
|
|
149
|
+
return qg_writer
|
|
344
150
|
|
|
345
151
|
def make_compression_dictionary(self) -> zstandard.ZstdCompressionDict:
|
|
346
152
|
"""Make the compression dictionary.
|
|
@@ -376,126 +182,3 @@ class Writer:
|
|
|
376
182
|
training_inputs.append(write_request.metadata)
|
|
377
183
|
training_inputs.append(write_request.logs)
|
|
378
184
|
return zstandard.train_dictionary(self.comms.config.zstd_dict_size, training_inputs)
|
|
379
|
-
|
|
380
|
-
def write_init_outputs(self, data_writers: _DataWriters) -> None:
|
|
381
|
-
"""Write provenance for init-output datasets and init-quanta.
|
|
382
|
-
|
|
383
|
-
Parameters
|
|
384
|
-
----------
|
|
385
|
-
data_writers : `_DataWriters`
|
|
386
|
-
Low-level writers struct.
|
|
387
|
-
"""
|
|
388
|
-
self.comms.log.info("Writing init outputs.")
|
|
389
|
-
init_quanta = ProvenanceInitQuantaModel()
|
|
390
|
-
for predicted_init_quantum in self.predicted.init_quanta.root:
|
|
391
|
-
if not predicted_init_quantum.task_label:
|
|
392
|
-
# Skip the 'packages' producer quantum.
|
|
393
|
-
continue
|
|
394
|
-
existing_outputs = self.existing_init_outputs[predicted_init_quantum.quantum_id]
|
|
395
|
-
for predicted_output in itertools.chain.from_iterable(predicted_init_quantum.outputs.values()):
|
|
396
|
-
provenance_output = ProvenanceDatasetModel.from_predicted(
|
|
397
|
-
predicted_output,
|
|
398
|
-
producer=predicted_init_quantum.quantum_id,
|
|
399
|
-
consumers=self.xgraph.successors(predicted_output.dataset_id),
|
|
400
|
-
)
|
|
401
|
-
provenance_output.produced = predicted_output.dataset_id in existing_outputs
|
|
402
|
-
data_writers.datasets.write_model(
|
|
403
|
-
provenance_output.dataset_id, provenance_output, data_writers.compressor
|
|
404
|
-
)
|
|
405
|
-
init_quanta.root.append(ProvenanceInitQuantumModel.from_predicted(predicted_init_quantum))
|
|
406
|
-
data_writers.graph.write_single_model("init_quanta", init_quanta)
|
|
407
|
-
|
|
408
|
-
def write_overall_inputs(self, data_writers: _DataWriters) -> None:
|
|
409
|
-
"""Write provenance for overall-input datasets.
|
|
410
|
-
|
|
411
|
-
Parameters
|
|
412
|
-
----------
|
|
413
|
-
data_writers : `_DataWriters`
|
|
414
|
-
Low-level writers struct.
|
|
415
|
-
"""
|
|
416
|
-
self.comms.log.info("Writing overall inputs.")
|
|
417
|
-
for predicted_input in self.comms.periodically_check_for_cancel(self.overall_inputs.values()):
|
|
418
|
-
if predicted_input.dataset_id not in data_writers.datasets.addresses:
|
|
419
|
-
data_writers.datasets.write_model(
|
|
420
|
-
predicted_input.dataset_id,
|
|
421
|
-
ProvenanceDatasetModel.from_predicted(
|
|
422
|
-
predicted_input,
|
|
423
|
-
producer=None,
|
|
424
|
-
consumers=self.xgraph.successors(predicted_input.dataset_id),
|
|
425
|
-
),
|
|
426
|
-
data_writers.compressor,
|
|
427
|
-
)
|
|
428
|
-
del self.overall_inputs
|
|
429
|
-
|
|
430
|
-
@staticmethod
|
|
431
|
-
def write_packages(data_writers: _DataWriters) -> None:
|
|
432
|
-
"""Write package version information to the provenance graph.
|
|
433
|
-
|
|
434
|
-
Parameters
|
|
435
|
-
----------
|
|
436
|
-
data_writers : `_DataWriters`
|
|
437
|
-
Low-level writers struct.
|
|
438
|
-
"""
|
|
439
|
-
packages = Packages.fromSystem(include_all=True)
|
|
440
|
-
data = packages.toBytes("json")
|
|
441
|
-
data_writers.graph.write_single_block("packages", data)
|
|
442
|
-
|
|
443
|
-
def process_request(self, request: WriteRequest, data_writers: _DataWriters) -> None:
|
|
444
|
-
"""Process a `WriteRequest` into `_ScanData`.
|
|
445
|
-
|
|
446
|
-
Parameters
|
|
447
|
-
----------
|
|
448
|
-
request : `WriteRequest`
|
|
449
|
-
Result of a quantum scan.
|
|
450
|
-
data_writers : `_DataWriters`
|
|
451
|
-
Low-level writers struct.
|
|
452
|
-
"""
|
|
453
|
-
if (existing_init_outputs := self.existing_init_outputs.get(request.quantum_id)) is not None:
|
|
454
|
-
self.comms.log.debug("Handling init-output scan for %s.", request.quantum_id)
|
|
455
|
-
existing_init_outputs.update(request.existing_outputs)
|
|
456
|
-
self.comms.report_write()
|
|
457
|
-
return
|
|
458
|
-
self.comms.log.debug("Handling quantum scan for %s.", request.quantum_id)
|
|
459
|
-
predicted_quantum = self.predicted.quantum_datasets[request.quantum_id]
|
|
460
|
-
outputs: dict[uuid.UUID, bytes] = {}
|
|
461
|
-
for predicted_output in itertools.chain.from_iterable(predicted_quantum.outputs.values()):
|
|
462
|
-
provenance_output = ProvenanceDatasetModel.from_predicted(
|
|
463
|
-
predicted_output,
|
|
464
|
-
producer=predicted_quantum.quantum_id,
|
|
465
|
-
consumers=self.xgraph.successors(predicted_output.dataset_id),
|
|
466
|
-
)
|
|
467
|
-
provenance_output.produced = provenance_output.dataset_id in request.existing_outputs
|
|
468
|
-
outputs[provenance_output.dataset_id] = data_writers.compressor.compress(
|
|
469
|
-
provenance_output.model_dump_json().encode()
|
|
470
|
-
)
|
|
471
|
-
if not request.quantum:
|
|
472
|
-
request.quantum = (
|
|
473
|
-
ProvenanceQuantumModel.from_predicted(predicted_quantum).model_dump_json().encode()
|
|
474
|
-
)
|
|
475
|
-
if request.is_compressed:
|
|
476
|
-
request.quantum = data_writers.compressor.compress(request.quantum)
|
|
477
|
-
if not request.is_compressed:
|
|
478
|
-
request.quantum = data_writers.compressor.compress(request.quantum)
|
|
479
|
-
if request.metadata:
|
|
480
|
-
request.metadata = data_writers.compressor.compress(request.metadata)
|
|
481
|
-
if request.logs:
|
|
482
|
-
request.logs = data_writers.compressor.compress(request.logs)
|
|
483
|
-
self.comms.log.debug("Writing quantum %s.", request.quantum_id)
|
|
484
|
-
data_writers.quanta.write_bytes(request.quantum_id, request.quantum)
|
|
485
|
-
for dataset_id, dataset_data in outputs.items():
|
|
486
|
-
data_writers.datasets.write_bytes(dataset_id, dataset_data)
|
|
487
|
-
if request.metadata:
|
|
488
|
-
(metadata_output,) = predicted_quantum.outputs[acc.METADATA_OUTPUT_CONNECTION_NAME]
|
|
489
|
-
address = data_writers.metadata.write_bytes(request.quantum_id, request.metadata)
|
|
490
|
-
data_writers.metadata.addresses[metadata_output.dataset_id] = address
|
|
491
|
-
if request.logs:
|
|
492
|
-
(log_output,) = predicted_quantum.outputs[acc.LOG_OUTPUT_CONNECTION_NAME]
|
|
493
|
-
address = data_writers.logs.write_bytes(request.quantum_id, request.logs)
|
|
494
|
-
data_writers.logs.addresses[log_output.dataset_id] = address
|
|
495
|
-
# We shouldn't need this predicted quantum anymore; delete it in the
|
|
496
|
-
# hopes that'll free up some memory.
|
|
497
|
-
del self.predicted.quantum_datasets[request.quantum_id]
|
|
498
|
-
self.comms.report_write()
|
|
499
|
-
|
|
500
|
-
|
|
501
|
-
_T = TypeVar("_T")
|