lsst-pipe-base 30.0.0__py3-none-any.whl → 30.0.0rc1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lsst/pipe/base/_instrument.py +5 -6
- lsst/pipe/base/log_capture.py +79 -39
- lsst/pipe/base/mp_graph_executor.py +15 -51
- lsst/pipe/base/quantum_graph/_common.py +3 -4
- lsst/pipe/base/quantum_graph/_multiblock.py +16 -6
- lsst/pipe/base/quantum_graph/_predicted.py +12 -106
- lsst/pipe/base/quantum_graph/_provenance.py +6 -657
- lsst/pipe/base/quantum_graph/aggregator/_communicators.py +50 -18
- lsst/pipe/base/quantum_graph/aggregator/_scanner.py +229 -35
- lsst/pipe/base/quantum_graph/aggregator/_structs.py +113 -3
- lsst/pipe/base/quantum_graph/aggregator/_supervisor.py +5 -10
- lsst/pipe/base/quantum_graph/aggregator/_writer.py +348 -31
- lsst/pipe/base/quantum_graph_builder.py +1 -12
- lsst/pipe/base/quantum_graph_executor.py +13 -116
- lsst/pipe/base/quantum_graph_skeleton.py +7 -1
- lsst/pipe/base/separable_pipeline_executor.py +2 -18
- lsst/pipe/base/single_quantum_executor.py +35 -53
- lsst/pipe/base/version.py +1 -1
- {lsst_pipe_base-30.0.0.dist-info → lsst_pipe_base-30.0.0rc1.dist-info}/METADATA +1 -1
- {lsst_pipe_base-30.0.0.dist-info → lsst_pipe_base-30.0.0rc1.dist-info}/RECORD +28 -30
- {lsst_pipe_base-30.0.0.dist-info → lsst_pipe_base-30.0.0rc1.dist-info}/WHEEL +1 -1
- lsst/pipe/base/log_on_close.py +0 -79
- lsst/pipe/base/quantum_graph/formatter.py +0 -101
- {lsst_pipe_base-30.0.0.dist-info → lsst_pipe_base-30.0.0rc1.dist-info}/entry_points.txt +0 -0
- {lsst_pipe_base-30.0.0.dist-info → lsst_pipe_base-30.0.0rc1.dist-info}/licenses/COPYRIGHT +0 -0
- {lsst_pipe_base-30.0.0.dist-info → lsst_pipe_base-30.0.0rc1.dist-info}/licenses/LICENSE +0 -0
- {lsst_pipe_base-30.0.0.dist-info → lsst_pipe_base-30.0.0rc1.dist-info}/licenses/bsd_license.txt +0 -0
- {lsst_pipe_base-30.0.0.dist-info → lsst_pipe_base-30.0.0rc1.dist-info}/licenses/gpl-v3.0.txt +0 -0
- {lsst_pipe_base-30.0.0.dist-info → lsst_pipe_base-30.0.0rc1.dist-info}/top_level.txt +0 -0
- {lsst_pipe_base-30.0.0.dist-info → lsst_pipe_base-30.0.0rc1.dist-info}/zip-safe +0 -0
|
@@ -51,17 +51,16 @@ import time
|
|
|
51
51
|
import uuid
|
|
52
52
|
from abc import ABC, abstractmethod
|
|
53
53
|
from collections.abc import Callable, Iterable, Iterator
|
|
54
|
-
from contextlib import ExitStack
|
|
54
|
+
from contextlib import AbstractContextManager, ExitStack, contextmanager
|
|
55
55
|
from traceback import format_exception
|
|
56
56
|
from types import TracebackType
|
|
57
57
|
from typing import Any, Literal, Self, TypeAlias, TypeVar, Union
|
|
58
58
|
|
|
59
|
-
from lsst.utils.logging import LsstLogAdapter
|
|
59
|
+
from lsst.utils.logging import VERBOSE, LsstLogAdapter
|
|
60
60
|
|
|
61
|
-
from .._provenance import ProvenanceQuantumScanData
|
|
62
61
|
from ._config import AggregatorConfig
|
|
63
62
|
from ._progress import ProgressManager, make_worker_log
|
|
64
|
-
from ._structs import IngestRequest, ScanReport
|
|
63
|
+
from ._structs import IngestRequest, ScanReport, WriteRequest
|
|
65
64
|
|
|
66
65
|
_T = TypeVar("_T")
|
|
67
66
|
|
|
@@ -362,9 +361,9 @@ class SupervisorCommunicator:
|
|
|
362
361
|
# scanner and the supervisor send one sentinel when done, and the
|
|
363
362
|
# writer waits for (n_scanners + 1) sentinels to arrive before it
|
|
364
363
|
# starts its shutdown.
|
|
365
|
-
self._write_requests: (
|
|
366
|
-
|
|
367
|
-
)
|
|
364
|
+
self._write_requests: Queue[WriteRequest | Literal[_Sentinel.NO_MORE_WRITE_REQUESTS]] | None = (
|
|
365
|
+
context.make_queue() if config.output_path is not None else None
|
|
366
|
+
)
|
|
368
367
|
# All other workers use this queue to send many different kinds of
|
|
369
368
|
# reports to the supervisor. The supervisor waits for a _DONE sentinel
|
|
370
369
|
# from each worker before it finishes its shutdown.
|
|
@@ -462,12 +461,12 @@ class SupervisorCommunicator:
|
|
|
462
461
|
"""
|
|
463
462
|
self._scan_requests.put(_ScanRequest(quantum_id), block=False)
|
|
464
463
|
|
|
465
|
-
def request_write(self, request:
|
|
464
|
+
def request_write(self, request: WriteRequest) -> None:
|
|
466
465
|
"""Send a request to the writer to write provenance for the given scan.
|
|
467
466
|
|
|
468
467
|
Parameters
|
|
469
468
|
----------
|
|
470
|
-
request : `
|
|
469
|
+
request : `WriteRequest`
|
|
471
470
|
Information from scanning a quantum (or knowing you don't have to,
|
|
472
471
|
in the case of blocked quanta).
|
|
473
472
|
"""
|
|
@@ -622,11 +621,6 @@ class WorkerCommunicator:
|
|
|
622
621
|
self._exit_stack.__exit__(exc_type, exc_value, traceback)
|
|
623
622
|
return True
|
|
624
623
|
|
|
625
|
-
@property
|
|
626
|
-
def exit_stack(self) -> ExitStack:
|
|
627
|
-
"""A `contextlib.ExitStack` tied to the communicator."""
|
|
628
|
-
return self._exit_stack
|
|
629
|
-
|
|
630
624
|
def log_progress(self, level: int, message: str) -> None:
|
|
631
625
|
"""Send a high-level log message to the supervisor.
|
|
632
626
|
|
|
@@ -639,6 +633,44 @@ class WorkerCommunicator:
|
|
|
639
633
|
"""
|
|
640
634
|
self._reports.put(_ProgressLog(message=message, level=level), block=False)
|
|
641
635
|
|
|
636
|
+
def enter(
|
|
637
|
+
self,
|
|
638
|
+
cm: AbstractContextManager[_T],
|
|
639
|
+
on_close: str | None = None,
|
|
640
|
+
level: int = VERBOSE,
|
|
641
|
+
is_progress_log: bool = False,
|
|
642
|
+
) -> _T:
|
|
643
|
+
"""Enter a context manager that will be exited when the communicator's
|
|
644
|
+
context is exited.
|
|
645
|
+
|
|
646
|
+
Parameters
|
|
647
|
+
----------
|
|
648
|
+
cm : `contextlib.AbstractContextManager`
|
|
649
|
+
A context manager to enter.
|
|
650
|
+
on_close : `str`, optional
|
|
651
|
+
A log message to emit (on the worker's logger) just before the
|
|
652
|
+
given context manager is exited. This can be used to indicate
|
|
653
|
+
what's going on when an ``__exit__`` implementation has a lot of
|
|
654
|
+
work to do (e.g. moving a large file into a zip archive).
|
|
655
|
+
level : `int`, optional
|
|
656
|
+
Level for the ``on_close`` log message.
|
|
657
|
+
is_progress_log : `bool`, optional
|
|
658
|
+
If `True`, send the ``on_close`` message to the supervisor via
|
|
659
|
+
`log_progress` as well as the worker's logger.
|
|
660
|
+
"""
|
|
661
|
+
if on_close is None:
|
|
662
|
+
return self._exit_stack.enter_context(cm)
|
|
663
|
+
|
|
664
|
+
@contextmanager
|
|
665
|
+
def wrapper() -> Iterator[_T]:
|
|
666
|
+
with cm as result:
|
|
667
|
+
yield result
|
|
668
|
+
self.log.log(level, on_close)
|
|
669
|
+
if is_progress_log:
|
|
670
|
+
self.log_progress(level, on_close)
|
|
671
|
+
|
|
672
|
+
return self._exit_stack.enter_context(wrapper())
|
|
673
|
+
|
|
642
674
|
def check_for_cancel(self) -> None:
|
|
643
675
|
"""Check for a cancel signal from the supervisor and raise
|
|
644
676
|
`FatalWorkerError` if it is present.
|
|
@@ -696,12 +728,12 @@ class ScannerCommunicator(WorkerCommunicator):
|
|
|
696
728
|
else:
|
|
697
729
|
self._reports.put(_IngestReport(1), block=False)
|
|
698
730
|
|
|
699
|
-
def request_write(self, request:
|
|
731
|
+
def request_write(self, request: WriteRequest) -> None:
|
|
700
732
|
"""Ask the writer to write provenance for a quantum.
|
|
701
733
|
|
|
702
734
|
Parameters
|
|
703
735
|
----------
|
|
704
|
-
request : `
|
|
736
|
+
request : `WriteRequest`
|
|
705
737
|
Result of scanning a quantum.
|
|
706
738
|
"""
|
|
707
739
|
assert self._write_requests is not None, "Writer should not be used if writing is disabled."
|
|
@@ -881,12 +913,12 @@ class WriterCommunicator(WorkerCommunicator):
|
|
|
881
913
|
self._reports.put(_Sentinel.WRITER_DONE, block=False)
|
|
882
914
|
return result
|
|
883
915
|
|
|
884
|
-
def poll(self) -> Iterator[
|
|
916
|
+
def poll(self) -> Iterator[WriteRequest]:
|
|
885
917
|
"""Poll for writer requests from the scanner workers and supervisor.
|
|
886
918
|
|
|
887
919
|
Yields
|
|
888
920
|
------
|
|
889
|
-
request : `
|
|
921
|
+
request : `WriteRequest`
|
|
890
922
|
The result of a quantum scan.
|
|
891
923
|
|
|
892
924
|
Notes
|
|
@@ -38,19 +38,23 @@ from typing import Any, Literal, Self
|
|
|
38
38
|
import zstandard
|
|
39
39
|
|
|
40
40
|
from lsst.daf.butler import ButlerLogRecords, DatasetRef, QuantumBackedButler
|
|
41
|
+
from lsst.utils.iteration import ensure_iterable
|
|
41
42
|
|
|
42
43
|
from ... import automatic_connection_constants as acc
|
|
44
|
+
from ..._status import ExceptionInfo, QuantumAttemptStatus, QuantumSuccessCaveats
|
|
43
45
|
from ..._task_metadata import TaskMetadata
|
|
46
|
+
from ...log_capture import _ExecutionLogRecordsExtra
|
|
44
47
|
from ...pipeline_graph import PipelineGraph, TaskImportMode
|
|
48
|
+
from ...resource_usage import QuantumResourceUsage
|
|
45
49
|
from .._multiblock import Compressor
|
|
46
50
|
from .._predicted import (
|
|
47
51
|
PredictedDatasetModel,
|
|
48
52
|
PredictedQuantumDatasetsModel,
|
|
49
53
|
PredictedQuantumGraphReader,
|
|
50
54
|
)
|
|
51
|
-
from .._provenance import
|
|
55
|
+
from .._provenance import ProvenanceInitQuantumModel, ProvenanceQuantumAttemptModel, ProvenanceQuantumModel
|
|
52
56
|
from ._communicators import ScannerCommunicator
|
|
53
|
-
from ._structs import IngestRequest, ScanReport
|
|
57
|
+
from ._structs import IngestRequest, InProgressScan, ScanReport, ScanStatus, WriteRequest
|
|
54
58
|
|
|
55
59
|
|
|
56
60
|
@dataclasses.dataclass
|
|
@@ -90,7 +94,7 @@ class Scanner(AbstractContextManager):
|
|
|
90
94
|
if self.comms.config.mock_storage_classes:
|
|
91
95
|
import lsst.pipe.base.tests.mocks # noqa: F401
|
|
92
96
|
self.comms.log.verbose("Reading from predicted quantum graph.")
|
|
93
|
-
self.reader = self.comms.
|
|
97
|
+
self.reader = self.comms.enter(
|
|
94
98
|
PredictedQuantumGraphReader.open(self.predicted_path, import_mode=TaskImportMode.DO_NOT_IMPORT)
|
|
95
99
|
)
|
|
96
100
|
self.reader.read_dimension_data()
|
|
@@ -192,7 +196,7 @@ class Scanner(AbstractContextManager):
|
|
|
192
196
|
ref = self.reader.components.make_dataset_ref(predicted)
|
|
193
197
|
return self.qbb.stored(ref)
|
|
194
198
|
|
|
195
|
-
def scan_quantum(self, quantum_id: uuid.UUID) ->
|
|
199
|
+
def scan_quantum(self, quantum_id: uuid.UUID) -> InProgressScan:
|
|
196
200
|
"""Scan for a quantum's completion and error status, and its output
|
|
197
201
|
datasets' existence.
|
|
198
202
|
|
|
@@ -203,38 +207,76 @@ class Scanner(AbstractContextManager):
|
|
|
203
207
|
|
|
204
208
|
Returns
|
|
205
209
|
-------
|
|
206
|
-
result : `
|
|
210
|
+
result : `InProgressScan`
|
|
207
211
|
Scan result struct.
|
|
208
212
|
"""
|
|
209
213
|
if (predicted_quantum := self.init_quanta.get(quantum_id)) is not None:
|
|
210
|
-
result =
|
|
211
|
-
predicted_quantum.quantum_id, status=ProvenanceQuantumScanStatus.INIT
|
|
212
|
-
)
|
|
214
|
+
result = InProgressScan(predicted_quantum.quantum_id, status=ScanStatus.INIT)
|
|
213
215
|
self.comms.log.debug("Created init scan for %s (%s)", quantum_id, predicted_quantum.task_label)
|
|
214
216
|
else:
|
|
215
217
|
self.reader.read_quantum_datasets([quantum_id])
|
|
216
|
-
predicted_quantum = self.reader.components.quantum_datasets
|
|
218
|
+
predicted_quantum = self.reader.components.quantum_datasets[quantum_id]
|
|
217
219
|
self.comms.log.debug(
|
|
218
220
|
"Scanning %s (%s@%s)",
|
|
219
221
|
quantum_id,
|
|
220
222
|
predicted_quantum.task_label,
|
|
221
223
|
predicted_quantum.data_coordinate,
|
|
222
224
|
)
|
|
223
|
-
|
|
224
|
-
|
|
225
|
-
|
|
226
|
-
|
|
227
|
-
|
|
228
|
-
|
|
229
|
-
|
|
225
|
+
result = InProgressScan(predicted_quantum.quantum_id, ScanStatus.INCOMPLETE)
|
|
226
|
+
del self.reader.components.quantum_datasets[quantum_id]
|
|
227
|
+
last_attempt = ProvenanceQuantumAttemptModel()
|
|
228
|
+
if not self._read_log(predicted_quantum, result, last_attempt):
|
|
229
|
+
self.comms.log.debug("Abandoning scan for %s; no log dataset.", quantum_id)
|
|
230
|
+
self.comms.report_scan(ScanReport(result.quantum_id, result.status))
|
|
231
|
+
return result
|
|
232
|
+
if not self._read_metadata(predicted_quantum, result, last_attempt):
|
|
233
|
+
# We found the log dataset, but no metadata; this means the
|
|
234
|
+
# quantum failed, but a retry might still happen that could
|
|
235
|
+
# turn it into a success if we can't yet assume the run is
|
|
236
|
+
# complete.
|
|
237
|
+
self.comms.log.debug("Abandoning scan for %s.", quantum_id)
|
|
230
238
|
self.comms.report_scan(ScanReport(result.quantum_id, result.status))
|
|
231
239
|
return result
|
|
240
|
+
last_attempt.attempt = len(result.attempts)
|
|
241
|
+
result.attempts.append(last_attempt)
|
|
242
|
+
assert result.status is not ScanStatus.INCOMPLETE
|
|
243
|
+
assert result.status is not ScanStatus.ABANDONED
|
|
244
|
+
|
|
245
|
+
if len(result.logs.attempts) < len(result.attempts):
|
|
246
|
+
# Logs were not found for this attempt; must have been a hard error
|
|
247
|
+
# that kept the `finally` block from running or otherwise
|
|
248
|
+
# interrupted the writing of the logs.
|
|
249
|
+
result.logs.attempts.append(None)
|
|
250
|
+
if result.status is ScanStatus.SUCCESSFUL:
|
|
251
|
+
# But we found the metadata! Either that hard error happened
|
|
252
|
+
# at a very unlucky time (in between those two writes), or
|
|
253
|
+
# something even weirder happened.
|
|
254
|
+
result.attempts[-1].status = QuantumAttemptStatus.LOGS_MISSING
|
|
255
|
+
else:
|
|
256
|
+
result.attempts[-1].status = QuantumAttemptStatus.FAILED
|
|
257
|
+
if len(result.metadata.attempts) < len(result.attempts):
|
|
258
|
+
# Metadata missing usually just means a failure. In any case, the
|
|
259
|
+
# status will already be correct, either because it was set to a
|
|
260
|
+
# failure when we read the logs, or left at UNKNOWN if there were
|
|
261
|
+
# no logs. Note that scanners never process BLOCKED quanta at all.
|
|
262
|
+
result.metadata.attempts.append(None)
|
|
263
|
+
assert len(result.logs.attempts) == len(result.attempts) or len(result.metadata.attempts) == len(
|
|
264
|
+
result.attempts
|
|
265
|
+
), (
|
|
266
|
+
"The only way we can add more than one quantum attempt is by "
|
|
267
|
+
"extracting info stored with the logs, and that always appends "
|
|
268
|
+
"a log attempt and a metadata attempt, so this must be a bug in "
|
|
269
|
+
"the scanner."
|
|
270
|
+
)
|
|
271
|
+
# Scan for output dataset existence, skipping any the metadata reported
|
|
272
|
+
# on as well as the metadata and logs themselves (since we just
|
|
273
|
+
# checked those).
|
|
232
274
|
for predicted_output in itertools.chain.from_iterable(predicted_quantum.outputs.values()):
|
|
233
|
-
if predicted_output.dataset_id not in result.
|
|
234
|
-
result.
|
|
275
|
+
if predicted_output.dataset_id not in result.outputs:
|
|
276
|
+
result.outputs[predicted_output.dataset_id] = self.scan_dataset(predicted_output)
|
|
235
277
|
to_ingest = self._make_ingest_request(predicted_quantum, result)
|
|
236
278
|
if self.comms.config.output_path is not None:
|
|
237
|
-
to_write =
|
|
279
|
+
to_write = self._make_write_request(predicted_quantum, result)
|
|
238
280
|
self.comms.request_write(to_write)
|
|
239
281
|
self.comms.request_ingest(to_ingest)
|
|
240
282
|
self.comms.report_scan(ScanReport(result.quantum_id, result.status))
|
|
@@ -242,7 +284,7 @@ class Scanner(AbstractContextManager):
|
|
|
242
284
|
return result
|
|
243
285
|
|
|
244
286
|
def _make_ingest_request(
|
|
245
|
-
self, predicted_quantum: PredictedQuantumDatasetsModel, result:
|
|
287
|
+
self, predicted_quantum: PredictedQuantumDatasetsModel, result: InProgressScan
|
|
246
288
|
) -> IngestRequest:
|
|
247
289
|
"""Make an ingest request from a quantum scan.
|
|
248
290
|
|
|
@@ -250,7 +292,7 @@ class Scanner(AbstractContextManager):
|
|
|
250
292
|
----------
|
|
251
293
|
predicted_quantum : `PredictedQuantumDatasetsModel`
|
|
252
294
|
Information about the predicted quantum.
|
|
253
|
-
result : `
|
|
295
|
+
result : `InProgressScan`
|
|
254
296
|
Result of a quantum scan.
|
|
255
297
|
|
|
256
298
|
Returns
|
|
@@ -263,7 +305,7 @@ class Scanner(AbstractContextManager):
|
|
|
263
305
|
}
|
|
264
306
|
to_ingest_predicted: list[PredictedDatasetModel] = []
|
|
265
307
|
to_ingest_refs: list[DatasetRef] = []
|
|
266
|
-
for dataset_id, was_produced in result.
|
|
308
|
+
for dataset_id, was_produced in result.outputs.items():
|
|
267
309
|
if was_produced:
|
|
268
310
|
predicted_output = predicted_outputs_by_id[dataset_id]
|
|
269
311
|
to_ingest_predicted.append(predicted_output)
|
|
@@ -271,18 +313,69 @@ class Scanner(AbstractContextManager):
|
|
|
271
313
|
to_ingest_records = self.qbb._datastore.export_predicted_records(to_ingest_refs)
|
|
272
314
|
return IngestRequest(result.quantum_id, to_ingest_predicted, to_ingest_records)
|
|
273
315
|
|
|
274
|
-
def
|
|
275
|
-
|
|
316
|
+
def _make_write_request(
|
|
317
|
+
self, predicted_quantum: PredictedQuantumDatasetsModel, result: InProgressScan
|
|
318
|
+
) -> WriteRequest:
|
|
319
|
+
"""Make a write request from a quantum scan.
|
|
276
320
|
|
|
277
321
|
Parameters
|
|
278
322
|
----------
|
|
279
323
|
predicted_quantum : `PredictedQuantumDatasetsModel`
|
|
280
324
|
Information about the predicted quantum.
|
|
325
|
+
result : `InProgressScan`
|
|
326
|
+
Result of a quantum scan.
|
|
281
327
|
|
|
282
328
|
Returns
|
|
283
329
|
-------
|
|
284
|
-
|
|
285
|
-
|
|
330
|
+
write_request : `WriteRequest`
|
|
331
|
+
A request to be sent to the writer.
|
|
332
|
+
"""
|
|
333
|
+
quantum: ProvenanceInitQuantumModel | ProvenanceQuantumModel
|
|
334
|
+
if result.status is ScanStatus.INIT:
|
|
335
|
+
quantum = ProvenanceInitQuantumModel.from_predicted(predicted_quantum)
|
|
336
|
+
else:
|
|
337
|
+
quantum = ProvenanceQuantumModel.from_predicted(predicted_quantum)
|
|
338
|
+
quantum.attempts = result.attempts
|
|
339
|
+
request = WriteRequest(
|
|
340
|
+
result.quantum_id,
|
|
341
|
+
result.status,
|
|
342
|
+
existing_outputs={
|
|
343
|
+
dataset_id for dataset_id, was_produced in result.outputs.items() if was_produced
|
|
344
|
+
},
|
|
345
|
+
quantum=quantum.model_dump_json().encode(),
|
|
346
|
+
logs=result.logs.model_dump_json().encode() if result.logs.attempts else b"",
|
|
347
|
+
metadata=result.metadata.model_dump_json().encode() if result.metadata.attempts else b"",
|
|
348
|
+
)
|
|
349
|
+
if self.compressor is not None:
|
|
350
|
+
request.quantum = self.compressor.compress(request.quantum)
|
|
351
|
+
request.logs = self.compressor.compress(request.logs) if request.logs else b""
|
|
352
|
+
request.metadata = self.compressor.compress(request.metadata) if request.metadata else b""
|
|
353
|
+
request.is_compressed = True
|
|
354
|
+
return request
|
|
355
|
+
|
|
356
|
+
def _read_metadata(
|
|
357
|
+
self,
|
|
358
|
+
predicted_quantum: PredictedQuantumDatasetsModel,
|
|
359
|
+
result: InProgressScan,
|
|
360
|
+
last_attempt: ProvenanceQuantumAttemptModel,
|
|
361
|
+
) -> bool:
|
|
362
|
+
"""Attempt to read the metadata dataset for a quantum to extract
|
|
363
|
+
provenance information from it.
|
|
364
|
+
|
|
365
|
+
Parameters
|
|
366
|
+
----------
|
|
367
|
+
predicted_quantum : `PredictedQuantumDatasetsModel`
|
|
368
|
+
Information about the predicted quantum.
|
|
369
|
+
result : `InProgressScan`
|
|
370
|
+
Result object to be modified in-place.
|
|
371
|
+
last_attempt : `ProvenanceQuantumAttemptModel`
|
|
372
|
+
Structure to fill in with information about the last attempt to
|
|
373
|
+
run this quantum.
|
|
374
|
+
|
|
375
|
+
Returns
|
|
376
|
+
-------
|
|
377
|
+
complete : `bool`
|
|
378
|
+
Whether the quantum is complete.
|
|
286
379
|
"""
|
|
287
380
|
(predicted_dataset,) = predicted_quantum.outputs[acc.METADATA_OUTPUT_CONNECTION_NAME]
|
|
288
381
|
ref = self.reader.components.make_dataset_ref(predicted_dataset)
|
|
@@ -290,28 +383,129 @@ class Scanner(AbstractContextManager):
|
|
|
290
383
|
# This assumes QBB metadata writes are atomic, which should be the
|
|
291
384
|
# case. If it's not we'll probably get pydantic validation errors
|
|
292
385
|
# here.
|
|
293
|
-
|
|
386
|
+
metadata: TaskMetadata = self.qbb.get(ref, storageClass="TaskMetadata")
|
|
294
387
|
except FileNotFoundError:
|
|
295
|
-
|
|
296
|
-
|
|
297
|
-
|
|
298
|
-
|
|
388
|
+
result.outputs[ref.id] = False
|
|
389
|
+
if self.comms.config.assume_complete:
|
|
390
|
+
result.status = ScanStatus.FAILED
|
|
391
|
+
else:
|
|
392
|
+
result.status = ScanStatus.ABANDONED
|
|
393
|
+
return False
|
|
394
|
+
else:
|
|
395
|
+
result.status = ScanStatus.SUCCESSFUL
|
|
396
|
+
result.outputs[ref.id] = True
|
|
397
|
+
last_attempt.status = QuantumAttemptStatus.SUCCESSFUL
|
|
398
|
+
try:
|
|
399
|
+
# Int conversion guards against spurious conversion to
|
|
400
|
+
# float that can apparently sometimes happen in
|
|
401
|
+
# TaskMetadata.
|
|
402
|
+
last_attempt.caveats = QuantumSuccessCaveats(int(metadata["quantum"]["caveats"]))
|
|
403
|
+
except LookupError:
|
|
404
|
+
pass
|
|
405
|
+
try:
|
|
406
|
+
last_attempt.exception = ExceptionInfo._from_metadata(
|
|
407
|
+
metadata[predicted_quantum.task_label]["failure"]
|
|
408
|
+
)
|
|
409
|
+
except LookupError:
|
|
410
|
+
pass
|
|
411
|
+
try:
|
|
412
|
+
for id_str in ensure_iterable(metadata["quantum"].getArray("outputs")):
|
|
413
|
+
result.outputs[uuid.UUID(id_str)]
|
|
414
|
+
except LookupError:
|
|
415
|
+
pass
|
|
416
|
+
else:
|
|
417
|
+
# If the metadata told us what it wrote, anything not in that
|
|
418
|
+
# list was not written.
|
|
419
|
+
for predicted_output in itertools.chain.from_iterable(predicted_quantum.outputs.values()):
|
|
420
|
+
result.outputs.setdefault(predicted_output.dataset_id, False)
|
|
421
|
+
last_attempt.resource_usage = QuantumResourceUsage.from_task_metadata(metadata)
|
|
422
|
+
result.metadata.attempts.append(metadata)
|
|
423
|
+
return True
|
|
424
|
+
|
|
425
|
+
def _read_log(
|
|
426
|
+
self,
|
|
427
|
+
predicted_quantum: PredictedQuantumDatasetsModel,
|
|
428
|
+
result: InProgressScan,
|
|
429
|
+
last_attempt: ProvenanceQuantumAttemptModel,
|
|
430
|
+
) -> bool:
|
|
431
|
+
"""Attempt to read the log dataset for a quantum to test for the
|
|
432
|
+
quantum's completion (the log is always written last) and aggregate
|
|
433
|
+
the log content in the provenance quantum graph.
|
|
299
434
|
|
|
300
435
|
Parameters
|
|
301
436
|
----------
|
|
302
437
|
predicted_quantum : `PredictedQuantumDatasetsModel`
|
|
303
438
|
Information about the predicted quantum.
|
|
439
|
+
result : `InProgressScan`
|
|
440
|
+
Result object to be modified in-place.
|
|
441
|
+
last_attempt : `ProvenanceQuantumAttemptModel`
|
|
442
|
+
Structure to fill in with information about the last attempt to
|
|
443
|
+
run this quantum.
|
|
304
444
|
|
|
305
445
|
Returns
|
|
306
446
|
-------
|
|
307
|
-
|
|
308
|
-
|
|
447
|
+
complete : `bool`
|
|
448
|
+
Whether the quantum is complete.
|
|
309
449
|
"""
|
|
310
450
|
(predicted_dataset,) = predicted_quantum.outputs[acc.LOG_OUTPUT_CONNECTION_NAME]
|
|
311
451
|
ref = self.reader.components.make_dataset_ref(predicted_dataset)
|
|
312
452
|
try:
|
|
313
453
|
# This assumes QBB log writes are atomic, which should be the case.
|
|
314
454
|
# If it's not we'll probably get pydantic validation errors here.
|
|
315
|
-
|
|
455
|
+
log_records: ButlerLogRecords = self.qbb.get(ref)
|
|
316
456
|
except FileNotFoundError:
|
|
317
|
-
|
|
457
|
+
result.outputs[ref.id] = False
|
|
458
|
+
if self.comms.config.assume_complete:
|
|
459
|
+
result.status = ScanStatus.FAILED
|
|
460
|
+
else:
|
|
461
|
+
result.status = ScanStatus.ABANDONED
|
|
462
|
+
return False
|
|
463
|
+
else:
|
|
464
|
+
# Set the attempt's run status to FAILED, since the default is
|
|
465
|
+
# UNKNOWN (i.e. logs *and* metadata are missing) and we now know
|
|
466
|
+
# the logs exist. This will usually get replaced by SUCCESSFUL
|
|
467
|
+
# when we look for metadata next.
|
|
468
|
+
last_attempt.status = QuantumAttemptStatus.FAILED
|
|
469
|
+
result.outputs[ref.id] = True
|
|
470
|
+
if log_records.extra:
|
|
471
|
+
log_extra = _ExecutionLogRecordsExtra.model_validate(log_records.extra)
|
|
472
|
+
self._extract_from_log_extra(log_extra, result, last_attempt=last_attempt)
|
|
473
|
+
result.logs.attempts.append(list(log_records))
|
|
474
|
+
return True
|
|
475
|
+
|
|
476
|
+
def _extract_from_log_extra(
|
|
477
|
+
self,
|
|
478
|
+
log_extra: _ExecutionLogRecordsExtra,
|
|
479
|
+
result: InProgressScan,
|
|
480
|
+
last_attempt: ProvenanceQuantumAttemptModel | None,
|
|
481
|
+
) -> None:
|
|
482
|
+
for previous_attempt_log_extra in log_extra.previous_attempts:
|
|
483
|
+
self._extract_from_log_extra(previous_attempt_log_extra, result, last_attempt=None)
|
|
484
|
+
quantum_attempt: ProvenanceQuantumAttemptModel
|
|
485
|
+
if last_attempt is None:
|
|
486
|
+
# This is not the last attempt, so it must be a failure.
|
|
487
|
+
quantum_attempt = ProvenanceQuantumAttemptModel(
|
|
488
|
+
attempt=len(result.attempts), status=QuantumAttemptStatus.FAILED
|
|
489
|
+
)
|
|
490
|
+
# We also need to get the logs from this extra provenance, since
|
|
491
|
+
# they won't be the main section of the log records.
|
|
492
|
+
result.logs.attempts.append(log_extra.logs)
|
|
493
|
+
# The special last attempt is only appended after we attempt to
|
|
494
|
+
# read metadata later, but we have to append this one now.
|
|
495
|
+
result.attempts.append(quantum_attempt)
|
|
496
|
+
else:
|
|
497
|
+
assert not log_extra.logs, "Logs for the last attempt should not be stored in the extra JSON."
|
|
498
|
+
quantum_attempt = last_attempt
|
|
499
|
+
if log_extra.exception is not None or log_extra.metadata is not None or last_attempt is None:
|
|
500
|
+
# We won't be getting a separate metadata dataset, so anything we
|
|
501
|
+
# might get from the metadata has to come from this extra
|
|
502
|
+
# provenance in the logs.
|
|
503
|
+
quantum_attempt.exception = log_extra.exception
|
|
504
|
+
if log_extra.metadata is not None:
|
|
505
|
+
quantum_attempt.resource_usage = QuantumResourceUsage.from_task_metadata(log_extra.metadata)
|
|
506
|
+
result.metadata.attempts.append(log_extra.metadata)
|
|
507
|
+
else:
|
|
508
|
+
result.metadata.attempts.append(None)
|
|
509
|
+
# Regardless of whether this is the last attempt or not, we can only
|
|
510
|
+
# get the previous_process_quanta from the log extra.
|
|
511
|
+
quantum_attempt.previous_process_quanta.extend(log_extra.previous_process_quanta)
|
|
@@ -27,16 +27,68 @@
|
|
|
27
27
|
|
|
28
28
|
from __future__ import annotations
|
|
29
29
|
|
|
30
|
-
__all__ = (
|
|
30
|
+
__all__ = (
|
|
31
|
+
"InProgressScan",
|
|
32
|
+
"IngestRequest",
|
|
33
|
+
"ScanReport",
|
|
34
|
+
"ScanStatus",
|
|
35
|
+
"WriteRequest",
|
|
36
|
+
)
|
|
31
37
|
|
|
32
38
|
import dataclasses
|
|
39
|
+
import enum
|
|
33
40
|
import uuid
|
|
34
41
|
|
|
35
42
|
from lsst.daf.butler.datastore.record_data import DatastoreRecordData
|
|
36
43
|
|
|
37
44
|
from .._common import DatastoreName
|
|
38
45
|
from .._predicted import PredictedDatasetModel
|
|
39
|
-
from .._provenance import
|
|
46
|
+
from .._provenance import (
|
|
47
|
+
ProvenanceLogRecordsModel,
|
|
48
|
+
ProvenanceQuantumAttemptModel,
|
|
49
|
+
ProvenanceTaskMetadataModel,
|
|
50
|
+
)
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
class ScanStatus(enum.Enum):
|
|
54
|
+
"""Status enum for quantum scanning.
|
|
55
|
+
|
|
56
|
+
Note that this records the status for the *scanning* which is distinct
|
|
57
|
+
from the status of the quantum's execution.
|
|
58
|
+
"""
|
|
59
|
+
|
|
60
|
+
INCOMPLETE = enum.auto()
|
|
61
|
+
"""The quantum is not necessarily done running, and cannot be scanned
|
|
62
|
+
conclusively yet.
|
|
63
|
+
"""
|
|
64
|
+
|
|
65
|
+
ABANDONED = enum.auto()
|
|
66
|
+
"""The quantum's execution appears to have failed but we cannot rule out
|
|
67
|
+
the possibility that it could be recovered, but we've also waited long
|
|
68
|
+
enough (according to `ScannerTimeConfigDict.retry_timeout`) that it's time
|
|
69
|
+
to stop trying for now.
|
|
70
|
+
|
|
71
|
+
This state means a later run with `ScannerConfig.assume_complete` is
|
|
72
|
+
required.
|
|
73
|
+
"""
|
|
74
|
+
|
|
75
|
+
SUCCESSFUL = enum.auto()
|
|
76
|
+
"""The quantum was conclusively scanned and was executed successfully,
|
|
77
|
+
unblocking scans for downstream quanta.
|
|
78
|
+
"""
|
|
79
|
+
|
|
80
|
+
FAILED = enum.auto()
|
|
81
|
+
"""The quantum was conclusively scanned and failed execution, blocking
|
|
82
|
+
scans for downstream quanta.
|
|
83
|
+
"""
|
|
84
|
+
|
|
85
|
+
BLOCKED = enum.auto()
|
|
86
|
+
"""A quantum upstream of this one failed."""
|
|
87
|
+
|
|
88
|
+
INIT = enum.auto()
|
|
89
|
+
"""Init quanta need special handling, because they don't have logs and
|
|
90
|
+
metadata.
|
|
91
|
+
"""
|
|
40
92
|
|
|
41
93
|
|
|
42
94
|
@dataclasses.dataclass
|
|
@@ -46,7 +98,7 @@ class ScanReport:
|
|
|
46
98
|
quantum_id: uuid.UUID
|
|
47
99
|
"""Unique ID of the quantum."""
|
|
48
100
|
|
|
49
|
-
status:
|
|
101
|
+
status: ScanStatus
|
|
50
102
|
"""Combined status of the scan and the execution of the quantum."""
|
|
51
103
|
|
|
52
104
|
|
|
@@ -65,3 +117,61 @@ class IngestRequest:
|
|
|
65
117
|
|
|
66
118
|
def __bool__(self) -> bool:
|
|
67
119
|
return bool(self.datasets or self.records)
|
|
120
|
+
|
|
121
|
+
|
|
122
|
+
@dataclasses.dataclass
|
|
123
|
+
class InProgressScan:
|
|
124
|
+
"""A struct that represents a quantum that is being scanned."""
|
|
125
|
+
|
|
126
|
+
quantum_id: uuid.UUID
|
|
127
|
+
"""Unique ID for the quantum."""
|
|
128
|
+
|
|
129
|
+
status: ScanStatus
|
|
130
|
+
"""Combined status for the scan and the execution of the quantum."""
|
|
131
|
+
|
|
132
|
+
attempts: list[ProvenanceQuantumAttemptModel] = dataclasses.field(default_factory=list)
|
|
133
|
+
"""Provenance information about each attempt to run the quantum."""
|
|
134
|
+
|
|
135
|
+
outputs: dict[uuid.UUID, bool] = dataclasses.field(default_factory=dict)
|
|
136
|
+
"""Unique IDs of the output datasets mapped to whether they were actually
|
|
137
|
+
produced.
|
|
138
|
+
"""
|
|
139
|
+
|
|
140
|
+
metadata: ProvenanceTaskMetadataModel = dataclasses.field(default_factory=ProvenanceTaskMetadataModel)
|
|
141
|
+
"""Task metadata information for each attempt.
|
|
142
|
+
"""
|
|
143
|
+
|
|
144
|
+
logs: ProvenanceLogRecordsModel = dataclasses.field(default_factory=ProvenanceLogRecordsModel)
|
|
145
|
+
"""Log records for each attempt.
|
|
146
|
+
"""
|
|
147
|
+
|
|
148
|
+
|
|
149
|
+
@dataclasses.dataclass
|
|
150
|
+
class WriteRequest:
|
|
151
|
+
"""A struct that represents a request to write provenance for a quantum."""
|
|
152
|
+
|
|
153
|
+
quantum_id: uuid.UUID
|
|
154
|
+
"""Unique ID for the quantum."""
|
|
155
|
+
|
|
156
|
+
status: ScanStatus
|
|
157
|
+
"""Combined status for the scan and the execution of the quantum."""
|
|
158
|
+
|
|
159
|
+
existing_outputs: set[uuid.UUID] = dataclasses.field(default_factory=set)
|
|
160
|
+
"""Unique IDs of the output datasets that were actually written."""
|
|
161
|
+
|
|
162
|
+
quantum: bytes = b""
|
|
163
|
+
"""Serialized quantum provenance model.
|
|
164
|
+
|
|
165
|
+
This may be empty for quanta that had no attempts.
|
|
166
|
+
"""
|
|
167
|
+
|
|
168
|
+
metadata: bytes = b""
|
|
169
|
+
"""Serialized task metadata."""
|
|
170
|
+
|
|
171
|
+
logs: bytes = b""
|
|
172
|
+
"""Serialized logs."""
|
|
173
|
+
|
|
174
|
+
is_compressed: bool = False
|
|
175
|
+
Whether the `quantum`, `metadata`, and `logs` attributes are
|
|
176
|
+
compressed.
|
|
177
|
+
"""
|