lsst-pipe-base 29.2025.4500__py3-none-any.whl → 29.2025.4600__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lsst/pipe/base/_status.py +156 -11
- lsst/pipe/base/log_capture.py +98 -7
- lsst/pipe/base/pipeline_graph/expressions.py +3 -3
- lsst/pipe/base/quantum_graph/_common.py +6 -0
- lsst/pipe/base/quantum_graph/_predicted.py +13 -17
- lsst/pipe/base/quantum_graph/_provenance.py +322 -106
- lsst/pipe/base/quantum_graph/aggregator/_communicators.py +9 -9
- lsst/pipe/base/quantum_graph/aggregator/_progress.py +77 -84
- lsst/pipe/base/quantum_graph/aggregator/_scanner.py +154 -53
- lsst/pipe/base/quantum_graph/aggregator/_structs.py +27 -34
- lsst/pipe/base/quantum_graph/aggregator/_supervisor.py +8 -7
- lsst/pipe/base/quantum_graph/aggregator/_writer.py +5 -8
- lsst/pipe/base/quantum_provenance_graph.py +2 -44
- lsst/pipe/base/single_quantum_executor.py +43 -9
- lsst/pipe/base/tests/mocks/_data_id_match.py +1 -1
- lsst/pipe/base/tests/mocks/_pipeline_task.py +1 -1
- lsst/pipe/base/version.py +1 -1
- {lsst_pipe_base-29.2025.4500.dist-info → lsst_pipe_base-29.2025.4600.dist-info}/METADATA +1 -1
- {lsst_pipe_base-29.2025.4500.dist-info → lsst_pipe_base-29.2025.4600.dist-info}/RECORD +27 -27
- {lsst_pipe_base-29.2025.4500.dist-info → lsst_pipe_base-29.2025.4600.dist-info}/WHEEL +0 -0
- {lsst_pipe_base-29.2025.4500.dist-info → lsst_pipe_base-29.2025.4600.dist-info}/entry_points.txt +0 -0
- {lsst_pipe_base-29.2025.4500.dist-info → lsst_pipe_base-29.2025.4600.dist-info}/licenses/COPYRIGHT +0 -0
- {lsst_pipe_base-29.2025.4500.dist-info → lsst_pipe_base-29.2025.4600.dist-info}/licenses/LICENSE +0 -0
- {lsst_pipe_base-29.2025.4500.dist-info → lsst_pipe_base-29.2025.4600.dist-info}/licenses/bsd_license.txt +0 -0
- {lsst_pipe_base-29.2025.4500.dist-info → lsst_pipe_base-29.2025.4600.dist-info}/licenses/gpl-v3.0.txt +0 -0
- {lsst_pipe_base-29.2025.4500.dist-info → lsst_pipe_base-29.2025.4600.dist-info}/top_level.txt +0 -0
- {lsst_pipe_base-29.2025.4500.dist-info → lsst_pipe_base-29.2025.4600.dist-info}/zip-safe +0 -0
|
@@ -59,7 +59,7 @@ from typing import Any, Literal, Self, TypeAlias, TypeVar, Union
|
|
|
59
59
|
from lsst.utils.logging import VERBOSE, LsstLogAdapter
|
|
60
60
|
|
|
61
61
|
from ._config import AggregatorConfig
|
|
62
|
-
from ._progress import
|
|
62
|
+
from ._progress import ProgressManager, make_worker_log
|
|
63
63
|
from ._structs import IngestRequest, ScanReport, ScanResult
|
|
64
64
|
|
|
65
65
|
_T = TypeVar("_T")
|
|
@@ -340,7 +340,7 @@ class SupervisorCommunicator:
|
|
|
340
340
|
config: AggregatorConfig,
|
|
341
341
|
) -> None:
|
|
342
342
|
self.config = config
|
|
343
|
-
self.progress =
|
|
343
|
+
self.progress = ProgressManager(log, config)
|
|
344
344
|
self.n_scanners = n_scanners
|
|
345
345
|
# The supervisor sends scan requests to scanners on this queue.
|
|
346
346
|
# When complete, the supervisor sends n_scanners sentinels and each
|
|
@@ -406,13 +406,13 @@ class SupervisorCommunicator:
|
|
|
406
406
|
pass
|
|
407
407
|
case _Sentinel.INGESTER_DONE:
|
|
408
408
|
self._ingester_done = True
|
|
409
|
-
self.progress.
|
|
409
|
+
self.progress.quantum_ingests.close()
|
|
410
410
|
case _Sentinel.SCANNER_DONE:
|
|
411
411
|
self._n_scanners_done += 1
|
|
412
|
-
self.progress.
|
|
412
|
+
self.progress.scans.close()
|
|
413
413
|
case _Sentinel.WRITER_DONE:
|
|
414
414
|
self._writer_done = True
|
|
415
|
-
self.progress.
|
|
415
|
+
self.progress.writes.close()
|
|
416
416
|
case unexpected:
|
|
417
417
|
raise AssertionError(f"Unexpected message {unexpected!r} to supervisor.")
|
|
418
418
|
self.log.verbose(
|
|
@@ -530,9 +530,9 @@ class SupervisorCommunicator:
|
|
|
530
530
|
if not already_failing:
|
|
531
531
|
raise FatalWorkerError()
|
|
532
532
|
case _IngestReport(n_producers=n_producers):
|
|
533
|
-
self.progress.
|
|
533
|
+
self.progress.quantum_ingests.update(n_producers)
|
|
534
534
|
case _Sentinel.WRITE_REPORT:
|
|
535
|
-
self.progress.
|
|
535
|
+
self.progress.writes.update(1)
|
|
536
536
|
case _ProgressLog(message=message, level=level):
|
|
537
537
|
self.progress.log.log(level, "%s [after %0.1fs]", message, self.progress.elapsed_time)
|
|
538
538
|
case _:
|
|
@@ -626,10 +626,10 @@ class WorkerCommunicator:
|
|
|
626
626
|
|
|
627
627
|
Parameters
|
|
628
628
|
----------
|
|
629
|
-
message : `str`
|
|
630
|
-
Log message.
|
|
631
629
|
level : `int`
|
|
632
630
|
Log level. Should be ``VERBOSE`` or higher.
|
|
631
|
+
message : `str`
|
|
632
|
+
Log message.
|
|
633
633
|
"""
|
|
634
634
|
self._reports.put(_ProgressLog(message=message, level=level), block=False)
|
|
635
635
|
|
|
@@ -27,20 +27,86 @@
|
|
|
27
27
|
|
|
28
28
|
from __future__ import annotations
|
|
29
29
|
|
|
30
|
-
__all__ = ("
|
|
30
|
+
__all__ = ("ProgressCounter", "ProgressManager", "make_worker_log")
|
|
31
31
|
|
|
32
32
|
import logging
|
|
33
33
|
import os
|
|
34
34
|
import time
|
|
35
35
|
from types import TracebackType
|
|
36
|
-
from typing import Self
|
|
36
|
+
from typing import Any, Self
|
|
37
37
|
|
|
38
38
|
from lsst.utils.logging import TRACE, VERBOSE, LsstLogAdapter, PeriodicLogger, getLogger
|
|
39
39
|
|
|
40
40
|
from ._config import AggregatorConfig
|
|
41
41
|
|
|
42
42
|
|
|
43
|
-
class
|
|
43
|
+
class ProgressCounter:
|
|
44
|
+
"""A progress tracker for an individual aspect of the aggregation process.
|
|
45
|
+
|
|
46
|
+
Parameters
|
|
47
|
+
----------
|
|
48
|
+
parent : `ProgressManager`
|
|
49
|
+
The parent progress manager object.
|
|
50
|
+
description : `str`
|
|
51
|
+
Human-readable description of this aspect.
|
|
52
|
+
unit : `str`
|
|
53
|
+
Unit (in plural form) for the items being counted.
|
|
54
|
+
total : `int`, optional
|
|
55
|
+
Expected total number of items. May be set later.
|
|
56
|
+
"""
|
|
57
|
+
|
|
58
|
+
def __init__(self, parent: ProgressManager, description: str, unit: str, total: int | None = None):
|
|
59
|
+
self._parent = parent
|
|
60
|
+
self.total = total
|
|
61
|
+
self._description = description
|
|
62
|
+
self._current = 0
|
|
63
|
+
self._unit = unit
|
|
64
|
+
self._bar: Any = None
|
|
65
|
+
|
|
66
|
+
def update(self, n: int) -> None:
|
|
67
|
+
"""Report that ``n`` new items have been processed.
|
|
68
|
+
|
|
69
|
+
Parameters
|
|
70
|
+
----------
|
|
71
|
+
n : `int`
|
|
72
|
+
Number of new items processed.
|
|
73
|
+
"""
|
|
74
|
+
self._current += n
|
|
75
|
+
if self._parent.interactive:
|
|
76
|
+
if self._bar is None:
|
|
77
|
+
if n == self.total:
|
|
78
|
+
return
|
|
79
|
+
from tqdm import tqdm
|
|
80
|
+
|
|
81
|
+
self._bar = tqdm(desc=self._description, total=self.total, leave=False, unit=f" {self._unit}")
|
|
82
|
+
else:
|
|
83
|
+
self._bar.update(n)
|
|
84
|
+
if self._current == self.total:
|
|
85
|
+
self._bar.close()
|
|
86
|
+
self._parent._log_status()
|
|
87
|
+
|
|
88
|
+
def close(self) -> None:
|
|
89
|
+
"""Close the counter, guaranteeing that `update` will not be called
|
|
90
|
+
again.
|
|
91
|
+
"""
|
|
92
|
+
if self._bar is not None:
|
|
93
|
+
self._bar.close()
|
|
94
|
+
self._bar = None
|
|
95
|
+
|
|
96
|
+
def append_log_terms(self, msg: list[str]) -> None:
|
|
97
|
+
"""Append a log message for this counter to a list if it is active.
|
|
98
|
+
|
|
99
|
+
Parameters
|
|
100
|
+
----------
|
|
101
|
+
msg : `list` [ `str` ]
|
|
102
|
+
List of messages to concatenate into a single line and log
|
|
103
|
+
together, to be modified in-place.
|
|
104
|
+
"""
|
|
105
|
+
if self.total is not None and self._current > 0 and self._current < self.total:
|
|
106
|
+
msg.append(f"{self._description} ({self._current} of {self.total} {self._unit})")
|
|
107
|
+
|
|
108
|
+
|
|
109
|
+
class ProgressManager:
|
|
44
110
|
"""A helper class for the provenance aggregator that handles reporting
|
|
45
111
|
progress to the user.
|
|
46
112
|
|
|
@@ -66,10 +132,9 @@ class Progress:
|
|
|
66
132
|
self.log = log
|
|
67
133
|
self.config = config
|
|
68
134
|
self._periodic_log = PeriodicLogger(self.log, config.log_status_interval)
|
|
69
|
-
self.
|
|
70
|
-
self.
|
|
71
|
-
self.
|
|
72
|
-
self._n_quanta: int | None = None
|
|
135
|
+
self.scans = ProgressCounter(self, "scanning", "quanta")
|
|
136
|
+
self.writes = ProgressCounter(self, "writing", "quanta")
|
|
137
|
+
self.quantum_ingests = ProgressCounter(self, "ingesting outputs", "quanta")
|
|
73
138
|
self.interactive = config.interactive_status
|
|
74
139
|
|
|
75
140
|
def __enter__(self) -> Self:
|
|
@@ -90,29 +155,6 @@ class Progress:
|
|
|
90
155
|
self._logging_redirect.__exit__(exc_type, exc_value, traceback)
|
|
91
156
|
return None
|
|
92
157
|
|
|
93
|
-
def set_n_quanta(self, n_quanta: int) -> None:
|
|
94
|
-
"""Set the total number of quanta.
|
|
95
|
-
|
|
96
|
-
Parameters
|
|
97
|
-
----------
|
|
98
|
-
n_quanta : `int`
|
|
99
|
-
Total number of quanta, including special "init" quanta.
|
|
100
|
-
|
|
101
|
-
Notes
|
|
102
|
-
-----
|
|
103
|
-
This method must be called before any of the ``report_*`` methods.
|
|
104
|
-
"""
|
|
105
|
-
self._n_quanta = n_quanta
|
|
106
|
-
if self.interactive:
|
|
107
|
-
from tqdm import tqdm
|
|
108
|
-
|
|
109
|
-
self._scan_progress = tqdm(desc="Scanning", total=n_quanta, leave=False, unit="quanta")
|
|
110
|
-
self._ingest_progress = tqdm(
|
|
111
|
-
desc="Ingesting", total=n_quanta, leave=False, smoothing=0.1, unit="quanta"
|
|
112
|
-
)
|
|
113
|
-
if self.config.output_path is not None:
|
|
114
|
-
self._write_progress = tqdm(desc="Writing", total=n_quanta, leave=False, unit="quanta")
|
|
115
|
-
|
|
116
158
|
@property
|
|
117
159
|
def elapsed_time(self) -> float:
|
|
118
160
|
"""The time in seconds since the start of the aggregator."""
|
|
@@ -120,60 +162,11 @@ class Progress:
|
|
|
120
162
|
|
|
121
163
|
def _log_status(self) -> None:
|
|
122
164
|
"""Invoke the periodic logger with the current status."""
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
self._n_written,
|
|
129
|
-
self._n_quanta,
|
|
130
|
-
self.elapsed_time,
|
|
131
|
-
)
|
|
132
|
-
|
|
133
|
-
def report_scan(self) -> None:
|
|
134
|
-
"""Report that a quantum was scanned."""
|
|
135
|
-
self._n_scanned += 1
|
|
136
|
-
if self.interactive:
|
|
137
|
-
self._scan_progress.update(1)
|
|
138
|
-
else:
|
|
139
|
-
self._log_status()
|
|
140
|
-
|
|
141
|
-
def finish_scans(self) -> None:
|
|
142
|
-
"""Report that all scanning is done."""
|
|
143
|
-
if self.interactive:
|
|
144
|
-
self._scan_progress.close()
|
|
145
|
-
|
|
146
|
-
def report_ingests(self, n_quanta: int) -> None:
|
|
147
|
-
"""Report that ingests for multiple quanta were completed.
|
|
148
|
-
|
|
149
|
-
Parameters
|
|
150
|
-
----------
|
|
151
|
-
n_quanta : `int`
|
|
152
|
-
Number of quanta whose outputs were ingested.
|
|
153
|
-
"""
|
|
154
|
-
self._n_ingested += n_quanta
|
|
155
|
-
if self.interactive:
|
|
156
|
-
self._ingest_progress.update(n_quanta)
|
|
157
|
-
else:
|
|
158
|
-
self._log_status()
|
|
159
|
-
|
|
160
|
-
def finish_ingests(self) -> None:
|
|
161
|
-
"""Report that all ingests are done."""
|
|
162
|
-
if self.interactive:
|
|
163
|
-
self._ingest_progress.close()
|
|
164
|
-
|
|
165
|
-
def report_write(self) -> None:
|
|
166
|
-
"""Report that a quantum's provenance was written."""
|
|
167
|
-
self._n_written += 1
|
|
168
|
-
if self.interactive:
|
|
169
|
-
self._write_progress.update()
|
|
170
|
-
else:
|
|
171
|
-
self._log_status()
|
|
172
|
-
|
|
173
|
-
def finish_writes(self) -> None:
|
|
174
|
-
"""Report that all writes are done."""
|
|
175
|
-
if self.interactive:
|
|
176
|
-
self._write_progress.close()
|
|
165
|
+
log_terms: list[str] = []
|
|
166
|
+
self.scans.append_log_terms(log_terms)
|
|
167
|
+
self.writes.append_log_terms(log_terms)
|
|
168
|
+
self.quantum_ingests.append_log_terms(log_terms)
|
|
169
|
+
self._periodic_log.log("Status after %0.1fs: %s.", self.elapsed_time, "; ".join(log_terms))
|
|
177
170
|
|
|
178
171
|
|
|
179
172
|
def make_worker_log(name: str, config: AggregatorConfig) -> LsstLogAdapter:
|
|
@@ -39,10 +39,10 @@ from lsst.daf.butler import ButlerLogRecords, DatasetRef, QuantumBackedButler
|
|
|
39
39
|
from lsst.utils.iteration import ensure_iterable
|
|
40
40
|
|
|
41
41
|
from ... import automatic_connection_constants as acc
|
|
42
|
-
from ..._status import QuantumSuccessCaveats
|
|
42
|
+
from ..._status import ExceptionInfo, QuantumAttemptStatus, QuantumSuccessCaveats
|
|
43
43
|
from ..._task_metadata import TaskMetadata
|
|
44
|
+
from ...log_capture import _ExecutionLogRecordsExtra
|
|
44
45
|
from ...pipeline_graph import PipelineGraph, TaskImportMode
|
|
45
|
-
from ...quantum_provenance_graph import ExceptionInfo
|
|
46
46
|
from ...resource_usage import QuantumResourceUsage
|
|
47
47
|
from .._multiblock import Compressor
|
|
48
48
|
from .._predicted import (
|
|
@@ -50,6 +50,7 @@ from .._predicted import (
|
|
|
50
50
|
PredictedQuantumDatasetsModel,
|
|
51
51
|
PredictedQuantumGraphReader,
|
|
52
52
|
)
|
|
53
|
+
from .._provenance import ProvenanceQuantumAttemptModel
|
|
53
54
|
from ._communicators import ScannerCommunicator
|
|
54
55
|
from ._structs import IngestRequest, ScanReport, ScanResult, ScanStatus
|
|
55
56
|
|
|
@@ -179,7 +180,7 @@ class Scanner:
|
|
|
179
180
|
Returns
|
|
180
181
|
-------
|
|
181
182
|
exists : `bool`
|
|
182
|
-
Whether the dataset exists
|
|
183
|
+
Whether the dataset exists.
|
|
183
184
|
"""
|
|
184
185
|
ref = self.reader.components.make_dataset_ref(predicted)
|
|
185
186
|
return self.qbb.stored(ref)
|
|
@@ -212,29 +213,67 @@ class Scanner:
|
|
|
212
213
|
)
|
|
213
214
|
result = ScanResult(predicted_quantum.quantum_id, ScanStatus.INCOMPLETE)
|
|
214
215
|
del self.reader.components.quantum_datasets[quantum_id]
|
|
215
|
-
|
|
216
|
-
if not self.
|
|
216
|
+
last_attempt = ProvenanceQuantumAttemptModel()
|
|
217
|
+
if not self._read_log(predicted_quantum, result, last_attempt):
|
|
217
218
|
self.comms.log.debug("Abandoning scan for %s; no log dataset.", quantum_id)
|
|
218
|
-
result.status = ScanStatus.ABANDONED
|
|
219
219
|
self.comms.report_scan(ScanReport(result.quantum_id, result.status))
|
|
220
220
|
return result
|
|
221
|
-
|
|
222
|
-
if result.metadata:
|
|
223
|
-
result.status = ScanStatus.SUCCESSFUL
|
|
224
|
-
result.existing_outputs.add(metadata_id)
|
|
225
|
-
elif self.comms.config.assume_complete:
|
|
226
|
-
result.status = ScanStatus.FAILED
|
|
227
|
-
else:
|
|
221
|
+
if not self._read_metadata(predicted_quantum, result, last_attempt):
|
|
228
222
|
# We found the log dataset, but no metadata; this means the
|
|
229
223
|
# quantum failed, but a retry might still happen that could
|
|
230
224
|
# turn it into a success if we can't yet assume the run is
|
|
231
225
|
# complete.
|
|
232
226
|
self.comms.log.debug("Abandoning scan for %s.", quantum_id)
|
|
233
|
-
result.status = ScanStatus.ABANDONED
|
|
234
227
|
self.comms.report_scan(ScanReport(result.quantum_id, result.status))
|
|
235
228
|
return result
|
|
236
|
-
|
|
237
|
-
|
|
229
|
+
last_attempt.attempt = len(result.attempts)
|
|
230
|
+
result.attempts.append(last_attempt)
|
|
231
|
+
assert result.status is not ScanStatus.INCOMPLETE
|
|
232
|
+
assert result.status is not ScanStatus.ABANDONED
|
|
233
|
+
assert result.log_model is not None, "Only set to None after converting to JSON."
|
|
234
|
+
assert result.metadata_model is not None, "Only set to None after converting to JSON."
|
|
235
|
+
|
|
236
|
+
if len(result.log_model.attempts) < len(result.attempts):
|
|
237
|
+
# Logs were not found for this attempt; must have been a hard error
|
|
238
|
+
# that kept the `finally` block from running or otherwise
|
|
239
|
+
# interrupted the writing of the logs.
|
|
240
|
+
result.log_model.attempts.append(None)
|
|
241
|
+
if result.status is ScanStatus.SUCCESSFUL:
|
|
242
|
+
# But we found the metadata! Either that hard error happened
|
|
243
|
+
# at a very unlucky time (in between those two writes), or
|
|
244
|
+
# something even weirder happened.
|
|
245
|
+
result.attempts[-1].status = QuantumAttemptStatus.LOGS_MISSING
|
|
246
|
+
else:
|
|
247
|
+
result.attempts[-1].status = QuantumAttemptStatus.FAILED
|
|
248
|
+
if len(result.metadata_model.attempts) < len(result.attempts):
|
|
249
|
+
# Metadata missing usually just means a failure. In any case, the
|
|
250
|
+
# status will already be correct, either because it was set to a
|
|
251
|
+
# failure when we read the logs, or left at UNKNOWN if there were
|
|
252
|
+
# no logs. Note that scanners never process BLOCKED quanta at all.
|
|
253
|
+
result.metadata_model.attempts.append(None)
|
|
254
|
+
assert len(result.log_model.attempts) == len(result.attempts) or len(
|
|
255
|
+
result.metadata_model.attempts
|
|
256
|
+
) == len(result.attempts), (
|
|
257
|
+
"The only way we can add more than one quantum attempt is by "
|
|
258
|
+
"extracting info stored with the logs, and that always appends "
|
|
259
|
+
"a log attempt and a metadata attempt, so this must be a bug in "
|
|
260
|
+
"the scanner."
|
|
261
|
+
)
|
|
262
|
+
# Now that we're done gathering the log and metadata information into
|
|
263
|
+
# models, dump them to JSON and delete the originals.
|
|
264
|
+
result.log_content = result.log_model.model_dump_json().encode()
|
|
265
|
+
result.log_model = None
|
|
266
|
+
result.metadata_content = result.metadata_model.model_dump_json().encode()
|
|
267
|
+
result.metadata_model = None
|
|
268
|
+
if self.compressor is not None:
|
|
269
|
+
if result.log_content is not None:
|
|
270
|
+
result.log_content = self.compressor.compress(result.log_content)
|
|
271
|
+
if result.metadata_content is not None:
|
|
272
|
+
result.metadata_content = self.compressor.compress(result.metadata_content)
|
|
273
|
+
result.is_compressed = True
|
|
274
|
+
# Scan for output dataset existence, skipping any the metadata reported
|
|
275
|
+
# as having been definitively written, as well as the metadata and
|
|
276
|
+
# logs themselves (since we just checked those).
|
|
238
277
|
for predicted_output in itertools.chain.from_iterable(predicted_quantum.outputs.values()):
|
|
239
278
|
if predicted_output.dataset_id not in result.existing_outputs and self.scan_dataset(
|
|
240
279
|
predicted_output
|
|
@@ -242,8 +281,6 @@ class Scanner:
|
|
|
242
281
|
result.existing_outputs.add(predicted_output.dataset_id)
|
|
243
282
|
to_ingest = self._make_ingest_request(predicted_quantum, result)
|
|
244
283
|
self.comms.report_scan(ScanReport(result.quantum_id, result.status))
|
|
245
|
-
assert result.status is not ScanStatus.INCOMPLETE
|
|
246
|
-
assert result.status is not ScanStatus.ABANDONED
|
|
247
284
|
if self.comms.config.output_path is not None:
|
|
248
285
|
self.comms.request_write(result)
|
|
249
286
|
self.comms.request_ingest(to_ingest)
|
|
@@ -279,9 +316,12 @@ class Scanner:
|
|
|
279
316
|
to_ingest_records = self.qbb._datastore.export_predicted_records(to_ingest_refs)
|
|
280
317
|
return IngestRequest(result.quantum_id, to_ingest_predicted, to_ingest_records)
|
|
281
318
|
|
|
282
|
-
def
|
|
283
|
-
self,
|
|
284
|
-
|
|
319
|
+
def _read_metadata(
|
|
320
|
+
self,
|
|
321
|
+
predicted_quantum: PredictedQuantumDatasetsModel,
|
|
322
|
+
result: ScanResult,
|
|
323
|
+
last_attempt: ProvenanceQuantumAttemptModel,
|
|
324
|
+
) -> bool:
|
|
285
325
|
"""Attempt to read the metadata dataset for a quantum to extract
|
|
286
326
|
provenance information from it.
|
|
287
327
|
|
|
@@ -291,53 +331,62 @@ class Scanner:
|
|
|
291
331
|
Information about the predicted quantum.
|
|
292
332
|
result : `ScanResult`
|
|
293
333
|
Result object to be modified in-place.
|
|
334
|
+
last_attempt : `ProvenanceQuantumAttemptModel`
|
|
335
|
+
Structure to fill in with information about the last attempt to
|
|
336
|
+
run this quantum.
|
|
294
337
|
|
|
295
338
|
Returns
|
|
296
339
|
-------
|
|
297
|
-
|
|
298
|
-
|
|
340
|
+
complete : `bool`
|
|
341
|
+
Whether the quantum is complete.
|
|
299
342
|
"""
|
|
300
|
-
assert not result.metadata, "We shouldn't be scanning again if we already read the metadata."
|
|
301
343
|
(predicted_dataset,) = predicted_quantum.outputs[acc.METADATA_OUTPUT_CONNECTION_NAME]
|
|
302
344
|
ref = self.reader.components.make_dataset_ref(predicted_dataset)
|
|
303
345
|
try:
|
|
304
346
|
# This assumes QBB metadata writes are atomic, which should be the
|
|
305
347
|
# case. If it's not we'll probably get pydantic validation errors
|
|
306
348
|
# here.
|
|
307
|
-
|
|
349
|
+
metadata: TaskMetadata = self.qbb.get(ref, storageClass="TaskMetadata")
|
|
308
350
|
except FileNotFoundError:
|
|
309
|
-
if
|
|
310
|
-
|
|
351
|
+
if self.comms.config.assume_complete:
|
|
352
|
+
result.status = ScanStatus.FAILED
|
|
353
|
+
else:
|
|
354
|
+
result.status = ScanStatus.ABANDONED
|
|
355
|
+
return False
|
|
311
356
|
else:
|
|
357
|
+
result.status = ScanStatus.SUCCESSFUL
|
|
358
|
+
result.existing_outputs.add(ref.id)
|
|
359
|
+
last_attempt.status = QuantumAttemptStatus.SUCCESSFUL
|
|
312
360
|
try:
|
|
313
361
|
# Int conversion guards against spurious conversion to
|
|
314
362
|
# float that can apparently sometimes happen in
|
|
315
363
|
# TaskMetadata.
|
|
316
|
-
|
|
364
|
+
last_attempt.caveats = QuantumSuccessCaveats(int(metadata["quantum"]["caveats"]))
|
|
317
365
|
except LookupError:
|
|
318
366
|
pass
|
|
319
367
|
try:
|
|
320
|
-
|
|
321
|
-
|
|
368
|
+
last_attempt.exception = ExceptionInfo._from_metadata(
|
|
369
|
+
metadata[predicted_quantum.task_label]["failure"]
|
|
322
370
|
)
|
|
323
371
|
except LookupError:
|
|
324
372
|
pass
|
|
325
373
|
try:
|
|
326
|
-
result.existing_outputs
|
|
327
|
-
uuid.UUID(id_str) for id_str in ensure_iterable(
|
|
328
|
-
|
|
374
|
+
result.existing_outputs.update(
|
|
375
|
+
uuid.UUID(id_str) for id_str in ensure_iterable(metadata["quantum"].getArray("outputs"))
|
|
376
|
+
)
|
|
329
377
|
except LookupError:
|
|
330
378
|
pass
|
|
331
|
-
|
|
332
|
-
result.
|
|
333
|
-
|
|
334
|
-
|
|
335
|
-
|
|
336
|
-
|
|
337
|
-
|
|
338
|
-
|
|
339
|
-
|
|
340
|
-
|
|
379
|
+
last_attempt.resource_usage = QuantumResourceUsage.from_task_metadata(metadata)
|
|
380
|
+
assert result.metadata_model is not None, "Only set to None after converting to JSON."
|
|
381
|
+
result.metadata_model.attempts.append(metadata)
|
|
382
|
+
return True
|
|
383
|
+
|
|
384
|
+
def _read_log(
|
|
385
|
+
self,
|
|
386
|
+
predicted_quantum: PredictedQuantumDatasetsModel,
|
|
387
|
+
result: ScanResult,
|
|
388
|
+
last_attempt: ProvenanceQuantumAttemptModel,
|
|
389
|
+
) -> bool:
|
|
341
390
|
"""Attempt to read the log dataset for a quantum to test for the
|
|
342
391
|
quantum's completion (the log is always written last) and aggregate
|
|
343
392
|
the log content in the provenance quantum graph.
|
|
@@ -348,24 +397,76 @@ class Scanner:
|
|
|
348
397
|
Information about the predicted quantum.
|
|
349
398
|
result : `ScanResult`
|
|
350
399
|
Result object to be modified in-place.
|
|
400
|
+
last_attempt : `ProvenanceQuantumAttemptModel`
|
|
401
|
+
Structure to fill in with information about the last attempt to
|
|
402
|
+
run this quantum.
|
|
351
403
|
|
|
352
404
|
Returns
|
|
353
405
|
-------
|
|
354
|
-
|
|
355
|
-
|
|
406
|
+
complete : `bool`
|
|
407
|
+
Whether the quantum is complete.
|
|
356
408
|
"""
|
|
357
409
|
(predicted_dataset,) = predicted_quantum.outputs[acc.LOG_OUTPUT_CONNECTION_NAME]
|
|
358
410
|
ref = self.reader.components.make_dataset_ref(predicted_dataset)
|
|
359
411
|
try:
|
|
360
412
|
# This assumes QBB log writes are atomic, which should be the case.
|
|
361
413
|
# If it's not we'll probably get pydantic validation errors here.
|
|
362
|
-
|
|
414
|
+
log_records: ButlerLogRecords = self.qbb.get(ref)
|
|
363
415
|
except FileNotFoundError:
|
|
364
|
-
if
|
|
365
|
-
|
|
416
|
+
if self.comms.config.assume_complete:
|
|
417
|
+
result.status = ScanStatus.FAILED
|
|
418
|
+
else:
|
|
419
|
+
result.status = ScanStatus.ABANDONED
|
|
420
|
+
return False
|
|
366
421
|
else:
|
|
367
|
-
|
|
368
|
-
|
|
369
|
-
|
|
370
|
-
|
|
371
|
-
|
|
422
|
+
# Set the attempt's run status to FAILED, since the default is
|
|
423
|
+
# UNKNOWN (i.e. logs *and* metadata are missing) and we now know
|
|
424
|
+
# the logs exist. This will usually get replaced by SUCCESSFUL
|
|
425
|
+
# when we look for metadata next.
|
|
426
|
+
last_attempt.status = QuantumAttemptStatus.FAILED
|
|
427
|
+
result.existing_outputs.add(ref.id)
|
|
428
|
+
if log_records.extra:
|
|
429
|
+
log_extra = _ExecutionLogRecordsExtra.model_validate(log_records.extra)
|
|
430
|
+
self._extract_from_log_extra(log_extra, result, last_attempt=last_attempt)
|
|
431
|
+
assert result.log_model is not None, "Only set to None after converting to JSON."
|
|
432
|
+
result.log_model.attempts.append(list(log_records))
|
|
433
|
+
return True
|
|
434
|
+
|
|
435
|
+
def _extract_from_log_extra(
|
|
436
|
+
self,
|
|
437
|
+
log_extra: _ExecutionLogRecordsExtra,
|
|
438
|
+
result: ScanResult,
|
|
439
|
+
last_attempt: ProvenanceQuantumAttemptModel | None,
|
|
440
|
+
) -> None:
|
|
441
|
+
for previous_attempt_log_extra in log_extra.previous_attempts:
|
|
442
|
+
self._extract_from_log_extra(previous_attempt_log_extra, result, last_attempt=None)
|
|
443
|
+
quantum_attempt: ProvenanceQuantumAttemptModel
|
|
444
|
+
if last_attempt is None:
|
|
445
|
+
# This is not the last attempt, so it must be a failure.
|
|
446
|
+
quantum_attempt = ProvenanceQuantumAttemptModel(
|
|
447
|
+
attempt=len(result.attempts), status=QuantumAttemptStatus.FAILED
|
|
448
|
+
)
|
|
449
|
+
# We also need to get the logs from this extra provenance, since
|
|
450
|
+
# they won't be the main section of the log records.
|
|
451
|
+
assert result.log_model is not None, "Only set to None after converting to JSON."
|
|
452
|
+
result.log_model.attempts.append(log_extra.logs)
|
|
453
|
+
# The special last attempt is only appended after we attempt to
|
|
454
|
+
# read metadata later, but we have to append this one now.
|
|
455
|
+
result.attempts.append(quantum_attempt)
|
|
456
|
+
else:
|
|
457
|
+
assert not log_extra.logs, "Logs for the last attempt should not be stored in the extra JSON."
|
|
458
|
+
quantum_attempt = last_attempt
|
|
459
|
+
if log_extra.exception is not None or log_extra.metadata is not None or last_attempt is None:
|
|
460
|
+
# We won't be getting a separate metadata dataset, so anything we
|
|
461
|
+
# might get from the metadata has to come from this extra
|
|
462
|
+
# provenance in the logs.
|
|
463
|
+
quantum_attempt.exception = log_extra.exception
|
|
464
|
+
assert result.metadata_model is not None, "Only set to None after converting to JSON."
|
|
465
|
+
if log_extra.metadata is not None:
|
|
466
|
+
quantum_attempt.resource_usage = QuantumResourceUsage.from_task_metadata(log_extra.metadata)
|
|
467
|
+
result.metadata_model.attempts.append(log_extra.metadata)
|
|
468
|
+
else:
|
|
469
|
+
result.metadata_model.attempts.append(None)
|
|
470
|
+
# Regardless of whether this is the last attempt or not, we can only
|
|
471
|
+
# get the previous_process_quanta from the log extra.
|
|
472
|
+
quantum_attempt.previous_process_quanta.extend(log_extra.previous_process_quanta)
|
|
@@ -40,11 +40,13 @@ import uuid
|
|
|
40
40
|
|
|
41
41
|
from lsst.daf.butler.datastore.record_data import DatastoreRecordData
|
|
42
42
|
|
|
43
|
-
from ..._status import QuantumSuccessCaveats
|
|
44
|
-
from ...quantum_provenance_graph import ExceptionInfo, QuantumRunStatus
|
|
45
|
-
from ...resource_usage import QuantumResourceUsage
|
|
46
43
|
from .._common import DatastoreName
|
|
47
44
|
from .._predicted import PredictedDatasetModel
|
|
45
|
+
from .._provenance import (
|
|
46
|
+
ProvenanceLogRecordsModel,
|
|
47
|
+
ProvenanceQuantumAttemptModel,
|
|
48
|
+
ProvenanceTaskMetadataModel,
|
|
49
|
+
)
|
|
48
50
|
|
|
49
51
|
|
|
50
52
|
class ScanStatus(enum.Enum):
|
|
@@ -126,42 +128,33 @@ class ScanResult:
|
|
|
126
128
|
status: ScanStatus
|
|
127
129
|
"""Combined status for the scan and the execution of the quantum."""
|
|
128
130
|
|
|
129
|
-
|
|
130
|
-
"""
|
|
131
|
-
|
|
132
|
-
exception: ExceptionInfo | None = None
|
|
133
|
-
"""Information about an exception raised when the quantum was executing."""
|
|
134
|
-
|
|
135
|
-
resource_usage: QuantumResourceUsage | None = None
|
|
136
|
-
"""Resource usage information (timing, memory use) for this quantum."""
|
|
131
|
+
attempts: list[ProvenanceQuantumAttemptModel] = dataclasses.field(default_factory=list)
|
|
132
|
+
"""Provenance information about each attempt to run the quantum."""
|
|
137
133
|
|
|
138
134
|
existing_outputs: set[uuid.UUID] = dataclasses.field(default_factory=set)
|
|
139
135
|
"""Unique IDs of the output datasets that were actually written."""
|
|
140
136
|
|
|
141
|
-
|
|
142
|
-
|
|
137
|
+
metadata_model: ProvenanceTaskMetadataModel | None = dataclasses.field(
|
|
138
|
+
default_factory=ProvenanceTaskMetadataModel
|
|
139
|
+
)
|
|
140
|
+
"""Task metadata information for each attempt.
|
|
141
|
+
|
|
142
|
+
This is set to `None` to keep the pickle size small after it is saved
|
|
143
|
+
to `metadata_content`.
|
|
144
|
+
"""
|
|
145
|
+
|
|
146
|
+
metadata_content: bytes = b""
|
|
147
|
+
"""Serialized form of `metadata_model`."""
|
|
148
|
+
|
|
149
|
+
log_model: ProvenanceLogRecordsModel | None = dataclasses.field(default_factory=ProvenanceLogRecordsModel)
|
|
150
|
+
"""Log records for each attempt.
|
|
143
151
|
|
|
144
|
-
|
|
145
|
-
|
|
152
|
+
This is set to `None` to keep the pickle size small after it is saved
|
|
153
|
+
to `log_content`.
|
|
154
|
+
"""
|
|
155
|
+
|
|
156
|
+
log_content: bytes = b""
|
|
157
|
+
"""Serialized form of `log_model`."""
|
|
146
158
|
|
|
147
159
|
is_compressed: bool = False
|
|
148
160
|
"""Whether the `metadata_content` and `log_content` attributes are compressed."""
|
|
149
|
-
|
|
150
|
-
def get_run_status(self) -> QuantumRunStatus:
|
|
151
|
-
"""Translate the scan status and metadata/log presence into a run
|
|
152
|
-
status.
|
|
153
|
-
"""
|
|
154
|
-
if self.status is ScanStatus.BLOCKED:
|
|
155
|
-
return QuantumRunStatus.BLOCKED
|
|
156
|
-
if self.status is ScanStatus.INIT:
|
|
157
|
-
return QuantumRunStatus.SUCCESSFUL
|
|
158
|
-
if self.log:
|
|
159
|
-
if self.metadata:
|
|
160
|
-
return QuantumRunStatus.SUCCESSFUL
|
|
161
|
-
else:
|
|
162
|
-
return QuantumRunStatus.FAILED
|
|
163
|
-
else:
|
|
164
|
-
if self.metadata:
|
|
165
|
-
return QuantumRunStatus.LOGS_MISSING
|
|
166
|
-
else:
|
|
167
|
-
return QuantumRunStatus.METADATA_MISSING
|