lsst-pipe-base 30.2026.200__py3-none-any.whl → 30.2026.400__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lsst/pipe/base/_instrument.py +10 -12
- lsst/pipe/base/_status.py +29 -10
- lsst/pipe/base/automatic_connection_constants.py +9 -1
- lsst/pipe/base/cli/cmd/__init__.py +16 -2
- lsst/pipe/base/cli/cmd/commands.py +42 -4
- lsst/pipe/base/connectionTypes.py +72 -160
- lsst/pipe/base/connections.py +3 -6
- lsst/pipe/base/execution_reports.py +0 -5
- lsst/pipe/base/log_capture.py +8 -4
- lsst/pipe/base/log_on_close.py +79 -0
- lsst/pipe/base/mp_graph_executor.py +51 -15
- lsst/pipe/base/pipeline.py +3 -4
- lsst/pipe/base/pipelineIR.py +0 -6
- lsst/pipe/base/pipelineTask.py +5 -7
- lsst/pipe/base/pipeline_graph/_edges.py +19 -7
- lsst/pipe/base/pipeline_graph/_pipeline_graph.py +8 -0
- lsst/pipe/base/quantum_graph/_common.py +7 -4
- lsst/pipe/base/quantum_graph/_multiblock.py +6 -16
- lsst/pipe/base/quantum_graph/_predicted.py +111 -10
- lsst/pipe/base/quantum_graph/_provenance.py +727 -26
- lsst/pipe/base/quantum_graph/aggregator/_communicators.py +26 -50
- lsst/pipe/base/quantum_graph/aggregator/_config.py +78 -9
- lsst/pipe/base/quantum_graph/aggregator/_ingester.py +12 -11
- lsst/pipe/base/quantum_graph/aggregator/_scanner.py +48 -234
- lsst/pipe/base/quantum_graph/aggregator/_structs.py +6 -116
- lsst/pipe/base/quantum_graph/aggregator/_supervisor.py +24 -18
- lsst/pipe/base/quantum_graph/aggregator/_writer.py +33 -350
- lsst/pipe/base/quantum_graph/formatter.py +171 -0
- lsst/pipe/base/quantum_graph/ingest_graph.py +356 -0
- lsst/pipe/base/quantum_graph_executor.py +116 -13
- lsst/pipe/base/quantum_provenance_graph.py +17 -2
- lsst/pipe/base/separable_pipeline_executor.py +18 -2
- lsst/pipe/base/single_quantum_executor.py +59 -41
- lsst/pipe/base/struct.py +4 -0
- lsst/pipe/base/version.py +1 -1
- {lsst_pipe_base-30.2026.200.dist-info → lsst_pipe_base-30.2026.400.dist-info}/METADATA +2 -1
- {lsst_pipe_base-30.2026.200.dist-info → lsst_pipe_base-30.2026.400.dist-info}/RECORD +45 -42
- {lsst_pipe_base-30.2026.200.dist-info → lsst_pipe_base-30.2026.400.dist-info}/WHEEL +1 -1
- {lsst_pipe_base-30.2026.200.dist-info → lsst_pipe_base-30.2026.400.dist-info}/entry_points.txt +0 -0
- {lsst_pipe_base-30.2026.200.dist-info → lsst_pipe_base-30.2026.400.dist-info}/licenses/COPYRIGHT +0 -0
- {lsst_pipe_base-30.2026.200.dist-info → lsst_pipe_base-30.2026.400.dist-info}/licenses/LICENSE +0 -0
- {lsst_pipe_base-30.2026.200.dist-info → lsst_pipe_base-30.2026.400.dist-info}/licenses/bsd_license.txt +0 -0
- {lsst_pipe_base-30.2026.200.dist-info → lsst_pipe_base-30.2026.400.dist-info}/licenses/gpl-v3.0.txt +0 -0
- {lsst_pipe_base-30.2026.200.dist-info → lsst_pipe_base-30.2026.400.dist-info}/top_level.txt +0 -0
- {lsst_pipe_base-30.2026.200.dist-info → lsst_pipe_base-30.2026.400.dist-info}/zip-safe +0 -0
|
@@ -38,23 +38,19 @@ from typing import Any, Literal, Self
|
|
|
38
38
|
import zstandard
|
|
39
39
|
|
|
40
40
|
from lsst.daf.butler import ButlerLogRecords, DatasetRef, QuantumBackedButler
|
|
41
|
-
from lsst.utils.iteration import ensure_iterable
|
|
42
41
|
|
|
43
42
|
from ... import automatic_connection_constants as acc
|
|
44
|
-
from ..._status import ExceptionInfo, QuantumAttemptStatus, QuantumSuccessCaveats
|
|
45
43
|
from ..._task_metadata import TaskMetadata
|
|
46
|
-
from ...log_capture import _ExecutionLogRecordsExtra
|
|
47
44
|
from ...pipeline_graph import PipelineGraph, TaskImportMode
|
|
48
|
-
from ...resource_usage import QuantumResourceUsage
|
|
49
45
|
from .._multiblock import Compressor
|
|
50
46
|
from .._predicted import (
|
|
51
47
|
PredictedDatasetModel,
|
|
52
48
|
PredictedQuantumDatasetsModel,
|
|
53
49
|
PredictedQuantumGraphReader,
|
|
54
50
|
)
|
|
55
|
-
from .._provenance import
|
|
51
|
+
from .._provenance import ProvenanceQuantumScanModels, ProvenanceQuantumScanStatus
|
|
56
52
|
from ._communicators import ScannerCommunicator
|
|
57
|
-
from ._structs import IngestRequest,
|
|
53
|
+
from ._structs import IngestRequest, ScanReport
|
|
58
54
|
|
|
59
55
|
|
|
60
56
|
@dataclasses.dataclass
|
|
@@ -94,7 +90,7 @@ class Scanner(AbstractContextManager):
|
|
|
94
90
|
if self.comms.config.mock_storage_classes:
|
|
95
91
|
import lsst.pipe.base.tests.mocks # noqa: F401
|
|
96
92
|
self.comms.log.verbose("Reading from predicted quantum graph.")
|
|
97
|
-
self.reader = self.comms.
|
|
93
|
+
self.reader = self.comms.exit_stack.enter_context(
|
|
98
94
|
PredictedQuantumGraphReader.open(self.predicted_path, import_mode=TaskImportMode.DO_NOT_IMPORT)
|
|
99
95
|
)
|
|
100
96
|
self.reader.read_dimension_data()
|
|
@@ -196,7 +192,7 @@ class Scanner(AbstractContextManager):
|
|
|
196
192
|
ref = self.reader.components.make_dataset_ref(predicted)
|
|
197
193
|
return self.qbb.stored(ref)
|
|
198
194
|
|
|
199
|
-
def scan_quantum(self, quantum_id: uuid.UUID) ->
|
|
195
|
+
def scan_quantum(self, quantum_id: uuid.UUID) -> ProvenanceQuantumScanModels:
|
|
200
196
|
"""Scan for a quantum's completion and error status, and its output
|
|
201
197
|
datasets' existence.
|
|
202
198
|
|
|
@@ -207,76 +203,38 @@ class Scanner(AbstractContextManager):
|
|
|
207
203
|
|
|
208
204
|
Returns
|
|
209
205
|
-------
|
|
210
|
-
result : `
|
|
206
|
+
result : `ProvenanceQuantumScanModels`
|
|
211
207
|
Scan result struct.
|
|
212
208
|
"""
|
|
213
209
|
if (predicted_quantum := self.init_quanta.get(quantum_id)) is not None:
|
|
214
|
-
result =
|
|
210
|
+
result = ProvenanceQuantumScanModels(
|
|
211
|
+
predicted_quantum.quantum_id, status=ProvenanceQuantumScanStatus.INIT
|
|
212
|
+
)
|
|
215
213
|
self.comms.log.debug("Created init scan for %s (%s)", quantum_id, predicted_quantum.task_label)
|
|
216
214
|
else:
|
|
217
215
|
self.reader.read_quantum_datasets([quantum_id])
|
|
218
|
-
predicted_quantum = self.reader.components.quantum_datasets
|
|
216
|
+
predicted_quantum = self.reader.components.quantum_datasets.pop(quantum_id)
|
|
219
217
|
self.comms.log.debug(
|
|
220
218
|
"Scanning %s (%s@%s)",
|
|
221
219
|
quantum_id,
|
|
222
220
|
predicted_quantum.task_label,
|
|
223
221
|
predicted_quantum.data_coordinate,
|
|
224
222
|
)
|
|
225
|
-
|
|
226
|
-
|
|
227
|
-
|
|
228
|
-
|
|
229
|
-
|
|
230
|
-
|
|
231
|
-
|
|
232
|
-
if not self._read_metadata(predicted_quantum, result, last_attempt):
|
|
233
|
-
# We found the log dataset, but no metadata; this means the
|
|
234
|
-
# quantum failed, but a retry might still happen that could
|
|
235
|
-
# turn it into a success if we can't yet assume the run is
|
|
236
|
-
# complete.
|
|
237
|
-
self.comms.log.debug("Abandoning scan for %s.", quantum_id)
|
|
223
|
+
logs = self._read_log(predicted_quantum)
|
|
224
|
+
metadata = self._read_metadata(predicted_quantum)
|
|
225
|
+
result = ProvenanceQuantumScanModels.from_metadata_and_logs(
|
|
226
|
+
predicted_quantum, metadata, logs, incomplete=self.comms.config.incomplete
|
|
227
|
+
)
|
|
228
|
+
if result.status is ProvenanceQuantumScanStatus.ABANDONED:
|
|
229
|
+
self.comms.log.debug("Abandoning scan for failed quantum %s.", quantum_id)
|
|
238
230
|
self.comms.report_scan(ScanReport(result.quantum_id, result.status))
|
|
239
231
|
return result
|
|
240
|
-
last_attempt.attempt = len(result.attempts)
|
|
241
|
-
result.attempts.append(last_attempt)
|
|
242
|
-
assert result.status is not ScanStatus.INCOMPLETE
|
|
243
|
-
assert result.status is not ScanStatus.ABANDONED
|
|
244
|
-
|
|
245
|
-
if len(result.logs.attempts) < len(result.attempts):
|
|
246
|
-
# Logs were not found for this attempt; must have been a hard error
|
|
247
|
-
# that kept the `finally` block from running or otherwise
|
|
248
|
-
# interrupted the writing of the logs.
|
|
249
|
-
result.logs.attempts.append(None)
|
|
250
|
-
if result.status is ScanStatus.SUCCESSFUL:
|
|
251
|
-
# But we found the metadata! Either that hard error happened
|
|
252
|
-
# at a very unlucky time (in between those two writes), or
|
|
253
|
-
# something even weirder happened.
|
|
254
|
-
result.attempts[-1].status = QuantumAttemptStatus.LOGS_MISSING
|
|
255
|
-
else:
|
|
256
|
-
result.attempts[-1].status = QuantumAttemptStatus.FAILED
|
|
257
|
-
if len(result.metadata.attempts) < len(result.attempts):
|
|
258
|
-
# Metadata missing usually just means a failure. In any case, the
|
|
259
|
-
# status will already be correct, either because it was set to a
|
|
260
|
-
# failure when we read the logs, or left at UNKNOWN if there were
|
|
261
|
-
# no logs. Note that scanners never process BLOCKED quanta at all.
|
|
262
|
-
result.metadata.attempts.append(None)
|
|
263
|
-
assert len(result.logs.attempts) == len(result.attempts) or len(result.metadata.attempts) == len(
|
|
264
|
-
result.attempts
|
|
265
|
-
), (
|
|
266
|
-
"The only way we can add more than one quantum attempt is by "
|
|
267
|
-
"extracting info stored with the logs, and that always appends "
|
|
268
|
-
"a log attempt and a metadata attempt, so this must be a bug in "
|
|
269
|
-
"the scanner."
|
|
270
|
-
)
|
|
271
|
-
# Scan for output dataset existence, skipping any the metadata reported
|
|
272
|
-
# on as well as and the metadata and logs themselves (since we just
|
|
273
|
-
# checked those).
|
|
274
232
|
for predicted_output in itertools.chain.from_iterable(predicted_quantum.outputs.values()):
|
|
275
|
-
if predicted_output.dataset_id not in result.
|
|
276
|
-
result.
|
|
233
|
+
if predicted_output.dataset_id not in result.output_existence:
|
|
234
|
+
result.output_existence[predicted_output.dataset_id] = self.scan_dataset(predicted_output)
|
|
277
235
|
to_ingest = self._make_ingest_request(predicted_quantum, result)
|
|
278
|
-
if self.comms.config.
|
|
279
|
-
to_write =
|
|
236
|
+
if self.comms.config.is_writing_provenance:
|
|
237
|
+
to_write = result.to_scan_data(predicted_quantum, compressor=self.compressor)
|
|
280
238
|
self.comms.request_write(to_write)
|
|
281
239
|
self.comms.request_ingest(to_ingest)
|
|
282
240
|
self.comms.report_scan(ScanReport(result.quantum_id, result.status))
|
|
@@ -284,7 +242,7 @@ class Scanner(AbstractContextManager):
|
|
|
284
242
|
return result
|
|
285
243
|
|
|
286
244
|
def _make_ingest_request(
|
|
287
|
-
self, predicted_quantum: PredictedQuantumDatasetsModel, result:
|
|
245
|
+
self, predicted_quantum: PredictedQuantumDatasetsModel, result: ProvenanceQuantumScanModels
|
|
288
246
|
) -> IngestRequest:
|
|
289
247
|
"""Make an ingest request from a quantum scan.
|
|
290
248
|
|
|
@@ -292,7 +250,7 @@ class Scanner(AbstractContextManager):
|
|
|
292
250
|
----------
|
|
293
251
|
predicted_quantum : `PredictedQuantumDatasetsModel`
|
|
294
252
|
Information about the predicted quantum.
|
|
295
|
-
result : `
|
|
253
|
+
result : `ProvenanceQuantumScanModels`
|
|
296
254
|
Result of a quantum scan.
|
|
297
255
|
|
|
298
256
|
Returns
|
|
@@ -303,79 +261,36 @@ class Scanner(AbstractContextManager):
|
|
|
303
261
|
predicted_outputs_by_id = {
|
|
304
262
|
d.dataset_id: d for d in itertools.chain.from_iterable(predicted_quantum.outputs.values())
|
|
305
263
|
}
|
|
306
|
-
to_ingest_predicted: list[PredictedDatasetModel] = []
|
|
307
264
|
to_ingest_refs: list[DatasetRef] = []
|
|
308
|
-
|
|
309
|
-
|
|
265
|
+
to_ignore: set[uuid.UUID] = set()
|
|
266
|
+
if self.comms.config.promise_ingest_graph:
|
|
267
|
+
if result.status is ProvenanceQuantumScanStatus.INIT:
|
|
268
|
+
if predicted_quantum.task_label: # i.e. not the 'packages' producer
|
|
269
|
+
to_ignore.add(
|
|
270
|
+
predicted_quantum.outputs[acc.CONFIG_INIT_OUTPUT_CONNECTION_NAME][0].dataset_id
|
|
271
|
+
)
|
|
272
|
+
else:
|
|
273
|
+
to_ignore.add(predicted_quantum.outputs[acc.METADATA_OUTPUT_CONNECTION_NAME][0].dataset_id)
|
|
274
|
+
to_ignore.add(predicted_quantum.outputs[acc.LOG_OUTPUT_CONNECTION_NAME][0].dataset_id)
|
|
275
|
+
for dataset_id, was_produced in result.output_existence.items():
|
|
276
|
+
if was_produced and dataset_id not in to_ignore:
|
|
310
277
|
predicted_output = predicted_outputs_by_id[dataset_id]
|
|
311
|
-
to_ingest_predicted.append(predicted_output)
|
|
312
278
|
to_ingest_refs.append(self.reader.components.make_dataset_ref(predicted_output))
|
|
313
279
|
to_ingest_records = self.qbb._datastore.export_predicted_records(to_ingest_refs)
|
|
314
|
-
return IngestRequest(result.quantum_id,
|
|
280
|
+
return IngestRequest(result.quantum_id, to_ingest_refs, to_ingest_records)
|
|
315
281
|
|
|
316
|
-
def
|
|
317
|
-
|
|
318
|
-
) -> WriteRequest:
|
|
319
|
-
"""Make a write request from a quantum scan.
|
|
282
|
+
def _read_metadata(self, predicted_quantum: PredictedQuantumDatasetsModel) -> TaskMetadata | None:
|
|
283
|
+
"""Attempt to read the metadata dataset for a quantum.
|
|
320
284
|
|
|
321
285
|
Parameters
|
|
322
286
|
----------
|
|
323
287
|
predicted_quantum : `PredictedQuantumDatasetsModel`
|
|
324
288
|
Information about the predicted quantum.
|
|
325
|
-
result : `InProgressScan`
|
|
326
|
-
Result of a quantum scan.
|
|
327
289
|
|
|
328
290
|
Returns
|
|
329
291
|
-------
|
|
330
|
-
|
|
331
|
-
|
|
332
|
-
"""
|
|
333
|
-
quantum: ProvenanceInitQuantumModel | ProvenanceQuantumModel
|
|
334
|
-
if result.status is ScanStatus.INIT:
|
|
335
|
-
quantum = ProvenanceInitQuantumModel.from_predicted(predicted_quantum)
|
|
336
|
-
else:
|
|
337
|
-
quantum = ProvenanceQuantumModel.from_predicted(predicted_quantum)
|
|
338
|
-
quantum.attempts = result.attempts
|
|
339
|
-
request = WriteRequest(
|
|
340
|
-
result.quantum_id,
|
|
341
|
-
result.status,
|
|
342
|
-
existing_outputs={
|
|
343
|
-
dataset_id for dataset_id, was_produced in result.outputs.items() if was_produced
|
|
344
|
-
},
|
|
345
|
-
quantum=quantum.model_dump_json().encode(),
|
|
346
|
-
logs=result.logs.model_dump_json().encode() if result.logs.attempts else b"",
|
|
347
|
-
metadata=result.metadata.model_dump_json().encode() if result.metadata.attempts else b"",
|
|
348
|
-
)
|
|
349
|
-
if self.compressor is not None:
|
|
350
|
-
request.quantum = self.compressor.compress(request.quantum)
|
|
351
|
-
request.logs = self.compressor.compress(request.logs) if request.logs else b""
|
|
352
|
-
request.metadata = self.compressor.compress(request.metadata) if request.metadata else b""
|
|
353
|
-
request.is_compressed = True
|
|
354
|
-
return request
|
|
355
|
-
|
|
356
|
-
def _read_metadata(
|
|
357
|
-
self,
|
|
358
|
-
predicted_quantum: PredictedQuantumDatasetsModel,
|
|
359
|
-
result: InProgressScan,
|
|
360
|
-
last_attempt: ProvenanceQuantumAttemptModel,
|
|
361
|
-
) -> bool:
|
|
362
|
-
"""Attempt to read the metadata dataset for a quantum to extract
|
|
363
|
-
provenance information from it.
|
|
364
|
-
|
|
365
|
-
Parameters
|
|
366
|
-
----------
|
|
367
|
-
predicted_quantum : `PredictedQuantumDatasetsModel`
|
|
368
|
-
Information about the predicted quantum.
|
|
369
|
-
result : `InProgressScan`
|
|
370
|
-
Result object to be modified in-place.
|
|
371
|
-
last_attempt : `ScanningProvenanceQuantumAttemptModel`
|
|
372
|
-
Structure to fill in with information about the last attempt to
|
|
373
|
-
run this quantum.
|
|
374
|
-
|
|
375
|
-
Returns
|
|
376
|
-
-------
|
|
377
|
-
complete : `bool`
|
|
378
|
-
Whether the quantum is complete.
|
|
292
|
+
metadata : `...TaskMetadata` or `None`
|
|
293
|
+
Task metadata.
|
|
379
294
|
"""
|
|
380
295
|
(predicted_dataset,) = predicted_quantum.outputs[acc.METADATA_OUTPUT_CONNECTION_NAME]
|
|
381
296
|
ref = self.reader.components.make_dataset_ref(predicted_dataset)
|
|
@@ -383,129 +298,28 @@ class Scanner(AbstractContextManager):
|
|
|
383
298
|
# This assumes QBB metadata writes are atomic, which should be the
|
|
384
299
|
# case. If it's not we'll probably get pydantic validation errors
|
|
385
300
|
# here.
|
|
386
|
-
|
|
301
|
+
return self.qbb.get(ref, storageClass="TaskMetadata")
|
|
387
302
|
except FileNotFoundError:
|
|
388
|
-
|
|
389
|
-
|
|
390
|
-
|
|
391
|
-
|
|
392
|
-
result.status = ScanStatus.ABANDONED
|
|
393
|
-
return False
|
|
394
|
-
else:
|
|
395
|
-
result.status = ScanStatus.SUCCESSFUL
|
|
396
|
-
result.outputs[ref.id] = True
|
|
397
|
-
last_attempt.status = QuantumAttemptStatus.SUCCESSFUL
|
|
398
|
-
try:
|
|
399
|
-
# Int conversion guards against spurious conversion to
|
|
400
|
-
# float that can apparently sometimes happen in
|
|
401
|
-
# TaskMetadata.
|
|
402
|
-
last_attempt.caveats = QuantumSuccessCaveats(int(metadata["quantum"]["caveats"]))
|
|
403
|
-
except LookupError:
|
|
404
|
-
pass
|
|
405
|
-
try:
|
|
406
|
-
last_attempt.exception = ExceptionInfo._from_metadata(
|
|
407
|
-
metadata[predicted_quantum.task_label]["failure"]
|
|
408
|
-
)
|
|
409
|
-
except LookupError:
|
|
410
|
-
pass
|
|
411
|
-
try:
|
|
412
|
-
for id_str in ensure_iterable(metadata["quantum"].getArray("outputs")):
|
|
413
|
-
result.outputs[uuid.UUID(id_str)]
|
|
414
|
-
except LookupError:
|
|
415
|
-
pass
|
|
416
|
-
else:
|
|
417
|
-
# If the metadata told us what it wrote, anything not in that
|
|
418
|
-
# list was not written.
|
|
419
|
-
for predicted_output in itertools.chain.from_iterable(predicted_quantum.outputs.values()):
|
|
420
|
-
result.outputs.setdefault(predicted_output.dataset_id, False)
|
|
421
|
-
last_attempt.resource_usage = QuantumResourceUsage.from_task_metadata(metadata)
|
|
422
|
-
result.metadata.attempts.append(metadata)
|
|
423
|
-
return True
|
|
424
|
-
|
|
425
|
-
def _read_log(
|
|
426
|
-
self,
|
|
427
|
-
predicted_quantum: PredictedQuantumDatasetsModel,
|
|
428
|
-
result: InProgressScan,
|
|
429
|
-
last_attempt: ProvenanceQuantumAttemptModel,
|
|
430
|
-
) -> bool:
|
|
431
|
-
"""Attempt to read the log dataset for a quantum to test for the
|
|
432
|
-
quantum's completion (the log is always written last) and aggregate
|
|
433
|
-
the log content in the provenance quantum graph.
|
|
303
|
+
return None
|
|
304
|
+
|
|
305
|
+
def _read_log(self, predicted_quantum: PredictedQuantumDatasetsModel) -> ButlerLogRecords | None:
|
|
306
|
+
"""Attempt to read the log dataset for a quantum.
|
|
434
307
|
|
|
435
308
|
Parameters
|
|
436
309
|
----------
|
|
437
310
|
predicted_quantum : `PredictedQuantumDatasetsModel`
|
|
438
311
|
Information about the predicted quantum.
|
|
439
|
-
result : `InProgressScan`
|
|
440
|
-
Result object to be modified in-place.
|
|
441
|
-
last_attempt : `ScanningProvenanceQuantumAttemptModel`
|
|
442
|
-
Structure to fill in with information about the last attempt to
|
|
443
|
-
run this quantum.
|
|
444
312
|
|
|
445
313
|
Returns
|
|
446
314
|
-------
|
|
447
|
-
|
|
448
|
-
|
|
315
|
+
logs : `lsst.daf.butler.logging.ButlerLogRecords` or `None`
|
|
316
|
+
Task logs.
|
|
449
317
|
"""
|
|
450
318
|
(predicted_dataset,) = predicted_quantum.outputs[acc.LOG_OUTPUT_CONNECTION_NAME]
|
|
451
319
|
ref = self.reader.components.make_dataset_ref(predicted_dataset)
|
|
452
320
|
try:
|
|
453
321
|
# This assumes QBB log writes are atomic, which should be the case.
|
|
454
322
|
# If it's not we'll probably get pydantic validation errors here.
|
|
455
|
-
|
|
323
|
+
return self.qbb.get(ref)
|
|
456
324
|
except FileNotFoundError:
|
|
457
|
-
|
|
458
|
-
if self.comms.config.assume_complete:
|
|
459
|
-
result.status = ScanStatus.FAILED
|
|
460
|
-
else:
|
|
461
|
-
result.status = ScanStatus.ABANDONED
|
|
462
|
-
return False
|
|
463
|
-
else:
|
|
464
|
-
# Set the attempt's run status to FAILED, since the default is
|
|
465
|
-
# UNKNOWN (i.e. logs *and* metadata are missing) and we now know
|
|
466
|
-
# the logs exist. This will usually get replaced by SUCCESSFUL
|
|
467
|
-
# when we look for metadata next.
|
|
468
|
-
last_attempt.status = QuantumAttemptStatus.FAILED
|
|
469
|
-
result.outputs[ref.id] = True
|
|
470
|
-
if log_records.extra:
|
|
471
|
-
log_extra = _ExecutionLogRecordsExtra.model_validate(log_records.extra)
|
|
472
|
-
self._extract_from_log_extra(log_extra, result, last_attempt=last_attempt)
|
|
473
|
-
result.logs.attempts.append(list(log_records))
|
|
474
|
-
return True
|
|
475
|
-
|
|
476
|
-
def _extract_from_log_extra(
|
|
477
|
-
self,
|
|
478
|
-
log_extra: _ExecutionLogRecordsExtra,
|
|
479
|
-
result: InProgressScan,
|
|
480
|
-
last_attempt: ProvenanceQuantumAttemptModel | None,
|
|
481
|
-
) -> None:
|
|
482
|
-
for previous_attempt_log_extra in log_extra.previous_attempts:
|
|
483
|
-
self._extract_from_log_extra(previous_attempt_log_extra, result, last_attempt=None)
|
|
484
|
-
quantum_attempt: ProvenanceQuantumAttemptModel
|
|
485
|
-
if last_attempt is None:
|
|
486
|
-
# This is not the last attempt, so it must be a failure.
|
|
487
|
-
quantum_attempt = ProvenanceQuantumAttemptModel(
|
|
488
|
-
attempt=len(result.attempts), status=QuantumAttemptStatus.FAILED
|
|
489
|
-
)
|
|
490
|
-
# We also need to get the logs from this extra provenance, since
|
|
491
|
-
# they won't be the main section of the log records.
|
|
492
|
-
result.logs.attempts.append(log_extra.logs)
|
|
493
|
-
# The special last attempt is only appended after we attempt to
|
|
494
|
-
# read metadata later, but we have to append this one now.
|
|
495
|
-
result.attempts.append(quantum_attempt)
|
|
496
|
-
else:
|
|
497
|
-
assert not log_extra.logs, "Logs for the last attempt should not be stored in the extra JSON."
|
|
498
|
-
quantum_attempt = last_attempt
|
|
499
|
-
if log_extra.exception is not None or log_extra.metadata is not None or last_attempt is None:
|
|
500
|
-
# We won't be getting a separate metadata dataset, so anything we
|
|
501
|
-
# might get from the metadata has to come from this extra
|
|
502
|
-
# provenance in the logs.
|
|
503
|
-
quantum_attempt.exception = log_extra.exception
|
|
504
|
-
if log_extra.metadata is not None:
|
|
505
|
-
quantum_attempt.resource_usage = QuantumResourceUsage.from_task_metadata(log_extra.metadata)
|
|
506
|
-
result.metadata.attempts.append(log_extra.metadata)
|
|
507
|
-
else:
|
|
508
|
-
result.metadata.attempts.append(None)
|
|
509
|
-
# Regardless of whether this is the last attempt or not, we can only
|
|
510
|
-
# get the previous_process_quanta from the log extra.
|
|
511
|
-
quantum_attempt.previous_process_quanta.extend(log_extra.previous_process_quanta)
|
|
325
|
+
return None
|
|
@@ -27,68 +27,16 @@
|
|
|
27
27
|
|
|
28
28
|
from __future__ import annotations
|
|
29
29
|
|
|
30
|
-
__all__ = (
|
|
31
|
-
"InProgressScan",
|
|
32
|
-
"IngestRequest",
|
|
33
|
-
"ScanReport",
|
|
34
|
-
"ScanStatus",
|
|
35
|
-
"WriteRequest",
|
|
36
|
-
)
|
|
30
|
+
__all__ = ("IngestRequest", "ScanReport")
|
|
37
31
|
|
|
38
32
|
import dataclasses
|
|
39
|
-
import enum
|
|
40
33
|
import uuid
|
|
41
34
|
|
|
35
|
+
from lsst.daf.butler import DatasetRef
|
|
42
36
|
from lsst.daf.butler.datastore.record_data import DatastoreRecordData
|
|
43
37
|
|
|
44
38
|
from .._common import DatastoreName
|
|
45
|
-
from ..
|
|
46
|
-
from .._provenance import (
|
|
47
|
-
ProvenanceLogRecordsModel,
|
|
48
|
-
ProvenanceQuantumAttemptModel,
|
|
49
|
-
ProvenanceTaskMetadataModel,
|
|
50
|
-
)
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
class ScanStatus(enum.Enum):
|
|
54
|
-
"""Status enum for quantum scanning.
|
|
55
|
-
|
|
56
|
-
Note that this records the status for the *scanning* which is distinct
|
|
57
|
-
from the status of the quantum's execution.
|
|
58
|
-
"""
|
|
59
|
-
|
|
60
|
-
INCOMPLETE = enum.auto()
|
|
61
|
-
"""The quantum is not necessarily done running, and cannot be scanned
|
|
62
|
-
conclusively yet.
|
|
63
|
-
"""
|
|
64
|
-
|
|
65
|
-
ABANDONED = enum.auto()
|
|
66
|
-
"""The quantum's execution appears to have failed but we cannot rule out
|
|
67
|
-
the possibility that it could be recovered, but we've also waited long
|
|
68
|
-
enough (according to `ScannerTimeConfigDict.retry_timeout`) that it's time
|
|
69
|
-
to stop trying for now.
|
|
70
|
-
|
|
71
|
-
This state means a later run with `ScannerConfig.assume_complete` is
|
|
72
|
-
required.
|
|
73
|
-
"""
|
|
74
|
-
|
|
75
|
-
SUCCESSFUL = enum.auto()
|
|
76
|
-
"""The quantum was conclusively scanned and was executed successfully,
|
|
77
|
-
unblocking scans for downstream quanta.
|
|
78
|
-
"""
|
|
79
|
-
|
|
80
|
-
FAILED = enum.auto()
|
|
81
|
-
"""The quantum was conclusively scanned and failed execution, blocking
|
|
82
|
-
scans for downstream quanta.
|
|
83
|
-
"""
|
|
84
|
-
|
|
85
|
-
BLOCKED = enum.auto()
|
|
86
|
-
"""A quantum upstream of this one failed."""
|
|
87
|
-
|
|
88
|
-
INIT = enum.auto()
|
|
89
|
-
"""Init quanta need special handling, because they don't have logs and
|
|
90
|
-
metadata.
|
|
91
|
-
"""
|
|
39
|
+
from .._provenance import ProvenanceQuantumScanStatus
|
|
92
40
|
|
|
93
41
|
|
|
94
42
|
@dataclasses.dataclass
|
|
@@ -98,7 +46,7 @@ class ScanReport:
|
|
|
98
46
|
quantum_id: uuid.UUID
|
|
99
47
|
"""Unique ID of the quantum."""
|
|
100
48
|
|
|
101
|
-
status:
|
|
49
|
+
status: ProvenanceQuantumScanStatus
|
|
102
50
|
"""Combined status of the scan and the execution of the quantum."""
|
|
103
51
|
|
|
104
52
|
|
|
@@ -109,69 +57,11 @@ class IngestRequest:
|
|
|
109
57
|
producer_id: uuid.UUID
|
|
110
58
|
"""ID of the quantum that produced these datasets."""
|
|
111
59
|
|
|
112
|
-
|
|
60
|
+
refs: list[DatasetRef]
|
|
113
61
|
"""Registry information about the datasets."""
|
|
114
62
|
|
|
115
63
|
records: dict[DatastoreName, DatastoreRecordData]
|
|
116
64
|
"""Datastore information about the datasets."""
|
|
117
65
|
|
|
118
66
|
def __bool__(self) -> bool:
|
|
119
|
-
return bool(self.
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
@dataclasses.dataclass
|
|
123
|
-
class InProgressScan:
|
|
124
|
-
"""A struct that represents a quantum that is being scanned."""
|
|
125
|
-
|
|
126
|
-
quantum_id: uuid.UUID
|
|
127
|
-
"""Unique ID for the quantum."""
|
|
128
|
-
|
|
129
|
-
status: ScanStatus
|
|
130
|
-
"""Combined status for the scan and the execution of the quantum."""
|
|
131
|
-
|
|
132
|
-
attempts: list[ProvenanceQuantumAttemptModel] = dataclasses.field(default_factory=list)
|
|
133
|
-
"""Provenance information about each attempt to run the quantum."""
|
|
134
|
-
|
|
135
|
-
outputs: dict[uuid.UUID, bool] = dataclasses.field(default_factory=dict)
|
|
136
|
-
"""Unique IDs of the output datasets mapped to whether they were actually
|
|
137
|
-
produced.
|
|
138
|
-
"""
|
|
139
|
-
|
|
140
|
-
metadata: ProvenanceTaskMetadataModel = dataclasses.field(default_factory=ProvenanceTaskMetadataModel)
|
|
141
|
-
"""Task metadata information for each attempt.
|
|
142
|
-
"""
|
|
143
|
-
|
|
144
|
-
logs: ProvenanceLogRecordsModel = dataclasses.field(default_factory=ProvenanceLogRecordsModel)
|
|
145
|
-
"""Log records for each attempt.
|
|
146
|
-
"""
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
@dataclasses.dataclass
|
|
150
|
-
class WriteRequest:
|
|
151
|
-
"""A struct that represents a request to write provenance for a quantum."""
|
|
152
|
-
|
|
153
|
-
quantum_id: uuid.UUID
|
|
154
|
-
"""Unique ID for the quantum."""
|
|
155
|
-
|
|
156
|
-
status: ScanStatus
|
|
157
|
-
"""Combined status for the scan and the execution of the quantum."""
|
|
158
|
-
|
|
159
|
-
existing_outputs: set[uuid.UUID] = dataclasses.field(default_factory=set)
|
|
160
|
-
"""Unique IDs of the output datasets that were actually written."""
|
|
161
|
-
|
|
162
|
-
quantum: bytes = b""
|
|
163
|
-
"""Serialized quantum provenance model.
|
|
164
|
-
|
|
165
|
-
This may be empty for quanta that had no attempts.
|
|
166
|
-
"""
|
|
167
|
-
|
|
168
|
-
metadata: bytes = b""
|
|
169
|
-
"""Serialized task metadata."""
|
|
170
|
-
|
|
171
|
-
logs: bytes = b""
|
|
172
|
-
"""Serialized logs."""
|
|
173
|
-
|
|
174
|
-
is_compressed: bool = False
|
|
175
|
-
"""Whether the `quantum`, `metadata`, and `log` attributes are
|
|
176
|
-
compressed.
|
|
177
|
-
"""
|
|
67
|
+
return bool(self.refs or self.records)
|
|
@@ -42,6 +42,7 @@ from lsst.utils.usage import get_peak_mem_usage
|
|
|
42
42
|
from ...graph_walker import GraphWalker
|
|
43
43
|
from ...pipeline_graph import TaskImportMode
|
|
44
44
|
from .._predicted import PredictedQuantumGraphComponents, PredictedQuantumGraphReader
|
|
45
|
+
from .._provenance import ProvenanceQuantumScanData, ProvenanceQuantumScanStatus
|
|
45
46
|
from ._communicators import (
|
|
46
47
|
IngesterCommunicator,
|
|
47
48
|
ScannerCommunicator,
|
|
@@ -54,7 +55,7 @@ from ._communicators import (
|
|
|
54
55
|
from ._config import AggregatorConfig
|
|
55
56
|
from ._ingester import Ingester
|
|
56
57
|
from ._scanner import Scanner
|
|
57
|
-
from ._structs import ScanReport
|
|
58
|
+
from ._structs import ScanReport
|
|
58
59
|
from ._writer import Writer
|
|
59
60
|
|
|
60
61
|
|
|
@@ -116,6 +117,18 @@ class Supervisor:
|
|
|
116
117
|
self.comms.request_scan(ready_set.pop())
|
|
117
118
|
for scan_return in self.comms.poll():
|
|
118
119
|
self.handle_report(scan_return)
|
|
120
|
+
if self.comms.config.incomplete:
|
|
121
|
+
quantum_or_quanta = "quanta" if self.n_abandoned != 1 else "quantum"
|
|
122
|
+
self.comms.progress.log.info(
|
|
123
|
+
"%d %s incomplete/failed abandoned; re-run with incomplete=False to finish.",
|
|
124
|
+
self.n_abandoned,
|
|
125
|
+
quantum_or_quanta,
|
|
126
|
+
)
|
|
127
|
+
self.comms.progress.log.info(
|
|
128
|
+
"Scanning complete after %0.1fs; waiting for workers to finish.",
|
|
129
|
+
self.comms.progress.elapsed_time,
|
|
130
|
+
)
|
|
131
|
+
self.comms.wait_for_workers_to_finish()
|
|
119
132
|
|
|
120
133
|
def handle_report(self, scan_report: ScanReport) -> None:
|
|
121
134
|
"""Handle a report from a scanner.
|
|
@@ -126,18 +139,22 @@ class Supervisor:
|
|
|
126
139
|
Information about the scan.
|
|
127
140
|
"""
|
|
128
141
|
match scan_report.status:
|
|
129
|
-
case
|
|
142
|
+
case ProvenanceQuantumScanStatus.SUCCESSFUL | ProvenanceQuantumScanStatus.INIT:
|
|
130
143
|
self.comms.log.debug("Scan complete for %s: quantum succeeded.", scan_report.quantum_id)
|
|
131
144
|
self.walker.finish(scan_report.quantum_id)
|
|
132
|
-
case
|
|
145
|
+
case ProvenanceQuantumScanStatus.FAILED:
|
|
133
146
|
self.comms.log.debug("Scan complete for %s: quantum failed.", scan_report.quantum_id)
|
|
134
147
|
blocked_quanta = self.walker.fail(scan_report.quantum_id)
|
|
135
148
|
for blocked_quantum_id in blocked_quanta:
|
|
136
|
-
if self.comms.config.
|
|
137
|
-
self.comms.request_write(
|
|
149
|
+
if self.comms.config.is_writing_provenance:
|
|
150
|
+
self.comms.request_write(
|
|
151
|
+
ProvenanceQuantumScanData(
|
|
152
|
+
blocked_quantum_id, status=ProvenanceQuantumScanStatus.BLOCKED
|
|
153
|
+
)
|
|
154
|
+
)
|
|
138
155
|
self.comms.progress.scans.update(1)
|
|
139
156
|
self.comms.progress.quantum_ingests.update(len(blocked_quanta))
|
|
140
|
-
case
|
|
157
|
+
case ProvenanceQuantumScanStatus.ABANDONED:
|
|
141
158
|
self.comms.log.debug("Abandoning scan for %s: quantum has not succeeded (yet).")
|
|
142
159
|
self.walker.fail(scan_report.quantum_id)
|
|
143
160
|
self.n_abandoned += 1
|
|
@@ -167,7 +184,7 @@ def aggregate_graph(predicted_path: str, butler_path: str, config: AggregatorCon
|
|
|
167
184
|
writer: Worker | None = None
|
|
168
185
|
with SupervisorCommunicator(log, config.n_processes, ctx, config) as comms:
|
|
169
186
|
comms.progress.log.verbose("Starting workers.")
|
|
170
|
-
if config.
|
|
187
|
+
if config.is_writing_provenance:
|
|
171
188
|
writer_comms = WriterCommunicator(comms)
|
|
172
189
|
writer = ctx.make_worker(
|
|
173
190
|
target=Writer.run,
|
|
@@ -193,17 +210,6 @@ def aggregate_graph(predicted_path: str, butler_path: str, config: AggregatorCon
|
|
|
193
210
|
ingester.start()
|
|
194
211
|
supervisor = Supervisor(predicted_path, comms)
|
|
195
212
|
supervisor.loop()
|
|
196
|
-
log.info(
|
|
197
|
-
"Scanning complete after %0.1fs; waiting for workers to finish.",
|
|
198
|
-
comms.progress.elapsed_time,
|
|
199
|
-
)
|
|
200
|
-
comms.wait_for_workers_to_finish()
|
|
201
|
-
if supervisor.n_abandoned:
|
|
202
|
-
raise RuntimeError(
|
|
203
|
-
f"{supervisor.n_abandoned} {'quanta' if supervisor.n_abandoned > 1 else 'quantum'} "
|
|
204
|
-
"abandoned because they did not succeed. Re-run with assume_complete=True after all retry "
|
|
205
|
-
"attempts have been exhausted."
|
|
206
|
-
)
|
|
207
213
|
for w in scanners:
|
|
208
214
|
w.join()
|
|
209
215
|
ingester.join()
|