lsst-pipe-base 30.0.1rc1__py3-none-any.whl → 30.2025.5100__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (69) hide show
  1. lsst/pipe/base/_instrument.py +20 -31
  2. lsst/pipe/base/_quantumContext.py +3 -3
  3. lsst/pipe/base/_status.py +10 -43
  4. lsst/pipe/base/_task_metadata.py +2 -2
  5. lsst/pipe/base/all_dimensions_quantum_graph_builder.py +3 -8
  6. lsst/pipe/base/automatic_connection_constants.py +1 -20
  7. lsst/pipe/base/cli/cmd/__init__.py +2 -18
  8. lsst/pipe/base/cli/cmd/commands.py +4 -149
  9. lsst/pipe/base/connectionTypes.py +160 -72
  10. lsst/pipe/base/connections.py +9 -6
  11. lsst/pipe/base/execution_reports.py +5 -0
  12. lsst/pipe/base/graph/graph.py +10 -11
  13. lsst/pipe/base/graph/quantumNode.py +4 -4
  14. lsst/pipe/base/graph_walker.py +10 -8
  15. lsst/pipe/base/log_capture.py +80 -40
  16. lsst/pipe/base/mp_graph_executor.py +15 -51
  17. lsst/pipe/base/pipeline.py +6 -5
  18. lsst/pipe/base/pipelineIR.py +8 -2
  19. lsst/pipe/base/pipelineTask.py +7 -5
  20. lsst/pipe/base/pipeline_graph/_dataset_types.py +2 -2
  21. lsst/pipe/base/pipeline_graph/_edges.py +22 -32
  22. lsst/pipe/base/pipeline_graph/_mapping_views.py +7 -4
  23. lsst/pipe/base/pipeline_graph/_pipeline_graph.py +7 -14
  24. lsst/pipe/base/pipeline_graph/expressions.py +2 -2
  25. lsst/pipe/base/pipeline_graph/io.py +10 -7
  26. lsst/pipe/base/pipeline_graph/visualization/_dot.py +12 -13
  27. lsst/pipe/base/pipeline_graph/visualization/_layout.py +18 -16
  28. lsst/pipe/base/pipeline_graph/visualization/_merge.py +7 -4
  29. lsst/pipe/base/pipeline_graph/visualization/_printer.py +10 -10
  30. lsst/pipe/base/pipeline_graph/visualization/_status_annotator.py +0 -7
  31. lsst/pipe/base/prerequisite_helpers.py +1 -2
  32. lsst/pipe/base/quantum_graph/_common.py +20 -19
  33. lsst/pipe/base/quantum_graph/_multiblock.py +31 -37
  34. lsst/pipe/base/quantum_graph/_predicted.py +13 -111
  35. lsst/pipe/base/quantum_graph/_provenance.py +45 -1136
  36. lsst/pipe/base/quantum_graph/aggregator/__init__.py +1 -0
  37. lsst/pipe/base/quantum_graph/aggregator/_communicators.py +289 -204
  38. lsst/pipe/base/quantum_graph/aggregator/_config.py +9 -87
  39. lsst/pipe/base/quantum_graph/aggregator/_ingester.py +12 -13
  40. lsst/pipe/base/quantum_graph/aggregator/_scanner.py +235 -49
  41. lsst/pipe/base/quantum_graph/aggregator/_structs.py +116 -6
  42. lsst/pipe/base/quantum_graph/aggregator/_supervisor.py +39 -29
  43. lsst/pipe/base/quantum_graph/aggregator/_writer.py +351 -34
  44. lsst/pipe/base/quantum_graph/visualization.py +1 -5
  45. lsst/pipe/base/quantum_graph_builder.py +8 -21
  46. lsst/pipe/base/quantum_graph_executor.py +13 -116
  47. lsst/pipe/base/quantum_graph_skeleton.py +29 -31
  48. lsst/pipe/base/quantum_provenance_graph.py +12 -29
  49. lsst/pipe/base/separable_pipeline_executor.py +3 -19
  50. lsst/pipe/base/single_quantum_executor.py +42 -67
  51. lsst/pipe/base/struct.py +0 -4
  52. lsst/pipe/base/testUtils.py +3 -3
  53. lsst/pipe/base/tests/mocks/_storage_class.py +1 -2
  54. lsst/pipe/base/version.py +1 -1
  55. {lsst_pipe_base-30.0.1rc1.dist-info → lsst_pipe_base-30.2025.5100.dist-info}/METADATA +3 -3
  56. lsst_pipe_base-30.2025.5100.dist-info/RECORD +125 -0
  57. {lsst_pipe_base-30.0.1rc1.dist-info → lsst_pipe_base-30.2025.5100.dist-info}/WHEEL +1 -1
  58. lsst/pipe/base/log_on_close.py +0 -76
  59. lsst/pipe/base/quantum_graph/aggregator/_workers.py +0 -303
  60. lsst/pipe/base/quantum_graph/formatter.py +0 -171
  61. lsst/pipe/base/quantum_graph/ingest_graph.py +0 -413
  62. lsst_pipe_base-30.0.1rc1.dist-info/RECORD +0 -129
  63. {lsst_pipe_base-30.0.1rc1.dist-info → lsst_pipe_base-30.2025.5100.dist-info}/entry_points.txt +0 -0
  64. {lsst_pipe_base-30.0.1rc1.dist-info → lsst_pipe_base-30.2025.5100.dist-info}/licenses/COPYRIGHT +0 -0
  65. {lsst_pipe_base-30.0.1rc1.dist-info → lsst_pipe_base-30.2025.5100.dist-info}/licenses/LICENSE +0 -0
  66. {lsst_pipe_base-30.0.1rc1.dist-info → lsst_pipe_base-30.2025.5100.dist-info}/licenses/bsd_license.txt +0 -0
  67. {lsst_pipe_base-30.0.1rc1.dist-info → lsst_pipe_base-30.2025.5100.dist-info}/licenses/gpl-v3.0.txt +0 -0
  68. {lsst_pipe_base-30.0.1rc1.dist-info → lsst_pipe_base-30.2025.5100.dist-info}/top_level.txt +0 -0
  69. {lsst_pipe_base-30.0.1rc1.dist-info → lsst_pipe_base-30.2025.5100.dist-info}/zip-safe +0 -0
@@ -31,33 +31,154 @@ __all__ = (
31
31
  "FatalWorkerError",
32
32
  "IngesterCommunicator",
33
33
  "ScannerCommunicator",
34
+ "SpawnProcessContext",
34
35
  "SupervisorCommunicator",
36
+ "ThreadingContext",
37
+ "WorkerContext",
35
38
  )
36
39
 
37
40
  import cProfile
38
41
  import dataclasses
39
42
  import enum
40
43
  import logging
44
+ import multiprocessing.context
45
+ import multiprocessing.synchronize
41
46
  import os
47
+ import queue
42
48
  import signal
49
+ import threading
43
50
  import time
44
51
  import uuid
45
- from collections.abc import Iterable, Iterator
46
- from contextlib import ExitStack
52
+ from abc import ABC, abstractmethod
53
+ from collections.abc import Callable, Iterable, Iterator
54
+ from contextlib import AbstractContextManager, ExitStack, contextmanager
47
55
  from traceback import format_exception
48
56
  from types import TracebackType
49
- from typing import Literal, Self, overload
57
+ from typing import Any, Literal, Self, TypeAlias, TypeVar, Union
50
58
 
51
- from lsst.utils.logging import LsstLogAdapter
59
+ from lsst.utils.logging import VERBOSE, LsstLogAdapter
52
60
 
53
- from .._provenance import ProvenanceQuantumScanData
54
61
  from ._config import AggregatorConfig
55
62
  from ._progress import ProgressManager, make_worker_log
56
- from ._structs import IngestRequest, ScanReport
57
- from ._workers import Event, Queue, Worker, WorkerFactory
63
+ from ._structs import IngestRequest, ScanReport, WriteRequest
64
+
65
+ _T = TypeVar("_T")
58
66
 
59
67
  _TINY_TIMEOUT = 0.01
60
68
 
69
+ # multiprocessing.Queue is a type according to the standard library type stubs,
70
+ # but it's really a function at runtime. But since the Python <= 3.11 type
71
+ # alias syntax uses the real runtime things we need to use strings, and hence
72
+ # we need to use Union. With Python 3.12's 'type' statement this gets cleaner.
73
+ Queue: TypeAlias = Union["queue.Queue[_T]", "multiprocessing.Queue[_T]"]
74
+
75
+ Event: TypeAlias = threading.Event | multiprocessing.synchronize.Event
76
+
77
+ Worker: TypeAlias = threading.Thread | multiprocessing.context.SpawnProcess
78
+
79
+
80
+ class WorkerContext(ABC):
81
+ """A simple abstract interface that can be implemented by both threading
82
+ and multiprocessing.
83
+ """
84
+
85
+ @abstractmethod
86
+ def make_queue(self) -> Queue[Any]:
87
+ """Make an empty queue that can be used to pass objects between
88
+ workers in this context.
89
+ """
90
+ raise NotImplementedError()
91
+
92
+ @abstractmethod
93
+ def make_event(self) -> Event:
94
+ """Make an event that can be used to communicate a boolean state change
95
+ to workers in this context.
96
+ """
97
+ raise NotImplementedError()
98
+
99
+ @abstractmethod
100
+ def make_worker(
101
+ self, target: Callable[..., None], args: tuple[Any, ...], name: str | None = None
102
+ ) -> Worker:
103
+ """Make a worker that runs the given callable.
104
+
105
+ Parameters
106
+ ----------
107
+ target : `~collections.abc.Callable`
108
+ A callable to invoke on the worker.
109
+ args : `tuple`
110
+ Positional arguments to pass to the callable.
111
+ name : `str`, optional
112
+ Human-readable name for the worker.
113
+
114
+ Returns
115
+ -------
116
+ worker : `threading.Thread` or `multiprocessing.Process`
117
+ Process or thread. Will need to have its ``start`` method called
118
+ to actually begin.
119
+ """
120
+ raise NotImplementedError()
121
+
122
+
123
+ class ThreadingContext(WorkerContext):
124
+ """An implementation of `WorkerContext` backed by the `threading`
125
+ module.
126
+ """
127
+
128
+ def make_queue(self) -> Queue[Any]:
129
+ return queue.Queue()
130
+
131
+ def make_event(self) -> Event:
132
+ return threading.Event()
133
+
134
+ def make_worker(
135
+ self, target: Callable[..., None], args: tuple[Any, ...], name: str | None = None
136
+ ) -> Worker:
137
+ return threading.Thread(target=target, args=args, name=name)
138
+
139
+
140
+ class SpawnProcessContext(WorkerContext):
141
+ """An implementation of `WorkerContext` backed by the `multiprocessing`
142
+ module, with new processes started by spawning.
143
+ """
144
+
145
+ def __init__(self) -> None:
146
+ self._ctx = multiprocessing.get_context("spawn")
147
+
148
+ def make_queue(self) -> Queue[Any]:
149
+ return self._ctx.Queue()
150
+
151
+ def make_event(self) -> Event:
152
+ return self._ctx.Event()
153
+
154
+ def make_worker(
155
+ self, target: Callable[..., None], args: tuple[Any, ...], name: str | None = None
156
+ ) -> Worker:
157
+ return self._ctx.Process(target=target, args=args, name=name)
158
+
159
+
160
+ def _get_from_queue(q: Queue[_T], block: bool = False, timeout: float | None = None) -> _T | None:
161
+ """Get an object from a queue and return `None` if it is empty.
162
+
163
+ Parameters
164
+ ----------
165
+ q : `Queue`
166
+ Queue to get an object from.
167
+ block : `bool`
168
+ Whether to block until an object is available.
169
+ timeout : `float` or `None`, optional
170
+ Maximum number of seconds to wait while blocking.
171
+
172
+ Returns
173
+ -------
174
+ obj : `object` or `None`
175
+ Object from the queue, or `None` if it was empty.
176
+ """
177
+ try:
178
+ return q.get(block=block, timeout=timeout)
179
+ except queue.Empty:
180
+ return None
181
+
61
182
 
62
183
  class FatalWorkerError(BaseException):
63
184
  """An exception raised by communicators when one worker (including the
@@ -66,12 +187,6 @@ class FatalWorkerError(BaseException):
66
187
  """
67
188
 
68
189
 
69
- class _WorkerCommunicationError(Exception):
70
- """An exception raised by communicators when a worker has died unexpectedly
71
- or become unresponsive.
72
- """
73
-
74
-
75
190
  class _Sentinel(enum.Enum):
76
191
  """Sentinel values used to indicate sequence points or worker shutdown
77
192
  conditions.
@@ -98,6 +213,21 @@ class _Sentinel(enum.Enum):
98
213
  quantum's provenance was written.
99
214
  """
100
215
 
216
+ SCANNER_DONE = enum.auto()
217
+ """Sentinel sent from scanners to the supervisor to report that they are
218
+ done and shutting down.
219
+ """
220
+
221
+ INGESTER_DONE = enum.auto()
222
+ """Sentinel sent from the ingester to the supervisor to report that it is
223
+ done and shutting down.
224
+ """
225
+
226
+ WRITER_DONE = enum.auto()
227
+ """Sentinel sent from the writer to the supervisor to report that it is
228
+ done and shutting down.
229
+ """
230
+
101
231
 
102
232
  @dataclasses.dataclass
103
233
  class _WorkerErrorMessage:
@@ -147,16 +277,6 @@ class _IngestReport:
147
277
  """
148
278
 
149
279
 
150
- @dataclasses.dataclass
151
- class _WorkerDone:
152
- """An internal struct passed from a worker to the supervisor when it has
153
- successfully completed all work.
154
- """
155
-
156
- name: str
157
- """Name of the worker reporting completion."""
158
-
159
-
160
280
  @dataclasses.dataclass
161
281
  class _ProgressLog:
162
282
  """A high-level log message sent from a worker to the supervisor.
@@ -183,22 +303,20 @@ class _CompressionDictionary:
183
303
  """
184
304
 
185
305
 
186
- type Report = (
306
+ Report: TypeAlias = (
187
307
  ScanReport
188
308
  | _IngestReport
189
309
  | _WorkerErrorMessage
190
310
  | _ProgressLog
191
- | _WorkerDone
192
- | Literal[_Sentinel.WRITE_REPORT]
311
+ | Literal[
312
+ _Sentinel.WRITE_REPORT,
313
+ _Sentinel.SCANNER_DONE,
314
+ _Sentinel.INGESTER_DONE,
315
+ _Sentinel.WRITER_DONE,
316
+ ]
193
317
  )
194
318
 
195
319
 
196
- def _disable_resources_parallelism() -> None:
197
- os.environ["LSST_RESOURCES_NUM_WORKERS"] = "1"
198
- os.environ.pop("LSST_RESOURCES_EXECUTOR", None)
199
- os.environ["LSST_S3_USE_THREADS"] = "False"
200
-
201
-
202
320
  class SupervisorCommunicator:
203
321
  """A helper object that lets the supervisor direct the other workers.
204
322
 
@@ -208,7 +326,7 @@ class SupervisorCommunicator:
208
326
  LSST-customized logger.
209
327
  n_scanners : `int`
210
328
  Number of scanner workers.
211
- worker_factory : `WorkerFactory`
329
+ context : `WorkerContext`
212
330
  Abstraction over threading vs. multiprocessing.
213
331
  config : `AggregatorConfig`
214
332
  Configuration for the aggregator.
@@ -218,7 +336,7 @@ class SupervisorCommunicator:
218
336
  self,
219
337
  log: LsstLogAdapter,
220
338
  n_scanners: int,
221
- worker_factory: WorkerFactory,
339
+ context: WorkerContext,
222
340
  config: AggregatorConfig,
223
341
  ) -> None:
224
342
  self.config = config
@@ -228,14 +346,14 @@ class SupervisorCommunicator:
228
346
  # When complete, the supervisor sends n_scanners sentinels and each
229
347
  # scanner is careful to only take one before it starts its shutdown.
230
348
  self._scan_requests: Queue[_ScanRequest | Literal[_Sentinel.NO_MORE_SCAN_REQUESTS]] = (
231
- worker_factory.make_queue()
349
+ context.make_queue()
232
350
  )
233
351
  # The scanners send ingest requests to the ingester on this queue. Each
234
352
  # scanner sends one sentinel when it is done, and the ingester is
235
353
  # careful to wait for n_scanners sentinels to arrive before it starts
236
354
  # its shutdown.
237
355
  self._ingest_requests: Queue[IngestRequest | Literal[_Sentinel.NO_MORE_INGEST_REQUESTS]] = (
238
- worker_factory.make_queue()
356
+ context.make_queue()
239
357
  )
240
358
  # The scanners send write requests to the writer on this queue (which
241
359
  # will be `None` if we're not writing). The supervisor also sends
@@ -243,24 +361,24 @@ class SupervisorCommunicator:
243
361
  # scanner and the supervisor send one sentinel when done, and the
244
362
  # writer waits for (n_scanners + 1) sentinels to arrive before it
245
363
  # starts its shutdown.
246
- self._write_requests: (
247
- Queue[ProvenanceQuantumScanData | Literal[_Sentinel.NO_MORE_WRITE_REQUESTS]] | None
248
- ) = worker_factory.make_queue() if config.is_writing_provenance else None
364
+ self._write_requests: Queue[WriteRequest | Literal[_Sentinel.NO_MORE_WRITE_REQUESTS]] | None = (
365
+ context.make_queue() if config.output_path is not None else None
366
+ )
249
367
  # All other workers use this queue to send many different kinds of
250
368
  # reports to the supervisor. The supervisor waits for a _DONE sentinel
251
369
  # from each worker before it finishes its shutdown.
252
- self._reports: Queue[Report] = worker_factory.make_queue()
370
+ self._reports: Queue[Report] = context.make_queue()
253
371
  # The writer sends the compression dictionary to the scanners on this
254
372
  # queue. It puts n_scanners copies on the queue, and each scanner only
255
373
  # takes one. The compression_dict queue has no sentinel because it is
256
374
  # only used at most once; the supervisor takes responsibility for
257
375
  # clearing it out when shutting down.
258
- self._compression_dict: Queue[_CompressionDictionary] = worker_factory.make_queue()
376
+ self._compression_dict: Queue[_CompressionDictionary] = context.make_queue()
259
377
  # The supervisor sets this event when it receives an interrupt request
260
378
  # from an exception in the main process (usually KeyboardInterrupt).
261
379
  # Worker communicators check this in their polling loops and raise
262
380
  # FatalWorkerError when they see it set.
263
- self._cancel_event: Event = worker_factory.make_event()
381
+ self._cancel_event: Event = context.make_event()
264
382
  # Track what state we are in closing down, so we can start at the right
265
383
  # point if we're interrupted and __exit__ needs to clean up. Note that
266
384
  # we can't rely on a non-exception __exit__ to do any shutdown work
@@ -269,77 +387,51 @@ class SupervisorCommunicator:
269
387
  self._sent_no_more_scan_requests = False
270
388
  self._sent_no_more_write_requests = False
271
389
  self._n_scanners_done = 0
272
- self.workers: dict[str, Worker] = {}
390
+ self._ingester_done = False
391
+ self._writer_done = self._write_requests is None
273
392
 
274
- def _wait_for_workers_to_finish(self, already_failing: bool = False) -> None:
275
- # Orderly shutdown, including exceptions: let workers clear out the
276
- # queues they're responsible for reading from.
393
+ def wait_for_workers_to_finish(self, already_failing: bool = False) -> None:
277
394
  if not self._sent_no_more_scan_requests:
278
395
  for _ in range(self.n_scanners):
279
- self._scan_requests.put(_Sentinel.NO_MORE_SCAN_REQUESTS)
396
+ self._scan_requests.put(_Sentinel.NO_MORE_SCAN_REQUESTS, block=False)
280
397
  self._sent_no_more_scan_requests = True
281
398
  if not self._sent_no_more_write_requests and self._write_requests is not None:
282
- self._write_requests.put(_Sentinel.NO_MORE_WRITE_REQUESTS)
399
+ self._write_requests.put(_Sentinel.NO_MORE_WRITE_REQUESTS, block=False)
283
400
  self._sent_no_more_write_requests = True
284
- while not all(w.successful for w in self.workers.values()):
401
+ while not (self._ingester_done and self._writer_done and self._n_scanners_done == self.n_scanners):
285
402
  match self._handle_progress_reports(
286
- self._get_report(block=True), already_failing=already_failing
403
+ self._reports.get(block=True), already_failing=already_failing
287
404
  ):
288
- case None | ScanReport():
405
+ case None | ScanReport() | _IngestReport():
289
406
  pass
290
- case _WorkerDone(name=worker_name):
291
- self.workers[worker_name].successful = True
292
- if worker_name == IngesterCommunicator.get_worker_name():
293
- self.progress.quantum_ingests.close()
294
- elif worker_name == WriterCommunicator.get_worker_name():
295
- self.progress.writes.close()
296
- else:
297
- self._n_scanners_done += 1
298
- if self._n_scanners_done == self.n_scanners:
299
- self.progress.scans.close()
407
+ case _Sentinel.INGESTER_DONE:
408
+ self._ingester_done = True
409
+ self.progress.quantum_ingests.close()
410
+ case _Sentinel.SCANNER_DONE:
411
+ self._n_scanners_done += 1
412
+ self.progress.scans.close()
413
+ case _Sentinel.WRITER_DONE:
414
+ self._writer_done = True
415
+ self.progress.writes.close()
300
416
  case unexpected:
301
417
  raise AssertionError(f"Unexpected message {unexpected!r} to supervisor.")
302
418
  self.log.verbose(
303
- "Waiting for workers [%s] to report successful completion.",
304
- ", ".join(w.name for w in self.workers.values() if not w.successful),
419
+ "Blocking on reports queue: ingester_done=%s, writer_done=%s, n_scanners_done=%s.",
420
+ self._ingester_done,
421
+ self._writer_done,
422
+ self._n_scanners_done,
305
423
  )
424
+ while _get_from_queue(self._compression_dict) is not None:
425
+ self.log.verbose("Flushing compression dict queue.")
306
426
  self.log.verbose("Checking that all queues are empty.")
307
- if self._scan_requests.clear():
308
- self.progress.log.warning("Scan request queue was not empty at shutdown.")
309
- self._scan_requests.kill()
310
- if self._ingest_requests.clear():
311
- self.progress.log.warning("Ingest request queue was not empty at shutdown.")
312
- self._ingest_requests.kill()
313
- if self._write_requests is not None and self._write_requests.clear():
314
- self.progress.log.warning("Write request queue was not empty at shutdown.")
315
- self._write_requests.kill()
316
- if self._reports.clear():
317
- self.progress.log.warning("Reports queue was not empty at shutdown.")
318
- self._reports.kill()
319
- if self._compression_dict.clear():
320
- self.progress.log.warning("Compression dictionary queue was not empty at shutdown.")
321
- self._compression_dict.kill()
322
- for worker in self.workers.values():
323
- self.log.verbose("Waiting for %s to shut down.", worker.name)
324
- worker.join()
325
-
326
- def _terminate(self) -> None:
327
- # Disorderly shutdown: we cannot assume any of the
328
- # multiprocessing.Queue object work, and in fact they may hang
329
- # if we try to do anything with them.
330
- self._scan_requests.kill()
331
- self._ingest_requests.kill()
427
+ self._expect_empty_queue(self._scan_requests)
428
+ self._expect_empty_queue(self._ingest_requests)
332
429
  if self._write_requests is not None:
333
- self._write_requests.kill()
334
- self._compression_dict.kill()
335
- self._reports.kill()
336
- for name, worker in self.workers.items():
337
- if worker.is_alive():
338
- self.progress.log.critical("Terminating worker %r.", name)
339
- worker.kill()
430
+ self._expect_empty_queue(self._write_requests)
431
+ self._expect_empty_queue(self._reports)
432
+ self._expect_empty_queue(self._compression_dict)
340
433
 
341
434
  def __enter__(self) -> Self:
342
- _disable_resources_parallelism()
343
435
  self.progress.__enter__()
344
436
  # We make the low-level logger in __enter__ instead of __init__ only
345
437
  # because that's the pattern used by true workers (where it matters).
@@ -353,23 +445,11 @@ class SupervisorCommunicator:
353
445
  traceback: TracebackType | None,
354
446
  ) -> None:
355
447
  if exc_type is not None:
356
- self._cancel_event.set()
357
- if exc_type is _WorkerCommunicationError:
358
- self.progress.log.critical("Worker '%s' was terminated before it could finish.", exc_value)
359
- self._terminate()
360
- return None
361
448
  if exc_type is not FatalWorkerError:
362
- self.progress.log.critical("Caught %s; attempting to shut down cleanly.", exc_type)
363
- try:
364
- self._wait_for_workers_to_finish(already_failing=exc_type is not None)
365
- except _WorkerCommunicationError as err:
366
- self.progress.log.critical(
367
- "Worker '%s' was terminated before it could finish (after scanning).", err
368
- )
369
- self._terminate()
370
- raise
449
+ self.progress.log.critical(f"Caught {exc_type.__name__}; attempting to shut down cleanly.")
450
+ self._cancel_event.set()
451
+ self.wait_for_workers_to_finish(already_failing=exc_type is not None)
371
452
  self.progress.__exit__(exc_type, exc_value, traceback)
372
- return None
373
453
 
374
454
  def request_scan(self, quantum_id: uuid.UUID) -> None:
375
455
  """Send a request to the scanners to scan the given quantum.
@@ -379,19 +459,19 @@ class SupervisorCommunicator:
379
459
  quantum_id : `uuid.UUID`
380
460
  ID of the quantum to scan.
381
461
  """
382
- self._scan_requests.put(_ScanRequest(quantum_id))
462
+ self._scan_requests.put(_ScanRequest(quantum_id), block=False)
383
463
 
384
- def request_write(self, request: ProvenanceQuantumScanData) -> None:
464
+ def request_write(self, request: WriteRequest) -> None:
385
465
  """Send a request to the writer to write provenance for the given scan.
386
466
 
387
467
  Parameters
388
468
  ----------
389
- request : `ProvenanceQuantumScanData`
469
+ request : `WriteRequest`
390
470
  Information from scanning a quantum (or knowing you don't have to,
391
471
  in the case of blocked quanta).
392
472
  """
393
473
  assert self._write_requests is not None, "Writer should not be used if writing is disabled."
394
- self._write_requests.put(request)
474
+ self._write_requests.put(request, block=False)
395
475
 
396
476
  def poll(self) -> Iterator[ScanReport]:
397
477
  """Poll for reports from workers while sending scan requests.
@@ -407,8 +487,9 @@ class SupervisorCommunicator:
407
487
  it continues until the report queue is empty.
408
488
  """
409
489
  block = True
410
- while report := self._get_report(block=block):
411
- match self._handle_progress_reports(report):
490
+ msg = _get_from_queue(self._reports, block=block)
491
+ while msg is not None:
492
+ match self._handle_progress_reports(msg):
412
493
  case ScanReport() as scan_report:
413
494
  block = False
414
495
  yield scan_report
@@ -416,40 +497,19 @@ class SupervisorCommunicator:
416
497
  pass
417
498
  case unexpected:
418
499
  raise AssertionError(f"Unexpected message {unexpected!r} to supervisor.")
419
-
420
- @overload
421
- def _get_report(self, block: Literal[True]) -> Report: ...
422
-
423
- @overload
424
- def _get_report(self, block: bool) -> Report | None: ...
425
-
426
- def _get_report(self, block: bool) -> Report | None:
427
- """Get a report from the reports queue, with timeout guards on
428
- blocking requests.
429
-
430
- This method may *return* WorkerCommunicatorError (rather than raise it)
431
- when a serious error occurred communicating with a subprocess. This
432
- is to avoid raising an exception in an __exit__ method (which calls
433
- method).
434
- """
435
- report = self._reports.get(block=block, timeout=self.config.worker_check_timeout)
436
- while report is None and block:
437
- # We hit the timeout; make sure all of the workers
438
- # that should be alive actually are.
439
- for name, worker in self.workers.items():
440
- if not worker.successful and not worker.is_alive():
441
- # Delete this worker from the list of workers so we don't
442
- # hit this condition again when we try to handle the
443
- # exception we raise.
444
- raise _WorkerCommunicationError(name)
445
- # If nothing is dead and we didn't hit the hang timeout, keep
446
- # trying.
447
- report = self._reports.get(block=block, timeout=self.config.worker_check_timeout)
448
- return report
500
+ msg = _get_from_queue(self._reports, block=block)
449
501
 
450
502
  def _handle_progress_reports(
451
503
  self, report: Report, already_failing: bool = False
452
- ) -> ScanReport | _WorkerDone | None:
504
+ ) -> (
505
+ ScanReport
506
+ | Literal[
507
+ _Sentinel.SCANNER_DONE,
508
+ _Sentinel.INGESTER_DONE,
509
+ _Sentinel.WRITER_DONE,
510
+ ]
511
+ | None
512
+ ):
453
513
  """Handle reports to the supervisor that can appear at any time, and
454
514
  are typically just updates to the progress we've made.
455
515
 
@@ -479,9 +539,15 @@ class SupervisorCommunicator:
479
539
  return report
480
540
  return None
481
541
 
542
+ @staticmethod
543
+ def _expect_empty_queue(queue: Queue[Any]) -> None:
544
+ """Assert that the given queue is empty."""
545
+ if (msg := _get_from_queue(queue, block=False, timeout=0)) is not None:
546
+ raise AssertionError(f"Queue is not empty; found {msg!r}.")
547
+
482
548
 
483
549
  class WorkerCommunicator:
484
- """A base class for non-supervisor worker communicators.
550
+ """A base class for non-supervisor workers.
485
551
 
486
552
  Parameters
487
553
  ----------
@@ -493,8 +559,8 @@ class WorkerCommunicator:
493
559
  Notes
494
560
  -----
495
561
  Each worker communicator is constructed in the main process and entered as
496
- a context manager *only* on the actual worker process, so attributes that
497
- cannot be pickled are constructed in ``__enter__`` instead of ``__init__``.
562
+ a context manager on the actual worker process, so attributes that cannot
563
+ be pickled are constructed in ``__enter__`` instead of ``__init__``.
498
564
 
499
565
  Worker communicators provide access to an `AggregatorConfig` and a logger
500
566
  to their workers. As context managers, they handle exceptions and ensure
@@ -514,7 +580,6 @@ class WorkerCommunicator:
514
580
  self._cancel_event = supervisor._cancel_event
515
581
 
516
582
  def __enter__(self) -> Self:
517
- _disable_resources_parallelism()
518
583
  self.log = make_worker_log(self.name, self.config)
519
584
  self.log.verbose("%s has PID %s (parent is %s).", self.name, os.getpid(), os.getppid())
520
585
  self._exit_stack = ExitStack().__enter__()
@@ -547,7 +612,8 @@ class WorkerCommunicator:
547
612
  _WorkerErrorMessage(
548
613
  self.name,
549
614
  "".join(format_exception(exc_type, exc_value, traceback)),
550
- )
615
+ ),
616
+ block=False,
551
617
  )
552
618
  self.log.debug("Error message sent to supervisor.")
553
619
  else:
@@ -555,11 +621,6 @@ class WorkerCommunicator:
555
621
  self._exit_stack.__exit__(exc_type, exc_value, traceback)
556
622
  return True
557
623
 
558
- @property
559
- def exit_stack(self) -> ExitStack:
560
- """A `contextlib.ExitStack` tied to the communicator."""
561
- return self._exit_stack
562
-
563
624
  def log_progress(self, level: int, message: str) -> None:
564
625
  """Send a high-level log message to the supervisor.
565
626
 
@@ -570,7 +631,45 @@ class WorkerCommunicator:
570
631
  message : `str`
571
632
  Log message.
572
633
  """
573
- self._reports.put(_ProgressLog(message=message, level=level))
634
+ self._reports.put(_ProgressLog(message=message, level=level), block=False)
635
+
636
+ def enter(
637
+ self,
638
+ cm: AbstractContextManager[_T],
639
+ on_close: str | None = None,
640
+ level: int = VERBOSE,
641
+ is_progress_log: bool = False,
642
+ ) -> _T:
643
+ """Enter a context manager that will be exited when the communicator's
644
+ context is exited.
645
+
646
+ Parameters
647
+ ----------
648
+ cm : `contextlib.AbstractContextManager`
649
+ A context manager to enter.
650
+ on_close : `str`, optional
651
+ A log message to emit (on the worker's logger) just before the
652
+ given context manager is exited. This can be used to indicate
653
+ what's going on when an ``__exit__`` implementation has a lot of
654
+ work to do (e.g. moving a large file into a zip archive).
655
+ level : `int`, optional
656
+ Level for the ``on_close`` log message.
657
+ is_progress_log : `bool`, optional
658
+ If `True`, send the ``on_close`` message to the supervisor via
659
+ `log_progress` as well as the worker's logger.
660
+ """
661
+ if on_close is None:
662
+ return self._exit_stack.enter_context(cm)
663
+
664
+ @contextmanager
665
+ def wrapper() -> Iterator[_T]:
666
+ with cm as result:
667
+ yield result
668
+ self.log.log(level, on_close)
669
+ if is_progress_log:
670
+ self.log_progress(level, on_close)
671
+
672
+ return self._exit_stack.enter_context(wrapper())
574
673
 
575
674
  def check_for_cancel(self) -> None:
576
675
  """Check for a cancel signal from the supervisor and raise
@@ -592,7 +691,7 @@ class ScannerCommunicator(WorkerCommunicator):
592
691
  """
593
692
 
594
693
  def __init__(self, supervisor: SupervisorCommunicator, scanner_id: int):
595
- super().__init__(supervisor, self.get_worker_name(scanner_id))
694
+ super().__init__(supervisor, f"scanner-{scanner_id:03d}")
596
695
  self.scanner_id = scanner_id
597
696
  self._scan_requests = supervisor._scan_requests
598
697
  self._ingest_requests = supervisor._ingest_requests
@@ -601,10 +700,6 @@ class ScannerCommunicator(WorkerCommunicator):
601
700
  self._got_no_more_scan_requests: bool = False
602
701
  self._sent_no_more_ingest_requests: bool = False
603
702
 
604
- @staticmethod
605
- def get_worker_name(scanner_id: int) -> str:
606
- return f"scanner-{scanner_id:03d}"
607
-
608
703
  def report_scan(self, msg: ScanReport) -> None:
609
704
  """Report a completed scan to the supervisor.
610
705
 
@@ -613,7 +708,7 @@ class ScannerCommunicator(WorkerCommunicator):
613
708
  msg : `ScanReport`
614
709
  Report to send.
615
710
  """
616
- self._reports.put(msg)
711
+ self._reports.put(msg, block=False)
617
712
 
618
713
  def request_ingest(self, request: IngestRequest) -> None:
619
714
  """Ask the ingester to ingest a quantum's outputs.
@@ -629,20 +724,20 @@ class ScannerCommunicator(WorkerCommunicator):
629
724
  as complete to the supervisor instead of sending it to the ingester.
630
725
  """
631
726
  if request:
632
- self._ingest_requests.put(request)
727
+ self._ingest_requests.put(request, block=False)
633
728
  else:
634
- self._reports.put(_IngestReport(1))
729
+ self._reports.put(_IngestReport(1), block=False)
635
730
 
636
- def request_write(self, request: ProvenanceQuantumScanData) -> None:
731
+ def request_write(self, request: WriteRequest) -> None:
637
732
  """Ask the writer to write provenance for a quantum.
638
733
 
639
734
  Parameters
640
735
  ----------
641
- request : `ProvenanceQuantumScanData`
736
+ request : `WriteRequest`
642
737
  Result of scanning a quantum.
643
738
  """
644
739
  assert self._write_requests is not None, "Writer should not be used if writing is disabled."
645
- self._write_requests.put(request)
740
+ self._write_requests.put(request, block=False)
646
741
 
647
742
  def get_compression_dict(self) -> bytes | None:
648
743
  """Attempt to get the compression dict from the writer.
@@ -658,7 +753,7 @@ class ScannerCommunicator(WorkerCommunicator):
658
753
  A scanner should only call this method before it actually has the
659
754
  compression dict.
660
755
  """
661
- if (cdict := self._compression_dict.get()) is not None:
756
+ if (cdict := _get_from_queue(self._compression_dict)) is not None:
662
757
  return cdict.data
663
758
  return None
664
759
 
@@ -677,7 +772,7 @@ class ScannerCommunicator(WorkerCommunicator):
677
772
  """
678
773
  while True:
679
774
  self.check_for_cancel()
680
- scan_request = self._scan_requests.get(block=True, timeout=self.config.worker_sleep)
775
+ scan_request = _get_from_queue(self._scan_requests, block=True, timeout=self.config.worker_sleep)
681
776
  if scan_request is _Sentinel.NO_MORE_SCAN_REQUESTS:
682
777
  self._got_no_more_scan_requests = True
683
778
  return
@@ -691,18 +786,20 @@ class ScannerCommunicator(WorkerCommunicator):
691
786
  traceback: TracebackType | None,
692
787
  ) -> bool | None:
693
788
  result = super().__exit__(exc_type, exc_value, traceback)
694
- self._ingest_requests.put(_Sentinel.NO_MORE_INGEST_REQUESTS)
789
+ self._ingest_requests.put(_Sentinel.NO_MORE_INGEST_REQUESTS, block=False)
695
790
  if self._write_requests is not None:
696
- self._write_requests.put(_Sentinel.NO_MORE_WRITE_REQUESTS)
791
+ self._write_requests.put(_Sentinel.NO_MORE_WRITE_REQUESTS, block=False)
697
792
  while not self._got_no_more_scan_requests:
793
+ self.log.debug("Clearing scan request queue (~%d remaining)", self._scan_requests.qsize())
698
794
  if (
699
795
  not self._got_no_more_scan_requests
700
- and self._scan_requests.get(block=True) is _Sentinel.NO_MORE_SCAN_REQUESTS
796
+ and self._scan_requests.get() is _Sentinel.NO_MORE_SCAN_REQUESTS
701
797
  ):
702
798
  self._got_no_more_scan_requests = True
703
- # We let the writer clear out the compression dict queue.
704
- self.log.verbose("Sending completion message.")
705
- self._reports.put(_WorkerDone(self.name))
799
+ # We let the supervisor clear out the compression dict queue, because
800
+ # a single scanner can't know if it ever got sent out or not.
801
+ self.log.verbose("Sending done sentinal.")
802
+ self._reports.put(_Sentinel.SCANNER_DONE, block=False)
706
803
  return result
707
804
 
708
805
 
@@ -716,15 +813,11 @@ class IngesterCommunicator(WorkerCommunicator):
716
813
  """
717
814
 
718
815
  def __init__(self, supervisor: SupervisorCommunicator):
719
- super().__init__(supervisor, self.get_worker_name())
816
+ super().__init__(supervisor, "ingester")
720
817
  self.n_scanners = supervisor.n_scanners
721
818
  self._ingest_requests = supervisor._ingest_requests
722
819
  self._n_requesters_done = 0
723
820
 
724
- @staticmethod
725
- def get_worker_name() -> str:
726
- return "ingester"
727
-
728
821
  def __exit__(
729
822
  self,
730
823
  exc_type: type[BaseException] | None,
@@ -740,8 +833,8 @@ class IngesterCommunicator(WorkerCommunicator):
740
833
  )
741
834
  if self._ingest_requests.get(block=True) is _Sentinel.NO_MORE_INGEST_REQUESTS:
742
835
  self._n_requesters_done += 1
743
- self.log.verbose("Sending completion message.")
744
- self._reports.put(_WorkerDone(self.name))
836
+ self.log.verbose("Sending done sentinal.")
837
+ self._reports.put(_Sentinel.INGESTER_DONE, block=False)
745
838
  return result
746
839
 
747
840
  def report_ingest(self, n_producers: int) -> None:
@@ -752,7 +845,7 @@ class IngesterCommunicator(WorkerCommunicator):
752
845
  n_producers : `int`
753
846
  Number of producing quanta whose datasets were ingested.
754
847
  """
755
- self._reports.put(_IngestReport(n_producers))
848
+ self._reports.put(_IngestReport(n_producers), block=False)
756
849
 
757
850
  def poll(self) -> Iterator[IngestRequest]:
758
851
  """Poll for ingest requests from the scanner workers.
@@ -769,7 +862,7 @@ class IngesterCommunicator(WorkerCommunicator):
769
862
  """
770
863
  while True:
771
864
  self.check_for_cancel()
772
- ingest_request = self._ingest_requests.get(block=True, timeout=_TINY_TIMEOUT)
865
+ ingest_request = _get_from_queue(self._ingest_requests, block=True, timeout=_TINY_TIMEOUT)
773
866
  if ingest_request is _Sentinel.NO_MORE_INGEST_REQUESTS:
774
867
  self._n_requesters_done += 1
775
868
  if self._n_requesters_done == self.n_scanners:
@@ -791,7 +884,7 @@ class WriterCommunicator(WorkerCommunicator):
791
884
 
792
885
  def __init__(self, supervisor: SupervisorCommunicator):
793
886
  assert supervisor._write_requests is not None
794
- super().__init__(supervisor, self.get_worker_name())
887
+ super().__init__(supervisor, "writer")
795
888
  self.n_scanners = supervisor.n_scanners
796
889
  self._write_requests = supervisor._write_requests
797
890
  self._compression_dict = supervisor._compression_dict
@@ -799,10 +892,6 @@ class WriterCommunicator(WorkerCommunicator):
799
892
  self._n_requesters_done = 0
800
893
  self._sent_compression_dict = False
801
894
 
802
- @staticmethod
803
- def get_worker_name() -> str:
804
- return "writer"
805
-
806
895
  def __exit__(
807
896
  self,
808
897
  exc_type: type[BaseException] | None,
@@ -820,20 +909,16 @@ class WriterCommunicator(WorkerCommunicator):
820
909
  )
821
910
  if self._write_requests.get(block=True) is _Sentinel.NO_MORE_WRITE_REQUESTS:
822
911
  self._n_requesters_done += 1
823
- if self._compression_dict.clear():
824
- self.log.verbose("Cleared out compression dictionary queue.")
825
- else:
826
- self.log.verbose("Compression dictionary queue was already empty.")
827
- self.log.verbose("Sending completion message.")
828
- self._reports.put(_WorkerDone(self.name))
912
+ self.log.verbose("Sending done sentinal.")
913
+ self._reports.put(_Sentinel.WRITER_DONE, block=False)
829
914
  return result
830
915
 
831
- def poll(self) -> Iterator[ProvenanceQuantumScanData]:
916
+ def poll(self) -> Iterator[WriteRequest]:
832
917
  """Poll for writer requests from the scanner workers and supervisor.
833
918
 
834
919
  Yields
835
920
  ------
836
- request : `ProvenanceQuantumScanData`
921
+ request : `WriteRequest`
837
922
  The result of a quantum scan.
838
923
 
839
924
  Notes
@@ -843,7 +928,7 @@ class WriterCommunicator(WorkerCommunicator):
843
928
  """
844
929
  while True:
845
930
  self.check_for_cancel()
846
- write_request = self._write_requests.get(block=True, timeout=_TINY_TIMEOUT)
931
+ write_request = _get_from_queue(self._write_requests, block=True, timeout=_TINY_TIMEOUT)
847
932
  if write_request is _Sentinel.NO_MORE_WRITE_REQUESTS:
848
933
  self._n_requesters_done += 1
849
934
  if self._n_requesters_done == self._n_requesters:
@@ -863,16 +948,16 @@ class WriterCommunicator(WorkerCommunicator):
863
948
  """
864
949
  self.log.debug("Sending compression dictionary.")
865
950
  for _ in range(self.n_scanners):
866
- self._compression_dict.put(_CompressionDictionary(cdict_data))
951
+ self._compression_dict.put(_CompressionDictionary(cdict_data), block=False)
867
952
  self._sent_compression_dict = True
868
953
 
869
954
  def report_write(self) -> None:
870
955
  """Report to the supervisor that provenance for a quantum was written
871
956
  to the graph.
872
957
  """
873
- self._reports.put(_Sentinel.WRITE_REPORT)
958
+ self._reports.put(_Sentinel.WRITE_REPORT, block=False)
874
959
 
875
- def periodically_check_for_cancel[T](self, iterable: Iterable[T], n: int = 100) -> Iterator[T]:
960
+ def periodically_check_for_cancel(self, iterable: Iterable[_T], n: int = 100) -> Iterator[_T]:
876
961
  """Iterate while checking for a cancellation signal every ``n``
877
962
  iterations.
878
963