lsst-pipe-base 30.0.0rc2__py3-none-any.whl → 30.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (69)
  1. lsst/pipe/base/_instrument.py +31 -20
  2. lsst/pipe/base/_quantumContext.py +3 -3
  3. lsst/pipe/base/_status.py +43 -10
  4. lsst/pipe/base/_task_metadata.py +2 -2
  5. lsst/pipe/base/all_dimensions_quantum_graph_builder.py +8 -3
  6. lsst/pipe/base/automatic_connection_constants.py +20 -1
  7. lsst/pipe/base/cli/cmd/__init__.py +18 -2
  8. lsst/pipe/base/cli/cmd/commands.py +149 -4
  9. lsst/pipe/base/connectionTypes.py +72 -160
  10. lsst/pipe/base/connections.py +6 -9
  11. lsst/pipe/base/execution_reports.py +0 -5
  12. lsst/pipe/base/graph/graph.py +11 -10
  13. lsst/pipe/base/graph/quantumNode.py +4 -4
  14. lsst/pipe/base/graph_walker.py +8 -10
  15. lsst/pipe/base/log_capture.py +40 -80
  16. lsst/pipe/base/log_on_close.py +76 -0
  17. lsst/pipe/base/mp_graph_executor.py +51 -15
  18. lsst/pipe/base/pipeline.py +5 -6
  19. lsst/pipe/base/pipelineIR.py +2 -8
  20. lsst/pipe/base/pipelineTask.py +5 -7
  21. lsst/pipe/base/pipeline_graph/_dataset_types.py +2 -2
  22. lsst/pipe/base/pipeline_graph/_edges.py +32 -22
  23. lsst/pipe/base/pipeline_graph/_mapping_views.py +4 -7
  24. lsst/pipe/base/pipeline_graph/_pipeline_graph.py +14 -7
  25. lsst/pipe/base/pipeline_graph/expressions.py +2 -2
  26. lsst/pipe/base/pipeline_graph/io.py +7 -10
  27. lsst/pipe/base/pipeline_graph/visualization/_dot.py +13 -12
  28. lsst/pipe/base/pipeline_graph/visualization/_layout.py +16 -18
  29. lsst/pipe/base/pipeline_graph/visualization/_merge.py +4 -7
  30. lsst/pipe/base/pipeline_graph/visualization/_printer.py +10 -10
  31. lsst/pipe/base/pipeline_graph/visualization/_status_annotator.py +7 -0
  32. lsst/pipe/base/prerequisite_helpers.py +2 -1
  33. lsst/pipe/base/quantum_graph/_common.py +19 -20
  34. lsst/pipe/base/quantum_graph/_multiblock.py +37 -31
  35. lsst/pipe/base/quantum_graph/_predicted.py +113 -15
  36. lsst/pipe/base/quantum_graph/_provenance.py +1136 -45
  37. lsst/pipe/base/quantum_graph/aggregator/__init__.py +0 -1
  38. lsst/pipe/base/quantum_graph/aggregator/_communicators.py +204 -289
  39. lsst/pipe/base/quantum_graph/aggregator/_config.py +87 -9
  40. lsst/pipe/base/quantum_graph/aggregator/_ingester.py +13 -12
  41. lsst/pipe/base/quantum_graph/aggregator/_scanner.py +49 -235
  42. lsst/pipe/base/quantum_graph/aggregator/_structs.py +6 -116
  43. lsst/pipe/base/quantum_graph/aggregator/_supervisor.py +29 -39
  44. lsst/pipe/base/quantum_graph/aggregator/_workers.py +303 -0
  45. lsst/pipe/base/quantum_graph/aggregator/_writer.py +34 -351
  46. lsst/pipe/base/quantum_graph/formatter.py +171 -0
  47. lsst/pipe/base/quantum_graph/ingest_graph.py +413 -0
  48. lsst/pipe/base/quantum_graph/visualization.py +5 -1
  49. lsst/pipe/base/quantum_graph_builder.py +33 -9
  50. lsst/pipe/base/quantum_graph_executor.py +116 -13
  51. lsst/pipe/base/quantum_graph_skeleton.py +31 -35
  52. lsst/pipe/base/quantum_provenance_graph.py +29 -12
  53. lsst/pipe/base/separable_pipeline_executor.py +19 -3
  54. lsst/pipe/base/single_quantum_executor.py +67 -42
  55. lsst/pipe/base/struct.py +4 -0
  56. lsst/pipe/base/testUtils.py +3 -3
  57. lsst/pipe/base/tests/mocks/_storage_class.py +2 -1
  58. lsst/pipe/base/version.py +1 -1
  59. {lsst_pipe_base-30.0.0rc2.dist-info → lsst_pipe_base-30.0.1.dist-info}/METADATA +3 -3
  60. lsst_pipe_base-30.0.1.dist-info/RECORD +129 -0
  61. {lsst_pipe_base-30.0.0rc2.dist-info → lsst_pipe_base-30.0.1.dist-info}/WHEEL +1 -1
  62. lsst_pipe_base-30.0.0rc2.dist-info/RECORD +0 -125
  63. {lsst_pipe_base-30.0.0rc2.dist-info → lsst_pipe_base-30.0.1.dist-info}/entry_points.txt +0 -0
  64. {lsst_pipe_base-30.0.0rc2.dist-info → lsst_pipe_base-30.0.1.dist-info}/licenses/COPYRIGHT +0 -0
  65. {lsst_pipe_base-30.0.0rc2.dist-info → lsst_pipe_base-30.0.1.dist-info}/licenses/LICENSE +0 -0
  66. {lsst_pipe_base-30.0.0rc2.dist-info → lsst_pipe_base-30.0.1.dist-info}/licenses/bsd_license.txt +0 -0
  67. {lsst_pipe_base-30.0.0rc2.dist-info → lsst_pipe_base-30.0.1.dist-info}/licenses/gpl-v3.0.txt +0 -0
  68. {lsst_pipe_base-30.0.0rc2.dist-info → lsst_pipe_base-30.0.1.dist-info}/top_level.txt +0 -0
  69. {lsst_pipe_base-30.0.0rc2.dist-info → lsst_pipe_base-30.0.1.dist-info}/zip-safe +0 -0
@@ -31,154 +31,33 @@ __all__ = (
  "FatalWorkerError",
  "IngesterCommunicator",
  "ScannerCommunicator",
- "SpawnProcessContext",
  "SupervisorCommunicator",
- "ThreadingContext",
- "WorkerContext",
  )

  import cProfile
  import dataclasses
  import enum
  import logging
- import multiprocessing.context
- import multiprocessing.synchronize
  import os
- import queue
  import signal
- import threading
  import time
  import uuid
- from abc import ABC, abstractmethod
- from collections.abc import Callable, Iterable, Iterator
- from contextlib import AbstractContextManager, ExitStack, contextmanager
+ from collections.abc import Iterable, Iterator
+ from contextlib import ExitStack
  from traceback import format_exception
  from types import TracebackType
- from typing import Any, Literal, Self, TypeAlias, TypeVar, Union
+ from typing import Literal, Self, overload

- from lsst.utils.logging import VERBOSE, LsstLogAdapter
+ from lsst.utils.logging import LsstLogAdapter

+ from .._provenance import ProvenanceQuantumScanData
  from ._config import AggregatorConfig
  from ._progress import ProgressManager, make_worker_log
- from ._structs import IngestRequest, ScanReport, WriteRequest
-
- _T = TypeVar("_T")
+ from ._structs import IngestRequest, ScanReport
+ from ._workers import Event, Queue, Worker, WorkerFactory

  _TINY_TIMEOUT = 0.01

- # multiprocessing.Queue is a type according to the standard library type stubs,
- # but it's really a function at runtime. But since the Python <= 3.11 type
- # alias syntax uses the real runtime things we need to use strings, and hence
- # we need to use Union. With Python 3.12's 'type' statement this gets cleaner.
- Queue: TypeAlias = Union["queue.Queue[_T]", "multiprocessing.Queue[_T]"]
-
- Event: TypeAlias = threading.Event | multiprocessing.synchronize.Event
-
- Worker: TypeAlias = threading.Thread | multiprocessing.context.SpawnProcess
-
-
- class WorkerContext(ABC):
- """A simple abstract interface that can be implemented by both threading
- and multiprocessing.
- """
-
- @abstractmethod
- def make_queue(self) -> Queue[Any]:
- """Make an empty queue that can be used to pass objects between
- workers in this context.
- """
- raise NotImplementedError()
-
- @abstractmethod
- def make_event(self) -> Event:
- """Make an event that can be used to communicate a boolean state change
- to workers in this context.
- """
- raise NotImplementedError()
-
- @abstractmethod
- def make_worker(
- self, target: Callable[..., None], args: tuple[Any, ...], name: str | None = None
- ) -> Worker:
- """Make a worker that runs the given callable.
-
- Parameters
- ----------
- target : `~collections.abc.Callable`
- A callable to invoke on the worker.
- args : `tuple`
- Positional arguments to pass to the callable.
- name : `str`, optional
- Human-readable name for the worker.
-
- Returns
- -------
- worker : `threading.Thread` or `multiprocessing.Process`
- Process or thread. Will need to have its ``start`` method called
- to actually begin.
- """
- raise NotImplementedError()
-
-
- class ThreadingContext(WorkerContext):
- """An implementation of `WorkerContext` backed by the `threading`
- module.
- """
-
- def make_queue(self) -> Queue[Any]:
- return queue.Queue()
-
- def make_event(self) -> Event:
- return threading.Event()
-
- def make_worker(
- self, target: Callable[..., None], args: tuple[Any, ...], name: str | None = None
- ) -> Worker:
- return threading.Thread(target=target, args=args, name=name)
-
-
- class SpawnProcessContext(WorkerContext):
- """An implementation of `WorkerContext` backed by the `multiprocessing`
- module, with new processes started by spawning.
- """
-
- def __init__(self) -> None:
- self._ctx = multiprocessing.get_context("spawn")
-
- def make_queue(self) -> Queue[Any]:
- return self._ctx.Queue()
-
- def make_event(self) -> Event:
- return self._ctx.Event()
-
- def make_worker(
- self, target: Callable[..., None], args: tuple[Any, ...], name: str | None = None
- ) -> Worker:
- return self._ctx.Process(target=target, args=args, name=name)
-
-
- def _get_from_queue(q: Queue[_T], block: bool = False, timeout: float | None = None) -> _T | None:
- """Get an object from a queue and return `None` if it is empty.
-
- Parameters
- ----------
- q : `Queue`
- Queue to get an object from.
- block : `bool`
- Whether to block until an object is available.
- timeout : `float` or `None`, optional
- Maximum number of seconds to wait while blocking.
-
- Returns
- -------
- obj : `object` or `None`
- Object from the queue, or `None` if it was empty.
- """
- try:
- return q.get(block=block, timeout=timeout)
- except queue.Empty:
- return None
-

  class FatalWorkerError(BaseException):
  """An exception raised by communicators when one worker (including the
@@ -187,6 +66,12 @@ class FatalWorkerError(BaseException):
  """


+ class _WorkerCommunicationError(Exception):
+ """An exception raised by communicators when a worker has died unexpectedly
+ or become unresponsive.
+ """
+
+
  class _Sentinel(enum.Enum):
  """Sentinel values used to indicate sequence points or worker shutdown
  conditions.
@@ -213,21 +98,6 @@ class _Sentinel(enum.Enum):
  quantum's provenance was written.
  """

- SCANNER_DONE = enum.auto()
- """Sentinel sent from scanners to the supervisor to report that they are
- done and shutting down.
- """
-
- INGESTER_DONE = enum.auto()
- """Sentinel sent from the ingester to the supervisor to report that it is
- done and shutting down.
- """
-
- WRITER_DONE = enum.auto()
- """Sentinel sent from the writer to the supervisor to report that it is
- done and shutting down.
- """
-

  @dataclasses.dataclass
  class _WorkerErrorMessage:
@@ -277,6 +147,16 @@ class _IngestReport:
  """


+ @dataclasses.dataclass
+ class _WorkerDone:
+ """An internal struct passed from a worker to the supervisor when it has
+ successfully completed all work.
+ """
+
+ name: str
+ """Name of the worker reporting completion."""
+
+
  @dataclasses.dataclass
  class _ProgressLog:
  """A high-level log message sent from a worker to the supervisor.
@@ -303,20 +183,22 @@ class _CompressionDictionary:
  """


- Report: TypeAlias = (
+ type Report = (
  ScanReport
  | _IngestReport
  | _WorkerErrorMessage
  | _ProgressLog
- | Literal[
- _Sentinel.WRITE_REPORT,
- _Sentinel.SCANNER_DONE,
- _Sentinel.INGESTER_DONE,
- _Sentinel.WRITER_DONE,
- ]
+ | _WorkerDone
+ | Literal[_Sentinel.WRITE_REPORT]
  )


+ def _disable_resources_parallelism() -> None:
+ os.environ["LSST_RESOURCES_NUM_WORKERS"] = "1"
+ os.environ.pop("LSST_RESOURCES_EXECUTOR", None)
+ os.environ["LSST_S3_USE_THREADS"] = "False"
+
+
  class SupervisorCommunicator:
  """A helper object that lets the supervisor direct the other workers.

@@ -326,7 +208,7 @@ class SupervisorCommunicator:
  LSST-customized logger.
  n_scanners : `int`
  Number of scanner workers.
- context : `WorkerContext`
+ worker_factory : `WorkerFactory`
  Abstraction over threading vs. multiprocessing.
  config : `AggregatorConfig`
  Configuration for the aggregator.
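
The `worker_factory` parameter is documented above only as "abstraction over threading vs. multiprocessing"; the concrete `WorkerFactory` now lives in the new `_workers.py` module (file 44 in the list), which this diff does not display. For orientation only, a thread-backed stand-in with the three methods the removed `WorkerContext` defined, and the kind of wiring a supervisor might do with it, could look like the sketch below; `ThreadWorkerFactory` and `run_scanner` are illustrative names, not the package's API:

    import queue
    import threading
    from collections.abc import Callable
    from typing import Any


    def run_scanner(scanner_id: int) -> None:
        # Stand-in for the real scanner loop.
        print(f"scanner {scanner_id} running")


    class ThreadWorkerFactory:
        """Minimal threading-backed stand-in for a worker factory."""

        def make_queue(self) -> queue.Queue:
            return queue.Queue()

        def make_event(self) -> threading.Event:
            return threading.Event()

        def make_worker(
            self, target: Callable[..., None], args: tuple[Any, ...], name: str | None = None
        ) -> threading.Thread:
            return threading.Thread(target=target, args=args, name=name)


    factory = ThreadWorkerFactory()
    workers = {
        f"scanner-{i:03d}": factory.make_worker(run_scanner, (i,), name=f"scanner-{i:03d}")
        for i in range(2)
    }
    for w in workers.values():
        w.start()
    for w in workers.values():
        w.join()
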
@@ -336,7 +218,7 @@ class SupervisorCommunicator:
  self,
  log: LsstLogAdapter,
  n_scanners: int,
- context: WorkerContext,
+ worker_factory: WorkerFactory,
  config: AggregatorConfig,
  ) -> None:
  self.config = config
@@ -346,14 +228,14 @@ class SupervisorCommunicator:
  # When complete, the supervisor sends n_scanners sentinals and each
  # scanner is careful to only take one before it starts its shutdown.
  self._scan_requests: Queue[_ScanRequest | Literal[_Sentinel.NO_MORE_SCAN_REQUESTS]] = (
- context.make_queue()
+ worker_factory.make_queue()
  )
  # The scanners send ingest requests to the ingester on this queue. Each
  # scanner sends one sentinal when it is done, and the ingester is
  # careful to wait for n_scanners sentinals to arrive before it starts
  # its shutdown.
  self._ingest_requests: Queue[IngestRequest | Literal[_Sentinel.NO_MORE_INGEST_REQUESTS]] = (
- context.make_queue()
+ worker_factory.make_queue()
  )
  # The scanners send write requests to the writer on this queue (which
  # will be `None` if we're not writing). The supervisor also sends
@@ -361,24 +243,24 @@ class SupervisorCommunicator:
  # scanner and the supervisor send one sentinal when done, and the
  # writer waits for (n_scanners + 1) sentinals to arrive before it
  # starts its shutdown.
- self._write_requests: Queue[WriteRequest | Literal[_Sentinel.NO_MORE_WRITE_REQUESTS]] | None = (
- context.make_queue() if config.output_path is not None else None
- )
+ self._write_requests: (
+ Queue[ProvenanceQuantumScanData | Literal[_Sentinel.NO_MORE_WRITE_REQUESTS]] | None
+ ) = worker_factory.make_queue() if config.is_writing_provenance else None
  # All other workers use this queue to send many different kinds of
  # reports the supervisor. The supervisor waits for a _DONE sentinal
  # from each worker before it finishes its shutdown.
- self._reports: Queue[Report] = context.make_queue()
+ self._reports: Queue[Report] = worker_factory.make_queue()
  # The writer sends the compression dictionary to the scanners on this
  # queue. It puts n_scanners copies on the queue, and each scanner only
  # takes one. The compression_dict queue has no sentinal because it is
  # only used at most once; the supervisor takes responsibility for
  # clearing it out shutting down.
- self._compression_dict: Queue[_CompressionDictionary] = context.make_queue()
+ self._compression_dict: Queue[_CompressionDictionary] = worker_factory.make_queue()
  # The supervisor sets this event when it receives an interrupt request
  # from an exception in the main process (usually KeyboardInterrupt).
  # Worker communicators check this in their polling loops and raise
  # FatalWorkerError when they see it set.
- self._cancel_event: Event = context.make_event()
+ self._cancel_event: Event = worker_factory.make_event()
  # Track what state we are in closing down, so we can start at the right
  # point if we're interrupted and __exit__ needs to clean up. Note that
  # we can't rely on a non-exception __exit__ to do any shutdown work
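
The comments above describe a sentinel handshake: the supervisor fans out exactly n_scanners shutdown sentinels (each scanner takes one), and each scanner fans one sentinel in to the ingester, which waits for n_scanners of them before shutting down. A self-contained illustration of that pattern with plain threads and queues; the names here are illustrative, not the aggregator's API:

    import queue
    import threading

    N_SCANNERS = 3
    NO_MORE_WORK = object()      # stand-in for _Sentinel.NO_MORE_SCAN_REQUESTS
    NO_MORE_RESULTS = object()   # stand-in for _Sentinel.NO_MORE_INGEST_REQUESTS

    work: queue.Queue = queue.Queue()
    results: queue.Queue = queue.Queue()

    def scanner() -> None:
        # Take work until exactly one shutdown sentinel arrives, then stop.
        while (item := work.get()) is not NO_MORE_WORK:
            results.put(item * 2)
        # Fan-in: each scanner sends exactly one "done" sentinel downstream.
        results.put(NO_MORE_RESULTS)

    def ingester() -> None:
        done = 0
        # Wait for one sentinel per scanner before shutting down.
        while done < N_SCANNERS:
            if (item := results.get()) is NO_MORE_RESULTS:
                done += 1
            else:
                print("ingested", item)

    threads = [threading.Thread(target=scanner) for _ in range(N_SCANNERS)]
    threads.append(threading.Thread(target=ingester))
    for t in threads:
        t.start()
    for i in range(10):
        work.put(i)
    # Fan-out: one shutdown sentinel per scanner.
    for _ in range(N_SCANNERS):
        work.put(NO_MORE_WORK)
    for t in threads:
        t.join()
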
@@ -387,51 +269,77 @@ class SupervisorCommunicator:
  self._sent_no_more_scan_requests = False
  self._sent_no_more_write_requests = False
  self._n_scanners_done = 0
- self._ingester_done = False
- self._writer_done = self._write_requests is None
+ self.workers: dict[str, Worker] = {}

- def wait_for_workers_to_finish(self, already_failing: bool = False) -> None:
+ def _wait_for_workers_to_finish(self, already_failing: bool = False) -> None:
+ # Orderly shutdown, including exceptions: let workers clear out the
+ # queues they're responsible for reading from.
  if not self._sent_no_more_scan_requests:
  for _ in range(self.n_scanners):
- self._scan_requests.put(_Sentinel.NO_MORE_SCAN_REQUESTS, block=False)
+ self._scan_requests.put(_Sentinel.NO_MORE_SCAN_REQUESTS)
  self._sent_no_more_scan_requests = True
  if not self._sent_no_more_write_requests and self._write_requests is not None:
- self._write_requests.put(_Sentinel.NO_MORE_WRITE_REQUESTS, block=False)
+ self._write_requests.put(_Sentinel.NO_MORE_WRITE_REQUESTS)
  self._sent_no_more_write_requests = True
- while not (self._ingester_done and self._writer_done and self._n_scanners_done == self.n_scanners):
+ while not all(w.successful for w in self.workers.values()):
  match self._handle_progress_reports(
- self._reports.get(block=True), already_failing=already_failing
+ self._get_report(block=True), already_failing=already_failing
  ):
- case None | ScanReport() | _IngestReport():
+ case None | ScanReport():
  pass
- case _Sentinel.INGESTER_DONE:
- self._ingester_done = True
- self.progress.quantum_ingests.close()
- case _Sentinel.SCANNER_DONE:
- self._n_scanners_done += 1
- self.progress.scans.close()
- case _Sentinel.WRITER_DONE:
- self._writer_done = True
- self.progress.writes.close()
+ case _WorkerDone(name=worker_name):
+ self.workers[worker_name].successful = True
+ if worker_name == IngesterCommunicator.get_worker_name():
+ self.progress.quantum_ingests.close()
+ elif worker_name == WriterCommunicator.get_worker_name():
+ self.progress.writes.close()
+ else:
+ self._n_scanners_done += 1
+ if self._n_scanners_done == self.n_scanners:
+ self.progress.scans.close()
  case unexpected:
  raise AssertionError(f"Unexpected message {unexpected!r} to supervisor.")
  self.log.verbose(
- "Blocking on reports queue: ingester_done=%s, writer_done=%s, n_scanners_done=%s.",
- self._ingester_done,
- self._writer_done,
- self._n_scanners_done,
+ "Waiting for workers [%s] to report successful completion.",
+ ", ".join(w.name for w in self.workers.values() if not w.successful),
  )
- while _get_from_queue(self._compression_dict) is not None:
- self.log.verbose("Flushing compression dict queue.")
  self.log.verbose("Checking that all queues are empty.")
- self._expect_empty_queue(self._scan_requests)
- self._expect_empty_queue(self._ingest_requests)
+ if self._scan_requests.clear():
+ self.progress.log.warning("Scan request queue was not empty at shutdown.")
+ self._scan_requests.kill()
+ if self._ingest_requests.clear():
+ self.progress.log.warning("Ingest request queue was not empty at shutdown.")
+ self._ingest_requests.kill()
+ if self._write_requests is not None and self._write_requests.clear():
+ self.progress.log.warning("Write request queue was not empty at shutdown.")
+ self._write_requests.kill()
+ if self._reports.clear():
+ self.progress.log.warning("Reports queue was not empty at shutdown.")
+ self._reports.kill()
+ if self._compression_dict.clear():
+ self.progress.log.warning("Compression dictionary queue was not empty at shutdown.")
+ self._compression_dict.kill()
+ for worker in self.workers.values():
+ self.log.verbose("Waiting for %s to shut down.", worker.name)
+ worker.join()
+
+ def _terminate(self) -> None:
+ # Disorderly shutdown: we cannot assume any of the
+ # multiprocessing.Queue object work, and in fact they may hang
+ # if we try to do anything with them.
+ self._scan_requests.kill()
+ self._ingest_requests.kill()
  if self._write_requests is not None:
- self._expect_empty_queue(self._write_requests)
- self._expect_empty_queue(self._reports)
- self._expect_empty_queue(self._compression_dict)
+ self._write_requests.kill()
+ self._compression_dict.kill()
+ self._reports.kill()
+ for name, worker in self.workers.items():
+ if worker.is_alive():
+ self.progress.log.critical("Terminating worker %r.", name)
+ worker.kill()

  def __enter__(self) -> Self:
+ _disable_resources_parallelism()
  self.progress.__enter__()
  # We make the low-level logger in __enter__ instead of __init__ only
  # because that's the pattern used by true workers (where it matters).
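
The shutdown code above calls `clear()` and `kill()` on the queue wrappers imported from the new `_workers.py` module, which this diff does not show. Judging only from how they are used (`clear()` is truthy when leftover items were found, `kill()` makes it safe to abandon a possibly broken multiprocessing queue), a rough sketch of that kind of wrapper might look like the following; the method bodies are assumptions, not the package's implementation:

    import multiprocessing
    import queue
    from typing import Any


    class MpQueueWrapper:
        """Hypothetical wrapper illustrating clear()/kill() semantics inferred
        from the calls above; the real class in _workers.py is not shown here.
        """

        def __init__(self) -> None:
            self._queue = multiprocessing.get_context("spawn").Queue()

        def put(self, item: Any) -> None:
            self._queue.put(item)

        def get(self, block: bool = False, timeout: float | None = None) -> Any | None:
            # Absorb queue.Empty and return None, like the removed
            # _get_from_queue helper did.
            try:
                return self._queue.get(block=block, timeout=timeout)
            except queue.Empty:
                return None

        def clear(self) -> int:
            """Drain the queue, returning how many leftover items were found."""
            n = 0
            while self.get(block=False) is not None:
                n += 1
            return n

        def kill(self) -> None:
            """Close the queue and abandon its feeder thread without flushing."""
            self._queue.cancel_join_thread()
            self._queue.close()
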
@@ -445,11 +353,23 @@ class SupervisorCommunicator:
  traceback: TracebackType | None,
  ) -> None:
  if exc_type is not None:
- if exc_type is not FatalWorkerError:
- self.progress.log.critical(f"Caught {exc_type.__name__}; attempting to shut down cleanly.")
  self._cancel_event.set()
- self.wait_for_workers_to_finish(already_failing=exc_type is not None)
+ if exc_type is _WorkerCommunicationError:
+ self.progress.log.critical("Worker '%s' was terminated before it could finish.", exc_value)
+ self._terminate()
+ return None
+ if exc_type is not FatalWorkerError:
+ self.progress.log.critical("Caught %s; attempting to shut down cleanly.", exc_type)
+ try:
+ self._wait_for_workers_to_finish(already_failing=exc_type is not None)
+ except _WorkerCommunicationError as err:
+ self.progress.log.critical(
+ "Worker '%s' was terminated before it could finish (after scanning).", err
+ )
+ self._terminate()
+ raise
  self.progress.__exit__(exc_type, exc_value, traceback)
+ return None

  def request_scan(self, quantum_id: uuid.UUID) -> None:
  """Send a request to the scanners to scan the given quantum.
@@ -459,19 +379,19 @@ class SupervisorCommunicator:
  quantum_id : `uuid.UUID`
  ID of the quantum to scan.
  """
- self._scan_requests.put(_ScanRequest(quantum_id), block=False)
+ self._scan_requests.put(_ScanRequest(quantum_id))

- def request_write(self, request: WriteRequest) -> None:
+ def request_write(self, request: ProvenanceQuantumScanData) -> None:
  """Send a request to the writer to write provenance for the given scan.

  Parameters
  ----------
- request : `WriteRequest`
+ request : `ProvenanceQuantumScanData`
  Information from scanning a quantum (or knowing you don't have to,
  in the case of blocked quanta).
  """
  assert self._write_requests is not None, "Writer should not be used if writing is disabled."
- self._write_requests.put(request, block=False)
+ self._write_requests.put(request)

  def poll(self) -> Iterator[ScanReport]:
  """Poll for reports from workers while sending scan requests.
@@ -487,9 +407,8 @@ class SupervisorCommunicator:
  it continues until the report queue is empty.
  """
  block = True
- msg = _get_from_queue(self._reports, block=block)
- while msg is not None:
- match self._handle_progress_reports(msg):
+ while report := self._get_report(block=block):
+ match self._handle_progress_reports(report):
  case ScanReport() as scan_report:
  block = False
  yield scan_report
@@ -497,19 +416,40 @@ class SupervisorCommunicator:
  pass
  case unexpected:
  raise AssertionError(f"Unexpected message {unexpected!r} to supervisor.")
- msg = _get_from_queue(self._reports, block=block)
+
+ @overload
+ def _get_report(self, block: Literal[True]) -> Report: ...
+
+ @overload
+ def _get_report(self, block: bool) -> Report | None: ...
+
+ def _get_report(self, block: bool) -> Report | None:
+ """Get a report from the reports queue, with timeout guards on
+ blocking requests.
+
+ This method may *return* WorkerCommunicatorError (rather than raise it)
+ when a serious error occurred communicating with a subprocess. This
+ is to avoid raising an exception in an __exit__ method (which calls
+ method).
+ """
+ report = self._reports.get(block=block, timeout=self.config.worker_check_timeout)
+ while report is None and block:
+ # We hit the timeout; make sure all of the workers
+ # that should be alive actually are.
+ for name, worker in self.workers.items():
+ if not worker.successful and not worker.is_alive():
+ # Delete this worker from the list of workers so we don't
+ # hit this condition again when we try to handle the
+ # exception we raise.
+ raise _WorkerCommunicationError(name)
+ # If nothing is dead and we didn't hit the hang timeout, keep
+ # trying.
+ report = self._reports.get(block=block, timeout=self.config.worker_check_timeout)
+ return report

  def _handle_progress_reports(
  self, report: Report, already_failing: bool = False
- ) -> (
- ScanReport
- | Literal[
- _Sentinel.SCANNER_DONE,
- _Sentinel.INGESTER_DONE,
- _Sentinel.WRITER_DONE,
- ]
- | None
- ):
+ ) -> ScanReport | _WorkerDone | None:
  """Handle reports to the supervisor that can appear at any time, and
  are typically just updates to the progress we've made.

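
The new `_get_report` above relies on `typing.overload` so that callers passing `block=True` get a non-optional return type while the general signature stays `Report | None`. The same pattern in a self-contained form, with illustrative names:

    from typing import Literal, overload


    @overload
    def fetch(block: Literal[True]) -> str: ...
    @overload
    def fetch(block: bool) -> str | None: ...
    def fetch(block: bool) -> str | None:
        # Only this last, un-decorated definition has a body; the overloads
        # above exist purely for the type checker.
        return "report" if block else None


    value: str = fetch(True)         # the checker knows this cannot be None
    maybe: str | None = fetch(False)
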
@@ -539,15 +479,9 @@ class SupervisorCommunicator:
  return report
  return None

- @staticmethod
- def _expect_empty_queue(queue: Queue[Any]) -> None:
- """Assert that the given queue is empty."""
- if (msg := _get_from_queue(queue, block=False, timeout=0)) is not None:
- raise AssertionError(f"Queue is not empty; found {msg!r}.")
-

  class WorkerCommunicator:
- """A base class for non-supervisor workers.
+ """A base class for non-supervisor worker communicators.

  Parameters
  ----------
@@ -559,8 +493,8 @@ class WorkerCommunicator:
  Notes
  -----
  Each worker communicator is constructed in the main process and entered as
- a context manager on the actual worker process, so attributes that cannot
- be pickled are constructed in ``__enter__`` instead of ``__init__``.
+ a context manager *only* on the actual worker process, so attributes that
+ cannot be pickled are constructed in ``__enter__`` instead of ``__init__``.

  Worker communicators provide access to an `AggregatorConfig` and a logger
  to their workers. As context managers, they handle exceptions and ensure
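
The note above is the reason several attributes are created in `__enter__`: the communicator instance is pickled when it is shipped to a spawned worker process, and unpicklable members (locks, open resources, an `ExitStack` holding them) would not survive that trip. A minimal illustration of the pattern, with illustrative names:

    import pickle
    import threading
    from types import TracebackType
    from typing import Self


    class PicklableHelper:
        """Carry only picklable configuration across the process boundary and
        build the unpicklable parts in __enter__, on the worker side.
        """

        def __init__(self, name: str) -> None:
            self.name = name  # plain data: safe to pickle

        def __enter__(self) -> Self:
            # Created after unpickling, inside the worker process.
            self._lock = threading.Lock()
            return self

        def __exit__(
            self,
            exc_type: type[BaseException] | None,
            exc_value: BaseException | None,
            traceback: TracebackType | None,
        ) -> None:
            pass


    # Pickling succeeds because only the __init__-time attributes exist here.
    restored = pickle.loads(pickle.dumps(PicklableHelper("scanner-000")))
    with restored:
        pass
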
@@ -580,6 +514,7 @@ class WorkerCommunicator:
  self._cancel_event = supervisor._cancel_event

  def __enter__(self) -> Self:
+ _disable_resources_parallelism()
  self.log = make_worker_log(self.name, self.config)
  self.log.verbose("%s has PID %s (parent is %s).", self.name, os.getpid(), os.getppid())
  self._exit_stack = ExitStack().__enter__()
@@ -612,8 +547,7 @@ class WorkerCommunicator:
  _WorkerErrorMessage(
  self.name,
  "".join(format_exception(exc_type, exc_value, traceback)),
- ),
- block=False,
+ )
  )
  self.log.debug("Error message sent to supervisor.")
  else:
@@ -621,6 +555,11 @@ class WorkerCommunicator:
  self._exit_stack.__exit__(exc_type, exc_value, traceback)
  return True

+ @property
+ def exit_stack(self) -> ExitStack:
+ """A `contextlib.ExitStack` tied to the communicator."""
+ return self._exit_stack
+
  def log_progress(self, level: int, message: str) -> None:
  """Send a high-level log message to the supervisor.

@@ -631,45 +570,7 @@ class WorkerCommunicator:
  message : `str`
  Log message.
  """
- self._reports.put(_ProgressLog(message=message, level=level), block=False)
-
- def enter(
- self,
- cm: AbstractContextManager[_T],
- on_close: str | None = None,
- level: int = VERBOSE,
- is_progress_log: bool = False,
- ) -> _T:
- """Enter a context manager that will be exited when the communicator's
- context is exited.
-
- Parameters
- ----------
- cm : `contextlib.AbstractContextManager`
- A context manager to enter.
- on_close : `str`, optional
- A log message to emit (on the worker's logger) just before the
- given context manager is exited. This can be used to indicate
- what's going on when an ``__exit__`` implementation has a lot of
- work to do (e.g. moving a large file into a zip archive).
- level : `int`, optional
- Level for the ``on_close`` log message.
- is_progress_log : `bool`, optional
- If `True`, send the ``on_close`` message to the supervisor via
- `log_progress` as well as the worker's logger.
- """
- if on_close is None:
- return self._exit_stack.enter_context(cm)
-
- @contextmanager
- def wrapper() -> Iterator[_T]:
- with cm as result:
- yield result
- self.log.log(level, on_close)
- if is_progress_log:
- self.log_progress(level, on_close)
-
- return self._exit_stack.enter_context(wrapper())
+ self._reports.put(_ProgressLog(message=message, level=level))

  def check_for_cancel(self) -> None:
  """Check for a cancel signal from the supervisor and raise
@@ -691,7 +592,7 @@ class ScannerCommunicator(WorkerCommunicator):
  """

  def __init__(self, supervisor: SupervisorCommunicator, scanner_id: int):
- super().__init__(supervisor, f"scanner-{scanner_id:03d}")
+ super().__init__(supervisor, self.get_worker_name(scanner_id))
  self.scanner_id = scanner_id
  self._scan_requests = supervisor._scan_requests
  self._ingest_requests = supervisor._ingest_requests
@@ -700,6 +601,10 @@ class ScannerCommunicator(WorkerCommunicator):
  self._got_no_more_scan_requests: bool = False
  self._sent_no_more_ingest_requests: bool = False

+ @staticmethod
+ def get_worker_name(scanner_id: int) -> str:
+ return f"scanner-{scanner_id:03d}"
+
  def report_scan(self, msg: ScanReport) -> None:
  """Report a completed scan to the supervisor.

@@ -708,7 +613,7 @@ class ScannerCommunicator(WorkerCommunicator):
  msg : `ScanReport`
  Report to send.
  """
- self._reports.put(msg, block=False)
+ self._reports.put(msg)

  def request_ingest(self, request: IngestRequest) -> None:
  """Ask the ingester to ingest a quantum's outputs.
@@ -724,20 +629,20 @@ class ScannerCommunicator(WorkerCommunicator):
  as complete to the supervisor instead of sending it to the ingester.
  """
  if request:
- self._ingest_requests.put(request, block=False)
+ self._ingest_requests.put(request)
  else:
- self._reports.put(_IngestReport(1), block=False)
+ self._reports.put(_IngestReport(1))

- def request_write(self, request: WriteRequest) -> None:
+ def request_write(self, request: ProvenanceQuantumScanData) -> None:
  """Ask the writer to write provenance for a quantum.

  Parameters
  ----------
- request : `WriteRequest`
+ request : `ProvenanceQuantumScanData`
  Result of scanning a quantum.
  """
  assert self._write_requests is not None, "Writer should not be used if writing is disabled."
- self._write_requests.put(request, block=False)
+ self._write_requests.put(request)

  def get_compression_dict(self) -> bytes | None:
  """Attempt to get the compression dict from the writer.
@@ -753,7 +658,7 @@ class ScannerCommunicator(WorkerCommunicator):
  A scanner should only call this method before it actually has the
  compression dict.
  """
- if (cdict := _get_from_queue(self._compression_dict)) is not None:
+ if (cdict := self._compression_dict.get()) is not None:
  return cdict.data
  return None

@@ -772,7 +677,7 @@ class ScannerCommunicator(WorkerCommunicator):
  """
  while True:
  self.check_for_cancel()
- scan_request = _get_from_queue(self._scan_requests, block=True, timeout=self.config.worker_sleep)
+ scan_request = self._scan_requests.get(block=True, timeout=self.config.worker_sleep)
  if scan_request is _Sentinel.NO_MORE_SCAN_REQUESTS:
  self._got_no_more_scan_requests = True
  return
@@ -786,20 +691,18 @@ class ScannerCommunicator(WorkerCommunicator):
  traceback: TracebackType | None,
  ) -> bool | None:
  result = super().__exit__(exc_type, exc_value, traceback)
- self._ingest_requests.put(_Sentinel.NO_MORE_INGEST_REQUESTS, block=False)
+ self._ingest_requests.put(_Sentinel.NO_MORE_INGEST_REQUESTS)
  if self._write_requests is not None:
- self._write_requests.put(_Sentinel.NO_MORE_WRITE_REQUESTS, block=False)
+ self._write_requests.put(_Sentinel.NO_MORE_WRITE_REQUESTS)
  while not self._got_no_more_scan_requests:
- self.log.debug("Clearing scan request queue (~%d remaining)", self._scan_requests.qsize())
  if (
  not self._got_no_more_scan_requests
- and self._scan_requests.get() is _Sentinel.NO_MORE_SCAN_REQUESTS
+ and self._scan_requests.get(block=True) is _Sentinel.NO_MORE_SCAN_REQUESTS
  ):
  self._got_no_more_scan_requests = True
- # We let the supervisor clear out the compression dict queue, because
- # a single scanner can't know if it ever got sent out or not.
- self.log.verbose("Sending done sentinal.")
- self._reports.put(_Sentinel.SCANNER_DONE, block=False)
+ # We let the writer clear out the compression dict queue.
+ self.log.verbose("Sending completion message.")
+ self._reports.put(_WorkerDone(self.name))
  return result


@@ -813,11 +716,15 @@ class IngesterCommunicator(WorkerCommunicator):
  """

  def __init__(self, supervisor: SupervisorCommunicator):
- super().__init__(supervisor, "ingester")
+ super().__init__(supervisor, self.get_worker_name())
  self.n_scanners = supervisor.n_scanners
  self._ingest_requests = supervisor._ingest_requests
  self._n_requesters_done = 0

+ @staticmethod
+ def get_worker_name() -> str:
+ return "ingester"
+
  def __exit__(
  self,
  exc_type: type[BaseException] | None,
@@ -833,8 +740,8 @@ class IngesterCommunicator(WorkerCommunicator):
  )
  if self._ingest_requests.get(block=True) is _Sentinel.NO_MORE_INGEST_REQUESTS:
  self._n_requesters_done += 1
- self.log.verbose("Sending done sentinal.")
- self._reports.put(_Sentinel.INGESTER_DONE, block=False)
+ self.log.verbose("Sending completion message.")
+ self._reports.put(_WorkerDone(self.name))
  return result

  def report_ingest(self, n_producers: int) -> None:
@@ -845,7 +752,7 @@ class IngesterCommunicator(WorkerCommunicator):
  n_producers : `int`
  Number of producing quanta whose datasets were ingested.
  """
- self._reports.put(_IngestReport(n_producers), block=False)
+ self._reports.put(_IngestReport(n_producers))

  def poll(self) -> Iterator[IngestRequest]:
  """Poll for ingest requests from the scanner workers.
@@ -862,7 +769,7 @@ class IngesterCommunicator(WorkerCommunicator):
  """
  while True:
  self.check_for_cancel()
- ingest_request = _get_from_queue(self._ingest_requests, block=True, timeout=_TINY_TIMEOUT)
+ ingest_request = self._ingest_requests.get(block=True, timeout=_TINY_TIMEOUT)
  if ingest_request is _Sentinel.NO_MORE_INGEST_REQUESTS:
  self._n_requesters_done += 1
  if self._n_requesters_done == self.n_scanners:
@@ -884,7 +791,7 @@ class WriterCommunicator(WorkerCommunicator):

  def __init__(self, supervisor: SupervisorCommunicator):
  assert supervisor._write_requests is not None
- super().__init__(supervisor, "writer")
+ super().__init__(supervisor, self.get_worker_name())
  self.n_scanners = supervisor.n_scanners
  self._write_requests = supervisor._write_requests
  self._compression_dict = supervisor._compression_dict
@@ -892,6 +799,10 @@ class WriterCommunicator(WorkerCommunicator):
  self._n_requesters_done = 0
  self._sent_compression_dict = False

+ @staticmethod
+ def get_worker_name() -> str:
+ return "writer"
+
  def __exit__(
  self,
  exc_type: type[BaseException] | None,
@@ -909,16 +820,20 @@ class WriterCommunicator(WorkerCommunicator):
  )
  if self._write_requests.get(block=True) is _Sentinel.NO_MORE_WRITE_REQUESTS:
  self._n_requesters_done += 1
- self.log.verbose("Sending done sentinal.")
- self._reports.put(_Sentinel.WRITER_DONE, block=False)
+ if self._compression_dict.clear():
+ self.log.verbose("Cleared out compression dictionary queue.")
+ else:
+ self.log.verbose("Compression dictionary queue was already empty.")
+ self.log.verbose("Sending completion message.")
+ self._reports.put(_WorkerDone(self.name))
  return result

- def poll(self) -> Iterator[WriteRequest]:
+ def poll(self) -> Iterator[ProvenanceQuantumScanData]:
  """Poll for writer requests from the scanner workers and supervisor.

  Yields
  ------
- request : `WriteRequest`
+ request : `ProvenanceQuantumScanData`
  The result of a quantum scan.

  Notes
@@ -928,7 +843,7 @@ class WriterCommunicator(WorkerCommunicator):
  """
  while True:
  self.check_for_cancel()
- write_request = _get_from_queue(self._write_requests, block=True, timeout=_TINY_TIMEOUT)
+ write_request = self._write_requests.get(block=True, timeout=_TINY_TIMEOUT)
  if write_request is _Sentinel.NO_MORE_WRITE_REQUESTS:
  self._n_requesters_done += 1
  if self._n_requesters_done == self._n_requesters:
@@ -948,16 +863,16 @@ class WriterCommunicator(WorkerCommunicator):
  """
  self.log.debug("Sending compression dictionary.")
  for _ in range(self.n_scanners):
- self._compression_dict.put(_CompressionDictionary(cdict_data), block=False)
+ self._compression_dict.put(_CompressionDictionary(cdict_data))
  self._sent_compression_dict = True

  def report_write(self) -> None:
  """Report to the supervisor that provenance for a quantum was written
  to the graph.
  """
- self._reports.put(_Sentinel.WRITE_REPORT, block=False)
+ self._reports.put(_Sentinel.WRITE_REPORT)

- def periodically_check_for_cancel(self, iterable: Iterable[_T], n: int = 100) -> Iterator[_T]:
+ def periodically_check_for_cancel[T](self, iterable: Iterable[T], n: int = 100) -> Iterator[T]:
  """Iterate while checking for a cancellation signal every ``n``
  iterations.
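
The last change above switches `periodically_check_for_cancel` to Python 3.12's inline type-parameter syntax (`def f[T](...)`) instead of a module-level `TypeVar`. A standalone sketch of both the syntax and the every-``n``-iterations check; the `threading.Event` here is a stand-in for the communicator's cancel event, and `FatalWorkerError` is replaced by a plain `RuntimeError`:

    import threading
    from collections.abc import Iterable, Iterator

    cancel_event = threading.Event()  # stand-in for the supervisor's cancel event


    def periodically_check[T](iterable: Iterable[T], n: int = 100) -> Iterator[T]:
        """Yield items, checking the cancel flag once every ``n`` items."""
        for i, item in enumerate(iterable):
            if i % n == 0 and cancel_event.is_set():
                raise RuntimeError("cancelled")  # the real code raises FatalWorkerError
            yield item


    # Equivalent pre-3.12 spelling, for comparison:
    #   _T = TypeVar("_T")
    #   def periodically_check(iterable: Iterable[_T], n: int = 100) -> Iterator[_T]: ...

    print(sum(periodically_check(range(1000), n=100)))
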