lsst-pipe-base 30.0.0rc3__py3-none-any.whl → 30.0.1rc1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (66) hide show
  1. lsst/pipe/base/_instrument.py +25 -15
  2. lsst/pipe/base/_quantumContext.py +3 -3
  3. lsst/pipe/base/_status.py +43 -10
  4. lsst/pipe/base/_task_metadata.py +2 -2
  5. lsst/pipe/base/all_dimensions_quantum_graph_builder.py +8 -3
  6. lsst/pipe/base/automatic_connection_constants.py +20 -1
  7. lsst/pipe/base/cli/cmd/__init__.py +18 -2
  8. lsst/pipe/base/cli/cmd/commands.py +149 -4
  9. lsst/pipe/base/connectionTypes.py +72 -160
  10. lsst/pipe/base/connections.py +6 -9
  11. lsst/pipe/base/execution_reports.py +0 -5
  12. lsst/pipe/base/graph/graph.py +11 -10
  13. lsst/pipe/base/graph/quantumNode.py +4 -4
  14. lsst/pipe/base/graph_walker.py +8 -10
  15. lsst/pipe/base/log_capture.py +1 -1
  16. lsst/pipe/base/log_on_close.py +4 -7
  17. lsst/pipe/base/pipeline.py +5 -6
  18. lsst/pipe/base/pipelineIR.py +2 -8
  19. lsst/pipe/base/pipelineTask.py +5 -7
  20. lsst/pipe/base/pipeline_graph/_dataset_types.py +2 -2
  21. lsst/pipe/base/pipeline_graph/_edges.py +32 -22
  22. lsst/pipe/base/pipeline_graph/_mapping_views.py +4 -7
  23. lsst/pipe/base/pipeline_graph/_pipeline_graph.py +14 -7
  24. lsst/pipe/base/pipeline_graph/expressions.py +2 -2
  25. lsst/pipe/base/pipeline_graph/io.py +7 -10
  26. lsst/pipe/base/pipeline_graph/visualization/_dot.py +13 -12
  27. lsst/pipe/base/pipeline_graph/visualization/_layout.py +16 -18
  28. lsst/pipe/base/pipeline_graph/visualization/_merge.py +4 -7
  29. lsst/pipe/base/pipeline_graph/visualization/_printer.py +10 -10
  30. lsst/pipe/base/pipeline_graph/visualization/_status_annotator.py +7 -0
  31. lsst/pipe/base/prerequisite_helpers.py +2 -1
  32. lsst/pipe/base/quantum_graph/_common.py +15 -17
  33. lsst/pipe/base/quantum_graph/_multiblock.py +36 -20
  34. lsst/pipe/base/quantum_graph/_predicted.py +7 -3
  35. lsst/pipe/base/quantum_graph/_provenance.py +501 -61
  36. lsst/pipe/base/quantum_graph/aggregator/__init__.py +0 -1
  37. lsst/pipe/base/quantum_graph/aggregator/_communicators.py +187 -240
  38. lsst/pipe/base/quantum_graph/aggregator/_config.py +87 -9
  39. lsst/pipe/base/quantum_graph/aggregator/_ingester.py +13 -12
  40. lsst/pipe/base/quantum_graph/aggregator/_scanner.py +15 -7
  41. lsst/pipe/base/quantum_graph/aggregator/_structs.py +3 -3
  42. lsst/pipe/base/quantum_graph/aggregator/_supervisor.py +19 -34
  43. lsst/pipe/base/quantum_graph/aggregator/_workers.py +303 -0
  44. lsst/pipe/base/quantum_graph/aggregator/_writer.py +3 -3
  45. lsst/pipe/base/quantum_graph/formatter.py +74 -4
  46. lsst/pipe/base/quantum_graph/ingest_graph.py +413 -0
  47. lsst/pipe/base/quantum_graph/visualization.py +5 -1
  48. lsst/pipe/base/quantum_graph_builder.py +21 -8
  49. lsst/pipe/base/quantum_graph_skeleton.py +31 -29
  50. lsst/pipe/base/quantum_provenance_graph.py +29 -12
  51. lsst/pipe/base/separable_pipeline_executor.py +1 -1
  52. lsst/pipe/base/single_quantum_executor.py +15 -8
  53. lsst/pipe/base/struct.py +4 -0
  54. lsst/pipe/base/testUtils.py +3 -3
  55. lsst/pipe/base/tests/mocks/_storage_class.py +2 -1
  56. lsst/pipe/base/version.py +1 -1
  57. {lsst_pipe_base-30.0.0rc3.dist-info → lsst_pipe_base-30.0.1rc1.dist-info}/METADATA +3 -3
  58. {lsst_pipe_base-30.0.0rc3.dist-info → lsst_pipe_base-30.0.1rc1.dist-info}/RECORD +66 -64
  59. {lsst_pipe_base-30.0.0rc3.dist-info → lsst_pipe_base-30.0.1rc1.dist-info}/WHEEL +1 -1
  60. {lsst_pipe_base-30.0.0rc3.dist-info → lsst_pipe_base-30.0.1rc1.dist-info}/entry_points.txt +0 -0
  61. {lsst_pipe_base-30.0.0rc3.dist-info → lsst_pipe_base-30.0.1rc1.dist-info}/licenses/COPYRIGHT +0 -0
  62. {lsst_pipe_base-30.0.0rc3.dist-info → lsst_pipe_base-30.0.1rc1.dist-info}/licenses/LICENSE +0 -0
  63. {lsst_pipe_base-30.0.0rc3.dist-info → lsst_pipe_base-30.0.1rc1.dist-info}/licenses/bsd_license.txt +0 -0
  64. {lsst_pipe_base-30.0.0rc3.dist-info → lsst_pipe_base-30.0.1rc1.dist-info}/licenses/gpl-v3.0.txt +0 -0
  65. {lsst_pipe_base-30.0.0rc3.dist-info → lsst_pipe_base-30.0.1rc1.dist-info}/top_level.txt +0 -0
  66. {lsst_pipe_base-30.0.0rc3.dist-info → lsst_pipe_base-30.0.1rc1.dist-info}/zip-safe +0 -0
@@ -31,30 +31,22 @@ __all__ = (
31
31
  "FatalWorkerError",
32
32
  "IngesterCommunicator",
33
33
  "ScannerCommunicator",
34
- "SpawnProcessContext",
35
34
  "SupervisorCommunicator",
36
- "ThreadingContext",
37
- "WorkerContext",
38
35
  )
39
36
 
40
37
  import cProfile
41
38
  import dataclasses
42
39
  import enum
43
40
  import logging
44
- import multiprocessing.context
45
- import multiprocessing.synchronize
46
41
  import os
47
- import queue
48
42
  import signal
49
- import threading
50
43
  import time
51
44
  import uuid
52
- from abc import ABC, abstractmethod
53
- from collections.abc import Callable, Iterable, Iterator
45
+ from collections.abc import Iterable, Iterator
54
46
  from contextlib import ExitStack
55
47
  from traceback import format_exception
56
48
  from types import TracebackType
57
- from typing import Any, Literal, Self, TypeAlias, TypeVar, Union
49
+ from typing import Literal, Self, overload
58
50
 
59
51
  from lsst.utils.logging import LsstLogAdapter
60
52
 
@@ -62,124 +54,10 @@ from .._provenance import ProvenanceQuantumScanData
62
54
  from ._config import AggregatorConfig
63
55
  from ._progress import ProgressManager, make_worker_log
64
56
  from ._structs import IngestRequest, ScanReport
65
-
66
- _T = TypeVar("_T")
57
+ from ._workers import Event, Queue, Worker, WorkerFactory
67
58
 
68
59
  _TINY_TIMEOUT = 0.01
69
60
 
70
- # multiprocessing.Queue is a type according to the standard library type stubs,
71
- # but it's really a function at runtime. But since the Python <= 3.11 type
72
- # alias syntax uses the real runtime things we need to use strings, and hence
73
- # we need to use Union. With Python 3.12's 'type' statement this gets cleaner.
74
- Queue: TypeAlias = Union["queue.Queue[_T]", "multiprocessing.Queue[_T]"]
75
-
76
- Event: TypeAlias = threading.Event | multiprocessing.synchronize.Event
77
-
78
- Worker: TypeAlias = threading.Thread | multiprocessing.context.SpawnProcess
79
-
80
-
81
- class WorkerContext(ABC):
82
- """A simple abstract interface that can be implemented by both threading
83
- and multiprocessing.
84
- """
85
-
86
- @abstractmethod
87
- def make_queue(self) -> Queue[Any]:
88
- """Make an empty queue that can be used to pass objects between
89
- workers in this context.
90
- """
91
- raise NotImplementedError()
92
-
93
- @abstractmethod
94
- def make_event(self) -> Event:
95
- """Make an event that can be used to communicate a boolean state change
96
- to workers in this context.
97
- """
98
- raise NotImplementedError()
99
-
100
- @abstractmethod
101
- def make_worker(
102
- self, target: Callable[..., None], args: tuple[Any, ...], name: str | None = None
103
- ) -> Worker:
104
- """Make a worker that runs the given callable.
105
-
106
- Parameters
107
- ----------
108
- target : `~collections.abc.Callable`
109
- A callable to invoke on the worker.
110
- args : `tuple`
111
- Positional arguments to pass to the callable.
112
- name : `str`, optional
113
- Human-readable name for the worker.
114
-
115
- Returns
116
- -------
117
- worker : `threading.Thread` or `multiprocessing.Process`
118
- Process or thread. Will need to have its ``start`` method called
119
- to actually begin.
120
- """
121
- raise NotImplementedError()
122
-
123
-
124
- class ThreadingContext(WorkerContext):
125
- """An implementation of `WorkerContext` backed by the `threading`
126
- module.
127
- """
128
-
129
- def make_queue(self) -> Queue[Any]:
130
- return queue.Queue()
131
-
132
- def make_event(self) -> Event:
133
- return threading.Event()
134
-
135
- def make_worker(
136
- self, target: Callable[..., None], args: tuple[Any, ...], name: str | None = None
137
- ) -> Worker:
138
- return threading.Thread(target=target, args=args, name=name)
139
-
140
-
141
- class SpawnProcessContext(WorkerContext):
142
- """An implementation of `WorkerContext` backed by the `multiprocessing`
143
- module, with new processes started by spawning.
144
- """
145
-
146
- def __init__(self) -> None:
147
- self._ctx = multiprocessing.get_context("spawn")
148
-
149
- def make_queue(self) -> Queue[Any]:
150
- return self._ctx.Queue()
151
-
152
- def make_event(self) -> Event:
153
- return self._ctx.Event()
154
-
155
- def make_worker(
156
- self, target: Callable[..., None], args: tuple[Any, ...], name: str | None = None
157
- ) -> Worker:
158
- return self._ctx.Process(target=target, args=args, name=name)
159
-
160
-
161
- def _get_from_queue(q: Queue[_T], block: bool = False, timeout: float | None = None) -> _T | None:
162
- """Get an object from a queue and return `None` if it is empty.
163
-
164
- Parameters
165
- ----------
166
- q : `Queue`
167
- Queue to get an object from.
168
- block : `bool`
169
- Whether to block until an object is available.
170
- timeout : `float` or `None`, optional
171
- Maximum number of seconds to wait while blocking.
172
-
173
- Returns
174
- -------
175
- obj : `object` or `None`
176
- Object from the queue, or `None` if it was empty.
177
- """
178
- try:
179
- return q.get(block=block, timeout=timeout)
180
- except queue.Empty:
181
- return None
182
-
183
61
 
184
62
  class FatalWorkerError(BaseException):
185
63
  """An exception raised by communicators when one worker (including the
@@ -188,6 +66,12 @@ class FatalWorkerError(BaseException):
188
66
  """
189
67
 
190
68
 
69
+ class _WorkerCommunicationError(Exception):
70
+ """An exception raised by communicators when a worker has died unexpectedly
71
+ or become unresponsive.
72
+ """
73
+
74
+
191
75
  class _Sentinel(enum.Enum):
192
76
  """Sentinel values used to indicate sequence points or worker shutdown
193
77
  conditions.
@@ -214,21 +98,6 @@ class _Sentinel(enum.Enum):
214
98
  quantum's provenance was written.
215
99
  """
216
100
 
217
- SCANNER_DONE = enum.auto()
218
- """Sentinel sent from scanners to the supervisor to report that they are
219
- done and shutting down.
220
- """
221
-
222
- INGESTER_DONE = enum.auto()
223
- """Sentinel sent from the ingester to the supervisor to report that it is
224
- done and shutting down.
225
- """
226
-
227
- WRITER_DONE = enum.auto()
228
- """Sentinel sent from the writer to the supervisor to report that it is
229
- done and shutting down.
230
- """
231
-
232
101
 
233
102
  @dataclasses.dataclass
234
103
  class _WorkerErrorMessage:
@@ -278,6 +147,16 @@ class _IngestReport:
278
147
  """
279
148
 
280
149
 
150
+ @dataclasses.dataclass
151
+ class _WorkerDone:
152
+ """An internal struct passed from a worker to the supervisor when it has
153
+ successfully completed all work.
154
+ """
155
+
156
+ name: str
157
+ """Name of the worker reporting completion."""
158
+
159
+
281
160
  @dataclasses.dataclass
282
161
  class _ProgressLog:
283
162
  """A high-level log message sent from a worker to the supervisor.
@@ -304,20 +183,22 @@ class _CompressionDictionary:
304
183
  """
305
184
 
306
185
 
307
- Report: TypeAlias = (
186
+ type Report = (
308
187
  ScanReport
309
188
  | _IngestReport
310
189
  | _WorkerErrorMessage
311
190
  | _ProgressLog
312
- | Literal[
313
- _Sentinel.WRITE_REPORT,
314
- _Sentinel.SCANNER_DONE,
315
- _Sentinel.INGESTER_DONE,
316
- _Sentinel.WRITER_DONE,
317
- ]
191
+ | _WorkerDone
192
+ | Literal[_Sentinel.WRITE_REPORT]
318
193
  )
319
194
 
320
195
 
196
+ def _disable_resources_parallelism() -> None:
197
+ os.environ["LSST_RESOURCES_NUM_WORKERS"] = "1"
198
+ os.environ.pop("LSST_RESOURCES_EXECUTOR", None)
199
+ os.environ["LSST_S3_USE_THREADS"] = "False"
200
+
201
+
321
202
  class SupervisorCommunicator:
322
203
  """A helper object that lets the supervisor direct the other workers.
323
204
 
@@ -327,7 +208,7 @@ class SupervisorCommunicator:
327
208
  LSST-customized logger.
328
209
  n_scanners : `int`
329
210
  Number of scanner workers.
330
- context : `WorkerContext`
211
+ worker_factory : `WorkerFactory`
331
212
  Abstraction over threading vs. multiprocessing.
332
213
  config : `AggregatorConfig`
333
214
  Configuration for the aggregator.
@@ -337,7 +218,7 @@ class SupervisorCommunicator:
337
218
  self,
338
219
  log: LsstLogAdapter,
339
220
  n_scanners: int,
340
- context: WorkerContext,
221
+ worker_factory: WorkerFactory,
341
222
  config: AggregatorConfig,
342
223
  ) -> None:
343
224
  self.config = config
@@ -347,14 +228,14 @@ class SupervisorCommunicator:
347
228
  # When complete, the supervisor sends n_scanners sentinels and each
348
229
  # scanner is careful to only take one before it starts its shutdown.
349
230
  self._scan_requests: Queue[_ScanRequest | Literal[_Sentinel.NO_MORE_SCAN_REQUESTS]] = (
350
- context.make_queue()
231
+ worker_factory.make_queue()
351
232
  )
352
233
  # The scanners send ingest requests to the ingester on this queue. Each
353
234
  # scanner sends one sentinel when it is done, and the ingester is
354
235
  # careful to wait for n_scanners sentinels to arrive before it starts
355
236
  # its shutdown.
356
237
  self._ingest_requests: Queue[IngestRequest | Literal[_Sentinel.NO_MORE_INGEST_REQUESTS]] = (
357
- context.make_queue()
238
+ worker_factory.make_queue()
358
239
  )
359
240
  # The scanners send write requests to the writer on this queue (which
360
241
  # will be `None` if we're not writing). The supervisor also sends
@@ -364,22 +245,22 @@ class SupervisorCommunicator:
364
245
  # starts its shutdown.
365
246
  self._write_requests: (
366
247
  Queue[ProvenanceQuantumScanData | Literal[_Sentinel.NO_MORE_WRITE_REQUESTS]] | None
367
- ) = context.make_queue() if config.output_path is not None else None
248
+ ) = worker_factory.make_queue() if config.is_writing_provenance else None
368
249
  # All other workers use this queue to send many different kinds of
369
250
  # reports the supervisor. The supervisor waits for a _DONE sentinal
370
251
  # from each worker before it finishes its shutdown.
371
- self._reports: Queue[Report] = context.make_queue()
252
+ self._reports: Queue[Report] = worker_factory.make_queue()
372
253
  # The writer sends the compression dictionary to the scanners on this
373
254
  # queue. It puts n_scanners copies on the queue, and each scanner only
374
255
  # takes one. The compression_dict queue has no sentinel because it is
375
256
  # only used at most once; the supervisor takes responsibility for
376
257
  # clearing it out when shutting down.
377
- self._compression_dict: Queue[_CompressionDictionary] = context.make_queue()
258
+ self._compression_dict: Queue[_CompressionDictionary] = worker_factory.make_queue()
378
259
  # The supervisor sets this event when it receives an interrupt request
379
260
  # from an exception in the main process (usually KeyboardInterrupt).
380
261
  # Worker communicators check this in their polling loops and raise
381
262
  # FatalWorkerError when they see it set.
382
- self._cancel_event: Event = context.make_event()
263
+ self._cancel_event: Event = worker_factory.make_event()
383
264
  # Track what state we are in closing down, so we can start at the right
384
265
  # point if we're interrupted and __exit__ needs to clean up. Note that
385
266
  # we can't rely on a non-exception __exit__ to do any shutdown work
@@ -388,51 +269,77 @@ class SupervisorCommunicator:
388
269
  self._sent_no_more_scan_requests = False
389
270
  self._sent_no_more_write_requests = False
390
271
  self._n_scanners_done = 0
391
- self._ingester_done = False
392
- self._writer_done = self._write_requests is None
272
+ self.workers: dict[str, Worker] = {}
393
273
 
394
- def wait_for_workers_to_finish(self, already_failing: bool = False) -> None:
274
+ def _wait_for_workers_to_finish(self, already_failing: bool = False) -> None:
275
+ # Orderly shutdown, including exceptions: let workers clear out the
276
+ # queues they're responsible for reading from.
395
277
  if not self._sent_no_more_scan_requests:
396
278
  for _ in range(self.n_scanners):
397
- self._scan_requests.put(_Sentinel.NO_MORE_SCAN_REQUESTS, block=False)
279
+ self._scan_requests.put(_Sentinel.NO_MORE_SCAN_REQUESTS)
398
280
  self._sent_no_more_scan_requests = True
399
281
  if not self._sent_no_more_write_requests and self._write_requests is not None:
400
- self._write_requests.put(_Sentinel.NO_MORE_WRITE_REQUESTS, block=False)
282
+ self._write_requests.put(_Sentinel.NO_MORE_WRITE_REQUESTS)
401
283
  self._sent_no_more_write_requests = True
402
- while not (self._ingester_done and self._writer_done and self._n_scanners_done == self.n_scanners):
284
+ while not all(w.successful for w in self.workers.values()):
403
285
  match self._handle_progress_reports(
404
- self._reports.get(block=True), already_failing=already_failing
286
+ self._get_report(block=True), already_failing=already_failing
405
287
  ):
406
- case None | ScanReport() | _IngestReport():
288
+ case None | ScanReport():
407
289
  pass
408
- case _Sentinel.INGESTER_DONE:
409
- self._ingester_done = True
410
- self.progress.quantum_ingests.close()
411
- case _Sentinel.SCANNER_DONE:
412
- self._n_scanners_done += 1
413
- self.progress.scans.close()
414
- case _Sentinel.WRITER_DONE:
415
- self._writer_done = True
416
- self.progress.writes.close()
290
+ case _WorkerDone(name=worker_name):
291
+ self.workers[worker_name].successful = True
292
+ if worker_name == IngesterCommunicator.get_worker_name():
293
+ self.progress.quantum_ingests.close()
294
+ elif worker_name == WriterCommunicator.get_worker_name():
295
+ self.progress.writes.close()
296
+ else:
297
+ self._n_scanners_done += 1
298
+ if self._n_scanners_done == self.n_scanners:
299
+ self.progress.scans.close()
417
300
  case unexpected:
418
301
  raise AssertionError(f"Unexpected message {unexpected!r} to supervisor.")
419
302
  self.log.verbose(
420
- "Blocking on reports queue: ingester_done=%s, writer_done=%s, n_scanners_done=%s.",
421
- self._ingester_done,
422
- self._writer_done,
423
- self._n_scanners_done,
303
+ "Waiting for workers [%s] to report successful completion.",
304
+ ", ".join(w.name for w in self.workers.values() if not w.successful),
424
305
  )
425
- while _get_from_queue(self._compression_dict) is not None:
426
- self.log.verbose("Flushing compression dict queue.")
427
306
  self.log.verbose("Checking that all queues are empty.")
428
- self._expect_empty_queue(self._scan_requests)
429
- self._expect_empty_queue(self._ingest_requests)
307
+ if self._scan_requests.clear():
308
+ self.progress.log.warning("Scan request queue was not empty at shutdown.")
309
+ self._scan_requests.kill()
310
+ if self._ingest_requests.clear():
311
+ self.progress.log.warning("Ingest request queue was not empty at shutdown.")
312
+ self._ingest_requests.kill()
313
+ if self._write_requests is not None and self._write_requests.clear():
314
+ self.progress.log.warning("Write request queue was not empty at shutdown.")
315
+ self._write_requests.kill()
316
+ if self._reports.clear():
317
+ self.progress.log.warning("Reports queue was not empty at shutdown.")
318
+ self._reports.kill()
319
+ if self._compression_dict.clear():
320
+ self.progress.log.warning("Compression dictionary queue was not empty at shutdown.")
321
+ self._compression_dict.kill()
322
+ for worker in self.workers.values():
323
+ self.log.verbose("Waiting for %s to shut down.", worker.name)
324
+ worker.join()
325
+
326
+ def _terminate(self) -> None:
327
+ # Disorderly shutdown: we cannot assume any of the
328
+ # multiprocessing.Queue objects work, and in fact they may hang
329
+ # if we try to do anything with them.
330
+ self._scan_requests.kill()
331
+ self._ingest_requests.kill()
430
332
  if self._write_requests is not None:
431
- self._expect_empty_queue(self._write_requests)
432
- self._expect_empty_queue(self._reports)
433
- self._expect_empty_queue(self._compression_dict)
333
+ self._write_requests.kill()
334
+ self._compression_dict.kill()
335
+ self._reports.kill()
336
+ for name, worker in self.workers.items():
337
+ if worker.is_alive():
338
+ self.progress.log.critical("Terminating worker %r.", name)
339
+ worker.kill()
434
340
 
435
341
  def __enter__(self) -> Self:
342
+ _disable_resources_parallelism()
436
343
  self.progress.__enter__()
437
344
  # We make the low-level logger in __enter__ instead of __init__ only
438
345
  # because that's the pattern used by true workers (where it matters).
@@ -446,11 +353,23 @@ class SupervisorCommunicator:
446
353
  traceback: TracebackType | None,
447
354
  ) -> None:
448
355
  if exc_type is not None:
449
- if exc_type is not FatalWorkerError:
450
- self.progress.log.critical(f"Caught {exc_type.__name__}; attempting to shut down cleanly.")
451
356
  self._cancel_event.set()
452
- self.wait_for_workers_to_finish(already_failing=exc_type is not None)
357
+ if exc_type is _WorkerCommunicationError:
358
+ self.progress.log.critical("Worker '%s' was terminated before it could finish.", exc_value)
359
+ self._terminate()
360
+ return None
361
+ if exc_type is not FatalWorkerError:
362
+ self.progress.log.critical("Caught %s; attempting to shut down cleanly.", exc_type)
363
+ try:
364
+ self._wait_for_workers_to_finish(already_failing=exc_type is not None)
365
+ except _WorkerCommunicationError as err:
366
+ self.progress.log.critical(
367
+ "Worker '%s' was terminated before it could finish (after scanning).", err
368
+ )
369
+ self._terminate()
370
+ raise
453
371
  self.progress.__exit__(exc_type, exc_value, traceback)
372
+ return None
454
373
 
455
374
  def request_scan(self, quantum_id: uuid.UUID) -> None:
456
375
  """Send a request to the scanners to scan the given quantum.
@@ -460,7 +379,7 @@ class SupervisorCommunicator:
460
379
  quantum_id : `uuid.UUID`
461
380
  ID of the quantum to scan.
462
381
  """
463
- self._scan_requests.put(_ScanRequest(quantum_id), block=False)
382
+ self._scan_requests.put(_ScanRequest(quantum_id))
464
383
 
465
384
  def request_write(self, request: ProvenanceQuantumScanData) -> None:
466
385
  """Send a request to the writer to write provenance for the given scan.
@@ -472,7 +391,7 @@ class SupervisorCommunicator:
472
391
  in the case of blocked quanta).
473
392
  """
474
393
  assert self._write_requests is not None, "Writer should not be used if writing is disabled."
475
- self._write_requests.put(request, block=False)
394
+ self._write_requests.put(request)
476
395
 
477
396
  def poll(self) -> Iterator[ScanReport]:
478
397
  """Poll for reports from workers while sending scan requests.
@@ -488,9 +407,8 @@ class SupervisorCommunicator:
488
407
  it continues until the report queue is empty.
489
408
  """
490
409
  block = True
491
- msg = _get_from_queue(self._reports, block=block)
492
- while msg is not None:
493
- match self._handle_progress_reports(msg):
410
+ while report := self._get_report(block=block):
411
+ match self._handle_progress_reports(report):
494
412
  case ScanReport() as scan_report:
495
413
  block = False
496
414
  yield scan_report
@@ -498,19 +416,40 @@ class SupervisorCommunicator:
498
416
  pass
499
417
  case unexpected:
500
418
  raise AssertionError(f"Unexpected message {unexpected!r} to supervisor.")
501
- msg = _get_from_queue(self._reports, block=block)
419
+
420
+ @overload
421
+ def _get_report(self, block: Literal[True]) -> Report: ...
422
+
423
+ @overload
424
+ def _get_report(self, block: bool) -> Report | None: ...
425
+
426
+ def _get_report(self, block: bool) -> Report | None:
427
+ """Get a report from the reports queue, with timeout guards on
428
+ blocking requests.
429
+
430
+ This method raises `_WorkerCommunicationError` (carrying the worker's
431
+ name) when a blocking request times out and a worker that has not
432
+ reported success is no longer alive. ``__exit__`` catches it in order
433
+ to terminate the remaining workers.
434
+ """
435
+ report = self._reports.get(block=block, timeout=self.config.worker_check_timeout)
436
+ while report is None and block:
437
+ # We hit the timeout; make sure all of the workers
438
+ # that should be alive actually are.
439
+ for name, worker in self.workers.items():
440
+ if not worker.successful and not worker.is_alive():
441
+ # This worker exited without reporting success; raise with
442
+ # its name so the supervisor can shut the others down.
443
+ # NOTE(review): the worker is not removed from self.workers.
444
+ raise _WorkerCommunicationError(name)
445
+ # If nothing is dead and we didn't hit the hang timeout, keep
446
+ # trying.
447
+ report = self._reports.get(block=block, timeout=self.config.worker_check_timeout)
448
+ return report
502
449
 
503
450
  def _handle_progress_reports(
504
451
  self, report: Report, already_failing: bool = False
505
- ) -> (
506
- ScanReport
507
- | Literal[
508
- _Sentinel.SCANNER_DONE,
509
- _Sentinel.INGESTER_DONE,
510
- _Sentinel.WRITER_DONE,
511
- ]
512
- | None
513
- ):
452
+ ) -> ScanReport | _WorkerDone | None:
514
453
  """Handle reports to the supervisor that can appear at any time, and
515
454
  are typically just updates to the progress we've made.
516
455
 
@@ -540,15 +479,9 @@ class SupervisorCommunicator:
540
479
  return report
541
480
  return None
542
481
 
543
- @staticmethod
544
- def _expect_empty_queue(queue: Queue[Any]) -> None:
545
- """Assert that the given queue is empty."""
546
- if (msg := _get_from_queue(queue, block=False, timeout=0)) is not None:
547
- raise AssertionError(f"Queue is not empty; found {msg!r}.")
548
-
549
482
 
550
483
  class WorkerCommunicator:
551
- """A base class for non-supervisor workers.
484
+ """A base class for non-supervisor worker communicators.
552
485
 
553
486
  Parameters
554
487
  ----------
@@ -560,8 +493,8 @@ class WorkerCommunicator:
560
493
  Notes
561
494
  -----
562
495
  Each worker communicator is constructed in the main process and entered as
563
- a context manager on the actual worker process, so attributes that cannot
564
- be pickled are constructed in ``__enter__`` instead of ``__init__``.
496
+ a context manager *only* on the actual worker process, so attributes that
497
+ cannot be pickled are constructed in ``__enter__`` instead of ``__init__``.
565
498
 
566
499
  Worker communicators provide access to an `AggregatorConfig` and a logger
567
500
  to their workers. As context managers, they handle exceptions and ensure
@@ -581,6 +514,7 @@ class WorkerCommunicator:
581
514
  self._cancel_event = supervisor._cancel_event
582
515
 
583
516
  def __enter__(self) -> Self:
517
+ _disable_resources_parallelism()
584
518
  self.log = make_worker_log(self.name, self.config)
585
519
  self.log.verbose("%s has PID %s (parent is %s).", self.name, os.getpid(), os.getppid())
586
520
  self._exit_stack = ExitStack().__enter__()
@@ -613,8 +547,7 @@ class WorkerCommunicator:
613
547
  _WorkerErrorMessage(
614
548
  self.name,
615
549
  "".join(format_exception(exc_type, exc_value, traceback)),
616
- ),
617
- block=False,
550
+ )
618
551
  )
619
552
  self.log.debug("Error message sent to supervisor.")
620
553
  else:
@@ -637,7 +570,7 @@ class WorkerCommunicator:
637
570
  message : `str`
638
571
  Log message.
639
572
  """
640
- self._reports.put(_ProgressLog(message=message, level=level), block=False)
573
+ self._reports.put(_ProgressLog(message=message, level=level))
641
574
 
642
575
  def check_for_cancel(self) -> None:
643
576
  """Check for a cancel signal from the supervisor and raise
@@ -659,7 +592,7 @@ class ScannerCommunicator(WorkerCommunicator):
659
592
  """
660
593
 
661
594
  def __init__(self, supervisor: SupervisorCommunicator, scanner_id: int):
662
- super().__init__(supervisor, f"scanner-{scanner_id:03d}")
595
+ super().__init__(supervisor, self.get_worker_name(scanner_id))
663
596
  self.scanner_id = scanner_id
664
597
  self._scan_requests = supervisor._scan_requests
665
598
  self._ingest_requests = supervisor._ingest_requests
@@ -668,6 +601,10 @@ class ScannerCommunicator(WorkerCommunicator):
668
601
  self._got_no_more_scan_requests: bool = False
669
602
  self._sent_no_more_ingest_requests: bool = False
670
603
 
604
+ @staticmethod
605
+ def get_worker_name(scanner_id: int) -> str:
606
+ return f"scanner-{scanner_id:03d}"
607
+
671
608
  def report_scan(self, msg: ScanReport) -> None:
672
609
  """Report a completed scan to the supervisor.
673
610
 
@@ -676,7 +613,7 @@ class ScannerCommunicator(WorkerCommunicator):
676
613
  msg : `ScanReport`
677
614
  Report to send.
678
615
  """
679
- self._reports.put(msg, block=False)
616
+ self._reports.put(msg)
680
617
 
681
618
  def request_ingest(self, request: IngestRequest) -> None:
682
619
  """Ask the ingester to ingest a quantum's outputs.
@@ -692,9 +629,9 @@ class ScannerCommunicator(WorkerCommunicator):
692
629
  as complete to the supervisor instead of sending it to the ingester.
693
630
  """
694
631
  if request:
695
- self._ingest_requests.put(request, block=False)
632
+ self._ingest_requests.put(request)
696
633
  else:
697
- self._reports.put(_IngestReport(1), block=False)
634
+ self._reports.put(_IngestReport(1))
698
635
 
699
636
  def request_write(self, request: ProvenanceQuantumScanData) -> None:
700
637
  """Ask the writer to write provenance for a quantum.
@@ -705,7 +642,7 @@ class ScannerCommunicator(WorkerCommunicator):
705
642
  Result of scanning a quantum.
706
643
  """
707
644
  assert self._write_requests is not None, "Writer should not be used if writing is disabled."
708
- self._write_requests.put(request, block=False)
645
+ self._write_requests.put(request)
709
646
 
710
647
  def get_compression_dict(self) -> bytes | None:
711
648
  """Attempt to get the compression dict from the writer.
@@ -721,7 +658,7 @@ class ScannerCommunicator(WorkerCommunicator):
721
658
  A scanner should only call this method before it actually has the
722
659
  compression dict.
723
660
  """
724
- if (cdict := _get_from_queue(self._compression_dict)) is not None:
661
+ if (cdict := self._compression_dict.get()) is not None:
725
662
  return cdict.data
726
663
  return None
727
664
 
@@ -740,7 +677,7 @@ class ScannerCommunicator(WorkerCommunicator):
740
677
  """
741
678
  while True:
742
679
  self.check_for_cancel()
743
- scan_request = _get_from_queue(self._scan_requests, block=True, timeout=self.config.worker_sleep)
680
+ scan_request = self._scan_requests.get(block=True, timeout=self.config.worker_sleep)
744
681
  if scan_request is _Sentinel.NO_MORE_SCAN_REQUESTS:
745
682
  self._got_no_more_scan_requests = True
746
683
  return
@@ -754,20 +691,18 @@ class ScannerCommunicator(WorkerCommunicator):
754
691
  traceback: TracebackType | None,
755
692
  ) -> bool | None:
756
693
  result = super().__exit__(exc_type, exc_value, traceback)
757
- self._ingest_requests.put(_Sentinel.NO_MORE_INGEST_REQUESTS, block=False)
694
+ self._ingest_requests.put(_Sentinel.NO_MORE_INGEST_REQUESTS)
758
695
  if self._write_requests is not None:
759
- self._write_requests.put(_Sentinel.NO_MORE_WRITE_REQUESTS, block=False)
696
+ self._write_requests.put(_Sentinel.NO_MORE_WRITE_REQUESTS)
760
697
  while not self._got_no_more_scan_requests:
761
- self.log.debug("Clearing scan request queue (~%d remaining)", self._scan_requests.qsize())
762
698
  if (
763
699
  not self._got_no_more_scan_requests
764
- and self._scan_requests.get() is _Sentinel.NO_MORE_SCAN_REQUESTS
700
+ and self._scan_requests.get(block=True) is _Sentinel.NO_MORE_SCAN_REQUESTS
765
701
  ):
766
702
  self._got_no_more_scan_requests = True
767
- # We let the supervisor clear out the compression dict queue, because
768
- # a single scanner can't know if it ever got sent out or not.
769
- self.log.verbose("Sending done sentinal.")
770
- self._reports.put(_Sentinel.SCANNER_DONE, block=False)
703
+ # We let the writer clear out the compression dict queue.
704
+ self.log.verbose("Sending completion message.")
705
+ self._reports.put(_WorkerDone(self.name))
771
706
  return result
772
707
 
773
708
 
@@ -781,11 +716,15 @@ class IngesterCommunicator(WorkerCommunicator):
781
716
  """
782
717
 
783
718
  def __init__(self, supervisor: SupervisorCommunicator):
784
- super().__init__(supervisor, "ingester")
719
+ super().__init__(supervisor, self.get_worker_name())
785
720
  self.n_scanners = supervisor.n_scanners
786
721
  self._ingest_requests = supervisor._ingest_requests
787
722
  self._n_requesters_done = 0
788
723
 
724
+ @staticmethod
725
+ def get_worker_name() -> str:
726
+ return "ingester"
727
+
789
728
  def __exit__(
790
729
  self,
791
730
  exc_type: type[BaseException] | None,
@@ -801,8 +740,8 @@ class IngesterCommunicator(WorkerCommunicator):
801
740
  )
802
741
  if self._ingest_requests.get(block=True) is _Sentinel.NO_MORE_INGEST_REQUESTS:
803
742
  self._n_requesters_done += 1
804
- self.log.verbose("Sending done sentinal.")
805
- self._reports.put(_Sentinel.INGESTER_DONE, block=False)
743
+ self.log.verbose("Sending completion message.")
744
+ self._reports.put(_WorkerDone(self.name))
806
745
  return result
807
746
 
808
747
  def report_ingest(self, n_producers: int) -> None:
@@ -813,7 +752,7 @@ class IngesterCommunicator(WorkerCommunicator):
813
752
  n_producers : `int`
814
753
  Number of producing quanta whose datasets were ingested.
815
754
  """
816
- self._reports.put(_IngestReport(n_producers), block=False)
755
+ self._reports.put(_IngestReport(n_producers))
817
756
 
818
757
  def poll(self) -> Iterator[IngestRequest]:
819
758
  """Poll for ingest requests from the scanner workers.
@@ -830,7 +769,7 @@ class IngesterCommunicator(WorkerCommunicator):
830
769
  """
831
770
  while True:
832
771
  self.check_for_cancel()
833
- ingest_request = _get_from_queue(self._ingest_requests, block=True, timeout=_TINY_TIMEOUT)
772
+ ingest_request = self._ingest_requests.get(block=True, timeout=_TINY_TIMEOUT)
834
773
  if ingest_request is _Sentinel.NO_MORE_INGEST_REQUESTS:
835
774
  self._n_requesters_done += 1
836
775
  if self._n_requesters_done == self.n_scanners:
@@ -852,7 +791,7 @@ class WriterCommunicator(WorkerCommunicator):
852
791
 
853
792
  def __init__(self, supervisor: SupervisorCommunicator):
854
793
  assert supervisor._write_requests is not None
855
- super().__init__(supervisor, "writer")
794
+ super().__init__(supervisor, self.get_worker_name())
856
795
  self.n_scanners = supervisor.n_scanners
857
796
  self._write_requests = supervisor._write_requests
858
797
  self._compression_dict = supervisor._compression_dict
@@ -860,6 +799,10 @@ class WriterCommunicator(WorkerCommunicator):
860
799
  self._n_requesters_done = 0
861
800
  self._sent_compression_dict = False
862
801
 
802
+ @staticmethod
803
+ def get_worker_name() -> str:
804
+ return "writer"
805
+
863
806
  def __exit__(
864
807
  self,
865
808
  exc_type: type[BaseException] | None,
@@ -877,8 +820,12 @@ class WriterCommunicator(WorkerCommunicator):
877
820
  )
878
821
  if self._write_requests.get(block=True) is _Sentinel.NO_MORE_WRITE_REQUESTS:
879
822
  self._n_requesters_done += 1
880
- self.log.verbose("Sending done sentinal.")
881
- self._reports.put(_Sentinel.WRITER_DONE, block=False)
823
+ if self._compression_dict.clear():
824
+ self.log.verbose("Cleared out compression dictionary queue.")
825
+ else:
826
+ self.log.verbose("Compression dictionary queue was already empty.")
827
+ self.log.verbose("Sending completion message.")
828
+ self._reports.put(_WorkerDone(self.name))
882
829
  return result
883
830
 
884
831
  def poll(self) -> Iterator[ProvenanceQuantumScanData]:
@@ -896,7 +843,7 @@ class WriterCommunicator(WorkerCommunicator):
896
843
  """
897
844
  while True:
898
845
  self.check_for_cancel()
899
- write_request = _get_from_queue(self._write_requests, block=True, timeout=_TINY_TIMEOUT)
846
+ write_request = self._write_requests.get(block=True, timeout=_TINY_TIMEOUT)
900
847
  if write_request is _Sentinel.NO_MORE_WRITE_REQUESTS:
901
848
  self._n_requesters_done += 1
902
849
  if self._n_requesters_done == self._n_requesters:
@@ -916,16 +863,16 @@ class WriterCommunicator(WorkerCommunicator):
916
863
  """
917
864
  self.log.debug("Sending compression dictionary.")
918
865
  for _ in range(self.n_scanners):
919
- self._compression_dict.put(_CompressionDictionary(cdict_data), block=False)
866
+ self._compression_dict.put(_CompressionDictionary(cdict_data))
920
867
  self._sent_compression_dict = True
921
868
 
922
869
  def report_write(self) -> None:
923
870
  """Report to the supervisor that provenance for a quantum was written
924
871
  to the graph.
925
872
  """
926
- self._reports.put(_Sentinel.WRITE_REPORT, block=False)
873
+ self._reports.put(_Sentinel.WRITE_REPORT)
927
874
 
928
- def periodically_check_for_cancel(self, iterable: Iterable[_T], n: int = 100) -> Iterator[_T]:
875
+ def periodically_check_for_cancel[T](self, iterable: Iterable[T], n: int = 100) -> Iterator[T]:
929
876
  """Iterate while checking for a cancellation signal every ``n``
930
877
  iterations.
931
878