lsst-pipe-base 29.2025.4100__py3-none-any.whl → 29.2025.4300__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (33) hide show
  1. lsst/pipe/base/_status.py +1 -1
  2. lsst/pipe/base/cli/cmd/__init__.py +2 -2
  3. lsst/pipe/base/cli/cmd/commands.py +116 -1
  4. lsst/pipe/base/graph_walker.py +8 -4
  5. lsst/pipe/base/pipeline_graph/_pipeline_graph.py +30 -5
  6. lsst/pipe/base/quantum_graph/__init__.py +1 -0
  7. lsst/pipe/base/quantum_graph/_common.py +2 -1
  8. lsst/pipe/base/quantum_graph/_multiblock.py +41 -7
  9. lsst/pipe/base/quantum_graph/_predicted.py +62 -5
  10. lsst/pipe/base/quantum_graph/_provenance.py +1209 -0
  11. lsst/pipe/base/quantum_graph/aggregator/__init__.py +143 -0
  12. lsst/pipe/base/quantum_graph/aggregator/_communicators.py +981 -0
  13. lsst/pipe/base/quantum_graph/aggregator/_config.py +139 -0
  14. lsst/pipe/base/quantum_graph/aggregator/_ingester.py +312 -0
  15. lsst/pipe/base/quantum_graph/aggregator/_progress.py +208 -0
  16. lsst/pipe/base/quantum_graph/aggregator/_scanner.py +371 -0
  17. lsst/pipe/base/quantum_graph/aggregator/_structs.py +167 -0
  18. lsst/pipe/base/quantum_graph/aggregator/_supervisor.py +225 -0
  19. lsst/pipe/base/quantum_graph/aggregator/_writer.py +593 -0
  20. lsst/pipe/base/resource_usage.py +183 -0
  21. lsst/pipe/base/simple_pipeline_executor.py +4 -1
  22. lsst/pipe/base/tests/util.py +31 -0
  23. lsst/pipe/base/version.py +1 -1
  24. {lsst_pipe_base-29.2025.4100.dist-info → lsst_pipe_base-29.2025.4300.dist-info}/METADATA +1 -1
  25. {lsst_pipe_base-29.2025.4100.dist-info → lsst_pipe_base-29.2025.4300.dist-info}/RECORD +33 -22
  26. {lsst_pipe_base-29.2025.4100.dist-info → lsst_pipe_base-29.2025.4300.dist-info}/WHEEL +0 -0
  27. {lsst_pipe_base-29.2025.4100.dist-info → lsst_pipe_base-29.2025.4300.dist-info}/entry_points.txt +0 -0
  28. {lsst_pipe_base-29.2025.4100.dist-info → lsst_pipe_base-29.2025.4300.dist-info}/licenses/COPYRIGHT +0 -0
  29. {lsst_pipe_base-29.2025.4100.dist-info → lsst_pipe_base-29.2025.4300.dist-info}/licenses/LICENSE +0 -0
  30. {lsst_pipe_base-29.2025.4100.dist-info → lsst_pipe_base-29.2025.4300.dist-info}/licenses/bsd_license.txt +0 -0
  31. {lsst_pipe_base-29.2025.4100.dist-info → lsst_pipe_base-29.2025.4300.dist-info}/licenses/gpl-v3.0.txt +0 -0
  32. {lsst_pipe_base-29.2025.4100.dist-info → lsst_pipe_base-29.2025.4300.dist-info}/top_level.txt +0 -0
  33. {lsst_pipe_base-29.2025.4100.dist-info → lsst_pipe_base-29.2025.4300.dist-info}/zip-safe +0 -0
@@ -0,0 +1,981 @@
1
+ # This file is part of pipe_base.
2
+ #
3
+ # Developed for the LSST Data Management System.
4
+ # This product includes software developed by the LSST Project
5
+ # (http://www.lsst.org).
6
+ # See the COPYRIGHT file at the top-level directory of this distribution
7
+ # for details of code ownership.
8
+ #
9
+ # This software is dual licensed under the GNU General Public License and also
10
+ # under a 3-clause BSD license. Recipients may choose which of these licenses
11
+ # to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
12
+ # respectively. If you choose the GPL option then the following text applies
13
+ # (but note that there is still no warranty even if you opt for BSD instead):
14
+ #
15
+ # This program is free software: you can redistribute it and/or modify
16
+ # it under the terms of the GNU General Public License as published by
17
+ # the Free Software Foundation, either version 3 of the License, or
18
+ # (at your option) any later version.
19
+ #
20
+ # This program is distributed in the hope that it will be useful,
21
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
22
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
23
+ # GNU General Public License for more details.
24
+ #
25
+ # You should have received a copy of the GNU General Public License
26
+ # along with this program. If not, see <http://www.gnu.org/licenses/>.
27
+
28
+ from __future__ import annotations
29
+
30
+ __all__ = (
31
+ "FatalWorkerError",
32
+ "IngesterCommunicator",
33
+ "ScannerCommunicator",
34
+ "SpawnProcessContext",
35
+ "SupervisorCommunicator",
36
+ "ThreadingContext",
37
+ "WorkerContext",
38
+ )
39
+
40
+ import cProfile
41
+ import dataclasses
42
+ import enum
43
+ import logging
44
+ import multiprocessing.context
45
+ import multiprocessing.synchronize
46
+ import os
47
+ import queue
48
+ import signal
49
+ import threading
50
+ import time
51
+ import uuid
52
+ from abc import ABC, abstractmethod
53
+ from collections.abc import Callable, Iterable, Iterator
54
+ from contextlib import AbstractContextManager, ExitStack, contextmanager
55
+ from traceback import format_exception
56
+ from types import TracebackType
57
+ from typing import Any, Literal, Self, TypeAlias, TypeVar, Union
58
+
59
+ from lsst.utils.logging import VERBOSE, LsstLogAdapter
60
+
61
+ from ._config import AggregatorConfig
62
+ from ._progress import Progress, make_worker_log
63
+ from ._structs import IngestRequest, ScanReport, ScanResult
64
+
65
+ _T = TypeVar("_T")
66
+
67
+ _TINY_TIMEOUT = 0.01
68
+
69
+ # multiprocessing.Queue is a type according to the standard library type stubs,
70
+ # but it's really a function at runtime. But since the Python <= 3.11 type
71
+ # alias syntax uses the real runtime things we need to use strings, and hence
72
+ # we need to use Union. With Python 3.12's 'type' statement this gets cleaner.
73
+ Queue: TypeAlias = Union["queue.Queue[_T]", "multiprocessing.Queue[_T]"]
74
+
75
+ Event: TypeAlias = threading.Event | multiprocessing.synchronize.Event
76
+
77
+ Worker: TypeAlias = threading.Thread | multiprocessing.context.SpawnProcess
78
+
79
+
80
class WorkerContext(ABC):
    """Minimal abstraction over the `threading` and `multiprocessing`
    primitives the aggregator needs, allowing workers to be run either way.
    """

    @abstractmethod
    def make_queue(self) -> Queue[Any]:
        """Return a new, empty queue for passing objects between workers
        created by this context.
        """
        raise NotImplementedError()

    @abstractmethod
    def make_event(self) -> Event:
        """Return a new event object for signaling a boolean state change to
        workers created by this context.
        """
        raise NotImplementedError()

    @abstractmethod
    def make_worker(
        self, target: Callable[..., None], args: tuple[Any, ...], name: str | None = None
    ) -> Worker:
        """Create (but do not start) a worker that will run ``target``.

        Parameters
        ----------
        target : `~collections.abc.Callable`
            A callable to invoke on the worker.
        args : `tuple`
            Positional arguments to pass to the callable.
        name : `str`, optional
            Human-readable name for the worker.

        Returns
        -------
        worker : `threading.Thread` or `multiprocessing.Process`
            Process or thread.  Its ``start`` method must be called to
            actually begin execution.
        """
        raise NotImplementedError()
121
+
122
+
123
class ThreadingContext(WorkerContext):
    """A `WorkerContext` whose workers are threads within the current
    process.
    """

    def make_queue(self) -> Queue[Any]:
        return queue.Queue()

    def make_event(self) -> Event:
        return threading.Event()

    def make_worker(
        self, target: Callable[..., None], args: tuple[Any, ...], name: str | None = None
    ) -> Worker:
        return threading.Thread(target=target, args=args, name=name)
138
+
139
+
140
class SpawnProcessContext(WorkerContext):
    """A `WorkerContext` whose workers are subprocesses started via the
    multiprocessing "spawn" start method.
    """

    def __init__(self) -> None:
        self._ctx = multiprocessing.get_context("spawn")

    def make_queue(self) -> Queue[Any]:
        return self._ctx.Queue()

    def make_event(self) -> Event:
        return self._ctx.Event()

    def make_worker(
        self, target: Callable[..., None], args: tuple[Any, ...], name: str | None = None
    ) -> Worker:
        return self._ctx.Process(target=target, args=args, name=name)
158
+
159
+
160
def _get_from_queue(q: Queue[_T], block: bool = False, timeout: float | None = None) -> _T | None:
    """Pop an object from ``q``, returning `None` instead of raising when
    nothing is available.

    Parameters
    ----------
    q : `Queue`
        Queue to get an object from.
    block : `bool`
        Whether to block until an object is available.
    timeout : `float` or `None`, optional
        Maximum number of seconds to wait while blocking.

    Returns
    -------
    obj : `object` or `None`
        Object from the queue, or `None` if it was empty.
    """
    try:
        return q.get(block=block, timeout=timeout)
    except queue.Empty:
        return None
181
+
182
+
183
class FatalWorkerError(BaseException):
    """Raised by communicators to force a shutdown after one worker
    (including the supervisor) has caught an exception, in order to signal
    the others to shut down.

    This derives directly from `BaseException`, presumably so blanket
    ``except Exception`` handlers in worker code do not swallow it — confirm
    before relying on that.
    """
188
+
189
+
190
class _Sentinel(enum.Enum):
    """Sentinel values used to mark sequence points and worker shutdown
    conditions on the inter-worker queues.
    """

    NO_MORE_SCAN_REQUESTS = enum.auto()
    """Sent by the supervisor to the scanners when no quanta remain to be
    scanned.
    """

    NO_MORE_INGEST_REQUESTS = enum.auto()
    """Sent by a scanner to the ingester when that particular worker will
    issue no further ingest requests.
    """

    NO_MORE_WRITE_REQUESTS = enum.auto()
    """Sent by a scanner or the supervisor to the writer when that particular
    worker will issue no further write requests.
    """

    WRITE_REPORT = enum.auto()
    """Sent by the writer to the supervisor to report that one quantum's
    provenance has been written.
    """

    SCANNER_DONE = enum.auto()
    """Sent by a scanner to the supervisor to report that it is done and
    shutting down.
    """

    INGESTER_DONE = enum.auto()
    """Sent by the ingester to the supervisor to report that it is done and
    shutting down.
    """

    WRITER_DONE = enum.auto()
    """Sent by the writer to the supervisor to report that it is done and
    shutting down.
    """
230
+
231
+
232
@dataclasses.dataclass
class _WorkerErrorMessage:
    """An internal struct used to relay information about an error that
    occurred on a worker back to the supervisor.

    As a rule, these are unexpected, unrecoverable exceptions.
    """

    worker: str
    """Name of the originating worker."""

    traceback: str
    """A formatted exception traceback.

    Note that this is not a `BaseException` subclass that can actually be
    re-raised on the supervisor; it is only text we can log so that the right
    traceback appears on the screen.  If something silences that logging in
    favor of its own exception management (pytest!) this information
    disappears.
    """
252
+
253
+
254
@dataclasses.dataclass
class _ScanRequest:
    """An internal struct sent from the supervisor to the scanners asking
    for a quantum to be scanned.
    """

    quantum_id: uuid.UUID
    """ID of the quantum to be scanned."""
262
+
263
+
264
@dataclasses.dataclass
class _IngestReport:
    """An internal struct sent from the ingester to the supervisor to report
    a completed ingest batch.
    """

    n_producers: int
    """Number of producing quanta whose datasets were ingested.

    Quanta rather than datasets are counted here because the supervisor knows
    the total number of quanta in advance but not the total number of
    datasets to be ingested, which makes it much easier to attach a
    denominator and/or progress bar to this number.
    """
278
+
279
+
280
@dataclasses.dataclass
class _ProgressLog:
    """A high-level log message sent from a worker to the supervisor.

    These messages should appear to come from the main 'aggregate-graph'
    logger rather than a worker-specific one.
    """

    message: str
    """Log message."""

    level: int
    """Log level."""
293
+
294
+
295
@dataclasses.dataclass
class _CompressionDictionary:
    """An internal struct used to deliver the compression dictionary from
    the writer to the scanners.
    """

    data: bytes
    """The `bytes` representation of a `zstandard.ZstdCompressionDict`."""
304
+
305
+
306
# Union of every message type the supervisor may receive on its reports
# queue; see SupervisorCommunicator._handle_progress_reports for handling.
Report: TypeAlias = (
    ScanReport
    | _IngestReport
    | _WorkerErrorMessage
    | _ProgressLog
    | Literal[
        _Sentinel.WRITE_REPORT,
        _Sentinel.SCANNER_DONE,
        _Sentinel.INGESTER_DONE,
        _Sentinel.WRITER_DONE,
    ]
)
318
+
319
+
320
+ class SupervisorCommunicator:
321
+ """A helper object that lets the supervisor direct the other workers.
322
+
323
+ Parameters
324
+ ----------
325
+ log : `lsst.utils.logging.LsstLogAdapter`
326
+ LSST-customized logger.
327
+ n_scanners : `int`
328
+ Number of scanner workers.
329
+ context : `WorkerContext`
330
+ Abstraction over threading vs. multiprocessing.
331
+ config : `AggregatorConfig`
332
+ Configuration for the aggregator.
333
+ """
334
+
335
+ def __init__(
336
+ self,
337
+ log: LsstLogAdapter,
338
+ n_scanners: int,
339
+ context: WorkerContext,
340
+ config: AggregatorConfig,
341
+ ) -> None:
342
+ self.config = config
343
+ self.progress = Progress(log, config)
344
+ self.n_scanners = n_scanners
345
+ # The supervisor sends scan requests to scanners on this queue.
346
+ # When complete, the supervisor sends n_scanners sentinals and each
347
+ # scanner is careful to only take one before it starts its shutdown.
348
+ self._scan_requests: Queue[_ScanRequest | Literal[_Sentinel.NO_MORE_SCAN_REQUESTS]] = (
349
+ context.make_queue()
350
+ )
351
+ # The scanners send ingest requests to the ingester on this queue. Each
352
+ # scanner sends one sentinal when it is done, and the ingester is
353
+ # careful to wait for n_scanners sentinals to arrive before it starts
354
+ # its shutdown.
355
+ self._ingest_requests: Queue[IngestRequest | Literal[_Sentinel.NO_MORE_INGEST_REQUESTS]] = (
356
+ context.make_queue()
357
+ )
358
+ # The scanners send write requests to the writer on this queue (which
359
+ # will be `None` if we're not writing). The supervisor also sends
360
+ # write requests for blocked quanta (which we don't scan). Each
361
+ # scanner and the supervisor send one sentinal when done, and the
362
+ # writer waits for (n_scanners + 1) sentinals to arrive before it
363
+ # starts its shutdown.
364
+ self._write_requests: Queue[ScanResult | Literal[_Sentinel.NO_MORE_WRITE_REQUESTS]] | None = (
365
+ context.make_queue() if config.output_path is not None else None
366
+ )
367
+ # All other workers use this queue to send many different kinds of
368
+ # reports the supervisor. The supervisor waits for a _DONE sentinal
369
+ # from each worker before it finishes its shutdown.
370
+ self._reports: Queue[Report] = context.make_queue()
371
+ # The writer sends the compression dictionary to the scanners on this
372
+ # queue. It puts n_scanners copies on the queue, and each scanner only
373
+ # takes one. The compression_dict queue has no sentinal because it is
374
+ # only used at most once; the supervisor takes responsibility for
375
+ # clearing it out shutting down.
376
+ self._compression_dict: Queue[_CompressionDictionary] = context.make_queue()
377
+ # The supervisor sets this event when it receives an interrupt request
378
+ # from an exception in the main process (usually KeyboardInterrupt).
379
+ # Worker communicators check this in their polling loops and raise
380
+ # FatalWorkerError when they see it set.
381
+ self._cancel_event: Event = context.make_event()
382
+ # Track what state we are in closing down, so we can start at the right
383
+ # point if we're interrupted and __exit__ needs to clean up. Note that
384
+ # we can't rely on a non-exception __exit__ to do any shutdown work
385
+ # that might be slow, since a KeyboardInterrupt that occurs when
386
+ # __exit__ is already running can't be caught inside __exit__.
387
+ self._sent_no_more_scan_requests = False
388
+ self._sent_no_more_write_requests = False
389
+ self._n_scanners_done = 0
390
+ self._ingester_done = False
391
+ self._writer_done = self._write_requests is None
392
+
393
+ def wait_for_workers_to_finish(self, already_failing: bool = False) -> None:
394
+ if not self._sent_no_more_scan_requests:
395
+ for _ in range(self.n_scanners):
396
+ self._scan_requests.put(_Sentinel.NO_MORE_SCAN_REQUESTS, block=False)
397
+ self._sent_no_more_scan_requests = True
398
+ if not self._sent_no_more_write_requests and self._write_requests is not None:
399
+ self._write_requests.put(_Sentinel.NO_MORE_WRITE_REQUESTS, block=False)
400
+ self._sent_no_more_write_requests = True
401
+ while not (self._ingester_done and self._writer_done and self._n_scanners_done == self.n_scanners):
402
+ match self._handle_progress_reports(
403
+ self._reports.get(block=True), already_failing=already_failing
404
+ ):
405
+ case None | ScanReport() | _IngestReport():
406
+ pass
407
+ case _Sentinel.INGESTER_DONE:
408
+ self._ingester_done = True
409
+ self.progress.finish_ingests()
410
+ case _Sentinel.SCANNER_DONE:
411
+ self._n_scanners_done += 1
412
+ self.progress.finish_scans()
413
+ case _Sentinel.WRITER_DONE:
414
+ self._writer_done = True
415
+ self.progress.finish_writes()
416
+ case unexpected:
417
+ raise AssertionError(f"Unexpected message {unexpected!r} to supervisor.")
418
+ self.log.verbose(
419
+ "Blocking on reports queue: ingester_done=%s, writer_done=%s, n_scanners_done=%s.",
420
+ self._ingester_done,
421
+ self._writer_done,
422
+ self._n_scanners_done,
423
+ )
424
+ while _get_from_queue(self._compression_dict) is not None:
425
+ self.log.verbose("Flushing compression dict queue.")
426
+ self.log.verbose("Checking that all queues are empty.")
427
+ self._expect_empty_queue(self._scan_requests)
428
+ self._expect_empty_queue(self._ingest_requests)
429
+ if self._write_requests is not None:
430
+ self._expect_empty_queue(self._write_requests)
431
+ self._expect_empty_queue(self._reports)
432
+ self._expect_empty_queue(self._compression_dict)
433
+
434
+ def __enter__(self) -> Self:
435
+ self.progress.__enter__()
436
+ # We make the low-level logger in __enter__ instead of __init__ only
437
+ # because that's the pattern used by true workers (where it matters).
438
+ self.log = make_worker_log("supervisor", self.config)
439
+ return self
440
+
441
+ def __exit__(
442
+ self,
443
+ exc_type: type[BaseException] | None,
444
+ exc_value: BaseException | None,
445
+ traceback: TracebackType | None,
446
+ ) -> None:
447
+ if exc_type is not None:
448
+ if exc_type is not FatalWorkerError:
449
+ self.progress.log.critical(f"Caught {exc_type.__name__}; attempting to shut down cleanly.")
450
+ self._cancel_event.set()
451
+ self.wait_for_workers_to_finish(already_failing=exc_type is not None)
452
+ self.progress.__exit__(exc_type, exc_value, traceback)
453
+
454
+ def request_scan(self, quantum_id: uuid.UUID) -> None:
455
+ """Send a request to the scanners to scan the given quantum.
456
+
457
+ Parameters
458
+ ----------
459
+ quantum_id : `uuid.UUID`
460
+ ID of the quantum to scan.
461
+ """
462
+ self._scan_requests.put(_ScanRequest(quantum_id), block=False)
463
+
464
+ def request_write(self, scan_result: ScanResult) -> None:
465
+ """Send a request to the writer to write provenance for the given scan.
466
+
467
+ Parameters
468
+ ----------
469
+ scan_result : `ScanResult`
470
+ Information from scanning a quantum (or knowing you don't have to,
471
+ in the case of blocked quanta).
472
+ """
473
+ assert self._write_requests is not None, "Writer should not be used if writing is disabled."
474
+ self._write_requests.put(scan_result, block=False)
475
+
476
+ def poll(self) -> Iterator[ScanReport]:
477
+ """Poll for reports from workers while sending scan requests.
478
+
479
+ Yields
480
+ ------
481
+ scan_report : `ScanReport`
482
+ A report from a scanner that a quantum was scanned.
483
+
484
+ Notes
485
+ -----
486
+ This iterator blocks until the first scan report is received, and then
487
+ it continues until the report queue is empty.
488
+ """
489
+ block = True
490
+ msg = _get_from_queue(self._reports, block=block)
491
+ while msg is not None:
492
+ match self._handle_progress_reports(msg):
493
+ case ScanReport() as scan_report:
494
+ block = False
495
+ yield scan_report
496
+ case None:
497
+ pass
498
+ case unexpected:
499
+ raise AssertionError(f"Unexpected message {unexpected!r} to supervisor.")
500
+ msg = _get_from_queue(self._reports, block=block)
501
+
502
+ def _handle_progress_reports(
503
+ self, report: Report, already_failing: bool = False
504
+ ) -> (
505
+ ScanReport
506
+ | Literal[
507
+ _Sentinel.SCANNER_DONE,
508
+ _Sentinel.INGESTER_DONE,
509
+ _Sentinel.WRITER_DONE,
510
+ ]
511
+ | None
512
+ ):
513
+ """Handle reports to the supervisor that can appear at any time, and
514
+ are typically just updates to the progress we've made.
515
+
516
+ This includes:
517
+
518
+ - exceptions from workers (which raise `FatalWorkerError` here to
519
+ trigger ``__exit__``);
520
+ - ingest reports;
521
+ - write reports;
522
+ - progress logs.
523
+
524
+ If one of these is handled, `None` is returned; otherwise the original
525
+ report is returned.
526
+ """
527
+ match report:
528
+ case _WorkerErrorMessage(traceback=traceback, worker=worker):
529
+ self.progress.log.fatal("Exception raised on %s: \n%s", worker, traceback)
530
+ if not already_failing:
531
+ raise FatalWorkerError()
532
+ case _IngestReport(n_producers=n_producers):
533
+ self.progress.report_ingests(n_producers)
534
+ case _Sentinel.WRITE_REPORT:
535
+ self.progress.report_write()
536
+ case _ProgressLog(message=message, level=level):
537
+ self.progress.log.log(level, "%s [after %0.1fs]", message, self.progress.elapsed_time)
538
+ case _:
539
+ return report
540
+ return None
541
+
542
+ @staticmethod
543
+ def _expect_empty_queue(queue: Queue[Any]) -> None:
544
+ """Assert that the given queue is empty."""
545
+ if (msg := _get_from_queue(queue, block=False, timeout=0)) is not None:
546
+ raise AssertionError(f"Queue is not empty; found {msg!r}.")
547
+
548
+
549
class WorkerCommunicator:
    """A base class for non-supervisor workers.

    Parameters
    ----------
    supervisor : `SupervisorCommunicator`
        Communicator for the supervisor to grab queues and information from.
    name : `str`
        Human-readable name for this worker.

    Notes
    -----
    Each worker communicator is constructed in the main process and entered as
    a context manager on the actual worker process, so attributes that cannot
    be pickled are constructed in ``__enter__`` instead of ``__init__``.

    Worker communicators provide access to an `AggregatorConfig` and a logger
    to their workers. As context managers, they handle exceptions and ensure
    clean shutdowns, and since most workers need to use a lot of other context
    managers (for file reading and writing, mostly), they provide an `enter`
    method to keep every worker from also having to be a context manager just
    to hold a context manager instance attribute.

    Worker communicators can also be configured to record and dump profiling
    information.
    """

    def __init__(self, supervisor: SupervisorCommunicator, name: str):
        self.name = name
        self.config = supervisor.config
        self._reports = supervisor._reports
        self._cancel_event = supervisor._cancel_event

    def __enter__(self) -> Self:
        """Set up per-worker state that cannot be pickled: logger, exit
        stack, signal handling, and (optionally) a profiler.
        """
        self.log = make_worker_log(self.name, self.config)
        self.log.verbose("%s has PID %s (parent is %s).", self.name, os.getpid(), os.getppid())
        self._exit_stack = ExitStack().__enter__()
        if self.config.n_processes > 1:
            # Multiprocessing: ignore interrupts so we can shut down cleanly.
            signal.signal(signal.SIGINT, signal.SIG_IGN)
        if self.config.worker_profile_dir is not None:
            # We use time.time because we're interested in wall-clock time,
            # not just CPU effort, since this is I/O-bound work.
            self._profiler = cProfile.Profile(timer=time.time)
            self._profiler.enable()
        return self

    def __exit__(
        self,
        exc_type: type[BaseException] | None,
        exc_value: BaseException | None,
        traceback: TracebackType | None,
    ) -> bool | None:
        # This condition must mirror the one in __enter__ that enabled the
        # profiler; requiring n_processes > 1 here as well would leave the
        # profiler enabled and its stats never dumped in threaded mode.
        if self.config.worker_profile_dir is not None:
            self._profiler.disable()
            os.makedirs(self.config.worker_profile_dir, exist_ok=True)
            self._profiler.dump_stats(os.path.join(self.config.worker_profile_dir, f"{self.name}.profile"))
        if exc_value is not None:
            assert exc_type is not None, "Should be guaranteed by Python, but MyPy doesn't know that."
            if exc_type is not FatalWorkerError:
                self.log.warning("Error raised on this worker.", exc_info=(exc_type, exc_value, traceback))
                assert exc_type is not None and traceback is not None
                # Forward the formatted traceback so the supervisor can log
                # it; the exception object itself may not be picklable.
                self._reports.put(
                    _WorkerErrorMessage(
                        self.name,
                        "".join(format_exception(exc_type, exc_value, traceback)),
                    ),
                    block=False,
                )
                self.log.debug("Error message sent to supervisor.")
            else:
                self.log.warning("Shutting down due to exception raised on another worker.")
        self._exit_stack.__exit__(exc_type, exc_value, traceback)
        # Returning True suppresses the exception: it has already been
        # reported to the supervisor, which is responsible for shutdown.
        return True

    def log_progress(self, level: int, message: str) -> None:
        """Send a high-level log message to the supervisor.

        Parameters
        ----------
        level : `int`
            Log level. Should be ``VERBOSE`` or higher.
        message : `str`
            Log message.
        """
        self._reports.put(_ProgressLog(message=message, level=level), block=False)

    def enter(
        self,
        cm: AbstractContextManager[_T],
        on_close: str | None = None,
        level: int = VERBOSE,
        is_progress_log: bool = False,
    ) -> _T:
        """Enter a context manager that will be exited when the communicator's
        context is exited.

        Parameters
        ----------
        cm : `contextlib.AbstractContextManager`
            A context manager to enter.
        on_close : `str`, optional
            A log message to emit (on the worker's logger) just before the
            given context manager is exited. This can be used to indicate
            what's going on when an ``__exit__`` implementation has a lot of
            work to do (e.g. moving a large file into a zip archive).
        level : `int`, optional
            Level for the ``on_close`` log message.
        is_progress_log : `bool`, optional
            If `True`, send the ``on_close`` message to the supervisor via
            `log_progress` as well as the worker's logger.

        Returns
        -------
        result : `object`
            Whatever ``cm.__enter__`` returned.
        """
        if on_close is None:
            return self._exit_stack.enter_context(cm)

        @contextmanager
        def wrapper() -> Iterator[_T]:
            with cm as result:
                yield result
            # Log only after the wrapped __exit__ has finished its work.
            self.log.log(level, on_close)
            if is_progress_log:
                self.log_progress(level, on_close)

        return self._exit_stack.enter_context(wrapper())

    def check_for_cancel(self) -> None:
        """Check for a cancel signal from the supervisor and raise
        `FatalWorkerError` if it is present.
        """
        if self._cancel_event.is_set():
            raise FatalWorkerError()
680
+
681
+
682
class ScannerCommunicator(WorkerCommunicator):
    """A communicator for scanner workers.

    Parameters
    ----------
    supervisor : `SupervisorCommunicator`
        Communicator for the supervisor to grab queues and information from.
    scanner_id : `int`
        Integer ID for this scanner.
    """

    def __init__(self, supervisor: SupervisorCommunicator, scanner_id: int):
        super().__init__(supervisor, f"scanner-{scanner_id:03d}")
        self.scanner_id = scanner_id
        self._scan_requests = supervisor._scan_requests
        self._ingest_requests = supervisor._ingest_requests
        self._write_requests = supervisor._write_requests
        self._compression_dict = supervisor._compression_dict
        self._got_no_more_scan_requests: bool = False
        self._sent_no_more_ingest_requests: bool = False

    def report_scan(self, msg: ScanReport) -> None:
        """Report a completed scan to the supervisor.

        Parameters
        ----------
        msg : `ScanReport`
            Report to send.
        """
        self._reports.put(msg, block=False)

    def request_ingest(self, request: IngestRequest) -> None:
        """Ask the ingester to ingest a quantum's outputs.

        Parameters
        ----------
        request : `IngestRequest`
            Description of the datasets to ingest.

        Notes
        -----
        If this request has no datasets, this automatically reports the ingest
        as complete to the supervisor instead of sending it to the ingester.
        """
        if request:
            self._ingest_requests.put(request, block=False)
        else:
            self._reports.put(_IngestReport(1), block=False)

    def request_write(self, scan_result: ScanResult) -> None:
        """Ask the writer to write provenance for a quantum.

        Parameters
        ----------
        scan_result : `ScanResult`
            Result of scanning a quantum.
        """
        assert self._write_requests is not None, "Writer should not be used if writing is disabled."
        self._write_requests.put(scan_result, block=False)

    def get_compression_dict(self) -> bytes | None:
        """Attempt to get the compression dict from the writer.

        Returns
        -------
        data : `bytes` or `None`
            The `bytes` representation of the compression dictionary, or `None`
            if the compression dictionary is not yet available.

        Notes
        -----
        A scanner should only call this method before it actually has the
        compression dict.
        """
        if (cdict := _get_from_queue(self._compression_dict)) is not None:
            return cdict.data
        return None

    def poll(self) -> Iterator[uuid.UUID]:
        """Poll for scan requests to process.

        Yields
        ------
        quantum_id : `uuid.UUID`
            ID of a new quantum to scan.

        Notes
        -----
        This iterator ends when the supervisor reports that it is done
        traversing the graph.
        """
        while True:
            self.check_for_cancel()
            scan_request = _get_from_queue(self._scan_requests, block=True, timeout=self.config.worker_sleep)
            if scan_request is _Sentinel.NO_MORE_SCAN_REQUESTS:
                self._got_no_more_scan_requests = True
                return
            if scan_request is not None:
                yield scan_request.quantum_id

    def __exit__(
        self,
        exc_type: type[BaseException] | None,
        exc_value: BaseException | None,
        traceback: TracebackType | None,
    ) -> bool | None:
        result = super().__exit__(exc_type, exc_value, traceback)
        self._ingest_requests.put(_Sentinel.NO_MORE_INGEST_REQUESTS, block=False)
        if self._write_requests is not None:
            self._write_requests.put(_Sentinel.NO_MORE_WRITE_REQUESTS, block=False)
        # Drain our scan-request queue until we see (exactly one) shutdown
        # sentinel, so pending requests don't block a clean interpreter exit.
        while not self._got_no_more_scan_requests:
            # NOTE(review): multiprocessing.Queue.qsize is approximate and can
            # raise NotImplementedError on some platforms (e.g. macOS) --
            # confirm this debug call is safe wherever this runs.
            self.log.debug("Clearing scan request queue (~%d remaining)", self._scan_requests.qsize())
            if self._scan_requests.get() is _Sentinel.NO_MORE_SCAN_REQUESTS:
                self._got_no_more_scan_requests = True
        # We let the supervisor clear out the compression dict queue, because
        # a single scanner can't know if it ever got sent out or not.
        self.log.verbose("Sending done sentinal.")
        self._reports.put(_Sentinel.SCANNER_DONE, block=False)
        return result
804
+
805
+
806
class IngesterCommunicator(WorkerCommunicator):
    """A communicator for the ingester worker.

    Parameters
    ----------
    supervisor : `SupervisorCommunicator`
        Communicator for the supervisor to grab queues and information from.
    """

    def __init__(self, supervisor: SupervisorCommunicator):
        super().__init__(supervisor, "ingester")
        # Number of sentinels still outstanding; one per scanner.
        self._n_requesters_done = 0
        self._ingest_requests = supervisor._ingest_requests
        self.n_scanners = supervisor.n_scanners

    def __exit__(
        self,
        exc_type: type[BaseException] | None,
        exc_value: BaseException | None,
        traceback: TracebackType | None,
    ) -> bool | None:
        result = super().__exit__(exc_type, exc_value, traceback)
        # Keep draining the request queue until every scanner's "no more
        # requests" sentinel has been seen.
        while self._n_requesters_done != self.n_scanners:
            self.log.debug(
                "Waiting for %d requesters to be done (currently %d).",
                self.n_scanners,
                self._n_requesters_done,
            )
            if self._ingest_requests.get(block=True) is _Sentinel.NO_MORE_INGEST_REQUESTS:
                self._n_requesters_done += 1
        self.log.verbose("Sending done sentinal.")
        self._reports.put(_Sentinel.INGESTER_DONE, block=False)
        return result

    def report_ingest(self, n_producers: int) -> None:
        """Report to the supervisor that an ingest batch was completed.

        Parameters
        ----------
        n_producers : `int`
            Number of producing quanta whose datasets were ingested.
        """
        self._reports.put(_IngestReport(n_producers), block=False)

    def poll(self) -> Iterator[IngestRequest]:
        """Poll for ingest requests from the scanner workers.

        Yields
        ------
        request : `IngestRequest`
            A request to ingest datasets produced by a single quantum.

        Notes
        -----
        Iteration ends when all scanners indicate that they are done making
        ingest requests.
        """
        while True:
            self.check_for_cancel()
            request = _get_from_queue(self._ingest_requests, block=True, timeout=_TINY_TIMEOUT)
            if request is None:
                # Timed out with nothing to do; loop to re-check for cancel.
                continue
            if request is _Sentinel.NO_MORE_INGEST_REQUESTS:
                self._n_requesters_done += 1
                if self._n_requesters_done == self.n_scanners:
                    return
                continue
            yield request
874
+
875
+
876
+ class WriterCommunicator(WorkerCommunicator):
877
+ """A communicator for the writer worker.
878
+
879
+ Parameters
880
+ ----------
881
+ supervisor : `SupervisorCommunicator`
882
+ Communicator for the supervisor to grab queues and information from.
883
+ """
884
+
885
+ def __init__(self, supervisor: SupervisorCommunicator):
886
+ assert supervisor._write_requests is not None
887
+ super().__init__(supervisor, "writer")
888
+ self.n_scanners = supervisor.n_scanners
889
+ self._write_requests = supervisor._write_requests
890
+ self._compression_dict = supervisor._compression_dict
891
+ self._n_requesters = supervisor.n_scanners + 1
892
+ self._n_requesters_done = 0
893
+ self._sent_compression_dict = False
894
+
895
+ def __exit__(
896
+ self,
897
+ exc_type: type[BaseException] | None,
898
+ exc_value: BaseException | None,
899
+ traceback: TracebackType | None,
900
+ ) -> bool | None:
901
+ result = super().__exit__(exc_type, exc_value, traceback)
902
+ if exc_type is None:
903
+ self.log_progress(logging.INFO, "Provenance quantum graph written successfully.")
904
+ while self._n_requesters_done != self._n_requesters:
905
+ self.log.debug(
906
+ "Waiting for %d requesters to be done (currently %d).",
907
+ self._n_requesters,
908
+ self._n_requesters_done,
909
+ )
910
+ if self._write_requests.get(block=True) is _Sentinel.NO_MORE_WRITE_REQUESTS:
911
+ self._n_requesters_done += 1
912
+ self.log.verbose("Sending done sentinal.")
913
+ self._reports.put(_Sentinel.WRITER_DONE, block=False)
914
+ return result
915
+
916
+ def poll(self) -> Iterator[ScanResult]:
917
+ """Poll for writer requests from the scanner workers and supervisor.
918
+
919
+ Yields
920
+ ------
921
+ request : `ScanResult`
922
+ The result of a quantum scan.
923
+
924
+ Notes
925
+ -----
926
+ This iterator ends when all scanners and the supervisor indicate that
927
+ they are done making write requests.
928
+ """
929
+ while True:
930
+ self.check_for_cancel()
931
+ write_request = _get_from_queue(self._write_requests, block=True, timeout=_TINY_TIMEOUT)
932
+ if write_request is _Sentinel.NO_MORE_WRITE_REQUESTS:
933
+ self._n_requesters_done += 1
934
+ if self._n_requesters_done == self._n_requesters:
935
+ return
936
+ else:
937
+ continue
938
+ if write_request is not None:
939
+ yield write_request
940
+
941
+ def send_compression_dict(self, cdict_data: bytes) -> None:
942
+ """Send the compression dictionary to the scanners.
943
+
944
+ Parameters
945
+ ----------
946
+ cdict_data : `bytes`
947
+ The `bytes` representation of the compression dictionary.
948
+ """
949
+ self.log.debug("Sending compression dictionary.")
950
+ for _ in range(self.n_scanners):
951
+ self._compression_dict.put(_CompressionDictionary(cdict_data), block=False)
952
+ self._sent_compression_dict = True
953
+
954
+ def report_write(self) -> None:
955
+ """Report to the supervisor that provenance for a quantum was written
956
+ to the graph.
957
+ """
958
+ self._reports.put(_Sentinel.WRITE_REPORT, block=False)
959
+
960
+ def periodically_check_for_cancel(self, iterable: Iterable[_T], n: int = 100) -> Iterator[_T]:
961
+ """Iterate while checking for a cancellation signal every ``n``
962
+ iterations.
963
+
964
+ Parameters
965
+ ----------
966
+ iterable : `~collections.abc.Iterable`
967
+ Object to iterate over.
968
+ n : `int`
969
+ Check for cancellation every ``n`` iterations.
970
+
971
+ Returns
972
+ -------
973
+ iterator : `~collections.abc.Iterator`
974
+ Iterator.
975
+ """
976
+ i = 0
977
+ for entry in iterable:
978
+ yield entry
979
+ i += 1
980
+ if i % n == 0:
981
+ self.check_for_cancel()