warmpool 0.1.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,80 @@
1
+ Metadata-Version: 2.4
2
+ Name: warmpool
3
+ Version: 0.1.1
4
+ Summary: Process pool with hard-kill timeouts and import warming
5
+ Keywords: subprocess,pool,timeout,multiprocessing
6
+ Author: Michael Dawson-Haggerty
7
+ License-Expression: MIT
8
+ Classifier: Development Status :: 3 - Alpha
9
+ Classifier: License :: OSI Approved :: MIT License
10
+ Classifier: Programming Language :: Python :: 3
11
+ Classifier: Programming Language :: Python :: 3.10
12
+ Classifier: Programming Language :: Python :: 3.11
13
+ Classifier: Programming Language :: Python :: 3.12
14
+ Classifier: Programming Language :: Python :: 3.13
15
+ Requires-Dist: psutil>=5.9
16
+ Requires-Python: >=3.10
17
+ Project-URL: Repository, https://github.com/slopden/warmpool
18
+ Description-Content-Type: text/markdown
19
+
20
+ <p align="center">
21
+ <img src="static/logo.svg" alt="warmpool" width="480">
22
+ </p>
23
+
24
+ <p align="center">
25
+ <strong>A single-worker subprocess pool that can actually kill C extensions.</strong>
26
+ </p>
27
+
28
+ <p align="center">
29
+ <a href="https://pypi.org/project/warmpool/"><img alt="PyPI" src="https://img.shields.io/pypi/v/warmpool?color=ff6b35"></a>
30
+ <a href="https://github.com/slopden/warmpool/blob/main/LICENSE"><img alt="License" src="https://img.shields.io/github/license/slopden/warmpool?color=ff3d00"></a>
31
+ <img alt="Python" src="https://img.shields.io/pypi/pyversions/warmpool?color=1a1a2e">
32
+ </p>
33
+
34
+ ---
35
+
36
+ A "ProcessPool-like-executor" with hard-kill timeouts and import warming. The basic problem is that if you freeze up deep in a C-extension Python timeout-handling stuff doesn't work. `warmpool` runs functions in a spawned subprocess, and if they exceed their timeout it SIGTERM+SIGKILL the process and all children if the C extension has spawned anything.
37
+
38
+ - It calls a "warming function" in each new process, so you can have it keep a process warmed with `import scipy, numpy, etc` which can easily be 2+ seconds.
39
+ - The timeouts actually work regardless of what happens in the function.
40
+ - It has an option to keep a spare process warm in the background so it can rotate cleanly without eating an import period.
41
+ - It sends logs back to the parent through a pipe.
42
+
43
+ ```python
44
+ import time
45
+ from warmpool import WarmPool
46
+
47
+ def warm_imports():
48
+ import numpy
49
+ import scipy.linalg
50
+
51
+ def eigh_huge(n=5000):
52
+ """Stuck in LAPACK C code — only SIGKILL works."""
53
+ import numpy as np
54
+ from scipy import linalg
55
+ a = np.random.rand(n, n)
56
+ a = a + a.T
57
+ return linalg.eigh(a)
58
+
59
+ def add(a=0, b=0):
60
+ return a + b
61
+
62
+ pool = WarmPool(warming=warm_imports)
63
+
64
+ # numpy+scipy are already imported — no 2s wait
65
+ start = time.perf_counter()
66
+ try:
67
+ pool.run(eigh_huge, timeout=0.5, n=5000)
68
+ except TimeoutError:
69
+ print(f"killed after {time.perf_counter() - start:.2f}s")
70
+
71
+ # pool recovered via spare
72
+ result = pool.run(add, timeout=5.0, a=2, b=3)
73
+ print(f"recovered: add(2, 3) = {result}")
74
+ pool.shutdown()
75
+ ```
76
+
77
+ ```
78
+ killed after 0.53s
79
+ recovered: add(2, 3) = 5
80
+ ```
@@ -0,0 +1,61 @@
1
+ <p align="center">
2
+ <img src="static/logo.svg" alt="warmpool" width="480">
3
+ </p>
4
+
5
+ <p align="center">
6
+ <strong>A single-worker subprocess pool that can actually kill C extensions.</strong>
7
+ </p>
8
+
9
+ <p align="center">
10
+ <a href="https://pypi.org/project/warmpool/"><img alt="PyPI" src="https://img.shields.io/pypi/v/warmpool?color=ff6b35"></a>
11
+ <a href="https://github.com/slopden/warmpool/blob/main/LICENSE"><img alt="License" src="https://img.shields.io/github/license/slopden/warmpool?color=ff3d00"></a>
12
+ <img alt="Python" src="https://img.shields.io/pypi/pyversions/warmpool?color=1a1a2e">
13
+ </p>
14
+
15
+ ---
16
+
17
+ A "ProcessPool-like-executor" with hard-kill timeouts and import warming. The basic problem is that if you freeze up deep in a C-extension Python timeout-handling stuff doesn't work. `warmpool` runs functions in a spawned subprocess, and if they exceed their timeout it SIGTERM+SIGKILL the process and all children if the C extension has spawned anything.
18
+
19
+ - It calls a "warming function" in each new process, so you can have it keep a process warmed with `import scipy, numpy, etc` which can easily be 2+ seconds.
20
+ - The timeouts actually work regardless of what happens in the function.
21
+ - It has an option to keep a spare process warm in the background so it can rotate cleanly without eating an import period.
22
+ - It sends logs back to the parent through a pipe.
23
+
24
+ ```python
25
+ import time
26
+ from warmpool import WarmPool
27
+
28
+ def warm_imports():
29
+ import numpy
30
+ import scipy.linalg
31
+
32
+ def eigh_huge(n=5000):
33
+ """Stuck in LAPACK C code — only SIGKILL works."""
34
+ import numpy as np
35
+ from scipy import linalg
36
+ a = np.random.rand(n, n)
37
+ a = a + a.T
38
+ return linalg.eigh(a)
39
+
40
+ def add(a=0, b=0):
41
+ return a + b
42
+
43
+ pool = WarmPool(warming=warm_imports)
44
+
45
+ # numpy+scipy are already imported — no 2s wait
46
+ start = time.perf_counter()
47
+ try:
48
+ pool.run(eigh_huge, timeout=0.5, n=5000)
49
+ except TimeoutError:
50
+ print(f"killed after {time.perf_counter() - start:.2f}s")
51
+
52
+ # pool recovered via spare
53
+ result = pool.run(add, timeout=5.0, a=2, b=3)
54
+ print(f"recovered: add(2, 3) = {result}")
55
+ pool.shutdown()
56
+ ```
57
+
58
+ ```
59
+ killed after 0.53s
60
+ recovered: add(2, 3) = 5
61
+ ```
@@ -0,0 +1,37 @@
1
+ [project]
2
+ name = "warmpool"
3
+ version = "0.1.1"
4
+ description = "Process pool with hard-kill timeouts and import warming"
5
+ readme = "README.md"
6
+ license = "MIT"
7
+ requires-python = ">=3.10"
8
+ dependencies = ["psutil>=5.9"]
9
+ authors = [{name = "Michael Dawson-Haggerty"}]
10
+ keywords = ["subprocess", "pool", "timeout", "multiprocessing"]
11
+ classifiers = [
12
+ "Development Status :: 3 - Alpha",
13
+ "License :: OSI Approved :: MIT License",
14
+ "Programming Language :: Python :: 3",
15
+ "Programming Language :: Python :: 3.10",
16
+ "Programming Language :: Python :: 3.11",
17
+ "Programming Language :: Python :: 3.12",
18
+ "Programming Language :: Python :: 3.13",
19
+ ]
20
+
21
+ [project.urls]
22
+ Repository = "https://github.com/slopden/warmpool"
23
+
24
+ [dependency-groups]
25
+ dev = [
26
+ "pytest>=8.0",
27
+ "pytest-asyncio>=0.24",
28
+ "scipy>=1.15.3",
29
+ ]
30
+
31
+ [build-system]
32
+ requires = ["uv_build>=0.10.9,<0.11.0"]
33
+ build-backend = "uv_build"
34
+
35
+ [tool.pytest.ini_options]
36
+ testpaths = ["tests"]
37
+ asyncio_mode = "auto"
@@ -0,0 +1,20 @@
1
+ """warmpool — single-worker subprocess pool with hard-kill timeouts.
2
+
3
+ Usage
4
+ -----
5
+ >>> from warmpool import WarmPool, PoolStatus
6
+ >>> pool = WarmPool(max_tasks=100, keep_spare=True)
7
+ >>> result = pool.run(my_func, timeout=10.0, x=42)
8
+ >>> pool.status is PoolStatus.READY
9
+ True
10
+ >>> pool.shutdown()
11
+
12
+ Memory-based rotation:
13
+
14
+ >>> pool = WarmPool(max_memory=500 * 1024 * 1024)
15
+ """
16
+
17
+ from ._exceptions import ProcessPoolExhausted
18
+ from .pool import PoolStatus, WarmPool
19
+
20
+ __all__ = ["PoolStatus", "WarmPool", "ProcessPoolExhausted"]
@@ -0,0 +1,21 @@
1
+ """Exception types for the warmpool package."""
2
+
3
+
4
class ProcessPoolExhausted(Exception):
    """Raised when the worker process is dead or has reached its task limit.

    With ``keep_spare=False`` there is nothing left to promote, so the
    caller has to construct a fresh :class:`~warmpool.WarmPool` instance
    to keep going.

    Parameters
    ----------
    message
        Human-readable reason the pool can no longer accept work.
    exit_code
        Exit code of the dead worker, or ``None`` when the pool was shut
        down deliberately or no code could be determined.
    """

    def __init__(self, message: str, exit_code: int | None = None):
        # Keep the worker's exit code around for diagnostics.
        self.exit_code = exit_code
        super().__init__(message)
@@ -0,0 +1,80 @@
1
+ """Pipe-based logging for worker subprocesses.
2
+
3
+ Provides :class:`PipeHandler` (installed in workers) and
4
+ :func:`forward_subprocess_log` (called in the parent) so that log
5
+ records cross the process boundary as structured dicts.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ import logging
11
+ import traceback
12
+ from multiprocessing.connection import Connection
13
+ from typing import Any
14
+
15
+
16
class PipeHandler(logging.Handler):
    """Forward log records to the parent process over a multiprocessing pipe.

    Each record is flattened into a plain (JSON-ready) dict and sent as a
    ``("log", entry, {})`` message. Install this in the **worker**
    subprocess so all of its log output shows up in the parent.

    Parameters
    ----------
    connection
        The child-side :class:`~multiprocessing.connection.Connection`.
    """

    def __init__(self, connection: Connection):
        super().__init__()
        self.connection = connection

    def emit(self, record: logging.LogRecord) -> None:
        """Serialize *record* into a dict and push it through the pipe.

        Parameters
        ----------
        record
            The log record to forward.
        """
        try:
            payload: dict = {
                "timestamp": record.created,
                "level": record.levelname,
                "message": record.getMessage(),
                "logger": record.name,
                "process_id": record.process,
            }
            exc = record.exc_info
            if exc and exc[1] is not None:
                payload["exception"] = "".join(traceback.format_exception(*exc))
            self.connection.send(("log", payload, {}))
        except Exception:
            # Logging must never take the worker down.
            pass
56
+
57
+
58
def forward_subprocess_log(
    payload: dict[str, Any],
    logger: logging.Logger | None = None,
) -> None:
    """Re-emit a structured subprocess log record on the parent side.

    Parameters
    ----------
    payload
        The structured dict produced by :class:`PipeHandler` in the worker.
    logger
        Logger to emit on. Defaults to ``warmpool.subprocess``.
    """
    target = logging.getLogger("warmpool.subprocess") if logger is None else logger
    # Map the level name back to a numeric level, defaulting to INFO.
    level = getattr(logging, payload.get("level", "INFO"), logging.INFO)
    # Everything except the level/message fields rides along as ``extra``.
    skipped = ("level", "message", "levelname", "levelno")
    extra = {key: value for key, value in payload.items() if key not in skipped}
    target.log(level, payload.get("message", ""), extra=extra)
@@ -0,0 +1,76 @@
1
+ """Worker subprocess entry point.
2
+
3
+ This module is imported by the spawned child process. It sets up
4
+ pipe-based logging, runs the optional warming callable, then enters a
5
+ receive-execute-send loop until the parent sends a shutdown sentinel
6
+ (``func is None``) or the pipe breaks.
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ import logging
12
+ import time
13
+ from multiprocessing.connection import Connection
14
+ from typing import Callable
15
+
16
+ from ._logging import PipeHandler
17
+
18
+
19
def _worker_process(
    connection: Connection,
    log_level: int = logging.DEBUG,
    warming: Callable | None = None,
) -> None:
    """Entry point executed inside the spawned worker subprocess.

    Parameters
    ----------
    connection
        Child-side pipe connection shared with the parent.
    log_level
        Level applied to the worker's root logger.
    warming
        Optional callable invoked once on startup (e.g. to pre-import
        modules). Its return value is sent to the parent.

    Notes
    -----
    1. Swaps every root-logger handler for a :class:`PipeHandler` so all
       log records travel to the parent as structured dicts.
    2. Runs *warming* if one was supplied.
    3. Announces ``("ready", init_result, {})`` and then loops: receive a
       task, execute it, send the outcome back, until the parent sends
       the ``func is None`` sentinel or the pipe breaks.
    """
    root_logger = logging.getLogger()
    root_logger.handlers.clear()
    root_logger.addHandler(PipeHandler(connection))
    root_logger.setLevel(log_level)

    init_result = None if warming is None else warming()
    connection.send(("ready", init_result, {}))

    try:
        while True:
            if not connection.poll(timeout=None):
                continue

            try:
                func, args, kwargs = connection.recv()
                if func is None:
                    # Shutdown sentinel from the parent.
                    break

                began = time.perf_counter()
                outcome = func(*args, **kwargs)
                elapsed_ms = int((time.perf_counter() - began) * 1000)

                connection.send(("success", outcome, {"elapsed_ms": elapsed_ms}))
            except Exception as error:
                # Some exceptions (common with C-API wrappers) can't be
                # pickled; falling back to a repr keeps the parent from
                # seeing a silent worker death instead of a real error.
                try:
                    connection.send(("error", error, {}))
                except Exception:
                    connection.send(("error", RuntimeError(repr(error)), {}))
    except (EOFError, BrokenPipeError):
        pass
    finally:
        connection.close()
@@ -0,0 +1,642 @@
1
+ """Single-worker subprocess pool with hard-kill timeouts.
2
+
3
+ Runs callables in a spawned subprocess that can be SIGKILLed when
4
+ C-API code (OpenCASCADE, etc.) ignores Python signals. An optional
5
+ spare worker is pre-warmed in the background so that rotation after
6
+ task-limit exhaustion or crash is near-instant.
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ import asyncio
12
+ import atexit
13
+ import enum
14
+ import logging
15
+ import multiprocessing
16
+ import time
17
+ import weakref
18
+ from dataclasses import dataclass, field
19
+ from multiprocessing import Pipe, Process
20
+ from multiprocessing.connection import Connection
21
+ from typing import Any, Callable, NoReturn
22
+
23
+ import psutil
24
+
25
+ from ._exceptions import ProcessPoolExhausted
26
+ from ._logging import forward_subprocess_log
27
+ from ._worker import _worker_process
28
+
29
+ log = logging.getLogger(__name__)
30
+
31
+ # Seconds to wait when polling a pipe for data.
32
+ _POLL_TIMEOUT = 0.1
33
+ # Seconds to wait for a worker to join after a graceful shutdown signal.
34
+ _JOIN_TIMEOUT = 0.5
35
+ # Seconds to wait for a process tree to die after SIGKILL.
36
+ _KILL_WAIT = 1.0
37
+
38
+ _active_pools: weakref.WeakSet[WarmPool] = weakref.WeakSet()
39
+
40
+
41
+ def _cleanup_all_pools() -> None:
42
+ """Shut down every live pool at interpreter exit."""
43
+ for pool in list(_active_pools):
44
+ try:
45
+ pool.shutdown()
46
+ except Exception:
47
+ pass
48
+
49
+
50
+ atexit.register(_cleanup_all_pools)
51
+
52
+
53
class PoolStatus(enum.Enum):
    """Readiness state of a :class:`WarmPool`.

    Exposed through :attr:`WarmPool.status`. The pool dispatches on this
    enum at every decision point, placing :func:`_assert_never` in the
    ``else`` branch so mypy can prove every member is handled.

    Attributes
    ----------
    READY
        The active worker is alive and under its task limit.
    NEEDS_ROTATION
        The active worker is spent or dead, but a spare can take over.
    EXHAUSTED
        No worker is available and there is no spare to promote.
    SHUTDOWN
        The pool was explicitly shut down.
    """

    READY = "ready"
    NEEDS_ROTATION = "rotation"
    EXHAUSTED = "exhausted"
    SHUTDOWN = "shutdown"
76
+
77
+
78
+ def _assert_never(value: NoReturn) -> NoReturn:
79
+ """Statically assert all enum cases are handled.
80
+
81
+ Mypy narrows the type through each ``if``/``elif`` branch. If all
82
+ :class:`PoolStatus` members are covered, the remaining type at the
83
+ ``else`` is ``Never``/``NoReturn`` and this call type-checks.
84
+ Adding a new enum member without a matching branch causes a mypy
85
+ error.
86
+ """
87
+ raise AssertionError(f"Unhandled status: {value!r}")
88
+
89
+
90
@dataclass
class WorkerHandle:
    """Per-worker bookkeeping bundle.

    Parameters
    ----------
    process
        The worker's :class:`multiprocessing.Process`.
    connection
        Parent-side end of the pipe.
    child_connection
        Child-side end of the pipe (retained so it can be closed during
        cleanup).
    ready
        Set once the worker has announced its ``"ready"`` message.
    task_count
        How many tasks have been dispatched to this worker.
    last_metadata
        Metadata dict from the most recently completed task.
    init_result
        Whatever the warming callable returned, if anything.
    """

    process: multiprocessing.Process
    connection: Connection
    child_connection: Connection
    ready: bool = False
    task_count: int = 0
    last_metadata: dict[str, Any] = field(default_factory=dict)
    init_result: Any = None
118
+
119
+
120
class WarmPool:
    """Single-worker subprocess pool with hard-kill timeouts.

    Runs functions in a spawned subprocess that can be SIGKILLed when
    C-API code (OpenCASCADE, etc.) ignores Python signals.

    .. note::
        This class is **not** thread-safe. Do not call :meth:`run` from
        multiple threads concurrently.

    Parameters
    ----------
    warming
        Optional callable invoked once per worker on startup (e.g. to
        pre-import modules). Its return value is available via
        :attr:`init_result`.
    max_tasks
        Maximum tasks a single worker may handle before rotation.
    keep_spare
        If ``True``, a spare worker is pre-warmed in the background so
        rotation is near-instant.
    ready_timeout
        Seconds to wait for a worker to send its ``"ready"`` signal.
    max_memory
        Maximum RSS in bytes before the worker is rotated.
    max_memory_percent
        Maximum RSS as a fraction of total system memory (0.0–1.0)
        before the worker is rotated.
    init_retries
        Number of additional attempts to start the primary worker if it
        fails to become ready during ``__init__``.
    """

    def __init__(
        self,
        max_tasks: int = 50,
        keep_spare: bool = True,
        ready_timeout: float = 30.0,
        max_memory: int | None = None,
        max_memory_percent: float | None = 0.35,
        warming: Callable | None = None,
        init_retries: int = 1,
    ) -> None:
        self._max_tasks = max_tasks
        self._keep_spare = keep_spare
        self._ready_timeout = ready_timeout
        self._warming = warming
        self._max_memory = max_memory
        self._init_retries = init_retries
        # Pre-compute absolute byte limit from percentage (avoid per-task psutil call).
        if max_memory_percent is not None:
            clamped = max(0.0, min(1.0, max_memory_percent))
            self._max_memory_percent_bytes: int | None = int(
                clamped * psutil.virtual_memory().total
            )
        else:
            self._max_memory_percent_bytes = None

        self._active: WorkerHandle | None = None
        self._spare: WorkerHandle | None = None
        self._shutdown = False
        # Pool-level cache so elapsed_ms survives rotation.
        self._last_elapsed_ms: int | None = None
        self._last_memory_rss: int | None = None

        # Register for best-effort cleanup at interpreter exit.
        _active_pools.add(self)

        # Start primary worker (blocking), with retry.
        for attempt in range(1 + self._init_retries):
            try:
                self._active = self._start_worker(block_ready=True)
                break
            except RuntimeError:
                if attempt < self._init_retries:
                    log.warning(
                        "Primary worker failed to become ready, "
                        f"retrying ({attempt + 1}/{self._init_retries})..."
                    )
                    # Fixed back-off before retrying a failed spawn.
                    time.sleep(5)
                else:
                    raise

        # Start spare (non-blocking) if requested.
        if self._keep_spare:
            self._spare = self._start_worker(block_ready=False)

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------

    @property
    def status(self) -> PoolStatus:
        """The pool's current readiness state.

        Returns
        -------
        PoolStatus
            Pure query — no side effects, no mutations.
        """
        if self._shutdown:
            return PoolStatus.SHUTDOWN
        if (
            self._active is not None
            and self._active.process.is_alive()
            and self._active.task_count < self._max_tasks
        ):
            return PoolStatus.READY
        if self._keep_spare:
            return PoolStatus.NEEDS_ROTATION
        return PoolStatus.EXHAUSTED

    @property
    def init_result(self) -> Any:
        """Return value of ``warming`` from the active worker, or ``None``."""
        return self._active.init_result if self._active else None

    @property
    def last_elapsed_ms(self) -> int | None:
        """Wall-clock milliseconds the last completed task took.

        Returns
        -------
        int or None
            ``None`` if no task has completed yet.
        """
        return self._last_elapsed_ms

    @property
    def last_memory_rss(self) -> int | None:
        """RSS in bytes of the worker after the last completed task.

        Returns ``None`` if no task has completed or memory checking is disabled.
        """
        return self._last_memory_rss

    def run(self, function: Callable, timeout: float, **kwargs: Any) -> Any:
        """Run *function* in the worker subprocess (blocking).

        Parameters
        ----------
        function
            A picklable callable to execute in the worker.
        timeout
            Hard timeout in seconds; the worker is SIGKILLed if it
            exceeds this.
        **kwargs
            Keyword arguments forwarded to *function*.

        Returns
        -------
        Any
            Whatever *function* returns.

        Raises
        ------
        TimeoutError
            If the worker exceeds *timeout*.
        ProcessPoolExhausted
            If the pool has no available workers.
        """
        status = self.status
        if status is PoolStatus.READY:
            pass
        elif status is PoolStatus.NEEDS_ROTATION:
            self._rotate_worker()
        elif status is PoolStatus.EXHAUSTED:
            # Capture diagnostics *before* cleanup.
            task_count = self._active.task_count if self._active else 0
            exit_code = self._active.process.exitcode if self._active else None
            if self._active is not None:
                self._shutdown_worker(self._active)
                self._active = None
            raise ProcessPoolExhausted(
                f"tasks={task_count}/{self._max_tasks}",
                exit_code=exit_code,
            )
        elif status is PoolStatus.SHUTDOWN:
            raise ProcessPoolExhausted("Pool is shut down")
        else:
            _assert_never(status)

        # At this point self._active is guaranteed non-None.
        handle = self._active
        assert handle is not None  # narrowing for mypy

        # Send the task.
        handle.connection.send((function, (), kwargs))
        handle.task_count += 1

        # Wait for result.
        try:
            result = self._wait_for_result(handle, function, timeout)
        except (TimeoutError, ProcessPoolExhausted):
            # Worker is compromised: kill the whole tree and (optionally)
            # swap in the warm spare before re-raising to the caller.
            self._kill_worker(handle)
            self._active = None
            if self._keep_spare:
                try:
                    self._promote_spare()
                except Exception:
                    log.warning("Failed to promote spare after error", exc_info=True)
            raise

        # Persist elapsed_ms at pool level so it survives rotation.
        self._last_elapsed_ms = handle.last_metadata.get("elapsed_ms")

        # Rotate after the final allowed task or memory limit exceeded.
        memory_exceeded = self._exceeds_memory_limit(handle)
        if handle.task_count >= self._max_tasks or memory_exceeded:
            self._shutdown_worker(handle)
            self._active = None
            if self._keep_spare:
                try:
                    self._promote_spare()
                except Exception:
                    log.warning("Failed to promote spare after rotation", exc_info=True)

        return result

    async def arun(self, function: Callable, timeout: float, **kwargs: Any) -> Any:
        """Async wrapper around :meth:`run`.

        Runs the blocking call in the default executor so the event loop
        is not blocked while waiting on the worker.

        Parameters
        ----------
        function
            A picklable callable.
        timeout
            Hard timeout in seconds.
        **kwargs
            Forwarded to *function*.

        Returns
        -------
        Any
            Whatever *function* returns.
        """
        loop = asyncio.get_running_loop()
        return await loop.run_in_executor(
            None, lambda: self.run(function, timeout, **kwargs)
        )

    def shutdown(self) -> None:
        """Shut down all workers and mark the pool as dead."""
        self._shutdown = True
        if self._active is not None:
            self._shutdown_worker(self._active)
            self._active = None
        if self._spare is not None:
            self._shutdown_worker(self._spare)
            self._spare = None

    # ------------------------------------------------------------------
    # Worker lifecycle
    # ------------------------------------------------------------------

    def _start_worker(self, block_ready: bool = True) -> WorkerHandle:
        """Spawn a new worker subprocess.

        Parameters
        ----------
        block_ready
            If ``True``, block until the worker sends ``"ready"``.

        Returns
        -------
        WorkerHandle

        Raises
        ------
        RuntimeError
            If ``block_ready`` is set and the worker never signals ready.
        """
        parent_connection, child_connection = Pipe()
        log_level = logging.getLogger().getEffectiveLevel()
        try:
            context = multiprocessing.get_context("spawn")
            process = context.Process(
                target=_worker_process,
                args=(child_connection, log_level, self._warming),
            )
        except RuntimeError:
            # Fall back to the default start method if the spawn context
            # cannot be created on this platform.
            process = Process(
                target=_worker_process,
                args=(child_connection, log_level, self._warming),
            )

        process.start()
        handle = WorkerHandle(
            process=process,
            connection=parent_connection,
            child_connection=child_connection,
        )
        log.info(f"Started worker pid={process.pid}")

        if block_ready:
            if not self._wait_for_ready(handle):
                self._kill_worker(handle)
                raise RuntimeError(
                    f"Worker pid={handle.process.pid} failed to become ready "
                    f"within {self._ready_timeout}s"
                )

        return handle

    def _wait_for_ready(self, handle: WorkerHandle) -> bool:
        """Block until *handle* sends ``"ready"`` or the timeout expires.

        Log messages received while waiting are forwarded to the parent's
        logging system.

        Parameters
        ----------
        handle
            The worker to wait on.

        Returns
        -------
        bool
            ``True`` if the worker became ready, ``False`` on timeout or error.
        """
        deadline = time.perf_counter() + self._ready_timeout
        while time.perf_counter() < deadline:
            remaining = max(0.01, deadline - time.perf_counter())
            if not handle.connection.poll(timeout=remaining):
                break
            try:
                status, payload, _ = handle.connection.recv()
                if status == "log":
                    forward_subprocess_log(payload)
                    continue
                if status == "ready":
                    handle.ready = True
                    handle.init_result = payload
                    return True
                # Any other message here is a protocol violation.
                log.warning(f"Expected 'ready', got: {status}")
                return False
            except Exception as error:
                log.warning(f"Failed to receive ready signal: {error}")
                return False
        log.warning("Worker didn't send ready signal within timeout")
        return False

    def _wait_for_result(
        self, handle: WorkerHandle, function: Callable, timeout: float
    ) -> Any:
        """Poll *handle* for the task result, forwarding log messages.

        Parameters
        ----------
        handle
            The active worker handle.
        function
            The function that was dispatched (used for error messages).
        timeout
            Hard timeout in seconds.

        Returns
        -------
        Any
            The return value of *function*.

        Raises
        ------
        TimeoutError
            If *timeout* is exceeded.
        ProcessPoolExhausted
            If the worker dies mid-task.
        """
        start = time.perf_counter()
        while time.perf_counter() - start < timeout:
            if handle.connection.poll(timeout=_POLL_TIMEOUT):
                try:
                    status, payload, metadata = handle.connection.recv()
                except (EOFError, BrokenPipeError):
                    exit_code = handle.process.exitcode
                    raise ProcessPoolExhausted(
                        f"Subprocess died during `{function.__name__}`",
                        exit_code=exit_code,
                    )

                if status == "log":
                    forward_subprocess_log(payload)
                    continue

                handle.last_metadata = metadata

                if status == "success":
                    return payload
                if status == "error":
                    # The worker sent back the exception instance itself.
                    raise payload

            # Pipe is quiet — make sure the worker hasn't died silently.
            if not handle.process.is_alive():
                exit_code = handle.process.exitcode
                raise ProcessPoolExhausted(
                    f"Subprocess died during `{function.__name__}`",
                    exit_code=exit_code,
                )

        raise TimeoutError(f"`{function.__name__}` timed out after {timeout}s")

    def _exceeds_memory_limit(self, handle: WorkerHandle) -> bool:
        """Return ``True`` when *handle*'s RSS exceeds a configured limit.

        Refreshes :attr:`last_memory_rss` as a side effect. Returns
        ``False`` immediately when no memory limit is configured or the
        worker process can no longer be inspected.
        """
        if self._max_memory is None and self._max_memory_percent_bytes is None:
            return False
        try:
            rss = psutil.Process(handle.process.pid).memory_info().rss
        except (psutil.NoSuchProcess, psutil.AccessDenied, ProcessLookupError):
            return False
        self._last_memory_rss = rss
        if self._max_memory is not None and rss > self._max_memory:
            log.info(
                f"Worker pid={handle.process.pid} RSS {rss:,}B exceeds max_memory={self._max_memory:,}B, rotating"
            )
            return True
        if (
            self._max_memory_percent_bytes is not None
            and rss > self._max_memory_percent_bytes
        ):
            log.info(
                f"Worker pid={handle.process.pid} RSS {rss:,}B exceeds percent limit ({self._max_memory_percent_bytes:,}B), rotating"
            )
            return True
        return False

    def _rotate_worker(self) -> None:
        """Shut down the spent active worker and promote the spare.

        Called when :attr:`status` is :attr:`PoolStatus.NEEDS_ROTATION`.
        """
        if self._active is not None:
            self._shutdown_worker(self._active)
            self._active = None
        self._promote_spare()

    def _promote_spare(self) -> None:
        """Make the spare worker active and replenish the spare.

        If the spare has died or never became ready (e.g. deadlocked
        during warming), it is killed and a fresh worker is
        cold-started instead.
        """
        if self._spare is not None:
            if self._spare.process.is_alive():
                if not self._spare.ready:
                    self._wait_for_ready(self._spare)
                if self._spare.ready:
                    self._active = self._spare
                    self._spare = None
                else:
                    # Spare is alive but never became ready — kill it.
                    log.warning(
                        f"Spare pid={self._spare.process.pid} never became ready, killing"
                    )
                    self._kill_worker(self._spare)
                    self._spare = None
                    self._active = self._start_worker(block_ready=True)
            else:
                # Spare died — clean up and cold-start.
                self._close_worker(self._spare)
                self._spare = None
                self._active = self._start_worker(block_ready=True)
        else:
            self._active = self._start_worker(block_ready=True)

        # Replenish spare.
        try:
            self._spare = self._start_worker(block_ready=False)
        except Exception:
            log.warning("Failed to start spare worker", exc_info=True)
            self._spare = None

    def _shutdown_worker(self, handle: WorkerHandle) -> None:
        """Gracefully shut down *handle*; escalate to kill if needed.

        Sends the ``func is None`` sentinel, waits briefly for a clean
        exit, then falls back to :meth:`_kill_worker`.

        Parameters
        ----------
        handle
            The worker to shut down.
        """
        if handle.process.is_alive():
            try:
                handle.connection.send((None, (), {}))
            except (BrokenPipeError, OSError):
                pass
            handle.process.join(timeout=_JOIN_TIMEOUT)
            if handle.process.is_alive():
                self._kill_worker(handle)
                return
        self._close_worker(handle)

    def _kill_worker(self, handle: WorkerHandle) -> None:
        """SIGTERM then SIGKILL the worker and its entire process tree.

        Children are enumerated before terminating the worker so that a
        C extension's spawned subprocesses are cleaned up too.

        Parameters
        ----------
        handle
            The worker to kill.
        """
        if not handle.process.is_alive():
            self._close_worker(handle)
            return
        try:
            worker = psutil.Process(handle.process.pid)
            children = worker.children(recursive=True)
            for child in children:
                child.terminate()
            worker.terminate()
            gone, alive = psutil.wait_procs(children + [worker], timeout=0.1)
            for remaining in alive:
                remaining.kill()
            psutil.wait_procs(alive, timeout=_KILL_WAIT)
        except (psutil.NoSuchProcess, ProcessLookupError):
            pass
        except psutil.TimeoutExpired:
            # NOTE(review): psutil.wait_procs returns (gone, alive) rather
            # than raising TimeoutExpired, so this branch appears to be
            # defensive-only — confirm before relying on it.
            log.warning("Process tree still alive after SIGKILL")
        except Exception:
            log.error("Error killing process tree", exc_info=True)
        self._close_worker(handle)

    def _close_worker(self, handle: WorkerHandle) -> None:
        """Join the process and close both pipe endpoints.

        Parameters
        ----------
        handle
            The worker whose resources should be freed.
        """
        handle.process.join(timeout=_POLL_TIMEOUT)
        try:
            handle.connection.close()
        except Exception:
            pass
        try:
            handle.child_connection.close()
        except Exception:
            pass
File without changes