uhttp-workers 1.5.0__tar.gz → 1.7.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {uhttp_workers-1.5.0/uhttp_workers.egg-info → uhttp_workers-1.7.0}/PKG-INFO +59 -10
- {uhttp_workers-1.5.0 → uhttp_workers-1.7.0}/README.md +58 -9
- {uhttp_workers-1.5.0 → uhttp_workers-1.7.0}/tests/test_dispatcher.py +272 -2
- {uhttp_workers-1.5.0 → uhttp_workers-1.7.0}/tests/test_request_response.py +11 -0
- {uhttp_workers-1.5.0 → uhttp_workers-1.7.0}/tests/test_worker_pool.py +21 -0
- {uhttp_workers-1.5.0 → uhttp_workers-1.7.0}/uhttp/workers.py +92 -11
- {uhttp_workers-1.5.0 → uhttp_workers-1.7.0/uhttp_workers.egg-info}/PKG-INFO +59 -10
- {uhttp_workers-1.5.0 → uhttp_workers-1.7.0}/.github/workflows/publish.yml +0 -0
- {uhttp_workers-1.5.0 → uhttp_workers-1.7.0}/.github/workflows/tests.yml +0 -0
- {uhttp_workers-1.5.0 → uhttp_workers-1.7.0}/.gitignore +0 -0
- {uhttp_workers-1.5.0 → uhttp_workers-1.7.0}/examples/simple_workers.py +0 -0
- {uhttp_workers-1.5.0 → uhttp_workers-1.7.0}/examples/sse_workers.py +0 -0
- {uhttp_workers-1.5.0 → uhttp_workers-1.7.0}/examples/static/index.html +0 -0
- {uhttp_workers-1.5.0 → uhttp_workers-1.7.0}/pyproject.toml +0 -0
- {uhttp_workers-1.5.0 → uhttp_workers-1.7.0}/setup.cfg +0 -0
- {uhttp_workers-1.5.0 → uhttp_workers-1.7.0}/tests/__init__.py +0 -0
- {uhttp_workers-1.5.0 → uhttp_workers-1.7.0}/tests/test_api_handler.py +0 -0
- {uhttp_workers-1.5.0 → uhttp_workers-1.7.0}/tests/test_decorators.py +0 -0
- {uhttp_workers-1.5.0 → uhttp_workers-1.7.0}/tests/test_pattern_matching.py +0 -0
- {uhttp_workers-1.5.0 → uhttp_workers-1.7.0}/tests/test_worker.py +0 -0
- {uhttp_workers-1.5.0 → uhttp_workers-1.7.0}/uhttp_workers.egg-info/SOURCES.txt +0 -0
- {uhttp_workers-1.5.0 → uhttp_workers-1.7.0}/uhttp_workers.egg-info/dependency_links.txt +0 -0
- {uhttp_workers-1.5.0 → uhttp_workers-1.7.0}/uhttp_workers.egg-info/requires.txt +0 -0
- {uhttp_workers-1.5.0 → uhttp_workers-1.7.0}/uhttp_workers.egg-info/top_level.txt +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: uhttp-workers
|
|
3
|
-
Version: 1.5.0
|
|
3
|
+
Version: 1.7.0
|
|
4
4
|
Summary: Multi-process worker dispatcher built on uhttp-server
|
|
5
5
|
Author-email: Pavel Revak <pavelrevak@gmail.com>
|
|
6
6
|
License-Expression: MIT
|
|
@@ -277,15 +277,18 @@ Workers send heartbeats automatically via the shared response queue. When a work
|
|
|
277
277
|
```python
|
|
278
278
|
@_workers.api('/process/{id:int}', 'POST')
|
|
279
279
|
def process(self, request):
|
|
280
|
-
# request.request_id
|
|
281
|
-
# request.method
|
|
282
|
-
# request.path
|
|
283
|
-
# request.path_params
|
|
284
|
-
# request.query
|
|
285
|
-
# request.data
|
|
286
|
-
# request.headers
|
|
287
|
-
# request.cookies
|
|
288
|
-
# request.content_type
|
|
280
|
+
# request.request_id — internal ID for dispatcher pairing
|
|
281
|
+
# request.method — 'POST'
|
|
282
|
+
# request.path — '/process/42'
|
|
283
|
+
# request.path_params — {'id': 42}
|
|
284
|
+
# request.query — {'page': '1'} or None
|
|
285
|
+
# request.data — dict (JSON), bytes (binary), or None
|
|
286
|
+
# request.headers — dict
|
|
287
|
+
# request.cookies — dict (lazy-parsed from Cookie header)
|
|
288
|
+
# request.content_type — 'application/json'
|
|
289
|
+
# request.remote_address — 'host:port' string (honors X-Forwarded-For
|
|
290
|
+
# from trusted proxies; configured on the
|
|
291
|
+
# HTTP server side)
|
|
289
292
|
|
|
290
293
|
# return data (status 200)
|
|
291
294
|
return {'result': 'ok'}
|
|
@@ -565,6 +568,7 @@ Reason is one of:
|
|
|
565
568
|
| `PENDING_DISCONNECTED` | Client disconnected mid-stream; worker was notified via control queue (race possible). |
|
|
566
569
|
| `PENDING_STREAM_CLOSED` | Worker ended the SSE stream cleanly. |
|
|
567
570
|
| `PENDING_SHUTDOWN` | Dispatcher is shutting down; client got 503. |
|
|
571
|
+
| `PENDING_WORKER_DIED` | Worker process died/was killed while owning the request; client got 500. `on_worker_died()` runs first. |
|
|
568
572
|
|
|
569
573
|
The hook is invoked after the client-facing action (respond / disconnect / control queue put)
|
|
570
574
|
so dispatcher state is finalized when it runs. Exceptions raised by the hook are logged at
|
|
@@ -575,6 +579,51 @@ Override `on_pending_removed()` if you need exactly-once cleanup. Overriding bot
|
|
|
575
579
|
but discouraged — for the `PENDING_COMPLETED` reason, `on_response()` is called immediately
|
|
576
580
|
before `on_pending_removed()`.
|
|
577
581
|
|
|
582
|
+
## Worker Death Hook
|
|
583
|
+
|
|
584
|
+
Workers die — segfault in a C extension, OOM-kill, or the dispatcher kills them after
|
|
585
|
+
`stuck_timeout`. Override `on_worker_died()` to capture which requests they had in-flight
|
|
586
|
+
(useful for forensics when a malformed payload reproduces a crash):
|
|
587
|
+
|
|
588
|
+
```python
|
|
589
|
+
class MyDispatcher(_workers.Dispatcher):
|
|
590
|
+
def on_worker_died(
|
|
591
|
+
self, pool, worker_id, reason, exitcode, victims):
|
|
592
|
+
# `victims` is a list of (request_id, _PendingRequest) for all
|
|
593
|
+
# requests this worker had claimed but not completed.
|
|
594
|
+
# `exitcode` is None for stuck workers, otherwise the process exit
|
|
595
|
+
# code (negative = signal: -9 OOM, -11 SIGSEGV).
|
|
596
|
+
for rid, pending in victims:
|
|
597
|
+
c = pending.client
|
|
598
|
+
self._crash_queue.append({
|
|
599
|
+
'reason': reason,
|
|
600
|
+
'exitcode': exitcode,
|
|
601
|
+
'method': c.method,
|
|
602
|
+
'path': c.path,
|
|
603
|
+
'address': c.address,
|
|
604
|
+
'body': c.body, # raw bytes — replay this to reproduce
|
|
605
|
+
})
|
|
606
|
+
# Default impl responds 500 to victims and fires
|
|
607
|
+
# on_pending_removed(PENDING_WORKER_DIED). Call it after capture.
|
|
608
|
+
super().on_worker_died(
|
|
609
|
+
pool, worker_id, reason, exitcode, victims)
|
|
610
|
+
```
|
|
611
|
+
|
|
612
|
+
What's a victim: any request the worker had claimed via `MSG_HEARTBEAT`
|
|
613
|
+
(`pending.worker_id == worker_id`). Requests still in the queue (`worker_id is None`)
|
|
614
|
+
are **not** victims — other workers in the pool will pick them up after restart.
|
|
615
|
+
|
|
616
|
+
Default behavior (if you don't override) is to log the death + each victim, respond
|
|
617
|
+
500 (or close the stream for SSE/NDJSON), and fire `on_pending_removed` for each.
|
|
618
|
+
Override only if you want to persist payloads or customize the response status/body.
|
|
619
|
+
|
|
620
|
+
**500 vs 503:** a victim of a crashed worker gets **500** (processing started, then
|
|
621
|
+
the server failed). A new request arriving while the pool has zero alive workers
|
|
622
|
+
gets **503 + `Retry-After: 1`** (rejected before processing — try again shortly).
|
|
623
|
+
A request to a pool that has exceeded `max_restarts` in `restart_window` gets **503**
|
|
624
|
+
permanently (`pool.is_degraded`). `pool.alive_count` is exposed for monitoring and
|
|
625
|
+
also appears in `pool.status()`.
|
|
626
|
+
|
|
578
627
|
## Dispatcher Idle Hook
|
|
579
628
|
|
|
580
629
|
Override `on_idle()` on the dispatcher for periodic background tasks — called on each `select()` timeout (every `SELECT_TIMEOUT` seconds, default 1s):
|
|
@@ -263,15 +263,18 @@ Workers send heartbeats automatically via the shared response queue. When a work
|
|
|
263
263
|
```python
|
|
264
264
|
@_workers.api('/process/{id:int}', 'POST')
|
|
265
265
|
def process(self, request):
|
|
266
|
-
# request.request_id
|
|
267
|
-
# request.method
|
|
268
|
-
# request.path
|
|
269
|
-
# request.path_params
|
|
270
|
-
# request.query
|
|
271
|
-
# request.data
|
|
272
|
-
# request.headers
|
|
273
|
-
# request.cookies
|
|
274
|
-
# request.content_type
|
|
266
|
+
# request.request_id — internal ID for dispatcher pairing
|
|
267
|
+
# request.method — 'POST'
|
|
268
|
+
# request.path — '/process/42'
|
|
269
|
+
# request.path_params — {'id': 42}
|
|
270
|
+
# request.query — {'page': '1'} or None
|
|
271
|
+
# request.data — dict (JSON), bytes (binary), or None
|
|
272
|
+
# request.headers — dict
|
|
273
|
+
# request.cookies — dict (lazy-parsed from Cookie header)
|
|
274
|
+
# request.content_type — 'application/json'
|
|
275
|
+
# request.remote_address — 'host:port' string (honors X-Forwarded-For
|
|
276
|
+
# from trusted proxies; configured on the
|
|
277
|
+
# HTTP server side)
|
|
275
278
|
|
|
276
279
|
# return data (status 200)
|
|
277
280
|
return {'result': 'ok'}
|
|
@@ -551,6 +554,7 @@ Reason is one of:
|
|
|
551
554
|
| `PENDING_DISCONNECTED` | Client disconnected mid-stream; worker was notified via control queue (race possible). |
|
|
552
555
|
| `PENDING_STREAM_CLOSED` | Worker ended the SSE stream cleanly. |
|
|
553
556
|
| `PENDING_SHUTDOWN` | Dispatcher is shutting down; client got 503. |
|
|
557
|
+
| `PENDING_WORKER_DIED` | Worker process died/was killed while owning the request; client got 500. `on_worker_died()` runs first. |
|
|
554
558
|
|
|
555
559
|
The hook is invoked after the client-facing action (respond / disconnect / control queue put)
|
|
556
560
|
so dispatcher state is finalized when it runs. Exceptions raised by the hook are logged at
|
|
@@ -561,6 +565,51 @@ Override `on_pending_removed()` if you need exactly-once cleanup. Overriding bot
|
|
|
561
565
|
but discouraged — for the `PENDING_COMPLETED` reason, `on_response()` is called immediately
|
|
562
566
|
before `on_pending_removed()`.
|
|
563
567
|
|
|
568
|
+
## Worker Death Hook
|
|
569
|
+
|
|
570
|
+
Workers die — segfault in a C extension, OOM-kill, or the dispatcher kills them after
|
|
571
|
+
`stuck_timeout`. Override `on_worker_died()` to capture which requests they had in-flight
|
|
572
|
+
(useful for forensics when a malformed payload reproduces a crash):
|
|
573
|
+
|
|
574
|
+
```python
|
|
575
|
+
class MyDispatcher(_workers.Dispatcher):
|
|
576
|
+
def on_worker_died(
|
|
577
|
+
self, pool, worker_id, reason, exitcode, victims):
|
|
578
|
+
# `victims` is a list of (request_id, _PendingRequest) for all
|
|
579
|
+
# requests this worker had claimed but not completed.
|
|
580
|
+
# `exitcode` is None for stuck workers, otherwise the process exit
|
|
581
|
+
# code (negative = signal: -9 OOM, -11 SIGSEGV).
|
|
582
|
+
for rid, pending in victims:
|
|
583
|
+
c = pending.client
|
|
584
|
+
self._crash_queue.append({
|
|
585
|
+
'reason': reason,
|
|
586
|
+
'exitcode': exitcode,
|
|
587
|
+
'method': c.method,
|
|
588
|
+
'path': c.path,
|
|
589
|
+
'address': c.address,
|
|
590
|
+
'body': c.body, # raw bytes — replay this to reproduce
|
|
591
|
+
})
|
|
592
|
+
# Default impl responds 500 to victims and fires
|
|
593
|
+
# on_pending_removed(PENDING_WORKER_DIED). Call it after capture.
|
|
594
|
+
super().on_worker_died(
|
|
595
|
+
pool, worker_id, reason, exitcode, victims)
|
|
596
|
+
```
|
|
597
|
+
|
|
598
|
+
What's a victim: any request the worker had claimed via `MSG_HEARTBEAT`
|
|
599
|
+
(`pending.worker_id == worker_id`). Requests still in the queue (`worker_id is None`)
|
|
600
|
+
are **not** victims — other workers in the pool will pick them up after restart.
|
|
601
|
+
|
|
602
|
+
Default behavior (if you don't override) is to log the death + each victim, respond
|
|
603
|
+
500 (or close the stream for SSE/NDJSON), and fire `on_pending_removed` for each.
|
|
604
|
+
Override only if you want to persist payloads or customize the response status/body.
|
|
605
|
+
|
|
606
|
+
**500 vs 503:** a victim of a crashed worker gets **500** (processing started, then
|
|
607
|
+
the server failed). A new request arriving while the pool has zero alive workers
|
|
608
|
+
gets **503 + `Retry-After: 1`** (rejected before processing — try again shortly).
|
|
609
|
+
A request to a pool that has exceeded `max_restarts` in `restart_window` gets **503**
|
|
610
|
+
permanently (`pool.is_degraded`). `pool.alive_count` is exposed for monitoring and
|
|
611
|
+
also appears in `pool.status()`.
|
|
612
|
+
|
|
564
613
|
## Dispatcher Idle Hook
|
|
565
614
|
|
|
566
615
|
Override `on_idle()` on the dispatcher for periodic background tasks — called on each `select()` timeout (every `SELECT_TIMEOUT` seconds, default 1s):
|
|
@@ -13,7 +13,7 @@ from uhttp.workers import (
|
|
|
13
13
|
MSG_SSE_OPEN, MSG_SSE_EVENT, MSG_SSE_CLOSE, MSG_NDJSON,
|
|
14
14
|
CTL_DISCONNECT,
|
|
15
15
|
PENDING_COMPLETED, PENDING_TIMEOUT, PENDING_DISCONNECTED,
|
|
16
|
-
PENDING_STREAM_CLOSED, PENDING_SHUTDOWN,
|
|
16
|
+
PENDING_STREAM_CLOSED, PENDING_SHUTDOWN, PENDING_WORKER_DIED,
|
|
17
17
|
LOG_ERROR,
|
|
18
18
|
_PendingRequest,
|
|
19
19
|
)
|
|
@@ -29,13 +29,16 @@ class MockClient:
|
|
|
29
29
|
"""Mock HttpConnection for testing dispatcher logic."""
|
|
30
30
|
|
|
31
31
|
def __init__(self, method='GET', path='/', query=None, data=None,
|
|
32
|
-
headers=None, content_type=None):
|
|
32
|
+
headers=None, content_type=None, body=None,
|
|
33
|
+
remote_address='127.0.0.1:0'):
|
|
33
34
|
self.method = method
|
|
34
35
|
self.path = path
|
|
35
36
|
self.query = query
|
|
36
37
|
self.data = data
|
|
37
38
|
self.headers = headers or {}
|
|
38
39
|
self.content_type = content_type
|
|
40
|
+
self.body = body
|
|
41
|
+
self.remote_address = remote_address
|
|
39
42
|
self.responded = False
|
|
40
43
|
self.response_data = None
|
|
41
44
|
self.response_status = None
|
|
@@ -188,6 +191,8 @@ class TestDispatcherDoCheck(unittest.TestCase):
|
|
|
188
191
|
|
|
189
192
|
def test_pass_check(self):
|
|
190
193
|
pool = WorkerPool(DummyWorker, routes=['/api/**'])
|
|
194
|
+
# fake a live worker so alive_count > 0 without starting processes
|
|
195
|
+
pool.workers = [type('W', (), {'is_alive': lambda self: True})()]
|
|
191
196
|
d = Dispatcher.__new__(Dispatcher)
|
|
192
197
|
d._sync_routes = []
|
|
193
198
|
d._static_routes = {}
|
|
@@ -317,6 +322,68 @@ class TestDispatcherPoolRouting(unittest.TestCase):
|
|
|
317
322
|
self.assertTrue(client.responded)
|
|
318
323
|
self.assertEqual(client.response_status, 503)
|
|
319
324
|
|
|
325
|
+
def test_dispatch_no_alive_workers_returns_503(self):
|
|
326
|
+
"""Transient: pool has workers but none currently alive."""
|
|
327
|
+
# pool has dead workers (not degraded yet)
|
|
328
|
+
self.pool_default.workers = [
|
|
329
|
+
type('W', (), {'is_alive': lambda self: False})()]
|
|
330
|
+
d = Dispatcher.__new__(Dispatcher)
|
|
331
|
+
d._sync_routes = []
|
|
332
|
+
d._static_routes = {}
|
|
333
|
+
d._pools = [self.pool_default]
|
|
334
|
+
d._pending = {}
|
|
335
|
+
d._max_pending = 1000
|
|
336
|
+
d._next_request_id = 0
|
|
337
|
+
|
|
338
|
+
client = MockClient('GET', '/test')
|
|
339
|
+
d._dispatch_to_pool(client)
|
|
340
|
+
self.assertTrue(client.responded)
|
|
341
|
+
self.assertEqual(client.response_status, 503)
|
|
342
|
+
self.assertEqual(
|
|
343
|
+
client.response_data['error'], 'No workers available')
|
|
344
|
+
self.assertEqual(
|
|
345
|
+
client.response_headers.get('Retry-After'), '1')
|
|
346
|
+
# request was NOT enqueued
|
|
347
|
+
self.assertEqual(d._pending, {})
|
|
348
|
+
|
|
349
|
+
def test_dispatch_empty_workers_returns_503(self):
|
|
350
|
+
"""Pool was never started (empty workers list)."""
|
|
351
|
+
# self.pool_default.workers is [] from __init__
|
|
352
|
+
d = Dispatcher.__new__(Dispatcher)
|
|
353
|
+
d._sync_routes = []
|
|
354
|
+
d._static_routes = {}
|
|
355
|
+
d._pools = [self.pool_default]
|
|
356
|
+
d._pending = {}
|
|
357
|
+
d._max_pending = 1000
|
|
358
|
+
d._next_request_id = 0
|
|
359
|
+
|
|
360
|
+
client = MockClient('GET', '/test')
|
|
361
|
+
d._dispatch_to_pool(client)
|
|
362
|
+
self.assertTrue(client.responded)
|
|
363
|
+
self.assertEqual(client.response_status, 503)
|
|
364
|
+
self.assertEqual(
|
|
365
|
+
client.response_data['error'], 'No workers available')
|
|
366
|
+
|
|
367
|
+
def test_dispatch_forwards_remote_address(self):
|
|
368
|
+
"""Request enqueued to worker carries client.remote_address."""
|
|
369
|
+
# fake an alive worker so dispatch reaches enqueue
|
|
370
|
+
self.pool_default.workers = [
|
|
371
|
+
type('W', (), {'is_alive': lambda self: True})()]
|
|
372
|
+
d = Dispatcher.__new__(Dispatcher)
|
|
373
|
+
d._sync_routes = []
|
|
374
|
+
d._static_routes = {}
|
|
375
|
+
d._pools = [self.pool_default]
|
|
376
|
+
d._pending = {}
|
|
377
|
+
d._max_pending = 1000
|
|
378
|
+
d._next_request_id = 0
|
|
379
|
+
|
|
380
|
+
client = MockClient(
|
|
381
|
+
'GET', '/test', remote_address='198.51.100.7:33421')
|
|
382
|
+
d._dispatch_to_pool(client)
|
|
383
|
+
# request was enqueued
|
|
384
|
+
req = self.pool_default.request_queue.get(timeout=1)
|
|
385
|
+
self.assertEqual(req.remote_address, '198.51.100.7:33421')
|
|
386
|
+
|
|
320
387
|
|
|
321
388
|
class TestDispatcherProcessResponse(unittest.TestCase):
|
|
322
389
|
|
|
@@ -769,5 +836,208 @@ class TestDispatcherPendingRemoved(unittest.TestCase):
|
|
|
769
836
|
self.assertTrue(client.responded)
|
|
770
837
|
|
|
771
838
|
|
|
839
|
+
class TestDispatcherWorkerDied(unittest.TestCase):
|
|
840
|
+
"""Tests for on_worker_died hook and victim handling."""
|
|
841
|
+
|
|
842
|
+
def _make_dispatcher(self, dispatcher_cls=Dispatcher):
|
|
843
|
+
# queue_warning=0 disables the queue-size check (which would
|
|
844
|
+
# otherwise touch pool.pending_count → request_queue.qsize()).
|
|
845
|
+
pool = WorkerPool(
|
|
846
|
+
DummyWorker, routes=['/api/**'], queue_warning=0)
|
|
847
|
+
# Mock check_workers so we control what it returns without
|
|
848
|
+
# actually starting processes.
|
|
849
|
+
pool._fake_restarted = []
|
|
850
|
+
pool.check_workers = lambda: pool._fake_restarted
|
|
851
|
+
d = dispatcher_cls.__new__(dispatcher_cls)
|
|
852
|
+
d._sync_routes = []
|
|
853
|
+
d._static_routes = {}
|
|
854
|
+
d._pools = [pool]
|
|
855
|
+
d._pending = {}
|
|
856
|
+
d._max_pending = 1000
|
|
857
|
+
d._next_request_id = 0
|
|
858
|
+
d._response_queue = mp.Queue()
|
|
859
|
+
d._log_is_tty = False
|
|
860
|
+
d.log_calls = []
|
|
861
|
+
d.on_log = lambda name, level, msg: d.log_calls.append(
|
|
862
|
+
(name, level, msg))
|
|
863
|
+
d.recorded_removed = []
|
|
864
|
+
return d, pool
|
|
865
|
+
|
|
866
|
+
def test_single_victim_gets_500(self):
|
|
867
|
+
|
|
868
|
+
class RecordingDispatcher(Dispatcher):
|
|
869
|
+
def on_pending_removed(self, request_id, pending, reason):
|
|
870
|
+
self.recorded_removed.append((request_id, reason))
|
|
871
|
+
|
|
872
|
+
d, pool = self._make_dispatcher(RecordingDispatcher)
|
|
873
|
+
client = MockClient(
|
|
874
|
+
'POST', '/api/scan', body=b'\x00\x01bad',
|
|
875
|
+
remote_address='10.0.0.7:42')
|
|
876
|
+
pending = _PendingRequest(client, pool)
|
|
877
|
+
pending.worker_id = 0
|
|
878
|
+
d._pending[42] = pending
|
|
879
|
+
pool._fake_restarted = [(0, 'died exit=-11', -11)]
|
|
880
|
+
d._check_all_workers()
|
|
881
|
+
# client got 500
|
|
882
|
+
self.assertTrue(client.responded)
|
|
883
|
+
self.assertEqual(client.response_status, 500)
|
|
884
|
+
self.assertEqual(client.response_data['error'], 'Worker crashed')
|
|
885
|
+
self.assertIn('exit=-11', client.response_data['reason'])
|
|
886
|
+
# removed from pending + hook fired
|
|
887
|
+
self.assertNotIn(42, d._pending)
|
|
888
|
+
self.assertEqual(
|
|
889
|
+
d.recorded_removed, [(42, PENDING_WORKER_DIED)])
|
|
890
|
+
|
|
891
|
+
def test_multiple_victims_all_handled(self):
|
|
892
|
+
d, pool = self._make_dispatcher()
|
|
893
|
+
c1 = MockClient('GET', '/api/a', remote_address='1.1.1.1:42')
|
|
894
|
+
c2 = MockClient('GET', '/api/b', remote_address='2.2.2.2:42')
|
|
895
|
+
c3 = MockClient('GET', '/api/c', remote_address='3.3.3.3:42')
|
|
896
|
+
for rid, c in [(1, c1), (2, c2), (3, c3)]:
|
|
897
|
+
p = _PendingRequest(c, pool)
|
|
898
|
+
p.worker_id = 0
|
|
899
|
+
d._pending[rid] = p
|
|
900
|
+
pool._fake_restarted = [(0, 'stuck', None)]
|
|
901
|
+
d._check_all_workers()
|
|
902
|
+
for c in (c1, c2, c3):
|
|
903
|
+
self.assertTrue(c.responded)
|
|
904
|
+
self.assertEqual(c.response_status, 500)
|
|
905
|
+
self.assertEqual(d._pending, {})
|
|
906
|
+
|
|
907
|
+
def test_streaming_victim_gets_stream_end(self):
|
|
908
|
+
d, pool = self._make_dispatcher()
|
|
909
|
+
client = MockClient('GET', '/api/events')
|
|
910
|
+
pending = _PendingRequest(client, pool)
|
|
911
|
+
pending.worker_id = 0
|
|
912
|
+
pending.streaming = True
|
|
913
|
+
d._pending[1] = pending
|
|
914
|
+
pool._fake_restarted = [(0, 'died exit=-9', -9)]
|
|
915
|
+
d._check_all_workers()
|
|
916
|
+
# stream ended, NOT respond()
|
|
917
|
+
self.assertTrue(getattr(client, 'stream_ended', False))
|
|
918
|
+
self.assertFalse(client.responded)
|
|
919
|
+
self.assertNotIn(1, d._pending)
|
|
920
|
+
|
|
921
|
+
def test_queued_request_not_a_victim(self):
|
|
922
|
+
"""Request with worker_id=None is still in queue — not a victim."""
|
|
923
|
+
d, pool = self._make_dispatcher()
|
|
924
|
+
# request belonging to dying worker
|
|
925
|
+
in_flight = MockClient('GET', '/api/active')
|
|
926
|
+
p1 = _PendingRequest(in_flight, pool)
|
|
927
|
+
p1.worker_id = 0
|
|
928
|
+
d._pending[1] = p1
|
|
929
|
+
# request still in queue, no worker claimed it
|
|
930
|
+
queued = MockClient('GET', '/api/queued')
|
|
931
|
+
p2 = _PendingRequest(queued, pool)
|
|
932
|
+
# p2.worker_id stays None
|
|
933
|
+
d._pending[2] = p2
|
|
934
|
+
pool._fake_restarted = [(0, 'died exit=-11', -11)]
|
|
935
|
+
d._check_all_workers()
|
|
936
|
+
# in-flight responded
|
|
937
|
+
self.assertTrue(in_flight.responded)
|
|
938
|
+
self.assertNotIn(1, d._pending)
|
|
939
|
+
# queued untouched
|
|
940
|
+
self.assertFalse(queued.responded)
|
|
941
|
+
self.assertIn(2, d._pending)
|
|
942
|
+
|
|
943
|
+
def test_other_worker_not_affected(self):
|
|
944
|
+
"""Only victims of THIS worker are handled; other workers stay."""
|
|
945
|
+
d, pool = self._make_dispatcher()
|
|
946
|
+
c1 = MockClient('GET', '/api/a')
|
|
947
|
+
c2 = MockClient('GET', '/api/b')
|
|
948
|
+
p1 = _PendingRequest(c1, pool)
|
|
949
|
+
p1.worker_id = 0
|
|
950
|
+
p2 = _PendingRequest(c2, pool)
|
|
951
|
+
p2.worker_id = 1
|
|
952
|
+
d._pending[1] = p1
|
|
953
|
+
d._pending[2] = p2
|
|
954
|
+
pool._fake_restarted = [(0, 'died exit=-11', -11)]
|
|
955
|
+
d._check_all_workers()
|
|
956
|
+
self.assertTrue(c1.responded)
|
|
957
|
+
self.assertNotIn(1, d._pending)
|
|
958
|
+
self.assertFalse(c2.responded)
|
|
959
|
+
self.assertIn(2, d._pending)
|
|
960
|
+
|
|
961
|
+
def test_late_response_after_victim_cleanup_dropped(self):
|
|
962
|
+
"""MSG_RESPONSE from dead worker arriving after victim removal is dropped."""
|
|
963
|
+
d, pool = self._make_dispatcher()
|
|
964
|
+
client = MockClient('GET', '/api/test')
|
|
965
|
+
pending = _PendingRequest(client, pool)
|
|
966
|
+
pending.worker_id = 0
|
|
967
|
+
d._pending[1] = pending
|
|
968
|
+
pool._fake_restarted = [(0, 'died exit=-11', -11)]
|
|
969
|
+
d._check_all_workers()
|
|
970
|
+
# request already gone; client already got 500
|
|
971
|
+
self.assertEqual(client.response_status, 500)
|
|
972
|
+
# late response from before-death — must not break or double-respond
|
|
973
|
+
client.response_status = None
|
|
974
|
+
late = Response(request_id=1, data={'ok': True}, status=200)
|
|
975
|
+
d._process_response((MSG_RESPONSE, 1, late))
|
|
976
|
+
# silently dropped
|
|
977
|
+
self.assertIsNone(client.response_status)
|
|
978
|
+
|
|
979
|
+
def test_no_victims_just_logs(self):
|
|
980
|
+
"""Worker died while idle — restarted but no pending requests."""
|
|
981
|
+
d, pool = self._make_dispatcher()
|
|
982
|
+
pool._fake_restarted = [(0, 'died exit=0', 0)]
|
|
983
|
+
d._check_all_workers()
|
|
984
|
+
# no crash, no pending changes
|
|
985
|
+
self.assertEqual(d._pending, {})
|
|
986
|
+
# should have logged
|
|
987
|
+
error_logs = [
|
|
988
|
+
msg for _, level, msg in d.log_calls if level == LOG_ERROR]
|
|
989
|
+
self.assertEqual(len(error_logs), 1)
|
|
990
|
+
self.assertIn('victims=0', error_logs[0])
|
|
991
|
+
|
|
992
|
+
def test_override_can_persist_payload(self):
|
|
993
|
+
"""User override can capture victim payload before super() responds."""
|
|
994
|
+
captured = []
|
|
995
|
+
|
|
996
|
+
class ForensicDispatcher(Dispatcher):
|
|
997
|
+
def on_worker_died(
|
|
998
|
+
self, pool, worker_id, reason, exitcode, victims):
|
|
999
|
+
for rid, pending in victims:
|
|
1000
|
+
captured.append({
|
|
1001
|
+
'rid': rid,
|
|
1002
|
+
'remote_address': pending.client.remote_address,
|
|
1003
|
+
'body': pending.client.body,
|
|
1004
|
+
'reason': reason,
|
|
1005
|
+
'exitcode': exitcode})
|
|
1006
|
+
super().on_worker_died(
|
|
1007
|
+
pool, worker_id, reason, exitcode, victims)
|
|
1008
|
+
|
|
1009
|
+
d, pool = self._make_dispatcher(ForensicDispatcher)
|
|
1010
|
+
client = MockClient(
|
|
1011
|
+
'POST', '/api/process',
|
|
1012
|
+
body=b'\xff\xfecorrupted', remote_address='9.9.9.9:42')
|
|
1013
|
+
pending = _PendingRequest(client, pool)
|
|
1014
|
+
pending.worker_id = 0
|
|
1015
|
+
d._pending[7] = pending
|
|
1016
|
+
pool._fake_restarted = [(0, 'died exit=-11', -11)]
|
|
1017
|
+
d._check_all_workers()
|
|
1018
|
+
self.assertEqual(len(captured), 1)
|
|
1019
|
+
self.assertEqual(captured[0]['remote_address'], '9.9.9.9:42')
|
|
1020
|
+
self.assertEqual(captured[0]['body'], b'\xff\xfecorrupted')
|
|
1021
|
+
self.assertEqual(captured[0]['exitcode'], -11)
|
|
1022
|
+
# super() still ran
|
|
1023
|
+
self.assertEqual(client.response_status, 500)
|
|
1024
|
+
|
|
1025
|
+
def test_hook_exception_does_not_crash_dispatcher(self):
|
|
1026
|
+
|
|
1027
|
+
class BrokenDispatcher(Dispatcher):
|
|
1028
|
+
def on_worker_died(self, *args, **kwargs):
|
|
1029
|
+
raise RuntimeError('boom')
|
|
1030
|
+
|
|
1031
|
+
d, pool = self._make_dispatcher(BrokenDispatcher)
|
|
1032
|
+
pool._fake_restarted = [(0, 'died exit=-11', -11)]
|
|
1033
|
+
# must not propagate
|
|
1034
|
+
d._check_all_workers()
|
|
1035
|
+
error_logs = [
|
|
1036
|
+
msg for _, level, msg in d.log_calls if level == LOG_ERROR]
|
|
1037
|
+
# one error log about the hook failure
|
|
1038
|
+
self.assertTrue(any('on_worker_died' in m for m in error_logs))
|
|
1039
|
+
self.assertTrue(any('boom' in m for m in error_logs))
|
|
1040
|
+
|
|
1041
|
+
|
|
772
1042
|
if __name__ == '__main__':
|
|
773
1043
|
unittest.main()
|
|
@@ -18,6 +18,17 @@ class TestRequest(unittest.TestCase):
|
|
|
18
18
|
self.assertEqual(req.headers, {})
|
|
19
19
|
self.assertIsNone(req.content_type)
|
|
20
20
|
self.assertEqual(req.path_params, {})
|
|
21
|
+
self.assertIsNone(req.remote_address)
|
|
22
|
+
|
|
23
|
+
def test_remote_address(self):
|
|
24
|
+
req = Request(1, 'GET', '/', remote_address='10.0.0.1:54321')
|
|
25
|
+
self.assertEqual(req.remote_address, '10.0.0.1:54321')
|
|
26
|
+
|
|
27
|
+
def test_pickle_remote_address(self):
|
|
28
|
+
req = Request(
|
|
29
|
+
1, 'GET', '/', remote_address='2001:db8::1:54321')
|
|
30
|
+
restored = pickle.loads(pickle.dumps(req))
|
|
31
|
+
self.assertEqual(restored.remote_address, '2001:db8::1:54321')
|
|
21
32
|
|
|
22
33
|
def test_full_creation(self):
|
|
23
34
|
req = Request(
|
|
@@ -120,6 +120,27 @@ class TestWorkerPoolStatus(unittest.TestCase):
|
|
|
120
120
|
pool = WorkerPool(DummyWorker)
|
|
121
121
|
self.assertFalse(pool.is_degraded)
|
|
122
122
|
|
|
123
|
+
def test_alive_count_empty(self):
|
|
124
|
+
pool = WorkerPool(DummyWorker, num_workers=2)
|
|
125
|
+
# not started yet
|
|
126
|
+
self.assertEqual(pool.alive_count, 0)
|
|
127
|
+
|
|
128
|
+
def test_alive_count_running(self):
|
|
129
|
+
pool = WorkerPool(DummyWorker, num_workers=2)
|
|
130
|
+
response_queue = mp.Queue()
|
|
131
|
+
pool.start(response_queue)
|
|
132
|
+
time.sleep(0.2)
|
|
133
|
+
self.assertEqual(pool.alive_count, 2)
|
|
134
|
+
pool.shutdown(timeout=3)
|
|
135
|
+
|
|
136
|
+
def test_alive_count_in_status(self):
|
|
137
|
+
pool = WorkerPool(DummyWorker, num_workers=2)
|
|
138
|
+
response_queue = mp.Queue()
|
|
139
|
+
pool.start(response_queue)
|
|
140
|
+
time.sleep(0.2)
|
|
141
|
+
self.assertEqual(pool.status()['alive_count'], 2)
|
|
142
|
+
pool.shutdown(timeout=3)
|
|
143
|
+
|
|
123
144
|
|
|
124
145
|
class TestWorkerPoolCheckWorkers(unittest.TestCase):
|
|
125
146
|
|
|
@@ -36,6 +36,7 @@ PENDING_TIMEOUT = 'TIMEOUT'
|
|
|
36
36
|
PENDING_DISCONNECTED = 'DISCONNECTED'
|
|
37
37
|
PENDING_STREAM_CLOSED = 'STREAM_CLOSED'
|
|
38
38
|
PENDING_SHUTDOWN = 'SHUTDOWN'
|
|
39
|
+
PENDING_WORKER_DIED = 'WORKER_DIED'
|
|
39
40
|
|
|
40
41
|
# Sentinel for deferred response
|
|
41
42
|
DEFERRED = object()
|
|
@@ -214,16 +215,22 @@ class Request:
|
|
|
214
215
|
headers: Request headers dict.
|
|
215
216
|
content_type: Content-Type header value, or None.
|
|
216
217
|
path_params: Path parameters filled by worker router.
|
|
218
|
+
remote_address: Client address as "host:port" string. Honors
|
|
219
|
+
X-Forwarded-For when the connection comes from a trusted
|
|
220
|
+
proxy (uhttp-server's trusted_proxies setting). None if the
|
|
221
|
+
dispatcher could not resolve the address (e.g., in tests).
|
|
217
222
|
"""
|
|
218
223
|
|
|
219
224
|
__slots__ = (
|
|
220
225
|
'request_id', 'method', 'path', 'query',
|
|
221
226
|
'data', 'headers', 'content_type', 'path_params',
|
|
227
|
+
'remote_address',
|
|
222
228
|
'_cookies', '_response_queue')
|
|
223
229
|
|
|
224
230
|
def __init__(
|
|
225
231
|
self, request_id, method, path, query=None,
|
|
226
|
-
data=None, headers=None, content_type=None):
|
|
232
|
+
data=None, headers=None, content_type=None,
|
|
233
|
+
remote_address=None):
|
|
227
234
|
self.request_id = request_id
|
|
228
235
|
self.method = method
|
|
229
236
|
self.path = path
|
|
@@ -232,6 +239,7 @@ class Request:
|
|
|
232
239
|
self.headers = headers or {}
|
|
233
240
|
self.content_type = content_type
|
|
234
241
|
self.path_params = {}
|
|
242
|
+
self.remote_address = remote_address
|
|
235
243
|
self._cookies = None
|
|
236
244
|
self._response_queue = None
|
|
237
245
|
|
|
@@ -879,7 +887,10 @@ class WorkerPool:
|
|
|
879
887
|
"""Check worker health, restart dead or stuck workers.
|
|
880
888
|
|
|
881
889
|
Returns:
|
|
882
|
-
List of (worker_id, reason) tuples for restarted
|
|
890
|
+
List of (worker_id, reason, exitcode) tuples for restarted
|
|
891
|
+
workers. exitcode is None for stuck workers (dispatcher killed
|
|
892
|
+
them), otherwise the process exit code (negative = signal:
|
|
893
|
+
-9 OOM, -11 SIGSEGV, -15 SIGTERM, etc.).
|
|
883
894
|
"""
|
|
884
895
|
restarted = []
|
|
885
896
|
now = _time.time()
|
|
@@ -889,8 +900,10 @@ class WorkerPool:
|
|
|
889
900
|
if now - t < self.restart_window]
|
|
890
901
|
for i, worker in enumerate(self.workers):
|
|
891
902
|
reason = None
|
|
903
|
+
exitcode = None
|
|
892
904
|
if not worker.is_alive():
|
|
893
|
-
|
|
905
|
+
exitcode = worker.exitcode
|
|
906
|
+
reason = f"died exit={exitcode}"
|
|
894
907
|
elif now - self._last_seen.get(i, 0) > self.stuck_timeout:
|
|
895
908
|
reason = "stuck"
|
|
896
909
|
worker.kill()
|
|
@@ -904,7 +917,7 @@ class WorkerPool:
|
|
|
904
917
|
if len(self._restart_times) >= self.max_restarts:
|
|
905
918
|
self._degraded = True
|
|
906
919
|
self._start_worker(i)
|
|
907
|
-
restarted.append((i, reason))
|
|
920
|
+
restarted.append((i, reason, exitcode))
|
|
908
921
|
return restarted
|
|
909
922
|
|
|
910
923
|
def matches(self, path):
|
|
@@ -959,6 +972,11 @@ class WorkerPool:
|
|
|
959
972
|
def is_degraded(self):
|
|
960
973
|
return self._degraded
|
|
961
974
|
|
|
975
|
+
@property
|
|
976
|
+
def alive_count(self):
|
|
977
|
+
"""Number of worker processes currently alive."""
|
|
978
|
+
return sum(1 for w in self.workers if w.is_alive())
|
|
979
|
+
|
|
962
980
|
@property
|
|
963
981
|
def pending_count(self):
|
|
964
982
|
try:
|
|
@@ -976,6 +994,7 @@ class WorkerPool:
|
|
|
976
994
|
return {
|
|
977
995
|
'name': self.name,
|
|
978
996
|
'degraded': self._degraded,
|
|
997
|
+
'alive_count': self.alive_count,
|
|
979
998
|
'queue_size': self.pending_count,
|
|
980
999
|
'workers': [
|
|
981
1000
|
{
|
|
@@ -1127,6 +1146,9 @@ class Dispatcher:
|
|
|
1127
1146
|
notified via control queue (race possible).
|
|
1128
1147
|
PENDING_STREAM_CLOSED - worker ended the SSE stream cleanly.
|
|
1129
1148
|
PENDING_SHUTDOWN - dispatcher is shutting down; client got 503.
|
|
1149
|
+
PENDING_WORKER_DIED - worker process died/was killed while owning
|
|
1150
|
+
this request; client got 500. on_worker_died()
|
|
1151
|
+
runs first.
|
|
1130
1152
|
|
|
1131
1153
|
Args:
|
|
1132
1154
|
request_id: The request id being removed.
|
|
@@ -1201,6 +1223,11 @@ class Dispatcher:
|
|
|
1201
1223
|
client.respond(
|
|
1202
1224
|
{'error': 'Service unavailable'}, status=503)
|
|
1203
1225
|
return
|
|
1226
|
+
if pool.alive_count == 0:
|
|
1227
|
+
client.respond(
|
|
1228
|
+
{'error': 'No workers available'}, status=503,
|
|
1229
|
+
headers={'Retry-After': '1'})
|
|
1230
|
+
return
|
|
1204
1231
|
if len(self._pending) >= self._max_pending:
|
|
1205
1232
|
client.respond(
|
|
1206
1233
|
{'error': 'Too many requests'}, status=503)
|
|
@@ -1215,7 +1242,8 @@ class Dispatcher:
|
|
|
1215
1242
|
query=client.query,
|
|
1216
1243
|
data=client.data,
|
|
1217
1244
|
headers=dict(client.headers),
|
|
1218
|
-
content_type=client.content_type
|
|
1245
|
+
content_type=client.content_type,
|
|
1246
|
+
remote_address=client.remote_address))
|
|
1219
1247
|
|
|
1220
1248
|
def _http_request(self, client):
|
|
1221
1249
|
"""Process incoming HTTP request."""
|
|
@@ -1353,8 +1381,18 @@ class Dispatcher:
|
|
|
1353
1381
|
"""Check health of all worker pools and queue sizes."""
|
|
1354
1382
|
for pool in self._pools:
|
|
1355
1383
|
restarted = pool.check_workers()
|
|
1356
|
-
for worker_id, reason in restarted:
|
|
1357
|
-
|
|
1384
|
+
for worker_id, reason, exitcode in restarted:
|
|
1385
|
+
victims = [
|
|
1386
|
+
(rid, p) for rid, p in self._pending.items()
|
|
1387
|
+
if p.pool is pool and p.worker_id == worker_id]
|
|
1388
|
+
try:
|
|
1389
|
+
self.on_worker_died(
|
|
1390
|
+
pool, worker_id, reason, exitcode, victims)
|
|
1391
|
+
except Exception:
|
|
1392
|
+
self.on_log(
|
|
1393
|
+
pool.name, LOG_ERROR,
|
|
1394
|
+
f"on_worker_died() failed:\n"
|
|
1395
|
+
f"{_traceback.format_exc()}")
|
|
1358
1396
|
if pool.queue_warning:
|
|
1359
1397
|
qsize = pool.pending_count
|
|
1360
1398
|
if qsize >= pool.queue_warning:
|
|
@@ -1390,14 +1428,57 @@ class Dispatcher:
|
|
|
1390
1428
|
print(f"{prefix}{level_name:8s} {name:20s} {message}",
|
|
1391
1429
|
file=_sys.stderr)
|
|
1392
1430
|
|
|
1393
|
-
def
|
|
1394
|
-
"""Called when a worker
|
|
1431
|
+
def on_worker_died(self, pool, worker_id, reason, exitcode, victims):
|
|
1432
|
+
"""Called when a worker process died or was killed by the dispatcher.
|
|
1433
|
+
|
|
1434
|
+
Default behavior:
|
|
1435
|
+
1. Log restart reason + each victim (request id, client address,
|
|
1436
|
+
method, path, body size).
|
|
1437
|
+
2. Respond 500 to every victim's client (or response_stream_end()
|
|
1438
|
+
for streams), remove them from _pending, and fire
|
|
1439
|
+
on_pending_removed(PENDING_WORKER_DIED) for each.
|
|
1395
1440
|
|
|
1396
|
-
|
|
1441
|
+
Override to capture victim payloads (e.g., persist to disk for
|
|
1442
|
+
post-mortem) BEFORE calling super(). pending.client gives access
|
|
1443
|
+
to method, path, headers, body, address.
|
|
1444
|
+
|
|
1445
|
+
Args:
|
|
1446
|
+
pool: WorkerPool the worker belonged to.
|
|
1447
|
+
worker_id: Index of the restarted worker.
|
|
1448
|
+
reason: 'stuck' or 'died exit=N' (string from check_workers).
|
|
1449
|
+
exitcode: Process exit code (int) or None for stuck workers.
|
|
1450
|
+
Negative values are signals: -9 SIGKILL (e.g. OOM killer), -11 SIGSEGV, etc.
|
|
1451
|
+
victims: List of (request_id, _PendingRequest) tuples — requests
|
|
1452
|
+
this worker had claimed (via MSG_HEARTBEAT) but never
|
|
1453
|
+
completed. May be empty if worker died while idle.
|
|
1397
1454
|
"""
|
|
1398
1455
|
self.on_log(
|
|
1399
1456
|
f'{pool.name}[{worker_id}]', LOG_ERROR,
|
|
1400
|
-
f"worker restarted: {reason}"
|
|
1457
|
+
f"worker restarted: {reason}, "
|
|
1458
|
+
f"victims={len(victims)}")
|
|
1459
|
+
for request_id, pending in victims:
|
|
1460
|
+
c = pending.client
|
|
1461
|
+
body_len = len(c.body) if c.body is not None else 0
|
|
1462
|
+
self.on_log(
|
|
1463
|
+
pool.name, LOG_ERROR,
|
|
1464
|
+
f" victim rid={request_id} from={c.remote_address} "
|
|
1465
|
+
f"{c.method} {c.path} body={body_len}B")
|
|
1466
|
+
del self._pending[request_id]
|
|
1467
|
+
if pending.streaming:
|
|
1468
|
+
try:
|
|
1469
|
+
pending.client.response_stream_end()
|
|
1470
|
+
except Exception:
|
|
1471
|
+
pass
|
|
1472
|
+
else:
|
|
1473
|
+
try:
|
|
1474
|
+
pending.client.respond(
|
|
1475
|
+
{'error': 'Worker crashed',
|
|
1476
|
+
'reason': reason},
|
|
1477
|
+
status=500)
|
|
1478
|
+
except Exception:
|
|
1479
|
+
pass
|
|
1480
|
+
self._notify_pending_removed(
|
|
1481
|
+
request_id, pending, PENDING_WORKER_DIED)
|
|
1401
1482
|
|
|
1402
1483
|
def _sigterm(self, _signo, _stack_frame):
|
|
1403
1484
|
self._running = False
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: uhttp-workers
|
|
3
|
-
Version: 1.5.0
|
|
3
|
+
Version: 1.7.0
|
|
4
4
|
Summary: Multi-process worker dispatcher built on uhttp-server
|
|
5
5
|
Author-email: Pavel Revak <pavelrevak@gmail.com>
|
|
6
6
|
License-Expression: MIT
|
|
@@ -277,15 +277,18 @@ Workers send heartbeats automatically via the shared response queue. When a work
|
|
|
277
277
|
```python
|
|
278
278
|
@_workers.api('/process/{id:int}', 'POST')
|
|
279
279
|
def process(self, request):
|
|
280
|
-
# request.request_id
|
|
281
|
-
# request.method
|
|
282
|
-
# request.path
|
|
283
|
-
# request.path_params
|
|
284
|
-
# request.query
|
|
285
|
-
# request.data
|
|
286
|
-
# request.headers
|
|
287
|
-
# request.cookies
|
|
288
|
-
# request.content_type
|
|
280
|
+
# request.request_id — internal ID for dispatcher pairing
|
|
281
|
+
# request.method — 'POST'
|
|
282
|
+
# request.path — '/process/42'
|
|
283
|
+
# request.path_params — {'id': 42}
|
|
284
|
+
# request.query — {'page': '1'} or None
|
|
285
|
+
# request.data — dict (JSON), bytes (binary), or None
|
|
286
|
+
# request.headers — dict
|
|
287
|
+
# request.cookies — dict (lazy-parsed from Cookie header)
|
|
288
|
+
# request.content_type — 'application/json'
|
|
289
|
+
# request.remote_address — 'host:port' string (honors X-Forwarded-For
|
|
290
|
+
# from trusted proxies; configured on the
|
|
291
|
+
# HTTP server side)
|
|
289
292
|
|
|
290
293
|
# return data (status 200)
|
|
291
294
|
return {'result': 'ok'}
|
|
@@ -565,6 +568,7 @@ Reason is one of:
|
|
|
565
568
|
| `PENDING_DISCONNECTED` | Client disconnected mid-stream; worker was notified via control queue (race possible). |
|
|
566
569
|
| `PENDING_STREAM_CLOSED` | Worker ended the SSE stream cleanly. |
|
|
567
570
|
| `PENDING_SHUTDOWN` | Dispatcher is shutting down; client got 503. |
|
|
571
|
+
| `PENDING_WORKER_DIED` | Worker process died/was killed while owning the request; client got 500. `on_worker_died()` runs first. |
|
|
568
572
|
|
|
569
573
|
The hook is invoked after the client-facing action (respond / disconnect / control queue put)
|
|
570
574
|
so dispatcher state is finalized when it runs. Exceptions raised by the hook are logged at
|
|
@@ -575,6 +579,51 @@ Override `on_pending_removed()` if you need exactly-once cleanup. Overriding bot
|
|
|
575
579
|
but discouraged — for the `PENDING_COMPLETED` reason, `on_response()` is called immediately
|
|
576
580
|
before `on_pending_removed()`.
|
|
577
581
|
|
|
582
|
+
## Worker Death Hook
|
|
583
|
+
|
|
584
|
+
Workers die — segfault in a C extension, OOM-kill, or the dispatcher kills them after
|
|
585
|
+
`stuck_timeout`. Override `on_worker_died()` to capture which requests they had in-flight
|
|
586
|
+
(useful for forensics when a malformed payload reproduces a crash):
|
|
587
|
+
|
|
588
|
+
```python
|
|
589
|
+
class MyDispatcher(_workers.Dispatcher):
|
|
590
|
+
def on_worker_died(
|
|
591
|
+
self, pool, worker_id, reason, exitcode, victims):
|
|
592
|
+
# `victims` is a list of (request_id, _PendingRequest) for all
|
|
593
|
+
# requests this worker had claimed but not completed.
|
|
594
|
+
# `exitcode` is None for stuck workers, otherwise the process exit
|
|
595
|
+
# code (negative = signal: -9 SIGKILL (e.g. OOM killer), -11 SIGSEGV).
|
|
596
|
+
for rid, pending in victims:
|
|
597
|
+
c = pending.client
|
|
598
|
+
self._crash_queue.append({
|
|
599
|
+
'reason': reason,
|
|
600
|
+
'exitcode': exitcode,
|
|
601
|
+
'method': c.method,
|
|
602
|
+
'path': c.path,
|
|
603
|
+
'address': c.remote_address,
|
|
604
|
+
'body': c.body, # raw bytes — replay this to reproduce
|
|
605
|
+
})
|
|
606
|
+
# Default impl responds 500 to victims and fires
|
|
607
|
+
# on_pending_removed(PENDING_WORKER_DIED). Call it after capture.
|
|
608
|
+
super().on_worker_died(
|
|
609
|
+
pool, worker_id, reason, exitcode, victims)
|
|
610
|
+
```
|
|
611
|
+
|
|
612
|
+
What's a victim: any request the worker had claimed via `MSG_HEARTBEAT`
|
|
613
|
+
(`pending.worker_id == worker_id`). Requests still in the queue (`worker_id is None`)
|
|
614
|
+
are **not** victims — other workers in the pool will pick them up after restart.
|
|
615
|
+
|
|
616
|
+
Default behavior (if you don't override) is to log the death + each victim, respond
|
|
617
|
+
500 (or close the stream for SSE/NDJSON), and fire `on_pending_removed` for each.
|
|
618
|
+
Override only if you want to persist payloads or customize the response status/body.
|
|
619
|
+
|
|
620
|
+
**500 vs 503:** a victim of a crashed worker gets **500** (processing started, then
|
|
621
|
+
the server failed). A new request arriving while the pool has zero alive workers
|
|
622
|
+
gets **503 + `Retry-After: 1`** (rejected before processing — try again shortly).
|
|
623
|
+
A request to a pool that has exceeded `max_restarts` in `restart_window` gets **503**
|
|
624
|
+
permanently (`pool.is_degraded`). `pool.alive_count` is exposed for monitoring and
|
|
625
|
+
also appears in `pool.status()`.
|
|
626
|
+
|
|
578
627
|
## Dispatcher Idle Hook
|
|
579
628
|
|
|
580
629
|
Override `on_idle()` on the dispatcher for periodic background tasks — called on each `select()` timeout (every `SELECT_TIMEOUT` seconds, default 1s):
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|