uhttp-workers 1.5.0__tar.gz → 1.6.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {uhttp_workers-1.5.0/uhttp_workers.egg-info → uhttp_workers-1.6.0}/PKG-INFO +47 -1
- {uhttp_workers-1.5.0 → uhttp_workers-1.6.0}/README.md +46 -0
- {uhttp_workers-1.5.0 → uhttp_workers-1.6.0}/tests/test_dispatcher.py +251 -2
- {uhttp_workers-1.5.0 → uhttp_workers-1.6.0}/tests/test_worker_pool.py +21 -0
- {uhttp_workers-1.5.0 → uhttp_workers-1.6.0}/uhttp/workers.py +82 -9
- {uhttp_workers-1.5.0 → uhttp_workers-1.6.0/uhttp_workers.egg-info}/PKG-INFO +47 -1
- {uhttp_workers-1.5.0 → uhttp_workers-1.6.0}/.github/workflows/publish.yml +0 -0
- {uhttp_workers-1.5.0 → uhttp_workers-1.6.0}/.github/workflows/tests.yml +0 -0
- {uhttp_workers-1.5.0 → uhttp_workers-1.6.0}/.gitignore +0 -0
- {uhttp_workers-1.5.0 → uhttp_workers-1.6.0}/examples/simple_workers.py +0 -0
- {uhttp_workers-1.5.0 → uhttp_workers-1.6.0}/examples/sse_workers.py +0 -0
- {uhttp_workers-1.5.0 → uhttp_workers-1.6.0}/examples/static/index.html +0 -0
- {uhttp_workers-1.5.0 → uhttp_workers-1.6.0}/pyproject.toml +0 -0
- {uhttp_workers-1.5.0 → uhttp_workers-1.6.0}/setup.cfg +0 -0
- {uhttp_workers-1.5.0 → uhttp_workers-1.6.0}/tests/__init__.py +0 -0
- {uhttp_workers-1.5.0 → uhttp_workers-1.6.0}/tests/test_api_handler.py +0 -0
- {uhttp_workers-1.5.0 → uhttp_workers-1.6.0}/tests/test_decorators.py +0 -0
- {uhttp_workers-1.5.0 → uhttp_workers-1.6.0}/tests/test_pattern_matching.py +0 -0
- {uhttp_workers-1.5.0 → uhttp_workers-1.6.0}/tests/test_request_response.py +0 -0
- {uhttp_workers-1.5.0 → uhttp_workers-1.6.0}/tests/test_worker.py +0 -0
- {uhttp_workers-1.5.0 → uhttp_workers-1.6.0}/uhttp_workers.egg-info/SOURCES.txt +0 -0
- {uhttp_workers-1.5.0 → uhttp_workers-1.6.0}/uhttp_workers.egg-info/dependency_links.txt +0 -0
- {uhttp_workers-1.5.0 → uhttp_workers-1.6.0}/uhttp_workers.egg-info/requires.txt +0 -0
- {uhttp_workers-1.5.0 → uhttp_workers-1.6.0}/uhttp_workers.egg-info/top_level.txt +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: uhttp-workers
|
|
3
|
-
Version: 1.5.0
|
|
3
|
+
Version: 1.6.0
|
|
4
4
|
Summary: Multi-process worker dispatcher built on uhttp-server
|
|
5
5
|
Author-email: Pavel Revak <pavelrevak@gmail.com>
|
|
6
6
|
License-Expression: MIT
|
|
@@ -565,6 +565,7 @@ Reason is one of:
|
|
|
565
565
|
| `PENDING_DISCONNECTED` | Client disconnected mid-stream; worker was notified via control queue (race possible). |
|
|
566
566
|
| `PENDING_STREAM_CLOSED` | Worker ended the SSE stream cleanly. |
|
|
567
567
|
| `PENDING_SHUTDOWN` | Dispatcher is shutting down; client got 503. |
|
|
568
|
+
| `PENDING_WORKER_DIED` | Worker process died/was killed while owning the request; client got 500. `on_worker_died()` runs first. |
|
|
568
569
|
|
|
569
570
|
The hook is invoked after the client-facing action (respond / disconnect / control queue put)
|
|
570
571
|
so dispatcher state is finalized when it runs. Exceptions raised by the hook are logged at
|
|
@@ -575,6 +576,51 @@ Override `on_pending_removed()` if you need exactly-once cleanup. Overriding bot
|
|
|
575
576
|
but discouraged — for the `PENDING_COMPLETED` reason, `on_response()` is called immediately
|
|
576
577
|
before `on_pending_removed()`.
|
|
577
578
|
|
|
579
|
+
## Worker Death Hook
|
|
580
|
+
|
|
581
|
+
Workers die — segfault in a C extension, OOM-kill, or the dispatcher kills them after
|
|
582
|
+
`stuck_timeout`. Override `on_worker_died()` to capture which requests they had in-flight
|
|
583
|
+
(useful for forensics when a malformed payload reproduces a crash):
|
|
584
|
+
|
|
585
|
+
```python
|
|
586
|
+
class MyDispatcher(_workers.Dispatcher):
|
|
587
|
+
def on_worker_died(
|
|
588
|
+
self, pool, worker_id, reason, exitcode, victims):
|
|
589
|
+
# `victims` is a list of (request_id, _PendingRequest) for all
|
|
590
|
+
# requests this worker had claimed but not completed.
|
|
591
|
+
# `exitcode` is None for stuck workers, otherwise the process exit
|
|
592
|
+
# code (negative = signal: -9 SIGKILL, e.g. OOM-kill; -11 SIGSEGV).
|
|
593
|
+
for rid, pending in victims:
|
|
594
|
+
c = pending.client
|
|
595
|
+
self._crash_queue.append({
|
|
596
|
+
'reason': reason,
|
|
597
|
+
'exitcode': exitcode,
|
|
598
|
+
'method': c.method,
|
|
599
|
+
'path': c.path,
|
|
600
|
+
'address': c.address,
|
|
601
|
+
'body': c.body, # raw bytes — replay this to reproduce
|
|
602
|
+
})
|
|
603
|
+
# Default impl responds 500 to victims and fires
|
|
604
|
+
# on_pending_removed(PENDING_WORKER_DIED). Call it after capture.
|
|
605
|
+
super().on_worker_died(
|
|
606
|
+
pool, worker_id, reason, exitcode, victims)
|
|
607
|
+
```
|
|
608
|
+
|
|
609
|
+
What's a victim: any request the worker had claimed via `MSG_HEARTBEAT`
|
|
610
|
+
(`pending.worker_id == worker_id`). Requests still in the queue (`worker_id is None`)
|
|
611
|
+
are **not** victims — other workers in the pool will pick them up after restart.
|
|
612
|
+
|
|
613
|
+
Default behavior (if you don't override) is to log the death + each victim, respond
|
|
614
|
+
500 (or close the stream for SSE/NDJSON), and fire `on_pending_removed` for each.
|
|
615
|
+
Override only if you want to persist payloads or customize the response status/body.
|
|
616
|
+
|
|
617
|
+
**500 vs 503:** a victim of a crashed worker gets **500** (processing started, then
|
|
618
|
+
the server failed). A new request arriving while the pool has zero alive workers
|
|
619
|
+
gets **503 + `Retry-After: 1`** (rejected before processing — try again shortly).
|
|
620
|
+
A request to a pool that has exceeded `max_restarts` in `restart_window` gets **503**
|
|
621
|
+
permanently (`pool.is_degraded`). `pool.alive_count` is exposed for monitoring and
|
|
622
|
+
also appears in `pool.status()`.
|
|
623
|
+
|
|
578
624
|
## Dispatcher Idle Hook
|
|
579
625
|
|
|
580
626
|
Override `on_idle()` on the dispatcher for periodic background tasks — called on each `select()` timeout (every `SELECT_TIMEOUT` seconds, default 1s):
|
|
@@ -551,6 +551,7 @@ Reason is one of:
|
|
|
551
551
|
| `PENDING_DISCONNECTED` | Client disconnected mid-stream; worker was notified via control queue (race possible). |
|
|
552
552
|
| `PENDING_STREAM_CLOSED` | Worker ended the SSE stream cleanly. |
|
|
553
553
|
| `PENDING_SHUTDOWN` | Dispatcher is shutting down; client got 503. |
|
|
554
|
+
| `PENDING_WORKER_DIED` | Worker process died/was killed while owning the request; client got 500. `on_worker_died()` runs first. |
|
|
554
555
|
|
|
555
556
|
The hook is invoked after the client-facing action (respond / disconnect / control queue put)
|
|
556
557
|
so dispatcher state is finalized when it runs. Exceptions raised by the hook are logged at
|
|
@@ -561,6 +562,51 @@ Override `on_pending_removed()` if you need exactly-once cleanup. Overriding bot
|
|
|
561
562
|
but discouraged — for the `PENDING_COMPLETED` reason, `on_response()` is called immediately
|
|
562
563
|
before `on_pending_removed()`.
|
|
563
564
|
|
|
565
|
+
## Worker Death Hook
|
|
566
|
+
|
|
567
|
+
Workers die — segfault in a C extension, OOM-kill, or the dispatcher kills them after
|
|
568
|
+
`stuck_timeout`. Override `on_worker_died()` to capture which requests they had in-flight
|
|
569
|
+
(useful for forensics when a malformed payload reproduces a crash):
|
|
570
|
+
|
|
571
|
+
```python
|
|
572
|
+
class MyDispatcher(_workers.Dispatcher):
|
|
573
|
+
def on_worker_died(
|
|
574
|
+
self, pool, worker_id, reason, exitcode, victims):
|
|
575
|
+
# `victims` is a list of (request_id, _PendingRequest) for all
|
|
576
|
+
# requests this worker had claimed but not completed.
|
|
577
|
+
# `exitcode` is None for stuck workers, otherwise the process exit
|
|
578
|
+
# code (negative = signal: -9 SIGKILL, e.g. OOM-kill; -11 SIGSEGV).
|
|
579
|
+
for rid, pending in victims:
|
|
580
|
+
c = pending.client
|
|
581
|
+
self._crash_queue.append({
|
|
582
|
+
'reason': reason,
|
|
583
|
+
'exitcode': exitcode,
|
|
584
|
+
'method': c.method,
|
|
585
|
+
'path': c.path,
|
|
586
|
+
'address': c.address,
|
|
587
|
+
'body': c.body, # raw bytes — replay this to reproduce
|
|
588
|
+
})
|
|
589
|
+
# Default impl responds 500 to victims and fires
|
|
590
|
+
# on_pending_removed(PENDING_WORKER_DIED). Call it after capture.
|
|
591
|
+
super().on_worker_died(
|
|
592
|
+
pool, worker_id, reason, exitcode, victims)
|
|
593
|
+
```
|
|
594
|
+
|
|
595
|
+
What's a victim: any request the worker had claimed via `MSG_HEARTBEAT`
|
|
596
|
+
(`pending.worker_id == worker_id`). Requests still in the queue (`worker_id is None`)
|
|
597
|
+
are **not** victims — other workers in the pool will pick them up after restart.
|
|
598
|
+
|
|
599
|
+
Default behavior (if you don't override) is to log the death + each victim, respond
|
|
600
|
+
500 (or close the stream for SSE/NDJSON), and fire `on_pending_removed` for each.
|
|
601
|
+
Override only if you want to persist payloads or customize the response status/body.
|
|
602
|
+
|
|
603
|
+
**500 vs 503:** a victim of a crashed worker gets **500** (processing started, then
|
|
604
|
+
the server failed). A new request arriving while the pool has zero alive workers
|
|
605
|
+
gets **503 + `Retry-After: 1`** (rejected before processing — try again shortly).
|
|
606
|
+
A request to a pool that has exceeded `max_restarts` in `restart_window` gets **503**
|
|
607
|
+
permanently (`pool.is_degraded`). `pool.alive_count` is exposed for monitoring and
|
|
608
|
+
also appears in `pool.status()`.
|
|
609
|
+
|
|
564
610
|
## Dispatcher Idle Hook
|
|
565
611
|
|
|
566
612
|
Override `on_idle()` on the dispatcher for periodic background tasks — called on each `select()` timeout (every `SELECT_TIMEOUT` seconds, default 1s):
|
|
@@ -13,7 +13,7 @@ from uhttp.workers import (
|
|
|
13
13
|
MSG_SSE_OPEN, MSG_SSE_EVENT, MSG_SSE_CLOSE, MSG_NDJSON,
|
|
14
14
|
CTL_DISCONNECT,
|
|
15
15
|
PENDING_COMPLETED, PENDING_TIMEOUT, PENDING_DISCONNECTED,
|
|
16
|
-
PENDING_STREAM_CLOSED, PENDING_SHUTDOWN,
|
|
16
|
+
PENDING_STREAM_CLOSED, PENDING_SHUTDOWN, PENDING_WORKER_DIED,
|
|
17
17
|
LOG_ERROR,
|
|
18
18
|
_PendingRequest,
|
|
19
19
|
)
|
|
@@ -29,13 +29,16 @@ class MockClient:
|
|
|
29
29
|
"""Mock HttpConnection for testing dispatcher logic."""
|
|
30
30
|
|
|
31
31
|
def __init__(self, method='GET', path='/', query=None, data=None,
|
|
32
|
-
headers=None, content_type=None
|
|
32
|
+
headers=None, content_type=None, body=None,
|
|
33
|
+
address='127.0.0.1'):
|
|
33
34
|
self.method = method
|
|
34
35
|
self.path = path
|
|
35
36
|
self.query = query
|
|
36
37
|
self.data = data
|
|
37
38
|
self.headers = headers or {}
|
|
38
39
|
self.content_type = content_type
|
|
40
|
+
self.body = body
|
|
41
|
+
self.address = address
|
|
39
42
|
self.responded = False
|
|
40
43
|
self.response_data = None
|
|
41
44
|
self.response_status = None
|
|
@@ -188,6 +191,8 @@ class TestDispatcherDoCheck(unittest.TestCase):
|
|
|
188
191
|
|
|
189
192
|
def test_pass_check(self):
|
|
190
193
|
pool = WorkerPool(DummyWorker, routes=['/api/**'])
|
|
194
|
+
# fake a live worker so alive_count > 0 without starting processes
|
|
195
|
+
pool.workers = [type('W', (), {'is_alive': lambda self: True})()]
|
|
191
196
|
d = Dispatcher.__new__(Dispatcher)
|
|
192
197
|
d._sync_routes = []
|
|
193
198
|
d._static_routes = {}
|
|
@@ -317,6 +322,48 @@ class TestDispatcherPoolRouting(unittest.TestCase):
|
|
|
317
322
|
self.assertTrue(client.responded)
|
|
318
323
|
self.assertEqual(client.response_status, 503)
|
|
319
324
|
|
|
325
|
+
def test_dispatch_no_alive_workers_returns_503(self):
|
|
326
|
+
"""Transient: pool has workers but none currently alive."""
|
|
327
|
+
# pool has dead workers (not degraded yet)
|
|
328
|
+
self.pool_default.workers = [
|
|
329
|
+
type('W', (), {'is_alive': lambda self: False})()]
|
|
330
|
+
d = Dispatcher.__new__(Dispatcher)
|
|
331
|
+
d._sync_routes = []
|
|
332
|
+
d._static_routes = {}
|
|
333
|
+
d._pools = [self.pool_default]
|
|
334
|
+
d._pending = {}
|
|
335
|
+
d._max_pending = 1000
|
|
336
|
+
d._next_request_id = 0
|
|
337
|
+
|
|
338
|
+
client = MockClient('GET', '/test')
|
|
339
|
+
d._dispatch_to_pool(client)
|
|
340
|
+
self.assertTrue(client.responded)
|
|
341
|
+
self.assertEqual(client.response_status, 503)
|
|
342
|
+
self.assertEqual(
|
|
343
|
+
client.response_data['error'], 'No workers available')
|
|
344
|
+
self.assertEqual(
|
|
345
|
+
client.response_headers.get('Retry-After'), '1')
|
|
346
|
+
# request was NOT enqueued
|
|
347
|
+
self.assertEqual(d._pending, {})
|
|
348
|
+
|
|
349
|
+
def test_dispatch_empty_workers_returns_503(self):
|
|
350
|
+
"""Pool was never started (empty workers list)."""
|
|
351
|
+
# self.pool_default.workers is [] from __init__
|
|
352
|
+
d = Dispatcher.__new__(Dispatcher)
|
|
353
|
+
d._sync_routes = []
|
|
354
|
+
d._static_routes = {}
|
|
355
|
+
d._pools = [self.pool_default]
|
|
356
|
+
d._pending = {}
|
|
357
|
+
d._max_pending = 1000
|
|
358
|
+
d._next_request_id = 0
|
|
359
|
+
|
|
360
|
+
client = MockClient('GET', '/test')
|
|
361
|
+
d._dispatch_to_pool(client)
|
|
362
|
+
self.assertTrue(client.responded)
|
|
363
|
+
self.assertEqual(client.response_status, 503)
|
|
364
|
+
self.assertEqual(
|
|
365
|
+
client.response_data['error'], 'No workers available')
|
|
366
|
+
|
|
320
367
|
|
|
321
368
|
class TestDispatcherProcessResponse(unittest.TestCase):
|
|
322
369
|
|
|
@@ -769,5 +816,207 @@ class TestDispatcherPendingRemoved(unittest.TestCase):
|
|
|
769
816
|
self.assertTrue(client.responded)
|
|
770
817
|
|
|
771
818
|
|
|
819
|
+
class TestDispatcherWorkerDied(unittest.TestCase):
|
|
820
|
+
"""Tests for on_worker_died hook and victim handling."""
|
|
821
|
+
|
|
822
|
+
def _make_dispatcher(self, dispatcher_cls=Dispatcher):
|
|
823
|
+
# queue_warning=0 disables the queue-size check (which would
|
|
824
|
+
# otherwise touch pool.pending_count → request_queue.qsize()).
|
|
825
|
+
pool = WorkerPool(
|
|
826
|
+
DummyWorker, routes=['/api/**'], queue_warning=0)
|
|
827
|
+
# Mock check_workers so we control what it returns without
|
|
828
|
+
# actually starting processes.
|
|
829
|
+
pool._fake_restarted = []
|
|
830
|
+
pool.check_workers = lambda: pool._fake_restarted
|
|
831
|
+
d = dispatcher_cls.__new__(dispatcher_cls)
|
|
832
|
+
d._sync_routes = []
|
|
833
|
+
d._static_routes = {}
|
|
834
|
+
d._pools = [pool]
|
|
835
|
+
d._pending = {}
|
|
836
|
+
d._max_pending = 1000
|
|
837
|
+
d._next_request_id = 0
|
|
838
|
+
d._response_queue = mp.Queue()
|
|
839
|
+
d._log_is_tty = False
|
|
840
|
+
d.log_calls = []
|
|
841
|
+
d.on_log = lambda name, level, msg: d.log_calls.append(
|
|
842
|
+
(name, level, msg))
|
|
843
|
+
d.recorded_removed = []
|
|
844
|
+
return d, pool
|
|
845
|
+
|
|
846
|
+
def test_single_victim_gets_500(self):
|
|
847
|
+
|
|
848
|
+
class RecordingDispatcher(Dispatcher):
|
|
849
|
+
def on_pending_removed(self, request_id, pending, reason):
|
|
850
|
+
self.recorded_removed.append((request_id, reason))
|
|
851
|
+
|
|
852
|
+
d, pool = self._make_dispatcher(RecordingDispatcher)
|
|
853
|
+
client = MockClient(
|
|
854
|
+
'POST', '/api/scan', body=b'\x00\x01bad', address='10.0.0.7')
|
|
855
|
+
pending = _PendingRequest(client, pool)
|
|
856
|
+
pending.worker_id = 0
|
|
857
|
+
d._pending[42] = pending
|
|
858
|
+
pool._fake_restarted = [(0, 'died exit=-11', -11)]
|
|
859
|
+
d._check_all_workers()
|
|
860
|
+
# client got 500
|
|
861
|
+
self.assertTrue(client.responded)
|
|
862
|
+
self.assertEqual(client.response_status, 500)
|
|
863
|
+
self.assertEqual(client.response_data['error'], 'Worker crashed')
|
|
864
|
+
self.assertIn('exit=-11', client.response_data['reason'])
|
|
865
|
+
# removed from pending + hook fired
|
|
866
|
+
self.assertNotIn(42, d._pending)
|
|
867
|
+
self.assertEqual(
|
|
868
|
+
d.recorded_removed, [(42, PENDING_WORKER_DIED)])
|
|
869
|
+
|
|
870
|
+
def test_multiple_victims_all_handled(self):
|
|
871
|
+
d, pool = self._make_dispatcher()
|
|
872
|
+
c1 = MockClient('GET', '/api/a', address='1.1.1.1')
|
|
873
|
+
c2 = MockClient('GET', '/api/b', address='2.2.2.2')
|
|
874
|
+
c3 = MockClient('GET', '/api/c', address='3.3.3.3')
|
|
875
|
+
for rid, c in [(1, c1), (2, c2), (3, c3)]:
|
|
876
|
+
p = _PendingRequest(c, pool)
|
|
877
|
+
p.worker_id = 0
|
|
878
|
+
d._pending[rid] = p
|
|
879
|
+
pool._fake_restarted = [(0, 'stuck', None)]
|
|
880
|
+
d._check_all_workers()
|
|
881
|
+
for c in (c1, c2, c3):
|
|
882
|
+
self.assertTrue(c.responded)
|
|
883
|
+
self.assertEqual(c.response_status, 500)
|
|
884
|
+
self.assertEqual(d._pending, {})
|
|
885
|
+
|
|
886
|
+
def test_streaming_victim_gets_stream_end(self):
|
|
887
|
+
d, pool = self._make_dispatcher()
|
|
888
|
+
client = MockClient('GET', '/api/events')
|
|
889
|
+
pending = _PendingRequest(client, pool)
|
|
890
|
+
pending.worker_id = 0
|
|
891
|
+
pending.streaming = True
|
|
892
|
+
d._pending[1] = pending
|
|
893
|
+
pool._fake_restarted = [(0, 'died exit=-9', -9)]
|
|
894
|
+
d._check_all_workers()
|
|
895
|
+
# stream ended, NOT respond()
|
|
896
|
+
self.assertTrue(getattr(client, 'stream_ended', False))
|
|
897
|
+
self.assertFalse(client.responded)
|
|
898
|
+
self.assertNotIn(1, d._pending)
|
|
899
|
+
|
|
900
|
+
def test_queued_request_not_a_victim(self):
|
|
901
|
+
"""Request with worker_id=None is still in queue — not a victim."""
|
|
902
|
+
d, pool = self._make_dispatcher()
|
|
903
|
+
# request belonging to dying worker
|
|
904
|
+
in_flight = MockClient('GET', '/api/active')
|
|
905
|
+
p1 = _PendingRequest(in_flight, pool)
|
|
906
|
+
p1.worker_id = 0
|
|
907
|
+
d._pending[1] = p1
|
|
908
|
+
# request still in queue, no worker claimed it
|
|
909
|
+
queued = MockClient('GET', '/api/queued')
|
|
910
|
+
p2 = _PendingRequest(queued, pool)
|
|
911
|
+
# p2.worker_id stays None
|
|
912
|
+
d._pending[2] = p2
|
|
913
|
+
pool._fake_restarted = [(0, 'died exit=-11', -11)]
|
|
914
|
+
d._check_all_workers()
|
|
915
|
+
# in-flight responded
|
|
916
|
+
self.assertTrue(in_flight.responded)
|
|
917
|
+
self.assertNotIn(1, d._pending)
|
|
918
|
+
# queued untouched
|
|
919
|
+
self.assertFalse(queued.responded)
|
|
920
|
+
self.assertIn(2, d._pending)
|
|
921
|
+
|
|
922
|
+
def test_other_worker_not_affected(self):
|
|
923
|
+
"""Only victims of THIS worker are handled; other workers stay."""
|
|
924
|
+
d, pool = self._make_dispatcher()
|
|
925
|
+
c1 = MockClient('GET', '/api/a')
|
|
926
|
+
c2 = MockClient('GET', '/api/b')
|
|
927
|
+
p1 = _PendingRequest(c1, pool)
|
|
928
|
+
p1.worker_id = 0
|
|
929
|
+
p2 = _PendingRequest(c2, pool)
|
|
930
|
+
p2.worker_id = 1
|
|
931
|
+
d._pending[1] = p1
|
|
932
|
+
d._pending[2] = p2
|
|
933
|
+
pool._fake_restarted = [(0, 'died exit=-11', -11)]
|
|
934
|
+
d._check_all_workers()
|
|
935
|
+
self.assertTrue(c1.responded)
|
|
936
|
+
self.assertNotIn(1, d._pending)
|
|
937
|
+
self.assertFalse(c2.responded)
|
|
938
|
+
self.assertIn(2, d._pending)
|
|
939
|
+
|
|
940
|
+
def test_late_response_after_victim_cleanup_dropped(self):
|
|
941
|
+
"""MSG_RESPONSE from dead worker arriving after victim removal is dropped."""
|
|
942
|
+
d, pool = self._make_dispatcher()
|
|
943
|
+
client = MockClient('GET', '/api/test')
|
|
944
|
+
pending = _PendingRequest(client, pool)
|
|
945
|
+
pending.worker_id = 0
|
|
946
|
+
d._pending[1] = pending
|
|
947
|
+
pool._fake_restarted = [(0, 'died exit=-11', -11)]
|
|
948
|
+
d._check_all_workers()
|
|
949
|
+
# request already gone; client already got 500
|
|
950
|
+
self.assertEqual(client.response_status, 500)
|
|
951
|
+
# late response from before-death — must not break or double-respond
|
|
952
|
+
client.response_status = None
|
|
953
|
+
late = Response(request_id=1, data={'ok': True}, status=200)
|
|
954
|
+
d._process_response((MSG_RESPONSE, 1, late))
|
|
955
|
+
# silently dropped
|
|
956
|
+
self.assertIsNone(client.response_status)
|
|
957
|
+
|
|
958
|
+
def test_no_victims_just_logs(self):
|
|
959
|
+
"""Worker died while idle — restarted but no pending requests."""
|
|
960
|
+
d, pool = self._make_dispatcher()
|
|
961
|
+
pool._fake_restarted = [(0, 'died exit=0', 0)]
|
|
962
|
+
d._check_all_workers()
|
|
963
|
+
# no crash, no pending changes
|
|
964
|
+
self.assertEqual(d._pending, {})
|
|
965
|
+
# should have logged
|
|
966
|
+
error_logs = [
|
|
967
|
+
msg for _, level, msg in d.log_calls if level == LOG_ERROR]
|
|
968
|
+
self.assertEqual(len(error_logs), 1)
|
|
969
|
+
self.assertIn('victims=0', error_logs[0])
|
|
970
|
+
|
|
971
|
+
def test_override_can_persist_payload(self):
|
|
972
|
+
"""User override can capture victim payload before super() responds."""
|
|
973
|
+
captured = []
|
|
974
|
+
|
|
975
|
+
class ForensicDispatcher(Dispatcher):
|
|
976
|
+
def on_worker_died(
|
|
977
|
+
self, pool, worker_id, reason, exitcode, victims):
|
|
978
|
+
for rid, pending in victims:
|
|
979
|
+
captured.append({
|
|
980
|
+
'rid': rid,
|
|
981
|
+
'address': pending.client.address,
|
|
982
|
+
'body': pending.client.body,
|
|
983
|
+
'reason': reason,
|
|
984
|
+
'exitcode': exitcode})
|
|
985
|
+
super().on_worker_died(
|
|
986
|
+
pool, worker_id, reason, exitcode, victims)
|
|
987
|
+
|
|
988
|
+
d, pool = self._make_dispatcher(ForensicDispatcher)
|
|
989
|
+
client = MockClient(
|
|
990
|
+
'POST', '/api/process',
|
|
991
|
+
body=b'\xff\xfecorrupted', address='9.9.9.9')
|
|
992
|
+
pending = _PendingRequest(client, pool)
|
|
993
|
+
pending.worker_id = 0
|
|
994
|
+
d._pending[7] = pending
|
|
995
|
+
pool._fake_restarted = [(0, 'died exit=-11', -11)]
|
|
996
|
+
d._check_all_workers()
|
|
997
|
+
self.assertEqual(len(captured), 1)
|
|
998
|
+
self.assertEqual(captured[0]['address'], '9.9.9.9')
|
|
999
|
+
self.assertEqual(captured[0]['body'], b'\xff\xfecorrupted')
|
|
1000
|
+
self.assertEqual(captured[0]['exitcode'], -11)
|
|
1001
|
+
# super() still ran
|
|
1002
|
+
self.assertEqual(client.response_status, 500)
|
|
1003
|
+
|
|
1004
|
+
def test_hook_exception_does_not_crash_dispatcher(self):
|
|
1005
|
+
|
|
1006
|
+
class BrokenDispatcher(Dispatcher):
|
|
1007
|
+
def on_worker_died(self, *args, **kwargs):
|
|
1008
|
+
raise RuntimeError('boom')
|
|
1009
|
+
|
|
1010
|
+
d, pool = self._make_dispatcher(BrokenDispatcher)
|
|
1011
|
+
pool._fake_restarted = [(0, 'died exit=-11', -11)]
|
|
1012
|
+
# must not propagate
|
|
1013
|
+
d._check_all_workers()
|
|
1014
|
+
error_logs = [
|
|
1015
|
+
msg for _, level, msg in d.log_calls if level == LOG_ERROR]
|
|
1016
|
+
# one error log about the hook failure
|
|
1017
|
+
self.assertTrue(any('on_worker_died' in m for m in error_logs))
|
|
1018
|
+
self.assertTrue(any('boom' in m for m in error_logs))
|
|
1019
|
+
|
|
1020
|
+
|
|
772
1021
|
if __name__ == '__main__':
|
|
773
1022
|
unittest.main()
|
|
@@ -120,6 +120,27 @@ class TestWorkerPoolStatus(unittest.TestCase):
|
|
|
120
120
|
pool = WorkerPool(DummyWorker)
|
|
121
121
|
self.assertFalse(pool.is_degraded)
|
|
122
122
|
|
|
123
|
+
def test_alive_count_empty(self):
|
|
124
|
+
pool = WorkerPool(DummyWorker, num_workers=2)
|
|
125
|
+
# not started yet
|
|
126
|
+
self.assertEqual(pool.alive_count, 0)
|
|
127
|
+
|
|
128
|
+
def test_alive_count_running(self):
|
|
129
|
+
pool = WorkerPool(DummyWorker, num_workers=2)
|
|
130
|
+
response_queue = mp.Queue()
|
|
131
|
+
pool.start(response_queue)
|
|
132
|
+
time.sleep(0.2)
|
|
133
|
+
self.assertEqual(pool.alive_count, 2)
|
|
134
|
+
pool.shutdown(timeout=3)
|
|
135
|
+
|
|
136
|
+
def test_alive_count_in_status(self):
|
|
137
|
+
pool = WorkerPool(DummyWorker, num_workers=2)
|
|
138
|
+
response_queue = mp.Queue()
|
|
139
|
+
pool.start(response_queue)
|
|
140
|
+
time.sleep(0.2)
|
|
141
|
+
self.assertEqual(pool.status()['alive_count'], 2)
|
|
142
|
+
pool.shutdown(timeout=3)
|
|
143
|
+
|
|
123
144
|
|
|
124
145
|
class TestWorkerPoolCheckWorkers(unittest.TestCase):
|
|
125
146
|
|
|
@@ -36,6 +36,7 @@ PENDING_TIMEOUT = 'TIMEOUT'
|
|
|
36
36
|
PENDING_DISCONNECTED = 'DISCONNECTED'
|
|
37
37
|
PENDING_STREAM_CLOSED = 'STREAM_CLOSED'
|
|
38
38
|
PENDING_SHUTDOWN = 'SHUTDOWN'
|
|
39
|
+
PENDING_WORKER_DIED = 'WORKER_DIED'
|
|
39
40
|
|
|
40
41
|
# Sentinel for deferred response
|
|
41
42
|
DEFERRED = object()
|
|
@@ -879,7 +880,10 @@ class WorkerPool:
|
|
|
879
880
|
"""Check worker health, restart dead or stuck workers.
|
|
880
881
|
|
|
881
882
|
Returns:
|
|
882
|
-
List of (worker_id, reason) tuples for restarted
|
|
883
|
+
List of (worker_id, reason, exitcode) tuples for restarted
|
|
884
|
+
workers. exitcode is None for stuck workers (dispatcher killed
|
|
885
|
+
them), otherwise the process exit code (negative = signal:
|
|
886
|
+
-9 SIGKILL (e.g. OOM-kill), -11 SIGSEGV, -15 SIGTERM, etc.).
|
|
883
887
|
"""
|
|
884
888
|
restarted = []
|
|
885
889
|
now = _time.time()
|
|
@@ -889,8 +893,10 @@ class WorkerPool:
|
|
|
889
893
|
if now - t < self.restart_window]
|
|
890
894
|
for i, worker in enumerate(self.workers):
|
|
891
895
|
reason = None
|
|
896
|
+
exitcode = None
|
|
892
897
|
if not worker.is_alive():
|
|
893
|
-
|
|
898
|
+
exitcode = worker.exitcode
|
|
899
|
+
reason = f"died exit={exitcode}"
|
|
894
900
|
elif now - self._last_seen.get(i, 0) > self.stuck_timeout:
|
|
895
901
|
reason = "stuck"
|
|
896
902
|
worker.kill()
|
|
@@ -904,7 +910,7 @@ class WorkerPool:
|
|
|
904
910
|
if len(self._restart_times) >= self.max_restarts:
|
|
905
911
|
self._degraded = True
|
|
906
912
|
self._start_worker(i)
|
|
907
|
-
restarted.append((i, reason))
|
|
913
|
+
restarted.append((i, reason, exitcode))
|
|
908
914
|
return restarted
|
|
909
915
|
|
|
910
916
|
def matches(self, path):
|
|
@@ -959,6 +965,11 @@ class WorkerPool:
|
|
|
959
965
|
def is_degraded(self):
|
|
960
966
|
return self._degraded
|
|
961
967
|
|
|
968
|
+
@property
|
|
969
|
+
def alive_count(self):
|
|
970
|
+
"""Number of worker processes currently alive."""
|
|
971
|
+
return sum(1 for w in self.workers if w.is_alive())
|
|
972
|
+
|
|
962
973
|
@property
|
|
963
974
|
def pending_count(self):
|
|
964
975
|
try:
|
|
@@ -976,6 +987,7 @@ class WorkerPool:
|
|
|
976
987
|
return {
|
|
977
988
|
'name': self.name,
|
|
978
989
|
'degraded': self._degraded,
|
|
990
|
+
'alive_count': self.alive_count,
|
|
979
991
|
'queue_size': self.pending_count,
|
|
980
992
|
'workers': [
|
|
981
993
|
{
|
|
@@ -1127,6 +1139,9 @@ class Dispatcher:
|
|
|
1127
1139
|
notified via control queue (race possible).
|
|
1128
1140
|
PENDING_STREAM_CLOSED - worker ended the SSE stream cleanly.
|
|
1129
1141
|
PENDING_SHUTDOWN - dispatcher is shutting down; client got 503.
|
|
1142
|
+
PENDING_WORKER_DIED - worker process died/was killed while owning
|
|
1143
|
+
this request; client got 500. on_worker_died()
|
|
1144
|
+
runs first.
|
|
1130
1145
|
|
|
1131
1146
|
Args:
|
|
1132
1147
|
request_id: The request id being removed.
|
|
@@ -1201,6 +1216,11 @@ class Dispatcher:
|
|
|
1201
1216
|
client.respond(
|
|
1202
1217
|
{'error': 'Service unavailable'}, status=503)
|
|
1203
1218
|
return
|
|
1219
|
+
if pool.alive_count == 0:
|
|
1220
|
+
client.respond(
|
|
1221
|
+
{'error': 'No workers available'}, status=503,
|
|
1222
|
+
headers={'Retry-After': '1'})
|
|
1223
|
+
return
|
|
1204
1224
|
if len(self._pending) >= self._max_pending:
|
|
1205
1225
|
client.respond(
|
|
1206
1226
|
{'error': 'Too many requests'}, status=503)
|
|
@@ -1353,8 +1373,18 @@ class Dispatcher:
|
|
|
1353
1373
|
"""Check health of all worker pools and queue sizes."""
|
|
1354
1374
|
for pool in self._pools:
|
|
1355
1375
|
restarted = pool.check_workers()
|
|
1356
|
-
for worker_id, reason in restarted:
|
|
1357
|
-
|
|
1376
|
+
for worker_id, reason, exitcode in restarted:
|
|
1377
|
+
victims = [
|
|
1378
|
+
(rid, p) for rid, p in self._pending.items()
|
|
1379
|
+
if p.pool is pool and p.worker_id == worker_id]
|
|
1380
|
+
try:
|
|
1381
|
+
self.on_worker_died(
|
|
1382
|
+
pool, worker_id, reason, exitcode, victims)
|
|
1383
|
+
except Exception:
|
|
1384
|
+
self.on_log(
|
|
1385
|
+
pool.name, LOG_ERROR,
|
|
1386
|
+
f"on_worker_died() failed:\n"
|
|
1387
|
+
f"{_traceback.format_exc()}")
|
|
1358
1388
|
if pool.queue_warning:
|
|
1359
1389
|
qsize = pool.pending_count
|
|
1360
1390
|
if qsize >= pool.queue_warning:
|
|
@@ -1390,14 +1420,57 @@ class Dispatcher:
|
|
|
1390
1420
|
print(f"{prefix}{level_name:8s} {name:20s} {message}",
|
|
1391
1421
|
file=_sys.stderr)
|
|
1392
1422
|
|
|
1393
|
-
def
|
|
1394
|
-
"""Called when a worker
|
|
1423
|
+
def on_worker_died(self, pool, worker_id, reason, exitcode, victims):
|
|
1424
|
+
"""Called when a worker process died or was killed by the dispatcher.
|
|
1425
|
+
|
|
1426
|
+
Default behavior:
|
|
1427
|
+
1. Log restart reason + each victim (request id, client address,
|
|
1428
|
+
method, path, body size).
|
|
1429
|
+
2. Respond 500 to every victim's client (or response_stream_end()
|
|
1430
|
+
for streams), remove them from _pending, and fire
|
|
1431
|
+
on_pending_removed(PENDING_WORKER_DIED) for each.
|
|
1395
1432
|
|
|
1396
|
-
|
|
1433
|
+
Override to capture victim payloads (e.g., persist to disk for
|
|
1434
|
+
post-mortem) BEFORE calling super(). pending.client gives access
|
|
1435
|
+
to method, path, headers, body, address.
|
|
1436
|
+
|
|
1437
|
+
Args:
|
|
1438
|
+
pool: WorkerPool the worker belonged to.
|
|
1439
|
+
worker_id: Index of the restarted worker.
|
|
1440
|
+
reason: 'stuck' or 'died exit=N' (string from check_workers).
|
|
1441
|
+
exitcode: Process exit code (int) or None for stuck workers.
|
|
1442
|
+
Negative values are signals: -9 SIGKILL (e.g. OOM-kill), -11 SIGSEGV, etc.
|
|
1443
|
+
victims: List of (request_id, _PendingRequest) tuples — requests
|
|
1444
|
+
this worker had claimed (via MSG_HEARTBEAT) but never
|
|
1445
|
+
completed. May be empty if worker died while idle.
|
|
1397
1446
|
"""
|
|
1398
1447
|
self.on_log(
|
|
1399
1448
|
f'{pool.name}[{worker_id}]', LOG_ERROR,
|
|
1400
|
-
f"worker restarted: {reason}"
|
|
1449
|
+
f"worker restarted: {reason}, "
|
|
1450
|
+
f"victims={len(victims)}")
|
|
1451
|
+
for request_id, pending in victims:
|
|
1452
|
+
c = pending.client
|
|
1453
|
+
body_len = len(c.body) if c.body is not None else 0
|
|
1454
|
+
self.on_log(
|
|
1455
|
+
pool.name, LOG_ERROR,
|
|
1456
|
+
f" victim rid={request_id} from={c.address} "
|
|
1457
|
+
f"{c.method} {c.path} body={body_len}B")
|
|
1458
|
+
del self._pending[request_id]
|
|
1459
|
+
if pending.streaming:
|
|
1460
|
+
try:
|
|
1461
|
+
pending.client.response_stream_end()
|
|
1462
|
+
except Exception:
|
|
1463
|
+
pass
|
|
1464
|
+
else:
|
|
1465
|
+
try:
|
|
1466
|
+
pending.client.respond(
|
|
1467
|
+
{'error': 'Worker crashed',
|
|
1468
|
+
'reason': reason},
|
|
1469
|
+
status=500)
|
|
1470
|
+
except Exception:
|
|
1471
|
+
pass
|
|
1472
|
+
self._notify_pending_removed(
|
|
1473
|
+
request_id, pending, PENDING_WORKER_DIED)
|
|
1401
1474
|
|
|
1402
1475
|
def _sigterm(self, _signo, _stack_frame):
|
|
1403
1476
|
self._running = False
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: uhttp-workers
|
|
3
|
-
Version: 1.5.0
|
|
3
|
+
Version: 1.6.0
|
|
4
4
|
Summary: Multi-process worker dispatcher built on uhttp-server
|
|
5
5
|
Author-email: Pavel Revak <pavelrevak@gmail.com>
|
|
6
6
|
License-Expression: MIT
|
|
@@ -565,6 +565,7 @@ Reason is one of:
|
|
|
565
565
|
| `PENDING_DISCONNECTED` | Client disconnected mid-stream; worker was notified via control queue (race possible). |
|
|
566
566
|
| `PENDING_STREAM_CLOSED` | Worker ended the SSE stream cleanly. |
|
|
567
567
|
| `PENDING_SHUTDOWN` | Dispatcher is shutting down; client got 503. |
|
|
568
|
+
| `PENDING_WORKER_DIED` | Worker process died/was killed while owning the request; client got 500. `on_worker_died()` runs first. |
|
|
568
569
|
|
|
569
570
|
The hook is invoked after the client-facing action (respond / disconnect / control queue put)
|
|
570
571
|
so dispatcher state is finalized when it runs. Exceptions raised by the hook are logged at
|
|
@@ -575,6 +576,51 @@ Override `on_pending_removed()` if you need exactly-once cleanup. Overriding bot
|
|
|
575
576
|
but discouraged — for the `PENDING_COMPLETED` reason, `on_response()` is called immediately
|
|
576
577
|
before `on_pending_removed()`.
|
|
577
578
|
|
|
579
|
+
## Worker Death Hook
|
|
580
|
+
|
|
581
|
+
Workers die — segfault in a C extension, OOM-kill, or the dispatcher kills them after
|
|
582
|
+
`stuck_timeout`. Override `on_worker_died()` to capture which requests they had in-flight
|
|
583
|
+
(useful for forensics when a malformed payload reproduces a crash):
|
|
584
|
+
|
|
585
|
+
```python
|
|
586
|
+
class MyDispatcher(_workers.Dispatcher):
|
|
587
|
+
def on_worker_died(
|
|
588
|
+
self, pool, worker_id, reason, exitcode, victims):
|
|
589
|
+
# `victims` is a list of (request_id, _PendingRequest) for all
|
|
590
|
+
# requests this worker had claimed but not completed.
|
|
591
|
+
# `exitcode` is None for stuck workers, otherwise the process exit
|
|
592
|
+
# code (negative = signal: -9 SIGKILL, e.g. OOM-kill; -11 SIGSEGV).
|
|
593
|
+
for rid, pending in victims:
|
|
594
|
+
c = pending.client
|
|
595
|
+
self._crash_queue.append({
|
|
596
|
+
'reason': reason,
|
|
597
|
+
'exitcode': exitcode,
|
|
598
|
+
'method': c.method,
|
|
599
|
+
'path': c.path,
|
|
600
|
+
'address': c.address,
|
|
601
|
+
'body': c.body, # raw bytes — replay this to reproduce
|
|
602
|
+
})
|
|
603
|
+
# Default impl responds 500 to victims and fires
|
|
604
|
+
# on_pending_removed(PENDING_WORKER_DIED). Call it after capture.
|
|
605
|
+
super().on_worker_died(
|
|
606
|
+
pool, worker_id, reason, exitcode, victims)
|
|
607
|
+
```
|
|
608
|
+
|
|
609
|
+
What's a victim: any request the worker had claimed via `MSG_HEARTBEAT`
|
|
610
|
+
(`pending.worker_id == worker_id`). Requests still in the queue (`worker_id is None`)
|
|
611
|
+
are **not** victims — other workers in the pool will pick them up after restart.
|
|
612
|
+
|
|
613
|
+
Default behavior (if you don't override) is to log the death + each victim, respond
|
|
614
|
+
500 (or close the stream for SSE/NDJSON), and fire `on_pending_removed` for each.
|
|
615
|
+
Override only if you want to persist payloads or customize the response status/body.
|
|
616
|
+
|
|
617
|
+
**500 vs 503:** a victim of a crashed worker gets **500** (processing started, then
|
|
618
|
+
the server failed). A new request arriving while the pool has zero alive workers
|
|
619
|
+
gets **503 + `Retry-After: 1`** (rejected before processing — try again shortly).
|
|
620
|
+
A request to a pool that has exceeded `max_restarts` in `restart_window` gets **503**
|
|
621
|
+
permanently (`pool.is_degraded`). `pool.alive_count` is exposed for monitoring and
|
|
622
|
+
also appears in `pool.status()`.
|
|
623
|
+
|
|
578
624
|
## Dispatcher Idle Hook
|
|
579
625
|
|
|
580
626
|
Override `on_idle()` on the dispatcher for periodic background tasks — called on each `select()` timeout (every `SELECT_TIMEOUT` seconds, default 1s):
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|