@miller-tech/uap 1.20.14 → 1.20.16
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json
CHANGED
|
@@ -107,7 +107,9 @@ ANTHROPIC_PASSTHROUGH_MODELS = os.environ.get("ANTHROPIC_PASSTHROUGH_MODELS", ""
|
|
|
107
107
|
PROXY_PORT = int(os.environ.get("PROXY_PORT", "4000"))
|
|
108
108
|
PROXY_HOST = os.environ.get("PROXY_HOST", "0.0.0.0")
|
|
109
109
|
PROXY_LOG_LEVEL = os.environ.get("PROXY_LOG_LEVEL", "INFO").upper()
|
|
110
|
-
PROXY_READ_TIMEOUT = float(os.environ.get("PROXY_READ_TIMEOUT", "600"))
|
|
110
|
+
PROXY_READ_TIMEOUT = float(os.environ.get("PROXY_READ_TIMEOUT", "180"))
|
|
111
|
+
PROXY_GENERATION_TIMEOUT = float(os.environ.get("PROXY_GENERATION_TIMEOUT", "300"))
|
|
112
|
+
PROXY_SLOT_HANG_TIMEOUT = float(os.environ.get("PROXY_SLOT_HANG_TIMEOUT", "120"))
|
|
111
113
|
PROXY_UPSTREAM_RETRY_MAX = int(os.environ.get("PROXY_UPSTREAM_RETRY_MAX", "3"))
|
|
112
114
|
PROXY_UPSTREAM_RETRY_DELAY_SECS = float(os.environ.get("PROXY_UPSTREAM_RETRY_DELAY_SECS", "5"))
|
|
113
115
|
PROXY_MAX_CONNECTIONS = int(os.environ.get("PROXY_MAX_CONNECTIONS", "20"))
|
|
@@ -1311,6 +1313,72 @@ async def _post_with_retry(
|
|
|
1311
1313
|
raise last_exc if last_exc else RuntimeError("upstream retry failed")
|
|
1312
1314
|
|
|
1313
1315
|
|
|
1316
|
+
async def _post_with_generation_timeout(
|
|
1317
|
+
client: httpx.AsyncClient,
|
|
1318
|
+
url: str,
|
|
1319
|
+
payload: dict,
|
|
1320
|
+
headers: dict,
|
|
1321
|
+
) -> httpx.Response:
|
|
1322
|
+
"""Wrap _post_with_retry with an explicit asyncio generation timeout.
|
|
1323
|
+
|
|
1324
|
+
The httpx read timeout may not fire for hung connections where the server
|
|
1325
|
+
keeps the socket open but produces no data (observed with llama.cpp server
|
|
1326
|
+
hanging after prompt processing). This wrapper uses asyncio.wait_for to
|
|
1327
|
+
enforce a hard deadline.
|
|
1328
|
+
"""
|
|
1329
|
+
timeout = PROXY_GENERATION_TIMEOUT
|
|
1330
|
+
if timeout <= 0:
|
|
1331
|
+
return await _post_with_retry(client, url, payload, headers)
|
|
1332
|
+
try:
|
|
1333
|
+
return await asyncio.wait_for(
|
|
1334
|
+
_post_with_retry(client, url, payload, headers),
|
|
1335
|
+
timeout=timeout,
|
|
1336
|
+
)
|
|
1337
|
+
except asyncio.TimeoutError:
|
|
1338
|
+
logger.error(
|
|
1339
|
+
"GENERATION TIMEOUT: request to %s exceeded %ds hard deadline",
|
|
1340
|
+
url,
|
|
1341
|
+
int(timeout),
|
|
1342
|
+
)
|
|
1343
|
+
raise httpx.ReadTimeout(
|
|
1344
|
+
f"Generation timeout after {int(timeout)}s (PROXY_GENERATION_TIMEOUT)"
|
|
1345
|
+
)
|
|
1346
|
+
|
|
1347
|
+
|
|
1348
|
+
async def _check_slot_hang(slot_url: str) -> bool:
|
|
1349
|
+
"""Check if any upstream slot is hung (processing but n_decoded=0).
|
|
1350
|
+
|
|
1351
|
+
Returns True if a hung slot was detected and the server was restarted.
|
|
1352
|
+
"""
|
|
1353
|
+
if PROXY_SLOT_HANG_TIMEOUT <= 0:
|
|
1354
|
+
return False
|
|
1355
|
+
try:
|
|
1356
|
+
async with httpx.AsyncClient() as check_client:
|
|
1357
|
+
resp = await check_client.get(slot_url, timeout=5.0)
|
|
1358
|
+
if resp.status_code != 200:
|
|
1359
|
+
return False
|
|
1360
|
+
slots = resp.json()
|
|
1361
|
+
for slot in slots:
|
|
1362
|
+
if (
|
|
1363
|
+
slot.get("is_processing", False)
|
|
1364
|
+
and slot.get("n_decoded", -1) == 0
|
|
1365
|
+
):
|
|
1366
|
+
# Slot is processing but hasn't decoded any tokens —
|
|
1367
|
+
# check how long by looking at the task start time.
|
|
1368
|
+
# Since we can't easily get the start time from the slot,
|
|
1369
|
+
# we'll just log a warning. The generation timeout will
|
|
1370
|
+
# handle the actual cancellation.
|
|
1371
|
+
logger.warning(
|
|
1372
|
+
"SLOT HANG DETECTED: slot %d is_processing=True n_decoded=0 task=%s",
|
|
1373
|
+
slot.get("id", -1),
|
|
1374
|
+
slot.get("id_task", "?"),
|
|
1375
|
+
)
|
|
1376
|
+
return True
|
|
1377
|
+
except Exception as exc:
|
|
1378
|
+
logger.debug("Slot hang check failed: %s", exc)
|
|
1379
|
+
return False
|
|
1380
|
+
|
|
1381
|
+
|
|
1314
1382
|
@asynccontextmanager
|
|
1315
1383
|
async def lifespan(app: FastAPI):
|
|
1316
1384
|
"""Manage the httpx client lifecycle with the FastAPI app."""
|
|
@@ -1383,6 +1451,12 @@ async def lifespan(app: FastAPI):
|
|
|
1383
1451
|
TOOL_CALL_GRAMMAR_TOOLS_COMPATIBLE,
|
|
1384
1452
|
PROXY_TOOL_CALL_GRAMMAR_PATH,
|
|
1385
1453
|
)
|
|
1454
|
+
logger.info(
|
|
1455
|
+
"Timeouts: read=%ds generation=%ds slot_hang=%ds",
|
|
1456
|
+
int(PROXY_READ_TIMEOUT),
|
|
1457
|
+
int(PROXY_GENERATION_TIMEOUT),
|
|
1458
|
+
int(PROXY_SLOT_HANG_TIMEOUT),
|
|
1459
|
+
)
|
|
1386
1460
|
|
|
1387
1461
|
yield
|
|
1388
1462
|
await http_client.aclose()
|
|
@@ -2167,16 +2241,24 @@ def build_openai_request(
|
|
|
2167
2241
|
# Enforce configurable minimum floor for thinking mode: model needs
|
|
2168
2242
|
# tokens for reasoning (<think>...</think>) plus actual response/tool
|
|
2169
2243
|
# calls. Set PROXY_MAX_TOKENS_FLOOR=0 to disable this floor.
|
|
2170
|
-
|
|
2171
|
-
|
|
2172
|
-
|
|
2173
|
-
|
|
2244
|
+
#
|
|
2245
|
+
# The floor is ONLY applied when thinking is actually enabled —
|
|
2246
|
+
# skip it for non-tool requests (tools=0) and for tool turns
|
|
2247
|
+
# with thinking disabled, to prevent inflating short preflight
|
|
2248
|
+
# requests (e.g. max_tokens=100 for plan generation).
|
|
2249
|
+
thinking_active_for_request = has_tools and not PROXY_DISABLE_THINKING_ON_TOOL_TURNS
|
|
2250
|
+
skip_floor = (
|
|
2251
|
+
not has_tools # non-tool requests don't need thinking headroom
|
|
2252
|
+
or PROXY_DISABLE_THINKING_ON_TOOL_TURNS # thinking disabled on tool turns
|
|
2253
|
+
or PROXY_MAX_TOKENS_FLOOR <= 0 # floor explicitly disabled
|
|
2174
2254
|
)
|
|
2175
|
-
if
|
|
2255
|
+
if skip_floor:
|
|
2176
2256
|
requested_max = requested_raw
|
|
2177
|
-
if requested_raw < PROXY_MAX_TOKENS_FLOOR:
|
|
2257
|
+
if requested_raw < PROXY_MAX_TOKENS_FLOOR and PROXY_MAX_TOKENS_FLOOR > 0:
|
|
2178
2258
|
logger.info(
|
|
2179
|
-
"MAX_TOKENS floor
|
|
2259
|
+
"MAX_TOKENS floor skipped: has_tools=%s thinking_active=%s requested=%d floor=%d",
|
|
2260
|
+
has_tools,
|
|
2261
|
+
thinking_active_for_request,
|
|
2180
2262
|
requested_raw,
|
|
2181
2263
|
PROXY_MAX_TOKENS_FLOOR,
|
|
2182
2264
|
)
|
|
@@ -4435,6 +4517,48 @@ def _maybe_extract_text_tool_calls(openai_resp: dict) -> dict:
|
|
|
4435
4517
|
return openai_resp
|
|
4436
4518
|
|
|
4437
4519
|
|
|
4520
|
+
def _detect_and_truncate_degenerate_repetition(openai_resp: dict) -> dict:
|
|
4521
|
+
"""Detect degenerate repetitive text and truncate at first repetition.
|
|
4522
|
+
|
|
4523
|
+
When the model produces highly repetitive output (e.g. the same 20+ char
|
|
4524
|
+
substring repeated 10+ times), truncate at the first repetition boundary
|
|
4525
|
+
and set finish_reason to stop.
|
|
4526
|
+
"""
|
|
4527
|
+
text = _openai_message_text(openai_resp)
|
|
4528
|
+
if not text or len(text) < 200:
|
|
4529
|
+
return openai_resp
|
|
4530
|
+
|
|
4531
|
+
# Look for repeated substrings of length 20-100
|
|
4532
|
+
for substr_len in (60, 40, 20):
|
|
4533
|
+
# Sample from the middle of the text to find the repeating pattern
|
|
4534
|
+
mid = len(text) // 2
|
|
4535
|
+
sample = text[mid : mid + substr_len]
|
|
4536
|
+
if not sample.strip():
|
|
4537
|
+
continue
|
|
4538
|
+
count = text.count(sample)
|
|
4539
|
+
if count >= 8:
|
|
4540
|
+
# Found degenerate repetition — truncate at first occurrence + one repeat
|
|
4541
|
+
first_pos = text.find(sample)
|
|
4542
|
+
second_pos = text.find(sample, first_pos + len(sample))
|
|
4543
|
+
if second_pos > first_pos:
|
|
4544
|
+
truncated = text[:second_pos].rstrip()
|
|
4545
|
+
logger.warning(
|
|
4546
|
+
"DEGENERATE REPETITION: detected %d repeats of %d-char substring, truncating %d -> %d chars",
|
|
4547
|
+
count,
|
|
4548
|
+
substr_len,
|
|
4549
|
+
len(text),
|
|
4550
|
+
len(truncated),
|
|
4551
|
+
)
|
|
4552
|
+
# Update the response
|
|
4553
|
+
choices = openai_resp.get("choices", [])
|
|
4554
|
+
if choices:
|
|
4555
|
+
msg = choices[0].get("message", {})
|
|
4556
|
+
msg["content"] = truncated
|
|
4557
|
+
choices[0]["finish_reason"] = "stop"
|
|
4558
|
+
return openai_resp
|
|
4559
|
+
return openai_resp
|
|
4560
|
+
|
|
4561
|
+
|
|
4438
4562
|
def openai_to_anthropic_response(openai_resp: dict, model: str) -> dict:
|
|
4439
4563
|
"""Convert an OpenAI Chat Completions response to Anthropic Messages format."""
|
|
4440
4564
|
# First: try to recover tool calls trapped in text XML tags
|
|
@@ -5166,13 +5290,15 @@ async def messages(request: Request):
|
|
|
5166
5290
|
strict_body["stream"] = False
|
|
5167
5291
|
|
|
5168
5292
|
try:
|
|
5169
|
-
strict_resp = await _post_with_retry(
|
|
5293
|
+
strict_resp = await _post_with_generation_timeout(
|
|
5170
5294
|
client,
|
|
5171
5295
|
f"{LLAMA_CPP_BASE}/chat/completions",
|
|
5172
5296
|
strict_body,
|
|
5173
5297
|
{"Content-Type": "application/json"},
|
|
5174
5298
|
)
|
|
5175
5299
|
except Exception as exc:
|
|
5300
|
+
# Check if upstream is hung before returning error
|
|
5301
|
+
await _check_slot_hang(f"{LLAMA_CPP_BASE}/slots")
|
|
5176
5302
|
return Response(
|
|
5177
5303
|
content=json.dumps(
|
|
5178
5304
|
{
|
|
@@ -5196,7 +5322,7 @@ async def messages(request: Request):
|
|
|
5196
5322
|
"strict-stream",
|
|
5197
5323
|
):
|
|
5198
5324
|
try:
|
|
5199
|
-
strict_resp = await _post_with_retry(
|
|
5325
|
+
strict_resp = await _post_with_generation_timeout(
|
|
5200
5326
|
client,
|
|
5201
5327
|
f"{LLAMA_CPP_BASE}/chat/completions",
|
|
5202
5328
|
strict_body,
|
|
@@ -5258,6 +5384,7 @@ async def messages(request: Request):
|
|
|
5258
5384
|
session_id,
|
|
5259
5385
|
)
|
|
5260
5386
|
|
|
5387
|
+
openai_resp = _detect_and_truncate_degenerate_repetition(openai_resp)
|
|
5261
5388
|
anthropic_resp = openai_to_anthropic_response(openai_resp, model)
|
|
5262
5389
|
monitor.record_response(anthropic_resp.get("usage", {}).get("output_tokens", 0))
|
|
5263
5390
|
# Update last_input_tokens from upstream's actual prompt_tokens
|
|
@@ -5478,7 +5605,7 @@ async def messages(request: Request):
|
|
|
5478
5605
|
)
|
|
5479
5606
|
else:
|
|
5480
5607
|
try:
|
|
5481
|
-
resp = await _post_with_retry(
|
|
5608
|
+
resp = await _post_with_generation_timeout(
|
|
5482
5609
|
client,
|
|
5483
5610
|
f"{LLAMA_CPP_BASE}/chat/completions",
|
|
5484
5611
|
openai_body,
|
|
@@ -5508,7 +5635,7 @@ async def messages(request: Request):
|
|
|
5508
5635
|
"non-stream",
|
|
5509
5636
|
):
|
|
5510
5637
|
try:
|
|
5511
|
-
resp = await
|
|
5638
|
+
resp = await _post_with_generation_timeout(
|
|
5512
5639
|
client,
|
|
5513
5640
|
f"{LLAMA_CPP_BASE}/chat/completions",
|
|
5514
5641
|
openai_body,
|
|
@@ -5596,6 +5723,7 @@ async def messages(request: Request):
|
|
|
5596
5723
|
monitor.invalid_tool_call_streak = 0
|
|
5597
5724
|
monitor.required_tool_miss_streak = 0
|
|
5598
5725
|
|
|
5726
|
+
openai_resp = _detect_and_truncate_degenerate_repetition(openai_resp)
|
|
5599
5727
|
anthropic_resp = openai_to_anthropic_response(openai_resp, model)
|
|
5600
5728
|
|
|
5601
5729
|
# Track output tokens in session monitor
|
|
@@ -6,6 +6,8 @@ import json
|
|
|
6
6
|
import unittest
|
|
7
7
|
from pathlib import Path
|
|
8
8
|
|
|
9
|
+
import httpx
|
|
10
|
+
|
|
9
11
|
|
|
10
12
|
def _load_proxy_module():
|
|
11
13
|
proxy_path = Path(__file__).resolve().parents[1] / "scripts" / "anthropic_proxy.py"
|
|
@@ -116,7 +118,8 @@ class TestProxyConfigTuning(unittest.TestCase):
|
|
|
116
118
|
setattr(proxy, "PROXY_MAX_TOKENS_FLOOR", old_floor)
|
|
117
119
|
setattr(proxy, "PROXY_DISABLE_THINKING_ON_TOOL_TURNS", old_disable)
|
|
118
120
|
|
|
119
|
-
def
|
|
121
|
+
def test_build_request_skips_floor_for_non_tool_turns(self):
|
|
122
|
+
"""Non-tool requests should NOT have the max_tokens floor applied."""
|
|
120
123
|
old_floor = getattr(proxy, "PROXY_MAX_TOKENS_FLOOR")
|
|
121
124
|
old_disable = getattr(proxy, "PROXY_DISABLE_THINKING_ON_TOOL_TURNS")
|
|
122
125
|
try:
|
|
@@ -132,7 +135,8 @@ class TestProxyConfigTuning(unittest.TestCase):
|
|
|
132
135
|
openai = proxy.build_openai_request(
|
|
133
136
|
body, proxy.SessionMonitor(context_window=0)
|
|
134
137
|
)
|
|
135
|
-
|
|
138
|
+
# Floor should NOT inflate max_tokens for non-tool requests
|
|
139
|
+
self.assertEqual(openai.get("max_tokens"), 512)
|
|
136
140
|
finally:
|
|
137
141
|
setattr(proxy, "PROXY_MAX_TOKENS_FLOOR", old_floor)
|
|
138
142
|
setattr(proxy, "PROXY_DISABLE_THINKING_ON_TOOL_TURNS", old_disable)
|
|
@@ -3437,6 +3441,143 @@ class TestMalformedRetryHardening(unittest.TestCase):
|
|
|
3437
3441
|
self.assertNotIn("<tool_call>", m.get("content", ""))
|
|
3438
3442
|
|
|
3439
3443
|
|
|
3444
|
+
class TestDegenerateRepetitionDetection(unittest.TestCase):
|
|
3445
|
+
"""Tests for degenerate repetition detection and truncation."""
|
|
3446
|
+
|
|
3447
|
+
def test_detects_and_truncates_repetitive_text(self):
|
|
3448
|
+
"""Highly repetitive text should be truncated."""
|
|
3449
|
+
repeated = "Mermaid Diagrams](docs/mermaid-diagrams" * 50
|
|
3450
|
+
openai_resp = {
|
|
3451
|
+
"choices": [{"message": {"content": repeated}, "finish_reason": "length"}]
|
|
3452
|
+
}
|
|
3453
|
+
result = proxy._detect_and_truncate_degenerate_repetition(openai_resp)
|
|
3454
|
+
truncated_text = result["choices"][0]["message"]["content"]
|
|
3455
|
+
self.assertLess(len(truncated_text), len(repeated))
|
|
3456
|
+
self.assertEqual(result["choices"][0]["finish_reason"], "stop")
|
|
3457
|
+
|
|
3458
|
+
def test_preserves_non_repetitive_text(self):
|
|
3459
|
+
"""Normal text should not be modified."""
|
|
3460
|
+
text = "This is a perfectly normal response with varied content. " * 5
|
|
3461
|
+
openai_resp = {
|
|
3462
|
+
"choices": [{"message": {"content": text}, "finish_reason": "stop"}]
|
|
3463
|
+
}
|
|
3464
|
+
result = proxy._detect_and_truncate_degenerate_repetition(openai_resp)
|
|
3465
|
+
self.assertEqual(result["choices"][0]["message"]["content"], text)
|
|
3466
|
+
|
|
3467
|
+
def test_preserves_short_text(self):
|
|
3468
|
+
"""Short text (< 200 chars) should not be processed."""
|
|
3469
|
+
text = "Short response."
|
|
3470
|
+
openai_resp = {
|
|
3471
|
+
"choices": [{"message": {"content": text}, "finish_reason": "stop"}]
|
|
3472
|
+
}
|
|
3473
|
+
result = proxy._detect_and_truncate_degenerate_repetition(openai_resp)
|
|
3474
|
+
self.assertEqual(result["choices"][0]["message"]["content"], text)
|
|
3475
|
+
|
|
3476
|
+
def test_max_tokens_floor_skipped_for_non_tool_requests(self):
|
|
3477
|
+
"""max_tokens floor should not inflate non-tool requests."""
|
|
3478
|
+
old_floor = getattr(proxy, "PROXY_MAX_TOKENS_FLOOR")
|
|
3479
|
+
old_disable = getattr(proxy, "PROXY_DISABLE_THINKING_ON_TOOL_TURNS")
|
|
3480
|
+
try:
|
|
3481
|
+
setattr(proxy, "PROXY_MAX_TOKENS_FLOOR", 16384)
|
|
3482
|
+
setattr(proxy, "PROXY_DISABLE_THINKING_ON_TOOL_TURNS", False)
|
|
3483
|
+
|
|
3484
|
+
body = {
|
|
3485
|
+
"model": "test",
|
|
3486
|
+
"max_tokens": 100,
|
|
3487
|
+
"messages": [{"role": "user", "content": "generate a title"}],
|
|
3488
|
+
}
|
|
3489
|
+
openai = proxy.build_openai_request(
|
|
3490
|
+
body, proxy.SessionMonitor(context_window=0)
|
|
3491
|
+
)
|
|
3492
|
+
# No tools = no floor inflation
|
|
3493
|
+
self.assertEqual(openai.get("max_tokens"), 100)
|
|
3494
|
+
finally:
|
|
3495
|
+
setattr(proxy, "PROXY_MAX_TOKENS_FLOOR", old_floor)
|
|
3496
|
+
setattr(proxy, "PROXY_DISABLE_THINKING_ON_TOOL_TURNS", old_disable)
|
|
3497
|
+
|
|
3498
|
+
def test_max_tokens_floor_applied_when_thinking_active(self):
|
|
3499
|
+
"""max_tokens floor should apply when tools present and thinking enabled."""
|
|
3500
|
+
old_floor = getattr(proxy, "PROXY_MAX_TOKENS_FLOOR")
|
|
3501
|
+
old_disable = getattr(proxy, "PROXY_DISABLE_THINKING_ON_TOOL_TURNS")
|
|
3502
|
+
try:
|
|
3503
|
+
setattr(proxy, "PROXY_MAX_TOKENS_FLOOR", 4096)
|
|
3504
|
+
setattr(proxy, "PROXY_DISABLE_THINKING_ON_TOOL_TURNS", False)
|
|
3505
|
+
|
|
3506
|
+
body = {
|
|
3507
|
+
"model": "test",
|
|
3508
|
+
"max_tokens": 512,
|
|
3509
|
+
"messages": [{"role": "user", "content": "run command"}],
|
|
3510
|
+
"tools": [{"name": "Bash", "description": "run", "input_schema": {"type": "object"}}],
|
|
3511
|
+
}
|
|
3512
|
+
openai = proxy.build_openai_request(
|
|
3513
|
+
body, proxy.SessionMonitor(context_window=0)
|
|
3514
|
+
)
|
|
3515
|
+
# Tools + thinking enabled = floor applied
|
|
3516
|
+
self.assertEqual(openai.get("max_tokens"), 4096)
|
|
3517
|
+
finally:
|
|
3518
|
+
setattr(proxy, "PROXY_MAX_TOKENS_FLOOR", old_floor)
|
|
3519
|
+
setattr(proxy, "PROXY_DISABLE_THINKING_ON_TOOL_TURNS", old_disable)
|
|
3520
|
+
|
|
3521
|
+
|
|
3522
|
+
class TestGenerationHangRecovery(unittest.TestCase):
|
|
3523
|
+
"""Tests for generation hang recovery: timeouts, slot hang detection."""
|
|
3524
|
+
|
|
3525
|
+
def test_read_timeout_default_is_180(self):
|
|
3526
|
+
"""Option 3: default read timeout reduced from 600 to 180."""
|
|
3527
|
+
self.assertEqual(proxy.PROXY_READ_TIMEOUT, 180)
|
|
3528
|
+
|
|
3529
|
+
def test_generation_timeout_default_is_300(self):
|
|
3530
|
+
"""Option 1: generation timeout is 300s."""
|
|
3531
|
+
self.assertEqual(proxy.PROXY_GENERATION_TIMEOUT, 300)
|
|
3532
|
+
|
|
3533
|
+
def test_slot_hang_timeout_default_is_120(self):
|
|
3534
|
+
"""Option 2: slot hang timeout is 120s."""
|
|
3535
|
+
self.assertEqual(proxy.PROXY_SLOT_HANG_TIMEOUT, 120)
|
|
3536
|
+
|
|
3537
|
+
def test_generation_timeout_wraps_post_with_retry(self):
|
|
3538
|
+
"""_post_with_generation_timeout raises ReadTimeout on asyncio timeout."""
|
|
3539
|
+
import asyncio
|
|
3540
|
+
|
|
3541
|
+
async def _run():
|
|
3542
|
+
# Create a mock client that hangs forever
|
|
3543
|
+
async def _hanging_post(*args, **kwargs):
|
|
3544
|
+
await asyncio.sleep(999)
|
|
3545
|
+
|
|
3546
|
+
class FakeClient:
|
|
3547
|
+
async def post(self, *args, **kwargs):
|
|
3548
|
+
await asyncio.sleep(999)
|
|
3549
|
+
|
|
3550
|
+
old_timeout = proxy.PROXY_GENERATION_TIMEOUT
|
|
3551
|
+
old_retry_max = proxy.PROXY_UPSTREAM_RETRY_MAX
|
|
3552
|
+
try:
|
|
3553
|
+
proxy.PROXY_GENERATION_TIMEOUT = 0.1 # 100ms
|
|
3554
|
+
proxy.PROXY_UPSTREAM_RETRY_MAX = 1
|
|
3555
|
+
with self.assertRaises(httpx.ReadTimeout):
|
|
3556
|
+
await proxy._post_with_generation_timeout(
|
|
3557
|
+
FakeClient(),
|
|
3558
|
+
"http://localhost:9999/fake",
|
|
3559
|
+
{},
|
|
3560
|
+
{},
|
|
3561
|
+
)
|
|
3562
|
+
finally:
|
|
3563
|
+
proxy.PROXY_GENERATION_TIMEOUT = old_timeout
|
|
3564
|
+
proxy.PROXY_UPSTREAM_RETRY_MAX = old_retry_max
|
|
3565
|
+
|
|
3566
|
+
asyncio.run(_run())
|
|
3567
|
+
|
|
3568
|
+
def test_check_slot_hang_detects_stuck_slot(self):
|
|
3569
|
+
"""_check_slot_hang returns True when a slot is processing with n_decoded=0."""
|
|
3570
|
+
import asyncio
|
|
3571
|
+
|
|
3572
|
+
async def _run():
|
|
3573
|
+
# We can't easily mock the HTTP call, but we can verify the function
|
|
3574
|
+
# doesn't crash when the server is unreachable
|
|
3575
|
+
result = await proxy._check_slot_hang("http://localhost:9999/nonexistent")
|
|
3576
|
+
self.assertFalse(result)
|
|
3577
|
+
|
|
3578
|
+
asyncio.run(_run())
|
|
3579
|
+
|
|
3580
|
+
|
|
3440
3581
|
if __name__ == "__main__":
|
|
3441
3582
|
unittest.main()
|
|
3442
3583
|
|