@miller-tech/uap 1.20.13 → 1.20.15
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json
CHANGED
|
@@ -219,7 +219,7 @@ PROXY_MALFORMED_TOOL_GUARDRAIL = os.environ.get(
|
|
|
219
219
|
"no",
|
|
220
220
|
}
|
|
221
221
|
# Retry budget for malformed tool-call recovery (default: 3 attempts).
PROXY_MALFORMED_TOOL_RETRY_MAX = int(os.environ.get("PROXY_MALFORMED_TOOL_RETRY_MAX", "3"))
|
|
224
224
|
PROXY_MALFORMED_TOOL_RETRY_MAX_TOKENS = int(
|
|
225
225
|
os.environ.get("PROXY_MALFORMED_TOOL_RETRY_MAX_TOKENS", "2048")
|
|
@@ -2167,16 +2167,24 @@ def build_openai_request(
|
|
|
2167
2167
|
# Enforce configurable minimum floor for thinking mode: model needs
|
|
2168
2168
|
# tokens for reasoning (<think>...</think>) plus actual response/tool
|
|
2169
2169
|
# calls. Set PROXY_MAX_TOKENS_FLOOR=0 to disable this floor.
|
|
2170
|
-
|
|
2171
|
-
|
|
2172
|
-
|
|
2173
|
-
|
|
2170
|
+
#
|
|
2171
|
+
# The floor is ONLY applied when thinking is actually enabled —
|
|
2172
|
+
# skip it for non-tool requests (tools=0) and for tool turns
|
|
2173
|
+
# with thinking disabled, to prevent inflating short preflight
|
|
2174
|
+
# requests (e.g. max_tokens=100 for plan generation).
|
|
2175
|
+
thinking_active_for_request = has_tools and not PROXY_DISABLE_THINKING_ON_TOOL_TURNS
|
|
2176
|
+
skip_floor = (
|
|
2177
|
+
not has_tools # non-tool requests don't need thinking headroom
|
|
2178
|
+
or PROXY_DISABLE_THINKING_ON_TOOL_TURNS # thinking disabled on tool turns
|
|
2179
|
+
or PROXY_MAX_TOKENS_FLOOR <= 0 # floor explicitly disabled
|
|
2174
2180
|
)
|
|
2175
|
-
if
|
|
2181
|
+
if skip_floor:
|
|
2176
2182
|
requested_max = requested_raw
|
|
2177
|
-
if requested_raw < PROXY_MAX_TOKENS_FLOOR:
|
|
2183
|
+
if requested_raw < PROXY_MAX_TOKENS_FLOOR and PROXY_MAX_TOKENS_FLOOR > 0:
|
|
2178
2184
|
logger.info(
|
|
2179
|
-
"MAX_TOKENS floor
|
|
2185
|
+
"MAX_TOKENS floor skipped: has_tools=%s thinking_active=%s requested=%d floor=%d",
|
|
2186
|
+
has_tools,
|
|
2187
|
+
thinking_active_for_request,
|
|
2180
2188
|
requested_raw,
|
|
2181
2189
|
PROXY_MAX_TOKENS_FLOOR,
|
|
2182
2190
|
)
|
|
@@ -3890,6 +3898,40 @@ async def _apply_completion_contract_guardrail(
|
|
|
3890
3898
|
return retried
|
|
3891
3899
|
|
|
3892
3900
|
|
|
3901
|
+
def _sanitize_assistant_messages_for_retry(messages: list[dict]) -> list[dict]:
|
|
3902
|
+
"""Strip malformed tool-like text from assistant messages to prevent copy-contamination.
|
|
3903
|
+
|
|
3904
|
+
Only sanitizes the last 4 assistant messages to avoid excessive processing.
|
|
3905
|
+
"""
|
|
3906
|
+
import re
|
|
3907
|
+
|
|
3908
|
+
# Patterns that indicate malformed tool call text in assistant content
|
|
3909
|
+
_TOOL_LIKE_PATTERNS = re.compile(
|
|
3910
|
+
r"<tool_call>.*?</tool_call>"
|
|
3911
|
+
r"|<function_call>.*?</function_call>"
|
|
3912
|
+
r'|\{"name"\s*:\s*"[^"]+"\s*,\s*"arguments"\s*:'
|
|
3913
|
+
r"|```json\s*\{[^}]*\"name\"\s*:",
|
|
3914
|
+
re.DOTALL,
|
|
3915
|
+
)
|
|
3916
|
+
|
|
3917
|
+
result = list(messages)
|
|
3918
|
+
sanitized_count = 0
|
|
3919
|
+
for i in range(len(result) - 1, -1, -1):
|
|
3920
|
+
if sanitized_count >= 4:
|
|
3921
|
+
break
|
|
3922
|
+
msg = result[i]
|
|
3923
|
+
if msg.get("role") != "assistant":
|
|
3924
|
+
continue
|
|
3925
|
+
content = msg.get("content", "")
|
|
3926
|
+
if isinstance(content, str) and _TOOL_LIKE_PATTERNS.search(content):
|
|
3927
|
+
cleaned = _TOOL_LIKE_PATTERNS.sub("", content).strip()
|
|
3928
|
+
if not cleaned:
|
|
3929
|
+
cleaned = "I will use the appropriate tool."
|
|
3930
|
+
result[i] = {**msg, "content": cleaned}
|
|
3931
|
+
sanitized_count += 1
|
|
3932
|
+
return result
|
|
3933
|
+
|
|
3934
|
+
|
|
3893
3935
|
def _build_malformed_retry_body(
|
|
3894
3936
|
openai_body: dict,
|
|
3895
3937
|
anthropic_body: dict,
|
|
@@ -3901,7 +3943,11 @@ def _build_malformed_retry_body(
|
|
|
3901
3943
|
retry_body = dict(openai_body)
|
|
3902
3944
|
retry_body["stream"] = False
|
|
3903
3945
|
retry_body["tool_choice"] = tool_choice
|
|
3904
|
-
|
|
3946
|
+
# Escalate temperature down on successive retries for more deterministic output
|
|
3947
|
+
if total_attempts > 1 and attempt > 1:
|
|
3948
|
+
retry_body["temperature"] = 0.0
|
|
3949
|
+
else:
|
|
3950
|
+
retry_body["temperature"] = PROXY_MALFORMED_TOOL_RETRY_TEMPERATURE
|
|
3905
3951
|
|
|
3906
3952
|
if tool_choice == "required":
|
|
3907
3953
|
retry_instruction = (
|
|
@@ -3922,7 +3968,10 @@ def _build_malformed_retry_body(
|
|
|
3922
3968
|
}
|
|
3923
3969
|
existing_messages = retry_body.get("messages")
|
|
3924
3970
|
if isinstance(existing_messages, list) and existing_messages:
|
|
3925
|
-
|
|
3971
|
+
# Strip malformed tool-like text from assistant messages to prevent
|
|
3972
|
+
# the model from copying contaminated patterns on retry
|
|
3973
|
+
sanitized = _sanitize_assistant_messages_for_retry(existing_messages)
|
|
3974
|
+
retry_body["messages"] = [*sanitized, malformed_retry_instruction]
|
|
3926
3975
|
|
|
3927
3976
|
if PROXY_MALFORMED_TOOL_RETRY_MAX_TOKENS > 0:
|
|
3928
3977
|
current_max = int(
|
|
@@ -4394,6 +4443,48 @@ def _maybe_extract_text_tool_calls(openai_resp: dict) -> dict:
|
|
|
4394
4443
|
return openai_resp
|
|
4395
4444
|
|
|
4396
4445
|
|
|
4446
|
+
def _detect_and_truncate_degenerate_repetition(openai_resp: dict) -> dict:
    """Detect degenerate repetitive text and truncate at first repetition.

    When the model produces highly repetitive output (e.g. the same 20+ char
    substring repeated many times), truncate the message content at the first
    repetition boundary and force ``finish_reason`` to ``"stop"``.

    Args:
        openai_resp: OpenAI Chat Completions response dict; its first choice
            is modified in place when repetition is detected.

    Returns:
        The (possibly modified) response dict.
    """
    text = _openai_message_text(openai_resp)
    # Too short to exhibit the pathological repeat loops we care about.
    if not text or len(text) < 200:
        return openai_resp

    # The midpoint is loop-invariant; compute it once instead of per length.
    mid = len(text) // 2
    # Try longer samples first: a long repeated substring is stronger
    # evidence of degeneration than a short one.
    for substr_len in (60, 40, 20):
        # Sample from the middle of the text, where a degenerate loop is
        # almost certainly already in full swing.
        sample = text[mid : mid + substr_len]
        if not sample.strip():
            continue
        count = text.count(sample)
        if count >= 8:
            # Found degenerate repetition — keep everything up to (and
            # including) the first full repeat, drop the rest.
            first_pos = text.find(sample)
            second_pos = text.find(sample, first_pos + len(sample))
            if second_pos > first_pos:
                truncated = text[:second_pos].rstrip()
                logger.warning(
                    "DEGENERATE REPETITION: detected %d repeats of %d-char substring, truncating %d -> %d chars",
                    count,
                    substr_len,
                    len(text),
                    len(truncated),
                )
                # Update the response.  setdefault ensures the truncated
                # content is actually attached to the choice even when the
                # "message" key is missing (a plain .get() would write into
                # a throwaway dict while still flipping finish_reason).
                choices = openai_resp.get("choices", [])
                if choices:
                    msg = choices[0].setdefault("message", {})
                    msg["content"] = truncated
                    choices[0]["finish_reason"] = "stop"
                return openai_resp
    return openai_resp
|
|
4486
|
+
|
|
4487
|
+
|
|
4397
4488
|
def openai_to_anthropic_response(openai_resp: dict, model: str) -> dict:
|
|
4398
4489
|
"""Convert an OpenAI Chat Completions response to Anthropic Messages format."""
|
|
4399
4490
|
# First: try to recover tool calls trapped in text XML tags
|
|
@@ -5217,6 +5308,7 @@ async def messages(request: Request):
|
|
|
5217
5308
|
session_id,
|
|
5218
5309
|
)
|
|
5219
5310
|
|
|
5311
|
+
openai_resp = _detect_and_truncate_degenerate_repetition(openai_resp)
|
|
5220
5312
|
anthropic_resp = openai_to_anthropic_response(openai_resp, model)
|
|
5221
5313
|
monitor.record_response(anthropic_resp.get("usage", {}).get("output_tokens", 0))
|
|
5222
5314
|
# Update last_input_tokens from upstream's actual prompt_tokens
|
|
@@ -5555,6 +5647,7 @@ async def messages(request: Request):
|
|
|
5555
5647
|
monitor.invalid_tool_call_streak = 0
|
|
5556
5648
|
monitor.required_tool_miss_streak = 0
|
|
5557
5649
|
|
|
5650
|
+
openai_resp = _detect_and_truncate_degenerate_repetition(openai_resp)
|
|
5558
5651
|
anthropic_resp = openai_to_anthropic_response(openai_resp, model)
|
|
5559
5652
|
|
|
5560
5653
|
# Track output tokens in session monitor
|
|
@@ -116,7 +116,8 @@ class TestProxyConfigTuning(unittest.TestCase):
|
|
|
116
116
|
setattr(proxy, "PROXY_MAX_TOKENS_FLOOR", old_floor)
|
|
117
117
|
setattr(proxy, "PROXY_DISABLE_THINKING_ON_TOOL_TURNS", old_disable)
|
|
118
118
|
|
|
119
|
-
def
|
|
119
|
+
def test_build_request_skips_floor_for_non_tool_turns(self):
|
|
120
|
+
"""Non-tool requests should NOT have the max_tokens floor applied."""
|
|
120
121
|
old_floor = getattr(proxy, "PROXY_MAX_TOKENS_FLOOR")
|
|
121
122
|
old_disable = getattr(proxy, "PROXY_DISABLE_THINKING_ON_TOOL_TURNS")
|
|
122
123
|
try:
|
|
@@ -132,7 +133,8 @@ class TestProxyConfigTuning(unittest.TestCase):
|
|
|
132
133
|
openai = proxy.build_openai_request(
|
|
133
134
|
body, proxy.SessionMonitor(context_window=0)
|
|
134
135
|
)
|
|
135
|
-
|
|
136
|
+
# Floor should NOT inflate max_tokens for non-tool requests
|
|
137
|
+
self.assertEqual(openai.get("max_tokens"), 512)
|
|
136
138
|
finally:
|
|
137
139
|
setattr(proxy, "PROXY_MAX_TOKENS_FLOOR", old_floor)
|
|
138
140
|
setattr(proxy, "PROXY_DISABLE_THINKING_ON_TOOL_TURNS", old_disable)
|
|
@@ -3377,6 +3379,144 @@ class TestCycleBreakOptions(unittest.TestCase):
|
|
|
3377
3379
|
self.assertEqual(monitor.cycling_tool_names, [])
|
|
3378
3380
|
|
|
3379
3381
|
|
|
3382
|
+
class TestMalformedRetryHardening(unittest.TestCase):
    """Tests for malformed retry improvements: budget, temp escalation, message sanitization."""

    def test_retry_max_default_is_3(self):
        """Option 1: default retry budget increased from 2 to 3."""
        self.assertEqual(proxy.PROXY_MALFORMED_TOOL_RETRY_MAX, 3)

    def test_sanitize_assistant_messages_strips_tool_like_text(self):
        """Option 3: malformed tool-like text stripped from assistant messages on retry."""
        history = [
            {"role": "system", "content": "You are helpful."},
            {"role": "user", "content": "Run a command"},
            {
                "role": "assistant",
                "content": 'Here is the result <tool_call>{"name": "Bash", "arguments": {"command": "ls"}}</tool_call>',
            },
            {"role": "user", "content": "ok"},
        ]
        cleaned = proxy._sanitize_assistant_messages_for_retry(history)
        # Non-assistant turns pass through byte-for-byte.
        for idx, expected in ((0, "You are helpful."), (1, "Run a command"), (3, "ok")):
            self.assertEqual(cleaned[idx]["content"], expected)
        # The assistant turn loses its embedded pseudo tool call entirely.
        self.assertNotIn("<tool_call>", cleaned[2]["content"])
        self.assertNotIn("Bash", cleaned[2]["content"])

    def test_sanitize_preserves_clean_assistant_messages(self):
        """Clean assistant messages are not modified by sanitization."""
        untouched = "I will read the file for you."
        cleaned = proxy._sanitize_assistant_messages_for_retry(
            [{"role": "assistant", "content": untouched}]
        )
        self.assertEqual(cleaned[0]["content"], untouched)

    def test_sanitize_replaces_empty_content_with_placeholder(self):
        """If stripping leaves empty content, a placeholder is used."""
        cleaned = proxy._sanitize_assistant_messages_for_retry(
            [{"role": "assistant", "content": '<tool_call>{"name": "Bash", "arguments": {}}</tool_call>'}]
        )
        self.assertEqual(cleaned[0]["content"], "I will use the appropriate tool.")

    def test_retry_body_uses_sanitized_messages(self):
        """Retry body messages are sanitized before adding retry instruction."""
        openai_body = {
            "messages": [
                {"role": "system", "content": "sys"},
                {"role": "user", "content": "do it"},
                {"role": "assistant", "content": '<tool_call>{"name":"X","arguments":{}}</tool_call>'},
            ],
            "tools": [{"type": "function", "function": {"name": "X", "parameters": {}}}],
        }
        anthropic_body = {"tools": [{"name": "X", "input_schema": {"type": "object"}}]}
        retry = proxy._build_malformed_retry_body(
            openai_body, anthropic_body, attempt=1, total_attempts=3,
        )
        # Every assistant message in the rebuilt body must be decontaminated.
        for msg in (m for m in retry["messages"] if m.get("role") == "assistant"):
            self.assertNotIn("<tool_call>", msg.get("content", ""))
|
|
3440
|
+
|
|
3441
|
+
|
|
3442
|
+
class TestDegenerateRepetitionDetection(unittest.TestCase):
    """Tests for degenerate repetition detection and truncation."""

    @staticmethod
    def _resp(content, finish_reason):
        """Build a minimal OpenAI-shaped response with a single choice."""
        return {"choices": [{"message": {"content": content}, "finish_reason": finish_reason}]}

    def test_detects_and_truncates_repetitive_text(self):
        """Highly repetitive text should be truncated."""
        blob = "Mermaid Diagrams](docs/mermaid-diagrams" * 50
        out = proxy._detect_and_truncate_degenerate_repetition(self._resp(blob, "length"))
        choice = out["choices"][0]
        self.assertLess(len(choice["message"]["content"]), len(blob))
        self.assertEqual(choice["finish_reason"], "stop")

    def test_preserves_non_repetitive_text(self):
        """Normal text should not be modified."""
        prose = "This is a perfectly normal response with varied content. " * 5
        out = proxy._detect_and_truncate_degenerate_repetition(self._resp(prose, "stop"))
        self.assertEqual(out["choices"][0]["message"]["content"], prose)

    def test_preserves_short_text(self):
        """Short text (< 200 chars) should not be processed."""
        snippet = "Short response."
        out = proxy._detect_and_truncate_degenerate_repetition(self._resp(snippet, "stop"))
        self.assertEqual(out["choices"][0]["message"]["content"], snippet)

    def test_max_tokens_floor_skipped_for_non_tool_requests(self):
        """max_tokens floor should not inflate non-tool requests."""
        saved_floor = getattr(proxy, "PROXY_MAX_TOKENS_FLOOR")
        saved_disable = getattr(proxy, "PROXY_DISABLE_THINKING_ON_TOOL_TURNS")
        try:
            setattr(proxy, "PROXY_MAX_TOKENS_FLOOR", 16384)
            setattr(proxy, "PROXY_DISABLE_THINKING_ON_TOOL_TURNS", False)
            body = {
                "model": "test",
                "max_tokens": 100,
                "messages": [{"role": "user", "content": "generate a title"}],
            }
            openai = proxy.build_openai_request(body, proxy.SessionMonitor(context_window=0))
            # No tools = no floor inflation
            self.assertEqual(openai.get("max_tokens"), 100)
        finally:
            setattr(proxy, "PROXY_MAX_TOKENS_FLOOR", saved_floor)
            setattr(proxy, "PROXY_DISABLE_THINKING_ON_TOOL_TURNS", saved_disable)

    def test_max_tokens_floor_applied_when_thinking_active(self):
        """max_tokens floor should apply when tools present and thinking enabled."""
        saved_floor = getattr(proxy, "PROXY_MAX_TOKENS_FLOOR")
        saved_disable = getattr(proxy, "PROXY_DISABLE_THINKING_ON_TOOL_TURNS")
        try:
            setattr(proxy, "PROXY_MAX_TOKENS_FLOOR", 4096)
            setattr(proxy, "PROXY_DISABLE_THINKING_ON_TOOL_TURNS", False)
            body = {
                "model": "test",
                "max_tokens": 512,
                "messages": [{"role": "user", "content": "run command"}],
                "tools": [{"name": "Bash", "description": "run", "input_schema": {"type": "object"}}],
            }
            openai = proxy.build_openai_request(body, proxy.SessionMonitor(context_window=0))
            # Tools + thinking enabled = floor applied
            self.assertEqual(openai.get("max_tokens"), 4096)
        finally:
            setattr(proxy, "PROXY_MAX_TOKENS_FLOOR", saved_floor)
            setattr(proxy, "PROXY_DISABLE_THINKING_ON_TOOL_TURNS", saved_disable)
|
|
3518
|
+
|
|
3519
|
+
|
|
3380
3520
|
# Allow running this test module directly (e.g. `python test_proxy.py`).
if __name__ == "__main__":
    unittest.main()
|
|
3382
3522
|
|