@miller-tech/uap 1.20.19 → 1.20.21
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json
CHANGED
|
@@ -3044,6 +3044,21 @@ _SYSTEM_PROMPT_LEAK_MARKERS = (
|
|
|
3044
3044
|
"valid tool call with strict json",
|
|
3045
3045
|
"return exactly one valid tool call",
|
|
3046
3046
|
"invalid tool call format",
|
|
3047
|
+
# Option 1: Spec mode system-reminder phrases
|
|
3048
|
+
"spec mode is active",
|
|
3049
|
+
"spec mode active",
|
|
3050
|
+
"executed askuser tool to gather requirements",
|
|
3051
|
+
"gather requirements and clarify decisions",
|
|
3052
|
+
"before finalizing your spec",
|
|
3053
|
+
"you must not make any edits",
|
|
3054
|
+
# Option 2: Broader Claude Code system-reminder phrases
|
|
3055
|
+
"the user indicated that they do not want you to execute",
|
|
3056
|
+
"run any non-readonly tools",
|
|
3057
|
+
"making communications or interacting with external services",
|
|
3058
|
+
"this is encouraged in spec mode",
|
|
3059
|
+
"user has executed askuser tool",
|
|
3060
|
+
"<system-reminder>",
|
|
3061
|
+
"</system-reminder>",
|
|
3047
3062
|
)
|
|
3048
3063
|
|
|
3049
3064
|
|
|
@@ -4103,6 +4118,8 @@ def _build_malformed_retry_body(
|
|
|
4103
4118
|
tool_choice: str = "required",
|
|
4104
4119
|
attempt: int = 1,
|
|
4105
4120
|
total_attempts: int = 1,
|
|
4121
|
+
is_garbled: bool = False,
|
|
4122
|
+
exclude_tools: list[str] | None = None,
|
|
4106
4123
|
) -> dict:
|
|
4107
4124
|
retry_body = dict(openai_body)
|
|
4108
4125
|
retry_body["stream"] = False
|
|
@@ -4137,7 +4154,16 @@ def _build_malformed_retry_body(
|
|
|
4137
4154
|
sanitized = _sanitize_assistant_messages_for_retry(existing_messages)
|
|
4138
4155
|
retry_body["messages"] = [*sanitized, malformed_retry_instruction]
|
|
4139
4156
|
|
|
4140
|
-
|
|
4157
|
+
# Option 1: Progressive garbled-cap within retries — use smaller max_tokens
|
|
4158
|
+
# when the issue involves garbled/degenerate args to limit degeneration room.
|
|
4159
|
+
if is_garbled and PROXY_TOOL_TURN_MAX_TOKENS_GARBLED > 0:
|
|
4160
|
+
retry_body["max_tokens"] = PROXY_TOOL_TURN_MAX_TOKENS_GARBLED
|
|
4161
|
+
logger.info(
|
|
4162
|
+
"RETRY GARBLED CAP: max_tokens=%d for garbled retry attempt=%d",
|
|
4163
|
+
PROXY_TOOL_TURN_MAX_TOKENS_GARBLED,
|
|
4164
|
+
attempt,
|
|
4165
|
+
)
|
|
4166
|
+
elif PROXY_MALFORMED_TOOL_RETRY_MAX_TOKENS > 0:
|
|
4141
4167
|
current_max = int(
|
|
4142
4168
|
retry_body.get("max_tokens", PROXY_MALFORMED_TOOL_RETRY_MAX_TOKENS)
|
|
4143
4169
|
)
|
|
@@ -4151,6 +4177,23 @@ def _build_malformed_retry_body(
|
|
|
4151
4177
|
anthropic_body.get("tools", [])
|
|
4152
4178
|
)
|
|
4153
4179
|
|
|
4180
|
+
# Option 3: Exclude specific failing tools from retry to let the model
|
|
4181
|
+
# pick an alternative when a tool consistently produces garbled args.
|
|
4182
|
+
if exclude_tools and retry_body.get("tools"):
|
|
4183
|
+
exclude_lower = {t.lower() for t in exclude_tools}
|
|
4184
|
+
original_count = len(retry_body["tools"])
|
|
4185
|
+
retry_body["tools"] = [
|
|
4186
|
+
t for t in retry_body["tools"]
|
|
4187
|
+
if t.get("function", {}).get("name", "").lower() not in exclude_lower
|
|
4188
|
+
]
|
|
4189
|
+
if len(retry_body["tools"]) < original_count:
|
|
4190
|
+
logger.info(
|
|
4191
|
+
"RETRY TOOL NARROWING: excluded %s, tools %d -> %d",
|
|
4192
|
+
exclude_tools,
|
|
4193
|
+
original_count,
|
|
4194
|
+
len(retry_body["tools"]),
|
|
4195
|
+
)
|
|
4196
|
+
|
|
4154
4197
|
if PROXY_DISABLE_THINKING_ON_TOOL_TURNS:
|
|
4155
4198
|
retry_body["enable_thinking"] = False
|
|
4156
4199
|
|
|
@@ -4373,8 +4416,16 @@ async def _apply_malformed_tool_guardrail(
|
|
|
4373
4416
|
|
|
4374
4417
|
monitor.maybe_activate_forced_tool_dampener(issue.kind)
|
|
4375
4418
|
excerpt = _openai_message_text(working_resp)[:220].replace("\n", " ")
|
|
4419
|
+
# Option 2: Log garbled argument content for diagnostics
|
|
4420
|
+
arg_excerpt = ""
|
|
4421
|
+
if issue.kind == "invalid_tool_args":
|
|
4422
|
+
for tc in (working_resp.get("choices", [{}])[0].get("message", {}).get("tool_calls", [])):
|
|
4423
|
+
raw_args = tc.get("function", {}).get("arguments", "")
|
|
4424
|
+
if raw_args and _is_garbled_tool_arguments(raw_args):
|
|
4425
|
+
arg_excerpt = raw_args[:200].replace("\n", " ")
|
|
4426
|
+
break
|
|
4376
4427
|
logger.warning(
|
|
4377
|
-
"TOOL RESPONSE ISSUE: session=%s kind=%s reason=%s malformed=%d invalid=%d required_miss=%d excerpt=%.220s",
|
|
4428
|
+
"TOOL RESPONSE ISSUE: session=%s kind=%s reason=%s malformed=%d invalid=%d required_miss=%d excerpt=%.220s args=%.200s",
|
|
4378
4429
|
session_id,
|
|
4379
4430
|
issue.kind,
|
|
4380
4431
|
issue.reason,
|
|
@@ -4382,16 +4433,27 @@ async def _apply_malformed_tool_guardrail(
|
|
|
4382
4433
|
monitor.invalid_tool_call_streak,
|
|
4383
4434
|
monitor.required_tool_miss_streak,
|
|
4384
4435
|
excerpt,
|
|
4436
|
+
arg_excerpt,
|
|
4385
4437
|
)
|
|
4386
4438
|
|
|
4387
4439
|
attempts = max(0, PROXY_MALFORMED_TOOL_RETRY_MAX)
|
|
4388
4440
|
current_issue = issue
|
|
4441
|
+
# Track failing tool names for Option 3 (tool narrowing on retry)
|
|
4442
|
+
failing_tools: set[str] = set()
|
|
4443
|
+
if issue.kind == "invalid_tool_args":
|
|
4444
|
+
for tc in (working_resp.get("choices", [{}])[0].get("message", {}).get("tool_calls", [])):
|
|
4445
|
+
fn_name = tc.get("function", {}).get("name", "")
|
|
4446
|
+
raw_args = tc.get("function", {}).get("arguments", "")
|
|
4447
|
+
if fn_name and raw_args and _is_garbled_tool_arguments(raw_args):
|
|
4448
|
+
failing_tools.add(fn_name)
|
|
4389
4449
|
for attempt in range(attempts):
|
|
4390
4450
|
attempt_tool_choice = _retry_tool_choice_for_attempt(
|
|
4391
4451
|
required_tool_choice,
|
|
4392
4452
|
attempt,
|
|
4393
4453
|
attempts,
|
|
4394
4454
|
)
|
|
4455
|
+
# Option 3: From the second retry onward (0-based loop index attempt >= 1, i.e. 1-based attempt >= 2), exclude tools that have produced garbled args
|
|
4456
|
+
exclude = list(failing_tools) if attempt >= 1 and failing_tools else None
|
|
4395
4457
|
retry_body = _build_malformed_retry_body(
|
|
4396
4458
|
openai_body,
|
|
4397
4459
|
anthropic_body,
|
|
@@ -4399,6 +4461,8 @@ async def _apply_malformed_tool_guardrail(
|
|
|
4399
4461
|
tool_choice=attempt_tool_choice,
|
|
4400
4462
|
attempt=attempt + 1,
|
|
4401
4463
|
total_attempts=attempts,
|
|
4464
|
+
is_garbled=current_issue.kind == "invalid_tool_args",
|
|
4465
|
+
exclude_tools=exclude,
|
|
4402
4466
|
)
|
|
4403
4467
|
retry_resp = await client.post(
|
|
4404
4468
|
f"{LLAMA_CPP_BASE}/chat/completions",
|
|
@@ -4471,6 +4535,12 @@ async def _apply_malformed_tool_guardrail(
|
|
|
4471
4535
|
elif retry_issue.kind == "invalid_tool_args":
|
|
4472
4536
|
monitor.invalid_tool_call_streak += 1
|
|
4473
4537
|
monitor.arg_preflight_rejections += 1
|
|
4538
|
+
# Track failing tools from retries for progressive narrowing
|
|
4539
|
+
for tc in (retry_working.get("choices", [{}])[0].get("message", {}).get("tool_calls", [])):
|
|
4540
|
+
fn_name = tc.get("function", {}).get("name", "")
|
|
4541
|
+
raw_args = tc.get("function", {}).get("arguments", "")
|
|
4542
|
+
if fn_name and raw_args and _is_garbled_tool_arguments(raw_args):
|
|
4543
|
+
failing_tools.add(fn_name)
|
|
4474
4544
|
|
|
4475
4545
|
monitor.maybe_activate_forced_tool_dampener(retry_issue.kind)
|
|
4476
4546
|
logger.warning(
|
|
@@ -3872,3 +3872,175 @@ class TestFinalizeTurnToolCallLeak(unittest.TestCase):
|
|
|
3872
3872
|
for block in text_blocks:
|
|
3873
3873
|
self.assertNotIn("<tool_call>", block["text"])
|
|
3874
3874
|
self.assertNotIn("</tool_call>", block["text"])
|
|
3875
|
+
|
|
3876
|
+
|
|
3877
|
+
class TestRetryGarbledImprovements(unittest.TestCase):
|
|
3878
|
+
"""Tests for progressive garbled cap, arg logging, and tool narrowing on retries."""
|
|
3879
|
+
|
|
3880
|
+
def test_garbled_cap_applied_in_retry_body(self):
|
|
3881
|
+
"""When is_garbled=True, retry body uses PROXY_TOOL_TURN_MAX_TOKENS_GARBLED."""
|
|
3882
|
+
openai_body = {
|
|
3883
|
+
"model": "test-model",
|
|
3884
|
+
"max_tokens": 8192,
|
|
3885
|
+
"messages": [{"role": "user", "content": "test"}],
|
|
3886
|
+
"tools": [],
|
|
3887
|
+
}
|
|
3888
|
+
anthropic_body = {"messages": [{"role": "user", "content": "test"}]}
|
|
3889
|
+
retry_body = proxy._build_malformed_retry_body(
|
|
3890
|
+
openai_body,
|
|
3891
|
+
anthropic_body,
|
|
3892
|
+
retry_hint="fix it",
|
|
3893
|
+
tool_choice="required",
|
|
3894
|
+
attempt=1,
|
|
3895
|
+
total_attempts=3,
|
|
3896
|
+
is_garbled=True,
|
|
3897
|
+
)
|
|
3898
|
+
self.assertEqual(retry_body["max_tokens"], proxy.PROXY_TOOL_TURN_MAX_TOKENS_GARBLED)
|
|
3899
|
+
|
|
3900
|
+
def test_non_garbled_uses_standard_retry_max(self):
|
|
3901
|
+
"""When is_garbled=False, retry body uses PROXY_MALFORMED_TOOL_RETRY_MAX_TOKENS."""
|
|
3902
|
+
openai_body = {
|
|
3903
|
+
"model": "test-model",
|
|
3904
|
+
"max_tokens": 8192,
|
|
3905
|
+
"messages": [{"role": "user", "content": "test"}],
|
|
3906
|
+
"tools": [],
|
|
3907
|
+
}
|
|
3908
|
+
anthropic_body = {"messages": [{"role": "user", "content": "test"}]}
|
|
3909
|
+
retry_body = proxy._build_malformed_retry_body(
|
|
3910
|
+
openai_body,
|
|
3911
|
+
anthropic_body,
|
|
3912
|
+
retry_hint="fix it",
|
|
3913
|
+
tool_choice="required",
|
|
3914
|
+
attempt=1,
|
|
3915
|
+
total_attempts=3,
|
|
3916
|
+
is_garbled=False,
|
|
3917
|
+
)
|
|
3918
|
+
if proxy.PROXY_MALFORMED_TOOL_RETRY_MAX_TOKENS > 0:
|
|
3919
|
+
self.assertLessEqual(retry_body["max_tokens"], proxy.PROXY_MALFORMED_TOOL_RETRY_MAX_TOKENS)
|
|
3920
|
+
|
|
3921
|
+
def test_exclude_tools_removes_from_retry(self):
|
|
3922
|
+
"""exclude_tools parameter removes specified tools from retry body."""
|
|
3923
|
+
openai_body = {
|
|
3924
|
+
"model": "test-model",
|
|
3925
|
+
"max_tokens": 8192,
|
|
3926
|
+
"messages": [{"role": "user", "content": "test"}],
|
|
3927
|
+
"tools": [
|
|
3928
|
+
{"type": "function", "function": {"name": "Grep", "description": "search", "parameters": {"type": "object"}}},
|
|
3929
|
+
{"type": "function", "function": {"name": "Read", "description": "read", "parameters": {"type": "object"}}},
|
|
3930
|
+
{"type": "function", "function": {"name": "Bash", "description": "run", "parameters": {"type": "object"}}},
|
|
3931
|
+
],
|
|
3932
|
+
}
|
|
3933
|
+
anthropic_body = {
|
|
3934
|
+
"messages": [{"role": "user", "content": "test"}],
|
|
3935
|
+
"tools": [
|
|
3936
|
+
{"name": "Grep", "description": "search", "input_schema": {"type": "object"}},
|
|
3937
|
+
{"name": "Read", "description": "read", "input_schema": {"type": "object"}},
|
|
3938
|
+
{"name": "Bash", "description": "run", "input_schema": {"type": "object"}},
|
|
3939
|
+
],
|
|
3940
|
+
}
|
|
3941
|
+
retry_body = proxy._build_malformed_retry_body(
|
|
3942
|
+
openai_body,
|
|
3943
|
+
anthropic_body,
|
|
3944
|
+
retry_hint="fix it",
|
|
3945
|
+
tool_choice="required",
|
|
3946
|
+
attempt=2,
|
|
3947
|
+
total_attempts=3,
|
|
3948
|
+
exclude_tools=["Grep"],
|
|
3949
|
+
)
|
|
3950
|
+
tool_names = [t["function"]["name"] for t in retry_body.get("tools", [])]
|
|
3951
|
+
self.assertNotIn("Grep", tool_names)
|
|
3952
|
+
self.assertIn("Read", tool_names)
|
|
3953
|
+
self.assertIn("Bash", tool_names)
|
|
3954
|
+
|
|
3955
|
+
def test_exclude_tools_none_keeps_all(self):
|
|
3956
|
+
"""When exclude_tools is None, all tools are retained."""
|
|
3957
|
+
openai_body = {
|
|
3958
|
+
"model": "test-model",
|
|
3959
|
+
"max_tokens": 8192,
|
|
3960
|
+
"messages": [{"role": "user", "content": "test"}],
|
|
3961
|
+
"tools": [
|
|
3962
|
+
{"type": "function", "function": {"name": "Grep", "description": "search", "parameters": {"type": "object"}}},
|
|
3963
|
+
],
|
|
3964
|
+
}
|
|
3965
|
+
anthropic_body = {
|
|
3966
|
+
"messages": [{"role": "user", "content": "test"}],
|
|
3967
|
+
"tools": [
|
|
3968
|
+
{"name": "Grep", "description": "search", "input_schema": {"type": "object"}},
|
|
3969
|
+
],
|
|
3970
|
+
}
|
|
3971
|
+
retry_body = proxy._build_malformed_retry_body(
|
|
3972
|
+
openai_body,
|
|
3973
|
+
anthropic_body,
|
|
3974
|
+
retry_hint="fix it",
|
|
3975
|
+
tool_choice="required",
|
|
3976
|
+
attempt=2,
|
|
3977
|
+
total_attempts=3,
|
|
3978
|
+
exclude_tools=None,
|
|
3979
|
+
)
|
|
3980
|
+
tool_names = [t["function"]["name"] for t in retry_body.get("tools", [])]
|
|
3981
|
+
self.assertIn("Grep", tool_names)
|
|
3982
|
+
|
|
3983
|
+
def test_garbled_args_excerpt_in_issue(self):
|
|
3984
|
+
"""_is_garbled_tool_arguments detects garbled content for logging."""
|
|
3985
|
+
# Garbled pattern: runaway braces
|
|
3986
|
+
garbled = '{"pattern": "test}}}}}}}}}}}}}}"}'
|
|
3987
|
+
self.assertTrue(proxy._is_garbled_tool_arguments(garbled))
|
|
3988
|
+
# Clean pattern
|
|
3989
|
+
clean = '{"pattern": "hello", "path": "/src"}'
|
|
3990
|
+
self.assertFalse(proxy._is_garbled_tool_arguments(clean))
|
|
3991
|
+
|
|
3992
|
+
|
|
3993
|
+
class TestSpecModeLeakMarkers(unittest.TestCase):
|
|
3994
|
+
"""Tests for spec mode and system-reminder leak detection markers."""
|
|
3995
|
+
|
|
3996
|
+
def test_spec_mode_active_detected(self):
|
|
3997
|
+
"""Spec mode system prompt text is detected as a leak."""
|
|
3998
|
+
value = {"patterns": ["**Spec mode is active. The user indicated that they do not want you to execute"]}
|
|
3999
|
+
self.assertTrue(proxy._contains_system_prompt_leak(value))
|
|
4000
|
+
|
|
4001
|
+
def test_system_reminder_tags_detected(self):
|
|
4002
|
+
"""Raw <system-reminder> tags in args are detected as a leak."""
|
|
4003
|
+
value = {"content": "<system-reminder>\nSpec mode active\n</system-reminder>"}
|
|
4004
|
+
self.assertTrue(proxy._contains_system_prompt_leak(value))
|
|
4005
|
+
|
|
4006
|
+
def test_gather_requirements_detected(self):
|
|
4007
|
+
"""'gather requirements and clarify decisions' phrase is detected."""
|
|
4008
|
+
value = {"text": "executed AskUser tool to gather requirements and clarify decisions before finalizing your spec"}
|
|
4009
|
+
self.assertTrue(proxy._contains_system_prompt_leak(value))
|
|
4010
|
+
|
|
4011
|
+
def test_clean_args_not_flagged(self):
|
|
4012
|
+
"""Normal tool arguments are not flagged as leaks."""
|
|
4013
|
+
value = {"pattern": "*.ts", "path": "/home/user/project/src"}
|
|
4014
|
+
self.assertFalse(proxy._contains_system_prompt_leak(value))
|
|
4015
|
+
|
|
4016
|
+
def test_repair_truncates_string_arg_at_spec_mode_leak(self):
|
|
4017
|
+
"""_repair_system_prompt_leak truncates string args at spec mode leak point."""
|
|
4018
|
+
openai_resp = {
|
|
4019
|
+
"choices": [{
|
|
4020
|
+
"index": 0,
|
|
4021
|
+
"message": {
|
|
4022
|
+
"role": "assistant",
|
|
4023
|
+
"tool_calls": [{
|
|
4024
|
+
"id": "call_test",
|
|
4025
|
+
"type": "function",
|
|
4026
|
+
"function": {
|
|
4027
|
+
"name": "Grep",
|
|
4028
|
+
"arguments": '{"pattern":"TODO Spec mode is active. The user indicated"}'
|
|
4029
|
+
}
|
|
4030
|
+
}]
|
|
4031
|
+
},
|
|
4032
|
+
"finish_reason": "tool_calls",
|
|
4033
|
+
}],
|
|
4034
|
+
}
|
|
4035
|
+
repaired, count = proxy._repair_system_prompt_leak(openai_resp)
|
|
4036
|
+
self.assertGreater(count, 0)
|
|
4037
|
+
args_str = repaired["choices"][0]["message"]["tool_calls"][0]["function"]["arguments"]
|
|
4038
|
+
self.assertNotIn("spec mode is active", args_str.lower())
|
|
4039
|
+
# The valid prefix should be preserved
|
|
4040
|
+
parsed = json.loads(args_str)
|
|
4041
|
+
self.assertTrue(parsed["pattern"].startswith("TODO"))
|
|
4042
|
+
|
|
4043
|
+
def test_detection_works_on_list_values(self):
|
|
4044
|
+
"""_contains_system_prompt_leak detects leaks inside list values."""
|
|
4045
|
+
value = {"patterns": ["**Spec mode is active. The user indicated"]}
|
|
4046
|
+
self.assertTrue(proxy._contains_system_prompt_leak(value))
|