@miller-tech/uap 1.20.16 → 1.20.18
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json
CHANGED
|
@@ -42,6 +42,12 @@ Configuration (Environment Variables)
|
|
|
42
42
|
PROXY_READ_TIMEOUT Read timeout in seconds for upstream LLM streaming
|
|
43
43
|
Default: 600 (10 minutes)
|
|
44
44
|
|
|
45
|
+
PROXY_TOOL_TURN_MAX_TOKENS Max tokens for tool-call turns (0 to disable)
|
|
46
|
+
Default: 8192
|
|
47
|
+
|
|
48
|
+
PROXY_TOOL_TURN_MAX_TOKENS_GARBLED Max tokens for tool-call turns after garbled/malformed output (0 to use PROXY_TOOL_TURN_MAX_TOKENS)
|
|
49
|
+
Default: 4096
|
|
50
|
+
|
|
45
51
|
PROXY_MAX_CONNECTIONS Max concurrent connections to upstream
|
|
46
52
|
Default: 20
|
|
47
53
|
|
|
@@ -194,6 +200,10 @@ PROXY_STREAM_REASONING_MAX_CHARS = int(
|
|
|
194
200
|
os.environ.get("PROXY_STREAM_REASONING_MAX_CHARS", "240")
|
|
195
201
|
)
|
|
196
202
|
PROXY_MAX_TOKENS_FLOOR = int(os.environ.get("PROXY_MAX_TOKENS_FLOOR", "16384"))
|
|
203
|
+
PROXY_TOOL_TURN_MAX_TOKENS = int(os.environ.get("PROXY_TOOL_TURN_MAX_TOKENS", "8192"))
|
|
204
|
+
PROXY_TOOL_TURN_MAX_TOKENS_GARBLED = int(
|
|
205
|
+
os.environ.get("PROXY_TOOL_TURN_MAX_TOKENS_GARBLED", "4096")
|
|
206
|
+
)
|
|
197
207
|
PROXY_TOOL_NARROWING = os.environ.get("PROXY_TOOL_NARROWING", "off").lower() not in {
|
|
198
208
|
"0",
|
|
199
209
|
"false",
|
|
@@ -631,6 +641,7 @@ class SessionMonitor:
|
|
|
631
641
|
tool_state_review_cycles: int = 0
|
|
632
642
|
last_tool_fingerprint: str = ""
|
|
633
643
|
cycling_tool_names: list = field(default_factory=list)
|
|
644
|
+
last_response_garbled: bool = False # previous turn had garbled/malformed output
|
|
634
645
|
finalize_turn_active: bool = False
|
|
635
646
|
completion_required: bool = False
|
|
636
647
|
completion_pending: bool = False
|
|
@@ -1457,6 +1468,11 @@ async def lifespan(app: FastAPI):
|
|
|
1457
1468
|
int(PROXY_GENERATION_TIMEOUT),
|
|
1458
1469
|
int(PROXY_SLOT_HANG_TIMEOUT),
|
|
1459
1470
|
)
|
|
1471
|
+
logger.info(
|
|
1472
|
+
"Tool turn max_tokens: cap=%d garbled_cap=%d",
|
|
1473
|
+
PROXY_TOOL_TURN_MAX_TOKENS,
|
|
1474
|
+
PROXY_TOOL_TURN_MAX_TOKENS_GARBLED,
|
|
1475
|
+
)
|
|
1460
1476
|
|
|
1461
1477
|
yield
|
|
1462
1478
|
await http_client.aclose()
|
|
@@ -2316,6 +2332,23 @@ def build_openai_request(
|
|
|
2316
2332
|
)
|
|
2317
2333
|
requested_max = PROXY_OPUS46_MAX_TOKENS_HIGH_CTX
|
|
2318
2334
|
|
|
2335
|
+
# Option 1+3+4: Cap max_tokens for tool turns to prevent 32K waste.
|
|
2336
|
+
# Tool call responses rarely need more than a few thousand tokens.
|
|
2337
|
+
# After garbled/malformed output, use an even lower cap.
|
|
2338
|
+
if has_tools and PROXY_TOOL_TURN_MAX_TOKENS > 0:
|
|
2339
|
+
if monitor.last_response_garbled and PROXY_TOOL_TURN_MAX_TOKENS_GARBLED > 0:
|
|
2340
|
+
tool_cap = PROXY_TOOL_TURN_MAX_TOKENS_GARBLED
|
|
2341
|
+
else:
|
|
2342
|
+
tool_cap = PROXY_TOOL_TURN_MAX_TOKENS
|
|
2343
|
+
if requested_max > tool_cap:
|
|
2344
|
+
logger.info(
|
|
2345
|
+
"TOOL TURN MAX_TOKENS cap: %d -> %d (garbled_prev=%s)",
|
|
2346
|
+
requested_max,
|
|
2347
|
+
tool_cap,
|
|
2348
|
+
monitor.last_response_garbled,
|
|
2349
|
+
)
|
|
2350
|
+
requested_max = tool_cap
|
|
2351
|
+
|
|
2319
2352
|
openai_body["max_tokens"] = requested_max
|
|
2320
2353
|
if "temperature" in anthropic_body:
|
|
2321
2354
|
openai_body["temperature"] = anthropic_body["temperature"]
|
|
@@ -3634,6 +3667,16 @@ def _validate_tool_call_arguments(
|
|
|
3634
3667
|
),
|
|
3635
3668
|
)
|
|
3636
3669
|
|
|
3670
|
+
if _is_garbled_tool_arguments(arg_text):
|
|
3671
|
+
return ToolResponseIssue(
|
|
3672
|
+
kind="invalid_tool_args",
|
|
3673
|
+
reason=f"arguments for '{tool_name}' contain garbled/degenerate content",
|
|
3674
|
+
retry_hint=(
|
|
3675
|
+
f"Emit exactly one `{tool_name}` tool call with well-formed JSON arguments. "
|
|
3676
|
+
"Do not repeat closing braces, brackets, or digits."
|
|
3677
|
+
),
|
|
3678
|
+
)
|
|
3679
|
+
|
|
3637
3680
|
if _contains_required_placeholder(parsed):
|
|
3638
3681
|
return ToolResponseIssue(
|
|
3639
3682
|
kind="invalid_tool_args",
|
|
@@ -4250,6 +4293,7 @@ async def _apply_malformed_tool_guardrail(
|
|
|
4250
4293
|
monitor.malformed_tool_streak = 0
|
|
4251
4294
|
monitor.invalid_tool_call_streak = 0
|
|
4252
4295
|
monitor.required_tool_miss_streak = 0
|
|
4296
|
+
monitor.last_response_garbled = False
|
|
4253
4297
|
if repair_count > 0:
|
|
4254
4298
|
monitor.arg_preflight_repairs += repair_count
|
|
4255
4299
|
logger.info(
|
|
@@ -4259,6 +4303,9 @@ async def _apply_malformed_tool_guardrail(
|
|
|
4259
4303
|
)
|
|
4260
4304
|
return working_resp
|
|
4261
4305
|
|
|
4306
|
+
# Mark garbled state for progressive max_tokens reduction on next turn
|
|
4307
|
+
monitor.last_response_garbled = True
|
|
4308
|
+
|
|
4262
4309
|
if issue.kind == "malformed_payload":
|
|
4263
4310
|
monitor.malformed_tool_streak += 1
|
|
4264
4311
|
elif issue.kind == "invalid_tool_args":
|
|
@@ -4344,6 +4391,7 @@ async def _apply_malformed_tool_guardrail(
|
|
|
4344
4391
|
monitor.malformed_tool_streak = 0
|
|
4345
4392
|
monitor.invalid_tool_call_streak = 0
|
|
4346
4393
|
monitor.required_tool_miss_streak = 0
|
|
4394
|
+
monitor.last_response_garbled = False
|
|
4347
4395
|
logger.info(
|
|
4348
4396
|
"TOOL RESPONSE RETRY success: kind=%s attempt=%d/%d",
|
|
4349
4397
|
current_issue.kind,
|
|
@@ -3650,3 +3650,136 @@ class TestCompletionContractGuardrails(unittest.TestCase):
|
|
|
3650
3650
|
self.assertEqual(reason, "completion_pending")
|
|
3651
3651
|
self.assertEqual(monitor.tool_turn_phase, "review")
|
|
3652
3652
|
self.assertEqual(monitor.completion_recovery_attempts, 1)
|
|
3653
|
+
|
|
3654
|
+
|
|
3655
|
+
class TestGarbledArgsRetry(unittest.TestCase):
|
|
3656
|
+
"""Tests for garbled tool arguments triggering retry via _validate_tool_call_arguments."""
|
|
3657
|
+
|
|
3658
|
+
def test_garbled_runaway_braces_triggers_retry(self):
|
|
3659
|
+
"""Garbled brace imbalance should return an invalid_tool_args issue."""
|
|
3660
|
+
# Valid JSON but with extreme brace imbalance in string value
|
|
3661
|
+
garbled_args = '{"todos": "}}}}}}}}}}}}}"}'
|
|
3662
|
+
issue = proxy._validate_tool_call_arguments(
|
|
3663
|
+
"TodoWrite", garbled_args, {}, {"TodoWrite"}
|
|
3664
|
+
)
|
|
3665
|
+
self.assertTrue(issue.has_issue())
|
|
3666
|
+
self.assertEqual(issue.kind, "invalid_tool_args")
|
|
3667
|
+
self.assertIn("garbled", issue.reason)
|
|
3668
|
+
|
|
3669
|
+
def test_garbled_repetitive_digits_triggers_retry(self):
|
|
3670
|
+
"""Repetitive digit patterns should return an invalid_tool_args issue."""
|
|
3671
|
+
garbled_args = '{"value": "398859738398859738398859738"}'
|
|
3672
|
+
issue = proxy._validate_tool_call_arguments(
|
|
3673
|
+
"Bash", garbled_args, {}, {"Bash"}
|
|
3674
|
+
)
|
|
3675
|
+
self.assertTrue(issue.has_issue())
|
|
3676
|
+
self.assertEqual(issue.kind, "invalid_tool_args")
|
|
3677
|
+
self.assertIn("garbled", issue.reason)
|
|
3678
|
+
|
|
3679
|
+
def test_clean_args_pass_garbled_check(self):
|
|
3680
|
+
"""Well-formed tool arguments should not be flagged as garbled."""
|
|
3681
|
+
clean_args = '{"command": "echo hello world"}'
|
|
3682
|
+
issue = proxy._validate_tool_call_arguments(
|
|
3683
|
+
"Bash", clean_args, {}, {"Bash"}
|
|
3684
|
+
)
|
|
3685
|
+
self.assertFalse(issue.has_issue())
|
|
3686
|
+
|
|
3687
|
+
def test_garbled_detection_before_schema_validation(self):
|
|
3688
|
+
"""Garbled args should be caught even without schema info."""
|
|
3689
|
+
garbled_args = '{"content": "' + "0" * 40 + '"}'
|
|
3690
|
+
issue = proxy._validate_tool_call_arguments(
|
|
3691
|
+
"Write", garbled_args, {}, {"Write"}
|
|
3692
|
+
)
|
|
3693
|
+
self.assertTrue(issue.has_issue())
|
|
3694
|
+
self.assertEqual(issue.kind, "invalid_tool_args")
|
|
3695
|
+
|
|
3696
|
+
def test_env_sync_malformed_retry_max(self):
|
|
3697
|
+
"""PROXY_MALFORMED_TOOL_RETRY_MAX should be 3."""
|
|
3698
|
+
self.assertEqual(proxy.PROXY_MALFORMED_TOOL_RETRY_MAX, 3)
|
|
3699
|
+
|
|
3700
|
+
|
|
3701
|
+
class TestToolTurnMaxTokensCap(unittest.TestCase):
|
|
3702
|
+
"""Tests for tool turn max_tokens capping to prevent 32K waste."""
|
|
3703
|
+
|
|
3704
|
+
def test_tool_turn_max_tokens_constant(self):
|
|
3705
|
+
"""PROXY_TOOL_TURN_MAX_TOKENS should default to 8192."""
|
|
3706
|
+
self.assertEqual(proxy.PROXY_TOOL_TURN_MAX_TOKENS, 8192)
|
|
3707
|
+
|
|
3708
|
+
def test_tool_turn_max_tokens_garbled_constant(self):
|
|
3709
|
+
"""PROXY_TOOL_TURN_MAX_TOKENS_GARBLED should default to 4096."""
|
|
3710
|
+
self.assertEqual(proxy.PROXY_TOOL_TURN_MAX_TOKENS_GARBLED, 4096)
|
|
3711
|
+
|
|
3712
|
+
def test_tool_turn_caps_high_max_tokens(self):
|
|
3713
|
+
"""Tool turn with max_tokens=32000 should be capped to 8192."""
|
|
3714
|
+
body = {
|
|
3715
|
+
"model": "test-model",
|
|
3716
|
+
"max_tokens": 32000,
|
|
3717
|
+
"messages": [{"role": "user", "content": "test"}],
|
|
3718
|
+
"tools": [
|
|
3719
|
+
{
|
|
3720
|
+
"name": "Bash",
|
|
3721
|
+
"description": "run command",
|
|
3722
|
+
"input_schema": {"type": "object"},
|
|
3723
|
+
}
|
|
3724
|
+
],
|
|
3725
|
+
}
|
|
3726
|
+
monitor = proxy.SessionMonitor(context_window=262144)
|
|
3727
|
+
openai_body = proxy.build_openai_request(body, monitor)
|
|
3728
|
+
self.assertLessEqual(openai_body["max_tokens"], proxy.PROXY_TOOL_TURN_MAX_TOKENS)
|
|
3729
|
+
|
|
3730
|
+
def test_tool_turn_garbled_reduces_cap(self):
|
|
3731
|
+
"""After garbled output, max_tokens should use the lower garbled cap."""
|
|
3732
|
+
body = {
|
|
3733
|
+
"model": "test-model",
|
|
3734
|
+
"max_tokens": 32000,
|
|
3735
|
+
"messages": [{"role": "user", "content": "test"}],
|
|
3736
|
+
"tools": [
|
|
3737
|
+
{
|
|
3738
|
+
"name": "Bash",
|
|
3739
|
+
"description": "run command",
|
|
3740
|
+
"input_schema": {"type": "object"},
|
|
3741
|
+
}
|
|
3742
|
+
],
|
|
3743
|
+
}
|
|
3744
|
+
monitor = proxy.SessionMonitor(context_window=262144)
|
|
3745
|
+
monitor.last_response_garbled = True
|
|
3746
|
+
openai_body = proxy.build_openai_request(body, monitor)
|
|
3747
|
+
self.assertLessEqual(
|
|
3748
|
+
openai_body["max_tokens"], proxy.PROXY_TOOL_TURN_MAX_TOKENS_GARBLED
|
|
3749
|
+
)
|
|
3750
|
+
|
|
3751
|
+
def test_non_tool_request_not_capped(self):
|
|
3752
|
+
"""Non-tool requests should not be affected by tool turn cap."""
|
|
3753
|
+
body = {
|
|
3754
|
+
"model": "test-model",
|
|
3755
|
+
"max_tokens": 32000,
|
|
3756
|
+
"messages": [{"role": "user", "content": "test"}],
|
|
3757
|
+
}
|
|
3758
|
+
monitor = proxy.SessionMonitor(context_window=262144)
|
|
3759
|
+
openai_body = proxy.build_openai_request(body, monitor)
|
|
3760
|
+
# Should not be capped to 8192 (may be capped by context window logic)
|
|
3761
|
+
self.assertGreater(openai_body["max_tokens"], proxy.PROXY_TOOL_TURN_MAX_TOKENS)
|
|
3762
|
+
|
|
3763
|
+
def test_last_response_garbled_cleared_on_clean(self):
|
|
3764
|
+
"""SessionMonitor.last_response_garbled should default to False."""
|
|
3765
|
+
monitor = proxy.SessionMonitor(context_window=262144)
|
|
3766
|
+
self.assertFalse(monitor.last_response_garbled)
|
|
3767
|
+
|
|
3768
|
+
def test_small_max_tokens_stays_within_cap(self):
|
|
3769
|
+
"""If client requests less than the cap, result should not exceed cap."""
|
|
3770
|
+
body = {
|
|
3771
|
+
"model": "test-model",
|
|
3772
|
+
"max_tokens": 4096,
|
|
3773
|
+
"messages": [{"role": "user", "content": "test"}],
|
|
3774
|
+
"tools": [
|
|
3775
|
+
{
|
|
3776
|
+
"name": "Bash",
|
|
3777
|
+
"description": "run command",
|
|
3778
|
+
"input_schema": {"type": "object"},
|
|
3779
|
+
}
|
|
3780
|
+
],
|
|
3781
|
+
}
|
|
3782
|
+
monitor = proxy.SessionMonitor(context_window=262144)
|
|
3783
|
+
openai_body = proxy.build_openai_request(body, monitor)
|
|
3784
|
+
# The tool turn cap should ensure we don't exceed PROXY_TOOL_TURN_MAX_TOKENS
|
|
3785
|
+
self.assertLessEqual(openai_body["max_tokens"], proxy.PROXY_TOOL_TURN_MAX_TOKENS)
|