@miller-tech/uap 1.20.17 → 1.20.18

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@miller-tech/uap",
3
- "version": "1.20.17",
3
+ "version": "1.20.18",
4
4
  "description": "Autonomous AI agent memory system with CLAUDE.md protocol enforcement",
5
5
  "type": "module",
6
6
  "main": "dist/index.js",
@@ -42,6 +42,12 @@ Configuration (Environment Variables)
42
42
  PROXY_READ_TIMEOUT Read timeout in seconds for upstream LLM streaming
43
43
  Default: 600 (10 minutes)
44
44
 
45
+ PROXY_TOOL_TURN_MAX_TOKENS Max tokens for tool-call turns (0 to disable)
46
+ Default: 8192
47
+
48
+ PROXY_TOOL_TURN_MAX_TOKENS_GARBLED Max tokens after garbled/malformed output
49
+ Default: 4096
50
+
45
51
  PROXY_MAX_CONNECTIONS Max concurrent connections to upstream
46
52
  Default: 20
47
53
 
@@ -194,6 +200,10 @@ PROXY_STREAM_REASONING_MAX_CHARS = int(
194
200
  os.environ.get("PROXY_STREAM_REASONING_MAX_CHARS", "240")
195
201
  )
196
202
  PROXY_MAX_TOKENS_FLOOR = int(os.environ.get("PROXY_MAX_TOKENS_FLOOR", "16384"))
203
+ PROXY_TOOL_TURN_MAX_TOKENS = int(os.environ.get("PROXY_TOOL_TURN_MAX_TOKENS", "8192"))
204
+ PROXY_TOOL_TURN_MAX_TOKENS_GARBLED = int(
205
+ os.environ.get("PROXY_TOOL_TURN_MAX_TOKENS_GARBLED", "4096")
206
+ )
197
207
  PROXY_TOOL_NARROWING = os.environ.get("PROXY_TOOL_NARROWING", "off").lower() not in {
198
208
  "0",
199
209
  "false",
@@ -631,6 +641,7 @@ class SessionMonitor:
631
641
  tool_state_review_cycles: int = 0
632
642
  last_tool_fingerprint: str = ""
633
643
  cycling_tool_names: list = field(default_factory=list)
644
+ last_response_garbled: bool = False # previous turn had garbled/malformed output
634
645
  finalize_turn_active: bool = False
635
646
  completion_required: bool = False
636
647
  completion_pending: bool = False
@@ -1457,6 +1468,11 @@ async def lifespan(app: FastAPI):
1457
1468
  int(PROXY_GENERATION_TIMEOUT),
1458
1469
  int(PROXY_SLOT_HANG_TIMEOUT),
1459
1470
  )
1471
+ logger.info(
1472
+ "Tool turn max_tokens: cap=%d garbled_cap=%d",
1473
+ PROXY_TOOL_TURN_MAX_TOKENS,
1474
+ PROXY_TOOL_TURN_MAX_TOKENS_GARBLED,
1475
+ )
1460
1476
 
1461
1477
  yield
1462
1478
  await http_client.aclose()
@@ -2316,6 +2332,23 @@ def build_openai_request(
2316
2332
  )
2317
2333
  requested_max = PROXY_OPUS46_MAX_TOKENS_HIGH_CTX
2318
2334
 
2335
+ # Option 1+3+4: Cap max_tokens for tool turns to prevent 32K waste.
2336
+ # Tool call responses rarely need more than a few thousand tokens.
2337
+ # After garbled/malformed output, use an even lower cap.
2338
+ if has_tools and PROXY_TOOL_TURN_MAX_TOKENS > 0:
2339
+ if monitor.last_response_garbled and PROXY_TOOL_TURN_MAX_TOKENS_GARBLED > 0:
2340
+ tool_cap = PROXY_TOOL_TURN_MAX_TOKENS_GARBLED
2341
+ else:
2342
+ tool_cap = PROXY_TOOL_TURN_MAX_TOKENS
2343
+ if requested_max > tool_cap:
2344
+ logger.info(
2345
+ "TOOL TURN MAX_TOKENS cap: %d -> %d (garbled_prev=%s)",
2346
+ requested_max,
2347
+ tool_cap,
2348
+ monitor.last_response_garbled,
2349
+ )
2350
+ requested_max = tool_cap
2351
+
2319
2352
  openai_body["max_tokens"] = requested_max
2320
2353
  if "temperature" in anthropic_body:
2321
2354
  openai_body["temperature"] = anthropic_body["temperature"]
@@ -4260,6 +4293,7 @@ async def _apply_malformed_tool_guardrail(
4260
4293
  monitor.malformed_tool_streak = 0
4261
4294
  monitor.invalid_tool_call_streak = 0
4262
4295
  monitor.required_tool_miss_streak = 0
4296
+ monitor.last_response_garbled = False
4263
4297
  if repair_count > 0:
4264
4298
  monitor.arg_preflight_repairs += repair_count
4265
4299
  logger.info(
@@ -4269,6 +4303,9 @@ async def _apply_malformed_tool_guardrail(
4269
4303
  )
4270
4304
  return working_resp
4271
4305
 
4306
+ # Mark garbled state for progressive max_tokens reduction on next turn
4307
+ monitor.last_response_garbled = True
4308
+
4272
4309
  if issue.kind == "malformed_payload":
4273
4310
  monitor.malformed_tool_streak += 1
4274
4311
  elif issue.kind == "invalid_tool_args":
@@ -4354,6 +4391,7 @@ async def _apply_malformed_tool_guardrail(
4354
4391
  monitor.malformed_tool_streak = 0
4355
4392
  monitor.invalid_tool_call_streak = 0
4356
4393
  monitor.required_tool_miss_streak = 0
4394
+ monitor.last_response_garbled = False
4357
4395
  logger.info(
4358
4396
  "TOOL RESPONSE RETRY success: kind=%s attempt=%d/%d",
4359
4397
  current_issue.kind,
@@ -3696,3 +3696,90 @@ class TestGarbledArgsRetry(unittest.TestCase):
3696
3696
  def test_env_sync_malformed_retry_max(self):
3697
3697
  """PROXY_MALFORMED_TOOL_RETRY_MAX should be 3."""
3698
3698
  self.assertEqual(proxy.PROXY_MALFORMED_TOOL_RETRY_MAX, 3)
3699
+
3700
+
3701
+ class TestToolTurnMaxTokensCap(unittest.TestCase):
3702
+ """Tests for tool turn max_tokens capping to prevent 32K waste."""
3703
+
3704
+ def test_tool_turn_max_tokens_constant(self):
3705
+ """PROXY_TOOL_TURN_MAX_TOKENS should default to 8192."""
3706
+ self.assertEqual(proxy.PROXY_TOOL_TURN_MAX_TOKENS, 8192)
3707
+
3708
+ def test_tool_turn_max_tokens_garbled_constant(self):
3709
+ """PROXY_TOOL_TURN_MAX_TOKENS_GARBLED should default to 4096."""
3710
+ self.assertEqual(proxy.PROXY_TOOL_TURN_MAX_TOKENS_GARBLED, 4096)
3711
+
3712
+ def test_tool_turn_caps_high_max_tokens(self):
3713
+ """Tool turn with max_tokens=32000 should be capped to 8192."""
3714
+ body = {
3715
+ "model": "test-model",
3716
+ "max_tokens": 32000,
3717
+ "messages": [{"role": "user", "content": "test"}],
3718
+ "tools": [
3719
+ {
3720
+ "name": "Bash",
3721
+ "description": "run command",
3722
+ "input_schema": {"type": "object"},
3723
+ }
3724
+ ],
3725
+ }
3726
+ monitor = proxy.SessionMonitor(context_window=262144)
3727
+ openai_body = proxy.build_openai_request(body, monitor)
3728
+ self.assertLessEqual(openai_body["max_tokens"], proxy.PROXY_TOOL_TURN_MAX_TOKENS)
3729
+
3730
+ def test_tool_turn_garbled_reduces_cap(self):
3731
+ """After garbled output, max_tokens should use the lower garbled cap."""
3732
+ body = {
3733
+ "model": "test-model",
3734
+ "max_tokens": 32000,
3735
+ "messages": [{"role": "user", "content": "test"}],
3736
+ "tools": [
3737
+ {
3738
+ "name": "Bash",
3739
+ "description": "run command",
3740
+ "input_schema": {"type": "object"},
3741
+ }
3742
+ ],
3743
+ }
3744
+ monitor = proxy.SessionMonitor(context_window=262144)
3745
+ monitor.last_response_garbled = True
3746
+ openai_body = proxy.build_openai_request(body, monitor)
3747
+ self.assertLessEqual(
3748
+ openai_body["max_tokens"], proxy.PROXY_TOOL_TURN_MAX_TOKENS_GARBLED
3749
+ )
3750
+
3751
+ def test_non_tool_request_not_capped(self):
3752
+ """Non-tool requests should not be affected by tool turn cap."""
3753
+ body = {
3754
+ "model": "test-model",
3755
+ "max_tokens": 32000,
3756
+ "messages": [{"role": "user", "content": "test"}],
3757
+ }
3758
+ monitor = proxy.SessionMonitor(context_window=262144)
3759
+ openai_body = proxy.build_openai_request(body, monitor)
3760
+ # Should not be capped to 8192 (may be capped by context window logic)
3761
+ self.assertGreater(openai_body["max_tokens"], proxy.PROXY_TOOL_TURN_MAX_TOKENS)
3762
+
3763
+ def test_last_response_garbled_cleared_on_clean(self):
3764
+ """SessionMonitor.last_response_garbled should default to False."""
3765
+ monitor = proxy.SessionMonitor(context_window=262144)
3766
+ self.assertFalse(monitor.last_response_garbled)
3767
+
3768
+ def test_small_max_tokens_stays_within_cap(self):
3769
+ """If client requests less than the cap, result should not exceed cap."""
3770
+ body = {
3771
+ "model": "test-model",
3772
+ "max_tokens": 4096,
3773
+ "messages": [{"role": "user", "content": "test"}],
3774
+ "tools": [
3775
+ {
3776
+ "name": "Bash",
3777
+ "description": "run command",
3778
+ "input_schema": {"type": "object"},
3779
+ }
3780
+ ],
3781
+ }
3782
+ monitor = proxy.SessionMonitor(context_window=262144)
3783
+ openai_body = proxy.build_openai_request(body, monitor)
3784
+ # The tool turn cap should ensure we don't exceed PROXY_TOOL_TURN_MAX_TOKENS
3785
+ self.assertLessEqual(openai_body["max_tokens"], proxy.PROXY_TOOL_TURN_MAX_TOKENS)