npm - @miller-tech/uap - Versions diffs - 1.20.17 → 1.20.18 - Mend

@miller-tech/uap 1.20.17 → 1.20.18

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (3)

package/package.json +1 -1
package/tools/agents/scripts/anthropic_proxy.py +38 -0
package/tools/agents/tests/test_anthropic_proxy_streaming.py +87 -0

package/package.json CHANGED

@@ -1,6 +1,6 @@
 {
   "name": "@miller-tech/uap",
-  "version": "1.20.17",
+  "version": "1.20.18",
   "description": "Autonomous AI agent memory system with CLAUDE.md protocol enforcement",
   "type": "module",
   "main": "dist/index.js",

package/tools/agents/scripts/anthropic_proxy.py CHANGED

@@ -42,6 +42,12 @@ Configuration (Environment Variables)
     PROXY_READ_TIMEOUT   Read timeout in seconds for upstream LLM streaming
                          Default: 600 (10 minutes)
+    PROXY_TOOL_TURN_MAX_TOKENS   Max tokens for tool-call turns (0 to disable)
+                                Default: 8192
+    PROXY_TOOL_TURN_MAX_TOKENS_GARBLED   Max tokens after garbled/malformed output
+                                         Default: 4096
     PROXY_MAX_CONNECTIONS   Max concurrent connections to upstream
                             Default: 20
@@ -194,6 +200,10 @@ PROXY_STREAM_REASONING_MAX_CHARS = int(
     os.environ.get("PROXY_STREAM_REASONING_MAX_CHARS", "240")
 )
 PROXY_MAX_TOKENS_FLOOR = int(os.environ.get("PROXY_MAX_TOKENS_FLOOR", "16384"))
+PROXY_TOOL_TURN_MAX_TOKENS = int(os.environ.get("PROXY_TOOL_TURN_MAX_TOKENS", "8192"))
+PROXY_TOOL_TURN_MAX_TOKENS_GARBLED = int(
+    os.environ.get("PROXY_TOOL_TURN_MAX_TOKENS_GARBLED", "4096")
+)
 PROXY_TOOL_NARROWING = os.environ.get("PROXY_TOOL_NARROWING", "off").lower() not in {
     "0",
     "false",
@@ -631,6 +641,7 @@ class SessionMonitor:
     tool_state_review_cycles: int = 0
     last_tool_fingerprint: str = ""
     cycling_tool_names: list = field(default_factory=list)
+    last_response_garbled: bool = False  # previous turn had garbled/malformed output
     finalize_turn_active: bool = False
     completion_required: bool = False
     completion_pending: bool = False
@@ -1457,6 +1468,11 @@ async def lifespan(app: FastAPI):
         int(PROXY_GENERATION_TIMEOUT),
         int(PROXY_SLOT_HANG_TIMEOUT),
     )
+    logger.info(
+        "Tool turn max_tokens: cap=%d garbled_cap=%d",
+        PROXY_TOOL_TURN_MAX_TOKENS,
+        PROXY_TOOL_TURN_MAX_TOKENS_GARBLED,
+    )
     yield
     await http_client.aclose()
@@ -2316,6 +2332,23 @@ def build_openai_request(
                 )
                 requested_max = PROXY_OPUS46_MAX_TOKENS_HIGH_CTX
+        # Option 1+3+4: Cap max_tokens for tool turns to prevent 32K waste.
+        # Tool call responses rarely need more than a few thousand tokens.
+        # After garbled/malformed output, use an even lower cap.
+        if has_tools and PROXY_TOOL_TURN_MAX_TOKENS > 0:
+            if monitor.last_response_garbled and PROXY_TOOL_TURN_MAX_TOKENS_GARBLED > 0:
+                tool_cap = PROXY_TOOL_TURN_MAX_TOKENS_GARBLED
+            else:
+                tool_cap = PROXY_TOOL_TURN_MAX_TOKENS
+            if requested_max > tool_cap:
+                logger.info(
+                    "TOOL TURN MAX_TOKENS cap: %d -> %d (garbled_prev=%s)",
+                    requested_max,
+                    tool_cap,
+                    monitor.last_response_garbled,
+                )
+                requested_max = tool_cap
         openai_body["max_tokens"] = requested_max
     if "temperature" in anthropic_body:
         openai_body["temperature"] = anthropic_body["temperature"]
@@ -4260,6 +4293,7 @@ async def _apply_malformed_tool_guardrail(
             monitor.malformed_tool_streak = 0
             monitor.invalid_tool_call_streak = 0
             monitor.required_tool_miss_streak = 0
+            monitor.last_response_garbled = False
         if repair_count > 0:
             monitor.arg_preflight_repairs += repair_count
             logger.info(
@@ -4269,6 +4303,9 @@ async def _apply_malformed_tool_guardrail(
             )
         return working_resp
+    # Mark garbled state for progressive max_tokens reduction on next turn
+    monitor.last_response_garbled = True
     if issue.kind == "malformed_payload":
         monitor.malformed_tool_streak += 1
     elif issue.kind == "invalid_tool_args":
@@ -4354,6 +4391,7 @@ async def _apply_malformed_tool_guardrail(
             monitor.malformed_tool_streak = 0
             monitor.invalid_tool_call_streak = 0
             monitor.required_tool_miss_streak = 0
+            monitor.last_response_garbled = False
             logger.info(
                 "TOOL RESPONSE RETRY success: kind=%s attempt=%d/%d",
                 current_issue.kind,

package/tools/agents/tests/test_anthropic_proxy_streaming.py CHANGED

@@ -3696,3 +3696,90 @@ class TestGarbledArgsRetry(unittest.TestCase):
     def test_env_sync_malformed_retry_max(self):
         """PROXY_MALFORMED_TOOL_RETRY_MAX should be 3."""
         self.assertEqual(proxy.PROXY_MALFORMED_TOOL_RETRY_MAX, 3)
+class TestToolTurnMaxTokensCap(unittest.TestCase):
+    """Tests for tool turn max_tokens capping to prevent 32K waste."""
+    def test_tool_turn_max_tokens_constant(self):
+        """PROXY_TOOL_TURN_MAX_TOKENS should default to 8192."""
+        self.assertEqual(proxy.PROXY_TOOL_TURN_MAX_TOKENS, 8192)
+    def test_tool_turn_max_tokens_garbled_constant(self):
+        """PROXY_TOOL_TURN_MAX_TOKENS_GARBLED should default to 4096."""
+        self.assertEqual(proxy.PROXY_TOOL_TURN_MAX_TOKENS_GARBLED, 4096)
+    def test_tool_turn_caps_high_max_tokens(self):
+        """Tool turn with max_tokens=32000 should be capped to 8192."""
+        body = {
+            "model": "test-model",
+            "max_tokens": 32000,
+            "messages": [{"role": "user", "content": "test"}],
+            "tools": [
+                {
+                    "name": "Bash",
+                    "description": "run command",
+                    "input_schema": {"type": "object"},
+                }
+            ],
+        }
+        monitor = proxy.SessionMonitor(context_window=262144)
+        openai_body = proxy.build_openai_request(body, monitor)
+        self.assertLessEqual(openai_body["max_tokens"], proxy.PROXY_TOOL_TURN_MAX_TOKENS)
+    def test_tool_turn_garbled_reduces_cap(self):
+        """After garbled output, max_tokens should use the lower garbled cap."""
+        body = {
+            "model": "test-model",
+            "max_tokens": 32000,
+            "messages": [{"role": "user", "content": "test"}],
+            "tools": [
+                {
+                    "name": "Bash",
+                    "description": "run command",
+                    "input_schema": {"type": "object"},
+                }
+            ],
+        }
+        monitor = proxy.SessionMonitor(context_window=262144)
+        monitor.last_response_garbled = True
+        openai_body = proxy.build_openai_request(body, monitor)
+        self.assertLessEqual(
+            openai_body["max_tokens"], proxy.PROXY_TOOL_TURN_MAX_TOKENS_GARBLED
+        )
+    def test_non_tool_request_not_capped(self):
+        """Non-tool requests should not be affected by tool turn cap."""
+        body = {
+            "model": "test-model",
+            "max_tokens": 32000,
+            "messages": [{"role": "user", "content": "test"}],
+        }
+        monitor = proxy.SessionMonitor(context_window=262144)
+        openai_body = proxy.build_openai_request(body, monitor)
+        # Should not be capped to 8192 (may be capped by context window logic)
+        self.assertGreater(openai_body["max_tokens"], proxy.PROXY_TOOL_TURN_MAX_TOKENS)
+    def test_last_response_garbled_cleared_on_clean(self):
+        """SessionMonitor.last_response_garbled should default to False."""
+        monitor = proxy.SessionMonitor(context_window=262144)
+        self.assertFalse(monitor.last_response_garbled)
+    def test_small_max_tokens_stays_within_cap(self):
+        """If client requests less than the cap, result should not exceed cap."""
+        body = {
+            "model": "test-model",
+            "max_tokens": 4096,
+            "messages": [{"role": "user", "content": "test"}],
+            "tools": [
+                {
+                    "name": "Bash",
+                    "description": "run command",
+                    "input_schema": {"type": "object"},
+                }
+            ],
+        }
+        monitor = proxy.SessionMonitor(context_window=262144)
+        openai_body = proxy.build_openai_request(body, monitor)
+        # The tool turn cap should ensure we don't exceed PROXY_TOOL_TURN_MAX_TOKENS
+        self.assertLessEqual(openai_body["max_tokens"], proxy.PROXY_TOOL_TURN_MAX_TOKENS)