@miller-tech/uap 1.20.13 → 1.20.15
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json
CHANGED
|
@@ -219,7 +219,7 @@ PROXY_MALFORMED_TOOL_GUARDRAIL = os.environ.get(
|
|
|
219
219
|
"no",
|
|
220
220
|
}
|
|
221
221
|
# Retry budget for malformed tool-call recovery (default: 3 attempts).
PROXY_MALFORMED_TOOL_RETRY_MAX = int(os.environ.get("PROXY_MALFORMED_TOOL_RETRY_MAX", "3"))
|
|
224
224
|
PROXY_MALFORMED_TOOL_RETRY_MAX_TOKENS = int(
|
|
225
225
|
os.environ.get("PROXY_MALFORMED_TOOL_RETRY_MAX_TOKENS", "2048")
|
|
@@ -2167,16 +2167,24 @@ def build_openai_request(
|
|
|
2167
2167
|
# Enforce configurable minimum floor for thinking mode: model needs
|
|
2168
2168
|
# tokens for reasoning (<think>...</think>) plus actual response/tool
|
|
2169
2169
|
# calls. Set PROXY_MAX_TOKENS_FLOOR=0 to disable this floor.
|
|
2170
|
-
|
|
2171
|
-
|
|
2172
|
-
|
|
2173
|
-
|
|
2170
|
+
#
|
|
2171
|
+
# The floor is ONLY applied when thinking is actually enabled —
|
|
2172
|
+
# skip it for non-tool requests (tools=0) and for tool turns
|
|
2173
|
+
# with thinking disabled, to prevent inflating short preflight
|
|
2174
|
+
# requests (e.g. max_tokens=100 for plan generation).
|
|
2175
|
+
thinking_active_for_request = has_tools and not PROXY_DISABLE_THINKING_ON_TOOL_TURNS
|
|
2176
|
+
skip_floor = (
|
|
2177
|
+
not has_tools # non-tool requests don't need thinking headroom
|
|
2178
|
+
or PROXY_DISABLE_THINKING_ON_TOOL_TURNS # thinking disabled on tool turns
|
|
2179
|
+
or PROXY_MAX_TOKENS_FLOOR <= 0 # floor explicitly disabled
|
|
2174
2180
|
)
|
|
2175
|
-
if
|
|
2181
|
+
if skip_floor:
|
|
2176
2182
|
requested_max = requested_raw
|
|
2177
|
-
if requested_raw < PROXY_MAX_TOKENS_FLOOR:
|
|
2183
|
+
if requested_raw < PROXY_MAX_TOKENS_FLOOR and PROXY_MAX_TOKENS_FLOOR > 0:
|
|
2178
2184
|
logger.info(
|
|
2179
|
-
"MAX_TOKENS floor
|
|
2185
|
+
"MAX_TOKENS floor skipped: has_tools=%s thinking_active=%s requested=%d floor=%d",
|
|
2186
|
+
has_tools,
|
|
2187
|
+
thinking_active_for_request,
|
|
2180
2188
|
requested_raw,
|
|
2181
2189
|
PROXY_MAX_TOKENS_FLOOR,
|
|
2182
2190
|
)
|
|
@@ -3890,6 +3898,40 @@ async def _apply_completion_contract_guardrail(
|
|
|
3890
3898
|
return retried
|
|
3891
3899
|
|
|
3892
3900
|
|
|
3901
|
+
def _sanitize_assistant_messages_for_retry(messages: list[dict]) -> list[dict]:
|
|
3902
|
+
"""Strip malformed tool-like text from assistant messages to prevent copy-contamination.
|
|
3903
|
+
|
|
3904
|
+
Only sanitizes the last 4 assistant messages to avoid excessive processing.
|
|
3905
|
+
"""
|
|
3906
|
+
import re
|
|
3907
|
+
|
|
3908
|
+
# Patterns that indicate malformed tool call text in assistant content
|
|
3909
|
+
_TOOL_LIKE_PATTERNS = re.compile(
|
|
3910
|
+
r"<tool_call>.*?</tool_call>"
|
|
3911
|
+
r"|<function_call>.*?</function_call>"
|
|
3912
|
+
r'|\{"name"\s*:\s*"[^"]+"\s*,\s*"arguments"\s*:'
|
|
3913
|
+
r"|```json\s*\{[^}]*\"name\"\s*:",
|
|
3914
|
+
re.DOTALL,
|
|
3915
|
+
)
|
|
3916
|
+
|
|
3917
|
+
result = list(messages)
|
|
3918
|
+
sanitized_count = 0
|
|
3919
|
+
for i in range(len(result) - 1, -1, -1):
|
|
3920
|
+
if sanitized_count >= 4:
|
|
3921
|
+
break
|
|
3922
|
+
msg = result[i]
|
|
3923
|
+
if msg.get("role") != "assistant":
|
|
3924
|
+
continue
|
|
3925
|
+
content = msg.get("content", "")
|
|
3926
|
+
if isinstance(content, str) and _TOOL_LIKE_PATTERNS.search(content):
|
|
3927
|
+
cleaned = _TOOL_LIKE_PATTERNS.sub("", content).strip()
|
|
3928
|
+
if not cleaned:
|
|
3929
|
+
cleaned = "I will use the appropriate tool."
|
|
3930
|
+
result[i] = {**msg, "content": cleaned}
|
|
3931
|
+
sanitized_count += 1
|
|
3932
|
+
return result
|
|
3933
|
+
|
|
3934
|
+
|
|
3893
3935
|
def _build_malformed_retry_body(
|
|
3894
3936
|
openai_body: dict,
|
|
3895
3937
|
anthropic_body: dict,
|
|
@@ -3901,7 +3943,11 @@ def _build_malformed_retry_body(
|
|
|
3901
3943
|
retry_body = dict(openai_body)
|
|
3902
3944
|
retry_body["stream"] = False
|
|
3903
3945
|
retry_body["tool_choice"] = tool_choice
|
|
3904
|
-
|
|
3946
|
+
# Escalate temperature down on successive retries for more deterministic output
|
|
3947
|
+
if total_attempts > 1 and attempt > 1:
|
|
3948
|
+
retry_body["temperature"] = 0.0
|
|
3949
|
+
else:
|
|
3950
|
+
retry_body["temperature"] = PROXY_MALFORMED_TOOL_RETRY_TEMPERATURE
|
|
3905
3951
|
|
|
3906
3952
|
if tool_choice == "required":
|
|
3907
3953
|
retry_instruction = (
|
|
@@ -3922,7 +3968,10 @@ def _build_malformed_retry_body(
|
|
|
3922
3968
|
}
|
|
3923
3969
|
existing_messages = retry_body.get("messages")
|
|
3924
3970
|
if isinstance(existing_messages, list) and existing_messages:
|
|
3925
|
-
|
|
3971
|
+
# Strip malformed tool-like text from assistant messages to prevent
|
|
3972
|
+
# the model from copying contaminated patterns on retry
|
|
3973
|
+
sanitized = _sanitize_assistant_messages_for_retry(existing_messages)
|
|
3974
|
+
retry_body["messages"] = [*sanitized, malformed_retry_instruction]
|
|
3926
3975
|
|
|
3927
3976
|
if PROXY_MALFORMED_TOOL_RETRY_MAX_TOKENS > 0:
|
|
3928
3977
|
current_max = int(
|
|
@@ -4394,6 +4443,48 @@ def _maybe_extract_text_tool_calls(openai_resp: dict) -> dict:
|
|
|
4394
4443
|
return openai_resp
|
|
4395
4444
|
|
|
4396
4445
|
|
|
4446
|
+
def _detect_and_truncate_degenerate_repetition(openai_resp: dict) -> dict:
    """Detect degenerate repetitive text and truncate at first repetition.

    When the model produces highly repetitive output (e.g. the same 20+ char
    substring repeated many times), truncate the message content at the first
    repetition boundary and force ``finish_reason`` to ``"stop"``.

    Args:
        openai_resp: OpenAI Chat Completions response dict; its first choice
            is modified in place when repetition is detected.

    Returns:
        The (possibly modified) response dict.
    """
    text = _openai_message_text(openai_resp)
    # Too short to exhibit the pathological repeat loops we care about.
    if not text or len(text) < 200:
        return openai_resp

    # The midpoint is loop-invariant; compute it once instead of per length.
    mid = len(text) // 2
    # Try longer samples first: a long repeated substring is stronger
    # evidence of degeneration than a short one.
    for substr_len in (60, 40, 20):
        # Sample from the middle of the text, where a degenerate loop is
        # almost certainly already in full swing.
        sample = text[mid : mid + substr_len]
        if not sample.strip():
            continue
        count = text.count(sample)
        if count >= 8:
            # Found degenerate repetition — keep everything up to (and
            # including) the first full repeat, drop the rest.
            first_pos = text.find(sample)
            second_pos = text.find(sample, first_pos + len(sample))
            if second_pos > first_pos:
                truncated = text[:second_pos].rstrip()
                logger.warning(
                    "DEGENERATE REPETITION: detected %d repeats of %d-char substring, truncating %d -> %d chars",
                    count,
                    substr_len,
                    len(text),
                    len(truncated),
                )
                # Update the response.  setdefault ensures the truncated
                # content is actually attached to the choice even when the
                # "message" key is missing (a plain .get() would write into
                # a throwaway dict while still flipping finish_reason).
                choices = openai_resp.get("choices", [])
                if choices:
                    msg = choices[0].setdefault("message", {})
                    msg["content"] = truncated
                    choices[0]["finish_reason"] = "stop"
                return openai_resp
    return openai_resp
|
|
4486
|
+
|
|
4487
|
+
|
|
4397
4488
|
def openai_to_anthropic_response(openai_resp: dict, model: str) -> dict:
|
|
4398
4489
|
"""Convert an OpenAI Chat Completions response to Anthropic Messages format."""
|
|
4399
4490
|
# First: try to recover tool calls trapped in text XML tags
|
|
@@ -5217,6 +5308,7 @@ async def messages(request: Request):
|
|
|
5217
5308
|
session_id,
|
|
5218
5309
|
)
|
|
5219
5310
|
|
|
5311
|
+
openai_resp = _detect_and_truncate_degenerate_repetition(openai_resp)
|
|
5220
5312
|
anthropic_resp = openai_to_anthropic_response(openai_resp, model)
|
|
5221
5313
|
monitor.record_response(anthropic_resp.get("usage", {}).get("output_tokens", 0))
|
|
5222
5314
|
# Update last_input_tokens from upstream's actual prompt_tokens
|
|
@@ -5555,6 +5647,7 @@ async def messages(request: Request):
|
|
|
5555
5647
|
monitor.invalid_tool_call_streak = 0
|
|
5556
5648
|
monitor.required_tool_miss_streak = 0
|
|
5557
5649
|
|
|
5650
|
+
openai_resp = _detect_and_truncate_degenerate_repetition(openai_resp)
|
|
5558
5651
|
anthropic_resp = openai_to_anthropic_response(openai_resp, model)
|
|
5559
5652
|
|
|
5560
5653
|
# Track output tokens in session monitor
|
|
@@ -116,7 +116,8 @@ class TestProxyConfigTuning(unittest.TestCase):
|
|
|
116
116
|
setattr(proxy, "PROXY_MAX_TOKENS_FLOOR", old_floor)
|
|
117
117
|
setattr(proxy, "PROXY_DISABLE_THINKING_ON_TOOL_TURNS", old_disable)
|
|
118
118
|
|
|
119
|
-
def
|
|
119
|
+
def test_build_request_skips_floor_for_non_tool_turns(self):
|
|
120
|
+
"""Non-tool requests should NOT have the max_tokens floor applied."""
|
|
120
121
|
old_floor = getattr(proxy, "PROXY_MAX_TOKENS_FLOOR")
|
|
121
122
|
old_disable = getattr(proxy, "PROXY_DISABLE_THINKING_ON_TOOL_TURNS")
|
|
122
123
|
try:
|
|
@@ -132,7 +133,8 @@ class TestProxyConfigTuning(unittest.TestCase):
|
|
|
132
133
|
openai = proxy.build_openai_request(
|
|
133
134
|
body, proxy.SessionMonitor(context_window=0)
|
|
134
135
|
)
|
|
135
|
-
|
|
136
|
+
# Floor should NOT inflate max_tokens for non-tool requests
|
|
137
|
+
self.assertEqual(openai.get("max_tokens"), 512)
|
|
136
138
|
finally:
|
|
137
139
|
setattr(proxy, "PROXY_MAX_TOKENS_FLOOR", old_floor)
|
|
138
140
|
setattr(proxy, "PROXY_DISABLE_THINKING_ON_TOOL_TURNS", old_disable)
|
|
@@ -3377,6 +3379,144 @@ class TestCycleBreakOptions(unittest.TestCase):
|
|
|
3377
3379
|
self.assertEqual(monitor.cycling_tool_names, [])
|
|
3378
3380
|
|
|
3379
3381
|
|
|
3382
|
+
class TestMalformedRetryHardening(unittest.TestCase):
    """Tests for malformed retry improvements: budget, temp escalation, message sanitization."""

    def test_retry_max_default_is_3(self):
        """Option 1: default retry budget increased from 2 to 3."""
        self.assertEqual(proxy.PROXY_MALFORMED_TOOL_RETRY_MAX, 3)

    def test_sanitize_assistant_messages_strips_tool_like_text(self):
        """Option 3: malformed tool-like text stripped from assistant messages on retry."""
        history = [
            {"role": "system", "content": "You are helpful."},
            {"role": "user", "content": "Run a command"},
            {
                "role": "assistant",
                "content": 'Here is the result <tool_call>{"name": "Bash", "arguments": {"command": "ls"}}</tool_call>',
            },
            {"role": "user", "content": "ok"},
        ]
        cleaned = proxy._sanitize_assistant_messages_for_retry(history)
        # Non-assistant turns pass through byte-for-byte.
        for idx, expected in ((0, "You are helpful."), (1, "Run a command"), (3, "ok")):
            self.assertEqual(cleaned[idx]["content"], expected)
        # The assistant turn loses its embedded pseudo tool call entirely.
        self.assertNotIn("<tool_call>", cleaned[2]["content"])
        self.assertNotIn("Bash", cleaned[2]["content"])

    def test_sanitize_preserves_clean_assistant_messages(self):
        """Clean assistant messages are not modified by sanitization."""
        untouched = "I will read the file for you."
        cleaned = proxy._sanitize_assistant_messages_for_retry(
            [{"role": "assistant", "content": untouched}]
        )
        self.assertEqual(cleaned[0]["content"], untouched)

    def test_sanitize_replaces_empty_content_with_placeholder(self):
        """If stripping leaves empty content, a placeholder is used."""
        cleaned = proxy._sanitize_assistant_messages_for_retry(
            [{"role": "assistant", "content": '<tool_call>{"name": "Bash", "arguments": {}}</tool_call>'}]
        )
        self.assertEqual(cleaned[0]["content"], "I will use the appropriate tool.")

    def test_retry_body_uses_sanitized_messages(self):
        """Retry body messages are sanitized before adding retry instruction."""
        openai_body = {
            "messages": [
                {"role": "system", "content": "sys"},
                {"role": "user", "content": "do it"},
                {"role": "assistant", "content": '<tool_call>{"name":"X","arguments":{}}</tool_call>'},
            ],
            "tools": [{"type": "function", "function": {"name": "X", "parameters": {}}}],
        }
        anthropic_body = {"tools": [{"name": "X", "input_schema": {"type": "object"}}]}
        retry = proxy._build_malformed_retry_body(
            openai_body, anthropic_body, attempt=1, total_attempts=3,
        )
        # Every assistant message in the rebuilt body must be decontaminated.
        for msg in (m for m in retry["messages"] if m.get("role") == "assistant"):
            self.assertNotIn("<tool_call>", msg.get("content", ""))
|
|
3440
|
+
|
|
3441
|
+
|
|
3442
|
+
class TestDegenerateRepetitionDetection(unittest.TestCase):
    """Tests for degenerate repetition detection and truncation."""

    @staticmethod
    def _resp(content, finish_reason):
        """Build a minimal OpenAI-shaped response with a single choice."""
        return {"choices": [{"message": {"content": content}, "finish_reason": finish_reason}]}

    def test_detects_and_truncates_repetitive_text(self):
        """Highly repetitive text should be truncated."""
        blob = "Mermaid Diagrams](docs/mermaid-diagrams" * 50
        out = proxy._detect_and_truncate_degenerate_repetition(self._resp(blob, "length"))
        choice = out["choices"][0]
        self.assertLess(len(choice["message"]["content"]), len(blob))
        self.assertEqual(choice["finish_reason"], "stop")

    def test_preserves_non_repetitive_text(self):
        """Normal text should not be modified."""
        prose = "This is a perfectly normal response with varied content. " * 5
        out = proxy._detect_and_truncate_degenerate_repetition(self._resp(prose, "stop"))
        self.assertEqual(out["choices"][0]["message"]["content"], prose)

    def test_preserves_short_text(self):
        """Short text (< 200 chars) should not be processed."""
        snippet = "Short response."
        out = proxy._detect_and_truncate_degenerate_repetition(self._resp(snippet, "stop"))
        self.assertEqual(out["choices"][0]["message"]["content"], snippet)

    def test_max_tokens_floor_skipped_for_non_tool_requests(self):
        """max_tokens floor should not inflate non-tool requests."""
        saved_floor = getattr(proxy, "PROXY_MAX_TOKENS_FLOOR")
        saved_disable = getattr(proxy, "PROXY_DISABLE_THINKING_ON_TOOL_TURNS")
        try:
            setattr(proxy, "PROXY_MAX_TOKENS_FLOOR", 16384)
            setattr(proxy, "PROXY_DISABLE_THINKING_ON_TOOL_TURNS", False)
            body = {
                "model": "test",
                "max_tokens": 100,
                "messages": [{"role": "user", "content": "generate a title"}],
            }
            openai = proxy.build_openai_request(body, proxy.SessionMonitor(context_window=0))
            # No tools = no floor inflation
            self.assertEqual(openai.get("max_tokens"), 100)
        finally:
            setattr(proxy, "PROXY_MAX_TOKENS_FLOOR", saved_floor)
            setattr(proxy, "PROXY_DISABLE_THINKING_ON_TOOL_TURNS", saved_disable)

    def test_max_tokens_floor_applied_when_thinking_active(self):
        """max_tokens floor should apply when tools present and thinking enabled."""
        saved_floor = getattr(proxy, "PROXY_MAX_TOKENS_FLOOR")
        saved_disable = getattr(proxy, "PROXY_DISABLE_THINKING_ON_TOOL_TURNS")
        try:
            setattr(proxy, "PROXY_MAX_TOKENS_FLOOR", 4096)
            setattr(proxy, "PROXY_DISABLE_THINKING_ON_TOOL_TURNS", False)
            body = {
                "model": "test",
                "max_tokens": 512,
                "messages": [{"role": "user", "content": "run command"}],
                "tools": [{"name": "Bash", "description": "run", "input_schema": {"type": "object"}}],
            }
            openai = proxy.build_openai_request(body, proxy.SessionMonitor(context_window=0))
            # Tools + thinking enabled = floor applied
            self.assertEqual(openai.get("max_tokens"), 4096)
        finally:
            setattr(proxy, "PROXY_MAX_TOKENS_FLOOR", saved_floor)
            setattr(proxy, "PROXY_DISABLE_THINKING_ON_TOOL_TURNS", saved_disable)
|
|
3518
|
+
|
|
3519
|
+
|
|
3380
3520
|
# Allow running this test module directly (e.g. `python test_proxy.py`).
if __name__ == "__main__":
    unittest.main()
|
|
3382
3522
|
|