npm - @miller-tech/uap - Versions diffs - 1.20.25 → 1.20.27 - Mend

@miller-tech/uap 1.20.25 → 1.20.27

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (3)

package/package.json +1 -1
package/tools/agents/scripts/anthropic_proxy.py +80 -21
package/tools/agents/tests/test_anthropic_proxy_streaming.py +256 -0

package/package.json CHANGED

@@ -1,6 +1,6 @@
 {
   "name": "@miller-tech/uap",
-  "version": "1.20.25",
+  "version": "1.20.27",
   "description": "Autonomous AI agent memory system with CLAUDE.md protocol enforcement",
   "type": "module",
   "main": "dist/index.js",

package/tools/agents/scripts/anthropic_proxy.py CHANGED

@@ -2281,11 +2281,13 @@ def _resolve_state_machine_tool_choice(
                 1, PROXY_TOOL_STATE_FORCED_BUDGET // 2
             )
             # If stagnation cleared during review, the model tried a
-            # different approach — reward by reducing cycle pressure.
+            # different approach — reward by reducing cycle pressure and
+            # lifting persistent tool exclusion.
             if monitor.tool_state_stagnation_streak == 0 and monitor.tool_state_review_cycles > 0:
                 monitor.tool_state_review_cycles = max(0, monitor.tool_state_review_cycles - 1)
+                monitor.cycling_tool_names = []
                 logger.info(
-                    "TOOL STATE MACHINE: review_cycles decremented to %d (stagnation cleared)",
+                    "TOOL STATE MACHINE: review_cycles decremented to %d, cycling exclusion lifted (stagnation cleared)",
                     monitor.tool_state_review_cycles,
                 )
             return "required", "review_complete"
@@ -2463,14 +2465,22 @@ def build_openai_request(
         openai_body["stop"] = anthropic_body["stop_sequences"]
     # Force controlled temperature for tool-call turns to reduce garbled output
+    # Cycle 15 Option 2: use lower temperature after contamination resets
     if has_tools:
         client_temp = openai_body.get("temperature")
-        if client_temp is None or client_temp > PROXY_TOOL_TURN_TEMPERATURE:
-            openai_body["temperature"] = PROXY_TOOL_TURN_TEMPERATURE
+        target_temp = PROXY_TOOL_TURN_TEMPERATURE
+        if monitor.contamination_resets > 0:
+            target_temp = min(target_temp, 0.1)
+        if client_temp is None or client_temp > target_temp:
+            openai_body["temperature"] = target_temp
+            extra = ""
+            if monitor.contamination_resets > 0:
+                extra = f" (post-contamination reset, resets={monitor.contamination_resets})"
             logger.info(
-                "TOOL TURN TEMP: forcing temperature=%.2f (was %s) for tool-enabled request",
-                PROXY_TOOL_TURN_TEMPERATURE,
+                "TOOL TURN TEMP: forcing temperature=%.2f (was %s) for tool-enabled request%s",
+                target_temp,
                 client_temp,
+                extra,
             )
     # Convert Anthropic tools to OpenAI function-calling tools
@@ -2589,31 +2599,41 @@ def build_openai_request(
             monitor.no_progress_streak = (
                 0 if last_user_has_tool_result else monitor.no_progress_streak + 1
             )
-            # Option 1: Inject cycle-break instruction when entering review
+            # Inject cycle-break instruction when entering review
+            # Option 3 (Cycle 14): Escalate hint text based on review cycle count
             if (
                 monitor.tool_turn_phase == "review"
                 and state_reason in {"cycle_detected", "stagnation"}
                 and monitor.cycling_tool_names
             ):
                 cycling_names = ", ".join(monitor.cycling_tool_names)
-                cycle_hint = (
-                    f"You have been repeatedly calling the same tool(s): {cycling_names}. "
-                    "This is not making progress. Use a DIFFERENT tool to advance the task, "
-                    "or call a tool that produces your final answer."
-                )
+                cycles = monitor.tool_state_review_cycles
+                if cycles <= 1:
+                    cycle_hint = (
+                        f"You have been repeatedly calling the same tool(s): {cycling_names}. "
+                        "This is not making progress. Use a DIFFERENT tool to advance the task, "
+                        "or call a tool that produces your final answer."
+                    )
+                else:
+                    cycle_hint = (
+                        f"CRITICAL: You have cycled {cycling_names} for {cycles} review rounds without progress. "
+                        "State what you have accomplished so far and what the next DIFFERENT action should be. "
+                        "Do NOT call the same tool again. Choose a completely different approach or "
+                        "produce your final answer now."
+                    )
                 messages = openai_body.get("messages", [])
                 messages.append({"role": "user", "content": cycle_hint})
                 openai_body["messages"] = messages
                 logger.warning(
-                    "CYCLE BREAK: injected hint about cycling tools: %s",
+                    "CYCLE BREAK: injected hint about cycling tools: %s (escalation=%d)",
                     cycling_names,
+                    cycles,
                 )
-            # Option 2: Narrow tools during review to exclude cycling tools
-            # Option 1 enhancement: if any cycling tool is read-only, exclude
-            # the entire read-only class to prevent tool-hopping (read→glob→grep)
+            # Narrow tools to exclude cycling tools
+            # Option 1 (Cycle 13): if any cycling tool is read-only, exclude entire class
+            # Option 1 (Cycle 14): persist exclusion during act phase too, not just review
             if (
-                monitor.tool_turn_phase == "review"
-                and monitor.cycling_tool_names
+                monitor.cycling_tool_names
                 and "tools" in openai_body
             ):
                 exclude_set = set(monitor.cycling_tool_names)
@@ -4679,7 +4699,7 @@ async def _apply_malformed_tool_guardrail(
     attempts = max(0, PROXY_MALFORMED_TOOL_RETRY_MAX)
     current_issue = issue
-    # Track failing tool names for Option 3 (tool narrowing on retry)
+    # Track failing tool names for tool narrowing on retry
     failing_tools: set[str] = set()
     if issue.kind == "invalid_tool_args":
         for tc in (working_resp.get("choices", [{}])[0].get("message", {}).get("tool_calls", [])):
@@ -4687,14 +4707,22 @@ async def _apply_malformed_tool_guardrail(
             raw_args = tc.get("function", {}).get("arguments", "")
             if fn_name and raw_args and _is_garbled_tool_arguments(raw_args):
                 failing_tools.add(fn_name)
+    # Cycle 15 Option 1: For malformed_payload retries, exclude complex
+    # multi-field tools (task, Agent) that are prone to garbled generation
+    # after the first retry fails.
+    _COMPLEX_TOOLS_TO_EXCLUDE_ON_MALFORMED = {"task", "Agent"}
+    malformed_exclude_active = False
     for attempt in range(attempts):
         attempt_tool_choice = _retry_tool_choice_for_attempt(
             required_tool_choice,
             attempt,
             attempts,
         )
-        # Option 3: On attempt >= 2, exclude consistently failing tools
-        exclude = list(failing_tools) if attempt >= 1 and failing_tools else None
+        # On attempt >= 1, exclude consistently failing tools OR complex tools for malformed
+        exclude_set = set(failing_tools) if failing_tools else set()
+        if malformed_exclude_active:
+            exclude_set |= _COMPLEX_TOOLS_TO_EXCLUDE_ON_MALFORMED
+        exclude = list(exclude_set) if (attempt >= 1 and exclude_set) else None
         retry_body = _build_malformed_retry_body(
             openai_body,
             anthropic_body,
@@ -4773,6 +4801,8 @@ async def _apply_malformed_tool_guardrail(
         if retry_issue.kind == "malformed_payload":
             monitor.malformed_tool_streak += 1
+            # Cycle 15 Option 1: activate complex tool exclusion for next retry
+            malformed_exclude_active = True
         elif retry_issue.kind == "invalid_tool_args":
             monitor.invalid_tool_call_streak += 1
             monitor.arg_preflight_rejections += 1
@@ -4886,6 +4916,35 @@ def _maybe_apply_session_contamination_breaker(
     if not should_reset:
         return anthropic_body
+    # Cycle 15 Option 3: if contamination has already reset N+ times in this
+    # session, the model is fundamentally unable to produce valid tool calls.
+    # Force finalize so the Droid framework can intervene.
+    max_contamination_resets = 3
+    if monitor.contamination_resets >= max_contamination_resets:
+        logger.error(
+            "SESSION CONTAMINATION LOOP: session=%s contamination_resets=%d >= %d, forcing finalize",
+            session_id,
+            monitor.contamination_resets,
+            max_contamination_resets,
+        )
+        monitor.set_tool_turn_phase("finalize", reason="contamination_loop")
+        monitor.contamination_resets += 1
+        monitor.malformed_tool_streak = 0
+        monitor.invalid_tool_call_streak = 0
+        # Remove tools to force text-only response
+        updated = dict(anthropic_body)
+        updated.pop("tools", None)
+        updated.pop("tool_choice", None)
+        msgs = updated.get("messages", [])
+        msgs.append({
+            "role": "user",
+            "content": (
+                "Tool-call generation has failed repeatedly. Respond with plain text only. "
+                "Summarize what you have accomplished and what remains to be done."
+            ),
+        })
+        return updated
     messages = anthropic_body.get("messages", [])
     keep_last = max(2, PROXY_SESSION_CONTAMINATION_KEEP_LAST)
     if len(messages) <= keep_last + 1:

package/tools/agents/tests/test_anthropic_proxy_streaming.py CHANGED

@@ -4406,3 +4406,259 @@ class TestReadOnlyCycleClassExclusion(unittest.TestCase):
         dup, _ = monitor.has_duplicate_read_target(threshold=3)
         self.assertFalse(dup)
+class TestPersistentCycleExclusion(unittest.TestCase):
+    """Tests for Cycle 14: persistent exclusion, escalating hints, and
+    exclusion across review→act transitions."""
+    def _make_body_with_tools(self, tool_names, active_tool="bash", active_input=None):  # fixture: request body whose history holds one tool_use/tool_result exchange
+        tools = [
+            {"name": n, "description": f"{n} tool", "input_schema": {"type": "object"}}
+            for n in tool_names
+        ]
+        inp = active_input or {"command": "ls"}  # falls back to a simple "ls" command input
+        return {
+            "model": "test",
+            "messages": [
+                {"role": "user", "content": "do something"},
+                {
+                    "role": "assistant",
+                    "content": [
+                        {"type": "tool_use", "id": "t1", "name": active_tool, "input": inp}
+                    ],
+                },
+                {
+                    "role": "user",
+                    "content": [
+                        {"type": "tool_result", "tool_use_id": "t1", "content": "ok"}  # answers the tool_use with id "t1" above
+                    ],
+                },
+            ],
+            "tools": tools,
+        }
+    def test_exclusion_persists_through_act_phase(self):
+        """Option 1: cycling_tool_names exclusion persists in act phase after review."""
+        old_vals = {}  # original module settings, restored in the finally block
+        for k in ["PROXY_TOOL_STATE_MACHINE", "PROXY_TOOL_STATE_MIN_MESSAGES",
+                   "PROXY_TOOL_STATE_FORCED_BUDGET", "PROXY_TOOL_STATE_CYCLE_WINDOW",
+                   "PROXY_TOOL_STATE_STAGNATION_THRESHOLD"]:
+            old_vals[k] = getattr(proxy, k)
+        try:
+            setattr(proxy, "PROXY_TOOL_STATE_MACHINE", True)
+            setattr(proxy, "PROXY_TOOL_STATE_MIN_MESSAGES", 3)
+            setattr(proxy, "PROXY_TOOL_STATE_FORCED_BUDGET", 6)
+            setattr(proxy, "PROXY_TOOL_STATE_CYCLE_WINDOW", 3)
+            setattr(proxy, "PROXY_TOOL_STATE_STAGNATION_THRESHOLD", 2)
+            all_tools = ["bash", "read", "write", "edit"]
+            body = self._make_body_with_tools(all_tools)
+            monitor = proxy.SessionMonitor(context_window=262144)
+            # Simulate bash cycling that triggers review
+            monitor.cycling_tool_names = ["bash"]
+            monitor.tool_turn_phase = "act"  # the phase under test is "act", not "review"
+            monitor.tool_state_forced_budget_remaining = 5
+            openai = proxy.build_openai_request(body, monitor)
+            # In act phase with cycling_tool_names set, bash should be excluded
+            remaining = [t["function"]["name"] for t in openai.get("tools", [])]
+            self.assertNotIn("bash", remaining)
+            self.assertIn("read", remaining)
+            self.assertIn("write", remaining)
+        finally:  # undo the setattr overrides even when an assertion fails
+            for k, v in old_vals.items():
+                setattr(proxy, k, v)
+    def test_exclusion_cleared_on_stagnation_clear(self):
+        """Option 1: cycling exclusion is lifted when stagnation clears in review."""
+        monitor = proxy.SessionMonitor(context_window=262144)
+        monitor.tool_turn_phase = "review"
+        monitor.tool_state_review_cycles = 1
+        monitor.tool_state_stagnation_streak = 0  # stagnation cleared
+        monitor.cycling_tool_names = ["bash"]  # expected to be emptied by the call below
+        monitor.tool_state_auto_budget_remaining = 0
+        monitor.tool_state_forced_budget_remaining = 6
+        # This should transition review→act and clear cycling names
+        old_vals = {}  # original module settings, restored in the finally block
+        for k in ["PROXY_TOOL_STATE_MACHINE", "PROXY_TOOL_STATE_MIN_MESSAGES",
+                   "PROXY_TOOL_STATE_FORCED_BUDGET"]:
+            old_vals[k] = getattr(proxy, k)
+        try:
+            setattr(proxy, "PROXY_TOOL_STATE_MACHINE", True)
+            setattr(proxy, "PROXY_TOOL_STATE_MIN_MESSAGES", 3)
+            setattr(proxy, "PROXY_TOOL_STATE_FORCED_BUDGET", 6)
+            body = self._make_body_with_tools(["bash", "read", "write"])
+            proxy.build_openai_request(body, monitor)  # return value unused; only the monitor's state is asserted
+            self.assertEqual(monitor.tool_turn_phase, "act")
+            self.assertEqual(monitor.cycling_tool_names, [])
+        finally:  # undo the setattr overrides even when an assertion fails
+            for k, v in old_vals.items():
+                setattr(proxy, k, v)
+    def test_escalated_hint_on_cycle_2(self):
+        """Option 3: cycle 2+ gets escalated CRITICAL hint text."""
+        old_vals = {}  # original module settings, restored in the finally block
+        for k in ["PROXY_TOOL_STATE_MACHINE", "PROXY_TOOL_STATE_MIN_MESSAGES",
+                   "PROXY_TOOL_STATE_FORCED_BUDGET", "PROXY_TOOL_STATE_CYCLE_WINDOW",
+                   "PROXY_TOOL_STATE_STAGNATION_THRESHOLD"]:
+            old_vals[k] = getattr(proxy, k)
+        try:
+            setattr(proxy, "PROXY_TOOL_STATE_MACHINE", True)
+            setattr(proxy, "PROXY_TOOL_STATE_MIN_MESSAGES", 3)
+            setattr(proxy, "PROXY_TOOL_STATE_FORCED_BUDGET", 20)
+            setattr(proxy, "PROXY_TOOL_STATE_CYCLE_WINDOW", 3)
+            setattr(proxy, "PROXY_TOOL_STATE_STAGNATION_THRESHOLD", 2)
+            all_tools = ["bash", "read", "write"]
+            body = self._make_body_with_tools(all_tools)
+            monitor = proxy.SessionMonitor(context_window=262144)
+            # Pre-set as if we've already been through 1 review cycle
+            monitor.tool_turn_phase = "act"
+            monitor.tool_state_review_cycles = 1
+            monitor.tool_state_forced_budget_remaining = 20
+            monitor.tool_state_stagnation_streak = 3  # exceeds the stagnation threshold of 2 set above
+            fp = "bash:781c24ad"  # presumably a "<tool>:<hash>" fingerprint of the bash call in the body — TODO confirm
+            monitor.tool_call_history = [fp, fp, fp]  # same fingerprint three times, matching the cycle window of 3
+            monitor.last_tool_fingerprint = fp
+            openai = proxy.build_openai_request(body, monitor)
+            # Should now be in review with cycles=2 and escalated hint
+            self.assertEqual(monitor.tool_turn_phase, "review")
+            self.assertEqual(monitor.tool_state_review_cycles, 2)
+            messages = openai.get("messages", [])
+            last_user = [m for m in messages if m.get("role") == "user"][-1]  # the hint is expected as the final user message
+            self.assertIn("CRITICAL", last_user["content"])
+            self.assertIn("2 review rounds", last_user["content"])
+        finally:  # undo the setattr overrides even when an assertion fails
+            for k, v in old_vals.items():
+                setattr(proxy, k, v)
+    def test_mild_hint_on_cycle_1(self):
+        """Option 3: cycle 1 gets mild hint, not escalated."""
+        old_vals = {}  # original module settings, restored in the finally block
+        for k in ["PROXY_TOOL_STATE_MACHINE", "PROXY_TOOL_STATE_MIN_MESSAGES",
+                   "PROXY_TOOL_STATE_FORCED_BUDGET", "PROXY_TOOL_STATE_CYCLE_WINDOW",
+                   "PROXY_TOOL_STATE_STAGNATION_THRESHOLD"]:
+            old_vals[k] = getattr(proxy, k)
+        try:
+            setattr(proxy, "PROXY_TOOL_STATE_MACHINE", True)
+            setattr(proxy, "PROXY_TOOL_STATE_MIN_MESSAGES", 3)
+            setattr(proxy, "PROXY_TOOL_STATE_FORCED_BUDGET", 20)
+            setattr(proxy, "PROXY_TOOL_STATE_CYCLE_WINDOW", 3)
+            setattr(proxy, "PROXY_TOOL_STATE_STAGNATION_THRESHOLD", 2)
+            body = self._make_body_with_tools(["bash", "read", "write"])
+            monitor = proxy.SessionMonitor(context_window=262144)
+            monitor.tool_turn_phase = "act"
+            monitor.tool_state_review_cycles = 0  # no earlier review cycle; the test expects this to become 1
+            monitor.tool_state_forced_budget_remaining = 20
+            monitor.tool_state_stagnation_streak = 3  # exceeds the stagnation threshold of 2 set above
+            fp = "bash:781c24ad"  # presumably a "<tool>:<hash>" fingerprint of the bash call in the body — TODO confirm
+            monitor.tool_call_history = [fp, fp, fp]  # same fingerprint three times, matching the cycle window of 3
+            monitor.last_tool_fingerprint = fp
+            openai = proxy.build_openai_request(body, monitor)
+            self.assertEqual(monitor.tool_turn_phase, "review")
+            self.assertEqual(monitor.tool_state_review_cycles, 1)
+            messages = openai.get("messages", [])
+            last_user = [m for m in messages if m.get("role") == "user"][-1]  # the hint is expected as the final user message
+            self.assertNotIn("CRITICAL", last_user["content"])
+            self.assertIn("DIFFERENT tool", last_user["content"])
+        finally:  # undo the setattr overrides even when an assertion fails
+            for k, v in old_vals.items():
+                setattr(proxy, k, v)
+class TestMalformedPayloadLoopFix(unittest.TestCase):
+    """Tests for Cycle 15: malformed payload loop breaking."""
+    def test_contamination_loop_forces_finalize(self):
+        """Option 3: after 3+ contamination resets, force finalize."""
+        monitor = proxy.SessionMonitor(context_window=262144)
+        monitor.contamination_resets = 3  # already hit 3 resets
+        monitor.malformed_tool_streak = 3  # triggers should_reset
+        body = {
+            "model": "test",
+            "messages": [
+                {"role": "user", "content": "do something"},
+                {"role": "assistant", "content": "ok"},
+                {"role": "user", "content": "continue"},
+            ],
+            "tools": [
+                {"name": "bash", "description": "Run", "input_schema": {"type": "object"}},
+            ],
+        }
+        result = proxy._maybe_apply_session_contamination_breaker(
+            body, monitor, "test-session"
+        )
+        # Should have removed tools and forced finalize
+        self.assertNotIn("tools", result)
+        self.assertNotIn("tool_choice", result)  # the input body never had tool_choice, so this only guards against it being added
+        self.assertEqual(monitor.tool_turn_phase, "finalize")
+        # Check finalize instruction was injected
+        last_msg = result["messages"][-1]
+        self.assertIn("plain text only", last_msg["content"])
+    def test_contamination_below_threshold_resets_normally(self):
+        """Below 3 contamination resets, normal reset behavior."""
+        monitor = proxy.SessionMonitor(context_window=262144)
+        monitor.contamination_resets = 1  # below the forced-finalize limit of 3
+        monitor.malformed_tool_streak = 3  # same streak value the finalize test uses to trigger a reset
+        # Need enough messages (> keep_last + 1) for full reset path
+        msgs = [{"role": "user", "content": "start"}]
+        for i in range(20):  # 20 assistant/user pairs, 41 messages including "start"
+            msgs.append({"role": "assistant", "content": f"resp {i}"})
+            msgs.append({"role": "user", "content": f"msg {i}"})
+        body = {
+            "model": "test",
+            "messages": msgs,
+            "tools": [
+                {"name": "bash", "description": "Run", "input_schema": {"type": "object"}},
+            ],
+        }
+        result = proxy._maybe_apply_session_contamination_breaker(
+            body, monitor, "test-session"
+        )
+        # Should have done normal reset (increment contamination_resets)
+        self.assertEqual(monitor.contamination_resets, 2)
+        self.assertEqual(monitor.tool_turn_phase, "bootstrap")
+    def test_post_contamination_temp_lowered(self):
+        """Option 2: temperature lowered to 0.1 after contamination reset."""
+        monitor = proxy.SessionMonitor(context_window=262144)
+        monitor.contamination_resets = 1  # has had a reset
+        body = {
+            "model": "test",
+            "messages": [{"role": "user", "content": "test"}],
+            "tools": [
+                {"name": "bash", "description": "Run", "input_schema": {"type": "object"}},
+            ],
+        }
+        openai = proxy.build_openai_request(body, monitor)
+        self.assertLessEqual(openai.get("temperature", 1.0), 0.1)  # the 1.0 default makes a missing temperature fail the check
+    def test_normal_temp_without_contamination(self):
+        """Without contamination resets, normal tool temp (0.3) is used."""
+        monitor = proxy.SessionMonitor(context_window=262144)
+        monitor.contamination_resets = 0  # explicit baseline: no resets have happened
+        body = {
+            "model": "test",
+            "messages": [{"role": "user", "content": "test"}],
+            "tools": [
+                {"name": "bash", "description": "Run", "input_schema": {"type": "object"}},
+            ],
+        }
+        openai = proxy.build_openai_request(body, monitor)
+        self.assertAlmostEqual(openai.get("temperature", 1.0), 0.3, places=1)  # 0.3 presumably mirrors PROXY_TOOL_TURN_TEMPERATURE — TODO confirm