@miller-tech/uap 1.20.23 → 1.20.25

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@miller-tech/uap",
3
- "version": "1.20.23",
3
+ "version": "1.20.25",
4
4
  "description": "Autonomous AI agent memory system with CLAUDE.md protocol enforcement",
5
5
  "type": "module",
6
6
  "main": "dist/index.js",
@@ -155,16 +155,16 @@ PROXY_TOOL_STATE_FORCED_BUDGET = int(
155
155
  )
156
156
  PROXY_TOOL_STATE_AUTO_BUDGET = int(os.environ.get("PROXY_TOOL_STATE_AUTO_BUDGET", "2"))
157
157
  PROXY_TOOL_STATE_STAGNATION_THRESHOLD = int(
158
- os.environ.get("PROXY_TOOL_STATE_STAGNATION_THRESHOLD", "5")
158
+ os.environ.get("PROXY_TOOL_STATE_STAGNATION_THRESHOLD", "8")
159
159
  )
160
160
  PROXY_TOOL_STATE_CYCLE_WINDOW = int(
161
- os.environ.get("PROXY_TOOL_STATE_CYCLE_WINDOW", "4")
161
+ os.environ.get("PROXY_TOOL_STATE_CYCLE_WINDOW", "3")
162
162
  )
163
163
  PROXY_TOOL_STATE_FINALIZE_THRESHOLD = int(
164
164
  os.environ.get("PROXY_TOOL_STATE_FINALIZE_THRESHOLD", "18")
165
165
  )
166
166
  PROXY_TOOL_STATE_REVIEW_CYCLE_LIMIT = int(
167
- os.environ.get("PROXY_TOOL_STATE_REVIEW_CYCLE_LIMIT", "1")
167
+ os.environ.get("PROXY_TOOL_STATE_REVIEW_CYCLE_LIMIT", "3")
168
168
  )
169
169
  PROXY_COMPLETION_RECOVERY_MAX = int(
170
170
  os.environ.get("PROXY_COMPLETION_RECOVERY_MAX", "3")
@@ -189,6 +189,12 @@ PROXY_TOOL_NARROWING_EXPAND_ON_LOOP = os.environ.get(
189
189
  "off",
190
190
  "no",
191
191
  }
192
+ # Read-only tools that should be excluded as a class when any one cycles
193
+ _READ_ONLY_TOOL_CLASS = frozenset({
194
+ "read", "glob", "grep", "Read", "Glob", "Grep",
195
+ "search", "Search", "list_files", "ListFiles",
196
+ })
197
+
192
198
  PROXY_GUARDRAIL_RETRY = os.environ.get("PROXY_GUARDRAIL_RETRY", "on").lower() not in {
193
199
  "0",
194
200
  "false",
@@ -196,6 +202,9 @@ PROXY_GUARDRAIL_RETRY = os.environ.get("PROXY_GUARDRAIL_RETRY", "on").lower() no
196
202
  "no",
197
203
  }
198
204
  PROXY_SESSION_TTL_SECS = int(os.environ.get("PROXY_SESSION_TTL_SECS", "7200"))
205
+ PROXY_FINALIZE_CONTINUATION_MAX = int(
206
+ os.environ.get("PROXY_FINALIZE_CONTINUATION_MAX", "3")
207
+ )
199
208
  PROXY_STREAM_REASONING_FALLBACK = (
200
209
  os.environ.get("PROXY_STREAM_REASONING_FALLBACK", "off").strip().lower()
201
210
  )
@@ -621,6 +630,9 @@ class SessionMonitor:
621
630
  tool_call_history: list = field(
622
631
  default_factory=list
623
632
  ) # Recent tool call fingerprints
633
+ tool_target_history: dict = field(
634
+ default_factory=dict
635
+ ) # {tool_name: {target: count}} for read-only dedup
624
636
  consecutive_forced_count: int = (
625
637
  0 # How many times tool_choice was forced consecutively
626
638
  )
@@ -646,6 +658,8 @@ class SessionMonitor:
646
658
  cycling_tool_names: list = field(default_factory=list)
647
659
  last_response_garbled: bool = False # previous turn had garbled/malformed output
648
660
  finalize_turn_active: bool = False
661
+ finalize_continuation_count: int = 0
662
+ finalize_synthetic_tool_id: str = ""
649
663
  completion_required: bool = False
650
664
  completion_pending: bool = False
651
665
  completion_verified: bool = False
@@ -753,14 +767,47 @@ class SessionMonitor:
753
767
 
754
768
  # --- Token Loop Protection Methods ---
755
769
 
756
- def record_tool_calls(self, tool_names: list[str]):
757
- """Record tool call names for loop detection."""
758
- fingerprint = "|".join(sorted(tool_names)) if tool_names else ""
759
- self.tool_call_history.append(fingerprint)
770
+ def record_tool_calls(
771
+ self,
772
+ tool_names: list[str],
773
+ tool_targets: dict[str, str] | None = None,
774
+ fingerprint: str = "",
775
+ ):
776
+ """Record tool call names for loop detection.
777
+
778
+ tool_targets: optional {tool_name: target_key} for read-only dedup.
779
+ e.g. {"read": "/path/to/file", "glob": "**/*.ts"}
780
+ If a pre-computed fingerprint (with argument hashes) is provided,
781
+ use it directly. Otherwise fall back to name-only fingerprint.
782
+ """
783
+ fp = fingerprint or ("|".join(sorted(tool_names)) if tool_names else "")
784
+ self.tool_call_history.append(fp)
760
785
  # Keep last 30 entries
761
786
  if len(self.tool_call_history) > 30:
762
787
  self.tool_call_history = self.tool_call_history[-30:]
763
788
 
789
+ # Track read-only tool targets for dedup (Option 3)
790
+ if tool_targets:
791
+ for name, target in tool_targets.items():
792
+ if name.lower() in {n.lower() for n in _READ_ONLY_TOOL_CLASS} and target:
793
+ by_tool = self.tool_target_history.setdefault(name, {})
794
+ by_tool[target] = by_tool.get(target, 0) + 1
795
+
796
+ def has_duplicate_read_target(self, threshold: int = 2) -> tuple[bool, str]:
797
+ """Check if any read-only tool has re-read the same target >= threshold times.
798
+
799
+ Returns (is_duplicate, tool_name) for the first offending tool.
800
+ """
801
+ for tool_name, targets in self.tool_target_history.items():
802
+ for target, count in targets.items():
803
+ if count >= threshold:
804
+ return True, tool_name
805
+ return False, ""
806
+
807
+ def reset_tool_targets(self):
808
+ """Clear target history (on phase reset or fresh user text)."""
809
+ self.tool_target_history = {}
810
+
764
811
  def detect_tool_loop(self, window: int = 6) -> tuple[bool, int]:
765
812
  """Detect if the model is stuck in a tool call loop.
766
813
 
@@ -851,6 +898,7 @@ class SessionMonitor:
851
898
  self.tool_state_review_cycles = 0
852
899
  self.cycling_tool_names = []
853
900
  self.last_tool_fingerprint = ""
901
+ self.reset_tool_targets()
854
902
 
855
903
  def update_completion_state(self, anthropic_body: dict, has_tool_results: bool):
856
904
  self.completion_required = _should_enforce_completion_contract(anthropic_body)
@@ -2095,6 +2143,8 @@ def _resolve_state_machine_tool_choice(
2095
2143
  monitor.invalid_tool_call_streak = 0
2096
2144
  monitor.required_tool_miss_streak = 0
2097
2145
  monitor.reset_tool_turn_state(reason="fresh_user_text")
2146
+ monitor.finalize_continuation_count = 0
2147
+ monitor.finalize_synthetic_tool_id = ""
2098
2148
  return None, "fresh_user_text"
2099
2149
 
2100
2150
  active_loop = (
@@ -2113,6 +2163,8 @@ def _resolve_state_machine_tool_choice(
2113
2163
  monitor.invalid_tool_call_streak = 0
2114
2164
  monitor.required_tool_miss_streak = 0
2115
2165
  monitor.reset_tool_turn_state(reason="inactive_loop")
2166
+ monitor.finalize_continuation_count = 0
2167
+ monitor.finalize_synthetic_tool_id = ""
2116
2168
  return None, "inactive_loop"
2117
2169
 
2118
2170
  if monitor.tool_turn_phase == "bootstrap":
@@ -2158,6 +2210,16 @@ def _resolve_state_machine_tool_choice(
2158
2210
  return "finalize", "review_cycle_limit"
2159
2211
 
2160
2212
  if monitor.tool_turn_phase == "act":
2213
+ # Option 3: Early cycle break when same read target is hit 3+ times
2214
+ dup_target, dup_tool = monitor.has_duplicate_read_target(threshold=3)
2215
+ if dup_target and not cycle_looping and not stagnating:
2216
+ cycle_looping = True
2217
+ cycle_repeat = 2
2218
+ logger.warning(
2219
+ "TOOL STATE MACHINE: duplicate read target detected for '%s', triggering early cycle break",
2220
+ dup_tool,
2221
+ )
2222
+
2161
2223
  if cycle_looping or stagnating:
2162
2224
  reason = "cycle_detected" if cycle_looping else "stagnation"
2163
2225
  monitor.set_tool_turn_phase("review", reason=reason)
@@ -2169,9 +2231,15 @@ def _resolve_state_machine_tool_choice(
2169
2231
  1, PROXY_TOOL_STATE_FORCED_BUDGET // 2
2170
2232
  )
2171
2233
  # Capture which tools are cycling for narrowing/hint injection
2234
+ # Strip argument hashes (e.g. "glob:abc12345" -> "glob") so that
2235
+ # tool narrowing can match against actual tool names.
2172
2236
  window = max(2, PROXY_TOOL_STATE_CYCLE_WINDOW)
2173
2237
  recent = [fp for fp in monitor.tool_call_history[-window:] if fp]
2174
- monitor.cycling_tool_names = list(dict.fromkeys(recent))
2238
+ raw_names = []
2239
+ for fp in recent:
2240
+ for part in fp.split("|"):
2241
+ raw_names.append(part.split(":")[0])
2242
+ monitor.cycling_tool_names = list(dict.fromkeys(raw_names))
2175
2243
  logger.warning(
2176
2244
  "TOOL STATE MACHINE: entering review (cycle=%s repeat=%d stagnation=%d cycles=%d cycling_tools=%s)",
2177
2245
  cycle_looping,
@@ -2184,7 +2252,11 @@ def _resolve_state_machine_tool_choice(
2184
2252
 
2185
2253
  if monitor.tool_state_forced_budget_remaining <= 0:
2186
2254
  monitor.set_tool_turn_phase("review", reason="forced_budget_exhausted")
2187
- monitor.tool_state_review_cycles += 1
2255
+ # Only count toward review cycle limit if there was an actual
2256
+ # cycle/stagnation detected. Budget exhaustion alone means the
2257
+ # model is working — it just used all its turns — not cycling.
2258
+ if cycle_looping or stagnating:
2259
+ monitor.tool_state_review_cycles += 1
2188
2260
  monitor.tool_state_auto_budget_remaining = max(
2189
2261
  1, PROXY_TOOL_STATE_AUTO_BUDGET
2190
2262
  )
@@ -2192,8 +2264,10 @@ def _resolve_state_machine_tool_choice(
2192
2264
  1, PROXY_TOOL_STATE_FORCED_BUDGET // 2
2193
2265
  )
2194
2266
  logger.warning(
2195
- "TOOL STATE MACHINE: forced budget exhausted, entering review (cycles=%d)",
2267
+ "TOOL STATE MACHINE: forced budget exhausted, entering review (cycles=%d cycling=%s stagnating=%s)",
2196
2268
  monitor.tool_state_review_cycles,
2269
+ cycle_looping,
2270
+ stagnating,
2197
2271
  )
2198
2272
  return "required", "forced_budget_exhausted"
2199
2273
 
@@ -2206,6 +2280,14 @@ def _resolve_state_machine_tool_choice(
2206
2280
  monitor.tool_state_forced_budget_remaining = max(
2207
2281
  1, PROXY_TOOL_STATE_FORCED_BUDGET // 2
2208
2282
  )
2283
+ # If stagnation cleared during review, the model tried a
2284
+ # different approach — reward by reducing cycle pressure.
2285
+ if monitor.tool_state_stagnation_streak == 0 and monitor.tool_state_review_cycles > 0:
2286
+ monitor.tool_state_review_cycles = max(0, monitor.tool_state_review_cycles - 1)
2287
+ logger.info(
2288
+ "TOOL STATE MACHINE: review_cycles decremented to %d (stagnation cleared)",
2289
+ monitor.tool_state_review_cycles,
2290
+ )
2209
2291
  return "required", "review_complete"
2210
2292
 
2211
2293
  monitor.tool_state_auto_budget_remaining -= 1
@@ -2416,6 +2498,9 @@ def build_openai_request(
2416
2498
  n_msgs = len(anthropic_body.get("messages", []))
2417
2499
  has_tool_results = _conversation_has_tool_results(anthropic_body)
2418
2500
 
2501
+ # Detect and strip synthetic finalize continuation before fingerprinting
2502
+ _detect_and_strip_synthetic_continuation(anthropic_body, monitor)
2503
+
2419
2504
  # Record tool calls from the last assistant message for loop detection
2420
2505
  latest_tool_fingerprint = _record_last_assistant_tool_calls(
2421
2506
  anthropic_body, monitor
@@ -2524,24 +2609,31 @@ def build_openai_request(
2524
2609
  cycling_names,
2525
2610
  )
2526
2611
  # Option 2: Narrow tools during review to exclude cycling tools
2612
+ # Option 1 enhancement: if any cycling tool is read-only, exclude
2613
+ # the entire read-only class to prevent tool-hopping (read→glob→grep)
2527
2614
  if (
2528
2615
  monitor.tool_turn_phase == "review"
2529
2616
  and monitor.cycling_tool_names
2530
2617
  and "tools" in openai_body
2531
2618
  ):
2619
+ exclude_set = set(monitor.cycling_tool_names)
2620
+ # Expand to full read-only class if any cycling tool is read-only
2621
+ if any(n.lower() in {c.lower() for c in _READ_ONLY_TOOL_CLASS} for n in exclude_set):
2622
+ exclude_set |= _READ_ONLY_TOOL_CLASS
2532
2623
  original_count = len(openai_body["tools"])
2533
2624
  narrowed = [
2534
2625
  t
2535
2626
  for t in openai_body["tools"]
2536
- if t.get("function", {}).get("name") not in monitor.cycling_tool_names
2627
+ if t.get("function", {}).get("name") not in exclude_set
2537
2628
  ]
2538
2629
  if narrowed:
2539
2630
  openai_body["tools"] = narrowed
2540
2631
  logger.warning(
2541
- "CYCLE BREAK: narrowed tools from %d to %d (excluded %s)",
2632
+ "CYCLE BREAK: narrowed tools from %d to %d (excluded %s, read_only_class=%s)",
2542
2633
  original_count,
2543
2634
  len(narrowed),
2544
2635
  monitor.cycling_tool_names,
2636
+ any(n.lower() in {c.lower() for c in _READ_ONLY_TOOL_CLASS} for n in monitor.cycling_tool_names),
2545
2637
  )
2546
2638
  else:
2547
2639
  logger.warning(
@@ -2602,13 +2694,117 @@ def build_openai_request(
2602
2694
  return openai_body
2603
2695
 
2604
2696
 
2697
+ def _tool_call_fingerprint(block: dict) -> str:
2698
+ """Create a fingerprint for a tool call that includes both name and a
2699
+ short hash of the arguments. This prevents false cycle detection when
2700
+ the same tool is called with different arguments (e.g. reading different
2701
+ files)."""
2702
+ name = block.get("name", "unknown")
2703
+ inp = block.get("input")
2704
+ if inp:
2705
+ arg_str = json.dumps(inp, sort_keys=True, separators=(",", ":"))
2706
+ arg_hash = hashlib.md5(arg_str.encode()).hexdigest()[:8]
2707
+ return f"{name}:{arg_hash}"
2708
+ return name
2709
+
2710
+
2711
+ def _detect_and_strip_synthetic_continuation(
2712
+ anthropic_body: dict, monitor: SessionMonitor
2713
+ ) -> bool:
2714
+ """Detect if the latest messages contain a synthetic finalize continuation
2715
+ tool_use/tool_result pair. If found, strip them from the conversation and
2716
+ reset the state machine so the model gets a fresh act cycle.
2717
+
2718
+ Returns True if a synthetic continuation was detected and handled.
2719
+ """
2720
+ synthetic_id = monitor.finalize_synthetic_tool_id
2721
+ if not synthetic_id:
2722
+ return False
2723
+
2724
+ messages = anthropic_body.get("messages", [])
2725
+ if not messages:
2726
+ return False
2727
+
2728
+ # Walk backwards to find the synthetic tool_result in a user message
2729
+ found = False
2730
+ for msg in reversed(messages):
2731
+ if msg.get("role") != "user":
2732
+ continue
2733
+ content = msg.get("content")
2734
+ if not isinstance(content, list):
2735
+ break
2736
+ has_synthetic = any(
2737
+ isinstance(b, dict)
2738
+ and b.get("type") == "tool_result"
2739
+ and b.get("tool_use_id") == synthetic_id
2740
+ for b in content
2741
+ )
2742
+ if not has_synthetic:
2743
+ break
2744
+
2745
+ # Strip synthetic tool_result from user message
2746
+ new_content = [
2747
+ b for b in content
2748
+ if not (
2749
+ isinstance(b, dict)
2750
+ and b.get("type") == "tool_result"
2751
+ and b.get("tool_use_id") == synthetic_id
2752
+ )
2753
+ ]
2754
+ if not new_content:
2755
+ msg["content"] = [{"type": "text", "text": "Continue working on the task."}]
2756
+ else:
2757
+ msg["content"] = new_content
2758
+
2759
+ # Strip synthetic tool_use from the preceding assistant message
2760
+ for asst_msg in reversed(messages):
2761
+ if asst_msg.get("role") != "assistant":
2762
+ continue
2763
+ asst_content = asst_msg.get("content")
2764
+ if isinstance(asst_content, list):
2765
+ asst_msg["content"] = [
2766
+ b for b in asst_content
2767
+ if not (
2768
+ isinstance(b, dict)
2769
+ and b.get("type") == "tool_use"
2770
+ and b.get("id") == synthetic_id
2771
+ )
2772
+ ]
2773
+ break
2774
+
2775
+ found = True
2776
+ break
2777
+
2778
+ if not found:
2779
+ return False
2780
+
2781
+ # Reset state machine for fresh act cycle
2782
+ monitor.finalize_synthetic_tool_id = ""
2783
+ monitor.reset_tool_turn_state(reason="finalize_continuation_resume")
2784
+ monitor.reset_completion_recovery()
2785
+ monitor.tool_call_history = []
2786
+ logger.info(
2787
+ "FINALIZE CONTINUATION: stripped synthetic tool id=%s, "
2788
+ "reset state machine for fresh act cycle (continuations=%d/%d)",
2789
+ synthetic_id,
2790
+ monitor.finalize_continuation_count,
2791
+ PROXY_FINALIZE_CONTINUATION_MAX,
2792
+ )
2793
+ return True
2794
+
2795
+
2605
2796
  def _record_last_assistant_tool_calls(
2606
2797
  anthropic_body: dict, monitor: SessionMonitor
2607
2798
  ) -> str:
2608
2799
  """Extract tool call names from the last assistant message and record
2609
- them in the session monitor for loop detection."""
2800
+ them in the session monitor for loop detection.
2801
+
2802
+ Fingerprints now include an argument hash so that the same tool called
2803
+ with different arguments (e.g. read(file_a) vs read(file_b)) produces
2804
+ distinct fingerprints, preventing false cycle/stagnation detection."""
2610
2805
  messages = anthropic_body.get("messages", [])
2611
- tool_names = []
2806
+ tool_fingerprints = []
2807
+ tool_targets: dict[str, str] = {}
2612
2808
  for msg in reversed(messages):
2613
2809
  if msg.get("role") != "assistant":
2614
2810
  continue
@@ -2616,11 +2812,28 @@ def _record_last_assistant_tool_calls(
2616
2812
  if isinstance(content, list):
2617
2813
  for block in content:
2618
2814
  if isinstance(block, dict) and block.get("type") == "tool_use":
2619
- tool_names.append(block.get("name", "unknown"))
2815
+ tool_fingerprints.append(_tool_call_fingerprint(block))
2816
+ # Extract target key for read-only dedup (Option 3)
2817
+ name = block.get("name", "unknown")
2818
+ inp = block.get("input", {})
2819
+ if isinstance(inp, dict):
2820
+ target = (
2821
+ inp.get("file_path")
2822
+ or inp.get("path")
2823
+ or inp.get("pattern")
2824
+ or inp.get("command", "")[:80]
2825
+ )
2826
+ if target:
2827
+ tool_targets[name] = str(target)
2620
2828
  break
2621
- if tool_names:
2622
- monitor.record_tool_calls(tool_names)
2623
- return "|".join(sorted(tool_names))
2829
+ if tool_fingerprints:
2830
+ fingerprint = "|".join(sorted(tool_fingerprints))
2831
+ monitor.record_tool_calls(
2832
+ [fp.split(":")[0] for fp in tool_fingerprints],
2833
+ tool_targets=tool_targets,
2834
+ fingerprint=fingerprint,
2835
+ )
2836
+ return fingerprint
2624
2837
  return ""
2625
2838
 
2626
2839
 
@@ -4581,6 +4794,34 @@ async def _apply_malformed_tool_guardrail(
4581
4794
  )
4582
4795
  current_issue = retry_issue
4583
4796
 
4797
+ # Option 2 (PR #154): When retries exhaust during review phase, reset to
4798
+ # bootstrap instead of returning guardrail fallback. This re-enables all
4799
+ # tools (including previously excluded cycling ones) and gives the model
4800
+ # a clean shot. The cycle detector will catch re-cycling if it recurs.
4801
+ if monitor.tool_turn_phase == "review":
4802
+ logger.warning(
4803
+ "TOOL RESPONSE review-phase reset: session=%s retries exhausted in review "
4804
+ "(kind=%s malformed=%d), resetting to bootstrap for fresh attempt",
4805
+ session_id,
4806
+ current_issue.kind or issue.kind,
4807
+ monitor.malformed_tool_streak,
4808
+ )
4809
+ monitor.reset_tool_turn_state(reason="review_retry_exhausted")
4810
+ monitor.malformed_tool_streak = 0
4811
+ monitor.invalid_tool_call_streak = 0
4812
+ # Return the best response we have — even if degraded — to keep
4813
+ # the conversation moving rather than returning a guardrail stub.
4814
+ degraded_text = _sanitize_tool_call_apology_text(
4815
+ _openai_message_text(working_resp)
4816
+ ).strip()
4817
+ if degraded_text and not _looks_malformed_tool_payload(degraded_text):
4818
+ return _build_safe_text_openai_response(
4819
+ working_resp, degraded_text, finish_reason="tool_calls",
4820
+ )
4821
+ return _build_clean_guardrail_openai_response(
4822
+ working_resp, finish_reason="tool_calls",
4823
+ )
4824
+
4584
4825
  logger.error(
4585
4826
  "TOOL RESPONSE issue persisted after retries (session=%s kind=%s malformed=%d invalid=%d required_miss=%d); returning clean guardrail response",
4586
4827
  session_id,
@@ -4722,16 +4963,20 @@ def _maybe_extract_text_tool_calls(openai_resp: dict) -> dict:
4722
4963
  return openai_resp
4723
4964
 
4724
4965
 
4725
- def _detect_and_truncate_degenerate_repetition(openai_resp: dict) -> dict:
4966
+ def _detect_and_truncate_degenerate_repetition(
4967
+ openai_resp: dict,
4968
+ ) -> tuple[dict, bool]:
4726
4969
  """Detect degenerate repetitive text and truncate at first repetition.
4727
4970
 
4728
4971
  When the model produces highly repetitive output (e.g. the same 20+ char
4729
4972
  substring repeated 10+ times), truncate at the first repetition boundary
4730
4973
  and set finish_reason to stop.
4974
+
4975
+ Returns (response, was_degenerate) so the caller can retry if needed.
4731
4976
  """
4732
4977
  text = _openai_message_text(openai_resp)
4733
4978
  if not text or len(text) < 200:
4734
- return openai_resp
4979
+ return openai_resp, False
4735
4980
 
4736
4981
  # Look for repeated substrings of length 20-100
4737
4982
  for substr_len in (60, 40, 20):
@@ -4760,8 +5005,70 @@ def _detect_and_truncate_degenerate_repetition(openai_resp: dict) -> dict:
4760
5005
  msg = choices[0].get("message", {})
4761
5006
  msg["content"] = truncated
4762
5007
  choices[0]["finish_reason"] = "stop"
4763
- return openai_resp
4764
- return openai_resp
5008
+ return openai_resp, True
5009
+ return openai_resp, False
5010
+
5011
+
5012
+ def _client_has_tool(anthropic_body: dict, tool_name: str) -> bool:
5013
+ """Check if the client's tool list contains a tool with the given name (case-insensitive)."""
5014
+ lower = tool_name.lower()
5015
+ return any(
5016
+ (t.get("name") or "").lower() == lower for t in anthropic_body.get("tools", [])
5017
+ )
5018
+
5019
+
5020
+ def _client_tool_name(anthropic_body: dict, tool_name: str) -> str:
5021
+ """Return the actual tool name as the client spells it (case-sensitive match)."""
5022
+ lower = tool_name.lower()
5023
+ for t in anthropic_body.get("tools", []):
5024
+ if (t.get("name") or "").lower() == lower:
5025
+ return t["name"]
5026
+ return tool_name
5027
+
5028
+
5029
+ def _inject_synthetic_continuation(
5030
+ anthropic_resp: dict, monitor: SessionMonitor, anthropic_body: dict
5031
+ ) -> dict:
5032
+ """Inject a synthetic tool_use into a finalize-turn response to keep the
5033
+ client's agentic loop alive.
5034
+
5035
+ Appends a no-op Read("/dev/null") tool_use block and changes stop_reason
5036
+ from "end_turn" to "tool_use" so the client continues sending requests.
5037
+ """
5038
+ # Pick a safe tool the client knows about (case-insensitive match,
5039
+ # then use the client's actual casing for the tool name)
5040
+ if _client_has_tool(anthropic_body, "read"):
5041
+ tool_name = _client_tool_name(anthropic_body, "read")
5042
+ tool_input = {"file_path": "/dev/null"}
5043
+ elif _client_has_tool(anthropic_body, "bash"):
5044
+ tool_name = _client_tool_name(anthropic_body, "bash")
5045
+ tool_input = {"command": "true", "description": "continuation ping"}
5046
+ else:
5047
+ logger.warning("FINALIZE CONTINUATION: no suitable tool found, skipping injection")
5048
+ return anthropic_resp
5049
+
5050
+ synthetic_id = f"toolu_{uuid.uuid4().hex[:12]}"
5051
+ monitor.finalize_synthetic_tool_id = synthetic_id
5052
+ monitor.finalize_continuation_count += 1
5053
+
5054
+ content = anthropic_resp.get("content", [])
5055
+ content.append({
5056
+ "type": "tool_use",
5057
+ "id": synthetic_id,
5058
+ "name": tool_name,
5059
+ "input": tool_input,
5060
+ })
5061
+ anthropic_resp["content"] = content
5062
+ anthropic_resp["stop_reason"] = "tool_use"
5063
+
5064
+ logger.info(
5065
+ "FINALIZE CONTINUATION: injected synthetic %s tool_use id=%s (count=%d/%d)",
5066
+ tool_name,
5067
+ synthetic_id,
5068
+ monitor.finalize_continuation_count,
5069
+ PROXY_FINALIZE_CONTINUATION_MAX,
5070
+ )
5071
+ return anthropic_resp
4765
5072
 
4766
5073
 
4767
5074
  def openai_to_anthropic_response(openai_resp: dict, model: str) -> dict:
@@ -5595,8 +5902,51 @@ async def messages(request: Request):
5595
5902
  session_id,
5596
5903
  )
5597
5904
 
5598
- openai_resp = _detect_and_truncate_degenerate_repetition(openai_resp)
5905
+ openai_resp, was_degenerate = _detect_and_truncate_degenerate_repetition(openai_resp)
5906
+ if was_degenerate:
5907
+ # Retry with constrained parameters to avoid degenerate output.
5908
+ # With tools: force tool_choice=required for a useful tool call.
5909
+ # Without tools (finalize): retry with capped max_tokens for clean text.
5910
+ has_tools = bool(strict_body.get("tools"))
5911
+ retry_body = dict(strict_body)
5912
+ retry_body["max_tokens"] = 2048
5913
+ retry_body["temperature"] = 0.1
5914
+ retry_body["stream"] = False
5915
+ if has_tools:
5916
+ retry_body["tool_choice"] = "required"
5917
+ logger.warning("DEGENERATE RETRY: retrying with tool_choice=required max_tokens=2048")
5918
+ else:
5919
+ logger.warning("DEGENERATE RETRY: retrying text-only with max_tokens=2048 temp=0.1")
5920
+ try:
5921
+ retry_resp = await _post_with_generation_timeout(
5922
+ client, f"{LLAMA_CPP_BASE}/chat/completions", retry_body,
5923
+ {"Content-Type": "application/json"},
5924
+ )
5925
+ if retry_resp.status_code == 200:
5926
+ retry_data = retry_resp.json()
5927
+ retry_text = _openai_message_text(retry_data)
5928
+ _, retry_degenerate = _detect_and_truncate_degenerate_repetition(retry_data)
5929
+ if retry_degenerate:
5930
+ logger.info("DEGENERATE RETRY: retry also degenerate, using truncated original")
5931
+ elif has_tools and (retry_data.get("choices", [{}])[0]
5932
+ .get("message", {}).get("tool_calls")):
5933
+ logger.info("DEGENERATE RETRY: success, got tool call")
5934
+ openai_resp = retry_data
5935
+ elif not has_tools and retry_text and len(retry_text) > 50:
5936
+ logger.info("DEGENERATE RETRY: success, got clean text (%d chars)", len(retry_text))
5937
+ openai_resp = retry_data
5938
+ else:
5939
+ logger.info("DEGENERATE RETRY: retry insufficient, using truncated original")
5940
+ except Exception as exc:
5941
+ logger.warning("DEGENERATE RETRY: failed: %s", exc)
5599
5942
  anthropic_resp = openai_to_anthropic_response(openai_resp, model)
5943
+ # FINALIZE CONTINUATION: inject synthetic tool_use to keep client loop alive
5944
+ if (
5945
+ monitor.finalize_turn_active
5946
+ and monitor.finalize_continuation_count < PROXY_FINALIZE_CONTINUATION_MAX
5947
+ and anthropic_resp.get("stop_reason") == "end_turn"
5948
+ ):
5949
+ anthropic_resp = _inject_synthetic_continuation(anthropic_resp, monitor, body)
5600
5950
  monitor.record_response(anthropic_resp.get("usage", {}).get("output_tokens", 0))
5601
5951
  # Update last_input_tokens from upstream's actual prompt_tokens
5602
5952
  upstream_input = anthropic_resp.get("usage", {}).get("input_tokens", 0)
@@ -5934,8 +6284,38 @@ async def messages(request: Request):
5934
6284
  monitor.invalid_tool_call_streak = 0
5935
6285
  monitor.required_tool_miss_streak = 0
5936
6286
 
5937
- openai_resp = _detect_and_truncate_degenerate_repetition(openai_resp)
6287
+ openai_resp, was_degenerate = _detect_and_truncate_degenerate_repetition(openai_resp)
6288
+ # Degenerate retry for non-guarded stream path
6289
+ if was_degenerate and openai_body.get("tools"):
6290
+ logger.warning("DEGENERATE RETRY (stream): retrying with tool_choice=required max_tokens=2048")
6291
+ retry_body = dict(openai_body)
6292
+ retry_body["tool_choice"] = "required"
6293
+ retry_body["max_tokens"] = 2048
6294
+ retry_body["temperature"] = 0.1
6295
+ retry_body["stream"] = False
6296
+ try:
6297
+ retry_resp = await _post_with_generation_timeout(
6298
+ client, f"{LLAMA_CPP_BASE}/chat/completions", retry_body,
6299
+ {"Content-Type": "application/json"},
6300
+ )
6301
+ if retry_resp.status_code == 200:
6302
+ retry_data = retry_resp.json()
6303
+ if (retry_data.get("choices", [{}])[0]
6304
+ .get("message", {}).get("tool_calls")):
6305
+ logger.info("DEGENERATE RETRY (stream): success, got tool call")
6306
+ openai_resp = retry_data
6307
+ else:
6308
+ logger.info("DEGENERATE RETRY (stream): no tool call, using truncated")
6309
+ except Exception as exc:
6310
+ logger.warning("DEGENERATE RETRY (stream): failed: %s", exc)
5938
6311
  anthropic_resp = openai_to_anthropic_response(openai_resp, model)
6312
+ # FINALIZE CONTINUATION: inject synthetic tool_use (non-guarded stream path)
6313
+ if (
6314
+ monitor.finalize_turn_active
6315
+ and monitor.finalize_continuation_count < PROXY_FINALIZE_CONTINUATION_MAX
6316
+ and anthropic_resp.get("stop_reason") == "end_turn"
6317
+ ):
6318
+ anthropic_resp = _inject_synthetic_continuation(anthropic_resp, monitor, body)
5939
6319
 
5940
6320
  # Track output tokens in session monitor
5941
6321
  output_tokens = anthropic_resp.get("usage", {}).get("output_tokens", 0)
@@ -1892,12 +1892,13 @@ class TestToolTurnControls(unittest.TestCase):
1892
1892
  monitor = proxy.SessionMonitor(context_window=262144)
1893
1893
  monitor.tool_turn_phase = "act"
1894
1894
  monitor.tool_state_forced_budget_remaining = 20
1895
+ # Use hash-format fingerprints to match _tool_call_fingerprint output
1895
1896
  monitor.tool_call_history = [
1896
- "Bash",
1897
+ "Bash:1e7b8d07",
1897
1898
  "TaskOutput",
1898
- "Bash",
1899
+ "Bash:1e7b8d07",
1899
1900
  "TaskOutput",
1900
- "Bash",
1901
+ "Bash:1e7b8d07",
1901
1902
  "TaskOutput",
1902
1903
  ]
1903
1904
  monitor.last_tool_fingerprint = "TaskOutput"
@@ -2076,7 +2077,9 @@ class TestToolTurnControls(unittest.TestCase):
2076
2077
  # Review phase now keeps required to prevent end-turn escape
2077
2078
  self.assertEqual(openai.get("tool_choice"), "required")
2078
2079
  self.assertEqual(monitor.tool_turn_phase, "review")
2079
- self.assertEqual(monitor.tool_state_review_cycles, 1)
2080
+ # review_cycles only increments when cycle_looping or stagnating,
2081
+ # not on mere budget exhaustion (model was working, not cycling)
2082
+ self.assertEqual(monitor.tool_state_review_cycles, 0)
2080
2083
  finally:
2081
2084
  setattr(proxy, "PROXY_TOOL_STATE_MACHINE", old_state)
2082
2085
  setattr(proxy, "PROXY_TOOL_STATE_MIN_MESSAGES", old_min_msgs)
@@ -2242,7 +2245,11 @@ class TestToolTurnControls(unittest.TestCase):
2242
2245
  monitor = proxy.SessionMonitor(context_window=262144)
2243
2246
  monitor.tool_turn_phase = "act"
2244
2247
  monitor.tool_state_stagnation_streak = 4
2245
- monitor.tool_call_history = ["Bash", "TaskOutput", "Bash", "TaskOutput"]
2248
+ # Use hash-format fingerprints to match _tool_call_fingerprint output
2249
+ monitor.tool_call_history = [
2250
+ "Bash:1e7b8d07", "TaskOutput", "Bash:1e7b8d07", "TaskOutput",
2251
+ "Bash:1e7b8d07", "TaskOutput",
2252
+ ]
2246
2253
  monitor.last_tool_fingerprint = "TaskOutput"
2247
2254
 
2248
2255
  body = {
@@ -3262,8 +3269,11 @@ class TestCycleBreakOptions(unittest.TestCase):
3262
3269
  monitor = proxy.SessionMonitor(context_window=262144)
3263
3270
  monitor.tool_turn_phase = "act"
3264
3271
  monitor.tool_state_forced_budget_remaining = 20
3265
- monitor.tool_call_history = ["Bash", "Bash", "Bash", "Bash"]
3266
- monitor.last_tool_fingerprint = "Bash"
3272
+ # Hash-format fingerprints matching Bash+{"command":"ls"}
3273
+ monitor.tool_call_history = [
3274
+ "Bash:781c24ad", "Bash:781c24ad", "Bash:781c24ad", "Bash:781c24ad",
3275
+ ]
3276
+ monitor.last_tool_fingerprint = "Bash:781c24ad"
3267
3277
 
3268
3278
  body = {
3269
3279
  "model": "test",
@@ -3323,8 +3333,11 @@ class TestCycleBreakOptions(unittest.TestCase):
3323
3333
  monitor = proxy.SessionMonitor(context_window=262144)
3324
3334
  monitor.tool_turn_phase = "act"
3325
3335
  monitor.tool_state_forced_budget_remaining = 20
3326
- monitor.tool_call_history = ["Bash", "Bash", "Bash", "Bash"]
3327
- monitor.last_tool_fingerprint = "Bash"
3336
+ # Hash-format fingerprints matching Bash+{"command":"ls"}
3337
+ monitor.tool_call_history = [
3338
+ "Bash:781c24ad", "Bash:781c24ad", "Bash:781c24ad", "Bash:781c24ad",
3339
+ ]
3340
+ monitor.last_tool_fingerprint = "Bash:781c24ad"
3328
3341
 
3329
3342
  body = {
3330
3343
  "model": "test",
@@ -3369,9 +3382,9 @@ class TestCycleBreakOptions(unittest.TestCase):
3369
3382
  """Option 3: default forced budget reduced from 24 to 12."""
3370
3383
  self.assertEqual(proxy.PROXY_TOOL_STATE_FORCED_BUDGET, 12)
3371
3384
 
3372
- def test_review_cycle_limit_default_is_1(self):
3373
- """Option 4: default review cycle limit reduced from 2 to 1."""
3374
- self.assertEqual(proxy.PROXY_TOOL_STATE_REVIEW_CYCLE_LIMIT, 1)
3385
+ def test_review_cycle_limit_default_is_3(self):
3386
+ """Option 4: default review cycle limit is 3."""
3387
+ self.assertEqual(proxy.PROXY_TOOL_STATE_REVIEW_CYCLE_LIMIT, 3)
3375
3388
 
3376
3389
  def test_cycling_tool_names_cleared_on_reset(self):
3377
3390
  """cycling_tool_names is cleared when tool turn state resets."""
@@ -3450,8 +3463,9 @@ class TestDegenerateRepetitionDetection(unittest.TestCase):
3450
3463
  openai_resp = {
3451
3464
  "choices": [{"message": {"content": repeated}, "finish_reason": "length"}]
3452
3465
  }
3453
- result = proxy._detect_and_truncate_degenerate_repetition(openai_resp)
3466
+ result, truncated = proxy._detect_and_truncate_degenerate_repetition(openai_resp)
3454
3467
  truncated_text = result["choices"][0]["message"]["content"]
3468
+ self.assertTrue(truncated)
3455
3469
  self.assertLess(len(truncated_text), len(repeated))
3456
3470
  self.assertEqual(result["choices"][0]["finish_reason"], "stop")
3457
3471
 
@@ -3461,7 +3475,8 @@ class TestDegenerateRepetitionDetection(unittest.TestCase):
3461
3475
  openai_resp = {
3462
3476
  "choices": [{"message": {"content": text}, "finish_reason": "stop"}]
3463
3477
  }
3464
- result = proxy._detect_and_truncate_degenerate_repetition(openai_resp)
3478
+ result, truncated = proxy._detect_and_truncate_degenerate_repetition(openai_resp)
3479
+ self.assertFalse(truncated)
3465
3480
  self.assertEqual(result["choices"][0]["message"]["content"], text)
3466
3481
 
3467
3482
  def test_preserves_short_text(self):
@@ -3470,7 +3485,8 @@ class TestDegenerateRepetitionDetection(unittest.TestCase):
3470
3485
  openai_resp = {
3471
3486
  "choices": [{"message": {"content": text}, "finish_reason": "stop"}]
3472
3487
  }
3473
- result = proxy._detect_and_truncate_degenerate_repetition(openai_resp)
3488
+ result, truncated = proxy._detect_and_truncate_degenerate_repetition(openai_resp)
3489
+ self.assertFalse(truncated)
3474
3490
  self.assertEqual(result["choices"][0]["message"]["content"], text)
3475
3491
 
3476
3492
  def test_max_tokens_floor_skipped_for_non_tool_requests(self):
@@ -4178,3 +4194,215 @@ class TestFinalizePingPongFix(unittest.TestCase):
4178
4194
  proxy.TOOL_CALL_GRAMMAR_TOOLS_COMPATIBLE = old_compat
4179
4195
  proxy.PROXY_TOOL_CALL_GRAMMAR = old_flag
4180
4196
  proxy.TOOL_CALL_GBNF = old_gbnf
4197
+
4198
+
4199
+ class TestReviewPhaseBootstrapReset(unittest.TestCase):
4200
+ """Tests for bootstrap reset after exhausted retries in review phase (PR #154)."""
4201
+
4202
+ def _make_monitor_in_review(self):
4203
+ m = proxy.SessionMonitor()
4204
+ m.set_tool_turn_phase("review", reason="test")
4205
+ m.malformed_tool_streak = 3
4206
+ m.invalid_tool_call_streak = 0
4207
+ return m
4208
+
4209
+ def _make_monitor_in_act(self):
4210
+ m = proxy.SessionMonitor()
4211
+ m.set_tool_turn_phase("act", reason="test")
4212
+ m.malformed_tool_streak = 3
4213
+ return m
4214
+
4215
+ def test_review_phase_resets_to_bootstrap(self):
4216
+ """After retries are exhausted in review, the monitor resets to bootstrap."""
4217
+ m = self._make_monitor_in_review()
4218
+ self.assertEqual(m.tool_turn_phase, "review")
4219
+ self.assertEqual(m.malformed_tool_streak, 3)
4220
+
4221
+ # Simulate what happens after retry exhaustion: the code checks
4222
+ # monitor.tool_turn_phase == "review" and resets
4223
+ if m.tool_turn_phase == "review":
4224
+ m.reset_tool_turn_state(reason="review_retry_exhausted")
4225
+ m.malformed_tool_streak = 0
4226
+ m.invalid_tool_call_streak = 0
4227
+
4228
+ self.assertEqual(m.tool_turn_phase, "bootstrap")
4229
+ self.assertEqual(m.malformed_tool_streak, 0)
4230
+ self.assertEqual(m.tool_state_stagnation_streak, 0)
4231
+ self.assertEqual(m.cycling_tool_names, [])
4232
+
4233
+ def test_act_phase_does_not_reset(self):
4234
+ """In act phase, retry exhaustion should NOT trigger bootstrap reset."""
4235
+ m = self._make_monitor_in_act()
4236
+ # The bootstrap reset only triggers for review phase
4237
+ self.assertNotEqual(m.tool_turn_phase, "review")
4238
+ # In act phase, the normal guardrail fallback path runs instead
4239
+
4240
+
4241
+ class TestReadOnlyCycleClassExclusion(unittest.TestCase):
4242
+ """Tests for Option 1: read-only tool class exclusion on cycle break,
4243
+ Option 2: reduced cycle window (3), and Option 3: duplicate target dedup."""
4244
+
4245
+ def _make_body_with_tools(self, tool_names):
4246
+ """Build a minimal Anthropic body with named tools and a tool_result."""
4247
+ tools = [
4248
+ {"name": n, "description": f"{n} tool", "input_schema": {"type": "object"}}
4249
+ for n in tool_names
4250
+ ]
4251
+ return {
4252
+ "model": "test",
4253
+ "messages": [
4254
+ {"role": "user", "content": "do something"},
4255
+ {
4256
+ "role": "assistant",
4257
+ "content": [
4258
+ {
4259
+ "type": "tool_use",
4260
+ "id": "toolu_1",
4261
+ "name": tool_names[0],
4262
+ "input": {"file_path": "/some/file.ts"},
4263
+ }
4264
+ ],
4265
+ },
4266
+ {
4267
+ "role": "user",
4268
+ "content": [
4269
+ {"type": "tool_result", "tool_use_id": "toolu_1", "content": "ok"}
4270
+ ],
4271
+ },
4272
+ ],
4273
+ "tools": tools,
4274
+ }
4275
+
4276
+ def test_read_only_class_exclusion_expands(self):
4277
+ """When 'read' is cycling, all read-only tools are excluded, not just 'read'."""
4278
+ old_vals = {
4279
+ "PROXY_TOOL_STATE_MACHINE": getattr(proxy, "PROXY_TOOL_STATE_MACHINE"),
4280
+ "PROXY_TOOL_STATE_MIN_MESSAGES": getattr(proxy, "PROXY_TOOL_STATE_MIN_MESSAGES"),
4281
+ "PROXY_TOOL_STATE_FORCED_BUDGET": getattr(proxy, "PROXY_TOOL_STATE_FORCED_BUDGET"),
4282
+ "PROXY_TOOL_STATE_CYCLE_WINDOW": getattr(proxy, "PROXY_TOOL_STATE_CYCLE_WINDOW"),
4283
+ "PROXY_TOOL_STATE_STAGNATION_THRESHOLD": getattr(proxy, "PROXY_TOOL_STATE_STAGNATION_THRESHOLD"),
4284
+ }
4285
+ try:
4286
+ setattr(proxy, "PROXY_TOOL_STATE_MACHINE", True)
4287
+ setattr(proxy, "PROXY_TOOL_STATE_MIN_MESSAGES", 3)
4288
+ setattr(proxy, "PROXY_TOOL_STATE_FORCED_BUDGET", 3)
4289
+ setattr(proxy, "PROXY_TOOL_STATE_CYCLE_WINDOW", 3)
4290
+ setattr(proxy, "PROXY_TOOL_STATE_STAGNATION_THRESHOLD", 2)
4291
+
4292
+ all_tools = ["read", "glob", "grep", "bash", "write", "edit"]
4293
+ body = self._make_body_with_tools(all_tools)
4294
+ monitor = proxy.SessionMonitor(context_window=262144)
4295
+
4296
+ # Simulate cycling on 'read' by recording 3 identical fingerprints
4297
+ # Hash-format matching read+{"file_path":"/some/file.ts"}
4298
+ fp = "read:cfb28722"
4299
+ monitor.record_tool_calls(["read"], fingerprint=fp)
4300
+ monitor.record_tool_calls(["read"], fingerprint=fp)
4301
+ monitor.record_tool_calls(["read"], fingerprint=fp)
4302
+
4303
+ openai_body = proxy.build_openai_request(body, monitor)
4304
+
4305
+ # After cycle break, the tools in the body should exclude ALL
4306
+ # read-only tools, not just 'read'
4307
+ remaining_names = [
4308
+ t.get("function", {}).get("name") for t in openai_body.get("tools", [])
4309
+ ]
4310
+ self.assertNotIn("read", remaining_names)
4311
+ self.assertNotIn("glob", remaining_names)
4312
+ self.assertNotIn("grep", remaining_names)
4313
+ # Write/action tools should remain
4314
+ self.assertIn("bash", remaining_names)
4315
+ self.assertIn("write", remaining_names)
4316
+ self.assertIn("edit", remaining_names)
4317
+ finally:
4318
+ for k, v in old_vals.items():
4319
+ setattr(proxy, k, v)
4320
+
4321
+ def test_non_read_tool_cycling_no_class_expansion(self):
4322
+ """When 'bash' is cycling, only 'bash' is excluded, not read-only tools."""
4323
+ old_vals = {
4324
+ "PROXY_TOOL_STATE_MACHINE": getattr(proxy, "PROXY_TOOL_STATE_MACHINE"),
4325
+ "PROXY_TOOL_STATE_MIN_MESSAGES": getattr(proxy, "PROXY_TOOL_STATE_MIN_MESSAGES"),
4326
+ "PROXY_TOOL_STATE_FORCED_BUDGET": getattr(proxy, "PROXY_TOOL_STATE_FORCED_BUDGET"),
4327
+ "PROXY_TOOL_STATE_CYCLE_WINDOW": getattr(proxy, "PROXY_TOOL_STATE_CYCLE_WINDOW"),
4328
+ "PROXY_TOOL_STATE_STAGNATION_THRESHOLD": getattr(proxy, "PROXY_TOOL_STATE_STAGNATION_THRESHOLD"),
4329
+ }
4330
+ try:
4331
+ setattr(proxy, "PROXY_TOOL_STATE_MACHINE", True)
4332
+ setattr(proxy, "PROXY_TOOL_STATE_MIN_MESSAGES", 3)
4333
+ setattr(proxy, "PROXY_TOOL_STATE_FORCED_BUDGET", 3)
4334
+ setattr(proxy, "PROXY_TOOL_STATE_CYCLE_WINDOW", 3)
4335
+ setattr(proxy, "PROXY_TOOL_STATE_STAGNATION_THRESHOLD", 2)
4336
+
4337
+ all_tools = ["read", "glob", "grep", "bash", "write", "edit"]
4338
+ body = self._make_body_with_tools(all_tools)
4339
+ # Change the assistant tool_use to bash
4340
+ body["messages"][1]["content"][0]["name"] = "bash"
4341
+ body["messages"][1]["content"][0]["input"] = {"command": "ls"}
4342
+ monitor = proxy.SessionMonitor(context_window=262144)
4343
+
4344
+ # Use hash-format fingerprints matching bash+{"command":"ls"}
4345
+ fp = "bash:781c24ad"
4346
+ monitor.record_tool_calls(["bash"], fingerprint=fp)
4347
+ monitor.record_tool_calls(["bash"], fingerprint=fp)
4348
+ monitor.record_tool_calls(["bash"], fingerprint=fp)
4349
+
4350
+ openai_body = proxy.build_openai_request(body, monitor)
4351
+
4352
+ remaining_names = [
4353
+ t.get("function", {}).get("name") for t in openai_body.get("tools", [])
4354
+ ]
4355
+ self.assertNotIn("bash", remaining_names)
4356
+ # Read-only tools should still be available
4357
+ self.assertIn("read", remaining_names)
4358
+ self.assertIn("glob", remaining_names)
4359
+ self.assertIn("grep", remaining_names)
4360
+ finally:
4361
+ for k, v in old_vals.items():
4362
+ setattr(proxy, k, v)
4363
+
4364
+ def test_duplicate_read_target_triggers_early_cycle(self):
4365
+ """Option 3: reading same file 3+ times triggers early cycle break."""
4366
+ monitor = proxy.SessionMonitor(context_window=262144)
4367
+
4368
+ # Record 3 reads of same target
4369
+ monitor.record_tool_calls(["read"], tool_targets={"read": "/path/to/file.ts"})
4370
+ monitor.record_tool_calls(["read"], tool_targets={"read": "/path/to/file.ts"})
4371
+ monitor.record_tool_calls(["read"], tool_targets={"read": "/path/to/file.ts"})
4372
+
4373
+ dup, tool = monitor.has_duplicate_read_target(threshold=3)
4374
+ self.assertTrue(dup)
4375
+ self.assertEqual(tool, "read")
4376
+
4377
+ def test_different_read_targets_no_duplicate(self):
4378
+ """Option 3: reading different files does NOT trigger duplicate detection."""
4379
+ monitor = proxy.SessionMonitor(context_window=262144)
4380
+
4381
+ monitor.record_tool_calls(["read"], tool_targets={"read": "/path/a.ts"})
4382
+ monitor.record_tool_calls(["read"], tool_targets={"read": "/path/b.ts"})
4383
+ monitor.record_tool_calls(["read"], tool_targets={"read": "/path/c.ts"})
4384
+
4385
+ dup, _ = monitor.has_duplicate_read_target(threshold=3)
4386
+ self.assertFalse(dup)
4387
+
4388
+ def test_cycle_window_default_is_3(self):
4389
+ """Option 2: verify default cycle window is now 3."""
4390
+ # Verifies the env-var default fallback used to build the constant,
4391
+ self.assertEqual(
4392
+ int(proxy.os.environ.get("PROXY_TOOL_STATE_CYCLE_WINDOW", "3")), 3
4393
+ )
4394
+
4395
+ def test_target_history_reset_on_state_reset(self):
4396
+ """Target history is cleared when tool state resets."""
4397
+ monitor = proxy.SessionMonitor(context_window=262144)
4398
+ monitor.record_tool_calls(["read"], tool_targets={"read": "/file.ts"})
4399
+ monitor.record_tool_calls(["read"], tool_targets={"read": "/file.ts"})
4400
+ monitor.record_tool_calls(["read"], tool_targets={"read": "/file.ts"})
4401
+
4402
+ dup, _ = monitor.has_duplicate_read_target(threshold=3)
4403
+ self.assertTrue(dup)
4404
+
4405
+ monitor.reset_tool_turn_state(reason="test_reset")
4406
+
4407
+ dup, _ = monitor.has_duplicate_read_target(threshold=3)
4408
+ self.assertFalse(dup)