@miller-tech/uap 1.20.24 → 1.20.26
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json
CHANGED
|
@@ -155,16 +155,16 @@ PROXY_TOOL_STATE_FORCED_BUDGET = int(
|
|
|
155
155
|
)
|
|
156
156
|
PROXY_TOOL_STATE_AUTO_BUDGET = int(os.environ.get("PROXY_TOOL_STATE_AUTO_BUDGET", "2"))
|
|
157
157
|
PROXY_TOOL_STATE_STAGNATION_THRESHOLD = int(
|
|
158
|
-
os.environ.get("PROXY_TOOL_STATE_STAGNATION_THRESHOLD", "
|
|
158
|
+
os.environ.get("PROXY_TOOL_STATE_STAGNATION_THRESHOLD", "8")
|
|
159
159
|
)
|
|
160
160
|
PROXY_TOOL_STATE_CYCLE_WINDOW = int(
|
|
161
|
-
os.environ.get("PROXY_TOOL_STATE_CYCLE_WINDOW", "
|
|
161
|
+
os.environ.get("PROXY_TOOL_STATE_CYCLE_WINDOW", "3")
|
|
162
162
|
)
|
|
163
163
|
PROXY_TOOL_STATE_FINALIZE_THRESHOLD = int(
|
|
164
164
|
os.environ.get("PROXY_TOOL_STATE_FINALIZE_THRESHOLD", "18")
|
|
165
165
|
)
|
|
166
166
|
PROXY_TOOL_STATE_REVIEW_CYCLE_LIMIT = int(
|
|
167
|
-
os.environ.get("PROXY_TOOL_STATE_REVIEW_CYCLE_LIMIT", "
|
|
167
|
+
os.environ.get("PROXY_TOOL_STATE_REVIEW_CYCLE_LIMIT", "3")
|
|
168
168
|
)
|
|
169
169
|
PROXY_COMPLETION_RECOVERY_MAX = int(
|
|
170
170
|
os.environ.get("PROXY_COMPLETION_RECOVERY_MAX", "3")
|
|
@@ -189,6 +189,12 @@ PROXY_TOOL_NARROWING_EXPAND_ON_LOOP = os.environ.get(
|
|
|
189
189
|
"off",
|
|
190
190
|
"no",
|
|
191
191
|
}
|
|
192
|
+
# Read-only tools that should be excluded as a class when any one cycles
|
|
193
|
+
_READ_ONLY_TOOL_CLASS = frozenset({
|
|
194
|
+
"read", "glob", "grep", "Read", "Glob", "Grep",
|
|
195
|
+
"search", "Search", "list_files", "ListFiles",
|
|
196
|
+
})
|
|
197
|
+
|
|
192
198
|
PROXY_GUARDRAIL_RETRY = os.environ.get("PROXY_GUARDRAIL_RETRY", "on").lower() not in {
|
|
193
199
|
"0",
|
|
194
200
|
"false",
|
|
@@ -196,6 +202,9 @@ PROXY_GUARDRAIL_RETRY = os.environ.get("PROXY_GUARDRAIL_RETRY", "on").lower() no
|
|
|
196
202
|
"no",
|
|
197
203
|
}
|
|
198
204
|
PROXY_SESSION_TTL_SECS = int(os.environ.get("PROXY_SESSION_TTL_SECS", "7200"))
|
|
205
|
+
PROXY_FINALIZE_CONTINUATION_MAX = int(
|
|
206
|
+
os.environ.get("PROXY_FINALIZE_CONTINUATION_MAX", "3")
|
|
207
|
+
)
|
|
199
208
|
PROXY_STREAM_REASONING_FALLBACK = (
|
|
200
209
|
os.environ.get("PROXY_STREAM_REASONING_FALLBACK", "off").strip().lower()
|
|
201
210
|
)
|
|
@@ -621,6 +630,9 @@ class SessionMonitor:
|
|
|
621
630
|
tool_call_history: list = field(
|
|
622
631
|
default_factory=list
|
|
623
632
|
) # Recent tool call fingerprints
|
|
633
|
+
tool_target_history: dict = field(
|
|
634
|
+
default_factory=dict
|
|
635
|
+
) # {tool_name: {target: count}} for read-only dedup
|
|
624
636
|
consecutive_forced_count: int = (
|
|
625
637
|
0 # How many times tool_choice was forced consecutively
|
|
626
638
|
)
|
|
@@ -646,6 +658,8 @@ class SessionMonitor:
|
|
|
646
658
|
cycling_tool_names: list = field(default_factory=list)
|
|
647
659
|
last_response_garbled: bool = False # previous turn had garbled/malformed output
|
|
648
660
|
finalize_turn_active: bool = False
|
|
661
|
+
finalize_continuation_count: int = 0
|
|
662
|
+
finalize_synthetic_tool_id: str = ""
|
|
649
663
|
completion_required: bool = False
|
|
650
664
|
completion_pending: bool = False
|
|
651
665
|
completion_verified: bool = False
|
|
@@ -753,14 +767,47 @@ class SessionMonitor:
|
|
|
753
767
|
|
|
754
768
|
# --- Token Loop Protection Methods ---
|
|
755
769
|
|
|
756
|
-
def record_tool_calls(
|
|
757
|
-
|
|
758
|
-
|
|
759
|
-
|
|
770
|
+
def record_tool_calls(
|
|
771
|
+
self,
|
|
772
|
+
tool_names: list[str],
|
|
773
|
+
tool_targets: dict[str, str] | None = None,
|
|
774
|
+
fingerprint: str = "",
|
|
775
|
+
):
|
|
776
|
+
"""Record tool call names for loop detection.
|
|
777
|
+
|
|
778
|
+
tool_targets: optional {tool_name: target_key} for read-only dedup.
|
|
779
|
+
e.g. {"read": "/path/to/file", "glob": "**/*.ts"}
|
|
780
|
+
If a pre-computed fingerprint (with argument hashes) is provided,
|
|
781
|
+
use it directly. Otherwise fall back to name-only fingerprint.
|
|
782
|
+
"""
|
|
783
|
+
fp = fingerprint or ("|".join(sorted(tool_names)) if tool_names else "")
|
|
784
|
+
self.tool_call_history.append(fp)
|
|
760
785
|
# Keep last 30 entries
|
|
761
786
|
if len(self.tool_call_history) > 30:
|
|
762
787
|
self.tool_call_history = self.tool_call_history[-30:]
|
|
763
788
|
|
|
789
|
+
# Track read-only tool targets for dedup (Option 3)
|
|
790
|
+
if tool_targets:
|
|
791
|
+
for name, target in tool_targets.items():
|
|
792
|
+
if name.lower() in {n.lower() for n in _READ_ONLY_TOOL_CLASS} and target:
|
|
793
|
+
by_tool = self.tool_target_history.setdefault(name, {})
|
|
794
|
+
by_tool[target] = by_tool.get(target, 0) + 1
|
|
795
|
+
|
|
796
|
+
def has_duplicate_read_target(self, threshold: int = 2) -> tuple[bool, str]:
|
|
797
|
+
"""Check if any read-only tool has re-read the same target >= threshold times.
|
|
798
|
+
|
|
799
|
+
Returns (is_duplicate, tool_name) for the first offending tool.
|
|
800
|
+
"""
|
|
801
|
+
for tool_name, targets in self.tool_target_history.items():
|
|
802
|
+
for target, count in targets.items():
|
|
803
|
+
if count >= threshold:
|
|
804
|
+
return True, tool_name
|
|
805
|
+
return False, ""
|
|
806
|
+
|
|
807
|
+
def reset_tool_targets(self):
|
|
808
|
+
"""Clear target history (on phase reset or fresh user text)."""
|
|
809
|
+
self.tool_target_history = {}
|
|
810
|
+
|
|
764
811
|
def detect_tool_loop(self, window: int = 6) -> tuple[bool, int]:
|
|
765
812
|
"""Detect if the model is stuck in a tool call loop.
|
|
766
813
|
|
|
@@ -851,6 +898,7 @@ class SessionMonitor:
|
|
|
851
898
|
self.tool_state_review_cycles = 0
|
|
852
899
|
self.cycling_tool_names = []
|
|
853
900
|
self.last_tool_fingerprint = ""
|
|
901
|
+
self.reset_tool_targets()
|
|
854
902
|
|
|
855
903
|
def update_completion_state(self, anthropic_body: dict, has_tool_results: bool):
|
|
856
904
|
self.completion_required = _should_enforce_completion_contract(anthropic_body)
|
|
@@ -2095,6 +2143,8 @@ def _resolve_state_machine_tool_choice(
|
|
|
2095
2143
|
monitor.invalid_tool_call_streak = 0
|
|
2096
2144
|
monitor.required_tool_miss_streak = 0
|
|
2097
2145
|
monitor.reset_tool_turn_state(reason="fresh_user_text")
|
|
2146
|
+
monitor.finalize_continuation_count = 0
|
|
2147
|
+
monitor.finalize_synthetic_tool_id = ""
|
|
2098
2148
|
return None, "fresh_user_text"
|
|
2099
2149
|
|
|
2100
2150
|
active_loop = (
|
|
@@ -2113,6 +2163,8 @@ def _resolve_state_machine_tool_choice(
|
|
|
2113
2163
|
monitor.invalid_tool_call_streak = 0
|
|
2114
2164
|
monitor.required_tool_miss_streak = 0
|
|
2115
2165
|
monitor.reset_tool_turn_state(reason="inactive_loop")
|
|
2166
|
+
monitor.finalize_continuation_count = 0
|
|
2167
|
+
monitor.finalize_synthetic_tool_id = ""
|
|
2116
2168
|
return None, "inactive_loop"
|
|
2117
2169
|
|
|
2118
2170
|
if monitor.tool_turn_phase == "bootstrap":
|
|
@@ -2158,6 +2210,16 @@ def _resolve_state_machine_tool_choice(
|
|
|
2158
2210
|
return "finalize", "review_cycle_limit"
|
|
2159
2211
|
|
|
2160
2212
|
if monitor.tool_turn_phase == "act":
|
|
2213
|
+
# Option 3: Early cycle break when same read target is hit 3+ times
|
|
2214
|
+
dup_target, dup_tool = monitor.has_duplicate_read_target(threshold=3)
|
|
2215
|
+
if dup_target and not cycle_looping and not stagnating:
|
|
2216
|
+
cycle_looping = True
|
|
2217
|
+
cycle_repeat = 2
|
|
2218
|
+
logger.warning(
|
|
2219
|
+
"TOOL STATE MACHINE: duplicate read target detected for '%s', triggering early cycle break",
|
|
2220
|
+
dup_tool,
|
|
2221
|
+
)
|
|
2222
|
+
|
|
2161
2223
|
if cycle_looping or stagnating:
|
|
2162
2224
|
reason = "cycle_detected" if cycle_looping else "stagnation"
|
|
2163
2225
|
monitor.set_tool_turn_phase("review", reason=reason)
|
|
@@ -2169,9 +2231,15 @@ def _resolve_state_machine_tool_choice(
|
|
|
2169
2231
|
1, PROXY_TOOL_STATE_FORCED_BUDGET // 2
|
|
2170
2232
|
)
|
|
2171
2233
|
# Capture which tools are cycling for narrowing/hint injection
|
|
2234
|
+
# Strip argument hashes (e.g. "glob:abc12345" -> "glob") so that
|
|
2235
|
+
# tool narrowing can match against actual tool names.
|
|
2172
2236
|
window = max(2, PROXY_TOOL_STATE_CYCLE_WINDOW)
|
|
2173
2237
|
recent = [fp for fp in monitor.tool_call_history[-window:] if fp]
|
|
2174
|
-
|
|
2238
|
+
raw_names = []
|
|
2239
|
+
for fp in recent:
|
|
2240
|
+
for part in fp.split("|"):
|
|
2241
|
+
raw_names.append(part.split(":")[0])
|
|
2242
|
+
monitor.cycling_tool_names = list(dict.fromkeys(raw_names))
|
|
2175
2243
|
logger.warning(
|
|
2176
2244
|
"TOOL STATE MACHINE: entering review (cycle=%s repeat=%d stagnation=%d cycles=%d cycling_tools=%s)",
|
|
2177
2245
|
cycle_looping,
|
|
@@ -2184,7 +2252,11 @@ def _resolve_state_machine_tool_choice(
|
|
|
2184
2252
|
|
|
2185
2253
|
if monitor.tool_state_forced_budget_remaining <= 0:
|
|
2186
2254
|
monitor.set_tool_turn_phase("review", reason="forced_budget_exhausted")
|
|
2187
|
-
|
|
2255
|
+
# Only count toward review cycle limit if there was an actual
|
|
2256
|
+
# cycle/stagnation detected. Budget exhaustion alone means the
|
|
2257
|
+
# model is working — it just used all its turns — not cycling.
|
|
2258
|
+
if cycle_looping or stagnating:
|
|
2259
|
+
monitor.tool_state_review_cycles += 1
|
|
2188
2260
|
monitor.tool_state_auto_budget_remaining = max(
|
|
2189
2261
|
1, PROXY_TOOL_STATE_AUTO_BUDGET
|
|
2190
2262
|
)
|
|
@@ -2192,8 +2264,10 @@ def _resolve_state_machine_tool_choice(
|
|
|
2192
2264
|
1, PROXY_TOOL_STATE_FORCED_BUDGET // 2
|
|
2193
2265
|
)
|
|
2194
2266
|
logger.warning(
|
|
2195
|
-
"TOOL STATE MACHINE: forced budget exhausted, entering review (cycles=%d)",
|
|
2267
|
+
"TOOL STATE MACHINE: forced budget exhausted, entering review (cycles=%d cycling=%s stagnating=%s)",
|
|
2196
2268
|
monitor.tool_state_review_cycles,
|
|
2269
|
+
cycle_looping,
|
|
2270
|
+
stagnating,
|
|
2197
2271
|
)
|
|
2198
2272
|
return "required", "forced_budget_exhausted"
|
|
2199
2273
|
|
|
@@ -2206,6 +2280,16 @@ def _resolve_state_machine_tool_choice(
|
|
|
2206
2280
|
monitor.tool_state_forced_budget_remaining = max(
|
|
2207
2281
|
1, PROXY_TOOL_STATE_FORCED_BUDGET // 2
|
|
2208
2282
|
)
|
|
2283
|
+
# If stagnation cleared during review, the model tried a
|
|
2284
|
+
# different approach — reward by reducing cycle pressure and
|
|
2285
|
+
# lifting persistent tool exclusion.
|
|
2286
|
+
if monitor.tool_state_stagnation_streak == 0 and monitor.tool_state_review_cycles > 0:
|
|
2287
|
+
monitor.tool_state_review_cycles = max(0, monitor.tool_state_review_cycles - 1)
|
|
2288
|
+
monitor.cycling_tool_names = []
|
|
2289
|
+
logger.info(
|
|
2290
|
+
"TOOL STATE MACHINE: review_cycles decremented to %d, cycling exclusion lifted (stagnation cleared)",
|
|
2291
|
+
monitor.tool_state_review_cycles,
|
|
2292
|
+
)
|
|
2209
2293
|
return "required", "review_complete"
|
|
2210
2294
|
|
|
2211
2295
|
monitor.tool_state_auto_budget_remaining -= 1
|
|
@@ -2416,6 +2500,9 @@ def build_openai_request(
|
|
|
2416
2500
|
n_msgs = len(anthropic_body.get("messages", []))
|
|
2417
2501
|
has_tool_results = _conversation_has_tool_results(anthropic_body)
|
|
2418
2502
|
|
|
2503
|
+
# Detect and strip synthetic finalize continuation before fingerprinting
|
|
2504
|
+
_detect_and_strip_synthetic_continuation(anthropic_body, monitor)
|
|
2505
|
+
|
|
2419
2506
|
# Record tool calls from the last assistant message for loop detection
|
|
2420
2507
|
latest_tool_fingerprint = _record_last_assistant_tool_calls(
|
|
2421
2508
|
anthropic_body, monitor
|
|
@@ -2504,44 +2591,61 @@ def build_openai_request(
|
|
|
2504
2591
|
monitor.no_progress_streak = (
|
|
2505
2592
|
0 if last_user_has_tool_result else monitor.no_progress_streak + 1
|
|
2506
2593
|
)
|
|
2507
|
-
#
|
|
2594
|
+
# Inject cycle-break instruction when entering review
|
|
2595
|
+
# Option 3 (Cycle 14): Escalate hint text based on review cycle count
|
|
2508
2596
|
if (
|
|
2509
2597
|
monitor.tool_turn_phase == "review"
|
|
2510
2598
|
and state_reason in {"cycle_detected", "stagnation"}
|
|
2511
2599
|
and monitor.cycling_tool_names
|
|
2512
2600
|
):
|
|
2513
2601
|
cycling_names = ", ".join(monitor.cycling_tool_names)
|
|
2514
|
-
|
|
2515
|
-
|
|
2516
|
-
|
|
2517
|
-
|
|
2518
|
-
|
|
2602
|
+
cycles = monitor.tool_state_review_cycles
|
|
2603
|
+
if cycles <= 1:
|
|
2604
|
+
cycle_hint = (
|
|
2605
|
+
f"You have been repeatedly calling the same tool(s): {cycling_names}. "
|
|
2606
|
+
"This is not making progress. Use a DIFFERENT tool to advance the task, "
|
|
2607
|
+
"or call a tool that produces your final answer."
|
|
2608
|
+
)
|
|
2609
|
+
else:
|
|
2610
|
+
cycle_hint = (
|
|
2611
|
+
f"CRITICAL: You have cycled {cycling_names} for {cycles} review rounds without progress. "
|
|
2612
|
+
"State what you have accomplished so far and what the next DIFFERENT action should be. "
|
|
2613
|
+
"Do NOT call the same tool again. Choose a completely different approach or "
|
|
2614
|
+
"produce your final answer now."
|
|
2615
|
+
)
|
|
2519
2616
|
messages = openai_body.get("messages", [])
|
|
2520
2617
|
messages.append({"role": "user", "content": cycle_hint})
|
|
2521
2618
|
openai_body["messages"] = messages
|
|
2522
2619
|
logger.warning(
|
|
2523
|
-
"CYCLE BREAK: injected hint about cycling tools: %s",
|
|
2620
|
+
"CYCLE BREAK: injected hint about cycling tools: %s (escalation=%d)",
|
|
2524
2621
|
cycling_names,
|
|
2622
|
+
cycles,
|
|
2525
2623
|
)
|
|
2526
|
-
#
|
|
2624
|
+
# Narrow tools to exclude cycling tools
|
|
2625
|
+
# Option 1 (Cycle 13): if any cycling tool is read-only, exclude entire class
|
|
2626
|
+
# Option 1 (Cycle 14): persist exclusion during act phase too, not just review
|
|
2527
2627
|
if (
|
|
2528
|
-
monitor.
|
|
2529
|
-
and monitor.cycling_tool_names
|
|
2628
|
+
monitor.cycling_tool_names
|
|
2530
2629
|
and "tools" in openai_body
|
|
2531
2630
|
):
|
|
2631
|
+
exclude_set = set(monitor.cycling_tool_names)
|
|
2632
|
+
# Expand to full read-only class if any cycling tool is read-only
|
|
2633
|
+
if any(n.lower() in {c.lower() for c in _READ_ONLY_TOOL_CLASS} for n in exclude_set):
|
|
2634
|
+
exclude_set |= _READ_ONLY_TOOL_CLASS
|
|
2532
2635
|
original_count = len(openai_body["tools"])
|
|
2533
2636
|
narrowed = [
|
|
2534
2637
|
t
|
|
2535
2638
|
for t in openai_body["tools"]
|
|
2536
|
-
if t.get("function", {}).get("name") not in
|
|
2639
|
+
if t.get("function", {}).get("name") not in exclude_set
|
|
2537
2640
|
]
|
|
2538
2641
|
if narrowed:
|
|
2539
2642
|
openai_body["tools"] = narrowed
|
|
2540
2643
|
logger.warning(
|
|
2541
|
-
"CYCLE BREAK: narrowed tools from %d to %d (excluded %s)",
|
|
2644
|
+
"CYCLE BREAK: narrowed tools from %d to %d (excluded %s, read_only_class=%s)",
|
|
2542
2645
|
original_count,
|
|
2543
2646
|
len(narrowed),
|
|
2544
2647
|
monitor.cycling_tool_names,
|
|
2648
|
+
any(n.lower() in {c.lower() for c in _READ_ONLY_TOOL_CLASS} for n in monitor.cycling_tool_names),
|
|
2545
2649
|
)
|
|
2546
2650
|
else:
|
|
2547
2651
|
logger.warning(
|
|
@@ -2602,13 +2706,117 @@ def build_openai_request(
|
|
|
2602
2706
|
return openai_body
|
|
2603
2707
|
|
|
2604
2708
|
|
|
2709
|
+
def _tool_call_fingerprint(block: dict) -> str:
|
|
2710
|
+
"""Create a fingerprint for a tool call that includes both name and a
|
|
2711
|
+
short hash of the arguments. This prevents false cycle detection when
|
|
2712
|
+
the same tool is called with different arguments (e.g. reading different
|
|
2713
|
+
files)."""
|
|
2714
|
+
name = block.get("name", "unknown")
|
|
2715
|
+
inp = block.get("input")
|
|
2716
|
+
if inp:
|
|
2717
|
+
arg_str = json.dumps(inp, sort_keys=True, separators=(",", ":"))
|
|
2718
|
+
arg_hash = hashlib.md5(arg_str.encode()).hexdigest()[:8]
|
|
2719
|
+
return f"{name}:{arg_hash}"
|
|
2720
|
+
return name
|
|
2721
|
+
|
|
2722
|
+
|
|
2723
|
+
def _detect_and_strip_synthetic_continuation(
|
|
2724
|
+
anthropic_body: dict, monitor: SessionMonitor
|
|
2725
|
+
) -> bool:
|
|
2726
|
+
"""Detect if the latest messages contain a synthetic finalize continuation
|
|
2727
|
+
tool_use/tool_result pair. If found, strip them from the conversation and
|
|
2728
|
+
reset the state machine so the model gets a fresh act cycle.
|
|
2729
|
+
|
|
2730
|
+
Returns True if a synthetic continuation was detected and handled.
|
|
2731
|
+
"""
|
|
2732
|
+
synthetic_id = monitor.finalize_synthetic_tool_id
|
|
2733
|
+
if not synthetic_id:
|
|
2734
|
+
return False
|
|
2735
|
+
|
|
2736
|
+
messages = anthropic_body.get("messages", [])
|
|
2737
|
+
if not messages:
|
|
2738
|
+
return False
|
|
2739
|
+
|
|
2740
|
+
# Walk backwards to find the synthetic tool_result in a user message
|
|
2741
|
+
found = False
|
|
2742
|
+
for msg in reversed(messages):
|
|
2743
|
+
if msg.get("role") != "user":
|
|
2744
|
+
continue
|
|
2745
|
+
content = msg.get("content")
|
|
2746
|
+
if not isinstance(content, list):
|
|
2747
|
+
break
|
|
2748
|
+
has_synthetic = any(
|
|
2749
|
+
isinstance(b, dict)
|
|
2750
|
+
and b.get("type") == "tool_result"
|
|
2751
|
+
and b.get("tool_use_id") == synthetic_id
|
|
2752
|
+
for b in content
|
|
2753
|
+
)
|
|
2754
|
+
if not has_synthetic:
|
|
2755
|
+
break
|
|
2756
|
+
|
|
2757
|
+
# Strip synthetic tool_result from user message
|
|
2758
|
+
new_content = [
|
|
2759
|
+
b for b in content
|
|
2760
|
+
if not (
|
|
2761
|
+
isinstance(b, dict)
|
|
2762
|
+
and b.get("type") == "tool_result"
|
|
2763
|
+
and b.get("tool_use_id") == synthetic_id
|
|
2764
|
+
)
|
|
2765
|
+
]
|
|
2766
|
+
if not new_content:
|
|
2767
|
+
msg["content"] = [{"type": "text", "text": "Continue working on the task."}]
|
|
2768
|
+
else:
|
|
2769
|
+
msg["content"] = new_content
|
|
2770
|
+
|
|
2771
|
+
# Strip synthetic tool_use from the preceding assistant message
|
|
2772
|
+
for asst_msg in reversed(messages):
|
|
2773
|
+
if asst_msg.get("role") != "assistant":
|
|
2774
|
+
continue
|
|
2775
|
+
asst_content = asst_msg.get("content")
|
|
2776
|
+
if isinstance(asst_content, list):
|
|
2777
|
+
asst_msg["content"] = [
|
|
2778
|
+
b for b in asst_content
|
|
2779
|
+
if not (
|
|
2780
|
+
isinstance(b, dict)
|
|
2781
|
+
and b.get("type") == "tool_use"
|
|
2782
|
+
and b.get("id") == synthetic_id
|
|
2783
|
+
)
|
|
2784
|
+
]
|
|
2785
|
+
break
|
|
2786
|
+
|
|
2787
|
+
found = True
|
|
2788
|
+
break
|
|
2789
|
+
|
|
2790
|
+
if not found:
|
|
2791
|
+
return False
|
|
2792
|
+
|
|
2793
|
+
# Reset state machine for fresh act cycle
|
|
2794
|
+
monitor.finalize_synthetic_tool_id = ""
|
|
2795
|
+
monitor.reset_tool_turn_state(reason="finalize_continuation_resume")
|
|
2796
|
+
monitor.reset_completion_recovery()
|
|
2797
|
+
monitor.tool_call_history = []
|
|
2798
|
+
logger.info(
|
|
2799
|
+
"FINALIZE CONTINUATION: stripped synthetic tool id=%s, "
|
|
2800
|
+
"reset state machine for fresh act cycle (continuations=%d/%d)",
|
|
2801
|
+
synthetic_id,
|
|
2802
|
+
monitor.finalize_continuation_count,
|
|
2803
|
+
PROXY_FINALIZE_CONTINUATION_MAX,
|
|
2804
|
+
)
|
|
2805
|
+
return True
|
|
2806
|
+
|
|
2807
|
+
|
|
2605
2808
|
def _record_last_assistant_tool_calls(
|
|
2606
2809
|
anthropic_body: dict, monitor: SessionMonitor
|
|
2607
2810
|
) -> str:
|
|
2608
2811
|
"""Extract tool call names from the last assistant message and record
|
|
2609
|
-
them in the session monitor for loop detection.
|
|
2812
|
+
them in the session monitor for loop detection.
|
|
2813
|
+
|
|
2814
|
+
Fingerprints now include an argument hash so that the same tool called
|
|
2815
|
+
with different arguments (e.g. read(file_a) vs read(file_b)) produces
|
|
2816
|
+
distinct fingerprints, preventing false cycle/stagnation detection."""
|
|
2610
2817
|
messages = anthropic_body.get("messages", [])
|
|
2611
|
-
|
|
2818
|
+
tool_fingerprints = []
|
|
2819
|
+
tool_targets: dict[str, str] = {}
|
|
2612
2820
|
for msg in reversed(messages):
|
|
2613
2821
|
if msg.get("role") != "assistant":
|
|
2614
2822
|
continue
|
|
@@ -2616,11 +2824,28 @@ def _record_last_assistant_tool_calls(
|
|
|
2616
2824
|
if isinstance(content, list):
|
|
2617
2825
|
for block in content:
|
|
2618
2826
|
if isinstance(block, dict) and block.get("type") == "tool_use":
|
|
2619
|
-
|
|
2827
|
+
tool_fingerprints.append(_tool_call_fingerprint(block))
|
|
2828
|
+
# Extract target key for read-only dedup (Option 3)
|
|
2829
|
+
name = block.get("name", "unknown")
|
|
2830
|
+
inp = block.get("input", {})
|
|
2831
|
+
if isinstance(inp, dict):
|
|
2832
|
+
target = (
|
|
2833
|
+
inp.get("file_path")
|
|
2834
|
+
or inp.get("path")
|
|
2835
|
+
or inp.get("pattern")
|
|
2836
|
+
or inp.get("command", "")[:80]
|
|
2837
|
+
)
|
|
2838
|
+
if target:
|
|
2839
|
+
tool_targets[name] = str(target)
|
|
2620
2840
|
break
|
|
2621
|
-
if
|
|
2622
|
-
|
|
2623
|
-
|
|
2841
|
+
if tool_fingerprints:
|
|
2842
|
+
fingerprint = "|".join(sorted(tool_fingerprints))
|
|
2843
|
+
monitor.record_tool_calls(
|
|
2844
|
+
[fp.split(":")[0] for fp in tool_fingerprints],
|
|
2845
|
+
tool_targets=tool_targets,
|
|
2846
|
+
fingerprint=fingerprint,
|
|
2847
|
+
)
|
|
2848
|
+
return fingerprint
|
|
2624
2849
|
return ""
|
|
2625
2850
|
|
|
2626
2851
|
|
|
@@ -4750,16 +4975,20 @@ def _maybe_extract_text_tool_calls(openai_resp: dict) -> dict:
|
|
|
4750
4975
|
return openai_resp
|
|
4751
4976
|
|
|
4752
4977
|
|
|
4753
|
-
def _detect_and_truncate_degenerate_repetition(
|
|
4978
|
+
def _detect_and_truncate_degenerate_repetition(
|
|
4979
|
+
openai_resp: dict,
|
|
4980
|
+
) -> tuple[dict, bool]:
|
|
4754
4981
|
"""Detect degenerate repetitive text and truncate at first repetition.
|
|
4755
4982
|
|
|
4756
4983
|
When the model produces highly repetitive output (e.g. the same 20+ char
|
|
4757
4984
|
substring repeated 10+ times), truncate at the first repetition boundary
|
|
4758
4985
|
and set finish_reason to stop.
|
|
4986
|
+
|
|
4987
|
+
Returns (response, was_degenerate) so the caller can retry if needed.
|
|
4759
4988
|
"""
|
|
4760
4989
|
text = _openai_message_text(openai_resp)
|
|
4761
4990
|
if not text or len(text) < 200:
|
|
4762
|
-
return openai_resp
|
|
4991
|
+
return openai_resp, False
|
|
4763
4992
|
|
|
4764
4993
|
# Look for repeated substrings of length 20-100
|
|
4765
4994
|
for substr_len in (60, 40, 20):
|
|
@@ -4788,8 +5017,70 @@ def _detect_and_truncate_degenerate_repetition(openai_resp: dict) -> dict:
|
|
|
4788
5017
|
msg = choices[0].get("message", {})
|
|
4789
5018
|
msg["content"] = truncated
|
|
4790
5019
|
choices[0]["finish_reason"] = "stop"
|
|
4791
|
-
return openai_resp
|
|
4792
|
-
return openai_resp
|
|
5020
|
+
return openai_resp, True
|
|
5021
|
+
return openai_resp, False
|
|
5022
|
+
|
|
5023
|
+
|
|
5024
|
+
def _client_has_tool(anthropic_body: dict, tool_name: str) -> bool:
|
|
5025
|
+
"""Check if the client's tool list contains a tool with the given name (case-insensitive)."""
|
|
5026
|
+
lower = tool_name.lower()
|
|
5027
|
+
return any(
|
|
5028
|
+
(t.get("name") or "").lower() == lower for t in anthropic_body.get("tools", [])
|
|
5029
|
+
)
|
|
5030
|
+
|
|
5031
|
+
|
|
5032
|
+
def _client_tool_name(anthropic_body: dict, tool_name: str) -> str:
|
|
5033
|
+
"""Return the actual tool name as the client spells it (case-sensitive match)."""
|
|
5034
|
+
lower = tool_name.lower()
|
|
5035
|
+
for t in anthropic_body.get("tools", []):
|
|
5036
|
+
if (t.get("name") or "").lower() == lower:
|
|
5037
|
+
return t["name"]
|
|
5038
|
+
return tool_name
|
|
5039
|
+
|
|
5040
|
+
|
|
5041
|
+
def _inject_synthetic_continuation(
|
|
5042
|
+
anthropic_resp: dict, monitor: SessionMonitor, anthropic_body: dict
|
|
5043
|
+
) -> dict:
|
|
5044
|
+
"""Inject a synthetic tool_use into a finalize-turn response to keep the
|
|
5045
|
+
client's agentic loop alive.
|
|
5046
|
+
|
|
5047
|
+
Appends a no-op Read("/dev/null") tool_use block and changes stop_reason
|
|
5048
|
+
from "end_turn" to "tool_use" so the client continues sending requests.
|
|
5049
|
+
"""
|
|
5050
|
+
# Pick a safe tool the client knows about (case-insensitive match,
|
|
5051
|
+
# then use the client's actual casing for the tool name)
|
|
5052
|
+
if _client_has_tool(anthropic_body, "read"):
|
|
5053
|
+
tool_name = _client_tool_name(anthropic_body, "read")
|
|
5054
|
+
tool_input = {"file_path": "/dev/null"}
|
|
5055
|
+
elif _client_has_tool(anthropic_body, "bash"):
|
|
5056
|
+
tool_name = _client_tool_name(anthropic_body, "bash")
|
|
5057
|
+
tool_input = {"command": "true", "description": "continuation ping"}
|
|
5058
|
+
else:
|
|
5059
|
+
logger.warning("FINALIZE CONTINUATION: no suitable tool found, skipping injection")
|
|
5060
|
+
return anthropic_resp
|
|
5061
|
+
|
|
5062
|
+
synthetic_id = f"toolu_{uuid.uuid4().hex[:12]}"
|
|
5063
|
+
monitor.finalize_synthetic_tool_id = synthetic_id
|
|
5064
|
+
monitor.finalize_continuation_count += 1
|
|
5065
|
+
|
|
5066
|
+
content = anthropic_resp.get("content", [])
|
|
5067
|
+
content.append({
|
|
5068
|
+
"type": "tool_use",
|
|
5069
|
+
"id": synthetic_id,
|
|
5070
|
+
"name": tool_name,
|
|
5071
|
+
"input": tool_input,
|
|
5072
|
+
})
|
|
5073
|
+
anthropic_resp["content"] = content
|
|
5074
|
+
anthropic_resp["stop_reason"] = "tool_use"
|
|
5075
|
+
|
|
5076
|
+
logger.info(
|
|
5077
|
+
"FINALIZE CONTINUATION: injected synthetic %s tool_use id=%s (count=%d/%d)",
|
|
5078
|
+
tool_name,
|
|
5079
|
+
synthetic_id,
|
|
5080
|
+
monitor.finalize_continuation_count,
|
|
5081
|
+
PROXY_FINALIZE_CONTINUATION_MAX,
|
|
5082
|
+
)
|
|
5083
|
+
return anthropic_resp
|
|
4793
5084
|
|
|
4794
5085
|
|
|
4795
5086
|
def openai_to_anthropic_response(openai_resp: dict, model: str) -> dict:
|
|
@@ -5623,8 +5914,51 @@ async def messages(request: Request):
|
|
|
5623
5914
|
session_id,
|
|
5624
5915
|
)
|
|
5625
5916
|
|
|
5626
|
-
openai_resp = _detect_and_truncate_degenerate_repetition(openai_resp)
|
|
5917
|
+
openai_resp, was_degenerate = _detect_and_truncate_degenerate_repetition(openai_resp)
|
|
5918
|
+
if was_degenerate:
|
|
5919
|
+
# Retry with constrained parameters to avoid degenerate output.
|
|
5920
|
+
# With tools: force tool_choice=required for a useful tool call.
|
|
5921
|
+
# Without tools (finalize): retry with capped max_tokens for clean text.
|
|
5922
|
+
has_tools = bool(strict_body.get("tools"))
|
|
5923
|
+
retry_body = dict(strict_body)
|
|
5924
|
+
retry_body["max_tokens"] = 2048
|
|
5925
|
+
retry_body["temperature"] = 0.1
|
|
5926
|
+
retry_body["stream"] = False
|
|
5927
|
+
if has_tools:
|
|
5928
|
+
retry_body["tool_choice"] = "required"
|
|
5929
|
+
logger.warning("DEGENERATE RETRY: retrying with tool_choice=required max_tokens=2048")
|
|
5930
|
+
else:
|
|
5931
|
+
logger.warning("DEGENERATE RETRY: retrying text-only with max_tokens=2048 temp=0.1")
|
|
5932
|
+
try:
|
|
5933
|
+
retry_resp = await _post_with_generation_timeout(
|
|
5934
|
+
client, f"{LLAMA_CPP_BASE}/chat/completions", retry_body,
|
|
5935
|
+
{"Content-Type": "application/json"},
|
|
5936
|
+
)
|
|
5937
|
+
if retry_resp.status_code == 200:
|
|
5938
|
+
retry_data = retry_resp.json()
|
|
5939
|
+
retry_text = _openai_message_text(retry_data)
|
|
5940
|
+
_, retry_degenerate = _detect_and_truncate_degenerate_repetition(retry_data)
|
|
5941
|
+
if retry_degenerate:
|
|
5942
|
+
logger.info("DEGENERATE RETRY: retry also degenerate, using truncated original")
|
|
5943
|
+
elif has_tools and (retry_data.get("choices", [{}])[0]
|
|
5944
|
+
.get("message", {}).get("tool_calls")):
|
|
5945
|
+
logger.info("DEGENERATE RETRY: success, got tool call")
|
|
5946
|
+
openai_resp = retry_data
|
|
5947
|
+
elif not has_tools and retry_text and len(retry_text) > 50:
|
|
5948
|
+
logger.info("DEGENERATE RETRY: success, got clean text (%d chars)", len(retry_text))
|
|
5949
|
+
openai_resp = retry_data
|
|
5950
|
+
else:
|
|
5951
|
+
logger.info("DEGENERATE RETRY: retry insufficient, using truncated original")
|
|
5952
|
+
except Exception as exc:
|
|
5953
|
+
logger.warning("DEGENERATE RETRY: failed: %s", exc)
|
|
5627
5954
|
anthropic_resp = openai_to_anthropic_response(openai_resp, model)
|
|
5955
|
+
# FINALIZE CONTINUATION: inject synthetic tool_use to keep client loop alive
|
|
5956
|
+
if (
|
|
5957
|
+
monitor.finalize_turn_active
|
|
5958
|
+
and monitor.finalize_continuation_count < PROXY_FINALIZE_CONTINUATION_MAX
|
|
5959
|
+
and anthropic_resp.get("stop_reason") == "end_turn"
|
|
5960
|
+
):
|
|
5961
|
+
anthropic_resp = _inject_synthetic_continuation(anthropic_resp, monitor, body)
|
|
5628
5962
|
monitor.record_response(anthropic_resp.get("usage", {}).get("output_tokens", 0))
|
|
5629
5963
|
# Update last_input_tokens from upstream's actual prompt_tokens
|
|
5630
5964
|
upstream_input = anthropic_resp.get("usage", {}).get("input_tokens", 0)
|
|
@@ -5962,8 +6296,38 @@ async def messages(request: Request):
|
|
|
5962
6296
|
monitor.invalid_tool_call_streak = 0
|
|
5963
6297
|
monitor.required_tool_miss_streak = 0
|
|
5964
6298
|
|
|
5965
|
-
openai_resp = _detect_and_truncate_degenerate_repetition(openai_resp)
|
|
6299
|
+
openai_resp, was_degenerate = _detect_and_truncate_degenerate_repetition(openai_resp)
|
|
6300
|
+
# Degenerate retry for non-guarded stream path
|
|
6301
|
+
if was_degenerate and openai_body.get("tools"):
|
|
6302
|
+
logger.warning("DEGENERATE RETRY (stream): retrying with tool_choice=required max_tokens=2048")
|
|
6303
|
+
retry_body = dict(openai_body)
|
|
6304
|
+
retry_body["tool_choice"] = "required"
|
|
6305
|
+
retry_body["max_tokens"] = 2048
|
|
6306
|
+
retry_body["temperature"] = 0.1
|
|
6307
|
+
retry_body["stream"] = False
|
|
6308
|
+
try:
|
|
6309
|
+
retry_resp = await _post_with_generation_timeout(
|
|
6310
|
+
client, f"{LLAMA_CPP_BASE}/chat/completions", retry_body,
|
|
6311
|
+
{"Content-Type": "application/json"},
|
|
6312
|
+
)
|
|
6313
|
+
if retry_resp.status_code == 200:
|
|
6314
|
+
retry_data = retry_resp.json()
|
|
6315
|
+
if (retry_data.get("choices", [{}])[0]
|
|
6316
|
+
.get("message", {}).get("tool_calls")):
|
|
6317
|
+
logger.info("DEGENERATE RETRY (stream): success, got tool call")
|
|
6318
|
+
openai_resp = retry_data
|
|
6319
|
+
else:
|
|
6320
|
+
logger.info("DEGENERATE RETRY (stream): no tool call, using truncated")
|
|
6321
|
+
except Exception as exc:
|
|
6322
|
+
logger.warning("DEGENERATE RETRY (stream): failed: %s", exc)
|
|
5966
6323
|
anthropic_resp = openai_to_anthropic_response(openai_resp, model)
|
|
6324
|
+
# FINALIZE CONTINUATION: inject synthetic tool_use (non-guarded stream path)
|
|
6325
|
+
if (
|
|
6326
|
+
monitor.finalize_turn_active
|
|
6327
|
+
and monitor.finalize_continuation_count < PROXY_FINALIZE_CONTINUATION_MAX
|
|
6328
|
+
and anthropic_resp.get("stop_reason") == "end_turn"
|
|
6329
|
+
):
|
|
6330
|
+
anthropic_resp = _inject_synthetic_continuation(anthropic_resp, monitor, body)
|
|
5967
6331
|
|
|
5968
6332
|
# Track output tokens in session monitor
|
|
5969
6333
|
output_tokens = anthropic_resp.get("usage", {}).get("output_tokens", 0)
|
|
@@ -1892,12 +1892,13 @@ class TestToolTurnControls(unittest.TestCase):
|
|
|
1892
1892
|
monitor = proxy.SessionMonitor(context_window=262144)
|
|
1893
1893
|
monitor.tool_turn_phase = "act"
|
|
1894
1894
|
monitor.tool_state_forced_budget_remaining = 20
|
|
1895
|
+
# Use hash-format fingerprints to match _tool_call_fingerprint output
|
|
1895
1896
|
monitor.tool_call_history = [
|
|
1896
|
-
"Bash",
|
|
1897
|
+
"Bash:1e7b8d07",
|
|
1897
1898
|
"TaskOutput",
|
|
1898
|
-
"Bash",
|
|
1899
|
+
"Bash:1e7b8d07",
|
|
1899
1900
|
"TaskOutput",
|
|
1900
|
-
"Bash",
|
|
1901
|
+
"Bash:1e7b8d07",
|
|
1901
1902
|
"TaskOutput",
|
|
1902
1903
|
]
|
|
1903
1904
|
monitor.last_tool_fingerprint = "TaskOutput"
|
|
@@ -2076,7 +2077,9 @@ class TestToolTurnControls(unittest.TestCase):
|
|
|
2076
2077
|
# Review phase now keeps required to prevent end-turn escape
|
|
2077
2078
|
self.assertEqual(openai.get("tool_choice"), "required")
|
|
2078
2079
|
self.assertEqual(monitor.tool_turn_phase, "review")
|
|
2079
|
-
|
|
2080
|
+
# review_cycles only increments when cycle_looping or stagnating,
|
|
2081
|
+
# not on mere budget exhaustion (model was working, not cycling)
|
|
2082
|
+
self.assertEqual(monitor.tool_state_review_cycles, 0)
|
|
2080
2083
|
finally:
|
|
2081
2084
|
setattr(proxy, "PROXY_TOOL_STATE_MACHINE", old_state)
|
|
2082
2085
|
setattr(proxy, "PROXY_TOOL_STATE_MIN_MESSAGES", old_min_msgs)
|
|
@@ -2242,7 +2245,11 @@ class TestToolTurnControls(unittest.TestCase):
|
|
|
2242
2245
|
monitor = proxy.SessionMonitor(context_window=262144)
|
|
2243
2246
|
monitor.tool_turn_phase = "act"
|
|
2244
2247
|
monitor.tool_state_stagnation_streak = 4
|
|
2245
|
-
|
|
2248
|
+
# Use hash-format fingerprints to match _tool_call_fingerprint output
|
|
2249
|
+
monitor.tool_call_history = [
|
|
2250
|
+
"Bash:1e7b8d07", "TaskOutput", "Bash:1e7b8d07", "TaskOutput",
|
|
2251
|
+
"Bash:1e7b8d07", "TaskOutput",
|
|
2252
|
+
]
|
|
2246
2253
|
monitor.last_tool_fingerprint = "TaskOutput"
|
|
2247
2254
|
|
|
2248
2255
|
body = {
|
|
@@ -3262,8 +3269,11 @@ class TestCycleBreakOptions(unittest.TestCase):
|
|
|
3262
3269
|
monitor = proxy.SessionMonitor(context_window=262144)
|
|
3263
3270
|
monitor.tool_turn_phase = "act"
|
|
3264
3271
|
monitor.tool_state_forced_budget_remaining = 20
|
|
3265
|
-
|
|
3266
|
-
monitor.
|
|
3272
|
+
# Hash-format fingerprints matching Bash+{"command":"ls"}
|
|
3273
|
+
monitor.tool_call_history = [
|
|
3274
|
+
"Bash:781c24ad", "Bash:781c24ad", "Bash:781c24ad", "Bash:781c24ad",
|
|
3275
|
+
]
|
|
3276
|
+
monitor.last_tool_fingerprint = "Bash:781c24ad"
|
|
3267
3277
|
|
|
3268
3278
|
body = {
|
|
3269
3279
|
"model": "test",
|
|
@@ -3323,8 +3333,11 @@ class TestCycleBreakOptions(unittest.TestCase):
|
|
|
3323
3333
|
monitor = proxy.SessionMonitor(context_window=262144)
|
|
3324
3334
|
monitor.tool_turn_phase = "act"
|
|
3325
3335
|
monitor.tool_state_forced_budget_remaining = 20
|
|
3326
|
-
|
|
3327
|
-
monitor.
|
|
3336
|
+
# Hash-format fingerprints matching Bash+{"command":"ls"}
|
|
3337
|
+
monitor.tool_call_history = [
|
|
3338
|
+
"Bash:781c24ad", "Bash:781c24ad", "Bash:781c24ad", "Bash:781c24ad",
|
|
3339
|
+
]
|
|
3340
|
+
monitor.last_tool_fingerprint = "Bash:781c24ad"
|
|
3328
3341
|
|
|
3329
3342
|
body = {
|
|
3330
3343
|
"model": "test",
|
|
@@ -3369,9 +3382,9 @@ class TestCycleBreakOptions(unittest.TestCase):
|
|
|
3369
3382
|
"""Option 3: default forced budget reduced from 24 to 12."""
|
|
3370
3383
|
self.assertEqual(proxy.PROXY_TOOL_STATE_FORCED_BUDGET, 12)
|
|
3371
3384
|
|
|
3372
|
-
def
|
|
3373
|
-
"""Option 4: default review cycle limit
|
|
3374
|
-
self.assertEqual(proxy.PROXY_TOOL_STATE_REVIEW_CYCLE_LIMIT,
|
|
3385
|
+
def test_review_cycle_limit_default_is_3(self):
|
|
3386
|
+
"""Option 4: default review cycle limit is 3."""
|
|
3387
|
+
self.assertEqual(proxy.PROXY_TOOL_STATE_REVIEW_CYCLE_LIMIT, 3)
|
|
3375
3388
|
|
|
3376
3389
|
def test_cycling_tool_names_cleared_on_reset(self):
|
|
3377
3390
|
"""cycling_tool_names is cleared when tool turn state resets."""
|
|
@@ -3450,8 +3463,9 @@ class TestDegenerateRepetitionDetection(unittest.TestCase):
|
|
|
3450
3463
|
openai_resp = {
|
|
3451
3464
|
"choices": [{"message": {"content": repeated}, "finish_reason": "length"}]
|
|
3452
3465
|
}
|
|
3453
|
-
result = proxy._detect_and_truncate_degenerate_repetition(openai_resp)
|
|
3466
|
+
result, truncated = proxy._detect_and_truncate_degenerate_repetition(openai_resp)
|
|
3454
3467
|
truncated_text = result["choices"][0]["message"]["content"]
|
|
3468
|
+
self.assertTrue(truncated)
|
|
3455
3469
|
self.assertLess(len(truncated_text), len(repeated))
|
|
3456
3470
|
self.assertEqual(result["choices"][0]["finish_reason"], "stop")
|
|
3457
3471
|
|
|
@@ -3461,7 +3475,8 @@ class TestDegenerateRepetitionDetection(unittest.TestCase):
|
|
|
3461
3475
|
openai_resp = {
|
|
3462
3476
|
"choices": [{"message": {"content": text}, "finish_reason": "stop"}]
|
|
3463
3477
|
}
|
|
3464
|
-
result = proxy._detect_and_truncate_degenerate_repetition(openai_resp)
|
|
3478
|
+
result, truncated = proxy._detect_and_truncate_degenerate_repetition(openai_resp)
|
|
3479
|
+
self.assertFalse(truncated)
|
|
3465
3480
|
self.assertEqual(result["choices"][0]["message"]["content"], text)
|
|
3466
3481
|
|
|
3467
3482
|
def test_preserves_short_text(self):
|
|
@@ -3470,7 +3485,8 @@ class TestDegenerateRepetitionDetection(unittest.TestCase):
|
|
|
3470
3485
|
openai_resp = {
|
|
3471
3486
|
"choices": [{"message": {"content": text}, "finish_reason": "stop"}]
|
|
3472
3487
|
}
|
|
3473
|
-
result = proxy._detect_and_truncate_degenerate_repetition(openai_resp)
|
|
3488
|
+
result, truncated = proxy._detect_and_truncate_degenerate_repetition(openai_resp)
|
|
3489
|
+
self.assertFalse(truncated)
|
|
3474
3490
|
self.assertEqual(result["choices"][0]["message"]["content"], text)
|
|
3475
3491
|
|
|
3476
3492
|
def test_max_tokens_floor_skipped_for_non_tool_requests(self):
|
|
@@ -4220,3 +4236,342 @@ class TestReviewPhaseBootstrapReset(unittest.TestCase):
|
|
|
4220
4236
|
# The bootstrap reset only triggers for review phase
|
|
4221
4237
|
self.assertNotEqual(m.tool_turn_phase, "review")
|
|
4222
4238
|
# In act phase, the normal guardrail fallback path runs instead
|
|
4239
|
+
|
|
4240
|
+
|
|
4241
|
+
class TestReadOnlyCycleClassExclusion(unittest.TestCase):
|
|
4242
|
+
"""Tests for Option 1: read-only tool class exclusion on cycle break,
|
|
4243
|
+
Option 2: reduced cycle window (3), and Option 3: duplicate target dedup."""
|
|
4244
|
+
|
|
4245
|
+
def _make_body_with_tools(self, tool_names):
|
|
4246
|
+
"""Build a minimal Anthropic body with named tools and a tool_result."""
|
|
4247
|
+
tools = [
|
|
4248
|
+
{"name": n, "description": f"{n} tool", "input_schema": {"type": "object"}}
|
|
4249
|
+
for n in tool_names
|
|
4250
|
+
]
|
|
4251
|
+
return {
|
|
4252
|
+
"model": "test",
|
|
4253
|
+
"messages": [
|
|
4254
|
+
{"role": "user", "content": "do something"},
|
|
4255
|
+
{
|
|
4256
|
+
"role": "assistant",
|
|
4257
|
+
"content": [
|
|
4258
|
+
{
|
|
4259
|
+
"type": "tool_use",
|
|
4260
|
+
"id": "toolu_1",
|
|
4261
|
+
"name": tool_names[0],
|
|
4262
|
+
"input": {"file_path": "/some/file.ts"},
|
|
4263
|
+
}
|
|
4264
|
+
],
|
|
4265
|
+
},
|
|
4266
|
+
{
|
|
4267
|
+
"role": "user",
|
|
4268
|
+
"content": [
|
|
4269
|
+
{"type": "tool_result", "tool_use_id": "toolu_1", "content": "ok"}
|
|
4270
|
+
],
|
|
4271
|
+
},
|
|
4272
|
+
],
|
|
4273
|
+
"tools": tools,
|
|
4274
|
+
}
|
|
4275
|
+
|
|
4276
|
+
def test_read_only_class_exclusion_expands(self):
|
|
4277
|
+
"""When 'read' is cycling, all read-only tools are excluded, not just 'read'."""
|
|
4278
|
+
old_vals = {
|
|
4279
|
+
"PROXY_TOOL_STATE_MACHINE": getattr(proxy, "PROXY_TOOL_STATE_MACHINE"),
|
|
4280
|
+
"PROXY_TOOL_STATE_MIN_MESSAGES": getattr(proxy, "PROXY_TOOL_STATE_MIN_MESSAGES"),
|
|
4281
|
+
"PROXY_TOOL_STATE_FORCED_BUDGET": getattr(proxy, "PROXY_TOOL_STATE_FORCED_BUDGET"),
|
|
4282
|
+
"PROXY_TOOL_STATE_CYCLE_WINDOW": getattr(proxy, "PROXY_TOOL_STATE_CYCLE_WINDOW"),
|
|
4283
|
+
"PROXY_TOOL_STATE_STAGNATION_THRESHOLD": getattr(proxy, "PROXY_TOOL_STATE_STAGNATION_THRESHOLD"),
|
|
4284
|
+
}
|
|
4285
|
+
try:
|
|
4286
|
+
setattr(proxy, "PROXY_TOOL_STATE_MACHINE", True)
|
|
4287
|
+
setattr(proxy, "PROXY_TOOL_STATE_MIN_MESSAGES", 3)
|
|
4288
|
+
setattr(proxy, "PROXY_TOOL_STATE_FORCED_BUDGET", 3)
|
|
4289
|
+
setattr(proxy, "PROXY_TOOL_STATE_CYCLE_WINDOW", 3)
|
|
4290
|
+
setattr(proxy, "PROXY_TOOL_STATE_STAGNATION_THRESHOLD", 2)
|
|
4291
|
+
|
|
4292
|
+
all_tools = ["read", "glob", "grep", "bash", "write", "edit"]
|
|
4293
|
+
body = self._make_body_with_tools(all_tools)
|
|
4294
|
+
monitor = proxy.SessionMonitor(context_window=262144)
|
|
4295
|
+
|
|
4296
|
+
# Simulate cycling on 'read' by recording 3 identical fingerprints
|
|
4297
|
+
# Hash-format matching read+{"file_path":"/some/file.ts"}
|
|
4298
|
+
fp = "read:cfb28722"
|
|
4299
|
+
monitor.record_tool_calls(["read"], fingerprint=fp)
|
|
4300
|
+
monitor.record_tool_calls(["read"], fingerprint=fp)
|
|
4301
|
+
monitor.record_tool_calls(["read"], fingerprint=fp)
|
|
4302
|
+
|
|
4303
|
+
openai_body = proxy.build_openai_request(body, monitor)
|
|
4304
|
+
|
|
4305
|
+
# After cycle break, the tools in the body should exclude ALL
|
|
4306
|
+
# read-only tools, not just 'read'
|
|
4307
|
+
remaining_names = [
|
|
4308
|
+
t.get("function", {}).get("name") for t in openai_body.get("tools", [])
|
|
4309
|
+
]
|
|
4310
|
+
self.assertNotIn("read", remaining_names)
|
|
4311
|
+
self.assertNotIn("glob", remaining_names)
|
|
4312
|
+
self.assertNotIn("grep", remaining_names)
|
|
4313
|
+
# Write/action tools should remain
|
|
4314
|
+
self.assertIn("bash", remaining_names)
|
|
4315
|
+
self.assertIn("write", remaining_names)
|
|
4316
|
+
self.assertIn("edit", remaining_names)
|
|
4317
|
+
finally:
|
|
4318
|
+
for k, v in old_vals.items():
|
|
4319
|
+
setattr(proxy, k, v)
|
|
4320
|
+
|
|
4321
|
+
def test_non_read_tool_cycling_no_class_expansion(self):
|
|
4322
|
+
"""When 'bash' is cycling, only 'bash' is excluded, not read-only tools."""
|
|
4323
|
+
old_vals = {
|
|
4324
|
+
"PROXY_TOOL_STATE_MACHINE": getattr(proxy, "PROXY_TOOL_STATE_MACHINE"),
|
|
4325
|
+
"PROXY_TOOL_STATE_MIN_MESSAGES": getattr(proxy, "PROXY_TOOL_STATE_MIN_MESSAGES"),
|
|
4326
|
+
"PROXY_TOOL_STATE_FORCED_BUDGET": getattr(proxy, "PROXY_TOOL_STATE_FORCED_BUDGET"),
|
|
4327
|
+
"PROXY_TOOL_STATE_CYCLE_WINDOW": getattr(proxy, "PROXY_TOOL_STATE_CYCLE_WINDOW"),
|
|
4328
|
+
"PROXY_TOOL_STATE_STAGNATION_THRESHOLD": getattr(proxy, "PROXY_TOOL_STATE_STAGNATION_THRESHOLD"),
|
|
4329
|
+
}
|
|
4330
|
+
try:
|
|
4331
|
+
setattr(proxy, "PROXY_TOOL_STATE_MACHINE", True)
|
|
4332
|
+
setattr(proxy, "PROXY_TOOL_STATE_MIN_MESSAGES", 3)
|
|
4333
|
+
setattr(proxy, "PROXY_TOOL_STATE_FORCED_BUDGET", 3)
|
|
4334
|
+
setattr(proxy, "PROXY_TOOL_STATE_CYCLE_WINDOW", 3)
|
|
4335
|
+
setattr(proxy, "PROXY_TOOL_STATE_STAGNATION_THRESHOLD", 2)
|
|
4336
|
+
|
|
4337
|
+
all_tools = ["read", "glob", "grep", "bash", "write", "edit"]
|
|
4338
|
+
body = self._make_body_with_tools(all_tools)
|
|
4339
|
+
# Change the assistant tool_use to bash
|
|
4340
|
+
body["messages"][1]["content"][0]["name"] = "bash"
|
|
4341
|
+
body["messages"][1]["content"][0]["input"] = {"command": "ls"}
|
|
4342
|
+
monitor = proxy.SessionMonitor(context_window=262144)
|
|
4343
|
+
|
|
4344
|
+
# Use hash-format fingerprints matching bash+{"command":"ls"}
|
|
4345
|
+
fp = "bash:781c24ad"
|
|
4346
|
+
monitor.record_tool_calls(["bash"], fingerprint=fp)
|
|
4347
|
+
monitor.record_tool_calls(["bash"], fingerprint=fp)
|
|
4348
|
+
monitor.record_tool_calls(["bash"], fingerprint=fp)
|
|
4349
|
+
|
|
4350
|
+
openai_body = proxy.build_openai_request(body, monitor)
|
|
4351
|
+
|
|
4352
|
+
remaining_names = [
|
|
4353
|
+
t.get("function", {}).get("name") for t in openai_body.get("tools", [])
|
|
4354
|
+
]
|
|
4355
|
+
self.assertNotIn("bash", remaining_names)
|
|
4356
|
+
# Read-only tools should still be available
|
|
4357
|
+
self.assertIn("read", remaining_names)
|
|
4358
|
+
self.assertIn("glob", remaining_names)
|
|
4359
|
+
self.assertIn("grep", remaining_names)
|
|
4360
|
+
finally:
|
|
4361
|
+
for k, v in old_vals.items():
|
|
4362
|
+
setattr(proxy, k, v)
|
|
4363
|
+
|
|
4364
|
+
def test_duplicate_read_target_triggers_early_cycle(self):
|
|
4365
|
+
"""Option 3: reading same file 3+ times triggers early cycle break."""
|
|
4366
|
+
monitor = proxy.SessionMonitor(context_window=262144)
|
|
4367
|
+
|
|
4368
|
+
# Record 3 reads of same target
|
|
4369
|
+
monitor.record_tool_calls(["read"], tool_targets={"read": "/path/to/file.ts"})
|
|
4370
|
+
monitor.record_tool_calls(["read"], tool_targets={"read": "/path/to/file.ts"})
|
|
4371
|
+
monitor.record_tool_calls(["read"], tool_targets={"read": "/path/to/file.ts"})
|
|
4372
|
+
|
|
4373
|
+
dup, tool = monitor.has_duplicate_read_target(threshold=3)
|
|
4374
|
+
self.assertTrue(dup)
|
|
4375
|
+
self.assertEqual(tool, "read")
|
|
4376
|
+
|
|
4377
|
+
def test_different_read_targets_no_duplicate(self):
|
|
4378
|
+
"""Option 3: reading different files does NOT trigger duplicate detection."""
|
|
4379
|
+
monitor = proxy.SessionMonitor(context_window=262144)
|
|
4380
|
+
|
|
4381
|
+
monitor.record_tool_calls(["read"], tool_targets={"read": "/path/a.ts"})
|
|
4382
|
+
monitor.record_tool_calls(["read"], tool_targets={"read": "/path/b.ts"})
|
|
4383
|
+
monitor.record_tool_calls(["read"], tool_targets={"read": "/path/c.ts"})
|
|
4384
|
+
|
|
4385
|
+
dup, _ = monitor.has_duplicate_read_target(threshold=3)
|
|
4386
|
+
self.assertFalse(dup)
|
|
4387
|
+
|
|
4388
|
+
def test_cycle_window_default_is_3(self):
|
|
4389
|
+
"""Option 2: verify default cycle window is now 3."""
|
|
4390
|
+
# This tests the constant directly
|
|
4391
|
+
self.assertEqual(
|
|
4392
|
+
int(proxy.os.environ.get("PROXY_TOOL_STATE_CYCLE_WINDOW", "3")), 3
|
|
4393
|
+
)
|
|
4394
|
+
|
|
4395
|
+
def test_target_history_reset_on_state_reset(self):
|
|
4396
|
+
"""Target history is cleared when tool state resets."""
|
|
4397
|
+
monitor = proxy.SessionMonitor(context_window=262144)
|
|
4398
|
+
monitor.record_tool_calls(["read"], tool_targets={"read": "/file.ts"})
|
|
4399
|
+
monitor.record_tool_calls(["read"], tool_targets={"read": "/file.ts"})
|
|
4400
|
+
monitor.record_tool_calls(["read"], tool_targets={"read": "/file.ts"})
|
|
4401
|
+
|
|
4402
|
+
dup, _ = monitor.has_duplicate_read_target(threshold=3)
|
|
4403
|
+
self.assertTrue(dup)
|
|
4404
|
+
|
|
4405
|
+
monitor.reset_tool_turn_state(reason="test_reset")
|
|
4406
|
+
|
|
4407
|
+
dup, _ = monitor.has_duplicate_read_target(threshold=3)
|
|
4408
|
+
self.assertFalse(dup)
|
|
4409
|
+
|
|
4410
|
+
|
|
4411
|
+
class TestPersistentCycleExclusion(unittest.TestCase):
|
|
4412
|
+
"""Tests for Cycle 14: persistent exclusion, escalating hints, and
|
|
4413
|
+
exclusion across review→act transitions."""
|
|
4414
|
+
|
|
4415
|
+
def _make_body_with_tools(self, tool_names, active_tool="bash", active_input=None):
|
|
4416
|
+
tools = [
|
|
4417
|
+
{"name": n, "description": f"{n} tool", "input_schema": {"type": "object"}}
|
|
4418
|
+
for n in tool_names
|
|
4419
|
+
]
|
|
4420
|
+
inp = active_input or {"command": "ls"}
|
|
4421
|
+
return {
|
|
4422
|
+
"model": "test",
|
|
4423
|
+
"messages": [
|
|
4424
|
+
{"role": "user", "content": "do something"},
|
|
4425
|
+
{
|
|
4426
|
+
"role": "assistant",
|
|
4427
|
+
"content": [
|
|
4428
|
+
{"type": "tool_use", "id": "t1", "name": active_tool, "input": inp}
|
|
4429
|
+
],
|
|
4430
|
+
},
|
|
4431
|
+
{
|
|
4432
|
+
"role": "user",
|
|
4433
|
+
"content": [
|
|
4434
|
+
{"type": "tool_result", "tool_use_id": "t1", "content": "ok"}
|
|
4435
|
+
],
|
|
4436
|
+
},
|
|
4437
|
+
],
|
|
4438
|
+
"tools": tools,
|
|
4439
|
+
}
|
|
4440
|
+
|
|
4441
|
+
def test_exclusion_persists_through_act_phase(self):
|
|
4442
|
+
"""Option 1: cycling_tool_names exclusion persists in act phase after review."""
|
|
4443
|
+
old_vals = {}
|
|
4444
|
+
for k in ["PROXY_TOOL_STATE_MACHINE", "PROXY_TOOL_STATE_MIN_MESSAGES",
|
|
4445
|
+
"PROXY_TOOL_STATE_FORCED_BUDGET", "PROXY_TOOL_STATE_CYCLE_WINDOW",
|
|
4446
|
+
"PROXY_TOOL_STATE_STAGNATION_THRESHOLD"]:
|
|
4447
|
+
old_vals[k] = getattr(proxy, k)
|
|
4448
|
+
try:
|
|
4449
|
+
setattr(proxy, "PROXY_TOOL_STATE_MACHINE", True)
|
|
4450
|
+
setattr(proxy, "PROXY_TOOL_STATE_MIN_MESSAGES", 3)
|
|
4451
|
+
setattr(proxy, "PROXY_TOOL_STATE_FORCED_BUDGET", 6)
|
|
4452
|
+
setattr(proxy, "PROXY_TOOL_STATE_CYCLE_WINDOW", 3)
|
|
4453
|
+
setattr(proxy, "PROXY_TOOL_STATE_STAGNATION_THRESHOLD", 2)
|
|
4454
|
+
|
|
4455
|
+
all_tools = ["bash", "read", "write", "edit"]
|
|
4456
|
+
body = self._make_body_with_tools(all_tools)
|
|
4457
|
+
monitor = proxy.SessionMonitor(context_window=262144)
|
|
4458
|
+
|
|
4459
|
+
# Simulate bash cycling that triggers review
|
|
4460
|
+
monitor.cycling_tool_names = ["bash"]
|
|
4461
|
+
monitor.tool_turn_phase = "act"
|
|
4462
|
+
monitor.tool_state_forced_budget_remaining = 5
|
|
4463
|
+
|
|
4464
|
+
openai = proxy.build_openai_request(body, monitor)
|
|
4465
|
+
|
|
4466
|
+
# In act phase with cycling_tool_names set, bash should be excluded
|
|
4467
|
+
remaining = [t["function"]["name"] for t in openai.get("tools", [])]
|
|
4468
|
+
self.assertNotIn("bash", remaining)
|
|
4469
|
+
self.assertIn("read", remaining)
|
|
4470
|
+
self.assertIn("write", remaining)
|
|
4471
|
+
finally:
|
|
4472
|
+
for k, v in old_vals.items():
|
|
4473
|
+
setattr(proxy, k, v)
|
|
4474
|
+
|
|
4475
|
+
def test_exclusion_cleared_on_stagnation_clear(self):
|
|
4476
|
+
"""Option 1: cycling exclusion is lifted when stagnation clears in review."""
|
|
4477
|
+
monitor = proxy.SessionMonitor(context_window=262144)
|
|
4478
|
+
monitor.tool_turn_phase = "review"
|
|
4479
|
+
monitor.tool_state_review_cycles = 1
|
|
4480
|
+
monitor.tool_state_stagnation_streak = 0 # stagnation cleared
|
|
4481
|
+
monitor.cycling_tool_names = ["bash"]
|
|
4482
|
+
monitor.tool_state_auto_budget_remaining = 0
|
|
4483
|
+
monitor.tool_state_forced_budget_remaining = 6
|
|
4484
|
+
|
|
4485
|
+
# This should transition review→act and clear cycling names
|
|
4486
|
+
old_vals = {}
|
|
4487
|
+
for k in ["PROXY_TOOL_STATE_MACHINE", "PROXY_TOOL_STATE_MIN_MESSAGES",
|
|
4488
|
+
"PROXY_TOOL_STATE_FORCED_BUDGET"]:
|
|
4489
|
+
old_vals[k] = getattr(proxy, k)
|
|
4490
|
+
try:
|
|
4491
|
+
setattr(proxy, "PROXY_TOOL_STATE_MACHINE", True)
|
|
4492
|
+
setattr(proxy, "PROXY_TOOL_STATE_MIN_MESSAGES", 3)
|
|
4493
|
+
setattr(proxy, "PROXY_TOOL_STATE_FORCED_BUDGET", 6)
|
|
4494
|
+
|
|
4495
|
+
body = self._make_body_with_tools(["bash", "read", "write"])
|
|
4496
|
+
proxy.build_openai_request(body, monitor)
|
|
4497
|
+
|
|
4498
|
+
self.assertEqual(monitor.tool_turn_phase, "act")
|
|
4499
|
+
self.assertEqual(monitor.cycling_tool_names, [])
|
|
4500
|
+
finally:
|
|
4501
|
+
for k, v in old_vals.items():
|
|
4502
|
+
setattr(proxy, k, v)
|
|
4503
|
+
|
|
4504
|
+
def test_escalated_hint_on_cycle_2(self):
|
|
4505
|
+
"""Option 3: cycle 2+ gets escalated CRITICAL hint text."""
|
|
4506
|
+
old_vals = {}
|
|
4507
|
+
for k in ["PROXY_TOOL_STATE_MACHINE", "PROXY_TOOL_STATE_MIN_MESSAGES",
|
|
4508
|
+
"PROXY_TOOL_STATE_FORCED_BUDGET", "PROXY_TOOL_STATE_CYCLE_WINDOW",
|
|
4509
|
+
"PROXY_TOOL_STATE_STAGNATION_THRESHOLD"]:
|
|
4510
|
+
old_vals[k] = getattr(proxy, k)
|
|
4511
|
+
try:
|
|
4512
|
+
setattr(proxy, "PROXY_TOOL_STATE_MACHINE", True)
|
|
4513
|
+
setattr(proxy, "PROXY_TOOL_STATE_MIN_MESSAGES", 3)
|
|
4514
|
+
setattr(proxy, "PROXY_TOOL_STATE_FORCED_BUDGET", 20)
|
|
4515
|
+
setattr(proxy, "PROXY_TOOL_STATE_CYCLE_WINDOW", 3)
|
|
4516
|
+
setattr(proxy, "PROXY_TOOL_STATE_STAGNATION_THRESHOLD", 2)
|
|
4517
|
+
|
|
4518
|
+
all_tools = ["bash", "read", "write"]
|
|
4519
|
+
body = self._make_body_with_tools(all_tools)
|
|
4520
|
+
monitor = proxy.SessionMonitor(context_window=262144)
|
|
4521
|
+
# Pre-set as if we've already been through 1 review cycle
|
|
4522
|
+
monitor.tool_turn_phase = "act"
|
|
4523
|
+
monitor.tool_state_review_cycles = 1
|
|
4524
|
+
monitor.tool_state_forced_budget_remaining = 20
|
|
4525
|
+
monitor.tool_state_stagnation_streak = 3
|
|
4526
|
+
fp = "bash:781c24ad"
|
|
4527
|
+
monitor.tool_call_history = [fp, fp, fp]
|
|
4528
|
+
monitor.last_tool_fingerprint = fp
|
|
4529
|
+
|
|
4530
|
+
openai = proxy.build_openai_request(body, monitor)
|
|
4531
|
+
|
|
4532
|
+
# Should now be in review with cycles=2 and escalated hint
|
|
4533
|
+
self.assertEqual(monitor.tool_turn_phase, "review")
|
|
4534
|
+
self.assertEqual(monitor.tool_state_review_cycles, 2)
|
|
4535
|
+
messages = openai.get("messages", [])
|
|
4536
|
+
last_user = [m for m in messages if m.get("role") == "user"][-1]
|
|
4537
|
+
self.assertIn("CRITICAL", last_user["content"])
|
|
4538
|
+
self.assertIn("2 review rounds", last_user["content"])
|
|
4539
|
+
finally:
|
|
4540
|
+
for k, v in old_vals.items():
|
|
4541
|
+
setattr(proxy, k, v)
|
|
4542
|
+
|
|
4543
|
+
def test_mild_hint_on_cycle_1(self):
|
|
4544
|
+
"""Option 3: cycle 1 gets mild hint, not escalated."""
|
|
4545
|
+
old_vals = {}
|
|
4546
|
+
for k in ["PROXY_TOOL_STATE_MACHINE", "PROXY_TOOL_STATE_MIN_MESSAGES",
|
|
4547
|
+
"PROXY_TOOL_STATE_FORCED_BUDGET", "PROXY_TOOL_STATE_CYCLE_WINDOW",
|
|
4548
|
+
"PROXY_TOOL_STATE_STAGNATION_THRESHOLD"]:
|
|
4549
|
+
old_vals[k] = getattr(proxy, k)
|
|
4550
|
+
try:
|
|
4551
|
+
setattr(proxy, "PROXY_TOOL_STATE_MACHINE", True)
|
|
4552
|
+
setattr(proxy, "PROXY_TOOL_STATE_MIN_MESSAGES", 3)
|
|
4553
|
+
setattr(proxy, "PROXY_TOOL_STATE_FORCED_BUDGET", 20)
|
|
4554
|
+
setattr(proxy, "PROXY_TOOL_STATE_CYCLE_WINDOW", 3)
|
|
4555
|
+
setattr(proxy, "PROXY_TOOL_STATE_STAGNATION_THRESHOLD", 2)
|
|
4556
|
+
|
|
4557
|
+
body = self._make_body_with_tools(["bash", "read", "write"])
|
|
4558
|
+
monitor = proxy.SessionMonitor(context_window=262144)
|
|
4559
|
+
monitor.tool_turn_phase = "act"
|
|
4560
|
+
monitor.tool_state_review_cycles = 0
|
|
4561
|
+
monitor.tool_state_forced_budget_remaining = 20
|
|
4562
|
+
monitor.tool_state_stagnation_streak = 3
|
|
4563
|
+
fp = "bash:781c24ad"
|
|
4564
|
+
monitor.tool_call_history = [fp, fp, fp]
|
|
4565
|
+
monitor.last_tool_fingerprint = fp
|
|
4566
|
+
|
|
4567
|
+
openai = proxy.build_openai_request(body, monitor)
|
|
4568
|
+
|
|
4569
|
+
self.assertEqual(monitor.tool_turn_phase, "review")
|
|
4570
|
+
self.assertEqual(monitor.tool_state_review_cycles, 1)
|
|
4571
|
+
messages = openai.get("messages", [])
|
|
4572
|
+
last_user = [m for m in messages if m.get("role") == "user"][-1]
|
|
4573
|
+
self.assertNotIn("CRITICAL", last_user["content"])
|
|
4574
|
+
self.assertIn("DIFFERENT tool", last_user["content"])
|
|
4575
|
+
finally:
|
|
4576
|
+
for k, v in old_vals.items():
|
|
4577
|
+
setattr(proxy, k, v)
|