@miller-tech/uap 1.20.24 → 1.20.25
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json
CHANGED
|
@@ -155,16 +155,16 @@ PROXY_TOOL_STATE_FORCED_BUDGET = int(
|
|
|
155
155
|
)
|
|
156
156
|
PROXY_TOOL_STATE_AUTO_BUDGET = int(os.environ.get("PROXY_TOOL_STATE_AUTO_BUDGET", "2"))
|
|
157
157
|
PROXY_TOOL_STATE_STAGNATION_THRESHOLD = int(
|
|
158
|
-
os.environ.get("PROXY_TOOL_STATE_STAGNATION_THRESHOLD", "
|
|
158
|
+
os.environ.get("PROXY_TOOL_STATE_STAGNATION_THRESHOLD", "8")
|
|
159
159
|
)
|
|
160
160
|
PROXY_TOOL_STATE_CYCLE_WINDOW = int(
|
|
161
|
-
os.environ.get("PROXY_TOOL_STATE_CYCLE_WINDOW", "
|
|
161
|
+
os.environ.get("PROXY_TOOL_STATE_CYCLE_WINDOW", "3")
|
|
162
162
|
)
|
|
163
163
|
PROXY_TOOL_STATE_FINALIZE_THRESHOLD = int(
|
|
164
164
|
os.environ.get("PROXY_TOOL_STATE_FINALIZE_THRESHOLD", "18")
|
|
165
165
|
)
|
|
166
166
|
PROXY_TOOL_STATE_REVIEW_CYCLE_LIMIT = int(
|
|
167
|
-
os.environ.get("PROXY_TOOL_STATE_REVIEW_CYCLE_LIMIT", "
|
|
167
|
+
os.environ.get("PROXY_TOOL_STATE_REVIEW_CYCLE_LIMIT", "3")
|
|
168
168
|
)
|
|
169
169
|
PROXY_COMPLETION_RECOVERY_MAX = int(
|
|
170
170
|
os.environ.get("PROXY_COMPLETION_RECOVERY_MAX", "3")
|
|
@@ -189,6 +189,12 @@ PROXY_TOOL_NARROWING_EXPAND_ON_LOOP = os.environ.get(
|
|
|
189
189
|
"off",
|
|
190
190
|
"no",
|
|
191
191
|
}
|
|
192
|
+
# Read-only tools that should be excluded as a class when any one cycles
|
|
193
|
+
_READ_ONLY_TOOL_CLASS = frozenset({
|
|
194
|
+
"read", "glob", "grep", "Read", "Glob", "Grep",
|
|
195
|
+
"search", "Search", "list_files", "ListFiles",
|
|
196
|
+
})
|
|
197
|
+
|
|
192
198
|
PROXY_GUARDRAIL_RETRY = os.environ.get("PROXY_GUARDRAIL_RETRY", "on").lower() not in {
|
|
193
199
|
"0",
|
|
194
200
|
"false",
|
|
@@ -196,6 +202,9 @@ PROXY_GUARDRAIL_RETRY = os.environ.get("PROXY_GUARDRAIL_RETRY", "on").lower() no
|
|
|
196
202
|
"no",
|
|
197
203
|
}
|
|
198
204
|
PROXY_SESSION_TTL_SECS = int(os.environ.get("PROXY_SESSION_TTL_SECS", "7200"))
|
|
205
|
+
PROXY_FINALIZE_CONTINUATION_MAX = int(
|
|
206
|
+
os.environ.get("PROXY_FINALIZE_CONTINUATION_MAX", "3")
|
|
207
|
+
)
|
|
199
208
|
PROXY_STREAM_REASONING_FALLBACK = (
|
|
200
209
|
os.environ.get("PROXY_STREAM_REASONING_FALLBACK", "off").strip().lower()
|
|
201
210
|
)
|
|
@@ -621,6 +630,9 @@ class SessionMonitor:
|
|
|
621
630
|
tool_call_history: list = field(
|
|
622
631
|
default_factory=list
|
|
623
632
|
) # Recent tool call fingerprints
|
|
633
|
+
tool_target_history: dict = field(
|
|
634
|
+
default_factory=dict
|
|
635
|
+
) # {tool_name: {target: count}} for read-only dedup
|
|
624
636
|
consecutive_forced_count: int = (
|
|
625
637
|
0 # How many times tool_choice was forced consecutively
|
|
626
638
|
)
|
|
@@ -646,6 +658,8 @@ class SessionMonitor:
|
|
|
646
658
|
cycling_tool_names: list = field(default_factory=list)
|
|
647
659
|
last_response_garbled: bool = False # previous turn had garbled/malformed output
|
|
648
660
|
finalize_turn_active: bool = False
|
|
661
|
+
finalize_continuation_count: int = 0
|
|
662
|
+
finalize_synthetic_tool_id: str = ""
|
|
649
663
|
completion_required: bool = False
|
|
650
664
|
completion_pending: bool = False
|
|
651
665
|
completion_verified: bool = False
|
|
@@ -753,14 +767,47 @@ class SessionMonitor:
|
|
|
753
767
|
|
|
754
768
|
# --- Token Loop Protection Methods ---
|
|
755
769
|
|
|
756
|
-
def record_tool_calls(
|
|
757
|
-
|
|
758
|
-
|
|
759
|
-
|
|
770
|
+
def record_tool_calls(
|
|
771
|
+
self,
|
|
772
|
+
tool_names: list[str],
|
|
773
|
+
tool_targets: dict[str, str] | None = None,
|
|
774
|
+
fingerprint: str = "",
|
|
775
|
+
):
|
|
776
|
+
"""Record tool call names for loop detection.
|
|
777
|
+
|
|
778
|
+
tool_targets: optional {tool_name: target_key} for read-only dedup.
|
|
779
|
+
e.g. {"read": "/path/to/file", "glob": "**/*.ts"}
|
|
780
|
+
If a pre-computed fingerprint (with argument hashes) is provided,
|
|
781
|
+
use it directly. Otherwise fall back to name-only fingerprint.
|
|
782
|
+
"""
|
|
783
|
+
fp = fingerprint or ("|".join(sorted(tool_names)) if tool_names else "")
|
|
784
|
+
self.tool_call_history.append(fp)
|
|
760
785
|
# Keep last 30 entries
|
|
761
786
|
if len(self.tool_call_history) > 30:
|
|
762
787
|
self.tool_call_history = self.tool_call_history[-30:]
|
|
763
788
|
|
|
789
|
+
# Track read-only tool targets for dedup (Option 3)
|
|
790
|
+
if tool_targets:
|
|
791
|
+
for name, target in tool_targets.items():
|
|
792
|
+
if name.lower() in {n.lower() for n in _READ_ONLY_TOOL_CLASS} and target:
|
|
793
|
+
by_tool = self.tool_target_history.setdefault(name, {})
|
|
794
|
+
by_tool[target] = by_tool.get(target, 0) + 1
|
|
795
|
+
|
|
796
|
+
def has_duplicate_read_target(self, threshold: int = 2) -> tuple[bool, str]:
    """Report whether any recorded target has been hit ``threshold`` times.

    Walks ``self.tool_target_history`` (``{tool_name: {target: count}}``) in
    its stored order.

    Returns:
        ``(True, tool_name)`` for the first tool that has at least one target
        with a count of ``threshold`` or more, otherwise ``(False, "")``.
    """
    for tool, hits in self.tool_target_history.items():
        if any(seen >= threshold for seen in hits.values()):
            return True, tool
    return False, ""
|
|
806
|
+
|
|
807
|
+
def reset_tool_targets(self):
    """Forget every recorded read-only target.

    Rebinds ``tool_target_history`` to a new empty dict rather than
    clearing the existing one in place. Intended for phase resets and
    fresh user text.
    """
    self.tool_target_history = dict()
|
|
810
|
+
|
|
764
811
|
def detect_tool_loop(self, window: int = 6) -> tuple[bool, int]:
|
|
765
812
|
"""Detect if the model is stuck in a tool call loop.
|
|
766
813
|
|
|
@@ -851,6 +898,7 @@ class SessionMonitor:
|
|
|
851
898
|
self.tool_state_review_cycles = 0
|
|
852
899
|
self.cycling_tool_names = []
|
|
853
900
|
self.last_tool_fingerprint = ""
|
|
901
|
+
self.reset_tool_targets()
|
|
854
902
|
|
|
855
903
|
def update_completion_state(self, anthropic_body: dict, has_tool_results: bool):
|
|
856
904
|
self.completion_required = _should_enforce_completion_contract(anthropic_body)
|
|
@@ -2095,6 +2143,8 @@ def _resolve_state_machine_tool_choice(
|
|
|
2095
2143
|
monitor.invalid_tool_call_streak = 0
|
|
2096
2144
|
monitor.required_tool_miss_streak = 0
|
|
2097
2145
|
monitor.reset_tool_turn_state(reason="fresh_user_text")
|
|
2146
|
+
monitor.finalize_continuation_count = 0
|
|
2147
|
+
monitor.finalize_synthetic_tool_id = ""
|
|
2098
2148
|
return None, "fresh_user_text"
|
|
2099
2149
|
|
|
2100
2150
|
active_loop = (
|
|
@@ -2113,6 +2163,8 @@ def _resolve_state_machine_tool_choice(
|
|
|
2113
2163
|
monitor.invalid_tool_call_streak = 0
|
|
2114
2164
|
monitor.required_tool_miss_streak = 0
|
|
2115
2165
|
monitor.reset_tool_turn_state(reason="inactive_loop")
|
|
2166
|
+
monitor.finalize_continuation_count = 0
|
|
2167
|
+
monitor.finalize_synthetic_tool_id = ""
|
|
2116
2168
|
return None, "inactive_loop"
|
|
2117
2169
|
|
|
2118
2170
|
if monitor.tool_turn_phase == "bootstrap":
|
|
@@ -2158,6 +2210,16 @@ def _resolve_state_machine_tool_choice(
|
|
|
2158
2210
|
return "finalize", "review_cycle_limit"
|
|
2159
2211
|
|
|
2160
2212
|
if monitor.tool_turn_phase == "act":
|
|
2213
|
+
# Option 3: Early cycle break when same read target is hit 3+ times
|
|
2214
|
+
dup_target, dup_tool = monitor.has_duplicate_read_target(threshold=3)
|
|
2215
|
+
if dup_target and not cycle_looping and not stagnating:
|
|
2216
|
+
cycle_looping = True
|
|
2217
|
+
cycle_repeat = 2
|
|
2218
|
+
logger.warning(
|
|
2219
|
+
"TOOL STATE MACHINE: duplicate read target detected for '%s', triggering early cycle break",
|
|
2220
|
+
dup_tool,
|
|
2221
|
+
)
|
|
2222
|
+
|
|
2161
2223
|
if cycle_looping or stagnating:
|
|
2162
2224
|
reason = "cycle_detected" if cycle_looping else "stagnation"
|
|
2163
2225
|
monitor.set_tool_turn_phase("review", reason=reason)
|
|
@@ -2169,9 +2231,15 @@ def _resolve_state_machine_tool_choice(
|
|
|
2169
2231
|
1, PROXY_TOOL_STATE_FORCED_BUDGET // 2
|
|
2170
2232
|
)
|
|
2171
2233
|
# Capture which tools are cycling for narrowing/hint injection
|
|
2234
|
+
# Strip argument hashes (e.g. "glob:abc12345" -> "glob") so that
|
|
2235
|
+
# tool narrowing can match against actual tool names.
|
|
2172
2236
|
window = max(2, PROXY_TOOL_STATE_CYCLE_WINDOW)
|
|
2173
2237
|
recent = [fp for fp in monitor.tool_call_history[-window:] if fp]
|
|
2174
|
-
|
|
2238
|
+
raw_names = []
|
|
2239
|
+
for fp in recent:
|
|
2240
|
+
for part in fp.split("|"):
|
|
2241
|
+
raw_names.append(part.split(":")[0])
|
|
2242
|
+
monitor.cycling_tool_names = list(dict.fromkeys(raw_names))
|
|
2175
2243
|
logger.warning(
|
|
2176
2244
|
"TOOL STATE MACHINE: entering review (cycle=%s repeat=%d stagnation=%d cycles=%d cycling_tools=%s)",
|
|
2177
2245
|
cycle_looping,
|
|
@@ -2184,7 +2252,11 @@ def _resolve_state_machine_tool_choice(
|
|
|
2184
2252
|
|
|
2185
2253
|
if monitor.tool_state_forced_budget_remaining <= 0:
|
|
2186
2254
|
monitor.set_tool_turn_phase("review", reason="forced_budget_exhausted")
|
|
2187
|
-
|
|
2255
|
+
# Only count toward review cycle limit if there was an actual
|
|
2256
|
+
# cycle/stagnation detected. Budget exhaustion alone means the
|
|
2257
|
+
# model is working — it just used all its turns — not cycling.
|
|
2258
|
+
if cycle_looping or stagnating:
|
|
2259
|
+
monitor.tool_state_review_cycles += 1
|
|
2188
2260
|
monitor.tool_state_auto_budget_remaining = max(
|
|
2189
2261
|
1, PROXY_TOOL_STATE_AUTO_BUDGET
|
|
2190
2262
|
)
|
|
@@ -2192,8 +2264,10 @@ def _resolve_state_machine_tool_choice(
|
|
|
2192
2264
|
1, PROXY_TOOL_STATE_FORCED_BUDGET // 2
|
|
2193
2265
|
)
|
|
2194
2266
|
logger.warning(
|
|
2195
|
-
"TOOL STATE MACHINE: forced budget exhausted, entering review (cycles=%d)",
|
|
2267
|
+
"TOOL STATE MACHINE: forced budget exhausted, entering review (cycles=%d cycling=%s stagnating=%s)",
|
|
2196
2268
|
monitor.tool_state_review_cycles,
|
|
2269
|
+
cycle_looping,
|
|
2270
|
+
stagnating,
|
|
2197
2271
|
)
|
|
2198
2272
|
return "required", "forced_budget_exhausted"
|
|
2199
2273
|
|
|
@@ -2206,6 +2280,14 @@ def _resolve_state_machine_tool_choice(
|
|
|
2206
2280
|
monitor.tool_state_forced_budget_remaining = max(
|
|
2207
2281
|
1, PROXY_TOOL_STATE_FORCED_BUDGET // 2
|
|
2208
2282
|
)
|
|
2283
|
+
# If stagnation cleared during review, the model tried a
|
|
2284
|
+
# different approach — reward by reducing cycle pressure.
|
|
2285
|
+
if monitor.tool_state_stagnation_streak == 0 and monitor.tool_state_review_cycles > 0:
|
|
2286
|
+
monitor.tool_state_review_cycles = max(0, monitor.tool_state_review_cycles - 1)
|
|
2287
|
+
logger.info(
|
|
2288
|
+
"TOOL STATE MACHINE: review_cycles decremented to %d (stagnation cleared)",
|
|
2289
|
+
monitor.tool_state_review_cycles,
|
|
2290
|
+
)
|
|
2209
2291
|
return "required", "review_complete"
|
|
2210
2292
|
|
|
2211
2293
|
monitor.tool_state_auto_budget_remaining -= 1
|
|
@@ -2416,6 +2498,9 @@ def build_openai_request(
|
|
|
2416
2498
|
n_msgs = len(anthropic_body.get("messages", []))
|
|
2417
2499
|
has_tool_results = _conversation_has_tool_results(anthropic_body)
|
|
2418
2500
|
|
|
2501
|
+
# Detect and strip synthetic finalize continuation before fingerprinting
|
|
2502
|
+
_detect_and_strip_synthetic_continuation(anthropic_body, monitor)
|
|
2503
|
+
|
|
2419
2504
|
# Record tool calls from the last assistant message for loop detection
|
|
2420
2505
|
latest_tool_fingerprint = _record_last_assistant_tool_calls(
|
|
2421
2506
|
anthropic_body, monitor
|
|
@@ -2524,24 +2609,31 @@ def build_openai_request(
|
|
|
2524
2609
|
cycling_names,
|
|
2525
2610
|
)
|
|
2526
2611
|
# Option 2: Narrow tools during review to exclude cycling tools
|
|
2612
|
+
# Option 1 enhancement: if any cycling tool is read-only, exclude
|
|
2613
|
+
# the entire read-only class to prevent tool-hopping (read→glob→grep)
|
|
2527
2614
|
if (
|
|
2528
2615
|
monitor.tool_turn_phase == "review"
|
|
2529
2616
|
and monitor.cycling_tool_names
|
|
2530
2617
|
and "tools" in openai_body
|
|
2531
2618
|
):
|
|
2619
|
+
exclude_set = set(monitor.cycling_tool_names)
|
|
2620
|
+
# Expand to full read-only class if any cycling tool is read-only
|
|
2621
|
+
if any(n.lower() in {c.lower() for c in _READ_ONLY_TOOL_CLASS} for n in exclude_set):
|
|
2622
|
+
exclude_set |= _READ_ONLY_TOOL_CLASS
|
|
2532
2623
|
original_count = len(openai_body["tools"])
|
|
2533
2624
|
narrowed = [
|
|
2534
2625
|
t
|
|
2535
2626
|
for t in openai_body["tools"]
|
|
2536
|
-
if t.get("function", {}).get("name") not in
|
|
2627
|
+
if t.get("function", {}).get("name") not in exclude_set
|
|
2537
2628
|
]
|
|
2538
2629
|
if narrowed:
|
|
2539
2630
|
openai_body["tools"] = narrowed
|
|
2540
2631
|
logger.warning(
|
|
2541
|
-
"CYCLE BREAK: narrowed tools from %d to %d (excluded %s)",
|
|
2632
|
+
"CYCLE BREAK: narrowed tools from %d to %d (excluded %s, read_only_class=%s)",
|
|
2542
2633
|
original_count,
|
|
2543
2634
|
len(narrowed),
|
|
2544
2635
|
monitor.cycling_tool_names,
|
|
2636
|
+
any(n.lower() in {c.lower() for c in _READ_ONLY_TOOL_CLASS} for n in monitor.cycling_tool_names),
|
|
2545
2637
|
)
|
|
2546
2638
|
else:
|
|
2547
2639
|
logger.warning(
|
|
@@ -2602,13 +2694,117 @@ def build_openai_request(
|
|
|
2602
2694
|
return openai_body
|
|
2603
2695
|
|
|
2604
2696
|
|
|
2697
|
+
def _tool_call_fingerprint(block: dict) -> str:
    """Return a fingerprint made of a tool call's name and its arguments.

    Including a short hash of the arguments prevents false cycle detection
    when the same tool is called with different arguments (e.g. reading
    different files).

    Args:
        block: A ``tool_use`` content block; ``name`` and ``input`` are read.

    Returns:
        ``"<name>:<hash8>"`` when ``input`` is non-empty, where ``hash8`` is
        the first 8 hex characters of the MD5 of the canonical (sorted-key,
        compact) JSON encoding of ``input``; otherwise just ``"<name>"``.
        A missing or empty name is reported as ``"unknown"``.
    """
    # A present-but-null name must not leak out as a non-str return value:
    # callers join fingerprints with "|".
    name = str(block.get("name") or "unknown")
    inp = block.get("input")
    if not inp:
        return name
    arg_str = json.dumps(inp, sort_keys=True, separators=(",", ":"))
    # MD5 is only a cheap change detector here, not a security primitive;
    # saying so keeps it usable on FIPS-restricted Python builds.
    digest = hashlib.md5(arg_str.encode(), usedforsecurity=False).hexdigest()
    return f"{name}:{digest[:8]}"
|
|
2709
|
+
|
|
2710
|
+
|
|
2711
|
+
def _detect_and_strip_synthetic_continuation(
    anthropic_body: dict, monitor: SessionMonitor
) -> bool:
    """Remove a synthetic finalize-continuation tool_use/tool_result pair.

    If the most recent user message carries a ``tool_result`` whose
    ``tool_use_id`` equals ``monitor.finalize_synthetic_tool_id``, that
    result and the matching ``tool_use`` in the assistant message that
    precedes it are stripped from ``anthropic_body["messages"]`` in place,
    and the monitor is reset so the model gets a fresh act cycle.

    Args:
        anthropic_body: Request body; its ``messages`` list is mutated.
        monitor: Session state holding the pending synthetic tool id.

    Returns:
        True if a synthetic continuation was detected and handled, else
        False (nothing is modified in that case).
    """
    synthetic_id = monitor.finalize_synthetic_tool_id
    if not synthetic_id:
        return False

    messages = anthropic_body.get("messages", [])
    if not messages:
        return False

    def _is_synthetic(item, block_type: str, id_key: str) -> bool:
        # True for a content block of the given type tied to the synthetic id.
        return (
            isinstance(item, dict)
            and item.get("type") == block_type
            and item.get(id_key) == synthetic_id
        )

    # Only the most recent user message is inspected for the tool_result.
    user_idx = next(
        (
            i
            for i in range(len(messages) - 1, -1, -1)
            if messages[i].get("role") == "user"
        ),
        None,
    )
    if user_idx is None:
        return False

    user_msg = messages[user_idx]
    content = user_msg.get("content")
    if not isinstance(content, list):
        return False

    kept = [b for b in content if not _is_synthetic(b, "tool_result", "tool_use_id")]
    if len(kept) == len(content):
        # No synthetic tool_result present.
        return False

    # A user message must not be left empty once the result is removed.
    user_msg["content"] = kept or [
        {"type": "text", "text": "Continue working on the task."}
    ]

    # Strip the synthetic tool_use from the assistant message that precedes
    # this user message. Searching only messages[:user_idx] avoids picking a
    # trailing assistant message that comes after the user turn, which would
    # leave the real tool_use behind without its tool_result.
    for asst_msg in reversed(messages[:user_idx]):
        if asst_msg.get("role") != "assistant":
            continue
        asst_content = asst_msg.get("content")
        if isinstance(asst_content, list):
            asst_msg["content"] = [
                b for b in asst_content if not _is_synthetic(b, "tool_use", "id")
            ]
        break

    # Reset state machine for fresh act cycle
    monitor.finalize_synthetic_tool_id = ""
    monitor.reset_tool_turn_state(reason="finalize_continuation_resume")
    monitor.reset_completion_recovery()
    monitor.tool_call_history = []
    logger.info(
        "FINALIZE CONTINUATION: stripped synthetic tool id=%s, "
        "reset state machine for fresh act cycle (continuations=%d/%d)",
        synthetic_id,
        monitor.finalize_continuation_count,
        PROXY_FINALIZE_CONTINUATION_MAX,
    )
    return True
|
|
2794
|
+
|
|
2795
|
+
|
|
2605
2796
|
def _record_last_assistant_tool_calls(
|
|
2606
2797
|
anthropic_body: dict, monitor: SessionMonitor
|
|
2607
2798
|
) -> str:
|
|
2608
2799
|
"""Extract tool call names from the last assistant message and record
|
|
2609
|
-
them in the session monitor for loop detection.
|
|
2800
|
+
them in the session monitor for loop detection.
|
|
2801
|
+
|
|
2802
|
+
Fingerprints now include an argument hash so that the same tool called
|
|
2803
|
+
with different arguments (e.g. read(file_a) vs read(file_b)) produces
|
|
2804
|
+
distinct fingerprints, preventing false cycle/stagnation detection."""
|
|
2610
2805
|
messages = anthropic_body.get("messages", [])
|
|
2611
|
-
|
|
2806
|
+
tool_fingerprints = []
|
|
2807
|
+
tool_targets: dict[str, str] = {}
|
|
2612
2808
|
for msg in reversed(messages):
|
|
2613
2809
|
if msg.get("role") != "assistant":
|
|
2614
2810
|
continue
|
|
@@ -2616,11 +2812,28 @@ def _record_last_assistant_tool_calls(
|
|
|
2616
2812
|
if isinstance(content, list):
|
|
2617
2813
|
for block in content:
|
|
2618
2814
|
if isinstance(block, dict) and block.get("type") == "tool_use":
|
|
2619
|
-
|
|
2815
|
+
tool_fingerprints.append(_tool_call_fingerprint(block))
|
|
2816
|
+
# Extract target key for read-only dedup (Option 3)
|
|
2817
|
+
name = block.get("name", "unknown")
|
|
2818
|
+
inp = block.get("input", {})
|
|
2819
|
+
if isinstance(inp, dict):
|
|
2820
|
+
target = (
|
|
2821
|
+
inp.get("file_path")
|
|
2822
|
+
or inp.get("path")
|
|
2823
|
+
or inp.get("pattern")
|
|
2824
|
+
or inp.get("command", "")[:80]
|
|
2825
|
+
)
|
|
2826
|
+
if target:
|
|
2827
|
+
tool_targets[name] = str(target)
|
|
2620
2828
|
break
|
|
2621
|
-
if
|
|
2622
|
-
|
|
2623
|
-
|
|
2829
|
+
if tool_fingerprints:
|
|
2830
|
+
fingerprint = "|".join(sorted(tool_fingerprints))
|
|
2831
|
+
monitor.record_tool_calls(
|
|
2832
|
+
[fp.split(":")[0] for fp in tool_fingerprints],
|
|
2833
|
+
tool_targets=tool_targets,
|
|
2834
|
+
fingerprint=fingerprint,
|
|
2835
|
+
)
|
|
2836
|
+
return fingerprint
|
|
2624
2837
|
return ""
|
|
2625
2838
|
|
|
2626
2839
|
|
|
@@ -4750,16 +4963,20 @@ def _maybe_extract_text_tool_calls(openai_resp: dict) -> dict:
|
|
|
4750
4963
|
return openai_resp
|
|
4751
4964
|
|
|
4752
4965
|
|
|
4753
|
-
def _detect_and_truncate_degenerate_repetition(
|
|
4966
|
+
def _detect_and_truncate_degenerate_repetition(
|
|
4967
|
+
openai_resp: dict,
|
|
4968
|
+
) -> tuple[dict, bool]:
|
|
4754
4969
|
"""Detect degenerate repetitive text and truncate at first repetition.
|
|
4755
4970
|
|
|
4756
4971
|
When the model produces highly repetitive output (e.g. the same 20+ char
|
|
4757
4972
|
substring repeated 10+ times), truncate at the first repetition boundary
|
|
4758
4973
|
and set finish_reason to stop.
|
|
4974
|
+
|
|
4975
|
+
Returns (response, was_degenerate) so the caller can retry if needed.
|
|
4759
4976
|
"""
|
|
4760
4977
|
text = _openai_message_text(openai_resp)
|
|
4761
4978
|
if not text or len(text) < 200:
|
|
4762
|
-
return openai_resp
|
|
4979
|
+
return openai_resp, False
|
|
4763
4980
|
|
|
4764
4981
|
# Look for repeated substrings of length 20-100
|
|
4765
4982
|
for substr_len in (60, 40, 20):
|
|
@@ -4788,8 +5005,70 @@ def _detect_and_truncate_degenerate_repetition(openai_resp: dict) -> dict:
|
|
|
4788
5005
|
msg = choices[0].get("message", {})
|
|
4789
5006
|
msg["content"] = truncated
|
|
4790
5007
|
choices[0]["finish_reason"] = "stop"
|
|
4791
|
-
return openai_resp
|
|
4792
|
-
return openai_resp
|
|
5008
|
+
return openai_resp, True
|
|
5009
|
+
return openai_resp, False
|
|
5010
|
+
|
|
5011
|
+
|
|
5012
|
+
def _client_has_tool(anthropic_body: dict, tool_name: str) -> bool:
    """Check whether the client's tool list contains ``tool_name``.

    The comparison is case-insensitive. A missing or null ``tools`` entry is
    treated as an empty list, and entries that are not dicts or have no
    name are ignored.

    Args:
        anthropic_body: Request body whose ``tools`` list is searched.
        tool_name: Tool name to look for, in any casing.

    Returns:
        True if some tool's ``name`` equals ``tool_name`` ignoring case.
    """
    wanted = tool_name.lower()
    # "tools" may be present but null; .get(..., []) would return None then.
    tools = anthropic_body.get("tools") or []
    return any(
        isinstance(t, dict) and (t.get("name") or "").lower() == wanted
        for t in tools
    )
|
|
5018
|
+
|
|
5019
|
+
|
|
5020
|
+
def _client_tool_name(anthropic_body: dict, tool_name: str) -> str:
    """Return ``tool_name`` spelled the way the client declares it.

    The lookup is case-insensitive; the returned string keeps the client's
    own casing so it can be echoed back in a ``tool_use`` block. A missing
    or null ``tools`` entry is treated as empty, and entries that are not
    dicts or have no name are skipped.

    Args:
        anthropic_body: Request body whose ``tools`` list is searched.
        tool_name: Tool name to look for, in any casing.

    Returns:
        The first matching tool's ``name`` as declared by the client, or
        ``tool_name`` unchanged when no tool matches.
    """
    wanted = tool_name.lower()
    for t in anthropic_body.get("tools") or []:
        if isinstance(t, dict) and (t.get("name") or "").lower() == wanted:
            return t["name"]
    return tool_name
|
|
5027
|
+
|
|
5028
|
+
|
|
5029
|
+
def _inject_synthetic_continuation(
    anthropic_resp: dict, monitor: SessionMonitor, anthropic_body: dict
) -> dict:
    """Append a no-op ``tool_use`` block to a finalize-turn response.

    The extra block and the ``"tool_use"`` stop reason are meant to keep the
    client's agentic loop sending requests. The no-op is a read of
    ``/dev/null`` when the client declares a "read" tool, otherwise a
    ``true`` shell command when it declares "bash"; tool names are matched
    case-insensitively and emitted with the client's own casing.

    Args:
        anthropic_resp: Response to extend; mutated and returned.
        monitor: Session state; receives the synthetic tool id and has its
            continuation counter incremented.
        anthropic_body: Original request, consulted for the client's tools.

    Returns:
        ``anthropic_resp``, unchanged when the client has neither tool.
    """
    # No-op calls in preference order: (generic tool name, arguments).
    noop_candidates = (
        ("read", {"file_path": "/dev/null"}),
        ("bash", {"command": "true", "description": "continuation ping"}),
    )
    chosen = next(
        (pair for pair in noop_candidates if _client_has_tool(anthropic_body, pair[0])),
        None,
    )
    if chosen is None:
        logger.warning("FINALIZE CONTINUATION: no suitable tool found, skipping injection")
        return anthropic_resp

    generic_name, tool_input = chosen
    tool_name = _client_tool_name(anthropic_body, generic_name)

    # Remember the id so the follow-up request can be recognised later.
    synthetic_id = "toolu_" + uuid.uuid4().hex[:12]
    monitor.finalize_synthetic_tool_id = synthetic_id
    monitor.finalize_continuation_count += 1

    blocks = anthropic_resp.get("content", [])
    blocks.append(
        {
            "type": "tool_use",
            "id": synthetic_id,
            "name": tool_name,
            "input": tool_input,
        }
    )
    anthropic_resp["content"] = blocks
    anthropic_resp["stop_reason"] = "tool_use"

    logger.info(
        "FINALIZE CONTINUATION: injected synthetic %s tool_use id=%s (count=%d/%d)",
        tool_name,
        synthetic_id,
        monitor.finalize_continuation_count,
        PROXY_FINALIZE_CONTINUATION_MAX,
    )
    return anthropic_resp
|
|
4793
5072
|
|
|
4794
5073
|
|
|
4795
5074
|
def openai_to_anthropic_response(openai_resp: dict, model: str) -> dict:
|
|
@@ -5623,8 +5902,51 @@ async def messages(request: Request):
|
|
|
5623
5902
|
session_id,
|
|
5624
5903
|
)
|
|
5625
5904
|
|
|
5626
|
-
openai_resp = _detect_and_truncate_degenerate_repetition(openai_resp)
|
|
5905
|
+
openai_resp, was_degenerate = _detect_and_truncate_degenerate_repetition(openai_resp)
|
|
5906
|
+
if was_degenerate:
|
|
5907
|
+
# Retry with constrained parameters to avoid degenerate output.
|
|
5908
|
+
# With tools: force tool_choice=required for a useful tool call.
|
|
5909
|
+
# Without tools (finalize): retry with capped max_tokens for clean text.
|
|
5910
|
+
has_tools = bool(strict_body.get("tools"))
|
|
5911
|
+
retry_body = dict(strict_body)
|
|
5912
|
+
retry_body["max_tokens"] = 2048
|
|
5913
|
+
retry_body["temperature"] = 0.1
|
|
5914
|
+
retry_body["stream"] = False
|
|
5915
|
+
if has_tools:
|
|
5916
|
+
retry_body["tool_choice"] = "required"
|
|
5917
|
+
logger.warning("DEGENERATE RETRY: retrying with tool_choice=required max_tokens=2048")
|
|
5918
|
+
else:
|
|
5919
|
+
logger.warning("DEGENERATE RETRY: retrying text-only with max_tokens=2048 temp=0.1")
|
|
5920
|
+
try:
|
|
5921
|
+
retry_resp = await _post_with_generation_timeout(
|
|
5922
|
+
client, f"{LLAMA_CPP_BASE}/chat/completions", retry_body,
|
|
5923
|
+
{"Content-Type": "application/json"},
|
|
5924
|
+
)
|
|
5925
|
+
if retry_resp.status_code == 200:
|
|
5926
|
+
retry_data = retry_resp.json()
|
|
5927
|
+
retry_text = _openai_message_text(retry_data)
|
|
5928
|
+
_, retry_degenerate = _detect_and_truncate_degenerate_repetition(retry_data)
|
|
5929
|
+
if retry_degenerate:
|
|
5930
|
+
logger.info("DEGENERATE RETRY: retry also degenerate, using truncated original")
|
|
5931
|
+
elif has_tools and (retry_data.get("choices", [{}])[0]
|
|
5932
|
+
.get("message", {}).get("tool_calls")):
|
|
5933
|
+
logger.info("DEGENERATE RETRY: success, got tool call")
|
|
5934
|
+
openai_resp = retry_data
|
|
5935
|
+
elif not has_tools and retry_text and len(retry_text) > 50:
|
|
5936
|
+
logger.info("DEGENERATE RETRY: success, got clean text (%d chars)", len(retry_text))
|
|
5937
|
+
openai_resp = retry_data
|
|
5938
|
+
else:
|
|
5939
|
+
logger.info("DEGENERATE RETRY: retry insufficient, using truncated original")
|
|
5940
|
+
except Exception as exc:
|
|
5941
|
+
logger.warning("DEGENERATE RETRY: failed: %s", exc)
|
|
5627
5942
|
anthropic_resp = openai_to_anthropic_response(openai_resp, model)
|
|
5943
|
+
# FINALIZE CONTINUATION: inject synthetic tool_use to keep client loop alive
|
|
5944
|
+
if (
|
|
5945
|
+
monitor.finalize_turn_active
|
|
5946
|
+
and monitor.finalize_continuation_count < PROXY_FINALIZE_CONTINUATION_MAX
|
|
5947
|
+
and anthropic_resp.get("stop_reason") == "end_turn"
|
|
5948
|
+
):
|
|
5949
|
+
anthropic_resp = _inject_synthetic_continuation(anthropic_resp, monitor, body)
|
|
5628
5950
|
monitor.record_response(anthropic_resp.get("usage", {}).get("output_tokens", 0))
|
|
5629
5951
|
# Update last_input_tokens from upstream's actual prompt_tokens
|
|
5630
5952
|
upstream_input = anthropic_resp.get("usage", {}).get("input_tokens", 0)
|
|
@@ -5962,8 +6284,38 @@ async def messages(request: Request):
|
|
|
5962
6284
|
monitor.invalid_tool_call_streak = 0
|
|
5963
6285
|
monitor.required_tool_miss_streak = 0
|
|
5964
6286
|
|
|
5965
|
-
openai_resp = _detect_and_truncate_degenerate_repetition(openai_resp)
|
|
6287
|
+
openai_resp, was_degenerate = _detect_and_truncate_degenerate_repetition(openai_resp)
|
|
6288
|
+
# Degenerate retry for non-guarded stream path
|
|
6289
|
+
if was_degenerate and openai_body.get("tools"):
|
|
6290
|
+
logger.warning("DEGENERATE RETRY (stream): retrying with tool_choice=required max_tokens=2048")
|
|
6291
|
+
retry_body = dict(openai_body)
|
|
6292
|
+
retry_body["tool_choice"] = "required"
|
|
6293
|
+
retry_body["max_tokens"] = 2048
|
|
6294
|
+
retry_body["temperature"] = 0.1
|
|
6295
|
+
retry_body["stream"] = False
|
|
6296
|
+
try:
|
|
6297
|
+
retry_resp = await _post_with_generation_timeout(
|
|
6298
|
+
client, f"{LLAMA_CPP_BASE}/chat/completions", retry_body,
|
|
6299
|
+
{"Content-Type": "application/json"},
|
|
6300
|
+
)
|
|
6301
|
+
if retry_resp.status_code == 200:
|
|
6302
|
+
retry_data = retry_resp.json()
|
|
6303
|
+
if (retry_data.get("choices", [{}])[0]
|
|
6304
|
+
.get("message", {}).get("tool_calls")):
|
|
6305
|
+
logger.info("DEGENERATE RETRY (stream): success, got tool call")
|
|
6306
|
+
openai_resp = retry_data
|
|
6307
|
+
else:
|
|
6308
|
+
logger.info("DEGENERATE RETRY (stream): no tool call, using truncated")
|
|
6309
|
+
except Exception as exc:
|
|
6310
|
+
logger.warning("DEGENERATE RETRY (stream): failed: %s", exc)
|
|
5966
6311
|
anthropic_resp = openai_to_anthropic_response(openai_resp, model)
|
|
6312
|
+
# FINALIZE CONTINUATION: inject synthetic tool_use (non-guarded stream path)
|
|
6313
|
+
if (
|
|
6314
|
+
monitor.finalize_turn_active
|
|
6315
|
+
and monitor.finalize_continuation_count < PROXY_FINALIZE_CONTINUATION_MAX
|
|
6316
|
+
and anthropic_resp.get("stop_reason") == "end_turn"
|
|
6317
|
+
):
|
|
6318
|
+
anthropic_resp = _inject_synthetic_continuation(anthropic_resp, monitor, body)
|
|
5967
6319
|
|
|
5968
6320
|
# Track output tokens in session monitor
|
|
5969
6321
|
output_tokens = anthropic_resp.get("usage", {}).get("output_tokens", 0)
|
|
@@ -1892,12 +1892,13 @@ class TestToolTurnControls(unittest.TestCase):
|
|
|
1892
1892
|
monitor = proxy.SessionMonitor(context_window=262144)
|
|
1893
1893
|
monitor.tool_turn_phase = "act"
|
|
1894
1894
|
monitor.tool_state_forced_budget_remaining = 20
|
|
1895
|
+
# Use hash-format fingerprints to match _tool_call_fingerprint output
|
|
1895
1896
|
monitor.tool_call_history = [
|
|
1896
|
-
"Bash",
|
|
1897
|
+
"Bash:1e7b8d07",
|
|
1897
1898
|
"TaskOutput",
|
|
1898
|
-
"Bash",
|
|
1899
|
+
"Bash:1e7b8d07",
|
|
1899
1900
|
"TaskOutput",
|
|
1900
|
-
"Bash",
|
|
1901
|
+
"Bash:1e7b8d07",
|
|
1901
1902
|
"TaskOutput",
|
|
1902
1903
|
]
|
|
1903
1904
|
monitor.last_tool_fingerprint = "TaskOutput"
|
|
@@ -2076,7 +2077,9 @@ class TestToolTurnControls(unittest.TestCase):
|
|
|
2076
2077
|
# Review phase now keeps required to prevent end-turn escape
|
|
2077
2078
|
self.assertEqual(openai.get("tool_choice"), "required")
|
|
2078
2079
|
self.assertEqual(monitor.tool_turn_phase, "review")
|
|
2079
|
-
|
|
2080
|
+
# review_cycles only increments when cycle_looping or stagnating,
|
|
2081
|
+
# not on mere budget exhaustion (model was working, not cycling)
|
|
2082
|
+
self.assertEqual(monitor.tool_state_review_cycles, 0)
|
|
2080
2083
|
finally:
|
|
2081
2084
|
setattr(proxy, "PROXY_TOOL_STATE_MACHINE", old_state)
|
|
2082
2085
|
setattr(proxy, "PROXY_TOOL_STATE_MIN_MESSAGES", old_min_msgs)
|
|
@@ -2242,7 +2245,11 @@ class TestToolTurnControls(unittest.TestCase):
|
|
|
2242
2245
|
monitor = proxy.SessionMonitor(context_window=262144)
|
|
2243
2246
|
monitor.tool_turn_phase = "act"
|
|
2244
2247
|
monitor.tool_state_stagnation_streak = 4
|
|
2245
|
-
|
|
2248
|
+
# Use hash-format fingerprints to match _tool_call_fingerprint output
|
|
2249
|
+
monitor.tool_call_history = [
|
|
2250
|
+
"Bash:1e7b8d07", "TaskOutput", "Bash:1e7b8d07", "TaskOutput",
|
|
2251
|
+
"Bash:1e7b8d07", "TaskOutput",
|
|
2252
|
+
]
|
|
2246
2253
|
monitor.last_tool_fingerprint = "TaskOutput"
|
|
2247
2254
|
|
|
2248
2255
|
body = {
|
|
@@ -3262,8 +3269,11 @@ class TestCycleBreakOptions(unittest.TestCase):
|
|
|
3262
3269
|
monitor = proxy.SessionMonitor(context_window=262144)
|
|
3263
3270
|
monitor.tool_turn_phase = "act"
|
|
3264
3271
|
monitor.tool_state_forced_budget_remaining = 20
|
|
3265
|
-
|
|
3266
|
-
monitor.
|
|
3272
|
+
# Hash-format fingerprints matching Bash+{"command":"ls"}
|
|
3273
|
+
monitor.tool_call_history = [
|
|
3274
|
+
"Bash:781c24ad", "Bash:781c24ad", "Bash:781c24ad", "Bash:781c24ad",
|
|
3275
|
+
]
|
|
3276
|
+
monitor.last_tool_fingerprint = "Bash:781c24ad"
|
|
3267
3277
|
|
|
3268
3278
|
body = {
|
|
3269
3279
|
"model": "test",
|
|
@@ -3323,8 +3333,11 @@ class TestCycleBreakOptions(unittest.TestCase):
|
|
|
3323
3333
|
monitor = proxy.SessionMonitor(context_window=262144)
|
|
3324
3334
|
monitor.tool_turn_phase = "act"
|
|
3325
3335
|
monitor.tool_state_forced_budget_remaining = 20
|
|
3326
|
-
|
|
3327
|
-
monitor.
|
|
3336
|
+
# Hash-format fingerprints matching Bash+{"command":"ls"}
|
|
3337
|
+
monitor.tool_call_history = [
|
|
3338
|
+
"Bash:781c24ad", "Bash:781c24ad", "Bash:781c24ad", "Bash:781c24ad",
|
|
3339
|
+
]
|
|
3340
|
+
monitor.last_tool_fingerprint = "Bash:781c24ad"
|
|
3328
3341
|
|
|
3329
3342
|
body = {
|
|
3330
3343
|
"model": "test",
|
|
@@ -3369,9 +3382,9 @@ class TestCycleBreakOptions(unittest.TestCase):
|
|
|
3369
3382
|
"""Option 3: default forced budget reduced from 24 to 12."""
|
|
3370
3383
|
self.assertEqual(proxy.PROXY_TOOL_STATE_FORCED_BUDGET, 12)
|
|
3371
3384
|
|
|
3372
|
-
def
|
|
3373
|
-
"""Option 4: default review cycle limit
|
|
3374
|
-
self.assertEqual(proxy.PROXY_TOOL_STATE_REVIEW_CYCLE_LIMIT,
|
|
3385
|
+
def test_review_cycle_limit_default_is_3(self):
|
|
3386
|
+
"""Option 4: default review cycle limit is 3."""
|
|
3387
|
+
self.assertEqual(proxy.PROXY_TOOL_STATE_REVIEW_CYCLE_LIMIT, 3)
|
|
3375
3388
|
|
|
3376
3389
|
def test_cycling_tool_names_cleared_on_reset(self):
|
|
3377
3390
|
"""cycling_tool_names is cleared when tool turn state resets."""
|
|
@@ -3450,8 +3463,9 @@ class TestDegenerateRepetitionDetection(unittest.TestCase):
|
|
|
3450
3463
|
openai_resp = {
|
|
3451
3464
|
"choices": [{"message": {"content": repeated}, "finish_reason": "length"}]
|
|
3452
3465
|
}
|
|
3453
|
-
result = proxy._detect_and_truncate_degenerate_repetition(openai_resp)
|
|
3466
|
+
result, truncated = proxy._detect_and_truncate_degenerate_repetition(openai_resp)
|
|
3454
3467
|
truncated_text = result["choices"][0]["message"]["content"]
|
|
3468
|
+
self.assertTrue(truncated)
|
|
3455
3469
|
self.assertLess(len(truncated_text), len(repeated))
|
|
3456
3470
|
self.assertEqual(result["choices"][0]["finish_reason"], "stop")
|
|
3457
3471
|
|
|
@@ -3461,7 +3475,8 @@ class TestDegenerateRepetitionDetection(unittest.TestCase):
|
|
|
3461
3475
|
openai_resp = {
|
|
3462
3476
|
"choices": [{"message": {"content": text}, "finish_reason": "stop"}]
|
|
3463
3477
|
}
|
|
3464
|
-
result = proxy._detect_and_truncate_degenerate_repetition(openai_resp)
|
|
3478
|
+
result, truncated = proxy._detect_and_truncate_degenerate_repetition(openai_resp)
|
|
3479
|
+
self.assertFalse(truncated)
|
|
3465
3480
|
self.assertEqual(result["choices"][0]["message"]["content"], text)
|
|
3466
3481
|
|
|
3467
3482
|
def test_preserves_short_text(self):
|
|
@@ -3470,7 +3485,8 @@ class TestDegenerateRepetitionDetection(unittest.TestCase):
|
|
|
3470
3485
|
openai_resp = {
|
|
3471
3486
|
"choices": [{"message": {"content": text}, "finish_reason": "stop"}]
|
|
3472
3487
|
}
|
|
3473
|
-
result = proxy._detect_and_truncate_degenerate_repetition(openai_resp)
|
|
3488
|
+
result, truncated = proxy._detect_and_truncate_degenerate_repetition(openai_resp)
|
|
3489
|
+
self.assertFalse(truncated)
|
|
3474
3490
|
self.assertEqual(result["choices"][0]["message"]["content"], text)
|
|
3475
3491
|
|
|
3476
3492
|
def test_max_tokens_floor_skipped_for_non_tool_requests(self):
|
|
@@ -4220,3 +4236,173 @@ class TestReviewPhaseBootstrapReset(unittest.TestCase):
|
|
|
4220
4236
|
# The bootstrap reset only triggers for review phase
|
|
4221
4237
|
self.assertNotEqual(m.tool_turn_phase, "review")
|
|
4222
4238
|
# In act phase, the normal guardrail fallback path runs instead
|
|
4239
|
+
|
|
4240
|
+
|
|
4241
|
+
class TestReadOnlyCycleClassExclusion(unittest.TestCase):
|
|
4242
|
+
"""Tests for Option 1: read-only tool class exclusion on cycle break,
|
|
4243
|
+
Option 2: reduced cycle window (3), and Option 3: duplicate target dedup."""
|
|
4244
|
+
|
|
4245
|
+
def _make_body_with_tools(self, tool_names):
|
|
4246
|
+
"""Build a minimal Anthropic body with named tools and a tool_result."""
|
|
4247
|
+
tools = [
|
|
4248
|
+
{"name": n, "description": f"{n} tool", "input_schema": {"type": "object"}}
|
|
4249
|
+
for n in tool_names
|
|
4250
|
+
]
|
|
4251
|
+
return {
|
|
4252
|
+
"model": "test",
|
|
4253
|
+
"messages": [
|
|
4254
|
+
{"role": "user", "content": "do something"},
|
|
4255
|
+
{
|
|
4256
|
+
"role": "assistant",
|
|
4257
|
+
"content": [
|
|
4258
|
+
{
|
|
4259
|
+
"type": "tool_use",
|
|
4260
|
+
"id": "toolu_1",
|
|
4261
|
+
"name": tool_names[0],
|
|
4262
|
+
"input": {"file_path": "/some/file.ts"},
|
|
4263
|
+
}
|
|
4264
|
+
],
|
|
4265
|
+
},
|
|
4266
|
+
{
|
|
4267
|
+
"role": "user",
|
|
4268
|
+
"content": [
|
|
4269
|
+
{"type": "tool_result", "tool_use_id": "toolu_1", "content": "ok"}
|
|
4270
|
+
],
|
|
4271
|
+
},
|
|
4272
|
+
],
|
|
4273
|
+
"tools": tools,
|
|
4274
|
+
}
|
|
4275
|
+
|
|
4276
|
+
def test_read_only_class_exclusion_expands(self):
|
|
4277
|
+
"""When 'read' is cycling, all read-only tools are excluded, not just 'read'."""
|
|
4278
|
+
old_vals = {
|
|
4279
|
+
"PROXY_TOOL_STATE_MACHINE": getattr(proxy, "PROXY_TOOL_STATE_MACHINE"),
|
|
4280
|
+
"PROXY_TOOL_STATE_MIN_MESSAGES": getattr(proxy, "PROXY_TOOL_STATE_MIN_MESSAGES"),
|
|
4281
|
+
"PROXY_TOOL_STATE_FORCED_BUDGET": getattr(proxy, "PROXY_TOOL_STATE_FORCED_BUDGET"),
|
|
4282
|
+
"PROXY_TOOL_STATE_CYCLE_WINDOW": getattr(proxy, "PROXY_TOOL_STATE_CYCLE_WINDOW"),
|
|
4283
|
+
"PROXY_TOOL_STATE_STAGNATION_THRESHOLD": getattr(proxy, "PROXY_TOOL_STATE_STAGNATION_THRESHOLD"),
|
|
4284
|
+
}
|
|
4285
|
+
try:
|
|
4286
|
+
setattr(proxy, "PROXY_TOOL_STATE_MACHINE", True)
|
|
4287
|
+
setattr(proxy, "PROXY_TOOL_STATE_MIN_MESSAGES", 3)
|
|
4288
|
+
setattr(proxy, "PROXY_TOOL_STATE_FORCED_BUDGET", 3)
|
|
4289
|
+
setattr(proxy, "PROXY_TOOL_STATE_CYCLE_WINDOW", 3)
|
|
4290
|
+
setattr(proxy, "PROXY_TOOL_STATE_STAGNATION_THRESHOLD", 2)
|
|
4291
|
+
|
|
4292
|
+
all_tools = ["read", "glob", "grep", "bash", "write", "edit"]
|
|
4293
|
+
body = self._make_body_with_tools(all_tools)
|
|
4294
|
+
monitor = proxy.SessionMonitor(context_window=262144)
|
|
4295
|
+
|
|
4296
|
+
# Simulate cycling on 'read' by recording 3 identical fingerprints
|
|
4297
|
+
# Hash-format matching read+{"file_path":"/some/file.ts"}
|
|
4298
|
+
fp = "read:cfb28722"
|
|
4299
|
+
monitor.record_tool_calls(["read"], fingerprint=fp)
|
|
4300
|
+
monitor.record_tool_calls(["read"], fingerprint=fp)
|
|
4301
|
+
monitor.record_tool_calls(["read"], fingerprint=fp)
|
|
4302
|
+
|
|
4303
|
+
openai_body = proxy.build_openai_request(body, monitor)
|
|
4304
|
+
|
|
4305
|
+
# After cycle break, the tools in the body should exclude ALL
|
|
4306
|
+
# read-only tools, not just 'read'
|
|
4307
|
+
remaining_names = [
|
|
4308
|
+
t.get("function", {}).get("name") for t in openai_body.get("tools", [])
|
|
4309
|
+
]
|
|
4310
|
+
self.assertNotIn("read", remaining_names)
|
|
4311
|
+
self.assertNotIn("glob", remaining_names)
|
|
4312
|
+
self.assertNotIn("grep", remaining_names)
|
|
4313
|
+
# Write/action tools should remain
|
|
4314
|
+
self.assertIn("bash", remaining_names)
|
|
4315
|
+
self.assertIn("write", remaining_names)
|
|
4316
|
+
self.assertIn("edit", remaining_names)
|
|
4317
|
+
finally:
|
|
4318
|
+
for k, v in old_vals.items():
|
|
4319
|
+
setattr(proxy, k, v)
|
|
4320
|
+
|
|
4321
|
+
def test_non_read_tool_cycling_no_class_expansion(self):
|
|
4322
|
+
"""When 'bash' is cycling, only 'bash' is excluded, not read-only tools."""
|
|
4323
|
+
old_vals = {
|
|
4324
|
+
"PROXY_TOOL_STATE_MACHINE": getattr(proxy, "PROXY_TOOL_STATE_MACHINE"),
|
|
4325
|
+
"PROXY_TOOL_STATE_MIN_MESSAGES": getattr(proxy, "PROXY_TOOL_STATE_MIN_MESSAGES"),
|
|
4326
|
+
"PROXY_TOOL_STATE_FORCED_BUDGET": getattr(proxy, "PROXY_TOOL_STATE_FORCED_BUDGET"),
|
|
4327
|
+
"PROXY_TOOL_STATE_CYCLE_WINDOW": getattr(proxy, "PROXY_TOOL_STATE_CYCLE_WINDOW"),
|
|
4328
|
+
"PROXY_TOOL_STATE_STAGNATION_THRESHOLD": getattr(proxy, "PROXY_TOOL_STATE_STAGNATION_THRESHOLD"),
|
|
4329
|
+
}
|
|
4330
|
+
try:
|
|
4331
|
+
setattr(proxy, "PROXY_TOOL_STATE_MACHINE", True)
|
|
4332
|
+
setattr(proxy, "PROXY_TOOL_STATE_MIN_MESSAGES", 3)
|
|
4333
|
+
setattr(proxy, "PROXY_TOOL_STATE_FORCED_BUDGET", 3)
|
|
4334
|
+
setattr(proxy, "PROXY_TOOL_STATE_CYCLE_WINDOW", 3)
|
|
4335
|
+
setattr(proxy, "PROXY_TOOL_STATE_STAGNATION_THRESHOLD", 2)
|
|
4336
|
+
|
|
4337
|
+
all_tools = ["read", "glob", "grep", "bash", "write", "edit"]
|
|
4338
|
+
body = self._make_body_with_tools(all_tools)
|
|
4339
|
+
# Change the assistant tool_use to bash
|
|
4340
|
+
body["messages"][1]["content"][0]["name"] = "bash"
|
|
4341
|
+
body["messages"][1]["content"][0]["input"] = {"command": "ls"}
|
|
4342
|
+
monitor = proxy.SessionMonitor(context_window=262144)
|
|
4343
|
+
|
|
4344
|
+
# Use hash-format fingerprints matching bash+{"command":"ls"}
|
|
4345
|
+
fp = "bash:781c24ad"
|
|
4346
|
+
monitor.record_tool_calls(["bash"], fingerprint=fp)
|
|
4347
|
+
monitor.record_tool_calls(["bash"], fingerprint=fp)
|
|
4348
|
+
monitor.record_tool_calls(["bash"], fingerprint=fp)
|
|
4349
|
+
|
|
4350
|
+
openai_body = proxy.build_openai_request(body, monitor)
|
|
4351
|
+
|
|
4352
|
+
remaining_names = [
|
|
4353
|
+
t.get("function", {}).get("name") for t in openai_body.get("tools", [])
|
|
4354
|
+
]
|
|
4355
|
+
self.assertNotIn("bash", remaining_names)
|
|
4356
|
+
# Read-only tools should still be available
|
|
4357
|
+
self.assertIn("read", remaining_names)
|
|
4358
|
+
self.assertIn("glob", remaining_names)
|
|
4359
|
+
self.assertIn("grep", remaining_names)
|
|
4360
|
+
finally:
|
|
4361
|
+
for k, v in old_vals.items():
|
|
4362
|
+
setattr(proxy, k, v)
|
|
4363
|
+
|
|
4364
|
+
def test_duplicate_read_target_triggers_early_cycle(self):
|
|
4365
|
+
"""Option 3: reading same file 3+ times triggers early cycle break."""
|
|
4366
|
+
monitor = proxy.SessionMonitor(context_window=262144)
|
|
4367
|
+
|
|
4368
|
+
# Record 3 reads of same target
|
|
4369
|
+
monitor.record_tool_calls(["read"], tool_targets={"read": "/path/to/file.ts"})
|
|
4370
|
+
monitor.record_tool_calls(["read"], tool_targets={"read": "/path/to/file.ts"})
|
|
4371
|
+
monitor.record_tool_calls(["read"], tool_targets={"read": "/path/to/file.ts"})
|
|
4372
|
+
|
|
4373
|
+
dup, tool = monitor.has_duplicate_read_target(threshold=3)
|
|
4374
|
+
self.assertTrue(dup)
|
|
4375
|
+
self.assertEqual(tool, "read")
|
|
4376
|
+
|
|
4377
|
+
def test_different_read_targets_no_duplicate(self):
|
|
4378
|
+
"""Option 3: reading different files does NOT trigger duplicate detection."""
|
|
4379
|
+
monitor = proxy.SessionMonitor(context_window=262144)
|
|
4380
|
+
|
|
4381
|
+
monitor.record_tool_calls(["read"], tool_targets={"read": "/path/a.ts"})
|
|
4382
|
+
monitor.record_tool_calls(["read"], tool_targets={"read": "/path/b.ts"})
|
|
4383
|
+
monitor.record_tool_calls(["read"], tool_targets={"read": "/path/c.ts"})
|
|
4384
|
+
|
|
4385
|
+
dup, _ = monitor.has_duplicate_read_target(threshold=3)
|
|
4386
|
+
self.assertFalse(dup)
|
|
4387
|
+
|
|
4388
|
+
def test_cycle_window_default_is_3(self):
|
|
4389
|
+
"""Option 2: verify default cycle window is now 3."""
|
|
4390
|
+
# This tests the constant directly
|
|
4391
|
+
self.assertEqual(
|
|
4392
|
+
int(proxy.os.environ.get("PROXY_TOOL_STATE_CYCLE_WINDOW", "3")), 3
|
|
4393
|
+
)
|
|
4394
|
+
|
|
4395
|
+
def test_target_history_reset_on_state_reset(self):
|
|
4396
|
+
"""Target history is cleared when tool state resets."""
|
|
4397
|
+
monitor = proxy.SessionMonitor(context_window=262144)
|
|
4398
|
+
monitor.record_tool_calls(["read"], tool_targets={"read": "/file.ts"})
|
|
4399
|
+
monitor.record_tool_calls(["read"], tool_targets={"read": "/file.ts"})
|
|
4400
|
+
monitor.record_tool_calls(["read"], tool_targets={"read": "/file.ts"})
|
|
4401
|
+
|
|
4402
|
+
dup, _ = monitor.has_duplicate_read_target(threshold=3)
|
|
4403
|
+
self.assertTrue(dup)
|
|
4404
|
+
|
|
4405
|
+
monitor.reset_tool_turn_state(reason="test_reset")
|
|
4406
|
+
|
|
4407
|
+
dup, _ = monitor.has_duplicate_read_target(threshold=3)
|
|
4408
|
+
self.assertFalse(dup)
|