@miller-tech/uap 1.20.26 → 1.20.29
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json
CHANGED
|
@@ -656,6 +656,8 @@ class SessionMonitor:
|
|
|
656
656
|
tool_state_review_cycles: int = 0
|
|
657
657
|
last_tool_fingerprint: str = ""
|
|
658
658
|
cycling_tool_names: list = field(default_factory=list)
|
|
659
|
+
session_banned_tools: set = field(default_factory=set) # tools banned for entire session after repeated cycling
|
|
660
|
+
tool_cycle_counts: dict = field(default_factory=dict) # {tool_name: cycle_count} across resets
|
|
659
661
|
last_response_garbled: bool = False # previous turn had garbled/malformed output
|
|
660
662
|
finalize_turn_active: bool = False
|
|
661
663
|
finalize_continuation_count: int = 0
|
|
@@ -2240,6 +2242,16 @@ def _resolve_state_machine_tool_choice(
|
|
|
2240
2242
|
for part in fp.split("|"):
|
|
2241
2243
|
raw_names.append(part.split(":")[0])
|
|
2242
2244
|
monitor.cycling_tool_names = list(dict.fromkeys(raw_names))
|
|
2245
|
+
# Cycle 18 Option 2: track per-tool cycle counts and ban after 3 cycles
|
|
2246
|
+
for name in monitor.cycling_tool_names:
|
|
2247
|
+
monitor.tool_cycle_counts[name] = monitor.tool_cycle_counts.get(name, 0) + 1
|
|
2248
|
+
if monitor.tool_cycle_counts[name] >= 3 and name not in monitor.session_banned_tools:
|
|
2249
|
+
monitor.session_banned_tools.add(name)
|
|
2250
|
+
logger.warning(
|
|
2251
|
+
"TOOL BAN: '%s' banned for session after %d cycle detections",
|
|
2252
|
+
name,
|
|
2253
|
+
monitor.tool_cycle_counts[name],
|
|
2254
|
+
)
|
|
2243
2255
|
logger.warning(
|
|
2244
2256
|
"TOOL STATE MACHINE: entering review (cycle=%s repeat=%d stagnation=%d cycles=%d cycling_tools=%s)",
|
|
2245
2257
|
cycle_looping,
|
|
@@ -2465,14 +2477,22 @@ def build_openai_request(
|
|
|
2465
2477
|
openai_body["stop"] = anthropic_body["stop_sequences"]
|
|
2466
2478
|
|
|
2467
2479
|
# Force controlled temperature for tool-call turns to reduce garbled output
|
|
2480
|
+
# Cycle 15 Option 2: use lower temperature after contamination resets
|
|
2468
2481
|
if has_tools:
|
|
2469
2482
|
client_temp = openai_body.get("temperature")
|
|
2470
|
-
|
|
2471
|
-
|
|
2483
|
+
target_temp = PROXY_TOOL_TURN_TEMPERATURE
|
|
2484
|
+
if monitor.contamination_resets > 0:
|
|
2485
|
+
target_temp = min(target_temp, 0.1)
|
|
2486
|
+
if client_temp is None or client_temp > target_temp:
|
|
2487
|
+
openai_body["temperature"] = target_temp
|
|
2488
|
+
extra = ""
|
|
2489
|
+
if monitor.contamination_resets > 0:
|
|
2490
|
+
extra = f" (post-contamination reset, resets={monitor.contamination_resets})"
|
|
2472
2491
|
logger.info(
|
|
2473
|
-
"TOOL TURN TEMP: forcing temperature=%.2f (was %s) for tool-enabled request",
|
|
2474
|
-
|
|
2492
|
+
"TOOL TURN TEMP: forcing temperature=%.2f (was %s) for tool-enabled request%s",
|
|
2493
|
+
target_temp,
|
|
2475
2494
|
client_temp,
|
|
2495
|
+
extra,
|
|
2476
2496
|
)
|
|
2477
2497
|
|
|
2478
2498
|
# Convert Anthropic tools to OpenAI function-calling tools
|
|
@@ -2621,14 +2641,15 @@ def build_openai_request(
|
|
|
2621
2641
|
cycling_names,
|
|
2622
2642
|
cycles,
|
|
2623
2643
|
)
|
|
2624
|
-
# Narrow tools to exclude cycling tools
|
|
2644
|
+
# Narrow tools to exclude cycling tools + session-banned tools
|
|
2625
2645
|
# Option 1 (Cycle 13): if any cycling tool is read-only, exclude entire class
|
|
2626
2646
|
# Option 1 (Cycle 14): persist exclusion during act phase too, not just review
|
|
2647
|
+
# Option 2 (Cycle 18): always exclude session-banned tools
|
|
2627
2648
|
if (
|
|
2628
|
-
monitor.cycling_tool_names
|
|
2649
|
+
(monitor.cycling_tool_names or monitor.session_banned_tools)
|
|
2629
2650
|
and "tools" in openai_body
|
|
2630
2651
|
):
|
|
2631
|
-
exclude_set = set(monitor.cycling_tool_names)
|
|
2652
|
+
exclude_set = set(monitor.cycling_tool_names) | monitor.session_banned_tools
|
|
2632
2653
|
# Expand to full read-only class if any cycling tool is read-only
|
|
2633
2654
|
if any(n.lower() in {c.lower() for c in _READ_ONLY_TOOL_CLASS} for n in exclude_set):
|
|
2634
2655
|
exclude_set |= _READ_ONLY_TOOL_CLASS
|
|
@@ -2640,13 +2661,15 @@ def build_openai_request(
|
|
|
2640
2661
|
]
|
|
2641
2662
|
if narrowed:
|
|
2642
2663
|
openai_body["tools"] = narrowed
|
|
2643
|
-
|
|
2644
|
-
|
|
2645
|
-
|
|
2646
|
-
|
|
2647
|
-
|
|
2648
|
-
|
|
2649
|
-
|
|
2664
|
+
# Only log on first activation or phase transitions to reduce noise
|
|
2665
|
+
if state_reason in {"cycle_detected", "stagnation"}:
|
|
2666
|
+
logger.warning(
|
|
2667
|
+
"CYCLE BREAK: narrowed tools from %d to %d (excluded %s, read_only_class=%s)",
|
|
2668
|
+
original_count,
|
|
2669
|
+
len(narrowed),
|
|
2670
|
+
monitor.cycling_tool_names,
|
|
2671
|
+
any(n.lower() in {c.lower() for c in _READ_ONLY_TOOL_CLASS} for n in monitor.cycling_tool_names),
|
|
2672
|
+
)
|
|
2650
2673
|
else:
|
|
2651
2674
|
logger.warning(
|
|
2652
2675
|
"CYCLE BREAK: cannot narrow tools — all tools are cycling, keeping original set",
|
|
@@ -3084,6 +3107,47 @@ _TOOL_CALL_XML_RE = re.compile(
|
|
|
3084
3107
|
)
|
|
3085
3108
|
|
|
3086
3109
|
|
|
3110
|
+
def _repair_tool_call_json(raw: str) -> str | None:
|
|
3111
|
+
"""Attempt to repair common garbled JSON in tool call payloads.
|
|
3112
|
+
|
|
3113
|
+
Returns repaired JSON string, or None if repair is not possible.
|
|
3114
|
+
Handles: trailing braces, unbalanced brackets, truncated strings.
|
|
3115
|
+
"""
|
|
3116
|
+
s = raw.strip()
|
|
3117
|
+
if not s.startswith("{"):
|
|
3118
|
+
return None
|
|
3119
|
+
# Strip trailing garbage (runaway braces/brackets)
|
|
3120
|
+
while s.endswith("}}") and s.count("{") < s.count("}"):
|
|
3121
|
+
s = s[:-1]
|
|
3122
|
+
while s.endswith("]]") and s.count("[") < s.count("]"):
|
|
3123
|
+
s = s[:-1]
|
|
3124
|
+
# Balance braces
|
|
3125
|
+
open_b = s.count("{") - s.count("}")
|
|
3126
|
+
if open_b > 0:
|
|
3127
|
+
s += "}" * open_b
|
|
3128
|
+
elif open_b < 0:
|
|
3129
|
+
# Too many closing braces — trim from end
|
|
3130
|
+
for _ in range(-open_b):
|
|
3131
|
+
idx = s.rfind("}")
|
|
3132
|
+
if idx > 0:
|
|
3133
|
+
s = s[:idx] + s[idx + 1:]
|
|
3134
|
+
# Try to parse
|
|
3135
|
+
try:
|
|
3136
|
+
json.loads(s)
|
|
3137
|
+
return s
|
|
3138
|
+
except json.JSONDecodeError:
|
|
3139
|
+
pass
|
|
3140
|
+
# Try truncating at last valid comma + closing
|
|
3141
|
+
for end in range(len(s) - 1, max(0, len(s) - 200), -1):
|
|
3142
|
+
candidate = s[:end].rstrip().rstrip(",") + "}" * max(0, s[:end].count("{") - s[:end].count("}"))
|
|
3143
|
+
try:
|
|
3144
|
+
json.loads(candidate)
|
|
3145
|
+
return candidate
|
|
3146
|
+
except json.JSONDecodeError:
|
|
3147
|
+
continue
|
|
3148
|
+
return None
|
|
3149
|
+
|
|
3150
|
+
|
|
3087
3151
|
def _extract_tool_calls_from_text(text: str) -> tuple[list[dict], str]:
|
|
3088
3152
|
"""Parse ``<tool_call>{...}</tool_call>`` blocks out of *text*.
|
|
3089
3153
|
|
|
@@ -3104,7 +3168,18 @@ def _extract_tool_calls_from_text(text: str) -> tuple[list[dict], str]:
|
|
|
3104
3168
|
try:
|
|
3105
3169
|
payload = json.loads(raw_json)
|
|
3106
3170
|
except json.JSONDecodeError:
|
|
3107
|
-
|
|
3171
|
+
# Cycle 15 Option 1: attempt JSON repair before giving up
|
|
3172
|
+
repaired = _repair_tool_call_json(raw_json)
|
|
3173
|
+
if repaired:
|
|
3174
|
+
try:
|
|
3175
|
+
payload = json.loads(repaired)
|
|
3176
|
+
logger.info(
|
|
3177
|
+
"TOOL CALL EXTRACTION: repaired garbled JSON in <tool_call> block"
|
|
3178
|
+
)
|
|
3179
|
+
except json.JSONDecodeError:
|
|
3180
|
+
continue
|
|
3181
|
+
else:
|
|
3182
|
+
continue
|
|
3108
3183
|
if not isinstance(payload, dict):
|
|
3109
3184
|
continue
|
|
3110
3185
|
|
|
@@ -4372,9 +4447,11 @@ def _build_malformed_retry_body(
|
|
|
4372
4447
|
retry_body = dict(openai_body)
|
|
4373
4448
|
retry_body["stream"] = False
|
|
4374
4449
|
retry_body["tool_choice"] = tool_choice
|
|
4375
|
-
#
|
|
4450
|
+
# Cycle 15 Option 3: vary temperature across retries to break degenerate patterns.
|
|
4451
|
+
# Attempt 1: use configured retry temp (default 0.0) for deterministic first try.
|
|
4452
|
+
# Attempt 2+: increase to 0.5 to escape the degenerate local minimum.
|
|
4376
4453
|
if total_attempts > 1 and attempt > 1:
|
|
4377
|
-
retry_body["temperature"] = 0.
|
|
4454
|
+
retry_body["temperature"] = 0.5
|
|
4378
4455
|
else:
|
|
4379
4456
|
retry_body["temperature"] = PROXY_MALFORMED_TOOL_RETRY_TEMPERATURE
|
|
4380
4457
|
|
|
@@ -4691,7 +4768,7 @@ async def _apply_malformed_tool_guardrail(
|
|
|
4691
4768
|
|
|
4692
4769
|
attempts = max(0, PROXY_MALFORMED_TOOL_RETRY_MAX)
|
|
4693
4770
|
current_issue = issue
|
|
4694
|
-
# Track failing tool names for
|
|
4771
|
+
# Track failing tool names for tool narrowing on retry
|
|
4695
4772
|
failing_tools: set[str] = set()
|
|
4696
4773
|
if issue.kind == "invalid_tool_args":
|
|
4697
4774
|
for tc in (working_resp.get("choices", [{}])[0].get("message", {}).get("tool_calls", [])):
|
|
@@ -4699,14 +4776,22 @@ async def _apply_malformed_tool_guardrail(
|
|
|
4699
4776
|
raw_args = tc.get("function", {}).get("arguments", "")
|
|
4700
4777
|
if fn_name and raw_args and _is_garbled_tool_arguments(raw_args):
|
|
4701
4778
|
failing_tools.add(fn_name)
|
|
4779
|
+
# Cycle 15 Option 1: For malformed_payload retries, exclude complex
|
|
4780
|
+
# multi-field tools (task, Agent) that are prone to garbled generation
|
|
4781
|
+
# after the first retry fails.
|
|
4782
|
+
_COMPLEX_TOOLS_TO_EXCLUDE_ON_MALFORMED = {"task", "Agent"}
|
|
4783
|
+
malformed_exclude_active = False
|
|
4702
4784
|
for attempt in range(attempts):
|
|
4703
4785
|
attempt_tool_choice = _retry_tool_choice_for_attempt(
|
|
4704
4786
|
required_tool_choice,
|
|
4705
4787
|
attempt,
|
|
4706
4788
|
attempts,
|
|
4707
4789
|
)
|
|
4708
|
-
#
|
|
4709
|
-
|
|
4790
|
+
# On attempt >= 1, exclude consistently failing tools OR complex tools for malformed
|
|
4791
|
+
exclude_set = set(failing_tools) if failing_tools else set()
|
|
4792
|
+
if malformed_exclude_active:
|
|
4793
|
+
exclude_set |= _COMPLEX_TOOLS_TO_EXCLUDE_ON_MALFORMED
|
|
4794
|
+
exclude = list(exclude_set) if (attempt >= 1 and exclude_set) else None
|
|
4710
4795
|
retry_body = _build_malformed_retry_body(
|
|
4711
4796
|
openai_body,
|
|
4712
4797
|
anthropic_body,
|
|
@@ -4785,6 +4870,8 @@ async def _apply_malformed_tool_guardrail(
|
|
|
4785
4870
|
|
|
4786
4871
|
if retry_issue.kind == "malformed_payload":
|
|
4787
4872
|
monitor.malformed_tool_streak += 1
|
|
4873
|
+
# Cycle 15 Option 1: activate complex tool exclusion for next retry
|
|
4874
|
+
malformed_exclude_active = True
|
|
4788
4875
|
elif retry_issue.kind == "invalid_tool_args":
|
|
4789
4876
|
monitor.invalid_tool_call_streak += 1
|
|
4790
4877
|
monitor.arg_preflight_rejections += 1
|
|
@@ -4898,6 +4985,35 @@ def _maybe_apply_session_contamination_breaker(
|
|
|
4898
4985
|
if not should_reset:
|
|
4899
4986
|
return anthropic_body
|
|
4900
4987
|
|
|
4988
|
+
# Cycle 15 Option 3: if contamination has already reset N+ times in this
|
|
4989
|
+
# session, the model is fundamentally unable to produce valid tool calls.
|
|
4990
|
+
# Force finalize so the Droid framework can intervene.
|
|
4991
|
+
max_contamination_resets = 3
|
|
4992
|
+
if monitor.contamination_resets >= max_contamination_resets:
|
|
4993
|
+
logger.error(
|
|
4994
|
+
"SESSION CONTAMINATION LOOP: session=%s contamination_resets=%d >= %d, forcing finalize",
|
|
4995
|
+
session_id,
|
|
4996
|
+
monitor.contamination_resets,
|
|
4997
|
+
max_contamination_resets,
|
|
4998
|
+
)
|
|
4999
|
+
monitor.set_tool_turn_phase("finalize", reason="contamination_loop")
|
|
5000
|
+
monitor.contamination_resets += 1
|
|
5001
|
+
monitor.malformed_tool_streak = 0
|
|
5002
|
+
monitor.invalid_tool_call_streak = 0
|
|
5003
|
+
# Remove tools to force text-only response
|
|
5004
|
+
updated = dict(anthropic_body)
|
|
5005
|
+
updated.pop("tools", None)
|
|
5006
|
+
updated.pop("tool_choice", None)
|
|
5007
|
+
msgs = updated.get("messages", [])
|
|
5008
|
+
msgs.append({
|
|
5009
|
+
"role": "user",
|
|
5010
|
+
"content": (
|
|
5011
|
+
"Tool-call generation has failed repeatedly. Respond with plain text only. "
|
|
5012
|
+
"Summarize what you have accomplished and what remains to be done."
|
|
5013
|
+
),
|
|
5014
|
+
})
|
|
5015
|
+
return updated
|
|
5016
|
+
|
|
4901
5017
|
messages = anthropic_body.get("messages", [])
|
|
4902
5018
|
keep_last = max(2, PROXY_SESSION_CONTAMINATION_KEEP_LAST)
|
|
4903
5019
|
if len(messages) <= keep_last + 1:
|
|
@@ -4575,3 +4575,224 @@ class TestPersistentCycleExclusion(unittest.TestCase):
|
|
|
4575
4575
|
finally:
|
|
4576
4576
|
for k, v in old_vals.items():
|
|
4577
4577
|
setattr(proxy, k, v)
|
|
4578
|
+
|
|
4579
|
+
|
|
4580
|
+
class TestMalformedPayloadLoopFix(unittest.TestCase):
    """Cycle 15 regression tests: breaking out of malformed-payload loops."""

    @staticmethod
    def _bash_tools():
        """Return a fresh one-entry Anthropic tool list containing ``bash``."""
        return [
            {"name": "bash", "description": "Run", "input_schema": {"type": "object"}},
        ]

    @staticmethod
    def _monitor_with(**attrs):
        """Build a SessionMonitor and assign *attrs* onto it in order."""
        mon = proxy.SessionMonitor(context_window=262144)
        for attr_name, attr_value in attrs.items():
            setattr(mon, attr_name, attr_value)
        return mon

    def test_contamination_loop_forces_finalize(self):
        """Option 3: with 3 resets already recorded, the breaker finalizes."""
        # contamination_resets=3 is the threshold; the streak triggers should_reset
        mon = self._monitor_with(contamination_resets=3, malformed_tool_streak=3)
        request = {
            "model": "test",
            "messages": [
                {"role": "user", "content": "do something"},
                {"role": "assistant", "content": "ok"},
                {"role": "user", "content": "continue"},
            ],
            "tools": self._bash_tools(),
        }

        outcome = proxy._maybe_apply_session_contamination_breaker(
            request, mon, "test-session"
        )

        # Tools are stripped and the phase is forced to finalize
        for removed_key in ("tools", "tool_choice"):
            self.assertNotIn(removed_key, outcome)
        self.assertEqual(mon.tool_turn_phase, "finalize")
        # The injected trailing message carries the finalize instruction
        self.assertIn("plain text only", outcome["messages"][-1]["content"])

    def test_contamination_below_threshold_resets_normally(self):
        """Fewer than 3 prior resets: the ordinary reset path is taken."""
        mon = self._monitor_with(contamination_resets=1, malformed_tool_streak=3)

        # The full reset path needs more than keep_last + 1 messages
        history = [{"role": "user", "content": "start"}]
        for turn in range(20):
            history.extend(
                (
                    {"role": "assistant", "content": f"resp {turn}"},
                    {"role": "user", "content": f"msg {turn}"},
                )
            )
        request = {"model": "test", "messages": history, "tools": self._bash_tools()}

        proxy._maybe_apply_session_contamination_breaker(request, mon, "test-session")

        # A normal reset bumps the counter and returns to bootstrap
        self.assertEqual(mon.contamination_resets, 2)
        self.assertEqual(mon.tool_turn_phase, "bootstrap")

    def test_post_contamination_temp_lowered(self):
        """Option 2: after a contamination reset the tool temperature is <= 0.1."""
        mon = self._monitor_with(contamination_resets=1)
        request = {
            "model": "test",
            "messages": [{"role": "user", "content": "test"}],
            "tools": self._bash_tools(),
        }

        converted = proxy.build_openai_request(request, mon)

        self.assertLessEqual(converted.get("temperature", 1.0), 0.1)

    def test_normal_temp_without_contamination(self):
        """With zero contamination resets the normal tool temp (0.3) applies."""
        mon = self._monitor_with(contamination_resets=0)
        request = {
            "model": "test",
            "messages": [{"role": "user", "content": "test"}],
            "tools": self._bash_tools(),
        }

        converted = proxy.build_openai_request(request, mon)

        self.assertAlmostEqual(converted.get("temperature", 1.0), 0.3, places=1)
|
|
4665
|
+
|
|
4666
|
+
|
|
4667
|
+
class TestToolCallJsonRepair(unittest.TestCase):
    """Cycle 15 Option 1: JSON repair applied during tool call extraction."""

    def _repaired_payload(self, broken):
        """Repair *broken*, assert a result came back, and return it parsed."""
        fixed = proxy._repair_tool_call_json(broken)
        self.assertIsNotNone(fixed)
        return json.loads(fixed)

    def test_repairs_trailing_braces(self):
        """Surplus closing braces at the end are trimmed before parsing."""
        payload = self._repaired_payload(
            '{"name":"bash","arguments":{"command":"ls"}}}}'
        )
        self.assertEqual(payload["name"], "bash")

    def test_repairs_unbalanced_open_braces(self):
        """A missing closing brace is appended."""
        payload = self._repaired_payload(
            '{"name":"read","arguments":{"file_path":"/foo"}'
        )
        self.assertEqual(payload["name"], "read")

    def test_returns_none_for_total_garbage(self):
        """Input that is not JSON at all yields None."""
        self.assertIsNone(proxy._repair_tool_call_json("not json at all"))

    def test_extracts_repaired_tool_call_from_text(self):
        """End-to-end: a garbled <tool_call> block is extracted after repair."""
        wrapped = (
            '<tool_call>\n{"name":"bash","arguments":{"command":"pwd"}}}\n</tool_call>'
        )
        calls, _leftover = proxy._extract_tool_calls_from_text(wrapped)
        self.assertEqual(len(calls), 1)
        self.assertEqual(calls[0]["function"]["name"], "bash")
|
|
4697
|
+
|
|
4698
|
+
|
|
4699
|
+
class TestRetryTemperatureVariance(unittest.TestCase):
    """Cycle 15 Option 3: temperature varies across malformed-tool retries."""

    @staticmethod
    def _retry_body_for(attempt_number):
        """Build a retry body for *attempt_number* out of three attempts."""

        def minimal_request():
            # A fresh dict per argument so the two bodies never share state
            return {"messages": [{"role": "user", "content": "test"}], "tools": []}

        return proxy._build_malformed_retry_body(
            minimal_request(),
            minimal_request(),
            retry_hint="fix it",
            tool_choice="required",
            attempt=attempt_number,
            total_attempts=3,
            is_garbled=False,
        )

    def test_retry_attempt_1_uses_configured_temp(self):
        """Attempt 1 uses PROXY_MALFORMED_TOOL_RETRY_TEMPERATURE."""
        retry = self._retry_body_for(1)
        self.assertEqual(
            retry["temperature"], proxy.PROXY_MALFORMED_TOOL_RETRY_TEMPERATURE
        )

    def test_retry_attempt_2_uses_higher_temp(self):
        """Attempt 2 switches to temp=0.5 to break degenerate patterns."""
        retry = self._retry_body_for(2)
        self.assertEqual(retry["temperature"], 0.5)
|
|
4727
|
+
|
|
4728
|
+
|
|
4729
|
+
class TestCycle18SessionBanAndLogNoise(unittest.TestCase):
    """Cycle 18: session-wide tool bans and reduced log noise."""

    def test_tool_banned_after_3_cycle_detections(self):
        """Option 2: a tool's third cycle detection puts it on the ban list."""
        mon = proxy.SessionMonitor(context_window=262144)
        # Two detections already counted; the pass below is the third
        mon.tool_cycle_counts["task"] = 2
        mon.cycling_tool_names = ["task"]

        # Replays the bookkeeping that cycle detection performs
        for tool in mon.cycling_tool_names:
            seen = mon.tool_cycle_counts.get(tool, 0) + 1
            mon.tool_cycle_counts[tool] = seen
            if mon.tool_cycle_counts[tool] >= 3:
                mon.session_banned_tools.add(tool)

        self.assertIn("task", mon.session_banned_tools)
        self.assertEqual(mon.tool_cycle_counts["task"], 3)

    def test_session_ban_survives_state_reset(self):
        """Option 2: reset_tool_turn_state leaves session bans untouched."""
        mon = proxy.SessionMonitor(context_window=262144)
        mon.session_banned_tools.add("task")
        mon.tool_cycle_counts["task"] = 3

        mon.reset_tool_turn_state(reason="test")

        # Bans are session-scoped rather than phase-scoped
        self.assertIn("task", mon.session_banned_tools)
        self.assertEqual(mon.tool_cycle_counts["task"], 3)

    def test_banned_tools_excluded_even_without_cycling(self):
        """Option 2: banned tools are filtered out with no active cycling."""
        overridden = (
            "PROXY_TOOL_STATE_MACHINE",
            "PROXY_TOOL_STATE_MIN_MESSAGES",
            "PROXY_TOOL_STATE_FORCED_BUDGET",
        )
        saved = {setting: getattr(proxy, setting) for setting in overridden}
        try:
            proxy.PROXY_TOOL_STATE_MACHINE = True
            proxy.PROXY_TOOL_STATE_MIN_MESSAGES = 3
            proxy.PROXY_TOOL_STATE_FORCED_BUDGET = 6

            tool_use_turn = {
                "role": "assistant",
                "content": [
                    {"type": "tool_use", "id": "t1", "name": "bash", "input": {"command": "ls"}}
                ],
            }
            tool_result_turn = {
                "role": "user",
                "content": [
                    {"type": "tool_result", "tool_use_id": "t1", "content": "ok"}
                ],
            }
            request = {
                "model": "test",
                "messages": [
                    {"role": "user", "content": "do"},
                    tool_use_turn,
                    tool_result_turn,
                ],
                "tools": [
                    {"name": "task", "description": "Task", "input_schema": {"type": "object"}},
                    {"name": "bash", "description": "Bash", "input_schema": {"type": "object"}},
                    {"name": "read", "description": "Read", "input_schema": {"type": "object"}},
                ],
            }
            mon = proxy.SessionMonitor(context_window=262144)
            mon.session_banned_tools.add("task")
            mon.cycling_tool_names = []  # nothing is cycling right now

            converted = proxy.build_openai_request(request, mon)

            offered = [entry["function"]["name"] for entry in converted.get("tools", [])]
            self.assertNotIn("task", offered)
            self.assertIn("bash", offered)
        finally:
            # Put the module-level settings back for later tests
            for setting, previous in saved.items():
                setattr(proxy, setting, previous)
|