@miller-tech/uap 1.20.25 → 1.20.27
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json
CHANGED
|
@@ -2281,11 +2281,13 @@ def _resolve_state_machine_tool_choice(
|
|
|
2281
2281
|
1, PROXY_TOOL_STATE_FORCED_BUDGET // 2
|
|
2282
2282
|
)
|
|
2283
2283
|
# If stagnation cleared during review, the model tried a
|
|
2284
|
-
# different approach — reward by reducing cycle pressure
|
|
2284
|
+
# different approach — reward by reducing cycle pressure and
|
|
2285
|
+
# lifting persistent tool exclusion.
|
|
2285
2286
|
if monitor.tool_state_stagnation_streak == 0 and monitor.tool_state_review_cycles > 0:
|
|
2286
2287
|
monitor.tool_state_review_cycles = max(0, monitor.tool_state_review_cycles - 1)
|
|
2288
|
+
monitor.cycling_tool_names = []
|
|
2287
2289
|
logger.info(
|
|
2288
|
-
"TOOL STATE MACHINE: review_cycles decremented to %d (stagnation cleared)",
|
|
2290
|
+
"TOOL STATE MACHINE: review_cycles decremented to %d, cycling exclusion lifted (stagnation cleared)",
|
|
2289
2291
|
monitor.tool_state_review_cycles,
|
|
2290
2292
|
)
|
|
2291
2293
|
return "required", "review_complete"
|
|
@@ -2463,14 +2465,22 @@ def build_openai_request(
|
|
|
2463
2465
|
openai_body["stop"] = anthropic_body["stop_sequences"]
|
|
2464
2466
|
|
|
2465
2467
|
# Force controlled temperature for tool-call turns to reduce garbled output
|
|
2468
|
+
# Cycle 15 Option 2: use lower temperature after contamination resets
|
|
2466
2469
|
if has_tools:
|
|
2467
2470
|
client_temp = openai_body.get("temperature")
|
|
2468
|
-
|
|
2469
|
-
|
|
2471
|
+
target_temp = PROXY_TOOL_TURN_TEMPERATURE
|
|
2472
|
+
if monitor.contamination_resets > 0:
|
|
2473
|
+
target_temp = min(target_temp, 0.1)
|
|
2474
|
+
if client_temp is None or client_temp > target_temp:
|
|
2475
|
+
openai_body["temperature"] = target_temp
|
|
2476
|
+
extra = ""
|
|
2477
|
+
if monitor.contamination_resets > 0:
|
|
2478
|
+
extra = f" (post-contamination reset, resets={monitor.contamination_resets})"
|
|
2470
2479
|
logger.info(
|
|
2471
|
-
"TOOL TURN TEMP: forcing temperature=%.2f (was %s) for tool-enabled request",
|
|
2472
|
-
|
|
2480
|
+
"TOOL TURN TEMP: forcing temperature=%.2f (was %s) for tool-enabled request%s",
|
|
2481
|
+
target_temp,
|
|
2473
2482
|
client_temp,
|
|
2483
|
+
extra,
|
|
2474
2484
|
)
|
|
2475
2485
|
|
|
2476
2486
|
# Convert Anthropic tools to OpenAI function-calling tools
|
|
@@ -2589,31 +2599,41 @@ def build_openai_request(
|
|
|
2589
2599
|
monitor.no_progress_streak = (
|
|
2590
2600
|
0 if last_user_has_tool_result else monitor.no_progress_streak + 1
|
|
2591
2601
|
)
|
|
2592
|
-
#
|
|
2602
|
+
# Inject cycle-break instruction when entering review
|
|
2603
|
+
# Option 3 (Cycle 14): Escalate hint text based on review cycle count
|
|
2593
2604
|
if (
|
|
2594
2605
|
monitor.tool_turn_phase == "review"
|
|
2595
2606
|
and state_reason in {"cycle_detected", "stagnation"}
|
|
2596
2607
|
and monitor.cycling_tool_names
|
|
2597
2608
|
):
|
|
2598
2609
|
cycling_names = ", ".join(monitor.cycling_tool_names)
|
|
2599
|
-
|
|
2600
|
-
|
|
2601
|
-
|
|
2602
|
-
|
|
2603
|
-
|
|
2610
|
+
cycles = monitor.tool_state_review_cycles
|
|
2611
|
+
if cycles <= 1:
|
|
2612
|
+
cycle_hint = (
|
|
2613
|
+
f"You have been repeatedly calling the same tool(s): {cycling_names}. "
|
|
2614
|
+
"This is not making progress. Use a DIFFERENT tool to advance the task, "
|
|
2615
|
+
"or call a tool that produces your final answer."
|
|
2616
|
+
)
|
|
2617
|
+
else:
|
|
2618
|
+
cycle_hint = (
|
|
2619
|
+
f"CRITICAL: You have cycled {cycling_names} for {cycles} review rounds without progress. "
|
|
2620
|
+
"State what you have accomplished so far and what the next DIFFERENT action should be. "
|
|
2621
|
+
"Do NOT call the same tool again. Choose a completely different approach or "
|
|
2622
|
+
"produce your final answer now."
|
|
2623
|
+
)
|
|
2604
2624
|
messages = openai_body.get("messages", [])
|
|
2605
2625
|
messages.append({"role": "user", "content": cycle_hint})
|
|
2606
2626
|
openai_body["messages"] = messages
|
|
2607
2627
|
logger.warning(
|
|
2608
|
-
"CYCLE BREAK: injected hint about cycling tools: %s",
|
|
2628
|
+
"CYCLE BREAK: injected hint about cycling tools: %s (escalation=%d)",
|
|
2609
2629
|
cycling_names,
|
|
2630
|
+
cycles,
|
|
2610
2631
|
)
|
|
2611
|
-
#
|
|
2612
|
-
# Option 1
|
|
2613
|
-
#
|
|
2632
|
+
# Narrow tools to exclude cycling tools
|
|
2633
|
+
# Option 1 (Cycle 13): if any cycling tool is read-only, exclude entire class
|
|
2634
|
+
# Option 1 (Cycle 14): persist exclusion during act phase too, not just review
|
|
2614
2635
|
if (
|
|
2615
|
-
monitor.
|
|
2616
|
-
and monitor.cycling_tool_names
|
|
2636
|
+
monitor.cycling_tool_names
|
|
2617
2637
|
and "tools" in openai_body
|
|
2618
2638
|
):
|
|
2619
2639
|
exclude_set = set(monitor.cycling_tool_names)
|
|
@@ -4679,7 +4699,7 @@ async def _apply_malformed_tool_guardrail(
|
|
|
4679
4699
|
|
|
4680
4700
|
attempts = max(0, PROXY_MALFORMED_TOOL_RETRY_MAX)
|
|
4681
4701
|
current_issue = issue
|
|
4682
|
-
# Track failing tool names for
|
|
4702
|
+
# Track failing tool names for tool narrowing on retry
|
|
4683
4703
|
failing_tools: set[str] = set()
|
|
4684
4704
|
if issue.kind == "invalid_tool_args":
|
|
4685
4705
|
for tc in (working_resp.get("choices", [{}])[0].get("message", {}).get("tool_calls", [])):
|
|
@@ -4687,14 +4707,22 @@ async def _apply_malformed_tool_guardrail(
|
|
|
4687
4707
|
raw_args = tc.get("function", {}).get("arguments", "")
|
|
4688
4708
|
if fn_name and raw_args and _is_garbled_tool_arguments(raw_args):
|
|
4689
4709
|
failing_tools.add(fn_name)
|
|
4710
|
+
# Cycle 15 Option 1: For malformed_payload retries, exclude complex
|
|
4711
|
+
# multi-field tools (task, Agent) that are prone to garbled generation
|
|
4712
|
+
# after the first retry fails.
|
|
4713
|
+
_COMPLEX_TOOLS_TO_EXCLUDE_ON_MALFORMED = {"task", "Agent"}
|
|
4714
|
+
malformed_exclude_active = False
|
|
4690
4715
|
for attempt in range(attempts):
|
|
4691
4716
|
attempt_tool_choice = _retry_tool_choice_for_attempt(
|
|
4692
4717
|
required_tool_choice,
|
|
4693
4718
|
attempt,
|
|
4694
4719
|
attempts,
|
|
4695
4720
|
)
|
|
4696
|
-
#
|
|
4697
|
-
|
|
4721
|
+
# On attempt >= 1, exclude consistently failing tools OR complex tools for malformed
|
|
4722
|
+
exclude_set = set(failing_tools) if failing_tools else set()
|
|
4723
|
+
if malformed_exclude_active:
|
|
4724
|
+
exclude_set |= _COMPLEX_TOOLS_TO_EXCLUDE_ON_MALFORMED
|
|
4725
|
+
exclude = list(exclude_set) if (attempt >= 1 and exclude_set) else None
|
|
4698
4726
|
retry_body = _build_malformed_retry_body(
|
|
4699
4727
|
openai_body,
|
|
4700
4728
|
anthropic_body,
|
|
@@ -4773,6 +4801,8 @@ async def _apply_malformed_tool_guardrail(
|
|
|
4773
4801
|
|
|
4774
4802
|
if retry_issue.kind == "malformed_payload":
|
|
4775
4803
|
monitor.malformed_tool_streak += 1
|
|
4804
|
+
# Cycle 15 Option 1: activate complex tool exclusion for next retry
|
|
4805
|
+
malformed_exclude_active = True
|
|
4776
4806
|
elif retry_issue.kind == "invalid_tool_args":
|
|
4777
4807
|
monitor.invalid_tool_call_streak += 1
|
|
4778
4808
|
monitor.arg_preflight_rejections += 1
|
|
@@ -4886,6 +4916,35 @@ def _maybe_apply_session_contamination_breaker(
|
|
|
4886
4916
|
if not should_reset:
|
|
4887
4917
|
return anthropic_body
|
|
4888
4918
|
|
|
4919
|
+
# Cycle 15 Option 3: if contamination has already reset N+ times in this
|
|
4920
|
+
# session, the model is fundamentally unable to produce valid tool calls.
|
|
4921
|
+
# Force finalize so the Droid framework can intervene.
|
|
4922
|
+
max_contamination_resets = 3
|
|
4923
|
+
if monitor.contamination_resets >= max_contamination_resets:
|
|
4924
|
+
logger.error(
|
|
4925
|
+
"SESSION CONTAMINATION LOOP: session=%s contamination_resets=%d >= %d, forcing finalize",
|
|
4926
|
+
session_id,
|
|
4927
|
+
monitor.contamination_resets,
|
|
4928
|
+
max_contamination_resets,
|
|
4929
|
+
)
|
|
4930
|
+
monitor.set_tool_turn_phase("finalize", reason="contamination_loop")
|
|
4931
|
+
monitor.contamination_resets += 1
|
|
4932
|
+
monitor.malformed_tool_streak = 0
|
|
4933
|
+
monitor.invalid_tool_call_streak = 0
|
|
4934
|
+
# Remove tools to force text-only response
|
|
4935
|
+
updated = dict(anthropic_body)
|
|
4936
|
+
updated.pop("tools", None)
|
|
4937
|
+
updated.pop("tool_choice", None)
|
|
4938
|
+
msgs = updated.get("messages", [])
|
|
4939
|
+
msgs.append({
|
|
4940
|
+
"role": "user",
|
|
4941
|
+
"content": (
|
|
4942
|
+
"Tool-call generation has failed repeatedly. Respond with plain text only. "
|
|
4943
|
+
"Summarize what you have accomplished and what remains to be done."
|
|
4944
|
+
),
|
|
4945
|
+
})
|
|
4946
|
+
return updated
|
|
4947
|
+
|
|
4889
4948
|
messages = anthropic_body.get("messages", [])
|
|
4890
4949
|
keep_last = max(2, PROXY_SESSION_CONTAMINATION_KEEP_LAST)
|
|
4891
4950
|
if len(messages) <= keep_last + 1:
|
|
@@ -4406,3 +4406,259 @@ class TestReadOnlyCycleClassExclusion(unittest.TestCase):
|
|
|
4406
4406
|
|
|
4407
4407
|
dup, _ = monitor.has_duplicate_read_target(threshold=3)
|
|
4408
4408
|
self.assertFalse(dup)
|
|
4409
|
+
|
|
4410
|
+
|
|
4411
|
+
class TestPersistentCycleExclusion(unittest.TestCase):
|
|
4412
|
+
"""Tests for Cycle 14: persistent exclusion, escalating hints, and
|
|
4413
|
+
exclusion across review→act transitions."""
|
|
4414
|
+
|
|
4415
|
+
def _make_body_with_tools(self, tool_names, active_tool="bash", active_input=None):
|
|
4416
|
+
tools = [
|
|
4417
|
+
{"name": n, "description": f"{n} tool", "input_schema": {"type": "object"}}
|
|
4418
|
+
for n in tool_names
|
|
4419
|
+
]
|
|
4420
|
+
inp = active_input or {"command": "ls"}
|
|
4421
|
+
return {
|
|
4422
|
+
"model": "test",
|
|
4423
|
+
"messages": [
|
|
4424
|
+
{"role": "user", "content": "do something"},
|
|
4425
|
+
{
|
|
4426
|
+
"role": "assistant",
|
|
4427
|
+
"content": [
|
|
4428
|
+
{"type": "tool_use", "id": "t1", "name": active_tool, "input": inp}
|
|
4429
|
+
],
|
|
4430
|
+
},
|
|
4431
|
+
{
|
|
4432
|
+
"role": "user",
|
|
4433
|
+
"content": [
|
|
4434
|
+
{"type": "tool_result", "tool_use_id": "t1", "content": "ok"}
|
|
4435
|
+
],
|
|
4436
|
+
},
|
|
4437
|
+
],
|
|
4438
|
+
"tools": tools,
|
|
4439
|
+
}
|
|
4440
|
+
|
|
4441
|
+
def test_exclusion_persists_through_act_phase(self):
|
|
4442
|
+
"""Option 1: cycling_tool_names exclusion persists in act phase after review."""
|
|
4443
|
+
old_vals = {}
|
|
4444
|
+
for k in ["PROXY_TOOL_STATE_MACHINE", "PROXY_TOOL_STATE_MIN_MESSAGES",
|
|
4445
|
+
"PROXY_TOOL_STATE_FORCED_BUDGET", "PROXY_TOOL_STATE_CYCLE_WINDOW",
|
|
4446
|
+
"PROXY_TOOL_STATE_STAGNATION_THRESHOLD"]:
|
|
4447
|
+
old_vals[k] = getattr(proxy, k)
|
|
4448
|
+
try:
|
|
4449
|
+
setattr(proxy, "PROXY_TOOL_STATE_MACHINE", True)
|
|
4450
|
+
setattr(proxy, "PROXY_TOOL_STATE_MIN_MESSAGES", 3)
|
|
4451
|
+
setattr(proxy, "PROXY_TOOL_STATE_FORCED_BUDGET", 6)
|
|
4452
|
+
setattr(proxy, "PROXY_TOOL_STATE_CYCLE_WINDOW", 3)
|
|
4453
|
+
setattr(proxy, "PROXY_TOOL_STATE_STAGNATION_THRESHOLD", 2)
|
|
4454
|
+
|
|
4455
|
+
all_tools = ["bash", "read", "write", "edit"]
|
|
4456
|
+
body = self._make_body_with_tools(all_tools)
|
|
4457
|
+
monitor = proxy.SessionMonitor(context_window=262144)
|
|
4458
|
+
|
|
4459
|
+
# Simulate bash cycling that triggers review
|
|
4460
|
+
monitor.cycling_tool_names = ["bash"]
|
|
4461
|
+
monitor.tool_turn_phase = "act"
|
|
4462
|
+
monitor.tool_state_forced_budget_remaining = 5
|
|
4463
|
+
|
|
4464
|
+
openai = proxy.build_openai_request(body, monitor)
|
|
4465
|
+
|
|
4466
|
+
# In act phase with cycling_tool_names set, bash should be excluded
|
|
4467
|
+
remaining = [t["function"]["name"] for t in openai.get("tools", [])]
|
|
4468
|
+
self.assertNotIn("bash", remaining)
|
|
4469
|
+
self.assertIn("read", remaining)
|
|
4470
|
+
self.assertIn("write", remaining)
|
|
4471
|
+
finally:
|
|
4472
|
+
for k, v in old_vals.items():
|
|
4473
|
+
setattr(proxy, k, v)
|
|
4474
|
+
|
|
4475
|
+
def test_exclusion_cleared_on_stagnation_clear(self):
|
|
4476
|
+
"""Option 1: cycling exclusion is lifted when stagnation clears in review."""
|
|
4477
|
+
monitor = proxy.SessionMonitor(context_window=262144)
|
|
4478
|
+
monitor.tool_turn_phase = "review"
|
|
4479
|
+
monitor.tool_state_review_cycles = 1
|
|
4480
|
+
monitor.tool_state_stagnation_streak = 0 # stagnation cleared
|
|
4481
|
+
monitor.cycling_tool_names = ["bash"]
|
|
4482
|
+
monitor.tool_state_auto_budget_remaining = 0
|
|
4483
|
+
monitor.tool_state_forced_budget_remaining = 6
|
|
4484
|
+
|
|
4485
|
+
# This should transition review→act and clear cycling names
|
|
4486
|
+
old_vals = {}
|
|
4487
|
+
for k in ["PROXY_TOOL_STATE_MACHINE", "PROXY_TOOL_STATE_MIN_MESSAGES",
|
|
4488
|
+
"PROXY_TOOL_STATE_FORCED_BUDGET"]:
|
|
4489
|
+
old_vals[k] = getattr(proxy, k)
|
|
4490
|
+
try:
|
|
4491
|
+
setattr(proxy, "PROXY_TOOL_STATE_MACHINE", True)
|
|
4492
|
+
setattr(proxy, "PROXY_TOOL_STATE_MIN_MESSAGES", 3)
|
|
4493
|
+
setattr(proxy, "PROXY_TOOL_STATE_FORCED_BUDGET", 6)
|
|
4494
|
+
|
|
4495
|
+
body = self._make_body_with_tools(["bash", "read", "write"])
|
|
4496
|
+
proxy.build_openai_request(body, monitor)
|
|
4497
|
+
|
|
4498
|
+
self.assertEqual(monitor.tool_turn_phase, "act")
|
|
4499
|
+
self.assertEqual(monitor.cycling_tool_names, [])
|
|
4500
|
+
finally:
|
|
4501
|
+
for k, v in old_vals.items():
|
|
4502
|
+
setattr(proxy, k, v)
|
|
4503
|
+
|
|
4504
|
+
def test_escalated_hint_on_cycle_2(self):
|
|
4505
|
+
"""Option 3: cycle 2+ gets escalated CRITICAL hint text."""
|
|
4506
|
+
old_vals = {}
|
|
4507
|
+
for k in ["PROXY_TOOL_STATE_MACHINE", "PROXY_TOOL_STATE_MIN_MESSAGES",
|
|
4508
|
+
"PROXY_TOOL_STATE_FORCED_BUDGET", "PROXY_TOOL_STATE_CYCLE_WINDOW",
|
|
4509
|
+
"PROXY_TOOL_STATE_STAGNATION_THRESHOLD"]:
|
|
4510
|
+
old_vals[k] = getattr(proxy, k)
|
|
4511
|
+
try:
|
|
4512
|
+
setattr(proxy, "PROXY_TOOL_STATE_MACHINE", True)
|
|
4513
|
+
setattr(proxy, "PROXY_TOOL_STATE_MIN_MESSAGES", 3)
|
|
4514
|
+
setattr(proxy, "PROXY_TOOL_STATE_FORCED_BUDGET", 20)
|
|
4515
|
+
setattr(proxy, "PROXY_TOOL_STATE_CYCLE_WINDOW", 3)
|
|
4516
|
+
setattr(proxy, "PROXY_TOOL_STATE_STAGNATION_THRESHOLD", 2)
|
|
4517
|
+
|
|
4518
|
+
all_tools = ["bash", "read", "write"]
|
|
4519
|
+
body = self._make_body_with_tools(all_tools)
|
|
4520
|
+
monitor = proxy.SessionMonitor(context_window=262144)
|
|
4521
|
+
# Pre-set as if we've already been through 1 review cycle
|
|
4522
|
+
monitor.tool_turn_phase = "act"
|
|
4523
|
+
monitor.tool_state_review_cycles = 1
|
|
4524
|
+
monitor.tool_state_forced_budget_remaining = 20
|
|
4525
|
+
monitor.tool_state_stagnation_streak = 3
|
|
4526
|
+
fp = "bash:781c24ad"
|
|
4527
|
+
monitor.tool_call_history = [fp, fp, fp]
|
|
4528
|
+
monitor.last_tool_fingerprint = fp
|
|
4529
|
+
|
|
4530
|
+
openai = proxy.build_openai_request(body, monitor)
|
|
4531
|
+
|
|
4532
|
+
# Should now be in review with cycles=2 and escalated hint
|
|
4533
|
+
self.assertEqual(monitor.tool_turn_phase, "review")
|
|
4534
|
+
self.assertEqual(monitor.tool_state_review_cycles, 2)
|
|
4535
|
+
messages = openai.get("messages", [])
|
|
4536
|
+
last_user = [m for m in messages if m.get("role") == "user"][-1]
|
|
4537
|
+
self.assertIn("CRITICAL", last_user["content"])
|
|
4538
|
+
self.assertIn("2 review rounds", last_user["content"])
|
|
4539
|
+
finally:
|
|
4540
|
+
for k, v in old_vals.items():
|
|
4541
|
+
setattr(proxy, k, v)
|
|
4542
|
+
|
|
4543
|
+
def test_mild_hint_on_cycle_1(self):
|
|
4544
|
+
"""Option 3: cycle 1 gets mild hint, not escalated."""
|
|
4545
|
+
old_vals = {}
|
|
4546
|
+
for k in ["PROXY_TOOL_STATE_MACHINE", "PROXY_TOOL_STATE_MIN_MESSAGES",
|
|
4547
|
+
"PROXY_TOOL_STATE_FORCED_BUDGET", "PROXY_TOOL_STATE_CYCLE_WINDOW",
|
|
4548
|
+
"PROXY_TOOL_STATE_STAGNATION_THRESHOLD"]:
|
|
4549
|
+
old_vals[k] = getattr(proxy, k)
|
|
4550
|
+
try:
|
|
4551
|
+
setattr(proxy, "PROXY_TOOL_STATE_MACHINE", True)
|
|
4552
|
+
setattr(proxy, "PROXY_TOOL_STATE_MIN_MESSAGES", 3)
|
|
4553
|
+
setattr(proxy, "PROXY_TOOL_STATE_FORCED_BUDGET", 20)
|
|
4554
|
+
setattr(proxy, "PROXY_TOOL_STATE_CYCLE_WINDOW", 3)
|
|
4555
|
+
setattr(proxy, "PROXY_TOOL_STATE_STAGNATION_THRESHOLD", 2)
|
|
4556
|
+
|
|
4557
|
+
body = self._make_body_with_tools(["bash", "read", "write"])
|
|
4558
|
+
monitor = proxy.SessionMonitor(context_window=262144)
|
|
4559
|
+
monitor.tool_turn_phase = "act"
|
|
4560
|
+
monitor.tool_state_review_cycles = 0
|
|
4561
|
+
monitor.tool_state_forced_budget_remaining = 20
|
|
4562
|
+
monitor.tool_state_stagnation_streak = 3
|
|
4563
|
+
fp = "bash:781c24ad"
|
|
4564
|
+
monitor.tool_call_history = [fp, fp, fp]
|
|
4565
|
+
monitor.last_tool_fingerprint = fp
|
|
4566
|
+
|
|
4567
|
+
openai = proxy.build_openai_request(body, monitor)
|
|
4568
|
+
|
|
4569
|
+
self.assertEqual(monitor.tool_turn_phase, "review")
|
|
4570
|
+
self.assertEqual(monitor.tool_state_review_cycles, 1)
|
|
4571
|
+
messages = openai.get("messages", [])
|
|
4572
|
+
last_user = [m for m in messages if m.get("role") == "user"][-1]
|
|
4573
|
+
self.assertNotIn("CRITICAL", last_user["content"])
|
|
4574
|
+
self.assertIn("DIFFERENT tool", last_user["content"])
|
|
4575
|
+
finally:
|
|
4576
|
+
for k, v in old_vals.items():
|
|
4577
|
+
setattr(proxy, k, v)
|
|
4578
|
+
|
|
4579
|
+
|
|
4580
|
+
class TestMalformedPayloadLoopFix(unittest.TestCase):
|
|
4581
|
+
"""Tests for Cycle 15: malformed payload loop breaking."""
|
|
4582
|
+
|
|
4583
|
+
def test_contamination_loop_forces_finalize(self):
|
|
4584
|
+
"""Option 3: after 3+ contamination resets, force finalize."""
|
|
4585
|
+
monitor = proxy.SessionMonitor(context_window=262144)
|
|
4586
|
+
monitor.contamination_resets = 3 # already hit 3 resets
|
|
4587
|
+
monitor.malformed_tool_streak = 3 # triggers should_reset
|
|
4588
|
+
|
|
4589
|
+
body = {
|
|
4590
|
+
"model": "test",
|
|
4591
|
+
"messages": [
|
|
4592
|
+
{"role": "user", "content": "do something"},
|
|
4593
|
+
{"role": "assistant", "content": "ok"},
|
|
4594
|
+
{"role": "user", "content": "continue"},
|
|
4595
|
+
],
|
|
4596
|
+
"tools": [
|
|
4597
|
+
{"name": "bash", "description": "Run", "input_schema": {"type": "object"}},
|
|
4598
|
+
],
|
|
4599
|
+
}
|
|
4600
|
+
result = proxy._maybe_apply_session_contamination_breaker(
|
|
4601
|
+
body, monitor, "test-session"
|
|
4602
|
+
)
|
|
4603
|
+
# Should have removed tools and forced finalize
|
|
4604
|
+
self.assertNotIn("tools", result)
|
|
4605
|
+
self.assertNotIn("tool_choice", result)
|
|
4606
|
+
self.assertEqual(monitor.tool_turn_phase, "finalize")
|
|
4607
|
+
# Check finalize instruction was injected
|
|
4608
|
+
last_msg = result["messages"][-1]
|
|
4609
|
+
self.assertIn("plain text only", last_msg["content"])
|
|
4610
|
+
|
|
4611
|
+
def test_contamination_below_threshold_resets_normally(self):
|
|
4612
|
+
"""Below 3 contamination resets, normal reset behavior."""
|
|
4613
|
+
monitor = proxy.SessionMonitor(context_window=262144)
|
|
4614
|
+
monitor.contamination_resets = 1
|
|
4615
|
+
monitor.malformed_tool_streak = 3
|
|
4616
|
+
|
|
4617
|
+
# Need enough messages (> keep_last + 1) for full reset path
|
|
4618
|
+
msgs = [{"role": "user", "content": "start"}]
|
|
4619
|
+
for i in range(20):
|
|
4620
|
+
msgs.append({"role": "assistant", "content": f"resp {i}"})
|
|
4621
|
+
msgs.append({"role": "user", "content": f"msg {i}"})
|
|
4622
|
+
body = {
|
|
4623
|
+
"model": "test",
|
|
4624
|
+
"messages": msgs,
|
|
4625
|
+
"tools": [
|
|
4626
|
+
{"name": "bash", "description": "Run", "input_schema": {"type": "object"}},
|
|
4627
|
+
],
|
|
4628
|
+
}
|
|
4629
|
+
result = proxy._maybe_apply_session_contamination_breaker(
|
|
4630
|
+
body, monitor, "test-session"
|
|
4631
|
+
)
|
|
4632
|
+
# Should have done normal reset (increment contamination_resets)
|
|
4633
|
+
self.assertEqual(monitor.contamination_resets, 2)
|
|
4634
|
+
self.assertEqual(monitor.tool_turn_phase, "bootstrap")
|
|
4635
|
+
|
|
4636
|
+
def test_post_contamination_temp_lowered(self):
|
|
4637
|
+
"""Option 2: temperature lowered to 0.1 after contamination reset."""
|
|
4638
|
+
monitor = proxy.SessionMonitor(context_window=262144)
|
|
4639
|
+
monitor.contamination_resets = 1 # has had a reset
|
|
4640
|
+
|
|
4641
|
+
body = {
|
|
4642
|
+
"model": "test",
|
|
4643
|
+
"messages": [{"role": "user", "content": "test"}],
|
|
4644
|
+
"tools": [
|
|
4645
|
+
{"name": "bash", "description": "Run", "input_schema": {"type": "object"}},
|
|
4646
|
+
],
|
|
4647
|
+
}
|
|
4648
|
+
openai = proxy.build_openai_request(body, monitor)
|
|
4649
|
+
self.assertLessEqual(openai.get("temperature", 1.0), 0.1)
|
|
4650
|
+
|
|
4651
|
+
def test_normal_temp_without_contamination(self):
|
|
4652
|
+
"""Without contamination resets, normal tool temp (0.3) is used."""
|
|
4653
|
+
monitor = proxy.SessionMonitor(context_window=262144)
|
|
4654
|
+
monitor.contamination_resets = 0
|
|
4655
|
+
|
|
4656
|
+
body = {
|
|
4657
|
+
"model": "test",
|
|
4658
|
+
"messages": [{"role": "user", "content": "test"}],
|
|
4659
|
+
"tools": [
|
|
4660
|
+
{"name": "bash", "description": "Run", "input_schema": {"type": "object"}},
|
|
4661
|
+
],
|
|
4662
|
+
}
|
|
4663
|
+
openai = proxy.build_openai_request(body, monitor)
|
|
4664
|
+
self.assertAlmostEqual(openai.get("temperature", 1.0), 0.3, places=1)
|