@miller-tech/uap 1.20.51 → 1.26.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +65 -21
- package/dist/.tsbuildinfo +1 -1
- package/dist/bin/cli.js +88 -5
- package/dist/bin/cli.js.map +1 -1
- package/dist/cli/agent.js +1 -1
- package/dist/cli/agent.js.map +1 -1
- package/dist/cli/droids.d.ts +21 -1
- package/dist/cli/droids.d.ts.map +1 -1
- package/dist/cli/droids.js +142 -0
- package/dist/cli/droids.js.map +1 -1
- package/dist/cli/expert-route.d.ts +11 -0
- package/dist/cli/expert-route.d.ts.map +1 -0
- package/dist/cli/expert-route.js +67 -0
- package/dist/cli/expert-route.js.map +1 -0
- package/dist/cli/harness.d.ts +24 -0
- package/dist/cli/harness.d.ts.map +1 -0
- package/dist/cli/harness.js +84 -0
- package/dist/cli/harness.js.map +1 -0
- package/dist/cli/hooks.d.ts +13 -2
- package/dist/cli/hooks.d.ts.map +1 -1
- package/dist/cli/hooks.js +333 -3
- package/dist/cli/hooks.js.map +1 -1
- package/dist/cli/ideate.d.ts +18 -0
- package/dist/cli/ideate.d.ts.map +1 -0
- package/dist/cli/ideate.js +148 -0
- package/dist/cli/ideate.js.map +1 -0
- package/dist/cli/patterns.js +55 -0
- package/dist/cli/patterns.js.map +1 -1
- package/dist/cli/setup.d.ts.map +1 -1
- package/dist/cli/setup.js +14 -1
- package/dist/cli/setup.js.map +1 -1
- package/dist/coordination/capability-router.d.ts +1 -1
- package/dist/coordination/capability-router.d.ts.map +1 -1
- package/dist/coordination/capability-router.js +132 -0
- package/dist/coordination/capability-router.js.map +1 -1
- package/dist/coordination/expert-orchestrator.d.ts +66 -0
- package/dist/coordination/expert-orchestrator.d.ts.map +1 -0
- package/dist/coordination/expert-orchestrator.js +150 -0
- package/dist/coordination/expert-orchestrator.js.map +1 -0
- package/dist/coordination/service.d.ts +8 -1
- package/dist/coordination/service.d.ts.map +1 -1
- package/dist/coordination/service.js +18 -4
- package/dist/coordination/service.js.map +1 -1
- package/dist/mcp-router/experts/registry.d.ts +54 -0
- package/dist/mcp-router/experts/registry.d.ts.map +1 -0
- package/dist/mcp-router/experts/registry.js +143 -0
- package/dist/mcp-router/experts/registry.js.map +1 -0
- package/dist/mcp-router/index.d.ts +2 -0
- package/dist/mcp-router/index.d.ts.map +1 -1
- package/dist/mcp-router/index.js +1 -0
- package/dist/mcp-router/index.js.map +1 -1
- package/dist/mcp-router/server.d.ts.map +1 -1
- package/dist/mcp-router/server.js +16 -0
- package/dist/mcp-router/server.js.map +1 -1
- package/dist/mcp-router/tools/execute.d.ts.map +1 -1
- package/dist/mcp-router/tools/execute.js +40 -0
- package/dist/mcp-router/tools/execute.js.map +1 -1
- package/dist/models/planner.d.ts +7 -1
- package/dist/models/planner.d.ts.map +1 -1
- package/dist/models/planner.js +61 -0
- package/dist/models/planner.js.map +1 -1
- package/dist/models/types.d.ts +2 -0
- package/dist/models/types.d.ts.map +1 -1
- package/dist/models/types.js.map +1 -1
- package/dist/observability/halo-exporter.d.ts +86 -0
- package/dist/observability/halo-exporter.d.ts.map +1 -0
- package/dist/observability/halo-exporter.js +139 -0
- package/dist/observability/halo-exporter.js.map +1 -0
- package/dist/tasks/database.d.ts.map +1 -1
- package/dist/tasks/database.js +9 -2
- package/dist/tasks/database.js.map +1 -1
- package/dist/telemetry/session-telemetry.d.ts.map +1 -1
- package/dist/telemetry/session-telemetry.js +7 -0
- package/dist/telemetry/session-telemetry.js.map +1 -1
- package/docs/architecture/EXPERT_STACK.md +137 -0
- package/docs/architecture/PLATFORM_GATING.md +68 -0
- package/docs/reference/EXPERT_DROIDS.md +219 -0
- package/package.json +1 -1
- package/templates/hooks/pre-tool-use-edit-write.sh +29 -8
- package/templates/hooks/uap-policy-gate-hermes.sh +42 -0
- package/tools/agents/scripts/anthropic_proxy.py +166 -30
- package/tools/agents/tests/test_attractor_detection.py +213 -0
|
@@ -390,6 +390,20 @@ PROXY_SESSION_CONTAMINATION_FORCED_THRESHOLD = int(
|
|
|
390
390
|
PROXY_SESSION_CONTAMINATION_REQUIRED_MISS_THRESHOLD = int(
|
|
391
391
|
os.environ.get("PROXY_SESSION_CONTAMINATION_REQUIRED_MISS_THRESHOLD", "2")
|
|
392
392
|
)
|
|
393
|
+
# Attractor-aware contamination escape. When the same fault excerpt repeats
|
|
394
|
+
# across consecutive contamination resets the model is in a stable output
|
|
395
|
+
# attractor that the standard kept_last reset cannot escape (the preserved
|
|
396
|
+
# tail re-primes the same fixed-point response). Detect via excerpt hash and
|
|
397
|
+
# respond with a harder reset + corrective injection + temperature bump.
|
|
398
|
+
PROXY_ATTRACTOR_DETECT = os.environ.get(
|
|
399
|
+
"PROXY_ATTRACTOR_DETECT", "on"
|
|
400
|
+
).lower() not in {"0", "false", "off", "no"}
|
|
401
|
+
PROXY_ATTRACTOR_TEMP_OVERRIDE = float(
|
|
402
|
+
os.environ.get("PROXY_ATTRACTOR_TEMP_OVERRIDE", "1.20")
|
|
403
|
+
)
|
|
404
|
+
PROXY_ATTRACTOR_FINALIZE_THRESHOLD = max(1, int(
|
|
405
|
+
os.environ.get("PROXY_ATTRACTOR_FINALIZE_THRESHOLD", "2")
|
|
406
|
+
))
|
|
393
407
|
PROXY_AGENTIC_SUPPLEMENT_MODE = (
|
|
394
408
|
os.environ.get("PROXY_AGENTIC_SUPPLEMENT_MODE", "clean").strip().lower()
|
|
395
409
|
)
|
|
@@ -750,6 +764,8 @@ class SessionMonitor:
|
|
|
750
764
|
invalid_tool_call_streak: int = 0 # consecutive invalid tool arg payloads
|
|
751
765
|
required_tool_miss_streak: int = 0 # required tool turns with no tool call
|
|
752
766
|
contamination_resets: int = 0 # how many contamination resets were applied
|
|
767
|
+
last_fault_excerpt_hash: str = "" # hash of last TOOL RESPONSE ISSUE excerpt (attractor detection)
|
|
768
|
+
attractor_correction_active: bool = False # next turn uses high-temp escape sampling
|
|
753
769
|
forced_auto_cooldown_turns: int = 0 # temporary auto override turns remaining
|
|
754
770
|
forced_dampener_triggers: int = 0 # number of dampener activations
|
|
755
771
|
arg_preflight_rejections: int = 0 # rejected tool calls from arg preflight
|
|
@@ -3586,23 +3602,37 @@ def build_openai_request(
|
|
|
3586
3602
|
openai_body["stop"] = anthropic_body["stop_sequences"]
|
|
3587
3603
|
|
|
3588
3604
|
# Force controlled temperature for tool-call turns to reduce garbled output
|
|
3589
|
-
# Cycle 15 Option 2: use lower temperature after contamination resets
|
|
3605
|
+
# Cycle 15 Option 2: use lower temperature after contamination resets.
|
|
3606
|
+
# Attractor escape: when an attractor correction is active, OVERRIDE the
|
|
3607
|
+
# low-temp default with a HIGH-temp sample so the deterministic output
|
|
3608
|
+
# trajectory has a chance to break. The override stays in effect while the
|
|
3609
|
+
# flag is set (cleared on successful tool_use in the response handler).
|
|
3590
3610
|
if has_tools:
|
|
3591
3611
|
client_temp = openai_body.get("temperature")
|
|
3592
3612
|
target_temp = PROXY_TOOL_TURN_TEMPERATURE
|
|
3593
|
-
|
|
3594
|
-
|
|
3595
|
-
|
|
3613
|
+
attractor_active = getattr(monitor, "attractor_correction_active", False)
|
|
3614
|
+
if attractor_active:
|
|
3615
|
+
target_temp = max(target_temp, PROXY_ATTRACTOR_TEMP_OVERRIDE)
|
|
3596
3616
|
openai_body["temperature"] = target_temp
|
|
3597
|
-
extra = ""
|
|
3598
|
-
if monitor.contamination_resets > 0:
|
|
3599
|
-
extra = f" (post-contamination reset, resets={monitor.contamination_resets})"
|
|
3600
3617
|
logger.info(
|
|
3601
|
-
"TOOL TURN TEMP:
|
|
3618
|
+
"TOOL TURN TEMP: ATTRACTOR ESCAPE temperature=%.2f (was %s)",
|
|
3602
3619
|
target_temp,
|
|
3603
3620
|
client_temp,
|
|
3604
|
-
extra,
|
|
3605
3621
|
)
|
|
3622
|
+
else:
|
|
3623
|
+
if monitor.contamination_resets > 0:
|
|
3624
|
+
target_temp = min(target_temp, 0.1)
|
|
3625
|
+
if client_temp is None or client_temp > target_temp:
|
|
3626
|
+
openai_body["temperature"] = target_temp
|
|
3627
|
+
extra = ""
|
|
3628
|
+
if monitor.contamination_resets > 0:
|
|
3629
|
+
extra = f" (post-contamination reset, resets={monitor.contamination_resets})"
|
|
3630
|
+
logger.info(
|
|
3631
|
+
"TOOL TURN TEMP: forcing temperature=%.2f (was %s) for tool-enabled request%s",
|
|
3632
|
+
target_temp,
|
|
3633
|
+
client_temp,
|
|
3634
|
+
extra,
|
|
3635
|
+
)
|
|
3606
3636
|
|
|
3607
3637
|
# Convert Anthropic tools to OpenAI function-calling tools
|
|
3608
3638
|
full_openai_tools: list[dict] = []
|
|
@@ -4144,6 +4174,17 @@ def _openai_message_text(openai_resp: dict) -> str:
|
|
|
4144
4174
|
return content if isinstance(content, str) else str(content)
|
|
4145
4175
|
|
|
4146
4176
|
|
|
4177
|
+
def _hash_fault_excerpt(excerpt: str) -> str:
|
|
4178
|
+
"""Stable hash of a fault excerpt for attractor-repeat detection. Lowercased
|
|
4179
|
+
+ whitespace-collapsed so trivial rendering differences don't break the match."""
|
|
4180
|
+
if not excerpt:
|
|
4181
|
+
return ""
|
|
4182
|
+
normalized = " ".join(excerpt.lower().split())[:200]
|
|
4183
|
+
if not normalized:
|
|
4184
|
+
return ""
|
|
4185
|
+
return hashlib.sha256(normalized.encode("utf-8")).hexdigest()[:16]
|
|
4186
|
+
|
|
4187
|
+
|
|
4147
4188
|
def _extract_openai_tool_calls(openai_resp: dict) -> list[dict]:
|
|
4148
4189
|
_, message = _extract_openai_choice(openai_resp)
|
|
4149
4190
|
tool_calls = message.get("tool_calls") or []
|
|
@@ -6348,6 +6389,13 @@ async def _apply_malformed_tool_guardrail(
|
|
|
6348
6389
|
monitor.invalid_tool_call_streak = 0
|
|
6349
6390
|
monitor.required_tool_miss_streak = 0
|
|
6350
6391
|
monitor.last_response_garbled = False
|
|
6392
|
+
if monitor.attractor_correction_active:
|
|
6393
|
+
logger.info(
|
|
6394
|
+
"ATTRACTOR ESCAPE succeeded: session=%s — tool_use emitted, clearing attractor flag",
|
|
6395
|
+
session_id,
|
|
6396
|
+
)
|
|
6397
|
+
monitor.attractor_correction_active = False
|
|
6398
|
+
monitor.last_fault_excerpt_hash = ""
|
|
6351
6399
|
if repair_count > 0:
|
|
6352
6400
|
monitor.arg_preflight_repairs += repair_count
|
|
6353
6401
|
logger.info(
|
|
@@ -6385,6 +6433,11 @@ async def _apply_malformed_tool_guardrail(
|
|
|
6385
6433
|
if raw_args and _is_garbled_tool_arguments(raw_args):
|
|
6386
6434
|
arg_excerpt = raw_args[:200].replace("\n", " ")
|
|
6387
6435
|
break
|
|
6436
|
+
# Attractor detection — hash the normalized fault excerpt so the
|
|
6437
|
+
# contamination breaker can recognize the same fixed-point response
|
|
6438
|
+
# reappearing across consecutive resets. Whitespace-normalized so trivial
|
|
6439
|
+
# rendering differences don't break the match.
|
|
6440
|
+
monitor.last_fault_excerpt_hash = _hash_fault_excerpt(excerpt)
|
|
6388
6441
|
logger.warning(
|
|
6389
6442
|
"TOOL RESPONSE ISSUE: session=%s kind=%s reason=%s malformed=%d invalid=%d required_miss=%d excerpt=%.220s args=%.200s",
|
|
6390
6443
|
session_id,
|
|
@@ -6627,7 +6680,16 @@ def _maybe_apply_session_contamination_breaker(
|
|
|
6627
6680
|
# Cycle 15 Option 3: if contamination has already reset N+ times in this
|
|
6628
6681
|
# session, the model is fundamentally unable to produce valid tool calls.
|
|
6629
6682
|
# Force finalize so the Droid framework can intervene.
|
|
6630
|
-
|
|
6683
|
+
#
|
|
6684
|
+
# Lower the threshold when an attractor correction has already been
|
|
6685
|
+
# applied — if the corrective injection + temp bump didn't break the
|
|
6686
|
+
# attractor on the next turn, more resets won't help. Cuts wasted retry
|
|
6687
|
+
# budget from 3 resets (~60 min observed) to 2 (~25 min).
|
|
6688
|
+
max_contamination_resets = (
|
|
6689
|
+
PROXY_ATTRACTOR_FINALIZE_THRESHOLD
|
|
6690
|
+
if monitor.attractor_correction_active
|
|
6691
|
+
else 3
|
|
6692
|
+
)
|
|
6631
6693
|
if monitor.contamination_resets >= max_contamination_resets:
|
|
6632
6694
|
logger.error(
|
|
6633
6695
|
"SESSION CONTAMINATION LOOP: session=%s contamination_resets=%d >= %d, forcing finalize",
|
|
@@ -6654,26 +6716,89 @@ def _maybe_apply_session_contamination_breaker(
|
|
|
6654
6716
|
return updated
|
|
6655
6717
|
|
|
6656
6718
|
messages = anthropic_body.get("messages", [])
|
|
6719
|
+
|
|
6720
|
+
# Attractor detection: if the fault excerpt that triggered this reset
|
|
6721
|
+
# hashes to the same value as the *previous* reset's fault excerpt, the
|
|
6722
|
+
# model is in a stable output attractor — keep_last reset preserves the
|
|
6723
|
+
# priming tail that pulls it back in. Apply a harder reset (system +
|
|
6724
|
+
# initial user turn only) plus a corrective injection. Temperature gets
|
|
6725
|
+
# bumped UP on the next turn (see build_openai_request) instead of
|
|
6726
|
+
# the standard post-contamination drop, to break the deterministic
|
|
6727
|
+
# output trajectory.
|
|
6728
|
+
attractor_detected = bool(
|
|
6729
|
+
PROXY_ATTRACTOR_DETECT
|
|
6730
|
+
and monitor.contamination_resets >= 1
|
|
6731
|
+
and monitor.last_fault_excerpt_hash
|
|
6732
|
+
and monitor.last_fault_excerpt_hash
|
|
6733
|
+
== getattr(monitor, "_prev_reset_fault_hash", "")
|
|
6734
|
+
)
|
|
6735
|
+
monitor._prev_reset_fault_hash = monitor.last_fault_excerpt_hash
|
|
6736
|
+
|
|
6657
6737
|
keep_last = max(2, PROXY_SESSION_CONTAMINATION_KEEP_LAST)
|
|
6658
|
-
if len(messages) <= keep_last + 1:
|
|
6738
|
+
if not attractor_detected and len(messages) <= keep_last + 1:
|
|
6659
6739
|
monitor.malformed_tool_streak = 0
|
|
6660
6740
|
monitor.invalid_tool_call_streak = 0
|
|
6661
6741
|
monitor.required_tool_miss_streak = 0
|
|
6662
6742
|
monitor.reset_tool_turn_state(reason="contamination_guardrail_soft_reset")
|
|
6663
6743
|
return anthropic_body
|
|
6664
6744
|
|
|
6665
|
-
|
|
6666
|
-
|
|
6667
|
-
|
|
6668
|
-
|
|
6669
|
-
|
|
6670
|
-
|
|
6671
|
-
|
|
6672
|
-
)
|
|
6673
|
-
|
|
6745
|
+
if attractor_detected:
|
|
6746
|
+
# Hard reset: drop the entire trailing context. Keep only the system
|
|
6747
|
+
# turn (if present) and the first user turn so the model has the
|
|
6748
|
+
# original goal but none of the attractor-priming tail.
|
|
6749
|
+
first_user_idx = next(
|
|
6750
|
+
(i for i, m in enumerate(messages) if m.get("role") == "user"),
|
|
6751
|
+
None,
|
|
6752
|
+
)
|
|
6753
|
+
if first_user_idx is None:
|
|
6754
|
+
head = messages[:1]
|
|
6755
|
+
else:
|
|
6756
|
+
head = messages[: first_user_idx + 1]
|
|
6757
|
+
# Phase 2 (PR #192): stronger, more structured intervention wording.
|
|
6758
|
+
# The Phase 1 single-paragraph message + temp 0.95 escaped one
|
|
6759
|
+
# production attractor (2026-05-25 02:39:59 fp:1f7e2c95...) but failed
|
|
6760
|
+
# to escape another (2026-05-24 19:11 fp:d19b7a44...). Increase the
|
|
6761
|
+
# signal-to-noise on the corrective by: (1) splitting MUST/MUST NOT
|
|
6762
|
+
# into bullet points the model attends to better, (2) using ALL CAPS
|
|
6763
|
+
# on the critical negative ("DO NOT narrate"), (3) explicitly naming
|
|
6764
|
+
# the attractor failure mode so the model can recognize and avoid it.
|
|
6765
|
+
reset_marker = {
|
|
6766
|
+
"role": "user",
|
|
6767
|
+
"content": (
|
|
6768
|
+
"[ATTRACTOR INTERVENTION — CRITICAL]\n\n"
|
|
6769
|
+
"Your previous responses REPEATEDLY emitted prose summaries "
|
|
6770
|
+
"instead of tool calls. This is the exact failure mode this "
|
|
6771
|
+
"intervention is designed to break. The trailing conversation "
|
|
6772
|
+
"has been REMOVED.\n\n"
|
|
6773
|
+
"YOUR NEXT RESPONSE MUST:\n"
|
|
6774
|
+
" 1. Begin with a tool_use block (no preamble, no thinking)\n"
|
|
6775
|
+
" 2. Invoke one of the available tools\n"
|
|
6776
|
+
" 3. Take a CONCRETE action toward the original task\n\n"
|
|
6777
|
+
"DO NOT:\n"
|
|
6778
|
+
" • Summarize what you have done or plan to do\n"
|
|
6779
|
+
" • Narrate, explain, or describe\n"
|
|
6780
|
+
" • Emit any prose before the tool_use block\n\n"
|
|
6781
|
+
"Just call the tool."
|
|
6782
|
+
),
|
|
6783
|
+
}
|
|
6784
|
+
new_messages = head + [reset_marker]
|
|
6785
|
+
monitor.attractor_correction_active = True
|
|
6786
|
+
log_reason = "attractor"
|
|
6787
|
+
else:
|
|
6788
|
+
head = messages[:1]
|
|
6789
|
+
tail = messages[-keep_last:]
|
|
6790
|
+
reset_marker = {
|
|
6791
|
+
"role": "user",
|
|
6792
|
+
"content": (
|
|
6793
|
+
"[SESSION RESET: tool-call quality degraded in earlier turns. "
|
|
6794
|
+
"Continue from the recent context and emit valid tool calls with strict JSON arguments only.]"
|
|
6795
|
+
),
|
|
6796
|
+
}
|
|
6797
|
+
new_messages = head + [reset_marker] + tail
|
|
6798
|
+
log_reason = "standard"
|
|
6674
6799
|
|
|
6675
6800
|
updated_body = dict(anthropic_body)
|
|
6676
|
-
updated_body["messages"] =
|
|
6801
|
+
updated_body["messages"] = new_messages
|
|
6677
6802
|
|
|
6678
6803
|
forced_before = monitor.consecutive_forced_count
|
|
6679
6804
|
required_miss_before = monitor.required_tool_miss_streak
|
|
@@ -6684,15 +6809,26 @@ def _maybe_apply_session_contamination_breaker(
|
|
|
6684
6809
|
monitor.no_progress_streak = 0
|
|
6685
6810
|
monitor.consecutive_forced_count = 0
|
|
6686
6811
|
monitor.forced_auto_cooldown_turns = 0
|
|
6687
|
-
monitor.reset_tool_turn_state(reason="
|
|
6688
|
-
|
|
6689
|
-
|
|
6690
|
-
|
|
6691
|
-
|
|
6692
|
-
|
|
6693
|
-
|
|
6694
|
-
|
|
6695
|
-
|
|
6812
|
+
monitor.reset_tool_turn_state(reason=f"contamination_guardrail_reset_{log_reason}")
|
|
6813
|
+
if attractor_detected:
|
|
6814
|
+
logger.warning(
|
|
6815
|
+
"CONTAMINATION ATTRACTOR DETECTED: session=%s hash=%s — hard reset "
|
|
6816
|
+
"applied, kept=%d messages (initial intent only), temp override "
|
|
6817
|
+
"and finalize threshold lowered to %d",
|
|
6818
|
+
session_id,
|
|
6819
|
+
monitor.last_fault_excerpt_hash,
|
|
6820
|
+
len(updated_body["messages"]),
|
|
6821
|
+
PROXY_ATTRACTOR_FINALIZE_THRESHOLD,
|
|
6822
|
+
)
|
|
6823
|
+
else:
|
|
6824
|
+
logger.warning(
|
|
6825
|
+
"SESSION CONTAMINATION BREAKER: session=%s reset applied, kept=%d messages (bad_streak=%d forced=%d required_miss=%d)",
|
|
6826
|
+
session_id,
|
|
6827
|
+
len(updated_body["messages"]),
|
|
6828
|
+
bad_streak,
|
|
6829
|
+
forced_before,
|
|
6830
|
+
required_miss_before,
|
|
6831
|
+
)
|
|
6696
6832
|
|
|
6697
6833
|
return updated_body
|
|
6698
6834
|
|
|
@@ -0,0 +1,213 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""Unit tests for the attractor-aware contamination-breaker path.
|
|
3
|
+
|
|
4
|
+
Validates that a repeated fault-excerpt hash across consecutive contamination
|
|
5
|
+
resets triggers the hard-reset + corrective-injection path, and that the
|
|
6
|
+
standard kept-last path remains unchanged when no repeat is observed.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
import importlib.util
|
|
10
|
+
import unittest
|
|
11
|
+
from pathlib import Path
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def _load_proxy_module():
|
|
15
|
+
proxy_path = Path(__file__).resolve().parents[1] / "scripts" / "anthropic_proxy.py"
|
|
16
|
+
spec = importlib.util.spec_from_file_location("anthropic_proxy", proxy_path)
|
|
17
|
+
assert spec is not None and spec.loader is not None
|
|
18
|
+
module = importlib.util.module_from_spec(spec)
|
|
19
|
+
spec.loader.exec_module(module)
|
|
20
|
+
return module
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
proxy = _load_proxy_module()
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def _make_monitor(**overrides):
|
|
27
|
+
m = proxy.SessionMonitor()
|
|
28
|
+
for k, v in overrides.items():
|
|
29
|
+
setattr(m, k, v)
|
|
30
|
+
return m
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def _make_body(n_msgs: int):
|
|
34
|
+
"""Build an anthropic_body with an initial user turn + (n_msgs - 1) alternating assistant/user turns."""
|
|
35
|
+
messages = [{"role": "user", "content": "Run a recon on /repos/pay2u."}]
|
|
36
|
+
for i in range(n_msgs - 1):
|
|
37
|
+
role = "assistant" if i % 2 == 0 else "user"
|
|
38
|
+
messages.append({"role": role, "content": f"turn-{i}"})
|
|
39
|
+
return {
|
|
40
|
+
"model": "qwen36-35b-a3b-iq4xs",
|
|
41
|
+
"messages": messages,
|
|
42
|
+
"tools": [{"name": "Bash", "input_schema": {"type": "object"}}],
|
|
43
|
+
"tool_choice": {"type": "any"},
|
|
44
|
+
}
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
class TestHashFaultExcerpt(unittest.TestCase):
|
|
48
|
+
def test_empty_returns_empty(self):
|
|
49
|
+
self.assertEqual(proxy._hash_fault_excerpt(""), "")
|
|
50
|
+
self.assertEqual(proxy._hash_fault_excerpt(" "), "")
|
|
51
|
+
|
|
52
|
+
def test_whitespace_normalized(self):
|
|
53
|
+
a = proxy._hash_fault_excerpt("The security architecture is layered.")
|
|
54
|
+
b = proxy._hash_fault_excerpt("The security architecture is layered.")
|
|
55
|
+
c = proxy._hash_fault_excerpt("The\nsecurity\narchitecture\nis\nlayered.")
|
|
56
|
+
self.assertEqual(a, b)
|
|
57
|
+
self.assertEqual(a, c)
|
|
58
|
+
|
|
59
|
+
def test_case_normalized(self):
|
|
60
|
+
a = proxy._hash_fault_excerpt("FAIL CLOSED security")
|
|
61
|
+
b = proxy._hash_fault_excerpt("fail closed security")
|
|
62
|
+
self.assertEqual(a, b)
|
|
63
|
+
|
|
64
|
+
def test_distinct_excerpts_distinct_hashes(self):
|
|
65
|
+
a = proxy._hash_fault_excerpt("Pay2U API analysis")
|
|
66
|
+
b = proxy._hash_fault_excerpt("Different attractor text")
|
|
67
|
+
self.assertNotEqual(a, b)
|
|
68
|
+
self.assertEqual(len(a), 16)
|
|
69
|
+
self.assertEqual(len(b), 16)
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
class TestAttractorDetectionPath(unittest.TestCase):
|
|
73
|
+
"""First reset → standard. Second reset with same excerpt → attractor."""
|
|
74
|
+
|
|
75
|
+
def _trip_breaker(self, monitor):
|
|
76
|
+
# Make the breaker think it should reset.
|
|
77
|
+
monitor.required_tool_miss_streak = (
|
|
78
|
+
proxy.PROXY_SESSION_CONTAMINATION_REQUIRED_MISS_THRESHOLD
|
|
79
|
+
)
|
|
80
|
+
|
|
81
|
+
def test_first_reset_is_standard(self):
|
|
82
|
+
monitor = _make_monitor()
|
|
83
|
+
monitor.last_fault_excerpt_hash = "deadbeefcafebabe"
|
|
84
|
+
self._trip_breaker(monitor)
|
|
85
|
+
|
|
86
|
+
body = _make_body(n_msgs=20)
|
|
87
|
+
updated = proxy._maybe_apply_session_contamination_breaker(
|
|
88
|
+
body, monitor, "test-session"
|
|
89
|
+
)
|
|
90
|
+
|
|
91
|
+
self.assertEqual(monitor.contamination_resets, 1)
|
|
92
|
+
self.assertFalse(monitor.attractor_correction_active)
|
|
93
|
+
# Standard keeps head + reset_marker + last keep_last messages
|
|
94
|
+
kept_last = max(2, proxy.PROXY_SESSION_CONTAMINATION_KEEP_LAST)
|
|
95
|
+
self.assertEqual(len(updated["messages"]), 1 + 1 + kept_last)
|
|
96
|
+
# Reset marker carries the standard wording, not the attractor wording.
|
|
97
|
+
self.assertIn("SESSION RESET", updated["messages"][1]["content"])
|
|
98
|
+
|
|
99
|
+
def test_second_reset_same_hash_triggers_attractor(self):
|
|
100
|
+
monitor = _make_monitor()
|
|
101
|
+
monitor.last_fault_excerpt_hash = "deadbeefcafebabe"
|
|
102
|
+
# Pretend we've already done one reset with the same fault excerpt.
|
|
103
|
+
monitor._prev_reset_fault_hash = "deadbeefcafebabe"
|
|
104
|
+
monitor.contamination_resets = 1
|
|
105
|
+
self._trip_breaker(monitor)
|
|
106
|
+
|
|
107
|
+
body = _make_body(n_msgs=20)
|
|
108
|
+
updated = proxy._maybe_apply_session_contamination_breaker(
|
|
109
|
+
body, monitor, "test-session"
|
|
110
|
+
)
|
|
111
|
+
|
|
112
|
+
self.assertTrue(monitor.attractor_correction_active)
|
|
113
|
+
# Hard reset keeps only system + first user (+ corrective marker)
|
|
114
|
+
# → 2 messages total for this body (first user + marker).
|
|
115
|
+
self.assertLessEqual(len(updated["messages"]), 3)
|
|
116
|
+
self.assertIn("ATTRACTOR INTERVENTION", updated["messages"][-1]["content"])
|
|
117
|
+
|
|
118
|
+
def test_second_reset_different_hash_stays_standard(self):
|
|
119
|
+
monitor = _make_monitor()
|
|
120
|
+
monitor.last_fault_excerpt_hash = "newhashvalue1234"
|
|
121
|
+
monitor._prev_reset_fault_hash = "deadbeefcafebabe"
|
|
122
|
+
monitor.contamination_resets = 1
|
|
123
|
+
self._trip_breaker(monitor)
|
|
124
|
+
|
|
125
|
+
body = _make_body(n_msgs=20)
|
|
126
|
+
updated = proxy._maybe_apply_session_contamination_breaker(
|
|
127
|
+
body, monitor, "test-session"
|
|
128
|
+
)
|
|
129
|
+
|
|
130
|
+
self.assertFalse(monitor.attractor_correction_active)
|
|
131
|
+
self.assertIn("SESSION RESET", updated["messages"][1]["content"])
|
|
132
|
+
|
|
133
|
+
|
|
134
|
+
class TestAttractorFinalizeThreshold(unittest.TestCase):
|
|
135
|
+
"""Once attractor correction is active, finalize fires at the lower
|
|
136
|
+
threshold instead of waiting for 3 resets."""
|
|
137
|
+
|
|
138
|
+
def test_attractor_lowers_finalize_threshold(self):
|
|
139
|
+
monitor = _make_monitor()
|
|
140
|
+
monitor.attractor_correction_active = True
|
|
141
|
+
# Just at the lowered threshold.
|
|
142
|
+
monitor.contamination_resets = proxy.PROXY_ATTRACTOR_FINALIZE_THRESHOLD
|
|
143
|
+
monitor.required_tool_miss_streak = (
|
|
144
|
+
proxy.PROXY_SESSION_CONTAMINATION_REQUIRED_MISS_THRESHOLD
|
|
145
|
+
)
|
|
146
|
+
|
|
147
|
+
body = _make_body(n_msgs=20)
|
|
148
|
+
updated = proxy._maybe_apply_session_contamination_breaker(
|
|
149
|
+
body, monitor, "test-session"
|
|
150
|
+
)
|
|
151
|
+
|
|
152
|
+
# Finalize path strips tools and appends the "respond with plain text" prompt.
|
|
153
|
+
self.assertNotIn("tools", updated)
|
|
154
|
+
self.assertNotIn("tool_choice", updated)
|
|
155
|
+
self.assertIn("plain text only", updated["messages"][-1]["content"])
|
|
156
|
+
|
|
157
|
+
def test_standard_path_keeps_3_reset_budget(self):
|
|
158
|
+
monitor = _make_monitor()
|
|
159
|
+
monitor.attractor_correction_active = False
|
|
160
|
+
# 2 resets done — under the standard 3-reset budget.
|
|
161
|
+
monitor.contamination_resets = 2
|
|
162
|
+
monitor.required_tool_miss_streak = (
|
|
163
|
+
proxy.PROXY_SESSION_CONTAMINATION_REQUIRED_MISS_THRESHOLD
|
|
164
|
+
)
|
|
165
|
+
|
|
166
|
+
body = _make_body(n_msgs=20)
|
|
167
|
+
updated = proxy._maybe_apply_session_contamination_breaker(
|
|
168
|
+
body, monitor, "test-session"
|
|
169
|
+
)
|
|
170
|
+
|
|
171
|
+
# Standard reset, not finalize.
|
|
172
|
+
self.assertIn("tools", updated)
|
|
173
|
+
|
|
174
|
+
|
|
175
|
+
class TestAttractorPhase2Defaults(unittest.TestCase):
|
|
176
|
+
"""Phase 2 (PR #192) raises the default temp override and strengthens the
|
|
177
|
+
intervention wording. Verify the defaults the operator gets out of the box."""
|
|
178
|
+
|
|
179
|
+
def test_temp_override_default_is_1_20(self):
|
|
180
|
+
# Phase 1 default was 0.95; Phase 2 raises to 1.20 after one
|
|
181
|
+
# production attractor (fp:d19b7a44...) failed to escape at 0.95.
|
|
182
|
+
self.assertGreaterEqual(proxy.PROXY_ATTRACTOR_TEMP_OVERRIDE, 1.20 - 0.001)
|
|
183
|
+
|
|
184
|
+
def test_intervention_message_has_structured_directives(self):
|
|
185
|
+
"""The Phase 2 wording uses MUST / DO NOT bullets and explicitly
|
|
186
|
+
names the failure mode. Trigger the attractor path and inspect the
|
|
187
|
+
injected marker."""
|
|
188
|
+
monitor = _make_monitor()
|
|
189
|
+
monitor.last_fault_excerpt_hash = "deadbeefcafebabe"
|
|
190
|
+
monitor._prev_reset_fault_hash = "deadbeefcafebabe"
|
|
191
|
+
monitor.contamination_resets = 1
|
|
192
|
+
monitor.required_tool_miss_streak = (
|
|
193
|
+
proxy.PROXY_SESSION_CONTAMINATION_REQUIRED_MISS_THRESHOLD
|
|
194
|
+
)
|
|
195
|
+
|
|
196
|
+
body = _make_body(n_msgs=20)
|
|
197
|
+
updated = proxy._maybe_apply_session_contamination_breaker(
|
|
198
|
+
body, monitor, "test-session"
|
|
199
|
+
)
|
|
200
|
+
|
|
201
|
+
content = updated["messages"][-1]["content"]
|
|
202
|
+
# Phase 2 wording signals
|
|
203
|
+
self.assertIn("CRITICAL", content)
|
|
204
|
+
self.assertIn("MUST", content)
|
|
205
|
+
self.assertIn("DO NOT", content)
|
|
206
|
+
# Specifically forbids the attractor's preferred behaviors
|
|
207
|
+
self.assertIn("prose", content.lower())
|
|
208
|
+
# Still has the marker substring callers may grep on
|
|
209
|
+
self.assertIn("ATTRACTOR INTERVENTION", content)
|
|
210
|
+
|
|
211
|
+
|
|
212
|
+
if __name__ == "__main__":
|
|
213
|
+
unittest.main()
|