@miller-tech/uap 1.20.47 → 1.20.49
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json
CHANGED
|
@@ -224,6 +224,16 @@ PROXY_FINALIZE_CONTINUATION_MAX = int(
|
|
|
224
224
|
PROXY_FINALIZE_SESSION_HARD_CAP = int(
|
|
225
225
|
os.environ.get("PROXY_FINALIZE_SESSION_HARD_CAP", "3")
|
|
226
226
|
)
|
|
227
|
+
# Recon-convergence guardrail: after this many consecutive turns of PURE
|
|
228
|
+
# read-only exploration (Read/Grep/Glob/etc. — no write/edit/deliverable
|
|
229
|
+
# tool), the proxy injects a directive telling the model to stop exploring
|
|
230
|
+
# and produce its deliverable. Targets the failure mode where an agentic
|
|
231
|
+
# recon task reads files for hundreds of turns and never converges to the
|
|
232
|
+
# synthesis/write step (observed: 664-turn recon, no deliverable started).
|
|
233
|
+
# 0 disables.
|
|
234
|
+
PROXY_RECON_CONVERGENCE_THRESHOLD = int(
|
|
235
|
+
os.environ.get("PROXY_RECON_CONVERGENCE_THRESHOLD", "40")
|
|
236
|
+
)
|
|
227
237
|
PROXY_STREAM_REASONING_FALLBACK = (
|
|
228
238
|
os.environ.get("PROXY_STREAM_REASONING_FALLBACK", "off").strip().lower()
|
|
229
239
|
)
|
|
@@ -702,6 +712,7 @@ class SessionMonitor:
|
|
|
702
712
|
peak_input_tokens: int = 0 # High-water mark
|
|
703
713
|
prune_count: int = 0 # How many times pruning was triggered
|
|
704
714
|
overflow_count: int = 0 # How many context overflow errors caught
|
|
715
|
+
prune_drop_count: int = 0 # monotonic: # of oldest middle msgs pruned (B3)
|
|
705
716
|
context_history: list = field(default_factory=list) # Recent token counts
|
|
706
717
|
|
|
707
718
|
# --- Token Loop Protection ---
|
|
@@ -716,6 +727,7 @@ class SessionMonitor:
|
|
|
716
727
|
)
|
|
717
728
|
loop_warnings_emitted: int = 0 # How many loop warnings sent to the model
|
|
718
729
|
no_progress_streak: int = 0 # Forced tool turns without new tool_result
|
|
730
|
+
consecutive_readonly_turns: int = 0 # turns of pure read-only exploration (B1)
|
|
719
731
|
unexpected_end_turn_count: int = 0 # end_turn without tool_use in active loop
|
|
720
732
|
tool_starvation_streak: int = 0 # Consecutive forced turns with no tool_calls produced
|
|
721
733
|
malformed_tool_streak: int = 0 # consecutive malformed pseudo tool payloads
|
|
@@ -873,6 +885,16 @@ class SessionMonitor:
|
|
|
873
885
|
if len(self.tool_call_history) > 30:
|
|
874
886
|
self.tool_call_history = self.tool_call_history[-30:]
|
|
875
887
|
|
|
888
|
+
# Recon-convergence (B1): count consecutive turns of PURE read-only
|
|
889
|
+
# exploration. A turn that uses any non-read-only tool (write, edit,
|
|
890
|
+
# a deliverable tool) resets the streak — that's the model
|
|
891
|
+
# converging from exploration toward synthesis/action.
|
|
892
|
+
_ro = {n.lower() for n in _READ_ONLY_TOOL_CLASS}
|
|
893
|
+
if tool_names and all(n.lower() in _ro for n in tool_names):
|
|
894
|
+
self.consecutive_readonly_turns += 1
|
|
895
|
+
else:
|
|
896
|
+
self.consecutive_readonly_turns = 0
|
|
897
|
+
|
|
876
898
|
# Track read-only tool targets for dedup (Option 3)
|
|
877
899
|
if tool_targets:
|
|
878
900
|
for name, target in tool_targets.items():
|
|
@@ -1297,24 +1319,83 @@ def estimate_total_tokens(anthropic_body: dict) -> int:
|
|
|
1297
1319
|
return tokens
|
|
1298
1320
|
|
|
1299
1321
|
|
|
1322
|
+
# Max tool-result breadcrumbs listed in a prune summary (B2). Bounds the
|
|
1323
|
+
# summary size — beyond this the oldest breadcrumbs are elided.
|
|
1324
|
+
_PRUNE_SUMMARY_MAX_ITEMS = int(os.environ.get("PROXY_PRUNE_SUMMARY_MAX_ITEMS", "30"))
|
|
1325
|
+
|
|
1326
|
+
|
|
1327
|
+
def _summarize_pruned_block(dropped: list[dict]) -> str:
    """Build a compact breadcrumb summary of pruned messages (B2).

    Instead of discarding dropped tool-results outright, leave a one-line
    trace of each so the agent retains *what it already found* and is less
    likely to re-read the same files after a prune.

    Heuristic only — no LLM call. At most ``_PRUNE_SUMMARY_MAX_ITEMS``
    breadcrumbs are listed (the most recent ones), so the summary itself
    cannot grow unbounded. A zero or negative limit lists no breadcrumbs
    and yields the plain static marker.

    Args:
        dropped: The pruned messages, oldest first. Only ``tool_result``
            blocks found in list-shaped ``content`` produce breadcrumbs;
            messages with string content are ignored.

    Returns:
        The text for the prune-marker message: either the static
        "[CONTEXT PRUNED: ...]" marker (nothing to list) or a header line
        followed by one ``- tool result (~N tok): <excerpt>`` line per
        retained breadcrumb.
    """
    static_marker = (
        "[CONTEXT PRUNED: older messages were removed to fit the context "
        "window. The conversation continues from recent context below.]"
    )

    # Read the limit at call time (it can be overridden after import) and
    # clamp it: `breadcrumbs[-0:]` is the WHOLE list, and a negative limit
    # would turn the slice into "skip the first N", so neither may reach
    # the slice below.
    limit = max(0, _PRUNE_SUMMARY_MAX_ITEMS)
    if limit == 0:
        return static_marker

    breadcrumbs: list[str] = []
    for msg in dropped:
        content = msg.get("content", [])
        if not isinstance(content, list):
            continue
        for block in content:
            if not (isinstance(block, dict) and block.get("type") == "tool_result"):
                continue
            text = _extract_text(block.get("content", "")).strip()
            if not text:
                continue
            # Collapse all whitespace runs so each breadcrumb stays on one line.
            excerpt = " ".join(text.split())[:100]
            breadcrumbs.append(
                f"- tool result (~{estimate_tokens(text)} tok): {excerpt}"
            )

    if not breadcrumbs:
        return static_marker

    total = len(breadcrumbs)
    if total > limit:
        # Keep the newest entries; the oldest findings are elided first.
        breadcrumbs = breadcrumbs[-limit:]

    header = (
        f"[CONTEXT PRUNED — {len(dropped)} older messages removed to fit the "
        "context window. Breadcrumbs of earlier findings"
    )
    if total > len(breadcrumbs):
        header += f" (most recent {len(breadcrumbs)} of {total} tool results)"
    header += " — rely on these instead of re-reading those files:]"
    return header + "\n" + "\n".join(breadcrumbs)
|
|
1370
|
+
|
|
1371
|
+
|
|
1300
1372
|
def prune_conversation(
|
|
1301
1373
|
anthropic_body: dict,
|
|
1302
1374
|
context_window: int,
|
|
1375
|
+
monitor: "SessionMonitor | None" = None,
|
|
1303
1376
|
target_fraction: float = 0.65,
|
|
1304
1377
|
keep_last: int = 8,
|
|
1305
1378
|
) -> dict:
|
|
1306
1379
|
"""Prune the conversation to fit within the context window.
|
|
1307
1380
|
|
|
1308
|
-
Strategy:
|
|
1309
|
-
- Always keep: system prompt, first user message, last N messages
|
|
1310
|
-
-
|
|
1311
|
-
|
|
1312
|
-
|
|
1313
|
-
|
|
1381
|
+
Strategy (reworked — UAP PR #186):
|
|
1382
|
+
- Always keep: system prompt, first user message, last N messages.
|
|
1383
|
+
- Drop a CONTIGUOUS block of the oldest middle messages. The drop
|
|
1384
|
+
count is persisted per-session on the monitor (`prune_drop_count`)
|
|
1385
|
+
and is monotonic — it only ever grows. This keeps the retained
|
|
1386
|
+
region a stable recent *suffix*: on turns where the boundary does
|
|
1387
|
+
not advance, the upstream KV-cache prefix stays valid and the turn
|
|
1388
|
+
is not reprocessed. (The previous priority-greedy keep was
|
|
1389
|
+
non-contiguous and reshuffled the prompt mid-stream every turn,
|
|
1390
|
+
defeating the cache.)
|
|
1391
|
+
- Replace the dropped block with a breadcrumb summary (see
|
|
1392
|
+
_summarize_pruned_block) so the agent keeps its earlier findings.
|
|
1314
1393
|
|
|
1315
1394
|
Args:
|
|
1316
1395
|
anthropic_body: The full Anthropic request body
|
|
1317
1396
|
context_window: Maximum context window in tokens
|
|
1397
|
+
monitor: SessionMonitor — carries the monotonic prune boundary.
|
|
1398
|
+
When None, pruning still works but is non-monotonic per call.
|
|
1318
1399
|
target_fraction: Target utilization after pruning (0.0-1.0)
|
|
1319
1400
|
keep_last: Number of recent messages to always keep (default 8)
|
|
1320
1401
|
|
|
@@ -1390,70 +1471,39 @@ def prune_conversation(
|
|
|
1390
1471
|
|
|
1391
1472
|
remaining_budget = message_budget - protected_tokens
|
|
1392
1473
|
|
|
1393
|
-
#
|
|
1394
|
-
#
|
|
1395
|
-
#
|
|
1396
|
-
#
|
|
1397
|
-
#
|
|
1398
|
-
|
|
1399
|
-
|
|
1400
|
-
|
|
1401
|
-
|
|
1402
|
-
|
|
1403
|
-
|
|
1404
|
-
|
|
1405
|
-
|
|
1406
|
-
|
|
1407
|
-
|
|
1408
|
-
)
|
|
1409
|
-
|
|
1410
|
-
# Lower priority = removed first
|
|
1411
|
-
if is_tool_result:
|
|
1412
|
-
priority = 0 # Remove first
|
|
1413
|
-
elif is_assistant:
|
|
1414
|
-
priority = 1 # Remove second
|
|
1415
|
-
else:
|
|
1416
|
-
priority = 2 # Remove last (user messages)
|
|
1417
|
-
|
|
1418
|
-
scored_middle.append((priority, i, tokens, msg))
|
|
1419
|
-
|
|
1420
|
-
# Sort by priority (ascending = remove first), then by index (oldest first)
|
|
1421
|
-
scored_middle.sort(key=lambda x: (x[0], x[1]))
|
|
1422
|
-
|
|
1423
|
-
# Greedily keep messages from highest priority (keep last) until budget fills
|
|
1424
|
-
kept_middle = []
|
|
1425
|
-
used_tokens = 0
|
|
1426
|
-
# Process in reverse priority order (keep high-priority messages first)
|
|
1427
|
-
for priority, idx, tokens, msg in reversed(scored_middle):
|
|
1428
|
-
if used_tokens + tokens <= remaining_budget:
|
|
1429
|
-
kept_middle.append((idx, msg))
|
|
1430
|
-
used_tokens += tokens
|
|
1431
|
-
|
|
1432
|
-
# Sort kept messages back into original order
|
|
1433
|
-
kept_middle.sort(key=lambda x: x[0])
|
|
1434
|
-
kept_msgs = [m for _, m in kept_middle]
|
|
1474
|
+
# --- Monotonic contiguous prune boundary (cache-stable, B3) ---
|
|
1475
|
+
# Drop the oldest `drop_count` middle messages as one contiguous block.
|
|
1476
|
+
# Seed from the monitor's persisted boundary; advance it only as far as
|
|
1477
|
+
# the budget forces. Persist back monotonically so a later/looser prune
|
|
1478
|
+
# in the same turn can't shrink it (which would reshuffle the prompt).
|
|
1479
|
+
drop_count = 0
|
|
1480
|
+
if monitor is not None:
|
|
1481
|
+
drop_count = min(max(0, monitor.prune_drop_count), len(middle))
|
|
1482
|
+
while drop_count < len(middle):
|
|
1483
|
+
kept_tokens = sum(estimate_message_tokens(m) for m in middle[drop_count:])
|
|
1484
|
+
if kept_tokens <= remaining_budget:
|
|
1485
|
+
break
|
|
1486
|
+
drop_count += 1
|
|
1487
|
+
if monitor is not None:
|
|
1488
|
+
monitor.prune_drop_count = max(monitor.prune_drop_count, drop_count)
|
|
1435
1489
|
|
|
1436
|
-
|
|
1437
|
-
|
|
1490
|
+
dropped = middle[:drop_count]
|
|
1491
|
+
kept_msgs = middle[drop_count:]
|
|
1438
1492
|
|
|
1439
|
-
if
|
|
1440
|
-
#
|
|
1493
|
+
if dropped:
|
|
1494
|
+
# Replace the dropped block with a findings-breadcrumb summary (B2).
|
|
1441
1495
|
prune_marker = {
|
|
1442
1496
|
"role": "user",
|
|
1443
|
-
"content": (
|
|
1444
|
-
f"[CONTEXT PRUNED: {removed_count} older messages (~{removed_tokens} tokens) "
|
|
1445
|
-
f"were removed to fit within the context window. "
|
|
1446
|
-
f"The conversation continues from recent context below.]"
|
|
1447
|
-
),
|
|
1497
|
+
"content": _summarize_pruned_block(dropped),
|
|
1448
1498
|
}
|
|
1449
1499
|
anthropic_body["messages"] = (
|
|
1450
1500
|
protected_head + [prune_marker] + kept_msgs + protected_tail
|
|
1451
1501
|
)
|
|
1452
1502
|
logger.warning(
|
|
1453
|
-
"PRUNED:
|
|
1454
|
-
"target=%.0f%% of %d ctx",
|
|
1455
|
-
|
|
1456
|
-
|
|
1503
|
+
"PRUNED: dropped %d oldest middle messages (boundary=%d), "
|
|
1504
|
+
"kept %d total, target=%.0f%% of %d ctx",
|
|
1505
|
+
len(dropped),
|
|
1506
|
+
drop_count,
|
|
1457
1507
|
len(anthropic_body["messages"]),
|
|
1458
1508
|
target_fraction * 100,
|
|
1459
1509
|
context_window,
|
|
@@ -3218,6 +3268,51 @@ def _resolve_state_machine_tool_choice(
|
|
|
3218
3268
|
return None, "unknown_phase"
|
|
3219
3269
|
|
|
3220
3270
|
|
|
3271
|
+
def _maybe_inject_recon_convergence(openai_body: dict, monitor: "SessionMonitor") -> None:
    """Append a "stop exploring, start writing" directive when a session has
    spent too long on read-only tool turns.

    Does nothing when PROXY_RECON_CONVERGENCE_THRESHOLD is zero or negative,
    or while ``monitor.consecutive_readonly_turns`` is still below it.
    Otherwise a user-role message is appended to ``openai_body["messages"]``
    (mutating the body in place) and a warning is logged. The wording has
    two tiers: "firm" from the threshold upward, and "hard" once the streak
    reaches twice the threshold.

    Args:
        openai_body: The outgoing request body; its "messages" list is
            extended in place (created if the key is missing).
        monitor: Session state supplying the read-only streak and the
            current context utilization.
    """
    threshold = PROXY_RECON_CONVERGENCE_THRESHOLD
    if threshold <= 0:
        # Guardrail disabled by configuration.
        return

    readonly_streak = monitor.consecutive_readonly_turns
    if readonly_streak < threshold:
        return

    ctx_pct = monitor.get_utilization() * 100
    escalate = readonly_streak >= 2 * threshold
    if escalate:
        tier = "hard"
        directive = (
            f"STOP exploring. You have run {readonly_streak} consecutive turns of "
            f"read-only exploration and context is at {ctx_pct:.0f}%. "
            "You will NOT finish if you keep reading files. Produce your "
            "deliverable NOW from the information you already have — write "
            "it to a file with the appropriate tool. Do not read anything else."
        )
    else:
        tier = "firm"
        directive = (
            f"You have read files for {readonly_streak} consecutive turns without "
            f"producing a deliverable (context {ctx_pct:.0f}%). You have "
            "enough to begin. Switch from exploration to synthesis: write "
            "your deliverable now. Read at most one more file, and only if "
            "strictly required to write it."
        )

    # setdefault both creates the list when absent and leaves it attached
    # to the body, so the append is visible to the caller either way.
    openai_body.setdefault("messages", []).append(
        {"role": "user", "content": directive}
    )
    logger.warning(
        "RECON CONVERGENCE: injected %s directive (readonly_streak=%d, ctx=%.0f%%)",
        tier,
        readonly_streak,
        ctx_pct,
    )
|
|
3314
|
+
|
|
3315
|
+
|
|
3221
3316
|
def build_openai_request(
|
|
3222
3317
|
anthropic_body: dict,
|
|
3223
3318
|
monitor: SessionMonitor,
|
|
@@ -3725,6 +3820,11 @@ def build_openai_request(
|
|
|
3725
3820
|
|
|
3726
3821
|
_apply_tool_call_grammar(openai_body, grammar_override=profile_grammar)
|
|
3727
3822
|
|
|
3823
|
+
# Recon-convergence guardrail (B1) — runs on every built request so a
|
|
3824
|
+
# session wandering in read-only exploration is nudged toward its
|
|
3825
|
+
# deliverable regardless of tool-turn phase.
|
|
3826
|
+
_maybe_inject_recon_convergence(openai_body, monitor)
|
|
3827
|
+
|
|
3728
3828
|
return openai_body
|
|
3729
3829
|
|
|
3730
3830
|
|
|
@@ -7489,7 +7589,8 @@ async def messages(request: Request):
|
|
|
7489
7589
|
target_frac * 100,
|
|
7490
7590
|
)
|
|
7491
7591
|
body = prune_conversation(
|
|
7492
|
-
body, ctx_window,
|
|
7592
|
+
body, ctx_window, monitor=monitor,
|
|
7593
|
+
target_fraction=target_frac, keep_last=keep_last,
|
|
7493
7594
|
)
|
|
7494
7595
|
monitor.prune_count += 1
|
|
7495
7596
|
# Option 4: Post-prune validation — verify actual reduction
|
|
@@ -7510,7 +7611,8 @@ async def messages(request: Request):
|
|
|
7510
7611
|
post_util * 100,
|
|
7511
7612
|
)
|
|
7512
7613
|
body = prune_conversation(
|
|
7513
|
-
body, ctx_window,
|
|
7614
|
+
body, ctx_window, monitor=monitor,
|
|
7615
|
+
target_fraction=0.35, keep_last=4,
|
|
7514
7616
|
)
|
|
7515
7617
|
monitor.prune_count += 1
|
|
7516
7618
|
estimated_tokens = estimate_total_tokens(body)
|
|
@@ -5371,3 +5371,189 @@ class TestSlotSaveRestore(unittest.TestCase):
|
|
|
5371
5371
|
self.assertIn("fp:owner", proxy._slot_lru)
|
|
5372
5372
|
self.assertIn("fp:new1", proxy._slot_lru)
|
|
5373
5373
|
self.assertIn("fp:new2", proxy._slot_lru)
|
|
5374
|
+
|
|
5375
|
+
|
|
5376
|
+
class TestReconConvergence(unittest.TestCase):
    """Tests for the B1 recon-convergence guardrail — nudges a session
    stuck doing read-only exploration toward producing its deliverable.

    Targets the observed failure: a 664-turn agentic recon task that read
    files for hours and never converged to the synthesis/write step.

    The injection tests pin the exact tier boundaries (threshold, 2x - 1,
    2x) rather than arbitrary values inside each tier, so an off-by-one in
    the comparisons is caught.
    """

    THRESHOLD = 40

    def setUp(self):
        # Tests overwrite the module-level threshold; addCleanup restores
        # the original value even when a test fails part-way through.
        self.addCleanup(
            setattr,
            proxy,
            "PROXY_RECON_CONVERGENCE_THRESHOLD",
            proxy.PROXY_RECON_CONVERGENCE_THRESHOLD,
        )
        proxy.PROXY_RECON_CONVERGENCE_THRESHOLD = self.THRESHOLD

    @staticmethod
    def _inject(streak: int, input_tokens: int = 120000) -> list:
        """Run the guardrail on a one-message body and return the messages."""
        monitor = proxy.SessionMonitor(context_window=131072)
        monitor.consecutive_readonly_turns = streak
        monitor.last_input_tokens = input_tokens
        body = {"messages": [{"role": "user", "content": "go"}]}
        proxy._maybe_inject_recon_convergence(body, monitor)
        return body["messages"]

    def test_readonly_turns_increment_the_streak(self):
        """Consecutive turns using only read-only tools grow the streak."""
        m = proxy.SessionMonitor(context_window=131072)
        for _ in range(5):
            m.record_tool_calls(["Read"])
        self.assertEqual(m.consecutive_readonly_turns, 5)
        m.record_tool_calls(["Grep", "Glob"])
        self.assertEqual(m.consecutive_readonly_turns, 6)

    def test_non_readonly_tool_resets_the_streak(self):
        """A turn using a write/edit tool means the model converged toward
        action — the streak resets to 0."""
        m = proxy.SessionMonitor(context_window=131072)
        for _ in range(10):
            m.record_tool_calls(["Read"])
        self.assertEqual(m.consecutive_readonly_turns, 10)
        m.record_tool_calls(["Write"])
        self.assertEqual(m.consecutive_readonly_turns, 0)

    def test_mixed_turn_with_one_write_resets(self):
        """A turn mixing read-only and a write tool still counts as
        converging — any non-read-only tool resets."""
        m = proxy.SessionMonitor(context_window=131072)
        for _ in range(10):
            m.record_tool_calls(["Read"])
        m.record_tool_calls(["Read", "Edit"])
        self.assertEqual(m.consecutive_readonly_turns, 0)

    def test_no_injection_below_threshold(self):
        """One turn short of the threshold leaves the body untouched."""
        self.assertEqual(len(self._inject(self.THRESHOLD - 1)), 1)

    def test_firm_directive_at_threshold(self):
        """From exactly the threshold up to just below 2x, the firm
        "switch to synthesis" directive is injected — never the hard one."""
        for streak in (self.THRESHOLD, 45, 2 * self.THRESHOLD - 1):
            with self.subTest(streak=streak):
                messages = self._inject(streak)
                self.assertEqual(len(messages), 2)
                self.assertEqual(messages[-1]["role"], "user")
                injected = messages[-1]["content"]
                self.assertIn("synthesis", injected.lower())
                self.assertNotIn("STOP exploring", injected)

    def test_hard_directive_at_2x_threshold(self):
        """Once the streak is 2x over threshold, escalate to a hard STOP."""
        # 250000 input tokens is over budget — the real-incident shape.
        messages = self._inject(2 * self.THRESHOLD, input_tokens=250000)
        self.assertEqual(len(messages), 2)
        self.assertIn("STOP exploring", messages[-1]["content"])

    def test_disabled_when_threshold_zero(self):
        """A threshold of 0 disables the guardrail regardless of streak."""
        proxy.PROXY_RECON_CONVERGENCE_THRESHOLD = 0
        self.assertEqual(len(self._inject(500)), 1)
|
|
5455
|
+
|
|
5456
|
+
|
|
5457
|
+
class TestPrunerRework(unittest.TestCase):
    """Tests for the reworked context pruner (B2 + B3).

    Covers the two properties of the rework: the prune boundary is a
    contiguous, monotonic cut (so retained context stays cache-stable), and
    the dropped block is replaced by a bounded breadcrumb summary instead
    of vanishing.
    """

    @staticmethod
    def _tool_result_msg(idx: int, size: int = 4000) -> dict:
        """Return a user message wrapping one bulky tool_result block."""
        payload = f"FILE-{idx} " + ("x" * size)
        result_block = {
            "type": "tool_result",
            "tool_use_id": f"toolu_{idx}",
            "content": payload,
        }
        return {"role": "user", "content": [result_block]}

    def _big_body(self, n_middle: int = 20) -> dict:
        """Return a request body: task prompt, n_middle read/result pairs,
        then a trailing "continue" turn."""
        conversation = [{"role": "user", "content": "recon task: analyze the repo"}]
        for i in range(n_middle):
            conversation.extend(
                [
                    {"role": "assistant", "content": f"reading file {i}"},
                    self._tool_result_msg(i),
                ]
            )
        conversation.append({"role": "user", "content": "continue"})
        return {"messages": conversation}

    def _prune(self, body: dict, monitor, target_fraction: float = 0.5) -> dict:
        """Prune `body` against an 8192-token window, keeping the last 6."""
        return proxy.prune_conversation(
            body, 8192, monitor=monitor,
            target_fraction=target_fraction, keep_last=6,
        )

    def test_prune_drop_count_is_monotonic(self):
        """The per-session prune boundary only ever grows."""
        monitor = proxy.SessionMonitor(context_window=8192)
        self._prune(self._big_body(), monitor, target_fraction=0.5)
        boundary = monitor.prune_drop_count
        self.assertGreater(boundary, 0)
        # A tighter target on the same body can only drop more, never fewer.
        self._prune(self._big_body(), monitor, target_fraction=0.25)
        self.assertGreaterEqual(monitor.prune_drop_count, boundary)

    def test_kept_middle_is_contiguous_suffix(self):
        """The pruner drops a contiguous oldest block — the surviving
        middle messages are a contiguous suffix of the original middle,
        never a non-contiguous greedy pick."""
        monitor = proxy.SessionMonitor(context_window=8192)
        body = self._big_body()
        before = list(body["messages"])
        after = self._prune(body, monitor)["messages"]

        kept_positions = [before.index(msg) for msg in after if msg in before]
        # Survivors keep their original relative order ...
        self.assertEqual(kept_positions, sorted(kept_positions))
        # ... and everything after the first message is one unbroken run.
        beyond_head = [pos for pos in kept_positions if pos > 0]
        if len(beyond_head) > 1:
            start = beyond_head[0]
            self.assertEqual(
                beyond_head, list(range(start, start + len(beyond_head)))
            )

    def test_stable_output_when_boundary_does_not_advance(self):
        """Cache-stability: pruning the same body twice with the same
        monitor yields byte-identical message lists — the second call
        seeds from the persisted boundary and does not advance it."""
        monitor = proxy.SessionMonitor(context_window=8192)
        initial = self._prune(self._big_body(), monitor)
        boundary = monitor.prune_drop_count
        repeat = self._prune(self._big_body(), monitor)
        self.assertEqual(monitor.prune_drop_count, boundary)
        self.assertEqual(initial["messages"], repeat["messages"])

    def test_dropped_tool_results_become_breadcrumbs(self):
        """Pruned tool-results survive as one-line breadcrumbs in the
        marker, not silently discarded."""
        pruned = [self._tool_result_msg(i) for i in range(3)]
        text = proxy._summarize_pruned_block(pruned)
        for expected in ("CONTEXT PRUNED", "tool result", "FILE-0", "FILE-2"):
            self.assertIn(expected, text)

    def test_summary_is_bounded_by_max_items(self):
        """A huge dropped block does not produce an unbounded summary."""
        # Restore the module-level cap once the test is over, pass or fail.
        self.addCleanup(
            setattr, proxy, "_PRUNE_SUMMARY_MAX_ITEMS", proxy._PRUNE_SUMMARY_MAX_ITEMS
        )
        proxy._PRUNE_SUMMARY_MAX_ITEMS = 5
        pruned = [self._tool_result_msg(i) for i in range(40)]
        text = proxy._summarize_pruned_block(pruned)
        self.assertEqual(text.count("- tool result"), 5)
        self.assertIn("most recent 5 of 40", text)

    def test_summarize_no_tool_results_falls_back_to_static_marker(self):
        """A dropped block with no tool-results yields the plain static
        marker — no per-call varying text (cache-safe)."""
        pruned = [
            {"role": "assistant", "content": "thinking out loud"},
            {"role": "user", "content": "ok"},
        ]
        text = proxy._summarize_pruned_block(pruned)
        self.assertIn("CONTEXT PRUNED", text)
        for unexpected in ("tool result", "most recent"):
            self.assertNotIn(unexpected, text)
|