@miller-tech/uap 1.20.48 → 1.20.50
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json
CHANGED
|
@@ -207,6 +207,19 @@ _READ_ONLY_TOOL_CLASS = frozenset({
|
|
|
207
207
|
"search", "Search", "list_files", "ListFiles",
|
|
208
208
|
})
|
|
209
209
|
|
|
210
|
+
# Tools that produce or mutate a deliverable. Using any of these in a turn
|
|
211
|
+
# means the agent is converging from exploration toward output, and resets
|
|
212
|
+
# the recon-convergence streak (B1). This is deliberately a SHORT allowlist
|
|
213
|
+
# of write tools, NOT a read-only denylist: exploration happens through an
|
|
214
|
+
# open-ended set of tools (Bash, WebFetch, Agent, ...) that cannot be
|
|
215
|
+
# enumerated, but "the agent produced a write" is a small, stable signal.
|
|
216
|
+
# Names are matched case-insensitively (callers lower() before lookup).
|
|
217
|
+
_WRITE_TOOL_CLASS = frozenset({
|
|
218
|
+
"write", "edit", "multiedit", "notebookedit",
|
|
219
|
+
"str_replace", "str_replace_editor", "str_replace_based_edit_tool",
|
|
220
|
+
"create_file", "applypatch", "apply_patch",
|
|
221
|
+
})
|
|
222
|
+
|
|
210
223
|
PROXY_GUARDRAIL_RETRY = os.environ.get("PROXY_GUARDRAIL_RETRY", "on").lower() not in {
|
|
211
224
|
"0",
|
|
212
225
|
"false",
|
|
@@ -224,12 +237,15 @@ PROXY_FINALIZE_CONTINUATION_MAX = int(
|
|
|
224
237
|
PROXY_FINALIZE_SESSION_HARD_CAP = int(
|
|
225
238
|
os.environ.get("PROXY_FINALIZE_SESSION_HARD_CAP", "3")
|
|
226
239
|
)
|
|
227
|
-
# Recon-convergence guardrail: after this many consecutive turns
|
|
228
|
-
#
|
|
229
|
-
#
|
|
230
|
-
#
|
|
231
|
-
#
|
|
240
|
+
# Recon-convergence guardrail: after this many consecutive turns that use
|
|
241
|
+
# tools but produce NO write/deliverable tool call (see _WRITE_TOOL_CLASS),
|
|
242
|
+
# the proxy injects a directive telling the model to stop exploring and
|
|
243
|
+
# produce its deliverable. Targets the failure mode where an agentic recon
|
|
244
|
+
# task explores for hundreds of turns and never converges to the
|
|
232
245
|
# synthesis/write step (observed: 664-turn recon, no deliverable started).
|
|
246
|
+
# Defined as write-tool ABSENCE rather than read-tool presence: a real
|
|
247
|
+
# recon agent explores via Bash/WebFetch/Agent, not just Read/Grep, so a
|
|
248
|
+
# "all tools are recognized read-only" test never accumulates a streak.
|
|
233
249
|
# 0 disables.
|
|
234
250
|
PROXY_RECON_CONVERGENCE_THRESHOLD = int(
|
|
235
251
|
os.environ.get("PROXY_RECON_CONVERGENCE_THRESHOLD", "40")
|
|
@@ -712,6 +728,7 @@ class SessionMonitor:
|
|
|
712
728
|
peak_input_tokens: int = 0 # High-water mark
|
|
713
729
|
prune_count: int = 0 # How many times pruning was triggered
|
|
714
730
|
overflow_count: int = 0 # How many context overflow errors caught
|
|
731
|
+
prune_drop_count: int = 0 # monotonic: # of oldest middle msgs pruned (B3)
|
|
715
732
|
context_history: list = field(default_factory=list) # Recent token counts
|
|
716
733
|
|
|
717
734
|
# --- Token Loop Protection ---
|
|
@@ -726,7 +743,7 @@ class SessionMonitor:
|
|
|
726
743
|
)
|
|
727
744
|
loop_warnings_emitted: int = 0 # How many loop warnings sent to the model
|
|
728
745
|
no_progress_streak: int = 0 # Forced tool turns without new tool_result
|
|
729
|
-
|
|
746
|
+
consecutive_no_write_turns: int = 0 # turns exploring with no write tool (B1)
|
|
730
747
|
unexpected_end_turn_count: int = 0 # end_turn without tool_use in active loop
|
|
731
748
|
tool_starvation_streak: int = 0 # Consecutive forced turns with no tool_calls produced
|
|
732
749
|
malformed_tool_streak: int = 0 # consecutive malformed pseudo tool payloads
|
|
@@ -884,15 +901,19 @@ class SessionMonitor:
|
|
|
884
901
|
if len(self.tool_call_history) > 30:
|
|
885
902
|
self.tool_call_history = self.tool_call_history[-30:]
|
|
886
903
|
|
|
887
|
-
# Recon-convergence (B1): count consecutive turns
|
|
888
|
-
#
|
|
889
|
-
#
|
|
890
|
-
#
|
|
891
|
-
|
|
892
|
-
|
|
893
|
-
|
|
894
|
-
|
|
895
|
-
|
|
904
|
+
# Recon-convergence (B1): count consecutive turns that use tools but
|
|
905
|
+
# produce NO write/deliverable tool call. A turn that uses any write
|
|
906
|
+
# tool resets the streak — that's the model converging from
|
|
907
|
+
# exploration toward synthesis/output. A turn with no tool calls at
|
|
908
|
+
# all is a plain-text turn (neither exploration nor a write) and
|
|
909
|
+
# leaves the streak unchanged. This is the inverse of the old
|
|
910
|
+
# "all tools are recognized read-only" test, which reset on any
|
|
911
|
+
# Bash/WebFetch/Agent turn and so never accumulated for real agents.
|
|
912
|
+
if tool_names:
|
|
913
|
+
if any(n.lower() in _WRITE_TOOL_CLASS for n in tool_names):
|
|
914
|
+
self.consecutive_no_write_turns = 0
|
|
915
|
+
else:
|
|
916
|
+
self.consecutive_no_write_turns += 1
|
|
896
917
|
|
|
897
918
|
# Track read-only tool targets for dedup (Option 3)
|
|
898
919
|
if tool_targets:
|
|
@@ -1318,24 +1339,83 @@ def estimate_total_tokens(anthropic_body: dict) -> int:
|
|
|
1318
1339
|
return tokens
|
|
1319
1340
|
|
|
1320
1341
|
|
|
1342
|
+
# Max tool-result breadcrumbs listed in a prune summary (B2). Bounds the
|
|
1343
|
+
# summary size — beyond this the oldest breadcrumbs are elided.
|
|
1344
|
+
_PRUNE_SUMMARY_MAX_ITEMS = int(os.environ.get("PROXY_PRUNE_SUMMARY_MAX_ITEMS", "30"))
|
|
1345
|
+
|
|
1346
|
+
|
|
1347
|
+
def _summarize_pruned_block(dropped: list[dict]) -> str:
|
|
1348
|
+
"""Build a compact breadcrumb summary of pruned messages (B2).
|
|
1349
|
+
|
|
1350
|
+
Instead of discarding dropped tool-results outright, leave a one-line
|
|
1351
|
+
trace of each so the agent retains *what it already found*. A recon
|
|
1352
|
+
agent that can still see "I read auth_handler.cpp — JWT validation in
|
|
1353
|
+
validateToken()" is far likelier to converge to a synthesis than one
|
|
1354
|
+
whose findings vanished entirely and which therefore re-explores.
|
|
1355
|
+
|
|
1356
|
+
Heuristic only — no LLM call. Bounded to the most recent
|
|
1357
|
+
PROXY_PRUNE_SUMMARY_MAX_ITEMS tool-result breadcrumbs so the summary
|
|
1358
|
+
itself cannot grow unbounded.
|
|
1359
|
+
"""
|
|
1360
|
+
breadcrumbs: list[str] = []
|
|
1361
|
+
for msg in dropped:
|
|
1362
|
+
content = msg.get("content", [])
|
|
1363
|
+
if not isinstance(content, list):
|
|
1364
|
+
continue
|
|
1365
|
+
for block in content:
|
|
1366
|
+
if isinstance(block, dict) and block.get("type") == "tool_result":
|
|
1367
|
+
text = _extract_text(block.get("content", "")).strip()
|
|
1368
|
+
if not text:
|
|
1369
|
+
continue
|
|
1370
|
+
excerpt = " ".join(text.split())[:100]
|
|
1371
|
+
breadcrumbs.append(
|
|
1372
|
+
f"- tool result (~{estimate_tokens(text)} tok): {excerpt}"
|
|
1373
|
+
)
|
|
1374
|
+
if not breadcrumbs:
|
|
1375
|
+
return (
|
|
1376
|
+
"[CONTEXT PRUNED: older messages were removed to fit the context "
|
|
1377
|
+
"window. The conversation continues from recent context below.]"
|
|
1378
|
+
)
|
|
1379
|
+
total = len(breadcrumbs)
|
|
1380
|
+
if total > _PRUNE_SUMMARY_MAX_ITEMS:
|
|
1381
|
+
breadcrumbs = breadcrumbs[-_PRUNE_SUMMARY_MAX_ITEMS:]
|
|
1382
|
+
header = (
|
|
1383
|
+
f"[CONTEXT PRUNED — {len(dropped)} older messages removed to fit the "
|
|
1384
|
+
"context window. Breadcrumbs of earlier findings"
|
|
1385
|
+
)
|
|
1386
|
+
if total > len(breadcrumbs):
|
|
1387
|
+
header += f" (most recent {len(breadcrumbs)} of {total} tool results)"
|
|
1388
|
+
header += " — rely on these instead of re-reading those files:]"
|
|
1389
|
+
return header + "\n" + "\n".join(breadcrumbs)
|
|
1390
|
+
|
|
1391
|
+
|
|
1321
1392
|
def prune_conversation(
|
|
1322
1393
|
anthropic_body: dict,
|
|
1323
1394
|
context_window: int,
|
|
1395
|
+
monitor: "SessionMonitor | None" = None,
|
|
1324
1396
|
target_fraction: float = 0.65,
|
|
1325
1397
|
keep_last: int = 8,
|
|
1326
1398
|
) -> dict:
|
|
1327
1399
|
"""Prune the conversation to fit within the context window.
|
|
1328
1400
|
|
|
1329
|
-
Strategy:
|
|
1330
|
-
- Always keep: system prompt, first user message, last N messages
|
|
1331
|
-
-
|
|
1332
|
-
|
|
1333
|
-
|
|
1334
|
-
|
|
1401
|
+
Strategy (reworked — UAP PR #186):
|
|
1402
|
+
- Always keep: system prompt, first user message, last N messages.
|
|
1403
|
+
- Drop a CONTIGUOUS block of the oldest middle messages. The drop
|
|
1404
|
+
count is persisted per-session on the monitor (`prune_drop_count`)
|
|
1405
|
+
and is monotonic — it only ever grows. This keeps the retained
|
|
1406
|
+
region a stable recent *suffix*: on turns where the boundary does
|
|
1407
|
+
not advance, the upstream KV-cache prefix stays valid and the turn
|
|
1408
|
+
is not reprocessed. (The previous priority-greedy keep was
|
|
1409
|
+
non-contiguous and reshuffled the prompt mid-stream every turn,
|
|
1410
|
+
defeating the cache.)
|
|
1411
|
+
- Replace the dropped block with a breadcrumb summary (see
|
|
1412
|
+
_summarize_pruned_block) so the agent keeps its earlier findings.
|
|
1335
1413
|
|
|
1336
1414
|
Args:
|
|
1337
1415
|
anthropic_body: The full Anthropic request body
|
|
1338
1416
|
context_window: Maximum context window in tokens
|
|
1417
|
+
monitor: SessionMonitor — carries the monotonic prune boundary.
|
|
1418
|
+
When None, pruning still works but is non-monotonic per call.
|
|
1339
1419
|
target_fraction: Target utilization after pruning (0.0-1.0)
|
|
1340
1420
|
keep_last: Number of recent messages to always keep (default 8)
|
|
1341
1421
|
|
|
@@ -1411,70 +1491,39 @@ def prune_conversation(
|
|
|
1411
1491
|
|
|
1412
1492
|
remaining_budget = message_budget - protected_tokens
|
|
1413
1493
|
|
|
1414
|
-
#
|
|
1415
|
-
#
|
|
1416
|
-
#
|
|
1417
|
-
#
|
|
1418
|
-
#
|
|
1419
|
-
|
|
1420
|
-
|
|
1421
|
-
|
|
1422
|
-
|
|
1423
|
-
|
|
1424
|
-
|
|
1425
|
-
|
|
1426
|
-
|
|
1427
|
-
|
|
1428
|
-
|
|
1429
|
-
)
|
|
1430
|
-
|
|
1431
|
-
# Lower priority = removed first
|
|
1432
|
-
if is_tool_result:
|
|
1433
|
-
priority = 0 # Remove first
|
|
1434
|
-
elif is_assistant:
|
|
1435
|
-
priority = 1 # Remove second
|
|
1436
|
-
else:
|
|
1437
|
-
priority = 2 # Remove last (user messages)
|
|
1438
|
-
|
|
1439
|
-
scored_middle.append((priority, i, tokens, msg))
|
|
1440
|
-
|
|
1441
|
-
# Sort by priority (ascending = remove first), then by index (oldest first)
|
|
1442
|
-
scored_middle.sort(key=lambda x: (x[0], x[1]))
|
|
1443
|
-
|
|
1444
|
-
# Greedily keep messages from highest priority (keep last) until budget fills
|
|
1445
|
-
kept_middle = []
|
|
1446
|
-
used_tokens = 0
|
|
1447
|
-
# Process in reverse priority order (keep high-priority messages first)
|
|
1448
|
-
for priority, idx, tokens, msg in reversed(scored_middle):
|
|
1449
|
-
if used_tokens + tokens <= remaining_budget:
|
|
1450
|
-
kept_middle.append((idx, msg))
|
|
1451
|
-
used_tokens += tokens
|
|
1452
|
-
|
|
1453
|
-
# Sort kept messages back into original order
|
|
1454
|
-
kept_middle.sort(key=lambda x: x[0])
|
|
1455
|
-
kept_msgs = [m for _, m in kept_middle]
|
|
1494
|
+
# --- Monotonic contiguous prune boundary (cache-stable, B3) ---
|
|
1495
|
+
# Drop the oldest `drop_count` middle messages as one contiguous block.
|
|
1496
|
+
# Seed from the monitor's persisted boundary; advance it only as far as
|
|
1497
|
+
# the budget forces. Persist back monotonically so a later/looser prune
|
|
1498
|
+
# in the same turn can't shrink it (which would reshuffle the prompt).
|
|
1499
|
+
drop_count = 0
|
|
1500
|
+
if monitor is not None:
|
|
1501
|
+
drop_count = min(max(0, monitor.prune_drop_count), len(middle))
|
|
1502
|
+
while drop_count < len(middle):
|
|
1503
|
+
kept_tokens = sum(estimate_message_tokens(m) for m in middle[drop_count:])
|
|
1504
|
+
if kept_tokens <= remaining_budget:
|
|
1505
|
+
break
|
|
1506
|
+
drop_count += 1
|
|
1507
|
+
if monitor is not None:
|
|
1508
|
+
monitor.prune_drop_count = max(monitor.prune_drop_count, drop_count)
|
|
1456
1509
|
|
|
1457
|
-
|
|
1458
|
-
|
|
1510
|
+
dropped = middle[:drop_count]
|
|
1511
|
+
kept_msgs = middle[drop_count:]
|
|
1459
1512
|
|
|
1460
|
-
if
|
|
1461
|
-
#
|
|
1513
|
+
if dropped:
|
|
1514
|
+
# Replace the dropped block with a findings-breadcrumb summary (B2).
|
|
1462
1515
|
prune_marker = {
|
|
1463
1516
|
"role": "user",
|
|
1464
|
-
"content": (
|
|
1465
|
-
f"[CONTEXT PRUNED: {removed_count} older messages (~{removed_tokens} tokens) "
|
|
1466
|
-
f"were removed to fit within the context window. "
|
|
1467
|
-
f"The conversation continues from recent context below.]"
|
|
1468
|
-
),
|
|
1517
|
+
"content": _summarize_pruned_block(dropped),
|
|
1469
1518
|
}
|
|
1470
1519
|
anthropic_body["messages"] = (
|
|
1471
1520
|
protected_head + [prune_marker] + kept_msgs + protected_tail
|
|
1472
1521
|
)
|
|
1473
1522
|
logger.warning(
|
|
1474
|
-
"PRUNED:
|
|
1475
|
-
"target=%.0f%% of %d ctx",
|
|
1476
|
-
|
|
1477
|
-
|
|
1523
|
+
"PRUNED: dropped %d oldest middle messages (boundary=%d), "
|
|
1524
|
+
"kept %d total, target=%.0f%% of %d ctx",
|
|
1525
|
+
len(dropped),
|
|
1526
|
+
drop_count,
|
|
1478
1527
|
len(anthropic_body["messages"]),
|
|
1479
1528
|
target_fraction * 100,
|
|
1480
1529
|
context_window,
|
|
@@ -3240,46 +3289,46 @@ def _resolve_state_machine_tool_choice(
|
|
|
3240
3289
|
|
|
3241
3290
|
|
|
3242
3291
|
def _maybe_inject_recon_convergence(openai_body: dict, monitor: "SessionMonitor") -> None:
|
|
3243
|
-
"""Nudge a session stuck in prolonged
|
|
3244
|
-
|
|
3245
|
-
|
|
3246
|
-
|
|
3247
|
-
|
|
3248
|
-
|
|
3249
|
-
|
|
3250
|
-
|
|
3251
|
-
"
|
|
3252
|
-
the streak is 2x over threshold.
|
|
3292
|
+
"""Nudge a session stuck in prolonged exploration toward its deliverable.
|
|
3293
|
+
|
|
3294
|
+
Fires when `consecutive_no_write_turns` crosses
|
|
3295
|
+
PROXY_RECON_CONVERGENCE_THRESHOLD — the model has used tools for many
|
|
3296
|
+
turns without producing any write/deliverable tool call. Targets the
|
|
3297
|
+
observed failure mode of an agentic recon task wandering for hundreds
|
|
3298
|
+
of turns and never converging to the synthesis/write step. Two
|
|
3299
|
+
escalation tiers: a firm "switch to synthesis" directive, then a hard
|
|
3300
|
+
"STOP, write it now" once the streak is 2x over threshold.
|
|
3253
3301
|
"""
|
|
3254
3302
|
if PROXY_RECON_CONVERGENCE_THRESHOLD <= 0:
|
|
3255
3303
|
return
|
|
3256
|
-
streak = monitor.
|
|
3304
|
+
streak = monitor.consecutive_no_write_turns
|
|
3257
3305
|
if streak < PROXY_RECON_CONVERGENCE_THRESHOLD:
|
|
3258
3306
|
return
|
|
3259
3307
|
util = monitor.get_utilization()
|
|
3260
3308
|
if streak >= 2 * PROXY_RECON_CONVERGENCE_THRESHOLD:
|
|
3261
3309
|
directive = (
|
|
3262
3310
|
f"STOP exploring. You have run {streak} consecutive turns of "
|
|
3263
|
-
f"
|
|
3264
|
-
"You will NOT finish if you keep
|
|
3265
|
-
"deliverable NOW from the information you already
|
|
3266
|
-
"it to a file with the appropriate tool. Do not
|
|
3311
|
+
f"exploration without producing a deliverable and context is at "
|
|
3312
|
+
f"{util * 100:.0f}%. You will NOT finish if you keep exploring. "
|
|
3313
|
+
"Produce your deliverable NOW from the information you already "
|
|
3314
|
+
"have — write it to a file with the appropriate tool. Do not "
|
|
3315
|
+
"read or run anything else."
|
|
3267
3316
|
)
|
|
3268
3317
|
tier = "hard"
|
|
3269
3318
|
else:
|
|
3270
3319
|
directive = (
|
|
3271
|
-
f"You have
|
|
3320
|
+
f"You have explored for {streak} consecutive turns without "
|
|
3272
3321
|
f"producing a deliverable (context {util * 100:.0f}%). You have "
|
|
3273
3322
|
"enough to begin. Switch from exploration to synthesis: write "
|
|
3274
|
-
"your deliverable now.
|
|
3275
|
-
"strictly required to write it."
|
|
3323
|
+
"your deliverable now. Explore at most one more time, and only "
|
|
3324
|
+
"if strictly required to write it."
|
|
3276
3325
|
)
|
|
3277
3326
|
tier = "firm"
|
|
3278
3327
|
msgs = openai_body.get("messages", [])
|
|
3279
3328
|
msgs.append({"role": "user", "content": directive})
|
|
3280
3329
|
openai_body["messages"] = msgs
|
|
3281
3330
|
logger.warning(
|
|
3282
|
-
"RECON CONVERGENCE: injected %s directive (
|
|
3331
|
+
"RECON CONVERGENCE: injected %s directive (no_write_streak=%d, ctx=%.0f%%)",
|
|
3283
3332
|
tier, streak, util * 100,
|
|
3284
3333
|
)
|
|
3285
3334
|
|
|
@@ -3792,8 +3841,8 @@ def build_openai_request(
|
|
|
3792
3841
|
_apply_tool_call_grammar(openai_body, grammar_override=profile_grammar)
|
|
3793
3842
|
|
|
3794
3843
|
# Recon-convergence guardrail (B1) — runs on every built request so a
|
|
3795
|
-
# session wandering in
|
|
3796
|
-
# deliverable regardless of tool-turn phase.
|
|
3844
|
+
# session wandering in exploration without producing a write is nudged
|
|
3845
|
+
# toward its deliverable regardless of tool-turn phase.
|
|
3797
3846
|
_maybe_inject_recon_convergence(openai_body, monitor)
|
|
3798
3847
|
|
|
3799
3848
|
return openai_body
|
|
@@ -7560,7 +7609,8 @@ async def messages(request: Request):
|
|
|
7560
7609
|
target_frac * 100,
|
|
7561
7610
|
)
|
|
7562
7611
|
body = prune_conversation(
|
|
7563
|
-
body, ctx_window,
|
|
7612
|
+
body, ctx_window, monitor=monitor,
|
|
7613
|
+
target_fraction=target_frac, keep_last=keep_last,
|
|
7564
7614
|
)
|
|
7565
7615
|
monitor.prune_count += 1
|
|
7566
7616
|
# Option 4: Post-prune validation — verify actual reduction
|
|
@@ -7581,7 +7631,8 @@ async def messages(request: Request):
|
|
|
7581
7631
|
post_util * 100,
|
|
7582
7632
|
)
|
|
7583
7633
|
body = prune_conversation(
|
|
7584
|
-
body, ctx_window,
|
|
7634
|
+
body, ctx_window, monitor=monitor,
|
|
7635
|
+
target_fraction=0.35, keep_last=4,
|
|
7585
7636
|
)
|
|
7586
7637
|
monitor.prune_count += 1
|
|
7587
7638
|
estimated_tokens = estimate_total_tokens(body)
|
|
@@ -5375,10 +5375,13 @@ class TestSlotSaveRestore(unittest.TestCase):
|
|
|
5375
5375
|
|
|
5376
5376
|
class TestReconConvergence(unittest.TestCase):
|
|
5377
5377
|
"""Tests for the B1 recon-convergence guardrail — nudges a session
|
|
5378
|
-
stuck
|
|
5378
|
+
stuck exploring without producing a write toward its deliverable.
|
|
5379
5379
|
|
|
5380
|
-
|
|
5381
|
-
|
|
5380
|
+
The streak is defined as write-tool ABSENCE, not read-tool presence: a
|
|
5381
|
+
real recon agent explores via Bash/WebFetch/Agent, so an "all tools are
|
|
5382
|
+
recognized read-only" test never accumulates. Targets the observed
|
|
5383
|
+
failure: a 664-turn agentic recon task that explored for hours and
|
|
5384
|
+
never converged to the synthesis/write step."""
|
|
5382
5385
|
|
|
5383
5386
|
def setUp(self):
|
|
5384
5387
|
self._threshold = proxy.PROXY_RECON_CONVERGENCE_THRESHOLD
|
|
@@ -5387,37 +5390,60 @@ class TestReconConvergence(unittest.TestCase):
|
|
|
5387
5390
|
proxy.PROXY_RECON_CONVERGENCE_THRESHOLD = self._threshold
|
|
5388
5391
|
|
|
5389
5392
|
def test_readonly_turns_increment_the_streak(self):
|
|
5390
|
-
"""Consecutive turns using only read
|
|
5393
|
+
"""Consecutive turns using only read tools grow the streak."""
|
|
5391
5394
|
m = proxy.SessionMonitor(context_window=131072)
|
|
5392
5395
|
for _ in range(5):
|
|
5393
5396
|
m.record_tool_calls(["Read"])
|
|
5394
|
-
self.assertEqual(m.
|
|
5397
|
+
self.assertEqual(m.consecutive_no_write_turns, 5)
|
|
5395
5398
|
m.record_tool_calls(["Grep", "Glob"])
|
|
5396
|
-
self.assertEqual(m.
|
|
5399
|
+
self.assertEqual(m.consecutive_no_write_turns, 6)
|
|
5397
5400
|
|
|
5398
|
-
def
|
|
5401
|
+
def test_bash_and_webfetch_turns_increment_the_streak(self):
|
|
5402
|
+
"""The core fix: exploration via Bash/WebFetch/Agent — tools the old
|
|
5403
|
+
read-only allowlist did not recognize — must grow the streak. The
|
|
5404
|
+
old logic reset on every such turn, so the streak never built."""
|
|
5405
|
+
m = proxy.SessionMonitor(context_window=131072)
|
|
5406
|
+
m.record_tool_calls(["Bash"])
|
|
5407
|
+
m.record_tool_calls(["WebFetch"])
|
|
5408
|
+
m.record_tool_calls(["Agent"])
|
|
5409
|
+
m.record_tool_calls(["Read", "Bash"]) # mixed exploration, no write
|
|
5410
|
+
self.assertEqual(m.consecutive_no_write_turns, 4)
|
|
5411
|
+
|
|
5412
|
+
def test_write_tool_resets_the_streak(self):
|
|
5399
5413
|
"""A turn using a write/edit tool means the model converged toward
|
|
5400
|
-
|
|
5414
|
+
output — the streak resets to 0."""
|
|
5401
5415
|
m = proxy.SessionMonitor(context_window=131072)
|
|
5402
5416
|
for _ in range(10):
|
|
5403
|
-
m.record_tool_calls(["
|
|
5404
|
-
self.assertEqual(m.
|
|
5417
|
+
m.record_tool_calls(["Bash"])
|
|
5418
|
+
self.assertEqual(m.consecutive_no_write_turns, 10)
|
|
5405
5419
|
m.record_tool_calls(["Write"])
|
|
5406
|
-
self.assertEqual(m.
|
|
5420
|
+
self.assertEqual(m.consecutive_no_write_turns, 0)
|
|
5407
5421
|
|
|
5408
5422
|
def test_mixed_turn_with_one_write_resets(self):
|
|
5409
|
-
"""A turn mixing
|
|
5410
|
-
converging — any
|
|
5423
|
+
"""A turn mixing exploration and a write tool still counts as
|
|
5424
|
+
converging — any write tool resets."""
|
|
5411
5425
|
m = proxy.SessionMonitor(context_window=131072)
|
|
5412
5426
|
for _ in range(10):
|
|
5413
5427
|
m.record_tool_calls(["Read"])
|
|
5414
5428
|
m.record_tool_calls(["Read", "Edit"])
|
|
5415
|
-
self.assertEqual(m.
|
|
5429
|
+
self.assertEqual(m.consecutive_no_write_turns, 0)
|
|
5430
|
+
|
|
5431
|
+
def test_no_tool_turn_leaves_streak_unchanged(self):
|
|
5432
|
+
"""A plain-text turn (no tool calls) is neither exploration nor a
|
|
5433
|
+
write — it must leave the streak untouched, not reset it."""
|
|
5434
|
+
m = proxy.SessionMonitor(context_window=131072)
|
|
5435
|
+
for _ in range(7):
|
|
5436
|
+
m.record_tool_calls(["Bash"])
|
|
5437
|
+
self.assertEqual(m.consecutive_no_write_turns, 7)
|
|
5438
|
+
m.record_tool_calls([]) # plain-text turn
|
|
5439
|
+
self.assertEqual(m.consecutive_no_write_turns, 7)
|
|
5440
|
+
m.record_tool_calls(["Read"])
|
|
5441
|
+
self.assertEqual(m.consecutive_no_write_turns, 8)
|
|
5416
5442
|
|
|
5417
5443
|
def test_no_injection_below_threshold(self):
|
|
5418
5444
|
proxy.PROXY_RECON_CONVERGENCE_THRESHOLD = 40
|
|
5419
5445
|
m = proxy.SessionMonitor(context_window=131072)
|
|
5420
|
-
m.
|
|
5446
|
+
m.consecutive_no_write_turns = 39
|
|
5421
5447
|
body = {"messages": [{"role": "user", "content": "go"}]}
|
|
5422
5448
|
proxy._maybe_inject_recon_convergence(body, m)
|
|
5423
5449
|
self.assertEqual(len(body["messages"]), 1)
|
|
@@ -5425,7 +5451,7 @@ class TestReconConvergence(unittest.TestCase):
|
|
|
5425
5451
|
def test_firm_directive_at_threshold(self):
|
|
5426
5452
|
proxy.PROXY_RECON_CONVERGENCE_THRESHOLD = 40
|
|
5427
5453
|
m = proxy.SessionMonitor(context_window=131072)
|
|
5428
|
-
m.
|
|
5454
|
+
m.consecutive_no_write_turns = 45
|
|
5429
5455
|
m.last_input_tokens = 120000
|
|
5430
5456
|
body = {"messages": [{"role": "user", "content": "go"}]}
|
|
5431
5457
|
proxy._maybe_inject_recon_convergence(body, m)
|
|
@@ -5438,7 +5464,7 @@ class TestReconConvergence(unittest.TestCase):
|
|
|
5438
5464
|
"""Once the streak is 2x over threshold, escalate to a hard STOP."""
|
|
5439
5465
|
proxy.PROXY_RECON_CONVERGENCE_THRESHOLD = 40
|
|
5440
5466
|
m = proxy.SessionMonitor(context_window=131072)
|
|
5441
|
-
m.
|
|
5467
|
+
m.consecutive_no_write_turns = 80
|
|
5442
5468
|
m.last_input_tokens = 250000 # over budget — the real-incident shape
|
|
5443
5469
|
body = {"messages": [{"role": "user", "content": "go"}]}
|
|
5444
5470
|
proxy._maybe_inject_recon_convergence(body, m)
|
|
@@ -5448,7 +5474,112 @@ class TestReconConvergence(unittest.TestCase):
|
|
|
5448
5474
|
def test_disabled_when_threshold_zero(self):
|
|
5449
5475
|
proxy.PROXY_RECON_CONVERGENCE_THRESHOLD = 0
|
|
5450
5476
|
m = proxy.SessionMonitor(context_window=131072)
|
|
5451
|
-
m.
|
|
5477
|
+
m.consecutive_no_write_turns = 500
|
|
5452
5478
|
body = {"messages": [{"role": "user", "content": "go"}]}
|
|
5453
5479
|
proxy._maybe_inject_recon_convergence(body, m)
|
|
5454
5480
|
self.assertEqual(len(body["messages"]), 1)
|
|
5481
|
+
|
|
5482
|
+
|
|
5483
|
+
class TestPrunerRework(unittest.TestCase):
|
|
5484
|
+
"""Tests for the reworked context pruner (B2 + B3): contiguous
|
|
5485
|
+
monotonic prune boundary (cache-stable) + breadcrumb summary of the
|
|
5486
|
+
dropped block (findings retained)."""
|
|
5487
|
+
|
|
5488
|
+
@staticmethod
|
|
5489
|
+
def _tool_result_msg(idx: int, size: int = 4000) -> dict:
|
|
5490
|
+
return {
|
|
5491
|
+
"role": "user",
|
|
5492
|
+
"content": [
|
|
5493
|
+
{
|
|
5494
|
+
"type": "tool_result",
|
|
5495
|
+
"tool_use_id": f"toolu_{idx}",
|
|
5496
|
+
"content": f"FILE-{idx} " + ("x" * size),
|
|
5497
|
+
}
|
|
5498
|
+
],
|
|
5499
|
+
}
|
|
5500
|
+
|
|
5501
|
+
def _big_body(self, n_middle: int = 20) -> dict:
|
|
5502
|
+
msgs = [{"role": "user", "content": "recon task: analyze the repo"}]
|
|
5503
|
+
for i in range(n_middle):
|
|
5504
|
+
msgs.append({"role": "assistant", "content": f"reading file {i}"})
|
|
5505
|
+
msgs.append(self._tool_result_msg(i))
|
|
5506
|
+
msgs.append({"role": "user", "content": "continue"})
|
|
5507
|
+
return {"messages": msgs}
|
|
5508
|
+
|
|
5509
|
+
def test_prune_drop_count_is_monotonic(self):
|
|
5510
|
+
"""The per-session prune boundary only ever grows."""
|
|
5511
|
+
m = proxy.SessionMonitor(context_window=8192)
|
|
5512
|
+
proxy.prune_conversation(self._big_body(), 8192, monitor=m,
|
|
5513
|
+
target_fraction=0.5, keep_last=6)
|
|
5514
|
+
first = m.prune_drop_count
|
|
5515
|
+
self.assertGreater(first, 0)
|
|
5516
|
+
# A tighter target on the same body can only drop more, never fewer.
|
|
5517
|
+
proxy.prune_conversation(self._big_body(), 8192, monitor=m,
|
|
5518
|
+
target_fraction=0.25, keep_last=6)
|
|
5519
|
+
self.assertGreaterEqual(m.prune_drop_count, first)
|
|
5520
|
+
|
|
5521
|
+
def test_kept_middle_is_contiguous_suffix(self):
|
|
5522
|
+
"""The pruner drops a contiguous oldest block — the surviving
|
|
5523
|
+
middle messages are a contiguous suffix of the original middle,
|
|
5524
|
+
never a non-contiguous greedy pick."""
|
|
5525
|
+
m = proxy.SessionMonitor(context_window=8192)
|
|
5526
|
+
body = self._big_body()
|
|
5527
|
+
original = list(body["messages"])
|
|
5528
|
+
result = proxy.prune_conversation(body, 8192, monitor=m,
|
|
5529
|
+
target_fraction=0.5, keep_last=6)
|
|
5530
|
+
out = result["messages"]
|
|
5531
|
+
survivors = [msg for msg in out if msg in original]
|
|
5532
|
+
idxs = [original.index(msg) for msg in survivors]
|
|
5533
|
+
self.assertEqual(idxs, sorted(idxs))
|
|
5534
|
+
tail_idxs = [i for i in idxs if i > 0]
|
|
5535
|
+
if len(tail_idxs) > 1:
|
|
5536
|
+
self.assertEqual(
|
|
5537
|
+
tail_idxs, list(range(tail_idxs[0], tail_idxs[0] + len(tail_idxs)))
|
|
5538
|
+
)
|
|
5539
|
+
|
|
5540
|
+
def test_stable_output_when_boundary_does_not_advance(self):
|
|
5541
|
+
"""Cache-stability: pruning the same body twice with the same
|
|
5542
|
+
monitor yields byte-identical message lists — the second call
|
|
5543
|
+
seeds from the persisted boundary and does not advance it."""
|
|
5544
|
+
m = proxy.SessionMonitor(context_window=8192)
|
|
5545
|
+
first = proxy.prune_conversation(self._big_body(), 8192, monitor=m,
|
|
5546
|
+
target_fraction=0.5, keep_last=6)
|
|
5547
|
+
boundary_after_first = m.prune_drop_count
|
|
5548
|
+
second = proxy.prune_conversation(self._big_body(), 8192, monitor=m,
|
|
5549
|
+
target_fraction=0.5, keep_last=6)
|
|
5550
|
+
self.assertEqual(m.prune_drop_count, boundary_after_first)
|
|
5551
|
+
self.assertEqual(first["messages"], second["messages"])
|
|
5552
|
+
|
|
5553
|
+
def test_dropped_tool_results_become_breadcrumbs(self):
|
|
5554
|
+
"""Pruned tool-results survive as one-line breadcrumbs in the
|
|
5555
|
+
marker, not silently discarded."""
|
|
5556
|
+
dropped = [self._tool_result_msg(i) for i in range(3)]
|
|
5557
|
+
summary = proxy._summarize_pruned_block(dropped)
|
|
5558
|
+
self.assertIn("CONTEXT PRUNED", summary)
|
|
5559
|
+
self.assertIn("tool result", summary)
|
|
5560
|
+
self.assertIn("FILE-0", summary)
|
|
5561
|
+
self.assertIn("FILE-2", summary)
|
|
5562
|
+
|
|
5563
|
+
def test_summary_is_bounded_by_max_items(self):
|
|
5564
|
+
"""A huge dropped block does not produce an unbounded summary."""
|
|
5565
|
+
old = proxy._PRUNE_SUMMARY_MAX_ITEMS
|
|
5566
|
+
try:
|
|
5567
|
+
proxy._PRUNE_SUMMARY_MAX_ITEMS = 5
|
|
5568
|
+
dropped = [self._tool_result_msg(i) for i in range(40)]
|
|
5569
|
+
summary = proxy._summarize_pruned_block(dropped)
|
|
5570
|
+
self.assertEqual(summary.count("- tool result"), 5)
|
|
5571
|
+
self.assertIn("most recent 5 of 40", summary)
|
|
5572
|
+
finally:
|
|
5573
|
+
proxy._PRUNE_SUMMARY_MAX_ITEMS = old
|
|
5574
|
+
|
|
5575
|
+
def test_summarize_no_tool_results_falls_back_to_static_marker(self):
|
|
5576
|
+
"""A dropped block with no tool-results yields the plain static
|
|
5577
|
+
marker — no per-call varying text (cache-safe)."""
|
|
5578
|
+
dropped = [
|
|
5579
|
+
{"role": "assistant", "content": "thinking out loud"},
|
|
5580
|
+
{"role": "user", "content": "ok"},
|
|
5581
|
+
]
|
|
5582
|
+
summary = proxy._summarize_pruned_block(dropped)
|
|
5583
|
+
self.assertIn("CONTEXT PRUNED", summary)
|
|
5584
|
+
self.assertNotIn("tool result", summary)
|
|
5585
|
+
self.assertNotIn("most recent", summary)
|