@miller-tech/uap 1.20.48 → 1.20.50

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@miller-tech/uap",
3
- "version": "1.20.48",
3
+ "version": "1.20.50",
4
4
  "description": "Autonomous AI agent memory system with CLAUDE.md protocol enforcement",
5
5
  "type": "module",
6
6
  "main": "dist/index.js",
@@ -207,6 +207,19 @@ _READ_ONLY_TOOL_CLASS = frozenset({
207
207
  "search", "Search", "list_files", "ListFiles",
208
208
  })
209
209
 
210
+ # Tools that produce or mutate a deliverable. Using any of these in a turn
211
+ # means the agent is converging from exploration toward output, and resets
212
+ # the recon-convergence streak (B1). This is deliberately a SHORT allowlist
213
+ # of write tools, NOT a read-only denylist: exploration happens through an
214
+ # open-ended set of tools (Bash, WebFetch, Agent, ...) that cannot be
215
+ # enumerated, but "the agent produced a write" is a small, stable signal.
216
+ # Names are matched case-insensitively (callers lower() before lookup).
217
+ _WRITE_TOOL_CLASS = frozenset({
218
+ "write", "edit", "multiedit", "notebookedit",
219
+ "str_replace", "str_replace_editor", "str_replace_based_edit_tool",
220
+ "create_file", "applypatch", "apply_patch",
221
+ })
222
+
210
223
  PROXY_GUARDRAIL_RETRY = os.environ.get("PROXY_GUARDRAIL_RETRY", "on").lower() not in {
211
224
  "0",
212
225
  "false",
@@ -224,12 +237,15 @@ PROXY_FINALIZE_CONTINUATION_MAX = int(
224
237
  PROXY_FINALIZE_SESSION_HARD_CAP = int(
225
238
  os.environ.get("PROXY_FINALIZE_SESSION_HARD_CAP", "3")
226
239
  )
227
- # Recon-convergence guardrail: after this many consecutive turns of PURE
228
- # read-only exploration (Read/Grep/Glob/etc. no write/edit/deliverable
229
- # tool), the proxy injects a directive telling the model to stop exploring
230
- # and produce its deliverable. Targets the failure mode where an agentic
231
- # recon task reads files for hundreds of turns and never converges to the
240
+ # Recon-convergence guardrail: after this many consecutive turns that use
241
+ # tools but produce NO write/deliverable tool call (see _WRITE_TOOL_CLASS),
242
+ # the proxy injects a directive telling the model to stop exploring and
243
+ # produce its deliverable. Targets the failure mode where an agentic recon
244
+ # task explores for hundreds of turns and never converges to the
232
245
  # synthesis/write step (observed: 664-turn recon, no deliverable started).
246
+ # Defined as write-tool ABSENCE rather than read-tool presence: a real
247
+ # recon agent explores via Bash/WebFetch/Agent, not just Read/Grep, so a
248
+ # "all tools are recognized read-only" test never accumulates a streak.
233
249
  # 0 disables.
234
250
  PROXY_RECON_CONVERGENCE_THRESHOLD = int(
235
251
  os.environ.get("PROXY_RECON_CONVERGENCE_THRESHOLD", "40")
@@ -712,6 +728,7 @@ class SessionMonitor:
712
728
  peak_input_tokens: int = 0 # High-water mark
713
729
  prune_count: int = 0 # How many times pruning was triggered
714
730
  overflow_count: int = 0 # How many context overflow errors caught
731
+ prune_drop_count: int = 0 # monotonic: # of oldest middle msgs pruned (B3)
715
732
  context_history: list = field(default_factory=list) # Recent token counts
716
733
 
717
734
  # --- Token Loop Protection ---
@@ -726,7 +743,7 @@ class SessionMonitor:
726
743
  )
727
744
  loop_warnings_emitted: int = 0 # How many loop warnings sent to the model
728
745
  no_progress_streak: int = 0 # Forced tool turns without new tool_result
729
- consecutive_readonly_turns: int = 0 # turns of pure read-only exploration (B1)
746
+ consecutive_no_write_turns: int = 0 # turns exploring with no write tool (B1)
730
747
  unexpected_end_turn_count: int = 0 # end_turn without tool_use in active loop
731
748
  tool_starvation_streak: int = 0 # Consecutive forced turns with no tool_calls produced
732
749
  malformed_tool_streak: int = 0 # consecutive malformed pseudo tool payloads
@@ -884,15 +901,19 @@ class SessionMonitor:
884
901
  if len(self.tool_call_history) > 30:
885
902
  self.tool_call_history = self.tool_call_history[-30:]
886
903
 
887
- # Recon-convergence (B1): count consecutive turns of PURE read-only
888
- # exploration. A turn that uses any non-read-only tool (write, edit,
889
- # a deliverable tool) resets the streak — that's the model
890
- # converging from exploration toward synthesis/action.
891
- _ro = {n.lower() for n in _READ_ONLY_TOOL_CLASS}
892
- if tool_names and all(n.lower() in _ro for n in tool_names):
893
- self.consecutive_readonly_turns += 1
894
- else:
895
- self.consecutive_readonly_turns = 0
904
+ # Recon-convergence (B1): count consecutive turns that use tools but
905
+ # produce NO write/deliverable tool call. A turn that uses any write
906
+ # tool resets the streak — that's the model converging from
907
+ # exploration toward synthesis/output. A turn with no tool calls at
908
+ # all is a plain-text turn (neither exploration nor a write) and
909
+ # leaves the streak unchanged. This is the inverse of the old
910
+ # "all tools are recognized read-only" test, which reset on any
911
+ # Bash/WebFetch/Agent turn and so never accumulated for real agents.
912
+ if tool_names:
913
+ if any(n.lower() in _WRITE_TOOL_CLASS for n in tool_names):
914
+ self.consecutive_no_write_turns = 0
915
+ else:
916
+ self.consecutive_no_write_turns += 1
896
917
 
897
918
  # Track read-only tool targets for dedup (Option 3)
898
919
  if tool_targets:
@@ -1318,24 +1339,83 @@ def estimate_total_tokens(anthropic_body: dict) -> int:
1318
1339
  return tokens
1319
1340
 
1320
1341
 
1342
+ # Max tool-result breadcrumbs listed in a prune summary (B2). Bounds the
1343
+ # summary size — beyond this the oldest breadcrumbs are elided.
1344
+ _PRUNE_SUMMARY_MAX_ITEMS = int(os.environ.get("PROXY_PRUNE_SUMMARY_MAX_ITEMS", "30"))
1345
+
1346
+
1347
+ def _summarize_pruned_block(dropped: list[dict]) -> str:
1348
+ """Build a compact breadcrumb summary of pruned messages (B2).
1349
+
1350
+ Instead of discarding dropped tool-results outright, leave a one-line
1351
+ trace of each so the agent retains *what it already found*. A recon
1352
+ agent that can still see "I read auth_handler.cpp — JWT validation in
1353
+ validateToken()" is far likelier to converge to a synthesis than one
1354
+ whose findings vanished entirely and which therefore re-explores.
1355
+
1356
+ Heuristic only — no LLM call. Bounded to the most recent
1357
+ PROXY_PRUNE_SUMMARY_MAX_ITEMS tool-result breadcrumbs so the summary
1358
+ itself cannot grow unbounded.
1359
+ """
1360
+ breadcrumbs: list[str] = []
1361
+ for msg in dropped:
1362
+ content = msg.get("content", [])
1363
+ if not isinstance(content, list):
1364
+ continue
1365
+ for block in content:
1366
+ if isinstance(block, dict) and block.get("type") == "tool_result":
1367
+ text = _extract_text(block.get("content", "")).strip()
1368
+ if not text:
1369
+ continue
1370
+ excerpt = " ".join(text.split())[:100]
1371
+ breadcrumbs.append(
1372
+ f"- tool result (~{estimate_tokens(text)} tok): {excerpt}"
1373
+ )
1374
+ if not breadcrumbs:
1375
+ return (
1376
+ "[CONTEXT PRUNED: older messages were removed to fit the context "
1377
+ "window. The conversation continues from recent context below.]"
1378
+ )
1379
+ total = len(breadcrumbs)
1380
+ if total > _PRUNE_SUMMARY_MAX_ITEMS:
1381
+ breadcrumbs = breadcrumbs[-_PRUNE_SUMMARY_MAX_ITEMS:]
1382
+ header = (
1383
+ f"[CONTEXT PRUNED — {len(dropped)} older messages removed to fit the "
1384
+ "context window. Breadcrumbs of earlier findings"
1385
+ )
1386
+ if total > len(breadcrumbs):
1387
+ header += f" (most recent {len(breadcrumbs)} of {total} tool results)"
1388
+ header += " — rely on these instead of re-reading those files:]"
1389
+ return header + "\n" + "\n".join(breadcrumbs)
1390
+
1391
+
1321
1392
  def prune_conversation(
1322
1393
  anthropic_body: dict,
1323
1394
  context_window: int,
1395
+ monitor: "SessionMonitor | None" = None,
1324
1396
  target_fraction: float = 0.65,
1325
1397
  keep_last: int = 8,
1326
1398
  ) -> dict:
1327
1399
  """Prune the conversation to fit within the context window.
1328
1400
 
1329
- Strategy:
1330
- - Always keep: system prompt, first user message, last N messages
1331
- - Remove from the middle: oldest tool_result messages first (they're
1332
- the largest -- file contents, command output, etc.), then oldest
1333
- assistant messages, then oldest user messages.
1334
- - Inject a [CONTEXT PRUNED] marker so the model knows history was trimmed.
1401
+ Strategy (reworked — UAP PR #186):
1402
+ - Always keep: system prompt, first user message, last N messages.
1403
+ - Drop a CONTIGUOUS block of the oldest middle messages. The drop
1404
+ count is persisted per-session on the monitor (`prune_drop_count`)
1405
+ and is monotonic — it only ever grows. This keeps the retained
1406
+ region a stable recent *suffix*: on turns where the boundary does
1407
+ not advance, the upstream KV-cache prefix stays valid and the turn
1408
+ is not reprocessed. (The previous priority-greedy keep was
1409
+ non-contiguous and reshuffled the prompt mid-stream every turn,
1410
+ defeating the cache.)
1411
+ - Replace the dropped block with a breadcrumb summary (see
1412
+ _summarize_pruned_block) so the agent keeps its earlier findings.
1335
1413
 
1336
1414
  Args:
1337
1415
  anthropic_body: The full Anthropic request body
1338
1416
  context_window: Maximum context window in tokens
1417
+ monitor: SessionMonitor — carries the monotonic prune boundary.
1418
+ When None, pruning still works but is non-monotonic per call.
1339
1419
  target_fraction: Target utilization after pruning (0.0-1.0)
1340
1420
  keep_last: Number of recent messages to always keep (default 8)
1341
1421
 
@@ -1411,70 +1491,39 @@ def prune_conversation(
1411
1491
 
1412
1492
  remaining_budget = message_budget - protected_tokens
1413
1493
 
1414
- # Score middle messages for removal priority:
1415
- # - tool_result messages: remove first (biggest, least important historically)
1416
- # - assistant text-only: remove second
1417
- # - user messages: remove last (provide context for the model's actions)
1418
- # Within each category, remove oldest first.
1419
- scored_middle = []
1420
- for i, msg in enumerate(middle):
1421
- content = msg.get("content", [])
1422
- tokens = estimate_message_tokens(msg)
1423
- is_tool_result = False
1424
- is_assistant = msg.get("role") == "assistant"
1425
-
1426
- if isinstance(content, list):
1427
- is_tool_result = any(
1428
- isinstance(b, dict) and b.get("type") == "tool_result" for b in content
1429
- )
1430
-
1431
- # Lower priority = removed first
1432
- if is_tool_result:
1433
- priority = 0 # Remove first
1434
- elif is_assistant:
1435
- priority = 1 # Remove second
1436
- else:
1437
- priority = 2 # Remove last (user messages)
1438
-
1439
- scored_middle.append((priority, i, tokens, msg))
1440
-
1441
- # Sort by priority (ascending = remove first), then by index (oldest first)
1442
- scored_middle.sort(key=lambda x: (x[0], x[1]))
1443
-
1444
- # Greedily keep messages from highest priority (keep last) until budget fills
1445
- kept_middle = []
1446
- used_tokens = 0
1447
- # Process in reverse priority order (keep high-priority messages first)
1448
- for priority, idx, tokens, msg in reversed(scored_middle):
1449
- if used_tokens + tokens <= remaining_budget:
1450
- kept_middle.append((idx, msg))
1451
- used_tokens += tokens
1452
-
1453
- # Sort kept messages back into original order
1454
- kept_middle.sort(key=lambda x: x[0])
1455
- kept_msgs = [m for _, m in kept_middle]
1494
+ # --- Monotonic contiguous prune boundary (cache-stable, B3) ---
1495
+ # Drop the oldest `drop_count` middle messages as one contiguous block.
1496
+ # Seed from the monitor's persisted boundary; advance it only as far as
1497
+ # the budget forces. Persist back monotonically so a later/looser prune
1498
+ # in the same turn can't shrink it (which would reshuffle the prompt).
1499
+ drop_count = 0
1500
+ if monitor is not None:
1501
+ drop_count = min(max(0, monitor.prune_drop_count), len(middle))
1502
+ while drop_count < len(middle):
1503
+ kept_tokens = sum(estimate_message_tokens(m) for m in middle[drop_count:])
1504
+ if kept_tokens <= remaining_budget:
1505
+ break
1506
+ drop_count += 1
1507
+ if monitor is not None:
1508
+ monitor.prune_drop_count = max(monitor.prune_drop_count, drop_count)
1456
1509
 
1457
- removed_count = len(middle) - len(kept_msgs)
1458
- removed_tokens = sum(t for _, _, t, _ in scored_middle) - used_tokens
1510
+ dropped = middle[:drop_count]
1511
+ kept_msgs = middle[drop_count:]
1459
1512
 
1460
- if removed_count > 0:
1461
- # Insert a context-pruned marker
1513
+ if dropped:
1514
+ # Replace the dropped block with a findings-breadcrumb summary (B2).
1462
1515
  prune_marker = {
1463
1516
  "role": "user",
1464
- "content": (
1465
- f"[CONTEXT PRUNED: {removed_count} older messages (~{removed_tokens} tokens) "
1466
- f"were removed to fit within the context window. "
1467
- f"The conversation continues from recent context below.]"
1468
- ),
1517
+ "content": _summarize_pruned_block(dropped),
1469
1518
  }
1470
1519
  anthropic_body["messages"] = (
1471
1520
  protected_head + [prune_marker] + kept_msgs + protected_tail
1472
1521
  )
1473
1522
  logger.warning(
1474
- "PRUNED: removed %d messages (~%d tokens), kept %d messages, "
1475
- "target=%.0f%% of %d ctx",
1476
- removed_count,
1477
- removed_tokens,
1523
+ "PRUNED: dropped %d oldest middle messages (boundary=%d), "
1524
+ "kept %d total, target=%.0f%% of %d ctx",
1525
+ len(dropped),
1526
+ drop_count,
1478
1527
  len(anthropic_body["messages"]),
1479
1528
  target_fraction * 100,
1480
1529
  context_window,
@@ -3240,46 +3289,46 @@ def _resolve_state_machine_tool_choice(
3240
3289
 
3241
3290
 
3242
3291
  def _maybe_inject_recon_convergence(openai_body: dict, monitor: "SessionMonitor") -> None:
3243
- """Nudge a session stuck in prolonged read-only exploration toward its
3244
- deliverable.
3245
-
3246
- Fires when `consecutive_readonly_turns` crosses
3247
- PROXY_RECON_CONVERGENCE_THRESHOLD — the model has read files for many
3248
- turns without writing anything. Targets the observed failure mode of
3249
- an agentic recon task wandering for hundreds of turns and never
3250
- converging to the synthesis/write step. Two escalation tiers: a firm
3251
- "switch to synthesis" directive, then a hard "STOP, write it now" once
3252
- the streak is 2x over threshold.
3292
+ """Nudge a session stuck in prolonged exploration toward its deliverable.
3293
+
3294
+ Fires when `consecutive_no_write_turns` crosses
3295
+ PROXY_RECON_CONVERGENCE_THRESHOLD — the model has used tools for many
3296
+ turns without producing any write/deliverable tool call. Targets the
3297
+ observed failure mode of an agentic recon task wandering for hundreds
3298
+ of turns and never converging to the synthesis/write step. Two
3299
+ escalation tiers: a firm "switch to synthesis" directive, then a hard
3300
+ "STOP, write it now" once the streak is 2x over threshold.
3253
3301
  """
3254
3302
  if PROXY_RECON_CONVERGENCE_THRESHOLD <= 0:
3255
3303
  return
3256
- streak = monitor.consecutive_readonly_turns
3304
+ streak = monitor.consecutive_no_write_turns
3257
3305
  if streak < PROXY_RECON_CONVERGENCE_THRESHOLD:
3258
3306
  return
3259
3307
  util = monitor.get_utilization()
3260
3308
  if streak >= 2 * PROXY_RECON_CONVERGENCE_THRESHOLD:
3261
3309
  directive = (
3262
3310
  f"STOP exploring. You have run {streak} consecutive turns of "
3263
- f"read-only exploration and context is at {util * 100:.0f}%. "
3264
- "You will NOT finish if you keep reading files. Produce your "
3265
- "deliverable NOW from the information you already have — write "
3266
- "it to a file with the appropriate tool. Do not read anything else."
3311
+ f"exploration without producing a deliverable and context is at "
3312
+ f"{util * 100:.0f}%. You will NOT finish if you keep exploring. "
3313
+ "Produce your deliverable NOW from the information you already "
3314
+ "have — write it to a file with the appropriate tool. Do not "
3315
+ "read or run anything else."
3267
3316
  )
3268
3317
  tier = "hard"
3269
3318
  else:
3270
3319
  directive = (
3271
- f"You have read files for {streak} consecutive turns without "
3320
+ f"You have explored for {streak} consecutive turns without "
3272
3321
  f"producing a deliverable (context {util * 100:.0f}%). You have "
3273
3322
  "enough to begin. Switch from exploration to synthesis: write "
3274
- "your deliverable now. Read at most one more file, and only if "
3275
- "strictly required to write it."
3323
+ "your deliverable now. Explore at most one more time, and only "
3324
+ "if strictly required to write it."
3276
3325
  )
3277
3326
  tier = "firm"
3278
3327
  msgs = openai_body.get("messages", [])
3279
3328
  msgs.append({"role": "user", "content": directive})
3280
3329
  openai_body["messages"] = msgs
3281
3330
  logger.warning(
3282
- "RECON CONVERGENCE: injected %s directive (readonly_streak=%d, ctx=%.0f%%)",
3331
+ "RECON CONVERGENCE: injected %s directive (no_write_streak=%d, ctx=%.0f%%)",
3283
3332
  tier, streak, util * 100,
3284
3333
  )
3285
3334
 
@@ -3792,8 +3841,8 @@ def build_openai_request(
3792
3841
  _apply_tool_call_grammar(openai_body, grammar_override=profile_grammar)
3793
3842
 
3794
3843
  # Recon-convergence guardrail (B1) — runs on every built request so a
3795
- # session wandering in read-only exploration is nudged toward its
3796
- # deliverable regardless of tool-turn phase.
3844
+ # session wandering in exploration without producing a write is nudged
3845
+ # toward its deliverable regardless of tool-turn phase.
3797
3846
  _maybe_inject_recon_convergence(openai_body, monitor)
3798
3847
 
3799
3848
  return openai_body
@@ -7560,7 +7609,8 @@ async def messages(request: Request):
7560
7609
  target_frac * 100,
7561
7610
  )
7562
7611
  body = prune_conversation(
7563
- body, ctx_window, target_fraction=target_frac, keep_last=keep_last
7612
+ body, ctx_window, monitor=monitor,
7613
+ target_fraction=target_frac, keep_last=keep_last,
7564
7614
  )
7565
7615
  monitor.prune_count += 1
7566
7616
  # Option 4: Post-prune validation — verify actual reduction
@@ -7581,7 +7631,8 @@ async def messages(request: Request):
7581
7631
  post_util * 100,
7582
7632
  )
7583
7633
  body = prune_conversation(
7584
- body, ctx_window, target_fraction=0.35, keep_last=4
7634
+ body, ctx_window, monitor=monitor,
7635
+ target_fraction=0.35, keep_last=4,
7585
7636
  )
7586
7637
  monitor.prune_count += 1
7587
7638
  estimated_tokens = estimate_total_tokens(body)
@@ -5375,10 +5375,13 @@ class TestSlotSaveRestore(unittest.TestCase):
5375
5375
 
5376
5376
  class TestReconConvergence(unittest.TestCase):
5377
5377
  """Tests for the B1 recon-convergence guardrail — nudges a session
5378
- stuck doing read-only exploration toward producing its deliverable.
5378
+ stuck exploring without producing a write toward its deliverable.
5379
5379
 
5380
- Targets the observed failure: a 664-turn agentic recon task that read
5381
- files for hours and never converged to the synthesis/write step."""
5380
+ The streak is defined as write-tool ABSENCE, not read-tool presence: a
5381
+ real recon agent explores via Bash/WebFetch/Agent, so an "all tools are
5382
+ recognized read-only" test never accumulates. Targets the observed
5383
+ failure: a 664-turn agentic recon task that explored for hours and
5384
+ never converged to the synthesis/write step."""
5382
5385
 
5383
5386
  def setUp(self):
5384
5387
  self._threshold = proxy.PROXY_RECON_CONVERGENCE_THRESHOLD
@@ -5387,37 +5390,60 @@ class TestReconConvergence(unittest.TestCase):
5387
5390
  proxy.PROXY_RECON_CONVERGENCE_THRESHOLD = self._threshold
5388
5391
 
5389
5392
  def test_readonly_turns_increment_the_streak(self):
5390
- """Consecutive turns using only read-only tools grow the streak."""
5393
+ """Consecutive turns using only read tools grow the streak."""
5391
5394
  m = proxy.SessionMonitor(context_window=131072)
5392
5395
  for _ in range(5):
5393
5396
  m.record_tool_calls(["Read"])
5394
- self.assertEqual(m.consecutive_readonly_turns, 5)
5397
+ self.assertEqual(m.consecutive_no_write_turns, 5)
5395
5398
  m.record_tool_calls(["Grep", "Glob"])
5396
- self.assertEqual(m.consecutive_readonly_turns, 6)
5399
+ self.assertEqual(m.consecutive_no_write_turns, 6)
5397
5400
 
5398
- def test_non_readonly_tool_resets_the_streak(self):
5401
+ def test_bash_and_webfetch_turns_increment_the_streak(self):
5402
+ """The core fix: exploration via Bash/WebFetch/Agent — tools the old
5403
+ read-only allowlist did not recognize — must grow the streak. The
5404
+ old logic reset on every such turn, so the streak never built."""
5405
+ m = proxy.SessionMonitor(context_window=131072)
5406
+ m.record_tool_calls(["Bash"])
5407
+ m.record_tool_calls(["WebFetch"])
5408
+ m.record_tool_calls(["Agent"])
5409
+ m.record_tool_calls(["Read", "Bash"]) # mixed exploration, no write
5410
+ self.assertEqual(m.consecutive_no_write_turns, 4)
5411
+
5412
+ def test_write_tool_resets_the_streak(self):
5399
5413
  """A turn using a write/edit tool means the model converged toward
5400
- action — the streak resets to 0."""
5414
+ output — the streak resets to 0."""
5401
5415
  m = proxy.SessionMonitor(context_window=131072)
5402
5416
  for _ in range(10):
5403
- m.record_tool_calls(["Read"])
5404
- self.assertEqual(m.consecutive_readonly_turns, 10)
5417
+ m.record_tool_calls(["Bash"])
5418
+ self.assertEqual(m.consecutive_no_write_turns, 10)
5405
5419
  m.record_tool_calls(["Write"])
5406
- self.assertEqual(m.consecutive_readonly_turns, 0)
5420
+ self.assertEqual(m.consecutive_no_write_turns, 0)
5407
5421
 
5408
5422
  def test_mixed_turn_with_one_write_resets(self):
5409
- """A turn mixing read-only and a write tool still counts as
5410
- converging — any non-read-only tool resets."""
5423
+ """A turn mixing exploration and a write tool still counts as
5424
+ converging — any write tool resets."""
5411
5425
  m = proxy.SessionMonitor(context_window=131072)
5412
5426
  for _ in range(10):
5413
5427
  m.record_tool_calls(["Read"])
5414
5428
  m.record_tool_calls(["Read", "Edit"])
5415
- self.assertEqual(m.consecutive_readonly_turns, 0)
5429
+ self.assertEqual(m.consecutive_no_write_turns, 0)
5430
+
5431
+ def test_no_tool_turn_leaves_streak_unchanged(self):
5432
+ """A plain-text turn (no tool calls) is neither exploration nor a
5433
+ write — it must leave the streak untouched, not reset it."""
5434
+ m = proxy.SessionMonitor(context_window=131072)
5435
+ for _ in range(7):
5436
+ m.record_tool_calls(["Bash"])
5437
+ self.assertEqual(m.consecutive_no_write_turns, 7)
5438
+ m.record_tool_calls([]) # plain-text turn
5439
+ self.assertEqual(m.consecutive_no_write_turns, 7)
5440
+ m.record_tool_calls(["Read"])
5441
+ self.assertEqual(m.consecutive_no_write_turns, 8)
5416
5442
 
5417
5443
  def test_no_injection_below_threshold(self):
5418
5444
  proxy.PROXY_RECON_CONVERGENCE_THRESHOLD = 40
5419
5445
  m = proxy.SessionMonitor(context_window=131072)
5420
- m.consecutive_readonly_turns = 39
5446
+ m.consecutive_no_write_turns = 39
5421
5447
  body = {"messages": [{"role": "user", "content": "go"}]}
5422
5448
  proxy._maybe_inject_recon_convergence(body, m)
5423
5449
  self.assertEqual(len(body["messages"]), 1)
@@ -5425,7 +5451,7 @@ class TestReconConvergence(unittest.TestCase):
5425
5451
  def test_firm_directive_at_threshold(self):
5426
5452
  proxy.PROXY_RECON_CONVERGENCE_THRESHOLD = 40
5427
5453
  m = proxy.SessionMonitor(context_window=131072)
5428
- m.consecutive_readonly_turns = 45
5454
+ m.consecutive_no_write_turns = 45
5429
5455
  m.last_input_tokens = 120000
5430
5456
  body = {"messages": [{"role": "user", "content": "go"}]}
5431
5457
  proxy._maybe_inject_recon_convergence(body, m)
@@ -5438,7 +5464,7 @@ class TestReconConvergence(unittest.TestCase):
5438
5464
  """Once the streak is 2x over threshold, escalate to a hard STOP."""
5439
5465
  proxy.PROXY_RECON_CONVERGENCE_THRESHOLD = 40
5440
5466
  m = proxy.SessionMonitor(context_window=131072)
5441
- m.consecutive_readonly_turns = 80
5467
+ m.consecutive_no_write_turns = 80
5442
5468
  m.last_input_tokens = 250000 # over budget — the real-incident shape
5443
5469
  body = {"messages": [{"role": "user", "content": "go"}]}
5444
5470
  proxy._maybe_inject_recon_convergence(body, m)
@@ -5448,7 +5474,112 @@ class TestReconConvergence(unittest.TestCase):
5448
5474
  def test_disabled_when_threshold_zero(self):
5449
5475
  proxy.PROXY_RECON_CONVERGENCE_THRESHOLD = 0
5450
5476
  m = proxy.SessionMonitor(context_window=131072)
5451
- m.consecutive_readonly_turns = 500
5477
+ m.consecutive_no_write_turns = 500
5452
5478
  body = {"messages": [{"role": "user", "content": "go"}]}
5453
5479
  proxy._maybe_inject_recon_convergence(body, m)
5454
5480
  self.assertEqual(len(body["messages"]), 1)
5481
+
5482
+
5483
+ class TestPrunerRework(unittest.TestCase):
5484
+ """Tests for the reworked context pruner (B2 + B3): contiguous
5485
+ monotonic prune boundary (cache-stable) + breadcrumb summary of the
5486
+ dropped block (findings retained)."""
5487
+
5488
+ @staticmethod
5489
+ def _tool_result_msg(idx: int, size: int = 4000) -> dict:
5490
+ return {
5491
+ "role": "user",
5492
+ "content": [
5493
+ {
5494
+ "type": "tool_result",
5495
+ "tool_use_id": f"toolu_{idx}",
5496
+ "content": f"FILE-{idx} " + ("x" * size),
5497
+ }
5498
+ ],
5499
+ }
5500
+
5501
+ def _big_body(self, n_middle: int = 20) -> dict:
5502
+ msgs = [{"role": "user", "content": "recon task: analyze the repo"}]
5503
+ for i in range(n_middle):
5504
+ msgs.append({"role": "assistant", "content": f"reading file {i}"})
5505
+ msgs.append(self._tool_result_msg(i))
5506
+ msgs.append({"role": "user", "content": "continue"})
5507
+ return {"messages": msgs}
5508
+
5509
+ def test_prune_drop_count_is_monotonic(self):
5510
+ """The per-session prune boundary only ever grows."""
5511
+ m = proxy.SessionMonitor(context_window=8192)
5512
+ proxy.prune_conversation(self._big_body(), 8192, monitor=m,
5513
+ target_fraction=0.5, keep_last=6)
5514
+ first = m.prune_drop_count
5515
+ self.assertGreater(first, 0)
5516
+ # A tighter target on the same body can only drop more, never fewer.
5517
+ proxy.prune_conversation(self._big_body(), 8192, monitor=m,
5518
+ target_fraction=0.25, keep_last=6)
5519
+ self.assertGreaterEqual(m.prune_drop_count, first)
5520
+
5521
+ def test_kept_middle_is_contiguous_suffix(self):
5522
+ """The pruner drops a contiguous oldest block — the surviving
5523
+ middle messages are a contiguous suffix of the original middle,
5524
+ never a non-contiguous greedy pick."""
5525
+ m = proxy.SessionMonitor(context_window=8192)
5526
+ body = self._big_body()
5527
+ original = list(body["messages"])
5528
+ result = proxy.prune_conversation(body, 8192, monitor=m,
5529
+ target_fraction=0.5, keep_last=6)
5530
+ out = result["messages"]
5531
+ survivors = [msg for msg in out if msg in original]
5532
+ idxs = [original.index(msg) for msg in survivors]
5533
+ self.assertEqual(idxs, sorted(idxs))
5534
+ tail_idxs = [i for i in idxs if i > 0]
5535
+ if len(tail_idxs) > 1:
5536
+ self.assertEqual(
5537
+ tail_idxs, list(range(tail_idxs[0], tail_idxs[0] + len(tail_idxs)))
5538
+ )
5539
+
5540
+ def test_stable_output_when_boundary_does_not_advance(self):
5541
+ """Cache-stability: pruning the same body twice with the same
5542
+ monitor yields byte-identical message lists — the second call
5543
+ seeds from the persisted boundary and does not advance it."""
5544
+ m = proxy.SessionMonitor(context_window=8192)
5545
+ first = proxy.prune_conversation(self._big_body(), 8192, monitor=m,
5546
+ target_fraction=0.5, keep_last=6)
5547
+ boundary_after_first = m.prune_drop_count
5548
+ second = proxy.prune_conversation(self._big_body(), 8192, monitor=m,
5549
+ target_fraction=0.5, keep_last=6)
5550
+ self.assertEqual(m.prune_drop_count, boundary_after_first)
5551
+ self.assertEqual(first["messages"], second["messages"])
5552
+
5553
+ def test_dropped_tool_results_become_breadcrumbs(self):
5554
+ """Pruned tool-results survive as one-line breadcrumbs in the
5555
+ marker, not silently discarded."""
5556
+ dropped = [self._tool_result_msg(i) for i in range(3)]
5557
+ summary = proxy._summarize_pruned_block(dropped)
5558
+ self.assertIn("CONTEXT PRUNED", summary)
5559
+ self.assertIn("tool result", summary)
5560
+ self.assertIn("FILE-0", summary)
5561
+ self.assertIn("FILE-2", summary)
5562
+
5563
+ def test_summary_is_bounded_by_max_items(self):
5564
+ """A huge dropped block does not produce an unbounded summary."""
5565
+ old = proxy._PRUNE_SUMMARY_MAX_ITEMS
5566
+ try:
5567
+ proxy._PRUNE_SUMMARY_MAX_ITEMS = 5
5568
+ dropped = [self._tool_result_msg(i) for i in range(40)]
5569
+ summary = proxy._summarize_pruned_block(dropped)
5570
+ self.assertEqual(summary.count("- tool result"), 5)
5571
+ self.assertIn("most recent 5 of 40", summary)
5572
+ finally:
5573
+ proxy._PRUNE_SUMMARY_MAX_ITEMS = old
5574
+
5575
+ def test_summarize_no_tool_results_falls_back_to_static_marker(self):
5576
+ """A dropped block with no tool-results yields the plain static
5577
+ marker — no per-call varying text (cache-safe)."""
5578
+ dropped = [
5579
+ {"role": "assistant", "content": "thinking out loud"},
5580
+ {"role": "user", "content": "ok"},
5581
+ ]
5582
+ summary = proxy._summarize_pruned_block(dropped)
5583
+ self.assertIn("CONTEXT PRUNED", summary)
5584
+ self.assertNotIn("tool result", summary)
5585
+ self.assertNotIn("most recent", summary)