@miller-tech/uap 1.20.47 → 1.20.49

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@miller-tech/uap",
3
- "version": "1.20.47",
3
+ "version": "1.20.49",
4
4
  "description": "Autonomous AI agent memory system with CLAUDE.md protocol enforcement",
5
5
  "type": "module",
6
6
  "main": "dist/index.js",
@@ -224,6 +224,16 @@ PROXY_FINALIZE_CONTINUATION_MAX = int(
224
224
  PROXY_FINALIZE_SESSION_HARD_CAP = int(
225
225
  os.environ.get("PROXY_FINALIZE_SESSION_HARD_CAP", "3")
226
226
  )
227
+ # Recon-convergence guardrail: after this many consecutive turns of PURE
228
+ # read-only exploration (Read/Grep/Glob/etc. — no write/edit/deliverable
229
+ # tool), the proxy injects a directive telling the model to stop exploring
230
+ # and produce its deliverable. Targets the failure mode where an agentic
231
+ # recon task reads files for hundreds of turns and never converges to the
232
+ # synthesis/write step (observed: 664-turn recon, no deliverable started).
233
+ # 0 disables.
234
+ PROXY_RECON_CONVERGENCE_THRESHOLD = int(
235
+ os.environ.get("PROXY_RECON_CONVERGENCE_THRESHOLD", "40")
236
+ )
227
237
  PROXY_STREAM_REASONING_FALLBACK = (
228
238
  os.environ.get("PROXY_STREAM_REASONING_FALLBACK", "off").strip().lower()
229
239
  )
@@ -702,6 +712,7 @@ class SessionMonitor:
702
712
  peak_input_tokens: int = 0 # High-water mark
703
713
  prune_count: int = 0 # How many times pruning was triggered
704
714
  overflow_count: int = 0 # How many context overflow errors caught
715
+ prune_drop_count: int = 0 # monotonic: # of oldest middle msgs pruned (B3)
705
716
  context_history: list = field(default_factory=list) # Recent token counts
706
717
 
707
718
  # --- Token Loop Protection ---
@@ -716,6 +727,7 @@ class SessionMonitor:
716
727
  )
717
728
  loop_warnings_emitted: int = 0 # How many loop warnings sent to the model
718
729
  no_progress_streak: int = 0 # Forced tool turns without new tool_result
730
+ consecutive_readonly_turns: int = 0 # turns of pure read-only exploration (B1)
719
731
  unexpected_end_turn_count: int = 0 # end_turn without tool_use in active loop
720
732
  tool_starvation_streak: int = 0 # Consecutive forced turns with no tool_calls produced
721
733
  malformed_tool_streak: int = 0 # consecutive malformed pseudo tool payloads
@@ -873,6 +885,16 @@ class SessionMonitor:
873
885
  if len(self.tool_call_history) > 30:
874
886
  self.tool_call_history = self.tool_call_history[-30:]
875
887
 
888
+ # Recon-convergence (B1): count consecutive turns of PURE read-only
889
+ # exploration. A turn that uses any non-read-only tool (write, edit,
890
+ # a deliverable tool) resets the streak — that's the model
891
+ # converging from exploration toward synthesis/action.
892
+ _ro = {n.lower() for n in _READ_ONLY_TOOL_CLASS}
893
+ if tool_names and all(n.lower() in _ro for n in tool_names):
894
+ self.consecutive_readonly_turns += 1
895
+ else:
896
+ self.consecutive_readonly_turns = 0
897
+
876
898
  # Track read-only tool targets for dedup (Option 3)
877
899
  if tool_targets:
878
900
  for name, target in tool_targets.items():
@@ -1297,24 +1319,83 @@ def estimate_total_tokens(anthropic_body: dict) -> int:
1297
1319
  return tokens
1298
1320
 
1299
1321
 
1322
# Max tool-result breadcrumbs listed in a prune summary (B2). Bounds the
# summary size — beyond this the oldest breadcrumbs are elided. A value
# <= 0 disables breadcrumbs: the plain static marker is emitted instead.
_PRUNE_SUMMARY_MAX_ITEMS = int(os.environ.get("PROXY_PRUNE_SUMMARY_MAX_ITEMS", "30"))


def _summarize_pruned_block(dropped: list[dict]) -> str:
    """Build a compact breadcrumb summary of pruned messages (B2).

    Instead of discarding dropped tool-results outright, leave a one-line
    trace of each so the agent retains *what it already found*. A recon
    agent that can still see "I read auth_handler.cpp — JWT validation in
    validateToken()" is far likelier to converge to a synthesis than one
    whose findings vanished entirely and which therefore re-explores.

    Heuristic only — no LLM call. Bounded to the most recent
    ``_PRUNE_SUMMARY_MAX_ITEMS`` tool-result breadcrumbs so the summary
    itself cannot grow unbounded.

    Args:
        dropped: The messages being removed from the conversation. Only
            messages whose ``content`` is a list are inspected, and within
            those only dict blocks with ``type == "tool_result"``.

    Returns:
        A static "[CONTEXT PRUNED: ...]" marker when there is nothing to
        list (no non-empty tool results, or the cap is <= 0); otherwise a
        header line followed by one ``- tool result (...)`` line per
        retained breadcrumb, oldest first.
    """
    static_marker = (
        "[CONTEXT PRUNED: older messages were removed to fit the context "
        "window. The conversation continues from recent context below.]"
    )

    # Read the cap at call time so runtime/test overrides of the module
    # constant take effect. A non-positive cap must short-circuit: slicing
    # with ``[-0:]`` would return the *whole* list (defeating the bound) and
    # a negative cap would drop the newest breadcrumbs instead of the oldest.
    max_items = _PRUNE_SUMMARY_MAX_ITEMS
    if max_items <= 0:
        return static_marker

    breadcrumbs: list[str] = []
    for msg in dropped:
        content = msg.get("content", [])
        if not isinstance(content, list):
            # Plain-string content carries no tool_result blocks.
            continue
        for block in content:
            if not (isinstance(block, dict) and block.get("type") == "tool_result"):
                continue
            # NOTE(review): _extract_text presumably flattens a tool_result
            # payload (string or block list) to plain text — confirm.
            text = _extract_text(block.get("content", "")).strip()
            if not text:
                continue
            # Collapse all whitespace runs so each breadcrumb is one line,
            # then cap the excerpt at 100 characters.
            excerpt = " ".join(text.split())[:100]
            breadcrumbs.append(
                f"- tool result (~{estimate_tokens(text)} tok): {excerpt}"
            )

    if not breadcrumbs:
        return static_marker

    total = len(breadcrumbs)
    if total > max_items:
        # Keep the newest ``max_items`` entries; the start index is always
        # >= 1 here, so no negative-zero slicing pitfall.
        breadcrumbs = breadcrumbs[total - max_items:]

    header = (
        f"[CONTEXT PRUNED — {len(dropped)} older messages removed to fit the "
        "context window. Breadcrumbs of earlier findings"
    )
    if total > len(breadcrumbs):
        header += f" (most recent {len(breadcrumbs)} of {total} tool results)"
    header += " — rely on these instead of re-reading those files:]"
    return header + "\n" + "\n".join(breadcrumbs)
1370
+
1371
+
1300
1372
  def prune_conversation(
1301
1373
  anthropic_body: dict,
1302
1374
  context_window: int,
1375
+ monitor: "SessionMonitor | None" = None,
1303
1376
  target_fraction: float = 0.65,
1304
1377
  keep_last: int = 8,
1305
1378
  ) -> dict:
1306
1379
  """Prune the conversation to fit within the context window.
1307
1380
 
1308
- Strategy:
1309
- - Always keep: system prompt, first user message, last N messages
1310
- - Remove from the middle: oldest tool_result messages first (they're
1311
- the largest -- file contents, command output, etc.), then oldest
1312
- assistant messages, then oldest user messages.
1313
- - Inject a [CONTEXT PRUNED] marker so the model knows history was trimmed.
1381
+ Strategy (reworked — UAP PR #186):
1382
+ - Always keep: system prompt, first user message, last N messages.
1383
+ - Drop a CONTIGUOUS block of the oldest middle messages. The drop
1384
+ count is persisted per-session on the monitor (`prune_drop_count`)
1385
+ and is monotonic — it only ever grows. This keeps the retained
1386
+ region a stable recent *suffix*: on turns where the boundary does
1387
+ not advance, the upstream KV-cache prefix stays valid and the turn
1388
+ is not reprocessed. (The previous priority-greedy keep was
1389
+ non-contiguous and reshuffled the prompt mid-stream every turn,
1390
+ defeating the cache.)
1391
+ - Replace the dropped block with a breadcrumb summary (see
1392
+ _summarize_pruned_block) so the agent keeps its earlier findings.
1314
1393
 
1315
1394
  Args:
1316
1395
  anthropic_body: The full Anthropic request body
1317
1396
  context_window: Maximum context window in tokens
1397
+ monitor: SessionMonitor — carries the monotonic prune boundary.
1398
+ When None, pruning still works but is non-monotonic per call.
1318
1399
  target_fraction: Target utilization after pruning (0.0-1.0)
1319
1400
  keep_last: Number of recent messages to always keep (default 8)
1320
1401
 
@@ -1390,70 +1471,39 @@ def prune_conversation(
1390
1471
 
1391
1472
  remaining_budget = message_budget - protected_tokens
1392
1473
 
1393
- # Score middle messages for removal priority:
1394
- # - tool_result messages: remove first (biggest, least important historically)
1395
- # - assistant text-only: remove second
1396
- # - user messages: remove last (provide context for the model's actions)
1397
- # Within each category, remove oldest first.
1398
- scored_middle = []
1399
- for i, msg in enumerate(middle):
1400
- content = msg.get("content", [])
1401
- tokens = estimate_message_tokens(msg)
1402
- is_tool_result = False
1403
- is_assistant = msg.get("role") == "assistant"
1404
-
1405
- if isinstance(content, list):
1406
- is_tool_result = any(
1407
- isinstance(b, dict) and b.get("type") == "tool_result" for b in content
1408
- )
1409
-
1410
- # Lower priority = removed first
1411
- if is_tool_result:
1412
- priority = 0 # Remove first
1413
- elif is_assistant:
1414
- priority = 1 # Remove second
1415
- else:
1416
- priority = 2 # Remove last (user messages)
1417
-
1418
- scored_middle.append((priority, i, tokens, msg))
1419
-
1420
- # Sort by priority (ascending = remove first), then by index (oldest first)
1421
- scored_middle.sort(key=lambda x: (x[0], x[1]))
1422
-
1423
- # Greedily keep messages from highest priority (keep last) until budget fills
1424
- kept_middle = []
1425
- used_tokens = 0
1426
- # Process in reverse priority order (keep high-priority messages first)
1427
- for priority, idx, tokens, msg in reversed(scored_middle):
1428
- if used_tokens + tokens <= remaining_budget:
1429
- kept_middle.append((idx, msg))
1430
- used_tokens += tokens
1431
-
1432
- # Sort kept messages back into original order
1433
- kept_middle.sort(key=lambda x: x[0])
1434
- kept_msgs = [m for _, m in kept_middle]
1474
+ # --- Monotonic contiguous prune boundary (cache-stable, B3) ---
1475
+ # Drop the oldest `drop_count` middle messages as one contiguous block.
1476
+ # Seed from the monitor's persisted boundary; advance it only as far as
1477
+ # the budget forces. Persist back monotonically so a later/looser prune
1478
+ # in the same turn can't shrink it (which would reshuffle the prompt).
1479
+ drop_count = 0
1480
+ if monitor is not None:
1481
+ drop_count = min(max(0, monitor.prune_drop_count), len(middle))
1482
+ while drop_count < len(middle):
1483
+ kept_tokens = sum(estimate_message_tokens(m) for m in middle[drop_count:])
1484
+ if kept_tokens <= remaining_budget:
1485
+ break
1486
+ drop_count += 1
1487
+ if monitor is not None:
1488
+ monitor.prune_drop_count = max(monitor.prune_drop_count, drop_count)
1435
1489
 
1436
- removed_count = len(middle) - len(kept_msgs)
1437
- removed_tokens = sum(t for _, _, t, _ in scored_middle) - used_tokens
1490
+ dropped = middle[:drop_count]
1491
+ kept_msgs = middle[drop_count:]
1438
1492
 
1439
- if removed_count > 0:
1440
- # Insert a context-pruned marker
1493
+ if dropped:
1494
+ # Replace the dropped block with a findings-breadcrumb summary (B2).
1441
1495
  prune_marker = {
1442
1496
  "role": "user",
1443
- "content": (
1444
- f"[CONTEXT PRUNED: {removed_count} older messages (~{removed_tokens} tokens) "
1445
- f"were removed to fit within the context window. "
1446
- f"The conversation continues from recent context below.]"
1447
- ),
1497
+ "content": _summarize_pruned_block(dropped),
1448
1498
  }
1449
1499
  anthropic_body["messages"] = (
1450
1500
  protected_head + [prune_marker] + kept_msgs + protected_tail
1451
1501
  )
1452
1502
  logger.warning(
1453
- "PRUNED: removed %d messages (~%d tokens), kept %d messages, "
1454
- "target=%.0f%% of %d ctx",
1455
- removed_count,
1456
- removed_tokens,
1503
+ "PRUNED: dropped %d oldest middle messages (boundary=%d), "
1504
+ "kept %d total, target=%.0f%% of %d ctx",
1505
+ len(dropped),
1506
+ drop_count,
1457
1507
  len(anthropic_body["messages"]),
1458
1508
  target_fraction * 100,
1459
1509
  context_window,
@@ -3218,6 +3268,51 @@ def _resolve_state_machine_tool_choice(
3218
3268
  return None, "unknown_phase"
3219
3269
 
3220
3270
 
3271
def _maybe_inject_recon_convergence(openai_body: dict, monitor: "SessionMonitor") -> None:
    """Append a "stop exploring, write the deliverable" directive when the
    session has been in read-only exploration for too long.

    The directive is appended as a trailing ``user`` message on every call
    for which ``monitor.consecutive_readonly_turns`` is at or above
    PROXY_RECON_CONVERGENCE_THRESHOLD (a threshold <= 0 disables the
    guardrail). Two tiers:

    - "firm": streak in [threshold, 2 * threshold) — switch to synthesis,
      at most one more read.
    - "hard": streak >= 2 * threshold — STOP, write the deliverable now.

    Args:
        openai_body: Outgoing request body; its ``messages`` list is
            mutated in place (and created if absent).
        monitor: Session state supplying the read-only streak and the
            current context utilization (via ``get_utilization()``).
    """
    threshold = PROXY_RECON_CONVERGENCE_THRESHOLD
    # Short-circuit keeps the original evaluation order: a disabled
    # guardrail never touches the monitor.
    if threshold <= 0 or monitor.consecutive_readonly_turns < threshold:
        return

    streak = monitor.consecutive_readonly_turns
    pct = monitor.get_utilization() * 100

    if streak < 2 * threshold:
        tier = "firm"
        directive = (
            f"You have read files for {streak} consecutive turns without "
            f"producing a deliverable (context {pct:.0f}%). You have "
            "enough to begin. Switch from exploration to synthesis: write "
            "your deliverable now. Read at most one more file, and only if "
            "strictly required to write it."
        )
    else:
        tier = "hard"
        directive = (
            f"STOP exploring. You have run {streak} consecutive turns of "
            f"read-only exploration and context is at {pct:.0f}%. "
            "You will NOT finish if you keep reading files. Produce your "
            "deliverable NOW from the information you already have — write "
            "it to a file with the appropriate tool. Do not read anything else."
        )

    # Reuse the existing message list when present; otherwise install one.
    openai_body.setdefault("messages", []).append(
        {"role": "user", "content": directive}
    )
    logger.warning(
        "RECON CONVERGENCE: injected %s directive (readonly_streak=%d, ctx=%.0f%%)",
        tier,
        streak,
        pct,
    )
3314
+
3315
+
3221
3316
  def build_openai_request(
3222
3317
  anthropic_body: dict,
3223
3318
  monitor: SessionMonitor,
@@ -3725,6 +3820,11 @@ def build_openai_request(
3725
3820
 
3726
3821
  _apply_tool_call_grammar(openai_body, grammar_override=profile_grammar)
3727
3822
 
3823
+ # Recon-convergence guardrail (B1) — runs on every built request so a
3824
+ # session wandering in read-only exploration is nudged toward its
3825
+ # deliverable regardless of tool-turn phase.
3826
+ _maybe_inject_recon_convergence(openai_body, monitor)
3827
+
3728
3828
  return openai_body
3729
3829
 
3730
3830
 
@@ -7489,7 +7589,8 @@ async def messages(request: Request):
7489
7589
  target_frac * 100,
7490
7590
  )
7491
7591
  body = prune_conversation(
7492
- body, ctx_window, target_fraction=target_frac, keep_last=keep_last
7592
+ body, ctx_window, monitor=monitor,
7593
+ target_fraction=target_frac, keep_last=keep_last,
7493
7594
  )
7494
7595
  monitor.prune_count += 1
7495
7596
  # Option 4: Post-prune validation — verify actual reduction
@@ -7510,7 +7611,8 @@ async def messages(request: Request):
7510
7611
  post_util * 100,
7511
7612
  )
7512
7613
  body = prune_conversation(
7513
- body, ctx_window, target_fraction=0.35, keep_last=4
7614
+ body, ctx_window, monitor=monitor,
7615
+ target_fraction=0.35, keep_last=4,
7514
7616
  )
7515
7617
  monitor.prune_count += 1
7516
7618
  estimated_tokens = estimate_total_tokens(body)
@@ -5371,3 +5371,189 @@ class TestSlotSaveRestore(unittest.TestCase):
5371
5371
  self.assertIn("fp:owner", proxy._slot_lru)
5372
5372
  self.assertIn("fp:new1", proxy._slot_lru)
5373
5373
  self.assertIn("fp:new2", proxy._slot_lru)
5374
+
5375
+
5376
class TestReconConvergence(unittest.TestCase):
    """Tests for the B1 recon-convergence guardrail — nudges a session
    stuck doing read-only exploration toward producing its deliverable.

    Targets the observed failure: a 664-turn agentic recon task that read
    files for hours and never converged to the synthesis/write step.

    The injection tests pin BOTH edges of each tier (threshold - 1,
    threshold, 2x threshold - 1, 2x threshold) so an off-by-one in the
    ``<`` / ``>=`` comparisons cannot slip through.
    """

    THRESHOLD = 40

    def setUp(self):
        # Remember the module-level threshold; individual tests override it.
        self._threshold = proxy.PROXY_RECON_CONVERGENCE_THRESHOLD

    def tearDown(self):
        proxy.PROXY_RECON_CONVERGENCE_THRESHOLD = self._threshold

    def _inject(self, streak, input_tokens=None):
        """Run the guardrail on a one-message body and return its messages."""
        m = proxy.SessionMonitor(context_window=131072)
        m.consecutive_readonly_turns = streak
        if input_tokens is not None:
            m.last_input_tokens = input_tokens
        body = {"messages": [{"role": "user", "content": "go"}]}
        proxy._maybe_inject_recon_convergence(body, m)
        return body["messages"]

    def test_readonly_turns_increment_the_streak(self):
        """Consecutive turns using only read-only tools grow the streak."""
        m = proxy.SessionMonitor(context_window=131072)
        for _ in range(5):
            m.record_tool_calls(["Read"])
        self.assertEqual(m.consecutive_readonly_turns, 5)
        m.record_tool_calls(["Grep", "Glob"])
        self.assertEqual(m.consecutive_readonly_turns, 6)

    def test_non_readonly_tool_resets_the_streak(self):
        """A turn using a write/edit tool means the model converged toward
        action — the streak resets to 0."""
        m = proxy.SessionMonitor(context_window=131072)
        for _ in range(10):
            m.record_tool_calls(["Read"])
        self.assertEqual(m.consecutive_readonly_turns, 10)
        m.record_tool_calls(["Write"])
        self.assertEqual(m.consecutive_readonly_turns, 0)

    def test_mixed_turn_with_one_write_resets(self):
        """A turn mixing read-only and a write tool still counts as
        converging — any non-read-only tool resets."""
        m = proxy.SessionMonitor(context_window=131072)
        for _ in range(10):
            m.record_tool_calls(["Read"])
        m.record_tool_calls(["Read", "Edit"])
        self.assertEqual(m.consecutive_readonly_turns, 0)

    def test_no_injection_below_threshold(self):
        """Any streak strictly below the threshold leaves the body alone."""
        proxy.PROXY_RECON_CONVERGENCE_THRESHOLD = self.THRESHOLD
        for streak in (0, self.THRESHOLD - 1):
            with self.subTest(streak=streak):
                self.assertEqual(len(self._inject(streak)), 1)

    def test_firm_directive_at_threshold(self):
        """From exactly the threshold up to just under 2x, the firm
        "switch to synthesis" directive is appended as a user message."""
        proxy.PROXY_RECON_CONVERGENCE_THRESHOLD = self.THRESHOLD
        for streak in (self.THRESHOLD, 45, 2 * self.THRESHOLD - 1):
            with self.subTest(streak=streak):
                msgs = self._inject(streak, input_tokens=120000)
                self.assertEqual(len(msgs), 2)
                self.assertEqual(msgs[-1]["role"], "user")
                injected = msgs[-1]["content"]
                self.assertIn("synthesis", injected.lower())
                self.assertNotIn("STOP exploring", injected)

    def test_hard_directive_at_2x_threshold(self):
        """Once the streak reaches 2x the threshold, escalate to a hard STOP."""
        proxy.PROXY_RECON_CONVERGENCE_THRESHOLD = self.THRESHOLD
        for streak in (2 * self.THRESHOLD, 2 * self.THRESHOLD + 1):
            with self.subTest(streak=streak):
                # 250000 tokens: over budget — the real-incident shape.
                msgs = self._inject(streak, input_tokens=250000)
                self.assertEqual(len(msgs), 2)
                self.assertIn("STOP exploring", msgs[-1]["content"])

    def test_disabled_when_threshold_zero(self):
        """A threshold of 0 disables the guardrail regardless of streak."""
        proxy.PROXY_RECON_CONVERGENCE_THRESHOLD = 0
        self.assertEqual(len(self._inject(500)), 1)
5455
+
5456
+
5457
class TestPrunerRework(unittest.TestCase):
    """Tests for the reworked context pruner (B2 + B3): contiguous
    monotonic prune boundary (cache-stable) + breadcrumb summary of the
    dropped block (findings retained).

    The structural tests first assert that pruning actually happened;
    otherwise a no-op pruner would satisfy "contiguous" and "stable"
    trivially.
    """

    @staticmethod
    def _tool_result_msg(idx: int, size: int = 4000) -> dict:
        """Return a user message holding one large, uniquely-tagged tool_result."""
        return {
            "role": "user",
            "content": [
                {
                    "type": "tool_result",
                    "tool_use_id": f"toolu_{idx}",
                    "content": f"FILE-{idx} " + ("x" * size),
                }
            ],
        }

    def _big_body(self, n_middle: int = 20) -> dict:
        """Return a request body far larger than the 8192-token test window:
        a task message, ``n_middle`` assistant/tool_result pairs, and a
        trailing user turn."""
        msgs = [{"role": "user", "content": "recon task: analyze the repo"}]
        for i in range(n_middle):
            msgs.append({"role": "assistant", "content": f"reading file {i}"})
            msgs.append(self._tool_result_msg(i))
        msgs.append({"role": "user", "content": "continue"})
        return {"messages": msgs}

    def test_prune_drop_count_is_monotonic(self):
        """The per-session prune boundary only ever grows."""
        m = proxy.SessionMonitor(context_window=8192)
        proxy.prune_conversation(self._big_body(), 8192, monitor=m,
                                 target_fraction=0.5, keep_last=6)
        first = m.prune_drop_count
        self.assertGreater(first, 0)
        # A tighter target on the same body can only drop more, never fewer.
        proxy.prune_conversation(self._big_body(), 8192, monitor=m,
                                 target_fraction=0.25, keep_last=6)
        self.assertGreaterEqual(m.prune_drop_count, first)

    def test_kept_middle_is_contiguous_suffix(self):
        """The pruner drops a contiguous oldest block — the surviving
        middle messages are a contiguous suffix of the original middle,
        never a non-contiguous greedy pick."""
        m = proxy.SessionMonitor(context_window=8192)
        body = self._big_body()
        # Snapshot before the call: the pruner may mutate ``body`` in place.
        original = list(body["messages"])
        result = proxy.prune_conversation(body, 8192, monitor=m,
                                          target_fraction=0.5, keep_last=6)
        # Guard against a vacuous pass: something must have been dropped.
        self.assertGreater(m.prune_drop_count, 0)
        out = result["messages"]
        survivors = [msg for msg in out if msg in original]
        idxs = [original.index(msg) for msg in survivors]
        self.assertEqual(idxs, sorted(idxs))
        # Index 0 is the protected first message; everything after it must
        # form one unbroken run of original indices.
        tail_idxs = [i for i in idxs if i > 0]
        if len(tail_idxs) > 1:
            self.assertEqual(
                tail_idxs, list(range(tail_idxs[0], tail_idxs[0] + len(tail_idxs)))
            )

    def test_stable_output_when_boundary_does_not_advance(self):
        """Cache-stability: pruning the same body twice with the same
        monitor yields byte-identical message lists — the second call
        seeds from the persisted boundary and does not advance it."""
        m = proxy.SessionMonitor(context_window=8192)
        first = proxy.prune_conversation(self._big_body(), 8192, monitor=m,
                                         target_fraction=0.5, keep_last=6)
        boundary_after_first = m.prune_drop_count
        # Guard against a vacuous pass: the first call must really prune.
        self.assertGreater(boundary_after_first, 0)
        second = proxy.prune_conversation(self._big_body(), 8192, monitor=m,
                                          target_fraction=0.5, keep_last=6)
        self.assertEqual(m.prune_drop_count, boundary_after_first)
        self.assertEqual(first["messages"], second["messages"])

    def test_dropped_tool_results_become_breadcrumbs(self):
        """Pruned tool-results survive as one-line breadcrumbs in the
        marker, not silently discarded."""
        dropped = [self._tool_result_msg(i) for i in range(3)]
        summary = proxy._summarize_pruned_block(dropped)
        self.assertIn("CONTEXT PRUNED", summary)
        self.assertIn("tool result", summary)
        self.assertIn("FILE-0", summary)
        self.assertIn("FILE-2", summary)

    def test_summary_is_bounded_by_max_items(self):
        """A huge dropped block does not produce an unbounded summary."""
        old = proxy._PRUNE_SUMMARY_MAX_ITEMS
        try:
            proxy._PRUNE_SUMMARY_MAX_ITEMS = 5
            dropped = [self._tool_result_msg(i) for i in range(40)]
            summary = proxy._summarize_pruned_block(dropped)
            self.assertEqual(summary.count("- tool result"), 5)
            self.assertIn("most recent 5 of 40", summary)
        finally:
            proxy._PRUNE_SUMMARY_MAX_ITEMS = old

    def test_summarize_no_tool_results_falls_back_to_static_marker(self):
        """A dropped block with no tool-results yields the plain static
        marker — no per-call varying text (cache-safe)."""
        dropped = [
            {"role": "assistant", "content": "thinking out loud"},
            {"role": "user", "content": "ok"},
        ]
        summary = proxy._summarize_pruned_block(dropped)
        self.assertIn("CONTEXT PRUNED", summary)
        self.assertNotIn("tool result", summary)
        self.assertNotIn("most recent", summary)