compact-agent 1.33.7 → 1.35.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/query.d.ts CHANGED
@@ -38,6 +38,62 @@ export interface InputGuard {
38
38
  onSteer(handler: () => void): void;
39
39
  restore(): void;
40
40
  }
41
/**
 * Builds the `<task_state>` recap text: the first user message (whitespace
 * collapsed, capped in length) plus a short list of the most recent tool
 * calls found on assistant messages.
 *
 * Returns `null` when `COMPACT_AGENT_STATE_BLOCK` is `'0'`, when there are
 * fewer than 3 messages, when there is no user message with string content
 * (or it is blank), or when no tool calls have been made yet.
 */
+ export declare function buildStateBlock(messages: Message[]): string | null;
42
/**
 * Returns the message list to send to the model. When the estimated size of
 * the history reaches the masking threshold, `role === 'tool'` messages older
 * than the most recent window (default 12, overridable through
 * `COMPACT_AGENT_MASK_WINDOW`) are eligible to have their `content` replaced
 * by a short stub. Below the threshold, or when nothing falls outside the
 * window, the input array itself is returned. Input messages are never
 * mutated; masked entries are shallow copies.
 */
+ export declare function maskOldToolResults(messages: Message[]): Message[];
43
+ /**
44
+ * F5+ DeCRIM 3-stage critique prompts.
45
+ *
46
+ * Each prompt is designed to do exactly one job, in sequence:
47
+ *
48
+ * decompose — Forces the model to extract requirements from the
49
+ * ORIGINAL task before judging its own work. This is
50
+ * the leverage point: the model can't bypass an
51
+ * implicit requirement if it has to name it.
52
+ *
53
+ * critique — Per-item PASS/FAIL with concrete evidence required.
54
+ * Asking for evidence ("file path", "command output",
55
+ * "test result") is much harder to fake than the
56
+ * generic "have you accomplished what was asked?".
57
+ *
58
+ * refine — Only the FAIL items get redone, plus any items
59
+ * whose PASS evidence the model now thinks was weak.
60
+ * If everything is solid, the model exits naturally.
61
+ *
62
+ * The phrasing deliberately includes "be honest" / "the user prefers
63
+ * honest failures over confident lies" — research on prompted self-
64
+ * criticism shows this kind of social-cost signaling reduces the
65
+ * self-confirmation bias that otherwise dominates weak-model
66
+ * critique. (Reflexion-style "just reflect on your work" prompts
67
+ * have been shown to degrade weak models — generic self-questioning
68
+ * without concrete structure produces overconfident revisions.)
69
+ */
70
+ export declare function critiquePromptFor(stage: 'decompose' | 'critique' | 'refine'): string;
71
/**
 * Builds the dedup key for a tool call, shaped `<toolName>::<normalizedArgs>`.
 * `rawArgs` is the call's JSON argument string; common path-like fields are
 * lowercased with forward slashes and whitespace runs in `command` are
 * collapsed before serialization. If `rawArgs` is not valid JSON it is used
 * verbatim as the normalized part.
 */
+ export declare function dedupFingerprint(toolName: string, rawArgs: string): string;
72
+ /**
73
+ * F4 — Rewrite stale duplicate tool-result messages in place.
74
+ *
75
+ * Called once per tool-execution batch. For each call whose
76
+ * fingerprint we've seen before in this chain, find the previous
77
+ * tool-result message and replace its `content` with a 1-line stub
78
+ * pointing at the newer message. The new result stays untouched so
79
+ * the model's next turn reads complete, fresh data.
80
+ *
81
+ * NOT called for the FIRST occurrence of a fingerprint — only when
82
+ * a repeat fires. So a one-time `read` of a file is never touched.
83
+ *
84
+ * The map is keyed by fingerprint → array-index of the tool result
85
+ * in ctx.messages. We update the index to the newest occurrence after
86
+ * each rewrite, so the NEXT repeat collapses the second one (not the
87
+ * first, which is already stubbed).
88
+ */
89
+ export declare function dedupRepeatedToolCalls(messages: Message[], toolCalls: {
90
+ id: string;
91
+ type: 'function';
92
+ function: {
93
+ name: string;
94
+ arguments: string;
95
+ };
96
+ }[], toolResults: Message[], dedupMap: Map<string, number>): void;
41
97
  /**
42
98
  * Main query loop: sends messages to the API, handles tool calls, loops until done.
43
99
  */
package/dist/query.js CHANGED
@@ -214,6 +214,322 @@ function startInputSuppression(screenReader = false) {
214
214
  /**
215
215
  * Validate tool arguments against the tool's JSON schema
216
216
  */
217
+ /**
218
+ * F4 — Tool-call dedup fingerprint.
219
+ *
220
+ * Normalizes the raw JSON arguments before hashing so trivially-
221
+ * different forms collapse to the same key:
222
+ *
223
+ * - parsed + JSON.stringify with sorted keys (so {"a":1,"b":2} and
224
+ * {"b":2,"a":1} hash the same)
225
+ * - common path arguments (file_path, path, cwd, dir) normalized to
226
+ * forward-slashes and lowercased (catches `read /app/x.py` vs
227
+ * `read /APP/X.PY` vs `read \\app\\x.py`)
228
+ * - whitespace runs in `command` collapsed (catches `ls   -la` vs `ls -la`)
229
+ *
230
+ * Errors during parse fall through to a literal-string fingerprint —
231
+ * worse than nothing? No: even a literal hash of the raw arg string
232
+ * catches the most common case (model emits identical JSON twice).
233
+ */
234
+ /**
235
+ * StateAct — task-state block injected fresh each turn.
236
+ *
237
+ * Source: arxiv 2410.02810 ("StateAct: Enhancing LLM Base Agents via
238
+ * Self-prompting and State-tracking"). Reports +10% over ReAct on
239
+ * ALFWorld, +30% on TextCraft, +7% on WebShop. Zero added LLM calls.
240
+ *
241
+ * Mechanism: before each assistant turn, prepend a short structured
242
+ * block summarizing (a) the ORIGINAL GOAL — re-injected as a
243
+ * reminder, since long chains can drift away from the initial task,
244
+ * and (b) RECENT ACTIONS — a compressed view of what tool calls
245
+ * have been made so far. The model gets a fresh recap every turn
246
+ * regardless of context drift.
247
+ *
248
+ * Directly attacks the failure mode observed on `run-pdp11-code`
249
+ * (375K context, model wrote `gen_load.py` twice with identical
250
+ * content because the earlier write had drifted out of attention).
251
+ *
252
+ * Implementation choices:
253
+ * - State block is a `system` role message inserted AFTER the main
254
+ * system prompt (so the latter stays cacheable) but BEFORE the
255
+ * message history. The model interprets it as ambient context.
256
+ * - Action list shows only the last N actions to keep the block
257
+ * short. Older actions are summarized in the conversation
258
+ * history itself (and increasingly masked by F2 observation
259
+ * masking).
260
+ * - The block is regenerated EVERY turn from current messages.
261
+ * Not persisted; it's purely a derived view.
262
+ * - Skipped on very short chains (< 3 messages) where there's
263
+ * nothing to recap.
264
+ * - Opt-out via COMPACT_AGENT_STATE_BLOCK=0.
265
+ */
266
/** How many of the most recent tool calls are listed in the state block. */
const STATE_BLOCK_RECENT_ACTIONS = 8;
/** Maximum number of characters of the original goal that are re-injected. */
const STATE_BLOCK_GOAL_MAX_CHARS = 400;
/** Maximum number of characters of a tool call's arguments shown per action. */
const STATE_BLOCK_ARGS_MAX_CHARS = 80;
/**
 * Build the `<task_state>` recap that is injected ahead of the conversation.
 *
 * The block restates the first user message (the goal) and lists the most
 * recent tool calls found on assistant messages, so the model gets a short
 * reminder of both on every turn.
 *
 * @param messages Conversation history. Only `role`, `content` and
 *   `tool_calls` are read; the array is not modified.
 * @returns The block text, or `null` when the block is disabled through
 *   `COMPACT_AGENT_STATE_BLOCK=0`, the history has fewer than 3 messages,
 *   there is no user message with non-blank string content, or no tool call
 *   has been made yet.
 */
export function buildStateBlock(messages) {
    if (process.env.COMPACT_AGENT_STATE_BLOCK === '0')
        return null;
    if (messages.length < 3)
        return null;
    // GOAL = the first user-role message, i.e. the original task instruction.
    const firstUser = messages.find((m) => m.role === 'user');
    if (!firstUser || typeof firstUser.content !== 'string')
        return null;
    const fullGoal = firstUser.content.replace(/\s+/g, ' ').trim();
    if (!fullGoal)
        return null;
    // Decide "was it truncated?" from the length BEFORE slicing. Comparing the
    // sliced text with `>=` would flag a goal of exactly the maximum length.
    const goalTruncated = fullGoal.length > STATE_BLOCK_GOAL_MAX_CHARS;
    const goal = goalTruncated ? fullGoal.slice(0, STATE_BLOCK_GOAL_MAX_CHARS) : fullGoal;
    // Collect every tool call issued so far, oldest first.
    const actions = [];
    for (const m of messages) {
        if (m.role !== 'assistant' || !m.tool_calls)
            continue;
        for (const tc of m.tool_calls) {
            const collapsed = String(tc.function.arguments ?? '').replace(/\s+/g, ' ');
            actions.push({
                tool: tc.function.name,
                argsPreview: collapsed.slice(0, STATE_BLOCK_ARGS_MAX_CHARS),
                // Same rule as for the goal: only mark previews that really lost text.
                truncated: collapsed.length > STATE_BLOCK_ARGS_MAX_CHARS,
            });
        }
    }
    if (actions.length === 0)
        return null;
    const recent = actions.slice(-STATE_BLOCK_RECENT_ACTIONS);
    const olderCount = actions.length - recent.length;
    const lines = [
        '<task_state>',
        `Original goal: ${goal}${goalTruncated ? '…' : ''}`,
        `Actions completed: ${actions.length}`,
    ];
    if (olderCount > 0) {
        lines.push(`Recent ${recent.length} (${olderCount} earlier omitted):`);
    }
    else {
        lines.push(`Actions:`);
    }
    recent.forEach((a, i) => {
        lines.push(` ${i + 1}. ${a.tool}(${a.argsPreview}${a.truncated ? '…' : ''})`);
    });
    lines.push('');
    lines.push('Stay focused on the goal. Do not re-issue actions you have already completed — refer to their results in the conversation above.');
    lines.push('</task_state>');
    return lines.join('\n');
}
318
+ /**
319
+ * F2 — Observation Window Masking.
320
+ *
321
+ * Source: arxiv 2508.21433 ("The Complexity Trap: Simple Observation
322
+ * Masking Is as Efficient as LLM Summarization for Agent Context
323
+ * Management"). Cuts token cost ~50% on long agent loops while
324
+ * matching or beating LLM-summarization solve rates — at ZERO extra
325
+ * inference cost.
326
+ *
327
+ * Strategy: keep the last MASKING_WINDOW tool-result messages in
328
+ * full. For older tool-results, replace `content` with a short stub
329
+ * indicating what was there. The stub preserves `role` and
330
+ * `tool_call_id` so the OpenAI message-schema invariants are not
331
+ * violated.
332
+ *
333
+ * We DO NOT mask:
334
+ * - assistant turns (the reasoning chain stays intact)
335
+ * - user turns (task instruction + DeCRIM critique prompts)
336
+ * - system messages (priming + mode)
337
+ *
338
+ * Only `role === 'tool'` messages are masked, because the paper's
339
+ * empirical finding is that ~84% of token cost is tool observations
340
+ * and the model rarely needs the old verbatim output to make the
341
+ * next decision — it needs the current state. The reasoning trace
342
+ * across assistant turns carries the necessary memory.
343
+ *
344
+ * Tunable: MASKING_WINDOW = 12 (last 12 tool-results stay verbatim).
345
+ * Conservative for our model class — the paper's Qwen3-32B run
346
+ * regressed -11.8% with overly aggressive masking, while Gemini-Flash
347
+ * gained +8.5%. Deepseek-v4-flash is in that capability band, so we
348
+ * pick a generous window. Override with COMPACT_AGENT_MASK_WINDOW.
349
+ *
350
+ * Threshold: we only bother masking when the total estimated payload
351
+ * exceeds ~60K characters (rough proxy for ~15K tokens). Below that,
352
+ * masking adds noise without saving anything material.
353
+ */
354
/** Number of most-recent tool results kept verbatim when no override is set. */
const MASKING_WINDOW_DEFAULT = 12;
/** Estimated history size (in characters) below which no masking happens. */
const MASKING_TRIGGER_BYTES = 60_000;
/**
 * Replace the content of old tool-result messages with a short stub.
 *
 * @param messages Conversation history. Never mutated.
 * @returns `messages` itself when the estimated size is below
 *   `MASKING_TRIGGER_BYTES` or no tool result falls outside the keep window;
 *   otherwise a new array in which masked tool messages are shallow copies
 *   carrying the stub as `content` (all other fields, e.g. `tool_call_id`,
 *   are preserved).
 */
export function maskOldToolResults(messages) {
    if (estimateMessageBytes(messages) < MASKING_TRIGGER_BYTES)
        return messages;
    // Invalid, zero or missing override falls back to the default; negative
    // values are clamped so at least one tool result always stays verbatim.
    const keepCount = Math.max(1, parseInt(process.env.COMPACT_AGENT_MASK_WINDOW ?? '', 10) || MASKING_WINDOW_DEFAULT);
    // Indices of tool-result messages, newest first.
    const toolIdxs = [];
    for (let i = messages.length - 1; i >= 0; i--) {
        if (messages[i].role === 'tool') {
            toolIdxs.push(i);
        }
    }
    // Everything past the newest `keepCount` tool results is a masking candidate.
    const toMask = new Set(toolIdxs.slice(keepCount));
    if (toMask.size === 0)
        return messages;
    return messages.map((m, i) => {
        if (!toMask.has(i))
            return m;
        const original = messageContentText(m.content);
        const stub = `[older tool output omitted — ${original.length} chars; re-run the tool if you need the content]`;
        // Only mask when it actually shrinks the message. This also leaves
        // messages with missing or unserializable content untouched instead of
        // throwing on `undefined.length`.
        if (original.length <= stub.length)
            return m;
        return { ...m, content: stub };
    });
}
/** Render a message `content` value as text; empty string for missing or unserializable content. */
function messageContentText(content) {
    if (typeof content === 'string')
        return content;
    if (!content)
        return '';
    try {
        // JSON.stringify returns undefined for values it cannot represent.
        return JSON.stringify(content) ?? '';
    }
    catch {
        // Circular structures and similar: treat as having no measurable text.
        return '';
    }
}
/** Rough size of the history: total character count of all message contents. */
function estimateMessageBytes(messages) {
    let total = 0;
    for (const m of messages) {
        total += messageContentText(m.content).length;
    }
    return total;
}
398
+ /**
399
+ * F5+ DeCRIM 3-stage critique prompts.
400
+ *
401
+ * Each prompt is designed to do exactly one job, in sequence:
402
+ *
403
+ * decompose — Forces the model to extract requirements from the
404
+ * ORIGINAL task before judging its own work. This is
405
+ * the leverage point: the model can't bypass an
406
+ * implicit requirement if it has to name it.
407
+ *
408
+ * critique — Per-item PASS/FAIL with concrete evidence required.
409
+ * Asking for evidence ("file path", "command output",
410
+ * "test result") is much harder to fake than the
411
+ * generic "have you accomplished what was asked?".
412
+ *
413
+ * refine — Only the FAIL items get redone, plus any items
414
+ * whose PASS evidence the model now thinks was weak.
415
+ * If everything is solid, the model exits naturally.
416
+ *
417
+ * The phrasing deliberately includes "be honest" / "the user prefers
418
+ * honest failures over confident lies" — research on prompted self-
419
+ * criticism shows this kind of social-cost signaling reduces the
420
+ * self-confirmation bias that otherwise dominates weak-model
421
+ * critique. (Reflexion-style "just reflect on your work" prompts
422
+ * have been shown to degrade weak models — generic self-questioning
423
+ * without concrete structure produces overconfident revisions.)
424
+ */
425
/**
 * Return the user-message text for one stage of the self-critique gate.
 *
 * @param stage `'decompose'` or `'critique'`; any other value (normally
 *   `'refine'`) yields the refine prompt.
 * @returns The complete prompt text for that stage.
 */
export function critiquePromptFor(stage) {
    switch (stage) {
        case 'decompose':
            // Stage 1: have the model enumerate the task's requirements verbatim.
            return [
                'Before you finalize: re-read the ORIGINAL task description (the very first user message in this conversation).\n\n',
                'List every concrete verifiable requirement it contains, as a numbered Markdown list. For each item:\n',
                ' - Quote the exact words from the task that express the requirement, where possible.\n',
                ' - Note how a third party could verify the requirement is met (which file would they check? which command would they run? what output would they look for?).\n\n',
                'Be exhaustive. Include format requirements, file names, output structure, and any "should also" clauses. ',
                'Do not paraphrase — quote. Do not add requirements the task did not state. ',
                'This list is just for grounding; you will judge each item in the next step.',
            ].join('');
        case 'critique':
            // Stage 2: per-item PASS/FAIL verdicts backed by concrete evidence.
            return [
                'Now judge each item from your checklist: did you actually satisfy it?\n\n',
                'Format your answer as:\n',
                ' 1. [requirement quote] → PASS | FAIL\n',
                ' evidence: [specific file path you created, command output you observed, test that passed, etc.]\n\n',
                'Rules:\n',
                ' - Mark PASS only if you have concrete evidence right now (a file on disk, an output you can paste).\n',
                ' - "I implemented it" is NOT evidence. "I ran `ls /app/x.txt` and the file exists, with content `Hello`" IS evidence.\n',
                ' - "It should work" is NOT evidence. "I ran the failing command and it now exits 0" IS evidence.\n',
                ' - If you skipped a step, mark FAIL.\n',
                ' - If you are uncertain, mark FAIL.\n\n',
                'Be honest. The user prefers an honest "I left these 2 items undone" over a confident "all done" that fails the test. ',
                'A FAIL here is fixable in the next step; a falsely-claimed PASS is not.',
            ].join('');
        default:
            // Stage 3 (and the fallback for anything else): redo whatever failed.
            return [
                'For each FAIL item above, do the work to make it pass. Use the tools available.\n\n',
                'Also revisit any PASS items where, on reflection, your evidence was weak — re-verify those.\n\n',
                'If after the work all items are now genuinely PASS with concrete evidence, briefly summarize what you did and stop. ',
                'Otherwise, keep working until every item is honestly PASS.',
            ].join('');
    }
}
454
/**
 * Tool-call dedup fingerprint: `<toolName>::<normalized arguments>`.
 *
 * The raw JSON argument string is normalized so trivially different spellings
 * of the same call map to the same key:
 *   - object keys are serialized in sorted order at every nesting level;
 *   - top-level path-like fields (`file_path`, `path`, `cwd`, `dir`,
 *     `directory`) use forward slashes and are lowercased;
 *   - whitespace runs in a top-level `command` string are collapsed and trimmed.
 *
 * NOTE(review): lowercasing makes paths that differ only by case share a
 * fingerprint; on a case-sensitive filesystem those can be different files.
 * Confirm this trade-off is intended.
 *
 * @param toolName Name of the tool being called.
 * @param rawArgs JSON-encoded arguments as emitted by the model. `null` or
 *   `undefined` is treated as `{}`.
 * @returns The fingerprint string. If `rawArgs` is not valid JSON, the raw
 *   string itself is used as the normalized part, which still catches
 *   byte-identical repeats.
 */
export function dedupFingerprint(toolName, rawArgs) {
    let normalized;
    try {
        const parsed = JSON.parse(rawArgs ?? '{}');
        if (parsed && typeof parsed === 'object' && !Array.isArray(parsed)) {
            // Normalize commonly-pathy fields
            for (const k of ['file_path', 'path', 'cwd', 'dir', 'directory']) {
                if (typeof parsed[k] === 'string') {
                    parsed[k] = parsed[k].replace(/\\/g, '/').toLowerCase();
                }
            }
            // Collapse whitespace in shell commands so `ls   -la` and `ls -la` match
            if (typeof parsed.command === 'string') {
                parsed.command = parsed.command.replace(/\s+/g, ' ').trim();
            }
        }
        // Canonicalize recursively. Passing the top-level key list as the
        // JSON.stringify replacer would act as an allowlist at EVERY depth and
        // silently drop nested properties, making different calls collide.
        normalized = JSON.stringify(canonicalizeJson(parsed));
    }
    catch {
        normalized = String(rawArgs ?? '');
    }
    return `${toolName}::${normalized}`;
}
/** Return a copy of a parsed JSON value whose object keys are sorted at every nesting level. */
function canonicalizeJson(value) {
    if (Array.isArray(value))
        return value.map(canonicalizeJson);
    if (value && typeof value === 'object') {
        // Object.fromEntries defines own properties, so a literal "__proto__"
        // key from the JSON input is kept as data rather than setting a prototype.
        return Object.fromEntries(Object.keys(value)
            .sort()
            .map((k) => [k, canonicalizeJson(value[k])]));
    }
    return value;
}
483
+ /**
484
+ * F4 — Rewrite stale duplicate tool-result messages in place.
485
+ *
486
+ * Called once per tool-execution batch. For each call whose
487
+ * fingerprint we've seen before in this chain, find the previous
488
+ * tool-result message and replace its `content` with a 1-line stub
489
+ * pointing at the newer message. The new result stays untouched so
490
+ * the model's next turn reads complete, fresh data.
491
+ *
492
+ * NOT called for the FIRST occurrence of a fingerprint — only when
493
+ * a repeat fires. So a one-time `read` of a file is never touched.
494
+ *
495
+ * The map is keyed by fingerprint → array-index of the tool result
496
+ * in ctx.messages. We update the index to the newest occurrence after
497
+ * each rewrite, so the NEXT repeat collapses the second one (not the
498
+ * first, which is already stubbed).
499
+ */
500
/**
 * Collapse the previous result of any tool call that was just repeated.
 *
 * @param messages Full history; the last `toolResults.length` entries are
 *   expected to be the results just appended. Stale tool messages in this
 *   array have their `content` overwritten in place.
 * @param toolCalls The calls of the batch that was just executed.
 * @param toolResults The result messages of that batch, matched to calls
 *   through `tool_call_id`.
 * @param dedupMap Fingerprint → index in `messages` of the newest result for
 *   that fingerprint; updated for every call that has a result.
 */
export function dedupRepeatedToolCalls(messages, toolCalls, toolResults, dedupMap) {
    // The batch occupies the tail of `messages`, so each result's absolute
    // position is the tail start plus its offset within the batch.
    const batchStart = messages.length - toolResults.length;
    const freshIndexByCallId = new Map();
    toolResults.forEach((result, offset) => {
        if (result.tool_call_id)
            freshIndexByCallId.set(result.tool_call_id, batchStart + offset);
    });
    for (const call of toolCalls) {
        const fingerprint = dedupFingerprint(call.function.name, call.function.arguments);
        // Calls without a result message in this batch are ignored.
        if (!freshIndexByCallId.has(call.id))
            continue;
        const freshIdx = freshIndexByCallId.get(call.id);
        const staleIdx = dedupMap.get(fingerprint);
        const stale = staleIdx === undefined || staleIdx === freshIdx ? undefined : messages[staleIdx];
        if (stale && stale.role === 'tool' && typeof stale.content === 'string') {
            // Only `content` changes; role and tool_call_id stay so the
            // message remains structurally valid for the API.
            stale.content = [
                `[deduped — same ${call.function.name} call was re-issued; `,
                'see the fresh result later in this conversation. ',
                `Original was ${stale.content.length} bytes.]`,
            ].join('');
        }
        // Track the newest occurrence so the next repeat collapses this one
        // rather than the already-stubbed older message.
        dedupMap.set(fingerprint, freshIdx);
    }
}
217
533
  function validateToolArguments(tool, input) {
218
534
  const schema = tool.parameters;
219
535
  const required = schema.required || [];
@@ -308,6 +624,29 @@ export async function runQuery(ctx) {
308
624
  toolParseFailureStreaks: new Map(),
309
625
  toolCallLoopDetected: false,
310
626
  };
627
+ // ── F4: Tool-call dedup map (chain-scope) ──
628
+ //
629
+ // Hash of (tool_name, normalized_args) → message-index where that
630
+ // tool call's *result* lives in ctx.messages. When the same
631
+ // fingerprint fires a second time, we rewrite the OLDER tool-result
632
+ // message in place to a 1-line stub pointing at the newer one. The
633
+ // new result is preserved so the model can read the live data; only
634
+ // the stale duplicate gets collapsed.
635
+ //
636
+ // Why this matters: terminal-bench tasks routinely re-read the same
637
+ // files / re-grep for the same patterns / re-list the same directory
638
+ // 3-5 times across a chain. Each verbatim re-read costs 1-30K tokens
639
+ // of context. After the rewrite, ctx token cost on the repeated read
640
+ // drops from N to ~20.
641
+ //
642
+ // Different from the existing toolCallErrorCounts loop detector —
643
+ // that one counts CONSECUTIVE ERRORS and aborts. This one runs on
644
+ // SUCCESSFUL repeats and just rewrites stale messages. They compose.
645
+ const toolCallDedupMap = new Map();
646
+ const CRITIQUE_STAGES = ['decompose', 'critique', 'refine'];
647
+ let critiqueStageIdx = 0;
648
+ const selfCritiqueEnabled = process.env.COMPACT_AGENT_NON_INTERACTIVE === '1'
649
+ && process.env.COMPACT_AGENT_SELF_CRITIQUE !== '0';
311
650
  // Input suppression spans the entire chain: model streaming AND tool
312
651
  // execution. executeToolCalls calls inputGuard.pause()/resume() around
313
652
  // permission prompts so rl.question() can still read user input. Final
@@ -351,11 +690,25 @@ export async function runQuery(ctx) {
351
690
  // Get the last user message for context-aware system prompt
352
691
  const lastUserMsg = ctx.messages.filter((m) => m.role === 'user').pop();
353
692
  const userQuery = typeof lastUserMsg?.content === 'string' ? lastUserMsg.content : undefined;
354
- // Build full messages array with system prompt
693
+ // Build full messages array with system prompt.
694
+ // F2 — Observation window masking: before sending to the model,
695
+ // if our message history is large, mask older tool_result
696
+ // contents with a short stub. Only the last MASKING_WINDOW tool
697
+ // results stay verbatim. Stub keeps role + tool_call_id intact
698
+ // so the API stays valid; only the content shrinks.
355
699
  const systemPrompt = buildSystemPrompt(ctx.config, ctx.cwd, ctx.mode, userQuery);
700
+ const visibleMessages = maskOldToolResults(ctx.messages);
701
+ // StateAct: inject a fresh task-state block as a system message
702
+ // between the main system prompt and the conversation history.
703
+ // The main system prompt stays first (cacheable); the state block
704
+ // sits right after so the model sees it as ambient context for
705
+ // the upcoming turn. Skipped on short chains or via env-var
706
+ // override.
707
+ const stateBlock = buildStateBlock(visibleMessages);
356
708
  const apiMessages = [
357
709
  { role: 'system', content: systemPrompt },
358
- ...ctx.messages,
710
+ ...(stateBlock ? [{ role: 'system', content: stateBlock }] : []),
711
+ ...visibleMessages,
359
712
  ];
360
713
  let fullText = '';
361
714
  let toolCalls;
@@ -688,9 +1041,27 @@ export async function runQuery(ctx) {
688
1041
  // between tool calls — speaking each one is noisy and slow.
689
1042
  if (fullText)
690
1043
  accumulatedAssistantText += (accumulatedAssistantText ? '\n\n' : '') + fullText;
691
- // If no tool calls, we're done
692
- if (!toolCalls || toolCalls.length === 0)
1044
+ // F5+ DeCRIM 3-stage self-critique gate.
1045
+ //
1046
+ // When the model emits a no-tool-call turn ("I'm done"), we
1047
+ // walk through three sequential stages. Each stage injects a
1048
+ // user message; the model responds, possibly with more tool
1049
+ // calls. When it next tries to declare done, we advance to the
1050
+ // next stage. After all 3 stages fire, the gate is exhausted
1051
+ // and the next no-tool-call turn lets the chain end normally.
1052
+ if (!toolCalls || toolCalls.length === 0) {
1053
+ if (selfCritiqueEnabled && critiqueStageIdx < CRITIQUE_STAGES.length) {
1054
+ const stage = CRITIQUE_STAGES[critiqueStageIdx];
1055
+ critiqueStageIdx++;
1056
+ ctx.messages.push({
1057
+ role: 'user',
1058
+ content: critiquePromptFor(stage),
1059
+ });
1060
+ // Re-enter the loop — the model responds to the stage prompt.
1061
+ continue;
1062
+ }
693
1063
  break;
1064
+ }
694
1065
  // Execute tool calls — executeToolCalls itself flips per-tool state
695
1066
  // and uses inputGuard.pause()/resume() around each permission prompt
696
1067
  // so rl.question() can read user input even though suppression is on
@@ -698,6 +1069,18 @@ export async function runQuery(ctx) {
698
1069
  // we can surface a skill-graduation hint at chain end.
699
1070
  const toolResults = await executeToolCalls(toolCalls, ctx, inputGuard, chainStats);
700
1071
  ctx.messages.push(...toolResults);
1072
+ // ── F4: Dedup repeat tool calls ──
1073
+ //
1074
+ // After each fresh batch of tool results lands in ctx.messages,
1075
+ // hash each call's (toolName, normalizedArgs) fingerprint. If
1076
+ // we've seen this fingerprint before in this chain, rewrite the
1077
+ // PRIOR tool-result message in place to a 1-line stub. The new
1078
+ // result stays full-fidelity so the model can read it.
1079
+ //
1080
+ // We rewrite the older one (not the newer) so the model's most
1081
+ // recent attention sees a fresh, complete result — but the
1082
+ // accumulated history doesn't carry redundant copies.
1083
+ dedupRepeatedToolCalls(ctx.messages, toolCalls, toolResults, toolCallDedupMap);
701
1084
  }
702
1085
  // Chain ended; back to idle so F1 reports the correct state.
703
1086
  setStatus({ state: 'idle' });