npm - compact-agent - Versions diffs - 1.33.6 → 1.35.0 - Mend

compact-agent 1.33.6 → 1.35.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (7) hide show

package/dist/query.d.ts CHANGED Viewed

@@ -38,6 +38,62 @@ export interface InputGuard {
     onSteer(handler: () => void): void;
     restore(): void;
 }
+/**
+ * Builds the `<task_state>` recap text: the original goal (taken from the
+ * first `user` message) followed by a numbered list of the most recent
+ * tool calls found on `assistant` messages.
+ *
+ * @param messages - Conversation history to derive the recap from.
+ * @returns The recap block as a single newline-joined string, or `null`
+ *          when `COMPACT_AGENT_STATE_BLOCK` is `'0'`, when there are fewer
+ *          than 3 messages, when the first user message is missing, is not
+ *          a string or is blank, or when no tool calls have been made.
+ */
+export declare function buildStateBlock(messages: Message[]): string | null;
+/**
+ * Replaces the content of older `tool` messages with a short stub once the
+ * estimated size of the conversation reaches the masking threshold. The
+ * most recent tool results (window size overridable via
+ * `COMPACT_AGENT_MASK_WINDOW`) are left verbatim.
+ *
+ * @param messages - Conversation history; it is not mutated.
+ * @returns `messages` itself when the conversation is below the threshold
+ *          or there is nothing old enough to mask; otherwise a new array
+ *          in which masked entries are shallow copies with stubbed
+ *          `content`.
+ */
+export declare function maskOldToolResults(messages: Message[]): Message[];
+/**
+ * F5+ DeCRIM 3-stage critique prompts.
+ *
+ * Each prompt is designed to do exactly one job, in sequence:
+ *
+ *   decompose — Forces the model to extract requirements from the
+ *               ORIGINAL task before judging its own work. This is
+ *               the leverage point: the model can't bypass an
+ *               implicit requirement if it has to name it.
+ *
+ *   critique  — Per-item PASS/FAIL with concrete evidence required.
+ *               Asking for evidence ("file path", "command output",
+ *               "test result") is much harder to fake than the
+ *               generic "have you accomplished what was asked?".
+ *
+ *   refine    — Only the FAIL items get redone, plus any items
+ *               whose PASS evidence the model now thinks was weak.
+ *               If everything is solid, the model exits naturally.
+ *
+ * The phrasing deliberately includes "be honest" / "the user prefers
+ * honest failures over confident lies" — research on prompted self-
+ * criticism shows this kind of social-cost signaling reduces the
+ * self-confirmation bias that otherwise dominates weak-model
+ * critique. (Reflexion-style "just reflect on your work" prompts
+ * have been shown to degrade weak models — generic self-questioning
+ * without concrete structure produces overconfident revisions.)
+ *
+ * @param stage - Which of the three stages to produce the prompt for.
+ * @returns The prompt text for that stage, intended to be sent as the
+ *          content of a `user` message.
+ */
+export declare function critiquePromptFor(stage: 'decompose' | 'critique' | 'refine'): string;
+/**
+ * Builds the dedup key for a tool call, in the form
+ * `<toolName>::<normalized arguments>`.
+ *
+ * When `rawArgs` parses as a JSON object, common path-like fields are
+ * normalized to forward slashes and lowercased, whitespace runs in
+ * `command` are collapsed, and keys are serialized in sorted order. When
+ * `rawArgs` is not valid JSON, the raw string is used as-is.
+ *
+ * @param toolName - Name of the tool being called.
+ * @param rawArgs - The call's arguments as the raw JSON string.
+ * @returns The fingerprint string used as the dedup map key.
+ */
+export declare function dedupFingerprint(toolName: string, rawArgs: string): string;
+/**
+ * F4 — Rewrite stale duplicate tool-result messages in place.
+ *
+ * Called once per tool-execution batch. For each call whose
+ * fingerprint we've seen before in this chain, find the previous
+ * tool-result message and replace its `content` with a 1-line stub
+ * pointing at the newer message. The new result stays untouched so
+ * the model's next turn reads complete, fresh data.
+ *
+ * Nothing is rewritten for the FIRST occurrence of a fingerprint —
+ * only when a repeat fires. So a one-time `read` of a file is never
+ * touched.
+ *
+ * The map is keyed by fingerprint → array-index of the tool result
+ * in ctx.messages. We update the index to the newest occurrence after
+ * each rewrite, so the NEXT repeat collapses the second one (not the
+ * first, which is already stubbed).
+ *
+ * @param messages - Full conversation; earlier duplicate tool results
+ *                   in it are mutated in place.
+ * @param toolCalls - The tool calls of the batch that just executed.
+ * @param toolResults - The result messages of that batch; expected to
+ *                      already be the last entries of `messages`.
+ * @param dedupMap - Fingerprint → message-index map, updated in place
+ *                   and reused across batches.
+ */
+export declare function dedupRepeatedToolCalls(messages: Message[], toolCalls: {
+    id: string;
+    type: 'function';
+    function: {
+        name: string;
+        arguments: string;
+    };
+}[], toolResults: Message[], dedupMap: Map<string, number>): void;
 /**
  * Main query loop: sends messages to the API, handles tool calls, loops until done.
  */

package/dist/query.js CHANGED Viewed

@@ -214,6 +214,322 @@ function startInputSuppression(screenReader = false) {
 /**
  * Validate tool arguments against the tool's JSON schema
  */
+/**
+ * F4 — Tool-call dedup fingerprint.
+ *
+ * Normalizes the raw JSON arguments before hashing so trivially-
+ * different forms collapse to the same key:
+ *
+ *   - parsed + JSON.stringify with sorted keys (so {"a":1,"b":2} and
+ *     {"b":2,"a":1} hash the same)
+ *   - common path arguments (file_path, path, cwd, dir, directory)
+ *     normalized to forward-slashes and lowercased (catches
+ *     `read /app/x.py` vs `read /APP/X.PY` vs `read \\app\\x.py`)
+ *   - whitespace runs in `command` collapsed and trimmed (catches
+ *     `ls  -la` vs `ls -la`)
+ *
+ * If the arguments fail to parse as JSON, the raw argument string is
+ * used as the fingerprint instead. That is still useful: it catches
+ * the most common case, where the model emits identical JSON twice.
+ */
+/**
+ * StateAct — task-state block injected fresh each turn.
+ *
+ * Source: arxiv 2410.02810 ("StateAct: Enhancing LLM Base Agents via
+ * Self-prompting and State-tracking"). Reports +10% over ReAct on
+ * ALFWorld, +30% on TextCraft, +7% on WebShop. Zero added LLM calls.
+ *
+ * Mechanism: before each assistant turn, prepend a short structured
+ * block summarizing (a) the ORIGINAL GOAL — re-injected as a
+ * reminder, since long chains can drift away from the initial task,
+ * and (b) RECENT ACTIONS — a compressed view of what tool calls
+ * have been made so far. The model gets a fresh recap every turn
+ * regardless of context drift.
+ *
+ * Directly attacks the failure mode observed on `run-pdp11-code`
+ * (375K context, model wrote `gen_load.py` twice with identical
+ * content because the earlier write had drifted out of attention).
+ *
+ * Implementation choices:
+ *   - State block is a `system` role message inserted AFTER the main
+ *     system prompt (so the latter stays cacheable) but BEFORE the
+ *     message history. The model interprets it as ambient context.
+ *   - Action list shows only the last N actions to keep the block
+ *     short. Older actions are summarized in the conversation
+ *     history itself (and increasingly masked by F2 observation
+ *     masking).
+ *   - The block is regenerated EVERY turn from current messages.
+ *     Not persisted; it's purely a derived view.
+ *   - Skipped on very short chains (< 3 messages) where there's
+ *     nothing to recap.
+ *   - Opt-out via COMPACT_AGENT_STATE_BLOCK=0.
+ */
+const STATE_BLOCK_RECENT_ACTIONS = 8;
+const STATE_BLOCK_GOAL_MAX_CHARS = 400;
+export function buildStateBlock(messages) {
+    if (process.env.COMPACT_AGENT_STATE_BLOCK === '0')
+        return null;
+    if (messages.length < 3)
+        return null;
+    // GOAL = the first user-role message. This is the original task
+    // instruction from the harness or human. Re-inject it so the model
+    // can't drift even when the user message has scrolled far up.
+    const firstUser = messages.find((m) => m.role === 'user');
+    if (!firstUser || typeof firstUser.content !== 'string')
+        return null;
+    const goal = firstUser.content.replace(/\s+/g, ' ').trim().slice(0, STATE_BLOCK_GOAL_MAX_CHARS);
+    if (!goal)
+        return null;
+    const actions = [];
+    for (const m of messages) {
+        if (m.role !== 'assistant')
+            continue;
+        const calls = m.tool_calls;
+        if (!calls)
+            continue;
+        for (const tc of calls) {
+            const argsRaw = String(tc.function.arguments ?? '');
+            const compact = argsRaw.replace(/\s+/g, ' ').slice(0, 80);
+            actions.push({ tool: tc.function.name, argsPreview: compact });
+        }
+    }
+    if (actions.length === 0)
+        return null;
+    const recent = actions.slice(-STATE_BLOCK_RECENT_ACTIONS);
+    const olderCount = actions.length - recent.length;
+    const lines = [
+        '<task_state>',
+        `Original goal: ${goal}${goal.length >= STATE_BLOCK_GOAL_MAX_CHARS ? '…' : ''}`,
+        `Actions completed: ${actions.length}`,
+    ];
+    if (olderCount > 0) {
+        lines.push(`Recent ${recent.length} (${olderCount} earlier omitted):`);
+    }
+    else {
+        lines.push(`Actions:`);
+    }
+    recent.forEach((a, i) => {
+        lines.push(`  ${i + 1}. ${a.tool}(${a.argsPreview}${a.argsPreview.length >= 80 ? '…' : ''})`);
+    });
+    lines.push('');
+    lines.push('Stay focused on the goal. Do not re-issue actions you have already completed — refer to their results in the conversation above.');
+    lines.push('</task_state>');
+    return lines.join('\n');
+}
+/**
+ * F2 — Observation Window Masking.
+ *
+ * Source: arxiv 2508.21433 ("The Complexity Trap: Simple Observation
+ * Masking Is as Efficient as LLM Summarization for Agent Context
+ * Management"). Cuts token cost ~50% on long agent loops while
+ * matching or beating LLM-summarization solve rates — at ZERO extra
+ * inference cost.
+ *
+ * Strategy: keep the last MASKING_WINDOW tool-result messages in
+ * full. For older tool-results, replace `content` with a short stub
+ * indicating what was there. The stub preserves `role` and
+ * `tool_call_id` so the OpenAI message-schema invariants are not
+ * violated.
+ *
+ * We DO NOT mask:
+ *   - assistant turns (the reasoning chain stays intact)
+ *   - user turns (task instruction + DeCRIM critique prompts)
+ *   - system messages (priming + mode)
+ *
+ * Only `role === 'tool'` messages are masked, because the paper's
+ * empirical finding is that ~84% of token cost is tool observations
+ * and the model rarely needs the old verbatim output to make the
+ * next decision — it needs the current state. The reasoning trace
+ * across assistant turns carries the necessary memory.
+ *
+ * Tunable: MASKING_WINDOW = 12 (last 12 tool-results stay verbatim).
+ * Conservative for our model class — the paper's Qwen3-32B run
+ * regressed -11.8% with overly aggressive masking, while Gemini-Flash
+ * gained +8.5%. Deepseek-v4-flash is in that capability band, so we
+ * pick a generous window. Override with COMPACT_AGENT_MASK_WINDOW.
+ *
+ * Threshold: we only bother masking when the total estimated payload
+ * exceeds ~60K characters (rough proxy for ~15K tokens). Below that,
+ * masking adds noise without saving anything material.
+ */
+const MASKING_WINDOW_DEFAULT = 12;
+const MASKING_TRIGGER_BYTES = 60_000;
+export function maskOldToolResults(messages) {
+    const totalBytes = estimateMessageBytes(messages);
+    if (totalBytes < MASKING_TRIGGER_BYTES)
+        return messages;
+    const window = Math.max(1, parseInt(process.env.COMPACT_AGENT_MASK_WINDOW ?? '', 10) || MASKING_WINDOW_DEFAULT);
+    // Find indices of tool-result messages (newest first).
+    const toolIdxs = [];
+    for (let i = messages.length - 1; i >= 0; i--) {
+        if (messages[i].role === 'tool') {
+            toolIdxs.push(i);
+        }
+    }
+    // Keep the most-recent `window` tool results untouched; mask the rest.
+    const toMask = new Set(toolIdxs.slice(window));
+    if (toMask.size === 0)
+        return messages;
+    // Build a new array. Original messages are not mutated.
+    return messages.map((m, i) => {
+        if (!toMask.has(i))
+            return m;
+        const original = typeof m.content === 'string' ? m.content : JSON.stringify(m.content);
+        const stub = `[older tool output omitted — ${original.length} chars; re-run the tool if you need the content]`;
+        return { ...m, content: stub };
+    });
+}
+function estimateMessageBytes(messages) {
+    let total = 0;
+    for (const m of messages) {
+        if (typeof m.content === 'string') {
+            total += m.content.length;
+        }
+        else if (m.content) {
+            try {
+                total += JSON.stringify(m.content).length;
+            }
+            catch {
+                /* noop */
+            }
+        }
+    }
+    return total;
+}
+/**
+ * F5+ DeCRIM 3-stage critique prompts.
+ *
+ * Each prompt is designed to do exactly one job, in sequence:
+ *
+ *   decompose — Forces the model to extract requirements from the
+ *               ORIGINAL task before judging its own work. This is
+ *               the leverage point: the model can't bypass an
+ *               implicit requirement if it has to name it.
+ *
+ *   critique  — Per-item PASS/FAIL with concrete evidence required.
+ *               Asking for evidence ("file path", "command output",
+ *               "test result") is much harder to fake than the
+ *               generic "have you accomplished what was asked?".
+ *
+ *   refine    — Only the FAIL items get redone, plus any items
+ *               whose PASS evidence the model now thinks was weak.
+ *               If everything is solid, the model exits naturally.
+ *
+ * The phrasing deliberately includes "be honest" / "the user prefers
+ * honest failures over confident lies" — research on prompted self-
+ * criticism shows this kind of social-cost signaling reduces the
+ * self-confirmation bias that otherwise dominates weak-model
+ * critique. (Reflexion-style "just reflect on your work" prompts
+ * have been shown to degrade weak models — generic self-questioning
+ * without concrete structure produces overconfident revisions.)
+ *
+ * @param stage - 'decompose', 'critique' or 'refine'. Any value other
+ *                than 'decompose' or 'critique' yields the refine
+ *                prompt, because that branch is the fall-through.
+ * @returns The prompt text for the requested stage.
+ */
+export function critiquePromptFor(stage) {
+    // Stage 1: have the model enumerate the task's requirements verbatim.
+    if (stage === 'decompose') {
+        return ('Before you finalize: re-read the ORIGINAL task description (the very first user message in this conversation).\n\n' +
+            'List every concrete verifiable requirement it contains, as a numbered Markdown list. For each item:\n' +
+            '  - Quote the exact words from the task that express the requirement, where possible.\n' +
+            '  - Note how a third party could verify the requirement is met (which file would they check? which command would they run? what output would they look for?).\n\n' +
+            'Be exhaustive. Include format requirements, file names, output structure, and any "should also" clauses. ' +
+            'Do not paraphrase — quote. Do not add requirements the task did not state. ' +
+            'This list is just for grounding; you will judge each item in the next step.');
+    }
+    // Stage 2: per-item PASS/FAIL verdicts that must cite concrete evidence.
+    if (stage === 'critique') {
+        return ('Now judge each item from your checklist: did you actually satisfy it?\n\n' +
+            'Format your answer as:\n' +
+            '  1. [requirement quote] → PASS | FAIL\n' +
+            '     evidence: [specific file path you created, command output you observed, test that passed, etc.]\n\n' +
+            'Rules:\n' +
+            '  - Mark PASS only if you have concrete evidence right now (a file on disk, an output you can paste).\n' +
+            '  - "I implemented it" is NOT evidence. "I ran `ls /app/x.txt` and the file exists, with content `Hello`" IS evidence.\n' +
+            '  - "It should work" is NOT evidence. "I ran the failing command and it now exits 0" IS evidence.\n' +
+            '  - If you skipped a step, mark FAIL.\n' +
+            '  - If you are uncertain, mark FAIL.\n\n' +
+            'Be honest. The user prefers an honest "I left these 2 items undone" over a confident "all done" that fails the test. ' +
+            'A FAIL here is fixable in the next step; a falsely-claimed PASS is not.');
+    }
+    // Stage 3 (fall-through): redo the FAIL items and re-verify weak PASSes.
+    return ('For each FAIL item above, do the work to make it pass. Use the tools available.\n\n' +
+        'Also revisit any PASS items where, on reflection, your evidence was weak — re-verify those.\n\n' +
+        'If after the work all items are now genuinely PASS with concrete evidence, briefly summarize what you did and stop. ' +
+        'Otherwise, keep working until every item is honestly PASS.');
+}
+export function dedupFingerprint(toolName, rawArgs) {
+    let normalized;
+    try {
+        const parsed = JSON.parse(rawArgs ?? '{}');
+        if (parsed && typeof parsed === 'object' && !Array.isArray(parsed)) {
+            const obj = parsed;
+            // Normalize commonly-pathy fields
+            for (const k of ['file_path', 'path', 'cwd', 'dir', 'directory']) {
+                if (typeof obj[k] === 'string') {
+                    obj[k] = obj[k].replace(/\\/g, '/').toLowerCase();
+                }
+            }
+            // Collapse whitespace in shell commands so `ls -la` and `ls  -la` match
+            if (typeof obj.command === 'string') {
+                obj.command = obj.command.replace(/\s+/g, ' ').trim();
+            }
+            // Sorted-key serialization
+            const keys = Object.keys(obj).sort();
+            normalized = JSON.stringify(obj, keys);
+        }
+        else {
+            normalized = JSON.stringify(parsed);
+        }
+    }
+    catch {
+        normalized = String(rawArgs ?? '');
+    }
+    return `${toolName}::${normalized}`;
+}
+/**
+ * F4 — Rewrite stale duplicate tool-result messages in place.
+ *
+ * Called once per tool-execution batch. For each call whose
+ * fingerprint we've seen before in this chain, find the previous
+ * tool-result message and replace its `content` with a 1-line stub
+ * pointing at the newer message. The new result stays untouched so
+ * the model's next turn reads complete, fresh data.
+ *
+ * NOT called for the FIRST occurrence of a fingerprint — only when
+ * a repeat fires. So a one-time `read` of a file is never touched.
+ *
+ * The map is keyed by fingerprint → array-index of the tool result
+ * in ctx.messages. We update the index to the newest occurrence after
+ * each rewrite, so the NEXT repeat collapses the second one (not the
+ * first, which is already stubbed).
+ */
+export function dedupRepeatedToolCalls(messages, toolCalls, toolResults, dedupMap) {
+    // Build a quick lookup from tool_call_id → freshly-appended message index.
+    // toolResults are the LAST toolResults.length entries of messages.
+    const newResultIndexById = new Map();
+    const firstNewIdx = messages.length - toolResults.length;
+    for (let i = 0; i < toolResults.length; i++) {
+        const m = toolResults[i];
+        if (m.tool_call_id)
+            newResultIndexById.set(m.tool_call_id, firstNewIdx + i);
+    }
+    for (const tc of toolCalls) {
+        const fp = dedupFingerprint(tc.function.name, tc.function.arguments);
+        const newIdx = newResultIndexById.get(tc.id);
+        if (newIdx === undefined)
+            continue;
+        const priorIdx = dedupMap.get(fp);
+        if (priorIdx !== undefined && priorIdx !== newIdx) {
+            const prior = messages[priorIdx];
+            if (prior && prior.role === 'tool' && typeof prior.content === 'string') {
+                const wasBytes = prior.content.length;
+                // Keep the prior message structurally valid for the API
+                // (role + tool_call_id stay; only content shrinks).
+                prior.content =
+                    `[deduped — same ${tc.function.name} call was re-issued; ` +
+                        `see the fresh result later in this conversation. ` +
+                        `Original was ${wasBytes} bytes.]`;
+            }
+        }
+        // Point the fingerprint at the NEWEST occurrence so future
+        // repeats collapse the second one, not the (already-stubbed) first.
+        dedupMap.set(fp, newIdx);
+    }
+}
 function validateToolArguments(tool, input) {
     const schema = tool.parameters;
     const required = schema.required || [];
@@ -308,6 +624,29 @@ export async function runQuery(ctx) {
         toolParseFailureStreaks: new Map(),
         toolCallLoopDetected: false,
     };
+    // ── F4: Tool-call dedup map (chain-scope) ──
+    //
+    // Fingerprint of (tool_name, normalized_args) → message-index where that
+    // tool call's *result* lives in ctx.messages. When the same
+    // fingerprint fires a second time, we rewrite the OLDER tool-result
+    // message in place to a 1-line stub pointing at the newer one. The
+    // new result is preserved so the model can read the live data; only
+    // the stale duplicate gets collapsed.
+    //
+    // Why this matters: terminal-bench tasks routinely re-read the same
+    // files / re-grep for the same patterns / re-list the same directory
+    // 3-5 times across a chain. Each verbatim re-read costs 1-30K tokens
+    // of context. After the rewrite, ctx token cost on the repeated read
+    // drops from N to ~20.
+    //
+    // Different from the existing toolCallErrorCounts loop detector —
+    // that one counts CONSECUTIVE ERRORS and aborts. This one runs on
+    // SUCCESSFUL repeats and just rewrites stale messages. They compose.
+    const toolCallDedupMap = new Map();
+    const CRITIQUE_STAGES = ['decompose', 'critique', 'refine'];
+    let critiqueStageIdx = 0;
+    const selfCritiqueEnabled = process.env.COMPACT_AGENT_NON_INTERACTIVE === '1'
+        && process.env.COMPACT_AGENT_SELF_CRITIQUE !== '0';
     // Input suppression spans the entire chain: model streaming AND tool
     // execution. executeToolCalls calls inputGuard.pause()/resume() around
     // permission prompts so rl.question() can still read user input. Final
@@ -351,11 +690,25 @@ export async function runQuery(ctx) {
             // Get the last user message for context-aware system prompt
             const lastUserMsg = ctx.messages.filter((m) => m.role === 'user').pop();
             const userQuery = typeof lastUserMsg?.content === 'string' ? lastUserMsg.content : undefined;
-            // Build full messages array with system prompt
+            // Build full messages array with system prompt.
+            // F2 — Observation window masking: before sending to the model,
+            // if our message history is large, mask older tool_result
+            // contents with a short stub. Only the last MASKING_WINDOW tool
+            // results stay verbatim. Stub keeps role + tool_call_id intact
+            // so the API stays valid; only the content shrinks.
             const systemPrompt = buildSystemPrompt(ctx.config, ctx.cwd, ctx.mode, userQuery);
+            const visibleMessages = maskOldToolResults(ctx.messages);
+            // StateAct: inject a fresh task-state block as a system message
+            // between the main system prompt and the conversation history.
+            // The main system prompt stays first (cacheable); the state block
+            // sits right after so the model sees it as ambient context for
+            // the upcoming turn. Skipped on short chains or via env-var
+            // override.
+            const stateBlock = buildStateBlock(visibleMessages);
             const apiMessages = [
                 { role: 'system', content: systemPrompt },
-                ...ctx.messages,
+                ...(stateBlock ? [{ role: 'system', content: stateBlock }] : []),
+                ...visibleMessages,
             ];
             let fullText = '';
             let toolCalls;
@@ -688,9 +1041,27 @@ export async function runQuery(ctx) {
             // between tool calls — speaking each one is noisy and slow.
             if (fullText)
                 accumulatedAssistantText += (accumulatedAssistantText ? '\n\n' : '') + fullText;
-            // If no tool calls, we're done
-            if (!toolCalls || toolCalls.length === 0)
+            // F5+ DeCRIM 3-stage self-critique gate.
+            //
+            // When the model emits a no-tool-call turn ("I'm done"), we
+            // walk through three sequential stages. Each stage injects a
+            // user message; the model responds, possibly with more tool
+            // calls. When it next tries to declare done, we advance to the
+            // next stage. After all 3 stages fire, the gate is exhausted
+            // and the next no-tool-call turn lets the chain end normally.
+            if (!toolCalls || toolCalls.length === 0) {
+                if (selfCritiqueEnabled && critiqueStageIdx < CRITIQUE_STAGES.length) {
+                    const stage = CRITIQUE_STAGES[critiqueStageIdx];
+                    critiqueStageIdx++;
+                    ctx.messages.push({
+                        role: 'user',
+                        content: critiquePromptFor(stage),
+                    });
+                    // Re-enter the loop — the model responds to the stage prompt.
+                    continue;
+                }
                 break;
+            }
             // Execute tool calls — executeToolCalls itself flips per-tool state
             // and uses inputGuard.pause()/resume() around each permission prompt
             // so rl.question() can read user input even though suppression is on
@@ -698,6 +1069,18 @@ export async function runQuery(ctx) {
             // we can surface a skill-graduation hint at chain end.
             const toolResults = await executeToolCalls(toolCalls, ctx, inputGuard, chainStats);
             ctx.messages.push(...toolResults);
+            // ── F4: Dedup repeat tool calls ──
+            //
+            // After each fresh batch of tool results lands in ctx.messages,
+            // hash each call's (toolName, normalizedArgs) fingerprint. If
+            // we've seen this fingerprint before in this chain, rewrite the
+            // PRIOR tool-result message in place to a 1-line stub. The new
+            // result stays full-fidelity so the model can read it.
+            //
+            // We rewrite the older one (not the newer) so the model's most
+            // recent attention sees a fresh, complete result — but the
+            // accumulated history doesn't carry redundant copies.
+            dedupRepeatedToolCalls(ctx.messages, toolCalls, toolResults, toolCallDedupMap);
         }
         // Chain ended; back to idle so F1 reports the correct state.
         setStatus({ state: 'idle' });