npm - @phnx-labs/agents-cli - Versions diffs - 1.20.16 → 1.20.18 - Mend

@phnx-labs/agents-cli 1.20.16 → 1.20.18

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (75) hide show

package/CHANGELOG.md +19 -0
package/README.md +1 -1
package/dist/commands/budget.d.ts +14 -0
package/dist/commands/budget.js +137 -0
package/dist/commands/cost.d.ts +12 -0
package/dist/commands/cost.js +139 -0
package/dist/commands/exec.d.ts +20 -0
package/dist/commands/exec.js +382 -5
package/dist/commands/secrets.d.ts +15 -0
package/dist/commands/secrets.js +250 -4
package/dist/commands/sessions.js +4 -0
package/dist/commands/sync.d.ts +10 -3
package/dist/commands/sync.js +72 -9
package/dist/index.js +4 -0
package/dist/lib/budget/config.d.ts +9 -0
package/dist/lib/budget/config.js +115 -0
package/dist/lib/budget/enforce.d.ts +94 -0
package/dist/lib/budget/enforce.js +151 -0
package/dist/lib/budget/ledger.d.ts +61 -0
package/dist/lib/budget/ledger.js +107 -0
package/dist/lib/budget/preflight.d.ts +110 -0
package/dist/lib/budget/preflight.js +200 -0
package/dist/lib/checkpoint.d.ts +54 -0
package/dist/lib/checkpoint.js +56 -0
package/dist/lib/cloud/rush.js +18 -0
package/dist/lib/exec.d.ts +36 -0
package/dist/lib/exec.js +192 -4
package/dist/lib/git.d.ts +18 -0
package/dist/lib/git.js +67 -4
package/dist/lib/hooks.js +12 -0
package/dist/lib/loop.d.ts +145 -0
package/dist/lib/loop.js +330 -0
package/dist/lib/mcp.d.ts +7 -0
package/dist/lib/mcp.js +24 -0
package/dist/lib/models.d.ts +11 -0
package/dist/lib/models.js +21 -0
package/dist/lib/plugin-marketplace.js +16 -6
package/dist/lib/plugins.js +5 -2
package/dist/lib/pricing/cost.d.ts +46 -0
package/dist/lib/pricing/cost.js +71 -0
package/dist/lib/pricing/index.d.ts +8 -0
package/dist/lib/pricing/index.js +8 -0
package/dist/lib/pricing/prices.json +138 -0
package/dist/lib/pricing/table.d.ts +17 -0
package/dist/lib/pricing/table.js +73 -0
package/dist/lib/secrets/Agents CLI.app/Contents/CodeResources +0 -0
package/dist/lib/secrets/Agents CLI.app/Contents/MacOS/Agents CLI +0 -0
package/dist/lib/secrets/agent.d.ts +134 -0
package/dist/lib/secrets/agent.js +501 -0
package/dist/lib/secrets/bundles.d.ts +21 -0
package/dist/lib/secrets/bundles.js +43 -0
package/dist/lib/secrets/drivers/rush.d.ts +14 -0
package/dist/lib/secrets/drivers/rush.js +84 -0
package/dist/lib/secrets/linux.js +88 -10
package/dist/lib/secrets/sync-backend.d.ts +48 -0
package/dist/lib/secrets/sync-backend.js +13 -0
package/dist/lib/secrets/sync.d.ts +15 -23
package/dist/lib/secrets/sync.js +31 -66
package/dist/lib/session/db.d.ts +40 -0
package/dist/lib/session/db.js +84 -2
package/dist/lib/session/discover.d.ts +2 -0
package/dist/lib/session/discover.js +126 -2
package/dist/lib/session/render.d.ts +2 -0
package/dist/lib/session/render.js +1 -1
package/dist/lib/session/types.d.ts +4 -0
package/dist/lib/sync-umbrella.d.ts +76 -0
package/dist/lib/sync-umbrella.js +125 -0
package/dist/lib/teams/agents.d.ts +32 -0
package/dist/lib/teams/agents.js +66 -3
package/dist/lib/teams/api.js +20 -0
package/dist/lib/teams/parsers.js +16 -4
package/dist/lib/types.d.ts +48 -0
package/dist/lib/workflows.d.ts +56 -0
package/dist/lib/workflows.js +72 -5
package/package.json +2 -1

package/dist/lib/budget/preflight.js ADDED Viewed

@@ -0,0 +1,200 @@
+import { estimateCost, formatUsd } from '../pricing/index.js';
+import { loadLedger, spendForDay, spendForAgentDay, spendForProject, localDay } from './ledger.js';
+import { resolveBudgetConfig, hasAnyCap } from './config.js';
+/**
+ * Roughly 4 characters per token — the standard coarse heuristic for English text.
+ * Used only on the prompt-heuristic path of `estimateRunCost`, where
+ * `promptChars / CHARS_PER_TOKEN` (rounded up) becomes the estimated input tokens.
+ */
+const CHARS_PER_TOKEN = 4;
+/**
+ * Output is typically a multiple of the visible prompt for an agentic run
+ * (tool calls, file reads, reasoning). 6x is a deliberately conservative
+ * lower bound so the estimate doesn't wildly under-report and wave through a
+ * run that then blows the cap on its first turn. Applied to the estimated input
+ * tokens on the prompt-heuristic path only; ledger averages are used as-is.
+ */
+const HEURISTIC_OUTPUT_MULTIPLIER = 6;
+/**
+ * Estimate the cost of a run.
+ *
+ * Token basis, in priority order:
+ *  1. `args.recentAvgTokens` when supplied (lets callers inject a precomputed
+ *     average, e.g. from a scoped ledger, for testability);
+ *  2. the per-run average for `args.agent` taken from `args.ledger`, or from
+ *     `loadLedger()` when no ledger is passed;
+ *  3. a prompt-character heuristic when there is no usable average.
+ *
+ * The ledger is only resolved when step 2 is actually needed, so injecting
+ * `recentAvgTokens` never triggers a `loadLedger()` call.
+ *
+ * @param args `{ agent, model, promptChars?, ledger?, recentAvgTokens? }`
+ * @returns `{ estUsd, basis, priced, estInputTokens, estOutputTokens }`, where
+ *   `basis` is `'ledger-average'`, `'prompt-heuristic'` or `'none'`, and
+ *   `priced` is false when `estimateCost` reports `modelMatched === null`.
+ */
+export function estimateRunCost(args) {
+    // `??` short-circuits, so the ledger lookup (and any loadLedger() call) on
+    // the right-hand side only runs when no average was injected.
+    const avg = args.recentAvgTokens
+        ?? ledgerAverageTokens(args.agent, args.ledger ?? loadLedger());
+    let estInputTokens = 0;
+    let estOutputTokens = 0;
+    let basis = 'none';
+    if (avg && (avg.input > 0 || avg.output > 0)) {
+        // History exists and is non-empty: trust it over the prompt heuristic.
+        estInputTokens = avg.input;
+        estOutputTokens = avg.output;
+        basis = 'ledger-average';
+    }
+    else if (args.promptChars && args.promptChars > 0) {
+        // No usable history: size the run from the prompt length.
+        estInputTokens = Math.ceil(args.promptChars / CHARS_PER_TOKEN);
+        estOutputTokens = estInputTokens * HEURISTIC_OUTPUT_MULTIPLIER;
+        basis = 'prompt-heuristic';
+    }
+    const { usd, modelMatched } = estimateCost(args.model, {
+        inputTokens: estInputTokens,
+        outputTokens: estOutputTokens,
+    });
+    return {
+        estUsd: usd,
+        // Defensive: never report a basis when both token estimates are zero.
+        basis: estInputTokens === 0 && estOutputTokens === 0 ? 'none' : basis,
+        priced: modelMatched !== null,
+        estInputTokens,
+        estOutputTokens,
+    };
+}
+/**
+ * Per-run token averages for one agent.
+ *
+ * Ledger entries are first summed per `runId` (a run may have several entries),
+ * then the per-run totals are averaged and rounded to the nearest integer.
+ *
+ * @param agent  agent id whose entries are considered
+ * @param ledger iterable of ledger entries (`agent`, `runId`, `inputTok`, `outputTok`)
+ * @returns `{ input, output }` averages per run, or null when the agent has no entries
+ */
+export function ledgerAverageTokens(agent, ledger) {
+    const totalsByRun = new Map();
+    for (const entry of ledger) {
+        if (entry.agent === agent) {
+            const totals = totalsByRun.get(entry.runId) ?? { input: 0, output: 0 };
+            totals.input += entry.inputTok;
+            totals.output += entry.outputTok;
+            totalsByRun.set(entry.runId, totals);
+        }
+    }
+    const runCount = totalsByRun.size;
+    if (runCount === 0)
+        return null;
+    let sumInput = 0;
+    let sumOutput = 0;
+    for (const { input, output } of totalsByRun.values()) {
+        sumInput += input;
+        sumOutput += output;
+    }
+    return {
+        input: Math.round(sumInput / runCount),
+        output: Math.round(sumOutput / runCount),
+    };
+}
+/**
+ * Snapshot the spend figures the gate compares against caps: today's total
+ * spend, the project's spend, and this agent's spend for today.
+ *
+ * @param agent   agent id, echoed back in the result
+ * @param project project key passed to `spendForProject`
+ * @param ledger  optional ledger entries; `loadLedger()` is used when omitted
+ * @returns `{ agent, daySpend, projectSpend, agentDaySpend }`
+ */
+export function ledgerStateFor(agent, project, ledger) {
+    const entries = ledger === undefined || ledger === null ? loadLedger() : ledger;
+    // One `localDay()` value is shared so both day-scoped sums use the same day.
+    const today = localDay();
+    const daySpend = spendForDay(today, entries);
+    const projectSpend = spendForProject(project, entries);
+    const agentDaySpend = spendForAgentDay(agent, today, entries);
+    return { agent, daySpend, projectSpend, agentDaySpend };
+}
+/**
+ * The pre-flight gate: add this run's estimate to the current spend figures
+ * and decide allow/deny against every configured cap.
+ *
+ * - `on_exceed: warn` never blocks (`allow: true`) but still reports the
+ *   projected overrun through `reason` / `blockedCap`.
+ * - A hard block returns `allow: false`. `--yes` MUST NOT override it; the
+ *   caller enforces that — this function only reports the truth.
+ * - When several caps are breached, the first one in the order per_run,
+ *   per_day, per_project, per_agent is reported.
+ *
+ * @param cfg   resolved budget config (caps, `on_exceed`, `require_confirm_over`)
+ * @param state current spend snapshot (`agent`, `daySpend`, `projectSpend`, `agentDaySpend`)
+ * @param est   run estimate (`estUsd`, `priced`)
+ * @returns decision `{ allow, needsConfirm, reason?, blockedCap?, projectedDaySpend, projectedProjectSpend }`
+ */
+export function enforcePreflight(cfg, state, est) {
+    const runUsd = est.estUsd;
+    const projectedDaySpend = state.daySpend + runUsd;
+    const projectedProjectSpend = state.projectSpend + runUsd;
+    const projectedAgentDaySpend = state.agentDaySpend + runUsd;
+    // Every decision shape ends with these two projections.
+    const projections = { projectedDaySpend, projectedProjectSpend };
+    const warnOnly = cfg.on_exceed === 'warn';
+    const breaches = [];
+    const recordBreach = (cap, reason) => {
+        breaches.push({ cap, reason });
+    };
+    if (cfg.per_run !== undefined && runUsd > cfg.per_run) {
+        recordBreach('per_run', `estimated ${formatUsd(runUsd)} exceeds per_run cap ${formatUsd(cfg.per_run)}`);
+    }
+    if (cfg.per_day !== undefined && projectedDaySpend > cfg.per_day) {
+        recordBreach('per_day', `projected day spend ${formatUsd(projectedDaySpend)} exceeds per_day cap ${formatUsd(cfg.per_day)}`);
+    }
+    if (cfg.per_project !== undefined && projectedProjectSpend > cfg.per_project) {
+        recordBreach('per_project', `projected project spend ${formatUsd(projectedProjectSpend)} exceeds per_project cap ${formatUsd(cfg.per_project)}`);
+    }
+    const agentCap = cfg.per_agent?.[state.agent];
+    if (agentCap !== undefined && projectedAgentDaySpend > agentCap) {
+        recordBreach('per_agent', `projected agent day spend ${formatUsd(projectedAgentDaySpend)} exceeds per_agent cap ${formatUsd(agentCap)}`);
+    }
+    // require_confirm_over only governs interactive confirm, not a hard block.
+    const overConfirmThreshold = cfg.require_confirm_over !== undefined && runUsd >= cfg.require_confirm_over;
+    // Unpriced model + active caps: the estimate is $0 because we have no price
+    // for this model, so the estimate itself can never push spend over a cap and
+    // the run would be silently waved through. Never $0-wave-through (#346):
+    // require confirmation so the user is told the cap cannot be enforced for
+    // this model rather than getting a false pass.
+    if (!est.priced && hasAnyCap(cfg) && breaches.length === 0) {
+        return {
+            allow: true,
+            needsConfirm: true,
+            reason: `model is unpriced — budget caps cannot be enforced for this run (estimate is $0); confirm to proceed`,
+            ...projections,
+        };
+    }
+    const [firstBreach] = breaches;
+    if (firstBreach !== undefined) {
+        return {
+            allow: warnOnly,
+            // A hard block never asks for confirmation; warn mode keeps the threshold check.
+            needsConfirm: warnOnly && overConfirmThreshold,
+            reason: firstBreach.reason,
+            blockedCap: firstBreach.cap,
+            ...projections,
+        };
+    }
+    let reason;
+    if (overConfirmThreshold) {
+        reason = `estimated ${formatUsd(runUsd)} is at or above confirm threshold ${formatUsd(cfg.require_confirm_over)}`;
+    }
+    return {
+        allow: true,
+        needsConfirm: overConfirmThreshold,
+        reason,
+        ...projections,
+    };
+}
+/**
+ * Render the single-line `[budget] est. …` banner shown in the `agents run`
+ * preamble.
+ *
+ * @param agent agent id, interpolated into the banner
+ * @param model model name, interpolated into the banner
+ * @param est   run estimate; `priced` selects a USD figure vs. the literal
+ *              `unpriced`, and `basis` selects the trailing label
+ * @returns the banner string
+ */
+export function formatEstimateBanner(agent, model, est) {
+    const cost = est.priced ? formatUsd(est.estUsd) : 'unpriced';
+    let basisLabel;
+    switch (est.basis) {
+        case 'ledger-average':
+            basisLabel = 'recent average';
+            break;
+        case 'prompt-heuristic':
+            basisLabel = 'prompt size';
+            break;
+        default:
+            basisLabel = 'no basis';
+    }
+    return `[budget] est. ${cost} for this ${agent} run (${model}, ${basisLabel})`;
+}
+/**
+ * High-level pre-flight gate: resolve the effective budget for `cwd`, estimate
+ * the run, and evaluate every cap. Returns `dormant:true` when no caps are
+ * set; in that case cap evaluation (`ledgerStateFor` / `enforcePreflight`) is
+ * skipped and the decision is a fixed allow with zeroed projections. Note that
+ * the ledger is still resolved and the estimate and banner are still computed
+ * before the dormancy check, because the dormant result carries them too.
+ * The CLI layer decides how to act on `decision` (print banner,
+ * confirm, or block + exit non-zero).
+ *
+ * @param args `{ cwd, agent, model, mode, prompt, project, ledger? }`
+ * @returns `{ dormant, cfg, estimate, decision, banner }`
+ */
+export function runPreflightGate(args) {
+    const cfg = resolveBudgetConfig(args.cwd);
+    const ledger = args.ledger ?? loadLedger(); // resolved once and shared by the estimate and the spend snapshot
+    const estimate = estimateRunCost({
+        agent: args.agent,
+        model: args.model,
+        mode: args.mode, // forwarded, but estimateRunCost does not read it
+        promptChars: args.prompt?.length,
+        ledger,
+    });
+    const banner = formatEstimateBanner(args.agent, args.model, estimate);
+    if (!hasAnyCap(cfg)) {
+        return {
+            dormant: true,
+            cfg,
+            estimate,
+            decision: {
+                allow: true,
+                needsConfirm: false,
+                projectedDaySpend: 0,
+                projectedProjectSpend: 0,
+            },
+            banner,
+        };
+    }
+    const state = ledgerStateFor(args.agent, args.project, ledger);
+    const decision = enforcePreflight(cfg, state, estimate);
+    return { dormant: false, cfg, estimate, decision, banner };
+}

package/dist/lib/checkpoint.d.ts ADDED Viewed

@@ -0,0 +1,54 @@
+/**
+ * Harness-level loop checkpoint (issue #332).
+ *
+ * A checkpoint is the durable harness state for a `--loop` run: it records the
+ * iteration count, the pinned session id, the prompt being re-injected, and the
+ * loop config — everything `--resume-checkpoint` needs to continue a run that a
+ * SIGTERM, timeout, or machine sleep killed mid-flight.
+ *
+ * This is NOT provider-side state. `--session-id` resumes Claude's *conversation*
+ * (server-side); a checkpoint resumes the *harness* (iteration count, loop
+ * variables, prompt chain) — the part Claude's own resume cannot recover.
+ *
+ * Atomic write (temp + rename) mirrors `writeRunMeta` in routines.ts so a crash
+ * mid-write never leaves a half-written checkpoint that `readCheckpoint` would
+ * choke on. `readCheckpoint` returns null on a missing or corrupt file (mirrors
+ * `readRunMeta`) — a corrupt checkpoint is a "start fresh", never a throw.
+ */
+import type { AgentId } from './types.js';
+import type { LoopConfig, LoopSignal } from './loop.js';
+/** Durable harness state for a looped run, serialized to checkpoint.json. */
+export interface Checkpoint {
+    /** runId == the run directory name under getRunsDir(). `readCheckpoint` rejects files where this is not a string. */
+    id: string;
+    agent: AgentId;
+    version?: string; // NOTE(review): presumably the agent version pinned for the run — confirm
+    /** The prompt re-injected each iteration. */
+    prompt?: string;
+    /** Pinned Claude session id so a resume continues the same conversation. */
+    sessionId?: string;
+    /** Iterations COMPLETED so far. A resume starts at iteration + 1. `readCheckpoint` rejects files where this is not a number. */
+    iteration: number;
+    /** The loop config governing termination. */
+    loop: LoopConfig;
+    /** Last loop-signal read, if any (for audit / resume context). */
+    loopSignal?: LoopSignal;
+    /** Cumulative tokens consumed across all iterations so far. */
+    cumulativeTokens?: number;
+    createdAt: string; // NOTE(review): presumably an ISO-8601 timestamp — confirm against the writer
+    updatedAt: string; // NOTE(review): presumably an ISO-8601 timestamp — confirm against the writer
+}
+/**
+ * Path to a run's checkpoint file: <runsDir>/<runId>/checkpoint.json.
+ * Only builds the path; it does not check that the file exists.
+ */
+export declare function checkpointPath(runId: string): string;
+/**
+ * Write a checkpoint atomically (temp file + rename). The rename is atomic on a
+ * single filesystem, so a reader never observes a partially written file.
+ * Mirrors the durable-write contract of `writeRunMeta`.
+ *
+ * @param c     Checkpoint to serialize (pretty-printed JSON, UTF-8).
+ * @param file  Destination path; defaults to `checkpointPath(c.id)`. Missing
+ *              parent directories are created.
+ */
+export declare function writeCheckpoint(c: Checkpoint, file?: string): void;
+/**
+ * Read a checkpoint from disk. Returns null if the file is missing, its
+ * contents are not valid JSON, or the parsed value is not an object with a
+ * string `id` and a numeric `iteration` — corruption means "no resumable
+ * state", which the caller treats as a fresh start. Mirrors `readRunMeta`.
+ * Other fields (e.g. `loop`) are not validated, so the returned value is only
+ * shallowly checked.
+ */
+export declare function readCheckpoint(file: string): Checkpoint | null;

package/dist/lib/checkpoint.js ADDED Viewed

@@ -0,0 +1,56 @@
+/**
+ * Harness-level loop checkpoint (issue #332).
+ *
+ * A checkpoint is the durable harness state for a `--loop` run: it records the
+ * iteration count, the pinned session id, the prompt being re-injected, and the
+ * loop config — everything `--resume-checkpoint` needs to continue a run that a
+ * SIGTERM, timeout, or machine sleep killed mid-flight.
+ *
+ * This is NOT provider-side state. `--session-id` resumes Claude's *conversation*
+ * (server-side); a checkpoint resumes the *harness* (iteration count, loop
+ * variables, prompt chain) — the part Claude's own resume cannot recover.
+ *
+ * Atomic write (temp + rename) mirrors `writeRunMeta` in routines.ts so a crash
+ * mid-write never leaves a half-written checkpoint that `readCheckpoint` would
+ * choke on. `readCheckpoint` returns null on a missing or corrupt file (mirrors
+ * `readRunMeta`) — a corrupt checkpoint is a "start fresh", never a throw.
+ */
+import * as fs from 'fs';
+import * as path from 'path';
+import { getRunsDir } from './state.js';
+/**
+ * Location of a run's checkpoint file: <runsDir>/<runId>/checkpoint.json.
+ * Only builds the path; it does not check that the file exists.
+ */
+export function checkpointPath(runId) {
+    const runDir = path.join(getRunsDir(), runId);
+    return path.join(runDir, 'checkpoint.json');
+}
+/**
+ * Write a checkpoint atomically (temp file + rename). The rename is atomic on a
+ * single filesystem, so a reader never observes a partially written file.
+ * Mirrors the durable-write contract of `writeRunMeta`.
+ *
+ * If the write or the rename fails, the temp file is removed (best-effort) and
+ * the original error is rethrown, so failed writes do not accumulate
+ * `<target>.<pid>.tmp` files in the run directory.
+ *
+ * @param c    checkpoint to serialize (pretty-printed JSON, UTF-8)
+ * @param file destination path; defaults to `checkpointPath(c.id)`. Missing
+ *             parent directories are created.
+ * @throws whatever `fs` error made the directory creation, write or rename fail
+ */
+export function writeCheckpoint(c, file) {
+    const target = file ?? checkpointPath(c.id);
+    fs.mkdirSync(path.dirname(target), { recursive: true });
+    // The pid suffix keeps concurrent processes from sharing a temp file.
+    const tmp = `${target}.${process.pid}.tmp`;
+    try {
+        fs.writeFileSync(tmp, JSON.stringify(c, null, 2), 'utf-8');
+        fs.renameSync(tmp, target);
+    }
+    catch (err) {
+        try {
+            fs.unlinkSync(tmp);
+        }
+        catch { /* temp file may not exist; cleanup is best-effort */ }
+        throw err;
+    }
+}
+/**
+ * Read a checkpoint from disk. Returns null when there is no resumable state,
+ * which the caller treats as a fresh start (mirrors `readRunMeta`):
+ *  - the file is missing or unreadable,
+ *  - its contents are not valid JSON,
+ *  - the parsed value is not an object with a string `id`, or
+ *  - `iteration` is not a non-negative integer. JSON can encode values such as
+ *    `-1`, `1.5` or `1e999` (parsed as Infinity) that are numbers but cannot be
+ *    a count of completed iterations.
+ *
+ * Other fields (e.g. `loop`) are not validated. Never throws.
+ *
+ * @param file path to the checkpoint JSON file
+ * @returns the parsed checkpoint, or null
+ */
+export function readCheckpoint(file) {
+    try {
+        // No existsSync pre-check: a missing file makes readFileSync throw, which
+        // lands in the same "return null" path.
+        const parsed = JSON.parse(fs.readFileSync(file, 'utf-8'));
+        if (!parsed || typeof parsed !== 'object')
+            return null;
+        if (typeof parsed.id !== 'string')
+            return null;
+        if (!Number.isInteger(parsed.iteration) || parsed.iteration < 0)
+            return null;
+        return parsed;
+    }
+    catch {
+        return null;
+    }
+}

package/dist/lib/cloud/rush.js CHANGED Viewed

@@ -341,6 +341,24 @@ export class RushCloudProvider {
         if (repos.length === 0) {
             throw new Error('Rush Cloud requires --repo <owner/repo> (or --repo repeated for multi-repo).');
         }
+        // Budget pre-flight gate (issue #346). Cloud dispatches inherit the local
+        // project's caps; we refuse to POST a run that would breach an on_exceed:block
+        // cap. The repo slug is the project attribution key. Server-side spend is
+        // authoritative for live enforcement; this pre-flight is the deterministic
+        // "don't even start it" guard. Dormant when no caps are configured.
+        {
+            const { runPreflightGate } = await import('../budget/preflight.js');
+            const projectKey = repos[0] ?? process.cwd();
+            const gate = runPreflightGate({
+                agent: options.agent ?? 'cloud',
+                model: options.model ?? `${options.agent ?? 'cloud'}-default`,
+                prompt: options.prompt,
+                project: projectKey,
+            });
+            if (!gate.dormant && !gate.decision.allow) {
+                throw new Error(`[budget] BLOCKED cloud dispatch (${projectKey}): ${gate.decision.reason}`);
+            }
+        }
         // Validate each repo's shape and resolve its installation_id up front.
         // Any bad entry fails the whole dispatch — we never want a half-started
         // multi-repo run that only found installations for some of the repos.

package/dist/lib/exec.d.ts CHANGED Viewed

@@ -82,6 +82,23 @@ export interface ExecOptions {
     sessionId?: string;
     verbose?: boolean;
     env?: Record<string, string>;
+    /**
+     * Workflow capability scoping (Claude only). Sourced from WORKFLOW.md
+     * frontmatter `tools:` / `mcpServers:` and translated to Claude headless
+     * flags in buildExecCommand. Other agents ignore these.
+     *
+     * `toolsRestrict` is the AVAILABLE-tool allowlist: it maps to `--tools`, which
+     * restricts the built-in tool set the run can use at all (NOT `--allowedTools`,
+     * which only auto-approves without restricting availability). Declaring
+     * `[Read, Grep]` makes Write/Bash/Edit unavailable for the whole run.
+     */
+    toolsRestrict?: string[];
+    /**
+     * Path to an ephemeral mcp-config JSON. Emitted as `--mcp-config <path>`
+     * together with `--strict-mcp-config` so ONLY the named servers load (the
+     * flag alone merely ADDS to the existing server set).
+     */
+    mcpConfigPath?: string;
 }
 /**
  * Resolve interactive vs headless. Explicit flags are definitive and win over
@@ -90,6 +107,23 @@ export interface ExecOptions {
  * `--interactive` takes precedence over `--headless`; the CLI layer rejects passing both.
  */
 export declare function resolveInteractive(options: Pick<ExecOptions, 'interactive' | 'headless' | 'prompt'>): boolean;
+/**
+ * Decide whether spawnAgent must capture (PIPE + tee) the child's stdout so the
+ * live budget watcher can parse it (issue #346, FIX 3).
+ *
+ * The bug this fixes: stdout used to be PIPED only when downstream output was
+ * piped (`piped = !isTTY`). For a normal headless run AT A TERMINAL, stdout was
+ * 'inherit', so `child.stdout` was null and the watcher — hence the mid-run
+ * hard-cap kill — was silently skipped. We now tap stdout for ALL
+ * non-interactive runs when caps are active, regardless of TTY, and tee it back
+ * so the user still sees output. Interactive REPLs are never tapped (the human
+ * owns the TTY; they rely on the pre-flight gate).
+ *
+ * @param interactive  resolveInteractive() result for the run
+ * @param piped        true when the parent's stdout is NOT a TTY (output piped)
+ * @param capsActive   true when a budget watcher is attached (caps configured)
+ * @returns false for interactive runs; otherwise true when `piped` or `capsActive` is set
+ */
+export declare function shouldTapStdout(interactive: boolean, piped: boolean, capsActive: boolean): boolean;
 /** Parse an array of KEY=VALUE strings into an env record. Returns undefined for empty input. */
 export declare function parseExecEnv(entries: string[]): Record<string, string> | undefined;
 /**
@@ -135,6 +169,8 @@ export declare function execAgent(options: ExecOptions): Promise<number>;
  * keeping version resolution in one place instead of reimplementing it in batch.
  */
 export declare function execShimPassthrough(agent: AgentId, rawArgs: string[], cwd: string, pinnedVersion?: string): Promise<number>;
+/**
+ * Exit code spawnAgent resolves with when a run is killed for crossing a budget cap.
+ * It replaces whatever code the killed child itself exited with, so callers can
+ * tell a budget termination apart from an ordinary failure.
+ */
+export declare const BUDGET_KILL_EXIT_CODE = 7;
 /**
  * Patterns that indicate a rate/usage limit. Matching is intentionally broad
  * because providers phrase these differently -- Anthropic uses "5-hour limit"

package/dist/lib/exec.js CHANGED Viewed

@@ -114,6 +114,29 @@ export function resolveInteractive(options) {
         return false;
     return options.prompt === undefined;
 }
+/**
+ * Tell spawnAgent whether the child's stdout has to be captured (PIPE + tee)
+ * so the live budget watcher can read it (issue #346, FIX 3).
+ *
+ * Background: stdout was previously piped only when our own output was piped
+ * (`piped = !isTTY`). A normal headless run AT A TERMINAL therefore used
+ * 'inherit', `child.stdout` was null, and the watcher — and with it the
+ * mid-run hard-cap kill — was silently skipped. Stdout is now tapped for ALL
+ * non-interactive runs when caps are active, regardless of TTY, and teed back
+ * so the user still sees output. Interactive REPLs are never tapped (the human
+ * owns the TTY; they rely on the pre-flight gate).
+ *
+ * @param interactive  resolveInteractive() result for the run
+ * @param piped        true when the parent's stdout is NOT a TTY (output piped)
+ * @param capsActive   true when a budget watcher is attached (caps configured)
+ * @returns false for interactive runs; otherwise `piped || capsActive`
+ */
+export function shouldTapStdout(interactive, piped, capsActive) {
+    // Non-interactive runs are tapped when the caller pipes us downstream
+    // (preserve composability) or when caps are active (so the watcher can read
+    // the stream even at a TTY).
+    return !interactive && (piped || capsActive);
+}
 /** Pattern for valid environment variable names (C identifier rules). */
 const EXEC_ENV_KEY_PATTERN = /^[A-Za-z_][A-Za-z0-9_]*$/;
 /** Parse a single KEY=VALUE string into a tuple, validating the key name. */
@@ -540,6 +563,39 @@ export function buildExecCommand(options) {
             cmd.push('--add-dir', dir);
         }
     }
+    // Claude-specific: workflow capability scoping. WORKFLOW.md frontmatter
+    // `tools:` / `mcpServers:` is translated to the headless flags that ACTUALLY
+    // restrict the run (verified against `claude --help` on the installed CLI):
+    //
+    //   tools:       -> `--tools <names...>` — restricts the AVAILABLE built-in
+    //                   tool set. This is the security boundary: tools NOT named
+    //                   here (e.g. Write, Bash, Edit) are unavailable for the whole
+    //                   run. `--allowedTools` would only auto-approve without
+    //                   restricting, so it is the WRONG flag for sandboxing.
+    //                   We also emit `--allowedTools <names...>` for the same set so
+    //                   the permitted tools don't prompt in headless `-p` mode.
+    //   mcpServers:  -> `--mcp-config <path>` PLUS `--strict-mcp-config`. The
+    //                   config flag alone ADDS servers to the existing set; only
+    //                   `--strict-mcp-config` makes the run use *only* the named
+    //                   servers, which is what scoping means.
+    //
+    // The command layer gates this behind the `allowlist` capability and assembles
+    // the mcp-config file; buildExecCommand stays a pure string-builder.
+    //
+    // `<tools...>` is variadic. Emit the names as separate argv tokens. The flags
+    // here are appended AFTER the positional prompt (added above), so the variadic
+    // never swallows the prompt; the trailing `--allowedTools` / `--strict-mcp-config`
+    // tokens also terminate the `--tools` variadic cleanly.
+    if (options.agent === 'claude') {
+        if (options.toolsRestrict && options.toolsRestrict.length > 0) {
+            cmd.push('--tools', ...options.toolsRestrict);
+            cmd.push('--allowedTools', ...options.toolsRestrict);
+        }
+        if (options.mcpConfigPath) {
+            cmd.push('--mcp-config', options.mcpConfigPath);
+            cmd.push('--strict-mcp-config');
+        }
+    }
     return cmd;
 }
 /** Spawn an agent and return its exit code. Convenience wrapper over spawnAgent. */
@@ -599,6 +655,15 @@ async function spawnAgent(options) {
     const timeoutMs = options.timeout ? parseTimeout(options.timeout) : undefined;
     const piped = !process.stdout.isTTY;
     const interactive = resolveInteractive(options);
+    // Budget live kill-switch (issue #346). For headless runs we incrementally
+    // parse stream-json usage off stdout, accumulate cost, and kill the child the
+    // moment a configured cap is crossed — exactly like the --timeout path, but
+    // resolving with a DISTINCT exit code so CI/headless can tell budget-kill from
+    // timeout. Spend is recorded to the shared ledger in the close handler. The
+    // watcher is dormant (and zero-cost) when no caps are configured.
+    const cwd = options.cwd || process.cwd();
+    const runId = randomUUID();
+    const watcherState = await setupBudgetWatcher(options, cwd, runId);
     maybeRotate();
     const timer = createTimer('agent.run', {
         agent: options.agent,
@@ -617,9 +682,13 @@ async function spawnAgent(options) {
         // rendering, raw-mode keystrokes, colored output). Headless mode pipes
         // stderr so we can scan for rate limits and feed fallback. stdout stays
         // inherited for TTY, piped when the caller pipes us downstream.
+        // PIPE (and later tee) stdout whenever the live budget watcher must read it
+        // — for ALL non-interactive runs when caps are active, regardless of TTY.
+        // See shouldTapStdout() for the rationale (FIX 3, issue #346).
+        const tapStdout = shouldTapStdout(interactive, piped, watcherState !== null);
         const stdio = interactive
             ? ['inherit', 'inherit', 'inherit']
-            : ['inherit', piped ? 'pipe' : 'inherit', 'pipe'];
+            : ['inherit', tapStdout ? 'pipe' : 'inherit', 'pipe'];
         // On Windows, .cmd batch wrappers (npm-installed CLIs) require shell:true
         // whether addressed by name or absolute path.
         const useShell = process.platform === 'win32' && (!path.isAbsolute(executable) || executable.endsWith('.cmd'));
@@ -631,8 +700,29 @@ async function spawnAgent(options) {
         });
         // Mark startup time (time from function call to process spawn)
         timer.mark('startup');
-        if (!interactive && piped && child.stdout) {
+        let budgetKilled = false;
+        let budgetKillTimer;
+        if (!interactive && tapStdout && child.stdout) {
+            // TEE the child's stdout back to the parent's so the user still sees
+            // output (mirrors stdio:'inherit') while we tap the same stream for usage.
             child.stdout.pipe(process.stdout);
+            // Tap the same stream for budget usage events without consuming the pipe
+            // (a 'data' listener and .pipe() both receive every chunk). Kill on breach.
+            if (watcherState) {
+                let pendingLine = '';
+                child.stdout.on('data', (chunk) => {
+                    const { events, rest } = watcherState.extract(chunk.toString('utf-8'), pendingLine);
+                    pendingLine = rest;
+                    for (const ev of events)
+                        watcherState.watcher.feedUsage(ev);
+                    if (watcherState.watcher.breached() && !budgetKilled) {
+                        budgetKilled = true;
+                        process.stderr.write(`[budget] hard cap exceeded — terminating ${options.agent} run\n`);
+                        child.kill('SIGTERM');
+                        budgetKillTimer = setTimeout(() => child.kill('SIGKILL'), 5000);
+                    }
+                });
+            }
         }
         let stderrBuffer = '';
         const STDERR_BUFFER_CAP = 64 * 1024;
@@ -663,11 +753,94 @@ async function spawnAgent(options) {
         child.on('close', (code) => {
             if (timeoutTimer)
                 clearTimeout(timeoutTimer);
-            timer.end({ exitCode: code ?? 0, status: code === 0 ? 'success' : 'failed' });
-            resolve({ exitCode: code ?? 0, stderr: stderrBuffer });
+            // Clear the budget-kill SIGKILL escalation timer (mirror the --timeout
+            // timer cleanup) so a programmatic caller reusing execAgent (the #332 loop
+            // driver) never sees a stray 5s kill event fire after the child has exited.
+            if (budgetKillTimer)
+                clearTimeout(budgetKillTimer);
+            // Record final spend to the shared ledger (issue #346). Best-effort: a
+            // ledger write must never mask the run's own outcome.
+            if (watcherState) {
+                try {
+                    watcherState.finalize();
+                }
+                catch { /* ledger write is non-critical */ }
+                // Release the watcher's references / stop accepting events (symmetry).
+                try {
+                    watcherState.watcher.dispose();
+                }
+                catch { /* dispose is best-effort */ }
+            }
+            // Budget kill resolves with a DISTINCT non-zero exit so CI/headless and
+            // teams/cloud can tell a budget termination apart from a normal failure.
+            const exitCode = budgetKilled ? BUDGET_KILL_EXIT_CODE : (code ?? 0);
+            timer.end({ exitCode, status: budgetKilled ? 'budget_killed' : code === 0 ? 'success' : 'failed' });
+            resolve({ exitCode, stderr: stderrBuffer });
         });
     });
 }
+/** Exit code spawnAgent resolves with when a run is killed for crossing a budget cap. */
+export const BUDGET_KILL_EXIT_CODE = 7; // replaces the child's own exit code on a budget kill so callers can tell it apart from a normal failure
+/**
+ * Resolve the budget watcher for a run. Returns null (watcher dormant) for interactive runs
+ * or when no caps are configured. When caps exist, returns { watcher, extract, finalize }:
+ * a live watcher seeded with the day / project / agent-day spend already on the ledger, an
+ * extract() that parses and remembers usage events, and a finalize() that appends them to the ledger.
+ */
+async function setupBudgetWatcher(options, cwd, runId) {
+    const interactive = resolveInteractive(options);
+    if (interactive)
+        return null; // interactive runs are never budget-watched; bail out before loading the budget modules
+    const [{ resolveBudgetConfig, hasAnyCap }, { makeLiveSpendWatcher, capsFromConfig, extractUsageEvents }, ledger] = await Promise.all([ // lazy-load the three budget modules in parallel
+        import('./budget/config.js'),
+        import('./budget/enforce.js'),
+        import('./budget/ledger.js'),
+    ]);
+    const cfg = resolveBudgetConfig(cwd);
+    if (!hasAnyCap(cfg))
+        return null; // no caps configured: no watcher is built and the ledger is never read
+    const today = ledger.localDay();
+    const entries = ledger.loadLedger(); // loaded once and reused by all three spend lookups below
+    const caps = capsFromConfig(cfg, {
+        daySpend: ledger.spendForDay(today, entries),
+        projectSpend: ledger.spendForProject(cwd, entries), // the project is identified by cwd
+        agentDaySpend: { [options.agent]: ledger.spendForAgentDay(options.agent, today, entries) }, // seeded only for the agent being launched
+    });
+    const watcher = makeLiveSpendWatcher({ caps, onBreach: () => { } }); // no-op onBreach: the stdout tap in spawnAgent polls watcher.breached() instead
+    // Every usage event extracted during the run, kept so finalize() can write it to the ledger.
+    const seen = [];
+    const model = options.model ?? `${options.agent}-default`; // fallback model label when options.model is unset
+    return {
+        watcher,
+        extract: (chunk, pending) => { // chunk: decoded stdout text; pending: the `rest` returned by the previous call
+            const res = extractUsageEvents(chunk, pending, model, options.agent);
+            for (const ev of res.events) {
+                seen.push({
+                    model: ev.model ?? model, // prefer the model reported by the event itself
+                    usage: {
+                        inputTokens: ev.inputTokens,
+                        outputTokens: ev.outputTokens,
+                        cacheReadTokens: ev.cacheReadTokens,
+                        cacheCreationTokens: ev.cacheCreationTokens,
+                    },
+                });
+            }
+            return res; // passed through untouched; the caller reads { events, rest }
+        },
+        finalize: () => { // writes one ledger record per usage event seen during the run
+            for (const s of seen) {
+                ledger.recordSpend({
+                    runId,
+                    agent: options.agent,
+                    project: cwd,
+                    model: s.model,
+                    usage: s.usage,
+                    source: 'run',
+                });
+            }
+        },
+    };
+}
 /**
  * Patterns that indicate a rate/usage limit. Matching is intentionally broad
  * because providers phrase these differently -- Anthropic uses "5-hour limit"
@@ -733,6 +906,21 @@ export async function runWithFallback(options) {
     ];
     let prevAgent;
     let prevSessionId;
+    // Workflow capability scoping only takes effect on claude (buildExecCommand
+    // guards `--tools` / `--mcp-config` / `--strict-mcp-config` on agent==='claude').
+    // A fallback to any non-claude agent would run with NONE of that scoping — the
+    // declared sandbox silently evaporates. Warn loudly so a rate-limit handoff to
+    // an unscoped agent is never silent (issue #324 fail-open).
+    const scopingActive = (options.toolsRestrict && options.toolsRestrict.length > 0)
+        || !!options.mcpConfigPath;
+    if (scopingActive) {
+        const unscoped = options.fallback.filter(f => f.agent !== 'claude').map(f => f.agent);
+        if (unscoped.length > 0) {
+            process.stderr.write(`[agents] WARNING: workflow tool/MCP scoping is enforced on claude only. ` +
+                `Fallback agent(s) ${[...new Set(unscoped)].join(', ')} would run UNSCOPED ` +
+                `(no --tools / --strict-mcp-config restriction) if claude hits a rate limit.\n`);
+        }
+    }
     for (let i = 0; i < chain.length; i++) {
         const { agent, version } = chain[i];
         const pinnedSessionId = agent === 'claude' ? randomUUID() : undefined;

package/dist/lib/git.d.ts CHANGED Viewed

@@ -1,3 +1,21 @@
+/**
+ * Validate that a clone/pull source uses a safe git transport before it is
+ * handed to `git`.
+ *
+ * Git's remote-helper transports (`ext::`, `fd::`, …) execute arbitrary
+ * commands at clone time, `file://`/`git://` are unauthenticated, and a source
+ * beginning with `-` is parsed by `git` as a command-line flag (option
+ * injection). We therefore allow only:
+ *   - `https://`                         (encrypted + authenticated)
+ *   - `ssh://` and SCP-style `git@host:path` / `host:path`
+ *   - local filesystem paths (callers handle these before reaching `git clone`)
+ *
+ * Pure string inspection — no filesystem or platform calls — so it behaves
+ * identically on Linux, macOS, and Windows.
+ * @param source - Clone/pull source string to validate (URL, SCP-style remote, or local path).
+ * @throws Error if the source uses a disallowed transport.
+ */
+export declare function assertSafeGitTransport(source: string): void;
 /** Parsed representation of a git source string (GitHub, generic URL, or local path). */
 export interface GitSource {
     type: 'github' | 'url' | 'local';