npm - @loops-adk/core - Versions diffs - 0.1.1 → 0.3.0 - Mend

@loops-adk/core 0.1.1 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (28) hide show

package/README.md +132 -15
package/assets/logo.png +0 -0
package/dist/{agent-sdk-RF5VJZAT.js → agent-sdk-4QJDWM7N.js} +3 -3
package/dist/{agent-sdk-RF5VJZAT.js.map → agent-sdk-4QJDWM7N.js.map} +1 -1
package/dist/api.d.ts +119 -3
package/dist/api.js +26 -10
package/dist/api.js.map +1 -1
package/dist/{chunk-6BDWTFOS.js → chunk-3PMVII43.js} +784 -37
package/dist/chunk-3PMVII43.js.map +1 -0
package/dist/{chunk-XC46B4FD.js → chunk-MA6NDQMO.js} +2 -2
package/dist/chunk-MA6NDQMO.js.map +1 -0
package/dist/{claude-cli-U7WEVAOL.js → claude-cli-75AOQUKG.js} +3 -3
package/dist/{claude-cli-U7WEVAOL.js.map → claude-cli-75AOQUKG.js.map} +1 -1
package/dist/{codex-6I5UZ2HM.js → codex-LYZF52WL.js} +25 -13
package/dist/codex-LYZF52WL.js.map +1 -0
package/dist/env/command.d.ts +1 -1
package/dist/env/docker.d.ts +1 -1
package/dist/env/sst.d.ts +1 -1
package/dist/index.js +155 -14
package/dist/index.js.map +1 -1
package/dist/{types-B4wGVpqo.d.ts → types-CpB03Jj4.d.ts} +255 -38
package/package.json +11 -1
package/skills/author-loop/SKILL.md +14 -5
package/skills/design-agent-team/SKILL.md +108 -0
package/skills/supervise-loop-run/SKILL.md +64 -0
package/dist/chunk-6BDWTFOS.js.map +0 -1
package/dist/chunk-XC46B4FD.js.map +0 -1
package/dist/codex-6I5UZ2HM.js.map +0 -1

package/dist/{types-B4wGVpqo.d.ts → types-CpB03Jj4.d.ts} RENAMED Viewed

@@ -9,7 +9,7 @@
  * fixed provider set. (`mock` is constructed directly in tests/examples, not
  * registered by name, so it is intentionally not listed here.)
  */
-type EngineName = 'agent-sdk' | 'claude-cli' | 'anthropic-api' | (string & {});
+type EngineName = 'agent-sdk' | 'claude-cli' | 'codex' | 'anthropic-api' | (string & {});
 interface Usage {
     inputTokens: number;
     outputTokens: number;
@@ -30,7 +30,7 @@ interface AgentRequest {
      * disallow the sub-agent tool (`SUBAGENT_TOOLS`), so a branch of the graph bottoms out
      * here instead of expanding into an uncontrolled swarm — control over where work stops.
      * Authoritative over `allowedTools` (a disallow wins). Engines with no sub-agent tool
-     * (anthropic-api, mock) ignore it.
+     * (anthropic-api, codex, mock) ignore it.
      */
     leaf?: boolean;
 }
@@ -77,10 +77,10 @@ interface Engine {
 type EngineRef = EngineName | Engine;
 declare function isEngine(ref: EngineRef | undefined): ref is Engine;
 /**
- * How a tool-using engine (claude-cli / agent-sdk) treats permission prompts.
- * Mirrors the Claude Code values. `bypassPermissions` lets a headless worker
- * read/write/run without prompting — required for an unattended agent that must
- * touch the filesystem or shell, and to be set deliberately.
+ * How a tool-using engine treats permission prompts. Mirrors the Claude Code
+ * values. `bypassPermissions` lets a headless worker read/write/run without
+ * prompting — required for an unattended agent that must touch the filesystem or
+ * shell, and to be set deliberately.
  */
 type PermissionMode = 'default' | 'acceptEdits' | 'bypassPermissions' | 'plan' | 'dontAsk' | 'auto';
 /** Per-run options that the registry uses to construct engines. */
@@ -88,13 +88,14 @@ interface EngineOptions {
     /** Default model when a request/step does not name one. */
     defaultModel?: string;
     apiKey?: string;
-    /** For `claude-cli`: path to the binary (defaults to `claude` on PATH). */
+    /** For CLI-backed engines: path to the binary. */
     cliBinary?: string;
-    /** Extra args appended to the `claude` invocation. */
+    /** Extra args appended to CLI-backed engine invocations. */
     cliArgs?: string[];
     /**
-     * Permission mode for tool-using engines (claude-cli `--permission-mode`,
-     * agent-sdk `permissionMode`). Unset = the engine/CLI default (prompts).
+     * Permission mode for tool-using engines. Unset = the engine/CLI default
+     * where applicable; the Codex adapter stays read-only unless explicitly set
+     * to `bypassPermissions`.
      */
     permissionMode?: PermissionMode;
 }
@@ -199,6 +200,27 @@ interface Skill {
     /** The methodology instructions — prepended to the agent's system when it applies them. */
     instructions: string;
 }
+type AgentTier = 'worker' | 'reviewer' | 'lead' | 'specialist' | 'utility' | (string & {});
+type AgentSkillRef = string | Skill;
+interface AgentOutputContract {
+    /** Stable output name, such as `patch`, `review`, or `test-report`. */
+    name: string;
+    description?: string;
+    /** Optional structured schema owned by the loop author. Loops stores it but does not interpret it. */
+    schema?: unknown;
+}
+interface AgentHumanGate {
+    /** Stable gate name, such as `prod-approval` or `security-signoff`. */
+    name: string;
+    description?: string;
+    when?: string;
+}
+interface AgentFailureMode {
+    mode: string;
+    recovery: string;
+    detection?: string;
+    severity?: 'block' | 'should-fix' | 'nice-to-have' | (string & {});
+}
 interface AgentDef {
     /** Identity (also the default job label). */
     name: string;
@@ -216,15 +238,31 @@ interface AgentDef {
      * stop a thorough agent from quietly expanding into a slow, expensive swarm.
      */
     leaf?: boolean;
+    /** Contract tier for humans, `describe` output, and future discovery. No scheduling authority. */
+    tier?: AgentTier;
     /** Structured job descriptions (not prose) — for discovery / docs. */
     capabilities?: string[];
+    /** Structured outputs this agent is expected to produce. */
+    outputs?: AgentOutputContract[];
     /** Methodologies the agent applies; their instructions are folded into the system. */
     skills?: Skill[];
+    /** Skills the caller should supply before the turn. Metadata only unless also listed in `skills`. */
+    requiresSkills?: AgentSkillRef[];
+    /** Skills the agent is known to use. Metadata only unless also listed in `skills`. */
+    usesSkills?: AgentSkillRef[];
+    /** Human approvals or external handoffs this agent may need. Metadata only. */
+    humanGates?: AgentHumanGate[];
     /** Named failure modes + their recovery — first-class contracts, not buried prose. */
-    failureModes?: {
-        mode: string;
-        recovery: string;
-    }[];
+    failureModes?: AgentFailureMode[];
+}
+interface AgentContractSummary {
+    tier?: string;
+    capabilities?: string[];
+    outputs?: string[];
+    requiresSkills?: string[];
+    usesSkills?: string[];
+    humanGates?: string[];
+    failureModes?: string[];
 }
 /** Read a markdown file as a string — for `system` or skill `instructions`. Pass an
  *  absolute path, or `new URL('./x.md', import.meta.url)` for a path relative to the file. */
@@ -233,6 +271,7 @@ declare function fromFile(path: string | URL): string;
 declare function defineSkill(skill: Skill): Skill;
 /** Define an agent. Identity + validation; strongly typed (the wrapper around the md). */
 declare function defineAgent(def: AgentDef): AgentDef;
+declare function agentContract(agent: AgentDef | undefined): AgentContractSummary | undefined;
 /**
  * Resolve an agent's system prompt, folding in its skills' methodologies. This is what
  * `agentJob` hands the engine as `system`.
@@ -269,6 +308,17 @@ interface AgentJobConfig {
      * tool), so a branch bottoms out here. Falls back to the agent def's `leaf`.
      */
     leaf?: boolean;
+    /**
+     * Append the current `ctx.lastReview` / revision feedback to the prompt. This
+     * keeps implementation agents from having to remember to manually read the
+     * runtime feedback channel in every prompt function.
+     */
+    consumeFeedback?: boolean;
+    /**
+     * Append a small DAG-position block: this node, its direct dependencies, and
+     * its direct dependents, without handing the agent the whole orchestration graph.
+     */
+    graphContext?: boolean;
     /** Working dir for the turn. Default: the workspace dir (the worktree). */
     cwd?: string;
     timeoutMs?: number;
@@ -341,20 +391,122 @@ interface CommitJobConfig {
  * dropping the work's record.
  */
 declare function commitJob(config: CommitJobConfig): Job;
-/**
- * Build an `Outcome` that sends work back to an earlier dag node — real-team
- * feedback ("marketing found the contract drifted; re-run engineering"). Return
- * it from any job or `agentJob({ outcome })` mapper. The enclosing `dag` re-runs
- * `to` and its dependents with `reason` threaded in as `lastReview`, bounded by
- * `DagConfig.maxKickbacks`. Defaults to a `fail` status, so an unresolved
- * kickback (budget spent) leaves the dag failing honestly; override via `over`
- * (e.g. `{ status: 'pass' }`) when the kicking node's own work is fine and it is
- * only requesting an upstream revision.
- */
-declare function kickback(to: string, reason: string, over?: Partial<Outcome>): Outcome;
 /** A deterministic step from a plain function — for glue, checks, side effects. */
 declare function fnJob(label: string, fn: (ctx: JobContext) => Outcome | Promise<Outcome>): Job;
+/**
+ * No-progress (stall) detection — the third hard stop, alongside `max` and
+ * `budget`. `max` bounds how many attempts a loop gets and `budget` bounds what
+ * they cost; neither can tell "slow but real convergence" from "the same failure
+ * five turns running". This module supplies that sensor, so a doomed loop exits
+ * at iteration N+window instead of burning everything it was given.
+ *
+ * The decision rule is NOVELTY, not change. An iteration makes progress when it
+ * reaches a state this run has never seen:
+ *
+ *   - the workspace fingerprint (HEAD + pending diff + untracked content) is new
+ *     — so an agent oscillating A→B→A gets no credit for the return trip;
+ *   - a caller-supplied `signal` value is new — the escape hatch for loops whose
+ *     progress lives outside the worktree (a queue length, a passing-test count);
+ *   - the gate confidence beats its previous best by `minConfidenceDelta` — a
+ *     high-water mark, so judge jitter around a flat score is not progress but
+ *     slow, steady improvement accumulates until it clears the bar.
+ *
+ * `window` consecutive iterations with evidence and no novelty = stalled. The
+ * default is deliberately conservative (any channel's novelty counts): a false
+ * "stalled" on work that was actually converging is worse than one more
+ * iteration. An iteration with NO evidence channel at all (no git workspace, no
+ * confidence, no signal) is indeterminate — it neither extends nor resets the
+ * stall run, and the detector reports itself inert so the loop can warn once.
+ * Gate/review reasons are deliberately NOT compared: judge prose varies between
+ * identical verdicts, so it is quoted in the report but never used as evidence.
+ */
+interface NoProgressConfig {
+    /** Consecutive no-progress iterations before the loop stalls out. Default 3. */
+    window?: number;
+    /**
+     * How far the gate confidence must beat its previous best to count as
+     * progress (the high-water mark). Default 0.02.
+     */
+    minConfidenceDelta?: number;
+    /**
+     * A caller-supplied progress fingerprint for state the workspace cannot see
+     * (a queue length, a passing-test count, an external resource). Returning a
+     * value this run has already produced counts as no progress; `undefined`
+     * leaves the channel out of this iteration's evidence. A throw is a bug in
+     * the definition and fails the loop, like any other guarded user code.
+     */
+    signal?: (ctx: JobContext, last: Outcome | undefined) => string | number | undefined | Promise<string | number | undefined>;
+    /**
+     * Read the workspace fingerprint each iteration (a few git subprocesses).
+     * Default true; set false when a custom `signal` is the only honest channel.
+     */
+    workspace?: boolean;
+}
+/** What `LoopConfig.noProgress` accepts: a bare window, or the full config. */
+type NoProgressInput = number | NoProgressConfig;
+/** The evidence a stalled loop carries out — on the outcome and the event. */
+interface StallReport {
+    /** The configured window that was filled. */
+    window: number;
+    /** The consecutive no-progress iterations, in order. */
+    iterations: number[];
+    /** The last gate/review reason observed — what kept failing. */
+    reason: string;
+    /** Per-channel assessment of the tripping iteration. */
+    evidence: string[];
+}
+/** One completed, non-converged iteration as the tracker sees it. */
+interface ProgressSample {
+    iteration: number;
+    /** Workspace fingerprint, when the workspace is a git repo. */
+    fingerprint?: string;
+    /** The confidence that gated this turn (review ?? until ?? body). */
+    confidence?: number;
+    /** The custom signal value, when a `signal` fn is configured. */
+    signal?: string;
+    /** The gate/review reason — reporting only, never evidence. */
+    reason?: string;
+}
+/** Resolve the `noProgress` sugar (`3` ⇒ `{ window: 3 }`) with defaults applied. */
+declare function resolveNoProgress(input: NoProgressInput | undefined): Required<Pick<NoProgressConfig, 'window' | 'minConfidenceDelta'>> & NoProgressConfig | undefined;
+/**
+ * The novelty tracker behind `LoopConfig.noProgress`. Feed it one sample per
+ * non-converged iteration; it returns a `StallReport` the moment `window`
+ * consecutive samples show evidence and no novelty.
+ */
+declare class ProgressTracker {
+    readonly window: number;
+    readonly minConfidenceDelta: number;
+    /** Every state this run has reached, namespaced by channel. */
+    private readonly seen;
+    /** Confidence high-water mark — the best score at the last progress point. */
+    private best;
+    /** The current run of consecutive no-progress iterations. */
+    private stalledRun;
+    private lastEvidence;
+    private lastReason;
+    private indeterminate;
+    private sampled;
+    constructor(cfg: {
+        window: number;
+        minConfidenceDelta: number;
+    });
+    /**
+     * Record one iteration. Returns a `StallReport` when this sample fills the
+     * window, else undefined.
+     */
+    record(sample: ProgressSample): StallReport | undefined;
+    /**
+     * True when the detector has seen a full window of samples and none carried
+     * any evidence channel — detection is configured but cannot fire. The loop
+     * uses this to warn once instead of staying silently inert.
+     */
+    isInert(): boolean;
+}
 /**
  * The Environment provider — the third axis, after Engine (where the agent
  * thinks) and Workspace (where the code lives). Environment is where the code
@@ -556,18 +708,23 @@ interface Outcome {
     /** Present when `status` is driven by a failure. */
     error?: LoopError;
     /**
-     * A request to send work back to an earlier dag node (real-team feedback: a
-     * later stage that found a problem upstream). The enclosing `dag` re-runs `to`
-     * and its transitive dependents with `reason` threaded in as `lastReview`,
-     * bounded by `DagConfig.maxKickbacks` (default 0 — ignored). A feedback cycle
-     * is a loop boundary, not a backward edge: the graph stays acyclic and the
-     * re-run budget guarantees it terminates. Use the `kickback(to, reason)`
-     * helper to produce one.
-     */
-    kickback?: {
-        to: string;
-        reason: string;
-    };
+     * Present when a loop ended `exhausted` because its `noProgress` detector
+     * tripped: the evidence that the last `window` iterations reached no state
+     * the run had not already seen. Lets a supervisor tell "stalled, re-brief it"
+     * from "ran out of runway mid-progress" without parsing the summary.
+     */
+    stall?: StallReport;
+    /**
+     * Structured feedback asking an earlier unit of work for another pass, and the
+     * single channel for it. When `revision.target` is set, the enclosing `dag`
+     * re-runs that node and its transitive dependents with `revision.reason`
+     * threaded in as `lastReview`, bounded by `DagConfig.maxKickbacks` (default
+     * 0 — ignored). A feedback cycle is a loop boundary, not a backward edge: the
+     * graph stays acyclic and the re-run budget guarantees it terminates. Produce
+     * one with `revisionRequest({ target, findings })` or the terse
+     * `kickback(to, reason)`.
+     */
+    revision?: RevisionRequest;
 }
 type LogLevel = 'debug' | 'info' | 'warn' | 'error';
 /**
@@ -583,6 +740,33 @@ interface Workspace {
     /** The branch checked out in `dir`, when known (undefined on detached HEAD). */
     readonly branch?: string;
 }
+type FeedbackActionSeverity = 'block' | 'should-fix' | 'nice-to-have' | 'approve';
+/** `blocking` and `advisory` are legacy aliases kept for source compatibility. */
+type FeedbackSeverity = FeedbackActionSeverity | 'blocking' | 'advisory';
+type FeedbackDecision = 'accepted' | 'rejected' | 'deferred' | 'escalated';
+interface FeedbackFinding {
+    reviewer?: string;
+    severity?: FeedbackSeverity;
+    decision?: FeedbackDecision;
+    evidence: string;
+    recommendation?: string;
+}
+type RevisionRerun = 'target-and-dependents';
+interface RevisionRequest {
+    target?: string;
+    reason: string;
+    findings?: FeedbackFinding[];
+    rerun?: RevisionRerun;
+    source?: string;
+    decision?: FeedbackDecision;
+}
+interface GraphPosition {
+    dag: string;
+    node: string;
+    path: readonly string[];
+    needs: readonly string[];
+    dependents: readonly string[];
+}
 /**
  * Threaded into every `Job`. Carries the engine, the abort signal, the event
  * sink, a mutable scratchpad shared across the run, the workspace the work
@@ -612,6 +796,8 @@ interface JobContext {
     readonly depth: number;
     /** Loop/step names from the root down to here. */
     readonly path: readonly string[];
+    /** The current DAG node position, when this job is running inside a dag node. */
+    readonly graph?: GraphPosition;
     /** The previous body outcome in the enclosing loop (used by `review`/gates). */
     readonly lastOutcome?: Outcome;
     /** The most recent failed-review outcome, so a restart can act on it. */
@@ -680,6 +866,17 @@ interface LoopConfig {
     stopOn?: ConditionInput;
     /** Iteration cap. Reached without passing => `exhausted`. */
     max?: number;
+    /**
+     * The third hard stop, alongside `max` and `budget`: end the loop `exhausted`
+     * when this many consecutive iterations make no observable progress — no
+     * workspace state the run has not already visited, no custom `signal` value
+     * not already seen, no gate confidence beating its previous best. A bare
+     * number is the window (`3` ⇒ three flat iterations); pass a `NoProgressConfig`
+     * for the full knobs. Off by default: a polling loop legitimately makes no
+     * progress until the outside world changes, so this is opt-in like `commit`.
+     * The stalled outcome carries the evidence as `Outcome.stall`.
+     */
+    noProgress?: NoProgressInput;
     /**
      * Runs when `until` is met. If it returns `pass`, the loop completes.
      * Any other status re-enters the loop — this is the "review fails, run the
@@ -800,12 +997,26 @@ type LoopEvent = {
     ts: number;
     path: string[];
     outcome: Outcome;
+    /**
+     * Whether the loop will re-enter to act on a failing review (the review's
+     * revision was accepted), vs give up because it exhausted its iterations or
+     * `maxReviewRestarts`. Mirrors `dag:kickback`'s `accepted`. Only meaningful
+     * for a non-pass review; a downstream consumer that omits it (e.g. a test
+     * fixture) is treated as accepted.
+     */
+    accepted?: boolean;
 } | {
     kind: 'loop:end';
     ts: number;
     path: string[];
     outcome: Outcome;
     iterations: number;
+} | {
+    kind: 'loop:stall';
+    ts: number;
+    path: string[];
+    iteration: number;
+    report: StallReport;
 } | {
     kind: 'limit:wait';
     ts: number;
@@ -834,6 +1045,12 @@ type LoopEvent = {
     node: string;
     phase: NodePhase;
     outcome?: Outcome;
+    /**
+     * Which run of this node this is: 1 on the first pass, incremented each time
+     * a kickback re-runs it. Lets a records consumer tell a re-run's completion
+     * from the original and correlate it with the revision that caused it.
+     */
+    attempt?: number;
 } | {
     kind: 'dag:end';
     ts: number;
@@ -895,4 +1112,4 @@ type LoopEvent = {
     code: string;
 };
-export { commitJob as $, type AgentDef as A, type BudgetConfig as B, type ConditionInput as C, type DagConfig as D, type Environment as E, type Forge as F, GhForge as G, type OutcomeStatus as H, type PrPatch as I, type Job as J, type PrRef as K, type LoopConfig as L, type MergeOptions as M, type RetryPolicy as N, type Outcome as O, type PrInput as P, type Skill as Q, type RawPredicate as R, SUBAGENT_TOOLS as S, agentJob as T, type Usage as U, buildChecksArgs as V, type Workspace as W, buildCreateArgs as X, buildEditArgs as Y, buildMergeArgs as Z, buildViewArgs as _, type JobContext as a, defineAgent as a0, defineSkill as a1, fnJob as a2, fromFile as a3, isEngine as a4, isEnvironment as a5, isForge as a6, kickback as a7, resolveSystem as a8, type JobMeta as b, type EngineRef as c, type Condition as d, type EngineOptions as e, type Engine as f, type EngineName as g, type AgentRequest as h, type EngineEventSink as i, type AgentResult as j, type EnvHandle as k, type LoopEvent as l, type LimitPolicy as m, type AgentJobConfig as n, Budget as o, type CommitJobConfig as p, type ConditionResult as q, type DagNode as r, type EngineStreamEvent as s, type ForgeOpts as t, type GroundConfig as u, type LogLevel as v, LoopError as w, type LoopErrorCode as x, MockForge as y, type MockForgeOptions as z };
+export { type NoProgressInput as $, type AgentDef as A, type BudgetConfig as B, type ConditionInput as C, type DagConfig as D, type Environment as E, type FeedbackFinding as F, type GraphPosition as G, type CommitJobConfig as H, type ConditionResult as I, type Job as J, type DagNode as K, type LoopConfig as L, type EngineStreamEvent as M, type ForgeOpts as N, type Outcome as O, GhForge as P, type GroundConfig as Q, type RevisionRerun as R, type LogLevel as S, LoopError as T, type Usage as U, type LoopErrorCode as V, type Workspace as W, type MergeOptions as X, MockForge as Y, type MockForgeOptions as Z, type NoProgressConfig as _, type FeedbackDecision as a, type OutcomeStatus as a0, type PrInput as a1, type PrPatch as a2, type PrRef as a3, type ProgressSample as a4, ProgressTracker as a5, type RawPredicate as a6, type RetryPolicy as a7, SUBAGENT_TOOLS as a8, type Skill as a9, type StallReport as aa, agentContract as ab, agentJob as ac, buildChecksArgs as ad, buildCreateArgs as ae, buildEditArgs as af, buildMergeArgs as ag, buildViewArgs as ah, commitJob as ai, defineAgent as aj, defineSkill as ak, fnJob as al, fromFile as am, isEngine as an, isEnvironment as ao, isForge as ap, resolveNoProgress as aq, resolveSystem as ar, type FeedbackSeverity as b, type FeedbackActionSeverity as c, type JobContext as d, type RevisionRequest as e, type JobMeta as f, type EngineRef as g, type Condition as h, type EngineOptions as i, type Engine as j, type EngineName as k, type AgentRequest as l, type EngineEventSink as m, type AgentResult as n, type EnvHandle as o, type LoopEvent as p, type Forge as q, type LimitPolicy as r, type AgentContractSummary as s, type AgentFailureMode as t, type AgentHumanGate as u, type AgentJobConfig as v, type AgentOutputContract as w, type AgentSkillRef as x, type AgentTier as y, Budget as z };

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@loops-adk/core",
-  "version": "0.1.1",
+  "version": "0.3.0",
   "license": "MIT",
   "author": "Jonny Neill",
   "description": "Run an agent in a convergence loop with an honest done-gate. A small, nestable loop and DAG primitive: deterministic plus agent-judge conditions, git as memory, review-restart, budgets, and a live TUI.",
@@ -52,6 +52,7 @@
     "dist",
     "bin",
     "skills",
+    "assets",
     "README.md",
     "LICENSE"
   ],
@@ -64,7 +65,16 @@
     "typecheck": "tsc --noEmit",
     "test": "vitest run",
     "test:watch": "vitest",
+    "bench:ab": "tsx bench/ab.ts",
+    "bench:graph": "tsx bench/graph.ts",
+    "bench:signal": "BENCH_GRAPH_TASK=graph-tasks/stable-store-contract BENCH_OUT=bench/results-signal.json tsx bench/graph.ts",
+    "bench:compare": "tsx bench/compare.ts",
+    "bench:report": "tsx bench/report.ts",
+    "bench:report:sample": "tsx bench/report.ts bench/results.sample.json",
+    "bench:context:dry": "BENCH_DRY=1 BENCH_CB_GROUPS=bench/contextbench/groups.dry.json tsx bench/swecontextbench.ts",
+    "bench:mechanism": "tsx bench/mechanism.ts",
     "example:poll": "tsx src/index.ts run examples/simple-poll.loop.ts --no-tui",
+    "example:stall": "tsx src/index.ts run examples/stall-demo.loop.ts --no-tui",
     "example:gate": "tsx src/index.ts run examples/confidence-gate.loop.ts",
     "prepack": "npm run build",
     "prepublishOnly": "npm run typecheck"

package/skills/author-loop/SKILL.md CHANGED Viewed

@@ -97,21 +97,30 @@ dag({
 `needs` are dependencies; `optional` nodes never block; an unmet `when` skips a node; `isolation: 'worktree'` (on the dag) or `isolate: true` (per node) runs writers in parallel worktrees that land back on pass. `sequence` and `parallel` are sugar over `dag`.
+## Agents and feedback
+A node can be a named specialist instead of an inline prompt. Define it once with `defineAgent` (persona in markdown via `fromFile`, structure in TS) and hand it to `agentJob({ agent })`; `defineSkill` folds a methodology into its system. The contract fields (`tier`, `outputs`, `failureModes`, …) are metadata for `describe` and validation, not scheduling power: the `dag` orchestrates, agents stay workers.
+Review feedback is a structured revision request that flows back to the worker on one channel. In a loop, a failing `review` is threaded into the next turn as `ctx.lastReview`; set `consumeFeedback: true` and `agentJob` folds it into the prompt. Aggregate several reviewers with `reviewPanel`; route a fix back to an earlier dag node with a targeted `revisionRequest({ target, findings })` (or the terse `kickback(to, reason)`) when the dag's `maxKickbacks` allows it.
+Composing a team of specialists, gates, and routed feedback is its own skill: see `skills/design-agent-team/SKILL.md`.
 ## Author → validate → run
 ```bash
-loops validate path/to/feature.loop.ts   # offline pre-flight: loads + prints the shape, no model calls, no spend
-loops describe path/to/feature.loop.ts   # print the loop's shape (gate, body, nodes) without running
-loops run path/to/feature.loop.ts         # live Ink TUI
+loops validate path/to/feature.loop.ts     # offline pre-flight: loads + prints the shape, no model calls, no spend
+loops describe path/to/feature.loop.ts     # print the loop's shape (gate, body, nodes) without running
+loops describe path/to/feature.loop.ts --json # the same shape as JSON (incl. each agent node's contract)
+loops run path/to/feature.loop.ts          # live Ink TUI
 loops run path/to/feature.loop.ts --no-tui # plain streamed logs
-loops run path/to/feature.loop.ts --json   # NDJSON event stream (parse this from an agent)
+loops run path/to/feature.loop.ts --json   # raw NDJSON event firehose (to supervise a run, prefer --supervise + records, below)
 ```
 Always `loops validate` first. It imports and constructs the loop (catching syntax, import, and bad-export errors) without running it, so you fix authoring mistakes for free before spending a single agent turn. It also prints the loop's shape (its gate, body, and dag nodes), so you can confirm you built what you intended. `loops describe` prints that shape on its own.
 `loops run` works from any repo, including one that uses `loops` as a submodule or dependency. The recipe's folder must be an ES module scope (a `package.json` with `{"type":"module"}`); repos that consume `loops` already have this. If a load fails with an ES-module error, that scope is what is missing.
-Add `--supervise` to make a run observable from another process: it registers under `~/.loops/runs/`, and `loops list` / `loops status <runId>` / `loops tail <runId>` read its live state (shape, iteration, last gate verdict, tokens, events). Use this to watch a long run, or to supervise several at once.
+Add `--supervise` to make a run observable from another process: it registers under `~/.loops/runs/`. From an agent, the primary read API is `loops records <runId>`, the semantic decision stream (dispatch / completion / surfacing / revision), filterable with `--kind`, `--path`, `--last`, `--json`, rather than the raw `run --json` firehose. `loops tail <runId>` streams live events, `loops status <runId>` reports terminal state, and `loops list` enumerates runs. Watching a long run or supervising several at once is its own skill: see `skills/supervise-loop-run/SKILL.md`.
 ## Gotchas

package/skills/design-agent-team/SKILL.md ADDED Viewed

@@ -0,0 +1,108 @@
+---
+name: design-agent-team
+description: "Use when composing a team of specialist agents in a loops `dag`: defining an `AgentDef`, folding in `defineSkill` methodologies, wiring review feedback (`reviewPanel`/`consumeFeedback`/`revisionRequest`), and gating nodes so the graph orchestrates and the agents stay workers, never dispatchers. Load this before turning a loop into a multi-agent team."
+---
+# Designing an agent team
+A `dag` of specialist agents is a team. The load-bearing rule that keeps it a team and not a swarm:
+**The graph orchestrates; agents do not.** The `dag` is the manager (toposort + dispatch), `Condition`/`quorum` are the gates, `Outcome` is the result channel. An `AgentDef` is only the *contract*: who the agent is, what it may touch, how it works. It carries no scheduling authority. An agent produces an `Outcome`; the graph decides what runs next. Never build an agent whose job is to dispatch other agents; make the graph do it.
+**REQUIRED BACKGROUND:** you compose these agents into a loop/dag. Read `skills/author-loop/SKILL.md` first for the loop mental model, the honest gate, and git-memory.
+## Two builders: a skill is a method, an agent is a worker
+- `defineSkill({ name, instructions })` is a **methodology** (how to work: TDD, writing-plans). Prose only. A skill never dispatches an agent.
+- `defineAgent({ ... })` is a **worker**: a persona plus its contract. It *composes* skills; the skills' instructions fold into its system prompt.
+Persona and methodology live in editable markdown (`fromFile`); structure and types live in TS. The `.ts` is the typed wrapper around the `.md`.
+```ts
+import { defineAgent, defineSkill, fromFile, agentJob } from '@loops-adk/core';
+const tdd = defineSkill({
+  name: 'tdd',
+  instructions: fromFile(new URL('./skills/tdd.md', import.meta.url)),
+});
+const storeEngineer = defineAgent({
+  name: 'store-engineer',
+  system: fromFile(new URL('./agents/store-engineer.md', import.meta.url)), // persona, as markdown
+  model: 'sonnet',
+  tools: ['edit', 'bash'],        // the permission boundary
+  leaf: true,                     // may not spawn sub-agents; bottoms the branch out here
+  tier: 'worker',                 // contract metadata (no scheduling power)
+  capabilities: ['storage engine', 'id stability'],
+  outputs: [{ name: 'patch' }, { name: 'test-report' }],
+  skills: [tdd],                  // methodologies fold into the system prompt
+  requiresSkills: ['contract-first'], // metadata unless also in `skills`
+  usesSkills: ['small-diff'],
+  humanGates: [{ name: 'prod-approval', when: 'deploying production changes' }],
+  failureModes: [{ mode: 'tests-flaky', recovery: 'isolate the flake, retry once', severity: 'should-fix' }],
+});
+```
+`agentJob({ agent: storeEngineer, prompt, ground: true })` resolves the def into the engine request (`system` = persona + folded skills, plus `model`/`tools`). Inline `system`/`model`/`tools`/`allowedTools` on the `agentJob` still override the def. The contract fields beyond `system`/`model`/`tools` are **optional metadata** for validation, `loops describe`, docs, and future discovery. They change nothing at runtime; they do not grant dispatch authority.
+**`leaf` is the fan-out brake.** A leaf agent cannot spawn sub-agents (the engine withholds the sub-agent tool). Use it to stop a thorough worker from quietly expanding into a slow, expensive swarm. The team's shape stays the graph you drew, not one the agent invents.
+## Wire the team as a graph
+```ts
+import { dag, loop, agentJob, gateJob, quorum, agentCheck, commandSucceeds } from '@loops-adk/core';
+dag({
+  name: 'ship',
+  nodes: {
+    store:  loop({ name: 'store', body: agentJob({ agent: storeEngineer, prompt: 'Build the store to its tests.', ground: true }), until: commandSucceeds('npm', ['test']) }),
+    api:    { needs: ['store'], job: loop({ /* apiEngineer, same shape */ }) },
+    review: { needs: ['api'], job: gateJob('review', quorum(2,
+      agentCheck({ agent: securityReviewer, question: 'Is it safe?' }),
+      agentCheck({ agent: correctnessReviewer, question: 'Is it correct?' }),
+    )) },
+  },
+});
+```
+Each engineer is a Converge loop (build to a `test` gate); reviewers are gates. `quorum(k, ...)` is a k-of-n jury; `gateJob(name, condition)` turns a `Condition` into a `Job` so it can be a node. Because a reviewer is just an agent and `agentCheck` takes an `engine`/`model`, any reviewer runs on any model, so put the adversarial lens on a second model for a genuinely independent signal.
+## Feedback is a loop boundary, not a back-edge
+Review findings are structured, and they flow back to the worker on the same channel whether they come from a loop's `review` slot or a dag kickback.
+**In a loop:** a failing `review` outcome is threaded into the next body turn as `ctx.lastReview`. Set `consumeFeedback: true` so the worker reads it without you hand-writing "address the feedback" into every prompt:
+```ts
+const implement = agentJob({ agent: implementationAgent, prompt: brief, consumeFeedback: true });
+```
+**Aggregate several reviewers** with `reviewPanel`. Every reviewer is a gate: the panel passes when all of them clear or, with `pass: N` set, when N of them clear (k-of-n). An empty panel is a construction error. Give each reviewer real evidence with `reviewContext`:
+```ts
+import { reviewPanel, reviewContext, agentCheck } from '@loops-adk/core';
+const review = reviewPanel({
+  pass: 2, // optional: k-of-n instead of all
+  reviewers: [
+    { name: 'security',    review: agentCheck({ question: 'Is it safe?',    context: reviewContext({ diff: true, ledger: true }) }) },
+    { name: 'correctness', review: agentCheck({ question: 'Is it correct?', context: reviewContext({ tests: { command: 'npm', args: ['test'] } }) }) },
+    { name: 'simplicity',  review: agentCheck({ question: 'Is it simple?',  context: reviewContext({ files: ['src/**'] }) }) },
+  ],
+});
+```
+A failing panel emits a `revisionRequest` carrying each failing reviewer's concern as a finding, threaded into the next pass.
+**Route feedback across a DAG** with a targeted revision. When `DagConfig.maxKickbacks > 0`, a `revisionRequest({ target, findings })` (or the terse `kickback(to, reason)`) re-runs the target node and its transitive dependents, threading the reason in as their `lastReview`. Constrain valid targets with `DagNode.acceptsKickbackTo`. Because every cycle is a bounded re-run, not a graph edge, it always terminates.
+Set `graphContext: true` to give a worker just enough of a map to act on routed feedback without seeing the whole orchestration: it appends a small block naming this node, its direct dependencies, and its direct dependents.
+## Verify the contract before spending a turn
+```bash
+loops validate team.loop.ts          # loads + constructs, no model calls
+loops describe team.loop.ts --json   # the shape, incl. each agent node's contract (tier, outputs, failure modes)
+```
+`describe --json` reflects the contract you declared back at you, so you confirm the team you built is the team you meant. To watch or supervise the team once it runs, see `skills/supervise-loop-run/SKILL.md`.