npm - @themoltnet/pi-extension - Versions diffs - 0.14.0 → 0.15.1 - Mend

@themoltnet/pi-extension 0.14.0 → 0.15.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (3) hide show

package/dist/index.d.ts CHANGED Viewed

@@ -1,7 +1,11 @@
+import { AgentSession } from '@earendil-works/pi-coding-agent';
+import { Api } from '@earendil-works/pi-ai';
 import { BashOperations } from '@earendil-works/pi-coding-agent';
 import { connect } from '@themoltnet/sdk';
 import { EditOperations } from '@earendil-works/pi-coding-agent';
 import { ExtensionAPI } from '@earendil-works/pi-coding-agent';
+import { LoadSkillsResult } from '@earendil-works/pi-coding-agent';
+import { Model } from '@earendil-works/pi-ai';
 import { ReadOperations } from '@earendil-works/pi-coding-agent';
 import { Skill } from '@earendil-works/pi-coding-agent';
 import { Static } from '@sinclair/typebox';
@@ -27,6 +31,33 @@ import { WriteOperations } from '@earendil-works/pi-coding-agent';
  */
 export declare function activateAgentEnv(agentEnv: Record<string, string | undefined>, repoRoot: string): void;
+/**
+ * Construct an in-memory `AgentSession`. The caller is responsible for
+ * eventually invoking `session.prompt(...)` and for tearing down — the
+ * helper does no lifecycle management beyond construction.
+ *
+ * @param args - Construction inputs; see `BuildAgentSessionArgs` for
+ *   the meaning of each field.
+ * @returns A promise resolving to the constructed session.
+ */
+export declare function buildAgentSession(args: BuildAgentSessionArgs): Promise<AgentSession>;
+declare interface BuildAgentSessionArgs {
+    /** Host directory mounted at /workspace inside the VM. */
+    mountPath: string;
+    /** pi auth directory (resolved from `PI_CODING_AGENT_DIR` or `~/.pi/agent`). */
+    piAuthDir: string;
+    /** Resolved pi model handle (provider + model id). */
+    modelHandle: Model<Api>;
+    /** Pre-built customTools array. Caller composes Gondolin + MoltNet + submit tools. */
+    customTools: ToolDefinition[];
+    /** System-prompt fragments appended after pi's defaults. Parent passes the
+     *  runtime instructor; subagents pass their narrower variant. */
+    appendSystemPrompt: string[];
+    /** Callback returning the skills to advertise in `<available_skills>`.
+     *  When omitted, no skills are advertised. */
+    skillsOverride?: () => LoadSkillsResult;
+    /** Span attributes merged onto every OTel span the session emits. */
+    otelSpanAttrs: Record<string, string | number | boolean>;
+    /** Agent name for `gen_ai.agent.name` on the root span. */
+    agentName: string;
+}
 declare interface ClaimedTask {
     /** The claimed task payload itself. */
     task: Task;
@@ -83,6 +114,73 @@ export declare function createPiOtelExtension(options?: PiOtelOptions): (pi: Ext
  */
 export declare function createPiTaskExecutor(opts: ExecutePiTaskOptions): (claimedTask: ClaimedTask, reporter: TaskReporter) => Promise<TaskOutput>;
+/**
+ * Build the subagent custom tool for a parent session. The handle
+ * exposes the call counter so executors can emit summary telemetry
+ * when the parent terminates.
+ *
+ * @param args - Parent-session context the tool needs; see
+ *   `CreateSubagentToolArgs`.
+ * @returns A handle carrying the `ToolDefinition` to register plus a
+ *   `getCallCount` accessor.
+ */
+export declare function createSubagentTool(args: CreateSubagentToolArgs): SubagentToolHandle;
+export declare interface CreateSubagentToolArgs {
+    /** Host directory mounted at /workspace inside the VM. */
+    mountPath: string;
+    /** pi auth directory the parent resolved. */
+    piAuthDir: string;
+    /** Resolved pi model handle — subagents share it. */
+    modelHandle: Model<Api>;
+    /** Agent name for telemetry. */
+    agentName: string;
+    /**
+     * Custom tools every subagent inherits (Gondolin-routed
+     * Read/Write/Edit/Bash + moltnet_* tools, etc). MUST NOT include
+     * the parent's submit-output tool, the parent's `subagent` tool,
+     * or any other parent-only artefact — the caller is responsible
+     * for filtering. The subagent appends its own submit tool.
+     */
+    inheritedCustomTools: ToolDefinition[];
+    /**
+     * The parent runtime instructor verbatim. Subagents prepend it to
+     * their own short "you are a subagent" preamble so the same
+     * invariants (gh auth, diary discipline, accountable commits)
+     * apply if the subagent takes those actions. The parent's task
+     * description dictates whether they should.
+     */
+    parentRuntimeInstructor: string;
+    parentTaskId: string;
+    parentTaskType: string;
+    parentAttemptN: number;
+    /**
+     * Parent task's cancel signal. When the daemon cancels the parent
+     * task (operator cancel or task-level `runningTimeoutSec` expiry),
+     * each in-flight subagent's inner `session.abort()` is invoked so
+     * it tears down promptly instead of running until its own LLM
+     * call resolves. Mirrors the existing `wireSessionAbort` pattern
+     * the parent session uses.
+     *
+     * Optional only because the test seam can omit it; production
+     * callers (executePiTask) pass `reporter.cancelSignal`.
+     */
+    parentCancelSignal?: AbortSignal;
+    /**
+     * Per-call fallback timeout. Defends against an inner session that
+     * ignores `abort()` for any reason (LLM provider stuck, tool call
+     * hanging on I/O, etc.). When the timeout fires, `session.abort()`
+     * is invoked and the tool returns `isError: true` with a
+     * `subagent_timed_out` reason the parent LLM can recover from.
+     *
+     * Accepted values:
+     *   - omitted or negative — the default of 5 minutes applies;
+     *   - `0` — the timeout is disabled and cancellation relies purely
+     *     on `parentCancelSignal`;
+     *   - positive — used as the timeout (presumably milliseconds,
+     *     per the field name — confirm against the implementation).
+     */
+    timeoutMs?: number;
+    /**
+     * Test seam. Production callers leave this undefined and get
+     * `buildAgentSession` from the factory module. Tests inject a mock
+     * that returns a stub session implementing only `prompt()` to
+     * exercise the tool's logic without booting a VM.
+     */
+    buildAgentSession?: (args: BuildAgentSessionArgs) => Promise<AgentSession>;
+}
 /**
  * Ensure a cached snapshot exists, building one if needed.
  * Returns the absolute path to the qcow2 checkpoint file.
@@ -279,6 +377,17 @@ export declare interface SandboxConfig {
         /** Overlay disk size (default '3G'). */
         overlaySize?: string;
     };
+    /** Shell commands to run on every VM resume, after platform setup
+     *  (TLS, DNS, git safe.directory, tmpfs node_modules) and before
+     *  the agent session starts. Use for per-session bootstrap that
+     *  doesn't belong baked into the snapshot.
+     *
+     *  Not included in the snapshot cache key — changes here apply on
+     *  every resume without triggering a snapshot rebuild. Each command
+     *  runs in a fresh shell with `set -eu` and `set -o pipefail`; a
+     *  non-zero exit (including from any segment of a pipeline) aborts
+     *  resume with the failing command's stderr/stdout tail. */
+    resumeCommands?: string[];
     /** VFS shadow settings — hide host paths from the guest. */
     vfs?: {
         /** Paths (relative to workspace root) to shadow from the host mount. */
@@ -300,6 +409,29 @@ export declare interface SandboxConfig {
 /** Extract snapshot-specific config for backwards compat with ensureSnapshot. */
 export declare type SnapshotConfig = NonNullable<SandboxConfig['snapshot']>;
+export declare interface SubagentToolHandle {
+    /** ToolDefinition to register via `customTools` on the parent session. */
+    readonly tool: ToolDefinition;
+    /** Returns how many times the parent LLM has called this tool so far. */
+    getCallCount: () => number;
+}
+/**
+ * Parameters shape the parent LLM sees when calling the subagent tool.
+ *
+ *   - `task`         — natural-language instructions for the subagent.
+ *                      The parent authors this per call. Must be
+ *                      non-empty.
+ *   - `output_schema` — name of a registered SubagentOutputContract.
+ *                      Resolved at call time; unknown names error.
+ *
+ * Declared twice under one name: the `const` is the TypeBox schema
+ * object, and the `type` alias is the static type derived from it
+ * via `Static<typeof ...>`.
+ */
+export declare const SubagentToolParameters: TObject<{
+    task: TString;
+    output_schema: TString;
+}>;
+export declare type SubagentToolParameters = Static<typeof SubagentToolParameters>;
 /**
  * The Task promise body.
  *

package/dist/index.js CHANGED Viewed

@@ -2515,11 +2515,12 @@ function createCryptoNamespace(context, signingRequests) {
 function createDiariesNamespace(context) {
 	const { client, auth } = context;
 	return {
-		async list(query) {
+		async list(query, headers) {
 			return unwrapResult(await listDiaries({
 				client,
 				auth,
-				query
+				query,
+				headers
 			}));
 		},
 		async create(body, headers) {
@@ -8177,6 +8178,27 @@ var BASE_ALLOWED_HOSTS = [
 	"*.googlesource.com"
 ];
 /**
+* Run a shell command in the guest and throw if it fails. Mirror of
+* `run()` in `snapshot.ts` for the resume-side hook chain — every
+* setup step is essential to a healthy session, so a silent non-zero
+* exit (e.g. a mount that fails into the FUSE write path, or a
+* consumer-provided resume command that fails to install pnpm) must
+* surface immediately rather than fall through to cryptic agent
+* errors later.
+*
+* The command is prefixed with `set -eu` and `set -o pipefail` and
+* executed through `sh -c`.
+*
+* @param vm - VM handle; only its `exec()` method is used here.
+* @param label - Step name interpolated into the error message.
+* @param command - Shell source to run after the `set` prelude.
+* @throws {Error} When the exit code is non-zero. The message carries
+*   the last 800 characters of stderr and stdout joined in that order.
+*   NOTE(review): because the tail is cut after joining, 800 or more
+*   characters of stdout push stderr out of the message entirely.
+*/
+async function vmRun(vm, label, command) {
+	const wrapped = `set -eu\nset -o pipefail\n${command}`;
+	const r = await vm.exec([
+		"sh",
+		"-c",
+		wrapped
+	]);
+	if (r.exitCode !== 0) {
+		const tail = [r.stderr, r.stdout].filter(Boolean).join("\n").slice(-800);
+		throw new Error(`resume step "${label}" failed (exit ${r.exitCode}):\n${tail}`);
+	}
+}
+/**
 * Resume a VM from a checkpoint, inject credentials, configure egress +
 * TLS. Returns the managed VM handle.
 */
@@ -8236,8 +8258,9 @@ async function resumeVm(config) {
     update-ca-certificates 2>/dev/null
     cat /etc/gondolin/mitm/ca.crt >> /etc/ssl/certs/ca-certificates.crt
   '`);
-	await vm.exec(`sh -c 'echo "nameserver 8.8.8.8
-nameserver 1.1.1.1" > /etc/resolv.conf'`);
+	await vmRun(vm, "DNS resolvers", `printf 'nameserver 8.8.8.8\\nnameserver 1.1.1.1\\n' > /etc/resolv.conf`);
+	await vmRun(vm, "git safe.directory", `git config --system --add safe.directory '*'`);
+	for (const [i, cmd] of (config.sandboxConfig?.resumeCommands ?? []).entries()) await vmRun(vm, `resumeCommands[${i}]`, cmd);
 	const vmSshDir = `${vmAgentDir}/ssh`;
 	await vm.exec(`mkdir -p ${vmAgentDir}/ssh /home/agent/.pi/agent`);
 	if (creds.piAuthJson !== null) await vm.fs.writeFile("/home/agent/.pi/agent/auth.json", creds.piAuthJson, { mode: 384 });
@@ -8580,59 +8603,37 @@ function extractUsage(message) {
 	};
 }
 //#endregion
-//#region ../agent-runtime/src/context-bindings.ts
-var PROMPT_SEPARATOR = "\n\n---\n\n";
+//#region src/runtime/agent-session-factory.ts
+var NO_SKILLS = () => ({
+	skills: [],
+	diagnostics: []
+});
 /**
-* Resolve `task.input.context[]` into delivered side-effects (skills
-* persisted via `deliver.skill`) and prompt fragments
-* (`systemPromptPrefix`, `userInlineSuffix`) the caller weaves into the
-* built prompt.
-*
-* Per-binding semantics (V1):
-*   - `skill`         → `deliver.skill({ slug, content })` once per ref.
-*                       Slug collisions on distinct contents are
-*                       refused loudly.
-*   - `prompt_prefix` → content appended to `systemPromptPrefix` with
-*                       the canonical `\n\n---\n\n` separator (in
-*                       declared order).
-*   - `user_inline`   → content appended to `userInlineSuffix` in
-*                       declared order, same separator.
-*
-* No fetching, no hashing — bytes are inlined in `ContextRef.content`,
-* and the task's `inputCid` already pins the entire input. The imposer
-* chose these bytes; the resolver just dispatches them.
-*
-* The function is pure with respect to its arguments: file writes are
-* confined to the injected `deliver` callback, which makes the
-* resolver trivial to test.
+* Construct an in-memory `AgentSession`. The caller is responsible for
+* eventually invoking `session.prompt(...)` and for tearing down — the
+* helper does no lifecycle management beyond construction.
 */
-async function resolveTaskContext(args) {
-	const promptParts = [];
-	const userParts = [];
-	const injected = [];
-	const usedSlugs = /* @__PURE__ */ new Map();
-	for (const ref of args.context) {
-		if (ref.binding === "skill") {
-			const prior = usedSlugs.get(ref.slug);
-			if (prior !== void 0) {
-				if (prior !== ref.content) throw new Error(`slug collision on '${ref.slug}': two skill entries share the same slug but have different content`);
-				injected.push(ref);
-				continue;
-			}
-			usedSlugs.set(ref.slug, ref.content);
-			await args.deliver.skill({
-				slug: ref.slug,
-				content: ref.content
-			});
-		} else if (ref.binding === "prompt_prefix") promptParts.push(ref.content);
-		else userParts.push(ref.content);
-		injected.push(ref);
-	}
-	return {
-		injected,
-		systemPromptPrefix: promptParts.join(PROMPT_SEPARATOR),
-		userInlineSuffix: userParts.join(PROMPT_SEPARATOR)
-	};
+async function buildAgentSession(args) {
+	const piOtelExtension = createPiOtelExtension({
+		agentName: args.agentName,
+		spanAttributes: args.otelSpanAttrs
+	});
+	const resourceLoader = new DefaultResourceLoader({
+		cwd: args.mountPath,
+		agentDir: args.piAuthDir,
+		extensionFactories: [piOtelExtension],
+		appendSystemPrompt: args.appendSystemPrompt,
+		skillsOverride: args.skillsOverride ?? NO_SKILLS
+	});
+	await resourceLoader.reload();
+	return (await createAgentSession({
+		agentDir: args.piAuthDir,
+		cwd: args.mountPath,
+		model: args.modelHandle,
+		customTools: args.customTools,
+		sessionManager: SessionManager.inMemory(),
+		resourceLoader
+	})).session;
 }
 //#endregion
 //#region ../tasks/src/formats.ts
@@ -8851,7 +8852,7 @@ unchanged" is.
 * (server-side schema check). Self-assessment is a truthful self-rating,
 * NOT enforcement — `verification.passed=false` does not block /complete
 * and does not affect `acceptedAttemptN`. See
-* `docs/agent-runtime.md` for the full producer/judge flow.
+* `docs/understand/agent-runtime.md` for the full producer/judge flow.
 *
 * **Binding evaluation** (judgment tasks: `assess_brief`, `judge_pack`).
 * A separate task that IS the application of `successCriteria` to
@@ -9008,6 +9009,39 @@ var AssessBriefOutput = Type$1.Object({
 	$id: "AssessBriefOutput",
 	additionalProperties: false
 });
+/**
+* Async preflight (#1096):
+*   - `targetTaskId` resolves to a real task the caller can see.
+*   - The target is a `fulfill_brief` (you cannot grade an arbitrary
+*     task type as if it were a brief fulfillment).
+*   - The target is `completed` with an accepted attempt — grading
+*     an in-flight or failed task would either race or grade nothing.
+*
+* Agent-distinctness ("assessor ≠ producer") is a runtime / auth-
+* layer concern and intentionally NOT checked here. It belongs in
+* an auth-aware claim-time check.
+*
+* @param input - Task input; only `targetTaskId` is read.
+* @param ctx - Lookup context; only `ctx.resolveTask(id)` is used.
+* @returns A list of `{ field, message }` errors, empty when every
+*   check passes. A target that does not resolve short-circuits with
+*   that single error; otherwise the type and status checks are both
+*   evaluated and may each add an error.
+*/
+async function validateAssessBriefInputAsync(input, ctx) {
+	const { targetTaskId } = input;
+	const errors = [];
+	const target = await ctx.resolveTask(targetTaskId);
+	if (!target) {
+		errors.push({
+			field: "targetTaskId",
+			message: `targetTaskId ${targetTaskId} does not resolve to a task you can read`
+		});
+		return errors;
+	}
+	if (target.taskType !== "fulfill_brief") errors.push({
+		field: "targetTaskId",
+		message: `targetTaskId ${targetTaskId} is a ${target.taskType}, not a fulfill_brief`
+	});
+	if (target.status !== "completed" || target.acceptedAttemptN === null) errors.push({
+		field: "targetTaskId",
+		message: `targetTaskId ${targetTaskId} is not completed with an accepted attempt (status=${target.status}, acceptedAttemptN=${target.acceptedAttemptN})`
+	});
+	return errors;
+}
 //#endregion
 //#region ../tasks/src/task-types/curate-pack.ts
 /**
@@ -9206,6 +9240,311 @@ function validateJudgePackOutput(output) {
 	}
 	return null;
 }
+/**
+* Async preflight (#1096):
+*   - `renderedPackId` resolves to a rendered_packs row.
+*   - `sourcePackId` resolves to a context_packs row.
+*   - The rendered pack actually came from the claimed source pack —
+*     `renderedPack.sourcePackId === input.sourcePackId`. Without
+*     this check a judge can be tricked into grading rendering A as
+*     if it came from source B.
+*
+* Both lookups are issued together through `Promise.all`. The
+* linkage check only runs when both rows resolved, and it compares
+* against the resolved source row's `id`.
+*
+* @param input - Task input; `renderedPackId` and `sourcePackId` are
+*   read.
+* @param ctx - Lookup context providing `resolveRenderedPack(id)` and
+*   `resolveContextPack(id)`.
+* @returns A list of `{ field, message }` errors; empty when every
+*   check passes.
+*/
+async function validateJudgePackInputAsync(input, ctx) {
+	const { renderedPackId, sourcePackId } = input;
+	const errors = [];
+	const [rendered, source] = await Promise.all([ctx.resolveRenderedPack(renderedPackId), ctx.resolveContextPack(sourcePackId)]);
+	if (!rendered) errors.push({
+		field: "renderedPackId",
+		message: `renderedPackId ${renderedPackId} does not resolve to a rendered pack you can read`
+	});
+	if (!source) errors.push({
+		field: "sourcePackId",
+		message: `sourcePackId ${sourcePackId} does not resolve to a context pack you can read`
+	});
+	if (rendered && source && rendered.sourcePackId !== source.id) errors.push({
+		field: "sourcePackId",
+		message: `renderedPack ${renderedPackId} was produced from source ${rendered.sourcePackId}, not from sourcePackId=${sourcePackId}`
+	});
+	return errors;
+}
+//#endregion
+//#region ../tasks/src/task-types/judge-eval-variant.ts
+/**
+* `judge_eval_variant` — score N variants of a `run_eval` scenario
+* against a single rubric, in one pass, with per-variant subagent
+* isolation.
+*
+* output_kind: judgment
+* criteria: required (`successCriteria.rubric` — same envelope shape as
+*   `judge_pack` / `assess_brief`)
+* references: not required at the input layer — `runTaskIds` already
+*   pin the targets being graded.
+*
+* Slice 2 of #943. The parent task carries the rubric and the list of
+* variant `run_eval` task ids. The pi executor registers the generic
+* `subagent` custom tool (#1087), and the parent LLM calls
+* `subagent({ task, output_schema: 'judge_eval_variant_result' })` once
+* per variant — each child session has fresh context, fetches the
+* variant's accepted attempt output via `moltnet_get_task` /
+* `moltnet_list_task_attempts`, and grades against the rubric.
+*
+* Reuses `JudgePackScore` from `judge_pack` for per-criterion scoring
+* (Lane 1 binary via `llm_checklist`, Lane 2 graded via `llm_score`,
+* deterministic_*) — the score shape is the same across judgment
+* tasks; only the wrapping (per-variant grouping + deltas) differs.
+*
+* Cross-task input invariants — "all targets share the same
+* correlation_id, all are `run_eval`, all are completed with an
+* accepted attempt, all share byte-identical `input.successCriteria`"
+* — REQUIRE async DB lookups and live in `validateInputAsync` below,
+* which the task service runs at create time (#1096 wiring). The
+* TypeBox layer here only enforces shape: UUID format,
+* minItems/maxItems (2–10 `runTaskIds`), rubric presence + weight
+* invariant.
+*/
+var JUDGE_EVAL_VARIANT_TYPE = "judge_eval_variant";
+var JudgeEvalVariantInput = Type$1.Object({
+	runTaskIds: Type$1.Array(Type$1.String({ format: "uuid" }), {
+		minItems: 2,
+		maxItems: 10
+	}),
+	successCriteria: SuccessCriteria
+}, {
+	$id: "JudgeEvalVariantInput",
+	additionalProperties: false
+});
+/**
+* Per-variant grading. `scores[]` shape is identical to `JudgePackScore`
+* (mode-aware: binary via `llm_checklist`, graded via `llm_score`,
+* deterministic_*). Reuse the type rather than re-declare.
+*
+* This is also the **subagent output contract** — the parent's
+* `subagent` tool resolves the contract name `judge_eval_variant_result`
+* to this schema. See `agent-runtime`'s subagent contract registry.
+*
+* `variantLabel` may not contain the sequence " - " (space, hyphen,
+* space): the `pattern` below rejects it, and `deltas` keys in
+* `JudgeEvalVariantOutput` use that sequence as the separator between
+* two labels.
+*
+* `JudgeEvalVariantOutput` wraps at least two such results, plus
+* optional `deltas` (values in [-1, 1]), an optional `judgeModel`, and
+* a required `traceparent`.
+*/
+var JudgeEvalVariantResult = Type$1.Object({
+	runTaskId: Type$1.String({ format: "uuid" }),
+	variantLabel: Type$1.String({
+		minLength: 1,
+		maxLength: 64,
+		pattern: "^(?!.* - ).*$"
+	}),
+	scores: Type$1.Array(JudgePackScore, { minItems: 1 }),
+	composite: Type$1.Number({
+		minimum: 0,
+		maximum: 1
+	}),
+	verdict: Type$1.String({ minLength: 1 })
+}, {
+	$id: "JudgeEvalVariantResult",
+	additionalProperties: false
+});
+var JudgeEvalVariantOutput = Type$1.Object({
+	results: Type$1.Array(JudgeEvalVariantResult, { minItems: 2 }),
+	deltas: Type$1.Optional(Type$1.Record(Type$1.String(), Type$1.Number({
+		minimum: -1,
+		maximum: 1
+	}))),
+	judgeModel: Type$1.Optional(Type$1.String({ minLength: 1 })),
+	traceparent: Type$1.String({ minLength: 1 })
+}, {
+	$id: "JudgeEvalVariantOutput",
+	additionalProperties: false
+});
+/**
+* Synchronous input invariants beyond TypeBox shape: rubric must be
+* present (already required by the schema, but the rubric body has
+* its own per-criterion weight invariant) and the rubric's weights
+* must sum to 1.
+*
+* Cross-task invariants (all targets are `run_eval`, all completed,
+* share `correlation_id`, byte-identical `input.successCriteria`)
+* are NOT checked here — they require async DB lookups against
+* `runTaskIds` and live in `validateJudgeEvalVariantInputAsync`
+* below, invoked by the task service at create time (#1096).
+*
+* @param input - Candidate task input; only `successCriteria` is read.
+* @returns An error string when `successCriteria` or its `rubric` is
+*   missing; otherwise whatever `validateRubricWeights` returns for
+*   the rubric (presumably `null` on success — defined elsewhere).
+*/
+function validateJudgeEvalVariantInput(input) {
+	const sc = input.successCriteria;
+	if (!sc) return "successCriteria is required for judge_eval_variant";
+	if (!sc.rubric) return "successCriteria.rubric is required for judge_eval_variant";
+	return validateRubricWeights(sc.rubric);
+}
+/**
+* Output cross-field invariants the schema cannot express:
+*
+*   1. `results.length === input.runTaskIds.length` — every variant
+*      the imposer asked for must be graded. Partial grading
+*      invalidates cross-variant comparison; fail the whole task
+*      rather than silently report a subset.
+*
+*   2. `results[i].runTaskId === input.runTaskIds[i]` — order is
+*      load-bearing for downstream consumers (e.g. deltas keyed by
+*      adjacent pairs). Mismatch is an LLM bug; reject loudly.
+*
+*   3. Each `result.scores` follows the same `llm_checklist` rule
+*      `judge_pack` enforces (#999): if a score has an `assertions`
+*      array, the numeric score MUST be `1` iff every assertion
+*      passes. Inconsistent payloads pollute attestations.
+*
+*   4. Each `result.composite` MUST equal the rubric-weighted sum
+*      `Σ(weight_j × scores[j].score)`. The parent (and any subagent
+*      it delegated to) is supposed to compute this; surfacing a
+*      drift here catches LLMs that hand-wave the arithmetic.
+*
+*   5. Optional `deltas` keys MUST be of the form `"A - B"` where
+*      both `A` and `B` are variantLabels present in `results`.
+*      Values are not range-checked (any float in [-1, 1] is
+*      arithmetically possible).
+*
+* Checks 1 and 2 run only when `input` is supplied, and check 4 only
+* when the input carries `successCriteria.rubric`. Check 4 tolerates
+* an absolute drift of 0.001.
+*
+* NOTE(review): a `criterionId` that appears twice in one result, or
+* a rubric criterion that is never scored, is not rejected here —
+* only ids unknown to the rubric are — even though the error text
+* asks for every criterion "exactly once". Confirm whether that is
+* intended.
+*
+* @param output - Candidate task output (assumed to already match
+*   the `JudgeEvalVariantOutput` shape).
+* @param input - The task's input, when available.
+* @returns `null` when every check passes, otherwise a message
+*   describing the first violation found.
+*/
+function validateJudgeEvalVariantOutput(output, input) {
+	const out = output;
+	const inp = input;
+	if (inp) {
+		if (out.results.length !== inp.runTaskIds.length) return `results.length (${out.results.length}) does not match input.runTaskIds.length (${inp.runTaskIds.length}). Every variant must be graded; partial grading is rejected.`;
+		for (let i = 0; i < out.results.length; i++) if (out.results[i].runTaskId !== inp.runTaskIds[i]) return `results[${i}].runTaskId (${out.results[i].runTaskId}) does not match input.runTaskIds[${i}] (${inp.runTaskIds[i]}). Order must align with input for downstream delta computation.`;
+	}
+	for (let r = 0; r < out.results.length; r++) {
+		const result = out.results[r];
+		for (let s = 0; s < result.scores.length; s++) {
+			const sc = result.scores[s];
+			if (!sc.assertions) continue;
+			const allPassed = sc.assertions.every((a) => a.passed);
+			const expected = allPassed ? 1 : 0;
+			if (sc.score !== expected) return `results[${r}].scores[${s}] (criterionId="${sc.criterionId}"): assertions ${allPassed ? "all pass" : "have at least one fail"} but score=${sc.score}. Score must be derived: 1 iff every assertion passes, else 0 (#999 llm_checklist rule).`;
+		}
+	}
+	if (inp?.successCriteria?.rubric) {
+		const criteria = inp.successCriteria.rubric.criteria;
+		const weightById = new Map(criteria.map((c) => [c.id, c.weight]));
+		for (let r = 0; r < out.results.length; r++) {
+			const result = out.results[r];
+			let sum = 0;
+			for (const sc of result.scores) {
+				const w = weightById.get(sc.criterionId);
+				if (w === void 0) return `results[${r}].scores: criterionId "${sc.criterionId}" is not in the input rubric (known: ${Array.from(weightById.keys()).join(", ")}). Score every rubric criterion exactly once; do not invent new ids.`;
+				sum += w * sc.score;
+			}
+			if (Math.abs(sum - result.composite) > .001) return `results[${r}].composite (${result.composite}) does not match Σ(weight × score) (${sum.toFixed(6)}). Composite must be the rubric-weighted sum of per-criterion scores (drift > 0.001).`;
+		}
+	}
+	if (out.deltas) {
+		const labels = new Set(out.results.map((r) => r.variantLabel));
+		for (const key of Object.keys(out.deltas)) {
+			const m = /^(.+?) - (.+)$/.exec(key);
+			if (!m) return `deltas key "${key}" is not of the form "<variantLabel-A> - <variantLabel-B>". Use a single space-hyphen-space separator between labels.`;
+			const [, a, b] = m;
+			if (!labels.has(a) || !labels.has(b)) return `deltas key "${key}" references variantLabel(s) not present in results: ${!labels.has(a) ? `"${a}" missing` : ""}${!labels.has(a) && !labels.has(b) ? ", " : ""}${!labels.has(b) ? `"${b}" missing` : ""}`;
+		}
+	}
+	return null;
+}
+/**
+* Local stable-stringify for cross-variant `successCriteria` byte-
+* equality. Recursively sorts object keys; arrays preserve order
+* (intentional — rubric criteria order is semantically meaningful).
+* Mirrors the canonical-JSON shape `crypto-service` uses for CIDs,
+* without taking on a crypto-service dep just for this comparison.
+*/
+function stableStringify(value) {
+	if (value === null || typeof value !== "object") return JSON.stringify(value);
+	if (Array.isArray(value)) return "[" + value.map(stableStringify).join(",") + "]";
+	const obj = value;
+	return "{" + Object.keys(obj).sort().map((k) => JSON.stringify(k) + ":" + stableStringify(obj[k])).join(",") + "}";
+}
+/**
+* Async preflight for `judge_eval_variant` (#1096 + #943):
+*
+*  1. Every `runTaskIds[i]` resolves to a task the caller can read.
+*  2. Every resolved task is `taskType === 'run_eval'`.
+*  3. Every resolved task is `status === 'completed'` with a
+*     non-null `acceptedAttemptN` — grading an unaccepted attempt
+*     races with re-attempts and pollutes the judge attestation.
+*  4. Every resolved task shares a non-null `correlationId`, and all
+*     `correlationId`s are equal. Without this an imposer could
+*     fabricate a "variant set" by stapling unrelated runs together.
+*  5. The shared `correlationId` is NOT already sealed. A previous
+*     judge_eval_variant against the same group is final; produce a
+*     fresh correlation_id for a new judging round rather than
+*     adding contradictory verdicts to a sealed group.
+*  6. Every variant's `input.successCriteria` is byte-identical (via
+*     stable-stringify). Different rubrics across "variants" makes
+*     the comparison meaningless.
+*
+* Evaluation is staged. If any target fails to resolve, the errors
+* collected so far are returned and checks 4–6 are skipped. Likewise
+* any error from checks 2–4 returns before the seal lookup, so
+* checks 5 and 6 only run on an otherwise clean set. Check 6 reports
+* only the first variant that differs from `runTaskIds[0]`.
+*
+* NOTE(review): the same id listed twice in `runTaskIds` is not
+* rejected by any check here — confirm whether that is intended.
+*
+* @param input - Task input; only `runTaskIds` is read.
+* @param ctx - Lookup context providing `resolveTask(id)` and
+*   `findCorrelationSeal(correlationId)`.
+* @returns A list of `{ field, message }` errors; empty when the
+*   variant set is valid.
+*/
+async function validateJudgeEvalVariantInputAsync(input, ctx) {
+	const { runTaskIds } = input;
+	const errors = [];
+	const resolved = await Promise.all(runTaskIds.map((id) => ctx.resolveTask(id)));
+	let missingTargets = false;
+	const presentTargets = [];
+	for (let i = 0; i < runTaskIds.length; i++) {
+		const t = resolved[i];
+		if (!t) {
+			missingTargets = true;
+			errors.push({
+				field: `runTaskIds[${i}]`,
+				message: `runTaskIds[${i}]=${runTaskIds[i]} does not resolve to a task you can read`
+			});
+			continue;
+		}
+		presentTargets.push(t);
+		if (t.taskType !== "run_eval") errors.push({
+			field: `runTaskIds[${i}]`,
+			message: `runTaskIds[${i}]=${runTaskIds[i]} is a ${t.taskType}, not a run_eval`
+		});
+		if (t.status !== "completed" || t.acceptedAttemptN === null) errors.push({
+			field: `runTaskIds[${i}]`,
+			message: `runTaskIds[${i}]=${runTaskIds[i]} is not completed with an accepted attempt (status=${t.status}, acceptedAttemptN=${t.acceptedAttemptN})`
+		});
+	}
+	if (missingTargets || presentTargets.length === 0) return errors;
+	const correlationIds = new Set(presentTargets.map((t) => t.correlationId ?? "__null__"));
+	if (correlationIds.has("__null__")) errors.push({
+		field: "runTaskIds",
+		message: "one or more run_eval targets have no correlation_id; cannot group as variants"
+	});
+	if (correlationIds.size > 1) errors.push({
+		field: "runTaskIds",
+		message: `run_eval targets span multiple correlation_ids (${Array.from(correlationIds).join(", ")}); variants must share one`
+	});
+	if (errors.length > 0) return errors;
+	const correlationId = presentTargets[0].correlationId;
+	if (!correlationId) return errors;
+	const seal = await ctx.findCorrelationSeal(correlationId);
+	if (seal) errors.push({
+		field: "runTaskIds",
+		message: `correlation_id ${correlationId} is already sealed by ${seal.sealedByTaskType}/${seal.sealedByTaskId} at ${seal.sealedAt}; use a fresh correlation_id for a new judging round`
+	});
+	const first = stableStringify(presentTargets[0].input.successCriteria);
+	for (let i = 1; i < presentTargets.length; i++) if (stableStringify(presentTargets[i].input.successCriteria) !== first) {
+		errors.push({
+			field: `runTaskIds[${i}]`,
+			message: `runTaskIds[${i}] has a different input.successCriteria than runTaskIds[0]; all variants must share the rubric and gates`
+		});
+		break;
+	}
+	return errors;
+}
+/**
+* Side effect emitted on successful `judge_eval_variant` create:
+* seal the shared correlation_id atomically with the insert. The
+* task service applies the seal in the same transaction; a
+* concurrent second `judge_eval_variant` against the same group
+* loses the race and is rejected with a clean conflict error.
+*
+* The seal applies to the SHARED correlation_id of the targets —
+* NOT to the judge task's own correlationId (which is typically
+* null or distinct). The task service derives the correlationId
+* for the effect from the resolved targets, not from the judge
+* task row.
+*
+* @param input - Task input; only `runTaskIds[0]` is looked up.
+* @param ctx - Lookup context; only `ctx.resolveTask(id)` is used.
+* @returns A one-element list holding a `sealCorrelation` effect for
+*   the first target's `correlationId`, or an empty list when that
+*   target does not resolve or has no `correlationId`.
+*/
+async function onCreateJudgeEvalVariant(input, ctx) {
+	const { runTaskIds } = input;
+	const first = await ctx.resolveTask(runTaskIds[0]);
+	if (!first?.correlationId) return [];
+	return [{
+		kind: "sealCorrelation",
+		correlationId: first.correlationId
+	}];
+}
 //#endregion
 //#region ../tasks/src/task-types/render-pack.ts
 /**
@@ -9245,6 +9584,18 @@ var RenderPackOutput = Type$1.Object({
 	$id: "RenderPackOutput",
 	additionalProperties: false
 });
+/**
+* Async preflight (#1096): `packId` resolves to a context_packs row
+* the caller can read.
+*
+* @param input - Task input; only `packId` is read.
+* @param ctx - Lookup context; only `ctx.resolveContextPack(id)` is
+*   used.
+* @returns A single-element error list when the lookup yields a
+*   falsy value, otherwise an empty list.
+*/
+async function validateRenderPackInputAsync(input, ctx) {
+	const { packId } = input;
+	if (!await ctx.resolveContextPack(packId)) return [{
+		field: "packId",
+		message: `packId ${packId} does not resolve to a context pack you can read`
+	}];
+	return [];
+}
 //#endregion
 //#region ../tasks/src/task-types/run-eval.ts
 /**
@@ -9352,7 +9703,8 @@ var BUILT_IN_TASK_TYPES = {
 		outputSchema: AssessBriefOutput,
 		outputKind: "judgment",
 		requiresReferences: true,
-		validateInput: validateJudgmentInput
+		validateInput: validateJudgmentInput,
+		validateInputAsync: validateAssessBriefInputAsync
 	},
 	[CURATE_PACK_TYPE]: {
 		name: CURATE_PACK_TYPE,
@@ -9368,7 +9720,8 @@ var BUILT_IN_TASK_TYPES = {
 		outputSchema: RenderPackOutput,
 		outputKind: "artifact",
 		requiresReferences: false,
-		validateOutput: requireVerificationWhenCriteriaPresent
+		validateOutput: requireVerificationWhenCriteriaPresent,
+		validateInputAsync: validateRenderPackInputAsync
 	},
 	[JUDGE_PACK_TYPE]: {
 		name: JUDGE_PACK_TYPE,
@@ -9377,7 +9730,8 @@ var BUILT_IN_TASK_TYPES = {
 		outputKind: "judgment",
 		requiresReferences: true,
 		validateInput: validateJudgmentInput,
-		validateOutput: validateJudgePackOutput
+		validateOutput: validateJudgePackOutput,
+		validateInputAsync: validateJudgePackInputAsync
 	},
 	[RUN_EVAL_TYPE]: {
 		name: RUN_EVAL_TYPE,
@@ -9386,6 +9740,18 @@ var BUILT_IN_TASK_TYPES = {
 		outputKind: "artifact",
 		requiresReferences: false,
 		validateOutput: validateRunEvalOutput
+	},
+	[JUDGE_EVAL_VARIANT_TYPE]: {
+		name: JUDGE_EVAL_VARIANT_TYPE,
+		inputSchema: JudgeEvalVariantInput,
+		outputSchema: JudgeEvalVariantOutput,
+		outputKind: "judgment",
+		requiresReferences: false,
+		validateInput: validateJudgeEvalVariantInput,
+		validateOutput: validateJudgeEvalVariantOutput,
+		validateInputAsync: validateJudgeEvalVariantInputAsync,
+		onCreate: onCreateJudgeEvalVariant,
+		usesSubagents: true
 	}
 };
 //#endregion
@@ -9440,6 +9806,15 @@ function validateTaskOutput(taskType, output, input) {
 function getTaskOutputSchema(taskType) {
 	return getTaskTypeEntry(taskType)?.outputSchema ?? null;
 }
+/**
+* Whether sessions running this task type should have the generic
+* `subagent` custom tool registered. Returns `false` for unknown task
+* types and for task types that didn't opt in. See `TaskTypeEntry`
+* for the design rationale.
+*
+* Only a literal `usesSubagents === true` on the entry counts as
+* opting in; a missing entry, a missing flag, or any other value
+* yields `false`.
+*
+* @param taskType - Task type name passed to `getTaskTypeEntry`.
+* @returns `true` only when the entry exists and opted in.
+*/
+function taskTypeUsesSubagents(taskType) {
+	return getTaskTypeEntry(taskType)?.usesSubagents === true; // strict comparison: truthy non-boolean values do not count
+}
 //#endregion
 //#region ../tasks/src/wire.ts
 /**
@@ -9676,6 +10051,133 @@ Type$1.Object({
 	additionalProperties: false
 });
 //#endregion
+//#region ../agent-runtime/src/subagent-output-contracts.ts
+var REGISTRY = /* @__PURE__ */ new Map(); // contract name -> contract object
+/**
+* Register a subagent output contract. Idempotent: re-registering the
+* same name with a different schema throws — contracts are meant to
+* be stable. Re-registering with the identical contract object (same
+* reference) is a no-op for HMR and test convenience.
+*
+* Typically called at module-init time alongside task-type
+* registration. See task-types/index.ts in @moltnet/tasks for the
+* conventional pattern.
+*
+* Schemas are compared by reference (`!==`), not structurally. A
+* different contract object that reuses the same `parametersSchema`
+* reference is accepted and replaces the stored entry.
+*
+* @param contract - Object whose `name` and `parametersSchema` are
+*   read here; it is stored as-is under `contract.name`.
+* @throws Error when the name is empty or whitespace-only, is not
+*   lower_snake_case, or is already registered with a different schema.
+*/
+function registerSubagentOutputContract(contract) {
+	if (!contract.name || contract.name.trim().length === 0) throw new Error("subagent output contract name is required");
+	if (!/^[a-z][a-z0-9_]*$/.test(contract.name)) throw new Error(`subagent output contract name '${contract.name}' must be lower_snake_case (starts with a letter, then [a-z0-9_]+)`);
+	const existing = REGISTRY.get(contract.name);
+	if (existing && existing !== contract) { // same name, different object: only a schema change is refused
+		if (existing.parametersSchema !== contract.parametersSchema) throw new Error(`subagent output contract '${contract.name}' is already registered with a different schema; refusing to override`);
+	}
+	REGISTRY.set(contract.name, contract); // also runs for an identical re-registration, which leaves the map unchanged
+}
+/**
+* Resolve a subagent output contract by name. Returns `null` for
+* unknown names — callers (the subagent custom tool) decide whether
+* that's a tool error the parent LLM can recover from or a hard fail.
+*
+* @param name - Contract name used as the registry key.
+* @returns The registered contract object, or `null` when the lookup
+*   finds nothing.
+*/
+function getSubagentOutputContract(name) {
+	return REGISTRY.get(name) ?? null; // normalise the Map's `undefined` miss to `null`
+}
+/**
+* List all registered contracts. Useful for diagnostics and for the
+* subagent tool's parameter description so a parent LLM can see what
+* contracts are available without enumerating them in its prompt.
+*
+* @returns A new array holding the registry's current values; the
+*   contract objects themselves are not copied.
+*/
+function listSubagentOutputContracts() {
+	return [...REGISTRY.values()]; // spread into a fresh array so callers cannot mutate the Map through it
+}
+//#endregion
+//#region ../agent-runtime/src/built-in-contract-registrations.ts
+/**
+* Built-in subagent output contracts (#1087, #943).
+*
+* Why this is an exported function and not a module-init side
+* effect:
+*
+*   - The registry is process-global. Module-init registration
+*     fires exactly once per Node process (ESM modules are cached
+*     by URL). Tests that call `__resetSubagentOutputContractsForTests()`
+*     to start from an empty registry have no way to repopulate
+*     the built-ins without re-evaluating the module — which the
+*     cache prevents. PR #1101 review M4.
+*   - An explicit `registerBuiltInSubagentContracts()` lets the
+*     package index call it once at module load AND lets test
+*     setup hooks call it again after `__reset...`.
+*   - `registerSubagentOutputContract` is itself idempotent for
+*     identical re-registrations, so calling this function twice
+*     in the same process is safe.
+*
+* Adding a new built-in: extend the body of this function. Do not
+* call `registerSubagentOutputContract` from anywhere else in the
+* package — keeping all built-ins in one function makes the set
+* auditable.
+*
+* In this bundle the function registers a single contract,
+* `judge_eval_variant_result`, whose schema is `JudgeEvalVariantResult`,
+* and is invoked once immediately after its definition.
+*/
+function registerBuiltInSubagentContracts() {
+	registerSubagentOutputContract({
+		name: "judge_eval_variant_result",
+		description: "Per-variant grading result produced by a subagent of judge_eval_variant: scores against the shared rubric, composite, and a 1-3 sentence verdict for a single variant.",
+		parametersSchema: JudgeEvalVariantResult
+	});
+}
+registerBuiltInSubagentContracts(); // populate the built-ins as soon as this module region is evaluated
+//#endregion
+//#region ../agent-runtime/src/context-bindings.ts
+var PROMPT_SEPARATOR = "\n\n---\n\n"; // joins consecutive prompt fragments
+/**
+* Resolve `task.input.context[]` into delivered side-effects (skills
+* persisted via `deliver.skill`) and prompt fragments
+* (`systemPromptPrefix`, `userInlineSuffix`) the caller weaves into the
+* built prompt.
+*
+* Per-binding semantics (V1):
+*   - `skill`         → `deliver.skill({ slug, content })` once per ref.
+*                       Slug collisions on distinct contents are
+*                       refused loudly.
+*   - `prompt_prefix` → content appended to `systemPromptPrefix` with
+*                       the canonical `\n\n---\n\n` separator (in
+*                       declared order).
+*   - `user_inline`   → content appended to `userInlineSuffix` in
+*                       declared order, same separator.
+*
+* No fetching, no hashing — bytes are inlined in `ContextRef.content`,
+* and the task's `inputCid` already pins the entire input. The imposer
+* chose these bytes; the resolver just dispatches them.
+*
+* The function is pure with respect to its arguments: file writes are
+* confined to the injected `deliver` callback, which makes the
+* resolver trivial to test.
+*
+* Implementation notes:
+*   - A repeated `skill` ref whose slug and content both match an
+*     earlier one is not delivered again, but is still listed in
+*     `injected`.
+*   - Any binding other than `skill` / `prompt_prefix` falls through
+*     to the `user_inline` branch.
+*
+* @param args - `{ context, deliver }`: the ordered context refs and
+*   the delivery callbacks (`deliver.skill` is awaited once per new slug).
+* @returns `{ injected, systemPromptPrefix, userInlineSuffix }`; each
+*   string is empty when no ref contributed to it.
+* @throws Error when two `skill` refs share a slug but differ in content.
+*/
+async function resolveTaskContext(args) {
+	const promptParts = [];
+	const userParts = [];
+	const injected = [];
+	const usedSlugs = /* @__PURE__ */ new Map(); // slug -> content of the first skill ref seen with that slug
+	for (const ref of args.context) {
+		if (ref.binding === "skill") {
+			const prior = usedSlugs.get(ref.slug);
+			if (prior !== void 0) {
+				if (prior !== ref.content) throw new Error(`slug collision on '${ref.slug}': two skill entries share the same slug but have different content`);
+				injected.push(ref); // exact duplicate: record it, skip the second delivery
+				continue;
+			}
+			usedSlugs.set(ref.slug, ref.content);
+			await args.deliver.skill({ // awaited inline, so deliveries happen one at a time in declared order
+				slug: ref.slug,
+				content: ref.content
+			});
+		} else if (ref.binding === "prompt_prefix") promptParts.push(ref.content);
+		else userParts.push(ref.content); // NOTE(review): catches every remaining binding value, not only "user_inline" — presumably the schema limits bindings to these three; confirm
+		injected.push(ref);
+	}
+	return {
+		injected,
+		systemPromptPrefix: promptParts.join(PROMPT_SEPARATOR),
+		userInlineSuffix: userParts.join(PROMPT_SEPARATOR)
+	};
+}
+//#endregion
 //#region ../agent-runtime/src/output-tools.ts
 /**
 * Submit-output tool contract.
@@ -10148,6 +10650,109 @@ function buildFulfillBriefUserPrompt(input, ctx) {
 	].filter(Boolean).join("\n");
 }
 //#endregion
+//#region ../agent-runtime/src/prompts/judge-eval-variant.ts
+/**
+* Build the first user-message prompt for a `judge_eval_variant` task
+* (#943 Slice 2).
+*
+* The parent agent's job is **fan-out-and-collect**: for each
+* `runTaskIds[i]`, spawn an isolated subagent via the `subagent` custom
+* tool (#1087), have it grade that variant against the shared rubric,
+* and collect each subagent's structured `judge_eval_variant_result`
+* payload. The parent does NOT grade itself; it composes the per-
+* variant results into the final `judge_eval_variant` output (results
+* array + optional deltas + verdicts).
+*
+* Isolation is the point: each variant gets a fresh subagent session
+* with no carryover context from sibling variants, so per-variant
+* grading is independent. Cost is bounded by `maxItems: 10` on
+* runTaskIds.
+*
+* @param input - Task input; reads `runTaskIds` and
+*   `successCriteria.rubric` (`preamble`, `criteria[]` with `id`,
+*   `weight`, `scoring`, `description`).
+* @param ctx - Supplies `taskId` and `diaryId` for the prompt header.
+* @returns The prompt as a single newline-joined string.
+* @throws Error when `successCriteria.rubric` is missing.
+*/
+function buildJudgeEvalVariantUserPrompt(input, ctx) {
+	const { runTaskIds, successCriteria } = input;
+	const rubric = successCriteria.rubric;
+	if (!rubric) throw new Error("judge_eval_variant requires successCriteria.rubric — none present");
+	const escapeCell = (s) => s.replace(/\\/g, "\\\\").replace(/\|/g, "\\|").replace(/\r?\n/g, " "); // escape backslashes and pipes, turn line breaks into spaces, so the text stays inside one table cell
+	const criteriaTable = rubric.criteria.map((c) => `| \`${c.id}\` | ${c.weight.toFixed(3)} | ${c.scoring} | ${escapeCell(c.description)} |`).join("\n"); // only `description` is escaped; `id` and `scoring` are interpolated as-is
+	const targetsBlock = runTaskIds.map((id, i) => `${i + 1}. \`${id}\``).join("\n"); // 1-based numbered list of target task ids
+	const finalOutputBlock = buildFinalOutputBlock({
+		taskType: "judge_eval_variant",
+		outputSchemaName: "JudgeEvalVariantOutput",
+		shapeSketch: [
+			"{",
+			"  \"results\": [",
+			"    {",
+			"      \"runTaskId\": \"<runTaskIds[i]>\",",
+			"      \"variantLabel\": \"<from variant input>\",",
+			"      \"scores\": [ { \"criterionId\": \"...\", \"score\": 0..1, \"rationale\": \"...\", \"assertions\": [...]? } ],",
+			"      \"composite\": <Σ(weight × score), 0..1>,",
+			"      \"verdict\": \"<1-3 sentences>\"",
+			"    },",
+			"    ...one entry per runTaskIds[i], same order",
+			"  ],",
+			"  \"deltas\": { \"<labelA> - <labelB>\": <composite(A) - composite(B)> },  // optional",
+			"  \"judgeModel\": \"<id>\",  // optional",
+			"  \"traceparent\": \"<from claim>\"",
+			"}"
+		].join("\n")
+	});
+	return [
+		"# Judge Eval Variants\n",
+		`You are grading ${runTaskIds.length} variants of a single run_eval scenario`,
+		"against ONE shared rubric. Your job is fan-out-and-collect — you do not",
+		"grade yourself.",
+		"",
+		`Task id: \`${ctx.taskId}\``,
+		`Diary: \`${ctx.diaryId}\``,
+		"",
+		"### Targets (variants to grade)",
+		"",
+		targetsBlock,
+		"",
+		"Each target is a completed `run_eval` task in the same correlation group.",
+		"Read its accepted attempt via `moltnet_get_task` / `moltnet_list_task_attempts`",
+		"to see the producer's output before grading.",
+		"",
+		"### Rubric",
+		"",
+		rubric.preamble ? `${rubric.preamble}\n` : "", // an absent preamble becomes "" and is dropped by the filter below
+		"| Criterion | Weight | Scoring | Description |",
+		"| --- | --- | --- | --- |",
+		criteriaTable,
+		"",
+		"### How to grade",
+		"",
+		"For EACH `runTaskIds[i]`:",
+		"",
+		"1. Call the `subagent` custom tool with:",
+		"   - `task`: a brief instructing the subagent to grade ONLY that variant",
+		"     against the rubric above; include the target task id and the rubric",
+		"     verbatim. The subagent has the same MoltNet tools and can fetch the",
+		"     accepted attempt output independently.",
+		"   - `output_schema`: `\"judge_eval_variant_result\"`",
+		"2. Receive the subagent's structured `judge_eval_variant_result` payload.",
+		"3. Append it to your `results[]` array, **in the same order as input.runTaskIds**.",
+		"",
+		"Do NOT score any variant in your own session. The whole point of the",
+		"subagent fan-out is per-variant context isolation — grading two variants",
+		"back-to-back in one session lets the second be biased by the first.",
+		"",
+		"### Composite arithmetic",
+		"",
+		"Each `composite` MUST equal `Σ(criterion.weight × score)` over the rubric",
+		"criteria. Drift > 0.001 is rejected. Subagents are instructed to compute it",
+		"themselves; double-check before assembling the final output.",
+		"",
+		"### Deltas (optional)",
+		"",
+		"If useful, populate `deltas` with pairwise composite differences keyed by",
+		"`\"<variantLabel-A> - <variantLabel-B>\"` (single space-hyphen-space). Both",
+		"labels must appear in `results`. Omit `deltas` entirely if not used.",
+		"",
+		finalOutputBlock
+	].filter((s) => s !== "").join("\n"); // NOTE(review): this drops every "" element, so the blank separator lines written above never reach the output — confirm that is intended and not only meant for the empty preamble
+}
+//#endregion
 //#region ../agent-runtime/src/prompts/judge-pack.ts
 function buildJudgePackUserPrompt(input, ctx) {
 	const { renderedPackId, sourcePackId, successCriteria } = input;
@@ -10454,6 +11059,15 @@ function buildTaskUserPrompt(task, ctx) {
 				diaryId: ctx.diaryId,
 				taskId: ctx.taskId
 			});
+		case JUDGE_EVAL_VARIANT_TYPE:
+			if (!Value.Check(JudgeEvalVariantInput, task.input)) {
+				const errors = [...Value.Errors(JudgeEvalVariantInput, task.input)];
+				throw new Error(`judge_eval_variant input failed validation: ${JSON.stringify(errors.slice(0, 3))}`);
+			}
+			return buildJudgeEvalVariantUserPrompt(task.input, {
+				diaryId: ctx.diaryId,
+				taskId: ctx.taskId
+			});
 		case RUN_EVAL_TYPE:
 			if (!Value.Check(RunEvalInput, task.input)) {
 				const errors = [...Value.Errors(RunEvalInput, task.input)];
@@ -14128,6 +14742,190 @@ function buildRuntimeInstructor(ctx) {
 	].join("\n");
 }
 //#endregion
+//#region src/runtime/subagent-tool.ts
+var SUBAGENT_SUBMIT_TOOL_NAME = "submit_subagent_output"; // name of the per-call tool a subagent uses to hand back its payload
+/**
+* Parameters shape the parent LLM sees when calling the subagent tool.
+*
+*   - `task`         — natural-language instructions for the subagent.
+*                      The parent authors this per call. Must be
+*                      non-empty.
+*   - `output_schema` — name of a registered SubagentOutputContract.
+*                      Resolved at call time; unknown names error.
+*
+* Both fields are strings with `minLength: 1`, and the object is
+* declared with `additionalProperties: false`.
+*/
+var SubagentToolParameters = Type$1.Object({
+	task: Type$1.String({
+		minLength: 1,
+		description: "Natural-language instructions for the subagent. The subagent starts with a fresh conversation and a narrowed system prompt; this is the only context it has from you."
+	}),
+	output_schema: Type$1.String({
+		minLength: 1,
+		description: "Name of a registered subagent output contract. The subagent must submit a structured payload via `submit_subagent_output` matching this contract."
+	})
+}, { additionalProperties: false });
+var DEFAULT_SUBAGENT_TIMEOUT_MS = 300 * 1e3; // 300 000 ms = 5 minutes
+/**
+* Build the subagent custom tool for a parent session. The handle
+* exposes the call counter so executors can emit summary telemetry
+* when the parent terminates.
+*
+* Each `execute` call validates its parameters, resolves the named
+* output contract, builds a fresh inner session with a per-call
+* `submit_subagent_output` tool, prompts it with `task`, and returns
+* the captured payload as JSON text. Validation, lookup, prompt,
+* abort and missing-submission failures come back as `toolError(...)`
+* results; a rejection from the session factory is not caught here.
+*
+* @param args - Parent-session wiring: `mountPath`, `piAuthDir`,
+*   `modelHandle`, `agentName`, `inheritedCustomTools`,
+*   `parentRuntimeInstructor`, `parentTaskId`, `parentTaskType`,
+*   `parentAttemptN`, and optionally `parentCancelSignal`, `timeoutMs`
+*   and `buildAgentSession` (session factory override; defaults to the
+*   module-level `buildAgentSession`).
+* @returns `{ tool, getCallCount }` — the defined tool plus a getter
+*   for the number of calls that got past validation and contract lookup.
+*/
+function createSubagentTool(args) {
+	const buildSession = args.buildAgentSession ?? buildAgentSession; // injectable session factory
+	let callCount = 0;
+	return {
+		tool: defineTool({
+			name: "subagent",
+			label: "Delegate to subagent",
+			description: subagentToolDescription(),
+			parameters: SubagentToolParameters,
+			async execute(_id, params) {
+				if (!Value.Check(SubagentToolParameters, params)) return toolError(`subagent: invalid parameters: ${JSON.stringify([...Value.Errors(SubagentToolParameters, params)].slice(0, 3))}`); // report at most the first three schema errors
+				const { task, output_schema } = params;
+				const contract = getSubagentOutputContract(output_schema);
+				if (!contract) return toolError(`subagent: unknown output_schema "${output_schema}". Registered contracts: [${listSubagentOutputContracts().map((c) => c.name).join(", ")}]`);
+				callCount += 1; // counted before the session is built, so failed builds and aborted runs are included
+				const callIndex = callCount;
+				let captured = null; // set by the inner submit tool on the first valid payload
+				const submitTool = defineTool({
+					name: SUBAGENT_SUBMIT_TOOL_NAME,
+					label: `Submit ${output_schema}`,
+					description: `Submit your structured output for this subagent task. Call exactly once when done. Args MUST match the ${output_schema} contract; mismatches return a tool error you can recover from in the same session.`,
+					parameters: contract.parametersSchema,
+					async execute(_innerId, innerParams) {
+						if (!Value.Check(contract.parametersSchema, innerParams)) return toolError(`submit_subagent_output: schema validation failed: ${[...Value.Errors(contract.parametersSchema, innerParams)].slice(0, 3).map((e) => `${e.path}: ${e.message}`).join("; ")}. Re-call with a corrected payload.`);
+						captured = innerParams;
+						return {
+							content: [{
+								type: "text",
+								text: "Output captured. Subagent session will terminate; no further action needed."
+							}],
+							details: { captured: true },
+							terminate: true // presumably tells the host to end the inner session after this result — confirm against the tool API
+						};
+					}
+				});
+				const subagentInstructor = buildSubagentInstructor({
+					contractName: output_schema,
+					contractDescription: contract.description,
+					parentTaskId: args.parentTaskId,
+					callIndex
+				});
+				const session = await buildSession({ // NOTE(review): outside the try below, so a rejection propagates out of execute() instead of becoming a toolError
+					mountPath: args.mountPath,
+					piAuthDir: args.piAuthDir,
+					modelHandle: args.modelHandle,
+					agentName: args.agentName,
+					customTools: [...args.inheritedCustomTools, submitTool], // the `subagent` tool itself is not passed here
+					appendSystemPrompt: [args.parentRuntimeInstructor, subagentInstructor],
+					skillsOverride: () => ({ // subagents always get an empty skill list
+						skills: [],
+						diagnostics: []
+					}),
+					otelSpanAttrs: {
+						"moltnet.task.id": args.parentTaskId,
+						"moltnet.task.type": args.parentTaskType,
+						"moltnet.task.attempt": args.parentAttemptN,
+						"moltnet.subagent.contract": output_schema,
+						"moltnet.subagent.index": callIndex
+					}
+				});
+				let abortReason = null;
+				let abortInvoked = false;
+				const fireAbort = (reason) => { // first caller wins; later timeout/cancel triggers are ignored
+					if (abortInvoked) return;
+					abortInvoked = true;
+					abortReason = reason;
+					session.abort().catch((err) => { // fire-and-forget: an abort failure is only logged to stderr
+						const message = err instanceof Error ? err.message : String(err);
+						process.stderr.write(`[subagent] inner session.abort() failed: ${message}\n`);
+					});
+				};
+				const cancelListener = args.parentCancelSignal ? (() => { // IIFE: subscribes to the parent signal and returns the matching unsubscribe function
+					const signal = args.parentCancelSignal;
+					const listener = () => fireAbort("parent_cancelled");
+					if (signal.aborted) listener(); // NOTE(review): already-cancelled parent — abort fires now, yet session.prompt() below is still called and its result then discarded; confirm whether the prompt should be skipped
+					else signal.addEventListener("abort", listener, { once: true });
+					return () => signal.removeEventListener("abort", listener);
+				})() : null;
+				const timeoutMs = args.timeoutMs === void 0 || args.timeoutMs < 0 ? DEFAULT_SUBAGENT_TIMEOUT_MS : args.timeoutMs; // undefined or negative => default; 0 is kept and disables the timer below
+				const timeoutHandle = timeoutMs > 0 ? setTimeout(() => fireAbort("subagent_timed_out"), timeoutMs) : null;
+				try {
+					await session.prompt(task);
+				} catch (err) {
+					return toolError(`subagent: inner session.prompt() threw: ${err instanceof Error ? err.message : String(err)}`); // NOTE(review): returned even when an abort was the cause, so the timeout/cancel message below is not used in that case
+				} finally {
+					if (timeoutHandle) clearTimeout(timeoutHandle);
+					if (cancelListener) cancelListener();
+				}
+				if (abortReason !== null) return toolError(`subagent: ${abortReason === "subagent_timed_out" ? `subagent timed out after ${timeoutMs}ms` : "parent task was cancelled"}. The parent should fail this task or retry with a clearer scope.`); // an abort takes precedence over any payload captured before it
+				if (captured === null) return toolError(`subagent: inner session ended without calling ${SUBAGENT_SUBMIT_TOOL_NAME}. The parent should retry with clearer instructions or fail the task.`);
+				return { // NOTE(review): no teardown call is made on `session` in this function — confirm whether the session type needs explicit disposal
+					content: [{
+						type: "text",
+						text: JSON.stringify(captured)
+					}],
+					details: {
+						captured: true,
+						contract: output_schema,
+						callIndex
+					}
+				};
+			}
+		}),
+		getCallCount: () => callCount
+	};
+}
+function subagentToolDescription() { // static description text for the `subagent` tool; takes no input and returns one newline-joined string
+	return [
+		"Delegate a sub-task to a fresh subagent session with isolated context.",
+		"",
+		"The subagent starts with no conversation history and only the `task` ",
+		"string you provide as its instructions. It runs in the same VM with ",
+		"the same tools you have (Gondolin-routed Read/Write/Edit/Bash, ",
+		"moltnet_* tools), and is expected to call ",
+		`\`${SUBAGENT_SUBMIT_TOOL_NAME}\` with a payload matching the named `,
+		"contract before its session ends.",
+		"",
+		"On success, the tool result is the JSON-stringified subagent payload.",
+		"On failure (unknown contract, validation error, subagent did not ", // NOTE(review): the tool also reports timeout and parent-cancel failures, which this list does not mention
+		"submit) the tool returns isError:true with a recoverable message."
+	].join("\n"); // blank "" entries are kept here, so paragraph breaks survive
+}
+function buildSubagentInstructor(args) { // builds the subagent-specific instruction text from `contractName`, `contractDescription`, `parentTaskId`, `callIndex`
+	return [
+		"# You are a subagent",
+		"",
+		`Parent task: \`${args.parentTaskId}\` (subagent call #${args.callIndex}).`,
+		"",
+		`Your assigned output contract is \`${args.contractName}\`:`,
+		`${args.contractDescription}`,
+		"",
+		"Rules for this session:",
+		"",
+		`- You MUST call \`${SUBAGENT_SUBMIT_TOOL_NAME}\` exactly once with a `,
+		"  payload matching the contract above. Your session terminates on ",
+		"  the valid call.",
+		"- The parent's message above is your task. Do not invent additional ", // NOTE(review): this text is passed via `appendSystemPrompt`, so the task message presumably arrives after it, not "above" — confirm wording
+		"  steps the parent did not request.",
+		"- All MoltNet runtime invariants from the parent runtime instructor ",
+		"  apply (diary discipline, gh-auth pattern, etc.) IF you take any ",
+		"  action that would trigger them. Most subagents do not commit code ",
+		"  or open PRs — only do so if your task message explicitly requires it.",
+		"- You do NOT have access to the `subagent` tool. Do not attempt nested ",
+		"  delegation; do the work yourself."
+	].join("\n"); // returned as a single newline-joined string
+}
+function toolError(text) { // wraps a message in the failure result shape used by both the subagent tool and its inner submit tool
+	return {
+		content: [{
+			type: "text",
+			text
+		}],
+		details: { captured: false }, // mirrors `captured: true` on the success results
+		isError: true
+	};
+}
+//#endregion
 //#region src/runtime/task-output.ts
 var METER_NAME = "@themoltnet/pi-extension/task-output";
 var parseResultCounter = null;
@@ -14439,6 +15237,7 @@ async function executePiTask(claimedTask, reporter, opts) {
 	const taskTeamId = task.teamId ?? "";
 	let reporterOpen = false;
 	let session = null;
+	let subagentHandle = null;
 	const finalUsage = emptyUsage(opts.provider, opts.model);
 	let cancelListener = null;
 	const makeFailedOutput = (code, message, usage = finalUsage) => ({
@@ -14556,47 +15355,55 @@ async function executePiTask(claimedTask, reporter, opts) {
 			});
 			const piAuthDir = process.env.PI_CODING_AGENT_DIR ?? join(homedir(), ".pi", "agent");
 			const modelHandle = getModel(opts.provider, opts.model);
-			const piOtelExtension = createPiOtelExtension({
-				agentName: opts.agentName,
-				spanAttributes: {
-					"moltnet.task.id": task.id,
-					"moltnet.task.attempt": attemptN,
-					"moltnet.task.type": task.taskType
-				}
-			});
-			const appendSystemPrompt = [buildRuntimeInstructor({
+			const runtimeInstructor = buildRuntimeInstructor({
 				taskId: task.id,
 				taskType: task.taskType,
 				attemptN,
 				diaryId,
 				agentName: opts.agentName,
 				correlationId: task.correlationId ?? null
-			})];
+			});
+			const appendSystemPrompt = [runtimeInstructor];
 			if (injectedContext.systemPromptPrefix) appendSystemPrompt.push(injectedContext.systemPromptPrefix);
 			const injectedSkills = injectedContext.skills;
-			const resourceLoader = new DefaultResourceLoader({
-				cwd: mountPath,
-				agentDir: piAuthDir,
-				extensionFactories: [piOtelExtension],
+			const parentSubagentTools = [];
+			if (taskTypeUsesSubagents(task.taskType)) {
+				subagentHandle = createSubagentTool({
+					mountPath,
+					piAuthDir,
+					modelHandle,
+					agentName: opts.agentName,
+					inheritedCustomTools: [...gondolinCustomTools, ...moltnetTools],
+					parentRuntimeInstructor: runtimeInstructor,
+					parentTaskId: task.id,
+					parentTaskType: task.taskType,
+					parentAttemptN: attemptN,
+					parentCancelSignal: reporter.cancelSignal
+				});
+				parentSubagentTools.push(subagentHandle.tool);
+			}
+			session = await buildAgentSession({
+				mountPath,
+				piAuthDir,
+				modelHandle,
+				agentName: opts.agentName,
+				customTools: [
+					...gondolinCustomTools,
+					...moltnetTools,
+					...submitTools,
+					...parentSubagentTools
+				],
 				appendSystemPrompt,
 				skillsOverride: () => ({
 					skills: injectedSkills,
 					diagnostics: []
-				})
+				}),
+				otelSpanAttrs: {
+					"moltnet.task.id": task.id,
+					"moltnet.task.attempt": attemptN,
+					"moltnet.task.type": task.taskType
+				}
 			});
-			await resourceLoader.reload();
-			session = (await createAgentSession({
-				agentDir: piAuthDir,
-				cwd: mountPath,
-				model: modelHandle,
-				customTools: [
-					...gondolinCustomTools,
-					...moltnetTools,
-					...submitTools
-				],
-				sessionManager: SessionManager.inMemory(),
-				resourceLoader
-			})).session;
 		} catch (err) {
 			const message = err instanceof Error ? err.message : String(err);
 			await emit("error", {
@@ -14667,6 +15474,10 @@ async function executePiTask(claimedTask, reporter, opts) {
 				phase: "session_prompt"
 			});
 		}
+		if (subagentHandle && subagentHandle.getCallCount() > 0) await emit("info", {
+			event: "subagent_summary",
+			callCount: subagentHandle.getCallCount()
+		});
 		await Promise.all(recordingPromise);
 		const cancelled = reporter.cancelSignal.aborted;
 		let parsedOutput = null;
@@ -15126,4 +15937,4 @@ function moltnetExtension(pi) {
 	registerMoltnetReflectCommand(pi, state);
 }
 //#endregion
-export { HOST_EXEC_DEFAULT_BASE_ENV, activateAgentEnv, createGondolinBashOps, createGondolinEditOps, createGondolinReadOps, createGondolinWriteOps, createMoltNetTools, createPiOtelExtension, createPiTaskExecutor, moltnetExtension as default, ensureSnapshot, executePiTask, findMainWorktree, injectTaskContext, loadCredentials, resumeVm, toGuestPath };
+export { HOST_EXEC_DEFAULT_BASE_ENV, activateAgentEnv, buildAgentSession, createGondolinBashOps, createGondolinEditOps, createGondolinReadOps, createGondolinWriteOps, createMoltNetTools, createPiOtelExtension, createPiTaskExecutor, createSubagentTool, moltnetExtension as default, ensureSnapshot, executePiTask, findMainWorktree, injectTaskContext, loadCredentials, resumeVm, toGuestPath };

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@themoltnet/pi-extension",
-  "version": "0.14.0",
+  "version": "0.15.1",
   "type": "module",
   "description": "MoltNet pi extension — sandboxed tool execution in Gondolin VMs with MoltNet identity and persistent memory",
   "license": "MIT",
@@ -31,8 +31,8 @@
     "@earendil-works/gondolin": "^0.9.1",
     "@opentelemetry/api": "^1.9.0",
     "@sinclair/typebox": "^0.34.0",
-    "@themoltnet/agent-runtime": "0.12.0",
-    "@themoltnet/sdk": "0.100.0"
+    "@themoltnet/agent-runtime": "0.14.0",
+    "@themoltnet/sdk": "0.101.0"
   },
   "peerDependencies": {
     "@earendil-works/pi-coding-agent": ">=0.74.0",