npm - @themoltnet/pi-extension - Versions diffs - 0.15.0 → 0.15.1 - Mend

@themoltnet/pi-extension 0.15.0 → 0.15.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (3) hide show

package/dist/index.d.ts CHANGED Viewed

@@ -377,6 +377,17 @@ export declare interface SandboxConfig {
         /** Overlay disk size (default '3G'). */
         overlaySize?: string;
     };
+    /** Shell commands to run on every VM resume, after platform setup
+     *  (TLS, DNS, git safe.directory, tmpfs node_modules) and before
+     *  the agent session starts. Use for per-session bootstrap that
+     *  doesn't belong baked into the snapshot.
+     *
+     *  Not included in the snapshot cache key — changes here apply on
+     *  every resume without triggering a snapshot rebuild. Each command
+     *  runs in a fresh shell with `set -eu` and `set -o pipefail`; a
+     *  non-zero exit (including from any segment of a pipeline) aborts
+     *  resume with the failing command's stderr/stdout tail. */
+    resumeCommands?: string[];
     /** VFS shadow settings — hide host paths from the guest. */
     vfs?: {
         /** Paths (relative to workspace root) to shadow from the host mount. */

package/dist/index.js CHANGED Viewed

@@ -2515,11 +2515,12 @@ function createCryptoNamespace(context, signingRequests) {
 function createDiariesNamespace(context) {
 	const { client, auth } = context;
 	return {
-		async list(query) {
+		async list(query, headers) {
 			return unwrapResult(await listDiaries({
 				client,
 				auth,
-				query
+				query,
+				headers
 			}));
 		},
 		async create(body, headers) {
@@ -8177,6 +8178,27 @@ var BASE_ALLOWED_HOSTS = [
 	"*.googlesource.com"
 ];
 /**
+* Run a shell command in the guest and throw if it fails. Mirror of
+* `run()` in `snapshot.ts` for the resume-side hook chain — every
+* setup step is essential to a healthy session, so a silent non-zero
+* exit (e.g. a mount that fails into the FUSE write path, or a
+* consumer-provided resume command that fails to install pnpm) must
+* surface immediately rather than fall through to cryptic agent
+* errors later.
+*
+* @param vm - Guest VM handle; only `vm.exec(argv)` is used here.
+* @param label - Step name interpolated into the error message.
+* @param command - Shell source appended after the `set -eu` /
+*   `set -o pipefail` prelude and run via `sh -c`.
+* @returns Resolves with no value when the command exits 0.
+* @throws Error when `exitCode` is non-zero; the message carries the
+*   last 800 characters of stderr and stdout joined by a newline.
+*
+* NOTE(review): the 800-character tail is cut after stderr and stdout
+* are joined (stderr first), so 800 or more characters of stdout push
+* stderr out of the message entirely — confirm that is intended.
+* NOTE(review): `set -o pipefail` is not accepted by every `sh`; this
+* presumably relies on the guest image's shell supporting it — verify.
+*/
+async function vmRun(vm, label, command) {
+	const wrapped = `set -eu\nset -o pipefail\n${command}`; // prelude + caller's command, one script
+	const r = await vm.exec([
+		"sh",
+		"-c",
+		wrapped
+	]);
+	if (r.exitCode !== 0) {
+		const tail = [r.stderr, r.stdout].filter(Boolean).join("\n").slice(-800); // empty streams are dropped before joining
+		throw new Error(`resume step "${label}" failed (exit ${r.exitCode}):\n${tail}`);
+	}
+}
+/**
 * Resume a VM from a checkpoint, inject credentials, configure egress +
 * TLS. Returns the managed VM handle.
 */
@@ -8236,8 +8258,9 @@ async function resumeVm(config) {
     update-ca-certificates 2>/dev/null
     cat /etc/gondolin/mitm/ca.crt >> /etc/ssl/certs/ca-certificates.crt
   '`);
-	await vm.exec(`sh -c 'echo "nameserver 8.8.8.8
-nameserver 1.1.1.1" > /etc/resolv.conf'`);
+	await vmRun(vm, "DNS resolvers", `printf 'nameserver 8.8.8.8\\nnameserver 1.1.1.1\\n' > /etc/resolv.conf`);
+	await vmRun(vm, "git safe.directory", `git config --system --add safe.directory '*'`);
+	for (const [i, cmd] of (config.sandboxConfig?.resumeCommands ?? []).entries()) await vmRun(vm, `resumeCommands[${i}]`, cmd);
 	const vmSshDir = `${vmAgentDir}/ssh`;
 	await vm.exec(`mkdir -p ${vmAgentDir}/ssh /home/agent/.pi/agent`);
 	if (creds.piAuthJson !== null) await vm.fs.writeFile("/home/agent/.pi/agent/auth.json", creds.piAuthJson, { mode: 384 });
@@ -8613,61 +8636,6 @@ async function buildAgentSession(args) {
 	})).session;
 }
 //#endregion
-//#region ../agent-runtime/src/context-bindings.ts
-var PROMPT_SEPARATOR = "\n\n---\n\n";
-/**
-* Resolve `task.input.context[]` into delivered side-effects (skills
-* persisted via `deliver.skill`) and prompt fragments
-* (`systemPromptPrefix`, `userInlineSuffix`) the caller weaves into the
-* built prompt.
-*
-* Per-binding semantics (V1):
-*   - `skill`         → `deliver.skill({ slug, content })` once per ref.
-*                       Slug collisions on distinct contents are
-*                       refused loudly.
-*   - `prompt_prefix` → content appended to `systemPromptPrefix` with
-*                       the canonical `\n\n---\n\n` separator (in
-*                       declared order).
-*   - `user_inline`   → content appended to `userInlineSuffix` in
-*                       declared order, same separator.
-*
-* No fetching, no hashing — bytes are inlined in `ContextRef.content`,
-* and the task's `inputCid` already pins the entire input. The imposer
-* chose these bytes; the resolver just dispatches them.
-*
-* The function is pure with respect to its arguments: file writes are
-* confined to the injected `deliver` callback, which makes the
-* resolver trivial to test.
-*/
-async function resolveTaskContext(args) {
-	const promptParts = [];
-	const userParts = [];
-	const injected = [];
-	const usedSlugs = /* @__PURE__ */ new Map();
-	for (const ref of args.context) {
-		if (ref.binding === "skill") {
-			const prior = usedSlugs.get(ref.slug);
-			if (prior !== void 0) {
-				if (prior !== ref.content) throw new Error(`slug collision on '${ref.slug}': two skill entries share the same slug but have different content`);
-				injected.push(ref);
-				continue;
-			}
-			usedSlugs.set(ref.slug, ref.content);
-			await args.deliver.skill({
-				slug: ref.slug,
-				content: ref.content
-			});
-		} else if (ref.binding === "prompt_prefix") promptParts.push(ref.content);
-		else userParts.push(ref.content);
-		injected.push(ref);
-	}
-	return {
-		injected,
-		systemPromptPrefix: promptParts.join(PROMPT_SEPARATOR),
-		userInlineSuffix: userParts.join(PROMPT_SEPARATOR)
-	};
-}
-//#endregion
 //#region ../tasks/src/formats.ts
 /**
 * Register TypeBox string formats used across Task / TaskOutput / task-type
@@ -8884,7 +8852,7 @@ unchanged" is.
 * (server-side schema check). Self-assessment is a truthful self-rating,
 * NOT enforcement — `verification.passed=false` does not block /complete
 * and does not affect `acceptedAttemptN`. See
-* `docs/agent-runtime.md` for the full producer/judge flow.
+* `docs/understand/agent-runtime.md` for the full producer/judge flow.
 *
 * **Binding evaluation** (judgment tasks: `assess_brief`, `judge_pack`).
 * A separate task whose output IS the application of `successCriteria` to
@@ -9041,6 +9009,39 @@ var AssessBriefOutput = Type$1.Object({
 	$id: "AssessBriefOutput",
 	additionalProperties: false
 });
+/**
+* Async preflight (#1096):
+*   - `targetTaskId` resolves to a real task the caller can see.
+*   - The target is a `fulfill_brief` (you cannot grade an arbitrary
+*     task type as if it were a brief fulfillment).
+*   - The target is `completed` with an accepted attempt — grading
+*     an in-flight or failed task would either race or grade nothing.
+*
+* Agent-distinctness ("assessor ≠ producer") is a runtime / auth-
+* layer concern and intentionally NOT checked here. It belongs in
+* an auth-aware claim-time check.
+*
+* @param input - Task input; only `input.targetTaskId` is read.
+* @param ctx - Lookup context; `ctx.resolveTask(id)` is awaited and a
+*   falsy result is treated as "not found / not readable".
+* @returns Array of `{ field, message }` errors, every one with
+*   `field: "targetTaskId"`; empty when all checks pass. The task-type
+*   and completion checks are independent, so one call can report both.
+*
+* NOTE(review): the accepted-attempt test is a strict `=== null`, so
+* an `undefined` `acceptedAttemptN` counts as accepted — confirm the
+* resolver always returns `null` for "none".
+*/
+async function validateAssessBriefInputAsync(input, ctx) {
+	const { targetTaskId } = input;
+	const errors = [];
+	const target = await ctx.resolveTask(targetTaskId);
+	if (!target) {
+		errors.push({
+			field: "targetTaskId",
+			message: `targetTaskId ${targetTaskId} does not resolve to a task you can read`
+		});
+		return errors; // remaining checks need the resolved task
+	}
+	if (target.taskType !== "fulfill_brief") errors.push({
+		field: "targetTaskId",
+		message: `targetTaskId ${targetTaskId} is a ${target.taskType}, not a fulfill_brief`
+	});
+	if (target.status !== "completed" || target.acceptedAttemptN === null) errors.push({
+		field: "targetTaskId",
+		message: `targetTaskId ${targetTaskId} is not completed with an accepted attempt (status=${target.status}, acceptedAttemptN=${target.acceptedAttemptN})`
+	});
+	return errors;
+}
 //#endregion
 //#region ../tasks/src/task-types/curate-pack.ts
 /**
@@ -9239,6 +9240,311 @@ function validateJudgePackOutput(output) {
 	}
 	return null;
 }
+/**
+* Async preflight (#1096):
+*   - `renderedPackId` resolves to a rendered_packs row.
+*   - `sourcePackId` resolves to a context_packs row.
+*   - The rendered pack actually came from the claimed source pack —
+*     `renderedPack.sourcePackId === input.sourcePackId`. Without
+*     this check a judge can be tricked into grading rendering A as
+*     if it came from source B.
+*
+* @param input - Task input; `renderedPackId` and `sourcePackId` are
+*   read.
+* @param ctx - Lookup context; `ctx.resolveRenderedPack(id)` and
+*   `ctx.resolveContextPack(id)` are started together and awaited via
+*   `Promise.all`. A falsy result is treated as "not found / not
+*   readable".
+* @returns Array of `{ field, message }` errors; empty when all checks
+*   pass. Both "does not resolve" errors can be reported in one call;
+*   the provenance check only runs when both rows resolved.
+*
+* NOTE(review): the provenance comparison is against the resolved
+* row's `source.id`, not `input.sourcePackId` directly — presumably
+* the resolver returns the row whose `id` equals the requested id;
+* verify.
+*/
+async function validateJudgePackInputAsync(input, ctx) {
+	const { renderedPackId, sourcePackId } = input;
+	const errors = [];
+	const [rendered, source] = await Promise.all([ctx.resolveRenderedPack(renderedPackId), ctx.resolveContextPack(sourcePackId)]);
+	if (!rendered) errors.push({
+		field: "renderedPackId",
+		message: `renderedPackId ${renderedPackId} does not resolve to a rendered pack you can read`
+	});
+	if (!source) errors.push({
+		field: "sourcePackId",
+		message: `sourcePackId ${sourcePackId} does not resolve to a context pack you can read`
+	});
+	if (rendered && source && rendered.sourcePackId !== source.id) errors.push({
+		field: "sourcePackId",
+		message: `renderedPack ${renderedPackId} was produced from source ${rendered.sourcePackId}, not from sourcePackId=${sourcePackId}`
+	});
+	return errors;
+}
+//#endregion
+//#region ../tasks/src/task-types/judge-eval-variant.ts
+/**
+* `judge_eval_variant` — score N variants of a `run_eval` scenario
+* against a single rubric, in one pass, with per-variant subagent
+* isolation.
+*
+* output_kind: judgment
+* criteria: required (`successCriteria.rubric` — same envelope shape as
+*   `judge_pack` / `assess_brief`)
+* references: not required at the input layer — `runTaskIds` already
+*   pin the targets being graded.
+*
+* Slice 2 of #943. The parent task carries the rubric and the list of
+* variant `run_eval` task ids. The pi executor registers the generic
+* `subagent` custom tool (#1087), and the parent LLM calls
+* `subagent({ task, output_schema: 'judge_eval_variant_result' })` once
+* per variant — each child session has fresh context, fetches the
+* variant's accepted attempt output via `moltnet_get_task` /
+* `moltnet_list_task_attempts`, and grades against the rubric.
+*
+* Reuses `JudgePackScore` from `judge_pack` for per-criterion scoring
+* (Lane 1 binary via `llm_checklist`, Lane 2 graded via `llm_score`,
+* deterministic_*) — the score shape is the same across judgment
+* tasks; only the wrapping (per-variant grouping + deltas) differs.
+*
+* Cross-task input invariants — "all targets share the same
+* correlation_id, all are `run_eval`, all are completed with an
+* accepted attempt, all share byte-identical `input.successCriteria`"
+* — REQUIRE async DB lookups and live in `validateInputAsync` below,
+* which the task service runs at create time (#1096 wiring). The
+* TypeBox layer here only enforces shape: UUID format,
+* minItems/maxItems, rubric presence + weight invariant.
+*/
+var JUDGE_EVAL_VARIANT_TYPE = "judge_eval_variant"; // task-type name; also the key used in BUILT_IN_TASK_TYPES
+var JudgeEvalVariantInput = Type$1.Object({ // input schema: the variant run ids plus the shared criteria
+	runTaskIds: Type$1.Array(Type$1.String({ format: "uuid" }), {
+		minItems: 2, // a comparison needs at least two variants
+		maxItems: 10
+	}),
+	successCriteria: SuccessCriteria
+}, {
+	$id: "JudgeEvalVariantInput",
+	additionalProperties: false
+});
+/**
+* Per-variant grading. `scores[]` shape is identical to `JudgePackScore`
+* (mode-aware: binary via `llm_checklist`, graded via `llm_score`,
+* deterministic_*). Reuse the type rather than re-declare.
+*
+* This is also the **subagent output contract** — the parent's
+* `subagent` tool resolves the contract name `judge_eval_variant_result`
+* to this schema. See `agent-runtime`'s subagent contract registry.
+*
+* Field constraints declared below: `variantLabel` is 1–64 characters
+* and must not contain the sequence space-hyphen-space (the negative
+* lookahead pattern), which keeps `deltas` keys of the form `"A - B"`
+* splittable on that separator; `scores` needs at least one entry;
+* `composite` is bounded to [0, 1]; `verdict` is a non-empty string.
+*/
+var JudgeEvalVariantResult = Type$1.Object({
+	runTaskId: Type$1.String({ format: "uuid" }),
+	variantLabel: Type$1.String({
+		minLength: 1,
+		maxLength: 64,
+		pattern: "^(?!.* - ).*$"
+	}),
+	scores: Type$1.Array(JudgePackScore, { minItems: 1 }),
+	composite: Type$1.Number({
+		minimum: 0,
+		maximum: 1
+	}),
+	verdict: Type$1.String({ minLength: 1 })
+}, {
+	$id: "JudgeEvalVariantResult",
+	additionalProperties: false
+});
+var JudgeEvalVariantOutput = Type$1.Object({ // final task output: one result per variant plus optional pairwise deltas
+	results: Type$1.Array(JudgeEvalVariantResult, { minItems: 2 }), // mirrors the input's minimum of two variants
+	deltas: Type$1.Optional(Type$1.Record(Type$1.String(), Type$1.Number({
+		minimum: -1,
+		maximum: 1
+	}))),
+	judgeModel: Type$1.Optional(Type$1.String({ minLength: 1 })),
+	traceparent: Type$1.String({ minLength: 1 }) // required, unlike judgeModel and deltas
+}, {
+	$id: "JudgeEvalVariantOutput",
+	additionalProperties: false
+});
+/**
+* Synchronous input invariants beyond TypeBox shape: rubric must be
+* present (already required by the schema, but the rubric body has
+* its own per-criterion weight invariant) and the rubric's weights
+* must sum to 1.
+*
+* Cross-task invariants (all targets are `run_eval`, all completed,
+* share `correlation_id`, byte-identical `input.successCriteria`)
+* are NOT checked here — they require async DB lookups against
+* `runTaskIds` and live in `validateJudgeEvalVariantInputAsync`
+* below, invoked by the task service at create time (#1096).
+*
+* @param input - Task input; only `input.successCriteria` is read.
+* @returns An error string when `successCriteria` or its `rubric` is
+*   missing; otherwise whatever `validateRubricWeights(rubric)`
+*   returns (presumably `null` when the weights are valid — that
+*   helper is defined elsewhere; verify).
+*/
+function validateJudgeEvalVariantInput(input) {
+	const sc = input.successCriteria;
+	if (!sc) return "successCriteria is required for judge_eval_variant";
+	if (!sc.rubric) return "successCriteria.rubric is required for judge_eval_variant";
+	return validateRubricWeights(sc.rubric); // weight-sum invariant is delegated
+}
+/**
+* Output cross-field invariants the schema cannot express:
+*
+*   1. `results.length === input.runTaskIds.length` — every variant
+*      the imposer asked for must be graded. Partial grading
+*      invalidates cross-variant comparison; fail the whole task
+*      rather than silently report a subset.
+*
+*   2. `results[i].runTaskId === input.runTaskIds[i]` — order is
+*      load-bearing for downstream consumers (e.g. deltas keyed by
+*      adjacent pairs). Mismatch is an LLM bug; reject loudly.
+*
+*   3. Each `result.scores` follows the same `llm_checklist` rule
+*      `judge_pack` enforces (#999): if a score has an `assertions`
+*      array, the numeric score MUST be `1` iff every assertion
+*      passes. Inconsistent payloads pollute attestations.
+*
+*   4. Each `result.composite` MUST equal the rubric-weighted sum
+*      `Σ(weight_j × scores[j].score)`. The parent (and any subagent
+*      it delegated to) is supposed to compute this; surfacing a
+*      drift here catches LLMs that hand-wave the arithmetic.
+*
+*   5. Optional `deltas` keys MUST be of the form `"A - B"` where
+*      both `A` and `B` are variantLabels present in `results`.
+*      Values are not range-checked (any float in [-1, 1] is
+*      arithmetically possible).
+*
+* @param output - Candidate task output; `results` and `deltas` are
+*   read.
+* @param input - Task input, optional. When falsy, checks 1 and 2 are
+*   skipped; check 4 additionally needs `input.successCriteria.rubric`.
+* @returns The first violation found as a human-readable string, or
+*   `null` when every applicable check passes.
+*
+* NOTE(review): check 4 rejects criterion ids that are not in the
+* rubric, but nothing here enforces the "exactly once" wording of its
+* error message — a duplicated or omitted criterion is only caught if
+* it moves the weighted sum more than 0.001 away from `composite`.
+* NOTE(review): a score with an empty `assertions` array is treated as
+* "all pass" (`every` on an empty array is `true`) and must therefore
+* carry score 1 — confirm that is the intended rule.
+*/
+function validateJudgeEvalVariantOutput(output, input) {
+	const out = output;
+	const inp = input;
+	if (inp) {
+		if (out.results.length !== inp.runTaskIds.length) return `results.length (${out.results.length}) does not match input.runTaskIds.length (${inp.runTaskIds.length}). Every variant must be graded; partial grading is rejected.`;
+		for (let i = 0; i < out.results.length; i++) if (out.results[i].runTaskId !== inp.runTaskIds[i]) return `results[${i}].runTaskId (${out.results[i].runTaskId}) does not match input.runTaskIds[${i}] (${inp.runTaskIds[i]}). Order must align with input for downstream delta computation.`;
+	}
+	for (let r = 0; r < out.results.length; r++) {
+		const result = out.results[r];
+		for (let s = 0; s < result.scores.length; s++) {
+			const sc = result.scores[s];
+			if (!sc.assertions) continue; // rule 3 only applies to checklist-style scores
+			const allPassed = sc.assertions.every((a) => a.passed);
+			const expected = allPassed ? 1 : 0;
+			if (sc.score !== expected) return `results[${r}].scores[${s}] (criterionId="${sc.criterionId}"): assertions ${allPassed ? "all pass" : "have at least one fail"} but score=${sc.score}. Score must be derived: 1 iff every assertion passes, else 0 (#999 llm_checklist rule).`;
+		}
+	}
+	if (inp?.successCriteria?.rubric) {
+		const criteria = inp.successCriteria.rubric.criteria;
+		const weightById = new Map(criteria.map((c) => [c.id, c.weight]));
+		for (let r = 0; r < out.results.length; r++) {
+			const result = out.results[r];
+			let sum = 0;
+			for (const sc of result.scores) {
+				const w = weightById.get(sc.criterionId);
+				if (w === void 0) return `results[${r}].scores: criterionId "${sc.criterionId}" is not in the input rubric (known: ${Array.from(weightById.keys()).join(", ")}). Score every rubric criterion exactly once; do not invent new ids.`;
+				sum += w * sc.score;
+			}
+			if (Math.abs(sum - result.composite) > .001) return `results[${r}].composite (${result.composite}) does not match Σ(weight × score) (${sum.toFixed(6)}). Composite must be the rubric-weighted sum of per-criterion scores (drift > 0.001).`;
+		}
+	}
+	if (out.deltas) {
+		const labels = new Set(out.results.map((r) => r.variantLabel));
+		for (const key of Object.keys(out.deltas)) {
+			const m = /^(.+?) - (.+)$/.exec(key); // lazy first group: split at the first " - "
+			if (!m) return `deltas key "${key}" is not of the form "<variantLabel-A> - <variantLabel-B>". Use a single space-hyphen-space separator between labels.`;
+			const [, a, b] = m;
+			if (!labels.has(a) || !labels.has(b)) return `deltas key "${key}" references variantLabel(s) not present in results: ${!labels.has(a) ? `"${a}" missing` : ""}${!labels.has(a) && !labels.has(b) ? ", " : ""}${!labels.has(b) ? `"${b}" missing` : ""}`;
+		}
+	}
+	return null;
+}
+/**
+* Local stable-stringify for cross-variant `successCriteria` byte-
+* equality. Recursively sorts object keys; arrays preserve order
+* (intentional — rubric criteria order is semantically meaningful).
+* Mirrors the canonical-JSON shape `crypto-service` uses for CIDs,
+* without taking on a crypto-service dep just for this comparison.
+*
+* @param value - Value to serialize.
+* @returns For `null` and non-objects, the `JSON.stringify` result
+*   (which is `undefined` for `undefined` and functions); for arrays
+*   and objects, a string with no whitespace and object keys in
+*   `Array.prototype.sort` default order.
+*
+* NOTE(review): `undefined` is not normalized the way `JSON.stringify`
+* does it — an object property whose value is `undefined` is emitted
+* as the text `"k":undefined`, so `{ a: undefined }` and `{}` compare
+* unequal, and an `undefined` array element is rendered as an empty
+* slot by `join`. Objects with a `toJSON` method (e.g. `Date`) are
+* serialized by their own enumerable keys, not via `toJSON`. Presumably
+* harmless for inputs that round-tripped through JSON — verify.
+*/
+function stableStringify(value) {
+	if (value === null || typeof value !== "object") return JSON.stringify(value); // primitives and null defer to JSON.stringify
+	if (Array.isArray(value)) return "[" + value.map(stableStringify).join(",") + "]"; // element order is kept
+	const obj = value;
+	return "{" + Object.keys(obj).sort().map((k) => JSON.stringify(k) + ":" + stableStringify(obj[k])).join(",") + "}"; // own enumerable keys, sorted
+}
+/**
+* Async preflight for `judge_eval_variant` (#1096 + #943):
+*
+*  1. Every `runTaskIds[i]` resolves to a task the caller can read.
+*  2. Every resolved task is `taskType === 'run_eval'`.
+*  3. Every resolved task is `status === 'completed'` with a
+*     non-null `acceptedAttemptN` — grading an unaccepted attempt
+*     races with re-attempts and pollutes the judge attestation.
+*  4. Every resolved task shares a non-null `correlationId`, and all
+*     `correlationId`s are equal. Without this an imposer could
+*     fabricate a "variant set" by stapling unrelated runs together.
+*  5. The shared `correlationId` is NOT already sealed. A previous
+*     judge_eval_variant against the same group is final; produce a
+*     fresh correlation_id for a new judging round rather than
+*     adding contradictory verdicts to a sealed group.
+*  6. Every variant's `input.successCriteria` is byte-identical (via
+*     stable-stringify). Different rubrics across "variants" makes
+*     the comparison meaningless.
+*
+* The checks run in stages and return early between them: if any
+* target is missing, only the per-target errors (1–3) are returned;
+* if any error exists after the correlation check (4), the seal (5)
+* and rubric (6) checks are not attempted. Check 6 reports only the
+* first mismatching variant.
+*
+* @param input - Task input; only `input.runTaskIds` is read.
+* @param ctx - Lookup context; `ctx.resolveTask(id)` is called for
+*   every id concurrently, and `ctx.findCorrelationSeal(correlationId)`
+*   once for the shared id. Falsy results mean "not found" / "no seal".
+* @returns Array of `{ field, message }` errors; empty when all checks
+*   pass.
+*
+* NOTE(review): when targets mix null and non-null correlation ids,
+* the "span multiple correlation_ids" message lists the internal
+* `__null__` sentinel alongside the real ids.
+* NOTE(review): check 6 reads `presentTargets[i].input.successCriteria`
+* without guarding `input` — presumably every resolved task carries
+* an `input` object; verify against the resolver.
+*/
+async function validateJudgeEvalVariantInputAsync(input, ctx) {
+	const { runTaskIds } = input;
+	const errors = [];
+	const resolved = await Promise.all(runTaskIds.map((id) => ctx.resolveTask(id)));
+	let missingTargets = false;
+	const presentTargets = [];
+	for (let i = 0; i < runTaskIds.length; i++) {
+		const t = resolved[i];
+		if (!t) {
+			missingTargets = true;
+			errors.push({
+				field: `runTaskIds[${i}]`,
+				message: `runTaskIds[${i}]=${runTaskIds[i]} does not resolve to a task you can read`
+			});
+			continue;
+		}
+		presentTargets.push(t);
+		if (t.taskType !== "run_eval") errors.push({
+			field: `runTaskIds[${i}]`,
+			message: `runTaskIds[${i}]=${runTaskIds[i]} is a ${t.taskType}, not a run_eval`
+		});
+		if (t.status !== "completed" || t.acceptedAttemptN === null) errors.push({
+			field: `runTaskIds[${i}]`,
+			message: `runTaskIds[${i}]=${runTaskIds[i]} is not completed with an accepted attempt (status=${t.status}, acceptedAttemptN=${t.acceptedAttemptN})`
+		});
+	}
+	if (missingTargets || presentTargets.length === 0) return errors; // group-level checks need every target present
+	const correlationIds = new Set(presentTargets.map((t) => t.correlationId ?? "__null__")); // sentinel stands in for null/undefined ids
+	if (correlationIds.has("__null__")) errors.push({
+		field: "runTaskIds",
+		message: "one or more run_eval targets have no correlation_id; cannot group as variants"
+	});
+	if (correlationIds.size > 1) errors.push({
+		field: "runTaskIds",
+		message: `run_eval targets span multiple correlation_ids (${Array.from(correlationIds).join(", ")}); variants must share one`
+	});
+	if (errors.length > 0) return errors; // includes per-target errors from the loop above
+	const correlationId = presentTargets[0].correlationId;
+	if (!correlationId) return errors;
+	const seal = await ctx.findCorrelationSeal(correlationId);
+	if (seal) errors.push({
+		field: "runTaskIds",
+		message: `correlation_id ${correlationId} is already sealed by ${seal.sealedByTaskType}/${seal.sealedByTaskId} at ${seal.sealedAt}; use a fresh correlation_id for a new judging round`
+	});
+	const first = stableStringify(presentTargets[0].input.successCriteria);
+	for (let i = 1; i < presentTargets.length; i++) if (stableStringify(presentTargets[i].input.successCriteria) !== first) {
+		errors.push({
+			field: `runTaskIds[${i}]`,
+			message: `runTaskIds[${i}] has a different input.successCriteria than runTaskIds[0]; all variants must share the rubric and gates`
+		});
+		break;
+	}
+	return errors;
+}
+/**
+* Side effect emitted on successful `judge_eval_variant` create:
+* seal the shared correlation_id atomically with the insert. The
+* task service applies the seal in the same transaction; a
+* concurrent second `judge_eval_variant` against the same group
+* loses the race and is rejected with a clean conflict error.
+*
+* The seal applies to the SHARED correlation_id of the targets —
+* NOT to the judge task's own correlationId (which is typically
+* null or distinct). The task service derives the correlationId
+* for the effect from the resolved targets, not from the judge
+* task row.
+*
+* @param input - Task input; only `input.runTaskIds[0]` is used.
+* @param ctx - Lookup context; `ctx.resolveTask(id)` is awaited once.
+* @returns A one-element array holding a `sealCorrelation` effect
+*   with the first target's `correlationId`, or an empty array when
+*   that target is missing or has no `correlationId`.
+*
+* NOTE(review): only the first target is consulted — this presumably
+* relies on the async preflight having already established that all
+* targets share one correlation id; verify the call order.
+*/
+async function onCreateJudgeEvalVariant(input, ctx) {
+	const { runTaskIds } = input;
+	const first = await ctx.resolveTask(runTaskIds[0]);
+	if (!first?.correlationId) return []; // nothing to seal
+	return [{
+		kind: "sealCorrelation",
+		correlationId: first.correlationId
+	}];
+}
 //#endregion
 //#region ../tasks/src/task-types/render-pack.ts
 /**
@@ -9278,6 +9584,18 @@ var RenderPackOutput = Type$1.Object({
 	$id: "RenderPackOutput",
 	additionalProperties: false
 });
+/**
+* Async preflight (#1096): `packId` resolves to a context_packs row
+* the caller can read.
+*
+* @param input - Task input; only `input.packId` is read.
+* @param ctx - Lookup context; `ctx.resolveContextPack(id)` is
+*   awaited and a falsy result is treated as "not found / not
+*   readable".
+* @returns A single-element array with a `packId` error when the
+*   lookup comes back falsy, otherwise an empty array.
+*/
+async function validateRenderPackInputAsync(input, ctx) {
+	const { packId } = input;
+	if (!await ctx.resolveContextPack(packId)) return [{
+		field: "packId",
+		message: `packId ${packId} does not resolve to a context pack you can read`
+	}];
+	return [];
+}
 //#endregion
 //#region ../tasks/src/task-types/run-eval.ts
 /**
@@ -9385,7 +9703,8 @@ var BUILT_IN_TASK_TYPES = {
 		outputSchema: AssessBriefOutput,
 		outputKind: "judgment",
 		requiresReferences: true,
-		validateInput: validateJudgmentInput
+		validateInput: validateJudgmentInput,
+		validateInputAsync: validateAssessBriefInputAsync
 	},
 	[CURATE_PACK_TYPE]: {
 		name: CURATE_PACK_TYPE,
@@ -9401,7 +9720,8 @@ var BUILT_IN_TASK_TYPES = {
 		outputSchema: RenderPackOutput,
 		outputKind: "artifact",
 		requiresReferences: false,
-		validateOutput: requireVerificationWhenCriteriaPresent
+		validateOutput: requireVerificationWhenCriteriaPresent,
+		validateInputAsync: validateRenderPackInputAsync
 	},
 	[JUDGE_PACK_TYPE]: {
 		name: JUDGE_PACK_TYPE,
@@ -9410,7 +9730,8 @@ var BUILT_IN_TASK_TYPES = {
 		outputKind: "judgment",
 		requiresReferences: true,
 		validateInput: validateJudgmentInput,
-		validateOutput: validateJudgePackOutput
+		validateOutput: validateJudgePackOutput,
+		validateInputAsync: validateJudgePackInputAsync
 	},
 	[RUN_EVAL_TYPE]: {
 		name: RUN_EVAL_TYPE,
@@ -9419,6 +9740,18 @@ var BUILT_IN_TASK_TYPES = {
 		outputKind: "artifact",
 		requiresReferences: false,
 		validateOutput: validateRunEvalOutput
+	},
+	[JUDGE_EVAL_VARIANT_TYPE]: {
+		name: JUDGE_EVAL_VARIANT_TYPE,
+		inputSchema: JudgeEvalVariantInput,
+		outputSchema: JudgeEvalVariantOutput,
+		outputKind: "judgment",
+		requiresReferences: false,
+		validateInput: validateJudgeEvalVariantInput,
+		validateOutput: validateJudgeEvalVariantOutput,
+		validateInputAsync: validateJudgeEvalVariantInputAsync,
+		onCreate: onCreateJudgeEvalVariant,
+		usesSubagents: true
 	}
 };
 //#endregion
@@ -9718,6 +10051,133 @@ Type$1.Object({
 	additionalProperties: false
 });
 //#endregion
+//#region ../agent-runtime/src/subagent-output-contracts.ts
+var REGISTRY = /* @__PURE__ */ new Map(); // contract name -> contract object; module-level, shared by the three functions below
+/**
+* Register a subagent output contract. Idempotent: re-registering the
+* same name with a different schema throws — contracts are meant to
+* be stable. Re-registering with the identical contract object (same
+* reference) is a no-op for HMR and test convenience.
+*
+* Typically called at module-init time alongside task-type
+* registration. See task-types/index.ts in @moltnet/tasks for the
+* conventional pattern.
+*
+* "Different schema" is decided by reference: `parametersSchema` is
+* compared with `!==`, not structurally. A different contract object
+* that carries the same `parametersSchema` reference does not throw;
+* it replaces the stored entry (so e.g. its `description` wins).
+*
+* @param contract - Object with at least `name` and
+*   `parametersSchema`; `name` must match `^[a-z][a-z0-9_]*$`.
+* @throws Error when `name` is missing/blank, when it is not
+*   lower_snake_case, or when the name is already registered with a
+*   different `parametersSchema` reference.
+*/
+function registerSubagentOutputContract(contract) {
+	if (!contract.name || contract.name.trim().length === 0) throw new Error("subagent output contract name is required");
+	if (!/^[a-z][a-z0-9_]*$/.test(contract.name)) throw new Error(`subagent output contract name '${contract.name}' must be lower_snake_case (starts with a letter, then [a-z0-9_]+)`);
+	const existing = REGISTRY.get(contract.name);
+	if (existing && existing !== contract) {
+		if (existing.parametersSchema !== contract.parametersSchema) throw new Error(`subagent output contract '${contract.name}' is already registered with a different schema; refusing to override`);
+	}
+	REGISTRY.set(contract.name, contract); // also reached on same-schema re-registration, overwriting the entry
+}
+/**
+* Resolve a subagent output contract by name. Returns `null` for
+* unknown names — callers (the subagent custom tool) decide whether
+* that's a tool error the parent LLM can recover from or a hard fail.
+*
+* @param name - Contract name as passed to
+*   `registerSubagentOutputContract`.
+* @returns The registered contract object, or `null` when the lookup
+*   yields `undefined`/`null`.
+*/
+function getSubagentOutputContract(name) {
+	return REGISTRY.get(name) ?? null;
+}
+/**
+* List all registered contracts. Useful for diagnostics and for the
+* subagent tool's parameter description so a parent LLM can see what
+* contracts are available without enumerating them in its prompt.
+*
+* @returns A new array of the registered contract objects, in the
+*   registry Map's iteration (insertion) order. The array is a copy;
+*   the contract objects themselves are the stored references.
+*/
+function listSubagentOutputContracts() {
+	return [...REGISTRY.values()];
+}
+//#endregion
+//#region ../agent-runtime/src/built-in-contract-registrations.ts
+/**
+* Built-in subagent output contracts (#1087, #943).
+*
+* Why this is an exported function and not a module-init side
+* effect:
+*
+*   - The registry is process-global. Module-init registration
+*     fires exactly once per Node process (ESM modules are cached
+*     by URL). Tests that call `__resetSubagentOutputContractsForTests()`
+*     to start from an empty registry have no way to repopulate
+*     the built-ins without re-evaluating the module — which the
+*     cache prevents. PR #1101 review M4.
+*   - An explicit `registerBuiltInSubagentContracts()` lets the
+*     package index call it once at module load AND lets test
+*     setup hooks call it again after `__reset...`.
+*   - `registerSubagentOutputContract` is itself idempotent for
+*     identical re-registrations, so calling this function twice
+*     in the same process is safe.
+*
+* Adding a new built-in: extend the body of this function. Do not
+* call `registerSubagentOutputContract` from anywhere else in the
+* package — keeping all built-ins in one function makes the set
+* auditable.
+*
+* Currently registers one contract, `judge_eval_variant_result`,
+* whose `parametersSchema` is `JudgeEvalVariantResult`. Each call
+* builds a fresh contract object around that same schema reference,
+* so a repeat call passes the registry's schema-reference check and
+* overwrites the entry with an equivalent one.
+*/
+function registerBuiltInSubagentContracts() {
+	registerSubagentOutputContract({
+		name: "judge_eval_variant_result",
+		description: "Per-variant grading result produced by a subagent of judge_eval_variant: scores against the shared rubric, composite, and a 1-3 sentence verdict for a single variant.",
+		parametersSchema: JudgeEvalVariantResult
+	});
+}
+registerBuiltInSubagentContracts(); // runs once when this bundle is evaluated
+//#endregion
+//#region ../agent-runtime/src/context-bindings.ts
+var PROMPT_SEPARATOR = "\n\n---\n\n"; // joins fragments in both returned prompt strings
+/**
+* Resolve `task.input.context[]` into delivered side-effects (skills
+* persisted via `deliver.skill`) and prompt fragments
+* (`systemPromptPrefix`, `userInlineSuffix`) the caller weaves into the
+* built prompt.
+*
+* Per-binding semantics (V1):
+*   - `skill`         → `deliver.skill({ slug, content })` once per ref.
+*                       Slug collisions on distinct contents are
+*                       refused loudly.
+*   - `prompt_prefix` → content appended to `systemPromptPrefix` with
+*                       the canonical `\n\n---\n\n` separator (in
+*                       declared order).
+*   - `user_inline`   → content appended to `userInlineSuffix` in
+*                       declared order, same separator.
+*
+* No fetching, no hashing — bytes are inlined in `ContextRef.content`,
+* and the task's `inputCid` already pins the entire input. The imposer
+* chose these bytes; the resolver just dispatches them.
+*
+* The function is pure with respect to its arguments: file writes are
+* confined to the injected `deliver` callback, which makes the
+* resolver trivial to test.
+*
+* @param args - `args.context` is the iterable of refs (each read for
+*   `binding`, `content` and, for skills, `slug`); `args.deliver.skill`
+*   is awaited once per distinct skill slug, sequentially and in
+*   declared order.
+* @returns `{ injected, systemPromptPrefix, userInlineSuffix }`.
+*   `injected` lists every processed ref in declared order, including
+*   a repeated skill whose slug and content match an earlier one (it
+*   is recorded but not delivered again). Both strings are empty when
+*   no ref of that binding was present.
+* @throws Error when two skill refs share a slug but differ in
+*   content. Skills delivered before the collision is reached are not
+*   rolled back here.
+*
+* NOTE(review): any binding other than `skill` or `prompt_prefix`
+* lands in the final `else` and is treated as `user_inline` —
+* presumably the input schema restricts `binding` to those three
+* values; verify.
+*/
+async function resolveTaskContext(args) {
+	const promptParts = [];
+	const userParts = [];
+	const injected = [];
+	const usedSlugs = /* @__PURE__ */ new Map(); // slug -> content of the first skill delivered under it
+	for (const ref of args.context) {
+		if (ref.binding === "skill") {
+			const prior = usedSlugs.get(ref.slug);
+			if (prior !== void 0) {
+				if (prior !== ref.content) throw new Error(`slug collision on '${ref.slug}': two skill entries share the same slug but have different content`);
+				injected.push(ref);
+				continue; // identical duplicate: skip the second delivery
+			}
+			usedSlugs.set(ref.slug, ref.content);
+			await args.deliver.skill({
+				slug: ref.slug,
+				content: ref.content
+			});
+		} else if (ref.binding === "prompt_prefix") promptParts.push(ref.content);
+		else userParts.push(ref.content);
+		injected.push(ref);
+	}
+	return {
+		injected,
+		systemPromptPrefix: promptParts.join(PROMPT_SEPARATOR),
+		userInlineSuffix: userParts.join(PROMPT_SEPARATOR)
+	};
+}
+//#endregion
 //#region ../agent-runtime/src/output-tools.ts
 /**
 * Submit-output tool contract.
@@ -10190,6 +10650,109 @@ function buildFulfillBriefUserPrompt(input, ctx) {
 	].filter(Boolean).join("\n");
 }
 //#endregion
+//#region ../agent-runtime/src/prompts/judge-eval-variant.ts
+/**
+* Build the first user-message prompt for a `judge_eval_variant` task
+* (#943 Slice 2).
+*
+* The parent agent's job is **fan-out-and-collect**: for each
+* `runTaskIds[i]`, spawn an isolated subagent via the `subagent` custom
+* tool (#1087), have it grade that variant against the shared rubric,
+* and collect each subagent's structured `judge_eval_variant_result`
+* payload. The parent does NOT grade itself; it composes the per-
+* variant results into the final `judge_eval_variant` output (results
+* array + optional deltas + verdicts).
+*
+* Isolation is the point: each variant gets a fresh subagent session
+* with no carryover context from sibling variants, so per-variant
+* grading is independent. Cost is bounded by `maxItems: 10` on
+* runTaskIds.
+*/
+function buildJudgeEvalVariantUserPrompt(input, ctx) {
+	const { runTaskIds, successCriteria } = input;
+	const rubric = successCriteria.rubric;
+	if (!rubric) throw new Error("judge_eval_variant requires successCriteria.rubric — none present");
+	const escapeCell = (s) => s.replace(/\\/g, "\\\\").replace(/\|/g, "\\|").replace(/\r?\n/g, " ");
+	const criteriaTable = rubric.criteria.map((c) => `| \`${c.id}\` | ${c.weight.toFixed(3)} | ${c.scoring} | ${escapeCell(c.description)} |`).join("\n");
+	const targetsBlock = runTaskIds.map((id, i) => `${i + 1}. \`${id}\``).join("\n");
+	const finalOutputBlock = buildFinalOutputBlock({
+		taskType: "judge_eval_variant",
+		outputSchemaName: "JudgeEvalVariantOutput",
+		shapeSketch: [
+			"{",
+			"  \"results\": [",
+			"    {",
+			"      \"runTaskId\": \"<runTaskIds[i]>\",",
+			"      \"variantLabel\": \"<from variant input>\",",
+			"      \"scores\": [ { \"criterionId\": \"...\", \"score\": 0..1, \"rationale\": \"...\", \"assertions\": [...]? } ],",
+			"      \"composite\": <Σ(weight × score), 0..1>,",
+			"      \"verdict\": \"<1-3 sentences>\"",
+			"    },",
+			"    ...one entry per runTaskIds[i], same order",
+			"  ],",
+			"  \"deltas\": { \"<labelA> - <labelB>\": <composite(A) - composite(B)> },  // optional",
+			"  \"judgeModel\": \"<id>\",  // optional",
+			"  \"traceparent\": \"<from claim>\"",
+			"}"
+		].join("\n")
+	});
+	return [
+		"# Judge Eval Variants\n",
+		`You are grading ${runTaskIds.length} variants of a single run_eval scenario`,
+		"against ONE shared rubric. Your job is fan-out-and-collect — you do not",
+		"grade yourself.",
+		"",
+		`Task id: \`${ctx.taskId}\``,
+		`Diary: \`${ctx.diaryId}\``,
+		"",
+		"### Targets (variants to grade)",
+		"",
+		targetsBlock,
+		"",
+		"Each target is a completed `run_eval` task in the same correlation group.",
+		"Read its accepted attempt via `moltnet_get_task` / `moltnet_list_task_attempts`",
+		"to see the producer's output before grading.",
+		"",
+		"### Rubric",
+		"",
+		rubric.preamble ? `${rubric.preamble}\n` : "",
+		"| Criterion | Weight | Scoring | Description |",
+		"| --- | --- | --- | --- |",
+		criteriaTable,
+		"",
+		"### How to grade",
+		"",
+		"For EACH `runTaskIds[i]`:",
+		"",
+		"1. Call the `subagent` custom tool with:",
+		"   - `task`: a brief instructing the subagent to grade ONLY that variant",
+		"     against the rubric above; include the target task id and the rubric",
+		"     verbatim. The subagent has the same MoltNet tools and can fetch the",
+		"     accepted attempt output independently.",
+		"   - `output_schema`: `\"judge_eval_variant_result\"`",
+		"2. Receive the subagent's structured `judge_eval_variant_result` payload.",
+		"3. Append it to your `results[]` array, **in the same order as input.runTaskIds**.",
+		"",
+		"Do NOT score any variant in your own session. The whole point of the",
+		"subagent fan-out is per-variant context isolation — grading two variants",
+		"back-to-back in one session lets the second be biased by the first.",
+		"",
+		"### Composite arithmetic",
+		"",
+		"Each `composite` MUST equal `Σ(criterion.weight × score)` over the rubric",
+		"criteria. Drift > 0.001 is rejected. Subagents are instructed to compute it",
+		"themselves; double-check before assembling the final output.",
+		"",
+		"### Deltas (optional)",
+		"",
+		"If useful, populate `deltas` with pairwise composite differences keyed by",
+		"`\"<variantLabel-A> - <variantLabel-B>\"` (single space-hyphen-space). Both",
+		"labels must appear in `results`. Omit `deltas` entirely if not used.",
+		"",
+		finalOutputBlock
+	].filter((s) => s !== "").join("\n");
+}
+//#endregion
 //#region ../agent-runtime/src/prompts/judge-pack.ts
 function buildJudgePackUserPrompt(input, ctx) {
 	const { renderedPackId, sourcePackId, successCriteria } = input;
@@ -10496,6 +11059,15 @@ function buildTaskUserPrompt(task, ctx) {
 				diaryId: ctx.diaryId,
 				taskId: ctx.taskId
 			});
+		case JUDGE_EVAL_VARIANT_TYPE:
+			if (!Value.Check(JudgeEvalVariantInput, task.input)) {
+				const errors = [...Value.Errors(JudgeEvalVariantInput, task.input)];
+				throw new Error(`judge_eval_variant input failed validation: ${JSON.stringify(errors.slice(0, 3))}`);
+			}
+			return buildJudgeEvalVariantUserPrompt(task.input, {
+				diaryId: ctx.diaryId,
+				taskId: ctx.taskId
+			});
 		case RUN_EVAL_TYPE:
 			if (!Value.Check(RunEvalInput, task.input)) {
 				const errors = [...Value.Errors(RunEvalInput, task.input)];
@@ -13977,25 +14549,6 @@ var require_multistream = /* @__PURE__ */ __commonJSMin(((exports, module) => {
 	module.exports.pino = pino;
 })))();
 //#endregion
-//#region ../agent-runtime/src/subagent-output-contracts.ts
-var REGISTRY = /* @__PURE__ */ new Map();
-/**
-* Resolve a subagent output contract by name. Returns `null` for
-* unknown names — callers (the subagent custom tool) decide whether
-* that's a tool error the parent LLM can recover from or a hard fail.
-*/
-function getSubagentOutputContract(name) {
-	return REGISTRY.get(name) ?? null;
-}
-/**
-* List all registered contracts. Useful for diagnostics and for the
-* subagent tool's parameter description so a parent LLM can see what
-* contracts are available without enumerating them in its prompt.
-*/
-function listSubagentOutputContracts() {
-	return [...REGISTRY.values()];
-}
-//#endregion
 //#region src/runtime/inject-task-context.ts
 /**
 * Slice 1.5 of #943 — wire the agent-runtime resolver into the

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@themoltnet/pi-extension",
-  "version": "0.15.0",
+  "version": "0.15.1",
   "type": "module",
   "description": "MoltNet pi extension — sandboxed tool execution in Gondolin VMs with MoltNet identity and persistent memory",
   "license": "MIT",
@@ -31,8 +31,8 @@
     "@earendil-works/gondolin": "^0.9.1",
     "@opentelemetry/api": "^1.9.0",
     "@sinclair/typebox": "^0.34.0",
-    "@themoltnet/agent-runtime": "0.13.0",
-    "@themoltnet/sdk": "0.100.0"
+    "@themoltnet/agent-runtime": "0.14.0",
+    "@themoltnet/sdk": "0.101.0"
   },
   "peerDependencies": {
     "@earendil-works/pi-coding-agent": ">=0.74.0",