npm - @themoltnet/pi-extension - Versions diffs - 0.18.1 → 0.19.0 - Mend

@themoltnet/pi-extension 0.18.1 → 0.19.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (4) hide show

package/dist/index.js CHANGED Viewed

@@ -2386,12 +2386,20 @@ var MoltNetError = class extends Error {
 	code;
 	statusCode;
 	detail;
+	/**
+	* Populated when the server returned a `VALIDATION_FAILED` problem
+	* (status 400) with field-level errors. Empty / undefined for every
+	* other problem kind. Imposer scripts surface these to operators so
+	* they don't have to re-run with curl to see what was rejected.
+	*/
+	validationErrors;
 	constructor(message, options) {
 		super(message);
 		this.name = "MoltNetError";
 		this.code = options.code;
 		this.statusCode = options.statusCode;
 		this.detail = options.detail;
+		this.validationErrors = options.validationErrors;
 	}
 };
 var NetworkError = class extends MoltNetError {
@@ -2415,10 +2423,14 @@ var AuthenticationError = class extends MoltNetError {
 };
 function problemToError(problem, statusCode) {
 	const title = problem.title ?? "Request failed";
-	return new MoltNetError(problem.detail ? `${title}: ${problem.detail}` : title, {
+	const message = problem.detail ? `${title}: ${problem.detail}` : title;
+	const rawErrors = problem.errors;
+	const validationErrors = Array.isArray(rawErrors) ? rawErrors.filter((e) => typeof e === "object" && e !== null && typeof e.field === "string" && typeof e.message === "string") : void 0;
+	return new MoltNetError(message, {
 		code: problem.type ?? problem.code ?? "UNKNOWN",
 		statusCode,
-		detail: problem.detail
+		detail: problem.detail,
+		validationErrors
 	});
 }
 //#endregion
@@ -7767,6 +7779,41 @@ function createMoltNetTools(config) {
 			};
 		}
 	});
+	const listTaskMessages = defineTool({
+		name: "moltnet_list_task_messages",
+		label: "List MoltNet Task Attempt Messages",
+		description: "List messages for a specific task attempt. Use this when you need the turn-by-turn execution record behind an accepted attempt — tool calls, text deltas, and error/info events that do not appear in the attempt output alone.",
+		parameters: Type.Object({
+			taskId: Type.String({ description: "Task ID (UUID)." }),
+			attemptN: Type.Integer({
+				minimum: 1,
+				description: "Attempt number to inspect."
+			}),
+			afterSeq: Type.Optional(Type.Integer({
+				minimum: 0,
+				description: "Optional cursor: only return messages with seq > afterSeq."
+			})),
+			limit: Type.Optional(Type.Integer({
+				minimum: 1,
+				maximum: 500,
+				description: "Optional maximum messages to return. Defaults to the API value."
+			}))
+		}),
+		async execute(_id, params) {
+			const { agent } = ensureConnected(config);
+			const messages = await agent.tasks.listMessages(params.taskId, params.attemptN, {
+				afterSeq: params.afterSeq,
+				limit: params.limit
+			});
+			return {
+				content: [{
+					type: "text",
+					text: JSON.stringify(messages, null, 2)
+				}],
+				details: {}
+			};
+		}
+	});
 	const reviewSessionErrors = defineTool({
 		name: "moltnet_review_session_errors",
 		label: "Review Session Tool Errors",
@@ -7815,6 +7862,7 @@ function createMoltNetTools(config) {
 		createEntry,
 		getTask,
 		listTaskAttempts,
+		listTaskMessages,
 		reviewSessionErrors,
 		defineTool({
 			name: "moltnet_host_exec",
@@ -8113,6 +8161,12 @@ var GUEST_WORKSPACE$2 = "/workspace";
 *     investigation and the alternatives we rejected.
 */
 var GUEST_TASK_SKILLS_MOUNT = "/moltnet-task-skills";
+function shouldRunResumeCommand(entry, ctx) {
+	if (typeof entry === "string") return true;
+	const workspaceModes = entry.when?.workspaceMode;
+	if (workspaceModes && !workspaceModes.includes(ctx.workspaceMode)) return false;
+	return true;
+}
 /**
 * Resolve the main worktree root (where .moltnet/ lives — it's untracked,
 * only exists in the main worktree, not in git worktrees).
@@ -8258,6 +8312,7 @@ async function resumeVm(config) {
 		...envOverrides
 	};
 	const resources = config.sandboxConfig?.resources;
+	const workspaceMode = config.workspaceMode ?? "shared_mount";
 	const vm = await VmCheckpoint.load(config.checkpointPath).resume({
 		httpHooks,
 		env: vmEnv,
@@ -8276,7 +8331,32 @@ async function resumeVm(config) {
   '`);
 		await vmRun(vm, "DNS resolvers", `printf 'nameserver 8.8.8.8\\nnameserver 1.1.1.1\\n' > /etc/resolv.conf`);
 		await vmRun(vm, "git safe.directory", `git config --system --add safe.directory '*'`);
-		for (const [i, cmd] of (config.sandboxConfig?.resumeCommands ?? []).entries()) await vmRun(vm, `resumeCommands[${i}]`, cmd);
+		for (const [i, entry] of (config.sandboxConfig?.resumeCommands ?? []).entries()) {
+			if (!shouldRunResumeCommand(entry, { workspaceMode })) continue;
+			const { run, retries, backoffMs } = typeof entry === "string" ? {
+				run: entry,
+				retries: 0,
+				backoffMs: 2e3
+			} : {
+				run: entry.run,
+				retries: entry.retries ?? 0,
+				backoffMs: entry.retryBackoffMs ?? 2e3
+			};
+			const label = `resumeCommands[${i}]`;
+			let lastErr;
+			for (let attempt = 0; attempt <= retries; attempt++) try {
+				await vmRun(vm, label, run);
+				lastErr = void 0;
+				break;
+			} catch (err) {
+				lastErr = err;
+				if (attempt === retries) break;
+				await new Promise((resolve) => {
+					setTimeout(resolve, (attempt + 1) * backoffMs);
+				});
+			}
+			if (lastErr) throw lastErr instanceof Error ? lastErr : new Error(String(lastErr));
+		}
 		const vmSshDir = `${vmAgentDir}/ssh`;
 		await vm.exec(`mkdir -p ${vmAgentDir}/ssh /home/agent/.pi/agent`);
 		if (creds.piAuthJson !== null) await vm.fs.writeFile("/home/agent/.pi/agent/auth.json", creds.piAuthJson, { mode: 384 });
@@ -8655,7 +8735,8 @@ async function buildAgentSession(args) {
 	await resourceLoader.reload();
 	const sessionManager = args.sessionPersistence ? await resolvePersistentSessionManager({
 		cwd: args.cwdPath,
-		sessionDir: args.sessionPersistence.sessionDir
+		sessionDir: args.sessionPersistence.sessionDir,
+		forkFromSessionPath: args.sessionPersistence.forkFromSessionPath
 	}) : SessionManager.inMemory(args.cwdPath);
 	return (await createAgentSession({
 		agentDir: args.piAuthDir,
@@ -8667,6 +8748,7 @@ async function buildAgentSession(args) {
 	})).session;
 }
 async function resolvePersistentSessionManager(args) {
+	if (args.forkFromSessionPath) return SessionManager.forkFrom(args.forkFromSessionPath, args.cwd, args.sessionDir);
 	await SessionManager.list(args.cwd, args.sessionDir);
 	return SessionManager.continueRecent(args.cwd, args.sessionDir);
 }
@@ -8683,6 +8765,11 @@ var PROMPT_SEPARATOR = "\n\n---\n\n";
 *   - `skill`         → `deliver.skill({ slug, content })` once per ref.
 *                       Slug collisions on distinct contents are
 *                       refused loudly.
+*   - `context_inline`→ persist raw bytes via `deliver.contextFile(...)`
+*                       and inject them into the prompt in an explicit,
+*                       named block. Intended for eval/context experiments
+*                       where the content must be in the model context
+*                       window, not merely discoverable as a skill.
 *   - `prompt_prefix` → content appended to `systemPromptPrefix` with
 *                       the canonical `\n\n---\n\n` separator (in
 *                       declared order).
@@ -8715,6 +8802,13 @@ async function resolveTaskContext(args) {
 				slug: ref.slug,
 				content: ref.content
 			});
+		} else if (ref.binding === "context_inline") {
+			await args.deliver.contextFile({
+				slug: ref.slug,
+				content: ref.content,
+				suggestedFileName: `${ref.slug}.md`
+			});
+			promptParts.push(formatInlineContextBlock(ref.slug, ref.content));
 		} else if (ref.binding === "prompt_prefix") promptParts.push(ref.content);
 		else userParts.push(ref.content);
 		injected.push(ref);
@@ -8725,6 +8819,23 @@ async function resolveTaskContext(args) {
 		userInlineSuffix: userParts.join(PROMPT_SEPARATOR)
 	};
 }
+function formatInlineContextBlock(slug, content) {
+	return [
+		"### Injected Task Context",
+		"",
+		`Context id: \`${slug}\``,
+		"The following raw context was supplied by the task creator. Treat it",
+		"as task-relevant background that may override generic coding instincts",
+		"when it contains repo- or workflow-specific constraints.",
+		"The same content is also materialized in the workspace as",
+		"`/workspace/context-pack.md` and mirrored in `AGENTS.md` for",
+		"repo-context discovery.",
+		"",
+		"<context>",
+		content,
+		"</context>"
+	].join("\n");
+}
 //#endregion
 //#region ../tasks/src/formats.ts
 /**
@@ -8748,6 +8859,7 @@ if (!FormatRegistry.Has("date-time")) FormatRegistry.Set("date-time", (v) => !Nu
 */
 var ContextBinding = Type$1.Union([
 	Type$1.Literal("skill"),
+	Type$1.Literal("context_inline"),
 	Type$1.Literal("prompt_prefix"),
 	Type$1.Literal("user_inline")
 ], { $id: "ContextBinding" });
@@ -8764,9 +8876,14 @@ var ContextBinding = Type$1.Union([
 *            name under the runtime's skill discovery path. Must be
 *            kebab-case-safe (alphanumeric + dashes/underscores).
 * - `binding` — how the bytes are delivered to the LLM (see above).
-* - `content` — the actual bytes (UTF-8 text). Capped at 32 KiB per
+* - `content` — the actual bytes (UTF-8 text). Capped at 64 KiB per
 *               entry; total per-task context bytes are bounded by the
 *               soft `maxItems` cap and per-binding daemon limits.
+*               Raised from 32 KiB in 2026-05 — protocol-heavy operator
+*               skills (e.g. `.claude/skills/legreffier/SKILL.md`) ship
+*               at ~35 KiB inline, and the original cap was sized for
+*               short example skills, not the kind of skill the eval
+*               substrate is dogfooded on (#943, #823).
 */
 var ContextRef = Type$1.Object({
 	slug: Type$1.String({
@@ -8777,7 +8894,7 @@ var ContextRef = Type$1.Object({
 	binding: ContextBinding,
 	content: Type$1.String({
 		minLength: 1,
-		maxLength: 32768
+		maxLength: 65536
 	})
 }, {
 	$id: "ContextRef",
@@ -9341,61 +9458,33 @@ async function validateJudgePackInputAsync(input, ctx) {
 	return errors;
 }
 //#endregion
-//#region ../tasks/src/task-types/judge-eval-variant.ts
+//#region ../tasks/src/task-types/judge-eval-attempt.ts
 /**
-* `judge_eval_variant` — score N variants of a `run_eval` scenario
-* against a single rubric, in one pass, with per-variant subagent
-* isolation.
+* `judge_eval_attempt` — score one completed `run_eval` attempt against a
+* hidden judge rubric.
 *
 * output_kind: judgment
-* criteria: required (`successCriteria.rubric` — same envelope shape as
-*   `judge_pack` / `assess_brief`)
-* references: not required at the input layer — `runTaskIds` already
-*   pin the targets being graded.
-*
-* Slice 2 of #943. The parent task carries the rubric and the list of
-* variant `run_eval` task ids. The pi executor registers the generic
-* `subagent` custom tool (#1087), and the parent LLM calls
-* `subagent({ task, output_schema: 'judge_eval_variant_result' })` once
-* per variant — each child session has fresh context, fetches the
-* variant's accepted attempt output via `moltnet_get_task` /
-* `moltnet_list_task_attempts`, and grades against the rubric.
+* criteria: required (`successCriteria.rubric`)
+* references: not required at the input layer — `targetTaskId` +
+*   `targetAttemptN` pin the producer attempt being judged.
 *
-* Reuses `JudgePackScore` from `judge_pack` for per-criterion scoring
-* (Lane 1 binary via `llm_checklist`, Lane 2 graded via `llm_score`,
-* deterministic_*) — the score shape is the same across judgment
-* tasks; only the wrapping (per-variant grouping + deltas) differs.
-*
-* Cross-task input invariants — "all targets share the same
-* correlation_id, all are `run_eval`, all are completed with an
-* accepted attempt, all share byte-identical `input.successCriteria`"
-* — REQUIRE async DB lookups and live in `validateInputAsync` below,
-* which the task service runs at create time (#1096 wiring). The
-* TypeBox layer here only enforces shape: UUID format,
-* minItems/maxItems, rubric presence + weight invariant.
-*/
-var JUDGE_EVAL_VARIANT_TYPE = "judge_eval_variant";
-var JudgeEvalVariantInput = Type$1.Object({
-	runTaskIds: Type$1.Array(Type$1.String({ format: "uuid" }), {
-		minItems: 2,
-		maxItems: 10
-	}),
+* This replaces the earlier parent/subagent `judge_eval_variant` design.
+* The unit of judgment is one producer attempt. Cross-variant deltas can be
+* computed later at read time from stored scores, rather than materialized as
+* their own task output.
+*/
+var JUDGE_EVAL_ATTEMPT_TYPE = "judge_eval_attempt";
+var JudgeEvalAttemptInput = Type$1.Object({
+	targetTaskId: Type$1.String({ format: "uuid" }),
+	targetAttemptN: Type$1.Integer({ minimum: 1 }),
 	successCriteria: SuccessCriteria
 }, {
-	$id: "JudgeEvalVariantInput",
+	$id: "JudgeEvalAttemptInput",
 	additionalProperties: false
 });
-/**
-* Per-variant grading. `scores[]` shape is identical to `JudgePackScore`
-* (mode-aware: binary via `llm_checklist`, graded via `llm_score`,
-* deterministic_*). Reuse the type rather than re-declare.
-*
-* This is also the **subagent output contract** — the parent's
-* `subagent` tool resolves the contract name `judge_eval_variant_result`
-* to this schema. See `agent-runtime`'s subagent contract registry.
-*/
-var JudgeEvalVariantResult = Type$1.Object({
-	runTaskId: Type$1.String({ format: "uuid" }),
+var JudgeEvalAttemptOutput = Type$1.Object({
+	targetTaskId: Type$1.String({ format: "uuid" }),
+	targetAttemptN: Type$1.Integer({ minimum: 1 }),
 	variantLabel: Type$1.String({
 		minLength: 1,
 		maxLength: 64,
@@ -9406,216 +9495,126 @@ var JudgeEvalVariantResult = Type$1.Object({
 		minimum: 0,
 		maximum: 1
 	}),
-	verdict: Type$1.String({ minLength: 1 })
-}, {
-	$id: "JudgeEvalVariantResult",
-	additionalProperties: false
-});
-var JudgeEvalVariantOutput = Type$1.Object({
-	results: Type$1.Array(JudgeEvalVariantResult, { minItems: 2 }),
-	deltas: Type$1.Optional(Type$1.Record(Type$1.String(), Type$1.Number({
-		minimum: -1,
-		maximum: 1
-	}))),
+	verdict: Type$1.String({ minLength: 1 }),
 	judgeModel: Type$1.Optional(Type$1.String({ minLength: 1 })),
 	traceparent: Type$1.String({ minLength: 1 })
 }, {
-	$id: "JudgeEvalVariantOutput",
+	$id: "JudgeEvalAttemptOutput",
 	additionalProperties: false
 });
-/**
-* Synchronous input invariants beyond TypeBox shape: rubric must be
-* present (already required by the schema, but the rubric body has
-* its own per-criterion weight invariant) and the rubric's weights
-* must sum to 1.
-*
-* Cross-task invariants (all targets are `run_eval`, all completed,
-* share `correlation_id`, byte-identical `input.successCriteria`)
-* are NOT checked here — they require async DB lookups against
-* `runTaskIds` and live in `validateJudgeEvalVariantInputAsync`
-* below, invoked by the task service at create time (#1096).
-*/
-function validateJudgeEvalVariantInput(input) {
+function validateJudgeEvalAttemptInput(input) {
 	const sc = input.successCriteria;
-	if (!sc) return "successCriteria is required for judge_eval_variant";
-	if (!sc.rubric) return "successCriteria.rubric is required for judge_eval_variant";
+	if (!sc) return "successCriteria is required for judge_eval_attempt";
+	if (!sc.rubric) return "successCriteria.rubric is required for judge_eval_attempt";
 	return validateRubricWeights(sc.rubric);
 }
-/**
-* Output cross-field invariants the schema cannot express:
-*
-*   1. `results.length === input.runTaskIds.length` — every variant
-*      the imposer asked for must be graded. Partial grading
-*      invalidates cross-variant comparison; fail the whole task
-*      rather than silently report a subset.
-*
-*   2. `results[i].runTaskId === input.runTaskIds[i]` — order is
-*      load-bearing for downstream consumers (e.g. deltas keyed by
-*      adjacent pairs). Mismatch is an LLM bug; reject loudly.
-*
-*   3. Each `result.scores` follows the same `llm_checklist` rule
-*      `judge_pack` enforces (#999): if a score has an `assertions`
-*      array, the numeric score MUST be `1` iff every assertion
-*      passes. Inconsistent payloads pollute attestations.
-*
-*   4. Each `result.composite` MUST equal the rubric-weighted sum
-*      `Σ(weight_j × scores[j].score)`. The parent (and any subagent
-*      it delegated to) is supposed to compute this; surfacing a
-*      drift here catches LLMs that hand-wave the arithmetic.
-*
-*   5. Optional `deltas` keys MUST be of the form `"A - B"` where
-*      both `A` and `B` are variantLabels present in `results`.
-*      Values are not range-checked (any float in [-1, 1] is
-*      arithmetically possible).
-*/
-function validateJudgeEvalVariantOutput(output, input) {
+function validateJudgeEvalAttemptOutput(output, input) {
 	const out = output;
 	const inp = input;
 	if (inp) {
-		if (out.results.length !== inp.runTaskIds.length) return `results.length (${out.results.length}) does not match input.runTaskIds.length (${inp.runTaskIds.length}). Every variant must be graded; partial grading is rejected.`;
-		for (let i = 0; i < out.results.length; i++) if (out.results[i].runTaskId !== inp.runTaskIds[i]) return `results[${i}].runTaskId (${out.results[i].runTaskId}) does not match input.runTaskIds[${i}] (${inp.runTaskIds[i]}). Order must align with input for downstream delta computation.`;
+		if (out.targetTaskId !== inp.targetTaskId) return `output.targetTaskId (${out.targetTaskId}) does not match input.targetTaskId (${inp.targetTaskId})`;
+		if (out.targetAttemptN !== inp.targetAttemptN) return `output.targetAttemptN (${out.targetAttemptN}) does not match input.targetAttemptN (${inp.targetAttemptN})`;
 	}
-	for (let r = 0; r < out.results.length; r++) {
-		const result = out.results[r];
-		for (let s = 0; s < result.scores.length; s++) {
-			const sc = result.scores[s];
-			if (!sc.assertions) continue;
-			const allPassed = sc.assertions.every((a) => a.passed);
-			const expected = allPassed ? 1 : 0;
-			if (sc.score !== expected) return `results[${r}].scores[${s}] (criterionId="${sc.criterionId}"): assertions ${allPassed ? "all pass" : "have at least one fail"} but score=${sc.score}. Score must be derived: 1 iff every assertion passes, else 0 (#999 llm_checklist rule).`;
-		}
+	for (let s = 0; s < out.scores.length; s++) {
+		const sc = out.scores[s];
+		if (!sc.assertions) continue;
+		const allPassed = sc.assertions.every((a) => a.passed);
+		const expected = allPassed ? 1 : 0;
+		if (sc.score !== expected) return `scores[${s}] (criterionId="${sc.criterionId}"): assertions ${allPassed ? "all pass" : "have at least one fail"} but score=${sc.score}. Score must be 1 iff every assertion passes, else 0.`;
 	}
 	if (inp?.successCriteria?.rubric) {
 		const criteria = inp.successCriteria.rubric.criteria;
 		const weightById = new Map(criteria.map((c) => [c.id, c.weight]));
-		for (let r = 0; r < out.results.length; r++) {
-			const result = out.results[r];
-			let sum = 0;
-			for (const sc of result.scores) {
-				const w = weightById.get(sc.criterionId);
-				if (w === void 0) return `results[${r}].scores: criterionId "${sc.criterionId}" is not in the input rubric (known: ${Array.from(weightById.keys()).join(", ")}). Score every rubric criterion exactly once; do not invent new ids.`;
-				sum += w * sc.score;
-			}
-			if (Math.abs(sum - result.composite) > .001) return `results[${r}].composite (${result.composite}) does not match Σ(weight × score) (${sum.toFixed(6)}). Composite must be the rubric-weighted sum of per-criterion scores (drift > 0.001).`;
-		}
-	}
-	if (out.deltas) {
-		const labels = new Set(out.results.map((r) => r.variantLabel));
-		for (const key of Object.keys(out.deltas)) {
-			const m = /^(.+?) - (.+)$/.exec(key);
-			if (!m) return `deltas key "${key}" is not of the form "<variantLabel-A> - <variantLabel-B>". Use a single space-hyphen-space separator between labels.`;
-			const [, a, b] = m;
-			if (!labels.has(a) || !labels.has(b)) return `deltas key "${key}" references variantLabel(s) not present in results: ${!labels.has(a) ? `"${a}" missing` : ""}${!labels.has(a) && !labels.has(b) ? ", " : ""}${!labels.has(b) ? `"${b}" missing` : ""}`;
+		let sum = 0;
+		for (const sc of out.scores) {
+			const w = weightById.get(sc.criterionId);
+			if (w === void 0) return `scores references unknown criterionId "${sc.criterionId}"`;
+			sum += w * sc.score;
 		}
+		const rounded = Math.round(sum * 1e3) / 1e3;
+		if (Math.abs(rounded - out.composite) > .001) return `composite (${out.composite}) does not match weighted rubric sum (${rounded})`;
 	}
 	return null;
 }
-/**
-* Local stable-stringify for cross-variant `successCriteria` byte-
-* equality. Recursively sorts object keys; arrays preserve order
-* (intentional — rubric criteria order is semantically meaningful).
-* Mirrors the canonical-JSON shape `crypto-service` uses for CIDs,
-* without taking on a crypto-service dep just for this comparison.
-*/
-function stableStringify(value) {
-	if (value === null || typeof value !== "object") return JSON.stringify(value);
-	if (Array.isArray(value)) return "[" + value.map(stableStringify).join(",") + "]";
-	const obj = value;
-	return "{" + Object.keys(obj).sort().map((k) => JSON.stringify(k) + ":" + stableStringify(obj[k])).join(",") + "}";
-}
-/**
-* Async preflight for `judge_eval_variant` (#1096 + #943):
-*
-*  1. Every `runTaskIds[i]` resolves to a task the caller can read.
-*  2. Every resolved task is `taskType === 'run_eval'`.
-*  3. Every resolved task is `status === 'completed'` with a
-*     non-null `acceptedAttemptN` — grading an unaccepted attempt
-*     races with re-attempts and pollutes the judge attestation.
-*  4. Every resolved task shares a non-null `correlationId`, and all
-*     `correlationId`s are equal. Without this an imposer could
-*     fabricate a "variant set" by stapling unrelated runs together.
-*  5. The shared `correlationId` is NOT already sealed. A previous
-*     judge_eval_variant against the same group is final; produce a
-*     fresh correlation_id for a new judging round rather than
-*     adding contradictory verdicts to a sealed group.
-*  6. Every variant's `input.successCriteria` is byte-identical (via
-*     stable-stringify). Different rubrics across "variants" makes
-*     the comparison meaningless.
-*/
-async function validateJudgeEvalVariantInputAsync(input, ctx) {
-	const { runTaskIds } = input;
+async function validateJudgeEvalAttemptInputAsync(input, ctx) {
+	const inp = input;
 	const errors = [];
-	const resolved = await Promise.all(runTaskIds.map((id) => ctx.resolveTask(id)));
-	let missingTargets = false;
-	const presentTargets = [];
-	for (let i = 0; i < runTaskIds.length; i++) {
-		const t = resolved[i];
-		if (!t) {
-			missingTargets = true;
-			errors.push({
-				field: `runTaskIds[${i}]`,
-				message: `runTaskIds[${i}]=${runTaskIds[i]} does not resolve to a task you can read`
-			});
-			continue;
-		}
-		presentTargets.push(t);
-		if (t.taskType !== "run_eval") errors.push({
-			field: `runTaskIds[${i}]`,
-			message: `runTaskIds[${i}]=${runTaskIds[i]} is a ${t.taskType}, not a run_eval`
-		});
-		if (t.status !== "completed" || t.acceptedAttemptN === null) errors.push({
-			field: `runTaskIds[${i}]`,
-			message: `runTaskIds[${i}]=${runTaskIds[i]} is not completed with an accepted attempt (status=${t.status}, acceptedAttemptN=${t.acceptedAttemptN})`
-		});
-	}
-	if (missingTargets || presentTargets.length === 0) return errors;
-	const correlationIds = new Set(presentTargets.map((t) => t.correlationId ?? "__null__"));
-	if (correlationIds.has("__null__")) errors.push({
-		field: "runTaskIds",
-		message: "one or more run_eval targets have no correlation_id; cannot group as variants"
+	const target = await ctx.resolveTask(inp.targetTaskId);
+	if (!target) return [{
+		field: "targetTaskId",
+		message: `targetTaskId=${inp.targetTaskId} does not resolve to a task you can read`
+	}];
+	if (target.taskType !== "run_eval") errors.push({
+		field: "targetTaskId",
+		message: `targetTaskId=${inp.targetTaskId} is a ${target.taskType}, not a run_eval`
+	});
+	if (target.status !== "completed" || target.acceptedAttemptN === null) errors.push({
+		field: "targetTaskId",
+		message: `targetTaskId=${inp.targetTaskId} is not completed with an accepted attempt (status=${target.status}, acceptedAttemptN=${target.acceptedAttemptN})`
 	});
-	if (correlationIds.size > 1) errors.push({
-		field: "runTaskIds",
-		message: `run_eval targets span multiple correlation_ids (${Array.from(correlationIds).join(", ")}); variants must share one`
+	else if (target.acceptedAttemptN !== inp.targetAttemptN) errors.push({
+		field: "targetAttemptN",
+		message: `targetAttemptN=${inp.targetAttemptN} does not match the producer's acceptedAttemptN=${target.acceptedAttemptN}`
 	});
-	if (errors.length > 0) return errors;
-	const correlationId = presentTargets[0].correlationId;
-	if (!correlationId) return errors;
-	const seal = await ctx.findCorrelationSeal(correlationId);
-	if (seal) errors.push({
-		field: "runTaskIds",
-		message: `correlation_id ${correlationId} is already sealed by ${seal.sealedByTaskType}/${seal.sealedByTaskId} at ${seal.sealedAt}; use a fresh correlation_id for a new judging round`
+	if (!target.correlationId) errors.push({
+		field: "targetTaskId",
+		message: "target run_eval has no correlation_id; cannot enforce duplicate-judge protection"
+	});
+	if (errors.length > 0 || !target.correlationId) return errors;
+	const rubric = inp.successCriteria.rubric;
+	const duplicate = (await ctx.listTasksByCorrelation(target.correlationId)).find((task) => {
+		if (task.taskType !== "judge_eval_attempt") return false;
+		if (task.status === "failed" || task.status === "cancelled" || task.status === "expired") return false;
+		const existing = task.input;
+		const existingRubric = existing.successCriteria?.rubric;
+		return existing.targetTaskId === inp.targetTaskId && existing.targetAttemptN === inp.targetAttemptN && existingRubric?.rubricId === rubric?.rubricId && existingRubric?.version === rubric?.version;
+	});
+	if (duplicate) errors.push({
+		field: "targetTaskId",
+		message: `judge task ${duplicate.id} already exists for (${inp.targetTaskId}, attempt ${inp.targetAttemptN}, rubric ${rubric?.rubricId}@${rubric?.version})`
 	});
-	const first = stableStringify(presentTargets[0].input.successCriteria);
-	for (let i = 1; i < presentTargets.length; i++) if (stableStringify(presentTargets[i].input.successCriteria) !== first) {
-		errors.push({
-			field: `runTaskIds[${i}]`,
-			message: `runTaskIds[${i}] has a different input.successCriteria than runTaskIds[0]; all variants must share the rubric and gates`
-		});
-		break;
-	}
 	return errors;
 }
-/**
-* Side effect emitted on successful `judge_eval_variant` create:
-* seal the shared correlation_id atomically with the insert. The
-* task service applies the seal in the same transaction; a
-* concurrent second `judge_eval_variant` against the same group
-* loses the race and is rejected with a clean conflict error.
-*
-* The seal applies to the SHARED correlation_id of the targets —
-* NOT to the judge task's own correlationId (which is typically
-* null or distinct). The task service derives the correlationId
-* for the effect from the resolved targets, not from the judge
-* task row.
-*/
-async function onCreateJudgeEvalVariant(input, ctx) {
-	const { runTaskIds } = input;
-	const first = await ctx.resolveTask(runTaskIds[0]);
-	if (!first?.correlationId) return [];
+async function onCreateJudgeEvalAttempt(input, _ctx) {
+	const judge = input;
+	const rubric = judge.successCriteria.rubric;
+	if (!rubric) return [];
 	return [{
-		kind: "sealCorrelation",
-		correlationId: first.correlationId
+		kind: "guardTaskUniqueness",
+		taskType: JUDGE_EVAL_ATTEMPT_TYPE,
+		lockKey: [
+			JUDGE_EVAL_ATTEMPT_TYPE,
+			judge.targetTaskId,
+			String(judge.targetAttemptN),
+			rubric.rubricId,
+			rubric.version
+		].join(":"),
+		inputMatches: [
+			{
+				path: ["targetTaskId"],
+				value: judge.targetTaskId
+			},
+			{
+				path: ["targetAttemptN"],
+				value: judge.targetAttemptN
+			},
+			{
+				path: [
+					"successCriteria",
+					"rubric",
+					"rubricId"
+				],
+				value: rubric.rubricId
+			},
+			{
+				path: [
+					"successCriteria",
+					"rubric",
+					"version"
+				],
+				value: rubric.version
+			}
+		]
 	}];
 }
 //#endregion
@@ -9739,14 +9738,43 @@ async function validateRenderPackInputAsync(input, ctx) {
 //#region ../tasks/src/task-types/run-eval.ts
 /**
 * `run_eval` — execute a scenario prompt under a named variant for
-* later cross-variant grading by `judge_eval_variant` (Slice 2).
+* later per-attempt grading by `judge_eval_attempt` tasks.
 *
 * output_kind: artifact
-* criteria: optional (when set, output.verification is required —
-*   producer self-assessment; the judge is the binding evaluator)
+* criteria: optional producer-only checks (when set,
+*   output.verification is required — the judge rubric remains hidden
+*   on downstream `judge_eval_attempt` tasks)
 * references: not required (scenario lives entirely in input)
 */
 var RUN_EVAL_TYPE = "run_eval";
+var RunEvalMode = Type$1.Union([Type$1.Literal("vitro"), Type$1.Literal("vivo")], { $id: "RunEvalMode" });
+var RunEvalWorkspace = Type$1.Union([
+	Type$1.Literal("none"),
+	Type$1.Literal("shared_mount"),
+	Type$1.Literal("dedicated_worktree")
+], { $id: "RunEvalWorkspace" });
+var RunEvalExecution = Type$1.Object({
+	mode: RunEvalMode,
+	workspace: RunEvalWorkspace
+}, {
+	$id: "RunEvalExecution",
+	additionalProperties: false
+});
+/**
+* Producer-visible checks for `run_eval`. Deliberately forbids `rubric`
+* so the variant runner cannot see the downstream judge's answer key.
+* Keep the rest of the SuccessCriteria envelope available for generic
+* process / structure checks (`gates`, `assertions`, `sideEffects`).
+*/
+var RunEvalSuccessCriteria = Type$1.Object({
+	version: Type$1.Literal(1),
+	gates: Type$1.Optional(SuccessCriteria.properties.gates),
+	assertions: Type$1.Optional(SuccessCriteria.properties.assertions),
+	sideEffects: Type$1.Optional(SuccessCriteria.properties.sideEffects)
+}, {
+	$id: "RunEvalSuccessCriteria",
+	additionalProperties: false
+});
 var RunEvalInput = Type$1.Object({
 	scenario: Type$1.Object({
 		prompt: Type$1.String({ minLength: 1 }),
@@ -9756,8 +9784,9 @@ var RunEvalInput = Type$1.Object({
 		minLength: 1,
 		maxLength: 64
 	}),
+	execution: RunEvalExecution,
 	context: TaskContext,
-	successCriteria: Type$1.Optional(SuccessCriteria)
+	successCriteria: Type$1.Optional(RunEvalSuccessCriteria)
 }, {
 	$id: "RunEvalInput",
 	additionalProperties: false
@@ -9785,8 +9814,8 @@ var RunEvalOutput = Type$1.Object({
 function validateRunEvalOutput(output, input) {
 	const hasCriteria = input !== null && input !== void 0 && input.successCriteria !== void 0;
 	const hasVerification = output !== null && output !== void 0 && output.verification !== void 0;
-	if (hasCriteria && !hasVerification) return "output.verification is required because input.successCriteria is set; the producer LLM must self-assess against the criteria";
-	if (!hasCriteria && hasVerification) return "output.verification was supplied but input.successCriteria is unset; omit verification when there are no criteria to assess against";
+	if (hasCriteria && !hasVerification) return "output.verification is required because input.successCriteria is set; the producer LLM must self-assess against the producer checks";
+	if (!hasCriteria && hasVerification) return "output.verification was supplied but input.successCriteria is unset; omit verification when there are no producer checks to assess against";
 	return null;
 }
 //#endregion
@@ -9902,24 +9931,24 @@ var BUILT_IN_TASK_TYPES = {
 		inputSchema: RunEvalInput,
 		outputSchema: RunEvalOutput,
 		outputKind: "artifact",
-		workspaceScope: "attempt",
+		resumable: true,
+		workspaceScope: "session",
 		sessionScope: "custom",
 		requiresReferences: false,
 		validateOutput: validateRunEvalOutput
 	},
-	[JUDGE_EVAL_VARIANT_TYPE]: {
-		name: JUDGE_EVAL_VARIANT_TYPE,
-		inputSchema: JudgeEvalVariantInput,
-		outputSchema: JudgeEvalVariantOutput,
+	[JUDGE_EVAL_ATTEMPT_TYPE]: {
+		name: JUDGE_EVAL_ATTEMPT_TYPE,
+		inputSchema: JudgeEvalAttemptInput,
+		outputSchema: JudgeEvalAttemptOutput,
 		outputKind: "judgment",
 		workspaceScope: "attempt",
-		sessionScope: "custom",
+		sessionScope: "none",
 		requiresReferences: false,
-		validateInput: validateJudgeEvalVariantInput,
-		validateOutput: validateJudgeEvalVariantOutput,
-		validateInputAsync: validateJudgeEvalVariantInputAsync,
-		onCreate: onCreateJudgeEvalVariant,
-		usesSubagents: true
+		validateInput: validateJudgeEvalAttemptInput,
+		validateOutput: validateJudgeEvalAttemptOutput,
+		validateInputAsync: validateJudgeEvalAttemptInputAsync,
+		onCreate: onCreateJudgeEvalAttempt
 	}
 };
 //#endregion
@@ -10283,20 +10312,16 @@ function buildFinalOutputBlock(opts) {
 		"## Final output (read this carefully)",
 		"",
 		`Your VERY LAST action in this conversation MUST report the structured`,
-		`output matching \`${outputSchemaName}\`. Two ways to do it, in order of`,
-		`preference:`,
+		`output matching \`${outputSchemaName}\`.`,
 		"",
-		`1. **Preferred — call \`${submitTool}\` exactly once** with the payload.`,
-		`   The runtime captures the validated arguments and ends the session.`,
-		`   If the tool is registered, prefer this path.`,
-		`2. **Fallback** — if the submit tool is unavailable, your very last`,
-		`   assistant message MUST be a single JSON object matching`,
-		`   \`${outputSchemaName}\`. No prose before or after. No code fences.`,
-		`   No "ok" or "done". The runtime parses the last balanced top-level`,
-		`   JSON object as the output.`,
+		`Call \`${submitTool}\` exactly once with the payload.`,
+		`The runtime captures the validated arguments and ends the session.`,
+		`Do NOT emit the output as plain assistant text. Do NOT rely on a`,
+		`JSON-in-message fallback. If you do not call \`${submitTool}\`, the`,
+		`attempt fails even if the underlying work succeeded.`,
 		"",
-		`Failing to report structured output as the very last action means the`,
-		`attempt is marked failed even if the underlying work succeeded.`,
+		`Your final assistant text before that tool call may explain your work,`,
+		`but the submit-tool call itself must be your VERY LAST action.`,
 		"",
 		`Output shape:`,
 		"",
@@ -10434,21 +10459,30 @@ function buildAssessBriefUserPrompt(input, ctx) {
 }
 //#endregion
 //#region ../agent-runtime/src/prompts/self-verification.ts
-function buildSelfVerificationBlock(taskId) {
+function buildSelfVerificationBlock(taskId, criteriaField = "successCriteria") {
 	return [
 		"## Self-verification",
 		"",
-		`Call \`moltnet_get_task\` with task id \`${taskId}\` and read \`input.successCriteria\`.`,
+		`If \`input.${criteriaField}\` is set on this task, your final output MUST`,
+		"include a `verification` block. **The runtime/server rejects task",
+		`submission without \`verification\` when \`${criteriaField}\` is present**`,
+		"— the request fails validation and the attempt is discarded, even if the",
+		"underlying work succeeded. Do not call the submit tool until you have",
+		"computed the verification payload.",
 		"",
-		"- If `input.successCriteria` is **absent**, omit `verification` from your",
+		`Call \`moltnet_get_task\` with task id \`${taskId}\` and read \`input.${criteriaField}\`.`,
+		"",
+		`- If \`input.${criteriaField}\` is **absent**, omit \`verification\` from your`,
 		"  final output entirely.",
-		"- If `input.successCriteria` is **present**, you MUST include a",
-		"  `verification` block in your final output. Evaluate every applicable",
+		`- If \`input.${criteriaField}\` is **present**, evaluate every applicable`,
 		"  item — `gates`, `assertions`, `rubric` criteria, `sideEffects` — against",
 		"  your produced work and emit one result per id. Be honest: a `fail` with",
 		"  a one-line reason is more useful than a false `pass`. Use `skip` (with a",
 		"  `detail`) when you genuinely could not determine a result. Compute",
 		"  `passed = results.every(r => r.status !== 'fail')`.",
+		"- `verification` MUST be a JSON object. Never send a string, markdown",
+		"  block, null, or an empty placeholder. The submit tool expects an object",
+		"  with `inputCid`, `results`, and `passed` fields.",
 		"",
 		"Verification shape:",
 		"",
@@ -10462,6 +10496,23 @@ function buildSelfVerificationBlock(taskId) {
 		"  \"passed\": <boolean>",
 		"}",
 		"```",
+		"",
+		"Minimal valid example:",
+		"",
+		"```json",
+		"{",
+		"  \"inputCid\": \"<task inputCid>\",",
+		"  \"results\": [",
+		"    {",
+		"      \"id\": \"<criterion id>\",",
+		"      \"kind\": \"rubric\",",
+		"      \"status\": \"pass\",",
+		"      \"detail\": \"one-line reason\"",
+		"    }",
+		"  ],",
+		"  \"passed\": true",
+		"}",
+		"```",
 		""
 	].join("\n");
 }
@@ -10712,69 +10763,62 @@ function buildFulfillBriefUserPrompt(input, ctx) {
 	].filter(Boolean).join("\n");
 }
 //#endregion
-//#region ../agent-runtime/src/prompts/judge-eval-variant.ts
-/**
-* Build the first user-message prompt for a `judge_eval_variant` task
-* (#943 Slice 2).
-*
-* The parent agent's job is **fan-out-and-collect**: for each
-* `runTaskIds[i]`, spawn an isolated subagent via the `subagent` custom
-* tool (#1087), have it grade that variant against the shared rubric,
-* and collect each subagent's structured `judge_eval_variant_result`
-* payload. The parent does NOT grade itself; it composes the per-
-* variant results into the final `judge_eval_variant` output (results
-* array + optional deltas + verdicts).
-*
-* Isolation is the point: each variant gets a fresh subagent session
-* with no carryover context from sibling variants, so per-variant
-* grading is independent. Cost is bounded by `maxItems: 10` on
-* runTaskIds.
-*/
-function buildJudgeEvalVariantUserPrompt(input, ctx) {
-	const { runTaskIds, successCriteria } = input;
-	const rubric = successCriteria.rubric;
-	if (!rubric) throw new Error("judge_eval_variant requires successCriteria.rubric — none present");
+//#region ../agent-runtime/src/prompts/judge-eval-attempt.ts
+function buildJudgeEvalAttemptUserPrompt(input, ctx) {
+	const rubric = input.successCriteria.rubric;
+	if (!rubric) throw new Error("judge_eval_attempt requires successCriteria.rubric — none present");
 	const escapeCell = (s) => s.replace(/\\/g, "\\\\").replace(/\|/g, "\\|").replace(/\r?\n/g, " ");
 	const criteriaTable = rubric.criteria.map((c) => `| \`${c.id}\` | ${c.weight.toFixed(3)} | ${c.scoring} | ${escapeCell(c.description)} |`).join("\n");
-	const targetsBlock = runTaskIds.map((id, i) => `${i + 1}. \`${id}\``).join("\n");
 	const finalOutputBlock = buildFinalOutputBlock({
-		taskType: "judge_eval_variant",
-		outputSchemaName: "JudgeEvalVariantOutput",
+		taskType: "judge_eval_attempt",
+		outputSchemaName: "JudgeEvalAttemptOutput",
 		shapeSketch: [
 			"{",
-			"  \"results\": [",
-			"    {",
-			"      \"runTaskId\": \"<runTaskIds[i]>\",",
-			"      \"variantLabel\": \"<from variant input>\",",
-			"      \"scores\": [ { \"criterionId\": \"...\", \"score\": 0..1, \"rationale\": \"...\", \"assertions\": [...]? } ],",
-			"      \"composite\": <Σ(weight × score), 0..1>,",
-			"      \"verdict\": \"<1-3 sentences>\"",
-			"    },",
-			"    ...one entry per runTaskIds[i], same order",
-			"  ],",
-			"  \"deltas\": { \"<labelA> - <labelB>\": <composite(A) - composite(B)> },  // optional",
+			`  "targetTaskId": "${input.targetTaskId}",`,
+			`  "targetAttemptN": ${input.targetAttemptN},`,
+			"  \"variantLabel\": \"<from producer input>\",",
+			"  \"scores\": [ { \"criterionId\": \"...\", \"score\": 0..1, \"rationale\": \"...\", \"assertions\": [...]? } ],",
+			"  \"composite\": <Σ(weight × score), 0..1>,",
+			"  \"verdict\": \"<1-3 sentences>\",",
 			"  \"judgeModel\": \"<id>\",  // optional",
 			"  \"traceparent\": \"<from claim>\"",
 			"}"
 		].join("\n")
 	});
+	const workspaceSection = ctx.workspace?.attached === true ? [
+		"### Workspace",
+		"",
+		"Your current workspace is already attached to the producer attempt",
+		"you are judging. Inspect files directly from the current workspace",
+		"root instead of inventing synthetic `artifact_<taskId>` paths.",
+		"If the accepted attempt output lists `artifacts[].path`, treat those",
+		"paths as relative to the current workspace root unless the output",
+		"explicitly says otherwise.",
+		ctx.workspace.mode === "dedicated_worktree" ? `This attachment is a dedicated producer worktree${ctx.workspace.branch ? ` on branch \`${ctx.workspace.branch}\`` : ""}.` : ctx.workspace.mode === "scratch_mount" ? "This attachment is the producer scratch workspace mounted with shadow writes for safe inspection." : "This attachment is the producer shared workspace mounted with shadow writes for safe inspection.",
+		""
+	].join("\n") : "";
 	return [
-		"# Judge Eval Variants\n",
-		`You are grading ${runTaskIds.length} variants of a single run_eval scenario`,
-		"against ONE shared rubric. Your job is fan-out-and-collect — you do not",
-		"grade yourself.",
+		"# Judge Eval Attempt\n",
+		"You are grading one accepted `run_eval` producer attempt against a hidden",
+		"judge rubric. Do not delegate to subagents. Grade in this session only.",
 		"",
 		`Task id: \`${ctx.taskId}\``,
 		`Diary: \`${ctx.diaryId}\``,
+		`Producer task: \`${input.targetTaskId}\``,
+		`Producer attempt: \`${input.targetAttemptN}\``,
 		"",
-		"### Targets (variants to grade)",
-		"",
-		targetsBlock,
+		"### Evidence gathering",
 		"",
-		"Each target is a completed `run_eval` task in the same correlation group.",
-		"Read its accepted attempt via `moltnet_get_task` / `moltnet_list_task_attempts`",
-		"to see the producer's output before grading.",
+		`1. Call \`moltnet_get_task\` with taskId=\`${input.targetTaskId}\`.`,
+		`2. Call \`moltnet_list_task_attempts\` with taskId=\`${input.targetTaskId}\` and inspect the accepted attempt matching \`${input.targetAttemptN}\`.`,
+		`3. Call \`moltnet_list_task_messages\` with taskId=\`${input.targetTaskId}\`, attemptN=\`${input.targetAttemptN}\` to inspect the producer's turn-by-turn behavior.`,
+		"4. Use the accepted attempt output, attempt messages, and any accessible",
+		"   artifacts or workspace evidence available in your environment.",
+		"   Read artifact files from the mounted producer workspace when present;",
+		"   do not assume detached `artifact_<taskId>` directories exist.",
+		"5. Score strictly against the rubric below.",
 		"",
+		workspaceSection,
 		"### Rubric",
 		"",
 		rubric.preamble ? `${rubric.preamble}\n` : "",
@@ -10782,34 +10826,10 @@ function buildJudgeEvalVariantUserPrompt(input, ctx) {
 		"| --- | --- | --- | --- |",
 		criteriaTable,
 		"",
-		"### How to grade",
-		"",
-		"For EACH `runTaskIds[i]`:",
-		"",
-		"1. Call the `subagent` custom tool with:",
-		"   - `task`: a brief instructing the subagent to grade ONLY that variant",
-		"     against the rubric above; include the target task id and the rubric",
-		"     verbatim. The subagent has the same MoltNet tools and can fetch the",
-		"     accepted attempt output independently.",
-		"   - `output_schema`: `\"judge_eval_variant_result\"`",
-		"2. Receive the subagent's structured `judge_eval_variant_result` payload.",
-		"3. Append it to your `results[]` array, **in the same order as input.runTaskIds**.",
-		"",
-		"Do NOT score any variant in your own session. The whole point of the",
-		"subagent fan-out is per-variant context isolation — grading two variants",
-		"back-to-back in one session lets the second be biased by the first.",
-		"",
 		"### Composite arithmetic",
 		"",
-		"Each `composite` MUST equal `Σ(criterion.weight × score)` over the rubric",
-		"criteria. Drift > 0.001 is rejected. Subagents are instructed to compute it",
-		"themselves; double-check before assembling the final output.",
-		"",
-		"### Deltas (optional)",
-		"",
-		"If useful, populate `deltas` with pairwise composite differences keyed by",
-		"`\"<variantLabel-A> - <variantLabel-B>\"` (single space-hyphen-space). Both",
-		"labels must appear in `results`. Omit `deltas` entirely if not used.",
+		"Your `composite` MUST equal `Σ(criterion.weight × score)` over the rubric",
+		"criteria. Drift > 0.001 is rejected.",
 		"",
 		finalOutputBlock
 	].filter((s) => s !== "").join("\n");
@@ -11106,8 +11126,9 @@ function buildRenderPackUserPrompt(input, ctx) {
 * Build the first user-message prompt for a `run_eval` task.
 *
 * Free-form: no git workflow, no commit ceremony. The executor produces
-* a textual response (and optional file artifacts) that a later
-* `judge_eval_variant` task (Slice 2) grades against the rubric.
+* a textual response (and optional file artifacts) that later
+* `judge_eval_attempt` task(s) grade against their own hidden
+* rubric.
 *
 * Context delivery is handled by `resolveTaskContext` (see
 * libs/agent-runtime/src/context-bindings.ts) and runs BEFORE this
@@ -11117,7 +11138,9 @@ function buildRenderPackUserPrompt(input, ctx) {
 * builder does NOT inline `input.context[]` itself.
 */
 function buildRunEvalUserPrompt(input, ctx) {
-	const { scenario, variantLabel, successCriteria } = input;
+	const { scenario, variantLabel, execution, successCriteria } = input;
+	const hasContext = input.context.length > 0;
+	const hasInlineContext = input.context.some((entry) => entry.binding === "context_inline");
 	const inputFilesSection = scenario.inputFiles?.length ? [
 		"### Input files",
 		"",
@@ -11130,9 +11153,30 @@ function buildRunEvalUserPrompt(input, ctx) {
 		"",
 		`This task carries correlationId \`${ctx.correlationId}\`. It joins`,
 		"this variant to its sibling `run_eval` tasks (other variants of the",
-		"same scenario) and to the eventual `judge_eval_variant` task that",
-		"will grade them together. You do not need to act on it directly —",
-		"it is recorded for cross-variant aggregation at query time.",
+		"same scenario and to any later `judge_eval_attempt` tasks created",
+		"against those variants. You do not need to act on it directly — it",
+		"is recorded for cross-variant aggregation at query time.",
+		""
+	].join("\n") : "";
+	const executionSection = [
+		"### Execution mode",
+		"",
+		`Mode: \`${execution.mode}\``,
+		`Workspace: \`${execution.workspace}\``,
+		execution.workspace === "none" ? "You are running in a scratch workspace with no repository checkout mounted. Do not assume git history or repo files are present unless the scenario provided them explicitly." : execution.workspace === "shared_mount" ? "You are running against the daemon shared mount. Treat any repository mutations as affecting the mounted checkout directly." : "You are running in a dedicated disposable git worktree isolated from the daemon shared checkout.",
+		""
+	].join("\n");
+	const contextDisciplineSection = hasContext ? [
+		"### Injected context discipline",
+		"",
+		"This task includes extra injected context from the task creator.",
+		"You MUST inspect and use that context BEFORE you write solution",
+		"files or draft your final answer.",
+		"Do not solve first and only review the context afterward.",
+		hasInlineContext ? "For `context_inline`, your FIRST content-inspection step should be a `read` of `/workspace/context-pack.md` before your first `write` call. The same content is also mirrored in `/workspace/AGENTS.md` and may be referenced from `/workspace/.claude/CLAUDE.md`." : "If injected context was provided as a skill, inspect that task-injected context before solving.",
+		hasInlineContext ? "If `/workspace/context-pack.md` exists and you skip reading it before writing solution files, you are not following the task instructions." : "Do not rely on memory alone when task-injected context is available; inspect it first.",
+		"If the injected context contains repo- or workflow-specific rules,",
+		"those rules override your generic instincts.",
 		""
 	].join("\n") : "";
 	const finalOutputBlock = buildFinalOutputBlock({
@@ -11145,7 +11189,13 @@ function buildRunEvalUserPrompt(input, ctx) {
 			"  \"totalTokens\": <int>,",
 			"  \"durationMs\": <int>,",
 			"  \"traceparent\": \"<from claim>\",",
-			"  \"verification\": <required iff input.successCriteria; see Self-verification>",
+			"  \"verification\": {",
+			"    \"inputCid\": \"<task inputCid>\",",
+			"    \"results\": [",
+			"      { \"id\": \"<criterion id>\", \"kind\": \"rubric\", \"status\": \"pass|fail|skip\", \"detail\": \"<optional one-liner>\" }",
+			"    ],",
+			"    \"passed\": <boolean>",
+			"  } // required iff input.successCriteria; must be an object, never a string",
 			"}"
 		].join("\n")
 	});
@@ -11153,6 +11203,8 @@ function buildRunEvalUserPrompt(input, ctx) {
 		"# Run Eval Agent\n",
 		`You are running an evaluation scenario as variant \`${variantLabel}\`.\nTask id: \`${ctx.taskId}\`\n`,
 		correlationSection,
+		executionSection,
+		contextDisciplineSection,
 		`### Scenario\n\n${scenario.prompt}\n`,
 		inputFilesSection,
 		verificationSection,
@@ -11224,6 +11276,16 @@ function buildTaskUserPrompt(task, ctx) {
 				diaryId: ctx.diaryId,
 				taskId: ctx.taskId
 			});
+		case JUDGE_EVAL_ATTEMPT_TYPE:
+			if (!Value.Check(JudgeEvalAttemptInput, task.input)) {
+				const errors = [...Value.Errors(JudgeEvalAttemptInput, task.input)];
+				throw new Error(`judge_eval_attempt input failed validation: ${JSON.stringify(errors.slice(0, 3))}`);
+			}
+			return buildJudgeEvalAttemptUserPrompt(task.input, {
+				diaryId: ctx.diaryId,
+				taskId: ctx.taskId,
+				workspace: ctx.workspace
+			});
 		case PR_REVIEW_TYPE:
 			if (!Value.Check(PrReviewInput, task.input)) {
 				const errors = [...Value.Errors(PrReviewInput, task.input)];
@@ -11234,15 +11296,6 @@ function buildTaskUserPrompt(task, ctx) {
 				taskId: ctx.taskId,
 				workspace: ctx.workspace
 			});
-		case JUDGE_EVAL_VARIANT_TYPE:
-			if (!Value.Check(JudgeEvalVariantInput, task.input)) {
-				const errors = [...Value.Errors(JudgeEvalVariantInput, task.input)];
-				throw new Error(`judge_eval_variant input failed validation: ${JSON.stringify(errors.slice(0, 3))}`);
-			}
-			return buildJudgeEvalVariantUserPrompt(task.input, {
-				diaryId: ctx.diaryId,
-				taskId: ctx.taskId
-			});
 		case RUN_EVAL_TYPE:
 			if (!Value.Check(RunEvalInput, task.input)) {
 				const errors = [...Value.Errors(RunEvalInput, task.input)];
@@ -14760,6 +14813,11 @@ var require_multistream = /* @__PURE__ */ __commonJSMin(((exports, module) => {
 * paths under this mount via `toGuestPath` in `tool-operations.ts`.
 */
 var SKILL_ROOT_IN_VM = GUEST_TASK_SKILLS_MOUNT;
+var INLINE_CONTEXT_ROOT_IN_VM = "/workspace/.moltnet/context"; // directory where each inline context entry is written as its own file
+var WORKSPACE_CONTEXT_PACK = "/workspace/context-pack.md"; // combined pack built from all inline context entries
+var WORKSPACE_AGENTS_MD = "/workspace/AGENTS.md"; // receives the same content as the context pack
+var WORKSPACE_CLAUDE_DIR = "/workspace/.claude"; // created before CLAUDE.md is written into it
+var WORKSPACE_CLAUDE_MD = "/workspace/.claude/CLAUDE.md"; // written with a single "@../context-pack.md" line rather than a copy of the pack
 /** Bounds borrowed from pi's skill validation; conservative caps so a
 *  malformed SKILL.md doesn't bloat the system prompt. */
 var MAX_SKILL_NAME = 64;
@@ -14770,21 +14828,40 @@ var MAX_SKILL_DESCRIPTION = 1024;
 */
 async function injectTaskContext(args) {
 	const skills = [];
+	const inlineContexts = [];
 	const resolved = await resolveTaskContext({
 		context: args.context,
-		deliver: { skill: async ({ slug, content }) => {
-			const dir = `${SKILL_ROOT_IN_VM}/${slug}`;
-			const filePath = `${dir}/SKILL.md`;
-			await args.fs.mkdir(dir, { recursive: true });
-			await args.fs.writeFile(filePath, content, { mode: 420 });
-			skills.push(buildSyntheticSkill({
-				slug,
-				content,
-				filePath,
-				dir
-			}));
-		} }
+		deliver: {
+			skill: async ({ slug, content }) => {
+				const dir = `${SKILL_ROOT_IN_VM}/${slug}`;
+				const filePath = `${dir}/SKILL.md`;
+				await args.fs.mkdir(dir, { recursive: true });
+				await args.fs.writeFile(filePath, content, { mode: 420 });
+				skills.push(buildSyntheticSkill({
+					slug,
+					content,
+					filePath,
+					dir
+				}));
+			},
+			contextFile: async ({ suggestedFileName, content }) => {
+				await args.fs.mkdir(INLINE_CONTEXT_ROOT_IN_VM, { recursive: true });
+				const filePath = `${INLINE_CONTEXT_ROOT_IN_VM}/${suggestedFileName}`;
+				await args.fs.writeFile(filePath, content, { mode: 420 });
+				inlineContexts.push({
+					slug: suggestedFileName.replace(/\.md$/u, ""),
+					content
+				});
+			}
+		}
 	});
+	if (inlineContexts.length > 0) {
+		const packContent = buildWorkspaceContextPack(inlineContexts);
+		await args.fs.writeFile(WORKSPACE_CONTEXT_PACK, packContent, { mode: 420 });
+		await args.fs.writeFile(WORKSPACE_AGENTS_MD, packContent, { mode: 420 });
+		await args.fs.mkdir(WORKSPACE_CLAUDE_DIR, { recursive: true });
+		await args.fs.writeFile(WORKSPACE_CLAUDE_MD, "@../context-pack.md\n", { mode: 420 });
+	}
 	return {
 		injected: resolved.injected,
 		skills,
@@ -14792,6 +14869,17 @@ async function injectTaskContext(args) {
 		userInlineSuffix: resolved.userInlineSuffix
 	};
 }
+function buildWorkspaceContextPack(contexts) { // renders `contexts` ({ slug, content } entries) into one markdown string
+	return [
+		"# Context Pack", // document title
+		"", // NOTE(review): with the "\n\n" separator below this empty element leaves three blank lines after the title — confirm that is intended
+		...contexts.map(({ slug, content }) => [ // one section per entry, in input order
+			`## ${slug}`, // section heading is the entry slug
+			"",
+			content.trimEnd() // trailing whitespace of each entry is dropped
+		].join("\n"))
+	].join("\n\n").trimEnd() + "\n"; // sections are separated by a blank line; the result always ends with exactly one newline
+}
 /**
 * Build a `Skill` object pi will faithfully render in
 * `<available_skills>`. We extract `name` and `description` from the
@@ -15155,7 +15243,7 @@ async function parseStructuredTaskOutput(assistantText, taskType, opts = {}) {
 			}
 		};
 	}
-	const errors = validateTaskOutput(taskType, extracted);
+	const errors = validateTaskOutput(taskType, extracted, opts.input);
 	if (errors.length > 0) {
 		const details = errors.slice(0, 3).map((error) => `${error.field}: ${error.message}`);
 		const [firstError] = errors;
@@ -15269,7 +15357,7 @@ function createSubmitOutputTool(taskType, opts = {}) {
 			description: contract.description,
 			parameters: schema,
 			async execute(_id, params) {
-				const errors = validateTaskOutput(taskType, params);
+				const errors = validateTaskOutput(taskType, params, opts.input);
 				if (errors.length > 0) {
 					const detailMsg = errors.slice(0, 3).map((err) => `${err.field}: ${err.message}`).join("; ");
 					const details = {
@@ -15338,6 +15426,39 @@ function resolveSubmitTools(taskType, opts = {}) {
 //#region src/runtime/task-workspace.ts
 function prepareTaskWorkspace(task, requestedMountPath, executionPlan) {
 	const branch = executionPlan?.worktreeBranch ?? null;
+	const workspaceMode = executionPlan?.workspaceMode ?? "shared_mount";
+	const attachedWorkspace = executionPlan?.workspaceAttachment ?? null;
+	if (attachedWorkspace) return {
+		mountPath: attachedWorkspace.mountPath,
+		cwdPath: attachedWorkspace.cwdPath,
+		mode: workspaceMode,
+		branch,
+		cleanup: () => {}
+	};
+	if (workspaceMode === "scratch_mount") {
+		const scratchDir = resolveTaskScratchPath(findMainWorktree(), executionPlan?.workspaceId ?? `task-${task.id}`);
+		const keepWorkspace = executionPlan?.workspaceScope === "session" && executionPlan.sessionKey !== null;
+		if (keepWorkspace) mkdirSync(scratchDir, { recursive: true });
+		else {
+			rmSync(scratchDir, {
+				recursive: true,
+				force: true
+			});
+			mkdirSync(scratchDir, { recursive: true });
+		}
+		return {
+			mountPath: scratchDir,
+			cwdPath: scratchDir,
+			mode: "scratch_mount",
+			branch: null,
+			cleanup: keepWorkspace ? () => {} : () => {
+				rmSync(scratchDir, {
+					recursive: true,
+					force: true
+				});
+			}
+		};
+	}
 	if (!branch) return {
 		mountPath: requestedMountPath,
 		cwdPath: requestedMountPath,
@@ -15375,6 +15496,9 @@ function prepareTaskWorkspace(task, requestedMountPath, executionPlan) {
 function resolveTaskWorktreePath(mainRepo, workspaceId) { // path of the worktree for `workspaceId` under the main repo
 	return join(mainRepo, ".worktrees", workspaceId); // joins mainRepo, ".worktrees" and workspaceId into one path
 }
+function resolveTaskScratchPath(mainRepo, workspaceId) { // path of the scratch directory for `workspaceId` under the main repo
+	return join(mainRepo, ".moltnet", "d", "task-workspaces", workspaceId); // joins mainRepo, ".moltnet", "d", "task-workspaces" and workspaceId into one path
+}
 function ensureReusableTaskWorktree(mainRepo, worktreeDir, branch) {
 	if (isRegisteredWorktree(mainRepo, worktreeDir)) return;
 	if (existsSync(worktreeDir)) throw new Error(`Expected reusable worktree ${worktreeDir} to be git-managed, but it exists outside git worktree metadata.`);
@@ -15611,12 +15735,14 @@ async function executePiTask(claimedTask, reporter, opts) {
 			return makeFailedOutput("worktree_setup_failed", message);
 		}
 		try {
+			const sandboxConfig = applyExecutionPlanSandboxOverrides(opts.sandboxConfig, executionPlan);
 			managed = await resumeVm({
 				checkpointPath,
 				agentName: opts.agentName,
 				mountPath,
+				workspaceMode: workspace.mode,
 				extraAllowedHosts: opts.extraAllowedHosts,
-				sandboxConfig: opts.sandboxConfig
+				sandboxConfig
 			});
 		} catch (err) {
 			const message = err instanceof Error ? err.message : String(err);
@@ -15645,7 +15771,8 @@ async function executePiTask(claimedTask, reporter, opts) {
 				taskId: task.id,
 				workspace: {
 					mode: activeWorkspace.mode,
-					branch: activeWorkspace.branch
+					branch: activeWorkspace.branch,
+					attached: executionPlan?.workspaceAttachment !== void 0
 				},
 				extras: opts.promptExtras
 			});
@@ -15687,7 +15814,10 @@ async function executePiTask(claimedTask, reporter, opts) {
 			createEditToolDefinition(mountPath, { operations: createGondolinEditOps(managed.vm, mountPath) }),
 			createBashToolDefinition(mountPath, { operations: createGondolinBashOps(managed.vm, mountPath) })
 		];
-		const { handle: submitToolHandle, tools: submitToolDefs } = resolveSubmitTools(task.taskType, { model: opts.model });
+		const { handle: submitToolHandle, tools: submitToolDefs } = resolveSubmitTools(task.taskType, {
+			model: opts.model,
+			input: task.input
+		});
 		const submitTools = submitToolDefs;
 		try {
 			const moltnetAgent = await connect({ configDir: managed.agentDir });
@@ -15906,8 +16036,20 @@ async function executePiTask(claimedTask, reporter, opts) {
 					phase: "output_validation"
 				});
 			}
-			else {
-				const parsed = await parseStructuredTaskOutput(assistantText, task.taskType, { model: opts.model });
+			else if (submitToolHandle) {
+				parseError = {
+					code: "output_missing",
+					message: "Agent did not submit output through the task submit tool. A valid submit tool call is required to complete this task type."
+				};
+				await emit("error", {
+					message: parseError.message,
+					phase: "output_validation"
+				});
+			} else {
+				const parsed = await parseStructuredTaskOutput(assistantText, task.taskType, {
+					model: opts.model,
+					input: task.input
+				});
 				parsedOutput = parsed.output;
 				parsedOutputCid = parsed.outputCid;
 				parseError = parsed.error;
@@ -15993,6 +16135,18 @@ async function executePiTask(claimedTask, reporter, opts) {
 		}
 	}
 }
+function applyExecutionPlanSandboxOverrides(sandboxConfig, executionPlan) { // returns the sandbox config to use, with shadow-write overrides when the plan's workspace attachment asks for them
+	const shadowWrites = executionPlan?.workspaceAttachment?.shadowWrites; // undefined when the plan or its workspaceAttachment is missing
+	if (!shadowWrites) return sandboxConfig; // no override requested: the caller's config is returned as-is (same reference)
+	return {
+		...sandboxConfig, // shallow copy; the input object is not mutated
+		vfs: {
+			...sandboxConfig?.vfs, // keep any existing vfs settings; tolerates a null/undefined sandboxConfig
+			shadow: ["**"], // replaces any existing `shadow` value with the single pattern "**"
+			shadowMode: shadowWrites // the attachment's shadowWrites value is passed through unchanged
+		}
+	};
+}
 function emptyUsage(provider, model) {
 	return {
 		inputTokens: 0,
@@ -16210,6 +16364,7 @@ function moltnetExtension(pi) {
 				checkpointPath,
 				agentName,
 				mountPath,
+				workspaceMode: "shared_mount",
 				sandboxConfig
 			});
 			activateAgentEnv(managed.credentials.agentEnv, mainRepo);