@themoltnet/pi-extension 0.18.1 → 0.19.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -2386,12 +2386,20 @@ var MoltNetError = class extends Error {
2386
2386
  code;
2387
2387
  statusCode;
2388
2388
  detail;
2389
+ /**
2390
+ * Populated when the server returned a `VALIDATION_FAILED` problem
2391
+ * (status 400) with field-level errors. Empty / undefined for every
2392
+ * other problem kind. Imposer scripts surface these to operators so
2393
+ * they don't have to re-run with curl to see what was rejected.
2394
+ */
2395
+ validationErrors;
2389
2396
  constructor(message, options) {
2390
2397
  super(message);
2391
2398
  this.name = "MoltNetError";
2392
2399
  this.code = options.code;
2393
2400
  this.statusCode = options.statusCode;
2394
2401
  this.detail = options.detail;
2402
+ this.validationErrors = options.validationErrors;
2395
2403
  }
2396
2404
  };
2397
2405
  var NetworkError = class extends MoltNetError {
@@ -2415,10 +2423,14 @@ var AuthenticationError = class extends MoltNetError {
2415
2423
  };
2416
2424
  function problemToError(problem, statusCode) {
2417
2425
  const title = problem.title ?? "Request failed";
2418
- return new MoltNetError(problem.detail ? `${title}: ${problem.detail}` : title, {
2426
+ const message = problem.detail ? `${title}: ${problem.detail}` : title;
2427
+ const rawErrors = problem.errors;
2428
+ const validationErrors = Array.isArray(rawErrors) ? rawErrors.filter((e) => typeof e === "object" && e !== null && typeof e.field === "string" && typeof e.message === "string") : void 0;
2429
+ return new MoltNetError(message, {
2419
2430
  code: problem.type ?? problem.code ?? "UNKNOWN",
2420
2431
  statusCode,
2421
- detail: problem.detail
2432
+ detail: problem.detail,
2433
+ validationErrors
2422
2434
  });
2423
2435
  }
2424
2436
  //#endregion
@@ -7767,6 +7779,41 @@ function createMoltNetTools(config) {
7767
7779
  };
7768
7780
  }
7769
7781
  });
7782
+ const listTaskMessages = defineTool({
7783
+ name: "moltnet_list_task_messages",
7784
+ label: "List MoltNet Task Attempt Messages",
7785
+ description: "List messages for a specific task attempt. Use this when you need the turn-by-turn execution record behind an accepted attempt — tool calls, text deltas, and error/info events that do not appear in the attempt output alone.",
7786
+ parameters: Type.Object({
7787
+ taskId: Type.String({ description: "Task ID (UUID)." }),
7788
+ attemptN: Type.Integer({
7789
+ minimum: 1,
7790
+ description: "Attempt number to inspect."
7791
+ }),
7792
+ afterSeq: Type.Optional(Type.Integer({
7793
+ minimum: 0,
7794
+ description: "Optional cursor: only return messages with seq > afterSeq."
7795
+ })),
7796
+ limit: Type.Optional(Type.Integer({
7797
+ minimum: 1,
7798
+ maximum: 500,
7799
+ description: "Optional maximum messages to return. Defaults to the API value."
7800
+ }))
7801
+ }),
7802
+ async execute(_id, params) {
7803
+ const { agent } = ensureConnected(config);
7804
+ const messages = await agent.tasks.listMessages(params.taskId, params.attemptN, {
7805
+ afterSeq: params.afterSeq,
7806
+ limit: params.limit
7807
+ });
7808
+ return {
7809
+ content: [{
7810
+ type: "text",
7811
+ text: JSON.stringify(messages, null, 2)
7812
+ }],
7813
+ details: {}
7814
+ };
7815
+ }
7816
+ });
7770
7817
  const reviewSessionErrors = defineTool({
7771
7818
  name: "moltnet_review_session_errors",
7772
7819
  label: "Review Session Tool Errors",
@@ -7815,6 +7862,7 @@ function createMoltNetTools(config) {
7815
7862
  createEntry,
7816
7863
  getTask,
7817
7864
  listTaskAttempts,
7865
+ listTaskMessages,
7818
7866
  reviewSessionErrors,
7819
7867
  defineTool({
7820
7868
  name: "moltnet_host_exec",
@@ -8113,6 +8161,12 @@ var GUEST_WORKSPACE$2 = "/workspace";
8113
8161
  * investigation and the alternatives we rejected.
8114
8162
  */
8115
8163
  var GUEST_TASK_SKILLS_MOUNT = "/moltnet-task-skills";
8164
+ function shouldRunResumeCommand(entry, ctx) {
8165
+ if (typeof entry === "string") return true;
8166
+ const workspaceModes = entry.when?.workspaceMode;
8167
+ if (workspaceModes && !workspaceModes.includes(ctx.workspaceMode)) return false;
8168
+ return true;
8169
+ }
8116
8170
  /**
8117
8171
  * Resolve the main worktree root (where .moltnet/ lives — it's untracked,
8118
8172
  * only exists in the main worktree, not in git worktrees).
@@ -8258,6 +8312,7 @@ async function resumeVm(config) {
8258
8312
  ...envOverrides
8259
8313
  };
8260
8314
  const resources = config.sandboxConfig?.resources;
8315
+ const workspaceMode = config.workspaceMode ?? "shared_mount";
8261
8316
  const vm = await VmCheckpoint.load(config.checkpointPath).resume({
8262
8317
  httpHooks,
8263
8318
  env: vmEnv,
@@ -8276,7 +8331,32 @@ async function resumeVm(config) {
8276
8331
  '`);
8277
8332
  await vmRun(vm, "DNS resolvers", `printf 'nameserver 8.8.8.8\\nnameserver 1.1.1.1\\n' > /etc/resolv.conf`);
8278
8333
  await vmRun(vm, "git safe.directory", `git config --system --add safe.directory '*'`);
8279
- for (const [i, cmd] of (config.sandboxConfig?.resumeCommands ?? []).entries()) await vmRun(vm, `resumeCommands[${i}]`, cmd);
8334
+ for (const [i, entry] of (config.sandboxConfig?.resumeCommands ?? []).entries()) {
8335
+ if (!shouldRunResumeCommand(entry, { workspaceMode })) continue;
8336
+ const { run, retries, backoffMs } = typeof entry === "string" ? {
8337
+ run: entry,
8338
+ retries: 0,
8339
+ backoffMs: 2e3
8340
+ } : {
8341
+ run: entry.run,
8342
+ retries: entry.retries ?? 0,
8343
+ backoffMs: entry.retryBackoffMs ?? 2e3
8344
+ };
8345
+ const label = `resumeCommands[${i}]`;
8346
+ let lastErr;
8347
+ for (let attempt = 0; attempt <= retries; attempt++) try {
8348
+ await vmRun(vm, label, run);
8349
+ lastErr = void 0;
8350
+ break;
8351
+ } catch (err) {
8352
+ lastErr = err;
8353
+ if (attempt === retries) break;
8354
+ await new Promise((resolve) => {
8355
+ setTimeout(resolve, (attempt + 1) * backoffMs);
8356
+ });
8357
+ }
8358
+ if (lastErr) throw lastErr instanceof Error ? lastErr : new Error(String(lastErr));
8359
+ }
8280
8360
  const vmSshDir = `${vmAgentDir}/ssh`;
8281
8361
  await vm.exec(`mkdir -p ${vmAgentDir}/ssh /home/agent/.pi/agent`);
8282
8362
  if (creds.piAuthJson !== null) await vm.fs.writeFile("/home/agent/.pi/agent/auth.json", creds.piAuthJson, { mode: 384 });
@@ -8655,7 +8735,8 @@ async function buildAgentSession(args) {
8655
8735
  await resourceLoader.reload();
8656
8736
  const sessionManager = args.sessionPersistence ? await resolvePersistentSessionManager({
8657
8737
  cwd: args.cwdPath,
8658
- sessionDir: args.sessionPersistence.sessionDir
8738
+ sessionDir: args.sessionPersistence.sessionDir,
8739
+ forkFromSessionPath: args.sessionPersistence.forkFromSessionPath
8659
8740
  }) : SessionManager.inMemory(args.cwdPath);
8660
8741
  return (await createAgentSession({
8661
8742
  agentDir: args.piAuthDir,
@@ -8667,6 +8748,7 @@ async function buildAgentSession(args) {
8667
8748
  })).session;
8668
8749
  }
8669
8750
  async function resolvePersistentSessionManager(args) {
8751
+ if (args.forkFromSessionPath) return SessionManager.forkFrom(args.forkFromSessionPath, args.cwd, args.sessionDir);
8670
8752
  await SessionManager.list(args.cwd, args.sessionDir);
8671
8753
  return SessionManager.continueRecent(args.cwd, args.sessionDir);
8672
8754
  }
@@ -8683,6 +8765,11 @@ var PROMPT_SEPARATOR = "\n\n---\n\n";
8683
8765
  * - `skill` → `deliver.skill({ slug, content })` once per ref.
8684
8766
  * Slug collisions on distinct contents are
8685
8767
  * refused loudly.
8768
+ * - `context_inline`→ persist raw bytes via `deliver.contextFile(...)`
8769
+ * and inject them into the prompt in an explicit,
8770
+ * named block. Intended for eval/context experiments
8771
+ * where the content must be in the model context
8772
+ * window, not merely discoverable as a skill.
8686
8773
  * - `prompt_prefix` → content appended to `systemPromptPrefix` with
8687
8774
  * the canonical `\n\n---\n\n` separator (in
8688
8775
  * declared order).
@@ -8715,6 +8802,13 @@ async function resolveTaskContext(args) {
8715
8802
  slug: ref.slug,
8716
8803
  content: ref.content
8717
8804
  });
8805
+ } else if (ref.binding === "context_inline") {
8806
+ await args.deliver.contextFile({
8807
+ slug: ref.slug,
8808
+ content: ref.content,
8809
+ suggestedFileName: `${ref.slug}.md`
8810
+ });
8811
+ promptParts.push(formatInlineContextBlock(ref.slug, ref.content));
8718
8812
  } else if (ref.binding === "prompt_prefix") promptParts.push(ref.content);
8719
8813
  else userParts.push(ref.content);
8720
8814
  injected.push(ref);
@@ -8725,6 +8819,23 @@ async function resolveTaskContext(args) {
8725
8819
  userInlineSuffix: userParts.join(PROMPT_SEPARATOR)
8726
8820
  };
8727
8821
  }
8822
+ function formatInlineContextBlock(slug, content) {
8823
+ return [
8824
+ "### Injected Task Context",
8825
+ "",
8826
+ `Context id: \`${slug}\``,
8827
+ "The following raw context was supplied by the task creator. Treat it",
8828
+ "as task-relevant background that may override generic coding instincts",
8829
+ "when it contains repo- or workflow-specific constraints.",
8830
+ "The same content is also materialized in the workspace as",
8831
+ "`/workspace/context-pack.md` and mirrored in `AGENTS.md` for",
8832
+ "repo-context discovery.",
8833
+ "",
8834
+ "<context>",
8835
+ content,
8836
+ "</context>"
8837
+ ].join("\n");
8838
+ }
8728
8839
  //#endregion
8729
8840
  //#region ../tasks/src/formats.ts
8730
8841
  /**
@@ -8748,6 +8859,7 @@ if (!FormatRegistry.Has("date-time")) FormatRegistry.Set("date-time", (v) => !Nu
8748
8859
  */
8749
8860
  var ContextBinding = Type$1.Union([
8750
8861
  Type$1.Literal("skill"),
8862
+ Type$1.Literal("context_inline"),
8751
8863
  Type$1.Literal("prompt_prefix"),
8752
8864
  Type$1.Literal("user_inline")
8753
8865
  ], { $id: "ContextBinding" });
@@ -8764,9 +8876,14 @@ var ContextBinding = Type$1.Union([
8764
8876
  * name under the runtime's skill discovery path. Must be
8765
8877
  * kebab-case-safe (alphanumeric + dashes/underscores).
8766
8878
  * - `binding` — how the bytes are delivered to the LLM (see above).
8767
- * - `content` — the actual bytes (UTF-8 text). Capped at 32 KiB per
8879
+ * - `content` — the actual bytes (UTF-8 text). Capped at 64 KiB per
8768
8880
  * entry; total per-task context bytes are bounded by the
8769
8881
  * soft `maxItems` cap and per-binding daemon limits.
8882
+ * Raised from 32 KiB in 2026-05 — protocol-heavy operator
8883
+ * skills (e.g. `.claude/skills/legreffier/SKILL.md`) ship
8884
+ * at ~35 KiB inline, and the original cap was sized for
8885
+ * short example skills, not the kind of skill the eval
8886
+ * substrate is dogfooded on (#943, #823).
8770
8887
  */
8771
8888
  var ContextRef = Type$1.Object({
8772
8889
  slug: Type$1.String({
@@ -8777,7 +8894,7 @@ var ContextRef = Type$1.Object({
8777
8894
  binding: ContextBinding,
8778
8895
  content: Type$1.String({
8779
8896
  minLength: 1,
8780
- maxLength: 32768
8897
+ maxLength: 65536
8781
8898
  })
8782
8899
  }, {
8783
8900
  $id: "ContextRef",
@@ -9341,61 +9458,33 @@ async function validateJudgePackInputAsync(input, ctx) {
9341
9458
  return errors;
9342
9459
  }
9343
9460
  //#endregion
9344
- //#region ../tasks/src/task-types/judge-eval-variant.ts
9461
+ //#region ../tasks/src/task-types/judge-eval-attempt.ts
9345
9462
  /**
9346
- * `judge_eval_variant` — score N variants of a `run_eval` scenario
9347
- * against a single rubric, in one pass, with per-variant subagent
9348
- * isolation.
9463
+ * `judge_eval_attempt` — score one completed `run_eval` attempt against a
9464
+ * hidden judge rubric.
9349
9465
  *
9350
9466
  * output_kind: judgment
9351
- * criteria: required (`successCriteria.rubric` — same envelope shape as
9352
- * `judge_pack` / `assess_brief`)
9353
- * references: not required at the input layer — `runTaskIds` already
9354
- * pin the targets being graded.
9355
- *
9356
- * Slice 2 of #943. The parent task carries the rubric and the list of
9357
- * variant `run_eval` task ids. The pi executor registers the generic
9358
- * `subagent` custom tool (#1087), and the parent LLM calls
9359
- * `subagent({ task, output_schema: 'judge_eval_variant_result' })` once
9360
- * per variant — each child session has fresh context, fetches the
9361
- * variant's accepted attempt output via `moltnet_get_task` /
9362
- * `moltnet_list_task_attempts`, and grades against the rubric.
9467
+ * criteria: required (`successCriteria.rubric`)
9468
+ * references: not required at the input layer — `targetTaskId` +
9469
+ * `targetAttemptN` pin the producer attempt being judged.
9363
9470
  *
9364
- * Reuses `JudgePackScore` from `judge_pack` for per-criterion scoring
9365
- * (Lane 1 binary via `llm_checklist`, Lane 2 graded via `llm_score`,
9366
- * deterministic_*) the score shape is the same across judgment
9367
- * tasks; only the wrapping (per-variant grouping + deltas) differs.
9368
- *
9369
- * Cross-task input invariants — "all targets share the same
9370
- * correlation_id, all are `run_eval`, all are completed with an
9371
- * accepted attempt, all share byte-identical `input.successCriteria`"
9372
- * REQUIRE async DB lookups and live in `validateInputAsync` below,
9373
- * which the task service runs at create time (#1096 wiring). The
9374
- * TypeBox layer here only enforces shape: UUID format,
9375
- * minItems/maxItems, rubric presence + weight invariant.
9376
- */
9377
- var JUDGE_EVAL_VARIANT_TYPE = "judge_eval_variant";
9378
- var JudgeEvalVariantInput = Type$1.Object({
9379
- runTaskIds: Type$1.Array(Type$1.String({ format: "uuid" }), {
9380
- minItems: 2,
9381
- maxItems: 10
9382
- }),
9471
+ * This replaces the earlier parent/subagent `judge_eval_variant` design.
9472
+ * The unit of judgment is one producer attempt. Cross-variant deltas can be
9473
+ * computed later at read time from stored scores, rather than materialized as
9474
+ * their own task output.
9475
+ */
9476
+ var JUDGE_EVAL_ATTEMPT_TYPE = "judge_eval_attempt";
9477
+ var JudgeEvalAttemptInput = Type$1.Object({
9478
+ targetTaskId: Type$1.String({ format: "uuid" }),
9479
+ targetAttemptN: Type$1.Integer({ minimum: 1 }),
9383
9480
  successCriteria: SuccessCriteria
9384
9481
  }, {
9385
- $id: "JudgeEvalVariantInput",
9482
+ $id: "JudgeEvalAttemptInput",
9386
9483
  additionalProperties: false
9387
9484
  });
9388
- /**
9389
- * Per-variant grading. `scores[]` shape is identical to `JudgePackScore`
9390
- * (mode-aware: binary via `llm_checklist`, graded via `llm_score`,
9391
- * deterministic_*). Reuse the type rather than re-declare.
9392
- *
9393
- * This is also the **subagent output contract** — the parent's
9394
- * `subagent` tool resolves the contract name `judge_eval_variant_result`
9395
- * to this schema. See `agent-runtime`'s subagent contract registry.
9396
- */
9397
- var JudgeEvalVariantResult = Type$1.Object({
9398
- runTaskId: Type$1.String({ format: "uuid" }),
9485
+ var JudgeEvalAttemptOutput = Type$1.Object({
9486
+ targetTaskId: Type$1.String({ format: "uuid" }),
9487
+ targetAttemptN: Type$1.Integer({ minimum: 1 }),
9399
9488
  variantLabel: Type$1.String({
9400
9489
  minLength: 1,
9401
9490
  maxLength: 64,
@@ -9406,216 +9495,126 @@ var JudgeEvalVariantResult = Type$1.Object({
9406
9495
  minimum: 0,
9407
9496
  maximum: 1
9408
9497
  }),
9409
- verdict: Type$1.String({ minLength: 1 })
9410
- }, {
9411
- $id: "JudgeEvalVariantResult",
9412
- additionalProperties: false
9413
- });
9414
- var JudgeEvalVariantOutput = Type$1.Object({
9415
- results: Type$1.Array(JudgeEvalVariantResult, { minItems: 2 }),
9416
- deltas: Type$1.Optional(Type$1.Record(Type$1.String(), Type$1.Number({
9417
- minimum: -1,
9418
- maximum: 1
9419
- }))),
9498
+ verdict: Type$1.String({ minLength: 1 }),
9420
9499
  judgeModel: Type$1.Optional(Type$1.String({ minLength: 1 })),
9421
9500
  traceparent: Type$1.String({ minLength: 1 })
9422
9501
  }, {
9423
- $id: "JudgeEvalVariantOutput",
9502
+ $id: "JudgeEvalAttemptOutput",
9424
9503
  additionalProperties: false
9425
9504
  });
9426
- /**
9427
- * Synchronous input invariants beyond TypeBox shape: rubric must be
9428
- * present (already required by the schema, but the rubric body has
9429
- * its own per-criterion weight invariant) and the rubric's weights
9430
- * must sum to 1.
9431
- *
9432
- * Cross-task invariants (all targets are `run_eval`, all completed,
9433
- * share `correlation_id`, byte-identical `input.successCriteria`)
9434
- * are NOT checked here — they require async DB lookups against
9435
- * `runTaskIds` and live in `validateJudgeEvalVariantInputAsync`
9436
- * below, invoked by the task service at create time (#1096).
9437
- */
9438
- function validateJudgeEvalVariantInput(input) {
9505
+ function validateJudgeEvalAttemptInput(input) {
9439
9506
  const sc = input.successCriteria;
9440
- if (!sc) return "successCriteria is required for judge_eval_variant";
9441
- if (!sc.rubric) return "successCriteria.rubric is required for judge_eval_variant";
9507
+ if (!sc) return "successCriteria is required for judge_eval_attempt";
9508
+ if (!sc.rubric) return "successCriteria.rubric is required for judge_eval_attempt";
9442
9509
  return validateRubricWeights(sc.rubric);
9443
9510
  }
9444
- /**
9445
- * Output cross-field invariants the schema cannot express:
9446
- *
9447
- * 1. `results.length === input.runTaskIds.length` — every variant
9448
- * the imposer asked for must be graded. Partial grading
9449
- * invalidates cross-variant comparison; fail the whole task
9450
- * rather than silently report a subset.
9451
- *
9452
- * 2. `results[i].runTaskId === input.runTaskIds[i]` — order is
9453
- * load-bearing for downstream consumers (e.g. deltas keyed by
9454
- * adjacent pairs). Mismatch is an LLM bug; reject loudly.
9455
- *
9456
- * 3. Each `result.scores` follows the same `llm_checklist` rule
9457
- * `judge_pack` enforces (#999): if a score has an `assertions`
9458
- * array, the numeric score MUST be `1` iff every assertion
9459
- * passes. Inconsistent payloads pollute attestations.
9460
- *
9461
- * 4. Each `result.composite` MUST equal the rubric-weighted sum
9462
- * `Σ(weight_j × scores[j].score)`. The parent (and any subagent
9463
- * it delegated to) is supposed to compute this; surfacing a
9464
- * drift here catches LLMs that hand-wave the arithmetic.
9465
- *
9466
- * 5. Optional `deltas` keys MUST be of the form `"A - B"` where
9467
- * both `A` and `B` are variantLabels present in `results`.
9468
- * Values are not range-checked (any float in [-1, 1] is
9469
- * arithmetically possible).
9470
- */
9471
- function validateJudgeEvalVariantOutput(output, input) {
9511
+ function validateJudgeEvalAttemptOutput(output, input) {
9472
9512
  const out = output;
9473
9513
  const inp = input;
9474
9514
  if (inp) {
9475
- if (out.results.length !== inp.runTaskIds.length) return `results.length (${out.results.length}) does not match input.runTaskIds.length (${inp.runTaskIds.length}). Every variant must be graded; partial grading is rejected.`;
9476
- for (let i = 0; i < out.results.length; i++) if (out.results[i].runTaskId !== inp.runTaskIds[i]) return `results[${i}].runTaskId (${out.results[i].runTaskId}) does not match input.runTaskIds[${i}] (${inp.runTaskIds[i]}). Order must align with input for downstream delta computation.`;
9515
+ if (out.targetTaskId !== inp.targetTaskId) return `output.targetTaskId (${out.targetTaskId}) does not match input.targetTaskId (${inp.targetTaskId})`;
9516
+ if (out.targetAttemptN !== inp.targetAttemptN) return `output.targetAttemptN (${out.targetAttemptN}) does not match input.targetAttemptN (${inp.targetAttemptN})`;
9477
9517
  }
9478
- for (let r = 0; r < out.results.length; r++) {
9479
- const result = out.results[r];
9480
- for (let s = 0; s < result.scores.length; s++) {
9481
- const sc = result.scores[s];
9482
- if (!sc.assertions) continue;
9483
- const allPassed = sc.assertions.every((a) => a.passed);
9484
- const expected = allPassed ? 1 : 0;
9485
- if (sc.score !== expected) return `results[${r}].scores[${s}] (criterionId="${sc.criterionId}"): assertions ${allPassed ? "all pass" : "have at least one fail"} but score=${sc.score}. Score must be derived: 1 iff every assertion passes, else 0 (#999 llm_checklist rule).`;
9486
- }
9518
+ for (let s = 0; s < out.scores.length; s++) {
9519
+ const sc = out.scores[s];
9520
+ if (!sc.assertions) continue;
9521
+ const allPassed = sc.assertions.every((a) => a.passed);
9522
+ const expected = allPassed ? 1 : 0;
9523
+ if (sc.score !== expected) return `scores[${s}] (criterionId="${sc.criterionId}"): assertions ${allPassed ? "all pass" : "have at least one fail"} but score=${sc.score}. Score must be 1 iff every assertion passes, else 0.`;
9487
9524
  }
9488
9525
  if (inp?.successCriteria?.rubric) {
9489
9526
  const criteria = inp.successCriteria.rubric.criteria;
9490
9527
  const weightById = new Map(criteria.map((c) => [c.id, c.weight]));
9491
- for (let r = 0; r < out.results.length; r++) {
9492
- const result = out.results[r];
9493
- let sum = 0;
9494
- for (const sc of result.scores) {
9495
- const w = weightById.get(sc.criterionId);
9496
- if (w === void 0) return `results[${r}].scores: criterionId "${sc.criterionId}" is not in the input rubric (known: ${Array.from(weightById.keys()).join(", ")}). Score every rubric criterion exactly once; do not invent new ids.`;
9497
- sum += w * sc.score;
9498
- }
9499
- if (Math.abs(sum - result.composite) > .001) return `results[${r}].composite (${result.composite}) does not match Σ(weight × score) (${sum.toFixed(6)}). Composite must be the rubric-weighted sum of per-criterion scores (drift > 0.001).`;
9500
- }
9501
- }
9502
- if (out.deltas) {
9503
- const labels = new Set(out.results.map((r) => r.variantLabel));
9504
- for (const key of Object.keys(out.deltas)) {
9505
- const m = /^(.+?) - (.+)$/.exec(key);
9506
- if (!m) return `deltas key "${key}" is not of the form "<variantLabel-A> - <variantLabel-B>". Use a single space-hyphen-space separator between labels.`;
9507
- const [, a, b] = m;
9508
- if (!labels.has(a) || !labels.has(b)) return `deltas key "${key}" references variantLabel(s) not present in results: ${!labels.has(a) ? `"${a}" missing` : ""}${!labels.has(a) && !labels.has(b) ? ", " : ""}${!labels.has(b) ? `"${b}" missing` : ""}`;
9528
+ let sum = 0;
9529
+ for (const sc of out.scores) {
9530
+ const w = weightById.get(sc.criterionId);
9531
+ if (w === void 0) return `scores references unknown criterionId "${sc.criterionId}"`;
9532
+ sum += w * sc.score;
9509
9533
  }
9534
+ const rounded = Math.round(sum * 1e3) / 1e3;
9535
+ if (Math.abs(rounded - out.composite) > .001) return `composite (${out.composite}) does not match weighted rubric sum (${rounded})`;
9510
9536
  }
9511
9537
  return null;
9512
9538
  }
9513
- /**
9514
- * Local stable-stringify for cross-variant `successCriteria` byte-
9515
- * equality. Recursively sorts object keys; arrays preserve order
9516
- * (intentional — rubric criteria order is semantically meaningful).
9517
- * Mirrors the canonical-JSON shape `crypto-service` uses for CIDs,
9518
- * without taking on a crypto-service dep just for this comparison.
9519
- */
9520
- function stableStringify(value) {
9521
- if (value === null || typeof value !== "object") return JSON.stringify(value);
9522
- if (Array.isArray(value)) return "[" + value.map(stableStringify).join(",") + "]";
9523
- const obj = value;
9524
- return "{" + Object.keys(obj).sort().map((k) => JSON.stringify(k) + ":" + stableStringify(obj[k])).join(",") + "}";
9525
- }
9526
- /**
9527
- * Async preflight for `judge_eval_variant` (#1096 + #943):
9528
- *
9529
- * 1. Every `runTaskIds[i]` resolves to a task the caller can read.
9530
- * 2. Every resolved task is `taskType === 'run_eval'`.
9531
- * 3. Every resolved task is `status === 'completed'` with a
9532
- * non-null `acceptedAttemptN` — grading an unaccepted attempt
9533
- * races with re-attempts and pollutes the judge attestation.
9534
- * 4. Every resolved task shares a non-null `correlationId`, and all
9535
- * `correlationId`s are equal. Without this an imposer could
9536
- * fabricate a "variant set" by stapling unrelated runs together.
9537
- * 5. The shared `correlationId` is NOT already sealed. A previous
9538
- * judge_eval_variant against the same group is final; produce a
9539
- * fresh correlation_id for a new judging round rather than
9540
- * adding contradictory verdicts to a sealed group.
9541
- * 6. Every variant's `input.successCriteria` is byte-identical (via
9542
- * stable-stringify). Different rubrics across "variants" makes
9543
- * the comparison meaningless.
9544
- */
9545
- async function validateJudgeEvalVariantInputAsync(input, ctx) {
9546
- const { runTaskIds } = input;
9539
+ async function validateJudgeEvalAttemptInputAsync(input, ctx) {
9540
+ const inp = input;
9547
9541
  const errors = [];
9548
- const resolved = await Promise.all(runTaskIds.map((id) => ctx.resolveTask(id)));
9549
- let missingTargets = false;
9550
- const presentTargets = [];
9551
- for (let i = 0; i < runTaskIds.length; i++) {
9552
- const t = resolved[i];
9553
- if (!t) {
9554
- missingTargets = true;
9555
- errors.push({
9556
- field: `runTaskIds[${i}]`,
9557
- message: `runTaskIds[${i}]=${runTaskIds[i]} does not resolve to a task you can read`
9558
- });
9559
- continue;
9560
- }
9561
- presentTargets.push(t);
9562
- if (t.taskType !== "run_eval") errors.push({
9563
- field: `runTaskIds[${i}]`,
9564
- message: `runTaskIds[${i}]=${runTaskIds[i]} is a ${t.taskType}, not a run_eval`
9565
- });
9566
- if (t.status !== "completed" || t.acceptedAttemptN === null) errors.push({
9567
- field: `runTaskIds[${i}]`,
9568
- message: `runTaskIds[${i}]=${runTaskIds[i]} is not completed with an accepted attempt (status=${t.status}, acceptedAttemptN=${t.acceptedAttemptN})`
9569
- });
9570
- }
9571
- if (missingTargets || presentTargets.length === 0) return errors;
9572
- const correlationIds = new Set(presentTargets.map((t) => t.correlationId ?? "__null__"));
9573
- if (correlationIds.has("__null__")) errors.push({
9574
- field: "runTaskIds",
9575
- message: "one or more run_eval targets have no correlation_id; cannot group as variants"
9542
+ const target = await ctx.resolveTask(inp.targetTaskId);
9543
+ if (!target) return [{
9544
+ field: "targetTaskId",
9545
+ message: `targetTaskId=${inp.targetTaskId} does not resolve to a task you can read`
9546
+ }];
9547
+ if (target.taskType !== "run_eval") errors.push({
9548
+ field: "targetTaskId",
9549
+ message: `targetTaskId=${inp.targetTaskId} is a ${target.taskType}, not a run_eval`
9550
+ });
9551
+ if (target.status !== "completed" || target.acceptedAttemptN === null) errors.push({
9552
+ field: "targetTaskId",
9553
+ message: `targetTaskId=${inp.targetTaskId} is not completed with an accepted attempt (status=${target.status}, acceptedAttemptN=${target.acceptedAttemptN})`
9576
9554
  });
9577
- if (correlationIds.size > 1) errors.push({
9578
- field: "runTaskIds",
9579
- message: `run_eval targets span multiple correlation_ids (${Array.from(correlationIds).join(", ")}); variants must share one`
9555
+ else if (target.acceptedAttemptN !== inp.targetAttemptN) errors.push({
9556
+ field: "targetAttemptN",
9557
+ message: `targetAttemptN=${inp.targetAttemptN} does not match the producer's acceptedAttemptN=${target.acceptedAttemptN}`
9580
9558
  });
9581
- if (errors.length > 0) return errors;
9582
- const correlationId = presentTargets[0].correlationId;
9583
- if (!correlationId) return errors;
9584
- const seal = await ctx.findCorrelationSeal(correlationId);
9585
- if (seal) errors.push({
9586
- field: "runTaskIds",
9587
- message: `correlation_id ${correlationId} is already sealed by ${seal.sealedByTaskType}/${seal.sealedByTaskId} at ${seal.sealedAt}; use a fresh correlation_id for a new judging round`
9559
+ if (!target.correlationId) errors.push({
9560
+ field: "targetTaskId",
9561
+ message: "target run_eval has no correlation_id; cannot enforce duplicate-judge protection"
9562
+ });
9563
+ if (errors.length > 0 || !target.correlationId) return errors;
9564
+ const rubric = inp.successCriteria.rubric;
9565
+ const duplicate = (await ctx.listTasksByCorrelation(target.correlationId)).find((task) => {
9566
+ if (task.taskType !== "judge_eval_attempt") return false;
9567
+ if (task.status === "failed" || task.status === "cancelled" || task.status === "expired") return false;
9568
+ const existing = task.input;
9569
+ const existingRubric = existing.successCriteria?.rubric;
9570
+ return existing.targetTaskId === inp.targetTaskId && existing.targetAttemptN === inp.targetAttemptN && existingRubric?.rubricId === rubric?.rubricId && existingRubric?.version === rubric?.version;
9571
+ });
9572
+ if (duplicate) errors.push({
9573
+ field: "targetTaskId",
9574
+ message: `judge task ${duplicate.id} already exists for (${inp.targetTaskId}, attempt ${inp.targetAttemptN}, rubric ${rubric?.rubricId}@${rubric?.version})`
9588
9575
  });
9589
- const first = stableStringify(presentTargets[0].input.successCriteria);
9590
- for (let i = 1; i < presentTargets.length; i++) if (stableStringify(presentTargets[i].input.successCriteria) !== first) {
9591
- errors.push({
9592
- field: `runTaskIds[${i}]`,
9593
- message: `runTaskIds[${i}] has a different input.successCriteria than runTaskIds[0]; all variants must share the rubric and gates`
9594
- });
9595
- break;
9596
- }
9597
9576
  return errors;
9598
9577
  }
9599
- /**
9600
- * Side effect emitted on successful `judge_eval_variant` create:
9601
- * seal the shared correlation_id atomically with the insert. The
9602
- * task service applies the seal in the same transaction; a
9603
- * concurrent second `judge_eval_variant` against the same group
9604
- * loses the race and is rejected with a clean conflict error.
9605
- *
9606
- * The seal applies to the SHARED correlation_id of the targets —
9607
- * NOT to the judge task's own correlationId (which is typically
9608
- * null or distinct). The task service derives the correlationId
9609
- * for the effect from the resolved targets, not from the judge
9610
- * task row.
9611
- */
9612
- async function onCreateJudgeEvalVariant(input, ctx) {
9613
- const { runTaskIds } = input;
9614
- const first = await ctx.resolveTask(runTaskIds[0]);
9615
- if (!first?.correlationId) return [];
9578
+ async function onCreateJudgeEvalAttempt(input, _ctx) {
9579
+ const judge = input;
9580
+ const rubric = judge.successCriteria.rubric;
9581
+ if (!rubric) return [];
9616
9582
  return [{
9617
- kind: "sealCorrelation",
9618
- correlationId: first.correlationId
9583
+ kind: "guardTaskUniqueness",
9584
+ taskType: JUDGE_EVAL_ATTEMPT_TYPE,
9585
+ lockKey: [
9586
+ JUDGE_EVAL_ATTEMPT_TYPE,
9587
+ judge.targetTaskId,
9588
+ String(judge.targetAttemptN),
9589
+ rubric.rubricId,
9590
+ rubric.version
9591
+ ].join(":"),
9592
+ inputMatches: [
9593
+ {
9594
+ path: ["targetTaskId"],
9595
+ value: judge.targetTaskId
9596
+ },
9597
+ {
9598
+ path: ["targetAttemptN"],
9599
+ value: judge.targetAttemptN
9600
+ },
9601
+ {
9602
+ path: [
9603
+ "successCriteria",
9604
+ "rubric",
9605
+ "rubricId"
9606
+ ],
9607
+ value: rubric.rubricId
9608
+ },
9609
+ {
9610
+ path: [
9611
+ "successCriteria",
9612
+ "rubric",
9613
+ "version"
9614
+ ],
9615
+ value: rubric.version
9616
+ }
9617
+ ]
9619
9618
  }];
9620
9619
  }
9621
9620
  //#endregion
@@ -9739,14 +9738,43 @@ async function validateRenderPackInputAsync(input, ctx) {
9739
9738
  //#region ../tasks/src/task-types/run-eval.ts
9740
9739
  /**
9741
9740
  * `run_eval` — execute a scenario prompt under a named variant for
9742
- * later cross-variant grading by `judge_eval_variant` (Slice 2).
9741
+ * later per-attempt grading by `judge_eval_attempt` tasks.
9743
9742
  *
9744
9743
  * output_kind: artifact
9745
- * criteria: optional (when set, output.verification is required —
9746
- * producer self-assessment; the judge is the binding evaluator)
9744
+ * criteria: optional producer-only checks (when set,
9745
+ * output.verification is required — the judge rubric remains hidden
9746
+ * on downstream `judge_eval_attempt` tasks)
9747
9747
  * references: not required (scenario lives entirely in input)
9748
9748
  */
9749
9749
  var RUN_EVAL_TYPE = "run_eval";
9750
+ var RunEvalMode = Type$1.Union([Type$1.Literal("vitro"), Type$1.Literal("vivo")], { $id: "RunEvalMode" });
9751
+ var RunEvalWorkspace = Type$1.Union([
9752
+ Type$1.Literal("none"),
9753
+ Type$1.Literal("shared_mount"),
9754
+ Type$1.Literal("dedicated_worktree")
9755
+ ], { $id: "RunEvalWorkspace" });
9756
+ var RunEvalExecution = Type$1.Object({
9757
+ mode: RunEvalMode,
9758
+ workspace: RunEvalWorkspace
9759
+ }, {
9760
+ $id: "RunEvalExecution",
9761
+ additionalProperties: false
9762
+ });
9763
+ /**
9764
+ * Producer-visible checks for `run_eval`. Deliberately forbids `rubric`
9765
+ * so the variant runner cannot see the downstream judge's answer key.
9766
+ * Keep the rest of the SuccessCriteria envelope available for generic
9767
+ * process / structure checks (`gates`, `assertions`, `sideEffects`).
9768
+ */
9769
+ var RunEvalSuccessCriteria = Type$1.Object({
9770
+ version: Type$1.Literal(1),
9771
+ gates: Type$1.Optional(SuccessCriteria.properties.gates),
9772
+ assertions: Type$1.Optional(SuccessCriteria.properties.assertions),
9773
+ sideEffects: Type$1.Optional(SuccessCriteria.properties.sideEffects)
9774
+ }, {
9775
+ $id: "RunEvalSuccessCriteria",
9776
+ additionalProperties: false
9777
+ });
9750
9778
  var RunEvalInput = Type$1.Object({
9751
9779
  scenario: Type$1.Object({
9752
9780
  prompt: Type$1.String({ minLength: 1 }),
@@ -9756,8 +9784,9 @@ var RunEvalInput = Type$1.Object({
9756
9784
  minLength: 1,
9757
9785
  maxLength: 64
9758
9786
  }),
9787
+ execution: RunEvalExecution,
9759
9788
  context: TaskContext,
9760
- successCriteria: Type$1.Optional(SuccessCriteria)
9789
+ successCriteria: Type$1.Optional(RunEvalSuccessCriteria)
9761
9790
  }, {
9762
9791
  $id: "RunEvalInput",
9763
9792
  additionalProperties: false
@@ -9785,8 +9814,8 @@ var RunEvalOutput = Type$1.Object({
9785
9814
  function validateRunEvalOutput(output, input) {
9786
9815
  const hasCriteria = input !== null && input !== void 0 && input.successCriteria !== void 0;
9787
9816
  const hasVerification = output !== null && output !== void 0 && output.verification !== void 0;
9788
- if (hasCriteria && !hasVerification) return "output.verification is required because input.successCriteria is set; the producer LLM must self-assess against the criteria";
9789
- if (!hasCriteria && hasVerification) return "output.verification was supplied but input.successCriteria is unset; omit verification when there are no criteria to assess against";
9817
+ if (hasCriteria && !hasVerification) return "output.verification is required because input.successCriteria is set; the producer LLM must self-assess against the producer checks";
9818
+ if (!hasCriteria && hasVerification) return "output.verification was supplied but input.successCriteria is unset; omit verification when there are no producer checks to assess against";
9790
9819
  return null;
9791
9820
  }
9792
9821
  //#endregion
@@ -9902,24 +9931,24 @@ var BUILT_IN_TASK_TYPES = {
9902
9931
  inputSchema: RunEvalInput,
9903
9932
  outputSchema: RunEvalOutput,
9904
9933
  outputKind: "artifact",
9905
- workspaceScope: "attempt",
9934
+ resumable: true,
9935
+ workspaceScope: "session",
9906
9936
  sessionScope: "custom",
9907
9937
  requiresReferences: false,
9908
9938
  validateOutput: validateRunEvalOutput
9909
9939
  },
9910
- [JUDGE_EVAL_VARIANT_TYPE]: {
9911
- name: JUDGE_EVAL_VARIANT_TYPE,
9912
- inputSchema: JudgeEvalVariantInput,
9913
- outputSchema: JudgeEvalVariantOutput,
9940
+ [JUDGE_EVAL_ATTEMPT_TYPE]: {
9941
+ name: JUDGE_EVAL_ATTEMPT_TYPE,
9942
+ inputSchema: JudgeEvalAttemptInput,
9943
+ outputSchema: JudgeEvalAttemptOutput,
9914
9944
  outputKind: "judgment",
9915
9945
  workspaceScope: "attempt",
9916
- sessionScope: "custom",
9946
+ sessionScope: "none",
9917
9947
  requiresReferences: false,
9918
- validateInput: validateJudgeEvalVariantInput,
9919
- validateOutput: validateJudgeEvalVariantOutput,
9920
- validateInputAsync: validateJudgeEvalVariantInputAsync,
9921
- onCreate: onCreateJudgeEvalVariant,
9922
- usesSubagents: true
9948
+ validateInput: validateJudgeEvalAttemptInput,
9949
+ validateOutput: validateJudgeEvalAttemptOutput,
9950
+ validateInputAsync: validateJudgeEvalAttemptInputAsync,
9951
+ onCreate: onCreateJudgeEvalAttempt
9923
9952
  }
9924
9953
  };
9925
9954
  //#endregion
@@ -10283,20 +10312,16 @@ function buildFinalOutputBlock(opts) {
10283
10312
  "## Final output (read this carefully)",
10284
10313
  "",
10285
10314
  `Your VERY LAST action in this conversation MUST report the structured`,
10286
- `output matching \`${outputSchemaName}\`. Two ways to do it, in order of`,
10287
- `preference:`,
10315
+ `output matching \`${outputSchemaName}\`.`,
10288
10316
  "",
10289
- `1. **Preferred — call \`${submitTool}\` exactly once** with the payload.`,
10290
- ` The runtime captures the validated arguments and ends the session.`,
10291
- ` If the tool is registered, prefer this path.`,
10292
- `2. **Fallback** if the submit tool is unavailable, your very last`,
10293
- ` assistant message MUST be a single JSON object matching`,
10294
- ` \`${outputSchemaName}\`. No prose before or after. No code fences.`,
10295
- ` No "ok" or "done". The runtime parses the last balanced top-level`,
10296
- ` JSON object as the output.`,
10317
+ `Call \`${submitTool}\` exactly once with the payload.`,
10318
+ `The runtime captures the validated arguments and ends the session.`,
10319
+ `Do NOT emit the output as plain assistant text. Do NOT rely on a`,
10320
+ `JSON-in-message fallback. If you do not call \`${submitTool}\`, the`,
10321
+ `attempt fails even if the underlying work succeeded.`,
10297
10322
  "",
10298
- `Failing to report structured output as the very last action means the`,
10299
- `attempt is marked failed even if the underlying work succeeded.`,
10323
+ `Your final assistant text before that tool call may explain your work,`,
10324
+ `but the submit-tool call itself must be your VERY LAST action.`,
10300
10325
  "",
10301
10326
  `Output shape:`,
10302
10327
  "",
@@ -10434,21 +10459,30 @@ function buildAssessBriefUserPrompt(input, ctx) {
10434
10459
  }
10435
10460
  //#endregion
10436
10461
  //#region ../agent-runtime/src/prompts/self-verification.ts
10437
- function buildSelfVerificationBlock(taskId) {
10462
+ function buildSelfVerificationBlock(taskId, criteriaField = "successCriteria") {
10438
10463
  return [
10439
10464
  "## Self-verification",
10440
10465
  "",
10441
- `Call \`moltnet_get_task\` with task id \`${taskId}\` and read \`input.successCriteria\`.`,
10466
+ `If \`input.${criteriaField}\` is set on this task, your final output MUST`,
10467
+ "include a `verification` block. **The runtime/server rejects task",
10468
+ `submission without \`verification\` when \`${criteriaField}\` is present**`,
10469
+ "— the request fails validation and the attempt is discarded, even if the",
10470
+ "underlying work succeeded. Do not call the submit tool until you have",
10471
+ "computed the verification payload.",
10442
10472
  "",
10443
- "- If `input.successCriteria` is **absent**, omit `verification` from your",
10473
+ `Call \`moltnet_get_task\` with task id \`${taskId}\` and read \`input.${criteriaField}\`.`,
10474
+ "",
10475
+ `- If \`input.${criteriaField}\` is **absent**, omit \`verification\` from your`,
10444
10476
  " final output entirely.",
10445
- "- If `input.successCriteria` is **present**, you MUST include a",
10446
- " `verification` block in your final output. Evaluate every applicable",
10477
+ `- If \`input.${criteriaField}\` is **present**, evaluate every applicable`,
10447
10478
  " item — `gates`, `assertions`, `rubric` criteria, `sideEffects` — against",
10448
10479
  " your produced work and emit one result per id. Be honest: a `fail` with",
10449
10480
  " a one-line reason is more useful than a false `pass`. Use `skip` (with a",
10450
10481
  " `detail`) when you genuinely could not determine a result. Compute",
10451
10482
  " `passed = results.every(r => r.status !== 'fail')`.",
10483
+ "- `verification` MUST be a JSON object. Never send a string, markdown",
10484
+ " block, null, or an empty placeholder. The submit tool expects an object",
10485
+ " with `inputCid`, `results`, and `passed` fields.",
10452
10486
  "",
10453
10487
  "Verification shape:",
10454
10488
  "",
@@ -10462,6 +10496,23 @@ function buildSelfVerificationBlock(taskId) {
10462
10496
  " \"passed\": <boolean>",
10463
10497
  "}",
10464
10498
  "```",
10499
+ "",
10500
+ "Minimal valid example:",
10501
+ "",
10502
+ "```json",
10503
+ "{",
10504
+ " \"inputCid\": \"<task inputCid>\",",
10505
+ " \"results\": [",
10506
+ " {",
10507
+ " \"id\": \"<criterion id>\",",
10508
+ " \"kind\": \"rubric\",",
10509
+ " \"status\": \"pass\",",
10510
+ " \"detail\": \"one-line reason\"",
10511
+ " }",
10512
+ " ],",
10513
+ " \"passed\": true",
10514
+ "}",
10515
+ "```",
10465
10516
  ""
10466
10517
  ].join("\n");
10467
10518
  }
@@ -10712,69 +10763,62 @@ function buildFulfillBriefUserPrompt(input, ctx) {
10712
10763
  ].filter(Boolean).join("\n");
10713
10764
  }
10714
10765
  //#endregion
10715
- //#region ../agent-runtime/src/prompts/judge-eval-variant.ts
10716
- /**
10717
- * Build the first user-message prompt for a `judge_eval_variant` task
10718
- * (#943 Slice 2).
10719
- *
10720
- * The parent agent's job is **fan-out-and-collect**: for each
10721
- * `runTaskIds[i]`, spawn an isolated subagent via the `subagent` custom
10722
- * tool (#1087), have it grade that variant against the shared rubric,
10723
- * and collect each subagent's structured `judge_eval_variant_result`
10724
- * payload. The parent does NOT grade itself; it composes the per-
10725
- * variant results into the final `judge_eval_variant` output (results
10726
- * array + optional deltas + verdicts).
10727
- *
10728
- * Isolation is the point: each variant gets a fresh subagent session
10729
- * with no carryover context from sibling variants, so per-variant
10730
- * grading is independent. Cost is bounded by `maxItems: 10` on
10731
- * runTaskIds.
10732
- */
10733
- function buildJudgeEvalVariantUserPrompt(input, ctx) {
10734
- const { runTaskIds, successCriteria } = input;
10735
- const rubric = successCriteria.rubric;
10736
- if (!rubric) throw new Error("judge_eval_variant requires successCriteria.rubric — none present");
10766
+ //#region ../agent-runtime/src/prompts/judge-eval-attempt.ts
10767
+ function buildJudgeEvalAttemptUserPrompt(input, ctx) {
10768
+ const rubric = input.successCriteria.rubric;
10769
+ if (!rubric) throw new Error("judge_eval_attempt requires successCriteria.rubric — none present");
10737
10770
  const escapeCell = (s) => s.replace(/\\/g, "\\\\").replace(/\|/g, "\\|").replace(/\r?\n/g, " ");
10738
10771
  const criteriaTable = rubric.criteria.map((c) => `| \`${c.id}\` | ${c.weight.toFixed(3)} | ${c.scoring} | ${escapeCell(c.description)} |`).join("\n");
10739
- const targetsBlock = runTaskIds.map((id, i) => `${i + 1}. \`${id}\``).join("\n");
10740
10772
  const finalOutputBlock = buildFinalOutputBlock({
10741
- taskType: "judge_eval_variant",
10742
- outputSchemaName: "JudgeEvalVariantOutput",
10773
+ taskType: "judge_eval_attempt",
10774
+ outputSchemaName: "JudgeEvalAttemptOutput",
10743
10775
  shapeSketch: [
10744
10776
  "{",
10745
- " \"results\": [",
10746
- " {",
10747
- " \"runTaskId\": \"<runTaskIds[i]>\",",
10748
- " \"variantLabel\": \"<from variant input>\",",
10749
- " \"scores\": [ { \"criterionId\": \"...\", \"score\": 0..1, \"rationale\": \"...\", \"assertions\": [...]? } ],",
10750
- " \"composite\": <Σ(weight × score), 0..1>,",
10751
- " \"verdict\": \"<1-3 sentences>\"",
10752
- " },",
10753
- " ...one entry per runTaskIds[i], same order",
10754
- " ],",
10755
- " \"deltas\": { \"<labelA> - <labelB>\": <composite(A) - composite(B)> }, // optional",
10777
+ ` "targetTaskId": "${input.targetTaskId}",`,
10778
+ ` "targetAttemptN": ${input.targetAttemptN},`,
10779
+ " \"variantLabel\": \"<from producer input>\",",
10780
+ " \"scores\": [ { \"criterionId\": \"...\", \"score\": 0..1, \"rationale\": \"...\", \"assertions\": [...]? } ],",
10781
+ " \"composite\": <Σ(weight × score), 0..1>,",
10782
+ " \"verdict\": \"<1-3 sentences>\",",
10756
10783
  " \"judgeModel\": \"<id>\", // optional",
10757
10784
  " \"traceparent\": \"<from claim>\"",
10758
10785
  "}"
10759
10786
  ].join("\n")
10760
10787
  });
10788
+ const workspaceSection = ctx.workspace?.attached === true ? [
10789
+ "### Workspace",
10790
+ "",
10791
+ "Your current workspace is already attached to the producer attempt",
10792
+ "you are judging. Inspect files directly from the current workspace",
10793
+ "root instead of inventing synthetic `artifact_<taskId>` paths.",
10794
+ "If the accepted attempt output lists `artifacts[].path`, treat those",
10795
+ "paths as relative to the current workspace root unless the output",
10796
+ "explicitly says otherwise.",
10797
+ ctx.workspace.mode === "dedicated_worktree" ? `This attachment is a dedicated producer worktree${ctx.workspace.branch ? ` on branch \`${ctx.workspace.branch}\`` : ""}.` : ctx.workspace.mode === "scratch_mount" ? "This attachment is the producer scratch workspace mounted with shadow writes for safe inspection." : "This attachment is the producer shared workspace mounted with shadow writes for safe inspection.",
10798
+ ""
10799
+ ].join("\n") : "";
10761
10800
  return [
10762
- "# Judge Eval Variants\n",
10763
- `You are grading ${runTaskIds.length} variants of a single run_eval scenario`,
10764
- "against ONE shared rubric. Your job is fan-out-and-collect you do not",
10765
- "grade yourself.",
10801
+ "# Judge Eval Attempt\n",
10802
+ "You are grading one accepted `run_eval` producer attempt against a hidden",
10803
+ "judge rubric. Do not delegate to subagents. Grade in this session only.",
10766
10804
  "",
10767
10805
  `Task id: \`${ctx.taskId}\``,
10768
10806
  `Diary: \`${ctx.diaryId}\``,
10807
+ `Producer task: \`${input.targetTaskId}\``,
10808
+ `Producer attempt: \`${input.targetAttemptN}\``,
10769
10809
  "",
10770
- "### Targets (variants to grade)",
10771
- "",
10772
- targetsBlock,
10810
+ "### Evidence gathering",
10773
10811
  "",
10774
- "Each target is a completed `run_eval` task in the same correlation group.",
10775
- "Read its accepted attempt via `moltnet_get_task` / `moltnet_list_task_attempts`",
10776
- "to see the producer's output before grading.",
10812
+ `1. Call \`moltnet_get_task\` with taskId=\`${input.targetTaskId}\`.`,
10813
+ `2. Call \`moltnet_list_task_attempts\` with taskId=\`${input.targetTaskId}\` and inspect the accepted attempt matching \`${input.targetAttemptN}\`.`,
10814
+ `3. Call \`moltnet_list_task_messages\` with taskId=\`${input.targetTaskId}\`, attemptN=\`${input.targetAttemptN}\` to inspect the producer's turn-by-turn behavior.`,
10815
+ "4. Use the accepted attempt output, attempt messages, and any accessible",
10816
+ " artifacts or workspace evidence available in your environment.",
10817
+ " Read artifact files from the mounted producer workspace when present;",
10818
+ " do not assume detached `artifact_<taskId>` directories exist.",
10819
+ "5. Score strictly against the rubric below.",
10777
10820
  "",
10821
+ workspaceSection,
10778
10822
  "### Rubric",
10779
10823
  "",
10780
10824
  rubric.preamble ? `${rubric.preamble}\n` : "",
@@ -10782,34 +10826,10 @@ function buildJudgeEvalVariantUserPrompt(input, ctx) {
10782
10826
  "| --- | --- | --- | --- |",
10783
10827
  criteriaTable,
10784
10828
  "",
10785
- "### How to grade",
10786
- "",
10787
- "For EACH `runTaskIds[i]`:",
10788
- "",
10789
- "1. Call the `subagent` custom tool with:",
10790
- " - `task`: a brief instructing the subagent to grade ONLY that variant",
10791
- " against the rubric above; include the target task id and the rubric",
10792
- " verbatim. The subagent has the same MoltNet tools and can fetch the",
10793
- " accepted attempt output independently.",
10794
- " - `output_schema`: `\"judge_eval_variant_result\"`",
10795
- "2. Receive the subagent's structured `judge_eval_variant_result` payload.",
10796
- "3. Append it to your `results[]` array, **in the same order as input.runTaskIds**.",
10797
- "",
10798
- "Do NOT score any variant in your own session. The whole point of the",
10799
- "subagent fan-out is per-variant context isolation — grading two variants",
10800
- "back-to-back in one session lets the second be biased by the first.",
10801
- "",
10802
10829
  "### Composite arithmetic",
10803
10830
  "",
10804
- "Each `composite` MUST equal `Σ(criterion.weight × score)` over the rubric",
10805
- "criteria. Drift > 0.001 is rejected. Subagents are instructed to compute it",
10806
- "themselves; double-check before assembling the final output.",
10807
- "",
10808
- "### Deltas (optional)",
10809
- "",
10810
- "If useful, populate `deltas` with pairwise composite differences keyed by",
10811
- "`\"<variantLabel-A> - <variantLabel-B>\"` (single space-hyphen-space). Both",
10812
- "labels must appear in `results`. Omit `deltas` entirely if not used.",
10831
+ "Your `composite` MUST equal `Σ(criterion.weight × score)` over the rubric",
10832
+ "criteria. Drift > 0.001 is rejected.",
10813
10833
  "",
10814
10834
  finalOutputBlock
10815
10835
  ].filter((s) => s !== "").join("\n");
@@ -11106,8 +11126,9 @@ function buildRenderPackUserPrompt(input, ctx) {
11106
11126
  * Build the first user-message prompt for a `run_eval` task.
11107
11127
  *
11108
11128
  * Free-form: no git workflow, no commit ceremony. The executor produces
11109
- * a textual response (and optional file artifacts) that a later
11110
- * `judge_eval_variant` task (Slice 2) grades against the rubric.
11129
+ * a textual response (and optional file artifacts) that later
11130
+ * `judge_eval_attempt` task(s) grade against their own hidden
11131
+ * rubric.
11111
11132
  *
11112
11133
  * Context delivery is handled by `resolveTaskContext` (see
11113
11134
  * libs/agent-runtime/src/context-bindings.ts) and runs BEFORE this
@@ -11117,7 +11138,9 @@ function buildRenderPackUserPrompt(input, ctx) {
11117
11138
  * builder does NOT inline `input.context[]` itself.
11118
11139
  */
11119
11140
  function buildRunEvalUserPrompt(input, ctx) {
11120
- const { scenario, variantLabel, successCriteria } = input;
11141
+ const { scenario, variantLabel, execution, successCriteria } = input;
11142
+ const hasContext = input.context.length > 0;
11143
+ const hasInlineContext = input.context.some((entry) => entry.binding === "context_inline");
11121
11144
  const inputFilesSection = scenario.inputFiles?.length ? [
11122
11145
  "### Input files",
11123
11146
  "",
@@ -11130,9 +11153,30 @@ function buildRunEvalUserPrompt(input, ctx) {
11130
11153
  "",
11131
11154
  `This task carries correlationId \`${ctx.correlationId}\`. It joins`,
11132
11155
  "this variant to its sibling `run_eval` tasks (other variants of the",
11133
- "same scenario) and to the eventual `judge_eval_variant` task that",
11134
- "will grade them together. You do not need to act on it directly —",
11135
- "it is recorded for cross-variant aggregation at query time.",
11156
+ "same scenario and to any later `judge_eval_attempt` tasks created",
11157
+ "against those variants. You do not need to act on it directly — it",
11158
+ "is recorded for cross-variant aggregation at query time.",
11159
+ ""
11160
+ ].join("\n") : "";
11161
+ const executionSection = [
11162
+ "### Execution mode",
11163
+ "",
11164
+ `Mode: \`${execution.mode}\``,
11165
+ `Workspace: \`${execution.workspace}\``,
11166
+ execution.workspace === "none" ? "You are running in a scratch workspace with no repository checkout mounted. Do not assume git history or repo files are present unless the scenario provided them explicitly." : execution.workspace === "shared_mount" ? "You are running against the daemon shared mount. Treat any repository mutations as affecting the mounted checkout directly." : "You are running in a dedicated disposable git worktree isolated from the daemon shared checkout.",
11167
+ ""
11168
+ ].join("\n");
11169
+ const contextDisciplineSection = hasContext ? [
11170
+ "### Injected context discipline",
11171
+ "",
11172
+ "This task includes extra injected context from the task creator.",
11173
+ "You MUST inspect and use that context BEFORE you write solution",
11174
+ "files or draft your final answer.",
11175
+ "Do not solve first and only review the context afterward.",
11176
+ hasInlineContext ? "For `context_inline`, your FIRST content-inspection step should be a `read` of `/workspace/context-pack.md` before your first `write` call. The same content is also mirrored in `/workspace/AGENTS.md` and may be referenced from `/workspace/.claude/CLAUDE.md`." : "If injected context was provided as a skill, inspect that task-injected context before solving.",
11177
+ hasInlineContext ? "If `/workspace/context-pack.md` exists and you skip reading it before writing solution files, you are not following the task instructions." : "Do not rely on memory alone when task-injected context is available; inspect it first.",
11178
+ "If the injected context contains repo- or workflow-specific rules,",
11179
+ "those rules override your generic instincts.",
11136
11180
  ""
11137
11181
  ].join("\n") : "";
11138
11182
  const finalOutputBlock = buildFinalOutputBlock({
@@ -11145,7 +11189,13 @@ function buildRunEvalUserPrompt(input, ctx) {
11145
11189
  " \"totalTokens\": <int>,",
11146
11190
  " \"durationMs\": <int>,",
11147
11191
  " \"traceparent\": \"<from claim>\",",
11148
- " \"verification\": <required iff input.successCriteria; see Self-verification>",
11192
+ " \"verification\": {",
11193
+ " \"inputCid\": \"<task inputCid>\",",
11194
+ " \"results\": [",
11195
+ " { \"id\": \"<criterion id>\", \"kind\": \"rubric\", \"status\": \"pass|fail|skip\", \"detail\": \"<optional one-liner>\" }",
11196
+ " ],",
11197
+ " \"passed\": <boolean>",
11198
+ " } // required iff input.successCriteria; must be an object, never a string",
11149
11199
  "}"
11150
11200
  ].join("\n")
11151
11201
  });
@@ -11153,6 +11203,8 @@ function buildRunEvalUserPrompt(input, ctx) {
11153
11203
  "# Run Eval Agent\n",
11154
11204
  `You are running an evaluation scenario as variant \`${variantLabel}\`.\nTask id: \`${ctx.taskId}\`\n`,
11155
11205
  correlationSection,
11206
+ executionSection,
11207
+ contextDisciplineSection,
11156
11208
  `### Scenario\n\n${scenario.prompt}\n`,
11157
11209
  inputFilesSection,
11158
11210
  verificationSection,
@@ -11224,6 +11276,16 @@ function buildTaskUserPrompt(task, ctx) {
11224
11276
  diaryId: ctx.diaryId,
11225
11277
  taskId: ctx.taskId
11226
11278
  });
11279
+ case JUDGE_EVAL_ATTEMPT_TYPE:
11280
+ if (!Value.Check(JudgeEvalAttemptInput, task.input)) {
11281
+ const errors = [...Value.Errors(JudgeEvalAttemptInput, task.input)];
11282
+ throw new Error(`judge_eval_attempt input failed validation: ${JSON.stringify(errors.slice(0, 3))}`);
11283
+ }
11284
+ return buildJudgeEvalAttemptUserPrompt(task.input, {
11285
+ diaryId: ctx.diaryId,
11286
+ taskId: ctx.taskId,
11287
+ workspace: ctx.workspace
11288
+ });
11227
11289
  case PR_REVIEW_TYPE:
11228
11290
  if (!Value.Check(PrReviewInput, task.input)) {
11229
11291
  const errors = [...Value.Errors(PrReviewInput, task.input)];
@@ -11234,15 +11296,6 @@ function buildTaskUserPrompt(task, ctx) {
11234
11296
  taskId: ctx.taskId,
11235
11297
  workspace: ctx.workspace
11236
11298
  });
11237
- case JUDGE_EVAL_VARIANT_TYPE:
11238
- if (!Value.Check(JudgeEvalVariantInput, task.input)) {
11239
- const errors = [...Value.Errors(JudgeEvalVariantInput, task.input)];
11240
- throw new Error(`judge_eval_variant input failed validation: ${JSON.stringify(errors.slice(0, 3))}`);
11241
- }
11242
- return buildJudgeEvalVariantUserPrompt(task.input, {
11243
- diaryId: ctx.diaryId,
11244
- taskId: ctx.taskId
11245
- });
11246
11299
  case RUN_EVAL_TYPE:
11247
11300
  if (!Value.Check(RunEvalInput, task.input)) {
11248
11301
  const errors = [...Value.Errors(RunEvalInput, task.input)];
@@ -14760,6 +14813,11 @@ var require_multistream = /* @__PURE__ */ __commonJSMin(((exports, module) => {
14760
14813
  * paths under this mount via `toGuestPath` in `tool-operations.ts`.
14761
14814
  */
14762
14815
  var SKILL_ROOT_IN_VM = GUEST_TASK_SKILLS_MOUNT;
14816
+ var INLINE_CONTEXT_ROOT_IN_VM = "/workspace/.moltnet/context";
14817
+ var WORKSPACE_CONTEXT_PACK = "/workspace/context-pack.md";
14818
+ var WORKSPACE_AGENTS_MD = "/workspace/AGENTS.md";
14819
+ var WORKSPACE_CLAUDE_DIR = "/workspace/.claude";
14820
+ var WORKSPACE_CLAUDE_MD = "/workspace/.claude/CLAUDE.md";
14763
14821
  /** Bounds borrowed from pi's skill validation; conservative caps so a
14764
14822
  * malformed SKILL.md doesn't bloat the system prompt. */
14765
14823
  var MAX_SKILL_NAME = 64;
@@ -14770,21 +14828,40 @@ var MAX_SKILL_DESCRIPTION = 1024;
14770
14828
  */
14771
14829
  async function injectTaskContext(args) {
14772
14830
  const skills = [];
14831
+ const inlineContexts = [];
14773
14832
  const resolved = await resolveTaskContext({
14774
14833
  context: args.context,
14775
- deliver: { skill: async ({ slug, content }) => {
14776
- const dir = `${SKILL_ROOT_IN_VM}/${slug}`;
14777
- const filePath = `${dir}/SKILL.md`;
14778
- await args.fs.mkdir(dir, { recursive: true });
14779
- await args.fs.writeFile(filePath, content, { mode: 420 });
14780
- skills.push(buildSyntheticSkill({
14781
- slug,
14782
- content,
14783
- filePath,
14784
- dir
14785
- }));
14786
- } }
14834
+ deliver: {
14835
+ skill: async ({ slug, content }) => {
14836
+ const dir = `${SKILL_ROOT_IN_VM}/${slug}`;
14837
+ const filePath = `${dir}/SKILL.md`;
14838
+ await args.fs.mkdir(dir, { recursive: true });
14839
+ await args.fs.writeFile(filePath, content, { mode: 420 });
14840
+ skills.push(buildSyntheticSkill({
14841
+ slug,
14842
+ content,
14843
+ filePath,
14844
+ dir
14845
+ }));
14846
+ },
14847
+ contextFile: async ({ suggestedFileName, content }) => {
14848
+ await args.fs.mkdir(INLINE_CONTEXT_ROOT_IN_VM, { recursive: true });
14849
+ const filePath = `${INLINE_CONTEXT_ROOT_IN_VM}/${suggestedFileName}`;
14850
+ await args.fs.writeFile(filePath, content, { mode: 420 });
14851
+ inlineContexts.push({
14852
+ slug: suggestedFileName.replace(/\.md$/u, ""),
14853
+ content
14854
+ });
14855
+ }
14856
+ }
14787
14857
  });
14858
+ if (inlineContexts.length > 0) {
14859
+ const packContent = buildWorkspaceContextPack(inlineContexts);
14860
+ await args.fs.writeFile(WORKSPACE_CONTEXT_PACK, packContent, { mode: 420 });
14861
+ await args.fs.writeFile(WORKSPACE_AGENTS_MD, packContent, { mode: 420 });
14862
+ await args.fs.mkdir(WORKSPACE_CLAUDE_DIR, { recursive: true });
14863
+ await args.fs.writeFile(WORKSPACE_CLAUDE_MD, "@../context-pack.md\n", { mode: 420 });
14864
+ }
14788
14865
  return {
14789
14866
  injected: resolved.injected,
14790
14867
  skills,
@@ -14792,6 +14869,17 @@ async function injectTaskContext(args) {
14792
14869
  userInlineSuffix: resolved.userInlineSuffix
14793
14870
  };
14794
14871
  }
14872
+ function buildWorkspaceContextPack(contexts) {
14873
+ return [
14874
+ "# Context Pack",
14875
+ "",
14876
+ ...contexts.map(({ slug, content }) => [
14877
+ `## ${slug}`,
14878
+ "",
14879
+ content.trimEnd()
14880
+ ].join("\n"))
14881
+ ].join("\n\n").trimEnd() + "\n";
14882
+ }
14795
14883
  /**
14796
14884
  * Build a `Skill` object pi will faithfully render in
14797
14885
  * `<available_skills>`. We extract `name` and `description` from the
@@ -15155,7 +15243,7 @@ async function parseStructuredTaskOutput(assistantText, taskType, opts = {}) {
15155
15243
  }
15156
15244
  };
15157
15245
  }
15158
- const errors = validateTaskOutput(taskType, extracted);
15246
+ const errors = validateTaskOutput(taskType, extracted, opts.input);
15159
15247
  if (errors.length > 0) {
15160
15248
  const details = errors.slice(0, 3).map((error) => `${error.field}: ${error.message}`);
15161
15249
  const [firstError] = errors;
@@ -15269,7 +15357,7 @@ function createSubmitOutputTool(taskType, opts = {}) {
15269
15357
  description: contract.description,
15270
15358
  parameters: schema,
15271
15359
  async execute(_id, params) {
15272
- const errors = validateTaskOutput(taskType, params);
15360
+ const errors = validateTaskOutput(taskType, params, opts.input);
15273
15361
  if (errors.length > 0) {
15274
15362
  const detailMsg = errors.slice(0, 3).map((err) => `${err.field}: ${err.message}`).join("; ");
15275
15363
  const details = {
@@ -15338,6 +15426,39 @@ function resolveSubmitTools(taskType, opts = {}) {
15338
15426
  //#region src/runtime/task-workspace.ts
15339
15427
  function prepareTaskWorkspace(task, requestedMountPath, executionPlan) {
15340
15428
  const branch = executionPlan?.worktreeBranch ?? null;
15429
+ const workspaceMode = executionPlan?.workspaceMode ?? "shared_mount";
15430
+ const attachedWorkspace = executionPlan?.workspaceAttachment ?? null;
15431
+ if (attachedWorkspace) return {
15432
+ mountPath: attachedWorkspace.mountPath,
15433
+ cwdPath: attachedWorkspace.cwdPath,
15434
+ mode: workspaceMode,
15435
+ branch,
15436
+ cleanup: () => {}
15437
+ };
15438
+ if (workspaceMode === "scratch_mount") {
15439
+ const scratchDir = resolveTaskScratchPath(findMainWorktree(), executionPlan?.workspaceId ?? `task-${task.id}`);
15440
+ const keepWorkspace = executionPlan?.workspaceScope === "session" && executionPlan.sessionKey !== null;
15441
+ if (keepWorkspace) mkdirSync(scratchDir, { recursive: true });
15442
+ else {
15443
+ rmSync(scratchDir, {
15444
+ recursive: true,
15445
+ force: true
15446
+ });
15447
+ mkdirSync(scratchDir, { recursive: true });
15448
+ }
15449
+ return {
15450
+ mountPath: scratchDir,
15451
+ cwdPath: scratchDir,
15452
+ mode: "scratch_mount",
15453
+ branch: null,
15454
+ cleanup: keepWorkspace ? () => {} : () => {
15455
+ rmSync(scratchDir, {
15456
+ recursive: true,
15457
+ force: true
15458
+ });
15459
+ }
15460
+ };
15461
+ }
15341
15462
  if (!branch) return {
15342
15463
  mountPath: requestedMountPath,
15343
15464
  cwdPath: requestedMountPath,
@@ -15375,6 +15496,9 @@ function prepareTaskWorkspace(task, requestedMountPath, executionPlan) {
15375
15496
  function resolveTaskWorktreePath(mainRepo, workspaceId) {
15376
15497
  return join(mainRepo, ".worktrees", workspaceId);
15377
15498
  }
15499
+ function resolveTaskScratchPath(mainRepo, workspaceId) {
15500
+ return join(mainRepo, ".moltnet", "d", "task-workspaces", workspaceId);
15501
+ }
15378
15502
  function ensureReusableTaskWorktree(mainRepo, worktreeDir, branch) {
15379
15503
  if (isRegisteredWorktree(mainRepo, worktreeDir)) return;
15380
15504
  if (existsSync(worktreeDir)) throw new Error(`Expected reusable worktree ${worktreeDir} to be git-managed, but it exists outside git worktree metadata.`);
@@ -15611,12 +15735,14 @@ async function executePiTask(claimedTask, reporter, opts) {
15611
15735
  return makeFailedOutput("worktree_setup_failed", message);
15612
15736
  }
15613
15737
  try {
15738
+ const sandboxConfig = applyExecutionPlanSandboxOverrides(opts.sandboxConfig, executionPlan);
15614
15739
  managed = await resumeVm({
15615
15740
  checkpointPath,
15616
15741
  agentName: opts.agentName,
15617
15742
  mountPath,
15743
+ workspaceMode: workspace.mode,
15618
15744
  extraAllowedHosts: opts.extraAllowedHosts,
15619
- sandboxConfig: opts.sandboxConfig
15745
+ sandboxConfig
15620
15746
  });
15621
15747
  } catch (err) {
15622
15748
  const message = err instanceof Error ? err.message : String(err);
@@ -15645,7 +15771,8 @@ async function executePiTask(claimedTask, reporter, opts) {
15645
15771
  taskId: task.id,
15646
15772
  workspace: {
15647
15773
  mode: activeWorkspace.mode,
15648
- branch: activeWorkspace.branch
15774
+ branch: activeWorkspace.branch,
15775
+ attached: executionPlan?.workspaceAttachment !== void 0
15649
15776
  },
15650
15777
  extras: opts.promptExtras
15651
15778
  });
@@ -15687,7 +15814,10 @@ async function executePiTask(claimedTask, reporter, opts) {
15687
15814
  createEditToolDefinition(mountPath, { operations: createGondolinEditOps(managed.vm, mountPath) }),
15688
15815
  createBashToolDefinition(mountPath, { operations: createGondolinBashOps(managed.vm, mountPath) })
15689
15816
  ];
15690
- const { handle: submitToolHandle, tools: submitToolDefs } = resolveSubmitTools(task.taskType, { model: opts.model });
15817
+ const { handle: submitToolHandle, tools: submitToolDefs } = resolveSubmitTools(task.taskType, {
15818
+ model: opts.model,
15819
+ input: task.input
15820
+ });
15691
15821
  const submitTools = submitToolDefs;
15692
15822
  try {
15693
15823
  const moltnetAgent = await connect({ configDir: managed.agentDir });
@@ -15906,8 +16036,20 @@ async function executePiTask(claimedTask, reporter, opts) {
15906
16036
  phase: "output_validation"
15907
16037
  });
15908
16038
  }
15909
- else {
15910
- const parsed = await parseStructuredTaskOutput(assistantText, task.taskType, { model: opts.model });
16039
+ else if (submitToolHandle) {
16040
+ parseError = {
16041
+ code: "output_missing",
16042
+ message: "Agent did not submit output through the task submit tool. A valid submit tool call is required to complete this task type."
16043
+ };
16044
+ await emit("error", {
16045
+ message: parseError.message,
16046
+ phase: "output_validation"
16047
+ });
16048
+ } else {
16049
+ const parsed = await parseStructuredTaskOutput(assistantText, task.taskType, {
16050
+ model: opts.model,
16051
+ input: task.input
16052
+ });
15911
16053
  parsedOutput = parsed.output;
15912
16054
  parsedOutputCid = parsed.outputCid;
15913
16055
  parseError = parsed.error;
@@ -15993,6 +16135,18 @@ async function executePiTask(claimedTask, reporter, opts) {
15993
16135
  }
15994
16136
  }
15995
16137
  }
16138
/**
 * Layer execution-plan overrides on top of the caller's sandbox config.
 *
 * When the plan's workspace attachment carries a truthy `shadowWrites`
 * value, a new config object is returned whose `vfs` section keeps the
 * existing `vfs` keys but sets `shadow` to `["**"]` and `shadowMode` to that
 * value. Otherwise the original config is returned untouched (same
 * reference). The input objects are never mutated.
 *
 * @param {object | undefined} sandboxConfig - Base sandbox configuration.
 * @param {object | null | undefined} executionPlan - Plan that may carry
 *   `workspaceAttachment.shadowWrites`.
 * @returns {object | undefined} The original config, or a shallow copy with
 *   the `vfs` overrides applied.
 */
function applyExecutionPlanSandboxOverrides(sandboxConfig, executionPlan) {
  const attachment = executionPlan?.workspaceAttachment;
  const shadowMode = attachment?.shadowWrites;
  if (shadowMode) {
    // NOTE(review): "**" presumably shadows every path in the mount — confirm
    // against the sandbox VFS documentation.
    const vfs = {
      ...sandboxConfig?.vfs,
      shadow: ["**"],
      shadowMode
    };
    return {
      ...sandboxConfig,
      vfs
    };
  }
  return sandboxConfig;
}
15996
16150
  function emptyUsage(provider, model) {
15997
16151
  return {
15998
16152
  inputTokens: 0,
@@ -16210,6 +16364,7 @@ function moltnetExtension(pi) {
16210
16364
  checkpointPath,
16211
16365
  agentName,
16212
16366
  mountPath,
16367
+ workspaceMode: "shared_mount",
16213
16368
  sandboxConfig
16214
16369
  });
16215
16370
  activateAgentEnv(managed.credentials.agentEnv, mainRepo);