@themoltnet/pi-extension 0.18.0 → 0.19.0

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -2386,12 +2386,20 @@ var MoltNetError = class extends Error {
2386
2386
  code;
2387
2387
  statusCode;
2388
2388
  detail;
2389
+ /**
2390
+ * Populated when the server returned a `VALIDATION_FAILED` problem
2391
+ * (status 400) with field-level errors. Empty / undefined for every
2392
+ * other problem kind. Imposer scripts surface these to operators so
2393
+ * they don't have to re-run with curl to see what was rejected.
2394
+ */
2395
+ validationErrors;
2389
2396
  constructor(message, options) {
2390
2397
  super(message);
2391
2398
  this.name = "MoltNetError";
2392
2399
  this.code = options.code;
2393
2400
  this.statusCode = options.statusCode;
2394
2401
  this.detail = options.detail;
2402
+ this.validationErrors = options.validationErrors;
2395
2403
  }
2396
2404
  };
2397
2405
  var NetworkError = class extends MoltNetError {
@@ -2415,10 +2423,14 @@ var AuthenticationError = class extends MoltNetError {
2415
2423
  };
2416
2424
  function problemToError(problem, statusCode) {
2417
2425
  const title = problem.title ?? "Request failed";
2418
- return new MoltNetError(problem.detail ? `${title}: ${problem.detail}` : title, {
2426
+ const message = problem.detail ? `${title}: ${problem.detail}` : title;
2427
+ const rawErrors = problem.errors;
2428
+ const validationErrors = Array.isArray(rawErrors) ? rawErrors.filter((e) => typeof e === "object" && e !== null && typeof e.field === "string" && typeof e.message === "string") : void 0;
2429
+ return new MoltNetError(message, {
2419
2430
  code: problem.type ?? problem.code ?? "UNKNOWN",
2420
2431
  statusCode,
2421
- detail: problem.detail
2432
+ detail: problem.detail,
2433
+ validationErrors
2422
2434
  });
2423
2435
  }
2424
2436
  //#endregion
@@ -7767,6 +7779,41 @@ function createMoltNetTools(config) {
7767
7779
  };
7768
7780
  }
7769
7781
  });
7782
+ const listTaskMessages = defineTool({
7783
+ name: "moltnet_list_task_messages",
7784
+ label: "List MoltNet Task Attempt Messages",
7785
+ description: "List messages for a specific task attempt. Use this when you need the turn-by-turn execution record behind an accepted attempt — tool calls, text deltas, and error/info events that do not appear in the attempt output alone.",
7786
+ parameters: Type.Object({
7787
+ taskId: Type.String({ description: "Task ID (UUID)." }),
7788
+ attemptN: Type.Integer({
7789
+ minimum: 1,
7790
+ description: "Attempt number to inspect."
7791
+ }),
7792
+ afterSeq: Type.Optional(Type.Integer({
7793
+ minimum: 0,
7794
+ description: "Optional cursor: only return messages with seq > afterSeq."
7795
+ })),
7796
+ limit: Type.Optional(Type.Integer({
7797
+ minimum: 1,
7798
+ maximum: 500,
7799
+ description: "Optional maximum messages to return. Defaults to the API value."
7800
+ }))
7801
+ }),
7802
+ async execute(_id, params) {
7803
+ const { agent } = ensureConnected(config);
7804
+ const messages = await agent.tasks.listMessages(params.taskId, params.attemptN, {
7805
+ afterSeq: params.afterSeq,
7806
+ limit: params.limit
7807
+ });
7808
+ return {
7809
+ content: [{
7810
+ type: "text",
7811
+ text: JSON.stringify(messages, null, 2)
7812
+ }],
7813
+ details: {}
7814
+ };
7815
+ }
7816
+ });
7770
7817
  const reviewSessionErrors = defineTool({
7771
7818
  name: "moltnet_review_session_errors",
7772
7819
  label: "Review Session Tool Errors",
@@ -7815,6 +7862,7 @@ function createMoltNetTools(config) {
7815
7862
  createEntry,
7816
7863
  getTask,
7817
7864
  listTaskAttempts,
7865
+ listTaskMessages,
7818
7866
  reviewSessionErrors,
7819
7867
  defineTool({
7820
7868
  name: "moltnet_host_exec",
@@ -8113,6 +8161,12 @@ var GUEST_WORKSPACE$2 = "/workspace";
8113
8161
  * investigation and the alternatives we rejected.
8114
8162
  */
8115
8163
  var GUEST_TASK_SKILLS_MOUNT = "/moltnet-task-skills";
8164
+ function shouldRunResumeCommand(entry, ctx) {
8165
+ if (typeof entry === "string") return true;
8166
+ const workspaceModes = entry.when?.workspaceMode;
8167
+ if (workspaceModes && !workspaceModes.includes(ctx.workspaceMode)) return false;
8168
+ return true;
8169
+ }
8116
8170
  /**
8117
8171
  * Resolve the main worktree root (where .moltnet/ lives — it's untracked,
8118
8172
  * only exists in the main worktree, not in git worktrees).
@@ -8258,6 +8312,7 @@ async function resumeVm(config) {
8258
8312
  ...envOverrides
8259
8313
  };
8260
8314
  const resources = config.sandboxConfig?.resources;
8315
+ const workspaceMode = config.workspaceMode ?? "shared_mount";
8261
8316
  const vm = await VmCheckpoint.load(config.checkpointPath).resume({
8262
8317
  httpHooks,
8263
8318
  env: vmEnv,
@@ -8276,7 +8331,32 @@ async function resumeVm(config) {
8276
8331
  '`);
8277
8332
  await vmRun(vm, "DNS resolvers", `printf 'nameserver 8.8.8.8\\nnameserver 1.1.1.1\\n' > /etc/resolv.conf`);
8278
8333
  await vmRun(vm, "git safe.directory", `git config --system --add safe.directory '*'`);
8279
- for (const [i, cmd] of (config.sandboxConfig?.resumeCommands ?? []).entries()) await vmRun(vm, `resumeCommands[${i}]`, cmd);
8334
+ for (const [i, entry] of (config.sandboxConfig?.resumeCommands ?? []).entries()) {
8335
+ if (!shouldRunResumeCommand(entry, { workspaceMode })) continue;
8336
+ const { run, retries, backoffMs } = typeof entry === "string" ? {
8337
+ run: entry,
8338
+ retries: 0,
8339
+ backoffMs: 2e3
8340
+ } : {
8341
+ run: entry.run,
8342
+ retries: entry.retries ?? 0,
8343
+ backoffMs: entry.retryBackoffMs ?? 2e3
8344
+ };
8345
+ const label = `resumeCommands[${i}]`;
8346
+ let lastErr;
8347
+ for (let attempt = 0; attempt <= retries; attempt++) try {
8348
+ await vmRun(vm, label, run);
8349
+ lastErr = void 0;
8350
+ break;
8351
+ } catch (err) {
8352
+ lastErr = err;
8353
+ if (attempt === retries) break;
8354
+ await new Promise((resolve) => {
8355
+ setTimeout(resolve, (attempt + 1) * backoffMs);
8356
+ });
8357
+ }
8358
+ if (lastErr) throw lastErr instanceof Error ? lastErr : new Error(String(lastErr));
8359
+ }
8280
8360
  const vmSshDir = `${vmAgentDir}/ssh`;
8281
8361
  await vm.exec(`mkdir -p ${vmAgentDir}/ssh /home/agent/.pi/agent`);
8282
8362
  if (creds.piAuthJson !== null) await vm.fs.writeFile("/home/agent/.pi/agent/auth.json", creds.piAuthJson, { mode: 384 });
@@ -8655,7 +8735,8 @@ async function buildAgentSession(args) {
8655
8735
  await resourceLoader.reload();
8656
8736
  const sessionManager = args.sessionPersistence ? await resolvePersistentSessionManager({
8657
8737
  cwd: args.cwdPath,
8658
- sessionDir: args.sessionPersistence.sessionDir
8738
+ sessionDir: args.sessionPersistence.sessionDir,
8739
+ forkFromSessionPath: args.sessionPersistence.forkFromSessionPath
8659
8740
  }) : SessionManager.inMemory(args.cwdPath);
8660
8741
  return (await createAgentSession({
8661
8742
  agentDir: args.piAuthDir,
@@ -8667,6 +8748,7 @@ async function buildAgentSession(args) {
8667
8748
  })).session;
8668
8749
  }
8669
8750
  async function resolvePersistentSessionManager(args) {
8751
+ if (args.forkFromSessionPath) return SessionManager.forkFrom(args.forkFromSessionPath, args.cwd, args.sessionDir);
8670
8752
  await SessionManager.list(args.cwd, args.sessionDir);
8671
8753
  return SessionManager.continueRecent(args.cwd, args.sessionDir);
8672
8754
  }
@@ -8683,6 +8765,11 @@ var PROMPT_SEPARATOR = "\n\n---\n\n";
8683
8765
  * - `skill` → `deliver.skill({ slug, content })` once per ref.
8684
8766
  * Slug collisions on distinct contents are
8685
8767
  * refused loudly.
8768
+ * - `context_inline` → persist raw bytes via `deliver.contextFile(...)`
8769
+ * and inject them into the prompt in an explicit,
8770
+ * named block. Intended for eval/context experiments
8771
+ * where the content must be in the model context
8772
+ * window, not merely discoverable as a skill.
8686
8773
  * - `prompt_prefix` → content appended to `systemPromptPrefix` with
8687
8774
  * the canonical `\n\n---\n\n` separator (in
8688
8775
  * declared order).
@@ -8715,6 +8802,13 @@ async function resolveTaskContext(args) {
8715
8802
  slug: ref.slug,
8716
8803
  content: ref.content
8717
8804
  });
8805
+ } else if (ref.binding === "context_inline") {
8806
+ await args.deliver.contextFile({
8807
+ slug: ref.slug,
8808
+ content: ref.content,
8809
+ suggestedFileName: `${ref.slug}.md`
8810
+ });
8811
+ promptParts.push(formatInlineContextBlock(ref.slug, ref.content));
8718
8812
  } else if (ref.binding === "prompt_prefix") promptParts.push(ref.content);
8719
8813
  else userParts.push(ref.content);
8720
8814
  injected.push(ref);
@@ -8725,6 +8819,23 @@ async function resolveTaskContext(args) {
8725
8819
  userInlineSuffix: userParts.join(PROMPT_SEPARATOR)
8726
8820
  };
8727
8821
  }
8822
+ function formatInlineContextBlock(slug, content) {
8823
+ return [
8824
+ "### Injected Task Context",
8825
+ "",
8826
+ `Context id: \`${slug}\``,
8827
+ "The following raw context was supplied by the task creator. Treat it",
8828
+ "as task-relevant background that may override generic coding instincts",
8829
+ "when it contains repo- or workflow-specific constraints.",
8830
+ "The same content is also materialized in the workspace as",
8831
+ "`/workspace/context-pack.md` and mirrored in `AGENTS.md` for",
8832
+ "repo-context discovery.",
8833
+ "",
8834
+ "<context>",
8835
+ content,
8836
+ "</context>"
8837
+ ].join("\n");
8838
+ }
8728
8839
  //#endregion
8729
8840
  //#region ../tasks/src/formats.ts
8730
8841
  /**
@@ -8748,6 +8859,7 @@ if (!FormatRegistry.Has("date-time")) FormatRegistry.Set("date-time", (v) => !Nu
8748
8859
  */
8749
8860
  var ContextBinding = Type$1.Union([
8750
8861
  Type$1.Literal("skill"),
8862
+ Type$1.Literal("context_inline"),
8751
8863
  Type$1.Literal("prompt_prefix"),
8752
8864
  Type$1.Literal("user_inline")
8753
8865
  ], { $id: "ContextBinding" });
@@ -8764,9 +8876,14 @@ var ContextBinding = Type$1.Union([
8764
8876
  * name under the runtime's skill discovery path. Must be
8765
8877
  * kebab-case-safe (alphanumeric + dashes/underscores).
8766
8878
  * - `binding` — how the bytes are delivered to the LLM (see above).
8767
- * - `content` — the actual bytes (UTF-8 text). Capped at 32 KiB per
8879
+ * - `content` — the actual bytes (UTF-8 text). Capped at 64 KiB per
8768
8880
  * entry; total per-task context bytes are bounded by the
8769
8881
  * soft `maxItems` cap and per-binding daemon limits.
8882
+ * Raised from 32 KiB in 2026-05 — protocol-heavy operator
8883
+ * skills (e.g. `.claude/skills/legreffier/SKILL.md`) ship
8884
+ * at ~35 KiB inline, and the original cap was sized for
8885
+ * short example skills, not the kind of skill the eval
8886
+ * substrate is dogfooded on (#943, #823).
8770
8887
  */
8771
8888
  var ContextRef = Type$1.Object({
8772
8889
  slug: Type$1.String({
@@ -8777,7 +8894,7 @@ var ContextRef = Type$1.Object({
8777
8894
  binding: ContextBinding,
8778
8895
  content: Type$1.String({
8779
8896
  minLength: 1,
8780
- maxLength: 32768
8897
+ maxLength: 65536
8781
8898
  })
8782
8899
  }, {
8783
8900
  $id: "ContextRef",
@@ -9341,61 +9458,33 @@ async function validateJudgePackInputAsync(input, ctx) {
9341
9458
  return errors;
9342
9459
  }
9343
9460
  //#endregion
9344
- //#region ../tasks/src/task-types/judge-eval-variant.ts
9461
+ //#region ../tasks/src/task-types/judge-eval-attempt.ts
9345
9462
  /**
9346
- * `judge_eval_variant` — score N variants of a `run_eval` scenario
9347
- * against a single rubric, in one pass, with per-variant subagent
9348
- * isolation.
9463
+ * `judge_eval_attempt` — score one completed `run_eval` attempt against a
9464
+ * hidden judge rubric.
9349
9465
  *
9350
9466
  * output_kind: judgment
9351
- * criteria: required (`successCriteria.rubric` — same envelope shape as
9352
- * `judge_pack` / `assess_brief`)
9353
- * references: not required at the input layer — `runTaskIds` already
9354
- * pin the targets being graded.
9355
- *
9356
- * Slice 2 of #943. The parent task carries the rubric and the list of
9357
- * variant `run_eval` task ids. The pi executor registers the generic
9358
- * `subagent` custom tool (#1087), and the parent LLM calls
9359
- * `subagent({ task, output_schema: 'judge_eval_variant_result' })` once
9360
- * per variant — each child session has fresh context, fetches the
9361
- * variant's accepted attempt output via `moltnet_get_task` /
9362
- * `moltnet_list_task_attempts`, and grades against the rubric.
9467
+ * criteria: required (`successCriteria.rubric`)
9468
+ * references: not required at the input layer — `targetTaskId` +
9469
+ * `targetAttemptN` pin the producer attempt being judged.
9363
9470
  *
9364
- * Reuses `JudgePackScore` from `judge_pack` for per-criterion scoring
9365
- * (Lane 1 binary via `llm_checklist`, Lane 2 graded via `llm_score`,
9366
- * deterministic_*) — the score shape is the same across judgment
9367
- * tasks; only the wrapping (per-variant grouping + deltas) differs.
9368
- *
9369
- * Cross-task input invariants — "all targets share the same
9370
- * correlation_id, all are `run_eval`, all are completed with an
9371
- * accepted attempt, all share byte-identical `input.successCriteria`"
9372
- * REQUIRE async DB lookups and live in `validateInputAsync` below,
9373
- * which the task service runs at create time (#1096 wiring). The
9374
- * TypeBox layer here only enforces shape: UUID format,
9375
- * minItems/maxItems, rubric presence + weight invariant.
9376
- */
9377
- var JUDGE_EVAL_VARIANT_TYPE = "judge_eval_variant";
9378
- var JudgeEvalVariantInput = Type$1.Object({
9379
- runTaskIds: Type$1.Array(Type$1.String({ format: "uuid" }), {
9380
- minItems: 2,
9381
- maxItems: 10
9382
- }),
9471
+ * This replaces the earlier parent/subagent `judge_eval_variant` design.
9472
+ * The unit of judgment is one producer attempt. Cross-variant deltas can be
9473
+ * computed later at read time from stored scores, rather than materialized as
9474
+ * their own task output.
9475
+ */
9476
+ var JUDGE_EVAL_ATTEMPT_TYPE = "judge_eval_attempt";
9477
+ var JudgeEvalAttemptInput = Type$1.Object({
9478
+ targetTaskId: Type$1.String({ format: "uuid" }),
9479
+ targetAttemptN: Type$1.Integer({ minimum: 1 }),
9383
9480
  successCriteria: SuccessCriteria
9384
9481
  }, {
9385
- $id: "JudgeEvalVariantInput",
9482
+ $id: "JudgeEvalAttemptInput",
9386
9483
  additionalProperties: false
9387
9484
  });
9388
- /**
9389
- * Per-variant grading. `scores[]` shape is identical to `JudgePackScore`
9390
- * (mode-aware: binary via `llm_checklist`, graded via `llm_score`,
9391
- * deterministic_*). Reuse the type rather than re-declare.
9392
- *
9393
- * This is also the **subagent output contract** — the parent's
9394
- * `subagent` tool resolves the contract name `judge_eval_variant_result`
9395
- * to this schema. See `agent-runtime`'s subagent contract registry.
9396
- */
9397
- var JudgeEvalVariantResult = Type$1.Object({
9398
- runTaskId: Type$1.String({ format: "uuid" }),
9485
+ var JudgeEvalAttemptOutput = Type$1.Object({
9486
+ targetTaskId: Type$1.String({ format: "uuid" }),
9487
+ targetAttemptN: Type$1.Integer({ minimum: 1 }),
9399
9488
  variantLabel: Type$1.String({
9400
9489
  minLength: 1,
9401
9490
  maxLength: 64,
@@ -9406,219 +9495,195 @@ var JudgeEvalVariantResult = Type$1.Object({
9406
9495
  minimum: 0,
9407
9496
  maximum: 1
9408
9497
  }),
9409
- verdict: Type$1.String({ minLength: 1 })
9410
- }, {
9411
- $id: "JudgeEvalVariantResult",
9412
- additionalProperties: false
9413
- });
9414
- var JudgeEvalVariantOutput = Type$1.Object({
9415
- results: Type$1.Array(JudgeEvalVariantResult, { minItems: 2 }),
9416
- deltas: Type$1.Optional(Type$1.Record(Type$1.String(), Type$1.Number({
9417
- minimum: -1,
9418
- maximum: 1
9419
- }))),
9498
+ verdict: Type$1.String({ minLength: 1 }),
9420
9499
  judgeModel: Type$1.Optional(Type$1.String({ minLength: 1 })),
9421
9500
  traceparent: Type$1.String({ minLength: 1 })
9422
9501
  }, {
9423
- $id: "JudgeEvalVariantOutput",
9502
+ $id: "JudgeEvalAttemptOutput",
9424
9503
  additionalProperties: false
9425
9504
  });
9426
- /**
9427
- * Synchronous input invariants beyond TypeBox shape: rubric must be
9428
- * present (already required by the schema, but the rubric body has
9429
- * its own per-criterion weight invariant) and the rubric's weights
9430
- * must sum to 1.
9431
- *
9432
- * Cross-task invariants (all targets are `run_eval`, all completed,
9433
- * share `correlation_id`, byte-identical `input.successCriteria`)
9434
- * are NOT checked here — they require async DB lookups against
9435
- * `runTaskIds` and live in `validateJudgeEvalVariantInputAsync`
9436
- * below, invoked by the task service at create time (#1096).
9437
- */
9438
- function validateJudgeEvalVariantInput(input) {
9505
+ function validateJudgeEvalAttemptInput(input) {
9439
9506
  const sc = input.successCriteria;
9440
- if (!sc) return "successCriteria is required for judge_eval_variant";
9441
- if (!sc.rubric) return "successCriteria.rubric is required for judge_eval_variant";
9507
+ if (!sc) return "successCriteria is required for judge_eval_attempt";
9508
+ if (!sc.rubric) return "successCriteria.rubric is required for judge_eval_attempt";
9442
9509
  return validateRubricWeights(sc.rubric);
9443
9510
  }
9444
- /**
9445
- * Output cross-field invariants the schema cannot express:
9446
- *
9447
- * 1. `results.length === input.runTaskIds.length` — every variant
9448
- * the imposer asked for must be graded. Partial grading
9449
- * invalidates cross-variant comparison; fail the whole task
9450
- * rather than silently report a subset.
9451
- *
9452
- * 2. `results[i].runTaskId === input.runTaskIds[i]` — order is
9453
- * load-bearing for downstream consumers (e.g. deltas keyed by
9454
- * adjacent pairs). Mismatch is an LLM bug; reject loudly.
9455
- *
9456
- * 3. Each `result.scores` follows the same `llm_checklist` rule
9457
- * `judge_pack` enforces (#999): if a score has an `assertions`
9458
- * array, the numeric score MUST be `1` iff every assertion
9459
- * passes. Inconsistent payloads pollute attestations.
9460
- *
9461
- * 4. Each `result.composite` MUST equal the rubric-weighted sum
9462
- * `Σ(weight_j × scores[j].score)`. The parent (and any subagent
9463
- * it delegated to) is supposed to compute this; surfacing a
9464
- * drift here catches LLMs that hand-wave the arithmetic.
9465
- *
9466
- * 5. Optional `deltas` keys MUST be of the form `"A - B"` where
9467
- * both `A` and `B` are variantLabels present in `results`.
9468
- * Values are not range-checked (any float in [-1, 1] is
9469
- * arithmetically possible).
9470
- */
9471
- function validateJudgeEvalVariantOutput(output, input) {
9511
+ function validateJudgeEvalAttemptOutput(output, input) {
9472
9512
  const out = output;
9473
9513
  const inp = input;
9474
9514
  if (inp) {
9475
- if (out.results.length !== inp.runTaskIds.length) return `results.length (${out.results.length}) does not match input.runTaskIds.length (${inp.runTaskIds.length}). Every variant must be graded; partial grading is rejected.`;
9476
- for (let i = 0; i < out.results.length; i++) if (out.results[i].runTaskId !== inp.runTaskIds[i]) return `results[${i}].runTaskId (${out.results[i].runTaskId}) does not match input.runTaskIds[${i}] (${inp.runTaskIds[i]}). Order must align with input for downstream delta computation.`;
9515
+ if (out.targetTaskId !== inp.targetTaskId) return `output.targetTaskId (${out.targetTaskId}) does not match input.targetTaskId (${inp.targetTaskId})`;
9516
+ if (out.targetAttemptN !== inp.targetAttemptN) return `output.targetAttemptN (${out.targetAttemptN}) does not match input.targetAttemptN (${inp.targetAttemptN})`;
9477
9517
  }
9478
- for (let r = 0; r < out.results.length; r++) {
9479
- const result = out.results[r];
9480
- for (let s = 0; s < result.scores.length; s++) {
9481
- const sc = result.scores[s];
9482
- if (!sc.assertions) continue;
9483
- const allPassed = sc.assertions.every((a) => a.passed);
9484
- const expected = allPassed ? 1 : 0;
9485
- if (sc.score !== expected) return `results[${r}].scores[${s}] (criterionId="${sc.criterionId}"): assertions ${allPassed ? "all pass" : "have at least one fail"} but score=${sc.score}. Score must be derived: 1 iff every assertion passes, else 0 (#999 llm_checklist rule).`;
9486
- }
9518
+ for (let s = 0; s < out.scores.length; s++) {
9519
+ const sc = out.scores[s];
9520
+ if (!sc.assertions) continue;
9521
+ const allPassed = sc.assertions.every((a) => a.passed);
9522
+ const expected = allPassed ? 1 : 0;
9523
+ if (sc.score !== expected) return `scores[${s}] (criterionId="${sc.criterionId}"): assertions ${allPassed ? "all pass" : "have at least one fail"} but score=${sc.score}. Score must be 1 iff every assertion passes, else 0.`;
9487
9524
  }
9488
9525
  if (inp?.successCriteria?.rubric) {
9489
9526
  const criteria = inp.successCriteria.rubric.criteria;
9490
9527
  const weightById = new Map(criteria.map((c) => [c.id, c.weight]));
9491
- for (let r = 0; r < out.results.length; r++) {
9492
- const result = out.results[r];
9493
- let sum = 0;
9494
- for (const sc of result.scores) {
9495
- const w = weightById.get(sc.criterionId);
9496
- if (w === void 0) return `results[${r}].scores: criterionId "${sc.criterionId}" is not in the input rubric (known: ${Array.from(weightById.keys()).join(", ")}). Score every rubric criterion exactly once; do not invent new ids.`;
9497
- sum += w * sc.score;
9498
- }
9499
- if (Math.abs(sum - result.composite) > .001) return `results[${r}].composite (${result.composite}) does not match Σ(weight × score) (${sum.toFixed(6)}). Composite must be the rubric-weighted sum of per-criterion scores (drift > 0.001).`;
9500
- }
9501
- }
9502
- if (out.deltas) {
9503
- const labels = new Set(out.results.map((r) => r.variantLabel));
9504
- for (const key of Object.keys(out.deltas)) {
9505
- const m = /^(.+?) - (.+)$/.exec(key);
9506
- if (!m) return `deltas key "${key}" is not of the form "<variantLabel-A> - <variantLabel-B>". Use a single space-hyphen-space separator between labels.`;
9507
- const [, a, b] = m;
9508
- if (!labels.has(a) || !labels.has(b)) return `deltas key "${key}" references variantLabel(s) not present in results: ${!labels.has(a) ? `"${a}" missing` : ""}${!labels.has(a) && !labels.has(b) ? ", " : ""}${!labels.has(b) ? `"${b}" missing` : ""}`;
9528
+ let sum = 0;
9529
+ for (const sc of out.scores) {
9530
+ const w = weightById.get(sc.criterionId);
9531
+ if (w === void 0) return `scores references unknown criterionId "${sc.criterionId}"`;
9532
+ sum += w * sc.score;
9509
9533
  }
9534
+ const rounded = Math.round(sum * 1e3) / 1e3;
9535
+ if (Math.abs(rounded - out.composite) > .001) return `composite (${out.composite}) does not match weighted rubric sum (${rounded})`;
9510
9536
  }
9511
9537
  return null;
9512
9538
  }
9513
- /**
9514
- * Local stable-stringify for cross-variant `successCriteria` byte-
9515
- * equality. Recursively sorts object keys; arrays preserve order
9516
- * (intentional — rubric criteria order is semantically meaningful).
9517
- * Mirrors the canonical-JSON shape `crypto-service` uses for CIDs,
9518
- * without taking on a crypto-service dep just for this comparison.
9519
- */
9520
- function stableStringify(value) {
9521
- if (value === null || typeof value !== "object") return JSON.stringify(value);
9522
- if (Array.isArray(value)) return "[" + value.map(stableStringify).join(",") + "]";
9523
- const obj = value;
9524
- return "{" + Object.keys(obj).sort().map((k) => JSON.stringify(k) + ":" + stableStringify(obj[k])).join(",") + "}";
9525
- }
9526
- /**
9527
- * Async preflight for `judge_eval_variant` (#1096 + #943):
9528
- *
9529
- * 1. Every `runTaskIds[i]` resolves to a task the caller can read.
9530
- * 2. Every resolved task is `taskType === 'run_eval'`.
9531
- * 3. Every resolved task is `status === 'completed'` with a
9532
- * non-null `acceptedAttemptN` — grading an unaccepted attempt
9533
- * races with re-attempts and pollutes the judge attestation.
9534
- * 4. Every resolved task shares a non-null `correlationId`, and all
9535
- * `correlationId`s are equal. Without this an imposer could
9536
- * fabricate a "variant set" by stapling unrelated runs together.
9537
- * 5. The shared `correlationId` is NOT already sealed. A previous
9538
- * judge_eval_variant against the same group is final; produce a
9539
- * fresh correlation_id for a new judging round rather than
9540
- * adding contradictory verdicts to a sealed group.
9541
- * 6. Every variant's `input.successCriteria` is byte-identical (via
9542
- * stable-stringify). Different rubrics across "variants" makes
9543
- * the comparison meaningless.
9544
- */
9545
- async function validateJudgeEvalVariantInputAsync(input, ctx) {
9546
- const { runTaskIds } = input;
9539
+ async function validateJudgeEvalAttemptInputAsync(input, ctx) {
9540
+ const inp = input;
9547
9541
  const errors = [];
9548
- const resolved = await Promise.all(runTaskIds.map((id) => ctx.resolveTask(id)));
9549
- let missingTargets = false;
9550
- const presentTargets = [];
9551
- for (let i = 0; i < runTaskIds.length; i++) {
9552
- const t = resolved[i];
9553
- if (!t) {
9554
- missingTargets = true;
9555
- errors.push({
9556
- field: `runTaskIds[${i}]`,
9557
- message: `runTaskIds[${i}]=${runTaskIds[i]} does not resolve to a task you can read`
9558
- });
9559
- continue;
9560
- }
9561
- presentTargets.push(t);
9562
- if (t.taskType !== "run_eval") errors.push({
9563
- field: `runTaskIds[${i}]`,
9564
- message: `runTaskIds[${i}]=${runTaskIds[i]} is a ${t.taskType}, not a run_eval`
9565
- });
9566
- if (t.status !== "completed" || t.acceptedAttemptN === null) errors.push({
9567
- field: `runTaskIds[${i}]`,
9568
- message: `runTaskIds[${i}]=${runTaskIds[i]} is not completed with an accepted attempt (status=${t.status}, acceptedAttemptN=${t.acceptedAttemptN})`
9569
- });
9570
- }
9571
- if (missingTargets || presentTargets.length === 0) return errors;
9572
- const correlationIds = new Set(presentTargets.map((t) => t.correlationId ?? "__null__"));
9573
- if (correlationIds.has("__null__")) errors.push({
9574
- field: "runTaskIds",
9575
- message: "one or more run_eval targets have no correlation_id; cannot group as variants"
9542
+ const target = await ctx.resolveTask(inp.targetTaskId);
9543
+ if (!target) return [{
9544
+ field: "targetTaskId",
9545
+ message: `targetTaskId=${inp.targetTaskId} does not resolve to a task you can read`
9546
+ }];
9547
+ if (target.taskType !== "run_eval") errors.push({
9548
+ field: "targetTaskId",
9549
+ message: `targetTaskId=${inp.targetTaskId} is a ${target.taskType}, not a run_eval`
9576
9550
  });
9577
- if (correlationIds.size > 1) errors.push({
9578
- field: "runTaskIds",
9579
- message: `run_eval targets span multiple correlation_ids (${Array.from(correlationIds).join(", ")}); variants must share one`
9551
+ if (target.status !== "completed" || target.acceptedAttemptN === null) errors.push({
9552
+ field: "targetTaskId",
9553
+ message: `targetTaskId=${inp.targetTaskId} is not completed with an accepted attempt (status=${target.status}, acceptedAttemptN=${target.acceptedAttemptN})`
9580
9554
  });
9581
- if (errors.length > 0) return errors;
9582
- const correlationId = presentTargets[0].correlationId;
9583
- if (!correlationId) return errors;
9584
- const seal = await ctx.findCorrelationSeal(correlationId);
9585
- if (seal) errors.push({
9586
- field: "runTaskIds",
9587
- message: `correlation_id ${correlationId} is already sealed by ${seal.sealedByTaskType}/${seal.sealedByTaskId} at ${seal.sealedAt}; use a fresh correlation_id for a new judging round`
9555
+ else if (target.acceptedAttemptN !== inp.targetAttemptN) errors.push({
9556
+ field: "targetAttemptN",
9557
+ message: `targetAttemptN=${inp.targetAttemptN} does not match the producer's acceptedAttemptN=${target.acceptedAttemptN}`
9558
+ });
9559
+ if (!target.correlationId) errors.push({
9560
+ field: "targetTaskId",
9561
+ message: "target run_eval has no correlation_id; cannot enforce duplicate-judge protection"
9562
+ });
9563
+ if (errors.length > 0 || !target.correlationId) return errors;
9564
+ const rubric = inp.successCriteria.rubric;
9565
+ const duplicate = (await ctx.listTasksByCorrelation(target.correlationId)).find((task) => {
9566
+ if (task.taskType !== "judge_eval_attempt") return false;
9567
+ if (task.status === "failed" || task.status === "cancelled" || task.status === "expired") return false;
9568
+ const existing = task.input;
9569
+ const existingRubric = existing.successCriteria?.rubric;
9570
+ return existing.targetTaskId === inp.targetTaskId && existing.targetAttemptN === inp.targetAttemptN && existingRubric?.rubricId === rubric?.rubricId && existingRubric?.version === rubric?.version;
9571
+ });
9572
+ if (duplicate) errors.push({
9573
+ field: "targetTaskId",
9574
+ message: `judge task ${duplicate.id} already exists for (${inp.targetTaskId}, attempt ${inp.targetAttemptN}, rubric ${rubric?.rubricId}@${rubric?.version})`
9588
9575
  });
9589
- const first = stableStringify(presentTargets[0].input.successCriteria);
9590
- for (let i = 1; i < presentTargets.length; i++) if (stableStringify(presentTargets[i].input.successCriteria) !== first) {
9591
- errors.push({
9592
- field: `runTaskIds[${i}]`,
9593
- message: `runTaskIds[${i}] has a different input.successCriteria than runTaskIds[0]; all variants must share the rubric and gates`
9594
- });
9595
- break;
9596
- }
9597
9576
  return errors;
9598
9577
  }
9599
- /**
9600
- * Side effect emitted on successful `judge_eval_variant` create:
9601
- * seal the shared correlation_id atomically with the insert. The
9602
- * task service applies the seal in the same transaction; a
9603
- * concurrent second `judge_eval_variant` against the same group
9604
- * loses the race and is rejected with a clean conflict error.
9605
- *
9606
- * The seal applies to the SHARED correlation_id of the targets —
9607
- * NOT to the judge task's own correlationId (which is typically
9608
- * null or distinct). The task service derives the correlationId
9609
- * for the effect from the resolved targets, not from the judge
9610
- * task row.
9611
- */
9612
- async function onCreateJudgeEvalVariant(input, ctx) {
9613
- const { runTaskIds } = input;
9614
- const first = await ctx.resolveTask(runTaskIds[0]);
9615
- if (!first?.correlationId) return [];
9578
+ async function onCreateJudgeEvalAttempt(input, _ctx) {
9579
+ const judge = input;
9580
+ const rubric = judge.successCriteria.rubric;
9581
+ if (!rubric) return [];
9616
9582
  return [{
9617
- kind: "sealCorrelation",
9618
- correlationId: first.correlationId
9583
+ kind: "guardTaskUniqueness",
9584
+ taskType: JUDGE_EVAL_ATTEMPT_TYPE,
9585
+ lockKey: [
9586
+ JUDGE_EVAL_ATTEMPT_TYPE,
9587
+ judge.targetTaskId,
9588
+ String(judge.targetAttemptN),
9589
+ rubric.rubricId,
9590
+ rubric.version
9591
+ ].join(":"),
9592
+ inputMatches: [
9593
+ {
9594
+ path: ["targetTaskId"],
9595
+ value: judge.targetTaskId
9596
+ },
9597
+ {
9598
+ path: ["targetAttemptN"],
9599
+ value: judge.targetAttemptN
9600
+ },
9601
+ {
9602
+ path: [
9603
+ "successCriteria",
9604
+ "rubric",
9605
+ "rubricId"
9606
+ ],
9607
+ value: rubric.rubricId
9608
+ },
9609
+ {
9610
+ path: [
9611
+ "successCriteria",
9612
+ "rubric",
9613
+ "version"
9614
+ ],
9615
+ value: rubric.version
9616
+ }
9617
+ ]
9619
9618
  }];
9620
9619
  }
9621
9620
  //#endregion
9621
+ //#region ../tasks/src/task-types/pr-review.ts
9622
+ var PR_REVIEW_TYPE = "pr_review";
9623
+ var PrReviewSubject = Type$1.Object({
9624
+ title: Type$1.String({ minLength: 1 }),
9625
+ summary: Type$1.String({ minLength: 1 }),
9626
+ resourceUrls: Type$1.Optional(Type$1.Array(Type$1.String({ minLength: 1 }))),
9627
+ inspectionHints: Type$1.Optional(Type$1.Array(Type$1.String({ minLength: 1 })))
9628
+ }, {
9629
+ $id: "PrReviewSubject",
9630
+ additionalProperties: false
9631
+ });
9632
+ var PrReviewInput = Type$1.Object({
9633
+ subject: PrReviewSubject,
9634
+ taskPrompt: Type$1.Optional(Type$1.String({ minLength: 1 })),
9635
+ successCriteria: SuccessCriteria
9636
+ }, {
9637
+ $id: "PrReviewInput",
9638
+ additionalProperties: false
9639
+ });
9640
+ var PrReviewScore = Type$1.Object({
9641
+ criterionId: Type$1.String({ minLength: 1 }),
9642
+ score: Type$1.Union([Type$1.Literal(0), Type$1.Literal(1)]),
9643
+ rationale: Type$1.String({ minLength: 1 })
9644
+ }, {
9645
+ $id: "PrReviewScore",
9646
+ additionalProperties: false
9647
+ });
9648
+ var PrReviewOutput = Type$1.Object({
9649
+ scores: Type$1.Array(PrReviewScore, { minItems: 1 }),
9650
+ composite: Type$1.Number({
9651
+ minimum: 0,
9652
+ maximum: 1
9653
+ }),
9654
+ verdict: Type$1.String({ minLength: 1 })
9655
+ }, {
9656
+ $id: "PrReviewOutput",
9657
+ additionalProperties: false
9658
+ });
9659
+ function requireBooleanRubric(rubric) {
9660
+ for (const criterion of rubric.criteria) if (criterion.scoring !== "boolean") return `pr_review requires boolean scoring for every rubric criterion; criterion "${criterion.id}" uses "${criterion.scoring}"`;
9661
+ return null;
9662
+ }
9663
+ function validatePrReviewInput(input) {
9664
+ const sc = input.successCriteria;
9665
+ if (!sc) return "successCriteria is required for judgment tasks";
9666
+ if (!sc.rubric) return "successCriteria.rubric is required for judgment tasks";
9667
+ return validateRubricWeights(sc.rubric) ?? requireBooleanRubric(sc.rubric);
9668
+ }
9669
+ function validatePrReviewOutput(output, input) {
9670
+ if (!input) return null;
9671
+ const scores = output.scores;
9672
+ const rubric = input.successCriteria.rubric;
9673
+ if (!rubric) return null;
9674
+ if (scores.length !== rubric.criteria.length) return `scores length ${scores.length} does not match rubric criteria length ${rubric.criteria.length}`;
9675
+ let composite = 0;
9676
+ for (let i = 0; i < rubric.criteria.length; i++) {
9677
+ const criterion = rubric.criteria[i];
9678
+ const score = scores[i];
9679
+ if (score.criterionId !== criterion.id) return `scores[${i}] has criterionId "${score.criterionId}" but rubric expects "${criterion.id}" in that position`;
9680
+ composite += criterion.weight * score.score;
9681
+ }
9682
+ const claimed = output.composite;
9683
+ if (Math.abs(claimed - composite) > 1e-6) return `composite ${claimed} does not match weighted sum ${composite.toFixed(6)}`;
9684
+ return null;
9685
+ }
9686
+ //#endregion
9622
9687
  //#region ../tasks/src/task-types/render-pack.ts
9623
9688
  /**
9624
9689
  * `render_pack` — turn a context pack into a signed rendered artefact.
@@ -9673,14 +9738,43 @@ async function validateRenderPackInputAsync(input, ctx) {
9673
9738
  //#region ../tasks/src/task-types/run-eval.ts
9674
9739
  /**
9675
9740
  * `run_eval` — execute a scenario prompt under a named variant for
9676
- * later cross-variant grading by `judge_eval_variant` (Slice 2).
9741
+ * later per-attempt grading by `judge_eval_attempt` tasks.
9677
9742
  *
9678
9743
  * output_kind: artifact
9679
- * criteria: optional (when set, output.verification is required —
9680
- * producer self-assessment; the judge is the binding evaluator)
9744
+ * criteria: optional producer-only checks (when set,
9745
+ * output.verification is required — the judge rubric remains hidden
9746
+ * on downstream `judge_eval_attempt` tasks)
9681
9747
  * references: not required (scenario lives entirely in input)
9682
9748
  */
9683
9749
  var RUN_EVAL_TYPE = "run_eval";
9750
/** `run_eval` execution mode: one of the literals "vitro" or "vivo". */
var RunEvalMode = Type$1.Union(["vitro", "vivo"].map((mode) => Type$1.Literal(mode)), { $id: "RunEvalMode" });
/** `run_eval` workspace kind: "none", "shared_mount" or "dedicated_worktree". */
var RunEvalWorkspace = Type$1.Union(["none", "shared_mount", "dedicated_worktree"].map((kind) => Type$1.Literal(kind)), { $id: "RunEvalWorkspace" });
/** Closed object schema pairing a `mode` with a `workspace`. */
var RunEvalExecution = Type$1.Object({
	mode: RunEvalMode,
	workspace: RunEvalWorkspace
}, {
	$id: "RunEvalExecution",
	additionalProperties: false
});
/**
 * Success criteria visible to the `run_eval` producer.
 *
 * The schema is closed (`additionalProperties: false`) and has no `rubric`
 * property, so the variant runner cannot be handed the downstream judge's
 * answer key. The generic process / structure checks are reused from the
 * shared SuccessCriteria schema: `gates`, `assertions` and `sideEffects`,
 * each optional.
 */
var RunEvalSuccessCriteria = Type$1.Object({
	version: Type$1.Literal(1),
	...Object.fromEntries(["gates", "assertions", "sideEffects"].map((key) => [key, Type$1.Optional(SuccessCriteria.properties[key])]))
}, {
	$id: "RunEvalSuccessCriteria",
	additionalProperties: false
});
9684
9778
  var RunEvalInput = Type$1.Object({
9685
9779
  scenario: Type$1.Object({
9686
9780
  prompt: Type$1.String({ minLength: 1 }),
@@ -9690,8 +9784,9 @@ var RunEvalInput = Type$1.Object({
9690
9784
  minLength: 1,
9691
9785
  maxLength: 64
9692
9786
  }),
9787
+ execution: RunEvalExecution,
9693
9788
  context: TaskContext,
9694
- successCriteria: Type$1.Optional(SuccessCriteria)
9789
+ successCriteria: Type$1.Optional(RunEvalSuccessCriteria)
9695
9790
  }, {
9696
9791
  $id: "RunEvalInput",
9697
9792
  additionalProperties: false
@@ -9719,8 +9814,8 @@ var RunEvalOutput = Type$1.Object({
9719
9814
/**
 * Enforces that `output.verification` is present exactly when
 * `input.successCriteria` is set.
 *
 * A property counts as "set" when its owner is neither null nor undefined
 * and the property itself is not `undefined` (so `null` counts as set).
 *
 * @param {{ verification?: unknown } | null | undefined} output
 * @param {{ successCriteria?: unknown } | null | undefined} input
 * @returns {string | null} An explanation when the two disagree, else `null`.
 */
function validateRunEvalOutput(output, input) {
	const criteriaSet = input?.successCriteria !== void 0;
	const verificationGiven = output?.verification !== void 0;
	if (criteriaSet === verificationGiven) return null;
	return criteriaSet
		? "output.verification is required because input.successCriteria is set; the producer LLM must self-assess against the producer checks"
		: "output.verification was supplied but input.successCriteria is unset; omit verification when there are no producer checks to assess against";
}
9726
9821
  //#endregion
@@ -9786,6 +9881,18 @@ var BUILT_IN_TASK_TYPES = {
9786
9881
  validateInput: validateJudgmentInput,
9787
9882
  validateInputAsync: validateAssessBriefInputAsync
9788
9883
  },
9884
+ [PR_REVIEW_TYPE]: {
9885
+ name: PR_REVIEW_TYPE,
9886
+ inputSchema: PrReviewInput,
9887
+ outputSchema: PrReviewOutput,
9888
+ outputKind: "judgment",
9889
+ workspaceMode: "dedicated_worktree",
9890
+ workspaceScope: "attempt",
9891
+ sessionScope: "none",
9892
+ requiresReferences: false,
9893
+ validateInput: validatePrReviewInput,
9894
+ validateOutput: validatePrReviewOutput
9895
+ },
9789
9896
  [CURATE_PACK_TYPE]: {
9790
9897
  name: CURATE_PACK_TYPE,
9791
9898
  inputSchema: CuratePackInput,
@@ -9824,24 +9931,24 @@ var BUILT_IN_TASK_TYPES = {
9824
9931
  inputSchema: RunEvalInput,
9825
9932
  outputSchema: RunEvalOutput,
9826
9933
  outputKind: "artifact",
9827
- workspaceScope: "attempt",
9934
+ resumable: true,
9935
+ workspaceScope: "session",
9828
9936
  sessionScope: "custom",
9829
9937
  requiresReferences: false,
9830
9938
  validateOutput: validateRunEvalOutput
9831
9939
  },
9832
- [JUDGE_EVAL_VARIANT_TYPE]: {
9833
- name: JUDGE_EVAL_VARIANT_TYPE,
9834
- inputSchema: JudgeEvalVariantInput,
9835
- outputSchema: JudgeEvalVariantOutput,
9940
+ [JUDGE_EVAL_ATTEMPT_TYPE]: {
9941
+ name: JUDGE_EVAL_ATTEMPT_TYPE,
9942
+ inputSchema: JudgeEvalAttemptInput,
9943
+ outputSchema: JudgeEvalAttemptOutput,
9836
9944
  outputKind: "judgment",
9837
9945
  workspaceScope: "attempt",
9838
- sessionScope: "custom",
9946
+ sessionScope: "none",
9839
9947
  requiresReferences: false,
9840
- validateInput: validateJudgeEvalVariantInput,
9841
- validateOutput: validateJudgeEvalVariantOutput,
9842
- validateInputAsync: validateJudgeEvalVariantInputAsync,
9843
- onCreate: onCreateJudgeEvalVariant,
9844
- usesSubagents: true
9948
+ validateInput: validateJudgeEvalAttemptInput,
9949
+ validateOutput: validateJudgeEvalAttemptOutput,
9950
+ validateInputAsync: validateJudgeEvalAttemptInputAsync,
9951
+ onCreate: onCreateJudgeEvalAttempt
9845
9952
  }
9846
9953
  };
9847
9954
  //#endregion
@@ -10205,20 +10312,16 @@ function buildFinalOutputBlock(opts) {
10205
10312
  "## Final output (read this carefully)",
10206
10313
  "",
10207
10314
  `Your VERY LAST action in this conversation MUST report the structured`,
10208
- `output matching \`${outputSchemaName}\`. Two ways to do it, in order of`,
10209
- `preference:`,
10315
+ `output matching \`${outputSchemaName}\`.`,
10210
10316
  "",
10211
- `1. **Preferred — call \`${submitTool}\` exactly once** with the payload.`,
10212
- ` The runtime captures the validated arguments and ends the session.`,
10213
- ` If the tool is registered, prefer this path.`,
10214
- `2. **Fallback** if the submit tool is unavailable, your very last`,
10215
- ` assistant message MUST be a single JSON object matching`,
10216
- ` \`${outputSchemaName}\`. No prose before or after. No code fences.`,
10217
- ` No "ok" or "done". The runtime parses the last balanced top-level`,
10218
- ` JSON object as the output.`,
10317
+ `Call \`${submitTool}\` exactly once with the payload.`,
10318
+ `The runtime captures the validated arguments and ends the session.`,
10319
+ `Do NOT emit the output as plain assistant text. Do NOT rely on a`,
10320
+ `JSON-in-message fallback. If you do not call \`${submitTool}\`, the`,
10321
+ `attempt fails even if the underlying work succeeded.`,
10219
10322
  "",
10220
- `Failing to report structured output as the very last action means the`,
10221
- `attempt is marked failed even if the underlying work succeeded.`,
10323
+ `Your final assistant text before that tool call may explain your work,`,
10324
+ `but the submit-tool call itself must be your VERY LAST action.`,
10222
10325
  "",
10223
10326
  `Output shape:`,
10224
10327
  "",
@@ -10233,6 +10336,20 @@ function buildFinalOutputBlock(opts) {
10233
10336
  return lines.join("\n");
10234
10337
  }
10235
10338
  //#endregion
10339
+ //#region ../agent-runtime/src/prompts/rubric-common.ts
10340
/**
 * Renders rubric criteria as a 1-based numbered Markdown list, one line per
 * criterion showing its id, weight, scoring mode and description.
 *
 * @param {{ criteria: Array<{ id: string, weight: number, scoring: string, description: string }> }} rubric
 * @returns {string} The list lines joined with "\n" (empty string for no criteria).
 */
function renderRubricCriteriaList(rubric) {
	const formatCriterion = ({ id, weight, scoring, description }, index) => `${index + 1}. **${id}** (weight ${weight}, scoring: \`${scoring}\`) — ${description}`;
	return rubric.criteria.map(formatCriterion).join("\n");
}
10343
/**
 * Renders the optional rubric preamble as a "### Rubric preamble" section.
 *
 * @param {{ preamble?: string }} rubric
 * @returns {string | null} The heading, a blank line, the preamble and a
 *   trailing newline; `null` when the preamble is missing or empty.
 */
function renderRubricPreambleSection(rubric) {
	const { preamble } = rubric;
	return preamble ? `### Rubric preamble\n\n${preamble}\n` : null;
}
10352
+ //#endregion
10236
10353
  //#region ../agent-runtime/src/prompts/assess-brief.ts
10237
10354
  /**
10238
10355
  * Build the first user-message prompt for an `assess_brief` judge attempt.
@@ -10258,13 +10375,8 @@ function buildFinalOutputBlock(opts) {
10258
10375
  */
10259
10376
  function buildAssessBriefUserPrompt(input, ctx) {
10260
10377
  const rubric = input.successCriteria.rubric;
10261
- const criteriaList = rubric.criteria.map((c, i) => `${i + 1}. **${c.id}** (weight ${c.weight}, scoring: \`${c.scoring}\`) — ${c.description}`).join("\n");
10262
- const preambleSection = rubric.preamble ? [
10263
- "### Rubric preamble",
10264
- "",
10265
- rubric.preamble,
10266
- ""
10267
- ].join("\n") : "";
10378
+ const criteriaList = renderRubricCriteriaList(rubric);
10379
+ const preambleSection = renderRubricPreambleSection(rubric) ?? "";
10268
10380
  const workspaceSection = ctx.workspace?.mode === "dedicated_worktree" ? [
10269
10381
  "### Workspace",
10270
10382
  "",
@@ -10347,21 +10459,30 @@ function buildAssessBriefUserPrompt(input, ctx) {
10347
10459
  }
10348
10460
  //#endregion
10349
10461
  //#region ../agent-runtime/src/prompts/self-verification.ts
10350
- function buildSelfVerificationBlock(taskId) {
10462
+ function buildSelfVerificationBlock(taskId, criteriaField = "successCriteria") {
10351
10463
  return [
10352
10464
  "## Self-verification",
10353
10465
  "",
10354
- `Call \`moltnet_get_task\` with task id \`${taskId}\` and read \`input.successCriteria\`.`,
10466
+ `If \`input.${criteriaField}\` is set on this task, your final output MUST`,
10467
+ "include a `verification` block. **The runtime/server rejects task",
10468
+ `submission without \`verification\` when \`${criteriaField}\` is present**`,
10469
+ "— the request fails validation and the attempt is discarded, even if the",
10470
+ "underlying work succeeded. Do not call the submit tool until you have",
10471
+ "computed the verification payload.",
10472
+ "",
10473
+ `Call \`moltnet_get_task\` with task id \`${taskId}\` and read \`input.${criteriaField}\`.`,
10355
10474
  "",
10356
- "- If `input.successCriteria` is **absent**, omit `verification` from your",
10475
+ `- If \`input.${criteriaField}\` is **absent**, omit \`verification\` from your`,
10357
10476
  " final output entirely.",
10358
- "- If `input.successCriteria` is **present**, you MUST include a",
10359
- " `verification` block in your final output. Evaluate every applicable",
10477
+ `- If \`input.${criteriaField}\` is **present**, evaluate every applicable`,
10360
10478
  " item — `gates`, `assertions`, `rubric` criteria, `sideEffects` — against",
10361
10479
  " your produced work and emit one result per id. Be honest: a `fail` with",
10362
10480
  " a one-line reason is more useful than a false `pass`. Use `skip` (with a",
10363
10481
  " `detail`) when you genuinely could not determine a result. Compute",
10364
10482
  " `passed = results.every(r => r.status !== 'fail')`.",
10483
+ "- `verification` MUST be a JSON object. Never send a string, markdown",
10484
+ " block, null, or an empty placeholder. The submit tool expects an object",
10485
+ " with `inputCid`, `results`, and `passed` fields.",
10365
10486
  "",
10366
10487
  "Verification shape:",
10367
10488
  "",
@@ -10375,6 +10496,23 @@ function buildSelfVerificationBlock(taskId) {
10375
10496
  " \"passed\": <boolean>",
10376
10497
  "}",
10377
10498
  "```",
10499
+ "",
10500
+ "Minimal valid example:",
10501
+ "",
10502
+ "```json",
10503
+ "{",
10504
+ " \"inputCid\": \"<task inputCid>\",",
10505
+ " \"results\": [",
10506
+ " {",
10507
+ " \"id\": \"<criterion id>\",",
10508
+ " \"kind\": \"rubric\",",
10509
+ " \"status\": \"pass\",",
10510
+ " \"detail\": \"one-line reason\"",
10511
+ " }",
10512
+ " ],",
10513
+ " \"passed\": true",
10514
+ "}",
10515
+ "```",
10378
10516
  ""
10379
10517
  ].join("\n");
10380
10518
  }
@@ -10625,69 +10763,62 @@ function buildFulfillBriefUserPrompt(input, ctx) {
10625
10763
  ].filter(Boolean).join("\n");
10626
10764
  }
10627
10765
  //#endregion
10628
- //#region ../agent-runtime/src/prompts/judge-eval-variant.ts
10629
- /**
10630
- * Build the first user-message prompt for a `judge_eval_variant` task
10631
- * (#943 Slice 2).
10632
- *
10633
- * The parent agent's job is **fan-out-and-collect**: for each
10634
- * `runTaskIds[i]`, spawn an isolated subagent via the `subagent` custom
10635
- * tool (#1087), have it grade that variant against the shared rubric,
10636
- * and collect each subagent's structured `judge_eval_variant_result`
10637
- * payload. The parent does NOT grade itself; it composes the per-
10638
- * variant results into the final `judge_eval_variant` output (results
10639
- * array + optional deltas + verdicts).
10640
- *
10641
- * Isolation is the point: each variant gets a fresh subagent session
10642
- * with no carryover context from sibling variants, so per-variant
10643
- * grading is independent. Cost is bounded by `maxItems: 10` on
10644
- * runTaskIds.
10645
- */
10646
- function buildJudgeEvalVariantUserPrompt(input, ctx) {
10647
- const { runTaskIds, successCriteria } = input;
10648
- const rubric = successCriteria.rubric;
10649
- if (!rubric) throw new Error("judge_eval_variant requires successCriteria.rubric — none present");
10766
+ //#region ../agent-runtime/src/prompts/judge-eval-attempt.ts
10767
+ function buildJudgeEvalAttemptUserPrompt(input, ctx) {
10768
+ const rubric = input.successCriteria.rubric;
10769
+ if (!rubric) throw new Error("judge_eval_attempt requires successCriteria.rubric — none present");
10650
10770
  const escapeCell = (s) => s.replace(/\\/g, "\\\\").replace(/\|/g, "\\|").replace(/\r?\n/g, " ");
10651
10771
  const criteriaTable = rubric.criteria.map((c) => `| \`${c.id}\` | ${c.weight.toFixed(3)} | ${c.scoring} | ${escapeCell(c.description)} |`).join("\n");
10652
- const targetsBlock = runTaskIds.map((id, i) => `${i + 1}. \`${id}\``).join("\n");
10653
10772
  const finalOutputBlock = buildFinalOutputBlock({
10654
- taskType: "judge_eval_variant",
10655
- outputSchemaName: "JudgeEvalVariantOutput",
10773
+ taskType: "judge_eval_attempt",
10774
+ outputSchemaName: "JudgeEvalAttemptOutput",
10656
10775
  shapeSketch: [
10657
10776
  "{",
10658
- " \"results\": [",
10659
- " {",
10660
- " \"runTaskId\": \"<runTaskIds[i]>\",",
10661
- " \"variantLabel\": \"<from variant input>\",",
10662
- " \"scores\": [ { \"criterionId\": \"...\", \"score\": 0..1, \"rationale\": \"...\", \"assertions\": [...]? } ],",
10663
- " \"composite\": <Σ(weight × score), 0..1>,",
10664
- " \"verdict\": \"<1-3 sentences>\"",
10665
- " },",
10666
- " ...one entry per runTaskIds[i], same order",
10667
- " ],",
10668
- " \"deltas\": { \"<labelA> - <labelB>\": <composite(A) - composite(B)> }, // optional",
10777
+ ` "targetTaskId": "${input.targetTaskId}",`,
10778
+ ` "targetAttemptN": ${input.targetAttemptN},`,
10779
+ " \"variantLabel\": \"<from producer input>\",",
10780
+ " \"scores\": [ { \"criterionId\": \"...\", \"score\": 0..1, \"rationale\": \"...\", \"assertions\": [...]? } ],",
10781
+ " \"composite\": <Σ(weight × score), 0..1>,",
10782
+ " \"verdict\": \"<1-3 sentences>\",",
10669
10783
  " \"judgeModel\": \"<id>\", // optional",
10670
10784
  " \"traceparent\": \"<from claim>\"",
10671
10785
  "}"
10672
10786
  ].join("\n")
10673
10787
  });
10788
+ const workspaceSection = ctx.workspace?.attached === true ? [
10789
+ "### Workspace",
10790
+ "",
10791
+ "Your current workspace is already attached to the producer attempt",
10792
+ "you are judging. Inspect files directly from the current workspace",
10793
+ "root instead of inventing synthetic `artifact_<taskId>` paths.",
10794
+ "If the accepted attempt output lists `artifacts[].path`, treat those",
10795
+ "paths as relative to the current workspace root unless the output",
10796
+ "explicitly says otherwise.",
10797
+ ctx.workspace.mode === "dedicated_worktree" ? `This attachment is a dedicated producer worktree${ctx.workspace.branch ? ` on branch \`${ctx.workspace.branch}\`` : ""}.` : ctx.workspace.mode === "scratch_mount" ? "This attachment is the producer scratch workspace mounted with shadow writes for safe inspection." : "This attachment is the producer shared workspace mounted with shadow writes for safe inspection.",
10798
+ ""
10799
+ ].join("\n") : "";
10674
10800
  return [
10675
- "# Judge Eval Variants\n",
10676
- `You are grading ${runTaskIds.length} variants of a single run_eval scenario`,
10677
- "against ONE shared rubric. Your job is fan-out-and-collect you do not",
10678
- "grade yourself.",
10801
+ "# Judge Eval Attempt\n",
10802
+ "You are grading one accepted `run_eval` producer attempt against a hidden",
10803
+ "judge rubric. Do not delegate to subagents. Grade in this session only.",
10679
10804
  "",
10680
10805
  `Task id: \`${ctx.taskId}\``,
10681
10806
  `Diary: \`${ctx.diaryId}\``,
10807
+ `Producer task: \`${input.targetTaskId}\``,
10808
+ `Producer attempt: \`${input.targetAttemptN}\``,
10682
10809
  "",
10683
- "### Targets (variants to grade)",
10684
- "",
10685
- targetsBlock,
10810
+ "### Evidence gathering",
10686
10811
  "",
10687
- "Each target is a completed `run_eval` task in the same correlation group.",
10688
- "Read its accepted attempt via `moltnet_get_task` / `moltnet_list_task_attempts`",
10689
- "to see the producer's output before grading.",
10812
+ `1. Call \`moltnet_get_task\` with taskId=\`${input.targetTaskId}\`.`,
10813
+ `2. Call \`moltnet_list_task_attempts\` with taskId=\`${input.targetTaskId}\` and inspect the accepted attempt matching \`${input.targetAttemptN}\`.`,
10814
+ `3. Call \`moltnet_list_task_messages\` with taskId=\`${input.targetTaskId}\`, attemptN=\`${input.targetAttemptN}\` to inspect the producer's turn-by-turn behavior.`,
10815
+ "4. Use the accepted attempt output, attempt messages, and any accessible",
10816
+ " artifacts or workspace evidence available in your environment.",
10817
+ " Read artifact files from the mounted producer workspace when present;",
10818
+ " do not assume detached `artifact_<taskId>` directories exist.",
10819
+ "5. Score strictly against the rubric below.",
10690
10820
  "",
10821
+ workspaceSection,
10691
10822
  "### Rubric",
10692
10823
  "",
10693
10824
  rubric.preamble ? `${rubric.preamble}\n` : "",
@@ -10695,34 +10826,10 @@ function buildJudgeEvalVariantUserPrompt(input, ctx) {
10695
10826
  "| --- | --- | --- | --- |",
10696
10827
  criteriaTable,
10697
10828
  "",
10698
- "### How to grade",
10699
- "",
10700
- "For EACH `runTaskIds[i]`:",
10701
- "",
10702
- "1. Call the `subagent` custom tool with:",
10703
- " - `task`: a brief instructing the subagent to grade ONLY that variant",
10704
- " against the rubric above; include the target task id and the rubric",
10705
- " verbatim. The subagent has the same MoltNet tools and can fetch the",
10706
- " accepted attempt output independently.",
10707
- " - `output_schema`: `\"judge_eval_variant_result\"`",
10708
- "2. Receive the subagent's structured `judge_eval_variant_result` payload.",
10709
- "3. Append it to your `results[]` array, **in the same order as input.runTaskIds**.",
10710
- "",
10711
- "Do NOT score any variant in your own session. The whole point of the",
10712
- "subagent fan-out is per-variant context isolation — grading two variants",
10713
- "back-to-back in one session lets the second be biased by the first.",
10714
- "",
10715
10829
  "### Composite arithmetic",
10716
10830
  "",
10717
- "Each `composite` MUST equal `Σ(criterion.weight × score)` over the rubric",
10718
- "criteria. Drift > 0.001 is rejected. Subagents are instructed to compute it",
10719
- "themselves; double-check before assembling the final output.",
10720
- "",
10721
- "### Deltas (optional)",
10722
- "",
10723
- "If useful, populate `deltas` with pairwise composite differences keyed by",
10724
- "`\"<variantLabel-A> - <variantLabel-B>\"` (single space-hyphen-space). Both",
10725
- "labels must appear in `results`. Omit `deltas` entirely if not used.",
10831
+ "Your `composite` MUST equal `Σ(criterion.weight × score)` over the rubric",
10832
+ "criteria. Drift > 0.001 is rejected.",
10726
10833
  "",
10727
10834
  finalOutputBlock
10728
10835
  ].filter((s) => s !== "").join("\n");
@@ -10732,13 +10839,8 @@ function buildJudgeEvalVariantUserPrompt(input, ctx) {
10732
10839
  function buildJudgePackUserPrompt(input, ctx) {
10733
10840
  const { renderedPackId, sourcePackId, successCriteria } = input;
10734
10841
  const rubric = successCriteria.rubric;
10735
- const criteriaList = rubric.criteria.map((c, i) => `${i + 1}. **${c.id}** (weight ${c.weight}, scoring: \`${c.scoring}\`) — ${c.description}`).join("\n");
10736
- const preambleSection = rubric.preamble ? [
10737
- "### Rubric preamble",
10738
- "",
10739
- rubric.preamble,
10740
- ""
10741
- ].join("\n") : null;
10842
+ const criteriaList = renderRubricCriteriaList(rubric);
10843
+ const preambleSection = renderRubricPreambleSection(rubric);
10742
10844
  return [
10743
10845
  "# Judge Pack Agent",
10744
10846
  "",
@@ -10854,6 +10956,112 @@ function buildJudgePackUserPrompt(input, ctx) {
10854
10956
  ].filter((l) => l !== null).join("\n");
10855
10957
  }
10856
10958
  //#endregion
10959
+ //#region ../agent-runtime/src/prompts/pr-review.ts
10960
/**
 * Builds the first user-message prompt for a `pr_review` task.
 *
 * The prompt is assembled from top-level parts joined with "\n". Parts that
 * are empty strings or `null` are removed before joining, so top-level parts
 * are separated by a single newline. Blank lines appear only inside the
 * pre-joined optional sections (resources, inspection hints, workspace,
 * task-specific instructions, rubric preamble).
 *
 * @param {object} input `pr_review` task input; reads `successCriteria.rubric`,
 *   `taskPrompt`, `subject.title`, `subject.summary`, `subject.resourceUrls`
 *   and `subject.inspectionHints`.
 * @param {{ diaryId: string, taskId: string, workspace?: { mode?: string, branch?: string } }} ctx
 * @returns {string} The rendered prompt.
 */
function buildPrReviewUserPrompt(input, ctx) {
	const { rubric } = input.successCriteria;
	const criteriaList = renderRubricCriteriaList(rubric);
	const preambleSection = renderRubricPreambleSection(rubric);
	// Heading + bullet list, or "" when there is nothing to list.
	const renderBulletSection = (heading, items) => {
		if (!items || !(items.length > 0)) return "";
		return [
			heading,
			"",
			...items.map((item) => `- ${item}`),
			""
		].join("\n");
	};
	let taskPromptSection = "";
	if (input.taskPrompt) taskPromptSection = `## Task-specific instructions\n\n${input.taskPrompt}\n`;
	const { subject } = input;
	const resourceSection = renderBulletSection("### Resources", subject.resourceUrls);
	const hintsSection = renderBulletSection("### Inspection hints", subject.inspectionHints);
	let workspaceSection = "";
	if (ctx.workspace?.mode === "dedicated_worktree") {
		const checkoutLine = ctx.workspace.branch ? `The current review branch is \`${ctx.workspace.branch}\`.` : "The current checkout is disposable and will be cleaned up when the task ends.";
		workspaceSection = [
			"### Workspace",
			"",
			"This review attempt is running inside a dedicated disposable git",
			"worktree. Inspect and reason inside this workspace only.",
			checkoutLine,
			""
		].join("\n");
	}
	const finalOutputBlock = buildFinalOutputBlock({
		taskType: "pr_review",
		outputSchemaName: "PrReviewOutput",
		shapeSketch: [
			"{",
			" \"scores\": [",
			" { \"criterionId\": \"...\", \"score\": 0, \"rationale\": \"...\" }",
			" ],",
			" \"composite\": <sum-of-weighted-binary-scores>,",
			" \"verdict\": \"<1-3 sentence overall>\"",
			"}"
		].join("\n"),
		extraNotes: ["`scores` MUST stay in the same order as the rubric criteria.", "`score` MUST be exactly `0` or `1` for every criterion."]
	});
	const parts = [
		"# Review Agent",
		"You are an independent judge. You did NOT produce the subject under review.",
		"Assess it strictly against the rubric below and emit a structured judgment.",
		"You may inspect the local workspace and the referenced resources, but do NOT modify anything.",
		`Your diary ID is: ${ctx.diaryId}`,
		`This task's id is: ${ctx.taskId}`,
		"## Subject",
		`**Title:** ${subject.title}`,
		subject.summary,
		resourceSection,
		hintsSection,
		workspaceSection,
		"### Execution contract",
		"Treat the provided subject, resources, inspection hints, and any",
		"task-specific instructions as the full",
		"review contract for this task.",
		"If the task-specific instructions or inspection hints require an outward action tied to the review",
		"(for example publishing the judgment somewhere), perform that action as",
		"part of the task before reporting structured output.",
		"## Review workflow",
		"1. Read the subject summary, resources, inspection hints, and any",
		" task-specific instructions before scoring.",
		"2. Inspect the target artefact directly using the tools and resources the",
		" task makes available.",
		"3. If you are in a dedicated disposable worktree and need the review target",
		" checked out locally, do that work inside this disposable workspace only.",
		"4. Apply the rubric strictly. This task is about complexity and",
		" reviewability, not correctness or feature desirability.",
		"5. Perform any required outward action before emitting the final",
		" structured output.",
		taskPromptSection,
		preambleSection,
		"## Criteria",
		criteriaList,
		"### Scoring rules",
		"- Every criterion uses binary scoring only.",
		"- Score `1` when the subject clearly clears the criterion.",
		"- Score `0` when it does not, or when the evidence is ambiguous.",
		"- `rationale` is REQUIRED for every score. Keep it concrete and audit-friendly.",
		"- Compute `composite = Σ(weight_i × score_i)` exactly; the runtime rejects mismatches.",
		"Write a signed diary entry (tags: `judgment`, `pr_review`) capturing the rationale before reporting structured output.",
		finalOutputBlock
	];
	// Absent optional sections ("" or null) are dropped here.
	return parts.filter((part) => Boolean(part)).join("\n");
}
11064
+ //#endregion
10857
11065
  //#region ../agent-runtime/src/prompts/render-pack.ts
10858
11066
  /**
10859
11067
  * Build the first user-message prompt for a `render_pack` task. Almost mechanical:
@@ -10918,8 +11126,9 @@ function buildRenderPackUserPrompt(input, ctx) {
10918
11126
  * Build the first user-message prompt for a `run_eval` task.
10919
11127
  *
10920
11128
  * Free-form: no git workflow, no commit ceremony. The executor produces
10921
- * a textual response (and optional file artifacts) that a later
10922
- * `judge_eval_variant` task (Slice 2) grades against the rubric.
11129
+ * a textual response (and optional file artifacts) that later
11130
+ * `judge_eval_attempt` task(s) grade against their own hidden
11131
+ * rubric.
10923
11132
  *
10924
11133
  * Context delivery is handled by `resolveTaskContext` (see
10925
11134
  * libs/agent-runtime/src/context-bindings.ts) and runs BEFORE this
@@ -10929,7 +11138,9 @@ function buildRenderPackUserPrompt(input, ctx) {
10929
11138
  * builder does NOT inline `input.context[]` itself.
10930
11139
  */
10931
11140
  function buildRunEvalUserPrompt(input, ctx) {
10932
- const { scenario, variantLabel, successCriteria } = input;
11141
+ const { scenario, variantLabel, execution, successCriteria } = input;
11142
+ const hasContext = input.context.length > 0;
11143
+ const hasInlineContext = input.context.some((entry) => entry.binding === "context_inline");
10933
11144
  const inputFilesSection = scenario.inputFiles?.length ? [
10934
11145
  "### Input files",
10935
11146
  "",
@@ -10942,9 +11153,30 @@ function buildRunEvalUserPrompt(input, ctx) {
10942
11153
  "",
10943
11154
  `This task carries correlationId \`${ctx.correlationId}\`. It joins`,
10944
11155
  "this variant to its sibling `run_eval` tasks (other variants of the",
10945
- "same scenario) and to the eventual `judge_eval_variant` task that",
10946
- "will grade them together. You do not need to act on it directly —",
10947
- "it is recorded for cross-variant aggregation at query time.",
11156
+ "same scenario and to any later `judge_eval_attempt` tasks created",
11157
+ "against those variants. You do not need to act on it directly — it",
11158
+ "is recorded for cross-variant aggregation at query time.",
11159
+ ""
11160
+ ].join("\n") : "";
11161
+ const executionSection = [
11162
+ "### Execution mode",
11163
+ "",
11164
+ `Mode: \`${execution.mode}\``,
11165
+ `Workspace: \`${execution.workspace}\``,
11166
+ execution.workspace === "none" ? "You are running in a scratch workspace with no repository checkout mounted. Do not assume git history or repo files are present unless the scenario provided them explicitly." : execution.workspace === "shared_mount" ? "You are running against the daemon shared mount. Treat any repository mutations as affecting the mounted checkout directly." : "You are running in a dedicated disposable git worktree isolated from the daemon shared checkout.",
11167
+ ""
11168
+ ].join("\n");
11169
+ const contextDisciplineSection = hasContext ? [
11170
+ "### Injected context discipline",
11171
+ "",
11172
+ "This task includes extra injected context from the task creator.",
11173
+ "You MUST inspect and use that context BEFORE you write solution",
11174
+ "files or draft your final answer.",
11175
+ "Do not solve first and only review the context afterward.",
11176
+ hasInlineContext ? "For `context_inline`, your FIRST content-inspection step should be a `read` of `/workspace/context-pack.md` before your first `write` call. The same content is also mirrored in `/workspace/AGENTS.md` and may be referenced from `/workspace/.claude/CLAUDE.md`." : "If injected context was provided as a skill, inspect that task-injected context before solving.",
11177
+ hasInlineContext ? "If `/workspace/context-pack.md` exists and you skip reading it before writing solution files, you are not following the task instructions." : "Do not rely on memory alone when task-injected context is available; inspect it first.",
11178
+ "If the injected context contains repo- or workflow-specific rules,",
11179
+ "those rules override your generic instincts.",
10948
11180
  ""
10949
11181
  ].join("\n") : "";
10950
11182
  const finalOutputBlock = buildFinalOutputBlock({
@@ -10957,7 +11189,13 @@ function buildRunEvalUserPrompt(input, ctx) {
10957
11189
  " \"totalTokens\": <int>,",
10958
11190
  " \"durationMs\": <int>,",
10959
11191
  " \"traceparent\": \"<from claim>\",",
10960
- " \"verification\": <required iff input.successCriteria; see Self-verification>",
11192
+ " \"verification\": {",
11193
+ " \"inputCid\": \"<task inputCid>\",",
11194
+ " \"results\": [",
11195
+ " { \"id\": \"<criterion id>\", \"kind\": \"rubric\", \"status\": \"pass|fail|skip\", \"detail\": \"<optional one-liner>\" }",
11196
+ " ],",
11197
+ " \"passed\": <boolean>",
11198
+ " } // required iff input.successCriteria; must be an object, never a string",
10961
11199
  "}"
10962
11200
  ].join("\n")
10963
11201
  });
@@ -10965,6 +11203,8 @@ function buildRunEvalUserPrompt(input, ctx) {
10965
11203
  "# Run Eval Agent\n",
10966
11204
  `You are running an evaluation scenario as variant \`${variantLabel}\`.\nTask id: \`${ctx.taskId}\`\n`,
10967
11205
  correlationSection,
11206
+ executionSection,
11207
+ contextDisciplineSection,
10968
11208
  `### Scenario\n\n${scenario.prompt}\n`,
10969
11209
  inputFilesSection,
10970
11210
  verificationSection,
@@ -11036,14 +11276,25 @@ function buildTaskUserPrompt(task, ctx) {
11036
11276
  diaryId: ctx.diaryId,
11037
11277
  taskId: ctx.taskId
11038
11278
  });
11039
- case JUDGE_EVAL_VARIANT_TYPE:
11040
- if (!Value.Check(JudgeEvalVariantInput, task.input)) {
11041
- const errors = [...Value.Errors(JudgeEvalVariantInput, task.input)];
11042
- throw new Error(`judge_eval_variant input failed validation: ${JSON.stringify(errors.slice(0, 3))}`);
11279
+ case JUDGE_EVAL_ATTEMPT_TYPE:
11280
+ if (!Value.Check(JudgeEvalAttemptInput, task.input)) {
11281
+ const errors = [...Value.Errors(JudgeEvalAttemptInput, task.input)];
11282
+ throw new Error(`judge_eval_attempt input failed validation: ${JSON.stringify(errors.slice(0, 3))}`);
11043
11283
  }
11044
- return buildJudgeEvalVariantUserPrompt(task.input, {
11284
+ return buildJudgeEvalAttemptUserPrompt(task.input, {
11045
11285
  diaryId: ctx.diaryId,
11046
- taskId: ctx.taskId
11286
+ taskId: ctx.taskId,
11287
+ workspace: ctx.workspace
11288
+ });
11289
+ case PR_REVIEW_TYPE:
11290
+ if (!Value.Check(PrReviewInput, task.input)) {
11291
+ const errors = [...Value.Errors(PrReviewInput, task.input)];
11292
+ throw new Error(`pr_review input failed validation: ${JSON.stringify(errors.slice(0, 3))}`);
11293
+ }
11294
+ return buildPrReviewUserPrompt(task.input, {
11295
+ diaryId: ctx.diaryId,
11296
+ taskId: ctx.taskId,
11297
+ workspace: ctx.workspace
11047
11298
  });
11048
11299
  case RUN_EVAL_TYPE:
11049
11300
  if (!Value.Check(RunEvalInput, task.input)) {
@@ -14562,6 +14813,11 @@ var require_multistream = /* @__PURE__ */ __commonJSMin(((exports, module) => {
14562
14813
  * paths under this mount via `toGuestPath` in `tool-operations.ts`.
14563
14814
  */
14564
14815
  var SKILL_ROOT_IN_VM = GUEST_TASK_SKILLS_MOUNT;
14816
+ var INLINE_CONTEXT_ROOT_IN_VM = "/workspace/.moltnet/context";
14817
+ var WORKSPACE_CONTEXT_PACK = "/workspace/context-pack.md";
14818
+ var WORKSPACE_AGENTS_MD = "/workspace/AGENTS.md";
14819
+ var WORKSPACE_CLAUDE_DIR = "/workspace/.claude";
14820
+ var WORKSPACE_CLAUDE_MD = "/workspace/.claude/CLAUDE.md";
14565
14821
  /** Bounds borrowed from pi's skill validation; conservative caps so a
14566
14822
  * malformed SKILL.md doesn't bloat the system prompt. */
14567
14823
  var MAX_SKILL_NAME = 64;
@@ -14572,21 +14828,40 @@ var MAX_SKILL_DESCRIPTION = 1024;
14572
14828
  */
14573
14829
  async function injectTaskContext(args) {
14574
14830
  const skills = [];
14831
+ const inlineContexts = [];
14575
14832
  const resolved = await resolveTaskContext({
14576
14833
  context: args.context,
14577
- deliver: { skill: async ({ slug, content }) => {
14578
- const dir = `${SKILL_ROOT_IN_VM}/${slug}`;
14579
- const filePath = `${dir}/SKILL.md`;
14580
- await args.fs.mkdir(dir, { recursive: true });
14581
- await args.fs.writeFile(filePath, content, { mode: 420 });
14582
- skills.push(buildSyntheticSkill({
14583
- slug,
14584
- content,
14585
- filePath,
14586
- dir
14587
- }));
14588
- } }
14834
+ deliver: {
14835
+ skill: async ({ slug, content }) => {
14836
+ const dir = `${SKILL_ROOT_IN_VM}/${slug}`;
14837
+ const filePath = `${dir}/SKILL.md`;
14838
+ await args.fs.mkdir(dir, { recursive: true });
14839
+ await args.fs.writeFile(filePath, content, { mode: 420 });
14840
+ skills.push(buildSyntheticSkill({
14841
+ slug,
14842
+ content,
14843
+ filePath,
14844
+ dir
14845
+ }));
14846
+ },
14847
+ contextFile: async ({ suggestedFileName, content }) => {
14848
+ await args.fs.mkdir(INLINE_CONTEXT_ROOT_IN_VM, { recursive: true });
14849
+ const filePath = `${INLINE_CONTEXT_ROOT_IN_VM}/${suggestedFileName}`;
14850
+ await args.fs.writeFile(filePath, content, { mode: 420 });
14851
+ inlineContexts.push({
14852
+ slug: suggestedFileName.replace(/\.md$/u, ""),
14853
+ content
14854
+ });
14855
+ }
14856
+ }
14589
14857
  });
14858
+ if (inlineContexts.length > 0) {
14859
+ const packContent = buildWorkspaceContextPack(inlineContexts);
14860
+ await args.fs.writeFile(WORKSPACE_CONTEXT_PACK, packContent, { mode: 420 });
14861
+ await args.fs.writeFile(WORKSPACE_AGENTS_MD, packContent, { mode: 420 });
14862
+ await args.fs.mkdir(WORKSPACE_CLAUDE_DIR, { recursive: true });
14863
+ await args.fs.writeFile(WORKSPACE_CLAUDE_MD, "@../context-pack.md\n", { mode: 420 });
14864
+ }
14590
14865
  return {
14591
14866
  injected: resolved.injected,
14592
14867
  skills,
@@ -14594,6 +14869,17 @@ async function injectTaskContext(args) {
14594
14869
  userInlineSuffix: resolved.userInlineSuffix
14595
14870
  };
14596
14871
  }
14872
+ function buildWorkspaceContextPack(contexts) {
14873
+ return [
14874
+ "# Context Pack",
14875
+ "",
14876
+ ...contexts.map(({ slug, content }) => [
14877
+ `## ${slug}`,
14878
+ "",
14879
+ content.trimEnd()
14880
+ ].join("\n"))
14881
+ ].join("\n\n").trimEnd() + "\n";
14882
+ }
14597
14883
  /**
14598
14884
  * Build a `Skill` object pi will faithfully render in
14599
14885
  * `<available_skills>`. We extract `name` and `description` from the
@@ -14957,7 +15243,7 @@ async function parseStructuredTaskOutput(assistantText, taskType, opts = {}) {
14957
15243
  }
14958
15244
  };
14959
15245
  }
14960
- const errors = validateTaskOutput(taskType, extracted);
15246
+ const errors = validateTaskOutput(taskType, extracted, opts.input);
14961
15247
  if (errors.length > 0) {
14962
15248
  const details = errors.slice(0, 3).map((error) => `${error.field}: ${error.message}`);
14963
15249
  const [firstError] = errors;
@@ -15071,7 +15357,7 @@ function createSubmitOutputTool(taskType, opts = {}) {
15071
15357
  description: contract.description,
15072
15358
  parameters: schema,
15073
15359
  async execute(_id, params) {
15074
- const errors = validateTaskOutput(taskType, params);
15360
+ const errors = validateTaskOutput(taskType, params, opts.input);
15075
15361
  if (errors.length > 0) {
15076
15362
  const detailMsg = errors.slice(0, 3).map((err) => `${err.field}: ${err.message}`).join("; ");
15077
15363
  const details = {
@@ -15140,6 +15426,39 @@ function resolveSubmitTools(taskType, opts = {}) {
15140
15426
  //#region src/runtime/task-workspace.ts
15141
15427
  function prepareTaskWorkspace(task, requestedMountPath, executionPlan) {
15142
15428
  const branch = executionPlan?.worktreeBranch ?? null;
15429
+ const workspaceMode = executionPlan?.workspaceMode ?? "shared_mount";
15430
+ const attachedWorkspace = executionPlan?.workspaceAttachment ?? null;
15431
+ if (attachedWorkspace) return {
15432
+ mountPath: attachedWorkspace.mountPath,
15433
+ cwdPath: attachedWorkspace.cwdPath,
15434
+ mode: workspaceMode,
15435
+ branch,
15436
+ cleanup: () => {}
15437
+ };
15438
+ if (workspaceMode === "scratch_mount") {
15439
+ const scratchDir = resolveTaskScratchPath(findMainWorktree(), executionPlan?.workspaceId ?? `task-${task.id}`);
15440
+ const keepWorkspace = executionPlan?.workspaceScope === "session" && executionPlan.sessionKey !== null;
15441
+ if (keepWorkspace) mkdirSync(scratchDir, { recursive: true });
15442
+ else {
15443
+ rmSync(scratchDir, {
15444
+ recursive: true,
15445
+ force: true
15446
+ });
15447
+ mkdirSync(scratchDir, { recursive: true });
15448
+ }
15449
+ return {
15450
+ mountPath: scratchDir,
15451
+ cwdPath: scratchDir,
15452
+ mode: "scratch_mount",
15453
+ branch: null,
15454
+ cleanup: keepWorkspace ? () => {} : () => {
15455
+ rmSync(scratchDir, {
15456
+ recursive: true,
15457
+ force: true
15458
+ });
15459
+ }
15460
+ };
15461
+ }
15143
15462
  if (!branch) return {
15144
15463
  mountPath: requestedMountPath,
15145
15464
  cwdPath: requestedMountPath,
@@ -15177,6 +15496,9 @@ function prepareTaskWorkspace(task, requestedMountPath, executionPlan) {
15177
15496
  function resolveTaskWorktreePath(mainRepo, workspaceId) {
15178
15497
  return join(mainRepo, ".worktrees", workspaceId);
15179
15498
  }
15499
+ function resolveTaskScratchPath(mainRepo, workspaceId) {
15500
+ return join(mainRepo, ".moltnet", "d", "task-workspaces", workspaceId);
15501
+ }
15180
15502
  function ensureReusableTaskWorktree(mainRepo, worktreeDir, branch) {
15181
15503
  if (isRegisteredWorktree(mainRepo, worktreeDir)) return;
15182
15504
  if (existsSync(worktreeDir)) throw new Error(`Expected reusable worktree ${worktreeDir} to be git-managed, but it exists outside git worktree metadata.`);
@@ -15413,12 +15735,14 @@ async function executePiTask(claimedTask, reporter, opts) {
15413
15735
  return makeFailedOutput("worktree_setup_failed", message);
15414
15736
  }
15415
15737
  try {
15738
+ const sandboxConfig = applyExecutionPlanSandboxOverrides(opts.sandboxConfig, executionPlan);
15416
15739
  managed = await resumeVm({
15417
15740
  checkpointPath,
15418
15741
  agentName: opts.agentName,
15419
15742
  mountPath,
15743
+ workspaceMode: workspace.mode,
15420
15744
  extraAllowedHosts: opts.extraAllowedHosts,
15421
- sandboxConfig: opts.sandboxConfig
15745
+ sandboxConfig
15422
15746
  });
15423
15747
  } catch (err) {
15424
15748
  const message = err instanceof Error ? err.message : String(err);
@@ -15447,7 +15771,8 @@ async function executePiTask(claimedTask, reporter, opts) {
15447
15771
  taskId: task.id,
15448
15772
  workspace: {
15449
15773
  mode: activeWorkspace.mode,
15450
- branch: activeWorkspace.branch
15774
+ branch: activeWorkspace.branch,
15775
+ attached: executionPlan?.workspaceAttachment !== void 0
15451
15776
  },
15452
15777
  extras: opts.promptExtras
15453
15778
  });
@@ -15489,7 +15814,10 @@ async function executePiTask(claimedTask, reporter, opts) {
15489
15814
  createEditToolDefinition(mountPath, { operations: createGondolinEditOps(managed.vm, mountPath) }),
15490
15815
  createBashToolDefinition(mountPath, { operations: createGondolinBashOps(managed.vm, mountPath) })
15491
15816
  ];
15492
- const { handle: submitToolHandle, tools: submitToolDefs } = resolveSubmitTools(task.taskType, { model: opts.model });
15817
+ const { handle: submitToolHandle, tools: submitToolDefs } = resolveSubmitTools(task.taskType, {
15818
+ model: opts.model,
15819
+ input: task.input
15820
+ });
15493
15821
  const submitTools = submitToolDefs;
15494
15822
  try {
15495
15823
  const moltnetAgent = await connect({ configDir: managed.agentDir });
@@ -15708,8 +16036,20 @@ async function executePiTask(claimedTask, reporter, opts) {
15708
16036
  phase: "output_validation"
15709
16037
  });
15710
16038
  }
15711
- else {
15712
- const parsed = await parseStructuredTaskOutput(assistantText, task.taskType, { model: opts.model });
16039
+ else if (submitToolHandle) {
16040
+ parseError = {
16041
+ code: "output_missing",
16042
+ message: "Agent did not submit output through the task submit tool. A valid submit tool call is required to complete this task type."
16043
+ };
16044
+ await emit("error", {
16045
+ message: parseError.message,
16046
+ phase: "output_validation"
16047
+ });
16048
+ } else {
16049
+ const parsed = await parseStructuredTaskOutput(assistantText, task.taskType, {
16050
+ model: opts.model,
16051
+ input: task.input
16052
+ });
15713
16053
  parsedOutput = parsed.output;
15714
16054
  parsedOutputCid = parsed.outputCid;
15715
16055
  parseError = parsed.error;
@@ -15795,6 +16135,18 @@ async function executePiTask(claimedTask, reporter, opts) {
15795
16135
  }
15796
16136
  }
15797
16137
  }
16138
+ function applyExecutionPlanSandboxOverrides(sandboxConfig, executionPlan) {
16139
+ const shadowWrites = executionPlan?.workspaceAttachment?.shadowWrites;
16140
+ if (!shadowWrites) return sandboxConfig;
16141
+ return {
16142
+ ...sandboxConfig,
16143
+ vfs: {
16144
+ ...sandboxConfig?.vfs,
16145
+ shadow: ["**"],
16146
+ shadowMode: shadowWrites
16147
+ }
16148
+ };
16149
+ }
15798
16150
  function emptyUsage(provider, model) {
15799
16151
  return {
15800
16152
  inputTokens: 0,
@@ -16012,6 +16364,7 @@ function moltnetExtension(pi) {
16012
16364
  checkpointPath,
16013
16365
  agentName,
16014
16366
  mountPath,
16367
+ workspaceMode: "shared_mount",
16015
16368
  sandboxConfig
16016
16369
  });
16017
16370
  activateAgentEnv(managed.credentials.agentEnv, mainRepo);