@themoltnet/pi-extension 0.14.0 → 0.15.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.d.ts CHANGED
@@ -1,7 +1,11 @@
1
+ import { AgentSession } from '@earendil-works/pi-coding-agent';
2
+ import { Api } from '@earendil-works/pi-ai';
1
3
  import { BashOperations } from '@earendil-works/pi-coding-agent';
2
4
  import { connect } from '@themoltnet/sdk';
3
5
  import { EditOperations } from '@earendil-works/pi-coding-agent';
4
6
  import { ExtensionAPI } from '@earendil-works/pi-coding-agent';
7
+ import { LoadSkillsResult } from '@earendil-works/pi-coding-agent';
8
+ import { Model } from '@earendil-works/pi-ai';
5
9
  import { ReadOperations } from '@earendil-works/pi-coding-agent';
6
10
  import { Skill } from '@earendil-works/pi-coding-agent';
7
11
  import { Static } from '@sinclair/typebox';
@@ -27,6 +31,33 @@ import { WriteOperations } from '@earendil-works/pi-coding-agent';
27
31
  */
28
32
  export declare function activateAgentEnv(agentEnv: Record<string, string | undefined>, repoRoot: string): void;
29
33
 
34
/**
 * Construct an in-memory `AgentSession`.
 *
 * The helper does no lifecycle management beyond construction: the caller
 * is responsible for eventually invoking `session.prompt(...)` and for
 * tearing the session down.
 *
 * @param args - Construction inputs; see `BuildAgentSessionArgs`.
 * @returns A promise that resolves to the constructed `AgentSession`.
 */
export declare function buildAgentSession(args: BuildAgentSessionArgs): Promise<AgentSession>;
40
+
41
/** Inputs accepted by `buildAgentSession`. */
declare interface BuildAgentSessionArgs {
    /** Host directory mounted at /workspace inside the VM. */
    mountPath: string;
    /** pi auth directory (resolved from `PI_CODING_AGENT_DIR` or `~/.pi/agent`). */
    piAuthDir: string;
    /** Resolved pi model handle (provider + model id). */
    modelHandle: Model<Api>;
    /** Pre-built customTools array. Caller composes Gondolin + MoltNet + submit tools. */
    customTools: ToolDefinition[];
    /**
     * System-prompt fragments appended after pi's defaults. Parent passes the
     * runtime instructor; subagents pass their narrower variant.
     */
    appendSystemPrompt: string[];
    /**
     * Callback producing the skills to advertise in `<available_skills>`.
     * Optional; when omitted the default is an empty skill list.
     */
    skillsOverride?: () => LoadSkillsResult;
    /** Span attributes merged onto every OTel span the session emits. */
    otelSpanAttrs: Record<string, string | number | boolean>;
    /** Agent name for `gen_ai.agent.name` on the root span. */
    agentName: string;
}
60
+
30
61
  declare interface ClaimedTask {
31
62
  /** The claimed task payload itself. */
32
63
  task: Task;
@@ -83,6 +114,73 @@ export declare function createPiOtelExtension(options?: PiOtelOptions): (pi: Ext
83
114
  */
84
115
  export declare function createPiTaskExecutor(opts: ExecutePiTaskOptions): (claimedTask: ClaimedTask, reporter: TaskReporter) => Promise<TaskOutput>;
85
116
 
117
/**
 * Build the subagent custom tool for a parent session.
 *
 * The returned handle exposes the call counter so executors can emit
 * summary telemetry when the parent terminates.
 *
 * @param args - Parent-session context; see `CreateSubagentToolArgs`.
 * @returns A `SubagentToolHandle` carrying the tool definition and its
 *   call counter.
 */
export declare function createSubagentTool(args: CreateSubagentToolArgs): SubagentToolHandle;
123
+
124
/** Inputs accepted by `createSubagentTool`. */
export declare interface CreateSubagentToolArgs {
    /** Host directory mounted at /workspace inside the VM. */
    mountPath: string;
    /** pi auth directory the parent resolved. */
    piAuthDir: string;
    /** Resolved pi model handle — subagents share it. */
    modelHandle: Model<Api>;
    /** Agent name for telemetry. */
    agentName: string;
    /**
     * Custom tools every subagent inherits (Gondolin-routed
     * Read/Write/Edit/Bash + moltnet_* tools, etc). MUST NOT include
     * the parent's submit-output tool, the parent's `subagent` tool,
     * or any other parent-only artefact — the caller is responsible
     * for filtering. The subagent appends its own submit tool.
     */
    inheritedCustomTools: ToolDefinition[];
    /**
     * The parent runtime instructor verbatim. Subagents prepend it to
     * their own short "you are a subagent" preamble so the same
     * invariants (gh auth, diary discipline, accountable commits)
     * apply if the subagent takes those actions. The parent's task
     * description dictates whether they should.
     */
    parentRuntimeInstructor: string;
    /** Id of the parent task. NOTE(review): presumably used to correlate subagent telemetry with the parent — confirm. */
    parentTaskId: string;
    /** Task type of the parent task. NOTE(review): usage is not visible here — confirm. */
    parentTaskType: string;
    /** Attempt number of the parent task. NOTE(review): usage is not visible here — confirm. */
    parentAttemptN: number;
    /**
     * Parent task's cancel signal. When the daemon cancels the parent
     * task (operator cancel or task-level `runningTimeoutSec` expiry),
     * each in-flight subagent's inner `session.abort()` is invoked so
     * it tears down promptly instead of running until its own LLM
     * call resolves. Mirrors the existing `wireSessionAbort` pattern
     * the parent session uses.
     *
     * Optional only because the test seam can omit it; production
     * callers (executePiTask) pass `reporter.cancelSignal`.
     */
    parentCancelSignal?: AbortSignal;
    /**
     * Per-call fallback timeout, in milliseconds. Defends against an
     * inner session that ignores `abort()` for any reason (LLM provider
     * stuck, tool call hanging on I/O, etc.). When the timeout fires,
     * `session.abort()` is invoked and the tool returns `isError: true`
     * with a `subagent_timed_out` reason the parent LLM can recover from.
     *
     * Default: 5 minutes. Set to `0` to disable (relying purely on
     * parentCancelSignal). Negative values are treated as the default.
     */
    timeoutMs?: number;
    /**
     * Test seam. Production callers leave this undefined and get
     * `buildAgentSession` from the factory module. Tests inject a mock
     * that returns a stub session implementing only `prompt()` to
     * exercise the tool's logic without booting a VM.
     */
    buildAgentSession?: (args: BuildAgentSessionArgs) => Promise<AgentSession>;
}
183
+
86
184
  /**
87
185
  * Ensure a cached snapshot exists, building one if needed.
88
186
  * Returns the absolute path to the qcow2 checkpoint file.
@@ -279,6 +377,17 @@ export declare interface SandboxConfig {
279
377
  /** Overlay disk size (default '3G'). */
280
378
  overlaySize?: string;
281
379
  };
380
+ /** Shell commands to run on every VM resume, after platform setup
381
+ * (TLS, DNS, git safe.directory, tmpfs node_modules) and before
382
+ * the agent session starts. Use for per-session bootstrap that
383
+ * doesn't belong baked into the snapshot.
384
+ *
385
+ * Not included in the snapshot cache key — changes here apply on
386
+ * every resume without triggering a snapshot rebuild. Each command
387
+ * runs in a fresh shell with `set -eu` and `set -o pipefail`; a
388
+ * non-zero exit (including from any segment of a pipeline) aborts
389
+ * resume with the failing command's stderr/stdout tail. */
390
+ resumeCommands?: string[];
282
391
  /** VFS shadow settings — hide host paths from the guest. */
283
392
  vfs?: {
284
393
  /** Paths (relative to workspace root) to shadow from the host mount. */
@@ -300,6 +409,29 @@ export declare interface SandboxConfig {
300
409
  /** Extract snapshot-specific config for backwards compat with ensureSnapshot. */
301
410
  export declare type SnapshotConfig = NonNullable<SandboxConfig['snapshot']>;
302
411
 
412
/** Result of `createSubagentTool`: the tool plus a read-only call counter. */
export declare interface SubagentToolHandle {
    /** ToolDefinition to register via `customTools` on the parent session. */
    readonly tool: ToolDefinition;
    /** Returns how many times the parent LLM has called this tool so far. */
    getCallCount: () => number;
}
418
+
419
/**
 * Parameters shape the parent LLM sees when calling the subagent tool.
 *
 * - `task`          — natural-language instructions for the subagent.
 *                     The parent authors this per call. Must be
 *                     non-empty.
 * - `output_schema` — name of a registered SubagentOutputContract.
 *                     Resolved at call time; unknown names error.
 */
export declare const SubagentToolParameters: TObject<{
    task: TString;
    output_schema: TString;
}>;

/** Static (compile-time) type derived from the `SubagentToolParameters` schema. */
export declare type SubagentToolParameters = Static<typeof SubagentToolParameters>;
434
+
303
435
  /**
304
436
  * The Task promise body.
305
437
  *
package/dist/index.js CHANGED
@@ -2515,11 +2515,12 @@ function createCryptoNamespace(context, signingRequests) {
2515
2515
  function createDiariesNamespace(context) {
2516
2516
  const { client, auth } = context;
2517
2517
  return {
2518
- async list(query) {
2518
+ async list(query, headers) {
2519
2519
  return unwrapResult(await listDiaries({
2520
2520
  client,
2521
2521
  auth,
2522
- query
2522
+ query,
2523
+ headers
2523
2524
  }));
2524
2525
  },
2525
2526
  async create(body, headers) {
@@ -8177,6 +8178,27 @@ var BASE_ALLOWED_HOSTS = [
8177
8178
  "*.googlesource.com"
8178
8179
  ];
8179
8180
  /**
8181
+ * Run a shell command in the guest and throw if it fails. Mirror of
8182
+ * `run()` in `snapshot.ts` for the resume-side hook chain — every
8183
+ * setup step is essential to a healthy session, so a silent non-zero
8184
+ * exit (e.g. a mount that fails into the FUSE write path, or a
8185
+ * consumer-provided resume command that fails to install pnpm) must
8186
+ * surface immediately rather than fall through to cryptic agent
8187
+ * errors later.
8188
+ */
8189
+ async function vmRun(vm, label, command) {
8190
+ const wrapped = `set -eu\nset -o pipefail\n${command}`;
8191
+ const r = await vm.exec([
8192
+ "sh",
8193
+ "-c",
8194
+ wrapped
8195
+ ]);
8196
+ if (r.exitCode !== 0) {
8197
+ const tail = [r.stderr, r.stdout].filter(Boolean).join("\n").slice(-800);
8198
+ throw new Error(`resume step "${label}" failed (exit ${r.exitCode}):\n${tail}`);
8199
+ }
8200
+ }
8201
+ /**
8180
8202
  * Resume a VM from a checkpoint, inject credentials, configure egress +
8181
8203
  * TLS. Returns the managed VM handle.
8182
8204
  */
@@ -8236,8 +8258,9 @@ async function resumeVm(config) {
8236
8258
  update-ca-certificates 2>/dev/null
8237
8259
  cat /etc/gondolin/mitm/ca.crt >> /etc/ssl/certs/ca-certificates.crt
8238
8260
  '`);
8239
- await vm.exec(`sh -c 'echo "nameserver 8.8.8.8
8240
- nameserver 1.1.1.1" > /etc/resolv.conf'`);
8261
+ await vmRun(vm, "DNS resolvers", `printf 'nameserver 8.8.8.8\\nnameserver 1.1.1.1\\n' > /etc/resolv.conf`);
8262
+ await vmRun(vm, "git safe.directory", `git config --system --add safe.directory '*'`);
8263
+ for (const [i, cmd] of (config.sandboxConfig?.resumeCommands ?? []).entries()) await vmRun(vm, `resumeCommands[${i}]`, cmd);
8241
8264
  const vmSshDir = `${vmAgentDir}/ssh`;
8242
8265
  await vm.exec(`mkdir -p ${vmAgentDir}/ssh /home/agent/.pi/agent`);
8243
8266
  if (creds.piAuthJson !== null) await vm.fs.writeFile("/home/agent/.pi/agent/auth.json", creds.piAuthJson, { mode: 384 });
@@ -8580,59 +8603,37 @@ function extractUsage(message) {
8580
8603
  };
8581
8604
  }
8582
8605
  //#endregion
8583
- //#region ../agent-runtime/src/context-bindings.ts
8584
- var PROMPT_SEPARATOR = "\n\n---\n\n";
8606
+ //#region src/runtime/agent-session-factory.ts
8607
/** Default `skillsOverride`: reports no skills and no diagnostics. */
var NO_SKILLS = () => ({
	skills: [],
	diagnostics: []
});
/**
 * Construct an in-memory `AgentSession`.
 *
 * Wires a pi OTel extension (agent name + span attributes) into a
 * `DefaultResourceLoader`, reloads it, and then creates a session
 * backed by an in-memory session manager. No lifecycle management
 * happens beyond construction: the caller must eventually invoke
 * `session.prompt(...)` and tear the session down.
 *
 * @param args - See `BuildAgentSessionArgs`; `skillsOverride` falls
 *   back to `NO_SKILLS` when absent.
 * @returns The `session` property of the `createAgentSession` result.
 */
async function buildAgentSession(args) {
	const { mountPath: cwd, piAuthDir: agentDir } = args;
	const otelExtension = createPiOtelExtension({
		agentName: args.agentName,
		spanAttributes: args.otelSpanAttrs
	});
	const resourceLoader = new DefaultResourceLoader({
		cwd,
		agentDir,
		extensionFactories: [otelExtension],
		appendSystemPrompt: args.appendSystemPrompt,
		skillsOverride: args.skillsOverride ?? NO_SKILLS
	});
	// The loader must be populated before the session is created from it.
	await resourceLoader.reload();
	const created = await createAgentSession({
		agentDir,
		cwd,
		model: args.modelHandle,
		customTools: args.customTools,
		sessionManager: SessionManager.inMemory(),
		resourceLoader
	});
	return created.session;
}
8637
8638
  //#endregion
8638
8639
  //#region ../tasks/src/formats.ts
@@ -8851,7 +8852,7 @@ unchanged" is.
8851
8852
  * (server-side schema check). Self-assessment is a truthful self-rating,
8852
8853
  * NOT enforcement — `verification.passed=false` does not block /complete
8853
8854
  * and does not affect `acceptedAttemptN`. See
8854
- * `docs/agent-runtime.md` for the full producer/judge flow.
8855
+ * `docs/understand/agent-runtime.md` for the full producer/judge flow.
8855
8856
  *
8856
8857
  * **Binding evaluation** (judgment tasks: `assess_brief`, `judge_pack`).
8857
8858
  * A separate task whose purpose IS the application of `successCriteria` to
@@ -9008,6 +9009,39 @@ var AssessBriefOutput = Type$1.Object({
9008
9009
  $id: "AssessBriefOutput",
9009
9010
  additionalProperties: false
9010
9011
  });
9012
+ /**
9013
+ * Async preflight (#1096):
9014
+ * - `targetTaskId` resolves to a real task the caller can see.
9015
+ * - The target is a `fulfill_brief` (you cannot grade an arbitrary
9016
+ * task type as if it were a brief fulfillment).
9017
+ * - The target is `completed` with an accepted attempt — grading
9018
+ * an in-flight or failed task would either race or grade nothing.
9019
+ *
9020
+ * Agent-distinctness ("assessor ≠ producer") is a runtime / auth-
9021
+ * layer concern and intentionally NOT checked here. It belongs in
9022
+ * an auth-aware claim-time check.
9023
+ */
9024
+ async function validateAssessBriefInputAsync(input, ctx) {
9025
+ const { targetTaskId } = input;
9026
+ const errors = [];
9027
+ const target = await ctx.resolveTask(targetTaskId);
9028
+ if (!target) {
9029
+ errors.push({
9030
+ field: "targetTaskId",
9031
+ message: `targetTaskId ${targetTaskId} does not resolve to a task you can read`
9032
+ });
9033
+ return errors;
9034
+ }
9035
+ if (target.taskType !== "fulfill_brief") errors.push({
9036
+ field: "targetTaskId",
9037
+ message: `targetTaskId ${targetTaskId} is a ${target.taskType}, not a fulfill_brief`
9038
+ });
9039
+ if (target.status !== "completed" || target.acceptedAttemptN === null) errors.push({
9040
+ field: "targetTaskId",
9041
+ message: `targetTaskId ${targetTaskId} is not completed with an accepted attempt (status=${target.status}, acceptedAttemptN=${target.acceptedAttemptN})`
9042
+ });
9043
+ return errors;
9044
+ }
9011
9045
  //#endregion
9012
9046
  //#region ../tasks/src/task-types/curate-pack.ts
9013
9047
  /**
@@ -9206,6 +9240,311 @@ function validateJudgePackOutput(output) {
9206
9240
  }
9207
9241
  return null;
9208
9242
  }
9243
+ /**
9244
+ * Async preflight (#1096):
9245
+ * - `renderedPackId` resolves to a rendered_packs row.
9246
+ * - `sourcePackId` resolves to a context_packs row.
9247
+ * - The rendered pack actually came from the claimed source pack —
9248
+ * `renderedPack.sourcePackId === input.sourcePackId`. Without
9249
+ * this check a judge can be tricked into grading rendering A as
9250
+ * if it came from source B.
9251
+ */
9252
+ async function validateJudgePackInputAsync(input, ctx) {
9253
+ const { renderedPackId, sourcePackId } = input;
9254
+ const errors = [];
9255
+ const [rendered, source] = await Promise.all([ctx.resolveRenderedPack(renderedPackId), ctx.resolveContextPack(sourcePackId)]);
9256
+ if (!rendered) errors.push({
9257
+ field: "renderedPackId",
9258
+ message: `renderedPackId ${renderedPackId} does not resolve to a rendered pack you can read`
9259
+ });
9260
+ if (!source) errors.push({
9261
+ field: "sourcePackId",
9262
+ message: `sourcePackId ${sourcePackId} does not resolve to a context pack you can read`
9263
+ });
9264
+ if (rendered && source && rendered.sourcePackId !== source.id) errors.push({
9265
+ field: "sourcePackId",
9266
+ message: `renderedPack ${renderedPackId} was produced from source ${rendered.sourcePackId}, not from sourcePackId=${sourcePackId}`
9267
+ });
9268
+ return errors;
9269
+ }
9270
+ //#endregion
9271
+ //#region ../tasks/src/task-types/judge-eval-variant.ts
9272
/**
 * `judge_eval_variant` — score N variants of a `run_eval` scenario
 * against a single rubric, in one pass, with per-variant subagent
 * isolation.
 *
 * output_kind: judgment
 * criteria:    required (`successCriteria.rubric` — same envelope shape as
 *              `judge_pack` / `assess_brief`)
 * references:  not required at the input layer — `runTaskIds` already
 *              pin the targets being graded.
 *
 * Slice 2 of #943. The parent task carries the rubric and the list of
 * variant `run_eval` task ids. The pi executor registers the generic
 * `subagent` custom tool (#1087), and the parent LLM calls
 * `subagent({ task, output_schema: 'judge_eval_variant_result' })` once
 * per variant — each child session has fresh context, fetches the
 * variant's accepted attempt output via `moltnet_get_task` /
 * `moltnet_list_task_attempts`, and grades against the rubric.
 *
 * Reuses `JudgePackScore` from `judge_pack` for per-criterion scoring
 * (Lane 1 binary via `llm_checklist`, Lane 2 graded via `llm_score`,
 * deterministic_*) — the score shape is the same across judgment
 * tasks; only the wrapping (per-variant grouping + deltas) differs.
 *
 * Cross-task input invariants — "all targets share the same
 * correlation_id, all are `run_eval`, all are completed with an
 * accepted attempt, all share byte-identical `input.successCriteria`"
 * — REQUIRE async DB lookups and live in `validateInputAsync` below,
 * which the task service runs at create time (#1096 wiring). The
 * TypeBox layer here only enforces shape: UUID format,
 * minItems/maxItems, rubric presence + weight invariant.
 */
var JUDGE_EVAL_VARIANT_TYPE = "judge_eval_variant";
/**
 * Input schema: between 2 and 10 UUID-formatted `run_eval` task ids plus
 * the `SuccessCriteria` envelope. Additional properties are rejected.
 */
var JudgeEvalVariantInput = Type$1.Object({
	runTaskIds: Type$1.Array(Type$1.String({ format: "uuid" }), {
		minItems: 2,
		maxItems: 10
	}),
	successCriteria: SuccessCriteria
}, {
	$id: "JudgeEvalVariantInput",
	additionalProperties: false
});
9315
/**
 * Per-variant grading. `scores[]` shape is identical to `JudgePackScore`
 * (mode-aware: binary via `llm_checklist`, graded via `llm_score`,
 * deterministic_*). Reuse the type rather than re-declare.
 *
 * This is also the **subagent output contract** — the parent's
 * `subagent` tool resolves the contract name `judge_eval_variant_result`
 * to this schema. See `agent-runtime`'s subagent contract registry.
 */
var JudgeEvalVariantResult = Type$1.Object({
	runTaskId: Type$1.String({ format: "uuid" }),
	variantLabel: Type$1.String({
		minLength: 1,
		maxLength: 64,
		// The negative lookahead forbids " - " inside a label, so `deltas`
		// keys of the form "A - B" split unambiguously into two labels.
		pattern: "^(?!.* - ).*$"
	}),
	scores: Type$1.Array(JudgePackScore, { minItems: 1 }),
	// Composite is constrained to the closed range [0, 1].
	composite: Type$1.Number({
		minimum: 0,
		maximum: 1
	}),
	verdict: Type$1.String({ minLength: 1 })
}, {
	$id: "JudgeEvalVariantResult",
	additionalProperties: false
});
9341
/**
 * Output schema for `judge_eval_variant`.
 *
 * - `results`     — at least two `JudgeEvalVariantResult` entries.
 * - `deltas`      — optional map from string keys to numbers in [-1, 1].
 * - `judgeModel`  — optional non-empty string.
 * - `traceparent` — required non-empty string.
 *
 * Additional properties are rejected.
 */
var JudgeEvalVariantOutput = Type$1.Object({
	results: Type$1.Array(JudgeEvalVariantResult, { minItems: 2 }),
	deltas: Type$1.Optional(Type$1.Record(Type$1.String(), Type$1.Number({
		minimum: -1,
		maximum: 1
	}))),
	judgeModel: Type$1.Optional(Type$1.String({ minLength: 1 })),
	traceparent: Type$1.String({ minLength: 1 })
}, {
	$id: "JudgeEvalVariantOutput",
	additionalProperties: false
});
9353
/**
 * Synchronous input invariants beyond the TypeBox shape.
 *
 * Requires `successCriteria` and `successCriteria.rubric` to be present,
 * then delegates to `validateRubricWeights` for the rubric's own weight
 * invariant.
 *
 * Cross-task invariants (targets are `run_eval`, completed, share a
 * `correlation_id`, carry identical `input.successCriteria`) are NOT
 * checked here; they need async lookups and live in
 * `validateJudgeEvalVariantInputAsync` (#1096).
 *
 * @param input - The `judge_eval_variant` task input.
 * @returns An error string for a missing field, otherwise whatever
 *   `validateRubricWeights` returns for the rubric.
 */
function validateJudgeEvalVariantInput(input) {
	const criteria = input.successCriteria;
	if (!criteria) return "successCriteria is required for judge_eval_variant";
	const { rubric } = criteria;
	return rubric ? validateRubricWeights(rubric) : "successCriteria.rubric is required for judge_eval_variant";
}
9371
+ /**
9372
+ * Output cross-field invariants the schema cannot express:
9373
+ *
9374
+ * 1. `results.length === input.runTaskIds.length` — every variant
9375
+ * the imposer asked for must be graded. Partial grading
9376
+ * invalidates cross-variant comparison; fail the whole task
9377
+ * rather than silently report a subset.
9378
+ *
9379
+ * 2. `results[i].runTaskId === input.runTaskIds[i]` — order is
9380
+ * load-bearing for downstream consumers (e.g. deltas keyed by
9381
+ * adjacent pairs). Mismatch is an LLM bug; reject loudly.
9382
+ *
9383
+ * 3. Each `result.scores` follows the same `llm_checklist` rule
9384
+ * `judge_pack` enforces (#999): if a score has an `assertions`
9385
+ * array, the numeric score MUST be `1` iff every assertion
9386
+ * passes. Inconsistent payloads pollute attestations.
9387
+ *
9388
+ * 4. Each `result.composite` MUST equal the rubric-weighted sum
9389
+ * `Σ(weight_j × scores[j].score)`. The parent (and any subagent
9390
+ * it delegated to) is supposed to compute this; surfacing a
9391
+ * drift here catches LLMs that hand-wave the arithmetic.
9392
+ *
9393
+ * 5. Optional `deltas` keys MUST be of the form `"A - B"` where
9394
+ * both `A` and `B` are variantLabels present in `results`.
9395
+ * Values are not range-checked (any float in [-1, 1] is
9396
+ * arithmetically possible).
9397
+ */
9398
+ function validateJudgeEvalVariantOutput(output, input) {
9399
+ const out = output;
9400
+ const inp = input;
9401
+ if (inp) {
9402
+ if (out.results.length !== inp.runTaskIds.length) return `results.length (${out.results.length}) does not match input.runTaskIds.length (${inp.runTaskIds.length}). Every variant must be graded; partial grading is rejected.`;
9403
+ for (let i = 0; i < out.results.length; i++) if (out.results[i].runTaskId !== inp.runTaskIds[i]) return `results[${i}].runTaskId (${out.results[i].runTaskId}) does not match input.runTaskIds[${i}] (${inp.runTaskIds[i]}). Order must align with input for downstream delta computation.`;
9404
+ }
9405
+ for (let r = 0; r < out.results.length; r++) {
9406
+ const result = out.results[r];
9407
+ for (let s = 0; s < result.scores.length; s++) {
9408
+ const sc = result.scores[s];
9409
+ if (!sc.assertions) continue;
9410
+ const allPassed = sc.assertions.every((a) => a.passed);
9411
+ const expected = allPassed ? 1 : 0;
9412
+ if (sc.score !== expected) return `results[${r}].scores[${s}] (criterionId="${sc.criterionId}"): assertions ${allPassed ? "all pass" : "have at least one fail"} but score=${sc.score}. Score must be derived: 1 iff every assertion passes, else 0 (#999 llm_checklist rule).`;
9413
+ }
9414
+ }
9415
+ if (inp?.successCriteria?.rubric) {
9416
+ const criteria = inp.successCriteria.rubric.criteria;
9417
+ const weightById = new Map(criteria.map((c) => [c.id, c.weight]));
9418
+ for (let r = 0; r < out.results.length; r++) {
9419
+ const result = out.results[r];
9420
+ let sum = 0;
9421
+ for (const sc of result.scores) {
9422
+ const w = weightById.get(sc.criterionId);
9423
+ if (w === void 0) return `results[${r}].scores: criterionId "${sc.criterionId}" is not in the input rubric (known: ${Array.from(weightById.keys()).join(", ")}). Score every rubric criterion exactly once; do not invent new ids.`;
9424
+ sum += w * sc.score;
9425
+ }
9426
+ if (Math.abs(sum - result.composite) > .001) return `results[${r}].composite (${result.composite}) does not match Σ(weight × score) (${sum.toFixed(6)}). Composite must be the rubric-weighted sum of per-criterion scores (drift > 0.001).`;
9427
+ }
9428
+ }
9429
+ if (out.deltas) {
9430
+ const labels = new Set(out.results.map((r) => r.variantLabel));
9431
+ for (const key of Object.keys(out.deltas)) {
9432
+ const m = /^(.+?) - (.+)$/.exec(key);
9433
+ if (!m) return `deltas key "${key}" is not of the form "<variantLabel-A> - <variantLabel-B>". Use a single space-hyphen-space separator between labels.`;
9434
+ const [, a, b] = m;
9435
+ if (!labels.has(a) || !labels.has(b)) return `deltas key "${key}" references variantLabel(s) not present in results: ${!labels.has(a) ? `"${a}" missing` : ""}${!labels.has(a) && !labels.has(b) ? ", " : ""}${!labels.has(b) ? `"${b}" missing` : ""}`;
9436
+ }
9437
+ }
9438
+ return null;
9439
+ }
9440
/**
 * Local stable-stringify for cross-variant `successCriteria` byte-
 * equality. Recursively sorts object keys; arrays preserve order
 * (intentional — rubric criteria order is semantically meaningful).
 * Mirrors the canonical-JSON shape `crypto-service` uses for CIDs,
 * without taking on a crypto-service dep just for this comparison.
 *
 * Members that `JSON.stringify` cannot encode (e.g. `undefined`) are
 * handled the way `JSON.stringify` handles them: dropped from objects
 * and written as `null` inside arrays. This keeps the output valid
 * JSON and makes `{ a: 1, b: undefined }` compare equal to `{ a: 1 }`,
 * which is what both become after a JSON round trip.
 *
 * @param value - Any JSON-like value.
 * @returns The canonical string, or `undefined` when the top-level
 *   value itself has no JSON encoding (same as `JSON.stringify`).
 */
function stableStringify(value) {
	if (value === null || typeof value !== "object") return JSON.stringify(value);
	// Array.from also visits holes in sparse arrays, which become `null`.
	if (Array.isArray(value)) return "[" + Array.from(value, (item) => stableStringify(item) ?? "null").join(",") + "]";
	const obj = value;
	const members = [];
	for (const key of Object.keys(obj).sort()) {
		const encoded = stableStringify(obj[key]);
		if (encoded !== void 0) members.push(JSON.stringify(key) + ":" + encoded);
	}
	return "{" + members.join(",") + "}";
}
9453
/**
 * Async preflight for `judge_eval_variant` (#1096 + #943).
 *
 * Phase 1 — per target (all ids resolved concurrently):
 *   - the id resolves through `ctx.resolveTask`,
 *   - the task is a `run_eval`,
 *   - the task is `completed` with a non-null `acceptedAttemptN`.
 *   If any target is unreadable, validation stops after this phase.
 *
 * Phase 2 — grouping:
 *   - every target has a `correlationId`, and they are all equal.
 *   Any error accumulated so far (phase 1 or 2) stops validation here.
 *
 * Phase 3 — against the shared `correlationId`:
 *   - it must not already be sealed (`ctx.findCorrelationSeal`); a
 *     sealed group is final, so a new judging round needs a fresh id,
 *   - every target's `input.successCriteria` must stringify identically
 *     (via `stableStringify`) to the first target's; only the first
 *     divergent target is reported.
 *
 * @param input - Task input carrying `runTaskIds`.
 * @param ctx - Context exposing `resolveTask` and `findCorrelationSeal`.
 * @returns Array of `{ field, message }` errors; empty when valid.
 */
async function validateJudgeEvalVariantInputAsync(input, ctx) {
	const ids = input.runTaskIds;
	const problems = [];
	const report = (field, message) => {
		problems.push({
			field,
			message
		});
	};
	const lookups = await Promise.all(ids.map((taskId) => ctx.resolveTask(taskId)));
	const found = [];
	let anyUnreadable = false;
	for (const [index, taskId] of ids.entries()) {
		const field = `runTaskIds[${index}]`;
		const task = lookups[index];
		if (!task) {
			anyUnreadable = true;
			report(field, `${field}=${taskId} does not resolve to a task you can read`);
			continue;
		}
		found.push(task);
		if (task.taskType !== "run_eval") report(field, `${field}=${taskId} is a ${task.taskType}, not a run_eval`);
		const accepted = task.status === "completed" && task.acceptedAttemptN !== null;
		if (!accepted) report(field, `${field}=${taskId} is not completed with an accepted attempt (status=${task.status}, acceptedAttemptN=${task.acceptedAttemptN})`);
	}
	// With unreadable targets, `found` no longer lines up with `ids`, so
	// the group-level checks below would report misleading indexes.
	if (anyUnreadable || found.length === 0) return problems;
	const NO_CORRELATION = "__null__";
	const distinctIds = new Set(found.map((task) => task.correlationId ?? NO_CORRELATION));
	if (distinctIds.has(NO_CORRELATION)) report("runTaskIds", "one or more run_eval targets have no correlation_id; cannot group as variants");
	if (distinctIds.size > 1) report("runTaskIds", `run_eval targets span multiple correlation_ids (${Array.from(distinctIds).join(", ")}); variants must share one`);
	if (problems.length > 0) return problems;
	const sharedId = found[0].correlationId;
	if (!sharedId) return problems;
	const seal = await ctx.findCorrelationSeal(sharedId);
	if (seal) report("runTaskIds", `correlation_id ${sharedId} is already sealed by ${seal.sealedByTaskType}/${seal.sealedByTaskId} at ${seal.sealedAt}; use a fresh correlation_id for a new judging round`);
	const reference = stableStringify(found[0].input.successCriteria);
	const divergent = found.findIndex((task, index) => index > 0 && stableStringify(task.input.successCriteria) !== reference);
	if (divergent !== -1) report(`runTaskIds[${divergent}]`, `runTaskIds[${divergent}] has a different input.successCriteria than runTaskIds[0]; all variants must share the rubric and gates`);
	return problems;
}
9526
+ /**
9527
+ * Side effect emitted on successful `judge_eval_variant` create:
9528
+ * seal the shared correlation_id atomically with the insert. The
9529
+ * task service applies the seal in the same transaction; a
9530
+ * concurrent second `judge_eval_variant` against the same group
9531
+ * loses the race and is rejected with a clean conflict error.
9532
+ *
9533
+ * The seal applies to the SHARED correlation_id of the targets —
9534
+ * NOT to the judge task's own correlationId (which is typically
9535
+ * null or distinct). The task service derives the correlationId
9536
+ * for the effect from the resolved targets, not from the judge
9537
+ * task row.
9538
+ */
9539
+ async function onCreateJudgeEvalVariant(input, ctx) {
9540
+ const { runTaskIds } = input;
9541
+ const first = await ctx.resolveTask(runTaskIds[0]);
9542
+ if (!first?.correlationId) return [];
9543
+ return [{
9544
+ kind: "sealCorrelation",
9545
+ correlationId: first.correlationId
9546
+ }];
9547
+ }
9209
9548
  //#endregion
9210
9549
  //#region ../tasks/src/task-types/render-pack.ts
9211
9550
  /**
@@ -9245,6 +9584,18 @@ var RenderPackOutput = Type$1.Object({
9245
9584
  $id: "RenderPackOutput",
9246
9585
  additionalProperties: false
9247
9586
  });
9587
/**
 * Async preflight (#1096) for render-pack input: checks that `packId`
 * resolves through `ctx.resolveContextPack`.
 *
 * @param input render-pack input; only `packId` is read.
 * @param ctx   validation context exposing an async `resolveContextPack(id)`.
 * @returns an empty array when the lookup yields a truthy value,
 *          otherwise a single `{ field, message }` error on `packId`.
 */
async function validateRenderPackInputAsync(input, ctx) {
	const packId = input.packId;
	const pack = await ctx.resolveContextPack(packId);
	if (pack) return [];
	return [{
		field: "packId",
		message: `packId ${packId} does not resolve to a context pack you can read`
	}];
}
9248
9599
  //#endregion
9249
9600
  //#region ../tasks/src/task-types/run-eval.ts
9250
9601
  /**
@@ -9352,7 +9703,8 @@ var BUILT_IN_TASK_TYPES = {
9352
9703
  outputSchema: AssessBriefOutput,
9353
9704
  outputKind: "judgment",
9354
9705
  requiresReferences: true,
9355
- validateInput: validateJudgmentInput
9706
+ validateInput: validateJudgmentInput,
9707
+ validateInputAsync: validateAssessBriefInputAsync
9356
9708
  },
9357
9709
  [CURATE_PACK_TYPE]: {
9358
9710
  name: CURATE_PACK_TYPE,
@@ -9368,7 +9720,8 @@ var BUILT_IN_TASK_TYPES = {
9368
9720
  outputSchema: RenderPackOutput,
9369
9721
  outputKind: "artifact",
9370
9722
  requiresReferences: false,
9371
- validateOutput: requireVerificationWhenCriteriaPresent
9723
+ validateOutput: requireVerificationWhenCriteriaPresent,
9724
+ validateInputAsync: validateRenderPackInputAsync
9372
9725
  },
9373
9726
  [JUDGE_PACK_TYPE]: {
9374
9727
  name: JUDGE_PACK_TYPE,
@@ -9377,7 +9730,8 @@ var BUILT_IN_TASK_TYPES = {
9377
9730
  outputKind: "judgment",
9378
9731
  requiresReferences: true,
9379
9732
  validateInput: validateJudgmentInput,
9380
- validateOutput: validateJudgePackOutput
9733
+ validateOutput: validateJudgePackOutput,
9734
+ validateInputAsync: validateJudgePackInputAsync
9381
9735
  },
9382
9736
  [RUN_EVAL_TYPE]: {
9383
9737
  name: RUN_EVAL_TYPE,
@@ -9386,6 +9740,18 @@ var BUILT_IN_TASK_TYPES = {
9386
9740
  outputKind: "artifact",
9387
9741
  requiresReferences: false,
9388
9742
  validateOutput: validateRunEvalOutput
9743
+ },
9744
+ [JUDGE_EVAL_VARIANT_TYPE]: {
9745
+ name: JUDGE_EVAL_VARIANT_TYPE,
9746
+ inputSchema: JudgeEvalVariantInput,
9747
+ outputSchema: JudgeEvalVariantOutput,
9748
+ outputKind: "judgment",
9749
+ requiresReferences: false,
9750
+ validateInput: validateJudgeEvalVariantInput,
9751
+ validateOutput: validateJudgeEvalVariantOutput,
9752
+ validateInputAsync: validateJudgeEvalVariantInputAsync,
9753
+ onCreate: onCreateJudgeEvalVariant,
9754
+ usesSubagents: true
9389
9755
  }
9390
9756
  };
9391
9757
  //#endregion
@@ -9440,6 +9806,15 @@ function validateTaskOutput(taskType, output, input) {
9440
9806
  function getTaskOutputSchema(taskType) {
9441
9807
  return getTaskTypeEntry(taskType)?.outputSchema ?? null;
9442
9808
  }
9809
/**
 * Tells whether sessions for `taskType` should get the generic
 * `subagent` custom tool registered.
 *
 * @param taskType task type name, looked up via `getTaskTypeEntry`.
 * @returns `true` only when the entry exists and its `usesSubagents`
 *          flag is strictly `true`; `false` for unknown task types and
 *          for entries that did not opt in.
 */
function taskTypeUsesSubagents(taskType) {
	const entry = getTaskTypeEntry(taskType);
	if (entry === null || entry === void 0) return false;
	return entry.usesSubagents === true;
}
9443
9818
  //#endregion
9444
9819
  //#region ../tasks/src/wire.ts
9445
9820
  /**
@@ -9676,6 +10051,133 @@ Type$1.Object({
9676
10051
  additionalProperties: false
9677
10052
  });
9678
10053
  //#endregion
10054
+ //#region ../agent-runtime/src/subagent-output-contracts.ts
10055
// Name -> contract store shared by the register/get/list helpers.
var REGISTRY = /* @__PURE__ */ new Map();
/**
 * Register a subagent output contract under `contract.name`.
 *
 * Validation, in order:
 *  1. the name must be present and not blank;
 *  2. the name must be lower_snake_case (`^[a-z][a-z0-9_]*$`);
 *  3. if a DIFFERENT contract object is already stored under that name
 *     and its `parametersSchema` is not the same reference, the call
 *     throws instead of overriding.
 *
 * Re-registering the very same object, or a new object that reuses the
 * same `parametersSchema` reference, is accepted; the stored entry is
 * (re)written with the contract passed in.
 *
 * @param contract object with `name`, `parametersSchema` (and any
 *                 descriptive fields the callers attach).
 * @throws Error on a missing/invalid name or a conflicting schema.
 */
function registerSubagentOutputContract(contract) {
	const { name } = contract;
	const nameMissing = !name || name.trim().length === 0;
	if (nameMissing) throw new Error("subagent output contract name is required");

	const isLowerSnakeCase = /^[a-z][a-z0-9_]*$/.test(name);
	if (!isLowerSnakeCase) throw new Error(`subagent output contract name '${name}' must be lower_snake_case (starts with a letter, then [a-z0-9_]+)`);

	const registered = REGISTRY.get(name);
	// Schemas are compared by reference, not structurally.
	const conflicting = registered && registered !== contract && registered.parametersSchema !== contract.parametersSchema;
	if (conflicting) throw new Error(`subagent output contract '${name}' is already registered with a different schema; refusing to override`);

	REGISTRY.set(name, contract);
}
10075
/**
 * Look up a registered subagent output contract by name.
 *
 * @param name contract name used as the registry key.
 * @returns the stored contract, or `null` when nothing is stored under
 *          that name. Unknown names never throw; the caller decides
 *          how to surface the miss.
 */
function getSubagentOutputContract(name) {
	const found = REGISTRY.get(name);
	return found === void 0 || found === null ? null : found;
}
10083
/**
 * Snapshot of every registered subagent output contract.
 *
 * @returns a new array of the registry's values in the Map's iteration
 *          order; mutating the array does not affect the registry.
 */
function listSubagentOutputContracts() {
	return Array.from(REGISTRY.values());
}
10091
+ //#endregion
10092
+ //#region ../agent-runtime/src/built-in-contract-registrations.ts
10093
/**
 * Built-in subagent output contracts (#1087, #943).
 *
 * Registration lives in an exported function rather than a bare
 * module-init side effect because:
 *
 * - The registry is process-global and a module body runs only once
 *   per process, so tests that empty the registry via
 *   `__resetSubagentOutputContractsForTests()` would otherwise have no
 *   way to restore the built-ins (PR #1101 review M4).
 * - An explicit function can be invoked once at module load AND again
 *   from test setup after a reset.
 * - `registerSubagentOutputContract` accepts a repeat registration
 *   that reuses the same schema reference, so calling this function
 *   more than once in a process is safe.
 *
 * Adding a built-in: append it to the list inside this function. Keep
 * every built-in here so the full set stays auditable in one place.
 */
function registerBuiltInSubagentContracts() {
	const builtIns = [{
		name: "judge_eval_variant_result",
		description: "Per-variant grading result produced by a subagent of judge_eval_variant: scores against the shared rubric, composite, and a 1-3 sentence verdict for a single variant.",
		parametersSchema: JudgeEvalVariantResult
	}];
	for (const contract of builtIns) registerSubagentOutputContract(contract);
}
// Module-load registration of the built-ins.
registerBuiltInSubagentContracts();
10125
+ //#endregion
10126
+ //#region ../agent-runtime/src/context-bindings.ts
10127
+ var PROMPT_SEPARATOR = "\n\n---\n\n";
10128
+ /**
10129
+ * Resolve `task.input.context[]` into delivered side-effects (skills
10130
+ * persisted via `deliver.skill`) and prompt fragments
10131
+ * (`systemPromptPrefix`, `userInlineSuffix`) the caller weaves into the
10132
+ * built prompt.
10133
+ *
10134
+ * Per-binding semantics (V1):
10135
+ * - `skill` → `deliver.skill({ slug, content })` once per ref.
10136
+ * Slug collisions on distinct contents are
10137
+ * refused loudly.
10138
+ * - `prompt_prefix` → content appended to `systemPromptPrefix` with
10139
+ * the canonical `\n\n---\n\n` separator (in
10140
+ * declared order).
10141
+ * - `user_inline` → content appended to `userInlineSuffix` in
10142
+ * declared order, same separator.
10143
+ *
10144
+ * No fetching, no hashing — bytes are inlined in `ContextRef.content`,
10145
+ * and the task's `inputCid` already pins the entire input. The imposer
10146
+ * chose these bytes; the resolver just dispatches them.
10147
+ *
10148
+ * The function is pure with respect to its arguments: file writes are
10149
+ * confined to the injected `deliver` callback, which makes the
10150
+ * resolver trivial to test.
10151
+ */
10152
+ async function resolveTaskContext(args) {
10153
+ const promptParts = [];
10154
+ const userParts = [];
10155
+ const injected = [];
10156
+ const usedSlugs = /* @__PURE__ */ new Map();
10157
+ for (const ref of args.context) {
10158
+ if (ref.binding === "skill") {
10159
+ const prior = usedSlugs.get(ref.slug);
10160
+ if (prior !== void 0) {
10161
+ if (prior !== ref.content) throw new Error(`slug collision on '${ref.slug}': two skill entries share the same slug but have different content`);
10162
+ injected.push(ref);
10163
+ continue;
10164
+ }
10165
+ usedSlugs.set(ref.slug, ref.content);
10166
+ await args.deliver.skill({
10167
+ slug: ref.slug,
10168
+ content: ref.content
10169
+ });
10170
+ } else if (ref.binding === "prompt_prefix") promptParts.push(ref.content);
10171
+ else userParts.push(ref.content);
10172
+ injected.push(ref);
10173
+ }
10174
+ return {
10175
+ injected,
10176
+ systemPromptPrefix: promptParts.join(PROMPT_SEPARATOR),
10177
+ userInlineSuffix: userParts.join(PROMPT_SEPARATOR)
10178
+ };
10179
+ }
10180
+ //#endregion
9679
10181
  //#region ../agent-runtime/src/output-tools.ts
9680
10182
  /**
9681
10183
  * Submit-output tool contract.
@@ -10148,6 +10650,109 @@ function buildFulfillBriefUserPrompt(input, ctx) {
10148
10650
  ].filter(Boolean).join("\n");
10149
10651
  }
10150
10652
  //#endregion
10653
+ //#region ../agent-runtime/src/prompts/judge-eval-variant.ts
10654
/**
 * Build the first user-message prompt for a `judge_eval_variant` task
 * (#943 Slice 2).
 *
 * The prompt tells the parent agent to fan out: one `subagent` tool
 * call (#1087) per `runTaskIds[i]`, each returning a structured
 * `judge_eval_variant_result`, which the parent then assembles into
 * the final `judge_eval_variant` output. The parent is told not to
 * grade in its own session so per-variant grading stays isolated.
 *
 * Layout of the returned text: heading and intro, task/diary ids, the
 * numbered target list, the rubric (optional preamble plus a markdown
 * table with one row per criterion), grading instructions, composite
 * and delta rules, then the block produced by `buildFinalOutputBlock`.
 *
 * Note: every empty string is removed before the final join, so the
 * `""` spacer entries below do not produce blank lines. The only
 * blank lines in the output come from newlines embedded in an entry
 * (the heading, and the preamble when present).
 *
 * @param input validated task input; reads `runTaskIds` and
 *              `successCriteria.rubric` (`preamble`, `criteria[]` with
 *              `id`, `weight`, `scoring`, `description`).
 * @param ctx   `{ taskId, diaryId }` interpolated into the prompt.
 * @returns the prompt as a single newline-joined string.
 * @throws Error when `successCriteria.rubric` is absent.
 */
function buildJudgeEvalVariantUserPrompt(input, ctx) {
	const { runTaskIds, successCriteria } = input;
	const { rubric } = successCriteria;
	if (!rubric) throw new Error("judge_eval_variant requires successCriteria.rubric — none present");

	// Keep a free-text description inside one markdown table cell:
	// escape backslashes and pipes, flatten line breaks to spaces.
	const toTableCell = (text) => text.replace(/\\/g, "\\\\").replace(/\|/g, "\\|").replace(/\r?\n/g, " ");

	const rubricRows = [];
	for (const criterion of rubric.criteria) {
		rubricRows.push(`| \`${criterion.id}\` | ${criterion.weight.toFixed(3)} | ${criterion.scoring} | ${toTableCell(criterion.description)} |`);
	}
	const criteriaTable = rubricRows.join("\n");

	const numberedTargets = [];
	runTaskIds.forEach((id, index) => {
		numberedTargets.push(`${index + 1}. \`${id}\``);
	});
	const targetsBlock = numberedTargets.join("\n");

	const sketchLines = [
		"{",
		" \"results\": [",
		" {",
		" \"runTaskId\": \"<runTaskIds[i]>\",",
		" \"variantLabel\": \"<from variant input>\",",
		" \"scores\": [ { \"criterionId\": \"...\", \"score\": 0..1, \"rationale\": \"...\", \"assertions\": [...]? } ],",
		" \"composite\": <Σ(weight × score), 0..1>,",
		" \"verdict\": \"<1-3 sentences>\"",
		" },",
		" ...one entry per runTaskIds[i], same order",
		" ],",
		" \"deltas\": { \"<labelA> - <labelB>\": <composite(A) - composite(B)> }, // optional",
		" \"judgeModel\": \"<id>\", // optional",
		" \"traceparent\": \"<from claim>\"",
		"}"
	];
	const finalOutputBlock = buildFinalOutputBlock({
		taskType: "judge_eval_variant",
		outputSchemaName: "JudgeEvalVariantOutput",
		shapeSketch: sketchLines.join("\n")
	});

	const introSection = [
		"# Judge Eval Variants\n",
		`You are grading ${runTaskIds.length} variants of a single run_eval scenario`,
		"against ONE shared rubric. Your job is fan-out-and-collect — you do not",
		"grade yourself.",
		"",
		`Task id: \`${ctx.taskId}\``,
		`Diary: \`${ctx.diaryId}\``,
		""
	];
	const targetsSection = [
		"### Targets (variants to grade)",
		"",
		targetsBlock,
		"",
		"Each target is a completed `run_eval` task in the same correlation group.",
		"Read its accepted attempt via `moltnet_get_task` / `moltnet_list_task_attempts`",
		"to see the producer's output before grading.",
		""
	];
	const rubricSection = [
		"### Rubric",
		"",
		rubric.preamble ? `${rubric.preamble}\n` : "",
		"| Criterion | Weight | Scoring | Description |",
		"| --- | --- | --- | --- |",
		criteriaTable,
		""
	];
	const gradingSection = [
		"### How to grade",
		"",
		"For EACH `runTaskIds[i]`:",
		"",
		"1. Call the `subagent` custom tool with:",
		" - `task`: a brief instructing the subagent to grade ONLY that variant",
		" against the rubric above; include the target task id and the rubric",
		" verbatim. The subagent has the same MoltNet tools and can fetch the",
		" accepted attempt output independently.",
		" - `output_schema`: `\"judge_eval_variant_result\"`",
		"2. Receive the subagent's structured `judge_eval_variant_result` payload.",
		"3. Append it to your `results[]` array, **in the same order as input.runTaskIds**.",
		"",
		"Do NOT score any variant in your own session. The whole point of the",
		"subagent fan-out is per-variant context isolation — grading two variants",
		"back-to-back in one session lets the second be biased by the first.",
		""
	];
	const compositeSection = [
		"### Composite arithmetic",
		"",
		"Each `composite` MUST equal `Σ(criterion.weight × score)` over the rubric",
		"criteria. Drift > 0.001 is rejected. Subagents are instructed to compute it",
		"themselves; double-check before assembling the final output.",
		""
	];
	const deltasSection = [
		"### Deltas (optional)",
		"",
		"If useful, populate `deltas` with pairwise composite differences keyed by",
		"`\"<variantLabel-A> - <variantLabel-B>\"` (single space-hyphen-space). Both",
		"labels must appear in `results`. Omit `deltas` entirely if not used.",
		""
	];

	const promptLines = [
		...introSection,
		...targetsSection,
		...rubricSection,
		...gradingSection,
		...compositeSection,
		...deltasSection,
		finalOutputBlock
	];
	// Drops the absent-preamble placeholder — and every other empty entry.
	return promptLines.filter((line) => line !== "").join("\n");
}
10755
+ //#endregion
10151
10756
  //#region ../agent-runtime/src/prompts/judge-pack.ts
10152
10757
  function buildJudgePackUserPrompt(input, ctx) {
10153
10758
  const { renderedPackId, sourcePackId, successCriteria } = input;
@@ -10454,6 +11059,15 @@ function buildTaskUserPrompt(task, ctx) {
10454
11059
  diaryId: ctx.diaryId,
10455
11060
  taskId: ctx.taskId
10456
11061
  });
11062
+ case JUDGE_EVAL_VARIANT_TYPE:
11063
+ if (!Value.Check(JudgeEvalVariantInput, task.input)) {
11064
+ const errors = [...Value.Errors(JudgeEvalVariantInput, task.input)];
11065
+ throw new Error(`judge_eval_variant input failed validation: ${JSON.stringify(errors.slice(0, 3))}`);
11066
+ }
11067
+ return buildJudgeEvalVariantUserPrompt(task.input, {
11068
+ diaryId: ctx.diaryId,
11069
+ taskId: ctx.taskId
11070
+ });
10457
11071
  case RUN_EVAL_TYPE:
10458
11072
  if (!Value.Check(RunEvalInput, task.input)) {
10459
11073
  const errors = [...Value.Errors(RunEvalInput, task.input)];
@@ -14128,6 +14742,190 @@ function buildRuntimeInstructor(ctx) {
14128
14742
  ].join("\n");
14129
14743
  }
14130
14744
  //#endregion
14745
+ //#region src/runtime/subagent-tool.ts
14746
// Name of the tool a subagent calls to hand its structured result back.
var SUBAGENT_SUBMIT_TOOL_NAME = "submit_subagent_output";
/**
 * Schema of the arguments the parent LLM passes to the `subagent` tool.
 *
 * - `task`          — non-empty natural-language instructions for the
 *                     subagent, authored by the parent per call.
 * - `output_schema` — non-empty name of a registered
 *                     SubagentOutputContract; it is resolved when the
 *                     tool runs, and unknown names produce a tool error.
 *
 * No other properties are accepted.
 */
var SubagentToolParameters = Type$1.Object({
	task: Type$1.String({
		minLength: 1,
		description: "Natural-language instructions for the subagent. The subagent starts with a fresh conversation and a narrowed system prompt; this is the only context it has from you."
	}),
	output_schema: Type$1.String({
		minLength: 1,
		description: "Name of a registered subagent output contract. The subagent must submit a structured payload via `submit_subagent_output` matching this contract."
	})
}, { additionalProperties: false });
// Five minutes, in milliseconds; used when no usable timeout is supplied.
var DEFAULT_SUBAGENT_TIMEOUT_MS = 5 * 60 * 1e3;
14767
/**
 * Build the `subagent` custom tool for a parent session.
 *
 * Each call of the tool:
 *  1. validates `{ task, output_schema }` and resolves the named
 *     output contract (both failures are returned as tool errors);
 *  2. builds a fresh inner session whose tools are
 *     `args.inheritedCustomTools` plus a one-off
 *     `submit_subagent_output` tool typed by the contract schema;
 *  3. prompts the inner session with `task`, bounded by a timeout and
 *     by the parent's cancel signal;
 *  4. returns the captured payload JSON-stringified, or a tool error
 *     when the run was aborted, threw, or never submitted.
 *
 * Abort handling: a run is never started once an abort has been
 * requested. If the parent signal is already aborted the tool returns
 * the cancellation error right away (before building a session), and
 * the same guard is re-checked immediately before prompting in case
 * the signal fired while the session was being built. When an abort
 * (timeout or cancellation) coincides with `prompt()` rejecting, the
 * abort reason is reported rather than the raw rejection message.
 *
 * NOTE(review): the inner session is not torn down here after the
 * prompt completes — confirm whether `AgentSession` needs explicit
 * disposal by its creator.
 *
 * @param args construction options read by this function:
 *   - `buildAgentSession?` — session factory override (defaults to the
 *     module's `buildAgentSession`);
 *   - `mountPath`, `piAuthDir`, `modelHandle`, `agentName` — forwarded
 *     to the session factory;
 *   - `inheritedCustomTools` — tools every subagent receives;
 *   - `parentRuntimeInstructor` — system-prompt text placed before the
 *     subagent instructor;
 *   - `parentTaskId`, `parentTaskType`, `parentAttemptN` — used in the
 *     instructor text and OTel span attributes;
 *   - `parentCancelSignal?` — AbortSignal that cancels running subagents;
 *   - `timeoutMs?` — per-call limit; `undefined` or negative selects
 *     `DEFAULT_SUBAGENT_TIMEOUT_MS`, `0` disables the timer.
 * @returns `{ tool, getCallCount }` — the tool definition and a getter
 *          for the number of calls that passed parameter and contract
 *          validation.
 */
function createSubagentTool(args) {
	const buildSession = args.buildAgentSession ?? buildAgentSession;
	let callCount = 0;
	// Error text for an aborted run; shared by the early-exit and post-run paths.
	const describeAbort = (reason, timeoutMs) => `subagent: ${reason === "subagent_timed_out" ? `subagent timed out after ${timeoutMs}ms` : "parent task was cancelled"}. The parent should fail this task or retry with a clearer scope.`;
	return {
		tool: defineTool({
			name: "subagent",
			label: "Delegate to subagent",
			description: subagentToolDescription(),
			parameters: SubagentToolParameters,
			async execute(_id, params) {
				if (!Value.Check(SubagentToolParameters, params)) return toolError(`subagent: invalid parameters: ${JSON.stringify([...Value.Errors(SubagentToolParameters, params)].slice(0, 3))}`);
				const { task, output_schema } = params;
				const contract = getSubagentOutputContract(output_schema);
				if (!contract) return toolError(`subagent: unknown output_schema "${output_schema}". Registered contracts: [${listSubagentOutputContracts().map((c) => c.name).join(", ")}]`);
				callCount += 1;
				const callIndex = callCount;
				const timeoutMs = args.timeoutMs === void 0 || args.timeoutMs < 0 ? DEFAULT_SUBAGENT_TIMEOUT_MS : args.timeoutMs;
				// Parent already cancelled: do not build or run anything.
				if (args.parentCancelSignal?.aborted) return toolError(describeAbort("parent_cancelled", timeoutMs));
				let captured = null;
				const submitTool = defineTool({
					name: SUBAGENT_SUBMIT_TOOL_NAME,
					label: `Submit ${output_schema}`,
					description: `Submit your structured output for this subagent task. Call exactly once when done. Args MUST match the ${output_schema} contract; mismatches return a tool error you can recover from in the same session.`,
					parameters: contract.parametersSchema,
					async execute(_innerId, innerParams) {
						// Invalid payloads are reported back so the subagent can retry in-session.
						if (!Value.Check(contract.parametersSchema, innerParams)) return toolError(`submit_subagent_output: schema validation failed: ${[...Value.Errors(contract.parametersSchema, innerParams)].slice(0, 3).map((e) => `${e.path}: ${e.message}`).join("; ")}. Re-call with a corrected payload.`);
						captured = innerParams;
						return {
							content: [{
								type: "text",
								text: "Output captured. Subagent session will terminate; no further action needed."
							}],
							details: { captured: true },
							terminate: true
						};
					}
				});
				const subagentInstructor = buildSubagentInstructor({
					contractName: output_schema,
					contractDescription: contract.description,
					parentTaskId: args.parentTaskId,
					callIndex
				});
				const session = await buildSession({
					mountPath: args.mountPath,
					piAuthDir: args.piAuthDir,
					modelHandle: args.modelHandle,
					agentName: args.agentName,
					customTools: [...args.inheritedCustomTools, submitTool],
					appendSystemPrompt: [args.parentRuntimeInstructor, subagentInstructor],
					// Subagents start without any skills.
					skillsOverride: () => ({
						skills: [],
						diagnostics: []
					}),
					otelSpanAttrs: {
						"moltnet.task.id": args.parentTaskId,
						"moltnet.task.type": args.parentTaskType,
						"moltnet.task.attempt": args.parentAttemptN,
						"moltnet.subagent.contract": output_schema,
						"moltnet.subagent.index": callIndex
					}
				});
				let abortReason = null;
				let abortInvoked = false;
				// One-shot: the first trigger (timeout or cancel) records the reason.
				const fireAbort = (reason) => {
					if (abortInvoked) return;
					abortInvoked = true;
					abortReason = reason;
					session.abort().catch((err) => {
						const message = err instanceof Error ? err.message : String(err);
						process.stderr.write(`[subagent] inner session.abort() failed: ${message}\n`);
					});
				};
				const cancelListener = args.parentCancelSignal ? (() => {
					const signal = args.parentCancelSignal;
					const listener = () => fireAbort("parent_cancelled");
					// The signal may have fired while the session was being built.
					if (signal.aborted) listener();
					else signal.addEventListener("abort", listener, { once: true });
					return () => signal.removeEventListener("abort", listener);
				})() : null;
				const timeoutHandle = timeoutMs > 0 ? setTimeout(() => fireAbort("subagent_timed_out"), timeoutMs) : null;
				let promptFailure = null;
				try {
					// Never start a run that has already been aborted: fireAbort is
					// one-shot, so nothing could stop the run afterwards.
					if (abortReason === null) await session.prompt(task);
				} catch (err) {
					promptFailure = err instanceof Error ? err.message : String(err);
				} finally {
					if (timeoutHandle) clearTimeout(timeoutHandle);
					if (cancelListener) cancelListener();
				}
				// An abort outranks a rejection it most likely caused.
				if (abortReason !== null) return toolError(describeAbort(abortReason, timeoutMs));
				if (promptFailure !== null) return toolError(`subagent: inner session.prompt() threw: ${promptFailure}`);
				if (captured === null) return toolError(`subagent: inner session ended without calling ${SUBAGENT_SUBMIT_TOOL_NAME}. The parent should retry with clearer instructions or fail the task.`);
				return {
					content: [{
						type: "text",
						text: JSON.stringify(captured)
					}],
					details: {
						captured: true,
						contract: output_schema,
						callIndex
					}
				};
			}
		}),
		getCallCount: () => callCount
	};
}
14878
/**
 * Description text shown to the parent LLM for the `subagent` tool.
 *
 * Three paragraphs separated by a blank line: what the tool does, what
 * the subagent starts with and must do (call the submit tool named by
 * `SUBAGENT_SUBMIT_TOOL_NAME`), and how success/failure is reported.
 *
 * @returns the description as one string.
 */
function subagentToolDescription() {
	const summary = ["Delegate a sub-task to a fresh subagent session with isolated context."];
	const environment = [
		"The subagent starts with no conversation history and only the `task` ",
		"string you provide as its instructions. It runs in the same VM with ",
		"the same tools you have (Gondolin-routed Read/Write/Edit/Bash, ",
		"moltnet_* tools), and is expected to call ",
		`\`${SUBAGENT_SUBMIT_TOOL_NAME}\` with a payload matching the named `,
		"contract before its session ends."
	];
	const outcomes = [
		"On success, the tool result is the JSON-stringified subagent payload.",
		"On failure (unknown contract, validation error, subagent did not ",
		"submit) the tool returns isError:true with a recoverable message."
	];
	// Lines within a paragraph end with "\n"; paragraphs are one blank line apart.
	return [summary, environment, outcomes].map((paragraph) => paragraph.join("\n")).join("\n\n");
}
14894
/**
 * System-prompt section appended for a subagent session.
 *
 * States which parent task and call number the subagent belongs to,
 * names and describes its output contract, and lists the session
 * rules (submit exactly once, stay within the given task, parent
 * runtime invariants apply, no nested delegation).
 *
 * @param args `{ parentTaskId, callIndex, contractName, contractDescription }`.
 * @returns the instructor text as one string.
 */
function buildSubagentInstructor(args) {
	const heading = ["# You are a subagent"];
	const origin = [`Parent task: \`${args.parentTaskId}\` (subagent call #${args.callIndex}).`];
	const contract = [
		`Your assigned output contract is \`${args.contractName}\`:`,
		`${args.contractDescription}`
	];
	const rulesHeading = ["Rules for this session:"];
	const rules = [
		`- You MUST call \`${SUBAGENT_SUBMIT_TOOL_NAME}\` exactly once with a `,
		" payload matching the contract above. Your session terminates on ",
		" the valid call.",
		"- The parent's message above is your task. Do not invent additional ",
		" steps the parent did not request.",
		"- All MoltNet runtime invariants from the parent runtime instructor ",
		" apply (diary discipline, gh-auth pattern, etc.) IF you take any ",
		" action that would trigger them. Most subagents do not commit code ",
		" or open PRs — only do so if your task message explicitly requires it.",
		"- You do NOT have access to the `subagent` tool. Do not attempt nested ",
		" delegation; do the work yourself."
	];
	// Paragraphs are separated by exactly one blank line.
	return [heading, origin, contract, rulesHeading, rules].map((paragraph) => paragraph.join("\n")).join("\n\n");
}
14918
/**
 * Wrap a message in the tool-result shape used for failures.
 *
 * @param text message placed in the single text content item.
 * @returns `{ content, details: { captured: false }, isError: true }`.
 */
function toolError(text) {
	const message = {
		type: "text",
		text
	};
	return {
		content: [message],
		details: { captured: false },
		isError: true
	};
}
14928
+ //#endregion
14131
14929
  //#region src/runtime/task-output.ts
14132
14930
  var METER_NAME = "@themoltnet/pi-extension/task-output";
14133
14931
  var parseResultCounter = null;
@@ -14439,6 +15237,7 @@ async function executePiTask(claimedTask, reporter, opts) {
14439
15237
  const taskTeamId = task.teamId ?? "";
14440
15238
  let reporterOpen = false;
14441
15239
  let session = null;
15240
+ let subagentHandle = null;
14442
15241
  const finalUsage = emptyUsage(opts.provider, opts.model);
14443
15242
  let cancelListener = null;
14444
15243
  const makeFailedOutput = (code, message, usage = finalUsage) => ({
@@ -14556,47 +15355,55 @@ async function executePiTask(claimedTask, reporter, opts) {
14556
15355
  });
14557
15356
  const piAuthDir = process.env.PI_CODING_AGENT_DIR ?? join(homedir(), ".pi", "agent");
14558
15357
  const modelHandle = getModel(opts.provider, opts.model);
14559
- const piOtelExtension = createPiOtelExtension({
14560
- agentName: opts.agentName,
14561
- spanAttributes: {
14562
- "moltnet.task.id": task.id,
14563
- "moltnet.task.attempt": attemptN,
14564
- "moltnet.task.type": task.taskType
14565
- }
14566
- });
14567
- const appendSystemPrompt = [buildRuntimeInstructor({
15358
+ const runtimeInstructor = buildRuntimeInstructor({
14568
15359
  taskId: task.id,
14569
15360
  taskType: task.taskType,
14570
15361
  attemptN,
14571
15362
  diaryId,
14572
15363
  agentName: opts.agentName,
14573
15364
  correlationId: task.correlationId ?? null
14574
- })];
15365
+ });
15366
+ const appendSystemPrompt = [runtimeInstructor];
14575
15367
  if (injectedContext.systemPromptPrefix) appendSystemPrompt.push(injectedContext.systemPromptPrefix);
14576
15368
  const injectedSkills = injectedContext.skills;
14577
- const resourceLoader = new DefaultResourceLoader({
14578
- cwd: mountPath,
14579
- agentDir: piAuthDir,
14580
- extensionFactories: [piOtelExtension],
15369
+ const parentSubagentTools = [];
15370
+ if (taskTypeUsesSubagents(task.taskType)) {
15371
+ subagentHandle = createSubagentTool({
15372
+ mountPath,
15373
+ piAuthDir,
15374
+ modelHandle,
15375
+ agentName: opts.agentName,
15376
+ inheritedCustomTools: [...gondolinCustomTools, ...moltnetTools],
15377
+ parentRuntimeInstructor: runtimeInstructor,
15378
+ parentTaskId: task.id,
15379
+ parentTaskType: task.taskType,
15380
+ parentAttemptN: attemptN,
15381
+ parentCancelSignal: reporter.cancelSignal
15382
+ });
15383
+ parentSubagentTools.push(subagentHandle.tool);
15384
+ }
15385
+ session = await buildAgentSession({
15386
+ mountPath,
15387
+ piAuthDir,
15388
+ modelHandle,
15389
+ agentName: opts.agentName,
15390
+ customTools: [
15391
+ ...gondolinCustomTools,
15392
+ ...moltnetTools,
15393
+ ...submitTools,
15394
+ ...parentSubagentTools
15395
+ ],
14581
15396
  appendSystemPrompt,
14582
15397
  skillsOverride: () => ({
14583
15398
  skills: injectedSkills,
14584
15399
  diagnostics: []
14585
- })
15400
+ }),
15401
+ otelSpanAttrs: {
15402
+ "moltnet.task.id": task.id,
15403
+ "moltnet.task.attempt": attemptN,
15404
+ "moltnet.task.type": task.taskType
15405
+ }
14586
15406
  });
14587
- await resourceLoader.reload();
14588
- session = (await createAgentSession({
14589
- agentDir: piAuthDir,
14590
- cwd: mountPath,
14591
- model: modelHandle,
14592
- customTools: [
14593
- ...gondolinCustomTools,
14594
- ...moltnetTools,
14595
- ...submitTools
14596
- ],
14597
- sessionManager: SessionManager.inMemory(),
14598
- resourceLoader
14599
- })).session;
14600
15407
  } catch (err) {
14601
15408
  const message = err instanceof Error ? err.message : String(err);
14602
15409
  await emit("error", {
@@ -14667,6 +15474,10 @@ async function executePiTask(claimedTask, reporter, opts) {
14667
15474
  phase: "session_prompt"
14668
15475
  });
14669
15476
  }
15477
+ if (subagentHandle && subagentHandle.getCallCount() > 0) await emit("info", {
15478
+ event: "subagent_summary",
15479
+ callCount: subagentHandle.getCallCount()
15480
+ });
14670
15481
  await Promise.all(recordingPromise);
14671
15482
  const cancelled = reporter.cancelSignal.aborted;
14672
15483
  let parsedOutput = null;
@@ -15126,4 +15937,4 @@ function moltnetExtension(pi) {
15126
15937
  registerMoltnetReflectCommand(pi, state);
15127
15938
  }
15128
15939
  //#endregion
15129
- export { HOST_EXEC_DEFAULT_BASE_ENV, activateAgentEnv, createGondolinBashOps, createGondolinEditOps, createGondolinReadOps, createGondolinWriteOps, createMoltNetTools, createPiOtelExtension, createPiTaskExecutor, moltnetExtension as default, ensureSnapshot, executePiTask, findMainWorktree, injectTaskContext, loadCredentials, resumeVm, toGuestPath };
15940
+ export { HOST_EXEC_DEFAULT_BASE_ENV, activateAgentEnv, buildAgentSession, createGondolinBashOps, createGondolinEditOps, createGondolinReadOps, createGondolinWriteOps, createMoltNetTools, createPiOtelExtension, createPiTaskExecutor, createSubagentTool, moltnetExtension as default, ensureSnapshot, executePiTask, findMainWorktree, injectTaskContext, loadCredentials, resumeVm, toGuestPath };
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@themoltnet/pi-extension",
3
- "version": "0.14.0",
3
+ "version": "0.15.1",
4
4
  "type": "module",
5
5
  "description": "MoltNet pi extension — sandboxed tool execution in Gondolin VMs with MoltNet identity and persistent memory",
6
6
  "license": "MIT",
@@ -31,8 +31,8 @@
31
31
  "@earendil-works/gondolin": "^0.9.1",
32
32
  "@opentelemetry/api": "^1.9.0",
33
33
  "@sinclair/typebox": "^0.34.0",
34
- "@themoltnet/agent-runtime": "0.12.0",
35
- "@themoltnet/sdk": "0.100.0"
34
+ "@themoltnet/agent-runtime": "0.14.0",
35
+ "@themoltnet/sdk": "0.101.0"
36
36
  },
37
37
  "peerDependencies": {
38
38
  "@earendil-works/pi-coding-agent": ">=0.74.0",