@themoltnet/pi-extension 0.10.0 → 0.12.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -7,11 +7,10 @@ import { createHash } from "node:crypto";
7
7
  import crypto, { createHash as createHash$1 } from "crypto";
8
8
  import { readFile } from "node:fs/promises";
9
9
  import { homedir } from "node:os";
10
- import { Type, complete, getModel } from "@mariozechner/pi-ai";
10
+ import { Type, getModel } from "@mariozechner/pi-ai";
11
11
  import { RealFSProvider, ShadowProvider, VM, VmCheckpoint, createHttpHooks, createShadowPathPredicate, ensureImageSelector, loadGuestAssets } from "@earendil-works/gondolin";
12
12
  import { parseEnv } from "node:util";
13
- import { fileURLToPath } from "node:url";
14
- import { SpanStatusCode, context, trace } from "@opentelemetry/api";
13
+ import { SpanStatusCode, context, metrics, trace } from "@opentelemetry/api";
15
14
  import { FormatRegistry, Type as Type$1 } from "@sinclair/typebox";
16
15
  import { Value } from "@sinclair/typebox/value";
17
16
  //#region \0rolldown/runtime.js
@@ -3848,7 +3847,7 @@ var cidSymbol = Symbol.for("@ipld/js-cid/CID");
3848
3847
  * naturally prevents field delimiter collision.
3849
3848
  */
3850
3849
  /** SHA-256 multicodec code per multihash table */
3851
- var SHA2_256_CODE$1 = 18;
3850
+ var SHA2_256_CODE = 18;
3852
3851
  /**
3853
3852
  * Build the canonical JSON input for content hashing.
3854
3853
  *
@@ -3880,7 +3879,7 @@ function computeCanonicalHash(entryType, title, content, tags) {
3880
3879
  * Example output: "bafkreig..."
3881
3880
  */
3882
3881
  function computeContentCid(entryType, title, content, tags) {
3883
- const digest = create(SHA2_256_CODE$1, computeCanonicalHash(entryType, title, content, tags));
3882
+ const digest = create(SHA2_256_CODE, computeCanonicalHash(entryType, title, content, tags));
3884
3883
  return CID.createV1(85, digest).toString(base32);
3885
3884
  }
3886
3885
  var { p: P, n: N, Gx, Gy, a: _a, d: _d } = {
@@ -7135,159 +7134,6 @@ var registerSandboxCommand = (pi, state) => {
7135
7134
  });
7136
7135
  };
7137
7136
  //#endregion
7138
- //#region src/moltnet/judge/assets.ts
7139
- /** Default fidelity rubric — kept verbatim from the Go judge. */
7140
- var DEFAULT_RUBRIC = `Evaluate the rendered content against the source entries on three axes:
7141
-
7142
- COVERAGE (0.0-1.0):
7143
- - Identify each distinct topic/fact in the source entries
7144
- - Check if each is represented in the rendered content
7145
- - Score = (represented topics) / (total source topics)
7146
- - A topic can be restructured or summarized but must be present
7147
-
7148
- GROUNDING (0.0-1.0):
7149
- - Identify each distinct claim/fact in the rendered content
7150
- - Check if each is traceable to a specific source entry
7151
- - Score = (grounded claims) / (total rendered claims)
7152
- - Restructured content is fine if the underlying fact comes from a source
7153
-
7154
- FAITHFULNESS (0.0-1.0):
7155
- - For content that IS represented, check semantic accuracy
7156
- - Is the meaning preserved? Any distortions, inversions, or misquotes?
7157
- - Score = (accurate representations) / (total representations)
7158
- - Summarization is fine; misrepresentation is not
7159
- `;
7160
- /** Judge system prompt — kept verbatim from the Go judge signature. */
7161
- var JUDGE_SYSTEM_PROMPT = `You are a fidelity judge for rendered context packs. Your job is to evaluate
7162
- whether a rendered markdown document faithfully represents its source entries.
7163
-
7164
- Score each axis independently and precisely. Be critical — the purpose is to
7165
- catch content drift, hallucination, and cherry-picking.
7166
-
7167
- You will be given three inputs:
7168
-
7169
- 1. \`source_entries\` — the original source entries from the context pack, in
7170
- markdown format.
7171
- 2. \`rendered_content\` — the agent-rendered markdown derived from the source
7172
- entries.
7173
- 3. \`rubric\` — the fidelity scoring rubric with criteria definitions.
7174
-
7175
- Return a JSON object matching the requested schema with these fields:
7176
-
7177
- - \`coverage\` (number, 0.0–1.0): fraction of source entries represented in
7178
- rendered content. 1.0 means all source entries are covered.
7179
- - \`grounding\` (number, 0.0–1.0): fraction of rendered content traceable to
7180
- source entries. 1.0 means everything comes from sources.
7181
- - \`faithfulness\` (number, 0.0–1.0): semantic accuracy of represented content.
7182
- 1.0 means source content is accurately represented.
7183
- - \`reasoning\` (string): detailed step-by-step analysis explaining each score.
7184
-
7185
- Respond with ONLY a single JSON object. No prose before or after.
7186
- `;
7187
- //#endregion
7188
- //#region src/moltnet/judge/fidelity.ts
7189
- /**
7190
- * Pi-native port of the Go fidelity judge
7191
- * (libs/dspy-adapters/fidelity/fidelity.go).
7192
- *
7193
- * Same inputs (source_entries, rendered_content, rubric), same outputs
7194
- * (coverage, grounding, faithfulness, reasoning). Uses pi-ai `complete()`
7195
- * instead of dspy-go; no process-global state.
7196
- */
7197
- var JSON_FENCE_RE = /```(?:json)?\s*([\s\S]*?)```/i;
7198
- function extractJson(text) {
7199
- const fenceMatch = text.match(JSON_FENCE_RE);
7200
- if (fenceMatch && fenceMatch[1]) return fenceMatch[1].trim();
7201
- const firstBrace = text.indexOf("{");
7202
- const lastBrace = text.lastIndexOf("}");
7203
- if (firstBrace >= 0 && lastBrace > firstBrace) return text.slice(firstBrace, lastBrace + 1);
7204
- return text.trim();
7205
- }
7206
- function clamp01(value) {
7207
- const n = typeof value === "number" ? value : Number(value);
7208
- if (!Number.isFinite(n)) return 0;
7209
- if (n < 0) return 0;
7210
- if (n > 1) return 1;
7211
- return n;
7212
- }
7213
- function coerceString(value) {
7214
- if (typeof value === "string") return value;
7215
- if (value === null || value === void 0) return "";
7216
- if (typeof value === "number" || typeof value === "boolean") return String(value);
7217
- try {
7218
- return JSON.stringify(value);
7219
- } catch {
7220
- return "";
7221
- }
7222
- }
7223
- function parseScores(raw) {
7224
- const jsonText = extractJson(raw);
7225
- let parsed;
7226
- try {
7227
- parsed = JSON.parse(jsonText);
7228
- } catch (err) {
7229
- throw new Error(`judge returned an invalid structured response: ${err.message}\n---raw---\n${raw}`);
7230
- }
7231
- const coverage = clamp01(parsed.coverage);
7232
- const grounding = clamp01(parsed.grounding);
7233
- const faithfulness = clamp01(parsed.faithfulness);
7234
- const reasoning = coerceString(parsed.reasoning);
7235
- return {
7236
- coverage,
7237
- grounding,
7238
- faithfulness,
7239
- composite: (coverage + grounding + faithfulness) / 3,
7240
- reasoning
7241
- };
7242
- }
7243
- function buildUserMessage(sourceEntries, renderedContent, rubric) {
7244
- return [
7245
- "## Rubric",
7246
- rubric,
7247
- "",
7248
- "## Source entries",
7249
- sourceEntries,
7250
- "",
7251
- "## Rendered content",
7252
- renderedContent,
7253
- "",
7254
- "Produce the JSON object now."
7255
- ].join("\n");
7256
- }
7257
- /**
7258
- * Run the fidelity judge via pi-ai `complete()`. Mirrors `fidelity.Run` in
7259
- * libs/dspy-adapters/fidelity/fidelity.go.
7260
- */
7261
- async function runFidelityJudge(req, options = {}) {
7262
- const rubric = req.rubric?.trim() ? req.rubric : DEFAULT_RUBRIC;
7263
- const userPrompt = buildUserMessage(req.sourceEntries, req.renderedContent, rubric);
7264
- const message = await complete(req.model, {
7265
- systemPrompt: JUDGE_SYSTEM_PROMPT,
7266
- messages: [{
7267
- role: "user",
7268
- content: userPrompt,
7269
- timestamp: Date.now()
7270
- }]
7271
- }, options.signal ? { signal: options.signal } : void 0);
7272
- if (message.stopReason === "error" || message.stopReason === "aborted") throw new Error(`judge failed: ${message.errorMessage ?? message.stopReason}`);
7273
- const textContent = message.content.filter((c) => c.type === "text" && typeof c.text === "string").map((c) => c.text).join("\n").trim();
7274
- if (!textContent) throw new Error("judge returned empty response");
7275
- return parseScores(textContent);
7276
- }
7277
- /**
7278
- * Build a stable markdown blob of source entries for the judge prompt.
7279
- * Mirrors `buildSourceEntriesFromPack` / `buildSourceEntriesMarkdown` in the
7280
- * Go CLI so that local and proctored modes produce the same input shape.
7281
- */
7282
- function buildSourceEntriesMarkdown(entries) {
7283
- const parts = [];
7284
- for (const entry of entries) {
7285
- const title = entry.title?.trim() || "Untitled";
7286
- parts.push(`## ${title}\n${entry.content}\n`);
7287
- }
7288
- return parts.join("\n");
7289
- }
7290
- //#endregion
7291
7137
  //#region src/moltnet/render-phase6.ts
7292
7138
  function slugToTitle(value) {
7293
7139
  return value.split(/[:/_-]+/).filter(Boolean).map((part) => part[0]?.toUpperCase() + part.slice(1)).join(" ");
@@ -7434,6 +7280,21 @@ function ensureConnected(config) {
7434
7280
  };
7435
7281
  }
7436
7282
  /**
7283
+ * Expand the `taskFilter` shorthand on the diary list/search tools into
7284
+ * the matching `task:*` provenance tags emitted by `moltnet_create_entry`
7285
+ * during a task. Returning an array (possibly empty) lets callers spread
7286
+ * it into a larger `tags` AND-filter without conditionals.
7287
+ */
7288
+ function compileTaskFilterTags(filter) {
7289
+ if (!filter) return [];
7290
+ const tags = [];
7291
+ if (filter.taskId) tags.push(`task:id:${filter.taskId}`);
7292
+ if (filter.taskType) tags.push(`task:type:${filter.taskType}`);
7293
+ if (filter.correlationId) tags.push(`task:correlation:${filter.correlationId}`);
7294
+ if (typeof filter.attemptN === "number") tags.push(`task:attempt:${filter.attemptN}`);
7295
+ return tags;
7296
+ }
7297
+ /**
7437
7298
  * Create all MoltNet tool definitions, ready to pass to `pi.registerTool()`.
7438
7299
  */
7439
7300
  function createMoltNetTools(config) {
@@ -7596,122 +7457,6 @@ function createMoltNetTools(config) {
7596
7457
  };
7597
7458
  }
7598
7459
  });
7599
- const createJudgePackTask = defineTool({
7600
- name: "moltnet_judge_pack_task_create",
7601
- label: "Create Judge Pack Task",
7602
- description: "Create a judge_pack task for a rendered pack. Returns a taskId that moltnet_rendered_pack_judge can claim and execute. The rubric is required — pass the structured rubric JSON from @moltnet/tasks Rubric schema.",
7603
- parameters: Type.Object({
7604
- renderedPackId: Type.String({ description: "Rendered pack ID to judge" }),
7605
- sourcePackId: Type.String({ description: "Source pack ID. Fetch it from the rendered pack if unknown." }),
7606
- rubric: Type.Any({ description: "Structured rubric object (Rubric schema from @moltnet/tasks). Must have rubricId, version, criteria[]." }),
7607
- diaryId: Type.Optional(Type.String({ description: "Diary ID to impose the task on. Defaults to the connected diary." }))
7608
- }),
7609
- async execute(_id, params) {
7610
- const { agent, diaryId: connectedDiaryId, teamId: connectedTeamId } = ensureConnected(config);
7611
- const task = await agent.tasks.create({
7612
- taskType: "judge_pack",
7613
- input: {
7614
- renderedPackId: params.renderedPackId,
7615
- sourcePackId: params.sourcePackId,
7616
- rubric: params.rubric
7617
- },
7618
- diaryId: params.diaryId ?? connectedDiaryId,
7619
- teamId: connectedTeamId
7620
- });
7621
- return {
7622
- content: [{
7623
- type: "text",
7624
- text: JSON.stringify({
7625
- taskId: task.id,
7626
- task
7627
- }, null, 2)
7628
- }],
7629
- details: {}
7630
- };
7631
- }
7632
- });
7633
- const judgeRenderedPack = defineTool({
7634
- name: "moltnet_rendered_pack_judge",
7635
- label: "Judge MoltNet Rendered Pack",
7636
- description: "Claim a judge_pack task, run the fidelity judge locally, complete the task with structured scores, and set verifiedTaskId on the rendered pack. Create the task first with moltnet_judge_pack_task_create.",
7637
- parameters: Type.Object({
7638
- taskId: Type.String({ description: "judge_pack task ID from moltnet_judge_pack_task_create" }),
7639
- rubricOverride: Type.Optional(Type.String({ description: "Freeform rubric string override for the LLM judge prompt. When omitted the task rubric preamble (or built-in default) is used." }))
7640
- }),
7641
- async execute(_id, params, _signal, _onUpdate, ctx) {
7642
- const { agent } = ensureConnected(config);
7643
- const model = ctx?.model;
7644
- if (!model) throw new Error("No active model in pi session — cannot run the fidelity judge.");
7645
- const claimed = await agent.tasks.claim(params.taskId);
7646
- const input = claimed.task.input;
7647
- const rendered = await agent.packs.getRendered(input.renderedPackId);
7648
- if (!rendered.content?.trim()) throw new Error(`rendered pack ${input.renderedPackId} has empty content`);
7649
- const sourcePack = await agent.packs.get(input.sourcePackId, { expand: "entries" });
7650
- if (!sourcePack.entries || sourcePack.entries.length === 0) throw new Error(`source pack ${input.sourcePackId} has no entries`);
7651
- const sourceEntriesMd = buildSourceEntriesMarkdown(sourcePack.entries.map((entry) => ({
7652
- title: entry.entry.title,
7653
- content: entry.entry.content
7654
- })));
7655
- const rubric = params.rubricOverride?.trim() || input.rubric?.preamble?.trim() || DEFAULT_RUBRIC;
7656
- let scores;
7657
- try {
7658
- scores = await runFidelityJudge({
7659
- model,
7660
- sourceEntries: sourceEntriesMd,
7661
- renderedContent: rendered.content,
7662
- rubric
7663
- });
7664
- } catch (err) {
7665
- await agent.tasks.fail(params.taskId, claimed.attempt.attemptN, { error: {
7666
- code: "judge_failed",
7667
- message: err.message ?? String(err)
7668
- } }).catch(() => {});
7669
- throw new Error(`judge failed: ${err.message ?? String(err)}`);
7670
- }
7671
- const modelId = model.provider && model.id ? `${model.provider}:${model.id}` : model.id ?? "pi:unknown";
7672
- const output = {
7673
- scores: [
7674
- {
7675
- criterionId: "coverage",
7676
- score: scores.coverage
7677
- },
7678
- {
7679
- criterionId: "grounding",
7680
- score: scores.grounding
7681
- },
7682
- {
7683
- criterionId: "faithfulness",
7684
- score: scores.faithfulness
7685
- }
7686
- ],
7687
- composite: scores.composite,
7688
- verdict: scores.reasoning,
7689
- judgeModel: modelId
7690
- };
7691
- const outputCid = await computeJsonCid(output);
7692
- const completed = await agent.tasks.complete(params.taskId, claimed.attempt.attemptN, {
7693
- output,
7694
- outputCid,
7695
- usage: {
7696
- inputTokens: 0,
7697
- outputTokens: 0
7698
- }
7699
- });
7700
- await agent.packs.updateRendered(input.renderedPackId, { verifiedTaskId: params.taskId });
7701
- return {
7702
- content: [{
7703
- type: "text",
7704
- text: JSON.stringify({
7705
- renderedPackId: input.renderedPackId,
7706
- taskId: params.taskId,
7707
- scores,
7708
- task: completed
7709
- }, null, 2)
7710
- }],
7711
- details: {}
7712
- };
7713
- }
7714
- });
7715
7460
  const diaryTags = defineTool({
7716
7461
  name: "moltnet_diary_tags",
7717
7462
  label: "List MoltNet Diary Tags",
@@ -7747,12 +7492,32 @@ function createMoltNetTools(config) {
7747
7492
  const listEntries = defineTool({
7748
7493
  name: "moltnet_list_entries",
7749
7494
  label: "List MoltNet Diary Entries",
7750
- description: "List entries from the MoltNet diary. When `entryIds` is provided, batch-fetches those specific entries (max 50) and returns full fields including entryType, contentSignature, and contentHash for signature checks. Otherwise returns recent entries with a content preview.",
7495
+ description: "List entries from the MoltNet diary. When `entryIds` is provided, batch-fetches those specific entries (max 50) and returns full fields including entryType, contentSignature, and contentHash for signature checks. Otherwise returns recent entries with a content preview, filtered by any combination of tags (AND), excludeTags (NONE), entryType, and the taskFilter shorthand which expands into the right `task:*` tags.",
7751
7496
  parameters: Type.Object({
7752
7497
  limit: Type.Optional(Type.Number({ description: "Max entries to return (default 10)" })),
7753
- tag: Type.Optional(Type.String({ description: "Filter by tag (optional)" })),
7498
+ tags: Type.Optional(Type.Array(Type.String({
7499
+ minLength: 1,
7500
+ maxLength: 50
7501
+ }), {
7502
+ description: "Tags filter — entry must have ALL listed tags (AND). Max 20.",
7503
+ maxItems: 20
7504
+ })),
7505
+ excludeTags: Type.Optional(Type.Array(Type.String({
7506
+ minLength: 1,
7507
+ maxLength: 50
7508
+ }), {
7509
+ description: "Tags to exclude — entry must have NONE of these. Max 20.",
7510
+ maxItems: 20
7511
+ })),
7512
+ entryType: Type.Optional(Type.String({ description: "Filter by entry type (procedural, semantic, episodic, reflection, identity, soul)." })),
7513
+ taskFilter: Type.Optional(Type.Object({
7514
+ taskId: Type.Optional(Type.String()),
7515
+ taskType: Type.Optional(Type.String()),
7516
+ correlationId: Type.Optional(Type.String()),
7517
+ attemptN: Type.Optional(Type.Number())
7518
+ }, { description: "Shorthand: any combination compiles to the matching task:* tags (task:id:<id>, task:type:<type>, task:correlation:<id>, task:attempt:<n>) and is merged into the tags filter." })),
7754
7519
  entryIds: Type.Optional(Type.Array(Type.String(), {
7755
- description: "Batch-fetch specific entries by UUID (max 50). Overrides `limit` and `tag` for selection.",
7520
+ description: "Batch-fetch specific entries by UUID (max 50). Overrides every other filter.",
7756
7521
  maxItems: 50
7757
7522
  }))
7758
7523
  }),
@@ -7766,7 +7531,11 @@ function createMoltNetTools(config) {
7766
7531
  if (batchMode) query.ids = params.entryIds;
7767
7532
  else {
7768
7533
  query.limit = params.limit ?? 10;
7769
- if (params.tag) query.tag = params.tag;
7534
+ const expandedTags = compileTaskFilterTags(params.taskFilter);
7535
+ const allTags = [...params.tags ?? [], ...expandedTags];
7536
+ if (allTags.length) query.tags = allTags;
7537
+ if (params.excludeTags?.length) query.excludeTags = params.excludeTags;
7538
+ if (params.entryType) query.entryType = params.entryType;
7770
7539
  }
7771
7540
  const entries = await agent.entries.list(diaryId, query);
7772
7541
  return {
@@ -7822,17 +7591,46 @@ function createMoltNetTools(config) {
7822
7591
  const searchEntries = defineTool({
7823
7592
  name: "moltnet_search_entries",
7824
7593
  label: "Search MoltNet Diary Entries",
7825
- description: "Search diary entries by semantic query. Uses vector similarity to find relevant entries.",
7594
+ description: "Hybrid (semantic + lexical) search over diary entries. Optional tags / excludeTags / entryTypes filters AND with the query; the taskFilter shorthand expands into task:* provenance tags so `taskFilter: { taskType: \"fulfill_brief\" }` returns only entries from fulfill_brief attempts. Filters apply server-side before ranking.",
7826
7595
  parameters: Type.Object({
7827
7596
  query: Type.String({ description: "Natural language search query" }),
7828
- limit: Type.Optional(Type.Number({ description: "Max results (default 5)" }))
7597
+ limit: Type.Optional(Type.Number({ description: "Max results (default 5)" })),
7598
+ tags: Type.Optional(Type.Array(Type.String({
7599
+ minLength: 1,
7600
+ maxLength: 50
7601
+ }), {
7602
+ description: "Entry must have ALL listed tags (AND). Max 20.",
7603
+ maxItems: 20
7604
+ })),
7605
+ excludeTags: Type.Optional(Type.Array(Type.String({
7606
+ minLength: 1,
7607
+ maxLength: 50
7608
+ }), {
7609
+ description: "Entry must have NONE of these tags. Max 20.",
7610
+ maxItems: 20
7611
+ })),
7612
+ entryTypes: Type.Optional(Type.Array(Type.String(), {
7613
+ description: "Restrict to these entry types (procedural, semantic, episodic, reflection, identity, soul). Max 6.",
7614
+ maxItems: 6
7615
+ })),
7616
+ taskFilter: Type.Optional(Type.Object({
7617
+ taskId: Type.Optional(Type.String()),
7618
+ taskType: Type.Optional(Type.String()),
7619
+ correlationId: Type.Optional(Type.String()),
7620
+ attemptN: Type.Optional(Type.Number())
7621
+ }, { description: "Shorthand: any combination compiles to the matching task:* tags and is merged into the tags filter." }))
7829
7622
  }),
7830
7623
  async execute(_id, params) {
7831
7624
  const { agent, diaryId } = ensureConnected(config);
7625
+ const expandedTags = compileTaskFilterTags(params.taskFilter);
7626
+ const allTags = [...params.tags ?? [], ...expandedTags];
7832
7627
  const results = await agent.entries.search({
7833
7628
  diaryId,
7834
7629
  query: params.query,
7835
- limit: params.limit ?? 5
7630
+ limit: params.limit ?? 5,
7631
+ ...allTags.length ? { tags: allTags } : {},
7632
+ ...params.excludeTags?.length ? { excludeTags: params.excludeTags } : {},
7633
+ ...params.entryTypes?.length ? { entryTypes: params.entryTypes } : {}
7836
7634
  });
7837
7635
  return {
7838
7636
  content: [{
@@ -7852,7 +7650,7 @@ function createMoltNetTools(config) {
7852
7650
  const createEntry = defineTool({
7853
7651
  name: "moltnet_create_entry",
7854
7652
  label: "Create MoltNet Diary Entry",
7855
- description: "Create a new diary entry to record decisions, findings, incidents, or reflections. During an active task, the entry is forced into the task diary and tagged with task:<id>, task_type:<type>, task_attempt:<n>, and correlation:<id> when set; an explicit diaryId mismatching the task diary is rejected.",
7653
+ description: "Create a new diary entry to record decisions, findings, incidents, or reflections. During an active task, the entry is forced into the task diary and tagged with the task:* provenance namespace (task:id:<id>, task:type:<type>, task:attempt:<n>, plus task:correlation:<id> when set); an explicit diaryId mismatching the task diary is rejected.",
7856
7654
  parameters: Type.Object({
7857
7655
  title: Type.String({ description: "Entry title (concise, descriptive)" }),
7858
7656
  content: Type.String({ description: "Entry content (markdown)" }),
@@ -7869,10 +7667,10 @@ function createMoltNetTools(config) {
7869
7667
  if (params.diaryId && params.diaryId !== taskCtx.diaryId) throw new Error(`entries_create: diaryId "${params.diaryId}" does not match the active task diary "${taskCtx.diaryId}". Entries created during a task must land in the task diary.`);
7870
7668
  targetDiaryId = taskCtx.diaryId;
7871
7669
  autoTags = [
7872
- `task:${taskCtx.taskId}`,
7873
- `task_type:${taskCtx.taskType}`,
7874
- `task_attempt:${taskCtx.attemptN}`,
7875
- ...taskCtx.correlationId ? [`correlation:${taskCtx.correlationId}`] : []
7670
+ `task:id:${taskCtx.taskId}`,
7671
+ `task:type:${taskCtx.taskType}`,
7672
+ `task:attempt:${taskCtx.attemptN}`,
7673
+ ...taskCtx.correlationId ? [`task:correlation:${taskCtx.correlationId}`] : []
7876
7674
  ];
7877
7675
  } else targetDiaryId = params.diaryId ?? envDiaryId;
7878
7676
  const userTags = params.tags ?? [];
@@ -7973,8 +7771,6 @@ function createMoltNetTools(config) {
7973
7771
  renderPack,
7974
7772
  listRenderedPacks,
7975
7773
  getRenderedPack,
7976
- createJudgePackTask,
7977
- judgeRenderedPack,
7978
7774
  diaryTags,
7979
7775
  listEntries,
7980
7776
  getEntry,
@@ -8591,135 +8387,6 @@ function ensureRelativeWorktreePaths(gitconfig) {
8591
8387
  return `${gitconfig}${gitconfig.endsWith("\n") ? "" : "\n"}[worktree]\n\tuseRelativePaths = true\n`;
8592
8388
  }
8593
8389
  //#endregion
8594
- //#region src/moltnet/judge-recipe-cid.ts
8595
- var require$1 = createRequire(import.meta.url);
8596
- var SELF_PACKAGE_NAME = "@themoltnet/pi-extension";
8597
- var PI_PACKAGE_NAME = "@mariozechner/pi-coding-agent";
8598
- var SDK_PACKAGE_NAME = "@themoltnet/sdk";
8599
- var CID_VERSION = 1;
8600
- var RAW_CODEC = 85;
8601
- var SHA2_256_CODE = 18;
8602
- var BASE32_ALPHABET = "abcdefghijklmnopqrstuvwxyz234567";
8603
- function findSelfPackageDir() {
8604
- const start = path.dirname(fileURLToPath(import.meta.url));
8605
- let dir = start;
8606
- while (true) {
8607
- const candidate = path.join(dir, "package.json");
8608
- if (existsSync(candidate)) {
8609
- if (JSON.parse(readFileSync(candidate, "utf8")).name === SELF_PACKAGE_NAME) return dir;
8610
- }
8611
- const parent = path.dirname(dir);
8612
- if (parent === dir) return start;
8613
- dir = parent;
8614
- }
8615
- }
8616
- var PACKAGE_DIR = findSelfPackageDir();
8617
- function sha256Hex(value) {
8618
- return createHash("sha256").update(value, "utf8").digest("hex");
8619
- }
8620
- function encodeVarint(value) {
8621
- const bytes = [];
8622
- let current = value >>> 0;
8623
- while (current >= 128) {
8624
- bytes.push(current & 127 | 128);
8625
- current >>>= 7;
8626
- }
8627
- bytes.push(current);
8628
- return bytes;
8629
- }
8630
- function base32Lower(bytes) {
8631
- let bits = 0;
8632
- let value = 0;
8633
- let output = "";
8634
- for (const byte of bytes) {
8635
- value = value << 8 | byte;
8636
- bits += 8;
8637
- while (bits >= 5) {
8638
- output += BASE32_ALPHABET[value >>> bits - 5 & 31];
8639
- bits -= 5;
8640
- }
8641
- }
8642
- if (bits > 0) output += BASE32_ALPHABET[value << 5 - bits & 31];
8643
- return `b${output}`;
8644
- }
8645
- function stableStringify(value) {
8646
- if (value === null || typeof value !== "object") return JSON.stringify(value);
8647
- if (Array.isArray(value)) return `[${value.map((item) => stableStringify(item)).join(",")}]`;
8648
- return `{${Object.entries(value).sort(([left], [right]) => left.localeCompare(right)).map(([key, item]) => `${JSON.stringify(key)}:${stableStringify(item)}`).join(",")}}`;
8649
- }
8650
- function readPackageVersion(pkgPath, expectedName) {
8651
- if (!existsSync(pkgPath)) return null;
8652
- const parsed = JSON.parse(readFileSync(pkgPath, "utf8"));
8653
- if (expectedName && parsed.name !== expectedName) return null;
8654
- return typeof parsed.version === "string" ? parsed.version : null;
8655
- }
8656
- function resolveInstalledPackageVersion(packageName) {
8657
- const candidates = [];
8658
- try {
8659
- candidates.push(path.dirname(require$1.resolve(packageName)));
8660
- } catch {}
8661
- let dir = PACKAGE_DIR;
8662
- while (true) {
8663
- candidates.push(path.join(dir, "node_modules", packageName));
8664
- const parent = path.dirname(dir);
8665
- if (parent === dir) break;
8666
- dir = parent;
8667
- }
8668
- for (const start of candidates) {
8669
- let current = start;
8670
- while (true) {
8671
- const version = readPackageVersion(path.join(current, "package.json"), packageName);
8672
- if (version) return version;
8673
- const parent = path.dirname(current);
8674
- if (parent === current) break;
8675
- current = parent;
8676
- }
8677
- }
8678
- return null;
8679
- }
8680
- function resolvePiJudgeRecipeVersions() {
8681
- return {
8682
- pi: resolveInstalledPackageVersion(PI_PACKAGE_NAME),
8683
- piExtension: readPackageVersion(path.join(PACKAGE_DIR, "package.json"), SELF_PACKAGE_NAME),
8684
- sdk: resolveInstalledPackageVersion(SDK_PACKAGE_NAME)
8685
- };
8686
- }
8687
- function buildPiJudgeRecipeManifest(inputs) {
8688
- return {
8689
- kind: "pi-judge-recipe/v1",
8690
- versions: {
8691
- ...resolvePiJudgeRecipeVersions(),
8692
- ...inputs.overrides
8693
- },
8694
- assets: {
8695
- promptAsset: inputs.promptAsset ?? null,
8696
- rubricAsset: inputs.rubricAsset ?? null,
8697
- skillSourcePath: inputs.skillSourcePath ?? null
8698
- },
8699
- hashes: {
8700
- judgePromptSha256: sha256Hex(inputs.judgePrompt),
8701
- rubricSha256: sha256Hex(inputs.rubric),
8702
- skillFragmentSha256: inputs.skillFragment ? sha256Hex(inputs.skillFragment) : null,
8703
- implementationSha256: inputs.implementationSource ? sha256Hex(inputs.implementationSource) : null
8704
- }
8705
- };
8706
- }
8707
- function computePiJudgeRecipeCid(inputs) {
8708
- const manifest = buildPiJudgeRecipeManifest(inputs);
8709
- const manifestBytes = Buffer.from(stableStringify(manifest), "utf8");
8710
- const digestBytes = createHash("sha256").update(manifestBytes).digest();
8711
- return {
8712
- cid: base32Lower(Uint8Array.from([
8713
- ...encodeVarint(CID_VERSION),
8714
- ...encodeVarint(RAW_CODEC),
8715
- ...encodeVarint(SHA2_256_CODE),
8716
- ...encodeVarint(digestBytes.length),
8717
- ...digestBytes
8718
- ])),
8719
- manifest
8720
- };
8721
- }
8722
- //#endregion
8723
8390
  //#region src/otel/index.ts
8724
8391
  var TRACER_NAME = "@themoltnet/pi-extension/otel";
8725
8392
  function stripReservedAttrs(attrs) {
@@ -8891,7 +8558,13 @@ if (!FormatRegistry.Has("date-time")) FormatRegistry.Set("date-time", (v) => !Nu
8891
8558
  /**
8892
8559
  * How a judge must score a single criterion.
8893
8560
  *
8894
- * - `llm_judged`: 0..1 continuous, `rationale` required.
8561
+ * - `llm_score`: 0..1 continuous, `rationale` required. Smooths failures
8562
+ * into the gradient — use `llm_checklist` instead for properties where
8563
+ * a single failure is a real failure (grounding, faithfulness).
8564
+ * - `llm_checklist`: judge enumerates per-claim assertions with
8565
+ * `{passed, evidence}`. The criterion's numeric `score` is derived:
8566
+ * `1` iff every assertion passes, else `0`. Per-claim evidence is the
8567
+ * dataset for cluster-analysis of failure modes. See #999.
8895
8568
  * - `boolean`: 0 or 1, `rationale` optional.
8896
8569
  * - `deterministic_signature_check`: judge runs a signature check;
8897
8570
  * result is 0 or 1. No LLM discretion.
@@ -8899,11 +8572,31 @@ if (!FormatRegistry.Has("date-time")) FormatRegistry.Set("date-time", (v) => !Nu
8899
8572
  * appears in the rendered output; 0 or 1.
8900
8573
  */
8901
8574
  var RubricScoringMode = Type$1.Union([
8902
- Type$1.Literal("llm_judged"),
8575
+ Type$1.Literal("llm_score"),
8576
+ Type$1.Literal("llm_checklist"),
8903
8577
  Type$1.Literal("boolean"),
8904
8578
  Type$1.Literal("deterministic_signature_check"),
8905
8579
  Type$1.Literal("deterministic_coverage_check")
8906
8580
  ], { $id: "RubricScoringMode" });
8581
+ /**
8582
+ * One binary check produced by an `llm_checklist`-mode criterion.
8583
+ *
8584
+ * `evidence` is REQUIRED for both PASS and FAIL — agentskills.io grading
8585
+ * principle: \"Don't give the benefit of the doubt.\" A PASS without
8586
+ * concrete evidence (a quoted span, an entry id, a source location)
8587
+ * cannot be audited. A FAIL without evidence cannot be clustered into
8588
+ * structural fixes. The same shape is reused by `judge-eval-variant`
8589
+ * (#943) so tooling, dashboards, and analysis stay uniform.
8590
+ */
8591
+ var AssertionResult = Type$1.Object({
8592
+ id: Type$1.String({ minLength: 1 }),
8593
+ text: Type$1.String({ minLength: 1 }),
8594
+ passed: Type$1.Boolean(),
8595
+ evidence: Type$1.String({ minLength: 1 })
8596
+ }, {
8597
+ $id: "AssertionResult",
8598
+ additionalProperties: false
8599
+ });
8907
8600
  var RubricCriterion = Type$1.Object({
8908
8601
  id: Type$1.String({ minLength: 1 }),
8909
8602
  description: Type$1.String({ minLength: 1 }),
@@ -8963,44 +8656,165 @@ unrelated subsystems and the test coverage on the auth path is
8963
8656
  unchanged" is.
8964
8657
  `.trim();
8965
8658
  //#endregion
8659
+ //#region ../tasks/src/success-criteria.ts
8660
+ /**
8661
+ * SuccessCriteria — imposer-stated acceptance criteria, evaluated in two
8662
+ * complementary places.
8663
+ *
8664
+ * Before this envelope existed, criteria were scattered: a vestigial
8665
+ * `criteriaCid` column nobody resolved, an `acceptanceCriteria: string[]`
8666
+ * field on `fulfill_brief.input` that was "interpreted by the claiming
8667
+ * agent," and inline `rubric` / `criteria[]` fields on judgment-task
8668
+ * inputs. None of those were machine-verifiable end-to-end.
8669
+ *
8670
+ * This module defines a single, content-addressable envelope an imposer
8671
+ * attaches to any task type. It has four orthogonal sections — pick
8672
+ * whichever apply per task type:
8673
+ *
8674
+ * - `gates` Deterministic structural checks (CID/schema match)
8675
+ * - `assertions` Declarative claims about output JSON
8676
+ * - `rubric` Weighted-criteria scoring instrument, reused
8677
+ * verbatim from `./rubric.ts`.
8678
+ * - `sideEffects` Required process side-effects (e.g. diary entry)
8679
+ *
8680
+ * ## Two roles, two task types
8681
+ *
8682
+ * **Producer self-assessment** (fulfillment tasks: `fulfill_brief`,
8683
+ * `curate_pack`, `render_pack`). The producer **LLM** evaluates the
8684
+ * criteria against its own output and emits a `VerificationRecord`
8685
+ * inside `output.verification`. The daemon is pure passthrough — it
8686
+ * does not run `evaluateAssertions`, does not inspect the verification
8687
+ * record. The REST API is dumb storage; it never re-runs assertions and
8688
+ * never runs LLMs. The cross-field rule
8689
+ * `requireVerificationWhenCriteriaPresent` enforces "verification
8690
+ * required iff successCriteria present" at task-output validation time
8691
+ * (server-side schema check). Self-assessment is a truthful self-rating,
8692
+ * NOT enforcement — `verification.passed=false` does not block /complete
8693
+ * and does not affect `acceptedAttemptN`. See
8694
+ * `docs/agent-runtime.md` for the full producer/judge flow.
8695
+ *
8696
+ * **Binding evaluation** (judgment tasks: `assess_brief`, `judge_pack`).
8697
+ * A separate task whose job IS the application of `successCriteria` to
8698
+ * someone else's output. Different agent (enforced at claim time), same
8699
+ * envelope. The judge's verdict is binding: this is the *gate* in the
8700
+ * MoltNet model. The rubric inside `successCriteria.rubric` IS the job
8701
+ * spec for the judge.
8702
+ *
8703
+ * The clean chain: producer task with `successCriteria` → producer
8704
+ * self-assesses honestly → imposer (or automation) creates a downstream
8705
+ * judgment task that references the same `successCriteria` (or a
8706
+ * stricter rubric) → judgment task delivers the binding verdict.
8707
+ *
8708
+ * Storage: SuccessCriteria lives inline at `task.input.successCriteria`,
8709
+ * pinned via the task's `inputCid`. No separate column or hash. When
8710
+ * #881 lands, the `rubric` field can graduate to `{ rubricCid }` lookup
8711
+ * without changing this envelope, and producer + judge tasks can pin
8712
+ * the SAME rubric across the chain for end-to-end auditability.
8713
+ */
8714
+ var SchemaCheckSpec = Type$1.Object({ schemaCid: Type$1.String({ minLength: 1 }) }, { additionalProperties: false });
8715
+ var CidEqualsSpec = Type$1.Object({
8716
+ path: Type$1.String({ minLength: 1 }),
8717
+ expected: Type$1.String({ minLength: 1 })
8718
+ }, { additionalProperties: false });
8719
+ var Gate = Type$1.Union([Type$1.Object({
8720
+ id: Type$1.String({ minLength: 1 }),
8721
+ kind: Type$1.Literal("schema-check"),
8722
+ spec: SchemaCheckSpec,
8723
+ required: Type$1.Boolean()
8724
+ }, { additionalProperties: false }), Type$1.Object({
8725
+ id: Type$1.String({ minLength: 1 }),
8726
+ kind: Type$1.Literal("cid-equals"),
8727
+ spec: CidEqualsSpec,
8728
+ required: Type$1.Boolean()
8729
+ }, { additionalProperties: false })], { $id: "Gate" });
8730
+ var AssertionOp = Type$1.Union([
8731
+ Type$1.Literal("exists"),
8732
+ Type$1.Literal("equals"),
8733
+ Type$1.Literal("matches"),
8734
+ Type$1.Literal("in-range"),
8735
+ Type$1.Literal("min-length")
8736
+ ], { $id: "AssertionOp" });
8737
+ var Assertion = Type$1.Object({
8738
+ id: Type$1.String({ minLength: 1 }),
8739
+ path: Type$1.String({ minLength: 1 }),
8740
+ op: AssertionOp,
8741
+ value: Type$1.Optional(Type$1.Unknown())
8742
+ }, {
8743
+ $id: "Assertion",
8744
+ additionalProperties: false
8745
+ });
8746
+ var SideEffectsSpec = Type$1.Object({
8747
+ diaryEntryRequired: Type$1.Optional(Type$1.Boolean()),
8748
+ diaryEntryTags: Type$1.Optional(Type$1.Array(Type$1.String({ minLength: 1 }))),
8749
+ referencedEntries: Type$1.Optional(Type$1.Integer({ minimum: 0 }))
8750
+ }, {
8751
+ $id: "SideEffectsSpec",
8752
+ additionalProperties: false
8753
+ });
8754
+ var SuccessCriteria = Type$1.Object({
8755
+ version: Type$1.Literal(1),
8756
+ gates: Type$1.Optional(Type$1.Array(Gate)),
8757
+ assertions: Type$1.Optional(Type$1.Array(Assertion)),
8758
+ rubric: Type$1.Optional(Rubric),
8759
+ minComposite: Type$1.Optional(Type$1.Number({
8760
+ minimum: 0,
8761
+ maximum: 1
8762
+ })),
8763
+ sideEffects: Type$1.Optional(SideEffectsSpec)
8764
+ }, {
8765
+ $id: "SuccessCriteria",
8766
+ additionalProperties: false
8767
+ });
8768
+ var VerificationResultStatus = Type$1.Union([
8769
+ Type$1.Literal("pass"),
8770
+ Type$1.Literal("fail"),
8771
+ Type$1.Literal("skip")
8772
+ ], { $id: "VerificationResultStatus" });
8773
+ var VerificationResultKind = Type$1.Union([
8774
+ Type$1.Literal("gate"),
8775
+ Type$1.Literal("assertion"),
8776
+ Type$1.Literal("rubric"),
8777
+ Type$1.Literal("sideEffect")
8778
+ ], { $id: "VerificationResultKind" });
8779
+ var VerificationResult = Type$1.Object({
8780
+ id: Type$1.String({ minLength: 1 }),
8781
+ kind: VerificationResultKind,
8782
+ status: VerificationResultStatus,
8783
+ detail: Type$1.Optional(Type$1.String())
8784
+ }, {
8785
+ $id: "VerificationResult",
8786
+ additionalProperties: false
8787
+ });
8788
+ var VerificationRecord = Type$1.Object({
8789
+ inputCid: Type$1.String({ minLength: 1 }),
8790
+ results: Type$1.Array(VerificationResult),
8791
+ passed: Type$1.Boolean()
8792
+ }, {
8793
+ $id: "VerificationRecord",
8794
+ additionalProperties: false
8795
+ });
8796
+ //#endregion
8966
8797
  //#region ../tasks/src/task-types/assess-brief.ts
8967
8798
  /**
8968
8799
  * `assess_brief` — independently evaluate a fulfilled brief.
8969
8800
  *
8970
8801
  * output_kind: judgment
8971
- * criteria: required (rubric lives as a diary entry with tag='rubric';
8972
- * the Task's `criteria_cid` points at that entry)
8802
+ * criteria: required (`successCriteria.rubric` — same envelope as
8803
+ * `judge_pack`)
8973
8804
  * references: required (must reference the target `fulfill_brief` task)
8974
8805
  *
8975
8806
  * The assessor is a different agent from the producer (enforced by the
8976
8807
  * server / runtime at claim time — not in the wire schema).
8808
+ *
8809
+ * The rubric in `successCriteria` IS the job spec — the assessor applies
8810
+ * it to the target task's output and emits per-criterion scores. Other
8811
+ * sections (`assertions`, `gates`, `sideEffects`) MAY be present and are
8812
+ * evaluated against the *assessor's output*.
8977
8813
  */
8978
8814
  var ASSESS_BRIEF_TYPE = "assess_brief";
8979
- /**
8980
- * One criterion lifted from the rubric. Denormalized into the input so the
8981
- * assessor prompt can be built without a second fetch; the `criteria_cid`
8982
- * on the Task row remains authoritative for verification.
8983
- */
8984
- var AssessBriefCriterion = Type$1.Object({
8985
- id: Type$1.String({ minLength: 1 }),
8986
- description: Type$1.String({ minLength: 1 }),
8987
- weight: Type$1.Number({
8988
- minimum: 0,
8989
- maximum: 1
8990
- }),
8991
- scoring: Type$1.Union([
8992
- Type$1.Literal("llm_judged"),
8993
- Type$1.Literal("boolean"),
8994
- Type$1.Literal("deterministic_signature_check")
8995
- ])
8996
- }, {
8997
- $id: "AssessBriefCriterion",
8998
- additionalProperties: false
8999
- });
9000
8815
  var AssessBriefInput = Type$1.Object({
9001
8816
  targetTaskId: Type$1.String({ format: "uuid" }),
9002
- criteria: Type$1.Array(AssessBriefCriterion, { minItems: 1 }),
9003
- rubricPreamble: Type$1.Optional(Type$1.String())
8817
+ successCriteria: SuccessCriteria
9004
8818
  }, {
9005
8819
  $id: "AssessBriefInput",
9006
8820
  additionalProperties: false
@@ -9069,7 +8883,8 @@ var CuratePackInput = Type$1.Object({
9069
8883
  prefix: Type$1.Optional(Type$1.String())
9070
8884
  }, { additionalProperties: false })),
9071
8885
  tokenBudget: Type$1.Optional(Type$1.Number({ minimum: 500 })),
9072
- recipe: Type$1.Optional(Type$1.Union([Type$1.Literal("topic-focused-v1"), Type$1.Literal("scope-inventory-v1")]))
8886
+ recipe: Type$1.Optional(Type$1.Union([Type$1.Literal("topic-focused-v1"), Type$1.Literal("scope-inventory-v1")])),
8887
+ successCriteria: Type$1.Optional(SuccessCriteria)
9073
8888
  }, {
9074
8889
  $id: "CuratePackInput",
9075
8890
  additionalProperties: false
@@ -9094,7 +8909,8 @@ var CuratePackOutput = Type$1.Object({
9094
8909
  droppedIds: Type$1.Optional(Type$1.Array(Type$1.String({ format: "uuid" }))),
9095
8910
  notes: Type$1.String({ minLength: 1 })
9096
8911
  }, { additionalProperties: false }))),
9097
- summary: Type$1.String({ minLength: 1 })
8912
+ summary: Type$1.String({ minLength: 1 }),
8913
+ verification: Type$1.Optional(VerificationRecord)
9098
8914
  }, {
9099
8915
  $id: "CuratePackOutput",
9100
8916
  additionalProperties: false
@@ -9113,6 +8929,7 @@ var FulfillBriefInput = Type$1.Object({
9113
8929
  brief: Type$1.String({ minLength: 1 }),
9114
8930
  title: Type$1.Optional(Type$1.String()),
9115
8931
  acceptanceCriteria: Type$1.Optional(Type$1.Array(Type$1.String())),
8932
+ successCriteria: Type$1.Optional(SuccessCriteria),
9116
8933
  seedFiles: Type$1.Optional(Type$1.Array(Type$1.String())),
9117
8934
  scopeHint: Type$1.Optional(Type$1.String())
9118
8935
  }, {
@@ -9132,7 +8949,8 @@ var FulfillBriefOutput = Type$1.Object({
9132
8949
  }, { additionalProperties: false })),
9133
8950
  pullRequestUrl: Type$1.Union([Type$1.String(), Type$1.Null()]),
9134
8951
  diaryEntryIds: Type$1.Array(Type$1.String({ format: "uuid" })),
9135
- summary: Type$1.String({ minLength: 1 })
8952
+ summary: Type$1.String({ minLength: 1 }),
8953
+ verification: Type$1.Optional(VerificationRecord)
9136
8954
  }, {
9137
8955
  $id: "FulfillBriefOutput",
9138
8956
  additionalProperties: false
@@ -9143,19 +8961,18 @@ var FulfillBriefOutput = Type$1.Object({
9143
8961
  * `judge_pack` — independently score a rendered pack against a rubric.
9144
8962
  *
9145
8963
  * output_kind: judgment
9146
- * criteria: required (embedded `rubric` — see Phase 1 design in #852
9147
- * amendment and Phase 2 issue #881)
8964
+ * criteria: required (`successCriteria.rubric` — see #852 amendment and
8965
+ * Phase 2 issue #881)
9148
8966
  * references: required (must reference the `render_pack` task it judges,
9149
8967
  * role='judged_work')
9150
8968
  *
9151
8969
  * Step 3 of the three-session attribution loop (#875). Mirrors
9152
8970
  * `assess_brief` in shape, but over a rendered context pack.
9153
8971
  *
9154
- * Phase 1 rubric storage: the rubric body is inlined in `input.rubric`.
9155
- * Integrity is pinned via the task's `input_cid`. Phase 2 (#881) will
9156
- * replace the inline body with a `rubric_cid` referencing a `rubrics`
9157
- * table row; the denormalized `criteria[]` projection stays for prompt
9158
- * building without a fetch.
8972
+ * Phase 1 rubric storage: the rubric body lives at
8973
+ * `input.successCriteria.rubric` and is pinned via the task's `inputCid`.
8974
+ * Phase 2 (#881) will replace the inline body with a `rubricCid`
8975
+ * referencing a stored `rubrics` row; the envelope stays the same.
9159
8976
  *
9160
8977
  * The judge MUST be a different agent from the renderer. Enforced at
9161
8978
  * claim time by the runtime, not in the wire schema.
@@ -9164,7 +8981,7 @@ var JUDGE_PACK_TYPE = "judge_pack";
9164
8981
  var JudgePackInput = Type$1.Object({
9165
8982
  renderedPackId: Type$1.String({ format: "uuid" }),
9166
8983
  sourcePackId: Type$1.String({ format: "uuid" }),
9167
- rubric: Rubric
8984
+ successCriteria: SuccessCriteria
9168
8985
  }, {
9169
8986
  $id: "JudgePackInput",
9170
8987
  additionalProperties: false
@@ -9177,6 +8994,7 @@ var JudgePackScore = Type$1.Object({
9177
8994
  maximum: 1
9178
8995
  }),
9179
8996
  rationale: Type$1.Optional(Type$1.String()),
8997
+ assertions: Type$1.Optional(Type$1.Array(AssertionResult, { minItems: 1 })),
9180
8998
  evidence: Type$1.Optional(Type$1.Record(Type$1.String(), Type$1.Unknown()))
9181
8999
  }, {
9182
9000
  $id: "JudgePackScore",
@@ -9195,6 +9013,39 @@ var JudgePackOutput = Type$1.Object({
9195
9013
  $id: "JudgePackOutput",
9196
9014
  additionalProperties: false
9197
9015
  });
9016
+ /**
9017
+ * Cross-field validator for JudgePackOutput. Run after the TypeBox
9018
+ * schema check passes. Enforces invariants the schema can't express:
9019
+ *
9020
+ * 1. If a `JudgePackScore` carries an `assertions` array (i.e. the
9021
+ * judge ran the criterion in `llm_checklist` mode), its numeric
9022
+ * `score` MUST equal `1` if every `assertions[i].passed` is true,
9023
+ * else `0`. The prompt instructs the judge to derive `score` from
9024
+ * the array, but the LLM can drift — without this check, the
9025
+ * runtime accepts inconsistent payloads and propagates them into
9026
+ * composite scores and judge attestations (#999 P1).
9027
+ *
9028
+ * 2. If `score` is exactly `1` AND `assertions` is present, every
9029
+ * assertion must have `passed: true`. Catches the failure mode in
9030
+ * the issue: "score: 1 with a failing assertion accepted."
9031
+ *
9032
+ * Cross-rubric checks (e.g. "did the judge populate `assertions` for
9033
+ * every criterion the rubric marked `llm_checklist`?") require the
9034
+ * input rubric and live in a separate, runtime-side validator. This
9035
+ * one is rubric-agnostic on purpose — it catches within-score
9036
+ * inconsistency without needing the original task input.
9037
+ */
9038
+ function validateJudgePackOutput(output) {
9039
+ const scores = output.scores;
9040
+ for (let i = 0; i < scores.length; i++) {
9041
+ const s = scores[i];
9042
+ if (!s.assertions) continue;
9043
+ const allPassed = s.assertions.every((a) => a.passed);
9044
+ const expected = allPassed ? 1 : 0;
9045
+ if (s.score !== expected) return `scores[${i}] (criterionId="${s.criterionId}"): assertions ${allPassed ? "all pass" : "have at least one fail"} but score=${s.score}. Score must be derived: 1 iff every assertion passes, else 0 (#999 llm_checklist rule).`;
9046
+ }
9047
+ return null;
9048
+ }
9198
9049
  //#endregion
9199
9050
  //#region ../tasks/src/task-types/render-pack.ts
9200
9051
  /**
@@ -9216,7 +9067,8 @@ var RENDER_PACK_TYPE = "render_pack";
9216
9067
  var RenderPackInput = Type$1.Object({
9217
9068
  packId: Type$1.String({ format: "uuid" }),
9218
9069
  persist: Type$1.Optional(Type$1.Boolean()),
9219
- pinned: Type$1.Optional(Type$1.Boolean())
9070
+ pinned: Type$1.Optional(Type$1.Boolean()),
9071
+ successCriteria: Type$1.Optional(SuccessCriteria)
9220
9072
  }, {
9221
9073
  $id: "RenderPackInput",
9222
9074
  additionalProperties: false
@@ -9227,7 +9079,8 @@ var RenderPackOutput = Type$1.Object({
9227
9079
  renderMethod: Type$1.String({ minLength: 1 }),
9228
9080
  byteSize: Type$1.Number({ minimum: 0 }),
9229
9081
  entriesRendered: Type$1.Number({ minimum: 0 }),
9230
- summary: Type$1.String({ minLength: 1 })
9082
+ summary: Type$1.String({ minLength: 1 }),
9083
+ verification: Type$1.Optional(VerificationRecord)
9231
9084
  }, {
9232
9085
  $id: "RenderPackOutput",
9233
9086
  additionalProperties: false
@@ -9235,6 +9088,33 @@ var RenderPackOutput = Type$1.Object({
9235
9088
  //#endregion
9236
9089
  //#region ../tasks/src/task-types/index.ts
9237
9090
  /**
9091
+ * Validate that a judgment-task input carries a rubric inside its
9092
+ * `successCriteria` envelope, and that the rubric's weights sum to 1.
9093
+ * Used for `assess_brief` and `judge_pack`.
9094
+ */
9095
+ function validateJudgmentInput(input) {
9096
+ const sc = input.successCriteria;
9097
+ if (!sc) return "successCriteria is required for judgment tasks";
9098
+ if (!sc.rubric) return "successCriteria.rubric is required for judgment tasks";
9099
+ return validateRubricWeights(sc.rubric);
9100
+ }
9101
+ /**
9102
+ * Cross-field rule: when `input.successCriteria` is set, the producer's
9103
+ * output MUST carry a `verification` block (the LLM's self-assessment).
9104
+ * When it is unset, the output MUST NOT carry one (avoid garbage data).
9105
+ *
9106
+ * Used by all three fulfillment task types. Judgment task outputs do
9107
+ * NOT use this — their entire output IS a structured judgment, so a
9108
+ * separate self-assessment field would be circular.
9109
+ */
9110
+ function requireVerificationWhenCriteriaPresent(output, input) {
9111
+ const hasCriteria = input !== void 0 && input !== null && input.successCriteria !== void 0;
9112
+ const hasVerification = output.verification !== void 0;
9113
+ if (hasCriteria && !hasVerification) return "output.verification is required because input.successCriteria is set; the producer LLM must self-assess against the criteria";
9114
+ if (!hasCriteria && hasVerification) return "output.verification was supplied but input.successCriteria is unset; omit verification when there are no criteria to assess against";
9115
+ return null;
9116
+ }
9117
+ /**
9238
9118
  * Client-side task-type registry. Mirrors the server-owned DB registry
9239
9119
  * (PR 2). PR 0 shipped the two brief types; this PR adds the three
9240
9120
  * pack-pipeline types for the three-session attribution loop (#875).
@@ -9249,41 +9129,41 @@ var BUILT_IN_TASK_TYPES = {
9249
9129
  inputSchema: FulfillBriefInput,
9250
9130
  outputSchema: FulfillBriefOutput,
9251
9131
  outputKind: "artifact",
9252
- requiresCriteria: false,
9253
- requiresReferences: false
9132
+ requiresReferences: false,
9133
+ validateOutput: requireVerificationWhenCriteriaPresent
9254
9134
  },
9255
9135
  [ASSESS_BRIEF_TYPE]: {
9256
9136
  name: ASSESS_BRIEF_TYPE,
9257
9137
  inputSchema: AssessBriefInput,
9258
9138
  outputSchema: AssessBriefOutput,
9259
9139
  outputKind: "judgment",
9260
- requiresCriteria: true,
9261
- requiresReferences: true
9140
+ requiresReferences: true,
9141
+ validateInput: validateJudgmentInput
9262
9142
  },
9263
9143
  [CURATE_PACK_TYPE]: {
9264
9144
  name: CURATE_PACK_TYPE,
9265
9145
  inputSchema: CuratePackInput,
9266
9146
  outputSchema: CuratePackOutput,
9267
9147
  outputKind: "artifact",
9268
- requiresCriteria: false,
9269
- requiresReferences: false
9148
+ requiresReferences: false,
9149
+ validateOutput: requireVerificationWhenCriteriaPresent
9270
9150
  },
9271
9151
  [RENDER_PACK_TYPE]: {
9272
9152
  name: RENDER_PACK_TYPE,
9273
9153
  inputSchema: RenderPackInput,
9274
9154
  outputSchema: RenderPackOutput,
9275
9155
  outputKind: "artifact",
9276
- requiresCriteria: false,
9277
- requiresReferences: false
9156
+ requiresReferences: false,
9157
+ validateOutput: requireVerificationWhenCriteriaPresent
9278
9158
  },
9279
9159
  [JUDGE_PACK_TYPE]: {
9280
9160
  name: JUDGE_PACK_TYPE,
9281
9161
  inputSchema: JudgePackInput,
9282
9162
  outputSchema: JudgePackOutput,
9283
9163
  outputKind: "judgment",
9284
- requiresCriteria: false,
9285
9164
  requiresReferences: true,
9286
- validateInput: (input) => validateRubricWeights(input.rubric)
9165
+ validateInput: validateJudgmentInput,
9166
+ validateOutput: validateJudgePackOutput
9287
9167
  }
9288
9168
  };
9289
9169
  //#endregion
@@ -9313,13 +9193,30 @@ function schemaErrors(prefix, schema, value) {
9313
9193
  message: error.message
9314
9194
  }));
9315
9195
  }
9316
- function validateTaskOutput(taskType, output) {
9196
+ function validateTaskOutput(taskType, output, input) {
9317
9197
  const entry = getTaskTypeEntry(taskType);
9318
9198
  if (!entry) return [{
9319
9199
  field: "taskType",
9320
9200
  message: `Unknown task type: ${taskType}`
9321
9201
  }];
9322
- return schemaErrors("output", entry.outputSchema, output);
9202
+ const errors = schemaErrors("output", entry.outputSchema, output);
9203
+ if (errors.length > 0) return errors;
9204
+ if (entry.validateOutput) {
9205
+ const validationError = entry.validateOutput(output, input);
9206
+ if (validationError) return [{
9207
+ field: "output",
9208
+ message: validationError
9209
+ }];
9210
+ }
9211
+ return [];
9212
+ }
9213
+ /**
9214
+ * Resolve the TypeBox output schema registered for `taskType`. Returns
9215
+ * `null` for unknown task types — callers (e.g. submit-tool factories)
9216
+ * decide how to surface that.
9217
+ */
9218
+ function getTaskOutputSchema(taskType) {
9219
+ return getTaskTypeEntry(taskType)?.outputSchema ?? null;
9323
9220
  }
9324
9221
  //#endregion
9325
9222
  //#region ../tasks/src/wire.ts
@@ -9451,7 +9348,6 @@ Type$1.Object({
9451
9348
  input: Type$1.Record(Type$1.String(), Type$1.Unknown()),
9452
9349
  inputSchemaCid: Cid,
9453
9350
  inputCid: Cid,
9454
- criteriaCid: Type$1.Union([Cid, Type$1.Null()]),
9455
9351
  references: Type$1.Array(TaskRef),
9456
9352
  correlationId: Type$1.Union([Uuid, Type$1.Null()]),
9457
9353
  imposedByAgentId: Type$1.Union([Uuid, Type$1.Null()]),
@@ -9549,6 +9445,98 @@ Type$1.Object({
9549
9445
  additionalProperties: false
9550
9446
  });
9551
9447
  //#endregion
9448
+ //#region ../agent-runtime/src/output-tools.ts
9449
+ /**
9450
+ * Submit-output tool contract.
9451
+ *
9452
+ * The runtime advertises a per-task-type "submit output" tool in every
9453
+ * prompt. The tool's name and schema must be the same wherever the
9454
+ * agent encounters it: in the system prompt the model reads, in the
9455
+ * executor that registers it, in any future executor that wires it
9456
+ * into a different coding-agent SDK.
9457
+ *
9458
+ * This module is the single source of truth for the (toolName,
9459
+ * description, parametersSchema) triple. It has no executor-specific
9460
+ * dependencies — `agent-runtime` is intentionally agnostic of the
9461
+ * concrete coding-agent runtime — so anything that wants to register
9462
+ * the tool (pi-extension today, a Codex-SDK adapter tomorrow, a local
9463
+ * MCP bridge if we ever go that route) can read the contract here and
9464
+ * wire it into its own tool API.
9465
+ *
9466
+ * Conventions captured here:
9467
+ *
9468
+ * - Tool name shape: `submit_<task_type>_output` (e.g.
9469
+ * `submit_fulfill_brief_output`). This is the string the model
9470
+ * sees in the prompt's "preferred path" instruction.
9471
+ * - Parameters schema: the task type's TypeBox `*Output` schema
9472
+ * **directly**, NOT wrapped in `{ output: <schema> }`. Tool args
9473
+ * ARE the payload, so the model gets field-level guidance at
9474
+ * planning time.
9475
+ * - Description text: shared across executors so the tool's
9476
+ * advertised purpose is identical regardless of who registers it.
9477
+ */
9478
+ /**
9479
+ * Build the submit-output contract for a task type. Returns `null` if
9480
+ * no output schema is registered for that type — callers (executors)
9481
+ * decide whether that's a hard error, a fallback to the parser-only
9482
+ * path, or anything else.
9483
+ */
9484
+ function getSubmitOutputContract(taskType) {
9485
+ const schema = getTaskOutputSchema(taskType);
9486
+ if (!schema) return null;
9487
+ return {
9488
+ toolName: submitOutputToolName(taskType),
9489
+ taskType,
9490
+ description: `Submit the structured output for this ${taskType} task. Call exactly once when done. The arguments below ARE the output payload — pass each top-level field of the task type's output schema directly. The runtime validates the args against the schema; mismatches return a tool error you can recover from in the same session. On a valid call the runtime captures the payload and ends the session — you do not need to repeat the JSON in your final assistant message.`,
9491
+ parametersSchema: schema
9492
+ };
9493
+ }
9494
+ /**
9495
+ * Plain-string name builder. Exposed separately so the prompt builder
9496
+ * can advertise the tool name even when the schema lookup is deferred
9497
+ * to the executor (the prompt is built before any tool registration
9498
+ * happens).
9499
+ */
9500
+ function submitOutputToolName(taskType) {
9501
+ return `submit_${taskType}_output`;
9502
+ }
9503
+ //#endregion
9504
+ //#region ../agent-runtime/src/prompts/final-output.ts
9505
+ function buildFinalOutputBlock(opts) {
9506
+ const { taskType, outputSchemaName, shapeSketch, extraNotes } = opts;
9507
+ const submitTool = submitOutputToolName(taskType);
9508
+ const lines = [
9509
+ "## Final output (read this carefully)",
9510
+ "",
9511
+ `Your VERY LAST action in this conversation MUST report the structured`,
9512
+ `output matching \`${outputSchemaName}\`. Two ways to do it, in order of`,
9513
+ `preference:`,
9514
+ "",
9515
+ `1. **Preferred — call \`${submitTool}\` exactly once** with the payload.`,
9516
+ ` The runtime captures the validated arguments and ends the session.`,
9517
+ ` If the tool is registered, prefer this path.`,
9518
+ `2. **Fallback** — if the submit tool is unavailable, your very last`,
9519
+ ` assistant message MUST be a single JSON object matching`,
9520
+ ` \`${outputSchemaName}\`. No prose before or after. No code fences.`,
9521
+ ` No "ok" or "done". The runtime parses the last balanced top-level`,
9522
+ ` JSON object as the output.`,
9523
+ "",
9524
+ `Failing to report structured output as the very last action means the`,
9525
+ `attempt is marked failed even if the underlying work succeeded.`,
9526
+ "",
9527
+ `Output shape:`,
9528
+ "",
9529
+ "```json",
9530
+ shapeSketch,
9531
+ "```"
9532
+ ];
9533
+ if (extraNotes?.length) {
9534
+ lines.push("");
9535
+ for (const note of extraNotes) lines.push(note);
9536
+ }
9537
+ return lines.join("\n");
9538
+ }
9539
+ //#endregion
9552
9540
  //#region ../agent-runtime/src/prompts/assess-brief.ts
9553
9541
  /**
9554
9542
  * Build the system prompt for an `assess_brief` judge attempt.
@@ -9573,11 +9561,12 @@ Type$1.Object({
9573
9561
  * anything) work without any code path here.
9574
9562
  */
9575
9563
  function buildAssessBriefPrompt(input, ctx) {
9576
- const criteriaList = input.criteria.map((c, i) => `${i + 1}. **${c.id}** (weight ${c.weight}, scoring: \`${c.scoring}\`) — ${c.description}`).join("\n");
9577
- const preambleSection = input.rubricPreamble ? [
9564
+ const rubric = input.successCriteria.rubric;
9565
+ const criteriaList = rubric.criteria.map((c, i) => `${i + 1}. **${c.id}** (weight ${c.weight}, scoring: \`${c.scoring}\`) — ${c.description}`).join("\n");
9566
+ const preambleSection = rubric.preamble ? [
9578
9567
  "### Rubric preamble",
9579
9568
  "",
9580
- input.rubricPreamble,
9569
+ rubric.preamble,
9581
9570
  ""
9582
9571
  ].join("\n") : "";
9583
9572
  return [
@@ -9606,6 +9595,20 @@ function buildAssessBriefPrompt(input, ctx) {
9606
9595
  " - `summary` set → use as orientation, not as ground truth.",
9607
9596
  "Adapt your investigation to whatever the output actually contains. Score conservatively when the producer's output is opaque or thin.",
9608
9597
  "",
9598
+ "### Querying the producer's diary entries",
9599
+ "",
9600
+ `Beyond the explicit \`diaryEntryIds[]\` from step 3, the producer's`,
9601
+ "attempts auto-tag every entry with the `task:*` provenance namespace.",
9602
+ "You can pull the full set without enumerating ids by passing the",
9603
+ "`taskFilter` shorthand to `moltnet_list_entries` or",
9604
+ "`moltnet_search_entries`:",
9605
+ "",
9606
+ `- All entries from the producer task: \`taskFilter: { taskId: "${input.targetTaskId}" }\`.`,
9607
+ "- Just the accepted attempt: add `attemptN: <acceptedAttemptN>`.",
9608
+ "- The producer plus any prior chain (when a correlationId was set):",
9609
+ " read it from the task you fetched in step 1 and pass",
9610
+ " `taskFilter: { correlationId: \"<id>\" }`.",
9611
+ "",
9609
9612
  preambleSection,
9610
9613
  "## Criteria",
9611
9614
  "",
@@ -9613,19 +9616,63 @@ function buildAssessBriefPrompt(input, ctx) {
9613
9616
  "",
9614
9617
  "### Scoring rules",
9615
9618
  "",
9616
- "- `llm_judged`: score 0..1 continuous. `rationale` REQUIRED (2–4 sentences).",
9619
+ "- `llm_score`: score 0..1 continuous. `rationale` REQUIRED (2–4 sentences).",
9617
9620
  "- `boolean`: score exactly 0 or 1. `rationale` optional.",
9618
9621
  "- `deterministic_signature_check`: run `moltnet entry verify` on every diary entry returned by step 3 above AND `git verify-commit` on every commit. Score 1 iff ALL signatures are valid; otherwise 0. Populate `evidence.commitsVerified`, `evidence.commitsTotal`, `evidence.signatureFailures`.",
9619
9622
  "",
9620
- "### Final output",
9623
+ "Write a signed diary entry (tags: \"judgment\", \"assess_brief\") capturing the rationale before reporting structured output.",
9621
9624
  "",
9622
- "Emit a JSON object matching `AssessBriefOutput`:",
9623
- " { \"scores\": [{criterionId, score, rationale?, evidence?}], \"composite\", \"verdict\", \"judgeModel\"? }",
9624
- "`composite` = Σ(weight_i × score_i) recomputed. The runtime will reject a mismatch.",
9625
- "Write a signed diary entry (tags: \"judgment\", \"assess_brief\") capturing the rationale before emitting the JSON."
9625
+ buildFinalOutputBlock({
9626
+ taskType: "assess_brief",
9627
+ outputSchemaName: "AssessBriefOutput",
9628
+ shapeSketch: [
9629
+ "{",
9630
+ " \"scores\": [",
9631
+ " { \"criterionId\": \"...\", \"score\": 0.0, \"rationale\": \"...\", \"evidence\": {} }",
9632
+ " ],",
9633
+ " \"composite\": <sum>,",
9634
+ " \"verdict\": \"<1-3 sentence overall>\",",
9635
+ " \"judgeModel\": \"<provider:model>\"",
9636
+ "}"
9637
+ ].join("\n"),
9638
+ extraNotes: ["`composite` = Σ(weight_i × score_i) recomputed. The runtime rejects a mismatch."]
9639
+ })
9626
9640
  ].filter(Boolean).join("\n");
9627
9641
  }
9628
9642
  //#endregion
9643
+ //#region ../agent-runtime/src/prompts/self-verification.ts
9644
+ function buildSelfVerificationBlock(taskId) {
9645
+ return [
9646
+ "## Self-verification",
9647
+ "",
9648
+ `Call \`moltnet_get_task\` with task id \`${taskId}\` and read \`input.successCriteria\`.`,
9649
+ "",
9650
+ "- If `input.successCriteria` is **absent**, omit `verification` from your",
9651
+ " final output entirely.",
9652
+ "- If `input.successCriteria` is **present**, you MUST include a",
9653
+ " `verification` block in your final output. Evaluate every applicable",
9654
+ " item — `gates`, `assertions`, `rubric` criteria, `sideEffects` — against",
9655
+ " your produced work and emit one result per id. Be honest: a `fail` with",
9656
+ " a one-line reason is more useful than a false `pass`. Use `skip` (with a",
9657
+ " `detail`) when you genuinely could not determine a result. Compute",
9658
+ " `passed = results.every(r => r.status !== 'fail')`.",
9659
+ "",
9660
+ "Verification shape:",
9661
+ "",
9662
+ "```json",
9663
+ "{",
9664
+ " \"inputCid\": \"<the inputCid you saw on the task>\",",
9665
+ " \"results\": [",
9666
+ " { \"id\": \"<criterion id>\", \"kind\": \"assertion|gate|rubric|sideEffect\",",
9667
+ " \"status\": \"pass|fail|skip\", \"detail\": \"<optional one-liner>\" }",
9668
+ " ],",
9669
+ " \"passed\": <boolean>",
9670
+ "}",
9671
+ "```",
9672
+ ""
9673
+ ].join("\n");
9674
+ }
9675
+ //#endregion
9629
9676
  //#region ../agent-runtime/src/prompts/curate-pack.ts
9630
9677
  /**
9631
9678
  * Build the system prompt for a `curate_pack` task.
@@ -9699,9 +9746,16 @@ function buildCuratePackPrompt(input, ctx) {
9699
9746
  "## Tools available (not a recipe — use what the situation calls for)",
9700
9747
  "",
9701
9748
  "- `moltnet_diary_tags` — tag inventory with counts. Cheap reconnaissance",
9702
- " when the prompt implies a scope but not a tag.",
9749
+ " when the prompt implies a scope but not a tag. Pass",
9750
+ " `prefix: \"task:\"` to enumerate task-provenance tags only",
9751
+ " (`task:type:*`, `task:correlation:*`, etc.).",
9703
9752
  "- `moltnet_search_entries` — hybrid semantic + lexical search.",
9704
- "- `moltnet_list_entries` tag-filtered listing.",
9753
+ " Filters AND with the query: pass `tags`, `excludeTags`,",
9754
+ " `entryTypes`, or the `taskFilter` shorthand to narrow before",
9755
+ " ranking. Example: `taskFilter: { taskType: \"fulfill_brief\" }`",
9756
+ " returns only entries from fulfill_brief attempts.",
9757
+ "- `moltnet_list_entries` — multi-tag (AND) listing with optional",
9758
+ " `excludeTags`, `entryType`, and the same `taskFilter` shorthand.",
9705
9759
  "- `moltnet_get_entry` — full entry read, for disambiguation.",
9706
9760
  "- `moltnet_pack_create` — terminal call that persists the pack.",
9707
9761
  "",
@@ -9747,31 +9801,32 @@ function buildCuratePackPrompt(input, ctx) {
9747
9801
  "",
9748
9802
  "## Hard constraints",
9749
9803
  "",
9750
- "- Do NOT call `moltnet_pack_render` or `moltnet_rendered_pack_judge` ",
9751
- " those belong to the next sessions.",
9804
+ "- Do NOT call `moltnet_pack_render` that belongs to the next session.",
9752
9805
  "- Do NOT write diary entries unless curation surfaces a genuine",
9753
9806
  " incident worth recording. The curation reasoning lives in the task",
9754
9807
  " output, not in the diary.",
9755
9808
  "- Respect hard include/exclude filters literally.",
9756
9809
  "",
9757
- "## Final output",
9758
- "",
9759
- "Write to stdout a JSON object matching `CuratePackOutput`:",
9760
- "```",
9761
- "{",
9762
- " \"packId\": \"<uuid>\",",
9763
- " \"packCid\": \"<cid>\",",
9764
- " \"entries\": [",
9765
- " { \"entryId\": \"<uuid>\", \"rank\": 1, \"rationale\": \"<why>\" }",
9766
- " ],",
9767
- " \"recipeParams\": { \"recipe\": \"...\", \"prompt\": \"...\", ... },",
9768
- " \"checkpoints\": [",
9769
- " { \"phase\": \"recon\", \"candidateIds\": [...], \"droppedIds\": [...], \"notes\": \"...\" }",
9770
- " ],",
9771
- " \"summary\": \"<2-4 sentences: what you looked for, how you narrowed, what defines the final set>\"",
9772
- "}",
9773
- "```",
9774
- "The runtime parses this. Failing to emit it is a task failure."
9810
+ buildSelfVerificationBlock(ctx.taskId),
9811
+ buildFinalOutputBlock({
9812
+ taskType: "curate_pack",
9813
+ outputSchemaName: "CuratePackOutput",
9814
+ shapeSketch: [
9815
+ "{",
9816
+ " \"packId\": \"<uuid>\",",
9817
+ " \"packCid\": \"<cid>\",",
9818
+ " \"entries\": [",
9819
+ " { \"entryId\": \"<uuid>\", \"rank\": 1, \"rationale\": \"<why>\" }",
9820
+ " ],",
9821
+ " \"recipeParams\": { \"recipe\": \"...\", \"prompt\": \"...\", ... },",
9822
+ " \"checkpoints\": [",
9823
+ " { \"phase\": \"recon\", \"candidateIds\": [...], \"droppedIds\": [...], \"notes\": \"...\" }",
9824
+ " ],",
9825
+ " \"summary\": \"<2-4 sentences: what you looked for, how you narrowed, what defines the final set>\",",
9826
+ " \"verification\": <required iff input.successCriteria; see Self-verification>",
9827
+ "}"
9828
+ ].join("\n")
9829
+ })
9775
9830
  ].filter((l) => l !== null).join("\n");
9776
9831
  }
9777
9832
  //#endregion
@@ -9829,17 +9884,28 @@ function buildFulfillBriefPrompt(input, ctx) {
9829
9884
  " `MoltNet-Diary: <id>` (per the runtime instructor).",
9830
9885
  "6. Push the branch and open a PR.",
9831
9886
  "",
9832
- "### Final output",
9833
- "",
9834
- "When done, write to stdout a JSON object with shape matching `FulfillBriefOutput`:",
9835
- " { \"branch\", \"commits\": [{sha, message, diaryEntryId}], \"pullRequestUrl\", \"diaryEntryIds\", \"summary\" }",
9836
- "The runtime parses this as the structured task output. Failing to emit it is a failure."
9887
+ buildSelfVerificationBlock(ctx.taskId),
9888
+ buildFinalOutputBlock({
9889
+ taskType: "fulfill_brief",
9890
+ outputSchemaName: "FulfillBriefOutput",
9891
+ shapeSketch: [
9892
+ "{",
9893
+ " \"branch\": \"<branch-name>\",",
9894
+ " \"commits\": [{ \"sha\": \"...\", \"message\": \"...\", \"diaryEntryId\": \"...\" }],",
9895
+ " \"pullRequestUrl\": \"<url-or-null>\",",
9896
+ " \"diaryEntryIds\": [\"...\"],",
9897
+ " \"summary\": \"<1-3 sentence recap>\",",
9898
+ " \"verification\": <required iff input.successCriteria; see Self-verification>",
9899
+ "}"
9900
+ ].join("\n")
9901
+ })
9837
9902
  ].filter(Boolean).join("\n");
9838
9903
  }
9839
9904
  //#endregion
9840
9905
  //#region ../agent-runtime/src/prompts/judge-pack.ts
9841
9906
  function buildJudgePackPrompt(input, ctx) {
9842
- const { renderedPackId, sourcePackId, rubric } = input;
9907
+ const { renderedPackId, sourcePackId, successCriteria } = input;
9908
+ const rubric = successCriteria.rubric;
9843
9909
  const criteriaList = rubric.criteria.map((c, i) => `${i + 1}. **${c.id}** (weight ${c.weight}, scoring: \`${c.scoring}\`) — ${c.description}`).join("\n");
9844
9910
  const preambleSection = rubric.preamble ? [
9845
9911
  "### Rubric preamble",
@@ -9869,7 +9935,7 @@ function buildJudgePackPrompt(input, ctx) {
9869
9935
  "",
9870
9936
  "1. Call `moltnet_rendered_pack_get` for the rendered pack. Keep the",
9871
9937
  " `content` string — you will score it.",
9872
- "2. Call `moltnet_pack_get` with `expand: \"entries\"` for the source",
9938
+ "2. Call `moltnet_pack_get` with `expandEntries: true` for the source",
9873
9939
  " pack. Keep the source entries for grounding / coverage checks.",
9874
9940
  "3. For each criterion, score according to its `scoring` mode (see",
9875
9941
  " Scoring rules below). Produce rationales where required.",
@@ -9882,9 +9948,23 @@ function buildJudgePackPrompt(input, ctx) {
9882
9948
  "",
9883
9949
  "### Scoring rules",
9884
9950
  "",
9885
- "- `llm_judged`: score 0..1 continuous. `rationale` REQUIRED (2–4",
9951
+ "- `llm_score`: score 0..1 continuous. `rationale` REQUIRED (2–4",
9886
9952
  " sentences pointing at specific evidence in the rendered content or",
9887
- " the source entries).",
9953
+ " the source entries). NOTE: this mode smooths individual failures",
9954
+ " into the gradient. Prefer `llm_checklist` for grounding,",
9955
+ " faithfulness, or any property where one failure is a real failure.",
9956
+ "- `llm_checklist`: enumerate per-claim binary assertions instead of",
9957
+ " picking a continuous score. For each assertion, return",
9958
+ " `{ id, text, passed: bool, evidence: string }`. `evidence` is",
9959
+ " REQUIRED for both PASS and FAIL — for PASS, quote the supporting",
9960
+ " span (rendered or source) or cite the source entry id; for FAIL,",
9961
+ " quote the offending claim verbatim and explain why it fails.",
9962
+ " Don't give the benefit of the doubt: if a claim looks supported but",
9963
+ " you cannot point at the supporting source span, mark it FAIL with",
9964
+ " evidence = \"no supporting span found\". Set the criterion `score`",
9965
+ " to `1` iff every assertion passes, else `0` — the runtime checks",
9966
+ " this matches the assertions array. Populate `assertions` on the",
9967
+ " score object; leave `evidence` (the structured record) empty.",
9888
9968
  "- `boolean`: score exactly 0 or 1. `rationale` optional.",
9889
9969
  "- `deterministic_signature_check`: batch-fetch ALL referenced source",
9890
9970
  " entries in a single call — `moltnet_list_entries` with `entryIds` set",
@@ -9915,23 +9995,36 @@ function buildJudgePackPrompt(input, ctx) {
9915
9995
  " may leak guidance that biases judgment.",
9916
9996
  "- Keep the session focused on scoring; no speculative exploration.",
9917
9997
  "",
9918
- "## Final output",
9919
- "",
9920
- "Write to stdout a JSON object matching `JudgePackOutput`:",
9921
- "```",
9922
- "{",
9923
- " \"scores\": [{\"criterionId\": \"...\", \"score\": 0.0, \"rationale\": \"...\", \"evidence\": {...}}],",
9924
- " \"composite\": <sum-of-weighted-scores>,",
9925
- " \"verdict\": \"<1-3 sentence overall>\",",
9926
- " \"judgeModel\": \"<provider:model>\",",
9927
- " \"rendererBinaryCid\": \"<cid-string-only-if-available>\"",
9928
- "}",
9929
- "```",
9930
- "Omit `rendererBinaryCid` entirely when no binary CID is exposed by",
9931
- "`moltnet_rendered_pack_get`. Do NOT emit `null` — the field is optional",
9932
- "and absence is the correct representation when unavailable.",
9933
9998
  `Write a signed diary entry (tags: \`judgment\`, \`judge_pack\`, \`rubric:${rubric.rubricId}\`) capturing the rationale before`,
9934
- "emitting the JSON."
9999
+ "reporting structured output.",
10000
+ "",
10001
+ buildFinalOutputBlock({
10002
+ taskType: "judge_pack",
10003
+ outputSchemaName: "JudgePackOutput",
10004
+ shapeSketch: [
10005
+ "{",
10006
+ " \"scores\": [",
10007
+ " { \"criterionId\": \"...\", \"score\": 0.0, \"rationale\": \"...\", \"evidence\": {} },",
10008
+ " {",
10009
+ " \"criterionId\": \"<llm_checklist criterion>\",",
10010
+ " \"score\": 0, // 1 iff every assertion passed",
10011
+ " \"assertions\": [",
10012
+ " { \"id\": \"claim-1\", \"text\": \"...\", \"passed\": false, \"evidence\": \"...\" }",
10013
+ " ]",
10014
+ " }",
10015
+ " ],",
10016
+ " \"composite\": <sum-of-weighted-scores>,",
10017
+ " \"verdict\": \"<1-3 sentence overall>\",",
10018
+ " \"judgeModel\": \"<provider:model>\",",
10019
+ " \"rendererBinaryCid\": \"<cid-string-only-if-available>\"",
10020
+ "}"
10021
+ ].join("\n"),
10022
+ extraNotes: [
10023
+ "Omit `rendererBinaryCid` entirely when no binary CID is exposed by",
10024
+ "`moltnet_rendered_pack_get`. Do NOT emit `null` — the field is",
10025
+ "optional and absence is the correct representation when unavailable."
10026
+ ]
10027
+ })
9935
10028
  ].filter((l) => l !== null).join("\n");
9936
10029
  }
9937
10030
  //#endregion
@@ -9960,7 +10053,7 @@ function buildRenderPackPrompt(input, ctx) {
9960
10053
  "",
9961
10054
  "## Workflow",
9962
10055
  "",
9963
- "1. Call `moltnet_pack_get` with `expand: \"entries\"` to inspect the",
10056
+ "1. Call `moltnet_pack_get` with `expandEntries: true` to inspect the",
9964
10057
  " source entries. Read it — you need the entry count for your output.",
9965
10058
  "2. Call `moltnet_pack_render` with:",
9966
10059
  ` - \`packId\`: \`${packId}\``,
@@ -9972,24 +10065,25 @@ function buildRenderPackPrompt(input, ctx) {
9972
10065
  "## Constraints",
9973
10066
  "",
9974
10067
  "- Do NOT modify the source pack or its entries.",
9975
- "- Do NOT call `moltnet_rendered_pack_judge`.",
9976
10068
  "- Do NOT write diary entries unless a genuine incident occurs",
9977
10069
  " (rendering failure, invariant violation).",
9978
10070
  "",
9979
- "## Final output",
9980
- "",
9981
- "Write to stdout a JSON object matching `RenderPackOutput`:",
9982
- "```",
9983
- "{",
9984
- " \"renderedPackId\": \"<uuid-or-null>\",",
9985
- " \"renderedCid\": \"<cid>\",",
9986
- " \"renderMethod\": \"<label>\",",
9987
- " \"byteSize\": <int>,",
9988
- " \"entriesRendered\": <int>,",
9989
- " \"summary\": \"<1-3 sentence recap>\"",
9990
- "}",
9991
- "```",
9992
- "Failing to emit it is a task failure."
10071
+ buildSelfVerificationBlock(ctx.taskId),
10072
+ buildFinalOutputBlock({
10073
+ taskType: "render_pack",
10074
+ outputSchemaName: "RenderPackOutput",
10075
+ shapeSketch: [
10076
+ "{",
10077
+ " \"renderedPackId\": \"<uuid-or-null>\",",
10078
+ " \"renderedCid\": \"<cid>\",",
10079
+ " \"renderMethod\": \"<label>\",",
10080
+ " \"byteSize\": <int>,",
10081
+ " \"entriesRendered\": <int>,",
10082
+ " \"summary\": \"<1-3 sentence recap>\",",
10083
+ " \"verification\": <required iff input.successCriteria; see Self-verification>",
10084
+ "}"
10085
+ ].join("\n")
10086
+ })
9993
10087
  ].join("\n");
9994
10088
  }
9995
10089
  //#endregion
@@ -12020,7 +12114,7 @@ var require_transport = /* @__PURE__ */ __commonJSMin(((exports, module) => {
12020
12114
  var { existsSync: existsSync$1 } = __require("node:fs");
12021
12115
  var getCallers = require_caller();
12022
12116
  var { join: join$1, isAbsolute, sep } = __require("node:path");
12023
- var { fileURLToPath: fileURLToPath$1 } = __require("node:url");
12117
+ var { fileURLToPath } = __require("node:url");
12024
12118
  var sleep = require_atomic_sleep();
12025
12119
  var onExit = require_on_exit_leak_free();
12026
12120
  var ThreadStream = require_thread_stream();
@@ -12076,7 +12170,7 @@ var require_transport = /* @__PURE__ */ __commonJSMin(((exports, module) => {
12076
12170
  if (!unquoted) return false;
12077
12171
  let path = unquoted;
12078
12172
  if (path.startsWith("file://")) try {
12079
- path = fileURLToPath$1(path);
12173
+ path = fileURLToPath(path);
12080
12174
  } catch {
12081
12175
  return false;
12082
12176
  }
@@ -13567,9 +13661,13 @@ function buildRuntimeInstructor(ctx) {
13567
13661
  `- During this task, every diary entry MUST land in \`${ctx.diaryId}\``,
13568
13662
  " (the task diary). The MCP `moltnet_create_entry` tool enforces this",
13569
13663
  " and rejects mismatched explicit `diaryId` parameters.",
13570
- `- Provenance tags \`task:${ctx.taskId}\`, \`task_type:${ctx.taskType}\`,`,
13571
- ` and \`task_attempt:${ctx.attemptN}\`${ctx.correlationId ? `, plus \`correlation:${ctx.correlationId}\`` : ""} are auto-injected on every entry.`,
13572
- " You may add additional tags; you cannot remove the auto-tags.",
13664
+ `- Provenance tags \`task:id:${ctx.taskId}\`, \`task:type:${ctx.taskType}\`,`,
13665
+ ` and \`task:attempt:${ctx.attemptN}\`${ctx.correlationId ? `, plus \`task:correlation:${ctx.correlationId}\`` : ""} are auto-injected on every entry.`,
13666
+ " These share the `task:` namespace so `moltnet_diary_tags` with",
13667
+ " `prefix: \"task:\"` lists every task-scoped tag, and the",
13668
+ " `taskFilter` shorthand on `moltnet_list_entries` /",
13669
+ " `moltnet_search_entries` expands into them. You may add additional",
13670
+ " tags but you cannot remove the auto-injected ones.",
13573
13671
  "",
13574
13672
  "## Accountable commits",
13575
13673
  "",
@@ -13598,42 +13696,78 @@ function buildRuntimeInstructor(ctx) {
13598
13696
  }
13599
13697
  //#endregion
13600
13698
  //#region src/runtime/task-output.ts
13601
- async function parseStructuredTaskOutput(assistantText, taskType) {
13699
+ var METER_NAME = "@themoltnet/pi-extension/task-output";
13700
+ var parseResultCounter = null;
13701
+ function getParseResultCounter() {
13702
+ if (parseResultCounter) return parseResultCounter;
13703
+ parseResultCounter = metrics.getMeter(METER_NAME).createCounter("agent_runtime.task_output.parse_result", {
13704
+ description: "Outcome of structured task-output capture, labelled by task_type, model, and code (success | output_missing | output_validation_failed | unknown_task_type | output_cid_compute_failed | captured_via_tool).",
13705
+ unit: "1"
13706
+ });
13707
+ return parseResultCounter;
13708
+ }
13709
+ /**
13710
+ * Record one parse-result observation. Exposed so the executor can also
13711
+ * record the `captured_via_tool` outcome from the submit-tool path
13712
+ * without bouncing through the parser. Labels: `task_type`, `model`, `code`.
13713
+ */
13714
+ function recordTaskOutputParseResult(args) {
13715
+ getParseResultCounter().add(1, {
13716
+ task_type: args.taskType,
13717
+ model: args.model ?? "unknown",
13718
+ code: args.code
13719
+ });
13720
+ }
13721
+ async function parseStructuredTaskOutput(assistantText, taskType, opts = {}) {
13722
+ const record = (code) => recordTaskOutputParseResult({
13723
+ taskType,
13724
+ model: opts.model,
13725
+ code
13726
+ });
13602
13727
  const extracted = extractJsonObject(assistantText);
13603
- if (!extracted) return {
13604
- output: null,
13605
- outputCid: null,
13606
- error: {
13607
- code: "output_missing",
13608
- message: "Agent did not emit a parseable JSON object as its final message."
13609
- }
13610
- };
13728
+ if (!extracted) {
13729
+ record("output_missing");
13730
+ return {
13731
+ output: null,
13732
+ outputCid: null,
13733
+ error: {
13734
+ code: "output_missing",
13735
+ message: "Agent did not emit a parseable JSON object as its final message."
13736
+ }
13737
+ };
13738
+ }
13611
13739
  const errors = validateTaskOutput(taskType, extracted);
13612
13740
  if (errors.length > 0) {
13613
13741
  const details = errors.slice(0, 3).map((error) => `${error.field}: ${error.message}`);
13614
13742
  const [firstError] = errors;
13743
+ const code = firstError?.field === "taskType" ? "unknown_task_type" : "output_validation_failed";
13744
+ record(code);
13615
13745
  return {
13616
13746
  output: null,
13617
13747
  outputCid: null,
13618
13748
  error: {
13619
- code: firstError?.field === "taskType" ? "unknown_task_type" : "output_validation_failed",
13749
+ code,
13620
13750
  message: `Output failed schema validation: ${details.join("; ")}`
13621
13751
  }
13622
13752
  };
13623
13753
  }
13624
13754
  try {
13755
+ const outputCid = await computeJsonCid(extracted);
13756
+ record("success");
13625
13757
  return {
13626
13758
  output: extracted,
13627
- outputCid: await computeJsonCid(extracted),
13759
+ outputCid,
13628
13760
  error: null
13629
13761
  };
13630
13762
  } catch (error) {
13763
+ const message = error instanceof Error ? error.message : String(error);
13764
+ record("output_cid_compute_failed");
13631
13765
  return {
13632
13766
  output: null,
13633
13767
  outputCid: null,
13634
13768
  error: {
13635
13769
  code: "output_cid_compute_failed",
13636
- message: `Validated output could not be canonicalized: ${error instanceof Error ? error.message : String(error)}`
13770
+ message: `Validated output could not be canonicalized: ${message}`
13637
13771
  }
13638
13772
  };
13639
13773
  }
@@ -13689,6 +13823,99 @@ function extractJsonObject(text) {
13689
13823
  return null;
13690
13824
  }
13691
13825
  //#endregion
13826
+ //#region src/runtime/submit-output-tool.ts
13827
+ /**
13828
+ * Sentinel thrown when the requested task type has no registered output
13829
+ * schema. The executor recognises this specific error class and falls
13830
+ * back to the parser path; any other error from `createSubmitOutputTool`
13831
+ * is unexpected and must propagate.
13832
+ */
13833
+ var UnknownTaskTypeForSubmitToolError = class extends Error {
13834
+ constructor(taskType) {
13835
+ super(`createSubmitOutputTool: no output schema registered for task type "${taskType}"`);
13836
+ this.taskType = taskType;
13837
+ this.name = "UnknownTaskTypeForSubmitToolError";
13838
+ }
13839
+ };
13840
+ function createSubmitOutputTool(taskType, opts = {}) {
13841
+ const contract = getSubmitOutputContract(taskType);
13842
+ if (!contract) throw new UnknownTaskTypeForSubmitToolError(taskType);
13843
+ const schema = contract.parametersSchema;
13844
+ let captured = null;
13845
+ let callCount = 0;
13846
+ return {
13847
+ tool: defineTool({
13848
+ name: contract.toolName,
13849
+ label: `Submit ${taskType} output`,
13850
+ description: contract.description,
13851
+ parameters: schema,
13852
+ async execute(_id, params) {
13853
+ const errors = validateTaskOutput(taskType, params);
13854
+ if (errors.length > 0) {
13855
+ const detailMsg = errors.slice(0, 3).map((err) => `${err.field}: ${err.message}`).join("; ");
13856
+ const details = {
13857
+ captured: false,
13858
+ callCount,
13859
+ error: "output_validation_failed"
13860
+ };
13861
+ recordTaskOutputParseResult({
13862
+ taskType,
13863
+ model: opts.model,
13864
+ code: "output_validation_failed"
13865
+ });
13866
+ return {
13867
+ content: [{
13868
+ type: "text",
13869
+ text: `Output failed validation: ${detailMsg}. Re-call this tool with a corrected output.`
13870
+ }],
13871
+ details,
13872
+ isError: true
13873
+ };
13874
+ }
13875
+ captured = params;
13876
+ callCount += 1;
13877
+ return {
13878
+ content: [{
13879
+ type: "text",
13880
+ text: "Output captured. The runtime now has the validated payload; no further action is needed for output reporting."
13881
+ }],
13882
+ details: {
13883
+ captured: true,
13884
+ callCount,
13885
+ error: null
13886
+ },
13887
+ terminate: true
13888
+ };
13889
+ }
13890
+ }),
13891
+ getCaptured: () => captured,
13892
+ getCallCount: () => callCount
13893
+ };
13894
+ }
13895
+ /**
13896
+ * Build the submit-tool wiring for one task attempt. Returns a handle
13897
+ * (or `null` if no submit-tool should be registered) plus the
13898
+ * `customTools`-shaped array ready to spread into the session config.
13899
+ *
13900
+ * The catch is **narrowed** to `UnknownTaskTypeForSubmitToolError` —
13901
+ * exporters/dependency-API drift would otherwise be silently degraded
13902
+ * to parser-only behaviour, which reintroduces the failure mode this
13903
+ * change is fixing. Any other error from the factory propagates.
13904
+ */
13905
+ function resolveSubmitTools(taskType, opts = {}) {
13906
+ let handle;
13907
+ try {
13908
+ handle = createSubmitOutputTool(taskType, opts);
13909
+ } catch (err) {
13910
+ if (err instanceof UnknownTaskTypeForSubmitToolError) handle = null;
13911
+ else throw err;
13912
+ }
13913
+ return {
13914
+ handle,
13915
+ tools: handle ? [handle.tool] : []
13916
+ };
13917
+ }
13918
+ //#endregion
13692
13919
  //#region src/runtime/execute-pi-task.ts
13693
13920
  /**
13694
13921
  * executePiTask — run a single Task attempt using pi-coding-agent inside a
@@ -13834,6 +14061,8 @@ async function executePiTask(claimedTask, reporter, opts) {
13834
14061
  createEditToolDefinition(mountPath, { operations: createGondolinEditOps(managed.vm, mountPath) }),
13835
14062
  createBashToolDefinition(mountPath, { operations: createGondolinBashOps(managed.vm, mountPath) })
13836
14063
  ];
14064
+ const { handle: submitToolHandle, tools: submitToolDefs } = resolveSubmitTools(task.taskType, { model: opts.model });
14065
+ const submitTools = submitToolDefs;
13837
14066
  try {
13838
14067
  const moltnetAgent = await connect({ configDir: managed.agentDir });
13839
14068
  const moltnetTools = createMoltNetTools({
@@ -13885,7 +14114,11 @@ async function executePiTask(claimedTask, reporter, opts) {
13885
14114
  agentDir: piAuthDir,
13886
14115
  cwd: mountPath,
13887
14116
  model: modelHandle,
13888
- customTools: [...gondolinCustomTools, ...moltnetTools],
14117
+ customTools: [
14118
+ ...gondolinCustomTools,
14119
+ ...moltnetTools,
14120
+ ...submitTools
14121
+ ],
13889
14122
  sessionManager: SessionManager.inMemory(),
13890
14123
  resourceLoader
13891
14124
  })).session;
@@ -13962,14 +14195,43 @@ async function executePiTask(claimedTask, reporter, opts) {
13962
14195
  let parsedOutputCid = null;
13963
14196
  let parseError = null;
13964
14197
  if (!runError && !llmAbort && !cancelled) {
13965
- const parsed = await parseStructuredTaskOutput(assistantText, task.taskType);
13966
- parsedOutput = parsed.output;
13967
- parsedOutputCid = parsed.outputCid;
13968
- parseError = parsed.error;
13969
- if (parseError) await emit("error", {
13970
- message: parseError.message,
13971
- phase: "output_validation"
13972
- });
14198
+ const captured = submitToolHandle?.getCaptured() ?? null;
14199
+ if (captured) try {
14200
+ parsedOutput = captured;
14201
+ parsedOutputCid = await computeJsonCid(captured);
14202
+ recordTaskOutputParseResult({
14203
+ taskType: task.taskType,
14204
+ model: opts.model,
14205
+ code: "captured_via_tool"
14206
+ });
14207
+ } catch (err) {
14208
+ const message = err instanceof Error ? err.message : String(err);
14209
+ parsedOutput = null;
14210
+ parsedOutputCid = null;
14211
+ parseError = {
14212
+ code: "output_cid_compute_failed",
14213
+ message: `Captured submit-tool output could not be canonicalized: ${message}`
14214
+ };
14215
+ recordTaskOutputParseResult({
14216
+ taskType: task.taskType,
14217
+ model: opts.model,
14218
+ code: "output_cid_compute_failed"
14219
+ });
14220
+ await emit("error", {
14221
+ message: parseError.message,
14222
+ phase: "output_validation"
14223
+ });
14224
+ }
14225
+ else {
14226
+ const parsed = await parseStructuredTaskOutput(assistantText, task.taskType, { model: opts.model });
14227
+ parsedOutput = parsed.output;
14228
+ parsedOutputCid = parsed.outputCid;
14229
+ parseError = parsed.error;
14230
+ if (parseError) await emit("error", {
14231
+ message: parseError.message,
14232
+ phase: "output_validation"
14233
+ });
14234
+ }
13973
14235
  }
13974
14236
  if (cancelled) return {
13975
14237
  taskId: task.id,
@@ -14365,4 +14627,4 @@ function moltnetExtension(pi) {
14365
14627
  registerMoltnetReflectCommand(pi, state);
14366
14628
  }
14367
14629
  //#endregion
14368
- export { HOST_EXEC_DEFAULT_BASE_ENV, activateAgentEnv, buildPiJudgeRecipeManifest, computePiJudgeRecipeCid, createGondolinBashOps, createGondolinEditOps, createGondolinReadOps, createGondolinWriteOps, createMoltNetTools, createPiOtelExtension, createPiTaskExecutor, moltnetExtension as default, ensureSnapshot, executePiTask, findMainWorktree, loadCredentials, resolvePiJudgeRecipeVersions, resumeVm, toGuestPath };
14630
+ export { HOST_EXEC_DEFAULT_BASE_ENV, activateAgentEnv, createGondolinBashOps, createGondolinEditOps, createGondolinReadOps, createGondolinWriteOps, createMoltNetTools, createPiOtelExtension, createPiTaskExecutor, moltnetExtension as default, ensureSnapshot, executePiTask, findMainWorktree, loadCredentials, resumeVm, toGuestPath };