@themoltnet/pi-extension 0.10.0 → 0.12.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +0 -2
- package/dist/index.d.ts +8 -51
- package/dist/index.js +831 -569
- package/package.json +27 -11
package/dist/index.js
CHANGED
|
@@ -7,11 +7,10 @@ import { createHash } from "node:crypto";
|
|
|
7
7
|
import crypto, { createHash as createHash$1 } from "crypto";
|
|
8
8
|
import { readFile } from "node:fs/promises";
|
|
9
9
|
import { homedir } from "node:os";
|
|
10
|
-
import { Type,
|
|
10
|
+
import { Type, getModel } from "@mariozechner/pi-ai";
|
|
11
11
|
import { RealFSProvider, ShadowProvider, VM, VmCheckpoint, createHttpHooks, createShadowPathPredicate, ensureImageSelector, loadGuestAssets } from "@earendil-works/gondolin";
|
|
12
12
|
import { parseEnv } from "node:util";
|
|
13
|
-
import {
|
|
14
|
-
import { SpanStatusCode, context, trace } from "@opentelemetry/api";
|
|
13
|
+
import { SpanStatusCode, context, metrics, trace } from "@opentelemetry/api";
|
|
15
14
|
import { FormatRegistry, Type as Type$1 } from "@sinclair/typebox";
|
|
16
15
|
import { Value } from "@sinclair/typebox/value";
|
|
17
16
|
//#region \0rolldown/runtime.js
|
|
@@ -3848,7 +3847,7 @@ var cidSymbol = Symbol.for("@ipld/js-cid/CID");
|
|
|
3848
3847
|
* naturally prevents field delimiter collision.
|
|
3849
3848
|
*/
|
|
3850
3849
|
/** SHA-256 multicodec code per multihash table */
|
|
3851
|
-
var SHA2_256_CODE
|
|
3850
|
+
var SHA2_256_CODE = 18;
|
|
3852
3851
|
/**
|
|
3853
3852
|
* Build the canonical JSON input for content hashing.
|
|
3854
3853
|
*
|
|
@@ -3880,7 +3879,7 @@ function computeCanonicalHash(entryType, title, content, tags) {
|
|
|
3880
3879
|
* Example output: "bafkreig..."
|
|
3881
3880
|
*/
|
|
3882
3881
|
function computeContentCid(entryType, title, content, tags) {
|
|
3883
|
-
const digest = create(SHA2_256_CODE
|
|
3882
|
+
const digest = create(SHA2_256_CODE, computeCanonicalHash(entryType, title, content, tags));
|
|
3884
3883
|
return CID.createV1(85, digest).toString(base32);
|
|
3885
3884
|
}
|
|
3886
3885
|
var { p: P, n: N, Gx, Gy, a: _a, d: _d } = {
|
|
@@ -7135,159 +7134,6 @@ var registerSandboxCommand = (pi, state) => {
|
|
|
7135
7134
|
});
|
|
7136
7135
|
};
|
|
7137
7136
|
//#endregion
|
|
7138
|
-
//#region src/moltnet/judge/assets.ts
|
|
7139
|
-
/** Default fidelity rubric — kept verbatim from the Go judge. */
|
|
7140
|
-
var DEFAULT_RUBRIC = `Evaluate the rendered content against the source entries on three axes:
|
|
7141
|
-
|
|
7142
|
-
COVERAGE (0.0-1.0):
|
|
7143
|
-
- Identify each distinct topic/fact in the source entries
|
|
7144
|
-
- Check if each is represented in the rendered content
|
|
7145
|
-
- Score = (represented topics) / (total source topics)
|
|
7146
|
-
- A topic can be restructured or summarized but must be present
|
|
7147
|
-
|
|
7148
|
-
GROUNDING (0.0-1.0):
|
|
7149
|
-
- Identify each distinct claim/fact in the rendered content
|
|
7150
|
-
- Check if each is traceable to a specific source entry
|
|
7151
|
-
- Score = (grounded claims) / (total rendered claims)
|
|
7152
|
-
- Restructured content is fine if the underlying fact comes from a source
|
|
7153
|
-
|
|
7154
|
-
FAITHFULNESS (0.0-1.0):
|
|
7155
|
-
- For content that IS represented, check semantic accuracy
|
|
7156
|
-
- Is the meaning preserved? Any distortions, inversions, or misquotes?
|
|
7157
|
-
- Score = (accurate representations) / (total representations)
|
|
7158
|
-
- Summarization is fine; misrepresentation is not
|
|
7159
|
-
`;
|
|
7160
|
-
/** Judge system prompt — kept verbatim from the Go judge signature. */
|
|
7161
|
-
var JUDGE_SYSTEM_PROMPT = `You are a fidelity judge for rendered context packs. Your job is to evaluate
|
|
7162
|
-
whether a rendered markdown document faithfully represents its source entries.
|
|
7163
|
-
|
|
7164
|
-
Score each axis independently and precisely. Be critical — the purpose is to
|
|
7165
|
-
catch content drift, hallucination, and cherry-picking.
|
|
7166
|
-
|
|
7167
|
-
You will be given three inputs:
|
|
7168
|
-
|
|
7169
|
-
1. \`source_entries\` — the original source entries from the context pack, in
|
|
7170
|
-
markdown format.
|
|
7171
|
-
2. \`rendered_content\` — the agent-rendered markdown derived from the source
|
|
7172
|
-
entries.
|
|
7173
|
-
3. \`rubric\` — the fidelity scoring rubric with criteria definitions.
|
|
7174
|
-
|
|
7175
|
-
Return a JSON object matching the requested schema with these fields:
|
|
7176
|
-
|
|
7177
|
-
- \`coverage\` (number, 0.0–1.0): fraction of source entries represented in
|
|
7178
|
-
rendered content. 1.0 means all source entries are covered.
|
|
7179
|
-
- \`grounding\` (number, 0.0–1.0): fraction of rendered content traceable to
|
|
7180
|
-
source entries. 1.0 means everything comes from sources.
|
|
7181
|
-
- \`faithfulness\` (number, 0.0–1.0): semantic accuracy of represented content.
|
|
7182
|
-
1.0 means source content is accurately represented.
|
|
7183
|
-
- \`reasoning\` (string): detailed step-by-step analysis explaining each score.
|
|
7184
|
-
|
|
7185
|
-
Respond with ONLY a single JSON object. No prose before or after.
|
|
7186
|
-
`;
|
|
7187
|
-
//#endregion
|
|
7188
|
-
//#region src/moltnet/judge/fidelity.ts
|
|
7189
|
-
/**
|
|
7190
|
-
* Pi-native port of the Go fidelity judge
|
|
7191
|
-
* (libs/dspy-adapters/fidelity/fidelity.go).
|
|
7192
|
-
*
|
|
7193
|
-
* Same inputs (source_entries, rendered_content, rubric), same outputs
|
|
7194
|
-
* (coverage, grounding, faithfulness, reasoning). Uses pi-ai `complete()`
|
|
7195
|
-
* instead of dspy-go; no process-global state.
|
|
7196
|
-
*/
|
|
7197
|
-
var JSON_FENCE_RE = /```(?:json)?\s*([\s\S]*?)```/i;
|
|
7198
|
-
function extractJson(text) {
|
|
7199
|
-
const fenceMatch = text.match(JSON_FENCE_RE);
|
|
7200
|
-
if (fenceMatch && fenceMatch[1]) return fenceMatch[1].trim();
|
|
7201
|
-
const firstBrace = text.indexOf("{");
|
|
7202
|
-
const lastBrace = text.lastIndexOf("}");
|
|
7203
|
-
if (firstBrace >= 0 && lastBrace > firstBrace) return text.slice(firstBrace, lastBrace + 1);
|
|
7204
|
-
return text.trim();
|
|
7205
|
-
}
|
|
7206
|
-
function clamp01(value) {
|
|
7207
|
-
const n = typeof value === "number" ? value : Number(value);
|
|
7208
|
-
if (!Number.isFinite(n)) return 0;
|
|
7209
|
-
if (n < 0) return 0;
|
|
7210
|
-
if (n > 1) return 1;
|
|
7211
|
-
return n;
|
|
7212
|
-
}
|
|
7213
|
-
function coerceString(value) {
|
|
7214
|
-
if (typeof value === "string") return value;
|
|
7215
|
-
if (value === null || value === void 0) return "";
|
|
7216
|
-
if (typeof value === "number" || typeof value === "boolean") return String(value);
|
|
7217
|
-
try {
|
|
7218
|
-
return JSON.stringify(value);
|
|
7219
|
-
} catch {
|
|
7220
|
-
return "";
|
|
7221
|
-
}
|
|
7222
|
-
}
|
|
7223
|
-
function parseScores(raw) {
|
|
7224
|
-
const jsonText = extractJson(raw);
|
|
7225
|
-
let parsed;
|
|
7226
|
-
try {
|
|
7227
|
-
parsed = JSON.parse(jsonText);
|
|
7228
|
-
} catch (err) {
|
|
7229
|
-
throw new Error(`judge returned an invalid structured response: ${err.message}\n---raw---\n${raw}`);
|
|
7230
|
-
}
|
|
7231
|
-
const coverage = clamp01(parsed.coverage);
|
|
7232
|
-
const grounding = clamp01(parsed.grounding);
|
|
7233
|
-
const faithfulness = clamp01(parsed.faithfulness);
|
|
7234
|
-
const reasoning = coerceString(parsed.reasoning);
|
|
7235
|
-
return {
|
|
7236
|
-
coverage,
|
|
7237
|
-
grounding,
|
|
7238
|
-
faithfulness,
|
|
7239
|
-
composite: (coverage + grounding + faithfulness) / 3,
|
|
7240
|
-
reasoning
|
|
7241
|
-
};
|
|
7242
|
-
}
|
|
7243
|
-
function buildUserMessage(sourceEntries, renderedContent, rubric) {
|
|
7244
|
-
return [
|
|
7245
|
-
"## Rubric",
|
|
7246
|
-
rubric,
|
|
7247
|
-
"",
|
|
7248
|
-
"## Source entries",
|
|
7249
|
-
sourceEntries,
|
|
7250
|
-
"",
|
|
7251
|
-
"## Rendered content",
|
|
7252
|
-
renderedContent,
|
|
7253
|
-
"",
|
|
7254
|
-
"Produce the JSON object now."
|
|
7255
|
-
].join("\n");
|
|
7256
|
-
}
|
|
7257
|
-
/**
|
|
7258
|
-
* Run the fidelity judge via pi-ai `complete()`. Mirrors `fidelity.Run` in
|
|
7259
|
-
* libs/dspy-adapters/fidelity/fidelity.go.
|
|
7260
|
-
*/
|
|
7261
|
-
async function runFidelityJudge(req, options = {}) {
|
|
7262
|
-
const rubric = req.rubric?.trim() ? req.rubric : DEFAULT_RUBRIC;
|
|
7263
|
-
const userPrompt = buildUserMessage(req.sourceEntries, req.renderedContent, rubric);
|
|
7264
|
-
const message = await complete(req.model, {
|
|
7265
|
-
systemPrompt: JUDGE_SYSTEM_PROMPT,
|
|
7266
|
-
messages: [{
|
|
7267
|
-
role: "user",
|
|
7268
|
-
content: userPrompt,
|
|
7269
|
-
timestamp: Date.now()
|
|
7270
|
-
}]
|
|
7271
|
-
}, options.signal ? { signal: options.signal } : void 0);
|
|
7272
|
-
if (message.stopReason === "error" || message.stopReason === "aborted") throw new Error(`judge failed: ${message.errorMessage ?? message.stopReason}`);
|
|
7273
|
-
const textContent = message.content.filter((c) => c.type === "text" && typeof c.text === "string").map((c) => c.text).join("\n").trim();
|
|
7274
|
-
if (!textContent) throw new Error("judge returned empty response");
|
|
7275
|
-
return parseScores(textContent);
|
|
7276
|
-
}
|
|
7277
|
-
/**
|
|
7278
|
-
* Build a stable markdown blob of source entries for the judge prompt.
|
|
7279
|
-
* Mirrors `buildSourceEntriesFromPack` / `buildSourceEntriesMarkdown` in the
|
|
7280
|
-
* Go CLI so that local and proctored modes produce the same input shape.
|
|
7281
|
-
*/
|
|
7282
|
-
function buildSourceEntriesMarkdown(entries) {
|
|
7283
|
-
const parts = [];
|
|
7284
|
-
for (const entry of entries) {
|
|
7285
|
-
const title = entry.title?.trim() || "Untitled";
|
|
7286
|
-
parts.push(`## ${title}\n${entry.content}\n`);
|
|
7287
|
-
}
|
|
7288
|
-
return parts.join("\n");
|
|
7289
|
-
}
|
|
7290
|
-
//#endregion
|
|
7291
7137
|
//#region src/moltnet/render-phase6.ts
|
|
7292
7138
|
function slugToTitle(value) {
|
|
7293
7139
|
return value.split(/[:/_-]+/).filter(Boolean).map((part) => part[0]?.toUpperCase() + part.slice(1)).join(" ");
|
|
@@ -7434,6 +7280,21 @@ function ensureConnected(config) {
|
|
|
7434
7280
|
};
|
|
7435
7281
|
}
|
|
7436
7282
|
/**
|
|
7283
|
+
* Expand the `taskFilter` shorthand on the diary list/search tools into
|
|
7284
|
+
* the matching `task:*` provenance tags emitted by `moltnet_create_entry`
|
|
7285
|
+
* during a task. Returning an array (possibly empty) lets callers spread
|
|
7286
|
+
* it into a larger `tags` AND-filter without conditionals.
|
|
7287
|
+
*/
|
|
7288
|
+
function compileTaskFilterTags(filter) {
|
|
7289
|
+
if (!filter) return [];
|
|
7290
|
+
const tags = [];
|
|
7291
|
+
if (filter.taskId) tags.push(`task:id:${filter.taskId}`);
|
|
7292
|
+
if (filter.taskType) tags.push(`task:type:${filter.taskType}`);
|
|
7293
|
+
if (filter.correlationId) tags.push(`task:correlation:${filter.correlationId}`);
|
|
7294
|
+
if (typeof filter.attemptN === "number") tags.push(`task:attempt:${filter.attemptN}`);
|
|
7295
|
+
return tags;
|
|
7296
|
+
}
|
|
7297
|
+
/**
|
|
7437
7298
|
* Create all MoltNet tool definitions, ready to pass to `pi.registerTool()`.
|
|
7438
7299
|
*/
|
|
7439
7300
|
function createMoltNetTools(config) {
|
|
@@ -7596,122 +7457,6 @@ function createMoltNetTools(config) {
|
|
|
7596
7457
|
};
|
|
7597
7458
|
}
|
|
7598
7459
|
});
|
|
7599
|
-
const createJudgePackTask = defineTool({
|
|
7600
|
-
name: "moltnet_judge_pack_task_create",
|
|
7601
|
-
label: "Create Judge Pack Task",
|
|
7602
|
-
description: "Create a judge_pack task for a rendered pack. Returns a taskId that moltnet_rendered_pack_judge can claim and execute. The rubric is required — pass the structured rubric JSON from @moltnet/tasks Rubric schema.",
|
|
7603
|
-
parameters: Type.Object({
|
|
7604
|
-
renderedPackId: Type.String({ description: "Rendered pack ID to judge" }),
|
|
7605
|
-
sourcePackId: Type.String({ description: "Source pack ID. Fetch it from the rendered pack if unknown." }),
|
|
7606
|
-
rubric: Type.Any({ description: "Structured rubric object (Rubric schema from @moltnet/tasks). Must have rubricId, version, criteria[]." }),
|
|
7607
|
-
diaryId: Type.Optional(Type.String({ description: "Diary ID to impose the task on. Defaults to the connected diary." }))
|
|
7608
|
-
}),
|
|
7609
|
-
async execute(_id, params) {
|
|
7610
|
-
const { agent, diaryId: connectedDiaryId, teamId: connectedTeamId } = ensureConnected(config);
|
|
7611
|
-
const task = await agent.tasks.create({
|
|
7612
|
-
taskType: "judge_pack",
|
|
7613
|
-
input: {
|
|
7614
|
-
renderedPackId: params.renderedPackId,
|
|
7615
|
-
sourcePackId: params.sourcePackId,
|
|
7616
|
-
rubric: params.rubric
|
|
7617
|
-
},
|
|
7618
|
-
diaryId: params.diaryId ?? connectedDiaryId,
|
|
7619
|
-
teamId: connectedTeamId
|
|
7620
|
-
});
|
|
7621
|
-
return {
|
|
7622
|
-
content: [{
|
|
7623
|
-
type: "text",
|
|
7624
|
-
text: JSON.stringify({
|
|
7625
|
-
taskId: task.id,
|
|
7626
|
-
task
|
|
7627
|
-
}, null, 2)
|
|
7628
|
-
}],
|
|
7629
|
-
details: {}
|
|
7630
|
-
};
|
|
7631
|
-
}
|
|
7632
|
-
});
|
|
7633
|
-
const judgeRenderedPack = defineTool({
|
|
7634
|
-
name: "moltnet_rendered_pack_judge",
|
|
7635
|
-
label: "Judge MoltNet Rendered Pack",
|
|
7636
|
-
description: "Claim a judge_pack task, run the fidelity judge locally, complete the task with structured scores, and set verifiedTaskId on the rendered pack. Create the task first with moltnet_judge_pack_task_create.",
|
|
7637
|
-
parameters: Type.Object({
|
|
7638
|
-
taskId: Type.String({ description: "judge_pack task ID from moltnet_judge_pack_task_create" }),
|
|
7639
|
-
rubricOverride: Type.Optional(Type.String({ description: "Freeform rubric string override for the LLM judge prompt. When omitted the task rubric preamble (or built-in default) is used." }))
|
|
7640
|
-
}),
|
|
7641
|
-
async execute(_id, params, _signal, _onUpdate, ctx) {
|
|
7642
|
-
const { agent } = ensureConnected(config);
|
|
7643
|
-
const model = ctx?.model;
|
|
7644
|
-
if (!model) throw new Error("No active model in pi session — cannot run the fidelity judge.");
|
|
7645
|
-
const claimed = await agent.tasks.claim(params.taskId);
|
|
7646
|
-
const input = claimed.task.input;
|
|
7647
|
-
const rendered = await agent.packs.getRendered(input.renderedPackId);
|
|
7648
|
-
if (!rendered.content?.trim()) throw new Error(`rendered pack ${input.renderedPackId} has empty content`);
|
|
7649
|
-
const sourcePack = await agent.packs.get(input.sourcePackId, { expand: "entries" });
|
|
7650
|
-
if (!sourcePack.entries || sourcePack.entries.length === 0) throw new Error(`source pack ${input.sourcePackId} has no entries`);
|
|
7651
|
-
const sourceEntriesMd = buildSourceEntriesMarkdown(sourcePack.entries.map((entry) => ({
|
|
7652
|
-
title: entry.entry.title,
|
|
7653
|
-
content: entry.entry.content
|
|
7654
|
-
})));
|
|
7655
|
-
const rubric = params.rubricOverride?.trim() || input.rubric?.preamble?.trim() || DEFAULT_RUBRIC;
|
|
7656
|
-
let scores;
|
|
7657
|
-
try {
|
|
7658
|
-
scores = await runFidelityJudge({
|
|
7659
|
-
model,
|
|
7660
|
-
sourceEntries: sourceEntriesMd,
|
|
7661
|
-
renderedContent: rendered.content,
|
|
7662
|
-
rubric
|
|
7663
|
-
});
|
|
7664
|
-
} catch (err) {
|
|
7665
|
-
await agent.tasks.fail(params.taskId, claimed.attempt.attemptN, { error: {
|
|
7666
|
-
code: "judge_failed",
|
|
7667
|
-
message: err.message ?? String(err)
|
|
7668
|
-
} }).catch(() => {});
|
|
7669
|
-
throw new Error(`judge failed: ${err.message ?? String(err)}`);
|
|
7670
|
-
}
|
|
7671
|
-
const modelId = model.provider && model.id ? `${model.provider}:${model.id}` : model.id ?? "pi:unknown";
|
|
7672
|
-
const output = {
|
|
7673
|
-
scores: [
|
|
7674
|
-
{
|
|
7675
|
-
criterionId: "coverage",
|
|
7676
|
-
score: scores.coverage
|
|
7677
|
-
},
|
|
7678
|
-
{
|
|
7679
|
-
criterionId: "grounding",
|
|
7680
|
-
score: scores.grounding
|
|
7681
|
-
},
|
|
7682
|
-
{
|
|
7683
|
-
criterionId: "faithfulness",
|
|
7684
|
-
score: scores.faithfulness
|
|
7685
|
-
}
|
|
7686
|
-
],
|
|
7687
|
-
composite: scores.composite,
|
|
7688
|
-
verdict: scores.reasoning,
|
|
7689
|
-
judgeModel: modelId
|
|
7690
|
-
};
|
|
7691
|
-
const outputCid = await computeJsonCid(output);
|
|
7692
|
-
const completed = await agent.tasks.complete(params.taskId, claimed.attempt.attemptN, {
|
|
7693
|
-
output,
|
|
7694
|
-
outputCid,
|
|
7695
|
-
usage: {
|
|
7696
|
-
inputTokens: 0,
|
|
7697
|
-
outputTokens: 0
|
|
7698
|
-
}
|
|
7699
|
-
});
|
|
7700
|
-
await agent.packs.updateRendered(input.renderedPackId, { verifiedTaskId: params.taskId });
|
|
7701
|
-
return {
|
|
7702
|
-
content: [{
|
|
7703
|
-
type: "text",
|
|
7704
|
-
text: JSON.stringify({
|
|
7705
|
-
renderedPackId: input.renderedPackId,
|
|
7706
|
-
taskId: params.taskId,
|
|
7707
|
-
scores,
|
|
7708
|
-
task: completed
|
|
7709
|
-
}, null, 2)
|
|
7710
|
-
}],
|
|
7711
|
-
details: {}
|
|
7712
|
-
};
|
|
7713
|
-
}
|
|
7714
|
-
});
|
|
7715
7460
|
const diaryTags = defineTool({
|
|
7716
7461
|
name: "moltnet_diary_tags",
|
|
7717
7462
|
label: "List MoltNet Diary Tags",
|
|
@@ -7747,12 +7492,32 @@ function createMoltNetTools(config) {
|
|
|
7747
7492
|
const listEntries = defineTool({
|
|
7748
7493
|
name: "moltnet_list_entries",
|
|
7749
7494
|
label: "List MoltNet Diary Entries",
|
|
7750
|
-
description: "List entries from the MoltNet diary. When `entryIds` is provided, batch-fetches those specific entries (max 50) and returns full fields including entryType, contentSignature, and contentHash for signature checks. Otherwise returns recent entries with a content preview.",
|
|
7495
|
+
description: "List entries from the MoltNet diary. When `entryIds` is provided, batch-fetches those specific entries (max 50) and returns full fields including entryType, contentSignature, and contentHash for signature checks. Otherwise returns recent entries with a content preview, filtered by any combination of tags (AND), excludeTags (NONE), entryType, and the taskFilter shorthand which expands into the right `task:*` tags.",
|
|
7751
7496
|
parameters: Type.Object({
|
|
7752
7497
|
limit: Type.Optional(Type.Number({ description: "Max entries to return (default 10)" })),
|
|
7753
|
-
|
|
7498
|
+
tags: Type.Optional(Type.Array(Type.String({
|
|
7499
|
+
minLength: 1,
|
|
7500
|
+
maxLength: 50
|
|
7501
|
+
}), {
|
|
7502
|
+
description: "Tags filter — entry must have ALL listed tags (AND). Max 20.",
|
|
7503
|
+
maxItems: 20
|
|
7504
|
+
})),
|
|
7505
|
+
excludeTags: Type.Optional(Type.Array(Type.String({
|
|
7506
|
+
minLength: 1,
|
|
7507
|
+
maxLength: 50
|
|
7508
|
+
}), {
|
|
7509
|
+
description: "Tags to exclude — entry must have NONE of these. Max 20.",
|
|
7510
|
+
maxItems: 20
|
|
7511
|
+
})),
|
|
7512
|
+
entryType: Type.Optional(Type.String({ description: "Filter by entry type (procedural, semantic, episodic, reflection, identity, soul)." })),
|
|
7513
|
+
taskFilter: Type.Optional(Type.Object({
|
|
7514
|
+
taskId: Type.Optional(Type.String()),
|
|
7515
|
+
taskType: Type.Optional(Type.String()),
|
|
7516
|
+
correlationId: Type.Optional(Type.String()),
|
|
7517
|
+
attemptN: Type.Optional(Type.Number())
|
|
7518
|
+
}, { description: "Shorthand: any combination compiles to the matching task:* tags (task:id:<id>, task:type:<type>, task:correlation:<id>, task:attempt:<n>) and is merged into the tags filter." })),
|
|
7754
7519
|
entryIds: Type.Optional(Type.Array(Type.String(), {
|
|
7755
|
-
description: "Batch-fetch specific entries by UUID (max 50). Overrides
|
|
7520
|
+
description: "Batch-fetch specific entries by UUID (max 50). Overrides every other filter.",
|
|
7756
7521
|
maxItems: 50
|
|
7757
7522
|
}))
|
|
7758
7523
|
}),
|
|
@@ -7766,7 +7531,11 @@ function createMoltNetTools(config) {
|
|
|
7766
7531
|
if (batchMode) query.ids = params.entryIds;
|
|
7767
7532
|
else {
|
|
7768
7533
|
query.limit = params.limit ?? 10;
|
|
7769
|
-
|
|
7534
|
+
const expandedTags = compileTaskFilterTags(params.taskFilter);
|
|
7535
|
+
const allTags = [...params.tags ?? [], ...expandedTags];
|
|
7536
|
+
if (allTags.length) query.tags = allTags;
|
|
7537
|
+
if (params.excludeTags?.length) query.excludeTags = params.excludeTags;
|
|
7538
|
+
if (params.entryType) query.entryType = params.entryType;
|
|
7770
7539
|
}
|
|
7771
7540
|
const entries = await agent.entries.list(diaryId, query);
|
|
7772
7541
|
return {
|
|
@@ -7822,17 +7591,46 @@ function createMoltNetTools(config) {
|
|
|
7822
7591
|
const searchEntries = defineTool({
|
|
7823
7592
|
name: "moltnet_search_entries",
|
|
7824
7593
|
label: "Search MoltNet Diary Entries",
|
|
7825
|
-
description: "
|
|
7594
|
+
description: "Hybrid (semantic + lexical) search over diary entries. Optional tags / excludeTags / entryTypes filters AND with the query; the taskFilter shorthand expands into task:* provenance tags so `taskFilter: { taskType: \"fulfill_brief\" }` returns only entries from fulfill_brief attempts. Filters apply server-side before ranking.",
|
|
7826
7595
|
parameters: Type.Object({
|
|
7827
7596
|
query: Type.String({ description: "Natural language search query" }),
|
|
7828
|
-
limit: Type.Optional(Type.Number({ description: "Max results (default 5)" }))
|
|
7597
|
+
limit: Type.Optional(Type.Number({ description: "Max results (default 5)" })),
|
|
7598
|
+
tags: Type.Optional(Type.Array(Type.String({
|
|
7599
|
+
minLength: 1,
|
|
7600
|
+
maxLength: 50
|
|
7601
|
+
}), {
|
|
7602
|
+
description: "Entry must have ALL listed tags (AND). Max 20.",
|
|
7603
|
+
maxItems: 20
|
|
7604
|
+
})),
|
|
7605
|
+
excludeTags: Type.Optional(Type.Array(Type.String({
|
|
7606
|
+
minLength: 1,
|
|
7607
|
+
maxLength: 50
|
|
7608
|
+
}), {
|
|
7609
|
+
description: "Entry must have NONE of these tags. Max 20.",
|
|
7610
|
+
maxItems: 20
|
|
7611
|
+
})),
|
|
7612
|
+
entryTypes: Type.Optional(Type.Array(Type.String(), {
|
|
7613
|
+
description: "Restrict to these entry types (procedural, semantic, episodic, reflection, identity, soul). Max 6.",
|
|
7614
|
+
maxItems: 6
|
|
7615
|
+
})),
|
|
7616
|
+
taskFilter: Type.Optional(Type.Object({
|
|
7617
|
+
taskId: Type.Optional(Type.String()),
|
|
7618
|
+
taskType: Type.Optional(Type.String()),
|
|
7619
|
+
correlationId: Type.Optional(Type.String()),
|
|
7620
|
+
attemptN: Type.Optional(Type.Number())
|
|
7621
|
+
}, { description: "Shorthand: any combination compiles to the matching task:* tags and is merged into the tags filter." }))
|
|
7829
7622
|
}),
|
|
7830
7623
|
async execute(_id, params) {
|
|
7831
7624
|
const { agent, diaryId } = ensureConnected(config);
|
|
7625
|
+
const expandedTags = compileTaskFilterTags(params.taskFilter);
|
|
7626
|
+
const allTags = [...params.tags ?? [], ...expandedTags];
|
|
7832
7627
|
const results = await agent.entries.search({
|
|
7833
7628
|
diaryId,
|
|
7834
7629
|
query: params.query,
|
|
7835
|
-
limit: params.limit ?? 5
|
|
7630
|
+
limit: params.limit ?? 5,
|
|
7631
|
+
...allTags.length ? { tags: allTags } : {},
|
|
7632
|
+
...params.excludeTags?.length ? { excludeTags: params.excludeTags } : {},
|
|
7633
|
+
...params.entryTypes?.length ? { entryTypes: params.entryTypes } : {}
|
|
7836
7634
|
});
|
|
7837
7635
|
return {
|
|
7838
7636
|
content: [{
|
|
@@ -7852,7 +7650,7 @@ function createMoltNetTools(config) {
|
|
|
7852
7650
|
const createEntry = defineTool({
|
|
7853
7651
|
name: "moltnet_create_entry",
|
|
7854
7652
|
label: "Create MoltNet Diary Entry",
|
|
7855
|
-
description: "Create a new diary entry to record decisions, findings, incidents, or reflections. During an active task, the entry is forced into the task diary and tagged with task:<id>,
|
|
7653
|
+
description: "Create a new diary entry to record decisions, findings, incidents, or reflections. During an active task, the entry is forced into the task diary and tagged with the task:* provenance namespace (task:id:<id>, task:type:<type>, task:attempt:<n>, plus task:correlation:<id> when set); an explicit diaryId mismatching the task diary is rejected.",
|
|
7856
7654
|
parameters: Type.Object({
|
|
7857
7655
|
title: Type.String({ description: "Entry title (concise, descriptive)" }),
|
|
7858
7656
|
content: Type.String({ description: "Entry content (markdown)" }),
|
|
@@ -7869,10 +7667,10 @@ function createMoltNetTools(config) {
|
|
|
7869
7667
|
if (params.diaryId && params.diaryId !== taskCtx.diaryId) throw new Error(`entries_create: diaryId "${params.diaryId}" does not match the active task diary "${taskCtx.diaryId}". Entries created during a task must land in the task diary.`);
|
|
7870
7668
|
targetDiaryId = taskCtx.diaryId;
|
|
7871
7669
|
autoTags = [
|
|
7872
|
-
`task:${taskCtx.taskId}`,
|
|
7873
|
-
`
|
|
7874
|
-
`
|
|
7875
|
-
...taskCtx.correlationId ? [`correlation:${taskCtx.correlationId}`] : []
|
|
7670
|
+
`task:id:${taskCtx.taskId}`,
|
|
7671
|
+
`task:type:${taskCtx.taskType}`,
|
|
7672
|
+
`task:attempt:${taskCtx.attemptN}`,
|
|
7673
|
+
...taskCtx.correlationId ? [`task:correlation:${taskCtx.correlationId}`] : []
|
|
7876
7674
|
];
|
|
7877
7675
|
} else targetDiaryId = params.diaryId ?? envDiaryId;
|
|
7878
7676
|
const userTags = params.tags ?? [];
|
|
@@ -7973,8 +7771,6 @@ function createMoltNetTools(config) {
|
|
|
7973
7771
|
renderPack,
|
|
7974
7772
|
listRenderedPacks,
|
|
7975
7773
|
getRenderedPack,
|
|
7976
|
-
createJudgePackTask,
|
|
7977
|
-
judgeRenderedPack,
|
|
7978
7774
|
diaryTags,
|
|
7979
7775
|
listEntries,
|
|
7980
7776
|
getEntry,
|
|
@@ -8591,135 +8387,6 @@ function ensureRelativeWorktreePaths(gitconfig) {
|
|
|
8591
8387
|
return `${gitconfig}${gitconfig.endsWith("\n") ? "" : "\n"}[worktree]\n\tuseRelativePaths = true\n`;
|
|
8592
8388
|
}
|
|
8593
8389
|
//#endregion
|
|
8594
|
-
//#region src/moltnet/judge-recipe-cid.ts
|
|
8595
|
-
var require$1 = createRequire(import.meta.url);
|
|
8596
|
-
var SELF_PACKAGE_NAME = "@themoltnet/pi-extension";
|
|
8597
|
-
var PI_PACKAGE_NAME = "@mariozechner/pi-coding-agent";
|
|
8598
|
-
var SDK_PACKAGE_NAME = "@themoltnet/sdk";
|
|
8599
|
-
var CID_VERSION = 1;
|
|
8600
|
-
var RAW_CODEC = 85;
|
|
8601
|
-
var SHA2_256_CODE = 18;
|
|
8602
|
-
var BASE32_ALPHABET = "abcdefghijklmnopqrstuvwxyz234567";
|
|
8603
|
-
function findSelfPackageDir() {
|
|
8604
|
-
const start = path.dirname(fileURLToPath(import.meta.url));
|
|
8605
|
-
let dir = start;
|
|
8606
|
-
while (true) {
|
|
8607
|
-
const candidate = path.join(dir, "package.json");
|
|
8608
|
-
if (existsSync(candidate)) {
|
|
8609
|
-
if (JSON.parse(readFileSync(candidate, "utf8")).name === SELF_PACKAGE_NAME) return dir;
|
|
8610
|
-
}
|
|
8611
|
-
const parent = path.dirname(dir);
|
|
8612
|
-
if (parent === dir) return start;
|
|
8613
|
-
dir = parent;
|
|
8614
|
-
}
|
|
8615
|
-
}
|
|
8616
|
-
var PACKAGE_DIR = findSelfPackageDir();
|
|
8617
|
-
function sha256Hex(value) {
|
|
8618
|
-
return createHash("sha256").update(value, "utf8").digest("hex");
|
|
8619
|
-
}
|
|
8620
|
-
function encodeVarint(value) {
|
|
8621
|
-
const bytes = [];
|
|
8622
|
-
let current = value >>> 0;
|
|
8623
|
-
while (current >= 128) {
|
|
8624
|
-
bytes.push(current & 127 | 128);
|
|
8625
|
-
current >>>= 7;
|
|
8626
|
-
}
|
|
8627
|
-
bytes.push(current);
|
|
8628
|
-
return bytes;
|
|
8629
|
-
}
|
|
8630
|
-
function base32Lower(bytes) {
|
|
8631
|
-
let bits = 0;
|
|
8632
|
-
let value = 0;
|
|
8633
|
-
let output = "";
|
|
8634
|
-
for (const byte of bytes) {
|
|
8635
|
-
value = value << 8 | byte;
|
|
8636
|
-
bits += 8;
|
|
8637
|
-
while (bits >= 5) {
|
|
8638
|
-
output += BASE32_ALPHABET[value >>> bits - 5 & 31];
|
|
8639
|
-
bits -= 5;
|
|
8640
|
-
}
|
|
8641
|
-
}
|
|
8642
|
-
if (bits > 0) output += BASE32_ALPHABET[value << 5 - bits & 31];
|
|
8643
|
-
return `b${output}`;
|
|
8644
|
-
}
|
|
8645
|
-
function stableStringify(value) {
|
|
8646
|
-
if (value === null || typeof value !== "object") return JSON.stringify(value);
|
|
8647
|
-
if (Array.isArray(value)) return `[${value.map((item) => stableStringify(item)).join(",")}]`;
|
|
8648
|
-
return `{${Object.entries(value).sort(([left], [right]) => left.localeCompare(right)).map(([key, item]) => `${JSON.stringify(key)}:${stableStringify(item)}`).join(",")}}`;
|
|
8649
|
-
}
|
|
8650
|
-
function readPackageVersion(pkgPath, expectedName) {
|
|
8651
|
-
if (!existsSync(pkgPath)) return null;
|
|
8652
|
-
const parsed = JSON.parse(readFileSync(pkgPath, "utf8"));
|
|
8653
|
-
if (expectedName && parsed.name !== expectedName) return null;
|
|
8654
|
-
return typeof parsed.version === "string" ? parsed.version : null;
|
|
8655
|
-
}
|
|
8656
|
-
function resolveInstalledPackageVersion(packageName) {
|
|
8657
|
-
const candidates = [];
|
|
8658
|
-
try {
|
|
8659
|
-
candidates.push(path.dirname(require$1.resolve(packageName)));
|
|
8660
|
-
} catch {}
|
|
8661
|
-
let dir = PACKAGE_DIR;
|
|
8662
|
-
while (true) {
|
|
8663
|
-
candidates.push(path.join(dir, "node_modules", packageName));
|
|
8664
|
-
const parent = path.dirname(dir);
|
|
8665
|
-
if (parent === dir) break;
|
|
8666
|
-
dir = parent;
|
|
8667
|
-
}
|
|
8668
|
-
for (const start of candidates) {
|
|
8669
|
-
let current = start;
|
|
8670
|
-
while (true) {
|
|
8671
|
-
const version = readPackageVersion(path.join(current, "package.json"), packageName);
|
|
8672
|
-
if (version) return version;
|
|
8673
|
-
const parent = path.dirname(current);
|
|
8674
|
-
if (parent === current) break;
|
|
8675
|
-
current = parent;
|
|
8676
|
-
}
|
|
8677
|
-
}
|
|
8678
|
-
return null;
|
|
8679
|
-
}
|
|
8680
|
-
function resolvePiJudgeRecipeVersions() {
|
|
8681
|
-
return {
|
|
8682
|
-
pi: resolveInstalledPackageVersion(PI_PACKAGE_NAME),
|
|
8683
|
-
piExtension: readPackageVersion(path.join(PACKAGE_DIR, "package.json"), SELF_PACKAGE_NAME),
|
|
8684
|
-
sdk: resolveInstalledPackageVersion(SDK_PACKAGE_NAME)
|
|
8685
|
-
};
|
|
8686
|
-
}
|
|
8687
|
-
function buildPiJudgeRecipeManifest(inputs) {
|
|
8688
|
-
return {
|
|
8689
|
-
kind: "pi-judge-recipe/v1",
|
|
8690
|
-
versions: {
|
|
8691
|
-
...resolvePiJudgeRecipeVersions(),
|
|
8692
|
-
...inputs.overrides
|
|
8693
|
-
},
|
|
8694
|
-
assets: {
|
|
8695
|
-
promptAsset: inputs.promptAsset ?? null,
|
|
8696
|
-
rubricAsset: inputs.rubricAsset ?? null,
|
|
8697
|
-
skillSourcePath: inputs.skillSourcePath ?? null
|
|
8698
|
-
},
|
|
8699
|
-
hashes: {
|
|
8700
|
-
judgePromptSha256: sha256Hex(inputs.judgePrompt),
|
|
8701
|
-
rubricSha256: sha256Hex(inputs.rubric),
|
|
8702
|
-
skillFragmentSha256: inputs.skillFragment ? sha256Hex(inputs.skillFragment) : null,
|
|
8703
|
-
implementationSha256: inputs.implementationSource ? sha256Hex(inputs.implementationSource) : null
|
|
8704
|
-
}
|
|
8705
|
-
};
|
|
8706
|
-
}
|
|
8707
|
-
function computePiJudgeRecipeCid(inputs) {
|
|
8708
|
-
const manifest = buildPiJudgeRecipeManifest(inputs);
|
|
8709
|
-
const manifestBytes = Buffer.from(stableStringify(manifest), "utf8");
|
|
8710
|
-
const digestBytes = createHash("sha256").update(manifestBytes).digest();
|
|
8711
|
-
return {
|
|
8712
|
-
cid: base32Lower(Uint8Array.from([
|
|
8713
|
-
...encodeVarint(CID_VERSION),
|
|
8714
|
-
...encodeVarint(RAW_CODEC),
|
|
8715
|
-
...encodeVarint(SHA2_256_CODE),
|
|
8716
|
-
...encodeVarint(digestBytes.length),
|
|
8717
|
-
...digestBytes
|
|
8718
|
-
])),
|
|
8719
|
-
manifest
|
|
8720
|
-
};
|
|
8721
|
-
}
|
|
8722
|
-
//#endregion
|
|
8723
8390
|
//#region src/otel/index.ts
|
|
8724
8391
|
var TRACER_NAME = "@themoltnet/pi-extension/otel";
|
|
8725
8392
|
function stripReservedAttrs(attrs) {
|
|
@@ -8891,7 +8558,13 @@ if (!FormatRegistry.Has("date-time")) FormatRegistry.Set("date-time", (v) => !Nu
|
|
|
8891
8558
|
/**
|
|
8892
8559
|
* How a judge must score a single criterion.
|
|
8893
8560
|
*
|
|
8894
|
-
* - `
|
|
8561
|
+
* - `llm_score`: 0..1 continuous, `rationale` required. Smooths failures
|
|
8562
|
+
* into the gradient — use `llm_checklist` instead for properties where
|
|
8563
|
+
* a single failure is a real failure (grounding, faithfulness).
|
|
8564
|
+
* - `llm_checklist`: judge enumerates per-claim assertions with
|
|
8565
|
+
* `{passed, evidence}`. The criterion's numeric `score` is derived:
|
|
8566
|
+
* `1` iff every assertion passes, else `0`. Per-claim evidence is the
|
|
8567
|
+
* dataset for cluster-analysis of failure modes. See #999.
|
|
8895
8568
|
* - `boolean`: 0 or 1, `rationale` optional.
|
|
8896
8569
|
* - `deterministic_signature_check`: judge runs a signature check;
|
|
8897
8570
|
* result is 0 or 1. No LLM discretion.
|
|
@@ -8899,11 +8572,31 @@ if (!FormatRegistry.Has("date-time")) FormatRegistry.Set("date-time", (v) => !Nu
|
|
|
8899
8572
|
* appears in the rendered output; 0 or 1.
|
|
8900
8573
|
*/
|
|
8901
8574
|
var RubricScoringMode = Type$1.Union([
	// Continuous 0..1 score; `rationale` required.
	Type$1.Literal("llm_score"),
	// Per-claim assertions; score is 1 iff every assertion passes, else 0.
	Type$1.Literal("llm_checklist"),
	// 0 or 1; `rationale` optional.
	Type$1.Literal("boolean"),
	// Result of a signature check: 0 or 1, no LLM discretion.
	Type$1.Literal("deterministic_signature_check"),
	Type$1.Literal("deterministic_coverage_check")
], { $id: "RubricScoringMode" });
|
|
8581
|
+
/**
 * One binary check produced by an `llm_checklist`-mode criterion.
 *
 * `evidence` is REQUIRED for both PASS and FAIL — agentskills.io grading
 * principle: "Don't give the benefit of the doubt." A PASS without
 * concrete evidence (a quoted span, an entry id, a source location)
 * cannot be audited. A FAIL without evidence cannot be clustered into
 * structural fixes. The same shape is reused by `judge-eval-variant`
 * (#943) so tooling, dashboards, and analysis stay uniform.
 *
 * All four fields are required; the three strings must be non-empty and
 * unknown properties are rejected.
 */
var AssertionResult = Type$1.Object({
	id: Type$1.String({ minLength: 1 }),
	text: Type$1.String({ minLength: 1 }),
	passed: Type$1.Boolean(),
	evidence: Type$1.String({ minLength: 1 })
}, {
	$id: "AssertionResult",
	additionalProperties: false
});
|
|
8907
8600
|
var RubricCriterion = Type$1.Object({
|
|
8908
8601
|
id: Type$1.String({ minLength: 1 }),
|
|
8909
8602
|
description: Type$1.String({ minLength: 1 }),
|
|
@@ -8963,44 +8656,165 @@ unrelated subsystems and the test coverage on the auth path is
|
|
|
8963
8656
|
unchanged" is.
|
|
8964
8657
|
`.trim();
|
|
8965
8658
|
//#endregion
|
|
8659
|
+
//#region ../tasks/src/success-criteria.ts
|
|
8660
|
+
/**
|
|
8661
|
+
* SuccessCriteria — imposer-stated acceptance criteria, evaluated in two
|
|
8662
|
+
* complementary places.
|
|
8663
|
+
*
|
|
8664
|
+
* Before this envelope existed, criteria were scattered: a vestigial
|
|
8665
|
+
* `criteriaCid` column nobody resolved, an `acceptanceCriteria: string[]`
|
|
8666
|
+
* field on `fulfill_brief.input` that was "interpreted by the claiming
|
|
8667
|
+
* agent," and inline `rubric` / `criteria[]` fields on judgment-task
|
|
8668
|
+
* inputs. None of those were machine-verifiable end-to-end.
|
|
8669
|
+
*
|
|
8670
|
+
* This module defines a single, content-addressable envelope an imposer
|
|
8671
|
+
* attaches to any task type. It has four orthogonal sections — pick
|
|
8672
|
+
* whichever apply per task type:
|
|
8673
|
+
*
|
|
8674
|
+
* - `gates` Deterministic structural checks (CID/schema match)
|
|
8675
|
+
* - `assertions` Declarative claims about output JSON
|
|
8676
|
+
* - `rubric` Weighted-criteria scoring instrument, reused
|
|
8677
|
+
* verbatim from `./rubric.ts`.
|
|
8678
|
+
* - `sideEffects` Required process side-effects (e.g. diary entry)
|
|
8679
|
+
*
|
|
8680
|
+
* ## Two roles, two task types
|
|
8681
|
+
*
|
|
8682
|
+
* **Producer self-assessment** (fulfillment tasks: `fulfill_brief`,
|
|
8683
|
+
* `curate_pack`, `render_pack`). The producer **LLM** evaluates the
|
|
8684
|
+
* criteria against its own output and emits a `VerificationRecord`
|
|
8685
|
+
* inside `output.verification`. The daemon is pure passthrough — it
|
|
8686
|
+
* does not run `evaluateAssertions`, does not inspect the verification
|
|
8687
|
+
* record. The REST API is dumb storage; it never re-runs assertions and
|
|
8688
|
+
* never runs LLMs. The cross-field rule
|
|
8689
|
+
* `requireVerificationWhenCriteriaPresent` enforces "verification
|
|
8690
|
+
* required iff successCriteria present" at task-output validation time
|
|
8691
|
+
* (server-side schema check). Self-assessment is a truthful self-rating,
|
|
8692
|
+
* NOT enforcement — `verification.passed=false` does not block /complete
|
|
8693
|
+
* and does not affect `acceptedAttemptN`. See
|
|
8694
|
+
* `docs/agent-runtime.md` for the full producer/judge flow.
|
|
8695
|
+
*
|
|
8696
|
+
* **Binding evaluation** (judgment tasks: `assess_brief`, `judge_pack`).
|
|
8697
|
+
* A separate task whose job IS the application of `successCriteria` to
|
|
8698
|
+
* someone else's output. Different agent (enforced at claim time), same
|
|
8699
|
+
* envelope. The judge's verdict is binding: this is the *gate* in the
|
|
8700
|
+
* MoltNet model. The rubric inside `successCriteria.rubric` IS the job
|
|
8701
|
+
* spec for the judge.
|
|
8702
|
+
*
|
|
8703
|
+
* The clean chain: producer task with `successCriteria` → producer
|
|
8704
|
+
* self-assesses honestly → imposer (or automation) creates a downstream
|
|
8705
|
+
* judgment task that references the same `successCriteria` (or a
|
|
8706
|
+
* stricter rubric) → judgment task delivers the binding verdict.
|
|
8707
|
+
*
|
|
8708
|
+
* Storage: SuccessCriteria lives inline at `task.input.successCriteria`,
|
|
8709
|
+
* pinned via the task's `inputCid`. No separate column or hash. When
|
|
8710
|
+
* #881 lands, the `rubric` field can graduate to `{ rubricCid }` lookup
|
|
8711
|
+
* without changing this envelope, and producer + judge tasks can pin
|
|
8712
|
+
* the SAME rubric across the chain for end-to-end auditability.
|
|
8713
|
+
*/
|
|
8714
|
+
var SchemaCheckSpec = Type$1.Object({ schemaCid: Type$1.String({ minLength: 1 }) }, { additionalProperties: false });
|
|
8715
|
+
/**
 * Spec payload of a `cid-equals` gate: a non-empty `path` and the non-empty
 * `expected` string it is compared against. Unknown properties are rejected.
 * NOTE(review): the path syntax and what it is resolved against are not
 * visible in this chunk — confirm with the evaluator.
 */
var CidEqualsSpec = Type$1.Object({
	path: Type$1.String({ minLength: 1 }),
	expected: Type$1.String({ minLength: 1 })
}, { additionalProperties: false });
|
|
8719
|
+
/**
 * A deterministic structural check. A union of two object shapes told apart
 * by the `kind` literal: `schema-check` (spec: `SchemaCheckSpec`) and
 * `cid-equals` (spec: `CidEqualsSpec`). Both carry a non-empty `id` and a
 * boolean `required` flag, and both reject unknown properties.
 */
var Gate = Type$1.Union([
	Type$1.Object({
		id: Type$1.String({ minLength: 1 }),
		kind: Type$1.Literal("schema-check"),
		spec: SchemaCheckSpec,
		required: Type$1.Boolean()
	}, { additionalProperties: false }),
	Type$1.Object({
		id: Type$1.String({ minLength: 1 }),
		kind: Type$1.Literal("cid-equals"),
		spec: CidEqualsSpec,
		required: Type$1.Boolean()
	}, { additionalProperties: false })
], { $id: "Gate" });
|
|
8730
|
+
/** The operators an `Assertion` may use: one of five string literals. */
var AssertionOp = Type$1.Union([
	Type$1.Literal("exists"),
	Type$1.Literal("equals"),
	Type$1.Literal("matches"),
	Type$1.Literal("in-range"),
	Type$1.Literal("min-length")
], { $id: "AssertionOp" });
|
|
8737
|
+
/**
 * A declarative claim about output JSON: a non-empty `id`, a non-empty
 * `path`, an `op` from `AssertionOp`, and an optional `value` of any type.
 * Unknown properties are rejected.
 * NOTE(review): which operators require `value`, and its expected shape per
 * operator, are not enforced by this schema — confirm with the evaluator.
 */
var Assertion = Type$1.Object({
	id: Type$1.String({ minLength: 1 }),
	path: Type$1.String({ minLength: 1 }),
	op: AssertionOp,
	value: Type$1.Optional(Type$1.Unknown())
}, {
	$id: "Assertion",
	additionalProperties: false
});
|
|
8746
|
+
/**
 * Required process side-effects (e.g. a diary entry). Every field is
 * optional: a boolean `diaryEntryRequired`, an array of non-empty
 * `diaryEntryTags`, and a non-negative integer `referencedEntries`.
 * Unknown properties are rejected.
 */
var SideEffectsSpec = Type$1.Object({
	diaryEntryRequired: Type$1.Optional(Type$1.Boolean()),
	diaryEntryTags: Type$1.Optional(Type$1.Array(Type$1.String({ minLength: 1 }))),
	referencedEntries: Type$1.Optional(Type$1.Integer({ minimum: 0 }))
}, {
	$id: "SideEffectsSpec",
	additionalProperties: false
});
|
|
8754
|
+
/**
 * The success-criteria envelope an imposer attaches to a task.
 *
 * `version` is fixed at the literal `1`. Every other section is optional:
 * `gates`, `assertions`, `rubric`, `sideEffects`, plus `minComposite`, a
 * number constrained to 0..1. Unknown properties are rejected.
 */
var SuccessCriteria = Type$1.Object({
	version: Type$1.Literal(1),
	gates: Type$1.Optional(Type$1.Array(Gate)),
	assertions: Type$1.Optional(Type$1.Array(Assertion)),
	rubric: Type$1.Optional(Rubric),
	minComposite: Type$1.Optional(Type$1.Number({
		minimum: 0,
		maximum: 1
	})),
	sideEffects: Type$1.Optional(SideEffectsSpec)
}, {
	$id: "SuccessCriteria",
	additionalProperties: false
});
|
|
8768
|
+
/** Outcome of a single verification result: `pass`, `fail` or `skip`. */
var VerificationResultStatus = Type$1.Union([
	Type$1.Literal("pass"),
	Type$1.Literal("fail"),
	Type$1.Literal("skip")
], { $id: "VerificationResultStatus" });
|
|
8773
|
+
/** Which kind of criterion a verification result reports on: `gate`, `assertion`, `rubric` or `sideEffect`. */
var VerificationResultKind = Type$1.Union([
	Type$1.Literal("gate"),
	Type$1.Literal("assertion"),
	Type$1.Literal("rubric"),
	Type$1.Literal("sideEffect")
], { $id: "VerificationResultKind" });
|
|
8779
|
+
/**
 * One entry of a `VerificationRecord`: a non-empty `id`, the `kind` of
 * criterion, its `status`, and an optional free-text `detail`. Unknown
 * properties are rejected.
 * NOTE(review): `id` presumably matches the id of the gate/assertion/
 * criterion being reported on; the schema does not enforce that.
 */
var VerificationResult = Type$1.Object({
	id: Type$1.String({ minLength: 1 }),
	kind: VerificationResultKind,
	status: VerificationResultStatus,
	detail: Type$1.Optional(Type$1.String())
}, {
	$id: "VerificationResult",
	additionalProperties: false
});
|
|
8788
|
+
/**
 * The producer's self-assessment block, carried at `output.verification`:
 * a non-empty `inputCid`, the list of per-criterion `results`, and an
 * overall boolean `passed`. Unknown properties are rejected.
 */
var VerificationRecord = Type$1.Object({
	inputCid: Type$1.String({ minLength: 1 }),
	results: Type$1.Array(VerificationResult),
	passed: Type$1.Boolean()
}, {
	$id: "VerificationRecord",
	additionalProperties: false
});
|
|
8796
|
+
//#endregion
|
|
8966
8797
|
//#region ../tasks/src/task-types/assess-brief.ts
|
|
8967
8798
|
/**
|
|
8968
8799
|
* `assess_brief` — independently evaluate a fulfilled brief.
|
|
8969
8800
|
*
|
|
8970
8801
|
* output_kind: judgment
|
|
8971
|
-
* criteria: required (rubric
|
|
8972
|
-
*
|
|
8802
|
+
* criteria: required (`successCriteria.rubric` — same envelope as
|
|
8803
|
+
* `judge_pack`)
|
|
8973
8804
|
* references: required (must reference the target `fulfill_brief` task)
|
|
8974
8805
|
*
|
|
8975
8806
|
* The assessor is a different agent from the producer (enforced by the
|
|
8976
8807
|
* server / runtime at claim time — not in the wire schema).
|
|
8808
|
+
*
|
|
8809
|
+
* The rubric in `successCriteria` IS the job spec — the assessor applies
|
|
8810
|
+
* it to the target task's output and emits per-criterion scores. Other
|
|
8811
|
+
* sections (`assertions`, `gates`, `sideEffects`) MAY be present and are
|
|
8812
|
+
* evaluated against the *assessor's output*.
|
|
8977
8813
|
*/
|
|
8978
8814
|
var ASSESS_BRIEF_TYPE = "assess_brief";
|
|
8979
|
-
/**
 * One criterion lifted from the rubric. Denormalized into the input so the
 * assessor prompt can be built without a second fetch; the `criteria_cid`
 * on the Task row remains authoritative for verification.
 *
 * Fields: non-empty `id` and `description`, a `weight` constrained to 0..1,
 * and a `scoring` mode limited to three literals. Unknown properties are
 * rejected.
 */
var AssessBriefCriterion = Type$1.Object({
	id: Type$1.String({ minLength: 1 }),
	description: Type$1.String({ minLength: 1 }),
	weight: Type$1.Number({
		minimum: 0,
		maximum: 1
	}),
	scoring: Type$1.Union([
		Type$1.Literal("llm_judged"),
		Type$1.Literal("boolean"),
		Type$1.Literal("deterministic_signature_check")
	])
}, {
	$id: "AssessBriefCriterion",
	additionalProperties: false
});
|
|
9000
8815
|
var AssessBriefInput = Type$1.Object({
|
|
9001
8816
|
targetTaskId: Type$1.String({ format: "uuid" }),
|
|
9002
|
-
|
|
9003
|
-
rubricPreamble: Type$1.Optional(Type$1.String())
|
|
8817
|
+
successCriteria: SuccessCriteria
|
|
9004
8818
|
}, {
|
|
9005
8819
|
$id: "AssessBriefInput",
|
|
9006
8820
|
additionalProperties: false
|
|
@@ -9069,7 +8883,8 @@ var CuratePackInput = Type$1.Object({
|
|
|
9069
8883
|
prefix: Type$1.Optional(Type$1.String())
|
|
9070
8884
|
}, { additionalProperties: false })),
|
|
9071
8885
|
tokenBudget: Type$1.Optional(Type$1.Number({ minimum: 500 })),
|
|
9072
|
-
recipe: Type$1.Optional(Type$1.Union([Type$1.Literal("topic-focused-v1"), Type$1.Literal("scope-inventory-v1")]))
|
|
8886
|
+
recipe: Type$1.Optional(Type$1.Union([Type$1.Literal("topic-focused-v1"), Type$1.Literal("scope-inventory-v1")])),
|
|
8887
|
+
successCriteria: Type$1.Optional(SuccessCriteria)
|
|
9073
8888
|
}, {
|
|
9074
8889
|
$id: "CuratePackInput",
|
|
9075
8890
|
additionalProperties: false
|
|
@@ -9094,7 +8909,8 @@ var CuratePackOutput = Type$1.Object({
|
|
|
9094
8909
|
droppedIds: Type$1.Optional(Type$1.Array(Type$1.String({ format: "uuid" }))),
|
|
9095
8910
|
notes: Type$1.String({ minLength: 1 })
|
|
9096
8911
|
}, { additionalProperties: false }))),
|
|
9097
|
-
summary: Type$1.String({ minLength: 1 })
|
|
8912
|
+
summary: Type$1.String({ minLength: 1 }),
|
|
8913
|
+
verification: Type$1.Optional(VerificationRecord)
|
|
9098
8914
|
}, {
|
|
9099
8915
|
$id: "CuratePackOutput",
|
|
9100
8916
|
additionalProperties: false
|
|
@@ -9113,6 +8929,7 @@ var FulfillBriefInput = Type$1.Object({
|
|
|
9113
8929
|
brief: Type$1.String({ minLength: 1 }),
|
|
9114
8930
|
title: Type$1.Optional(Type$1.String()),
|
|
9115
8931
|
acceptanceCriteria: Type$1.Optional(Type$1.Array(Type$1.String())),
|
|
8932
|
+
successCriteria: Type$1.Optional(SuccessCriteria),
|
|
9116
8933
|
seedFiles: Type$1.Optional(Type$1.Array(Type$1.String())),
|
|
9117
8934
|
scopeHint: Type$1.Optional(Type$1.String())
|
|
9118
8935
|
}, {
|
|
@@ -9132,7 +8949,8 @@ var FulfillBriefOutput = Type$1.Object({
|
|
|
9132
8949
|
}, { additionalProperties: false })),
|
|
9133
8950
|
pullRequestUrl: Type$1.Union([Type$1.String(), Type$1.Null()]),
|
|
9134
8951
|
diaryEntryIds: Type$1.Array(Type$1.String({ format: "uuid" })),
|
|
9135
|
-
summary: Type$1.String({ minLength: 1 })
|
|
8952
|
+
summary: Type$1.String({ minLength: 1 }),
|
|
8953
|
+
verification: Type$1.Optional(VerificationRecord)
|
|
9136
8954
|
}, {
|
|
9137
8955
|
$id: "FulfillBriefOutput",
|
|
9138
8956
|
additionalProperties: false
|
|
@@ -9143,19 +8961,18 @@ var FulfillBriefOutput = Type$1.Object({
|
|
|
9143
8961
|
* `judge_pack` — independently score a rendered pack against a rubric.
|
|
9144
8962
|
*
|
|
9145
8963
|
* output_kind: judgment
|
|
9146
|
-
* criteria: required (
|
|
9147
|
-
*
|
|
8964
|
+
* criteria: required (`successCriteria.rubric` — see #852 amendment and
|
|
8965
|
+
* Phase 2 issue #881)
|
|
9148
8966
|
* references: required (must reference the `render_pack` task it judges,
|
|
9149
8967
|
* role='judged_work')
|
|
9150
8968
|
*
|
|
9151
8969
|
* Step 3 of the three-session attribution loop (#875). Mirrors
|
|
9152
8970
|
* `assess_brief` in shape, but over a rendered context pack.
|
|
9153
8971
|
*
|
|
9154
|
-
* Phase 1 rubric storage: the rubric body
|
|
9155
|
-
*
|
|
9156
|
-
* replace the inline body with a `
|
|
9157
|
-
*
|
|
9158
|
-
* building without a fetch.
|
|
8972
|
+
* Phase 1 rubric storage: the rubric body lives at
|
|
8973
|
+
* `input.successCriteria.rubric` and is pinned via the task's `inputCid`.
|
|
8974
|
+
* Phase 2 (#881) will replace the inline body with a `rubricCid`
|
|
8975
|
+
* referencing a stored `rubrics` row; the envelope stays the same.
|
|
9159
8976
|
*
|
|
9160
8977
|
* The judge MUST be a different agent from the renderer. Enforced at
|
|
9161
8978
|
* claim time by the runtime, not in the wire schema.
|
|
@@ -9164,7 +8981,7 @@ var JUDGE_PACK_TYPE = "judge_pack";
|
|
|
9164
8981
|
var JudgePackInput = Type$1.Object({
|
|
9165
8982
|
renderedPackId: Type$1.String({ format: "uuid" }),
|
|
9166
8983
|
sourcePackId: Type$1.String({ format: "uuid" }),
|
|
9167
|
-
|
|
8984
|
+
successCriteria: SuccessCriteria
|
|
9168
8985
|
}, {
|
|
9169
8986
|
$id: "JudgePackInput",
|
|
9170
8987
|
additionalProperties: false
|
|
@@ -9177,6 +8994,7 @@ var JudgePackScore = Type$1.Object({
|
|
|
9177
8994
|
maximum: 1
|
|
9178
8995
|
}),
|
|
9179
8996
|
rationale: Type$1.Optional(Type$1.String()),
|
|
8997
|
+
assertions: Type$1.Optional(Type$1.Array(AssertionResult, { minItems: 1 })),
|
|
9180
8998
|
evidence: Type$1.Optional(Type$1.Record(Type$1.String(), Type$1.Unknown()))
|
|
9181
8999
|
}, {
|
|
9182
9000
|
$id: "JudgePackScore",
|
|
@@ -9195,6 +9013,39 @@ var JudgePackOutput = Type$1.Object({
|
|
|
9195
9013
|
$id: "JudgePackOutput",
|
|
9196
9014
|
additionalProperties: false
|
|
9197
9015
|
});
|
|
9016
|
+
/**
 * Cross-field validator for JudgePackOutput. Run after the TypeBox
 * schema check passes. Enforces an invariant the schema can't express:
 *
 * If a `JudgePackScore` carries an `assertions` array (i.e. the judge ran
 * the criterion in `llm_checklist` mode), its numeric `score` MUST equal
 * `1` if every `assertions[i].passed` is true, else `0`. The prompt
 * instructs the judge to derive `score` from the array, but the LLM can
 * drift — without this check, the runtime accepts inconsistent payloads
 * and propagates them into composite scores and judge attestations
 * (#999 P1). In particular this rejects "score: 1 with a failing
 * assertion".
 *
 * Cross-rubric checks (e.g. "did the judge populate `assertions` for
 * every criterion the rubric marked `llm_checklist`?") require the
 * input rubric and live in a separate, runtime-side validator. This
 * one is rubric-agnostic on purpose — it catches within-score
 * inconsistency without needing the original task input.
 *
 * @param {{scores: Array<{criterionId: *, score: number, assertions?: Array<{passed: *}>}>}} output
 *   A payload that already satisfies the output schema.
 * @returns {string|null} A message describing the first inconsistent score,
 *   or `null` when every score agrees with its assertions.
 */
function validateJudgePackOutput(output) {
	for (const [index, entry] of output.scores.entries()) {
		// Scores without a checklist are not constrained here.
		if (!entry.assertions) continue;
		const allPassed = entry.assertions.every((assertion) => assertion.passed);
		const derivedScore = allPassed ? 1 : 0;
		if (entry.score === derivedScore) continue;
		const verdict = allPassed ? "all pass" : "have at least one fail";
		return `scores[${index}] (criterionId="${entry.criterionId}"): assertions ${verdict} but score=${entry.score}. Score must be derived: 1 iff every assertion passes, else 0 (#999 llm_checklist rule).`;
	}
	return null;
}
|
|
9198
9049
|
//#endregion
|
|
9199
9050
|
//#region ../tasks/src/task-types/render-pack.ts
|
|
9200
9051
|
/**
|
|
@@ -9216,7 +9067,8 @@ var RENDER_PACK_TYPE = "render_pack";
|
|
|
9216
9067
|
var RenderPackInput = Type$1.Object({
|
|
9217
9068
|
packId: Type$1.String({ format: "uuid" }),
|
|
9218
9069
|
persist: Type$1.Optional(Type$1.Boolean()),
|
|
9219
|
-
pinned: Type$1.Optional(Type$1.Boolean())
|
|
9070
|
+
pinned: Type$1.Optional(Type$1.Boolean()),
|
|
9071
|
+
successCriteria: Type$1.Optional(SuccessCriteria)
|
|
9220
9072
|
}, {
|
|
9221
9073
|
$id: "RenderPackInput",
|
|
9222
9074
|
additionalProperties: false
|
|
@@ -9227,7 +9079,8 @@ var RenderPackOutput = Type$1.Object({
|
|
|
9227
9079
|
renderMethod: Type$1.String({ minLength: 1 }),
|
|
9228
9080
|
byteSize: Type$1.Number({ minimum: 0 }),
|
|
9229
9081
|
entriesRendered: Type$1.Number({ minimum: 0 }),
|
|
9230
|
-
summary: Type$1.String({ minLength: 1 })
|
|
9082
|
+
summary: Type$1.String({ minLength: 1 }),
|
|
9083
|
+
verification: Type$1.Optional(VerificationRecord)
|
|
9231
9084
|
}, {
|
|
9232
9085
|
$id: "RenderPackOutput",
|
|
9233
9086
|
additionalProperties: false
|
|
@@ -9235,6 +9088,33 @@ var RenderPackOutput = Type$1.Object({
|
|
|
9235
9088
|
//#endregion
|
|
9236
9089
|
//#region ../tasks/src/task-types/index.ts
|
|
9237
9090
|
/**
 * Validate that a judgment-task input carries a rubric inside its
 * `successCriteria` envelope, and that the rubric's weights sum to 1.
 * Used for `assess_brief` and `judge_pack`.
 *
 * @param {{successCriteria?: {rubric?: object}}} input - The task input.
 * @returns {*} An error message when `successCriteria` or its `rubric` is
 *   missing; otherwise whatever `validateRubricWeights` returns for the rubric.
 */
function validateJudgmentInput(input) {
	const criteria = input.successCriteria;
	if (!criteria) return "successCriteria is required for judgment tasks";
	const { rubric } = criteria;
	if (!rubric) return "successCriteria.rubric is required for judgment tasks";
	// The weight check is delegated to the shared rubric validator.
	return validateRubricWeights(rubric);
}
|
|
9101
|
+
/**
 * Cross-field rule: when `input.successCriteria` is set, the producer's
 * output MUST carry a `verification` block (the LLM's self-assessment).
 * When it is unset, the output MUST NOT carry one (avoid garbage data).
 *
 * Used by all three fulfillment task types. Judgment task outputs do
 * NOT use this — their entire output IS a structured judgment, so a
 * separate self-assessment field would be circular.
 *
 * @param {{verification?: *}} output - The producer's output payload.
 * @param {{successCriteria?: *}|null|undefined} input - The task input; a
 *   missing or `null` input counts as "no criteria".
 * @returns {string|null} An error message when the two sides disagree,
 *   otherwise `null`.
 */
function requireVerificationWhenCriteriaPresent(output, input) {
	// Only `undefined` counts as "unset" on either side.
	const hasCriteria = input?.successCriteria !== undefined;
	const hasVerification = output.verification !== undefined;
	if (hasCriteria === hasVerification) return null;
	return hasCriteria
		? "output.verification is required because input.successCriteria is set; the producer LLM must self-assess against the criteria"
		: "output.verification was supplied but input.successCriteria is unset; omit verification when there are no criteria to assess against";
}
|
|
9117
|
+
/**
|
|
9238
9118
|
* Client-side task-type registry. Mirrors the server-owned DB registry
|
|
9239
9119
|
* (PR 2). PR 0 shipped the two brief types; this PR adds the three
|
|
9240
9120
|
* pack-pipeline types for the three-session attribution loop (#875).
|
|
@@ -9249,41 +9129,41 @@ var BUILT_IN_TASK_TYPES = {
|
|
|
9249
9129
|
inputSchema: FulfillBriefInput,
|
|
9250
9130
|
outputSchema: FulfillBriefOutput,
|
|
9251
9131
|
outputKind: "artifact",
|
|
9252
|
-
|
|
9253
|
-
|
|
9132
|
+
requiresReferences: false,
|
|
9133
|
+
validateOutput: requireVerificationWhenCriteriaPresent
|
|
9254
9134
|
},
|
|
9255
9135
|
[ASSESS_BRIEF_TYPE]: {
|
|
9256
9136
|
name: ASSESS_BRIEF_TYPE,
|
|
9257
9137
|
inputSchema: AssessBriefInput,
|
|
9258
9138
|
outputSchema: AssessBriefOutput,
|
|
9259
9139
|
outputKind: "judgment",
|
|
9260
|
-
|
|
9261
|
-
|
|
9140
|
+
requiresReferences: true,
|
|
9141
|
+
validateInput: validateJudgmentInput
|
|
9262
9142
|
},
|
|
9263
9143
|
[CURATE_PACK_TYPE]: {
|
|
9264
9144
|
name: CURATE_PACK_TYPE,
|
|
9265
9145
|
inputSchema: CuratePackInput,
|
|
9266
9146
|
outputSchema: CuratePackOutput,
|
|
9267
9147
|
outputKind: "artifact",
|
|
9268
|
-
|
|
9269
|
-
|
|
9148
|
+
requiresReferences: false,
|
|
9149
|
+
validateOutput: requireVerificationWhenCriteriaPresent
|
|
9270
9150
|
},
|
|
9271
9151
|
[RENDER_PACK_TYPE]: {
|
|
9272
9152
|
name: RENDER_PACK_TYPE,
|
|
9273
9153
|
inputSchema: RenderPackInput,
|
|
9274
9154
|
outputSchema: RenderPackOutput,
|
|
9275
9155
|
outputKind: "artifact",
|
|
9276
|
-
|
|
9277
|
-
|
|
9156
|
+
requiresReferences: false,
|
|
9157
|
+
validateOutput: requireVerificationWhenCriteriaPresent
|
|
9278
9158
|
},
|
|
9279
9159
|
[JUDGE_PACK_TYPE]: {
|
|
9280
9160
|
name: JUDGE_PACK_TYPE,
|
|
9281
9161
|
inputSchema: JudgePackInput,
|
|
9282
9162
|
outputSchema: JudgePackOutput,
|
|
9283
9163
|
outputKind: "judgment",
|
|
9284
|
-
requiresCriteria: false,
|
|
9285
9164
|
requiresReferences: true,
|
|
9286
|
-
validateInput:
|
|
9165
|
+
validateInput: validateJudgmentInput,
|
|
9166
|
+
validateOutput: validateJudgePackOutput
|
|
9287
9167
|
}
|
|
9288
9168
|
};
|
|
9289
9169
|
//#endregion
|
|
@@ -9313,13 +9193,30 @@ function schemaErrors(prefix, schema, value) {
|
|
|
9313
9193
|
message: error.message
|
|
9314
9194
|
}));
|
|
9315
9195
|
}
|
|
9316
|
-
function validateTaskOutput(taskType, output) {
|
|
9196
|
+
/**
 * Validate a task output in two stages: the registered output schema first,
 * then the task type's optional cross-field `validateOutput` hook.
 *
 * @param {string} taskType - Registry key of the task type.
 * @param {*} output - The payload to validate.
 * @param {*} [input] - The task input, forwarded to the hook as its second
 *   argument.
 * @returns {Array<{field: string, message: string}>} Empty when valid. An
 *   unknown task type yields a single `taskType` error; schema errors are
 *   returned on their own and the hook is not run when any exist.
 */
function validateTaskOutput(taskType, output, input) {
	const entry = getTaskTypeEntry(taskType);
	if (!entry) return [{
		field: "taskType",
		message: `Unknown task type: ${taskType}`
	}];
	const schemaIssues = schemaErrors("output", entry.outputSchema, output);
	if (schemaIssues.length > 0) return schemaIssues;
	// The hook assumes a schema-valid payload, so it runs only after that check.
	const crossFieldError = entry.validateOutput ? entry.validateOutput(output, input) : null;
	if (!crossFieldError) return [];
	return [{
		field: "output",
		message: crossFieldError
	}];
}
|
|
9213
|
+
/**
 * Resolve the TypeBox output schema registered for `taskType`. Returns
 * `null` for unknown task types — callers (e.g. submit-tool factories)
 * decide how to surface that.
 *
 * @param {string} taskType - Registry key of the task type.
 * @returns {object|null} The entry's `outputSchema`, or `null` when the
 *   entry or its schema is missing.
 */
function getTaskOutputSchema(taskType) {
	const entry = getTaskTypeEntry(taskType);
	return entry?.outputSchema ?? null;
}
|
|
9324
9221
|
//#endregion
|
|
9325
9222
|
//#region ../tasks/src/wire.ts
|
|
@@ -9451,7 +9348,6 @@ Type$1.Object({
|
|
|
9451
9348
|
input: Type$1.Record(Type$1.String(), Type$1.Unknown()),
|
|
9452
9349
|
inputSchemaCid: Cid,
|
|
9453
9350
|
inputCid: Cid,
|
|
9454
|
-
criteriaCid: Type$1.Union([Cid, Type$1.Null()]),
|
|
9455
9351
|
references: Type$1.Array(TaskRef),
|
|
9456
9352
|
correlationId: Type$1.Union([Uuid, Type$1.Null()]),
|
|
9457
9353
|
imposedByAgentId: Type$1.Union([Uuid, Type$1.Null()]),
|
|
@@ -9549,6 +9445,98 @@ Type$1.Object({
|
|
|
9549
9445
|
additionalProperties: false
|
|
9550
9446
|
});
|
|
9551
9447
|
//#endregion
|
|
9448
|
+
//#region ../agent-runtime/src/output-tools.ts
|
|
9449
|
+
/**
|
|
9450
|
+
* Submit-output tool contract.
|
|
9451
|
+
*
|
|
9452
|
+
* The runtime advertises a per-task-type "submit output" tool in every
|
|
9453
|
+
* prompt. The tool's name and schema must be the same wherever the
|
|
9454
|
+
* agent encounters it: in the system prompt the model reads, in the
|
|
9455
|
+
* executor that registers it, in any future executor that wires it
|
|
9456
|
+
* into a different coding-agent SDK.
|
|
9457
|
+
*
|
|
9458
|
+
* This module is the single source of truth for the (toolName,
|
|
9459
|
+
* description, parametersSchema) triple. It has no executor-specific
|
|
9460
|
+
* dependencies — `agent-runtime` is intentionally agnostic of the
|
|
9461
|
+
* concrete coding-agent runtime — so anything that wants to register
|
|
9462
|
+
* the tool (pi-extension today, a Codex-SDK adapter tomorrow, a local
|
|
9463
|
+
* MCP bridge if we ever go that route) can read the contract here and
|
|
9464
|
+
* wire it into its own tool API.
|
|
9465
|
+
*
|
|
9466
|
+
* Conventions captured here:
|
|
9467
|
+
*
|
|
9468
|
+
* - Tool name shape: `submit_<task_type>_output` (e.g.
|
|
9469
|
+
* `submit_fulfill_brief_output`). This is the string the model
|
|
9470
|
+
* sees in the prompt's "preferred path" instruction.
|
|
9471
|
+
* - Parameters schema: the task type's TypeBox `*Output` schema
|
|
9472
|
+
* **directly**, NOT wrapped in `{ output: <schema> }`. Tool args
|
|
9473
|
+
* ARE the payload, so the model gets field-level guidance at
|
|
9474
|
+
* planning time.
|
|
9475
|
+
* - Description text: shared across executors so the tool's
|
|
9476
|
+
* advertised purpose is identical regardless of who registers it.
|
|
9477
|
+
*/
|
|
9478
|
+
/**
 * Build the submit-output contract for a task type. Returns `null` if
 * no output schema is registered for that type — callers (executors)
 * decide whether that's a hard error, a fallback to the parser-only
 * path, or anything else.
 *
 * @param {string} taskType - Registry key of the task type.
 * @returns {{toolName: string, taskType: string, description: string, parametersSchema: object}|null}
 *   The tool name, shared description text, and the task type's output
 *   schema used directly (unwrapped) as the tool's parameters schema.
 */
function getSubmitOutputContract(taskType) {
	const parametersSchema = getTaskOutputSchema(taskType);
	if (!parametersSchema) return null;
	const description = `Submit the structured output for this ${taskType} task. Call exactly once when done. The arguments below ARE the output payload — pass each top-level field of the task type's output schema directly. The runtime validates the args against the schema; mismatches return a tool error you can recover from in the same session. On a valid call the runtime captures the payload and ends the session — you do not need to repeat the JSON in your final assistant message.`;
	return {
		toolName: submitOutputToolName(taskType),
		taskType,
		description,
		parametersSchema
	};
}
|
|
9494
|
+
/**
 * Plain-string name builder. Exposed separately so the prompt builder
 * can advertise the tool name even when the schema lookup is deferred
 * to the executor (the prompt is built before any tool registration
 * happens).
 *
 * @param {string} taskType - Task type, e.g. `fulfill_brief`.
 * @returns {string} `submit_<taskType>_output`.
 */
function submitOutputToolName(taskType) {
	return `submit_${taskType}_output`;
}
|
|
9503
|
+
//#endregion
|
|
9504
|
+
//#region ../agent-runtime/src/prompts/final-output.ts
|
|
9505
|
+
/**
 * Render the "Final output" section of a task prompt as one
 * newline-joined string.
 *
 * The section tells the model to finish by reporting structured output,
 * preferably by calling the task type's submit tool and otherwise by
 * emitting a single JSON object as its last message. It then shows the
 * expected shape in a fenced `json` block.
 *
 * @param {object} opts
 * @param {string} opts.taskType - Used to derive the submit tool name.
 * @param {string} opts.outputSchemaName - Schema name quoted in the text.
 * @param {string} opts.shapeSketch - Placed verbatim inside the `json` fence.
 * @param {string[]} [opts.extraNotes] - When non-empty, appended after one
 *   blank line, one note per line.
 * @returns {string} The assembled section.
 */
function buildFinalOutputBlock(opts) {
	const { taskType, outputSchemaName, shapeSketch, extraNotes } = opts;
	const submitTool = submitOutputToolName(taskType);
	const lines = [
		"## Final output (read this carefully)",
		"",
		`Your VERY LAST action in this conversation MUST report the structured`,
		`output matching \`${outputSchemaName}\`. Two ways to do it, in order of`,
		`preference:`,
		"",
		`1. **Preferred — call \`${submitTool}\` exactly once** with the payload.`,
		` The runtime captures the validated arguments and ends the session.`,
		` If the tool is registered, prefer this path.`,
		`2. **Fallback** — if the submit tool is unavailable, your very last`,
		` assistant message MUST be a single JSON object matching`,
		` \`${outputSchemaName}\`. No prose before or after. No code fences.`,
		` No "ok" or "done". The runtime parses the last balanced top-level`,
		` JSON object as the output.`,
		"",
		`Failing to report structured output as the very last action means the`,
		`attempt is marked failed even if the underlying work succeeded.`,
		"",
		`Output shape:`,
		"",
		"```json",
		shapeSketch,
		"```"
	];
	// Notes are optional; the blank separator is added only when there are any.
	if (extraNotes?.length) lines.push("", ...extraNotes);
	return lines.join("\n");
}
|
|
9539
|
+
//#endregion
|
|
9552
9540
|
//#region ../agent-runtime/src/prompts/assess-brief.ts
|
|
9553
9541
|
/**
|
|
9554
9542
|
* Build the system prompt for an `assess_brief` judge attempt.
|
|
@@ -9573,11 +9561,12 @@ Type$1.Object({
|
|
|
9573
9561
|
* anything) work without any code path here.
|
|
9574
9562
|
*/
|
|
9575
9563
|
function buildAssessBriefPrompt(input, ctx) {
|
|
9576
|
-
const
|
|
9577
|
-
const
|
|
9564
|
+
const rubric = input.successCriteria.rubric;
|
|
9565
|
+
const criteriaList = rubric.criteria.map((c, i) => `${i + 1}. **${c.id}** (weight ${c.weight}, scoring: \`${c.scoring}\`) — ${c.description}`).join("\n");
|
|
9566
|
+
const preambleSection = rubric.preamble ? [
|
|
9578
9567
|
"### Rubric preamble",
|
|
9579
9568
|
"",
|
|
9580
|
-
|
|
9569
|
+
rubric.preamble,
|
|
9581
9570
|
""
|
|
9582
9571
|
].join("\n") : "";
|
|
9583
9572
|
return [
|
|
@@ -9606,6 +9595,20 @@ function buildAssessBriefPrompt(input, ctx) {
|
|
|
9606
9595
|
" - `summary` set → use as orientation, not as ground truth.",
|
|
9607
9596
|
"Adapt your investigation to whatever the output actually contains. Score conservatively when the producer's output is opaque or thin.",
|
|
9608
9597
|
"",
|
|
9598
|
+
"### Querying the producer's diary entries",
|
|
9599
|
+
"",
|
|
9600
|
+
`Beyond the explicit \`diaryEntryIds[]\` from step 3, the producer's`,
|
|
9601
|
+
"attempts auto-tag every entry with the `task:*` provenance namespace.",
|
|
9602
|
+
"You can pull the full set without enumerating ids by passing the",
|
|
9603
|
+
"`taskFilter` shorthand to `moltnet_list_entries` or",
|
|
9604
|
+
"`moltnet_search_entries`:",
|
|
9605
|
+
"",
|
|
9606
|
+
`- All entries from the producer task: \`taskFilter: { taskId: "${input.targetTaskId}" }\`.`,
|
|
9607
|
+
"- Just the accepted attempt: add `attemptN: <acceptedAttemptN>`.",
|
|
9608
|
+
"- The producer plus any prior chain (when a correlationId was set):",
|
|
9609
|
+
" read it from the task you fetched in step 1 and pass",
|
|
9610
|
+
" `taskFilter: { correlationId: \"<id>\" }`.",
|
|
9611
|
+
"",
|
|
9609
9612
|
preambleSection,
|
|
9610
9613
|
"## Criteria",
|
|
9611
9614
|
"",
|
|
@@ -9613,19 +9616,63 @@ function buildAssessBriefPrompt(input, ctx) {
|
|
|
9613
9616
|
"",
|
|
9614
9617
|
"### Scoring rules",
|
|
9615
9618
|
"",
|
|
9616
|
-
"- `
|
|
9619
|
+
"- `llm_score`: score 0..1 continuous. `rationale` REQUIRED (2–4 sentences).",
|
|
9617
9620
|
"- `boolean`: score exactly 0 or 1. `rationale` optional.",
|
|
9618
9621
|
"- `deterministic_signature_check`: run `moltnet entry verify` on every diary entry returned by step 3 above AND `git verify-commit` on every commit. Score 1 iff ALL signatures are valid; otherwise 0. Populate `evidence.commitsVerified`, `evidence.commitsTotal`, `evidence.signatureFailures`.",
|
|
9619
9622
|
"",
|
|
9620
|
-
"
|
|
9623
|
+
"Write a signed diary entry (tags: \"judgment\", \"assess_brief\") capturing the rationale before reporting structured output.",
|
|
9621
9624
|
"",
|
|
9622
|
-
|
|
9623
|
-
|
|
9624
|
-
|
|
9625
|
-
|
|
9625
|
+
buildFinalOutputBlock({
|
|
9626
|
+
taskType: "assess_brief",
|
|
9627
|
+
outputSchemaName: "AssessBriefOutput",
|
|
9628
|
+
shapeSketch: [
|
|
9629
|
+
"{",
|
|
9630
|
+
" \"scores\": [",
|
|
9631
|
+
" { \"criterionId\": \"...\", \"score\": 0.0, \"rationale\": \"...\", \"evidence\": {} }",
|
|
9632
|
+
" ],",
|
|
9633
|
+
" \"composite\": <sum>,",
|
|
9634
|
+
" \"verdict\": \"<1-3 sentence overall>\",",
|
|
9635
|
+
" \"judgeModel\": \"<provider:model>\"",
|
|
9636
|
+
"}"
|
|
9637
|
+
].join("\n"),
|
|
9638
|
+
extraNotes: ["`composite` = Σ(weight_i × score_i) recomputed. The runtime rejects a mismatch."]
|
|
9639
|
+
})
|
|
9626
9640
|
].filter(Boolean).join("\n");
|
|
9627
9641
|
}
|
|
9628
9642
|
//#endregion
|
|
9643
|
+
//#region ../agent-runtime/src/prompts/self-verification.ts
|
|
9644
|
+
/**
 * Build the "Self-verification" section of a task system prompt.
 *
 * The section instructs the agent to call `moltnet_get_task` for its own
 * task, read `input.successCriteria`, and include a `verification` block in
 * its final output only when criteria are present. A JSON sketch of the
 * expected `verification` shape is appended.
 *
 * @param {string} taskId - Task id interpolated verbatim into the
 *   `moltnet_get_task` instruction.
 * @returns {string} Markdown text with lines joined by "\n". The last array
 *   element is the empty string, so the result ends right after a newline.
 */
function buildSelfVerificationBlock(taskId) {
	return [
		"## Self-verification",
		"",
		`Call \`moltnet_get_task\` with task id \`${taskId}\` and read \`input.successCriteria\`.`,
		"",
		"- If `input.successCriteria` is **absent**, omit `verification` from your",
		" final output entirely.",
		"- If `input.successCriteria` is **present**, you MUST include a",
		" `verification` block in your final output. Evaluate every applicable",
		" item — `gates`, `assertions`, `rubric` criteria, `sideEffects` — against",
		" your produced work and emit one result per id. Be honest: a `fail` with",
		" a one-line reason is more useful than a false `pass`. Use `skip` (with a",
		" `detail`) when you genuinely could not determine a result. Compute",
		" `passed = results.every(r => r.status !== 'fail')`.",
		"",
		"Verification shape:",
		"",
		"```json",
		"{",
		" \"inputCid\": \"<the inputCid you saw on the task>\",",
		" \"results\": [",
		" { \"id\": \"<criterion id>\", \"kind\": \"assertion|gate|rubric|sideEffect\",",
		" \"status\": \"pass|fail|skip\", \"detail\": \"<optional one-liner>\" }",
		" ],",
		" \"passed\": <boolean>",
		"}",
		"```",
		""
	].join("\n");
}
|
|
9675
|
+
//#endregion
|
|
9629
9676
|
//#region ../agent-runtime/src/prompts/curate-pack.ts
|
|
9630
9677
|
/**
|
|
9631
9678
|
* Build the system prompt for a `curate_pack` task.
|
|
@@ -9699,9 +9746,16 @@ function buildCuratePackPrompt(input, ctx) {
|
|
|
9699
9746
|
"## Tools available (not a recipe — use what the situation calls for)",
|
|
9700
9747
|
"",
|
|
9701
9748
|
"- `moltnet_diary_tags` — tag inventory with counts. Cheap reconnaissance",
|
|
9702
|
-
" when the prompt implies a scope but not a tag.",
|
|
9749
|
+
" when the prompt implies a scope but not a tag. Pass",
|
|
9750
|
+
" `prefix: \"task:\"` to enumerate task-provenance tags only",
|
|
9751
|
+
" (`task:type:*`, `task:correlation:*`, etc.).",
|
|
9703
9752
|
"- `moltnet_search_entries` — hybrid semantic + lexical search.",
|
|
9704
|
-
"
|
|
9753
|
+
" Filters AND with the query: pass `tags`, `excludeTags`,",
|
|
9754
|
+
" `entryTypes`, or the `taskFilter` shorthand to narrow before",
|
|
9755
|
+
" ranking. Example: `taskFilter: { taskType: \"fulfill_brief\" }`",
|
|
9756
|
+
" returns only entries from fulfill_brief attempts.",
|
|
9757
|
+
"- `moltnet_list_entries` — multi-tag (AND) listing with optional",
|
|
9758
|
+
" `excludeTags`, `entryType`, and the same `taskFilter` shorthand.",
|
|
9705
9759
|
"- `moltnet_get_entry` — full entry read, for disambiguation.",
|
|
9706
9760
|
"- `moltnet_pack_create` — terminal call that persists the pack.",
|
|
9707
9761
|
"",
|
|
@@ -9747,31 +9801,32 @@ function buildCuratePackPrompt(input, ctx) {
|
|
|
9747
9801
|
"",
|
|
9748
9802
|
"## Hard constraints",
|
|
9749
9803
|
"",
|
|
9750
|
-
"- Do NOT call `moltnet_pack_render`
|
|
9751
|
-
" those belong to the next sessions.",
|
|
9804
|
+
"- Do NOT call `moltnet_pack_render` — that belongs to the next session.",
|
|
9752
9805
|
"- Do NOT write diary entries unless curation surfaces a genuine",
|
|
9753
9806
|
" incident worth recording. The curation reasoning lives in the task",
|
|
9754
9807
|
" output, not in the diary.",
|
|
9755
9808
|
"- Respect hard include/exclude filters literally.",
|
|
9756
9809
|
"",
|
|
9757
|
-
|
|
9758
|
-
|
|
9759
|
-
|
|
9760
|
-
|
|
9761
|
-
|
|
9762
|
-
|
|
9763
|
-
|
|
9764
|
-
|
|
9765
|
-
|
|
9766
|
-
|
|
9767
|
-
|
|
9768
|
-
|
|
9769
|
-
|
|
9770
|
-
|
|
9771
|
-
|
|
9772
|
-
|
|
9773
|
-
|
|
9774
|
-
|
|
9810
|
+
buildSelfVerificationBlock(ctx.taskId),
|
|
9811
|
+
buildFinalOutputBlock({
|
|
9812
|
+
taskType: "curate_pack",
|
|
9813
|
+
outputSchemaName: "CuratePackOutput",
|
|
9814
|
+
shapeSketch: [
|
|
9815
|
+
"{",
|
|
9816
|
+
" \"packId\": \"<uuid>\",",
|
|
9817
|
+
" \"packCid\": \"<cid>\",",
|
|
9818
|
+
" \"entries\": [",
|
|
9819
|
+
" { \"entryId\": \"<uuid>\", \"rank\": 1, \"rationale\": \"<why>\" }",
|
|
9820
|
+
" ],",
|
|
9821
|
+
" \"recipeParams\": { \"recipe\": \"...\", \"prompt\": \"...\", ... },",
|
|
9822
|
+
" \"checkpoints\": [",
|
|
9823
|
+
" { \"phase\": \"recon\", \"candidateIds\": [...], \"droppedIds\": [...], \"notes\": \"...\" }",
|
|
9824
|
+
" ],",
|
|
9825
|
+
" \"summary\": \"<2-4 sentences: what you looked for, how you narrowed, what defines the final set>\",",
|
|
9826
|
+
" \"verification\": <required iff input.successCriteria; see Self-verification>",
|
|
9827
|
+
"}"
|
|
9828
|
+
].join("\n")
|
|
9829
|
+
})
|
|
9775
9830
|
].filter((l) => l !== null).join("\n");
|
|
9776
9831
|
}
|
|
9777
9832
|
//#endregion
|
|
@@ -9829,17 +9884,28 @@ function buildFulfillBriefPrompt(input, ctx) {
|
|
|
9829
9884
|
" `MoltNet-Diary: <id>` (per the runtime instructor).",
|
|
9830
9885
|
"6. Push the branch and open a PR.",
|
|
9831
9886
|
"",
|
|
9832
|
-
|
|
9833
|
-
|
|
9834
|
-
|
|
9835
|
-
|
|
9836
|
-
|
|
9887
|
+
buildSelfVerificationBlock(ctx.taskId),
|
|
9888
|
+
buildFinalOutputBlock({
|
|
9889
|
+
taskType: "fulfill_brief",
|
|
9890
|
+
outputSchemaName: "FulfillBriefOutput",
|
|
9891
|
+
shapeSketch: [
|
|
9892
|
+
"{",
|
|
9893
|
+
" \"branch\": \"<branch-name>\",",
|
|
9894
|
+
" \"commits\": [{ \"sha\": \"...\", \"message\": \"...\", \"diaryEntryId\": \"...\" }],",
|
|
9895
|
+
" \"pullRequestUrl\": \"<url-or-null>\",",
|
|
9896
|
+
" \"diaryEntryIds\": [\"...\"],",
|
|
9897
|
+
" \"summary\": \"<1-3 sentence recap>\",",
|
|
9898
|
+
" \"verification\": <required iff input.successCriteria; see Self-verification>",
|
|
9899
|
+
"}"
|
|
9900
|
+
].join("\n")
|
|
9901
|
+
})
|
|
9837
9902
|
].filter(Boolean).join("\n");
|
|
9838
9903
|
}
|
|
9839
9904
|
//#endregion
|
|
9840
9905
|
//#region ../agent-runtime/src/prompts/judge-pack.ts
|
|
9841
9906
|
function buildJudgePackPrompt(input, ctx) {
|
|
9842
|
-
const { renderedPackId, sourcePackId,
|
|
9907
|
+
const { renderedPackId, sourcePackId, successCriteria } = input;
|
|
9908
|
+
const rubric = successCriteria.rubric;
|
|
9843
9909
|
const criteriaList = rubric.criteria.map((c, i) => `${i + 1}. **${c.id}** (weight ${c.weight}, scoring: \`${c.scoring}\`) — ${c.description}`).join("\n");
|
|
9844
9910
|
const preambleSection = rubric.preamble ? [
|
|
9845
9911
|
"### Rubric preamble",
|
|
@@ -9869,7 +9935,7 @@ function buildJudgePackPrompt(input, ctx) {
|
|
|
9869
9935
|
"",
|
|
9870
9936
|
"1. Call `moltnet_rendered_pack_get` for the rendered pack. Keep the",
|
|
9871
9937
|
" `content` string — you will score it.",
|
|
9872
|
-
"2. Call `moltnet_pack_get` with `
|
|
9938
|
+
"2. Call `moltnet_pack_get` with `expandEntries: true` for the source",
|
|
9873
9939
|
" pack. Keep the source entries for grounding / coverage checks.",
|
|
9874
9940
|
"3. For each criterion, score according to its `scoring` mode (see",
|
|
9875
9941
|
" Scoring rules below). Produce rationales where required.",
|
|
@@ -9882,9 +9948,23 @@ function buildJudgePackPrompt(input, ctx) {
|
|
|
9882
9948
|
"",
|
|
9883
9949
|
"### Scoring rules",
|
|
9884
9950
|
"",
|
|
9885
|
-
"- `
|
|
9951
|
+
"- `llm_score`: score 0..1 continuous. `rationale` REQUIRED (2–4",
|
|
9886
9952
|
" sentences pointing at specific evidence in the rendered content or",
|
|
9887
|
-
" the source entries).",
|
|
9953
|
+
" the source entries). NOTE: this mode smooths individual failures",
|
|
9954
|
+
" into the gradient. Prefer `llm_checklist` for grounding,",
|
|
9955
|
+
" faithfulness, or any property where one failure is a real failure.",
|
|
9956
|
+
"- `llm_checklist`: enumerate per-claim binary assertions instead of",
|
|
9957
|
+
" picking a continuous score. For each assertion, return",
|
|
9958
|
+
" `{ id, text, passed: bool, evidence: string }`. `evidence` is",
|
|
9959
|
+
" REQUIRED for both PASS and FAIL — for PASS, quote the supporting",
|
|
9960
|
+
" span (rendered or source) or cite the source entry id; for FAIL,",
|
|
9961
|
+
" quote the offending claim verbatim and explain why it fails.",
|
|
9962
|
+
" Don't give the benefit of the doubt: if a claim looks supported but",
|
|
9963
|
+
" you cannot point at the supporting source span, mark it FAIL with",
|
|
9964
|
+
" evidence = \"no supporting span found\". Set the criterion `score`",
|
|
9965
|
+
" to `1` iff every assertion passes, else `0` — the runtime checks",
|
|
9966
|
+
" this matches the assertions array. Populate `assertions` on the",
|
|
9967
|
+
" score object; leave `evidence` (the structured record) empty.",
|
|
9888
9968
|
"- `boolean`: score exactly 0 or 1. `rationale` optional.",
|
|
9889
9969
|
"- `deterministic_signature_check`: batch-fetch ALL referenced source",
|
|
9890
9970
|
" entries in a single call — `moltnet_list_entries` with `entryIds` set",
|
|
@@ -9915,23 +9995,36 @@ function buildJudgePackPrompt(input, ctx) {
|
|
|
9915
9995
|
" may leak guidance that biases judgment.",
|
|
9916
9996
|
"- Keep the session focused on scoring; no speculative exploration.",
|
|
9917
9997
|
"",
|
|
9918
|
-
"## Final output",
|
|
9919
|
-
"",
|
|
9920
|
-
"Write to stdout a JSON object matching `JudgePackOutput`:",
|
|
9921
|
-
"```",
|
|
9922
|
-
"{",
|
|
9923
|
-
" \"scores\": [{\"criterionId\": \"...\", \"score\": 0.0, \"rationale\": \"...\", \"evidence\": {...}}],",
|
|
9924
|
-
" \"composite\": <sum-of-weighted-scores>,",
|
|
9925
|
-
" \"verdict\": \"<1-3 sentence overall>\",",
|
|
9926
|
-
" \"judgeModel\": \"<provider:model>\",",
|
|
9927
|
-
" \"rendererBinaryCid\": \"<cid-string-only-if-available>\"",
|
|
9928
|
-
"}",
|
|
9929
|
-
"```",
|
|
9930
|
-
"Omit `rendererBinaryCid` entirely when no binary CID is exposed by",
|
|
9931
|
-
"`moltnet_rendered_pack_get`. Do NOT emit `null` — the field is optional",
|
|
9932
|
-
"and absence is the correct representation when unavailable.",
|
|
9933
9998
|
`Write a signed diary entry (tags: \`judgment\`, \`judge_pack\`, \`rubric:${rubric.rubricId}\`) capturing the rationale before`,
|
|
9934
|
-
"
|
|
9999
|
+
"reporting structured output.",
|
|
10000
|
+
"",
|
|
10001
|
+
buildFinalOutputBlock({
|
|
10002
|
+
taskType: "judge_pack",
|
|
10003
|
+
outputSchemaName: "JudgePackOutput",
|
|
10004
|
+
shapeSketch: [
|
|
10005
|
+
"{",
|
|
10006
|
+
" \"scores\": [",
|
|
10007
|
+
" { \"criterionId\": \"...\", \"score\": 0.0, \"rationale\": \"...\", \"evidence\": {} },",
|
|
10008
|
+
" {",
|
|
10009
|
+
" \"criterionId\": \"<llm_checklist criterion>\",",
|
|
10010
|
+
" \"score\": 0, // 1 iff every assertion passed",
|
|
10011
|
+
" \"assertions\": [",
|
|
10012
|
+
" { \"id\": \"claim-1\", \"text\": \"...\", \"passed\": false, \"evidence\": \"...\" }",
|
|
10013
|
+
" ]",
|
|
10014
|
+
" }",
|
|
10015
|
+
" ],",
|
|
10016
|
+
" \"composite\": <sum-of-weighted-scores>,",
|
|
10017
|
+
" \"verdict\": \"<1-3 sentence overall>\",",
|
|
10018
|
+
" \"judgeModel\": \"<provider:model>\",",
|
|
10019
|
+
" \"rendererBinaryCid\": \"<cid-string-only-if-available>\"",
|
|
10020
|
+
"}"
|
|
10021
|
+
].join("\n"),
|
|
10022
|
+
extraNotes: [
|
|
10023
|
+
"Omit `rendererBinaryCid` entirely when no binary CID is exposed by",
|
|
10024
|
+
"`moltnet_rendered_pack_get`. Do NOT emit `null` — the field is",
|
|
10025
|
+
"optional and absence is the correct representation when unavailable."
|
|
10026
|
+
]
|
|
10027
|
+
})
|
|
9935
10028
|
].filter((l) => l !== null).join("\n");
|
|
9936
10029
|
}
|
|
9937
10030
|
//#endregion
|
|
@@ -9960,7 +10053,7 @@ function buildRenderPackPrompt(input, ctx) {
|
|
|
9960
10053
|
"",
|
|
9961
10054
|
"## Workflow",
|
|
9962
10055
|
"",
|
|
9963
|
-
"1. Call `moltnet_pack_get` with `
|
|
10056
|
+
"1. Call `moltnet_pack_get` with `expandEntries: true` to inspect the",
|
|
9964
10057
|
" source entries. Read it — you need the entry count for your output.",
|
|
9965
10058
|
"2. Call `moltnet_pack_render` with:",
|
|
9966
10059
|
` - \`packId\`: \`${packId}\``,
|
|
@@ -9972,24 +10065,25 @@ function buildRenderPackPrompt(input, ctx) {
|
|
|
9972
10065
|
"## Constraints",
|
|
9973
10066
|
"",
|
|
9974
10067
|
"- Do NOT modify the source pack or its entries.",
|
|
9975
|
-
"- Do NOT call `moltnet_rendered_pack_judge`.",
|
|
9976
10068
|
"- Do NOT write diary entries unless a genuine incident occurs",
|
|
9977
10069
|
" (rendering failure, invariant violation).",
|
|
9978
10070
|
"",
|
|
9979
|
-
|
|
9980
|
-
|
|
9981
|
-
|
|
9982
|
-
|
|
9983
|
-
|
|
9984
|
-
|
|
9985
|
-
|
|
9986
|
-
|
|
9987
|
-
|
|
9988
|
-
|
|
9989
|
-
|
|
9990
|
-
|
|
9991
|
-
|
|
9992
|
-
|
|
10071
|
+
buildSelfVerificationBlock(ctx.taskId),
|
|
10072
|
+
buildFinalOutputBlock({
|
|
10073
|
+
taskType: "render_pack",
|
|
10074
|
+
outputSchemaName: "RenderPackOutput",
|
|
10075
|
+
shapeSketch: [
|
|
10076
|
+
"{",
|
|
10077
|
+
" \"renderedPackId\": \"<uuid-or-null>\",",
|
|
10078
|
+
" \"renderedCid\": \"<cid>\",",
|
|
10079
|
+
" \"renderMethod\": \"<label>\",",
|
|
10080
|
+
" \"byteSize\": <int>,",
|
|
10081
|
+
" \"entriesRendered\": <int>,",
|
|
10082
|
+
" \"summary\": \"<1-3 sentence recap>\",",
|
|
10083
|
+
" \"verification\": <required iff input.successCriteria; see Self-verification>",
|
|
10084
|
+
"}"
|
|
10085
|
+
].join("\n")
|
|
10086
|
+
})
|
|
9993
10087
|
].join("\n");
|
|
9994
10088
|
}
|
|
9995
10089
|
//#endregion
|
|
@@ -12020,7 +12114,7 @@ var require_transport = /* @__PURE__ */ __commonJSMin(((exports, module) => {
|
|
|
12020
12114
|
var { existsSync: existsSync$1 } = __require("node:fs");
|
|
12021
12115
|
var getCallers = require_caller();
|
|
12022
12116
|
var { join: join$1, isAbsolute, sep } = __require("node:path");
|
|
12023
|
-
var { fileURLToPath
|
|
12117
|
+
var { fileURLToPath } = __require("node:url");
|
|
12024
12118
|
var sleep = require_atomic_sleep();
|
|
12025
12119
|
var onExit = require_on_exit_leak_free();
|
|
12026
12120
|
var ThreadStream = require_thread_stream();
|
|
@@ -12076,7 +12170,7 @@ var require_transport = /* @__PURE__ */ __commonJSMin(((exports, module) => {
|
|
|
12076
12170
|
if (!unquoted) return false;
|
|
12077
12171
|
let path = unquoted;
|
|
12078
12172
|
if (path.startsWith("file://")) try {
|
|
12079
|
-
path = fileURLToPath
|
|
12173
|
+
path = fileURLToPath(path);
|
|
12080
12174
|
} catch {
|
|
12081
12175
|
return false;
|
|
12082
12176
|
}
|
|
@@ -13567,9 +13661,13 @@ function buildRuntimeInstructor(ctx) {
|
|
|
13567
13661
|
`- During this task, every diary entry MUST land in \`${ctx.diaryId}\``,
|
|
13568
13662
|
" (the task diary). The MCP `moltnet_create_entry` tool enforces this",
|
|
13569
13663
|
" and rejects mismatched explicit `diaryId` parameters.",
|
|
13570
|
-
`- Provenance tags \`task:${ctx.taskId}\`, \`
|
|
13571
|
-
` and \`
|
|
13572
|
-
"
|
|
13664
|
+
`- Provenance tags \`task:id:${ctx.taskId}\`, \`task:type:${ctx.taskType}\`,`,
|
|
13665
|
+
` and \`task:attempt:${ctx.attemptN}\`${ctx.correlationId ? `, plus \`task:correlation:${ctx.correlationId}\`` : ""} are auto-injected on every entry.`,
|
|
13666
|
+
" These share the `task:` namespace so `moltnet_diary_tags` with",
|
|
13667
|
+
" `prefix: \"task:\"` lists every task-scoped tag, and the",
|
|
13668
|
+
" `taskFilter` shorthand on `moltnet_list_entries` /",
|
|
13669
|
+
" `moltnet_search_entries` expands into them. You may add additional",
|
|
13670
|
+
" tags but you cannot remove the auto-injected ones.",
|
|
13573
13671
|
"",
|
|
13574
13672
|
"## Accountable commits",
|
|
13575
13673
|
"",
|
|
@@ -13598,42 +13696,78 @@ function buildRuntimeInstructor(ctx) {
|
|
|
13598
13696
|
}
|
|
13599
13697
|
//#endregion
|
|
13600
13698
|
//#region src/runtime/task-output.ts
|
|
13601
|
-
|
|
13699
|
+
/** Name handed to `metrics.getMeter` for the task-output instrumentation. */
var METER_NAME = "@themoltnet/pi-extension/task-output";
/** Cached counter instance; stays `null` until `getParseResultCounter()` first runs. */
var parseResultCounter = null;
/**
 * Return the `agent_runtime.task_output.parse_result` counter.
 *
 * The counter is created on the first call through
 * `metrics.getMeter(METER_NAME).createCounter(...)` and cached in the
 * module-level `parseResultCounter`; later calls return the cached instance.
 *
 * @returns {object} The counter returned by `createCounter`.
 */
function getParseResultCounter() {
	if (parseResultCounter) return parseResultCounter;
	parseResultCounter = metrics.getMeter(METER_NAME).createCounter("agent_runtime.task_output.parse_result", {
		description: "Outcome of structured task-output capture, labelled by task_type, model, and code (success | output_missing | output_validation_failed | unknown_task_type | output_cid_compute_failed | captured_via_tool).",
		unit: "1"
	});
	return parseResultCounter;
}
|
|
13709
|
+
/**
 * Record one parse-result observation. Exposed so the executor can also
 * record the `captured_via_tool` outcome from the submit-tool path
 * without bouncing through the parser. Labels: `task_type`, `model`, `code`.
 *
 * @param {object} args
 * @param {string} args.taskType - Emitted as the `task_type` label.
 * @param {string} [args.model] - Emitted as the `model` label; `"unknown"`
 *   is used when the value is `null` or `undefined`.
 * @param {string} args.code - Outcome code emitted as the `code` label.
 * @returns {void}
 */
function recordTaskOutputParseResult(args) {
	getParseResultCounter().add(1, {
		task_type: args.taskType,
		model: args.model ?? "unknown",
		code: args.code
	});
}
|
|
13721
|
+
async function parseStructuredTaskOutput(assistantText, taskType, opts = {}) {
|
|
13722
|
+
const record = (code) => recordTaskOutputParseResult({
|
|
13723
|
+
taskType,
|
|
13724
|
+
model: opts.model,
|
|
13725
|
+
code
|
|
13726
|
+
});
|
|
13602
13727
|
const extracted = extractJsonObject(assistantText);
|
|
13603
|
-
if (!extracted)
|
|
13604
|
-
|
|
13605
|
-
|
|
13606
|
-
|
|
13607
|
-
|
|
13608
|
-
|
|
13609
|
-
|
|
13610
|
-
|
|
13728
|
+
if (!extracted) {
|
|
13729
|
+
record("output_missing");
|
|
13730
|
+
return {
|
|
13731
|
+
output: null,
|
|
13732
|
+
outputCid: null,
|
|
13733
|
+
error: {
|
|
13734
|
+
code: "output_missing",
|
|
13735
|
+
message: "Agent did not emit a parseable JSON object as its final message."
|
|
13736
|
+
}
|
|
13737
|
+
};
|
|
13738
|
+
}
|
|
13611
13739
|
const errors = validateTaskOutput(taskType, extracted);
|
|
13612
13740
|
if (errors.length > 0) {
|
|
13613
13741
|
const details = errors.slice(0, 3).map((error) => `${error.field}: ${error.message}`);
|
|
13614
13742
|
const [firstError] = errors;
|
|
13743
|
+
const code = firstError?.field === "taskType" ? "unknown_task_type" : "output_validation_failed";
|
|
13744
|
+
record(code);
|
|
13615
13745
|
return {
|
|
13616
13746
|
output: null,
|
|
13617
13747
|
outputCid: null,
|
|
13618
13748
|
error: {
|
|
13619
|
-
code
|
|
13749
|
+
code,
|
|
13620
13750
|
message: `Output failed schema validation: ${details.join("; ")}`
|
|
13621
13751
|
}
|
|
13622
13752
|
};
|
|
13623
13753
|
}
|
|
13624
13754
|
try {
|
|
13755
|
+
const outputCid = await computeJsonCid(extracted);
|
|
13756
|
+
record("success");
|
|
13625
13757
|
return {
|
|
13626
13758
|
output: extracted,
|
|
13627
|
-
outputCid
|
|
13759
|
+
outputCid,
|
|
13628
13760
|
error: null
|
|
13629
13761
|
};
|
|
13630
13762
|
} catch (error) {
|
|
13763
|
+
const message = error instanceof Error ? error.message : String(error);
|
|
13764
|
+
record("output_cid_compute_failed");
|
|
13631
13765
|
return {
|
|
13632
13766
|
output: null,
|
|
13633
13767
|
outputCid: null,
|
|
13634
13768
|
error: {
|
|
13635
13769
|
code: "output_cid_compute_failed",
|
|
13636
|
-
message: `Validated output could not be canonicalized: ${
|
|
13770
|
+
message: `Validated output could not be canonicalized: ${message}`
|
|
13637
13771
|
}
|
|
13638
13772
|
};
|
|
13639
13773
|
}
|
|
@@ -13689,6 +13823,99 @@ function extractJsonObject(text) {
|
|
|
13689
13823
|
return null;
|
|
13690
13824
|
}
|
|
13691
13825
|
//#endregion
|
|
13826
|
+
//#region src/runtime/submit-output-tool.ts
|
|
13827
|
+
/**
 * Sentinel thrown when the requested task type has no registered output
 * schema. The executor recognises this specific error class and falls
 * back to the parser path; any other error from `createSubmitOutputTool`
 * is unexpected and must propagate.
 *
 * Instances expose the offending `taskType` and carry the class name in
 * `name` so they can also be identified without `instanceof`.
 */
var UnknownTaskTypeForSubmitToolError = class extends Error {
	/**
	 * @param {string} taskType - Task type that has no submit-output contract.
	 */
	constructor(taskType) {
		super(`createSubmitOutputTool: no output schema registered for task type "${taskType}"`);
		this.taskType = taskType;
		this.name = "UnknownTaskTypeForSubmitToolError";
	}
};
|
|
13840
|
+
/**
 * Create the submit-output tool for one task type, plus accessors for
 * what the tool captured.
 *
 * The tool validates its arguments with `validateTaskOutput`. On failure it
 * records an `output_validation_failed` metric and returns an error result
 * asking the model to call the tool again. On success it stores the
 * arguments and returns a result flagged `terminate: true`.
 *
 * @param {string} taskType - Task type whose submit-output contract is looked up.
 * @param {object} [opts]
 * @param {string} [opts.model] - Forwarded as the `model` metric label.
 * @returns {{ tool: object, getCaptured: () => (object|null), getCallCount: () => number }}
 *   `tool` is whatever `defineTool` returns; `getCaptured` yields the last
 *   validated payload (or `null`); `getCallCount` yields the counter below.
 * @throws {UnknownTaskTypeForSubmitToolError} When `getSubmitOutputContract`
 *   returns nothing for `taskType`.
 */
function createSubmitOutputTool(taskType, opts = {}) {
	const contract = getSubmitOutputContract(taskType);
	if (!contract) throw new UnknownTaskTypeForSubmitToolError(taskType);
	const schema = contract.parametersSchema;
	let captured = null;
	// NOTE: only incremented after a payload passes validation, so this counts
	// successful captures rather than every invocation of the tool.
	let callCount = 0;
	return {
		tool: defineTool({
			name: contract.toolName,
			label: `Submit ${taskType} output`,
			description: contract.description,
			parameters: schema,
			async execute(_id, params) {
				const errors = validateTaskOutput(taskType, params);
				if (errors.length > 0) {
					// Surface at most three validation errors to keep the tool reply short.
					const detailMsg = errors.slice(0, 3).map((err) => `${err.field}: ${err.message}`).join("; ");
					const details = {
						captured: false,
						callCount,
						error: "output_validation_failed"
					};
					recordTaskOutputParseResult({
						taskType,
						model: opts.model,
						code: "output_validation_failed"
					});
					return {
						content: [{
							type: "text",
							text: `Output failed validation: ${detailMsg}. Re-call this tool with a corrected output.`
						}],
						details,
						isError: true
					};
				}
				captured = params;
				callCount += 1;
				return {
					content: [{
						type: "text",
						text: "Output captured. The runtime now has the validated payload; no further action is needed for output reporting."
					}],
					details: {
						captured: true,
						callCount,
						error: null
					},
					// NOTE(review): presumably tells the host agent loop to end the
					// session after this tool call — confirm against the tool API.
					terminate: true
				};
			}
		}),
		getCaptured: () => captured,
		getCallCount: () => callCount
	};
}
|
|
13895
|
+
/**
 * Build the submit-tool wiring for one task attempt. Returns a handle
 * (or `null` if no submit-tool should be registered) plus the
 * `customTools`-shaped array ready to spread into the session config.
 *
 * The catch is **narrowed** to `UnknownTaskTypeForSubmitToolError` —
 * exporters/dependency-API drift would otherwise be silently degraded
 * to parser-only behaviour, which reintroduces the failure mode this
 * change is fixing. Any other error from the factory propagates.
 *
 * @param {string} taskType - Task type forwarded to `createSubmitOutputTool`.
 * @param {object} [opts] - Options forwarded unchanged to `createSubmitOutputTool`.
 * @returns {{ handle: (object|null), tools: object[] }} `tools` holds the
 *   single tool definition when a handle exists and is empty otherwise.
 */
function resolveSubmitTools(taskType, opts = {}) {
	let handle;
	try {
		handle = createSubmitOutputTool(taskType, opts);
	} catch (err) {
		// Only the "no schema for this task type" case degrades to parser-only.
		if (err instanceof UnknownTaskTypeForSubmitToolError) handle = null;
		else throw err;
	}
	return {
		handle,
		tools: handle ? [handle.tool] : []
	};
}
|
|
13918
|
+
//#endregion
|
|
13692
13919
|
//#region src/runtime/execute-pi-task.ts
|
|
13693
13920
|
/**
|
|
13694
13921
|
* executePiTask — run a single Task attempt using pi-coding-agent inside a
|
|
@@ -13834,6 +14061,8 @@ async function executePiTask(claimedTask, reporter, opts) {
|
|
|
13834
14061
|
createEditToolDefinition(mountPath, { operations: createGondolinEditOps(managed.vm, mountPath) }),
|
|
13835
14062
|
createBashToolDefinition(mountPath, { operations: createGondolinBashOps(managed.vm, mountPath) })
|
|
13836
14063
|
];
|
|
14064
|
+
const { handle: submitToolHandle, tools: submitToolDefs } = resolveSubmitTools(task.taskType, { model: opts.model });
|
|
14065
|
+
const submitTools = submitToolDefs;
|
|
13837
14066
|
try {
|
|
13838
14067
|
const moltnetAgent = await connect({ configDir: managed.agentDir });
|
|
13839
14068
|
const moltnetTools = createMoltNetTools({
|
|
@@ -13885,7 +14114,11 @@ async function executePiTask(claimedTask, reporter, opts) {
|
|
|
13885
14114
|
agentDir: piAuthDir,
|
|
13886
14115
|
cwd: mountPath,
|
|
13887
14116
|
model: modelHandle,
|
|
13888
|
-
customTools: [
|
|
14117
|
+
customTools: [
|
|
14118
|
+
...gondolinCustomTools,
|
|
14119
|
+
...moltnetTools,
|
|
14120
|
+
...submitTools
|
|
14121
|
+
],
|
|
13889
14122
|
sessionManager: SessionManager.inMemory(),
|
|
13890
14123
|
resourceLoader
|
|
13891
14124
|
})).session;
|
|
@@ -13962,14 +14195,43 @@ async function executePiTask(claimedTask, reporter, opts) {
|
|
|
13962
14195
|
let parsedOutputCid = null;
|
|
13963
14196
|
let parseError = null;
|
|
13964
14197
|
if (!runError && !llmAbort && !cancelled) {
|
|
13965
|
-
const
|
|
13966
|
-
|
|
13967
|
-
|
|
13968
|
-
|
|
13969
|
-
|
|
13970
|
-
|
|
13971
|
-
|
|
13972
|
-
|
|
14198
|
+
const captured = submitToolHandle?.getCaptured() ?? null;
|
|
14199
|
+
if (captured) try {
|
|
14200
|
+
parsedOutput = captured;
|
|
14201
|
+
parsedOutputCid = await computeJsonCid(captured);
|
|
14202
|
+
recordTaskOutputParseResult({
|
|
14203
|
+
taskType: task.taskType,
|
|
14204
|
+
model: opts.model,
|
|
14205
|
+
code: "captured_via_tool"
|
|
14206
|
+
});
|
|
14207
|
+
} catch (err) {
|
|
14208
|
+
const message = err instanceof Error ? err.message : String(err);
|
|
14209
|
+
parsedOutput = null;
|
|
14210
|
+
parsedOutputCid = null;
|
|
14211
|
+
parseError = {
|
|
14212
|
+
code: "output_cid_compute_failed",
|
|
14213
|
+
message: `Captured submit-tool output could not be canonicalized: ${message}`
|
|
14214
|
+
};
|
|
14215
|
+
recordTaskOutputParseResult({
|
|
14216
|
+
taskType: task.taskType,
|
|
14217
|
+
model: opts.model,
|
|
14218
|
+
code: "output_cid_compute_failed"
|
|
14219
|
+
});
|
|
14220
|
+
await emit("error", {
|
|
14221
|
+
message: parseError.message,
|
|
14222
|
+
phase: "output_validation"
|
|
14223
|
+
});
|
|
14224
|
+
}
|
|
14225
|
+
else {
|
|
14226
|
+
const parsed = await parseStructuredTaskOutput(assistantText, task.taskType, { model: opts.model });
|
|
14227
|
+
parsedOutput = parsed.output;
|
|
14228
|
+
parsedOutputCid = parsed.outputCid;
|
|
14229
|
+
parseError = parsed.error;
|
|
14230
|
+
if (parseError) await emit("error", {
|
|
14231
|
+
message: parseError.message,
|
|
14232
|
+
phase: "output_validation"
|
|
14233
|
+
});
|
|
14234
|
+
}
|
|
13973
14235
|
}
|
|
13974
14236
|
if (cancelled) return {
|
|
13975
14237
|
taskId: task.id,
|
|
@@ -14365,4 +14627,4 @@ function moltnetExtension(pi) {
|
|
|
14365
14627
|
registerMoltnetReflectCommand(pi, state);
|
|
14366
14628
|
}
|
|
14367
14629
|
//#endregion
|
|
14368
|
-
export { HOST_EXEC_DEFAULT_BASE_ENV, activateAgentEnv,
|
|
14630
|
+
export { HOST_EXEC_DEFAULT_BASE_ENV, activateAgentEnv, createGondolinBashOps, createGondolinEditOps, createGondolinReadOps, createGondolinWriteOps, createMoltNetTools, createPiOtelExtension, createPiTaskExecutor, moltnetExtension as default, ensureSnapshot, executePiTask, findMainWorktree, loadCredentials, resumeVm, toGuestPath };
|