@themoltnet/pi-extension 0.14.0 → 0.15.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.d.ts +132 -0
- package/dist/index.js +899 -88
- package/package.json +3 -3
package/dist/index.d.ts
CHANGED
|
@@ -1,7 +1,11 @@
|
|
|
1
|
+
import { AgentSession } from '@earendil-works/pi-coding-agent';
|
|
2
|
+
import { Api } from '@earendil-works/pi-ai';
|
|
1
3
|
import { BashOperations } from '@earendil-works/pi-coding-agent';
|
|
2
4
|
import { connect } from '@themoltnet/sdk';
|
|
3
5
|
import { EditOperations } from '@earendil-works/pi-coding-agent';
|
|
4
6
|
import { ExtensionAPI } from '@earendil-works/pi-coding-agent';
|
|
7
|
+
import { LoadSkillsResult } from '@earendil-works/pi-coding-agent';
|
|
8
|
+
import { Model } from '@earendil-works/pi-ai';
|
|
5
9
|
import { ReadOperations } from '@earendil-works/pi-coding-agent';
|
|
6
10
|
import { Skill } from '@earendil-works/pi-coding-agent';
|
|
7
11
|
import { Static } from '@sinclair/typebox';
|
|
@@ -27,6 +31,33 @@ import { WriteOperations } from '@earendil-works/pi-coding-agent';
|
|
|
27
31
|
*/
|
|
28
32
|
export declare function activateAgentEnv(agentEnv: Record<string, string | undefined>, repoRoot: string): void;
|
|
29
33
|
|
|
34
|
+
/**
 * Creates an `AgentSession` that lives purely in memory.
 *
 * Only construction happens here: the caller must eventually call
 * `session.prompt(...)` and is also responsible for tearing the session
 * down. No other lifecycle management is performed by this helper.
 */
export declare function buildAgentSession(args: BuildAgentSessionArgs): Promise<AgentSession>;
|
|
40
|
+
|
|
41
|
+
declare interface BuildAgentSessionArgs {
  /** Directory on the host that the VM sees mounted at /workspace. */
  mountPath: string;
  /** Auth directory for pi, resolved from `PI_CODING_AGENT_DIR` or `~/.pi/agent`. */
  piAuthDir: string;
  /** Already-resolved pi model handle (provider + model id). */
  modelHandle: Model<Api>;
  /** Ready-made customTools array; the caller composes the Gondolin, MoltNet and submit tools. */
  customTools: ToolDefinition[];
  /**
   * Fragments appended to the system prompt after pi's defaults. The parent
   * passes the runtime instructor; subagents pass their narrower variant.
   */
  appendSystemPrompt: string[];
  /** Skills advertised in `<available_skills>`. Defaults to an empty list. */
  skillsOverride?: () => LoadSkillsResult;
  /** Attributes merged onto every OTel span emitted by the session. */
  otelSpanAttrs: Record<string, string | number | boolean>;
  /** Agent name reported as `gen_ai.agent.name` on the root span. */
  agentName: string;
}
|
|
60
|
+
|
|
30
61
|
declare interface ClaimedTask {
|
|
31
62
|
/** The claimed task payload itself. */
|
|
32
63
|
task: Task;
|
|
@@ -83,6 +114,73 @@ export declare function createPiOtelExtension(options?: PiOtelOptions): (pi: Ext
|
|
|
83
114
|
*/
|
|
84
115
|
export declare function createPiTaskExecutor(opts: ExecutePiTaskOptions): (claimedTask: ClaimedTask, reporter: TaskReporter) => Promise<TaskOutput>;
|
|
85
116
|
|
|
117
|
+
/**
 * Creates the subagent custom tool for a parent session.
 *
 * The returned handle exposes the tool's call counter, which lets
 * executors emit summary telemetry once the parent terminates.
 */
export declare function createSubagentTool(args: CreateSubagentToolArgs): SubagentToolHandle;
|
|
123
|
+
|
|
124
|
+
export declare interface CreateSubagentToolArgs {
  /** Directory on the host that the VM sees mounted at /workspace. */
  mountPath: string;
  /** The pi auth directory as resolved by the parent. */
  piAuthDir: string;
  /** Resolved pi model handle; subagents share the parent's. */
  modelHandle: Model<Api>;
  /** Agent name used for telemetry. */
  agentName: string;
  /**
   * Custom tools handed down to every subagent (Gondolin-routed
   * Read/Write/Edit/Bash, the moltnet_* tools, and so on).
   *
   * Filtering is the caller's job: the list MUST NOT contain the parent's
   * submit-output tool, the parent's `subagent` tool, or any other
   * parent-only artefact. Each subagent appends its own submit tool.
   */
  inheritedCustomTools: ToolDefinition[];
  /**
   * The parent's runtime instructor, passed through verbatim.
   *
   * Subagents prepend it to their own short "you are a subagent" preamble,
   * so the same invariants (gh auth, diary discipline, accountable commits)
   * hold whenever a subagent performs those actions. Whether it should
   * perform them is dictated by the parent's task description.
   */
  parentRuntimeInstructor: string;
  parentTaskId: string;
  parentTaskType: string;
  parentAttemptN: number;
  /**
   * Cancel signal of the parent task.
   *
   * When the daemon cancels the parent (operator cancel, or the task-level
   * `runningTimeoutSec` expiring), `session.abort()` is invoked on every
   * in-flight subagent's inner session so that it tears down promptly
   * rather than running until its own LLM call resolves. This mirrors the
   * `wireSessionAbort` pattern already used by the parent session.
   *
   * It is optional only so that the test seam may omit it; production
   * callers (executePiTask) pass `reporter.cancelSignal`.
   */
  parentCancelSignal?: AbortSignal;
  /**
   * Fallback timeout applied to each call.
   *
   * Guards against an inner session that ignores `abort()` for whatever
   * reason (stuck LLM provider, tool call hanging on I/O, etc.). Once the
   * timeout fires, `session.abort()` is invoked and the tool returns
   * `isError: true` with a `subagent_timed_out` reason that the parent LLM
   * can recover from.
   *
   * Defaults to 5 minutes. `0` disables the timeout, leaving
   * `parentCancelSignal` as the only stop mechanism. Negative values are
   * treated as the default.
   */
  timeoutMs?: number;
  /**
   * Test seam.
   *
   * Production callers leave this undefined and receive
   * `buildAgentSession` from the factory module. Tests inject a mock that
   * returns a stub session implementing only `prompt()`, which exercises
   * the tool's logic without booting a VM.
   */
  buildAgentSession?: (args: BuildAgentSessionArgs) => Promise<AgentSession>;
}
|
|
183
|
+
|
|
86
184
|
/**
|
|
87
185
|
* Ensure a cached snapshot exists, building one if needed.
|
|
88
186
|
* Returns the absolute path to the qcow2 checkpoint file.
|
|
@@ -279,6 +377,17 @@ export declare interface SandboxConfig {
|
|
|
279
377
|
/** Overlay disk size (default '3G'). */
|
|
280
378
|
overlaySize?: string;
|
|
281
379
|
};
|
|
380
|
+
/** Shell commands to run every VM resume, after platform setup
|
|
381
|
+
* (TLS, DNS, git safe.directory, tmpfs node_modules) and before
|
|
382
|
+
* the agent session starts. Use for per-session bootstrap that
|
|
383
|
+
* doesn't belong baked into the snapshot.
|
|
384
|
+
*
|
|
385
|
+
* Not included in the snapshot cache key — changes here apply on
|
|
386
|
+
* every resume without triggering a snapshot rebuild. Each command
|
|
387
|
+
* runs in a fresh shell with `set -eu` and `set -o pipefail`; a
|
|
388
|
+
* non-zero exit (including from any segment of a pipeline) aborts
|
|
389
|
+
* resume with the failing command's stderr/stdout tail. */
|
|
390
|
+
resumeCommands?: string[];
|
|
282
391
|
/** VFS shadow settings — hide host paths from the guest. */
|
|
283
392
|
vfs?: {
|
|
284
393
|
/** Paths (relative to workspace root) to shadow from the host mount. */
|
|
@@ -300,6 +409,29 @@ export declare interface SandboxConfig {
|
|
|
300
409
|
/** Extract snapshot-specific config for backwards compat with ensureSnapshot. */
|
|
301
410
|
export declare type SnapshotConfig = NonNullable<SandboxConfig['snapshot']>;
|
|
302
411
|
|
|
412
|
+
export declare interface SubagentToolHandle {
  /** The ToolDefinition to register on the parent session through `customTools`. */
  readonly tool: ToolDefinition;
  /** Returns the number of times the parent LLM has called this tool. */
  getCallCount: () => number;
}
|
|
418
|
+
|
|
419
|
+
/**
 * Shape of the parameters the parent LLM supplies when it calls the
 * subagent tool.
 *
 * - `task`: natural-language instructions for the subagent, authored by
 *   the parent for each call. Must be non-empty.
 * - `output_schema`: name of a registered SubagentOutputContract. The name
 *   is resolved at call time, and an unknown name is an error.
 */
export declare const SubagentToolParameters: TObject<{
  task: TString;
  output_schema: TString;
}>;

/** Static type derived from the `SubagentToolParameters` schema. */
export declare type SubagentToolParameters = Static<typeof SubagentToolParameters>;
|
|
434
|
+
|
|
303
435
|
/**
|
|
304
436
|
* The Task promise body.
|
|
305
437
|
*
|
package/dist/index.js
CHANGED
|
@@ -2515,11 +2515,12 @@ function createCryptoNamespace(context, signingRequests) {
|
|
|
2515
2515
|
function createDiariesNamespace(context) {
|
|
2516
2516
|
const { client, auth } = context;
|
|
2517
2517
|
return {
|
|
2518
|
-
async list(query) {
|
|
2518
|
+
async list(query, headers) {
|
|
2519
2519
|
return unwrapResult(await listDiaries({
|
|
2520
2520
|
client,
|
|
2521
2521
|
auth,
|
|
2522
|
-
query
|
|
2522
|
+
query,
|
|
2523
|
+
headers
|
|
2523
2524
|
}));
|
|
2524
2525
|
},
|
|
2525
2526
|
async create(body, headers) {
|
|
@@ -8177,6 +8178,27 @@ var BASE_ALLOWED_HOSTS = [
|
|
|
8177
8178
|
"*.googlesource.com"
|
|
8178
8179
|
];
|
|
8179
8180
|
/**
|
|
8181
|
+
* Run a shell command in the guest and throw if it fails. Mirror of
|
|
8182
|
+
* `run()` in `snapshot.ts` for the resume-side hook chain — every
|
|
8183
|
+
* setup step is essential to a healthy session, so a silent non-zero
|
|
8184
|
+
* exit (e.g. a mount that fails into the FUSE write path, or a
|
|
8185
|
+
* consumer-provided resume command that fails to install pnpm) must
|
|
8186
|
+
* surface immediately rather than fall through to cryptic agent
|
|
8187
|
+
* errors later.
|
|
8188
|
+
*/
|
|
8189
|
+
async function vmRun(vm, label, command) {
|
|
8190
|
+
const wrapped = `set -eu\nset -o pipefail\n${command}`;
|
|
8191
|
+
const r = await vm.exec([
|
|
8192
|
+
"sh",
|
|
8193
|
+
"-c",
|
|
8194
|
+
wrapped
|
|
8195
|
+
]);
|
|
8196
|
+
if (r.exitCode !== 0) {
|
|
8197
|
+
const tail = [r.stderr, r.stdout].filter(Boolean).join("\n").slice(-800);
|
|
8198
|
+
throw new Error(`resume step "${label}" failed (exit ${r.exitCode}):\n${tail}`);
|
|
8199
|
+
}
|
|
8200
|
+
}
|
|
8201
|
+
/**
|
|
8180
8202
|
* Resume a VM from a checkpoint, inject credentials, configure egress +
|
|
8181
8203
|
* TLS. Returns the managed VM handle.
|
|
8182
8204
|
*/
|
|
@@ -8236,8 +8258,9 @@ async function resumeVm(config) {
|
|
|
8236
8258
|
update-ca-certificates 2>/dev/null
|
|
8237
8259
|
cat /etc/gondolin/mitm/ca.crt >> /etc/ssl/certs/ca-certificates.crt
|
|
8238
8260
|
'`);
|
|
8239
|
-
await vm
|
|
8240
|
-
|
|
8261
|
+
await vmRun(vm, "DNS resolvers", `printf 'nameserver 8.8.8.8\\nnameserver 1.1.1.1\\n' > /etc/resolv.conf`);
|
|
8262
|
+
await vmRun(vm, "git safe.directory", `git config --system --add safe.directory '*'`);
|
|
8263
|
+
for (const [i, cmd] of (config.sandboxConfig?.resumeCommands ?? []).entries()) await vmRun(vm, `resumeCommands[${i}]`, cmd);
|
|
8241
8264
|
const vmSshDir = `${vmAgentDir}/ssh`;
|
|
8242
8265
|
await vm.exec(`mkdir -p ${vmAgentDir}/ssh /home/agent/.pi/agent`);
|
|
8243
8266
|
if (creds.piAuthJson !== null) await vm.fs.writeFile("/home/agent/.pi/agent/auth.json", creds.piAuthJson, { mode: 384 });
|
|
@@ -8580,59 +8603,37 @@ function extractUsage(message) {
|
|
|
8580
8603
|
};
|
|
8581
8604
|
}
|
|
8582
8605
|
//#endregion
|
|
8583
|
-
//#region
|
|
8584
|
-
var
|
|
8606
|
+
//#region src/runtime/agent-session-factory.ts
|
|
8607
|
+
// Default skills loader: reports no skills and no diagnostics.
// A fresh object is built on every call.
var NO_SKILLS = () => {
  return { skills: [], diagnostics: [] };
};
|
|
8585
8611
|
/**
|
|
8586
|
-
*
|
|
8587
|
-
*
|
|
8588
|
-
*
|
|
8589
|
-
* built prompt.
|
|
8590
|
-
*
|
|
8591
|
-
* Per-binding semantics (V1):
|
|
8592
|
-
* - `skill` → `deliver.skill({ slug, content })` once per ref.
|
|
8593
|
-
* Slug collisions on distinct contents are
|
|
8594
|
-
* refused loudly.
|
|
8595
|
-
* - `prompt_prefix` → content appended to `systemPromptPrefix` with
|
|
8596
|
-
* the canonical `\n\n---\n\n` separator (in
|
|
8597
|
-
* declared order).
|
|
8598
|
-
* - `user_inline` → content appended to `userInlineSuffix` in
|
|
8599
|
-
* declared order, same separator.
|
|
8600
|
-
*
|
|
8601
|
-
* No fetching, no hashing — bytes are inlined in `ContextRef.content`,
|
|
8602
|
-
* and the task's `inputCid` already pins the entire input. The imposer
|
|
8603
|
-
* chose these bytes; the resolver just dispatches them.
|
|
8604
|
-
*
|
|
8605
|
-
* The function is pure with respect to its arguments: file writes are
|
|
8606
|
-
* confined to the injected `deliver` callback, which makes the
|
|
8607
|
-
* resolver trivial to test.
|
|
8612
|
+
* Construct an in-memory `AgentSession`. The caller is responsible for
|
|
8613
|
+
* eventually invoking `session.prompt(...)` and for tearing down — the
|
|
8614
|
+
* helper does no lifecycle management beyond construction.
|
|
8608
8615
|
*/
|
|
8609
|
-
async function
|
|
8610
|
-
const
|
|
8611
|
-
|
|
8612
|
-
|
|
8613
|
-
|
|
8614
|
-
|
|
8615
|
-
|
|
8616
|
-
|
|
8617
|
-
|
|
8618
|
-
|
|
8619
|
-
|
|
8620
|
-
|
|
8621
|
-
|
|
8622
|
-
|
|
8623
|
-
|
|
8624
|
-
|
|
8625
|
-
|
|
8626
|
-
|
|
8627
|
-
|
|
8628
|
-
|
|
8629
|
-
|
|
8630
|
-
}
|
|
8631
|
-
return {
|
|
8632
|
-
injected,
|
|
8633
|
-
systemPromptPrefix: promptParts.join(PROMPT_SEPARATOR),
|
|
8634
|
-
userInlineSuffix: userParts.join(PROMPT_SEPARATOR)
|
|
8635
|
-
};
|
|
8616
|
+
async function buildAgentSession(args) {
  // OTel extension carrying the agent name and the caller's span attributes.
  const otelExtension = createPiOtelExtension({
    agentName: args.agentName,
    spanAttributes: args.otelSpanAttrs
  });

  // The loader falls back to the empty skills list when no override is given.
  const loader = new DefaultResourceLoader({
    cwd: args.mountPath,
    agentDir: args.piAuthDir,
    extensionFactories: [otelExtension],
    appendSystemPrompt: args.appendSystemPrompt,
    skillsOverride: args.skillsOverride ?? NO_SKILLS
  });
  await loader.reload();

  // The session is backed by an in-memory session manager.
  const created = await createAgentSession({
    agentDir: args.piAuthDir,
    cwd: args.mountPath,
    model: args.modelHandle,
    customTools: args.customTools,
    sessionManager: SessionManager.inMemory(),
    resourceLoader: loader
  });
  return created.session;
}
|
|
8637
8638
|
//#endregion
|
|
8638
8639
|
//#region ../tasks/src/formats.ts
|
|
@@ -8851,7 +8852,7 @@ unchanged" is.
|
|
|
8851
8852
|
* (server-side schema check). Self-assessment is a truthful self-rating,
|
|
8852
8853
|
* NOT enforcement — `verification.passed=false` does not block /complete
|
|
8853
8854
|
* and does not affect `acceptedAttemptN`. See
|
|
8854
|
-
* `docs/agent-runtime.md` for the full producer/judge flow.
|
|
8855
|
+
* `docs/understand/agent-runtime.md` for the full producer/judge flow.
|
|
8855
8856
|
*
|
|
8856
8857
|
* **Binding evaluation** (judgment tasks: `assess_brief`, `judge_pack`).
|
|
8857
8858
|
* A separate task whose output IS the application of `successCriteria` to
|
|
@@ -9008,6 +9009,39 @@ var AssessBriefOutput = Type$1.Object({
|
|
|
9008
9009
|
$id: "AssessBriefOutput",
|
|
9009
9010
|
additionalProperties: false
|
|
9010
9011
|
});
|
|
9012
|
+
/**
|
|
9013
|
+
* Async preflight (#1096):
|
|
9014
|
+
* - `targetTaskId` resolves to a real task the caller can see.
|
|
9015
|
+
* - The target is a `fulfill_brief` (you cannot grade an arbitrary
|
|
9016
|
+
* task type as if it were a brief fulfillment).
|
|
9017
|
+
* - The target is `completed` with an accepted attempt — grading
|
|
9018
|
+
* an in-flight or failed task would either race or grade nothing.
|
|
9019
|
+
*
|
|
9020
|
+
* Agent-distinctness ("assessor ≠ producer") is a runtime / auth-
|
|
9021
|
+
* layer concern and intentionally NOT checked here. It belongs in
|
|
9022
|
+
* an auth-aware claim-time check.
|
|
9023
|
+
*/
|
|
9024
|
+
async function validateAssessBriefInputAsync(input, ctx) {
|
|
9025
|
+
const { targetTaskId } = input;
|
|
9026
|
+
const errors = [];
|
|
9027
|
+
const target = await ctx.resolveTask(targetTaskId);
|
|
9028
|
+
if (!target) {
|
|
9029
|
+
errors.push({
|
|
9030
|
+
field: "targetTaskId",
|
|
9031
|
+
message: `targetTaskId ${targetTaskId} does not resolve to a task you can read`
|
|
9032
|
+
});
|
|
9033
|
+
return errors;
|
|
9034
|
+
}
|
|
9035
|
+
if (target.taskType !== "fulfill_brief") errors.push({
|
|
9036
|
+
field: "targetTaskId",
|
|
9037
|
+
message: `targetTaskId ${targetTaskId} is a ${target.taskType}, not a fulfill_brief`
|
|
9038
|
+
});
|
|
9039
|
+
if (target.status !== "completed" || target.acceptedAttemptN === null) errors.push({
|
|
9040
|
+
field: "targetTaskId",
|
|
9041
|
+
message: `targetTaskId ${targetTaskId} is not completed with an accepted attempt (status=${target.status}, acceptedAttemptN=${target.acceptedAttemptN})`
|
|
9042
|
+
});
|
|
9043
|
+
return errors;
|
|
9044
|
+
}
|
|
9011
9045
|
//#endregion
|
|
9012
9046
|
//#region ../tasks/src/task-types/curate-pack.ts
|
|
9013
9047
|
/**
|
|
@@ -9206,6 +9240,311 @@ function validateJudgePackOutput(output) {
|
|
|
9206
9240
|
}
|
|
9207
9241
|
return null;
|
|
9208
9242
|
}
|
|
9243
|
+
/**
|
|
9244
|
+
* Async preflight (#1096):
|
|
9245
|
+
* - `renderedPackId` resolves to a rendered_packs row.
|
|
9246
|
+
* - `sourcePackId` resolves to a context_packs row.
|
|
9247
|
+
* - The rendered pack actually came from the claimed source pack —
|
|
9248
|
+
* `renderedPack.sourcePackId === input.sourcePackId`. Without
|
|
9249
|
+
* this check a judge can be tricked into grading rendering A as
|
|
9250
|
+
* if it came from source B.
|
|
9251
|
+
*/
|
|
9252
|
+
async function validateJudgePackInputAsync(input, ctx) {
|
|
9253
|
+
const { renderedPackId, sourcePackId } = input;
|
|
9254
|
+
const errors = [];
|
|
9255
|
+
const [rendered, source] = await Promise.all([ctx.resolveRenderedPack(renderedPackId), ctx.resolveContextPack(sourcePackId)]);
|
|
9256
|
+
if (!rendered) errors.push({
|
|
9257
|
+
field: "renderedPackId",
|
|
9258
|
+
message: `renderedPackId ${renderedPackId} does not resolve to a rendered pack you can read`
|
|
9259
|
+
});
|
|
9260
|
+
if (!source) errors.push({
|
|
9261
|
+
field: "sourcePackId",
|
|
9262
|
+
message: `sourcePackId ${sourcePackId} does not resolve to a context pack you can read`
|
|
9263
|
+
});
|
|
9264
|
+
if (rendered && source && rendered.sourcePackId !== source.id) errors.push({
|
|
9265
|
+
field: "sourcePackId",
|
|
9266
|
+
message: `renderedPack ${renderedPackId} was produced from source ${rendered.sourcePackId}, not from sourcePackId=${sourcePackId}`
|
|
9267
|
+
});
|
|
9268
|
+
return errors;
|
|
9269
|
+
}
|
|
9270
|
+
//#endregion
|
|
9271
|
+
//#region ../tasks/src/task-types/judge-eval-variant.ts
|
|
9272
|
+
/**
|
|
9273
|
+
* `judge_eval_variant` — score N variants of a `run_eval` scenario
|
|
9274
|
+
* against a single rubric, in one pass, with per-variant subagent
|
|
9275
|
+
* isolation.
|
|
9276
|
+
*
|
|
9277
|
+
* output_kind: judgment
|
|
9278
|
+
* criteria: required (`successCriteria.rubric` — same envelope shape as
|
|
9279
|
+
* `judge_pack` / `assess_brief`)
|
|
9280
|
+
* references: not required at the input layer — `runTaskIds` already
|
|
9281
|
+
* pin the targets being graded.
|
|
9282
|
+
*
|
|
9283
|
+
* Slice 2 of #943. The parent task carries the rubric and the list of
|
|
9284
|
+
* variant `run_eval` task ids. The pi executor registers the generic
|
|
9285
|
+
* `subagent` custom tool (#1087), and the parent LLM calls
|
|
9286
|
+
* `subagent({ task, output_schema: 'judge_eval_variant_result' })` once
|
|
9287
|
+
* per variant — each child session has fresh context, fetches the
|
|
9288
|
+
* variant's accepted attempt output via `moltnet_get_task` /
|
|
9289
|
+
* `moltnet_list_task_attempts`, and grades against the rubric.
|
|
9290
|
+
*
|
|
9291
|
+
* Reuses `JudgePackScore` from `judge_pack` for per-criterion scoring
|
|
9292
|
+
* (Lane 1 binary via `llm_checklist`, Lane 2 graded via `llm_score`,
|
|
9293
|
+
* deterministic_*) — the score shape is the same across judgment
|
|
9294
|
+
* tasks; only the wrapping (per-variant grouping + deltas) differs.
|
|
9295
|
+
*
|
|
9296
|
+
* Cross-task input invariants — "all targets share the same
|
|
9297
|
+
* correlation_id, all are `run_eval`, all are completed with an
|
|
9298
|
+
* accepted attempt, all share byte-identical `input.successCriteria`"
|
|
9299
|
+
* — REQUIRE async DB lookups and live in `validateInputAsync` below,
|
|
9300
|
+
* which the task service runs at create time (#1096 wiring). The
|
|
9301
|
+
* TypeBox layer here only enforces shape: UUID format,
|
|
9302
|
+
* minItems/maxItems, rubric presence + weight invariant.
|
|
9303
|
+
*/
|
|
9304
|
+
/** Task-type name for `judge_eval_variant` tasks. */
var JUDGE_EVAL_VARIANT_TYPE = "judge_eval_variant";

/**
 * Shape-only schema for `judge_eval_variant` input. Unknown properties
 * are rejected.
 */
var JudgeEvalVariantInput = Type$1.Object(
  {
    // Between 2 and 10 UUID-formatted task ids.
    runTaskIds: Type$1.Array(Type$1.String({ format: "uuid" }), { minItems: 2, maxItems: 10 }),
    successCriteria: SuccessCriteria
  },
  { $id: "JudgeEvalVariantInput", additionalProperties: false }
);
|
|
9315
|
+
/**
 * Grading for a single variant.
 *
 * `scores[]` reuses `JudgePackScore` rather than re-declaring it, so the
 * shape is mode-aware in the same way: binary via `llm_checklist`, graded
 * via `llm_score`, deterministic_*.
 *
 * This schema doubles as the **subagent output contract**: the parent's
 * `subagent` tool resolves the contract name `judge_eval_variant_result`
 * to it. See `agent-runtime`'s subagent contract registry.
 */
var JudgeEvalVariantResult = Type$1.Object(
  {
    runTaskId: Type$1.String({ format: "uuid" }),
    // 1-64 characters; the pattern rejects any label containing " - ".
    variantLabel: Type$1.String({ minLength: 1, maxLength: 64, pattern: "^(?!.* - ).*$" }),
    scores: Type$1.Array(JudgePackScore, { minItems: 1 }),
    composite: Type$1.Number({ minimum: 0, maximum: 1 }),
    verdict: Type$1.String({ minLength: 1 })
  },
  { $id: "JudgeEvalVariantResult", additionalProperties: false }
);
|
|
9341
|
+
/**
 * Output schema for `judge_eval_variant`: at least two per-variant
 * results, plus optional deltas and judge model, and a required
 * traceparent. Unknown properties are rejected.
 */
var JudgeEvalVariantOutput = Type$1.Object(
  {
    results: Type$1.Array(JudgeEvalVariantResult, { minItems: 2 }),
    // Optional string-keyed map of numbers bounded to [-1, 1].
    deltas: Type$1.Optional(
      Type$1.Record(Type$1.String(), Type$1.Number({ minimum: -1, maximum: 1 }))
    ),
    judgeModel: Type$1.Optional(Type$1.String({ minLength: 1 })),
    traceparent: Type$1.String({ minLength: 1 })
  },
  { $id: "JudgeEvalVariantOutput", additionalProperties: false }
);
|
|
9353
|
+
/**
 * Synchronous input checks that go beyond the TypeBox shape.
 *
 * The rubric has to be present (the schema already requires it, but the
 * rubric body carries its own per-criterion weight invariant), and its
 * weights must sum to 1; that last check is delegated to
 * `validateRubricWeights`.
 *
 * Cross-task invariants (every target is a `run_eval`, every target is
 * completed, shared `correlation_id`, byte-identical
 * `input.successCriteria`) are NOT covered here. They need async DB
 * lookups against `runTaskIds` and live in
 * `validateJudgeEvalVariantInputAsync`, which the task service invokes
 * at create time (#1096).
 *
 * Returns an error string when `successCriteria` or its rubric is
 * missing; otherwise returns whatever `validateRubricWeights` returns.
 */
function validateJudgeEvalVariantInput(input) {
  const criteria = input.successCriteria;
  if (!criteria) {
    return "successCriteria is required for judge_eval_variant";
  }
  const { rubric } = criteria;
  if (!rubric) {
    return "successCriteria.rubric is required for judge_eval_variant";
  }
  return validateRubricWeights(rubric);
}
|
|
9371
|
+
/**
|
|
9372
|
+
* Output cross-field invariants the schema cannot express:
|
|
9373
|
+
*
|
|
9374
|
+
* 1. `results.length === input.runTaskIds.length` — every variant
|
|
9375
|
+
* the imposer asked for must be graded. Partial grading
|
|
9376
|
+
* invalidates cross-variant comparison; fail the whole task
|
|
9377
|
+
* rather than silently report a subset.
|
|
9378
|
+
*
|
|
9379
|
+
* 2. `results[i].runTaskId === input.runTaskIds[i]` — order is
|
|
9380
|
+
* load-bearing for downstream consumers (e.g. deltas keyed by
|
|
9381
|
+
* adjacent pairs). Mismatch is an LLM bug; reject loudly.
|
|
9382
|
+
*
|
|
9383
|
+
* 3. Each `result.scores` follows the same `llm_checklist` rule
|
|
9384
|
+
* `judge_pack` enforces (#999): if a score has an `assertions`
|
|
9385
|
+
* array, the numeric score MUST be `1` iff every assertion
|
|
9386
|
+
* passes. Inconsistent payloads pollute attestations.
|
|
9387
|
+
*
|
|
9388
|
+
* 4. Each `result.composite` MUST equal the rubric-weighted sum
|
|
9389
|
+
* `Σ(weight_j × scores[j].score)`. The parent (and any subagent
|
|
9390
|
+
* it delegated to) is supposed to compute this; surfacing a
|
|
9391
|
+
* drift here catches LLMs that hand-wave the arithmetic.
|
|
9392
|
+
*
|
|
9393
|
+
* 5. Optional `deltas` keys MUST be of the form `"A - B"` where
|
|
9394
|
+
* both `A` and `B` are variantLabels present in `results`.
|
|
9395
|
+
* Values are not range-checked (any float in [-1, 1] is
|
|
9396
|
+
* arithmetically possible).
|
|
9397
|
+
*/
|
|
9398
|
+
function validateJudgeEvalVariantOutput(output, input) {
|
|
9399
|
+
const out = output;
|
|
9400
|
+
const inp = input;
|
|
9401
|
+
if (inp) {
|
|
9402
|
+
if (out.results.length !== inp.runTaskIds.length) return `results.length (${out.results.length}) does not match input.runTaskIds.length (${inp.runTaskIds.length}). Every variant must be graded; partial grading is rejected.`;
|
|
9403
|
+
for (let i = 0; i < out.results.length; i++) if (out.results[i].runTaskId !== inp.runTaskIds[i]) return `results[${i}].runTaskId (${out.results[i].runTaskId}) does not match input.runTaskIds[${i}] (${inp.runTaskIds[i]}). Order must align with input for downstream delta computation.`;
|
|
9404
|
+
}
|
|
9405
|
+
for (let r = 0; r < out.results.length; r++) {
|
|
9406
|
+
const result = out.results[r];
|
|
9407
|
+
for (let s = 0; s < result.scores.length; s++) {
|
|
9408
|
+
const sc = result.scores[s];
|
|
9409
|
+
if (!sc.assertions) continue;
|
|
9410
|
+
const allPassed = sc.assertions.every((a) => a.passed);
|
|
9411
|
+
const expected = allPassed ? 1 : 0;
|
|
9412
|
+
if (sc.score !== expected) return `results[${r}].scores[${s}] (criterionId="${sc.criterionId}"): assertions ${allPassed ? "all pass" : "have at least one fail"} but score=${sc.score}. Score must be derived: 1 iff every assertion passes, else 0 (#999 llm_checklist rule).`;
|
|
9413
|
+
}
|
|
9414
|
+
}
|
|
9415
|
+
if (inp?.successCriteria?.rubric) {
|
|
9416
|
+
const criteria = inp.successCriteria.rubric.criteria;
|
|
9417
|
+
const weightById = new Map(criteria.map((c) => [c.id, c.weight]));
|
|
9418
|
+
for (let r = 0; r < out.results.length; r++) {
|
|
9419
|
+
const result = out.results[r];
|
|
9420
|
+
let sum = 0;
|
|
9421
|
+
for (const sc of result.scores) {
|
|
9422
|
+
const w = weightById.get(sc.criterionId);
|
|
9423
|
+
if (w === void 0) return `results[${r}].scores: criterionId "${sc.criterionId}" is not in the input rubric (known: ${Array.from(weightById.keys()).join(", ")}). Score every rubric criterion exactly once; do not invent new ids.`;
|
|
9424
|
+
sum += w * sc.score;
|
|
9425
|
+
}
|
|
9426
|
+
if (Math.abs(sum - result.composite) > .001) return `results[${r}].composite (${result.composite}) does not match Σ(weight × score) (${sum.toFixed(6)}). Composite must be the rubric-weighted sum of per-criterion scores (drift > 0.001).`;
|
|
9427
|
+
}
|
|
9428
|
+
}
|
|
9429
|
+
if (out.deltas) {
|
|
9430
|
+
const labels = new Set(out.results.map((r) => r.variantLabel));
|
|
9431
|
+
for (const key of Object.keys(out.deltas)) {
|
|
9432
|
+
const m = /^(.+?) - (.+)$/.exec(key);
|
|
9433
|
+
if (!m) return `deltas key "${key}" is not of the form "<variantLabel-A> - <variantLabel-B>". Use a single space-hyphen-space separator between labels.`;
|
|
9434
|
+
const [, a, b] = m;
|
|
9435
|
+
if (!labels.has(a) || !labels.has(b)) return `deltas key "${key}" references variantLabel(s) not present in results: ${!labels.has(a) ? `"${a}" missing` : ""}${!labels.has(a) && !labels.has(b) ? ", " : ""}${!labels.has(b) ? `"${b}" missing` : ""}`;
|
|
9436
|
+
}
|
|
9437
|
+
}
|
|
9438
|
+
return null;
|
|
9439
|
+
}
|
|
9440
|
+
/**
 * Local stable-stringify used for cross-variant `successCriteria`
 * byte-equality.
 *
 * Object keys are sorted recursively; arrays keep their order
 * (intentional: rubric criteria order is semantically meaningful).
 * Mirrors the canonical-JSON shape `crypto-service` uses for CIDs,
 * without taking on a crypto-service dependency just for this
 * comparison.
 *
 * Values that `JSON.stringify` cannot represent (`undefined`, functions,
 * symbols) are treated the way JSON treats them: object members holding
 * such a value are omitted, and array slots (including holes) become
 * `null`. This keeps `{ a: undefined }` and `{}` equal, as they are once
 * serialized to JSON.
 *
 * Returns the canonical string, or `undefined` when the top-level value
 * itself has no JSON representation (same as `JSON.stringify`).
 */
function stableStringify(value) {
  if (value === null || typeof value !== "object") return JSON.stringify(value);
  if (Array.isArray(value)) {
    // Array.from visits holes as undefined, so sparse slots also become null.
    return "[" + Array.from(value, (item) => stableStringify(item) ?? "null").join(",") + "]";
  }
  const obj = value;
  const members = [];
  for (const k of Object.keys(obj).sort()) {
    const encoded = stableStringify(obj[k]);
    // Skip members with no JSON representation instead of emitting `"k":undefined`.
    if (encoded !== void 0) members.push(JSON.stringify(k) + ":" + encoded);
  }
  return "{" + members.join(",") + "}";
}
|
|
9453
|
+
/**
 * Asynchronous preflight for `judge_eval_variant` input (#1096 + #943).
 *
 * Checks performed, in order:
 *
 *   1. Each `runTaskIds[i]` resolves, through `ctx.resolveTask`, to a
 *      task the caller can read.
 *   2. Each resolved task has `taskType === 'run_eval'`.
 *   3. Each resolved task has `status === 'completed'` and a non-null
 *      `acceptedAttemptN`. Grading an unaccepted attempt races with
 *      re-attempts and pollutes the judge attestation.
 *   4. All resolved tasks carry the same non-null `correlationId`.
 *      Otherwise an imposer could fabricate a "variant set" by stapling
 *      unrelated runs together.
 *   5. That shared `correlationId` has no seal according to
 *      `ctx.findCorrelationSeal`. An earlier judge_eval_variant against
 *      the group is final; a new judging round needs a fresh
 *      correlation_id rather than contradictory verdicts on a sealed one.
 *   6. Every variant's `input.successCriteria` serialises identically
 *      under `stableStringify`. Differing rubrics across "variants" make
 *      the comparison meaningless.
 *
 * The function returns early after checks 1-3 if any target is missing,
 * and again after check 4 if any error has been collected so far; the
 * seal lookup only happens for otherwise-clean input.
 *
 * @param input - Task input; only `runTaskIds` is read.
 * @param ctx - Supplies `resolveTask(id)` and `findCorrelationSeal(correlationId)`.
 * @returns Array of `{ field, message }` errors, empty when the input passes.
 */
async function validateJudgeEvalVariantInputAsync(input, ctx) {
	const ids = input.runTaskIds;
	const errors = [];
	const resolved = await Promise.all(ids.map((id) => ctx.resolveTask(id)));
	const presentTargets = [];
	let anyMissing = false;
	resolved.forEach((target, idx) => {
		const field = `runTaskIds[${idx}]`;
		const label = `${field}=${ids[idx]}`;
		if (!target) {
			anyMissing = true;
			errors.push({
				field,
				message: `${label} does not resolve to a task you can read`
			});
			return;
		}
		presentTargets.push(target);
		if (target.taskType !== "run_eval") {
			errors.push({
				field,
				message: `${label} is a ${target.taskType}, not a run_eval`
			});
		}
		const hasAcceptedAttempt = target.status === "completed" && target.acceptedAttemptN !== null;
		if (!hasAcceptedAttempt) {
			errors.push({
				field,
				message: `${label} is not completed with an accepted attempt (status=${target.status}, acceptedAttemptN=${target.acceptedAttemptN})`
			});
		}
	});
	// Group-level checks are pointless while some targets are unknown.
	if (anyMissing || presentTargets.length === 0) return errors;

	// Missing correlation ids are folded into a sentinel so they take part
	// in the same Set-based uniqueness check as real ids.
	const correlationIds = new Set();
	for (const target of presentTargets) correlationIds.add(target.correlationId ?? "__null__");
	if (correlationIds.has("__null__")) {
		errors.push({
			field: "runTaskIds",
			message: "one or more run_eval targets have no correlation_id; cannot group as variants"
		});
	}
	if (correlationIds.size > 1) {
		errors.push({
			field: "runTaskIds",
			message: `run_eval targets span multiple correlation_ids (${[...correlationIds].join(", ")}); variants must share one`
		});
	}
	if (errors.length > 0) return errors;

	const leadTarget = presentTargets[0];
	const correlationId = leadTarget.correlationId;
	if (!correlationId) return errors;
	const seal = await ctx.findCorrelationSeal(correlationId);
	if (seal) {
		errors.push({
			field: "runTaskIds",
			message: `correlation_id ${correlationId} is already sealed by ${seal.sealedByTaskType}/${seal.sealedByTaskId} at ${seal.sealedAt}; use a fresh correlation_id for a new judging round`
		});
	}

	// Only the first variant whose criteria differ from variant 0 is reported.
	const reference = stableStringify(leadTarget.input.successCriteria);
	const mismatchIdx = presentTargets.findIndex((target, idx) => idx > 0 && stableStringify(target.input.successCriteria) !== reference);
	if (mismatchIdx !== -1) {
		errors.push({
			field: `runTaskIds[${mismatchIdx}]`,
			message: `runTaskIds[${mismatchIdx}] has a different input.successCriteria than runTaskIds[0]; all variants must share the rubric and gates`
		});
	}
	return errors;
}
|
|
9526
|
+
/**
|
|
9527
|
+
* Side effect emitted on successful `judge_eval_variant` create:
|
|
9528
|
+
* seal the shared correlation_id atomically with the insert. The
|
|
9529
|
+
* task service applies the seal in the same transaction; a
|
|
9530
|
+
* concurrent second `judge_eval_variant` against the same group
|
|
9531
|
+
* loses the race and is rejected with a clean conflict error.
|
|
9532
|
+
*
|
|
9533
|
+
* The seal applies to the SHARED correlation_id of the targets —
|
|
9534
|
+
* NOT to the judge task's own correlationId (which is typically
|
|
9535
|
+
* null or distinct). The task service derives the correlationId
|
|
9536
|
+
* for the effect from the resolved targets, not from the judge
|
|
9537
|
+
* task row.
|
|
9538
|
+
*/
|
|
9539
|
+
async function onCreateJudgeEvalVariant(input, ctx) {
|
|
9540
|
+
const { runTaskIds } = input;
|
|
9541
|
+
const first = await ctx.resolveTask(runTaskIds[0]);
|
|
9542
|
+
if (!first?.correlationId) return [];
|
|
9543
|
+
return [{
|
|
9544
|
+
kind: "sealCorrelation",
|
|
9545
|
+
correlationId: first.correlationId
|
|
9546
|
+
}];
|
|
9547
|
+
}
|
|
9209
9548
|
//#endregion
|
|
9210
9549
|
//#region ../tasks/src/task-types/render-pack.ts
|
|
9211
9550
|
/**
|
|
@@ -9245,6 +9584,18 @@ var RenderPackOutput = Type$1.Object({
|
|
|
9245
9584
|
$id: "RenderPackOutput",
|
|
9246
9585
|
additionalProperties: false
|
|
9247
9586
|
});
|
|
9587
|
+
/**
|
|
9588
|
+
* Async preflight (#1096): `packId` resolves to a context_packs row
|
|
9589
|
+
* the caller can read.
|
|
9590
|
+
*/
|
|
9591
|
+
async function validateRenderPackInputAsync(input, ctx) {
|
|
9592
|
+
const { packId } = input;
|
|
9593
|
+
if (!await ctx.resolveContextPack(packId)) return [{
|
|
9594
|
+
field: "packId",
|
|
9595
|
+
message: `packId ${packId} does not resolve to a context pack you can read`
|
|
9596
|
+
}];
|
|
9597
|
+
return [];
|
|
9598
|
+
}
|
|
9248
9599
|
//#endregion
|
|
9249
9600
|
//#region ../tasks/src/task-types/run-eval.ts
|
|
9250
9601
|
/**
|
|
@@ -9352,7 +9703,8 @@ var BUILT_IN_TASK_TYPES = {
|
|
|
9352
9703
|
outputSchema: AssessBriefOutput,
|
|
9353
9704
|
outputKind: "judgment",
|
|
9354
9705
|
requiresReferences: true,
|
|
9355
|
-
validateInput: validateJudgmentInput
|
|
9706
|
+
validateInput: validateJudgmentInput,
|
|
9707
|
+
validateInputAsync: validateAssessBriefInputAsync
|
|
9356
9708
|
},
|
|
9357
9709
|
[CURATE_PACK_TYPE]: {
|
|
9358
9710
|
name: CURATE_PACK_TYPE,
|
|
@@ -9368,7 +9720,8 @@ var BUILT_IN_TASK_TYPES = {
|
|
|
9368
9720
|
outputSchema: RenderPackOutput,
|
|
9369
9721
|
outputKind: "artifact",
|
|
9370
9722
|
requiresReferences: false,
|
|
9371
|
-
validateOutput: requireVerificationWhenCriteriaPresent
|
|
9723
|
+
validateOutput: requireVerificationWhenCriteriaPresent,
|
|
9724
|
+
validateInputAsync: validateRenderPackInputAsync
|
|
9372
9725
|
},
|
|
9373
9726
|
[JUDGE_PACK_TYPE]: {
|
|
9374
9727
|
name: JUDGE_PACK_TYPE,
|
|
@@ -9377,7 +9730,8 @@ var BUILT_IN_TASK_TYPES = {
|
|
|
9377
9730
|
outputKind: "judgment",
|
|
9378
9731
|
requiresReferences: true,
|
|
9379
9732
|
validateInput: validateJudgmentInput,
|
|
9380
|
-
validateOutput: validateJudgePackOutput
|
|
9733
|
+
validateOutput: validateJudgePackOutput,
|
|
9734
|
+
validateInputAsync: validateJudgePackInputAsync
|
|
9381
9735
|
},
|
|
9382
9736
|
[RUN_EVAL_TYPE]: {
|
|
9383
9737
|
name: RUN_EVAL_TYPE,
|
|
@@ -9386,6 +9740,18 @@ var BUILT_IN_TASK_TYPES = {
|
|
|
9386
9740
|
outputKind: "artifact",
|
|
9387
9741
|
requiresReferences: false,
|
|
9388
9742
|
validateOutput: validateRunEvalOutput
|
|
9743
|
+
},
|
|
9744
|
+
[JUDGE_EVAL_VARIANT_TYPE]: {
|
|
9745
|
+
name: JUDGE_EVAL_VARIANT_TYPE,
|
|
9746
|
+
inputSchema: JudgeEvalVariantInput,
|
|
9747
|
+
outputSchema: JudgeEvalVariantOutput,
|
|
9748
|
+
outputKind: "judgment",
|
|
9749
|
+
requiresReferences: false,
|
|
9750
|
+
validateInput: validateJudgeEvalVariantInput,
|
|
9751
|
+
validateOutput: validateJudgeEvalVariantOutput,
|
|
9752
|
+
validateInputAsync: validateJudgeEvalVariantInputAsync,
|
|
9753
|
+
onCreate: onCreateJudgeEvalVariant,
|
|
9754
|
+
usesSubagents: true
|
|
9389
9755
|
}
|
|
9390
9756
|
};
|
|
9391
9757
|
//#endregion
|
|
@@ -9440,6 +9806,15 @@ function validateTaskOutput(taskType, output, input) {
|
|
|
9440
9806
|
function getTaskOutputSchema(taskType) {
	// Yields null when the lookup returns nothing or the entry has no
	// (non-nullish) `outputSchema`.
	const entry = getTaskTypeEntry(taskType);
	return entry?.outputSchema ?? null;
}
|
|
9809
|
+
/**
 * Reports whether sessions running the given task type should get the
 * generic `subagent` custom tool registered.
 *
 * Only an entry whose `usesSubagents` is exactly `true` opts in; unknown
 * task types and entries without the flag yield `false`. See
 * `TaskTypeEntry` for the design rationale.
 *
 * @param taskType - Task type name to look up.
 * @returns `true` only for task types that explicitly opted in.
 */
function taskTypeUsesSubagents(taskType) {
	const entry = getTaskTypeEntry(taskType);
	return entry?.usesSubagents === true;
}
|
|
9443
9818
|
//#endregion
|
|
9444
9819
|
//#region ../tasks/src/wire.ts
|
|
9445
9820
|
/**
|
|
@@ -9676,6 +10051,133 @@ Type$1.Object({
|
|
|
9676
10051
|
additionalProperties: false
|
|
9677
10052
|
});
|
|
9678
10053
|
//#endregion
|
|
10054
|
+
//#region ../agent-runtime/src/subagent-output-contracts.ts
|
|
10055
|
+
var REGISTRY = /* @__PURE__ */ new Map();
/**
 * Register a subagent output contract under `contract.name`.
 *
 * Behaviour on repeat registration of a name:
 *   - same contract object, or a different object carrying the same
 *     `parametersSchema` reference: accepted, and the stored contract is
 *     (re)set to the one passed in — convenient for HMR and tests;
 *   - a different `parametersSchema` reference: throws, because
 *     contracts are meant to be stable.
 *
 * Typically called at module-init time alongside task-type
 * registration. See task-types/index.ts in @moltnet/tasks for the
 * conventional pattern.
 *
 * @param contract - Object with `name`, `parametersSchema` and any
 *   further descriptive fields.
 * @throws Error when the name is empty/blank, is not lower_snake_case,
 *   or is already registered with a different schema.
 */
function registerSubagentOutputContract(contract) {
	const { name } = contract;
	const isBlank = !name || name.trim().length === 0;
	if (isBlank) throw new Error("subagent output contract name is required");

	const lowerSnakeCase = /^[a-z][a-z0-9_]*$/;
	if (!lowerSnakeCase.test(name)) {
		throw new Error(`subagent output contract name '${name}' must be lower_snake_case (starts with a letter, then [a-z0-9_]+)`);
	}

	// Schemas are compared by reference, not structurally.
	const previous = REGISTRY.get(name);
	const schemaChanged = previous && previous !== contract && previous.parametersSchema !== contract.parametersSchema;
	if (schemaChanged) {
		throw new Error(`subagent output contract '${name}' is already registered with a different schema; refusing to override`);
	}
	REGISTRY.set(name, contract);
}
|
|
10075
|
+
/**
 * Look up a subagent output contract by name.
 *
 * Unknown names yield `null` rather than throwing: the caller (the
 * subagent custom tool) decides whether that is a tool error the parent
 * LLM can recover from or a hard failure.
 *
 * @param name - Contract name to look up in the registry.
 * @returns The registered contract, or `null` when none exists.
 */
function getSubagentOutputContract(name) {
	const contract = REGISTRY.get(name);
	return contract ?? null;
}
|
|
10083
|
+
/**
 * Return every registered contract as a fresh array.
 *
 * Useful for diagnostics, and for the subagent tool's error/parameter
 * text so a parent LLM can see which contracts exist without having
 * them enumerated in its prompt.
 *
 * @returns A new array of the registry's current values.
 */
function listSubagentOutputContracts() {
	return Array.from(REGISTRY.values());
}
|
|
10091
|
+
//#endregion
|
|
10092
|
+
//#region ../agent-runtime/src/built-in-contract-registrations.ts
|
|
10093
|
+
/**
 * Registers the built-in subagent output contracts (#1087, #943).
 *
 * This is an explicit function rather than a bare module-init side
 * effect for these reasons:
 *
 *   - The registry is process-global, and module-init code runs exactly
 *     once per Node process (ESM modules are cached by URL). Tests that
 *     call `__resetSubagentOutputContractsForTests()` to empty the
 *     registry would have no way to restore the built-ins, since the
 *     cache prevents re-evaluating the module. PR #1101 review M4.
 *   - With an explicit `registerBuiltInSubagentContracts()`, the package
 *     index can call it once at module load AND test setup hooks can
 *     call it again after a reset.
 *   - `registerSubagentOutputContract` accepts a repeat registration
 *     that carries the same schema, so calling this function twice in
 *     one process is safe.
 *
 * Adding a new built-in: extend the body of this function. Do not call
 * `registerSubagentOutputContract` from anywhere else in the package —
 * keeping all built-ins in one function makes the set auditable.
 */
function registerBuiltInSubagentContracts() {
	const judgeEvalVariantResultContract = {
		name: "judge_eval_variant_result",
		description: "Per-variant grading result produced by a subagent of judge_eval_variant: scores against the shared rubric, composite, and a 1-3 sentence verdict for a single variant.",
		parametersSchema: JudgeEvalVariantResult
	};
	registerSubagentOutputContract(judgeEvalVariantResultContract);
}
// Populate the registry once when this module is loaded.
registerBuiltInSubagentContracts();
|
|
10125
|
+
//#endregion
|
|
10126
|
+
//#region ../agent-runtime/src/context-bindings.ts
|
|
10127
|
+
var PROMPT_SEPARATOR = "\n\n---\n\n";
|
|
10128
|
+
/**
|
|
10129
|
+
* Resolve `task.input.context[]` into delivered side-effects (skills
|
|
10130
|
+
* persisted via `deliver.skill`) and prompt fragments
|
|
10131
|
+
* (`systemPromptPrefix`, `userInlineSuffix`) the caller weaves into the
|
|
10132
|
+
* built prompt.
|
|
10133
|
+
*
|
|
10134
|
+
* Per-binding semantics (V1):
|
|
10135
|
+
* - `skill` → `deliver.skill({ slug, content })` once per ref.
|
|
10136
|
+
* Slug collisions on distinct contents are
|
|
10137
|
+
* refused loudly.
|
|
10138
|
+
* - `prompt_prefix` → content appended to `systemPromptPrefix` with
|
|
10139
|
+
* the canonical `\n\n---\n\n` separator (in
|
|
10140
|
+
* declared order).
|
|
10141
|
+
* - `user_inline` → content appended to `userInlineSuffix` in
|
|
10142
|
+
* declared order, same separator.
|
|
10143
|
+
*
|
|
10144
|
+
* No fetching, no hashing — bytes are inlined in `ContextRef.content`,
|
|
10145
|
+
* and the task's `inputCid` already pins the entire input. The imposer
|
|
10146
|
+
* chose these bytes; the resolver just dispatches them.
|
|
10147
|
+
*
|
|
10148
|
+
* The function is pure with respect to its arguments: file writes are
|
|
10149
|
+
* confined to the injected `deliver` callback, which makes the
|
|
10150
|
+
* resolver trivial to test.
|
|
10151
|
+
*/
|
|
10152
|
+
async function resolveTaskContext(args) {
|
|
10153
|
+
const promptParts = [];
|
|
10154
|
+
const userParts = [];
|
|
10155
|
+
const injected = [];
|
|
10156
|
+
const usedSlugs = /* @__PURE__ */ new Map();
|
|
10157
|
+
for (const ref of args.context) {
|
|
10158
|
+
if (ref.binding === "skill") {
|
|
10159
|
+
const prior = usedSlugs.get(ref.slug);
|
|
10160
|
+
if (prior !== void 0) {
|
|
10161
|
+
if (prior !== ref.content) throw new Error(`slug collision on '${ref.slug}': two skill entries share the same slug but have different content`);
|
|
10162
|
+
injected.push(ref);
|
|
10163
|
+
continue;
|
|
10164
|
+
}
|
|
10165
|
+
usedSlugs.set(ref.slug, ref.content);
|
|
10166
|
+
await args.deliver.skill({
|
|
10167
|
+
slug: ref.slug,
|
|
10168
|
+
content: ref.content
|
|
10169
|
+
});
|
|
10170
|
+
} else if (ref.binding === "prompt_prefix") promptParts.push(ref.content);
|
|
10171
|
+
else userParts.push(ref.content);
|
|
10172
|
+
injected.push(ref);
|
|
10173
|
+
}
|
|
10174
|
+
return {
|
|
10175
|
+
injected,
|
|
10176
|
+
systemPromptPrefix: promptParts.join(PROMPT_SEPARATOR),
|
|
10177
|
+
userInlineSuffix: userParts.join(PROMPT_SEPARATOR)
|
|
10178
|
+
};
|
|
10179
|
+
}
|
|
10180
|
+
//#endregion
|
|
9679
10181
|
//#region ../agent-runtime/src/output-tools.ts
|
|
9680
10182
|
/**
|
|
9681
10183
|
* Submit-output tool contract.
|
|
@@ -10148,6 +10650,109 @@ function buildFulfillBriefUserPrompt(input, ctx) {
|
|
|
10148
10650
|
].filter(Boolean).join("\n");
|
|
10149
10651
|
}
|
|
10150
10652
|
//#endregion
|
|
10653
|
+
//#region ../agent-runtime/src/prompts/judge-eval-variant.ts
|
|
10654
|
+
/**
 * Builds the first user-message prompt for a `judge_eval_variant` task
 * (#943 Slice 2).
 *
 * The parent agent's role is **fan-out-and-collect**: for each
 * `runTaskIds[i]` it spawns an isolated subagent through the `subagent`
 * custom tool (#1087), has that subagent grade the variant against the
 * shared rubric, and collects the structured
 * `judge_eval_variant_result` payload. The parent does NOT grade
 * itself; it composes the per-variant results into the final
 * `judge_eval_variant` output (results array + optional deltas +
 * verdicts).
 *
 * Isolation is the point: every variant gets a fresh subagent session
 * with no carry-over from sibling variants, so per-variant grading is
 * independent. Cost is bounded by `maxItems: 10` on runTaskIds.
 *
 * @param input - Task input; `runTaskIds` and `successCriteria.rubric`
 *   (`criteria[]`, optional `preamble`) are read.
 * @param ctx - Supplies `taskId` and `diaryId` for the prompt header.
 * @returns The prompt text, one section per heading, joined with "\n".
 * @throws Error when `successCriteria.rubric` is absent.
 */
function buildJudgeEvalVariantUserPrompt(input, ctx) {
	const { runTaskIds, successCriteria } = input;
	const { rubric } = successCriteria;
	if (!rubric) throw new Error("judge_eval_variant requires successCriteria.rubric — none present");

	// Keep free-text descriptions from breaking the markdown table:
	// escape backslashes and pipes, flatten line breaks to spaces.
	function escapeCell(text) {
		const backslashesEscaped = text.replace(/\\/g, "\\\\");
		const pipesEscaped = backslashesEscaped.replace(/\|/g, "\\|");
		return pipesEscaped.replace(/\r?\n/g, " ");
	}

	const criteriaRows = [];
	for (const criterion of rubric.criteria) {
		criteriaRows.push(`| \`${criterion.id}\` | ${criterion.weight.toFixed(3)} | ${criterion.scoring} | ${escapeCell(criterion.description)} |`);
	}
	const criteriaTable = criteriaRows.join("\n");

	const targetLines = [];
	runTaskIds.forEach((id, position) => {
		targetLines.push(`${position + 1}. \`${id}\``);
	});
	const targetsBlock = targetLines.join("\n");

	const shapeSketch = [
		"{",
		" \"results\": [",
		" {",
		" \"runTaskId\": \"<runTaskIds[i]>\",",
		" \"variantLabel\": \"<from variant input>\",",
		" \"scores\": [ { \"criterionId\": \"...\", \"score\": 0..1, \"rationale\": \"...\", \"assertions\": [...]? } ],",
		" \"composite\": <Σ(weight × score), 0..1>,",
		" \"verdict\": \"<1-3 sentences>\"",
		" },",
		" ...one entry per runTaskIds[i], same order",
		" ],",
		" \"deltas\": { \"<labelA> - <labelB>\": <composite(A) - composite(B)> }, // optional",
		" \"judgeModel\": \"<id>\", // optional",
		" \"traceparent\": \"<from claim>\"",
		"}"
	].join("\n");
	const finalOutputBlock = buildFinalOutputBlock({
		taskType: "judge_eval_variant",
		outputSchemaName: "JudgeEvalVariantOutput",
		shapeSketch
	});

	const introSection = [
		"# Judge Eval Variants\n",
		`You are grading ${runTaskIds.length} variants of a single run_eval scenario`,
		"against ONE shared rubric. Your job is fan-out-and-collect — you do not",
		"grade yourself.",
		`Task id: \`${ctx.taskId}\``,
		`Diary: \`${ctx.diaryId}\``
	];
	const targetsSection = [
		"### Targets (variants to grade)",
		targetsBlock,
		"Each target is a completed `run_eval` task in the same correlation group.",
		"Read its accepted attempt via `moltnet_get_task` / `moltnet_list_task_attempts`",
		"to see the producer's output before grading."
	];
	const rubricSection = [
		"### Rubric",
		rubric.preamble ? `${rubric.preamble}\n` : "",
		"| Criterion | Weight | Scoring | Description |",
		"| --- | --- | --- | --- |",
		criteriaTable
	];
	const gradingSection = [
		"### How to grade",
		"For EACH `runTaskIds[i]`:",
		"1. Call the `subagent` custom tool with:",
		" - `task`: a brief instructing the subagent to grade ONLY that variant",
		" against the rubric above; include the target task id and the rubric",
		" verbatim. The subagent has the same MoltNet tools and can fetch the",
		" accepted attempt output independently.",
		" - `output_schema`: `\"judge_eval_variant_result\"`",
		"2. Receive the subagent's structured `judge_eval_variant_result` payload.",
		"3. Append it to your `results[]` array, **in the same order as input.runTaskIds**.",
		"Do NOT score any variant in your own session. The whole point of the",
		"subagent fan-out is per-variant context isolation — grading two variants",
		"back-to-back in one session lets the second be biased by the first."
	];
	const compositeSection = [
		"### Composite arithmetic",
		"Each `composite` MUST equal `Σ(criterion.weight × score)` over the rubric",
		"criteria. Drift > 0.001 is rejected. Subagents are instructed to compute it",
		"themselves; double-check before assembling the final output."
	];
	const deltasSection = [
		"### Deltas (optional)",
		"If useful, populate `deltas` with pairwise composite differences keyed by",
		"`\"<variantLabel-A> - <variantLabel-B>\"` (single space-hyphen-space). Both",
		"labels must appear in `results`. Omit `deltas` entirely if not used."
	];

	// Empty entries (absent preamble, empty target/criteria lists) are
	// dropped before joining.
	// NOTE(review): this filter has always removed every "" entry, so the
	// prompt never contained blank separator lines between sections other
	// than the "\n" embedded after the title and the preamble — confirm
	// that is the intended layout.
	const lines = [
		...introSection,
		...targetsSection,
		...rubricSection,
		...gradingSection,
		...compositeSection,
		...deltasSection,
		finalOutputBlock
	];
	return lines.filter((line) => line !== "").join("\n");
}
|
|
10755
|
+
//#endregion
|
|
10151
10756
|
//#region ../agent-runtime/src/prompts/judge-pack.ts
|
|
10152
10757
|
function buildJudgePackUserPrompt(input, ctx) {
|
|
10153
10758
|
const { renderedPackId, sourcePackId, successCriteria } = input;
|
|
@@ -10454,6 +11059,15 @@ function buildTaskUserPrompt(task, ctx) {
|
|
|
10454
11059
|
diaryId: ctx.diaryId,
|
|
10455
11060
|
taskId: ctx.taskId
|
|
10456
11061
|
});
|
|
11062
|
+
case JUDGE_EVAL_VARIANT_TYPE:
|
|
11063
|
+
if (!Value.Check(JudgeEvalVariantInput, task.input)) {
|
|
11064
|
+
const errors = [...Value.Errors(JudgeEvalVariantInput, task.input)];
|
|
11065
|
+
throw new Error(`judge_eval_variant input failed validation: ${JSON.stringify(errors.slice(0, 3))}`);
|
|
11066
|
+
}
|
|
11067
|
+
return buildJudgeEvalVariantUserPrompt(task.input, {
|
|
11068
|
+
diaryId: ctx.diaryId,
|
|
11069
|
+
taskId: ctx.taskId
|
|
11070
|
+
});
|
|
10457
11071
|
case RUN_EVAL_TYPE:
|
|
10458
11072
|
if (!Value.Check(RunEvalInput, task.input)) {
|
|
10459
11073
|
const errors = [...Value.Errors(RunEvalInput, task.input)];
|
|
@@ -14128,6 +14742,190 @@ function buildRuntimeInstructor(ctx) {
|
|
|
14128
14742
|
].join("\n");
|
|
14129
14743
|
}
|
|
14130
14744
|
//#endregion
|
|
14745
|
+
//#region src/runtime/subagent-tool.ts
|
|
14746
|
+
// Name of the tool a subagent calls to hand its structured result back.
var SUBAGENT_SUBMIT_TOOL_NAME = "submit_subagent_output";
/**
 * Schema of the arguments the parent LLM passes to the `subagent` tool.
 * Both fields are required non-empty strings and no other properties
 * are allowed.
 *
 *   - `task`          — natural-language instructions for the subagent,
 *                       authored by the parent for each call.
 *   - `output_schema` — name of a registered SubagentOutputContract;
 *                       it is resolved when the tool is called, and an
 *                       unknown name produces a tool error.
 */
var SubagentToolParameters = Type$1.Object({
	task: Type$1.String({ minLength: 1, description: "Natural-language instructions for the subagent. The subagent starts with a fresh conversation and a narrowed system prompt; this is the only context it has from you." }),
	output_schema: Type$1.String({ minLength: 1, description: "Name of a registered subagent output contract. The subagent must submit a structured payload via `submit_subagent_output` matching this contract." })
}, { additionalProperties: false });
|
|
14766
|
+
// Default wall-clock budget for one subagent run: 300 seconds.
var DEFAULT_SUBAGENT_TIMEOUT_MS = 300 * 1e3;
/**
 * Build the `subagent` custom tool for a parent session.
 *
 * Each tool call validates its parameters, resolves the named output
 * contract, constructs a fresh inner session whose only extra tool is a
 * per-call `submit_subagent_output`, prompts it with `task`, and returns
 * the captured payload as JSON text. Every failure mode (bad
 * parameters, unknown contract, prompt error, timeout, parent
 * cancellation, no submission) is reported as a tool error result
 * rather than thrown; an error thrown while constructing the inner
 * session is not caught here and propagates to the caller.
 *
 * @param args - Construction options:
 *   - `buildAgentSession` (optional): session factory override; defaults
 *     to the module's `buildAgentSession`.
 *   - `mountPath`, `piAuthDir`, `modelHandle`, `agentName`: forwarded to
 *     the session factory unchanged.
 *   - `inheritedCustomTools`: tools given to the subagent in addition to
 *     the submit tool.
 *   - `parentRuntimeInstructor`: system-prompt text placed before the
 *     subagent-specific instructor.
 *   - `parentTaskId`, `parentTaskType`, `parentAttemptN`: used for the
 *     instructor text and telemetry span attributes.
 *   - `parentCancelSignal` (optional): AbortSignal; aborting it aborts
 *     the inner session.
 *   - `timeoutMs` (optional): `undefined` or negative selects the
 *     default, `0` disables the timer, a positive value is used as-is.
 * @returns `{ tool, getCallCount }`. `getCallCount()` reports how many
 *   calls got past parameter and contract validation, so executors can
 *   emit summary telemetry when the parent terminates.
 */
function createSubagentTool(args) {
	const buildSession = args.buildAgentSession ?? buildAgentSession;
	let callCount = 0;
	return {
		tool: defineTool({
			name: "subagent",
			label: "Delegate to subagent",
			description: subagentToolDescription(),
			parameters: SubagentToolParameters,
			async execute(_id, params) {
				if (!Value.Check(SubagentToolParameters, params)) return toolError(`subagent: invalid parameters: ${JSON.stringify([...Value.Errors(SubagentToolParameters, params)].slice(0, 3))}`);
				const { task, output_schema } = params;
				const contract = getSubagentOutputContract(output_schema);
				if (!contract) return toolError(`subagent: unknown output_schema "${output_schema}". Registered contracts: [${listSubagentOutputContracts().map((c) => c.name).join(", ")}]`);
				callCount += 1;
				const callIndex = callCount;
				// Filled in by the submit tool once the subagent hands over a
				// payload that satisfies the contract schema.
				let captured = null;
				const submitTool = defineTool({
					name: SUBAGENT_SUBMIT_TOOL_NAME,
					label: `Submit ${output_schema}`,
					description: `Submit your structured output for this subagent task. Call exactly once when done. Args MUST match the ${output_schema} contract; mismatches return a tool error you can recover from in the same session.`,
					parameters: contract.parametersSchema,
					async execute(_innerId, innerParams) {
						// An invalid payload is a recoverable tool error: the subagent
						// can retry within the same session.
						if (!Value.Check(contract.parametersSchema, innerParams)) return toolError(`submit_subagent_output: schema validation failed: ${[...Value.Errors(contract.parametersSchema, innerParams)].slice(0, 3).map((e) => `${e.path}: ${e.message}`).join("; ")}. Re-call with a corrected payload.`);
						captured = innerParams;
						return {
							content: [{
								type: "text",
								text: "Output captured. Subagent session will terminate; no further action needed."
							}],
							details: { captured: true },
							terminate: true
						};
					}
				});
				const subagentInstructor = buildSubagentInstructor({
					contractName: output_schema,
					contractDescription: contract.description,
					parentTaskId: args.parentTaskId,
					callIndex
				});
				const session = await buildSession({
					mountPath: args.mountPath,
					piAuthDir: args.piAuthDir,
					modelHandle: args.modelHandle,
					agentName: args.agentName,
					customTools: [...args.inheritedCustomTools, submitTool],
					appendSystemPrompt: [args.parentRuntimeInstructor, subagentInstructor],
					// The subagent is given an empty skill set.
					skillsOverride: () => ({
						skills: [],
						diagnostics: []
					}),
					otelSpanAttrs: {
						"moltnet.task.id": args.parentTaskId,
						"moltnet.task.type": args.parentTaskType,
						"moltnet.task.attempt": args.parentAttemptN,
						"moltnet.subagent.contract": output_schema,
						"moltnet.subagent.index": callIndex
					}
				});
				let abortReason = null;
				let abortInvoked = false;
				// Abort the inner session at most once and remember why; the
				// first trigger (cancellation or timeout) wins.
				const fireAbort = (reason) => {
					if (abortInvoked) return;
					abortInvoked = true;
					abortReason = reason;
					session.abort().catch((err) => {
						const message = err instanceof Error ? err.message : String(err);
						process.stderr.write(`[subagent] inner session.abort() failed: ${message}\n`);
					});
				};
				// Wire parent cancellation; the returned function detaches the listener.
				const cancelListener = args.parentCancelSignal ? (() => {
					const signal = args.parentCancelSignal;
					const listener = () => fireAbort("parent_cancelled");
					if (signal.aborted) listener();
					else signal.addEventListener("abort", listener, { once: true });
					return () => signal.removeEventListener("abort", listener);
				})() : null;
				const timeoutMs = args.timeoutMs === void 0 || args.timeoutMs < 0 ? DEFAULT_SUBAGENT_TIMEOUT_MS : args.timeoutMs;
				const timeoutHandle = timeoutMs > 0 ? setTimeout(() => fireAbort("subagent_timed_out"), timeoutMs) : null;
				try {
					// If the parent signal was already aborted, `fireAbort` has run
					// before any prompt was in flight. Do not start the subagent in
					// that case: nothing could stop it afterwards, because the
					// timeout's `fireAbort` is a no-op once `abortInvoked` is set.
					if (abortReason === null) await session.prompt(task);
				} catch (err) {
					return toolError(`subagent: inner session.prompt() threw: ${err instanceof Error ? err.message : String(err)}`);
				} finally {
					if (timeoutHandle) clearTimeout(timeoutHandle);
					if (cancelListener) cancelListener();
				}
				if (abortReason !== null) return toolError(`subagent: ${abortReason === "subagent_timed_out" ? `subagent timed out after ${timeoutMs}ms` : "parent task was cancelled"}. The parent should fail this task or retry with a clearer scope.`);
				if (captured === null) return toolError(`subagent: inner session ended without calling ${SUBAGENT_SUBMIT_TOOL_NAME}. The parent should retry with clearer instructions or fail the task.`);
				return {
					content: [{
						type: "text",
						text: JSON.stringify(captured)
					}],
					details: {
						captured: true,
						contract: output_schema,
						callIndex
					}
				};
			}
		}),
		getCallCount: () => callCount
	};
}
|
|
14878
|
+
/**
 * Produces the description text shown to the parent LLM for the
 * `subagent` tool: what delegation does, what the subagent can use, and
 * what the tool returns on success and on failure.
 *
 * @returns Three paragraphs separated by blank lines.
 */
function subagentToolDescription() {
	const summary = "Delegate a sub-task to a fresh subagent session with isolated context.";
	const mechanics = [
		"The subagent starts with no conversation history and only the `task` ",
		"string you provide as its instructions. It runs in the same VM with ",
		"the same tools you have (Gondolin-routed Read/Write/Edit/Bash, ",
		"moltnet_* tools), and is expected to call ",
		`\`${SUBAGENT_SUBMIT_TOOL_NAME}\` with a payload matching the named `,
		"contract before its session ends."
	].join("\n");
	const outcomes = [
		"On success, the tool result is the JSON-stringified subagent payload.",
		"On failure (unknown contract, validation error, subagent did not ",
		"submit) the tool returns isError:true with a recoverable message."
	].join("\n");
	return [summary, "", mechanics, "", outcomes].join("\n");
}
|
|
14894
|
+
/**
 * Produces the system-prompt section that tells an inner session it is
 * a subagent: which parent task and call it belongs to, which output
 * contract it must satisfy, and the rules for the session.
 *
 * @param args - `{ parentTaskId, callIndex, contractName, contractDescription }`.
 * @returns The instructor text, lines joined with "\n".
 */
function buildSubagentInstructor(args) {
	const { parentTaskId, callIndex, contractName, contractDescription } = args;
	const header = [
		"# You are a subagent",
		"",
		`Parent task: \`${parentTaskId}\` (subagent call #${callIndex}).`,
		"",
		`Your assigned output contract is \`${contractName}\`:`,
		`${contractDescription}`,
		""
	];
	const rules = [
		"Rules for this session:",
		"",
		`- You MUST call \`${SUBAGENT_SUBMIT_TOOL_NAME}\` exactly once with a `,
		" payload matching the contract above. Your session terminates on ",
		" the valid call.",
		"- The parent's message above is your task. Do not invent additional ",
		" steps the parent did not request.",
		"- All MoltNet runtime invariants from the parent runtime instructor ",
		" apply (diary discipline, gh-auth pattern, etc.) IF you take any ",
		" action that would trigger them. Most subagents do not commit code ",
		" or open PRs — only do so if your task message explicitly requires it.",
		"- You do NOT have access to the `subagent` tool. Do not attempt nested ",
		" delegation; do the work yourself."
	];
	return header.concat(rules).join("\n");
}
|
|
14918
|
+
/**
 * Wraps a message in the tool-result shape used for failures in this
 * module: one text content item, `details.captured` set to false, and
 * `isError` set to true.
 *
 * @param text - Message to return to the calling LLM.
 * @returns The error-flagged tool result object.
 */
function toolError(text) {
	const message = {
		type: "text",
		text
	};
	return {
		content: [message],
		details: { captured: false },
		isError: true
	};
}
|
|
14928
|
+
//#endregion
|
|
14131
14929
|
//#region src/runtime/task-output.ts
|
|
14132
14930
|
var METER_NAME = "@themoltnet/pi-extension/task-output";
|
|
14133
14931
|
var parseResultCounter = null;
|
|
@@ -14439,6 +15237,7 @@ async function executePiTask(claimedTask, reporter, opts) {
|
|
|
14439
15237
|
const taskTeamId = task.teamId ?? "";
|
|
14440
15238
|
let reporterOpen = false;
|
|
14441
15239
|
let session = null;
|
|
15240
|
+
let subagentHandle = null;
|
|
14442
15241
|
const finalUsage = emptyUsage(opts.provider, opts.model);
|
|
14443
15242
|
let cancelListener = null;
|
|
14444
15243
|
const makeFailedOutput = (code, message, usage = finalUsage) => ({
|
|
@@ -14556,47 +15355,55 @@ async function executePiTask(claimedTask, reporter, opts) {
|
|
|
14556
15355
|
});
|
|
14557
15356
|
const piAuthDir = process.env.PI_CODING_AGENT_DIR ?? join(homedir(), ".pi", "agent");
|
|
14558
15357
|
const modelHandle = getModel(opts.provider, opts.model);
|
|
14559
|
-
const
|
|
14560
|
-
agentName: opts.agentName,
|
|
14561
|
-
spanAttributes: {
|
|
14562
|
-
"moltnet.task.id": task.id,
|
|
14563
|
-
"moltnet.task.attempt": attemptN,
|
|
14564
|
-
"moltnet.task.type": task.taskType
|
|
14565
|
-
}
|
|
14566
|
-
});
|
|
14567
|
-
const appendSystemPrompt = [buildRuntimeInstructor({
|
|
15358
|
+
const runtimeInstructor = buildRuntimeInstructor({
|
|
14568
15359
|
taskId: task.id,
|
|
14569
15360
|
taskType: task.taskType,
|
|
14570
15361
|
attemptN,
|
|
14571
15362
|
diaryId,
|
|
14572
15363
|
agentName: opts.agentName,
|
|
14573
15364
|
correlationId: task.correlationId ?? null
|
|
14574
|
-
})
|
|
15365
|
+
});
|
|
15366
|
+
const appendSystemPrompt = [runtimeInstructor];
|
|
14575
15367
|
if (injectedContext.systemPromptPrefix) appendSystemPrompt.push(injectedContext.systemPromptPrefix);
|
|
14576
15368
|
const injectedSkills = injectedContext.skills;
|
|
14577
|
-
const
|
|
14578
|
-
|
|
14579
|
-
|
|
14580
|
-
|
|
15369
|
+
const parentSubagentTools = [];
|
|
15370
|
+
if (taskTypeUsesSubagents(task.taskType)) {
|
|
15371
|
+
subagentHandle = createSubagentTool({
|
|
15372
|
+
mountPath,
|
|
15373
|
+
piAuthDir,
|
|
15374
|
+
modelHandle,
|
|
15375
|
+
agentName: opts.agentName,
|
|
15376
|
+
inheritedCustomTools: [...gondolinCustomTools, ...moltnetTools],
|
|
15377
|
+
parentRuntimeInstructor: runtimeInstructor,
|
|
15378
|
+
parentTaskId: task.id,
|
|
15379
|
+
parentTaskType: task.taskType,
|
|
15380
|
+
parentAttemptN: attemptN,
|
|
15381
|
+
parentCancelSignal: reporter.cancelSignal
|
|
15382
|
+
});
|
|
15383
|
+
parentSubagentTools.push(subagentHandle.tool);
|
|
15384
|
+
}
|
|
15385
|
+
session = await buildAgentSession({
|
|
15386
|
+
mountPath,
|
|
15387
|
+
piAuthDir,
|
|
15388
|
+
modelHandle,
|
|
15389
|
+
agentName: opts.agentName,
|
|
15390
|
+
customTools: [
|
|
15391
|
+
...gondolinCustomTools,
|
|
15392
|
+
...moltnetTools,
|
|
15393
|
+
...submitTools,
|
|
15394
|
+
...parentSubagentTools
|
|
15395
|
+
],
|
|
14581
15396
|
appendSystemPrompt,
|
|
14582
15397
|
skillsOverride: () => ({
|
|
14583
15398
|
skills: injectedSkills,
|
|
14584
15399
|
diagnostics: []
|
|
14585
|
-
})
|
|
15400
|
+
}),
|
|
15401
|
+
otelSpanAttrs: {
|
|
15402
|
+
"moltnet.task.id": task.id,
|
|
15403
|
+
"moltnet.task.attempt": attemptN,
|
|
15404
|
+
"moltnet.task.type": task.taskType
|
|
15405
|
+
}
|
|
14586
15406
|
});
|
|
14587
|
-
await resourceLoader.reload();
|
|
14588
|
-
session = (await createAgentSession({
|
|
14589
|
-
agentDir: piAuthDir,
|
|
14590
|
-
cwd: mountPath,
|
|
14591
|
-
model: modelHandle,
|
|
14592
|
-
customTools: [
|
|
14593
|
-
...gondolinCustomTools,
|
|
14594
|
-
...moltnetTools,
|
|
14595
|
-
...submitTools
|
|
14596
|
-
],
|
|
14597
|
-
sessionManager: SessionManager.inMemory(),
|
|
14598
|
-
resourceLoader
|
|
14599
|
-
})).session;
|
|
14600
15407
|
} catch (err) {
|
|
14601
15408
|
const message = err instanceof Error ? err.message : String(err);
|
|
14602
15409
|
await emit("error", {
|
|
@@ -14667,6 +15474,10 @@ async function executePiTask(claimedTask, reporter, opts) {
|
|
|
14667
15474
|
phase: "session_prompt"
|
|
14668
15475
|
});
|
|
14669
15476
|
}
|
|
15477
|
+
if (subagentHandle && subagentHandle.getCallCount() > 0) await emit("info", {
|
|
15478
|
+
event: "subagent_summary",
|
|
15479
|
+
callCount: subagentHandle.getCallCount()
|
|
15480
|
+
});
|
|
14670
15481
|
await Promise.all(recordingPromise);
|
|
14671
15482
|
const cancelled = reporter.cancelSignal.aborted;
|
|
14672
15483
|
let parsedOutput = null;
|
|
@@ -15126,4 +15937,4 @@ function moltnetExtension(pi) {
|
|
|
15126
15937
|
registerMoltnetReflectCommand(pi, state);
|
|
15127
15938
|
}
|
|
15128
15939
|
//#endregion
|
|
15129
|
-
export { HOST_EXEC_DEFAULT_BASE_ENV, activateAgentEnv, createGondolinBashOps, createGondolinEditOps, createGondolinReadOps, createGondolinWriteOps, createMoltNetTools, createPiOtelExtension, createPiTaskExecutor, moltnetExtension as default, ensureSnapshot, executePiTask, findMainWorktree, injectTaskContext, loadCredentials, resumeVm, toGuestPath };
|
|
15940
|
+
export { HOST_EXEC_DEFAULT_BASE_ENV, activateAgentEnv, buildAgentSession, createGondolinBashOps, createGondolinEditOps, createGondolinReadOps, createGondolinWriteOps, createMoltNetTools, createPiOtelExtension, createPiTaskExecutor, createSubagentTool, moltnetExtension as default, ensureSnapshot, executePiTask, findMainWorktree, injectTaskContext, loadCredentials, resumeVm, toGuestPath };
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@themoltnet/pi-extension",
|
|
3
|
-
"version": "0.
|
|
3
|
+
"version": "0.15.1",
|
|
4
4
|
"type": "module",
|
|
5
5
|
"description": "MoltNet pi extension — sandboxed tool execution in Gondolin VMs with MoltNet identity and persistent memory",
|
|
6
6
|
"license": "MIT",
|
|
@@ -31,8 +31,8 @@
|
|
|
31
31
|
"@earendil-works/gondolin": "^0.9.1",
|
|
32
32
|
"@opentelemetry/api": "^1.9.0",
|
|
33
33
|
"@sinclair/typebox": "^0.34.0",
|
|
34
|
-
"@themoltnet/agent-runtime": "0.
|
|
35
|
-
"@themoltnet/sdk": "0.
|
|
34
|
+
"@themoltnet/agent-runtime": "0.14.0",
|
|
35
|
+
"@themoltnet/sdk": "0.101.0"
|
|
36
36
|
},
|
|
37
37
|
"peerDependencies": {
|
|
38
38
|
"@earendil-works/pi-coding-agent": ">=0.74.0",
|