@themoltnet/pi-extension 0.18.1 → 0.19.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +32 -1
- package/dist/index.d.ts +64 -5
- package/dist/index.js +532 -377
- package/package.json +3 -3
package/dist/index.js
CHANGED
|
@@ -2386,12 +2386,20 @@ var MoltNetError = class extends Error {
|
|
|
2386
2386
|
code;
|
|
2387
2387
|
statusCode;
|
|
2388
2388
|
detail;
|
|
2389
|
+
/**
|
|
2390
|
+
* Populated when the server returned a `VALIDATION_FAILED` problem
|
|
2391
|
+
* (status 400) with field-level errors. Empty / undefined for every
|
|
2392
|
+
* other problem kind. Imposer scripts surface these to operators so
|
|
2393
|
+
* they don't have to re-run with curl to see what was rejected.
|
|
2394
|
+
*/
|
|
2395
|
+
validationErrors;
|
|
2389
2396
|
constructor(message, options) {
|
|
2390
2397
|
super(message);
|
|
2391
2398
|
this.name = "MoltNetError";
|
|
2392
2399
|
this.code = options.code;
|
|
2393
2400
|
this.statusCode = options.statusCode;
|
|
2394
2401
|
this.detail = options.detail;
|
|
2402
|
+
this.validationErrors = options.validationErrors;
|
|
2395
2403
|
}
|
|
2396
2404
|
};
|
|
2397
2405
|
var NetworkError = class extends MoltNetError {
|
|
@@ -2415,10 +2423,14 @@ var AuthenticationError = class extends MoltNetError {
|
|
|
2415
2423
|
};
|
|
2416
2424
|
function problemToError(problem, statusCode) {
|
|
2417
2425
|
const title = problem.title ?? "Request failed";
|
|
2418
|
-
|
|
2426
|
+
const message = problem.detail ? `${title}: ${problem.detail}` : title;
|
|
2427
|
+
const rawErrors = problem.errors;
|
|
2428
|
+
const validationErrors = Array.isArray(rawErrors) ? rawErrors.filter((e) => typeof e === "object" && e !== null && typeof e.field === "string" && typeof e.message === "string") : void 0;
|
|
2429
|
+
return new MoltNetError(message, {
|
|
2419
2430
|
code: problem.type ?? problem.code ?? "UNKNOWN",
|
|
2420
2431
|
statusCode,
|
|
2421
|
-
detail: problem.detail
|
|
2432
|
+
detail: problem.detail,
|
|
2433
|
+
validationErrors
|
|
2422
2434
|
});
|
|
2423
2435
|
}
|
|
2424
2436
|
//#endregion
|
|
@@ -7767,6 +7779,41 @@ function createMoltNetTools(config) {
|
|
|
7767
7779
|
};
|
|
7768
7780
|
}
|
|
7769
7781
|
});
|
|
7782
|
+
const listTaskMessages = defineTool({
|
|
7783
|
+
name: "moltnet_list_task_messages",
|
|
7784
|
+
label: "List MoltNet Task Attempt Messages",
|
|
7785
|
+
description: "List messages for a specific task attempt. Use this when you need the turn-by-turn execution record behind an accepted attempt — tool calls, text deltas, and error/info events that do not appear in the attempt output alone.",
|
|
7786
|
+
parameters: Type.Object({
|
|
7787
|
+
taskId: Type.String({ description: "Task ID (UUID)." }),
|
|
7788
|
+
attemptN: Type.Integer({
|
|
7789
|
+
minimum: 1,
|
|
7790
|
+
description: "Attempt number to inspect."
|
|
7791
|
+
}),
|
|
7792
|
+
afterSeq: Type.Optional(Type.Integer({
|
|
7793
|
+
minimum: 0,
|
|
7794
|
+
description: "Optional cursor: only return messages with seq > afterSeq."
|
|
7795
|
+
})),
|
|
7796
|
+
limit: Type.Optional(Type.Integer({
|
|
7797
|
+
minimum: 1,
|
|
7798
|
+
maximum: 500,
|
|
7799
|
+
description: "Optional maximum messages to return. Defaults to the API value."
|
|
7800
|
+
}))
|
|
7801
|
+
}),
|
|
7802
|
+
async execute(_id, params) {
|
|
7803
|
+
const { agent } = ensureConnected(config);
|
|
7804
|
+
const messages = await agent.tasks.listMessages(params.taskId, params.attemptN, {
|
|
7805
|
+
afterSeq: params.afterSeq,
|
|
7806
|
+
limit: params.limit
|
|
7807
|
+
});
|
|
7808
|
+
return {
|
|
7809
|
+
content: [{
|
|
7810
|
+
type: "text",
|
|
7811
|
+
text: JSON.stringify(messages, null, 2)
|
|
7812
|
+
}],
|
|
7813
|
+
details: {}
|
|
7814
|
+
};
|
|
7815
|
+
}
|
|
7816
|
+
});
|
|
7770
7817
|
const reviewSessionErrors = defineTool({
|
|
7771
7818
|
name: "moltnet_review_session_errors",
|
|
7772
7819
|
label: "Review Session Tool Errors",
|
|
@@ -7815,6 +7862,7 @@ function createMoltNetTools(config) {
|
|
|
7815
7862
|
createEntry,
|
|
7816
7863
|
getTask,
|
|
7817
7864
|
listTaskAttempts,
|
|
7865
|
+
listTaskMessages,
|
|
7818
7866
|
reviewSessionErrors,
|
|
7819
7867
|
defineTool({
|
|
7820
7868
|
name: "moltnet_host_exec",
|
|
@@ -8113,6 +8161,12 @@ var GUEST_WORKSPACE$2 = "/workspace";
|
|
|
8113
8161
|
* investigation and the alternatives we rejected.
|
|
8114
8162
|
*/
|
|
8115
8163
|
var GUEST_TASK_SKILLS_MOUNT = "/moltnet-task-skills";
|
|
8164
|
+
function shouldRunResumeCommand(entry, ctx) {
|
|
8165
|
+
if (typeof entry === "string") return true;
|
|
8166
|
+
const workspaceModes = entry.when?.workspaceMode;
|
|
8167
|
+
if (workspaceModes && !workspaceModes.includes(ctx.workspaceMode)) return false;
|
|
8168
|
+
return true;
|
|
8169
|
+
}
|
|
8116
8170
|
/**
|
|
8117
8171
|
* Resolve the main worktree root (where .moltnet/ lives — it's untracked,
|
|
8118
8172
|
* only exists in the main worktree, not in git worktrees).
|
|
@@ -8258,6 +8312,7 @@ async function resumeVm(config) {
|
|
|
8258
8312
|
...envOverrides
|
|
8259
8313
|
};
|
|
8260
8314
|
const resources = config.sandboxConfig?.resources;
|
|
8315
|
+
const workspaceMode = config.workspaceMode ?? "shared_mount";
|
|
8261
8316
|
const vm = await VmCheckpoint.load(config.checkpointPath).resume({
|
|
8262
8317
|
httpHooks,
|
|
8263
8318
|
env: vmEnv,
|
|
@@ -8276,7 +8331,32 @@ async function resumeVm(config) {
|
|
|
8276
8331
|
'`);
|
|
8277
8332
|
await vmRun(vm, "DNS resolvers", `printf 'nameserver 8.8.8.8\\nnameserver 1.1.1.1\\n' > /etc/resolv.conf`);
|
|
8278
8333
|
await vmRun(vm, "git safe.directory", `git config --system --add safe.directory '*'`);
|
|
8279
|
-
for (const [i,
|
|
8334
|
+
for (const [i, entry] of (config.sandboxConfig?.resumeCommands ?? []).entries()) {
|
|
8335
|
+
if (!shouldRunResumeCommand(entry, { workspaceMode })) continue;
|
|
8336
|
+
const { run, retries, backoffMs } = typeof entry === "string" ? {
|
|
8337
|
+
run: entry,
|
|
8338
|
+
retries: 0,
|
|
8339
|
+
backoffMs: 2e3
|
|
8340
|
+
} : {
|
|
8341
|
+
run: entry.run,
|
|
8342
|
+
retries: entry.retries ?? 0,
|
|
8343
|
+
backoffMs: entry.retryBackoffMs ?? 2e3
|
|
8344
|
+
};
|
|
8345
|
+
const label = `resumeCommands[${i}]`;
|
|
8346
|
+
let lastErr;
|
|
8347
|
+
for (let attempt = 0; attempt <= retries; attempt++) try {
|
|
8348
|
+
await vmRun(vm, label, run);
|
|
8349
|
+
lastErr = void 0;
|
|
8350
|
+
break;
|
|
8351
|
+
} catch (err) {
|
|
8352
|
+
lastErr = err;
|
|
8353
|
+
if (attempt === retries) break;
|
|
8354
|
+
await new Promise((resolve) => {
|
|
8355
|
+
setTimeout(resolve, (attempt + 1) * backoffMs);
|
|
8356
|
+
});
|
|
8357
|
+
}
|
|
8358
|
+
if (lastErr) throw lastErr instanceof Error ? lastErr : new Error(String(lastErr));
|
|
8359
|
+
}
|
|
8280
8360
|
const vmSshDir = `${vmAgentDir}/ssh`;
|
|
8281
8361
|
await vm.exec(`mkdir -p ${vmAgentDir}/ssh /home/agent/.pi/agent`);
|
|
8282
8362
|
if (creds.piAuthJson !== null) await vm.fs.writeFile("/home/agent/.pi/agent/auth.json", creds.piAuthJson, { mode: 384 });
|
|
@@ -8655,7 +8735,8 @@ async function buildAgentSession(args) {
|
|
|
8655
8735
|
await resourceLoader.reload();
|
|
8656
8736
|
const sessionManager = args.sessionPersistence ? await resolvePersistentSessionManager({
|
|
8657
8737
|
cwd: args.cwdPath,
|
|
8658
|
-
sessionDir: args.sessionPersistence.sessionDir
|
|
8738
|
+
sessionDir: args.sessionPersistence.sessionDir,
|
|
8739
|
+
forkFromSessionPath: args.sessionPersistence.forkFromSessionPath
|
|
8659
8740
|
}) : SessionManager.inMemory(args.cwdPath);
|
|
8660
8741
|
return (await createAgentSession({
|
|
8661
8742
|
agentDir: args.piAuthDir,
|
|
@@ -8667,6 +8748,7 @@ async function buildAgentSession(args) {
|
|
|
8667
8748
|
})).session;
|
|
8668
8749
|
}
|
|
8669
8750
|
async function resolvePersistentSessionManager(args) {
|
|
8751
|
+
if (args.forkFromSessionPath) return SessionManager.forkFrom(args.forkFromSessionPath, args.cwd, args.sessionDir);
|
|
8670
8752
|
await SessionManager.list(args.cwd, args.sessionDir);
|
|
8671
8753
|
return SessionManager.continueRecent(args.cwd, args.sessionDir);
|
|
8672
8754
|
}
|
|
@@ -8683,6 +8765,11 @@ var PROMPT_SEPARATOR = "\n\n---\n\n";
|
|
|
8683
8765
|
* - `skill` → `deliver.skill({ slug, content })` once per ref.
|
|
8684
8766
|
* Slug collisions on distinct contents are
|
|
8685
8767
|
* refused loudly.
|
|
8768
|
+
* - `context_inline`→ persist raw bytes via `deliver.contextFile(...)`
|
|
8769
|
+
* and inject them into the prompt in an explicit,
|
|
8770
|
+
* named block. Intended for eval/context experiments
|
|
8771
|
+
* where the content must be in the model context
|
|
8772
|
+
* window, not merely discoverable as a skill.
|
|
8686
8773
|
* - `prompt_prefix` → content appended to `systemPromptPrefix` with
|
|
8687
8774
|
* the canonical `\n\n---\n\n` separator (in
|
|
8688
8775
|
* declared order).
|
|
@@ -8715,6 +8802,13 @@ async function resolveTaskContext(args) {
|
|
|
8715
8802
|
slug: ref.slug,
|
|
8716
8803
|
content: ref.content
|
|
8717
8804
|
});
|
|
8805
|
+
} else if (ref.binding === "context_inline") {
|
|
8806
|
+
await args.deliver.contextFile({
|
|
8807
|
+
slug: ref.slug,
|
|
8808
|
+
content: ref.content,
|
|
8809
|
+
suggestedFileName: `${ref.slug}.md`
|
|
8810
|
+
});
|
|
8811
|
+
promptParts.push(formatInlineContextBlock(ref.slug, ref.content));
|
|
8718
8812
|
} else if (ref.binding === "prompt_prefix") promptParts.push(ref.content);
|
|
8719
8813
|
else userParts.push(ref.content);
|
|
8720
8814
|
injected.push(ref);
|
|
@@ -8725,6 +8819,23 @@ async function resolveTaskContext(args) {
|
|
|
8725
8819
|
userInlineSuffix: userParts.join(PROMPT_SEPARATOR)
|
|
8726
8820
|
};
|
|
8727
8821
|
}
|
|
8822
|
+
function formatInlineContextBlock(slug, content) {
|
|
8823
|
+
return [
|
|
8824
|
+
"### Injected Task Context",
|
|
8825
|
+
"",
|
|
8826
|
+
`Context id: \`${slug}\``,
|
|
8827
|
+
"The following raw context was supplied by the task creator. Treat it",
|
|
8828
|
+
"as task-relevant background that may override generic coding instincts",
|
|
8829
|
+
"when it contains repo- or workflow-specific constraints.",
|
|
8830
|
+
"The same content is also materialized in the workspace as",
|
|
8831
|
+
"`/workspace/context-pack.md` and mirrored in `AGENTS.md` for",
|
|
8832
|
+
"repo-context discovery.",
|
|
8833
|
+
"",
|
|
8834
|
+
"<context>",
|
|
8835
|
+
content,
|
|
8836
|
+
"</context>"
|
|
8837
|
+
].join("\n");
|
|
8838
|
+
}
|
|
8728
8839
|
//#endregion
|
|
8729
8840
|
//#region ../tasks/src/formats.ts
|
|
8730
8841
|
/**
|
|
@@ -8748,6 +8859,7 @@ if (!FormatRegistry.Has("date-time")) FormatRegistry.Set("date-time", (v) => !Nu
|
|
|
8748
8859
|
*/
|
|
8749
8860
|
var ContextBinding = Type$1.Union([
|
|
8750
8861
|
Type$1.Literal("skill"),
|
|
8862
|
+
Type$1.Literal("context_inline"),
|
|
8751
8863
|
Type$1.Literal("prompt_prefix"),
|
|
8752
8864
|
Type$1.Literal("user_inline")
|
|
8753
8865
|
], { $id: "ContextBinding" });
|
|
@@ -8764,9 +8876,14 @@ var ContextBinding = Type$1.Union([
|
|
|
8764
8876
|
* name under the runtime's skill discovery path. Must be
|
|
8765
8877
|
* kebab-case-safe (alphanumeric + dashes/underscores).
|
|
8766
8878
|
* - `binding` — how the bytes are delivered to the LLM (see above).
|
|
8767
|
-
* - `content` — the actual bytes (UTF-8 text). Capped at
|
|
8879
|
+
* - `content` — the actual bytes (UTF-8 text). Capped at 64 KiB per
|
|
8768
8880
|
* entry; total per-task context bytes are bounded by the
|
|
8769
8881
|
* soft `maxItems` cap and per-binding daemon limits.
|
|
8882
|
+
* Raised from 32 KiB in 2026-05 — protocol-heavy operator
|
|
8883
|
+
* skills (e.g. `.claude/skills/legreffier/SKILL.md`) ship
|
|
8884
|
+
* at ~35 KiB inline, and the original cap was sized for
|
|
8885
|
+
* short example skills, not the kind of skill the eval
|
|
8886
|
+
* substrate is dogfooded on (#943, #823).
|
|
8770
8887
|
*/
|
|
8771
8888
|
var ContextRef = Type$1.Object({
|
|
8772
8889
|
slug: Type$1.String({
|
|
@@ -8777,7 +8894,7 @@ var ContextRef = Type$1.Object({
|
|
|
8777
8894
|
binding: ContextBinding,
|
|
8778
8895
|
content: Type$1.String({
|
|
8779
8896
|
minLength: 1,
|
|
8780
|
-
maxLength:
|
|
8897
|
+
maxLength: 65536
|
|
8781
8898
|
})
|
|
8782
8899
|
}, {
|
|
8783
8900
|
$id: "ContextRef",
|
|
@@ -9341,61 +9458,33 @@ async function validateJudgePackInputAsync(input, ctx) {
|
|
|
9341
9458
|
return errors;
|
|
9342
9459
|
}
|
|
9343
9460
|
//#endregion
|
|
9344
|
-
//#region ../tasks/src/task-types/judge-eval-
|
|
9461
|
+
//#region ../tasks/src/task-types/judge-eval-attempt.ts
|
|
9345
9462
|
/**
|
|
9346
|
-
* `
|
|
9347
|
-
*
|
|
9348
|
-
* isolation.
|
|
9463
|
+
* `judge_eval_attempt` — score one completed `run_eval` attempt against a
|
|
9464
|
+
* hidden judge rubric.
|
|
9349
9465
|
*
|
|
9350
9466
|
* output_kind: judgment
|
|
9351
|
-
* criteria: required (`successCriteria.rubric`
|
|
9352
|
-
*
|
|
9353
|
-
*
|
|
9354
|
-
* pin the targets being graded.
|
|
9355
|
-
*
|
|
9356
|
-
* Slice 2 of #943. The parent task carries the rubric and the list of
|
|
9357
|
-
* variant `run_eval` task ids. The pi executor registers the generic
|
|
9358
|
-
* `subagent` custom tool (#1087), and the parent LLM calls
|
|
9359
|
-
* `subagent({ task, output_schema: 'judge_eval_variant_result' })` once
|
|
9360
|
-
* per variant — each child session has fresh context, fetches the
|
|
9361
|
-
* variant's accepted attempt output via `moltnet_get_task` /
|
|
9362
|
-
* `moltnet_list_task_attempts`, and grades against the rubric.
|
|
9467
|
+
* criteria: required (`successCriteria.rubric`)
|
|
9468
|
+
* references: not required at the input layer — `targetTaskId` +
|
|
9469
|
+
* `targetAttemptN` pin the producer attempt being judged.
|
|
9363
9470
|
*
|
|
9364
|
-
*
|
|
9365
|
-
*
|
|
9366
|
-
*
|
|
9367
|
-
*
|
|
9368
|
-
|
|
9369
|
-
|
|
9370
|
-
|
|
9371
|
-
|
|
9372
|
-
|
|
9373
|
-
* which the task service runs at create time (#1096 wiring). The
|
|
9374
|
-
* TypeBox layer here only enforces shape: UUID format,
|
|
9375
|
-
* minItems/maxItems, rubric presence + weight invariant.
|
|
9376
|
-
*/
|
|
9377
|
-
var JUDGE_EVAL_VARIANT_TYPE = "judge_eval_variant";
|
|
9378
|
-
var JudgeEvalVariantInput = Type$1.Object({
|
|
9379
|
-
runTaskIds: Type$1.Array(Type$1.String({ format: "uuid" }), {
|
|
9380
|
-
minItems: 2,
|
|
9381
|
-
maxItems: 10
|
|
9382
|
-
}),
|
|
9471
|
+
* This replaces the earlier parent/subagent `judge_eval_variant` design.
|
|
9472
|
+
* The unit of judgment is one producer attempt. Cross-variant deltas can be
|
|
9473
|
+
* computed later at read time from stored scores, rather than materialized as
|
|
9474
|
+
* their own task output.
|
|
9475
|
+
*/
|
|
9476
|
+
var JUDGE_EVAL_ATTEMPT_TYPE = "judge_eval_attempt";
|
|
9477
|
+
var JudgeEvalAttemptInput = Type$1.Object({
|
|
9478
|
+
targetTaskId: Type$1.String({ format: "uuid" }),
|
|
9479
|
+
targetAttemptN: Type$1.Integer({ minimum: 1 }),
|
|
9383
9480
|
successCriteria: SuccessCriteria
|
|
9384
9481
|
}, {
|
|
9385
|
-
$id: "
|
|
9482
|
+
$id: "JudgeEvalAttemptInput",
|
|
9386
9483
|
additionalProperties: false
|
|
9387
9484
|
});
|
|
9388
|
-
|
|
9389
|
-
|
|
9390
|
-
|
|
9391
|
-
* deterministic_*). Reuse the type rather than re-declare.
|
|
9392
|
-
*
|
|
9393
|
-
* This is also the **subagent output contract** — the parent's
|
|
9394
|
-
* `subagent` tool resolves the contract name `judge_eval_variant_result`
|
|
9395
|
-
* to this schema. See `agent-runtime`'s subagent contract registry.
|
|
9396
|
-
*/
|
|
9397
|
-
var JudgeEvalVariantResult = Type$1.Object({
|
|
9398
|
-
runTaskId: Type$1.String({ format: "uuid" }),
|
|
9485
|
+
var JudgeEvalAttemptOutput = Type$1.Object({
|
|
9486
|
+
targetTaskId: Type$1.String({ format: "uuid" }),
|
|
9487
|
+
targetAttemptN: Type$1.Integer({ minimum: 1 }),
|
|
9399
9488
|
variantLabel: Type$1.String({
|
|
9400
9489
|
minLength: 1,
|
|
9401
9490
|
maxLength: 64,
|
|
@@ -9406,216 +9495,126 @@ var JudgeEvalVariantResult = Type$1.Object({
|
|
|
9406
9495
|
minimum: 0,
|
|
9407
9496
|
maximum: 1
|
|
9408
9497
|
}),
|
|
9409
|
-
verdict: Type$1.String({ minLength: 1 })
|
|
9410
|
-
}, {
|
|
9411
|
-
$id: "JudgeEvalVariantResult",
|
|
9412
|
-
additionalProperties: false
|
|
9413
|
-
});
|
|
9414
|
-
var JudgeEvalVariantOutput = Type$1.Object({
|
|
9415
|
-
results: Type$1.Array(JudgeEvalVariantResult, { minItems: 2 }),
|
|
9416
|
-
deltas: Type$1.Optional(Type$1.Record(Type$1.String(), Type$1.Number({
|
|
9417
|
-
minimum: -1,
|
|
9418
|
-
maximum: 1
|
|
9419
|
-
}))),
|
|
9498
|
+
verdict: Type$1.String({ minLength: 1 }),
|
|
9420
9499
|
judgeModel: Type$1.Optional(Type$1.String({ minLength: 1 })),
|
|
9421
9500
|
traceparent: Type$1.String({ minLength: 1 })
|
|
9422
9501
|
}, {
|
|
9423
|
-
$id: "
|
|
9502
|
+
$id: "JudgeEvalAttemptOutput",
|
|
9424
9503
|
additionalProperties: false
|
|
9425
9504
|
});
|
|
9426
|
-
|
|
9427
|
-
* Synchronous input invariants beyond TypeBox shape: rubric must be
|
|
9428
|
-
* present (already required by the schema, but the rubric body has
|
|
9429
|
-
* its own per-criterion weight invariant) and the rubric's weights
|
|
9430
|
-
* must sum to 1.
|
|
9431
|
-
*
|
|
9432
|
-
* Cross-task invariants (all targets are `run_eval`, all completed,
|
|
9433
|
-
* share `correlation_id`, byte-identical `input.successCriteria`)
|
|
9434
|
-
* are NOT checked here — they require async DB lookups against
|
|
9435
|
-
* `runTaskIds` and live in `validateJudgeEvalVariantInputAsync`
|
|
9436
|
-
* below, invoked by the task service at create time (#1096).
|
|
9437
|
-
*/
|
|
9438
|
-
function validateJudgeEvalVariantInput(input) {
|
|
9505
|
+
function validateJudgeEvalAttemptInput(input) {
|
|
9439
9506
|
const sc = input.successCriteria;
|
|
9440
|
-
if (!sc) return "successCriteria is required for
|
|
9441
|
-
if (!sc.rubric) return "successCriteria.rubric is required for
|
|
9507
|
+
if (!sc) return "successCriteria is required for judge_eval_attempt";
|
|
9508
|
+
if (!sc.rubric) return "successCriteria.rubric is required for judge_eval_attempt";
|
|
9442
9509
|
return validateRubricWeights(sc.rubric);
|
|
9443
9510
|
}
|
|
9444
|
-
|
|
9445
|
-
* Output cross-field invariants the schema cannot express:
|
|
9446
|
-
*
|
|
9447
|
-
* 1. `results.length === input.runTaskIds.length` — every variant
|
|
9448
|
-
* the imposer asked for must be graded. Partial grading
|
|
9449
|
-
* invalidates cross-variant comparison; fail the whole task
|
|
9450
|
-
* rather than silently report a subset.
|
|
9451
|
-
*
|
|
9452
|
-
* 2. `results[i].runTaskId === input.runTaskIds[i]` — order is
|
|
9453
|
-
* load-bearing for downstream consumers (e.g. deltas keyed by
|
|
9454
|
-
* adjacent pairs). Mismatch is an LLM bug; reject loudly.
|
|
9455
|
-
*
|
|
9456
|
-
* 3. Each `result.scores` follows the same `llm_checklist` rule
|
|
9457
|
-
* `judge_pack` enforces (#999): if a score has an `assertions`
|
|
9458
|
-
* array, the numeric score MUST be `1` iff every assertion
|
|
9459
|
-
* passes. Inconsistent payloads pollute attestations.
|
|
9460
|
-
*
|
|
9461
|
-
* 4. Each `result.composite` MUST equal the rubric-weighted sum
|
|
9462
|
-
* `Σ(weight_j × scores[j].score)`. The parent (and any subagent
|
|
9463
|
-
* it delegated to) is supposed to compute this; surfacing a
|
|
9464
|
-
* drift here catches LLMs that hand-wave the arithmetic.
|
|
9465
|
-
*
|
|
9466
|
-
* 5. Optional `deltas` keys MUST be of the form `"A - B"` where
|
|
9467
|
-
* both `A` and `B` are variantLabels present in `results`.
|
|
9468
|
-
* Values are not range-checked (any float in [-1, 1] is
|
|
9469
|
-
* arithmetically possible).
|
|
9470
|
-
*/
|
|
9471
|
-
function validateJudgeEvalVariantOutput(output, input) {
|
|
9511
|
+
function validateJudgeEvalAttemptOutput(output, input) {
|
|
9472
9512
|
const out = output;
|
|
9473
9513
|
const inp = input;
|
|
9474
9514
|
if (inp) {
|
|
9475
|
-
if (out.
|
|
9476
|
-
|
|
9515
|
+
if (out.targetTaskId !== inp.targetTaskId) return `output.targetTaskId (${out.targetTaskId}) does not match input.targetTaskId (${inp.targetTaskId})`;
|
|
9516
|
+
if (out.targetAttemptN !== inp.targetAttemptN) return `output.targetAttemptN (${out.targetAttemptN}) does not match input.targetAttemptN (${inp.targetAttemptN})`;
|
|
9477
9517
|
}
|
|
9478
|
-
for (let
|
|
9479
|
-
const
|
|
9480
|
-
|
|
9481
|
-
|
|
9482
|
-
|
|
9483
|
-
|
|
9484
|
-
const expected = allPassed ? 1 : 0;
|
|
9485
|
-
if (sc.score !== expected) return `results[${r}].scores[${s}] (criterionId="${sc.criterionId}"): assertions ${allPassed ? "all pass" : "have at least one fail"} but score=${sc.score}. Score must be derived: 1 iff every assertion passes, else 0 (#999 llm_checklist rule).`;
|
|
9486
|
-
}
|
|
9518
|
+
for (let s = 0; s < out.scores.length; s++) {
|
|
9519
|
+
const sc = out.scores[s];
|
|
9520
|
+
if (!sc.assertions) continue;
|
|
9521
|
+
const allPassed = sc.assertions.every((a) => a.passed);
|
|
9522
|
+
const expected = allPassed ? 1 : 0;
|
|
9523
|
+
if (sc.score !== expected) return `scores[${s}] (criterionId="${sc.criterionId}"): assertions ${allPassed ? "all pass" : "have at least one fail"} but score=${sc.score}. Score must be 1 iff every assertion passes, else 0.`;
|
|
9487
9524
|
}
|
|
9488
9525
|
if (inp?.successCriteria?.rubric) {
|
|
9489
9526
|
const criteria = inp.successCriteria.rubric.criteria;
|
|
9490
9527
|
const weightById = new Map(criteria.map((c) => [c.id, c.weight]));
|
|
9491
|
-
|
|
9492
|
-
|
|
9493
|
-
|
|
9494
|
-
|
|
9495
|
-
|
|
9496
|
-
if (w === void 0) return `results[${r}].scores: criterionId "${sc.criterionId}" is not in the input rubric (known: ${Array.from(weightById.keys()).join(", ")}). Score every rubric criterion exactly once; do not invent new ids.`;
|
|
9497
|
-
sum += w * sc.score;
|
|
9498
|
-
}
|
|
9499
|
-
if (Math.abs(sum - result.composite) > .001) return `results[${r}].composite (${result.composite}) does not match Σ(weight × score) (${sum.toFixed(6)}). Composite must be the rubric-weighted sum of per-criterion scores (drift > 0.001).`;
|
|
9500
|
-
}
|
|
9501
|
-
}
|
|
9502
|
-
if (out.deltas) {
|
|
9503
|
-
const labels = new Set(out.results.map((r) => r.variantLabel));
|
|
9504
|
-
for (const key of Object.keys(out.deltas)) {
|
|
9505
|
-
const m = /^(.+?) - (.+)$/.exec(key);
|
|
9506
|
-
if (!m) return `deltas key "${key}" is not of the form "<variantLabel-A> - <variantLabel-B>". Use a single space-hyphen-space separator between labels.`;
|
|
9507
|
-
const [, a, b] = m;
|
|
9508
|
-
if (!labels.has(a) || !labels.has(b)) return `deltas key "${key}" references variantLabel(s) not present in results: ${!labels.has(a) ? `"${a}" missing` : ""}${!labels.has(a) && !labels.has(b) ? ", " : ""}${!labels.has(b) ? `"${b}" missing` : ""}`;
|
|
9528
|
+
let sum = 0;
|
|
9529
|
+
for (const sc of out.scores) {
|
|
9530
|
+
const w = weightById.get(sc.criterionId);
|
|
9531
|
+
if (w === void 0) return `scores references unknown criterionId "${sc.criterionId}"`;
|
|
9532
|
+
sum += w * sc.score;
|
|
9509
9533
|
}
|
|
9534
|
+
const rounded = Math.round(sum * 1e3) / 1e3;
|
|
9535
|
+
if (Math.abs(rounded - out.composite) > .001) return `composite (${out.composite}) does not match weighted rubric sum (${rounded})`;
|
|
9510
9536
|
}
|
|
9511
9537
|
return null;
|
|
9512
9538
|
}
|
|
9513
|
-
|
|
9514
|
-
|
|
9515
|
-
* equality. Recursively sorts object keys; arrays preserve order
|
|
9516
|
-
* (intentional — rubric criteria order is semantically meaningful).
|
|
9517
|
-
* Mirrors the canonical-JSON shape `crypto-service` uses for CIDs,
|
|
9518
|
-
* without taking on a crypto-service dep just for this comparison.
|
|
9519
|
-
*/
|
|
9520
|
-
function stableStringify(value) {
|
|
9521
|
-
if (value === null || typeof value !== "object") return JSON.stringify(value);
|
|
9522
|
-
if (Array.isArray(value)) return "[" + value.map(stableStringify).join(",") + "]";
|
|
9523
|
-
const obj = value;
|
|
9524
|
-
return "{" + Object.keys(obj).sort().map((k) => JSON.stringify(k) + ":" + stableStringify(obj[k])).join(",") + "}";
|
|
9525
|
-
}
|
|
9526
|
-
/**
|
|
9527
|
-
* Async preflight for `judge_eval_variant` (#1096 + #943):
|
|
9528
|
-
*
|
|
9529
|
-
* 1. Every `runTaskIds[i]` resolves to a task the caller can read.
|
|
9530
|
-
* 2. Every resolved task is `taskType === 'run_eval'`.
|
|
9531
|
-
* 3. Every resolved task is `status === 'completed'` with a
|
|
9532
|
-
* non-null `acceptedAttemptN` — grading an unaccepted attempt
|
|
9533
|
-
* races with re-attempts and pollutes the judge attestation.
|
|
9534
|
-
* 4. Every resolved task shares a non-null `correlationId`, and all
|
|
9535
|
-
* `correlationId`s are equal. Without this an imposer could
|
|
9536
|
-
* fabricate a "variant set" by stapling unrelated runs together.
|
|
9537
|
-
* 5. The shared `correlationId` is NOT already sealed. A previous
|
|
9538
|
-
* judge_eval_variant against the same group is final; produce a
|
|
9539
|
-
* fresh correlation_id for a new judging round rather than
|
|
9540
|
-
* adding contradictory verdicts to a sealed group.
|
|
9541
|
-
* 6. Every variant's `input.successCriteria` is byte-identical (via
|
|
9542
|
-
* stable-stringify). Different rubrics across "variants" makes
|
|
9543
|
-
* the comparison meaningless.
|
|
9544
|
-
*/
|
|
9545
|
-
async function validateJudgeEvalVariantInputAsync(input, ctx) {
|
|
9546
|
-
const { runTaskIds } = input;
|
|
9539
|
+
async function validateJudgeEvalAttemptInputAsync(input, ctx) {
|
|
9540
|
+
const inp = input;
|
|
9547
9541
|
const errors = [];
|
|
9548
|
-
const
|
|
9549
|
-
|
|
9550
|
-
|
|
9551
|
-
|
|
9552
|
-
|
|
9553
|
-
|
|
9554
|
-
|
|
9555
|
-
|
|
9556
|
-
|
|
9557
|
-
|
|
9558
|
-
|
|
9559
|
-
|
|
9560
|
-
}
|
|
9561
|
-
presentTargets.push(t);
|
|
9562
|
-
if (t.taskType !== "run_eval") errors.push({
|
|
9563
|
-
field: `runTaskIds[${i}]`,
|
|
9564
|
-
message: `runTaskIds[${i}]=${runTaskIds[i]} is a ${t.taskType}, not a run_eval`
|
|
9565
|
-
});
|
|
9566
|
-
if (t.status !== "completed" || t.acceptedAttemptN === null) errors.push({
|
|
9567
|
-
field: `runTaskIds[${i}]`,
|
|
9568
|
-
message: `runTaskIds[${i}]=${runTaskIds[i]} is not completed with an accepted attempt (status=${t.status}, acceptedAttemptN=${t.acceptedAttemptN})`
|
|
9569
|
-
});
|
|
9570
|
-
}
|
|
9571
|
-
if (missingTargets || presentTargets.length === 0) return errors;
|
|
9572
|
-
const correlationIds = new Set(presentTargets.map((t) => t.correlationId ?? "__null__"));
|
|
9573
|
-
if (correlationIds.has("__null__")) errors.push({
|
|
9574
|
-
field: "runTaskIds",
|
|
9575
|
-
message: "one or more run_eval targets have no correlation_id; cannot group as variants"
|
|
9542
|
+
const target = await ctx.resolveTask(inp.targetTaskId);
|
|
9543
|
+
if (!target) return [{
|
|
9544
|
+
field: "targetTaskId",
|
|
9545
|
+
message: `targetTaskId=${inp.targetTaskId} does not resolve to a task you can read`
|
|
9546
|
+
}];
|
|
9547
|
+
if (target.taskType !== "run_eval") errors.push({
|
|
9548
|
+
field: "targetTaskId",
|
|
9549
|
+
message: `targetTaskId=${inp.targetTaskId} is a ${target.taskType}, not a run_eval`
|
|
9550
|
+
});
|
|
9551
|
+
if (target.status !== "completed" || target.acceptedAttemptN === null) errors.push({
|
|
9552
|
+
field: "targetTaskId",
|
|
9553
|
+
message: `targetTaskId=${inp.targetTaskId} is not completed with an accepted attempt (status=${target.status}, acceptedAttemptN=${target.acceptedAttemptN})`
|
|
9576
9554
|
});
|
|
9577
|
-
if (
|
|
9578
|
-
field: "
|
|
9579
|
-
message: `
|
|
9555
|
+
else if (target.acceptedAttemptN !== inp.targetAttemptN) errors.push({
|
|
9556
|
+
field: "targetAttemptN",
|
|
9557
|
+
message: `targetAttemptN=${inp.targetAttemptN} does not match the producer's acceptedAttemptN=${target.acceptedAttemptN}`
|
|
9580
9558
|
});
|
|
9581
|
-
if (
|
|
9582
|
-
|
|
9583
|
-
|
|
9584
|
-
|
|
9585
|
-
if (
|
|
9586
|
-
|
|
9587
|
-
|
|
9559
|
+
if (!target.correlationId) errors.push({
|
|
9560
|
+
field: "targetTaskId",
|
|
9561
|
+
message: "target run_eval has no correlation_id; cannot enforce duplicate-judge protection"
|
|
9562
|
+
});
|
|
9563
|
+
if (errors.length > 0 || !target.correlationId) return errors;
|
|
9564
|
+
const rubric = inp.successCriteria.rubric;
|
|
9565
|
+
const duplicate = (await ctx.listTasksByCorrelation(target.correlationId)).find((task) => {
|
|
9566
|
+
if (task.taskType !== "judge_eval_attempt") return false;
|
|
9567
|
+
if (task.status === "failed" || task.status === "cancelled" || task.status === "expired") return false;
|
|
9568
|
+
const existing = task.input;
|
|
9569
|
+
const existingRubric = existing.successCriteria?.rubric;
|
|
9570
|
+
return existing.targetTaskId === inp.targetTaskId && existing.targetAttemptN === inp.targetAttemptN && existingRubric?.rubricId === rubric?.rubricId && existingRubric?.version === rubric?.version;
|
|
9571
|
+
});
|
|
9572
|
+
if (duplicate) errors.push({
|
|
9573
|
+
field: "targetTaskId",
|
|
9574
|
+
message: `judge task ${duplicate.id} already exists for (${inp.targetTaskId}, attempt ${inp.targetAttemptN}, rubric ${rubric?.rubricId}@${rubric?.version})`
|
|
9588
9575
|
});
|
|
9589
|
-
const first = stableStringify(presentTargets[0].input.successCriteria);
|
|
9590
|
-
for (let i = 1; i < presentTargets.length; i++) if (stableStringify(presentTargets[i].input.successCriteria) !== first) {
|
|
9591
|
-
errors.push({
|
|
9592
|
-
field: `runTaskIds[${i}]`,
|
|
9593
|
-
message: `runTaskIds[${i}] has a different input.successCriteria than runTaskIds[0]; all variants must share the rubric and gates`
|
|
9594
|
-
});
|
|
9595
|
-
break;
|
|
9596
|
-
}
|
|
9597
9576
|
return errors;
|
|
9598
9577
|
}
|
|
9599
|
-
|
|
9600
|
-
|
|
9601
|
-
|
|
9602
|
-
|
|
9603
|
-
* concurrent second `judge_eval_variant` against the same group
|
|
9604
|
-
* loses the race and is rejected with a clean conflict error.
|
|
9605
|
-
*
|
|
9606
|
-
* The seal applies to the SHARED correlation_id of the targets —
|
|
9607
|
-
* NOT to the judge task's own correlationId (which is typically
|
|
9608
|
-
* null or distinct). The task service derives the correlationId
|
|
9609
|
-
* for the effect from the resolved targets, not from the judge
|
|
9610
|
-
* task row.
|
|
9611
|
-
*/
|
|
9612
|
-
async function onCreateJudgeEvalVariant(input, ctx) {
|
|
9613
|
-
const { runTaskIds } = input;
|
|
9614
|
-
const first = await ctx.resolveTask(runTaskIds[0]);
|
|
9615
|
-
if (!first?.correlationId) return [];
|
|
9578
|
+
async function onCreateJudgeEvalAttempt(input, _ctx) {
|
|
9579
|
+
const judge = input;
|
|
9580
|
+
const rubric = judge.successCriteria.rubric;
|
|
9581
|
+
if (!rubric) return [];
|
|
9616
9582
|
return [{
|
|
9617
|
-
kind: "
|
|
9618
|
-
|
|
9583
|
+
kind: "guardTaskUniqueness",
|
|
9584
|
+
taskType: JUDGE_EVAL_ATTEMPT_TYPE,
|
|
9585
|
+
lockKey: [
|
|
9586
|
+
JUDGE_EVAL_ATTEMPT_TYPE,
|
|
9587
|
+
judge.targetTaskId,
|
|
9588
|
+
String(judge.targetAttemptN),
|
|
9589
|
+
rubric.rubricId,
|
|
9590
|
+
rubric.version
|
|
9591
|
+
].join(":"),
|
|
9592
|
+
inputMatches: [
|
|
9593
|
+
{
|
|
9594
|
+
path: ["targetTaskId"],
|
|
9595
|
+
value: judge.targetTaskId
|
|
9596
|
+
},
|
|
9597
|
+
{
|
|
9598
|
+
path: ["targetAttemptN"],
|
|
9599
|
+
value: judge.targetAttemptN
|
|
9600
|
+
},
|
|
9601
|
+
{
|
|
9602
|
+
path: [
|
|
9603
|
+
"successCriteria",
|
|
9604
|
+
"rubric",
|
|
9605
|
+
"rubricId"
|
|
9606
|
+
],
|
|
9607
|
+
value: rubric.rubricId
|
|
9608
|
+
},
|
|
9609
|
+
{
|
|
9610
|
+
path: [
|
|
9611
|
+
"successCriteria",
|
|
9612
|
+
"rubric",
|
|
9613
|
+
"version"
|
|
9614
|
+
],
|
|
9615
|
+
value: rubric.version
|
|
9616
|
+
}
|
|
9617
|
+
]
|
|
9619
9618
|
}];
|
|
9620
9619
|
}
|
|
9621
9620
|
//#endregion
|
|
@@ -9739,14 +9738,43 @@ async function validateRenderPackInputAsync(input, ctx) {
|
|
|
9739
9738
|
//#region ../tasks/src/task-types/run-eval.ts
|
|
9740
9739
|
/**
|
|
9741
9740
|
* `run_eval` — execute a scenario prompt under a named variant for
|
|
9742
|
-
* later
|
|
9741
|
+
* later per-attempt grading by `judge_eval_attempt` tasks.
|
|
9743
9742
|
*
|
|
9744
9743
|
* output_kind: artifact
|
|
9745
|
-
* criteria: optional (when set,
|
|
9746
|
-
*
|
|
9744
|
+
* criteria: optional producer-only checks (when set,
|
|
9745
|
+
* output.verification is required — the judge rubric remains hidden
|
|
9746
|
+
* on downstream `judge_eval_attempt` tasks)
|
|
9747
9747
|
* references: not required (scenario lives entirely in input)
|
|
9748
9748
|
*/
|
|
9749
9749
|
var RUN_EVAL_TYPE = "run_eval";
|
|
9750
|
+
var RunEvalMode = Type$1.Union([Type$1.Literal("vitro"), Type$1.Literal("vivo")], { $id: "RunEvalMode" });
|
|
9751
|
+
var RunEvalWorkspace = Type$1.Union([
|
|
9752
|
+
Type$1.Literal("none"),
|
|
9753
|
+
Type$1.Literal("shared_mount"),
|
|
9754
|
+
Type$1.Literal("dedicated_worktree")
|
|
9755
|
+
], { $id: "RunEvalWorkspace" });
|
|
9756
|
+
var RunEvalExecution = Type$1.Object({
|
|
9757
|
+
mode: RunEvalMode,
|
|
9758
|
+
workspace: RunEvalWorkspace
|
|
9759
|
+
}, {
|
|
9760
|
+
$id: "RunEvalExecution",
|
|
9761
|
+
additionalProperties: false
|
|
9762
|
+
});
|
|
9763
|
+
/**
|
|
9764
|
+
* Producer-visible checks for `run_eval`. Deliberately forbids `rubric`
|
|
9765
|
+
* so the variant runner cannot see the downstream judge's answer key.
|
|
9766
|
+
* Keep the rest of the SuccessCriteria envelope available for generic
|
|
9767
|
+
* process / structure checks (`gates`, `assertions`, `sideEffects`).
|
|
9768
|
+
*/
|
|
9769
|
+
var RunEvalSuccessCriteria = Type$1.Object({
|
|
9770
|
+
version: Type$1.Literal(1),
|
|
9771
|
+
gates: Type$1.Optional(SuccessCriteria.properties.gates),
|
|
9772
|
+
assertions: Type$1.Optional(SuccessCriteria.properties.assertions),
|
|
9773
|
+
sideEffects: Type$1.Optional(SuccessCriteria.properties.sideEffects)
|
|
9774
|
+
}, {
|
|
9775
|
+
$id: "RunEvalSuccessCriteria",
|
|
9776
|
+
additionalProperties: false
|
|
9777
|
+
});
|
|
9750
9778
|
var RunEvalInput = Type$1.Object({
|
|
9751
9779
|
scenario: Type$1.Object({
|
|
9752
9780
|
prompt: Type$1.String({ minLength: 1 }),
|
|
@@ -9756,8 +9784,9 @@ var RunEvalInput = Type$1.Object({
|
|
|
9756
9784
|
minLength: 1,
|
|
9757
9785
|
maxLength: 64
|
|
9758
9786
|
}),
|
|
9787
|
+
execution: RunEvalExecution,
|
|
9759
9788
|
context: TaskContext,
|
|
9760
|
-
successCriteria: Type$1.Optional(
|
|
9789
|
+
successCriteria: Type$1.Optional(RunEvalSuccessCriteria)
|
|
9761
9790
|
}, {
|
|
9762
9791
|
$id: "RunEvalInput",
|
|
9763
9792
|
additionalProperties: false
|
|
@@ -9785,8 +9814,8 @@ var RunEvalOutput = Type$1.Object({
|
|
|
9785
9814
|
function validateRunEvalOutput(output, input) {
|
|
9786
9815
|
const hasCriteria = input !== null && input !== void 0 && input.successCriteria !== void 0;
|
|
9787
9816
|
const hasVerification = output !== null && output !== void 0 && output.verification !== void 0;
|
|
9788
|
-
if (hasCriteria && !hasVerification) return "output.verification is required because input.successCriteria is set; the producer LLM must self-assess against the
|
|
9789
|
-
if (!hasCriteria && hasVerification) return "output.verification was supplied but input.successCriteria is unset; omit verification when there are no
|
|
9817
|
+
if (hasCriteria && !hasVerification) return "output.verification is required because input.successCriteria is set; the producer LLM must self-assess against the producer checks";
|
|
9818
|
+
if (!hasCriteria && hasVerification) return "output.verification was supplied but input.successCriteria is unset; omit verification when there are no producer checks to assess against";
|
|
9790
9819
|
return null;
|
|
9791
9820
|
}
|
|
9792
9821
|
//#endregion
|
|
@@ -9902,24 +9931,24 @@ var BUILT_IN_TASK_TYPES = {
|
|
|
9902
9931
|
inputSchema: RunEvalInput,
|
|
9903
9932
|
outputSchema: RunEvalOutput,
|
|
9904
9933
|
outputKind: "artifact",
|
|
9905
|
-
|
|
9934
|
+
resumable: true,
|
|
9935
|
+
workspaceScope: "session",
|
|
9906
9936
|
sessionScope: "custom",
|
|
9907
9937
|
requiresReferences: false,
|
|
9908
9938
|
validateOutput: validateRunEvalOutput
|
|
9909
9939
|
},
|
|
9910
|
-
[
|
|
9911
|
-
name:
|
|
9912
|
-
inputSchema:
|
|
9913
|
-
outputSchema:
|
|
9940
|
+
[JUDGE_EVAL_ATTEMPT_TYPE]: {
|
|
9941
|
+
name: JUDGE_EVAL_ATTEMPT_TYPE,
|
|
9942
|
+
inputSchema: JudgeEvalAttemptInput,
|
|
9943
|
+
outputSchema: JudgeEvalAttemptOutput,
|
|
9914
9944
|
outputKind: "judgment",
|
|
9915
9945
|
workspaceScope: "attempt",
|
|
9916
|
-
sessionScope: "
|
|
9946
|
+
sessionScope: "none",
|
|
9917
9947
|
requiresReferences: false,
|
|
9918
|
-
validateInput:
|
|
9919
|
-
validateOutput:
|
|
9920
|
-
validateInputAsync:
|
|
9921
|
-
onCreate:
|
|
9922
|
-
usesSubagents: true
|
|
9948
|
+
validateInput: validateJudgeEvalAttemptInput,
|
|
9949
|
+
validateOutput: validateJudgeEvalAttemptOutput,
|
|
9950
|
+
validateInputAsync: validateJudgeEvalAttemptInputAsync,
|
|
9951
|
+
onCreate: onCreateJudgeEvalAttempt
|
|
9923
9952
|
}
|
|
9924
9953
|
};
|
|
9925
9954
|
//#endregion
|
|
@@ -10283,20 +10312,16 @@ function buildFinalOutputBlock(opts) {
|
|
|
10283
10312
|
"## Final output (read this carefully)",
|
|
10284
10313
|
"",
|
|
10285
10314
|
`Your VERY LAST action in this conversation MUST report the structured`,
|
|
10286
|
-
`output matching \`${outputSchemaName}
|
|
10287
|
-
`preference:`,
|
|
10315
|
+
`output matching \`${outputSchemaName}\`.`,
|
|
10288
10316
|
"",
|
|
10289
|
-
`
|
|
10290
|
-
`
|
|
10291
|
-
`
|
|
10292
|
-
`
|
|
10293
|
-
`
|
|
10294
|
-
` \`${outputSchemaName}\`. No prose before or after. No code fences.`,
|
|
10295
|
-
` No "ok" or "done". The runtime parses the last balanced top-level`,
|
|
10296
|
-
` JSON object as the output.`,
|
|
10317
|
+
`Call \`${submitTool}\` exactly once with the payload.`,
|
|
10318
|
+
`The runtime captures the validated arguments and ends the session.`,
|
|
10319
|
+
`Do NOT emit the output as plain assistant text. Do NOT rely on a`,
|
|
10320
|
+
`JSON-in-message fallback. If you do not call \`${submitTool}\`, the`,
|
|
10321
|
+
`attempt fails even if the underlying work succeeded.`,
|
|
10297
10322
|
"",
|
|
10298
|
-
`
|
|
10299
|
-
`
|
|
10323
|
+
`Your final assistant text before that tool call may explain your work,`,
|
|
10324
|
+
`but the submit-tool call itself must be your VERY LAST action.`,
|
|
10300
10325
|
"",
|
|
10301
10326
|
`Output shape:`,
|
|
10302
10327
|
"",
|
|
@@ -10434,21 +10459,30 @@ function buildAssessBriefUserPrompt(input, ctx) {
|
|
|
10434
10459
|
}
|
|
10435
10460
|
//#endregion
|
|
10436
10461
|
//#region ../agent-runtime/src/prompts/self-verification.ts
|
|
10437
|
-
function buildSelfVerificationBlock(taskId) {
|
|
10462
|
+
function buildSelfVerificationBlock(taskId, criteriaField = "successCriteria") {
|
|
10438
10463
|
return [
|
|
10439
10464
|
"## Self-verification",
|
|
10440
10465
|
"",
|
|
10441
|
-
`
|
|
10466
|
+
`If \`input.${criteriaField}\` is set on this task, your final output MUST`,
|
|
10467
|
+
"include a `verification` block. **The runtime/server rejects task",
|
|
10468
|
+
`submission without \`verification\` when \`${criteriaField}\` is present**`,
|
|
10469
|
+
"— the request fails validation and the attempt is discarded, even if the",
|
|
10470
|
+
"underlying work succeeded. Do not call the submit tool until you have",
|
|
10471
|
+
"computed the verification payload.",
|
|
10442
10472
|
"",
|
|
10443
|
-
|
|
10473
|
+
`Call \`moltnet_get_task\` with task id \`${taskId}\` and read \`input.${criteriaField}\`.`,
|
|
10474
|
+
"",
|
|
10475
|
+
`- If \`input.${criteriaField}\` is **absent**, omit \`verification\` from your`,
|
|
10444
10476
|
" final output entirely.",
|
|
10445
|
-
|
|
10446
|
-
" `verification` block in your final output. Evaluate every applicable",
|
|
10477
|
+
`- If \`input.${criteriaField}\` is **present**, evaluate every applicable`,
|
|
10447
10478
|
" item — `gates`, `assertions`, `rubric` criteria, `sideEffects` — against",
|
|
10448
10479
|
" your produced work and emit one result per id. Be honest: a `fail` with",
|
|
10449
10480
|
" a one-line reason is more useful than a false `pass`. Use `skip` (with a",
|
|
10450
10481
|
" `detail`) when you genuinely could not determine a result. Compute",
|
|
10451
10482
|
" `passed = results.every(r => r.status !== 'fail')`.",
|
|
10483
|
+
"- `verification` MUST be a JSON object. Never send a string, markdown",
|
|
10484
|
+
" block, null, or an empty placeholder. The submit tool expects an object",
|
|
10485
|
+
" with `inputCid`, `results`, and `passed` fields.",
|
|
10452
10486
|
"",
|
|
10453
10487
|
"Verification shape:",
|
|
10454
10488
|
"",
|
|
@@ -10462,6 +10496,23 @@ function buildSelfVerificationBlock(taskId) {
|
|
|
10462
10496
|
" \"passed\": <boolean>",
|
|
10463
10497
|
"}",
|
|
10464
10498
|
"```",
|
|
10499
|
+
"",
|
|
10500
|
+
"Minimal valid example:",
|
|
10501
|
+
"",
|
|
10502
|
+
"```json",
|
|
10503
|
+
"{",
|
|
10504
|
+
" \"inputCid\": \"<task inputCid>\",",
|
|
10505
|
+
" \"results\": [",
|
|
10506
|
+
" {",
|
|
10507
|
+
" \"id\": \"<criterion id>\",",
|
|
10508
|
+
" \"kind\": \"rubric\",",
|
|
10509
|
+
" \"status\": \"pass\",",
|
|
10510
|
+
" \"detail\": \"one-line reason\"",
|
|
10511
|
+
" }",
|
|
10512
|
+
" ],",
|
|
10513
|
+
" \"passed\": true",
|
|
10514
|
+
"}",
|
|
10515
|
+
"```",
|
|
10465
10516
|
""
|
|
10466
10517
|
].join("\n");
|
|
10467
10518
|
}
|
|
@@ -10712,69 +10763,62 @@ function buildFulfillBriefUserPrompt(input, ctx) {
|
|
|
10712
10763
|
].filter(Boolean).join("\n");
|
|
10713
10764
|
}
|
|
10714
10765
|
//#endregion
|
|
10715
|
-
//#region ../agent-runtime/src/prompts/judge-eval-
|
|
10716
|
-
|
|
10717
|
-
|
|
10718
|
-
|
|
10719
|
-
*
|
|
10720
|
-
* The parent agent's job is **fan-out-and-collect**: for each
|
|
10721
|
-
* `runTaskIds[i]`, spawn an isolated subagent via the `subagent` custom
|
|
10722
|
-
* tool (#1087), have it grade that variant against the shared rubric,
|
|
10723
|
-
* and collect each subagent's structured `judge_eval_variant_result`
|
|
10724
|
-
* payload. The parent does NOT grade itself; it composes the per-
|
|
10725
|
-
* variant results into the final `judge_eval_variant` output (results
|
|
10726
|
-
* array + optional deltas + verdicts).
|
|
10727
|
-
*
|
|
10728
|
-
* Isolation is the point: each variant gets a fresh subagent session
|
|
10729
|
-
* with no carryover context from sibling variants, so per-variant
|
|
10730
|
-
* grading is independent. Cost is bounded by `maxItems: 10` on
|
|
10731
|
-
* runTaskIds.
|
|
10732
|
-
*/
|
|
10733
|
-
function buildJudgeEvalVariantUserPrompt(input, ctx) {
|
|
10734
|
-
const { runTaskIds, successCriteria } = input;
|
|
10735
|
-
const rubric = successCriteria.rubric;
|
|
10736
|
-
if (!rubric) throw new Error("judge_eval_variant requires successCriteria.rubric — none present");
|
|
10766
|
+
//#region ../agent-runtime/src/prompts/judge-eval-attempt.ts
|
|
10767
|
+
function buildJudgeEvalAttemptUserPrompt(input, ctx) {
|
|
10768
|
+
const rubric = input.successCriteria.rubric;
|
|
10769
|
+
if (!rubric) throw new Error("judge_eval_attempt requires successCriteria.rubric — none present");
|
|
10737
10770
|
const escapeCell = (s) => s.replace(/\\/g, "\\\\").replace(/\|/g, "\\|").replace(/\r?\n/g, " ");
|
|
10738
10771
|
const criteriaTable = rubric.criteria.map((c) => `| \`${c.id}\` | ${c.weight.toFixed(3)} | ${c.scoring} | ${escapeCell(c.description)} |`).join("\n");
|
|
10739
|
-
const targetsBlock = runTaskIds.map((id, i) => `${i + 1}. \`${id}\``).join("\n");
|
|
10740
10772
|
const finalOutputBlock = buildFinalOutputBlock({
|
|
10741
|
-
taskType: "
|
|
10742
|
-
outputSchemaName: "
|
|
10773
|
+
taskType: "judge_eval_attempt",
|
|
10774
|
+
outputSchemaName: "JudgeEvalAttemptOutput",
|
|
10743
10775
|
shapeSketch: [
|
|
10744
10776
|
"{",
|
|
10745
|
-
|
|
10746
|
-
"
|
|
10747
|
-
"
|
|
10748
|
-
"
|
|
10749
|
-
"
|
|
10750
|
-
"
|
|
10751
|
-
" \"verdict\": \"<1-3 sentences>\"",
|
|
10752
|
-
" },",
|
|
10753
|
-
" ...one entry per runTaskIds[i], same order",
|
|
10754
|
-
" ],",
|
|
10755
|
-
" \"deltas\": { \"<labelA> - <labelB>\": <composite(A) - composite(B)> }, // optional",
|
|
10777
|
+
` "targetTaskId": "${input.targetTaskId}",`,
|
|
10778
|
+
` "targetAttemptN": ${input.targetAttemptN},`,
|
|
10779
|
+
" \"variantLabel\": \"<from producer input>\",",
|
|
10780
|
+
" \"scores\": [ { \"criterionId\": \"...\", \"score\": 0..1, \"rationale\": \"...\", \"assertions\": [...]? } ],",
|
|
10781
|
+
" \"composite\": <Σ(weight × score), 0..1>,",
|
|
10782
|
+
" \"verdict\": \"<1-3 sentences>\",",
|
|
10756
10783
|
" \"judgeModel\": \"<id>\", // optional",
|
|
10757
10784
|
" \"traceparent\": \"<from claim>\"",
|
|
10758
10785
|
"}"
|
|
10759
10786
|
].join("\n")
|
|
10760
10787
|
});
|
|
10788
|
+
const workspaceSection = ctx.workspace?.attached === true ? [
|
|
10789
|
+
"### Workspace",
|
|
10790
|
+
"",
|
|
10791
|
+
"Your current workspace is already attached to the producer attempt",
|
|
10792
|
+
"you are judging. Inspect files directly from the current workspace",
|
|
10793
|
+
"root instead of inventing synthetic `artifact_<taskId>` paths.",
|
|
10794
|
+
"If the accepted attempt output lists `artifacts[].path`, treat those",
|
|
10795
|
+
"paths as relative to the current workspace root unless the output",
|
|
10796
|
+
"explicitly says otherwise.",
|
|
10797
|
+
ctx.workspace.mode === "dedicated_worktree" ? `This attachment is a dedicated producer worktree${ctx.workspace.branch ? ` on branch \`${ctx.workspace.branch}\`` : ""}.` : ctx.workspace.mode === "scratch_mount" ? "This attachment is the producer scratch workspace mounted with shadow writes for safe inspection." : "This attachment is the producer shared workspace mounted with shadow writes for safe inspection.",
|
|
10798
|
+
""
|
|
10799
|
+
].join("\n") : "";
|
|
10761
10800
|
return [
|
|
10762
|
-
"# Judge Eval
|
|
10763
|
-
|
|
10764
|
-
"
|
|
10765
|
-
"grade yourself.",
|
|
10801
|
+
"# Judge Eval Attempt\n",
|
|
10802
|
+
"You are grading one accepted `run_eval` producer attempt against a hidden",
|
|
10803
|
+
"judge rubric. Do not delegate to subagents. Grade in this session only.",
|
|
10766
10804
|
"",
|
|
10767
10805
|
`Task id: \`${ctx.taskId}\``,
|
|
10768
10806
|
`Diary: \`${ctx.diaryId}\``,
|
|
10807
|
+
`Producer task: \`${input.targetTaskId}\``,
|
|
10808
|
+
`Producer attempt: \`${input.targetAttemptN}\``,
|
|
10769
10809
|
"",
|
|
10770
|
-
"###
|
|
10771
|
-
"",
|
|
10772
|
-
targetsBlock,
|
|
10810
|
+
"### Evidence gathering",
|
|
10773
10811
|
"",
|
|
10774
|
-
|
|
10775
|
-
|
|
10776
|
-
|
|
10812
|
+
`1. Call \`moltnet_get_task\` with taskId=\`${input.targetTaskId}\`.`,
|
|
10813
|
+
`2. Call \`moltnet_list_task_attempts\` with taskId=\`${input.targetTaskId}\` and inspect the accepted attempt matching \`${input.targetAttemptN}\`.`,
|
|
10814
|
+
`3. Call \`moltnet_list_task_messages\` with taskId=\`${input.targetTaskId}\`, attemptN=\`${input.targetAttemptN}\` to inspect the producer's turn-by-turn behavior.`,
|
|
10815
|
+
"4. Use the accepted attempt output, attempt messages, and any accessible",
|
|
10816
|
+
" artifacts or workspace evidence available in your environment.",
|
|
10817
|
+
" Read artifact files from the mounted producer workspace when present;",
|
|
10818
|
+
" do not assume detached `artifact_<taskId>` directories exist.",
|
|
10819
|
+
"5. Score strictly against the rubric below.",
|
|
10777
10820
|
"",
|
|
10821
|
+
workspaceSection,
|
|
10778
10822
|
"### Rubric",
|
|
10779
10823
|
"",
|
|
10780
10824
|
rubric.preamble ? `${rubric.preamble}\n` : "",
|
|
@@ -10782,34 +10826,10 @@ function buildJudgeEvalVariantUserPrompt(input, ctx) {
|
|
|
10782
10826
|
"| --- | --- | --- | --- |",
|
|
10783
10827
|
criteriaTable,
|
|
10784
10828
|
"",
|
|
10785
|
-
"### How to grade",
|
|
10786
|
-
"",
|
|
10787
|
-
"For EACH `runTaskIds[i]`:",
|
|
10788
|
-
"",
|
|
10789
|
-
"1. Call the `subagent` custom tool with:",
|
|
10790
|
-
" - `task`: a brief instructing the subagent to grade ONLY that variant",
|
|
10791
|
-
" against the rubric above; include the target task id and the rubric",
|
|
10792
|
-
" verbatim. The subagent has the same MoltNet tools and can fetch the",
|
|
10793
|
-
" accepted attempt output independently.",
|
|
10794
|
-
" - `output_schema`: `\"judge_eval_variant_result\"`",
|
|
10795
|
-
"2. Receive the subagent's structured `judge_eval_variant_result` payload.",
|
|
10796
|
-
"3. Append it to your `results[]` array, **in the same order as input.runTaskIds**.",
|
|
10797
|
-
"",
|
|
10798
|
-
"Do NOT score any variant in your own session. The whole point of the",
|
|
10799
|
-
"subagent fan-out is per-variant context isolation — grading two variants",
|
|
10800
|
-
"back-to-back in one session lets the second be biased by the first.",
|
|
10801
|
-
"",
|
|
10802
10829
|
"### Composite arithmetic",
|
|
10803
10830
|
"",
|
|
10804
|
-
"
|
|
10805
|
-
"criteria. Drift > 0.001 is rejected.
|
|
10806
|
-
"themselves; double-check before assembling the final output.",
|
|
10807
|
-
"",
|
|
10808
|
-
"### Deltas (optional)",
|
|
10809
|
-
"",
|
|
10810
|
-
"If useful, populate `deltas` with pairwise composite differences keyed by",
|
|
10811
|
-
"`\"<variantLabel-A> - <variantLabel-B>\"` (single space-hyphen-space). Both",
|
|
10812
|
-
"labels must appear in `results`. Omit `deltas` entirely if not used.",
|
|
10831
|
+
"Your `composite` MUST equal `Σ(criterion.weight × score)` over the rubric",
|
|
10832
|
+
"criteria. Drift > 0.001 is rejected.",
|
|
10813
10833
|
"",
|
|
10814
10834
|
finalOutputBlock
|
|
10815
10835
|
].filter((s) => s !== "").join("\n");
|
|
@@ -11106,8 +11126,9 @@ function buildRenderPackUserPrompt(input, ctx) {
|
|
|
11106
11126
|
* Build the first user-message prompt for a `run_eval` task.
|
|
11107
11127
|
*
|
|
11108
11128
|
* Free-form: no git workflow, no commit ceremony. The executor produces
|
|
11109
|
-
* a textual response (and optional file artifacts) that
|
|
11110
|
-
* `
|
|
11129
|
+
* a textual response (and optional file artifacts) that later
|
|
11130
|
+
* `judge_eval_attempt` task(s) grade against their own hidden
|
|
11131
|
+
* rubric.
|
|
11111
11132
|
*
|
|
11112
11133
|
* Context delivery is handled by `resolveTaskContext` (see
|
|
11113
11134
|
* libs/agent-runtime/src/context-bindings.ts) and runs BEFORE this
|
|
@@ -11117,7 +11138,9 @@ function buildRenderPackUserPrompt(input, ctx) {
|
|
|
11117
11138
|
* builder does NOT inline `input.context[]` itself.
|
|
11118
11139
|
*/
|
|
11119
11140
|
function buildRunEvalUserPrompt(input, ctx) {
|
|
11120
|
-
const { scenario, variantLabel, successCriteria } = input;
|
|
11141
|
+
const { scenario, variantLabel, execution, successCriteria } = input;
|
|
11142
|
+
const hasContext = input.context.length > 0;
|
|
11143
|
+
const hasInlineContext = input.context.some((entry) => entry.binding === "context_inline");
|
|
11121
11144
|
const inputFilesSection = scenario.inputFiles?.length ? [
|
|
11122
11145
|
"### Input files",
|
|
11123
11146
|
"",
|
|
@@ -11130,9 +11153,30 @@ function buildRunEvalUserPrompt(input, ctx) {
|
|
|
11130
11153
|
"",
|
|
11131
11154
|
`This task carries correlationId \`${ctx.correlationId}\`. It joins`,
|
|
11132
11155
|
"this variant to its sibling `run_eval` tasks (other variants of the",
|
|
11133
|
-
"same scenario
|
|
11134
|
-
"
|
|
11135
|
-
"
|
|
11156
|
+
"same scenario and to any later `judge_eval_attempt` tasks created",
|
|
11157
|
+
"against those variants. You do not need to act on it directly — it",
|
|
11158
|
+
"is recorded for cross-variant aggregation at query time.",
|
|
11159
|
+
""
|
|
11160
|
+
].join("\n") : "";
|
|
11161
|
+
const executionSection = [
|
|
11162
|
+
"### Execution mode",
|
|
11163
|
+
"",
|
|
11164
|
+
`Mode: \`${execution.mode}\``,
|
|
11165
|
+
`Workspace: \`${execution.workspace}\``,
|
|
11166
|
+
execution.workspace === "none" ? "You are running in a scratch workspace with no repository checkout mounted. Do not assume git history or repo files are present unless the scenario provided them explicitly." : execution.workspace === "shared_mount" ? "You are running against the daemon shared mount. Treat any repository mutations as affecting the mounted checkout directly." : "You are running in a dedicated disposable git worktree isolated from the daemon shared checkout.",
|
|
11167
|
+
""
|
|
11168
|
+
].join("\n");
|
|
11169
|
+
const contextDisciplineSection = hasContext ? [
|
|
11170
|
+
"### Injected context discipline",
|
|
11171
|
+
"",
|
|
11172
|
+
"This task includes extra injected context from the task creator.",
|
|
11173
|
+
"You MUST inspect and use that context BEFORE you write solution",
|
|
11174
|
+
"files or draft your final answer.",
|
|
11175
|
+
"Do not solve first and only review the context afterward.",
|
|
11176
|
+
hasInlineContext ? "For `context_inline`, your FIRST content-inspection step should be a `read` of `/workspace/context-pack.md` before your first `write` call. The same content is also mirrored in `/workspace/AGENTS.md` and may be referenced from `/workspace/.claude/CLAUDE.md`." : "If injected context was provided as a skill, inspect that task-injected context before solving.",
|
|
11177
|
+
hasInlineContext ? "If `/workspace/context-pack.md` exists and you skip reading it before writing solution files, you are not following the task instructions." : "Do not rely on memory alone when task-injected context is available; inspect it first.",
|
|
11178
|
+
"If the injected context contains repo- or workflow-specific rules,",
|
|
11179
|
+
"those rules override your generic instincts.",
|
|
11136
11180
|
""
|
|
11137
11181
|
].join("\n") : "";
|
|
11138
11182
|
const finalOutputBlock = buildFinalOutputBlock({
|
|
@@ -11145,7 +11189,13 @@ function buildRunEvalUserPrompt(input, ctx) {
|
|
|
11145
11189
|
" \"totalTokens\": <int>,",
|
|
11146
11190
|
" \"durationMs\": <int>,",
|
|
11147
11191
|
" \"traceparent\": \"<from claim>\",",
|
|
11148
|
-
" \"verification\":
|
|
11192
|
+
" \"verification\": {",
|
|
11193
|
+
" \"inputCid\": \"<task inputCid>\",",
|
|
11194
|
+
" \"results\": [",
|
|
11195
|
+
" { \"id\": \"<criterion id>\", \"kind\": \"rubric\", \"status\": \"pass|fail|skip\", \"detail\": \"<optional one-liner>\" }",
|
|
11196
|
+
" ],",
|
|
11197
|
+
" \"passed\": <boolean>",
|
|
11198
|
+
" } // required iff input.successCriteria; must be an object, never a string",
|
|
11149
11199
|
"}"
|
|
11150
11200
|
].join("\n")
|
|
11151
11201
|
});
|
|
@@ -11153,6 +11203,8 @@ function buildRunEvalUserPrompt(input, ctx) {
|
|
|
11153
11203
|
"# Run Eval Agent\n",
|
|
11154
11204
|
`You are running an evaluation scenario as variant \`${variantLabel}\`.\nTask id: \`${ctx.taskId}\`\n`,
|
|
11155
11205
|
correlationSection,
|
|
11206
|
+
executionSection,
|
|
11207
|
+
contextDisciplineSection,
|
|
11156
11208
|
`### Scenario\n\n${scenario.prompt}\n`,
|
|
11157
11209
|
inputFilesSection,
|
|
11158
11210
|
verificationSection,
|
|
@@ -11224,6 +11276,16 @@ function buildTaskUserPrompt(task, ctx) {
|
|
|
11224
11276
|
diaryId: ctx.diaryId,
|
|
11225
11277
|
taskId: ctx.taskId
|
|
11226
11278
|
});
|
|
11279
|
+
case JUDGE_EVAL_ATTEMPT_TYPE:
|
|
11280
|
+
if (!Value.Check(JudgeEvalAttemptInput, task.input)) {
|
|
11281
|
+
const errors = [...Value.Errors(JudgeEvalAttemptInput, task.input)];
|
|
11282
|
+
throw new Error(`judge_eval_attempt input failed validation: ${JSON.stringify(errors.slice(0, 3))}`);
|
|
11283
|
+
}
|
|
11284
|
+
return buildJudgeEvalAttemptUserPrompt(task.input, {
|
|
11285
|
+
diaryId: ctx.diaryId,
|
|
11286
|
+
taskId: ctx.taskId,
|
|
11287
|
+
workspace: ctx.workspace
|
|
11288
|
+
});
|
|
11227
11289
|
case PR_REVIEW_TYPE:
|
|
11228
11290
|
if (!Value.Check(PrReviewInput, task.input)) {
|
|
11229
11291
|
const errors = [...Value.Errors(PrReviewInput, task.input)];
|
|
@@ -11234,15 +11296,6 @@ function buildTaskUserPrompt(task, ctx) {
|
|
|
11234
11296
|
taskId: ctx.taskId,
|
|
11235
11297
|
workspace: ctx.workspace
|
|
11236
11298
|
});
|
|
11237
|
-
case JUDGE_EVAL_VARIANT_TYPE:
|
|
11238
|
-
if (!Value.Check(JudgeEvalVariantInput, task.input)) {
|
|
11239
|
-
const errors = [...Value.Errors(JudgeEvalVariantInput, task.input)];
|
|
11240
|
-
throw new Error(`judge_eval_variant input failed validation: ${JSON.stringify(errors.slice(0, 3))}`);
|
|
11241
|
-
}
|
|
11242
|
-
return buildJudgeEvalVariantUserPrompt(task.input, {
|
|
11243
|
-
diaryId: ctx.diaryId,
|
|
11244
|
-
taskId: ctx.taskId
|
|
11245
|
-
});
|
|
11246
11299
|
case RUN_EVAL_TYPE:
|
|
11247
11300
|
if (!Value.Check(RunEvalInput, task.input)) {
|
|
11248
11301
|
const errors = [...Value.Errors(RunEvalInput, task.input)];
|
|
@@ -14760,6 +14813,11 @@ var require_multistream = /* @__PURE__ */ __commonJSMin(((exports, module) => {
|
|
|
14760
14813
|
* paths under this mount via `toGuestPath` in `tool-operations.ts`.
|
|
14761
14814
|
*/
|
|
14762
14815
|
var SKILL_ROOT_IN_VM = GUEST_TASK_SKILLS_MOUNT;
|
|
14816
|
+
var INLINE_CONTEXT_ROOT_IN_VM = "/workspace/.moltnet/context";
|
|
14817
|
+
var WORKSPACE_CONTEXT_PACK = "/workspace/context-pack.md";
|
|
14818
|
+
var WORKSPACE_AGENTS_MD = "/workspace/AGENTS.md";
|
|
14819
|
+
var WORKSPACE_CLAUDE_DIR = "/workspace/.claude";
|
|
14820
|
+
var WORKSPACE_CLAUDE_MD = "/workspace/.claude/CLAUDE.md";
|
|
14763
14821
|
/** Bounds borrowed from pi's skill validation; conservative caps so a
|
|
14764
14822
|
* malformed SKILL.md doesn't bloat the system prompt. */
|
|
14765
14823
|
var MAX_SKILL_NAME = 64;
|
|
@@ -14770,21 +14828,40 @@ var MAX_SKILL_DESCRIPTION = 1024;
|
|
|
14770
14828
|
*/
|
|
14771
14829
|
async function injectTaskContext(args) {
|
|
14772
14830
|
const skills = [];
|
|
14831
|
+
const inlineContexts = [];
|
|
14773
14832
|
const resolved = await resolveTaskContext({
|
|
14774
14833
|
context: args.context,
|
|
14775
|
-
deliver: {
|
|
14776
|
-
|
|
14777
|
-
|
|
14778
|
-
|
|
14779
|
-
|
|
14780
|
-
|
|
14781
|
-
|
|
14782
|
-
|
|
14783
|
-
|
|
14784
|
-
|
|
14785
|
-
|
|
14786
|
-
|
|
14834
|
+
deliver: {
|
|
14835
|
+
skill: async ({ slug, content }) => {
|
|
14836
|
+
const dir = `${SKILL_ROOT_IN_VM}/${slug}`;
|
|
14837
|
+
const filePath = `${dir}/SKILL.md`;
|
|
14838
|
+
await args.fs.mkdir(dir, { recursive: true });
|
|
14839
|
+
await args.fs.writeFile(filePath, content, { mode: 420 });
|
|
14840
|
+
skills.push(buildSyntheticSkill({
|
|
14841
|
+
slug,
|
|
14842
|
+
content,
|
|
14843
|
+
filePath,
|
|
14844
|
+
dir
|
|
14845
|
+
}));
|
|
14846
|
+
},
|
|
14847
|
+
contextFile: async ({ suggestedFileName, content }) => {
|
|
14848
|
+
await args.fs.mkdir(INLINE_CONTEXT_ROOT_IN_VM, { recursive: true });
|
|
14849
|
+
const filePath = `${INLINE_CONTEXT_ROOT_IN_VM}/${suggestedFileName}`;
|
|
14850
|
+
await args.fs.writeFile(filePath, content, { mode: 420 });
|
|
14851
|
+
inlineContexts.push({
|
|
14852
|
+
slug: suggestedFileName.replace(/\.md$/u, ""),
|
|
14853
|
+
content
|
|
14854
|
+
});
|
|
14855
|
+
}
|
|
14856
|
+
}
|
|
14787
14857
|
});
|
|
14858
|
+
if (inlineContexts.length > 0) {
|
|
14859
|
+
const packContent = buildWorkspaceContextPack(inlineContexts);
|
|
14860
|
+
await args.fs.writeFile(WORKSPACE_CONTEXT_PACK, packContent, { mode: 420 });
|
|
14861
|
+
await args.fs.writeFile(WORKSPACE_AGENTS_MD, packContent, { mode: 420 });
|
|
14862
|
+
await args.fs.mkdir(WORKSPACE_CLAUDE_DIR, { recursive: true });
|
|
14863
|
+
await args.fs.writeFile(WORKSPACE_CLAUDE_MD, "@../context-pack.md\n", { mode: 420 });
|
|
14864
|
+
}
|
|
14788
14865
|
return {
|
|
14789
14866
|
injected: resolved.injected,
|
|
14790
14867
|
skills,
|
|
@@ -14792,6 +14869,17 @@ async function injectTaskContext(args) {
|
|
|
14792
14869
|
userInlineSuffix: resolved.userInlineSuffix
|
|
14793
14870
|
};
|
|
14794
14871
|
}
|
|
14872
|
+
function buildWorkspaceContextPack(contexts) {
|
|
14873
|
+
return [
|
|
14874
|
+
"# Context Pack",
|
|
14875
|
+
"",
|
|
14876
|
+
...contexts.map(({ slug, content }) => [
|
|
14877
|
+
`## ${slug}`,
|
|
14878
|
+
"",
|
|
14879
|
+
content.trimEnd()
|
|
14880
|
+
].join("\n"))
|
|
14881
|
+
].join("\n\n").trimEnd() + "\n";
|
|
14882
|
+
}
|
|
14795
14883
|
/**
|
|
14796
14884
|
* Build a `Skill` object pi will faithfully render in
|
|
14797
14885
|
* `<available_skills>`. We extract `name` and `description` from the
|
|
@@ -15155,7 +15243,7 @@ async function parseStructuredTaskOutput(assistantText, taskType, opts = {}) {
|
|
|
15155
15243
|
}
|
|
15156
15244
|
};
|
|
15157
15245
|
}
|
|
15158
|
-
const errors = validateTaskOutput(taskType, extracted);
|
|
15246
|
+
const errors = validateTaskOutput(taskType, extracted, opts.input);
|
|
15159
15247
|
if (errors.length > 0) {
|
|
15160
15248
|
const details = errors.slice(0, 3).map((error) => `${error.field}: ${error.message}`);
|
|
15161
15249
|
const [firstError] = errors;
|
|
@@ -15269,7 +15357,7 @@ function createSubmitOutputTool(taskType, opts = {}) {
|
|
|
15269
15357
|
description: contract.description,
|
|
15270
15358
|
parameters: schema,
|
|
15271
15359
|
async execute(_id, params) {
|
|
15272
|
-
const errors = validateTaskOutput(taskType, params);
|
|
15360
|
+
const errors = validateTaskOutput(taskType, params, opts.input);
|
|
15273
15361
|
if (errors.length > 0) {
|
|
15274
15362
|
const detailMsg = errors.slice(0, 3).map((err) => `${err.field}: ${err.message}`).join("; ");
|
|
15275
15363
|
const details = {
|
|
@@ -15338,6 +15426,39 @@ function resolveSubmitTools(taskType, opts = {}) {
|
|
|
15338
15426
|
//#region src/runtime/task-workspace.ts
|
|
15339
15427
|
function prepareTaskWorkspace(task, requestedMountPath, executionPlan) {
|
|
15340
15428
|
const branch = executionPlan?.worktreeBranch ?? null;
|
|
15429
|
+
const workspaceMode = executionPlan?.workspaceMode ?? "shared_mount";
|
|
15430
|
+
const attachedWorkspace = executionPlan?.workspaceAttachment ?? null;
|
|
15431
|
+
if (attachedWorkspace) return {
|
|
15432
|
+
mountPath: attachedWorkspace.mountPath,
|
|
15433
|
+
cwdPath: attachedWorkspace.cwdPath,
|
|
15434
|
+
mode: workspaceMode,
|
|
15435
|
+
branch,
|
|
15436
|
+
cleanup: () => {}
|
|
15437
|
+
};
|
|
15438
|
+
if (workspaceMode === "scratch_mount") {
|
|
15439
|
+
const scratchDir = resolveTaskScratchPath(findMainWorktree(), executionPlan?.workspaceId ?? `task-${task.id}`);
|
|
15440
|
+
const keepWorkspace = executionPlan?.workspaceScope === "session" && executionPlan.sessionKey !== null;
|
|
15441
|
+
if (keepWorkspace) mkdirSync(scratchDir, { recursive: true });
|
|
15442
|
+
else {
|
|
15443
|
+
rmSync(scratchDir, {
|
|
15444
|
+
recursive: true,
|
|
15445
|
+
force: true
|
|
15446
|
+
});
|
|
15447
|
+
mkdirSync(scratchDir, { recursive: true });
|
|
15448
|
+
}
|
|
15449
|
+
return {
|
|
15450
|
+
mountPath: scratchDir,
|
|
15451
|
+
cwdPath: scratchDir,
|
|
15452
|
+
mode: "scratch_mount",
|
|
15453
|
+
branch: null,
|
|
15454
|
+
cleanup: keepWorkspace ? () => {} : () => {
|
|
15455
|
+
rmSync(scratchDir, {
|
|
15456
|
+
recursive: true,
|
|
15457
|
+
force: true
|
|
15458
|
+
});
|
|
15459
|
+
}
|
|
15460
|
+
};
|
|
15461
|
+
}
|
|
15341
15462
|
if (!branch) return {
|
|
15342
15463
|
mountPath: requestedMountPath,
|
|
15343
15464
|
cwdPath: requestedMountPath,
|
|
@@ -15375,6 +15496,9 @@ function prepareTaskWorkspace(task, requestedMountPath, executionPlan) {
|
|
|
15375
15496
|
function resolveTaskWorktreePath(mainRepo, workspaceId) {
|
|
15376
15497
|
return join(mainRepo, ".worktrees", workspaceId);
|
|
15377
15498
|
}
|
|
15499
|
+
/**
 * Builds the scratch directory path for a task workspace:
 * `<mainRepo>/.moltnet/d/task-workspaces/<workspaceId>`.
 *
 * The id is validated first so the result always lies strictly below the
 * `task-workspaces` directory. Without the check, an empty id, `"."`, or any
 * `".."` segment makes `join` return the shared root (or a path outside it),
 * and `prepareTaskWorkspace` in this file recursively removes the returned
 * directory.
 *
 * @param {string} mainRepo - Directory the scratch tree lives under.
 * @param {string} workspaceId - Workspace identifier used as the trailing path component(s).
 * @returns {string} The joined scratch directory path.
 * @throws {TypeError} If `workspaceId` is not a string.
 * @throws {Error} If `workspaceId` is empty, consists only of `.`/separators,
 *   or contains a `..` segment.
 */
function resolveTaskScratchPath(mainRepo, workspaceId) {
	if (typeof workspaceId !== "string") {
		throw new TypeError(`Task workspace id must be a string, received ${typeof workspaceId}.`);
	}
	// Both separator styles are treated as separators regardless of platform so
	// the same id is accepted or rejected everywhere.
	const meaningfulSegments = workspaceId
		.split(/[\\/]+/)
		.filter((segment) => segment !== "" && segment !== ".");
	if (meaningfulSegments.length === 0 || meaningfulSegments.includes("..")) {
		throw new Error(`Invalid task workspace id ${JSON.stringify(workspaceId)}: it must name a directory inside the task-workspaces root.`);
	}
	return join(mainRepo, ".moltnet", "d", "task-workspaces", workspaceId);
}
|
|
15378
15502
|
function ensureReusableTaskWorktree(mainRepo, worktreeDir, branch) {
|
|
15379
15503
|
if (isRegisteredWorktree(mainRepo, worktreeDir)) return;
|
|
15380
15504
|
if (existsSync(worktreeDir)) throw new Error(`Expected reusable worktree ${worktreeDir} to be git-managed, but it exists outside git worktree metadata.`);
|
|
@@ -15611,12 +15735,14 @@ async function executePiTask(claimedTask, reporter, opts) {
|
|
|
15611
15735
|
return makeFailedOutput("worktree_setup_failed", message);
|
|
15612
15736
|
}
|
|
15613
15737
|
try {
|
|
15738
|
+
const sandboxConfig = applyExecutionPlanSandboxOverrides(opts.sandboxConfig, executionPlan);
|
|
15614
15739
|
managed = await resumeVm({
|
|
15615
15740
|
checkpointPath,
|
|
15616
15741
|
agentName: opts.agentName,
|
|
15617
15742
|
mountPath,
|
|
15743
|
+
workspaceMode: workspace.mode,
|
|
15618
15744
|
extraAllowedHosts: opts.extraAllowedHosts,
|
|
15619
|
-
sandboxConfig
|
|
15745
|
+
sandboxConfig
|
|
15620
15746
|
});
|
|
15621
15747
|
} catch (err) {
|
|
15622
15748
|
const message = err instanceof Error ? err.message : String(err);
|
|
@@ -15645,7 +15771,8 @@ async function executePiTask(claimedTask, reporter, opts) {
|
|
|
15645
15771
|
taskId: task.id,
|
|
15646
15772
|
workspace: {
|
|
15647
15773
|
mode: activeWorkspace.mode,
|
|
15648
|
-
branch: activeWorkspace.branch
|
|
15774
|
+
branch: activeWorkspace.branch,
|
|
15775
|
+
attached: executionPlan?.workspaceAttachment !== void 0
|
|
15649
15776
|
},
|
|
15650
15777
|
extras: opts.promptExtras
|
|
15651
15778
|
});
|
|
@@ -15687,7 +15814,10 @@ async function executePiTask(claimedTask, reporter, opts) {
|
|
|
15687
15814
|
createEditToolDefinition(mountPath, { operations: createGondolinEditOps(managed.vm, mountPath) }),
|
|
15688
15815
|
createBashToolDefinition(mountPath, { operations: createGondolinBashOps(managed.vm, mountPath) })
|
|
15689
15816
|
];
|
|
15690
|
-
const { handle: submitToolHandle, tools: submitToolDefs } = resolveSubmitTools(task.taskType, {
|
|
15817
|
+
const { handle: submitToolHandle, tools: submitToolDefs } = resolveSubmitTools(task.taskType, {
|
|
15818
|
+
model: opts.model,
|
|
15819
|
+
input: task.input
|
|
15820
|
+
});
|
|
15691
15821
|
const submitTools = submitToolDefs;
|
|
15692
15822
|
try {
|
|
15693
15823
|
const moltnetAgent = await connect({ configDir: managed.agentDir });
|
|
@@ -15906,8 +16036,20 @@ async function executePiTask(claimedTask, reporter, opts) {
|
|
|
15906
16036
|
phase: "output_validation"
|
|
15907
16037
|
});
|
|
15908
16038
|
}
|
|
15909
|
-
else {
|
|
15910
|
-
|
|
16039
|
+
else if (submitToolHandle) {
|
|
16040
|
+
parseError = {
|
|
16041
|
+
code: "output_missing",
|
|
16042
|
+
message: "Agent did not submit output through the task submit tool. A valid submit tool call is required to complete this task type."
|
|
16043
|
+
};
|
|
16044
|
+
await emit("error", {
|
|
16045
|
+
message: parseError.message,
|
|
16046
|
+
phase: "output_validation"
|
|
16047
|
+
});
|
|
16048
|
+
} else {
|
|
16049
|
+
const parsed = await parseStructuredTaskOutput(assistantText, task.taskType, {
|
|
16050
|
+
model: opts.model,
|
|
16051
|
+
input: task.input
|
|
16052
|
+
});
|
|
15911
16053
|
parsedOutput = parsed.output;
|
|
15912
16054
|
parsedOutputCid = parsed.outputCid;
|
|
15913
16055
|
parseError = parsed.error;
|
|
@@ -15993,6 +16135,18 @@ async function executePiTask(claimedTask, reporter, opts) {
|
|
|
15993
16135
|
}
|
|
15994
16136
|
}
|
|
15995
16137
|
}
|
|
16138
|
+
/**
 * Derives the sandbox configuration to use for a task from the caller's
 * configuration and the task's execution plan.
 *
 * When the plan's workspace attachment carries a truthy `shadowWrites` value,
 * a shallow copy of `sandboxConfig` is returned whose `vfs` section keeps its
 * existing entries but has `shadow` set to `["**"]` and `shadowMode` set to
 * that value. In every other case the input object is returned as-is.
 *
 * @param {object|undefined} sandboxConfig - Caller-supplied sandbox configuration; never mutated.
 * @param {object|null|undefined} executionPlan - Execution plan; only `workspaceAttachment.shadowWrites` is read.
 * @returns {object|undefined} The original configuration or an overridden copy.
 */
function applyExecutionPlanSandboxOverrides(sandboxConfig, executionPlan) {
	const attachment = executionPlan?.workspaceAttachment;
	const shadowMode = attachment?.shadowWrites;
	if (shadowMode) {
		const vfs = {
			...sandboxConfig?.vfs,
			shadow: ["**"],
			shadowMode
		};
		return {
			...sandboxConfig,
			vfs
		};
	}
	return sandboxConfig;
}
|
|
15996
16150
|
function emptyUsage(provider, model) {
|
|
15997
16151
|
return {
|
|
15998
16152
|
inputTokens: 0,
|
|
@@ -16210,6 +16364,7 @@ function moltnetExtension(pi) {
|
|
|
16210
16364
|
checkpointPath,
|
|
16211
16365
|
agentName,
|
|
16212
16366
|
mountPath,
|
|
16367
|
+
workspaceMode: "shared_mount",
|
|
16213
16368
|
sandboxConfig
|
|
16214
16369
|
});
|
|
16215
16370
|
activateAgentEnv(managed.credentials.agentEnv, mainRepo);
|