@themoltnet/pi-extension 0.18.0 → 0.19.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +32 -1
- package/dist/index.d.ts +64 -5
- package/dist/index.js +741 -388
- package/package.json +3 -3
package/dist/index.js
CHANGED
|
@@ -2386,12 +2386,20 @@ var MoltNetError = class extends Error {
|
|
|
2386
2386
|
code;
|
|
2387
2387
|
statusCode;
|
|
2388
2388
|
detail;
|
|
2389
|
+
/**
|
|
2390
|
+
* Populated when the server returned a `VALIDATION_FAILED` problem
|
|
2391
|
+
* (status 400) with field-level errors. Empty / undefined for every
|
|
2392
|
+
* other problem kind. Imposer scripts surface these to operators so
|
|
2393
|
+
* they don't have to re-run with curl to see what was rejected.
|
|
2394
|
+
*/
|
|
2395
|
+
validationErrors;
|
|
2389
2396
|
constructor(message, options) {
|
|
2390
2397
|
super(message);
|
|
2391
2398
|
this.name = "MoltNetError";
|
|
2392
2399
|
this.code = options.code;
|
|
2393
2400
|
this.statusCode = options.statusCode;
|
|
2394
2401
|
this.detail = options.detail;
|
|
2402
|
+
this.validationErrors = options.validationErrors;
|
|
2395
2403
|
}
|
|
2396
2404
|
};
|
|
2397
2405
|
var NetworkError = class extends MoltNetError {
|
|
@@ -2415,10 +2423,14 @@ var AuthenticationError = class extends MoltNetError {
|
|
|
2415
2423
|
};
|
|
2416
2424
|
function problemToError(problem, statusCode) {
|
|
2417
2425
|
const title = problem.title ?? "Request failed";
|
|
2418
|
-
|
|
2426
|
+
const message = problem.detail ? `${title}: ${problem.detail}` : title;
|
|
2427
|
+
const rawErrors = problem.errors;
|
|
2428
|
+
const validationErrors = Array.isArray(rawErrors) ? rawErrors.filter((e) => typeof e === "object" && e !== null && typeof e.field === "string" && typeof e.message === "string") : void 0;
|
|
2429
|
+
return new MoltNetError(message, {
|
|
2419
2430
|
code: problem.type ?? problem.code ?? "UNKNOWN",
|
|
2420
2431
|
statusCode,
|
|
2421
|
-
detail: problem.detail
|
|
2432
|
+
detail: problem.detail,
|
|
2433
|
+
validationErrors
|
|
2422
2434
|
});
|
|
2423
2435
|
}
|
|
2424
2436
|
//#endregion
|
|
@@ -7767,6 +7779,41 @@ function createMoltNetTools(config) {
|
|
|
7767
7779
|
};
|
|
7768
7780
|
}
|
|
7769
7781
|
});
|
|
7782
|
+
const listTaskMessages = defineTool({
|
|
7783
|
+
name: "moltnet_list_task_messages",
|
|
7784
|
+
label: "List MoltNet Task Attempt Messages",
|
|
7785
|
+
description: "List messages for a specific task attempt. Use this when you need the turn-by-turn execution record behind an accepted attempt — tool calls, text deltas, and error/info events that do not appear in the attempt output alone.",
|
|
7786
|
+
parameters: Type.Object({
|
|
7787
|
+
taskId: Type.String({ description: "Task ID (UUID)." }),
|
|
7788
|
+
attemptN: Type.Integer({
|
|
7789
|
+
minimum: 1,
|
|
7790
|
+
description: "Attempt number to inspect."
|
|
7791
|
+
}),
|
|
7792
|
+
afterSeq: Type.Optional(Type.Integer({
|
|
7793
|
+
minimum: 0,
|
|
7794
|
+
description: "Optional cursor: only return messages with seq > afterSeq."
|
|
7795
|
+
})),
|
|
7796
|
+
limit: Type.Optional(Type.Integer({
|
|
7797
|
+
minimum: 1,
|
|
7798
|
+
maximum: 500,
|
|
7799
|
+
description: "Optional maximum messages to return. Defaults to the API value."
|
|
7800
|
+
}))
|
|
7801
|
+
}),
|
|
7802
|
+
async execute(_id, params) {
|
|
7803
|
+
const { agent } = ensureConnected(config);
|
|
7804
|
+
const messages = await agent.tasks.listMessages(params.taskId, params.attemptN, {
|
|
7805
|
+
afterSeq: params.afterSeq,
|
|
7806
|
+
limit: params.limit
|
|
7807
|
+
});
|
|
7808
|
+
return {
|
|
7809
|
+
content: [{
|
|
7810
|
+
type: "text",
|
|
7811
|
+
text: JSON.stringify(messages, null, 2)
|
|
7812
|
+
}],
|
|
7813
|
+
details: {}
|
|
7814
|
+
};
|
|
7815
|
+
}
|
|
7816
|
+
});
|
|
7770
7817
|
const reviewSessionErrors = defineTool({
|
|
7771
7818
|
name: "moltnet_review_session_errors",
|
|
7772
7819
|
label: "Review Session Tool Errors",
|
|
@@ -7815,6 +7862,7 @@ function createMoltNetTools(config) {
|
|
|
7815
7862
|
createEntry,
|
|
7816
7863
|
getTask,
|
|
7817
7864
|
listTaskAttempts,
|
|
7865
|
+
listTaskMessages,
|
|
7818
7866
|
reviewSessionErrors,
|
|
7819
7867
|
defineTool({
|
|
7820
7868
|
name: "moltnet_host_exec",
|
|
@@ -8113,6 +8161,12 @@ var GUEST_WORKSPACE$2 = "/workspace";
|
|
|
8113
8161
|
* investigation and the alternatives we rejected.
|
|
8114
8162
|
*/
|
|
8115
8163
|
var GUEST_TASK_SKILLS_MOUNT = "/moltnet-task-skills";
|
|
8164
|
+
function shouldRunResumeCommand(entry, ctx) {
|
|
8165
|
+
if (typeof entry === "string") return true;
|
|
8166
|
+
const workspaceModes = entry.when?.workspaceMode;
|
|
8167
|
+
if (workspaceModes && !workspaceModes.includes(ctx.workspaceMode)) return false;
|
|
8168
|
+
return true;
|
|
8169
|
+
}
|
|
8116
8170
|
/**
|
|
8117
8171
|
* Resolve the main worktree root (where .moltnet/ lives — it's untracked,
|
|
8118
8172
|
* only exists in the main worktree, not in git worktrees).
|
|
@@ -8258,6 +8312,7 @@ async function resumeVm(config) {
|
|
|
8258
8312
|
...envOverrides
|
|
8259
8313
|
};
|
|
8260
8314
|
const resources = config.sandboxConfig?.resources;
|
|
8315
|
+
const workspaceMode = config.workspaceMode ?? "shared_mount";
|
|
8261
8316
|
const vm = await VmCheckpoint.load(config.checkpointPath).resume({
|
|
8262
8317
|
httpHooks,
|
|
8263
8318
|
env: vmEnv,
|
|
@@ -8276,7 +8331,32 @@ async function resumeVm(config) {
|
|
|
8276
8331
|
'`);
|
|
8277
8332
|
await vmRun(vm, "DNS resolvers", `printf 'nameserver 8.8.8.8\\nnameserver 1.1.1.1\\n' > /etc/resolv.conf`);
|
|
8278
8333
|
await vmRun(vm, "git safe.directory", `git config --system --add safe.directory '*'`);
|
|
8279
|
-
for (const [i,
|
|
8334
|
+
for (const [i, entry] of (config.sandboxConfig?.resumeCommands ?? []).entries()) {
|
|
8335
|
+
if (!shouldRunResumeCommand(entry, { workspaceMode })) continue;
|
|
8336
|
+
const { run, retries, backoffMs } = typeof entry === "string" ? {
|
|
8337
|
+
run: entry,
|
|
8338
|
+
retries: 0,
|
|
8339
|
+
backoffMs: 2e3
|
|
8340
|
+
} : {
|
|
8341
|
+
run: entry.run,
|
|
8342
|
+
retries: entry.retries ?? 0,
|
|
8343
|
+
backoffMs: entry.retryBackoffMs ?? 2e3
|
|
8344
|
+
};
|
|
8345
|
+
const label = `resumeCommands[${i}]`;
|
|
8346
|
+
let lastErr;
|
|
8347
|
+
for (let attempt = 0; attempt <= retries; attempt++) try {
|
|
8348
|
+
await vmRun(vm, label, run);
|
|
8349
|
+
lastErr = void 0;
|
|
8350
|
+
break;
|
|
8351
|
+
} catch (err) {
|
|
8352
|
+
lastErr = err;
|
|
8353
|
+
if (attempt === retries) break;
|
|
8354
|
+
await new Promise((resolve) => {
|
|
8355
|
+
setTimeout(resolve, (attempt + 1) * backoffMs);
|
|
8356
|
+
});
|
|
8357
|
+
}
|
|
8358
|
+
if (lastErr) throw lastErr instanceof Error ? lastErr : new Error(String(lastErr));
|
|
8359
|
+
}
|
|
8280
8360
|
const vmSshDir = `${vmAgentDir}/ssh`;
|
|
8281
8361
|
await vm.exec(`mkdir -p ${vmAgentDir}/ssh /home/agent/.pi/agent`);
|
|
8282
8362
|
if (creds.piAuthJson !== null) await vm.fs.writeFile("/home/agent/.pi/agent/auth.json", creds.piAuthJson, { mode: 384 });
|
|
@@ -8655,7 +8735,8 @@ async function buildAgentSession(args) {
|
|
|
8655
8735
|
await resourceLoader.reload();
|
|
8656
8736
|
const sessionManager = args.sessionPersistence ? await resolvePersistentSessionManager({
|
|
8657
8737
|
cwd: args.cwdPath,
|
|
8658
|
-
sessionDir: args.sessionPersistence.sessionDir
|
|
8738
|
+
sessionDir: args.sessionPersistence.sessionDir,
|
|
8739
|
+
forkFromSessionPath: args.sessionPersistence.forkFromSessionPath
|
|
8659
8740
|
}) : SessionManager.inMemory(args.cwdPath);
|
|
8660
8741
|
return (await createAgentSession({
|
|
8661
8742
|
agentDir: args.piAuthDir,
|
|
@@ -8667,6 +8748,7 @@ async function buildAgentSession(args) {
|
|
|
8667
8748
|
})).session;
|
|
8668
8749
|
}
|
|
8669
8750
|
async function resolvePersistentSessionManager(args) {
|
|
8751
|
+
if (args.forkFromSessionPath) return SessionManager.forkFrom(args.forkFromSessionPath, args.cwd, args.sessionDir);
|
|
8670
8752
|
await SessionManager.list(args.cwd, args.sessionDir);
|
|
8671
8753
|
return SessionManager.continueRecent(args.cwd, args.sessionDir);
|
|
8672
8754
|
}
|
|
@@ -8683,6 +8765,11 @@ var PROMPT_SEPARATOR = "\n\n---\n\n";
|
|
|
8683
8765
|
* - `skill` → `deliver.skill({ slug, content })` once per ref.
|
|
8684
8766
|
* Slug collisions on distinct contents are
|
|
8685
8767
|
* refused loudly.
|
|
8768
|
+
* - `context_inline`→ persist raw bytes via `deliver.contextFile(...)`
|
|
8769
|
+
* and inject them into the prompt in an explicit,
|
|
8770
|
+
* named block. Intended for eval/context experiments
|
|
8771
|
+
* where the content must be in the model context
|
|
8772
|
+
* window, not merely discoverable as a skill.
|
|
8686
8773
|
* - `prompt_prefix` → content appended to `systemPromptPrefix` with
|
|
8687
8774
|
* the canonical `\n\n---\n\n` separator (in
|
|
8688
8775
|
* declared order).
|
|
@@ -8715,6 +8802,13 @@ async function resolveTaskContext(args) {
|
|
|
8715
8802
|
slug: ref.slug,
|
|
8716
8803
|
content: ref.content
|
|
8717
8804
|
});
|
|
8805
|
+
} else if (ref.binding === "context_inline") {
|
|
8806
|
+
await args.deliver.contextFile({
|
|
8807
|
+
slug: ref.slug,
|
|
8808
|
+
content: ref.content,
|
|
8809
|
+
suggestedFileName: `${ref.slug}.md`
|
|
8810
|
+
});
|
|
8811
|
+
promptParts.push(formatInlineContextBlock(ref.slug, ref.content));
|
|
8718
8812
|
} else if (ref.binding === "prompt_prefix") promptParts.push(ref.content);
|
|
8719
8813
|
else userParts.push(ref.content);
|
|
8720
8814
|
injected.push(ref);
|
|
@@ -8725,6 +8819,23 @@ async function resolveTaskContext(args) {
|
|
|
8725
8819
|
userInlineSuffix: userParts.join(PROMPT_SEPARATOR)
|
|
8726
8820
|
};
|
|
8727
8821
|
}
|
|
8822
|
+
function formatInlineContextBlock(slug, content) {
|
|
8823
|
+
return [
|
|
8824
|
+
"### Injected Task Context",
|
|
8825
|
+
"",
|
|
8826
|
+
`Context id: \`${slug}\``,
|
|
8827
|
+
"The following raw context was supplied by the task creator. Treat it",
|
|
8828
|
+
"as task-relevant background that may override generic coding instincts",
|
|
8829
|
+
"when it contains repo- or workflow-specific constraints.",
|
|
8830
|
+
"The same content is also materialized in the workspace as",
|
|
8831
|
+
"`/workspace/context-pack.md` and mirrored in `AGENTS.md` for",
|
|
8832
|
+
"repo-context discovery.",
|
|
8833
|
+
"",
|
|
8834
|
+
"<context>",
|
|
8835
|
+
content,
|
|
8836
|
+
"</context>"
|
|
8837
|
+
].join("\n");
|
|
8838
|
+
}
|
|
8728
8839
|
//#endregion
|
|
8729
8840
|
//#region ../tasks/src/formats.ts
|
|
8730
8841
|
/**
|
|
@@ -8748,6 +8859,7 @@ if (!FormatRegistry.Has("date-time")) FormatRegistry.Set("date-time", (v) => !Nu
|
|
|
8748
8859
|
*/
|
|
8749
8860
|
var ContextBinding = Type$1.Union([
|
|
8750
8861
|
Type$1.Literal("skill"),
|
|
8862
|
+
Type$1.Literal("context_inline"),
|
|
8751
8863
|
Type$1.Literal("prompt_prefix"),
|
|
8752
8864
|
Type$1.Literal("user_inline")
|
|
8753
8865
|
], { $id: "ContextBinding" });
|
|
@@ -8764,9 +8876,14 @@ var ContextBinding = Type$1.Union([
|
|
|
8764
8876
|
* name under the runtime's skill discovery path. Must be
|
|
8765
8877
|
* kebab-case-safe (alphanumeric + dashes/underscores).
|
|
8766
8878
|
* - `binding` — how the bytes are delivered to the LLM (see above).
|
|
8767
|
-
* - `content` — the actual bytes (UTF-8 text). Capped at
|
|
8879
|
+
* - `content` — the actual bytes (UTF-8 text). Capped at 64 KiB per
|
|
8768
8880
|
* entry; total per-task context bytes are bounded by the
|
|
8769
8881
|
* soft `maxItems` cap and per-binding daemon limits.
|
|
8882
|
+
* Raised from 32 KiB in 2026-05 — protocol-heavy operator
|
|
8883
|
+
* skills (e.g. `.claude/skills/legreffier/SKILL.md`) ship
|
|
8884
|
+
* at ~35 KiB inline, and the original cap was sized for
|
|
8885
|
+
* short example skills, not the kind of skill the eval
|
|
8886
|
+
* substrate is dogfooded on (#943, #823).
|
|
8770
8887
|
*/
|
|
8771
8888
|
var ContextRef = Type$1.Object({
|
|
8772
8889
|
slug: Type$1.String({
|
|
@@ -8777,7 +8894,7 @@ var ContextRef = Type$1.Object({
|
|
|
8777
8894
|
binding: ContextBinding,
|
|
8778
8895
|
content: Type$1.String({
|
|
8779
8896
|
minLength: 1,
|
|
8780
|
-
maxLength:
|
|
8897
|
+
maxLength: 65536
|
|
8781
8898
|
})
|
|
8782
8899
|
}, {
|
|
8783
8900
|
$id: "ContextRef",
|
|
@@ -9341,61 +9458,33 @@ async function validateJudgePackInputAsync(input, ctx) {
|
|
|
9341
9458
|
return errors;
|
|
9342
9459
|
}
|
|
9343
9460
|
//#endregion
|
|
9344
|
-
//#region ../tasks/src/task-types/judge-eval-
|
|
9461
|
+
//#region ../tasks/src/task-types/judge-eval-attempt.ts
|
|
9345
9462
|
/**
|
|
9346
|
-
* `
|
|
9347
|
-
*
|
|
9348
|
-
* isolation.
|
|
9463
|
+
* `judge_eval_attempt` — score one completed `run_eval` attempt against a
|
|
9464
|
+
* hidden judge rubric.
|
|
9349
9465
|
*
|
|
9350
9466
|
* output_kind: judgment
|
|
9351
|
-
* criteria: required (`successCriteria.rubric`
|
|
9352
|
-
*
|
|
9353
|
-
*
|
|
9354
|
-
* pin the targets being graded.
|
|
9355
|
-
*
|
|
9356
|
-
* Slice 2 of #943. The parent task carries the rubric and the list of
|
|
9357
|
-
* variant `run_eval` task ids. The pi executor registers the generic
|
|
9358
|
-
* `subagent` custom tool (#1087), and the parent LLM calls
|
|
9359
|
-
* `subagent({ task, output_schema: 'judge_eval_variant_result' })` once
|
|
9360
|
-
* per variant — each child session has fresh context, fetches the
|
|
9361
|
-
* variant's accepted attempt output via `moltnet_get_task` /
|
|
9362
|
-
* `moltnet_list_task_attempts`, and grades against the rubric.
|
|
9467
|
+
* criteria: required (`successCriteria.rubric`)
|
|
9468
|
+
* references: not required at the input layer — `targetTaskId` +
|
|
9469
|
+
* `targetAttemptN` pin the producer attempt being judged.
|
|
9363
9470
|
*
|
|
9364
|
-
*
|
|
9365
|
-
*
|
|
9366
|
-
*
|
|
9367
|
-
*
|
|
9368
|
-
|
|
9369
|
-
|
|
9370
|
-
|
|
9371
|
-
|
|
9372
|
-
|
|
9373
|
-
* which the task service runs at create time (#1096 wiring). The
|
|
9374
|
-
* TypeBox layer here only enforces shape: UUID format,
|
|
9375
|
-
* minItems/maxItems, rubric presence + weight invariant.
|
|
9376
|
-
*/
|
|
9377
|
-
var JUDGE_EVAL_VARIANT_TYPE = "judge_eval_variant";
|
|
9378
|
-
var JudgeEvalVariantInput = Type$1.Object({
|
|
9379
|
-
runTaskIds: Type$1.Array(Type$1.String({ format: "uuid" }), {
|
|
9380
|
-
minItems: 2,
|
|
9381
|
-
maxItems: 10
|
|
9382
|
-
}),
|
|
9471
|
+
* This replaces the earlier parent/subagent `judge_eval_variant` design.
|
|
9472
|
+
* The unit of judgment is one producer attempt. Cross-variant deltas can be
|
|
9473
|
+
* computed later at read time from stored scores, rather than materialized as
|
|
9474
|
+
* their own task output.
|
|
9475
|
+
*/
|
|
9476
|
+
var JUDGE_EVAL_ATTEMPT_TYPE = "judge_eval_attempt";
|
|
9477
|
+
var JudgeEvalAttemptInput = Type$1.Object({
|
|
9478
|
+
targetTaskId: Type$1.String({ format: "uuid" }),
|
|
9479
|
+
targetAttemptN: Type$1.Integer({ minimum: 1 }),
|
|
9383
9480
|
successCriteria: SuccessCriteria
|
|
9384
9481
|
}, {
|
|
9385
|
-
$id: "
|
|
9482
|
+
$id: "JudgeEvalAttemptInput",
|
|
9386
9483
|
additionalProperties: false
|
|
9387
9484
|
});
|
|
9388
|
-
|
|
9389
|
-
|
|
9390
|
-
|
|
9391
|
-
* deterministic_*). Reuse the type rather than re-declare.
|
|
9392
|
-
*
|
|
9393
|
-
* This is also the **subagent output contract** — the parent's
|
|
9394
|
-
* `subagent` tool resolves the contract name `judge_eval_variant_result`
|
|
9395
|
-
* to this schema. See `agent-runtime`'s subagent contract registry.
|
|
9396
|
-
*/
|
|
9397
|
-
var JudgeEvalVariantResult = Type$1.Object({
|
|
9398
|
-
runTaskId: Type$1.String({ format: "uuid" }),
|
|
9485
|
+
var JudgeEvalAttemptOutput = Type$1.Object({
|
|
9486
|
+
targetTaskId: Type$1.String({ format: "uuid" }),
|
|
9487
|
+
targetAttemptN: Type$1.Integer({ minimum: 1 }),
|
|
9399
9488
|
variantLabel: Type$1.String({
|
|
9400
9489
|
minLength: 1,
|
|
9401
9490
|
maxLength: 64,
|
|
@@ -9406,219 +9495,195 @@ var JudgeEvalVariantResult = Type$1.Object({
|
|
|
9406
9495
|
minimum: 0,
|
|
9407
9496
|
maximum: 1
|
|
9408
9497
|
}),
|
|
9409
|
-
verdict: Type$1.String({ minLength: 1 })
|
|
9410
|
-
}, {
|
|
9411
|
-
$id: "JudgeEvalVariantResult",
|
|
9412
|
-
additionalProperties: false
|
|
9413
|
-
});
|
|
9414
|
-
var JudgeEvalVariantOutput = Type$1.Object({
|
|
9415
|
-
results: Type$1.Array(JudgeEvalVariantResult, { minItems: 2 }),
|
|
9416
|
-
deltas: Type$1.Optional(Type$1.Record(Type$1.String(), Type$1.Number({
|
|
9417
|
-
minimum: -1,
|
|
9418
|
-
maximum: 1
|
|
9419
|
-
}))),
|
|
9498
|
+
verdict: Type$1.String({ minLength: 1 }),
|
|
9420
9499
|
judgeModel: Type$1.Optional(Type$1.String({ minLength: 1 })),
|
|
9421
9500
|
traceparent: Type$1.String({ minLength: 1 })
|
|
9422
9501
|
}, {
|
|
9423
|
-
$id: "
|
|
9502
|
+
$id: "JudgeEvalAttemptOutput",
|
|
9424
9503
|
additionalProperties: false
|
|
9425
9504
|
});
|
|
9426
|
-
|
|
9427
|
-
* Synchronous input invariants beyond TypeBox shape: rubric must be
|
|
9428
|
-
* present (already required by the schema, but the rubric body has
|
|
9429
|
-
* its own per-criterion weight invariant) and the rubric's weights
|
|
9430
|
-
* must sum to 1.
|
|
9431
|
-
*
|
|
9432
|
-
* Cross-task invariants (all targets are `run_eval`, all completed,
|
|
9433
|
-
* share `correlation_id`, byte-identical `input.successCriteria`)
|
|
9434
|
-
* are NOT checked here — they require async DB lookups against
|
|
9435
|
-
* `runTaskIds` and live in `validateJudgeEvalVariantInputAsync`
|
|
9436
|
-
* below, invoked by the task service at create time (#1096).
|
|
9437
|
-
*/
|
|
9438
|
-
function validateJudgeEvalVariantInput(input) {
|
|
9505
|
+
function validateJudgeEvalAttemptInput(input) {
|
|
9439
9506
|
const sc = input.successCriteria;
|
|
9440
|
-
if (!sc) return "successCriteria is required for
|
|
9441
|
-
if (!sc.rubric) return "successCriteria.rubric is required for
|
|
9507
|
+
if (!sc) return "successCriteria is required for judge_eval_attempt";
|
|
9508
|
+
if (!sc.rubric) return "successCriteria.rubric is required for judge_eval_attempt";
|
|
9442
9509
|
return validateRubricWeights(sc.rubric);
|
|
9443
9510
|
}
|
|
9444
|
-
|
|
9445
|
-
* Output cross-field invariants the schema cannot express:
|
|
9446
|
-
*
|
|
9447
|
-
* 1. `results.length === input.runTaskIds.length` — every variant
|
|
9448
|
-
* the imposer asked for must be graded. Partial grading
|
|
9449
|
-
* invalidates cross-variant comparison; fail the whole task
|
|
9450
|
-
* rather than silently report a subset.
|
|
9451
|
-
*
|
|
9452
|
-
* 2. `results[i].runTaskId === input.runTaskIds[i]` — order is
|
|
9453
|
-
* load-bearing for downstream consumers (e.g. deltas keyed by
|
|
9454
|
-
* adjacent pairs). Mismatch is an LLM bug; reject loudly.
|
|
9455
|
-
*
|
|
9456
|
-
* 3. Each `result.scores` follows the same `llm_checklist` rule
|
|
9457
|
-
* `judge_pack` enforces (#999): if a score has an `assertions`
|
|
9458
|
-
* array, the numeric score MUST be `1` iff every assertion
|
|
9459
|
-
* passes. Inconsistent payloads pollute attestations.
|
|
9460
|
-
*
|
|
9461
|
-
* 4. Each `result.composite` MUST equal the rubric-weighted sum
|
|
9462
|
-
* `Σ(weight_j × scores[j].score)`. The parent (and any subagent
|
|
9463
|
-
* it delegated to) is supposed to compute this; surfacing a
|
|
9464
|
-
* drift here catches LLMs that hand-wave the arithmetic.
|
|
9465
|
-
*
|
|
9466
|
-
* 5. Optional `deltas` keys MUST be of the form `"A - B"` where
|
|
9467
|
-
* both `A` and `B` are variantLabels present in `results`.
|
|
9468
|
-
* Values are not range-checked (any float in [-1, 1] is
|
|
9469
|
-
* arithmetically possible).
|
|
9470
|
-
*/
|
|
9471
|
-
function validateJudgeEvalVariantOutput(output, input) {
|
|
9511
|
+
function validateJudgeEvalAttemptOutput(output, input) {
|
|
9472
9512
|
const out = output;
|
|
9473
9513
|
const inp = input;
|
|
9474
9514
|
if (inp) {
|
|
9475
|
-
if (out.
|
|
9476
|
-
|
|
9515
|
+
if (out.targetTaskId !== inp.targetTaskId) return `output.targetTaskId (${out.targetTaskId}) does not match input.targetTaskId (${inp.targetTaskId})`;
|
|
9516
|
+
if (out.targetAttemptN !== inp.targetAttemptN) return `output.targetAttemptN (${out.targetAttemptN}) does not match input.targetAttemptN (${inp.targetAttemptN})`;
|
|
9477
9517
|
}
|
|
9478
|
-
for (let
|
|
9479
|
-
const
|
|
9480
|
-
|
|
9481
|
-
|
|
9482
|
-
|
|
9483
|
-
|
|
9484
|
-
const expected = allPassed ? 1 : 0;
|
|
9485
|
-
if (sc.score !== expected) return `results[${r}].scores[${s}] (criterionId="${sc.criterionId}"): assertions ${allPassed ? "all pass" : "have at least one fail"} but score=${sc.score}. Score must be derived: 1 iff every assertion passes, else 0 (#999 llm_checklist rule).`;
|
|
9486
|
-
}
|
|
9518
|
+
for (let s = 0; s < out.scores.length; s++) {
|
|
9519
|
+
const sc = out.scores[s];
|
|
9520
|
+
if (!sc.assertions) continue;
|
|
9521
|
+
const allPassed = sc.assertions.every((a) => a.passed);
|
|
9522
|
+
const expected = allPassed ? 1 : 0;
|
|
9523
|
+
if (sc.score !== expected) return `scores[${s}] (criterionId="${sc.criterionId}"): assertions ${allPassed ? "all pass" : "have at least one fail"} but score=${sc.score}. Score must be 1 iff every assertion passes, else 0.`;
|
|
9487
9524
|
}
|
|
9488
9525
|
if (inp?.successCriteria?.rubric) {
|
|
9489
9526
|
const criteria = inp.successCriteria.rubric.criteria;
|
|
9490
9527
|
const weightById = new Map(criteria.map((c) => [c.id, c.weight]));
|
|
9491
|
-
|
|
9492
|
-
|
|
9493
|
-
|
|
9494
|
-
|
|
9495
|
-
|
|
9496
|
-
if (w === void 0) return `results[${r}].scores: criterionId "${sc.criterionId}" is not in the input rubric (known: ${Array.from(weightById.keys()).join(", ")}). Score every rubric criterion exactly once; do not invent new ids.`;
|
|
9497
|
-
sum += w * sc.score;
|
|
9498
|
-
}
|
|
9499
|
-
if (Math.abs(sum - result.composite) > .001) return `results[${r}].composite (${result.composite}) does not match Σ(weight × score) (${sum.toFixed(6)}). Composite must be the rubric-weighted sum of per-criterion scores (drift > 0.001).`;
|
|
9500
|
-
}
|
|
9501
|
-
}
|
|
9502
|
-
if (out.deltas) {
|
|
9503
|
-
const labels = new Set(out.results.map((r) => r.variantLabel));
|
|
9504
|
-
for (const key of Object.keys(out.deltas)) {
|
|
9505
|
-
const m = /^(.+?) - (.+)$/.exec(key);
|
|
9506
|
-
if (!m) return `deltas key "${key}" is not of the form "<variantLabel-A> - <variantLabel-B>". Use a single space-hyphen-space separator between labels.`;
|
|
9507
|
-
const [, a, b] = m;
|
|
9508
|
-
if (!labels.has(a) || !labels.has(b)) return `deltas key "${key}" references variantLabel(s) not present in results: ${!labels.has(a) ? `"${a}" missing` : ""}${!labels.has(a) && !labels.has(b) ? ", " : ""}${!labels.has(b) ? `"${b}" missing` : ""}`;
|
|
9528
|
+
let sum = 0;
|
|
9529
|
+
for (const sc of out.scores) {
|
|
9530
|
+
const w = weightById.get(sc.criterionId);
|
|
9531
|
+
if (w === void 0) return `scores references unknown criterionId "${sc.criterionId}"`;
|
|
9532
|
+
sum += w * sc.score;
|
|
9509
9533
|
}
|
|
9534
|
+
const rounded = Math.round(sum * 1e3) / 1e3;
|
|
9535
|
+
if (Math.abs(rounded - out.composite) > .001) return `composite (${out.composite}) does not match weighted rubric sum (${rounded})`;
|
|
9510
9536
|
}
|
|
9511
9537
|
return null;
|
|
9512
9538
|
}
|
|
9513
|
-
|
|
9514
|
-
|
|
9515
|
-
* equality. Recursively sorts object keys; arrays preserve order
|
|
9516
|
-
* (intentional — rubric criteria order is semantically meaningful).
|
|
9517
|
-
* Mirrors the canonical-JSON shape `crypto-service` uses for CIDs,
|
|
9518
|
-
* without taking on a crypto-service dep just for this comparison.
|
|
9519
|
-
*/
|
|
9520
|
-
function stableStringify(value) {
|
|
9521
|
-
if (value === null || typeof value !== "object") return JSON.stringify(value);
|
|
9522
|
-
if (Array.isArray(value)) return "[" + value.map(stableStringify).join(",") + "]";
|
|
9523
|
-
const obj = value;
|
|
9524
|
-
return "{" + Object.keys(obj).sort().map((k) => JSON.stringify(k) + ":" + stableStringify(obj[k])).join(",") + "}";
|
|
9525
|
-
}
|
|
9526
|
-
/**
|
|
9527
|
-
* Async preflight for `judge_eval_variant` (#1096 + #943):
|
|
9528
|
-
*
|
|
9529
|
-
* 1. Every `runTaskIds[i]` resolves to a task the caller can read.
|
|
9530
|
-
* 2. Every resolved task is `taskType === 'run_eval'`.
|
|
9531
|
-
* 3. Every resolved task is `status === 'completed'` with a
|
|
9532
|
-
* non-null `acceptedAttemptN` — grading an unaccepted attempt
|
|
9533
|
-
* races with re-attempts and pollutes the judge attestation.
|
|
9534
|
-
* 4. Every resolved task shares a non-null `correlationId`, and all
|
|
9535
|
-
* `correlationId`s are equal. Without this an imposer could
|
|
9536
|
-
* fabricate a "variant set" by stapling unrelated runs together.
|
|
9537
|
-
* 5. The shared `correlationId` is NOT already sealed. A previous
|
|
9538
|
-
* judge_eval_variant against the same group is final; produce a
|
|
9539
|
-
* fresh correlation_id for a new judging round rather than
|
|
9540
|
-
* adding contradictory verdicts to a sealed group.
|
|
9541
|
-
* 6. Every variant's `input.successCriteria` is byte-identical (via
|
|
9542
|
-
* stable-stringify). Different rubrics across "variants" makes
|
|
9543
|
-
* the comparison meaningless.
|
|
9544
|
-
*/
|
|
9545
|
-
async function validateJudgeEvalVariantInputAsync(input, ctx) {
|
|
9546
|
-
const { runTaskIds } = input;
|
|
9539
|
+
async function validateJudgeEvalAttemptInputAsync(input, ctx) {
|
|
9540
|
+
const inp = input;
|
|
9547
9541
|
const errors = [];
|
|
9548
|
-
const
|
|
9549
|
-
|
|
9550
|
-
|
|
9551
|
-
|
|
9552
|
-
|
|
9553
|
-
|
|
9554
|
-
|
|
9555
|
-
|
|
9556
|
-
field: `runTaskIds[${i}]`,
|
|
9557
|
-
message: `runTaskIds[${i}]=${runTaskIds[i]} does not resolve to a task you can read`
|
|
9558
|
-
});
|
|
9559
|
-
continue;
|
|
9560
|
-
}
|
|
9561
|
-
presentTargets.push(t);
|
|
9562
|
-
if (t.taskType !== "run_eval") errors.push({
|
|
9563
|
-
field: `runTaskIds[${i}]`,
|
|
9564
|
-
message: `runTaskIds[${i}]=${runTaskIds[i]} is a ${t.taskType}, not a run_eval`
|
|
9565
|
-
});
|
|
9566
|
-
if (t.status !== "completed" || t.acceptedAttemptN === null) errors.push({
|
|
9567
|
-
field: `runTaskIds[${i}]`,
|
|
9568
|
-
message: `runTaskIds[${i}]=${runTaskIds[i]} is not completed with an accepted attempt (status=${t.status}, acceptedAttemptN=${t.acceptedAttemptN})`
|
|
9569
|
-
});
|
|
9570
|
-
}
|
|
9571
|
-
if (missingTargets || presentTargets.length === 0) return errors;
|
|
9572
|
-
const correlationIds = new Set(presentTargets.map((t) => t.correlationId ?? "__null__"));
|
|
9573
|
-
if (correlationIds.has("__null__")) errors.push({
|
|
9574
|
-
field: "runTaskIds",
|
|
9575
|
-
message: "one or more run_eval targets have no correlation_id; cannot group as variants"
|
|
9542
|
+
const target = await ctx.resolveTask(inp.targetTaskId);
|
|
9543
|
+
if (!target) return [{
|
|
9544
|
+
field: "targetTaskId",
|
|
9545
|
+
message: `targetTaskId=${inp.targetTaskId} does not resolve to a task you can read`
|
|
9546
|
+
}];
|
|
9547
|
+
if (target.taskType !== "run_eval") errors.push({
|
|
9548
|
+
field: "targetTaskId",
|
|
9549
|
+
message: `targetTaskId=${inp.targetTaskId} is a ${target.taskType}, not a run_eval`
|
|
9576
9550
|
});
|
|
9577
|
-
if (
|
|
9578
|
-
field: "
|
|
9579
|
-
message: `
|
|
9551
|
+
if (target.status !== "completed" || target.acceptedAttemptN === null) errors.push({
|
|
9552
|
+
field: "targetTaskId",
|
|
9553
|
+
message: `targetTaskId=${inp.targetTaskId} is not completed with an accepted attempt (status=${target.status}, acceptedAttemptN=${target.acceptedAttemptN})`
|
|
9580
9554
|
});
|
|
9581
|
-
if (
|
|
9582
|
-
|
|
9583
|
-
|
|
9584
|
-
|
|
9585
|
-
if (
|
|
9586
|
-
field: "
|
|
9587
|
-
message:
|
|
9555
|
+
else if (target.acceptedAttemptN !== inp.targetAttemptN) errors.push({
|
|
9556
|
+
field: "targetAttemptN",
|
|
9557
|
+
message: `targetAttemptN=${inp.targetAttemptN} does not match the producer's acceptedAttemptN=${target.acceptedAttemptN}`
|
|
9558
|
+
});
|
|
9559
|
+
if (!target.correlationId) errors.push({
|
|
9560
|
+
field: "targetTaskId",
|
|
9561
|
+
message: "target run_eval has no correlation_id; cannot enforce duplicate-judge protection"
|
|
9562
|
+
});
|
|
9563
|
+
if (errors.length > 0 || !target.correlationId) return errors;
|
|
9564
|
+
const rubric = inp.successCriteria.rubric;
|
|
9565
|
+
const duplicate = (await ctx.listTasksByCorrelation(target.correlationId)).find((task) => {
|
|
9566
|
+
if (task.taskType !== "judge_eval_attempt") return false;
|
|
9567
|
+
if (task.status === "failed" || task.status === "cancelled" || task.status === "expired") return false;
|
|
9568
|
+
const existing = task.input;
|
|
9569
|
+
const existingRubric = existing.successCriteria?.rubric;
|
|
9570
|
+
return existing.targetTaskId === inp.targetTaskId && existing.targetAttemptN === inp.targetAttemptN && existingRubric?.rubricId === rubric?.rubricId && existingRubric?.version === rubric?.version;
|
|
9571
|
+
});
|
|
9572
|
+
if (duplicate) errors.push({
|
|
9573
|
+
field: "targetTaskId",
|
|
9574
|
+
message: `judge task ${duplicate.id} already exists for (${inp.targetTaskId}, attempt ${inp.targetAttemptN}, rubric ${rubric?.rubricId}@${rubric?.version})`
|
|
9588
9575
|
});
|
|
9589
|
-
const first = stableStringify(presentTargets[0].input.successCriteria);
|
|
9590
|
-
for (let i = 1; i < presentTargets.length; i++) if (stableStringify(presentTargets[i].input.successCriteria) !== first) {
|
|
9591
|
-
errors.push({
|
|
9592
|
-
field: `runTaskIds[${i}]`,
|
|
9593
|
-
message: `runTaskIds[${i}] has a different input.successCriteria than runTaskIds[0]; all variants must share the rubric and gates`
|
|
9594
|
-
});
|
|
9595
|
-
break;
|
|
9596
|
-
}
|
|
9597
9576
|
return errors;
|
|
9598
9577
|
}
|
|
9599
|
-
|
|
9600
|
-
|
|
9601
|
-
|
|
9602
|
-
|
|
9603
|
-
* concurrent second `judge_eval_variant` against the same group
|
|
9604
|
-
* loses the race and is rejected with a clean conflict error.
|
|
9605
|
-
*
|
|
9606
|
-
* The seal applies to the SHARED correlation_id of the targets —
|
|
9607
|
-
* NOT to the judge task's own correlationId (which is typically
|
|
9608
|
-
* null or distinct). The task service derives the correlationId
|
|
9609
|
-
* for the effect from the resolved targets, not from the judge
|
|
9610
|
-
* task row.
|
|
9611
|
-
*/
|
|
9612
|
-
async function onCreateJudgeEvalVariant(input, ctx) {
|
|
9613
|
-
const { runTaskIds } = input;
|
|
9614
|
-
const first = await ctx.resolveTask(runTaskIds[0]);
|
|
9615
|
-
if (!first?.correlationId) return [];
|
|
9578
|
+
async function onCreateJudgeEvalAttempt(input, _ctx) {
|
|
9579
|
+
const judge = input;
|
|
9580
|
+
const rubric = judge.successCriteria.rubric;
|
|
9581
|
+
if (!rubric) return [];
|
|
9616
9582
|
return [{
|
|
9617
|
-
kind: "
|
|
9618
|
-
|
|
9583
|
+
kind: "guardTaskUniqueness",
|
|
9584
|
+
taskType: JUDGE_EVAL_ATTEMPT_TYPE,
|
|
9585
|
+
lockKey: [
|
|
9586
|
+
JUDGE_EVAL_ATTEMPT_TYPE,
|
|
9587
|
+
judge.targetTaskId,
|
|
9588
|
+
String(judge.targetAttemptN),
|
|
9589
|
+
rubric.rubricId,
|
|
9590
|
+
rubric.version
|
|
9591
|
+
].join(":"),
|
|
9592
|
+
inputMatches: [
|
|
9593
|
+
{
|
|
9594
|
+
path: ["targetTaskId"],
|
|
9595
|
+
value: judge.targetTaskId
|
|
9596
|
+
},
|
|
9597
|
+
{
|
|
9598
|
+
path: ["targetAttemptN"],
|
|
9599
|
+
value: judge.targetAttemptN
|
|
9600
|
+
},
|
|
9601
|
+
{
|
|
9602
|
+
path: [
|
|
9603
|
+
"successCriteria",
|
|
9604
|
+
"rubric",
|
|
9605
|
+
"rubricId"
|
|
9606
|
+
],
|
|
9607
|
+
value: rubric.rubricId
|
|
9608
|
+
},
|
|
9609
|
+
{
|
|
9610
|
+
path: [
|
|
9611
|
+
"successCriteria",
|
|
9612
|
+
"rubric",
|
|
9613
|
+
"version"
|
|
9614
|
+
],
|
|
9615
|
+
value: rubric.version
|
|
9616
|
+
}
|
|
9617
|
+
]
|
|
9619
9618
|
}];
|
|
9620
9619
|
}
|
|
9621
9620
|
//#endregion
|
|
9621
|
+
//#region ../tasks/src/task-types/pr-review.ts
|
|
9622
|
+
var PR_REVIEW_TYPE = "pr_review";
|
|
9623
|
+
var PrReviewSubject = Type$1.Object({
|
|
9624
|
+
title: Type$1.String({ minLength: 1 }),
|
|
9625
|
+
summary: Type$1.String({ minLength: 1 }),
|
|
9626
|
+
resourceUrls: Type$1.Optional(Type$1.Array(Type$1.String({ minLength: 1 }))),
|
|
9627
|
+
inspectionHints: Type$1.Optional(Type$1.Array(Type$1.String({ minLength: 1 })))
|
|
9628
|
+
}, {
|
|
9629
|
+
$id: "PrReviewSubject",
|
|
9630
|
+
additionalProperties: false
|
|
9631
|
+
});
|
|
9632
|
+
var PrReviewInput = Type$1.Object({
|
|
9633
|
+
subject: PrReviewSubject,
|
|
9634
|
+
taskPrompt: Type$1.Optional(Type$1.String({ minLength: 1 })),
|
|
9635
|
+
successCriteria: SuccessCriteria
|
|
9636
|
+
}, {
|
|
9637
|
+
$id: "PrReviewInput",
|
|
9638
|
+
additionalProperties: false
|
|
9639
|
+
});
|
|
9640
|
+
var PrReviewScore = Type$1.Object({
|
|
9641
|
+
criterionId: Type$1.String({ minLength: 1 }),
|
|
9642
|
+
score: Type$1.Union([Type$1.Literal(0), Type$1.Literal(1)]),
|
|
9643
|
+
rationale: Type$1.String({ minLength: 1 })
|
|
9644
|
+
}, {
|
|
9645
|
+
$id: "PrReviewScore",
|
|
9646
|
+
additionalProperties: false
|
|
9647
|
+
});
|
|
9648
|
+
var PrReviewOutput = Type$1.Object({
|
|
9649
|
+
scores: Type$1.Array(PrReviewScore, { minItems: 1 }),
|
|
9650
|
+
composite: Type$1.Number({
|
|
9651
|
+
minimum: 0,
|
|
9652
|
+
maximum: 1
|
|
9653
|
+
}),
|
|
9654
|
+
verdict: Type$1.String({ minLength: 1 })
|
|
9655
|
+
}, {
|
|
9656
|
+
$id: "PrReviewOutput",
|
|
9657
|
+
additionalProperties: false
|
|
9658
|
+
});
|
|
9659
|
+
function requireBooleanRubric(rubric) {
|
|
9660
|
+
for (const criterion of rubric.criteria) if (criterion.scoring !== "boolean") return `pr_review requires boolean scoring for every rubric criterion; criterion "${criterion.id}" uses "${criterion.scoring}"`;
|
|
9661
|
+
return null;
|
|
9662
|
+
}
|
|
9663
|
+
function validatePrReviewInput(input) {
|
|
9664
|
+
const sc = input.successCriteria;
|
|
9665
|
+
if (!sc) return "successCriteria is required for judgment tasks";
|
|
9666
|
+
if (!sc.rubric) return "successCriteria.rubric is required for judgment tasks";
|
|
9667
|
+
return validateRubricWeights(sc.rubric) ?? requireBooleanRubric(sc.rubric);
|
|
9668
|
+
}
|
|
9669
|
+
function validatePrReviewOutput(output, input) {
|
|
9670
|
+
if (!input) return null;
|
|
9671
|
+
const scores = output.scores;
|
|
9672
|
+
const rubric = input.successCriteria.rubric;
|
|
9673
|
+
if (!rubric) return null;
|
|
9674
|
+
if (scores.length !== rubric.criteria.length) return `scores length ${scores.length} does not match rubric criteria length ${rubric.criteria.length}`;
|
|
9675
|
+
let composite = 0;
|
|
9676
|
+
for (let i = 0; i < rubric.criteria.length; i++) {
|
|
9677
|
+
const criterion = rubric.criteria[i];
|
|
9678
|
+
const score = scores[i];
|
|
9679
|
+
if (score.criterionId !== criterion.id) return `scores[${i}] has criterionId "${score.criterionId}" but rubric expects "${criterion.id}" in that position`;
|
|
9680
|
+
composite += criterion.weight * score.score;
|
|
9681
|
+
}
|
|
9682
|
+
const claimed = output.composite;
|
|
9683
|
+
if (Math.abs(claimed - composite) > 1e-6) return `composite ${claimed} does not match weighted sum ${composite.toFixed(6)}`;
|
|
9684
|
+
return null;
|
|
9685
|
+
}
|
|
9686
|
+
//#endregion
|
|
9622
9687
|
//#region ../tasks/src/task-types/render-pack.ts
|
|
9623
9688
|
/**
|
|
9624
9689
|
* `render_pack` — turn a context pack into a signed rendered artefact.
|
|
@@ -9673,14 +9738,43 @@ async function validateRenderPackInputAsync(input, ctx) {
|
|
|
9673
9738
|
//#region ../tasks/src/task-types/run-eval.ts
|
|
9674
9739
|
/**
|
|
9675
9740
|
* `run_eval` — execute a scenario prompt under a named variant for
|
|
9676
|
-
* later
|
|
9741
|
+
* later per-attempt grading by `judge_eval_attempt` tasks.
|
|
9677
9742
|
*
|
|
9678
9743
|
* output_kind: artifact
|
|
9679
|
-
* criteria: optional (when set,
|
|
9680
|
-
*
|
|
9744
|
+
* criteria: optional producer-only checks (when set,
|
|
9745
|
+
* output.verification is required — the judge rubric remains hidden
|
|
9746
|
+
* on downstream `judge_eval_attempt` tasks)
|
|
9681
9747
|
* references: not required (scenario lives entirely in input)
|
|
9682
9748
|
*/
|
|
9683
9749
|
var RUN_EVAL_TYPE = "run_eval";
|
|
9750
|
+
var RunEvalMode = Type$1.Union([Type$1.Literal("vitro"), Type$1.Literal("vivo")], { $id: "RunEvalMode" });
|
|
9751
|
+
var RunEvalWorkspace = Type$1.Union([
|
|
9752
|
+
Type$1.Literal("none"),
|
|
9753
|
+
Type$1.Literal("shared_mount"),
|
|
9754
|
+
Type$1.Literal("dedicated_worktree")
|
|
9755
|
+
], { $id: "RunEvalWorkspace" });
|
|
9756
|
+
var RunEvalExecution = Type$1.Object({
|
|
9757
|
+
mode: RunEvalMode,
|
|
9758
|
+
workspace: RunEvalWorkspace
|
|
9759
|
+
}, {
|
|
9760
|
+
$id: "RunEvalExecution",
|
|
9761
|
+
additionalProperties: false
|
|
9762
|
+
});
|
|
9763
|
+
/**
|
|
9764
|
+
* Producer-visible checks for `run_eval`. Deliberately forbids `rubric`
|
|
9765
|
+
* so the variant runner cannot see the downstream judge's answer key.
|
|
9766
|
+
* Keep the rest of the SuccessCriteria envelope available for generic
|
|
9767
|
+
* process / structure checks (`gates`, `assertions`, `sideEffects`).
|
|
9768
|
+
*/
|
|
9769
|
+
var RunEvalSuccessCriteria = Type$1.Object({
|
|
9770
|
+
version: Type$1.Literal(1),
|
|
9771
|
+
gates: Type$1.Optional(SuccessCriteria.properties.gates),
|
|
9772
|
+
assertions: Type$1.Optional(SuccessCriteria.properties.assertions),
|
|
9773
|
+
sideEffects: Type$1.Optional(SuccessCriteria.properties.sideEffects)
|
|
9774
|
+
}, {
|
|
9775
|
+
$id: "RunEvalSuccessCriteria",
|
|
9776
|
+
additionalProperties: false
|
|
9777
|
+
});
|
|
9684
9778
|
var RunEvalInput = Type$1.Object({
|
|
9685
9779
|
scenario: Type$1.Object({
|
|
9686
9780
|
prompt: Type$1.String({ minLength: 1 }),
|
|
@@ -9690,8 +9784,9 @@ var RunEvalInput = Type$1.Object({
|
|
|
9690
9784
|
minLength: 1,
|
|
9691
9785
|
maxLength: 64
|
|
9692
9786
|
}),
|
|
9787
|
+
execution: RunEvalExecution,
|
|
9693
9788
|
context: TaskContext,
|
|
9694
|
-
successCriteria: Type$1.Optional(
|
|
9789
|
+
successCriteria: Type$1.Optional(RunEvalSuccessCriteria)
|
|
9695
9790
|
}, {
|
|
9696
9791
|
$id: "RunEvalInput",
|
|
9697
9792
|
additionalProperties: false
|
|
@@ -9719,8 +9814,8 @@ var RunEvalOutput = Type$1.Object({
|
|
|
9719
9814
|
function validateRunEvalOutput(output, input) {
|
|
9720
9815
|
const hasCriteria = input !== null && input !== void 0 && input.successCriteria !== void 0;
|
|
9721
9816
|
const hasVerification = output !== null && output !== void 0 && output.verification !== void 0;
|
|
9722
|
-
if (hasCriteria && !hasVerification) return "output.verification is required because input.successCriteria is set; the producer LLM must self-assess against the
|
|
9723
|
-
if (!hasCriteria && hasVerification) return "output.verification was supplied but input.successCriteria is unset; omit verification when there are no
|
|
9817
|
+
if (hasCriteria && !hasVerification) return "output.verification is required because input.successCriteria is set; the producer LLM must self-assess against the producer checks";
|
|
9818
|
+
if (!hasCriteria && hasVerification) return "output.verification was supplied but input.successCriteria is unset; omit verification when there are no producer checks to assess against";
|
|
9724
9819
|
return null;
|
|
9725
9820
|
}
|
|
9726
9821
|
//#endregion
|
|
@@ -9786,6 +9881,18 @@ var BUILT_IN_TASK_TYPES = {
|
|
|
9786
9881
|
validateInput: validateJudgmentInput,
|
|
9787
9882
|
validateInputAsync: validateAssessBriefInputAsync
|
|
9788
9883
|
},
|
|
9884
|
+
[PR_REVIEW_TYPE]: {
|
|
9885
|
+
name: PR_REVIEW_TYPE,
|
|
9886
|
+
inputSchema: PrReviewInput,
|
|
9887
|
+
outputSchema: PrReviewOutput,
|
|
9888
|
+
outputKind: "judgment",
|
|
9889
|
+
workspaceMode: "dedicated_worktree",
|
|
9890
|
+
workspaceScope: "attempt",
|
|
9891
|
+
sessionScope: "none",
|
|
9892
|
+
requiresReferences: false,
|
|
9893
|
+
validateInput: validatePrReviewInput,
|
|
9894
|
+
validateOutput: validatePrReviewOutput
|
|
9895
|
+
},
|
|
9789
9896
|
[CURATE_PACK_TYPE]: {
|
|
9790
9897
|
name: CURATE_PACK_TYPE,
|
|
9791
9898
|
inputSchema: CuratePackInput,
|
|
@@ -9824,24 +9931,24 @@ var BUILT_IN_TASK_TYPES = {
|
|
|
9824
9931
|
inputSchema: RunEvalInput,
|
|
9825
9932
|
outputSchema: RunEvalOutput,
|
|
9826
9933
|
outputKind: "artifact",
|
|
9827
|
-
|
|
9934
|
+
resumable: true,
|
|
9935
|
+
workspaceScope: "session",
|
|
9828
9936
|
sessionScope: "custom",
|
|
9829
9937
|
requiresReferences: false,
|
|
9830
9938
|
validateOutput: validateRunEvalOutput
|
|
9831
9939
|
},
|
|
9832
|
-
[
|
|
9833
|
-
name:
|
|
9834
|
-
inputSchema:
|
|
9835
|
-
outputSchema:
|
|
9940
|
+
[JUDGE_EVAL_ATTEMPT_TYPE]: {
|
|
9941
|
+
name: JUDGE_EVAL_ATTEMPT_TYPE,
|
|
9942
|
+
inputSchema: JudgeEvalAttemptInput,
|
|
9943
|
+
outputSchema: JudgeEvalAttemptOutput,
|
|
9836
9944
|
outputKind: "judgment",
|
|
9837
9945
|
workspaceScope: "attempt",
|
|
9838
|
-
sessionScope: "
|
|
9946
|
+
sessionScope: "none",
|
|
9839
9947
|
requiresReferences: false,
|
|
9840
|
-
validateInput:
|
|
9841
|
-
validateOutput:
|
|
9842
|
-
validateInputAsync:
|
|
9843
|
-
onCreate:
|
|
9844
|
-
usesSubagents: true
|
|
9948
|
+
validateInput: validateJudgeEvalAttemptInput,
|
|
9949
|
+
validateOutput: validateJudgeEvalAttemptOutput,
|
|
9950
|
+
validateInputAsync: validateJudgeEvalAttemptInputAsync,
|
|
9951
|
+
onCreate: onCreateJudgeEvalAttempt
|
|
9845
9952
|
}
|
|
9846
9953
|
};
|
|
9847
9954
|
//#endregion
|
|
@@ -10205,20 +10312,16 @@ function buildFinalOutputBlock(opts) {
|
|
|
10205
10312
|
"## Final output (read this carefully)",
|
|
10206
10313
|
"",
|
|
10207
10314
|
`Your VERY LAST action in this conversation MUST report the structured`,
|
|
10208
|
-
`output matching \`${outputSchemaName}
|
|
10209
|
-
`preference:`,
|
|
10315
|
+
`output matching \`${outputSchemaName}\`.`,
|
|
10210
10316
|
"",
|
|
10211
|
-
`
|
|
10212
|
-
`
|
|
10213
|
-
`
|
|
10214
|
-
`
|
|
10215
|
-
`
|
|
10216
|
-
` \`${outputSchemaName}\`. No prose before or after. No code fences.`,
|
|
10217
|
-
` No "ok" or "done". The runtime parses the last balanced top-level`,
|
|
10218
|
-
` JSON object as the output.`,
|
|
10317
|
+
`Call \`${submitTool}\` exactly once with the payload.`,
|
|
10318
|
+
`The runtime captures the validated arguments and ends the session.`,
|
|
10319
|
+
`Do NOT emit the output as plain assistant text. Do NOT rely on a`,
|
|
10320
|
+
`JSON-in-message fallback. If you do not call \`${submitTool}\`, the`,
|
|
10321
|
+
`attempt fails even if the underlying work succeeded.`,
|
|
10219
10322
|
"",
|
|
10220
|
-
`
|
|
10221
|
-
`
|
|
10323
|
+
`Your final assistant text before that tool call may explain your work,`,
|
|
10324
|
+
`but the submit-tool call itself must be your VERY LAST action.`,
|
|
10222
10325
|
"",
|
|
10223
10326
|
`Output shape:`,
|
|
10224
10327
|
"",
|
|
@@ -10233,6 +10336,20 @@ function buildFinalOutputBlock(opts) {
|
|
|
10233
10336
|
return lines.join("\n");
|
|
10234
10337
|
}
|
|
10235
10338
|
//#endregion
|
|
10339
|
+
//#region ../agent-runtime/src/prompts/rubric-common.ts
|
|
10340
|
+
function renderRubricCriteriaList(rubric) {
|
|
10341
|
+
return rubric.criteria.map((c, i) => `${i + 1}. **${c.id}** (weight ${c.weight}, scoring: \`${c.scoring}\`) — ${c.description}`).join("\n");
|
|
10342
|
+
}
|
|
10343
|
+
function renderRubricPreambleSection(rubric) {
|
|
10344
|
+
if (!rubric.preamble) return null;
|
|
10345
|
+
return [
|
|
10346
|
+
"### Rubric preamble",
|
|
10347
|
+
"",
|
|
10348
|
+
rubric.preamble,
|
|
10349
|
+
""
|
|
10350
|
+
].join("\n");
|
|
10351
|
+
}
|
|
10352
|
+
//#endregion
|
|
10236
10353
|
//#region ../agent-runtime/src/prompts/assess-brief.ts
|
|
10237
10354
|
/**
|
|
10238
10355
|
* Build the first user-message prompt for an `assess_brief` judge attempt.
|
|
@@ -10258,13 +10375,8 @@ function buildFinalOutputBlock(opts) {
|
|
|
10258
10375
|
*/
|
|
10259
10376
|
function buildAssessBriefUserPrompt(input, ctx) {
|
|
10260
10377
|
const rubric = input.successCriteria.rubric;
|
|
10261
|
-
const criteriaList = rubric
|
|
10262
|
-
const preambleSection = rubric
|
|
10263
|
-
"### Rubric preamble",
|
|
10264
|
-
"",
|
|
10265
|
-
rubric.preamble,
|
|
10266
|
-
""
|
|
10267
|
-
].join("\n") : "";
|
|
10378
|
+
const criteriaList = renderRubricCriteriaList(rubric);
|
|
10379
|
+
const preambleSection = renderRubricPreambleSection(rubric) ?? "";
|
|
10268
10380
|
const workspaceSection = ctx.workspace?.mode === "dedicated_worktree" ? [
|
|
10269
10381
|
"### Workspace",
|
|
10270
10382
|
"",
|
|
@@ -10347,21 +10459,30 @@ function buildAssessBriefUserPrompt(input, ctx) {
|
|
|
10347
10459
|
}
|
|
10348
10460
|
//#endregion
|
|
10349
10461
|
//#region ../agent-runtime/src/prompts/self-verification.ts
|
|
10350
|
-
function buildSelfVerificationBlock(taskId) {
|
|
10462
|
+
function buildSelfVerificationBlock(taskId, criteriaField = "successCriteria") {
|
|
10351
10463
|
return [
|
|
10352
10464
|
"## Self-verification",
|
|
10353
10465
|
"",
|
|
10354
|
-
`
|
|
10466
|
+
`If \`input.${criteriaField}\` is set on this task, your final output MUST`,
|
|
10467
|
+
"include a `verification` block. **The runtime/server rejects task",
|
|
10468
|
+
`submission without \`verification\` when \`${criteriaField}\` is present**`,
|
|
10469
|
+
"— the request fails validation and the attempt is discarded, even if the",
|
|
10470
|
+
"underlying work succeeded. Do not call the submit tool until you have",
|
|
10471
|
+
"computed the verification payload.",
|
|
10472
|
+
"",
|
|
10473
|
+
`Call \`moltnet_get_task\` with task id \`${taskId}\` and read \`input.${criteriaField}\`.`,
|
|
10355
10474
|
"",
|
|
10356
|
-
|
|
10475
|
+
`- If \`input.${criteriaField}\` is **absent**, omit \`verification\` from your`,
|
|
10357
10476
|
" final output entirely.",
|
|
10358
|
-
|
|
10359
|
-
" `verification` block in your final output. Evaluate every applicable",
|
|
10477
|
+
`- If \`input.${criteriaField}\` is **present**, evaluate every applicable`,
|
|
10360
10478
|
" item — `gates`, `assertions`, `rubric` criteria, `sideEffects` — against",
|
|
10361
10479
|
" your produced work and emit one result per id. Be honest: a `fail` with",
|
|
10362
10480
|
" a one-line reason is more useful than a false `pass`. Use `skip` (with a",
|
|
10363
10481
|
" `detail`) when you genuinely could not determine a result. Compute",
|
|
10364
10482
|
" `passed = results.every(r => r.status !== 'fail')`.",
|
|
10483
|
+
"- `verification` MUST be a JSON object. Never send a string, markdown",
|
|
10484
|
+
" block, null, or an empty placeholder. The submit tool expects an object",
|
|
10485
|
+
" with `inputCid`, `results`, and `passed` fields.",
|
|
10365
10486
|
"",
|
|
10366
10487
|
"Verification shape:",
|
|
10367
10488
|
"",
|
|
@@ -10375,6 +10496,23 @@ function buildSelfVerificationBlock(taskId) {
|
|
|
10375
10496
|
" \"passed\": <boolean>",
|
|
10376
10497
|
"}",
|
|
10377
10498
|
"```",
|
|
10499
|
+
"",
|
|
10500
|
+
"Minimal valid example:",
|
|
10501
|
+
"",
|
|
10502
|
+
"```json",
|
|
10503
|
+
"{",
|
|
10504
|
+
" \"inputCid\": \"<task inputCid>\",",
|
|
10505
|
+
" \"results\": [",
|
|
10506
|
+
" {",
|
|
10507
|
+
" \"id\": \"<criterion id>\",",
|
|
10508
|
+
" \"kind\": \"rubric\",",
|
|
10509
|
+
" \"status\": \"pass\",",
|
|
10510
|
+
" \"detail\": \"one-line reason\"",
|
|
10511
|
+
" }",
|
|
10512
|
+
" ],",
|
|
10513
|
+
" \"passed\": true",
|
|
10514
|
+
"}",
|
|
10515
|
+
"```",
|
|
10378
10516
|
""
|
|
10379
10517
|
].join("\n");
|
|
10380
10518
|
}
|
|
@@ -10625,69 +10763,62 @@ function buildFulfillBriefUserPrompt(input, ctx) {
|
|
|
10625
10763
|
].filter(Boolean).join("\n");
|
|
10626
10764
|
}
|
|
10627
10765
|
//#endregion
|
|
10628
|
-
//#region ../agent-runtime/src/prompts/judge-eval-
|
|
10629
|
-
|
|
10630
|
-
|
|
10631
|
-
|
|
10632
|
-
*
|
|
10633
|
-
* The parent agent's job is **fan-out-and-collect**: for each
|
|
10634
|
-
* `runTaskIds[i]`, spawn an isolated subagent via the `subagent` custom
|
|
10635
|
-
* tool (#1087), have it grade that variant against the shared rubric,
|
|
10636
|
-
* and collect each subagent's structured `judge_eval_variant_result`
|
|
10637
|
-
* payload. The parent does NOT grade itself; it composes the per-
|
|
10638
|
-
* variant results into the final `judge_eval_variant` output (results
|
|
10639
|
-
* array + optional deltas + verdicts).
|
|
10640
|
-
*
|
|
10641
|
-
* Isolation is the point: each variant gets a fresh subagent session
|
|
10642
|
-
* with no carryover context from sibling variants, so per-variant
|
|
10643
|
-
* grading is independent. Cost is bounded by `maxItems: 10` on
|
|
10644
|
-
* runTaskIds.
|
|
10645
|
-
*/
|
|
10646
|
-
function buildJudgeEvalVariantUserPrompt(input, ctx) {
|
|
10647
|
-
const { runTaskIds, successCriteria } = input;
|
|
10648
|
-
const rubric = successCriteria.rubric;
|
|
10649
|
-
if (!rubric) throw new Error("judge_eval_variant requires successCriteria.rubric — none present");
|
|
10766
|
+
//#region ../agent-runtime/src/prompts/judge-eval-attempt.ts
|
|
10767
|
+
function buildJudgeEvalAttemptUserPrompt(input, ctx) {
|
|
10768
|
+
const rubric = input.successCriteria.rubric;
|
|
10769
|
+
if (!rubric) throw new Error("judge_eval_attempt requires successCriteria.rubric — none present");
|
|
10650
10770
|
const escapeCell = (s) => s.replace(/\\/g, "\\\\").replace(/\|/g, "\\|").replace(/\r?\n/g, " ");
|
|
10651
10771
|
const criteriaTable = rubric.criteria.map((c) => `| \`${c.id}\` | ${c.weight.toFixed(3)} | ${c.scoring} | ${escapeCell(c.description)} |`).join("\n");
|
|
10652
|
-
const targetsBlock = runTaskIds.map((id, i) => `${i + 1}. \`${id}\``).join("\n");
|
|
10653
10772
|
const finalOutputBlock = buildFinalOutputBlock({
|
|
10654
|
-
taskType: "
|
|
10655
|
-
outputSchemaName: "
|
|
10773
|
+
taskType: "judge_eval_attempt",
|
|
10774
|
+
outputSchemaName: "JudgeEvalAttemptOutput",
|
|
10656
10775
|
shapeSketch: [
|
|
10657
10776
|
"{",
|
|
10658
|
-
|
|
10659
|
-
"
|
|
10660
|
-
"
|
|
10661
|
-
"
|
|
10662
|
-
"
|
|
10663
|
-
"
|
|
10664
|
-
" \"verdict\": \"<1-3 sentences>\"",
|
|
10665
|
-
" },",
|
|
10666
|
-
" ...one entry per runTaskIds[i], same order",
|
|
10667
|
-
" ],",
|
|
10668
|
-
" \"deltas\": { \"<labelA> - <labelB>\": <composite(A) - composite(B)> }, // optional",
|
|
10777
|
+
` "targetTaskId": "${input.targetTaskId}",`,
|
|
10778
|
+
` "targetAttemptN": ${input.targetAttemptN},`,
|
|
10779
|
+
" \"variantLabel\": \"<from producer input>\",",
|
|
10780
|
+
" \"scores\": [ { \"criterionId\": \"...\", \"score\": 0..1, \"rationale\": \"...\", \"assertions\": [...]? } ],",
|
|
10781
|
+
" \"composite\": <Σ(weight × score), 0..1>,",
|
|
10782
|
+
" \"verdict\": \"<1-3 sentences>\",",
|
|
10669
10783
|
" \"judgeModel\": \"<id>\", // optional",
|
|
10670
10784
|
" \"traceparent\": \"<from claim>\"",
|
|
10671
10785
|
"}"
|
|
10672
10786
|
].join("\n")
|
|
10673
10787
|
});
|
|
10788
|
+
const workspaceSection = ctx.workspace?.attached === true ? [
|
|
10789
|
+
"### Workspace",
|
|
10790
|
+
"",
|
|
10791
|
+
"Your current workspace is already attached to the producer attempt",
|
|
10792
|
+
"you are judging. Inspect files directly from the current workspace",
|
|
10793
|
+
"root instead of inventing synthetic `artifact_<taskId>` paths.",
|
|
10794
|
+
"If the accepted attempt output lists `artifacts[].path`, treat those",
|
|
10795
|
+
"paths as relative to the current workspace root unless the output",
|
|
10796
|
+
"explicitly says otherwise.",
|
|
10797
|
+
ctx.workspace.mode === "dedicated_worktree" ? `This attachment is a dedicated producer worktree${ctx.workspace.branch ? ` on branch \`${ctx.workspace.branch}\`` : ""}.` : ctx.workspace.mode === "scratch_mount" ? "This attachment is the producer scratch workspace mounted with shadow writes for safe inspection." : "This attachment is the producer shared workspace mounted with shadow writes for safe inspection.",
|
|
10798
|
+
""
|
|
10799
|
+
].join("\n") : "";
|
|
10674
10800
|
return [
|
|
10675
|
-
"# Judge Eval
|
|
10676
|
-
|
|
10677
|
-
"
|
|
10678
|
-
"grade yourself.",
|
|
10801
|
+
"# Judge Eval Attempt\n",
|
|
10802
|
+
"You are grading one accepted `run_eval` producer attempt against a hidden",
|
|
10803
|
+
"judge rubric. Do not delegate to subagents. Grade in this session only.",
|
|
10679
10804
|
"",
|
|
10680
10805
|
`Task id: \`${ctx.taskId}\``,
|
|
10681
10806
|
`Diary: \`${ctx.diaryId}\``,
|
|
10807
|
+
`Producer task: \`${input.targetTaskId}\``,
|
|
10808
|
+
`Producer attempt: \`${input.targetAttemptN}\``,
|
|
10682
10809
|
"",
|
|
10683
|
-
"###
|
|
10684
|
-
"",
|
|
10685
|
-
targetsBlock,
|
|
10810
|
+
"### Evidence gathering",
|
|
10686
10811
|
"",
|
|
10687
|
-
|
|
10688
|
-
|
|
10689
|
-
|
|
10812
|
+
`1. Call \`moltnet_get_task\` with taskId=\`${input.targetTaskId}\`.`,
|
|
10813
|
+
`2. Call \`moltnet_list_task_attempts\` with taskId=\`${input.targetTaskId}\` and inspect the accepted attempt matching \`${input.targetAttemptN}\`.`,
|
|
10814
|
+
`3. Call \`moltnet_list_task_messages\` with taskId=\`${input.targetTaskId}\`, attemptN=\`${input.targetAttemptN}\` to inspect the producer's turn-by-turn behavior.`,
|
|
10815
|
+
"4. Use the accepted attempt output, attempt messages, and any accessible",
|
|
10816
|
+
" artifacts or workspace evidence available in your environment.",
|
|
10817
|
+
" Read artifact files from the mounted producer workspace when present;",
|
|
10818
|
+
" do not assume detached `artifact_<taskId>` directories exist.",
|
|
10819
|
+
"5. Score strictly against the rubric below.",
|
|
10690
10820
|
"",
|
|
10821
|
+
workspaceSection,
|
|
10691
10822
|
"### Rubric",
|
|
10692
10823
|
"",
|
|
10693
10824
|
rubric.preamble ? `${rubric.preamble}\n` : "",
|
|
@@ -10695,34 +10826,10 @@ function buildJudgeEvalVariantUserPrompt(input, ctx) {
|
|
|
10695
10826
|
"| --- | --- | --- | --- |",
|
|
10696
10827
|
criteriaTable,
|
|
10697
10828
|
"",
|
|
10698
|
-
"### How to grade",
|
|
10699
|
-
"",
|
|
10700
|
-
"For EACH `runTaskIds[i]`:",
|
|
10701
|
-
"",
|
|
10702
|
-
"1. Call the `subagent` custom tool with:",
|
|
10703
|
-
" - `task`: a brief instructing the subagent to grade ONLY that variant",
|
|
10704
|
-
" against the rubric above; include the target task id and the rubric",
|
|
10705
|
-
" verbatim. The subagent has the same MoltNet tools and can fetch the",
|
|
10706
|
-
" accepted attempt output independently.",
|
|
10707
|
-
" - `output_schema`: `\"judge_eval_variant_result\"`",
|
|
10708
|
-
"2. Receive the subagent's structured `judge_eval_variant_result` payload.",
|
|
10709
|
-
"3. Append it to your `results[]` array, **in the same order as input.runTaskIds**.",
|
|
10710
|
-
"",
|
|
10711
|
-
"Do NOT score any variant in your own session. The whole point of the",
|
|
10712
|
-
"subagent fan-out is per-variant context isolation — grading two variants",
|
|
10713
|
-
"back-to-back in one session lets the second be biased by the first.",
|
|
10714
|
-
"",
|
|
10715
10829
|
"### Composite arithmetic",
|
|
10716
10830
|
"",
|
|
10717
|
-
"
|
|
10718
|
-
"criteria. Drift > 0.001 is rejected.
|
|
10719
|
-
"themselves; double-check before assembling the final output.",
|
|
10720
|
-
"",
|
|
10721
|
-
"### Deltas (optional)",
|
|
10722
|
-
"",
|
|
10723
|
-
"If useful, populate `deltas` with pairwise composite differences keyed by",
|
|
10724
|
-
"`\"<variantLabel-A> - <variantLabel-B>\"` (single space-hyphen-space). Both",
|
|
10725
|
-
"labels must appear in `results`. Omit `deltas` entirely if not used.",
|
|
10831
|
+
"Your `composite` MUST equal `Σ(criterion.weight × score)` over the rubric",
|
|
10832
|
+
"criteria. Drift > 0.001 is rejected.",
|
|
10726
10833
|
"",
|
|
10727
10834
|
finalOutputBlock
|
|
10728
10835
|
].filter((s) => s !== "").join("\n");
|
|
@@ -10732,13 +10839,8 @@ function buildJudgeEvalVariantUserPrompt(input, ctx) {
|
|
|
10732
10839
|
function buildJudgePackUserPrompt(input, ctx) {
|
|
10733
10840
|
const { renderedPackId, sourcePackId, successCriteria } = input;
|
|
10734
10841
|
const rubric = successCriteria.rubric;
|
|
10735
|
-
const criteriaList = rubric
|
|
10736
|
-
const preambleSection = rubric
|
|
10737
|
-
"### Rubric preamble",
|
|
10738
|
-
"",
|
|
10739
|
-
rubric.preamble,
|
|
10740
|
-
""
|
|
10741
|
-
].join("\n") : null;
|
|
10842
|
+
const criteriaList = renderRubricCriteriaList(rubric);
|
|
10843
|
+
const preambleSection = renderRubricPreambleSection(rubric);
|
|
10742
10844
|
return [
|
|
10743
10845
|
"# Judge Pack Agent",
|
|
10744
10846
|
"",
|
|
@@ -10854,6 +10956,112 @@ function buildJudgePackUserPrompt(input, ctx) {
|
|
|
10854
10956
|
].filter((l) => l !== null).join("\n");
|
|
10855
10957
|
}
|
|
10856
10958
|
//#endregion
|
|
10959
|
+
//#region ../agent-runtime/src/prompts/pr-review.ts
|
|
10960
|
+
function buildPrReviewUserPrompt(input, ctx) {
|
|
10961
|
+
const rubric = input.successCriteria.rubric;
|
|
10962
|
+
const criteriaList = renderRubricCriteriaList(rubric);
|
|
10963
|
+
const preambleSection = renderRubricPreambleSection(rubric);
|
|
10964
|
+
const taskPromptSection = input.taskPrompt ? [
|
|
10965
|
+
"## Task-specific instructions",
|
|
10966
|
+
"",
|
|
10967
|
+
input.taskPrompt,
|
|
10968
|
+
""
|
|
10969
|
+
].join("\n") : "";
|
|
10970
|
+
const resourceSection = input.subject.resourceUrls && input.subject.resourceUrls.length > 0 ? [
|
|
10971
|
+
"### Resources",
|
|
10972
|
+
"",
|
|
10973
|
+
...input.subject.resourceUrls.map((url) => `- ${url}`),
|
|
10974
|
+
""
|
|
10975
|
+
].join("\n") : "";
|
|
10976
|
+
const hintsSection = input.subject.inspectionHints && input.subject.inspectionHints.length > 0 ? [
|
|
10977
|
+
"### Inspection hints",
|
|
10978
|
+
"",
|
|
10979
|
+
...input.subject.inspectionHints.map((hint) => `- ${hint}`),
|
|
10980
|
+
""
|
|
10981
|
+
].join("\n") : "";
|
|
10982
|
+
const workspaceSection = ctx.workspace?.mode === "dedicated_worktree" ? [
|
|
10983
|
+
"### Workspace",
|
|
10984
|
+
"",
|
|
10985
|
+
"This review attempt is running inside a dedicated disposable git",
|
|
10986
|
+
"worktree. Inspect and reason inside this workspace only.",
|
|
10987
|
+
ctx.workspace.branch ? `The current review branch is \`${ctx.workspace.branch}\`.` : "The current checkout is disposable and will be cleaned up when the task ends.",
|
|
10988
|
+
""
|
|
10989
|
+
].join("\n") : "";
|
|
10990
|
+
return [
|
|
10991
|
+
"# Review Agent",
|
|
10992
|
+
"",
|
|
10993
|
+
"You are an independent judge. You did NOT produce the subject under review.",
|
|
10994
|
+
"Assess it strictly against the rubric below and emit a structured judgment.",
|
|
10995
|
+
"You may inspect the local workspace and the referenced resources, but do NOT modify anything.",
|
|
10996
|
+
"",
|
|
10997
|
+
`Your diary ID is: ${ctx.diaryId}`,
|
|
10998
|
+
`This task's id is: ${ctx.taskId}`,
|
|
10999
|
+
"",
|
|
11000
|
+
"## Subject",
|
|
11001
|
+
"",
|
|
11002
|
+
`**Title:** ${input.subject.title}`,
|
|
11003
|
+
"",
|
|
11004
|
+
input.subject.summary,
|
|
11005
|
+
"",
|
|
11006
|
+
resourceSection,
|
|
11007
|
+
hintsSection,
|
|
11008
|
+
workspaceSection,
|
|
11009
|
+
"### Execution contract",
|
|
11010
|
+
"",
|
|
11011
|
+
"Treat the provided subject, resources, inspection hints, and any",
|
|
11012
|
+
"task-specific instructions as the full",
|
|
11013
|
+
"review contract for this task.",
|
|
11014
|
+
"",
|
|
11015
|
+
"If the task-specific instructions or inspection hints require an outward action tied to the review",
|
|
11016
|
+
"(for example publishing the judgment somewhere), perform that action as",
|
|
11017
|
+
"part of the task before reporting structured output.",
|
|
11018
|
+
"",
|
|
11019
|
+
"## Review workflow",
|
|
11020
|
+
"",
|
|
11021
|
+
"1. Read the subject summary, resources, inspection hints, and any",
|
|
11022
|
+
" task-specific instructions before scoring.",
|
|
11023
|
+
"2. Inspect the target artefact directly using the tools and resources the",
|
|
11024
|
+
" task makes available.",
|
|
11025
|
+
"3. If you are in a dedicated disposable worktree and need the review target",
|
|
11026
|
+
" checked out locally, do that work inside this disposable workspace only.",
|
|
11027
|
+
"4. Apply the rubric strictly. This task is about complexity and",
|
|
11028
|
+
" reviewability, not correctness or feature desirability.",
|
|
11029
|
+
"5. Perform any required outward action before emitting the final",
|
|
11030
|
+
" structured output.",
|
|
11031
|
+
"",
|
|
11032
|
+
taskPromptSection,
|
|
11033
|
+
preambleSection,
|
|
11034
|
+
"## Criteria",
|
|
11035
|
+
"",
|
|
11036
|
+
criteriaList,
|
|
11037
|
+
"",
|
|
11038
|
+
"### Scoring rules",
|
|
11039
|
+
"",
|
|
11040
|
+
"- Every criterion uses binary scoring only.",
|
|
11041
|
+
"- Score `1` when the subject clearly clears the criterion.",
|
|
11042
|
+
"- Score `0` when it does not, or when the evidence is ambiguous.",
|
|
11043
|
+
"- `rationale` is REQUIRED for every score. Keep it concrete and audit-friendly.",
|
|
11044
|
+
"- Compute `composite = Σ(weight_i × score_i)` exactly; the runtime rejects mismatches.",
|
|
11045
|
+
"",
|
|
11046
|
+
"Write a signed diary entry (tags: `judgment`, `pr_review`) capturing the rationale before reporting structured output.",
|
|
11047
|
+
"",
|
|
11048
|
+
buildFinalOutputBlock({
|
|
11049
|
+
taskType: "pr_review",
|
|
11050
|
+
outputSchemaName: "PrReviewOutput",
|
|
11051
|
+
shapeSketch: [
|
|
11052
|
+
"{",
|
|
11053
|
+
" \"scores\": [",
|
|
11054
|
+
" { \"criterionId\": \"...\", \"score\": 0, \"rationale\": \"...\" }",
|
|
11055
|
+
" ],",
|
|
11056
|
+
" \"composite\": <sum-of-weighted-binary-scores>,",
|
|
11057
|
+
" \"verdict\": \"<1-3 sentence overall>\"",
|
|
11058
|
+
"}"
|
|
11059
|
+
].join("\n"),
|
|
11060
|
+
extraNotes: ["`scores` MUST stay in the same order as the rubric criteria.", "`score` MUST be exactly `0` or `1` for every criterion."]
|
|
11061
|
+
})
|
|
11062
|
+
].filter(Boolean).join("\n");
|
|
11063
|
+
}
|
|
11064
|
+
//#endregion
|
|
10857
11065
|
//#region ../agent-runtime/src/prompts/render-pack.ts
|
|
10858
11066
|
/**
|
|
10859
11067
|
* Build the first user-message prompt for a `render_pack` task. Almost mechanical:
|
|
@@ -10918,8 +11126,9 @@ function buildRenderPackUserPrompt(input, ctx) {
|
|
|
10918
11126
|
* Build the first user-message prompt for a `run_eval` task.
|
|
10919
11127
|
*
|
|
10920
11128
|
* Free-form: no git workflow, no commit ceremony. The executor produces
|
|
10921
|
-
* a textual response (and optional file artifacts) that
|
|
10922
|
-
* `
|
|
11129
|
+
* a textual response (and optional file artifacts) that later
|
|
11130
|
+
* `judge_eval_attempt` task(s) grade against their own hidden
|
|
11131
|
+
* rubric.
|
|
10923
11132
|
*
|
|
10924
11133
|
* Context delivery is handled by `resolveTaskContext` (see
|
|
10925
11134
|
* libs/agent-runtime/src/context-bindings.ts) and runs BEFORE this
|
|
@@ -10929,7 +11138,9 @@ function buildRenderPackUserPrompt(input, ctx) {
|
|
|
10929
11138
|
* builder does NOT inline `input.context[]` itself.
|
|
10930
11139
|
*/
|
|
10931
11140
|
function buildRunEvalUserPrompt(input, ctx) {
|
|
10932
|
-
const { scenario, variantLabel, successCriteria } = input;
|
|
11141
|
+
const { scenario, variantLabel, execution, successCriteria } = input;
|
|
11142
|
+
const hasContext = input.context.length > 0;
|
|
11143
|
+
const hasInlineContext = input.context.some((entry) => entry.binding === "context_inline");
|
|
10933
11144
|
const inputFilesSection = scenario.inputFiles?.length ? [
|
|
10934
11145
|
"### Input files",
|
|
10935
11146
|
"",
|
|
@@ -10942,9 +11153,30 @@ function buildRunEvalUserPrompt(input, ctx) {
|
|
|
10942
11153
|
"",
|
|
10943
11154
|
`This task carries correlationId \`${ctx.correlationId}\`. It joins`,
|
|
10944
11155
|
"this variant to its sibling `run_eval` tasks (other variants of the",
|
|
10945
|
-
"same scenario
|
|
10946
|
-
"
|
|
10947
|
-
"
|
|
11156
|
+
"same scenario and to any later `judge_eval_attempt` tasks created",
|
|
11157
|
+
"against those variants. You do not need to act on it directly — it",
|
|
11158
|
+
"is recorded for cross-variant aggregation at query time.",
|
|
11159
|
+
""
|
|
11160
|
+
].join("\n") : "";
|
|
11161
|
+
const executionSection = [
|
|
11162
|
+
"### Execution mode",
|
|
11163
|
+
"",
|
|
11164
|
+
`Mode: \`${execution.mode}\``,
|
|
11165
|
+
`Workspace: \`${execution.workspace}\``,
|
|
11166
|
+
execution.workspace === "none" ? "You are running in a scratch workspace with no repository checkout mounted. Do not assume git history or repo files are present unless the scenario provided them explicitly." : execution.workspace === "shared_mount" ? "You are running against the daemon shared mount. Treat any repository mutations as affecting the mounted checkout directly." : "You are running in a dedicated disposable git worktree isolated from the daemon shared checkout.",
|
|
11167
|
+
""
|
|
11168
|
+
].join("\n");
|
|
11169
|
+
const contextDisciplineSection = hasContext ? [
|
|
11170
|
+
"### Injected context discipline",
|
|
11171
|
+
"",
|
|
11172
|
+
"This task includes extra injected context from the task creator.",
|
|
11173
|
+
"You MUST inspect and use that context BEFORE you write solution",
|
|
11174
|
+
"files or draft your final answer.",
|
|
11175
|
+
"Do not solve first and only review the context afterward.",
|
|
11176
|
+
hasInlineContext ? "For `context_inline`, your FIRST content-inspection step should be a `read` of `/workspace/context-pack.md` before your first `write` call. The same content is also mirrored in `/workspace/AGENTS.md` and may be referenced from `/workspace/.claude/CLAUDE.md`." : "If injected context was provided as a skill, inspect that task-injected context before solving.",
|
|
11177
|
+
hasInlineContext ? "If `/workspace/context-pack.md` exists and you skip reading it before writing solution files, you are not following the task instructions." : "Do not rely on memory alone when task-injected context is available; inspect it first.",
|
|
11178
|
+
"If the injected context contains repo- or workflow-specific rules,",
|
|
11179
|
+
"those rules override your generic instincts.",
|
|
10948
11180
|
""
|
|
10949
11181
|
].join("\n") : "";
|
|
10950
11182
|
const finalOutputBlock = buildFinalOutputBlock({
|
|
@@ -10957,7 +11189,13 @@ function buildRunEvalUserPrompt(input, ctx) {
|
|
|
10957
11189
|
" \"totalTokens\": <int>,",
|
|
10958
11190
|
" \"durationMs\": <int>,",
|
|
10959
11191
|
" \"traceparent\": \"<from claim>\",",
|
|
10960
|
-
" \"verification\":
|
|
11192
|
+
" \"verification\": {",
|
|
11193
|
+
" \"inputCid\": \"<task inputCid>\",",
|
|
11194
|
+
" \"results\": [",
|
|
11195
|
+
" { \"id\": \"<criterion id>\", \"kind\": \"rubric\", \"status\": \"pass|fail|skip\", \"detail\": \"<optional one-liner>\" }",
|
|
11196
|
+
" ],",
|
|
11197
|
+
" \"passed\": <boolean>",
|
|
11198
|
+
" } // required iff input.successCriteria; must be an object, never a string",
|
|
10961
11199
|
"}"
|
|
10962
11200
|
].join("\n")
|
|
10963
11201
|
});
|
|
@@ -10965,6 +11203,8 @@ function buildRunEvalUserPrompt(input, ctx) {
|
|
|
10965
11203
|
"# Run Eval Agent\n",
|
|
10966
11204
|
`You are running an evaluation scenario as variant \`${variantLabel}\`.\nTask id: \`${ctx.taskId}\`\n`,
|
|
10967
11205
|
correlationSection,
|
|
11206
|
+
executionSection,
|
|
11207
|
+
contextDisciplineSection,
|
|
10968
11208
|
`### Scenario\n\n${scenario.prompt}\n`,
|
|
10969
11209
|
inputFilesSection,
|
|
10970
11210
|
verificationSection,
|
|
@@ -11036,14 +11276,25 @@ function buildTaskUserPrompt(task, ctx) {
|
|
|
11036
11276
|
diaryId: ctx.diaryId,
|
|
11037
11277
|
taskId: ctx.taskId
|
|
11038
11278
|
});
|
|
11039
|
-
case
|
|
11040
|
-
if (!Value.Check(
|
|
11041
|
-
const errors = [...Value.Errors(
|
|
11042
|
-
throw new Error(`
|
|
11279
|
+
case JUDGE_EVAL_ATTEMPT_TYPE:
|
|
11280
|
+
if (!Value.Check(JudgeEvalAttemptInput, task.input)) {
|
|
11281
|
+
const errors = [...Value.Errors(JudgeEvalAttemptInput, task.input)];
|
|
11282
|
+
throw new Error(`judge_eval_attempt input failed validation: ${JSON.stringify(errors.slice(0, 3))}`);
|
|
11043
11283
|
}
|
|
11044
|
-
return
|
|
11284
|
+
return buildJudgeEvalAttemptUserPrompt(task.input, {
|
|
11045
11285
|
diaryId: ctx.diaryId,
|
|
11046
|
-
taskId: ctx.taskId
|
|
11286
|
+
taskId: ctx.taskId,
|
|
11287
|
+
workspace: ctx.workspace
|
|
11288
|
+
});
|
|
11289
|
+
case PR_REVIEW_TYPE:
|
|
11290
|
+
if (!Value.Check(PrReviewInput, task.input)) {
|
|
11291
|
+
const errors = [...Value.Errors(PrReviewInput, task.input)];
|
|
11292
|
+
throw new Error(`pr_review input failed validation: ${JSON.stringify(errors.slice(0, 3))}`);
|
|
11293
|
+
}
|
|
11294
|
+
return buildPrReviewUserPrompt(task.input, {
|
|
11295
|
+
diaryId: ctx.diaryId,
|
|
11296
|
+
taskId: ctx.taskId,
|
|
11297
|
+
workspace: ctx.workspace
|
|
11047
11298
|
});
|
|
11048
11299
|
case RUN_EVAL_TYPE:
|
|
11049
11300
|
if (!Value.Check(RunEvalInput, task.input)) {
|
|
@@ -14562,6 +14813,11 @@ var require_multistream = /* @__PURE__ */ __commonJSMin(((exports, module) => {
|
|
|
14562
14813
|
* paths under this mount via `toGuestPath` in `tool-operations.ts`.
|
|
14563
14814
|
*/
|
|
14564
14815
|
var SKILL_ROOT_IN_VM = GUEST_TASK_SKILLS_MOUNT;
|
|
14816
|
+
var INLINE_CONTEXT_ROOT_IN_VM = "/workspace/.moltnet/context";
|
|
14817
|
+
var WORKSPACE_CONTEXT_PACK = "/workspace/context-pack.md";
|
|
14818
|
+
var WORKSPACE_AGENTS_MD = "/workspace/AGENTS.md";
|
|
14819
|
+
var WORKSPACE_CLAUDE_DIR = "/workspace/.claude";
|
|
14820
|
+
var WORKSPACE_CLAUDE_MD = "/workspace/.claude/CLAUDE.md";
|
|
14565
14821
|
/** Bounds borrowed from pi's skill validation; conservative caps so a
|
|
14566
14822
|
* malformed SKILL.md doesn't bloat the system prompt. */
|
|
14567
14823
|
var MAX_SKILL_NAME = 64;
|
|
@@ -14572,21 +14828,40 @@ var MAX_SKILL_DESCRIPTION = 1024;
|
|
|
14572
14828
|
*/
|
|
14573
14829
|
async function injectTaskContext(args) {
|
|
14574
14830
|
const skills = [];
|
|
14831
|
+
const inlineContexts = [];
|
|
14575
14832
|
const resolved = await resolveTaskContext({
|
|
14576
14833
|
context: args.context,
|
|
14577
|
-
deliver: {
|
|
14578
|
-
|
|
14579
|
-
|
|
14580
|
-
|
|
14581
|
-
|
|
14582
|
-
|
|
14583
|
-
|
|
14584
|
-
|
|
14585
|
-
|
|
14586
|
-
|
|
14587
|
-
|
|
14588
|
-
|
|
14834
|
+
deliver: {
|
|
14835
|
+
skill: async ({ slug, content }) => {
|
|
14836
|
+
const dir = `${SKILL_ROOT_IN_VM}/${slug}`;
|
|
14837
|
+
const filePath = `${dir}/SKILL.md`;
|
|
14838
|
+
await args.fs.mkdir(dir, { recursive: true });
|
|
14839
|
+
await args.fs.writeFile(filePath, content, { mode: 420 });
|
|
14840
|
+
skills.push(buildSyntheticSkill({
|
|
14841
|
+
slug,
|
|
14842
|
+
content,
|
|
14843
|
+
filePath,
|
|
14844
|
+
dir
|
|
14845
|
+
}));
|
|
14846
|
+
},
|
|
14847
|
+
contextFile: async ({ suggestedFileName, content }) => {
|
|
14848
|
+
await args.fs.mkdir(INLINE_CONTEXT_ROOT_IN_VM, { recursive: true });
|
|
14849
|
+
const filePath = `${INLINE_CONTEXT_ROOT_IN_VM}/${suggestedFileName}`;
|
|
14850
|
+
await args.fs.writeFile(filePath, content, { mode: 420 });
|
|
14851
|
+
inlineContexts.push({
|
|
14852
|
+
slug: suggestedFileName.replace(/\.md$/u, ""),
|
|
14853
|
+
content
|
|
14854
|
+
});
|
|
14855
|
+
}
|
|
14856
|
+
}
|
|
14589
14857
|
});
|
|
14858
|
+
if (inlineContexts.length > 0) {
|
|
14859
|
+
const packContent = buildWorkspaceContextPack(inlineContexts);
|
|
14860
|
+
await args.fs.writeFile(WORKSPACE_CONTEXT_PACK, packContent, { mode: 420 });
|
|
14861
|
+
await args.fs.writeFile(WORKSPACE_AGENTS_MD, packContent, { mode: 420 });
|
|
14862
|
+
await args.fs.mkdir(WORKSPACE_CLAUDE_DIR, { recursive: true });
|
|
14863
|
+
await args.fs.writeFile(WORKSPACE_CLAUDE_MD, "@../context-pack.md\n", { mode: 420 });
|
|
14864
|
+
}
|
|
14590
14865
|
return {
|
|
14591
14866
|
injected: resolved.injected,
|
|
14592
14867
|
skills,
|
|
@@ -14594,6 +14869,17 @@ async function injectTaskContext(args) {
|
|
|
14594
14869
|
userInlineSuffix: resolved.userInlineSuffix
|
|
14595
14870
|
};
|
|
14596
14871
|
}
|
|
14872
|
+
function buildWorkspaceContextPack(contexts) {
|
|
14873
|
+
return [
|
|
14874
|
+
"# Context Pack",
|
|
14875
|
+
"",
|
|
14876
|
+
...contexts.map(({ slug, content }) => [
|
|
14877
|
+
`## ${slug}`,
|
|
14878
|
+
"",
|
|
14879
|
+
content.trimEnd()
|
|
14880
|
+
].join("\n"))
|
|
14881
|
+
].join("\n\n").trimEnd() + "\n";
|
|
14882
|
+
}
|
|
14597
14883
|
/**
|
|
14598
14884
|
* Build a `Skill` object pi will faithfully render in
|
|
14599
14885
|
* `<available_skills>`. We extract `name` and `description` from the
|
|
@@ -14957,7 +15243,7 @@ async function parseStructuredTaskOutput(assistantText, taskType, opts = {}) {
|
|
|
14957
15243
|
}
|
|
14958
15244
|
};
|
|
14959
15245
|
}
|
|
14960
|
-
const errors = validateTaskOutput(taskType, extracted);
|
|
15246
|
+
const errors = validateTaskOutput(taskType, extracted, opts.input);
|
|
14961
15247
|
if (errors.length > 0) {
|
|
14962
15248
|
const details = errors.slice(0, 3).map((error) => `${error.field}: ${error.message}`);
|
|
14963
15249
|
const [firstError] = errors;
|
|
@@ -15071,7 +15357,7 @@ function createSubmitOutputTool(taskType, opts = {}) {
|
|
|
15071
15357
|
description: contract.description,
|
|
15072
15358
|
parameters: schema,
|
|
15073
15359
|
async execute(_id, params) {
|
|
15074
|
-
const errors = validateTaskOutput(taskType, params);
|
|
15360
|
+
const errors = validateTaskOutput(taskType, params, opts.input);
|
|
15075
15361
|
if (errors.length > 0) {
|
|
15076
15362
|
const detailMsg = errors.slice(0, 3).map((err) => `${err.field}: ${err.message}`).join("; ");
|
|
15077
15363
|
const details = {
|
|
@@ -15140,6 +15426,39 @@ function resolveSubmitTools(taskType, opts = {}) {
|
|
|
15140
15426
|
//#region src/runtime/task-workspace.ts
|
|
15141
15427
|
function prepareTaskWorkspace(task, requestedMountPath, executionPlan) {
|
|
15142
15428
|
const branch = executionPlan?.worktreeBranch ?? null;
|
|
15429
|
+
const workspaceMode = executionPlan?.workspaceMode ?? "shared_mount";
|
|
15430
|
+
const attachedWorkspace = executionPlan?.workspaceAttachment ?? null;
|
|
15431
|
+
if (attachedWorkspace) return {
|
|
15432
|
+
mountPath: attachedWorkspace.mountPath,
|
|
15433
|
+
cwdPath: attachedWorkspace.cwdPath,
|
|
15434
|
+
mode: workspaceMode,
|
|
15435
|
+
branch,
|
|
15436
|
+
cleanup: () => {}
|
|
15437
|
+
};
|
|
15438
|
+
if (workspaceMode === "scratch_mount") {
|
|
15439
|
+
const scratchDir = resolveTaskScratchPath(findMainWorktree(), executionPlan?.workspaceId ?? `task-${task.id}`);
|
|
15440
|
+
const keepWorkspace = executionPlan?.workspaceScope === "session" && executionPlan.sessionKey !== null;
|
|
15441
|
+
if (keepWorkspace) mkdirSync(scratchDir, { recursive: true });
|
|
15442
|
+
else {
|
|
15443
|
+
rmSync(scratchDir, {
|
|
15444
|
+
recursive: true,
|
|
15445
|
+
force: true
|
|
15446
|
+
});
|
|
15447
|
+
mkdirSync(scratchDir, { recursive: true });
|
|
15448
|
+
}
|
|
15449
|
+
return {
|
|
15450
|
+
mountPath: scratchDir,
|
|
15451
|
+
cwdPath: scratchDir,
|
|
15452
|
+
mode: "scratch_mount",
|
|
15453
|
+
branch: null,
|
|
15454
|
+
cleanup: keepWorkspace ? () => {} : () => {
|
|
15455
|
+
rmSync(scratchDir, {
|
|
15456
|
+
recursive: true,
|
|
15457
|
+
force: true
|
|
15458
|
+
});
|
|
15459
|
+
}
|
|
15460
|
+
};
|
|
15461
|
+
}
|
|
15143
15462
|
if (!branch) return {
|
|
15144
15463
|
mountPath: requestedMountPath,
|
|
15145
15464
|
cwdPath: requestedMountPath,
|
|
@@ -15177,6 +15496,9 @@ function prepareTaskWorkspace(task, requestedMountPath, executionPlan) {
|
|
|
15177
15496
|
function resolveTaskWorktreePath(mainRepo, workspaceId) {
|
|
15178
15497
|
return join(mainRepo, ".worktrees", workspaceId);
|
|
15179
15498
|
}
|
|
15499
|
+
function resolveTaskScratchPath(mainRepo, workspaceId) {
|
|
15500
|
+
return join(mainRepo, ".moltnet", "d", "task-workspaces", workspaceId);
|
|
15501
|
+
}
|
|
15180
15502
|
function ensureReusableTaskWorktree(mainRepo, worktreeDir, branch) {
|
|
15181
15503
|
if (isRegisteredWorktree(mainRepo, worktreeDir)) return;
|
|
15182
15504
|
if (existsSync(worktreeDir)) throw new Error(`Expected reusable worktree ${worktreeDir} to be git-managed, but it exists outside git worktree metadata.`);
|
|
@@ -15413,12 +15735,14 @@ async function executePiTask(claimedTask, reporter, opts) {
|
|
|
15413
15735
|
return makeFailedOutput("worktree_setup_failed", message);
|
|
15414
15736
|
}
|
|
15415
15737
|
try {
|
|
15738
|
+
const sandboxConfig = applyExecutionPlanSandboxOverrides(opts.sandboxConfig, executionPlan);
|
|
15416
15739
|
managed = await resumeVm({
|
|
15417
15740
|
checkpointPath,
|
|
15418
15741
|
agentName: opts.agentName,
|
|
15419
15742
|
mountPath,
|
|
15743
|
+
workspaceMode: workspace.mode,
|
|
15420
15744
|
extraAllowedHosts: opts.extraAllowedHosts,
|
|
15421
|
-
sandboxConfig
|
|
15745
|
+
sandboxConfig
|
|
15422
15746
|
});
|
|
15423
15747
|
} catch (err) {
|
|
15424
15748
|
const message = err instanceof Error ? err.message : String(err);
|
|
@@ -15447,7 +15771,8 @@ async function executePiTask(claimedTask, reporter, opts) {
|
|
|
15447
15771
|
taskId: task.id,
|
|
15448
15772
|
workspace: {
|
|
15449
15773
|
mode: activeWorkspace.mode,
|
|
15450
|
-
branch: activeWorkspace.branch
|
|
15774
|
+
branch: activeWorkspace.branch,
|
|
15775
|
+
attached: executionPlan?.workspaceAttachment !== void 0
|
|
15451
15776
|
},
|
|
15452
15777
|
extras: opts.promptExtras
|
|
15453
15778
|
});
|
|
@@ -15489,7 +15814,10 @@ async function executePiTask(claimedTask, reporter, opts) {
|
|
|
15489
15814
|
createEditToolDefinition(mountPath, { operations: createGondolinEditOps(managed.vm, mountPath) }),
|
|
15490
15815
|
createBashToolDefinition(mountPath, { operations: createGondolinBashOps(managed.vm, mountPath) })
|
|
15491
15816
|
];
|
|
15492
|
-
const { handle: submitToolHandle, tools: submitToolDefs } = resolveSubmitTools(task.taskType, {
|
|
15817
|
+
const { handle: submitToolHandle, tools: submitToolDefs } = resolveSubmitTools(task.taskType, {
|
|
15818
|
+
model: opts.model,
|
|
15819
|
+
input: task.input
|
|
15820
|
+
});
|
|
15493
15821
|
const submitTools = submitToolDefs;
|
|
15494
15822
|
try {
|
|
15495
15823
|
const moltnetAgent = await connect({ configDir: managed.agentDir });
|
|
@@ -15708,8 +16036,20 @@ async function executePiTask(claimedTask, reporter, opts) {
|
|
|
15708
16036
|
phase: "output_validation"
|
|
15709
16037
|
});
|
|
15710
16038
|
}
|
|
15711
|
-
else {
|
|
15712
|
-
|
|
16039
|
+
else if (submitToolHandle) {
|
|
16040
|
+
parseError = {
|
|
16041
|
+
code: "output_missing",
|
|
16042
|
+
message: "Agent did not submit output through the task submit tool. A valid submit tool call is required to complete this task type."
|
|
16043
|
+
};
|
|
16044
|
+
await emit("error", {
|
|
16045
|
+
message: parseError.message,
|
|
16046
|
+
phase: "output_validation"
|
|
16047
|
+
});
|
|
16048
|
+
} else {
|
|
16049
|
+
const parsed = await parseStructuredTaskOutput(assistantText, task.taskType, {
|
|
16050
|
+
model: opts.model,
|
|
16051
|
+
input: task.input
|
|
16052
|
+
});
|
|
15713
16053
|
parsedOutput = parsed.output;
|
|
15714
16054
|
parsedOutputCid = parsed.outputCid;
|
|
15715
16055
|
parseError = parsed.error;
|
|
@@ -15795,6 +16135,18 @@ async function executePiTask(claimedTask, reporter, opts) {
|
|
|
15795
16135
|
}
|
|
15796
16136
|
}
|
|
15797
16137
|
}
|
|
16138
|
+
function applyExecutionPlanSandboxOverrides(sandboxConfig, executionPlan) {
|
|
16139
|
+
const shadowWrites = executionPlan?.workspaceAttachment?.shadowWrites;
|
|
16140
|
+
if (!shadowWrites) return sandboxConfig;
|
|
16141
|
+
return {
|
|
16142
|
+
...sandboxConfig,
|
|
16143
|
+
vfs: {
|
|
16144
|
+
...sandboxConfig?.vfs,
|
|
16145
|
+
shadow: ["**"],
|
|
16146
|
+
shadowMode: shadowWrites
|
|
16147
|
+
}
|
|
16148
|
+
};
|
|
16149
|
+
}
|
|
15798
16150
|
function emptyUsage(provider, model) {
|
|
15799
16151
|
return {
|
|
15800
16152
|
inputTokens: 0,
|
|
@@ -16012,6 +16364,7 @@ function moltnetExtension(pi) {
|
|
|
16012
16364
|
checkpointPath,
|
|
16013
16365
|
agentName,
|
|
16014
16366
|
mountPath,
|
|
16367
|
+
workspaceMode: "shared_mount",
|
|
16015
16368
|
sandboxConfig
|
|
16016
16369
|
});
|
|
16017
16370
|
activateAgentEnv(managed.credentials.agentEnv, mainRepo);
|