@themoltnet/pi-extension 0.15.0 → 0.15.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.d.ts +11 -0
- package/dist/index.js +635 -82
- package/package.json +3 -3
package/dist/index.d.ts
CHANGED
|
@@ -377,6 +377,17 @@ export declare interface SandboxConfig {
|
|
|
377
377
|
/** Overlay disk size (default '3G'). */
|
|
378
378
|
overlaySize?: string;
|
|
379
379
|
};
|
|
380
|
+
/** Shell commands to run every VM resume, after platform setup
|
|
381
|
+
* (TLS, DNS, git safe.directory, tmpfs node_modules) and before
|
|
382
|
+
* the agent session starts. Use for per-session bootstrap that
|
|
383
|
+
* doesn't belong baked into the snapshot.
|
|
384
|
+
*
|
|
385
|
+
* Not included in the snapshot cache key — changes here apply on
|
|
386
|
+
* every resume without triggering a snapshot rebuild. Each command
|
|
387
|
+
* runs in a fresh shell with `set -eu` and `set -o pipefail`; a
|
|
388
|
+
* non-zero exit (including from any segment of a pipeline) aborts
|
|
389
|
+
* resume with the failing command's stderr/stdout tail. */
|
|
390
|
+
resumeCommands?: string[];
|
|
380
391
|
/** VFS shadow settings — hide host paths from the guest. */
|
|
381
392
|
vfs?: {
|
|
382
393
|
/** Paths (relative to workspace root) to shadow from the host mount. */
|
package/dist/index.js
CHANGED
|
@@ -2515,11 +2515,12 @@ function createCryptoNamespace(context, signingRequests) {
|
|
|
2515
2515
|
function createDiariesNamespace(context) {
|
|
2516
2516
|
const { client, auth } = context;
|
|
2517
2517
|
return {
|
|
2518
|
-
async list(query) {
|
|
2518
|
+
async list(query, headers) {
|
|
2519
2519
|
return unwrapResult(await listDiaries({
|
|
2520
2520
|
client,
|
|
2521
2521
|
auth,
|
|
2522
|
-
query
|
|
2522
|
+
query,
|
|
2523
|
+
headers
|
|
2523
2524
|
}));
|
|
2524
2525
|
},
|
|
2525
2526
|
async create(body, headers) {
|
|
@@ -8177,6 +8178,27 @@ var BASE_ALLOWED_HOSTS = [
|
|
|
8177
8178
|
"*.googlesource.com"
|
|
8178
8179
|
];
|
|
8179
8180
|
/**
 * Run a shell command in the guest and throw if it fails. Mirror of
 * `run()` in `snapshot.ts` for the resume-side hook chain — every
 * setup step is essential to a healthy session, so a silent non-zero
 * exit (e.g. a mount that fails into the FUSE write path, or a
 * consumer-provided resume command that fails to install pnpm) must
 * surface immediately rather than fall through to cryptic agent
 * errors later.
 *
 * The command is executed as `sh -c` with `set -eu` and
 * `set -o pipefail` prepended, each on its own line.
 *
 * @param vm      VM handle; only `vm.exec(argv)` is used, and its
 *                result is read for `exitCode`, `stderr`, `stdout`.
 * @param label   Human-readable step name quoted in the error message.
 * @param command Shell source to run after the strict-mode prelude.
 * @returns Resolves to `undefined` when the command exits 0.
 * @throws {Error} On a non-zero exit; the message carries the exit
 *                 code and the tail of stderr followed by the tail of
 *                 stdout.
 */
async function vmRun(vm, label, command) {
  const wrapped = `set -eu\nset -o pipefail\n${command}`;
  const r = await vm.exec([
    "sh",
    "-c",
    wrapped
  ]);
  if (r.exitCode !== 0) {
    // Tail each stream on its own. Joining first and slicing afterwards
    // let a chatty stdout push stderr (usually the real cause) out of
    // the 800-character window entirely.
    const TAIL_CHARS = 800;
    const tailOf = (stream) => stream ? String(stream).slice(-TAIL_CHARS) : "";
    const tail = [tailOf(r.stderr), tailOf(r.stdout)].filter(Boolean).join("\n");
    throw new Error(`resume step "${label}" failed (exit ${r.exitCode}):\n${tail}`);
  }
}
|
|
8201
|
+
/**
|
|
8180
8202
|
* Resume a VM from a checkpoint, inject credentials, configure egress +
|
|
8181
8203
|
* TLS. Returns the managed VM handle.
|
|
8182
8204
|
*/
|
|
@@ -8236,8 +8258,9 @@ async function resumeVm(config) {
|
|
|
8236
8258
|
update-ca-certificates 2>/dev/null
|
|
8237
8259
|
cat /etc/gondolin/mitm/ca.crt >> /etc/ssl/certs/ca-certificates.crt
|
|
8238
8260
|
'`);
|
|
8239
|
-
await vm
|
|
8240
|
-
|
|
8261
|
+
await vmRun(vm, "DNS resolvers", `printf 'nameserver 8.8.8.8\\nnameserver 1.1.1.1\\n' > /etc/resolv.conf`);
|
|
8262
|
+
await vmRun(vm, "git safe.directory", `git config --system --add safe.directory '*'`);
|
|
8263
|
+
for (const [i, cmd] of (config.sandboxConfig?.resumeCommands ?? []).entries()) await vmRun(vm, `resumeCommands[${i}]`, cmd);
|
|
8241
8264
|
const vmSshDir = `${vmAgentDir}/ssh`;
|
|
8242
8265
|
await vm.exec(`mkdir -p ${vmAgentDir}/ssh /home/agent/.pi/agent`);
|
|
8243
8266
|
if (creds.piAuthJson !== null) await vm.fs.writeFile("/home/agent/.pi/agent/auth.json", creds.piAuthJson, { mode: 384 });
|
|
@@ -8613,61 +8636,6 @@ async function buildAgentSession(args) {
|
|
|
8613
8636
|
})).session;
|
|
8614
8637
|
}
|
|
8615
8638
|
//#endregion
|
|
8616
|
-
//#region ../agent-runtime/src/context-bindings.ts
|
|
8617
|
-
var PROMPT_SEPARATOR = "\n\n---\n\n";
|
|
8618
|
-
/**
|
|
8619
|
-
* Resolve `task.input.context[]` into delivered side-effects (skills
|
|
8620
|
-
* persisted via `deliver.skill`) and prompt fragments
|
|
8621
|
-
* (`systemPromptPrefix`, `userInlineSuffix`) the caller weaves into the
|
|
8622
|
-
* built prompt.
|
|
8623
|
-
*
|
|
8624
|
-
* Per-binding semantics (V1):
|
|
8625
|
-
* - `skill` → `deliver.skill({ slug, content })` once per ref.
|
|
8626
|
-
* Slug collisions on distinct contents are
|
|
8627
|
-
* refused loudly.
|
|
8628
|
-
* - `prompt_prefix` → content appended to `systemPromptPrefix` with
|
|
8629
|
-
* the canonical `\n\n---\n\n` separator (in
|
|
8630
|
-
* declared order).
|
|
8631
|
-
* - `user_inline` → content appended to `userInlineSuffix` in
|
|
8632
|
-
* declared order, same separator.
|
|
8633
|
-
*
|
|
8634
|
-
* No fetching, no hashing — bytes are inlined in `ContextRef.content`,
|
|
8635
|
-
* and the task's `inputCid` already pins the entire input. The imposer
|
|
8636
|
-
* chose these bytes; the resolver just dispatches them.
|
|
8637
|
-
*
|
|
8638
|
-
* The function is pure with respect to its arguments: file writes are
|
|
8639
|
-
* confined to the injected `deliver` callback, which makes the
|
|
8640
|
-
* resolver trivial to test.
|
|
8641
|
-
*/
|
|
8642
|
-
async function resolveTaskContext(args) {
|
|
8643
|
-
const promptParts = [];
|
|
8644
|
-
const userParts = [];
|
|
8645
|
-
const injected = [];
|
|
8646
|
-
const usedSlugs = /* @__PURE__ */ new Map();
|
|
8647
|
-
for (const ref of args.context) {
|
|
8648
|
-
if (ref.binding === "skill") {
|
|
8649
|
-
const prior = usedSlugs.get(ref.slug);
|
|
8650
|
-
if (prior !== void 0) {
|
|
8651
|
-
if (prior !== ref.content) throw new Error(`slug collision on '${ref.slug}': two skill entries share the same slug but have different content`);
|
|
8652
|
-
injected.push(ref);
|
|
8653
|
-
continue;
|
|
8654
|
-
}
|
|
8655
|
-
usedSlugs.set(ref.slug, ref.content);
|
|
8656
|
-
await args.deliver.skill({
|
|
8657
|
-
slug: ref.slug,
|
|
8658
|
-
content: ref.content
|
|
8659
|
-
});
|
|
8660
|
-
} else if (ref.binding === "prompt_prefix") promptParts.push(ref.content);
|
|
8661
|
-
else userParts.push(ref.content);
|
|
8662
|
-
injected.push(ref);
|
|
8663
|
-
}
|
|
8664
|
-
return {
|
|
8665
|
-
injected,
|
|
8666
|
-
systemPromptPrefix: promptParts.join(PROMPT_SEPARATOR),
|
|
8667
|
-
userInlineSuffix: userParts.join(PROMPT_SEPARATOR)
|
|
8668
|
-
};
|
|
8669
|
-
}
|
|
8670
|
-
//#endregion
|
|
8671
8639
|
//#region ../tasks/src/formats.ts
|
|
8672
8640
|
/**
|
|
8673
8641
|
* Register TypeBox string formats used across Task / TaskOutput / task-type
|
|
@@ -8884,7 +8852,7 @@ unchanged" is.
|
|
|
8884
8852
|
* (server-side schema check). Self-assessment is a truthful self-rating,
|
|
8885
8853
|
* NOT enforcement — `verification.passed=false` does not block /complete
|
|
8886
8854
|
* and does not affect `acceptedAttemptN`. See
|
|
8887
|
-
* `docs/agent-runtime.md` for the full producer/judge flow.
|
|
8855
|
+
* `docs/understand/agent-runtime.md` for the full producer/judge flow.
|
|
8888
8856
|
*
|
|
8889
8857
|
* **Binding evaluation** (judgment tasks: `assess_brief`, `judge_pack`).
|
|
8890
8858
|
* A separate task whose output IS the application of `successCriteria` to
|
|
@@ -9041,6 +9009,39 @@ var AssessBriefOutput = Type$1.Object({
|
|
|
9041
9009
|
$id: "AssessBriefOutput",
|
|
9042
9010
|
additionalProperties: false
|
|
9043
9011
|
});
|
|
9012
|
+
/**
 * Async preflight (#1096):
 *   - `targetTaskId` resolves to a real task the caller can see.
 *   - The target is a `fulfill_brief` (you cannot grade an arbitrary
 *     task type as if it were a brief fulfillment).
 *   - The target is `completed` with an accepted attempt — grading
 *     an in-flight or failed task would either race or grade nothing.
 *
 * Agent-distinctness ("assessor ≠ producer") is a runtime / auth-
 * layer concern and intentionally NOT checked here. It belongs in
 * an auth-aware claim-time check.
 *
 * @param input Task input; only `targetTaskId` is read.
 * @param ctx   Lookup context; only `ctx.resolveTask(id)` is awaited.
 * @returns Array of `{ field, message }` errors; empty when the target
 *          passes every check.
 */
async function validateAssessBriefInputAsync(input, ctx) {
  const { targetTaskId } = input;
  const target = await ctx.resolveTask(targetTaskId);
  // An unresolvable target makes the remaining checks meaningless.
  if (!target) return [{
    field: "targetTaskId",
    message: `targetTaskId ${targetTaskId} does not resolve to a task you can read`
  }];
  const errors = [];
  if (target.taskType !== "fulfill_brief") errors.push({
    field: "targetTaskId",
    message: `targetTaskId ${targetTaskId} is a ${target.taskType}, not a fulfill_brief`
  });
  // `== null` on purpose: a resolver that omits the field (undefined)
  // must not be mistaken for "has an accepted attempt".
  const hasAcceptedAttempt = target.acceptedAttemptN != null;
  if (target.status !== "completed" || !hasAcceptedAttempt) errors.push({
    field: "targetTaskId",
    message: `targetTaskId ${targetTaskId} is not completed with an accepted attempt (status=${target.status}, acceptedAttemptN=${target.acceptedAttemptN})`
  });
  return errors;
}
|
|
9044
9045
|
//#endregion
|
|
9045
9046
|
//#region ../tasks/src/task-types/curate-pack.ts
|
|
9046
9047
|
/**
|
|
@@ -9239,6 +9240,311 @@ function validateJudgePackOutput(output) {
|
|
|
9239
9240
|
}
|
|
9240
9241
|
return null;
|
|
9241
9242
|
}
|
|
9243
|
+
/**
 * Async preflight (#1096) for `judge_pack` inputs.
 *
 * Looks up the rendered pack and the source pack concurrently, then
 * reports:
 *   - a `renderedPackId` that does not resolve,
 *   - a `sourcePackId` that does not resolve,
 *   - when both resolve, a rendered pack whose `sourcePackId` differs
 *     from the resolved source pack's `id`. Without this check a judge
 *     can be tricked into grading rendering A as if it came from
 *     source B.
 *
 * @param input Task input; `renderedPackId` and `sourcePackId` are read.
 * @param ctx   Lookup context providing `resolveRenderedPack` and
 *              `resolveContextPack`.
 * @returns Array of `{ field, message }` errors; empty when valid.
 */
async function validateJudgePackInputAsync(input, ctx) {
  const { renderedPackId, sourcePackId } = input;
  const lookups = [ctx.resolveRenderedPack(renderedPackId), ctx.resolveContextPack(sourcePackId)];
  const [rendered, source] = await Promise.all(lookups);
  const errors = [];
  if (!rendered) {
    errors.push({
      field: "renderedPackId",
      message: `renderedPackId ${renderedPackId} does not resolve to a rendered pack you can read`
    });
  }
  if (!source) {
    errors.push({
      field: "sourcePackId",
      message: `sourcePackId ${sourcePackId} does not resolve to a context pack you can read`
    });
  }
  // The provenance check only makes sense once both rows are in hand.
  if (!rendered || !source) return errors;
  if (rendered.sourcePackId !== source.id) {
    errors.push({
      field: "sourcePackId",
      message: `renderedPack ${renderedPackId} was produced from source ${rendered.sourcePackId}, not from sourcePackId=${sourcePackId}`
    });
  }
  return errors;
}
|
|
9270
|
+
//#endregion
|
|
9271
|
+
//#region ../tasks/src/task-types/judge-eval-variant.ts
|
|
9272
|
+
/**
|
|
9273
|
+
* `judge_eval_variant` — score N variants of a `run_eval` scenario
|
|
9274
|
+
* against a single rubric, in one pass, with per-variant subagent
|
|
9275
|
+
* isolation.
|
|
9276
|
+
*
|
|
9277
|
+
* output_kind: judgment
|
|
9278
|
+
* criteria: required (`successCriteria.rubric` — same envelope shape as
|
|
9279
|
+
* `judge_pack` / `assess_brief`)
|
|
9280
|
+
* references: not required at the input layer — `runTaskIds` already
|
|
9281
|
+
* pin the targets being graded.
|
|
9282
|
+
*
|
|
9283
|
+
* Slice 2 of #943. The parent task carries the rubric and the list of
|
|
9284
|
+
* variant `run_eval` task ids. The pi executor registers the generic
|
|
9285
|
+
* `subagent` custom tool (#1087), and the parent LLM calls
|
|
9286
|
+
* `subagent({ task, output_schema: 'judge_eval_variant_result' })` once
|
|
9287
|
+
* per variant — each child session has fresh context, fetches the
|
|
9288
|
+
* variant's accepted attempt output via `moltnet_get_task` /
|
|
9289
|
+
* `moltnet_list_task_attempts`, and grades against the rubric.
|
|
9290
|
+
*
|
|
9291
|
+
* Reuses `JudgePackScore` from `judge_pack` for per-criterion scoring
|
|
9292
|
+
* (Lane 1 binary via `llm_checklist`, Lane 2 graded via `llm_score`,
|
|
9293
|
+
* deterministic_*) — the score shape is the same across judgment
|
|
9294
|
+
* tasks; only the wrapping (per-variant grouping + deltas) differs.
|
|
9295
|
+
*
|
|
9296
|
+
* Cross-task input invariants — "all targets share the same
|
|
9297
|
+
* correlation_id, all are `run_eval`, all are completed with an
|
|
9298
|
+
* accepted attempt, all share byte-identical `input.successCriteria`"
|
|
9299
|
+
* — REQUIRE async DB lookups and live in `validateInputAsync` below,
|
|
9300
|
+
* which the task service runs at create time (#1096 wiring). The
|
|
9301
|
+
* TypeBox layer here only enforces shape: UUID format,
|
|
9302
|
+
* minItems/maxItems, rubric presence + weight invariant.
|
|
9303
|
+
*/
|
|
9304
|
+
/** Task-type name under which `judge_eval_variant` is registered. */
var JUDGE_EVAL_VARIANT_TYPE = "judge_eval_variant";
/**
 * Input shape: between 2 and 10 UUID-formatted `runTaskIds` plus the
 * `successCriteria` envelope. Unknown properties are rejected.
 */
var JudgeEvalVariantInput = Type$1.Object({
  runTaskIds: Type$1.Array(Type$1.String({ format: "uuid" }), { minItems: 2, maxItems: 10 }),
  successCriteria: SuccessCriteria
}, { $id: "JudgeEvalVariantInput", additionalProperties: false });
/**
 * Grading for a single variant. `scores[]` reuses `JudgePackScore`
 * rather than re-declaring the per-criterion shape.
 *
 * This is also the **subagent output contract** — the parent's
 * `subagent` tool resolves the contract name `judge_eval_variant_result`
 * to this schema. See `agent-runtime`'s subagent contract registry.
 *
 * `variantLabel` may not contain the sequence " - " (space, hyphen,
 * space), as enforced by the negative-lookahead pattern.
 */
var JudgeEvalVariantResult = Type$1.Object({
  runTaskId: Type$1.String({ format: "uuid" }),
  variantLabel: Type$1.String({ minLength: 1, maxLength: 64, pattern: "^(?!.* - ).*$" }),
  scores: Type$1.Array(JudgePackScore, { minItems: 1 }),
  composite: Type$1.Number({ minimum: 0, maximum: 1 }),
  verdict: Type$1.String({ minLength: 1 })
}, { $id: "JudgeEvalVariantResult", additionalProperties: false });
/**
 * Task output: at least two per-variant results, an optional map of
 * string keys to deltas in [-1, 1], an optional `judgeModel`, and a
 * required non-empty `traceparent`.
 */
var JudgeEvalVariantOutput = Type$1.Object({
  results: Type$1.Array(JudgeEvalVariantResult, { minItems: 2 }),
  deltas: Type$1.Optional(Type$1.Record(Type$1.String(), Type$1.Number({ minimum: -1, maximum: 1 }))),
  judgeModel: Type$1.Optional(Type$1.String({ minLength: 1 })),
  traceparent: Type$1.String({ minLength: 1 })
}, { $id: "JudgeEvalVariantOutput", additionalProperties: false });
|
|
9353
|
+
/**
 * Synchronous input check that goes beyond the TypeBox shape.
 *
 * Requires `successCriteria` and `successCriteria.rubric` to be
 * present, then delegates to `validateRubricWeights` for the rubric's
 * own weight invariant.
 *
 * Cross-task invariants (all targets are `run_eval`, all completed,
 * shared `correlation_id`, byte-identical `input.successCriteria`)
 * are NOT checked here — they need async lookups against `runTaskIds`
 * and live in `validateJudgeEvalVariantInputAsync`, invoked by the
 * task service at create time (#1096).
 *
 * @param input Task input; only `successCriteria` is read.
 * @returns An error string for a missing field, otherwise whatever
 *          `validateRubricWeights` returns for the rubric.
 */
function validateJudgeEvalVariantInput(input) {
  const { successCriteria } = input;
  if (!successCriteria) {
    return "successCriteria is required for judge_eval_variant";
  }
  const { rubric } = successCriteria;
  if (!rubric) {
    return "successCriteria.rubric is required for judge_eval_variant";
  }
  return validateRubricWeights(rubric);
}
|
|
9371
|
+
/**
 * Output cross-field invariants the schema cannot express:
 *
 *   1. `results.length === input.runTaskIds.length` — every variant
 *      the imposer asked for must be graded. Partial grading
 *      invalidates cross-variant comparison; fail the whole task
 *      rather than silently report a subset.
 *
 *   2. `results[i].runTaskId === input.runTaskIds[i]` — order is
 *      load-bearing for downstream consumers (e.g. deltas keyed by
 *      adjacent pairs). Mismatch is an LLM bug; reject loudly.
 *
 *   3. Each `result.scores` follows the same `llm_checklist` rule
 *      `judge_pack` enforces (#999): if a score has an `assertions`
 *      array, the numeric score MUST be `1` iff every assertion
 *      passes. Inconsistent payloads pollute attestations.
 *
 *   4. When the input carries a rubric, each result must score every
 *      rubric criterion exactly once (no unknown, duplicated or
 *      missing `criterionId`), and `result.composite` MUST equal the
 *      rubric-weighted sum `Σ(weight_j × scores[j].score)` within a
 *      tolerance of 0.001.
 *
 *   5. Optional `deltas` keys MUST be of the form `"A - B"` where
 *      both `A` and `B` are variantLabels present in `results`.
 *      Delta values are not range-checked in this function.
 *
 * Checks 1, 2 and 4 are skipped when `input` (or its rubric) is absent.
 *
 * @param output Candidate task output (`results`, optional `deltas`).
 * @param input  Task input (`runTaskIds`, `successCriteria.rubric`);
 *               may be omitted.
 * @returns `null` when every invariant holds, otherwise a message
 *          describing the first violation found.
 */
function validateJudgeEvalVariantOutput(output, input) {
  const out = output;
  const inp = input;
  if (inp) {
    if (out.results.length !== inp.runTaskIds.length) return `results.length (${out.results.length}) does not match input.runTaskIds.length (${inp.runTaskIds.length}). Every variant must be graded; partial grading is rejected.`;
    for (let i = 0; i < out.results.length; i++) if (out.results[i].runTaskId !== inp.runTaskIds[i]) return `results[${i}].runTaskId (${out.results[i].runTaskId}) does not match input.runTaskIds[${i}] (${inp.runTaskIds[i]}). Order must align with input for downstream delta computation.`;
  }
  // Invariant 3: checklist-style scores are derived, never free-form.
  for (let r = 0; r < out.results.length; r++) {
    const result = out.results[r];
    for (let s = 0; s < result.scores.length; s++) {
      const sc = result.scores[s];
      if (!sc.assertions) continue;
      const allPassed = sc.assertions.every((a) => a.passed);
      const expected = allPassed ? 1 : 0;
      if (sc.score !== expected) return `results[${r}].scores[${s}] (criterionId="${sc.criterionId}"): assertions ${allPassed ? "all pass" : "have at least one fail"} but score=${sc.score}. Score must be derived: 1 iff every assertion passes, else 0 (#999 llm_checklist rule).`;
    }
  }
  // Invariant 4: rubric coverage and weighted composite.
  if (inp?.successCriteria?.rubric) {
    const criteria = inp.successCriteria.rubric.criteria;
    const weightById = new Map(criteria.map((c) => [c.id, c.weight]));
    for (let r = 0; r < out.results.length; r++) {
      const result = out.results[r];
      const scoredIds = new Set();
      let sum = 0;
      for (const sc of result.scores) {
        const w = weightById.get(sc.criterionId);
        if (w === void 0) return `results[${r}].scores: criterionId "${sc.criterionId}" is not in the input rubric (known: ${Array.from(weightById.keys()).join(", ")}). Score every rubric criterion exactly once; do not invent new ids.`;
        // A repeated id would be double-counted in the weighted sum and
        // could mask an unscored criterion.
        if (scoredIds.has(sc.criterionId)) return `results[${r}].scores: criterionId "${sc.criterionId}" is scored more than once. Score every rubric criterion exactly once.`;
        scoredIds.add(sc.criterionId);
        sum += w * sc.score;
      }
      const unscored = Array.from(weightById.keys()).filter((id) => !scoredIds.has(id));
      if (unscored.length > 0) return `results[${r}].scores: missing score(s) for rubric criterion id(s): ${unscored.join(", ")}. Score every rubric criterion exactly once.`;
      if (Math.abs(sum - result.composite) > .001) return `results[${r}].composite (${result.composite}) does not match Σ(weight × score) (${sum.toFixed(6)}). Composite must be the rubric-weighted sum of per-criterion scores (drift > 0.001).`;
    }
  }
  // Invariant 5: delta keys must name two graded variants.
  if (out.deltas) {
    const labels = new Set(out.results.map((r) => r.variantLabel));
    const deltaKeyPattern = /^(.+?) - (.+)$/;
    for (const key of Object.keys(out.deltas)) {
      const m = deltaKeyPattern.exec(key);
      if (!m) return `deltas key "${key}" is not of the form "<variantLabel-A> - <variantLabel-B>". Use a single space-hyphen-space separator between labels.`;
      const [, a, b] = m;
      const unknown = [a, b].filter((label) => !labels.has(label));
      if (unknown.length > 0) return `deltas key "${key}" references variantLabel(s) not present in results: ${unknown.map((label) => `"${label}" missing`).join(", ")}`;
    }
  }
  return null;
}
|
|
9440
|
+
/**
 * Local stable-stringify for cross-variant `successCriteria` byte-
 * equality. Recursively sorts object keys; arrays preserve order
 * (intentional — rubric criteria order is semantically meaningful).
 *
 * Values that JSON cannot represent (`undefined`, functions, symbols)
 * follow `JSON.stringify` conventions: they are dropped from objects
 * and rendered as `null` inside arrays (including sparse-array holes),
 * so two values that serialize to the same JSON always compare equal.
 *
 * @param value Any JSON-like value.
 * @returns The canonical string, or `undefined` when `value` itself is
 *          not JSON-representable (same as `JSON.stringify`).
 */
function stableStringify(value) {
  if (value === null || typeof value !== "object") return JSON.stringify(value);
  const isUnrepresentable = (v) => v === void 0 || typeof v === "function" || typeof v === "symbol";
  // Array.from (unlike map) visits holes, so sparse arrays get explicit nulls.
  if (Array.isArray(value)) return "[" + Array.from(value, (item) => isUnrepresentable(item) ? "null" : stableStringify(item)).join(",") + "]";
  const obj = value;
  return "{" + Object.keys(obj).filter((k) => !isUnrepresentable(obj[k])).sort().map((k) => JSON.stringify(k) + ":" + stableStringify(obj[k])).join(",") + "}";
}
|
|
9453
|
+
/**
 * Async preflight for `judge_eval_variant` (#1096 + #943):
 *
 *   1. Every `runTaskIds[i]` resolves to a task the caller can read.
 *   2. Every resolved task is `taskType === 'run_eval'`.
 *   3. Every resolved task is `status === 'completed'` with a
 *      non-nullish `acceptedAttemptN` — grading an unaccepted attempt
 *      races with re-attempts and pollutes the judge attestation.
 *   4. Every resolved task has a `correlationId`, and all of them are
 *      equal. Without this an imposer could fabricate a "variant set"
 *      by stapling unrelated runs together.
 *   5. The shared `correlationId` is NOT already sealed. A previous
 *      judge_eval_variant against the same group is final; produce a
 *      fresh correlation_id for a new judging round rather than
 *      adding contradictory verdicts to a sealed group.
 *   6. Every variant's `input.successCriteria` is byte-identical (via
 *      stable-stringify). Different rubrics across "variants" makes
 *      the comparison meaningless.
 *
 * Evaluation is staged: if any target is missing, only checks 1-3 are
 * reported; if anything in checks 1-4 failed, checks 5-6 are skipped.
 *
 * @param input Task input; only `runTaskIds` is read.
 * @param ctx   Lookup context providing `resolveTask(id)` and
 *              `findCorrelationSeal(correlationId)`.
 * @returns Array of `{ field, message }` errors; empty when valid.
 */
async function validateJudgeEvalVariantInputAsync(input, ctx) {
  const { runTaskIds } = input;
  const errors = [];
  const resolved = await Promise.all(runTaskIds.map((id) => ctx.resolveTask(id)));
  let missingTargets = false;
  const presentTargets = [];
  for (let i = 0; i < runTaskIds.length; i++) {
    const t = resolved[i];
    if (!t) {
      missingTargets = true;
      errors.push({
        field: `runTaskIds[${i}]`,
        message: `runTaskIds[${i}]=${runTaskIds[i]} does not resolve to a task you can read`
      });
      continue;
    }
    presentTargets.push(t);
    if (t.taskType !== "run_eval") errors.push({
      field: `runTaskIds[${i}]`,
      message: `runTaskIds[${i}]=${runTaskIds[i]} is a ${t.taskType}, not a run_eval`
    });
    // `== null` so a resolver that omits the field is not read as "accepted".
    if (t.status !== "completed" || t.acceptedAttemptN == null) errors.push({
      field: `runTaskIds[${i}]`,
      message: `runTaskIds[${i}]=${runTaskIds[i]} is not completed with an accepted attempt (status=${t.status}, acceptedAttemptN=${t.acceptedAttemptN})`
    });
  }
  if (missingTargets || presentTargets.length === 0) return errors;
  // Track "no correlation id" separately from the set of real ids, so an
  // internal placeholder never shows up in a caller-facing message and a
  // single real id plus a null is not misreported as "multiple ids".
  const hasUncorrelatedTarget = presentTargets.some((t) => t.correlationId == null);
  const correlationIds = new Set(presentTargets.filter((t) => t.correlationId != null).map((t) => t.correlationId));
  if (hasUncorrelatedTarget) errors.push({
    field: "runTaskIds",
    message: "one or more run_eval targets have no correlation_id; cannot group as variants"
  });
  if (correlationIds.size > 1) errors.push({
    field: "runTaskIds",
    message: `run_eval targets span multiple correlation_ids (${Array.from(correlationIds).join(", ")}); variants must share one`
  });
  if (errors.length > 0) return errors;
  const correlationId = presentTargets[0].correlationId;
  if (!correlationId) return errors;
  const seal = await ctx.findCorrelationSeal(correlationId);
  if (seal) errors.push({
    field: "runTaskIds",
    message: `correlation_id ${correlationId} is already sealed by ${seal.sealedByTaskType}/${seal.sealedByTaskId} at ${seal.sealedAt}; use a fresh correlation_id for a new judging round`
  });
  // No target is missing here, so presentTargets[i] lines up with runTaskIds[i].
  const first = stableStringify(presentTargets[0].input.successCriteria);
  for (let i = 1; i < presentTargets.length; i++) if (stableStringify(presentTargets[i].input.successCriteria) !== first) {
    errors.push({
      field: `runTaskIds[${i}]`,
      message: `runTaskIds[${i}] has a different input.successCriteria than runTaskIds[0]; all variants must share the rubric and gates`
    });
    // Report only the first divergent variant.
    break;
  }
  return errors;
}
|
|
9526
|
+
/**
 * Create-time hook for `judge_eval_variant`: returns the side effects
 * the task service should apply alongside the insert.
 *
 * Resolves the FIRST run task and, when it carries a `correlationId`,
 * emits a single `sealCorrelation` effect for that id. The seal
 * targets the correlation id shared by the graded runs — NOT the
 * judge task's own correlationId (which is typically null or
 * distinct).
 *
 * Per the upstream description, the task service applies the seal in
 * the same transaction as the insert, so a concurrent second
 * `judge_eval_variant` against the same group loses the race and is
 * rejected with a conflict error; that wiring is outside this
 * function.
 *
 * @param input Task input; only `runTaskIds[0]` is used.
 * @param ctx   Lookup context providing `resolveTask(id)`.
 * @returns `[]` when the first target is unresolved or has no
 *          correlation id, otherwise one `sealCorrelation` effect.
 */
async function onCreateJudgeEvalVariant(input, ctx) {
  const anchorTask = await ctx.resolveTask(input.runTaskIds[0]);
  const correlationId = anchorTask?.correlationId;
  if (!correlationId) {
    return [];
  }
  return [{ kind: "sealCorrelation", correlationId }];
}
|
|
9242
9548
|
//#endregion
|
|
9243
9549
|
//#region ../tasks/src/task-types/render-pack.ts
|
|
9244
9550
|
/**
|
|
@@ -9278,6 +9584,18 @@ var RenderPackOutput = Type$1.Object({
|
|
|
9278
9584
|
$id: "RenderPackOutput",
|
|
9279
9585
|
additionalProperties: false
|
|
9280
9586
|
});
|
|
9587
|
+
/**
 * Async preflight (#1096) for `render_pack`: the requested `packId`
 * must resolve through `ctx.resolveContextPack`.
 *
 * @param input Task input; only `packId` is read.
 * @param ctx   Lookup context providing `resolveContextPack(id)`.
 * @returns `[]` when the pack resolves, otherwise a single
 *          `{ field: "packId", message }` error.
 */
async function validateRenderPackInputAsync(input, ctx) {
  const { packId } = input;
  const pack = await ctx.resolveContextPack(packId);
  if (pack) {
    return [];
  }
  return [{
    field: "packId",
    message: `packId ${packId} does not resolve to a context pack you can read`
  }];
}
|
|
9281
9599
|
//#endregion
|
|
9282
9600
|
//#region ../tasks/src/task-types/run-eval.ts
|
|
9283
9601
|
/**
|
|
@@ -9385,7 +9703,8 @@ var BUILT_IN_TASK_TYPES = {
|
|
|
9385
9703
|
outputSchema: AssessBriefOutput,
|
|
9386
9704
|
outputKind: "judgment",
|
|
9387
9705
|
requiresReferences: true,
|
|
9388
|
-
validateInput: validateJudgmentInput
|
|
9706
|
+
validateInput: validateJudgmentInput,
|
|
9707
|
+
validateInputAsync: validateAssessBriefInputAsync
|
|
9389
9708
|
},
|
|
9390
9709
|
[CURATE_PACK_TYPE]: {
|
|
9391
9710
|
name: CURATE_PACK_TYPE,
|
|
@@ -9401,7 +9720,8 @@ var BUILT_IN_TASK_TYPES = {
|
|
|
9401
9720
|
outputSchema: RenderPackOutput,
|
|
9402
9721
|
outputKind: "artifact",
|
|
9403
9722
|
requiresReferences: false,
|
|
9404
|
-
validateOutput: requireVerificationWhenCriteriaPresent
|
|
9723
|
+
validateOutput: requireVerificationWhenCriteriaPresent,
|
|
9724
|
+
validateInputAsync: validateRenderPackInputAsync
|
|
9405
9725
|
},
|
|
9406
9726
|
[JUDGE_PACK_TYPE]: {
|
|
9407
9727
|
name: JUDGE_PACK_TYPE,
|
|
@@ -9410,7 +9730,8 @@ var BUILT_IN_TASK_TYPES = {
|
|
|
9410
9730
|
outputKind: "judgment",
|
|
9411
9731
|
requiresReferences: true,
|
|
9412
9732
|
validateInput: validateJudgmentInput,
|
|
9413
|
-
validateOutput: validateJudgePackOutput
|
|
9733
|
+
validateOutput: validateJudgePackOutput,
|
|
9734
|
+
validateInputAsync: validateJudgePackInputAsync
|
|
9414
9735
|
},
|
|
9415
9736
|
[RUN_EVAL_TYPE]: {
|
|
9416
9737
|
name: RUN_EVAL_TYPE,
|
|
@@ -9419,6 +9740,18 @@ var BUILT_IN_TASK_TYPES = {
|
|
|
9419
9740
|
outputKind: "artifact",
|
|
9420
9741
|
requiresReferences: false,
|
|
9421
9742
|
validateOutput: validateRunEvalOutput
|
|
9743
|
+
},
|
|
9744
|
+
[JUDGE_EVAL_VARIANT_TYPE]: {
|
|
9745
|
+
name: JUDGE_EVAL_VARIANT_TYPE,
|
|
9746
|
+
inputSchema: JudgeEvalVariantInput,
|
|
9747
|
+
outputSchema: JudgeEvalVariantOutput,
|
|
9748
|
+
outputKind: "judgment",
|
|
9749
|
+
requiresReferences: false,
|
|
9750
|
+
validateInput: validateJudgeEvalVariantInput,
|
|
9751
|
+
validateOutput: validateJudgeEvalVariantOutput,
|
|
9752
|
+
validateInputAsync: validateJudgeEvalVariantInputAsync,
|
|
9753
|
+
onCreate: onCreateJudgeEvalVariant,
|
|
9754
|
+
usesSubagents: true
|
|
9422
9755
|
}
|
|
9423
9756
|
};
|
|
9424
9757
|
//#endregion
|
|
@@ -9718,6 +10051,133 @@ Type$1.Object({
|
|
|
9718
10051
|
additionalProperties: false
|
|
9719
10052
|
});
|
|
9720
10053
|
//#endregion
|
|
10054
|
+
//#region ../agent-runtime/src/subagent-output-contracts.ts
|
|
10055
|
+
/** Process-wide store of subagent output contracts, keyed by name. */
var REGISTRY = /* @__PURE__ */ new Map();
/**
 * Register a subagent output contract under `contract.name`.
 *
 * The name must be non-blank and lower_snake_case. If a DIFFERENT
 * contract object is already stored under that name and its
 * `parametersSchema` is not the same reference, registration throws —
 * contracts are meant to be stable. In every other case (first
 * registration, the same object again, or a new object sharing the
 * same schema reference) the contract is stored, which keeps
 * re-registration harmless for HMR and tests.
 *
 * Typically called at module-init time alongside task-type
 * registration. See task-types/index.ts in @moltnet/tasks for the
 * conventional pattern.
 *
 * @param contract Object with at least `name` and `parametersSchema`.
 * @throws {Error} On a missing/blank name, a name that is not
 *                 lower_snake_case, or a conflicting schema.
 */
function registerSubagentOutputContract(contract) {
  const { name } = contract;
  if (!name || name.trim().length === 0) {
    throw new Error("subagent output contract name is required");
  }
  if (!/^[a-z][a-z0-9_]*$/.test(name)) {
    throw new Error(`subagent output contract name '${name}' must be lower_snake_case (starts with a letter, then [a-z0-9_]+)`);
  }
  const existing = REGISTRY.get(name);
  const replacesOtherContract = Boolean(existing) && existing !== contract;
  if (replacesOtherContract && existing.parametersSchema !== contract.parametersSchema) {
    throw new Error(`subagent output contract '${name}' is already registered with a different schema; refusing to override`);
  }
  REGISTRY.set(name, contract);
}
/**
 * Look up a subagent output contract by name.
 *
 * @param name Contract name.
 * @returns The stored contract, or `null` for unknown names — callers
 *          (the subagent custom tool) decide whether that is a
 *          recoverable tool error or a hard failure.
 */
function getSubagentOutputContract(name) {
  const contract = REGISTRY.get(name);
  return contract ?? null;
}
/**
 * Snapshot of all registered contracts, in registration order. Useful
 * for diagnostics and for the subagent tool's parameter description,
 * so a parent LLM can see what contracts are available without
 * enumerating them in its prompt.
 *
 * @returns A fresh array; mutating it does not affect the registry.
 */
function listSubagentOutputContracts() {
  return Array.from(REGISTRY.values());
}
|
|
10091
|
+
//#endregion
|
|
10092
|
+
//#region ../agent-runtime/src/built-in-contract-registrations.ts
|
|
10093
|
+
/**
 * Registers the built-in subagent output contracts (#1087, #943).
 *
 * This is a callable function rather than a bare module-init side
 * effect for these reasons:
 *
 *   - The registry is process-global and ESM modules are cached by
 *     URL, so init-time registration fires exactly once per Node
 *     process. Tests that call
 *     `__resetSubagentOutputContractsForTests()` to empty the registry
 *     would have no way to restore the built-ins without
 *     re-evaluating the module, which the cache prevents
 *     (PR #1101 review M4).
 *   - With an explicit function, the package index can invoke it once
 *     at module load and test setup hooks can invoke it again after a
 *     reset.
 *   - Calling it more than once in the same process is safe: every
 *     call passes the same `parametersSchema` reference, which
 *     `registerSubagentOutputContract` accepts for an already
 *     registered name.
 *
 * To add a built-in, extend the body of this function. Do not call
 * `registerSubagentOutputContract` from anywhere else in the package;
 * keeping every built-in here makes the set auditable.
 */
function registerBuiltInSubagentContracts() {
	// Per-variant grading payload returned by judge_eval_variant subagents.
	const judgeEvalVariantResultContract = {
		name: "judge_eval_variant_result",
		description: "Per-variant grading result produced by a subagent of judge_eval_variant: scores against the shared rubric, composite, and a 1-3 sentence verdict for a single variant.",
		parametersSchema: JudgeEvalVariantResult
	};
	registerSubagentOutputContract(judgeEvalVariantResultContract);
}
// Populate the registry once when this module is evaluated.
registerBuiltInSubagentContracts();
|
|
10125
|
+
//#endregion
|
|
10126
|
+
//#region ../agent-runtime/src/context-bindings.ts
|
|
10127
|
+
/** Canonical separator placed between joined prompt fragments. */
var PROMPT_SEPARATOR = "\n\n---\n\n";
/**
 * Resolve `task.input.context[]` into delivered side-effects (skills
 * persisted via `deliver.skill`) and prompt fragments
 * (`systemPromptPrefix`, `userInlineSuffix`) the caller weaves into the
 * built prompt.
 *
 * Per-binding handling, in declared order:
 *   - `skill`         → `deliver.skill({ slug, content })` is awaited
 *                       once per slug. A repeated slug with identical
 *                       content is not delivered again; a repeated
 *                       slug with different content throws.
 *   - `prompt_prefix` → content is collected for `systemPromptPrefix`.
 *   - anything else   → content is collected for `userInlineSuffix`
 *                       (this is where `user_inline` lands).
 *
 * Collected fragments are joined with `\n\n---\n\n`. Every processed
 * ref, including repeated identical skills, is listed in `injected`.
 *
 * No fetching or hashing happens here: bytes come straight from
 * `ref.content`, and the only side effect is the injected
 * `deliver.skill` callback.
 *
 * @param args `{ context, deliver }` — `context` is the list of refs
 *   (a missing list is treated as empty); `deliver.skill` is an async
 *   callback receiving `{ slug, content }`.
 * @returns Promise of `{ injected, systemPromptPrefix, userInlineSuffix }`.
 * @throws {Error} On a slug collision between skills with different
 *   content; errors from `deliver.skill` propagate unchanged.
 */
async function resolveTaskContext(args) {
	const promptParts = [];
	const userParts = [];
	const injected = [];
	const usedSlugs = /* @__PURE__ */ new Map();
	// A task without a context list resolves to empty fragments instead of
	// failing on iteration.
	for (const ref of args.context ?? []) {
		if (ref.binding === "skill") {
			const prior = usedSlugs.get(ref.slug);
			if (prior !== void 0 && prior !== ref.content) throw new Error(`slug collision on '${ref.slug}': two skill entries share the same slug but have different content`);
			if (prior === void 0) {
				// Record the slug before delivering so later refs are compared
				// against it even while delivery is in flight.
				usedSlugs.set(ref.slug, ref.content);
				await args.deliver.skill({
					slug: ref.slug,
					content: ref.content
				});
			}
		} else if (ref.binding === "prompt_prefix") promptParts.push(ref.content);
		else userParts.push(ref.content);
		injected.push(ref);
	}
	return {
		injected,
		systemPromptPrefix: promptParts.join(PROMPT_SEPARATOR),
		userInlineSuffix: userParts.join(PROMPT_SEPARATOR)
	};
}
|
|
10180
|
+
//#endregion
|
|
9721
10181
|
//#region ../agent-runtime/src/output-tools.ts
|
|
9722
10182
|
/**
|
|
9723
10183
|
* Submit-output tool contract.
|
|
@@ -10190,6 +10650,109 @@ function buildFulfillBriefUserPrompt(input, ctx) {
|
|
|
10190
10650
|
].filter(Boolean).join("\n");
|
|
10191
10651
|
}
|
|
10192
10652
|
//#endregion
|
|
10653
|
+
//#region ../agent-runtime/src/prompts/judge-eval-variant.ts
|
|
10654
|
+
/**
 * Build the first user-message prompt for a `judge_eval_variant` task
 * (#943 Slice 2).
 *
 * The parent agent's job is **fan-out-and-collect**: for each
 * `runTaskIds[i]`, spawn an isolated subagent via the `subagent` custom
 * tool (#1087), have it grade that variant against the shared rubric,
 * and collect each subagent's structured `judge_eval_variant_result`
 * payload. The parent does NOT grade itself; it composes the per-
 * variant results into the final `judge_eval_variant` output (results
 * array + optional deltas + verdicts).
 *
 * Isolation is the point: each variant gets a fresh subagent session
 * with no carryover context from sibling variants, so per-variant
 * grading is independent. Cost is bounded by `maxItems: 10` on
 * runTaskIds.
 *
 * @param input Task input; this function reads `runTaskIds` and
 *   `successCriteria.rubric` (its `criteria[]` with `id`, `weight`,
 *   `scoring`, `description`, and an optional `preamble`).
 * @param ctx Prompt context; `taskId` and `diaryId` are interpolated
 *   into the prompt.
 * @returns The prompt text, with sections joined by single newlines.
 * @throws {Error} When `successCriteria.rubric` is absent.
 */
function buildJudgeEvalVariantUserPrompt(input, ctx) {
	const { runTaskIds, successCriteria } = input;
	const rubric = successCriteria.rubric;
	if (!rubric) throw new Error("judge_eval_variant requires successCriteria.rubric — none present");
	// Make free text safe inside a Markdown table cell: escape backslashes
	// and pipes, and flatten line breaks to spaces.
	const escapeCell = (s) => s.replace(/\\/g, "\\\\").replace(/\|/g, "\\|").replace(/\r?\n/g, " ");
	// One table row per criterion; the weight is printed with three decimals
	// and only the description goes through escapeCell.
	const criteriaTable = rubric.criteria.map((c) => `| \`${c.id}\` | ${c.weight.toFixed(3)} | ${c.scoring} | ${escapeCell(c.description)} |`).join("\n");
	// 1-based numbered list of the target task ids.
	const targetsBlock = runTaskIds.map((id, i) => `${i + 1}. \`${id}\``).join("\n");
	// Final-output instructions, produced by the helper from the task type,
	// schema name, and a sketch of the expected JSON shape.
	const finalOutputBlock = buildFinalOutputBlock({
		taskType: "judge_eval_variant",
		outputSchemaName: "JudgeEvalVariantOutput",
		shapeSketch: [
			"{",
			" \"results\": [",
			" {",
			" \"runTaskId\": \"<runTaskIds[i]>\",",
			" \"variantLabel\": \"<from variant input>\",",
			" \"scores\": [ { \"criterionId\": \"...\", \"score\": 0..1, \"rationale\": \"...\", \"assertions\": [...]? } ],",
			" \"composite\": <Σ(weight × score), 0..1>,",
			" \"verdict\": \"<1-3 sentences>\"",
			" },",
			" ...one entry per runTaskIds[i], same order",
			" ],",
			" \"deltas\": { \"<labelA> - <labelB>\": <composite(A) - composite(B)> }, // optional",
			" \"judgeModel\": \"<id>\", // optional",
			" \"traceparent\": \"<from claim>\"",
			"}"
		].join("\n")
	});
	// NOTE: the filter at the end removes every entry that is exactly "",
	// so the "" items below do not become blank lines in the output (they
	// only group the source visually), and the preamble slot disappears
	// when no preamble is set. Entries that carry their own trailing "\n"
	// (the title and the preamble) are the ones followed by a blank line.
	return [
		"# Judge Eval Variants\n",
		`You are grading ${runTaskIds.length} variants of a single run_eval scenario`,
		"against ONE shared rubric. Your job is fan-out-and-collect — you do not",
		"grade yourself.",
		"",
		`Task id: \`${ctx.taskId}\``,
		`Diary: \`${ctx.diaryId}\``,
		"",
		"### Targets (variants to grade)",
		"",
		targetsBlock,
		"",
		"Each target is a completed `run_eval` task in the same correlation group.",
		"Read its accepted attempt via `moltnet_get_task` / `moltnet_list_task_attempts`",
		"to see the producer's output before grading.",
		"",
		"### Rubric",
		"",
		rubric.preamble ? `${rubric.preamble}\n` : "",
		"| Criterion | Weight | Scoring | Description |",
		"| --- | --- | --- | --- |",
		criteriaTable,
		"",
		"### How to grade",
		"",
		"For EACH `runTaskIds[i]`:",
		"",
		"1. Call the `subagent` custom tool with:",
		" - `task`: a brief instructing the subagent to grade ONLY that variant",
		" against the rubric above; include the target task id and the rubric",
		" verbatim. The subagent has the same MoltNet tools and can fetch the",
		" accepted attempt output independently.",
		" - `output_schema`: `\"judge_eval_variant_result\"`",
		"2. Receive the subagent's structured `judge_eval_variant_result` payload.",
		"3. Append it to your `results[]` array, **in the same order as input.runTaskIds**.",
		"",
		"Do NOT score any variant in your own session. The whole point of the",
		"subagent fan-out is per-variant context isolation — grading two variants",
		"back-to-back in one session lets the second be biased by the first.",
		"",
		"### Composite arithmetic",
		"",
		"Each `composite` MUST equal `Σ(criterion.weight × score)` over the rubric",
		"criteria. Drift > 0.001 is rejected. Subagents are instructed to compute it",
		"themselves; double-check before assembling the final output.",
		"",
		"### Deltas (optional)",
		"",
		"If useful, populate `deltas` with pairwise composite differences keyed by",
		"`\"<variantLabel-A> - <variantLabel-B>\"` (single space-hyphen-space). Both",
		"labels must appear in `results`. Omit `deltas` entirely if not used.",
		"",
		finalOutputBlock
	].filter((s) => s !== "").join("\n");
}
|
|
10755
|
+
//#endregion
|
|
10193
10756
|
//#region ../agent-runtime/src/prompts/judge-pack.ts
|
|
10194
10757
|
function buildJudgePackUserPrompt(input, ctx) {
|
|
10195
10758
|
const { renderedPackId, sourcePackId, successCriteria } = input;
|
|
@@ -10496,6 +11059,15 @@ function buildTaskUserPrompt(task, ctx) {
|
|
|
10496
11059
|
diaryId: ctx.diaryId,
|
|
10497
11060
|
taskId: ctx.taskId
|
|
10498
11061
|
});
|
|
11062
|
+
case JUDGE_EVAL_VARIANT_TYPE:
|
|
11063
|
+
if (!Value.Check(JudgeEvalVariantInput, task.input)) {
|
|
11064
|
+
const errors = [...Value.Errors(JudgeEvalVariantInput, task.input)];
|
|
11065
|
+
throw new Error(`judge_eval_variant input failed validation: ${JSON.stringify(errors.slice(0, 3))}`);
|
|
11066
|
+
}
|
|
11067
|
+
return buildJudgeEvalVariantUserPrompt(task.input, {
|
|
11068
|
+
diaryId: ctx.diaryId,
|
|
11069
|
+
taskId: ctx.taskId
|
|
11070
|
+
});
|
|
10499
11071
|
case RUN_EVAL_TYPE:
|
|
10500
11072
|
if (!Value.Check(RunEvalInput, task.input)) {
|
|
10501
11073
|
const errors = [...Value.Errors(RunEvalInput, task.input)];
|
|
@@ -13977,25 +14549,6 @@ var require_multistream = /* @__PURE__ */ __commonJSMin(((exports, module) => {
|
|
|
13977
14549
|
module.exports.pino = pino;
|
|
13978
14550
|
})))();
|
|
13979
14551
|
//#endregion
|
|
13980
|
-
//#region ../agent-runtime/src/subagent-output-contracts.ts
|
|
13981
|
-
var REGISTRY = /* @__PURE__ */ new Map();
|
|
13982
|
-
/**
|
|
13983
|
-
* Resolve a subagent output contract by name. Returns `null` for
|
|
13984
|
-
* unknown names — callers (the subagent custom tool) decide whether
|
|
13985
|
-
* that's a tool error the parent LLM can recover from or a hard fail.
|
|
13986
|
-
*/
|
|
13987
|
-
function getSubagentOutputContract(name) {
|
|
13988
|
-
return REGISTRY.get(name) ?? null;
|
|
13989
|
-
}
|
|
13990
|
-
/**
|
|
13991
|
-
* List all registered contracts. Useful for diagnostics and for the
|
|
13992
|
-
* subagent tool's parameter description so a parent LLM can see what
|
|
13993
|
-
* contracts are available without enumerating them in its prompt.
|
|
13994
|
-
*/
|
|
13995
|
-
function listSubagentOutputContracts() {
|
|
13996
|
-
return [...REGISTRY.values()];
|
|
13997
|
-
}
|
|
13998
|
-
//#endregion
|
|
13999
14552
|
//#region src/runtime/inject-task-context.ts
|
|
14000
14553
|
/**
|
|
14001
14554
|
* Slice 1.5 of #943 — wire the agent-runtime resolver into the
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@themoltnet/pi-extension",
|
|
3
|
-
"version": "0.15.
|
|
3
|
+
"version": "0.15.1",
|
|
4
4
|
"type": "module",
|
|
5
5
|
"description": "MoltNet pi extension — sandboxed tool execution in Gondolin VMs with MoltNet identity and persistent memory",
|
|
6
6
|
"license": "MIT",
|
|
@@ -31,8 +31,8 @@
|
|
|
31
31
|
"@earendil-works/gondolin": "^0.9.1",
|
|
32
32
|
"@opentelemetry/api": "^1.9.0",
|
|
33
33
|
"@sinclair/typebox": "^0.34.0",
|
|
34
|
-
"@themoltnet/agent-runtime": "0.
|
|
35
|
-
"@themoltnet/sdk": "0.
|
|
34
|
+
"@themoltnet/agent-runtime": "0.14.0",
|
|
35
|
+
"@themoltnet/sdk": "0.101.0"
|
|
36
36
|
},
|
|
37
37
|
"peerDependencies": {
|
|
38
38
|
"@earendil-works/pi-coding-agent": ">=0.74.0",
|