@themoltnet/pi-extension 0.11.0 → 0.13.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.d.ts +7 -2
- package/dist/index.js +373 -75
- package/package.json +22 -7
package/dist/index.d.ts
CHANGED
|
@@ -246,7 +246,6 @@ declare const Task: TObject< {
|
|
|
246
246
|
input: TRecord<TString, TUnknown>;
|
|
247
247
|
inputSchemaCid: TString;
|
|
248
248
|
inputCid: TString;
|
|
249
|
-
criteriaCid: TUnion<[TString, TNull]>;
|
|
250
249
|
references: TArray<TObject< {
|
|
251
250
|
taskId: TUnion<[TString, TNull]>;
|
|
252
251
|
outputCid: TString;
|
|
@@ -427,7 +426,13 @@ export declare interface VmConfig {
|
|
|
427
426
|
export declare interface VmCredentials {
|
|
428
427
|
moltnetJson: string;
|
|
429
428
|
agentEnvRaw: string;
|
|
430
|
-
|
|
429
|
+
/**
|
|
430
|
+
* Pi OAuth/API-key auth blob. Null when neither `~/.pi/agent/auth.json`
|
|
431
|
+
* (or its `PI_AUTH_PATH` override) is present — in that case the daemon
|
|
432
|
+
* relies on Pi's env-var providers (`ANTHROPIC_API_KEY`, etc.) carried
|
|
433
|
+
* via `agentEnv` and the host environment instead. CI uses this path.
|
|
434
|
+
*/
|
|
435
|
+
piAuthJson: string | null;
|
|
431
436
|
agentEnv: Record<string, string | undefined>;
|
|
432
437
|
gitconfig: string | null;
|
|
433
438
|
sshPrivateKey: string | null;
|
package/dist/index.js
CHANGED
|
@@ -8195,9 +8195,8 @@ function findMainWorktree() {
|
|
|
8195
8195
|
function loadCredentials(agentDir) {
|
|
8196
8196
|
const moltnetJson = readFileSync(path.join(agentDir, "moltnet.json"), "utf8");
|
|
8197
8197
|
const agentEnvRaw = readFileSync(path.join(agentDir, "env"), "utf8");
|
|
8198
|
-
const piAuthPath = path.join(process.env.HOME ?? "", ".pi", "agent", "auth.json");
|
|
8199
|
-
|
|
8200
|
-
const piAuthJson = readFileSync(piAuthPath, "utf8");
|
|
8198
|
+
const piAuthPath = process.env.PI_AUTH_PATH ?? path.join(process.env.HOME ?? "", ".pi", "agent", "auth.json");
|
|
8199
|
+
const piAuthJson = existsSync(piAuthPath) ? readFileSync(piAuthPath, "utf8") : null;
|
|
8201
8200
|
const gitconfigPath = path.join(agentDir, "gitconfig");
|
|
8202
8201
|
const gitconfig = existsSync(gitconfigPath) ? readFileSync(gitconfigPath, "utf8") : null;
|
|
8203
8202
|
const sshDir = path.join(agentDir, "ssh");
|
|
@@ -8315,7 +8314,7 @@ async function resumeVm(config) {
|
|
|
8315
8314
|
nameserver 1.1.1.1" > /etc/resolv.conf'`);
|
|
8316
8315
|
const vmSshDir = `${vmAgentDir}/ssh`;
|
|
8317
8316
|
await vm.exec(`mkdir -p ${vmAgentDir}/ssh /home/agent/.pi/agent`);
|
|
8318
|
-
await vm.fs.writeFile("/home/agent/.pi/agent/auth.json", creds.piAuthJson, { mode: 384 });
|
|
8317
|
+
if (creds.piAuthJson !== null) await vm.fs.writeFile("/home/agent/.pi/agent/auth.json", creds.piAuthJson, { mode: 384 });
|
|
8319
8318
|
const vmMoltnetJson = rewriteMoltnetJsonPaths(creds.moltnetJson, vmAgentDir, vmSshDir, creds.githubAppPemFilename);
|
|
8320
8319
|
await vm.fs.writeFile(`${vmAgentDir}/moltnet.json`, vmMoltnetJson, { mode: 384 });
|
|
8321
8320
|
await vm.fs.writeFile(`${vmAgentDir}/env`, creds.agentEnvRaw, { mode: 384 });
|
|
@@ -8558,7 +8557,13 @@ if (!FormatRegistry.Has("date-time")) FormatRegistry.Set("date-time", (v) => !Nu
|
|
|
8558
8557
|
/**
|
|
8559
8558
|
* How a judge must score a single criterion.
|
|
8560
8559
|
*
|
|
8561
|
-
* - `llm_judged`: 0..1 continuous, `rationale` required.
|
|
8560
|
+
* - `llm_score`: 0..1 continuous, `rationale` required. Smooths failures
|
|
8561
|
+
* into the gradient — use `llm_checklist` instead for properties where
|
|
8562
|
+
* a single failure is a real failure (grounding, faithfulness).
|
|
8563
|
+
* - `llm_checklist`: judge enumerates per-claim assertions with
|
|
8564
|
+
* `{passed, evidence}`. The criterion's numeric `score` is derived:
|
|
8565
|
+
* `1` iff every assertion passes, else `0`. Per-claim evidence is the
|
|
8566
|
+
* dataset for cluster-analysis of failure modes. See #999.
|
|
8562
8567
|
* - `boolean`: 0 or 1, `rationale` optional.
|
|
8563
8568
|
* - `deterministic_signature_check`: judge runs a signature check;
|
|
8564
8569
|
* result is 0 or 1. No LLM discretion.
|
|
@@ -8566,11 +8571,31 @@ if (!FormatRegistry.Has("date-time")) FormatRegistry.Set("date-time", (v) => !Nu
|
|
|
8566
8571
|
* appears in the rendered output; 0 or 1.
|
|
8567
8572
|
*/
|
|
8568
8573
|
var RubricScoringMode = Type$1.Union([
|
|
8569
|
-
Type$1.Literal("llm_judged"),
|
|
8574
|
+
Type$1.Literal("llm_score"),
|
|
8575
|
+
Type$1.Literal("llm_checklist"),
|
|
8570
8576
|
Type$1.Literal("boolean"),
|
|
8571
8577
|
Type$1.Literal("deterministic_signature_check"),
|
|
8572
8578
|
Type$1.Literal("deterministic_coverage_check")
|
|
8573
8579
|
], { $id: "RubricScoringMode" });
|
|
8580
|
+
/**
|
|
8581
|
+
* One binary check produced by an `llm_checklist`-mode criterion.
|
|
8582
|
+
*
|
|
8583
|
+
* `evidence` is REQUIRED for both PASS and FAIL — agentskills.io grading
|
|
8584
|
+
* principle: "Don't give the benefit of the doubt." A PASS without
|
|
8585
|
+
* concrete evidence (a quoted span, an entry id, a source location)
|
|
8586
|
+
* cannot be audited. A FAIL without evidence cannot be clustered into
|
|
8587
|
+
* structural fixes. The same shape is reused by `judge-eval-variant`
|
|
8588
|
+
* (#943) so tooling, dashboards, and analysis stay uniform.
|
|
8589
|
+
*/
|
|
8590
|
+
var AssertionResult = Type$1.Object({
|
|
8591
|
+
id: Type$1.String({ minLength: 1 }),
|
|
8592
|
+
text: Type$1.String({ minLength: 1 }),
|
|
8593
|
+
passed: Type$1.Boolean(),
|
|
8594
|
+
evidence: Type$1.String({ minLength: 1 })
|
|
8595
|
+
}, {
|
|
8596
|
+
$id: "AssertionResult",
|
|
8597
|
+
additionalProperties: false
|
|
8598
|
+
});
|
|
8574
8599
|
var RubricCriterion = Type$1.Object({
|
|
8575
8600
|
id: Type$1.String({ minLength: 1 }),
|
|
8576
8601
|
description: Type$1.String({ minLength: 1 }),
|
|
@@ -8630,44 +8655,165 @@ unrelated subsystems and the test coverage on the auth path is
|
|
|
8630
8655
|
unchanged" is.
|
|
8631
8656
|
`.trim();
|
|
8632
8657
|
//#endregion
|
|
8658
|
+
//#region ../tasks/src/success-criteria.ts
|
|
8659
|
+
/**
|
|
8660
|
+
* SuccessCriteria — imposer-stated acceptance criteria, evaluated in two
|
|
8661
|
+
* complementary places.
|
|
8662
|
+
*
|
|
8663
|
+
* Before this envelope existed, criteria were scattered: a vestigial
|
|
8664
|
+
* `criteriaCid` column nobody resolved, an `acceptanceCriteria: string[]`
|
|
8665
|
+
* field on `fulfill_brief.input` that was "interpreted by the claiming
|
|
8666
|
+
* agent," and inline `rubric` / `criteria[]` fields on judgment-task
|
|
8667
|
+
* inputs. None of those were machine-verifiable end-to-end.
|
|
8668
|
+
*
|
|
8669
|
+
* This module defines a single, content-addressable envelope an imposer
|
|
8670
|
+
* attaches to any task type. It has four orthogonal sections — pick
|
|
8671
|
+
* whichever apply per task type:
|
|
8672
|
+
*
|
|
8673
|
+
* - `gates` Deterministic structural checks (CID/schema match)
|
|
8674
|
+
* - `assertions` Declarative claims about output JSON
|
|
8675
|
+
* - `rubric` Weighted-criteria scoring instrument, reused
|
|
8676
|
+
* verbatim from `./rubric.ts`.
|
|
8677
|
+
* - `sideEffects` Required process side-effects (e.g. diary entry)
|
|
8678
|
+
*
|
|
8679
|
+
* ## Two roles, two task types
|
|
8680
|
+
*
|
|
8681
|
+
* **Producer self-assessment** (fulfillment tasks: `fulfill_brief`,
|
|
8682
|
+
* `curate_pack`, `render_pack`). The producer **LLM** evaluates the
|
|
8683
|
+
* criteria against its own output and emits a `VerificationRecord`
|
|
8684
|
+
* inside `output.verification`. The daemon is pure passthrough — it
|
|
8685
|
+
* does not run `evaluateAssertions`, does not inspect the verification
|
|
8686
|
+
* record. The REST API is dumb storage; it never re-runs assertions and
|
|
8687
|
+
* never runs LLMs. The cross-field rule
|
|
8688
|
+
* `requireVerificationWhenCriteriaPresent` enforces "verification
|
|
8689
|
+
* required iff successCriteria present" at task-output validation time
|
|
8690
|
+
* (server-side schema check). Self-assessment is a truthful self-rating,
|
|
8691
|
+
* NOT enforcement — `verification.passed=false` does not block /complete
|
|
8692
|
+
* and does not affect `acceptedAttemptN`. See
|
|
8693
|
+
* `docs/agent-runtime.md` for the full producer/judge flow.
|
|
8694
|
+
*
|
|
8695
|
+
* **Binding evaluation** (judgment tasks: `assess_brief`, `judge_pack`).
|
|
8696
|
+
* A separate task whose job IS the application of `successCriteria` to
|
|
8697
|
+
* someone else's output. Different agent (enforced at claim time), same
|
|
8698
|
+
* envelope. The judge's verdict is binding: this is the *gate* in the
|
|
8699
|
+
* MoltNet model. The rubric inside `successCriteria.rubric` IS the job
|
|
8700
|
+
* spec for the judge.
|
|
8701
|
+
*
|
|
8702
|
+
* The clean chain: producer task with `successCriteria` → producer
|
|
8703
|
+
* self-assesses honestly → imposer (or automation) creates a downstream
|
|
8704
|
+
* judgment task that references the same `successCriteria` (or a
|
|
8705
|
+
* stricter rubric) → judgment task delivers the binding verdict.
|
|
8706
|
+
*
|
|
8707
|
+
* Storage: SuccessCriteria lives inline at `task.input.successCriteria`,
|
|
8708
|
+
* pinned via the task's `inputCid`. No separate column or hash. When
|
|
8709
|
+
* #881 lands, the `rubric` field can graduate to `{ rubricCid }` lookup
|
|
8710
|
+
* without changing this envelope, and producer + judge tasks can pin
|
|
8711
|
+
* the SAME rubric across the chain for end-to-end auditability.
|
|
8712
|
+
*/
|
|
8713
|
+
var SchemaCheckSpec = Type$1.Object({ schemaCid: Type$1.String({ minLength: 1 }) }, { additionalProperties: false });
|
|
8714
|
+
var CidEqualsSpec = Type$1.Object({
|
|
8715
|
+
path: Type$1.String({ minLength: 1 }),
|
|
8716
|
+
expected: Type$1.String({ minLength: 1 })
|
|
8717
|
+
}, { additionalProperties: false });
|
|
8718
|
+
var Gate = Type$1.Union([Type$1.Object({
|
|
8719
|
+
id: Type$1.String({ minLength: 1 }),
|
|
8720
|
+
kind: Type$1.Literal("schema-check"),
|
|
8721
|
+
spec: SchemaCheckSpec,
|
|
8722
|
+
required: Type$1.Boolean()
|
|
8723
|
+
}, { additionalProperties: false }), Type$1.Object({
|
|
8724
|
+
id: Type$1.String({ minLength: 1 }),
|
|
8725
|
+
kind: Type$1.Literal("cid-equals"),
|
|
8726
|
+
spec: CidEqualsSpec,
|
|
8727
|
+
required: Type$1.Boolean()
|
|
8728
|
+
}, { additionalProperties: false })], { $id: "Gate" });
|
|
8729
|
+
var AssertionOp = Type$1.Union([
|
|
8730
|
+
Type$1.Literal("exists"),
|
|
8731
|
+
Type$1.Literal("equals"),
|
|
8732
|
+
Type$1.Literal("matches"),
|
|
8733
|
+
Type$1.Literal("in-range"),
|
|
8734
|
+
Type$1.Literal("min-length")
|
|
8735
|
+
], { $id: "AssertionOp" });
|
|
8736
|
+
var Assertion = Type$1.Object({
|
|
8737
|
+
id: Type$1.String({ minLength: 1 }),
|
|
8738
|
+
path: Type$1.String({ minLength: 1 }),
|
|
8739
|
+
op: AssertionOp,
|
|
8740
|
+
value: Type$1.Optional(Type$1.Unknown())
|
|
8741
|
+
}, {
|
|
8742
|
+
$id: "Assertion",
|
|
8743
|
+
additionalProperties: false
|
|
8744
|
+
});
|
|
8745
|
+
var SideEffectsSpec = Type$1.Object({
|
|
8746
|
+
diaryEntryRequired: Type$1.Optional(Type$1.Boolean()),
|
|
8747
|
+
diaryEntryTags: Type$1.Optional(Type$1.Array(Type$1.String({ minLength: 1 }))),
|
|
8748
|
+
referencedEntries: Type$1.Optional(Type$1.Integer({ minimum: 0 }))
|
|
8749
|
+
}, {
|
|
8750
|
+
$id: "SideEffectsSpec",
|
|
8751
|
+
additionalProperties: false
|
|
8752
|
+
});
|
|
8753
|
+
var SuccessCriteria = Type$1.Object({
|
|
8754
|
+
version: Type$1.Literal(1),
|
|
8755
|
+
gates: Type$1.Optional(Type$1.Array(Gate)),
|
|
8756
|
+
assertions: Type$1.Optional(Type$1.Array(Assertion)),
|
|
8757
|
+
rubric: Type$1.Optional(Rubric),
|
|
8758
|
+
minComposite: Type$1.Optional(Type$1.Number({
|
|
8759
|
+
minimum: 0,
|
|
8760
|
+
maximum: 1
|
|
8761
|
+
})),
|
|
8762
|
+
sideEffects: Type$1.Optional(SideEffectsSpec)
|
|
8763
|
+
}, {
|
|
8764
|
+
$id: "SuccessCriteria",
|
|
8765
|
+
additionalProperties: false
|
|
8766
|
+
});
|
|
8767
|
+
var VerificationResultStatus = Type$1.Union([
|
|
8768
|
+
Type$1.Literal("pass"),
|
|
8769
|
+
Type$1.Literal("fail"),
|
|
8770
|
+
Type$1.Literal("skip")
|
|
8771
|
+
], { $id: "VerificationResultStatus" });
|
|
8772
|
+
var VerificationResultKind = Type$1.Union([
|
|
8773
|
+
Type$1.Literal("gate"),
|
|
8774
|
+
Type$1.Literal("assertion"),
|
|
8775
|
+
Type$1.Literal("rubric"),
|
|
8776
|
+
Type$1.Literal("sideEffect")
|
|
8777
|
+
], { $id: "VerificationResultKind" });
|
|
8778
|
+
var VerificationResult = Type$1.Object({
|
|
8779
|
+
id: Type$1.String({ minLength: 1 }),
|
|
8780
|
+
kind: VerificationResultKind,
|
|
8781
|
+
status: VerificationResultStatus,
|
|
8782
|
+
detail: Type$1.Optional(Type$1.String())
|
|
8783
|
+
}, {
|
|
8784
|
+
$id: "VerificationResult",
|
|
8785
|
+
additionalProperties: false
|
|
8786
|
+
});
|
|
8787
|
+
var VerificationRecord = Type$1.Object({
|
|
8788
|
+
inputCid: Type$1.String({ minLength: 1 }),
|
|
8789
|
+
results: Type$1.Array(VerificationResult),
|
|
8790
|
+
passed: Type$1.Boolean()
|
|
8791
|
+
}, {
|
|
8792
|
+
$id: "VerificationRecord",
|
|
8793
|
+
additionalProperties: false
|
|
8794
|
+
});
|
|
8795
|
+
//#endregion
|
|
8633
8796
|
//#region ../tasks/src/task-types/assess-brief.ts
|
|
8634
8797
|
/**
|
|
8635
8798
|
* `assess_brief` — independently evaluate a fulfilled brief.
|
|
8636
8799
|
*
|
|
8637
8800
|
* output_kind: judgment
|
|
8638
|
-
* criteria: required (rubric
|
|
8639
|
-
*
|
|
8801
|
+
* criteria: required (`successCriteria.rubric` — same envelope as
|
|
8802
|
+
* `judge_pack`)
|
|
8640
8803
|
* references: required (must reference the target `fulfill_brief` task)
|
|
8641
8804
|
*
|
|
8642
8805
|
* The assessor is a different agent from the producer (enforced by the
|
|
8643
8806
|
* server / runtime at claim time — not in the wire schema).
|
|
8807
|
+
*
|
|
8808
|
+
* The rubric in `successCriteria` IS the job spec — the assessor applies
|
|
8809
|
+
* it to the target task's output and emits per-criterion scores. Other
|
|
8810
|
+
* sections (`assertions`, `gates`, `sideEffects`) MAY be present and are
|
|
8811
|
+
* evaluated against the *assessor's output*.
|
|
8644
8812
|
*/
|
|
8645
8813
|
var ASSESS_BRIEF_TYPE = "assess_brief";
|
|
8646
|
-
/**
|
|
8647
|
-
* One criterion lifted from the rubric. Denormalized into the input so the
|
|
8648
|
-
* assessor prompt can be built without a second fetch; the `criteria_cid`
|
|
8649
|
-
* on the Task row remains authoritative for verification.
|
|
8650
|
-
*/
|
|
8651
|
-
var AssessBriefCriterion = Type$1.Object({
|
|
8652
|
-
id: Type$1.String({ minLength: 1 }),
|
|
8653
|
-
description: Type$1.String({ minLength: 1 }),
|
|
8654
|
-
weight: Type$1.Number({
|
|
8655
|
-
minimum: 0,
|
|
8656
|
-
maximum: 1
|
|
8657
|
-
}),
|
|
8658
|
-
scoring: Type$1.Union([
|
|
8659
|
-
Type$1.Literal("llm_judged"),
|
|
8660
|
-
Type$1.Literal("boolean"),
|
|
8661
|
-
Type$1.Literal("deterministic_signature_check")
|
|
8662
|
-
])
|
|
8663
|
-
}, {
|
|
8664
|
-
$id: "AssessBriefCriterion",
|
|
8665
|
-
additionalProperties: false
|
|
8666
|
-
});
|
|
8667
8814
|
var AssessBriefInput = Type$1.Object({
|
|
8668
8815
|
targetTaskId: Type$1.String({ format: "uuid" }),
|
|
8669
|
-
|
|
8670
|
-
rubricPreamble: Type$1.Optional(Type$1.String())
|
|
8816
|
+
successCriteria: SuccessCriteria
|
|
8671
8817
|
}, {
|
|
8672
8818
|
$id: "AssessBriefInput",
|
|
8673
8819
|
additionalProperties: false
|
|
@@ -8736,7 +8882,8 @@ var CuratePackInput = Type$1.Object({
|
|
|
8736
8882
|
prefix: Type$1.Optional(Type$1.String())
|
|
8737
8883
|
}, { additionalProperties: false })),
|
|
8738
8884
|
tokenBudget: Type$1.Optional(Type$1.Number({ minimum: 500 })),
|
|
8739
|
-
recipe: Type$1.Optional(Type$1.Union([Type$1.Literal("topic-focused-v1"), Type$1.Literal("scope-inventory-v1")]))
|
|
8885
|
+
recipe: Type$1.Optional(Type$1.Union([Type$1.Literal("topic-focused-v1"), Type$1.Literal("scope-inventory-v1")])),
|
|
8886
|
+
successCriteria: Type$1.Optional(SuccessCriteria)
|
|
8740
8887
|
}, {
|
|
8741
8888
|
$id: "CuratePackInput",
|
|
8742
8889
|
additionalProperties: false
|
|
@@ -8761,7 +8908,8 @@ var CuratePackOutput = Type$1.Object({
|
|
|
8761
8908
|
droppedIds: Type$1.Optional(Type$1.Array(Type$1.String({ format: "uuid" }))),
|
|
8762
8909
|
notes: Type$1.String({ minLength: 1 })
|
|
8763
8910
|
}, { additionalProperties: false }))),
|
|
8764
|
-
summary: Type$1.String({ minLength: 1 })
|
|
8911
|
+
summary: Type$1.String({ minLength: 1 }),
|
|
8912
|
+
verification: Type$1.Optional(VerificationRecord)
|
|
8765
8913
|
}, {
|
|
8766
8914
|
$id: "CuratePackOutput",
|
|
8767
8915
|
additionalProperties: false
|
|
@@ -8780,6 +8928,7 @@ var FulfillBriefInput = Type$1.Object({
|
|
|
8780
8928
|
brief: Type$1.String({ minLength: 1 }),
|
|
8781
8929
|
title: Type$1.Optional(Type$1.String()),
|
|
8782
8930
|
acceptanceCriteria: Type$1.Optional(Type$1.Array(Type$1.String())),
|
|
8931
|
+
successCriteria: Type$1.Optional(SuccessCriteria),
|
|
8783
8932
|
seedFiles: Type$1.Optional(Type$1.Array(Type$1.String())),
|
|
8784
8933
|
scopeHint: Type$1.Optional(Type$1.String())
|
|
8785
8934
|
}, {
|
|
@@ -8799,7 +8948,8 @@ var FulfillBriefOutput = Type$1.Object({
|
|
|
8799
8948
|
}, { additionalProperties: false })),
|
|
8800
8949
|
pullRequestUrl: Type$1.Union([Type$1.String(), Type$1.Null()]),
|
|
8801
8950
|
diaryEntryIds: Type$1.Array(Type$1.String({ format: "uuid" })),
|
|
8802
|
-
summary: Type$1.String({ minLength: 1 })
|
|
8951
|
+
summary: Type$1.String({ minLength: 1 }),
|
|
8952
|
+
verification: Type$1.Optional(VerificationRecord)
|
|
8803
8953
|
}, {
|
|
8804
8954
|
$id: "FulfillBriefOutput",
|
|
8805
8955
|
additionalProperties: false
|
|
@@ -8810,19 +8960,18 @@ var FulfillBriefOutput = Type$1.Object({
|
|
|
8810
8960
|
* `judge_pack` — independently score a rendered pack against a rubric.
|
|
8811
8961
|
*
|
|
8812
8962
|
* output_kind: judgment
|
|
8813
|
-
* criteria: required (
|
|
8814
|
-
*
|
|
8963
|
+
* criteria: required (`successCriteria.rubric` — see #852 amendment and
|
|
8964
|
+
* Phase 2 issue #881)
|
|
8815
8965
|
* references: required (must reference the `render_pack` task it judges,
|
|
8816
8966
|
* role='judged_work')
|
|
8817
8967
|
*
|
|
8818
8968
|
* Step 3 of the three-session attribution loop (#875). Mirrors
|
|
8819
8969
|
* `assess_brief` in shape, but over a rendered context pack.
|
|
8820
8970
|
*
|
|
8821
|
-
* Phase 1 rubric storage: the rubric body
|
|
8822
|
-
*
|
|
8823
|
-
* replace the inline body with a `
|
|
8824
|
-
*
|
|
8825
|
-
* building without a fetch.
|
|
8971
|
+
* Phase 1 rubric storage: the rubric body lives at
|
|
8972
|
+
* `input.successCriteria.rubric` and is pinned via the task's `inputCid`.
|
|
8973
|
+
* Phase 2 (#881) will replace the inline body with a `rubricCid`
|
|
8974
|
+
* referencing a stored `rubrics` row; the envelope stays the same.
|
|
8826
8975
|
*
|
|
8827
8976
|
* The judge MUST be a different agent from the renderer. Enforced at
|
|
8828
8977
|
* claim time by the runtime, not in the wire schema.
|
|
@@ -8831,7 +8980,7 @@ var JUDGE_PACK_TYPE = "judge_pack";
|
|
|
8831
8980
|
var JudgePackInput = Type$1.Object({
|
|
8832
8981
|
renderedPackId: Type$1.String({ format: "uuid" }),
|
|
8833
8982
|
sourcePackId: Type$1.String({ format: "uuid" }),
|
|
8834
|
-
|
|
8983
|
+
successCriteria: SuccessCriteria
|
|
8835
8984
|
}, {
|
|
8836
8985
|
$id: "JudgePackInput",
|
|
8837
8986
|
additionalProperties: false
|
|
@@ -8844,6 +8993,7 @@ var JudgePackScore = Type$1.Object({
|
|
|
8844
8993
|
maximum: 1
|
|
8845
8994
|
}),
|
|
8846
8995
|
rationale: Type$1.Optional(Type$1.String()),
|
|
8996
|
+
assertions: Type$1.Optional(Type$1.Array(AssertionResult, { minItems: 1 })),
|
|
8847
8997
|
evidence: Type$1.Optional(Type$1.Record(Type$1.String(), Type$1.Unknown()))
|
|
8848
8998
|
}, {
|
|
8849
8999
|
$id: "JudgePackScore",
|
|
@@ -8862,6 +9012,39 @@ var JudgePackOutput = Type$1.Object({
|
|
|
8862
9012
|
$id: "JudgePackOutput",
|
|
8863
9013
|
additionalProperties: false
|
|
8864
9014
|
});
|
|
9015
|
+
/**
|
|
9016
|
+
* Cross-field validator for JudgePackOutput. Run after the TypeBox
|
|
9017
|
+
* schema check passes. Enforces invariants the schema can't express:
|
|
9018
|
+
*
|
|
9019
|
+
* 1. If a `JudgePackScore` carries an `assertions` array (i.e. the
|
|
9020
|
+
* judge ran the criterion in `llm_checklist` mode), its numeric
|
|
9021
|
+
* `score` MUST equal `1` if every `assertions[i].passed` is true,
|
|
9022
|
+
* else `0`. The prompt instructs the judge to derive `score` from
|
|
9023
|
+
* the array, but the LLM can drift — without this check, the
|
|
9024
|
+
* runtime accepts inconsistent payloads and propagates them into
|
|
9025
|
+
* composite scores and judge attestations (#999 P1).
|
|
9026
|
+
*
|
|
9027
|
+
* 2. If `score` is exactly `1` AND `assertions` is present, every
|
|
9028
|
+
* assertion must have `passed: true`. Catches the failure mode in
|
|
9029
|
+
* the issue: "score: 1 with a failing assertion accepted."
|
|
9030
|
+
*
|
|
9031
|
+
* Cross-rubric checks (e.g. "did the judge populate `assertions` for
|
|
9032
|
+
* every criterion the rubric marked `llm_checklist`?") require the
|
|
9033
|
+
* input rubric and live in a separate, runtime-side validator. This
|
|
9034
|
+
* one is rubric-agnostic on purpose — it catches within-score
|
|
9035
|
+
* inconsistency without needing the original task input.
|
|
9036
|
+
*/
|
|
9037
|
+
function validateJudgePackOutput(output) {
|
|
9038
|
+
const scores = output.scores;
|
|
9039
|
+
for (let i = 0; i < scores.length; i++) {
|
|
9040
|
+
const s = scores[i];
|
|
9041
|
+
if (!s.assertions) continue;
|
|
9042
|
+
const allPassed = s.assertions.every((a) => a.passed);
|
|
9043
|
+
const expected = allPassed ? 1 : 0;
|
|
9044
|
+
if (s.score !== expected) return `scores[${i}] (criterionId="${s.criterionId}"): assertions ${allPassed ? "all pass" : "have at least one fail"} but score=${s.score}. Score must be derived: 1 iff every assertion passes, else 0 (#999 llm_checklist rule).`;
|
|
9045
|
+
}
|
|
9046
|
+
return null;
|
|
9047
|
+
}
|
|
8865
9048
|
//#endregion
|
|
8866
9049
|
//#region ../tasks/src/task-types/render-pack.ts
|
|
8867
9050
|
/**
|
|
@@ -8883,7 +9066,8 @@ var RENDER_PACK_TYPE = "render_pack";
|
|
|
8883
9066
|
var RenderPackInput = Type$1.Object({
|
|
8884
9067
|
packId: Type$1.String({ format: "uuid" }),
|
|
8885
9068
|
persist: Type$1.Optional(Type$1.Boolean()),
|
|
8886
|
-
pinned: Type$1.Optional(Type$1.Boolean())
|
|
9069
|
+
pinned: Type$1.Optional(Type$1.Boolean()),
|
|
9070
|
+
successCriteria: Type$1.Optional(SuccessCriteria)
|
|
8887
9071
|
}, {
|
|
8888
9072
|
$id: "RenderPackInput",
|
|
8889
9073
|
additionalProperties: false
|
|
@@ -8894,7 +9078,8 @@ var RenderPackOutput = Type$1.Object({
|
|
|
8894
9078
|
renderMethod: Type$1.String({ minLength: 1 }),
|
|
8895
9079
|
byteSize: Type$1.Number({ minimum: 0 }),
|
|
8896
9080
|
entriesRendered: Type$1.Number({ minimum: 0 }),
|
|
8897
|
-
summary: Type$1.String({ minLength: 1 })
|
|
9081
|
+
summary: Type$1.String({ minLength: 1 }),
|
|
9082
|
+
verification: Type$1.Optional(VerificationRecord)
|
|
8898
9083
|
}, {
|
|
8899
9084
|
$id: "RenderPackOutput",
|
|
8900
9085
|
additionalProperties: false
|
|
@@ -8902,6 +9087,33 @@ var RenderPackOutput = Type$1.Object({
|
|
|
8902
9087
|
//#endregion
|
|
8903
9088
|
//#region ../tasks/src/task-types/index.ts
|
|
8904
9089
|
/**
|
|
9090
|
+
* Validate that a judgment-task input carries a rubric inside its
|
|
9091
|
+
* `successCriteria` envelope, and that the rubric's weights sum to 1.
|
|
9092
|
+
* Used for `assess_brief` and `judge_pack`.
|
|
9093
|
+
*/
|
|
9094
|
+
function validateJudgmentInput(input) {
|
|
9095
|
+
const sc = input.successCriteria;
|
|
9096
|
+
if (!sc) return "successCriteria is required for judgment tasks";
|
|
9097
|
+
if (!sc.rubric) return "successCriteria.rubric is required for judgment tasks";
|
|
9098
|
+
return validateRubricWeights(sc.rubric);
|
|
9099
|
+
}
|
|
9100
|
+
/**
|
|
9101
|
+
* Cross-field rule: when `input.successCriteria` is set, the producer's
|
|
9102
|
+
* output MUST carry a `verification` block (the LLM's self-assessment).
|
|
9103
|
+
* When it is unset, the output MUST NOT carry one (avoid garbage data).
|
|
9104
|
+
*
|
|
9105
|
+
* Used by all three fulfillment task types. Judgment task outputs do
|
|
9106
|
+
* NOT use this — their entire output IS a structured judgment, so a
|
|
9107
|
+
* separate self-assessment field would be circular.
|
|
9108
|
+
*/
|
|
9109
|
+
function requireVerificationWhenCriteriaPresent(output, input) {
|
|
9110
|
+
const hasCriteria = input !== void 0 && input !== null && input.successCriteria !== void 0;
|
|
9111
|
+
const hasVerification = output.verification !== void 0;
|
|
9112
|
+
if (hasCriteria && !hasVerification) return "output.verification is required because input.successCriteria is set; the producer LLM must self-assess against the criteria";
|
|
9113
|
+
if (!hasCriteria && hasVerification) return "output.verification was supplied but input.successCriteria is unset; omit verification when there are no criteria to assess against";
|
|
9114
|
+
return null;
|
|
9115
|
+
}
|
|
9116
|
+
/**
|
|
8905
9117
|
* Client-side task-type registry. Mirrors the server-owned DB registry
|
|
8906
9118
|
* (PR 2). PR 0 shipped the two brief types; this PR adds the three
|
|
8907
9119
|
* pack-pipeline types for the three-session attribution loop (#875).
|
|
@@ -8916,41 +9128,41 @@ var BUILT_IN_TASK_TYPES = {
|
|
|
8916
9128
|
inputSchema: FulfillBriefInput,
|
|
8917
9129
|
outputSchema: FulfillBriefOutput,
|
|
8918
9130
|
outputKind: "artifact",
|
|
8919
|
-
|
|
8920
|
-
|
|
9131
|
+
requiresReferences: false,
|
|
9132
|
+
validateOutput: requireVerificationWhenCriteriaPresent
|
|
8921
9133
|
},
|
|
8922
9134
|
[ASSESS_BRIEF_TYPE]: {
|
|
8923
9135
|
name: ASSESS_BRIEF_TYPE,
|
|
8924
9136
|
inputSchema: AssessBriefInput,
|
|
8925
9137
|
outputSchema: AssessBriefOutput,
|
|
8926
9138
|
outputKind: "judgment",
|
|
8927
|
-
|
|
8928
|
-
|
|
9139
|
+
requiresReferences: true,
|
|
9140
|
+
validateInput: validateJudgmentInput
|
|
8929
9141
|
},
|
|
8930
9142
|
[CURATE_PACK_TYPE]: {
|
|
8931
9143
|
name: CURATE_PACK_TYPE,
|
|
8932
9144
|
inputSchema: CuratePackInput,
|
|
8933
9145
|
outputSchema: CuratePackOutput,
|
|
8934
9146
|
outputKind: "artifact",
|
|
8935
|
-
|
|
8936
|
-
|
|
9147
|
+
requiresReferences: false,
|
|
9148
|
+
validateOutput: requireVerificationWhenCriteriaPresent
|
|
8937
9149
|
},
|
|
8938
9150
|
[RENDER_PACK_TYPE]: {
|
|
8939
9151
|
name: RENDER_PACK_TYPE,
|
|
8940
9152
|
inputSchema: RenderPackInput,
|
|
8941
9153
|
outputSchema: RenderPackOutput,
|
|
8942
9154
|
outputKind: "artifact",
|
|
8943
|
-
|
|
8944
|
-
|
|
9155
|
+
requiresReferences: false,
|
|
9156
|
+
validateOutput: requireVerificationWhenCriteriaPresent
|
|
8945
9157
|
},
|
|
8946
9158
|
[JUDGE_PACK_TYPE]: {
|
|
8947
9159
|
name: JUDGE_PACK_TYPE,
|
|
8948
9160
|
inputSchema: JudgePackInput,
|
|
8949
9161
|
outputSchema: JudgePackOutput,
|
|
8950
9162
|
outputKind: "judgment",
|
|
8951
|
-
requiresCriteria: false,
|
|
8952
9163
|
requiresReferences: true,
|
|
8953
|
-
validateInput:
|
|
9164
|
+
validateInput: validateJudgmentInput,
|
|
9165
|
+
validateOutput: validateJudgePackOutput
|
|
8954
9166
|
}
|
|
8955
9167
|
};
|
|
8956
9168
|
//#endregion
|
|
@@ -8980,13 +9192,22 @@ function schemaErrors(prefix, schema, value) {
|
|
|
8980
9192
|
message: error.message
|
|
8981
9193
|
}));
|
|
8982
9194
|
}
|
|
8983
|
-
function validateTaskOutput(taskType, output) {
|
|
9195
|
+
function validateTaskOutput(taskType, output, input) {
|
|
8984
9196
|
const entry = getTaskTypeEntry(taskType);
|
|
8985
9197
|
if (!entry) return [{
|
|
8986
9198
|
field: "taskType",
|
|
8987
9199
|
message: `Unknown task type: ${taskType}`
|
|
8988
9200
|
}];
|
|
8989
|
-
|
|
9201
|
+
const errors = schemaErrors("output", entry.outputSchema, output);
|
|
9202
|
+
if (errors.length > 0) return errors;
|
|
9203
|
+
if (entry.validateOutput) {
|
|
9204
|
+
const validationError = entry.validateOutput(output, input);
|
|
9205
|
+
if (validationError) return [{
|
|
9206
|
+
field: "output",
|
|
9207
|
+
message: validationError
|
|
9208
|
+
}];
|
|
9209
|
+
}
|
|
9210
|
+
return [];
|
|
8990
9211
|
}
|
|
8991
9212
|
/**
|
|
8992
9213
|
* Resolve the TypeBox output schema registered for `taskType`. Returns
|
|
@@ -9126,7 +9347,6 @@ Type$1.Object({
|
|
|
9126
9347
|
input: Type$1.Record(Type$1.String(), Type$1.Unknown()),
|
|
9127
9348
|
inputSchemaCid: Cid,
|
|
9128
9349
|
inputCid: Cid,
|
|
9129
|
-
criteriaCid: Type$1.Union([Cid, Type$1.Null()]),
|
|
9130
9350
|
references: Type$1.Array(TaskRef),
|
|
9131
9351
|
correlationId: Type$1.Union([Uuid, Type$1.Null()]),
|
|
9132
9352
|
imposedByAgentId: Type$1.Union([Uuid, Type$1.Null()]),
|
|
@@ -9340,11 +9560,12 @@ function buildFinalOutputBlock(opts) {
|
|
|
9340
9560
|
* anything) work without any code path here.
|
|
9341
9561
|
*/
|
|
9342
9562
|
function buildAssessBriefPrompt(input, ctx) {
|
|
9343
|
-
const
|
|
9344
|
-
const
|
|
9563
|
+
const rubric = input.successCriteria.rubric;
|
|
9564
|
+
const criteriaList = rubric.criteria.map((c, i) => `${i + 1}. **${c.id}** (weight ${c.weight}, scoring: \`${c.scoring}\`) — ${c.description}`).join("\n");
|
|
9565
|
+
const preambleSection = rubric.preamble ? [
|
|
9345
9566
|
"### Rubric preamble",
|
|
9346
9567
|
"",
|
|
9347
|
-
|
|
9568
|
+
rubric.preamble,
|
|
9348
9569
|
""
|
|
9349
9570
|
].join("\n") : "";
|
|
9350
9571
|
return [
|
|
@@ -9394,7 +9615,7 @@ function buildAssessBriefPrompt(input, ctx) {
|
|
|
9394
9615
|
"",
|
|
9395
9616
|
"### Scoring rules",
|
|
9396
9617
|
"",
|
|
9397
|
-
"- `
|
|
9618
|
+
"- `llm_score`: score 0..1 continuous. `rationale` REQUIRED (2–4 sentences).",
|
|
9398
9619
|
"- `boolean`: score exactly 0 or 1. `rationale` optional.",
|
|
9399
9620
|
"- `deterministic_signature_check`: run `moltnet entry verify` on every diary entry returned by step 3 above AND `git verify-commit` on every commit. Score 1 iff ALL signatures are valid; otherwise 0. Populate `evidence.commitsVerified`, `evidence.commitsTotal`, `evidence.signatureFailures`.",
|
|
9400
9621
|
"",
|
|
@@ -9418,6 +9639,39 @@ function buildAssessBriefPrompt(input, ctx) {
|
|
|
9418
9639
|
].filter(Boolean).join("\n");
|
|
9419
9640
|
}
|
|
9420
9641
|
//#endregion
|
|
9642
|
+
//#region ../agent-runtime/src/prompts/self-verification.ts
|
|
9643
|
+
function buildSelfVerificationBlock(taskId) {
|
|
9644
|
+
return [
|
|
9645
|
+
"## Self-verification",
|
|
9646
|
+
"",
|
|
9647
|
+
`Call \`moltnet_get_task\` with task id \`${taskId}\` and read \`input.successCriteria\`.`,
|
|
9648
|
+
"",
|
|
9649
|
+
"- If `input.successCriteria` is **absent**, omit `verification` from your",
|
|
9650
|
+
" final output entirely.",
|
|
9651
|
+
"- If `input.successCriteria` is **present**, you MUST include a",
|
|
9652
|
+
" `verification` block in your final output. Evaluate every applicable",
|
|
9653
|
+
" item — `gates`, `assertions`, `rubric` criteria, `sideEffects` — against",
|
|
9654
|
+
" your produced work and emit one result per id. Be honest: a `fail` with",
|
|
9655
|
+
" a one-line reason is more useful than a false `pass`. Use `skip` (with a",
|
|
9656
|
+
" `detail`) when you genuinely could not determine a result. Compute",
|
|
9657
|
+
" `passed = results.every(r => r.status !== 'fail')`.",
|
|
9658
|
+
"",
|
|
9659
|
+
"Verification shape:",
|
|
9660
|
+
"",
|
|
9661
|
+
"```json",
|
|
9662
|
+
"{",
|
|
9663
|
+
" \"inputCid\": \"<the inputCid you saw on the task>\",",
|
|
9664
|
+
" \"results\": [",
|
|
9665
|
+
" { \"id\": \"<criterion id>\", \"kind\": \"assertion|gate|rubric|sideEffect\",",
|
|
9666
|
+
" \"status\": \"pass|fail|skip\", \"detail\": \"<optional one-liner>\" }",
|
|
9667
|
+
" ],",
|
|
9668
|
+
" \"passed\": <boolean>",
|
|
9669
|
+
"}",
|
|
9670
|
+
"```",
|
|
9671
|
+
""
|
|
9672
|
+
].join("\n");
|
|
9673
|
+
}
|
|
9674
|
+
//#endregion
|
|
9421
9675
|
//#region ../agent-runtime/src/prompts/curate-pack.ts
|
|
9422
9676
|
/**
|
|
9423
9677
|
* Build the system prompt for a `curate_pack` task.
|
|
@@ -9552,6 +9806,7 @@ function buildCuratePackPrompt(input, ctx) {
|
|
|
9552
9806
|
" output, not in the diary.",
|
|
9553
9807
|
"- Respect hard include/exclude filters literally.",
|
|
9554
9808
|
"",
|
|
9809
|
+
buildSelfVerificationBlock(ctx.taskId),
|
|
9555
9810
|
buildFinalOutputBlock({
|
|
9556
9811
|
taskType: "curate_pack",
|
|
9557
9812
|
outputSchemaName: "CuratePackOutput",
|
|
@@ -9566,7 +9821,8 @@ function buildCuratePackPrompt(input, ctx) {
|
|
|
9566
9821
|
" \"checkpoints\": [",
|
|
9567
9822
|
" { \"phase\": \"recon\", \"candidateIds\": [...], \"droppedIds\": [...], \"notes\": \"...\" }",
|
|
9568
9823
|
" ],",
|
|
9569
|
-
" \"summary\": \"<2-4 sentences: what you looked for, how you narrowed, what defines the final set>\"",
|
|
9824
|
+
" \"summary\": \"<2-4 sentences: what you looked for, how you narrowed, what defines the final set>\",",
|
|
9825
|
+
" \"verification\": <required iff input.successCriteria; see Self-verification>",
|
|
9570
9826
|
"}"
|
|
9571
9827
|
].join("\n")
|
|
9572
9828
|
})
|
|
@@ -9596,7 +9852,21 @@ function buildFulfillBriefPrompt(input, ctx) {
|
|
|
9596
9852
|
...seedFiles.map((f) => `- \`${f}\``),
|
|
9597
9853
|
""
|
|
9598
9854
|
].join("\n") : "";
|
|
9599
|
-
const branchSlug = scopeHint ? `feat/${scopeHint}-` : "feat/";
|
|
9855
|
+
const branchSlug = ctx.correlationId ? `moltnet/${ctx.correlationId}/` : scopeHint ? `feat/${scopeHint}-` : "feat/";
|
|
9856
|
+
const correlationSection = ctx.correlationId ? [
|
|
9857
|
+
"### Correlation",
|
|
9858
|
+
"",
|
|
9859
|
+
`This task carries correlationId \`${ctx.correlationId}\`. You MUST:`,
|
|
9860
|
+
"",
|
|
9861
|
+
`1. Name your branch \`moltnet/${ctx.correlationId}/<short-slug>\` — use a`,
|
|
9862
|
+
" slug derived from the brief title (lowercase-kebab, ≤60 chars).",
|
|
9863
|
+
`2. Include the trailer \`Moltnet-Correlation-Id: ${ctx.correlationId}\` on`,
|
|
9864
|
+
" your **first** commit on that branch (subsequent commits do not need it).",
|
|
9865
|
+
"",
|
|
9866
|
+
"These are recovery anchors for the MoltNet mention-bot. Do not deviate",
|
|
9867
|
+
"from this branch naming scheme when correlationId is set.",
|
|
9868
|
+
""
|
|
9869
|
+
].join("\n") : "";
|
|
9600
9870
|
return [
|
|
9601
9871
|
"# Fulfill Brief Agent",
|
|
9602
9872
|
"",
|
|
@@ -9616,6 +9886,7 @@ function buildFulfillBriefPrompt(input, ctx) {
|
|
|
9616
9886
|
"",
|
|
9617
9887
|
criteriaSection,
|
|
9618
9888
|
seedSection,
|
|
9889
|
+
correlationSection,
|
|
9619
9890
|
"### Workflow",
|
|
9620
9891
|
"",
|
|
9621
9892
|
`1. Create a feature branch (starting prefix suggestion: \`${branchSlug}<short-slug>\`).`,
|
|
@@ -9627,6 +9898,7 @@ function buildFulfillBriefPrompt(input, ctx) {
|
|
|
9627
9898
|
" `MoltNet-Diary: <id>` (per the runtime instructor).",
|
|
9628
9899
|
"6. Push the branch and open a PR.",
|
|
9629
9900
|
"",
|
|
9901
|
+
buildSelfVerificationBlock(ctx.taskId),
|
|
9630
9902
|
buildFinalOutputBlock({
|
|
9631
9903
|
taskType: "fulfill_brief",
|
|
9632
9904
|
outputSchemaName: "FulfillBriefOutput",
|
|
@@ -9636,7 +9908,8 @@ function buildFulfillBriefPrompt(input, ctx) {
|
|
|
9636
9908
|
" \"commits\": [{ \"sha\": \"...\", \"message\": \"...\", \"diaryEntryId\": \"...\" }],",
|
|
9637
9909
|
" \"pullRequestUrl\": \"<url-or-null>\",",
|
|
9638
9910
|
" \"diaryEntryIds\": [\"...\"],",
|
|
9639
|
-
" \"summary\": \"<1-3 sentence recap>\"",
|
|
9911
|
+
" \"summary\": \"<1-3 sentence recap>\",",
|
|
9912
|
+
" \"verification\": <required iff input.successCriteria; see Self-verification>",
|
|
9640
9913
|
"}"
|
|
9641
9914
|
].join("\n")
|
|
9642
9915
|
})
|
|
@@ -9645,7 +9918,8 @@ function buildFulfillBriefPrompt(input, ctx) {
|
|
|
9645
9918
|
//#endregion
|
|
9646
9919
|
//#region ../agent-runtime/src/prompts/judge-pack.ts
|
|
9647
9920
|
function buildJudgePackPrompt(input, ctx) {
|
|
9648
|
-
const { renderedPackId, sourcePackId,
|
|
9921
|
+
const { renderedPackId, sourcePackId, successCriteria } = input;
|
|
9922
|
+
const rubric = successCriteria.rubric;
|
|
9649
9923
|
const criteriaList = rubric.criteria.map((c, i) => `${i + 1}. **${c.id}** (weight ${c.weight}, scoring: \`${c.scoring}\`) — ${c.description}`).join("\n");
|
|
9650
9924
|
const preambleSection = rubric.preamble ? [
|
|
9651
9925
|
"### Rubric preamble",
|
|
@@ -9675,7 +9949,7 @@ function buildJudgePackPrompt(input, ctx) {
|
|
|
9675
9949
|
"",
|
|
9676
9950
|
"1. Call `moltnet_rendered_pack_get` for the rendered pack. Keep the",
|
|
9677
9951
|
" `content` string — you will score it.",
|
|
9678
|
-
"2. Call `moltnet_pack_get` with `
|
|
9952
|
+
"2. Call `moltnet_pack_get` with `expandEntries: true` for the source",
|
|
9679
9953
|
" pack. Keep the source entries for grounding / coverage checks.",
|
|
9680
9954
|
"3. For each criterion, score according to its `scoring` mode (see",
|
|
9681
9955
|
" Scoring rules below). Produce rationales where required.",
|
|
@@ -9688,9 +9962,23 @@ function buildJudgePackPrompt(input, ctx) {
|
|
|
9688
9962
|
"",
|
|
9689
9963
|
"### Scoring rules",
|
|
9690
9964
|
"",
|
|
9691
|
-
"- `
|
|
9965
|
+
"- `llm_score`: score 0..1 continuous. `rationale` REQUIRED (2–4",
|
|
9692
9966
|
" sentences pointing at specific evidence in the rendered content or",
|
|
9693
|
-
" the source entries).",
|
|
9967
|
+
" the source entries). NOTE: this mode smooths individual failures",
|
|
9968
|
+
" into the gradient. Prefer `llm_checklist` for grounding,",
|
|
9969
|
+
" faithfulness, or any property where one failure is a real failure.",
|
|
9970
|
+
"- `llm_checklist`: enumerate per-claim binary assertions instead of",
|
|
9971
|
+
" picking a continuous score. For each assertion, return",
|
|
9972
|
+
" `{ id, text, passed: bool, evidence: string }`. `evidence` is",
|
|
9973
|
+
" REQUIRED for both PASS and FAIL — for PASS, quote the supporting",
|
|
9974
|
+
" span (rendered or source) or cite the source entry id; for FAIL,",
|
|
9975
|
+
" quote the offending claim verbatim and explain why it fails.",
|
|
9976
|
+
" Don't give the benefit of the doubt: if a claim looks supported but",
|
|
9977
|
+
" you cannot point at the supporting source span, mark it FAIL with",
|
|
9978
|
+
" evidence = \"no supporting span found\". Set the criterion `score`",
|
|
9979
|
+
" to `1` iff every assertion passes, else `0` — the runtime checks",
|
|
9980
|
+
" this matches the assertions array. Populate `assertions` on the",
|
|
9981
|
+
" score object; leave `evidence` (the structured record) empty.",
|
|
9694
9982
|
"- `boolean`: score exactly 0 or 1. `rationale` optional.",
|
|
9695
9983
|
"- `deterministic_signature_check`: batch-fetch ALL referenced source",
|
|
9696
9984
|
" entries in a single call — `moltnet_list_entries` with `entryIds` set",
|
|
@@ -9730,7 +10018,14 @@ function buildJudgePackPrompt(input, ctx) {
|
|
|
9730
10018
|
shapeSketch: [
|
|
9731
10019
|
"{",
|
|
9732
10020
|
" \"scores\": [",
|
|
9733
|
-
" { \"criterionId\": \"...\", \"score\": 0.0, \"rationale\": \"...\", \"evidence\": {} }",
|
|
10021
|
+
" { \"criterionId\": \"...\", \"score\": 0.0, \"rationale\": \"...\", \"evidence\": {} },",
|
|
10022
|
+
" {",
|
|
10023
|
+
" \"criterionId\": \"<llm_checklist criterion>\",",
|
|
10024
|
+
" \"score\": 0, // 1 iff every assertion passed",
|
|
10025
|
+
" \"assertions\": [",
|
|
10026
|
+
" { \"id\": \"claim-1\", \"text\": \"...\", \"passed\": false, \"evidence\": \"...\" }",
|
|
10027
|
+
" ]",
|
|
10028
|
+
" }",
|
|
9734
10029
|
" ],",
|
|
9735
10030
|
" \"composite\": <sum-of-weighted-scores>,",
|
|
9736
10031
|
" \"verdict\": \"<1-3 sentence overall>\",",
|
|
@@ -9772,7 +10067,7 @@ function buildRenderPackPrompt(input, ctx) {
|
|
|
9772
10067
|
"",
|
|
9773
10068
|
"## Workflow",
|
|
9774
10069
|
"",
|
|
9775
|
-
"1. Call `moltnet_pack_get` with `
|
|
10070
|
+
"1. Call `moltnet_pack_get` with `expandEntries: true` to inspect the",
|
|
9776
10071
|
" source entries. Read it — you need the entry count for your output.",
|
|
9777
10072
|
"2. Call `moltnet_pack_render` with:",
|
|
9778
10073
|
` - \`packId\`: \`${packId}\``,
|
|
@@ -9787,6 +10082,7 @@ function buildRenderPackPrompt(input, ctx) {
|
|
|
9787
10082
|
"- Do NOT write diary entries unless a genuine incident occurs",
|
|
9788
10083
|
" (rendering failure, invariant violation).",
|
|
9789
10084
|
"",
|
|
10085
|
+
buildSelfVerificationBlock(ctx.taskId),
|
|
9790
10086
|
buildFinalOutputBlock({
|
|
9791
10087
|
taskType: "render_pack",
|
|
9792
10088
|
outputSchemaName: "RenderPackOutput",
|
|
@@ -9797,7 +10093,8 @@ function buildRenderPackPrompt(input, ctx) {
|
|
|
9797
10093
|
" \"renderMethod\": \"<label>\",",
|
|
9798
10094
|
" \"byteSize\": <int>,",
|
|
9799
10095
|
" \"entriesRendered\": <int>,",
|
|
9800
|
-
" \"summary\": \"<1-3 sentence recap>\"",
|
|
10096
|
+
" \"summary\": \"<1-3 sentence recap>\",",
|
|
10097
|
+
" \"verification\": <required iff input.successCriteria; see Self-verification>",
|
|
9801
10098
|
"}"
|
|
9802
10099
|
].join("\n")
|
|
9803
10100
|
})
|
|
@@ -9818,7 +10115,8 @@ function buildPromptForTask(task, ctx) {
|
|
|
9818
10115
|
}
|
|
9819
10116
|
return buildFulfillBriefPrompt(task.input, {
|
|
9820
10117
|
diaryId: ctx.diaryId,
|
|
9821
|
-
taskId: ctx.taskId
|
|
10118
|
+
taskId: ctx.taskId,
|
|
10119
|
+
correlationId: task.correlationId
|
|
9822
10120
|
});
|
|
9823
10121
|
case ASSESS_BRIEF_TYPE:
|
|
9824
10122
|
if (!Value.Check(AssessBriefInput, task.input)) {
|
|
@@ -13567,9 +13865,9 @@ function createSubmitOutputTool(taskType, opts = {}) {
|
|
|
13567
13865
|
description: contract.description,
|
|
13568
13866
|
parameters: schema,
|
|
13569
13867
|
async execute(_id, params) {
|
|
13570
|
-
const errors =
|
|
13868
|
+
const errors = validateTaskOutput(taskType, params);
|
|
13571
13869
|
if (errors.length > 0) {
|
|
13572
|
-
const detailMsg = errors.slice(0, 3).map((err) => `${err.
|
|
13870
|
+
const detailMsg = errors.slice(0, 3).map((err) => `${err.field}: ${err.message}`).join("; ");
|
|
13573
13871
|
const details = {
|
|
13574
13872
|
captured: false,
|
|
13575
13873
|
callCount,
|
|
@@ -13583,7 +13881,7 @@ function createSubmitOutputTool(taskType, opts = {}) {
|
|
|
13583
13881
|
return {
|
|
13584
13882
|
content: [{
|
|
13585
13883
|
type: "text",
|
|
13586
|
-
text: `Output failed
|
|
13884
|
+
text: `Output failed validation: ${detailMsg}. Re-call this tool with a corrected output.`
|
|
13587
13885
|
}],
|
|
13588
13886
|
details,
|
|
13589
13887
|
isError: true
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@themoltnet/pi-extension",
|
|
3
|
-
"version": "0.
|
|
3
|
+
"version": "0.13.0",
|
|
4
4
|
"type": "module",
|
|
5
5
|
"description": "MoltNet pi extension — sandboxed tool execution in Gondolin VMs with MoltNet identity and persistent memory",
|
|
6
6
|
"license": "MIT",
|
|
@@ -31,8 +31,8 @@
|
|
|
31
31
|
"@earendil-works/gondolin": "^0.7.0",
|
|
32
32
|
"@opentelemetry/api": "^1.9.0",
|
|
33
33
|
"@sinclair/typebox": "^0.34.0",
|
|
34
|
-
"@themoltnet/
|
|
35
|
-
"@themoltnet/
|
|
34
|
+
"@themoltnet/agent-runtime": "0.10.0",
|
|
35
|
+
"@themoltnet/sdk": "0.98.0"
|
|
36
36
|
},
|
|
37
37
|
"peerDependencies": {
|
|
38
38
|
"@mariozechner/pi-coding-agent": ">=0.73.0",
|
|
@@ -61,10 +61,25 @@
|
|
|
61
61
|
"engines": {
|
|
62
62
|
"node": ">=22"
|
|
63
63
|
},
|
|
64
|
+
"nx": {
|
|
65
|
+
"tags": [
|
|
66
|
+
"type:runtime",
|
|
67
|
+
"scope:agent",
|
|
68
|
+
"platform:extension"
|
|
69
|
+
],
|
|
70
|
+
"targets": {
|
|
71
|
+
"test-ci": {
|
|
72
|
+
"executor": "nx:noop",
|
|
73
|
+
"dependsOn": [
|
|
74
|
+
"test"
|
|
75
|
+
],
|
|
76
|
+
"metadata": {
|
|
77
|
+
"description": "Alias for `test` on projects without atomization."
|
|
78
|
+
}
|
|
79
|
+
}
|
|
80
|
+
}
|
|
81
|
+
},
|
|
64
82
|
"scripts": {
|
|
65
|
-
"
|
|
66
|
-
"check:pack": "tsx ../../tools/src/check-pack.ts --package .",
|
|
67
|
-
"build": "vite build",
|
|
68
|
-
"test": "vitest run --passWithNoTests"
|
|
83
|
+
"check:pack": "tsx ../../tools/src/check-pack.ts --package ."
|
|
69
84
|
}
|
|
70
85
|
}
|