@themoltnet/pi-extension 0.11.0 → 0.13.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.d.ts CHANGED
@@ -246,7 +246,6 @@ declare const Task: TObject< {
246
246
  input: TRecord<TString, TUnknown>;
247
247
  inputSchemaCid: TString;
248
248
  inputCid: TString;
249
- criteriaCid: TUnion<[TString, TNull]>;
250
249
  references: TArray<TObject< {
251
250
  taskId: TUnion<[TString, TNull]>;
252
251
  outputCid: TString;
@@ -427,7 +426,13 @@ export declare interface VmConfig {
427
426
  export declare interface VmCredentials {
428
427
  moltnetJson: string;
429
428
  agentEnvRaw: string;
430
- piAuthJson: string;
429
+ /**
430
+ * Pi OAuth/API-key auth blob. Null when neither `~/.pi/agent/auth.json`
431
+ * (or its `PI_AUTH_PATH` override) is present — in that case the daemon
432
+ * relies on Pi's env-var providers (`ANTHROPIC_API_KEY`, etc.) carried
433
+ * via `agentEnv` and the host environment instead. CI uses this path.
434
+ */
435
+ piAuthJson: string | null;
431
436
  agentEnv: Record<string, string | undefined>;
432
437
  gitconfig: string | null;
433
438
  sshPrivateKey: string | null;
package/dist/index.js CHANGED
@@ -8195,9 +8195,8 @@ function findMainWorktree() {
8195
8195
  function loadCredentials(agentDir) {
8196
8196
  const moltnetJson = readFileSync(path.join(agentDir, "moltnet.json"), "utf8");
8197
8197
  const agentEnvRaw = readFileSync(path.join(agentDir, "env"), "utf8");
8198
- const piAuthPath = path.join(process.env.HOME ?? "", ".pi", "agent", "auth.json");
8199
- if (!existsSync(piAuthPath)) throw new Error(`Pi OAuth credentials not found at ${piAuthPath}. Run: pi login`);
8200
- const piAuthJson = readFileSync(piAuthPath, "utf8");
8198
+ const piAuthPath = process.env.PI_AUTH_PATH ?? path.join(process.env.HOME ?? "", ".pi", "agent", "auth.json");
8199
+ const piAuthJson = existsSync(piAuthPath) ? readFileSync(piAuthPath, "utf8") : null;
8201
8200
  const gitconfigPath = path.join(agentDir, "gitconfig");
8202
8201
  const gitconfig = existsSync(gitconfigPath) ? readFileSync(gitconfigPath, "utf8") : null;
8203
8202
  const sshDir = path.join(agentDir, "ssh");
@@ -8315,7 +8314,7 @@ async function resumeVm(config) {
8315
8314
  nameserver 1.1.1.1" > /etc/resolv.conf'`);
8316
8315
  const vmSshDir = `${vmAgentDir}/ssh`;
8317
8316
  await vm.exec(`mkdir -p ${vmAgentDir}/ssh /home/agent/.pi/agent`);
8318
- await vm.fs.writeFile("/home/agent/.pi/agent/auth.json", creds.piAuthJson, { mode: 384 });
8317
+ if (creds.piAuthJson !== null) await vm.fs.writeFile("/home/agent/.pi/agent/auth.json", creds.piAuthJson, { mode: 384 });
8319
8318
  const vmMoltnetJson = rewriteMoltnetJsonPaths(creds.moltnetJson, vmAgentDir, vmSshDir, creds.githubAppPemFilename);
8320
8319
  await vm.fs.writeFile(`${vmAgentDir}/moltnet.json`, vmMoltnetJson, { mode: 384 });
8321
8320
  await vm.fs.writeFile(`${vmAgentDir}/env`, creds.agentEnvRaw, { mode: 384 });
@@ -8558,7 +8557,13 @@ if (!FormatRegistry.Has("date-time")) FormatRegistry.Set("date-time", (v) => !Nu
8558
8557
  /**
8559
8558
  * How a judge must score a single criterion.
8560
8559
  *
8561
- * - `llm_judged`: 0..1 continuous, `rationale` required.
8560
+ * - `llm_score`: 0..1 continuous, `rationale` required. Smooths failures
8561
+ * into the gradient — use `llm_checklist` instead for properties where
8562
+ * a single failure is a real failure (grounding, faithfulness).
8563
+ * - `llm_checklist`: judge enumerates per-claim assertions with
8564
+ * `{passed, evidence}`. The criterion's numeric `score` is derived:
8565
+ * `1` iff every assertion passes, else `0`. Per-claim evidence is the
8566
+ * dataset for cluster-analysis of failure modes. See #999.
8562
8567
  * - `boolean`: 0 or 1, `rationale` optional.
8563
8568
  * - `deterministic_signature_check`: judge runs a signature check;
8564
8569
  * result is 0 or 1. No LLM discretion.
@@ -8566,11 +8571,31 @@ if (!FormatRegistry.Has("date-time")) FormatRegistry.Set("date-time", (v) => !Nu
8566
8571
  * appears in the rendered output; 0 or 1.
8567
8572
  */
8568
8573
  var RubricScoringMode = Type$1.Union([
8569
- Type$1.Literal("llm_judged"),
8574
+ Type$1.Literal("llm_score"),
8575
+ Type$1.Literal("llm_checklist"),
8570
8576
  Type$1.Literal("boolean"),
8571
8577
  Type$1.Literal("deterministic_signature_check"),
8572
8578
  Type$1.Literal("deterministic_coverage_check")
8573
8579
  ], { $id: "RubricScoringMode" });
8580
+ /**
8581
+ * One binary check produced by an `llm_checklist`-mode criterion.
8582
+ *
8583
+ * `evidence` is REQUIRED for both PASS and FAIL — agentskills.io grading
8584
+ * principle: \"Don't give the benefit of the doubt.\" A PASS without
8585
+ * concrete evidence (a quoted span, an entry id, a source location)
8586
+ * cannot be audited. A FAIL without evidence cannot be clustered into
8587
+ * structural fixes. The same shape is reused by `judge-eval-variant`
8588
+ * (#943) so tooling, dashboards, and analysis stay uniform.
8589
+ */
8590
+ var AssertionResult = Type$1.Object({
8591
+ id: Type$1.String({ minLength: 1 }),
8592
+ text: Type$1.String({ minLength: 1 }),
8593
+ passed: Type$1.Boolean(),
8594
+ evidence: Type$1.String({ minLength: 1 })
8595
+ }, {
8596
+ $id: "AssertionResult",
8597
+ additionalProperties: false
8598
+ });
8574
8599
  var RubricCriterion = Type$1.Object({
8575
8600
  id: Type$1.String({ minLength: 1 }),
8576
8601
  description: Type$1.String({ minLength: 1 }),
@@ -8630,44 +8655,165 @@ unrelated subsystems and the test coverage on the auth path is
8630
8655
  unchanged" is.
8631
8656
  `.trim();
8632
8657
  //#endregion
8658
+ //#region ../tasks/src/success-criteria.ts
8659
+ /**
8660
+ * SuccessCriteria — imposer-stated acceptance criteria, evaluated in two
8661
+ * complementary places.
8662
+ *
8663
+ * Before this envelope existed, criteria were scattered: a vestigial
8664
+ * `criteriaCid` column nobody resolved, an `acceptanceCriteria: string[]`
8665
+ * field on `fulfill_brief.input` that was "interpreted by the claiming
8666
+ * agent," and inline `rubric` / `criteria[]` fields on judgment-task
8667
+ * inputs. None of those were machine-verifiable end-to-end.
8668
+ *
8669
+ * This module defines a single, content-addressable envelope an imposer
8670
+ * attaches to any task type. It has four orthogonal sections — pick
8671
+ * whichever apply per task type:
8672
+ *
8673
+ * - `gates` Deterministic structural checks (CID/schema match)
8674
+ * - `assertions` Declarative claims about output JSON
8675
+ * - `rubric` Weighted-criteria scoring instrument, reused
8676
+ * verbatim from `./rubric.ts`.
8677
+ * - `sideEffects` Required process side-effects (e.g. diary entry)
8678
+ *
8679
+ * ## Two roles, two task types
8680
+ *
8681
+ * **Producer self-assessment** (fulfillment tasks: `fulfill_brief`,
8682
+ * `curate_pack`, `render_pack`). The producer **LLM** evaluates the
8683
+ * criteria against its own output and emits a `VerificationRecord`
8684
+ * inside `output.verification`. The daemon is pure passthrough — it
8685
+ * does not run `evaluateAssertions`, does not inspect the verification
8686
+ * record. The REST API is dumb storage; it never re-runs assertions and
8687
+ * never runs LLMs. The cross-field rule
8688
+ * `requireVerificationWhenCriteriaPresent` enforces "verification
8689
+ * required iff successCriteria present" at task-output validation time
8690
+ * (server-side schema check). Self-assessment is a truthful self-rating,
8691
+ * NOT enforcement — `verification.passed=false` does not block /complete
8692
+ * and does not affect `acceptedAttemptN`. See
8693
+ * `docs/agent-runtime.md` for the full producer/judge flow.
8694
+ *
8695
+ * **Binding evaluation** (judgment tasks: `assess_brief`, `judge_pack`).
8696
+ * A separate task whose job IS the application of `successCriteria` to
8697
+ * someone else's output. Different agent (enforced at claim time), same
8698
+ * envelope. The judge's verdict is binding: this is the *gate* in the
8699
+ * MoltNet model. The rubric inside `successCriteria.rubric` IS the job
8700
+ * spec for the judge.
8701
+ *
8702
+ * The clean chain: producer task with `successCriteria` → producer
8703
+ * self-assesses honestly → imposer (or automation) creates a downstream
8704
+ * judgment task that references the same `successCriteria` (or a
8705
+ * stricter rubric) → judgment task delivers the binding verdict.
8706
+ *
8707
+ * Storage: SuccessCriteria lives inline at `task.input.successCriteria`,
8708
+ * pinned via the task's `inputCid`. No separate column or hash. When
8709
+ * #881 lands, the `rubric` field can graduate to `{ rubricCid }` lookup
8710
+ * without changing this envelope, and producer + judge tasks can pin
8711
+ * the SAME rubric across the chain for end-to-end auditability.
8712
+ */
8713
+ var SchemaCheckSpec = Type$1.Object({ schemaCid: Type$1.String({ minLength: 1 }) }, { additionalProperties: false });
8714
+ var CidEqualsSpec = Type$1.Object({
8715
+ path: Type$1.String({ minLength: 1 }),
8716
+ expected: Type$1.String({ minLength: 1 })
8717
+ }, { additionalProperties: false });
8718
+ var Gate = Type$1.Union([Type$1.Object({
8719
+ id: Type$1.String({ minLength: 1 }),
8720
+ kind: Type$1.Literal("schema-check"),
8721
+ spec: SchemaCheckSpec,
8722
+ required: Type$1.Boolean()
8723
+ }, { additionalProperties: false }), Type$1.Object({
8724
+ id: Type$1.String({ minLength: 1 }),
8725
+ kind: Type$1.Literal("cid-equals"),
8726
+ spec: CidEqualsSpec,
8727
+ required: Type$1.Boolean()
8728
+ }, { additionalProperties: false })], { $id: "Gate" });
8729
+ var AssertionOp = Type$1.Union([
8730
+ Type$1.Literal("exists"),
8731
+ Type$1.Literal("equals"),
8732
+ Type$1.Literal("matches"),
8733
+ Type$1.Literal("in-range"),
8734
+ Type$1.Literal("min-length")
8735
+ ], { $id: "AssertionOp" });
8736
+ var Assertion = Type$1.Object({
8737
+ id: Type$1.String({ minLength: 1 }),
8738
+ path: Type$1.String({ minLength: 1 }),
8739
+ op: AssertionOp,
8740
+ value: Type$1.Optional(Type$1.Unknown())
8741
+ }, {
8742
+ $id: "Assertion",
8743
+ additionalProperties: false
8744
+ });
8745
+ var SideEffectsSpec = Type$1.Object({
8746
+ diaryEntryRequired: Type$1.Optional(Type$1.Boolean()),
8747
+ diaryEntryTags: Type$1.Optional(Type$1.Array(Type$1.String({ minLength: 1 }))),
8748
+ referencedEntries: Type$1.Optional(Type$1.Integer({ minimum: 0 }))
8749
+ }, {
8750
+ $id: "SideEffectsSpec",
8751
+ additionalProperties: false
8752
+ });
8753
+ var SuccessCriteria = Type$1.Object({
8754
+ version: Type$1.Literal(1),
8755
+ gates: Type$1.Optional(Type$1.Array(Gate)),
8756
+ assertions: Type$1.Optional(Type$1.Array(Assertion)),
8757
+ rubric: Type$1.Optional(Rubric),
8758
+ minComposite: Type$1.Optional(Type$1.Number({
8759
+ minimum: 0,
8760
+ maximum: 1
8761
+ })),
8762
+ sideEffects: Type$1.Optional(SideEffectsSpec)
8763
+ }, {
8764
+ $id: "SuccessCriteria",
8765
+ additionalProperties: false
8766
+ });
8767
+ var VerificationResultStatus = Type$1.Union([
8768
+ Type$1.Literal("pass"),
8769
+ Type$1.Literal("fail"),
8770
+ Type$1.Literal("skip")
8771
+ ], { $id: "VerificationResultStatus" });
8772
+ var VerificationResultKind = Type$1.Union([
8773
+ Type$1.Literal("gate"),
8774
+ Type$1.Literal("assertion"),
8775
+ Type$1.Literal("rubric"),
8776
+ Type$1.Literal("sideEffect")
8777
+ ], { $id: "VerificationResultKind" });
8778
+ var VerificationResult = Type$1.Object({
8779
+ id: Type$1.String({ minLength: 1 }),
8780
+ kind: VerificationResultKind,
8781
+ status: VerificationResultStatus,
8782
+ detail: Type$1.Optional(Type$1.String())
8783
+ }, {
8784
+ $id: "VerificationResult",
8785
+ additionalProperties: false
8786
+ });
8787
+ var VerificationRecord = Type$1.Object({
8788
+ inputCid: Type$1.String({ minLength: 1 }),
8789
+ results: Type$1.Array(VerificationResult),
8790
+ passed: Type$1.Boolean()
8791
+ }, {
8792
+ $id: "VerificationRecord",
8793
+ additionalProperties: false
8794
+ });
8795
+ //#endregion
8633
8796
  //#region ../tasks/src/task-types/assess-brief.ts
8634
8797
  /**
8635
8798
  * `assess_brief` — independently evaluate a fulfilled brief.
8636
8799
  *
8637
8800
  * output_kind: judgment
8638
- * criteria: required (rubric lives as a diary entry with tag='rubric';
8639
- * the Task's `criteria_cid` points at that entry)
8801
+ * criteria: required (`successCriteria.rubric` — same envelope as
8802
+ * `judge_pack`)
8640
8803
  * references: required (must reference the target `fulfill_brief` task)
8641
8804
  *
8642
8805
  * The assessor is a different agent from the producer (enforced by the
8643
8806
  * server / runtime at claim time — not in the wire schema).
8807
+ *
8808
+ * The rubric in `successCriteria` IS the job spec — the assessor applies
8809
+ * it to the target task's output and emits per-criterion scores. Other
8810
+ * sections (`assertions`, `gates`, `sideEffects`) MAY be present and are
8811
+ * evaluated against the *assessor's output*.
8644
8812
  */
8645
8813
  var ASSESS_BRIEF_TYPE = "assess_brief";
8646
- /**
8647
- * One criterion lifted from the rubric. Denormalized into the input so the
8648
- * assessor prompt can be built without a second fetch; the `criteria_cid`
8649
- * on the Task row remains authoritative for verification.
8650
- */
8651
- var AssessBriefCriterion = Type$1.Object({
8652
- id: Type$1.String({ minLength: 1 }),
8653
- description: Type$1.String({ minLength: 1 }),
8654
- weight: Type$1.Number({
8655
- minimum: 0,
8656
- maximum: 1
8657
- }),
8658
- scoring: Type$1.Union([
8659
- Type$1.Literal("llm_judged"),
8660
- Type$1.Literal("boolean"),
8661
- Type$1.Literal("deterministic_signature_check")
8662
- ])
8663
- }, {
8664
- $id: "AssessBriefCriterion",
8665
- additionalProperties: false
8666
- });
8667
8814
  var AssessBriefInput = Type$1.Object({
8668
8815
  targetTaskId: Type$1.String({ format: "uuid" }),
8669
- criteria: Type$1.Array(AssessBriefCriterion, { minItems: 1 }),
8670
- rubricPreamble: Type$1.Optional(Type$1.String())
8816
+ successCriteria: SuccessCriteria
8671
8817
  }, {
8672
8818
  $id: "AssessBriefInput",
8673
8819
  additionalProperties: false
@@ -8736,7 +8882,8 @@ var CuratePackInput = Type$1.Object({
8736
8882
  prefix: Type$1.Optional(Type$1.String())
8737
8883
  }, { additionalProperties: false })),
8738
8884
  tokenBudget: Type$1.Optional(Type$1.Number({ minimum: 500 })),
8739
- recipe: Type$1.Optional(Type$1.Union([Type$1.Literal("topic-focused-v1"), Type$1.Literal("scope-inventory-v1")]))
8885
+ recipe: Type$1.Optional(Type$1.Union([Type$1.Literal("topic-focused-v1"), Type$1.Literal("scope-inventory-v1")])),
8886
+ successCriteria: Type$1.Optional(SuccessCriteria)
8740
8887
  }, {
8741
8888
  $id: "CuratePackInput",
8742
8889
  additionalProperties: false
@@ -8761,7 +8908,8 @@ var CuratePackOutput = Type$1.Object({
8761
8908
  droppedIds: Type$1.Optional(Type$1.Array(Type$1.String({ format: "uuid" }))),
8762
8909
  notes: Type$1.String({ minLength: 1 })
8763
8910
  }, { additionalProperties: false }))),
8764
- summary: Type$1.String({ minLength: 1 })
8911
+ summary: Type$1.String({ minLength: 1 }),
8912
+ verification: Type$1.Optional(VerificationRecord)
8765
8913
  }, {
8766
8914
  $id: "CuratePackOutput",
8767
8915
  additionalProperties: false
@@ -8780,6 +8928,7 @@ var FulfillBriefInput = Type$1.Object({
8780
8928
  brief: Type$1.String({ minLength: 1 }),
8781
8929
  title: Type$1.Optional(Type$1.String()),
8782
8930
  acceptanceCriteria: Type$1.Optional(Type$1.Array(Type$1.String())),
8931
+ successCriteria: Type$1.Optional(SuccessCriteria),
8783
8932
  seedFiles: Type$1.Optional(Type$1.Array(Type$1.String())),
8784
8933
  scopeHint: Type$1.Optional(Type$1.String())
8785
8934
  }, {
@@ -8799,7 +8948,8 @@ var FulfillBriefOutput = Type$1.Object({
8799
8948
  }, { additionalProperties: false })),
8800
8949
  pullRequestUrl: Type$1.Union([Type$1.String(), Type$1.Null()]),
8801
8950
  diaryEntryIds: Type$1.Array(Type$1.String({ format: "uuid" })),
8802
- summary: Type$1.String({ minLength: 1 })
8951
+ summary: Type$1.String({ minLength: 1 }),
8952
+ verification: Type$1.Optional(VerificationRecord)
8803
8953
  }, {
8804
8954
  $id: "FulfillBriefOutput",
8805
8955
  additionalProperties: false
@@ -8810,19 +8960,18 @@ var FulfillBriefOutput = Type$1.Object({
8810
8960
  * `judge_pack` — independently score a rendered pack against a rubric.
8811
8961
  *
8812
8962
  * output_kind: judgment
8813
- * criteria: required (embedded `rubric` — see Phase 1 design in #852
8814
- * amendment and Phase 2 issue #881)
8963
+ * criteria: required (`successCriteria.rubric` — see #852 amendment and
8964
+ * Phase 2 issue #881)
8815
8965
  * references: required (must reference the `render_pack` task it judges,
8816
8966
  * role='judged_work')
8817
8967
  *
8818
8968
  * Step 3 of the three-session attribution loop (#875). Mirrors
8819
8969
  * `assess_brief` in shape, but over a rendered context pack.
8820
8970
  *
8821
- * Phase 1 rubric storage: the rubric body is inlined in `input.rubric`.
8822
- * Integrity is pinned via the task's `input_cid`. Phase 2 (#881) will
8823
- * replace the inline body with a `rubric_cid` referencing a `rubrics`
8824
- * table row; the denormalized `criteria[]` projection stays for prompt
8825
- * building without a fetch.
8971
+ * Phase 1 rubric storage: the rubric body lives at
8972
+ * `input.successCriteria.rubric` and is pinned via the task's `inputCid`.
8973
+ * Phase 2 (#881) will replace the inline body with a `rubricCid`
8974
+ * referencing a stored `rubrics` row; the envelope stays the same.
8826
8975
  *
8827
8976
  * The judge MUST be a different agent from the renderer. Enforced at
8828
8977
  * claim time by the runtime, not in the wire schema.
@@ -8831,7 +8980,7 @@ var JUDGE_PACK_TYPE = "judge_pack";
8831
8980
  var JudgePackInput = Type$1.Object({
8832
8981
  renderedPackId: Type$1.String({ format: "uuid" }),
8833
8982
  sourcePackId: Type$1.String({ format: "uuid" }),
8834
- rubric: Rubric
8983
+ successCriteria: SuccessCriteria
8835
8984
  }, {
8836
8985
  $id: "JudgePackInput",
8837
8986
  additionalProperties: false
@@ -8844,6 +8993,7 @@ var JudgePackScore = Type$1.Object({
8844
8993
  maximum: 1
8845
8994
  }),
8846
8995
  rationale: Type$1.Optional(Type$1.String()),
8996
+ assertions: Type$1.Optional(Type$1.Array(AssertionResult, { minItems: 1 })),
8847
8997
  evidence: Type$1.Optional(Type$1.Record(Type$1.String(), Type$1.Unknown()))
8848
8998
  }, {
8849
8999
  $id: "JudgePackScore",
@@ -8862,6 +9012,39 @@ var JudgePackOutput = Type$1.Object({
8862
9012
  $id: "JudgePackOutput",
8863
9013
  additionalProperties: false
8864
9014
  });
9015
+ /**
9016
+ * Cross-field validator for JudgePackOutput. Run after the TypeBox
9017
+ * schema check passes. Enforces invariants the schema can't express:
9018
+ *
9019
+ * 1. If a `JudgePackScore` carries an `assertions` array (i.e. the
9020
+ * judge ran the criterion in `llm_checklist` mode), its numeric
9021
+ * `score` MUST equal `1` if every `assertions[i].passed` is true,
9022
+ * else `0`. The prompt instructs the judge to derive `score` from
9023
+ * the array, but the LLM can drift — without this check, the
9024
+ * runtime accepts inconsistent payloads and propagates them into
9025
+ * composite scores and judge attestations (#999 P1).
9026
+ *
9027
+ * 2. If `score` is exactly `1` AND `assertions` is present, every
9028
+ * assertion must have `passed: true`. Catches the failure mode in
9029
+ * the issue: "score: 1 with a failing assertion accepted."
9030
+ *
9031
+ * Cross-rubric checks (e.g. "did the judge populate `assertions` for
9032
+ * every criterion the rubric marked `llm_checklist`?") require the
9033
+ * input rubric and live in a separate, runtime-side validator. This
9034
+ * one is rubric-agnostic on purpose — it catches within-score
9035
+ * inconsistency without needing the original task input.
9036
+ */
9037
+ function validateJudgePackOutput(output) {
9038
+ const scores = output.scores;
9039
+ for (let i = 0; i < scores.length; i++) {
9040
+ const s = scores[i];
9041
+ if (!s.assertions) continue;
9042
+ const allPassed = s.assertions.every((a) => a.passed);
9043
+ const expected = allPassed ? 1 : 0;
9044
+ if (s.score !== expected) return `scores[${i}] (criterionId="${s.criterionId}"): assertions ${allPassed ? "all pass" : "have at least one fail"} but score=${s.score}. Score must be derived: 1 iff every assertion passes, else 0 (#999 llm_checklist rule).`;
9045
+ }
9046
+ return null;
9047
+ }
8865
9048
  //#endregion
8866
9049
  //#region ../tasks/src/task-types/render-pack.ts
8867
9050
  /**
@@ -8883,7 +9066,8 @@ var RENDER_PACK_TYPE = "render_pack";
8883
9066
  var RenderPackInput = Type$1.Object({
8884
9067
  packId: Type$1.String({ format: "uuid" }),
8885
9068
  persist: Type$1.Optional(Type$1.Boolean()),
8886
- pinned: Type$1.Optional(Type$1.Boolean())
9069
+ pinned: Type$1.Optional(Type$1.Boolean()),
9070
+ successCriteria: Type$1.Optional(SuccessCriteria)
8887
9071
  }, {
8888
9072
  $id: "RenderPackInput",
8889
9073
  additionalProperties: false
@@ -8894,7 +9078,8 @@ var RenderPackOutput = Type$1.Object({
8894
9078
  renderMethod: Type$1.String({ minLength: 1 }),
8895
9079
  byteSize: Type$1.Number({ minimum: 0 }),
8896
9080
  entriesRendered: Type$1.Number({ minimum: 0 }),
8897
- summary: Type$1.String({ minLength: 1 })
9081
+ summary: Type$1.String({ minLength: 1 }),
9082
+ verification: Type$1.Optional(VerificationRecord)
8898
9083
  }, {
8899
9084
  $id: "RenderPackOutput",
8900
9085
  additionalProperties: false
@@ -8902,6 +9087,33 @@ var RenderPackOutput = Type$1.Object({
8902
9087
  //#endregion
8903
9088
  //#region ../tasks/src/task-types/index.ts
8904
9089
  /**
9090
+ * Validate that a judgment-task input carries a rubric inside its
9091
+ * `successCriteria` envelope, and that the rubric's weights sum to 1.
9092
+ * Used for `assess_brief` and `judge_pack`.
9093
+ */
9094
+ function validateJudgmentInput(input) {
9095
+ const sc = input.successCriteria;
9096
+ if (!sc) return "successCriteria is required for judgment tasks";
9097
+ if (!sc.rubric) return "successCriteria.rubric is required for judgment tasks";
9098
+ return validateRubricWeights(sc.rubric);
9099
+ }
9100
+ /**
9101
+ * Cross-field rule: when `input.successCriteria` is set, the producer's
9102
+ * output MUST carry a `verification` block (the LLM's self-assessment).
9103
+ * When it is unset, the output MUST NOT carry one (avoid garbage data).
9104
+ *
9105
+ * Used by all three fulfillment task types. Judgment task outputs do
9106
+ * NOT use this — their entire output IS a structured judgment, so a
9107
+ * separate self-assessment field would be circular.
9108
+ */
9109
+ function requireVerificationWhenCriteriaPresent(output, input) {
9110
+ const hasCriteria = input !== void 0 && input !== null && input.successCriteria !== void 0;
9111
+ const hasVerification = output.verification !== void 0;
9112
+ if (hasCriteria && !hasVerification) return "output.verification is required because input.successCriteria is set; the producer LLM must self-assess against the criteria";
9113
+ if (!hasCriteria && hasVerification) return "output.verification was supplied but input.successCriteria is unset; omit verification when there are no criteria to assess against";
9114
+ return null;
9115
+ }
9116
+ /**
8905
9117
  * Client-side task-type registry. Mirrors the server-owned DB registry
8906
9118
  * (PR 2). PR 0 shipped the two brief types; this PR adds the three
8907
9119
  * pack-pipeline types for the three-session attribution loop (#875).
@@ -8916,41 +9128,41 @@ var BUILT_IN_TASK_TYPES = {
8916
9128
  inputSchema: FulfillBriefInput,
8917
9129
  outputSchema: FulfillBriefOutput,
8918
9130
  outputKind: "artifact",
8919
- requiresCriteria: false,
8920
- requiresReferences: false
9131
+ requiresReferences: false,
9132
+ validateOutput: requireVerificationWhenCriteriaPresent
8921
9133
  },
8922
9134
  [ASSESS_BRIEF_TYPE]: {
8923
9135
  name: ASSESS_BRIEF_TYPE,
8924
9136
  inputSchema: AssessBriefInput,
8925
9137
  outputSchema: AssessBriefOutput,
8926
9138
  outputKind: "judgment",
8927
- requiresCriteria: true,
8928
- requiresReferences: true
9139
+ requiresReferences: true,
9140
+ validateInput: validateJudgmentInput
8929
9141
  },
8930
9142
  [CURATE_PACK_TYPE]: {
8931
9143
  name: CURATE_PACK_TYPE,
8932
9144
  inputSchema: CuratePackInput,
8933
9145
  outputSchema: CuratePackOutput,
8934
9146
  outputKind: "artifact",
8935
- requiresCriteria: false,
8936
- requiresReferences: false
9147
+ requiresReferences: false,
9148
+ validateOutput: requireVerificationWhenCriteriaPresent
8937
9149
  },
8938
9150
  [RENDER_PACK_TYPE]: {
8939
9151
  name: RENDER_PACK_TYPE,
8940
9152
  inputSchema: RenderPackInput,
8941
9153
  outputSchema: RenderPackOutput,
8942
9154
  outputKind: "artifact",
8943
- requiresCriteria: false,
8944
- requiresReferences: false
9155
+ requiresReferences: false,
9156
+ validateOutput: requireVerificationWhenCriteriaPresent
8945
9157
  },
8946
9158
  [JUDGE_PACK_TYPE]: {
8947
9159
  name: JUDGE_PACK_TYPE,
8948
9160
  inputSchema: JudgePackInput,
8949
9161
  outputSchema: JudgePackOutput,
8950
9162
  outputKind: "judgment",
8951
- requiresCriteria: false,
8952
9163
  requiresReferences: true,
8953
- validateInput: (input) => validateRubricWeights(input.rubric)
9164
+ validateInput: validateJudgmentInput,
9165
+ validateOutput: validateJudgePackOutput
8954
9166
  }
8955
9167
  };
8956
9168
  //#endregion
@@ -8980,13 +9192,22 @@ function schemaErrors(prefix, schema, value) {
8980
9192
  message: error.message
8981
9193
  }));
8982
9194
  }
8983
- function validateTaskOutput(taskType, output) {
9195
+ function validateTaskOutput(taskType, output, input) {
8984
9196
  const entry = getTaskTypeEntry(taskType);
8985
9197
  if (!entry) return [{
8986
9198
  field: "taskType",
8987
9199
  message: `Unknown task type: ${taskType}`
8988
9200
  }];
8989
- return schemaErrors("output", entry.outputSchema, output);
9201
+ const errors = schemaErrors("output", entry.outputSchema, output);
9202
+ if (errors.length > 0) return errors;
9203
+ if (entry.validateOutput) {
9204
+ const validationError = entry.validateOutput(output, input);
9205
+ if (validationError) return [{
9206
+ field: "output",
9207
+ message: validationError
9208
+ }];
9209
+ }
9210
+ return [];
8990
9211
  }
8991
9212
  /**
8992
9213
  * Resolve the TypeBox output schema registered for `taskType`. Returns
@@ -9126,7 +9347,6 @@ Type$1.Object({
9126
9347
  input: Type$1.Record(Type$1.String(), Type$1.Unknown()),
9127
9348
  inputSchemaCid: Cid,
9128
9349
  inputCid: Cid,
9129
- criteriaCid: Type$1.Union([Cid, Type$1.Null()]),
9130
9350
  references: Type$1.Array(TaskRef),
9131
9351
  correlationId: Type$1.Union([Uuid, Type$1.Null()]),
9132
9352
  imposedByAgentId: Type$1.Union([Uuid, Type$1.Null()]),
@@ -9340,11 +9560,12 @@ function buildFinalOutputBlock(opts) {
9340
9560
  * anything) work without any code path here.
9341
9561
  */
9342
9562
  function buildAssessBriefPrompt(input, ctx) {
9343
- const criteriaList = input.criteria.map((c, i) => `${i + 1}. **${c.id}** (weight ${c.weight}, scoring: \`${c.scoring}\`) — ${c.description}`).join("\n");
9344
- const preambleSection = input.rubricPreamble ? [
9563
+ const rubric = input.successCriteria.rubric;
9564
+ const criteriaList = rubric.criteria.map((c, i) => `${i + 1}. **${c.id}** (weight ${c.weight}, scoring: \`${c.scoring}\`) — ${c.description}`).join("\n");
9565
+ const preambleSection = rubric.preamble ? [
9345
9566
  "### Rubric preamble",
9346
9567
  "",
9347
- input.rubricPreamble,
9568
+ rubric.preamble,
9348
9569
  ""
9349
9570
  ].join("\n") : "";
9350
9571
  return [
@@ -9394,7 +9615,7 @@ function buildAssessBriefPrompt(input, ctx) {
9394
9615
  "",
9395
9616
  "### Scoring rules",
9396
9617
  "",
9397
- "- `llm_judged`: score 0..1 continuous. `rationale` REQUIRED (2–4 sentences).",
9618
+ "- `llm_score`: score 0..1 continuous. `rationale` REQUIRED (2–4 sentences).",
9398
9619
  "- `boolean`: score exactly 0 or 1. `rationale` optional.",
9399
9620
  "- `deterministic_signature_check`: run `moltnet entry verify` on every diary entry returned by step 3 above AND `git verify-commit` on every commit. Score 1 iff ALL signatures are valid; otherwise 0. Populate `evidence.commitsVerified`, `evidence.commitsTotal`, `evidence.signatureFailures`.",
9400
9621
  "",
@@ -9418,6 +9639,39 @@ function buildAssessBriefPrompt(input, ctx) {
9418
9639
  ].filter(Boolean).join("\n");
9419
9640
  }
9420
9641
  //#endregion
9642
+ //#region ../agent-runtime/src/prompts/self-verification.ts
9643
+ function buildSelfVerificationBlock(taskId) {
9644
+ return [
9645
+ "## Self-verification",
9646
+ "",
9647
+ `Call \`moltnet_get_task\` with task id \`${taskId}\` and read \`input.successCriteria\`.`,
9648
+ "",
9649
+ "- If `input.successCriteria` is **absent**, omit `verification` from your",
9650
+ " final output entirely.",
9651
+ "- If `input.successCriteria` is **present**, you MUST include a",
9652
+ " `verification` block in your final output. Evaluate every applicable",
9653
+ " item — `gates`, `assertions`, `rubric` criteria, `sideEffects` — against",
9654
+ " your produced work and emit one result per id. Be honest: a `fail` with",
9655
+ " a one-line reason is more useful than a false `pass`. Use `skip` (with a",
9656
+ " `detail`) when you genuinely could not determine a result. Compute",
9657
+ " `passed = results.every(r => r.status !== 'fail')`.",
9658
+ "",
9659
+ "Verification shape:",
9660
+ "",
9661
+ "```json",
9662
+ "{",
9663
+ " \"inputCid\": \"<the inputCid you saw on the task>\",",
9664
+ " \"results\": [",
9665
+ " { \"id\": \"<criterion id>\", \"kind\": \"assertion|gate|rubric|sideEffect\",",
9666
+ " \"status\": \"pass|fail|skip\", \"detail\": \"<optional one-liner>\" }",
9667
+ " ],",
9668
+ " \"passed\": <boolean>",
9669
+ "}",
9670
+ "```",
9671
+ ""
9672
+ ].join("\n");
9673
+ }
9674
+ //#endregion
9421
9675
  //#region ../agent-runtime/src/prompts/curate-pack.ts
9422
9676
  /**
9423
9677
  * Build the system prompt for a `curate_pack` task.
@@ -9552,6 +9806,7 @@ function buildCuratePackPrompt(input, ctx) {
9552
9806
  " output, not in the diary.",
9553
9807
  "- Respect hard include/exclude filters literally.",
9554
9808
  "",
9809
+ buildSelfVerificationBlock(ctx.taskId),
9555
9810
  buildFinalOutputBlock({
9556
9811
  taskType: "curate_pack",
9557
9812
  outputSchemaName: "CuratePackOutput",
@@ -9566,7 +9821,8 @@ function buildCuratePackPrompt(input, ctx) {
9566
9821
  " \"checkpoints\": [",
9567
9822
  " { \"phase\": \"recon\", \"candidateIds\": [...], \"droppedIds\": [...], \"notes\": \"...\" }",
9568
9823
  " ],",
9569
- " \"summary\": \"<2-4 sentences: what you looked for, how you narrowed, what defines the final set>\"",
9824
+ " \"summary\": \"<2-4 sentences: what you looked for, how you narrowed, what defines the final set>\",",
9825
+ " \"verification\": <required iff input.successCriteria; see Self-verification>",
9570
9826
  "}"
9571
9827
  ].join("\n")
9572
9828
  })
@@ -9596,7 +9852,21 @@ function buildFulfillBriefPrompt(input, ctx) {
9596
9852
  ...seedFiles.map((f) => `- \`${f}\``),
9597
9853
  ""
9598
9854
  ].join("\n") : "";
9599
- const branchSlug = scopeHint ? `feat/${scopeHint}-` : "feat/";
9855
+ const branchSlug = ctx.correlationId ? `moltnet/${ctx.correlationId}/` : scopeHint ? `feat/${scopeHint}-` : "feat/";
9856
+ const correlationSection = ctx.correlationId ? [
9857
+ "### Correlation",
9858
+ "",
9859
+ `This task carries correlationId \`${ctx.correlationId}\`. You MUST:`,
9860
+ "",
9861
+ `1. Name your branch \`moltnet/${ctx.correlationId}/<short-slug>\` — use a`,
9862
+ " slug derived from the brief title (lowercase-kebab, ≤60 chars).",
9863
+ `2. Include the trailer \`Moltnet-Correlation-Id: ${ctx.correlationId}\` on`,
9864
+ " your **first** commit on that branch (subsequent commits do not need it).",
9865
+ "",
9866
+ "These are recovery anchors for the MoltNet mention-bot. Do not deviate",
9867
+ "from this branch naming scheme when correlationId is set.",
9868
+ ""
9869
+ ].join("\n") : "";
9600
9870
  return [
9601
9871
  "# Fulfill Brief Agent",
9602
9872
  "",
@@ -9616,6 +9886,7 @@ function buildFulfillBriefPrompt(input, ctx) {
9616
9886
  "",
9617
9887
  criteriaSection,
9618
9888
  seedSection,
9889
+ correlationSection,
9619
9890
  "### Workflow",
9620
9891
  "",
9621
9892
  `1. Create a feature branch (starting prefix suggestion: \`${branchSlug}<short-slug>\`).`,
@@ -9627,6 +9898,7 @@ function buildFulfillBriefPrompt(input, ctx) {
9627
9898
  " `MoltNet-Diary: <id>` (per the runtime instructor).",
9628
9899
  "6. Push the branch and open a PR.",
9629
9900
  "",
9901
+ buildSelfVerificationBlock(ctx.taskId),
9630
9902
  buildFinalOutputBlock({
9631
9903
  taskType: "fulfill_brief",
9632
9904
  outputSchemaName: "FulfillBriefOutput",
@@ -9636,7 +9908,8 @@ function buildFulfillBriefPrompt(input, ctx) {
9636
9908
  " \"commits\": [{ \"sha\": \"...\", \"message\": \"...\", \"diaryEntryId\": \"...\" }],",
9637
9909
  " \"pullRequestUrl\": \"<url-or-null>\",",
9638
9910
  " \"diaryEntryIds\": [\"...\"],",
9639
- " \"summary\": \"<1-3 sentence recap>\"",
9911
+ " \"summary\": \"<1-3 sentence recap>\",",
9912
+ " \"verification\": <required iff input.successCriteria; see Self-verification>",
9640
9913
  "}"
9641
9914
  ].join("\n")
9642
9915
  })
@@ -9645,7 +9918,8 @@ function buildFulfillBriefPrompt(input, ctx) {
9645
9918
  //#endregion
9646
9919
  //#region ../agent-runtime/src/prompts/judge-pack.ts
9647
9920
  function buildJudgePackPrompt(input, ctx) {
9648
- const { renderedPackId, sourcePackId, rubric } = input;
9921
+ const { renderedPackId, sourcePackId, successCriteria } = input;
9922
+ const rubric = successCriteria.rubric;
9649
9923
  const criteriaList = rubric.criteria.map((c, i) => `${i + 1}. **${c.id}** (weight ${c.weight}, scoring: \`${c.scoring}\`) — ${c.description}`).join("\n");
9650
9924
  const preambleSection = rubric.preamble ? [
9651
9925
  "### Rubric preamble",
@@ -9675,7 +9949,7 @@ function buildJudgePackPrompt(input, ctx) {
9675
9949
  "",
9676
9950
  "1. Call `moltnet_rendered_pack_get` for the rendered pack. Keep the",
9677
9951
  " `content` string — you will score it.",
9678
- "2. Call `moltnet_pack_get` with `expand: \"entries\"` for the source",
9952
+ "2. Call `moltnet_pack_get` with `expandEntries: true` for the source",
9679
9953
  " pack. Keep the source entries for grounding / coverage checks.",
9680
9954
  "3. For each criterion, score according to its `scoring` mode (see",
9681
9955
  " Scoring rules below). Produce rationales where required.",
@@ -9688,9 +9962,23 @@ function buildJudgePackPrompt(input, ctx) {
9688
9962
  "",
9689
9963
  "### Scoring rules",
9690
9964
  "",
9691
- "- `llm_judged`: score 0..1 continuous. `rationale` REQUIRED (2–4",
9965
+ "- `llm_score`: score 0..1 continuous. `rationale` REQUIRED (2–4",
9692
9966
  " sentences pointing at specific evidence in the rendered content or",
9693
- " the source entries).",
9967
+ " the source entries). NOTE: this mode smooths individual failures",
9968
+ " into the gradient. Prefer `llm_checklist` for grounding,",
9969
+ " faithfulness, or any property where one failure is a real failure.",
9970
+ "- `llm_checklist`: enumerate per-claim binary assertions instead of",
9971
+ " picking a continuous score. For each assertion, return",
9972
+ " `{ id, text, passed: bool, evidence: string }`. `evidence` is",
9973
+ " REQUIRED for both PASS and FAIL — for PASS, quote the supporting",
9974
+ " span (rendered or source) or cite the source entry id; for FAIL,",
9975
+ " quote the offending claim verbatim and explain why it fails.",
9976
+ " Don't give the benefit of the doubt: if a claim looks supported but",
9977
+ " you cannot point at the supporting source span, mark it FAIL with",
9978
+ " evidence = \"no supporting span found\". Set the criterion `score`",
9979
+ " to `1` iff every assertion passes, else `0` — the runtime checks",
9980
+ " this matches the assertions array. Populate `assertions` on the",
9981
+ " score object; leave `evidence` (the structured record) empty.",
9694
9982
  "- `boolean`: score exactly 0 or 1. `rationale` optional.",
9695
9983
  "- `deterministic_signature_check`: batch-fetch ALL referenced source",
9696
9984
  " entries in a single call — `moltnet_list_entries` with `entryIds` set",
@@ -9730,7 +10018,14 @@ function buildJudgePackPrompt(input, ctx) {
9730
10018
  shapeSketch: [
9731
10019
  "{",
9732
10020
  " \"scores\": [",
9733
- " { \"criterionId\": \"...\", \"score\": 0.0, \"rationale\": \"...\", \"evidence\": {} }",
10021
+ " { \"criterionId\": \"...\", \"score\": 0.0, \"rationale\": \"...\", \"evidence\": {} },",
10022
+ " {",
10023
+ " \"criterionId\": \"<llm_checklist criterion>\",",
10024
+ " \"score\": 0, // 1 iff every assertion passed",
10025
+ " \"assertions\": [",
10026
+ " { \"id\": \"claim-1\", \"text\": \"...\", \"passed\": false, \"evidence\": \"...\" }",
10027
+ " ]",
10028
+ " }",
9734
10029
  " ],",
9735
10030
  " \"composite\": <sum-of-weighted-scores>,",
9736
10031
  " \"verdict\": \"<1-3 sentence overall>\",",
@@ -9772,7 +10067,7 @@ function buildRenderPackPrompt(input, ctx) {
9772
10067
  "",
9773
10068
  "## Workflow",
9774
10069
  "",
9775
- "1. Call `moltnet_pack_get` with `expand: \"entries\"` to inspect the",
10070
+ "1. Call `moltnet_pack_get` with `expandEntries: true` to inspect the",
9776
10071
  " source entries. Read it — you need the entry count for your output.",
9777
10072
  "2. Call `moltnet_pack_render` with:",
9778
10073
  ` - \`packId\`: \`${packId}\``,
@@ -9787,6 +10082,7 @@ function buildRenderPackPrompt(input, ctx) {
9787
10082
  "- Do NOT write diary entries unless a genuine incident occurs",
9788
10083
  " (rendering failure, invariant violation).",
9789
10084
  "",
10085
+ buildSelfVerificationBlock(ctx.taskId),
9790
10086
  buildFinalOutputBlock({
9791
10087
  taskType: "render_pack",
9792
10088
  outputSchemaName: "RenderPackOutput",
@@ -9797,7 +10093,8 @@ function buildRenderPackPrompt(input, ctx) {
9797
10093
  " \"renderMethod\": \"<label>\",",
9798
10094
  " \"byteSize\": <int>,",
9799
10095
  " \"entriesRendered\": <int>,",
9800
- " \"summary\": \"<1-3 sentence recap>\"",
10096
+ " \"summary\": \"<1-3 sentence recap>\",",
10097
+ " \"verification\": <required iff input.successCriteria; see Self-verification>",
9801
10098
  "}"
9802
10099
  ].join("\n")
9803
10100
  })
@@ -9818,7 +10115,8 @@ function buildPromptForTask(task, ctx) {
9818
10115
  }
9819
10116
  return buildFulfillBriefPrompt(task.input, {
9820
10117
  diaryId: ctx.diaryId,
9821
- taskId: ctx.taskId
10118
+ taskId: ctx.taskId,
10119
+ correlationId: task.correlationId
9822
10120
  });
9823
10121
  case ASSESS_BRIEF_TYPE:
9824
10122
  if (!Value.Check(AssessBriefInput, task.input)) {
@@ -13567,9 +13865,9 @@ function createSubmitOutputTool(taskType, opts = {}) {
13567
13865
  description: contract.description,
13568
13866
  parameters: schema,
13569
13867
  async execute(_id, params) {
13570
- const errors = [...Value.Errors(schema, params)];
13868
+ const errors = validateTaskOutput(taskType, params);
13571
13869
  if (errors.length > 0) {
13572
- const detailMsg = errors.slice(0, 3).map((err) => `${err.path || "<root>"}: ${err.message}`).join("; ");
13870
+ const detailMsg = errors.slice(0, 3).map((err) => `${err.field}: ${err.message}`).join("; ");
13573
13871
  const details = {
13574
13872
  captured: false,
13575
13873
  callCount,
@@ -13583,7 +13881,7 @@ function createSubmitOutputTool(taskType, opts = {}) {
13583
13881
  return {
13584
13882
  content: [{
13585
13883
  type: "text",
13586
- text: `Output failed schema validation: ${detailMsg}. Re-call this tool with a corrected output.`
13884
+ text: `Output failed validation: ${detailMsg}. Re-call this tool with a corrected output.`
13587
13885
  }],
13588
13886
  details,
13589
13887
  isError: true
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@themoltnet/pi-extension",
3
- "version": "0.11.0",
3
+ "version": "0.13.0",
4
4
  "type": "module",
5
5
  "description": "MoltNet pi extension — sandboxed tool execution in Gondolin VMs with MoltNet identity and persistent memory",
6
6
  "license": "MIT",
@@ -31,8 +31,8 @@
31
31
  "@earendil-works/gondolin": "^0.7.0",
32
32
  "@opentelemetry/api": "^1.9.0",
33
33
  "@sinclair/typebox": "^0.34.0",
34
- "@themoltnet/sdk": "0.97.0",
35
- "@themoltnet/agent-runtime": "0.8.0"
34
+ "@themoltnet/agent-runtime": "0.10.0",
35
+ "@themoltnet/sdk": "0.98.0"
36
36
  },
37
37
  "peerDependencies": {
38
38
  "@mariozechner/pi-coding-agent": ">=0.73.0",
@@ -61,10 +61,25 @@
61
61
  "engines": {
62
62
  "node": ">=22"
63
63
  },
64
+ "nx": {
65
+ "tags": [
66
+ "type:runtime",
67
+ "scope:agent",
68
+ "platform:extension"
69
+ ],
70
+ "targets": {
71
+ "test-ci": {
72
+ "executor": "nx:noop",
73
+ "dependsOn": [
74
+ "test"
75
+ ],
76
+ "metadata": {
77
+ "description": "Alias for `test` on projects without atomization."
78
+ }
79
+ }
80
+ }
81
+ },
64
82
  "scripts": {
65
- "lint": "eslint src/",
66
- "check:pack": "tsx ../../tools/src/check-pack.ts --package .",
67
- "build": "vite build",
68
- "test": "vitest run --passWithNoTests"
83
+ "check:pack": "tsx ../../tools/src/check-pack.ts --package ."
69
84
  }
70
85
  }