@themoltnet/pi-extension 0.11.0 → 0.12.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.d.ts CHANGED
@@ -246,7 +246,6 @@ declare const Task: TObject< {
246
246
  input: TRecord<TString, TUnknown>;
247
247
  inputSchemaCid: TString;
248
248
  inputCid: TString;
249
- criteriaCid: TUnion<[TString, TNull]>;
250
249
  references: TArray<TObject< {
251
250
  taskId: TUnion<[TString, TNull]>;
252
251
  outputCid: TString;
package/dist/index.js CHANGED
@@ -8558,7 +8558,13 @@ if (!FormatRegistry.Has("date-time")) FormatRegistry.Set("date-time", (v) => !Nu
8558
8558
  /**
8559
8559
  * How a judge must score a single criterion.
8560
8560
  *
8561
- * - `llm_judged`: 0..1 continuous, `rationale` required.
8561
+ * - `llm_score`: 0..1 continuous, `rationale` required. Smooths failures
8562
+ * into the gradient — use `llm_checklist` instead for properties where
8563
+ * a single failure is a real failure (grounding, faithfulness).
8564
+ * - `llm_checklist`: judge enumerates per-claim assertions with
8565
+ * `{passed, evidence}`. The criterion's numeric `score` is derived:
8566
+ * `1` iff every assertion passes, else `0`. Per-claim evidence is the
8567
+ * dataset for cluster-analysis of failure modes. See #999.
8562
8568
  * - `boolean`: 0 or 1, `rationale` optional.
8563
8569
  * - `deterministic_signature_check`: judge runs a signature check;
8564
8570
  * result is 0 or 1. No LLM discretion.
@@ -8566,11 +8572,31 @@ if (!FormatRegistry.Has("date-time")) FormatRegistry.Set("date-time", (v) => !Nu
8566
8572
  * appears in the rendered output; 0 or 1.
8567
8573
  */
8568
8574
  var RubricScoringMode = Type$1.Union([
8569
- Type$1.Literal("llm_judged"),
8575
+ Type$1.Literal("llm_score"),
8576
+ Type$1.Literal("llm_checklist"),
8570
8577
  Type$1.Literal("boolean"),
8571
8578
  Type$1.Literal("deterministic_signature_check"),
8572
8579
  Type$1.Literal("deterministic_coverage_check")
8573
8580
  ], { $id: "RubricScoringMode" });
8581
+ /**
8582
+ * One binary check produced by an `llm_checklist`-mode criterion.
8583
+ *
8584
+ * `evidence` is REQUIRED for both PASS and FAIL — agentskills.io grading
8585
+ * principle: \"Don't give the benefit of the doubt.\" A PASS without
8586
+ * concrete evidence (a quoted span, an entry id, a source location)
8587
+ * cannot be audited. A FAIL without evidence cannot be clustered into
8588
+ * structural fixes. The same shape is reused by `judge-eval-variant`
8589
+ * (#943) so tooling, dashboards, and analysis stay uniform.
8590
+ */
8591
+ var AssertionResult = Type$1.Object({
8592
+ id: Type$1.String({ minLength: 1 }),
8593
+ text: Type$1.String({ minLength: 1 }),
8594
+ passed: Type$1.Boolean(),
8595
+ evidence: Type$1.String({ minLength: 1 })
8596
+ }, {
8597
+ $id: "AssertionResult",
8598
+ additionalProperties: false
8599
+ });
8574
8600
  var RubricCriterion = Type$1.Object({
8575
8601
  id: Type$1.String({ minLength: 1 }),
8576
8602
  description: Type$1.String({ minLength: 1 }),
@@ -8630,44 +8656,165 @@ unrelated subsystems and the test coverage on the auth path is
8630
8656
  unchanged" is.
8631
8657
  `.trim();
8632
8658
  //#endregion
8659
+ //#region ../tasks/src/success-criteria.ts
8660
+ /**
8661
+ * SuccessCriteria — imposer-stated acceptance criteria, evaluated in two
8662
+ * complementary places.
8663
+ *
8664
+ * Before this envelope existed, criteria were scattered: a vestigial
8665
+ * `criteriaCid` column nobody resolved, an `acceptanceCriteria: string[]`
8666
+ * field on `fulfill_brief.input` that was "interpreted by the claiming
8667
+ * agent," and inline `rubric` / `criteria[]` fields on judgment-task
8668
+ * inputs. None of those were machine-verifiable end-to-end.
8669
+ *
8670
+ * This module defines a single, content-addressable envelope an imposer
8671
+ * attaches to any task type. It has four orthogonal sections — pick
8672
+ * whichever apply per task type:
8673
+ *
8674
+ * - `gates` Deterministic structural checks (CID/schema match)
8675
+ * - `assertions` Declarative claims about output JSON
8676
+ * - `rubric` Weighted-criteria scoring instrument, reused
8677
+ * verbatim from `./rubric.ts`.
8678
+ * - `sideEffects` Required process side-effects (e.g. diary entry)
8679
+ *
8680
+ * ## Two roles, two task types
8681
+ *
8682
+ * **Producer self-assessment** (fulfillment tasks: `fulfill_brief`,
8683
+ * `curate_pack`, `render_pack`). The producer **LLM** evaluates the
8684
+ * criteria against its own output and emits a `VerificationRecord`
8685
+ * inside `output.verification`. The daemon is pure passthrough — it
8686
+ * does not run `evaluateAssertions`, does not inspect the verification
8687
+ * record. The REST API is dumb storage; it never re-runs assertions and
8688
+ * never runs LLMs. The cross-field rule
8689
+ * `requireVerificationWhenCriteriaPresent` enforces "verification
8690
+ * required iff successCriteria present" at task-output validation time
8691
+ * (server-side schema check). Self-assessment is a truthful self-rating,
8692
+ * NOT enforcement — `verification.passed=false` does not block /complete
8693
+ * and does not affect `acceptedAttemptN`. See
8694
+ * `docs/agent-runtime.md` for the full producer/judge flow.
8695
+ *
8696
+ * **Binding evaluation** (judgment tasks: `assess_brief`, `judge_pack`).
8697
+ * A separate task whose job IS the application of `successCriteria` to
8698
+ * someone else's output. Different agent (enforced at claim time), same
8699
+ * envelope. The judge's verdict is binding: this is the *gate* in the
8700
+ * MoltNet model. The rubric inside `successCriteria.rubric` IS the job
8701
+ * spec for the judge.
8702
+ *
8703
+ * The clean chain: producer task with `successCriteria` → producer
8704
+ * self-assesses honestly → imposer (or automation) creates a downstream
8705
+ * judgment task that references the same `successCriteria` (or a
8706
+ * stricter rubric) → judgment task delivers the binding verdict.
8707
+ *
8708
+ * Storage: SuccessCriteria lives inline at `task.input.successCriteria`,
8709
+ * pinned via the task's `inputCid`. No separate column or hash. When
8710
+ * #881 lands, the `rubric` field can graduate to `{ rubricCid }` lookup
8711
+ * without changing this envelope, and producer + judge tasks can pin
8712
+ * the SAME rubric across the chain for end-to-end auditability.
8713
+ */
8714
+ var SchemaCheckSpec = Type$1.Object({ schemaCid: Type$1.String({ minLength: 1 }) }, { additionalProperties: false });
8715
+ var CidEqualsSpec = Type$1.Object({
8716
+ path: Type$1.String({ minLength: 1 }),
8717
+ expected: Type$1.String({ minLength: 1 })
8718
+ }, { additionalProperties: false });
8719
+ var Gate = Type$1.Union([Type$1.Object({
8720
+ id: Type$1.String({ minLength: 1 }),
8721
+ kind: Type$1.Literal("schema-check"),
8722
+ spec: SchemaCheckSpec,
8723
+ required: Type$1.Boolean()
8724
+ }, { additionalProperties: false }), Type$1.Object({
8725
+ id: Type$1.String({ minLength: 1 }),
8726
+ kind: Type$1.Literal("cid-equals"),
8727
+ spec: CidEqualsSpec,
8728
+ required: Type$1.Boolean()
8729
+ }, { additionalProperties: false })], { $id: "Gate" });
8730
+ var AssertionOp = Type$1.Union([
8731
+ Type$1.Literal("exists"),
8732
+ Type$1.Literal("equals"),
8733
+ Type$1.Literal("matches"),
8734
+ Type$1.Literal("in-range"),
8735
+ Type$1.Literal("min-length")
8736
+ ], { $id: "AssertionOp" });
8737
+ var Assertion = Type$1.Object({
8738
+ id: Type$1.String({ minLength: 1 }),
8739
+ path: Type$1.String({ minLength: 1 }),
8740
+ op: AssertionOp,
8741
+ value: Type$1.Optional(Type$1.Unknown())
8742
+ }, {
8743
+ $id: "Assertion",
8744
+ additionalProperties: false
8745
+ });
8746
+ var SideEffectsSpec = Type$1.Object({
8747
+ diaryEntryRequired: Type$1.Optional(Type$1.Boolean()),
8748
+ diaryEntryTags: Type$1.Optional(Type$1.Array(Type$1.String({ minLength: 1 }))),
8749
+ referencedEntries: Type$1.Optional(Type$1.Integer({ minimum: 0 }))
8750
+ }, {
8751
+ $id: "SideEffectsSpec",
8752
+ additionalProperties: false
8753
+ });
8754
+ var SuccessCriteria = Type$1.Object({
8755
+ version: Type$1.Literal(1),
8756
+ gates: Type$1.Optional(Type$1.Array(Gate)),
8757
+ assertions: Type$1.Optional(Type$1.Array(Assertion)),
8758
+ rubric: Type$1.Optional(Rubric),
8759
+ minComposite: Type$1.Optional(Type$1.Number({
8760
+ minimum: 0,
8761
+ maximum: 1
8762
+ })),
8763
+ sideEffects: Type$1.Optional(SideEffectsSpec)
8764
+ }, {
8765
+ $id: "SuccessCriteria",
8766
+ additionalProperties: false
8767
+ });
8768
+ var VerificationResultStatus = Type$1.Union([
8769
+ Type$1.Literal("pass"),
8770
+ Type$1.Literal("fail"),
8771
+ Type$1.Literal("skip")
8772
+ ], { $id: "VerificationResultStatus" });
8773
+ var VerificationResultKind = Type$1.Union([
8774
+ Type$1.Literal("gate"),
8775
+ Type$1.Literal("assertion"),
8776
+ Type$1.Literal("rubric"),
8777
+ Type$1.Literal("sideEffect")
8778
+ ], { $id: "VerificationResultKind" });
8779
+ var VerificationResult = Type$1.Object({
8780
+ id: Type$1.String({ minLength: 1 }),
8781
+ kind: VerificationResultKind,
8782
+ status: VerificationResultStatus,
8783
+ detail: Type$1.Optional(Type$1.String())
8784
+ }, {
8785
+ $id: "VerificationResult",
8786
+ additionalProperties: false
8787
+ });
8788
+ var VerificationRecord = Type$1.Object({
8789
+ inputCid: Type$1.String({ minLength: 1 }),
8790
+ results: Type$1.Array(VerificationResult),
8791
+ passed: Type$1.Boolean()
8792
+ }, {
8793
+ $id: "VerificationRecord",
8794
+ additionalProperties: false
8795
+ });
8796
+ //#endregion
8633
8797
  //#region ../tasks/src/task-types/assess-brief.ts
8634
8798
  /**
8635
8799
  * `assess_brief` — independently evaluate a fulfilled brief.
8636
8800
  *
8637
8801
  * output_kind: judgment
8638
- * criteria: required (rubric lives as a diary entry with tag='rubric';
8639
- * the Task's `criteria_cid` points at that entry)
8802
+ * criteria: required (`successCriteria.rubric` — same envelope as
8803
+ * `judge_pack`)
8640
8804
  * references: required (must reference the target `fulfill_brief` task)
8641
8805
  *
8642
8806
  * The assessor is a different agent from the producer (enforced by the
8643
8807
  * server / runtime at claim time — not in the wire schema).
8808
+ *
8809
+ * The rubric in `successCriteria` IS the job spec — the assessor applies
8810
+ * it to the target task's output and emits per-criterion scores. Other
8811
+ * sections (`assertions`, `gates`, `sideEffects`) MAY be present and are
8812
+ * evaluated against the *assessor's output*.
8644
8813
  */
8645
8814
  var ASSESS_BRIEF_TYPE = "assess_brief";
8646
- /**
8647
- * One criterion lifted from the rubric. Denormalized into the input so the
8648
- * assessor prompt can be built without a second fetch; the `criteria_cid`
8649
- * on the Task row remains authoritative for verification.
8650
- */
8651
- var AssessBriefCriterion = Type$1.Object({
8652
- id: Type$1.String({ minLength: 1 }),
8653
- description: Type$1.String({ minLength: 1 }),
8654
- weight: Type$1.Number({
8655
- minimum: 0,
8656
- maximum: 1
8657
- }),
8658
- scoring: Type$1.Union([
8659
- Type$1.Literal("llm_judged"),
8660
- Type$1.Literal("boolean"),
8661
- Type$1.Literal("deterministic_signature_check")
8662
- ])
8663
- }, {
8664
- $id: "AssessBriefCriterion",
8665
- additionalProperties: false
8666
- });
8667
8815
  var AssessBriefInput = Type$1.Object({
8668
8816
  targetTaskId: Type$1.String({ format: "uuid" }),
8669
- criteria: Type$1.Array(AssessBriefCriterion, { minItems: 1 }),
8670
- rubricPreamble: Type$1.Optional(Type$1.String())
8817
+ successCriteria: SuccessCriteria
8671
8818
  }, {
8672
8819
  $id: "AssessBriefInput",
8673
8820
  additionalProperties: false
@@ -8736,7 +8883,8 @@ var CuratePackInput = Type$1.Object({
8736
8883
  prefix: Type$1.Optional(Type$1.String())
8737
8884
  }, { additionalProperties: false })),
8738
8885
  tokenBudget: Type$1.Optional(Type$1.Number({ minimum: 500 })),
8739
- recipe: Type$1.Optional(Type$1.Union([Type$1.Literal("topic-focused-v1"), Type$1.Literal("scope-inventory-v1")]))
8886
+ recipe: Type$1.Optional(Type$1.Union([Type$1.Literal("topic-focused-v1"), Type$1.Literal("scope-inventory-v1")])),
8887
+ successCriteria: Type$1.Optional(SuccessCriteria)
8740
8888
  }, {
8741
8889
  $id: "CuratePackInput",
8742
8890
  additionalProperties: false
@@ -8761,7 +8909,8 @@ var CuratePackOutput = Type$1.Object({
8761
8909
  droppedIds: Type$1.Optional(Type$1.Array(Type$1.String({ format: "uuid" }))),
8762
8910
  notes: Type$1.String({ minLength: 1 })
8763
8911
  }, { additionalProperties: false }))),
8764
- summary: Type$1.String({ minLength: 1 })
8912
+ summary: Type$1.String({ minLength: 1 }),
8913
+ verification: Type$1.Optional(VerificationRecord)
8765
8914
  }, {
8766
8915
  $id: "CuratePackOutput",
8767
8916
  additionalProperties: false
@@ -8780,6 +8929,7 @@ var FulfillBriefInput = Type$1.Object({
8780
8929
  brief: Type$1.String({ minLength: 1 }),
8781
8930
  title: Type$1.Optional(Type$1.String()),
8782
8931
  acceptanceCriteria: Type$1.Optional(Type$1.Array(Type$1.String())),
8932
+ successCriteria: Type$1.Optional(SuccessCriteria),
8783
8933
  seedFiles: Type$1.Optional(Type$1.Array(Type$1.String())),
8784
8934
  scopeHint: Type$1.Optional(Type$1.String())
8785
8935
  }, {
@@ -8799,7 +8949,8 @@ var FulfillBriefOutput = Type$1.Object({
8799
8949
  }, { additionalProperties: false })),
8800
8950
  pullRequestUrl: Type$1.Union([Type$1.String(), Type$1.Null()]),
8801
8951
  diaryEntryIds: Type$1.Array(Type$1.String({ format: "uuid" })),
8802
- summary: Type$1.String({ minLength: 1 })
8952
+ summary: Type$1.String({ minLength: 1 }),
8953
+ verification: Type$1.Optional(VerificationRecord)
8803
8954
  }, {
8804
8955
  $id: "FulfillBriefOutput",
8805
8956
  additionalProperties: false
@@ -8810,19 +8961,18 @@ var FulfillBriefOutput = Type$1.Object({
8810
8961
  * `judge_pack` — independently score a rendered pack against a rubric.
8811
8962
  *
8812
8963
  * output_kind: judgment
8813
- * criteria: required (embedded `rubric` — see Phase 1 design in #852
8814
- * amendment and Phase 2 issue #881)
8964
+ * criteria: required (`successCriteria.rubric` — see #852 amendment and
8965
+ * Phase 2 issue #881)
8815
8966
  * references: required (must reference the `render_pack` task it judges,
8816
8967
  * role='judged_work')
8817
8968
  *
8818
8969
  * Step 3 of the three-session attribution loop (#875). Mirrors
8819
8970
  * `assess_brief` in shape, but over a rendered context pack.
8820
8971
  *
8821
- * Phase 1 rubric storage: the rubric body is inlined in `input.rubric`.
8822
- * Integrity is pinned via the task's `input_cid`. Phase 2 (#881) will
8823
- * replace the inline body with a `rubric_cid` referencing a `rubrics`
8824
- * table row; the denormalized `criteria[]` projection stays for prompt
8825
- * building without a fetch.
8972
+ * Phase 1 rubric storage: the rubric body lives at
8973
+ * `input.successCriteria.rubric` and is pinned via the task's `inputCid`.
8974
+ * Phase 2 (#881) will replace the inline body with a `rubricCid`
8975
+ * referencing a stored `rubrics` row; the envelope stays the same.
8826
8976
  *
8827
8977
  * The judge MUST be a different agent from the renderer. Enforced at
8828
8978
  * claim time by the runtime, not in the wire schema.
@@ -8831,7 +8981,7 @@ var JUDGE_PACK_TYPE = "judge_pack";
8831
8981
  var JudgePackInput = Type$1.Object({
8832
8982
  renderedPackId: Type$1.String({ format: "uuid" }),
8833
8983
  sourcePackId: Type$1.String({ format: "uuid" }),
8834
- rubric: Rubric
8984
+ successCriteria: SuccessCriteria
8835
8985
  }, {
8836
8986
  $id: "JudgePackInput",
8837
8987
  additionalProperties: false
@@ -8844,6 +8994,7 @@ var JudgePackScore = Type$1.Object({
8844
8994
  maximum: 1
8845
8995
  }),
8846
8996
  rationale: Type$1.Optional(Type$1.String()),
8997
+ assertions: Type$1.Optional(Type$1.Array(AssertionResult, { minItems: 1 })),
8847
8998
  evidence: Type$1.Optional(Type$1.Record(Type$1.String(), Type$1.Unknown()))
8848
8999
  }, {
8849
9000
  $id: "JudgePackScore",
@@ -8862,6 +9013,39 @@ var JudgePackOutput = Type$1.Object({
8862
9013
  $id: "JudgePackOutput",
8863
9014
  additionalProperties: false
8864
9015
  });
9016
+ /**
9017
+ * Cross-field validator for JudgePackOutput. Run after the TypeBox
9018
+ * schema check passes. Enforces invariants the schema can't express:
9019
+ *
9020
+ * 1. If a `JudgePackScore` carries an `assertions` array (i.e. the
9021
+ * judge ran the criterion in `llm_checklist` mode), its numeric
9022
+ * `score` MUST equal `1` if every `assertions[i].passed` is true,
9023
+ * else `0`. The prompt instructs the judge to derive `score` from
9024
+ * the array, but the LLM can drift — without this check, the
9025
+ * runtime accepts inconsistent payloads and propagates them into
9026
+ * composite scores and judge attestations (#999 P1).
9027
+ *
9028
+ * 2. Corollary of (1): if `score` is exactly `1` AND `assertions` is present, every
9029
+ * assertion must have `passed: true`. Catches the failure mode in
9030
+ * the issue: "score: 1 with a failing assertion accepted."
9031
+ *
9032
+ * Cross-rubric checks (e.g. "did the judge populate `assertions` for
9033
+ * every criterion the rubric marked `llm_checklist`?") require the
9034
+ * input rubric and live in a separate, runtime-side validator. This
9035
+ * one is rubric-agnostic on purpose — it catches within-score
9036
+ * inconsistency without needing the original task input.
9037
+ */
9038
+ function validateJudgePackOutput(output) {
9039
+ const scores = output.scores;
9040
+ for (let i = 0; i < scores.length; i++) {
9041
+ const s = scores[i];
9042
+ if (!s.assertions) continue;
9043
+ const allPassed = s.assertions.every((a) => a.passed);
9044
+ const expected = allPassed ? 1 : 0;
9045
+ if (s.score !== expected) return `scores[${i}] (criterionId="${s.criterionId}"): assertions ${allPassed ? "all pass" : "have at least one fail"} but score=${s.score}. Score must be derived: 1 iff every assertion passes, else 0 (#999 llm_checklist rule).`;
9046
+ }
9047
+ return null;
9048
+ }
8865
9049
  //#endregion
8866
9050
  //#region ../tasks/src/task-types/render-pack.ts
8867
9051
  /**
@@ -8883,7 +9067,8 @@ var RENDER_PACK_TYPE = "render_pack";
8883
9067
  var RenderPackInput = Type$1.Object({
8884
9068
  packId: Type$1.String({ format: "uuid" }),
8885
9069
  persist: Type$1.Optional(Type$1.Boolean()),
8886
- pinned: Type$1.Optional(Type$1.Boolean())
9070
+ pinned: Type$1.Optional(Type$1.Boolean()),
9071
+ successCriteria: Type$1.Optional(SuccessCriteria)
8887
9072
  }, {
8888
9073
  $id: "RenderPackInput",
8889
9074
  additionalProperties: false
@@ -8894,7 +9079,8 @@ var RenderPackOutput = Type$1.Object({
8894
9079
  renderMethod: Type$1.String({ minLength: 1 }),
8895
9080
  byteSize: Type$1.Number({ minimum: 0 }),
8896
9081
  entriesRendered: Type$1.Number({ minimum: 0 }),
8897
- summary: Type$1.String({ minLength: 1 })
9082
+ summary: Type$1.String({ minLength: 1 }),
9083
+ verification: Type$1.Optional(VerificationRecord)
8898
9084
  }, {
8899
9085
  $id: "RenderPackOutput",
8900
9086
  additionalProperties: false
@@ -8902,6 +9088,33 @@ var RenderPackOutput = Type$1.Object({
8902
9088
  //#endregion
8903
9089
  //#region ../tasks/src/task-types/index.ts
8904
9090
  /**
9091
+ * Validate that a judgment-task input carries a rubric inside its
9092
+ * `successCriteria` envelope, and that the rubric's weights sum to 1.
9093
+ * Used for `assess_brief` and `judge_pack`.
9094
+ */
9095
+ function validateJudgmentInput(input) {
9096
+ const sc = input.successCriteria;
9097
+ if (!sc) return "successCriteria is required for judgment tasks";
9098
+ if (!sc.rubric) return "successCriteria.rubric is required for judgment tasks";
9099
+ return validateRubricWeights(sc.rubric);
9100
+ }
9101
+ /**
9102
+ * Cross-field rule: when `input.successCriteria` is set, the producer's
9103
+ * output MUST carry a `verification` block (the LLM's self-assessment).
9104
+ * When it is unset, the output MUST NOT carry one (avoid garbage data).
9105
+ *
9106
+ * Used by all three fulfillment task types. Judgment task outputs do
9107
+ * NOT use this — their entire output IS a structured judgment, so a
9108
+ * separate self-assessment field would be circular.
9109
+ */
9110
+ function requireVerificationWhenCriteriaPresent(output, input) {
9111
+ const hasCriteria = input !== void 0 && input !== null && input.successCriteria !== void 0;
9112
+ const hasVerification = output.verification !== void 0;
9113
+ if (hasCriteria && !hasVerification) return "output.verification is required because input.successCriteria is set; the producer LLM must self-assess against the criteria";
9114
+ if (!hasCriteria && hasVerification) return "output.verification was supplied but input.successCriteria is unset; omit verification when there are no criteria to assess against";
9115
+ return null;
9116
+ }
9117
+ /**
8905
9118
  * Client-side task-type registry. Mirrors the server-owned DB registry
8906
9119
  * (PR 2). PR 0 shipped the two brief types; this PR adds the three
8907
9120
  * pack-pipeline types for the three-session attribution loop (#875).
@@ -8916,41 +9129,41 @@ var BUILT_IN_TASK_TYPES = {
8916
9129
  inputSchema: FulfillBriefInput,
8917
9130
  outputSchema: FulfillBriefOutput,
8918
9131
  outputKind: "artifact",
8919
- requiresCriteria: false,
8920
- requiresReferences: false
9132
+ requiresReferences: false,
9133
+ validateOutput: requireVerificationWhenCriteriaPresent
8921
9134
  },
8922
9135
  [ASSESS_BRIEF_TYPE]: {
8923
9136
  name: ASSESS_BRIEF_TYPE,
8924
9137
  inputSchema: AssessBriefInput,
8925
9138
  outputSchema: AssessBriefOutput,
8926
9139
  outputKind: "judgment",
8927
- requiresCriteria: true,
8928
- requiresReferences: true
9140
+ requiresReferences: true,
9141
+ validateInput: validateJudgmentInput
8929
9142
  },
8930
9143
  [CURATE_PACK_TYPE]: {
8931
9144
  name: CURATE_PACK_TYPE,
8932
9145
  inputSchema: CuratePackInput,
8933
9146
  outputSchema: CuratePackOutput,
8934
9147
  outputKind: "artifact",
8935
- requiresCriteria: false,
8936
- requiresReferences: false
9148
+ requiresReferences: false,
9149
+ validateOutput: requireVerificationWhenCriteriaPresent
8937
9150
  },
8938
9151
  [RENDER_PACK_TYPE]: {
8939
9152
  name: RENDER_PACK_TYPE,
8940
9153
  inputSchema: RenderPackInput,
8941
9154
  outputSchema: RenderPackOutput,
8942
9155
  outputKind: "artifact",
8943
- requiresCriteria: false,
8944
- requiresReferences: false
9156
+ requiresReferences: false,
9157
+ validateOutput: requireVerificationWhenCriteriaPresent
8945
9158
  },
8946
9159
  [JUDGE_PACK_TYPE]: {
8947
9160
  name: JUDGE_PACK_TYPE,
8948
9161
  inputSchema: JudgePackInput,
8949
9162
  outputSchema: JudgePackOutput,
8950
9163
  outputKind: "judgment",
8951
- requiresCriteria: false,
8952
9164
  requiresReferences: true,
8953
- validateInput: (input) => validateRubricWeights(input.rubric)
9165
+ validateInput: validateJudgmentInput,
9166
+ validateOutput: validateJudgePackOutput
8954
9167
  }
8955
9168
  };
8956
9169
  //#endregion
@@ -8980,13 +9193,22 @@ function schemaErrors(prefix, schema, value) {
8980
9193
  message: error.message
8981
9194
  }));
8982
9195
  }
8983
- function validateTaskOutput(taskType, output) {
9196
+ function validateTaskOutput(taskType, output, input) {
8984
9197
  const entry = getTaskTypeEntry(taskType);
8985
9198
  if (!entry) return [{
8986
9199
  field: "taskType",
8987
9200
  message: `Unknown task type: ${taskType}`
8988
9201
  }];
8989
- return schemaErrors("output", entry.outputSchema, output);
9202
+ const errors = schemaErrors("output", entry.outputSchema, output);
9203
+ if (errors.length > 0) return errors;
9204
+ if (entry.validateOutput) {
9205
+ const validationError = entry.validateOutput(output, input);
9206
+ if (validationError) return [{
9207
+ field: "output",
9208
+ message: validationError
9209
+ }];
9210
+ }
9211
+ return [];
8990
9212
  }
8991
9213
  /**
8992
9214
  * Resolve the TypeBox output schema registered for `taskType`. Returns
@@ -9126,7 +9348,6 @@ Type$1.Object({
9126
9348
  input: Type$1.Record(Type$1.String(), Type$1.Unknown()),
9127
9349
  inputSchemaCid: Cid,
9128
9350
  inputCid: Cid,
9129
- criteriaCid: Type$1.Union([Cid, Type$1.Null()]),
9130
9351
  references: Type$1.Array(TaskRef),
9131
9352
  correlationId: Type$1.Union([Uuid, Type$1.Null()]),
9132
9353
  imposedByAgentId: Type$1.Union([Uuid, Type$1.Null()]),
@@ -9340,11 +9561,12 @@ function buildFinalOutputBlock(opts) {
9340
9561
  * anything) work without any code path here.
9341
9562
  */
9342
9563
  function buildAssessBriefPrompt(input, ctx) {
9343
- const criteriaList = input.criteria.map((c, i) => `${i + 1}. **${c.id}** (weight ${c.weight}, scoring: \`${c.scoring}\`) — ${c.description}`).join("\n");
9344
- const preambleSection = input.rubricPreamble ? [
9564
+ const rubric = input.successCriteria.rubric;
9565
+ const criteriaList = rubric.criteria.map((c, i) => `${i + 1}. **${c.id}** (weight ${c.weight}, scoring: \`${c.scoring}\`) — ${c.description}`).join("\n");
9566
+ const preambleSection = rubric.preamble ? [
9345
9567
  "### Rubric preamble",
9346
9568
  "",
9347
- input.rubricPreamble,
9569
+ rubric.preamble,
9348
9570
  ""
9349
9571
  ].join("\n") : "";
9350
9572
  return [
@@ -9394,7 +9616,7 @@ function buildAssessBriefPrompt(input, ctx) {
9394
9616
  "",
9395
9617
  "### Scoring rules",
9396
9618
  "",
9397
- "- `llm_judged`: score 0..1 continuous. `rationale` REQUIRED (2–4 sentences).",
9619
+ "- `llm_score`: score 0..1 continuous. `rationale` REQUIRED (2–4 sentences).",
9398
9620
  "- `boolean`: score exactly 0 or 1. `rationale` optional.",
9399
9621
  "- `deterministic_signature_check`: run `moltnet entry verify` on every diary entry returned by step 3 above AND `git verify-commit` on every commit. Score 1 iff ALL signatures are valid; otherwise 0. Populate `evidence.commitsVerified`, `evidence.commitsTotal`, `evidence.signatureFailures`.",
9400
9622
  "",
@@ -9418,6 +9640,39 @@ function buildAssessBriefPrompt(input, ctx) {
9418
9640
  ].filter(Boolean).join("\n");
9419
9641
  }
9420
9642
  //#endregion
9643
+ //#region ../agent-runtime/src/prompts/self-verification.ts
9644
+ function buildSelfVerificationBlock(taskId) {
9645
+ return [
9646
+ "## Self-verification",
9647
+ "",
9648
+ `Call \`moltnet_get_task\` with task id \`${taskId}\` and read \`input.successCriteria\`.`,
9649
+ "",
9650
+ "- If `input.successCriteria` is **absent**, omit `verification` from your",
9651
+ " final output entirely.",
9652
+ "- If `input.successCriteria` is **present**, you MUST include a",
9653
+ " `verification` block in your final output. Evaluate every applicable",
9654
+ " item — `gates`, `assertions`, `rubric` criteria, `sideEffects` — against",
9655
+ " your produced work and emit one result per id. Be honest: a `fail` with",
9656
+ " a one-line reason is more useful than a false `pass`. Use `skip` (with a",
9657
+ " `detail`) when you genuinely could not determine a result. Compute",
9658
+ " `passed = results.every(r => r.status !== 'fail')`.",
9659
+ "",
9660
+ "Verification shape:",
9661
+ "",
9662
+ "```json",
9663
+ "{",
9664
+ " \"inputCid\": \"<the inputCid you saw on the task>\",",
9665
+ " \"results\": [",
9666
+ " { \"id\": \"<criterion id>\", \"kind\": \"assertion|gate|rubric|sideEffect\",",
9667
+ " \"status\": \"pass|fail|skip\", \"detail\": \"<optional one-liner>\" }",
9668
+ " ],",
9669
+ " \"passed\": <boolean>",
9670
+ "}",
9671
+ "```",
9672
+ ""
9673
+ ].join("\n");
9674
+ }
9675
+ //#endregion
9421
9676
  //#region ../agent-runtime/src/prompts/curate-pack.ts
9422
9677
  /**
9423
9678
  * Build the system prompt for a `curate_pack` task.
@@ -9552,6 +9807,7 @@ function buildCuratePackPrompt(input, ctx) {
9552
9807
  " output, not in the diary.",
9553
9808
  "- Respect hard include/exclude filters literally.",
9554
9809
  "",
9810
+ buildSelfVerificationBlock(ctx.taskId),
9555
9811
  buildFinalOutputBlock({
9556
9812
  taskType: "curate_pack",
9557
9813
  outputSchemaName: "CuratePackOutput",
@@ -9566,7 +9822,8 @@ function buildCuratePackPrompt(input, ctx) {
9566
9822
  " \"checkpoints\": [",
9567
9823
  " { \"phase\": \"recon\", \"candidateIds\": [...], \"droppedIds\": [...], \"notes\": \"...\" }",
9568
9824
  " ],",
9569
- " \"summary\": \"<2-4 sentences: what you looked for, how you narrowed, what defines the final set>\"",
9825
+ " \"summary\": \"<2-4 sentences: what you looked for, how you narrowed, what defines the final set>\",",
9826
+ " \"verification\": <required iff input.successCriteria; see Self-verification>",
9570
9827
  "}"
9571
9828
  ].join("\n")
9572
9829
  })
@@ -9627,6 +9884,7 @@ function buildFulfillBriefPrompt(input, ctx) {
9627
9884
  " `MoltNet-Diary: <id>` (per the runtime instructor).",
9628
9885
  "6. Push the branch and open a PR.",
9629
9886
  "",
9887
+ buildSelfVerificationBlock(ctx.taskId),
9630
9888
  buildFinalOutputBlock({
9631
9889
  taskType: "fulfill_brief",
9632
9890
  outputSchemaName: "FulfillBriefOutput",
@@ -9636,7 +9894,8 @@ function buildFulfillBriefPrompt(input, ctx) {
9636
9894
  " \"commits\": [{ \"sha\": \"...\", \"message\": \"...\", \"diaryEntryId\": \"...\" }],",
9637
9895
  " \"pullRequestUrl\": \"<url-or-null>\",",
9638
9896
  " \"diaryEntryIds\": [\"...\"],",
9639
- " \"summary\": \"<1-3 sentence recap>\"",
9897
+ " \"summary\": \"<1-3 sentence recap>\",",
9898
+ " \"verification\": <required iff input.successCriteria; see Self-verification>",
9640
9899
  "}"
9641
9900
  ].join("\n")
9642
9901
  })
@@ -9645,7 +9904,8 @@ function buildFulfillBriefPrompt(input, ctx) {
9645
9904
  //#endregion
9646
9905
  //#region ../agent-runtime/src/prompts/judge-pack.ts
9647
9906
  function buildJudgePackPrompt(input, ctx) {
9648
- const { renderedPackId, sourcePackId, rubric } = input;
9907
+ const { renderedPackId, sourcePackId, successCriteria } = input;
9908
+ const rubric = successCriteria.rubric;
9649
9909
  const criteriaList = rubric.criteria.map((c, i) => `${i + 1}. **${c.id}** (weight ${c.weight}, scoring: \`${c.scoring}\`) — ${c.description}`).join("\n");
9650
9910
  const preambleSection = rubric.preamble ? [
9651
9911
  "### Rubric preamble",
@@ -9675,7 +9935,7 @@ function buildJudgePackPrompt(input, ctx) {
9675
9935
  "",
9676
9936
  "1. Call `moltnet_rendered_pack_get` for the rendered pack. Keep the",
9677
9937
  " `content` string — you will score it.",
9678
- "2. Call `moltnet_pack_get` with `expand: \"entries\"` for the source",
9938
+ "2. Call `moltnet_pack_get` with `expandEntries: true` for the source",
9679
9939
  " pack. Keep the source entries for grounding / coverage checks.",
9680
9940
  "3. For each criterion, score according to its `scoring` mode (see",
9681
9941
  " Scoring rules below). Produce rationales where required.",
@@ -9688,9 +9948,23 @@ function buildJudgePackPrompt(input, ctx) {
9688
9948
  "",
9689
9949
  "### Scoring rules",
9690
9950
  "",
9691
- "- `llm_judged`: score 0..1 continuous. `rationale` REQUIRED (2–4",
9951
+ "- `llm_score`: score 0..1 continuous. `rationale` REQUIRED (2–4",
9692
9952
  " sentences pointing at specific evidence in the rendered content or",
9693
- " the source entries).",
9953
+ " the source entries). NOTE: this mode smooths individual failures",
9954
+ " into the gradient. Prefer `llm_checklist` for grounding,",
9955
+ " faithfulness, or any property where one failure is a real failure.",
9956
+ "- `llm_checklist`: enumerate per-claim binary assertions instead of",
9957
+ " picking a continuous score. For each assertion, return",
9958
+ " `{ id, text, passed: bool, evidence: string }`. `evidence` is",
9959
+ " REQUIRED for both PASS and FAIL — for PASS, quote the supporting",
9960
+ " span (rendered or source) or cite the source entry id; for FAIL,",
9961
+ " quote the offending claim verbatim and explain why it fails.",
9962
+ " Don't give the benefit of the doubt: if a claim looks supported but",
9963
+ " you cannot point at the supporting source span, mark it FAIL with",
9964
+ " evidence = \"no supporting span found\". Set the criterion `score`",
9965
+ " to `1` iff every assertion passes, else `0` — the runtime checks",
9966
+ " this matches the assertions array. Populate `assertions` on the",
9967
+ " score object; leave `evidence` (the structured record) empty.",
9694
9968
  "- `boolean`: score exactly 0 or 1. `rationale` optional.",
9695
9969
  "- `deterministic_signature_check`: batch-fetch ALL referenced source",
9696
9970
  " entries in a single call — `moltnet_list_entries` with `entryIds` set",
@@ -9730,7 +10004,14 @@ function buildJudgePackPrompt(input, ctx) {
9730
10004
  shapeSketch: [
9731
10005
  "{",
9732
10006
  " \"scores\": [",
9733
- " { \"criterionId\": \"...\", \"score\": 0.0, \"rationale\": \"...\", \"evidence\": {} }",
10007
+ " { \"criterionId\": \"...\", \"score\": 0.0, \"rationale\": \"...\", \"evidence\": {} },",
10008
+ " {",
10009
+ " \"criterionId\": \"<llm_checklist criterion>\",",
10010
+ " \"score\": 0, // 1 iff every assertion passed",
10011
+ " \"assertions\": [",
10012
+ " { \"id\": \"claim-1\", \"text\": \"...\", \"passed\": false, \"evidence\": \"...\" }",
10013
+ " ]",
10014
+ " }",
9734
10015
  " ],",
9735
10016
  " \"composite\": <sum-of-weighted-scores>,",
9736
10017
  " \"verdict\": \"<1-3 sentence overall>\",",
@@ -9772,7 +10053,7 @@ function buildRenderPackPrompt(input, ctx) {
9772
10053
  "",
9773
10054
  "## Workflow",
9774
10055
  "",
9775
- "1. Call `moltnet_pack_get` with `expand: \"entries\"` to inspect the",
10056
+ "1. Call `moltnet_pack_get` with `expandEntries: true` to inspect the",
9776
10057
  " source entries. Read it — you need the entry count for your output.",
9777
10058
  "2. Call `moltnet_pack_render` with:",
9778
10059
  ` - \`packId\`: \`${packId}\``,
@@ -9787,6 +10068,7 @@ function buildRenderPackPrompt(input, ctx) {
9787
10068
  "- Do NOT write diary entries unless a genuine incident occurs",
9788
10069
  " (rendering failure, invariant violation).",
9789
10070
  "",
10071
+ buildSelfVerificationBlock(ctx.taskId),
9790
10072
  buildFinalOutputBlock({
9791
10073
  taskType: "render_pack",
9792
10074
  outputSchemaName: "RenderPackOutput",
@@ -9797,7 +10079,8 @@ function buildRenderPackPrompt(input, ctx) {
9797
10079
  " \"renderMethod\": \"<label>\",",
9798
10080
  " \"byteSize\": <int>,",
9799
10081
  " \"entriesRendered\": <int>,",
9800
- " \"summary\": \"<1-3 sentence recap>\"",
10082
+ " \"summary\": \"<1-3 sentence recap>\",",
10083
+ " \"verification\": <required iff input.successCriteria; see Self-verification>",
9801
10084
  "}"
9802
10085
  ].join("\n")
9803
10086
  })
@@ -13567,9 +13850,9 @@ function createSubmitOutputTool(taskType, opts = {}) {
13567
13850
  description: contract.description,
13568
13851
  parameters: schema,
13569
13852
  async execute(_id, params) {
13570
- const errors = [...Value.Errors(schema, params)];
13853
+ const errors = validateTaskOutput(taskType, params);
13571
13854
  if (errors.length > 0) {
13572
- const detailMsg = errors.slice(0, 3).map((err) => `${err.path || "<root>"}: ${err.message}`).join("; ");
13855
+ const detailMsg = errors.slice(0, 3).map((err) => `${err.field}: ${err.message}`).join("; ");
13573
13856
  const details = {
13574
13857
  captured: false,
13575
13858
  callCount,
@@ -13583,7 +13866,7 @@ function createSubmitOutputTool(taskType, opts = {}) {
13583
13866
  return {
13584
13867
  content: [{
13585
13868
  type: "text",
13586
- text: `Output failed schema validation: ${detailMsg}. Re-call this tool with a corrected output.`
13869
+ text: `Output failed validation: ${detailMsg}. Re-call this tool with a corrected output.`
13587
13870
  }],
13588
13871
  details,
13589
13872
  isError: true
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@themoltnet/pi-extension",
3
- "version": "0.11.0",
3
+ "version": "0.12.0",
4
4
  "type": "module",
5
5
  "description": "MoltNet pi extension — sandboxed tool execution in Gondolin VMs with MoltNet identity and persistent memory",
6
6
  "license": "MIT",
@@ -31,8 +31,8 @@
31
31
  "@earendil-works/gondolin": "^0.7.0",
32
32
  "@opentelemetry/api": "^1.9.0",
33
33
  "@sinclair/typebox": "^0.34.0",
34
- "@themoltnet/sdk": "0.97.0",
35
- "@themoltnet/agent-runtime": "0.8.0"
34
+ "@themoltnet/agent-runtime": "0.9.0",
35
+ "@themoltnet/sdk": "0.98.0"
36
36
  },
37
37
  "peerDependencies": {
38
38
  "@mariozechner/pi-coding-agent": ">=0.73.0",
@@ -61,10 +61,25 @@
61
61
  "engines": {
62
62
  "node": ">=22"
63
63
  },
64
+ "nx": {
65
+ "tags": [
66
+ "type:runtime",
67
+ "scope:agent",
68
+ "platform:extension"
69
+ ],
70
+ "targets": {
71
+ "test-ci": {
72
+ "executor": "nx:noop",
73
+ "dependsOn": [
74
+ "test"
75
+ ],
76
+ "metadata": {
77
+ "description": "Alias for `test` on projects without atomization."
78
+ }
79
+ }
80
+ }
81
+ },
64
82
  "scripts": {
65
- "lint": "eslint src/",
66
- "check:pack": "tsx ../../tools/src/check-pack.ts --package .",
67
- "build": "vite build",
68
- "test": "vitest run --passWithNoTests"
83
+ "check:pack": "tsx ../../tools/src/check-pack.ts --package ."
69
84
  }
70
85
  }