npm - @themoltnet/pi-extension - Versions diffs - 0.11.0 → 0.12.0 - Mend

@themoltnet/pi-extension 0.11.0 → 0.12.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (3) hide show

package/dist/index.d.ts CHANGED Viewed

@@ -246,7 +246,6 @@ declare const Task: TObject<    {
     input: TRecord<TString, TUnknown>;
     inputSchemaCid: TString;
     inputCid: TString;
-    criteriaCid: TUnion<[TString, TNull]>;
     references: TArray<TObject<    {
         taskId: TUnion<[TString, TNull]>;
         outputCid: TString;

package/dist/index.js CHANGED Viewed

@@ -8558,7 +8558,13 @@ if (!FormatRegistry.Has("date-time")) FormatRegistry.Set("date-time", (v) => !Nu
 /**
 * How a judge must score a single criterion.
 *
-* - `llm_judged`: 0..1 continuous, `rationale` required.
+* - `llm_score`: 0..1 continuous, `rationale` required. Smooths failures
+*   into the gradient — use `llm_checklist` instead for properties where
+*   a single failure is a real failure (grounding, faithfulness).
+* - `llm_checklist`: judge enumerates per-claim assertions with
+*   `{passed, evidence}`. The criterion's numeric `score` is derived:
+*   `1` iff every assertion passes, else `0`. Per-claim evidence is the
+*   dataset for cluster-analysis of failure modes. See #999.
 * - `boolean`: 0 or 1, `rationale` optional.
 * - `deterministic_signature_check`: judge runs a signature check;
 *   result is 0 or 1. No LLM discretion.
@@ -8566,11 +8572,31 @@ if (!FormatRegistry.Has("date-time")) FormatRegistry.Set("date-time", (v) => !Nu
 *   appears in the rendered output; 0 or 1.
 */
 var RubricScoringMode = Type$1.Union([
-	Type$1.Literal("llm_judged"),
+	Type$1.Literal("llm_score"),
+	Type$1.Literal("llm_checklist"),
 	Type$1.Literal("boolean"),
 	Type$1.Literal("deterministic_signature_check"),
 	Type$1.Literal("deterministic_coverage_check")
 ], { $id: "RubricScoringMode" });
+/**
+* One binary check produced by an `llm_checklist`-mode criterion.
+*
+* `evidence` is REQUIRED for both PASS and FAIL — agentskills.io grading
+* principle: "Don't give the benefit of the doubt." A PASS without
+* concrete evidence (a quoted span, an entry id, a source location)
+* cannot be audited. A FAIL without evidence cannot be clustered into
+* structural fixes. The same shape is reused by `judge-eval-variant`
+* (#943) so tooling, dashboards, and analysis stay uniform.
+*/
+var AssertionResult = Type$1.Object({
+	id: Type$1.String({ minLength: 1 }),
+	text: Type$1.String({ minLength: 1 }),
+	passed: Type$1.Boolean(),
+	evidence: Type$1.String({ minLength: 1 })
+}, {
+	$id: "AssertionResult",
+	additionalProperties: false
+});
 var RubricCriterion = Type$1.Object({
 	id: Type$1.String({ minLength: 1 }),
 	description: Type$1.String({ minLength: 1 }),
@@ -8630,44 +8656,165 @@ unrelated subsystems and the test coverage on the auth path is
 unchanged" is.
 `.trim();
 //#endregion
+//#region ../tasks/src/success-criteria.ts
+/**
+* SuccessCriteria — imposer-stated acceptance criteria, evaluated in two
+* complementary places.
+*
+* Before this envelope existed, criteria were scattered: a vestigial
+* `criteriaCid` column nobody resolved, an `acceptanceCriteria: string[]`
+* field on `fulfill_brief.input` that was "interpreted by the claiming
+* agent," and inline `rubric` / `criteria[]` fields on judgment-task
+* inputs. None of those were machine-verifiable end-to-end.
+*
+* This module defines a single, content-addressable envelope an imposer
+* attaches to any task type. It has four orthogonal sections — pick
+* whichever apply per task type:
+*
+*   - `gates`        Deterministic structural checks (CID/schema match)
+*   - `assertions`   Declarative claims about output JSON
+*   - `rubric`       Weighted-criteria scoring instrument, reused
+*                    verbatim from `./rubric.ts`.
+*   - `sideEffects`  Required process side-effects (e.g. diary entry)
+*
+* ## Two roles, two task types
+*
+* **Producer self-assessment** (fulfillment tasks: `fulfill_brief`,
+* `curate_pack`, `render_pack`). The producer **LLM** evaluates the
+* criteria against its own output and emits a `VerificationRecord`
+* inside `output.verification`. The daemon is pure passthrough — it
+* does not run `evaluateAssertions`, does not inspect the verification
+* record. The REST API is dumb storage; it never re-runs assertions and
+* never runs LLMs. The cross-field rule
+* `requireVerificationWhenCriteriaPresent` enforces "verification
+* required iff successCriteria present" at task-output validation time
+* (server-side schema check). Self-assessment is a truthful self-rating,
+* NOT enforcement — `verification.passed=false` does not block /complete
+* and does not affect `acceptedAttemptN`. See
+* `docs/agent-runtime.md` for the full producer/judge flow.
+*
+* **Binding evaluation** (judgment tasks: `assess_brief`, `judge_pack`).
+* A separate task whose job IS the application of `successCriteria` to
+* someone else's output. Different agent (enforced at claim time), same
+* envelope. The judge's verdict is binding: this is the *gate* in the
+* MoltNet model. The rubric inside `successCriteria.rubric` IS the job
+* spec for the judge.
+*
+* The clean chain: producer task with `successCriteria` → producer
+* self-assesses honestly → imposer (or automation) creates a downstream
+* judgment task that references the same `successCriteria` (or a
+* stricter rubric) → judgment task delivers the binding verdict.
+*
+* Storage: SuccessCriteria lives inline at `task.input.successCriteria`,
+* pinned via the task's `inputCid`. No separate column or hash. When
+* #881 lands, the `rubric` field can graduate to `{ rubricCid }` lookup
+* without changing this envelope, and producer + judge tasks can pin
+* the SAME rubric across the chain for end-to-end auditability.
+*/
+var SchemaCheckSpec = Type$1.Object({ schemaCid: Type$1.String({ minLength: 1 }) }, { additionalProperties: false });
+var CidEqualsSpec = Type$1.Object({
+	path: Type$1.String({ minLength: 1 }),
+	expected: Type$1.String({ minLength: 1 })
+}, { additionalProperties: false });
+var Gate = Type$1.Union([Type$1.Object({
+	id: Type$1.String({ minLength: 1 }),
+	kind: Type$1.Literal("schema-check"),
+	spec: SchemaCheckSpec,
+	required: Type$1.Boolean()
+}, { additionalProperties: false }), Type$1.Object({
+	id: Type$1.String({ minLength: 1 }),
+	kind: Type$1.Literal("cid-equals"),
+	spec: CidEqualsSpec,
+	required: Type$1.Boolean()
+}, { additionalProperties: false })], { $id: "Gate" });
+var AssertionOp = Type$1.Union([
+	Type$1.Literal("exists"),
+	Type$1.Literal("equals"),
+	Type$1.Literal("matches"),
+	Type$1.Literal("in-range"),
+	Type$1.Literal("min-length")
+], { $id: "AssertionOp" });
+var Assertion = Type$1.Object({
+	id: Type$1.String({ minLength: 1 }),
+	path: Type$1.String({ minLength: 1 }),
+	op: AssertionOp,
+	value: Type$1.Optional(Type$1.Unknown())
+}, {
+	$id: "Assertion",
+	additionalProperties: false
+});
+var SideEffectsSpec = Type$1.Object({
+	diaryEntryRequired: Type$1.Optional(Type$1.Boolean()),
+	diaryEntryTags: Type$1.Optional(Type$1.Array(Type$1.String({ minLength: 1 }))),
+	referencedEntries: Type$1.Optional(Type$1.Integer({ minimum: 0 }))
+}, {
+	$id: "SideEffectsSpec",
+	additionalProperties: false
+});
+var SuccessCriteria = Type$1.Object({
+	version: Type$1.Literal(1),
+	gates: Type$1.Optional(Type$1.Array(Gate)),
+	assertions: Type$1.Optional(Type$1.Array(Assertion)),
+	rubric: Type$1.Optional(Rubric),
+	minComposite: Type$1.Optional(Type$1.Number({
+		minimum: 0,
+		maximum: 1
+	})),
+	sideEffects: Type$1.Optional(SideEffectsSpec)
+}, {
+	$id: "SuccessCriteria",
+	additionalProperties: false
+});
+var VerificationResultStatus = Type$1.Union([
+	Type$1.Literal("pass"),
+	Type$1.Literal("fail"),
+	Type$1.Literal("skip")
+], { $id: "VerificationResultStatus" });
+var VerificationResultKind = Type$1.Union([
+	Type$1.Literal("gate"),
+	Type$1.Literal("assertion"),
+	Type$1.Literal("rubric"),
+	Type$1.Literal("sideEffect")
+], { $id: "VerificationResultKind" });
+var VerificationResult = Type$1.Object({
+	id: Type$1.String({ minLength: 1 }),
+	kind: VerificationResultKind,
+	status: VerificationResultStatus,
+	detail: Type$1.Optional(Type$1.String())
+}, {
+	$id: "VerificationResult",
+	additionalProperties: false
+});
+var VerificationRecord = Type$1.Object({
+	inputCid: Type$1.String({ minLength: 1 }),
+	results: Type$1.Array(VerificationResult),
+	passed: Type$1.Boolean()
+}, {
+	$id: "VerificationRecord",
+	additionalProperties: false
+});
+//#endregion
 //#region ../tasks/src/task-types/assess-brief.ts
 /**
 * `assess_brief` — independently evaluate a fulfilled brief.
 *
 * output_kind: judgment
-* criteria: required (rubric lives as a diary entry with tag='rubric';
-*   the Task's `criteria_cid` points at that entry)
+* criteria: required (`successCriteria.rubric` — same envelope as
+*   `judge_pack`)
 * references: required (must reference the target `fulfill_brief` task)
 *
 * The assessor is a different agent from the producer (enforced by the
 * server / runtime at claim time — not in the wire schema).
+*
+* The rubric in `successCriteria` IS the job spec — the assessor applies
+* it to the target task's output and emits per-criterion scores. Other
+* sections (`assertions`, `gates`, `sideEffects`) MAY be present and are
+* evaluated against the *assessor's output*.
 */
 var ASSESS_BRIEF_TYPE = "assess_brief";
-/**
-* One criterion lifted from the rubric. Denormalized into the input so the
-* assessor prompt can be built without a second fetch; the `criteria_cid`
-* on the Task row remains authoritative for verification.
-*/
-var AssessBriefCriterion = Type$1.Object({
-	id: Type$1.String({ minLength: 1 }),
-	description: Type$1.String({ minLength: 1 }),
-	weight: Type$1.Number({
-		minimum: 0,
-		maximum: 1
-	}),
-	scoring: Type$1.Union([
-		Type$1.Literal("llm_judged"),
-		Type$1.Literal("boolean"),
-		Type$1.Literal("deterministic_signature_check")
-	])
-}, {
-	$id: "AssessBriefCriterion",
-	additionalProperties: false
-});
 var AssessBriefInput = Type$1.Object({
 	targetTaskId: Type$1.String({ format: "uuid" }),
-	criteria: Type$1.Array(AssessBriefCriterion, { minItems: 1 }),
-	rubricPreamble: Type$1.Optional(Type$1.String())
+	successCriteria: SuccessCriteria
 }, {
 	$id: "AssessBriefInput",
 	additionalProperties: false
@@ -8736,7 +8883,8 @@ var CuratePackInput = Type$1.Object({
 		prefix: Type$1.Optional(Type$1.String())
 	}, { additionalProperties: false })),
 	tokenBudget: Type$1.Optional(Type$1.Number({ minimum: 500 })),
-	recipe: Type$1.Optional(Type$1.Union([Type$1.Literal("topic-focused-v1"), Type$1.Literal("scope-inventory-v1")]))
+	recipe: Type$1.Optional(Type$1.Union([Type$1.Literal("topic-focused-v1"), Type$1.Literal("scope-inventory-v1")])),
+	successCriteria: Type$1.Optional(SuccessCriteria)
 }, {
 	$id: "CuratePackInput",
 	additionalProperties: false
@@ -8761,7 +8909,8 @@ var CuratePackOutput = Type$1.Object({
 		droppedIds: Type$1.Optional(Type$1.Array(Type$1.String({ format: "uuid" }))),
 		notes: Type$1.String({ minLength: 1 })
 	}, { additionalProperties: false }))),
-	summary: Type$1.String({ minLength: 1 })
+	summary: Type$1.String({ minLength: 1 }),
+	verification: Type$1.Optional(VerificationRecord)
 }, {
 	$id: "CuratePackOutput",
 	additionalProperties: false
@@ -8780,6 +8929,7 @@ var FulfillBriefInput = Type$1.Object({
 	brief: Type$1.String({ minLength: 1 }),
 	title: Type$1.Optional(Type$1.String()),
 	acceptanceCriteria: Type$1.Optional(Type$1.Array(Type$1.String())),
+	successCriteria: Type$1.Optional(SuccessCriteria),
 	seedFiles: Type$1.Optional(Type$1.Array(Type$1.String())),
 	scopeHint: Type$1.Optional(Type$1.String())
 }, {
@@ -8799,7 +8949,8 @@ var FulfillBriefOutput = Type$1.Object({
 	}, { additionalProperties: false })),
 	pullRequestUrl: Type$1.Union([Type$1.String(), Type$1.Null()]),
 	diaryEntryIds: Type$1.Array(Type$1.String({ format: "uuid" })),
-	summary: Type$1.String({ minLength: 1 })
+	summary: Type$1.String({ minLength: 1 }),
+	verification: Type$1.Optional(VerificationRecord)
 }, {
 	$id: "FulfillBriefOutput",
 	additionalProperties: false
@@ -8810,19 +8961,18 @@ var FulfillBriefOutput = Type$1.Object({
 * `judge_pack` — independently score a rendered pack against a rubric.
 *
 * output_kind: judgment
-* criteria: required (embedded `rubric` — see Phase 1 design in #852
-*   amendment and Phase 2 issue #881)
+* criteria: required (`successCriteria.rubric` — see #852 amendment and
+*   Phase 2 issue #881)
 * references: required (must reference the `render_pack` task it judges,
 *   role='judged_work')
 *
 * Step 3 of the three-session attribution loop (#875). Mirrors
 * `assess_brief` in shape, but over a rendered context pack.
 *
-* Phase 1 rubric storage: the rubric body is inlined in `input.rubric`.
-* Integrity is pinned via the task's `input_cid`. Phase 2 (#881) will
-* replace the inline body with a `rubric_cid` referencing a `rubrics`
-* table row; the denormalized `criteria[]` projection stays for prompt
-* building without a fetch.
+* Phase 1 rubric storage: the rubric body lives at
+* `input.successCriteria.rubric` and is pinned via the task's `inputCid`.
+* Phase 2 (#881) will replace the inline body with a `rubricCid`
+* referencing a stored `rubrics` row; the envelope stays the same.
 *
 * The judge MUST be a different agent from the renderer. Enforced at
 * claim time by the runtime, not in the wire schema.
@@ -8831,7 +8981,7 @@ var JUDGE_PACK_TYPE = "judge_pack";
 var JudgePackInput = Type$1.Object({
 	renderedPackId: Type$1.String({ format: "uuid" }),
 	sourcePackId: Type$1.String({ format: "uuid" }),
-	rubric: Rubric
+	successCriteria: SuccessCriteria
 }, {
 	$id: "JudgePackInput",
 	additionalProperties: false
@@ -8844,6 +8994,7 @@ var JudgePackScore = Type$1.Object({
 		maximum: 1
 	}),
 	rationale: Type$1.Optional(Type$1.String()),
+	assertions: Type$1.Optional(Type$1.Array(AssertionResult, { minItems: 1 })),
 	evidence: Type$1.Optional(Type$1.Record(Type$1.String(), Type$1.Unknown()))
 }, {
 	$id: "JudgePackScore",
@@ -8862,6 +9013,39 @@ var JudgePackOutput = Type$1.Object({
 	$id: "JudgePackOutput",
 	additionalProperties: false
 });
+/**
+* Cross-field validator for JudgePackOutput. Run after the TypeBox
+* schema check passes. Enforces invariants the schema can't express:
+*
+* 1. If a `JudgePackScore` carries an `assertions` array (i.e. the
+*    judge ran the criterion in `llm_checklist` mode), its numeric
+*    `score` MUST equal `1` if every `assertions[i].passed` is true,
+*    else `0`. The prompt instructs the judge to derive `score` from
+*    the array, but the LLM can drift — without this check, the
+*    runtime accepts inconsistent payloads and propagates them into
+*    composite scores and judge attestations (#999 P1).
+*
+* 2. If `score` is exactly `1` AND `assertions` is present, every
+*    assertion must have `passed: true`. Catches the failure mode in
+*    the issue: "score: 1 with a failing assertion accepted."
+*
+* Cross-rubric checks (e.g. "did the judge populate `assertions` for
+* every criterion the rubric marked `llm_checklist`?") require the
+* input rubric and live in a separate, runtime-side validator. This
+* one is rubric-agnostic on purpose — it catches within-score
+* inconsistency without needing the original task input.
+*/
+function validateJudgePackOutput(output) {
+	const scores = output.scores;
+	for (let i = 0; i < scores.length; i++) {
+		const s = scores[i];
+		if (!s.assertions) continue;
+		const allPassed = s.assertions.every((a) => a.passed);
+		const expected = allPassed ? 1 : 0;
+		if (s.score !== expected) return `scores[${i}] (criterionId="${s.criterionId}"): assertions ${allPassed ? "all pass" : "have at least one fail"} but score=${s.score}. Score must be derived: 1 iff every assertion passes, else 0 (#999 llm_checklist rule).`;
+	}
+	return null;
+}
 //#endregion
 //#region ../tasks/src/task-types/render-pack.ts
 /**
@@ -8883,7 +9067,8 @@ var RENDER_PACK_TYPE = "render_pack";
 var RenderPackInput = Type$1.Object({
 	packId: Type$1.String({ format: "uuid" }),
 	persist: Type$1.Optional(Type$1.Boolean()),
-	pinned: Type$1.Optional(Type$1.Boolean())
+	pinned: Type$1.Optional(Type$1.Boolean()),
+	successCriteria: Type$1.Optional(SuccessCriteria)
 }, {
 	$id: "RenderPackInput",
 	additionalProperties: false
@@ -8894,7 +9079,8 @@ var RenderPackOutput = Type$1.Object({
 	renderMethod: Type$1.String({ minLength: 1 }),
 	byteSize: Type$1.Number({ minimum: 0 }),
 	entriesRendered: Type$1.Number({ minimum: 0 }),
-	summary: Type$1.String({ minLength: 1 })
+	summary: Type$1.String({ minLength: 1 }),
+	verification: Type$1.Optional(VerificationRecord)
 }, {
 	$id: "RenderPackOutput",
 	additionalProperties: false
@@ -8902,6 +9088,33 @@ var RenderPackOutput = Type$1.Object({
 //#endregion
 //#region ../tasks/src/task-types/index.ts
 /**
+* Validate that a judgment-task input carries a rubric inside its
+* `successCriteria` envelope, and that the rubric's weights sum to 1.
+* Used for `assess_brief` and `judge_pack`.
+*/
+function validateJudgmentInput(input) {
+	const sc = input.successCriteria;
+	if (!sc) return "successCriteria is required for judgment tasks";
+	if (!sc.rubric) return "successCriteria.rubric is required for judgment tasks";
+	return validateRubricWeights(sc.rubric);
+}
+/**
+* Cross-field rule: when `input.successCriteria` is set, the producer's
+* output MUST carry a `verification` block (the LLM's self-assessment).
+* When it is unset, the output MUST NOT carry one (avoid garbage data).
+*
+* Used by all three fulfillment task types. Judgment task outputs do
+* NOT use this — their entire output IS a structured judgment, so a
+* separate self-assessment field would be circular.
+*/
+function requireVerificationWhenCriteriaPresent(output, input) {
+	const hasCriteria = input !== void 0 && input !== null && input.successCriteria !== void 0;
+	const hasVerification = output.verification !== void 0;
+	if (hasCriteria && !hasVerification) return "output.verification is required because input.successCriteria is set; the producer LLM must self-assess against the criteria";
+	if (!hasCriteria && hasVerification) return "output.verification was supplied but input.successCriteria is unset; omit verification when there are no criteria to assess against";
+	return null;
+}
+/**
 * Client-side task-type registry. Mirrors the server-owned DB registry
 * (PR 2). PR 0 shipped the two brief types; this PR adds the three
 * pack-pipeline types for the three-session attribution loop (#875).
@@ -8916,41 +9129,41 @@ var BUILT_IN_TASK_TYPES = {
 		inputSchema: FulfillBriefInput,
 		outputSchema: FulfillBriefOutput,
 		outputKind: "artifact",
-		requiresCriteria: false,
-		requiresReferences: false
+		requiresReferences: false,
+		validateOutput: requireVerificationWhenCriteriaPresent
 	},
 	[ASSESS_BRIEF_TYPE]: {
 		name: ASSESS_BRIEF_TYPE,
 		inputSchema: AssessBriefInput,
 		outputSchema: AssessBriefOutput,
 		outputKind: "judgment",
-		requiresCriteria: true,
-		requiresReferences: true
+		requiresReferences: true,
+		validateInput: validateJudgmentInput
 	},
 	[CURATE_PACK_TYPE]: {
 		name: CURATE_PACK_TYPE,
 		inputSchema: CuratePackInput,
 		outputSchema: CuratePackOutput,
 		outputKind: "artifact",
-		requiresCriteria: false,
-		requiresReferences: false
+		requiresReferences: false,
+		validateOutput: requireVerificationWhenCriteriaPresent
 	},
 	[RENDER_PACK_TYPE]: {
 		name: RENDER_PACK_TYPE,
 		inputSchema: RenderPackInput,
 		outputSchema: RenderPackOutput,
 		outputKind: "artifact",
-		requiresCriteria: false,
-		requiresReferences: false
+		requiresReferences: false,
+		validateOutput: requireVerificationWhenCriteriaPresent
 	},
 	[JUDGE_PACK_TYPE]: {
 		name: JUDGE_PACK_TYPE,
 		inputSchema: JudgePackInput,
 		outputSchema: JudgePackOutput,
 		outputKind: "judgment",
-		requiresCriteria: false,
 		requiresReferences: true,
-		validateInput: (input) => validateRubricWeights(input.rubric)
+		validateInput: validateJudgmentInput,
+		validateOutput: validateJudgePackOutput
 	}
 };
 //#endregion
@@ -8980,13 +9193,22 @@ function schemaErrors(prefix, schema, value) {
 		message: error.message
 	}));
 }
-function validateTaskOutput(taskType, output) {
+function validateTaskOutput(taskType, output, input) {
 	const entry = getTaskTypeEntry(taskType);
 	if (!entry) return [{
 		field: "taskType",
 		message: `Unknown task type: ${taskType}`
 	}];
-	return schemaErrors("output", entry.outputSchema, output);
+	const errors = schemaErrors("output", entry.outputSchema, output);
+	if (errors.length > 0) return errors;
+	if (entry.validateOutput) {
+		const validationError = entry.validateOutput(output, input);
+		if (validationError) return [{
+			field: "output",
+			message: validationError
+		}];
+	}
+	return [];
 }
 /**
 * Resolve the TypeBox output schema registered for `taskType`. Returns
@@ -9126,7 +9348,6 @@ Type$1.Object({
 	input: Type$1.Record(Type$1.String(), Type$1.Unknown()),
 	inputSchemaCid: Cid,
 	inputCid: Cid,
-	criteriaCid: Type$1.Union([Cid, Type$1.Null()]),
 	references: Type$1.Array(TaskRef),
 	correlationId: Type$1.Union([Uuid, Type$1.Null()]),
 	imposedByAgentId: Type$1.Union([Uuid, Type$1.Null()]),
@@ -9340,11 +9561,12 @@ function buildFinalOutputBlock(opts) {
 * anything) work without any code path here.
 */
 function buildAssessBriefPrompt(input, ctx) {
-	const criteriaList = input.criteria.map((c, i) => `${i + 1}. **${c.id}** (weight ${c.weight}, scoring: \`${c.scoring}\`) — ${c.description}`).join("\n");
-	const preambleSection = input.rubricPreamble ? [
+	const rubric = input.successCriteria.rubric;
+	const criteriaList = rubric.criteria.map((c, i) => `${i + 1}. **${c.id}** (weight ${c.weight}, scoring: \`${c.scoring}\`) — ${c.description}`).join("\n");
+	const preambleSection = rubric.preamble ? [
 		"### Rubric preamble",
 		"",
-		input.rubricPreamble,
+		rubric.preamble,
 		""
 	].join("\n") : "";
 	return [
@@ -9394,7 +9616,7 @@ function buildAssessBriefPrompt(input, ctx) {
 		"",
 		"### Scoring rules",
 		"",
-		"- `llm_judged`: score 0..1 continuous. `rationale` REQUIRED (2–4 sentences).",
+		"- `llm_score`: score 0..1 continuous. `rationale` REQUIRED (2–4 sentences).",
 		"- `boolean`: score exactly 0 or 1. `rationale` optional.",
 		"- `deterministic_signature_check`: run `moltnet entry verify` on every diary entry returned by step 3 above AND `git verify-commit` on every commit. Score 1 iff ALL signatures are valid; otherwise 0. Populate `evidence.commitsVerified`, `evidence.commitsTotal`, `evidence.signatureFailures`.",
 		"",
@@ -9418,6 +9640,39 @@ function buildAssessBriefPrompt(input, ctx) {
 	].filter(Boolean).join("\n");
 }
 //#endregion
+//#region ../agent-runtime/src/prompts/self-verification.ts
+function buildSelfVerificationBlock(taskId) {
+	return [
+		"## Self-verification",
+		"",
+		`Call \`moltnet_get_task\` with task id \`${taskId}\` and read \`input.successCriteria\`.`,
+		"",
+		"- If `input.successCriteria` is **absent**, omit `verification` from your",
+		"  final output entirely.",
+		"- If `input.successCriteria` is **present**, you MUST include a",
+		"  `verification` block in your final output. Evaluate every applicable",
+		"  item — `gates`, `assertions`, `rubric` criteria, `sideEffects` — against",
+		"  your produced work and emit one result per id. Be honest: a `fail` with",
+		"  a one-line reason is more useful than a false `pass`. Use `skip` (with a",
+		"  `detail`) when you genuinely could not determine a result. Compute",
+		"  `passed = results.every(r => r.status !== 'fail')`.",
+		"",
+		"Verification shape:",
+		"",
+		"```json",
+		"{",
+		"  \"inputCid\": \"<the inputCid you saw on the task>\",",
+		"  \"results\": [",
+		"    { \"id\": \"<criterion id>\", \"kind\": \"assertion|gate|rubric|sideEffect\",",
+		"      \"status\": \"pass|fail|skip\", \"detail\": \"<optional one-liner>\" }",
+		"  ],",
+		"  \"passed\": <boolean>",
+		"}",
+		"```",
+		""
+	].join("\n");
+}
+//#endregion
 //#region ../agent-runtime/src/prompts/curate-pack.ts
 /**
 * Build the system prompt for a `curate_pack` task.
@@ -9552,6 +9807,7 @@ function buildCuratePackPrompt(input, ctx) {
 		"  output, not in the diary.",
 		"- Respect hard include/exclude filters literally.",
 		"",
+		buildSelfVerificationBlock(ctx.taskId),
 		buildFinalOutputBlock({
 			taskType: "curate_pack",
 			outputSchemaName: "CuratePackOutput",
@@ -9566,7 +9822,8 @@ function buildCuratePackPrompt(input, ctx) {
 				"  \"checkpoints\": [",
 				"    { \"phase\": \"recon\", \"candidateIds\": [...], \"droppedIds\": [...], \"notes\": \"...\" }",
 				"  ],",
-				"  \"summary\": \"<2-4 sentences: what you looked for, how you narrowed, what defines the final set>\"",
+				"  \"summary\": \"<2-4 sentences: what you looked for, how you narrowed, what defines the final set>\",",
+				"  \"verification\": <required iff input.successCriteria; see Self-verification>",
 				"}"
 			].join("\n")
 		})
@@ -9627,6 +9884,7 @@ function buildFulfillBriefPrompt(input, ctx) {
 		"   `MoltNet-Diary: <id>` (per the runtime instructor).",
 		"6. Push the branch and open a PR.",
 		"",
+		buildSelfVerificationBlock(ctx.taskId),
 		buildFinalOutputBlock({
 			taskType: "fulfill_brief",
 			outputSchemaName: "FulfillBriefOutput",
@@ -9636,7 +9894,8 @@ function buildFulfillBriefPrompt(input, ctx) {
 				"  \"commits\": [{ \"sha\": \"...\", \"message\": \"...\", \"diaryEntryId\": \"...\" }],",
 				"  \"pullRequestUrl\": \"<url-or-null>\",",
 				"  \"diaryEntryIds\": [\"...\"],",
-				"  \"summary\": \"<1-3 sentence recap>\"",
+				"  \"summary\": \"<1-3 sentence recap>\",",
+				"  \"verification\": <required iff input.successCriteria; see Self-verification>",
 				"}"
 			].join("\n")
 		})
@@ -9645,7 +9904,8 @@ function buildFulfillBriefPrompt(input, ctx) {
 //#endregion
 //#region ../agent-runtime/src/prompts/judge-pack.ts
 function buildJudgePackPrompt(input, ctx) {
-	const { renderedPackId, sourcePackId, rubric } = input;
+	const { renderedPackId, sourcePackId, successCriteria } = input;
+	const rubric = successCriteria.rubric;
 	const criteriaList = rubric.criteria.map((c, i) => `${i + 1}. **${c.id}** (weight ${c.weight}, scoring: \`${c.scoring}\`) — ${c.description}`).join("\n");
 	const preambleSection = rubric.preamble ? [
 		"### Rubric preamble",
@@ -9675,7 +9935,7 @@ function buildJudgePackPrompt(input, ctx) {
 		"",
 		"1. Call `moltnet_rendered_pack_get` for the rendered pack. Keep the",
 		"   `content` string — you will score it.",
-		"2. Call `moltnet_pack_get` with `expand: \"entries\"` for the source",
+		"2. Call `moltnet_pack_get` with `expandEntries: true` for the source",
 		"   pack. Keep the source entries for grounding / coverage checks.",
 		"3. For each criterion, score according to its `scoring` mode (see",
 		"   Scoring rules below). Produce rationales where required.",
@@ -9688,9 +9948,23 @@ function buildJudgePackPrompt(input, ctx) {
 		"",
 		"### Scoring rules",
 		"",
-		"- `llm_judged`: score 0..1 continuous. `rationale` REQUIRED (2–4",
+		"- `llm_score`: score 0..1 continuous. `rationale` REQUIRED (2–4",
 		"  sentences pointing at specific evidence in the rendered content or",
-		"  the source entries).",
+		"  the source entries). NOTE: this mode smooths individual failures",
+		"  into the gradient. Prefer `llm_checklist` for grounding,",
+		"  faithfulness, or any property where one failure is a real failure.",
+		"- `llm_checklist`: enumerate per-claim binary assertions instead of",
+		"  picking a continuous score. For each assertion, return",
+		"  `{ id, text, passed: bool, evidence: string }`. `evidence` is",
+		"  REQUIRED for both PASS and FAIL — for PASS, quote the supporting",
+		"  span (rendered or source) or cite the source entry id; for FAIL,",
+		"  quote the offending claim verbatim and explain why it fails.",
+		"  Don't give the benefit of the doubt: if a claim looks supported but",
+		"  you cannot point at the supporting source span, mark it FAIL with",
+		"  evidence = \"no supporting span found\". Set the criterion `score`",
+		"  to `1` iff every assertion passes, else `0` — the runtime checks",
+		"  this matches the assertions array. Populate `assertions` on the",
+		"  score object; leave `evidence` (the structured record) empty.",
 		"- `boolean`: score exactly 0 or 1. `rationale` optional.",
 		"- `deterministic_signature_check`: batch-fetch ALL referenced source",
 		"  entries in a single call — `moltnet_list_entries` with `entryIds` set",
@@ -9730,7 +10004,14 @@ function buildJudgePackPrompt(input, ctx) {
 			shapeSketch: [
 				"{",
 				"  \"scores\": [",
-				"    { \"criterionId\": \"...\", \"score\": 0.0, \"rationale\": \"...\", \"evidence\": {} }",
+				"    { \"criterionId\": \"...\", \"score\": 0.0, \"rationale\": \"...\", \"evidence\": {} },",
+				"    {",
+				"      \"criterionId\": \"<llm_checklist criterion>\",",
+				"      \"score\": 0,                          // 1 iff every assertion passed",
+				"      \"assertions\": [",
+				"        { \"id\": \"claim-1\", \"text\": \"...\", \"passed\": false, \"evidence\": \"...\" }",
+				"      ]",
+				"    }",
 				"  ],",
 				"  \"composite\": <sum-of-weighted-scores>,",
 				"  \"verdict\": \"<1-3 sentence overall>\",",
@@ -9772,7 +10053,7 @@ function buildRenderPackPrompt(input, ctx) {
 		"",
 		"## Workflow",
 		"",
-		"1. Call `moltnet_pack_get` with `expand: \"entries\"` to inspect the",
+		"1. Call `moltnet_pack_get` with `expandEntries: true` to inspect the",
 		"   source entries. Read it — you need the entry count for your output.",
 		"2. Call `moltnet_pack_render` with:",
 		`   - \`packId\`: \`${packId}\``,
@@ -9787,6 +10068,7 @@ function buildRenderPackPrompt(input, ctx) {
 		"- Do NOT write diary entries unless a genuine incident occurs",
 		"  (rendering failure, invariant violation).",
 		"",
+		buildSelfVerificationBlock(ctx.taskId),
 		buildFinalOutputBlock({
 			taskType: "render_pack",
 			outputSchemaName: "RenderPackOutput",
@@ -9797,7 +10079,8 @@ function buildRenderPackPrompt(input, ctx) {
 				"  \"renderMethod\": \"<label>\",",
 				"  \"byteSize\": <int>,",
 				"  \"entriesRendered\": <int>,",
-				"  \"summary\": \"<1-3 sentence recap>\"",
+				"  \"summary\": \"<1-3 sentence recap>\",",
+				"  \"verification\": <required iff input.successCriteria; see Self-verification>",
 				"}"
 			].join("\n")
 		})
@@ -13567,9 +13850,9 @@ function createSubmitOutputTool(taskType, opts = {}) {
 			description: contract.description,
 			parameters: schema,
 			async execute(_id, params) {
-				const errors = [...Value.Errors(schema, params)];
+				const errors = validateTaskOutput(taskType, params);
 				if (errors.length > 0) {
-					const detailMsg = errors.slice(0, 3).map((err) => `${err.path || "<root>"}: ${err.message}`).join("; ");
+					const detailMsg = errors.slice(0, 3).map((err) => `${err.field}: ${err.message}`).join("; ");
 					const details = {
 						captured: false,
 						callCount,
@@ -13583,7 +13866,7 @@ function createSubmitOutputTool(taskType, opts = {}) {
 					return {
 						content: [{
 							type: "text",
-							text: `Output failed schema validation: ${detailMsg}. Re-call this tool with a corrected output.`
+							text: `Output failed validation: ${detailMsg}. Re-call this tool with a corrected output.`
 						}],
 						details,
 						isError: true

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@themoltnet/pi-extension",
-  "version": "0.11.0",
+  "version": "0.12.0",
   "type": "module",
   "description": "MoltNet pi extension — sandboxed tool execution in Gondolin VMs with MoltNet identity and persistent memory",
   "license": "MIT",
@@ -31,8 +31,8 @@
     "@earendil-works/gondolin": "^0.7.0",
     "@opentelemetry/api": "^1.9.0",
     "@sinclair/typebox": "^0.34.0",
-    "@themoltnet/sdk": "0.97.0",
-    "@themoltnet/agent-runtime": "0.8.0"
+    "@themoltnet/agent-runtime": "0.9.0",
+    "@themoltnet/sdk": "0.98.0"
   },
   "peerDependencies": {
     "@mariozechner/pi-coding-agent": ">=0.73.0",
@@ -61,10 +61,25 @@
   "engines": {
     "node": ">=22"
   },
+  "nx": {
+    "tags": [
+      "type:runtime",
+      "scope:agent",
+      "platform:extension"
+    ],
+    "targets": {
+      "test-ci": {
+        "executor": "nx:noop",
+        "dependsOn": [
+          "test"
+        ],
+        "metadata": {
+          "description": "Alias for `test` on projects without atomization."
+        }
+      }
+    }
+  },
   "scripts": {
-    "lint": "eslint src/",
-    "check:pack": "tsx ../../tools/src/check-pack.ts --package .",
-    "build": "vite build",
-    "test": "vitest run --passWithNoTests"
+    "check:pack": "tsx ../../tools/src/check-pack.ts --package ."
   }
 }