npm - @sanity/ailf - Versions diffs - 4.6.0 → 5.0.0 - Mend

@sanity/ailf 4.6.0 → 5.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (119) hide show

package/canonical/grader-references/agent-harness-tools.yaml +42 -0
package/canonical/grader-references/knowledge-probe-recall.yaml +36 -0
package/canonical/grader-references/mcp-server-spec.yaml +51 -0
package/canonical/grader-references/portable-text.yaml +48 -0
package/config/rubrics.ts +38 -2
package/dist/_vendor/ailf-core/artifact-registry.d.ts +60 -2
package/dist/_vendor/ailf-core/artifact-registry.js +288 -7
package/dist/_vendor/ailf-core/examples/index.d.ts +125 -26
package/dist/_vendor/ailf-core/examples/index.js +146 -47
package/dist/_vendor/ailf-core/ports/context.d.ts +8 -0
package/dist/_vendor/ailf-core/ports/mode-handler.d.ts +15 -0
package/dist/_vendor/ailf-core/schemas/branded-string.d.ts +40 -0
package/dist/_vendor/ailf-core/schemas/branded-string.js +45 -0
package/dist/_vendor/ailf-core/schemas/confidence-schema.d.ts +36 -0
package/dist/_vendor/ailf-core/schemas/confidence-schema.js +32 -0
package/dist/_vendor/ailf-core/schemas/eval-config.d.ts +1 -0
package/dist/_vendor/ailf-core/schemas/eval-config.js +8 -4
package/dist/_vendor/ailf-core/schemas/index.d.ts +2 -0
package/dist/_vendor/ailf-core/schemas/index.js +9 -0
package/dist/_vendor/ailf-core/schemas/pipeline-request.d.ts +1 -0
package/dist/_vendor/ailf-core/schemas/pipeline-request.js +1 -0
package/dist/_vendor/ailf-core/schemas/pipeline.d.ts +34 -8
package/dist/_vendor/ailf-core/schemas/pipeline.js +23 -1
package/dist/_vendor/ailf-core/services/diagnosis/registry.d.ts +40 -0
package/dist/_vendor/ailf-core/services/diagnosis/registry.js +25 -0
package/dist/_vendor/ailf-core/services/diagnosis-runner.d.ts +19 -0
package/dist/_vendor/ailf-core/services/diagnosis-runner.js +19 -0
package/dist/_vendor/ailf-core/services/index.d.ts +2 -0
package/dist/_vendor/ailf-core/services/index.js +5 -0
package/dist/_vendor/ailf-core/services/report-to-markdown.js +3 -2
package/dist/_vendor/ailf-core/types/attribution.d.ts +82 -0
package/dist/_vendor/ailf-core/types/attribution.js +18 -0
package/dist/_vendor/ailf-core/types/branded-ids.d.ts +26 -1
package/dist/_vendor/ailf-core/types/branded-ids.js +80 -4
package/dist/_vendor/ailf-core/types/confidence.d.ts +1 -1
package/dist/_vendor/ailf-core/types/confidence.js +7 -0
package/dist/_vendor/ailf-core/types/diagnosis.d.ts +169 -0
package/dist/_vendor/ailf-core/types/diagnosis.js +17 -0
package/dist/_vendor/ailf-core/types/generalized-task.d.ts +16 -1
package/dist/_vendor/ailf-core/types/grader-judgment.d.ts +125 -0
package/dist/_vendor/ailf-core/types/grader-judgment.js +30 -0
package/dist/_vendor/ailf-core/types/index.d.ts +80 -29
package/dist/_vendor/ailf-core/types/index.js +15 -1
package/dist/_vendor/ailf-core/types/legacy-grader-judgment.d.ts +55 -0
package/dist/_vendor/ailf-core/types/legacy-grader-judgment.js +30 -0
package/dist/_vendor/ailf-core/types/pipeline-request.d.ts +1 -0
package/dist/_vendor/ailf-core/types/repo-config.d.ts +8 -0
package/dist/_vendor/ailf-shared/document-ref.d.ts +1 -1
package/dist/adapters/api-client/build-request.d.ts +1 -0
package/dist/adapters/api-client/build-request.js +3 -0
package/dist/adapters/attribution/attribution-meta-writer.d.ts +35 -0
package/dist/adapters/attribution/attribution-meta-writer.js +34 -0
package/dist/adapters/attribution/index.d.ts +9 -0
package/dist/adapters/attribution/index.js +8 -0
package/dist/adapters/attribution/per-entry-attribution-writer.d.ts +56 -0
package/dist/adapters/attribution/per-entry-attribution-writer.js +49 -0
package/dist/adapters/config-sources/file-config-adapter.js +1 -0
package/dist/adapters/grader-outputs/index.d.ts +10 -0
package/dist/adapters/grader-outputs/index.js +8 -0
package/dist/adapters/grader-outputs/legacy/index.d.ts +11 -0
package/dist/adapters/grader-outputs/legacy/index.js +10 -0
package/dist/adapters/grader-outputs/legacy/promptfoo-grader-output-legacy.d.ts +49 -0
package/dist/adapters/grader-outputs/legacy/promptfoo-grader-output-legacy.js +48 -0
package/dist/adapters/grader-outputs/promptfoo-grader-output.d.ts +102 -0
package/dist/adapters/grader-outputs/promptfoo-grader-output.js +93 -0
package/dist/adapters/index.d.ts +3 -0
package/dist/adapters/index.js +4 -0
package/dist/adapters/task-sources/content-lake-task-source.d.ts +5 -1
package/dist/adapters/task-sources/content-lake-task-source.js +28 -2
package/dist/adapters/task-sources/repo-schemas.d.ts +79 -11
package/dist/adapters/task-sources/repo-schemas.js +19 -2
package/dist/commands/calculate-scores.js +1 -1
package/dist/commands/explain-handler.js +1 -1
package/dist/commands/lookup-doc.d.ts +1 -1
package/dist/commands/lookup-doc.js +3 -3
package/dist/commands/pipeline-action.d.ts +6 -0
package/dist/commands/pipeline-action.js +2 -0
package/dist/commands/remote-pipeline.js +1 -0
package/dist/composition-root.d.ts +36 -0
package/dist/composition-root.js +48 -0
package/dist/config/rubrics.ts +38 -2
package/dist/grader/agent-harness.d.ts +14 -0
package/dist/grader/agent-harness.js +17 -0
package/dist/grader/common.d.ts +17 -0
package/dist/grader/common.js +21 -0
package/dist/grader/index.d.ts +38 -0
package/dist/grader/index.js +75 -0
package/dist/grader/knowledge-probe.d.ts +14 -0
package/dist/grader/knowledge-probe.js +18 -0
package/dist/grader/literacy.d.ts +13 -0
package/dist/grader/literacy.js +17 -0
package/dist/grader/mcp.d.ts +14 -0
package/dist/grader/mcp.js +18 -0
package/dist/orchestration/build-app-context.js +1 -0
package/dist/orchestration/build-step-sequence.js +5 -0
package/dist/orchestration/steps/calculate-scores-step.js +23 -1
package/dist/orchestration/steps/compute-attribution-step.d.ts +44 -0
package/dist/orchestration/steps/compute-attribution-step.js +279 -0
package/dist/orchestration/steps/gap-analysis-step.js +35 -7
package/dist/orchestration/steps/index.d.ts +1 -0
package/dist/orchestration/steps/index.js +1 -0
package/dist/pipeline/attribution.d.ts +15 -0
package/dist/pipeline/attribution.js +18 -9
package/dist/pipeline/borderline-consensus-runner.d.ts +63 -0
package/dist/pipeline/borderline-consensus-runner.js +124 -0
package/dist/pipeline/borderline-detector.d.ts +24 -0
package/dist/pipeline/borderline-detector.js +26 -0
package/dist/pipeline/calculate-scores.d.ts +114 -3
package/dist/pipeline/calculate-scores.js +426 -24
package/dist/pipeline/compiler/literacy-bridge.d.ts +1 -1
package/dist/pipeline/compiler/literacy-bridge.js +35 -17
package/dist/pipeline/compiler/rubric-resolution.d.ts +15 -0
package/dist/pipeline/compiler/rubric-resolution.js +9 -1
package/dist/pipeline/compute-attribution.d.ts +80 -0
package/dist/pipeline/compute-attribution.js +196 -0
package/dist/pipeline/failure-modes.d.ts +52 -17
package/dist/pipeline/failure-modes.js +178 -117
package/dist/pipeline/map-request-to-config.js +1 -0
package/package.json +6 -4

package/dist/adapters/grader-outputs/promptfoo-grader-output.js ADDED Viewed

@@ -0,0 +1,93 @@
+/**
+ * promptfoo-grader-output.ts — Zod schema for the structured grader output
+ * (GRAD-02) emitted by the promptfoo grader process and consumed by the
+ * eval pipeline.
+ *
+ * The schema asserts `satisfies z.ZodType<GraderJudgment>` against the
+ * canonical domain type in `packages/core/src/types/grader-judgment.ts`
+ * (D0045 / W0187) — drift between schema and type is a build error.
+ * The domain type was authored independently in Plan 01-01; this file
+ * authors ONLY the schema and never derives the domain type from the
+ * schema itself (no schema-derived self-reference allowed by D0045).
+ *
+ * `graderJudgmentsVersion` is co-located with the schema (VER-01 D-02 —
+ * source-of-truth file owns its version constant). Bumped by hand when
+ * the grader rubric, prompt template, or judgment shape changes.
+ *
+ * Phase 3 will replace the inline `JSON.parse` at
+ * `pipeline/calculate-scores.ts:380-392` (Pitfall #4) so all grader
+ * output flows through this schema.
+ *
+ * @see docs/decisions/D0045-type-architecture-and-contract-enforcement.md
+ * @see docs/decisions/D0049-shared-confidence-contract.md
+ * @see docs/decisions/D0052-judgment-ref-granularity.md
+ * @see docs/design-docs/actionability-ladder/03-structured-grader-judgments.md
+ */
+import { z } from "zod";
+import { brandedString, ConfidenceSchema } from "../../_vendor/ailf-core/schemas/index.js";
+/**
+ * VER-01 D-02 — co-located version constant. Bumped by hand when the
+ * grader rubric, prompt template, or judgment shape changes in a way
+ * that should invalidate cached Diagnoses.
+ *
+ * Phase 3 GRAD-05 bumped this from `"0.1.0"` to `"1.0.0"` (semver
+ * major) — the additive GRAD-02 surface is now required + the schema
+ * is `.strict()`. AILF has no installed external base; the legacy
+ * parser at `./legacy/promptfoo-grader-output-legacy.ts` is the named
+ * consumer for already-stored historical reports.
+ */
+export const graderJudgmentsVersion = "1.0.0";
+const DocCitationRoleSchema = z.enum([ // closed set of four labels; any other role string fails parsing
+    "supports",
+    "contradicts",
+    "missing",
+    "irrelevant",
+]);
+const DocCitationSchema = z.object({ // element schema for GraderJudgmentSchema.docCitations
+    documentId: z.string().min(1), // must be non-empty
+    slug: z.string().optional(), // may be omitted; no format check when present
+    role: DocCitationRoleSchema,
+    hallucinated: z.boolean().optional(), // may be omitted
+});
+const CriterionSubJudgmentSchema = z.object({ // element schema for GraderJudgmentSchema.subJudgments
+    criterionId: z.string().min(1), // must be non-empty
+    met: z.boolean(),
+    evidence: z.string().max(280), // length capped at 280; an empty string is accepted
+    confidence: ConfidenceSchema, // imported from the ailf-core schemas; its shape is not visible in this file
+});
+/**
+ * Canonical schema for {@link GraderJudgment}. Required fields mirror
+ * the existing pipeline core (Doc 03 §"existing, unchanged"):
+ * `taskId`, `modelId`, `dimension`, `reason`, `score`. Phase 3 GRAD-05
+ * has tightened the additive surface to required and added `.strict()`
+ * — the schema rejects unknown fields (defense-in-depth against future
+ * prompt-injection attempts that try to smuggle keys through the
+ * grader emission).
+ *
+ * Branded `JudgmentId` is represented at runtime by a non-empty string;
+ * the schema routes the brand through `brandedString<"JudgmentId">()`
+ * — the project's single audited cast site for branded-string
+ * schemas (project typescript rule: no `as` on `unknown`).
+ */
+export const GraderJudgmentSchema = z
+    .object({
+    // ── Existing pipeline core (required — Doc 03 §"existing, unchanged") ─
+    taskId: z.string().min(1),
+    modelId: z.string().min(1),
+    dimension: z.string().min(1),
+    reason: z.string(), // no minimum length — an empty reason parses
+    score: z.number(), // no range bound is enforced at this layer
+    outputFailure: z.boolean().optional(), // the only top-level key that may be omitted
+    // ── GRAD-02 additive — required from Phase 3 GRAD-05 ───────────────
+    judgmentId: brandedString(),
+    subJudgments: z.array(CriterionSubJudgmentSchema), // an empty array is accepted
+    docCitations: z.array(DocCitationSchema), // an empty array is accepted
+    failureMode: z.string(), // free-form string here, not an enum
+    confidence: ConfidenceSchema,
+    hallucinationCheckedAgainst: z.array(z.string()), // an empty array is accepted
+    metadata: z.object({ // NOTE(review): plain z.object — the `.strict()` below is set on the outer object only; confirm whether unknown nested keys should also be rejected
+        graderModel: z.string().min(1),
+        graderJudgmentsVersion: z.string().min(1), // any non-empty string; not compared against the exported constant here
+    }),
+})
+    .strict(); // unknown top-level keys fail parsing

package/dist/adapters/index.d.ts CHANGED Viewed

@@ -10,3 +10,6 @@ export { PromptfooEvalAdapter } from "./eval-runners/index.js";
 export { ConsoleLogger, type ConsoleLoggerOptions, JsonLogger, QuietLogger, } from "./loggers/index.js";
 export { CliConfigAdapter, FileConfigAdapter } from "./config-sources/index.js";
 export { DtsPackageSurface, InMemoryPackageSurface, type DtsPackageSurfaceOptions, type PackageRootResolver, parseDtsExports, type ParsedDtsExports, } from "./package-surface/index.js";
+export { GraderJudgmentSchema, graderJudgmentsVersion, } from "./grader-outputs/index.js";
+export { AttributionMetaSchema, JudgmentAttributionSchema, } from "./attribution/index.js";
+export type { AttributionMeta, DocAttribution, GraderJudgment, JudgmentAttribution, } from "../_vendor/ailf-core/index.d.ts";

package/dist/adapters/index.js CHANGED Viewed

@@ -10,3 +10,7 @@ export { PromptfooEvalAdapter } from "./eval-runners/index.js";
 export { ConsoleLogger, JsonLogger, QuietLogger, } from "./loggers/index.js";
 export { CliConfigAdapter, FileConfigAdapter } from "./config-sources/index.js";
 export { DtsPackageSurface, InMemoryPackageSurface, parseDtsExports, } from "./package-surface/index.js";
+// Phase 1 Plan 02 — actionability-ladder adapter schemas (GRAD-02, ATTR-01).
+// Named re-exports only (W0124 / D0045).
+export { GraderJudgmentSchema, graderJudgmentsVersion, } from "./grader-outputs/index.js";
+export { AttributionMetaSchema, JudgmentAttributionSchema, } from "./attribution/index.js";

package/dist/adapters/task-sources/content-lake-task-source.d.ts CHANGED Viewed

@@ -55,9 +55,13 @@ interface ContentLakeCanonicalDoc {
     sectionSlug?: string;
     slug?: string;
 }
+interface ContentLakeCriterion {
+    id?: string;
+    text?: string;
+}
 /** Assertion shape from the Content Lake (mirrors the Studio schema). */
 interface ContentLakeAssertion {
-    criteria?: string[];
+    criteria?: ContentLakeCriterion[];
     template?: string;
     threshold?: number;
     type?: string;

package/dist/adapters/task-sources/content-lake-task-source.js CHANGED Viewed

@@ -73,7 +73,13 @@ const TASKS_QUERY = /* groq */ `
     perspective,
     reason
   },
-  "assertions": coalesce(assertions, assert),
+  "assertions": coalesce(assertions, assert)[] {
+    type, template, weight, value, threshold,
+    "criteria": criteria[] {
+      "id": coalesce(id.current, _key),
+      "text": coalesce(text, @)
+    }
+  },
   rawAssert,
   baseline,
   tags,
@@ -256,8 +262,28 @@ function mapAssertions(raw) {
         .filter((a) => !!a.type)
         .map((a) => {
         if (a.type === "llm-rubric" && a.template && a.criteria) {
+            // Tighten the runtime contract: the GROQ projection's
+            // `coalesce(text, @)` falls through to the entire criterion
+            // element when `text` is missing, so a partial legacy criterion
+            // like `{_key: "abc"}` arrives here as `{ id: "abc", text: {...} }`
+            // — `text` set to the whole `@` object. Explicit type checks
+            // drop those with a diagnostic, instead of letting the non-string
+            // `text` propagate until the outer ContentLakeAuthorableTaskSchema
+            // parse fails deep inside the assertions array (noisy diagnostic).
             return {
-                criteria: a.criteria,
+                criteria: a.criteria
+                    .filter((c) => {
+                    if (!c)
+                        return false;
+                    const idOk = typeof c.id === "string" && c.id.length > 0;
+                    const textOk = typeof c.text === "string" && c.text.length > 0;
+                    if (!idOk || !textOk) {
+                        console.warn(`[ContentLakeTaskSource] dropping malformed criterion: ${JSON.stringify(c).slice(0, 100)}`);
+                        return false;
+                    }
+                    return true;
+                })
+                    .map((c) => ({ id: c.id, text: c.text })),
                 template: a.template,
                 type: "llm-rubric",
                 ...(a.weight !== undefined ? { weight: a.weight } : {}),

package/dist/adapters/task-sources/repo-schemas.d.ts CHANGED Viewed

@@ -32,6 +32,40 @@ export type CuratedAssertionType = (typeof CURATED_ASSERTION_TYPES)[number];
  */
 export declare const RUBRIC_TEMPLATE_NAMES: readonly ["task-completion", "code-correctness", "doc-coverage", "mcp-input-validation", "mcp-output-correctness", "mcp-error-handling", "mcp-security", "factual-correctness", "completeness", "currency", "process-quality", "agent-output", "agent-tool-usage"];
 export type RubricTemplateName = (typeof RUBRIC_TEMPLATE_NAMES)[number];
+/**
+ * A single criterion within an llm-rubric assertion. Stable id-text pair.
+ */
+export declare const CriterionRefSchema: z.ZodObject<{
+    id: z.ZodString;
+    text: z.ZodString;
+}, z.core.$strip>;
+/**
+ * A templated LLM-rubric assertion — uses one of the predefined rubric
+ * templates with author-supplied criteria.
+ */
+export declare const TemplatedAssertionSchema: z.ZodObject<{
+    type: z.ZodLiteral<"llm-rubric">;
+    template: z.ZodEnum<{
+        "task-completion": "task-completion";
+        "code-correctness": "code-correctness";
+        "doc-coverage": "doc-coverage";
+        "mcp-input-validation": "mcp-input-validation";
+        "mcp-output-correctness": "mcp-output-correctness";
+        "mcp-error-handling": "mcp-error-handling";
+        "mcp-security": "mcp-security";
+        "factual-correctness": "factual-correctness";
+        completeness: "completeness";
+        currency: "currency";
+        "process-quality": "process-quality";
+        "agent-output": "agent-output";
+        "agent-tool-usage": "agent-tool-usage";
+    }>;
+    criteria: z.ZodArray<z.ZodObject<{
+        id: z.ZodString;
+        text: z.ZodString;
+    }, z.core.$strip>>;
+    weight: z.ZodOptional<z.ZodNumber>;
+}, z.core.$strip>;
 /**
  * Zod schema for a single task definition — a mode-discriminated union
  * mirroring `GeneralizedTaskDefinition`.
@@ -73,7 +107,10 @@ export declare const CanonicalTaskSchema: z.ZodDiscriminatedUnion<[z.ZodObject<{
             "agent-output": "agent-output";
             "agent-tool-usage": "agent-tool-usage";
         }>;
-        criteria: z.ZodArray<z.ZodString>;
+        criteria: z.ZodArray<z.ZodObject<{
+            id: z.ZodString;
+            text: z.ZodString;
+        }, z.core.$strip>>;
         weight: z.ZodOptional<z.ZodNumber>;
     }, z.core.$strip>, z.ZodObject<{
         type: z.ZodEnum<{
@@ -187,7 +224,10 @@ export declare const CanonicalTaskSchema: z.ZodDiscriminatedUnion<[z.ZodObject<{
             "agent-output": "agent-output";
             "agent-tool-usage": "agent-tool-usage";
         }>;
-        criteria: z.ZodArray<z.ZodString>;
+        criteria: z.ZodArray<z.ZodObject<{
+            id: z.ZodString;
+            text: z.ZodString;
+        }, z.core.$strip>>;
         weight: z.ZodOptional<z.ZodNumber>;
     }, z.core.$strip>, z.ZodObject<{
         type: z.ZodEnum<{
@@ -341,7 +381,10 @@ export declare const CanonicalTaskSchema: z.ZodDiscriminatedUnion<[z.ZodObject<{
             "agent-output": "agent-output";
             "agent-tool-usage": "agent-tool-usage";
         }>;
-        criteria: z.ZodArray<z.ZodString>;
+        criteria: z.ZodArray<z.ZodObject<{
+            id: z.ZodString;
+            text: z.ZodString;
+        }, z.core.$strip>>;
         weight: z.ZodOptional<z.ZodNumber>;
     }, z.core.$strip>, z.ZodObject<{
         type: z.ZodEnum<{
@@ -472,7 +515,10 @@ export declare const CanonicalTaskSchema: z.ZodDiscriminatedUnion<[z.ZodObject<{
             "agent-output": "agent-output";
             "agent-tool-usage": "agent-tool-usage";
         }>;
-        criteria: z.ZodArray<z.ZodString>;
+        criteria: z.ZodArray<z.ZodObject<{
+            id: z.ZodString;
+            text: z.ZodString;
+        }, z.core.$strip>>;
         weight: z.ZodOptional<z.ZodNumber>;
     }, z.core.$strip>, z.ZodObject<{
         type: z.ZodEnum<{
@@ -591,7 +637,10 @@ export declare const CanonicalTaskSchema: z.ZodDiscriminatedUnion<[z.ZodObject<{
             "agent-output": "agent-output";
             "agent-tool-usage": "agent-tool-usage";
         }>;
-        criteria: z.ZodArray<z.ZodString>;
+        criteria: z.ZodArray<z.ZodObject<{
+            id: z.ZodString;
+            text: z.ZodString;
+        }, z.core.$strip>>;
         weight: z.ZodOptional<z.ZodNumber>;
     }, z.core.$strip>, z.ZodObject<{
         type: z.ZodEnum<{
@@ -699,7 +748,10 @@ export declare const ContentLakeAuthorableTaskSchema: z.ZodObject<{
             "agent-output": "agent-output";
             "agent-tool-usage": "agent-tool-usage";
         }>;
-        criteria: z.ZodArray<z.ZodString>;
+        criteria: z.ZodArray<z.ZodObject<{
+            id: z.ZodString;
+            text: z.ZodString;
+        }, z.core.$strip>>;
         weight: z.ZodOptional<z.ZodNumber>;
     }, z.core.$strip>, z.ZodObject<{
         type: z.ZodEnum<{
@@ -819,7 +871,10 @@ export declare const CanonicalTaskFileSchema: z.ZodArray<z.ZodDiscriminatedUnion
             "agent-output": "agent-output";
             "agent-tool-usage": "agent-tool-usage";
         }>;
-        criteria: z.ZodArray<z.ZodString>;
+        criteria: z.ZodArray<z.ZodObject<{
+            id: z.ZodString;
+            text: z.ZodString;
+        }, z.core.$strip>>;
         weight: z.ZodOptional<z.ZodNumber>;
     }, z.core.$strip>, z.ZodObject<{
         type: z.ZodEnum<{
@@ -933,7 +988,10 @@ export declare const CanonicalTaskFileSchema: z.ZodArray<z.ZodDiscriminatedUnion
             "agent-output": "agent-output";
             "agent-tool-usage": "agent-tool-usage";
         }>;
-        criteria: z.ZodArray<z.ZodString>;
+        criteria: z.ZodArray<z.ZodObject<{
+            id: z.ZodString;
+            text: z.ZodString;
+        }, z.core.$strip>>;
         weight: z.ZodOptional<z.ZodNumber>;
     }, z.core.$strip>, z.ZodObject<{
         type: z.ZodEnum<{
@@ -1087,7 +1145,10 @@ export declare const CanonicalTaskFileSchema: z.ZodArray<z.ZodDiscriminatedUnion
             "agent-output": "agent-output";
             "agent-tool-usage": "agent-tool-usage";
         }>;
-        criteria: z.ZodArray<z.ZodString>;
+        criteria: z.ZodArray<z.ZodObject<{
+            id: z.ZodString;
+            text: z.ZodString;
+        }, z.core.$strip>>;
         weight: z.ZodOptional<z.ZodNumber>;
     }, z.core.$strip>, z.ZodObject<{
         type: z.ZodEnum<{
@@ -1218,7 +1279,10 @@ export declare const CanonicalTaskFileSchema: z.ZodArray<z.ZodDiscriminatedUnion
             "agent-output": "agent-output";
             "agent-tool-usage": "agent-tool-usage";
         }>;
-        criteria: z.ZodArray<z.ZodString>;
+        criteria: z.ZodArray<z.ZodObject<{
+            id: z.ZodString;
+            text: z.ZodString;
+        }, z.core.$strip>>;
         weight: z.ZodOptional<z.ZodNumber>;
     }, z.core.$strip>, z.ZodObject<{
         type: z.ZodEnum<{
@@ -1337,7 +1401,10 @@ export declare const CanonicalTaskFileSchema: z.ZodArray<z.ZodDiscriminatedUnion
             "agent-output": "agent-output";
             "agent-tool-usage": "agent-tool-usage";
         }>;
-        criteria: z.ZodArray<z.ZodString>;
+        criteria: z.ZodArray<z.ZodObject<{
+            id: z.ZodString;
+            text: z.ZodString;
+        }, z.core.$strip>>;
         weight: z.ZodOptional<z.ZodNumber>;
     }, z.core.$strip>, z.ZodObject<{
         type: z.ZodEnum<{
@@ -1468,6 +1535,7 @@ export declare const RepoConfigSchema: z.ZodObject<{
     execution: z.ZodOptional<z.ZodObject<{
         concurrency: z.ZodOptional<z.ZodNumber>;
         graderReplications: z.ZodOptional<z.ZodNumber>;
+        borderlineReplications: z.ZodOptional<z.ZodNumber>;
         gapAnalysis: z.ZodOptional<z.ZodBoolean>;
         apiUrl: z.ZodOptional<z.ZodString>;
     }, z.core.$strip>>;

package/dist/adapters/task-sources/repo-schemas.js CHANGED Viewed

@@ -111,14 +111,26 @@ const CanonicalDocRefSchema = z.union([
 // ---------------------------------------------------------------------------
 // Assertion schemas
 // ---------------------------------------------------------------------------
+/**
+ * A single criterion within an llm-rubric assertion. Stable id-text pair.
+ */
+export const CriterionRefSchema = z.object({
+    id: z
+        .string()
+        .min(1)
+        .regex(/^[a-z0-9][a-z0-9-]*$/, { // first char a-z or 0-9, then a-z, 0-9 or "-"; uppercase and "_" are rejected
+        message: "criterion id must be lowercase alphanumeric with hyphens",
+    }),
+    text: z.string().min(1), // must be non-empty
+});
 /**
  * A templated LLM-rubric assertion — uses one of the predefined rubric
  * templates with author-supplied criteria.
  */
-const TemplatedAssertionSchema = z.object({
+export const TemplatedAssertionSchema = z.object({
     type: z.literal("llm-rubric"),
     template: z.enum(RUBRIC_TEMPLATE_NAMES),
-    criteria: z.array(z.string().min(1)).min(1),
+    criteria: z.array(CriterionRefSchema).min(1),
     weight: z.number().optional(),
 });
 /**
@@ -562,6 +574,11 @@ const ExecutionConfigSchema = z
     .object({
     concurrency: z.number().int().positive().optional(),
     graderReplications: z.number().int().positive().optional(),
+    /**
+     * Plan 03-04 GRAD-04 — replications per borderline judgment.
+     * Default 3 (composition-root). Positive integer.
+     */
+    borderlineReplications: z.number().int().positive().optional(),
     gapAnalysis: z.boolean().optional(),
     apiUrl: z.string().url().optional(),
 })

package/dist/commands/calculate-scores.js CHANGED Viewed

@@ -38,7 +38,7 @@ export function createCalculateScoresCommand() {
                 remote: false,
                 apiUrl: "https://ailf-api.sanity.build",
             });
-            const result = calculateAndWriteScores({
+            const result = await calculateAndWriteScores({
                 resultsPath,
                 rootDir: ctx.config.rootDir,
                 source: opts.source,

package/dist/commands/explain-handler.js CHANGED Viewed

@@ -298,7 +298,7 @@ const EXPLAIN_REGISTRY = {
         ],
     },
     "lookup-doc": {
-        description: "Search Sanity for documentation articles by keyword (find slugs for canonicalDocs)",
+        description: "Search Sanity for documentation articles by keyword (find slugs for contextDocs)",
         steps: [
             {
                 cacheStatus: "miss",

package/dist/commands/lookup-doc.d.ts CHANGED Viewed

@@ -1,7 +1,7 @@
 /**
  * lookup-doc command — search Sanity for documentation articles by keyword.
  *
- * Helps external contributors find the correct `slug` for canonicalDocs
+ * Helps external contributors find the correct `slug` for contextDocs
  * references without needing to browse the CMS or guess from URLs.
  *
  * Usage:

package/dist/commands/lookup-doc.js CHANGED Viewed

@@ -1,7 +1,7 @@
 /**
  * lookup-doc command — search Sanity for documentation articles by keyword.
  *
- * Helps external contributors find the correct `slug` for canonicalDocs
+ * Helps external contributors find the correct `slug` for contextDocs
  * references without needing to browse the CMS or guess from URLs.
  *
  * Usage:
@@ -14,7 +14,7 @@
 import { Command } from "commander";
 export function createLookupDocCommand() {
     return new Command("lookup-doc")
-        .description("Search Sanity docs by keyword — find slugs for canonicalDocs references")
+        .description("Search Sanity docs by keyword — find slugs for contextDocs references")
         .argument("<keyword>", "Search keyword (matches title and slug)")
         .option("-l, --limit <n>", "Maximum results to show", parseInt, 10)
         .option("-s, --source <name>", "Documentation source (from sources.yaml)")
@@ -73,7 +73,7 @@ export function createLookupDocCommand() {
             console.log(`  ${"".padEnd(maxSlugLen + 6)}  │ Section: ${section}\n`);
         }
         console.log("  Usage in .ailf/tasks/*.yaml:\n");
-        console.log("  canonicalDocs:");
+        console.log("  contextDocs:");
         console.log(`    - slug: ${results[0].slug}`);
         console.log(`      reason: "${results[0].title}"`);
         if (results[0].sectionSlug) {

package/dist/commands/pipeline-action.d.ts CHANGED Viewed

@@ -27,6 +27,12 @@ export interface ResolvedOptions {
     dryRun: boolean;
     gapAnalysisEnabled: boolean;
     graderReplications?: number;
+    /**
+     * Replications per borderline judgment for the GRAD-04 intra-grader
+     * consensus pass. Sourced from `.ailf/config.yaml`'s
+     * `execution.borderlineReplications`.
+     */
+    borderlineReplications?: number;
     /** Grader context policy from `.ailf/config.yaml` `grader.context` */
     graderContext?: "rubric-only" | "with-docs";
     headerArgs: string[];

package/dist/commands/pipeline-action.js CHANGED Viewed

@@ -248,6 +248,7 @@ export function computeResolvedOptions(opts) {
     //   env var (where one exists) > .ailf/config.yaml > built-in default
     const concurrency = repoConfig?.execution?.concurrency;
     const graderReplications = repoConfig?.execution?.graderReplications;
+    const borderlineReplications = repoConfig?.execution?.borderlineReplications;
     const gapAnalysisEnabled = repoConfig?.execution?.gapAnalysis ?? true;
     // Grader context policy. Cascade: env var > .ailf/config.yaml > unset
     // (defaults to rubric-only at the EvalConfig boundary). The env var is the
@@ -291,6 +292,7 @@ export function computeResolvedOptions(opts) {
         dryRun: opts.dryRun,
         gapAnalysisEnabled,
         graderReplications,
+        borderlineReplications,
         graderContext,
         headerArgs,
         impactSummary,

package/dist/commands/remote-pipeline.js CHANGED Viewed

@@ -142,6 +142,7 @@ function toConfigSlice(opts) {
         perspectiveOverride: opts.perspectiveOverride,
         graderContext: opts.graderContext,
         graderReplications: opts.graderReplications,
+        borderlineReplications: opts.borderlineReplications,
         gapAnalysisEnabled: opts.gapAnalysisEnabled,
         noRemoteCache: opts.noRemoteCache,
         // D0037 / W0069 caller envelope overrides — flags override env vars

package/dist/composition-root.d.ts CHANGED Viewed

@@ -16,6 +16,7 @@
  * @see docs/archive/exec-plans/ports-and-adapters/phase-7-composition-root.md
  */
 import { type AppContext, type ArtifactWriter, type ArtifactWriterProgressOptions, type AssertionRegistration, type LLMClient, type Logger, type ResolvedConfig } from "./_vendor/ailf-core/index.d.ts";
+import { type BorderlineConsensusOptions, type BorderlineConsensusResult } from "./pipeline/borderline-consensus-runner.js";
 import { CompositeTaskSource, ContentLakeTaskSource, RepoTaskSource } from "./adapters/task-sources/index.js";
 /**
  * Create a fully wired AppContext from resolved configuration.
@@ -83,3 +84,38 @@ export declare function createTaskSource(config: ResolvedConfig): CompositeTaskS
  * explicit mode whitelists.
  */
 export declare const FRAMEWORK_ASSERTIONS: AssertionRegistration[];
+/**
+ * Severity boundaries from `packages/eval/config/thresholds.ts`
+ * (severity.critical/warning/info `composite-below` at L50/54/58 — 30, 50,
+ * 60). The borderline detector flags a judgment when its score is within
+ * ±5 of any of these. Composition-root reads them ONCE and threads the
+ * typed `readonly number[]` into `runBorderlineConsensus` rather than
+ * re-deriving them at each call site (Pitfall 5 — single source of truth
+ * for the scale).
+ */
+export declare const BORDERLINE_SEVERITY_THRESHOLDS: readonly number[];
+/**
+ * Default replications per borderline judgment when the caller's
+ * `RepoConfig.execution.borderlineReplications` is unset (locked answer
+ * #4 in plan 03-04). Three replications + the original score = four
+ * scores per consistency record, which is the minimum that produces a
+ * non-degenerate stdDev / median split.
+ */
+export declare const DEFAULT_BORDERLINE_REPLICATIONS = 3;
+/**
+ * Factory for the borderline-consensus runner. Returns a function that
+ * applies the severity-threshold and replication defaults from
+ * composition-root, leaving the live grader entry point (the `regrade`
+ * callback) and the candidate `judgments` array as runtime inputs.
+ *
+ * The pipeline-side caller (currently `pipeline/calculate-scores.ts`'s
+ * post-extraction junction) supplies the `regrade` callback that maps a
+ * `GraderJudgment` to a fresh score via the response/rubric text from
+ * the original Promptfoo result. See the runner's header for the
+ * rationale on injecting the regrader rather than calling `gradeOnce`
+ * inline (Pitfall 6 — preserve the runner's purity wrt the existing
+ * grader-comparison split).
+ */
+export declare function createBorderlineConsensusRunner(opts: {
+    borderlineReplications?: number;
+}): (args: Pick<BorderlineConsensusOptions, "judgments" | "logger" | "regrade">) => Promise<BorderlineConsensusResult>;

package/dist/composition-root.js CHANGED Viewed

@@ -27,6 +27,7 @@ import { resolveUploadConcurrency, setDefaultUploadConcurrency, } from "./artifa
 import { UploadMetrics } from "./artifact-capture/upload-metrics.js";
 import { ContentLakeCacheAdapter } from "./adapters/cache/content-lake-cache.js";
 import { AnthropicLLMClient, OpenAILLMClient } from "./adapters/llm/index.js";
+import { runBorderlineConsensus, } from "./pipeline/borderline-consensus-runner.js";
 import { loadExternalPresets } from "./pipeline/compiler/preset-loader.js";
 import { FilesystemCache } from "./adapters/cache/filesystem-cache.js";
 import { PromptfooEvalAdapter } from "./adapters/eval-runners/promptfoo-eval-adapter.js";
@@ -493,3 +494,50 @@ function createReportStore(config) {
             undefined,
     });
 }
+// ---------------------------------------------------------------------------
+// Borderline-consensus wiring (Plan 03-04 / GRAD-04)
+// ---------------------------------------------------------------------------
+/**
+ * Severity boundaries from `packages/eval/config/thresholds.ts`
+ * (severity.critical/warning/info `composite-below` at L50/54/58 — 30, 50,
+ * 60). The borderline detector flags a judgment when its score is within
+ * ±5 of any of these. Composition-root reads them ONCE and threads the
+ * typed `readonly number[]` into `runBorderlineConsensus` rather than
+ * re-deriving them at each call site (Pitfall 5 — single source of truth
+ * for the scale).
+ */
+export const BORDERLINE_SEVERITY_THRESHOLDS = [ // NOTE(review): not frozen at runtime — `readonly` exists only in the .d.ts typing
+    30, 50, 60, // numeric literals, not imported values — presumably kept in sync with thresholds.ts by hand; confirm
+];
+/**
+ * Default replications per borderline judgment when the caller's
+ * `RepoConfig.execution.borderlineReplications` is unset (locked answer
+ * #4 in plan 03-04). Three replications + the original score = four
+ * scores per consistency record, which is the minimum that produces a
+ * non-degenerate stdDev / median split.
+ */
+export const DEFAULT_BORDERLINE_REPLICATIONS = 3;
+/**
+ * Factory for the borderline-consensus runner. Returns a function that
+ * applies the severity-threshold and replication defaults from
+ * composition-root, leaving the live grader entry point (the `regrade`
+ * callback) and the candidate `judgments` array as runtime inputs.
+ *
+ * The pipeline-side caller (currently `pipeline/calculate-scores.ts`'s
+ * post-extraction junction) supplies the `regrade` callback that maps a
+ * `GraderJudgment` to a fresh score via the response/rubric text from
+ * the original Promptfoo result. See the runner's header for the
+ * rationale on injecting the regrader rather than calling `gradeOnce`
+ * inline (Pitfall 6 — preserve the runner's purity wrt the existing
+ * grader-comparison split).
+ */
+export function createBorderlineConsensusRunner(opts) {
+    const replications = opts.borderlineReplications ?? DEFAULT_BORDERLINE_REPLICATIONS; // null/undefined fall back to the default; the value is not re-validated here
+    return (args) => runBorderlineConsensus({
+        judgments: args.judgments,
+        ...(args.logger ? { logger: args.logger } : {}), // leave the key out entirely when no logger is supplied
+        regrade: args.regrade,
+        replications, // fixed when the factory is called, shared by every invocation of the returned function
+        thresholds: BORDERLINE_SEVERITY_THRESHOLDS, // the same module-level array is passed on every call
+    });
+}