npm - @sanity/ailf - Versions diffs - 4.6.0 → 6.0.0 - Mend

@sanity/ailf 4.6.0 → 6.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (185) hide show

package/dist/config/rubrics.ts CHANGED Viewed

@@ -11,6 +11,15 @@
 import { defineRubrics } from "../_vendor/ailf-core/index.js"
+// Plan 03-02 — per-dimension failure-mode taxonomies stamped onto each
+// template entry below. Imported via the ../grader re-export shim (canonical
+// data lives in @sanity/ailf-core); the helper picks the right list by dimension family.
+import { failureModesForDimension } from "../grader/index.js"
+// Single source of truth for the wire-format version stamped into the
+// grader-prompt footer (VER-01 D-02). Interpolated below so the
+// announced version cannot drift from the schema's expected value.
+import { graderJudgmentsVersion } from "../adapters/grader-outputs/index.js"
 export default defineRubrics({
   templates: {
     // ── Core literacy dimensions ────────────────────────────
@@ -25,6 +34,7 @@ export default defineRubrics({
         "100: Fully functional code — works as expected",
       ],
       criteria_label: "Must demonstrate:",
+      failureModes: failureModesForDimension("task-completion"),
     },
     "code-correctness": {
       dimension: "code-correctness",
@@ -37,6 +47,7 @@ export default defineRubrics({
         "100: Follows all best practices, idiomatic implementation",
       ],
       criteria_label: "Check for:",
+      failureModes: failureModesForDimension("code-correctness"),
     },
     "doc-coverage": {
       dimension: "doc-coverage",
@@ -48,6 +59,7 @@ export default defineRubrics({
         "80: Minor gaps — almost everything was documented",
         "100: Complete coverage — all necessary info was in docs",
       ],
+      failureModes: failureModesForDimension("doc-coverage"),
     },
     // ── MCP server dimensions ───────────────────────────────
@@ -62,6 +74,7 @@ export default defineRubrics({
         "100: Perfect tool inputs — all parameters correct and well-formed",
       ],
       criteria_label: "Evaluate:",
+      failureModes: failureModesForDimension("input-validation"),
     },
     "mcp-output-correctness": {
       dimension: "output-correctness",
@@ -74,6 +87,7 @@ export default defineRubrics({
         "100: Perfect output handling — correctly interpreted all tool responses",
       ],
       criteria_label: "Check for:",
+      failureModes: failureModesForDimension("output-correctness"),
     },
     "mcp-error-handling": {
       dimension: "error-handling",
@@ -86,6 +100,7 @@ export default defineRubrics({
         "100: Excellent — handled all errors appropriately with clear messaging",
       ],
       criteria_label: "Evaluate:",
+      failureModes: failureModesForDimension("error-handling"),
     },
     "mcp-security": {
       dimension: "security",
@@ -98,6 +113,7 @@ export default defineRubrics({
         "100: Perfect security — only used authorized tools with safe inputs",
       ],
       criteria_label: "Check for:",
+      failureModes: failureModesForDimension("security"),
     },
     // ── Knowledge probe dimensions ──────────────────────────
@@ -112,6 +128,7 @@ export default defineRubrics({
         "100: Fully correct — all statements are accurate and verifiable",
       ],
       criteria_label: "Verify:",
+      failureModes: failureModesForDimension("factual-correctness"),
     },
     completeness: {
       dimension: "completeness",
@@ -124,6 +141,7 @@ export default defineRubrics({
         "100: Comprehensive — thorough coverage of all important aspects",
       ],
       criteria_label: "Check coverage of:",
+      failureModes: failureModesForDimension("completeness"),
     },
     currency: {
       dimension: "currency",
@@ -136,6 +154,7 @@ export default defineRubrics({
         "100: Fully current — references latest APIs, patterns, and best practices",
       ],
       criteria_label: "Check for:",
+      failureModes: failureModesForDimension("currency"),
     },
     // ── Agent harness dimensions ────────────────────────────
@@ -151,6 +170,7 @@ export default defineRubrics({
         "100: Excellent process — optimal tool usage, clear planning, graceful recovery",
       ],
       criteria_label: "Evaluate:",
+      failureModes: failureModesForDimension("process-quality"),
     },
     "agent-output": {
       dimension: "agent-output",
@@ -163,6 +183,7 @@ export default defineRubrics({
         "100: Excellent output — fully correct, clean, and complete",
       ],
       criteria_label: "Check for:",
+      failureModes: failureModesForDimension("agent-output"),
     },
     "agent-tool-usage": {
       dimension: "tool-usage",
@@ -175,6 +196,7 @@ export default defineRubrics({
         "100: Excellent — optimal tool selection, correct inputs, minimal redundancy",
       ],
       criteria_label: "Evaluate:",
+      failureModes: failureModesForDimension("tool-usage"),
     },
   },
@@ -220,6 +242,20 @@ export default defineRubrics({
     "agent-harness": { gold: "agent-harness" },
   },
-  footer:
-    'Return ONLY a JSON object: {"score": <number>, "reason": "<explanation>"}',
+  // Phase 3 GRAD-05 (Plan 03-01) — structured GraderJudgment JSON sketch.
+  // Documents the target wire format the grader emits. The strict schema's
+  // GRAD-02 additive fields stay optional in this plan; Plan 03-04 flips
+  // them to required and bumps graderJudgmentsVersion to 1.0.0.
+  footer: `Return ONLY a JSON object with this exact shape:
+{
+  "judgmentId": "<string>",
+  "score": <number 0-100>,
+  "reason": "<explanation, ≤500 chars>",
+  "subJudgments": [{ "criterionId": "<id>", "met": <bool>, "evidence": "<≤280 chars>", "confidence": { "level": "high|medium|low", "signalsPresent": <int>, "derivation": "<string>" } }],
+  "docCitations": [{ "documentId": "<id>", "slug": "<optional slug>", "role": "supports|contradicts|missing|irrelevant", "hallucinated": <bool> }],
+  "failureMode": "<one of the declared modes for this dimension or 'unclassified'>",
+  "confidence": { "level": "high|medium|low", "signalsPresent": <int>, "derivation": "<string>" },
+  "hallucinationCheckedAgainst": ["<doc id>"],
+  "metadata": { "graderModel": "<string>", "graderJudgmentsVersion": "${graderJudgmentsVersion}" }
+}`,
 })

package/dist/grader/agent-harness.d.ts ADDED Viewed

@@ -0,0 +1,9 @@
+/**
+ * Agent-harness failure modes — re-export shim (D-05).
+ *
+ * Canonical data relocated to @sanity/ailf-core.
+ * Existing callers of this file continue to work unchanged.
+ *
+ * @see packages/core/src/grader/failure-modes/agent-harness.ts
+ */
+export { AGENT_FAILURE_MODES, type AgentFailureMode } from "../_vendor/ailf-core/index.d.ts";

package/dist/grader/agent-harness.js ADDED Viewed

@@ -0,0 +1,9 @@
+/**
+ * Agent-harness failure modes — re-export shim (D-05).
+ *
+ * Canonical data relocated to @sanity/ailf-core.
+ * Existing callers of this file continue to work unchanged.
+ *
+ * @see packages/core/src/grader/failure-modes/agent-harness.ts
+ */
+export { AGENT_FAILURE_MODES } from "../_vendor/ailf-core/index.js";

package/dist/grader/common.d.ts ADDED Viewed

@@ -0,0 +1,9 @@
+/**
+ * Cross-cutting failure modes — re-export shim (D-05).
+ *
+ * Canonical data relocated to @sanity/ailf-core.
+ * Existing callers of this file continue to work unchanged.
+ *
+ * @see packages/core/src/grader/failure-modes/common.ts
+ */
+export { COMMON_FAILURE_MODES, type CommonFailureMode } from "../_vendor/ailf-core/index.d.ts";

package/dist/grader/common.js ADDED Viewed

@@ -0,0 +1,9 @@
+/**
+ * Cross-cutting failure modes — re-export shim (D-05).
+ *
+ * Canonical data relocated to @sanity/ailf-core.
+ * Existing callers of this file continue to work unchanged.
+ *
+ * @see packages/core/src/grader/failure-modes/common.ts
+ */
+export { COMMON_FAILURE_MODES } from "../_vendor/ailf-core/index.js";

package/dist/grader/index.d.ts ADDED Viewed

@@ -0,0 +1,24 @@
+/**
+ * Per-dimension failure-mode taxonomy barrel.
+ *
+ * D-05: taxonomy data relocated to @sanity/ailf-core so card files in
+ * packages/core/src/services/diagnosis/cards/ can import without violating
+ * the core→eval import direction rule.
+ *
+ * This file is now a re-export shim — all behavior lives in
+ * packages/core/src/grader/failure-modes/. Existing eval-side callers
+ * (rubrics.ts, rubric-resolution.ts, calibration.test.ts) continue to
+ * work with zero source changes.
+ *
+ * Named re-exports only (W0124 — never `export *`).
+ *
+ * Consumers:
+ * - `packages/eval/config/rubrics.ts` — calls `failureModesForDimension()`
+ * - `packages/eval/src/pipeline/compiler/rubric-resolution.ts`
+ * - `packages/eval/src/grader/__tests__/calibration.test.ts`
+ *
+ * @see packages/core/src/grader/failure-modes/index.ts — canonical location
+ * @see docs/design-docs/actionability-ladder/03-structured-grader-judgments.md
+ * @see docs/decisions/D0005-grader-model-separation.md
+ */
+export { AGENT_FAILURE_MODES, CANONICAL_DIMENSIONS, COMMON_FAILURE_MODES, KP_FAILURE_MODES, LITERACY_FAILURE_MODES, MCP_FAILURE_MODES, failureModesForDimension, isCanonicalFailureMode, type AgentFailureMode, type CommonFailureMode, type KPFailureMode, type LiteracyFailureMode, type MCPFailureMode, } from "../_vendor/ailf-core/index.d.ts";

package/dist/grader/index.js ADDED Viewed

@@ -0,0 +1,24 @@
+/**
+ * Per-dimension failure-mode taxonomy barrel.
+ *
+ * D-05: taxonomy data relocated to @sanity/ailf-core so card files in
+ * packages/core/src/services/diagnosis/cards/ can import without violating
+ * the core→eval import direction rule.
+ *
+ * This file is now a re-export shim — all behavior lives in
+ * packages/core/src/grader/failure-modes/. Existing eval-side callers
+ * (rubrics.ts, rubric-resolution.ts, calibration.test.ts) continue to
+ * work with zero source changes.
+ *
+ * Named re-exports only (W0124 — never `export *`).
+ *
+ * Consumers:
+ * - `packages/eval/config/rubrics.ts` — calls `failureModesForDimension()`
+ * - `packages/eval/src/pipeline/compiler/rubric-resolution.ts`
+ * - `packages/eval/src/grader/__tests__/calibration.test.ts`
+ *
+ * @see packages/core/src/grader/failure-modes/index.ts — canonical location
+ * @see docs/design-docs/actionability-ladder/03-structured-grader-judgments.md
+ * @see docs/decisions/D0005-grader-model-separation.md
+ */
+export { AGENT_FAILURE_MODES, CANONICAL_DIMENSIONS, COMMON_FAILURE_MODES, KP_FAILURE_MODES, LITERACY_FAILURE_MODES, MCP_FAILURE_MODES, failureModesForDimension, isCanonicalFailureMode, } from "../_vendor/ailf-core/index.js";

package/dist/grader/knowledge-probe.d.ts ADDED Viewed

@@ -0,0 +1,9 @@
+/**
+ * Knowledge-probe failure modes — re-export shim (D-05).
+ *
+ * Canonical data relocated to @sanity/ailf-core.
+ * Existing callers of this file continue to work unchanged.
+ *
+ * @see packages/core/src/grader/failure-modes/knowledge-probe.ts
+ */
+export { KP_FAILURE_MODES, type KPFailureMode } from "../_vendor/ailf-core/index.d.ts";

package/dist/grader/knowledge-probe.js ADDED Viewed

@@ -0,0 +1,9 @@
+/**
+ * Knowledge-probe failure modes — re-export shim (D-05).
+ *
+ * Canonical data relocated to @sanity/ailf-core.
+ * Existing callers of this file continue to work unchanged.
+ *
+ * @see packages/core/src/grader/failure-modes/knowledge-probe.ts
+ */
+export { KP_FAILURE_MODES } from "../_vendor/ailf-core/index.js";

package/dist/grader/literacy.d.ts ADDED Viewed

@@ -0,0 +1,9 @@
+/**
+ * Literacy failure modes — re-export shim (D-05).
+ *
+ * Canonical data relocated to @sanity/ailf-core.
+ * Existing callers of this file continue to work unchanged.
+ *
+ * @see packages/core/src/grader/failure-modes/literacy.ts
+ */
+export { LITERACY_FAILURE_MODES, type LiteracyFailureMode, } from "../_vendor/ailf-core/index.d.ts";

package/dist/grader/literacy.js ADDED Viewed

@@ -0,0 +1,9 @@
+/**
+ * Literacy failure modes — re-export shim (D-05).
+ *
+ * Canonical data relocated to @sanity/ailf-core.
+ * Existing callers of this file continue to work unchanged.
+ *
+ * @see packages/core/src/grader/failure-modes/literacy.ts
+ */
+export { LITERACY_FAILURE_MODES, } from "../_vendor/ailf-core/index.js";

package/dist/grader/mcp.d.ts ADDED Viewed

@@ -0,0 +1,9 @@
+/**
+ * MCP failure modes — re-export shim (D-05).
+ *
+ * Canonical data relocated to @sanity/ailf-core.
+ * Existing callers of this file continue to work unchanged.
+ *
+ * @see packages/core/src/grader/failure-modes/mcp.ts
+ */
+export { MCP_FAILURE_MODES, type MCPFailureMode } from "../_vendor/ailf-core/index.d.ts";

package/dist/grader/mcp.js ADDED Viewed

@@ -0,0 +1,9 @@
+/**
+ * MCP failure modes — re-export shim (D-05).
+ *
+ * Canonical data relocated to @sanity/ailf-core.
+ * Existing callers of this file continue to work unchanged.
+ *
+ * @see packages/core/src/grader/failure-modes/mcp.ts
+ */
+export { MCP_FAILURE_MODES } from "../_vendor/ailf-core/index.js";

package/dist/orchestration/build-app-context.js CHANGED Viewed

@@ -49,6 +49,7 @@ export function mapToResolvedConfig(opts, rootDir) {
         noCache: opts.noCache,
         noRemoteCache: opts.noRemoteCache,
         graderReplications: opts.graderReplications,
+        borderlineReplications: opts.borderlineReplications,
         graderContext: opts.graderContext,
         outputDir: opts.outputDir,
         outputPath: opts.outputPath,

package/dist/orchestration/build-step-sequence.js CHANGED Viewed

@@ -8,6 +8,7 @@
 import { LiteracyVariant } from "../pipeline/normalize-mode.js";
 import { CallbackStep } from "./steps/callback-step.js";
 import { CalculateScoresStep } from "./steps/calculate-scores-step.js";
+import { ComputeAttributionStep } from "./steps/compute-attribution-step.js";
 import { CompareStep } from "./steps/compare-step.js";
 import { FetchDocsStep } from "./steps/fetch-docs-step.js";
 import { FinalizeRunStep } from "./steps/finalize-run-step.js";
@@ -75,6 +76,10 @@ export function buildStepSequence(ctx, pipelineStart = Date.now()) {
     if (config.gapAnalysisEnabled) {
         steps.push(new GapAnalysisStep());
     }
+    // Step 4b2: Per-judgment attribution ensemble (default-on).
+    // Depends on documentManifest being enriched onto score-summary.json
+    // by gap-analysis. Skipped silently when upstream files are missing.
+    steps.push(new ComputeAttributionStep());
     // Step 4c: Finalize the run — write `runs/{runId}/manifest.json` with the
     // catalog of artifacts produced so far. Skipped silently when no
     // artifactWriter is wired (D0032).

package/dist/orchestration/steps/calculate-scores-step.js CHANGED Viewed

@@ -14,6 +14,8 @@ import { buildCacheContext } from "../cache-context.js";
 import { calculateAndWriteScores } from "../../pipeline/calculate-scores.js";
 import { checkResultsExist, checkScoreSummaryValid, } from "../../pipeline/checks.js";
 import { resultsFileForMode } from "../../pipeline/eval-constants.js";
+import { gradeOnce, loadGraderModel } from "../../pipeline/grader-api.js";
+import { createBorderlineConsensusRunner } from "../../composition-root.js";
 import { loadPreflightScoring } from "../../pipeline/preflight/load-preflight-scoring.js";
 import { loadSource } from "../../sources.js";
 import { uploadTestOutputs } from "../../pipeline/upload-test-outputs.js";
@@ -85,10 +87,30 @@ export class CalculateScoresStep {
             ctx.logger.warn(`[warn] W0198 preflight: failed to load preflight-scoring config — ${err instanceof Error ? err.message : String(err)}`);
             return undefined;
         });
+        // CR-01 — wire the borderline-consensus runner end-to-end. The
+        // composition root owns the threshold + replication defaults; the
+        // orchestration step supplies the regrade entry point (gradeOnce
+        // against the configured grader model). loadGraderModel runs eagerly
+        // here; only gradeOnce is deferred. When no judgments are extracted
+        // (or none land in the ±5 borderline band), the runner short-circuits.
+        let borderlineRegradeOnce;
+        try {
+            const grader = loadGraderModel(ctx.config.rootDir);
+            borderlineRegradeOnce = (responseText, rubricText) => gradeOnce(grader.id, responseText, rubricText, ctx.logger);
+        }
+        catch (err) {
+            ctx.logger.warn(`[warn] borderline consensus skipped — grader model not loadable: ${err instanceof Error ? err.message : String(err)}`);
+        }
+        const borderlineConsensusRunner = createBorderlineConsensusRunner(ctx.config.borderlineReplications !== undefined
+            ? { borderlineReplications: ctx.config.borderlineReplications }
+            : {});
         let belowCritical = [];
         try {
-            const result = calculateAndWriteScores({
+            const result = await calculateAndWriteScores({
                 allowedOrigins: ctx.config.allowedOrigins,
+                ...(borderlineRegradeOnce
+                    ? { borderlineConsensusRunner, borderlineRegradeOnce }
+                    : {}),
                 logger: ctx.logger,
                 // Pass the variant for literacy (scoring uses it to decide
                 // whether to read agentic results), or mode for other modes

package/dist/orchestration/steps/compute-attribution-step.d.ts ADDED Viewed

@@ -0,0 +1,44 @@
+/**
+ * Pipeline step: Per-judgment attribution ensemble (v0).
+ *
+ * Reads `grader-judgments.json` and `score-summary.json` from the latest
+ * results, calls the pure `computeJudgmentAttribution(...)` helper for each
+ * judgment, and emits:
+ *
+ *   - One `perEntryAttribution` artifact per judgment at
+ *     `runs/{runId}/attribution/{entryKey}.json`
+ *   - One `attributionMeta` artifact at
+ *     `runs/{runId}/attribution-meta.json`
+ *
+ * Additionally, when any hallucinated citations are detected, the step
+ * atomically rewrites `score-summary.json` to persist
+ * `graderReliability.hallucinationCount` (D-05 — only this one direct-
+ * mutation path uses the temp+rename pattern; all artifact emissions go
+ * through `ctx.artifactWriter.emit` which handles atomicity internally).
+ *
+ * This step is `optional: true` — it self-skips when either
+ * `grader-judgments.json` or `score-summary.json` is missing, so
+ * non-graded runs are unaffected.
+ *
+ * Task → judgment join (D-10): `judgment.taskId` is the promptfoo row
+ * description, which for literacy mode is `"${task.title} (gold|baseline)"`.
+ * The join strips the variant suffix and looks up in a triple-keyed cache
+ * by `task.title` (primary), `task.description`, and `task.id` (defensive
+ * fallbacks for non-literacy modes).
+ *
+ * Retrieved signal (D-11/D-12): `FeatureAgentBehavior.feature` is the join
+ * key — for literacy mode it equals `task.area` exactly (compiler propagates
+ * `task.area → __featureArea → ab.feature`).
+ *
+ * @see docs/decisions/D0033-unified-artifact-writer.md
+ * @see docs/decisions/D0049-shared-confidence-contract.md
+ * @see docs/decisions/D0050-per-entry-attribution-layout.md
+ * @see docs/decisions/D0052-judgment-ref-granularity.md
+ */
+import type { AppContext, PipelineStep, StepResult, ValidationIssue } from "../../_vendor/ailf-core/index.d.ts";
+export declare class ComputeAttributionStep implements PipelineStep {
+    readonly name = "compute-attribution";
+    readonly optional = true;
+    check(ctx: AppContext): ValidationIssue[];
+    execute(ctx: AppContext, _state?: unknown): Promise<StepResult>;
+}