npm - @sanity/ailf - Versions diffs - 4.5.0 → 5.0.0 - Mend

@sanity/ailf 4.5.0 → 5.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (139)

package/canonical/grader-references/agent-harness-tools.yaml +42 -0
package/canonical/grader-references/knowledge-probe-recall.yaml +36 -0
package/canonical/grader-references/mcp-server-spec.yaml +51 -0
package/canonical/grader-references/portable-text.yaml +48 -0
package/config/rubrics.ts +38 -2
package/dist/_vendor/ailf-core/artifact-registry.d.ts +197 -2
package/dist/_vendor/ailf-core/artifact-registry.js +419 -5
package/dist/_vendor/ailf-core/examples/index.d.ts +125 -26
package/dist/_vendor/ailf-core/examples/index.js +146 -47
package/dist/_vendor/ailf-core/ports/context.d.ts +26 -0
package/dist/_vendor/ailf-core/ports/index.d.ts +2 -0
package/dist/_vendor/ailf-core/ports/index.js +1 -0
package/dist/_vendor/ailf-core/ports/llm-client.d.ts +112 -0
package/dist/_vendor/ailf-core/ports/llm-client.js +68 -0
package/dist/_vendor/ailf-core/ports/mode-handler.d.ts +15 -0
package/dist/_vendor/ailf-core/schemas/branded-string.d.ts +40 -0
package/dist/_vendor/ailf-core/schemas/branded-string.js +45 -0
package/dist/_vendor/ailf-core/schemas/confidence-schema.d.ts +36 -0
package/dist/_vendor/ailf-core/schemas/confidence-schema.js +32 -0
package/dist/_vendor/ailf-core/schemas/eval-config.d.ts +1 -0
package/dist/_vendor/ailf-core/schemas/eval-config.js +8 -4
package/dist/_vendor/ailf-core/schemas/index.d.ts +2 -0
package/dist/_vendor/ailf-core/schemas/index.js +9 -0
package/dist/_vendor/ailf-core/schemas/pipeline-request.d.ts +1 -0
package/dist/_vendor/ailf-core/schemas/pipeline-request.js +1 -0
package/dist/_vendor/ailf-core/schemas/pipeline.d.ts +34 -8
package/dist/_vendor/ailf-core/schemas/pipeline.js +23 -1
package/dist/_vendor/ailf-core/services/diagnosis/registry.d.ts +40 -0
package/dist/_vendor/ailf-core/services/diagnosis/registry.js +25 -0
package/dist/_vendor/ailf-core/services/diagnosis-runner.d.ts +19 -0
package/dist/_vendor/ailf-core/services/diagnosis-runner.js +19 -0
package/dist/_vendor/ailf-core/services/index.d.ts +2 -0
package/dist/_vendor/ailf-core/services/index.js +5 -0
package/dist/_vendor/ailf-core/services/report-to-markdown.js +3 -2
package/dist/_vendor/ailf-core/types/attribution.d.ts +82 -0
package/dist/_vendor/ailf-core/types/attribution.js +18 -0
package/dist/_vendor/ailf-core/types/branded-ids.d.ts +26 -1
package/dist/_vendor/ailf-core/types/branded-ids.js +80 -4
package/dist/_vendor/ailf-core/types/confidence.d.ts +68 -0
package/dist/_vendor/ailf-core/types/confidence.js +56 -0
package/dist/_vendor/ailf-core/types/diagnosis.d.ts +169 -0
package/dist/_vendor/ailf-core/types/diagnosis.js +17 -0
package/dist/_vendor/ailf-core/types/generalized-task.d.ts +16 -1
package/dist/_vendor/ailf-core/types/grader-judgment.d.ts +125 -0
package/dist/_vendor/ailf-core/types/grader-judgment.js +30 -0
package/dist/_vendor/ailf-core/types/index.d.ts +82 -29
package/dist/_vendor/ailf-core/types/index.js +16 -1
package/dist/_vendor/ailf-core/types/legacy-grader-judgment.d.ts +55 -0
package/dist/_vendor/ailf-core/types/legacy-grader-judgment.js +30 -0
package/dist/_vendor/ailf-core/types/pipeline-request.d.ts +1 -0
package/dist/_vendor/ailf-core/types/repo-config.d.ts +8 -0
package/dist/_vendor/ailf-shared/document-ref.d.ts +1 -1
package/dist/adapters/api-client/build-request.d.ts +1 -0
package/dist/adapters/api-client/build-request.js +3 -0
package/dist/adapters/attribution/attribution-meta-writer.d.ts +35 -0
package/dist/adapters/attribution/attribution-meta-writer.js +34 -0
package/dist/adapters/attribution/index.d.ts +9 -0
package/dist/adapters/attribution/index.js +8 -0
package/dist/adapters/attribution/per-entry-attribution-writer.d.ts +56 -0
package/dist/adapters/attribution/per-entry-attribution-writer.js +49 -0
package/dist/adapters/config-sources/file-config-adapter.js +1 -0
package/dist/adapters/grader-outputs/index.d.ts +10 -0
package/dist/adapters/grader-outputs/index.js +8 -0
package/dist/adapters/grader-outputs/legacy/index.d.ts +11 -0
package/dist/adapters/grader-outputs/legacy/index.js +10 -0
package/dist/adapters/grader-outputs/legacy/promptfoo-grader-output-legacy.d.ts +49 -0
package/dist/adapters/grader-outputs/legacy/promptfoo-grader-output-legacy.js +48 -0
package/dist/adapters/grader-outputs/promptfoo-grader-output.d.ts +102 -0
package/dist/adapters/grader-outputs/promptfoo-grader-output.js +93 -0
package/dist/adapters/index.d.ts +3 -0
package/dist/adapters/index.js +4 -0
package/dist/adapters/llm/anthropic-llm-client.d.ts +48 -0
package/dist/adapters/llm/anthropic-llm-client.js +205 -0
package/dist/adapters/llm/fake-llm-client.d.ts +49 -0
package/dist/adapters/llm/fake-llm-client.js +63 -0
package/dist/adapters/llm/index.d.ts +9 -0
package/dist/adapters/llm/index.js +4 -0
package/dist/adapters/llm/openai-llm-client.d.ts +44 -0
package/dist/adapters/llm/openai-llm-client.js +168 -0
package/dist/adapters/llm/pricing.d.ts +12 -0
package/dist/adapters/llm/pricing.js +8 -0
package/dist/adapters/llm/retry.d.ts +56 -0
package/dist/adapters/llm/retry.js +66 -0
package/dist/adapters/task-sources/content-lake-task-source.d.ts +5 -1
package/dist/adapters/task-sources/content-lake-task-source.js +28 -2
package/dist/adapters/task-sources/repo-schemas.d.ts +90 -22
package/dist/adapters/task-sources/repo-schemas.js +19 -2
package/dist/artifact-capture/api-gateway-artifact-writer.js +2 -1
package/dist/artifact-capture/batching-api-gateway-artifact-writer.js +2 -1
package/dist/artifact-capture/gcs-artifact-writer.js +3 -1
package/dist/artifact-capture/local-fs-artifact-writer.js +3 -1
package/dist/commands/calculate-scores.js +1 -1
package/dist/commands/explain-handler.js +1 -1
package/dist/commands/lookup-doc.d.ts +1 -1
package/dist/commands/lookup-doc.js +3 -3
package/dist/commands/pipeline-action.d.ts +6 -0
package/dist/commands/pipeline-action.js +2 -0
package/dist/commands/remote-pipeline.js +1 -0
package/dist/composition-root.d.ts +59 -1
package/dist/composition-root.js +95 -0
package/dist/config/rubrics.ts +38 -2
package/dist/grader/agent-harness.d.ts +14 -0
package/dist/grader/agent-harness.js +17 -0
package/dist/grader/common.d.ts +17 -0
package/dist/grader/common.js +21 -0
package/dist/grader/index.d.ts +38 -0
package/dist/grader/index.js +75 -0
package/dist/grader/knowledge-probe.d.ts +14 -0
package/dist/grader/knowledge-probe.js +18 -0
package/dist/grader/literacy.d.ts +13 -0
package/dist/grader/literacy.js +17 -0
package/dist/grader/mcp.d.ts +14 -0
package/dist/grader/mcp.js +18 -0
package/dist/orchestration/build-app-context.js +1 -0
package/dist/orchestration/build-step-sequence.js +5 -0
package/dist/orchestration/steps/calculate-scores-step.js +23 -1
package/dist/orchestration/steps/compute-attribution-step.d.ts +44 -0
package/dist/orchestration/steps/compute-attribution-step.js +279 -0
package/dist/orchestration/steps/gap-analysis-step.js +35 -7
package/dist/orchestration/steps/index.d.ts +1 -0
package/dist/orchestration/steps/index.js +1 -0
package/dist/pipeline/attribution.d.ts +15 -0
package/dist/pipeline/attribution.js +18 -9
package/dist/pipeline/borderline-consensus-runner.d.ts +63 -0
package/dist/pipeline/borderline-consensus-runner.js +124 -0
package/dist/pipeline/borderline-detector.d.ts +24 -0
package/dist/pipeline/borderline-detector.js +26 -0
package/dist/pipeline/calculate-scores.d.ts +114 -3
package/dist/pipeline/calculate-scores.js +426 -24
package/dist/pipeline/compiler/literacy-bridge.d.ts +1 -1
package/dist/pipeline/compiler/literacy-bridge.js +35 -17
package/dist/pipeline/compiler/rubric-resolution.d.ts +15 -0
package/dist/pipeline/compiler/rubric-resolution.js +9 -1
package/dist/pipeline/compute-attribution.d.ts +80 -0
package/dist/pipeline/compute-attribution.js +196 -0
package/dist/pipeline/failure-modes.d.ts +52 -17
package/dist/pipeline/failure-modes.js +178 -117
package/dist/pipeline/map-request-to-config.js +1 -0
package/package.json +6 -4

package/dist/_vendor/ailf-core/types/grader-judgment.d.ts ADDED Viewed

@@ -0,0 +1,125 @@
+/**
+ * GraderJudgment core domain types — canonical shapes for structured
+ * grader output (Doc 03, GRAD-02).
+ *
+ * Authored INDEPENDENTLY of any Zod schema (D0045 doctrine — Plan 02's
+ * `GraderJudgmentSchema` `satisfies` against this type, not the other
+ * way around). A tautological `satisfies z.ZodType<z.infer<typeof
+ * GraderJudgmentSchema>>` is forbidden.
+ *
+ * Phase 1 retained the existing pipeline core (`taskId`, `modelId`,
+ * `dimension`, `reason`, `score`, plus the optional `outputFailure?`)
+ * for backward compat with Phase 0 callers (Doc 03 §"existing,
+ * unchanged") and introduced the GRAD-02 fields (`judgmentId`,
+ * `subJudgments`, `docCitations`, `failureMode`, `confidence`,
+ * `hallucinationCheckedAgainst`, `metadata`) as additive in Phase 1;
+ * they are required from Phase 3 GRAD-05.
+ *
+ * Phase 3 GRAD-05 has flipped the additive fields to required (this
+ * file) and the corresponding Zod schema in
+ * `packages/eval/src/adapters/grader-outputs/promptfoo-grader-output.ts`
+ * is `.strict()` with `graderJudgmentsVersion = "1.0.0"`. The
+ * read-only legacy parser at `…/legacy/promptfoo-grader-output-legacy.ts`
+ * (against `LegacyGraderJudgment`) is the named consumer for already-
+ * stored historical reports through GRAD-06 cutover.
+ *
+ * @see docs/decisions/D0049-shared-confidence-contract.md
+ * @see docs/decisions/D0052-judgment-ref-granularity.md
+ * @see docs/design-docs/actionability-ladder/03-structured-grader-judgments.md
+ */
+import type { JudgmentId } from "./branded-ids.js";
+import type { Confidence } from "./confidence.js";
+/**
+ * Role a doc citation plays in a grader judgment (GRAD-02). This is a
+ * closed string-literal union (a type alias, not a runtime `enum`) with
+ * four members: `"supports"`, `"contradicts"`, `"missing"` and
+ * `"irrelevant"`. Phase 3 may extend.
+ */
+export type DocCitationRole = "supports" | "contradicts" | "missing" | "irrelevant";
+/**
+ * A single doc the grader cited while reasoning. `documentId` is the
+ * canonical D0052 reference (id, not slug); `slug` is a human-readable
+ * annotation only. `hallucinated` is set true at adapter time when the
+ * `slug` does not resolve against the task's `contextDocs` set.
+ *
+ * `documentId` and `role` are required; `slug` and `hallucinated` are
+ * optional. `role` takes one of the `DocCitationRole` literals.
+ *
+ * NOTE(review): `GraderJudgment.hallucinationCheckedAgainst` describes
+ * the checked set as the union of task.context.docs and
+ * run.documentManifest — confirm whether the "`contextDocs` set" named
+ * above is that same union.
+ */
+export interface DocCitation {
+    /** Canonical D0052 document ref (id, not slug). */
+    documentId: string;
+    /** Optional human-readable annotation. Never the identity. */
+    slug?: string;
+    role: DocCitationRole;
+    /** True when `slug` is not in the resolvable-set. */
+    hallucinated?: boolean;
+}
+/**
+ * Per-criterion sub-judgment — one entry per task-criterion bullet
+ * (Doc 03 §"per-criterion sub-judgments"). The `criterionId` is the
+ * stable identifier declared on the task's `criteria` array (Phase 2
+ * GRAD-01 schema-sync), not synthesized at grade time. All four fields
+ * are required; `met` presumably records whether the criterion was
+ * satisfied — confirm against Doc 03.
+ */
+export interface CriterionSubJudgment {
+    /** Stable criterion identifier — matches `CriterionRef.id` from the task definition (D0052). */
+    criterionId: string;
+    met: boolean;
+    /** ≤280 chars — quote or paraphrase. The type is plain `string`; the length limit is not expressed here. */
+    evidence: string;
+    /** Grader self-confidence on this single criterion (D0049). */
+    confidence: Confidence;
+}
+/**
+ * The structured grader judgment — Phase 3 GRAD-05 shape.
+ *
+ * Existing pipeline core (Doc 03 §"existing, unchanged"): `taskId`,
+ * `modelId`, `dimension`, `reason`, `score`. The pre-existing
+ * `outputFailure?` remains optional. The `contextDocs?` annotation
+ * (legacy alias: `canonicalDocs`) lives on the `StoredJudgment` storage
+ * extension type, not here.
+ *
+ * Additive in Phase 1; required from Phase 3 GRAD-05: `judgmentId`,
+ * `subJudgments`, `docCitations`, `failureMode`, `confidence`,
+ * `hallucinationCheckedAgainst`, `metadata`. The corresponding Zod
+ * schema in `packages/eval/src/adapters/grader-outputs/promptfoo-grader-output.ts`
+ * is `.strict()` with `graderJudgmentsVersion = "1.0.0"`.
+ */
+export interface GraderJudgment {
+    /** Rubric template name (e.g. "task-completion", "code-correctness"). */
+    dimension: string;
+    /** The model that produced the response being graded. */
+    modelId: string;
+    /**
+     * True when the model failed to produce meaningful output (empty
+     * response, API error, or refusal). Distinguishes infrastructure
+     * failures from genuinely incorrect responses — a score of 0 from no
+     * output is fundamentally different from a score of 0 from wrong
+     * output.
+     */
+    outputFailure?: boolean;
+    /** The grader's natural-language reasoning. */
+    reason: string;
+    /** Numeric score in [0, 100] (normalized). The type is `number`; the range is not expressed in the type. */
+    score: number;
+    /** The task this judgment belongs to. */
+    taskId: string;
+    /**
+     * D0052 granular branded id. Required from Phase 3 GRAD-05 — every
+     * grader emission carries one.
+     */
+    judgmentId: JudgmentId;
+    /** Per-criterion sub-judgments. */
+    subJudgments: CriterionSubJudgment[];
+    /** Doc citations with role + hallucinated flag. */
+    docCitations: DocCitation[];
+    /**
+     * Per-dimension failure mode. Phase 3 GRAD-03 stamps the taxonomy
+     * literal at the runtime grader-prompt; the value is a free-form
+     * string for forward compat with future taxonomy extensions.
+     */
+    failureMode: string;
+    /** Grader self-confidence per D0049. */
+    confidence: Confidence;
+    /** Hallucination cross-check (Pitfall #11) — union of task.context.docs and run.documentManifest. */
+    hallucinationCheckedAgainst: string[];
+    /** Metadata about the grader run: `graderModel` and `graderJudgmentsVersion`, both plain `string`. */
+    metadata: {
+        graderModel: string;
+        graderJudgmentsVersion: string;
+    };
+}

package/dist/_vendor/ailf-core/types/grader-judgment.js ADDED Viewed

@@ -0,0 +1,30 @@
+/**
+ * GraderJudgment core domain types — canonical shapes for structured
+ * grader output (Doc 03, GRAD-02).
+ *
+ * Authored INDEPENDENTLY of any Zod schema (D0045 doctrine — Plan 02's
+ * `GraderJudgmentSchema` `satisfies` against this type, not the other
+ * way around). A tautological `satisfies z.ZodType<z.infer<typeof
+ * GraderJudgmentSchema>>` is forbidden.
+ *
+ * Phase 1 retained the existing pipeline core (`taskId`, `modelId`,
+ * `dimension`, `reason`, `score`, plus the optional `outputFailure?`)
+ * for backward compat with Phase 0 callers (Doc 03 §"existing,
+ * unchanged") and introduced the GRAD-02 fields (`judgmentId`,
+ * `subJudgments`, `docCitations`, `failureMode`, `confidence`,
+ * `hallucinationCheckedAgainst`, `metadata`) as additive in Phase 1;
+ * they are required from Phase 3 GRAD-05.
+ *
+ * Phase 3 GRAD-05 has flipped the additive fields to required (this
+ * file) and the corresponding Zod schema in
+ * `packages/eval/src/adapters/grader-outputs/promptfoo-grader-output.ts`
+ * is `.strict()` with `graderJudgmentsVersion = "1.0.0"`. The
+ * read-only legacy parser at `…/legacy/promptfoo-grader-output-legacy.ts`
+ * (against `LegacyGraderJudgment`) is the named consumer for already-
+ * stored historical reports through GRAD-06 cutover.
+ *
+ * @see docs/decisions/D0049-shared-confidence-contract.md
+ * @see docs/decisions/D0052-judgment-ref-granularity.md
+ * @see docs/design-docs/actionability-ladder/03-structured-grader-judgments.md
+ */
+export {};

package/dist/_vendor/ailf-core/types/index.d.ts CHANGED Viewed

@@ -13,6 +13,7 @@ import type { DocumentRef as _DocumentRef, EvalMode, RunContext } from "../../ai
 import type { ArtifactType } from "../artifact-registry.js";
 import type { SymbolPreflightReport } from "./symbol-preflight-report.js";
 import type { AssociationValues, RunId } from "./branded-ids.js";
+import type { GraderJudgment } from "./grader-judgment.js";
 export type { ActualScoreEntry, ComponentResult, TestResult, UrlMetadata, } from "./scoring-input.js";
 export type { DocumentRef, RunContext, RunTrigger } from "../../ailf-shared/index.d.ts";
 export type { StoredBaseline, StoredReport, StoredRun, StoredTaskResult, StoredTrace, SchemaVersioned, } from "./storage-schema.js";
@@ -30,9 +31,15 @@ export type { PipelineRequest, PipelineRequestCallback, PipelineRequestCallerExe
 export type { PackageSurfaceConfig, PackageSurfaceEntry, } from "./package-surface.js";
 export type { SymbolPreflightDeduction, SymbolPreflightFinding, SymbolPreflightReport, SymbolPreflightUnresolvedReason, } from "./symbol-preflight-report.js";
 export { DEFAULT_PREFLIGHT_CODE_CORRECTNESS_WEIGHT, type PreflightRubricContext, type PreflightScoringConfig, } from "./preflight-scoring.js";
-export type { ArtifactId, AssociationAxis, AssociationValues, Brand, EntryKey, Err, FixtureId, IdValidationError, NewReportId, Ok, ProviderId, PromptId, Result, ResultId, RubricId, RunFingerprint, RunId, SuiteId, TaskId, TaskSlug, TraceId, } from "./branded-ids.js";
-export { err, fixtureId, generateRunId, ok, providerId, resultId, runId, suiteId, taskId, traceId, } from "./branded-ids.js";
-export type { AgentHarnessTaskDefinition, ContentLakeAuthorableMode, ContentLakeAuthorableTask, CustomTaskDefinition, GeneralizedAssertionDefinition, GeneralizedDocRef, GeneralizedTaskDefinition, GeneralizedTemplatedAssertion, GeneralizedValueAssertion, IdDocRef, KnowledgeProbeTaskDefinition, LiteracyTaskDefinition, MCPServerTaskDefinition, PromptVars, PathDocRef, PerspectiveDocRef, ReservedPromptVarKey, RubricRef, SlugDocRef, TaskCommonFields, TaskDifficulty, TaskOptions, TaskProviderConfig, TaskStatus, } from "./generalized-task.js";
+export type { Confidence, ConfidenceDerivation } from "./confidence.js";
+export { CONVENTIONAL_DERIVATIONS, isConfidence } from "./confidence.js";
+export type { ArtifactId, AssociationAxis, AssociationValues, Brand, EntryKey, Err, FixtureId, IdValidationError, JudgmentId, NewReportId, Ok, ProviderId, PromptId, Result, ResultId, RubricId, RunFingerprint, RunId, SuiteId, TaskId, TaskSlug, TraceId, } from "./branded-ids.js";
+export { err, fixtureId, generateJudgmentId, generateRunId, judgmentId, ok, providerId, resultId, runId, suiteId, taskId, traceId, } from "./branded-ids.js";
+export type { AgentHarnessTaskDefinition, ContentLakeAuthorableMode, ContentLakeAuthorableTask, CriterionRef, CustomTaskDefinition, GeneralizedAssertionDefinition, GeneralizedDocRef, GeneralizedTaskDefinition, GeneralizedTemplatedAssertion, GeneralizedValueAssertion, IdDocRef, KnowledgeProbeTaskDefinition, LiteracyTaskDefinition, MCPServerTaskDefinition, PromptVars, PathDocRef, PerspectiveDocRef, ReservedPromptVarKey, RubricRef, SlugDocRef, TaskCommonFields, TaskDifficulty, TaskOptions, TaskProviderConfig, TaskStatus, } from "./generalized-task.js";
+export type { ActionSuggestion, AreaSummaryBody, CardMeta, CardType, Diagnosis, DiagnosisCard, DocAttributionSpotlightBody, FailureModeSummaryBody, LowConfidenceAttributionBody, NoIssuesBody, RegressionVsBaselineBody, TopRecommendationsBody, VersionedInputs, WeakestAreaBody, } from "./diagnosis.js";
+export type { AttributionMeta, DocAttribution, JudgmentAttribution, } from "./attribution.js";
+export type { CriterionSubJudgment, DocCitation, DocCitationRole, GraderJudgment, } from "./grader-judgment.js";
+export type { LegacyGraderJudgment } from "./legacy-grader-judgment.js";
 type DocumentRef = _DocumentRef;
 /** Aggregated retrieval metrics for a feature area */
 export interface AreaRetrievalMetrics {
@@ -126,8 +133,31 @@ export interface FailureModeReport {
     /** Total judgments analyzed */
     totalJudgments: number;
 }
-/** Failure mode classification for a low-scoring judgment */
-export type FailureModeType = "api-error" | "incorrect-docs" | "missing-docs" | "model-limitation" | "outdated-docs" | "poor-structure" | "unclassified";
+/**
+ * Failure mode classification for a low-scoring judgment.
+ *
+ * Open-set string (Plan 03-02 per-dimension taxonomies introduce modes
+ * outside the original literacy enum: `false-floor`, `spec-mismatch`,
+ * `tool-misuse`, `factual-error`, `hallucination`, etc. — the grader
+ * is told these are legal answers via the rubric prompt). The legacy
+ * literacy enum survives as `LegacyFailureModeType` for the
+ * report-aggregation helpers that need stable bucket ordering and
+ * icon tables; consumers that only care about presence/absence treat
+ * `FailureModeType` as `string`.
+ *
+ * Because this alias is exactly `string`, the compiler does not check
+ * mode spellings. Use `isLegacyFailureMode` to narrow a value to
+ * `LegacyFailureModeType`.
+ */
+export type FailureModeType = string;
+/**
+ * Closed string-literal union (seven members) of the original literacy
+ * failure modes — used by the report formatters that iterate buckets in
+ * a stable order. Adding to this list is a deliberate extension; modes
+ * outside it still flow through the report (per-area `modes` record),
+ * just without a pre-allocated bucket in `summary`. The companion
+ * runtime list `LEGACY_FAILURE_MODES` is declared as
+ * `readonly LegacyFailureModeType[]`.
+ */
+export type LegacyFailureModeType = "api-error" | "incorrect-docs" | "missing-docs" | "model-limitation" | "outdated-docs" | "poor-structure" | "unclassified";
+/** Canonical legacy modes as a readonly array (not a `Set` object) — exported for report-formatter use. */
+export declare const LEGACY_FAILURE_MODES: readonly LegacyFailureModeType[];
+/** Type guard for legacy modes: a `true` result narrows `mode` to `LegacyFailureModeType`. */
+export declare function isLegacyFailureMode(mode: string): mode is LegacyFailureModeType;
 /** Per-feature-area score breakdown */
 export interface FeatureScore {
     /**
@@ -259,30 +289,16 @@ export interface GapEstimate {
     /** Specific remediation description */
     remediation: string;
 }
-/** A single grader judgment — one per assertion per test */
-export interface GraderJudgment {
-    /** The rubric template used (task-completion, code-correctness, doc-coverage) */
-    dimension: string;
-    /** The model that produced the response being graded */
-    modelId: string;
+/** Enriched grader judgment with stored documentation context refs. */
+export interface StoredJudgment extends GraderJudgment {
     /**
-     * True when the model failed to produce meaningful output (empty response,
-     * API error, or refusal). Distinguishes infrastructure failures from
-     * genuinely incorrect responses — a score of 0 from no output is
-     * fundamentally different from a score of 0 from wrong output.
+     * Documentation context the task expected the model to use.
+     *
+     * Legacy alias `canonicalDocs` may appear on stored reports written
+     * before Phase 2 — readers should tolerate both. Writers (the pipeline)
+     * always emit `contextDocs`.
      */
-    outputFailure?: boolean;
-    /** The grader's natural language reasoning */
-    reason: string;
-    /** The numeric score (0–100) */
-    score: number;
-    /** The task this judgment belongs to */
-    taskId: string;
-}
-/** Enriched grader judgment with canonical doc references, stored in reports */
-export interface StoredJudgment extends GraderJudgment {
-    /** Canonical docs that the task expected the model to use */
-    canonicalDocs?: DocumentRef[];
+    contextDocs?: DocumentRef[];
 }
 /**
  * Per-test result stored in reports for drill-down and audit.
@@ -294,8 +310,11 @@ export interface StoredJudgment extends GraderJudgment {
 export interface StoredTestResult {
     /** Resolved feature area (from __featureArea or description) */
     area: string;
-    /** Canonical docs the task expected the model to use */
-    canonicalDocs?: DocumentRef[];
+    /**
+     * Documentation context the task expected the model to use.
+     * Legacy alias `canonicalDocs` may appear on pre-Phase-2 reports.
+     */
+    contextDocs?: DocumentRef[];
     /** Weighted composite score (gold variant only) */
     compositeScore?: number;
     /** Per-test cost (USD) */
@@ -347,6 +366,40 @@ export interface StoredTestResult {
 }
 /** Grader consistency diagnostics — does not affect scores, reported alongside */
 export interface GraderReliability {
+    /**
+     * Plan 03-03 — count of grader-emission vs ceiling cross-check disagreements.
+     *
+     * Incremented by the live pipeline when `validateFailureMode(...)` returns
+     * `level: "medium"` (the grader's emitted `failureMode` does not agree with
+     * the ceiling-decomposition mode). Surfaces calibration drift over time
+     * without affecting scores. Optional — undefined when the run did not
+     * exercise the failure-mode validator (e.g., grader-consistency-only paths).
+     *
+     * @see docs/decisions/D0049-shared-confidence-contract.md
+     */
+    failureModeCalibration?: number;
+    /**
+     * Plan 03-03 — count of strict-schema parse failures during grader-output
+     * extraction. Wired at the parse-fail branch in `extractGraderJudgments`;
+     * incremented when `GraderJudgmentSchema.safeParse` rejects a payload and
+     * the pipeline drops to the Phase 1 minimal-shape fallback.
+     *
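+     * Plan 03-04 will tighten the strict schema (`.strict()` + GRAD-02 fields
+     * required) and graders will emit the structured wire format in earnest;
+     * this counter measures pre-hard-fail drift.
+     * NOTE(review): the header of `grader-judgment.d.ts` says the schema is
+     * already `.strict()` with the GRAD-02 fields required, so the future
+     * tense above may be stale — confirm.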
+     */
+    parseFailures?: number;
+    /**
+     * Phase 4 ATTR-01 — count of grader citations whose `slug` was not
+     * in the resolvable-set (`hallucinationCheckedAgainst`).
+     * Incremented by `computeJudgmentAttribution(...)` for every
+     * citation that fails the hallucination short-circuit (Success
+     * Criterion #5). A counter, not a ratio — consumers compute the
+     * rate by dividing by total-citations if needed.
+     *
+     * @see docs/decisions/D0049-shared-confidence-contract.md
+     */
+    hallucinationCount?: number;
     /** Inter-grader agreement (from multi-grader comparison) — Phase 3 */
     agreement?: {
         /** Models compared against the primary grader */

package/dist/_vendor/ailf-core/types/index.js CHANGED Viewed

@@ -17,7 +17,22 @@ export { InMemoryPluginRegistry } from "./plugin-registry.js";
 // the mode-specific version, they import from "./eval-mode-config.js".
 export { evalModeType } from "./eval-mode-config.js";
 export { DEFAULT_PREFLIGHT_CODE_CORRECTNESS_WEIGHT, } from "./preflight-scoring.js";
-export { err, fixtureId, generateRunId, ok, providerId, resultId, runId, suiteId, taskId, traceId, } from "./branded-ids.js";
+export { CONVENTIONAL_DERIVATIONS, isConfidence } from "./confidence.js";
+export { err, fixtureId, generateJudgmentId, generateRunId, judgmentId, ok, providerId, resultId, runId, suiteId, taskId, traceId, } from "./branded-ids.js";
+/**
+ * Canonical legacy modes as a plain array literal (not a `Set` object) —
+ * exported for report-formatter use. The seven entries are listed in
+ * the order shown; nothing in this statement freezes the array, so the
+ * `readonly` in the matching `.d.ts` declaration is a type-level
+ * constraint only.
+ */
+export const LEGACY_FAILURE_MODES = [
+    "api-error",
+    "incorrect-docs",
+    "missing-docs",
+    "model-limitation",
+    "outdated-docs",
+    "poor-structure",
+    "unclassified",
+];
+/**
+ * Type guard for legacy modes. Returns the result of
+ * `LEGACY_FAILURE_MODES.includes(mode)`, i.e. true only when `mode`
+ * is one of the entries of that array.
+ */
+export function isLegacyFailureMode(mode) {
+    return LEGACY_FAILURE_MODES.includes(mode);
+}
 // ---------------------------------------------------------------------------
 // Comparison (Approach 2: structured comparison output)
 // ---------------------------------------------------------------------------

package/dist/_vendor/ailf-core/types/legacy-grader-judgment.d.ts ADDED Viewed

@@ -0,0 +1,55 @@
+/**
+ * LegacyGraderJudgment — Phase 1 superset core only, with NO GRAD-02
+ * additive surface. Used by the read-only legacy parser at
+ * `packages/eval/src/adapters/grader-outputs/legacy/` for historical
+ * pre-Phase-3 reports.
+ *
+ * Reports are immutable events — once a Report is written to Content
+ * Lake the structured grader-judgment shape it captures cannot be
+ * back-filled. The legacy parser exists so historical-report rendering
+ * paths can keep deserializing pre-Phase-3 output through Phase 7
+ * (GRAD-06 cutover removes Studio's `reason`-only fallback rendering
+ * paths and the legacy adapter alongside).
+ *
+ * Authored INDEPENDENTLY of any Zod schema (D0045 doctrine — the
+ * legacy schema in
+ * `packages/eval/src/adapters/grader-outputs/legacy/promptfoo-grader-output-legacy.ts`
+ * `satisfies z.ZodType<LegacyGraderJudgment>` against this type, not
+ * the other way around). A tautological
+ * `satisfies z.ZodType<z.infer<typeof Schema>>` is forbidden.
+ *
+ * Invariant — live grader output that fails the strict
+ * `GraderJudgmentSchema` MUST NOT fall back to this schema. Drop to
+ * `failureMode: "unclassified"` instead. The legacy parser is invoked
+ * ONLY by historical-report rendering paths.
+ *
+ * @see ./grader-judgment.ts — the Phase 1+ structured shape (live path)
+ * @see docs/decisions/D0045-type-architecture-and-contract-enforcement.md
+ * @see docs/design-docs/actionability-ladder/03-structured-grader-judgments.md
+ */
+/**
+ * The Phase 1 free-prose grader judgment as historical reports captured
+ * it. Mirrors the existing-pipeline-core surface of {@link GraderJudgment}
+ * (`dimension`, `modelId`, `reason`, `score`, `taskId`) and the
+ * pre-existing optional `outputFailure` flag. NO GRAD-02 additive fields
+ * (`judgmentId`, `subJudgments`, `docCitations`, `failureMode`,
+ * `confidence`, `hallucinationCheckedAgainst`, `metadata`) — those are
+ * by construction absent on pre-Phase-3 output.
+ */
+export interface LegacyGraderJudgment {
+    /** Rubric template name (e.g. "task-completion", "code-correctness"). */
+    dimension: string;
+    /** The model that produced the response being graded. */
+    modelId: string;
+    /**
+     * True when the model failed to produce meaningful output (empty
+     * response, API error, or refusal). Same semantics as
+     * {@link GraderJudgment.outputFailure}.
+     */
+    outputFailure?: boolean;
+    /** The grader's natural-language reasoning (free-prose Phase 1 shape). */
+    reason: string;
+    /** Numeric score in [0, 100] (normalized). */
+    score: number;
+    /** The task this judgment belongs to. */
+    taskId: string;
+}

package/dist/_vendor/ailf-core/types/legacy-grader-judgment.js ADDED Viewed

@@ -0,0 +1,30 @@
+/**
+ * LegacyGraderJudgment — Phase 1 superset core only, with NO GRAD-02
+ * additive surface. Used by the read-only legacy parser at
+ * `packages/eval/src/adapters/grader-outputs/legacy/` for historical
+ * pre-Phase-3 reports.
+ *
+ * Reports are immutable events — once a Report is written to Content
+ * Lake the structured grader-judgment shape it captures cannot be
+ * back-filled. The legacy parser exists so historical-report rendering
+ * paths can keep deserializing pre-Phase-3 output through Phase 7
+ * (GRAD-06 cutover removes Studio's `reason`-only fallback rendering
+ * paths and the legacy adapter alongside).
+ *
+ * Authored INDEPENDENTLY of any Zod schema (D0045 doctrine — the
+ * legacy schema in
+ * `packages/eval/src/adapters/grader-outputs/legacy/promptfoo-grader-output-legacy.ts`
+ * `satisfies z.ZodType<LegacyGraderJudgment>` against this type, not
+ * the other way around). A tautological
+ * `satisfies z.ZodType<z.infer<typeof Schema>>` is forbidden.
+ *
+ * Invariant — live grader output that fails the strict
+ * `GraderJudgmentSchema` MUST NOT fall back to this schema. Drop to
+ * `failureMode: "unclassified"` instead. The legacy parser is invoked
+ * ONLY by historical-report rendering paths.
+ *
+ * @see ./grader-judgment.ts — the Phase 1+ structured shape (live path)
+ * @see docs/decisions/D0045-type-architecture-and-contract-enforcement.md
+ * @see docs/design-docs/actionability-ladder/03-structured-grader-judgments.md
+ */
+export {};

package/dist/_vendor/ailf-core/types/pipeline-request.d.ts CHANGED Viewed

@@ -84,6 +84,7 @@ export interface PipelineRequest {
     dataset?: string;
     debug?: PipelineRequestDebug | boolean;
     executor?: PipelineRequestCallerExecutor;
+    borderlineReplications?: number;
     gapAnalysis?: boolean;
     graderContext?: "rubric-only" | "with-docs";
     graderReplications?: number;

package/dist/_vendor/ailf-core/types/repo-config.d.ts CHANGED Viewed

@@ -47,6 +47,14 @@ export interface RepoPublishConfig {
 /** Execution-tier knobs — replaces the retired `--concurrency` / `--api-url` flags. */
 export interface RepoExecutionConfig {
     apiUrl?: string;
+    /**
+     * Plan 03-04 GRAD-04 — replications per borderline judgment for the
+     * intra-grader consensus pass. Default 3 (set in composition-root).
+     * A judgment is "borderline" when its score lies within ±5 of any
+     * severity boundary (30/50/60). Non-borderline judgments are not
+     * re-graded.
+     */
+    borderlineReplications?: number;
     concurrency?: number;
     gapAnalysis?: boolean;
     graderReplications?: number;

package/dist/_vendor/ailf-shared/document-ref.d.ts CHANGED Viewed

@@ -8,7 +8,7 @@
  * Attachable at every level of the report hierarchy:
  * - ScoreSummary.documentManifest — all docs used in the evaluation
  * - FeatureScore.documents — docs used for a specific area
- * - StoredJudgment.canonicalDocs — docs expected for a specific task
+ * - StoredJudgment.contextDocs (legacy alias: canonicalDocs) — docs expected for a specific task
  */
 export interface DocumentRef {
     /**

package/dist/adapters/api-client/build-request.d.ts CHANGED Viewed

@@ -59,6 +59,7 @@ export interface RemoteConfigSlice {
     perspectiveOverride?: string;
     graderContext?: "rubric-only" | "with-docs";
     graderReplications?: number;
+    borderlineReplications?: number;
     gapAnalysisEnabled?: boolean;
     noRemoteCache?: boolean;
     /**

package/dist/adapters/api-client/build-request.js CHANGED Viewed

@@ -130,6 +130,9 @@ export async function buildRemoteRequest(options) {
     if (config.graderReplications) {
         raw.graderReplications = config.graderReplications;
     }
+    if (config.borderlineReplications) {
+        raw.borderlineReplications = config.borderlineReplications;
+    }
     if (config.gapAnalysisEnabled)
         raw.gapAnalysis = true;
     if (config.noRemoteCache)

package/dist/adapters/attribution/attribution-meta-writer.d.ts ADDED Viewed

@@ -0,0 +1,35 @@
+/**
+ * attribution-meta-writer.ts — Zod schema for the run-scoped
+ * attribution metadata artifact (ATTR-01) emitted by Phase 4 and read
+ * back alongside the per-entry attribution objects.
+ *
+ * The schema asserts `satisfies z.ZodType<AttributionMeta>` against the
+ * canonical domain type in `packages/core/src/types/attribution.ts`
+ * (D0045 / W0187) — drift is a build error.
+ *
+ * `embeddingModel` is REQUIRED (Pitfall #6): silently downgrading to a
+ * default has caused regressions in adjacent codebases — model swaps
+ * MUST invalidate cached weights.
+ *
+ * Phase 1 lands the SHAPE only — no compute, no file I/O.
+ *
+ * @see docs/decisions/D0045-type-architecture-and-contract-enforcement.md
+ * @see docs/design-docs/actionability-ladder/04-per-document-attribution-ensemble.md
+ */
+import { z } from "zod";
+/**
+ * Canonical schema for {@link AttributionMeta}. Persisted at
+ * `runs/{runId}/attribution/_meta.json` (or whatever bulk path the
+ * Phase 4 descriptor pins) and parsed on read.
+ *
+ * Declared shape:
+ * - `ensembleVersion` — string.
+ * - `embeddingModel` — string (required; it is not wrapped in
+ *   `ZodOptional`).
+ * - `calibrationSetVersion` — optional string.
+ * - `weights` — object with numeric `citation`, `canonical` and
+ *   `retrieved` entries.
+ *
+ * Both object levels are typed with `z.core.$strip`, Zod's default
+ * object mode (unrecognized keys are dropped from the parsed output
+ * rather than rejected).
+ */
+export declare const AttributionMetaSchema: z.ZodObject<{
+    ensembleVersion: z.ZodString;
+    embeddingModel: z.ZodString;
+    calibrationSetVersion: z.ZodOptional<z.ZodString>;
+    weights: z.ZodObject<{
+        citation: z.ZodNumber;
+        canonical: z.ZodNumber;
+        retrieved: z.ZodNumber;
+    }, z.core.$strip>;
+}, z.core.$strip>;
+export type { AttributionMeta } from "../../_vendor/ailf-core/index.d.ts";

package/dist/adapters/attribution/attribution-meta-writer.js ADDED Viewed

@@ -0,0 +1,34 @@
+/**
+ * attribution-meta-writer.ts — Zod schema for the run-scoped
+ * attribution metadata artifact (ATTR-01) emitted by Phase 4 and read
+ * back alongside the per-entry attribution objects.
+ *
+ * The schema asserts `satisfies z.ZodType<AttributionMeta>` against the
+ * canonical domain type in `packages/core/src/types/attribution.ts`
+ * (D0045 / W0187) — drift is a build error.
+ *
+ * `embeddingModel` is REQUIRED (Pitfall #6): silently downgrading to a
+ * default has caused regressions in adjacent codebases — model swaps
+ * MUST invalidate cached weights.
+ *
+ * Phase 1 lands the SHAPE only — no compute, no file I/O.
+ *
+ * @see docs/decisions/D0045-type-architecture-and-contract-enforcement.md
+ * @see docs/design-docs/actionability-ladder/04-per-document-attribution-ensemble.md
+ */
+import { z } from "zod";
+/**
+ * Canonical schema for {@link AttributionMeta}. Persisted at
+ * `runs/{runId}/attribution/_meta.json` (or whatever bulk path the
+ * Phase 4 descriptor pins) and parsed on read.
+ *
+ * Constraints enforced when parsing:
+ * - `ensembleVersion`, `embeddingModel` — strings of at least one
+ *   character (`min(1)`), so an empty value is rejected.
+ * - `calibrationSetVersion` — optional string; no length constraint is
+ *   applied, so an empty string is accepted when the key is present.
+ * - `weights` — object whose `citation`, `canonical` and `retrieved`
+ *   entries are all required numbers; no range or sum check is applied
+ *   at this layer.
+ */
+export const AttributionMetaSchema = z.object({
+    ensembleVersion: z.string().min(1),
+    embeddingModel: z.string().min(1),
+    calibrationSetVersion: z.string().optional(),
+    weights: z.object({
+        citation: z.number(),
+        canonical: z.number(),
+        retrieved: z.number(),
+    }),
+});

package/dist/adapters/attribution/index.d.ts ADDED Viewed

@@ -0,0 +1,9 @@
+/**
+ * attribution adapter barrel — named re-exports only (W0124 / D0045).
+ *
+ * The attribution schemas live here so they enter the D0045
+ * `pnpm check-trust-boundary-satisfies` SCAN_ROOTS gate.
+ *
+ * Surface exposed by this barrel:
+ * - `JudgmentAttributionSchema` from `./per-entry-attribution-writer.js`
+ * - `AttributionMetaSchema` from `./attribution-meta-writer.js`
+ * - type-only re-exports of `AttributionMeta`, `DocAttribution` and
+ *   `JudgmentAttribution` from the vendored `ailf-core` declarations.
+ */
+export { JudgmentAttributionSchema } from "./per-entry-attribution-writer.js";
+export { AttributionMetaSchema } from "./attribution-meta-writer.js";
+export type { AttributionMeta, DocAttribution, JudgmentAttribution, } from "../../_vendor/ailf-core/index.d.ts";

package/dist/adapters/attribution/index.js ADDED Viewed

@@ -0,0 +1,8 @@
+/**
+ * attribution adapter barrel — named re-exports only (W0124 / D0045).
+ *
+ * The attribution schemas live here so they enter the D0045
+ * `pnpm check-trust-boundary-satisfies` SCAN_ROOTS gate.
+ *
+ * Runtime surface exposed by this barrel:
+ * - `JudgmentAttributionSchema` from `./per-entry-attribution-writer.js`
+ * - `AttributionMetaSchema` from `./attribution-meta-writer.js`
+ */
+export { JudgmentAttributionSchema } from "./per-entry-attribution-writer.js";
+export { AttributionMetaSchema } from "./attribution-meta-writer.js";

package/dist/adapters/attribution/per-entry-attribution-writer.d.ts ADDED Viewed

@@ -0,0 +1,56 @@
+/**
+ * per-entry-attribution-writer.ts — Zod schema for the per-judgment
+ * attribution artifact (ATTR-01) emitted by Phase 4's
+ * `ComputeAttributionStep` and read back by Phase 5's diagnosis runner.
+ *
+ * The schema asserts `satisfies z.ZodType<JudgmentAttribution>` against
+ * the canonical domain type in `packages/core/src/types/attribution.ts`
+ * (D0045 / W0187) — drift between schema and type is a build error.
+ *
+ * Phase 1 lands the SHAPE only — no compute, no file I/O. Phase 4 wires
+ * the writer; Phase 5 wires the reader. Both `satisfies` against this
+ * single source-of-truth schema.
+ *
+ * `hallucinationCheckedAgainst` is REQUIRED (Pitfall #11): consumers
+ * must be able to audit citation grounding without re-deriving the
+ * resolvable-set. The canonical task field is `contextDocs`; do NOT
+ * invent `expectedDocs` / `usedDocs` synonyms.
+ *
+ * @see docs/decisions/D0045-type-architecture-and-contract-enforcement.md
+ * @see docs/decisions/D0049-shared-confidence-contract.md
+ * @see docs/decisions/D0052-judgment-ref-granularity.md
+ * @see docs/design-docs/actionability-ladder/04-per-document-attribution-ensemble.md
+ */
+import { z } from "zod";
+/**
+ * Canonical schema for {@link JudgmentAttribution}. Persisted at
+ * `runs/{runId}/attribution/{entryKey}.json` (Phase 4) and parsed by
+ * the diagnosis runner on read (Phase 5).
+ *
+ * Declared shape:
+ * - `judgmentRef`, `taskId`, `modelId`, `dimension` — strings.
+ * - `attributions` — array with one object per document:
+ *   - `documentId` — string; `slug` — optional string.
+ *   - `score` — number.
+ *   - `signals` — object whose `citation`, `canonical` and `retrieved`
+ *     entries are each an optional number.
+ *   - `confidence` — object with `level` (one of `"low"`, `"medium"`,
+ *     `"high"`), numeric `signalsPresent`, and string `derivation`.
+ * - `hallucinationCheckedAgainst` — array of strings (required; it is
+ *   not wrapped in `ZodOptional`).
+ *
+ * Every object level is typed with `z.core.$strip`, Zod's default
+ * object mode (unrecognized keys are dropped from the parsed output
+ * rather than rejected).
+ */
+export declare const JudgmentAttributionSchema: z.ZodObject<{
+    judgmentRef: z.ZodString;
+    taskId: z.ZodString;
+    modelId: z.ZodString;
+    dimension: z.ZodString;
+    attributions: z.ZodArray<z.ZodObject<{
+        documentId: z.ZodString;
+        slug: z.ZodOptional<z.ZodString>;
+        score: z.ZodNumber;
+        signals: z.ZodObject<{
+            citation: z.ZodOptional<z.ZodNumber>;
+            canonical: z.ZodOptional<z.ZodNumber>;
+            retrieved: z.ZodOptional<z.ZodNumber>;
+        }, z.core.$strip>;
+        confidence: z.ZodObject<{
+            level: z.ZodEnum<{
+                low: "low";
+                medium: "medium";
+                high: "high";
+            }>;
+            signalsPresent: z.ZodNumber;
+            derivation: z.ZodString;
+        }, z.core.$strip>;
+    }, z.core.$strip>>;
+    hallucinationCheckedAgainst: z.ZodArray<z.ZodString>;
+}, z.core.$strip>;
+export type { DocAttribution, JudgmentAttribution } from "../../_vendor/ailf-core/index.d.ts";