npm - @sanity/ailf - Versions diffs - 4.6.0 → 6.0.0 - Mend

@sanity/ailf 4.6.0 → 6.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (185)

package/dist/pipeline/borderline-detector.js ADDED

@@ -0,0 +1,26 @@
+/**
+ * pipeline/borderline-detector.ts
+ *
+ * GRAD-04 borderline-band predicate. Pure computation; no I/O.
+ *
+ * A judgment is "borderline" when its score lies within ±5 of any of
+ * the three rubric thresholds (severity boundaries 30 / 50 / 60 from
+ * packages/eval/config/thresholds.ts:50/54/58 — critical / warning /
+ * info edges).
+ *
+ * Per D0005 (grader-model separation), borderline judgments trigger
+ * intra-grader consensus replication of the SAME pinned grader rather
+ * than inter-grader ensemble — preserving D0005's reproducibility
+ * posture.
+ *
+ * @see docs/decisions/D0005-grader-model-separation.md
+ * @see docs/design-docs/actionability-ladder/03-structured-grader-judgments.md
+ */
+export const BORDERLINE_BAND = 5; // half-width (±) of the band around each threshold
+/**
+ * True when |score − t| ≤ BORDERLINE_BAND for at least one `t` in
+ * `thresholds` (edges inclusive); false for an empty list. Pure — no I/O.
+ */
+export function isBorderline(score, thresholds) {
+    return thresholds.some((t) => Math.abs(score - t) <= BORDERLINE_BAND);
+}

package/dist/pipeline/calculate-scores.d.ts CHANGED

@@ -1,4 +1,5 @@
-import { type ActualScoreEntry, type ComponentResult, type Logger, type StoredTestResult, type SymbolPreflightReport, type TestResult, type TestSummary } from "../_vendor/ailf-core/index.d.ts";
+import { type ActualScoreEntry, type ComponentResult, type GraderReliability, type Logger, type StoredTestResult, type SymbolPreflightReport, type TestResult, type TestSummary } from "../_vendor/ailf-core/index.d.ts";
+import type { JudgmentConsistency } from "./grader-consistency.js";
 import { type ResolvedSourceConfig } from "../sources.js";
 import type { FeatureScore, GraderJudgment, PerModelEntry } from "./types.js";
 import { type ScoreTestGroupOptions } from "./compiler/scoring-bridge.js";
@@ -108,8 +109,84 @@ export declare function calculateScoresPerModel(resultsPath: string, goldProfile
  * assertion produces one GraderJudgment entry.
  *
  * Phase 3a prerequisite: structured judgment data for failure mode extraction.
+ *
+ * @param resultsPath - Path to the Promptfoo results JSON file.
+ * @param telemetry - Optional telemetry bundle (Plan 03-03). When passed,
+ *   the `parseFailures` counter on `telemetry.reliability` is incremented
+ *   on every strict-schema rejection so the live pipeline can surface
+ *   schema drift over time. `telemetry.runId` (when supplied) is threaded
+ *   into synthesized fall-back judgment ids so dedup keys are unique per-run.
+ */
+export declare function extractGraderJudgments(resultsPath: string, telemetry?: {
+    reliability: GraderReliability;
+    runId?: string;
+}): GraderJudgment[];
+/**
+ * Stamp every grader judgment with a D0049 ceiling-cross-check confidence
+ * triple and increment `GraderReliability.failureModeCalibration` whenever
+ * the grader's emitted `failureMode` disagrees with the
+ * ceiling-decomposition mode.
+ *
+ * Plan 03-03 — the grader's emitted `failureMode` is the source of truth
+ * for the mode itself (Plan 03-02 per-dimension taxonomies); this pass
+ * stamps confidence based on whether the structural ceiling signal agrees
+ * and surfaces calibration drift as a counter on `GraderReliability`.
+ *
+ * The function mutates `judgments` in place — it overlays
+ * `judgment.confidence` with the ceiling-cross-check stamp. If a judgment
+ * already carries a confidence from the strict-schema parse (Plan 03-01),
+ * the ceiling-cross-check stamp REPLACES it because the validator's
+ * derivation tag is the live-pipeline contract; the parsed-shape
+ * confidence (if emitted by the grader) is preserved on the original
+ * `parsedJudgment` upstream of this site.
+ *
+ * `hallucinationCheckedAgainst` is NOT populated here — `extractGraderJudgments`
+ * does not have access to `task.contextDocs ∪ run.documentManifest` at this
+ * site (the union travels through a separate path in
+ * `gap-analysis-step.ts`'s document-enrichment flow). Plan 03-04 will
+ * couple the doc-union population at the strict-schema flip site so the
+ * field is populated alongside the required-flip.
+ *
+ * @param judgments - Grader judgments produced by `extractGraderJudgments`.
+ * @param scores - Per-area feature scores; `ceilingScore` and `floorScore`
+ *   come from this lookup. Missing areas default to ceiling 100, floor 0
+ *   (preserves the pre-Plan-03-03 fall-back from `buildFailureModeReport`).
+ * @param reliability - `GraderReliability` sink whose
+ *   `failureModeCalibration` counter is incremented on disagreement.
+ *
+ * @see docs/decisions/D0005-grader-model-separation.md
+ * @see docs/decisions/D0049-shared-confidence-contract.md
  */
-export declare function extractGraderJudgments(resultsPath: string): GraderJudgment[];
+export declare function validateGraderJudgmentsCalibration(judgments: GraderJudgment[], scores: FeatureScore[], reliability: GraderReliability): void;
+/**
+ * Populate Pitfall #11 hallucination cross-check fields on grader
+ * judgments (Plan 03-04 GRAD-05).
+ *
+ * For each judgment, sets `hallucinationCheckedAgainst` to the union of
+ * (a) the slugs of docs the task declared in `context.docs` and (b) the
+ * run's full document manifest. For each entry in `judgment.docCitations`
+ * that carries a `slug`, sets `hallucinated: !union.has(slug)` — a slug
+ * that does not appear in either set is a fabrication, not a real
+ * resolvable doc.
+ *
+ * Mutates `judgments` in place. Slug-less citations are left with
+ * `hallucinated` undefined since the lookup key is the slug per the
+ * GraderJudgment domain doc-comment ("slug does not resolve against the
+ * task's contextDocs set").
+ *
+ * Called from `gap-analysis-step` (literacy mode only) after
+ * `descToDocRefs` and `documentManifest` are built — both inputs are
+ * unavailable at extract time. Non-literacy modes skip this step
+ * entirely (the runtime contract has no canonical doc set to check
+ * against).
+ *
+ * @param judgments - Grader judgments to enrich. Mutated in place.
+ * @param taskDocSlugs - Map from base task description (the form judgment
+ *   `taskId` carries after stripping the `(gold)` / `(baseline)` suffix)
+ *   to the slugs declared in that task's `context.docs`.
+ * @param manifestSlugs - All slugs in the run's document manifest.
+ */
+export declare function populateHallucinationFields(judgments: GraderJudgment[], taskDocSlugs: Map<string, string[]>, manifestSlugs: Iterable<string>): void;
 /**
  * Extract per-test results with model output from evaluation results.
  *
@@ -165,9 +242,43 @@ export declare function scoreAgenticResults(resultsPath: string, profile: Record
  */
 export declare function scoreAgenticResultsPerModel(resultsPath: string, profile: Record<string, number>, preflightOptions?: ScoreTestGroupOptions): Record<string, Record<string, ActualScoreEntry>>;
 /** Options for the calculate-scores main() function. */
+/**
+ * Pre-built runner closure for the GRAD-04 borderline-consensus pass.
+ *
+ * The composition root produces one of these via
+ * `createBorderlineConsensusRunner` so the threshold + replication
+ * defaults stay co-located with the rest of the pipeline wiring.
+ * `calculateAndWriteScores` invokes it after `extractGraderJudgments`
+ * and before persisting `grader-judgments.json` so the judgments file
+ * carries the consensus-merged scores rather than the original
+ * single-replica grader output (CR-01).
+ */
+export type BorderlineConsensusRunner = (args: {
+    judgments: GraderJudgment[];
+    logger?: Logger;
+    regrade: (judgment: GraderJudgment) => Promise<number>;
+}) => Promise<{
+    consistencyByJudgment: Map<string, JudgmentConsistency>;
+    judgments: GraderJudgment[];
+}>;
 export interface CalculateScoresOptions {
     /** Allowed origins for source isolation reporting */
     allowedOrigins?: string[];
+    /**
+     * Pre-built borderline-consensus runner (CR-01). When provided AND
+     * `borderlineReplications` is non-zero, it runs after extraction and
+     * persists `borderline-consistency.json` alongside
+     * `grader-judgments.json`. When omitted, the pipeline keeps the
+     * single-replica scores — preserving Phase 2 behavior for callers
+     * that haven't opted in.
+     */
+    borderlineConsensusRunner?: BorderlineConsensusRunner;
+    /**
+     * Optional regrade entry point used by the borderline runner. Wired
+     * in by the orchestration step from grader-api.ts; when absent, the
+     * runner is skipped because there's nothing to call.
+     */
+    borderlineRegradeOnce?: (responseText: string, rubricText: string) => Promise<null | number>;
     /** Logger instance (defaults to ConsoleLogger if not provided) */
     logger?: Logger;
     /** Evaluation mode (controls which result files are read) */
@@ -209,4 +320,4 @@ export interface CalculateScoresResult {
     /** Summary of test execution outcomes (total, passed, failed, errored). */
     testSummary?: TestSummary;
 }
-export declare function calculateAndWriteScores(options: CalculateScoresOptions): CalculateScoresResult;
+export declare function calculateAndWriteScores(options: CalculateScoresOptions): Promise<CalculateScoresResult>;