@sanity/ailf 4.6.0 → 5.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (119)
  1. package/canonical/grader-references/agent-harness-tools.yaml +42 -0
  2. package/canonical/grader-references/knowledge-probe-recall.yaml +36 -0
  3. package/canonical/grader-references/mcp-server-spec.yaml +51 -0
  4. package/canonical/grader-references/portable-text.yaml +48 -0
  5. package/config/rubrics.ts +38 -2
  6. package/dist/_vendor/ailf-core/artifact-registry.d.ts +60 -2
  7. package/dist/_vendor/ailf-core/artifact-registry.js +288 -7
  8. package/dist/_vendor/ailf-core/examples/index.d.ts +125 -26
  9. package/dist/_vendor/ailf-core/examples/index.js +146 -47
  10. package/dist/_vendor/ailf-core/ports/context.d.ts +8 -0
  11. package/dist/_vendor/ailf-core/ports/mode-handler.d.ts +15 -0
  12. package/dist/_vendor/ailf-core/schemas/branded-string.d.ts +40 -0
  13. package/dist/_vendor/ailf-core/schemas/branded-string.js +45 -0
  14. package/dist/_vendor/ailf-core/schemas/confidence-schema.d.ts +36 -0
  15. package/dist/_vendor/ailf-core/schemas/confidence-schema.js +32 -0
  16. package/dist/_vendor/ailf-core/schemas/eval-config.d.ts +1 -0
  17. package/dist/_vendor/ailf-core/schemas/eval-config.js +8 -4
  18. package/dist/_vendor/ailf-core/schemas/index.d.ts +2 -0
  19. package/dist/_vendor/ailf-core/schemas/index.js +9 -0
  20. package/dist/_vendor/ailf-core/schemas/pipeline-request.d.ts +1 -0
  21. package/dist/_vendor/ailf-core/schemas/pipeline-request.js +1 -0
  22. package/dist/_vendor/ailf-core/schemas/pipeline.d.ts +34 -8
  23. package/dist/_vendor/ailf-core/schemas/pipeline.js +23 -1
  24. package/dist/_vendor/ailf-core/services/diagnosis/registry.d.ts +40 -0
  25. package/dist/_vendor/ailf-core/services/diagnosis/registry.js +25 -0
  26. package/dist/_vendor/ailf-core/services/diagnosis-runner.d.ts +19 -0
  27. package/dist/_vendor/ailf-core/services/diagnosis-runner.js +19 -0
  28. package/dist/_vendor/ailf-core/services/index.d.ts +2 -0
  29. package/dist/_vendor/ailf-core/services/index.js +5 -0
  30. package/dist/_vendor/ailf-core/services/report-to-markdown.js +3 -2
  31. package/dist/_vendor/ailf-core/types/attribution.d.ts +82 -0
  32. package/dist/_vendor/ailf-core/types/attribution.js +18 -0
  33. package/dist/_vendor/ailf-core/types/branded-ids.d.ts +26 -1
  34. package/dist/_vendor/ailf-core/types/branded-ids.js +80 -4
  35. package/dist/_vendor/ailf-core/types/confidence.d.ts +1 -1
  36. package/dist/_vendor/ailf-core/types/confidence.js +7 -0
  37. package/dist/_vendor/ailf-core/types/diagnosis.d.ts +169 -0
  38. package/dist/_vendor/ailf-core/types/diagnosis.js +17 -0
  39. package/dist/_vendor/ailf-core/types/generalized-task.d.ts +16 -1
  40. package/dist/_vendor/ailf-core/types/grader-judgment.d.ts +125 -0
  41. package/dist/_vendor/ailf-core/types/grader-judgment.js +30 -0
  42. package/dist/_vendor/ailf-core/types/index.d.ts +80 -29
  43. package/dist/_vendor/ailf-core/types/index.js +15 -1
  44. package/dist/_vendor/ailf-core/types/legacy-grader-judgment.d.ts +55 -0
  45. package/dist/_vendor/ailf-core/types/legacy-grader-judgment.js +30 -0
  46. package/dist/_vendor/ailf-core/types/pipeline-request.d.ts +1 -0
  47. package/dist/_vendor/ailf-core/types/repo-config.d.ts +8 -0
  48. package/dist/_vendor/ailf-shared/document-ref.d.ts +1 -1
  49. package/dist/adapters/api-client/build-request.d.ts +1 -0
  50. package/dist/adapters/api-client/build-request.js +3 -0
  51. package/dist/adapters/attribution/attribution-meta-writer.d.ts +35 -0
  52. package/dist/adapters/attribution/attribution-meta-writer.js +34 -0
  53. package/dist/adapters/attribution/index.d.ts +9 -0
  54. package/dist/adapters/attribution/index.js +8 -0
  55. package/dist/adapters/attribution/per-entry-attribution-writer.d.ts +56 -0
  56. package/dist/adapters/attribution/per-entry-attribution-writer.js +49 -0
  57. package/dist/adapters/config-sources/file-config-adapter.js +1 -0
  58. package/dist/adapters/grader-outputs/index.d.ts +10 -0
  59. package/dist/adapters/grader-outputs/index.js +8 -0
  60. package/dist/adapters/grader-outputs/legacy/index.d.ts +11 -0
  61. package/dist/adapters/grader-outputs/legacy/index.js +10 -0
  62. package/dist/adapters/grader-outputs/legacy/promptfoo-grader-output-legacy.d.ts +49 -0
  63. package/dist/adapters/grader-outputs/legacy/promptfoo-grader-output-legacy.js +48 -0
  64. package/dist/adapters/grader-outputs/promptfoo-grader-output.d.ts +102 -0
  65. package/dist/adapters/grader-outputs/promptfoo-grader-output.js +93 -0
  66. package/dist/adapters/index.d.ts +3 -0
  67. package/dist/adapters/index.js +4 -0
  68. package/dist/adapters/task-sources/content-lake-task-source.d.ts +5 -1
  69. package/dist/adapters/task-sources/content-lake-task-source.js +28 -2
  70. package/dist/adapters/task-sources/repo-schemas.d.ts +79 -11
  71. package/dist/adapters/task-sources/repo-schemas.js +19 -2
  72. package/dist/commands/calculate-scores.js +1 -1
  73. package/dist/commands/explain-handler.js +1 -1
  74. package/dist/commands/lookup-doc.d.ts +1 -1
  75. package/dist/commands/lookup-doc.js +3 -3
  76. package/dist/commands/pipeline-action.d.ts +6 -0
  77. package/dist/commands/pipeline-action.js +2 -0
  78. package/dist/commands/remote-pipeline.js +1 -0
  79. package/dist/composition-root.d.ts +36 -0
  80. package/dist/composition-root.js +48 -0
  81. package/dist/config/rubrics.ts +38 -2
  82. package/dist/grader/agent-harness.d.ts +14 -0
  83. package/dist/grader/agent-harness.js +17 -0
  84. package/dist/grader/common.d.ts +17 -0
  85. package/dist/grader/common.js +21 -0
  86. package/dist/grader/index.d.ts +38 -0
  87. package/dist/grader/index.js +75 -0
  88. package/dist/grader/knowledge-probe.d.ts +14 -0
  89. package/dist/grader/knowledge-probe.js +18 -0
  90. package/dist/grader/literacy.d.ts +13 -0
  91. package/dist/grader/literacy.js +17 -0
  92. package/dist/grader/mcp.d.ts +14 -0
  93. package/dist/grader/mcp.js +18 -0
  94. package/dist/orchestration/build-app-context.js +1 -0
  95. package/dist/orchestration/build-step-sequence.js +5 -0
  96. package/dist/orchestration/steps/calculate-scores-step.js +23 -1
  97. package/dist/orchestration/steps/compute-attribution-step.d.ts +44 -0
  98. package/dist/orchestration/steps/compute-attribution-step.js +279 -0
  99. package/dist/orchestration/steps/gap-analysis-step.js +35 -7
  100. package/dist/orchestration/steps/index.d.ts +1 -0
  101. package/dist/orchestration/steps/index.js +1 -0
  102. package/dist/pipeline/attribution.d.ts +15 -0
  103. package/dist/pipeline/attribution.js +18 -9
  104. package/dist/pipeline/borderline-consensus-runner.d.ts +63 -0
  105. package/dist/pipeline/borderline-consensus-runner.js +124 -0
  106. package/dist/pipeline/borderline-detector.d.ts +24 -0
  107. package/dist/pipeline/borderline-detector.js +26 -0
  108. package/dist/pipeline/calculate-scores.d.ts +114 -3
  109. package/dist/pipeline/calculate-scores.js +426 -24
  110. package/dist/pipeline/compiler/literacy-bridge.d.ts +1 -1
  111. package/dist/pipeline/compiler/literacy-bridge.js +35 -17
  112. package/dist/pipeline/compiler/rubric-resolution.d.ts +15 -0
  113. package/dist/pipeline/compiler/rubric-resolution.js +9 -1
  114. package/dist/pipeline/compute-attribution.d.ts +80 -0
  115. package/dist/pipeline/compute-attribution.js +196 -0
  116. package/dist/pipeline/failure-modes.d.ts +52 -17
  117. package/dist/pipeline/failure-modes.js +178 -117
  118. package/dist/pipeline/map-request-to-config.js +1 -0
  119. package/package.json +6 -4
@@ -0,0 +1,26 @@
1
+ /**
2
+ * pipeline/borderline-detector.ts
3
+ *
4
+ * GRAD-04 borderline-band predicate. Pure computation; no I/O.
5
+ *
6
+ * A judgment is "borderline" when its score lies within ±5 of any of
7
+ * the three rubric thresholds (severity boundaries 30 / 50 / 60 from
8
+ * packages/eval/config/thresholds.ts:50/54/58 — critical / warning /
9
+ * info edges).
10
+ *
11
+ * Per D0005 (grader-model separation), borderline judgments trigger
12
+ * intra-grader consensus replication of the SAME pinned grader rather
13
+ * than inter-grader ensemble — preserving D0005's reproducibility
14
+ * posture.
15
+ *
16
+ * @see docs/decisions/D0005-grader-model-separation.md
17
+ * @see docs/design-docs/actionability-ladder/03-structured-grader-judgments.md
18
+ */
19
+ export const BORDERLINE_BAND = 5;
20
+ /**
21
+ * Returns true when `score` lies within ±BORDERLINE_BAND of any
22
+ * configured threshold. Pure function — safe to call N×.
23
+ */
24
+ export function isBorderline(score, thresholds) {
25
+ return thresholds.some((t) => Math.abs(score - t) <= BORDERLINE_BAND);
26
+ }
@@ -1,4 +1,5 @@
1
- import { type ActualScoreEntry, type ComponentResult, type Logger, type StoredTestResult, type SymbolPreflightReport, type TestResult, type TestSummary } from "../_vendor/ailf-core/index.d.ts";
1
+ import { type ActualScoreEntry, type ComponentResult, type GraderReliability, type Logger, type StoredTestResult, type SymbolPreflightReport, type TestResult, type TestSummary } from "../_vendor/ailf-core/index.d.ts";
2
+ import type { JudgmentConsistency } from "./grader-consistency.js";
2
3
  import { type ResolvedSourceConfig } from "../sources.js";
3
4
  import type { FeatureScore, GraderJudgment, PerModelEntry } from "./types.js";
4
5
  import { type ScoreTestGroupOptions } from "./compiler/scoring-bridge.js";
@@ -108,8 +109,84 @@ export declare function calculateScoresPerModel(resultsPath: string, goldProfile
108
109
  * assertion produces one GraderJudgment entry.
109
110
  *
110
111
  * Phase 3a prerequisite: structured judgment data for failure mode extraction.
112
+ *
113
+ * @param resultsPath - Path to the Promptfoo results JSON file.
114
+ * @param telemetry - Optional telemetry bag carrying the reliability counter (Plan 03-03). When passed,
115
+ * `parseFailures` is incremented on every strict-schema rejection so the
116
+ * live pipeline can surface schema drift over time.
117
+ * `runId` (when supplied) is threaded into synthesized fall-back judgment
118
+ * ids so dedup keys are unique per-run.
119
+ */
120
+ export declare function extractGraderJudgments(resultsPath: string, telemetry?: {
121
+ reliability: GraderReliability;
122
+ runId?: string;
123
+ }): GraderJudgment[];
124
+ /**
125
+ * Stamp every grader judgment with a D0049 ceiling-cross-check confidence
126
+ * triple and increment `GraderReliability.failureModeCalibration` whenever
127
+ * the grader's emitted `failureMode` disagrees with the
128
+ * ceiling-decomposition mode.
129
+ *
130
+ * Plan 03-03 — the grader's emitted `failureMode` is the source of truth
131
+ * for the mode itself (Plan 03-02 per-dimension taxonomies); this pass
132
+ * stamps confidence based on whether the structural ceiling signal agrees
133
+ * and surfaces calibration drift as a counter on `GraderReliability`.
134
+ *
135
+ * The function mutates `judgments` in place — it overlays
136
+ * `judgment.confidence` with the ceiling-cross-check stamp. If a judgment
137
+ * already carries a confidence from the strict-schema parse (Plan 03-01),
138
+ * the ceiling-cross-check stamp REPLACES it because the validator's
139
+ * derivation tag is the live-pipeline contract; the parsed-shape
140
+ * confidence (if emitted by the grader) is preserved on the original
141
+ * `parsedJudgment` upstream of this site.
142
+ *
143
+ * `hallucinationCheckedAgainst` is NOT populated here — `extractGraderJudgments`
144
+ * does not have access to `task.contextDocs ∪ run.documentManifest` at this
145
+ * site (the union travels through a separate path in
146
+ * `gap-analysis-step.ts`'s document-enrichment flow). Plan 03-04 will
147
+ * couple the doc-union population at the strict-schema flip site so the
148
+ * field is populated alongside the required-flip.
149
+ *
150
+ * @param judgments - Grader judgments produced by `extractGraderJudgments`.
151
+ * @param scores - Per-area feature scores; `ceilingScore` and `floorScore`
152
+ * come from this lookup. Missing areas default to ceiling 100, floor 0
153
+ * (preserves the pre-Plan-03-03 fall-back from `buildFailureModeReport`).
154
+ * @param reliability - `GraderReliability` sink whose
155
+ * `failureModeCalibration` counter is incremented on disagreement.
156
+ *
157
+ * @see docs/decisions/D0005-grader-model-separation.md
158
+ * @see docs/decisions/D0049-shared-confidence-contract.md
111
159
  */
112
- export declare function extractGraderJudgments(resultsPath: string): GraderJudgment[];
160
+ export declare function validateGraderJudgmentsCalibration(judgments: GraderJudgment[], scores: FeatureScore[], reliability: GraderReliability): void;
161
+ /**
162
+ * Populate Pitfall #11 hallucination cross-check fields on grader
163
+ * judgments (Plan 03-04 GRAD-05).
164
+ *
165
+ * For each judgment, sets `hallucinationCheckedAgainst` to the union of
166
+ * (a) the slugs of docs the task declared in `context.docs` and (b) the
167
+ * run's full document manifest. For each entry in `judgment.docCitations`
168
+ * that carries a `slug`, sets `hallucinated: !union.has(slug)` — a slug
169
+ * that does not appear in either set is a fabrication, not a real
170
+ * resolvable doc.
171
+ *
172
+ * Mutates `judgments` in place. Slug-less citations are left with
173
+ * `hallucinated` undefined since the lookup key is the slug per the
174
+ * GraderJudgment domain doc-comment ("slug does not resolve against the
175
+ * task's contextDocs set").
176
+ *
177
+ * Called from `gap-analysis-step` (literacy mode only) after
178
+ * `descToDocRefs` and `documentManifest` are built — both inputs are
179
+ * unavailable at extract time. Non-literacy modes skip this step
180
+ * entirely (the runtime contract has no canonical doc set to check
181
+ * against).
182
+ *
183
+ * @param judgments - Grader judgments to enrich. Mutated in place.
184
+ * @param taskDocSlugs - Map from base task description (the form judgment
185
+ * `taskId` carries after stripping the `(gold)` / `(baseline)` suffix)
186
+ * to the slugs declared in that task's `context.docs`.
187
+ * @param manifestSlugs - All slugs in the run's document manifest.
188
+ */
189
+ export declare function populateHallucinationFields(judgments: GraderJudgment[], taskDocSlugs: Map<string, string[]>, manifestSlugs: Iterable<string>): void;
113
190
  /**
114
191
  * Extract per-test results with model output from evaluation results.
115
192
  *
@@ -165,9 +242,43 @@ export declare function scoreAgenticResults(resultsPath: string, profile: Record
165
242
  */
166
243
  export declare function scoreAgenticResultsPerModel(resultsPath: string, profile: Record<string, number>, preflightOptions?: ScoreTestGroupOptions): Record<string, Record<string, ActualScoreEntry>>;
167
244
  /** Options for the calculate-scores main() function. */
245
+ /**
246
+ * Pre-built runner closure for the GRAD-04 borderline-consensus pass.
247
+ *
248
+ * The composition root produces one of these via
249
+ * `createBorderlineConsensusRunner` so the threshold + replication
250
+ * defaults stay co-located with the rest of the pipeline wiring.
251
+ * `calculateAndWriteScores` invokes it after `extractGraderJudgments`
252
+ * and before persisting `grader-judgments.json` so the judgments file
253
+ * carries the consensus-merged scores rather than the original
254
+ * single-replica grader output (CR-01).
255
+ */
256
+ export type BorderlineConsensusRunner = (args: {
257
+ judgments: GraderJudgment[];
258
+ logger?: Logger;
259
+ regrade: (judgment: GraderJudgment) => Promise<number>;
260
+ }) => Promise<{
261
+ consistencyByJudgment: Map<string, JudgmentConsistency>;
262
+ judgments: GraderJudgment[];
263
+ }>;
168
264
  export interface CalculateScoresOptions {
169
265
  /** Allowed origins for source isolation reporting */
170
266
  allowedOrigins?: string[];
267
+ /**
268
+ * Pre-built borderline-consensus runner (CR-01). When provided AND
269
+ * `borderlineReplications` is non-zero, runs after extraction and
270
+ * persists `borderline-consistency.json` alongside
271
+ * `grader-judgments.json`. When omitted, the pipeline keeps the
272
+ * single-replica scores — preserving Phase 2 behavior for callers
273
+ * that haven't opted in.
274
+ */
275
+ borderlineConsensusRunner?: BorderlineConsensusRunner;
276
+ /**
277
+ * Optional regrade entry point used by the borderline runner. Wired
278
+ * in by the orchestration step from grader-api.ts; when absent, the
279
+ * runner is skipped because there's nothing to call.
280
+ */
281
+ borderlineRegradeOnce?: (responseText: string, rubricText: string) => Promise<null | number>;
171
282
  /** Logger instance (defaults to ConsoleLogger if not provided) */
172
283
  logger?: Logger;
173
284
  /** Evaluation mode (controls which result files are read) */
@@ -209,4 +320,4 @@ export interface CalculateScoresResult {
209
320
  /** Summary of test execution outcomes (total, passed, failed, errored). */
210
321
  testSummary?: TestSummary;
211
322
  }
212
- export declare function calculateAndWriteScores(options: CalculateScoresOptions): CalculateScoresResult;
323
+ export declare function calculateAndWriteScores(options: CalculateScoresOptions): Promise<CalculateScoresResult>;