@sanity/ailf 4.1.0 → 4.3.0

This diff shows the contents of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.
Files changed (126)
  1. package/config/package-surface.ts +37 -0
  2. package/config/preflight-scoring.ts +26 -0
  3. package/dist/_vendor/ailf-core/artifact-registry.d.ts +1 -1
  4. package/dist/_vendor/ailf-core/artifact-registry.js +47 -0
  5. package/dist/_vendor/ailf-core/config-helpers.d.ts +35 -0
  6. package/dist/_vendor/ailf-core/config-helpers.js +67 -0
  7. package/dist/_vendor/ailf-core/index.d.ts +1 -1
  8. package/dist/_vendor/ailf-core/index.js +1 -1
  9. package/dist/_vendor/ailf-core/ports/context.d.ts +18 -0
  10. package/dist/_vendor/ailf-core/ports/doc-fetcher.d.ts +30 -0
  11. package/dist/_vendor/ailf-core/ports/index.d.ts +3 -1
  12. package/dist/_vendor/ailf-core/ports/index.js +1 -0
  13. package/dist/_vendor/ailf-core/ports/mode-handler.d.ts +23 -0
  14. package/dist/_vendor/ailf-core/ports/package-surface-resolver.d.ts +71 -0
  15. package/dist/_vendor/ailf-core/ports/package-surface-resolver.js +36 -0
  16. package/dist/_vendor/ailf-core/schemas/eval-config.d.ts +6 -0
  17. package/dist/_vendor/ailf-core/schemas/eval-config.js +14 -0
  18. package/dist/_vendor/ailf-core/schemas/index.d.ts +1 -0
  19. package/dist/_vendor/ailf-core/schemas/index.js +1 -0
  20. package/dist/_vendor/ailf-core/schemas/symbol-preflight-report.d.ts +51 -0
  21. package/dist/_vendor/ailf-core/schemas/symbol-preflight-report.js +57 -0
  22. package/dist/_vendor/ailf-core/types/generalized-task.d.ts +20 -3
  23. package/dist/_vendor/ailf-core/types/index.d.ts +13 -1
  24. package/dist/_vendor/ailf-core/types/index.js +1 -0
  25. package/dist/_vendor/ailf-core/types/package-surface.d.ts +36 -0
  26. package/dist/_vendor/ailf-core/types/package-surface.js +13 -0
  27. package/dist/_vendor/ailf-core/types/preflight-scoring.d.ts +52 -0
  28. package/dist/_vendor/ailf-core/types/preflight-scoring.js +18 -0
  29. package/dist/_vendor/ailf-core/types/repo-config.d.ts +14 -0
  30. package/dist/_vendor/ailf-core/types/symbol-preflight-report.d.ts +66 -0
  31. package/dist/_vendor/ailf-core/types/symbol-preflight-report.js +25 -0
  32. package/dist/adapters/config-sources/file-config-adapter.js +1 -0
  33. package/dist/adapters/doc-fetchers/sanity-doc-fetcher.d.ts +25 -5
  34. package/dist/adapters/doc-fetchers/sanity-doc-fetcher.js +276 -95
  35. package/dist/adapters/index.d.ts +1 -0
  36. package/dist/adapters/index.js +1 -0
  37. package/dist/adapters/package-surface/dts-package-surface.d.ts +46 -0
  38. package/dist/adapters/package-surface/dts-package-surface.js +173 -0
  39. package/dist/adapters/package-surface/in-memory-package-surface.d.ts +15 -0
  40. package/dist/adapters/package-surface/in-memory-package-surface.js +28 -0
  41. package/dist/adapters/package-surface/index.d.ts +9 -0
  42. package/dist/adapters/package-surface/index.js +8 -0
  43. package/dist/adapters/package-surface/parse-dts-exports.d.ts +31 -0
  44. package/dist/adapters/package-surface/parse-dts-exports.js +54 -0
  45. package/dist/adapters/task-sources/repo-schemas.d.ts +22 -0
  46. package/dist/adapters/task-sources/repo-schemas.js +93 -1
  47. package/dist/adapters/task-sources/repo-task-source.js +11 -2
  48. package/dist/commands/pipeline-action.d.ts +2 -0
  49. package/dist/commands/pipeline-action.js +12 -0
  50. package/dist/commands/remote-pipeline.js +9 -2
  51. package/dist/commands/remote-results.d.ts +12 -1
  52. package/dist/commands/remote-results.js +25 -5
  53. package/dist/commands/validate-tasks.js +8 -2
  54. package/dist/composition-root.js +9 -0
  55. package/dist/config/package-surface.ts +37 -0
  56. package/dist/config/preflight-scoring.ts +26 -0
  57. package/dist/index.d.ts +2 -2
  58. package/dist/index.js +1 -1
  59. package/dist/orchestration/build-app-context.js +1 -0
  60. package/dist/orchestration/pipeline-orchestrator.d.ts +19 -1
  61. package/dist/orchestration/pipeline-orchestrator.js +38 -0
  62. package/dist/orchestration/steps/calculate-scores-step.js +11 -0
  63. package/dist/orchestration/steps/generate-configs-step.js +16 -1
  64. package/dist/orchestration/steps/run-eval-step.js +27 -0
  65. package/dist/pipeline/calculate-scores.d.ts +66 -5
  66. package/dist/pipeline/calculate-scores.js +141 -27
  67. package/dist/pipeline/compiler/index.d.ts +1 -1
  68. package/dist/pipeline/compiler/index.js +1 -1
  69. package/dist/pipeline/compiler/literacy-bridge.d.ts +9 -0
  70. package/dist/pipeline/compiler/literacy-bridge.js +2 -0
  71. package/dist/pipeline/compiler/mode-handlers/__fixtures__/agent-harness-example-tasks.js +0 -12
  72. package/dist/pipeline/compiler/mode-handlers/__fixtures__/knowledge-probe-example-tasks.js +0 -12
  73. package/dist/pipeline/compiler/mode-handlers/literacy/assertions.d.ts +1 -1
  74. package/dist/pipeline/compiler/mode-handlers/literacy/assertions.js +31 -4
  75. package/dist/pipeline/compiler/mode-handlers/literacy/compiler.js +190 -6
  76. package/dist/pipeline/compiler/mode-handlers/literacy/index.js +2 -0
  77. package/dist/pipeline/compiler/mode-handlers/literacy/types.d.ts +17 -2
  78. package/dist/pipeline/compiler/rubric-resolution.d.ts +17 -1
  79. package/dist/pipeline/compiler/rubric-resolution.js +78 -2
  80. package/dist/pipeline/compiler/scoring-bridge.d.ts +49 -2
  81. package/dist/pipeline/compiler/scoring-bridge.js +104 -10
  82. package/dist/pipeline/eval-fingerprint.d.ts +9 -0
  83. package/dist/pipeline/eval-fingerprint.js +7 -1
  84. package/dist/pipeline/preflight/compute-preflight.d.ts +67 -0
  85. package/dist/pipeline/preflight/compute-preflight.js +118 -0
  86. package/dist/pipeline/preflight/emit-symbol-preflight.d.ts +51 -0
  87. package/dist/pipeline/preflight/emit-symbol-preflight.js +102 -0
  88. package/dist/pipeline/preflight/load-package-surface.d.ts +14 -0
  89. package/dist/pipeline/preflight/load-package-surface.js +19 -0
  90. package/dist/pipeline/preflight/load-preflight-context.d.ts +13 -0
  91. package/dist/pipeline/preflight/load-preflight-context.js +25 -0
  92. package/dist/pipeline/preflight/load-preflight-scoring.d.ts +12 -0
  93. package/dist/pipeline/preflight/load-preflight-scoring.js +17 -0
  94. package/dist/pipeline/preflight/parse-imports.d.ts +62 -0
  95. package/dist/pipeline/preflight/parse-imports.js +125 -0
  96. package/dist/report-store.d.ts +8 -0
  97. package/dist/report-store.js +55 -6
  98. package/dist/sanity/document-renderers.d.ts +106 -0
  99. package/dist/sanity/document-renderers.js +307 -0
  100. package/dist/sanity/queries.d.ts +32 -11
  101. package/dist/sanity/queries.js +78 -0
  102. package/dist/sanity/symbol-index.d.ts +98 -0
  103. package/dist/sanity/symbol-index.js +615 -0
  104. package/dist/tasks/knowledge-probe/define-type-api.task.ts +2 -6
  105. package/dist/tasks/knowledge-probe/groq-projections.task.ts +0 -5
  106. package/dist/tasks/literacy/content-lake.task.ts +4 -10
  107. package/dist/tasks/literacy/frameworks.task.ts +2 -8
  108. package/dist/tasks/literacy/functions.task.ts +1 -4
  109. package/dist/tasks/literacy/groq.task.ts +3 -12
  110. package/dist/tasks/literacy/image-handling.task.ts +1 -4
  111. package/dist/tasks/literacy/nextjs-live.task.ts +1 -4
  112. package/dist/tasks/literacy/portable-text.task.ts +2 -8
  113. package/dist/tasks/literacy/studio-setup.task.ts +2 -8
  114. package/dist/tasks/literacy/visual-editing.task.ts +2 -8
  115. package/package.json +2 -1
  116. package/tasks/knowledge-probe/define-type-api.task.ts +2 -6
  117. package/tasks/knowledge-probe/groq-projections.task.ts +0 -5
  118. package/tasks/literacy/content-lake.task.ts +4 -10
  119. package/tasks/literacy/frameworks.task.ts +2 -8
  120. package/tasks/literacy/functions.task.ts +1 -4
  121. package/tasks/literacy/groq.task.ts +3 -12
  122. package/tasks/literacy/image-handling.task.ts +1 -4
  123. package/tasks/literacy/nextjs-live.task.ts +1 -4
  124. package/tasks/literacy/portable-text.task.ts +2 -8
  125. package/tasks/literacy/studio-setup.task.ts +2 -8
  126. package/tasks/literacy/visual-editing.task.ts +2 -8
@@ -29,7 +29,7 @@
29
29
  */
30
30
  import { existsSync, mkdirSync, readFileSync, writeFileSync } from "fs";
31
31
  import { join } from "path";
32
- import { classifyRubric, detectFeatureArea, extractUrlMetadata, mergeScores, parseRubricScore, } from "../_vendor/ailf-core/index.js";
32
+ import { classifyRubric, detectFeatureArea, extractUrlMetadata, mergeScores, parseRubricScore, resolveVariantMode, } from "../_vendor/ailf-core/index.js";
33
33
  import { calculateCost } from "../agent-observer/pricing.js";
34
34
  import { ConsoleLogger } from "../adapters/loggers/index.js";
35
35
  import { analyzeSourceIsolation, } from "../assertions/source-isolation.js";
@@ -38,7 +38,7 @@ import { loadRubricTemplates } from "./rubric-loader.js";
38
38
  import { resolveProfile } from "./profile-resolution.js";
39
39
  import { loadSource } from "../sources.js";
40
40
  import { LiteracyVariant } from "./normalize-mode.js";
41
- import { scoreTestGroup } from "./compiler/scoring-bridge.js";
41
+ import { scoreTestGroup, } from "./compiler/scoring-bridge.js";
42
42
  // Re-export from core for backward compatibility.
43
43
  // Existing imports from this file continue to work unchanged.
44
44
  export { classifyRubric, detectFeatureArea, extractUrlMetadata, mergeScores, parseRubricScore, } from "../_vendor/ailf-core/index.js";
@@ -52,7 +52,7 @@ export { classifyRubric, detectFeatureArea, extractUrlMetadata, mergeScores, par
52
52
  * @returns Record keyed by model ID, or null if only one model was used
53
53
  * (per-model breakdown is redundant when there's only one model).
54
54
  */
55
- export function calculateScoresPerModel(resultsPath, goldProfile, baselineProfile) {
55
+ export function calculateScoresPerModel(resultsPath, goldProfile, baselineProfile, preflightOptions) {
56
56
  const results = readAndNormalizeResults(resultsPath);
57
57
  // Group results by provider
58
58
  const byModel = {};
@@ -72,7 +72,7 @@ export function calculateScoresPerModel(resultsPath, goldProfile, baselineProfil
72
72
  }
73
73
  const perModel = [];
74
74
  for (const [modelId, { label, results: modelResults }] of Object.entries(byModel)) {
75
- const scores = scoreResults(modelResults, goldProfile, baselineProfile, modelId);
75
+ const scores = scoreResults(modelResults, goldProfile, baselineProfile, modelId, preflightOptions);
76
76
  const totalTests = scores.reduce((s, sc) => s + sc.testCount, 0);
77
77
  const totalCost = scores.reduce((s, sc) => s + sc.totalCost, 0);
78
78
  const avgScore = scores.length > 0
@@ -408,9 +408,111 @@ function buildSourceVerification(root, source, verificationCtx) {
408
408
  * Calculate overall scores (all models combined).
409
409
  * This is the original scoring path — backward compatible.
410
410
  */
411
- function calculateScores(resultsPath, goldProfile, baselineProfile) {
411
+ function calculateScores(resultsPath, goldProfile, baselineProfile, preflightOptions) {
412
412
  const results = readAndNormalizeResults(resultsPath);
413
- return scoreResults(results, goldProfile, baselineProfile);
413
+ return scoreResults(results, goldProfile, baselineProfile, undefined, preflightOptions);
414
+ }
415
+ /**
416
+ * W0198 — build a `ScoreTestGroupOptions` that the scoring bridge can
417
+ * use to look up a `SymbolPreflightReport` for any given `TestResult`.
418
+ *
419
+ * Mirrors the keying scheme `emitSymbolPreflight` uses:
420
+ * `${runId}/${mode}/${task}/${model}` where `(mode, task)` come from
421
+ * `resolveVariantMode(test.description, defaultMode)`.
422
+ *
423
+ * Returns `undefined` (effectively a no-op) when reports are absent,
424
+ * empty, or the runId hasn't been provided — those collapse cleanly
425
+ * to the pre-W0198 path. The runId branch logs a warning when reports
426
+ * exist but the caller forgot to wire `runId` so the silent
427
+ * preflight-disabled state doesn't go unobserved.
428
+ */
429
+ function makePreflightOptions(reports, runId, defaultMode, weight, logger) {
430
+ if (!reports || reports.size === 0)
431
+ return undefined;
432
+ if (!runId) {
433
+ logger?.warn(`[warn] W0198 preflight: ${reports.size} preflight report(s) provided but no runId — skipping merge into code-correctness`);
434
+ return undefined;
435
+ }
436
+ return {
437
+ preflightWeight: weight,
438
+ preflightForTest: (test) => {
439
+ const modelId = test.providerId ?? test.providerLabel ?? "unknown-model";
440
+ const { mode: axisMode, task } = resolveVariantMode(test.description, defaultMode);
441
+ const key = `${runId}/${axisMode}/${task}/${modelId}`;
442
+ return reports.get(key);
443
+ },
444
+ };
445
+ }
446
+ /**
447
+ * W0198 — aggregate every per-test `SymbolPreflightReport` into a single
448
+ * resolver-health summary. Returns `undefined` when the run had no
449
+ * preflight reports (manifest disabled, resolver missing, or every
450
+ * candidate output cited zero in-scope packages) so the consumer can
451
+ * cleanly omit the field from the score summary instead of writing a
452
+ * vacuous block of zeros.
453
+ *
454
+ * Exported for the dedicated unit test in `preflight-summary.test.ts`;
455
+ * production calls go through `calculateAndWriteScores`, which threads
456
+ * the result into the `EvalScoreSummary.preflight` field.
457
+ */
458
+ export function summarizePreflight(reports) {
459
+ if (!reports || reports.size === 0)
460
+ return undefined;
461
+ let totalFindings = 0;
462
+ let exists = 0;
463
+ let missing = 0;
464
+ let unresolved = 0;
465
+ for (const report of reports.values()) {
466
+ for (const finding of report.findings) {
467
+ totalFindings++;
468
+ if (finding.result === "exists") {
469
+ exists++;
470
+ }
471
+ else if (finding.result === "missing") {
472
+ missing++;
473
+ }
474
+ else if (finding.result === "unresolved") {
475
+ unresolved++;
476
+ }
477
+ else {
478
+ // Exhaustiveness guard: a future fourth `result` variant lands
479
+ // here and surfaces as a build error rather than silently
480
+ // counting into `unresolved`.
481
+ const _exhaustive = finding;
482
+ void _exhaustive;
483
+ }
484
+ }
485
+ }
486
+ return {
487
+ reportCount: reports.size,
488
+ totalFindings,
489
+ exists,
490
+ missing,
491
+ unresolved,
492
+ ...(totalFindings > 0 && { unresolvedRate: unresolved / totalFindings }),
493
+ };
494
+ }
495
+ /**
496
+ * Print the preflight summary to the run log. Format mirrors the other
497
+ * single-line health signals (URL fetch, agent isolation) so CI grep can
498
+ * extract `unresolvedRate` directly from the log when score-summary.json
499
+ * isn't already in scope.
500
+ */
501
+ function printPreflightSummary(summary, log) {
502
+ if (!summary)
503
+ return;
504
+ // `unresolvedRate` is absent when the run produced reports but no
505
+ // findings — distinguish vacuous-green from all-resolved so CI doesn't
506
+ // misread the threshold.
507
+ const rateLabel = summary.unresolvedRate === undefined
508
+ ? "n/a (no findings)"
509
+ : `${(summary.unresolvedRate * 100).toFixed(1)}%`;
510
+ log.info("-".repeat(80));
511
+ log.info("SYMBOL PREFLIGHT (W0198)");
512
+ log.info("-".repeat(80));
513
+ log.info(` ${summary.reportCount} report(s), ${summary.totalFindings} finding(s): ${summary.exists} exists / ${summary.missing} missing / ${summary.unresolved} unresolved`);
514
+ log.info(` unresolvedRate: ${rateLabel} (resolver-health signal — not a candidate score factor)`);
515
+ log.info("");
414
516
  }
415
517
  /**
416
518
  * Extracts agent behavior summary from a test result's metadata.
@@ -644,7 +746,7 @@ function readAndNormalizeResults(resultsPath, log) {
644
746
  * @param baselineProfile Weight profile for baseline (without-docs) entries
645
747
  * @param modelId Optional model identifier to tag each FeatureScore
646
748
  */
647
- function scoreResults(results, goldProfile, baselineProfile, modelId) {
749
+ function scoreResults(results, goldProfile, baselineProfile, modelId, preflightOptions) {
648
750
  // Group by feature + docs/no-docs
649
751
  const byFeature = {};
650
752
  for (const result of results) {
@@ -663,12 +765,12 @@ function scoreResults(results, goldProfile, baselineProfile, modelId) {
663
765
  const scores = [];
664
766
  for (const [feature, data] of Object.entries(byFeature)) {
665
767
  // --- With docs (gold / ceiling) — scored via 4-tier engine ---
666
- const gold = scoreTestGroup(data.withDocs, goldProfile, feature);
768
+ const gold = scoreTestGroup(data.withDocs, goldProfile, feature, preflightOptions);
667
769
  // --- Without docs (baseline / floor) ---
668
770
  // Uses the baseline profile (e.g. "output-only") which may exclude
669
771
  // dimensions like doc-coverage that are undefined without docs.
670
772
  // See docs/design-docs/named-scoring-profiles.md.
671
- const baseline = scoreTestGroup(data.withoutDocs, baselineProfile, feature);
773
+ const baseline = scoreTestGroup(data.withoutDocs, baselineProfile, feature, preflightOptions);
672
774
  const featureCost = gold.totalCost + baseline.totalCost;
673
775
  const ceilingScore = gold.composite;
674
776
  const floorScore = baseline.composite;
@@ -709,7 +811,7 @@ function scoreResults(results, goldProfile, baselineProfile, modelId) {
709
811
  * Literacy-specific fields (ceilingScore, floorScore, docLift, docQualityGap)
710
812
  * are set to 0 for backward compatibility with downstream consumers.
711
813
  */
712
- function scoreAgentHarnessResults(results, profile) {
814
+ function scoreAgentHarnessResults(results, profile, preflightOptions) {
713
815
  // Group by task ID (extracted from description: "task-id — Title")
714
816
  const byTask = {};
715
817
  for (const result of results) {
@@ -721,7 +823,7 @@ function scoreAgentHarnessResults(results, profile) {
721
823
  }
722
824
  const scores = [];
723
825
  for (const [taskId, taskResults] of Object.entries(byTask)) {
724
- const scored = scoreTestGroup(taskResults, profile, taskId);
826
+ const scored = scoreTestGroup(taskResults, profile, taskId, preflightOptions);
725
827
  const totalCost = scored.totalCost;
726
828
  // Detect feature area for backward compat (used by report grouping)
727
829
  const feature = taskResults[0]?.vars.__featureArea ??
@@ -774,7 +876,7 @@ function extractTaskId(description) {
774
876
  * currency). Literacy-specific fields (ceilingScore, floorScore, docLift,
775
877
  * docQualityGap) are zero — KP has no with-docs/without-docs decomposition.
776
878
  */
777
- export function scoreKnowledgeProbeResults(results, profile) {
879
+ export function scoreKnowledgeProbeResults(results, profile, preflightOptions) {
778
880
  const byFeature = {};
779
881
  for (const result of results) {
780
882
  const feature = result.vars.__featureArea || detectFeatureArea(result.description);
@@ -785,7 +887,7 @@ export function scoreKnowledgeProbeResults(results, profile) {
785
887
  }
786
888
  const scores = [];
787
889
  for (const [feature, featureResults] of Object.entries(byFeature)) {
788
- const scored = scoreTestGroup(featureResults, profile, feature);
890
+ const scored = scoreTestGroup(featureResults, profile, feature, preflightOptions);
789
891
  scores.push({
790
892
  assertionPassRate: scored.dimensions.assertionPassRate,
791
893
  ceilingScore: 0,
@@ -817,7 +919,7 @@ export function scoreKnowledgeProbeResults(results, profile) {
817
919
  * Returns a record keyed by feature area with the composite actual score.
818
920
  */
819
921
  // ActualScoreEntry — imported from @sanity/ailf-core via pipeline/types.js
820
- export function scoreAgenticResults(resultsPath, profile) {
922
+ export function scoreAgenticResults(resultsPath, profile, preflightOptions) {
821
923
  const results = readAndNormalizeResults(resultsPath);
822
924
  // Group by feature area
823
925
  const byFeature = {};
@@ -830,7 +932,7 @@ export function scoreAgenticResults(resultsPath, profile) {
830
932
  }
831
933
  const entries = {};
832
934
  for (const [feature, featureResults] of Object.entries(byFeature)) {
833
- const scored = scoreTestGroup(featureResults, profile, feature);
935
+ const scored = scoreTestGroup(featureResults, profile, feature, preflightOptions);
834
936
  entries[feature] = {
835
937
  actualScore: scored.composite,
836
938
  codeCorrectness: scored.dimensions.codeCorrectness ?? 0,
@@ -849,7 +951,7 @@ export function scoreAgenticResults(resultsPath, profile) {
849
951
  * producing a map of model → feature → ActualScoreEntry.
850
952
  * Used to enrich the per-model breakdown with actual scores in full mode.
851
953
  */
852
- export function scoreAgenticResultsPerModel(resultsPath, profile) {
954
+ export function scoreAgenticResultsPerModel(resultsPath, profile, preflightOptions) {
853
955
  const results = readAndNormalizeResults(resultsPath);
854
956
  // Group by model, then feature
855
957
  const byModel = {};
@@ -866,7 +968,7 @@ export function scoreAgenticResultsPerModel(resultsPath, profile) {
866
968
  for (const [modelId, features] of Object.entries(byModel)) {
867
969
  perModel[modelId] = {};
868
970
  for (const [feature, featureResults] of Object.entries(features)) {
869
- const scored = scoreTestGroup(featureResults, profile, feature);
971
+ const scored = scoreTestGroup(featureResults, profile, feature, preflightOptions);
870
972
  perModel[modelId][feature] = {
871
973
  actualScore: scored.composite,
872
974
  codeCorrectness: scored.dimensions.codeCorrectness ?? 0,
@@ -912,6 +1014,15 @@ export function calculateAndWriteScores(options) {
912
1014
  }
913
1015
  // Determine mode — controls which result files are read
914
1016
  const mode = options.mode ?? LiteracyVariant.STANDARD;
1017
+ // W0198 — assemble preflight options once. The helper returns
1018
+ // `undefined` when reports / runId are missing, so all downstream
1019
+ // callers handle the no-preflight case uniformly.
1020
+ const preflightOptions = makePreflightOptions(options.preflightReports, options.runId, mode, options.preflightWeight, log);
1021
+ // W0198 — resolver-health summary. Independent of `preflightOptions`
1022
+ // (which gates the score merge): when reports exist but the runId is
1023
+ // missing, scoring stays on the rubric-only path while telemetry still
1024
+ // surfaces, so the resolver's drift remains visible.
1025
+ const preflightSummary = summarizePreflight(options.preflightReports);
915
1026
  const baselineResultsPath = options.resultsPath ?? join(ROOT, "results", "latest", "eval-results.json");
916
1027
  // Agentic results path (only used in full mode)
917
1028
  const agenticResultsPath = join(ROOT, "results", "latest", "eval-results-agentic.json");
@@ -940,7 +1051,7 @@ export function calculateAndWriteScores(options) {
940
1051
  const agentProfile = resolveProfile("agent-harness", "gold", rubricConfig);
941
1052
  log.debug("Agent-harness scoring profile", agentProfile);
942
1053
  const results = readAndNormalizeResults(baselineResultsPath);
943
- const scores = scoreAgentHarnessResults(results, agentProfile);
1054
+ const scores = scoreAgentHarnessResults(results, agentProfile, preflightOptions);
944
1055
  log.debug("Agent-harness scores calculated", {
945
1056
  taskCount: scores.length,
946
1057
  tasks: scores.map((s) => ({
@@ -960,7 +1071,7 @@ export function calculateAndWriteScores(options) {
960
1071
  const summary = printReport(scores, urlRefs, source, null, // no agent behavior (that's for literacy agentic mode)
961
1072
  graderCost, null, // no per-model breakdown
962
1073
  null, // no source isolation
963
- sourceVerification, "agent-harness", log);
1074
+ sourceVerification, "agent-harness", log, preflightSummary);
964
1075
  // Persist
965
1076
  const outDir = join(ROOT, "results", "latest");
966
1077
  mkdirSync(outDir, { recursive: true });
@@ -992,7 +1103,7 @@ export function calculateAndWriteScores(options) {
992
1103
  const probeProfile = resolveProfile("knowledge-probe", "gold", rubricConfig);
993
1104
  log.debug("Knowledge-probe scoring profile", probeProfile);
994
1105
  const results = readAndNormalizeResults(baselineResultsPath);
995
- const scores = scoreKnowledgeProbeResults(results, probeProfile);
1106
+ const scores = scoreKnowledgeProbeResults(results, probeProfile, preflightOptions);
996
1107
  log.debug("Knowledge-probe scores calculated", {
997
1108
  featureCount: scores.length,
998
1109
  features: scores.map((s) => ({
@@ -1012,7 +1123,7 @@ export function calculateAndWriteScores(options) {
1012
1123
  const summary = printReport(scores, urlRefs, source, null, // no agent behavior — KP is closed-book
1013
1124
  graderCost, null, // no per-model breakdown for now
1014
1125
  null, // no source isolation — KP doesn't fetch sources
1015
- sourceVerification, "knowledge-probe", log);
1126
+ sourceVerification, "knowledge-probe", log, preflightSummary);
1016
1127
  // Persist
1017
1128
  const outDir = join(ROOT, "results", "latest");
1018
1129
  mkdirSync(outDir, { recursive: true });
@@ -1041,7 +1152,7 @@ export function calculateAndWriteScores(options) {
1041
1152
  gold: goldProfile,
1042
1153
  baseline: baselineProfileWeights,
1043
1154
  });
1044
- const baselineScores = calculateScores(baselineResultsPath, goldProfile, baselineProfileWeights);
1155
+ const baselineScores = calculateScores(baselineResultsPath, goldProfile, baselineProfileWeights, preflightOptions);
1045
1156
  log.debug("Baseline scores calculated", {
1046
1157
  featureCount: baselineScores.length,
1047
1158
  features: baselineScores.map((s) => ({
@@ -1051,7 +1162,7 @@ export function calculateAndWriteScores(options) {
1051
1162
  docLift: s.docLift,
1052
1163
  })),
1053
1164
  });
1054
- const perModel = calculateScoresPerModel(baselineResultsPath, goldProfile, baselineProfileWeights);
1165
+ const perModel = calculateScoresPerModel(baselineResultsPath, goldProfile, baselineProfileWeights, preflightOptions);
1055
1166
  const urlRefs = aggregateUrlReferences(baselineResultsPath);
1056
1167
  const sourceVerification = buildSourceVerification(ROOT, source, {
1057
1168
  allowedOrigins: options.allowedOrigins,
@@ -1067,7 +1178,7 @@ export function calculateAndWriteScores(options) {
1067
1178
  if (mode === LiteracyVariant.FULL && existsSync(agenticResultsPath)) {
1068
1179
  log.info(`\nReading agentic results from: ${agenticResultsPath}`);
1069
1180
  const agenticProfile = resolveProfile("literacy", "gold", rubricConfig, LiteracyVariant.AGENTIC);
1070
- const agenticScores = scoreAgenticResults(agenticResultsPath, agenticProfile);
1181
+ const agenticScores = scoreAgenticResults(agenticResultsPath, agenticProfile, preflightOptions);
1071
1182
  log.debug("Agentic scores calculated", {
1072
1183
  featureCount: Object.keys(agenticScores).length,
1073
1184
  features: Object.entries(agenticScores).map(([f, s]) => ({
@@ -1080,7 +1191,7 @@ export function calculateAndWriteScores(options) {
1080
1191
  evaluationMode = LiteracyVariant.FULL;
1081
1192
  // Merge agentic actual scores into the per-model breakdown
1082
1193
  if (perModel) {
1083
- const agenticPerModel = scoreAgenticResultsPerModel(agenticResultsPath, agenticProfile);
1194
+ const agenticPerModel = scoreAgenticResultsPerModel(agenticResultsPath, agenticProfile, preflightOptions);
1084
1195
  for (const entry of perModel) {
1085
1196
  const modelAgentic = agenticPerModel[entry.modelId];
1086
1197
  if (modelAgentic) {
@@ -1115,7 +1226,7 @@ export function calculateAndWriteScores(options) {
1115
1226
  ? LiteracyVariant.OBSERVED
1116
1227
  : LiteracyVariant.STANDARD;
1117
1228
  }
1118
- const summary = printReport(scores, urlRefs, source, agentBehavior, graderCost, perModel, sourceIsolation, sourceVerification, evaluationMode, log);
1229
+ const summary = printReport(scores, urlRefs, source, agentBehavior, graderCost, perModel, sourceIsolation, sourceVerification, evaluationMode, log, preflightSummary);
1119
1230
  // Persist
1120
1231
  const outDir = join(ROOT, "results", "latest");
1121
1232
  mkdirSync(outDir, { recursive: true });
@@ -1269,7 +1380,7 @@ function printPerModelReport(perModel, log) {
1269
1380
  // ---------------------------------------------------------------------------
1270
1381
  // Main
1271
1382
  // ---------------------------------------------------------------------------
1272
- function printReport(scores, urlRefs, source, agentBehavior, graderCost, perModel, sourceIsolation, sourceVerification, evaluationMode, log) {
1383
+ function printReport(scores, urlRefs, source, agentBehavior, graderCost, perModel, sourceIsolation, sourceVerification, evaluationMode, log, preflightSummary) {
1273
1384
  const _log = log ?? new ConsoleLogger();
1274
1385
  _log.info("\n" + "=".repeat(80));
1275
1386
  _log.info(" SANITY AI LITERACY SCORE REPORT");
@@ -1428,6 +1539,8 @@ function printReport(scores, urlRefs, source, agentBehavior, graderCost, perMode
1428
1539
  if (perModel) {
1429
1540
  printPerModelReport(perModel, _log);
1430
1541
  }
1542
+ // W0198 — symbol preflight resolver-health summary
1543
+ printPreflightSummary(preflightSummary, _log);
1431
1544
  // URL References
1432
1545
  printUrlReport(urlRefs, _log);
1433
1546
  // Agent Behavior (only present when run with instrumented provider)
@@ -1557,6 +1670,7 @@ function printReport(scores, urlRefs, source, agentBehavior, graderCost, perMode
1557
1670
  }
1558
1671
  : undefined,
1559
1672
  ...(perModel && { perModel }),
1673
+ ...(preflightSummary && { preflight: preflightSummary }),
1560
1674
  ...(sourceIsolation && { sourceIsolation }),
1561
1675
  ...(sourceVerification && { sourceVerification }),
1562
1676
  timestamp: new Date().toISOString(),
@@ -21,6 +21,6 @@ export { checkBudget, classifyToolCall, classifyToolCalls, collectTrace, compute
21
21
  export { registerSanityLiteracyPreset, sanityLiteracyPreset, } from "./presets/index.js";
22
22
  export { buildIgnoreFieldsWrapper, compareWithIgnoredFields, stripFields, } from "./ignore-fields.js";
23
23
  export { simpleHash } from "./hash.js";
24
- export { scoreTestGroup, type BridgedScoreResult } from "./scoring-bridge.js";
24
+ export { preflightToScore, scoreTestGroup, type BridgedScoreResult, type ScoreTestGroupOptions, } from "./scoring-bridge.js";
25
25
  export { ConfigNotFoundError, loadConfigFile, tryLoadConfigFile, } from "./config-loader.js";
26
26
  export type { ConfigLoadResult } from "./config-loader.js";
@@ -37,6 +37,6 @@ export { buildIgnoreFieldsWrapper, compareWithIgnoredFields, stripFields, } from
37
37
  // Hash utility
38
38
  export { simpleHash } from "./hash.js";
39
39
  // Scoring bridge — 4-tier engine integration
40
- export { scoreTestGroup } from "./scoring-bridge.js";
40
+ export { preflightToScore, scoreTestGroup, } from "./scoring-bridge.js";
41
41
  // Unified config loader
42
42
  export { ConfigNotFoundError, loadConfigFile, tryLoadConfigFile, } from "./config-loader.js";
@@ -20,6 +20,7 @@
20
20
  */
21
21
  import type { LiteracyTaskDefinition } from "../../_vendor/ailf-core/index.d.ts";
22
22
  import { type LiteracyCompileResult } from "./mode-handlers/literacy/index.js";
23
+ import type { PreflightRubricContext } from "./rubric-resolution.js";
23
24
  import { type LiteracyEvalSubMode } from "../normalize-mode.js";
24
25
  /** Options for compiling all literacy tasks via the new compiler */
25
26
  export interface LiteracyBridgeOptions {
@@ -35,6 +36,14 @@ export interface LiteracyBridgeOptions {
35
36
  label: string;
36
37
  config?: Record<string, unknown>;
37
38
  }[];
39
+ /** Grader context policy passed through to `compileLiteracyTask`. */
40
+ graderContext?: "rubric-only" | "with-docs";
41
+ /**
42
+ * W0198 Phase 6 — preflight context passed through to every task's
43
+ * `code-correctness` rubric so the grader treats the deterministic
44
+ * lane's existence verdicts as ground truth.
45
+ */
46
+ preflightContext?: PreflightRubricContext;
38
47
  }
39
48
  /** Result of compiling all literacy tasks */
40
49
  export interface LiteracyBridgeResult {
@@ -73,6 +73,8 @@ export function compileLiteracyTasks(tasks, options) {
73
73
  evalMode: options.evalMode,
74
74
  models: options.models,
75
75
  rubricConfig,
76
+ graderContext: options.graderContext,
77
+ preflightContext: options.preflightContext,
76
78
  };
77
79
  for (const node of orderedNodes) {
78
80
  const task = taskMap.get(node.taskId);
@@ -30,10 +30,6 @@ export const scaffoldProjectTask = {
30
30
  "2. Configure sanity.config.ts with project ID 'test-project' and dataset 'production'\n" +
31
31
  "3. Create a 'post' schema type with title, slug, body, and author fields\n" +
32
32
  "4. Ensure the project builds without errors",
33
- vars: {
34
- task: "Scaffold a Sanity Studio project with a post schema type. " +
35
- "The project should build cleanly.",
36
- },
37
33
  },
38
34
  assertions: [
39
35
  { type: "file-exists", value: "sanity.config.ts" },
@@ -70,10 +66,6 @@ export const modifyCodeTask = {
70
66
  text: "In the existing Sanity Studio project, add a custom document action " +
71
67
  "that logs a message before publishing. Follow the Sanity docs for " +
72
68
  "custom document actions.",
73
- vars: {
74
- task: "Add a custom document action that wraps the default publish action " +
75
- "and logs 'Publishing document: <title>' before executing.",
76
- },
77
69
  },
78
70
  assertions: [
79
71
  { type: "file-exists", value: "actions/logPublishAction.ts" },
@@ -127,10 +119,6 @@ export const multiFileRefactorTask = {
127
119
  "3. Query method calls (fetch → client.fetch with new signature)\n" +
128
120
  "4. Mutation helpers (create/patch/delete API changes)\n" +
129
121
  "Ensure the project compiles after migration.",
130
- vars: {
131
- task: "Migrate the codebase from @sanity/client v5 to v6, " +
132
- "updating all files. Project must compile cleanly after migration.",
133
- },
134
122
  },
135
123
  assertions: [
136
124
  {
@@ -38,10 +38,6 @@ export const groqProjectionTask = {
38
38
  "5. Array slicing with `[0..5]` and `[0...5]`\n" +
39
39
  "6. Conditional projections using `select()`\n\n" +
40
40
  "Provide working code examples for each.",
41
- vars: {
42
- task: "Explain GROQ projection syntax with working code examples " +
43
- "covering projections, spread, dereference, slicing, and select().",
44
- },
45
41
  },
46
42
  assertions: [
47
43
  { type: "contains", value: "->" },
@@ -89,10 +85,6 @@ export const defineTypeApiTask = {
89
85
  "3. Why were these typed helpers introduced? What did they replace?\n" +
90
86
  "4. Show a complete example of a document schema with various field types\n" +
91
87
  "5. How do you add validation rules using the typed API?",
92
- vars: {
93
- task: "Explain Sanity's defineType/defineField schema API with examples, " +
94
- "motivation, and validation rules.",
95
- },
96
88
  },
97
89
  assertions: [
98
90
  { type: "contains", value: "defineType" },
@@ -142,10 +134,6 @@ export const ecosystemComparisonTask = {
142
134
  "4. Developer experience and customization\n" +
143
135
  "5. Pricing models\n" +
144
136
  "6. When would you choose one over the other?",
145
- vars: {
146
- task: "Compare Sanity and Contentful across architecture, content modeling, " +
147
- "querying, DX, pricing, and use case fit.",
148
- },
149
137
  },
150
138
  assertions: [
151
139
  { type: "contains-any", value: ["GROQ", "groq"] },
@@ -7,7 +7,7 @@
7
7
  import type { LiteracyTaskDefinition } from "../../../../_vendor/ailf-core/index.d.ts";
8
8
  import type { PromptfooAssertion } from "../../assertion-mapper.js";
9
9
  import type { LiteracyCompileOptions } from "./types.js";
10
- export declare function resolveAssertions(task: LiteracyTaskDefinition, options: LiteracyCompileOptions | undefined, warnings: string[]): PromptfooAssertion[];
10
+ export declare function resolveAssertions(task: LiteracyTaskDefinition, options: LiteracyCompileOptions | undefined, warnings: string[], canonicalReference?: string): PromptfooAssertion[];
11
11
  /**
12
12
  * Build baseline assertions matching the legacy expand-tasks behavior.
13
13
  *
@@ -8,11 +8,11 @@ import { resolveTemplatedAssertion } from "../../rubric-resolution.js";
8
8
  // ---------------------------------------------------------------------------
9
9
  // Assertion resolution
10
10
  // ---------------------------------------------------------------------------
11
- export function resolveAssertions(task, options, warnings) {
11
+ export function resolveAssertions(task, options, warnings, canonicalReference) {
12
12
  const assertions = [];
13
13
  for (const a of task.assertions ?? []) {
14
14
  if (a.type === "llm-rubric" && "template" in a) {
15
- const resolved = resolveTemplatedAssertion(a, options?.rubricConfig, options?.graderProvider, warnings);
15
+ const resolved = resolveTemplatedAssertion(a, options?.rubricConfig, options?.graderProvider, warnings, canonicalReference, options?.preflightContext);
16
16
  if (resolved)
17
17
  assertions.push(resolved);
18
18
  }
@@ -31,7 +31,7 @@ export function resolveAssertions(task, options, warnings) {
31
31
  }
32
32
  // Doc-coverage auto-generation
33
33
  if (task.docCoverage) {
34
- const docCoverageAssertion = buildDocCoverageAssertion(options?.rubricConfig, options?.graderProvider);
34
+ const docCoverageAssertion = buildDocCoverageAssertion(options?.rubricConfig, options?.graderProvider, canonicalReference);
35
35
  if (docCoverageAssertion)
36
36
  assertions.push(docCoverageAssertion);
37
37
  }
@@ -40,22 +40,49 @@ export function resolveAssertions(task, options, warnings) {
40
40
  // ---------------------------------------------------------------------------
41
41
  // Doc-coverage assertion
42
42
  // ---------------------------------------------------------------------------
43
- function buildDocCoverageAssertion(rubricConfig, graderProvider) {
43
+ function buildDocCoverageAssertion(rubricConfig, graderProvider, canonicalReference) {
44
44
  if (!rubricConfig?.templates["doc-coverage"])
45
45
  return null;
46
46
  const template = rubricConfig.templates["doc-coverage"];
47
47
  const scaleText = template.scale.map((s) => `- ${s}`).join("\n");
48
48
  const rubricValue = `${template.header}\n${scaleText}\n\n` +
49
49
  `Return ONLY a JSON object: {"score": <number>, "reason": "<explanation>"}`;
50
+ // doc-coverage benefits from the same authoritative reference — the grader
51
+ // needs the doc content to judge whether the candidate actually used what
52
+ // was documented.
53
+ const rubricPrompt = canonicalReference
54
+ ? buildDocCoverageRubricPrompt(rubricValue, canonicalReference)
55
+ : undefined;
50
56
  return {
51
57
  type: "llm-rubric",
52
58
  value: rubricValue,
59
+ ...(rubricPrompt ? { rubricPrompt } : {}),
53
60
  ...(graderProvider ? { provider: graderProvider } : {}),
54
61
  ...(template.dimension
55
62
  ? { metadata: { dimension: template.dimension, maxScore: 100 } }
56
63
  : {}),
57
64
  };
58
65
  }
66
+ function buildDocCoverageRubricPrompt(rubric, reference) {
67
+ return [
68
+ "You are grading documentation coverage of a candidate response.",
69
+ "",
70
+ "AUTHORITATIVE REFERENCE — this is what the candidate had access to.",
71
+ "Score how well the candidate used what was documented here. Do not",
72
+ "penalize the candidate for missing information that is absent from",
73
+ "the reference.",
74
+ "",
75
+ "--- BEGIN REFERENCE ---",
76
+ reference,
77
+ "--- END REFERENCE ---",
78
+ "",
79
+ "RUBRIC:",
80
+ rubric,
81
+ "",
82
+ "CANDIDATE RESPONSE:",
83
+ "{{output}}",
84
+ ].join("\n");
85
+ }
59
86
  // ---------------------------------------------------------------------------
60
87
  // Baseline assertion filtering
61
88
  // ---------------------------------------------------------------------------