@sanity/ailf 0.2.0 → 0.3.1

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
Files changed (34)
  1. package/config/models.yaml +3 -2
  2. package/dist/_vendor/ailf-core/types/index.d.ts +53 -0
  3. package/dist/composition-root.js +7 -2
  4. package/dist/orchestration/pipeline-orchestrator.js +27 -2
  5. package/dist/orchestration/step-runner.js +8 -0
  6. package/dist/orchestration/steps/calculate-scores-step.js +4 -0
  7. package/dist/orchestration/steps/generate-configs-step.js +1 -0
  8. package/dist/orchestration/steps/grader-consistency-step.js +1 -0
  9. package/dist/orchestration/steps/mirror-repo-tasks-step.js +2 -1
  10. package/dist/pipeline/calculate-scores.d.ts +5 -0
  11. package/dist/pipeline/calculate-scores.js +219 -146
  12. package/dist/pipeline/coverage-audit.d.ts +2 -1
  13. package/dist/pipeline/coverage-audit.js +5 -3
  14. package/dist/pipeline/expand-tasks.d.ts +2 -1
  15. package/dist/pipeline/expand-tasks.js +33 -2
  16. package/dist/pipeline/generate-configs.d.ts +3 -1
  17. package/dist/pipeline/generate-configs.js +47 -28
  18. package/dist/pipeline/grader-api.d.ts +2 -1
  19. package/dist/pipeline/grader-api.js +11 -9
  20. package/dist/pipeline/grader-compare-runner.d.ts +3 -0
  21. package/dist/pipeline/grader-compare-runner.js +21 -19
  22. package/dist/pipeline/grader-consistency-runner.d.ts +3 -0
  23. package/dist/pipeline/grader-consistency-runner.js +16 -14
  24. package/dist/pipeline/grader-sensitivity-runner.d.ts +3 -0
  25. package/dist/pipeline/grader-sensitivity-runner.js +18 -16
  26. package/dist/pipeline/grader-validate-runner.d.ts +3 -0
  27. package/dist/pipeline/grader-validate-runner.js +16 -14
  28. package/dist/pipeline/mirror-repo-tasks.d.ts +3 -1
  29. package/dist/pipeline/mirror-repo-tasks.js +8 -6
  30. package/dist/pipeline/provenance.d.ts +3 -0
  31. package/dist/pipeline/provenance.js +25 -3
  32. package/dist/sources.d.ts +2 -1
  33. package/dist/sources.js +28 -1
  34. package/package.json +3 -3
@@ -27,6 +27,7 @@
27
27
  import { existsSync, mkdirSync, readFileSync, writeFileSync } from "fs";
28
28
  import { join } from "path";
29
29
  import { calculateCost } from "../agent-observer/pricing.js";
30
+ import { ConsoleLogger } from "../adapters/loggers/index.js";
30
31
  import { checkResultsExist } from "./checks.js";
31
32
  import { loadRubricTemplates } from "./expand-tasks.js";
32
33
  import { loadSource } from "../sources.js";
@@ -375,63 +376,63 @@ function extractGraderCost(resultsPath) {
375
376
  /**
376
377
  * Prints a formatted report of agent behavior observations.
377
378
  */
378
- function printAgentBehaviorReport(agentBehavior) {
379
- console.log("-".repeat(80));
380
- console.log("AGENT BEHAVIOR OBSERVATION");
381
- console.log("-".repeat(80));
382
- console.log();
379
+ function printAgentBehaviorReport(agentBehavior, log) {
380
+ log.info("-".repeat(80));
381
+ log.info("AGENT BEHAVIOR OBSERVATION");
382
+ log.info("-".repeat(80));
383
+ log.info("");
383
384
  // Summary table
384
385
  const h = "| Feature Area | Tests | Doc Pages | Searches | Net (ms) |";
385
386
  const sep = "|---------------------|-------|-----------|----------|----------|";
386
- console.log(h);
387
- console.log(sep);
387
+ log.info(h);
388
+ log.info(sep);
388
389
  for (const ab of agentBehavior) {
389
- console.log(`| ${ab.feature.padEnd(19)} | ` +
390
+ log.info(`| ${ab.feature.padEnd(19)} | ` +
390
391
  `${ab.tasksWithBehaviorData.toString().padStart(5)} | ` +
391
392
  `${ab.avgDocPagesVisited.toFixed(1).padStart(9)} | ` +
392
393
  `${ab.avgSearchesPerformed.toFixed(1).padStart(8)} | ` +
393
394
  `${Math.round(ab.avgNetworkTimeMs).toString().padStart(8)} |`);
394
395
  }
395
- console.log();
396
+ log.info("");
396
397
  // Doc pages visited
397
- console.log(" Doc pages visited:");
398
+ log.info(" Doc pages visited:");
398
399
  for (const ab of agentBehavior) {
399
400
  if (ab.docSlugsVisited.length === 0) {
400
- console.log(` ${ab.feature}: (none)`);
401
+ log.info(` ${ab.feature}: (none)`);
401
402
  }
402
403
  else {
403
- console.log(` ${ab.feature}:`);
404
+ log.info(` ${ab.feature}:`);
404
405
  for (const slug of ab.docSlugsVisited) {
405
- console.log(` - /docs/${slug}`);
406
+ log.info(` - /docs/${slug}`);
406
407
  }
407
408
  }
408
409
  }
409
- console.log();
410
+ log.info("");
410
411
  // Search queries
411
412
  const hasSearches = agentBehavior.some((ab) => ab.searchQueries.length > 0);
412
413
  if (hasSearches) {
413
- console.log(" Search queries:");
414
+ log.info(" Search queries:");
414
415
  for (const ab of agentBehavior) {
415
416
  if (ab.searchQueries.length === 0) {
416
417
  continue;
417
418
  }
418
- console.log(` ${ab.feature}:`);
419
+ log.info(` ${ab.feature}:`);
419
420
  for (const q of ab.searchQueries) {
420
- console.log(` - "${q}"`);
421
+ log.info(` - "${q}"`);
421
422
  }
422
423
  }
423
- console.log();
424
+ log.info("");
424
425
  }
425
426
  // External domains
426
427
  const allExternalDomains = [
427
428
  ...new Set(agentBehavior.flatMap((ab) => ab.externalDomains)),
428
429
  ];
429
430
  if (allExternalDomains.length > 0) {
430
- console.log(" External domains contacted:");
431
+ log.info(" External domains contacted:");
431
432
  for (const d of allExternalDomains) {
432
- console.log(` - ${d}`);
433
+ log.info(` - ${d}`);
433
434
  }
434
- console.log();
435
+ log.info("");
435
436
  }
436
437
  }
437
438
  // ---------------------------------------------------------------------------
@@ -444,9 +445,15 @@ function printAgentBehaviorReport(agentBehavior) {
444
445
  * Reads the raw Promptfoo output file and normalizes each result so that
445
446
  * `description` is always a top-level field (pulled from `testCase` if needed).
446
447
  */
447
- function readAndNormalizeResults(resultsPath) {
448
+ function readAndNormalizeResults(resultsPath, log) {
449
+ const _log = log ?? new ConsoleLogger();
448
450
  const file = JSON.parse(readFileSync(resultsPath, "utf-8"));
449
451
  const wrapper = file.results ?? file;
452
+ _log.debug("Reading results file", {
453
+ path: resultsPath,
454
+ resultCount: wrapper.results.length,
455
+ stats: wrapper.stats,
456
+ });
450
457
  const all = wrapper.results.map((r) => ({
451
458
  cost: r.cost ?? 0,
452
459
  description: r.testCase?.description ?? "unknown",
@@ -463,15 +470,20 @@ function readAndNormalizeResults(resultsPath) {
463
470
  // Promptfoo sets gradingResult to null when a test errors before grading.
464
471
  const valid = all.filter((r) => r.gradingResult !== null);
465
472
  const skipped = all.length - valid.length;
473
+ _log.debug("Filtered null gradingResults", {
474
+ totalResults: all.length,
475
+ validResults: valid.length,
476
+ skippedCount: skipped,
477
+ });
466
478
  if (skipped > 0) {
467
- console.warn(` ⚠ Skipping ${skipped} of ${all.length} result(s) with null gradingResult (errored tests):`);
479
+ _log.warn(`⚠ Skipping ${skipped} of ${all.length} result(s) with null gradingResult (errored tests):`);
468
480
  for (const r of all) {
469
481
  if (r.gradingResult === null) {
470
482
  const providerLabel = r.provider ? `[${r.provider}] ` : "";
471
483
  const errorMsg = r.error
472
484
  ? r.error.slice(0, 150)
473
485
  : "unknown error (no error field in result)";
474
- console.warn(` ✗ ${providerLabel}"${r.description}" — ${errorMsg}`);
486
+ _log.warn(`✗ ${providerLabel}"${r.description}" — ${errorMsg}`);
475
487
  }
476
488
  }
477
489
  }
@@ -555,7 +567,7 @@ function scoreResults(results, weights, modelId) {
555
567
  const ceilingScore = Math.round(withDocsTotal);
556
568
  const floorScore = Math.round(withoutDocsScore);
557
569
  const docLift = ceilingScore - floorScore;
558
- scores.push({
570
+ const featureScore = {
559
571
  ceilingScore,
560
572
  codeCorrectness: Math.round(avgCode),
561
573
  docCoverage: Math.round(avgDoc),
@@ -569,7 +581,8 @@ function scoreResults(results, weights, modelId) {
569
581
  testCount: data.withDocs.length,
570
582
  totalCost: featureCost,
571
583
  totalScore: ceilingScore,
572
- });
584
+ };
585
+ scores.push(featureScore);
573
586
  }
574
587
  return scores.sort((a, b) => a.feature.localeCompare(b.feature));
575
588
  }
@@ -654,6 +667,7 @@ export function scoreAgenticResults(resultsPath, weights) {
654
667
  const CRITICAL_THRESHOLD = 40;
655
668
  export function calculateAndWriteScores(options) {
656
669
  const ROOT = options.rootDir;
670
+ const log = options.logger ?? new ConsoleLogger();
657
671
  const sourceName = options.source;
658
672
  // Pre-resolved source wins over name-based lookup
659
673
  let source = options.resolvedSource;
@@ -662,7 +676,7 @@ export function calculateAndWriteScores(options) {
662
676
  source = loadSource(sourceName);
663
677
  }
664
678
  catch {
665
- console.warn(` [warn] Could not load source "${sourceName}", proceeding without source metadata`);
679
+ log.warn(`[warn] Could not load source "${sourceName}", proceeding without source metadata`);
666
680
  }
667
681
  }
668
682
  // Determine mode — controls which result files are read
@@ -679,13 +693,23 @@ export function calculateAndWriteScores(options) {
679
693
  .join("; ");
680
694
  throw new Error(`Results validation failed: ${details}. Run 'pnpm eval' first to generate results.`);
681
695
  }
682
- console.log(`Reading results from: ${baselineResultsPath}`);
696
+ log.info(`Reading results from: ${baselineResultsPath}`);
683
697
  if (source) {
684
- console.log(`Source: ${sourceName} (${source.baseUrl})`);
698
+ log.info(`Source: ${sourceName} (${source.baseUrl})`);
685
699
  }
686
700
  // Load dimension weights from rubrics.yaml
687
701
  const rubricConfig = loadRubricTemplates(ROOT);
702
+ log.debug("Loaded rubric weights", { weights: rubricConfig.weights });
688
703
  const baselineScores = calculateScores(baselineResultsPath, rubricConfig.weights);
704
+ log.debug("Baseline scores calculated", {
705
+ featureCount: baselineScores.length,
706
+ features: baselineScores.map((s) => ({
707
+ feature: s.feature,
708
+ ceilingScore: s.ceilingScore,
709
+ floorScore: s.floorScore,
710
+ docLift: s.docLift,
711
+ })),
712
+ });
689
713
  const perModel = calculateScoresPerModel(baselineResultsPath, rubricConfig.weights);
690
714
  const urlRefs = aggregateUrlReferences(baselineResultsPath);
691
715
  const sourceVerification = buildSourceVerification(ROOT, source, {
@@ -700,8 +724,16 @@ export function calculateAndWriteScores(options) {
700
724
  let sourceIsolation = null;
701
725
  let evaluationMode;
702
726
  if (mode === "full" && existsSync(agenticResultsPath)) {
703
- console.log(`\nReading agentic results from: ${agenticResultsPath}`);
727
+ log.info(`\nReading agentic results from: ${agenticResultsPath}`);
704
728
  const agenticScores = scoreAgenticResults(agenticResultsPath, rubricConfig.weights);
729
+ log.debug("Agentic scores calculated", {
730
+ featureCount: Object.keys(agenticScores).length,
731
+ features: Object.entries(agenticScores).map(([f, s]) => ({
732
+ feature: f,
733
+ actualScore: s.actualScore,
734
+ testCount: s.testCount,
735
+ })),
736
+ });
705
737
  scores = mergeScores(baselineScores, agenticScores);
706
738
  evaluationMode = "full";
707
739
  // Aggregate agent behavior and source isolation from agentic results
@@ -728,12 +760,12 @@ export function calculateAndWriteScores(options) {
728
760
  sourceIsolation = aggregateSourceIsolation(baselineResultsPath, options?.allowedOrigins);
729
761
  evaluationMode = mode === "observed" ? "observed" : "baseline";
730
762
  }
731
- const summary = printReport(scores, urlRefs, source, agentBehavior, graderCost, perModel, sourceIsolation, sourceVerification, evaluationMode);
763
+ const summary = printReport(scores, urlRefs, source, agentBehavior, graderCost, perModel, sourceIsolation, sourceVerification, evaluationMode, log);
732
764
  // Persist
733
765
  const outDir = join(ROOT, "results", "latest");
734
766
  mkdirSync(outDir, { recursive: true });
735
767
  writeFileSync(join(outDir, "score-summary.json"), JSON.stringify(summary, null, 2));
736
- console.log("Score summary written to results/latest/score-summary.json");
768
+ log.info("Score summary written to results/latest/score-summary.json");
737
769
  // Extract and persist grader judgments (Phase 3a: failure mode extraction)
738
770
  const judgments = extractGraderJudgments(baselineResultsPath);
739
771
  // In full mode, also extract judgments from agentic results
@@ -743,20 +775,60 @@ export function calculateAndWriteScores(options) {
743
775
  }
744
776
  if (judgments.length > 0) {
745
777
  writeFileSync(join(outDir, "grader-judgments.json"), JSON.stringify(judgments, null, 2));
746
- console.log(`Grader judgments written to results/latest/grader-judgments.json (${judgments.length} judgments)`);
778
+ log.info(`Grader judgments written to results/latest/grader-judgments.json (${judgments.length} judgments)`);
747
779
  }
748
- return { belowCritical: summary.belowCritical };
780
+ // Compute test summary from the raw results file
781
+ const testSummary = computeTestSummary(baselineResultsPath);
782
+ return { belowCritical: summary.belowCritical, testSummary };
783
+ }
784
+ /**
785
+ * Compute a TestSummary from a raw Promptfoo results file.
786
+ * Counts total, passed, failed, and errored tests with error details.
787
+ */
788
+ function computeTestSummary(resultsPath) {
789
+ const file = JSON.parse(readFileSync(resultsPath, "utf-8"));
790
+ const wrapper = file.results ?? file;
791
+ const rawResults = wrapper.results;
792
+ let passed = 0;
793
+ let failed = 0;
794
+ let errored = 0;
795
+ const errors = [];
796
+ for (const r of rawResults) {
797
+ if (r.gradingResult === null || r.gradingResult === undefined) {
798
+ errored++;
799
+ errors.push({
800
+ model: r.provider?.label ?? r.provider?.id ?? "unknown",
801
+ task: r.testCase?.description ?? "unknown",
802
+ error: r.error
803
+ ? r.error.slice(0, 200)
804
+ : "unknown error (null gradingResult)",
805
+ });
806
+ }
807
+ else if (r.gradingResult.pass) {
808
+ passed++;
809
+ }
810
+ else {
811
+ failed++;
812
+ }
813
+ }
814
+ return {
815
+ total: rawResults.length,
816
+ passed,
817
+ failed,
818
+ errored,
819
+ ...(errors.length > 0 ? { errors } : {}),
820
+ };
749
821
  }
750
- function printPerModelReport(perModel) {
751
- console.log("-".repeat(80));
752
- console.log("PER-MODEL BREAKDOWN");
753
- console.log("-".repeat(80));
754
- console.log();
822
+ function printPerModelReport(perModel, log) {
823
+ log.info("-".repeat(80));
824
+ log.info("PER-MODEL BREAKDOWN");
825
+ log.info("-".repeat(80));
826
+ log.info("");
755
827
  // Model summary table
756
828
  const h = "| Model | Avg Score | Avg Lift | Tests | Cost |";
757
829
  const sep = "|--------------------------------|-----------|----------|-------|----------|";
758
- console.log(h);
759
- console.log(sep);
830
+ log.info(h);
831
+ log.info(sep);
760
832
  const sorted = [...perModel].sort((a, b) => b.overall.avgScore - a.overall.avgScore);
761
833
  for (const entry of sorted) {
762
834
  const displayName = entry.label || entry.modelId;
@@ -766,63 +838,64 @@ function printPerModelReport(perModel) {
766
838
  const liftStr = entry.overall.avgDocLift >= 0
767
839
  ? `+${entry.overall.avgDocLift.toFixed(1)}`
768
840
  : entry.overall.avgDocLift.toFixed(1);
769
- console.log(`| ${displayName.padEnd(30)} | ` +
841
+ log.info(`| ${displayName.padEnd(30)} | ` +
770
842
  `${entry.overall.avgScore.toFixed(1).padStart(9)} | ` +
771
843
  `${liftStr.padStart(8)} | ` +
772
844
  `${entry.overall.testCount.toString().padStart(5)} | ` +
773
845
  `${costStr.padStart(8)} |`);
774
846
  }
775
- console.log();
847
+ log.info("");
776
848
  // Per-model × per-area breakdown
777
849
  for (const entry of sorted) {
778
850
  const displayName = entry.label || entry.modelId;
779
- console.log(` ${displayName} (${entry.modelId}):`);
851
+ log.info(` ${displayName} (${entry.modelId}):`);
780
852
  const areaH = " | Feature Area | Task | Code | Docs | Total | Lift |";
781
853
  const areaSep = " |---------------------|------|------|------|-------|------|";
782
- console.log(areaH);
783
- console.log(areaSep);
854
+ log.info(areaH);
855
+ log.info(areaSep);
784
856
  for (const s of entry.scores) {
785
857
  const lift = s.docLift >= 0 ? `+${s.docLift}` : `${s.docLift}`;
786
- console.log(` | ${s.feature.padEnd(19)} | ` +
858
+ log.info(` | ${s.feature.padEnd(19)} | ` +
787
859
  `${s.taskCompletion.toString().padStart(4)} | ` +
788
860
  `${s.codeCorrectness.toString().padStart(4)} | ` +
789
861
  `${s.docCoverage.toString().padStart(4)} | ` +
790
862
  `${s.totalScore.toString().padStart(5)} | ` +
791
863
  `${lift.padStart(4)} |`);
792
864
  }
793
- console.log();
865
+ log.info("");
794
866
  }
795
867
  // Cost-per-quality-point
796
868
  const modelsWithCost = sorted.filter((e) => e.overall.cost && e.overall.cost > 0);
797
869
  if (modelsWithCost.length > 0) {
798
- console.log(" Cost per quality point:");
870
+ log.info(" Cost per quality point:");
799
871
  for (const entry of modelsWithCost) {
800
872
  const displayName = entry.label;
801
873
  const costPerPoint = entry.overall.avgScore > 0
802
874
  ? (entry.overall.cost ?? 0) / entry.overall.avgScore
803
875
  : 0;
804
- console.log(` ${displayName}: $${costPerPoint.toFixed(6)}/point (score: ${entry.overall.avgScore.toFixed(1)}, cost: $${(entry.overall.cost ?? 0).toFixed(4)})`);
876
+ log.info(` ${displayName}: $${costPerPoint.toFixed(6)}/point (score: ${entry.overall.avgScore.toFixed(1)}, cost: $${(entry.overall.cost ?? 0).toFixed(4)})`);
805
877
  }
806
- console.log();
878
+ log.info("");
807
879
  }
808
880
  }
809
881
  // ---------------------------------------------------------------------------
810
882
  // Main
811
883
  // ---------------------------------------------------------------------------
812
- function printReport(scores, urlRefs, source, agentBehavior, graderCost, perModel, sourceIsolation, sourceVerification, evaluationMode) {
813
- console.log("\n" + "=".repeat(80));
814
- console.log(" SANITY AI LITERACY SCORE REPORT");
815
- console.log("=".repeat(80));
816
- console.log();
884
+ function printReport(scores, urlRefs, source, agentBehavior, graderCost, perModel, sourceIsolation, sourceVerification, evaluationMode, log) {
885
+ const _log = log ?? new ConsoleLogger();
886
+ _log.info("\n" + "=".repeat(80));
887
+ _log.info(" SANITY AI LITERACY SCORE REPORT");
888
+ _log.info("=".repeat(80));
889
+ _log.info("");
817
890
  // Table header
818
891
  const h = "| Feature Area | Task | Code | Docs | Total | w/o Docs | Doc Lift |";
819
892
  const sep = "|---------------------|------|------|------|-------|----------|----------|";
820
- console.log(h);
821
- console.log(sep);
893
+ _log.info(h);
894
+ _log.info(sep);
822
895
  for (const s of scores) {
823
896
  const status = s.totalScore < CRITICAL_THRESHOLD ? "!!" : "ok";
824
897
  const lift = s.docLift > 0 ? `+${s.docLift}` : `${s.docLift}`;
825
- console.log(`| ${status} ${s.feature.padEnd(17)} | ` +
898
+ _log.info(`| ${status} ${s.feature.padEnd(17)} | ` +
826
899
  `${s.taskCompletion.toString().padStart(4)} | ` +
827
900
  `${s.codeCorrectness.toString().padStart(4)} | ` +
828
901
  `${s.docCoverage.toString().padStart(4)} | ` +
@@ -830,7 +903,7 @@ function printReport(scores, urlRefs, source, agentBehavior, graderCost, perMode
830
903
  `${s.floorScore.toString().padStart(8)} | ` +
831
904
  `${lift.padStart(8)} |`);
832
905
  }
833
- console.log();
906
+ _log.info("");
834
907
  // OKR status
835
908
  const belowCritical = scores.filter((s) => s.totalScore < CRITICAL_THRESHOLD);
836
909
  const lowestScore = scores.reduce((min, s) => s.totalScore < min.totalScore ? s : min);
@@ -844,69 +917,69 @@ function printReport(scores, urlRefs, source, agentBehavior, graderCost, perMode
844
917
  area: s.feature,
845
918
  docLift: s.docLift,
846
919
  }));
847
- console.log("-".repeat(80));
848
- console.log("OKR STATUS");
849
- console.log("-".repeat(80));
850
- console.log();
920
+ _log.info("-".repeat(80));
921
+ _log.info("OKR STATUS");
922
+ _log.info("-".repeat(80));
923
+ _log.info("");
851
924
  if (belowCritical.length === 0) {
852
- console.log(" KR1: PASS -- All areas above critical threshold (>=40)");
925
+ _log.info(" KR1: PASS -- All areas above critical threshold (>=40)");
853
926
  }
854
927
  else {
855
- console.log(" KR1: FAIL -- Areas below critical threshold:");
856
- belowCritical.forEach((s) => console.log(` - ${s.feature}: ${s.totalScore}`));
928
+ _log.info(" KR1: FAIL -- Areas below critical threshold:");
929
+ belowCritical.forEach((s) => _log.info(` - ${s.feature}: ${s.totalScore}`));
857
930
  }
858
- console.log();
859
- console.log(` Lowest area: ${lowestScore.feature} (${lowestScore.totalScore})`);
860
- console.log(` Target: +15 points improvement`);
861
- console.log();
862
- console.log(` Avg score: ${avgScore.toFixed(1)}`);
863
- console.log(` Avg doc lift: +${avgLift.toFixed(1)} points`);
864
- console.log(` (Doc lift = how much docs help vs parametric knowledge alone)`);
865
- console.log();
931
+ _log.info("");
932
+ _log.info(` Lowest area: ${lowestScore.feature} (${lowestScore.totalScore})`);
933
+ _log.info(` Target: +15 points improvement`);
934
+ _log.info("");
935
+ _log.info(` Avg score: ${avgScore.toFixed(1)}`);
936
+ _log.info(` Avg doc lift: +${avgLift.toFixed(1)} points`);
937
+ _log.info(` (Doc lift = how much docs help vs parametric knowledge alone)`);
938
+ _log.info("");
866
939
  // Ceiling decomposition
867
- console.log("-".repeat(80));
868
- console.log("CEILING DECOMPOSITION");
869
- console.log("-".repeat(80));
870
- console.log();
940
+ _log.info("-".repeat(80));
941
+ _log.info("CEILING DECOMPOSITION");
942
+ _log.info("-".repeat(80));
943
+ _log.info("");
871
944
  const ceilH = "| Feature Area | Floor | Ceiling | Doc Lift | Quality Gap |";
872
945
  const ceilSep = "|---------------------|-------|---------|----------|-------------|";
873
- console.log(ceilH);
874
- console.log(ceilSep);
946
+ _log.info(ceilH);
947
+ _log.info(ceilSep);
875
948
  for (const s of scores) {
876
949
  const liftStr = s.docLift >= 0 ? `+${s.docLift}` : `${s.docLift}`;
877
950
  const liftFlag = s.negativeDocLift ? " 🚨" : "";
878
- console.log(`| ${s.feature.padEnd(19)} | ` +
951
+ _log.info(`| ${s.feature.padEnd(19)} | ` +
879
952
  `${s.floorScore.toString().padStart(5)} | ` +
880
953
  `${s.ceilingScore.toString().padStart(7)} | ` +
881
954
  `${liftStr.padStart(8)}${liftFlag} | ` +
882
955
  `${s.docQualityGap.toString().padStart(11)} |`);
883
956
  }
884
- console.log();
957
+ _log.info("");
885
958
  if (negativeDocLiftAreas.length > 0) {
886
- console.log(" 🚨 NEGATIVE DOC LIFT DETECTED:");
959
+ _log.info(" 🚨 NEGATIVE DOC LIFT DETECTED:");
887
960
  for (const { area, docLift } of negativeDocLiftAreas) {
888
961
  const s = scores.find((sc) => sc.feature === area);
889
- console.log(` ${area}: Doc Lift = ${docLift} (floor: ${s.floorScore}, ceiling: ${s.ceilingScore})`);
962
+ _log.info(` ${area}: Doc Lift = ${docLift} (floor: ${s.floorScore}, ceiling: ${s.ceilingScore})`);
890
963
  }
891
- console.log(" Documentation is HURTING model performance for these areas.");
892
- console.log(" See docs/design-docs/scenario-matrix/evaluation-ceiling.md");
893
- console.log();
964
+ _log.info(" Documentation is HURTING model performance for these areas.");
965
+ _log.info(" See docs/design-docs/scenario-matrix/evaluation-ceiling.md");
966
+ _log.info("");
894
967
  }
895
968
  else {
896
- console.log(" ✅ No areas with negative Doc Lift detected.");
897
- console.log();
969
+ _log.info(" ✅ No areas with negative Doc Lift detected.");
970
+ _log.info("");
898
971
  }
899
972
  // Three-layer decomposition (only when actual scores are present)
900
973
  const hasActualScores = scores.some((s) => s.actualScore !== undefined);
901
974
  if (hasActualScores) {
902
- console.log("-".repeat(80));
903
- console.log("THREE-LAYER DECOMPOSITION (floor → ceiling → actual)");
904
- console.log("-".repeat(80));
905
- console.log();
975
+ _log.info("-".repeat(80));
976
+ _log.info("THREE-LAYER DECOMPOSITION (floor → ceiling → actual)");
977
+ _log.info("-".repeat(80));
978
+ _log.info("");
906
979
  const decompH = "| Feature Area | Floor | Ceiling | Actual | Doc Lift | Ret. Gap | Infra % |";
907
980
  const decompSep = "|---------------------|-------|---------|--------|----------|----------|---------|";
908
- console.log(decompH);
909
- console.log(decompSep);
981
+ _log.info(decompH);
982
+ _log.info(decompSep);
910
983
  for (const s of scores) {
911
984
  const liftStr = s.docLift >= 0 ? `+${s.docLift}` : `${s.docLift}`;
912
985
  const actualStr = s.actualScore !== undefined ? s.actualScore.toString() : "—";
@@ -919,7 +992,7 @@ function printReport(scores, urlRefs, source, agentBehavior, graderCost, perMode
919
992
  ? `${Math.round(s.infrastructureEfficiency * 100)}%`
920
993
  : "—";
921
994
  const flag = s.invertedRetrievalGap ? " 🔄" : "";
922
- console.log(`| ${s.feature.padEnd(19)} | ` +
995
+ _log.info(`| ${s.feature.padEnd(19)} | ` +
923
996
  `${s.floorScore.toString().padStart(5)} | ` +
924
997
  `${s.ceilingScore.toString().padStart(7)} | ` +
925
998
  `${actualStr.padStart(6)} | ` +
@@ -927,10 +1000,10 @@ function printReport(scores, urlRefs, source, agentBehavior, graderCost, perMode
927
1000
  `${(gapStr + flag).padStart(8)} | ` +
928
1001
  `${infraStr.padStart(7)} |`);
929
1002
  }
930
- console.log();
931
- console.log(" Doc Lift = ceiling − floor | Ret. Gap = ceiling − actual | Infra = actual / ceiling");
932
- console.log(" 🔄 = inverted retrieval gap (agents avoid bad docs → higher actual than ceiling)");
933
- console.log();
1003
+ _log.info("");
1004
+ _log.info(" Doc Lift = ceiling − floor | Ret. Gap = ceiling − actual | Infra = actual / ceiling");
1005
+ _log.info(" 🔄 = inverted retrieval gap (agents avoid bad docs → higher actual than ceiling)");
1006
+ _log.info("");
934
1007
  }
935
1008
  // Cost summary
936
1009
  const totalCost = scores.reduce((sum, s) => sum + s.totalCost, 0);
@@ -938,66 +1011,66 @@ function printReport(scores, urlRefs, source, agentBehavior, graderCost, perMode
938
1011
  const graderCostTotal = graderCost?.cost ?? 0;
939
1012
  const combinedCost = totalCost + graderCostTotal;
940
1013
  if (totalCost > 0 || graderCostTotal > 0) {
941
- console.log("-".repeat(80));
942
- console.log("COST SUMMARY");
943
- console.log("-".repeat(80));
944
- console.log();
945
- console.log(` Provider cost: $${totalCost.toFixed(4)}`);
1014
+ _log.info("-".repeat(80));
1015
+ _log.info("COST SUMMARY");
1016
+ _log.info("-".repeat(80));
1017
+ _log.info("");
1018
+ _log.info(` Provider cost: $${totalCost.toFixed(4)}`);
946
1019
  if (graderCostTotal > 0) {
947
1020
  const graderLabel = graderCost?.model ?? "unknown";
948
- console.log(` Grader cost: $${graderCostTotal.toFixed(4)} (${graderLabel}, ${(graderCost?.totalTokens ?? 0).toLocaleString()} tokens)`);
1021
+ _log.info(` Grader cost: $${graderCostTotal.toFixed(4)} (${graderLabel}, ${(graderCost?.totalTokens ?? 0).toLocaleString()} tokens)`);
949
1022
  }
950
- console.log(` Total cost: $${combinedCost.toFixed(4)}`);
951
- console.log(` Avg cost per test: $${(combinedCost / (totalTests || 1)).toFixed(4)}`);
952
- console.log();
1023
+ _log.info(` Total cost: $${combinedCost.toFixed(4)}`);
1024
+ _log.info(` Avg cost per test: $${(combinedCost / (totalTests || 1)).toFixed(4)}`);
1025
+ _log.info("");
953
1026
  const costHeader = "| Feature Area | Tests | Cost | Avg/Test |";
954
1027
  const costSep = "|---------------------|-------|----------|----------|";
955
- console.log(costHeader);
956
- console.log(costSep);
1028
+ _log.info(costHeader);
1029
+ _log.info(costSep);
957
1030
  for (const s of scores) {
958
1031
  const avgCost = s.testCount > 0 ? s.totalCost / s.testCount : 0;
959
- console.log(`| ${s.feature.padEnd(19)} | ` +
1032
+ _log.info(`| ${s.feature.padEnd(19)} | ` +
960
1033
  `${s.testCount.toString().padStart(5)} | ` +
961
1034
  `$${s.totalCost.toFixed(4).padStart(7)} | ` +
962
1035
  `$${avgCost.toFixed(4).padStart(7)} |`);
963
1036
  }
964
- console.log();
1037
+ _log.info("");
965
1038
  }
966
1039
  // Per-model breakdown
967
1040
  if (perModel) {
968
- printPerModelReport(perModel);
1041
+ printPerModelReport(perModel, _log);
969
1042
  }
970
1043
  // URL References
971
- printUrlReport(urlRefs);
1044
+ printUrlReport(urlRefs, _log);
972
1045
  // Agent Behavior (only present when run with instrumented provider)
973
1046
  if (agentBehavior && agentBehavior.length > 0) {
974
- printAgentBehaviorReport(agentBehavior);
1047
+ printAgentBehaviorReport(agentBehavior, _log);
975
1048
  }
976
1049
  // Source verification (unified report for all modes)
977
1050
  if (sourceVerification || sourceIsolation) {
978
- console.log("-".repeat(80));
979
- console.log("📋 SOURCE VERIFICATION");
980
- console.log("-".repeat(80));
1051
+ _log.info("-".repeat(80));
1052
+ _log.info("📋 SOURCE VERIFICATION");
1053
+ _log.info("-".repeat(80));
981
1054
  if (sourceVerification) {
982
- console.log(` Source: ${sourceVerification.source}`);
983
- console.log(` Mode: ${sourceVerification.mode}`);
1055
+ _log.info(` Source: ${sourceVerification.source}`);
1056
+ _log.info(` Mode: ${sourceVerification.mode}`);
984
1057
  if (sourceVerification.allowedOrigins) {
985
- console.log(` Sandbox: ${sourceVerification.allowedOrigins.join(", ")}`);
1058
+ _log.info(` Sandbox: ${sourceVerification.allowedOrigins.join(", ")}`);
986
1059
  }
987
1060
  if (sourceVerification.searchMode) {
988
- console.log(` Search: ${sourceVerification.searchMode}`);
1061
+ _log.info(` Search: ${sourceVerification.searchMode}`);
989
1062
  }
990
1063
  // URL fetch results (baseline mode with direct URLs)
991
1064
  if (sourceVerification.urlFetch) {
992
1065
  const uf = sourceVerification.urlFetch;
993
- console.log();
994
- console.log(` URL fetch: ${uf.totalFetched} fetched, ${uf.totalFailed} failed`);
1066
+ _log.info("");
1067
+ _log.info(` URL fetch: ${uf.totalFetched} fetched, ${uf.totalFailed} failed`);
995
1068
  for (const f of uf.fetchedUrls) {
996
- console.log(` ✅ ${f.url} (via ${f.method})`);
1069
+ _log.info(` ✅ ${f.url} (via ${f.method})`);
997
1070
  }
998
1071
  for (const f of uf.failures) {
999
1072
  // oxlint-disable-next-line @typescript-eslint/prefer-nullish-coalescing -- empty string means no error info
1000
- console.log(` ⚠️ ${f.url}: ${f.error || "unknown error"}`);
1073
+ _log.info(` ⚠️ ${f.url}: ${f.error || "unknown error"}`);
1001
1074
  }
1002
1075
  }
1003
1076
  }
@@ -1005,22 +1078,22 @@ function printReport(scores, urlRefs, source, agentBehavior, graderCost, perMode
1005
1078
  if (sourceIsolation) {
1006
1079
  const pct = Math.round(sourceIsolation.isolationScore * 100);
1007
1080
  const icon = sourceIsolation.offOrigin === 0 ? "✅" : "⚠️";
1008
- console.log();
1009
- console.log(` Agent isolation: ${icon} ${pct}% (${sourceIsolation.onOrigin}/${sourceIsolation.total} on-origin)`);
1081
+ _log.info("");
1082
+ _log.info(` Agent isolation: ${icon} ${pct}% (${sourceIsolation.onOrigin}/${sourceIsolation.total} on-origin)`);
1010
1083
  if (sourceIsolation.offOrigin > 0) {
1011
- console.log(` Off-origin fetches: ${sourceIsolation.offOrigin}`);
1084
+ _log.info(` Off-origin fetches: ${sourceIsolation.offOrigin}`);
1012
1085
  for (const url of sourceIsolation.offOriginUrls.slice(0, 10)) {
1013
- console.log(` • ${url}`);
1086
+ _log.info(` • ${url}`);
1014
1087
  }
1015
1088
  }
1016
1089
  if (Object.keys(sourceIsolation.originBreakdown).length > 0) {
1017
- console.log(" Origin breakdown:");
1090
+ _log.info(" Origin breakdown:");
1018
1091
  for (const [origin, count] of Object.entries(sourceIsolation.originBreakdown).sort((a, b) => b[1] - a[1])) {
1019
- console.log(` ${origin}: ${count}`);
1092
+ _log.info(` ${origin}: ${count}`);
1020
1093
  }
1021
1094
  }
1022
1095
  }
1023
- console.log();
1096
+ _log.info("");
1024
1097
  }
1025
1098
  // Build overall agent behavior stats for summary
1026
1099
  const overallAgentBehavior = agentBehavior && agentBehavior.length > 0
@@ -1102,31 +1175,31 @@ function printReport(scores, urlRefs, source, agentBehavior, graderCost, perMode
1102
1175
  urlReferences: urlRefs,
1103
1176
  };
1104
1177
  }
1105
- function printUrlReport(urlRefs) {
1106
- console.log("-".repeat(80));
1107
- console.log("URL REFERENCES");
1108
- console.log("-".repeat(80));
1109
- console.log();
1178
+ function printUrlReport(urlRefs, log) {
1179
+ log.info("-".repeat(80));
1180
+ log.info("URL REFERENCES");
1181
+ log.info("-".repeat(80));
1182
+ log.info("");
1110
1183
  for (const ref of urlRefs) {
1111
1184
  const goldUrls = Object.entries(ref.gold.urls).sort((a, b) => b[1] - a[1]);
1112
1185
  const baselineUrls = Object.entries(ref.baseline.urls).sort((a, b) => b[1] - a[1]);
1113
1186
  if (goldUrls.length > 0) {
1114
- console.log(` ${ref.feature} (gold):`);
1187
+ log.info(` ${ref.feature} (gold):`);
1115
1188
  for (const [url, count] of goldUrls) {
1116
1189
  const suffix = count > 1 ? ` (${count} tests)` : "";
1117
- console.log(` ${url}${suffix}`);
1190
+ log.info(` ${url}${suffix}`);
1118
1191
  }
1119
1192
  }
1120
1193
  if (baselineUrls.length > 0) {
1121
- console.log(` ${ref.feature} (baseline):`);
1194
+ log.info(` ${ref.feature} (baseline):`);
1122
1195
  for (const [url, count] of baselineUrls) {
1123
1196
  const suffix = count > 1 ? ` (${count} tests)` : "";
1124
- console.log(` ${url}${suffix} [parametric]`);
1197
+ log.info(` ${url}${suffix} [parametric]`);
1125
1198
  }
1126
1199
  }
1127
1200
  if (goldUrls.length === 0 && baselineUrls.length === 0) {
1128
- console.log(` ${ref.feature}: no URLs referenced`);
1201
+ log.info(` ${ref.feature}: no URLs referenced`);
1129
1202
  }
1130
- console.log();
1203
+ log.info("");
1131
1204
  }
1132
1205
  }