@sanity/ailf 0.1.34 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (51) hide show
  1. package/LICENSE +21 -0
  2. package/config/airbyte/ai_literacy_framework.connector.yaml +6 -0
  3. package/config/bigquery/views/reports.sql +1 -0
  4. package/dist/_vendor/ailf-core/examples/index.d.ts +10 -20
  5. package/dist/_vendor/ailf-core/examples/index.js +10 -20
  6. package/dist/_vendor/ailf-core/ports/task-source.d.ts +2 -0
  7. package/dist/_vendor/ailf-core/types/index.d.ts +65 -0
  8. package/dist/_vendor/ailf-tasks/schemas.d.ts +12 -0
  9. package/dist/_vendor/ailf-tasks/schemas.js +4 -0
  10. package/dist/adapters/task-sources/content-lake-task-source.js +9 -1
  11. package/dist/adapters/task-sources/repo-task-source.js +19 -4
  12. package/dist/commands/calculate-scores.js +5 -1
  13. package/dist/commands/publish.js +3 -0
  14. package/dist/composition-root.js +7 -2
  15. package/dist/orchestration/pipeline-orchestrator.js +27 -2
  16. package/dist/orchestration/step-runner.js +8 -0
  17. package/dist/orchestration/steps/calculate-scores-step.js +22 -19
  18. package/dist/orchestration/steps/generate-configs-step.js +1 -0
  19. package/dist/orchestration/steps/grader-consistency-step.js +1 -0
  20. package/dist/orchestration/steps/mirror-repo-tasks-step.js +2 -1
  21. package/dist/orchestration/steps/publish-report-step.js +3 -0
  22. package/dist/pipeline/calculate-scores.d.ts +11 -1
  23. package/dist/pipeline/calculate-scores.js +222 -157
  24. package/dist/pipeline/coverage-audit.d.ts +2 -1
  25. package/dist/pipeline/coverage-audit.js +5 -3
  26. package/dist/pipeline/expand-tasks.d.ts +2 -1
  27. package/dist/pipeline/expand-tasks.js +33 -2
  28. package/dist/pipeline/generate-configs.d.ts +3 -1
  29. package/dist/pipeline/generate-configs.js +51 -37
  30. package/dist/pipeline/grader-api.d.ts +2 -1
  31. package/dist/pipeline/grader-api.js +11 -9
  32. package/dist/pipeline/grader-compare-runner.d.ts +3 -0
  33. package/dist/pipeline/grader-compare-runner.js +21 -19
  34. package/dist/pipeline/grader-consistency-runner.d.ts +3 -0
  35. package/dist/pipeline/grader-consistency-runner.js +16 -14
  36. package/dist/pipeline/grader-sensitivity-runner.d.ts +3 -0
  37. package/dist/pipeline/grader-sensitivity-runner.js +18 -16
  38. package/dist/pipeline/grader-validate-runner.d.ts +3 -0
  39. package/dist/pipeline/grader-validate-runner.js +16 -14
  40. package/dist/pipeline/mirror-repo-tasks.d.ts +80 -1
  41. package/dist/pipeline/mirror-repo-tasks.js +148 -32
  42. package/dist/pipeline/provenance.d.ts +3 -0
  43. package/dist/pipeline/provenance.js +25 -3
  44. package/dist/pipeline/report-title.d.ts +66 -0
  45. package/dist/pipeline/report-title.js +118 -0
  46. package/dist/report-store.js +2 -0
  47. package/dist/sinks/bigquery/index.d.ts +1 -0
  48. package/dist/sinks/bigquery/index.js +1 -0
  49. package/dist/sources.d.ts +2 -1
  50. package/dist/sources.js +28 -1
  51. package/package.json +23 -23
@@ -27,6 +27,7 @@
27
27
  import { existsSync, mkdirSync, readFileSync, writeFileSync } from "fs";
28
28
  import { join } from "path";
29
29
  import { calculateCost } from "../agent-observer/pricing.js";
30
+ import { ConsoleLogger } from "../adapters/loggers/index.js";
30
31
  import { checkResultsExist } from "./checks.js";
31
32
  import { loadRubricTemplates } from "./expand-tasks.js";
32
33
  import { loadSource } from "../sources.js";
@@ -375,63 +376,63 @@ function extractGraderCost(resultsPath) {
375
376
  /**
376
377
  * Prints a formatted report of agent behavior observations.
377
378
  */
378
- function printAgentBehaviorReport(agentBehavior) {
379
- console.log("-".repeat(80));
380
- console.log("AGENT BEHAVIOR OBSERVATION");
381
- console.log("-".repeat(80));
382
- console.log();
379
+ function printAgentBehaviorReport(agentBehavior, log) {
380
+ log.info("-".repeat(80));
381
+ log.info("AGENT BEHAVIOR OBSERVATION");
382
+ log.info("-".repeat(80));
383
+ log.info("");
383
384
  // Summary table
384
385
  const h = "| Feature Area | Tests | Doc Pages | Searches | Net (ms) |";
385
386
  const sep = "|---------------------|-------|-----------|----------|----------|";
386
- console.log(h);
387
- console.log(sep);
387
+ log.info(h);
388
+ log.info(sep);
388
389
  for (const ab of agentBehavior) {
389
- console.log(`| ${ab.feature.padEnd(19)} | ` +
390
+ log.info(`| ${ab.feature.padEnd(19)} | ` +
390
391
  `${ab.tasksWithBehaviorData.toString().padStart(5)} | ` +
391
392
  `${ab.avgDocPagesVisited.toFixed(1).padStart(9)} | ` +
392
393
  `${ab.avgSearchesPerformed.toFixed(1).padStart(8)} | ` +
393
394
  `${Math.round(ab.avgNetworkTimeMs).toString().padStart(8)} |`);
394
395
  }
395
- console.log();
396
+ log.info("");
396
397
  // Doc pages visited
397
- console.log(" Doc pages visited:");
398
+ log.info(" Doc pages visited:");
398
399
  for (const ab of agentBehavior) {
399
400
  if (ab.docSlugsVisited.length === 0) {
400
- console.log(` ${ab.feature}: (none)`);
401
+ log.info(` ${ab.feature}: (none)`);
401
402
  }
402
403
  else {
403
- console.log(` ${ab.feature}:`);
404
+ log.info(` ${ab.feature}:`);
404
405
  for (const slug of ab.docSlugsVisited) {
405
- console.log(` - /docs/${slug}`);
406
+ log.info(` - /docs/${slug}`);
406
407
  }
407
408
  }
408
409
  }
409
- console.log();
410
+ log.info("");
410
411
  // Search queries
411
412
  const hasSearches = agentBehavior.some((ab) => ab.searchQueries.length > 0);
412
413
  if (hasSearches) {
413
- console.log(" Search queries:");
414
+ log.info(" Search queries:");
414
415
  for (const ab of agentBehavior) {
415
416
  if (ab.searchQueries.length === 0) {
416
417
  continue;
417
418
  }
418
- console.log(` ${ab.feature}:`);
419
+ log.info(` ${ab.feature}:`);
419
420
  for (const q of ab.searchQueries) {
420
- console.log(` - "${q}"`);
421
+ log.info(` - "${q}"`);
421
422
  }
422
423
  }
423
- console.log();
424
+ log.info("");
424
425
  }
425
426
  // External domains
426
427
  const allExternalDomains = [
427
428
  ...new Set(agentBehavior.flatMap((ab) => ab.externalDomains)),
428
429
  ];
429
430
  if (allExternalDomains.length > 0) {
430
- console.log(" External domains contacted:");
431
+ log.info(" External domains contacted:");
431
432
  for (const d of allExternalDomains) {
432
- console.log(` - ${d}`);
433
+ log.info(` - ${d}`);
433
434
  }
434
- console.log();
435
+ log.info("");
435
436
  }
436
437
  }
437
438
  // ---------------------------------------------------------------------------
@@ -444,9 +445,15 @@ function printAgentBehaviorReport(agentBehavior) {
444
445
  * Reads the raw Promptfoo output file and normalizes each result so that
445
446
  * `description` is always a top-level field (pulled from `testCase` if needed).
446
447
  */
447
- function readAndNormalizeResults(resultsPath) {
448
+ function readAndNormalizeResults(resultsPath, log) {
449
+ const _log = log ?? new ConsoleLogger();
448
450
  const file = JSON.parse(readFileSync(resultsPath, "utf-8"));
449
451
  const wrapper = file.results ?? file;
452
+ _log.debug("Reading results file", {
453
+ path: resultsPath,
454
+ resultCount: wrapper.results.length,
455
+ stats: wrapper.stats,
456
+ });
450
457
  const all = wrapper.results.map((r) => ({
451
458
  cost: r.cost ?? 0,
452
459
  description: r.testCase?.description ?? "unknown",
@@ -463,15 +470,20 @@ function readAndNormalizeResults(resultsPath) {
463
470
  // Promptfoo sets gradingResult to null when a test errors before grading.
464
471
  const valid = all.filter((r) => r.gradingResult !== null);
465
472
  const skipped = all.length - valid.length;
473
+ _log.debug("Filtered null gradingResults", {
474
+ totalResults: all.length,
475
+ validResults: valid.length,
476
+ skippedCount: skipped,
477
+ });
466
478
  if (skipped > 0) {
467
- console.warn(` ⚠ Skipping ${skipped} of ${all.length} result(s) with null gradingResult (errored tests):`);
479
+ _log.warn(`⚠ Skipping ${skipped} of ${all.length} result(s) with null gradingResult (errored tests):`);
468
480
  for (const r of all) {
469
481
  if (r.gradingResult === null) {
470
482
  const providerLabel = r.provider ? `[${r.provider}] ` : "";
471
483
  const errorMsg = r.error
472
484
  ? r.error.slice(0, 150)
473
485
  : "unknown error (no error field in result)";
474
- console.warn(` ✗ ${providerLabel}"${r.description}" — ${errorMsg}`);
486
+ _log.warn(`✗ ${providerLabel}"${r.description}" — ${errorMsg}`);
475
487
  }
476
488
  }
477
489
  }
@@ -555,7 +567,7 @@ function scoreResults(results, weights, modelId) {
555
567
  const ceilingScore = Math.round(withDocsTotal);
556
568
  const floorScore = Math.round(withoutDocsScore);
557
569
  const docLift = ceilingScore - floorScore;
558
- scores.push({
570
+ const featureScore = {
559
571
  ceilingScore,
560
572
  codeCorrectness: Math.round(avgCode),
561
573
  docCoverage: Math.round(avgDoc),
@@ -569,7 +581,8 @@ function scoreResults(results, weights, modelId) {
569
581
  testCount: data.withDocs.length,
570
582
  totalCost: featureCost,
571
583
  totalScore: ceilingScore,
572
- });
584
+ };
585
+ scores.push(featureScore);
573
586
  }
574
587
  return scores.sort((a, b) => a.feature.localeCompare(b.feature));
575
588
  }
@@ -654,6 +667,7 @@ export function scoreAgenticResults(resultsPath, weights) {
654
667
  const CRITICAL_THRESHOLD = 40;
655
668
  export function calculateAndWriteScores(options) {
656
669
  const ROOT = options.rootDir;
670
+ const log = options.logger ?? new ConsoleLogger();
657
671
  const sourceName = options.source;
658
672
  // Pre-resolved source wins over name-based lookup
659
673
  let source = options.resolvedSource;
@@ -662,7 +676,7 @@ export function calculateAndWriteScores(options) {
662
676
  source = loadSource(sourceName);
663
677
  }
664
678
  catch {
665
- console.warn(` [warn] Could not load source "${sourceName}", proceeding without source metadata`);
679
+ log.warn(`[warn] Could not load source "${sourceName}", proceeding without source metadata`);
666
680
  }
667
681
  }
668
682
  // Determine mode — controls which result files are read
@@ -674,23 +688,28 @@ export function calculateAndWriteScores(options) {
674
688
  const resultsIssues = checkResultsExist(ROOT, baselineResultsPath);
675
689
  const resultsErrors = resultsIssues.filter((i) => i.severity === "error");
676
690
  if (resultsErrors.length > 0) {
677
- console.error("❌ Results validation failed:");
678
- for (const e of resultsErrors) {
679
- console.error(` ERROR: ${e.message}`);
680
- if (e.path) {
681
- console.error(` at ${e.path}`);
682
- }
683
- }
684
- console.error("\nRun 'pnpm eval' first to generate results, then 'pnpm calculate-scores'.");
685
- process.exit(1);
691
+ const details = resultsErrors
692
+ .map((e) => (e.path ? `${e.message} (at ${e.path})` : e.message))
693
+ .join("; ");
694
+ throw new Error(`Results validation failed: ${details}. Run 'pnpm eval' first to generate results.`);
686
695
  }
687
- console.log(`Reading results from: ${baselineResultsPath}`);
696
+ log.info(`Reading results from: ${baselineResultsPath}`);
688
697
  if (source) {
689
- console.log(`Source: ${sourceName} (${source.baseUrl})`);
698
+ log.info(`Source: ${sourceName} (${source.baseUrl})`);
690
699
  }
691
700
  // Load dimension weights from rubrics.yaml
692
701
  const rubricConfig = loadRubricTemplates(ROOT);
702
+ log.debug("Loaded rubric weights", { weights: rubricConfig.weights });
693
703
  const baselineScores = calculateScores(baselineResultsPath, rubricConfig.weights);
704
+ log.debug("Baseline scores calculated", {
705
+ featureCount: baselineScores.length,
706
+ features: baselineScores.map((s) => ({
707
+ feature: s.feature,
708
+ ceilingScore: s.ceilingScore,
709
+ floorScore: s.floorScore,
710
+ docLift: s.docLift,
711
+ })),
712
+ });
694
713
  const perModel = calculateScoresPerModel(baselineResultsPath, rubricConfig.weights);
695
714
  const urlRefs = aggregateUrlReferences(baselineResultsPath);
696
715
  const sourceVerification = buildSourceVerification(ROOT, source, {
@@ -705,8 +724,16 @@ export function calculateAndWriteScores(options) {
705
724
  let sourceIsolation = null;
706
725
  let evaluationMode;
707
726
  if (mode === "full" && existsSync(agenticResultsPath)) {
708
- console.log(`\nReading agentic results from: ${agenticResultsPath}`);
727
+ log.info(`\nReading agentic results from: ${agenticResultsPath}`);
709
728
  const agenticScores = scoreAgenticResults(agenticResultsPath, rubricConfig.weights);
729
+ log.debug("Agentic scores calculated", {
730
+ featureCount: Object.keys(agenticScores).length,
731
+ features: Object.entries(agenticScores).map(([f, s]) => ({
732
+ feature: f,
733
+ actualScore: s.actualScore,
734
+ testCount: s.testCount,
735
+ })),
736
+ });
710
737
  scores = mergeScores(baselineScores, agenticScores);
711
738
  evaluationMode = "full";
712
739
  // Aggregate agent behavior and source isolation from agentic results
@@ -733,12 +760,12 @@ export function calculateAndWriteScores(options) {
733
760
  sourceIsolation = aggregateSourceIsolation(baselineResultsPath, options?.allowedOrigins);
734
761
  evaluationMode = mode === "observed" ? "observed" : "baseline";
735
762
  }
736
- const summary = printReport(scores, urlRefs, source, agentBehavior, graderCost, perModel, sourceIsolation, sourceVerification, evaluationMode);
763
+ const summary = printReport(scores, urlRefs, source, agentBehavior, graderCost, perModel, sourceIsolation, sourceVerification, evaluationMode, log);
737
764
  // Persist
738
765
  const outDir = join(ROOT, "results", "latest");
739
766
  mkdirSync(outDir, { recursive: true });
740
767
  writeFileSync(join(outDir, "score-summary.json"), JSON.stringify(summary, null, 2));
741
- console.log("Score summary written to results/latest/score-summary.json");
768
+ log.info("Score summary written to results/latest/score-summary.json");
742
769
  // Extract and persist grader judgments (Phase 3a: failure mode extraction)
743
770
  const judgments = extractGraderJudgments(baselineResultsPath);
744
771
  // In full mode, also extract judgments from agentic results
@@ -748,23 +775,60 @@ export function calculateAndWriteScores(options) {
748
775
  }
749
776
  if (judgments.length > 0) {
750
777
  writeFileSync(join(outDir, "grader-judgments.json"), JSON.stringify(judgments, null, 2));
751
- console.log(`Grader judgments written to results/latest/grader-judgments.json (${judgments.length} judgments)`);
778
+ log.info(`Grader judgments written to results/latest/grader-judgments.json (${judgments.length} judgments)`);
752
779
  }
753
- // Exit with non-zero if any area below critical threshold
754
- if (summary.belowCritical.length > 0) {
755
- process.exit(1);
780
+ // Compute test summary from the raw results file
781
+ const testSummary = computeTestSummary(baselineResultsPath);
782
+ return { belowCritical: summary.belowCritical, testSummary };
783
+ }
784
+ /**
785
+ * Compute a TestSummary from a raw Promptfoo results file.
786
+ * Counts total, passed, failed, and errored tests with error details.
787
+ */
788
+ function computeTestSummary(resultsPath) {
789
+ const file = JSON.parse(readFileSync(resultsPath, "utf-8"));
790
+ const wrapper = file.results ?? file;
791
+ const rawResults = wrapper.results;
792
+ let passed = 0;
793
+ let failed = 0;
794
+ let errored = 0;
795
+ const errors = [];
796
+ for (const r of rawResults) {
797
+ if (r.gradingResult === null || r.gradingResult === undefined) {
798
+ errored++;
799
+ errors.push({
800
+ model: r.provider?.label ?? r.provider?.id ?? "unknown",
801
+ task: r.testCase?.description ?? "unknown",
802
+ error: r.error
803
+ ? r.error.slice(0, 200)
804
+ : "unknown error (null gradingResult)",
805
+ });
806
+ }
807
+ else if (r.gradingResult.pass) {
808
+ passed++;
809
+ }
810
+ else {
811
+ failed++;
812
+ }
756
813
  }
814
+ return {
815
+ total: rawResults.length,
816
+ passed,
817
+ failed,
818
+ errored,
819
+ ...(errors.length > 0 ? { errors } : {}),
820
+ };
757
821
  }
758
- function printPerModelReport(perModel) {
759
- console.log("-".repeat(80));
760
- console.log("PER-MODEL BREAKDOWN");
761
- console.log("-".repeat(80));
762
- console.log();
822
+ function printPerModelReport(perModel, log) {
823
+ log.info("-".repeat(80));
824
+ log.info("PER-MODEL BREAKDOWN");
825
+ log.info("-".repeat(80));
826
+ log.info("");
763
827
  // Model summary table
764
828
  const h = "| Model | Avg Score | Avg Lift | Tests | Cost |";
765
829
  const sep = "|--------------------------------|-----------|----------|-------|----------|";
766
- console.log(h);
767
- console.log(sep);
830
+ log.info(h);
831
+ log.info(sep);
768
832
  const sorted = [...perModel].sort((a, b) => b.overall.avgScore - a.overall.avgScore);
769
833
  for (const entry of sorted) {
770
834
  const displayName = entry.label || entry.modelId;
@@ -774,63 +838,64 @@ function printPerModelReport(perModel) {
774
838
  const liftStr = entry.overall.avgDocLift >= 0
775
839
  ? `+${entry.overall.avgDocLift.toFixed(1)}`
776
840
  : entry.overall.avgDocLift.toFixed(1);
777
- console.log(`| ${displayName.padEnd(30)} | ` +
841
+ log.info(`| ${displayName.padEnd(30)} | ` +
778
842
  `${entry.overall.avgScore.toFixed(1).padStart(9)} | ` +
779
843
  `${liftStr.padStart(8)} | ` +
780
844
  `${entry.overall.testCount.toString().padStart(5)} | ` +
781
845
  `${costStr.padStart(8)} |`);
782
846
  }
783
- console.log();
847
+ log.info("");
784
848
  // Per-model × per-area breakdown
785
849
  for (const entry of sorted) {
786
850
  const displayName = entry.label || entry.modelId;
787
- console.log(` ${displayName} (${entry.modelId}):`);
851
+ log.info(` ${displayName} (${entry.modelId}):`);
788
852
  const areaH = " | Feature Area | Task | Code | Docs | Total | Lift |";
789
853
  const areaSep = " |---------------------|------|------|------|-------|------|";
790
- console.log(areaH);
791
- console.log(areaSep);
854
+ log.info(areaH);
855
+ log.info(areaSep);
792
856
  for (const s of entry.scores) {
793
857
  const lift = s.docLift >= 0 ? `+${s.docLift}` : `${s.docLift}`;
794
- console.log(` | ${s.feature.padEnd(19)} | ` +
858
+ log.info(` | ${s.feature.padEnd(19)} | ` +
795
859
  `${s.taskCompletion.toString().padStart(4)} | ` +
796
860
  `${s.codeCorrectness.toString().padStart(4)} | ` +
797
861
  `${s.docCoverage.toString().padStart(4)} | ` +
798
862
  `${s.totalScore.toString().padStart(5)} | ` +
799
863
  `${lift.padStart(4)} |`);
800
864
  }
801
- console.log();
865
+ log.info("");
802
866
  }
803
867
  // Cost-per-quality-point
804
868
  const modelsWithCost = sorted.filter((e) => e.overall.cost && e.overall.cost > 0);
805
869
  if (modelsWithCost.length > 0) {
806
- console.log(" Cost per quality point:");
870
+ log.info(" Cost per quality point:");
807
871
  for (const entry of modelsWithCost) {
808
872
  const displayName = entry.label;
809
873
  const costPerPoint = entry.overall.avgScore > 0
810
874
  ? (entry.overall.cost ?? 0) / entry.overall.avgScore
811
875
  : 0;
812
- console.log(` ${displayName}: $${costPerPoint.toFixed(6)}/point (score: ${entry.overall.avgScore.toFixed(1)}, cost: $${(entry.overall.cost ?? 0).toFixed(4)})`);
876
+ log.info(` ${displayName}: $${costPerPoint.toFixed(6)}/point (score: ${entry.overall.avgScore.toFixed(1)}, cost: $${(entry.overall.cost ?? 0).toFixed(4)})`);
813
877
  }
814
- console.log();
878
+ log.info("");
815
879
  }
816
880
  }
817
881
  // ---------------------------------------------------------------------------
818
882
  // Main
819
883
  // ---------------------------------------------------------------------------
820
- function printReport(scores, urlRefs, source, agentBehavior, graderCost, perModel, sourceIsolation, sourceVerification, evaluationMode) {
821
- console.log("\n" + "=".repeat(80));
822
- console.log(" SANITY AI LITERACY SCORE REPORT");
823
- console.log("=".repeat(80));
824
- console.log();
884
+ function printReport(scores, urlRefs, source, agentBehavior, graderCost, perModel, sourceIsolation, sourceVerification, evaluationMode, log) {
885
+ const _log = log ?? new ConsoleLogger();
886
+ _log.info("\n" + "=".repeat(80));
887
+ _log.info(" SANITY AI LITERACY SCORE REPORT");
888
+ _log.info("=".repeat(80));
889
+ _log.info("");
825
890
  // Table header
826
891
  const h = "| Feature Area | Task | Code | Docs | Total | w/o Docs | Doc Lift |";
827
892
  const sep = "|---------------------|------|------|------|-------|----------|----------|";
828
- console.log(h);
829
- console.log(sep);
893
+ _log.info(h);
894
+ _log.info(sep);
830
895
  for (const s of scores) {
831
896
  const status = s.totalScore < CRITICAL_THRESHOLD ? "!!" : "ok";
832
897
  const lift = s.docLift > 0 ? `+${s.docLift}` : `${s.docLift}`;
833
- console.log(`| ${status} ${s.feature.padEnd(17)} | ` +
898
+ _log.info(`| ${status} ${s.feature.padEnd(17)} | ` +
834
899
  `${s.taskCompletion.toString().padStart(4)} | ` +
835
900
  `${s.codeCorrectness.toString().padStart(4)} | ` +
836
901
  `${s.docCoverage.toString().padStart(4)} | ` +
@@ -838,7 +903,7 @@ function printReport(scores, urlRefs, source, agentBehavior, graderCost, perMode
838
903
  `${s.floorScore.toString().padStart(8)} | ` +
839
904
  `${lift.padStart(8)} |`);
840
905
  }
841
- console.log();
906
+ _log.info("");
842
907
  // OKR status
843
908
  const belowCritical = scores.filter((s) => s.totalScore < CRITICAL_THRESHOLD);
844
909
  const lowestScore = scores.reduce((min, s) => s.totalScore < min.totalScore ? s : min);
@@ -852,69 +917,69 @@ function printReport(scores, urlRefs, source, agentBehavior, graderCost, perMode
852
917
  area: s.feature,
853
918
  docLift: s.docLift,
854
919
  }));
855
- console.log("-".repeat(80));
856
- console.log("OKR STATUS");
857
- console.log("-".repeat(80));
858
- console.log();
920
+ _log.info("-".repeat(80));
921
+ _log.info("OKR STATUS");
922
+ _log.info("-".repeat(80));
923
+ _log.info("");
859
924
  if (belowCritical.length === 0) {
860
- console.log(" KR1: PASS -- All areas above critical threshold (>=40)");
925
+ _log.info(" KR1: PASS -- All areas above critical threshold (>=40)");
861
926
  }
862
927
  else {
863
- console.log(" KR1: FAIL -- Areas below critical threshold:");
864
- belowCritical.forEach((s) => console.log(` - ${s.feature}: ${s.totalScore}`));
928
+ _log.info(" KR1: FAIL -- Areas below critical threshold:");
929
+ belowCritical.forEach((s) => _log.info(` - ${s.feature}: ${s.totalScore}`));
865
930
  }
866
- console.log();
867
- console.log(` Lowest area: ${lowestScore.feature} (${lowestScore.totalScore})`);
868
- console.log(` Target: +15 points improvement`);
869
- console.log();
870
- console.log(` Avg score: ${avgScore.toFixed(1)}`);
871
- console.log(` Avg doc lift: +${avgLift.toFixed(1)} points`);
872
- console.log(` (Doc lift = how much docs help vs parametric knowledge alone)`);
873
- console.log();
931
+ _log.info("");
932
+ _log.info(` Lowest area: ${lowestScore.feature} (${lowestScore.totalScore})`);
933
+ _log.info(` Target: +15 points improvement`);
934
+ _log.info("");
935
+ _log.info(` Avg score: ${avgScore.toFixed(1)}`);
936
+ _log.info(` Avg doc lift: +${avgLift.toFixed(1)} points`);
937
+ _log.info(` (Doc lift = how much docs help vs parametric knowledge alone)`);
938
+ _log.info("");
874
939
  // Ceiling decomposition
875
- console.log("-".repeat(80));
876
- console.log("CEILING DECOMPOSITION");
877
- console.log("-".repeat(80));
878
- console.log();
940
+ _log.info("-".repeat(80));
941
+ _log.info("CEILING DECOMPOSITION");
942
+ _log.info("-".repeat(80));
943
+ _log.info("");
879
944
  const ceilH = "| Feature Area | Floor | Ceiling | Doc Lift | Quality Gap |";
880
945
  const ceilSep = "|---------------------|-------|---------|----------|-------------|";
881
- console.log(ceilH);
882
- console.log(ceilSep);
946
+ _log.info(ceilH);
947
+ _log.info(ceilSep);
883
948
  for (const s of scores) {
884
949
  const liftStr = s.docLift >= 0 ? `+${s.docLift}` : `${s.docLift}`;
885
950
  const liftFlag = s.negativeDocLift ? " 🚨" : "";
886
- console.log(`| ${s.feature.padEnd(19)} | ` +
951
+ _log.info(`| ${s.feature.padEnd(19)} | ` +
887
952
  `${s.floorScore.toString().padStart(5)} | ` +
888
953
  `${s.ceilingScore.toString().padStart(7)} | ` +
889
954
  `${liftStr.padStart(8)}${liftFlag} | ` +
890
955
  `${s.docQualityGap.toString().padStart(11)} |`);
891
956
  }
892
- console.log();
957
+ _log.info("");
893
958
  if (negativeDocLiftAreas.length > 0) {
894
- console.log(" 🚨 NEGATIVE DOC LIFT DETECTED:");
959
+ _log.info(" 🚨 NEGATIVE DOC LIFT DETECTED:");
895
960
  for (const { area, docLift } of negativeDocLiftAreas) {
896
961
  const s = scores.find((sc) => sc.feature === area);
897
- console.log(` ${area}: Doc Lift = ${docLift} (floor: ${s.floorScore}, ceiling: ${s.ceilingScore})`);
962
+ _log.info(` ${area}: Doc Lift = ${docLift} (floor: ${s.floorScore}, ceiling: ${s.ceilingScore})`);
898
963
  }
899
- console.log(" Documentation is HURTING model performance for these areas.");
900
- console.log(" See docs/design-docs/scenario-matrix/evaluation-ceiling.md");
901
- console.log();
964
+ _log.info(" Documentation is HURTING model performance for these areas.");
965
+ _log.info(" See docs/design-docs/scenario-matrix/evaluation-ceiling.md");
966
+ _log.info("");
902
967
  }
903
968
  else {
904
- console.log(" ✅ No areas with negative Doc Lift detected.");
905
- console.log();
969
+ _log.info(" ✅ No areas with negative Doc Lift detected.");
970
+ _log.info("");
906
971
  }
907
972
  // Three-layer decomposition (only when actual scores are present)
908
973
  const hasActualScores = scores.some((s) => s.actualScore !== undefined);
909
974
  if (hasActualScores) {
910
- console.log("-".repeat(80));
911
- console.log("THREE-LAYER DECOMPOSITION (floor → ceiling → actual)");
912
- console.log("-".repeat(80));
913
- console.log();
975
+ _log.info("-".repeat(80));
976
+ _log.info("THREE-LAYER DECOMPOSITION (floor → ceiling → actual)");
977
+ _log.info("-".repeat(80));
978
+ _log.info("");
914
979
  const decompH = "| Feature Area | Floor | Ceiling | Actual | Doc Lift | Ret. Gap | Infra % |";
915
980
  const decompSep = "|---------------------|-------|---------|--------|----------|----------|---------|";
916
- console.log(decompH);
917
- console.log(decompSep);
981
+ _log.info(decompH);
982
+ _log.info(decompSep);
918
983
  for (const s of scores) {
919
984
  const liftStr = s.docLift >= 0 ? `+${s.docLift}` : `${s.docLift}`;
920
985
  const actualStr = s.actualScore !== undefined ? s.actualScore.toString() : "—";
@@ -927,7 +992,7 @@ function printReport(scores, urlRefs, source, agentBehavior, graderCost, perMode
927
992
  ? `${Math.round(s.infrastructureEfficiency * 100)}%`
928
993
  : "—";
929
994
  const flag = s.invertedRetrievalGap ? " 🔄" : "";
930
- console.log(`| ${s.feature.padEnd(19)} | ` +
995
+ _log.info(`| ${s.feature.padEnd(19)} | ` +
931
996
  `${s.floorScore.toString().padStart(5)} | ` +
932
997
  `${s.ceilingScore.toString().padStart(7)} | ` +
933
998
  `${actualStr.padStart(6)} | ` +
@@ -935,10 +1000,10 @@ function printReport(scores, urlRefs, source, agentBehavior, graderCost, perMode
935
1000
  `${(gapStr + flag).padStart(8)} | ` +
936
1001
  `${infraStr.padStart(7)} |`);
937
1002
  }
938
- console.log();
939
- console.log(" Doc Lift = ceiling − floor | Ret. Gap = ceiling − actual | Infra = actual / ceiling");
940
- console.log(" 🔄 = inverted retrieval gap (agents avoid bad docs → higher actual than ceiling)");
941
- console.log();
1003
+ _log.info("");
1004
+ _log.info(" Doc Lift = ceiling − floor | Ret. Gap = ceiling − actual | Infra = actual / ceiling");
1005
+ _log.info(" 🔄 = inverted retrieval gap (agents avoid bad docs → higher actual than ceiling)");
1006
+ _log.info("");
942
1007
  }
943
1008
  // Cost summary
944
1009
  const totalCost = scores.reduce((sum, s) => sum + s.totalCost, 0);
@@ -946,66 +1011,66 @@ function printReport(scores, urlRefs, source, agentBehavior, graderCost, perMode
946
1011
  const graderCostTotal = graderCost?.cost ?? 0;
947
1012
  const combinedCost = totalCost + graderCostTotal;
948
1013
  if (totalCost > 0 || graderCostTotal > 0) {
949
- console.log("-".repeat(80));
950
- console.log("COST SUMMARY");
951
- console.log("-".repeat(80));
952
- console.log();
953
- console.log(` Provider cost: $${totalCost.toFixed(4)}`);
1014
+ _log.info("-".repeat(80));
1015
+ _log.info("COST SUMMARY");
1016
+ _log.info("-".repeat(80));
1017
+ _log.info("");
1018
+ _log.info(` Provider cost: $${totalCost.toFixed(4)}`);
954
1019
  if (graderCostTotal > 0) {
955
1020
  const graderLabel = graderCost?.model ?? "unknown";
956
- console.log(` Grader cost: $${graderCostTotal.toFixed(4)} (${graderLabel}, ${(graderCost?.totalTokens ?? 0).toLocaleString()} tokens)`);
1021
+ _log.info(` Grader cost: $${graderCostTotal.toFixed(4)} (${graderLabel}, ${(graderCost?.totalTokens ?? 0).toLocaleString()} tokens)`);
957
1022
  }
958
- console.log(` Total cost: $${combinedCost.toFixed(4)}`);
959
- console.log(` Avg cost per test: $${(combinedCost / (totalTests || 1)).toFixed(4)}`);
960
- console.log();
1023
+ _log.info(` Total cost: $${combinedCost.toFixed(4)}`);
1024
+ _log.info(` Avg cost per test: $${(combinedCost / (totalTests || 1)).toFixed(4)}`);
1025
+ _log.info("");
961
1026
  const costHeader = "| Feature Area | Tests | Cost | Avg/Test |";
962
1027
  const costSep = "|---------------------|-------|----------|----------|";
963
- console.log(costHeader);
964
- console.log(costSep);
1028
+ _log.info(costHeader);
1029
+ _log.info(costSep);
965
1030
  for (const s of scores) {
966
1031
  const avgCost = s.testCount > 0 ? s.totalCost / s.testCount : 0;
967
- console.log(`| ${s.feature.padEnd(19)} | ` +
1032
+ _log.info(`| ${s.feature.padEnd(19)} | ` +
968
1033
  `${s.testCount.toString().padStart(5)} | ` +
969
1034
  `$${s.totalCost.toFixed(4).padStart(7)} | ` +
970
1035
  `$${avgCost.toFixed(4).padStart(7)} |`);
971
1036
  }
972
- console.log();
1037
+ _log.info("");
973
1038
  }
974
1039
  // Per-model breakdown
975
1040
  if (perModel) {
976
- printPerModelReport(perModel);
1041
+ printPerModelReport(perModel, _log);
977
1042
  }
978
1043
  // URL References
979
- printUrlReport(urlRefs);
1044
+ printUrlReport(urlRefs, _log);
980
1045
  // Agent Behavior (only present when run with instrumented provider)
981
1046
  if (agentBehavior && agentBehavior.length > 0) {
982
- printAgentBehaviorReport(agentBehavior);
1047
+ printAgentBehaviorReport(agentBehavior, _log);
983
1048
  }
984
1049
  // Source verification (unified report for all modes)
985
1050
  if (sourceVerification || sourceIsolation) {
986
- console.log("-".repeat(80));
987
- console.log("📋 SOURCE VERIFICATION");
988
- console.log("-".repeat(80));
1051
+ _log.info("-".repeat(80));
1052
+ _log.info("📋 SOURCE VERIFICATION");
1053
+ _log.info("-".repeat(80));
989
1054
  if (sourceVerification) {
990
- console.log(` Source: ${sourceVerification.source}`);
991
- console.log(` Mode: ${sourceVerification.mode}`);
1055
+ _log.info(` Source: ${sourceVerification.source}`);
1056
+ _log.info(` Mode: ${sourceVerification.mode}`);
992
1057
  if (sourceVerification.allowedOrigins) {
993
- console.log(` Sandbox: ${sourceVerification.allowedOrigins.join(", ")}`);
1058
+ _log.info(` Sandbox: ${sourceVerification.allowedOrigins.join(", ")}`);
994
1059
  }
995
1060
  if (sourceVerification.searchMode) {
996
- console.log(` Search: ${sourceVerification.searchMode}`);
1061
+ _log.info(` Search: ${sourceVerification.searchMode}`);
997
1062
  }
998
1063
  // URL fetch results (baseline mode with direct URLs)
999
1064
  if (sourceVerification.urlFetch) {
1000
1065
  const uf = sourceVerification.urlFetch;
1001
- console.log();
1002
- console.log(` URL fetch: ${uf.totalFetched} fetched, ${uf.totalFailed} failed`);
1066
+ _log.info("");
1067
+ _log.info(` URL fetch: ${uf.totalFetched} fetched, ${uf.totalFailed} failed`);
1003
1068
  for (const f of uf.fetchedUrls) {
1004
- console.log(` ✅ ${f.url} (via ${f.method})`);
1069
+ _log.info(` ✅ ${f.url} (via ${f.method})`);
1005
1070
  }
1006
1071
  for (const f of uf.failures) {
1007
1072
  // oxlint-disable-next-line @typescript-eslint/prefer-nullish-coalescing -- empty string means no error info
1008
- console.log(` ⚠️ ${f.url}: ${f.error || "unknown error"}`);
1073
+ _log.info(` ⚠️ ${f.url}: ${f.error || "unknown error"}`);
1009
1074
  }
1010
1075
  }
1011
1076
  }
@@ -1013,22 +1078,22 @@ function printReport(scores, urlRefs, source, agentBehavior, graderCost, perMode
1013
1078
  if (sourceIsolation) {
1014
1079
  const pct = Math.round(sourceIsolation.isolationScore * 100);
1015
1080
  const icon = sourceIsolation.offOrigin === 0 ? "✅" : "⚠️";
1016
- console.log();
1017
- console.log(` Agent isolation: ${icon} ${pct}% (${sourceIsolation.onOrigin}/${sourceIsolation.total} on-origin)`);
1081
+ _log.info("");
1082
+ _log.info(` Agent isolation: ${icon} ${pct}% (${sourceIsolation.onOrigin}/${sourceIsolation.total} on-origin)`);
1018
1083
  if (sourceIsolation.offOrigin > 0) {
1019
- console.log(` Off-origin fetches: ${sourceIsolation.offOrigin}`);
1084
+ _log.info(` Off-origin fetches: ${sourceIsolation.offOrigin}`);
1020
1085
  for (const url of sourceIsolation.offOriginUrls.slice(0, 10)) {
1021
- console.log(` • ${url}`);
1086
+ _log.info(` • ${url}`);
1022
1087
  }
1023
1088
  }
1024
1089
  if (Object.keys(sourceIsolation.originBreakdown).length > 0) {
1025
- console.log(" Origin breakdown:");
1090
+ _log.info(" Origin breakdown:");
1026
1091
  for (const [origin, count] of Object.entries(sourceIsolation.originBreakdown).sort((a, b) => b[1] - a[1])) {
1027
- console.log(` ${origin}: ${count}`);
1092
+ _log.info(` ${origin}: ${count}`);
1028
1093
  }
1029
1094
  }
1030
1095
  }
1031
- console.log();
1096
+ _log.info("");
1032
1097
  }
1033
1098
  // Build overall agent behavior stats for summary
1034
1099
  const overallAgentBehavior = agentBehavior && agentBehavior.length > 0
@@ -1110,31 +1175,31 @@ function printReport(scores, urlRefs, source, agentBehavior, graderCost, perMode
1110
1175
  urlReferences: urlRefs,
1111
1176
  };
1112
1177
  }
1113
- function printUrlReport(urlRefs) {
1114
- console.log("-".repeat(80));
1115
- console.log("URL REFERENCES");
1116
- console.log("-".repeat(80));
1117
- console.log();
1178
+ function printUrlReport(urlRefs, log) {
1179
+ log.info("-".repeat(80));
1180
+ log.info("URL REFERENCES");
1181
+ log.info("-".repeat(80));
1182
+ log.info("");
1118
1183
  for (const ref of urlRefs) {
1119
1184
  const goldUrls = Object.entries(ref.gold.urls).sort((a, b) => b[1] - a[1]);
1120
1185
  const baselineUrls = Object.entries(ref.baseline.urls).sort((a, b) => b[1] - a[1]);
1121
1186
  if (goldUrls.length > 0) {
1122
- console.log(` ${ref.feature} (gold):`);
1187
+ log.info(` ${ref.feature} (gold):`);
1123
1188
  for (const [url, count] of goldUrls) {
1124
1189
  const suffix = count > 1 ? ` (${count} tests)` : "";
1125
- console.log(` ${url}${suffix}`);
1190
+ log.info(` ${url}${suffix}`);
1126
1191
  }
1127
1192
  }
1128
1193
  if (baselineUrls.length > 0) {
1129
- console.log(` ${ref.feature} (baseline):`);
1194
+ log.info(` ${ref.feature} (baseline):`);
1130
1195
  for (const [url, count] of baselineUrls) {
1131
1196
  const suffix = count > 1 ? ` (${count} tests)` : "";
1132
- console.log(` ${url}${suffix} [parametric]`);
1197
+ log.info(` ${url}${suffix} [parametric]`);
1133
1198
  }
1134
1199
  }
1135
1200
  if (goldUrls.length === 0 && baselineUrls.length === 0) {
1136
- console.log(` ${ref.feature}: no URLs referenced`);
1201
+ log.info(` ${ref.feature}: no URLs referenced`);
1137
1202
  }
1138
- console.log();
1203
+ log.info("");
1139
1204
  }
1140
1205
  }