@sanity/ailf 0.2.0 → 0.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/config/models.yaml +3 -2
- package/dist/_vendor/ailf-core/types/index.d.ts +53 -0
- package/dist/composition-root.js +7 -2
- package/dist/orchestration/pipeline-orchestrator.js +27 -2
- package/dist/orchestration/step-runner.js +8 -0
- package/dist/orchestration/steps/calculate-scores-step.js +4 -0
- package/dist/orchestration/steps/generate-configs-step.js +1 -0
- package/dist/orchestration/steps/grader-consistency-step.js +1 -0
- package/dist/orchestration/steps/mirror-repo-tasks-step.js +2 -1
- package/dist/pipeline/calculate-scores.d.ts +5 -0
- package/dist/pipeline/calculate-scores.js +219 -146
- package/dist/pipeline/coverage-audit.d.ts +2 -1
- package/dist/pipeline/coverage-audit.js +5 -3
- package/dist/pipeline/expand-tasks.d.ts +2 -1
- package/dist/pipeline/expand-tasks.js +33 -2
- package/dist/pipeline/generate-configs.d.ts +3 -1
- package/dist/pipeline/generate-configs.js +47 -28
- package/dist/pipeline/grader-api.d.ts +2 -1
- package/dist/pipeline/grader-api.js +11 -9
- package/dist/pipeline/grader-compare-runner.d.ts +3 -0
- package/dist/pipeline/grader-compare-runner.js +21 -19
- package/dist/pipeline/grader-consistency-runner.d.ts +3 -0
- package/dist/pipeline/grader-consistency-runner.js +16 -14
- package/dist/pipeline/grader-sensitivity-runner.d.ts +3 -0
- package/dist/pipeline/grader-sensitivity-runner.js +18 -16
- package/dist/pipeline/grader-validate-runner.d.ts +3 -0
- package/dist/pipeline/grader-validate-runner.js +16 -14
- package/dist/pipeline/mirror-repo-tasks.d.ts +3 -1
- package/dist/pipeline/mirror-repo-tasks.js +8 -6
- package/dist/pipeline/provenance.d.ts +3 -0
- package/dist/pipeline/provenance.js +25 -3
- package/dist/sources.d.ts +2 -1
- package/dist/sources.js +28 -1
- package/package.json +3 -3
|
@@ -27,6 +27,7 @@
|
|
|
27
27
|
import { existsSync, mkdirSync, readFileSync, writeFileSync } from "fs";
|
|
28
28
|
import { join } from "path";
|
|
29
29
|
import { calculateCost } from "../agent-observer/pricing.js";
|
|
30
|
+
import { ConsoleLogger } from "../adapters/loggers/index.js";
|
|
30
31
|
import { checkResultsExist } from "./checks.js";
|
|
31
32
|
import { loadRubricTemplates } from "./expand-tasks.js";
|
|
32
33
|
import { loadSource } from "../sources.js";
|
|
@@ -375,63 +376,63 @@ function extractGraderCost(resultsPath) {
|
|
|
375
376
|
/**
|
|
376
377
|
* Prints a formatted report of agent behavior observations.
|
|
377
378
|
*/
|
|
378
|
-
function printAgentBehaviorReport(agentBehavior) {
|
|
379
|
-
|
|
380
|
-
|
|
381
|
-
|
|
382
|
-
|
|
379
|
+
function printAgentBehaviorReport(agentBehavior, log) {
|
|
380
|
+
log.info("-".repeat(80));
|
|
381
|
+
log.info("AGENT BEHAVIOR OBSERVATION");
|
|
382
|
+
log.info("-".repeat(80));
|
|
383
|
+
log.info("");
|
|
383
384
|
// Summary table
|
|
384
385
|
const h = "| Feature Area | Tests | Doc Pages | Searches | Net (ms) |";
|
|
385
386
|
const sep = "|---------------------|-------|-----------|----------|----------|";
|
|
386
|
-
|
|
387
|
-
|
|
387
|
+
log.info(h);
|
|
388
|
+
log.info(sep);
|
|
388
389
|
for (const ab of agentBehavior) {
|
|
389
|
-
|
|
390
|
+
log.info(`| ${ab.feature.padEnd(19)} | ` +
|
|
390
391
|
`${ab.tasksWithBehaviorData.toString().padStart(5)} | ` +
|
|
391
392
|
`${ab.avgDocPagesVisited.toFixed(1).padStart(9)} | ` +
|
|
392
393
|
`${ab.avgSearchesPerformed.toFixed(1).padStart(8)} | ` +
|
|
393
394
|
`${Math.round(ab.avgNetworkTimeMs).toString().padStart(8)} |`);
|
|
394
395
|
}
|
|
395
|
-
|
|
396
|
+
log.info("");
|
|
396
397
|
// Doc pages visited
|
|
397
|
-
|
|
398
|
+
log.info(" Doc pages visited:");
|
|
398
399
|
for (const ab of agentBehavior) {
|
|
399
400
|
if (ab.docSlugsVisited.length === 0) {
|
|
400
|
-
|
|
401
|
+
log.info(` ${ab.feature}: (none)`);
|
|
401
402
|
}
|
|
402
403
|
else {
|
|
403
|
-
|
|
404
|
+
log.info(` ${ab.feature}:`);
|
|
404
405
|
for (const slug of ab.docSlugsVisited) {
|
|
405
|
-
|
|
406
|
+
log.info(` - /docs/${slug}`);
|
|
406
407
|
}
|
|
407
408
|
}
|
|
408
409
|
}
|
|
409
|
-
|
|
410
|
+
log.info("");
|
|
410
411
|
// Search queries
|
|
411
412
|
const hasSearches = agentBehavior.some((ab) => ab.searchQueries.length > 0);
|
|
412
413
|
if (hasSearches) {
|
|
413
|
-
|
|
414
|
+
log.info(" Search queries:");
|
|
414
415
|
for (const ab of agentBehavior) {
|
|
415
416
|
if (ab.searchQueries.length === 0) {
|
|
416
417
|
continue;
|
|
417
418
|
}
|
|
418
|
-
|
|
419
|
+
log.info(` ${ab.feature}:`);
|
|
419
420
|
for (const q of ab.searchQueries) {
|
|
420
|
-
|
|
421
|
+
log.info(` - "${q}"`);
|
|
421
422
|
}
|
|
422
423
|
}
|
|
423
|
-
|
|
424
|
+
log.info("");
|
|
424
425
|
}
|
|
425
426
|
// External domains
|
|
426
427
|
const allExternalDomains = [
|
|
427
428
|
...new Set(agentBehavior.flatMap((ab) => ab.externalDomains)),
|
|
428
429
|
];
|
|
429
430
|
if (allExternalDomains.length > 0) {
|
|
430
|
-
|
|
431
|
+
log.info(" External domains contacted:");
|
|
431
432
|
for (const d of allExternalDomains) {
|
|
432
|
-
|
|
433
|
+
log.info(` - ${d}`);
|
|
433
434
|
}
|
|
434
|
-
|
|
435
|
+
log.info("");
|
|
435
436
|
}
|
|
436
437
|
}
|
|
437
438
|
// ---------------------------------------------------------------------------
|
|
@@ -444,9 +445,15 @@ function printAgentBehaviorReport(agentBehavior) {
|
|
|
444
445
|
* Reads the raw Promptfoo output file and normalizes each result so that
|
|
445
446
|
* `description` is always a top-level field (pulled from `testCase` if needed).
|
|
446
447
|
*/
|
|
447
|
-
function readAndNormalizeResults(resultsPath) {
|
|
448
|
+
function readAndNormalizeResults(resultsPath, log) {
|
|
449
|
+
const _log = log ?? new ConsoleLogger();
|
|
448
450
|
const file = JSON.parse(readFileSync(resultsPath, "utf-8"));
|
|
449
451
|
const wrapper = file.results ?? file;
|
|
452
|
+
_log.debug("Reading results file", {
|
|
453
|
+
path: resultsPath,
|
|
454
|
+
resultCount: wrapper.results.length,
|
|
455
|
+
stats: wrapper.stats,
|
|
456
|
+
});
|
|
450
457
|
const all = wrapper.results.map((r) => ({
|
|
451
458
|
cost: r.cost ?? 0,
|
|
452
459
|
description: r.testCase?.description ?? "unknown",
|
|
@@ -463,15 +470,20 @@ function readAndNormalizeResults(resultsPath) {
|
|
|
463
470
|
// Promptfoo sets gradingResult to null when a test errors before grading.
|
|
464
471
|
const valid = all.filter((r) => r.gradingResult !== null);
|
|
465
472
|
const skipped = all.length - valid.length;
|
|
473
|
+
_log.debug("Filtered null gradingResults", {
|
|
474
|
+
totalResults: all.length,
|
|
475
|
+
validResults: valid.length,
|
|
476
|
+
skippedCount: skipped,
|
|
477
|
+
});
|
|
466
478
|
if (skipped > 0) {
|
|
467
|
-
|
|
479
|
+
_log.warn(`⚠ Skipping ${skipped} of ${all.length} result(s) with null gradingResult (errored tests):`);
|
|
468
480
|
for (const r of all) {
|
|
469
481
|
if (r.gradingResult === null) {
|
|
470
482
|
const providerLabel = r.provider ? `[${r.provider}] ` : "";
|
|
471
483
|
const errorMsg = r.error
|
|
472
484
|
? r.error.slice(0, 150)
|
|
473
485
|
: "unknown error (no error field in result)";
|
|
474
|
-
|
|
486
|
+
_log.warn(`✗ ${providerLabel}"${r.description}" — ${errorMsg}`);
|
|
475
487
|
}
|
|
476
488
|
}
|
|
477
489
|
}
|
|
@@ -555,7 +567,7 @@ function scoreResults(results, weights, modelId) {
|
|
|
555
567
|
const ceilingScore = Math.round(withDocsTotal);
|
|
556
568
|
const floorScore = Math.round(withoutDocsScore);
|
|
557
569
|
const docLift = ceilingScore - floorScore;
|
|
558
|
-
|
|
570
|
+
const featureScore = {
|
|
559
571
|
ceilingScore,
|
|
560
572
|
codeCorrectness: Math.round(avgCode),
|
|
561
573
|
docCoverage: Math.round(avgDoc),
|
|
@@ -569,7 +581,8 @@ function scoreResults(results, weights, modelId) {
|
|
|
569
581
|
testCount: data.withDocs.length,
|
|
570
582
|
totalCost: featureCost,
|
|
571
583
|
totalScore: ceilingScore,
|
|
572
|
-
}
|
|
584
|
+
};
|
|
585
|
+
scores.push(featureScore);
|
|
573
586
|
}
|
|
574
587
|
return scores.sort((a, b) => a.feature.localeCompare(b.feature));
|
|
575
588
|
}
|
|
@@ -654,6 +667,7 @@ export function scoreAgenticResults(resultsPath, weights) {
|
|
|
654
667
|
const CRITICAL_THRESHOLD = 40;
|
|
655
668
|
export function calculateAndWriteScores(options) {
|
|
656
669
|
const ROOT = options.rootDir;
|
|
670
|
+
const log = options.logger ?? new ConsoleLogger();
|
|
657
671
|
const sourceName = options.source;
|
|
658
672
|
// Pre-resolved source wins over name-based lookup
|
|
659
673
|
let source = options.resolvedSource;
|
|
@@ -662,7 +676,7 @@ export function calculateAndWriteScores(options) {
|
|
|
662
676
|
source = loadSource(sourceName);
|
|
663
677
|
}
|
|
664
678
|
catch {
|
|
665
|
-
|
|
679
|
+
log.warn(`[warn] Could not load source "${sourceName}", proceeding without source metadata`);
|
|
666
680
|
}
|
|
667
681
|
}
|
|
668
682
|
// Determine mode — controls which result files are read
|
|
@@ -679,13 +693,23 @@ export function calculateAndWriteScores(options) {
|
|
|
679
693
|
.join("; ");
|
|
680
694
|
throw new Error(`Results validation failed: ${details}. Run 'pnpm eval' first to generate results.`);
|
|
681
695
|
}
|
|
682
|
-
|
|
696
|
+
log.info(`Reading results from: ${baselineResultsPath}`);
|
|
683
697
|
if (source) {
|
|
684
|
-
|
|
698
|
+
log.info(`Source: ${sourceName} (${source.baseUrl})`);
|
|
685
699
|
}
|
|
686
700
|
// Load dimension weights from rubrics.yaml
|
|
687
701
|
const rubricConfig = loadRubricTemplates(ROOT);
|
|
702
|
+
log.debug("Loaded rubric weights", { weights: rubricConfig.weights });
|
|
688
703
|
const baselineScores = calculateScores(baselineResultsPath, rubricConfig.weights);
|
|
704
|
+
log.debug("Baseline scores calculated", {
|
|
705
|
+
featureCount: baselineScores.length,
|
|
706
|
+
features: baselineScores.map((s) => ({
|
|
707
|
+
feature: s.feature,
|
|
708
|
+
ceilingScore: s.ceilingScore,
|
|
709
|
+
floorScore: s.floorScore,
|
|
710
|
+
docLift: s.docLift,
|
|
711
|
+
})),
|
|
712
|
+
});
|
|
689
713
|
const perModel = calculateScoresPerModel(baselineResultsPath, rubricConfig.weights);
|
|
690
714
|
const urlRefs = aggregateUrlReferences(baselineResultsPath);
|
|
691
715
|
const sourceVerification = buildSourceVerification(ROOT, source, {
|
|
@@ -700,8 +724,16 @@ export function calculateAndWriteScores(options) {
|
|
|
700
724
|
let sourceIsolation = null;
|
|
701
725
|
let evaluationMode;
|
|
702
726
|
if (mode === "full" && existsSync(agenticResultsPath)) {
|
|
703
|
-
|
|
727
|
+
log.info(`\nReading agentic results from: ${agenticResultsPath}`);
|
|
704
728
|
const agenticScores = scoreAgenticResults(agenticResultsPath, rubricConfig.weights);
|
|
729
|
+
log.debug("Agentic scores calculated", {
|
|
730
|
+
featureCount: Object.keys(agenticScores).length,
|
|
731
|
+
features: Object.entries(agenticScores).map(([f, s]) => ({
|
|
732
|
+
feature: f,
|
|
733
|
+
actualScore: s.actualScore,
|
|
734
|
+
testCount: s.testCount,
|
|
735
|
+
})),
|
|
736
|
+
});
|
|
705
737
|
scores = mergeScores(baselineScores, agenticScores);
|
|
706
738
|
evaluationMode = "full";
|
|
707
739
|
// Aggregate agent behavior and source isolation from agentic results
|
|
@@ -728,12 +760,12 @@ export function calculateAndWriteScores(options) {
|
|
|
728
760
|
sourceIsolation = aggregateSourceIsolation(baselineResultsPath, options?.allowedOrigins);
|
|
729
761
|
evaluationMode = mode === "observed" ? "observed" : "baseline";
|
|
730
762
|
}
|
|
731
|
-
const summary = printReport(scores, urlRefs, source, agentBehavior, graderCost, perModel, sourceIsolation, sourceVerification, evaluationMode);
|
|
763
|
+
const summary = printReport(scores, urlRefs, source, agentBehavior, graderCost, perModel, sourceIsolation, sourceVerification, evaluationMode, log);
|
|
732
764
|
// Persist
|
|
733
765
|
const outDir = join(ROOT, "results", "latest");
|
|
734
766
|
mkdirSync(outDir, { recursive: true });
|
|
735
767
|
writeFileSync(join(outDir, "score-summary.json"), JSON.stringify(summary, null, 2));
|
|
736
|
-
|
|
768
|
+
log.info("Score summary written to results/latest/score-summary.json");
|
|
737
769
|
// Extract and persist grader judgments (Phase 3a: failure mode extraction)
|
|
738
770
|
const judgments = extractGraderJudgments(baselineResultsPath);
|
|
739
771
|
// In full mode, also extract judgments from agentic results
|
|
@@ -743,20 +775,60 @@ export function calculateAndWriteScores(options) {
|
|
|
743
775
|
}
|
|
744
776
|
if (judgments.length > 0) {
|
|
745
777
|
writeFileSync(join(outDir, "grader-judgments.json"), JSON.stringify(judgments, null, 2));
|
|
746
|
-
|
|
778
|
+
log.info(`Grader judgments written to results/latest/grader-judgments.json (${judgments.length} judgments)`);
|
|
747
779
|
}
|
|
748
|
-
|
|
780
|
+
// Compute test summary from the raw results file
|
|
781
|
+
const testSummary = computeTestSummary(baselineResultsPath);
|
|
782
|
+
return { belowCritical: summary.belowCritical, testSummary };
|
|
783
|
+
}
|
|
784
|
+
/**
|
|
785
|
+
* Compute a TestSummary from a raw Promptfoo results file.
|
|
786
|
+
* Counts total, passed, failed, and errored tests with error details.
|
|
787
|
+
*/
|
|
788
|
+
function computeTestSummary(resultsPath) {
|
|
789
|
+
const file = JSON.parse(readFileSync(resultsPath, "utf-8"));
|
|
790
|
+
const wrapper = file.results ?? file;
|
|
791
|
+
const rawResults = wrapper.results;
|
|
792
|
+
let passed = 0;
|
|
793
|
+
let failed = 0;
|
|
794
|
+
let errored = 0;
|
|
795
|
+
const errors = [];
|
|
796
|
+
for (const r of rawResults) {
|
|
797
|
+
if (r.gradingResult === null || r.gradingResult === undefined) {
|
|
798
|
+
errored++;
|
|
799
|
+
errors.push({
|
|
800
|
+
model: r.provider?.label ?? r.provider?.id ?? "unknown",
|
|
801
|
+
task: r.testCase?.description ?? "unknown",
|
|
802
|
+
error: r.error
|
|
803
|
+
? r.error.slice(0, 200)
|
|
804
|
+
: "unknown error (null gradingResult)",
|
|
805
|
+
});
|
|
806
|
+
}
|
|
807
|
+
else if (r.gradingResult.pass) {
|
|
808
|
+
passed++;
|
|
809
|
+
}
|
|
810
|
+
else {
|
|
811
|
+
failed++;
|
|
812
|
+
}
|
|
813
|
+
}
|
|
814
|
+
return {
|
|
815
|
+
total: rawResults.length,
|
|
816
|
+
passed,
|
|
817
|
+
failed,
|
|
818
|
+
errored,
|
|
819
|
+
...(errors.length > 0 ? { errors } : {}),
|
|
820
|
+
};
|
|
749
821
|
}
|
|
750
|
-
function printPerModelReport(perModel) {
|
|
751
|
-
|
|
752
|
-
|
|
753
|
-
|
|
754
|
-
|
|
822
|
+
function printPerModelReport(perModel, log) {
|
|
823
|
+
log.info("-".repeat(80));
|
|
824
|
+
log.info("PER-MODEL BREAKDOWN");
|
|
825
|
+
log.info("-".repeat(80));
|
|
826
|
+
log.info("");
|
|
755
827
|
// Model summary table
|
|
756
828
|
const h = "| Model | Avg Score | Avg Lift | Tests | Cost |";
|
|
757
829
|
const sep = "|--------------------------------|-----------|----------|-------|----------|";
|
|
758
|
-
|
|
759
|
-
|
|
830
|
+
log.info(h);
|
|
831
|
+
log.info(sep);
|
|
760
832
|
const sorted = [...perModel].sort((a, b) => b.overall.avgScore - a.overall.avgScore);
|
|
761
833
|
for (const entry of sorted) {
|
|
762
834
|
const displayName = entry.label || entry.modelId;
|
|
@@ -766,63 +838,64 @@ function printPerModelReport(perModel) {
|
|
|
766
838
|
const liftStr = entry.overall.avgDocLift >= 0
|
|
767
839
|
? `+${entry.overall.avgDocLift.toFixed(1)}`
|
|
768
840
|
: entry.overall.avgDocLift.toFixed(1);
|
|
769
|
-
|
|
841
|
+
log.info(`| ${displayName.padEnd(30)} | ` +
|
|
770
842
|
`${entry.overall.avgScore.toFixed(1).padStart(9)} | ` +
|
|
771
843
|
`${liftStr.padStart(8)} | ` +
|
|
772
844
|
`${entry.overall.testCount.toString().padStart(5)} | ` +
|
|
773
845
|
`${costStr.padStart(8)} |`);
|
|
774
846
|
}
|
|
775
|
-
|
|
847
|
+
log.info("");
|
|
776
848
|
// Per-model × per-area breakdown
|
|
777
849
|
for (const entry of sorted) {
|
|
778
850
|
const displayName = entry.label || entry.modelId;
|
|
779
|
-
|
|
851
|
+
log.info(` ${displayName} (${entry.modelId}):`);
|
|
780
852
|
const areaH = " | Feature Area | Task | Code | Docs | Total | Lift |";
|
|
781
853
|
const areaSep = " |---------------------|------|------|------|-------|------|";
|
|
782
|
-
|
|
783
|
-
|
|
854
|
+
log.info(areaH);
|
|
855
|
+
log.info(areaSep);
|
|
784
856
|
for (const s of entry.scores) {
|
|
785
857
|
const lift = s.docLift >= 0 ? `+${s.docLift}` : `${s.docLift}`;
|
|
786
|
-
|
|
858
|
+
log.info(` | ${s.feature.padEnd(19)} | ` +
|
|
787
859
|
`${s.taskCompletion.toString().padStart(4)} | ` +
|
|
788
860
|
`${s.codeCorrectness.toString().padStart(4)} | ` +
|
|
789
861
|
`${s.docCoverage.toString().padStart(4)} | ` +
|
|
790
862
|
`${s.totalScore.toString().padStart(5)} | ` +
|
|
791
863
|
`${lift.padStart(4)} |`);
|
|
792
864
|
}
|
|
793
|
-
|
|
865
|
+
log.info("");
|
|
794
866
|
}
|
|
795
867
|
// Cost-per-quality-point
|
|
796
868
|
const modelsWithCost = sorted.filter((e) => e.overall.cost && e.overall.cost > 0);
|
|
797
869
|
if (modelsWithCost.length > 0) {
|
|
798
|
-
|
|
870
|
+
log.info(" Cost per quality point:");
|
|
799
871
|
for (const entry of modelsWithCost) {
|
|
800
872
|
const displayName = entry.label;
|
|
801
873
|
const costPerPoint = entry.overall.avgScore > 0
|
|
802
874
|
? (entry.overall.cost ?? 0) / entry.overall.avgScore
|
|
803
875
|
: 0;
|
|
804
|
-
|
|
876
|
+
log.info(` ${displayName}: $${costPerPoint.toFixed(6)}/point (score: ${entry.overall.avgScore.toFixed(1)}, cost: $${(entry.overall.cost ?? 0).toFixed(4)})`);
|
|
805
877
|
}
|
|
806
|
-
|
|
878
|
+
log.info("");
|
|
807
879
|
}
|
|
808
880
|
}
|
|
809
881
|
// ---------------------------------------------------------------------------
|
|
810
882
|
// Main
|
|
811
883
|
// ---------------------------------------------------------------------------
|
|
812
|
-
function printReport(scores, urlRefs, source, agentBehavior, graderCost, perModel, sourceIsolation, sourceVerification, evaluationMode) {
|
|
813
|
-
|
|
814
|
-
|
|
815
|
-
|
|
816
|
-
|
|
884
|
+
function printReport(scores, urlRefs, source, agentBehavior, graderCost, perModel, sourceIsolation, sourceVerification, evaluationMode, log) {
|
|
885
|
+
const _log = log ?? new ConsoleLogger();
|
|
886
|
+
_log.info("\n" + "=".repeat(80));
|
|
887
|
+
_log.info(" SANITY AI LITERACY SCORE REPORT");
|
|
888
|
+
_log.info("=".repeat(80));
|
|
889
|
+
_log.info("");
|
|
817
890
|
// Table header
|
|
818
891
|
const h = "| Feature Area | Task | Code | Docs | Total | w/o Docs | Doc Lift |";
|
|
819
892
|
const sep = "|---------------------|------|------|------|-------|----------|----------|";
|
|
820
|
-
|
|
821
|
-
|
|
893
|
+
_log.info(h);
|
|
894
|
+
_log.info(sep);
|
|
822
895
|
for (const s of scores) {
|
|
823
896
|
const status = s.totalScore < CRITICAL_THRESHOLD ? "!!" : "ok";
|
|
824
897
|
const lift = s.docLift > 0 ? `+${s.docLift}` : `${s.docLift}`;
|
|
825
|
-
|
|
898
|
+
_log.info(`| ${status} ${s.feature.padEnd(17)} | ` +
|
|
826
899
|
`${s.taskCompletion.toString().padStart(4)} | ` +
|
|
827
900
|
`${s.codeCorrectness.toString().padStart(4)} | ` +
|
|
828
901
|
`${s.docCoverage.toString().padStart(4)} | ` +
|
|
@@ -830,7 +903,7 @@ function printReport(scores, urlRefs, source, agentBehavior, graderCost, perMode
|
|
|
830
903
|
`${s.floorScore.toString().padStart(8)} | ` +
|
|
831
904
|
`${lift.padStart(8)} |`);
|
|
832
905
|
}
|
|
833
|
-
|
|
906
|
+
_log.info("");
|
|
834
907
|
// OKR status
|
|
835
908
|
const belowCritical = scores.filter((s) => s.totalScore < CRITICAL_THRESHOLD);
|
|
836
909
|
const lowestScore = scores.reduce((min, s) => s.totalScore < min.totalScore ? s : min);
|
|
@@ -844,69 +917,69 @@ function printReport(scores, urlRefs, source, agentBehavior, graderCost, perMode
|
|
|
844
917
|
area: s.feature,
|
|
845
918
|
docLift: s.docLift,
|
|
846
919
|
}));
|
|
847
|
-
|
|
848
|
-
|
|
849
|
-
|
|
850
|
-
|
|
920
|
+
_log.info("-".repeat(80));
|
|
921
|
+
_log.info("OKR STATUS");
|
|
922
|
+
_log.info("-".repeat(80));
|
|
923
|
+
_log.info("");
|
|
851
924
|
if (belowCritical.length === 0) {
|
|
852
|
-
|
|
925
|
+
_log.info(" KR1: PASS -- All areas above critical threshold (>=40)");
|
|
853
926
|
}
|
|
854
927
|
else {
|
|
855
|
-
|
|
856
|
-
belowCritical.forEach((s) =>
|
|
928
|
+
_log.info(" KR1: FAIL -- Areas below critical threshold:");
|
|
929
|
+
belowCritical.forEach((s) => _log.info(` - ${s.feature}: ${s.totalScore}`));
|
|
857
930
|
}
|
|
858
|
-
|
|
859
|
-
|
|
860
|
-
|
|
861
|
-
|
|
862
|
-
|
|
863
|
-
|
|
864
|
-
|
|
865
|
-
|
|
931
|
+
_log.info("");
|
|
932
|
+
_log.info(` Lowest area: ${lowestScore.feature} (${lowestScore.totalScore})`);
|
|
933
|
+
_log.info(` Target: +15 points improvement`);
|
|
934
|
+
_log.info("");
|
|
935
|
+
_log.info(` Avg score: ${avgScore.toFixed(1)}`);
|
|
936
|
+
_log.info(` Avg doc lift: +${avgLift.toFixed(1)} points`);
|
|
937
|
+
_log.info(` (Doc lift = how much docs help vs parametric knowledge alone)`);
|
|
938
|
+
_log.info("");
|
|
866
939
|
// Ceiling decomposition
|
|
867
|
-
|
|
868
|
-
|
|
869
|
-
|
|
870
|
-
|
|
940
|
+
_log.info("-".repeat(80));
|
|
941
|
+
_log.info("CEILING DECOMPOSITION");
|
|
942
|
+
_log.info("-".repeat(80));
|
|
943
|
+
_log.info("");
|
|
871
944
|
const ceilH = "| Feature Area | Floor | Ceiling | Doc Lift | Quality Gap |";
|
|
872
945
|
const ceilSep = "|---------------------|-------|---------|----------|-------------|";
|
|
873
|
-
|
|
874
|
-
|
|
946
|
+
_log.info(ceilH);
|
|
947
|
+
_log.info(ceilSep);
|
|
875
948
|
for (const s of scores) {
|
|
876
949
|
const liftStr = s.docLift >= 0 ? `+${s.docLift}` : `${s.docLift}`;
|
|
877
950
|
const liftFlag = s.negativeDocLift ? " 🚨" : "";
|
|
878
|
-
|
|
951
|
+
_log.info(`| ${s.feature.padEnd(19)} | ` +
|
|
879
952
|
`${s.floorScore.toString().padStart(5)} | ` +
|
|
880
953
|
`${s.ceilingScore.toString().padStart(7)} | ` +
|
|
881
954
|
`${liftStr.padStart(8)}${liftFlag} | ` +
|
|
882
955
|
`${s.docQualityGap.toString().padStart(11)} |`);
|
|
883
956
|
}
|
|
884
|
-
|
|
957
|
+
_log.info("");
|
|
885
958
|
if (negativeDocLiftAreas.length > 0) {
|
|
886
|
-
|
|
959
|
+
_log.info(" 🚨 NEGATIVE DOC LIFT DETECTED:");
|
|
887
960
|
for (const { area, docLift } of negativeDocLiftAreas) {
|
|
888
961
|
const s = scores.find((sc) => sc.feature === area);
|
|
889
|
-
|
|
962
|
+
_log.info(` ${area}: Doc Lift = ${docLift} (floor: ${s.floorScore}, ceiling: ${s.ceilingScore})`);
|
|
890
963
|
}
|
|
891
|
-
|
|
892
|
-
|
|
893
|
-
|
|
964
|
+
_log.info(" Documentation is HURTING model performance for these areas.");
|
|
965
|
+
_log.info(" See docs/design-docs/scenario-matrix/evaluation-ceiling.md");
|
|
966
|
+
_log.info("");
|
|
894
967
|
}
|
|
895
968
|
else {
|
|
896
|
-
|
|
897
|
-
|
|
969
|
+
_log.info(" ✅ No areas with negative Doc Lift detected.");
|
|
970
|
+
_log.info("");
|
|
898
971
|
}
|
|
899
972
|
// Three-layer decomposition (only when actual scores are present)
|
|
900
973
|
const hasActualScores = scores.some((s) => s.actualScore !== undefined);
|
|
901
974
|
if (hasActualScores) {
|
|
902
|
-
|
|
903
|
-
|
|
904
|
-
|
|
905
|
-
|
|
975
|
+
_log.info("-".repeat(80));
|
|
976
|
+
_log.info("THREE-LAYER DECOMPOSITION (floor → ceiling → actual)");
|
|
977
|
+
_log.info("-".repeat(80));
|
|
978
|
+
_log.info("");
|
|
906
979
|
const decompH = "| Feature Area | Floor | Ceiling | Actual | Doc Lift | Ret. Gap | Infra % |";
|
|
907
980
|
const decompSep = "|---------------------|-------|---------|--------|----------|----------|---------|";
|
|
908
|
-
|
|
909
|
-
|
|
981
|
+
_log.info(decompH);
|
|
982
|
+
_log.info(decompSep);
|
|
910
983
|
for (const s of scores) {
|
|
911
984
|
const liftStr = s.docLift >= 0 ? `+${s.docLift}` : `${s.docLift}`;
|
|
912
985
|
const actualStr = s.actualScore !== undefined ? s.actualScore.toString() : "—";
|
|
@@ -919,7 +992,7 @@ function printReport(scores, urlRefs, source, agentBehavior, graderCost, perMode
|
|
|
919
992
|
? `${Math.round(s.infrastructureEfficiency * 100)}%`
|
|
920
993
|
: "—";
|
|
921
994
|
const flag = s.invertedRetrievalGap ? " 🔄" : "";
|
|
922
|
-
|
|
995
|
+
_log.info(`| ${s.feature.padEnd(19)} | ` +
|
|
923
996
|
`${s.floorScore.toString().padStart(5)} | ` +
|
|
924
997
|
`${s.ceilingScore.toString().padStart(7)} | ` +
|
|
925
998
|
`${actualStr.padStart(6)} | ` +
|
|
@@ -927,10 +1000,10 @@ function printReport(scores, urlRefs, source, agentBehavior, graderCost, perMode
|
|
|
927
1000
|
`${(gapStr + flag).padStart(8)} | ` +
|
|
928
1001
|
`${infraStr.padStart(7)} |`);
|
|
929
1002
|
}
|
|
930
|
-
|
|
931
|
-
|
|
932
|
-
|
|
933
|
-
|
|
1003
|
+
_log.info("");
|
|
1004
|
+
_log.info(" Doc Lift = ceiling − floor | Ret. Gap = ceiling − actual | Infra = actual / ceiling");
|
|
1005
|
+
_log.info(" 🔄 = inverted retrieval gap (agents avoid bad docs → higher actual than ceiling)");
|
|
1006
|
+
_log.info("");
|
|
934
1007
|
}
|
|
935
1008
|
// Cost summary
|
|
936
1009
|
const totalCost = scores.reduce((sum, s) => sum + s.totalCost, 0);
|
|
@@ -938,66 +1011,66 @@ function printReport(scores, urlRefs, source, agentBehavior, graderCost, perMode
|
|
|
938
1011
|
const graderCostTotal = graderCost?.cost ?? 0;
|
|
939
1012
|
const combinedCost = totalCost + graderCostTotal;
|
|
940
1013
|
if (totalCost > 0 || graderCostTotal > 0) {
|
|
941
|
-
|
|
942
|
-
|
|
943
|
-
|
|
944
|
-
|
|
945
|
-
|
|
1014
|
+
_log.info("-".repeat(80));
|
|
1015
|
+
_log.info("COST SUMMARY");
|
|
1016
|
+
_log.info("-".repeat(80));
|
|
1017
|
+
_log.info("");
|
|
1018
|
+
_log.info(` Provider cost: $${totalCost.toFixed(4)}`);
|
|
946
1019
|
if (graderCostTotal > 0) {
|
|
947
1020
|
const graderLabel = graderCost?.model ?? "unknown";
|
|
948
|
-
|
|
1021
|
+
_log.info(` Grader cost: $${graderCostTotal.toFixed(4)} (${graderLabel}, ${(graderCost?.totalTokens ?? 0).toLocaleString()} tokens)`);
|
|
949
1022
|
}
|
|
950
|
-
|
|
951
|
-
|
|
952
|
-
|
|
1023
|
+
_log.info(` Total cost: $${combinedCost.toFixed(4)}`);
|
|
1024
|
+
_log.info(` Avg cost per test: $${(combinedCost / (totalTests || 1)).toFixed(4)}`);
|
|
1025
|
+
_log.info("");
|
|
953
1026
|
const costHeader = "| Feature Area | Tests | Cost | Avg/Test |";
|
|
954
1027
|
const costSep = "|---------------------|-------|----------|----------|";
|
|
955
|
-
|
|
956
|
-
|
|
1028
|
+
_log.info(costHeader);
|
|
1029
|
+
_log.info(costSep);
|
|
957
1030
|
for (const s of scores) {
|
|
958
1031
|
const avgCost = s.testCount > 0 ? s.totalCost / s.testCount : 0;
|
|
959
|
-
|
|
1032
|
+
_log.info(`| ${s.feature.padEnd(19)} | ` +
|
|
960
1033
|
`${s.testCount.toString().padStart(5)} | ` +
|
|
961
1034
|
`$${s.totalCost.toFixed(4).padStart(7)} | ` +
|
|
962
1035
|
`$${avgCost.toFixed(4).padStart(7)} |`);
|
|
963
1036
|
}
|
|
964
|
-
|
|
1037
|
+
_log.info("");
|
|
965
1038
|
}
|
|
966
1039
|
// Per-model breakdown
|
|
967
1040
|
if (perModel) {
|
|
968
|
-
printPerModelReport(perModel);
|
|
1041
|
+
printPerModelReport(perModel, _log);
|
|
969
1042
|
}
|
|
970
1043
|
// URL References
|
|
971
|
-
printUrlReport(urlRefs);
|
|
1044
|
+
printUrlReport(urlRefs, _log);
|
|
972
1045
|
// Agent Behavior (only present when run with instrumented provider)
|
|
973
1046
|
if (agentBehavior && agentBehavior.length > 0) {
|
|
974
|
-
printAgentBehaviorReport(agentBehavior);
|
|
1047
|
+
printAgentBehaviorReport(agentBehavior, _log);
|
|
975
1048
|
}
|
|
976
1049
|
// Source verification (unified report for all modes)
|
|
977
1050
|
if (sourceVerification || sourceIsolation) {
|
|
978
|
-
|
|
979
|
-
|
|
980
|
-
|
|
1051
|
+
_log.info("-".repeat(80));
|
|
1052
|
+
_log.info("📋 SOURCE VERIFICATION");
|
|
1053
|
+
_log.info("-".repeat(80));
|
|
981
1054
|
if (sourceVerification) {
|
|
982
|
-
|
|
983
|
-
|
|
1055
|
+
_log.info(` Source: ${sourceVerification.source}`);
|
|
1056
|
+
_log.info(` Mode: ${sourceVerification.mode}`);
|
|
984
1057
|
if (sourceVerification.allowedOrigins) {
|
|
985
|
-
|
|
1058
|
+
_log.info(` Sandbox: ${sourceVerification.allowedOrigins.join(", ")}`);
|
|
986
1059
|
}
|
|
987
1060
|
if (sourceVerification.searchMode) {
|
|
988
|
-
|
|
1061
|
+
_log.info(` Search: ${sourceVerification.searchMode}`);
|
|
989
1062
|
}
|
|
990
1063
|
// URL fetch results (baseline mode with direct URLs)
|
|
991
1064
|
if (sourceVerification.urlFetch) {
|
|
992
1065
|
const uf = sourceVerification.urlFetch;
|
|
993
|
-
|
|
994
|
-
|
|
1066
|
+
_log.info("");
|
|
1067
|
+
_log.info(` URL fetch: ${uf.totalFetched} fetched, ${uf.totalFailed} failed`);
|
|
995
1068
|
for (const f of uf.fetchedUrls) {
|
|
996
|
-
|
|
1069
|
+
_log.info(` ✅ ${f.url} (via ${f.method})`);
|
|
997
1070
|
}
|
|
998
1071
|
for (const f of uf.failures) {
|
|
999
1072
|
// oxlint-disable-next-line @typescript-eslint/prefer-nullish-coalescing -- empty string means no error info
|
|
1000
|
-
|
|
1073
|
+
_log.info(` ⚠️ ${f.url}: ${f.error || "unknown error"}`);
|
|
1001
1074
|
}
|
|
1002
1075
|
}
|
|
1003
1076
|
}
|
|
@@ -1005,22 +1078,22 @@ function printReport(scores, urlRefs, source, agentBehavior, graderCost, perMode
|
|
|
1005
1078
|
if (sourceIsolation) {
|
|
1006
1079
|
const pct = Math.round(sourceIsolation.isolationScore * 100);
|
|
1007
1080
|
const icon = sourceIsolation.offOrigin === 0 ? "✅" : "⚠️";
|
|
1008
|
-
|
|
1009
|
-
|
|
1081
|
+
_log.info("");
|
|
1082
|
+
_log.info(` Agent isolation: ${icon} ${pct}% (${sourceIsolation.onOrigin}/${sourceIsolation.total} on-origin)`);
|
|
1010
1083
|
if (sourceIsolation.offOrigin > 0) {
|
|
1011
|
-
|
|
1084
|
+
_log.info(` Off-origin fetches: ${sourceIsolation.offOrigin}`);
|
|
1012
1085
|
for (const url of sourceIsolation.offOriginUrls.slice(0, 10)) {
|
|
1013
|
-
|
|
1086
|
+
_log.info(` • ${url}`);
|
|
1014
1087
|
}
|
|
1015
1088
|
}
|
|
1016
1089
|
if (Object.keys(sourceIsolation.originBreakdown).length > 0) {
|
|
1017
|
-
|
|
1090
|
+
_log.info(" Origin breakdown:");
|
|
1018
1091
|
for (const [origin, count] of Object.entries(sourceIsolation.originBreakdown).sort((a, b) => b[1] - a[1])) {
|
|
1019
|
-
|
|
1092
|
+
_log.info(` ${origin}: ${count}`);
|
|
1020
1093
|
}
|
|
1021
1094
|
}
|
|
1022
1095
|
}
|
|
1023
|
-
|
|
1096
|
+
_log.info("");
|
|
1024
1097
|
}
|
|
1025
1098
|
// Build overall agent behavior stats for summary
|
|
1026
1099
|
const overallAgentBehavior = agentBehavior && agentBehavior.length > 0
|
|
@@ -1102,31 +1175,31 @@ function printReport(scores, urlRefs, source, agentBehavior, graderCost, perMode
|
|
|
1102
1175
|
urlReferences: urlRefs,
|
|
1103
1176
|
};
|
|
1104
1177
|
}
|
|
1105
|
-
function printUrlReport(urlRefs) {
|
|
1106
|
-
|
|
1107
|
-
|
|
1108
|
-
|
|
1109
|
-
|
|
1178
|
+
function printUrlReport(urlRefs, log) {
|
|
1179
|
+
log.info("-".repeat(80));
|
|
1180
|
+
log.info("URL REFERENCES");
|
|
1181
|
+
log.info("-".repeat(80));
|
|
1182
|
+
log.info("");
|
|
1110
1183
|
for (const ref of urlRefs) {
|
|
1111
1184
|
const goldUrls = Object.entries(ref.gold.urls).sort((a, b) => b[1] - a[1]);
|
|
1112
1185
|
const baselineUrls = Object.entries(ref.baseline.urls).sort((a, b) => b[1] - a[1]);
|
|
1113
1186
|
if (goldUrls.length > 0) {
|
|
1114
|
-
|
|
1187
|
+
log.info(` ${ref.feature} (gold):`);
|
|
1115
1188
|
for (const [url, count] of goldUrls) {
|
|
1116
1189
|
const suffix = count > 1 ? ` (${count} tests)` : "";
|
|
1117
|
-
|
|
1190
|
+
log.info(` ${url}${suffix}`);
|
|
1118
1191
|
}
|
|
1119
1192
|
}
|
|
1120
1193
|
if (baselineUrls.length > 0) {
|
|
1121
|
-
|
|
1194
|
+
log.info(` ${ref.feature} (baseline):`);
|
|
1122
1195
|
for (const [url, count] of baselineUrls) {
|
|
1123
1196
|
const suffix = count > 1 ? ` (${count} tests)` : "";
|
|
1124
|
-
|
|
1197
|
+
log.info(` ${url}${suffix} [parametric]`);
|
|
1125
1198
|
}
|
|
1126
1199
|
}
|
|
1127
1200
|
if (goldUrls.length === 0 && baselineUrls.length === 0) {
|
|
1128
|
-
|
|
1201
|
+
log.info(` ${ref.feature}: no URLs referenced`);
|
|
1129
1202
|
}
|
|
1130
|
-
|
|
1203
|
+
log.info("");
|
|
1131
1204
|
}
|
|
1132
1205
|
}
|