@sanity/ailf 0.1.34 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/config/airbyte/ai_literacy_framework.connector.yaml +6 -0
- package/config/bigquery/views/reports.sql +1 -0
- package/dist/_vendor/ailf-core/examples/index.d.ts +10 -20
- package/dist/_vendor/ailf-core/examples/index.js +10 -20
- package/dist/_vendor/ailf-core/ports/task-source.d.ts +2 -0
- package/dist/_vendor/ailf-core/types/index.d.ts +65 -0
- package/dist/_vendor/ailf-tasks/schemas.d.ts +12 -0
- package/dist/_vendor/ailf-tasks/schemas.js +4 -0
- package/dist/adapters/task-sources/content-lake-task-source.js +9 -1
- package/dist/adapters/task-sources/repo-task-source.js +19 -4
- package/dist/commands/calculate-scores.js +5 -1
- package/dist/commands/publish.js +3 -0
- package/dist/composition-root.js +7 -2
- package/dist/orchestration/pipeline-orchestrator.js +27 -2
- package/dist/orchestration/step-runner.js +8 -0
- package/dist/orchestration/steps/calculate-scores-step.js +22 -19
- package/dist/orchestration/steps/generate-configs-step.js +1 -0
- package/dist/orchestration/steps/grader-consistency-step.js +1 -0
- package/dist/orchestration/steps/mirror-repo-tasks-step.js +2 -1
- package/dist/orchestration/steps/publish-report-step.js +3 -0
- package/dist/pipeline/calculate-scores.d.ts +11 -1
- package/dist/pipeline/calculate-scores.js +222 -157
- package/dist/pipeline/coverage-audit.d.ts +2 -1
- package/dist/pipeline/coverage-audit.js +5 -3
- package/dist/pipeline/expand-tasks.d.ts +2 -1
- package/dist/pipeline/expand-tasks.js +33 -2
- package/dist/pipeline/generate-configs.d.ts +3 -1
- package/dist/pipeline/generate-configs.js +51 -37
- package/dist/pipeline/grader-api.d.ts +2 -1
- package/dist/pipeline/grader-api.js +11 -9
- package/dist/pipeline/grader-compare-runner.d.ts +3 -0
- package/dist/pipeline/grader-compare-runner.js +21 -19
- package/dist/pipeline/grader-consistency-runner.d.ts +3 -0
- package/dist/pipeline/grader-consistency-runner.js +16 -14
- package/dist/pipeline/grader-sensitivity-runner.d.ts +3 -0
- package/dist/pipeline/grader-sensitivity-runner.js +18 -16
- package/dist/pipeline/grader-validate-runner.d.ts +3 -0
- package/dist/pipeline/grader-validate-runner.js +16 -14
- package/dist/pipeline/mirror-repo-tasks.d.ts +80 -1
- package/dist/pipeline/mirror-repo-tasks.js +148 -32
- package/dist/pipeline/provenance.d.ts +3 -0
- package/dist/pipeline/provenance.js +25 -3
- package/dist/pipeline/report-title.d.ts +66 -0
- package/dist/pipeline/report-title.js +118 -0
- package/dist/report-store.js +2 -0
- package/dist/sinks/bigquery/index.d.ts +1 -0
- package/dist/sinks/bigquery/index.js +1 -0
- package/dist/sources.d.ts +2 -1
- package/dist/sources.js +28 -1
- package/package.json +23 -23
|
@@ -27,6 +27,7 @@
|
|
|
27
27
|
import { existsSync, mkdirSync, readFileSync, writeFileSync } from "fs";
|
|
28
28
|
import { join } from "path";
|
|
29
29
|
import { calculateCost } from "../agent-observer/pricing.js";
|
|
30
|
+
import { ConsoleLogger } from "../adapters/loggers/index.js";
|
|
30
31
|
import { checkResultsExist } from "./checks.js";
|
|
31
32
|
import { loadRubricTemplates } from "./expand-tasks.js";
|
|
32
33
|
import { loadSource } from "../sources.js";
|
|
@@ -375,63 +376,63 @@ function extractGraderCost(resultsPath) {
|
|
|
375
376
|
/**
|
|
376
377
|
* Prints a formatted report of agent behavior observations.
|
|
377
378
|
*/
|
|
378
|
-
function printAgentBehaviorReport(agentBehavior) {
|
|
379
|
-
|
|
380
|
-
|
|
381
|
-
|
|
382
|
-
|
|
379
|
+
function printAgentBehaviorReport(agentBehavior, log) {
|
|
380
|
+
log.info("-".repeat(80));
|
|
381
|
+
log.info("AGENT BEHAVIOR OBSERVATION");
|
|
382
|
+
log.info("-".repeat(80));
|
|
383
|
+
log.info("");
|
|
383
384
|
// Summary table
|
|
384
385
|
const h = "| Feature Area | Tests | Doc Pages | Searches | Net (ms) |";
|
|
385
386
|
const sep = "|---------------------|-------|-----------|----------|----------|";
|
|
386
|
-
|
|
387
|
-
|
|
387
|
+
log.info(h);
|
|
388
|
+
log.info(sep);
|
|
388
389
|
for (const ab of agentBehavior) {
|
|
389
|
-
|
|
390
|
+
log.info(`| ${ab.feature.padEnd(19)} | ` +
|
|
390
391
|
`${ab.tasksWithBehaviorData.toString().padStart(5)} | ` +
|
|
391
392
|
`${ab.avgDocPagesVisited.toFixed(1).padStart(9)} | ` +
|
|
392
393
|
`${ab.avgSearchesPerformed.toFixed(1).padStart(8)} | ` +
|
|
393
394
|
`${Math.round(ab.avgNetworkTimeMs).toString().padStart(8)} |`);
|
|
394
395
|
}
|
|
395
|
-
|
|
396
|
+
log.info("");
|
|
396
397
|
// Doc pages visited
|
|
397
|
-
|
|
398
|
+
log.info(" Doc pages visited:");
|
|
398
399
|
for (const ab of agentBehavior) {
|
|
399
400
|
if (ab.docSlugsVisited.length === 0) {
|
|
400
|
-
|
|
401
|
+
log.info(` ${ab.feature}: (none)`);
|
|
401
402
|
}
|
|
402
403
|
else {
|
|
403
|
-
|
|
404
|
+
log.info(` ${ab.feature}:`);
|
|
404
405
|
for (const slug of ab.docSlugsVisited) {
|
|
405
|
-
|
|
406
|
+
log.info(` - /docs/${slug}`);
|
|
406
407
|
}
|
|
407
408
|
}
|
|
408
409
|
}
|
|
409
|
-
|
|
410
|
+
log.info("");
|
|
410
411
|
// Search queries
|
|
411
412
|
const hasSearches = agentBehavior.some((ab) => ab.searchQueries.length > 0);
|
|
412
413
|
if (hasSearches) {
|
|
413
|
-
|
|
414
|
+
log.info(" Search queries:");
|
|
414
415
|
for (const ab of agentBehavior) {
|
|
415
416
|
if (ab.searchQueries.length === 0) {
|
|
416
417
|
continue;
|
|
417
418
|
}
|
|
418
|
-
|
|
419
|
+
log.info(` ${ab.feature}:`);
|
|
419
420
|
for (const q of ab.searchQueries) {
|
|
420
|
-
|
|
421
|
+
log.info(` - "${q}"`);
|
|
421
422
|
}
|
|
422
423
|
}
|
|
423
|
-
|
|
424
|
+
log.info("");
|
|
424
425
|
}
|
|
425
426
|
// External domains
|
|
426
427
|
const allExternalDomains = [
|
|
427
428
|
...new Set(agentBehavior.flatMap((ab) => ab.externalDomains)),
|
|
428
429
|
];
|
|
429
430
|
if (allExternalDomains.length > 0) {
|
|
430
|
-
|
|
431
|
+
log.info(" External domains contacted:");
|
|
431
432
|
for (const d of allExternalDomains) {
|
|
432
|
-
|
|
433
|
+
log.info(` - ${d}`);
|
|
433
434
|
}
|
|
434
|
-
|
|
435
|
+
log.info("");
|
|
435
436
|
}
|
|
436
437
|
}
|
|
437
438
|
// ---------------------------------------------------------------------------
|
|
@@ -444,9 +445,15 @@ function printAgentBehaviorReport(agentBehavior) {
|
|
|
444
445
|
* Reads the raw Promptfoo output file and normalizes each result so that
|
|
445
446
|
* `description` is always a top-level field (pulled from `testCase` if needed).
|
|
446
447
|
*/
|
|
447
|
-
function readAndNormalizeResults(resultsPath) {
|
|
448
|
+
function readAndNormalizeResults(resultsPath, log) {
|
|
449
|
+
const _log = log ?? new ConsoleLogger();
|
|
448
450
|
const file = JSON.parse(readFileSync(resultsPath, "utf-8"));
|
|
449
451
|
const wrapper = file.results ?? file;
|
|
452
|
+
_log.debug("Reading results file", {
|
|
453
|
+
path: resultsPath,
|
|
454
|
+
resultCount: wrapper.results.length,
|
|
455
|
+
stats: wrapper.stats,
|
|
456
|
+
});
|
|
450
457
|
const all = wrapper.results.map((r) => ({
|
|
451
458
|
cost: r.cost ?? 0,
|
|
452
459
|
description: r.testCase?.description ?? "unknown",
|
|
@@ -463,15 +470,20 @@ function readAndNormalizeResults(resultsPath) {
|
|
|
463
470
|
// Promptfoo sets gradingResult to null when a test errors before grading.
|
|
464
471
|
const valid = all.filter((r) => r.gradingResult !== null);
|
|
465
472
|
const skipped = all.length - valid.length;
|
|
473
|
+
_log.debug("Filtered null gradingResults", {
|
|
474
|
+
totalResults: all.length,
|
|
475
|
+
validResults: valid.length,
|
|
476
|
+
skippedCount: skipped,
|
|
477
|
+
});
|
|
466
478
|
if (skipped > 0) {
|
|
467
|
-
|
|
479
|
+
_log.warn(`⚠ Skipping ${skipped} of ${all.length} result(s) with null gradingResult (errored tests):`);
|
|
468
480
|
for (const r of all) {
|
|
469
481
|
if (r.gradingResult === null) {
|
|
470
482
|
const providerLabel = r.provider ? `[${r.provider}] ` : "";
|
|
471
483
|
const errorMsg = r.error
|
|
472
484
|
? r.error.slice(0, 150)
|
|
473
485
|
: "unknown error (no error field in result)";
|
|
474
|
-
|
|
486
|
+
_log.warn(`✗ ${providerLabel}"${r.description}" — ${errorMsg}`);
|
|
475
487
|
}
|
|
476
488
|
}
|
|
477
489
|
}
|
|
@@ -555,7 +567,7 @@ function scoreResults(results, weights, modelId) {
|
|
|
555
567
|
const ceilingScore = Math.round(withDocsTotal);
|
|
556
568
|
const floorScore = Math.round(withoutDocsScore);
|
|
557
569
|
const docLift = ceilingScore - floorScore;
|
|
558
|
-
|
|
570
|
+
const featureScore = {
|
|
559
571
|
ceilingScore,
|
|
560
572
|
codeCorrectness: Math.round(avgCode),
|
|
561
573
|
docCoverage: Math.round(avgDoc),
|
|
@@ -569,7 +581,8 @@ function scoreResults(results, weights, modelId) {
|
|
|
569
581
|
testCount: data.withDocs.length,
|
|
570
582
|
totalCost: featureCost,
|
|
571
583
|
totalScore: ceilingScore,
|
|
572
|
-
}
|
|
584
|
+
};
|
|
585
|
+
scores.push(featureScore);
|
|
573
586
|
}
|
|
574
587
|
return scores.sort((a, b) => a.feature.localeCompare(b.feature));
|
|
575
588
|
}
|
|
@@ -654,6 +667,7 @@ export function scoreAgenticResults(resultsPath, weights) {
|
|
|
654
667
|
const CRITICAL_THRESHOLD = 40;
|
|
655
668
|
export function calculateAndWriteScores(options) {
|
|
656
669
|
const ROOT = options.rootDir;
|
|
670
|
+
const log = options.logger ?? new ConsoleLogger();
|
|
657
671
|
const sourceName = options.source;
|
|
658
672
|
// Pre-resolved source wins over name-based lookup
|
|
659
673
|
let source = options.resolvedSource;
|
|
@@ -662,7 +676,7 @@ export function calculateAndWriteScores(options) {
|
|
|
662
676
|
source = loadSource(sourceName);
|
|
663
677
|
}
|
|
664
678
|
catch {
|
|
665
|
-
|
|
679
|
+
log.warn(`[warn] Could not load source "${sourceName}", proceeding without source metadata`);
|
|
666
680
|
}
|
|
667
681
|
}
|
|
668
682
|
// Determine mode — controls which result files are read
|
|
@@ -674,23 +688,28 @@ export function calculateAndWriteScores(options) {
|
|
|
674
688
|
const resultsIssues = checkResultsExist(ROOT, baselineResultsPath);
|
|
675
689
|
const resultsErrors = resultsIssues.filter((i) => i.severity === "error");
|
|
676
690
|
if (resultsErrors.length > 0) {
|
|
677
|
-
|
|
678
|
-
|
|
679
|
-
|
|
680
|
-
|
|
681
|
-
console.error(` at ${e.path}`);
|
|
682
|
-
}
|
|
683
|
-
}
|
|
684
|
-
console.error("\nRun 'pnpm eval' first to generate results, then 'pnpm calculate-scores'.");
|
|
685
|
-
process.exit(1);
|
|
691
|
+
const details = resultsErrors
|
|
692
|
+
.map((e) => (e.path ? `${e.message} (at ${e.path})` : e.message))
|
|
693
|
+
.join("; ");
|
|
694
|
+
throw new Error(`Results validation failed: ${details}. Run 'pnpm eval' first to generate results.`);
|
|
686
695
|
}
|
|
687
|
-
|
|
696
|
+
log.info(`Reading results from: ${baselineResultsPath}`);
|
|
688
697
|
if (source) {
|
|
689
|
-
|
|
698
|
+
log.info(`Source: ${sourceName} (${source.baseUrl})`);
|
|
690
699
|
}
|
|
691
700
|
// Load dimension weights from rubrics.yaml
|
|
692
701
|
const rubricConfig = loadRubricTemplates(ROOT);
|
|
702
|
+
log.debug("Loaded rubric weights", { weights: rubricConfig.weights });
|
|
693
703
|
const baselineScores = calculateScores(baselineResultsPath, rubricConfig.weights);
|
|
704
|
+
log.debug("Baseline scores calculated", {
|
|
705
|
+
featureCount: baselineScores.length,
|
|
706
|
+
features: baselineScores.map((s) => ({
|
|
707
|
+
feature: s.feature,
|
|
708
|
+
ceilingScore: s.ceilingScore,
|
|
709
|
+
floorScore: s.floorScore,
|
|
710
|
+
docLift: s.docLift,
|
|
711
|
+
})),
|
|
712
|
+
});
|
|
694
713
|
const perModel = calculateScoresPerModel(baselineResultsPath, rubricConfig.weights);
|
|
695
714
|
const urlRefs = aggregateUrlReferences(baselineResultsPath);
|
|
696
715
|
const sourceVerification = buildSourceVerification(ROOT, source, {
|
|
@@ -705,8 +724,16 @@ export function calculateAndWriteScores(options) {
|
|
|
705
724
|
let sourceIsolation = null;
|
|
706
725
|
let evaluationMode;
|
|
707
726
|
if (mode === "full" && existsSync(agenticResultsPath)) {
|
|
708
|
-
|
|
727
|
+
log.info(`\nReading agentic results from: ${agenticResultsPath}`);
|
|
709
728
|
const agenticScores = scoreAgenticResults(agenticResultsPath, rubricConfig.weights);
|
|
729
|
+
log.debug("Agentic scores calculated", {
|
|
730
|
+
featureCount: Object.keys(agenticScores).length,
|
|
731
|
+
features: Object.entries(agenticScores).map(([f, s]) => ({
|
|
732
|
+
feature: f,
|
|
733
|
+
actualScore: s.actualScore,
|
|
734
|
+
testCount: s.testCount,
|
|
735
|
+
})),
|
|
736
|
+
});
|
|
710
737
|
scores = mergeScores(baselineScores, agenticScores);
|
|
711
738
|
evaluationMode = "full";
|
|
712
739
|
// Aggregate agent behavior and source isolation from agentic results
|
|
@@ -733,12 +760,12 @@ export function calculateAndWriteScores(options) {
|
|
|
733
760
|
sourceIsolation = aggregateSourceIsolation(baselineResultsPath, options?.allowedOrigins);
|
|
734
761
|
evaluationMode = mode === "observed" ? "observed" : "baseline";
|
|
735
762
|
}
|
|
736
|
-
const summary = printReport(scores, urlRefs, source, agentBehavior, graderCost, perModel, sourceIsolation, sourceVerification, evaluationMode);
|
|
763
|
+
const summary = printReport(scores, urlRefs, source, agentBehavior, graderCost, perModel, sourceIsolation, sourceVerification, evaluationMode, log);
|
|
737
764
|
// Persist
|
|
738
765
|
const outDir = join(ROOT, "results", "latest");
|
|
739
766
|
mkdirSync(outDir, { recursive: true });
|
|
740
767
|
writeFileSync(join(outDir, "score-summary.json"), JSON.stringify(summary, null, 2));
|
|
741
|
-
|
|
768
|
+
log.info("Score summary written to results/latest/score-summary.json");
|
|
742
769
|
// Extract and persist grader judgments (Phase 3a: failure mode extraction)
|
|
743
770
|
const judgments = extractGraderJudgments(baselineResultsPath);
|
|
744
771
|
// In full mode, also extract judgments from agentic results
|
|
@@ -748,23 +775,60 @@ export function calculateAndWriteScores(options) {
|
|
|
748
775
|
}
|
|
749
776
|
if (judgments.length > 0) {
|
|
750
777
|
writeFileSync(join(outDir, "grader-judgments.json"), JSON.stringify(judgments, null, 2));
|
|
751
|
-
|
|
778
|
+
log.info(`Grader judgments written to results/latest/grader-judgments.json (${judgments.length} judgments)`);
|
|
752
779
|
}
|
|
753
|
-
//
|
|
754
|
-
|
|
755
|
-
|
|
780
|
+
// Compute test summary from the raw results file
|
|
781
|
+
const testSummary = computeTestSummary(baselineResultsPath);
|
|
782
|
+
return { belowCritical: summary.belowCritical, testSummary };
|
|
783
|
+
}
|
|
784
|
+
/**
|
|
785
|
+
* Compute a TestSummary from a raw Promptfoo results file.
|
|
786
|
+
* Counts total, passed, failed, and errored tests with error details.
|
|
787
|
+
*/
|
|
788
|
+
function computeTestSummary(resultsPath) {
|
|
789
|
+
const file = JSON.parse(readFileSync(resultsPath, "utf-8"));
|
|
790
|
+
const wrapper = file.results ?? file;
|
|
791
|
+
const rawResults = wrapper.results;
|
|
792
|
+
let passed = 0;
|
|
793
|
+
let failed = 0;
|
|
794
|
+
let errored = 0;
|
|
795
|
+
const errors = [];
|
|
796
|
+
for (const r of rawResults) {
|
|
797
|
+
if (r.gradingResult === null || r.gradingResult === undefined) {
|
|
798
|
+
errored++;
|
|
799
|
+
errors.push({
|
|
800
|
+
model: r.provider?.label ?? r.provider?.id ?? "unknown",
|
|
801
|
+
task: r.testCase?.description ?? "unknown",
|
|
802
|
+
error: r.error
|
|
803
|
+
? r.error.slice(0, 200)
|
|
804
|
+
: "unknown error (null gradingResult)",
|
|
805
|
+
});
|
|
806
|
+
}
|
|
807
|
+
else if (r.gradingResult.pass) {
|
|
808
|
+
passed++;
|
|
809
|
+
}
|
|
810
|
+
else {
|
|
811
|
+
failed++;
|
|
812
|
+
}
|
|
756
813
|
}
|
|
814
|
+
return {
|
|
815
|
+
total: rawResults.length,
|
|
816
|
+
passed,
|
|
817
|
+
failed,
|
|
818
|
+
errored,
|
|
819
|
+
...(errors.length > 0 ? { errors } : {}),
|
|
820
|
+
};
|
|
757
821
|
}
|
|
758
|
-
function printPerModelReport(perModel) {
|
|
759
|
-
|
|
760
|
-
|
|
761
|
-
|
|
762
|
-
|
|
822
|
+
function printPerModelReport(perModel, log) {
|
|
823
|
+
log.info("-".repeat(80));
|
|
824
|
+
log.info("PER-MODEL BREAKDOWN");
|
|
825
|
+
log.info("-".repeat(80));
|
|
826
|
+
log.info("");
|
|
763
827
|
// Model summary table
|
|
764
828
|
const h = "| Model | Avg Score | Avg Lift | Tests | Cost |";
|
|
765
829
|
const sep = "|--------------------------------|-----------|----------|-------|----------|";
|
|
766
|
-
|
|
767
|
-
|
|
830
|
+
log.info(h);
|
|
831
|
+
log.info(sep);
|
|
768
832
|
const sorted = [...perModel].sort((a, b) => b.overall.avgScore - a.overall.avgScore);
|
|
769
833
|
for (const entry of sorted) {
|
|
770
834
|
const displayName = entry.label || entry.modelId;
|
|
@@ -774,63 +838,64 @@ function printPerModelReport(perModel) {
|
|
|
774
838
|
const liftStr = entry.overall.avgDocLift >= 0
|
|
775
839
|
? `+${entry.overall.avgDocLift.toFixed(1)}`
|
|
776
840
|
: entry.overall.avgDocLift.toFixed(1);
|
|
777
|
-
|
|
841
|
+
log.info(`| ${displayName.padEnd(30)} | ` +
|
|
778
842
|
`${entry.overall.avgScore.toFixed(1).padStart(9)} | ` +
|
|
779
843
|
`${liftStr.padStart(8)} | ` +
|
|
780
844
|
`${entry.overall.testCount.toString().padStart(5)} | ` +
|
|
781
845
|
`${costStr.padStart(8)} |`);
|
|
782
846
|
}
|
|
783
|
-
|
|
847
|
+
log.info("");
|
|
784
848
|
// Per-model × per-area breakdown
|
|
785
849
|
for (const entry of sorted) {
|
|
786
850
|
const displayName = entry.label || entry.modelId;
|
|
787
|
-
|
|
851
|
+
log.info(` ${displayName} (${entry.modelId}):`);
|
|
788
852
|
const areaH = " | Feature Area | Task | Code | Docs | Total | Lift |";
|
|
789
853
|
const areaSep = " |---------------------|------|------|------|-------|------|";
|
|
790
|
-
|
|
791
|
-
|
|
854
|
+
log.info(areaH);
|
|
855
|
+
log.info(areaSep);
|
|
792
856
|
for (const s of entry.scores) {
|
|
793
857
|
const lift = s.docLift >= 0 ? `+${s.docLift}` : `${s.docLift}`;
|
|
794
|
-
|
|
858
|
+
log.info(` | ${s.feature.padEnd(19)} | ` +
|
|
795
859
|
`${s.taskCompletion.toString().padStart(4)} | ` +
|
|
796
860
|
`${s.codeCorrectness.toString().padStart(4)} | ` +
|
|
797
861
|
`${s.docCoverage.toString().padStart(4)} | ` +
|
|
798
862
|
`${s.totalScore.toString().padStart(5)} | ` +
|
|
799
863
|
`${lift.padStart(4)} |`);
|
|
800
864
|
}
|
|
801
|
-
|
|
865
|
+
log.info("");
|
|
802
866
|
}
|
|
803
867
|
// Cost-per-quality-point
|
|
804
868
|
const modelsWithCost = sorted.filter((e) => e.overall.cost && e.overall.cost > 0);
|
|
805
869
|
if (modelsWithCost.length > 0) {
|
|
806
|
-
|
|
870
|
+
log.info(" Cost per quality point:");
|
|
807
871
|
for (const entry of modelsWithCost) {
|
|
808
872
|
const displayName = entry.label;
|
|
809
873
|
const costPerPoint = entry.overall.avgScore > 0
|
|
810
874
|
? (entry.overall.cost ?? 0) / entry.overall.avgScore
|
|
811
875
|
: 0;
|
|
812
|
-
|
|
876
|
+
log.info(` ${displayName}: $${costPerPoint.toFixed(6)}/point (score: ${entry.overall.avgScore.toFixed(1)}, cost: $${(entry.overall.cost ?? 0).toFixed(4)})`);
|
|
813
877
|
}
|
|
814
|
-
|
|
878
|
+
log.info("");
|
|
815
879
|
}
|
|
816
880
|
}
|
|
817
881
|
// ---------------------------------------------------------------------------
|
|
818
882
|
// Main
|
|
819
883
|
// ---------------------------------------------------------------------------
|
|
820
|
-
function printReport(scores, urlRefs, source, agentBehavior, graderCost, perModel, sourceIsolation, sourceVerification, evaluationMode) {
|
|
821
|
-
|
|
822
|
-
|
|
823
|
-
|
|
824
|
-
|
|
884
|
+
function printReport(scores, urlRefs, source, agentBehavior, graderCost, perModel, sourceIsolation, sourceVerification, evaluationMode, log) {
|
|
885
|
+
const _log = log ?? new ConsoleLogger();
|
|
886
|
+
_log.info("\n" + "=".repeat(80));
|
|
887
|
+
_log.info(" SANITY AI LITERACY SCORE REPORT");
|
|
888
|
+
_log.info("=".repeat(80));
|
|
889
|
+
_log.info("");
|
|
825
890
|
// Table header
|
|
826
891
|
const h = "| Feature Area | Task | Code | Docs | Total | w/o Docs | Doc Lift |";
|
|
827
892
|
const sep = "|---------------------|------|------|------|-------|----------|----------|";
|
|
828
|
-
|
|
829
|
-
|
|
893
|
+
_log.info(h);
|
|
894
|
+
_log.info(sep);
|
|
830
895
|
for (const s of scores) {
|
|
831
896
|
const status = s.totalScore < CRITICAL_THRESHOLD ? "!!" : "ok";
|
|
832
897
|
const lift = s.docLift > 0 ? `+${s.docLift}` : `${s.docLift}`;
|
|
833
|
-
|
|
898
|
+
_log.info(`| ${status} ${s.feature.padEnd(17)} | ` +
|
|
834
899
|
`${s.taskCompletion.toString().padStart(4)} | ` +
|
|
835
900
|
`${s.codeCorrectness.toString().padStart(4)} | ` +
|
|
836
901
|
`${s.docCoverage.toString().padStart(4)} | ` +
|
|
@@ -838,7 +903,7 @@ function printReport(scores, urlRefs, source, agentBehavior, graderCost, perMode
|
|
|
838
903
|
`${s.floorScore.toString().padStart(8)} | ` +
|
|
839
904
|
`${lift.padStart(8)} |`);
|
|
840
905
|
}
|
|
841
|
-
|
|
906
|
+
_log.info("");
|
|
842
907
|
// OKR status
|
|
843
908
|
const belowCritical = scores.filter((s) => s.totalScore < CRITICAL_THRESHOLD);
|
|
844
909
|
const lowestScore = scores.reduce((min, s) => s.totalScore < min.totalScore ? s : min);
|
|
@@ -852,69 +917,69 @@ function printReport(scores, urlRefs, source, agentBehavior, graderCost, perMode
|
|
|
852
917
|
area: s.feature,
|
|
853
918
|
docLift: s.docLift,
|
|
854
919
|
}));
|
|
855
|
-
|
|
856
|
-
|
|
857
|
-
|
|
858
|
-
|
|
920
|
+
_log.info("-".repeat(80));
|
|
921
|
+
_log.info("OKR STATUS");
|
|
922
|
+
_log.info("-".repeat(80));
|
|
923
|
+
_log.info("");
|
|
859
924
|
if (belowCritical.length === 0) {
|
|
860
|
-
|
|
925
|
+
_log.info(" KR1: PASS -- All areas above critical threshold (>=40)");
|
|
861
926
|
}
|
|
862
927
|
else {
|
|
863
|
-
|
|
864
|
-
belowCritical.forEach((s) =>
|
|
928
|
+
_log.info(" KR1: FAIL -- Areas below critical threshold:");
|
|
929
|
+
belowCritical.forEach((s) => _log.info(` - ${s.feature}: ${s.totalScore}`));
|
|
865
930
|
}
|
|
866
|
-
|
|
867
|
-
|
|
868
|
-
|
|
869
|
-
|
|
870
|
-
|
|
871
|
-
|
|
872
|
-
|
|
873
|
-
|
|
931
|
+
_log.info("");
|
|
932
|
+
_log.info(` Lowest area: ${lowestScore.feature} (${lowestScore.totalScore})`);
|
|
933
|
+
_log.info(` Target: +15 points improvement`);
|
|
934
|
+
_log.info("");
|
|
935
|
+
_log.info(` Avg score: ${avgScore.toFixed(1)}`);
|
|
936
|
+
_log.info(` Avg doc lift: +${avgLift.toFixed(1)} points`);
|
|
937
|
+
_log.info(` (Doc lift = how much docs help vs parametric knowledge alone)`);
|
|
938
|
+
_log.info("");
|
|
874
939
|
// Ceiling decomposition
|
|
875
|
-
|
|
876
|
-
|
|
877
|
-
|
|
878
|
-
|
|
940
|
+
_log.info("-".repeat(80));
|
|
941
|
+
_log.info("CEILING DECOMPOSITION");
|
|
942
|
+
_log.info("-".repeat(80));
|
|
943
|
+
_log.info("");
|
|
879
944
|
const ceilH = "| Feature Area | Floor | Ceiling | Doc Lift | Quality Gap |";
|
|
880
945
|
const ceilSep = "|---------------------|-------|---------|----------|-------------|";
|
|
881
|
-
|
|
882
|
-
|
|
946
|
+
_log.info(ceilH);
|
|
947
|
+
_log.info(ceilSep);
|
|
883
948
|
for (const s of scores) {
|
|
884
949
|
const liftStr = s.docLift >= 0 ? `+${s.docLift}` : `${s.docLift}`;
|
|
885
950
|
const liftFlag = s.negativeDocLift ? " 🚨" : "";
|
|
886
|
-
|
|
951
|
+
_log.info(`| ${s.feature.padEnd(19)} | ` +
|
|
887
952
|
`${s.floorScore.toString().padStart(5)} | ` +
|
|
888
953
|
`${s.ceilingScore.toString().padStart(7)} | ` +
|
|
889
954
|
`${liftStr.padStart(8)}${liftFlag} | ` +
|
|
890
955
|
`${s.docQualityGap.toString().padStart(11)} |`);
|
|
891
956
|
}
|
|
892
|
-
|
|
957
|
+
_log.info("");
|
|
893
958
|
if (negativeDocLiftAreas.length > 0) {
|
|
894
|
-
|
|
959
|
+
_log.info(" 🚨 NEGATIVE DOC LIFT DETECTED:");
|
|
895
960
|
for (const { area, docLift } of negativeDocLiftAreas) {
|
|
896
961
|
const s = scores.find((sc) => sc.feature === area);
|
|
897
|
-
|
|
962
|
+
_log.info(` ${area}: Doc Lift = ${docLift} (floor: ${s.floorScore}, ceiling: ${s.ceilingScore})`);
|
|
898
963
|
}
|
|
899
|
-
|
|
900
|
-
|
|
901
|
-
|
|
964
|
+
_log.info(" Documentation is HURTING model performance for these areas.");
|
|
965
|
+
_log.info(" See docs/design-docs/scenario-matrix/evaluation-ceiling.md");
|
|
966
|
+
_log.info("");
|
|
902
967
|
}
|
|
903
968
|
else {
|
|
904
|
-
|
|
905
|
-
|
|
969
|
+
_log.info(" ✅ No areas with negative Doc Lift detected.");
|
|
970
|
+
_log.info("");
|
|
906
971
|
}
|
|
907
972
|
// Three-layer decomposition (only when actual scores are present)
|
|
908
973
|
const hasActualScores = scores.some((s) => s.actualScore !== undefined);
|
|
909
974
|
if (hasActualScores) {
|
|
910
|
-
|
|
911
|
-
|
|
912
|
-
|
|
913
|
-
|
|
975
|
+
_log.info("-".repeat(80));
|
|
976
|
+
_log.info("THREE-LAYER DECOMPOSITION (floor → ceiling → actual)");
|
|
977
|
+
_log.info("-".repeat(80));
|
|
978
|
+
_log.info("");
|
|
914
979
|
const decompH = "| Feature Area | Floor | Ceiling | Actual | Doc Lift | Ret. Gap | Infra % |";
|
|
915
980
|
const decompSep = "|---------------------|-------|---------|--------|----------|----------|---------|";
|
|
916
|
-
|
|
917
|
-
|
|
981
|
+
_log.info(decompH);
|
|
982
|
+
_log.info(decompSep);
|
|
918
983
|
for (const s of scores) {
|
|
919
984
|
const liftStr = s.docLift >= 0 ? `+${s.docLift}` : `${s.docLift}`;
|
|
920
985
|
const actualStr = s.actualScore !== undefined ? s.actualScore.toString() : "—";
|
|
@@ -927,7 +992,7 @@ function printReport(scores, urlRefs, source, agentBehavior, graderCost, perMode
|
|
|
927
992
|
? `${Math.round(s.infrastructureEfficiency * 100)}%`
|
|
928
993
|
: "—";
|
|
929
994
|
const flag = s.invertedRetrievalGap ? " 🔄" : "";
|
|
930
|
-
|
|
995
|
+
_log.info(`| ${s.feature.padEnd(19)} | ` +
|
|
931
996
|
`${s.floorScore.toString().padStart(5)} | ` +
|
|
932
997
|
`${s.ceilingScore.toString().padStart(7)} | ` +
|
|
933
998
|
`${actualStr.padStart(6)} | ` +
|
|
@@ -935,10 +1000,10 @@ function printReport(scores, urlRefs, source, agentBehavior, graderCost, perMode
|
|
|
935
1000
|
`${(gapStr + flag).padStart(8)} | ` +
|
|
936
1001
|
`${infraStr.padStart(7)} |`);
|
|
937
1002
|
}
|
|
938
|
-
|
|
939
|
-
|
|
940
|
-
|
|
941
|
-
|
|
1003
|
+
_log.info("");
|
|
1004
|
+
_log.info(" Doc Lift = ceiling − floor | Ret. Gap = ceiling − actual | Infra = actual / ceiling");
|
|
1005
|
+
_log.info(" 🔄 = inverted retrieval gap (agents avoid bad docs → higher actual than ceiling)");
|
|
1006
|
+
_log.info("");
|
|
942
1007
|
}
|
|
943
1008
|
// Cost summary
|
|
944
1009
|
const totalCost = scores.reduce((sum, s) => sum + s.totalCost, 0);
|
|
@@ -946,66 +1011,66 @@ function printReport(scores, urlRefs, source, agentBehavior, graderCost, perMode
|
|
|
946
1011
|
const graderCostTotal = graderCost?.cost ?? 0;
|
|
947
1012
|
const combinedCost = totalCost + graderCostTotal;
|
|
948
1013
|
if (totalCost > 0 || graderCostTotal > 0) {
|
|
949
|
-
|
|
950
|
-
|
|
951
|
-
|
|
952
|
-
|
|
953
|
-
|
|
1014
|
+
_log.info("-".repeat(80));
|
|
1015
|
+
_log.info("COST SUMMARY");
|
|
1016
|
+
_log.info("-".repeat(80));
|
|
1017
|
+
_log.info("");
|
|
1018
|
+
_log.info(` Provider cost: $${totalCost.toFixed(4)}`);
|
|
954
1019
|
if (graderCostTotal > 0) {
|
|
955
1020
|
const graderLabel = graderCost?.model ?? "unknown";
|
|
956
|
-
|
|
1021
|
+
_log.info(` Grader cost: $${graderCostTotal.toFixed(4)} (${graderLabel}, ${(graderCost?.totalTokens ?? 0).toLocaleString()} tokens)`);
|
|
957
1022
|
}
|
|
958
|
-
|
|
959
|
-
|
|
960
|
-
|
|
1023
|
+
_log.info(` Total cost: $${combinedCost.toFixed(4)}`);
|
|
1024
|
+
_log.info(` Avg cost per test: $${(combinedCost / (totalTests || 1)).toFixed(4)}`);
|
|
1025
|
+
_log.info("");
|
|
961
1026
|
const costHeader = "| Feature Area | Tests | Cost | Avg/Test |";
|
|
962
1027
|
const costSep = "|---------------------|-------|----------|----------|";
|
|
963
|
-
|
|
964
|
-
|
|
1028
|
+
_log.info(costHeader);
|
|
1029
|
+
_log.info(costSep);
|
|
965
1030
|
for (const s of scores) {
|
|
966
1031
|
const avgCost = s.testCount > 0 ? s.totalCost / s.testCount : 0;
|
|
967
|
-
|
|
1032
|
+
_log.info(`| ${s.feature.padEnd(19)} | ` +
|
|
968
1033
|
`${s.testCount.toString().padStart(5)} | ` +
|
|
969
1034
|
`$${s.totalCost.toFixed(4).padStart(7)} | ` +
|
|
970
1035
|
`$${avgCost.toFixed(4).padStart(7)} |`);
|
|
971
1036
|
}
|
|
972
|
-
|
|
1037
|
+
_log.info("");
|
|
973
1038
|
}
|
|
974
1039
|
// Per-model breakdown
|
|
975
1040
|
if (perModel) {
|
|
976
|
-
printPerModelReport(perModel);
|
|
1041
|
+
printPerModelReport(perModel, _log);
|
|
977
1042
|
}
|
|
978
1043
|
// URL References
|
|
979
|
-
printUrlReport(urlRefs);
|
|
1044
|
+
printUrlReport(urlRefs, _log);
|
|
980
1045
|
// Agent Behavior (only present when run with instrumented provider)
|
|
981
1046
|
if (agentBehavior && agentBehavior.length > 0) {
|
|
982
|
-
printAgentBehaviorReport(agentBehavior);
|
|
1047
|
+
printAgentBehaviorReport(agentBehavior, _log);
|
|
983
1048
|
}
|
|
984
1049
|
// Source verification (unified report for all modes)
|
|
985
1050
|
if (sourceVerification || sourceIsolation) {
|
|
986
|
-
|
|
987
|
-
|
|
988
|
-
|
|
1051
|
+
_log.info("-".repeat(80));
|
|
1052
|
+
_log.info("📋 SOURCE VERIFICATION");
|
|
1053
|
+
_log.info("-".repeat(80));
|
|
989
1054
|
if (sourceVerification) {
|
|
990
|
-
|
|
991
|
-
|
|
1055
|
+
_log.info(` Source: ${sourceVerification.source}`);
|
|
1056
|
+
_log.info(` Mode: ${sourceVerification.mode}`);
|
|
992
1057
|
if (sourceVerification.allowedOrigins) {
|
|
993
|
-
|
|
1058
|
+
_log.info(` Sandbox: ${sourceVerification.allowedOrigins.join(", ")}`);
|
|
994
1059
|
}
|
|
995
1060
|
if (sourceVerification.searchMode) {
|
|
996
|
-
|
|
1061
|
+
_log.info(` Search: ${sourceVerification.searchMode}`);
|
|
997
1062
|
}
|
|
998
1063
|
// URL fetch results (baseline mode with direct URLs)
|
|
999
1064
|
if (sourceVerification.urlFetch) {
|
|
1000
1065
|
const uf = sourceVerification.urlFetch;
|
|
1001
|
-
|
|
1002
|
-
|
|
1066
|
+
_log.info("");
|
|
1067
|
+
_log.info(` URL fetch: ${uf.totalFetched} fetched, ${uf.totalFailed} failed`);
|
|
1003
1068
|
for (const f of uf.fetchedUrls) {
|
|
1004
|
-
|
|
1069
|
+
_log.info(` ✅ ${f.url} (via ${f.method})`);
|
|
1005
1070
|
}
|
|
1006
1071
|
for (const f of uf.failures) {
|
|
1007
1072
|
// oxlint-disable-next-line @typescript-eslint/prefer-nullish-coalescing -- empty string means no error info
|
|
1008
|
-
|
|
1073
|
+
_log.info(` ⚠️ ${f.url}: ${f.error || "unknown error"}`);
|
|
1009
1074
|
}
|
|
1010
1075
|
}
|
|
1011
1076
|
}
|
|
@@ -1013,22 +1078,22 @@ function printReport(scores, urlRefs, source, agentBehavior, graderCost, perMode
|
|
|
1013
1078
|
if (sourceIsolation) {
|
|
1014
1079
|
const pct = Math.round(sourceIsolation.isolationScore * 100);
|
|
1015
1080
|
const icon = sourceIsolation.offOrigin === 0 ? "✅" : "⚠️";
|
|
1016
|
-
|
|
1017
|
-
|
|
1081
|
+
_log.info("");
|
|
1082
|
+
_log.info(` Agent isolation: ${icon} ${pct}% (${sourceIsolation.onOrigin}/${sourceIsolation.total} on-origin)`);
|
|
1018
1083
|
if (sourceIsolation.offOrigin > 0) {
|
|
1019
|
-
|
|
1084
|
+
_log.info(` Off-origin fetches: ${sourceIsolation.offOrigin}`);
|
|
1020
1085
|
for (const url of sourceIsolation.offOriginUrls.slice(0, 10)) {
|
|
1021
|
-
|
|
1086
|
+
_log.info(` • ${url}`);
|
|
1022
1087
|
}
|
|
1023
1088
|
}
|
|
1024
1089
|
if (Object.keys(sourceIsolation.originBreakdown).length > 0) {
|
|
1025
|
-
|
|
1090
|
+
_log.info(" Origin breakdown:");
|
|
1026
1091
|
for (const [origin, count] of Object.entries(sourceIsolation.originBreakdown).sort((a, b) => b[1] - a[1])) {
|
|
1027
|
-
|
|
1092
|
+
_log.info(` ${origin}: ${count}`);
|
|
1028
1093
|
}
|
|
1029
1094
|
}
|
|
1030
1095
|
}
|
|
1031
|
-
|
|
1096
|
+
_log.info("");
|
|
1032
1097
|
}
|
|
1033
1098
|
// Build overall agent behavior stats for summary
|
|
1034
1099
|
const overallAgentBehavior = agentBehavior && agentBehavior.length > 0
|
|
@@ -1110,31 +1175,31 @@ function printReport(scores, urlRefs, source, agentBehavior, graderCost, perMode
|
|
|
1110
1175
|
urlReferences: urlRefs,
|
|
1111
1176
|
};
|
|
1112
1177
|
}
|
|
1113
|
-
function printUrlReport(urlRefs) {
|
|
1114
|
-
|
|
1115
|
-
|
|
1116
|
-
|
|
1117
|
-
|
|
1178
|
+
function printUrlReport(urlRefs, log) {
|
|
1179
|
+
log.info("-".repeat(80));
|
|
1180
|
+
log.info("URL REFERENCES");
|
|
1181
|
+
log.info("-".repeat(80));
|
|
1182
|
+
log.info("");
|
|
1118
1183
|
for (const ref of urlRefs) {
|
|
1119
1184
|
const goldUrls = Object.entries(ref.gold.urls).sort((a, b) => b[1] - a[1]);
|
|
1120
1185
|
const baselineUrls = Object.entries(ref.baseline.urls).sort((a, b) => b[1] - a[1]);
|
|
1121
1186
|
if (goldUrls.length > 0) {
|
|
1122
|
-
|
|
1187
|
+
log.info(` ${ref.feature} (gold):`);
|
|
1123
1188
|
for (const [url, count] of goldUrls) {
|
|
1124
1189
|
const suffix = count > 1 ? ` (${count} tests)` : "";
|
|
1125
|
-
|
|
1190
|
+
log.info(` ${url}${suffix}`);
|
|
1126
1191
|
}
|
|
1127
1192
|
}
|
|
1128
1193
|
if (baselineUrls.length > 0) {
|
|
1129
|
-
|
|
1194
|
+
log.info(` ${ref.feature} (baseline):`);
|
|
1130
1195
|
for (const [url, count] of baselineUrls) {
|
|
1131
1196
|
const suffix = count > 1 ? ` (${count} tests)` : "";
|
|
1132
|
-
|
|
1197
|
+
log.info(` ${url}${suffix} [parametric]`);
|
|
1133
1198
|
}
|
|
1134
1199
|
}
|
|
1135
1200
|
if (goldUrls.length === 0 && baselineUrls.length === 0) {
|
|
1136
|
-
|
|
1201
|
+
log.info(` ${ref.feature}: no URLs referenced`);
|
|
1137
1202
|
}
|
|
1138
|
-
|
|
1203
|
+
log.info("");
|
|
1139
1204
|
}
|
|
1140
1205
|
}
|