@sanity/ailf 0.1.20 → 0.1.22
This diff compares the contents of two publicly available versions of a package, each of which has been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.
|
@@ -565,8 +565,30 @@ export interface RetrievalMetrics {
|
|
|
565
565
|
avgRecall: number;
|
|
566
566
|
};
|
|
567
567
|
}
|
|
568
|
+
/** Per-feature agent behavior data — how agents interacted with docs */
|
|
569
|
+
export interface FeatureAgentBehavior {
|
|
570
|
+
avgDocPagesVisited: number;
|
|
571
|
+
avgNetworkTimeMs: number;
|
|
572
|
+
avgSearchesPerformed: number;
|
|
573
|
+
docSlugsVisited: string[];
|
|
574
|
+
externalDomains: string[];
|
|
575
|
+
feature: string;
|
|
576
|
+
searchQueries: string[];
|
|
577
|
+
tasksWithBehaviorData: number;
|
|
578
|
+
}
|
|
579
|
+
/** Aggregate agent behavior stats across all features */
|
|
580
|
+
export interface OverallAgentBehavior {
|
|
581
|
+
avgDocPagesVisited: number;
|
|
582
|
+
avgNetworkTimeMs: number;
|
|
583
|
+
avgSearchesPerformed: number;
|
|
584
|
+
testsWithBehaviorData: number;
|
|
585
|
+
totalUniqueDocSlugs: number;
|
|
586
|
+
totalUniqueSearchQueries: number;
|
|
587
|
+
}
|
|
568
588
|
/** Top-level score summary (the shape of score-summary.json) */
|
|
569
589
|
export interface ScoreSummary {
|
|
590
|
+
/** Per-feature agent behavior data (only present when agentic mode ran) */
|
|
591
|
+
agentBehavior?: FeatureAgentBehavior[];
|
|
570
592
|
belowCritical: string[];
|
|
571
593
|
/**
|
|
572
594
|
* All Sanity documents used across the entire evaluation.
|
|
@@ -604,6 +626,8 @@ export interface ScoreSummary {
|
|
|
604
626
|
docLift: number;
|
|
605
627
|
}[];
|
|
606
628
|
overall: {
|
|
629
|
+
/** Aggregate agent behavior stats (only present when agentic mode ran) */
|
|
630
|
+
agentBehavior?: OverallAgentBehavior;
|
|
607
631
|
/** Average actual (agentic) score across areas. Absent if no agentic data. */
|
|
608
632
|
avgActualScore?: number;
|
|
609
633
|
/** Average ceiling score across all areas */
|
|
@@ -324,13 +324,25 @@ function calculateScores(resultsPath, weights) {
|
|
|
324
324
|
/**
|
|
325
325
|
* Extracts agent behavior summary from a test result's metadata.
|
|
326
326
|
* Returns null if the test was not run with the instrumented provider.
|
|
327
|
+
*
|
|
328
|
+
* Checks two locations because Promptfoo may flatten/merge the metadata
|
|
329
|
+
* object differently than how the provider originally set it:
|
|
330
|
+
* 1. metadata.agentBehaviorSummary (set directly by AgenticProvider)
|
|
331
|
+
* 2. metadata.agentBehavior.summary (nested in the full behavior log)
|
|
327
332
|
*/
|
|
328
333
|
function extractAgentBehavior(test) {
|
|
329
334
|
const { metadata } = test;
|
|
330
|
-
if (
|
|
331
|
-
return
|
|
335
|
+
if (metadata?.agentBehaviorSummary) {
|
|
336
|
+
return metadata.agentBehaviorSummary;
|
|
337
|
+
}
|
|
338
|
+
// Fallback: Promptfoo may drop the top-level agentBehaviorSummary
|
|
339
|
+
// field during serialization, but the data is nested inside the
|
|
340
|
+
// full agentBehavior log.
|
|
341
|
+
const behavior = metadata?.agentBehavior;
|
|
342
|
+
if (behavior?.summary) {
|
|
343
|
+
return behavior.summary;
|
|
332
344
|
}
|
|
333
|
-
return
|
|
345
|
+
return null;
|
|
334
346
|
}
|
|
335
347
|
/**
|
|
336
348
|
* Extracts grader (assertion) cost from the raw Promptfoo results file.
|