@ls-stack/agent-eval 0.8.0 → 0.10.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{app-ZFLdu8-r.mjs → app-sK9CjpNI.mjs} +17 -5
- package/dist/apps/web/dist/assets/index-ChgByJbI.css +1 -0
- package/dist/apps/web/dist/assets/index-CmY0_D5Z.js +113 -0
- package/dist/apps/web/dist/index.html +2 -2
- package/dist/bin.mjs +1 -1
- package/dist/{cli-DQK5W0je.mjs → cli-DLlRkyLH.mjs} +13 -4
- package/dist/index.d.mts +543 -101
- package/dist/index.mjs +4 -4
- package/dist/runChild.mjs +3 -2
- package/dist/{runOrchestration-HaMahl6b.mjs → runOrchestration-DwqX9_T7.mjs} +2528 -286
- package/dist/{runner-CmVPWava.mjs → runner-Gtlmvm3w.mjs} +2 -2
- package/dist/{runner--XPZ5D7N.mjs → runner-JrBz8ISs.mjs} +1 -1
- package/dist/src-Bx-CV6Wo.mjs +3 -0
- package/package.json +1 -1
- package/dist/apps/web/dist/assets/index-ClE28i5w.css +0 -1
- package/dist/apps/web/dist/assets/index-CvJmtK1T.js +0 -113
- package/dist/src-r3FQAaw6.mjs +0 -3
package/dist/index.d.mts
CHANGED
|
@@ -913,6 +913,20 @@ declare const caseDetailSchema: z$1.ZodObject<{
|
|
|
913
913
|
stack: z$1.ZodOptional<z$1.ZodString>;
|
|
914
914
|
}, z$1.core.$strip>>;
|
|
915
915
|
trial: z$1.ZodNumber;
|
|
916
|
+
cacheRefs: z$1.ZodDefault<z$1.ZodArray<z$1.ZodObject<{
|
|
917
|
+
type: z$1.ZodLiteral<"value">;
|
|
918
|
+
name: z$1.ZodString;
|
|
919
|
+
namespace: z$1.ZodString;
|
|
920
|
+
key: z$1.ZodString;
|
|
921
|
+
status: z$1.ZodEnum<{
|
|
922
|
+
hit: "hit";
|
|
923
|
+
miss: "miss";
|
|
924
|
+
refresh: "refresh";
|
|
925
|
+
bypass: "bypass";
|
|
926
|
+
}>;
|
|
927
|
+
storedAt: z$1.ZodOptional<z$1.ZodString>;
|
|
928
|
+
age: z$1.ZodOptional<z$1.ZodNumber>;
|
|
929
|
+
}, z$1.core.$strip>>>;
|
|
916
930
|
}, z$1.core.$strip>;
|
|
917
931
|
/** Full case payload including inputs, trace, outputs, and failures. */
|
|
918
932
|
type CaseDetail = z$1.infer<typeof caseDetailSchema>;
|
|
@@ -1363,60 +1377,16 @@ type EvalTitleLike = {
|
|
|
1363
1377
|
*/
|
|
1364
1378
|
declare function getEvalTitle(evalLike: EvalTitleLike): string;
|
|
1365
1379
|
//#endregion
|
|
1366
|
-
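//#region ../shared/src/schemas/sse.d.ts
|
|
1367
|
-
declare const sseEventTypeSchema: z$1.ZodEnum<{
|
|
1368
|
-
"discovery.updated": "discovery.updated";
|
|
1369
|
-
"run.started": "run.started";
|
|
1370
|
-
"run.summary": "run.summary";
|
|
1371
|
-
"case.started": "case.started";
|
|
1372
|
-
"case.updated": "case.updated";
|
|
1373
|
-
"case.finished": "case.finished";
|
|
1374
|
-
"trace.span": "trace.span";
|
|
1375
|
-
"run.finished": "run.finished";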
|
|
1376
|
-
"run.cancelled": "run.cancelled";
|
|
1377
|
-
"run.error": "run.error";
|
|
1378
|
-
}>;
|
|
1379
|
-
/** Server-sent event name emitted by the runner or backend. */
|
|
1380
|
-
type SseEventType = z$1.infer<typeof sseEventTypeSchema>;
|
|
1381
|
-
/** Schema for the SSE envelope used to stream run updates to clients. */
|
|
1382
|
-
declare const sseEnvelopeSchema: z$1.ZodObject<{
|
|
1383
|
-
type: z$1.ZodString;
|
|
1384
|
-
runId: z$1.ZodOptional<z$1.ZodString>;
|
|
1385
|
-
timestamp: z$1.ZodString;
|
|
1386
|
-
payload: z$1.ZodUnknown;
|
|
1387
|
-
}, z$1.core.$strip>;
|
|
1388
|
-
/** Wire format for a streamed event emitted during eval execution. */
|
|
1389
|
-
type SseEnvelope = z$1.infer<typeof sseEnvelopeSchema>;
|
|
1390
|
-
//#endregion
|
|
1391
|
-
//#region ../shared/src/schemas/api.d.ts
|
|
1392
|
-
/** Schema for the API request that starts a new eval run. */
|
|
1393
|
-
declare const createRunRequestSchema: z$1.ZodObject<{
|
|
1394
|
-
target: z$1.ZodObject<{
|
|
1395
|
-
mode: z$1.ZodEnum<{
|
|
1396
|
-
all: "all";
|
|
1397
|
-
evalIds: "evalIds";
|
|
1398
|
-
caseIds: "caseIds";
|
|
1399
|
-
}>;
|
|
1400
|
-
evalIds: z$1.ZodOptional<z$1.ZodArray<z$1.ZodString>>;
|
|
1401
|
-
caseIds: z$1.ZodOptional<z$1.ZodArray<z$1.ZodString>>;
|
|
1402
|
-
}, z$1.core.$strip>;
|
|
1403
|
-
trials: z$1.ZodNumber;
|
|
1404
|
-
cache: z$1.ZodOptional<z$1.ZodObject<{
|
|
1405
|
-
mode: z$1.ZodDefault<z$1.ZodEnum<{
|
|
1406
|
-
use: "use";
|
|
1407
|
-
bypass: "bypass";
|
|
1408
|
-
refresh: "refresh";
|
|
1409
|
-
}>>;
|
|
1410
|
-
}, z$1.core.$strip>>;
|
|
1411
|
-
}, z$1.core.$strip>;
|
|
1412
|
-
/** Request payload accepted by the run creation endpoint. */
|
|
1413
|
-
type CreateRunRequest = z$1.infer<typeof createRunRequestSchema>;
|
|
1414
|
-
/** Schema for updating a UI-authored manual score on one persisted case. */
|
|
1415
|
-
declare const updateManualScoreRequestSchema: z$1.ZodObject<{
|
|
1416
|
-
value: z$1.ZodNullable<z$1.ZodNumber>;
|
|
1417
|
-
}, z$1.core.$strip>;
|
|
1418
|
-
/** Request payload accepted by the manual score update endpoint. */
|
|
1419
|
-
type UpdateManualScoreRequest = z$1.infer<typeof updateManualScoreRequestSchema>;
|
|
1380
|
+
//#region ../shared/src/utils/getNestedAttribute.d.ts
|
|
1381
|
+
/**
|
|
1382
|
+
* Read a value from `source` by walking a dot-separated path.
|
|
1383
|
+
*
|
|
1384
|
+
* Returns `undefined` when any segment of the path is missing or when an
|
|
1385
|
+
* intermediate value is not a plain object. Used by trace-attribute display,
|
|
1386
|
+
* the LLM calls extractor, and any consumer that needs to look up nested
|
|
1387
|
+
* properties from a span's `attributes` record.
|
|
1388
|
+
*/
|
|
1389
|
+
declare function getNestedAttribute(value: unknown, path: string): unknown;
|
|
1420
1390
|
//#endregion
|
|
1421
1391
|
//#region ../shared/src/schemas/config.d.ts
|
|
1422
1392
|
/** Strategy used to collapse repeated trials into one stored case result. */
|
|
@@ -1426,6 +1396,144 @@ declare const trialSelectionModeSchema: z$1.ZodEnum<{
|
|
|
1426
1396
|
}>;
|
|
1427
1397
|
/** Strategy used to collapse repeated trials into one stored case result. */
|
|
1428
1398
|
type TrialSelectionMode = z$1.infer<typeof trialSelectionModeSchema>;
|
|
1399
|
+
/** Render formats supported by an LLM-call metric in the UI. */
|
|
1400
|
+
declare const llmCallMetricFormatSchema: z$1.ZodEnum<{
|
|
1401
|
+
string: "string";
|
|
1402
|
+
number: "number";
|
|
1403
|
+
boolean: "boolean";
|
|
1404
|
+
duration: "duration";
|
|
1405
|
+
json: "json";
|
|
1406
|
+
}>;
|
|
1407
|
+
/** Render format applied to an LLM-call metric value. */
|
|
1408
|
+
type LlmCallMetricFormat = z$1.infer<typeof llmCallMetricFormatSchema>;
|
|
1409
|
+
/** Where an LLM-call metric is rendered inside the LLM calls tab. */
|
|
1410
|
+
declare const llmCallMetricPlacementSchema: z$1.ZodEnum<{
|
|
1411
|
+
header: "header";
|
|
1412
|
+
body: "body";
|
|
1413
|
+
}>;
|
|
1414
|
+
/** Placement option for an LLM-call metric. */
|
|
1415
|
+
type LlmCallMetricPlacement = z$1.infer<typeof llmCallMetricPlacementSchema>;
|
|
1416
|
+
/**
|
|
1417
|
+
* Schema for a single user-defined metric attached to LLM call rows.
|
|
1418
|
+
*
|
|
1419
|
+
* Each metric reads `path` from the span's `attributes` and renders the value
|
|
1420
|
+
* with the configured `format` and `numberFormat`. `placements` controls
|
|
1421
|
+
* whether the metric appears as a chip on the collapsed row header, as a row
|
|
1422
|
+
* inside the expanded body, or both. Defaults to `['body']` when omitted.
|
|
1423
|
+
*/
|
|
1424
|
+
declare const llmCallMetricSchema: z$1.ZodObject<{
|
|
1425
|
+
label: z$1.ZodString;
|
|
1426
|
+
tooltip: z$1.ZodOptional<z$1.ZodString>;
|
|
1427
|
+
path: z$1.ZodString;
|
|
1428
|
+
format: z$1.ZodOptional<z$1.ZodEnum<{
|
|
1429
|
+
string: "string";
|
|
1430
|
+
number: "number";
|
|
1431
|
+
boolean: "boolean";
|
|
1432
|
+
duration: "duration";
|
|
1433
|
+
json: "json";
|
|
1434
|
+
}>>;
|
|
1435
|
+
numberFormat: z$1.ZodOptional<z$1.ZodType<NumberDisplayOptions, unknown, z$1.core.$ZodTypeInternals<NumberDisplayOptions, unknown>>>;
|
|
1436
|
+
placements: z$1.ZodOptional<z$1.ZodArray<z$1.ZodEnum<{
|
|
1437
|
+
header: "header";
|
|
1438
|
+
body: "body";
|
|
1439
|
+
}>>>;
|
|
1440
|
+
}, z$1.core.$strip>;
|
|
1441
|
+
/** User-defined metric authored in `agent-evals.config.ts`. */
|
|
1442
|
+
type LlmCallMetric = z$1.infer<typeof llmCallMetricSchema>;
|
|
1443
|
+
/** Schema for the global LLM calls config block in `agent-evals.config.ts`. */
|
|
1444
|
+
declare const llmCallsConfigSchema: z$1.ZodObject<{
|
|
1445
|
+
kinds: z$1.ZodOptional<z$1.ZodArray<z$1.ZodString>>;
|
|
1446
|
+
attributes: z$1.ZodOptional<z$1.ZodObject<{
|
|
1447
|
+
model: z$1.ZodOptional<z$1.ZodString>;
|
|
1448
|
+
provider: z$1.ZodOptional<z$1.ZodString>;
|
|
1449
|
+
inputTokens: z$1.ZodOptional<z$1.ZodString>;
|
|
1450
|
+
outputTokens: z$1.ZodOptional<z$1.ZodString>;
|
|
1451
|
+
cachedInputTokens: z$1.ZodOptional<z$1.ZodString>;
|
|
1452
|
+
cacheCreationInputTokens: z$1.ZodOptional<z$1.ZodString>;
|
|
1453
|
+
reasoningTokens: z$1.ZodOptional<z$1.ZodString>;
|
|
1454
|
+
totalTokens: z$1.ZodOptional<z$1.ZodString>;
|
|
1455
|
+
cost: z$1.ZodOptional<z$1.ZodString>;
|
|
1456
|
+
inputCost: z$1.ZodOptional<z$1.ZodString>;
|
|
1457
|
+
outputCost: z$1.ZodOptional<z$1.ZodString>;
|
|
1458
|
+
cachedInputCost: z$1.ZodOptional<z$1.ZodString>;
|
|
1459
|
+
cacheCreationInputCost: z$1.ZodOptional<z$1.ZodString>;
|
|
1460
|
+
reasoningCost: z$1.ZodOptional<z$1.ZodString>;
|
|
1461
|
+
steps: z$1.ZodOptional<z$1.ZodString>;
|
|
1462
|
+
finishReason: z$1.ZodOptional<z$1.ZodString>;
|
|
1463
|
+
input: z$1.ZodOptional<z$1.ZodString>;
|
|
1464
|
+
output: z$1.ZodOptional<z$1.ZodString>;
|
|
1465
|
+
reasoning: z$1.ZodOptional<z$1.ZodString>;
|
|
1466
|
+
toolCalls: z$1.ZodOptional<z$1.ZodString>;
|
|
1467
|
+
}, z$1.core.$strip>>;
|
|
1468
|
+
metrics: z$1.ZodOptional<z$1.ZodArray<z$1.ZodObject<{
|
|
1469
|
+
label: z$1.ZodString;
|
|
1470
|
+
tooltip: z$1.ZodOptional<z$1.ZodString>;
|
|
1471
|
+
path: z$1.ZodString;
|
|
1472
|
+
format: z$1.ZodOptional<z$1.ZodEnum<{
|
|
1473
|
+
string: "string";
|
|
1474
|
+
number: "number";
|
|
1475
|
+
boolean: "boolean";
|
|
1476
|
+
duration: "duration";
|
|
1477
|
+
json: "json";
|
|
1478
|
+
}>>;
|
|
1479
|
+
numberFormat: z$1.ZodOptional<z$1.ZodType<NumberDisplayOptions, unknown, z$1.core.$ZodTypeInternals<NumberDisplayOptions, unknown>>>;
|
|
1480
|
+
placements: z$1.ZodOptional<z$1.ZodArray<z$1.ZodEnum<{
|
|
1481
|
+
header: "header";
|
|
1482
|
+
body: "body";
|
|
1483
|
+
}>>>;
|
|
1484
|
+
}, z$1.core.$strip>>>;
|
|
1485
|
+
}, z$1.core.$strip>;
|
|
1486
|
+
/** Authored LLM calls config accepted from `agent-evals.config.ts`. */
|
|
1487
|
+
type LlmCallsConfigInput = z$1.infer<typeof llmCallsConfigSchema>;
|
|
1488
|
+
/** Resolved LLM-calls config sent to the UI with all defaults applied. */
|
|
1489
|
+
type ResolvedLlmCallsConfig = {
|
|
1490
|
+
kinds: string[];
|
|
1491
|
+
attributes: {
|
|
1492
|
+
model: string;
|
|
1493
|
+
provider: string;
|
|
1494
|
+
inputTokens: string;
|
|
1495
|
+
outputTokens: string;
|
|
1496
|
+
cachedInputTokens: string;
|
|
1497
|
+
cacheCreationInputTokens: string;
|
|
1498
|
+
reasoningTokens: string;
|
|
1499
|
+
totalTokens: string;
|
|
1500
|
+
cost: string;
|
|
1501
|
+
inputCost: string;
|
|
1502
|
+
outputCost: string;
|
|
1503
|
+
cachedInputCost: string;
|
|
1504
|
+
cacheCreationInputCost: string;
|
|
1505
|
+
reasoningCost: string;
|
|
1506
|
+
steps: string;
|
|
1507
|
+
finishReason: string;
|
|
1508
|
+
input: string;
|
|
1509
|
+
output: string;
|
|
1510
|
+
reasoning: string;
|
|
1511
|
+
toolCalls: string;
|
|
1512
|
+
};
|
|
1513
|
+
metrics: ResolvedLlmCallMetric[];
|
|
1514
|
+
};
|
|
1515
|
+
/** Fully-resolved LLM-call metric used by the runner and UI. */
|
|
1516
|
+
type ResolvedLlmCallMetric = {
|
|
1517
|
+
label: string;
|
|
1518
|
+
tooltip?: string;
|
|
1519
|
+
path: string;
|
|
1520
|
+
format: LlmCallMetricFormat;
|
|
1521
|
+
numberFormat?: NumberDisplayOptions;
|
|
1522
|
+
placements: LlmCallMetricPlacement[];
|
|
1523
|
+
};
|
|
1524
|
+
/** Default LLM-calls config the UI uses before the workspace fetch resolves. */
|
|
1525
|
+
declare const DEFAULT_LLM_CALLS_CONFIG: ResolvedLlmCallsConfig;
|
|
1526
|
+
/**
|
|
1527
|
+
* Resolve the user-authored LLM-calls config to a fully-defaulted shape used
|
|
1528
|
+
* by the UI to derive the LLM calls tab.
|
|
1529
|
+
*
|
|
1530
|
+
* - Missing or empty `kinds` falls back to `['llm']`.
|
|
1531
|
+
* - Missing `attributes.<field>` falls back to the corresponding default
|
|
1532
|
+
* attribute path.
|
|
1533
|
+
* - Missing `metrics[].format` defaults to `'string'`.
|
|
1534
|
+
* - Missing `metrics[].placements` defaults to `['body']`.
|
|
1535
|
+
*/
|
|
1536
|
+
declare function resolveLlmCallsConfig(input: LlmCallsConfigInput | undefined): ResolvedLlmCallsConfig;
|
|
1429
1537
|
/** Top-level config authored in `agent-evals.config.ts`. */
|
|
1430
1538
|
type AgentEvalsConfig = {
|
|
1431
1539
|
/** Root directory used to resolve all relative paths. Defaults to `process.cwd()`. */workspaceRoot?: string; /** Glob patterns (relative to `workspaceRoot`) used to discover eval files. */
|
|
@@ -1455,6 +1563,32 @@ type AgentEvalsConfig = {
|
|
|
1455
1563
|
* definition taking precedence for matching `key` or `path` entries.
|
|
1456
1564
|
*/
|
|
1457
1565
|
traceDisplay?: TraceDisplayInputConfig;
|
|
1566
|
+
/**
|
|
1567
|
+
* Configuration for the "LLM calls" tab in the case-run drawer.
|
|
1568
|
+
*
|
|
1569
|
+
* Determines which trace spans are treated as LLM calls (`kinds`), how
|
|
1570
|
+
* structured fields like `model` and `usage.inputTokens` are read from
|
|
1571
|
+
* span attributes, and which custom user-defined metrics are surfaced on
|
|
1572
|
+
* each call. All fields are optional and fall back to the documented
|
|
1573
|
+
* defaults; the LLM calls tab is shown automatically when at least one
|
|
1574
|
+
* matching span exists in a case run.
|
|
1575
|
+
*
|
|
1576
|
+
* @example
|
|
1577
|
+
* ```ts
|
|
1578
|
+
* llmCalls: {
|
|
1579
|
+
* kinds: ['llm', 'ai-sdk.generateText'],
|
|
1580
|
+
* attributes: {
|
|
1581
|
+
* cachedInputTokens: 'usage.cache_read_input_tokens',
|
|
1582
|
+
* },
|
|
1583
|
+
* metrics: [
|
|
1584
|
+
* { label: 'Tokens/sec', path: 'tokensPerSecond', format: 'number',
|
|
1585
|
+
* numberFormat: { decimalPlaces: 1 }, placements: ['header', 'body'] },
|
|
1586
|
+
* { label: 'Retries', path: 'retryCount', format: 'number' },
|
|
1587
|
+
* ],
|
|
1588
|
+
* }
|
|
1589
|
+
* ```
|
|
1590
|
+
*/
|
|
1591
|
+
llmCalls?: LlmCallsConfigInput;
|
|
1458
1592
|
/**
|
|
1459
1593
|
* Optional controls for the operation cache. When omitted, the cache is
|
|
1460
1594
|
* enabled and stored under `<workspaceRoot>/.agent-evals/cache`.
|
|
@@ -1463,9 +1597,15 @@ type AgentEvalsConfig = {
|
|
|
1463
1597
|
/** Disable the cache entirely; spans with `cache` options execute as if uncached. */enabled?: boolean; /** Override the directory used to persist cache entries. */
|
|
1464
1598
|
dir?: string;
|
|
1465
1599
|
/**
|
|
1466
|
-
*
|
|
1467
|
-
* non-positive or non-finite values fall back to the default.
|
|
1600
|
+
* Default maximum entries retained for each cache namespace. Defaults to
|
|
1601
|
+
* `100`; non-positive or non-finite values fall back to the default.
|
|
1602
|
+
*/
|
|
1603
|
+
maxEntriesPerNamespace?: number;
|
|
1604
|
+
/**
|
|
1605
|
+
* Exact namespace-specific retention caps. Values override
|
|
1606
|
+
* `maxEntriesPerNamespace` for matching namespaces.
|
|
1468
1607
|
*/
|
|
1608
|
+
maxEntriesByNamespace?: Record<string, number>; /** Legacy alias for `maxEntriesPerNamespace`, retained so older config files keep working. */
|
|
1469
1609
|
maxEntriesPerEval?: number;
|
|
1470
1610
|
};
|
|
1471
1611
|
};
|
|
@@ -1509,13 +1649,118 @@ declare const agentEvalsConfigSchema: z$1.ZodObject<{
|
|
|
1509
1649
|
transform: z$1.ZodOptional<z$1.ZodCustom<TraceAttributeTransform, TraceAttributeTransform>>;
|
|
1510
1650
|
}, z$1.core.$strip>>>;
|
|
1511
1651
|
}, z$1.core.$strip>>;
|
|
1652
|
+
llmCalls: z$1.ZodOptional<z$1.ZodObject<{
|
|
1653
|
+
kinds: z$1.ZodOptional<z$1.ZodArray<z$1.ZodString>>;
|
|
1654
|
+
attributes: z$1.ZodOptional<z$1.ZodObject<{
|
|
1655
|
+
model: z$1.ZodOptional<z$1.ZodString>;
|
|
1656
|
+
provider: z$1.ZodOptional<z$1.ZodString>;
|
|
1657
|
+
inputTokens: z$1.ZodOptional<z$1.ZodString>;
|
|
1658
|
+
outputTokens: z$1.ZodOptional<z$1.ZodString>;
|
|
1659
|
+
cachedInputTokens: z$1.ZodOptional<z$1.ZodString>;
|
|
1660
|
+
cacheCreationInputTokens: z$1.ZodOptional<z$1.ZodString>;
|
|
1661
|
+
reasoningTokens: z$1.ZodOptional<z$1.ZodString>;
|
|
1662
|
+
totalTokens: z$1.ZodOptional<z$1.ZodString>;
|
|
1663
|
+
cost: z$1.ZodOptional<z$1.ZodString>;
|
|
1664
|
+
inputCost: z$1.ZodOptional<z$1.ZodString>;
|
|
1665
|
+
outputCost: z$1.ZodOptional<z$1.ZodString>;
|
|
1666
|
+
cachedInputCost: z$1.ZodOptional<z$1.ZodString>;
|
|
1667
|
+
cacheCreationInputCost: z$1.ZodOptional<z$1.ZodString>;
|
|
1668
|
+
reasoningCost: z$1.ZodOptional<z$1.ZodString>;
|
|
1669
|
+
steps: z$1.ZodOptional<z$1.ZodString>;
|
|
1670
|
+
finishReason: z$1.ZodOptional<z$1.ZodString>;
|
|
1671
|
+
input: z$1.ZodOptional<z$1.ZodString>;
|
|
1672
|
+
output: z$1.ZodOptional<z$1.ZodString>;
|
|
1673
|
+
reasoning: z$1.ZodOptional<z$1.ZodString>;
|
|
1674
|
+
toolCalls: z$1.ZodOptional<z$1.ZodString>;
|
|
1675
|
+
}, z$1.core.$strip>>;
|
|
1676
|
+
metrics: z$1.ZodOptional<z$1.ZodArray<z$1.ZodObject<{
|
|
1677
|
+
label: z$1.ZodString;
|
|
1678
|
+
tooltip: z$1.ZodOptional<z$1.ZodString>;
|
|
1679
|
+
path: z$1.ZodString;
|
|
1680
|
+
format: z$1.ZodOptional<z$1.ZodEnum<{
|
|
1681
|
+
string: "string";
|
|
1682
|
+
number: "number";
|
|
1683
|
+
boolean: "boolean";
|
|
1684
|
+
duration: "duration";
|
|
1685
|
+
json: "json";
|
|
1686
|
+
}>>;
|
|
1687
|
+
numberFormat: z$1.ZodOptional<z$1.ZodType<NumberDisplayOptions, unknown, z$1.core.$ZodTypeInternals<NumberDisplayOptions, unknown>>>;
|
|
1688
|
+
placements: z$1.ZodOptional<z$1.ZodArray<z$1.ZodEnum<{
|
|
1689
|
+
header: "header";
|
|
1690
|
+
body: "body";
|
|
1691
|
+
}>>>;
|
|
1692
|
+
}, z$1.core.$strip>>>;
|
|
1693
|
+
}, z$1.core.$strip>>;
|
|
1512
1694
|
cache: z$1.ZodOptional<z$1.ZodObject<{
|
|
1513
1695
|
enabled: z$1.ZodOptional<z$1.ZodBoolean>;
|
|
1514
1696
|
dir: z$1.ZodOptional<z$1.ZodString>;
|
|
1697
|
+
maxEntriesPerNamespace: z$1.ZodPipe<z$1.ZodTransform<number | undefined, unknown>, z$1.ZodOptional<z$1.ZodNumber>>;
|
|
1698
|
+
maxEntriesByNamespace: z$1.ZodOptional<z$1.ZodRecord<z$1.ZodString, z$1.ZodNumber>>;
|
|
1515
1699
|
maxEntriesPerEval: z$1.ZodPipe<z$1.ZodTransform<number | undefined, unknown>, z$1.ZodOptional<z$1.ZodNumber>>;
|
|
1516
1700
|
}, z$1.core.$strip>>;
|
|
1517
1701
|
}, z$1.core.$strip>;
|
|
1518
1702
|
//#endregion
|
|
1703
|
+
//#region ../shared/src/utils/extractLlmCalls.d.ts
|
|
1704
|
+
/** Resolved value for one user-defined metric on an LLM call row. */
|
|
1705
|
+
type LlmCallMetricValue = {
|
|
1706
|
+
label: string;
|
|
1707
|
+
tooltip: string | undefined;
|
|
1708
|
+
rawValue: unknown;
|
|
1709
|
+
format: LlmCallMetricFormat;
|
|
1710
|
+
numberFormat: NumberDisplayOptions | undefined;
|
|
1711
|
+
placements: LlmCallMetricPlacement[];
|
|
1712
|
+
};
|
|
1713
|
+
/** Single entry rendered as one expandable row in the LLM calls tab. */
|
|
1714
|
+
type LlmCallEntry = {
|
|
1715
|
+
id: string;
|
|
1716
|
+
name: string;
|
|
1717
|
+
kind: string;
|
|
1718
|
+
status: EvalTraceSpan['status'];
|
|
1719
|
+
model: string | null;
|
|
1720
|
+
provider: string | null;
|
|
1721
|
+
inputTokens: number | null;
|
|
1722
|
+
outputTokens: number | null;
|
|
1723
|
+
cachedInputTokens: number | null;
|
|
1724
|
+
cacheCreationInputTokens: number | null;
|
|
1725
|
+
reasoningTokens: number | null;
|
|
1726
|
+
totalTokens: number | null;
|
|
1727
|
+
costUsd: number | null;
|
|
1728
|
+
inputCostUsd: number | null;
|
|
1729
|
+
outputCostUsd: number | null;
|
|
1730
|
+
cachedInputCostUsd: number | null;
|
|
1731
|
+
cacheCreationInputCostUsd: number | null;
|
|
1732
|
+
reasoningCostUsd: number | null; /** Number of inference rounds. Derived from the array length when `stepDetails` is set. */
|
|
1733
|
+
stepCount: number | null; /** Per-step breakdown when the configured `steps` attribute resolves to an array. */
|
|
1734
|
+
stepDetails: unknown[] | null;
|
|
1735
|
+
finishReason: string | null;
|
|
1736
|
+
latencyMs: number | null;
|
|
1737
|
+
input: unknown;
|
|
1738
|
+
output: unknown;
|
|
1739
|
+
reasoning: unknown;
|
|
1740
|
+
toolCalls: unknown;
|
|
1741
|
+
metrics: LlmCallMetricValue[];
|
|
1742
|
+
warnings: EvalTraceSpanWarning[];
|
|
1743
|
+
error: EvalTraceSpanError | null;
|
|
1744
|
+
};
|
|
1745
|
+
/**
|
|
1746
|
+
* Filter `spans` down to LLM calls and project each one to the structured
|
|
1747
|
+
* shape consumed by the LLM calls tab.
|
|
1748
|
+
*
|
|
1749
|
+
* Spans whose `kind` is not in `config.kinds` are dropped. Structured fields
|
|
1750
|
+
* (`model`, token counts, cost, etc.) are read via `getNestedAttribute` from
|
|
1751
|
+
* the configured paths, with safe coercion to `string | null` / `number |
|
|
1752
|
+
* null`. `totalTokens` falls back to a sum of input + output + cached when no
|
|
1753
|
+
* explicit total attribute is present. The `steps` attribute path may resolve
|
|
1754
|
+
* to either a number (rendered as the inference-round count) or an array of
|
|
1755
|
+
* per-step detail objects (rendered as a Steps section in the body, with
|
|
1756
|
+
* `stepCount` derived from the array length). `latencyMs` is `null` while the
|
|
1757
|
+
* span is still running. User-defined `metrics` whose path resolves to
|
|
1758
|
+
* `undefined` are dropped, but `null`, `0`, and `false` are preserved as
|
|
1759
|
+
* legitimate values worth displaying. Original span order is preserved so the
|
|
1760
|
+
* LLM calls tab matches the ordering in the Trace tab.
|
|
1761
|
+
*/
|
|
1762
|
+
declare function extractLlmCalls(spans: EvalTraceSpan[], config: ResolvedLlmCallsConfig): LlmCallEntry[];
|
|
1763
|
+
//#endregion
|
|
1519
1764
|
//#region ../shared/src/schemas/cache.d.ts
|
|
1520
1765
|
/**
|
|
1521
1766
|
* Mode that controls how the cache is consulted for a given run.
|
|
@@ -1535,6 +1780,7 @@ type CacheMode = z$1.infer<typeof cacheModeSchema>;
|
|
|
1535
1780
|
declare const spanCacheOptionsSchema: z$1.ZodObject<{
|
|
1536
1781
|
key: z$1.ZodUnknown;
|
|
1537
1782
|
namespace: z$1.ZodOptional<z$1.ZodString>;
|
|
1783
|
+
serializeFileBytes: z$1.ZodOptional<z$1.ZodBoolean>;
|
|
1538
1784
|
}, z$1.core.$strip>;
|
|
1539
1785
|
/** Options accepted by an `evalTracer.span` call to opt the span into caching. */
|
|
1540
1786
|
type SpanCacheOptions = z$1.infer<typeof spanCacheOptionsSchema>;
|
|
@@ -1545,6 +1791,38 @@ declare const cacheOperationTypeSchema: z$1.ZodEnum<{
|
|
|
1545
1791
|
}>;
|
|
1546
1792
|
/** Category of operation stored in the eval cache. */
|
|
1547
1793
|
type CacheOperationType = z$1.infer<typeof cacheOperationTypeSchema>;
|
|
1794
|
+
/** Status of a cache lookup recorded on a span or case scope. */
|
|
1795
|
+
declare const cacheStatusSchema: z$1.ZodEnum<{
|
|
1796
|
+
bypass: "bypass";
|
|
1797
|
+
refresh: "refresh";
|
|
1798
|
+
hit: "hit";
|
|
1799
|
+
miss: "miss";
|
|
1800
|
+
}>;
|
|
1801
|
+
/** Status of a cache lookup recorded on a span or case scope. */
|
|
1802
|
+
type CacheStatus = z$1.infer<typeof cacheStatusSchema>;
|
|
1803
|
+
/**
|
|
1804
|
+
* Reference to a value-cache lookup performed via `evalTracer.cache(...)`.
|
|
1805
|
+
*
|
|
1806
|
+
* Refs are appended to the active span's `cache.refs` attribute when the call
|
|
1807
|
+
* happens inside a `traceSpan(...)` body, or to the case scope's
|
|
1808
|
+
* `caseCacheRefs` bucket when the call is made directly from the case body.
|
|
1809
|
+
*/
|
|
1810
|
+
declare const traceCacheRefSchema: z$1.ZodObject<{
|
|
1811
|
+
type: z$1.ZodLiteral<"value">;
|
|
1812
|
+
name: z$1.ZodString;
|
|
1813
|
+
namespace: z$1.ZodString;
|
|
1814
|
+
key: z$1.ZodString;
|
|
1815
|
+
status: z$1.ZodEnum<{
|
|
1816
|
+
bypass: "bypass";
|
|
1817
|
+
refresh: "refresh";
|
|
1818
|
+
hit: "hit";
|
|
1819
|
+
miss: "miss";
|
|
1820
|
+
}>;
|
|
1821
|
+
storedAt: z$1.ZodOptional<z$1.ZodString>;
|
|
1822
|
+
age: z$1.ZodOptional<z$1.ZodNumber>;
|
|
1823
|
+
}, z$1.core.$strip>;
|
|
1824
|
+
/** Reference to a value-cache lookup performed via `evalTracer.cache(...)`. */
|
|
1825
|
+
type TraceCacheRef = z$1.infer<typeof traceCacheRefSchema>;
|
|
1548
1826
|
/** Summary of a single persisted cache entry, used by list/delete endpoints. */
|
|
1549
1827
|
declare const cacheListItemSchema: z$1.ZodObject<{
|
|
1550
1828
|
key: z$1.ZodString;
|
|
@@ -1824,6 +2102,93 @@ declare const cacheFileSchema: z$1.ZodObject<{
|
|
|
1824
2102
|
/** Persisted per-owner cache file contents. */
|
|
1825
2103
|
type CacheFile = z$1.infer<typeof cacheFileSchema>;
|
|
1826
2104
|
//#endregion
|
|
2105
|
+
//#region ../shared/src/utils/extractCacheHits.d.ts
|
|
2106
|
+
/**
|
|
2107
|
+
* Single cache-hit entry rendered as one row in the case drawer's
|
|
2108
|
+
* "Cache hits" tab.
|
|
2109
|
+
*
|
|
2110
|
+
* `origin === 'span'` rows came from a span's `cache.status` attribute or from
|
|
2111
|
+
* a `cache.refs` ref attached to a span body. `origin === 'caseRoot'` rows
|
|
2112
|
+
* came from `evalTracer.cache(...)` calls made directly from the case body
|
|
2113
|
+
* (no surrounding `traceSpan`), which would otherwise be invisible.
|
|
2114
|
+
*/
|
|
2115
|
+
type CacheHitEntry = {
|
|
2116
|
+
id: string;
|
|
2117
|
+
source: 'span' | 'value';
|
|
2118
|
+
origin: 'span' | 'caseRoot';
|
|
2119
|
+
name: string;
|
|
2120
|
+
namespace: string;
|
|
2121
|
+
key: string;
|
|
2122
|
+
storedAt: string | undefined;
|
|
2123
|
+
age: number | undefined;
|
|
2124
|
+
spanId: string | undefined;
|
|
2125
|
+
};
|
|
2126
|
+
/**
|
|
2127
|
+
* Collect every `status === 'hit'` cache event recorded for a case run.
|
|
2128
|
+
*
|
|
2129
|
+
* Walks `spans` for span-level cache hits (`attributes['cache.status'] ===
|
|
2130
|
+
* 'hit'`) and per-span value-cache refs (`attributes['cache.refs']`), then
|
|
2131
|
+
* appends spanless value-cache refs persisted on the case scope. Non-hit
|
|
2132
|
+
* statuses (`miss`/`refresh`/`bypass`) are skipped — they remain visible
|
|
2133
|
+
* inline in the Trace tab.
|
|
2134
|
+
*/
|
|
2135
|
+
declare function extractCacheHits(spans: EvalTraceSpan[], caseCacheRefs: TraceCacheRef[]): CacheHitEntry[];
|
|
2136
|
+
//#endregion
|
|
2137
|
+
//#region ../shared/src/schemas/sse.d.ts
|
|
2138
|
+
declare const sseEventTypeSchema: z$1.ZodEnum<{
|
|
2139
|
+
"discovery.updated": "discovery.updated";
|
|
2140
|
+
"run.started": "run.started";
|
|
2141
|
+
"run.summary": "run.summary";
|
|
2142
|
+
"case.started": "case.started";
|
|
2143
|
+
"case.updated": "case.updated";
|
|
2144
|
+
"case.finished": "case.finished";
|
|
2145
|
+
"trace.span": "trace.span";
|
|
2146
|
+
"run.finished": "run.finished";
|
|
2147
|
+
"run.cancelled": "run.cancelled";
|
|
2148
|
+
"run.error": "run.error";
|
|
2149
|
+
}>;
|
|
2150
|
+
/** Server-sent event name emitted by the runner or backend. */
|
|
2151
|
+
type SseEventType = z$1.infer<typeof sseEventTypeSchema>;
|
|
2152
|
+
/** Schema for the SSE envelope used to stream run updates to clients. */
|
|
2153
|
+
declare const sseEnvelopeSchema: z$1.ZodObject<{
|
|
2154
|
+
type: z$1.ZodString;
|
|
2155
|
+
runId: z$1.ZodOptional<z$1.ZodString>;
|
|
2156
|
+
timestamp: z$1.ZodString;
|
|
2157
|
+
payload: z$1.ZodUnknown;
|
|
2158
|
+
}, z$1.core.$strip>;
|
|
2159
|
+
/** Wire format for a streamed event emitted during eval execution. */
|
|
2160
|
+
type SseEnvelope = z$1.infer<typeof sseEnvelopeSchema>;
|
|
2161
|
+
//#endregion
|
|
2162
|
+
//#region ../shared/src/schemas/api.d.ts
|
|
2163
|
+
/** Schema for the API request that starts a new eval run. */
|
|
2164
|
+
declare const createRunRequestSchema: z$1.ZodObject<{
|
|
2165
|
+
target: z$1.ZodObject<{
|
|
2166
|
+
mode: z$1.ZodEnum<{
|
|
2167
|
+
all: "all";
|
|
2168
|
+
evalIds: "evalIds";
|
|
2169
|
+
caseIds: "caseIds";
|
|
2170
|
+
}>;
|
|
2171
|
+
evalIds: z$1.ZodOptional<z$1.ZodArray<z$1.ZodString>>;
|
|
2172
|
+
caseIds: z$1.ZodOptional<z$1.ZodArray<z$1.ZodString>>;
|
|
2173
|
+
}, z$1.core.$strip>;
|
|
2174
|
+
trials: z$1.ZodNumber;
|
|
2175
|
+
cache: z$1.ZodOptional<z$1.ZodObject<{
|
|
2176
|
+
mode: z$1.ZodDefault<z$1.ZodEnum<{
|
|
2177
|
+
use: "use";
|
|
2178
|
+
bypass: "bypass";
|
|
2179
|
+
refresh: "refresh";
|
|
2180
|
+
}>>;
|
|
2181
|
+
}, z$1.core.$strip>>;
|
|
2182
|
+
}, z$1.core.$strip>;
|
|
2183
|
+
/** Request payload accepted by the run creation endpoint. */
|
|
2184
|
+
type CreateRunRequest = z$1.infer<typeof createRunRequestSchema>;
|
|
2185
|
+
/** Schema for updating a UI-authored manual score on one persisted case. */
|
|
2186
|
+
declare const updateManualScoreRequestSchema: z$1.ZodObject<{
|
|
2187
|
+
value: z$1.ZodNullable<z$1.ZodNumber>;
|
|
2188
|
+
}, z$1.core.$strip>;
|
|
2189
|
+
/** Request payload accepted by the manual score update endpoint. */
|
|
2190
|
+
type UpdateManualScoreRequest = z$1.infer<typeof updateManualScoreRequestSchema>;
|
|
2191
|
+
//#endregion
|
|
1827
2192
|
//#region ../sdk/src/types.d.ts
|
|
1828
2193
|
/** Single authored eval case with its stable identifier and input payload. */
|
|
1829
2194
|
type EvalCase<TInput> = {
|
|
@@ -1831,7 +2196,7 @@ type EvalCase<TInput> = {
|
|
|
1831
2196
|
input: TInput;
|
|
1832
2197
|
tags?: string[];
|
|
1833
2198
|
};
|
|
1834
|
-
/** Runtime output values collected from `setEvalOutput` and `deriveFromTracing`. */
|
|
2199
|
+
/** Runtime output values collected from output helpers and `deriveFromTracing`. */
|
|
1835
2200
|
type EvalOutputs = Record<string, unknown>;
|
|
1836
2201
|
/**
|
|
1837
2202
|
* Schema used to validate and type an eval's collected runtime outputs.
|
|
@@ -1884,9 +2249,31 @@ type EvalTraceTree = {
|
|
|
1884
2249
|
flattenDfs: () => EvalTraceSpan[];
|
|
1885
2250
|
checkpoints: Map<string, unknown>;
|
|
1886
2251
|
};
|
|
2252
|
+
/** Type-safe output writer passed to an eval's `execute` function. */
|
|
2253
|
+
type EvalSetOutput<TOutputs extends EvalOutputs = EvalOutputs> = <TKey extends Extract<keyof TOutputs, string>>(
|
|
2254
|
+
/**
|
|
2255
|
+
* Output field to record. For narrowed output maps, this must be one of the
|
|
2256
|
+
* known output keys.
|
|
2257
|
+
*/
|
|
2258
|
+
|
|
2259
|
+
key: TKey,
|
|
2260
|
+
/**
|
|
2261
|
+
* Value for the output field. For narrowed output maps, this must match the
|
|
2262
|
+
* field's declared output type.
|
|
2263
|
+
*/
|
|
2264
|
+
|
|
2265
|
+
value: TOutputs[TKey]) => void;
|
|
1887
2266
|
/** Context passed to an eval's `execute` function for a single case run. */
|
|
1888
|
-
type EvalExecuteContext<TInput> = {
|
|
1889
|
-
input: TInput;
|
|
2267
|
+
type EvalExecuteContext<TInput, TOutputs extends EvalOutputs = EvalOutputs> = {
|
|
2268
|
+
/** Authored input for the active eval case. */input: TInput;
|
|
2269
|
+
/**
|
|
2270
|
+
* Record or replace an output value for the current case scope.
|
|
2271
|
+
*
|
|
2272
|
+
* When the eval has a narrowed outputs generic, keys and values are typed
|
|
2273
|
+
* from that output map. The recorded values are still validated by
|
|
2274
|
+
* `outputsSchema` before computed scores run.
|
|
2275
|
+
*/
|
|
2276
|
+
setOutput: EvalSetOutput<TOutputs>;
|
|
1890
2277
|
};
|
|
1891
2278
|
/** Context passed to `deriveFromTracing` after execution has completed. */
|
|
1892
2279
|
type EvalDeriveContext<TInput> = {
|
|
@@ -1928,8 +2315,31 @@ type EvalManualScoreDef = EvalColumnOverride & {
|
|
|
1928
2315
|
*/
|
|
1929
2316
|
passThreshold?: number;
|
|
1930
2317
|
};
|
|
1931
|
-
|
|
1932
|
-
type EvalDefinition<TInput = unknown, TOutputs extends EvalOutputs = EvalOutputs> = {
|
|
2318
|
+
type EvalDefinitionOutputSchemaConfig<TOutputs extends EvalOutputs> = [EvalOutputs] extends [TOutputs] ? {
|
|
2319
|
+
/**
|
|
2320
|
+
* Optional schema for runtime outputs collected through output helpers
|
|
2321
|
+
* and `deriveFromTracing`.
|
|
2322
|
+
*
|
|
2323
|
+
* The runner validates configured output fields before scoring. For
|
|
2324
|
+
* Zod object schemas, only declared keys are passed to the schema;
|
|
2325
|
+
* parsed fields are merged back into the raw output map, so schema
|
|
2326
|
+
* defaults and transforms apply to configured fields while
|
|
2327
|
+
* unconfigured outputs are kept unchanged. Validation failures mark
|
|
2328
|
+
* the case as failed and skip computed scores.
|
|
2329
|
+
*/
|
|
2330
|
+
outputsSchema?: EvalOutputsSchema<TOutputs>;
|
|
2331
|
+
} : {
|
|
2332
|
+
/**
|
|
2333
|
+
* Required schema for typed runtime outputs collected through output
|
|
2334
|
+
* helpers and `deriveFromTracing`.
|
|
2335
|
+
*
|
|
2336
|
+
* When `EvalDefinition` or `defineEval` receives an explicit narrowed
|
|
2337
|
+
* outputs generic, this schema is required so scorer inputs are backed
|
|
2338
|
+
* by runtime validation before computed scores run.
|
|
2339
|
+
*/
|
|
2340
|
+
outputsSchema: EvalOutputsSchema<TOutputs>;
|
|
2341
|
+
};
|
|
2342
|
+
type EvalDefinitionBase<TInput = unknown, TOutputs extends EvalOutputs = EvalOutputs> = {
|
|
1933
2343
|
id: string;
|
|
1934
2344
|
title?: string;
|
|
1935
2345
|
/**
|
|
@@ -1939,17 +2349,6 @@ type EvalDefinition<TInput = unknown, TOutputs extends EvalOutputs = EvalOutputs
|
|
|
1939
2349
|
* eval once using a synthetic case with empty object input.
|
|
1940
2350
|
*/
|
|
1941
2351
|
cases?: EvalCase<TInput>[] | (() => Promise<EvalCase<TInput>[]>);
|
|
1942
|
-
/**
|
|
1943
|
-
* Optional schema for runtime outputs collected through `setEvalOutput` and
|
|
1944
|
-
* `deriveFromTracing`.
|
|
1945
|
-
*
|
|
1946
|
-
* The runner validates configured output fields before scoring. For Zod
|
|
1947
|
-
* object schemas, only declared keys are passed to the schema; parsed fields
|
|
1948
|
-
* are merged back into the raw output map, so schema defaults and transforms
|
|
1949
|
-
* apply to configured fields while unconfigured outputs are kept unchanged.
|
|
1950
|
-
* Validation failures mark the case as failed and skip computed scores.
|
|
1951
|
-
*/
|
|
1952
|
-
outputsSchema?: EvalOutputsSchema<TOutputs>;
|
|
1953
2352
|
columns?: EvalColumns;
|
|
1954
2353
|
/**
|
|
1955
2354
|
* Per-eval trace attribute display rules for the UI.
|
|
@@ -1959,7 +2358,7 @@ type EvalDefinition<TInput = unknown, TOutputs extends EvalOutputs = EvalOutputs
|
|
|
1959
2358
|
* `key` is provided.
|
|
1960
2359
|
*/
|
|
1961
2360
|
traceDisplay?: TraceDisplayInputConfig;
|
|
1962
|
-
execute: (ctx: EvalExecuteContext<TInput>) => Promise<void> | void;
|
|
2361
|
+
execute: (ctx: EvalExecuteContext<TInput, TOutputs>) => Promise<void> | void;
|
|
1963
2362
|
deriveFromTracing?: (ctx: EvalDeriveContext<TInput>) => Partial<TOutputs> | Promise<Partial<TOutputs>>;
|
|
1964
2363
|
scores?: Record<string, EvalScoreDef<TInput, TOutputs>>;
|
|
1965
2364
|
/**
|
|
@@ -1994,13 +2393,21 @@ type EvalDefinition<TInput = unknown, TOutputs extends EvalOutputs = EvalOutputs
|
|
|
1994
2393
|
*
|
|
1995
2394
|
* Each chart declares its `type` (`area | line | bar`) and one or more
|
|
1996
2395
|
* `metrics`. Built-in metrics (`passRate`, `durationMs`) aggregate
|
|
1997
|
-
* the run summary. Column metrics aggregate a score or numeric
|
|
1998
|
-
*
|
|
1999
|
-
* `
|
|
2000
|
-
*
|
|
2396
|
+
* the run summary. Column metrics aggregate a score or numeric output column
|
|
2397
|
+
* across the run using an `aggregate` reducer (`avg`, `sum`, `min`, `max`,
|
|
2398
|
+
* `latest`, `passThresholdRate`). `passThresholdRate` requires a score column
|
|
2399
|
+
* with `passThreshold`.
|
|
2001
2400
|
*/
|
|
2002
2401
|
charts?: EvalChartsConfig;
|
|
2003
2402
|
};
|
|
2403
|
+
/**
|
|
2404
|
+
* Complete authored eval definition consumed by `defineEval`.
|
|
2405
|
+
*
|
|
2406
|
+
* `outputsSchema` is optional for the default loose output map. When the
|
|
2407
|
+
* `TOutputs` generic is narrowed, `outputsSchema` is required so the runtime
|
|
2408
|
+
* validates collected outputs before exposing them as typed scorer inputs.
|
|
2409
|
+
*/
|
|
2410
|
+
type EvalDefinition<TInput = unknown, TOutputs extends EvalOutputs = EvalOutputs> = EvalDefinitionBase<TInput, TOutputs> & EvalDefinitionOutputSchemaConfig<TOutputs>;
|
|
2004
2411
|
//#endregion
|
|
2005
2412
|
//#region ../sdk/src/defineEval.d.ts
|
|
2006
2413
|
/**
|
|
@@ -2058,7 +2465,9 @@ type CacheRecordingFrame = {
|
|
|
2058
2465
|
};
|
|
2059
2466
|
/** Mutable per-case runtime state stored in async local storage. */
|
|
2060
2467
|
type EvalCaseScope = {
|
|
2061
|
-
caseId: string; /**
|
|
2468
|
+
caseId: string; /** Stable prefix used by `nextEvalId()` for this eval case scope. */
|
|
2469
|
+
idPrefix: string | undefined; /** Monotonic per-scope counter used by `nextEvalId()`. */
|
|
2470
|
+
nextEvalIdCounter: number; /** Authored input for the current case, when provided by the runner. */
|
|
2062
2471
|
input?: unknown;
|
|
2063
2472
|
outputs: Record<string, unknown>; /** Structured assertion failures recorded for the current case. */
|
|
2064
2473
|
assertionFailures: AssertionFailure[];
|
|
@@ -2077,6 +2486,12 @@ type EvalCaseScope = {
|
|
|
2077
2486
|
*/
|
|
2078
2487
|
replayingDepth: number; /** Runner-provided cache adapter + mode; absent when caching is disabled. */
|
|
2079
2488
|
cacheContext: CacheScopeContext | undefined;
|
|
2489
|
+
/**
|
|
2490
|
+
* Value-cache refs recorded by `evalTracer.cache(...)` calls made with no
|
|
2491
|
+
* active span. Span-bound refs are appended to the owning span's
|
|
2492
|
+
* `cache.refs` attribute instead.
|
|
2493
|
+
*/
|
|
2494
|
+
caseCacheRefs: TraceCacheRef[];
|
|
2080
2495
|
};
|
|
2081
2496
|
/** Error thrown when an eval assertion fails during case execution. */
|
|
2082
2497
|
declare class EvalAssertionError extends Error {
|
|
@@ -2109,7 +2524,8 @@ declare function getEvalCaseInput(path: string): unknown;
|
|
|
2109
2524
|
declare function setScopeCacheContext(scope: EvalCaseScope, context: CacheScopeContext): void;
|
|
2110
2525
|
/** Optional inputs accepted when starting a new eval case scope. */
|
|
2111
2526
|
type RunInEvalScopeOptions = {
|
|
2112
|
-
/** Authored input for the active eval case. */input?: unknown; /**
|
|
2527
|
+
/** Authored input for the active eval case. */input?: unknown; /** Stable prefix used when generating scoped IDs with `nextEvalId()`. */
|
|
2528
|
+
idPrefix?: string; /** Cache adapter + mode attached to the scope before `fn` runs. */
|
|
2113
2529
|
cacheContext?: CacheScopeContext;
|
|
2114
2530
|
};
|
|
2115
2531
|
/**
|
|
@@ -2121,6 +2537,15 @@ declare function runInEvalScope<T>(caseId: string, fn: () => Promise<T> | T, opt
|
|
|
2121
2537
|
scope: EvalCaseScope;
|
|
2122
2538
|
error: Error | undefined;
|
|
2123
2539
|
}>;
|
|
2540
|
+
/**
|
|
2541
|
+
* Return the next deterministic ID for the active eval case execution.
|
|
2542
|
+
*
|
|
2543
|
+
* The runner derives the ID prefix from the eval file, eval id, and case id,
|
|
2544
|
+
* then this helper appends a per-scope sequence number. Calls outside an
|
|
2545
|
+
* active eval case scope throw so accidental product-code usage is caught
|
|
2546
|
+
* immediately.
|
|
2547
|
+
*/
|
|
2548
|
+
declare function nextEvalId(): string;
|
|
2124
2549
|
/**
|
|
2125
2550
|
* Record or replace an output value for the current case scope.
|
|
2126
2551
|
*
|
|
@@ -2171,43 +2596,46 @@ type CaptureEvalSpanErrorOptions = {
|
|
|
2171
2596
|
level?: CaptureEvalSpanErrorLevel;
|
|
2172
2597
|
};
|
|
2173
2598
|
//#endregion
|
|
2174
|
-
//#region ../sdk/src/cacheRecording.d.ts
|
|
2175
|
-
/** Cache reference appended to the active span by `evalTracer.cache(...)`. */
|
|
2176
|
-
type TraceCacheRef = {
|
|
2177
|
-
type: 'value';
|
|
2178
|
-
name: string;
|
|
2179
|
-
namespace: string;
|
|
2180
|
-
key: string;
|
|
2181
|
-
status: 'hit' | 'miss' | 'refresh' | 'bypass';
|
|
2182
|
-
storedAt?: string;
|
|
2183
|
-
age?: number;
|
|
2184
|
-
};
|
|
2185
|
-
//#endregion
|
|
2186
2599
|
//#region ../sdk/src/valueCache.d.ts
|
|
2187
2600
|
/** Info accepted by `evalTracer.cache(info, fn)` for spanless value caching. */
|
|
2188
2601
|
type TraceCacheInfo = {
|
|
2189
2602
|
/** Display name used for cache listings and the default namespace. */name: string; /** Arbitrary JSON-safe value used to derive the cache key. */
|
|
2190
2603
|
key: unknown; /** Override the default namespace (`${evalId}__${name}`). */
|
|
2191
2604
|
namespace?: string;
|
|
2605
|
+
/**
|
|
2606
|
+
* Include native `Blob`/`File` bytes in the cache key. By default only stable
|
|
2607
|
+
* metadata (`type`, `size`, plus `name`/`lastModified` for `File`) is used.
|
|
2608
|
+
*/
|
|
2609
|
+
serializeFileBytes?: boolean;
|
|
2192
2610
|
};
|
|
2193
2611
|
//#endregion
|
|
2194
2612
|
//#region ../sdk/src/cacheKey.d.ts
|
|
2613
|
+
/** Components folded into a deterministic cache key hash. */
|
|
2195
2614
|
type CacheKeyHashInput = {
|
|
2196
|
-
namespace: string;
|
|
2197
|
-
codeFingerprint: string;
|
|
2615
|
+
/** Cache namespace, usually derived from the eval id and operation name. */namespace: string; /** Eval source fingerprint used to invalidate cache entries on code edits. */
|
|
2616
|
+
codeFingerprint: string; /** User-authored cache key value. */
|
|
2198
2617
|
key: unknown;
|
|
2199
2618
|
};
|
|
2619
|
+
/** Optional controls for cache key hashing. */
|
|
2620
|
+
type CacheKeyHashOptions = {
|
|
2621
|
+
/**
|
|
2622
|
+
* When true, native `Blob` and `File` values are read asynchronously and
|
|
2623
|
+
* hashed by bytes plus stable metadata. Defaults to metadata-only hashing.
|
|
2624
|
+
*/
|
|
2625
|
+
serializeFileBytes?: boolean;
|
|
2626
|
+
};
|
|
2200
2627
|
/**
|
|
2201
2628
|
* Hash the components of a cache key into a deterministic hex digest.
|
|
2202
2629
|
*
|
|
2203
|
-
* Native `Blob` and `File` values
|
|
2204
|
-
*
|
|
2630
|
+
* Native `Blob` and `File` values use stable metadata by default. Pass
|
|
2631
|
+
* `serializeFileBytes: true` to read them asynchronously and include their byte
|
|
2632
|
+
* hash in the key.
|
|
2205
2633
|
*/
|
|
2206
|
-
declare function hashCacheKey(input: CacheKeyHashInput): Promise<string>;
|
|
2634
|
+
declare function hashCacheKey(input: CacheKeyHashInput, options?: CacheKeyHashOptions): Promise<string>;
|
|
2207
2635
|
/**
|
|
2208
2636
|
* Synchronously hash cache key components. This supports JSON-like data and
|
|
2209
2637
|
* in-memory binary values such as `Buffer`, `ArrayBuffer`, and typed arrays,
|
|
2210
|
-
*
|
|
2638
|
+
* plus stable metadata for native `Blob` and `File` values.
|
|
2211
2639
|
*/
|
|
2212
2640
|
declare function hashCacheKeySync(input: CacheKeyHashInput): string;
|
|
2213
2641
|
//#endregion
|
|
@@ -2296,8 +2724,8 @@ type TraceSpanInfoUncached = TraceSpanInfoBase & {
|
|
|
2296
2724
|
/**
|
|
2297
2725
|
* Info accepted by `evalTracer.span(info, fn)` when opting in to caching.
|
|
2298
2726
|
*
|
|
2299
|
-
* Cached spans return `Promise<unknown>` because the replayed value
|
|
2300
|
-
*
|
|
2727
|
+
* Cached spans return `Promise<unknown>` because the replayed value is revived
|
|
2728
|
+
* from persisted cache data on hit. Narrow the value yourself when you need a
|
|
2301
2729
|
* typed return.
|
|
2302
2730
|
*/
|
|
2303
2731
|
type TraceSpanInfoCached = TraceSpanInfoBase & {
|
|
@@ -2388,9 +2816,23 @@ type EvalRunner = {
|
|
|
2388
2816
|
subscribe(runId: string, listener: (event: SseEnvelope) => void): () => void; /** Subscribe to discovery updates triggered by file changes or manual refresh. */
|
|
2389
2817
|
subscribeDiscovery(listener: (event: SseEnvelope) => void): () => void; /** Stop background filesystem watchers owned by this runner instance. */
|
|
2390
2818
|
close(): Promise<void>; /** Resolve the workspace root backing this runner instance. */
|
|
2391
|
-
getWorkspaceRoot(): string;
|
|
2819
|
+
getWorkspaceRoot(): string;
|
|
2820
|
+
/**
|
|
2821
|
+
* Resolved LLM-calls config used by the UI to derive the LLM calls tab.
|
|
2822
|
+
*
|
|
2823
|
+
* Returns the workspace's `llmCalls` config block from
|
|
2824
|
+
* `agent-evals.config.ts` with all defaults applied.
|
|
2825
|
+
*/
|
|
2826
|
+
getLlmCallsConfig(): ResolvedLlmCallsConfig; /** Resolve a persisted artifact path when artifact storage is supported. */
|
|
2392
2827
|
getArtifactPath(artifactId: string): string | undefined; /** Return summaries for every persisted cache entry in the workspace. */
|
|
2393
2828
|
listCache(): Promise<CacheListItem[]>;
|
|
2829
|
+
/**
|
|
2830
|
+
* Return the full persisted cache entry for `namespace` + `key`, including
|
|
2831
|
+
* its recording. Returns `null` when no entry matches. Used by the case
|
|
2832
|
+
* drawer's Cache hits tab to lazily fetch the cached return value when a
|
|
2833
|
+
* row is expanded.
|
|
2834
|
+
*/
|
|
2835
|
+
getCacheEntry(namespace: string, key: string): Promise<CacheEntry | null>;
|
|
2394
2836
|
/**
|
|
2395
2837
|
* Remove cache entries matching `filter`, or all entries when no filter is
|
|
2396
2838
|
* supplied.
|
|
@@ -2445,4 +2887,4 @@ declare function createRunner({
|
|
|
2445
2887
|
*/
|
|
2446
2888
|
declare function runCli(argv: string[]): Promise<void>;
|
|
2447
2889
|
//#endregion
|
|
2448
|
-
export { type AgentEvalsConfig, type AssertionFailure, type CacheAdapter, type CacheEntry, type CacheFile, type CacheListItem, type CacheMode, type CacheOperationType, type CacheRecording, type CacheRecordingFrame, type CacheRecordingOp, type CacheScopeContext, type CaptureEvalSpanErrorLevel, type CaptureEvalSpanErrorOptions, type CaseDetail, type CaseRow, type CellValue, type ColumnDef, type ColumnFormat, type ColumnKind, type CreateRunRequest, type DerivedStatus, EvalAssertionError, type EvalCase, type EvalCaseScope, type EvalChartAggregate, type EvalChartAxis, type EvalChartBuiltinMetric, type EvalChartColor, type EvalChartConfig, type EvalChartMetric, type EvalChartTooltipExtra, type EvalChartType, type EvalChartsConfig, type EvalColumnOverride, type EvalColumns, type EvalDefinition, type EvalDeriveContext, type EvalDisplayStatus, type EvalExecuteContext, type EvalFreshnessStatus, type EvalManualScoreDef, type EvalOutputs, type EvalOutputsSchema, type EvalRunner, type EvalScoreContext, type EvalScoreDef, type EvalScoreFn, type EvalStatAggregate, type EvalStatItem, type EvalStatsConfig, type EvalSummary, type EvalTraceSpan, type EvalTraceSpanError, type EvalTraceSpanWarning, type EvalTraceTree, type FileRef, type JsonCell, type NumberDisplayOptions, type RepoFileRef, type RunArtifactRef, type RunInEvalScopeOptions, type RunManifest, type RunSummary, type ScalarCell, type ScopedCaseSummary, type ScoreTrace, type SerializedCacheSpan, type SpanCacheOptions, type SseEnvelope, type SseEventType, type TraceActiveSpan, type TraceAttributeDisplay, type TraceAttributeDisplayFormat, type TraceAttributeDisplayInput, type TraceAttributeDisplayPlacement, type TraceAttributeTransform, type TraceAttributeTransformContext, type TraceCacheInfo, type TraceCacheRef, type TraceDisplayConfig, type TraceDisplayInputConfig, type TraceSpanInfo, type TrialSelectionMode, type UpdateManualScoreRequest, agentEvalsConfigSchema, appendToEvalOutput, assertionFailureSchema, buildTraceTree, 
cacheEntrySchema, cacheFileSchema, cacheListItemSchema, cacheModeSchema, cacheOperationTypeSchema, cacheRecordingOpSchema, cacheRecordingSchema, captureEvalSpanError, caseDetailSchema, caseRowSchema, cellValueSchema, columnDefSchema, columnFormatSchema, columnKindSchema, createRunRequestSchema, createRunner, defineEval, deriveScopedSummaryFromCases, deriveStatusFromCaseRows, deriveStatusFromChildStatuses, evalAssert, evalChartAggregateSchema, evalChartAxisSchema, evalChartBuiltinMetricSchema, evalChartColorSchema, evalChartConfigSchema, evalChartMetricSchema, evalChartTooltipExtraSchema, evalChartTypeSchema, evalChartsConfigSchema, evalFreshnessStatusSchema, evalSpan, evalStatAggregateSchema, evalStatItemSchema, evalStatsConfigSchema, evalSummarySchema, evalTracer, fileRefSchema, getCurrentScope, getEvalCaseInput, getEvalDisplayStatus, getEvalRegistry, getEvalTitle, hashCacheKey, hashCacheKeySync, incrementEvalOutput, isInEvalScope, jsonCellSchema, mergeEvalOutput, numberDisplayOptionsSchema, repoFile, repoFileRefSchema, runArtifactRefSchema, runCli, runInEvalScope, runManifestSchema, runSummarySchema, scoreTraceSchema, serializedCacheSpanSchema, setEvalOutput, setScopeCacheContext, spanCacheOptionsSchema, sseEnvelopeSchema, traceAttributeDisplayFormatSchema, traceAttributeDisplayInputSchema, traceAttributeDisplayPlacementSchema, traceAttributeDisplaySchema, traceDisplayConfigSchema, traceDisplayInputConfigSchema, traceSpanErrorSchema, traceSpanKindSchema, traceSpanSchema, traceSpanWarningSchema, trialSelectionModeSchema, updateManualScoreRequestSchema, z };
|
|
2890
|
+
export { type AgentEvalsConfig, type AssertionFailure, type CacheAdapter, type CacheEntry, type CacheFile, type CacheHitEntry, type CacheKeyHashInput, type CacheKeyHashOptions, type CacheListItem, type CacheMode, type CacheOperationType, type CacheRecording, type CacheRecordingFrame, type CacheRecordingOp, type CacheScopeContext, type CacheStatus, type CaptureEvalSpanErrorLevel, type CaptureEvalSpanErrorOptions, type CaseDetail, type CaseRow, type CellValue, type ColumnDef, type ColumnFormat, type ColumnKind, type CreateRunRequest, DEFAULT_LLM_CALLS_CONFIG, type DerivedStatus, EvalAssertionError, type EvalCase, type EvalCaseScope, type EvalChartAggregate, type EvalChartAxis, type EvalChartBuiltinMetric, type EvalChartColor, type EvalChartConfig, type EvalChartMetric, type EvalChartTooltipExtra, type EvalChartType, type EvalChartsConfig, type EvalColumnOverride, type EvalColumns, type EvalDefinition, type EvalDeriveContext, type EvalDisplayStatus, type EvalExecuteContext, type EvalFreshnessStatus, type EvalManualScoreDef, type EvalOutputs, type EvalOutputsSchema, type EvalRunner, type EvalScoreContext, type EvalScoreDef, type EvalScoreFn, type EvalSetOutput, type EvalStatAggregate, type EvalStatItem, type EvalStatsConfig, type EvalSummary, type EvalTraceSpan, type EvalTraceSpanError, type EvalTraceSpanWarning, type EvalTraceTree, type FileRef, type JsonCell, type LlmCallEntry, type LlmCallMetric, type LlmCallMetricFormat, type LlmCallMetricPlacement, type LlmCallMetricValue, type LlmCallsConfigInput, type NumberDisplayOptions, type RepoFileRef, type ResolvedLlmCallMetric, type ResolvedLlmCallsConfig, type RunArtifactRef, type RunInEvalScopeOptions, type RunManifest, type RunSummary, type ScalarCell, type ScopedCaseSummary, type ScoreTrace, type SerializedCacheSpan, type SpanCacheOptions, type SseEnvelope, type SseEventType, type TraceActiveSpan, type TraceAttributeDisplay, type TraceAttributeDisplayFormat, type TraceAttributeDisplayInput, type 
TraceAttributeDisplayPlacement, type TraceAttributeTransform, type TraceAttributeTransformContext, type TraceCacheInfo, type TraceCacheRef, type TraceDisplayConfig, type TraceDisplayInputConfig, type TraceSpanInfo, type TrialSelectionMode, type UpdateManualScoreRequest, agentEvalsConfigSchema, appendToEvalOutput, assertionFailureSchema, buildTraceTree, cacheEntrySchema, cacheFileSchema, cacheListItemSchema, cacheModeSchema, cacheOperationTypeSchema, cacheRecordingOpSchema, cacheRecordingSchema, cacheStatusSchema, captureEvalSpanError, caseDetailSchema, caseRowSchema, cellValueSchema, columnDefSchema, columnFormatSchema, columnKindSchema, createRunRequestSchema, createRunner, defineEval, deriveScopedSummaryFromCases, deriveStatusFromCaseRows, deriveStatusFromChildStatuses, evalAssert, evalChartAggregateSchema, evalChartAxisSchema, evalChartBuiltinMetricSchema, evalChartColorSchema, evalChartConfigSchema, evalChartMetricSchema, evalChartTooltipExtraSchema, evalChartTypeSchema, evalChartsConfigSchema, evalFreshnessStatusSchema, evalSpan, evalStatAggregateSchema, evalStatItemSchema, evalStatsConfigSchema, evalSummarySchema, evalTracer, extractCacheHits, extractLlmCalls, fileRefSchema, getCurrentScope, getEvalCaseInput, getEvalDisplayStatus, getEvalRegistry, getEvalTitle, getNestedAttribute, hashCacheKey, hashCacheKeySync, incrementEvalOutput, isInEvalScope, jsonCellSchema, llmCallMetricFormatSchema, llmCallMetricPlacementSchema, llmCallMetricSchema, llmCallsConfigSchema, mergeEvalOutput, nextEvalId, numberDisplayOptionsSchema, repoFile, repoFileRefSchema, resolveLlmCallsConfig, runArtifactRefSchema, runCli, runInEvalScope, runManifestSchema, runSummarySchema, scoreTraceSchema, serializedCacheSpanSchema, setEvalOutput, setScopeCacheContext, spanCacheOptionsSchema, sseEnvelopeSchema, traceAttributeDisplayFormatSchema, traceAttributeDisplayInputSchema, traceAttributeDisplayPlacementSchema, traceAttributeDisplaySchema, traceCacheRefSchema, traceDisplayConfigSchema, 
traceDisplayInputConfigSchema, traceSpanErrorSchema, traceSpanKindSchema, traceSpanSchema, traceSpanWarningSchema, trialSelectionModeSchema, updateManualScoreRequestSchema, z };
|