@ls-stack/agent-eval 0.9.0 → 0.11.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -25,8 +25,8 @@
25
25
  href="https://fonts.googleapis.com/css2?family=Geist:wght@400;500;600&family=JetBrains+Mono:wght@400;500&display=swap"
26
26
  rel="stylesheet"
27
27
  />
28
- <script type="module" crossorigin src="/assets/index-CmY0_D5Z.js"></script>
29
- <link rel="stylesheet" crossorigin href="/assets/index-ChgByJbI.css">
28
+ <script type="module" crossorigin src="/assets/index-CsSDwFI1.js"></script>
29
+ <link rel="stylesheet" crossorigin href="/assets/index-BZ60j9UY.css">
30
30
  </head>
31
31
  <body>
32
32
  <div id="root"></div>
package/dist/bin.mjs CHANGED
@@ -1,5 +1,5 @@
1
1
  #!/usr/bin/env node
2
- import { t as runCli } from "./cli-DrPk66xh.mjs";
2
+ import { t as runCli } from "./cli-COzPxKg2.mjs";
3
3
  import { spawn } from "node:child_process";
4
4
  //#region src/bin.ts
5
5
  const moduleMocksFlag = "--experimental-test-module-mocks";
@@ -1,4 +1,4 @@
1
- import { A as deriveScopedSummaryFromCases, O as getEvalTitle, P as runSummarySchema, V as resolveLlmCallsConfig, _ as loadEvalModule, a as loadPersistedRunSnapshot, b as normalizeScoreDef, c as persistCaseDetail, d as recomputePersistedCaseStatus, f as runTouchesEval, g as setLatestRunInfoMap, h as getTargetEvalIds, i as getLatestRunInfos, k as getEvalDisplayStatus, l as persistRunState, m as buildEvalSummary, n as generateRunId, o as loadPersistedRunSnapshots, p as resolveArtifactPath, r as getLastRunStatuses, rn as getEvalRegistry, s as nextShortIdFromSnapshots, u as recomputeEvalStatusesInRuns, v as loadConfig, x as createFsCacheStore, y as buildDeclaredColumnDefs } from "./runOrchestration-DA4Rh5g0.mjs";
1
+ import { A as getEvalDisplayStatus, F as runSummarySchema, J as resolveLlmCallsConfig, _ as loadEvalModule, a as loadPersistedRunSnapshot, b as normalizeScoreDef, c as persistCaseDetail, d as recomputePersistedCaseStatus, dn as getEvalRegistry, f as runTouchesEval, g as setLatestRunInfoMap, h as getTargetEvalIds, i as getLatestRunInfos, j as deriveScopedSummaryFromCases, k as getEvalTitle, l as persistRunState, m as buildEvalSummary, n as generateRunId, o as loadPersistedRunSnapshots, p as resolveArtifactPath, q as resolveApiCallsConfig, r as getLastRunStatuses, s as nextShortIdFromSnapshots, u as recomputeEvalStatusesInRuns, v as loadConfig, x as createFsCacheStore, y as buildDeclaredColumnDefs } from "./runOrchestration-COFhQvTJ.mjs";
2
2
  import { createHash } from "node:crypto";
3
3
  import { mkdir, readFile, rm, writeFile } from "node:fs/promises";
4
4
  import { dirname, join, relative, resolve } from "node:path";
@@ -418,6 +418,7 @@ function createRunner({ watchForChanges = true } = {}) {
418
418
  let localStateDir;
419
419
  let cacheStore;
420
420
  let llmCallsConfig = resolveLlmCallsConfig(void 0);
421
+ let apiCallsConfig = resolveApiCallsConfig(void 0);
421
422
  const evals = /* @__PURE__ */ new Map();
422
423
  const runs = /* @__PURE__ */ new Map();
423
424
  const lastRunStatusMap = /* @__PURE__ */ new Map();
@@ -441,6 +442,7 @@ function createRunner({ watchForChanges = true } = {}) {
441
442
  workspaceRoot = config.workspaceRoot ?? process.cwd();
442
443
  localStateDir = resolve(workspaceRoot, ".agent-evals");
443
444
  llmCallsConfig = resolveLlmCallsConfig(config.llmCalls);
445
+ apiCallsConfig = resolveApiCallsConfig(config.apiCalls);
444
446
  await mkdir(localStateDir, { recursive: true });
445
447
  await mkdir(join(localStateDir, "runs"), { recursive: true });
446
448
  cacheStore = createFsCacheStore({
@@ -812,6 +814,9 @@ function createRunner({ watchForChanges = true } = {}) {
812
814
  getLlmCallsConfig() {
813
815
  return llmCallsConfig;
814
816
  },
817
+ getApiCallsConfig() {
818
+ return apiCallsConfig;
819
+ },
815
820
  getArtifactPath(artifactId_) {
816
821
  return resolveArtifactPath(join(localStateDir, "runs"), artifactId_);
817
822
  }
@@ -1045,8 +1050,8 @@ async function commandApp(args) {
1045
1050
  const { serve } = await import("@hono/node-server");
1046
1051
  const bundledWebDist = resolve(currentDir, "apps/web/dist");
1047
1052
  if (existsSync(bundledWebDist)) process.env.AGENT_EVALS_WEB_DIST = bundledWebDist;
1048
- const appModule = await import("./app-hkNNN_jn.mjs");
1049
- const runnerModule = await import("./runner-BzT3B9OF.mjs");
1053
+ const appModule = await import("./app-DI3IUGb_.mjs");
1054
+ const runnerModule = await import("./runner-sMZXoDp3.mjs");
1050
1055
  if (!isHonoAppModule(appModule)) throw new Error("Server app module is invalid");
1051
1056
  if (!isServerRunnerModule(runnerModule)) throw new Error("Server runner module is invalid");
1052
1057
  await runnerModule.initRunner();
package/dist/index.d.mts CHANGED
@@ -1406,6 +1406,16 @@ declare const llmCallMetricFormatSchema: z$1.ZodEnum<{
1406
1406
  }>;
1407
1407
  /** Render format applied to an LLM-call metric value. */
1408
1408
  type LlmCallMetricFormat = z$1.infer<typeof llmCallMetricFormatSchema>;
1409
+ /** Render formats supported by an API-call metric in the UI. */
1410
+ declare const apiCallMetricFormatSchema: z$1.ZodEnum<{
1411
+ string: "string";
1412
+ number: "number";
1413
+ boolean: "boolean";
1414
+ duration: "duration";
1415
+ json: "json";
1416
+ }>;
1417
+ /** Render format applied to an API-call metric value. */
1418
+ type ApiCallMetricFormat = z$1.infer<typeof apiCallMetricFormatSchema>;
1409
1419
  /** Where an LLM-call metric is rendered inside the LLM calls tab. */
1410
1420
  declare const llmCallMetricPlacementSchema: z$1.ZodEnum<{
1411
1421
  header: "header";
@@ -1413,6 +1423,13 @@ declare const llmCallMetricPlacementSchema: z$1.ZodEnum<{
1413
1423
  }>;
1414
1424
  /** Placement option for an LLM-call metric. */
1415
1425
  type LlmCallMetricPlacement = z$1.infer<typeof llmCallMetricPlacementSchema>;
1426
+ /** Where an API-call metric is rendered inside the API calls tab. */
1427
+ declare const apiCallMetricPlacementSchema: z$1.ZodEnum<{
1428
+ header: "header";
1429
+ body: "body";
1430
+ }>;
1431
+ /** Placement option for an API-call metric. */
1432
+ type ApiCallMetricPlacement = z$1.infer<typeof apiCallMetricPlacementSchema>;
1416
1433
  /**
1417
1434
  * Schema for a single user-defined metric attached to LLM call rows.
1418
1435
  *
@@ -1440,6 +1457,33 @@ declare const llmCallMetricSchema: z$1.ZodObject<{
1440
1457
  }, z$1.core.$strip>;
1441
1458
  /** User-defined metric authored in `agent-evals.config.ts`. */
1442
1459
  type LlmCallMetric = z$1.infer<typeof llmCallMetricSchema>;
1460
+ /**
1461
+ * Schema for a single user-defined metric attached to API call rows.
1462
+ *
1463
+ * Each metric reads `path` from the span's `attributes` and renders the value
1464
+ * with the configured `format` and `numberFormat`. `placements` controls
1465
+ * whether the metric appears as a chip on the collapsed row header, as a row
1466
+ * inside the expanded body, or both. Defaults to `['body']` when omitted.
1467
+ */
1468
+ declare const apiCallMetricSchema: z$1.ZodObject<{
1469
+ label: z$1.ZodString;
1470
+ tooltip: z$1.ZodOptional<z$1.ZodString>;
1471
+ path: z$1.ZodString;
1472
+ format: z$1.ZodOptional<z$1.ZodEnum<{
1473
+ string: "string";
1474
+ number: "number";
1475
+ boolean: "boolean";
1476
+ duration: "duration";
1477
+ json: "json";
1478
+ }>>;
1479
+ numberFormat: z$1.ZodOptional<z$1.ZodType<NumberDisplayOptions, unknown, z$1.core.$ZodTypeInternals<NumberDisplayOptions, unknown>>>;
1480
+ placements: z$1.ZodOptional<z$1.ZodArray<z$1.ZodEnum<{
1481
+ header: "header";
1482
+ body: "body";
1483
+ }>>>;
1484
+ }, z$1.core.$strip>;
1485
+ /** User-defined API-call metric authored in `agent-evals.config.ts`. */
1486
+ type ApiCallMetric = z$1.infer<typeof apiCallMetricSchema>;
1443
1487
  /** Schema for the global LLM calls config block in `agent-evals.config.ts`. */
1444
1488
  declare const llmCallsConfigSchema: z$1.ZodObject<{
1445
1489
  kinds: z$1.ZodOptional<z$1.ZodArray<z$1.ZodString>>;
@@ -1485,6 +1529,41 @@ declare const llmCallsConfigSchema: z$1.ZodObject<{
1485
1529
  }, z$1.core.$strip>;
1486
1530
  /** Authored LLM calls config accepted from `agent-evals.config.ts`. */
1487
1531
  type LlmCallsConfigInput = z$1.infer<typeof llmCallsConfigSchema>;
1532
+ /** Schema for the global API calls config block in `agent-evals.config.ts`. */
1533
+ declare const apiCallsConfigSchema: z$1.ZodObject<{
1534
+ kinds: z$1.ZodOptional<z$1.ZodArray<z$1.ZodString>>;
1535
+ attributes: z$1.ZodOptional<z$1.ZodObject<{
1536
+ method: z$1.ZodOptional<z$1.ZodString>;
1537
+ url: z$1.ZodOptional<z$1.ZodString>;
1538
+ statusCode: z$1.ZodOptional<z$1.ZodString>;
1539
+ request: z$1.ZodOptional<z$1.ZodString>;
1540
+ response: z$1.ZodOptional<z$1.ZodString>;
1541
+ requestBody: z$1.ZodOptional<z$1.ZodString>;
1542
+ responseBody: z$1.ZodOptional<z$1.ZodString>;
1543
+ headers: z$1.ZodOptional<z$1.ZodString>;
1544
+ durationMs: z$1.ZodOptional<z$1.ZodString>;
1545
+ error: z$1.ZodOptional<z$1.ZodString>;
1546
+ }, z$1.core.$strip>>;
1547
+ metrics: z$1.ZodOptional<z$1.ZodArray<z$1.ZodObject<{
1548
+ label: z$1.ZodString;
1549
+ tooltip: z$1.ZodOptional<z$1.ZodString>;
1550
+ path: z$1.ZodString;
1551
+ format: z$1.ZodOptional<z$1.ZodEnum<{
1552
+ string: "string";
1553
+ number: "number";
1554
+ boolean: "boolean";
1555
+ duration: "duration";
1556
+ json: "json";
1557
+ }>>;
1558
+ numberFormat: z$1.ZodOptional<z$1.ZodType<NumberDisplayOptions, unknown, z$1.core.$ZodTypeInternals<NumberDisplayOptions, unknown>>>;
1559
+ placements: z$1.ZodOptional<z$1.ZodArray<z$1.ZodEnum<{
1560
+ header: "header";
1561
+ body: "body";
1562
+ }>>>;
1563
+ }, z$1.core.$strip>>>;
1564
+ }, z$1.core.$strip>;
1565
+ /** Authored API calls config accepted from `agent-evals.config.ts`. */
1566
+ type ApiCallsConfigInput = z$1.infer<typeof apiCallsConfigSchema>;
1488
1567
  /** Resolved LLM-calls config sent to the UI with all defaults applied. */
1489
1568
  type ResolvedLlmCallsConfig = {
1490
1569
  kinds: string[];
@@ -1512,6 +1591,23 @@ type ResolvedLlmCallsConfig = {
1512
1591
  };
1513
1592
  metrics: ResolvedLlmCallMetric[];
1514
1593
  };
1594
+ /** Resolved API-calls config sent to the UI with all defaults applied. */
1595
+ type ResolvedApiCallsConfig = {
1596
+ kinds: string[];
1597
+ attributes: {
1598
+ method: string;
1599
+ url: string;
1600
+ statusCode: string;
1601
+ request: string;
1602
+ response: string;
1603
+ requestBody: string;
1604
+ responseBody: string;
1605
+ headers: string;
1606
+ durationMs: string;
1607
+ error: string;
1608
+ };
1609
+ metrics: ResolvedApiCallMetric[];
1610
+ };
1515
1611
  /** Fully-resolved LLM-call metric used by the runner and UI. */
1516
1612
  type ResolvedLlmCallMetric = {
1517
1613
  label: string;
@@ -1521,8 +1617,19 @@ type ResolvedLlmCallMetric = {
1521
1617
  numberFormat?: NumberDisplayOptions;
1522
1618
  placements: LlmCallMetricPlacement[];
1523
1619
  };
1620
+ /** Fully-resolved API-call metric used by the runner and UI. */
1621
+ type ResolvedApiCallMetric = {
1622
+ label: string;
1623
+ tooltip?: string;
1624
+ path: string;
1625
+ format: ApiCallMetricFormat;
1626
+ numberFormat?: NumberDisplayOptions;
1627
+ placements: ApiCallMetricPlacement[];
1628
+ };
1524
1629
  /** Default LLM-calls config the UI uses before the workspace fetch resolves. */
1525
1630
  declare const DEFAULT_LLM_CALLS_CONFIG: ResolvedLlmCallsConfig;
1631
+ /** Default API-calls config the UI uses before the workspace fetch resolves. */
1632
+ declare const DEFAULT_API_CALLS_CONFIG: ResolvedApiCallsConfig;
1526
1633
  /**
1527
1634
  * Resolve the user-authored LLM-calls config to a fully-defaulted shape used
1528
1635
  * by the UI to derive the LLM calls tab.
@@ -1534,6 +1641,17 @@ declare const DEFAULT_LLM_CALLS_CONFIG: ResolvedLlmCallsConfig;
1534
1641
  * - Missing `metrics[].placements` defaults to `['body']`.
1535
1642
  */
1536
1643
  declare function resolveLlmCallsConfig(input: LlmCallsConfigInput | undefined): ResolvedLlmCallsConfig;
1644
+ /**
1645
+ * Resolve the user-authored API-calls config to a fully-defaulted shape used
1646
+ * by the UI to derive the API calls tab.
1647
+ *
1648
+ * - Missing or empty `kinds` falls back to common API/HTTP span kinds.
1649
+ * - Missing `attributes.<field>` falls back to the corresponding default
1650
+ * attribute path.
1651
+ * - Missing `metrics[].format` defaults to `'string'`.
1652
+ * - Missing `metrics[].placements` defaults to `['body']`.
1653
+ */
1654
+ declare function resolveApiCallsConfig(input: ApiCallsConfigInput | undefined): ResolvedApiCallsConfig;
1537
1655
  /** Top-level config authored in `agent-evals.config.ts`. */
1538
1656
  type AgentEvalsConfig = {
1539
1657
  /** Root directory used to resolve all relative paths. Defaults to `process.cwd()`. */workspaceRoot?: string; /** Glob patterns (relative to `workspaceRoot`) used to discover eval files. */
@@ -1589,6 +1707,30 @@ type AgentEvalsConfig = {
1589
1707
  * ```
1590
1708
  */
1591
1709
  llmCalls?: LlmCallsConfigInput;
1710
+ /**
1711
+ * Configuration for the "API calls" tab in the case-run drawer.
1712
+ *
1713
+ * Determines which trace spans are treated as API calls (`kinds`), how
1714
+ * structured fields like `method`, `url`, and `statusCode` are read from
1715
+ * span attributes, and which custom user-defined metrics are surfaced on
1716
+ * each call. All fields are optional and fall back to the documented
1717
+ * defaults; the API calls tab is shown automatically when at least one
1718
+ * matching span exists in a case run.
1719
+ *
1720
+ * @example
1721
+ * ```ts
1722
+ * apiCalls: {
1723
+ * kinds: ['api', 'http.client', 'undici.request'],
1724
+ * attributes: {
1725
+ * statusCode: 'http.status_code',
1726
+ * },
1727
+ * metrics: [
1728
+ * { label: 'Retries', path: 'retryCount', format: 'number' },
1729
+ * ],
1730
+ * }
1731
+ * ```
1732
+ */
1733
+ apiCalls?: ApiCallsConfigInput;
1592
1734
  /**
1593
1735
  * Optional controls for the operation cache. When omitted, the cache is
1594
1736
  * enabled and stored under `<workspaceRoot>/.agent-evals/cache`.
@@ -1691,6 +1833,38 @@ declare const agentEvalsConfigSchema: z$1.ZodObject<{
1691
1833
  }>>>;
1692
1834
  }, z$1.core.$strip>>>;
1693
1835
  }, z$1.core.$strip>>;
1836
+ apiCalls: z$1.ZodOptional<z$1.ZodObject<{
1837
+ kinds: z$1.ZodOptional<z$1.ZodArray<z$1.ZodString>>;
1838
+ attributes: z$1.ZodOptional<z$1.ZodObject<{
1839
+ method: z$1.ZodOptional<z$1.ZodString>;
1840
+ url: z$1.ZodOptional<z$1.ZodString>;
1841
+ statusCode: z$1.ZodOptional<z$1.ZodString>;
1842
+ request: z$1.ZodOptional<z$1.ZodString>;
1843
+ response: z$1.ZodOptional<z$1.ZodString>;
1844
+ requestBody: z$1.ZodOptional<z$1.ZodString>;
1845
+ responseBody: z$1.ZodOptional<z$1.ZodString>;
1846
+ headers: z$1.ZodOptional<z$1.ZodString>;
1847
+ durationMs: z$1.ZodOptional<z$1.ZodString>;
1848
+ error: z$1.ZodOptional<z$1.ZodString>;
1849
+ }, z$1.core.$strip>>;
1850
+ metrics: z$1.ZodOptional<z$1.ZodArray<z$1.ZodObject<{
1851
+ label: z$1.ZodString;
1852
+ tooltip: z$1.ZodOptional<z$1.ZodString>;
1853
+ path: z$1.ZodString;
1854
+ format: z$1.ZodOptional<z$1.ZodEnum<{
1855
+ string: "string";
1856
+ number: "number";
1857
+ boolean: "boolean";
1858
+ duration: "duration";
1859
+ json: "json";
1860
+ }>>;
1861
+ numberFormat: z$1.ZodOptional<z$1.ZodType<NumberDisplayOptions, unknown, z$1.core.$ZodTypeInternals<NumberDisplayOptions, unknown>>>;
1862
+ placements: z$1.ZodOptional<z$1.ZodArray<z$1.ZodEnum<{
1863
+ header: "header";
1864
+ body: "body";
1865
+ }>>>;
1866
+ }, z$1.core.$strip>>>;
1867
+ }, z$1.core.$strip>>;
1694
1868
  cache: z$1.ZodOptional<z$1.ZodObject<{
1695
1869
  enabled: z$1.ZodOptional<z$1.ZodBoolean>;
1696
1870
  dir: z$1.ZodOptional<z$1.ZodString>;
@@ -1761,6 +1935,50 @@ type LlmCallEntry = {
1761
1935
  */
1762
1936
  declare function extractLlmCalls(spans: EvalTraceSpan[], config: ResolvedLlmCallsConfig): LlmCallEntry[];
1763
1937
  //#endregion
1938
+ //#region ../shared/src/utils/extractApiCalls.d.ts
1939
+ /** Resolved value for one user-defined metric on an API call row. */
1940
+ type ApiCallMetricValue = {
1941
+ label: string;
1942
+ tooltip: string | undefined;
1943
+ rawValue: unknown;
1944
+ format: ApiCallMetricFormat;
1945
+ numberFormat: NumberDisplayOptions | undefined;
1946
+ placements: ApiCallMetricPlacement[];
1947
+ };
1948
+ /** Single entry rendered as one expandable row in the API calls tab. */
1949
+ type ApiCallEntry = {
1950
+ id: string;
1951
+ name: string;
1952
+ kind: string;
1953
+ status: EvalTraceSpan['status'];
1954
+ method: string | null;
1955
+ url: string | null;
1956
+ statusCode: number | null;
1957
+ latencyMs: number | null;
1958
+ request: unknown;
1959
+ response: unknown;
1960
+ requestBody: unknown;
1961
+ responseBody: unknown;
1962
+ headers: unknown;
1963
+ errorPayload: unknown;
1964
+ metrics: ApiCallMetricValue[];
1965
+ warnings: EvalTraceSpanWarning[];
1966
+ error: EvalTraceSpanError | null;
1967
+ };
1968
+ /**
1969
+ * Filter `spans` down to API calls and project each one to the structured
1970
+ * shape consumed by the API calls tab.
1971
+ *
1972
+ * Spans whose `kind` is not in `config.kinds` are dropped. Structured fields
1973
+ * (`method`, `url`, `statusCode`, etc.) are read via `getNestedAttribute` from
1974
+ * the configured paths. `durationMs` takes precedence for latency, with a
1975
+ * fallback to the span start/end timestamps. User-defined `metrics` whose path
1976
+ * resolves to `undefined` are dropped, but `null`, `0`, and `false` are
1977
+ * preserved as legitimate values worth displaying. Original span order is
1978
+ * preserved so the API calls tab matches the ordering in the Trace tab.
1979
+ */
1980
+ declare function extractApiCalls(spans: EvalTraceSpan[], config: ResolvedApiCallsConfig): ApiCallEntry[];
1981
+ //#endregion
1764
1982
  //#region ../shared/src/schemas/cache.d.ts
1765
1983
  /**
1766
1984
  * Mode that controls how the cache is consulted for a given run.
@@ -2196,7 +2414,7 @@ type EvalCase<TInput> = {
2196
2414
  input: TInput;
2197
2415
  tags?: string[];
2198
2416
  };
2199
- /** Runtime output values collected from `setEvalOutput` and `deriveFromTracing`. */
2417
+ /** Runtime output values collected from output helpers and `deriveFromTracing`. */
2200
2418
  type EvalOutputs = Record<string, unknown>;
2201
2419
  /**
2202
2420
  * Schema used to validate and type an eval's collected runtime outputs.
@@ -2249,9 +2467,31 @@ type EvalTraceTree = {
2249
2467
  flattenDfs: () => EvalTraceSpan[];
2250
2468
  checkpoints: Map<string, unknown>;
2251
2469
  };
2470
+ /** Type-safe output writer passed to an eval's `execute` function. */
2471
+ type EvalSetOutput<TOutputs extends EvalOutputs = EvalOutputs> = <TKey extends Extract<keyof TOutputs, string>>(
2472
+ /**
2473
+ * Output field to record. For narrowed output maps, this must be one of the
2474
+ * known output keys.
2475
+ */
2476
+
2477
+ key: TKey,
2478
+ /**
2479
+ * Value for the output field. For narrowed output maps, this must match the
2480
+ * field's declared output type.
2481
+ */
2482
+
2483
+ value: TOutputs[TKey]) => void;
2252
2484
  /** Context passed to an eval's `execute` function for a single case run. */
2253
- type EvalExecuteContext<TInput> = {
2254
- input: TInput;
2485
+ type EvalExecuteContext<TInput, TOutputs extends EvalOutputs = EvalOutputs> = {
2486
+ /** Authored input for the active eval case. */input: TInput;
2487
+ /**
2488
+ * Record or replace an output value for the current case scope.
2489
+ *
2490
+ * When the eval has a narrowed outputs generic, keys and values are typed
2491
+ * from that output map. The recorded values are still validated by
2492
+ * `outputsSchema` before computed scores run.
2493
+ */
2494
+ setOutput: EvalSetOutput<TOutputs>;
2255
2495
  };
2256
2496
  /** Context passed to `deriveFromTracing` after execution has completed. */
2257
2497
  type EvalDeriveContext<TInput> = {
@@ -2293,8 +2533,31 @@ type EvalManualScoreDef = EvalColumnOverride & {
2293
2533
  */
2294
2534
  passThreshold?: number;
2295
2535
  };
2296
- /** Complete authored eval definition consumed by `defineEval`. */
2297
- type EvalDefinition<TInput = unknown, TOutputs extends EvalOutputs = EvalOutputs> = {
2536
+ type EvalDefinitionOutputSchemaConfig<TOutputs extends EvalOutputs> = [EvalOutputs] extends [TOutputs] ? {
2537
+ /**
2538
+ * Optional schema for runtime outputs collected through output helpers
2539
+ * and `deriveFromTracing`.
2540
+ *
2541
+ * The runner validates configured output fields before scoring. For
2542
+ * Zod object schemas, only declared keys are passed to the schema;
2543
+ * parsed fields are merged back into the raw output map, so schema
2544
+ * defaults and transforms apply to configured fields while
2545
+ * unconfigured outputs are kept unchanged. Validation failures mark
2546
+ * the case as failed and skip computed scores.
2547
+ */
2548
+ outputsSchema?: EvalOutputsSchema<TOutputs>;
2549
+ } : {
2550
+ /**
2551
+ * Required schema for typed runtime outputs collected through output
2552
+ * helpers and `deriveFromTracing`.
2553
+ *
2554
+ * When `EvalDefinition` or `defineEval` receives an explicit narrowed
2555
+ * outputs generic, this schema is required so scorer inputs are backed
2556
+ * by runtime validation before computed scores run.
2557
+ */
2558
+ outputsSchema: EvalOutputsSchema<TOutputs>;
2559
+ };
2560
+ type EvalDefinitionBase<TInput = unknown, TOutputs extends EvalOutputs = EvalOutputs> = {
2298
2561
  id: string;
2299
2562
  title?: string;
2300
2563
  /**
@@ -2304,17 +2567,6 @@ type EvalDefinition<TInput = unknown, TOutputs extends EvalOutputs = EvalOutputs
2304
2567
  * eval once using a synthetic case with empty object input.
2305
2568
  */
2306
2569
  cases?: EvalCase<TInput>[] | (() => Promise<EvalCase<TInput>[]>);
2307
- /**
2308
- * Optional schema for runtime outputs collected through `setEvalOutput` and
2309
- * `deriveFromTracing`.
2310
- *
2311
- * The runner validates configured output fields before scoring. For Zod
2312
- * object schemas, only declared keys are passed to the schema; parsed fields
2313
- * are merged back into the raw output map, so schema defaults and transforms
2314
- * apply to configured fields while unconfigured outputs are kept unchanged.
2315
- * Validation failures mark the case as failed and skip computed scores.
2316
- */
2317
- outputsSchema?: EvalOutputsSchema<TOutputs>;
2318
2570
  columns?: EvalColumns;
2319
2571
  /**
2320
2572
  * Per-eval trace attribute display rules for the UI.
@@ -2324,7 +2576,7 @@ type EvalDefinition<TInput = unknown, TOutputs extends EvalOutputs = EvalOutputs
2324
2576
  * `key` is provided.
2325
2577
  */
2326
2578
  traceDisplay?: TraceDisplayInputConfig;
2327
- execute: (ctx: EvalExecuteContext<TInput>) => Promise<void> | void;
2579
+ execute: (ctx: EvalExecuteContext<TInput, TOutputs>) => Promise<void> | void;
2328
2580
  deriveFromTracing?: (ctx: EvalDeriveContext<TInput>) => Partial<TOutputs> | Promise<Partial<TOutputs>>;
2329
2581
  scores?: Record<string, EvalScoreDef<TInput, TOutputs>>;
2330
2582
  /**
@@ -2359,13 +2611,21 @@ type EvalDefinition<TInput = unknown, TOutputs extends EvalOutputs = EvalOutputs
2359
2611
  *
2360
2612
  * Each chart declares its `type` (`area | line | bar`) and one or more
2361
2613
  * `metrics`. Built-in metrics (`passRate`, `durationMs`) aggregate
2362
- * the run summary. Column metrics aggregate a score or numeric `setEvalOutput`
2363
- * column across the run using an `aggregate` reducer (`avg`, `sum`, `min`,
2364
- * `max`, `latest`, `passThresholdRate`). `passThresholdRate` requires a
2365
- * score column with `passThreshold`.
2614
+ * the run summary. Column metrics aggregate a score or numeric output column
2615
+ * across the run using an `aggregate` reducer (`avg`, `sum`, `min`, `max`,
2616
+ * `latest`, `passThresholdRate`). `passThresholdRate` requires a score column
2617
+ * with `passThreshold`.
2366
2618
  */
2367
2619
  charts?: EvalChartsConfig;
2368
2620
  };
2621
+ /**
2622
+ * Complete authored eval definition consumed by `defineEval`.
2623
+ *
2624
+ * `outputsSchema` is optional for the default loose output map. When the
2625
+ * `TOutputs` generic is narrowed, `outputsSchema` is required so the runtime
2626
+ * validates collected outputs before exposing them as typed scorer inputs.
2627
+ */
2628
+ type EvalDefinition<TInput = unknown, TOutputs extends EvalOutputs = EvalOutputs> = EvalDefinitionBase<TInput, TOutputs> & EvalDefinitionOutputSchemaConfig<TOutputs>;
2369
2629
  //#endregion
2370
2630
  //#region ../sdk/src/defineEval.d.ts
2371
2631
  /**
@@ -2423,7 +2683,9 @@ type CacheRecordingFrame = {
2423
2683
  };
2424
2684
  /** Mutable per-case runtime state stored in async local storage. */
2425
2685
  type EvalCaseScope = {
2426
- caseId: string; /** Authored input for the current case, when provided by the runner. */
2686
+ caseId: string; /** Stable prefix used by `nextEvalId()` for this eval case scope. */
2687
+ idPrefix: string | undefined; /** Monotonic per-scope counter used by `nextEvalId()`. */
2688
+ nextEvalIdCounter: number; /** Authored input for the current case, when provided by the runner. */
2427
2689
  input?: unknown;
2428
2690
  outputs: Record<string, unknown>; /** Structured assertion failures recorded for the current case. */
2429
2691
  assertionFailures: AssertionFailure[];
@@ -2480,7 +2742,8 @@ declare function getEvalCaseInput(path: string): unknown;
2480
2742
  declare function setScopeCacheContext(scope: EvalCaseScope, context: CacheScopeContext): void;
2481
2743
  /** Optional inputs accepted when starting a new eval case scope. */
2482
2744
  type RunInEvalScopeOptions = {
2483
- /** Authored input for the active eval case. */input?: unknown; /** Cache adapter + mode attached to the scope before `fn` runs. */
2745
+ /** Authored input for the active eval case. */input?: unknown; /** Stable prefix used when generating scoped IDs with `nextEvalId()`. */
2746
+ idPrefix?: string; /** Cache adapter + mode attached to the scope before `fn` runs. */
2484
2747
  cacheContext?: CacheScopeContext;
2485
2748
  };
2486
2749
  /**
@@ -2492,6 +2755,15 @@ declare function runInEvalScope<T>(caseId: string, fn: () => Promise<T> | T, opt
2492
2755
  scope: EvalCaseScope;
2493
2756
  error: Error | undefined;
2494
2757
  }>;
2758
+ /**
2759
+ * Return the next deterministic ID for the active eval case execution.
2760
+ *
2761
+ * The runner derives the ID prefix from the eval file, eval id, and case id,
2762
+ * then this helper appends a per-scope sequence number. Calls outside an
2763
+ * active eval case scope throw so accidental product-code usage is caught
2764
+ * immediately.
2765
+ */
2766
+ declare function nextEvalId(): string;
2495
2767
  /**
2496
2768
  * Record or replace an output value for the current case scope.
2497
2769
  *
@@ -2769,7 +3041,14 @@ type EvalRunner = {
2769
3041
  * Returns the workspace's `llmCalls` config block from
2770
3042
  * `agent-evals.config.ts` with all defaults applied.
2771
3043
  */
2772
- getLlmCallsConfig(): ResolvedLlmCallsConfig; /** Resolve a persisted artifact path when artifact storage is supported. */
3044
+ getLlmCallsConfig(): ResolvedLlmCallsConfig;
3045
+ /**
3046
+ * Resolved API-calls config used by the UI to derive the API calls tab.
3047
+ *
3048
+ * Returns the workspace's `apiCalls` config block from
3049
+ * `agent-evals.config.ts` with all defaults applied.
3050
+ */
3051
+ getApiCallsConfig(): ResolvedApiCallsConfig; /** Resolve a persisted artifact path when artifact storage is supported. */
2773
3052
  getArtifactPath(artifactId: string): string | undefined; /** Return summaries for every persisted cache entry in the workspace. */
2774
3053
  listCache(): Promise<CacheListItem[]>;
2775
3054
  /**
@@ -2833,4 +3112,4 @@ declare function createRunner({
2833
3112
  */
2834
3113
  declare function runCli(argv: string[]): Promise<void>;
2835
3114
  //#endregion
2836
- export { type AgentEvalsConfig, type AssertionFailure, type CacheAdapter, type CacheEntry, type CacheFile, type CacheHitEntry, type CacheKeyHashInput, type CacheKeyHashOptions, type CacheListItem, type CacheMode, type CacheOperationType, type CacheRecording, type CacheRecordingFrame, type CacheRecordingOp, type CacheScopeContext, type CacheStatus, type CaptureEvalSpanErrorLevel, type CaptureEvalSpanErrorOptions, type CaseDetail, type CaseRow, type CellValue, type ColumnDef, type ColumnFormat, type ColumnKind, type CreateRunRequest, DEFAULT_LLM_CALLS_CONFIG, type DerivedStatus, EvalAssertionError, type EvalCase, type EvalCaseScope, type EvalChartAggregate, type EvalChartAxis, type EvalChartBuiltinMetric, type EvalChartColor, type EvalChartConfig, type EvalChartMetric, type EvalChartTooltipExtra, type EvalChartType, type EvalChartsConfig, type EvalColumnOverride, type EvalColumns, type EvalDefinition, type EvalDeriveContext, type EvalDisplayStatus, type EvalExecuteContext, type EvalFreshnessStatus, type EvalManualScoreDef, type EvalOutputs, type EvalOutputsSchema, type EvalRunner, type EvalScoreContext, type EvalScoreDef, type EvalScoreFn, type EvalStatAggregate, type EvalStatItem, type EvalStatsConfig, type EvalSummary, type EvalTraceSpan, type EvalTraceSpanError, type EvalTraceSpanWarning, type EvalTraceTree, type FileRef, type JsonCell, type LlmCallEntry, type LlmCallMetric, type LlmCallMetricFormat, type LlmCallMetricPlacement, type LlmCallMetricValue, type LlmCallsConfigInput, type NumberDisplayOptions, type RepoFileRef, type ResolvedLlmCallMetric, type ResolvedLlmCallsConfig, type RunArtifactRef, type RunInEvalScopeOptions, type RunManifest, type RunSummary, type ScalarCell, type ScopedCaseSummary, type ScoreTrace, type SerializedCacheSpan, type SpanCacheOptions, type SseEnvelope, type SseEventType, type TraceActiveSpan, type TraceAttributeDisplay, type TraceAttributeDisplayFormat, type TraceAttributeDisplayInput, type TraceAttributeDisplayPlacement, type TraceAttributeTransform, type TraceAttributeTransformContext, type TraceCacheInfo, type TraceCacheRef, type TraceDisplayConfig, type TraceDisplayInputConfig, type TraceSpanInfo, type TrialSelectionMode, type UpdateManualScoreRequest, agentEvalsConfigSchema, appendToEvalOutput, assertionFailureSchema, buildTraceTree, cacheEntrySchema, cacheFileSchema, cacheListItemSchema, cacheModeSchema, cacheOperationTypeSchema, cacheRecordingOpSchema, cacheRecordingSchema, cacheStatusSchema, captureEvalSpanError, caseDetailSchema, caseRowSchema, cellValueSchema, columnDefSchema, columnFormatSchema, columnKindSchema, createRunRequestSchema, createRunner, defineEval, deriveScopedSummaryFromCases, deriveStatusFromCaseRows, deriveStatusFromChildStatuses, evalAssert, evalChartAggregateSchema, evalChartAxisSchema, evalChartBuiltinMetricSchema, evalChartColorSchema, evalChartConfigSchema, evalChartMetricSchema, evalChartTooltipExtraSchema, evalChartTypeSchema, evalChartsConfigSchema, evalFreshnessStatusSchema, evalSpan, evalStatAggregateSchema, evalStatItemSchema, evalStatsConfigSchema, evalSummarySchema, evalTracer, extractCacheHits, extractLlmCalls, fileRefSchema, getCurrentScope, getEvalCaseInput, getEvalDisplayStatus, getEvalRegistry, getEvalTitle, getNestedAttribute, hashCacheKey, hashCacheKeySync, incrementEvalOutput, isInEvalScope, jsonCellSchema, llmCallMetricFormatSchema, llmCallMetricPlacementSchema, llmCallMetricSchema, llmCallsConfigSchema, mergeEvalOutput, numberDisplayOptionsSchema, repoFile, repoFileRefSchema, resolveLlmCallsConfig, runArtifactRefSchema, runCli, runInEvalScope, runManifestSchema, runSummarySchema, scoreTraceSchema, serializedCacheSpanSchema, setEvalOutput, setScopeCacheContext, spanCacheOptionsSchema, sseEnvelopeSchema, traceAttributeDisplayFormatSchema, traceAttributeDisplayInputSchema, traceAttributeDisplayPlacementSchema, traceAttributeDisplaySchema, traceCacheRefSchema, traceDisplayConfigSchema, traceDisplayInputConfigSchema, traceSpanErrorSchema, traceSpanKindSchema, traceSpanSchema, traceSpanWarningSchema, trialSelectionModeSchema, updateManualScoreRequestSchema, z };
3115
+ export { type AgentEvalsConfig, type ApiCallEntry, type ApiCallMetric, type ApiCallMetricFormat, type ApiCallMetricPlacement, type ApiCallMetricValue, type ApiCallsConfigInput, type AssertionFailure, type CacheAdapter, type CacheEntry, type CacheFile, type CacheHitEntry, type CacheKeyHashInput, type CacheKeyHashOptions, type CacheListItem, type CacheMode, type CacheOperationType, type CacheRecording, type CacheRecordingFrame, type CacheRecordingOp, type CacheScopeContext, type CacheStatus, type CaptureEvalSpanErrorLevel, type CaptureEvalSpanErrorOptions, type CaseDetail, type CaseRow, type CellValue, type ColumnDef, type ColumnFormat, type ColumnKind, type CreateRunRequest, DEFAULT_API_CALLS_CONFIG, DEFAULT_LLM_CALLS_CONFIG, type DerivedStatus, EvalAssertionError, type EvalCase, type EvalCaseScope, type EvalChartAggregate, type EvalChartAxis, type EvalChartBuiltinMetric, type EvalChartColor, type EvalChartConfig, type EvalChartMetric, type EvalChartTooltipExtra, type EvalChartType, type EvalChartsConfig, type EvalColumnOverride, type EvalColumns, type EvalDefinition, type EvalDeriveContext, type EvalDisplayStatus, type EvalExecuteContext, type EvalFreshnessStatus, type EvalManualScoreDef, type EvalOutputs, type EvalOutputsSchema, type EvalRunner, type EvalScoreContext, type EvalScoreDef, type EvalScoreFn, type EvalSetOutput, type EvalStatAggregate, type EvalStatItem, type EvalStatsConfig, type EvalSummary, type EvalTraceSpan, type EvalTraceSpanError, type EvalTraceSpanWarning, type EvalTraceTree, type FileRef, type JsonCell, type LlmCallEntry, type LlmCallMetric, type LlmCallMetricFormat, type LlmCallMetricPlacement, type LlmCallMetricValue, type LlmCallsConfigInput, type NumberDisplayOptions, type RepoFileRef, type ResolvedApiCallMetric, type ResolvedApiCallsConfig, type ResolvedLlmCallMetric, type ResolvedLlmCallsConfig, type RunArtifactRef, type RunInEvalScopeOptions, type RunManifest, type RunSummary, type ScalarCell, type ScopedCaseSummary, type ScoreTrace, type SerializedCacheSpan, type SpanCacheOptions, type SseEnvelope, type SseEventType, type TraceActiveSpan, type TraceAttributeDisplay, type TraceAttributeDisplayFormat, type TraceAttributeDisplayInput, type TraceAttributeDisplayPlacement, type TraceAttributeTransform, type TraceAttributeTransformContext, type TraceCacheInfo, type TraceCacheRef, type TraceDisplayConfig, type TraceDisplayInputConfig, type TraceSpanInfo, type TrialSelectionMode, type UpdateManualScoreRequest, agentEvalsConfigSchema, apiCallMetricFormatSchema, apiCallMetricPlacementSchema, apiCallMetricSchema, apiCallsConfigSchema, appendToEvalOutput, assertionFailureSchema, buildTraceTree, cacheEntrySchema, cacheFileSchema, cacheListItemSchema, cacheModeSchema, cacheOperationTypeSchema, cacheRecordingOpSchema, cacheRecordingSchema, cacheStatusSchema, captureEvalSpanError, caseDetailSchema, caseRowSchema, cellValueSchema, columnDefSchema, columnFormatSchema, columnKindSchema, createRunRequestSchema, createRunner, defineEval, deriveScopedSummaryFromCases, deriveStatusFromCaseRows, deriveStatusFromChildStatuses, evalAssert, evalChartAggregateSchema, evalChartAxisSchema, evalChartBuiltinMetricSchema, evalChartColorSchema, evalChartConfigSchema, evalChartMetricSchema, evalChartTooltipExtraSchema, evalChartTypeSchema, evalChartsConfigSchema, evalFreshnessStatusSchema, evalSpan, evalStatAggregateSchema, evalStatItemSchema, evalStatsConfigSchema, evalSummarySchema, evalTracer, extractApiCalls, extractCacheHits, extractLlmCalls, fileRefSchema, getCurrentScope, getEvalCaseInput, getEvalDisplayStatus, getEvalRegistry, getEvalTitle, getNestedAttribute, hashCacheKey, hashCacheKeySync, incrementEvalOutput, isInEvalScope, jsonCellSchema, llmCallMetricFormatSchema, llmCallMetricPlacementSchema, llmCallMetricSchema, llmCallsConfigSchema, mergeEvalOutput, nextEvalId, numberDisplayOptionsSchema, repoFile, repoFileRefSchema, resolveApiCallsConfig, resolveLlmCallsConfig, runArtifactRefSchema, runCli, runInEvalScope, runManifestSchema, runSummarySchema, scoreTraceSchema, serializedCacheSpanSchema, setEvalOutput, setScopeCacheContext, spanCacheOptionsSchema, sseEnvelopeSchema, traceAttributeDisplayFormatSchema, traceAttributeDisplayInputSchema, traceAttributeDisplayPlacementSchema, traceAttributeDisplaySchema, traceCacheRefSchema, traceDisplayConfigSchema, traceDisplayInputConfigSchema, traceSpanErrorSchema, traceSpanKindSchema, traceSpanSchema, traceSpanWarningSchema, trialSelectionModeSchema, updateManualScoreRequestSchema, z };
package/dist/index.mjs CHANGED
@@ -1,4 +1,4 @@
1
- import { $ as evalChartAxisSchema, $t as setEvalOutput, A as deriveScopedSummaryFromCases, At as columnFormatSchema, B as llmCallsConfigSchema, Bt as evalSpan, C as updateManualScoreRequestSchema, Ct as traceDisplayInputConfigSchema, D as getNestedAttribute, Dt as traceSpanWarningSchema, E as extractLlmCalls, Et as traceSpanSchema, F as DEFAULT_LLM_CALLS_CONFIG, Ft as repoFileRefSchema, G as caseRowSchema, Gt as appendToEvalOutput, H as trialSelectionModeSchema, Ht as hashCacheKey, I as agentEvalsConfigSchema, It as runArtifactRefSchema, J as evalStatItemSchema, Jt as getEvalCaseInput, K as evalFreshnessStatusSchema, Kt as evalAssert, L as llmCallMetricFormatSchema, Lt as z, M as deriveStatusFromChildStatuses, Mt as fileRefSchema, N as runManifestSchema, Nt as jsonCellSchema, O as getEvalTitle, Ot as cellValueSchema, P as runSummarySchema, Pt as numberDisplayOptionsSchema, Q as evalChartAggregateSchema, Qt as runInEvalScope, R as llmCallMetricPlacementSchema, Rt as buildTraceTree, S as createRunRequestSchema, St as traceDisplayConfigSchema, T as extractCacheHits, Tt as traceSpanKindSchema, U as assertionFailureSchema, Ut as hashCacheKeySync, V as resolveLlmCallsConfig, Vt as evalTracer, W as caseDetailSchema, Wt as EvalAssertionError, X as evalSummarySchema, Xt as isInEvalScope, Y as evalStatsConfigSchema, Yt as incrementEvalOutput, Z as scoreTraceSchema, Zt as mergeEvalOutput, _t as traceCacheRefSchema, at as evalChartTypeSchema, bt as traceAttributeDisplayPlacementSchema, ct as cacheFileSchema, dt as cacheOperationTypeSchema, en as setScopeCacheContext, et as evalChartBuiltinMetricSchema, ft as cacheRecordingOpSchema, gt as spanCacheOptionsSchema, ht as serializedCacheSpanSchema, it as evalChartTooltipExtraSchema, j as deriveStatusFromCaseRows, jt as columnKindSchema, k as getEvalDisplayStatus, kt as columnDefSchema, lt as cacheListItemSchema, mt as cacheStatusSchema, nn as defineEval, nt as evalChartConfigSchema, ot as evalChartsConfigSchema, pt as cacheRecordingSchema, q as evalStatAggregateSchema, qt as getCurrentScope, rn as getEvalRegistry, rt as evalChartMetricSchema, st as cacheEntrySchema, tn as repoFile, tt as evalChartColorSchema, ut as cacheModeSchema, vt as traceAttributeDisplayFormatSchema, w as sseEnvelopeSchema, wt as traceSpanErrorSchema, xt as traceAttributeDisplaySchema, yt as traceAttributeDisplayInputSchema, z as llmCallMetricSchema, zt as captureEvalSpanError } from "./runOrchestration-DA4Rh5g0.mjs";
2
- import { n as createRunner, t as runCli } from "./cli-DrPk66xh.mjs";
3
- import "./src-CfprG1RW.mjs";
4
- export { DEFAULT_LLM_CALLS_CONFIG, EvalAssertionError, agentEvalsConfigSchema, appendToEvalOutput, assertionFailureSchema, buildTraceTree, cacheEntrySchema, cacheFileSchema, cacheListItemSchema, cacheModeSchema, cacheOperationTypeSchema, cacheRecordingOpSchema, cacheRecordingSchema, cacheStatusSchema, captureEvalSpanError, caseDetailSchema, caseRowSchema, cellValueSchema, columnDefSchema, columnFormatSchema, columnKindSchema, createRunRequestSchema, createRunner, defineEval, deriveScopedSummaryFromCases, deriveStatusFromCaseRows, deriveStatusFromChildStatuses, evalAssert, evalChartAggregateSchema, evalChartAxisSchema, evalChartBuiltinMetricSchema, evalChartColorSchema, evalChartConfigSchema, evalChartMetricSchema, evalChartTooltipExtraSchema, evalChartTypeSchema, evalChartsConfigSchema, evalFreshnessStatusSchema, evalSpan, evalStatAggregateSchema, evalStatItemSchema, evalStatsConfigSchema, evalSummarySchema, evalTracer, extractCacheHits, extractLlmCalls, fileRefSchema, getCurrentScope, getEvalCaseInput, getEvalDisplayStatus, getEvalRegistry, getEvalTitle, getNestedAttribute, hashCacheKey, hashCacheKeySync, incrementEvalOutput, isInEvalScope, jsonCellSchema, llmCallMetricFormatSchema, llmCallMetricPlacementSchema, llmCallMetricSchema, llmCallsConfigSchema, mergeEvalOutput, numberDisplayOptionsSchema, repoFile, repoFileRefSchema, resolveLlmCallsConfig, runArtifactRefSchema, runCli, runInEvalScope, runManifestSchema, runSummarySchema, scoreTraceSchema, serializedCacheSpanSchema, setEvalOutput, setScopeCacheContext, spanCacheOptionsSchema, sseEnvelopeSchema, traceAttributeDisplayFormatSchema, traceAttributeDisplayInputSchema, traceAttributeDisplayPlacementSchema, traceAttributeDisplaySchema, traceCacheRefSchema, traceDisplayConfigSchema, traceDisplayInputConfigSchema, traceSpanErrorSchema, traceSpanKindSchema, traceSpanSchema, traceSpanWarningSchema, trialSelectionModeSchema, updateManualScoreRequestSchema, z };
1
+ import { $ as evalFreshnessStatusSchema, $t as evalAssert, A as getEvalDisplayStatus, At as traceDisplayInputConfigSchema, B as apiCallMetricPlacementSchema, Bt as jsonCellSchema, C as updateManualScoreRequestSchema, Ct as spanCacheOptionsSchema, D as extractLlmCalls, Dt as traceAttributeDisplayPlacementSchema, E as extractApiCalls, Et as traceAttributeDisplayInputSchema, F as runSummarySchema, Ft as cellValueSchema, G as llmCallMetricSchema, Gt as buildTraceTree, H as apiCallsConfigSchema, Ht as repoFileRefSchema, I as DEFAULT_API_CALLS_CONFIG, It as columnDefSchema, J as resolveLlmCallsConfig, Jt as evalTracer, K as llmCallsConfigSchema, Kt as captureEvalSpanError, L as DEFAULT_LLM_CALLS_CONFIG, Lt as columnFormatSchema, M as deriveStatusFromCaseRows, Mt as traceSpanKindSchema, N as deriveStatusFromChildStatuses, Nt as traceSpanSchema, O as getNestedAttribute, Ot as traceAttributeDisplaySchema, P as runManifestSchema, Pt as traceSpanWarningSchema, Q as caseRowSchema, Qt as appendToEvalOutput, R as agentEvalsConfigSchema, Rt as columnKindSchema, S as createRunRequestSchema, St as serializedCacheSpanSchema, T as extractCacheHits, Tt as traceAttributeDisplayFormatSchema, U as llmCallMetricFormatSchema, Ut as runArtifactRefSchema, V as apiCallMetricSchema, Vt as numberDisplayOptionsSchema, W as llmCallMetricPlacementSchema, Wt as z, X as assertionFailureSchema, Xt as hashCacheKeySync, Y as trialSelectionModeSchema, Yt as hashCacheKey, Z as caseDetailSchema, Zt as EvalAssertionError, _t as cacheModeSchema, an as nextEvalId, at as evalChartAggregateSchema, bt as cacheRecordingSchema, cn as setScopeCacheContext, ct as evalChartColorSchema, dn as getEvalRegistry, dt as evalChartTooltipExtraSchema, en as getCurrentScope, et as evalStatAggregateSchema, ft as evalChartTypeSchema, gt as cacheListItemSchema, ht as cacheFileSchema, in as mergeEvalOutput, it as scoreTraceSchema, j as deriveScopedSummaryFromCases, jt as traceSpanErrorSchema, k as getEvalTitle, kt as traceDisplayConfigSchema, ln as repoFile, lt as evalChartConfigSchema, mt as cacheEntrySchema, nn as incrementEvalOutput, nt as evalStatsConfigSchema, on as runInEvalScope, ot as evalChartAxisSchema, pt as evalChartsConfigSchema, q as resolveApiCallsConfig, qt as evalSpan, rn as isInEvalScope, rt as evalSummarySchema, sn as setEvalOutput, st as evalChartBuiltinMetricSchema, tn as getEvalCaseInput, tt as evalStatItemSchema, un as defineEval, ut as evalChartMetricSchema, vt as cacheOperationTypeSchema, w as sseEnvelopeSchema, wt as traceCacheRefSchema, xt as cacheStatusSchema, yt as cacheRecordingOpSchema, z as apiCallMetricFormatSchema, zt as fileRefSchema } from "./runOrchestration-COFhQvTJ.mjs";
2
+ import { n as createRunner, t as runCli } from "./cli-COzPxKg2.mjs";
3
+ import "./src-OZSs693X.mjs";
4
+ export { DEFAULT_API_CALLS_CONFIG, DEFAULT_LLM_CALLS_CONFIG, EvalAssertionError, agentEvalsConfigSchema, apiCallMetricFormatSchema, apiCallMetricPlacementSchema, apiCallMetricSchema, apiCallsConfigSchema, appendToEvalOutput, assertionFailureSchema, buildTraceTree, cacheEntrySchema, cacheFileSchema, cacheListItemSchema, cacheModeSchema, cacheOperationTypeSchema, cacheRecordingOpSchema, cacheRecordingSchema, cacheStatusSchema, captureEvalSpanError, caseDetailSchema, caseRowSchema, cellValueSchema, columnDefSchema, columnFormatSchema, columnKindSchema, createRunRequestSchema, createRunner, defineEval, deriveScopedSummaryFromCases, deriveStatusFromCaseRows, deriveStatusFromChildStatuses, evalAssert, evalChartAggregateSchema, evalChartAxisSchema, evalChartBuiltinMetricSchema, evalChartColorSchema, evalChartConfigSchema, evalChartMetricSchema, evalChartTooltipExtraSchema, evalChartTypeSchema, evalChartsConfigSchema, evalFreshnessStatusSchema, evalSpan, evalStatAggregateSchema, evalStatItemSchema, evalStatsConfigSchema, evalSummarySchema, evalTracer, extractApiCalls, extractCacheHits, extractLlmCalls, fileRefSchema, getCurrentScope, getEvalCaseInput, getEvalDisplayStatus, getEvalRegistry, getEvalTitle, getNestedAttribute, hashCacheKey, hashCacheKeySync, incrementEvalOutput, isInEvalScope, jsonCellSchema, llmCallMetricFormatSchema, llmCallMetricPlacementSchema, llmCallMetricSchema, llmCallsConfigSchema, mergeEvalOutput, nextEvalId, numberDisplayOptionsSchema, repoFile, repoFileRefSchema, resolveApiCallsConfig, resolveLlmCallsConfig, runArtifactRefSchema, runCli, runInEvalScope, runManifestSchema, runSummarySchema, scoreTraceSchema, serializedCacheSpanSchema, setEvalOutput, setScopeCacheContext, spanCacheOptionsSchema, sseEnvelopeSchema, traceAttributeDisplayFormatSchema, traceAttributeDisplayInputSchema, traceAttributeDisplayPlacementSchema, traceAttributeDisplaySchema, traceCacheRefSchema, traceDisplayConfigSchema, traceDisplayInputConfigSchema, traceSpanErrorSchema, traceSpanKindSchema, traceSpanSchema, traceSpanWarningSchema, trialSelectionModeSchema, updateManualScoreRequestSchema, z };
package/dist/runChild.mjs CHANGED
@@ -1,4 +1,4 @@
1
- import { N as runManifestSchema, P as runSummarySchema, S as createRunRequestSchema, Y as evalStatsConfigSchema, kt as columnDefSchema, ot as evalChartsConfigSchema, t as executeRun, v as loadConfig, x as createFsCacheStore } from "./runOrchestration-DA4Rh5g0.mjs";
1
+ import { F as runSummarySchema, It as columnDefSchema, P as runManifestSchema, S as createRunRequestSchema, nt as evalStatsConfigSchema, pt as evalChartsConfigSchema, t as executeRun, v as loadConfig, x as createFsCacheStore } from "./runOrchestration-COFhQvTJ.mjs";
2
2
  import { createHash } from "node:crypto";
3
3
  import { readFile } from "node:fs/promises";
4
4
  import { z } from "zod/v4";