@ls-stack/agent-eval 0.21.0 → 0.23.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{app-CmwmcUgG.mjs → app-Cw79dJDr.mjs} +3 -3
- package/dist/apps/web/dist/assets/index-AUDD3rNB.js +118 -0
- package/dist/apps/web/dist/index.html +1 -1
- package/dist/bin.mjs +1 -1
- package/dist/{cli-DumvanQI.mjs → cli-D3QNOcPN.mjs} +3 -3
- package/dist/index.d.mts +102 -56
- package/dist/index.mjs +4 -4
- package/dist/runChild.mjs +1 -1
- package/dist/{runOrchestration-zYAcAPtS.mjs → runOrchestration-CimthgI7.mjs} +248 -90
- package/dist/{runner-Dy_PECaf.mjs → runner-4yNYRvmF.mjs} +2 -2
- package/dist/{runner-BcwyX9CO.mjs → runner-B-SYzW8w.mjs} +1 -1
- package/dist/src-CcXfWT4M.mjs +3 -0
- package/package.json +1 -1
- package/skills/agent-eval/SKILL.md +36 -17
- package/dist/apps/web/dist/assets/index-EXO08yya.js +0 -118
- package/dist/src-BoAJb4wC.mjs +0 -3
|
@@ -25,7 +25,7 @@
|
|
|
25
25
|
href="https://fonts.googleapis.com/css2?family=Geist:wght@400;500;600&family=JetBrains+Mono:wght@400;500&display=swap"
|
|
26
26
|
rel="stylesheet"
|
|
27
27
|
/>
|
|
28
|
-
<script type="module" crossorigin src="/assets/index-EXO08yya.js"></script>
|
|
28
|
+
<script type="module" crossorigin src="/assets/index-AUDD3rNB.js"></script>
|
|
29
29
|
<link rel="stylesheet" crossorigin href="/assets/index-r0dVFK0B.css">
|
|
30
30
|
</head>
|
|
31
31
|
<body>
|
package/dist/bin.mjs
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import { C as validateCharts, F as deriveScopedSummaryFromCases,
|
|
1
|
+
import { C as validateCharts, F as deriveScopedSummaryFromCases, Ln as getEvalRegistry, N as getEvalTitle, P as getEvalDisplayStatus, S as normalizeScoreDef, _ as loadEvalModule, a as loadPersistedRunSnapshot, b as loadConfig, c as persistCaseDetail, d as recomputePersistedCaseStatus, et as resolveApiCallsConfig, f as runTouchesEval, g as setLatestRunInfoMap, h as getTargetEvalIds, i as getLatestRunInfos, l as persistRunState, m as buildEvalSummary, n as generateRunId, o as loadPersistedRunSnapshots, p as resolveArtifactPath, r as getLastRunStatuses, s as nextShortIdFromSnapshots, tt as resolveLlmCallsConfig, u as recomputeEvalStatusesInRuns, v as parseEvalMetas, w as createFsCacheStore, x as buildDeclaredColumnDefs, y as resolveEvalDefaultConfig, z as runSummarySchema } from "./runOrchestration-CimthgI7.mjs";
|
|
2
2
|
import { createHash } from "node:crypto";
|
|
3
3
|
import { mkdir, readFile, rm, writeFile } from "node:fs/promises";
|
|
4
4
|
import { dirname, join, relative, resolve } from "node:path";
|
|
@@ -959,8 +959,8 @@ async function commandApp(args) {
|
|
|
959
959
|
const { serve } = await import("@hono/node-server");
|
|
960
960
|
const bundledWebDist = resolve(currentDir, "apps/web/dist");
|
|
961
961
|
if (existsSync(bundledWebDist)) process.env.AGENT_EVALS_WEB_DIST = bundledWebDist;
|
|
962
|
-
const appModule = await import("./app-CmwmcUgG.mjs");
|
|
963
|
-
const runnerModule = await import("./runner-BcwyX9CO.mjs");
|
|
962
|
+
const appModule = await import("./app-Cw79dJDr.mjs");
|
|
963
|
+
const runnerModule = await import("./runner-B-SYzW8w.mjs");
|
|
964
964
|
if (!isHonoAppModule(appModule)) throw new Error("Server app module is invalid");
|
|
965
965
|
if (!isServerRunnerModule(runnerModule)) throw new Error("Server runner module is invalid");
|
|
966
966
|
await runnerModule.initRunner();
|
package/dist/index.d.mts
CHANGED
|
@@ -645,7 +645,7 @@ declare const caseRowSchema: z$1.ZodObject<{
|
|
|
645
645
|
cancelled: "cancelled";
|
|
646
646
|
pending: "pending";
|
|
647
647
|
}>;
|
|
648
|
-
|
|
648
|
+
durationMs: z$1.ZodNullable<z$1.ZodNumber>;
|
|
649
649
|
costUsd: z$1.ZodOptional<z$1.ZodNullable<z$1.ZodNumber>>;
|
|
650
650
|
columns: z$1.ZodRecord<z$1.ZodString, z$1.ZodUnion<readonly [z$1.ZodType<string | number | boolean | Record<string, unknown> | unknown[] | null, unknown, z$1.core.$ZodTypeInternals<string | number | boolean | Record<string, unknown> | unknown[] | null, unknown>>, z$1.ZodUnion<readonly [z$1.ZodObject<{
|
|
651
651
|
source: z$1.ZodLiteral<"repo">;
|
|
@@ -1486,7 +1486,7 @@ declare const defaultConfigKeySchema: z$1.ZodEnum<{
|
|
|
1486
1486
|
cachedInputTokens: "cachedInputTokens";
|
|
1487
1487
|
cacheCreationInputTokens: "cacheCreationInputTokens";
|
|
1488
1488
|
reasoningTokens: "reasoningTokens";
|
|
1489
|
-
|
|
1489
|
+
llmDurationMs: "llmDurationMs";
|
|
1490
1490
|
}>;
|
|
1491
1491
|
/** Built-in eval-level output/column key. */
|
|
1492
1492
|
type DefaultConfigKey = z$1.infer<typeof defaultConfigKeySchema>;
|
|
@@ -1501,7 +1501,7 @@ declare const removeDefaultConfigSchema: z$1.ZodUnion<readonly [z$1.ZodLiteral<t
|
|
|
1501
1501
|
cachedInputTokens: "cachedInputTokens";
|
|
1502
1502
|
cacheCreationInputTokens: "cacheCreationInputTokens";
|
|
1503
1503
|
reasoningTokens: "reasoningTokens";
|
|
1504
|
-
|
|
1504
|
+
llmDurationMs: "llmDurationMs";
|
|
1505
1505
|
}>>]>;
|
|
1506
1506
|
/** Removal config for built-in eval-level outputs and UI metadata. */
|
|
1507
1507
|
type RemoveDefaultConfig = z$1.infer<typeof removeDefaultConfigSchema>;
|
|
@@ -1595,7 +1595,7 @@ declare const apiCallMetricSchema: z$1.ZodObject<{
|
|
|
1595
1595
|
type ApiCallMetric = z$1.infer<typeof apiCallMetricSchema>;
|
|
1596
1596
|
/**
|
|
1597
1597
|
* Schema for one model/provider pricing entry used to derive LLM-call costs
|
|
1598
|
-
* from token counts
|
|
1598
|
+
* from token counts.
|
|
1599
1599
|
*/
|
|
1600
1600
|
declare const llmCallPricingSchema: z$1.ZodObject<{
|
|
1601
1601
|
model: z$1.ZodString;
|
|
@@ -1604,6 +1604,7 @@ declare const llmCallPricingSchema: z$1.ZodObject<{
|
|
|
1604
1604
|
outputUsdPerMillion: z$1.ZodOptional<z$1.ZodNumber>;
|
|
1605
1605
|
cachedInputUsdPerMillion: z$1.ZodOptional<z$1.ZodNumber>;
|
|
1606
1606
|
cacheCreationInputUsdPerMillion: z$1.ZodOptional<z$1.ZodNumber>;
|
|
1607
|
+
cacheCreationInput1hUsdPerMillion: z$1.ZodOptional<z$1.ZodNumber>;
|
|
1607
1608
|
reasoningUsdPerMillion: z$1.ZodOptional<z$1.ZodNumber>;
|
|
1608
1609
|
}, z$1.core.$strip>;
|
|
1609
1610
|
/** Model/provider pricing entry authored in `agent-evals.config.ts`. */
|
|
@@ -1618,15 +1619,9 @@ declare const llmCallsConfigSchema: z$1.ZodObject<{
|
|
|
1618
1619
|
outputTokens: z$1.ZodOptional<z$1.ZodString>;
|
|
1619
1620
|
cachedInputTokens: z$1.ZodOptional<z$1.ZodString>;
|
|
1620
1621
|
cacheCreationInputTokens: z$1.ZodOptional<z$1.ZodString>;
|
|
1622
|
+
cacheCreationInput1hTokens: z$1.ZodOptional<z$1.ZodString>;
|
|
1621
1623
|
reasoningTokens: z$1.ZodOptional<z$1.ZodString>;
|
|
1622
|
-
|
|
1623
|
-
tokensPerSecond: z$1.ZodOptional<z$1.ZodString>;
|
|
1624
|
-
cost: z$1.ZodOptional<z$1.ZodString>;
|
|
1625
|
-
inputCost: z$1.ZodOptional<z$1.ZodString>;
|
|
1626
|
-
outputCost: z$1.ZodOptional<z$1.ZodString>;
|
|
1627
|
-
cachedInputCost: z$1.ZodOptional<z$1.ZodString>;
|
|
1628
|
-
cacheCreationInputCost: z$1.ZodOptional<z$1.ZodString>;
|
|
1629
|
-
reasoningCost: z$1.ZodOptional<z$1.ZodString>;
|
|
1624
|
+
latencyMs: z$1.ZodOptional<z$1.ZodString>;
|
|
1630
1625
|
steps: z$1.ZodOptional<z$1.ZodString>;
|
|
1631
1626
|
finishReason: z$1.ZodOptional<z$1.ZodString>;
|
|
1632
1627
|
input: z$1.ZodOptional<z$1.ZodString>;
|
|
@@ -1641,6 +1636,7 @@ declare const llmCallsConfigSchema: z$1.ZodObject<{
|
|
|
1641
1636
|
outputUsdPerMillion: z$1.ZodOptional<z$1.ZodNumber>;
|
|
1642
1637
|
cachedInputUsdPerMillion: z$1.ZodOptional<z$1.ZodNumber>;
|
|
1643
1638
|
cacheCreationInputUsdPerMillion: z$1.ZodOptional<z$1.ZodNumber>;
|
|
1639
|
+
cacheCreationInput1hUsdPerMillion: z$1.ZodOptional<z$1.ZodNumber>;
|
|
1644
1640
|
reasoningUsdPerMillion: z$1.ZodOptional<z$1.ZodNumber>;
|
|
1645
1641
|
}, z$1.core.$strip>>>;
|
|
1646
1642
|
metrics: z$1.ZodOptional<z$1.ZodArray<z$1.ZodObject<{
|
|
@@ -1714,15 +1710,9 @@ type ResolvedLlmCallsConfig = {
|
|
|
1714
1710
|
outputTokens: string;
|
|
1715
1711
|
cachedInputTokens: string;
|
|
1716
1712
|
cacheCreationInputTokens: string;
|
|
1713
|
+
cacheCreationInput1hTokens: string;
|
|
1717
1714
|
reasoningTokens: string;
|
|
1718
|
-
|
|
1719
|
-
tokensPerSecond: string;
|
|
1720
|
-
cost: string;
|
|
1721
|
-
inputCost: string;
|
|
1722
|
-
outputCost: string;
|
|
1723
|
-
cachedInputCost: string;
|
|
1724
|
-
cacheCreationInputCost: string;
|
|
1725
|
-
reasoningCost: string;
|
|
1715
|
+
latencyMs: string;
|
|
1726
1716
|
steps: string;
|
|
1727
1717
|
finishReason: string;
|
|
1728
1718
|
input: string;
|
|
@@ -1776,6 +1766,7 @@ type ResolvedLlmCallPricing = {
|
|
|
1776
1766
|
outputUsdPerMillion?: number;
|
|
1777
1767
|
cachedInputUsdPerMillion?: number;
|
|
1778
1768
|
cacheCreationInputUsdPerMillion?: number;
|
|
1769
|
+
cacheCreationInput1hUsdPerMillion?: number;
|
|
1779
1770
|
reasoningUsdPerMillion?: number;
|
|
1780
1771
|
};
|
|
1781
1772
|
/** Default LLM-calls config the UI uses before the workspace fetch resolves. */
|
|
@@ -1791,8 +1782,8 @@ declare const DEFAULT_API_CALLS_CONFIG: ResolvedApiCallsConfig;
|
|
|
1791
1782
|
* attribute path.
|
|
1792
1783
|
* - Missing `metrics[].format` defaults to `'string'`.
|
|
1793
1784
|
* - Missing `metrics[].placements` defaults to `['body']`.
|
|
1794
|
-
* - Missing `pricing` defaults to an empty registry;
|
|
1795
|
-
*
|
|
1785
|
+
* - Missing `pricing` defaults to an empty registry; built-in costs are only
|
|
1786
|
+
* derived from configured pricing and token counts.
|
|
1796
1787
|
*/
|
|
1797
1788
|
declare function resolveLlmCallsConfig(input: LlmCallsConfigInput | undefined): ResolvedLlmCallsConfig;
|
|
1798
1789
|
/**
|
|
@@ -1846,10 +1837,10 @@ type AgentEvalsConfig = {
|
|
|
1846
1837
|
*
|
|
1847
1838
|
* Determines which trace spans are treated as LLM calls (`kinds`), how
|
|
1848
1839
|
* structured fields like `model` and `usage.inputTokens` are read from
|
|
1849
|
-
* span attributes,
|
|
1850
|
-
*
|
|
1851
|
-
* defaults; the LLM calls tab is
|
|
1852
|
-
* matching span exists in a case run.
|
|
1840
|
+
* span attributes, which pricing table derives built-in costs, and which
|
|
1841
|
+
* custom user-defined metrics are surfaced on each call. All fields are
|
|
1842
|
+
* optional and fall back to the documented defaults; the LLM calls tab is
|
|
1843
|
+
* shown automatically when at least one matching span exists in a case run.
|
|
1853
1844
|
*
|
|
1854
1845
|
* @example
|
|
1855
1846
|
* ```ts
|
|
@@ -1982,15 +1973,9 @@ declare const agentEvalsConfigSchema: z$1.ZodObject<{
|
|
|
1982
1973
|
outputTokens: z$1.ZodOptional<z$1.ZodString>;
|
|
1983
1974
|
cachedInputTokens: z$1.ZodOptional<z$1.ZodString>;
|
|
1984
1975
|
cacheCreationInputTokens: z$1.ZodOptional<z$1.ZodString>;
|
|
1976
|
+
cacheCreationInput1hTokens: z$1.ZodOptional<z$1.ZodString>;
|
|
1985
1977
|
reasoningTokens: z$1.ZodOptional<z$1.ZodString>;
|
|
1986
|
-
|
|
1987
|
-
tokensPerSecond: z$1.ZodOptional<z$1.ZodString>;
|
|
1988
|
-
cost: z$1.ZodOptional<z$1.ZodString>;
|
|
1989
|
-
inputCost: z$1.ZodOptional<z$1.ZodString>;
|
|
1990
|
-
outputCost: z$1.ZodOptional<z$1.ZodString>;
|
|
1991
|
-
cachedInputCost: z$1.ZodOptional<z$1.ZodString>;
|
|
1992
|
-
cacheCreationInputCost: z$1.ZodOptional<z$1.ZodString>;
|
|
1993
|
-
reasoningCost: z$1.ZodOptional<z$1.ZodString>;
|
|
1978
|
+
latencyMs: z$1.ZodOptional<z$1.ZodString>;
|
|
1994
1979
|
steps: z$1.ZodOptional<z$1.ZodString>;
|
|
1995
1980
|
finishReason: z$1.ZodOptional<z$1.ZodString>;
|
|
1996
1981
|
input: z$1.ZodOptional<z$1.ZodString>;
|
|
@@ -2005,6 +1990,7 @@ declare const agentEvalsConfigSchema: z$1.ZodObject<{
|
|
|
2005
1990
|
outputUsdPerMillion: z$1.ZodOptional<z$1.ZodNumber>;
|
|
2006
1991
|
cachedInputUsdPerMillion: z$1.ZodOptional<z$1.ZodNumber>;
|
|
2007
1992
|
cacheCreationInputUsdPerMillion: z$1.ZodOptional<z$1.ZodNumber>;
|
|
1993
|
+
cacheCreationInput1hUsdPerMillion: z$1.ZodOptional<z$1.ZodNumber>;
|
|
2008
1994
|
reasoningUsdPerMillion: z$1.ZodOptional<z$1.ZodNumber>;
|
|
2009
1995
|
}, z$1.core.$strip>>>;
|
|
2010
1996
|
metrics: z$1.ZodOptional<z$1.ZodArray<z$1.ZodObject<{
|
|
@@ -2035,7 +2021,7 @@ declare const agentEvalsConfigSchema: z$1.ZodObject<{
|
|
|
2035
2021
|
cachedInputTokens: "cachedInputTokens";
|
|
2036
2022
|
cacheCreationInputTokens: "cacheCreationInputTokens";
|
|
2037
2023
|
reasoningTokens: "reasoningTokens";
|
|
2038
|
-
|
|
2024
|
+
llmDurationMs: "llmDurationMs";
|
|
2039
2025
|
}>>]>>;
|
|
2040
2026
|
apiCalls: z$1.ZodOptional<z$1.ZodObject<{
|
|
2041
2027
|
kinds: z$1.ZodOptional<z$1.ZodArray<z$1.ZodString>>;
|
|
@@ -2104,7 +2090,8 @@ type LlmCallEntry = {
|
|
|
2104
2090
|
cachedInputTokens: number | null;
|
|
2105
2091
|
cacheCreationInputTokens: number | null;
|
|
2106
2092
|
reasoningTokens: number | null;
|
|
2107
|
-
totalTokens: number | null;
|
|
2093
|
+
totalTokens: number | null; /** Time to first token for the LLM call in milliseconds, when reported by the span. */
|
|
2094
|
+
latencyMs: number | null;
|
|
2108
2095
|
tokensPerSecond: number | null;
|
|
2109
2096
|
costUsd: number | null;
|
|
2110
2097
|
inputCostUsd: number | null;
|
|
@@ -2114,8 +2101,8 @@ type LlmCallEntry = {
|
|
|
2114
2101
|
reasoningCostUsd: number | null; /** Number of inference rounds. Derived from the array length when `stepDetails` is set. */
|
|
2115
2102
|
stepCount: number | null; /** Per-step breakdown when the configured `steps` attribute resolves to an array. */
|
|
2116
2103
|
stepDetails: unknown[] | null;
|
|
2117
|
-
finishReason: string | null;
|
|
2118
|
-
|
|
2104
|
+
finishReason: string | null; /** Elapsed LLM call span duration in milliseconds. */
|
|
2105
|
+
durationMs: number | null;
|
|
2119
2106
|
input: unknown;
|
|
2120
2107
|
output: unknown;
|
|
2121
2108
|
reasoning: unknown;
|
|
@@ -2129,16 +2116,22 @@ type LlmCallEntry = {
|
|
|
2129
2116
|
* shape consumed by the LLM calls tab.
|
|
2130
2117
|
*
|
|
2131
2118
|
* Spans whose `kind` is not in `config.kinds` are dropped. Structured fields
|
|
2132
|
-
* (`model`, token counts,
|
|
2119
|
+
* (`model`, token counts, latency, etc.) are read via
|
|
2133
2120
|
* `getNestedAttribute` from the configured paths, with safe coercion to
|
|
2134
|
-
* `string | null` / `number | null`.
|
|
2135
|
-
*
|
|
2136
|
-
* `
|
|
2137
|
-
*
|
|
2138
|
-
*
|
|
2139
|
-
*
|
|
2140
|
-
*
|
|
2141
|
-
*
|
|
2121
|
+
* `string | null` / `number | null`. `latencyMs` is an explicit
|
|
2122
|
+
* time-to-first-token attribute; full span elapsed time is reported separately
|
|
2123
|
+
* as `durationMs`. Built-in USD costs are derived only from configured model
|
|
2124
|
+
* pricing and token counts. `totalTokens` is always derived from input +
|
|
2125
|
+
* output tokens. Cached input and cache creation tokens are reported
|
|
2126
|
+
* separately because they are subsets of input/output usage. The main cache
|
|
2127
|
+
* creation token field is treated as the total write count; optional one-hour
|
|
2128
|
+
* cache creation tokens only split that total for cost calculation. Base input
|
|
2129
|
+
* cost uses input minus cache read/write tokens so cached tokens are not
|
|
2130
|
+
* charged twice. Cache read/write costs still contribute to the total USD cost
|
|
2131
|
+
* at their configured rates. The `steps` attribute path may resolve to an array
|
|
2132
|
+
* of per-step detail objects, with `stepCount` derived from the array length.
|
|
2133
|
+
* `durationMs` and `tokensPerSecond` are `null` while the span is still
|
|
2134
|
+
* running. User-defined `metrics` whose path resolves to
|
|
2142
2135
|
* `undefined` are dropped, but `null`, `0`, and `false` are preserved as
|
|
2143
2136
|
* legitimate values worth displaying. Original span order is preserved so the
|
|
2144
2137
|
* LLM calls tab matches the ordering in the Trace tab.
|
|
@@ -2163,8 +2156,8 @@ type ApiCallEntry = {
|
|
|
2163
2156
|
status: EvalTraceSpan['status'];
|
|
2164
2157
|
method: string | null;
|
|
2165
2158
|
url: string | null;
|
|
2166
|
-
statusCode: number | null;
|
|
2167
|
-
|
|
2159
|
+
statusCode: number | null; /** Elapsed API call duration in milliseconds. */
|
|
2160
|
+
durationMs: number | null;
|
|
2168
2161
|
request: unknown;
|
|
2169
2162
|
response: unknown;
|
|
2170
2163
|
requestBody: unknown;
|
|
@@ -2181,10 +2174,10 @@ type ApiCallEntry = {
|
|
|
2181
2174
|
*
|
|
2182
2175
|
* Spans whose `kind` is not in `config.kinds` are dropped. Structured fields
|
|
2183
2176
|
* (`method`, `url`, `statusCode`, etc.) are read via `getNestedAttribute` from
|
|
2184
|
-
* the configured paths. `durationMs` takes precedence
|
|
2185
|
-
* fallback to the span start/end timestamps. User-defined `metrics`
|
|
2186
|
-
* resolves to `undefined` are dropped, but `null`, `0`, and `false`
|
|
2187
|
-
* preserved as legitimate values worth displaying. Original span order is
|
|
2177
|
+
* the configured paths. An explicit `durationMs` attribute takes precedence,
|
|
2178
|
+
* with a fallback to the span start/end timestamps. User-defined `metrics`
|
|
2179
|
+
* whose path resolves to `undefined` are dropped, but `null`, `0`, and `false`
|
|
2180
|
+
* are preserved as legitimate values worth displaying. Original span order is
|
|
2188
2181
|
* preserved so the API calls tab matches the ordering in the Trace tab.
|
|
2189
2182
|
*/
|
|
2190
2183
|
declare function extractApiCalls(spans: EvalTraceSpan[], config: ResolvedApiCallsConfig): ApiCallEntry[];
|
|
@@ -2765,6 +2758,12 @@ type EvalCase<TInput> = {
|
|
|
2765
2758
|
};
|
|
2766
2759
|
/** Runtime output values collected from output helpers and `deriveFromTracing`. */
|
|
2767
2760
|
type EvalOutputs = Record<string, unknown>;
|
|
2761
|
+
/**
|
|
2762
|
+
* Initial wall-clock time used by an eval's shifted Date clock.
|
|
2763
|
+
*
|
|
2764
|
+
* Pass `'now'` to opt one eval back into the real current clock.
|
|
2765
|
+
*/
|
|
2766
|
+
type EvalStartTime = Date | number | string;
|
|
2768
2767
|
/**
|
|
2769
2768
|
* Schema used to validate and type an eval's collected runtime outputs.
|
|
2770
2769
|
*
|
|
@@ -2932,6 +2931,23 @@ type EvalDefinitionBase<TInput = unknown, TOutputs extends EvalOutputs = EvalOut
|
|
|
2932
2931
|
* delay case finalization; late mutations are not guaranteed to persist.
|
|
2933
2932
|
*/
|
|
2934
2933
|
waitForBackgroundJobs?: boolean;
|
|
2934
|
+
/**
|
|
2935
|
+
* Optional initial wall-clock time for this eval's runtime.
|
|
2936
|
+
*
|
|
2937
|
+
* When set, `new Date()` and `Date.now()` inside case generation, execution,
|
|
2938
|
+
* tracing, derived outputs, and scorers start from this wall-clock value and
|
|
2939
|
+
* then continue advancing with real elapsed time. The default is
|
|
2940
|
+
* `2026-04-10T00:00:00.000Z`. Pass `'now'` to use the real current clock for
|
|
2941
|
+
* this eval. Timers are not faked, so `setTimeout` and other asynchronous
|
|
2942
|
+
* work still run normally.
|
|
2943
|
+
*/
|
|
2944
|
+
startTime?: EvalStartTime;
|
|
2945
|
+
/**
|
|
2946
|
+
* Freeze the eval Date clock at `startTime` until `advanceEvalTime(...)`
|
|
2947
|
+
* moves it manually. Defaults to `false`, so eval time advances with real
|
|
2948
|
+
* elapsed time from the configured `startTime`.
|
|
2949
|
+
*/
|
|
2950
|
+
freezeTime?: boolean;
|
|
2935
2951
|
execute: (ctx: EvalExecuteContext<TInput, TOutputs>) => Promise<void> | void;
|
|
2936
2952
|
deriveFromTracing?: (ctx: EvalDeriveContext<TInput>) => Partial<TOutputs> | Promise<Partial<TOutputs>>;
|
|
2937
2953
|
scores?: Record<string, EvalScoreDef<TInput, TOutputs>>;
|
|
@@ -3025,6 +3041,9 @@ declare function defineEval<TInput = unknown, TOutputs extends EvalOutputs = Eva
|
|
|
3025
3041
|
declare function repoFile(path: string, mimeType?: string): RepoFileRef;
|
|
3026
3042
|
//#endregion
|
|
3027
3043
|
//#region ../sdk/src/runtime.d.ts
|
|
3044
|
+
declare global {
|
|
3045
|
+
var __agentEvalsRealDate: DateConstructor | undefined;
|
|
3046
|
+
}
|
|
3028
3047
|
/**
|
|
3029
3048
|
* Raw-key debug payload passed alongside cache writes.
|
|
3030
3049
|
*
|
|
@@ -3070,7 +3089,15 @@ type CacheRecordingFrame = {
|
|
|
3070
3089
|
};
|
|
3071
3090
|
/** Mutable per-case runtime state stored in async local storage. */
|
|
3072
3091
|
type EvalCaseScope = {
|
|
3073
|
-
caseId: string; /**
|
|
3092
|
+
caseId: string; /** Initial wall-clock time used by Date APIs inside this eval case. */
|
|
3093
|
+
startTime: EvalStartTime | undefined; /** Mutable shifted wall-clock state shared across this eval case's phases. */
|
|
3094
|
+
evalClockState: {
|
|
3095
|
+
startMs: number;
|
|
3096
|
+
realStartMs: number;
|
|
3097
|
+
offsetMs: number;
|
|
3098
|
+
frozen: boolean;
|
|
3099
|
+
shifted: boolean;
|
|
3100
|
+
}; /** Stable prefix used by `nextEvalId()` for this eval case scope. */
|
|
3074
3101
|
idPrefix: string | undefined; /** Monotonic per-scope counter used by `nextEvalId()`. */
|
|
3075
3102
|
nextEvalIdCounter: number; /** Authored input for the current case, when provided by the runner. */
|
|
3076
3103
|
input?: unknown;
|
|
@@ -3108,11 +3135,28 @@ type EvalCaseScope = {
|
|
|
3108
3135
|
* modules imported while a run is being prepared.
|
|
3109
3136
|
*/
|
|
3110
3137
|
type EvalRuntimeScope = 'env' | 'cases' | 'eval' | 'derive' | 'outputsSchema' | 'scorer';
|
|
3138
|
+
/** Time unit accepted by `advanceEvalTime(unit, amount)`. */
|
|
3139
|
+
type EvalTimeUnit = 'millisecond' | 'milliseconds' | 'second' | 'seconds' | 'minute' | 'minutes' | 'hour' | 'hours' | 'day' | 'days';
|
|
3111
3140
|
type EvalLogLevelInput = RunLogLevel | 'warning';
|
|
3112
3141
|
/** Error thrown when an eval assertion fails during case execution. */
|
|
3113
3142
|
declare class EvalAssertionError extends Error {
|
|
3114
3143
|
constructor(message: string);
|
|
3115
3144
|
}
|
|
3145
|
+
/**
|
|
3146
|
+
* Return the wall-clock start time captured for the active eval.
|
|
3147
|
+
*
|
|
3148
|
+
* For `startTime: 'now'`, this is the real time captured when the eval clock
|
|
3149
|
+
* context was created.
|
|
3150
|
+
*/
|
|
3151
|
+
declare function getEvalStartTime(): Date;
|
|
3152
|
+
/**
|
|
3153
|
+
* Advance the active eval's shifted Date clock and return the new time.
|
|
3154
|
+
*
|
|
3155
|
+
* Throws outside an active shifted eval clock. Evals that set
|
|
3156
|
+
* `startTime: 'now'` use the real current clock unless `freezeTime: true` is
|
|
3157
|
+
* also set.
|
|
3158
|
+
*/
|
|
3159
|
+
declare function advanceEvalTime(unit: EvalTimeUnit, amount: number): Date;
|
|
3116
3160
|
/** Return the current eval scope for the active async context, if any. */
|
|
3117
3161
|
declare function getCurrentScope(): EvalCaseScope | undefined;
|
|
3118
3162
|
/**
|
|
@@ -3162,7 +3206,9 @@ type RunInEvalScopeOptions = {
|
|
|
3162
3206
|
idPrefix?: string; /** Cache adapter + mode attached to the scope before `fn` runs. */
|
|
3163
3207
|
cacheContext?: CacheScopeContext; /** Whether registered background jobs should settle before scope finalizes. */
|
|
3164
3208
|
waitForBackgroundJobs?: boolean; /** Eval runner phase exposed through `isInEvalScope()`. Defaults to `eval`. */
|
|
3165
|
-
runtimeScope?: EvalRuntimeScope;
|
|
3209
|
+
runtimeScope?: EvalRuntimeScope; /** Initial wall-clock time used by `new Date()` and `Date.now()` in this eval. */
|
|
3210
|
+
startTime?: EvalStartTime; /** Whether Date APIs stay frozen until advanced manually. */
|
|
3211
|
+
freezeTime?: boolean;
|
|
3166
3212
|
};
|
|
3167
3213
|
/** Execute a callback while `isInEvalScope()` reports a runner phase. */
|
|
3168
3214
|
declare function runInEvalRuntimeScope<T>(runtimeScope: EvalRuntimeScope, fn: () => Promise<T> | T): Promise<T>;
|
|
@@ -3577,4 +3623,4 @@ declare function createRunner({
|
|
|
3577
3623
|
*/
|
|
3578
3624
|
declare function runCli(argv: string[]): Promise<void>;
|
|
3579
3625
|
//#endregion
|
|
3580
|
-
export { type AgentEvalsConfig, type ApiCallEntry, type ApiCallMetric, type ApiCallMetricFormat, type ApiCallMetricPlacement, type ApiCallMetricValue, type ApiCallsConfigInput, type AssertionFailure, type CacheActivityEntry, type CacheAdapter, type CacheDebugKeyEntry, type CacheDebugKeyFile, type CacheDebugKeyWrite, type CacheEntry, type CacheEntryWithDebugKey, type CacheFile, type CacheHitEntry, type CacheKeyHashInput, type CacheKeyHashOptions, type CacheListItem, type CacheMode, type CacheOperationType, type CacheRecording, type CacheRecordingFrame, type CacheRecordingOp, type CacheScopeContext, type CacheStatus, type CaptureEvalSpanErrorLevel, type CaptureEvalSpanErrorOptions, type CaseDetail, type CaseRow, type CellValue, type ColumnDef, type ColumnFormat, type ColumnKind, type CreateRunRequest, DEFAULT_API_CALLS_CONFIG, DEFAULT_LLM_CALLS_CONFIG, type DefaultConfigKey, type DerivedStatus, EvalAssertionError, type EvalCase, type EvalCaseScope, type EvalChartAggregate, type EvalChartAxis, type EvalChartBuiltinMetric, type EvalChartColor, type EvalChartConfig, type EvalChartMetric, type EvalChartTooltipExtra, type EvalChartType, type EvalChartsConfig, type EvalColumnOverride, type EvalColumns, type EvalDefinition, type EvalDeriveContext, type EvalDisplayStatus, type EvalExecuteContext, type EvalFreshnessStatus, type EvalManualScoreDef, type EvalOutputs, type EvalOutputsSchema, type EvalRunner, type EvalRuntimeScope, type EvalScoreContext, type EvalScoreDef, type EvalScoreFn, type EvalSetOutput, type EvalStatAggregate, type EvalStatItem, type EvalStatsConfig, type EvalSummary, type EvalTraceSpan, type EvalTraceSpanError, type EvalTraceSpanWarning, type EvalTraceTree, type FileRef, type JsonCell, type LlmCallEntry, type LlmCallMetric, type LlmCallMetricFormat, type LlmCallMetricPlacement, type LlmCallMetricValue, type LlmCallPricing, type LlmCallsConfigInput, type NumberDisplayOptions, type RemoveDefaultConfig, type RepoFileRef, type ResolvedApiCallMetric, type 
ResolvedApiCallsConfig, type ResolvedLlmCallMetric, type ResolvedLlmCallPricing, type ResolvedLlmCallsConfig, type RunArtifactRef, type RunInEvalScopeOptions, type RunLogEntry, type RunLogLevel, type RunLogLocation, type RunLogPhase, type RunLogsConfigInput, type RunManifest, type RunSummary, type ScalarCell, type ScopedCaseSummary, type ScoreTrace, type SerializedCacheSpan, type SerializedCacheValue, type SpanCacheOptions, type SseEnvelope, type SseEventType, type TraceActiveSpan, type TraceAttributeDisplay, type TraceAttributeDisplayFormat, type TraceAttributeDisplayInput, type TraceAttributeDisplayPlacement, type TraceAttributeTransform, type TraceAttributeTransformContext, type TraceCacheInfo, type TraceCacheRef, type TraceDisplayConfig, type TraceDisplayInputConfig, type TraceSpanInfo, type TrialSelectionMode, type UpdateManualScoreRequest, agentEvalsConfigSchema, apiCallMetricFormatSchema, apiCallMetricPlacementSchema, apiCallMetricSchema, apiCallsConfigSchema, appendToEvalOutput, assertionFailureSchema, buildTraceTree, cacheDebugKeyEntrySchema, cacheDebugKeyFileSchema, cacheEntrySchema, cacheEntryWithDebugKeySchema, cacheFileSchema, cacheListItemSchema, cacheModeSchema, cacheOperationTypeSchema, cacheRecordingOpSchema, cacheRecordingSchema, cacheStatusSchema, captureEvalSpanError, caseDetailSchema, caseRowSchema, cellValueSchema, columnDefSchema, columnFormatSchema, columnKindSchema, createRunRequestSchema, createRunner, defaultConfigKeySchema, defineEval, deriveScopedSummaryFromCases, deriveStatusFromCaseRows, deriveStatusFromChildStatuses, deserializeCacheRecording, deserializeCacheValue, evalAssert, evalChartAggregateSchema, evalChartAxisSchema, evalChartBuiltinMetricSchema, evalChartColorSchema, evalChartConfigSchema, evalChartMetricSchema, evalChartTooltipExtraSchema, evalChartTypeSchema, evalChartsConfigSchema, evalFreshnessStatusSchema, evalLog, evalSpan, evalStatAggregateSchema, evalStatItemSchema, evalStatsConfigSchema, evalSummarySchema, 
evalTracer, extractApiCalls, extractCacheEntries, extractCacheHits, extractLlmCalls, fileRefSchema, getCurrentScope, getEvalCaseInput, getEvalDisplayStatus, getEvalRegistry, getEvalTitle, getNestedAttribute, hashCacheKey, hashCacheKeySync, incrementEvalOutput, isInEvalScope, jsonCellSchema, llmCallMetricFormatSchema, llmCallMetricPlacementSchema, llmCallMetricSchema, llmCallPricingSchema, llmCallsConfigSchema, mergeEvalOutput, nextEvalId, numberDisplayOptionsSchema, removeDefaultConfigSchema, repoFile, repoFileRefSchema, resolveApiCallsConfig, resolveLlmCallsConfig, runArtifactRefSchema, runCli, runInEvalRuntimeScope, runInEvalScope, runInExistingEvalScope, runLogEntrySchema, runLogLevelSchema, runLogLocationSchema, runLogPhaseSchema, runLogsConfigSchema, runManifestSchema, runSummarySchema, scoreTraceSchema, serializeCacheRecording, serializeCacheValue, serializedCacheSpanSchema, setEvalOutput, setScopeCacheContext, spanCacheOptionsSchema, sseEnvelopeSchema, startEvalBackgroundJob, traceAttributeDisplayFormatSchema, traceAttributeDisplayInputSchema, traceAttributeDisplayPlacementSchema, traceAttributeDisplaySchema, traceCacheRefSchema, traceDisplayConfigSchema, traceDisplayInputConfigSchema, traceSpanErrorSchema, traceSpanKindSchema, traceSpanSchema, traceSpanWarningSchema, trialSelectionModeSchema, updateManualScoreRequestSchema, z };
|
|
3626
|
+
export { type AgentEvalsConfig, type ApiCallEntry, type ApiCallMetric, type ApiCallMetricFormat, type ApiCallMetricPlacement, type ApiCallMetricValue, type ApiCallsConfigInput, type AssertionFailure, type CacheActivityEntry, type CacheAdapter, type CacheDebugKeyEntry, type CacheDebugKeyFile, type CacheDebugKeyWrite, type CacheEntry, type CacheEntryWithDebugKey, type CacheFile, type CacheHitEntry, type CacheKeyHashInput, type CacheKeyHashOptions, type CacheListItem, type CacheMode, type CacheOperationType, type CacheRecording, type CacheRecordingFrame, type CacheRecordingOp, type CacheScopeContext, type CacheStatus, type CaptureEvalSpanErrorLevel, type CaptureEvalSpanErrorOptions, type CaseDetail, type CaseRow, type CellValue, type ColumnDef, type ColumnFormat, type ColumnKind, type CreateRunRequest, DEFAULT_API_CALLS_CONFIG, DEFAULT_LLM_CALLS_CONFIG, type DefaultConfigKey, type DerivedStatus, EvalAssertionError, type EvalCase, type EvalCaseScope, type EvalChartAggregate, type EvalChartAxis, type EvalChartBuiltinMetric, type EvalChartColor, type EvalChartConfig, type EvalChartMetric, type EvalChartTooltipExtra, type EvalChartType, type EvalChartsConfig, type EvalColumnOverride, type EvalColumns, type EvalDefinition, type EvalDeriveContext, type EvalDisplayStatus, type EvalExecuteContext, type EvalFreshnessStatus, type EvalManualScoreDef, type EvalOutputs, type EvalOutputsSchema, type EvalRunner, type EvalRuntimeScope, type EvalScoreContext, type EvalScoreDef, type EvalScoreFn, type EvalSetOutput, type EvalStartTime, type EvalStatAggregate, type EvalStatItem, type EvalStatsConfig, type EvalSummary, type EvalTimeUnit, type EvalTraceSpan, type EvalTraceSpanError, type EvalTraceSpanWarning, type EvalTraceTree, type FileRef, type JsonCell, type LlmCallEntry, type LlmCallMetric, type LlmCallMetricFormat, type LlmCallMetricPlacement, type LlmCallMetricValue, type LlmCallPricing, type LlmCallsConfigInput, type NumberDisplayOptions, type RemoveDefaultConfig, type 
RepoFileRef, type ResolvedApiCallMetric, type ResolvedApiCallsConfig, type ResolvedLlmCallMetric, type ResolvedLlmCallPricing, type ResolvedLlmCallsConfig, type RunArtifactRef, type RunInEvalScopeOptions, type RunLogEntry, type RunLogLevel, type RunLogLocation, type RunLogPhase, type RunLogsConfigInput, type RunManifest, type RunSummary, type ScalarCell, type ScopedCaseSummary, type ScoreTrace, type SerializedCacheSpan, type SerializedCacheValue, type SpanCacheOptions, type SseEnvelope, type SseEventType, type TraceActiveSpan, type TraceAttributeDisplay, type TraceAttributeDisplayFormat, type TraceAttributeDisplayInput, type TraceAttributeDisplayPlacement, type TraceAttributeTransform, type TraceAttributeTransformContext, type TraceCacheInfo, type TraceCacheRef, type TraceDisplayConfig, type TraceDisplayInputConfig, type TraceSpanInfo, type TrialSelectionMode, type UpdateManualScoreRequest, advanceEvalTime, agentEvalsConfigSchema, apiCallMetricFormatSchema, apiCallMetricPlacementSchema, apiCallMetricSchema, apiCallsConfigSchema, appendToEvalOutput, assertionFailureSchema, buildTraceTree, cacheDebugKeyEntrySchema, cacheDebugKeyFileSchema, cacheEntrySchema, cacheEntryWithDebugKeySchema, cacheFileSchema, cacheListItemSchema, cacheModeSchema, cacheOperationTypeSchema, cacheRecordingOpSchema, cacheRecordingSchema, cacheStatusSchema, captureEvalSpanError, caseDetailSchema, caseRowSchema, cellValueSchema, columnDefSchema, columnFormatSchema, columnKindSchema, createRunRequestSchema, createRunner, defaultConfigKeySchema, defineEval, deriveScopedSummaryFromCases, deriveStatusFromCaseRows, deriveStatusFromChildStatuses, deserializeCacheRecording, deserializeCacheValue, evalAssert, evalChartAggregateSchema, evalChartAxisSchema, evalChartBuiltinMetricSchema, evalChartColorSchema, evalChartConfigSchema, evalChartMetricSchema, evalChartTooltipExtraSchema, evalChartTypeSchema, evalChartsConfigSchema, evalFreshnessStatusSchema, evalLog, evalSpan, evalStatAggregateSchema, 
evalStatItemSchema, evalStatsConfigSchema, evalSummarySchema, evalTracer, extractApiCalls, extractCacheEntries, extractCacheHits, extractLlmCalls, fileRefSchema, getCurrentScope, getEvalCaseInput, getEvalDisplayStatus, getEvalRegistry, getEvalStartTime, getEvalTitle, getNestedAttribute, hashCacheKey, hashCacheKeySync, incrementEvalOutput, isInEvalScope, jsonCellSchema, llmCallMetricFormatSchema, llmCallMetricPlacementSchema, llmCallMetricSchema, llmCallPricingSchema, llmCallsConfigSchema, mergeEvalOutput, nextEvalId, numberDisplayOptionsSchema, removeDefaultConfigSchema, repoFile, repoFileRefSchema, resolveApiCallsConfig, resolveLlmCallsConfig, runArtifactRefSchema, runCli, runInEvalRuntimeScope, runInEvalScope, runInExistingEvalScope, runLogEntrySchema, runLogLevelSchema, runLogLocationSchema, runLogPhaseSchema, runLogsConfigSchema, runManifestSchema, runSummarySchema, scoreTraceSchema, serializeCacheRecording, serializeCacheValue, serializedCacheSpanSchema, setEvalOutput, setScopeCacheContext, spanCacheOptionsSchema, sseEnvelopeSchema, startEvalBackgroundJob, traceAttributeDisplayFormatSchema, traceAttributeDisplayInputSchema, traceAttributeDisplayPlacementSchema, traceAttributeDisplaySchema, traceCacheRefSchema, traceDisplayConfigSchema, traceDisplayInputConfigSchema, traceSpanErrorSchema, traceSpanKindSchema, traceSpanSchema, traceSpanWarningSchema, trialSelectionModeSchema, updateManualScoreRequestSchema, z };
|
package/dist/index.mjs
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import { $ as removeDefaultConfigSchema, $t as columnKindSchema, A as extractApiCalls, An as
|
|
2
|
-
import { n as createRunner, t as runCli } from "./cli-
|
|
3
|
-
import "./src-
|
|
4
|
-
export { DEFAULT_API_CALLS_CONFIG, DEFAULT_LLM_CALLS_CONFIG, EvalAssertionError, agentEvalsConfigSchema, apiCallMetricFormatSchema, apiCallMetricPlacementSchema, apiCallMetricSchema, apiCallsConfigSchema, appendToEvalOutput, assertionFailureSchema, buildTraceTree, cacheDebugKeyEntrySchema, cacheDebugKeyFileSchema, cacheEntrySchema, cacheEntryWithDebugKeySchema, cacheFileSchema, cacheListItemSchema, cacheModeSchema, cacheOperationTypeSchema, cacheRecordingOpSchema, cacheRecordingSchema, cacheStatusSchema, captureEvalSpanError, caseDetailSchema, caseRowSchema, cellValueSchema, columnDefSchema, columnFormatSchema, columnKindSchema, createRunRequestSchema, createRunner, defaultConfigKeySchema, defineEval, deriveScopedSummaryFromCases, deriveStatusFromCaseRows, deriveStatusFromChildStatuses, deserializeCacheRecording, deserializeCacheValue, evalAssert, evalChartAggregateSchema, evalChartAxisSchema, evalChartBuiltinMetricSchema, evalChartColorSchema, evalChartConfigSchema, evalChartMetricSchema, evalChartTooltipExtraSchema, evalChartTypeSchema, evalChartsConfigSchema, evalFreshnessStatusSchema, evalLog, evalSpan, evalStatAggregateSchema, evalStatItemSchema, evalStatsConfigSchema, evalSummarySchema, evalTracer, extractApiCalls, extractCacheEntries, extractCacheHits, extractLlmCalls, fileRefSchema, getCurrentScope, getEvalCaseInput, getEvalDisplayStatus, getEvalRegistry, getEvalTitle, getNestedAttribute, hashCacheKey, hashCacheKeySync, incrementEvalOutput, isInEvalScope, jsonCellSchema, llmCallMetricFormatSchema, llmCallMetricPlacementSchema, llmCallMetricSchema, llmCallPricingSchema, llmCallsConfigSchema, mergeEvalOutput, nextEvalId, numberDisplayOptionsSchema, removeDefaultConfigSchema, repoFile, repoFileRefSchema, resolveApiCallsConfig, resolveLlmCallsConfig, runArtifactRefSchema, runCli, runInEvalRuntimeScope, runInEvalScope, runInExistingEvalScope, runLogEntrySchema, runLogLevelSchema, runLogLocationSchema, runLogPhaseSchema, runLogsConfigSchema, runManifestSchema, 
runSummarySchema, scoreTraceSchema, serializeCacheRecording, serializeCacheValue, serializedCacheSpanSchema, setEvalOutput, setScopeCacheContext, spanCacheOptionsSchema, sseEnvelopeSchema, startEvalBackgroundJob, traceAttributeDisplayFormatSchema, traceAttributeDisplayInputSchema, traceAttributeDisplayPlacementSchema, traceAttributeDisplaySchema, traceCacheRefSchema, traceDisplayConfigSchema, traceDisplayInputConfigSchema, traceSpanErrorSchema, traceSpanKindSchema, traceSpanSchema, traceSpanWarningSchema, trialSelectionModeSchema, updateManualScoreRequestSchema, z };
|
|
1
|
+
import { $ as removeDefaultConfigSchema, $t as columnKindSchema, A as extractApiCalls, An as runInEvalScope, At as cacheFileSchema, B as DEFAULT_API_CALLS_CONFIG, Bt as traceAttributeDisplayFormatSchema, Cn as getEvalCaseInput, Ct as evalChartTooltipExtraSchema, D as sseEnvelopeSchema, Dn as mergeEvalOutput, Dt as cacheDebugKeyFileSchema, E as updateManualScoreRequestSchema, En as isInEvalScope, Et as cacheDebugKeyEntrySchema, F as deriveScopedSummaryFromCases, Fn as repoFile, Ft as cacheRecordingSchema, G as apiCallMetricSchema, Gt as traceDisplayInputConfigSchema, H as agentEvalsConfigSchema, Ht as traceAttributeDisplayPlacementSchema, I as deriveStatusFromCaseRows, In as defineEval, It as cacheStatusSchema, J as llmCallMetricFormatSchema, Jt as traceSpanSchema, K as apiCallsConfigSchema, Kt as traceSpanErrorSchema, L as deriveStatusFromChildStatuses, Ln as getEvalRegistry, Lt as serializedCacheSpanSchema, M as getNestedAttribute, Mn as setEvalOutput, Mt as cacheModeSchema, N as getEvalTitle, Nn as setScopeCacheContext, Nt as cacheOperationTypeSchema, O as extractCacheEntries, On as nextEvalId, Ot as cacheEntrySchema, P as getEvalDisplayStatus, Pn as startEvalBackgroundJob, Pt as cacheRecordingOpSchema, Q as llmCallsConfigSchema, Qt as columnFormatSchema, R as runManifestSchema, Rt as spanCacheOptionsSchema, Sn as getCurrentScope, St as evalChartMetricSchema, T as createRunRequestSchema, Tn as incrementEvalOutput, Tt as evalChartsConfigSchema, U as apiCallMetricFormatSchema, Ut as traceAttributeDisplaySchema, V as DEFAULT_LLM_CALLS_CONFIG, Vt as traceAttributeDisplayInputSchema, W as apiCallMetricPlacementSchema, Wt as traceDisplayConfigSchema, X as llmCallMetricSchema, Xt as cellValueSchema, Y as llmCallMetricPlacementSchema, Yt as traceSpanWarningSchema, Z as llmCallPricingSchema, Zt as columnDefSchema, _n as advanceEvalTime, _t as evalChartAggregateSchema, an as z, at as caseDetailSchema, bn as evalAssert, bt as evalChartColorSchema, cn as evalSpan, ct as 
evalStatAggregateSchema, dn as hashCacheKeySync, dt as evalSummarySchema, en as fileRefSchema, et as resolveApiCallsConfig, fn as deserializeCacheRecording, ft as runLogEntrySchema, gn as EvalAssertionError, gt as scoreTraceSchema, hn as serializeCacheValue, ht as runLogPhaseSchema, in as runArtifactRefSchema, it as assertionFailureSchema, j as extractLlmCalls, jn as runInExistingEvalScope, jt as cacheListItemSchema, k as extractCacheHits, kn as runInEvalRuntimeScope, kt as cacheEntryWithDebugKeySchema, ln as evalTracer, lt as evalStatItemSchema, mn as serializeCacheRecording, mt as runLogLocationSchema, nn as numberDisplayOptionsSchema, nt as runLogsConfigSchema, on as buildTraceTree, ot as caseRowSchema, pn as deserializeCacheValue, pt as runLogLevelSchema, q as defaultConfigKeySchema, qt as traceSpanKindSchema, rn as repoFileRefSchema, rt as trialSelectionModeSchema, sn as captureEvalSpanError, st as evalFreshnessStatusSchema, tn as jsonCellSchema, tt as resolveLlmCallsConfig, un as hashCacheKey, ut as evalStatsConfigSchema, vn as appendToEvalOutput, vt as evalChartAxisSchema, wn as getEvalStartTime, wt as evalChartTypeSchema, xn as evalLog, xt as evalChartConfigSchema, yt as evalChartBuiltinMetricSchema, z as runSummarySchema, zt as traceCacheRefSchema } from "./runOrchestration-CimthgI7.mjs";
|
|
2
|
+
import { n as createRunner, t as runCli } from "./cli-D3QNOcPN.mjs";
|
|
3
|
+
import "./src-CcXfWT4M.mjs";
|
|
4
|
+
export { DEFAULT_API_CALLS_CONFIG, DEFAULT_LLM_CALLS_CONFIG, EvalAssertionError, advanceEvalTime, agentEvalsConfigSchema, apiCallMetricFormatSchema, apiCallMetricPlacementSchema, apiCallMetricSchema, apiCallsConfigSchema, appendToEvalOutput, assertionFailureSchema, buildTraceTree, cacheDebugKeyEntrySchema, cacheDebugKeyFileSchema, cacheEntrySchema, cacheEntryWithDebugKeySchema, cacheFileSchema, cacheListItemSchema, cacheModeSchema, cacheOperationTypeSchema, cacheRecordingOpSchema, cacheRecordingSchema, cacheStatusSchema, captureEvalSpanError, caseDetailSchema, caseRowSchema, cellValueSchema, columnDefSchema, columnFormatSchema, columnKindSchema, createRunRequestSchema, createRunner, defaultConfigKeySchema, defineEval, deriveScopedSummaryFromCases, deriveStatusFromCaseRows, deriveStatusFromChildStatuses, deserializeCacheRecording, deserializeCacheValue, evalAssert, evalChartAggregateSchema, evalChartAxisSchema, evalChartBuiltinMetricSchema, evalChartColorSchema, evalChartConfigSchema, evalChartMetricSchema, evalChartTooltipExtraSchema, evalChartTypeSchema, evalChartsConfigSchema, evalFreshnessStatusSchema, evalLog, evalSpan, evalStatAggregateSchema, evalStatItemSchema, evalStatsConfigSchema, evalSummarySchema, evalTracer, extractApiCalls, extractCacheEntries, extractCacheHits, extractLlmCalls, fileRefSchema, getCurrentScope, getEvalCaseInput, getEvalDisplayStatus, getEvalRegistry, getEvalStartTime, getEvalTitle, getNestedAttribute, hashCacheKey, hashCacheKeySync, incrementEvalOutput, isInEvalScope, jsonCellSchema, llmCallMetricFormatSchema, llmCallMetricPlacementSchema, llmCallMetricSchema, llmCallPricingSchema, llmCallsConfigSchema, mergeEvalOutput, nextEvalId, numberDisplayOptionsSchema, removeDefaultConfigSchema, repoFile, repoFileRefSchema, resolveApiCallsConfig, resolveLlmCallsConfig, runArtifactRefSchema, runCli, runInEvalRuntimeScope, runInEvalScope, runInExistingEvalScope, runLogEntrySchema, runLogLevelSchema, runLogLocationSchema, runLogPhaseSchema, 
runLogsConfigSchema, runManifestSchema, runSummarySchema, scoreTraceSchema, serializeCacheRecording, serializeCacheValue, serializedCacheSpanSchema, setEvalOutput, setScopeCacheContext, spanCacheOptionsSchema, sseEnvelopeSchema, startEvalBackgroundJob, traceAttributeDisplayFormatSchema, traceAttributeDisplayInputSchema, traceAttributeDisplayPlacementSchema, traceAttributeDisplaySchema, traceCacheRefSchema, traceDisplayConfigSchema, traceDisplayInputConfigSchema, traceSpanErrorSchema, traceSpanKindSchema, traceSpanSchema, traceSpanWarningSchema, trialSelectionModeSchema, updateManualScoreRequestSchema, z };
|
package/dist/runChild.mjs
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import { R as runManifestSchema, T as createRunRequestSchema, Tt as evalChartsConfigSchema, Zt as columnDefSchema, b as loadConfig, t as executeRun, ut as evalStatsConfigSchema, v as parseEvalMetas,
|
|
1
|
+
import { R as runManifestSchema, T as createRunRequestSchema, Tt as evalChartsConfigSchema, Zt as columnDefSchema, b as loadConfig, t as executeRun, ut as evalStatsConfigSchema, v as parseEvalMetas, w as createFsCacheStore, yn as configureEvalRunLogs, z as runSummarySchema } from "./runOrchestration-CimthgI7.mjs";
|
|
2
2
|
import { createHash } from "node:crypto";
|
|
3
3
|
import { readFile } from "node:fs/promises";
|
|
4
4
|
import { relative } from "node:path";
|