@ls-stack/agent-eval 0.20.0 → 0.22.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{app-DsiLU65H.mjs → app-moDHbg1O.mjs} +3 -3
- package/dist/apps/web/dist/assets/index-AUDD3rNB.js +118 -0
- package/dist/apps/web/dist/assets/{index-CvR6QCLa.css → index-r0dVFK0B.css} +1 -1
- package/dist/apps/web/dist/index.html +2 -2
- package/dist/bin.mjs +1 -1
- package/dist/{cli-weogme5U.mjs → cli-C0EtHhEO.mjs} +3 -3
- package/dist/index.d.mts +56 -61
- package/dist/index.mjs +3 -3
- package/dist/runChild.mjs +1 -1
- package/dist/{runOrchestration-Cv1kiOAG.mjs → runOrchestration-D1edUDhp.mjs} +155 -140
- package/dist/{runner-DzrMtgBu.mjs → runner-C9nP2VKL.mjs} +2 -2
- package/dist/{runner-B25oRQxX.mjs → runner-CyRhIzci.mjs} +1 -1
- package/dist/src-D-HuV8I-.mjs +3 -0
- package/package.json +1 -1
- package/skills/agent-eval/SKILL.md +30 -20
- package/dist/apps/web/dist/assets/index-Cba4MFa0.js +0 -118
- package/dist/src-B879LZfo.mjs +0 -3
package/dist/index.d.mts
CHANGED
|
@@ -39,8 +39,9 @@ type NumberDisplayOptions = {
|
|
|
39
39
|
/** Number notation used when rendering the value. */notation?: 'standard' | 'compact'; /** Compact style used when `notation: 'compact'` is enabled. */
|
|
40
40
|
compactDisplay?: 'short' | 'long'; /** String prepended to the rendered number, such as `$`. */
|
|
41
41
|
prefix?: string; /** String appended to the rendered number, such as ` ms`. */
|
|
42
|
-
suffix?: string; /**
|
|
43
|
-
|
|
42
|
+
suffix?: string; /** Minimum number of decimal places to render. */
|
|
43
|
+
minDecimalPlaces?: number; /** Maximum number of decimal places to render. */
|
|
44
|
+
maxDecimalPlaces?: number;
|
|
44
45
|
};
|
|
45
46
|
/** Schema for numeric presentation options used by number-formatted values. */
|
|
46
47
|
declare const numberDisplayOptionsSchema: z$1.ZodType<NumberDisplayOptions>;
|
|
@@ -98,7 +99,6 @@ declare const columnDefSchema: z$1.ZodObject<{
|
|
|
98
99
|
passThreshold: z$1.ZodOptional<z$1.ZodNumber>;
|
|
99
100
|
maxStars: z$1.ZodOptional<z$1.ZodNumber>;
|
|
100
101
|
hideInTable: z$1.ZodOptional<z$1.ZodBoolean>;
|
|
101
|
-
sortable: z$1.ZodOptional<z$1.ZodBoolean>;
|
|
102
102
|
align: z$1.ZodOptional<z$1.ZodEnum<{
|
|
103
103
|
left: "left";
|
|
104
104
|
center: "center";
|
|
@@ -412,6 +412,7 @@ declare const evalStatItemSchema: z$1.ZodDiscriminatedUnion<[z$1.ZodObject<{
|
|
|
412
412
|
passFail: "passFail";
|
|
413
413
|
stars: "stars";
|
|
414
414
|
}>>;
|
|
415
|
+
numberFormat: z$1.ZodOptional<z$1.ZodType<NumberDisplayOptions, unknown, z$1.core.$ZodTypeInternals<NumberDisplayOptions, unknown>>>;
|
|
415
416
|
accent: z$1.ZodOptional<z$1.ZodBoolean>;
|
|
416
417
|
}, z$1.core.$strip>], "kind">;
|
|
417
418
|
/** Single stat rendered in the EvalCard stats row. */
|
|
@@ -449,6 +450,7 @@ declare const evalStatsConfigSchema: z$1.ZodArray<z$1.ZodDiscriminatedUnion<[z$1
|
|
|
449
450
|
passFail: "passFail";
|
|
450
451
|
stars: "stars";
|
|
451
452
|
}>>;
|
|
453
|
+
numberFormat: z$1.ZodOptional<z$1.ZodType<NumberDisplayOptions, unknown, z$1.core.$ZodTypeInternals<NumberDisplayOptions, unknown>>>;
|
|
452
454
|
accent: z$1.ZodOptional<z$1.ZodBoolean>;
|
|
453
455
|
}, z$1.core.$strip>], "kind">>;
|
|
454
456
|
/** Ordered list of stats rendered in the EvalCard stats row. */
|
|
@@ -496,7 +498,6 @@ declare const evalSummarySchema: z$1.ZodObject<{
|
|
|
496
498
|
passThreshold: z$1.ZodOptional<z$1.ZodNumber>;
|
|
497
499
|
maxStars: z$1.ZodOptional<z$1.ZodNumber>;
|
|
498
500
|
hideInTable: z$1.ZodOptional<z$1.ZodBoolean>;
|
|
499
|
-
sortable: z$1.ZodOptional<z$1.ZodBoolean>;
|
|
500
501
|
align: z$1.ZodOptional<z$1.ZodEnum<{
|
|
501
502
|
left: "left";
|
|
502
503
|
center: "center";
|
|
@@ -544,6 +545,7 @@ declare const evalSummarySchema: z$1.ZodObject<{
|
|
|
544
545
|
passFail: "passFail";
|
|
545
546
|
stars: "stars";
|
|
546
547
|
}>>;
|
|
548
|
+
numberFormat: z$1.ZodOptional<z$1.ZodType<NumberDisplayOptions, unknown, z$1.core.$ZodTypeInternals<NumberDisplayOptions, unknown>>>;
|
|
547
549
|
accent: z$1.ZodOptional<z$1.ZodBoolean>;
|
|
548
550
|
}, z$1.core.$strip>], "kind">>>;
|
|
549
551
|
charts: z$1.ZodOptional<z$1.ZodArray<z$1.ZodObject<{
|
|
@@ -643,7 +645,7 @@ declare const caseRowSchema: z$1.ZodObject<{
|
|
|
643
645
|
cancelled: "cancelled";
|
|
644
646
|
pending: "pending";
|
|
645
647
|
}>;
|
|
646
|
-
|
|
648
|
+
durationMs: z$1.ZodNullable<z$1.ZodNumber>;
|
|
647
649
|
costUsd: z$1.ZodOptional<z$1.ZodNullable<z$1.ZodNumber>>;
|
|
648
650
|
columns: z$1.ZodRecord<z$1.ZodString, z$1.ZodUnion<readonly [z$1.ZodType<string | number | boolean | Record<string, unknown> | unknown[] | null, unknown, z$1.core.$ZodTypeInternals<string | number | boolean | Record<string, unknown> | unknown[] | null, unknown>>, z$1.ZodUnion<readonly [z$1.ZodObject<{
|
|
649
651
|
source: z$1.ZodLiteral<"repo">;
|
|
@@ -1484,7 +1486,7 @@ declare const defaultConfigKeySchema: z$1.ZodEnum<{
|
|
|
1484
1486
|
cachedInputTokens: "cachedInputTokens";
|
|
1485
1487
|
cacheCreationInputTokens: "cacheCreationInputTokens";
|
|
1486
1488
|
reasoningTokens: "reasoningTokens";
|
|
1487
|
-
|
|
1489
|
+
llmDurationMs: "llmDurationMs";
|
|
1488
1490
|
}>;
|
|
1489
1491
|
/** Built-in eval-level output/column key. */
|
|
1490
1492
|
type DefaultConfigKey = z$1.infer<typeof defaultConfigKeySchema>;
|
|
@@ -1499,7 +1501,7 @@ declare const removeDefaultConfigSchema: z$1.ZodUnion<readonly [z$1.ZodLiteral<t
|
|
|
1499
1501
|
cachedInputTokens: "cachedInputTokens";
|
|
1500
1502
|
cacheCreationInputTokens: "cacheCreationInputTokens";
|
|
1501
1503
|
reasoningTokens: "reasoningTokens";
|
|
1502
|
-
|
|
1504
|
+
llmDurationMs: "llmDurationMs";
|
|
1503
1505
|
}>>]>;
|
|
1504
1506
|
/** Removal config for built-in eval-level outputs and UI metadata. */
|
|
1505
1507
|
type RemoveDefaultConfig = z$1.infer<typeof removeDefaultConfigSchema>;
|
|
@@ -1593,7 +1595,7 @@ declare const apiCallMetricSchema: z$1.ZodObject<{
|
|
|
1593
1595
|
type ApiCallMetric = z$1.infer<typeof apiCallMetricSchema>;
|
|
1594
1596
|
/**
|
|
1595
1597
|
* Schema for one model/provider pricing entry used to derive LLM-call costs
|
|
1596
|
-
* from token counts
|
|
1598
|
+
* from token counts.
|
|
1597
1599
|
*/
|
|
1598
1600
|
declare const llmCallPricingSchema: z$1.ZodObject<{
|
|
1599
1601
|
model: z$1.ZodString;
|
|
@@ -1602,6 +1604,7 @@ declare const llmCallPricingSchema: z$1.ZodObject<{
|
|
|
1602
1604
|
outputUsdPerMillion: z$1.ZodOptional<z$1.ZodNumber>;
|
|
1603
1605
|
cachedInputUsdPerMillion: z$1.ZodOptional<z$1.ZodNumber>;
|
|
1604
1606
|
cacheCreationInputUsdPerMillion: z$1.ZodOptional<z$1.ZodNumber>;
|
|
1607
|
+
cacheCreationInput1hUsdPerMillion: z$1.ZodOptional<z$1.ZodNumber>;
|
|
1605
1608
|
reasoningUsdPerMillion: z$1.ZodOptional<z$1.ZodNumber>;
|
|
1606
1609
|
}, z$1.core.$strip>;
|
|
1607
1610
|
/** Model/provider pricing entry authored in `agent-evals.config.ts`. */
|
|
@@ -1616,15 +1619,9 @@ declare const llmCallsConfigSchema: z$1.ZodObject<{
|
|
|
1616
1619
|
outputTokens: z$1.ZodOptional<z$1.ZodString>;
|
|
1617
1620
|
cachedInputTokens: z$1.ZodOptional<z$1.ZodString>;
|
|
1618
1621
|
cacheCreationInputTokens: z$1.ZodOptional<z$1.ZodString>;
|
|
1622
|
+
cacheCreationInput1hTokens: z$1.ZodOptional<z$1.ZodString>;
|
|
1619
1623
|
reasoningTokens: z$1.ZodOptional<z$1.ZodString>;
|
|
1620
|
-
|
|
1621
|
-
tokensPerSecond: z$1.ZodOptional<z$1.ZodString>;
|
|
1622
|
-
cost: z$1.ZodOptional<z$1.ZodString>;
|
|
1623
|
-
inputCost: z$1.ZodOptional<z$1.ZodString>;
|
|
1624
|
-
outputCost: z$1.ZodOptional<z$1.ZodString>;
|
|
1625
|
-
cachedInputCost: z$1.ZodOptional<z$1.ZodString>;
|
|
1626
|
-
cacheCreationInputCost: z$1.ZodOptional<z$1.ZodString>;
|
|
1627
|
-
reasoningCost: z$1.ZodOptional<z$1.ZodString>;
|
|
1624
|
+
latencyMs: z$1.ZodOptional<z$1.ZodString>;
|
|
1628
1625
|
steps: z$1.ZodOptional<z$1.ZodString>;
|
|
1629
1626
|
finishReason: z$1.ZodOptional<z$1.ZodString>;
|
|
1630
1627
|
input: z$1.ZodOptional<z$1.ZodString>;
|
|
@@ -1639,6 +1636,7 @@ declare const llmCallsConfigSchema: z$1.ZodObject<{
|
|
|
1639
1636
|
outputUsdPerMillion: z$1.ZodOptional<z$1.ZodNumber>;
|
|
1640
1637
|
cachedInputUsdPerMillion: z$1.ZodOptional<z$1.ZodNumber>;
|
|
1641
1638
|
cacheCreationInputUsdPerMillion: z$1.ZodOptional<z$1.ZodNumber>;
|
|
1639
|
+
cacheCreationInput1hUsdPerMillion: z$1.ZodOptional<z$1.ZodNumber>;
|
|
1642
1640
|
reasoningUsdPerMillion: z$1.ZodOptional<z$1.ZodNumber>;
|
|
1643
1641
|
}, z$1.core.$strip>>>;
|
|
1644
1642
|
metrics: z$1.ZodOptional<z$1.ZodArray<z$1.ZodObject<{
|
|
@@ -1712,15 +1710,9 @@ type ResolvedLlmCallsConfig = {
|
|
|
1712
1710
|
outputTokens: string;
|
|
1713
1711
|
cachedInputTokens: string;
|
|
1714
1712
|
cacheCreationInputTokens: string;
|
|
1713
|
+
cacheCreationInput1hTokens: string;
|
|
1715
1714
|
reasoningTokens: string;
|
|
1716
|
-
|
|
1717
|
-
tokensPerSecond: string;
|
|
1718
|
-
cost: string;
|
|
1719
|
-
inputCost: string;
|
|
1720
|
-
outputCost: string;
|
|
1721
|
-
cachedInputCost: string;
|
|
1722
|
-
cacheCreationInputCost: string;
|
|
1723
|
-
reasoningCost: string;
|
|
1715
|
+
latencyMs: string;
|
|
1724
1716
|
steps: string;
|
|
1725
1717
|
finishReason: string;
|
|
1726
1718
|
input: string;
|
|
@@ -1774,6 +1766,7 @@ type ResolvedLlmCallPricing = {
|
|
|
1774
1766
|
outputUsdPerMillion?: number;
|
|
1775
1767
|
cachedInputUsdPerMillion?: number;
|
|
1776
1768
|
cacheCreationInputUsdPerMillion?: number;
|
|
1769
|
+
cacheCreationInput1hUsdPerMillion?: number;
|
|
1777
1770
|
reasoningUsdPerMillion?: number;
|
|
1778
1771
|
};
|
|
1779
1772
|
/** Default LLM-calls config the UI uses before the workspace fetch resolves. */
|
|
@@ -1789,8 +1782,8 @@ declare const DEFAULT_API_CALLS_CONFIG: ResolvedApiCallsConfig;
|
|
|
1789
1782
|
* attribute path.
|
|
1790
1783
|
* - Missing `metrics[].format` defaults to `'string'`.
|
|
1791
1784
|
* - Missing `metrics[].placements` defaults to `['body']`.
|
|
1792
|
-
* - Missing `pricing` defaults to an empty registry;
|
|
1793
|
-
*
|
|
1785
|
+
* - Missing `pricing` defaults to an empty registry; built-in costs are only
|
|
1786
|
+
* derived from configured pricing and token counts.
|
|
1794
1787
|
*/
|
|
1795
1788
|
declare function resolveLlmCallsConfig(input: LlmCallsConfigInput | undefined): ResolvedLlmCallsConfig;
|
|
1796
1789
|
/**
|
|
@@ -1844,10 +1837,10 @@ type AgentEvalsConfig = {
|
|
|
1844
1837
|
*
|
|
1845
1838
|
* Determines which trace spans are treated as LLM calls (`kinds`), how
|
|
1846
1839
|
* structured fields like `model` and `usage.inputTokens` are read from
|
|
1847
|
-
* span attributes,
|
|
1848
|
-
*
|
|
1849
|
-
* defaults; the LLM calls tab is
|
|
1850
|
-
* matching span exists in a case run.
|
|
1840
|
+
* span attributes, which pricing table derives built-in costs, and which
|
|
1841
|
+
* custom user-defined metrics are surfaced on each call. All fields are
|
|
1842
|
+
* optional and fall back to the documented defaults; the LLM calls tab is
|
|
1843
|
+
* shown automatically when at least one matching span exists in a case run.
|
|
1851
1844
|
*
|
|
1852
1845
|
* @example
|
|
1853
1846
|
* ```ts
|
|
@@ -1980,15 +1973,9 @@ declare const agentEvalsConfigSchema: z$1.ZodObject<{
|
|
|
1980
1973
|
outputTokens: z$1.ZodOptional<z$1.ZodString>;
|
|
1981
1974
|
cachedInputTokens: z$1.ZodOptional<z$1.ZodString>;
|
|
1982
1975
|
cacheCreationInputTokens: z$1.ZodOptional<z$1.ZodString>;
|
|
1976
|
+
cacheCreationInput1hTokens: z$1.ZodOptional<z$1.ZodString>;
|
|
1983
1977
|
reasoningTokens: z$1.ZodOptional<z$1.ZodString>;
|
|
1984
|
-
|
|
1985
|
-
tokensPerSecond: z$1.ZodOptional<z$1.ZodString>;
|
|
1986
|
-
cost: z$1.ZodOptional<z$1.ZodString>;
|
|
1987
|
-
inputCost: z$1.ZodOptional<z$1.ZodString>;
|
|
1988
|
-
outputCost: z$1.ZodOptional<z$1.ZodString>;
|
|
1989
|
-
cachedInputCost: z$1.ZodOptional<z$1.ZodString>;
|
|
1990
|
-
cacheCreationInputCost: z$1.ZodOptional<z$1.ZodString>;
|
|
1991
|
-
reasoningCost: z$1.ZodOptional<z$1.ZodString>;
|
|
1978
|
+
latencyMs: z$1.ZodOptional<z$1.ZodString>;
|
|
1992
1979
|
steps: z$1.ZodOptional<z$1.ZodString>;
|
|
1993
1980
|
finishReason: z$1.ZodOptional<z$1.ZodString>;
|
|
1994
1981
|
input: z$1.ZodOptional<z$1.ZodString>;
|
|
@@ -2003,6 +1990,7 @@ declare const agentEvalsConfigSchema: z$1.ZodObject<{
|
|
|
2003
1990
|
outputUsdPerMillion: z$1.ZodOptional<z$1.ZodNumber>;
|
|
2004
1991
|
cachedInputUsdPerMillion: z$1.ZodOptional<z$1.ZodNumber>;
|
|
2005
1992
|
cacheCreationInputUsdPerMillion: z$1.ZodOptional<z$1.ZodNumber>;
|
|
1993
|
+
cacheCreationInput1hUsdPerMillion: z$1.ZodOptional<z$1.ZodNumber>;
|
|
2006
1994
|
reasoningUsdPerMillion: z$1.ZodOptional<z$1.ZodNumber>;
|
|
2007
1995
|
}, z$1.core.$strip>>>;
|
|
2008
1996
|
metrics: z$1.ZodOptional<z$1.ZodArray<z$1.ZodObject<{
|
|
@@ -2033,7 +2021,7 @@ declare const agentEvalsConfigSchema: z$1.ZodObject<{
|
|
|
2033
2021
|
cachedInputTokens: "cachedInputTokens";
|
|
2034
2022
|
cacheCreationInputTokens: "cacheCreationInputTokens";
|
|
2035
2023
|
reasoningTokens: "reasoningTokens";
|
|
2036
|
-
|
|
2024
|
+
llmDurationMs: "llmDurationMs";
|
|
2037
2025
|
}>>]>>;
|
|
2038
2026
|
apiCalls: z$1.ZodOptional<z$1.ZodObject<{
|
|
2039
2027
|
kinds: z$1.ZodOptional<z$1.ZodArray<z$1.ZodString>>;
|
|
@@ -2102,7 +2090,8 @@ type LlmCallEntry = {
|
|
|
2102
2090
|
cachedInputTokens: number | null;
|
|
2103
2091
|
cacheCreationInputTokens: number | null;
|
|
2104
2092
|
reasoningTokens: number | null;
|
|
2105
|
-
totalTokens: number | null;
|
|
2093
|
+
totalTokens: number | null; /** Time to first token for the LLM call in milliseconds, when reported by the span. */
|
|
2094
|
+
latencyMs: number | null;
|
|
2106
2095
|
tokensPerSecond: number | null;
|
|
2107
2096
|
costUsd: number | null;
|
|
2108
2097
|
inputCostUsd: number | null;
|
|
@@ -2112,8 +2101,8 @@ type LlmCallEntry = {
|
|
|
2112
2101
|
reasoningCostUsd: number | null; /** Number of inference rounds. Derived from the array length when `stepDetails` is set. */
|
|
2113
2102
|
stepCount: number | null; /** Per-step breakdown when the configured `steps` attribute resolves to an array. */
|
|
2114
2103
|
stepDetails: unknown[] | null;
|
|
2115
|
-
finishReason: string | null;
|
|
2116
|
-
|
|
2104
|
+
finishReason: string | null; /** Elapsed LLM call span duration in milliseconds. */
|
|
2105
|
+
durationMs: number | null;
|
|
2117
2106
|
input: unknown;
|
|
2118
2107
|
output: unknown;
|
|
2119
2108
|
reasoning: unknown;
|
|
@@ -2127,16 +2116,22 @@ type LlmCallEntry = {
|
|
|
2127
2116
|
* shape consumed by the LLM calls tab.
|
|
2128
2117
|
*
|
|
2129
2118
|
* Spans whose `kind` is not in `config.kinds` are dropped. Structured fields
|
|
2130
|
-
* (`model`, token counts,
|
|
2119
|
+
* (`model`, token counts, latency, etc.) are read via
|
|
2131
2120
|
* `getNestedAttribute` from the configured paths, with safe coercion to
|
|
2132
|
-
* `string | null` / `number | null`.
|
|
2133
|
-
*
|
|
2134
|
-
* `
|
|
2135
|
-
*
|
|
2136
|
-
*
|
|
2137
|
-
*
|
|
2138
|
-
*
|
|
2139
|
-
*
|
|
2121
|
+
* `string | null` / `number | null`. `latencyMs` is an explicit
|
|
2122
|
+
* time-to-first-token attribute; full span elapsed time is reported separately
|
|
2123
|
+
* as `durationMs`. Built-in USD costs are derived only from configured model
|
|
2124
|
+
* pricing and token counts. `totalTokens` is always derived from input +
|
|
2125
|
+
* output tokens. Cached input and cache creation tokens are reported
|
|
2126
|
+
* separately because they are subsets of input/output usage. The main cache
|
|
2127
|
+
* creation token field is treated as the total write count; optional one-hour
|
|
2128
|
+
* cache creation tokens only split that total for cost calculation. Base input
|
|
2129
|
+
* cost uses input minus cache read/write tokens so cached tokens are not
|
|
2130
|
+
* charged twice. Cache read/write costs still contribute to the total USD cost
|
|
2131
|
+
* at their configured rates. The `steps` attribute path may resolve to an array
|
|
2132
|
+
* of per-step detail objects, with `stepCount` derived from the array length.
|
|
2133
|
+
* `durationMs` and `tokensPerSecond` are `null` while the span is still
|
|
2134
|
+
* running. User-defined `metrics` whose path resolves to
|
|
2140
2135
|
* `undefined` are dropped, but `null`, `0`, and `false` are preserved as
|
|
2141
2136
|
* legitimate values worth displaying. Original span order is preserved so the
|
|
2142
2137
|
* LLM calls tab matches the ordering in the Trace tab.
|
|
@@ -2161,8 +2156,8 @@ type ApiCallEntry = {
|
|
|
2161
2156
|
status: EvalTraceSpan['status'];
|
|
2162
2157
|
method: string | null;
|
|
2163
2158
|
url: string | null;
|
|
2164
|
-
statusCode: number | null;
|
|
2165
|
-
|
|
2159
|
+
statusCode: number | null; /** Elapsed API call duration in milliseconds. */
|
|
2160
|
+
durationMs: number | null;
|
|
2166
2161
|
request: unknown;
|
|
2167
2162
|
response: unknown;
|
|
2168
2163
|
requestBody: unknown;
|
|
@@ -2179,10 +2174,10 @@ type ApiCallEntry = {
|
|
|
2179
2174
|
*
|
|
2180
2175
|
* Spans whose `kind` is not in `config.kinds` are dropped. Structured fields
|
|
2181
2176
|
* (`method`, `url`, `statusCode`, etc.) are read via `getNestedAttribute` from
|
|
2182
|
-
* the configured paths. `durationMs` takes precedence
|
|
2183
|
-
* fallback to the span start/end timestamps. User-defined `metrics`
|
|
2184
|
-
* resolves to `undefined` are dropped, but `null`, `0`, and `false`
|
|
2185
|
-
* preserved as legitimate values worth displaying. Original span order is
|
|
2177
|
+
* the configured paths. An explicit `durationMs` attribute takes precedence,
|
|
2178
|
+
* with a fallback to the span start/end timestamps. User-defined `metrics`
|
|
2179
|
+
* whose path resolves to `undefined` are dropped, but `null`, `0`, and `false`
|
|
2180
|
+
* are preserved as legitimate values worth displaying. Original span order is
|
|
2186
2181
|
* preserved so the API calls tab matches the ordering in the Trace tab.
|
|
2187
2182
|
*/
|
|
2188
2183
|
declare function extractApiCalls(spans: EvalTraceSpan[], config: ResolvedApiCallsConfig): ApiCallEntry[];
|
|
@@ -2784,7 +2779,7 @@ type EvalColumnOverride = {
|
|
|
2784
2779
|
/**
|
|
2785
2780
|
* Extra options for `format: 'number'`.
|
|
2786
2781
|
*
|
|
2787
|
-
* Use this to add a prefix or suffix,
|
|
2782
|
+
* Use this to add a prefix or suffix, control minimum and maximum decimal
|
|
2788
2783
|
* places, or switch to compact notation such as `1.2K`.
|
|
2789
2784
|
*/
|
|
2790
2785
|
numberFormat?: NumberDisplayOptions;
|
|
@@ -2792,8 +2787,7 @@ type EvalColumnOverride = {
|
|
|
2792
2787
|
* Hides the column from the runs table while keeping it available in detail
|
|
2793
2788
|
* views and raw output data.
|
|
2794
2789
|
*/
|
|
2795
|
-
hideInTable?: boolean; /**
|
|
2796
|
-
sortable?: boolean; /** Horizontal alignment used when rendering the column cells. */
|
|
2790
|
+
hideInTable?: boolean; /** Horizontal alignment used when rendering the column cells. */
|
|
2797
2791
|
align?: 'left' | 'center' | 'right';
|
|
2798
2792
|
/**
|
|
2799
2793
|
* Maximum number of stars used when `format: 'stars'`.
|
|
@@ -2954,7 +2948,8 @@ type EvalDefinitionBase<TInput = unknown, TOutputs extends EvalOutputs = EvalOut
|
|
|
2954
2948
|
* column across the latest run's cases — `key` must match one of the eval's
|
|
2955
2949
|
* score or column keys, and only finite numeric values participate in the
|
|
2956
2950
|
* reduction. When no case has a numeric value for the key the stat renders
|
|
2957
|
-
* an em dash. `label` and `
|
|
2951
|
+
* an em dash. `label`, `format`, and `numberFormat` default to the matching
|
|
2952
|
+
* `ColumnDef`.
|
|
2958
2953
|
*/
|
|
2959
2954
|
stats?: EvalStatsConfig;
|
|
2960
2955
|
/**
|
package/dist/index.mjs
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import { $ as removeDefaultConfigSchema, $t as columnKindSchema, A as extractApiCalls, An as setEvalOutput, At as cacheFileSchema, B as DEFAULT_API_CALLS_CONFIG, Bt as traceAttributeDisplayFormatSchema, Cn as incrementEvalOutput, Ct as evalChartTooltipExtraSchema, D as sseEnvelopeSchema, Dn as runInEvalRuntimeScope, Dt as cacheDebugKeyFileSchema, E as updateManualScoreRequestSchema, En as nextEvalId, Et as cacheDebugKeyEntrySchema, F as deriveScopedSummaryFromCases, Fn as getEvalRegistry, Ft as cacheRecordingSchema, G as apiCallMetricSchema, Gt as traceDisplayInputConfigSchema, H as agentEvalsConfigSchema, Ht as traceAttributeDisplayPlacementSchema, I as deriveStatusFromCaseRows, It as cacheStatusSchema, J as llmCallMetricFormatSchema, Jt as traceSpanSchema, K as apiCallsConfigSchema, Kt as traceSpanErrorSchema, L as deriveStatusFromChildStatuses, Lt as serializedCacheSpanSchema, M as getNestedAttribute, Mn as startEvalBackgroundJob, Mt as cacheModeSchema, N as getEvalTitle, Nn as repoFile, Nt as cacheOperationTypeSchema, O as extractCacheEntries, On as runInEvalScope, Ot as cacheEntrySchema, P as getEvalDisplayStatus, Pn as defineEval, Pt as cacheRecordingOpSchema, Q as llmCallsConfigSchema, Qt as columnFormatSchema, R as runManifestSchema, Rt as spanCacheOptionsSchema, Sn as getEvalCaseInput, St as evalChartMetricSchema, T as createRunRequestSchema, Tn as mergeEvalOutput, Tt as evalChartsConfigSchema, U as apiCallMetricFormatSchema, Ut as traceAttributeDisplaySchema, V as DEFAULT_LLM_CALLS_CONFIG, Vt as traceAttributeDisplayInputSchema, W as apiCallMetricPlacementSchema, Wt as traceDisplayConfigSchema, X as llmCallMetricSchema, Xt as cellValueSchema, Y as llmCallMetricPlacementSchema, Yt as traceSpanWarningSchema, Z as llmCallPricingSchema, Zt as columnDefSchema, _n as appendToEvalOutput, _t as evalChartAggregateSchema, an as z, at as caseDetailSchema, bn as evalLog, bt as evalChartColorSchema, cn as evalSpan, ct as evalStatAggregateSchema, dn as 
hashCacheKeySync, dt as evalSummarySchema, en as fileRefSchema, et as resolveApiCallsConfig, fn as deserializeCacheRecording, ft as runLogEntrySchema, gn as EvalAssertionError, gt as scoreTraceSchema, hn as serializeCacheValue, ht as runLogPhaseSchema, in as runArtifactRefSchema, it as assertionFailureSchema, j as extractLlmCalls, jn as setScopeCacheContext, jt as cacheListItemSchema, k as extractCacheHits, kn as runInExistingEvalScope, kt as cacheEntryWithDebugKeySchema, ln as evalTracer, lt as evalStatItemSchema, mn as serializeCacheRecording, mt as runLogLocationSchema, nn as numberDisplayOptionsSchema, nt as runLogsConfigSchema, on as buildTraceTree, ot as caseRowSchema, pn as deserializeCacheValue, pt as runLogLevelSchema, q as defaultConfigKeySchema, qt as traceSpanKindSchema, rn as repoFileRefSchema, rt as trialSelectionModeSchema, sn as captureEvalSpanError, st as evalFreshnessStatusSchema, tn as jsonCellSchema, tt as resolveLlmCallsConfig, un as hashCacheKey, ut as evalStatsConfigSchema, vt as evalChartAxisSchema, wn as isInEvalScope, wt as evalChartTypeSchema, xn as getCurrentScope, xt as evalChartConfigSchema, yn as evalAssert, yt as evalChartBuiltinMetricSchema, z as runSummarySchema, zt as traceCacheRefSchema } from "./runOrchestration-Cv1kiOAG.mjs";
|
|
2
|
-
import { n as createRunner, t as runCli } from "./cli-weogme5U.mjs";
|
|
3
|
-
import "./src-B879LZfo.mjs";
|
|
1
|
+
import { $ as removeDefaultConfigSchema, $t as columnKindSchema, A as extractApiCalls, An as setEvalOutput, At as cacheFileSchema, B as DEFAULT_API_CALLS_CONFIG, Bt as traceAttributeDisplayFormatSchema, Cn as incrementEvalOutput, Ct as evalChartTooltipExtraSchema, D as sseEnvelopeSchema, Dn as runInEvalRuntimeScope, Dt as cacheDebugKeyFileSchema, E as updateManualScoreRequestSchema, En as nextEvalId, Et as cacheDebugKeyEntrySchema, F as deriveScopedSummaryFromCases, Fn as getEvalRegistry, Ft as cacheRecordingSchema, G as apiCallMetricSchema, Gt as traceDisplayInputConfigSchema, H as agentEvalsConfigSchema, Ht as traceAttributeDisplayPlacementSchema, I as deriveStatusFromCaseRows, It as cacheStatusSchema, J as llmCallMetricFormatSchema, Jt as traceSpanSchema, K as apiCallsConfigSchema, Kt as traceSpanErrorSchema, L as deriveStatusFromChildStatuses, Lt as serializedCacheSpanSchema, M as getNestedAttribute, Mn as startEvalBackgroundJob, Mt as cacheModeSchema, N as getEvalTitle, Nn as repoFile, Nt as cacheOperationTypeSchema, O as extractCacheEntries, On as runInEvalScope, Ot as cacheEntrySchema, P as getEvalDisplayStatus, Pn as defineEval, Pt as cacheRecordingOpSchema, Q as llmCallsConfigSchema, Qt as columnFormatSchema, R as runManifestSchema, Rt as spanCacheOptionsSchema, Sn as getEvalCaseInput, St as evalChartMetricSchema, T as createRunRequestSchema, Tn as mergeEvalOutput, Tt as evalChartsConfigSchema, U as apiCallMetricFormatSchema, Ut as traceAttributeDisplaySchema, V as DEFAULT_LLM_CALLS_CONFIG, Vt as traceAttributeDisplayInputSchema, W as apiCallMetricPlacementSchema, Wt as traceDisplayConfigSchema, X as llmCallMetricSchema, Xt as cellValueSchema, Y as llmCallMetricPlacementSchema, Yt as traceSpanWarningSchema, Z as llmCallPricingSchema, Zt as columnDefSchema, _n as appendToEvalOutput, _t as evalChartAggregateSchema, an as z, at as caseDetailSchema, bn as evalLog, bt as evalChartColorSchema, cn as evalSpan, ct as evalStatAggregateSchema, dn as 
hashCacheKeySync, dt as evalSummarySchema, en as fileRefSchema, et as resolveApiCallsConfig, fn as deserializeCacheRecording, ft as runLogEntrySchema, gn as EvalAssertionError, gt as scoreTraceSchema, hn as serializeCacheValue, ht as runLogPhaseSchema, in as runArtifactRefSchema, it as assertionFailureSchema, j as extractLlmCalls, jn as setScopeCacheContext, jt as cacheListItemSchema, k as extractCacheHits, kn as runInExistingEvalScope, kt as cacheEntryWithDebugKeySchema, ln as evalTracer, lt as evalStatItemSchema, mn as serializeCacheRecording, mt as runLogLocationSchema, nn as numberDisplayOptionsSchema, nt as runLogsConfigSchema, on as buildTraceTree, ot as caseRowSchema, pn as deserializeCacheValue, pt as runLogLevelSchema, q as defaultConfigKeySchema, qt as traceSpanKindSchema, rn as repoFileRefSchema, rt as trialSelectionModeSchema, sn as captureEvalSpanError, st as evalFreshnessStatusSchema, tn as jsonCellSchema, tt as resolveLlmCallsConfig, un as hashCacheKey, ut as evalStatsConfigSchema, vt as evalChartAxisSchema, wn as isInEvalScope, wt as evalChartTypeSchema, xn as getCurrentScope, xt as evalChartConfigSchema, yn as evalAssert, yt as evalChartBuiltinMetricSchema, z as runSummarySchema, zt as traceCacheRefSchema } from "./runOrchestration-D1edUDhp.mjs";
|
|
2
|
+
import { n as createRunner, t as runCli } from "./cli-C0EtHhEO.mjs";
|
|
3
|
+
import "./src-D-HuV8I-.mjs";
|
|
4
4
|
export { DEFAULT_API_CALLS_CONFIG, DEFAULT_LLM_CALLS_CONFIG, EvalAssertionError, agentEvalsConfigSchema, apiCallMetricFormatSchema, apiCallMetricPlacementSchema, apiCallMetricSchema, apiCallsConfigSchema, appendToEvalOutput, assertionFailureSchema, buildTraceTree, cacheDebugKeyEntrySchema, cacheDebugKeyFileSchema, cacheEntrySchema, cacheEntryWithDebugKeySchema, cacheFileSchema, cacheListItemSchema, cacheModeSchema, cacheOperationTypeSchema, cacheRecordingOpSchema, cacheRecordingSchema, cacheStatusSchema, captureEvalSpanError, caseDetailSchema, caseRowSchema, cellValueSchema, columnDefSchema, columnFormatSchema, columnKindSchema, createRunRequestSchema, createRunner, defaultConfigKeySchema, defineEval, deriveScopedSummaryFromCases, deriveStatusFromCaseRows, deriveStatusFromChildStatuses, deserializeCacheRecording, deserializeCacheValue, evalAssert, evalChartAggregateSchema, evalChartAxisSchema, evalChartBuiltinMetricSchema, evalChartColorSchema, evalChartConfigSchema, evalChartMetricSchema, evalChartTooltipExtraSchema, evalChartTypeSchema, evalChartsConfigSchema, evalFreshnessStatusSchema, evalLog, evalSpan, evalStatAggregateSchema, evalStatItemSchema, evalStatsConfigSchema, evalSummarySchema, evalTracer, extractApiCalls, extractCacheEntries, extractCacheHits, extractLlmCalls, fileRefSchema, getCurrentScope, getEvalCaseInput, getEvalDisplayStatus, getEvalRegistry, getEvalTitle, getNestedAttribute, hashCacheKey, hashCacheKeySync, incrementEvalOutput, isInEvalScope, jsonCellSchema, llmCallMetricFormatSchema, llmCallMetricPlacementSchema, llmCallMetricSchema, llmCallPricingSchema, llmCallsConfigSchema, mergeEvalOutput, nextEvalId, numberDisplayOptionsSchema, removeDefaultConfigSchema, repoFile, repoFileRefSchema, resolveApiCallsConfig, resolveLlmCallsConfig, runArtifactRefSchema, runCli, runInEvalRuntimeScope, runInEvalScope, runInExistingEvalScope, runLogEntrySchema, runLogLevelSchema, runLogLocationSchema, runLogPhaseSchema, runLogsConfigSchema, runManifestSchema, 
runSummarySchema, scoreTraceSchema, serializeCacheRecording, serializeCacheValue, serializedCacheSpanSchema, setEvalOutput, setScopeCacheContext, spanCacheOptionsSchema, sseEnvelopeSchema, startEvalBackgroundJob, traceAttributeDisplayFormatSchema, traceAttributeDisplayInputSchema, traceAttributeDisplayPlacementSchema, traceAttributeDisplaySchema, traceCacheRefSchema, traceDisplayConfigSchema, traceDisplayInputConfigSchema, traceSpanErrorSchema, traceSpanKindSchema, traceSpanSchema, traceSpanWarningSchema, trialSelectionModeSchema, updateManualScoreRequestSchema, z };
|
package/dist/runChild.mjs
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import { R as runManifestSchema, T as createRunRequestSchema, Tt as evalChartsConfigSchema, Zt as columnDefSchema, b as loadConfig, t as executeRun, ut as evalStatsConfigSchema, v as parseEvalMetas, vn as configureEvalRunLogs, w as createFsCacheStore, z as runSummarySchema } from "./runOrchestration-Cv1kiOAG.mjs";
|
|
1
|
+
import { R as runManifestSchema, T as createRunRequestSchema, Tt as evalChartsConfigSchema, Zt as columnDefSchema, b as loadConfig, t as executeRun, ut as evalStatsConfigSchema, v as parseEvalMetas, vn as configureEvalRunLogs, w as createFsCacheStore, z as runSummarySchema } from "./runOrchestration-D1edUDhp.mjs";
|
|
2
2
|
import { createHash } from "node:crypto";
|
|
3
3
|
import { readFile } from "node:fs/promises";
|
|
4
4
|
import { relative } from "node:path";
|