@ls-stack/agent-eval 0.22.0 → 0.24.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{app-moDHbg1O.mjs → app-DYRmucgj.mjs} +3 -3
- package/dist/apps/web/dist/assets/{index-AUDD3rNB.js → index-KbbX3NYr.js} +35 -35
- package/dist/apps/web/dist/index.html +1 -1
- package/dist/bin.mjs +1 -1
- package/dist/{cli-C0EtHhEO.mjs → cli-Be0x8CS3.mjs} +3 -3
- package/dist/index.d.mts +106 -9
- package/dist/index.mjs +4 -4
- package/dist/runChild.mjs +1 -1
- package/dist/{runOrchestration-D1edUDhp.mjs → runOrchestration-D697g6Qe.mjs} +281 -42
- package/dist/{runner-C9nP2VKL.mjs → runner-B4SosWgD.mjs} +2 -2
- package/dist/{runner-CyRhIzci.mjs → runner-jSujaSKt.mjs} +1 -1
- package/dist/src-D6cettg0.mjs +3 -0
- package/package.json +3 -3
- package/skills/agent-eval/SKILL.md +21 -5
- package/dist/src-D-HuV8I-.mjs +0 -3
|
@@ -25,7 +25,7 @@
|
|
|
25
25
|
href="https://fonts.googleapis.com/css2?family=Geist:wght@400;500;600&family=JetBrains+Mono:wght@400;500&display=swap"
|
|
26
26
|
rel="stylesheet"
|
|
27
27
|
/>
|
|
28
|
-
<script type="module" crossorigin src="/assets/index-AUDD3rNB.js"></script>
|
|
28
|
+
<script type="module" crossorigin src="/assets/index-KbbX3NYr.js"></script>
|
|
29
29
|
<link rel="stylesheet" crossorigin href="/assets/index-r0dVFK0B.css">
|
|
30
30
|
</head>
|
|
31
31
|
<body>
|
package/dist/bin.mjs
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import { C as validateCharts, F as
|
|
1
|
+
import { B as runSummarySchema, C as validateCharts, F as getEvalDisplayStatus, I as deriveScopedSummaryFromCases, P as getEvalTitle, Rn as getEvalRegistry, S as normalizeScoreDef, _ as loadEvalModule, a as loadPersistedRunSnapshot, b as loadConfig, c as persistCaseDetail, d as recomputePersistedCaseStatus, f as runTouchesEval, g as setLatestRunInfoMap, h as getTargetEvalIds, i as getLatestRunInfos, l as persistRunState, m as buildEvalSummary, n as generateRunId, nt as resolveLlmCallsConfig, o as loadPersistedRunSnapshots, p as resolveArtifactPath, r as getLastRunStatuses, s as nextShortIdFromSnapshots, tt as resolveApiCallsConfig, u as recomputeEvalStatusesInRuns, v as parseEvalMetas, w as createFsCacheStore, x as buildDeclaredColumnDefs, y as resolveEvalDefaultConfig } from "./runOrchestration-D697g6Qe.mjs";
|
|
2
2
|
import { createHash } from "node:crypto";
|
|
3
3
|
import { mkdir, readFile, rm, writeFile } from "node:fs/promises";
|
|
4
4
|
import { dirname, join, relative, resolve } from "node:path";
|
|
@@ -959,8 +959,8 @@ async function commandApp(args) {
|
|
|
959
959
|
const { serve } = await import("@hono/node-server");
|
|
960
960
|
const bundledWebDist = resolve(currentDir, "apps/web/dist");
|
|
961
961
|
if (existsSync(bundledWebDist)) process.env.AGENT_EVALS_WEB_DIST = bundledWebDist;
|
|
962
|
-
const appModule = await import("./app-moDHbg1O.mjs");
|
|
963
|
-
const runnerModule = await import("./runner-CyRhIzci.mjs");
|
|
962
|
+
const appModule = await import("./app-DYRmucgj.mjs");
|
|
963
|
+
const runnerModule = await import("./runner-jSujaSKt.mjs");
|
|
964
964
|
if (!isHonoAppModule(appModule)) throw new Error("Server app module is invalid");
|
|
965
965
|
if (!isServerRunnerModule(runnerModule)) throw new Error("Server runner module is invalid");
|
|
966
966
|
await runnerModule.initRunner();
|
package/dist/index.d.mts
CHANGED
|
@@ -1539,13 +1539,35 @@ declare const apiCallMetricPlacementSchema: z$1.ZodEnum<{
|
|
|
1539
1539
|
}>;
|
|
1540
1540
|
/** Placement option for an API-call metric. */
|
|
1541
1541
|
type ApiCallMetricPlacement = z$1.infer<typeof apiCallMetricPlacementSchema>;
|
|
1542
|
+
/** Context passed to an LLM/API-call derived attribute function. */
|
|
1543
|
+
type CallDerivedAttributeContext = {
|
|
1544
|
+
/** Raw attributes from the matching trace span. */attributes: Record<string, unknown> | undefined; /** Matching trace span. */
|
|
1545
|
+
span: EvalTraceSpan; /** Dot-path helper for reading from `span.attributes`. */
|
|
1546
|
+
get: (path: string) => unknown;
|
|
1547
|
+
};
|
|
1548
|
+
/**
|
|
1549
|
+
* Runner-side function used to derive one new span attribute from a matching
|
|
1550
|
+
* LLM/API-call span. Return `undefined` to omit the attribute for that span.
|
|
1551
|
+
*/
|
|
1552
|
+
type CallDerivedAttribute = (ctx: CallDerivedAttributeContext) => unknown;
|
|
1553
|
+
/** One resolved derived span attribute rule. */
|
|
1554
|
+
type ResolvedCallDerivedAttribute = {
|
|
1555
|
+
/** Dot-path where the derived value is persisted on `span.attributes`. */path: string;
|
|
1556
|
+
/**
|
|
1557
|
+
* Function that derives the persisted value for each matching span. Omitted
|
|
1558
|
+
* after this config is serialized to the browser.
|
|
1559
|
+
*/
|
|
1560
|
+
compute?: CallDerivedAttribute;
|
|
1561
|
+
};
|
|
1542
1562
|
/**
|
|
1543
1563
|
* Schema for a single user-defined metric attached to LLM call rows.
|
|
1544
1564
|
*
|
|
1545
1565
|
* Each metric reads `path` from the span's `attributes` and renders the value
|
|
1546
|
-
* with the configured `format` and `numberFormat`.
|
|
1547
|
-
*
|
|
1548
|
-
*
|
|
1566
|
+
* with the configured `format` and `numberFormat`. Use
|
|
1567
|
+
* `llmCalls.derivedAttributes` when a metric should read a value computed from
|
|
1568
|
+
* other attributes. `placements` controls whether the metric appears as a chip
|
|
1569
|
+
* on the collapsed row header, as a row inside the expanded body, or both.
|
|
1570
|
+
* Defaults to `['body']` when omitted.
|
|
1549
1571
|
*/
|
|
1550
1572
|
declare const llmCallMetricSchema: z$1.ZodObject<{
|
|
1551
1573
|
label: z$1.ZodString;
|
|
@@ -1570,9 +1592,11 @@ type LlmCallMetric = z$1.infer<typeof llmCallMetricSchema>;
|
|
|
1570
1592
|
* Schema for a single user-defined metric attached to API call rows.
|
|
1571
1593
|
*
|
|
1572
1594
|
* Each metric reads `path` from the span's `attributes` and renders the value
|
|
1573
|
-
* with the configured `format` and `numberFormat`.
|
|
1574
|
-
*
|
|
1575
|
-
*
|
|
1595
|
+
* with the configured `format` and `numberFormat`. Use
|
|
1596
|
+
* `apiCalls.derivedAttributes` when a metric should read a value computed from
|
|
1597
|
+
* other attributes. `placements` controls whether the metric appears as a chip
|
|
1598
|
+
* on the collapsed row header, as a row inside the expanded body, or both.
|
|
1599
|
+
* Defaults to `['body']` when omitted.
|
|
1576
1600
|
*/
|
|
1577
1601
|
declare const apiCallMetricSchema: z$1.ZodObject<{
|
|
1578
1602
|
label: z$1.ZodString;
|
|
@@ -1629,6 +1653,7 @@ declare const llmCallsConfigSchema: z$1.ZodObject<{
|
|
|
1629
1653
|
reasoning: z$1.ZodOptional<z$1.ZodString>;
|
|
1630
1654
|
toolCalls: z$1.ZodOptional<z$1.ZodString>;
|
|
1631
1655
|
}, z$1.core.$strip>>;
|
|
1656
|
+
derivedAttributes: z$1.ZodOptional<z$1.ZodRecord<z$1.ZodString, z$1.ZodCustom<CallDerivedAttribute, CallDerivedAttribute>>>;
|
|
1632
1657
|
pricing: z$1.ZodOptional<z$1.ZodArray<z$1.ZodObject<{
|
|
1633
1658
|
model: z$1.ZodString;
|
|
1634
1659
|
provider: z$1.ZodOptional<z$1.ZodString>;
|
|
@@ -1674,6 +1699,7 @@ declare const apiCallsConfigSchema: z$1.ZodObject<{
|
|
|
1674
1699
|
durationMs: z$1.ZodOptional<z$1.ZodString>;
|
|
1675
1700
|
error: z$1.ZodOptional<z$1.ZodString>;
|
|
1676
1701
|
}, z$1.core.$strip>>;
|
|
1702
|
+
derivedAttributes: z$1.ZodOptional<z$1.ZodRecord<z$1.ZodString, z$1.ZodCustom<CallDerivedAttribute, CallDerivedAttribute>>>;
|
|
1677
1703
|
metrics: z$1.ZodOptional<z$1.ZodArray<z$1.ZodObject<{
|
|
1678
1704
|
label: z$1.ZodString;
|
|
1679
1705
|
tooltip: z$1.ZodOptional<z$1.ZodString>;
|
|
@@ -1720,6 +1746,7 @@ type ResolvedLlmCallsConfig = {
|
|
|
1720
1746
|
reasoning: string;
|
|
1721
1747
|
toolCalls: string;
|
|
1722
1748
|
};
|
|
1749
|
+
derivedAttributes: ResolvedCallDerivedAttribute[];
|
|
1723
1750
|
metrics: ResolvedLlmCallMetric[];
|
|
1724
1751
|
pricing: ResolvedLlmCallPricing[];
|
|
1725
1752
|
};
|
|
@@ -1738,6 +1765,7 @@ type ResolvedApiCallsConfig = {
|
|
|
1738
1765
|
durationMs: string;
|
|
1739
1766
|
error: string;
|
|
1740
1767
|
};
|
|
1768
|
+
derivedAttributes: ResolvedCallDerivedAttribute[];
|
|
1741
1769
|
metrics: ResolvedApiCallMetric[];
|
|
1742
1770
|
};
|
|
1743
1771
|
/** Fully-resolved LLM-call metric used by the runner and UI. */
|
|
@@ -1983,6 +2011,7 @@ declare const agentEvalsConfigSchema: z$1.ZodObject<{
|
|
|
1983
2011
|
reasoning: z$1.ZodOptional<z$1.ZodString>;
|
|
1984
2012
|
toolCalls: z$1.ZodOptional<z$1.ZodString>;
|
|
1985
2013
|
}, z$1.core.$strip>>;
|
|
2014
|
+
derivedAttributes: z$1.ZodOptional<z$1.ZodRecord<z$1.ZodString, z$1.ZodCustom<CallDerivedAttribute, CallDerivedAttribute>>>;
|
|
1986
2015
|
pricing: z$1.ZodOptional<z$1.ZodArray<z$1.ZodObject<{
|
|
1987
2016
|
model: z$1.ZodString;
|
|
1988
2017
|
provider: z$1.ZodOptional<z$1.ZodString>;
|
|
@@ -2037,6 +2066,7 @@ declare const agentEvalsConfigSchema: z$1.ZodObject<{
|
|
|
2037
2066
|
durationMs: z$1.ZodOptional<z$1.ZodString>;
|
|
2038
2067
|
error: z$1.ZodOptional<z$1.ZodString>;
|
|
2039
2068
|
}, z$1.core.$strip>>;
|
|
2069
|
+
derivedAttributes: z$1.ZodOptional<z$1.ZodRecord<z$1.ZodString, z$1.ZodCustom<CallDerivedAttribute, CallDerivedAttribute>>>;
|
|
2040
2070
|
metrics: z$1.ZodOptional<z$1.ZodArray<z$1.ZodObject<{
|
|
2041
2071
|
label: z$1.ZodString;
|
|
2042
2072
|
tooltip: z$1.ZodOptional<z$1.ZodString>;
|
|
@@ -2067,6 +2097,20 @@ declare const agentEvalsConfigSchema: z$1.ZodObject<{
|
|
|
2067
2097
|
}, z$1.core.$strip>>;
|
|
2068
2098
|
}, z$1.core.$strip>;
|
|
2069
2099
|
//#endregion
|
|
2100
|
+
//#region ../shared/src/utils/deriveCallAttributes.d.ts
|
|
2101
|
+
/**
|
|
2102
|
+
* Persist configured derived attributes onto matching LLM/API spans.
|
|
2103
|
+
*
|
|
2104
|
+
* These derived attributes are applied before trace consumers run, so
|
|
2105
|
+
* `deriveFromTracing`, default usage extraction, trace display, and call
|
|
2106
|
+
* metrics can all read them by normal dot-path lookup.
|
|
2107
|
+
*/
|
|
2108
|
+
declare function applyDerivedCallAttributes(params: {
|
|
2109
|
+
spans: EvalTraceSpan[];
|
|
2110
|
+
llmCallsConfig: ResolvedLlmCallsConfig;
|
|
2111
|
+
apiCallsConfig: ResolvedApiCallsConfig;
|
|
2112
|
+
}): EvalTraceSpan[];
|
|
2113
|
+
//#endregion
|
|
2070
2114
|
//#region ../shared/src/utils/extractLlmCalls.d.ts
|
|
2071
2115
|
/** Resolved value for one user-defined metric on an LLM call row. */
|
|
2072
2116
|
type LlmCallMetricValue = {
|
|
@@ -2758,6 +2802,12 @@ type EvalCase<TInput> = {
|
|
|
2758
2802
|
};
|
|
2759
2803
|
/** Runtime output values collected from output helpers and `deriveFromTracing`. */
|
|
2760
2804
|
type EvalOutputs = Record<string, unknown>;
|
|
2805
|
+
/**
|
|
2806
|
+
* Initial wall-clock time used by an eval's shifted Date clock.
|
|
2807
|
+
*
|
|
2808
|
+
* Pass `'now'` to opt one eval back into the real current clock.
|
|
2809
|
+
*/
|
|
2810
|
+
type EvalStartTime = Date | number | string;
|
|
2761
2811
|
/**
|
|
2762
2812
|
* Schema used to validate and type an eval's collected runtime outputs.
|
|
2763
2813
|
*
|
|
@@ -2925,6 +2975,23 @@ type EvalDefinitionBase<TInput = unknown, TOutputs extends EvalOutputs = EvalOut
|
|
|
2925
2975
|
* delay case finalization; late mutations are not guaranteed to persist.
|
|
2926
2976
|
*/
|
|
2927
2977
|
waitForBackgroundJobs?: boolean;
|
|
2978
|
+
/**
|
|
2979
|
+
* Optional initial wall-clock time for this eval's runtime.
|
|
2980
|
+
*
|
|
2981
|
+
* When set, `new Date()` and `Date.now()` inside case generation, execution,
|
|
2982
|
+
* tracing, derived outputs, and scorers start from this wall-clock value and
|
|
2983
|
+
* then continue advancing with real elapsed time. The default is
|
|
2984
|
+
* `2026-04-10T00:00:00.000Z`. Pass `'now'` to use the real current clock for
|
|
2985
|
+
* this eval. Timers are not faked, so `setTimeout` and other asynchronous
|
|
2986
|
+
* work still run normally.
|
|
2987
|
+
*/
|
|
2988
|
+
startTime?: EvalStartTime;
|
|
2989
|
+
/**
|
|
2990
|
+
* Freeze the eval Date clock at `startTime` until `advanceEvalTime(...)`
|
|
2991
|
+
* moves it manually. Defaults to `false`, so eval time advances with real
|
|
2992
|
+
* elapsed time from the configured `startTime`.
|
|
2993
|
+
*/
|
|
2994
|
+
freezeTime?: boolean;
|
|
2928
2995
|
execute: (ctx: EvalExecuteContext<TInput, TOutputs>) => Promise<void> | void;
|
|
2929
2996
|
deriveFromTracing?: (ctx: EvalDeriveContext<TInput>) => Partial<TOutputs> | Promise<Partial<TOutputs>>;
|
|
2930
2997
|
scores?: Record<string, EvalScoreDef<TInput, TOutputs>>;
|
|
@@ -3018,6 +3085,9 @@ declare function defineEval<TInput = unknown, TOutputs extends EvalOutputs = Eva
|
|
|
3018
3085
|
declare function repoFile(path: string, mimeType?: string): RepoFileRef;
|
|
3019
3086
|
//#endregion
|
|
3020
3087
|
//#region ../sdk/src/runtime.d.ts
|
|
3088
|
+
declare global {
|
|
3089
|
+
var __agentEvalsRealDate: DateConstructor | undefined;
|
|
3090
|
+
}
|
|
3021
3091
|
/**
|
|
3022
3092
|
* Raw-key debug payload passed alongside cache writes.
|
|
3023
3093
|
*
|
|
@@ -3063,7 +3133,15 @@ type CacheRecordingFrame = {
|
|
|
3063
3133
|
};
|
|
3064
3134
|
/** Mutable per-case runtime state stored in async local storage. */
|
|
3065
3135
|
type EvalCaseScope = {
|
|
3066
|
-
caseId: string; /**
|
|
3136
|
+
caseId: string; /** Initial wall-clock time used by Date APIs inside this eval case. */
|
|
3137
|
+
startTime: EvalStartTime | undefined; /** Mutable shifted wall-clock state shared across this eval case's phases. */
|
|
3138
|
+
evalClockState: {
|
|
3139
|
+
startMs: number;
|
|
3140
|
+
realStartMs: number;
|
|
3141
|
+
offsetMs: number;
|
|
3142
|
+
frozen: boolean;
|
|
3143
|
+
shifted: boolean;
|
|
3144
|
+
}; /** Stable prefix used by `nextEvalId()` for this eval case scope. */
|
|
3067
3145
|
idPrefix: string | undefined; /** Monotonic per-scope counter used by `nextEvalId()`. */
|
|
3068
3146
|
nextEvalIdCounter: number; /** Authored input for the current case, when provided by the runner. */
|
|
3069
3147
|
input?: unknown;
|
|
@@ -3101,11 +3179,28 @@ type EvalCaseScope = {
|
|
|
3101
3179
|
* modules imported while a run is being prepared.
|
|
3102
3180
|
*/
|
|
3103
3181
|
type EvalRuntimeScope = 'env' | 'cases' | 'eval' | 'derive' | 'outputsSchema' | 'scorer';
|
|
3182
|
+
/** Time unit accepted by `advanceEvalTime(unit, amount)`. */
|
|
3183
|
+
type EvalTimeUnit = 'millisecond' | 'milliseconds' | 'second' | 'seconds' | 'minute' | 'minutes' | 'hour' | 'hours' | 'day' | 'days';
|
|
3104
3184
|
type EvalLogLevelInput = RunLogLevel | 'warning';
|
|
3105
3185
|
/** Error thrown when an eval assertion fails during case execution. */
|
|
3106
3186
|
declare class EvalAssertionError extends Error {
|
|
3107
3187
|
constructor(message: string);
|
|
3108
3188
|
}
|
|
3189
|
+
/**
|
|
3190
|
+
* Return the wall-clock start time captured for the active eval.
|
|
3191
|
+
*
|
|
3192
|
+
* For `startTime: 'now'`, this is the real time captured when the eval clock
|
|
3193
|
+
* context was created.
|
|
3194
|
+
*/
|
|
3195
|
+
declare function getEvalStartTime(): Date;
|
|
3196
|
+
/**
|
|
3197
|
+
* Advance the active eval's shifted Date clock and return the new time.
|
|
3198
|
+
*
|
|
3199
|
+
* Throws outside an active shifted eval clock. Evals that set
|
|
3200
|
+
* `startTime: 'now'` use the real current clock unless `freezeTime: true` is
|
|
3201
|
+
* also set.
|
|
3202
|
+
*/
|
|
3203
|
+
declare function advanceEvalTime(unit: EvalTimeUnit, amount: number): Date;
|
|
3109
3204
|
/** Return the current eval scope for the active async context, if any. */
|
|
3110
3205
|
declare function getCurrentScope(): EvalCaseScope | undefined;
|
|
3111
3206
|
/**
|
|
@@ -3155,7 +3250,9 @@ type RunInEvalScopeOptions = {
|
|
|
3155
3250
|
idPrefix?: string; /** Cache adapter + mode attached to the scope before `fn` runs. */
|
|
3156
3251
|
cacheContext?: CacheScopeContext; /** Whether registered background jobs should settle before scope finalizes. */
|
|
3157
3252
|
waitForBackgroundJobs?: boolean; /** Eval runner phase exposed through `isInEvalScope()`. Defaults to `eval`. */
|
|
3158
|
-
runtimeScope?: EvalRuntimeScope;
|
|
3253
|
+
runtimeScope?: EvalRuntimeScope; /** Initial wall-clock time used by `new Date()` and `Date.now()` in this eval. */
|
|
3254
|
+
startTime?: EvalStartTime; /** Whether Date APIs stay frozen until advanced manually. */
|
|
3255
|
+
freezeTime?: boolean;
|
|
3159
3256
|
};
|
|
3160
3257
|
/** Execute a callback while `isInEvalScope()` reports a runner phase. */
|
|
3161
3258
|
declare function runInEvalRuntimeScope<T>(runtimeScope: EvalRuntimeScope, fn: () => Promise<T> | T): Promise<T>;
|
|
@@ -3570,4 +3667,4 @@ declare function createRunner({
|
|
|
3570
3667
|
*/
|
|
3571
3668
|
declare function runCli(argv: string[]): Promise<void>;
|
|
3572
3669
|
//#endregion
|
|
3573
|
-
export { type AgentEvalsConfig, type ApiCallEntry, type ApiCallMetric, type ApiCallMetricFormat, type ApiCallMetricPlacement, type ApiCallMetricValue, type ApiCallsConfigInput, type AssertionFailure, type CacheActivityEntry, type CacheAdapter, type CacheDebugKeyEntry, type CacheDebugKeyFile, type CacheDebugKeyWrite, type CacheEntry, type CacheEntryWithDebugKey, type CacheFile, type CacheHitEntry, type CacheKeyHashInput, type CacheKeyHashOptions, type CacheListItem, type CacheMode, type CacheOperationType, type CacheRecording, type CacheRecordingFrame, type CacheRecordingOp, type CacheScopeContext, type CacheStatus, type CaptureEvalSpanErrorLevel, type CaptureEvalSpanErrorOptions, type CaseDetail, type CaseRow, type CellValue, type ColumnDef, type ColumnFormat, type ColumnKind, type CreateRunRequest, DEFAULT_API_CALLS_CONFIG, DEFAULT_LLM_CALLS_CONFIG, type DefaultConfigKey, type DerivedStatus, EvalAssertionError, type EvalCase, type EvalCaseScope, type EvalChartAggregate, type EvalChartAxis, type EvalChartBuiltinMetric, type EvalChartColor, type EvalChartConfig, type EvalChartMetric, type EvalChartTooltipExtra, type EvalChartType, type EvalChartsConfig, type EvalColumnOverride, type EvalColumns, type EvalDefinition, type EvalDeriveContext, type EvalDisplayStatus, type EvalExecuteContext, type EvalFreshnessStatus, type EvalManualScoreDef, type EvalOutputs, type EvalOutputsSchema, type EvalRunner, type EvalRuntimeScope, type EvalScoreContext, type EvalScoreDef, type EvalScoreFn, type EvalSetOutput, type EvalStatAggregate, type EvalStatItem, type EvalStatsConfig, type EvalSummary, type EvalTraceSpan, type EvalTraceSpanError, type EvalTraceSpanWarning, type EvalTraceTree, type FileRef, type JsonCell, type LlmCallEntry, type LlmCallMetric, type LlmCallMetricFormat, type LlmCallMetricPlacement, type LlmCallMetricValue, type LlmCallPricing, type LlmCallsConfigInput, type NumberDisplayOptions, type RemoveDefaultConfig, type RepoFileRef, type ResolvedApiCallMetric, type 
ResolvedApiCallsConfig, type ResolvedLlmCallMetric, type ResolvedLlmCallPricing, type ResolvedLlmCallsConfig, type RunArtifactRef, type RunInEvalScopeOptions, type RunLogEntry, type RunLogLevel, type RunLogLocation, type RunLogPhase, type RunLogsConfigInput, type RunManifest, type RunSummary, type ScalarCell, type ScopedCaseSummary, type ScoreTrace, type SerializedCacheSpan, type SerializedCacheValue, type SpanCacheOptions, type SseEnvelope, type SseEventType, type TraceActiveSpan, type TraceAttributeDisplay, type TraceAttributeDisplayFormat, type TraceAttributeDisplayInput, type TraceAttributeDisplayPlacement, type TraceAttributeTransform, type TraceAttributeTransformContext, type TraceCacheInfo, type TraceCacheRef, type TraceDisplayConfig, type TraceDisplayInputConfig, type TraceSpanInfo, type TrialSelectionMode, type UpdateManualScoreRequest, agentEvalsConfigSchema, apiCallMetricFormatSchema, apiCallMetricPlacementSchema, apiCallMetricSchema, apiCallsConfigSchema, appendToEvalOutput, assertionFailureSchema, buildTraceTree, cacheDebugKeyEntrySchema, cacheDebugKeyFileSchema, cacheEntrySchema, cacheEntryWithDebugKeySchema, cacheFileSchema, cacheListItemSchema, cacheModeSchema, cacheOperationTypeSchema, cacheRecordingOpSchema, cacheRecordingSchema, cacheStatusSchema, captureEvalSpanError, caseDetailSchema, caseRowSchema, cellValueSchema, columnDefSchema, columnFormatSchema, columnKindSchema, createRunRequestSchema, createRunner, defaultConfigKeySchema, defineEval, deriveScopedSummaryFromCases, deriveStatusFromCaseRows, deriveStatusFromChildStatuses, deserializeCacheRecording, deserializeCacheValue, evalAssert, evalChartAggregateSchema, evalChartAxisSchema, evalChartBuiltinMetricSchema, evalChartColorSchema, evalChartConfigSchema, evalChartMetricSchema, evalChartTooltipExtraSchema, evalChartTypeSchema, evalChartsConfigSchema, evalFreshnessStatusSchema, evalLog, evalSpan, evalStatAggregateSchema, evalStatItemSchema, evalStatsConfigSchema, evalSummarySchema, 
evalTracer, extractApiCalls, extractCacheEntries, extractCacheHits, extractLlmCalls, fileRefSchema, getCurrentScope, getEvalCaseInput, getEvalDisplayStatus, getEvalRegistry, getEvalTitle, getNestedAttribute, hashCacheKey, hashCacheKeySync, incrementEvalOutput, isInEvalScope, jsonCellSchema, llmCallMetricFormatSchema, llmCallMetricPlacementSchema, llmCallMetricSchema, llmCallPricingSchema, llmCallsConfigSchema, mergeEvalOutput, nextEvalId, numberDisplayOptionsSchema, removeDefaultConfigSchema, repoFile, repoFileRefSchema, resolveApiCallsConfig, resolveLlmCallsConfig, runArtifactRefSchema, runCli, runInEvalRuntimeScope, runInEvalScope, runInExistingEvalScope, runLogEntrySchema, runLogLevelSchema, runLogLocationSchema, runLogPhaseSchema, runLogsConfigSchema, runManifestSchema, runSummarySchema, scoreTraceSchema, serializeCacheRecording, serializeCacheValue, serializedCacheSpanSchema, setEvalOutput, setScopeCacheContext, spanCacheOptionsSchema, sseEnvelopeSchema, startEvalBackgroundJob, traceAttributeDisplayFormatSchema, traceAttributeDisplayInputSchema, traceAttributeDisplayPlacementSchema, traceAttributeDisplaySchema, traceCacheRefSchema, traceDisplayConfigSchema, traceDisplayInputConfigSchema, traceSpanErrorSchema, traceSpanKindSchema, traceSpanSchema, traceSpanWarningSchema, trialSelectionModeSchema, updateManualScoreRequestSchema, z };
|
|
3670
|
+
export { type AgentEvalsConfig, type ApiCallEntry, type ApiCallMetric, type ApiCallMetricFormat, type ApiCallMetricPlacement, type ApiCallMetricValue, type ApiCallsConfigInput, type AssertionFailure, type CacheActivityEntry, type CacheAdapter, type CacheDebugKeyEntry, type CacheDebugKeyFile, type CacheDebugKeyWrite, type CacheEntry, type CacheEntryWithDebugKey, type CacheFile, type CacheHitEntry, type CacheKeyHashInput, type CacheKeyHashOptions, type CacheListItem, type CacheMode, type CacheOperationType, type CacheRecording, type CacheRecordingFrame, type CacheRecordingOp, type CacheScopeContext, type CacheStatus, type CallDerivedAttribute, type CallDerivedAttributeContext, type CaptureEvalSpanErrorLevel, type CaptureEvalSpanErrorOptions, type CaseDetail, type CaseRow, type CellValue, type ColumnDef, type ColumnFormat, type ColumnKind, type CreateRunRequest, DEFAULT_API_CALLS_CONFIG, DEFAULT_LLM_CALLS_CONFIG, type DefaultConfigKey, type DerivedStatus, EvalAssertionError, type EvalCase, type EvalCaseScope, type EvalChartAggregate, type EvalChartAxis, type EvalChartBuiltinMetric, type EvalChartColor, type EvalChartConfig, type EvalChartMetric, type EvalChartTooltipExtra, type EvalChartType, type EvalChartsConfig, type EvalColumnOverride, type EvalColumns, type EvalDefinition, type EvalDeriveContext, type EvalDisplayStatus, type EvalExecuteContext, type EvalFreshnessStatus, type EvalManualScoreDef, type EvalOutputs, type EvalOutputsSchema, type EvalRunner, type EvalRuntimeScope, type EvalScoreContext, type EvalScoreDef, type EvalScoreFn, type EvalSetOutput, type EvalStartTime, type EvalStatAggregate, type EvalStatItem, type EvalStatsConfig, type EvalSummary, type EvalTimeUnit, type EvalTraceSpan, type EvalTraceSpanError, type EvalTraceSpanWarning, type EvalTraceTree, type FileRef, type JsonCell, type LlmCallEntry, type LlmCallMetric, type LlmCallMetricFormat, type LlmCallMetricPlacement, type LlmCallMetricValue, type LlmCallPricing, type LlmCallsConfigInput, type 
NumberDisplayOptions, type RemoveDefaultConfig, type RepoFileRef, type ResolvedApiCallMetric, type ResolvedApiCallsConfig, type ResolvedCallDerivedAttribute, type ResolvedLlmCallMetric, type ResolvedLlmCallPricing, type ResolvedLlmCallsConfig, type RunArtifactRef, type RunInEvalScopeOptions, type RunLogEntry, type RunLogLevel, type RunLogLocation, type RunLogPhase, type RunLogsConfigInput, type RunManifest, type RunSummary, type ScalarCell, type ScopedCaseSummary, type ScoreTrace, type SerializedCacheSpan, type SerializedCacheValue, type SpanCacheOptions, type SseEnvelope, type SseEventType, type TraceActiveSpan, type TraceAttributeDisplay, type TraceAttributeDisplayFormat, type TraceAttributeDisplayInput, type TraceAttributeDisplayPlacement, type TraceAttributeTransform, type TraceAttributeTransformContext, type TraceCacheInfo, type TraceCacheRef, type TraceDisplayConfig, type TraceDisplayInputConfig, type TraceSpanInfo, type TrialSelectionMode, type UpdateManualScoreRequest, advanceEvalTime, agentEvalsConfigSchema, apiCallMetricFormatSchema, apiCallMetricPlacementSchema, apiCallMetricSchema, apiCallsConfigSchema, appendToEvalOutput, applyDerivedCallAttributes, assertionFailureSchema, buildTraceTree, cacheDebugKeyEntrySchema, cacheDebugKeyFileSchema, cacheEntrySchema, cacheEntryWithDebugKeySchema, cacheFileSchema, cacheListItemSchema, cacheModeSchema, cacheOperationTypeSchema, cacheRecordingOpSchema, cacheRecordingSchema, cacheStatusSchema, captureEvalSpanError, caseDetailSchema, caseRowSchema, cellValueSchema, columnDefSchema, columnFormatSchema, columnKindSchema, createRunRequestSchema, createRunner, defaultConfigKeySchema, defineEval, deriveScopedSummaryFromCases, deriveStatusFromCaseRows, deriveStatusFromChildStatuses, deserializeCacheRecording, deserializeCacheValue, evalAssert, evalChartAggregateSchema, evalChartAxisSchema, evalChartBuiltinMetricSchema, evalChartColorSchema, evalChartConfigSchema, evalChartMetricSchema, evalChartTooltipExtraSchema, 
evalChartTypeSchema, evalChartsConfigSchema, evalFreshnessStatusSchema, evalLog, evalSpan, evalStatAggregateSchema, evalStatItemSchema, evalStatsConfigSchema, evalSummarySchema, evalTracer, extractApiCalls, extractCacheEntries, extractCacheHits, extractLlmCalls, fileRefSchema, getCurrentScope, getEvalCaseInput, getEvalDisplayStatus, getEvalRegistry, getEvalStartTime, getEvalTitle, getNestedAttribute, hashCacheKey, hashCacheKeySync, incrementEvalOutput, isInEvalScope, jsonCellSchema, llmCallMetricFormatSchema, llmCallMetricPlacementSchema, llmCallMetricSchema, llmCallPricingSchema, llmCallsConfigSchema, mergeEvalOutput, nextEvalId, numberDisplayOptionsSchema, removeDefaultConfigSchema, repoFile, repoFileRefSchema, resolveApiCallsConfig, resolveLlmCallsConfig, runArtifactRefSchema, runCli, runInEvalRuntimeScope, runInEvalScope, runInExistingEvalScope, runLogEntrySchema, runLogLevelSchema, runLogLocationSchema, runLogPhaseSchema, runLogsConfigSchema, runManifestSchema, runSummarySchema, scoreTraceSchema, serializeCacheRecording, serializeCacheValue, serializedCacheSpanSchema, setEvalOutput, setScopeCacheContext, spanCacheOptionsSchema, sseEnvelopeSchema, startEvalBackgroundJob, traceAttributeDisplayFormatSchema, traceAttributeDisplayInputSchema, traceAttributeDisplayPlacementSchema, traceAttributeDisplaySchema, traceCacheRefSchema, traceDisplayConfigSchema, traceDisplayInputConfigSchema, traceSpanErrorSchema, traceSpanKindSchema, traceSpanSchema, traceSpanWarningSchema, trialSelectionModeSchema, updateManualScoreRequestSchema, z };
|
package/dist/index.mjs
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import { $ as
|
|
2
|
-
import { n as createRunner, t as runCli } from "./cli-C0EtHhEO.mjs";
|
|
3
|
-
import "./src-D-HuV8I-.mjs";
|
|
4
|
-
export { DEFAULT_API_CALLS_CONFIG, DEFAULT_LLM_CALLS_CONFIG, EvalAssertionError, agentEvalsConfigSchema, apiCallMetricFormatSchema, apiCallMetricPlacementSchema, apiCallMetricSchema, apiCallsConfigSchema, appendToEvalOutput, assertionFailureSchema, buildTraceTree, cacheDebugKeyEntrySchema, cacheDebugKeyFileSchema, cacheEntrySchema, cacheEntryWithDebugKeySchema, cacheFileSchema, cacheListItemSchema, cacheModeSchema, cacheOperationTypeSchema, cacheRecordingOpSchema, cacheRecordingSchema, cacheStatusSchema, captureEvalSpanError, caseDetailSchema, caseRowSchema, cellValueSchema, columnDefSchema, columnFormatSchema, columnKindSchema, createRunRequestSchema, createRunner, defaultConfigKeySchema, defineEval, deriveScopedSummaryFromCases, deriveStatusFromCaseRows, deriveStatusFromChildStatuses, deserializeCacheRecording, deserializeCacheValue, evalAssert, evalChartAggregateSchema, evalChartAxisSchema, evalChartBuiltinMetricSchema, evalChartColorSchema, evalChartConfigSchema, evalChartMetricSchema, evalChartTooltipExtraSchema, evalChartTypeSchema, evalChartsConfigSchema, evalFreshnessStatusSchema, evalLog, evalSpan, evalStatAggregateSchema, evalStatItemSchema, evalStatsConfigSchema, evalSummarySchema, evalTracer, extractApiCalls, extractCacheEntries, extractCacheHits, extractLlmCalls, fileRefSchema, getCurrentScope, getEvalCaseInput, getEvalDisplayStatus, getEvalRegistry, getEvalTitle, getNestedAttribute, hashCacheKey, hashCacheKeySync, incrementEvalOutput, isInEvalScope, jsonCellSchema, llmCallMetricFormatSchema, llmCallMetricPlacementSchema, llmCallMetricSchema, llmCallPricingSchema, llmCallsConfigSchema, mergeEvalOutput, nextEvalId, numberDisplayOptionsSchema, removeDefaultConfigSchema, repoFile, repoFileRefSchema, resolveApiCallsConfig, resolveLlmCallsConfig, runArtifactRefSchema, runCli, runInEvalRuntimeScope, runInEvalScope, runInExistingEvalScope, runLogEntrySchema, runLogLevelSchema, runLogLocationSchema, runLogPhaseSchema, runLogsConfigSchema, runManifestSchema, 
runSummarySchema, scoreTraceSchema, serializeCacheRecording, serializeCacheValue, serializedCacheSpanSchema, setEvalOutput, setScopeCacheContext, spanCacheOptionsSchema, sseEnvelopeSchema, startEvalBackgroundJob, traceAttributeDisplayFormatSchema, traceAttributeDisplayInputSchema, traceAttributeDisplayPlacementSchema, traceAttributeDisplaySchema, traceCacheRefSchema, traceDisplayConfigSchema, traceDisplayInputConfigSchema, traceSpanErrorSchema, traceSpanKindSchema, traceSpanSchema, traceSpanWarningSchema, trialSelectionModeSchema, updateManualScoreRequestSchema, z };
|
|
1
|
+
import { $ as llmCallsConfigSchema, $t as columnFormatSchema, A as extractApiCalls, An as runInEvalRuntimeScope, At as cacheEntryWithDebugKeySchema, B as runSummarySchema, Bt as traceCacheRefSchema, Cn as getCurrentScope, Ct as evalChartMetricSchema, D as sseEnvelopeSchema, Dn as isInEvalScope, Dt as cacheDebugKeyEntrySchema, E as updateManualScoreRequestSchema, En as incrementEvalOutput, Et as evalChartsConfigSchema, F as getEvalDisplayStatus, Fn as startEvalBackgroundJob, Ft as cacheRecordingOpSchema, G as apiCallMetricPlacementSchema, Gt as traceDisplayConfigSchema, H as DEFAULT_LLM_CALLS_CONFIG, Ht as traceAttributeDisplayInputSchema, I as deriveScopedSummaryFromCases, In as repoFile, It as cacheRecordingSchema, J as defaultConfigKeySchema, Jt as traceSpanKindSchema, K as apiCallMetricSchema, Kt as traceDisplayInputConfigSchema, L as deriveStatusFromCaseRows, Ln as defineEval, Lt as cacheStatusSchema, M as applyDerivedCallAttributes, Mn as runInExistingEvalScope, Mt as cacheListItemSchema, N as getNestedAttribute, Nn as setEvalOutput, Nt as cacheModeSchema, O as extractCacheEntries, On as mergeEvalOutput, Ot as cacheDebugKeyFileSchema, P as getEvalTitle, Pn as setScopeCacheContext, Pt as cacheOperationTypeSchema, Q as llmCallPricingSchema, Qt as columnDefSchema, R as deriveStatusFromChildStatuses, Rn as getEvalRegistry, Rt as serializedCacheSpanSchema, Sn as evalLog, St as evalChartConfigSchema, T as createRunRequestSchema, Tn as getEvalStartTime, Tt as evalChartTypeSchema, U as agentEvalsConfigSchema, Ut as traceAttributeDisplayPlacementSchema, V as DEFAULT_API_CALLS_CONFIG, Vt as traceAttributeDisplayFormatSchema, W as apiCallMetricFormatSchema, Wt as traceAttributeDisplaySchema, X as llmCallMetricPlacementSchema, Xt as traceSpanWarningSchema, Y as llmCallMetricFormatSchema, Yt as traceSpanSchema, Z as llmCallMetricSchema, Zt as cellValueSchema, _n as EvalAssertionError, _t as scoreTraceSchema, an as runArtifactRefSchema, at as assertionFailureSchema, bt as 
evalChartBuiltinMetricSchema, cn as captureEvalSpanError, ct as evalFreshnessStatusSchema, dn as hashCacheKey, dt as evalStatsConfigSchema, en as columnKindSchema, et as removeDefaultConfigSchema, fn as hashCacheKeySync, ft as evalSummarySchema, gn as serializeCacheValue, gt as runLogPhaseSchema, hn as serializeCacheRecording, ht as runLogLocationSchema, in as repoFileRefSchema, it as trialSelectionModeSchema, j as extractLlmCalls, jn as runInEvalScope, jt as cacheFileSchema, k as extractCacheHits, kn as nextEvalId, kt as cacheEntrySchema, ln as evalSpan, lt as evalStatAggregateSchema, mn as deserializeCacheValue, mt as runLogLevelSchema, nn as jsonCellSchema, nt as resolveLlmCallsConfig, on as z, ot as caseDetailSchema, pn as deserializeCacheRecording, pt as runLogEntrySchema, q as apiCallsConfigSchema, qt as traceSpanErrorSchema, rn as numberDisplayOptionsSchema, rt as runLogsConfigSchema, sn as buildTraceTree, st as caseRowSchema, tn as fileRefSchema, tt as resolveApiCallsConfig, un as evalTracer, ut as evalStatItemSchema, vn as advanceEvalTime, vt as evalChartAggregateSchema, wn as getEvalCaseInput, wt as evalChartTooltipExtraSchema, xn as evalAssert, xt as evalChartColorSchema, yn as appendToEvalOutput, yt as evalChartAxisSchema, z as runManifestSchema, zt as spanCacheOptionsSchema } from "./runOrchestration-D697g6Qe.mjs";
|
|
2
|
+
import { n as createRunner, t as runCli } from "./cli-Be0x8CS3.mjs";
|
|
3
|
+
import "./src-D6cettg0.mjs";
|
|
4
|
+
export { DEFAULT_API_CALLS_CONFIG, DEFAULT_LLM_CALLS_CONFIG, EvalAssertionError, advanceEvalTime, agentEvalsConfigSchema, apiCallMetricFormatSchema, apiCallMetricPlacementSchema, apiCallMetricSchema, apiCallsConfigSchema, appendToEvalOutput, applyDerivedCallAttributes, assertionFailureSchema, buildTraceTree, cacheDebugKeyEntrySchema, cacheDebugKeyFileSchema, cacheEntrySchema, cacheEntryWithDebugKeySchema, cacheFileSchema, cacheListItemSchema, cacheModeSchema, cacheOperationTypeSchema, cacheRecordingOpSchema, cacheRecordingSchema, cacheStatusSchema, captureEvalSpanError, caseDetailSchema, caseRowSchema, cellValueSchema, columnDefSchema, columnFormatSchema, columnKindSchema, createRunRequestSchema, createRunner, defaultConfigKeySchema, defineEval, deriveScopedSummaryFromCases, deriveStatusFromCaseRows, deriveStatusFromChildStatuses, deserializeCacheRecording, deserializeCacheValue, evalAssert, evalChartAggregateSchema, evalChartAxisSchema, evalChartBuiltinMetricSchema, evalChartColorSchema, evalChartConfigSchema, evalChartMetricSchema, evalChartTooltipExtraSchema, evalChartTypeSchema, evalChartsConfigSchema, evalFreshnessStatusSchema, evalLog, evalSpan, evalStatAggregateSchema, evalStatItemSchema, evalStatsConfigSchema, evalSummarySchema, evalTracer, extractApiCalls, extractCacheEntries, extractCacheHits, extractLlmCalls, fileRefSchema, getCurrentScope, getEvalCaseInput, getEvalDisplayStatus, getEvalRegistry, getEvalStartTime, getEvalTitle, getNestedAttribute, hashCacheKey, hashCacheKeySync, incrementEvalOutput, isInEvalScope, jsonCellSchema, llmCallMetricFormatSchema, llmCallMetricPlacementSchema, llmCallMetricSchema, llmCallPricingSchema, llmCallsConfigSchema, mergeEvalOutput, nextEvalId, numberDisplayOptionsSchema, removeDefaultConfigSchema, repoFile, repoFileRefSchema, resolveApiCallsConfig, resolveLlmCallsConfig, runArtifactRefSchema, runCli, runInEvalRuntimeScope, runInEvalScope, runInExistingEvalScope, runLogEntrySchema, runLogLevelSchema, 
runLogLocationSchema, runLogPhaseSchema, runLogsConfigSchema, runManifestSchema, runSummarySchema, scoreTraceSchema, serializeCacheRecording, serializeCacheValue, serializedCacheSpanSchema, setEvalOutput, setScopeCacheContext, spanCacheOptionsSchema, sseEnvelopeSchema, startEvalBackgroundJob, traceAttributeDisplayFormatSchema, traceAttributeDisplayInputSchema, traceAttributeDisplayPlacementSchema, traceAttributeDisplaySchema, traceCacheRefSchema, traceDisplayConfigSchema, traceDisplayInputConfigSchema, traceSpanErrorSchema, traceSpanKindSchema, traceSpanSchema, traceSpanWarningSchema, trialSelectionModeSchema, updateManualScoreRequestSchema, z };
|
package/dist/runChild.mjs
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import {
|
|
1
|
+
import { B as runSummarySchema, Et as evalChartsConfigSchema, Qt as columnDefSchema, T as createRunRequestSchema, b as loadConfig, bn as configureEvalRunLogs, dt as evalStatsConfigSchema, t as executeRun, v as parseEvalMetas, w as createFsCacheStore, z as runManifestSchema } from "./runOrchestration-D697g6Qe.mjs";
|
|
2
2
|
import { createHash } from "node:crypto";
|
|
3
3
|
import { readFile } from "node:fs/promises";
|
|
4
4
|
import { relative } from "node:path";
|