@ls-stack/agent-eval 0.58.1 → 0.58.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{app-DhMIbjlE.mjs → app-BxD6aHbp.mjs} +52 -7
- package/dist/apps/web/dist/assets/index-BMWBZw_u.js +377 -0
- package/dist/apps/web/dist/assets/index-CHH7m5Cv.css +1 -0
- package/dist/apps/web/dist/index.html +2 -2
- package/dist/bin.mjs +1 -1
- package/dist/caseChild.mjs +2 -1
- package/dist/{cli-_g2qOMK6.mjs → cli-HBwXIJsg.mjs} +31 -5
- package/dist/index.d.mts +76 -17
- package/dist/index.mjs +4 -4
- package/dist/runChild.mjs +2 -2
- package/dist/{runExecution-d42Lm0i5.mjs → runExecution-pHJ0_TzH.mjs} +125 -21
- package/dist/{runOrchestration-CvmFeOmT.mjs → runOrchestration-ngVXShH4.mjs} +73 -6
- package/dist/{runner-BKogjiYd.mjs → runner-BnZMGBla.mjs} +1 -1
- package/dist/{runner-MSr8sAWm.mjs → runner-D_pz2NON.mjs} +2 -2
- package/dist/{src-CdZsOn6y.mjs → src-AeXGBJ26.mjs} +2 -2
- package/package.json +3 -3
- package/skills/agent-eval/SKILL.md +18 -3
- package/dist/apps/web/dist/assets/index-Cz9p4l-t.js +0 -377
- package/dist/apps/web/dist/assets/index-DtARRwsS.css +0 -1
package/dist/index.d.mts
CHANGED
|
@@ -268,6 +268,7 @@ declare const runLogEntrySchema$1: z$1.ZodObject<{
|
|
|
268
268
|
phase: z$1.ZodEnum<{
|
|
269
269
|
eval: "eval";
|
|
270
270
|
derive: "derive";
|
|
271
|
+
tracingAssertions: "tracingAssertions";
|
|
271
272
|
outputsSchema: "outputsSchema";
|
|
272
273
|
scorer: "scorer";
|
|
273
274
|
}>;
|
|
@@ -399,10 +400,17 @@ type EvalCase$1$1<TInput = unknown> = {
|
|
|
399
400
|
};
|
|
400
401
|
/** Query helpers built from the flattened trace recorded for one eval case. */
|
|
401
402
|
type EvalTraceTree = {
|
|
402
|
-
spans: EvalTraceSpan$2[];
|
|
403
|
-
rootSpans: EvalTraceSpan$2[];
|
|
404
|
-
findSpan: (name: string) => EvalTraceSpan$2 | undefined;
|
|
405
|
-
|
|
403
|
+
/** Flat span list in creation order. */spans: EvalTraceSpan$2[]; /** Top-level spans whose `parentId` is `null`. */
|
|
404
|
+
rootSpans: EvalTraceSpan$2[]; /** Return the first span whose name exactly matches `name`. */
|
|
405
|
+
findSpan: (name: string) => EvalTraceSpan$2 | undefined; /** Return every span whose name exactly matches `name`. */
|
|
406
|
+
findSpans: (name: string) => EvalTraceSpan$2[]; /** Return whether any span name exactly matches `name`. */
|
|
407
|
+
hasSpan: (name: string) => boolean; /** Return every span whose kind exactly matches `kind`. */
|
|
408
|
+
findSpansByKind: (kind: string) => EvalTraceSpan$2[]; /** Return every span with `kind: 'tool'`. */
|
|
409
|
+
findToolCallSpans: () => EvalTraceSpan$2[]; /** Return the names of every span with `kind: 'tool'`. */
|
|
410
|
+
listToolCallSpanNames: () => string[]; /** Return whether a `kind: 'tool'` span has a name exactly matching `name`. */
|
|
411
|
+
hasToolCallSpan: (name: string) => boolean; /** Return span names in creation order, optionally filtered by kind. */
|
|
412
|
+
listSpanNames: (kind?: string) => string[]; /** Return span names in depth-first tree order, optionally filtered by kind. */
|
|
413
|
+
listSpanNamesDfs: (kind?: string) => string[]; /** Return all spans in depth-first tree order. */
|
|
406
414
|
flattenDfs: () => EvalTraceSpan$2[];
|
|
407
415
|
checkpoints: Map<string, unknown>;
|
|
408
416
|
};
|
|
@@ -421,6 +429,12 @@ type EvalDeriveMap<TInput = unknown> = Record<string, EvalDeriveValueFn<TInput>>
|
|
|
421
429
|
type EvalDeriveFn<TInput = unknown> = (ctx: EvalDeriveContext<TInput>) => Record<string, unknown> | Promise<Record<string, unknown>>;
|
|
422
430
|
/** Trace-derived output config accepted globally and on eval definitions. */
|
|
423
431
|
type EvalDeriveConfig<TInput = unknown> = EvalDeriveMap<TInput> | EvalDeriveFn<TInput>;
|
|
432
|
+
/** Function that records trace-derived assertions for one case. */
|
|
433
|
+
type EvalTracingAssertionsFn<TInput = unknown> = (ctx: EvalDeriveContext<TInput>) => MaybePromise$1<void>;
|
|
434
|
+
/** Keyed trace-derived assertion config for grouping related checks. */
|
|
435
|
+
type EvalTracingAssertionsMap<TInput = unknown> = Record<string, EvalTracingAssertionsFn<TInput>>;
|
|
436
|
+
/** Trace-derived assertion config accepted globally and on eval definitions. */
|
|
437
|
+
type EvalTracingAssertionsConfig<TInput = unknown> = EvalTracingAssertionsMap<TInput> | EvalTracingAssertionsFn<TInput>;
|
|
424
438
|
/** UI overrides for a derived or scored column emitted by an eval. */
|
|
425
439
|
type EvalColumnOverride = {
|
|
426
440
|
/** Display label shown for the column in tables and detail views. */label?: string;
|
|
@@ -1142,9 +1156,18 @@ type EvalDefinitionBase<TInput = unknown, TOutputs extends EvalOutputs = EvalOut
|
|
|
1142
1156
|
*
|
|
1143
1157
|
* Prefer the keyed map form when each key has one derivation. The
|
|
1144
1158
|
* object-returning callback form is also supported. Derived values only fill
|
|
1145
|
-
* keys not already recorded during execution.
|
|
1159
|
+
* keys not already recorded during execution. Assertion helpers are not
|
|
1160
|
+
* allowed here; use `tracingAssertions` for trace-derived pass/fail checks.
|
|
1146
1161
|
*/
|
|
1147
1162
|
deriveFromTracing?: EvalDeriveConfig<TInput>;
|
|
1163
|
+
/**
|
|
1164
|
+
* Record assertions from the finished execution trace.
|
|
1165
|
+
*
|
|
1166
|
+
* Runs after `deriveFromTracing` and before output schema validation and
|
|
1167
|
+
* scores. Use `evalAssert(...)` or `evalExpect(...)` inside the callback to
|
|
1168
|
+
* write normal assertion results without creating score columns.
|
|
1169
|
+
*/
|
|
1170
|
+
tracingAssertions?: EvalTracingAssertionsConfig<TInput>;
|
|
1148
1171
|
/**
|
|
1149
1172
|
* Computed score columns for each case.
|
|
1150
1173
|
*
|
|
@@ -1503,12 +1526,16 @@ type EvalCaseScope = {
|
|
|
1503
1526
|
* covers run-time module/environment loading, including top-level code in
|
|
1504
1527
|
* modules imported while a run is being prepared.
|
|
1505
1528
|
*/
|
|
1506
|
-
type EvalRuntimeScope = 'env' | 'cases' | 'eval' | 'derive' | 'outputsSchema' | 'scorer';
|
|
1529
|
+
type EvalRuntimeScope = 'env' | 'cases' | 'eval' | 'derive' | 'tracingAssertions' | 'outputsSchema' | 'scorer';
|
|
1507
1530
|
type EvalLogLevelInput = RunLogLevel$1 | 'warning';
|
|
1508
1531
|
/** Error thrown when an eval assertion fails during case execution. */
|
|
1509
1532
|
declare class EvalAssertionError extends Error {
|
|
1510
1533
|
constructor(message: string);
|
|
1511
1534
|
}
|
|
1535
|
+
/** Error thrown when an SDK helper is used in an unsupported runner phase. */
|
|
1536
|
+
declare class EvalRuntimeUsageError extends Error {
|
|
1537
|
+
constructor(message: string);
|
|
1538
|
+
}
|
|
1512
1539
|
/** Return the host process clock, bypassing the eval Date shim. */
|
|
1513
1540
|
/**
|
|
1514
1541
|
* Eval time helpers for reading and moving the active eval clock.
|
|
@@ -1539,8 +1566,10 @@ declare function getCurrentScope(): EvalCaseScope | undefined;
|
|
|
1539
1566
|
*
|
|
1540
1567
|
* Returns `null` outside eval-owned work, `env` while the runner is loading
|
|
1541
1568
|
* eval modules for a run, `cases` while generating cases, `eval` while running
|
|
1542
|
-
* case `execute`, `derive` while deriving outputs from traces,
|
|
1543
|
-
*
|
|
1569
|
+
* case `execute`, `derive` while deriving outputs from traces,
|
|
1570
|
+
* `tracingAssertions` while checking trace-derived assertions,
|
|
1571
|
+
* `outputsSchema` while validating outputs, and `scorer` while computing
|
|
1572
|
+
* scores.
|
|
1544
1573
|
*/
|
|
1545
1574
|
declare function isInEvalScope(): EvalRuntimeScope | null;
|
|
1546
1575
|
/**
|
|
@@ -1656,7 +1685,8 @@ declare function incrementEvalOutput(key: string, delta: number): void;
|
|
|
1656
1685
|
* Calls made outside `runInEvalScope(...)` are ignored so shared workflow code
|
|
1657
1686
|
* can safely reuse `evalAssert(...)` when it also runs outside an eval. The
|
|
1658
1687
|
* TypeScript assertion signature still narrows the checked value after the
|
|
1659
|
-
* call.
|
|
1688
|
+
* call. Calls inside `deriveFromTracing` throw because derivations must only
|
|
1689
|
+
* write outputs; use `tracingAssertions` for trace-derived pass/fail checks.
|
|
1660
1690
|
*/
|
|
1661
1691
|
declare function evalAssert(condition: unknown, message: string): asserts condition; //#endregion
|
|
1662
1692
|
//#region src/valueCache.d.ts
|
|
@@ -2753,6 +2783,7 @@ type RunLogLevel = z$1.infer<typeof runLogLevelSchema>;
|
|
|
2753
2783
|
declare const runLogPhaseSchema: z$1.ZodEnum<{
|
|
2754
2784
|
eval: "eval";
|
|
2755
2785
|
derive: "derive";
|
|
2786
|
+
tracingAssertions: "tracingAssertions";
|
|
2756
2787
|
outputsSchema: "outputsSchema";
|
|
2757
2788
|
scorer: "scorer";
|
|
2758
2789
|
}>;
|
|
@@ -2779,6 +2810,7 @@ declare const runLogEntrySchema: z$1.ZodObject<{
|
|
|
2779
2810
|
phase: z$1.ZodEnum<{
|
|
2780
2811
|
eval: "eval";
|
|
2781
2812
|
derive: "derive";
|
|
2813
|
+
tracingAssertions: "tracingAssertions";
|
|
2782
2814
|
outputsSchema: "outputsSchema";
|
|
2783
2815
|
scorer: "scorer";
|
|
2784
2816
|
}>;
|
|
@@ -3137,6 +3169,7 @@ declare const caseDetailSchema$1: z$1.ZodObject<{
|
|
|
3137
3169
|
phase: z$1.ZodEnum<{
|
|
3138
3170
|
eval: "eval";
|
|
3139
3171
|
derive: "derive";
|
|
3172
|
+
tracingAssertions: "tracingAssertions";
|
|
3140
3173
|
outputsSchema: "outputsSchema";
|
|
3141
3174
|
scorer: "scorer";
|
|
3142
3175
|
}>;
|
|
@@ -3610,7 +3643,7 @@ type ScopedCaseSummary = {
|
|
|
3610
3643
|
//#endregion
|
|
3611
3644
|
//#region src/evalStatus.d.ts
|
|
3612
3645
|
/** Display status used for eval, file, and folder UI surfaces. */
|
|
3613
|
-
type EvalDisplayStatus = DerivedStatus | 'stale' | 'outdated' | 'unscored';
|
|
3646
|
+
type EvalDisplayStatus = DerivedStatus | 'enqueued' | 'stale' | 'outdated' | 'unscored';
|
|
3614
3647
|
/**
|
|
3615
3648
|
* Derive the user-facing eval status from the raw latest run result plus
|
|
3616
3649
|
* freshness state.
|
|
@@ -3658,10 +3691,17 @@ type EvalCase$1<TInput = unknown> = {
|
|
|
3658
3691
|
};
|
|
3659
3692
|
/** Query helpers built from the flattened trace recorded for one eval case. */
|
|
3660
3693
|
type EvalTraceTree$1 = {
|
|
3661
|
-
spans: EvalTraceSpan$1[];
|
|
3662
|
-
rootSpans: EvalTraceSpan$1[];
|
|
3663
|
-
findSpan: (name: string) => EvalTraceSpan$1 | undefined;
|
|
3664
|
-
|
|
3694
|
+
/** Flat span list in creation order. */spans: EvalTraceSpan$1[]; /** Top-level spans whose `parentId` is `null`. */
|
|
3695
|
+
rootSpans: EvalTraceSpan$1[]; /** Return the first span whose name exactly matches `name`. */
|
|
3696
|
+
findSpan: (name: string) => EvalTraceSpan$1 | undefined; /** Return every span whose name exactly matches `name`. */
|
|
3697
|
+
findSpans: (name: string) => EvalTraceSpan$1[]; /** Return whether any span name exactly matches `name`. */
|
|
3698
|
+
hasSpan: (name: string) => boolean; /** Return every span whose kind exactly matches `kind`. */
|
|
3699
|
+
findSpansByKind: (kind: string) => EvalTraceSpan$1[]; /** Return every span with `kind: 'tool'`. */
|
|
3700
|
+
findToolCallSpans: () => EvalTraceSpan$1[]; /** Return the names of every span with `kind: 'tool'`. */
|
|
3701
|
+
listToolCallSpanNames: () => string[]; /** Return whether a `kind: 'tool'` span has a name exactly matching `name`. */
|
|
3702
|
+
hasToolCallSpan: (name: string) => boolean; /** Return span names in creation order, optionally filtered by kind. */
|
|
3703
|
+
listSpanNames: (kind?: string) => string[]; /** Return span names in depth-first tree order, optionally filtered by kind. */
|
|
3704
|
+
listSpanNamesDfs: (kind?: string) => string[]; /** Return all spans in depth-first tree order. */
|
|
3665
3705
|
flattenDfs: () => EvalTraceSpan$1[];
|
|
3666
3706
|
checkpoints: Map<string, unknown>;
|
|
3667
3707
|
};
|
|
@@ -3681,6 +3721,13 @@ type EvalDeriveFn$1<TInput = unknown> = (ctx: EvalDeriveContext$1<TInput>) => Re
|
|
|
3681
3721
|
/** Trace-derived output config accepted globally and on eval definitions. */
|
|
3682
3722
|
type EvalDeriveConfig$1<TInput = unknown> = EvalDeriveMap$1<TInput> | EvalDeriveFn$1<TInput>;
|
|
3683
3723
|
/** Schema for keyed or object-returning trace-derived output config. */
|
|
3724
|
+
/** Function that records trace-derived assertions for one case. */
|
|
3725
|
+
type EvalTracingAssertionsFn$1<TInput = unknown> = (ctx: EvalDeriveContext$1<TInput>) => MaybePromise<void>;
|
|
3726
|
+
/** Keyed trace-derived assertion config for grouping related checks. */
|
|
3727
|
+
type EvalTracingAssertionsMap$1<TInput = unknown> = Record<string, EvalTracingAssertionsFn$1<TInput>>;
|
|
3728
|
+
/** Trace-derived assertion config accepted globally and on eval definitions. */
|
|
3729
|
+
type EvalTracingAssertionsConfig$1<TInput = unknown> = EvalTracingAssertionsMap$1<TInput> | EvalTracingAssertionsFn$1<TInput>;
|
|
3730
|
+
/** Schema for function or keyed trace-derived assertion config. */
|
|
3684
3731
|
/** UI overrides for a derived or scored column emitted by an eval. */
|
|
3685
3732
|
type EvalColumnOverride$1 = {
|
|
3686
3733
|
/** Display label shown for the column in tables and detail views. */label?: string;
|
|
@@ -4133,9 +4180,19 @@ type AgentEvalsConfig$1 = {
|
|
|
4133
4180
|
* Prefer the keyed map form for shared metrics:
|
|
4134
4181
|
* `{ toolCalls: ({ trace }) => trace.findSpansByKind('tool').length }`.
|
|
4135
4182
|
* The object-returning function form is also supported. Derived outputs
|
|
4136
|
-
* only fill keys that were not already recorded by eval execution.
|
|
4183
|
+
* only fill keys that were not already recorded by eval execution. Do not
|
|
4184
|
+
* call assertion helpers here; use `tracingAssertions` for trace-derived
|
|
4185
|
+
* pass/fail checks.
|
|
4137
4186
|
*/
|
|
4138
4187
|
deriveFromTracing?: EvalDeriveConfig$1;
|
|
4188
|
+
/**
|
|
4189
|
+
* Workspace-wide assertions derived from the finished execution trace.
|
|
4190
|
+
*
|
|
4191
|
+
* These run after `deriveFromTracing` and before output schema validation and
|
|
4192
|
+
* scores. Use `evalAssert(...)` or `evalExpect(...)` inside the callback to
|
|
4193
|
+
* record normal assertion results without creating fake score columns.
|
|
4194
|
+
*/
|
|
4195
|
+
tracingAssertions?: EvalTracingAssertionsConfig$1;
|
|
4139
4196
|
/**
|
|
4140
4197
|
* Workspace-wide stats prepended to every eval's stats row.
|
|
4141
4198
|
*
|
|
@@ -6366,6 +6423,7 @@ declare const caseDetailSchema: z$1.ZodObject<{
|
|
|
6366
6423
|
phase: z$1.ZodEnum<{
|
|
6367
6424
|
eval: "eval";
|
|
6368
6425
|
derive: "derive";
|
|
6426
|
+
tracingAssertions: "tracingAssertions";
|
|
6369
6427
|
outputsSchema: "outputsSchema";
|
|
6370
6428
|
scorer: "scorer";
|
|
6371
6429
|
}>;
|
|
@@ -6992,7 +7050,8 @@ type EvalRunner = {
|
|
|
6992
7050
|
getEvals(): EvalSummary$1[]; /** Look up one discovered eval by id. */
|
|
6993
7051
|
getEval(id: string): EvalSummary$1 | undefined; /** Return discovery errors that should be shown before running evals. */
|
|
6994
7052
|
getDiscoveryIssues(): DiscoveryIssue$1[]; /** Return current config-reload state for the long-running app server. */
|
|
6995
|
-
getConfigReloadState(): ConfigReloadState$1; /**
|
|
7053
|
+
getConfigReloadState(): ConfigReloadState$1; /** Return the effective per-run case concurrency after applying defaults. */
|
|
7054
|
+
getConfiguredConcurrency(): number; /** Re-scan configured eval files and emit a discovery update to listeners. */
|
|
6996
7055
|
refreshDiscovery(): Promise<void>;
|
|
6997
7056
|
startRun(request: CreateRunRequest$1): Promise<{
|
|
6998
7057
|
manifest: RunManifest$1;
|
|
@@ -7227,4 +7286,4 @@ declare function defineEval<TInput = unknown, TOutputs extends EvalOutputs = Eva
|
|
|
7227
7286
|
/** Return whether the active eval case has tags matching the typed input. */
|
|
7228
7287
|
declare function matchesEvalTags(input: EvalTagMatchInput): boolean;
|
|
7229
7288
|
//#endregion
|
|
7230
|
-
export { AgentEvalTagRegistry, AgentEvalsConfig, type ApiCallEntry, type ApiCallMetric, type ApiCallMetricFormat, type ApiCallMetricPlacement, type ApiCallMetricValue, type ApiCallsConfigInput, type AssertionFailure, type CacheActivityEntry, type CacheAdapter, type CacheDebugKeyEntry, type CacheDebugKeyFile, type CacheDebugKeyWrite, type CacheEntry, type CacheEntryWithDebugKey, type CacheFile, type CacheHitEntry, type CacheKeyHashInput, type CacheKeyHashOptions, type CacheListItem, type CacheMode, type CacheOperationType, type CacheRecording, type CacheRecordingFrame, type CacheRecordingOp, type CacheRepairSummary, type CacheScopeContext, type CacheSerializationOptions, type CacheStatus, type CallDerivedAttribute, type CallDerivedAttributeContext, type CallDerivedAttributesConfig, type CallDerivedAttributesFn, type CaptureEvalSpanErrorLevel, type CaptureEvalSpanErrorOptions, type CaseDetail, type CaseRow, type CellValue, type ColumnDef, type ColumnFormat, type ColumnKind, type ConfigReloadState, type ConfigReloadStatus, type CreateRunRequest, type DefaultConfigKey, type DerivedStatus, type DiscoveryIssue, EvalAssertionError, type EvalCacheConfig, EvalCase, type EvalCaseScope, type EvalChartAggregate, type EvalChartAxis, type EvalChartBuiltinMetric, type EvalChartColor, type EvalChartConfig, type EvalChartMetric, type EvalChartTooltipExtra, type EvalChartType, type EvalChartsConfig, type EvalColumnOverride, type EvalColumns, EvalDefinition, type EvalDeriveConfig, type EvalDeriveContext, type EvalDeriveFn, type EvalDeriveMap, type EvalDeriveValueFn, type EvalDisplayStatus, type EvalExecuteContext, type EvalExpectation, type EvalFreshnessStatus, type EvalManualInputConfig, type EvalManualScoreDef, type EvalOutputs, type EvalOutputsSchema, type EvalRunner, type EvalRuntimeScope, type EvalScoreContext, type EvalScoreDef, type EvalScoreFn, type EvalSetOutput, type EvalStartTime, type EvalStatAggregate, type EvalStatItem, type EvalStatsConfig, type EvalSummary, EvalTag, 
EvalTagMatchInput, type EvalTraceTree, type JsonCell, type LlmCallCostBreakdown, type LlmCallCostCurrency, type LlmCallEntry, type LlmCallMetric, type LlmCallMetricFormat, type LlmCallMetricPlacement, type LlmCallMetricValue, type LlmCallPricing, type LlmCallPricingRate, type LlmCallPricingRegistry, type LlmCallSimulatedTokens, type LlmCallsConfigInput, type LlmCostScenario, type ManualInputDescriptor, type ManualInputFieldDescriptor, type ManualInputFieldKind, type ManualInputFieldOverride, type ManualInputFieldsConfig, type ManualInputFileValue, type ManualInputSelectOption, type MaterializeManualInputFilesResult, type NumberDisplayOptions, type ReadManualInputFileResult, type RemoveDefaultConfig, type ResolvedApiCallMetric, type ResolvedApiCallsConfig, type ResolvedCallDerivedAttribute, type ResolvedLlmCallCostCurrency, type ResolvedLlmCallMetric, type ResolvedLlmCallPricing, type ResolvedLlmCallsConfig, type RunInEvalScopeOptions, type RunLogEntry, type RunLogLevel, type RunLogLocation, type RunLogPhase, type RunLogsConfigInput, type RunManifest, type RunSummary, type ScalarCell, type ScopedCaseSummary, type ScoreTrace, type SerializedCacheSpan, type SerializedCacheValue, type SpanCacheOptions, type SseEnvelope, type SseEventType, type TraceActiveSpan, type TraceAttributeDisplay, type TraceAttributeDisplayFormat, type TraceAttributeDisplayInput, type TraceAttributeDisplayPlacement, type TraceAttributeTransform, type TraceAttributeTransformContext, type TraceCacheInfo, type TraceCacheRef, type TraceDisplayConfig, type TraceDisplayInputConfig, type TraceSpanInfo, type TrialSelectionMode, type UpdateManualScoreRequest, appendToEvalOutput, buildTraceTree, captureEvalSpanError, cleanupStagedManualInputFiles, createRunner, defineEval, deserializeCacheRecording, deserializeCacheValue, evalAssert, evalExpect, evalLog, evalSpan, evalTime, evalTracer, extractApiCalls, extractCacheEntries, extractCacheHits, extractLlmCalls, getCurrentScope, getEvalCaseInput, 
getEvalRegistry, getNestedAttribute, hashCacheKey, hashCacheKeySync, incrementEvalOutput, isInEvalScope, isManualInputFileValue, manualInputFileValueSchema, matchesEvalTags, materializeManualInputFiles, mergeEvalOutput, nextEvalId, readManualInputFile, repoFile, runCli, runInEvalRuntimeScope, runInEvalScope, runInExistingEvalScope, serializeCacheRecording, serializeCacheValue, setEvalOutput, setScopeCacheContext, simulateLlmCallCost, simulateTokenAllocation, stageManualInputFile, stageManualInputFileFromPath, startEvalBackgroundJob, z };
|
|
7289
|
+
export { AgentEvalTagRegistry, AgentEvalsConfig, type ApiCallEntry, type ApiCallMetric, type ApiCallMetricFormat, type ApiCallMetricPlacement, type ApiCallMetricValue, type ApiCallsConfigInput, type AssertionFailure, type CacheActivityEntry, type CacheAdapter, type CacheDebugKeyEntry, type CacheDebugKeyFile, type CacheDebugKeyWrite, type CacheEntry, type CacheEntryWithDebugKey, type CacheFile, type CacheHitEntry, type CacheKeyHashInput, type CacheKeyHashOptions, type CacheListItem, type CacheMode, type CacheOperationType, type CacheRecording, type CacheRecordingFrame, type CacheRecordingOp, type CacheRepairSummary, type CacheScopeContext, type CacheSerializationOptions, type CacheStatus, type CallDerivedAttribute, type CallDerivedAttributeContext, type CallDerivedAttributesConfig, type CallDerivedAttributesFn, type CaptureEvalSpanErrorLevel, type CaptureEvalSpanErrorOptions, type CaseDetail, type CaseRow, type CellValue, type ColumnDef, type ColumnFormat, type ColumnKind, type ConfigReloadState, type ConfigReloadStatus, type CreateRunRequest, type DefaultConfigKey, type DerivedStatus, type DiscoveryIssue, EvalAssertionError, type EvalCacheConfig, EvalCase, type EvalCaseScope, type EvalChartAggregate, type EvalChartAxis, type EvalChartBuiltinMetric, type EvalChartColor, type EvalChartConfig, type EvalChartMetric, type EvalChartTooltipExtra, type EvalChartType, type EvalChartsConfig, type EvalColumnOverride, type EvalColumns, EvalDefinition, type EvalDeriveConfig, type EvalDeriveContext, type EvalDeriveFn, type EvalDeriveMap, type EvalDeriveValueFn, type EvalDisplayStatus, type EvalExecuteContext, type EvalExpectation, type EvalFreshnessStatus, type EvalManualInputConfig, type EvalManualScoreDef, type EvalOutputs, type EvalOutputsSchema, type EvalRunner, type EvalRuntimeScope, EvalRuntimeUsageError, type EvalScoreContext, type EvalScoreDef, type EvalScoreFn, type EvalSetOutput, type EvalStartTime, type EvalStatAggregate, type EvalStatItem, type EvalStatsConfig, type 
EvalSummary, EvalTag, EvalTagMatchInput, type EvalTraceTree, type EvalTracingAssertionsConfig, type EvalTracingAssertionsFn, type EvalTracingAssertionsMap, type JsonCell, type LlmCallCostBreakdown, type LlmCallCostCurrency, type LlmCallEntry, type LlmCallMetric, type LlmCallMetricFormat, type LlmCallMetricPlacement, type LlmCallMetricValue, type LlmCallPricing, type LlmCallPricingRate, type LlmCallPricingRegistry, type LlmCallSimulatedTokens, type LlmCallsConfigInput, type LlmCostScenario, type ManualInputDescriptor, type ManualInputFieldDescriptor, type ManualInputFieldKind, type ManualInputFieldOverride, type ManualInputFieldsConfig, type ManualInputFileValue, type ManualInputSelectOption, type MaterializeManualInputFilesResult, type NumberDisplayOptions, type ReadManualInputFileResult, type RemoveDefaultConfig, type ResolvedApiCallMetric, type ResolvedApiCallsConfig, type ResolvedCallDerivedAttribute, type ResolvedLlmCallCostCurrency, type ResolvedLlmCallMetric, type ResolvedLlmCallPricing, type ResolvedLlmCallsConfig, type RunInEvalScopeOptions, type RunLogEntry, type RunLogLevel, type RunLogLocation, type RunLogPhase, type RunLogsConfigInput, type RunManifest, type RunSummary, type ScalarCell, type ScopedCaseSummary, type ScoreTrace, type SerializedCacheSpan, type SerializedCacheValue, type SpanCacheOptions, type SseEnvelope, type SseEventType, type TraceActiveSpan, type TraceAttributeDisplay, type TraceAttributeDisplayFormat, type TraceAttributeDisplayInput, type TraceAttributeDisplayPlacement, type TraceAttributeTransform, type TraceAttributeTransformContext, type TraceCacheInfo, type TraceCacheRef, type TraceDisplayConfig, type TraceDisplayInputConfig, type TraceSpanInfo, type TrialSelectionMode, type UpdateManualScoreRequest, appendToEvalOutput, buildTraceTree, captureEvalSpanError, cleanupStagedManualInputFiles, createRunner, defineEval, deserializeCacheRecording, deserializeCacheValue, evalAssert, evalExpect, evalLog, evalSpan, evalTime, evalTracer, 
extractApiCalls, extractCacheEntries, extractCacheHits, extractLlmCalls, getCurrentScope, getEvalCaseInput, getEvalRegistry, getNestedAttribute, hashCacheKey, hashCacheKeySync, incrementEvalOutput, isInEvalScope, isManualInputFileValue, manualInputFileValueSchema, matchesEvalTags, materializeManualInputFiles, mergeEvalOutput, nextEvalId, readManualInputFile, repoFile, runCli, runInEvalRuntimeScope, runInEvalScope, runInExistingEvalScope, serializeCacheRecording, serializeCacheValue, setEvalOutput, setScopeCacheContext, simulateLlmCallCost, simulateTokenAllocation, stageManualInputFile, stageManualInputFileFromPath, startEvalBackgroundJob, z };
|
package/dist/index.mjs
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import { $ as
|
|
2
|
-
import { a as materializeManualInputFiles, i as isManualInputFileValue, n as createRunner, o as stageManualInputFile, r as cleanupStagedManualInputFiles, s as stageManualInputFileFromPath, t as runCli } from "./cli-
|
|
3
|
-
import { n as matchesEvalTags, t as defineEval } from "./src-
|
|
4
|
-
export { EvalAssertionError, appendToEvalOutput, buildTraceTree, captureEvalSpanError, cleanupStagedManualInputFiles, createRunner, defineEval, deserializeCacheRecording, deserializeCacheValue, evalAssert, evalExpect, evalLog, evalSpan, evalTime, evalTracer, extractApiCalls, extractCacheEntries, extractCacheHits, extractLlmCalls, getCurrentScope, getEvalCaseInput, getEvalRegistry, getNestedAttribute, hashCacheKey, hashCacheKeySync, incrementEvalOutput, isInEvalScope, isManualInputFileValue, manualInputFileValueSchema, matchesEvalTags, materializeManualInputFiles, mergeEvalOutput, nextEvalId, readManualInputFile, repoFile, runCli, runInEvalRuntimeScope, runInEvalScope, runInExistingEvalScope, serializeCacheRecording, serializeCacheValue, setEvalOutput, setScopeCacheContext, simulateLlmCallCost, simulateTokenAllocation, stageManualInputFile, stageManualInputFileFromPath, startEvalBackgroundJob, z };
|
|
1
|
+
import { $ as setScopeCacheContext, A as repoFile, B as evalTime, C as evalTracer, D as deserializeCacheValue, E as deserializeCacheRecording, F as EvalRuntimeUsageError, Ft as getEvalRegistry, H as getEvalCaseInput, I as appendToEvalOutput, J as runInEvalRuntimeScope, K as mergeEvalOutput, M as readManualInputFile, N as evalExpect, O as serializeCacheRecording, P as EvalAssertionError, Q as setEvalOutput, R as evalAssert, S as evalSpan, T as hashCacheKeySync, U as incrementEvalOutput, V as getCurrentScope, W as isInEvalScope, X as runInExistingEvalScope, Y as runInEvalScope, at as extractApiCalls, b as buildTraceTree, ct as simulateTokenAllocation, et as startEvalBackgroundJob, it as extractCacheHits, j as manualInputFileValueSchema, k as serializeCacheValue, ot as extractLlmCalls, q as nextEvalId, rt as extractCacheEntries, st as simulateLlmCallCost, ut as getNestedAttribute, w as hashCacheKey, x as captureEvalSpanError, y as z, z as evalLog } from "./runExecution-pHJ0_TzH.mjs";
|
|
2
|
+
import { a as materializeManualInputFiles, i as isManualInputFileValue, n as createRunner, o as stageManualInputFile, r as cleanupStagedManualInputFiles, s as stageManualInputFileFromPath, t as runCli } from "./cli-HBwXIJsg.mjs";
|
|
3
|
+
import { n as matchesEvalTags, t as defineEval } from "./src-AeXGBJ26.mjs";
|
|
4
|
+
export { EvalAssertionError, EvalRuntimeUsageError, appendToEvalOutput, buildTraceTree, captureEvalSpanError, cleanupStagedManualInputFiles, createRunner, defineEval, deserializeCacheRecording, deserializeCacheValue, evalAssert, evalExpect, evalLog, evalSpan, evalTime, evalTracer, extractApiCalls, extractCacheEntries, extractCacheHits, extractLlmCalls, getCurrentScope, getEvalCaseInput, getEvalRegistry, getNestedAttribute, hashCacheKey, hashCacheKeySync, incrementEvalOutput, isInEvalScope, isManualInputFileValue, manualInputFileValueSchema, matchesEvalTags, materializeManualInputFiles, mergeEvalOutput, nextEvalId, readManualInputFile, repoFile, runCli, runInEvalRuntimeScope, runInEvalScope, runInExistingEvalScope, serializeCacheRecording, serializeCacheValue, setEvalOutput, setScopeCacheContext, simulateLlmCallCost, simulateTokenAllocation, stageManualInputFile, stageManualInputFileFromPath, startEvalBackgroundJob, z };
|
package/dist/runChild.mjs
CHANGED
|
@@ -1,5 +1,5 @@
|
|
|
1
|
-
import { At as
|
|
2
|
-
import { S as parseEvalDiscovery, m as persistRunState, r as getTargetEvals$1, t as executeRun } from "./runOrchestration-
|
|
1
|
+
import { At as evalStatsConfigSchema, L as configureEvalRunLogs, Mt as evalChartsConfigSchema, Nt as columnDefSchema, Tt as buildEvalKey, bt as runManifestSchema, jt as manualInputDescriptorSchema, kt as evalStatAggregateSchema, l as registerAgentEvalsPackageResolutionHooks, p as loadConfig, tt as createRunRequestSchema, v as createFsCacheStore, xt as runSummarySchema } from "./runExecution-pHJ0_TzH.mjs";
|
|
2
|
+
import { S as parseEvalDiscovery, m as persistRunState, r as getTargetEvals$1, t as executeRun } from "./runOrchestration-ngVXShH4.mjs";
|
|
3
3
|
import { z } from "zod/v4";
|
|
4
4
|
import { readFile } from "node:fs/promises";
|
|
5
5
|
import { relative } from "node:path";
|
|
@@ -851,6 +851,7 @@ const runLogLevelSchema = z.enum([
|
|
|
851
851
|
const runLogPhaseSchema = z.enum([
|
|
852
852
|
"eval",
|
|
853
853
|
"derive",
|
|
854
|
+
"tracingAssertions",
|
|
854
855
|
"outputsSchema",
|
|
855
856
|
"scorer"
|
|
856
857
|
]);
|
|
@@ -1008,6 +1009,9 @@ const removeDefaultConfigSchema = z.union([z.literal(true), z.array(defaultConfi
|
|
|
1008
1009
|
const evalDeriveValueFnSchema = z.custom((value) => typeof value === "function", { message: "Expected a derive output function" });
|
|
1009
1010
|
/** Schema for keyed or object-returning trace-derived output config. */
|
|
1010
1011
|
const evalDeriveConfigSchema = z.union([z.custom((value) => typeof value === "function", { message: "Expected a deriveFromTracing function" }), z.record(z.string().min(1), evalDeriveValueFnSchema)]);
|
|
1012
|
+
const evalTracingAssertionsFnSchema = z.custom((value) => typeof value === "function", { message: "Expected a tracing assertions function" });
|
|
1013
|
+
/** Schema for function or keyed trace-derived assertion config. */
|
|
1014
|
+
const evalTracingAssertionsConfigSchema = z.union([z.custom((value) => typeof value === "function", { message: "Expected a tracingAssertions function" }), z.record(z.string().min(1), evalTracingAssertionsFnSchema)]);
|
|
1011
1015
|
/** Schema for UI overrides on derived or scored columns. */
|
|
1012
1016
|
const evalColumnOverrideSchema = z.object({
|
|
1013
1017
|
label: z.string().optional(),
|
|
@@ -1411,6 +1415,7 @@ const agentEvalsConfigSchema = z.object({
|
|
|
1411
1415
|
traceDisplay: traceDisplayInputConfigSchema.optional(),
|
|
1412
1416
|
columns: evalColumnsSchema.optional(),
|
|
1413
1417
|
deriveFromTracing: evalDeriveConfigSchema.optional(),
|
|
1418
|
+
tracingAssertions: evalTracingAssertionsConfigSchema.optional(),
|
|
1414
1419
|
stats: evalStatsConfigSchema.optional(),
|
|
1415
1420
|
defaultStatAggregate: evalStatAggregateSchema.optional(),
|
|
1416
1421
|
llmCalls: llmCallsConfigSchema.optional(),
|
|
@@ -1847,8 +1852,9 @@ function deriveScopedSummaryFromCases(params) {
|
|
|
1847
1852
|
* freshness state.
|
|
1848
1853
|
*/
|
|
1849
1854
|
function getEvalDisplayStatus(params) {
|
|
1850
|
-
const { stale, outdated, lastRunStatus, isRunning = false } = params;
|
|
1855
|
+
const { stale, outdated, lastRunStatus, isRunning = false, isEnqueued = false } = params;
|
|
1851
1856
|
if (isRunning || lastRunStatus === "running") return "running";
|
|
1857
|
+
if (isEnqueued) return "enqueued";
|
|
1852
1858
|
if (lastRunStatus === "pass") {
|
|
1853
1859
|
if (stale) return "stale";
|
|
1854
1860
|
if (outdated) return "outdated";
|
|
@@ -2718,6 +2724,17 @@ var EvalAssertionError = class extends Error {
|
|
|
2718
2724
|
this.name = "EvalAssertionError";
|
|
2719
2725
|
}
|
|
2720
2726
|
};
|
|
2727
|
+
/** Error thrown when an SDK helper is used in an unsupported runner phase. */
|
|
2728
|
+
var EvalRuntimeUsageError = class extends Error {
	/**
	 * @param message Text describing the misuse; forwarded to `Error`.
	 */
	constructor(message) {
		super(message);
		// Replace the inherited "Error" name so the subclass is identifiable
		// when only `name` is inspected.
		this.name = "EvalRuntimeUsageError";
	}
};
|
|
2734
|
+
/** Throw when assertion helpers are used in a runner phase that forbids them. */
|
|
2735
|
+
function assertEvalAssertionsAllowed(apiName) {
	// No active case scope: nothing to guard.
	if (!getCurrentScope()) return;
	// Only the "derive" runtime scope forbids assertion helpers.
	if (runtimeScopeStorage.getStore() !== "derive") return;
	throw new EvalRuntimeUsageError(`${apiName} cannot be used inside deriveFromTracing. Use tracingAssertions for trace-derived assertions.`);
}
|
|
2721
2738
|
function getEvalClockStateNowMs(state) {
|
|
2722
2739
|
const elapsedMs = state.frozen ? 0 : realDate.now() - state.realStartMs;
|
|
2723
2740
|
return state.startMs + elapsedMs + state.offsetMs;
|
|
@@ -2823,8 +2840,10 @@ function recordSpanForActiveCacheRecording(scope, spanId) {
|
|
|
2823
2840
|
*
|
|
2824
2841
|
* Returns `null` outside eval-owned work, `env` while the runner is loading
|
|
2825
2842
|
* eval modules for a run, `cases` while generating cases, `eval` while running
|
|
2826
|
-
* case `execute`, `derive` while deriving outputs from traces,
|
|
2827
|
-
*
|
|
2843
|
+
* case `execute`, `derive` while deriving outputs from traces,
|
|
2844
|
+
* `tracingAssertions` while checking trace-derived assertions,
|
|
2845
|
+
* `outputsSchema` while validating outputs, and `scorer` while computing
|
|
2846
|
+
* scores.
|
|
2828
2847
|
*/
|
|
2829
2848
|
function isInEvalScope() {
|
|
2830
2849
|
if (activeEvalRuntimeScopeCount === 0) return null;
|
|
@@ -2845,7 +2864,7 @@ function normalizeLogLevel(level) {
|
|
|
2845
2864
|
}
|
|
2846
2865
|
function getCurrentLogPhase() {
|
|
2847
2866
|
const runtimeScope = runtimeScopeStorage.getStore();
|
|
2848
|
-
if (runtimeScope === "eval" || runtimeScope === "derive" || runtimeScope === "outputsSchema" || runtimeScope === "scorer") return runtimeScope;
|
|
2867
|
+
if (runtimeScope === "eval" || runtimeScope === "derive" || runtimeScope === "tracingAssertions" || runtimeScope === "outputsSchema" || runtimeScope === "scorer") return runtimeScope;
|
|
2849
2868
|
return null;
|
|
2850
2869
|
}
|
|
2851
2870
|
function formatLogArgs(args) {
|
|
@@ -3300,10 +3319,12 @@ function incrementEvalOutput(key, delta) {
|
|
|
3300
3319
|
* Calls made outside `runInEvalScope(...)` are ignored so shared workflow code
|
|
3301
3320
|
* can safely reuse `evalAssert(...)` when it also runs outside an eval. The
|
|
3302
3321
|
* TypeScript assertion signature still narrows the checked value after the
|
|
3303
|
-
* call.
|
|
3322
|
+
* call. Calls inside `deriveFromTracing` throw because derivations must only
|
|
3323
|
+
* write outputs; use `tracingAssertions` for trace-derived pass/fail checks.
|
|
3304
3324
|
*/
|
|
3305
3325
|
function evalAssert(condition, message) {
|
|
3306
3326
|
const scope = getCurrentScope();
|
|
3327
|
+
assertEvalAssertionsAllowed("evalAssert(...)");
|
|
3307
3328
|
if (condition) {
|
|
3308
3329
|
if (scope) scope.assertions.push({
|
|
3309
3330
|
message,
|
|
@@ -3454,6 +3475,7 @@ var EvalExpectationImpl = class EvalExpectationImpl {
|
|
|
3454
3475
|
* case scope is active, matching `evalAssert(...)`.
|
|
3455
3476
|
*/
|
|
3456
3477
|
function evalExpect(value) {
	// Throws EvalRuntimeUsageError when called from the derive phase.
	assertEvalAssertionsAllowed("evalExpect(...)");
	const expectation = new EvalExpectationImpl(value, false);
	return expectation;
}
|
|
3459
3481
|
//#endregion
|
|
@@ -5005,25 +5027,53 @@ const evalTracer = {
|
|
|
5005
5027
|
};
|
|
5006
5028
|
/** Build a queryable trace tree helper from a flat span list and checkpoints. */
|
|
5007
5029
|
function buildTraceTree(spans, checkpoints) {
|
|
5030
|
+
const rootSpans = spans.filter((s) => s.parentId === null);
|
|
5031
|
+
const flattenDfs = () => {
|
|
5032
|
+
const result = [];
|
|
5033
|
+
function visit(parentId) {
|
|
5034
|
+
for (const childSpan of spans) if (childSpan.parentId === parentId) {
|
|
5035
|
+
result.push(childSpan);
|
|
5036
|
+
visit(childSpan.id);
|
|
5037
|
+
}
|
|
5038
|
+
}
|
|
5039
|
+
visit(null);
|
|
5040
|
+
return result;
|
|
5041
|
+
};
|
|
5042
|
+
const filterSpanNames = (sourceSpans, kind) => {
|
|
5043
|
+
return sourceSpans.filter((span) => kind === void 0 || span.kind === kind).map((span) => span.name);
|
|
5044
|
+
};
|
|
5008
5045
|
return {
|
|
5009
5046
|
spans,
|
|
5010
|
-
rootSpans
|
|
5047
|
+
rootSpans,
|
|
5011
5048
|
findSpan(name) {
|
|
5012
5049
|
return spans.find((s) => s.name === name);
|
|
5013
5050
|
},
|
|
5051
|
+
findSpans(name) {
|
|
5052
|
+
return spans.filter((s) => s.name === name);
|
|
5053
|
+
},
|
|
5054
|
+
hasSpan(name) {
|
|
5055
|
+
return spans.some((s) => s.name === name);
|
|
5056
|
+
},
|
|
5014
5057
|
findSpansByKind(kind) {
|
|
5015
5058
|
return spans.filter((s) => s.kind === kind);
|
|
5016
5059
|
},
|
|
5060
|
+
findToolCallSpans() {
|
|
5061
|
+
return spans.filter((s) => s.kind === "tool");
|
|
5062
|
+
},
|
|
5063
|
+
listToolCallSpanNames() {
|
|
5064
|
+
return filterSpanNames(spans, "tool");
|
|
5065
|
+
},
|
|
5066
|
+
hasToolCallSpan(name) {
|
|
5067
|
+
return spans.some((s) => s.kind === "tool" && s.name === name);
|
|
5068
|
+
},
|
|
5069
|
+
listSpanNames(kind) {
|
|
5070
|
+
return filterSpanNames(spans, kind);
|
|
5071
|
+
},
|
|
5072
|
+
listSpanNamesDfs(kind) {
|
|
5073
|
+
return filterSpanNames(flattenDfs(), kind);
|
|
5074
|
+
},
|
|
5017
5075
|
flattenDfs() {
|
|
5018
|
-
|
|
5019
|
-
function visit(parentId) {
|
|
5020
|
-
for (const childSpan of spans) if (childSpan.parentId === parentId) {
|
|
5021
|
-
result.push(childSpan);
|
|
5022
|
-
visit(childSpan.id);
|
|
5023
|
-
}
|
|
5024
|
-
}
|
|
5025
|
-
visit(null);
|
|
5026
|
-
return result;
|
|
5076
|
+
return flattenDfs();
|
|
5027
5077
|
},
|
|
5028
5078
|
checkpoints
|
|
5029
5079
|
};
|
|
@@ -6665,7 +6715,7 @@ async function resolveDeriveFromTracingConfig(params) {
|
|
|
6665
6715
|
return derived;
|
|
6666
6716
|
}
|
|
6667
6717
|
async function runDeriveFromTracingConfig(params) {
|
|
6668
|
-
if (params.deriveFromTracing === void 0) return;
|
|
6718
|
+
if (params.deriveFromTracing === void 0) return null;
|
|
6669
6719
|
const { deriveFromTracing } = params;
|
|
6670
6720
|
try {
|
|
6671
6721
|
const derived = await runInExistingEvalScope(params.scope, "derive", async () => await resolveDeriveFromTracingConfig({
|
|
@@ -6677,13 +6727,53 @@ async function runDeriveFromTracingConfig(params) {
|
|
|
6677
6727
|
outputs: params.scope.outputs,
|
|
6678
6728
|
derived
|
|
6679
6729
|
});
|
|
6730
|
+
return null;
|
|
6680
6731
|
} catch (e) {
|
|
6732
|
+
if (e instanceof EvalRuntimeUsageError) return e;
|
|
6681
6733
|
const message = `deriveFromTracing threw: ${e instanceof Error ? e.message : String(e)}`;
|
|
6682
6734
|
recordAssertionFailure(params.scope, toAssertionFailure(message, e instanceof Error ? e : void 0));
|
|
6735
|
+
return null;
|
|
6683
6736
|
}
|
|
6684
6737
|
}
|
|
6738
|
+
/**
 * Run a single tracing assertion callback inside the "tracingAssertions"
 * runtime scope and turn anything it throws into a recorded assertion failure.
 *
 * `params` carries: `label` (prefix for the failure message), `tracingAssertion`
 * (the callback), `scope` (the case scope), `traceTree`, and `evalCase`.
 */
async function runOneTracingAssertion(params) {
	const { label, tracingAssertion, scope, traceTree, evalCase } = params;
	// Snapshot the failure count so the catch branch can tell whether this
	// callback already recorded a failure before throwing.
	const priorFailureCount = scope.assertionFailures.length;
	const assertionContext = {
		trace: traceTree,
		input: evalCase.input,
		case: evalCase
	};
	try {
		await runInExistingEvalScope(scope, "tracingAssertions", async () => {
			await callUnknownFunction(tracingAssertion, [assertionContext]);
		});
	} catch (thrown) {
		// An EvalAssertionError that coincides with a newly recorded failure is
		// already accounted for; do not record it a second time.
		const alreadyRecorded = thrown instanceof EvalAssertionError && scope.assertionFailures.length > priorFailureCount;
		if (alreadyRecorded) return;
		const isError = thrown instanceof Error;
		const detail = isError ? thrown.message : String(thrown);
		recordAssertionFailure(scope, toAssertionFailure(`${label} threw: ${detail}`, isError ? thrown : void 0));
	}
}
|
|
6755
|
+
/**
 * Run every tracing assertion described by `params.tracingAssertions`.
 *
 * Does nothing when the config is undefined. A single function runs once with
 * the label `tracingAssertions`; a keyed object runs each entry in
 * `Object.entries` order, one at a time, labelled `tracingAssertions "<key>"`.
 */
async function runTracingAssertionsConfig(params) {
	const { tracingAssertions, scope, traceTree, evalCase } = params;
	if (tracingAssertions === void 0) return;
	// Normalise both config shapes into one list of [label, callback] pairs.
	const labelledAssertions = typeof tracingAssertions === "function"
		? [["tracingAssertions", tracingAssertions]]
		: Object.entries(tracingAssertions).map(([key, fn]) => [`tracingAssertions "${key}"`, fn]);
	// Sequential on purpose: each assertion is awaited before the next starts.
	for (const [label, tracingAssertion] of labelledAssertions) {
		await runOneTracingAssertion({
			label,
			tracingAssertion,
			scope,
			traceTree,
			evalCase
		});
	}
}
|
|
6685
6775
|
async function runCase(params) {
|
|
6686
|
-
const { evalDef, evalId, evalKey = evalId, evalCase, globalTraceDisplay, globalColumns, globalDeriveFromTracing, llmCallsConfig = resolveLlmCallsConfig(void 0), apiCallsConfig = resolveApiCallsConfig(void 0), globalRemoveDefaultConfig, trial, startTime, cacheAdapter, cacheMode, moduleIsolation, evalFilePath, evalFileRelativePath = evalFilePath, workspaceRoot, artifactDir, runId } = params;
|
|
6776
|
+
const { evalDef, evalId, evalKey = evalId, evalCase, globalTraceDisplay, globalColumns, globalDeriveFromTracing, globalTracingAssertions, llmCallsConfig = resolveLlmCallsConfig(void 0), apiCallsConfig = resolveApiCallsConfig(void 0), globalRemoveDefaultConfig, trial, startTime, cacheAdapter, cacheMode, moduleIsolation, evalFilePath, evalFileRelativePath = evalFilePath, workspaceRoot, artifactDir, runId } = params;
|
|
6687
6777
|
const scopedIdPrefix = buildScopedEvalIdPrefix({
|
|
6688
6778
|
evalId,
|
|
6689
6779
|
evalFilePath,
|
|
@@ -6728,22 +6818,36 @@ async function runCase(params) {
|
|
|
6728
6818
|
apiCallsConfig
|
|
6729
6819
|
});
|
|
6730
6820
|
const traceTree = buildTraceTree(spansWithDerivedAttributes, scope.checkpoints);
|
|
6731
|
-
|
|
6821
|
+
let nonAssertError = executeError && !(executeError instanceof EvalAssertionError) ? executeError : null;
|
|
6732
6822
|
if (executeError instanceof EvalAssertionError && scope.assertionFailures.length === 0) recordAssertionFailure(scope, toAssertionFailure(executeError.message, executeError));
|
|
6733
6823
|
if (!nonAssertError) {
|
|
6734
|
-
await runDeriveFromTracingConfig({
|
|
6824
|
+
nonAssertError = await runDeriveFromTracingConfig({
|
|
6735
6825
|
deriveFromTracing: globalDeriveFromTracing,
|
|
6736
6826
|
scope,
|
|
6737
6827
|
traceTree,
|
|
6738
6828
|
evalCase
|
|
6739
6829
|
});
|
|
6740
|
-
await runDeriveFromTracingConfig({
|
|
6830
|
+
if (!nonAssertError) nonAssertError = await runDeriveFromTracingConfig({
|
|
6741
6831
|
deriveFromTracing: evalDef.deriveFromTracing,
|
|
6742
6832
|
scope,
|
|
6743
6833
|
traceTree,
|
|
6744
6834
|
evalCase
|
|
6745
6835
|
});
|
|
6746
6836
|
}
|
|
6837
|
+
if (!nonAssertError) {
|
|
6838
|
+
await runTracingAssertionsConfig({
|
|
6839
|
+
tracingAssertions: globalTracingAssertions,
|
|
6840
|
+
scope,
|
|
6841
|
+
traceTree,
|
|
6842
|
+
evalCase
|
|
6843
|
+
});
|
|
6844
|
+
await runTracingAssertionsConfig({
|
|
6845
|
+
tracingAssertions: evalDef.tracingAssertions,
|
|
6846
|
+
scope,
|
|
6847
|
+
traceTree,
|
|
6848
|
+
evalCase
|
|
6849
|
+
});
|
|
6850
|
+
}
|
|
6747
6851
|
if (!nonAssertError) addDefaultOutputs({
|
|
6748
6852
|
outputs: scope.outputs,
|
|
6749
6853
|
spans: spansWithDerivedAttributes,
|
|
@@ -6933,4 +7037,4 @@ function recordAssertionFailure(scope, failure) {
|
|
|
6933
7037
|
});
|
|
6934
7038
|
}
|
|
6935
7039
|
//#endregion
|
|
6936
|
-
export {
|
|
7040
|
+
export { setScopeCacheContext as $, repoFile as A, evalStatsConfigSchema as At, evalTime as B, evalTracer as C, resolveLlmCallsConfig as Ct, deserializeCacheValue as D, caseDetailSchema as Dt, deserializeCacheRecording as E, getCaseRowCaseKey as Et, EvalRuntimeUsageError as F, getEvalRegistry as Ft, matchesEvalTags as G, getEvalCaseInput as H, appendToEvalOutput as I, runWithEvalRegistry as It, runInEvalRuntimeScope as J, mergeEvalOutput as K, configureEvalRunLogs as L, readManualInputFile as M, evalChartsConfigSchema as Mt, evalExpect as N, columnDefSchema as Nt, serializeCacheRecording as O, caseRowSchema as Ot, EvalAssertionError as P, defineEval as Pt, setEvalOutput as Q, evalAssert as R, evalSpan as S, resolveApiCallsConfig as St, hashCacheKeySync as T, buildEvalKey as Tt, incrementEvalOutput as U, getCurrentScope as V, isInEvalScope as W, runInExistingEvalScope as X, runInEvalScope as Y, runWithEvalClock as Z, createBufferedCacheStore as _, matchesTagsFilter as _t, isCaseChildParentMessage as a, extractApiCalls as at, buildTraceTree as b, runManifestSchema as bt, resolveArtifactPath as c, simulateTokenAllocation as ct, loadEvalModule as d, getEvalTitle as dt, startEvalBackgroundJob as et, resolveEvalDefaultConfig as f, getEvalDisplayStatus as ft, commitPendingCacheWrites as g, dedupeEvalTags as gt, normalizeScoreDef as h, deriveStatusFromChildStatuses as ht, isCaseChildMessage as i, extractCacheHits as it, manualInputFileValueSchema as j, manualInputDescriptorSchema as jt, serializeCacheValue as k, evalStatAggregateSchema as kt, registerAgentEvalsPackageResolutionHooks as l, applyDerivedCallAttributes as lt, buildDeclaredColumnDefs as m, deriveStatusFromCaseRows as mt, resolveRunnableEvalCases as n, updateManualScoreRequestSchema as nt, stripTerminalControlCodes as o, extractLlmCalls as ot, loadConfig as p, deriveScopedSummaryFromCases as pt, nextEvalId as q, runCase as r, extractCacheEntries as rt, resolveTracePresentation as s, simulateLlmCallCost as st, 
filterEvalCases as t, createRunRequestSchema as tt, runWithModuleIsolation as u, getNestedAttribute as ut, createFsCacheStore as v, validateEvalTagName as vt, hashCacheKey as w, buildCaseKey as wt, captureEvalSpanError as x, runSummarySchema as xt, z$1 as y, validateTagsFilterExpression as yt, evalLog as z };
|