@ls-stack/agent-eval 0.26.2 → 0.27.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -25,7 +25,7 @@
25
25
  href="https://fonts.googleapis.com/css2?family=Geist:wght@400;500;600&family=JetBrains+Mono:wght@400;500&display=swap"
26
26
  rel="stylesheet"
27
27
  />
28
- <script type="module" crossorigin src="/assets/index-CFF1eYUm.js"></script>
28
+ <script type="module" crossorigin src="/assets/index-DR2haqvV.js"></script>
29
29
  <link rel="stylesheet" crossorigin href="/assets/index-DOXT0Y9V.css">
30
30
  </head>
31
31
  <body>
package/dist/bin.mjs CHANGED
@@ -1,5 +1,5 @@
1
1
  #!/usr/bin/env node
2
- import { t as runCli } from "./cli-C4yumCXE.mjs";
2
+ import { t as runCli } from "./cli-DRkwWgTj.mjs";
3
3
  import { spawn } from "node:child_process";
4
4
  //#region src/bin.ts
5
5
  const moduleMocksFlag = "--experimental-test-module-mocks";
@@ -1,4 +1,4 @@
1
- import { B as runSummarySchema, C as validateCharts, F as getEvalDisplayStatus, I as deriveScopedSummaryFromCases, P as getEvalTitle, S as normalizeScoreDef, Un as getEvalRegistry, _ as loadEvalModule, a as getLastRunStatuses, b as loadConfig, c as loadPersistedRunSnapshots, ct as getCaseRowEvalKey, d as persistRunState, f as recomputeEvalStatusesInRuns, g as deriveEvalFreshness, h as resolveArtifactPath, i as generateRunId, l as nextShortIdFromSnapshots, m as runTouchesEval, n as getTargetEvalKeys, nt as resolveLlmCallsConfig, o as getLatestRunInfos, ot as buildEvalKey, p as recomputePersistedCaseStatus, s as loadPersistedRunSnapshot, st as getCaseRowCaseKey, tt as resolveApiCallsConfig, u as persistCaseDetail, v as parseEvalDiscovery, w as createFsCacheStore, x as buildDeclaredColumnDefs, y as resolveEvalDefaultConfig } from "./runOrchestration-B31SV_Bq.mjs";
1
+ import { B as runSummarySchema, C as validateCharts, F as getEvalDisplayStatus, I as deriveScopedSummaryFromCases, P as getEvalTitle, S as normalizeScoreDef, Wn as getEvalRegistry, _ as loadEvalModule, a as getLastRunStatuses, b as loadConfig, c as loadPersistedRunSnapshots, ct as getCaseRowEvalKey, d as persistRunState, f as recomputeEvalStatusesInRuns, g as deriveEvalFreshness, h as resolveArtifactPath, i as generateRunId, l as nextShortIdFromSnapshots, m as runTouchesEval, n as getTargetEvalKeys, nt as resolveLlmCallsConfig, o as getLatestRunInfos, ot as buildEvalKey, p as recomputePersistedCaseStatus, s as loadPersistedRunSnapshot, st as getCaseRowCaseKey, tt as resolveApiCallsConfig, u as persistCaseDetail, v as parseEvalDiscovery, w as createFsCacheStore, x as buildDeclaredColumnDefs, y as resolveEvalDefaultConfig } from "./runOrchestration-DB0dwGrd.mjs";
2
2
  import { createHash } from "node:crypto";
3
3
  import { mkdir, readFile, rm, writeFile } from "node:fs/promises";
4
4
  import { dirname, join, relative, resolve } from "node:path";
@@ -1041,8 +1041,8 @@ async function commandApp(args) {
1041
1041
  const { serve } = await import("@hono/node-server");
1042
1042
  const bundledWebDist = resolve(currentDir, "apps/web/dist");
1043
1043
  if (existsSync(bundledWebDist)) process.env.AGENT_EVALS_WEB_DIST = bundledWebDist;
1044
- const appModule = await import("./app-BxR4SbjA.mjs");
1045
- const runnerModule = await import("./runner-D5QMY3B_.mjs");
1044
+ const appModule = await import("./app-ByMLOds2.mjs");
1045
+ const runnerModule = await import("./runner-39KGoaM1.mjs");
1046
1046
  if (!isHonoAppModule(appModule)) throw new Error("Server app module is invalid");
1047
1047
  if (!isServerRunnerModule(runnerModule)) throw new Error("Server runner module is invalid");
1048
1048
  await runnerModule.initRunner();
package/dist/index.d.mts CHANGED
@@ -2288,7 +2288,7 @@ type CacheMode = z$1.infer<typeof cacheModeSchema>;
2288
2288
  /** Options accepted by an `evalTracer.span` call to opt the span into caching. */
2289
2289
  declare const spanCacheOptionsSchema: z$1.ZodObject<{
2290
2290
  key: z$1.ZodUnknown;
2291
- namespace: z$1.ZodOptional<z$1.ZodString>;
2291
+ namespace: z$1.ZodString;
2292
2292
  serializeFileBytes: z$1.ZodOptional<z$1.ZodBoolean>;
2293
2293
  }, z$1.core.$strip>;
2294
2294
  /** Options accepted by an `evalTracer.span` call to opt the span into caching. */
@@ -3119,6 +3119,37 @@ declare function getEvalRegistry(): Map<string, EvalRegistryEntry>;
3119
3119
  */
3120
3120
  declare function defineEval<TInput = unknown, TOutputs extends EvalOutputs = EvalOutputs>(definition: EvalDefinition<TInput, TOutputs>): void;
3121
3121
  //#endregion
3122
+ //#region ../sdk/src/evalExpect.d.ts
3123
+ /**
3124
+ * Focused expectation helpers for eval case invariants.
3125
+ *
3126
+ * These matchers intentionally cover comparisons that produce clearer failure
3127
+ * messages than a plain `evalAssert(...)`. Use `evalAssert(...)` directly for
3128
+ * truthiness checks and custom type narrowing.
3129
+ */
3130
+ type EvalExpectation<T> = {
3131
+ /** Invert the next matcher. */readonly not: EvalExpectation<T>; /** Assert strict `Object.is(...)` equality. */
3132
+ toBe(expected: unknown): void; /** Assert Node.js deep strict equality. */
3133
+ toEqual(expected: unknown): void; /** Assert that object properties recursively match the expected subset. */
3134
+ toMatchObject(expected: Record<string, unknown>): void; /** Assert substring, array item, or set item containment. */
3135
+ toContain(expected: unknown): void; /** Assert the value has a numeric `length` equal to `expected`. */
3136
+ toHaveLength(expected: number): void; /** Assert a dot-path property exists, optionally with a deep-equal value. */
3137
+ toHaveProperty(path: string, ...expected: [] | [unknown]): void; /** Assert the received number is greater than `expected`. */
3138
+ toBeGreaterThan(expected: number): void; /** Assert the received number is greater than or equal to `expected`. */
3139
+ toBeGreaterThanOrEqual(expected: number): void; /** Assert the received number is less than `expected`. */
3140
+ toBeLessThan(expected: number): void; /** Assert the received number is less than or equal to `expected`. */
3141
+ toBeLessThanOrEqual(expected: number): void; /** Assert the received number is close to `expected` at `precision` decimals. */
3142
+ toBeCloseTo(expected: number, precision?: number): void; /** Assert the received string matches the regular expression. */
3143
+ toMatch(expected: RegExp): void;
3144
+ };
3145
+ /**
3146
+ * Create focused expectation helpers for the current eval case.
3147
+ *
3148
+ * Failed expectations record assertion failures and throw only while an eval
3149
+ * case scope is active, matching `evalAssert(...)`.
3150
+ */
3151
+ declare function evalExpect<T>(value: T): EvalExpectation<T>;
3152
+ //#endregion
3122
3153
  //#region ../sdk/src/repoFile.d.ts
3123
3154
  /**
3124
3155
  * Create a file reference that can be emitted via `setEvalOutput(...)` and rendered
@@ -3168,7 +3199,7 @@ type CacheAdapter = {
3168
3199
  type CacheScopeContext = {
3169
3200
  adapter: CacheAdapter;
3170
3201
  mode: CacheMode;
3171
- evalId: string; /** Hash of the eval source file; used to invalidate on code changes. */
3202
+ evalId: string; /** Hash of the eval source file, stored as cache metadata for inspection. */
3172
3203
  codeFingerprint: string;
3173
3204
  };
3174
3205
  /** Active recording frame captured while a cached operation body executes. */
@@ -3356,12 +3387,14 @@ declare function mergeEvalOutput(key: string, patch: Record<string, unknown>): v
3356
3387
  */
3357
3388
  declare function incrementEvalOutput(key: string, delta: number): void;
3358
3389
  /**
3359
- * Assert a condition for the current eval case and throw on failure.
3390
+ * Assert a truthy condition for the current eval case and throw on failure.
3360
3391
  *
3361
3392
  * Calls made outside `runInEvalScope(...)` are ignored so shared workflow code
3362
- * can safely reuse `evalAssert(...)` when it also runs outside an eval.
3393
+ * can safely reuse `evalAssert(...)` when it also runs outside an eval. The
3394
+ * TypeScript assertion signature still narrows the checked value after the
3395
+ * call.
3363
3396
  */
3364
- declare function evalAssert(condition: boolean, message: string): void;
3397
+ declare function evalAssert(condition: unknown, message: string): asserts condition;
3365
3398
  //#endregion
3366
3399
  //#region ../sdk/src/cacheSerialization.d.ts
3367
3400
  declare const serializedCacheValueMarker = "__agentEvalsCacheSerialization";
@@ -3419,8 +3452,7 @@ type TraceCacheInfo = {
3419
3452
  //#region ../sdk/src/cacheKey.d.ts
3420
3453
  /** Components folded into a deterministic cache key hash. */
3421
3454
  type CacheKeyHashInput = {
3422
- /** Cache namespace, usually derived from the eval id and operation name. */namespace: string; /** Eval source fingerprint used to invalidate cache entries on code edits. */
3423
- codeFingerprint: string; /** User-authored cache key value. */
3455
+ /** Cache namespace, usually derived from the eval id and operation name. */namespace: string; /** User-authored cache key value. */
3424
3456
  key: unknown;
3425
3457
  };
3426
3458
  /** Optional controls for cache key hashing. */
@@ -3722,4 +3754,4 @@ declare function createRunner({
3722
3754
  */
3723
3755
  declare function runCli(argv: string[]): Promise<void>;
3724
3756
  //#endregion
3725
- export { type AgentEvalsConfig, type ApiCallEntry, type ApiCallMetric, type ApiCallMetricFormat, type ApiCallMetricPlacement, type ApiCallMetricValue, type ApiCallsConfigInput, type AssertionFailure, type CacheActivityEntry, type CacheAdapter, type CacheDebugKeyEntry, type CacheDebugKeyFile, type CacheDebugKeyWrite, type CacheEntry, type CacheEntryWithDebugKey, type CacheFile, type CacheHitEntry, type CacheKeyHashInput, type CacheKeyHashOptions, type CacheListItem, type CacheMode, type CacheOperationType, type CacheRecording, type CacheRecordingFrame, type CacheRecordingOp, type CacheScopeContext, type CacheStatus, type CallDerivedAttribute, type CallDerivedAttributeContext, type CaptureEvalSpanErrorLevel, type CaptureEvalSpanErrorOptions, type CaseDetail, type CaseRow, type CellValue, type ColumnDef, type ColumnFormat, type ColumnKind, type CreateRunRequest, DEFAULT_API_CALLS_CONFIG, DEFAULT_LLM_CALLS_CONFIG, type DefaultConfigKey, type DerivedStatus, type DiscoveryIssue, EvalAssertionError, type EvalCase, type EvalCaseScope, type EvalChartAggregate, type EvalChartAxis, type EvalChartBuiltinMetric, type EvalChartColor, type EvalChartConfig, type EvalChartMetric, type EvalChartTooltipExtra, type EvalChartType, type EvalChartsConfig, type EvalColumnOverride, type EvalColumns, type EvalDefinition, type EvalDeriveContext, type EvalDisplayStatus, type EvalExecuteContext, type EvalFreshnessStatus, type EvalManualScoreDef, type EvalOutputs, type EvalOutputsSchema, type EvalRunner, type EvalRuntimeScope, type EvalScoreContext, type EvalScoreDef, type EvalScoreFn, type EvalSetOutput, type EvalStartTime, type EvalStatAggregate, type EvalStatItem, type EvalStatsConfig, type EvalSummary, type EvalTimeUnit, type EvalTraceSpan, type EvalTraceSpanError, type EvalTraceSpanWarning, type EvalTraceTree, type FileRef, type JsonCell, type LlmCallEntry, type LlmCallMetric, type LlmCallMetricFormat, type LlmCallMetricPlacement, type LlmCallMetricValue, type LlmCallPricing, type LlmCallsConfigInput, type NumberDisplayOptions, type RemoveDefaultConfig, type RepoFileRef, type ResolvedApiCallMetric, type ResolvedApiCallsConfig, type ResolvedCallDerivedAttribute, type ResolvedLlmCallMetric, type ResolvedLlmCallPricing, type ResolvedLlmCallsConfig, type RunArtifactRef, type RunInEvalScopeOptions, type RunLogEntry, type RunLogLevel, type RunLogLocation, type RunLogPhase, type RunLogsConfigInput, type RunManifest, type RunSummary, type ScalarCell, type ScopedCaseSummary, type ScoreTrace, type SerializedCacheSpan, type SerializedCacheValue, type SpanCacheOptions, type SseEnvelope, type SseEventType, type TraceActiveSpan, type TraceAttributeDisplay, type TraceAttributeDisplayFormat, type TraceAttributeDisplayInput, type TraceAttributeDisplayPlacement, type TraceAttributeTransform, type TraceAttributeTransformContext, type TraceCacheInfo, type TraceCacheRef, type TraceDisplayConfig, type TraceDisplayInputConfig, type TraceSpanInfo, type TrialSelectionMode, type UpdateManualScoreRequest, advanceEvalTime, agentEvalsConfigSchema, apiCallMetricFormatSchema, apiCallMetricPlacementSchema, apiCallMetricSchema, apiCallsConfigSchema, appendToEvalOutput, applyDerivedCallAttributes, assertionFailureSchema, buildCaseKey, buildEvalKey, buildTraceTree, cacheDebugKeyEntrySchema, cacheDebugKeyFileSchema, cacheEntrySchema, cacheEntryWithDebugKeySchema, cacheFileSchema, cacheListItemSchema, cacheModeSchema, cacheOperationTypeSchema, cacheRecordingOpSchema, cacheRecordingSchema, cacheStatusSchema, captureEvalSpanError, caseDetailSchema, caseRowSchema, cellValueSchema, columnDefSchema, columnFormatSchema, columnKindSchema, createRunRequestSchema, createRunner, defaultConfigKeySchema, defineEval, deriveScopedSummaryFromCases, deriveStatusFromCaseRows, deriveStatusFromChildStatuses, deserializeCacheRecording, deserializeCacheValue, discoveryIssueSchema, evalAssert, evalChartAggregateSchema, evalChartAxisSchema, evalChartBuiltinMetricSchema, evalChartColorSchema, evalChartConfigSchema, evalChartMetricSchema, evalChartTooltipExtraSchema, evalChartTypeSchema, evalChartsConfigSchema, evalFreshnessStatusSchema, evalLog, evalSpan, evalStatAggregateSchema, evalStatItemSchema, evalStatsConfigSchema, evalSummarySchema, evalTracer, extractApiCalls, extractCacheEntries, extractCacheHits, extractLlmCalls, fileRefSchema, getCaseRowCaseKey, getCaseRowEvalKey, getCurrentScope, getEvalCaseInput, getEvalDisplayStatus, getEvalRegistry, getEvalStartTime, getEvalTitle, getNestedAttribute, hashCacheKey, hashCacheKeySync, incrementEvalOutput, isInEvalScope, jsonCellSchema, llmCallMetricFormatSchema, llmCallMetricPlacementSchema, llmCallMetricSchema, llmCallPricingSchema, llmCallsConfigSchema, mergeEvalOutput, nextEvalId, numberDisplayOptionsSchema, removeDefaultConfigSchema, repoFile, repoFileRefSchema, resolveApiCallsConfig, resolveLlmCallsConfig, runArtifactRefSchema, runCli, runInEvalRuntimeScope, runInEvalScope, runInExistingEvalScope, runLogEntrySchema, runLogLevelSchema, runLogLocationSchema, runLogPhaseSchema, runLogsConfigSchema, runManifestSchema, runSummarySchema, scoreTraceSchema, serializeCacheRecording, serializeCacheValue, serializedCacheSpanSchema, setEvalOutput, setScopeCacheContext, spanCacheOptionsSchema, sseEnvelopeSchema, startEvalBackgroundJob, traceAttributeDisplayFormatSchema, traceAttributeDisplayInputSchema, traceAttributeDisplayPlacementSchema, traceAttributeDisplaySchema, traceCacheRefSchema, traceDisplayConfigSchema, traceDisplayInputConfigSchema, traceSpanErrorSchema, traceSpanKindSchema, traceSpanSchema, traceSpanWarningSchema, trialSelectionModeSchema, updateManualScoreRequestSchema, z };
3757
+ export { type AgentEvalsConfig, type ApiCallEntry, type ApiCallMetric, type ApiCallMetricFormat, type ApiCallMetricPlacement, type ApiCallMetricValue, type ApiCallsConfigInput, type AssertionFailure, type CacheActivityEntry, type CacheAdapter, type CacheDebugKeyEntry, type CacheDebugKeyFile, type CacheDebugKeyWrite, type CacheEntry, type CacheEntryWithDebugKey, type CacheFile, type CacheHitEntry, type CacheKeyHashInput, type CacheKeyHashOptions, type CacheListItem, type CacheMode, type CacheOperationType, type CacheRecording, type CacheRecordingFrame, type CacheRecordingOp, type CacheScopeContext, type CacheStatus, type CallDerivedAttribute, type CallDerivedAttributeContext, type CaptureEvalSpanErrorLevel, type CaptureEvalSpanErrorOptions, type CaseDetail, type CaseRow, type CellValue, type ColumnDef, type ColumnFormat, type ColumnKind, type CreateRunRequest, DEFAULT_API_CALLS_CONFIG, DEFAULT_LLM_CALLS_CONFIG, type DefaultConfigKey, type DerivedStatus, type DiscoveryIssue, EvalAssertionError, type EvalCase, type EvalCaseScope, type EvalChartAggregate, type EvalChartAxis, type EvalChartBuiltinMetric, type EvalChartColor, type EvalChartConfig, type EvalChartMetric, type EvalChartTooltipExtra, type EvalChartType, type EvalChartsConfig, type EvalColumnOverride, type EvalColumns, type EvalDefinition, type EvalDeriveContext, type EvalDisplayStatus, type EvalExecuteContext, type EvalExpectation, type EvalFreshnessStatus, type EvalManualScoreDef, type EvalOutputs, type EvalOutputsSchema, type EvalRunner, type EvalRuntimeScope, type EvalScoreContext, type EvalScoreDef, type EvalScoreFn, type EvalSetOutput, type EvalStartTime, type EvalStatAggregate, type EvalStatItem, type EvalStatsConfig, type EvalSummary, type EvalTimeUnit, type EvalTraceSpan, type EvalTraceSpanError, type EvalTraceSpanWarning, type EvalTraceTree, type FileRef, type JsonCell, type LlmCallEntry, type LlmCallMetric, type LlmCallMetricFormat, type LlmCallMetricPlacement, type LlmCallMetricValue, type LlmCallPricing, type LlmCallsConfigInput, type NumberDisplayOptions, type RemoveDefaultConfig, type RepoFileRef, type ResolvedApiCallMetric, type ResolvedApiCallsConfig, type ResolvedCallDerivedAttribute, type ResolvedLlmCallMetric, type ResolvedLlmCallPricing, type ResolvedLlmCallsConfig, type RunArtifactRef, type RunInEvalScopeOptions, type RunLogEntry, type RunLogLevel, type RunLogLocation, type RunLogPhase, type RunLogsConfigInput, type RunManifest, type RunSummary, type ScalarCell, type ScopedCaseSummary, type ScoreTrace, type SerializedCacheSpan, type SerializedCacheValue, type SpanCacheOptions, type SseEnvelope, type SseEventType, type TraceActiveSpan, type TraceAttributeDisplay, type TraceAttributeDisplayFormat, type TraceAttributeDisplayInput, type TraceAttributeDisplayPlacement, type TraceAttributeTransform, type TraceAttributeTransformContext, type TraceCacheInfo, type TraceCacheRef, type TraceDisplayConfig, type TraceDisplayInputConfig, type TraceSpanInfo, type TrialSelectionMode, type UpdateManualScoreRequest, advanceEvalTime, agentEvalsConfigSchema, apiCallMetricFormatSchema, apiCallMetricPlacementSchema, apiCallMetricSchema, apiCallsConfigSchema, appendToEvalOutput, applyDerivedCallAttributes, assertionFailureSchema, buildCaseKey, buildEvalKey, buildTraceTree, cacheDebugKeyEntrySchema, cacheDebugKeyFileSchema, cacheEntrySchema, cacheEntryWithDebugKeySchema, cacheFileSchema, cacheListItemSchema, cacheModeSchema, cacheOperationTypeSchema, cacheRecordingOpSchema, cacheRecordingSchema, cacheStatusSchema, captureEvalSpanError, caseDetailSchema, caseRowSchema, cellValueSchema, columnDefSchema, columnFormatSchema, columnKindSchema, createRunRequestSchema, createRunner, defaultConfigKeySchema, defineEval, deriveScopedSummaryFromCases, deriveStatusFromCaseRows, deriveStatusFromChildStatuses, deserializeCacheRecording, deserializeCacheValue, discoveryIssueSchema, evalAssert, evalChartAggregateSchema, evalChartAxisSchema, evalChartBuiltinMetricSchema, evalChartColorSchema, evalChartConfigSchema, evalChartMetricSchema, evalChartTooltipExtraSchema, evalChartTypeSchema, evalChartsConfigSchema, evalExpect, evalFreshnessStatusSchema, evalLog, evalSpan, evalStatAggregateSchema, evalStatItemSchema, evalStatsConfigSchema, evalSummarySchema, evalTracer, extractApiCalls, extractCacheEntries, extractCacheHits, extractLlmCalls, fileRefSchema, getCaseRowCaseKey, getCaseRowEvalKey, getCurrentScope, getEvalCaseInput, getEvalDisplayStatus, getEvalRegistry, getEvalStartTime, getEvalTitle, getNestedAttribute, hashCacheKey, hashCacheKeySync, incrementEvalOutput, isInEvalScope, jsonCellSchema, llmCallMetricFormatSchema, llmCallMetricPlacementSchema, llmCallMetricSchema, llmCallPricingSchema, llmCallsConfigSchema, mergeEvalOutput, nextEvalId, numberDisplayOptionsSchema, removeDefaultConfigSchema, repoFile, repoFileRefSchema, resolveApiCallsConfig, resolveLlmCallsConfig, runArtifactRefSchema, runCli, runInEvalRuntimeScope, runInEvalScope, runInExistingEvalScope, runLogEntrySchema, runLogLevelSchema, runLogLocationSchema, runLogPhaseSchema, runLogsConfigSchema, runManifestSchema, runSummarySchema, scoreTraceSchema, serializeCacheRecording, serializeCacheValue, serializedCacheSpanSchema, setEvalOutput, setScopeCacheContext, spanCacheOptionsSchema, sseEnvelopeSchema, startEvalBackgroundJob, traceAttributeDisplayFormatSchema, traceAttributeDisplayInputSchema, traceAttributeDisplayPlacementSchema, traceAttributeDisplaySchema, traceCacheRefSchema, traceDisplayConfigSchema, traceDisplayInputConfigSchema, traceSpanErrorSchema, traceSpanKindSchema, traceSpanSchema, traceSpanWarningSchema, trialSelectionModeSchema, updateManualScoreRequestSchema, z };
package/dist/index.mjs CHANGED
@@ -1,4 +1,4 @@
1
- import { $ as llmCallsConfigSchema, $t as traceSpanKindSchema, A as extractApiCalls, An as getEvalStartTime, At as evalChartTypeSchema, B as runSummarySchema, Bn as startEvalBackgroundJob, Bt as cacheRecordingOpSchema, Cn as advanceEvalTime, Ct as evalChartAggregateSchema, D as sseEnvelopeSchema, Dn as evalLog, Dt as evalChartConfigSchema, E as updateManualScoreRequestSchema, En as evalAssert, Et as evalChartColorSchema, F as getEvalDisplayStatus, Fn as runInEvalRuntimeScope, Ft as cacheEntryWithDebugKeySchema, G as apiCallMetricPlacementSchema, Gt as traceCacheRefSchema, H as DEFAULT_LLM_CALLS_CONFIG, Hn as defineEval, Ht as cacheStatusSchema, I as deriveScopedSummaryFromCases, In as runInEvalScope, It as cacheFileSchema, J as defaultConfigKeySchema, Jt as traceAttributeDisplayPlacementSchema, K as apiCallMetricSchema, Kt as traceAttributeDisplayFormatSchema, L as deriveStatusFromCaseRows, Ln as runInExistingEvalScope, Lt as cacheListItemSchema, M as applyDerivedCallAttributes, Mn as isInEvalScope, Mt as cacheDebugKeyEntrySchema, N as getNestedAttribute, Nn as mergeEvalOutput, Nt as cacheDebugKeyFileSchema, O as extractCacheEntries, On as getCurrentScope, Ot as evalChartMetricSchema, P as getEvalTitle, Pn as nextEvalId, Pt as cacheEntrySchema, Q as llmCallPricingSchema, Qt as traceSpanErrorSchema, R as deriveStatusFromChildStatuses, Rn as setEvalOutput, Rt as cacheModeSchema, Sn as EvalAssertionError, St as scoreTraceSchema, T as createRunRequestSchema, Tt as evalChartBuiltinMetricSchema, U as agentEvalsConfigSchema, Un as getEvalRegistry, Ut as serializedCacheSpanSchema, V as DEFAULT_API_CALLS_CONFIG, Vn as repoFile, Vt as cacheRecordingSchema, W as apiCallMetricFormatSchema, Wt as spanCacheOptionsSchema, X as llmCallMetricPlacementSchema, Xt as traceDisplayConfigSchema, Y as llmCallMetricFormatSchema, Yt as traceAttributeDisplaySchema, Z as llmCallMetricSchema, Zt as traceDisplayInputConfigSchema, _n as hashCacheKeySync, _t as evalSummarySchema, an as columnKindSchema, at as buildCaseKey, bn as serializeCacheRecording, bt as runLogLocationSchema, cn as numberDisplayOptionsSchema, ct as getCaseRowEvalKey, dn as z, dt as caseRowSchema, en as traceSpanSchema, et as removeDefaultConfigSchema, fn as buildTraceTree, ft as discoveryIssueSchema, gn as hashCacheKey, gt as evalStatsConfigSchema, hn as evalTracer, ht as evalStatItemSchema, in as columnFormatSchema, it as trialSelectionModeSchema, j as extractLlmCalls, jn as incrementEvalOutput, jt as evalChartsConfigSchema, k as extractCacheHits, kn as getEvalCaseInput, kt as evalChartTooltipExtraSchema, ln as repoFileRefSchema, lt as assertionFailureSchema, mn as evalSpan, mt as evalStatAggregateSchema, nn as cellValueSchema, nt as resolveLlmCallsConfig, on as fileRefSchema, ot as buildEvalKey, pn as captureEvalSpanError, pt as evalFreshnessStatusSchema, q as apiCallsConfigSchema, qt as traceAttributeDisplayInputSchema, rn as columnDefSchema, rt as runLogsConfigSchema, sn as jsonCellSchema, st as getCaseRowCaseKey, tn as traceSpanWarningSchema, tt as resolveApiCallsConfig, un as runArtifactRefSchema, ut as caseDetailSchema, vn as deserializeCacheRecording, vt as runLogEntrySchema, wn as appendToEvalOutput, wt as evalChartAxisSchema, xn as serializeCacheValue, xt as runLogPhaseSchema, yn as deserializeCacheValue, yt as runLogLevelSchema, z as runManifestSchema, zn as setScopeCacheContext, zt as cacheOperationTypeSchema } from "./runOrchestration-B31SV_Bq.mjs";
2
- import { n as createRunner, t as runCli } from "./cli-C4yumCXE.mjs";
3
- import "./src-CLMrRle2.mjs";
4
- export { DEFAULT_API_CALLS_CONFIG, DEFAULT_LLM_CALLS_CONFIG, EvalAssertionError, advanceEvalTime, agentEvalsConfigSchema, apiCallMetricFormatSchema, apiCallMetricPlacementSchema, apiCallMetricSchema, apiCallsConfigSchema, appendToEvalOutput, applyDerivedCallAttributes, assertionFailureSchema, buildCaseKey, buildEvalKey, buildTraceTree, cacheDebugKeyEntrySchema, cacheDebugKeyFileSchema, cacheEntrySchema, cacheEntryWithDebugKeySchema, cacheFileSchema, cacheListItemSchema, cacheModeSchema, cacheOperationTypeSchema, cacheRecordingOpSchema, cacheRecordingSchema, cacheStatusSchema, captureEvalSpanError, caseDetailSchema, caseRowSchema, cellValueSchema, columnDefSchema, columnFormatSchema, columnKindSchema, createRunRequestSchema, createRunner, defaultConfigKeySchema, defineEval, deriveScopedSummaryFromCases, deriveStatusFromCaseRows, deriveStatusFromChildStatuses, deserializeCacheRecording, deserializeCacheValue, discoveryIssueSchema, evalAssert, evalChartAggregateSchema, evalChartAxisSchema, evalChartBuiltinMetricSchema, evalChartColorSchema, evalChartConfigSchema, evalChartMetricSchema, evalChartTooltipExtraSchema, evalChartTypeSchema, evalChartsConfigSchema, evalFreshnessStatusSchema, evalLog, evalSpan, evalStatAggregateSchema, evalStatItemSchema, evalStatsConfigSchema, evalSummarySchema, evalTracer, extractApiCalls, extractCacheEntries, extractCacheHits, extractLlmCalls, fileRefSchema, getCaseRowCaseKey, getCaseRowEvalKey, getCurrentScope, getEvalCaseInput, getEvalDisplayStatus, getEvalRegistry, getEvalStartTime, getEvalTitle, getNestedAttribute, hashCacheKey, hashCacheKeySync, incrementEvalOutput, isInEvalScope, jsonCellSchema, llmCallMetricFormatSchema, llmCallMetricPlacementSchema, llmCallMetricSchema, llmCallPricingSchema, llmCallsConfigSchema, mergeEvalOutput, nextEvalId, numberDisplayOptionsSchema, removeDefaultConfigSchema, repoFile, repoFileRefSchema, resolveApiCallsConfig, resolveLlmCallsConfig, runArtifactRefSchema, runCli, runInEvalRuntimeScope, runInEvalScope, runInExistingEvalScope, runLogEntrySchema, runLogLevelSchema, runLogLocationSchema, runLogPhaseSchema, runLogsConfigSchema, runManifestSchema, runSummarySchema, scoreTraceSchema, serializeCacheRecording, serializeCacheValue, serializedCacheSpanSchema, setEvalOutput, setScopeCacheContext, spanCacheOptionsSchema, sseEnvelopeSchema, startEvalBackgroundJob, traceAttributeDisplayFormatSchema, traceAttributeDisplayInputSchema, traceAttributeDisplayPlacementSchema, traceAttributeDisplaySchema, traceCacheRefSchema, traceDisplayConfigSchema, traceDisplayInputConfigSchema, traceSpanErrorSchema, traceSpanKindSchema, traceSpanSchema, traceSpanWarningSchema, trialSelectionModeSchema, updateManualScoreRequestSchema, z };
1
+ import { $ as llmCallsConfigSchema, $t as traceSpanKindSchema, A as extractApiCalls, An as getCurrentScope, At as evalChartTypeSchema, B as runSummarySchema, Bn as setEvalOutput, Bt as cacheRecordingOpSchema, Cn as evalExpect, Ct as evalChartAggregateSchema, D as sseEnvelopeSchema, Dt as evalChartConfigSchema, E as updateManualScoreRequestSchema, En as appendToEvalOutput, Et as evalChartColorSchema, F as getEvalDisplayStatus, Fn as mergeEvalOutput, Ft as cacheEntryWithDebugKeySchema, G as apiCallMetricPlacementSchema, Gt as traceCacheRefSchema, H as DEFAULT_LLM_CALLS_CONFIG, Hn as startEvalBackgroundJob, Ht as cacheStatusSchema, I as deriveScopedSummaryFromCases, In as nextEvalId, It as cacheFileSchema, J as defaultConfigKeySchema, Jt as traceAttributeDisplayPlacementSchema, K as apiCallMetricSchema, Kt as traceAttributeDisplayFormatSchema, L as deriveStatusFromCaseRows, Ln as runInEvalRuntimeScope, Lt as cacheListItemSchema, M as applyDerivedCallAttributes, Mn as getEvalStartTime, Mt as cacheDebugKeyEntrySchema, N as getNestedAttribute, Nn as incrementEvalOutput, Nt as cacheDebugKeyFileSchema, O as extractCacheEntries, On as evalAssert, Ot as evalChartMetricSchema, P as getEvalTitle, Pn as isInEvalScope, Pt as cacheEntrySchema, Q as llmCallPricingSchema, Qt as traceSpanErrorSchema, R as deriveStatusFromChildStatuses, Rn as runInEvalScope, Rt as cacheModeSchema, Sn as repoFile, St as scoreTraceSchema, T as createRunRequestSchema, Tn as advanceEvalTime, Tt as evalChartBuiltinMetricSchema, U as agentEvalsConfigSchema, Un as defineEval, Ut as serializedCacheSpanSchema, V as DEFAULT_API_CALLS_CONFIG, Vn as setScopeCacheContext, Vt as cacheRecordingSchema, W as apiCallMetricFormatSchema, Wn as getEvalRegistry, Wt as spanCacheOptionsSchema, X as llmCallMetricPlacementSchema, Xt as traceDisplayConfigSchema, Y as llmCallMetricFormatSchema, Yt as traceAttributeDisplaySchema, Z as llmCallMetricSchema, Zt as traceDisplayInputConfigSchema, _n as hashCacheKeySync, _t as evalSummarySchema, an as columnKindSchema, at as buildCaseKey, bn as serializeCacheRecording, bt as runLogLocationSchema, cn as numberDisplayOptionsSchema, ct as getCaseRowEvalKey, dn as z, dt as caseRowSchema, en as traceSpanSchema, et as removeDefaultConfigSchema, fn as buildTraceTree, ft as discoveryIssueSchema, gn as hashCacheKey, gt as evalStatsConfigSchema, hn as evalTracer, ht as evalStatItemSchema, in as columnFormatSchema, it as trialSelectionModeSchema, j as extractLlmCalls, jn as getEvalCaseInput, jt as evalChartsConfigSchema, k as extractCacheHits, kn as evalLog, kt as evalChartTooltipExtraSchema, ln as repoFileRefSchema, lt as assertionFailureSchema, mn as evalSpan, mt as evalStatAggregateSchema, nn as cellValueSchema, nt as resolveLlmCallsConfig, on as fileRefSchema, ot as buildEvalKey, pn as captureEvalSpanError, pt as evalFreshnessStatusSchema, q as apiCallsConfigSchema, qt as traceAttributeDisplayInputSchema, rn as columnDefSchema, rt as runLogsConfigSchema, sn as jsonCellSchema, st as getCaseRowCaseKey, tn as traceSpanWarningSchema, tt as resolveApiCallsConfig, un as runArtifactRefSchema, ut as caseDetailSchema, vn as deserializeCacheRecording, vt as runLogEntrySchema, wn as EvalAssertionError, wt as evalChartAxisSchema, xn as serializeCacheValue, xt as runLogPhaseSchema, yn as deserializeCacheValue, yt as runLogLevelSchema, z as runManifestSchema, zn as runInExistingEvalScope, zt as cacheOperationTypeSchema } from "./runOrchestration-DB0dwGrd.mjs";
2
+ import { n as createRunner, t as runCli } from "./cli-DRkwWgTj.mjs";
3
+ import "./src-BwKm3sKU.mjs";
4
+ export { DEFAULT_API_CALLS_CONFIG, DEFAULT_LLM_CALLS_CONFIG, EvalAssertionError, advanceEvalTime, agentEvalsConfigSchema, apiCallMetricFormatSchema, apiCallMetricPlacementSchema, apiCallMetricSchema, apiCallsConfigSchema, appendToEvalOutput, applyDerivedCallAttributes, assertionFailureSchema, buildCaseKey, buildEvalKey, buildTraceTree, cacheDebugKeyEntrySchema, cacheDebugKeyFileSchema, cacheEntrySchema, cacheEntryWithDebugKeySchema, cacheFileSchema, cacheListItemSchema, cacheModeSchema, cacheOperationTypeSchema, cacheRecordingOpSchema, cacheRecordingSchema, cacheStatusSchema, captureEvalSpanError, caseDetailSchema, caseRowSchema, cellValueSchema, columnDefSchema, columnFormatSchema, columnKindSchema, createRunRequestSchema, createRunner, defaultConfigKeySchema, defineEval, deriveScopedSummaryFromCases, deriveStatusFromCaseRows, deriveStatusFromChildStatuses, deserializeCacheRecording, deserializeCacheValue, discoveryIssueSchema, evalAssert, evalChartAggregateSchema, evalChartAxisSchema, evalChartBuiltinMetricSchema, evalChartColorSchema, evalChartConfigSchema, evalChartMetricSchema, evalChartTooltipExtraSchema, evalChartTypeSchema, evalChartsConfigSchema, evalExpect, evalFreshnessStatusSchema, evalLog, evalSpan, evalStatAggregateSchema, evalStatItemSchema, evalStatsConfigSchema, evalSummarySchema, evalTracer, extractApiCalls, extractCacheEntries, extractCacheHits, extractLlmCalls, fileRefSchema, getCaseRowCaseKey, getCaseRowEvalKey, getCurrentScope, getEvalCaseInput, getEvalDisplayStatus, getEvalRegistry, getEvalStartTime, getEvalTitle, getNestedAttribute, hashCacheKey, hashCacheKeySync, incrementEvalOutput, isInEvalScope, jsonCellSchema, llmCallMetricFormatSchema, llmCallMetricPlacementSchema, llmCallMetricSchema, llmCallPricingSchema, llmCallsConfigSchema, mergeEvalOutput, nextEvalId, numberDisplayOptionsSchema, removeDefaultConfigSchema, repoFile, repoFileRefSchema, resolveApiCallsConfig, resolveLlmCallsConfig, runArtifactRefSchema, runCli, runInEvalRuntimeScope, runInEvalScope, runInExistingEvalScope, runLogEntrySchema, runLogLevelSchema, runLogLocationSchema, runLogPhaseSchema, runLogsConfigSchema, runManifestSchema, runSummarySchema, scoreTraceSchema, serializeCacheRecording, serializeCacheValue, serializedCacheSpanSchema, setEvalOutput, setScopeCacheContext, spanCacheOptionsSchema, sseEnvelopeSchema, startEvalBackgroundJob, traceAttributeDisplayFormatSchema, traceAttributeDisplayInputSchema, traceAttributeDisplayPlacementSchema, traceAttributeDisplaySchema, traceCacheRefSchema, traceDisplayConfigSchema, traceDisplayInputConfigSchema, traceSpanErrorSchema, traceSpanKindSchema, traceSpanSchema, traceSpanWarningSchema, trialSelectionModeSchema, updateManualScoreRequestSchema, z };
package/dist/runChild.mjs CHANGED
@@ -1,4 +1,4 @@
1
- import { B as runSummarySchema, T as createRunRequestSchema, Tn as configureEvalRunLogs, b as loadConfig, gt as evalStatsConfigSchema, jt as evalChartsConfigSchema, ot as buildEvalKey, r as getTargetEvals$1, rn as columnDefSchema, t as executeRun, v as parseEvalDiscovery, w as createFsCacheStore, z as runManifestSchema } from "./runOrchestration-B31SV_Bq.mjs";
1
+ import { B as runSummarySchema, Dn as configureEvalRunLogs, T as createRunRequestSchema, b as loadConfig, gt as evalStatsConfigSchema, jt as evalChartsConfigSchema, ot as buildEvalKey, r as getTargetEvals$1, rn as columnDefSchema, t as executeRun, v as parseEvalDiscovery, w as createFsCacheStore, z as runManifestSchema } from "./runOrchestration-DB0dwGrd.mjs";
2
2
  import { createHash } from "node:crypto";
3
3
  import { readFile } from "node:fs/promises";
4
4
  import { relative } from "node:path";
@@ -2,9 +2,9 @@ import { createRequire, registerHooks } from "node:module";
2
2
  import { createHash } from "node:crypto";
3
3
  import { mkdir, readFile, readdir, rename, rm, stat, writeFile } from "node:fs/promises";
4
4
  import { extname, isAbsolute, join, relative, resolve } from "node:path";
5
- import { z, z as z$1 } from "zod/v4";
5
+ import { formatWithOptions, isDeepStrictEqual } from "node:util";
6
6
  import { AsyncLocalStorage } from "node:async_hooks";
7
- import { formatWithOptions } from "node:util";
7
+ import { z, z as z$1 } from "zod/v4";
8
8
  import { Buffer as Buffer$1 } from "node:buffer";
9
9
  import { gunzipSync, gzipSync } from "node:zlib";
10
10
  import { getCompositeKey } from "@ls-stack/utils/getCompositeKey";
@@ -29,23 +29,6 @@ function defineEval(definition) {
29
29
  });
30
30
  }
31
31
  //#endregion
32
- //#region ../sdk/src/repoFile.ts
33
- /**
34
- * Create a file reference that can be emitted via `setEvalOutput(...)` and rendered
35
- * by a column configured with `format: 'image' | 'audio' | 'video' | 'file'`.
36
- *
37
- * @param path Relative or absolute path to the repository file.
38
- * @param mimeType Optional MIME type hint for UI rendering.
39
- * @returns A repo-backed file reference suitable for file/media columns.
40
- */
41
- function repoFile(path, mimeType) {
42
- return {
43
- source: "repo",
44
- path,
45
- mimeType
46
- };
47
- }
48
- //#endregion
49
32
  //#region ../sdk/src/runtime.ts
50
33
  const scopeStorage = new AsyncLocalStorage();
51
34
  const runtimeScopeStorage = new AsyncLocalStorage();
@@ -639,10 +622,12 @@ function incrementEvalOutput(key, delta) {
639
622
  });
640
623
  }
641
624
  /**
642
- * Assert a condition for the current eval case and throw on failure.
625
+ * Assert a truthy condition for the current eval case and throw on failure.
643
626
  *
644
627
  * Calls made outside `runInEvalScope(...)` are ignored so shared workflow code
645
- * can safely reuse `evalAssert(...)` when it also runs outside an eval.
628
+ * can safely reuse `evalAssert(...)` when it also runs outside an eval. The
629
+ * TypeScript assertion signature still narrows the checked value after the
630
+ * call.
646
631
  */
647
632
  function evalAssert(condition, message) {
648
633
  if (condition) return;
@@ -653,6 +638,156 @@ function evalAssert(condition, message) {
653
638
  throw error;
654
639
  }
655
640
  //#endregion
641
+ //#region ../sdk/src/evalExpect.ts
642
+ const expectFormatOptions = {
643
+ depth: 5,
644
+ maxArrayLength: 50,
645
+ maxStringLength: 1e3,
646
+ breakLength: 80,
647
+ compact: 3
648
+ };
649
+ function formatExpectValue(value) {
650
+ return formatWithOptions(expectFormatOptions, value);
651
+ }
652
+ function isRecordLike$4(value) {
653
+ return typeof value === "object" && value !== null;
654
+ }
655
+ function isLengthLike(value) {
656
+ if (typeof value === "string") return true;
657
+ return isRecordLike$4(value) && "length" in value && typeof value.length === "number";
658
+ }
659
+ function matchesObjectSubset(received, expected) {
660
+ if (!isRecordLike$4(received)) return false;
661
+ for (const [key, expectedValue] of Object.entries(expected)) {
662
+ if (!(key in received)) return false;
663
+ const receivedValue = received[key];
664
+ if (isRecordLike$4(expectedValue) && !Array.isArray(expectedValue)) {
665
+ if (!matchesObjectSubset(receivedValue, expectedValue)) return false;
666
+ continue;
667
+ }
668
+ if (!isDeepStrictEqual(receivedValue, expectedValue)) return false;
669
+ }
670
+ return true;
671
+ }
672
+ function containsValue(received, expected) {
673
+ if (typeof received === "string") return typeof expected === "string" && received.includes(expected);
674
+ if (Array.isArray(received)) return received.includes(expected);
675
+ if (received instanceof Set) return received.has(expected);
676
+ return false;
677
+ }
678
+ function getPropertyAtPath(received, path) {
679
+ if (path === "") return {
680
+ exists: false,
681
+ value: void 0
682
+ };
683
+ let current = received;
684
+ for (const key of path.split(".")) {
685
+ if (!isRecordLike$4(current) || !(key in current)) return {
686
+ exists: false,
687
+ value: void 0
688
+ };
689
+ current = current[key];
690
+ }
691
+ return {
692
+ exists: true,
693
+ value: current
694
+ };
695
+ }
696
+ var EvalExpectationImpl = class EvalExpectationImpl {
697
+ received;
698
+ negated;
699
+ constructor(received, negated) {
700
+ this.received = received;
701
+ this.negated = negated;
702
+ }
703
+ get not() {
704
+ return new EvalExpectationImpl(this.received, !this.negated);
705
+ }
706
+ toBe(expected) {
707
+ this.check(Object.is(this.received, expected), `Expected ${formatExpectValue(this.received)} to be ${formatExpectValue(expected)}`, `Expected ${formatExpectValue(this.received)} not to be ${formatExpectValue(expected)}`);
708
+ }
709
+ toEqual(expected) {
710
+ this.check(isDeepStrictEqual(this.received, expected), `Expected ${formatExpectValue(this.received)} to equal ${formatExpectValue(expected)}`, `Expected ${formatExpectValue(this.received)} not to equal ${formatExpectValue(expected)}`);
711
+ }
712
+ toMatchObject(expected) {
713
+ this.check(matchesObjectSubset(this.received, expected), `Expected ${formatExpectValue(this.received)} to match object ${formatExpectValue(expected)}`, `Expected ${formatExpectValue(this.received)} not to match object ${formatExpectValue(expected)}`);
714
+ }
715
+ toContain(expected) {
716
+ this.check(containsValue(this.received, expected), `Expected ${formatExpectValue(this.received)} to contain ${formatExpectValue(expected)}`, `Expected ${formatExpectValue(this.received)} not to contain ${formatExpectValue(expected)}`);
717
+ }
718
+ toHaveLength(expected) {
719
+ this.check(isLengthLike(this.received) && this.received.length === expected, `Expected ${formatExpectValue(this.received)} to have length ${expected}`, `Expected ${formatExpectValue(this.received)} not to have length ${expected}`);
720
+ }
721
+ toHaveProperty(path, ...expected) {
722
+ const result = getPropertyAtPath(this.received, path);
723
+ const expectedValue = expected[0];
724
+ const pass = result.exists && (expected.length === 0 || isDeepStrictEqual(result.value, expectedValue));
725
+ const expectedSuffix = expected.length === 0 ? "" : ` with value ${formatExpectValue(expectedValue)}`;
726
+ this.check(pass, `Expected ${formatExpectValue(this.received)} to have property "${path}"${expectedSuffix}`, `Expected ${formatExpectValue(this.received)} not to have property "${path}"${expectedSuffix}`);
727
+ }
728
+ toBeGreaterThan(expected) {
729
+ this.checkNumberComparison("to be greater than", expected, (received) => {
730
+ return received > expected;
731
+ });
732
+ }
733
+ toBeGreaterThanOrEqual(expected) {
734
+ this.checkNumberComparison("to be greater than or equal to", expected, (received) => {
735
+ return received >= expected;
736
+ });
737
+ }
738
+ toBeLessThan(expected) {
739
+ this.checkNumberComparison("to be less than", expected, (received) => {
740
+ return received < expected;
741
+ });
742
+ }
743
+ toBeLessThanOrEqual(expected) {
744
+ this.checkNumberComparison("to be less than or equal to", expected, (received) => {
745
+ return received <= expected;
746
+ });
747
+ }
748
+ toBeCloseTo(expected, precision = 2) {
749
+ const tolerance = 10 ** -precision / 2;
750
+ this.check(typeof this.received === "number" && Number.isFinite(this.received) && Math.abs(this.received - expected) < tolerance, `Expected ${formatExpectValue(this.received)} to be close to ${expected} at ${precision} decimals`, `Expected ${formatExpectValue(this.received)} not to be close to ${expected} at ${precision} decimals`);
751
+ }
752
+ toMatch(expected) {
753
+ const statelessExpected = new RegExp(expected.source, expected.flags);
754
+ this.check(typeof this.received === "string" && statelessExpected.test(this.received), `Expected ${formatExpectValue(this.received)} to match ${expected}`, `Expected ${formatExpectValue(this.received)} not to match ${expected}`);
755
+ }
756
+ check(pass, positiveMessage, negativeMessage) {
757
+ if (this.negated ? !pass : pass) return;
758
+ evalAssert(false, this.negated ? negativeMessage : positiveMessage);
759
+ }
760
+ checkNumberComparison(label, expected, predicate) {
761
+ this.check(typeof this.received === "number" && predicate(this.received), `Expected ${formatExpectValue(this.received)} ${label} ${expected}`, `Expected ${formatExpectValue(this.received)} not ${label} ${expected}`);
762
+ }
763
+ };
764
+ /**
765
+ * Create focused expectation helpers for the current eval case.
766
+ *
767
+ * Failed expectations record assertion failures and throw only while an eval
768
+ * case scope is active, matching `evalAssert(...)`.
769
+ */
770
+ function evalExpect(value) {
771
+ return new EvalExpectationImpl(value, false);
772
+ }
773
+ //#endregion
774
+ //#region ../sdk/src/repoFile.ts
775
+ /**
776
+ * Create a file reference that can be emitted via `setEvalOutput(...)` and rendered
777
+ * by a column configured with `format: 'image' | 'audio' | 'video' | 'file'`.
778
+ *
779
+ * @param path Relative or absolute path to the repository file.
780
+ * @param mimeType Optional MIME type hint for UI rendering.
781
+ * @returns A repo-backed file reference suitable for file/media columns.
782
+ */
783
+ function repoFile(path, mimeType) {
784
+ return {
785
+ source: "repo",
786
+ path,
787
+ mimeType
788
+ };
789
+ }
790
+ //#endregion
656
791
  //#region ../sdk/src/cacheSerialization.ts
657
792
  const serializedCacheValueMarker = "__agentEvalsCacheSerialization";
658
793
  const jsonSafeCacheValueVersion = "json-safe-v1";
@@ -1436,7 +1571,6 @@ function createTraceCache(generateSpanId) {
1436
1571
  const namespace = info.namespace ?? `${cacheCtx.evalId}__${info.name}`;
1437
1572
  const keyHash = await hashCacheKey({
1438
1573
  namespace,
1439
- codeFingerprint: cacheCtx.codeFingerprint,
1440
1574
  key: info.key
1441
1575
  }, { serializeFileBytes: info.serializeFileBytes === true });
1442
1576
  const activeSpan = scope.activeSpanStack.at(-1);
@@ -1879,10 +2013,9 @@ async function traceSpanInternal(info, fn) {
1879
2013
  const cacheCtx = scope.cacheContext;
1880
2014
  if (cacheOpts !== void 0 && cacheCtx !== void 0 && scope.replayingDepth === 0) {
1881
2015
  const ctx = cacheCtx;
1882
- const namespace = cacheOpts.namespace ?? `${ctx.evalId}__${info.name}`;
2016
+ const namespace = getRequiredSpanCacheNamespace(cacheOpts);
1883
2017
  const keyHash = await hashCacheKey({
1884
2018
  namespace,
1885
- codeFingerprint: ctx.codeFingerprint,
1886
2019
  key: cacheOpts.key
1887
2020
  }, { serializeFileBytes: cacheOpts.serializeFileBytes === true });
1888
2021
  mergeSpanAttributes(spanRecord, {
@@ -1966,6 +2099,12 @@ async function traceSpanInternal(info, fn) {
1966
2099
  scope.activeSpanStack.pop();
1967
2100
  }
1968
2101
  }
2102
+ function getRequiredSpanCacheNamespace(cacheOpts) {
2103
+ if (!isRecordLike$1(cacheOpts)) throw new Error("Cached spans require a non-empty cache.namespace");
2104
+ const namespace = cacheOpts.namespace;
2105
+ if (typeof namespace !== "string" || namespace.length === 0) throw new Error("Cached spans require a non-empty cache.namespace");
2106
+ return namespace;
2107
+ }
1969
2108
  /**
1970
2109
  * Trace builder used to create hierarchical spans and checkpoints during eval
1971
2110
  * execution.
@@ -2248,8 +2387,8 @@ const cacheModeSchema = z.enum([
2248
2387
  const spanCacheOptionsSchema = z.object({
2249
2388
  /** Arbitrary JSON-safe value used to derive the cache key. */
2250
2389
  key: z.unknown(),
2251
- /** Override the default namespace (`${evalId}__${spanName}`). */
2252
- namespace: z.string().optional(),
2390
+ /** Required cache namespace shared by span cache entries in the same domain. */
2391
+ namespace: z.string().min(1),
2253
2392
  /**
2254
2393
  * Include native `Blob`/`File` bytes in the cache key. By default only stable
2255
2394
  * metadata (`type`, `size`, plus `name`/`lastModified` for `File`) is used.
@@ -4197,7 +4336,7 @@ async function writeDebugKeyFile(debugDir, debugFile) {
4197
4336
  await mkdir(debugDir, { recursive: true });
4198
4337
  const filePath = ownerPath(debugDir, debugFile.owner);
4199
4338
  const tmpPath = `${filePath}.${process.pid.toString()}.tmp`;
4200
- await writeFile(tmpPath, JSON.stringify(debugFile));
4339
+ await writeFile(tmpPath, JSON.stringify(debugFile, null, 2));
4201
4340
  await rename(tmpPath, filePath);
4202
4341
  }
4203
4342
  function pruneEntries(entries, namespace, maxEntries, protectedKey) {
@@ -6185,4 +6324,4 @@ function toLastRunStatus(status) {
6185
6324
  return status === "pending" ? null : status;
6186
6325
  }
6187
6326
  //#endregion
6188
- export { llmCallsConfigSchema as $, traceSpanKindSchema as $t, extractApiCalls as A, getEvalStartTime as An, evalChartTypeSchema as At, runSummarySchema as B, startEvalBackgroundJob as Bn, cacheRecordingOpSchema as Bt, validateCharts as C, advanceEvalTime as Cn, evalChartAggregateSchema as Ct, sseEnvelopeSchema as D, evalLog as Dn, evalChartConfigSchema as Dt, updateManualScoreRequestSchema as E, evalAssert as En, evalChartColorSchema as Et, getEvalDisplayStatus as F, runInEvalRuntimeScope as Fn, cacheEntryWithDebugKeySchema as Ft, apiCallMetricPlacementSchema as G, traceCacheRefSchema as Gt, DEFAULT_LLM_CALLS_CONFIG as H, defineEval as Hn, cacheStatusSchema as Ht, deriveScopedSummaryFromCases as I, runInEvalScope as In, cacheFileSchema as It, defaultConfigKeySchema as J, traceAttributeDisplayPlacementSchema as Jt, apiCallMetricSchema as K, traceAttributeDisplayFormatSchema as Kt, deriveStatusFromCaseRows as L, runInExistingEvalScope as Ln, cacheListItemSchema as Lt, applyDerivedCallAttributes as M, isInEvalScope as Mn, cacheDebugKeyEntrySchema as Mt, getNestedAttribute as N, mergeEvalOutput as Nn, cacheDebugKeyFileSchema as Nt, extractCacheEntries as O, getCurrentScope as On, evalChartMetricSchema as Ot, getEvalTitle as P, nextEvalId as Pn, cacheEntrySchema as Pt, llmCallPricingSchema as Q, traceSpanErrorSchema as Qt, deriveStatusFromChildStatuses as R, setEvalOutput as Rn, cacheModeSchema as Rt, normalizeScoreDef as S, EvalAssertionError as Sn, scoreTraceSchema as St, createRunRequestSchema as T, configureEvalRunLogs as Tn, evalChartBuiltinMetricSchema as Tt, agentEvalsConfigSchema as U, getEvalRegistry as Un, serializedCacheSpanSchema as Ut, DEFAULT_API_CALLS_CONFIG as V, repoFile as Vn, cacheRecordingSchema as Vt, apiCallMetricFormatSchema as W, spanCacheOptionsSchema as Wt, llmCallMetricPlacementSchema as X, traceDisplayConfigSchema as Xt, llmCallMetricFormatSchema as Y, traceAttributeDisplaySchema as Yt, llmCallMetricSchema as Z, traceDisplayInputConfigSchema as Zt, loadEvalModule as _, hashCacheKeySync as _n, evalSummarySchema as _t, getLastRunStatuses as a, columnKindSchema as an, buildCaseKey as at, loadConfig as b, serializeCacheRecording as bn, runLogLocationSchema as bt, loadPersistedRunSnapshots as c, numberDisplayOptionsSchema as cn, getCaseRowEvalKey as ct, persistRunState as d, z$1 as dn, caseRowSchema as dt, traceSpanSchema as en, removeDefaultConfigSchema as et, recomputeEvalStatusesInRuns as f, buildTraceTree as fn, discoveryIssueSchema as ft, deriveEvalFreshness as g, hashCacheKey as gn, evalStatsConfigSchema as gt, resolveArtifactPath as h, evalTracer as hn, evalStatItemSchema as ht, generateRunId as i, columnFormatSchema as in, trialSelectionModeSchema as it, extractLlmCalls as j, incrementEvalOutput as jn, evalChartsConfigSchema as jt, extractCacheHits as k, getEvalCaseInput as kn, evalChartTooltipExtraSchema as kt, nextShortIdFromSnapshots as l, repoFileRefSchema as ln, assertionFailureSchema as lt, runTouchesEval as m, evalSpan as mn, evalStatAggregateSchema as mt, getTargetEvalKeys as n, cellValueSchema as nn, resolveLlmCallsConfig as nt, getLatestRunInfos as o, fileRefSchema as on, buildEvalKey as ot, recomputePersistedCaseStatus as p, captureEvalSpanError as pn, evalFreshnessStatusSchema as pt, apiCallsConfigSchema as q, traceAttributeDisplayInputSchema as qt, getTargetEvals as r, columnDefSchema as rn, runLogsConfigSchema as rt, loadPersistedRunSnapshot as s, jsonCellSchema as sn, getCaseRowCaseKey as st, executeRun as t, traceSpanWarningSchema as tn, resolveApiCallsConfig as tt, persistCaseDetail as u, runArtifactRefSchema as un, caseDetailSchema as ut, parseEvalDiscovery as v, deserializeCacheRecording as vn, runLogEntrySchema as vt, createFsCacheStore as w, appendToEvalOutput as wn, evalChartAxisSchema as wt, buildDeclaredColumnDefs as x, serializeCacheValue as xn, runLogPhaseSchema as xt, resolveEvalDefaultConfig as y, deserializeCacheValue as yn, runLogLevelSchema as yt, runManifestSchema as z, setScopeCacheContext as zn, cacheOperationTypeSchema as zt };
6327
+ export { llmCallsConfigSchema as $, traceSpanKindSchema as $t, extractApiCalls as A, getCurrentScope as An, evalChartTypeSchema as At, runSummarySchema as B, setEvalOutput as Bn, cacheRecordingOpSchema as Bt, validateCharts as C, evalExpect as Cn, evalChartAggregateSchema as Ct, sseEnvelopeSchema as D, configureEvalRunLogs as Dn, evalChartConfigSchema as Dt, updateManualScoreRequestSchema as E, appendToEvalOutput as En, evalChartColorSchema as Et, getEvalDisplayStatus as F, mergeEvalOutput as Fn, cacheEntryWithDebugKeySchema as Ft, apiCallMetricPlacementSchema as G, traceCacheRefSchema as Gt, DEFAULT_LLM_CALLS_CONFIG as H, startEvalBackgroundJob as Hn, cacheStatusSchema as Ht, deriveScopedSummaryFromCases as I, nextEvalId as In, cacheFileSchema as It, defaultConfigKeySchema as J, traceAttributeDisplayPlacementSchema as Jt, apiCallMetricSchema as K, traceAttributeDisplayFormatSchema as Kt, deriveStatusFromCaseRows as L, runInEvalRuntimeScope as Ln, cacheListItemSchema as Lt, applyDerivedCallAttributes as M, getEvalStartTime as Mn, cacheDebugKeyEntrySchema as Mt, getNestedAttribute as N, incrementEvalOutput as Nn, cacheDebugKeyFileSchema as Nt, extractCacheEntries as O, evalAssert as On, evalChartMetricSchema as Ot, getEvalTitle as P, isInEvalScope as Pn, cacheEntrySchema as Pt, llmCallPricingSchema as Q, traceSpanErrorSchema as Qt, deriveStatusFromChildStatuses as R, runInEvalScope as Rn, cacheModeSchema as Rt, normalizeScoreDef as S, repoFile as Sn, scoreTraceSchema as St, createRunRequestSchema as T, advanceEvalTime as Tn, evalChartBuiltinMetricSchema as Tt, agentEvalsConfigSchema as U, defineEval as Un, serializedCacheSpanSchema as Ut, DEFAULT_API_CALLS_CONFIG as V, setScopeCacheContext as Vn, cacheRecordingSchema as Vt, apiCallMetricFormatSchema as W, getEvalRegistry as Wn, spanCacheOptionsSchema as Wt, llmCallMetricPlacementSchema as X, traceDisplayConfigSchema as Xt, llmCallMetricFormatSchema as Y, traceAttributeDisplaySchema as Yt, llmCallMetricSchema as Z, traceDisplayInputConfigSchema as Zt, loadEvalModule as _, hashCacheKeySync as _n, evalSummarySchema as _t, getLastRunStatuses as a, columnKindSchema as an, buildCaseKey as at, loadConfig as b, serializeCacheRecording as bn, runLogLocationSchema as bt, loadPersistedRunSnapshots as c, numberDisplayOptionsSchema as cn, getCaseRowEvalKey as ct, persistRunState as d, z$1 as dn, caseRowSchema as dt, traceSpanSchema as en, removeDefaultConfigSchema as et, recomputeEvalStatusesInRuns as f, buildTraceTree as fn, discoveryIssueSchema as ft, deriveEvalFreshness as g, hashCacheKey as gn, evalStatsConfigSchema as gt, resolveArtifactPath as h, evalTracer as hn, evalStatItemSchema as ht, generateRunId as i, columnFormatSchema as in, trialSelectionModeSchema as it, extractLlmCalls as j, getEvalCaseInput as jn, evalChartsConfigSchema as jt, extractCacheHits as k, evalLog as kn, evalChartTooltipExtraSchema as kt, nextShortIdFromSnapshots as l, repoFileRefSchema as ln, assertionFailureSchema as lt, runTouchesEval as m, evalSpan as mn, evalStatAggregateSchema as mt, getTargetEvalKeys as n, cellValueSchema as nn, resolveLlmCallsConfig as nt, getLatestRunInfos as o, fileRefSchema as on, buildEvalKey as ot, recomputePersistedCaseStatus as p, captureEvalSpanError as pn, evalFreshnessStatusSchema as pt, apiCallsConfigSchema as q, traceAttributeDisplayInputSchema as qt, getTargetEvals as r, columnDefSchema as rn, runLogsConfigSchema as rt, loadPersistedRunSnapshot as s, jsonCellSchema as sn, getCaseRowCaseKey as st, executeRun as t, traceSpanWarningSchema as tn, resolveApiCallsConfig as tt, persistCaseDetail as u, runArtifactRefSchema as un, caseDetailSchema as ut, parseEvalDiscovery as v, deserializeCacheRecording as vn, runLogEntrySchema as vt, createFsCacheStore as w, EvalAssertionError as wn, evalChartAxisSchema as wt, buildDeclaredColumnDefs as x, serializeCacheValue as xn, runLogPhaseSchema as xt, resolveEvalDefaultConfig as y, deserializeCacheValue as yn, runLogLevelSchema as yt, runManifestSchema as z, runInExistingEvalScope as zn, cacheOperationTypeSchema as zt };
@@ -1,2 +1,2 @@
1
- import { n as initRunner, t as getRunnerInstance } from "./runner-LHN7L-xk.mjs";
1
+ import { n as initRunner, t as getRunnerInstance } from "./runner-DRINcaN_.mjs";
2
2
  export { getRunnerInstance, initRunner };
@@ -1,5 +1,5 @@
1
- import { n as createRunner } from "./cli-C4yumCXE.mjs";
2
- import "./src-CLMrRle2.mjs";
1
+ import { n as createRunner } from "./cli-DRkwWgTj.mjs";
2
+ import "./src-BwKm3sKU.mjs";
3
3
  //#region ../../apps/server/src/runner.ts
4
4
  let runnerInstance = null;
5
5
  function getRunnerInstance() {
@@ -0,0 +1,3 @@
1
+ import "./runOrchestration-DB0dwGrd.mjs";
2
+ import "./cli-DRkwWgTj.mjs";
3
+ export {};
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@ls-stack/agent-eval",
3
- "version": "0.26.2",
3
+ "version": "0.27.0",
4
4
  "type": "module",
5
5
  "bin": {
6
6
  "agent-evals": "./dist/bin.mjs"
@@ -39,8 +39,9 @@ file wires up cases and scoring; the real `evalTracer.span(...)` calls sit
39
39
  inside the workflow, agent, or tool functions that both production and evals
40
40
  invoke.
41
41
 
42
- `evalTracer`, `evalSpan`, output helpers, `evalLog`, and `evalAssert` are
43
- ambient no-ops when called outside an eval case scope, so leaving them in
42
+ `evalTracer`, `evalSpan`, output helpers, `evalLog`, `evalAssert`, and
43
+ `evalExpect` are ambient no-ops when called outside an eval case scope, so
44
+ leaving them in
44
45
  production paths is safe — they only record anything when the product code runs
45
46
  inside an eval's `execute`. Use `isInEvalScope()` to branch on eval-only behavior in shared code
46
47
  (e.g. skip a real network side effect): it returns `null` outside eval-owned
@@ -67,6 +68,7 @@ import {
67
68
  appendToEvalOutput,
68
69
  captureEvalSpanError,
69
70
  evalAssert,
71
+ evalExpect,
70
72
  evalSpan,
71
73
  evalTracer,
72
74
  getEvalCaseInput,
@@ -87,7 +89,10 @@ export async function runRefundWorkflow(input: RefundInput) {
87
89
  {
88
90
  kind: 'llm',
89
91
  name: 'plan-refund',
90
- cache: { key: { prompt: input.message, model: 'gpt-4o-mini' } },
92
+ cache: {
93
+ namespace: 'refund-workflow__plan-refund',
94
+ key: { prompt: input.message, model: 'gpt-4o-mini' },
95
+ },
91
96
  },
92
97
  async () => {
93
98
  let text: string;
@@ -119,6 +124,7 @@ export async function runRefundWorkflow(input: RefundInput) {
119
124
  setEvalOutput('reviewId', reviewId);
120
125
  mergeEvalOutput('metadata', { approved: result.approved });
121
126
  evalAssert(result.approved, 'refund workflow should approve the case');
127
+ evalExpect(result.finalText).toMatch(/refund/i);
122
128
  evalSpan.setAttribute('output', { result, reviewId });
123
129
  return result;
124
130
  },
@@ -319,15 +325,18 @@ Their shapes live in the types; no need to memorize the option set.
319
325
 
320
326
  ## Cached operations
321
327
 
322
- Wrap a costly pure span in `cache: { key }` so later runs replay its recorded
323
- effects without re-executing:
328
+ Wrap a costly pure span in `cache: { namespace, key }` so later runs replay its
329
+ recorded effects without re-executing:
324
330
 
325
331
  ```ts
326
332
  await evalTracer.span(
327
333
  {
328
334
  kind: 'llm',
329
335
  name: 'plan-refund',
330
- cache: { key: { prompt: input.message, model: 'gpt-4o-mini' } },
336
+ cache: {
337
+ namespace: 'refund-workflow__plan-refund',
338
+ key: { prompt: input.message, model: 'gpt-4o-mini' },
339
+ },
331
340
  },
332
341
  async () => {
333
342
  const result = await llm.complete(input.message);
@@ -368,12 +377,13 @@ Mental model:
368
377
  namespace, and hit/miss status. When called directly from the case body
369
378
  (no surrounding span), the ref is recorded on the case detail's `cacheRefs`
370
379
  array.
371
- - The cache key folds in a source-file fingerprint, so editing the eval busts
372
- the cache automatically.
373
- - `cache.namespace` on spans or `namespace` on value caches can share entries
374
- across operations/evals, but the source-file fingerprint still participates
375
- in the final key. Shared namespaces are reusable across evals in the same
376
- file; evals in different files miss even with the same namespace and key.
380
+ - Cache identity is the namespace plus the authored key. Source-file
381
+ fingerprints are stored as metadata for inspection, but do not participate in
382
+ cache-key hashing.
383
+ - Cached spans require an explicit `cache.namespace`; value caches default to
384
+ `${evalId}__${name}` and can be overridden with `namespace`. Matching
385
+ namespaces share entries across operations/evals that use the same authored
386
+ key.
377
387
  - Authored eval ids are unique within one eval file. The exact eval identity is
378
388
  the workspace-relative file path plus eval id, so the same id can be reused in
379
389
  different files. Case ids must be unique within one eval; duplicate case ids
@@ -455,16 +465,17 @@ When adding or changing evals:
455
465
  1. Put the tracing + ambient SDK calls in the product code that runs in both
456
466
  production and evals. Keep eval files thin.
457
467
  2. Use realistic cases drawn from real product flows; avoid placeholder inputs.
458
- 3. `evalAssert` for hard invariants, `scores` for graded signals,
459
- `passThreshold` only on scores that should gate pass/fail.
468
+ 3. `evalAssert` for hard invariants and truthy type narrowing, `evalExpect`
469
+ for non-trivial comparisons, `scores` for graded signals, `passThreshold`
470
+ only on scores that should gate pass/fail.
460
471
  4. Surface reviewable values through execute-context `setOutput` or ambient
461
472
  `setEvalOutput` in shared workflow code, and shape them with `columns`
462
473
  formats from the `ColumnFormat` type.
463
474
  5. Promote high-signal span attributes with `traceDisplay` so they surface in
464
475
  the trace tree and detail pane.
465
- 6. Cache costly pure spans with `cache: { key }` and pure spanless values with
466
- `evalTracer.cache(...)`; never cache operations whose external side effects
467
- you depend on.
476
+ 6. Cache costly pure spans with `cache: { namespace, key }` and pure spanless
477
+ values with `evalTracer.cache(...)`; never cache operations whose external
478
+ side effects you depend on.
468
479
  7. Sanity-check after changes: `agent-evals list`, then
469
480
  `agent-evals run --eval <id>`; use `--file <path|glob>` to target one file
470
481
  when multiple files use the same eval id.
@@ -1,3 +0,0 @@
1
- import "./runOrchestration-B31SV_Bq.mjs";
2
- import "./cli-C4yumCXE.mjs";
3
- export {};