npm - @ls-stack/agent-eval - Versions diffs - 0.26.2 → 0.27.0 - Mend

@ls-stack/agent-eval 0.26.2 → 0.27.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (15)

package/dist/{app-BxR4SbjA.mjs → app-ByMLOds2.mjs} +3 -3
package/dist/apps/web/dist/assets/{index-CFF1eYUm.js → index-DR2haqvV.js} +1 -1
package/dist/apps/web/dist/index.html +1 -1
package/dist/bin.mjs +1 -1
package/dist/{cli-C4yumCXE.mjs → cli-DRkwWgTj.mjs} +3 -3
package/dist/index.d.mts +40 -8
package/dist/index.mjs +4 -4
package/dist/runChild.mjs +1 -1
package/dist/{runOrchestration-B31SV_Bq.mjs → runOrchestration-DB0dwGrd.mjs} +167 -28
package/dist/{runner-D5QMY3B_.mjs → runner-39KGoaM1.mjs} +1 -1
package/dist/{runner-LHN7L-xk.mjs → runner-DRINcaN_.mjs} +2 -2
package/dist/src-BwKm3sKU.mjs +3 -0
package/package.json +1 -1
package/skills/agent-eval/SKILL.md +28 -17
package/dist/src-CLMrRle2.mjs +0 -3

package/dist/apps/web/dist/index.html CHANGED Viewed

@@ -25,7 +25,7 @@
       href="https://fonts.googleapis.com/css2?family=Geist:wght@400;500;600&family=JetBrains+Mono:wght@400;500&display=swap"
       rel="stylesheet"
     />
-    <script type="module" crossorigin src="/assets/index-CFF1eYUm.js"></script>
+    <script type="module" crossorigin src="/assets/index-DR2haqvV.js"></script>
     <link rel="stylesheet" crossorigin href="/assets/index-DOXT0Y9V.css">
   </head>
   <body>

package/dist/bin.mjs CHANGED Viewed

@@ -1,5 +1,5 @@
 #!/usr/bin/env node
-import { t as runCli } from "./cli-C4yumCXE.mjs";
+import { t as runCli } from "./cli-DRkwWgTj.mjs";
 import { spawn } from "node:child_process";
 //#region src/bin.ts
 const moduleMocksFlag = "--experimental-test-module-mocks";

package/dist/{cli-C4yumCXE.mjs → cli-DRkwWgTj.mjs} RENAMED Viewed

@@ -1,4 +1,4 @@
-import { B as runSummarySchema, C as validateCharts, F as getEvalDisplayStatus, I as deriveScopedSummaryFromCases, P as getEvalTitle, S as normalizeScoreDef, Un as getEvalRegistry, _ as loadEvalModule, a as getLastRunStatuses, b as loadConfig, c as loadPersistedRunSnapshots, ct as getCaseRowEvalKey, d as persistRunState, f as recomputeEvalStatusesInRuns, g as deriveEvalFreshness, h as resolveArtifactPath, i as generateRunId, l as nextShortIdFromSnapshots, m as runTouchesEval, n as getTargetEvalKeys, nt as resolveLlmCallsConfig, o as getLatestRunInfos, ot as buildEvalKey, p as recomputePersistedCaseStatus, s as loadPersistedRunSnapshot, st as getCaseRowCaseKey, tt as resolveApiCallsConfig, u as persistCaseDetail, v as parseEvalDiscovery, w as createFsCacheStore, x as buildDeclaredColumnDefs, y as resolveEvalDefaultConfig } from "./runOrchestration-B31SV_Bq.mjs";
+import { B as runSummarySchema, C as validateCharts, F as getEvalDisplayStatus, I as deriveScopedSummaryFromCases, P as getEvalTitle, S as normalizeScoreDef, Wn as getEvalRegistry, _ as loadEvalModule, a as getLastRunStatuses, b as loadConfig, c as loadPersistedRunSnapshots, ct as getCaseRowEvalKey, d as persistRunState, f as recomputeEvalStatusesInRuns, g as deriveEvalFreshness, h as resolveArtifactPath, i as generateRunId, l as nextShortIdFromSnapshots, m as runTouchesEval, n as getTargetEvalKeys, nt as resolveLlmCallsConfig, o as getLatestRunInfos, ot as buildEvalKey, p as recomputePersistedCaseStatus, s as loadPersistedRunSnapshot, st as getCaseRowCaseKey, tt as resolveApiCallsConfig, u as persistCaseDetail, v as parseEvalDiscovery, w as createFsCacheStore, x as buildDeclaredColumnDefs, y as resolveEvalDefaultConfig } from "./runOrchestration-DB0dwGrd.mjs";
 import { createHash } from "node:crypto";
 import { mkdir, readFile, rm, writeFile } from "node:fs/promises";
 import { dirname, join, relative, resolve } from "node:path";
@@ -1041,8 +1041,8 @@ async function commandApp(args) {
 	const { serve } = await import("@hono/node-server");
 	const bundledWebDist = resolve(currentDir, "apps/web/dist");
 	if (existsSync(bundledWebDist)) process.env.AGENT_EVALS_WEB_DIST = bundledWebDist;
-	const appModule = await import("./app-BxR4SbjA.mjs");
-	const runnerModule = await import("./runner-D5QMY3B_.mjs");
+	const appModule = await import("./app-ByMLOds2.mjs");
+	const runnerModule = await import("./runner-39KGoaM1.mjs");
 	if (!isHonoAppModule(appModule)) throw new Error("Server app module is invalid");
 	if (!isServerRunnerModule(runnerModule)) throw new Error("Server runner module is invalid");
 	await runnerModule.initRunner();

package/dist/index.d.mts CHANGED Viewed

@@ -2288,7 +2288,7 @@ type CacheMode = z$1.infer<typeof cacheModeSchema>;
 /** Options accepted by an `evalTracer.span` call to opt the span into caching. */
 declare const spanCacheOptionsSchema: z$1.ZodObject<{
   key: z$1.ZodUnknown;
-  namespace: z$1.ZodOptional<z$1.ZodString>;
+  namespace: z$1.ZodString;
   serializeFileBytes: z$1.ZodOptional<z$1.ZodBoolean>;
 }, z$1.core.$strip>;
 /** Options accepted by an `evalTracer.span` call to opt the span into caching. */
@@ -3119,6 +3119,37 @@ declare function getEvalRegistry(): Map<string, EvalRegistryEntry>;
  */
 declare function defineEval<TInput = unknown, TOutputs extends EvalOutputs = EvalOutputs>(definition: EvalDefinition<TInput, TOutputs>): void;
 //#endregion
+//#region ../sdk/src/evalExpect.d.ts
+/**
+ * Focused expectation helpers for eval case invariants.
+ *
+ * These matchers intentionally cover comparisons that produce clearer failure
+ * messages than a plain `evalAssert(...)`. Use `evalAssert(...)` directly for
+ * truthiness checks and custom type narrowing.
+ */
+type EvalExpectation<T> = {
+  /** Invert the next matcher. */readonly not: EvalExpectation<T>; /** Assert strict `Object.is(...)` equality. */
+  toBe(expected: unknown): void; /** Assert Node.js deep strict equality. */
+  toEqual(expected: unknown): void; /** Assert that object properties recursively match the expected subset. */
+  toMatchObject(expected: Record<string, unknown>): void; /** Assert substring, array item, or set item containment. */
+  toContain(expected: unknown): void; /** Assert the value has a numeric `length` equal to `expected`. */
+  toHaveLength(expected: number): void; /** Assert a dot-path property exists, optionally with a deep-equal value. */
+  toHaveProperty(path: string, ...expected: [] | [unknown]): void; /** Assert the received number is greater than `expected`. */
+  toBeGreaterThan(expected: number): void; /** Assert the received number is greater than or equal to `expected`. */
+  toBeGreaterThanOrEqual(expected: number): void; /** Assert the received number is less than `expected`. */
+  toBeLessThan(expected: number): void; /** Assert the received number is less than or equal to `expected`. */
+  toBeLessThanOrEqual(expected: number): void; /** Assert the received number is close to `expected` at `precision` decimals. */
+  toBeCloseTo(expected: number, precision?: number): void; /** Assert the received string matches the regular expression. */
+  toMatch(expected: RegExp): void;
+};
+/**
+ * Create focused expectation helpers for the current eval case.
+ *
+ * Failed expectations record assertion failures and throw only while an eval
+ * case scope is active, matching `evalAssert(...)`.
+ */
+declare function evalExpect<T>(value: T): EvalExpectation<T>;
+//#endregion
 //#region ../sdk/src/repoFile.d.ts
 /**
  * Create a file reference that can be emitted via `setEvalOutput(...)` and rendered
@@ -3168,7 +3199,7 @@ type CacheAdapter = {
 type CacheScopeContext = {
   adapter: CacheAdapter;
   mode: CacheMode;
-  evalId: string; /** Hash of the eval source file; used to invalidate on code changes. */
+  evalId: string; /** Hash of the eval source file, stored as cache metadata for inspection. */
   codeFingerprint: string;
 };
 /** Active recording frame captured while a cached operation body executes. */
@@ -3356,12 +3387,14 @@ declare function mergeEvalOutput(key: string, patch: Record<string, unknown>): v
  */
 declare function incrementEvalOutput(key: string, delta: number): void;
 /**
- * Assert a condition for the current eval case and throw on failure.
+ * Assert a truthy condition for the current eval case and throw on failure.
  *
  * Calls made outside `runInEvalScope(...)` are ignored so shared workflow code
- * can safely reuse `evalAssert(...)` when it also runs outside an eval.
+ * can safely reuse `evalAssert(...)` when it also runs outside an eval. The
+ * TypeScript assertion signature still narrows the checked value after the
+ * call.
  */
-declare function evalAssert(condition: boolean, message: string): void;
+declare function evalAssert(condition: unknown, message: string): asserts condition;
 //#endregion
 //#region ../sdk/src/cacheSerialization.d.ts
 declare const serializedCacheValueMarker = "__agentEvalsCacheSerialization";
@@ -3419,8 +3452,7 @@ type TraceCacheInfo = {
 //#region ../sdk/src/cacheKey.d.ts
 /** Components folded into a deterministic cache key hash. */
 type CacheKeyHashInput = {
-  /** Cache namespace, usually derived from the eval id and operation name. */namespace: string; /** Eval source fingerprint used to invalidate cache entries on code edits. */
-  codeFingerprint: string; /** User-authored cache key value. */
+  /** Cache namespace, usually derived from the eval id and operation name. */namespace: string; /** User-authored cache key value. */
   key: unknown;
 };
 /** Optional controls for cache key hashing. */
@@ -3722,4 +3754,4 @@ declare function createRunner({
  */
 declare function runCli(argv: string[]): Promise<void>;
 //#endregion
-export { type AgentEvalsConfig, type ApiCallEntry, type ApiCallMetric, type ApiCallMetricFormat, type ApiCallMetricPlacement, type ApiCallMetricValue, type ApiCallsConfigInput, type AssertionFailure, type CacheActivityEntry, type CacheAdapter, type CacheDebugKeyEntry, type CacheDebugKeyFile, type CacheDebugKeyWrite, type CacheEntry, type CacheEntryWithDebugKey, type CacheFile, type CacheHitEntry, type CacheKeyHashInput, type CacheKeyHashOptions, type CacheListItem, type CacheMode, type CacheOperationType, type CacheRecording, type CacheRecordingFrame, type CacheRecordingOp, type CacheScopeContext, type CacheStatus, type CallDerivedAttribute, type CallDerivedAttributeContext, type CaptureEvalSpanErrorLevel, type CaptureEvalSpanErrorOptions, type CaseDetail, type CaseRow, type CellValue, type ColumnDef, type ColumnFormat, type ColumnKind, type CreateRunRequest, DEFAULT_API_CALLS_CONFIG, DEFAULT_LLM_CALLS_CONFIG, type DefaultConfigKey, type DerivedStatus, type DiscoveryIssue, EvalAssertionError, type EvalCase, type EvalCaseScope, type EvalChartAggregate, type EvalChartAxis, type EvalChartBuiltinMetric, type EvalChartColor, type EvalChartConfig, type EvalChartMetric, type EvalChartTooltipExtra, type EvalChartType, type EvalChartsConfig, type EvalColumnOverride, type EvalColumns, type EvalDefinition, type EvalDeriveContext, type EvalDisplayStatus, type EvalExecuteContext, type EvalFreshnessStatus, type EvalManualScoreDef, type EvalOutputs, type EvalOutputsSchema, type EvalRunner, type EvalRuntimeScope, type EvalScoreContext, type EvalScoreDef, type EvalScoreFn, type EvalSetOutput, type EvalStartTime, type EvalStatAggregate, type EvalStatItem, type EvalStatsConfig, type EvalSummary, type EvalTimeUnit, type EvalTraceSpan, type EvalTraceSpanError, type EvalTraceSpanWarning, type EvalTraceTree, type FileRef, type JsonCell, type LlmCallEntry, type LlmCallMetric, type LlmCallMetricFormat, type LlmCallMetricPlacement, type LlmCallMetricValue, type LlmCallPricing, type 
LlmCallsConfigInput, type NumberDisplayOptions, type RemoveDefaultConfig, type RepoFileRef, type ResolvedApiCallMetric, type ResolvedApiCallsConfig, type ResolvedCallDerivedAttribute, type ResolvedLlmCallMetric, type ResolvedLlmCallPricing, type ResolvedLlmCallsConfig, type RunArtifactRef, type RunInEvalScopeOptions, type RunLogEntry, type RunLogLevel, type RunLogLocation, type RunLogPhase, type RunLogsConfigInput, type RunManifest, type RunSummary, type ScalarCell, type ScopedCaseSummary, type ScoreTrace, type SerializedCacheSpan, type SerializedCacheValue, type SpanCacheOptions, type SseEnvelope, type SseEventType, type TraceActiveSpan, type TraceAttributeDisplay, type TraceAttributeDisplayFormat, type TraceAttributeDisplayInput, type TraceAttributeDisplayPlacement, type TraceAttributeTransform, type TraceAttributeTransformContext, type TraceCacheInfo, type TraceCacheRef, type TraceDisplayConfig, type TraceDisplayInputConfig, type TraceSpanInfo, type TrialSelectionMode, type UpdateManualScoreRequest, advanceEvalTime, agentEvalsConfigSchema, apiCallMetricFormatSchema, apiCallMetricPlacementSchema, apiCallMetricSchema, apiCallsConfigSchema, appendToEvalOutput, applyDerivedCallAttributes, assertionFailureSchema, buildCaseKey, buildEvalKey, buildTraceTree, cacheDebugKeyEntrySchema, cacheDebugKeyFileSchema, cacheEntrySchema, cacheEntryWithDebugKeySchema, cacheFileSchema, cacheListItemSchema, cacheModeSchema, cacheOperationTypeSchema, cacheRecordingOpSchema, cacheRecordingSchema, cacheStatusSchema, captureEvalSpanError, caseDetailSchema, caseRowSchema, cellValueSchema, columnDefSchema, columnFormatSchema, columnKindSchema, createRunRequestSchema, createRunner, defaultConfigKeySchema, defineEval, deriveScopedSummaryFromCases, deriveStatusFromCaseRows, deriveStatusFromChildStatuses, deserializeCacheRecording, deserializeCacheValue, discoveryIssueSchema, evalAssert, evalChartAggregateSchema, evalChartAxisSchema, evalChartBuiltinMetricSchema, evalChartColorSchema, 
evalChartConfigSchema, evalChartMetricSchema, evalChartTooltipExtraSchema, evalChartTypeSchema, evalChartsConfigSchema, evalFreshnessStatusSchema, evalLog, evalSpan, evalStatAggregateSchema, evalStatItemSchema, evalStatsConfigSchema, evalSummarySchema, evalTracer, extractApiCalls, extractCacheEntries, extractCacheHits, extractLlmCalls, fileRefSchema, getCaseRowCaseKey, getCaseRowEvalKey, getCurrentScope, getEvalCaseInput, getEvalDisplayStatus, getEvalRegistry, getEvalStartTime, getEvalTitle, getNestedAttribute, hashCacheKey, hashCacheKeySync, incrementEvalOutput, isInEvalScope, jsonCellSchema, llmCallMetricFormatSchema, llmCallMetricPlacementSchema, llmCallMetricSchema, llmCallPricingSchema, llmCallsConfigSchema, mergeEvalOutput, nextEvalId, numberDisplayOptionsSchema, removeDefaultConfigSchema, repoFile, repoFileRefSchema, resolveApiCallsConfig, resolveLlmCallsConfig, runArtifactRefSchema, runCli, runInEvalRuntimeScope, runInEvalScope, runInExistingEvalScope, runLogEntrySchema, runLogLevelSchema, runLogLocationSchema, runLogPhaseSchema, runLogsConfigSchema, runManifestSchema, runSummarySchema, scoreTraceSchema, serializeCacheRecording, serializeCacheValue, serializedCacheSpanSchema, setEvalOutput, setScopeCacheContext, spanCacheOptionsSchema, sseEnvelopeSchema, startEvalBackgroundJob, traceAttributeDisplayFormatSchema, traceAttributeDisplayInputSchema, traceAttributeDisplayPlacementSchema, traceAttributeDisplaySchema, traceCacheRefSchema, traceDisplayConfigSchema, traceDisplayInputConfigSchema, traceSpanErrorSchema, traceSpanKindSchema, traceSpanSchema, traceSpanWarningSchema, trialSelectionModeSchema, updateManualScoreRequestSchema, z };
+export { type AgentEvalsConfig, type ApiCallEntry, type ApiCallMetric, type ApiCallMetricFormat, type ApiCallMetricPlacement, type ApiCallMetricValue, type ApiCallsConfigInput, type AssertionFailure, type CacheActivityEntry, type CacheAdapter, type CacheDebugKeyEntry, type CacheDebugKeyFile, type CacheDebugKeyWrite, type CacheEntry, type CacheEntryWithDebugKey, type CacheFile, type CacheHitEntry, type CacheKeyHashInput, type CacheKeyHashOptions, type CacheListItem, type CacheMode, type CacheOperationType, type CacheRecording, type CacheRecordingFrame, type CacheRecordingOp, type CacheScopeContext, type CacheStatus, type CallDerivedAttribute, type CallDerivedAttributeContext, type CaptureEvalSpanErrorLevel, type CaptureEvalSpanErrorOptions, type CaseDetail, type CaseRow, type CellValue, type ColumnDef, type ColumnFormat, type ColumnKind, type CreateRunRequest, DEFAULT_API_CALLS_CONFIG, DEFAULT_LLM_CALLS_CONFIG, type DefaultConfigKey, type DerivedStatus, type DiscoveryIssue, EvalAssertionError, type EvalCase, type EvalCaseScope, type EvalChartAggregate, type EvalChartAxis, type EvalChartBuiltinMetric, type EvalChartColor, type EvalChartConfig, type EvalChartMetric, type EvalChartTooltipExtra, type EvalChartType, type EvalChartsConfig, type EvalColumnOverride, type EvalColumns, type EvalDefinition, type EvalDeriveContext, type EvalDisplayStatus, type EvalExecuteContext, type EvalExpectation, type EvalFreshnessStatus, type EvalManualScoreDef, type EvalOutputs, type EvalOutputsSchema, type EvalRunner, type EvalRuntimeScope, type EvalScoreContext, type EvalScoreDef, type EvalScoreFn, type EvalSetOutput, type EvalStartTime, type EvalStatAggregate, type EvalStatItem, type EvalStatsConfig, type EvalSummary, type EvalTimeUnit, type EvalTraceSpan, type EvalTraceSpanError, type EvalTraceSpanWarning, type EvalTraceTree, type FileRef, type JsonCell, type LlmCallEntry, type LlmCallMetric, type LlmCallMetricFormat, type LlmCallMetricPlacement, type LlmCallMetricValue, type 
LlmCallPricing, type LlmCallsConfigInput, type NumberDisplayOptions, type RemoveDefaultConfig, type RepoFileRef, type ResolvedApiCallMetric, type ResolvedApiCallsConfig, type ResolvedCallDerivedAttribute, type ResolvedLlmCallMetric, type ResolvedLlmCallPricing, type ResolvedLlmCallsConfig, type RunArtifactRef, type RunInEvalScopeOptions, type RunLogEntry, type RunLogLevel, type RunLogLocation, type RunLogPhase, type RunLogsConfigInput, type RunManifest, type RunSummary, type ScalarCell, type ScopedCaseSummary, type ScoreTrace, type SerializedCacheSpan, type SerializedCacheValue, type SpanCacheOptions, type SseEnvelope, type SseEventType, type TraceActiveSpan, type TraceAttributeDisplay, type TraceAttributeDisplayFormat, type TraceAttributeDisplayInput, type TraceAttributeDisplayPlacement, type TraceAttributeTransform, type TraceAttributeTransformContext, type TraceCacheInfo, type TraceCacheRef, type TraceDisplayConfig, type TraceDisplayInputConfig, type TraceSpanInfo, type TrialSelectionMode, type UpdateManualScoreRequest, advanceEvalTime, agentEvalsConfigSchema, apiCallMetricFormatSchema, apiCallMetricPlacementSchema, apiCallMetricSchema, apiCallsConfigSchema, appendToEvalOutput, applyDerivedCallAttributes, assertionFailureSchema, buildCaseKey, buildEvalKey, buildTraceTree, cacheDebugKeyEntrySchema, cacheDebugKeyFileSchema, cacheEntrySchema, cacheEntryWithDebugKeySchema, cacheFileSchema, cacheListItemSchema, cacheModeSchema, cacheOperationTypeSchema, cacheRecordingOpSchema, cacheRecordingSchema, cacheStatusSchema, captureEvalSpanError, caseDetailSchema, caseRowSchema, cellValueSchema, columnDefSchema, columnFormatSchema, columnKindSchema, createRunRequestSchema, createRunner, defaultConfigKeySchema, defineEval, deriveScopedSummaryFromCases, deriveStatusFromCaseRows, deriveStatusFromChildStatuses, deserializeCacheRecording, deserializeCacheValue, discoveryIssueSchema, evalAssert, evalChartAggregateSchema, evalChartAxisSchema, evalChartBuiltinMetricSchema, 
evalChartColorSchema, evalChartConfigSchema, evalChartMetricSchema, evalChartTooltipExtraSchema, evalChartTypeSchema, evalChartsConfigSchema, evalExpect, evalFreshnessStatusSchema, evalLog, evalSpan, evalStatAggregateSchema, evalStatItemSchema, evalStatsConfigSchema, evalSummarySchema, evalTracer, extractApiCalls, extractCacheEntries, extractCacheHits, extractLlmCalls, fileRefSchema, getCaseRowCaseKey, getCaseRowEvalKey, getCurrentScope, getEvalCaseInput, getEvalDisplayStatus, getEvalRegistry, getEvalStartTime, getEvalTitle, getNestedAttribute, hashCacheKey, hashCacheKeySync, incrementEvalOutput, isInEvalScope, jsonCellSchema, llmCallMetricFormatSchema, llmCallMetricPlacementSchema, llmCallMetricSchema, llmCallPricingSchema, llmCallsConfigSchema, mergeEvalOutput, nextEvalId, numberDisplayOptionsSchema, removeDefaultConfigSchema, repoFile, repoFileRefSchema, resolveApiCallsConfig, resolveLlmCallsConfig, runArtifactRefSchema, runCli, runInEvalRuntimeScope, runInEvalScope, runInExistingEvalScope, runLogEntrySchema, runLogLevelSchema, runLogLocationSchema, runLogPhaseSchema, runLogsConfigSchema, runManifestSchema, runSummarySchema, scoreTraceSchema, serializeCacheRecording, serializeCacheValue, serializedCacheSpanSchema, setEvalOutput, setScopeCacheContext, spanCacheOptionsSchema, sseEnvelopeSchema, startEvalBackgroundJob, traceAttributeDisplayFormatSchema, traceAttributeDisplayInputSchema, traceAttributeDisplayPlacementSchema, traceAttributeDisplaySchema, traceCacheRefSchema, traceDisplayConfigSchema, traceDisplayInputConfigSchema, traceSpanErrorSchema, traceSpanKindSchema, traceSpanSchema, traceSpanWarningSchema, trialSelectionModeSchema, updateManualScoreRequestSchema, z };

package/dist/index.mjs CHANGED Viewed

@@ -1,4 +1,4 @@
-import { $ as llmCallsConfigSchema, $t as traceSpanKindSchema, A as extractApiCalls, An as getEvalStartTime, At as evalChartTypeSchema, B as runSummarySchema, Bn as startEvalBackgroundJob, Bt as cacheRecordingOpSchema, Cn as advanceEvalTime, Ct as evalChartAggregateSchema, D as sseEnvelopeSchema, Dn as evalLog, Dt as evalChartConfigSchema, E as updateManualScoreRequestSchema, En as evalAssert, Et as evalChartColorSchema, F as getEvalDisplayStatus, Fn as runInEvalRuntimeScope, Ft as cacheEntryWithDebugKeySchema, G as apiCallMetricPlacementSchema, Gt as traceCacheRefSchema, H as DEFAULT_LLM_CALLS_CONFIG, Hn as defineEval, Ht as cacheStatusSchema, I as deriveScopedSummaryFromCases, In as runInEvalScope, It as cacheFileSchema, J as defaultConfigKeySchema, Jt as traceAttributeDisplayPlacementSchema, K as apiCallMetricSchema, Kt as traceAttributeDisplayFormatSchema, L as deriveStatusFromCaseRows, Ln as runInExistingEvalScope, Lt as cacheListItemSchema, M as applyDerivedCallAttributes, Mn as isInEvalScope, Mt as cacheDebugKeyEntrySchema, N as getNestedAttribute, Nn as mergeEvalOutput, Nt as cacheDebugKeyFileSchema, O as extractCacheEntries, On as getCurrentScope, Ot as evalChartMetricSchema, P as getEvalTitle, Pn as nextEvalId, Pt as cacheEntrySchema, Q as llmCallPricingSchema, Qt as traceSpanErrorSchema, R as deriveStatusFromChildStatuses, Rn as setEvalOutput, Rt as cacheModeSchema, Sn as EvalAssertionError, St as scoreTraceSchema, T as createRunRequestSchema, Tt as evalChartBuiltinMetricSchema, U as agentEvalsConfigSchema, Un as getEvalRegistry, Ut as serializedCacheSpanSchema, V as DEFAULT_API_CALLS_CONFIG, Vn as repoFile, Vt as cacheRecordingSchema, W as apiCallMetricFormatSchema, Wt as spanCacheOptionsSchema, X as llmCallMetricPlacementSchema, Xt as traceDisplayConfigSchema, Y as llmCallMetricFormatSchema, Yt as traceAttributeDisplaySchema, Z as llmCallMetricSchema, Zt as traceDisplayInputConfigSchema, _n as hashCacheKeySync, _t as evalSummarySchema, an as 
columnKindSchema, at as buildCaseKey, bn as serializeCacheRecording, bt as runLogLocationSchema, cn as numberDisplayOptionsSchema, ct as getCaseRowEvalKey, dn as z, dt as caseRowSchema, en as traceSpanSchema, et as removeDefaultConfigSchema, fn as buildTraceTree, ft as discoveryIssueSchema, gn as hashCacheKey, gt as evalStatsConfigSchema, hn as evalTracer, ht as evalStatItemSchema, in as columnFormatSchema, it as trialSelectionModeSchema, j as extractLlmCalls, jn as incrementEvalOutput, jt as evalChartsConfigSchema, k as extractCacheHits, kn as getEvalCaseInput, kt as evalChartTooltipExtraSchema, ln as repoFileRefSchema, lt as assertionFailureSchema, mn as evalSpan, mt as evalStatAggregateSchema, nn as cellValueSchema, nt as resolveLlmCallsConfig, on as fileRefSchema, ot as buildEvalKey, pn as captureEvalSpanError, pt as evalFreshnessStatusSchema, q as apiCallsConfigSchema, qt as traceAttributeDisplayInputSchema, rn as columnDefSchema, rt as runLogsConfigSchema, sn as jsonCellSchema, st as getCaseRowCaseKey, tn as traceSpanWarningSchema, tt as resolveApiCallsConfig, un as runArtifactRefSchema, ut as caseDetailSchema, vn as deserializeCacheRecording, vt as runLogEntrySchema, wn as appendToEvalOutput, wt as evalChartAxisSchema, xn as serializeCacheValue, xt as runLogPhaseSchema, yn as deserializeCacheValue, yt as runLogLevelSchema, z as runManifestSchema, zn as setScopeCacheContext, zt as cacheOperationTypeSchema } from "./runOrchestration-B31SV_Bq.mjs";
-import { n as createRunner, t as runCli } from "./cli-C4yumCXE.mjs";
-import "./src-CLMrRle2.mjs";
-export { DEFAULT_API_CALLS_CONFIG, DEFAULT_LLM_CALLS_CONFIG, EvalAssertionError, advanceEvalTime, agentEvalsConfigSchema, apiCallMetricFormatSchema, apiCallMetricPlacementSchema, apiCallMetricSchema, apiCallsConfigSchema, appendToEvalOutput, applyDerivedCallAttributes, assertionFailureSchema, buildCaseKey, buildEvalKey, buildTraceTree, cacheDebugKeyEntrySchema, cacheDebugKeyFileSchema, cacheEntrySchema, cacheEntryWithDebugKeySchema, cacheFileSchema, cacheListItemSchema, cacheModeSchema, cacheOperationTypeSchema, cacheRecordingOpSchema, cacheRecordingSchema, cacheStatusSchema, captureEvalSpanError, caseDetailSchema, caseRowSchema, cellValueSchema, columnDefSchema, columnFormatSchema, columnKindSchema, createRunRequestSchema, createRunner, defaultConfigKeySchema, defineEval, deriveScopedSummaryFromCases, deriveStatusFromCaseRows, deriveStatusFromChildStatuses, deserializeCacheRecording, deserializeCacheValue, discoveryIssueSchema, evalAssert, evalChartAggregateSchema, evalChartAxisSchema, evalChartBuiltinMetricSchema, evalChartColorSchema, evalChartConfigSchema, evalChartMetricSchema, evalChartTooltipExtraSchema, evalChartTypeSchema, evalChartsConfigSchema, evalFreshnessStatusSchema, evalLog, evalSpan, evalStatAggregateSchema, evalStatItemSchema, evalStatsConfigSchema, evalSummarySchema, evalTracer, extractApiCalls, extractCacheEntries, extractCacheHits, extractLlmCalls, fileRefSchema, getCaseRowCaseKey, getCaseRowEvalKey, getCurrentScope, getEvalCaseInput, getEvalDisplayStatus, getEvalRegistry, getEvalStartTime, getEvalTitle, getNestedAttribute, hashCacheKey, hashCacheKeySync, incrementEvalOutput, isInEvalScope, jsonCellSchema, llmCallMetricFormatSchema, llmCallMetricPlacementSchema, llmCallMetricSchema, llmCallPricingSchema, llmCallsConfigSchema, mergeEvalOutput, nextEvalId, numberDisplayOptionsSchema, removeDefaultConfigSchema, repoFile, repoFileRefSchema, resolveApiCallsConfig, resolveLlmCallsConfig, runArtifactRefSchema, runCli, runInEvalRuntimeScope, 
runInEvalScope, runInExistingEvalScope, runLogEntrySchema, runLogLevelSchema, runLogLocationSchema, runLogPhaseSchema, runLogsConfigSchema, runManifestSchema, runSummarySchema, scoreTraceSchema, serializeCacheRecording, serializeCacheValue, serializedCacheSpanSchema, setEvalOutput, setScopeCacheContext, spanCacheOptionsSchema, sseEnvelopeSchema, startEvalBackgroundJob, traceAttributeDisplayFormatSchema, traceAttributeDisplayInputSchema, traceAttributeDisplayPlacementSchema, traceAttributeDisplaySchema, traceCacheRefSchema, traceDisplayConfigSchema, traceDisplayInputConfigSchema, traceSpanErrorSchema, traceSpanKindSchema, traceSpanSchema, traceSpanWarningSchema, trialSelectionModeSchema, updateManualScoreRequestSchema, z };
+import { $ as llmCallsConfigSchema, $t as traceSpanKindSchema, A as extractApiCalls, An as getCurrentScope, At as evalChartTypeSchema, B as runSummarySchema, Bn as setEvalOutput, Bt as cacheRecordingOpSchema, Cn as evalExpect, Ct as evalChartAggregateSchema, D as sseEnvelopeSchema, Dt as evalChartConfigSchema, E as updateManualScoreRequestSchema, En as appendToEvalOutput, Et as evalChartColorSchema, F as getEvalDisplayStatus, Fn as mergeEvalOutput, Ft as cacheEntryWithDebugKeySchema, G as apiCallMetricPlacementSchema, Gt as traceCacheRefSchema, H as DEFAULT_LLM_CALLS_CONFIG, Hn as startEvalBackgroundJob, Ht as cacheStatusSchema, I as deriveScopedSummaryFromCases, In as nextEvalId, It as cacheFileSchema, J as defaultConfigKeySchema, Jt as traceAttributeDisplayPlacementSchema, K as apiCallMetricSchema, Kt as traceAttributeDisplayFormatSchema, L as deriveStatusFromCaseRows, Ln as runInEvalRuntimeScope, Lt as cacheListItemSchema, M as applyDerivedCallAttributes, Mn as getEvalStartTime, Mt as cacheDebugKeyEntrySchema, N as getNestedAttribute, Nn as incrementEvalOutput, Nt as cacheDebugKeyFileSchema, O as extractCacheEntries, On as evalAssert, Ot as evalChartMetricSchema, P as getEvalTitle, Pn as isInEvalScope, Pt as cacheEntrySchema, Q as llmCallPricingSchema, Qt as traceSpanErrorSchema, R as deriveStatusFromChildStatuses, Rn as runInEvalScope, Rt as cacheModeSchema, Sn as repoFile, St as scoreTraceSchema, T as createRunRequestSchema, Tn as advanceEvalTime, Tt as evalChartBuiltinMetricSchema, U as agentEvalsConfigSchema, Un as defineEval, Ut as serializedCacheSpanSchema, V as DEFAULT_API_CALLS_CONFIG, Vn as setScopeCacheContext, Vt as cacheRecordingSchema, W as apiCallMetricFormatSchema, Wn as getEvalRegistry, Wt as spanCacheOptionsSchema, X as llmCallMetricPlacementSchema, Xt as traceDisplayConfigSchema, Y as llmCallMetricFormatSchema, Yt as traceAttributeDisplaySchema, Z as llmCallMetricSchema, Zt as traceDisplayInputConfigSchema, _n as hashCacheKeySync, _t as 
evalSummarySchema, an as columnKindSchema, at as buildCaseKey, bn as serializeCacheRecording, bt as runLogLocationSchema, cn as numberDisplayOptionsSchema, ct as getCaseRowEvalKey, dn as z, dt as caseRowSchema, en as traceSpanSchema, et as removeDefaultConfigSchema, fn as buildTraceTree, ft as discoveryIssueSchema, gn as hashCacheKey, gt as evalStatsConfigSchema, hn as evalTracer, ht as evalStatItemSchema, in as columnFormatSchema, it as trialSelectionModeSchema, j as extractLlmCalls, jn as getEvalCaseInput, jt as evalChartsConfigSchema, k as extractCacheHits, kn as evalLog, kt as evalChartTooltipExtraSchema, ln as repoFileRefSchema, lt as assertionFailureSchema, mn as evalSpan, mt as evalStatAggregateSchema, nn as cellValueSchema, nt as resolveLlmCallsConfig, on as fileRefSchema, ot as buildEvalKey, pn as captureEvalSpanError, pt as evalFreshnessStatusSchema, q as apiCallsConfigSchema, qt as traceAttributeDisplayInputSchema, rn as columnDefSchema, rt as runLogsConfigSchema, sn as jsonCellSchema, st as getCaseRowCaseKey, tn as traceSpanWarningSchema, tt as resolveApiCallsConfig, un as runArtifactRefSchema, ut as caseDetailSchema, vn as deserializeCacheRecording, vt as runLogEntrySchema, wn as EvalAssertionError, wt as evalChartAxisSchema, xn as serializeCacheValue, xt as runLogPhaseSchema, yn as deserializeCacheValue, yt as runLogLevelSchema, z as runManifestSchema, zn as runInExistingEvalScope, zt as cacheOperationTypeSchema } from "./runOrchestration-DB0dwGrd.mjs";
+import { n as createRunner, t as runCli } from "./cli-DRkwWgTj.mjs";
+import "./src-BwKm3sKU.mjs";
+export { DEFAULT_API_CALLS_CONFIG, DEFAULT_LLM_CALLS_CONFIG, EvalAssertionError, advanceEvalTime, agentEvalsConfigSchema, apiCallMetricFormatSchema, apiCallMetricPlacementSchema, apiCallMetricSchema, apiCallsConfigSchema, appendToEvalOutput, applyDerivedCallAttributes, assertionFailureSchema, buildCaseKey, buildEvalKey, buildTraceTree, cacheDebugKeyEntrySchema, cacheDebugKeyFileSchema, cacheEntrySchema, cacheEntryWithDebugKeySchema, cacheFileSchema, cacheListItemSchema, cacheModeSchema, cacheOperationTypeSchema, cacheRecordingOpSchema, cacheRecordingSchema, cacheStatusSchema, captureEvalSpanError, caseDetailSchema, caseRowSchema, cellValueSchema, columnDefSchema, columnFormatSchema, columnKindSchema, createRunRequestSchema, createRunner, defaultConfigKeySchema, defineEval, deriveScopedSummaryFromCases, deriveStatusFromCaseRows, deriveStatusFromChildStatuses, deserializeCacheRecording, deserializeCacheValue, discoveryIssueSchema, evalAssert, evalChartAggregateSchema, evalChartAxisSchema, evalChartBuiltinMetricSchema, evalChartColorSchema, evalChartConfigSchema, evalChartMetricSchema, evalChartTooltipExtraSchema, evalChartTypeSchema, evalChartsConfigSchema, evalExpect, evalFreshnessStatusSchema, evalLog, evalSpan, evalStatAggregateSchema, evalStatItemSchema, evalStatsConfigSchema, evalSummarySchema, evalTracer, extractApiCalls, extractCacheEntries, extractCacheHits, extractLlmCalls, fileRefSchema, getCaseRowCaseKey, getCaseRowEvalKey, getCurrentScope, getEvalCaseInput, getEvalDisplayStatus, getEvalRegistry, getEvalStartTime, getEvalTitle, getNestedAttribute, hashCacheKey, hashCacheKeySync, incrementEvalOutput, isInEvalScope, jsonCellSchema, llmCallMetricFormatSchema, llmCallMetricPlacementSchema, llmCallMetricSchema, llmCallPricingSchema, llmCallsConfigSchema, mergeEvalOutput, nextEvalId, numberDisplayOptionsSchema, removeDefaultConfigSchema, repoFile, repoFileRefSchema, resolveApiCallsConfig, resolveLlmCallsConfig, runArtifactRefSchema, runCli, 
runInEvalRuntimeScope, runInEvalScope, runInExistingEvalScope, runLogEntrySchema, runLogLevelSchema, runLogLocationSchema, runLogPhaseSchema, runLogsConfigSchema, runManifestSchema, runSummarySchema, scoreTraceSchema, serializeCacheRecording, serializeCacheValue, serializedCacheSpanSchema, setEvalOutput, setScopeCacheContext, spanCacheOptionsSchema, sseEnvelopeSchema, startEvalBackgroundJob, traceAttributeDisplayFormatSchema, traceAttributeDisplayInputSchema, traceAttributeDisplayPlacementSchema, traceAttributeDisplaySchema, traceCacheRefSchema, traceDisplayConfigSchema, traceDisplayInputConfigSchema, traceSpanErrorSchema, traceSpanKindSchema, traceSpanSchema, traceSpanWarningSchema, trialSelectionModeSchema, updateManualScoreRequestSchema, z };

package/dist/runChild.mjs CHANGED Viewed

@@ -1,4 +1,4 @@
-import { B as runSummarySchema, T as createRunRequestSchema, Tn as configureEvalRunLogs, b as loadConfig, gt as evalStatsConfigSchema, jt as evalChartsConfigSchema, ot as buildEvalKey, r as getTargetEvals$1, rn as columnDefSchema, t as executeRun, v as parseEvalDiscovery, w as createFsCacheStore, z as runManifestSchema } from "./runOrchestration-B31SV_Bq.mjs";
+import { B as runSummarySchema, Dn as configureEvalRunLogs, T as createRunRequestSchema, b as loadConfig, gt as evalStatsConfigSchema, jt as evalChartsConfigSchema, ot as buildEvalKey, r as getTargetEvals$1, rn as columnDefSchema, t as executeRun, v as parseEvalDiscovery, w as createFsCacheStore, z as runManifestSchema } from "./runOrchestration-DB0dwGrd.mjs";
 import { createHash } from "node:crypto";
 import { readFile } from "node:fs/promises";
 import { relative } from "node:path";

package/dist/{runOrchestration-B31SV_Bq.mjs → runOrchestration-DB0dwGrd.mjs} RENAMED Viewed

@@ -2,9 +2,9 @@ import { createRequire, registerHooks } from "node:module";
 import { createHash } from "node:crypto";
 import { mkdir, readFile, readdir, rename, rm, stat, writeFile } from "node:fs/promises";
 import { extname, isAbsolute, join, relative, resolve } from "node:path";
-import { z, z as z$1 } from "zod/v4";
+import { formatWithOptions, isDeepStrictEqual } from "node:util";
 import { AsyncLocalStorage } from "node:async_hooks";
-import { formatWithOptions } from "node:util";
+import { z, z as z$1 } from "zod/v4";
 import { Buffer as Buffer$1 } from "node:buffer";
 import { gunzipSync, gzipSync } from "node:zlib";
 import { getCompositeKey } from "@ls-stack/utils/getCompositeKey";
@@ -29,23 +29,6 @@ function defineEval(definition) {
 	});
 }
 //#endregion
-//#region ../sdk/src/repoFile.ts
-/**
-* Create a file reference that can be emitted via `setEvalOutput(...)` and rendered
-* by a column configured with `format: 'image' | 'audio' | 'video' | 'file'`.
-*
-* @param path Relative or absolute path to the repository file.
-* @param mimeType Optional MIME type hint for UI rendering.
-* @returns A repo-backed file reference suitable for file/media columns.
-*/
-function repoFile(path, mimeType) {
-	return {
-		source: "repo",
-		path,
-		mimeType
-	};
-}
-//#endregion
 //#region ../sdk/src/runtime.ts
 const scopeStorage = new AsyncLocalStorage();
 const runtimeScopeStorage = new AsyncLocalStorage();
@@ -639,10 +622,12 @@ function incrementEvalOutput(key, delta) {
 	});
 }
 /**
-* Assert a condition for the current eval case and throw on failure.
+* Assert a truthy condition for the current eval case and throw on failure.
 *
 * Calls made outside `runInEvalScope(...)` are ignored so shared workflow code
-* can safely reuse `evalAssert(...)` when it also runs outside an eval.
+* can safely reuse `evalAssert(...)` when it also runs outside an eval. The
+* TypeScript assertion signature still narrows the checked value after the
+* call.
 */
 function evalAssert(condition, message) {
 	if (condition) return;
@@ -653,6 +638,156 @@ function evalAssert(condition, message) {
 	throw error;
 }
 //#endregion
+//#region ../sdk/src/evalExpect.ts
+const expectFormatOptions = {
+	depth: 5,
+	maxArrayLength: 50,
+	maxStringLength: 1e3,
+	breakLength: 80,
+	compact: 3
+};
+function formatExpectValue(value) {
+	return formatWithOptions(expectFormatOptions, value);
+}
+function isRecordLike$4(value) {
+	return typeof value === "object" && value !== null;
+}
+function isLengthLike(value) {
+	if (typeof value === "string") return true;
+	return isRecordLike$4(value) && "length" in value && typeof value.length === "number";
+}
+function matchesObjectSubset(received, expected) {
+	if (!isRecordLike$4(received)) return false;
+	for (const [key, expectedValue] of Object.entries(expected)) {
+		if (!(key in received)) return false;
+		const receivedValue = received[key];
+		if (isRecordLike$4(expectedValue) && !Array.isArray(expectedValue)) {
+			if (!matchesObjectSubset(receivedValue, expectedValue)) return false;
+			continue;
+		}
+		if (!isDeepStrictEqual(receivedValue, expectedValue)) return false;
+	}
+	return true;
+}
+function containsValue(received, expected) {
+	if (typeof received === "string") return typeof expected === "string" && received.includes(expected);
+	if (Array.isArray(received)) return received.includes(expected);
+	if (received instanceof Set) return received.has(expected);
+	return false;
+}
+function getPropertyAtPath(received, path) {
+	if (path === "") return {
+		exists: false,
+		value: void 0
+	};
+	let current = received;
+	for (const key of path.split(".")) {
+		if (!isRecordLike$4(current) || !(key in current)) return {
+			exists: false,
+			value: void 0
+		};
+		current = current[key];
+	}
+	return {
+		exists: true,
+		value: current
+	};
+}
+var EvalExpectationImpl = class EvalExpectationImpl {
+	received;
+	negated;
+	constructor(received, negated) {
+		this.received = received;
+		this.negated = negated;
+	}
+	get not() {
+		return new EvalExpectationImpl(this.received, !this.negated);
+	}
+	toBe(expected) {
+		this.check(Object.is(this.received, expected), `Expected ${formatExpectValue(this.received)} to be ${formatExpectValue(expected)}`, `Expected ${formatExpectValue(this.received)} not to be ${formatExpectValue(expected)}`);
+	}
+	toEqual(expected) {
+		this.check(isDeepStrictEqual(this.received, expected), `Expected ${formatExpectValue(this.received)} to equal ${formatExpectValue(expected)}`, `Expected ${formatExpectValue(this.received)} not to equal ${formatExpectValue(expected)}`);
+	}
+	toMatchObject(expected) {
+		this.check(matchesObjectSubset(this.received, expected), `Expected ${formatExpectValue(this.received)} to match object ${formatExpectValue(expected)}`, `Expected ${formatExpectValue(this.received)} not to match object ${formatExpectValue(expected)}`);
+	}
+	toContain(expected) {
+		this.check(containsValue(this.received, expected), `Expected ${formatExpectValue(this.received)} to contain ${formatExpectValue(expected)}`, `Expected ${formatExpectValue(this.received)} not to contain ${formatExpectValue(expected)}`);
+	}
+	toHaveLength(expected) {
+		this.check(isLengthLike(this.received) && this.received.length === expected, `Expected ${formatExpectValue(this.received)} to have length ${expected}`, `Expected ${formatExpectValue(this.received)} not to have length ${expected}`);
+	}
+	toHaveProperty(path, ...expected) {
+		const result = getPropertyAtPath(this.received, path);
+		const expectedValue = expected[0];
+		const pass = result.exists && (expected.length === 0 || isDeepStrictEqual(result.value, expectedValue));
+		const expectedSuffix = expected.length === 0 ? "" : ` with value ${formatExpectValue(expectedValue)}`;
+		this.check(pass, `Expected ${formatExpectValue(this.received)} to have property "${path}"${expectedSuffix}`, `Expected ${formatExpectValue(this.received)} not to have property "${path}"${expectedSuffix}`);
+	}
+	toBeGreaterThan(expected) {
+		this.checkNumberComparison("to be greater than", expected, (received) => {
+			return received > expected;
+		});
+	}
+	toBeGreaterThanOrEqual(expected) {
+		this.checkNumberComparison("to be greater than or equal to", expected, (received) => {
+			return received >= expected;
+		});
+	}
+	toBeLessThan(expected) {
+		this.checkNumberComparison("to be less than", expected, (received) => {
+			return received < expected;
+		});
+	}
+	toBeLessThanOrEqual(expected) {
+		this.checkNumberComparison("to be less than or equal to", expected, (received) => {
+			return received <= expected;
+		});
+	}
+	toBeCloseTo(expected, precision = 2) {
+		const tolerance = 10 ** -precision / 2;
+		this.check(typeof this.received === "number" && Number.isFinite(this.received) && Math.abs(this.received - expected) < tolerance, `Expected ${formatExpectValue(this.received)} to be close to ${expected} at ${precision} decimals`, `Expected ${formatExpectValue(this.received)} not to be close to ${expected} at ${precision} decimals`);
+	}
+	toMatch(expected) {
+		const statelessExpected = new RegExp(expected.source, expected.flags);
+		this.check(typeof this.received === "string" && statelessExpected.test(this.received), `Expected ${formatExpectValue(this.received)} to match ${expected}`, `Expected ${formatExpectValue(this.received)} not to match ${expected}`);
+	}
+	check(pass, positiveMessage, negativeMessage) {
+		if (this.negated ? !pass : pass) return;
+		evalAssert(false, this.negated ? negativeMessage : positiveMessage);
+	}
+	checkNumberComparison(label, expected, predicate) {
+		this.check(typeof this.received === "number" && predicate(this.received), `Expected ${formatExpectValue(this.received)} ${label} ${expected}`, `Expected ${formatExpectValue(this.received)} not ${label} ${expected}`);
+	}
+};
+/**
+* Create focused expectation helpers for the current eval case.
+*
+* Failed expectations record assertion failures and throw only while an eval
+* case scope is active, matching `evalAssert(...)`.
+*/
+function evalExpect(value) {
+	return new EvalExpectationImpl(value, false);
+}
+//#endregion
+//#region ../sdk/src/repoFile.ts
+/**
+* Create a file reference that can be emitted via `setEvalOutput(...)` and rendered
+* by a column configured with `format: 'image' | 'audio' | 'video' | 'file'`.
+*
+* @param path Relative or absolute path to the repository file.
+* @param mimeType Optional MIME type hint for UI rendering.
+* @returns A repo-backed file reference suitable for file/media columns.
+*/
+function repoFile(path, mimeType) {
+	return {
+		source: "repo",
+		path,
+		mimeType
+	};
+}
+//#endregion
 //#region ../sdk/src/cacheSerialization.ts
 const serializedCacheValueMarker = "__agentEvalsCacheSerialization";
 const jsonSafeCacheValueVersion = "json-safe-v1";
@@ -1436,7 +1571,6 @@ function createTraceCache(generateSpanId) {
 		const namespace = info.namespace ?? `${cacheCtx.evalId}__${info.name}`;
 		const keyHash = await hashCacheKey({
 			namespace,
-			codeFingerprint: cacheCtx.codeFingerprint,
 			key: info.key
 		}, { serializeFileBytes: info.serializeFileBytes === true });
 		const activeSpan = scope.activeSpanStack.at(-1);
@@ -1879,10 +2013,9 @@ async function traceSpanInternal(info, fn) {
 		const cacheCtx = scope.cacheContext;
 		if (cacheOpts !== void 0 && cacheCtx !== void 0 && scope.replayingDepth === 0) {
 			const ctx = cacheCtx;
-			const namespace = cacheOpts.namespace ?? `${ctx.evalId}__${info.name}`;
+			const namespace = getRequiredSpanCacheNamespace(cacheOpts);
 			const keyHash = await hashCacheKey({
 				namespace,
-				codeFingerprint: ctx.codeFingerprint,
 				key: cacheOpts.key
 			}, { serializeFileBytes: cacheOpts.serializeFileBytes === true });
 			mergeSpanAttributes(spanRecord, {
@@ -1966,6 +2099,12 @@ async function traceSpanInternal(info, fn) {
 		scope.activeSpanStack.pop();
 	}
 }
+function getRequiredSpanCacheNamespace(cacheOpts) {
+	if (!isRecordLike$1(cacheOpts)) throw new Error("Cached spans require a non-empty cache.namespace");
+	const namespace = cacheOpts.namespace;
+	if (typeof namespace !== "string" || namespace.length === 0) throw new Error("Cached spans require a non-empty cache.namespace");
+	return namespace;
+}
 /**
 * Trace builder used to create hierarchical spans and checkpoints during eval
 * execution.
@@ -2248,8 +2387,8 @@ const cacheModeSchema = z.enum([
 const spanCacheOptionsSchema = z.object({
 	/** Arbitrary JSON-safe value used to derive the cache key. */
 	key: z.unknown(),
-	/** Override the default namespace (`${evalId}__${spanName}`). */
-	namespace: z.string().optional(),
+	/** Required cache namespace shared by span cache entries in the same domain. */
+	namespace: z.string().min(1),
 	/**
 	* Include native `Blob`/`File` bytes in the cache key. By default only stable
 	* metadata (`type`, `size`, plus `name`/`lastModified` for `File`) is used.
@@ -4197,7 +4336,7 @@ async function writeDebugKeyFile(debugDir, debugFile) {
 	await mkdir(debugDir, { recursive: true });
 	const filePath = ownerPath(debugDir, debugFile.owner);
 	const tmpPath = `${filePath}.${process.pid.toString()}.tmp`;
-	await writeFile(tmpPath, JSON.stringify(debugFile));
+	await writeFile(tmpPath, JSON.stringify(debugFile, null, 2));
 	await rename(tmpPath, filePath);
 }
 function pruneEntries(entries, namespace, maxEntries, protectedKey) {
@@ -6185,4 +6324,4 @@ function toLastRunStatus(status) {
 	return status === "pending" ? null : status;
 }
 //#endregion
-export { llmCallsConfigSchema as $, traceSpanKindSchema as $t, extractApiCalls as A, getEvalStartTime as An, evalChartTypeSchema as At, runSummarySchema as B, startEvalBackgroundJob as Bn, cacheRecordingOpSchema as Bt, validateCharts as C, advanceEvalTime as Cn, evalChartAggregateSchema as Ct, sseEnvelopeSchema as D, evalLog as Dn, evalChartConfigSchema as Dt, updateManualScoreRequestSchema as E, evalAssert as En, evalChartColorSchema as Et, getEvalDisplayStatus as F, runInEvalRuntimeScope as Fn, cacheEntryWithDebugKeySchema as Ft, apiCallMetricPlacementSchema as G, traceCacheRefSchema as Gt, DEFAULT_LLM_CALLS_CONFIG as H, defineEval as Hn, cacheStatusSchema as Ht, deriveScopedSummaryFromCases as I, runInEvalScope as In, cacheFileSchema as It, defaultConfigKeySchema as J, traceAttributeDisplayPlacementSchema as Jt, apiCallMetricSchema as K, traceAttributeDisplayFormatSchema as Kt, deriveStatusFromCaseRows as L, runInExistingEvalScope as Ln, cacheListItemSchema as Lt, applyDerivedCallAttributes as M, isInEvalScope as Mn, cacheDebugKeyEntrySchema as Mt, getNestedAttribute as N, mergeEvalOutput as Nn, cacheDebugKeyFileSchema as Nt, extractCacheEntries as O, getCurrentScope as On, evalChartMetricSchema as Ot, getEvalTitle as P, nextEvalId as Pn, cacheEntrySchema as Pt, llmCallPricingSchema as Q, traceSpanErrorSchema as Qt, deriveStatusFromChildStatuses as R, setEvalOutput as Rn, cacheModeSchema as Rt, normalizeScoreDef as S, EvalAssertionError as Sn, scoreTraceSchema as St, createRunRequestSchema as T, configureEvalRunLogs as Tn, evalChartBuiltinMetricSchema as Tt, agentEvalsConfigSchema as U, getEvalRegistry as Un, serializedCacheSpanSchema as Ut, DEFAULT_API_CALLS_CONFIG as V, repoFile as Vn, cacheRecordingSchema as Vt, apiCallMetricFormatSchema as W, spanCacheOptionsSchema as Wt, llmCallMetricPlacementSchema as X, traceDisplayConfigSchema as Xt, llmCallMetricFormatSchema as Y, traceAttributeDisplaySchema as Yt, llmCallMetricSchema as Z, 
traceDisplayInputConfigSchema as Zt, loadEvalModule as _, hashCacheKeySync as _n, evalSummarySchema as _t, getLastRunStatuses as a, columnKindSchema as an, buildCaseKey as at, loadConfig as b, serializeCacheRecording as bn, runLogLocationSchema as bt, loadPersistedRunSnapshots as c, numberDisplayOptionsSchema as cn, getCaseRowEvalKey as ct, persistRunState as d, z$1 as dn, caseRowSchema as dt, traceSpanSchema as en, removeDefaultConfigSchema as et, recomputeEvalStatusesInRuns as f, buildTraceTree as fn, discoveryIssueSchema as ft, deriveEvalFreshness as g, hashCacheKey as gn, evalStatsConfigSchema as gt, resolveArtifactPath as h, evalTracer as hn, evalStatItemSchema as ht, generateRunId as i, columnFormatSchema as in, trialSelectionModeSchema as it, extractLlmCalls as j, incrementEvalOutput as jn, evalChartsConfigSchema as jt, extractCacheHits as k, getEvalCaseInput as kn, evalChartTooltipExtraSchema as kt, nextShortIdFromSnapshots as l, repoFileRefSchema as ln, assertionFailureSchema as lt, runTouchesEval as m, evalSpan as mn, evalStatAggregateSchema as mt, getTargetEvalKeys as n, cellValueSchema as nn, resolveLlmCallsConfig as nt, getLatestRunInfos as o, fileRefSchema as on, buildEvalKey as ot, recomputePersistedCaseStatus as p, captureEvalSpanError as pn, evalFreshnessStatusSchema as pt, apiCallsConfigSchema as q, traceAttributeDisplayInputSchema as qt, getTargetEvals as r, columnDefSchema as rn, runLogsConfigSchema as rt, loadPersistedRunSnapshot as s, jsonCellSchema as sn, getCaseRowCaseKey as st, executeRun as t, traceSpanWarningSchema as tn, resolveApiCallsConfig as tt, persistCaseDetail as u, runArtifactRefSchema as un, caseDetailSchema as ut, parseEvalDiscovery as v, deserializeCacheRecording as vn, runLogEntrySchema as vt, createFsCacheStore as w, appendToEvalOutput as wn, evalChartAxisSchema as wt, buildDeclaredColumnDefs as x, serializeCacheValue as xn, runLogPhaseSchema as xt, resolveEvalDefaultConfig as y, deserializeCacheValue as yn, 
runLogLevelSchema as yt, runManifestSchema as z, setScopeCacheContext as zn, cacheOperationTypeSchema as zt };
+export { llmCallsConfigSchema as $, traceSpanKindSchema as $t, extractApiCalls as A, getCurrentScope as An, evalChartTypeSchema as At, runSummarySchema as B, setEvalOutput as Bn, cacheRecordingOpSchema as Bt, validateCharts as C, evalExpect as Cn, evalChartAggregateSchema as Ct, sseEnvelopeSchema as D, configureEvalRunLogs as Dn, evalChartConfigSchema as Dt, updateManualScoreRequestSchema as E, appendToEvalOutput as En, evalChartColorSchema as Et, getEvalDisplayStatus as F, mergeEvalOutput as Fn, cacheEntryWithDebugKeySchema as Ft, apiCallMetricPlacementSchema as G, traceCacheRefSchema as Gt, DEFAULT_LLM_CALLS_CONFIG as H, startEvalBackgroundJob as Hn, cacheStatusSchema as Ht, deriveScopedSummaryFromCases as I, nextEvalId as In, cacheFileSchema as It, defaultConfigKeySchema as J, traceAttributeDisplayPlacementSchema as Jt, apiCallMetricSchema as K, traceAttributeDisplayFormatSchema as Kt, deriveStatusFromCaseRows as L, runInEvalRuntimeScope as Ln, cacheListItemSchema as Lt, applyDerivedCallAttributes as M, getEvalStartTime as Mn, cacheDebugKeyEntrySchema as Mt, getNestedAttribute as N, incrementEvalOutput as Nn, cacheDebugKeyFileSchema as Nt, extractCacheEntries as O, evalAssert as On, evalChartMetricSchema as Ot, getEvalTitle as P, isInEvalScope as Pn, cacheEntrySchema as Pt, llmCallPricingSchema as Q, traceSpanErrorSchema as Qt, deriveStatusFromChildStatuses as R, runInEvalScope as Rn, cacheModeSchema as Rt, normalizeScoreDef as S, repoFile as Sn, scoreTraceSchema as St, createRunRequestSchema as T, advanceEvalTime as Tn, evalChartBuiltinMetricSchema as Tt, agentEvalsConfigSchema as U, defineEval as Un, serializedCacheSpanSchema as Ut, DEFAULT_API_CALLS_CONFIG as V, setScopeCacheContext as Vn, cacheRecordingSchema as Vt, apiCallMetricFormatSchema as W, getEvalRegistry as Wn, spanCacheOptionsSchema as Wt, llmCallMetricPlacementSchema as X, traceDisplayConfigSchema as Xt, llmCallMetricFormatSchema as Y, traceAttributeDisplaySchema as Yt, llmCallMetricSchema as Z, 
traceDisplayInputConfigSchema as Zt, loadEvalModule as _, hashCacheKeySync as _n, evalSummarySchema as _t, getLastRunStatuses as a, columnKindSchema as an, buildCaseKey as at, loadConfig as b, serializeCacheRecording as bn, runLogLocationSchema as bt, loadPersistedRunSnapshots as c, numberDisplayOptionsSchema as cn, getCaseRowEvalKey as ct, persistRunState as d, z$1 as dn, caseRowSchema as dt, traceSpanSchema as en, removeDefaultConfigSchema as et, recomputeEvalStatusesInRuns as f, buildTraceTree as fn, discoveryIssueSchema as ft, deriveEvalFreshness as g, hashCacheKey as gn, evalStatsConfigSchema as gt, resolveArtifactPath as h, evalTracer as hn, evalStatItemSchema as ht, generateRunId as i, columnFormatSchema as in, trialSelectionModeSchema as it, extractLlmCalls as j, getEvalCaseInput as jn, evalChartsConfigSchema as jt, extractCacheHits as k, evalLog as kn, evalChartTooltipExtraSchema as kt, nextShortIdFromSnapshots as l, repoFileRefSchema as ln, assertionFailureSchema as lt, runTouchesEval as m, evalSpan as mn, evalStatAggregateSchema as mt, getTargetEvalKeys as n, cellValueSchema as nn, resolveLlmCallsConfig as nt, getLatestRunInfos as o, fileRefSchema as on, buildEvalKey as ot, recomputePersistedCaseStatus as p, captureEvalSpanError as pn, evalFreshnessStatusSchema as pt, apiCallsConfigSchema as q, traceAttributeDisplayInputSchema as qt, getTargetEvals as r, columnDefSchema as rn, runLogsConfigSchema as rt, loadPersistedRunSnapshot as s, jsonCellSchema as sn, getCaseRowCaseKey as st, executeRun as t, traceSpanWarningSchema as tn, resolveApiCallsConfig as tt, persistCaseDetail as u, runArtifactRefSchema as un, caseDetailSchema as ut, parseEvalDiscovery as v, deserializeCacheRecording as vn, runLogEntrySchema as vt, createFsCacheStore as w, EvalAssertionError as wn, evalChartAxisSchema as wt, buildDeclaredColumnDefs as x, serializeCacheValue as xn, runLogPhaseSchema as xt, resolveEvalDefaultConfig as y, deserializeCacheValue as yn, runLogLevelSchema as yt, 
runManifestSchema as z, runInExistingEvalScope as zn, cacheOperationTypeSchema as zt };

package/dist/{runner-D5QMY3B_.mjs → runner-39KGoaM1.mjs} RENAMED Viewed

@@ -1,2 +1,2 @@
-import { n as initRunner, t as getRunnerInstance } from "./runner-LHN7L-xk.mjs";
+import { n as initRunner, t as getRunnerInstance } from "./runner-DRINcaN_.mjs";
 export { getRunnerInstance, initRunner };

package/dist/{runner-LHN7L-xk.mjs → runner-DRINcaN_.mjs} RENAMED Viewed

@@ -1,5 +1,5 @@
-import { n as createRunner } from "./cli-C4yumCXE.mjs";
-import "./src-CLMrRle2.mjs";
+import { n as createRunner } from "./cli-DRkwWgTj.mjs";
+import "./src-BwKm3sKU.mjs";
 //#region ../../apps/server/src/runner.ts
 let runnerInstance = null;
 function getRunnerInstance() {

package/dist/src-BwKm3sKU.mjs ADDED Viewed

@@ -0,0 +1,3 @@
+import "./runOrchestration-DB0dwGrd.mjs";
+import "./cli-DRkwWgTj.mjs";
+export {};

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@ls-stack/agent-eval",
-  "version": "0.26.2",
+  "version": "0.27.0",
   "type": "module",
   "bin": {
     "agent-evals": "./dist/bin.mjs"

package/skills/agent-eval/SKILL.md CHANGED Viewed

@@ -39,8 +39,9 @@ file wires up cases and scoring; the real `evalTracer.span(...)` calls sit
 inside the workflow, agent, or tool functions that both production and evals
 invoke.
-`evalTracer`, `evalSpan`, output helpers, `evalLog`, and `evalAssert` are
-ambient no-ops when called outside an eval case scope, so leaving them in
+`evalTracer`, `evalSpan`, output helpers, `evalLog`, `evalAssert`, and
+`evalExpect` are ambient no-ops when called outside an eval case scope, so
+leaving them in
 production paths is safe — they only record anything when the product code runs
 inside an eval's `execute`. Use `isInEvalScope()` to branch on eval-only behavior in shared code
 (e.g. skip a real network side effect): it returns `null` outside eval-owned
@@ -67,6 +68,7 @@ import {
   appendToEvalOutput,
   captureEvalSpanError,
   evalAssert,
+  evalExpect,
   evalSpan,
   evalTracer,
   getEvalCaseInput,
@@ -87,7 +89,10 @@ export async function runRefundWorkflow(input: RefundInput) {
         {
           kind: 'llm',
           name: 'plan-refund',
-          cache: { key: { prompt: input.message, model: 'gpt-4o-mini' } },
+          cache: {
+            namespace: 'refund-workflow__plan-refund',
+            key: { prompt: input.message, model: 'gpt-4o-mini' },
+          },
         },
         async () => {
           let text: string;
@@ -119,6 +124,7 @@ export async function runRefundWorkflow(input: RefundInput) {
       setEvalOutput('reviewId', reviewId);
       mergeEvalOutput('metadata', { approved: result.approved });
       evalAssert(result.approved, 'refund workflow should approve the case');
+      evalExpect(result.finalText).toMatch(/refund/i);
       evalSpan.setAttribute('output', { result, reviewId });
       return result;
     },
@@ -319,15 +325,18 @@ Their shapes live in the types; no need to memorize the option set.
 ## Cached operations
-Wrap a costly pure span in `cache: { key }` so later runs replay its recorded
-effects without re-executing:
+Wrap a costly pure span in `cache: { namespace, key }` so later runs replay its
+recorded effects without re-executing:
 ```ts
 await evalTracer.span(
   {
     kind: 'llm',
     name: 'plan-refund',
-    cache: { key: { prompt: input.message, model: 'gpt-4o-mini' } },
+    cache: {
+      namespace: 'refund-workflow__plan-refund',
+      key: { prompt: input.message, model: 'gpt-4o-mini' },
+    },
   },
   async () => {
     const result = await llm.complete(input.message);
@@ -368,12 +377,13 @@ Mental model:
   namespace, and hit/miss status. When called directly from the case body
   (no surrounding span), the ref is recorded on the case detail's `cacheRefs`
   array.
-- The cache key folds in a source-file fingerprint, so editing the eval busts
-  the cache automatically.
-- `cache.namespace` on spans or `namespace` on value caches can share entries
-  across operations/evals, but the source-file fingerprint still participates
-  in the final key. Shared namespaces are reusable across evals in the same
-  file; evals in different files miss even with the same namespace and key.
+- Cache identity is the namespace plus the authored key. Source-file
+  fingerprints are stored as metadata for inspection, but do not participate in
+  cache-key hashing.
+- Cached spans require an explicit `cache.namespace`; value caches default to
+  `${evalId}__${name}` and can be overridden with `namespace`. Matching
+  namespaces share entries across operations/evals that use the same authored
+  key.
 - Authored eval ids are unique within one eval file. The exact eval identity is
   the workspace-relative file path plus eval id, so the same id can be reused in
   different files. Case ids must be unique within one eval; duplicate case ids
@@ -455,16 +465,17 @@ When adding or changing evals:
 1. Put the tracing + ambient SDK calls in the product code that runs in both
    production and evals. Keep eval files thin.
 2. Use realistic cases drawn from real product flows; avoid placeholder inputs.
-3. `evalAssert` for hard invariants, `scores` for graded signals,
-   `passThreshold` only on scores that should gate pass/fail.
+3. `evalAssert` for hard invariants and truthy type narrowing, `evalExpect`
+   for non-trivial comparisons, `scores` for graded signals, `passThreshold`
+   only on scores that should gate pass/fail.
 4. Surface reviewable values through execute-context `setOutput` or ambient
    `setEvalOutput` in shared workflow code, and shape them with `columns`
    formats from the `ColumnFormat` type.
 5. Promote high-signal span attributes with `traceDisplay` so they surface in
    the trace tree and detail pane.
-6. Cache costly pure spans with `cache: { key }` and pure spanless values with
-   `evalTracer.cache(...)`; never cache operations whose external side effects
-   you depend on.
+6. Cache costly pure spans with `cache: { namespace, key }` and pure spanless
+   values with `evalTracer.cache(...)`; never cache operations whose external
+   side effects you depend on.
 7. Sanity-check after changes: `agent-evals list`, then
    `agent-evals run --eval <id>`; use `--file <path|glob>` to target one file
    when multiple files use the same eval id.

package/dist/src-CLMrRle2.mjs DELETED Viewed

@@ -1,3 +0,0 @@
-import "./runOrchestration-B31SV_Bq.mjs";
-import "./cli-C4yumCXE.mjs";
-export {};