@ls-stack/agent-eval 0.26.3 → 0.27.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
- import { E as updateManualScoreRequestSchema, T as createRunRequestSchema } from "./runOrchestration-CW_Fzr2c.mjs";
2
- import "./src-CbVnqjW3.mjs";
3
- import { t as getRunnerInstance } from "./runner-BsRloAm3.mjs";
1
+ import { E as updateManualScoreRequestSchema, T as createRunRequestSchema } from "./runOrchestration-DB0dwGrd.mjs";
2
+ import "./src-BwKm3sKU.mjs";
3
+ import { t as getRunnerInstance } from "./runner-DRINcaN_.mjs";
4
4
  import { readFile } from "node:fs/promises";
5
5
  import { dirname, isAbsolute, join, relative, resolve, sep } from "node:path";
6
6
  import { z } from "zod/v4";
package/dist/bin.mjs CHANGED
@@ -1,5 +1,5 @@
1
1
  #!/usr/bin/env node
2
- import { t as runCli } from "./cli-T6FYE2Bk.mjs";
2
+ import { t as runCli } from "./cli-DRkwWgTj.mjs";
3
3
  import { spawn } from "node:child_process";
4
4
  //#region src/bin.ts
5
5
  const moduleMocksFlag = "--experimental-test-module-mocks";
@@ -1,4 +1,4 @@
1
- import { B as runSummarySchema, C as validateCharts, F as getEvalDisplayStatus, I as deriveScopedSummaryFromCases, P as getEvalTitle, S as normalizeScoreDef, Un as getEvalRegistry, _ as loadEvalModule, a as getLastRunStatuses, b as loadConfig, c as loadPersistedRunSnapshots, ct as getCaseRowEvalKey, d as persistRunState, f as recomputeEvalStatusesInRuns, g as deriveEvalFreshness, h as resolveArtifactPath, i as generateRunId, l as nextShortIdFromSnapshots, m as runTouchesEval, n as getTargetEvalKeys, nt as resolveLlmCallsConfig, o as getLatestRunInfos, ot as buildEvalKey, p as recomputePersistedCaseStatus, s as loadPersistedRunSnapshot, st as getCaseRowCaseKey, tt as resolveApiCallsConfig, u as persistCaseDetail, v as parseEvalDiscovery, w as createFsCacheStore, x as buildDeclaredColumnDefs, y as resolveEvalDefaultConfig } from "./runOrchestration-CW_Fzr2c.mjs";
1
+ import { B as runSummarySchema, C as validateCharts, F as getEvalDisplayStatus, I as deriveScopedSummaryFromCases, P as getEvalTitle, S as normalizeScoreDef, Wn as getEvalRegistry, _ as loadEvalModule, a as getLastRunStatuses, b as loadConfig, c as loadPersistedRunSnapshots, ct as getCaseRowEvalKey, d as persistRunState, f as recomputeEvalStatusesInRuns, g as deriveEvalFreshness, h as resolveArtifactPath, i as generateRunId, l as nextShortIdFromSnapshots, m as runTouchesEval, n as getTargetEvalKeys, nt as resolveLlmCallsConfig, o as getLatestRunInfos, ot as buildEvalKey, p as recomputePersistedCaseStatus, s as loadPersistedRunSnapshot, st as getCaseRowCaseKey, tt as resolveApiCallsConfig, u as persistCaseDetail, v as parseEvalDiscovery, w as createFsCacheStore, x as buildDeclaredColumnDefs, y as resolveEvalDefaultConfig } from "./runOrchestration-DB0dwGrd.mjs";
2
2
  import { createHash } from "node:crypto";
3
3
  import { mkdir, readFile, rm, writeFile } from "node:fs/promises";
4
4
  import { dirname, join, relative, resolve } from "node:path";
@@ -1041,8 +1041,8 @@ async function commandApp(args) {
1041
1041
  const { serve } = await import("@hono/node-server");
1042
1042
  const bundledWebDist = resolve(currentDir, "apps/web/dist");
1043
1043
  if (existsSync(bundledWebDist)) process.env.AGENT_EVALS_WEB_DIST = bundledWebDist;
1044
- const appModule = await import("./app-C-yzV9J8.mjs");
1045
- const runnerModule = await import("./runner-DesCrD7Z.mjs");
1044
+ const appModule = await import("./app-ByMLOds2.mjs");
1045
+ const runnerModule = await import("./runner-39KGoaM1.mjs");
1046
1046
  if (!isHonoAppModule(appModule)) throw new Error("Server app module is invalid");
1047
1047
  if (!isServerRunnerModule(runnerModule)) throw new Error("Server runner module is invalid");
1048
1048
  await runnerModule.initRunner();
package/dist/index.d.mts CHANGED
@@ -3119,6 +3119,37 @@ declare function getEvalRegistry(): Map<string, EvalRegistryEntry>;
3119
3119
  */
3120
3120
  declare function defineEval<TInput = unknown, TOutputs extends EvalOutputs = EvalOutputs>(definition: EvalDefinition<TInput, TOutputs>): void;
3121
3121
  //#endregion
3122
+ //#region ../sdk/src/evalExpect.d.ts
3123
+ /**
3124
+ * Focused expectation helpers for eval case invariants.
3125
+ *
3126
+ * These matchers intentionally cover comparisons that produce clearer failure
3127
+ * messages than a plain `evalAssert(...)`. Use `evalAssert(...)` directly for
3128
+ * truthiness checks and custom type narrowing.
3129
+ */
3130
+ type EvalExpectation<T> = {
3131
+ /** Invert the next matcher. */readonly not: EvalExpectation<T>; /** Assert strict `Object.is(...)` equality. */
3132
+ toBe(expected: unknown): void; /** Assert Node.js deep strict equality. */
3133
+ toEqual(expected: unknown): void; /** Assert that object properties recursively match the expected subset. */
3134
+ toMatchObject(expected: Record<string, unknown>): void; /** Assert substring, array item, or set item containment. */
3135
+ toContain(expected: unknown): void; /** Assert the value has a numeric `length` equal to `expected`. */
3136
+ toHaveLength(expected: number): void; /** Assert a dot-path property exists, optionally with a deep-equal value. */
3137
+ toHaveProperty(path: string, ...expected: [] | [unknown]): void; /** Assert the received number is greater than `expected`. */
3138
+ toBeGreaterThan(expected: number): void; /** Assert the received number is greater than or equal to `expected`. */
3139
+ toBeGreaterThanOrEqual(expected: number): void; /** Assert the received number is less than `expected`. */
3140
+ toBeLessThan(expected: number): void; /** Assert the received number is less than or equal to `expected`. */
3141
+ toBeLessThanOrEqual(expected: number): void; /** Assert the received number is close to `expected` at `precision` decimals. */
3142
+ toBeCloseTo(expected: number, precision?: number): void; /** Assert the received string matches the regular expression. */
3143
+ toMatch(expected: RegExp): void;
3144
+ };
3145
+ /**
3146
+ * Create focused expectation helpers for the current eval case.
3147
+ *
3148
+ * Failed expectations record assertion failures and throw only while an eval
3149
+ * case scope is active, matching `evalAssert(...)`.
3150
+ */
3151
+ declare function evalExpect<T>(value: T): EvalExpectation<T>;
3152
+ //#endregion
3122
3153
  //#region ../sdk/src/repoFile.d.ts
3123
3154
  /**
3124
3155
  * Create a file reference that can be emitted via `setEvalOutput(...)` and rendered
@@ -3356,12 +3387,14 @@ declare function mergeEvalOutput(key: string, patch: Record<string, unknown>): v
3356
3387
  */
3357
3388
  declare function incrementEvalOutput(key: string, delta: number): void;
3358
3389
  /**
3359
- * Assert a condition for the current eval case and throw on failure.
3390
+ * Assert a truthy condition for the current eval case and throw on failure.
3360
3391
  *
3361
3392
  * Calls made outside `runInEvalScope(...)` are ignored so shared workflow code
3362
- * can safely reuse `evalAssert(...)` when it also runs outside an eval.
3393
+ * can safely reuse `evalAssert(...)` when it also runs outside an eval. The
3394
+ * TypeScript assertion signature still narrows the checked value after the
3395
+ * call.
3363
3396
  */
3364
- declare function evalAssert(condition: boolean, message: string): void;
3397
+ declare function evalAssert(condition: unknown, message: string): asserts condition;
3365
3398
  //#endregion
3366
3399
  //#region ../sdk/src/cacheSerialization.d.ts
3367
3400
  declare const serializedCacheValueMarker = "__agentEvalsCacheSerialization";
@@ -3721,4 +3754,4 @@ declare function createRunner({
3721
3754
  */
3722
3755
  declare function runCli(argv: string[]): Promise<void>;
3723
3756
  //#endregion
3724
- export { type AgentEvalsConfig, type ApiCallEntry, type ApiCallMetric, type ApiCallMetricFormat, type ApiCallMetricPlacement, type ApiCallMetricValue, type ApiCallsConfigInput, type AssertionFailure, type CacheActivityEntry, type CacheAdapter, type CacheDebugKeyEntry, type CacheDebugKeyFile, type CacheDebugKeyWrite, type CacheEntry, type CacheEntryWithDebugKey, type CacheFile, type CacheHitEntry, type CacheKeyHashInput, type CacheKeyHashOptions, type CacheListItem, type CacheMode, type CacheOperationType, type CacheRecording, type CacheRecordingFrame, type CacheRecordingOp, type CacheScopeContext, type CacheStatus, type CallDerivedAttribute, type CallDerivedAttributeContext, type CaptureEvalSpanErrorLevel, type CaptureEvalSpanErrorOptions, type CaseDetail, type CaseRow, type CellValue, type ColumnDef, type ColumnFormat, type ColumnKind, type CreateRunRequest, DEFAULT_API_CALLS_CONFIG, DEFAULT_LLM_CALLS_CONFIG, type DefaultConfigKey, type DerivedStatus, type DiscoveryIssue, EvalAssertionError, type EvalCase, type EvalCaseScope, type EvalChartAggregate, type EvalChartAxis, type EvalChartBuiltinMetric, type EvalChartColor, type EvalChartConfig, type EvalChartMetric, type EvalChartTooltipExtra, type EvalChartType, type EvalChartsConfig, type EvalColumnOverride, type EvalColumns, type EvalDefinition, type EvalDeriveContext, type EvalDisplayStatus, type EvalExecuteContext, type EvalFreshnessStatus, type EvalManualScoreDef, type EvalOutputs, type EvalOutputsSchema, type EvalRunner, type EvalRuntimeScope, type EvalScoreContext, type EvalScoreDef, type EvalScoreFn, type EvalSetOutput, type EvalStartTime, type EvalStatAggregate, type EvalStatItem, type EvalStatsConfig, type EvalSummary, type EvalTimeUnit, type EvalTraceSpan, type EvalTraceSpanError, type EvalTraceSpanWarning, type EvalTraceTree, type FileRef, type JsonCell, type LlmCallEntry, type LlmCallMetric, type LlmCallMetricFormat, type LlmCallMetricPlacement, type LlmCallMetricValue, type LlmCallPricing, type 
LlmCallsConfigInput, type NumberDisplayOptions, type RemoveDefaultConfig, type RepoFileRef, type ResolvedApiCallMetric, type ResolvedApiCallsConfig, type ResolvedCallDerivedAttribute, type ResolvedLlmCallMetric, type ResolvedLlmCallPricing, type ResolvedLlmCallsConfig, type RunArtifactRef, type RunInEvalScopeOptions, type RunLogEntry, type RunLogLevel, type RunLogLocation, type RunLogPhase, type RunLogsConfigInput, type RunManifest, type RunSummary, type ScalarCell, type ScopedCaseSummary, type ScoreTrace, type SerializedCacheSpan, type SerializedCacheValue, type SpanCacheOptions, type SseEnvelope, type SseEventType, type TraceActiveSpan, type TraceAttributeDisplay, type TraceAttributeDisplayFormat, type TraceAttributeDisplayInput, type TraceAttributeDisplayPlacement, type TraceAttributeTransform, type TraceAttributeTransformContext, type TraceCacheInfo, type TraceCacheRef, type TraceDisplayConfig, type TraceDisplayInputConfig, type TraceSpanInfo, type TrialSelectionMode, type UpdateManualScoreRequest, advanceEvalTime, agentEvalsConfigSchema, apiCallMetricFormatSchema, apiCallMetricPlacementSchema, apiCallMetricSchema, apiCallsConfigSchema, appendToEvalOutput, applyDerivedCallAttributes, assertionFailureSchema, buildCaseKey, buildEvalKey, buildTraceTree, cacheDebugKeyEntrySchema, cacheDebugKeyFileSchema, cacheEntrySchema, cacheEntryWithDebugKeySchema, cacheFileSchema, cacheListItemSchema, cacheModeSchema, cacheOperationTypeSchema, cacheRecordingOpSchema, cacheRecordingSchema, cacheStatusSchema, captureEvalSpanError, caseDetailSchema, caseRowSchema, cellValueSchema, columnDefSchema, columnFormatSchema, columnKindSchema, createRunRequestSchema, createRunner, defaultConfigKeySchema, defineEval, deriveScopedSummaryFromCases, deriveStatusFromCaseRows, deriveStatusFromChildStatuses, deserializeCacheRecording, deserializeCacheValue, discoveryIssueSchema, evalAssert, evalChartAggregateSchema, evalChartAxisSchema, evalChartBuiltinMetricSchema, evalChartColorSchema, 
evalChartConfigSchema, evalChartMetricSchema, evalChartTooltipExtraSchema, evalChartTypeSchema, evalChartsConfigSchema, evalFreshnessStatusSchema, evalLog, evalSpan, evalStatAggregateSchema, evalStatItemSchema, evalStatsConfigSchema, evalSummarySchema, evalTracer, extractApiCalls, extractCacheEntries, extractCacheHits, extractLlmCalls, fileRefSchema, getCaseRowCaseKey, getCaseRowEvalKey, getCurrentScope, getEvalCaseInput, getEvalDisplayStatus, getEvalRegistry, getEvalStartTime, getEvalTitle, getNestedAttribute, hashCacheKey, hashCacheKeySync, incrementEvalOutput, isInEvalScope, jsonCellSchema, llmCallMetricFormatSchema, llmCallMetricPlacementSchema, llmCallMetricSchema, llmCallPricingSchema, llmCallsConfigSchema, mergeEvalOutput, nextEvalId, numberDisplayOptionsSchema, removeDefaultConfigSchema, repoFile, repoFileRefSchema, resolveApiCallsConfig, resolveLlmCallsConfig, runArtifactRefSchema, runCli, runInEvalRuntimeScope, runInEvalScope, runInExistingEvalScope, runLogEntrySchema, runLogLevelSchema, runLogLocationSchema, runLogPhaseSchema, runLogsConfigSchema, runManifestSchema, runSummarySchema, scoreTraceSchema, serializeCacheRecording, serializeCacheValue, serializedCacheSpanSchema, setEvalOutput, setScopeCacheContext, spanCacheOptionsSchema, sseEnvelopeSchema, startEvalBackgroundJob, traceAttributeDisplayFormatSchema, traceAttributeDisplayInputSchema, traceAttributeDisplayPlacementSchema, traceAttributeDisplaySchema, traceCacheRefSchema, traceDisplayConfigSchema, traceDisplayInputConfigSchema, traceSpanErrorSchema, traceSpanKindSchema, traceSpanSchema, traceSpanWarningSchema, trialSelectionModeSchema, updateManualScoreRequestSchema, z };
3757
+ export { type AgentEvalsConfig, type ApiCallEntry, type ApiCallMetric, type ApiCallMetricFormat, type ApiCallMetricPlacement, type ApiCallMetricValue, type ApiCallsConfigInput, type AssertionFailure, type CacheActivityEntry, type CacheAdapter, type CacheDebugKeyEntry, type CacheDebugKeyFile, type CacheDebugKeyWrite, type CacheEntry, type CacheEntryWithDebugKey, type CacheFile, type CacheHitEntry, type CacheKeyHashInput, type CacheKeyHashOptions, type CacheListItem, type CacheMode, type CacheOperationType, type CacheRecording, type CacheRecordingFrame, type CacheRecordingOp, type CacheScopeContext, type CacheStatus, type CallDerivedAttribute, type CallDerivedAttributeContext, type CaptureEvalSpanErrorLevel, type CaptureEvalSpanErrorOptions, type CaseDetail, type CaseRow, type CellValue, type ColumnDef, type ColumnFormat, type ColumnKind, type CreateRunRequest, DEFAULT_API_CALLS_CONFIG, DEFAULT_LLM_CALLS_CONFIG, type DefaultConfigKey, type DerivedStatus, type DiscoveryIssue, EvalAssertionError, type EvalCase, type EvalCaseScope, type EvalChartAggregate, type EvalChartAxis, type EvalChartBuiltinMetric, type EvalChartColor, type EvalChartConfig, type EvalChartMetric, type EvalChartTooltipExtra, type EvalChartType, type EvalChartsConfig, type EvalColumnOverride, type EvalColumns, type EvalDefinition, type EvalDeriveContext, type EvalDisplayStatus, type EvalExecuteContext, type EvalExpectation, type EvalFreshnessStatus, type EvalManualScoreDef, type EvalOutputs, type EvalOutputsSchema, type EvalRunner, type EvalRuntimeScope, type EvalScoreContext, type EvalScoreDef, type EvalScoreFn, type EvalSetOutput, type EvalStartTime, type EvalStatAggregate, type EvalStatItem, type EvalStatsConfig, type EvalSummary, type EvalTimeUnit, type EvalTraceSpan, type EvalTraceSpanError, type EvalTraceSpanWarning, type EvalTraceTree, type FileRef, type JsonCell, type LlmCallEntry, type LlmCallMetric, type LlmCallMetricFormat, type LlmCallMetricPlacement, type LlmCallMetricValue, type 
LlmCallPricing, type LlmCallsConfigInput, type NumberDisplayOptions, type RemoveDefaultConfig, type RepoFileRef, type ResolvedApiCallMetric, type ResolvedApiCallsConfig, type ResolvedCallDerivedAttribute, type ResolvedLlmCallMetric, type ResolvedLlmCallPricing, type ResolvedLlmCallsConfig, type RunArtifactRef, type RunInEvalScopeOptions, type RunLogEntry, type RunLogLevel, type RunLogLocation, type RunLogPhase, type RunLogsConfigInput, type RunManifest, type RunSummary, type ScalarCell, type ScopedCaseSummary, type ScoreTrace, type SerializedCacheSpan, type SerializedCacheValue, type SpanCacheOptions, type SseEnvelope, type SseEventType, type TraceActiveSpan, type TraceAttributeDisplay, type TraceAttributeDisplayFormat, type TraceAttributeDisplayInput, type TraceAttributeDisplayPlacement, type TraceAttributeTransform, type TraceAttributeTransformContext, type TraceCacheInfo, type TraceCacheRef, type TraceDisplayConfig, type TraceDisplayInputConfig, type TraceSpanInfo, type TrialSelectionMode, type UpdateManualScoreRequest, advanceEvalTime, agentEvalsConfigSchema, apiCallMetricFormatSchema, apiCallMetricPlacementSchema, apiCallMetricSchema, apiCallsConfigSchema, appendToEvalOutput, applyDerivedCallAttributes, assertionFailureSchema, buildCaseKey, buildEvalKey, buildTraceTree, cacheDebugKeyEntrySchema, cacheDebugKeyFileSchema, cacheEntrySchema, cacheEntryWithDebugKeySchema, cacheFileSchema, cacheListItemSchema, cacheModeSchema, cacheOperationTypeSchema, cacheRecordingOpSchema, cacheRecordingSchema, cacheStatusSchema, captureEvalSpanError, caseDetailSchema, caseRowSchema, cellValueSchema, columnDefSchema, columnFormatSchema, columnKindSchema, createRunRequestSchema, createRunner, defaultConfigKeySchema, defineEval, deriveScopedSummaryFromCases, deriveStatusFromCaseRows, deriveStatusFromChildStatuses, deserializeCacheRecording, deserializeCacheValue, discoveryIssueSchema, evalAssert, evalChartAggregateSchema, evalChartAxisSchema, evalChartBuiltinMetricSchema, 
evalChartColorSchema, evalChartConfigSchema, evalChartMetricSchema, evalChartTooltipExtraSchema, evalChartTypeSchema, evalChartsConfigSchema, evalExpect, evalFreshnessStatusSchema, evalLog, evalSpan, evalStatAggregateSchema, evalStatItemSchema, evalStatsConfigSchema, evalSummarySchema, evalTracer, extractApiCalls, extractCacheEntries, extractCacheHits, extractLlmCalls, fileRefSchema, getCaseRowCaseKey, getCaseRowEvalKey, getCurrentScope, getEvalCaseInput, getEvalDisplayStatus, getEvalRegistry, getEvalStartTime, getEvalTitle, getNestedAttribute, hashCacheKey, hashCacheKeySync, incrementEvalOutput, isInEvalScope, jsonCellSchema, llmCallMetricFormatSchema, llmCallMetricPlacementSchema, llmCallMetricSchema, llmCallPricingSchema, llmCallsConfigSchema, mergeEvalOutput, nextEvalId, numberDisplayOptionsSchema, removeDefaultConfigSchema, repoFile, repoFileRefSchema, resolveApiCallsConfig, resolveLlmCallsConfig, runArtifactRefSchema, runCli, runInEvalRuntimeScope, runInEvalScope, runInExistingEvalScope, runLogEntrySchema, runLogLevelSchema, runLogLocationSchema, runLogPhaseSchema, runLogsConfigSchema, runManifestSchema, runSummarySchema, scoreTraceSchema, serializeCacheRecording, serializeCacheValue, serializedCacheSpanSchema, setEvalOutput, setScopeCacheContext, spanCacheOptionsSchema, sseEnvelopeSchema, startEvalBackgroundJob, traceAttributeDisplayFormatSchema, traceAttributeDisplayInputSchema, traceAttributeDisplayPlacementSchema, traceAttributeDisplaySchema, traceCacheRefSchema, traceDisplayConfigSchema, traceDisplayInputConfigSchema, traceSpanErrorSchema, traceSpanKindSchema, traceSpanSchema, traceSpanWarningSchema, trialSelectionModeSchema, updateManualScoreRequestSchema, z };
package/dist/index.mjs CHANGED
@@ -1,4 +1,4 @@
1
- import { $ as llmCallsConfigSchema, $t as traceSpanKindSchema, A as extractApiCalls, An as getEvalStartTime, At as evalChartTypeSchema, B as runSummarySchema, Bn as startEvalBackgroundJob, Bt as cacheRecordingOpSchema, Cn as advanceEvalTime, Ct as evalChartAggregateSchema, D as sseEnvelopeSchema, Dn as evalLog, Dt as evalChartConfigSchema, E as updateManualScoreRequestSchema, En as evalAssert, Et as evalChartColorSchema, F as getEvalDisplayStatus, Fn as runInEvalRuntimeScope, Ft as cacheEntryWithDebugKeySchema, G as apiCallMetricPlacementSchema, Gt as traceCacheRefSchema, H as DEFAULT_LLM_CALLS_CONFIG, Hn as defineEval, Ht as cacheStatusSchema, I as deriveScopedSummaryFromCases, In as runInEvalScope, It as cacheFileSchema, J as defaultConfigKeySchema, Jt as traceAttributeDisplayPlacementSchema, K as apiCallMetricSchema, Kt as traceAttributeDisplayFormatSchema, L as deriveStatusFromCaseRows, Ln as runInExistingEvalScope, Lt as cacheListItemSchema, M as applyDerivedCallAttributes, Mn as isInEvalScope, Mt as cacheDebugKeyEntrySchema, N as getNestedAttribute, Nn as mergeEvalOutput, Nt as cacheDebugKeyFileSchema, O as extractCacheEntries, On as getCurrentScope, Ot as evalChartMetricSchema, P as getEvalTitle, Pn as nextEvalId, Pt as cacheEntrySchema, Q as llmCallPricingSchema, Qt as traceSpanErrorSchema, R as deriveStatusFromChildStatuses, Rn as setEvalOutput, Rt as cacheModeSchema, Sn as EvalAssertionError, St as scoreTraceSchema, T as createRunRequestSchema, Tt as evalChartBuiltinMetricSchema, U as agentEvalsConfigSchema, Un as getEvalRegistry, Ut as serializedCacheSpanSchema, V as DEFAULT_API_CALLS_CONFIG, Vn as repoFile, Vt as cacheRecordingSchema, W as apiCallMetricFormatSchema, Wt as spanCacheOptionsSchema, X as llmCallMetricPlacementSchema, Xt as traceDisplayConfigSchema, Y as llmCallMetricFormatSchema, Yt as traceAttributeDisplaySchema, Z as llmCallMetricSchema, Zt as traceDisplayInputConfigSchema, _n as hashCacheKeySync, _t as evalSummarySchema, an as 
columnKindSchema, at as buildCaseKey, bn as serializeCacheRecording, bt as runLogLocationSchema, cn as numberDisplayOptionsSchema, ct as getCaseRowEvalKey, dn as z, dt as caseRowSchema, en as traceSpanSchema, et as removeDefaultConfigSchema, fn as buildTraceTree, ft as discoveryIssueSchema, gn as hashCacheKey, gt as evalStatsConfigSchema, hn as evalTracer, ht as evalStatItemSchema, in as columnFormatSchema, it as trialSelectionModeSchema, j as extractLlmCalls, jn as incrementEvalOutput, jt as evalChartsConfigSchema, k as extractCacheHits, kn as getEvalCaseInput, kt as evalChartTooltipExtraSchema, ln as repoFileRefSchema, lt as assertionFailureSchema, mn as evalSpan, mt as evalStatAggregateSchema, nn as cellValueSchema, nt as resolveLlmCallsConfig, on as fileRefSchema, ot as buildEvalKey, pn as captureEvalSpanError, pt as evalFreshnessStatusSchema, q as apiCallsConfigSchema, qt as traceAttributeDisplayInputSchema, rn as columnDefSchema, rt as runLogsConfigSchema, sn as jsonCellSchema, st as getCaseRowCaseKey, tn as traceSpanWarningSchema, tt as resolveApiCallsConfig, un as runArtifactRefSchema, ut as caseDetailSchema, vn as deserializeCacheRecording, vt as runLogEntrySchema, wn as appendToEvalOutput, wt as evalChartAxisSchema, xn as serializeCacheValue, xt as runLogPhaseSchema, yn as deserializeCacheValue, yt as runLogLevelSchema, z as runManifestSchema, zn as setScopeCacheContext, zt as cacheOperationTypeSchema } from "./runOrchestration-CW_Fzr2c.mjs";
2
- import { n as createRunner, t as runCli } from "./cli-T6FYE2Bk.mjs";
3
- import "./src-CbVnqjW3.mjs";
4
- export { DEFAULT_API_CALLS_CONFIG, DEFAULT_LLM_CALLS_CONFIG, EvalAssertionError, advanceEvalTime, agentEvalsConfigSchema, apiCallMetricFormatSchema, apiCallMetricPlacementSchema, apiCallMetricSchema, apiCallsConfigSchema, appendToEvalOutput, applyDerivedCallAttributes, assertionFailureSchema, buildCaseKey, buildEvalKey, buildTraceTree, cacheDebugKeyEntrySchema, cacheDebugKeyFileSchema, cacheEntrySchema, cacheEntryWithDebugKeySchema, cacheFileSchema, cacheListItemSchema, cacheModeSchema, cacheOperationTypeSchema, cacheRecordingOpSchema, cacheRecordingSchema, cacheStatusSchema, captureEvalSpanError, caseDetailSchema, caseRowSchema, cellValueSchema, columnDefSchema, columnFormatSchema, columnKindSchema, createRunRequestSchema, createRunner, defaultConfigKeySchema, defineEval, deriveScopedSummaryFromCases, deriveStatusFromCaseRows, deriveStatusFromChildStatuses, deserializeCacheRecording, deserializeCacheValue, discoveryIssueSchema, evalAssert, evalChartAggregateSchema, evalChartAxisSchema, evalChartBuiltinMetricSchema, evalChartColorSchema, evalChartConfigSchema, evalChartMetricSchema, evalChartTooltipExtraSchema, evalChartTypeSchema, evalChartsConfigSchema, evalFreshnessStatusSchema, evalLog, evalSpan, evalStatAggregateSchema, evalStatItemSchema, evalStatsConfigSchema, evalSummarySchema, evalTracer, extractApiCalls, extractCacheEntries, extractCacheHits, extractLlmCalls, fileRefSchema, getCaseRowCaseKey, getCaseRowEvalKey, getCurrentScope, getEvalCaseInput, getEvalDisplayStatus, getEvalRegistry, getEvalStartTime, getEvalTitle, getNestedAttribute, hashCacheKey, hashCacheKeySync, incrementEvalOutput, isInEvalScope, jsonCellSchema, llmCallMetricFormatSchema, llmCallMetricPlacementSchema, llmCallMetricSchema, llmCallPricingSchema, llmCallsConfigSchema, mergeEvalOutput, nextEvalId, numberDisplayOptionsSchema, removeDefaultConfigSchema, repoFile, repoFileRefSchema, resolveApiCallsConfig, resolveLlmCallsConfig, runArtifactRefSchema, runCli, runInEvalRuntimeScope, 
runInEvalScope, runInExistingEvalScope, runLogEntrySchema, runLogLevelSchema, runLogLocationSchema, runLogPhaseSchema, runLogsConfigSchema, runManifestSchema, runSummarySchema, scoreTraceSchema, serializeCacheRecording, serializeCacheValue, serializedCacheSpanSchema, setEvalOutput, setScopeCacheContext, spanCacheOptionsSchema, sseEnvelopeSchema, startEvalBackgroundJob, traceAttributeDisplayFormatSchema, traceAttributeDisplayInputSchema, traceAttributeDisplayPlacementSchema, traceAttributeDisplaySchema, traceCacheRefSchema, traceDisplayConfigSchema, traceDisplayInputConfigSchema, traceSpanErrorSchema, traceSpanKindSchema, traceSpanSchema, traceSpanWarningSchema, trialSelectionModeSchema, updateManualScoreRequestSchema, z };
1
+ import { $ as llmCallsConfigSchema, $t as traceSpanKindSchema, A as extractApiCalls, An as getCurrentScope, At as evalChartTypeSchema, B as runSummarySchema, Bn as setEvalOutput, Bt as cacheRecordingOpSchema, Cn as evalExpect, Ct as evalChartAggregateSchema, D as sseEnvelopeSchema, Dt as evalChartConfigSchema, E as updateManualScoreRequestSchema, En as appendToEvalOutput, Et as evalChartColorSchema, F as getEvalDisplayStatus, Fn as mergeEvalOutput, Ft as cacheEntryWithDebugKeySchema, G as apiCallMetricPlacementSchema, Gt as traceCacheRefSchema, H as DEFAULT_LLM_CALLS_CONFIG, Hn as startEvalBackgroundJob, Ht as cacheStatusSchema, I as deriveScopedSummaryFromCases, In as nextEvalId, It as cacheFileSchema, J as defaultConfigKeySchema, Jt as traceAttributeDisplayPlacementSchema, K as apiCallMetricSchema, Kt as traceAttributeDisplayFormatSchema, L as deriveStatusFromCaseRows, Ln as runInEvalRuntimeScope, Lt as cacheListItemSchema, M as applyDerivedCallAttributes, Mn as getEvalStartTime, Mt as cacheDebugKeyEntrySchema, N as getNestedAttribute, Nn as incrementEvalOutput, Nt as cacheDebugKeyFileSchema, O as extractCacheEntries, On as evalAssert, Ot as evalChartMetricSchema, P as getEvalTitle, Pn as isInEvalScope, Pt as cacheEntrySchema, Q as llmCallPricingSchema, Qt as traceSpanErrorSchema, R as deriveStatusFromChildStatuses, Rn as runInEvalScope, Rt as cacheModeSchema, Sn as repoFile, St as scoreTraceSchema, T as createRunRequestSchema, Tn as advanceEvalTime, Tt as evalChartBuiltinMetricSchema, U as agentEvalsConfigSchema, Un as defineEval, Ut as serializedCacheSpanSchema, V as DEFAULT_API_CALLS_CONFIG, Vn as setScopeCacheContext, Vt as cacheRecordingSchema, W as apiCallMetricFormatSchema, Wn as getEvalRegistry, Wt as spanCacheOptionsSchema, X as llmCallMetricPlacementSchema, Xt as traceDisplayConfigSchema, Y as llmCallMetricFormatSchema, Yt as traceAttributeDisplaySchema, Z as llmCallMetricSchema, Zt as traceDisplayInputConfigSchema, _n as hashCacheKeySync, _t as 
evalSummarySchema, an as columnKindSchema, at as buildCaseKey, bn as serializeCacheRecording, bt as runLogLocationSchema, cn as numberDisplayOptionsSchema, ct as getCaseRowEvalKey, dn as z, dt as caseRowSchema, en as traceSpanSchema, et as removeDefaultConfigSchema, fn as buildTraceTree, ft as discoveryIssueSchema, gn as hashCacheKey, gt as evalStatsConfigSchema, hn as evalTracer, ht as evalStatItemSchema, in as columnFormatSchema, it as trialSelectionModeSchema, j as extractLlmCalls, jn as getEvalCaseInput, jt as evalChartsConfigSchema, k as extractCacheHits, kn as evalLog, kt as evalChartTooltipExtraSchema, ln as repoFileRefSchema, lt as assertionFailureSchema, mn as evalSpan, mt as evalStatAggregateSchema, nn as cellValueSchema, nt as resolveLlmCallsConfig, on as fileRefSchema, ot as buildEvalKey, pn as captureEvalSpanError, pt as evalFreshnessStatusSchema, q as apiCallsConfigSchema, qt as traceAttributeDisplayInputSchema, rn as columnDefSchema, rt as runLogsConfigSchema, sn as jsonCellSchema, st as getCaseRowCaseKey, tn as traceSpanWarningSchema, tt as resolveApiCallsConfig, un as runArtifactRefSchema, ut as caseDetailSchema, vn as deserializeCacheRecording, vt as runLogEntrySchema, wn as EvalAssertionError, wt as evalChartAxisSchema, xn as serializeCacheValue, xt as runLogPhaseSchema, yn as deserializeCacheValue, yt as runLogLevelSchema, z as runManifestSchema, zn as runInExistingEvalScope, zt as cacheOperationTypeSchema } from "./runOrchestration-DB0dwGrd.mjs";
2
+ import { n as createRunner, t as runCli } from "./cli-DRkwWgTj.mjs";
3
+ import "./src-BwKm3sKU.mjs";
4
+ export { DEFAULT_API_CALLS_CONFIG, DEFAULT_LLM_CALLS_CONFIG, EvalAssertionError, advanceEvalTime, agentEvalsConfigSchema, apiCallMetricFormatSchema, apiCallMetricPlacementSchema, apiCallMetricSchema, apiCallsConfigSchema, appendToEvalOutput, applyDerivedCallAttributes, assertionFailureSchema, buildCaseKey, buildEvalKey, buildTraceTree, cacheDebugKeyEntrySchema, cacheDebugKeyFileSchema, cacheEntrySchema, cacheEntryWithDebugKeySchema, cacheFileSchema, cacheListItemSchema, cacheModeSchema, cacheOperationTypeSchema, cacheRecordingOpSchema, cacheRecordingSchema, cacheStatusSchema, captureEvalSpanError, caseDetailSchema, caseRowSchema, cellValueSchema, columnDefSchema, columnFormatSchema, columnKindSchema, createRunRequestSchema, createRunner, defaultConfigKeySchema, defineEval, deriveScopedSummaryFromCases, deriveStatusFromCaseRows, deriveStatusFromChildStatuses, deserializeCacheRecording, deserializeCacheValue, discoveryIssueSchema, evalAssert, evalChartAggregateSchema, evalChartAxisSchema, evalChartBuiltinMetricSchema, evalChartColorSchema, evalChartConfigSchema, evalChartMetricSchema, evalChartTooltipExtraSchema, evalChartTypeSchema, evalChartsConfigSchema, evalExpect, evalFreshnessStatusSchema, evalLog, evalSpan, evalStatAggregateSchema, evalStatItemSchema, evalStatsConfigSchema, evalSummarySchema, evalTracer, extractApiCalls, extractCacheEntries, extractCacheHits, extractLlmCalls, fileRefSchema, getCaseRowCaseKey, getCaseRowEvalKey, getCurrentScope, getEvalCaseInput, getEvalDisplayStatus, getEvalRegistry, getEvalStartTime, getEvalTitle, getNestedAttribute, hashCacheKey, hashCacheKeySync, incrementEvalOutput, isInEvalScope, jsonCellSchema, llmCallMetricFormatSchema, llmCallMetricPlacementSchema, llmCallMetricSchema, llmCallPricingSchema, llmCallsConfigSchema, mergeEvalOutput, nextEvalId, numberDisplayOptionsSchema, removeDefaultConfigSchema, repoFile, repoFileRefSchema, resolveApiCallsConfig, resolveLlmCallsConfig, runArtifactRefSchema, runCli, 
runInEvalRuntimeScope, runInEvalScope, runInExistingEvalScope, runLogEntrySchema, runLogLevelSchema, runLogLocationSchema, runLogPhaseSchema, runLogsConfigSchema, runManifestSchema, runSummarySchema, scoreTraceSchema, serializeCacheRecording, serializeCacheValue, serializedCacheSpanSchema, setEvalOutput, setScopeCacheContext, spanCacheOptionsSchema, sseEnvelopeSchema, startEvalBackgroundJob, traceAttributeDisplayFormatSchema, traceAttributeDisplayInputSchema, traceAttributeDisplayPlacementSchema, traceAttributeDisplaySchema, traceCacheRefSchema, traceDisplayConfigSchema, traceDisplayInputConfigSchema, traceSpanErrorSchema, traceSpanKindSchema, traceSpanSchema, traceSpanWarningSchema, trialSelectionModeSchema, updateManualScoreRequestSchema, z };
package/dist/runChild.mjs CHANGED
@@ -1,4 +1,4 @@
1
- import { B as runSummarySchema, T as createRunRequestSchema, Tn as configureEvalRunLogs, b as loadConfig, gt as evalStatsConfigSchema, jt as evalChartsConfigSchema, ot as buildEvalKey, r as getTargetEvals$1, rn as columnDefSchema, t as executeRun, v as parseEvalDiscovery, w as createFsCacheStore, z as runManifestSchema } from "./runOrchestration-CW_Fzr2c.mjs";
1
+ import { B as runSummarySchema, Dn as configureEvalRunLogs, T as createRunRequestSchema, b as loadConfig, gt as evalStatsConfigSchema, jt as evalChartsConfigSchema, ot as buildEvalKey, r as getTargetEvals$1, rn as columnDefSchema, t as executeRun, v as parseEvalDiscovery, w as createFsCacheStore, z as runManifestSchema } from "./runOrchestration-DB0dwGrd.mjs";
2
2
  import { createHash } from "node:crypto";
3
3
  import { readFile } from "node:fs/promises";
4
4
  import { relative } from "node:path";
@@ -2,9 +2,9 @@ import { createRequire, registerHooks } from "node:module";
2
2
  import { createHash } from "node:crypto";
3
3
  import { mkdir, readFile, readdir, rename, rm, stat, writeFile } from "node:fs/promises";
4
4
  import { extname, isAbsolute, join, relative, resolve } from "node:path";
5
- import { z, z as z$1 } from "zod/v4";
5
+ import { formatWithOptions, isDeepStrictEqual } from "node:util";
6
6
  import { AsyncLocalStorage } from "node:async_hooks";
7
- import { formatWithOptions } from "node:util";
7
+ import { z, z as z$1 } from "zod/v4";
8
8
  import { Buffer as Buffer$1 } from "node:buffer";
9
9
  import { gunzipSync, gzipSync } from "node:zlib";
10
10
  import { getCompositeKey } from "@ls-stack/utils/getCompositeKey";
@@ -29,23 +29,6 @@ function defineEval(definition) {
29
29
  });
30
30
  }
31
31
  //#endregion
32
- //#region ../sdk/src/repoFile.ts
33
- /**
34
- * Create a file reference that can be emitted via `setEvalOutput(...)` and rendered
35
- * by a column configured with `format: 'image' | 'audio' | 'video' | 'file'`.
36
- *
37
- * @param path Relative or absolute path to the repository file.
38
- * @param mimeType Optional MIME type hint for UI rendering.
39
- * @returns A repo-backed file reference suitable for file/media columns.
40
- */
41
- function repoFile(path, mimeType) {
42
- return {
43
- source: "repo",
44
- path,
45
- mimeType
46
- };
47
- }
48
- //#endregion
49
32
  //#region ../sdk/src/runtime.ts
50
33
  const scopeStorage = new AsyncLocalStorage();
51
34
  const runtimeScopeStorage = new AsyncLocalStorage();
@@ -639,10 +622,12 @@ function incrementEvalOutput(key, delta) {
639
622
  });
640
623
  }
641
624
  /**
642
- * Assert a condition for the current eval case and throw on failure.
625
+ * Assert a truthy condition for the current eval case and throw on failure.
643
626
  *
644
627
  * Calls made outside `runInEvalScope(...)` are ignored so shared workflow code
645
- * can safely reuse `evalAssert(...)` when it also runs outside an eval.
628
+ * can safely reuse `evalAssert(...)` when it also runs outside an eval. The
629
+ * TypeScript assertion signature still narrows the checked value after the
630
+ * call.
646
631
  */
647
632
  function evalAssert(condition, message) {
648
633
  if (condition) return;
@@ -653,6 +638,156 @@ function evalAssert(condition, message) {
653
638
  throw error;
654
639
  }
655
640
  //#endregion
641
+ //#region ../sdk/src/evalExpect.ts
642
// Inspection limits applied to every value rendered into an expectation
// failure message, so large payloads cannot flood the run output.
const expectFormatOptions = {
	depth: 5,
	maxArrayLength: 50,
	maxStringLength: 1e3,
	breakLength: 80,
	compact: 3
};
/**
 * Render a value for an expectation failure message.
 *
 * The value is routed through the `%O` placeholder so it is always inspected
 * with `expectFormatOptions`. Passing it directly as the first formatting
 * argument makes `formatWithOptions` treat a string as the format string and
 * return it verbatim: unquoted (so `"1"` and `1` render identically) and
 * never truncated by `maxStringLength`.
 *
 * @param value Any received or expected value.
 * @returns The inspected representation; strings are quoted and truncated.
 */
function formatExpectValue(value) {
	return formatWithOptions(expectFormatOptions, "%O", value);
}
652
/**
 * Report whether a value is a non-null object (arrays included).
 *
 * @param value Any value.
 * @returns `true` for objects and arrays, `false` for `null`, primitives and functions.
 */
function isRecordLike$4(value) {
	if (value === null) return false;
	return typeof value === "object";
}
655
/**
 * Report whether a value exposes a numeric `length`.
 *
 * Strings always qualify; any other value must be a non-null object that has
 * a `length` member whose type is `number`.
 *
 * @param value Any value.
 * @returns `true` when `value.length` can be compared against a number.
 */
function isLengthLike(value) {
	if (typeof value === "string") return true;
	const isObject = typeof value === "object" && value !== null;
	if (!isObject) return false;
	if (!("length" in value)) return false;
	return typeof value.length === "number";
}
659
/**
 * Report whether a value is a plain object, i.e. one whose prototype is
 * `Object.prototype` or `null`. Only such values are treated as nested
 * partial patterns by `matchesObjectSubset`.
 */
function isPlainSubsetPattern(value) {
	if (typeof value !== "object" || value === null) return false;
	const prototype = Object.getPrototypeOf(value);
	return prototype === Object.prototype || prototype === null;
}
/**
 * Check that `received` contains every entry of `expected`.
 *
 * Nested plain objects in `expected` are matched recursively as subsets.
 * Every other expected value is compared with `isDeepStrictEqual`: arrays,
 * primitives, and non-plain objects such as `Date`, `Map`, `Set`, `RegExp`
 * or class instances. Those objects expose no enumerable entries describing
 * their state, so recursing into them would accept any received object.
 *
 * @param received Value under test; anything that is not a non-null object fails.
 * @param expected Object whose own enumerable entries must be present on `received`.
 * @returns `true` when every expected entry is satisfied.
 */
function matchesObjectSubset(received, expected) {
	if (typeof received !== "object" || received === null) return false;
	for (const [key, expectedValue] of Object.entries(expected)) {
		if (!(key in received)) return false;
		const receivedValue = received[key];
		if (isPlainSubsetPattern(expectedValue)) {
			if (!matchesObjectSubset(receivedValue, expectedValue)) return false;
			continue;
		}
		if (!isDeepStrictEqual(receivedValue, expectedValue)) return false;
	}
	return true;
}
672
/**
 * Membership test backing `toContain`.
 *
 * Arrays and Sets are searched for the exact element; strings are searched
 * for a substring, and only when `expected` is itself a string. Any other
 * container type never contains anything.
 *
 * @param received Container under test.
 * @param expected Element or substring to look for.
 * @returns `true` when `expected` is found in `received`.
 */
function containsValue(received, expected) {
	if (Array.isArray(received)) return received.includes(expected);
	if (received instanceof Set) return received.has(expected);
	if (typeof received !== "string") return false;
	if (typeof expected !== "string") return false;
	return received.includes(expected);
}
678
/**
 * Resolve a dot-separated property path against a value.
 *
 * Each segment must be present (via `in`) on a non-null object; otherwise
 * the lookup reports the property as missing. An empty path is always
 * missing. A property that exists with the value `undefined` is reported as
 * existing.
 *
 * @param received Root value to walk.
 * @param path Dot-separated property path, e.g. `"result.items.0"`.
 * @returns `{ exists, value }`; `value` is `undefined` when `exists` is `false`.
 */
function getPropertyAtPath(received, path) {
	const notFound = () => ({
		exists: false,
		value: void 0
	});
	if (path === "") return notFound();
	let cursor = received;
	for (const segment of path.split(".")) {
		const canDescend = typeof cursor === "object" && cursor !== null && segment in cursor;
		if (!canDescend) return notFound();
		cursor = cursor[segment];
	}
	return {
		exists: true,
		value: cursor
	};
}
696
+ var EvalExpectationImpl = class EvalExpectationImpl {
697
+ received;
698
+ negated;
699
+ constructor(received, negated) {
700
+ this.received = received;
701
+ this.negated = negated;
702
+ }
703
+ get not() {
704
+ return new EvalExpectationImpl(this.received, !this.negated);
705
+ }
706
+ toBe(expected) {
707
+ this.check(Object.is(this.received, expected), `Expected ${formatExpectValue(this.received)} to be ${formatExpectValue(expected)}`, `Expected ${formatExpectValue(this.received)} not to be ${formatExpectValue(expected)}`);
708
+ }
709
+ toEqual(expected) {
710
+ this.check(isDeepStrictEqual(this.received, expected), `Expected ${formatExpectValue(this.received)} to equal ${formatExpectValue(expected)}`, `Expected ${formatExpectValue(this.received)} not to equal ${formatExpectValue(expected)}`);
711
+ }
712
+ toMatchObject(expected) {
713
+ this.check(matchesObjectSubset(this.received, expected), `Expected ${formatExpectValue(this.received)} to match object ${formatExpectValue(expected)}`, `Expected ${formatExpectValue(this.received)} not to match object ${formatExpectValue(expected)}`);
714
+ }
715
+ toContain(expected) {
716
+ this.check(containsValue(this.received, expected), `Expected ${formatExpectValue(this.received)} to contain ${formatExpectValue(expected)}`, `Expected ${formatExpectValue(this.received)} not to contain ${formatExpectValue(expected)}`);
717
+ }
718
+ toHaveLength(expected) {
719
+ this.check(isLengthLike(this.received) && this.received.length === expected, `Expected ${formatExpectValue(this.received)} to have length ${expected}`, `Expected ${formatExpectValue(this.received)} not to have length ${expected}`);
720
+ }
721
+ toHaveProperty(path, ...expected) {
722
+ const result = getPropertyAtPath(this.received, path);
723
+ const expectedValue = expected[0];
724
+ const pass = result.exists && (expected.length === 0 || isDeepStrictEqual(result.value, expectedValue));
725
+ const expectedSuffix = expected.length === 0 ? "" : ` with value ${formatExpectValue(expectedValue)}`;
726
+ this.check(pass, `Expected ${formatExpectValue(this.received)} to have property "${path}"${expectedSuffix}`, `Expected ${formatExpectValue(this.received)} not to have property "${path}"${expectedSuffix}`);
727
+ }
728
+ toBeGreaterThan(expected) {
729
+ this.checkNumberComparison("to be greater than", expected, (received) => {
730
+ return received > expected;
731
+ });
732
+ }
733
+ toBeGreaterThanOrEqual(expected) {
734
+ this.checkNumberComparison("to be greater than or equal to", expected, (received) => {
735
+ return received >= expected;
736
+ });
737
+ }
738
+ toBeLessThan(expected) {
739
+ this.checkNumberComparison("to be less than", expected, (received) => {
740
+ return received < expected;
741
+ });
742
+ }
743
+ toBeLessThanOrEqual(expected) {
744
+ this.checkNumberComparison("to be less than or equal to", expected, (received) => {
745
+ return received <= expected;
746
+ });
747
+ }
748
+ toBeCloseTo(expected, precision = 2) {
749
+ const tolerance = 10 ** -precision / 2;
750
+ this.check(typeof this.received === "number" && Number.isFinite(this.received) && Math.abs(this.received - expected) < tolerance, `Expected ${formatExpectValue(this.received)} to be close to ${expected} at ${precision} decimals`, `Expected ${formatExpectValue(this.received)} not to be close to ${expected} at ${precision} decimals`);
751
+ }
752
+ toMatch(expected) {
753
+ const statelessExpected = new RegExp(expected.source, expected.flags);
754
+ this.check(typeof this.received === "string" && statelessExpected.test(this.received), `Expected ${formatExpectValue(this.received)} to match ${expected}`, `Expected ${formatExpectValue(this.received)} not to match ${expected}`);
755
+ }
756
+ check(pass, positiveMessage, negativeMessage) {
757
+ if (this.negated ? !pass : pass) return;
758
+ evalAssert(false, this.negated ? negativeMessage : positiveMessage);
759
+ }
760
+ checkNumberComparison(label, expected, predicate) {
761
+ this.check(typeof this.received === "number" && predicate(this.received), `Expected ${formatExpectValue(this.received)} ${label} ${expected}`, `Expected ${formatExpectValue(this.received)} not ${label} ${expected}`);
762
+ }
763
+ };
764
/**
 * Build expectation matchers for a value inside the current eval case.
 *
 * A failed expectation is reported the same way as `evalAssert(...)`: the
 * assertion failure is recorded and thrown only while an eval case scope is
 * active.
 *
 * @param value Value to run matchers against.
 * @returns A non-negated expectation; use `.not` to invert a matcher.
 */
function evalExpect(value) {
	const negated = false;
	return new EvalExpectationImpl(value, negated);
}
773
+ //#endregion
774
+ //#region ../sdk/src/repoFile.ts
775
/**
 * Build a reference to a repository file for use as an eval output value,
 * e.g. via `setEvalOutput(...)`, so a column configured with
 * `format: 'image' | 'audio' | 'video' | 'file'` can render it.
 *
 * @param path Relative or absolute path to the repository file.
 * @param mimeType Optional MIME type hint for UI rendering.
 * @returns A repo-backed file reference suitable for file/media columns.
 */
function repoFile(path, mimeType) {
	const fileRef = {
		source: "repo",
		path,
		mimeType
	};
	return fileRef;
}
790
+ //#endregion
656
791
  //#region ../sdk/src/cacheSerialization.ts
657
792
  const serializedCacheValueMarker = "__agentEvalsCacheSerialization";
658
793
  const jsonSafeCacheValueVersion = "json-safe-v1";
@@ -6189,4 +6324,4 @@ function toLastRunStatus(status) {
6189
6324
  return status === "pending" ? null : status;
6190
6325
  }
6191
6326
  //#endregion
6192
- export { llmCallsConfigSchema as $, traceSpanKindSchema as $t, extractApiCalls as A, getEvalStartTime as An, evalChartTypeSchema as At, runSummarySchema as B, startEvalBackgroundJob as Bn, cacheRecordingOpSchema as Bt, validateCharts as C, advanceEvalTime as Cn, evalChartAggregateSchema as Ct, sseEnvelopeSchema as D, evalLog as Dn, evalChartConfigSchema as Dt, updateManualScoreRequestSchema as E, evalAssert as En, evalChartColorSchema as Et, getEvalDisplayStatus as F, runInEvalRuntimeScope as Fn, cacheEntryWithDebugKeySchema as Ft, apiCallMetricPlacementSchema as G, traceCacheRefSchema as Gt, DEFAULT_LLM_CALLS_CONFIG as H, defineEval as Hn, cacheStatusSchema as Ht, deriveScopedSummaryFromCases as I, runInEvalScope as In, cacheFileSchema as It, defaultConfigKeySchema as J, traceAttributeDisplayPlacementSchema as Jt, apiCallMetricSchema as K, traceAttributeDisplayFormatSchema as Kt, deriveStatusFromCaseRows as L, runInExistingEvalScope as Ln, cacheListItemSchema as Lt, applyDerivedCallAttributes as M, isInEvalScope as Mn, cacheDebugKeyEntrySchema as Mt, getNestedAttribute as N, mergeEvalOutput as Nn, cacheDebugKeyFileSchema as Nt, extractCacheEntries as O, getCurrentScope as On, evalChartMetricSchema as Ot, getEvalTitle as P, nextEvalId as Pn, cacheEntrySchema as Pt, llmCallPricingSchema as Q, traceSpanErrorSchema as Qt, deriveStatusFromChildStatuses as R, setEvalOutput as Rn, cacheModeSchema as Rt, normalizeScoreDef as S, EvalAssertionError as Sn, scoreTraceSchema as St, createRunRequestSchema as T, configureEvalRunLogs as Tn, evalChartBuiltinMetricSchema as Tt, agentEvalsConfigSchema as U, getEvalRegistry as Un, serializedCacheSpanSchema as Ut, DEFAULT_API_CALLS_CONFIG as V, repoFile as Vn, cacheRecordingSchema as Vt, apiCallMetricFormatSchema as W, spanCacheOptionsSchema as Wt, llmCallMetricPlacementSchema as X, traceDisplayConfigSchema as Xt, llmCallMetricFormatSchema as Y, traceAttributeDisplaySchema as Yt, llmCallMetricSchema as Z, 
traceDisplayInputConfigSchema as Zt, loadEvalModule as _, hashCacheKeySync as _n, evalSummarySchema as _t, getLastRunStatuses as a, columnKindSchema as an, buildCaseKey as at, loadConfig as b, serializeCacheRecording as bn, runLogLocationSchema as bt, loadPersistedRunSnapshots as c, numberDisplayOptionsSchema as cn, getCaseRowEvalKey as ct, persistRunState as d, z$1 as dn, caseRowSchema as dt, traceSpanSchema as en, removeDefaultConfigSchema as et, recomputeEvalStatusesInRuns as f, buildTraceTree as fn, discoveryIssueSchema as ft, deriveEvalFreshness as g, hashCacheKey as gn, evalStatsConfigSchema as gt, resolveArtifactPath as h, evalTracer as hn, evalStatItemSchema as ht, generateRunId as i, columnFormatSchema as in, trialSelectionModeSchema as it, extractLlmCalls as j, incrementEvalOutput as jn, evalChartsConfigSchema as jt, extractCacheHits as k, getEvalCaseInput as kn, evalChartTooltipExtraSchema as kt, nextShortIdFromSnapshots as l, repoFileRefSchema as ln, assertionFailureSchema as lt, runTouchesEval as m, evalSpan as mn, evalStatAggregateSchema as mt, getTargetEvalKeys as n, cellValueSchema as nn, resolveLlmCallsConfig as nt, getLatestRunInfos as o, fileRefSchema as on, buildEvalKey as ot, recomputePersistedCaseStatus as p, captureEvalSpanError as pn, evalFreshnessStatusSchema as pt, apiCallsConfigSchema as q, traceAttributeDisplayInputSchema as qt, getTargetEvals as r, columnDefSchema as rn, runLogsConfigSchema as rt, loadPersistedRunSnapshot as s, jsonCellSchema as sn, getCaseRowCaseKey as st, executeRun as t, traceSpanWarningSchema as tn, resolveApiCallsConfig as tt, persistCaseDetail as u, runArtifactRefSchema as un, caseDetailSchema as ut, parseEvalDiscovery as v, deserializeCacheRecording as vn, runLogEntrySchema as vt, createFsCacheStore as w, appendToEvalOutput as wn, evalChartAxisSchema as wt, buildDeclaredColumnDefs as x, serializeCacheValue as xn, runLogPhaseSchema as xt, resolveEvalDefaultConfig as y, deserializeCacheValue as yn, 
runLogLevelSchema as yt, runManifestSchema as z, setScopeCacheContext as zn, cacheOperationTypeSchema as zt };
6327
+ export { llmCallsConfigSchema as $, traceSpanKindSchema as $t, extractApiCalls as A, getCurrentScope as An, evalChartTypeSchema as At, runSummarySchema as B, setEvalOutput as Bn, cacheRecordingOpSchema as Bt, validateCharts as C, evalExpect as Cn, evalChartAggregateSchema as Ct, sseEnvelopeSchema as D, configureEvalRunLogs as Dn, evalChartConfigSchema as Dt, updateManualScoreRequestSchema as E, appendToEvalOutput as En, evalChartColorSchema as Et, getEvalDisplayStatus as F, mergeEvalOutput as Fn, cacheEntryWithDebugKeySchema as Ft, apiCallMetricPlacementSchema as G, traceCacheRefSchema as Gt, DEFAULT_LLM_CALLS_CONFIG as H, startEvalBackgroundJob as Hn, cacheStatusSchema as Ht, deriveScopedSummaryFromCases as I, nextEvalId as In, cacheFileSchema as It, defaultConfigKeySchema as J, traceAttributeDisplayPlacementSchema as Jt, apiCallMetricSchema as K, traceAttributeDisplayFormatSchema as Kt, deriveStatusFromCaseRows as L, runInEvalRuntimeScope as Ln, cacheListItemSchema as Lt, applyDerivedCallAttributes as M, getEvalStartTime as Mn, cacheDebugKeyEntrySchema as Mt, getNestedAttribute as N, incrementEvalOutput as Nn, cacheDebugKeyFileSchema as Nt, extractCacheEntries as O, evalAssert as On, evalChartMetricSchema as Ot, getEvalTitle as P, isInEvalScope as Pn, cacheEntrySchema as Pt, llmCallPricingSchema as Q, traceSpanErrorSchema as Qt, deriveStatusFromChildStatuses as R, runInEvalScope as Rn, cacheModeSchema as Rt, normalizeScoreDef as S, repoFile as Sn, scoreTraceSchema as St, createRunRequestSchema as T, advanceEvalTime as Tn, evalChartBuiltinMetricSchema as Tt, agentEvalsConfigSchema as U, defineEval as Un, serializedCacheSpanSchema as Ut, DEFAULT_API_CALLS_CONFIG as V, setScopeCacheContext as Vn, cacheRecordingSchema as Vt, apiCallMetricFormatSchema as W, getEvalRegistry as Wn, spanCacheOptionsSchema as Wt, llmCallMetricPlacementSchema as X, traceDisplayConfigSchema as Xt, llmCallMetricFormatSchema as Y, traceAttributeDisplaySchema as Yt, llmCallMetricSchema as Z, 
traceDisplayInputConfigSchema as Zt, loadEvalModule as _, hashCacheKeySync as _n, evalSummarySchema as _t, getLastRunStatuses as a, columnKindSchema as an, buildCaseKey as at, loadConfig as b, serializeCacheRecording as bn, runLogLocationSchema as bt, loadPersistedRunSnapshots as c, numberDisplayOptionsSchema as cn, getCaseRowEvalKey as ct, persistRunState as d, z$1 as dn, caseRowSchema as dt, traceSpanSchema as en, removeDefaultConfigSchema as et, recomputeEvalStatusesInRuns as f, buildTraceTree as fn, discoveryIssueSchema as ft, deriveEvalFreshness as g, hashCacheKey as gn, evalStatsConfigSchema as gt, resolveArtifactPath as h, evalTracer as hn, evalStatItemSchema as ht, generateRunId as i, columnFormatSchema as in, trialSelectionModeSchema as it, extractLlmCalls as j, getEvalCaseInput as jn, evalChartsConfigSchema as jt, extractCacheHits as k, evalLog as kn, evalChartTooltipExtraSchema as kt, nextShortIdFromSnapshots as l, repoFileRefSchema as ln, assertionFailureSchema as lt, runTouchesEval as m, evalSpan as mn, evalStatAggregateSchema as mt, getTargetEvalKeys as n, cellValueSchema as nn, resolveLlmCallsConfig as nt, getLatestRunInfos as o, fileRefSchema as on, buildEvalKey as ot, recomputePersistedCaseStatus as p, captureEvalSpanError as pn, evalFreshnessStatusSchema as pt, apiCallsConfigSchema as q, traceAttributeDisplayInputSchema as qt, getTargetEvals as r, columnDefSchema as rn, runLogsConfigSchema as rt, loadPersistedRunSnapshot as s, jsonCellSchema as sn, getCaseRowCaseKey as st, executeRun as t, traceSpanWarningSchema as tn, resolveApiCallsConfig as tt, persistCaseDetail as u, runArtifactRefSchema as un, caseDetailSchema as ut, parseEvalDiscovery as v, deserializeCacheRecording as vn, runLogEntrySchema as vt, createFsCacheStore as w, EvalAssertionError as wn, evalChartAxisSchema as wt, buildDeclaredColumnDefs as x, serializeCacheValue as xn, runLogPhaseSchema as xt, resolveEvalDefaultConfig as y, deserializeCacheValue as yn, runLogLevelSchema as yt, 
runManifestSchema as z, runInExistingEvalScope as zn, cacheOperationTypeSchema as zt };
@@ -1,2 +1,2 @@
1
- import { n as initRunner, t as getRunnerInstance } from "./runner-BsRloAm3.mjs";
1
+ import { n as initRunner, t as getRunnerInstance } from "./runner-DRINcaN_.mjs";
2
2
  export { getRunnerInstance, initRunner };
@@ -1,5 +1,5 @@
1
- import { n as createRunner } from "./cli-T6FYE2Bk.mjs";
2
- import "./src-CbVnqjW3.mjs";
1
+ import { n as createRunner } from "./cli-DRkwWgTj.mjs";
2
+ import "./src-BwKm3sKU.mjs";
3
3
  //#region ../../apps/server/src/runner.ts
4
4
  let runnerInstance = null;
5
5
  function getRunnerInstance() {
@@ -0,0 +1,3 @@
1
+ import "./runOrchestration-DB0dwGrd.mjs";
2
+ import "./cli-DRkwWgTj.mjs";
3
+ export {};
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@ls-stack/agent-eval",
3
- "version": "0.26.3",
3
+ "version": "0.27.0",
4
4
  "type": "module",
5
5
  "bin": {
6
6
  "agent-evals": "./dist/bin.mjs"
@@ -39,8 +39,9 @@ file wires up cases and scoring; the real `evalTracer.span(...)` calls sit
39
39
  inside the workflow, agent, or tool functions that both production and evals
40
40
  invoke.
41
41
 
42
- `evalTracer`, `evalSpan`, output helpers, `evalLog`, and `evalAssert` are
43
- ambient no-ops when called outside an eval case scope, so leaving them in
42
+ `evalTracer`, `evalSpan`, output helpers, `evalLog`, `evalAssert`, and
43
+ `evalExpect` are ambient no-ops when called outside an eval case scope, so
44
+ leaving them in
44
45
  production paths is safe — they only record anything when the product code runs
45
46
  inside an eval's `execute`. Use `isInEvalScope()` to branch on eval-only behavior in shared code
46
47
  (e.g. skip a real network side effect): it returns `null` outside eval-owned
@@ -67,6 +68,7 @@ import {
67
68
  appendToEvalOutput,
68
69
  captureEvalSpanError,
69
70
  evalAssert,
71
+ evalExpect,
70
72
  evalSpan,
71
73
  evalTracer,
72
74
  getEvalCaseInput,
@@ -122,6 +124,7 @@ export async function runRefundWorkflow(input: RefundInput) {
122
124
  setEvalOutput('reviewId', reviewId);
123
125
  mergeEvalOutput('metadata', { approved: result.approved });
124
126
  evalAssert(result.approved, 'refund workflow should approve the case');
127
+ evalExpect(result.finalText).toMatch(/refund/i);
125
128
  evalSpan.setAttribute('output', { result, reviewId });
126
129
  return result;
127
130
  },
@@ -462,8 +465,9 @@ When adding or changing evals:
462
465
  1. Put the tracing + ambient SDK calls in the product code that runs in both
463
466
  production and evals. Keep eval files thin.
464
467
  2. Use realistic cases drawn from real product flows; avoid placeholder inputs.
465
- 3. `evalAssert` for hard invariants, `scores` for graded signals,
466
- `passThreshold` only on scores that should gate pass/fail.
468
+ 3. `evalAssert` for hard invariants and truthy type narrowing, `evalExpect`
469
+ for non-trivial comparisons, `scores` for graded signals, `passThreshold`
470
+ only on scores that should gate pass/fail.
467
471
  4. Surface reviewable values through execute-context `setOutput` or ambient
468
472
  `setEvalOutput` in shared workflow code, and shape them with `columns`
469
473
  formats from the `ColumnFormat` type.
@@ -1,3 +0,0 @@
1
- import "./runOrchestration-CW_Fzr2c.mjs";
2
- import "./cli-T6FYE2Bk.mjs";
3
- export {};