@ls-stack/agent-eval 0.22.0 → 0.23.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
- import { E as updateManualScoreRequestSchema, T as createRunRequestSchema } from "./runOrchestration-D1edUDhp.mjs";
2
- import "./src-D-HuV8I-.mjs";
3
- import { t as getRunnerInstance } from "./runner-C9nP2VKL.mjs";
1
+ import { E as updateManualScoreRequestSchema, T as createRunRequestSchema } from "./runOrchestration-CimthgI7.mjs";
2
+ import "./src-CcXfWT4M.mjs";
3
+ import { t as getRunnerInstance } from "./runner-4yNYRvmF.mjs";
4
4
  import { readFile } from "node:fs/promises";
5
5
  import { dirname, isAbsolute, join, relative, resolve, sep } from "node:path";
6
6
  import { z } from "zod/v4";
package/dist/bin.mjs CHANGED
@@ -1,5 +1,5 @@
1
1
  #!/usr/bin/env node
2
- import { t as runCli } from "./cli-C0EtHhEO.mjs";
2
+ import { t as runCli } from "./cli-D3QNOcPN.mjs";
3
3
  import { spawn } from "node:child_process";
4
4
  //#region src/bin.ts
5
5
  const moduleMocksFlag = "--experimental-test-module-mocks";
@@ -1,4 +1,4 @@
1
- import { C as validateCharts, F as deriveScopedSummaryFromCases, Fn as getEvalRegistry, N as getEvalTitle, P as getEvalDisplayStatus, S as normalizeScoreDef, _ as loadEvalModule, a as loadPersistedRunSnapshot, b as loadConfig, c as persistCaseDetail, d as recomputePersistedCaseStatus, et as resolveApiCallsConfig, f as runTouchesEval, g as setLatestRunInfoMap, h as getTargetEvalIds, i as getLatestRunInfos, l as persistRunState, m as buildEvalSummary, n as generateRunId, o as loadPersistedRunSnapshots, p as resolveArtifactPath, r as getLastRunStatuses, s as nextShortIdFromSnapshots, tt as resolveLlmCallsConfig, u as recomputeEvalStatusesInRuns, v as parseEvalMetas, w as createFsCacheStore, x as buildDeclaredColumnDefs, y as resolveEvalDefaultConfig, z as runSummarySchema } from "./runOrchestration-D1edUDhp.mjs";
1
+ import { C as validateCharts, F as deriveScopedSummaryFromCases, Ln as getEvalRegistry, N as getEvalTitle, P as getEvalDisplayStatus, S as normalizeScoreDef, _ as loadEvalModule, a as loadPersistedRunSnapshot, b as loadConfig, c as persistCaseDetail, d as recomputePersistedCaseStatus, et as resolveApiCallsConfig, f as runTouchesEval, g as setLatestRunInfoMap, h as getTargetEvalIds, i as getLatestRunInfos, l as persistRunState, m as buildEvalSummary, n as generateRunId, o as loadPersistedRunSnapshots, p as resolveArtifactPath, r as getLastRunStatuses, s as nextShortIdFromSnapshots, tt as resolveLlmCallsConfig, u as recomputeEvalStatusesInRuns, v as parseEvalMetas, w as createFsCacheStore, x as buildDeclaredColumnDefs, y as resolveEvalDefaultConfig, z as runSummarySchema } from "./runOrchestration-CimthgI7.mjs";
2
2
  import { createHash } from "node:crypto";
3
3
  import { mkdir, readFile, rm, writeFile } from "node:fs/promises";
4
4
  import { dirname, join, relative, resolve } from "node:path";
@@ -959,8 +959,8 @@ async function commandApp(args) {
959
959
  const { serve } = await import("@hono/node-server");
960
960
  const bundledWebDist = resolve(currentDir, "apps/web/dist");
961
961
  if (existsSync(bundledWebDist)) process.env.AGENT_EVALS_WEB_DIST = bundledWebDist;
962
- const appModule = await import("./app-moDHbg1O.mjs");
963
- const runnerModule = await import("./runner-CyRhIzci.mjs");
962
+ const appModule = await import("./app-Cw79dJDr.mjs");
963
+ const runnerModule = await import("./runner-B-SYzW8w.mjs");
964
964
  if (!isHonoAppModule(appModule)) throw new Error("Server app module is invalid");
965
965
  if (!isServerRunnerModule(runnerModule)) throw new Error("Server runner module is invalid");
966
966
  await runnerModule.initRunner();
package/dist/index.d.mts CHANGED
@@ -2758,6 +2758,12 @@ type EvalCase<TInput> = {
2758
2758
  };
2759
2759
  /** Runtime output values collected from output helpers and `deriveFromTracing`. */
2760
2760
  type EvalOutputs = Record<string, unknown>;
2761
+ /**
2762
+ * Initial wall-clock time used by an eval's shifted Date clock.
2763
+ *
2764
+ * Pass `'now'` to opt one eval back into the real current clock.
2765
+ */
2766
+ type EvalStartTime = Date | number | string;
2761
2767
  /**
2762
2768
  * Schema used to validate and type an eval's collected runtime outputs.
2763
2769
  *
@@ -2925,6 +2931,23 @@ type EvalDefinitionBase<TInput = unknown, TOutputs extends EvalOutputs = EvalOut
2925
2931
  * delay case finalization; late mutations are not guaranteed to persist.
2926
2932
  */
2927
2933
  waitForBackgroundJobs?: boolean;
2934
+ /**
2935
+ * Optional initial wall-clock time for this eval's runtime.
2936
+ *
2937
+ * When set, `new Date()` and `Date.now()` inside case generation, execution,
2938
+ * tracing, derived outputs, and scorers start from this wall-clock value and
2939
+ * then continue advancing with real elapsed time. The default is
2940
+ * `2026-04-10T00:00:00.000Z`. Pass `'now'` to use the real current clock for
2941
+ * this eval. Timers are not faked, so `setTimeout` and other asynchronous
2942
+ * work still run normally.
2943
+ */
2944
+ startTime?: EvalStartTime;
2945
+ /**
2946
+ * Freeze the eval Date clock at `startTime` until `advanceEvalTime(...)`
2947
+ * moves it manually. Defaults to `false`, so eval time advances with real
2948
+ * elapsed time from the configured `startTime`.
2949
+ */
2950
+ freezeTime?: boolean;
2928
2951
  execute: (ctx: EvalExecuteContext<TInput, TOutputs>) => Promise<void> | void;
2929
2952
  deriveFromTracing?: (ctx: EvalDeriveContext<TInput>) => Partial<TOutputs> | Promise<Partial<TOutputs>>;
2930
2953
  scores?: Record<string, EvalScoreDef<TInput, TOutputs>>;
@@ -3018,6 +3041,9 @@ declare function defineEval<TInput = unknown, TOutputs extends EvalOutputs = Eva
3018
3041
  declare function repoFile(path: string, mimeType?: string): RepoFileRef;
3019
3042
  //#endregion
3020
3043
  //#region ../sdk/src/runtime.d.ts
3044
+ declare global {
3045
+ var __agentEvalsRealDate: DateConstructor | undefined;
3046
+ }
3021
3047
  /**
3022
3048
  * Raw-key debug payload passed alongside cache writes.
3023
3049
  *
@@ -3063,7 +3089,15 @@ type CacheRecordingFrame = {
3063
3089
  };
3064
3090
  /** Mutable per-case runtime state stored in async local storage. */
3065
3091
  type EvalCaseScope = {
3066
- caseId: string; /** Stable prefix used by `nextEvalId()` for this eval case scope. */
3092
+ caseId: string; /** Initial wall-clock time used by Date APIs inside this eval case. */
3093
+ startTime: EvalStartTime | undefined; /** Mutable shifted wall-clock state shared across this eval case's phases. */
3094
+ evalClockState: {
3095
+ startMs: number;
3096
+ realStartMs: number;
3097
+ offsetMs: number;
3098
+ frozen: boolean;
3099
+ shifted: boolean;
3100
+ }; /** Stable prefix used by `nextEvalId()` for this eval case scope. */
3067
3101
  idPrefix: string | undefined; /** Monotonic per-scope counter used by `nextEvalId()`. */
3068
3102
  nextEvalIdCounter: number; /** Authored input for the current case, when provided by the runner. */
3069
3103
  input?: unknown;
@@ -3101,11 +3135,28 @@ type EvalCaseScope = {
3101
3135
  * modules imported while a run is being prepared.
3102
3136
  */
3103
3137
  type EvalRuntimeScope = 'env' | 'cases' | 'eval' | 'derive' | 'outputsSchema' | 'scorer';
3138
+ /** Time unit accepted by `advanceEvalTime(unit, amount)`. */
3139
+ type EvalTimeUnit = 'millisecond' | 'milliseconds' | 'second' | 'seconds' | 'minute' | 'minutes' | 'hour' | 'hours' | 'day' | 'days';
3104
3140
  type EvalLogLevelInput = RunLogLevel | 'warning';
3105
3141
  /** Error thrown when an eval assertion fails during case execution. */
3106
3142
  declare class EvalAssertionError extends Error {
3107
3143
  constructor(message: string);
3108
3144
  }
3145
+ /**
3146
+ * Return the wall-clock start time captured for the active eval.
3147
+ *
3148
+ * For `startTime: 'now'`, this is the real time captured when the eval clock
3149
+ * context was created.
3150
+ */
3151
+ declare function getEvalStartTime(): Date;
3152
+ /**
3153
+ * Advance the active eval's shifted Date clock and return the new time.
3154
+ *
3155
+ * Throws outside an active shifted eval clock. Evals that set
3156
+ * `startTime: 'now'` use the real current clock unless `freezeTime: true` is
3157
+ * also set.
3158
+ */
3159
+ declare function advanceEvalTime(unit: EvalTimeUnit, amount: number): Date;
3109
3160
  /** Return the current eval scope for the active async context, if any. */
3110
3161
  declare function getCurrentScope(): EvalCaseScope | undefined;
3111
3162
  /**
@@ -3155,7 +3206,9 @@ type RunInEvalScopeOptions = {
3155
3206
  idPrefix?: string; /** Cache adapter + mode attached to the scope before `fn` runs. */
3156
3207
  cacheContext?: CacheScopeContext; /** Whether registered background jobs should settle before scope finalizes. */
3157
3208
  waitForBackgroundJobs?: boolean; /** Eval runner phase exposed through `isInEvalScope()`. Defaults to `eval`. */
3158
- runtimeScope?: EvalRuntimeScope;
3209
+ runtimeScope?: EvalRuntimeScope; /** Initial wall-clock time used by `new Date()` and `Date.now()` in this eval. */
3210
+ startTime?: EvalStartTime; /** Whether Date APIs stay frozen until advanced manually. */
3211
+ freezeTime?: boolean;
3159
3212
  };
3160
3213
  /** Execute a callback while `isInEvalScope()` reports a runner phase. */
3161
3214
  declare function runInEvalRuntimeScope<T>(runtimeScope: EvalRuntimeScope, fn: () => Promise<T> | T): Promise<T>;
@@ -3570,4 +3623,4 @@ declare function createRunner({
3570
3623
  */
3571
3624
  declare function runCli(argv: string[]): Promise<void>;
3572
3625
  //#endregion
3573
- export { type AgentEvalsConfig, type ApiCallEntry, type ApiCallMetric, type ApiCallMetricFormat, type ApiCallMetricPlacement, type ApiCallMetricValue, type ApiCallsConfigInput, type AssertionFailure, type CacheActivityEntry, type CacheAdapter, type CacheDebugKeyEntry, type CacheDebugKeyFile, type CacheDebugKeyWrite, type CacheEntry, type CacheEntryWithDebugKey, type CacheFile, type CacheHitEntry, type CacheKeyHashInput, type CacheKeyHashOptions, type CacheListItem, type CacheMode, type CacheOperationType, type CacheRecording, type CacheRecordingFrame, type CacheRecordingOp, type CacheScopeContext, type CacheStatus, type CaptureEvalSpanErrorLevel, type CaptureEvalSpanErrorOptions, type CaseDetail, type CaseRow, type CellValue, type ColumnDef, type ColumnFormat, type ColumnKind, type CreateRunRequest, DEFAULT_API_CALLS_CONFIG, DEFAULT_LLM_CALLS_CONFIG, type DefaultConfigKey, type DerivedStatus, EvalAssertionError, type EvalCase, type EvalCaseScope, type EvalChartAggregate, type EvalChartAxis, type EvalChartBuiltinMetric, type EvalChartColor, type EvalChartConfig, type EvalChartMetric, type EvalChartTooltipExtra, type EvalChartType, type EvalChartsConfig, type EvalColumnOverride, type EvalColumns, type EvalDefinition, type EvalDeriveContext, type EvalDisplayStatus, type EvalExecuteContext, type EvalFreshnessStatus, type EvalManualScoreDef, type EvalOutputs, type EvalOutputsSchema, type EvalRunner, type EvalRuntimeScope, type EvalScoreContext, type EvalScoreDef, type EvalScoreFn, type EvalSetOutput, type EvalStatAggregate, type EvalStatItem, type EvalStatsConfig, type EvalSummary, type EvalTraceSpan, type EvalTraceSpanError, type EvalTraceSpanWarning, type EvalTraceTree, type FileRef, type JsonCell, type LlmCallEntry, type LlmCallMetric, type LlmCallMetricFormat, type LlmCallMetricPlacement, type LlmCallMetricValue, type LlmCallPricing, type LlmCallsConfigInput, type NumberDisplayOptions, type RemoveDefaultConfig, type RepoFileRef, type ResolvedApiCallMetric, type 
ResolvedApiCallsConfig, type ResolvedLlmCallMetric, type ResolvedLlmCallPricing, type ResolvedLlmCallsConfig, type RunArtifactRef, type RunInEvalScopeOptions, type RunLogEntry, type RunLogLevel, type RunLogLocation, type RunLogPhase, type RunLogsConfigInput, type RunManifest, type RunSummary, type ScalarCell, type ScopedCaseSummary, type ScoreTrace, type SerializedCacheSpan, type SerializedCacheValue, type SpanCacheOptions, type SseEnvelope, type SseEventType, type TraceActiveSpan, type TraceAttributeDisplay, type TraceAttributeDisplayFormat, type TraceAttributeDisplayInput, type TraceAttributeDisplayPlacement, type TraceAttributeTransform, type TraceAttributeTransformContext, type TraceCacheInfo, type TraceCacheRef, type TraceDisplayConfig, type TraceDisplayInputConfig, type TraceSpanInfo, type TrialSelectionMode, type UpdateManualScoreRequest, agentEvalsConfigSchema, apiCallMetricFormatSchema, apiCallMetricPlacementSchema, apiCallMetricSchema, apiCallsConfigSchema, appendToEvalOutput, assertionFailureSchema, buildTraceTree, cacheDebugKeyEntrySchema, cacheDebugKeyFileSchema, cacheEntrySchema, cacheEntryWithDebugKeySchema, cacheFileSchema, cacheListItemSchema, cacheModeSchema, cacheOperationTypeSchema, cacheRecordingOpSchema, cacheRecordingSchema, cacheStatusSchema, captureEvalSpanError, caseDetailSchema, caseRowSchema, cellValueSchema, columnDefSchema, columnFormatSchema, columnKindSchema, createRunRequestSchema, createRunner, defaultConfigKeySchema, defineEval, deriveScopedSummaryFromCases, deriveStatusFromCaseRows, deriveStatusFromChildStatuses, deserializeCacheRecording, deserializeCacheValue, evalAssert, evalChartAggregateSchema, evalChartAxisSchema, evalChartBuiltinMetricSchema, evalChartColorSchema, evalChartConfigSchema, evalChartMetricSchema, evalChartTooltipExtraSchema, evalChartTypeSchema, evalChartsConfigSchema, evalFreshnessStatusSchema, evalLog, evalSpan, evalStatAggregateSchema, evalStatItemSchema, evalStatsConfigSchema, evalSummarySchema, 
evalTracer, extractApiCalls, extractCacheEntries, extractCacheHits, extractLlmCalls, fileRefSchema, getCurrentScope, getEvalCaseInput, getEvalDisplayStatus, getEvalRegistry, getEvalTitle, getNestedAttribute, hashCacheKey, hashCacheKeySync, incrementEvalOutput, isInEvalScope, jsonCellSchema, llmCallMetricFormatSchema, llmCallMetricPlacementSchema, llmCallMetricSchema, llmCallPricingSchema, llmCallsConfigSchema, mergeEvalOutput, nextEvalId, numberDisplayOptionsSchema, removeDefaultConfigSchema, repoFile, repoFileRefSchema, resolveApiCallsConfig, resolveLlmCallsConfig, runArtifactRefSchema, runCli, runInEvalRuntimeScope, runInEvalScope, runInExistingEvalScope, runLogEntrySchema, runLogLevelSchema, runLogLocationSchema, runLogPhaseSchema, runLogsConfigSchema, runManifestSchema, runSummarySchema, scoreTraceSchema, serializeCacheRecording, serializeCacheValue, serializedCacheSpanSchema, setEvalOutput, setScopeCacheContext, spanCacheOptionsSchema, sseEnvelopeSchema, startEvalBackgroundJob, traceAttributeDisplayFormatSchema, traceAttributeDisplayInputSchema, traceAttributeDisplayPlacementSchema, traceAttributeDisplaySchema, traceCacheRefSchema, traceDisplayConfigSchema, traceDisplayInputConfigSchema, traceSpanErrorSchema, traceSpanKindSchema, traceSpanSchema, traceSpanWarningSchema, trialSelectionModeSchema, updateManualScoreRequestSchema, z };
3626
+ export { type AgentEvalsConfig, type ApiCallEntry, type ApiCallMetric, type ApiCallMetricFormat, type ApiCallMetricPlacement, type ApiCallMetricValue, type ApiCallsConfigInput, type AssertionFailure, type CacheActivityEntry, type CacheAdapter, type CacheDebugKeyEntry, type CacheDebugKeyFile, type CacheDebugKeyWrite, type CacheEntry, type CacheEntryWithDebugKey, type CacheFile, type CacheHitEntry, type CacheKeyHashInput, type CacheKeyHashOptions, type CacheListItem, type CacheMode, type CacheOperationType, type CacheRecording, type CacheRecordingFrame, type CacheRecordingOp, type CacheScopeContext, type CacheStatus, type CaptureEvalSpanErrorLevel, type CaptureEvalSpanErrorOptions, type CaseDetail, type CaseRow, type CellValue, type ColumnDef, type ColumnFormat, type ColumnKind, type CreateRunRequest, DEFAULT_API_CALLS_CONFIG, DEFAULT_LLM_CALLS_CONFIG, type DefaultConfigKey, type DerivedStatus, EvalAssertionError, type EvalCase, type EvalCaseScope, type EvalChartAggregate, type EvalChartAxis, type EvalChartBuiltinMetric, type EvalChartColor, type EvalChartConfig, type EvalChartMetric, type EvalChartTooltipExtra, type EvalChartType, type EvalChartsConfig, type EvalColumnOverride, type EvalColumns, type EvalDefinition, type EvalDeriveContext, type EvalDisplayStatus, type EvalExecuteContext, type EvalFreshnessStatus, type EvalManualScoreDef, type EvalOutputs, type EvalOutputsSchema, type EvalRunner, type EvalRuntimeScope, type EvalScoreContext, type EvalScoreDef, type EvalScoreFn, type EvalSetOutput, type EvalStartTime, type EvalStatAggregate, type EvalStatItem, type EvalStatsConfig, type EvalSummary, type EvalTimeUnit, type EvalTraceSpan, type EvalTraceSpanError, type EvalTraceSpanWarning, type EvalTraceTree, type FileRef, type JsonCell, type LlmCallEntry, type LlmCallMetric, type LlmCallMetricFormat, type LlmCallMetricPlacement, type LlmCallMetricValue, type LlmCallPricing, type LlmCallsConfigInput, type NumberDisplayOptions, type RemoveDefaultConfig, type 
RepoFileRef, type ResolvedApiCallMetric, type ResolvedApiCallsConfig, type ResolvedLlmCallMetric, type ResolvedLlmCallPricing, type ResolvedLlmCallsConfig, type RunArtifactRef, type RunInEvalScopeOptions, type RunLogEntry, type RunLogLevel, type RunLogLocation, type RunLogPhase, type RunLogsConfigInput, type RunManifest, type RunSummary, type ScalarCell, type ScopedCaseSummary, type ScoreTrace, type SerializedCacheSpan, type SerializedCacheValue, type SpanCacheOptions, type SseEnvelope, type SseEventType, type TraceActiveSpan, type TraceAttributeDisplay, type TraceAttributeDisplayFormat, type TraceAttributeDisplayInput, type TraceAttributeDisplayPlacement, type TraceAttributeTransform, type TraceAttributeTransformContext, type TraceCacheInfo, type TraceCacheRef, type TraceDisplayConfig, type TraceDisplayInputConfig, type TraceSpanInfo, type TrialSelectionMode, type UpdateManualScoreRequest, advanceEvalTime, agentEvalsConfigSchema, apiCallMetricFormatSchema, apiCallMetricPlacementSchema, apiCallMetricSchema, apiCallsConfigSchema, appendToEvalOutput, assertionFailureSchema, buildTraceTree, cacheDebugKeyEntrySchema, cacheDebugKeyFileSchema, cacheEntrySchema, cacheEntryWithDebugKeySchema, cacheFileSchema, cacheListItemSchema, cacheModeSchema, cacheOperationTypeSchema, cacheRecordingOpSchema, cacheRecordingSchema, cacheStatusSchema, captureEvalSpanError, caseDetailSchema, caseRowSchema, cellValueSchema, columnDefSchema, columnFormatSchema, columnKindSchema, createRunRequestSchema, createRunner, defaultConfigKeySchema, defineEval, deriveScopedSummaryFromCases, deriveStatusFromCaseRows, deriveStatusFromChildStatuses, deserializeCacheRecording, deserializeCacheValue, evalAssert, evalChartAggregateSchema, evalChartAxisSchema, evalChartBuiltinMetricSchema, evalChartColorSchema, evalChartConfigSchema, evalChartMetricSchema, evalChartTooltipExtraSchema, evalChartTypeSchema, evalChartsConfigSchema, evalFreshnessStatusSchema, evalLog, evalSpan, evalStatAggregateSchema, 
evalStatItemSchema, evalStatsConfigSchema, evalSummarySchema, evalTracer, extractApiCalls, extractCacheEntries, extractCacheHits, extractLlmCalls, fileRefSchema, getCurrentScope, getEvalCaseInput, getEvalDisplayStatus, getEvalRegistry, getEvalStartTime, getEvalTitle, getNestedAttribute, hashCacheKey, hashCacheKeySync, incrementEvalOutput, isInEvalScope, jsonCellSchema, llmCallMetricFormatSchema, llmCallMetricPlacementSchema, llmCallMetricSchema, llmCallPricingSchema, llmCallsConfigSchema, mergeEvalOutput, nextEvalId, numberDisplayOptionsSchema, removeDefaultConfigSchema, repoFile, repoFileRefSchema, resolveApiCallsConfig, resolveLlmCallsConfig, runArtifactRefSchema, runCli, runInEvalRuntimeScope, runInEvalScope, runInExistingEvalScope, runLogEntrySchema, runLogLevelSchema, runLogLocationSchema, runLogPhaseSchema, runLogsConfigSchema, runManifestSchema, runSummarySchema, scoreTraceSchema, serializeCacheRecording, serializeCacheValue, serializedCacheSpanSchema, setEvalOutput, setScopeCacheContext, spanCacheOptionsSchema, sseEnvelopeSchema, startEvalBackgroundJob, traceAttributeDisplayFormatSchema, traceAttributeDisplayInputSchema, traceAttributeDisplayPlacementSchema, traceAttributeDisplaySchema, traceCacheRefSchema, traceDisplayConfigSchema, traceDisplayInputConfigSchema, traceSpanErrorSchema, traceSpanKindSchema, traceSpanSchema, traceSpanWarningSchema, trialSelectionModeSchema, updateManualScoreRequestSchema, z };
package/dist/index.mjs CHANGED
@@ -1,4 +1,4 @@
1
- import { $ as removeDefaultConfigSchema, $t as columnKindSchema, A as extractApiCalls, An as setEvalOutput, At as cacheFileSchema, B as DEFAULT_API_CALLS_CONFIG, Bt as traceAttributeDisplayFormatSchema, Cn as incrementEvalOutput, Ct as evalChartTooltipExtraSchema, D as sseEnvelopeSchema, Dn as runInEvalRuntimeScope, Dt as cacheDebugKeyFileSchema, E as updateManualScoreRequestSchema, En as nextEvalId, Et as cacheDebugKeyEntrySchema, F as deriveScopedSummaryFromCases, Fn as getEvalRegistry, Ft as cacheRecordingSchema, G as apiCallMetricSchema, Gt as traceDisplayInputConfigSchema, H as agentEvalsConfigSchema, Ht as traceAttributeDisplayPlacementSchema, I as deriveStatusFromCaseRows, It as cacheStatusSchema, J as llmCallMetricFormatSchema, Jt as traceSpanSchema, K as apiCallsConfigSchema, Kt as traceSpanErrorSchema, L as deriveStatusFromChildStatuses, Lt as serializedCacheSpanSchema, M as getNestedAttribute, Mn as startEvalBackgroundJob, Mt as cacheModeSchema, N as getEvalTitle, Nn as repoFile, Nt as cacheOperationTypeSchema, O as extractCacheEntries, On as runInEvalScope, Ot as cacheEntrySchema, P as getEvalDisplayStatus, Pn as defineEval, Pt as cacheRecordingOpSchema, Q as llmCallsConfigSchema, Qt as columnFormatSchema, R as runManifestSchema, Rt as spanCacheOptionsSchema, Sn as getEvalCaseInput, St as evalChartMetricSchema, T as createRunRequestSchema, Tn as mergeEvalOutput, Tt as evalChartsConfigSchema, U as apiCallMetricFormatSchema, Ut as traceAttributeDisplaySchema, V as DEFAULT_LLM_CALLS_CONFIG, Vt as traceAttributeDisplayInputSchema, W as apiCallMetricPlacementSchema, Wt as traceDisplayConfigSchema, X as llmCallMetricSchema, Xt as cellValueSchema, Y as llmCallMetricPlacementSchema, Yt as traceSpanWarningSchema, Z as llmCallPricingSchema, Zt as columnDefSchema, _n as appendToEvalOutput, _t as evalChartAggregateSchema, an as z, at as caseDetailSchema, bn as evalLog, bt as evalChartColorSchema, cn as evalSpan, ct as evalStatAggregateSchema, dn as 
hashCacheKeySync, dt as evalSummarySchema, en as fileRefSchema, et as resolveApiCallsConfig, fn as deserializeCacheRecording, ft as runLogEntrySchema, gn as EvalAssertionError, gt as scoreTraceSchema, hn as serializeCacheValue, ht as runLogPhaseSchema, in as runArtifactRefSchema, it as assertionFailureSchema, j as extractLlmCalls, jn as setScopeCacheContext, jt as cacheListItemSchema, k as extractCacheHits, kn as runInExistingEvalScope, kt as cacheEntryWithDebugKeySchema, ln as evalTracer, lt as evalStatItemSchema, mn as serializeCacheRecording, mt as runLogLocationSchema, nn as numberDisplayOptionsSchema, nt as runLogsConfigSchema, on as buildTraceTree, ot as caseRowSchema, pn as deserializeCacheValue, pt as runLogLevelSchema, q as defaultConfigKeySchema, qt as traceSpanKindSchema, rn as repoFileRefSchema, rt as trialSelectionModeSchema, sn as captureEvalSpanError, st as evalFreshnessStatusSchema, tn as jsonCellSchema, tt as resolveLlmCallsConfig, un as hashCacheKey, ut as evalStatsConfigSchema, vt as evalChartAxisSchema, wn as isInEvalScope, wt as evalChartTypeSchema, xn as getCurrentScope, xt as evalChartConfigSchema, yn as evalAssert, yt as evalChartBuiltinMetricSchema, z as runSummarySchema, zt as traceCacheRefSchema } from "./runOrchestration-D1edUDhp.mjs";
2
- import { n as createRunner, t as runCli } from "./cli-C0EtHhEO.mjs";
3
- import "./src-D-HuV8I-.mjs";
4
- export { DEFAULT_API_CALLS_CONFIG, DEFAULT_LLM_CALLS_CONFIG, EvalAssertionError, agentEvalsConfigSchema, apiCallMetricFormatSchema, apiCallMetricPlacementSchema, apiCallMetricSchema, apiCallsConfigSchema, appendToEvalOutput, assertionFailureSchema, buildTraceTree, cacheDebugKeyEntrySchema, cacheDebugKeyFileSchema, cacheEntrySchema, cacheEntryWithDebugKeySchema, cacheFileSchema, cacheListItemSchema, cacheModeSchema, cacheOperationTypeSchema, cacheRecordingOpSchema, cacheRecordingSchema, cacheStatusSchema, captureEvalSpanError, caseDetailSchema, caseRowSchema, cellValueSchema, columnDefSchema, columnFormatSchema, columnKindSchema, createRunRequestSchema, createRunner, defaultConfigKeySchema, defineEval, deriveScopedSummaryFromCases, deriveStatusFromCaseRows, deriveStatusFromChildStatuses, deserializeCacheRecording, deserializeCacheValue, evalAssert, evalChartAggregateSchema, evalChartAxisSchema, evalChartBuiltinMetricSchema, evalChartColorSchema, evalChartConfigSchema, evalChartMetricSchema, evalChartTooltipExtraSchema, evalChartTypeSchema, evalChartsConfigSchema, evalFreshnessStatusSchema, evalLog, evalSpan, evalStatAggregateSchema, evalStatItemSchema, evalStatsConfigSchema, evalSummarySchema, evalTracer, extractApiCalls, extractCacheEntries, extractCacheHits, extractLlmCalls, fileRefSchema, getCurrentScope, getEvalCaseInput, getEvalDisplayStatus, getEvalRegistry, getEvalTitle, getNestedAttribute, hashCacheKey, hashCacheKeySync, incrementEvalOutput, isInEvalScope, jsonCellSchema, llmCallMetricFormatSchema, llmCallMetricPlacementSchema, llmCallMetricSchema, llmCallPricingSchema, llmCallsConfigSchema, mergeEvalOutput, nextEvalId, numberDisplayOptionsSchema, removeDefaultConfigSchema, repoFile, repoFileRefSchema, resolveApiCallsConfig, resolveLlmCallsConfig, runArtifactRefSchema, runCli, runInEvalRuntimeScope, runInEvalScope, runInExistingEvalScope, runLogEntrySchema, runLogLevelSchema, runLogLocationSchema, runLogPhaseSchema, runLogsConfigSchema, runManifestSchema, 
runSummarySchema, scoreTraceSchema, serializeCacheRecording, serializeCacheValue, serializedCacheSpanSchema, setEvalOutput, setScopeCacheContext, spanCacheOptionsSchema, sseEnvelopeSchema, startEvalBackgroundJob, traceAttributeDisplayFormatSchema, traceAttributeDisplayInputSchema, traceAttributeDisplayPlacementSchema, traceAttributeDisplaySchema, traceCacheRefSchema, traceDisplayConfigSchema, traceDisplayInputConfigSchema, traceSpanErrorSchema, traceSpanKindSchema, traceSpanSchema, traceSpanWarningSchema, trialSelectionModeSchema, updateManualScoreRequestSchema, z };
1
+ import { $ as removeDefaultConfigSchema, $t as columnKindSchema, A as extractApiCalls, An as runInEvalScope, At as cacheFileSchema, B as DEFAULT_API_CALLS_CONFIG, Bt as traceAttributeDisplayFormatSchema, Cn as getEvalCaseInput, Ct as evalChartTooltipExtraSchema, D as sseEnvelopeSchema, Dn as mergeEvalOutput, Dt as cacheDebugKeyFileSchema, E as updateManualScoreRequestSchema, En as isInEvalScope, Et as cacheDebugKeyEntrySchema, F as deriveScopedSummaryFromCases, Fn as repoFile, Ft as cacheRecordingSchema, G as apiCallMetricSchema, Gt as traceDisplayInputConfigSchema, H as agentEvalsConfigSchema, Ht as traceAttributeDisplayPlacementSchema, I as deriveStatusFromCaseRows, In as defineEval, It as cacheStatusSchema, J as llmCallMetricFormatSchema, Jt as traceSpanSchema, K as apiCallsConfigSchema, Kt as traceSpanErrorSchema, L as deriveStatusFromChildStatuses, Ln as getEvalRegistry, Lt as serializedCacheSpanSchema, M as getNestedAttribute, Mn as setEvalOutput, Mt as cacheModeSchema, N as getEvalTitle, Nn as setScopeCacheContext, Nt as cacheOperationTypeSchema, O as extractCacheEntries, On as nextEvalId, Ot as cacheEntrySchema, P as getEvalDisplayStatus, Pn as startEvalBackgroundJob, Pt as cacheRecordingOpSchema, Q as llmCallsConfigSchema, Qt as columnFormatSchema, R as runManifestSchema, Rt as spanCacheOptionsSchema, Sn as getCurrentScope, St as evalChartMetricSchema, T as createRunRequestSchema, Tn as incrementEvalOutput, Tt as evalChartsConfigSchema, U as apiCallMetricFormatSchema, Ut as traceAttributeDisplaySchema, V as DEFAULT_LLM_CALLS_CONFIG, Vt as traceAttributeDisplayInputSchema, W as apiCallMetricPlacementSchema, Wt as traceDisplayConfigSchema, X as llmCallMetricSchema, Xt as cellValueSchema, Y as llmCallMetricPlacementSchema, Yt as traceSpanWarningSchema, Z as llmCallPricingSchema, Zt as columnDefSchema, _n as advanceEvalTime, _t as evalChartAggregateSchema, an as z, at as caseDetailSchema, bn as evalAssert, bt as evalChartColorSchema, cn as evalSpan, ct as 
evalStatAggregateSchema, dn as hashCacheKeySync, dt as evalSummarySchema, en as fileRefSchema, et as resolveApiCallsConfig, fn as deserializeCacheRecording, ft as runLogEntrySchema, gn as EvalAssertionError, gt as scoreTraceSchema, hn as serializeCacheValue, ht as runLogPhaseSchema, in as runArtifactRefSchema, it as assertionFailureSchema, j as extractLlmCalls, jn as runInExistingEvalScope, jt as cacheListItemSchema, k as extractCacheHits, kn as runInEvalRuntimeScope, kt as cacheEntryWithDebugKeySchema, ln as evalTracer, lt as evalStatItemSchema, mn as serializeCacheRecording, mt as runLogLocationSchema, nn as numberDisplayOptionsSchema, nt as runLogsConfigSchema, on as buildTraceTree, ot as caseRowSchema, pn as deserializeCacheValue, pt as runLogLevelSchema, q as defaultConfigKeySchema, qt as traceSpanKindSchema, rn as repoFileRefSchema, rt as trialSelectionModeSchema, sn as captureEvalSpanError, st as evalFreshnessStatusSchema, tn as jsonCellSchema, tt as resolveLlmCallsConfig, un as hashCacheKey, ut as evalStatsConfigSchema, vn as appendToEvalOutput, vt as evalChartAxisSchema, wn as getEvalStartTime, wt as evalChartTypeSchema, xn as evalLog, xt as evalChartConfigSchema, yt as evalChartBuiltinMetricSchema, z as runSummarySchema, zt as traceCacheRefSchema } from "./runOrchestration-CimthgI7.mjs";
2
+ import { n as createRunner, t as runCli } from "./cli-D3QNOcPN.mjs";
3
+ import "./src-CcXfWT4M.mjs";
4
+ export { DEFAULT_API_CALLS_CONFIG, DEFAULT_LLM_CALLS_CONFIG, EvalAssertionError, advanceEvalTime, agentEvalsConfigSchema, apiCallMetricFormatSchema, apiCallMetricPlacementSchema, apiCallMetricSchema, apiCallsConfigSchema, appendToEvalOutput, assertionFailureSchema, buildTraceTree, cacheDebugKeyEntrySchema, cacheDebugKeyFileSchema, cacheEntrySchema, cacheEntryWithDebugKeySchema, cacheFileSchema, cacheListItemSchema, cacheModeSchema, cacheOperationTypeSchema, cacheRecordingOpSchema, cacheRecordingSchema, cacheStatusSchema, captureEvalSpanError, caseDetailSchema, caseRowSchema, cellValueSchema, columnDefSchema, columnFormatSchema, columnKindSchema, createRunRequestSchema, createRunner, defaultConfigKeySchema, defineEval, deriveScopedSummaryFromCases, deriveStatusFromCaseRows, deriveStatusFromChildStatuses, deserializeCacheRecording, deserializeCacheValue, evalAssert, evalChartAggregateSchema, evalChartAxisSchema, evalChartBuiltinMetricSchema, evalChartColorSchema, evalChartConfigSchema, evalChartMetricSchema, evalChartTooltipExtraSchema, evalChartTypeSchema, evalChartsConfigSchema, evalFreshnessStatusSchema, evalLog, evalSpan, evalStatAggregateSchema, evalStatItemSchema, evalStatsConfigSchema, evalSummarySchema, evalTracer, extractApiCalls, extractCacheEntries, extractCacheHits, extractLlmCalls, fileRefSchema, getCurrentScope, getEvalCaseInput, getEvalDisplayStatus, getEvalRegistry, getEvalStartTime, getEvalTitle, getNestedAttribute, hashCacheKey, hashCacheKeySync, incrementEvalOutput, isInEvalScope, jsonCellSchema, llmCallMetricFormatSchema, llmCallMetricPlacementSchema, llmCallMetricSchema, llmCallPricingSchema, llmCallsConfigSchema, mergeEvalOutput, nextEvalId, numberDisplayOptionsSchema, removeDefaultConfigSchema, repoFile, repoFileRefSchema, resolveApiCallsConfig, resolveLlmCallsConfig, runArtifactRefSchema, runCli, runInEvalRuntimeScope, runInEvalScope, runInExistingEvalScope, runLogEntrySchema, runLogLevelSchema, runLogLocationSchema, runLogPhaseSchema, 
runLogsConfigSchema, runManifestSchema, runSummarySchema, scoreTraceSchema, serializeCacheRecording, serializeCacheValue, serializedCacheSpanSchema, setEvalOutput, setScopeCacheContext, spanCacheOptionsSchema, sseEnvelopeSchema, startEvalBackgroundJob, traceAttributeDisplayFormatSchema, traceAttributeDisplayInputSchema, traceAttributeDisplayPlacementSchema, traceAttributeDisplaySchema, traceCacheRefSchema, traceDisplayConfigSchema, traceDisplayInputConfigSchema, traceSpanErrorSchema, traceSpanKindSchema, traceSpanSchema, traceSpanWarningSchema, trialSelectionModeSchema, updateManualScoreRequestSchema, z };
package/dist/runChild.mjs CHANGED
@@ -1,4 +1,4 @@
1
- import { R as runManifestSchema, T as createRunRequestSchema, Tt as evalChartsConfigSchema, Zt as columnDefSchema, b as loadConfig, t as executeRun, ut as evalStatsConfigSchema, v as parseEvalMetas, vn as configureEvalRunLogs, w as createFsCacheStore, z as runSummarySchema } from "./runOrchestration-D1edUDhp.mjs";
1
+ import { R as runManifestSchema, T as createRunRequestSchema, Tt as evalChartsConfigSchema, Zt as columnDefSchema, b as loadConfig, t as executeRun, ut as evalStatsConfigSchema, v as parseEvalMetas, w as createFsCacheStore, yn as configureEvalRunLogs, z as runSummarySchema } from "./runOrchestration-CimthgI7.mjs";
2
2
  import { createHash } from "node:crypto";
3
3
  import { readFile } from "node:fs/promises";
4
4
  import { relative } from "node:path";
@@ -49,9 +49,46 @@ function repoFile(path, mimeType) {
49
49
  //#region ../sdk/src/runtime.ts
50
50
  const scopeStorage = new AsyncLocalStorage();
51
51
  const runtimeScopeStorage = new AsyncLocalStorage();
52
+ const evalClockStorage = new AsyncLocalStorage();
52
53
  let activeEvalScopeCount = 0;
53
54
  let activeEvalRuntimeScopeCount = 0;
54
55
  let consoleCaptureEnabled = true;
56
+ const defaultEvalStartTimeMs = Date.parse("2026-04-10T00:00:00.000Z");
57
+ const realDate = globalThis.__agentEvalsRealDate ?? Date;
58
+ globalThis.__agentEvalsRealDate = realDate;
59
/**
 * Normalize the single-argument form of the Date constructor.
 *
 * Strings, numbers, and instances of the captured real Date are handed back
 * untouched; any other value is coerced with `Number(...)`.
 */
function toDateConstructorArg(value) {
  const passesThrough = typeof value === "string" || typeof value === "number" || value instanceof realDate;
  return passesThrough ? value : Number(value);
}
63
/** Coerce one numeric Date component to a number; numbers are returned as-is. */
function toDateNumberArg(value) {
  if (typeof value === "number") return value;
  return Number(value);
}
66
/**
 * Build a real Date from an already-collected argument list.
 *
 * - no arguments: the current real time
 * - one argument: passed through `toDateConstructorArg`
 * - two or more: treated as (year, monthIndex, day, hours, minutes, seconds,
 *   ms); an `undefined` day defaults to 1 and any other `undefined`
 *   component defaults to 0.
 */
function constructDateFromArgs(args) {
  switch (args.length) {
    case 0: return new realDate();
    case 1: return new realDate(toDateConstructorArg(args[0]));
  }
  const componentOr = (index, fallback) => args[index] === void 0 ? fallback : toDateNumberArg(args[index]);
  return new realDate(
    toDateNumberArg(args[0]),
    toDateNumberArg(args[1]),
    componentOr(2, 1),
    componentOr(3, 0),
    componentOr(4, 0),
    componentOr(5, 0),
    componentOr(6, 0)
  );
}
71
/**
 * Proxy around the real Date constructor that reads "now" from the active
 * eval clock.
 *
 * - `Date()` called as a function returns the string form of the shifted time
 *   when a shifted clock is active, otherwise defers to the real `Date()`.
 * - `new Date()` with no arguments yields the shifted time when a shifted
 *   clock is active; every other construction goes through
 *   `constructDateFromArgs`.
 * - `Date.now` resolves to `getEvalDateNow`; every other static property is
 *   forwarded to the real constructor.
 */
const evalDate = new Proxy(realDate, {
  apply(target, thisArg, argArray_) {
    const nowMs = getEvalClockNowMs();
    if (nowMs !== null) return new target(nowMs).toString();
    return target.call(thisArg);
  },
  construct(target, argArray, newTarget) {
    const nowMs = getEvalClockNowMs();
    const date = argArray.length === 0 && nowMs !== null ? new target(nowMs) : constructDateFromArgs(Array.from(argArray));
    // Honor `class X extends Date` (and Reflect.construct with a custom
    // newTarget): the instance must carry the subclass prototype, which the
    // plain `new target(...)` above does not provide.
    const requestedPrototype = newTarget.prototype;
    if (requestedPrototype !== target.prototype && requestedPrototype !== null && typeof requestedPrototype === "object") Object.setPrototypeOf(date, requestedPrototype);
    return date;
  },
  get(target, property) {
    if (property === "now") return getEvalDateNow;
    // Forward everything else (parse, UTC, prototype, name, length, and the
    // members inherited from Function.prototype such as toString/bind/call and
    // Symbol.hasInstance). Returning undefined for those made `String(Date)`
    // and `Date.bind(...)` throw once the shim was installed.
    return Reflect.get(target, property);
  }
});
91
+ globalThis.Date = evalDate;
55
92
  const maxLogMessageLength = 2e4;
56
93
  const maxLogStringLength = 1e4;
57
94
  const maxLogArrayLength = 100;
@@ -79,6 +116,82 @@ var EvalAssertionError = class extends Error {
79
116
  this.name = "EvalAssertionError";
80
117
  }
81
118
  };
119
/**
 * Compute the current time (epoch ms) for an eval clock state: its start
 * time, plus real elapsed time since the state was created (none when the
 * clock is frozen), plus the manual offset.
 */
function getEvalClockStateNowMs(state) {
  let elapsedMs = 0;
  if (!state.frozen) elapsedMs = realDate.now() - state.realStartMs;
  return state.startMs + elapsedMs + state.offsetMs;
}
123
/**
 * Return the shifted "now" (epoch ms) for the active async context, or
 * `null` when there is no eval clock or the clock is not shifted.
 */
function getEvalClockNowMs() {
  const clock = evalClockStorage.getStore();
  return clock?.shifted === true ? getEvalClockStateNowMs(clock) : null;
}
128
/** `Date.now()` replacement: the shifted eval time if one is active, else the real time. */
function getEvalDateNow() {
  const shiftedNowMs = getEvalClockNowMs();
  if (shiftedNowMs != null) return shiftedNowMs;
  return realDate.now();
}
131
/** Read the real (unshifted) current time in epoch ms from the captured native Date. */
function getRealDateNowMs() {
  const hostNowMs = realDate.now();
  return hostNowMs;
}
135
/**
 * Current shifted time (epoch ms) for a stored eval clock state, or `null`
 * when that state is not shifted.
 */
function getEvalClockStateTimeMs(state) {
  return state.shifted ? getEvalClockStateNowMs(state) : null;
}
140
/**
 * Return the active eval's captured start time as a real `Date`.
 *
 * With `startTime: 'now'` the value is the real time recorded when the eval
 * clock state was created.
 *
 * Throws when no eval clock is active in the current async context.
 */
function getEvalStartTime() {
  const clock = evalClockStorage.getStore();
  if (clock !== void 0) return new realDate(clock.startMs);
  throw new Error("getEvalStartTime() must be called inside an active eval");
}
151
/**
 * Turn an eval `startTime` option into epoch milliseconds.
 *
 * `undefined` maps to the default eval start time and `"now"` to the real
 * current time. Real Date instances, numbers, and parseable strings are
 * converted directly; anything that does not yield a finite number throws.
 */
function resolveEvalStartTimeMs(startTime) {
  if (startTime === void 0) return defaultEvalStartTimeMs;
  if (startTime === "now") return realDate.now();
  let ms;
  if (startTime instanceof realDate) ms = startTime.getTime();
  else if (typeof startTime === "number") ms = startTime;
  else ms = realDate.parse(startTime);
  if (!Number.isFinite(ms)) throw new Error(`Invalid eval startTime "${String(startTime)}". Use a Date, timestamp, ISO date string, or "now".`);
  return ms;
}
158
/**
 * Create the mutable clock state for one eval run.
 *
 * The real time is sampled once and used both as the elapsed-time reference
 * and, for `startTime: "now"`, as the start time itself. The clock counts as
 * shifted unless it starts at "now" without being frozen.
 */
function createEvalClockState(startTime, freezeTime) {
  const realNowMs = realDate.now();
  const startsAtRealNow = startTime === "now";
  return {
    startMs: startsAtRealNow ? realNowMs : resolveEvalStartTimeMs(startTime),
    realStartMs: realNowMs,
    offsetMs: 0,
    frozen: freezeTime,
    shifted: !startsAtRealNow || freezeTime
  };
}
168
/**
 * Run `fn` inside a fresh eval clock built from `startTime`.
 *
 * The clock is frozen only when `options.freezeTime` is exactly `true`.
 * Resolves with whatever `fn` returns.
 */
async function runWithEvalClock(startTime, fn, options = {}) {
  const clockState = createEvalClockState(startTime, options.freezeTime === true);
  return await evalClockStorage.run(clockState, fn);
}
172
/**
 * Number of milliseconds in one eval time unit. Accepts the singular and
 * plural spellings of millisecond, second, minute, hour, and day; any other
 * value throws.
 */
function getEvalTimeUnitMs(unit) {
  switch (unit) {
    case "millisecond":
    case "milliseconds":
      return 1;
    case "second":
    case "seconds":
      return 1e3;
    case "minute":
    case "minutes":
      return 6e4;
    case "hour":
    case "hours":
      return 36e5;
    case "day":
    case "days":
      return 864e5;
    default:
      throw new Error(`Unsupported eval time unit "${unit}"`);
  }
}
180
/**
 * Move the active eval's shifted Date clock by `amount` of `unit` and return
 * the resulting time as a real `Date`.
 *
 * Throws when called outside an active eval, when the active clock is not
 * shifted (an eval with `startTime: 'now'` and no `freezeTime: true`), when
 * `amount` is not a finite number, or when `unit` is unsupported.
 */
function advanceEvalTime(unit, amount) {
  const clock = evalClockStorage.getStore();
  if (clock === void 0) throw new Error("advanceEvalTime() must be called inside an active eval");
  if (!clock.shifted) throw new Error("advanceEvalTime() requires a shifted eval clock. Remove startTime: \"now\" or set freezeTime: true to use it.");
  if (!Number.isFinite(amount)) throw new Error("advanceEvalTime() amount must be a finite number");
  const deltaMs = getEvalTimeUnitMs(unit) * amount;
  clock.offsetMs += deltaMs;
  return new realDate(getEvalClockStateNowMs(clock));
}
82
195
  /** Return the current eval scope for the active async context, if any. */
83
196
  function getCurrentScope() {
84
197
  if (activeEvalScopeCount === 0) return void 0;
@@ -349,7 +462,9 @@ async function runInExistingEvalScope(scope, runtimeScope, fn) {
349
462
  activeEvalScopeCount++;
350
463
  try {
351
464
  return await scopeStorage.run(scope, async () => {
352
- return await runInEvalRuntimeScope(runtimeScope, fn);
465
+ return await evalClockStorage.run(scope.evalClockState, async () => {
466
+ return await runInEvalRuntimeScope(runtimeScope, fn);
467
+ });
353
468
  });
354
469
  } finally {
355
470
  activeEvalScopeCount--;
@@ -362,6 +477,8 @@ async function runInExistingEvalScope(scope, runtimeScope, fn) {
362
477
  async function runInEvalScope(caseId, fn, options = {}) {
363
478
  const scope = {
364
479
  caseId,
480
+ startTime: options.startTime,
481
+ evalClockState: createEvalClockState(options.startTime, options.freezeTime === true),
365
482
  idPrefix: options.idPrefix,
366
483
  nextEvalIdCounter: 0,
367
484
  input: options.input,
@@ -1493,9 +1610,12 @@ function mergeSpanAttribute(span, key, patch) {
1493
1610
  ...patch
1494
1611
  } });
1495
1612
  }
1496
- function finishSpanWithoutThrownError(span) {
1613
/**
 * Return the ISO-8601 timestamp that lies `elapsedMs` milliseconds after
 * `isoTimestamp`.
 */
function addElapsedMsToTimestamp(isoTimestamp, elapsedMs) {
  const baseMs = new Date(isoTimestamp).getTime();
  return new Date(baseMs + elapsedMs).toISOString();
}
1616
+ function finishSpanWithoutThrownError(span, realStartedAt) {
1497
1617
  span.status = hasSpanError(span) ? "error" : "ok";
1498
- span.endedAt = (/* @__PURE__ */ new Date()).toISOString();
1618
+ span.endedAt = addElapsedMsToTimestamp(span.startedAt, getRealDateNowMs() - realStartedAt);
1499
1619
  }
1500
1620
  function createSpanHandle(span) {
1501
1621
  return {
@@ -1737,9 +1857,11 @@ async function traceSpanInternal(info, fn) {
1737
1857
  const scope = getCurrentScope();
1738
1858
  if (!scope) return await fn(noopActiveSpan());
1739
1859
  const id = generateSpanId();
1860
+ const parentId = scope.activeSpanStack.at(-1)?.id ?? null;
1861
+ const realStartedAt = getRealDateNowMs();
1740
1862
  const spanRecord = {
1741
1863
  id,
1742
- parentId: scope.activeSpanStack.at(-1)?.id ?? null,
1864
+ parentId,
1743
1865
  caseId: scope.caseId,
1744
1866
  kind: info.kind,
1745
1867
  name: info.name,
@@ -1779,7 +1901,7 @@ async function traceSpanInternal(info, fn) {
1779
1901
  const recording = deserializeCacheRecording(hit.recording);
1780
1902
  replayRecording(scope, spanRecord, recording, { generateSpanId });
1781
1903
  spanRecord.status = recording.finalStatus ?? (hasSpanError(spanRecord) ? "error" : "ok");
1782
- spanRecord.endedAt = (/* @__PURE__ */ new Date()).toISOString();
1904
+ spanRecord.endedAt = addElapsedMsToTimestamp(spanRecord.startedAt, getRealDateNowMs() - realStartedAt);
1783
1905
  return recording.returnValue;
1784
1906
  }
1785
1907
  mergeSpanAttributes(spanRecord, { "cache.status": "miss" });
@@ -1798,7 +1920,7 @@ async function traceSpanInternal(info, fn) {
1798
1920
  scope.recordingStack.pop();
1799
1921
  }
1800
1922
  appendSubSpanOps(scope, frame);
1801
- finishSpanWithoutThrownError(spanRecord);
1923
+ finishSpanWithoutThrownError(spanRecord, realStartedAt);
1802
1924
  if (ctx.mode !== "bypass") {
1803
1925
  const recording = {
1804
1926
  returnValue: bodyResult,
@@ -1832,11 +1954,11 @@ async function traceSpanInternal(info, fn) {
1832
1954
  return bodyResult;
1833
1955
  }
1834
1956
  const result = await fn(activeSpan);
1835
- finishSpanWithoutThrownError(spanRecord);
1957
+ finishSpanWithoutThrownError(spanRecord, realStartedAt);
1836
1958
  return result;
1837
1959
  } catch (error) {
1838
1960
  spanRecord.status = "error";
1839
- spanRecord.endedAt = (/* @__PURE__ */ new Date()).toISOString();
1961
+ spanRecord.endedAt = addElapsedMsToTimestamp(spanRecord.startedAt, getRealDateNowMs() - realStartedAt);
1840
1962
  spanRecord.error = normalizeTraceError(error);
1841
1963
  throw error;
1842
1964
  } finally {
@@ -5235,7 +5357,9 @@ async function runCase(params) {
5235
5357
  mode: cacheMode,
5236
5358
  evalId,
5237
5359
  codeFingerprint
5238
- } : void 0
5360
+ } : void 0,
5361
+ startTime: evalDef.startTime,
5362
+ freezeTime: evalDef.freezeTime
5239
5363
  });
5240
5364
  const traceTree = buildTraceTree(scope.spans, scope.checkpoints);
5241
5365
  const nonAssertError = executeError && !(executeError instanceof EvalAssertionError) ? executeError : null;
@@ -5276,6 +5400,7 @@ async function runCase(params) {
5276
5400
  }
5277
5401
  const scoreResults = /* @__PURE__ */ new Map();
5278
5402
  const scoringTraces = {};
5403
+ const scoreStartTime = getEvalClockStateTimeMs(scope.evalClockState) ?? evalDef.startTime;
5279
5404
  if (!nonAssertError && scope.assertionFailures.length === 0 && evalDef.scores) for (const [key, def] of Object.entries(evalDef.scores)) {
5280
5405
  const { compute, passThreshold, label } = normalizeScoreDef(def);
5281
5406
  const scoreRun = await runInEvalScope(evalCase.id, async () => {
@@ -5295,7 +5420,9 @@ async function runCase(params) {
5295
5420
  mode: cacheMode,
5296
5421
  evalId: `${evalId}__score__${key}`,
5297
5422
  codeFingerprint
5298
- } : void 0
5423
+ } : void 0,
5424
+ startTime: scoreStartTime,
5425
+ freezeTime: evalDef.freezeTime
5299
5426
  });
5300
5427
  const { trace, traceDisplay } = resolveTracePresentation(scoreRun.scope.spans, globalTraceDisplay, evalDef.traceDisplay);
5301
5428
  scope.logs.push(...scoreRun.scope.logs.map((entry) => ({
@@ -5608,7 +5735,7 @@ async function executeRun({ runState, request, runDir, config, evals, cacheStore
5608
5735
  await runInEvalRuntimeScope("cases", async () => {
5609
5736
  await entry.use(async (evalDef) => {
5610
5737
  const cases = filterEvalCases(resolveRunnableEvalCases({
5611
- cases: typeof evalDef.cases === "function" ? await evalDef.cases() : evalDef.cases ?? [],
5738
+ cases: await runWithEvalClock(evalDef.startTime, async () => typeof evalDef.cases === "function" ? await evalDef.cases() : evalDef.cases ?? [], { freezeTime: evalDef.freezeTime }),
5612
5739
  evalId: evalMeta.id
5613
5740
  }), request.target.evalIds, request.target.caseIds, evalMeta.id);
5614
5741
  runState.summary.totalCases += cases.length;
@@ -5811,4 +5938,4 @@ function toLastRunStatus(status) {
5811
5938
  return status === "pending" ? null : status;
5812
5939
  }
5813
5940
  //#endregion
5814
- export { removeDefaultConfigSchema as $, columnKindSchema as $t, extractApiCalls as A, setEvalOutput as An, cacheFileSchema as At, DEFAULT_API_CALLS_CONFIG as B, traceAttributeDisplayFormatSchema as Bt, validateCharts as C, incrementEvalOutput as Cn, evalChartTooltipExtraSchema as Ct, sseEnvelopeSchema as D, runInEvalRuntimeScope as Dn, cacheDebugKeyFileSchema as Dt, updateManualScoreRequestSchema as E, nextEvalId as En, cacheDebugKeyEntrySchema as Et, deriveScopedSummaryFromCases as F, getEvalRegistry as Fn, cacheRecordingSchema as Ft, apiCallMetricSchema as G, traceDisplayInputConfigSchema as Gt, agentEvalsConfigSchema as H, traceAttributeDisplayPlacementSchema as Ht, deriveStatusFromCaseRows as I, cacheStatusSchema as It, llmCallMetricFormatSchema as J, traceSpanSchema as Jt, apiCallsConfigSchema as K, traceSpanErrorSchema as Kt, deriveStatusFromChildStatuses as L, serializedCacheSpanSchema as Lt, getNestedAttribute as M, startEvalBackgroundJob as Mn, cacheModeSchema as Mt, getEvalTitle as N, repoFile as Nn, cacheOperationTypeSchema as Nt, extractCacheEntries as O, runInEvalScope as On, cacheEntrySchema as Ot, getEvalDisplayStatus as P, defineEval as Pn, cacheRecordingOpSchema as Pt, llmCallsConfigSchema as Q, columnFormatSchema as Qt, runManifestSchema as R, spanCacheOptionsSchema as Rt, normalizeScoreDef as S, getEvalCaseInput as Sn, evalChartMetricSchema as St, createRunRequestSchema as T, mergeEvalOutput as Tn, evalChartsConfigSchema as Tt, apiCallMetricFormatSchema as U, traceAttributeDisplaySchema as Ut, DEFAULT_LLM_CALLS_CONFIG as V, traceAttributeDisplayInputSchema as Vt, apiCallMetricPlacementSchema as W, traceDisplayConfigSchema as Wt, llmCallMetricSchema as X, cellValueSchema as Xt, llmCallMetricPlacementSchema as Y, traceSpanWarningSchema as Yt, llmCallPricingSchema as Z, columnDefSchema as Zt, loadEvalModule as _, appendToEvalOutput as _n, evalChartAggregateSchema as _t, loadPersistedRunSnapshot as a, z$1 as an, caseDetailSchema as at, loadConfig 
as b, evalLog as bn, evalChartColorSchema as bt, persistCaseDetail as c, evalSpan as cn, evalStatAggregateSchema as ct, recomputePersistedCaseStatus as d, hashCacheKeySync as dn, evalSummarySchema as dt, fileRefSchema as en, resolveApiCallsConfig as et, runTouchesEval as f, deserializeCacheRecording as fn, runLogEntrySchema as ft, setLatestRunInfoMap as g, EvalAssertionError as gn, scoreTraceSchema as gt, getTargetEvalIds as h, serializeCacheValue as hn, runLogPhaseSchema as ht, getLatestRunInfos as i, runArtifactRefSchema as in, assertionFailureSchema as it, extractLlmCalls as j, setScopeCacheContext as jn, cacheListItemSchema as jt, extractCacheHits as k, runInExistingEvalScope as kn, cacheEntryWithDebugKeySchema as kt, persistRunState as l, evalTracer as ln, evalStatItemSchema as lt, buildEvalSummary as m, serializeCacheRecording as mn, runLogLocationSchema as mt, generateRunId as n, numberDisplayOptionsSchema as nn, runLogsConfigSchema as nt, loadPersistedRunSnapshots as o, buildTraceTree as on, caseRowSchema as ot, resolveArtifactPath as p, deserializeCacheValue as pn, runLogLevelSchema as pt, defaultConfigKeySchema as q, traceSpanKindSchema as qt, getLastRunStatuses as r, repoFileRefSchema as rn, trialSelectionModeSchema as rt, nextShortIdFromSnapshots as s, captureEvalSpanError as sn, evalFreshnessStatusSchema as st, executeRun as t, jsonCellSchema as tn, resolveLlmCallsConfig as tt, recomputeEvalStatusesInRuns as u, hashCacheKey as un, evalStatsConfigSchema as ut, parseEvalMetas as v, configureEvalRunLogs as vn, evalChartAxisSchema as vt, createFsCacheStore as w, isInEvalScope as wn, evalChartTypeSchema as wt, buildDeclaredColumnDefs as x, getCurrentScope as xn, evalChartConfigSchema as xt, resolveEvalDefaultConfig as y, evalAssert as yn, evalChartBuiltinMetricSchema as yt, runSummarySchema as z, traceCacheRefSchema as zt };
5941
+ export { removeDefaultConfigSchema as $, columnKindSchema as $t, extractApiCalls as A, runInEvalScope as An, cacheFileSchema as At, DEFAULT_API_CALLS_CONFIG as B, traceAttributeDisplayFormatSchema as Bt, validateCharts as C, getEvalCaseInput as Cn, evalChartTooltipExtraSchema as Ct, sseEnvelopeSchema as D, mergeEvalOutput as Dn, cacheDebugKeyFileSchema as Dt, updateManualScoreRequestSchema as E, isInEvalScope as En, cacheDebugKeyEntrySchema as Et, deriveScopedSummaryFromCases as F, repoFile as Fn, cacheRecordingSchema as Ft, apiCallMetricSchema as G, traceDisplayInputConfigSchema as Gt, agentEvalsConfigSchema as H, traceAttributeDisplayPlacementSchema as Ht, deriveStatusFromCaseRows as I, defineEval as In, cacheStatusSchema as It, llmCallMetricFormatSchema as J, traceSpanSchema as Jt, apiCallsConfigSchema as K, traceSpanErrorSchema as Kt, deriveStatusFromChildStatuses as L, getEvalRegistry as Ln, serializedCacheSpanSchema as Lt, getNestedAttribute as M, setEvalOutput as Mn, cacheModeSchema as Mt, getEvalTitle as N, setScopeCacheContext as Nn, cacheOperationTypeSchema as Nt, extractCacheEntries as O, nextEvalId as On, cacheEntrySchema as Ot, getEvalDisplayStatus as P, startEvalBackgroundJob as Pn, cacheRecordingOpSchema as Pt, llmCallsConfigSchema as Q, columnFormatSchema as Qt, runManifestSchema as R, spanCacheOptionsSchema as Rt, normalizeScoreDef as S, getCurrentScope as Sn, evalChartMetricSchema as St, createRunRequestSchema as T, incrementEvalOutput as Tn, evalChartsConfigSchema as Tt, apiCallMetricFormatSchema as U, traceAttributeDisplaySchema as Ut, DEFAULT_LLM_CALLS_CONFIG as V, traceAttributeDisplayInputSchema as Vt, apiCallMetricPlacementSchema as W, traceDisplayConfigSchema as Wt, llmCallMetricSchema as X, cellValueSchema as Xt, llmCallMetricPlacementSchema as Y, traceSpanWarningSchema as Yt, llmCallPricingSchema as Z, columnDefSchema as Zt, loadEvalModule as _, advanceEvalTime as _n, evalChartAggregateSchema as _t, loadPersistedRunSnapshot as a, z$1 as 
an, caseDetailSchema as at, loadConfig as b, evalAssert as bn, evalChartColorSchema as bt, persistCaseDetail as c, evalSpan as cn, evalStatAggregateSchema as ct, recomputePersistedCaseStatus as d, hashCacheKeySync as dn, evalSummarySchema as dt, fileRefSchema as en, resolveApiCallsConfig as et, runTouchesEval as f, deserializeCacheRecording as fn, runLogEntrySchema as ft, setLatestRunInfoMap as g, EvalAssertionError as gn, scoreTraceSchema as gt, getTargetEvalIds as h, serializeCacheValue as hn, runLogPhaseSchema as ht, getLatestRunInfos as i, runArtifactRefSchema as in, assertionFailureSchema as it, extractLlmCalls as j, runInExistingEvalScope as jn, cacheListItemSchema as jt, extractCacheHits as k, runInEvalRuntimeScope as kn, cacheEntryWithDebugKeySchema as kt, persistRunState as l, evalTracer as ln, evalStatItemSchema as lt, buildEvalSummary as m, serializeCacheRecording as mn, runLogLocationSchema as mt, generateRunId as n, numberDisplayOptionsSchema as nn, runLogsConfigSchema as nt, loadPersistedRunSnapshots as o, buildTraceTree as on, caseRowSchema as ot, resolveArtifactPath as p, deserializeCacheValue as pn, runLogLevelSchema as pt, defaultConfigKeySchema as q, traceSpanKindSchema as qt, getLastRunStatuses as r, repoFileRefSchema as rn, trialSelectionModeSchema as rt, nextShortIdFromSnapshots as s, captureEvalSpanError as sn, evalFreshnessStatusSchema as st, executeRun as t, jsonCellSchema as tn, resolveLlmCallsConfig as tt, recomputeEvalStatusesInRuns as u, hashCacheKey as un, evalStatsConfigSchema as ut, parseEvalMetas as v, appendToEvalOutput as vn, evalChartAxisSchema as vt, createFsCacheStore as w, getEvalStartTime as wn, evalChartTypeSchema as wt, buildDeclaredColumnDefs as x, evalLog as xn, evalChartConfigSchema as xt, resolveEvalDefaultConfig as y, configureEvalRunLogs as yn, evalChartBuiltinMetricSchema as yt, runSummarySchema as z, traceCacheRefSchema as zt };
@@ -1,5 +1,5 @@
1
- import { n as createRunner } from "./cli-C0EtHhEO.mjs";
2
- import "./src-D-HuV8I-.mjs";
1
+ import { n as createRunner } from "./cli-D3QNOcPN.mjs";
2
+ import "./src-CcXfWT4M.mjs";
3
3
  //#region ../../apps/server/src/runner.ts
4
4
  let runnerInstance = null;
5
5
  function getRunnerInstance() {
@@ -1,2 +1,2 @@
1
- import { n as initRunner, t as getRunnerInstance } from "./runner-C9nP2VKL.mjs";
1
+ import { n as initRunner, t as getRunnerInstance } from "./runner-4yNYRvmF.mjs";
2
2
  export { getRunnerInstance, initRunner };
@@ -0,0 +1,3 @@
1
+ import "./runOrchestration-CimthgI7.mjs";
2
+ import "./cli-D3QNOcPN.mjs";
3
+ export {};
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@ls-stack/agent-eval",
3
- "version": "0.22.0",
3
+ "version": "0.23.0",
4
4
  "type": "module",
5
5
  "bin": {
6
6
  "agent-evals": "./dist/bin.mjs"
@@ -156,6 +156,19 @@ for settlement; promise and span errors keep their normal behavior. Use
156
156
  `waitForBackgroundJob: false` on a span, or `waitForBackgroundJobs: false` on an
157
157
  eval definition, when background work should not delay finalization.
158
158
 
159
+ Eval Date APIs use a shifted wall clock by default: `new Date()` and
160
+ `Date.now()` start at `2026-04-10T00:00:00.000Z` during case generation,
161
+ execution, tracing, derived outputs, and scorers, then continue advancing with
162
+ real elapsed time. Set `startTime` on a specific `defineEval(...)` to use
163
+ another initial clock value, or set `startTime: 'now'` for that eval to use the
164
+ real current clock. Timers are not faked, so async waits still run normally.
165
+ Set `freezeTime: true` to keep Date APIs frozen until they are moved manually.
166
+ Use `getEvalStartTime()` to read the captured wall-clock start as a `Date`.
167
+ Use `advanceEvalTime(unit, amount)` inside an eval to move the shifted clock
168
+ forward; supported units are `millisecond(s)`, `second(s)`, `minute(s)`,
169
+ `hour(s)`, and `day(s)`. It throws for evals with `startTime: 'now'`, unless
170
+ `freezeTime: true` is also set.
171
+
159
172
  For libraries or observability exporters that already emit span lifecycle
160
173
  events, use `evalTracer.startSpan(...)`, `evalTracer.updateSpan(...)`,
161
174
  `evalTracer.endSpan(...)`, or `evalTracer.recordSpan(...)` to translate those
@@ -1,3 +0,0 @@
1
- import "./runOrchestration-D1edUDhp.mjs";
2
- import "./cli-C0EtHhEO.mjs";
3
- export {};