@ls-stack/agent-eval 0.22.0 → 0.23.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{app-moDHbg1O.mjs → app-Cw79dJDr.mjs} +3 -3
- package/dist/bin.mjs +1 -1
- package/dist/{cli-C0EtHhEO.mjs → cli-D3QNOcPN.mjs} +3 -3
- package/dist/index.d.mts +56 -3
- package/dist/index.mjs +4 -4
- package/dist/runChild.mjs +1 -1
- package/dist/{runOrchestration-D1edUDhp.mjs → runOrchestration-CimthgI7.mjs} +139 -12
- package/dist/{runner-C9nP2VKL.mjs → runner-4yNYRvmF.mjs} +2 -2
- package/dist/{runner-CyRhIzci.mjs → runner-B-SYzW8w.mjs} +1 -1
- package/dist/src-CcXfWT4M.mjs +3 -0
- package/package.json +1 -1
- package/skills/agent-eval/SKILL.md +13 -0
- package/dist/src-D-HuV8I-.mjs +0 -3
|
@@ -1,6 +1,6 @@
|
|
|
1
|
-
import { E as updateManualScoreRequestSchema, T as createRunRequestSchema } from "./runOrchestration-D1edUDhp.mjs";
|
|
2
|
-
import "./src-D-HuV8I-.mjs";
|
|
3
|
-
import { t as getRunnerInstance } from "./runner-C9nP2VKL.mjs";
|
|
1
|
+
import { E as updateManualScoreRequestSchema, T as createRunRequestSchema } from "./runOrchestration-CimthgI7.mjs";
|
|
2
|
+
import "./src-CcXfWT4M.mjs";
|
|
3
|
+
import { t as getRunnerInstance } from "./runner-4yNYRvmF.mjs";
|
|
4
4
|
import { readFile } from "node:fs/promises";
|
|
5
5
|
import { dirname, isAbsolute, join, relative, resolve, sep } from "node:path";
|
|
6
6
|
import { z } from "zod/v4";
|
package/dist/bin.mjs
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import { C as validateCharts, F as deriveScopedSummaryFromCases,
|
|
1
|
+
import { C as validateCharts, F as deriveScopedSummaryFromCases, Ln as getEvalRegistry, N as getEvalTitle, P as getEvalDisplayStatus, S as normalizeScoreDef, _ as loadEvalModule, a as loadPersistedRunSnapshot, b as loadConfig, c as persistCaseDetail, d as recomputePersistedCaseStatus, et as resolveApiCallsConfig, f as runTouchesEval, g as setLatestRunInfoMap, h as getTargetEvalIds, i as getLatestRunInfos, l as persistRunState, m as buildEvalSummary, n as generateRunId, o as loadPersistedRunSnapshots, p as resolveArtifactPath, r as getLastRunStatuses, s as nextShortIdFromSnapshots, tt as resolveLlmCallsConfig, u as recomputeEvalStatusesInRuns, v as parseEvalMetas, w as createFsCacheStore, x as buildDeclaredColumnDefs, y as resolveEvalDefaultConfig, z as runSummarySchema } from "./runOrchestration-CimthgI7.mjs";
|
|
2
2
|
import { createHash } from "node:crypto";
|
|
3
3
|
import { mkdir, readFile, rm, writeFile } from "node:fs/promises";
|
|
4
4
|
import { dirname, join, relative, resolve } from "node:path";
|
|
@@ -959,8 +959,8 @@ async function commandApp(args) {
|
|
|
959
959
|
const { serve } = await import("@hono/node-server");
|
|
960
960
|
const bundledWebDist = resolve(currentDir, "apps/web/dist");
|
|
961
961
|
if (existsSync(bundledWebDist)) process.env.AGENT_EVALS_WEB_DIST = bundledWebDist;
|
|
962
|
-
const appModule = await import("./app-moDHbg1O.mjs");
|
|
963
|
-
const runnerModule = await import("./runner-CyRhIzci.mjs");
|
|
962
|
+
const appModule = await import("./app-Cw79dJDr.mjs");
|
|
963
|
+
const runnerModule = await import("./runner-B-SYzW8w.mjs");
|
|
964
964
|
if (!isHonoAppModule(appModule)) throw new Error("Server app module is invalid");
|
|
965
965
|
if (!isServerRunnerModule(runnerModule)) throw new Error("Server runner module is invalid");
|
|
966
966
|
await runnerModule.initRunner();
|
package/dist/index.d.mts
CHANGED
|
@@ -2758,6 +2758,12 @@ type EvalCase<TInput> = {
|
|
|
2758
2758
|
};
|
|
2759
2759
|
/** Runtime output values collected from output helpers and `deriveFromTracing`. */
|
|
2760
2760
|
type EvalOutputs = Record<string, unknown>;
|
|
2761
|
+
/**
|
|
2762
|
+
* Initial wall-clock time used by an eval's shifted Date clock.
|
|
2763
|
+
*
|
|
2764
|
+
* Pass `'now'` to opt one eval back into the real current clock.
|
|
2765
|
+
*/
|
|
2766
|
+
type EvalStartTime = Date | number | string;
|
|
2761
2767
|
/**
|
|
2762
2768
|
* Schema used to validate and type an eval's collected runtime outputs.
|
|
2763
2769
|
*
|
|
@@ -2925,6 +2931,23 @@ type EvalDefinitionBase<TInput = unknown, TOutputs extends EvalOutputs = EvalOut
|
|
|
2925
2931
|
* delay case finalization; late mutations are not guaranteed to persist.
|
|
2926
2932
|
*/
|
|
2927
2933
|
waitForBackgroundJobs?: boolean;
|
|
2934
|
+
/**
|
|
2935
|
+
* Optional initial wall-clock time for this eval's runtime.
|
|
2936
|
+
*
|
|
2937
|
+
* When set, `new Date()` and `Date.now()` inside case generation, execution,
|
|
2938
|
+
* tracing, derived outputs, and scorers start from this wall-clock value and
|
|
2939
|
+
* then continue advancing with real elapsed time. The default is
|
|
2940
|
+
* `2026-04-10T00:00:00.000Z`. Pass `'now'` to use the real current clock for
|
|
2941
|
+
* this eval. Timers are not faked, so `setTimeout` and other asynchronous
|
|
2942
|
+
* work still run normally.
|
|
2943
|
+
*/
|
|
2944
|
+
startTime?: EvalStartTime;
|
|
2945
|
+
/**
|
|
2946
|
+
* Freeze the eval Date clock at `startTime` until `advanceEvalTime(...)`
|
|
2947
|
+
* moves it manually. Defaults to `false`, so eval time advances with real
|
|
2948
|
+
* elapsed time from the configured `startTime`.
|
|
2949
|
+
*/
|
|
2950
|
+
freezeTime?: boolean;
|
|
2928
2951
|
execute: (ctx: EvalExecuteContext<TInput, TOutputs>) => Promise<void> | void;
|
|
2929
2952
|
deriveFromTracing?: (ctx: EvalDeriveContext<TInput>) => Partial<TOutputs> | Promise<Partial<TOutputs>>;
|
|
2930
2953
|
scores?: Record<string, EvalScoreDef<TInput, TOutputs>>;
|
|
@@ -3018,6 +3041,9 @@ declare function defineEval<TInput = unknown, TOutputs extends EvalOutputs = Eva
|
|
|
3018
3041
|
declare function repoFile(path: string, mimeType?: string): RepoFileRef;
|
|
3019
3042
|
//#endregion
|
|
3020
3043
|
//#region ../sdk/src/runtime.d.ts
|
|
3044
|
+
declare global {
|
|
3045
|
+
var __agentEvalsRealDate: DateConstructor | undefined;
|
|
3046
|
+
}
|
|
3021
3047
|
/**
|
|
3022
3048
|
* Raw-key debug payload passed alongside cache writes.
|
|
3023
3049
|
*
|
|
@@ -3063,7 +3089,15 @@ type CacheRecordingFrame = {
|
|
|
3063
3089
|
};
|
|
3064
3090
|
/** Mutable per-case runtime state stored in async local storage. */
|
|
3065
3091
|
type EvalCaseScope = {
|
|
3066
|
-
caseId: string; /**
|
|
3092
|
+
caseId: string; /** Initial wall-clock time used by Date APIs inside this eval case. */
|
|
3093
|
+
startTime: EvalStartTime | undefined; /** Mutable shifted wall-clock state shared across this eval case's phases. */
|
|
3094
|
+
evalClockState: {
|
|
3095
|
+
startMs: number;
|
|
3096
|
+
realStartMs: number;
|
|
3097
|
+
offsetMs: number;
|
|
3098
|
+
frozen: boolean;
|
|
3099
|
+
shifted: boolean;
|
|
3100
|
+
}; /** Stable prefix used by `nextEvalId()` for this eval case scope. */
|
|
3067
3101
|
idPrefix: string | undefined; /** Monotonic per-scope counter used by `nextEvalId()`. */
|
|
3068
3102
|
nextEvalIdCounter: number; /** Authored input for the current case, when provided by the runner. */
|
|
3069
3103
|
input?: unknown;
|
|
@@ -3101,11 +3135,28 @@ type EvalCaseScope = {
|
|
|
3101
3135
|
* modules imported while a run is being prepared.
|
|
3102
3136
|
*/
|
|
3103
3137
|
type EvalRuntimeScope = 'env' | 'cases' | 'eval' | 'derive' | 'outputsSchema' | 'scorer';
|
|
3138
|
+
/** Time unit accepted by `advanceEvalTime(unit, amount)`. */
|
|
3139
|
+
type EvalTimeUnit = 'millisecond' | 'milliseconds' | 'second' | 'seconds' | 'minute' | 'minutes' | 'hour' | 'hours' | 'day' | 'days';
|
|
3104
3140
|
type EvalLogLevelInput = RunLogLevel | 'warning';
|
|
3105
3141
|
/** Error thrown when an eval assertion fails during case execution. */
|
|
3106
3142
|
declare class EvalAssertionError extends Error {
|
|
3107
3143
|
constructor(message: string);
|
|
3108
3144
|
}
|
|
3145
|
+
/**
|
|
3146
|
+
* Return the wall-clock start time captured for the active eval.
|
|
3147
|
+
*
|
|
3148
|
+
* For `startTime: 'now'`, this is the real time captured when the eval clock
|
|
3149
|
+
* context was created.
|
|
3150
|
+
*/
|
|
3151
|
+
declare function getEvalStartTime(): Date;
|
|
3152
|
+
/**
|
|
3153
|
+
* Advance the active eval's shifted Date clock and return the new time.
|
|
3154
|
+
*
|
|
3155
|
+
* Throws outside an active shifted eval clock. Evals that set
|
|
3156
|
+
* `startTime: 'now'` use the real current clock unless `freezeTime: true` is
|
|
3157
|
+
* also set.
|
|
3158
|
+
*/
|
|
3159
|
+
declare function advanceEvalTime(unit: EvalTimeUnit, amount: number): Date;
|
|
3109
3160
|
/** Return the current eval scope for the active async context, if any. */
|
|
3110
3161
|
declare function getCurrentScope(): EvalCaseScope | undefined;
|
|
3111
3162
|
/**
|
|
@@ -3155,7 +3206,9 @@ type RunInEvalScopeOptions = {
|
|
|
3155
3206
|
idPrefix?: string; /** Cache adapter + mode attached to the scope before `fn` runs. */
|
|
3156
3207
|
cacheContext?: CacheScopeContext; /** Whether registered background jobs should settle before scope finalizes. */
|
|
3157
3208
|
waitForBackgroundJobs?: boolean; /** Eval runner phase exposed through `isInEvalScope()`. Defaults to `eval`. */
|
|
3158
|
-
runtimeScope?: EvalRuntimeScope;
|
|
3209
|
+
runtimeScope?: EvalRuntimeScope; /** Initial wall-clock time used by `new Date()` and `Date.now()` in this eval. */
|
|
3210
|
+
startTime?: EvalStartTime; /** Whether Date APIs stay frozen until advanced manually. */
|
|
3211
|
+
freezeTime?: boolean;
|
|
3159
3212
|
};
|
|
3160
3213
|
/** Execute a callback while `isInEvalScope()` reports a runner phase. */
|
|
3161
3214
|
declare function runInEvalRuntimeScope<T>(runtimeScope: EvalRuntimeScope, fn: () => Promise<T> | T): Promise<T>;
|
|
@@ -3570,4 +3623,4 @@ declare function createRunner({
|
|
|
3570
3623
|
*/
|
|
3571
3624
|
declare function runCli(argv: string[]): Promise<void>;
|
|
3572
3625
|
//#endregion
|
|
3573
|
-
export { type AgentEvalsConfig, type ApiCallEntry, type ApiCallMetric, type ApiCallMetricFormat, type ApiCallMetricPlacement, type ApiCallMetricValue, type ApiCallsConfigInput, type AssertionFailure, type CacheActivityEntry, type CacheAdapter, type CacheDebugKeyEntry, type CacheDebugKeyFile, type CacheDebugKeyWrite, type CacheEntry, type CacheEntryWithDebugKey, type CacheFile, type CacheHitEntry, type CacheKeyHashInput, type CacheKeyHashOptions, type CacheListItem, type CacheMode, type CacheOperationType, type CacheRecording, type CacheRecordingFrame, type CacheRecordingOp, type CacheScopeContext, type CacheStatus, type CaptureEvalSpanErrorLevel, type CaptureEvalSpanErrorOptions, type CaseDetail, type CaseRow, type CellValue, type ColumnDef, type ColumnFormat, type ColumnKind, type CreateRunRequest, DEFAULT_API_CALLS_CONFIG, DEFAULT_LLM_CALLS_CONFIG, type DefaultConfigKey, type DerivedStatus, EvalAssertionError, type EvalCase, type EvalCaseScope, type EvalChartAggregate, type EvalChartAxis, type EvalChartBuiltinMetric, type EvalChartColor, type EvalChartConfig, type EvalChartMetric, type EvalChartTooltipExtra, type EvalChartType, type EvalChartsConfig, type EvalColumnOverride, type EvalColumns, type EvalDefinition, type EvalDeriveContext, type EvalDisplayStatus, type EvalExecuteContext, type EvalFreshnessStatus, type EvalManualScoreDef, type EvalOutputs, type EvalOutputsSchema, type EvalRunner, type EvalRuntimeScope, type EvalScoreContext, type EvalScoreDef, type EvalScoreFn, type EvalSetOutput, type EvalStatAggregate, type EvalStatItem, type EvalStatsConfig, type EvalSummary, type EvalTraceSpan, type EvalTraceSpanError, type EvalTraceSpanWarning, type EvalTraceTree, type FileRef, type JsonCell, type LlmCallEntry, type LlmCallMetric, type LlmCallMetricFormat, type LlmCallMetricPlacement, type LlmCallMetricValue, type LlmCallPricing, type LlmCallsConfigInput, type NumberDisplayOptions, type RemoveDefaultConfig, type RepoFileRef, type ResolvedApiCallMetric, type 
ResolvedApiCallsConfig, type ResolvedLlmCallMetric, type ResolvedLlmCallPricing, type ResolvedLlmCallsConfig, type RunArtifactRef, type RunInEvalScopeOptions, type RunLogEntry, type RunLogLevel, type RunLogLocation, type RunLogPhase, type RunLogsConfigInput, type RunManifest, type RunSummary, type ScalarCell, type ScopedCaseSummary, type ScoreTrace, type SerializedCacheSpan, type SerializedCacheValue, type SpanCacheOptions, type SseEnvelope, type SseEventType, type TraceActiveSpan, type TraceAttributeDisplay, type TraceAttributeDisplayFormat, type TraceAttributeDisplayInput, type TraceAttributeDisplayPlacement, type TraceAttributeTransform, type TraceAttributeTransformContext, type TraceCacheInfo, type TraceCacheRef, type TraceDisplayConfig, type TraceDisplayInputConfig, type TraceSpanInfo, type TrialSelectionMode, type UpdateManualScoreRequest, agentEvalsConfigSchema, apiCallMetricFormatSchema, apiCallMetricPlacementSchema, apiCallMetricSchema, apiCallsConfigSchema, appendToEvalOutput, assertionFailureSchema, buildTraceTree, cacheDebugKeyEntrySchema, cacheDebugKeyFileSchema, cacheEntrySchema, cacheEntryWithDebugKeySchema, cacheFileSchema, cacheListItemSchema, cacheModeSchema, cacheOperationTypeSchema, cacheRecordingOpSchema, cacheRecordingSchema, cacheStatusSchema, captureEvalSpanError, caseDetailSchema, caseRowSchema, cellValueSchema, columnDefSchema, columnFormatSchema, columnKindSchema, createRunRequestSchema, createRunner, defaultConfigKeySchema, defineEval, deriveScopedSummaryFromCases, deriveStatusFromCaseRows, deriveStatusFromChildStatuses, deserializeCacheRecording, deserializeCacheValue, evalAssert, evalChartAggregateSchema, evalChartAxisSchema, evalChartBuiltinMetricSchema, evalChartColorSchema, evalChartConfigSchema, evalChartMetricSchema, evalChartTooltipExtraSchema, evalChartTypeSchema, evalChartsConfigSchema, evalFreshnessStatusSchema, evalLog, evalSpan, evalStatAggregateSchema, evalStatItemSchema, evalStatsConfigSchema, evalSummarySchema, 
evalTracer, extractApiCalls, extractCacheEntries, extractCacheHits, extractLlmCalls, fileRefSchema, getCurrentScope, getEvalCaseInput, getEvalDisplayStatus, getEvalRegistry, getEvalTitle, getNestedAttribute, hashCacheKey, hashCacheKeySync, incrementEvalOutput, isInEvalScope, jsonCellSchema, llmCallMetricFormatSchema, llmCallMetricPlacementSchema, llmCallMetricSchema, llmCallPricingSchema, llmCallsConfigSchema, mergeEvalOutput, nextEvalId, numberDisplayOptionsSchema, removeDefaultConfigSchema, repoFile, repoFileRefSchema, resolveApiCallsConfig, resolveLlmCallsConfig, runArtifactRefSchema, runCli, runInEvalRuntimeScope, runInEvalScope, runInExistingEvalScope, runLogEntrySchema, runLogLevelSchema, runLogLocationSchema, runLogPhaseSchema, runLogsConfigSchema, runManifestSchema, runSummarySchema, scoreTraceSchema, serializeCacheRecording, serializeCacheValue, serializedCacheSpanSchema, setEvalOutput, setScopeCacheContext, spanCacheOptionsSchema, sseEnvelopeSchema, startEvalBackgroundJob, traceAttributeDisplayFormatSchema, traceAttributeDisplayInputSchema, traceAttributeDisplayPlacementSchema, traceAttributeDisplaySchema, traceCacheRefSchema, traceDisplayConfigSchema, traceDisplayInputConfigSchema, traceSpanErrorSchema, traceSpanKindSchema, traceSpanSchema, traceSpanWarningSchema, trialSelectionModeSchema, updateManualScoreRequestSchema, z };
|
|
3626
|
+
export { type AgentEvalsConfig, type ApiCallEntry, type ApiCallMetric, type ApiCallMetricFormat, type ApiCallMetricPlacement, type ApiCallMetricValue, type ApiCallsConfigInput, type AssertionFailure, type CacheActivityEntry, type CacheAdapter, type CacheDebugKeyEntry, type CacheDebugKeyFile, type CacheDebugKeyWrite, type CacheEntry, type CacheEntryWithDebugKey, type CacheFile, type CacheHitEntry, type CacheKeyHashInput, type CacheKeyHashOptions, type CacheListItem, type CacheMode, type CacheOperationType, type CacheRecording, type CacheRecordingFrame, type CacheRecordingOp, type CacheScopeContext, type CacheStatus, type CaptureEvalSpanErrorLevel, type CaptureEvalSpanErrorOptions, type CaseDetail, type CaseRow, type CellValue, type ColumnDef, type ColumnFormat, type ColumnKind, type CreateRunRequest, DEFAULT_API_CALLS_CONFIG, DEFAULT_LLM_CALLS_CONFIG, type DefaultConfigKey, type DerivedStatus, EvalAssertionError, type EvalCase, type EvalCaseScope, type EvalChartAggregate, type EvalChartAxis, type EvalChartBuiltinMetric, type EvalChartColor, type EvalChartConfig, type EvalChartMetric, type EvalChartTooltipExtra, type EvalChartType, type EvalChartsConfig, type EvalColumnOverride, type EvalColumns, type EvalDefinition, type EvalDeriveContext, type EvalDisplayStatus, type EvalExecuteContext, type EvalFreshnessStatus, type EvalManualScoreDef, type EvalOutputs, type EvalOutputsSchema, type EvalRunner, type EvalRuntimeScope, type EvalScoreContext, type EvalScoreDef, type EvalScoreFn, type EvalSetOutput, type EvalStartTime, type EvalStatAggregate, type EvalStatItem, type EvalStatsConfig, type EvalSummary, type EvalTimeUnit, type EvalTraceSpan, type EvalTraceSpanError, type EvalTraceSpanWarning, type EvalTraceTree, type FileRef, type JsonCell, type LlmCallEntry, type LlmCallMetric, type LlmCallMetricFormat, type LlmCallMetricPlacement, type LlmCallMetricValue, type LlmCallPricing, type LlmCallsConfigInput, type NumberDisplayOptions, type RemoveDefaultConfig, type 
RepoFileRef, type ResolvedApiCallMetric, type ResolvedApiCallsConfig, type ResolvedLlmCallMetric, type ResolvedLlmCallPricing, type ResolvedLlmCallsConfig, type RunArtifactRef, type RunInEvalScopeOptions, type RunLogEntry, type RunLogLevel, type RunLogLocation, type RunLogPhase, type RunLogsConfigInput, type RunManifest, type RunSummary, type ScalarCell, type ScopedCaseSummary, type ScoreTrace, type SerializedCacheSpan, type SerializedCacheValue, type SpanCacheOptions, type SseEnvelope, type SseEventType, type TraceActiveSpan, type TraceAttributeDisplay, type TraceAttributeDisplayFormat, type TraceAttributeDisplayInput, type TraceAttributeDisplayPlacement, type TraceAttributeTransform, type TraceAttributeTransformContext, type TraceCacheInfo, type TraceCacheRef, type TraceDisplayConfig, type TraceDisplayInputConfig, type TraceSpanInfo, type TrialSelectionMode, type UpdateManualScoreRequest, advanceEvalTime, agentEvalsConfigSchema, apiCallMetricFormatSchema, apiCallMetricPlacementSchema, apiCallMetricSchema, apiCallsConfigSchema, appendToEvalOutput, assertionFailureSchema, buildTraceTree, cacheDebugKeyEntrySchema, cacheDebugKeyFileSchema, cacheEntrySchema, cacheEntryWithDebugKeySchema, cacheFileSchema, cacheListItemSchema, cacheModeSchema, cacheOperationTypeSchema, cacheRecordingOpSchema, cacheRecordingSchema, cacheStatusSchema, captureEvalSpanError, caseDetailSchema, caseRowSchema, cellValueSchema, columnDefSchema, columnFormatSchema, columnKindSchema, createRunRequestSchema, createRunner, defaultConfigKeySchema, defineEval, deriveScopedSummaryFromCases, deriveStatusFromCaseRows, deriveStatusFromChildStatuses, deserializeCacheRecording, deserializeCacheValue, evalAssert, evalChartAggregateSchema, evalChartAxisSchema, evalChartBuiltinMetricSchema, evalChartColorSchema, evalChartConfigSchema, evalChartMetricSchema, evalChartTooltipExtraSchema, evalChartTypeSchema, evalChartsConfigSchema, evalFreshnessStatusSchema, evalLog, evalSpan, evalStatAggregateSchema, 
evalStatItemSchema, evalStatsConfigSchema, evalSummarySchema, evalTracer, extractApiCalls, extractCacheEntries, extractCacheHits, extractLlmCalls, fileRefSchema, getCurrentScope, getEvalCaseInput, getEvalDisplayStatus, getEvalRegistry, getEvalStartTime, getEvalTitle, getNestedAttribute, hashCacheKey, hashCacheKeySync, incrementEvalOutput, isInEvalScope, jsonCellSchema, llmCallMetricFormatSchema, llmCallMetricPlacementSchema, llmCallMetricSchema, llmCallPricingSchema, llmCallsConfigSchema, mergeEvalOutput, nextEvalId, numberDisplayOptionsSchema, removeDefaultConfigSchema, repoFile, repoFileRefSchema, resolveApiCallsConfig, resolveLlmCallsConfig, runArtifactRefSchema, runCli, runInEvalRuntimeScope, runInEvalScope, runInExistingEvalScope, runLogEntrySchema, runLogLevelSchema, runLogLocationSchema, runLogPhaseSchema, runLogsConfigSchema, runManifestSchema, runSummarySchema, scoreTraceSchema, serializeCacheRecording, serializeCacheValue, serializedCacheSpanSchema, setEvalOutput, setScopeCacheContext, spanCacheOptionsSchema, sseEnvelopeSchema, startEvalBackgroundJob, traceAttributeDisplayFormatSchema, traceAttributeDisplayInputSchema, traceAttributeDisplayPlacementSchema, traceAttributeDisplaySchema, traceCacheRefSchema, traceDisplayConfigSchema, traceDisplayInputConfigSchema, traceSpanErrorSchema, traceSpanKindSchema, traceSpanSchema, traceSpanWarningSchema, trialSelectionModeSchema, updateManualScoreRequestSchema, z };
|
package/dist/index.mjs
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import { $ as removeDefaultConfigSchema, $t as columnKindSchema, A as extractApiCalls, An as
|
|
2
|
-
import { n as createRunner, t as runCli } from "./cli-C0EtHhEO.mjs";
|
|
3
|
-
import "./src-D-HuV8I-.mjs";
|
|
4
|
-
export { DEFAULT_API_CALLS_CONFIG, DEFAULT_LLM_CALLS_CONFIG, EvalAssertionError, agentEvalsConfigSchema, apiCallMetricFormatSchema, apiCallMetricPlacementSchema, apiCallMetricSchema, apiCallsConfigSchema, appendToEvalOutput, assertionFailureSchema, buildTraceTree, cacheDebugKeyEntrySchema, cacheDebugKeyFileSchema, cacheEntrySchema, cacheEntryWithDebugKeySchema, cacheFileSchema, cacheListItemSchema, cacheModeSchema, cacheOperationTypeSchema, cacheRecordingOpSchema, cacheRecordingSchema, cacheStatusSchema, captureEvalSpanError, caseDetailSchema, caseRowSchema, cellValueSchema, columnDefSchema, columnFormatSchema, columnKindSchema, createRunRequestSchema, createRunner, defaultConfigKeySchema, defineEval, deriveScopedSummaryFromCases, deriveStatusFromCaseRows, deriveStatusFromChildStatuses, deserializeCacheRecording, deserializeCacheValue, evalAssert, evalChartAggregateSchema, evalChartAxisSchema, evalChartBuiltinMetricSchema, evalChartColorSchema, evalChartConfigSchema, evalChartMetricSchema, evalChartTooltipExtraSchema, evalChartTypeSchema, evalChartsConfigSchema, evalFreshnessStatusSchema, evalLog, evalSpan, evalStatAggregateSchema, evalStatItemSchema, evalStatsConfigSchema, evalSummarySchema, evalTracer, extractApiCalls, extractCacheEntries, extractCacheHits, extractLlmCalls, fileRefSchema, getCurrentScope, getEvalCaseInput, getEvalDisplayStatus, getEvalRegistry, getEvalTitle, getNestedAttribute, hashCacheKey, hashCacheKeySync, incrementEvalOutput, isInEvalScope, jsonCellSchema, llmCallMetricFormatSchema, llmCallMetricPlacementSchema, llmCallMetricSchema, llmCallPricingSchema, llmCallsConfigSchema, mergeEvalOutput, nextEvalId, numberDisplayOptionsSchema, removeDefaultConfigSchema, repoFile, repoFileRefSchema, resolveApiCallsConfig, resolveLlmCallsConfig, runArtifactRefSchema, runCli, runInEvalRuntimeScope, runInEvalScope, runInExistingEvalScope, runLogEntrySchema, runLogLevelSchema, runLogLocationSchema, runLogPhaseSchema, runLogsConfigSchema, runManifestSchema, 
runSummarySchema, scoreTraceSchema, serializeCacheRecording, serializeCacheValue, serializedCacheSpanSchema, setEvalOutput, setScopeCacheContext, spanCacheOptionsSchema, sseEnvelopeSchema, startEvalBackgroundJob, traceAttributeDisplayFormatSchema, traceAttributeDisplayInputSchema, traceAttributeDisplayPlacementSchema, traceAttributeDisplaySchema, traceCacheRefSchema, traceDisplayConfigSchema, traceDisplayInputConfigSchema, traceSpanErrorSchema, traceSpanKindSchema, traceSpanSchema, traceSpanWarningSchema, trialSelectionModeSchema, updateManualScoreRequestSchema, z };
|
|
1
|
+
import { $ as removeDefaultConfigSchema, $t as columnKindSchema, A as extractApiCalls, An as runInEvalScope, At as cacheFileSchema, B as DEFAULT_API_CALLS_CONFIG, Bt as traceAttributeDisplayFormatSchema, Cn as getEvalCaseInput, Ct as evalChartTooltipExtraSchema, D as sseEnvelopeSchema, Dn as mergeEvalOutput, Dt as cacheDebugKeyFileSchema, E as updateManualScoreRequestSchema, En as isInEvalScope, Et as cacheDebugKeyEntrySchema, F as deriveScopedSummaryFromCases, Fn as repoFile, Ft as cacheRecordingSchema, G as apiCallMetricSchema, Gt as traceDisplayInputConfigSchema, H as agentEvalsConfigSchema, Ht as traceAttributeDisplayPlacementSchema, I as deriveStatusFromCaseRows, In as defineEval, It as cacheStatusSchema, J as llmCallMetricFormatSchema, Jt as traceSpanSchema, K as apiCallsConfigSchema, Kt as traceSpanErrorSchema, L as deriveStatusFromChildStatuses, Ln as getEvalRegistry, Lt as serializedCacheSpanSchema, M as getNestedAttribute, Mn as setEvalOutput, Mt as cacheModeSchema, N as getEvalTitle, Nn as setScopeCacheContext, Nt as cacheOperationTypeSchema, O as extractCacheEntries, On as nextEvalId, Ot as cacheEntrySchema, P as getEvalDisplayStatus, Pn as startEvalBackgroundJob, Pt as cacheRecordingOpSchema, Q as llmCallsConfigSchema, Qt as columnFormatSchema, R as runManifestSchema, Rt as spanCacheOptionsSchema, Sn as getCurrentScope, St as evalChartMetricSchema, T as createRunRequestSchema, Tn as incrementEvalOutput, Tt as evalChartsConfigSchema, U as apiCallMetricFormatSchema, Ut as traceAttributeDisplaySchema, V as DEFAULT_LLM_CALLS_CONFIG, Vt as traceAttributeDisplayInputSchema, W as apiCallMetricPlacementSchema, Wt as traceDisplayConfigSchema, X as llmCallMetricSchema, Xt as cellValueSchema, Y as llmCallMetricPlacementSchema, Yt as traceSpanWarningSchema, Z as llmCallPricingSchema, Zt as columnDefSchema, _n as advanceEvalTime, _t as evalChartAggregateSchema, an as z, at as caseDetailSchema, bn as evalAssert, bt as evalChartColorSchema, cn as evalSpan, ct as 
evalStatAggregateSchema, dn as hashCacheKeySync, dt as evalSummarySchema, en as fileRefSchema, et as resolveApiCallsConfig, fn as deserializeCacheRecording, ft as runLogEntrySchema, gn as EvalAssertionError, gt as scoreTraceSchema, hn as serializeCacheValue, ht as runLogPhaseSchema, in as runArtifactRefSchema, it as assertionFailureSchema, j as extractLlmCalls, jn as runInExistingEvalScope, jt as cacheListItemSchema, k as extractCacheHits, kn as runInEvalRuntimeScope, kt as cacheEntryWithDebugKeySchema, ln as evalTracer, lt as evalStatItemSchema, mn as serializeCacheRecording, mt as runLogLocationSchema, nn as numberDisplayOptionsSchema, nt as runLogsConfigSchema, on as buildTraceTree, ot as caseRowSchema, pn as deserializeCacheValue, pt as runLogLevelSchema, q as defaultConfigKeySchema, qt as traceSpanKindSchema, rn as repoFileRefSchema, rt as trialSelectionModeSchema, sn as captureEvalSpanError, st as evalFreshnessStatusSchema, tn as jsonCellSchema, tt as resolveLlmCallsConfig, un as hashCacheKey, ut as evalStatsConfigSchema, vn as appendToEvalOutput, vt as evalChartAxisSchema, wn as getEvalStartTime, wt as evalChartTypeSchema, xn as evalLog, xt as evalChartConfigSchema, yt as evalChartBuiltinMetricSchema, z as runSummarySchema, zt as traceCacheRefSchema } from "./runOrchestration-CimthgI7.mjs";
|
|
2
|
+
import { n as createRunner, t as runCli } from "./cli-D3QNOcPN.mjs";
|
|
3
|
+
import "./src-CcXfWT4M.mjs";
|
|
4
|
+
export { DEFAULT_API_CALLS_CONFIG, DEFAULT_LLM_CALLS_CONFIG, EvalAssertionError, advanceEvalTime, agentEvalsConfigSchema, apiCallMetricFormatSchema, apiCallMetricPlacementSchema, apiCallMetricSchema, apiCallsConfigSchema, appendToEvalOutput, assertionFailureSchema, buildTraceTree, cacheDebugKeyEntrySchema, cacheDebugKeyFileSchema, cacheEntrySchema, cacheEntryWithDebugKeySchema, cacheFileSchema, cacheListItemSchema, cacheModeSchema, cacheOperationTypeSchema, cacheRecordingOpSchema, cacheRecordingSchema, cacheStatusSchema, captureEvalSpanError, caseDetailSchema, caseRowSchema, cellValueSchema, columnDefSchema, columnFormatSchema, columnKindSchema, createRunRequestSchema, createRunner, defaultConfigKeySchema, defineEval, deriveScopedSummaryFromCases, deriveStatusFromCaseRows, deriveStatusFromChildStatuses, deserializeCacheRecording, deserializeCacheValue, evalAssert, evalChartAggregateSchema, evalChartAxisSchema, evalChartBuiltinMetricSchema, evalChartColorSchema, evalChartConfigSchema, evalChartMetricSchema, evalChartTooltipExtraSchema, evalChartTypeSchema, evalChartsConfigSchema, evalFreshnessStatusSchema, evalLog, evalSpan, evalStatAggregateSchema, evalStatItemSchema, evalStatsConfigSchema, evalSummarySchema, evalTracer, extractApiCalls, extractCacheEntries, extractCacheHits, extractLlmCalls, fileRefSchema, getCurrentScope, getEvalCaseInput, getEvalDisplayStatus, getEvalRegistry, getEvalStartTime, getEvalTitle, getNestedAttribute, hashCacheKey, hashCacheKeySync, incrementEvalOutput, isInEvalScope, jsonCellSchema, llmCallMetricFormatSchema, llmCallMetricPlacementSchema, llmCallMetricSchema, llmCallPricingSchema, llmCallsConfigSchema, mergeEvalOutput, nextEvalId, numberDisplayOptionsSchema, removeDefaultConfigSchema, repoFile, repoFileRefSchema, resolveApiCallsConfig, resolveLlmCallsConfig, runArtifactRefSchema, runCli, runInEvalRuntimeScope, runInEvalScope, runInExistingEvalScope, runLogEntrySchema, runLogLevelSchema, runLogLocationSchema, runLogPhaseSchema, 
runLogsConfigSchema, runManifestSchema, runSummarySchema, scoreTraceSchema, serializeCacheRecording, serializeCacheValue, serializedCacheSpanSchema, setEvalOutput, setScopeCacheContext, spanCacheOptionsSchema, sseEnvelopeSchema, startEvalBackgroundJob, traceAttributeDisplayFormatSchema, traceAttributeDisplayInputSchema, traceAttributeDisplayPlacementSchema, traceAttributeDisplaySchema, traceCacheRefSchema, traceDisplayConfigSchema, traceDisplayInputConfigSchema, traceSpanErrorSchema, traceSpanKindSchema, traceSpanSchema, traceSpanWarningSchema, trialSelectionModeSchema, updateManualScoreRequestSchema, z };
|
package/dist/runChild.mjs
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import { R as runManifestSchema, T as createRunRequestSchema, Tt as evalChartsConfigSchema, Zt as columnDefSchema, b as loadConfig, t as executeRun, ut as evalStatsConfigSchema, v as parseEvalMetas,
|
|
1
|
+
import { R as runManifestSchema, T as createRunRequestSchema, Tt as evalChartsConfigSchema, Zt as columnDefSchema, b as loadConfig, t as executeRun, ut as evalStatsConfigSchema, v as parseEvalMetas, w as createFsCacheStore, yn as configureEvalRunLogs, z as runSummarySchema } from "./runOrchestration-CimthgI7.mjs";
|
|
2
2
|
import { createHash } from "node:crypto";
|
|
3
3
|
import { readFile } from "node:fs/promises";
|
|
4
4
|
import { relative } from "node:path";
|
|
@@ -49,9 +49,46 @@ function repoFile(path, mimeType) {
|
|
|
49
49
|
//#region ../sdk/src/runtime.ts
|
|
50
50
|
const scopeStorage = new AsyncLocalStorage();
|
|
51
51
|
const runtimeScopeStorage = new AsyncLocalStorage();
|
|
52
|
+
const evalClockStorage = new AsyncLocalStorage();
|
|
52
53
|
let activeEvalScopeCount = 0;
|
|
53
54
|
let activeEvalRuntimeScopeCount = 0;
|
|
54
55
|
let consoleCaptureEnabled = true;
|
|
56
|
+
const defaultEvalStartTimeMs = Date.parse("2026-04-10T00:00:00.000Z");
|
|
57
|
+
const realDate = globalThis.__agentEvalsRealDate ?? Date;
|
|
58
|
+
globalThis.__agentEvalsRealDate = realDate;
|
|
59
|
+
function toDateConstructorArg(value) {
|
|
60
|
+
if (typeof value === "string" || typeof value === "number" || value instanceof realDate) return value;
|
|
61
|
+
return Number(value);
|
|
62
|
+
}
|
|
63
|
+
function toDateNumberArg(value) {
|
|
64
|
+
return typeof value === "number" ? value : Number(value);
|
|
65
|
+
}
|
|
66
|
+
function constructDateFromArgs(args) {
|
|
67
|
+
if (args.length === 0) return new realDate();
|
|
68
|
+
if (args.length === 1) return new realDate(toDateConstructorArg(args[0]));
|
|
69
|
+
return new realDate(toDateNumberArg(args[0]), toDateNumberArg(args[1]), args[2] === void 0 ? 1 : toDateNumberArg(args[2]), args[3] === void 0 ? 0 : toDateNumberArg(args[3]), args[4] === void 0 ? 0 : toDateNumberArg(args[4]), args[5] === void 0 ? 0 : toDateNumberArg(args[5]), args[6] === void 0 ? 0 : toDateNumberArg(args[6]));
|
|
70
|
+
}
|
|
71
|
+
/**
 * Proxy around the host Date constructor that is installed as `globalThis.Date`.
 *
 * While a shifted eval clock is active (`getEvalClockNowMs()` returns a
 * number), "current time" requests are answered from that clock:
 * - `Date()` called as a function returns the string form of the eval time
 * - `new Date()` with no arguments returns a Date at the eval time
 * - `Date.now` resolves to `getEvalDateNow`
 *
 * Calls that pass explicit arguments are built by `constructDateFromArgs`.
 * Every other static property is read straight from the host constructor.
 */
const evalDate = new Proxy(realDate, {
	apply(target, thisArg) {
		const nowMs = getEvalClockNowMs();
		if (nowMs !== null) return new target(nowMs).toString();
		return target.call(thisArg);
	},
	construct(target, argArray) {
		const nowMs = getEvalClockNowMs();
		if (argArray.length === 0 && nowMs !== null) return new target(nowMs);
		return constructDateFromArgs(Array.from(argArray));
	},
	get(target, property) {
		if (property === "now") return getEvalDateNow;
		// Forward everything else (parse, UTC, prototype, name, length, and also
		// inherited members such as apply/call/bind/toString or well-known
		// symbols) to the host constructor. Without this fallthrough those
		// lookups evaluated to undefined on the installed global Date.
		return Reflect.get(target, property);
	}
});
globalThis.Date = evalDate;
|
|
55
92
|
const maxLogMessageLength = 2e4;
|
|
56
93
|
const maxLogStringLength = 1e4;
|
|
57
94
|
const maxLogArrayLength = 100;
|
|
@@ -79,6 +116,82 @@ var EvalAssertionError = class extends Error {
|
|
|
79
116
|
this.name = "EvalAssertionError";
|
|
80
117
|
}
|
|
81
118
|
};
|
|
119
|
+
/**
 * Compute the current time, in epoch milliseconds, for an eval clock state.
 *
 * The result is the state's start time plus its manual offset, plus the real
 * time elapsed since `realStartMs` unless the state is frozen.
 */
function getEvalClockStateNowMs(state) {
	let elapsedMs = 0;
	if (!state.frozen) elapsedMs = realDate.now() - state.realStartMs;
	return state.startMs + elapsedMs + state.offsetMs;
}
|
|
123
|
+
/**
 * Return the eval clock's current time in epoch milliseconds, or `null` when
 * the active async context has no clock state or its clock is not shifted.
 */
function getEvalClockNowMs() {
	const clock = evalClockStorage.getStore();
	const isShifted = clock?.shifted === true;
	return isShifted ? getEvalClockStateNowMs(clock) : null;
}
|
|
128
|
+
/**
 * Implementation behind the shimmed `Date.now`: the shifted eval time when
 * one is available, otherwise the host clock.
 */
function getEvalDateNow() {
	const shiftedNowMs = getEvalClockNowMs();
	if (shiftedNowMs != null) return shiftedNowMs;
	return realDate.now();
}
|
|
131
|
+
/**
 * Read the host process clock in epoch milliseconds.
 *
 * Uses the captured host Date constructor, so the result is not affected by
 * the eval Date shim.
 */
function getRealDateNowMs() {
	const hostNowMs = realDate.now();
	return hostNowMs;
}
|
|
135
|
+
/**
 * Return the current shifted time, in epoch milliseconds, for a stored eval
 * clock state, or `null` when that state's clock is not shifted.
 */
function getEvalClockStateTimeMs(state) {
	return state.shifted ? getEvalClockStateNowMs(state) : null;
}
|
|
140
|
+
/**
 * Return the start time of the active eval's clock as a host `Date`.
 *
 * When the eval uses `startTime: 'now'`, the value is the real time captured
 * at the moment the eval clock state was created.
 *
 * Throws when no eval clock state exists in the current async context.
 */
function getEvalStartTime() {
	const clock = evalClockStorage.getStore();
	if (clock !== void 0) return new realDate(clock.startMs);
	throw new Error("getEvalStartTime() must be called inside an active eval");
}
|
|
151
|
+
/**
 * Convert an eval `startTime` option to epoch milliseconds.
 *
 * - `undefined` resolves to the default eval start time
 * - `"now"` resolves to the host clock's current time
 * - a host Date, a number, or a parseable string resolves to its timestamp
 *
 * Throws when the resulting timestamp is not a finite number.
 */
function resolveEvalStartTimeMs(startTime) {
	if (startTime === void 0) return defaultEvalStartTimeMs;
	if (startTime === "now") return realDate.now();
	let ms;
	if (startTime instanceof realDate) ms = startTime.getTime();
	else if (typeof startTime === "number") ms = startTime;
	else ms = realDate.parse(startTime);
	if (!Number.isFinite(ms)) throw new Error(`Invalid eval startTime "${String(startTime)}". Use a Date, timestamp, ISO date string, or "now".`);
	return ms;
}
|
|
158
|
+
/**
 * Create the mutable clock state for one eval scope.
 *
 * `startMs` is the eval's starting time, `realStartMs` the host time at
 * creation, and `offsetMs` the manual offset, initially 0. The clock counts as
 * shifted unless `startTime` is `"now"` and `freezeTime` is not set.
 */
function createEvalClockState(startTime, freezeTime) {
	const capturedNowMs = realDate.now();
	const usesRealClock = startTime === "now";
	// For "now", reuse the captured host time so startMs and realStartMs match.
	const startMs = usesRealClock ? capturedNowMs : resolveEvalStartTimeMs(startTime);
	return {
		startMs,
		realStartMs: capturedNowMs,
		offsetMs: 0,
		frozen: freezeTime,
		shifted: !usesRealClock || freezeTime
	};
}
|
|
168
|
+
/**
 * Run `fn` inside a fresh eval clock state derived from `startTime`.
 *
 * The clock is frozen only when `options.freezeTime` is exactly `true`.
 * Resolves with whatever `fn` returns.
 */
async function runWithEvalClock(startTime, fn, options = {}) {
	const clockState = createEvalClockState(startTime, options.freezeTime === true);
	return await evalClockStorage.run(clockState, fn);
}
|
|
172
|
+
/**
 * Return the number of milliseconds in one eval time unit.
 *
 * Accepts the singular and plural forms of millisecond, second, minute, hour
 * and day. Throws for any other unit.
 */
function getEvalTimeUnitMs(unit) {
	switch (unit) {
		case "millisecond":
		case "milliseconds":
			return 1;
		case "second":
		case "seconds":
			return 1e3;
		case "minute":
		case "minutes":
			return 6e4;
		case "hour":
		case "hours":
			return 36e5;
		case "day":
		case "days":
			return 864e5;
		default:
			throw new Error(`Unsupported eval time unit "${unit}"`);
	}
}
|
|
180
|
+
/**
 * Move the active eval's shifted Date clock by `amount` units and return the
 * resulting time as a host `Date`.
 *
 * `amount` must be a finite number. Throws when no eval clock state exists in
 * the current async context, or when that clock is not shifted. An eval with
 * `startTime: 'now'` runs on the real clock and is only shifted when
 * `freezeTime: true` is set as well.
 */
function advanceEvalTime(unit, amount) {
	const clock = evalClockStorage.getStore();
	if (clock === void 0) throw new Error("advanceEvalTime() must be called inside an active eval");
	if (!clock.shifted) throw new Error("advanceEvalTime() requires a shifted eval clock. Remove startTime: \"now\" or set freezeTime: true to use it.");
	if (!Number.isFinite(amount)) throw new Error("advanceEvalTime() amount must be a finite number");
	const deltaMs = getEvalTimeUnitMs(unit) * amount;
	clock.offsetMs = clock.offsetMs + deltaMs;
	return new realDate(getEvalClockStateNowMs(clock));
}
|
|
82
195
|
/** Return the current eval scope for the active async context, if any. */
|
|
83
196
|
function getCurrentScope() {
|
|
84
197
|
if (activeEvalScopeCount === 0) return void 0;
|
|
@@ -349,7 +462,9 @@ async function runInExistingEvalScope(scope, runtimeScope, fn) {
|
|
|
349
462
|
activeEvalScopeCount++;
|
|
350
463
|
try {
|
|
351
464
|
return await scopeStorage.run(scope, async () => {
|
|
352
|
-
return await
|
|
465
|
+
return await evalClockStorage.run(scope.evalClockState, async () => {
|
|
466
|
+
return await runInEvalRuntimeScope(runtimeScope, fn);
|
|
467
|
+
});
|
|
353
468
|
});
|
|
354
469
|
} finally {
|
|
355
470
|
activeEvalScopeCount--;
|
|
@@ -362,6 +477,8 @@ async function runInExistingEvalScope(scope, runtimeScope, fn) {
|
|
|
362
477
|
async function runInEvalScope(caseId, fn, options = {}) {
|
|
363
478
|
const scope = {
|
|
364
479
|
caseId,
|
|
480
|
+
startTime: options.startTime,
|
|
481
|
+
evalClockState: createEvalClockState(options.startTime, options.freezeTime === true),
|
|
365
482
|
idPrefix: options.idPrefix,
|
|
366
483
|
nextEvalIdCounter: 0,
|
|
367
484
|
input: options.input,
|
|
@@ -1493,9 +1610,12 @@ function mergeSpanAttribute(span, key, patch) {
|
|
|
1493
1610
|
...patch
|
|
1494
1611
|
} });
|
|
1495
1612
|
}
|
|
1496
|
-
function
|
|
1613
|
+
/**
 * Add `elapsedMs` milliseconds to a timestamp and return the result as an ISO
 * 8601 string.
 */
function addElapsedMsToTimestamp(isoTimestamp, elapsedMs) {
	const startedAtMs = new Date(isoTimestamp).getTime();
	const endedAt = new Date(startedAtMs + elapsedMs);
	return endedAt.toISOString();
}
|
|
1616
|
+
function finishSpanWithoutThrownError(span, realStartedAt) {
|
|
1497
1617
|
span.status = hasSpanError(span) ? "error" : "ok";
|
|
1498
|
-
span.endedAt = (
|
|
1618
|
+
span.endedAt = addElapsedMsToTimestamp(span.startedAt, getRealDateNowMs() - realStartedAt);
|
|
1499
1619
|
}
|
|
1500
1620
|
function createSpanHandle(span) {
|
|
1501
1621
|
return {
|
|
@@ -1737,9 +1857,11 @@ async function traceSpanInternal(info, fn) {
|
|
|
1737
1857
|
const scope = getCurrentScope();
|
|
1738
1858
|
if (!scope) return await fn(noopActiveSpan());
|
|
1739
1859
|
const id = generateSpanId();
|
|
1860
|
+
const parentId = scope.activeSpanStack.at(-1)?.id ?? null;
|
|
1861
|
+
const realStartedAt = getRealDateNowMs();
|
|
1740
1862
|
const spanRecord = {
|
|
1741
1863
|
id,
|
|
1742
|
-
parentId
|
|
1864
|
+
parentId,
|
|
1743
1865
|
caseId: scope.caseId,
|
|
1744
1866
|
kind: info.kind,
|
|
1745
1867
|
name: info.name,
|
|
@@ -1779,7 +1901,7 @@ async function traceSpanInternal(info, fn) {
|
|
|
1779
1901
|
const recording = deserializeCacheRecording(hit.recording);
|
|
1780
1902
|
replayRecording(scope, spanRecord, recording, { generateSpanId });
|
|
1781
1903
|
spanRecord.status = recording.finalStatus ?? (hasSpanError(spanRecord) ? "error" : "ok");
|
|
1782
|
-
spanRecord.endedAt = (
|
|
1904
|
+
spanRecord.endedAt = addElapsedMsToTimestamp(spanRecord.startedAt, getRealDateNowMs() - realStartedAt);
|
|
1783
1905
|
return recording.returnValue;
|
|
1784
1906
|
}
|
|
1785
1907
|
mergeSpanAttributes(spanRecord, { "cache.status": "miss" });
|
|
@@ -1798,7 +1920,7 @@ async function traceSpanInternal(info, fn) {
|
|
|
1798
1920
|
scope.recordingStack.pop();
|
|
1799
1921
|
}
|
|
1800
1922
|
appendSubSpanOps(scope, frame);
|
|
1801
|
-
finishSpanWithoutThrownError(spanRecord);
|
|
1923
|
+
finishSpanWithoutThrownError(spanRecord, realStartedAt);
|
|
1802
1924
|
if (ctx.mode !== "bypass") {
|
|
1803
1925
|
const recording = {
|
|
1804
1926
|
returnValue: bodyResult,
|
|
@@ -1832,11 +1954,11 @@ async function traceSpanInternal(info, fn) {
|
|
|
1832
1954
|
return bodyResult;
|
|
1833
1955
|
}
|
|
1834
1956
|
const result = await fn(activeSpan);
|
|
1835
|
-
finishSpanWithoutThrownError(spanRecord);
|
|
1957
|
+
finishSpanWithoutThrownError(spanRecord, realStartedAt);
|
|
1836
1958
|
return result;
|
|
1837
1959
|
} catch (error) {
|
|
1838
1960
|
spanRecord.status = "error";
|
|
1839
|
-
spanRecord.endedAt = (
|
|
1961
|
+
spanRecord.endedAt = addElapsedMsToTimestamp(spanRecord.startedAt, getRealDateNowMs() - realStartedAt);
|
|
1840
1962
|
spanRecord.error = normalizeTraceError(error);
|
|
1841
1963
|
throw error;
|
|
1842
1964
|
} finally {
|
|
@@ -5235,7 +5357,9 @@ async function runCase(params) {
|
|
|
5235
5357
|
mode: cacheMode,
|
|
5236
5358
|
evalId,
|
|
5237
5359
|
codeFingerprint
|
|
5238
|
-
} : void 0
|
|
5360
|
+
} : void 0,
|
|
5361
|
+
startTime: evalDef.startTime,
|
|
5362
|
+
freezeTime: evalDef.freezeTime
|
|
5239
5363
|
});
|
|
5240
5364
|
const traceTree = buildTraceTree(scope.spans, scope.checkpoints);
|
|
5241
5365
|
const nonAssertError = executeError && !(executeError instanceof EvalAssertionError) ? executeError : null;
|
|
@@ -5276,6 +5400,7 @@ async function runCase(params) {
|
|
|
5276
5400
|
}
|
|
5277
5401
|
const scoreResults = /* @__PURE__ */ new Map();
|
|
5278
5402
|
const scoringTraces = {};
|
|
5403
|
+
const scoreStartTime = getEvalClockStateTimeMs(scope.evalClockState) ?? evalDef.startTime;
|
|
5279
5404
|
if (!nonAssertError && scope.assertionFailures.length === 0 && evalDef.scores) for (const [key, def] of Object.entries(evalDef.scores)) {
|
|
5280
5405
|
const { compute, passThreshold, label } = normalizeScoreDef(def);
|
|
5281
5406
|
const scoreRun = await runInEvalScope(evalCase.id, async () => {
|
|
@@ -5295,7 +5420,9 @@ async function runCase(params) {
|
|
|
5295
5420
|
mode: cacheMode,
|
|
5296
5421
|
evalId: `${evalId}__score__${key}`,
|
|
5297
5422
|
codeFingerprint
|
|
5298
|
-
} : void 0
|
|
5423
|
+
} : void 0,
|
|
5424
|
+
startTime: scoreStartTime,
|
|
5425
|
+
freezeTime: evalDef.freezeTime
|
|
5299
5426
|
});
|
|
5300
5427
|
const { trace, traceDisplay } = resolveTracePresentation(scoreRun.scope.spans, globalTraceDisplay, evalDef.traceDisplay);
|
|
5301
5428
|
scope.logs.push(...scoreRun.scope.logs.map((entry) => ({
|
|
@@ -5608,7 +5735,7 @@ async function executeRun({ runState, request, runDir, config, evals, cacheStore
|
|
|
5608
5735
|
await runInEvalRuntimeScope("cases", async () => {
|
|
5609
5736
|
await entry.use(async (evalDef) => {
|
|
5610
5737
|
const cases = filterEvalCases(resolveRunnableEvalCases({
|
|
5611
|
-
cases: typeof evalDef.cases === "function" ? await evalDef.cases() : evalDef.cases ?? [],
|
|
5738
|
+
cases: await runWithEvalClock(evalDef.startTime, async () => typeof evalDef.cases === "function" ? await evalDef.cases() : evalDef.cases ?? [], { freezeTime: evalDef.freezeTime }),
|
|
5612
5739
|
evalId: evalMeta.id
|
|
5613
5740
|
}), request.target.evalIds, request.target.caseIds, evalMeta.id);
|
|
5614
5741
|
runState.summary.totalCases += cases.length;
|
|
@@ -5811,4 +5938,4 @@ function toLastRunStatus(status) {
|
|
|
5811
5938
|
return status === "pending" ? null : status;
|
|
5812
5939
|
}
|
|
5813
5940
|
//#endregion
|
|
5814
|
-
export { removeDefaultConfigSchema as $, columnKindSchema as $t, extractApiCalls as A,
|
|
5941
|
+
export { removeDefaultConfigSchema as $, columnKindSchema as $t, extractApiCalls as A, runInEvalScope as An, cacheFileSchema as At, DEFAULT_API_CALLS_CONFIG as B, traceAttributeDisplayFormatSchema as Bt, validateCharts as C, getEvalCaseInput as Cn, evalChartTooltipExtraSchema as Ct, sseEnvelopeSchema as D, mergeEvalOutput as Dn, cacheDebugKeyFileSchema as Dt, updateManualScoreRequestSchema as E, isInEvalScope as En, cacheDebugKeyEntrySchema as Et, deriveScopedSummaryFromCases as F, repoFile as Fn, cacheRecordingSchema as Ft, apiCallMetricSchema as G, traceDisplayInputConfigSchema as Gt, agentEvalsConfigSchema as H, traceAttributeDisplayPlacementSchema as Ht, deriveStatusFromCaseRows as I, defineEval as In, cacheStatusSchema as It, llmCallMetricFormatSchema as J, traceSpanSchema as Jt, apiCallsConfigSchema as K, traceSpanErrorSchema as Kt, deriveStatusFromChildStatuses as L, getEvalRegistry as Ln, serializedCacheSpanSchema as Lt, getNestedAttribute as M, setEvalOutput as Mn, cacheModeSchema as Mt, getEvalTitle as N, setScopeCacheContext as Nn, cacheOperationTypeSchema as Nt, extractCacheEntries as O, nextEvalId as On, cacheEntrySchema as Ot, getEvalDisplayStatus as P, startEvalBackgroundJob as Pn, cacheRecordingOpSchema as Pt, llmCallsConfigSchema as Q, columnFormatSchema as Qt, runManifestSchema as R, spanCacheOptionsSchema as Rt, normalizeScoreDef as S, getCurrentScope as Sn, evalChartMetricSchema as St, createRunRequestSchema as T, incrementEvalOutput as Tn, evalChartsConfigSchema as Tt, apiCallMetricFormatSchema as U, traceAttributeDisplaySchema as Ut, DEFAULT_LLM_CALLS_CONFIG as V, traceAttributeDisplayInputSchema as Vt, apiCallMetricPlacementSchema as W, traceDisplayConfigSchema as Wt, llmCallMetricSchema as X, cellValueSchema as Xt, llmCallMetricPlacementSchema as Y, traceSpanWarningSchema as Yt, llmCallPricingSchema as Z, columnDefSchema as Zt, loadEvalModule as _, advanceEvalTime as _n, evalChartAggregateSchema as _t, loadPersistedRunSnapshot as a, z$1 as 
an, caseDetailSchema as at, loadConfig as b, evalAssert as bn, evalChartColorSchema as bt, persistCaseDetail as c, evalSpan as cn, evalStatAggregateSchema as ct, recomputePersistedCaseStatus as d, hashCacheKeySync as dn, evalSummarySchema as dt, fileRefSchema as en, resolveApiCallsConfig as et, runTouchesEval as f, deserializeCacheRecording as fn, runLogEntrySchema as ft, setLatestRunInfoMap as g, EvalAssertionError as gn, scoreTraceSchema as gt, getTargetEvalIds as h, serializeCacheValue as hn, runLogPhaseSchema as ht, getLatestRunInfos as i, runArtifactRefSchema as in, assertionFailureSchema as it, extractLlmCalls as j, runInExistingEvalScope as jn, cacheListItemSchema as jt, extractCacheHits as k, runInEvalRuntimeScope as kn, cacheEntryWithDebugKeySchema as kt, persistRunState as l, evalTracer as ln, evalStatItemSchema as lt, buildEvalSummary as m, serializeCacheRecording as mn, runLogLocationSchema as mt, generateRunId as n, numberDisplayOptionsSchema as nn, runLogsConfigSchema as nt, loadPersistedRunSnapshots as o, buildTraceTree as on, caseRowSchema as ot, resolveArtifactPath as p, deserializeCacheValue as pn, runLogLevelSchema as pt, defaultConfigKeySchema as q, traceSpanKindSchema as qt, getLastRunStatuses as r, repoFileRefSchema as rn, trialSelectionModeSchema as rt, nextShortIdFromSnapshots as s, captureEvalSpanError as sn, evalFreshnessStatusSchema as st, executeRun as t, jsonCellSchema as tn, resolveLlmCallsConfig as tt, recomputeEvalStatusesInRuns as u, hashCacheKey as un, evalStatsConfigSchema as ut, parseEvalMetas as v, appendToEvalOutput as vn, evalChartAxisSchema as vt, createFsCacheStore as w, getEvalStartTime as wn, evalChartTypeSchema as wt, buildDeclaredColumnDefs as x, evalLog as xn, evalChartConfigSchema as xt, resolveEvalDefaultConfig as y, configureEvalRunLogs as yn, evalChartBuiltinMetricSchema as yt, runSummarySchema as z, traceCacheRefSchema as zt };
|
|
@@ -1,5 +1,5 @@
|
|
|
1
|
-
import { n as createRunner } from "./cli-
|
|
2
|
-
import "./src-
|
|
1
|
+
import { n as createRunner } from "./cli-D3QNOcPN.mjs";
|
|
2
|
+
import "./src-CcXfWT4M.mjs";
|
|
3
3
|
//#region ../../apps/server/src/runner.ts
|
|
4
4
|
let runnerInstance = null;
|
|
5
5
|
function getRunnerInstance() {
|
|
@@ -1,2 +1,2 @@
|
|
|
1
|
-
import { n as initRunner, t as getRunnerInstance } from "./runner-
|
|
1
|
+
import { n as initRunner, t as getRunnerInstance } from "./runner-4yNYRvmF.mjs";
|
|
2
2
|
export { getRunnerInstance, initRunner };
|
package/package.json
CHANGED
|
@@ -156,6 +156,19 @@ for settlement; promise and span errors keep their normal behavior. Use
|
|
|
156
156
|
`waitForBackgroundJob: false` on a span, or `waitForBackgroundJobs: false` on an
|
|
157
157
|
eval definition, when background work should not delay finalization.
|
|
158
158
|
|
|
159
|
+
Eval Date APIs use a shifted wall clock by default: `new Date()` and
|
|
160
|
+
`Date.now()` start at `2026-04-10T00:00:00.000Z` during case generation,
|
|
161
|
+
execution, tracing, derived outputs, and scorers, then continue advancing with
|
|
162
|
+
real elapsed time. Set `startTime` on a specific `defineEval(...)` to use
|
|
163
|
+
another initial clock value, or set `startTime: 'now'` for that eval to use the
|
|
164
|
+
real current clock. Timers are not faked, so async waits still run normally.
|
|
165
|
+
Set `freezeTime: true` to keep Date APIs frozen until they are moved manually.
|
|
166
|
+
Use `getEvalStartTime()` to read the captured wall-clock start as a `Date`.
|
|
167
|
+
Use `advanceEvalTime(unit, amount)` inside an eval to move the shifted clock
|
|
168
|
+
forward; supported units are `millisecond(s)`, `second(s)`, `minute(s)`,
|
|
169
|
+
`hour(s)`, and `day(s)`. It throws for evals with `startTime: 'now'`, unless
|
|
170
|
+
`freezeTime: true` is also set.
|
|
171
|
+
|
|
159
172
|
For libraries or observability exporters that already emit span lifecycle
|
|
160
173
|
events, use `evalTracer.startSpan(...)`, `evalTracer.updateSpan(...)`,
|
|
161
174
|
`evalTracer.endSpan(...)`, or `evalTracer.recordSpan(...)` to translate those
|
package/dist/src-D-HuV8I-.mjs
DELETED