@ls-stack/agent-eval 0.26.2 → 0.27.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{app-BxR4SbjA.mjs → app-ByMLOds2.mjs} +3 -3
- package/dist/apps/web/dist/assets/{index-CFF1eYUm.js → index-DR2haqvV.js} +1 -1
- package/dist/apps/web/dist/index.html +1 -1
- package/dist/bin.mjs +1 -1
- package/dist/{cli-C4yumCXE.mjs → cli-DRkwWgTj.mjs} +3 -3
- package/dist/index.d.mts +40 -8
- package/dist/index.mjs +4 -4
- package/dist/runChild.mjs +1 -1
- package/dist/{runOrchestration-B31SV_Bq.mjs → runOrchestration-DB0dwGrd.mjs} +167 -28
- package/dist/{runner-D5QMY3B_.mjs → runner-39KGoaM1.mjs} +1 -1
- package/dist/{runner-LHN7L-xk.mjs → runner-DRINcaN_.mjs} +2 -2
- package/dist/src-BwKm3sKU.mjs +3 -0
- package/package.json +1 -1
- package/skills/agent-eval/SKILL.md +28 -17
- package/dist/src-CLMrRle2.mjs +0 -3
|
@@ -25,7 +25,7 @@
|
|
|
25
25
|
href="https://fonts.googleapis.com/css2?family=Geist:wght@400;500;600&family=JetBrains+Mono:wght@400;500&display=swap"
|
|
26
26
|
rel="stylesheet"
|
|
27
27
|
/>
|
|
28
|
-
<script type="module" crossorigin src="/assets/index-CFF1eYUm.js"></script>
|
|
28
|
+
<script type="module" crossorigin src="/assets/index-DR2haqvV.js"></script>
|
|
29
29
|
<link rel="stylesheet" crossorigin href="/assets/index-DOXT0Y9V.css">
|
|
30
30
|
</head>
|
|
31
31
|
<body>
|
package/dist/bin.mjs
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import { B as runSummarySchema, C as validateCharts, F as getEvalDisplayStatus, I as deriveScopedSummaryFromCases, P as getEvalTitle, S as normalizeScoreDef,
|
|
1
|
+
import { B as runSummarySchema, C as validateCharts, F as getEvalDisplayStatus, I as deriveScopedSummaryFromCases, P as getEvalTitle, S as normalizeScoreDef, Wn as getEvalRegistry, _ as loadEvalModule, a as getLastRunStatuses, b as loadConfig, c as loadPersistedRunSnapshots, ct as getCaseRowEvalKey, d as persistRunState, f as recomputeEvalStatusesInRuns, g as deriveEvalFreshness, h as resolveArtifactPath, i as generateRunId, l as nextShortIdFromSnapshots, m as runTouchesEval, n as getTargetEvalKeys, nt as resolveLlmCallsConfig, o as getLatestRunInfos, ot as buildEvalKey, p as recomputePersistedCaseStatus, s as loadPersistedRunSnapshot, st as getCaseRowCaseKey, tt as resolveApiCallsConfig, u as persistCaseDetail, v as parseEvalDiscovery, w as createFsCacheStore, x as buildDeclaredColumnDefs, y as resolveEvalDefaultConfig } from "./runOrchestration-DB0dwGrd.mjs";
|
|
2
2
|
import { createHash } from "node:crypto";
|
|
3
3
|
import { mkdir, readFile, rm, writeFile } from "node:fs/promises";
|
|
4
4
|
import { dirname, join, relative, resolve } from "node:path";
|
|
@@ -1041,8 +1041,8 @@ async function commandApp(args) {
|
|
|
1041
1041
|
const { serve } = await import("@hono/node-server");
|
|
1042
1042
|
const bundledWebDist = resolve(currentDir, "apps/web/dist");
|
|
1043
1043
|
if (existsSync(bundledWebDist)) process.env.AGENT_EVALS_WEB_DIST = bundledWebDist;
|
|
1044
|
-
const appModule = await import("./app-BxR4SbjA.mjs");
|
|
1045
|
-
const runnerModule = await import("./runner-D5QMY3B_.mjs");
|
|
1044
|
+
const appModule = await import("./app-ByMLOds2.mjs");
|
|
1045
|
+
const runnerModule = await import("./runner-39KGoaM1.mjs");
|
|
1046
1046
|
if (!isHonoAppModule(appModule)) throw new Error("Server app module is invalid");
|
|
1047
1047
|
if (!isServerRunnerModule(runnerModule)) throw new Error("Server runner module is invalid");
|
|
1048
1048
|
await runnerModule.initRunner();
|
package/dist/index.d.mts
CHANGED
|
@@ -2288,7 +2288,7 @@ type CacheMode = z$1.infer<typeof cacheModeSchema>;
|
|
|
2288
2288
|
/** Options accepted by an `evalTracer.span` call to opt the span into caching. */
|
|
2289
2289
|
declare const spanCacheOptionsSchema: z$1.ZodObject<{
|
|
2290
2290
|
key: z$1.ZodUnknown;
|
|
2291
|
-
namespace: z$1.
|
|
2291
|
+
namespace: z$1.ZodString;
|
|
2292
2292
|
serializeFileBytes: z$1.ZodOptional<z$1.ZodBoolean>;
|
|
2293
2293
|
}, z$1.core.$strip>;
|
|
2294
2294
|
/** Options accepted by an `evalTracer.span` call to opt the span into caching. */
|
|
@@ -3119,6 +3119,37 @@ declare function getEvalRegistry(): Map<string, EvalRegistryEntry>;
|
|
|
3119
3119
|
*/
|
|
3120
3120
|
declare function defineEval<TInput = unknown, TOutputs extends EvalOutputs = EvalOutputs>(definition: EvalDefinition<TInput, TOutputs>): void;
|
|
3121
3121
|
//#endregion
|
|
3122
|
+
//#region ../sdk/src/evalExpect.d.ts
|
|
3123
|
+
/**
|
|
3124
|
+
* Focused expectation helpers for eval case invariants.
|
|
3125
|
+
*
|
|
3126
|
+
* These matchers intentionally cover comparisons that produce clearer failure
|
|
3127
|
+
* messages than a plain `evalAssert(...)`. Use `evalAssert(...)` directly for
|
|
3128
|
+
* truthiness checks and custom type narrowing.
|
|
3129
|
+
*/
|
|
3130
|
+
type EvalExpectation<T> = {
|
|
3131
|
+
/** Invert the next matcher. */readonly not: EvalExpectation<T>; /** Assert strict `Object.is(...)` equality. */
|
|
3132
|
+
toBe(expected: unknown): void; /** Assert Node.js deep strict equality. */
|
|
3133
|
+
toEqual(expected: unknown): void; /** Assert that object properties recursively match the expected subset. */
|
|
3134
|
+
toMatchObject(expected: Record<string, unknown>): void; /** Assert substring, array item, or set item containment. */
|
|
3135
|
+
toContain(expected: unknown): void; /** Assert the value has a numeric `length` equal to `expected`. */
|
|
3136
|
+
toHaveLength(expected: number): void; /** Assert a dot-path property exists, optionally with a deep-equal value. */
|
|
3137
|
+
toHaveProperty(path: string, ...expected: [] | [unknown]): void; /** Assert the received number is greater than `expected`. */
|
|
3138
|
+
toBeGreaterThan(expected: number): void; /** Assert the received number is greater than or equal to `expected`. */
|
|
3139
|
+
toBeGreaterThanOrEqual(expected: number): void; /** Assert the received number is less than `expected`. */
|
|
3140
|
+
toBeLessThan(expected: number): void; /** Assert the received number is less than or equal to `expected`. */
|
|
3141
|
+
toBeLessThanOrEqual(expected: number): void; /** Assert the received number is close to `expected` at `precision` decimals. */
|
|
3142
|
+
toBeCloseTo(expected: number, precision?: number): void; /** Assert the received string matches the regular expression. */
|
|
3143
|
+
toMatch(expected: RegExp): void;
|
|
3144
|
+
};
|
|
3145
|
+
/**
|
|
3146
|
+
* Create focused expectation helpers for the current eval case.
|
|
3147
|
+
*
|
|
3148
|
+
* Failed expectations record assertion failures and throw only while an eval
|
|
3149
|
+
* case scope is active, matching `evalAssert(...)`.
|
|
3150
|
+
*/
|
|
3151
|
+
declare function evalExpect<T>(value: T): EvalExpectation<T>;
|
|
3152
|
+
//#endregion
|
|
3122
3153
|
//#region ../sdk/src/repoFile.d.ts
|
|
3123
3154
|
/**
|
|
3124
3155
|
* Create a file reference that can be emitted via `setEvalOutput(...)` and rendered
|
|
@@ -3168,7 +3199,7 @@ type CacheAdapter = {
|
|
|
3168
3199
|
type CacheScopeContext = {
|
|
3169
3200
|
adapter: CacheAdapter;
|
|
3170
3201
|
mode: CacheMode;
|
|
3171
|
-
evalId: string; /** Hash of the eval source file
|
|
3202
|
+
evalId: string; /** Hash of the eval source file, stored as cache metadata for inspection. */
|
|
3172
3203
|
codeFingerprint: string;
|
|
3173
3204
|
};
|
|
3174
3205
|
/** Active recording frame captured while a cached operation body executes. */
|
|
@@ -3356,12 +3387,14 @@ declare function mergeEvalOutput(key: string, patch: Record<string, unknown>): v
|
|
|
3356
3387
|
*/
|
|
3357
3388
|
declare function incrementEvalOutput(key: string, delta: number): void;
|
|
3358
3389
|
/**
|
|
3359
|
-
* Assert a condition for the current eval case and throw on failure.
|
|
3390
|
+
* Assert a truthy condition for the current eval case and throw on failure.
|
|
3360
3391
|
*
|
|
3361
3392
|
* Calls made outside `runInEvalScope(...)` are ignored so shared workflow code
|
|
3362
|
-
* can safely reuse `evalAssert(...)` when it also runs outside an eval.
|
|
3393
|
+
* can safely reuse `evalAssert(...)` when it also runs outside an eval. The
|
|
3394
|
+
* TypeScript assertion signature still narrows the checked value after the
|
|
3395
|
+
* call.
|
|
3363
3396
|
*/
|
|
3364
|
-
declare function evalAssert(condition:
|
|
3397
|
+
declare function evalAssert(condition: unknown, message: string): asserts condition;
|
|
3365
3398
|
//#endregion
|
|
3366
3399
|
//#region ../sdk/src/cacheSerialization.d.ts
|
|
3367
3400
|
declare const serializedCacheValueMarker = "__agentEvalsCacheSerialization";
|
|
@@ -3419,8 +3452,7 @@ type TraceCacheInfo = {
|
|
|
3419
3452
|
//#region ../sdk/src/cacheKey.d.ts
|
|
3420
3453
|
/** Components folded into a deterministic cache key hash. */
|
|
3421
3454
|
type CacheKeyHashInput = {
|
|
3422
|
-
/** Cache namespace, usually derived from the eval id and operation name. */namespace: string; /**
|
|
3423
|
-
codeFingerprint: string; /** User-authored cache key value. */
|
|
3455
|
+
/** Cache namespace, usually derived from the eval id and operation name. */namespace: string; /** User-authored cache key value. */
|
|
3424
3456
|
key: unknown;
|
|
3425
3457
|
};
|
|
3426
3458
|
/** Optional controls for cache key hashing. */
|
|
@@ -3722,4 +3754,4 @@ declare function createRunner({
|
|
|
3722
3754
|
*/
|
|
3723
3755
|
declare function runCli(argv: string[]): Promise<void>;
|
|
3724
3756
|
//#endregion
|
|
3725
|
-
export { type AgentEvalsConfig, type ApiCallEntry, type ApiCallMetric, type ApiCallMetricFormat, type ApiCallMetricPlacement, type ApiCallMetricValue, type ApiCallsConfigInput, type AssertionFailure, type CacheActivityEntry, type CacheAdapter, type CacheDebugKeyEntry, type CacheDebugKeyFile, type CacheDebugKeyWrite, type CacheEntry, type CacheEntryWithDebugKey, type CacheFile, type CacheHitEntry, type CacheKeyHashInput, type CacheKeyHashOptions, type CacheListItem, type CacheMode, type CacheOperationType, type CacheRecording, type CacheRecordingFrame, type CacheRecordingOp, type CacheScopeContext, type CacheStatus, type CallDerivedAttribute, type CallDerivedAttributeContext, type CaptureEvalSpanErrorLevel, type CaptureEvalSpanErrorOptions, type CaseDetail, type CaseRow, type CellValue, type ColumnDef, type ColumnFormat, type ColumnKind, type CreateRunRequest, DEFAULT_API_CALLS_CONFIG, DEFAULT_LLM_CALLS_CONFIG, type DefaultConfigKey, type DerivedStatus, type DiscoveryIssue, EvalAssertionError, type EvalCase, type EvalCaseScope, type EvalChartAggregate, type EvalChartAxis, type EvalChartBuiltinMetric, type EvalChartColor, type EvalChartConfig, type EvalChartMetric, type EvalChartTooltipExtra, type EvalChartType, type EvalChartsConfig, type EvalColumnOverride, type EvalColumns, type EvalDefinition, type EvalDeriveContext, type EvalDisplayStatus, type EvalExecuteContext, type EvalFreshnessStatus, type EvalManualScoreDef, type EvalOutputs, type EvalOutputsSchema, type EvalRunner, type EvalRuntimeScope, type EvalScoreContext, type EvalScoreDef, type EvalScoreFn, type EvalSetOutput, type EvalStartTime, type EvalStatAggregate, type EvalStatItem, type EvalStatsConfig, type EvalSummary, type EvalTimeUnit, type EvalTraceSpan, type EvalTraceSpanError, type EvalTraceSpanWarning, type EvalTraceTree, type FileRef, type JsonCell, type LlmCallEntry, type LlmCallMetric, type LlmCallMetricFormat, type LlmCallMetricPlacement, type LlmCallMetricValue, type LlmCallPricing, type 
LlmCallsConfigInput, type NumberDisplayOptions, type RemoveDefaultConfig, type RepoFileRef, type ResolvedApiCallMetric, type ResolvedApiCallsConfig, type ResolvedCallDerivedAttribute, type ResolvedLlmCallMetric, type ResolvedLlmCallPricing, type ResolvedLlmCallsConfig, type RunArtifactRef, type RunInEvalScopeOptions, type RunLogEntry, type RunLogLevel, type RunLogLocation, type RunLogPhase, type RunLogsConfigInput, type RunManifest, type RunSummary, type ScalarCell, type ScopedCaseSummary, type ScoreTrace, type SerializedCacheSpan, type SerializedCacheValue, type SpanCacheOptions, type SseEnvelope, type SseEventType, type TraceActiveSpan, type TraceAttributeDisplay, type TraceAttributeDisplayFormat, type TraceAttributeDisplayInput, type TraceAttributeDisplayPlacement, type TraceAttributeTransform, type TraceAttributeTransformContext, type TraceCacheInfo, type TraceCacheRef, type TraceDisplayConfig, type TraceDisplayInputConfig, type TraceSpanInfo, type TrialSelectionMode, type UpdateManualScoreRequest, advanceEvalTime, agentEvalsConfigSchema, apiCallMetricFormatSchema, apiCallMetricPlacementSchema, apiCallMetricSchema, apiCallsConfigSchema, appendToEvalOutput, applyDerivedCallAttributes, assertionFailureSchema, buildCaseKey, buildEvalKey, buildTraceTree, cacheDebugKeyEntrySchema, cacheDebugKeyFileSchema, cacheEntrySchema, cacheEntryWithDebugKeySchema, cacheFileSchema, cacheListItemSchema, cacheModeSchema, cacheOperationTypeSchema, cacheRecordingOpSchema, cacheRecordingSchema, cacheStatusSchema, captureEvalSpanError, caseDetailSchema, caseRowSchema, cellValueSchema, columnDefSchema, columnFormatSchema, columnKindSchema, createRunRequestSchema, createRunner, defaultConfigKeySchema, defineEval, deriveScopedSummaryFromCases, deriveStatusFromCaseRows, deriveStatusFromChildStatuses, deserializeCacheRecording, deserializeCacheValue, discoveryIssueSchema, evalAssert, evalChartAggregateSchema, evalChartAxisSchema, evalChartBuiltinMetricSchema, evalChartColorSchema, 
evalChartConfigSchema, evalChartMetricSchema, evalChartTooltipExtraSchema, evalChartTypeSchema, evalChartsConfigSchema, evalFreshnessStatusSchema, evalLog, evalSpan, evalStatAggregateSchema, evalStatItemSchema, evalStatsConfigSchema, evalSummarySchema, evalTracer, extractApiCalls, extractCacheEntries, extractCacheHits, extractLlmCalls, fileRefSchema, getCaseRowCaseKey, getCaseRowEvalKey, getCurrentScope, getEvalCaseInput, getEvalDisplayStatus, getEvalRegistry, getEvalStartTime, getEvalTitle, getNestedAttribute, hashCacheKey, hashCacheKeySync, incrementEvalOutput, isInEvalScope, jsonCellSchema, llmCallMetricFormatSchema, llmCallMetricPlacementSchema, llmCallMetricSchema, llmCallPricingSchema, llmCallsConfigSchema, mergeEvalOutput, nextEvalId, numberDisplayOptionsSchema, removeDefaultConfigSchema, repoFile, repoFileRefSchema, resolveApiCallsConfig, resolveLlmCallsConfig, runArtifactRefSchema, runCli, runInEvalRuntimeScope, runInEvalScope, runInExistingEvalScope, runLogEntrySchema, runLogLevelSchema, runLogLocationSchema, runLogPhaseSchema, runLogsConfigSchema, runManifestSchema, runSummarySchema, scoreTraceSchema, serializeCacheRecording, serializeCacheValue, serializedCacheSpanSchema, setEvalOutput, setScopeCacheContext, spanCacheOptionsSchema, sseEnvelopeSchema, startEvalBackgroundJob, traceAttributeDisplayFormatSchema, traceAttributeDisplayInputSchema, traceAttributeDisplayPlacementSchema, traceAttributeDisplaySchema, traceCacheRefSchema, traceDisplayConfigSchema, traceDisplayInputConfigSchema, traceSpanErrorSchema, traceSpanKindSchema, traceSpanSchema, traceSpanWarningSchema, trialSelectionModeSchema, updateManualScoreRequestSchema, z };
|
|
3757
|
+
export { type AgentEvalsConfig, type ApiCallEntry, type ApiCallMetric, type ApiCallMetricFormat, type ApiCallMetricPlacement, type ApiCallMetricValue, type ApiCallsConfigInput, type AssertionFailure, type CacheActivityEntry, type CacheAdapter, type CacheDebugKeyEntry, type CacheDebugKeyFile, type CacheDebugKeyWrite, type CacheEntry, type CacheEntryWithDebugKey, type CacheFile, type CacheHitEntry, type CacheKeyHashInput, type CacheKeyHashOptions, type CacheListItem, type CacheMode, type CacheOperationType, type CacheRecording, type CacheRecordingFrame, type CacheRecordingOp, type CacheScopeContext, type CacheStatus, type CallDerivedAttribute, type CallDerivedAttributeContext, type CaptureEvalSpanErrorLevel, type CaptureEvalSpanErrorOptions, type CaseDetail, type CaseRow, type CellValue, type ColumnDef, type ColumnFormat, type ColumnKind, type CreateRunRequest, DEFAULT_API_CALLS_CONFIG, DEFAULT_LLM_CALLS_CONFIG, type DefaultConfigKey, type DerivedStatus, type DiscoveryIssue, EvalAssertionError, type EvalCase, type EvalCaseScope, type EvalChartAggregate, type EvalChartAxis, type EvalChartBuiltinMetric, type EvalChartColor, type EvalChartConfig, type EvalChartMetric, type EvalChartTooltipExtra, type EvalChartType, type EvalChartsConfig, type EvalColumnOverride, type EvalColumns, type EvalDefinition, type EvalDeriveContext, type EvalDisplayStatus, type EvalExecuteContext, type EvalExpectation, type EvalFreshnessStatus, type EvalManualScoreDef, type EvalOutputs, type EvalOutputsSchema, type EvalRunner, type EvalRuntimeScope, type EvalScoreContext, type EvalScoreDef, type EvalScoreFn, type EvalSetOutput, type EvalStartTime, type EvalStatAggregate, type EvalStatItem, type EvalStatsConfig, type EvalSummary, type EvalTimeUnit, type EvalTraceSpan, type EvalTraceSpanError, type EvalTraceSpanWarning, type EvalTraceTree, type FileRef, type JsonCell, type LlmCallEntry, type LlmCallMetric, type LlmCallMetricFormat, type LlmCallMetricPlacement, type LlmCallMetricValue, type 
LlmCallPricing, type LlmCallsConfigInput, type NumberDisplayOptions, type RemoveDefaultConfig, type RepoFileRef, type ResolvedApiCallMetric, type ResolvedApiCallsConfig, type ResolvedCallDerivedAttribute, type ResolvedLlmCallMetric, type ResolvedLlmCallPricing, type ResolvedLlmCallsConfig, type RunArtifactRef, type RunInEvalScopeOptions, type RunLogEntry, type RunLogLevel, type RunLogLocation, type RunLogPhase, type RunLogsConfigInput, type RunManifest, type RunSummary, type ScalarCell, type ScopedCaseSummary, type ScoreTrace, type SerializedCacheSpan, type SerializedCacheValue, type SpanCacheOptions, type SseEnvelope, type SseEventType, type TraceActiveSpan, type TraceAttributeDisplay, type TraceAttributeDisplayFormat, type TraceAttributeDisplayInput, type TraceAttributeDisplayPlacement, type TraceAttributeTransform, type TraceAttributeTransformContext, type TraceCacheInfo, type TraceCacheRef, type TraceDisplayConfig, type TraceDisplayInputConfig, type TraceSpanInfo, type TrialSelectionMode, type UpdateManualScoreRequest, advanceEvalTime, agentEvalsConfigSchema, apiCallMetricFormatSchema, apiCallMetricPlacementSchema, apiCallMetricSchema, apiCallsConfigSchema, appendToEvalOutput, applyDerivedCallAttributes, assertionFailureSchema, buildCaseKey, buildEvalKey, buildTraceTree, cacheDebugKeyEntrySchema, cacheDebugKeyFileSchema, cacheEntrySchema, cacheEntryWithDebugKeySchema, cacheFileSchema, cacheListItemSchema, cacheModeSchema, cacheOperationTypeSchema, cacheRecordingOpSchema, cacheRecordingSchema, cacheStatusSchema, captureEvalSpanError, caseDetailSchema, caseRowSchema, cellValueSchema, columnDefSchema, columnFormatSchema, columnKindSchema, createRunRequestSchema, createRunner, defaultConfigKeySchema, defineEval, deriveScopedSummaryFromCases, deriveStatusFromCaseRows, deriveStatusFromChildStatuses, deserializeCacheRecording, deserializeCacheValue, discoveryIssueSchema, evalAssert, evalChartAggregateSchema, evalChartAxisSchema, evalChartBuiltinMetricSchema, 
evalChartColorSchema, evalChartConfigSchema, evalChartMetricSchema, evalChartTooltipExtraSchema, evalChartTypeSchema, evalChartsConfigSchema, evalExpect, evalFreshnessStatusSchema, evalLog, evalSpan, evalStatAggregateSchema, evalStatItemSchema, evalStatsConfigSchema, evalSummarySchema, evalTracer, extractApiCalls, extractCacheEntries, extractCacheHits, extractLlmCalls, fileRefSchema, getCaseRowCaseKey, getCaseRowEvalKey, getCurrentScope, getEvalCaseInput, getEvalDisplayStatus, getEvalRegistry, getEvalStartTime, getEvalTitle, getNestedAttribute, hashCacheKey, hashCacheKeySync, incrementEvalOutput, isInEvalScope, jsonCellSchema, llmCallMetricFormatSchema, llmCallMetricPlacementSchema, llmCallMetricSchema, llmCallPricingSchema, llmCallsConfigSchema, mergeEvalOutput, nextEvalId, numberDisplayOptionsSchema, removeDefaultConfigSchema, repoFile, repoFileRefSchema, resolveApiCallsConfig, resolveLlmCallsConfig, runArtifactRefSchema, runCli, runInEvalRuntimeScope, runInEvalScope, runInExistingEvalScope, runLogEntrySchema, runLogLevelSchema, runLogLocationSchema, runLogPhaseSchema, runLogsConfigSchema, runManifestSchema, runSummarySchema, scoreTraceSchema, serializeCacheRecording, serializeCacheValue, serializedCacheSpanSchema, setEvalOutput, setScopeCacheContext, spanCacheOptionsSchema, sseEnvelopeSchema, startEvalBackgroundJob, traceAttributeDisplayFormatSchema, traceAttributeDisplayInputSchema, traceAttributeDisplayPlacementSchema, traceAttributeDisplaySchema, traceCacheRefSchema, traceDisplayConfigSchema, traceDisplayInputConfigSchema, traceSpanErrorSchema, traceSpanKindSchema, traceSpanSchema, traceSpanWarningSchema, trialSelectionModeSchema, updateManualScoreRequestSchema, z };
|
package/dist/index.mjs
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import { $ as llmCallsConfigSchema, $t as traceSpanKindSchema, A as extractApiCalls, An as
|
|
2
|
-
import { n as createRunner, t as runCli } from "./cli-C4yumCXE.mjs";
|
|
3
|
-
import "./src-CLMrRle2.mjs";
|
|
4
|
-
export { DEFAULT_API_CALLS_CONFIG, DEFAULT_LLM_CALLS_CONFIG, EvalAssertionError, advanceEvalTime, agentEvalsConfigSchema, apiCallMetricFormatSchema, apiCallMetricPlacementSchema, apiCallMetricSchema, apiCallsConfigSchema, appendToEvalOutput, applyDerivedCallAttributes, assertionFailureSchema, buildCaseKey, buildEvalKey, buildTraceTree, cacheDebugKeyEntrySchema, cacheDebugKeyFileSchema, cacheEntrySchema, cacheEntryWithDebugKeySchema, cacheFileSchema, cacheListItemSchema, cacheModeSchema, cacheOperationTypeSchema, cacheRecordingOpSchema, cacheRecordingSchema, cacheStatusSchema, captureEvalSpanError, caseDetailSchema, caseRowSchema, cellValueSchema, columnDefSchema, columnFormatSchema, columnKindSchema, createRunRequestSchema, createRunner, defaultConfigKeySchema, defineEval, deriveScopedSummaryFromCases, deriveStatusFromCaseRows, deriveStatusFromChildStatuses, deserializeCacheRecording, deserializeCacheValue, discoveryIssueSchema, evalAssert, evalChartAggregateSchema, evalChartAxisSchema, evalChartBuiltinMetricSchema, evalChartColorSchema, evalChartConfigSchema, evalChartMetricSchema, evalChartTooltipExtraSchema, evalChartTypeSchema, evalChartsConfigSchema, evalFreshnessStatusSchema, evalLog, evalSpan, evalStatAggregateSchema, evalStatItemSchema, evalStatsConfigSchema, evalSummarySchema, evalTracer, extractApiCalls, extractCacheEntries, extractCacheHits, extractLlmCalls, fileRefSchema, getCaseRowCaseKey, getCaseRowEvalKey, getCurrentScope, getEvalCaseInput, getEvalDisplayStatus, getEvalRegistry, getEvalStartTime, getEvalTitle, getNestedAttribute, hashCacheKey, hashCacheKeySync, incrementEvalOutput, isInEvalScope, jsonCellSchema, llmCallMetricFormatSchema, llmCallMetricPlacementSchema, llmCallMetricSchema, llmCallPricingSchema, llmCallsConfigSchema, mergeEvalOutput, nextEvalId, numberDisplayOptionsSchema, removeDefaultConfigSchema, repoFile, repoFileRefSchema, resolveApiCallsConfig, resolveLlmCallsConfig, runArtifactRefSchema, runCli, runInEvalRuntimeScope, 
runInEvalScope, runInExistingEvalScope, runLogEntrySchema, runLogLevelSchema, runLogLocationSchema, runLogPhaseSchema, runLogsConfigSchema, runManifestSchema, runSummarySchema, scoreTraceSchema, serializeCacheRecording, serializeCacheValue, serializedCacheSpanSchema, setEvalOutput, setScopeCacheContext, spanCacheOptionsSchema, sseEnvelopeSchema, startEvalBackgroundJob, traceAttributeDisplayFormatSchema, traceAttributeDisplayInputSchema, traceAttributeDisplayPlacementSchema, traceAttributeDisplaySchema, traceCacheRefSchema, traceDisplayConfigSchema, traceDisplayInputConfigSchema, traceSpanErrorSchema, traceSpanKindSchema, traceSpanSchema, traceSpanWarningSchema, trialSelectionModeSchema, updateManualScoreRequestSchema, z };
|
|
1
|
+
import { $ as llmCallsConfigSchema, $t as traceSpanKindSchema, A as extractApiCalls, An as getCurrentScope, At as evalChartTypeSchema, B as runSummarySchema, Bn as setEvalOutput, Bt as cacheRecordingOpSchema, Cn as evalExpect, Ct as evalChartAggregateSchema, D as sseEnvelopeSchema, Dt as evalChartConfigSchema, E as updateManualScoreRequestSchema, En as appendToEvalOutput, Et as evalChartColorSchema, F as getEvalDisplayStatus, Fn as mergeEvalOutput, Ft as cacheEntryWithDebugKeySchema, G as apiCallMetricPlacementSchema, Gt as traceCacheRefSchema, H as DEFAULT_LLM_CALLS_CONFIG, Hn as startEvalBackgroundJob, Ht as cacheStatusSchema, I as deriveScopedSummaryFromCases, In as nextEvalId, It as cacheFileSchema, J as defaultConfigKeySchema, Jt as traceAttributeDisplayPlacementSchema, K as apiCallMetricSchema, Kt as traceAttributeDisplayFormatSchema, L as deriveStatusFromCaseRows, Ln as runInEvalRuntimeScope, Lt as cacheListItemSchema, M as applyDerivedCallAttributes, Mn as getEvalStartTime, Mt as cacheDebugKeyEntrySchema, N as getNestedAttribute, Nn as incrementEvalOutput, Nt as cacheDebugKeyFileSchema, O as extractCacheEntries, On as evalAssert, Ot as evalChartMetricSchema, P as getEvalTitle, Pn as isInEvalScope, Pt as cacheEntrySchema, Q as llmCallPricingSchema, Qt as traceSpanErrorSchema, R as deriveStatusFromChildStatuses, Rn as runInEvalScope, Rt as cacheModeSchema, Sn as repoFile, St as scoreTraceSchema, T as createRunRequestSchema, Tn as advanceEvalTime, Tt as evalChartBuiltinMetricSchema, U as agentEvalsConfigSchema, Un as defineEval, Ut as serializedCacheSpanSchema, V as DEFAULT_API_CALLS_CONFIG, Vn as setScopeCacheContext, Vt as cacheRecordingSchema, W as apiCallMetricFormatSchema, Wn as getEvalRegistry, Wt as spanCacheOptionsSchema, X as llmCallMetricPlacementSchema, Xt as traceDisplayConfigSchema, Y as llmCallMetricFormatSchema, Yt as traceAttributeDisplaySchema, Z as llmCallMetricSchema, Zt as traceDisplayInputConfigSchema, _n as hashCacheKeySync, _t as 
evalSummarySchema, an as columnKindSchema, at as buildCaseKey, bn as serializeCacheRecording, bt as runLogLocationSchema, cn as numberDisplayOptionsSchema, ct as getCaseRowEvalKey, dn as z, dt as caseRowSchema, en as traceSpanSchema, et as removeDefaultConfigSchema, fn as buildTraceTree, ft as discoveryIssueSchema, gn as hashCacheKey, gt as evalStatsConfigSchema, hn as evalTracer, ht as evalStatItemSchema, in as columnFormatSchema, it as trialSelectionModeSchema, j as extractLlmCalls, jn as getEvalCaseInput, jt as evalChartsConfigSchema, k as extractCacheHits, kn as evalLog, kt as evalChartTooltipExtraSchema, ln as repoFileRefSchema, lt as assertionFailureSchema, mn as evalSpan, mt as evalStatAggregateSchema, nn as cellValueSchema, nt as resolveLlmCallsConfig, on as fileRefSchema, ot as buildEvalKey, pn as captureEvalSpanError, pt as evalFreshnessStatusSchema, q as apiCallsConfigSchema, qt as traceAttributeDisplayInputSchema, rn as columnDefSchema, rt as runLogsConfigSchema, sn as jsonCellSchema, st as getCaseRowCaseKey, tn as traceSpanWarningSchema, tt as resolveApiCallsConfig, un as runArtifactRefSchema, ut as caseDetailSchema, vn as deserializeCacheRecording, vt as runLogEntrySchema, wn as EvalAssertionError, wt as evalChartAxisSchema, xn as serializeCacheValue, xt as runLogPhaseSchema, yn as deserializeCacheValue, yt as runLogLevelSchema, z as runManifestSchema, zn as runInExistingEvalScope, zt as cacheOperationTypeSchema } from "./runOrchestration-DB0dwGrd.mjs";
|
|
2
|
+
import { n as createRunner, t as runCli } from "./cli-DRkwWgTj.mjs";
|
|
3
|
+
import "./src-BwKm3sKU.mjs";
|
|
4
|
+
export { DEFAULT_API_CALLS_CONFIG, DEFAULT_LLM_CALLS_CONFIG, EvalAssertionError, advanceEvalTime, agentEvalsConfigSchema, apiCallMetricFormatSchema, apiCallMetricPlacementSchema, apiCallMetricSchema, apiCallsConfigSchema, appendToEvalOutput, applyDerivedCallAttributes, assertionFailureSchema, buildCaseKey, buildEvalKey, buildTraceTree, cacheDebugKeyEntrySchema, cacheDebugKeyFileSchema, cacheEntrySchema, cacheEntryWithDebugKeySchema, cacheFileSchema, cacheListItemSchema, cacheModeSchema, cacheOperationTypeSchema, cacheRecordingOpSchema, cacheRecordingSchema, cacheStatusSchema, captureEvalSpanError, caseDetailSchema, caseRowSchema, cellValueSchema, columnDefSchema, columnFormatSchema, columnKindSchema, createRunRequestSchema, createRunner, defaultConfigKeySchema, defineEval, deriveScopedSummaryFromCases, deriveStatusFromCaseRows, deriveStatusFromChildStatuses, deserializeCacheRecording, deserializeCacheValue, discoveryIssueSchema, evalAssert, evalChartAggregateSchema, evalChartAxisSchema, evalChartBuiltinMetricSchema, evalChartColorSchema, evalChartConfigSchema, evalChartMetricSchema, evalChartTooltipExtraSchema, evalChartTypeSchema, evalChartsConfigSchema, evalExpect, evalFreshnessStatusSchema, evalLog, evalSpan, evalStatAggregateSchema, evalStatItemSchema, evalStatsConfigSchema, evalSummarySchema, evalTracer, extractApiCalls, extractCacheEntries, extractCacheHits, extractLlmCalls, fileRefSchema, getCaseRowCaseKey, getCaseRowEvalKey, getCurrentScope, getEvalCaseInput, getEvalDisplayStatus, getEvalRegistry, getEvalStartTime, getEvalTitle, getNestedAttribute, hashCacheKey, hashCacheKeySync, incrementEvalOutput, isInEvalScope, jsonCellSchema, llmCallMetricFormatSchema, llmCallMetricPlacementSchema, llmCallMetricSchema, llmCallPricingSchema, llmCallsConfigSchema, mergeEvalOutput, nextEvalId, numberDisplayOptionsSchema, removeDefaultConfigSchema, repoFile, repoFileRefSchema, resolveApiCallsConfig, resolveLlmCallsConfig, runArtifactRefSchema, runCli, 
runInEvalRuntimeScope, runInEvalScope, runInExistingEvalScope, runLogEntrySchema, runLogLevelSchema, runLogLocationSchema, runLogPhaseSchema, runLogsConfigSchema, runManifestSchema, runSummarySchema, scoreTraceSchema, serializeCacheRecording, serializeCacheValue, serializedCacheSpanSchema, setEvalOutput, setScopeCacheContext, spanCacheOptionsSchema, sseEnvelopeSchema, startEvalBackgroundJob, traceAttributeDisplayFormatSchema, traceAttributeDisplayInputSchema, traceAttributeDisplayPlacementSchema, traceAttributeDisplaySchema, traceCacheRefSchema, traceDisplayConfigSchema, traceDisplayInputConfigSchema, traceSpanErrorSchema, traceSpanKindSchema, traceSpanSchema, traceSpanWarningSchema, trialSelectionModeSchema, updateManualScoreRequestSchema, z };
|
package/dist/runChild.mjs
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import { B as runSummarySchema,
|
|
1
|
+
import { B as runSummarySchema, Dn as configureEvalRunLogs, T as createRunRequestSchema, b as loadConfig, gt as evalStatsConfigSchema, jt as evalChartsConfigSchema, ot as buildEvalKey, r as getTargetEvals$1, rn as columnDefSchema, t as executeRun, v as parseEvalDiscovery, w as createFsCacheStore, z as runManifestSchema } from "./runOrchestration-DB0dwGrd.mjs";
|
|
2
2
|
import { createHash } from "node:crypto";
|
|
3
3
|
import { readFile } from "node:fs/promises";
|
|
4
4
|
import { relative } from "node:path";
|
|
@@ -2,9 +2,9 @@ import { createRequire, registerHooks } from "node:module";
|
|
|
2
2
|
import { createHash } from "node:crypto";
|
|
3
3
|
import { mkdir, readFile, readdir, rename, rm, stat, writeFile } from "node:fs/promises";
|
|
4
4
|
import { extname, isAbsolute, join, relative, resolve } from "node:path";
|
|
5
|
-
import {
|
|
5
|
+
import { formatWithOptions, isDeepStrictEqual } from "node:util";
|
|
6
6
|
import { AsyncLocalStorage } from "node:async_hooks";
|
|
7
|
-
import {
|
|
7
|
+
import { z, z as z$1 } from "zod/v4";
|
|
8
8
|
import { Buffer as Buffer$1 } from "node:buffer";
|
|
9
9
|
import { gunzipSync, gzipSync } from "node:zlib";
|
|
10
10
|
import { getCompositeKey } from "@ls-stack/utils/getCompositeKey";
|
|
@@ -29,23 +29,6 @@ function defineEval(definition) {
|
|
|
29
29
|
});
|
|
30
30
|
}
|
|
31
31
|
//#endregion
|
|
32
|
-
//#region ../sdk/src/repoFile.ts
|
|
33
|
-
/**
|
|
34
|
-
* Create a file reference that can be emitted via `setEvalOutput(...)` and rendered
|
|
35
|
-
* by a column configured with `format: 'image' | 'audio' | 'video' | 'file'`.
|
|
36
|
-
*
|
|
37
|
-
* @param path Relative or absolute path to the repository file.
|
|
38
|
-
* @param mimeType Optional MIME type hint for UI rendering.
|
|
39
|
-
* @returns A repo-backed file reference suitable for file/media columns.
|
|
40
|
-
*/
|
|
41
|
-
function repoFile(path, mimeType) {
|
|
42
|
-
return {
|
|
43
|
-
source: "repo",
|
|
44
|
-
path,
|
|
45
|
-
mimeType
|
|
46
|
-
};
|
|
47
|
-
}
|
|
48
|
-
//#endregion
|
|
49
32
|
//#region ../sdk/src/runtime.ts
|
|
50
33
|
const scopeStorage = new AsyncLocalStorage();
|
|
51
34
|
const runtimeScopeStorage = new AsyncLocalStorage();
|
|
@@ -639,10 +622,12 @@ function incrementEvalOutput(key, delta) {
|
|
|
639
622
|
});
|
|
640
623
|
}
|
|
641
624
|
/**
|
|
642
|
-
* Assert a condition for the current eval case and throw on failure.
|
|
625
|
+
* Assert a truthy condition for the current eval case and throw on failure.
|
|
643
626
|
*
|
|
644
627
|
* Calls made outside `runInEvalScope(...)` are ignored so shared workflow code
|
|
645
|
-
* can safely reuse `evalAssert(...)` when it also runs outside an eval.
|
|
628
|
+
* can safely reuse `evalAssert(...)` when it also runs outside an eval. The
|
|
629
|
+
* TypeScript assertion signature still narrows the checked value after the
|
|
630
|
+
* call.
|
|
646
631
|
*/
|
|
647
632
|
function evalAssert(condition, message) {
|
|
648
633
|
if (condition) return;
|
|
@@ -653,6 +638,156 @@ function evalAssert(condition, message) {
|
|
|
653
638
|
throw error;
|
|
654
639
|
}
|
|
655
640
|
//#endregion
|
|
641
|
+
//#region ../sdk/src/evalExpect.ts
|
|
642
|
+
/**
 * Inspection limits applied to every value rendered into an expectation
 * failure message, so large received/expected values stay readable.
 */
const expectFormatOptions = {
	depth: 5,
	maxArrayLength: 50,
	maxStringLength: 1e3,
	breakLength: 80,
	compact: 3
};
/**
 * Render a value for use inside an expectation failure message.
 *
 * The value is routed through an explicit `%O` placeholder instead of being
 * handed to `formatWithOptions` as the first argument. When a lone string is
 * the first argument it is treated as the format string and returned verbatim,
 * which skips the options above: strings came out unquoted (so `"1"` and `1`
 * were indistinguishable and `""` rendered as nothing) and were never
 * truncated by `maxStringLength`.
 *
 * @param value Any received or expected value.
 * @returns The inspected, length-limited text for the value.
 */
function formatExpectValue(value) {
	return formatWithOptions(expectFormatOptions, "%O", value);
}
|
|
652
|
+
/**
 * Report whether `value` is a non-null object (arrays and other object
 * instances included), i.e. something that can be probed with `in`.
 */
function isRecordLike$4(value) {
	if (value === null) return false;
	return typeof value === "object";
}
|
|
655
|
+
/**
 * Report whether `value` exposes a numeric `length`: either a string or a
 * non-null object whose `length` property is a number.
 */
function isLengthLike(value) {
	switch (typeof value) {
		case "string": return true;
		case "object": return value !== null && "length" in value && typeof value.length === "number";
		default: return false;
	}
}
|
|
659
|
+
/**
 * Check that `received` contains every key of `expected` with a matching
 * value; extra keys on `received` are ignored.
 *
 * Nested expected objects are matched recursively as subsets. Arrays and
 * objects whose comparable state is not stored in own enumerable properties
 * (`Date`, `RegExp`, `Map`, `Set`, `Error`) are compared with
 * `isDeepStrictEqual` instead: recursing into them iterates zero entries, so
 * e.g. an expected `new Date(1)` used to match any object-like received value.
 *
 * @param received Value under test; anything that is not a non-null object fails.
 * @param expected Object describing the required subset.
 * @returns `true` when every expected entry is satisfied.
 */
function matchesObjectSubset(received, expected) {
	const isObjectLike = (value) => typeof value === "object" && value !== null;
	const hasOpaqueState = (value) => value instanceof Date || value instanceof RegExp || value instanceof Map || value instanceof Set || value instanceof Error;
	if (!isObjectLike(received)) return false;
	for (const [key, expectedValue] of Object.entries(expected)) {
		if (!(key in received)) return false;
		const receivedValue = received[key];
		const compareAsSubset = isObjectLike(expectedValue) && !Array.isArray(expectedValue) && !hasOpaqueState(expectedValue);
		if (compareAsSubset) {
			if (!matchesObjectSubset(receivedValue, expectedValue)) return false;
			continue;
		}
		if (!isDeepStrictEqual(receivedValue, expectedValue)) return false;
	}
	return true;
}
|
|
672
|
+
/**
 * Membership test behind `toContain`.
 *
 * Sets use `has`, arrays use `includes`, and strings require a string needle
 * and use substring search. Every other container type yields `false`.
 */
function containsValue(received, expected) {
	if (received instanceof Set) return received.has(expected);
	if (Array.isArray(received)) return received.includes(expected);
	const bothStrings = typeof received === "string" && typeof expected === "string";
	return bothStrings ? received.includes(expected) : false;
}
|
|
678
|
+
/**
 * Walk a dot-separated `path` through `received`.
 *
 * @param received Root value to read from.
 * @param path Dot-separated property path; the empty string never resolves.
 * @returns `{ exists: true, value }` when every segment is present (checked
 * with `in` on a non-null object), otherwise `{ exists: false, value: undefined }`.
 */
function getPropertyAtPath(received, path) {
	const missing = {
		exists: false,
		value: void 0
	};
	if (path === "") return missing;
	let cursor = received;
	for (const segment of path.split(".")) {
		const canDescend = typeof cursor === "object" && cursor !== null && segment in cursor;
		if (!canDescend) return missing;
		cursor = cursor[segment];
	}
	return {
		exists: true,
		value: cursor
	};
}
|
|
696
|
+
/**
 * Matcher object returned by `evalExpect(...)`.
 *
 * Holds the received value and a negation flag. Each matcher computes a
 * boolean outcome and hands it to `check(...)`, which reports a failure
 * through `evalAssert(false, message)` when the outcome does not agree with
 * the negation flag.
 */
var EvalExpectationImpl = class EvalExpectationImpl {
	received;
	negated;
	constructor(received, negated) {
		this.received = received;
		this.negated = negated;
	}
	/** A new expectation on the same value with the negation flag flipped. */
	get not() {
		const flipped = !this.negated;
		return new EvalExpectationImpl(this.received, flipped);
	}
	/** Passes when the received value is `Object.is`-identical to `expected`. */
	toBe(expected) {
		const pass = Object.is(this.received, expected);
		const actualText = formatExpectValue(this.received);
		const expectedText = formatExpectValue(expected);
		this.check(pass, `Expected ${actualText} to be ${expectedText}`, `Expected ${actualText} not to be ${expectedText}`);
	}
	/** Passes when the received value is deep-strict-equal to `expected`. */
	toEqual(expected) {
		const pass = isDeepStrictEqual(this.received, expected);
		const actualText = formatExpectValue(this.received);
		const expectedText = formatExpectValue(expected);
		this.check(pass, `Expected ${actualText} to equal ${expectedText}`, `Expected ${actualText} not to equal ${expectedText}`);
	}
	/** Passes when the received value satisfies `matchesObjectSubset` for `expected`. */
	toMatchObject(expected) {
		const pass = matchesObjectSubset(this.received, expected);
		const actualText = formatExpectValue(this.received);
		const expectedText = formatExpectValue(expected);
		this.check(pass, `Expected ${actualText} to match object ${expectedText}`, `Expected ${actualText} not to match object ${expectedText}`);
	}
	/** Passes when `containsValue` finds `expected` in the received value. */
	toContain(expected) {
		const pass = containsValue(this.received, expected);
		const actualText = formatExpectValue(this.received);
		const expectedText = formatExpectValue(expected);
		this.check(pass, `Expected ${actualText} to contain ${expectedText}`, `Expected ${actualText} not to contain ${expectedText}`);
	}
	/** Passes when the received value has a numeric `length` equal to `expected`. */
	toHaveLength(expected) {
		const pass = isLengthLike(this.received) && this.received.length === expected;
		const actualText = formatExpectValue(this.received);
		this.check(pass, `Expected ${actualText} to have length ${expected}`, `Expected ${actualText} not to have length ${expected}`);
	}
	/**
	 * Passes when the dot-separated `path` resolves on the received value and,
	 * if a second argument is supplied, the resolved value deep-equals it.
	 */
	toHaveProperty(path, ...expected) {
		const lookup = getPropertyAtPath(this.received, path);
		const valueGiven = expected.length !== 0;
		const expectedValue = expected[0];
		const pass = lookup.exists && (!valueGiven || isDeepStrictEqual(lookup.value, expectedValue));
		const expectedSuffix = valueGiven ? ` with value ${formatExpectValue(expectedValue)}` : "";
		const actualText = formatExpectValue(this.received);
		this.check(pass, `Expected ${actualText} to have property "${path}"${expectedSuffix}`, `Expected ${actualText} not to have property "${path}"${expectedSuffix}`);
	}
	toBeGreaterThan(expected) {
		this.checkNumberComparison("to be greater than", expected, (received) => received > expected);
	}
	toBeGreaterThanOrEqual(expected) {
		this.checkNumberComparison("to be greater than or equal to", expected, (received) => received >= expected);
	}
	toBeLessThan(expected) {
		this.checkNumberComparison("to be less than", expected, (received) => received < expected);
	}
	toBeLessThanOrEqual(expected) {
		this.checkNumberComparison("to be less than or equal to", expected, (received) => received <= expected);
	}
	/**
	 * Passes when the received value is a finite number whose distance from
	 * `expected` is below half a unit of the last of `precision` decimals.
	 */
	toBeCloseTo(expected, precision = 2) {
		const tolerance = 10 ** -precision / 2;
		const pass = typeof this.received === "number" && Number.isFinite(this.received) && Math.abs(this.received - expected) < tolerance;
		const actualText = formatExpectValue(this.received);
		this.check(pass, `Expected ${actualText} to be close to ${expected} at ${precision} decimals`, `Expected ${actualText} not to be close to ${expected} at ${precision} decimals`);
	}
	/** Passes when the received value is a string matched by `expected`. */
	toMatch(expected) {
		// Test against a fresh copy so a global/sticky regex's `lastIndex` is neither read nor advanced.
		const statelessExpected = new RegExp(expected.source, expected.flags);
		const pass = typeof this.received === "string" && statelessExpected.test(this.received);
		const actualText = formatExpectValue(this.received);
		this.check(pass, `Expected ${actualText} to match ${expected}`, `Expected ${actualText} not to match ${expected}`);
	}
	/**
	 * Report a matcher outcome: nothing happens when the outcome agrees with the
	 * negation flag, otherwise the matching message goes to `evalAssert`.
	 */
	check(pass, positiveMessage, negativeMessage) {
		const failed = this.negated ? pass : !pass;
		if (!failed) return;
		const message = this.negated ? negativeMessage : positiveMessage;
		evalAssert(false, message);
	}
	/** Shared body of the ordering matchers; non-number received values never pass. */
	checkNumberComparison(label, expected, predicate) {
		const pass = typeof this.received === "number" && predicate(this.received);
		const actualText = formatExpectValue(this.received);
		this.check(pass, `Expected ${actualText} ${label} ${expected}`, `Expected ${actualText} not ${label} ${expected}`);
	}
};
|
|
764
|
+
/**
 * Begin an expectation chain for `value`.
 *
 * The returned object starts non-negated; use `.not` to invert a matcher.
 * Failing matchers are reported through `evalAssert(...)`, so they follow its
 * behavior: the failure is recorded and thrown only while an eval case scope
 * is active.
 */
function evalExpect(value) {
	const expectation = new EvalExpectationImpl(value, false);
	return expectation;
}
|
|
773
|
+
//#endregion
|
|
774
|
+
//#region ../sdk/src/repoFile.ts
|
|
775
|
+
/**
 * Build a repo-backed file reference for `setEvalOutput(...)`, to be rendered
 * by a column configured with `format: 'image' | 'audio' | 'video' | 'file'`.
 *
 * @param path Relative or absolute path to the repository file.
 * @param mimeType Optional MIME type hint for UI rendering.
 * @returns An object `{ source: "repo", path, mimeType }`.
 */
function repoFile(path, mimeType) {
	const reference = {
		source: "repo",
		path,
		mimeType
	};
	return reference;
}
|
|
790
|
+
//#endregion
|
|
656
791
|
//#region ../sdk/src/cacheSerialization.ts
|
|
657
792
|
const serializedCacheValueMarker = "__agentEvalsCacheSerialization";
|
|
658
793
|
const jsonSafeCacheValueVersion = "json-safe-v1";
|
|
@@ -1436,7 +1571,6 @@ function createTraceCache(generateSpanId) {
|
|
|
1436
1571
|
const namespace = info.namespace ?? `${cacheCtx.evalId}__${info.name}`;
|
|
1437
1572
|
const keyHash = await hashCacheKey({
|
|
1438
1573
|
namespace,
|
|
1439
|
-
codeFingerprint: cacheCtx.codeFingerprint,
|
|
1440
1574
|
key: info.key
|
|
1441
1575
|
}, { serializeFileBytes: info.serializeFileBytes === true });
|
|
1442
1576
|
const activeSpan = scope.activeSpanStack.at(-1);
|
|
@@ -1879,10 +2013,9 @@ async function traceSpanInternal(info, fn) {
|
|
|
1879
2013
|
const cacheCtx = scope.cacheContext;
|
|
1880
2014
|
if (cacheOpts !== void 0 && cacheCtx !== void 0 && scope.replayingDepth === 0) {
|
|
1881
2015
|
const ctx = cacheCtx;
|
|
1882
|
-
const namespace = cacheOpts
|
|
2016
|
+
const namespace = getRequiredSpanCacheNamespace(cacheOpts);
|
|
1883
2017
|
const keyHash = await hashCacheKey({
|
|
1884
2018
|
namespace,
|
|
1885
|
-
codeFingerprint: ctx.codeFingerprint,
|
|
1886
2019
|
key: cacheOpts.key
|
|
1887
2020
|
}, { serializeFileBytes: cacheOpts.serializeFileBytes === true });
|
|
1888
2021
|
mergeSpanAttributes(spanRecord, {
|
|
@@ -1966,6 +2099,12 @@ async function traceSpanInternal(info, fn) {
|
|
|
1966
2099
|
scope.activeSpanStack.pop();
|
|
1967
2100
|
}
|
|
1968
2101
|
}
|
|
2102
|
+
/**
 * Read the namespace from a span's cache options.
 *
 * @param cacheOpts The span's `cache` option value.
 * @returns The `namespace` string.
 * @throws Error when `cacheOpts` is not record-like or its `namespace` is not
 * a non-empty string.
 */
function getRequiredSpanCacheNamespace(cacheOpts) {
	const candidate = isRecordLike$1(cacheOpts) ? cacheOpts.namespace : void 0;
	if (typeof candidate === "string" && candidate.length !== 0) return candidate;
	throw new Error("Cached spans require a non-empty cache.namespace");
}
|
|
1969
2108
|
/**
|
|
1970
2109
|
* Trace builder used to create hierarchical spans and checkpoints during eval
|
|
1971
2110
|
* execution.
|
|
@@ -2248,8 +2387,8 @@ const cacheModeSchema = z.enum([
|
|
|
2248
2387
|
const spanCacheOptionsSchema = z.object({
|
|
2249
2388
|
/** Arbitrary JSON-safe value used to derive the cache key. */
|
|
2250
2389
|
key: z.unknown(),
|
|
2251
|
-
/**
|
|
2252
|
-
namespace: z.string().
|
|
2390
|
+
/** Required cache namespace shared by span cache entries in the same domain. */
|
|
2391
|
+
namespace: z.string().min(1),
|
|
2253
2392
|
/**
|
|
2254
2393
|
* Include native `Blob`/`File` bytes in the cache key. By default only stable
|
|
2255
2394
|
* metadata (`type`, `size`, plus `name`/`lastModified` for `File`) is used.
|
|
@@ -4197,7 +4336,7 @@ async function writeDebugKeyFile(debugDir, debugFile) {
|
|
|
4197
4336
|
await mkdir(debugDir, { recursive: true });
|
|
4198
4337
|
const filePath = ownerPath(debugDir, debugFile.owner);
|
|
4199
4338
|
const tmpPath = `${filePath}.${process.pid.toString()}.tmp`;
|
|
4200
|
-
await writeFile(tmpPath, JSON.stringify(debugFile));
|
|
4339
|
+
await writeFile(tmpPath, JSON.stringify(debugFile, null, 2));
|
|
4201
4340
|
await rename(tmpPath, filePath);
|
|
4202
4341
|
}
|
|
4203
4342
|
function pruneEntries(entries, namespace, maxEntries, protectedKey) {
|
|
@@ -6185,4 +6324,4 @@ function toLastRunStatus(status) {
|
|
|
6185
6324
|
return status === "pending" ? null : status;
|
|
6186
6325
|
}
|
|
6187
6326
|
//#endregion
|
|
6188
|
-
export { llmCallsConfigSchema as $, traceSpanKindSchema as $t, extractApiCalls as A,
|
|
6327
|
+
export { llmCallsConfigSchema as $, traceSpanKindSchema as $t, extractApiCalls as A, getCurrentScope as An, evalChartTypeSchema as At, runSummarySchema as B, setEvalOutput as Bn, cacheRecordingOpSchema as Bt, validateCharts as C, evalExpect as Cn, evalChartAggregateSchema as Ct, sseEnvelopeSchema as D, configureEvalRunLogs as Dn, evalChartConfigSchema as Dt, updateManualScoreRequestSchema as E, appendToEvalOutput as En, evalChartColorSchema as Et, getEvalDisplayStatus as F, mergeEvalOutput as Fn, cacheEntryWithDebugKeySchema as Ft, apiCallMetricPlacementSchema as G, traceCacheRefSchema as Gt, DEFAULT_LLM_CALLS_CONFIG as H, startEvalBackgroundJob as Hn, cacheStatusSchema as Ht, deriveScopedSummaryFromCases as I, nextEvalId as In, cacheFileSchema as It, defaultConfigKeySchema as J, traceAttributeDisplayPlacementSchema as Jt, apiCallMetricSchema as K, traceAttributeDisplayFormatSchema as Kt, deriveStatusFromCaseRows as L, runInEvalRuntimeScope as Ln, cacheListItemSchema as Lt, applyDerivedCallAttributes as M, getEvalStartTime as Mn, cacheDebugKeyEntrySchema as Mt, getNestedAttribute as N, incrementEvalOutput as Nn, cacheDebugKeyFileSchema as Nt, extractCacheEntries as O, evalAssert as On, evalChartMetricSchema as Ot, getEvalTitle as P, isInEvalScope as Pn, cacheEntrySchema as Pt, llmCallPricingSchema as Q, traceSpanErrorSchema as Qt, deriveStatusFromChildStatuses as R, runInEvalScope as Rn, cacheModeSchema as Rt, normalizeScoreDef as S, repoFile as Sn, scoreTraceSchema as St, createRunRequestSchema as T, advanceEvalTime as Tn, evalChartBuiltinMetricSchema as Tt, agentEvalsConfigSchema as U, defineEval as Un, serializedCacheSpanSchema as Ut, DEFAULT_API_CALLS_CONFIG as V, setScopeCacheContext as Vn, cacheRecordingSchema as Vt, apiCallMetricFormatSchema as W, getEvalRegistry as Wn, spanCacheOptionsSchema as Wt, llmCallMetricPlacementSchema as X, traceDisplayConfigSchema as Xt, llmCallMetricFormatSchema as Y, traceAttributeDisplaySchema as Yt, llmCallMetricSchema as Z, 
traceDisplayInputConfigSchema as Zt, loadEvalModule as _, hashCacheKeySync as _n, evalSummarySchema as _t, getLastRunStatuses as a, columnKindSchema as an, buildCaseKey as at, loadConfig as b, serializeCacheRecording as bn, runLogLocationSchema as bt, loadPersistedRunSnapshots as c, numberDisplayOptionsSchema as cn, getCaseRowEvalKey as ct, persistRunState as d, z$1 as dn, caseRowSchema as dt, traceSpanSchema as en, removeDefaultConfigSchema as et, recomputeEvalStatusesInRuns as f, buildTraceTree as fn, discoveryIssueSchema as ft, deriveEvalFreshness as g, hashCacheKey as gn, evalStatsConfigSchema as gt, resolveArtifactPath as h, evalTracer as hn, evalStatItemSchema as ht, generateRunId as i, columnFormatSchema as in, trialSelectionModeSchema as it, extractLlmCalls as j, getEvalCaseInput as jn, evalChartsConfigSchema as jt, extractCacheHits as k, evalLog as kn, evalChartTooltipExtraSchema as kt, nextShortIdFromSnapshots as l, repoFileRefSchema as ln, assertionFailureSchema as lt, runTouchesEval as m, evalSpan as mn, evalStatAggregateSchema as mt, getTargetEvalKeys as n, cellValueSchema as nn, resolveLlmCallsConfig as nt, getLatestRunInfos as o, fileRefSchema as on, buildEvalKey as ot, recomputePersistedCaseStatus as p, captureEvalSpanError as pn, evalFreshnessStatusSchema as pt, apiCallsConfigSchema as q, traceAttributeDisplayInputSchema as qt, getTargetEvals as r, columnDefSchema as rn, runLogsConfigSchema as rt, loadPersistedRunSnapshot as s, jsonCellSchema as sn, getCaseRowCaseKey as st, executeRun as t, traceSpanWarningSchema as tn, resolveApiCallsConfig as tt, persistCaseDetail as u, runArtifactRefSchema as un, caseDetailSchema as ut, parseEvalDiscovery as v, deserializeCacheRecording as vn, runLogEntrySchema as vt, createFsCacheStore as w, EvalAssertionError as wn, evalChartAxisSchema as wt, buildDeclaredColumnDefs as x, serializeCacheValue as xn, runLogPhaseSchema as xt, resolveEvalDefaultConfig as y, deserializeCacheValue as yn, runLogLevelSchema as yt, 
runManifestSchema as z, runInExistingEvalScope as zn, cacheOperationTypeSchema as zt };
|
|
@@ -1,2 +1,2 @@
|
|
|
1
|
-
import { n as initRunner, t as getRunnerInstance } from "./runner-
|
|
1
|
+
import { n as initRunner, t as getRunnerInstance } from "./runner-DRINcaN_.mjs";
|
|
2
2
|
export { getRunnerInstance, initRunner };
|
|
@@ -1,5 +1,5 @@
|
|
|
1
|
-
import { n as createRunner } from "./cli-
|
|
2
|
-
import "./src-
|
|
1
|
+
import { n as createRunner } from "./cli-DRkwWgTj.mjs";
|
|
2
|
+
import "./src-BwKm3sKU.mjs";
|
|
3
3
|
//#region ../../apps/server/src/runner.ts
|
|
4
4
|
let runnerInstance = null;
|
|
5
5
|
function getRunnerInstance() {
|
package/package.json
CHANGED
|
@@ -39,8 +39,9 @@ file wires up cases and scoring; the real `evalTracer.span(...)` calls sit
|
|
|
39
39
|
inside the workflow, agent, or tool functions that both production and evals
|
|
40
40
|
invoke.
|
|
41
41
|
|
|
42
|
-
`evalTracer`, `evalSpan`, output helpers, `evalLog`,
|
|
43
|
-
ambient no-ops when called outside an eval case scope, so
|
|
42
|
+
`evalTracer`, `evalSpan`, output helpers, `evalLog`, `evalAssert`, and
|
|
43
|
+
`evalExpect` are ambient no-ops when called outside an eval case scope, so
|
|
44
|
+
leaving them in
|
|
44
45
|
production paths is safe — they only record anything when the product code runs
|
|
45
46
|
inside an eval's `execute`. Use `isInEvalScope()` to branch on eval-only behavior in shared code
|
|
46
47
|
(e.g. skip a real network side effect): it returns `null` outside eval-owned
|
|
@@ -67,6 +68,7 @@ import {
|
|
|
67
68
|
appendToEvalOutput,
|
|
68
69
|
captureEvalSpanError,
|
|
69
70
|
evalAssert,
|
|
71
|
+
evalExpect,
|
|
70
72
|
evalSpan,
|
|
71
73
|
evalTracer,
|
|
72
74
|
getEvalCaseInput,
|
|
@@ -87,7 +89,10 @@ export async function runRefundWorkflow(input: RefundInput) {
|
|
|
87
89
|
{
|
|
88
90
|
kind: 'llm',
|
|
89
91
|
name: 'plan-refund',
|
|
90
|
-
cache: {
|
|
92
|
+
cache: {
|
|
93
|
+
namespace: 'refund-workflow__plan-refund',
|
|
94
|
+
key: { prompt: input.message, model: 'gpt-4o-mini' },
|
|
95
|
+
},
|
|
91
96
|
},
|
|
92
97
|
async () => {
|
|
93
98
|
let text: string;
|
|
@@ -119,6 +124,7 @@ export async function runRefundWorkflow(input: RefundInput) {
|
|
|
119
124
|
setEvalOutput('reviewId', reviewId);
|
|
120
125
|
mergeEvalOutput('metadata', { approved: result.approved });
|
|
121
126
|
evalAssert(result.approved, 'refund workflow should approve the case');
|
|
127
|
+
evalExpect(result.finalText).toMatch(/refund/i);
|
|
122
128
|
evalSpan.setAttribute('output', { result, reviewId });
|
|
123
129
|
return result;
|
|
124
130
|
},
|
|
@@ -319,15 +325,18 @@ Their shapes live in the types; no need to memorize the option set.
|
|
|
319
325
|
|
|
320
326
|
## Cached operations
|
|
321
327
|
|
|
322
|
-
Wrap a costly pure span in `cache: { key }` so later runs replay its
|
|
323
|
-
effects without re-executing:
|
|
328
|
+
Wrap a costly pure span in `cache: { namespace, key }` so later runs replay its
|
|
329
|
+
recorded effects without re-executing:
|
|
324
330
|
|
|
325
331
|
```ts
|
|
326
332
|
await evalTracer.span(
|
|
327
333
|
{
|
|
328
334
|
kind: 'llm',
|
|
329
335
|
name: 'plan-refund',
|
|
330
|
-
cache: {
|
|
336
|
+
cache: {
|
|
337
|
+
namespace: 'refund-workflow__plan-refund',
|
|
338
|
+
key: { prompt: input.message, model: 'gpt-4o-mini' },
|
|
339
|
+
},
|
|
331
340
|
},
|
|
332
341
|
async () => {
|
|
333
342
|
const result = await llm.complete(input.message);
|
|
@@ -368,12 +377,13 @@ Mental model:
|
|
|
368
377
|
namespace, and hit/miss status. When called directly from the case body
|
|
369
378
|
(no surrounding span), the ref is recorded on the case detail's `cacheRefs`
|
|
370
379
|
array.
|
|
371
|
-
-
|
|
372
|
-
|
|
373
|
-
-
|
|
374
|
-
|
|
375
|
-
|
|
376
|
-
|
|
380
|
+
- Cache identity is the namespace plus the authored key. Source-file
|
|
381
|
+
fingerprints are stored as metadata for inspection, but do not participate in
|
|
382
|
+
cache-key hashing.
|
|
383
|
+
- Cached spans require an explicit `cache.namespace`; value caches default to
|
|
384
|
+
`${evalId}__${name}` and can be overridden with `namespace`. Matching
|
|
385
|
+
namespaces share entries across operations/evals that use the same authored
|
|
386
|
+
key.
|
|
377
387
|
- Authored eval ids are unique within one eval file. The exact eval identity is
|
|
378
388
|
the workspace-relative file path plus eval id, so the same id can be reused in
|
|
379
389
|
different files. Case ids must be unique within one eval; duplicate case ids
|
|
@@ -455,16 +465,17 @@ When adding or changing evals:
|
|
|
455
465
|
1. Put the tracing + ambient SDK calls in the product code that runs in both
|
|
456
466
|
production and evals. Keep eval files thin.
|
|
457
467
|
2. Use realistic cases drawn from real product flows; avoid placeholder inputs.
|
|
458
|
-
3. `evalAssert` for hard invariants
|
|
459
|
-
|
|
468
|
+
3. `evalAssert` for hard invariants and truthy type narrowing, `evalExpect`
|
|
469
|
+
for non-trivial comparisons, `scores` for graded signals, `passThreshold`
|
|
470
|
+
only on scores that should gate pass/fail.
|
|
460
471
|
4. Surface reviewable values through execute-context `setOutput` or ambient
|
|
461
472
|
`setEvalOutput` in shared workflow code, and shape them with `columns`
|
|
462
473
|
formats from the `ColumnFormat` type.
|
|
463
474
|
5. Promote high-signal span attributes with `traceDisplay` so they surface in
|
|
464
475
|
the trace tree and detail pane.
|
|
465
|
-
6. Cache costly pure spans with `cache: { key }` and pure spanless
|
|
466
|
-
`evalTracer.cache(...)`; never cache operations whose external
|
|
467
|
-
you depend on.
|
|
476
|
+
6. Cache costly pure spans with `cache: { namespace, key }` and pure spanless
|
|
477
|
+
values with `evalTracer.cache(...)`; never cache operations whose external
|
|
478
|
+
side effects you depend on.
|
|
468
479
|
7. Sanity-check after changes: `agent-evals list`, then
|
|
469
480
|
`agent-evals run --eval <id>`; use `--file <path|glob>` to target one file
|
|
470
481
|
when multiple files use the same eval id.
|
package/dist/src-CLMrRle2.mjs
DELETED