@ls-stack/agent-eval 0.26.3 → 0.27.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{app-C-yzV9J8.mjs → app-ByMLOds2.mjs} +3 -3
- package/dist/bin.mjs +1 -1
- package/dist/{cli-T6FYE2Bk.mjs → cli-DRkwWgTj.mjs} +3 -3
- package/dist/index.d.mts +37 -4
- package/dist/index.mjs +4 -4
- package/dist/runChild.mjs +1 -1
- package/dist/{runOrchestration-CW_Fzr2c.mjs → runOrchestration-DB0dwGrd.mjs} +157 -22
- package/dist/{runner-DesCrD7Z.mjs → runner-39KGoaM1.mjs} +1 -1
- package/dist/{runner-BsRloAm3.mjs → runner-DRINcaN_.mjs} +2 -2
- package/dist/src-BwKm3sKU.mjs +3 -0
- package/package.json +1 -1
- package/skills/agent-eval/SKILL.md +8 -4
- package/dist/src-CbVnqjW3.mjs +0 -3
|
@@ -1,6 +1,6 @@
|
|
|
1
|
-
import { E as updateManualScoreRequestSchema, T as createRunRequestSchema } from "./runOrchestration-CW_Fzr2c.mjs";
|
|
2
|
-
import "./src-CbVnqjW3.mjs";
|
|
3
|
-
import { t as getRunnerInstance } from "./runner-BsRloAm3.mjs";
|
|
1
|
+
import { E as updateManualScoreRequestSchema, T as createRunRequestSchema } from "./runOrchestration-DB0dwGrd.mjs";
|
|
2
|
+
import "./src-BwKm3sKU.mjs";
|
|
3
|
+
import { t as getRunnerInstance } from "./runner-DRINcaN_.mjs";
|
|
4
4
|
import { readFile } from "node:fs/promises";
|
|
5
5
|
import { dirname, isAbsolute, join, relative, resolve, sep } from "node:path";
|
|
6
6
|
import { z } from "zod/v4";
|
package/dist/bin.mjs
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import { B as runSummarySchema, C as validateCharts, F as getEvalDisplayStatus, I as deriveScopedSummaryFromCases, P as getEvalTitle, S as normalizeScoreDef,
|
|
1
|
+
import { B as runSummarySchema, C as validateCharts, F as getEvalDisplayStatus, I as deriveScopedSummaryFromCases, P as getEvalTitle, S as normalizeScoreDef, Wn as getEvalRegistry, _ as loadEvalModule, a as getLastRunStatuses, b as loadConfig, c as loadPersistedRunSnapshots, ct as getCaseRowEvalKey, d as persistRunState, f as recomputeEvalStatusesInRuns, g as deriveEvalFreshness, h as resolveArtifactPath, i as generateRunId, l as nextShortIdFromSnapshots, m as runTouchesEval, n as getTargetEvalKeys, nt as resolveLlmCallsConfig, o as getLatestRunInfos, ot as buildEvalKey, p as recomputePersistedCaseStatus, s as loadPersistedRunSnapshot, st as getCaseRowCaseKey, tt as resolveApiCallsConfig, u as persistCaseDetail, v as parseEvalDiscovery, w as createFsCacheStore, x as buildDeclaredColumnDefs, y as resolveEvalDefaultConfig } from "./runOrchestration-DB0dwGrd.mjs";
|
|
2
2
|
import { createHash } from "node:crypto";
|
|
3
3
|
import { mkdir, readFile, rm, writeFile } from "node:fs/promises";
|
|
4
4
|
import { dirname, join, relative, resolve } from "node:path";
|
|
@@ -1041,8 +1041,8 @@ async function commandApp(args) {
|
|
|
1041
1041
|
const { serve } = await import("@hono/node-server");
|
|
1042
1042
|
const bundledWebDist = resolve(currentDir, "apps/web/dist");
|
|
1043
1043
|
if (existsSync(bundledWebDist)) process.env.AGENT_EVALS_WEB_DIST = bundledWebDist;
|
|
1044
|
-
const appModule = await import("./app-C-yzV9J8.mjs");
|
|
1045
|
-
const runnerModule = await import("./runner-DesCrD7Z.mjs");
|
|
1044
|
+
const appModule = await import("./app-ByMLOds2.mjs");
|
|
1045
|
+
const runnerModule = await import("./runner-39KGoaM1.mjs");
|
|
1046
1046
|
if (!isHonoAppModule(appModule)) throw new Error("Server app module is invalid");
|
|
1047
1047
|
if (!isServerRunnerModule(runnerModule)) throw new Error("Server runner module is invalid");
|
|
1048
1048
|
await runnerModule.initRunner();
|
package/dist/index.d.mts
CHANGED
|
@@ -3119,6 +3119,37 @@ declare function getEvalRegistry(): Map<string, EvalRegistryEntry>;
|
|
|
3119
3119
|
*/
|
|
3120
3120
|
declare function defineEval<TInput = unknown, TOutputs extends EvalOutputs = EvalOutputs>(definition: EvalDefinition<TInput, TOutputs>): void;
|
|
3121
3121
|
//#endregion
|
|
3122
|
+
//#region ../sdk/src/evalExpect.d.ts
|
|
3123
|
+
/**
|
|
3124
|
+
* Focused expectation helpers for eval case invariants.
|
|
3125
|
+
*
|
|
3126
|
+
* These matchers intentionally cover comparisons that produce clearer failure
|
|
3127
|
+
* messages than a plain `evalAssert(...)`. Use `evalAssert(...)` directly for
|
|
3128
|
+
* truthiness checks and custom type narrowing.
|
|
3129
|
+
*/
|
|
3130
|
+
type EvalExpectation<T> = {
|
|
3131
|
+
/** Invert the next matcher. */readonly not: EvalExpectation<T>; /** Assert strict `Object.is(...)` equality. */
|
|
3132
|
+
toBe(expected: unknown): void; /** Assert Node.js deep strict equality. */
|
|
3133
|
+
toEqual(expected: unknown): void; /** Assert that object properties recursively match the expected subset. */
|
|
3134
|
+
toMatchObject(expected: Record<string, unknown>): void; /** Assert substring, array item, or set item containment. */
|
|
3135
|
+
toContain(expected: unknown): void; /** Assert the value has a numeric `length` equal to `expected`. */
|
|
3136
|
+
toHaveLength(expected: number): void; /** Assert a dot-path property exists, optionally with a deep-equal value. */
|
|
3137
|
+
toHaveProperty(path: string, ...expected: [] | [unknown]): void; /** Assert the received number is greater than `expected`. */
|
|
3138
|
+
toBeGreaterThan(expected: number): void; /** Assert the received number is greater than or equal to `expected`. */
|
|
3139
|
+
toBeGreaterThanOrEqual(expected: number): void; /** Assert the received number is less than `expected`. */
|
|
3140
|
+
toBeLessThan(expected: number): void; /** Assert the received number is less than or equal to `expected`. */
|
|
3141
|
+
toBeLessThanOrEqual(expected: number): void; /** Assert the received number is close to `expected` at `precision` decimals. */
|
|
3142
|
+
toBeCloseTo(expected: number, precision?: number): void; /** Assert the received string matches the regular expression. */
|
|
3143
|
+
toMatch(expected: RegExp): void;
|
|
3144
|
+
};
|
|
3145
|
+
/**
|
|
3146
|
+
* Create focused expectation helpers for the current eval case.
|
|
3147
|
+
*
|
|
3148
|
+
* Failed expectations record assertion failures and throw only while an eval
|
|
3149
|
+
* case scope is active, matching `evalAssert(...)`.
|
|
3150
|
+
*/
|
|
3151
|
+
declare function evalExpect<T>(value: T): EvalExpectation<T>;
|
|
3152
|
+
//#endregion
|
|
3122
3153
|
//#region ../sdk/src/repoFile.d.ts
|
|
3123
3154
|
/**
|
|
3124
3155
|
* Create a file reference that can be emitted via `setEvalOutput(...)` and rendered
|
|
@@ -3356,12 +3387,14 @@ declare function mergeEvalOutput(key: string, patch: Record<string, unknown>): v
|
|
|
3356
3387
|
*/
|
|
3357
3388
|
declare function incrementEvalOutput(key: string, delta: number): void;
|
|
3358
3389
|
/**
|
|
3359
|
-
* Assert a condition for the current eval case and throw on failure.
|
|
3390
|
+
* Assert a truthy condition for the current eval case and throw on failure.
|
|
3360
3391
|
*
|
|
3361
3392
|
* Calls made outside `runInEvalScope(...)` are ignored so shared workflow code
|
|
3362
|
-
* can safely reuse `evalAssert(...)` when it also runs outside an eval.
|
|
3393
|
+
* can safely reuse `evalAssert(...)` when it also runs outside an eval. The
|
|
3394
|
+
* TypeScript assertion signature still narrows the checked value after the
|
|
3395
|
+
* call.
|
|
3363
3396
|
*/
|
|
3364
|
-
declare function evalAssert(condition:
|
|
3397
|
+
declare function evalAssert(condition: unknown, message: string): asserts condition;
|
|
3365
3398
|
//#endregion
|
|
3366
3399
|
//#region ../sdk/src/cacheSerialization.d.ts
|
|
3367
3400
|
declare const serializedCacheValueMarker = "__agentEvalsCacheSerialization";
|
|
@@ -3721,4 +3754,4 @@ declare function createRunner({
|
|
|
3721
3754
|
*/
|
|
3722
3755
|
declare function runCli(argv: string[]): Promise<void>;
|
|
3723
3756
|
//#endregion
|
|
3724
|
-
export { type AgentEvalsConfig, type ApiCallEntry, type ApiCallMetric, type ApiCallMetricFormat, type ApiCallMetricPlacement, type ApiCallMetricValue, type ApiCallsConfigInput, type AssertionFailure, type CacheActivityEntry, type CacheAdapter, type CacheDebugKeyEntry, type CacheDebugKeyFile, type CacheDebugKeyWrite, type CacheEntry, type CacheEntryWithDebugKey, type CacheFile, type CacheHitEntry, type CacheKeyHashInput, type CacheKeyHashOptions, type CacheListItem, type CacheMode, type CacheOperationType, type CacheRecording, type CacheRecordingFrame, type CacheRecordingOp, type CacheScopeContext, type CacheStatus, type CallDerivedAttribute, type CallDerivedAttributeContext, type CaptureEvalSpanErrorLevel, type CaptureEvalSpanErrorOptions, type CaseDetail, type CaseRow, type CellValue, type ColumnDef, type ColumnFormat, type ColumnKind, type CreateRunRequest, DEFAULT_API_CALLS_CONFIG, DEFAULT_LLM_CALLS_CONFIG, type DefaultConfigKey, type DerivedStatus, type DiscoveryIssue, EvalAssertionError, type EvalCase, type EvalCaseScope, type EvalChartAggregate, type EvalChartAxis, type EvalChartBuiltinMetric, type EvalChartColor, type EvalChartConfig, type EvalChartMetric, type EvalChartTooltipExtra, type EvalChartType, type EvalChartsConfig, type EvalColumnOverride, type EvalColumns, type EvalDefinition, type EvalDeriveContext, type EvalDisplayStatus, type EvalExecuteContext, type EvalFreshnessStatus, type EvalManualScoreDef, type EvalOutputs, type EvalOutputsSchema, type EvalRunner, type EvalRuntimeScope, type EvalScoreContext, type EvalScoreDef, type EvalScoreFn, type EvalSetOutput, type EvalStartTime, type EvalStatAggregate, type EvalStatItem, type EvalStatsConfig, type EvalSummary, type EvalTimeUnit, type EvalTraceSpan, type EvalTraceSpanError, type EvalTraceSpanWarning, type EvalTraceTree, type FileRef, type JsonCell, type LlmCallEntry, type LlmCallMetric, type LlmCallMetricFormat, type LlmCallMetricPlacement, type LlmCallMetricValue, type LlmCallPricing, type 
LlmCallsConfigInput, type NumberDisplayOptions, type RemoveDefaultConfig, type RepoFileRef, type ResolvedApiCallMetric, type ResolvedApiCallsConfig, type ResolvedCallDerivedAttribute, type ResolvedLlmCallMetric, type ResolvedLlmCallPricing, type ResolvedLlmCallsConfig, type RunArtifactRef, type RunInEvalScopeOptions, type RunLogEntry, type RunLogLevel, type RunLogLocation, type RunLogPhase, type RunLogsConfigInput, type RunManifest, type RunSummary, type ScalarCell, type ScopedCaseSummary, type ScoreTrace, type SerializedCacheSpan, type SerializedCacheValue, type SpanCacheOptions, type SseEnvelope, type SseEventType, type TraceActiveSpan, type TraceAttributeDisplay, type TraceAttributeDisplayFormat, type TraceAttributeDisplayInput, type TraceAttributeDisplayPlacement, type TraceAttributeTransform, type TraceAttributeTransformContext, type TraceCacheInfo, type TraceCacheRef, type TraceDisplayConfig, type TraceDisplayInputConfig, type TraceSpanInfo, type TrialSelectionMode, type UpdateManualScoreRequest, advanceEvalTime, agentEvalsConfigSchema, apiCallMetricFormatSchema, apiCallMetricPlacementSchema, apiCallMetricSchema, apiCallsConfigSchema, appendToEvalOutput, applyDerivedCallAttributes, assertionFailureSchema, buildCaseKey, buildEvalKey, buildTraceTree, cacheDebugKeyEntrySchema, cacheDebugKeyFileSchema, cacheEntrySchema, cacheEntryWithDebugKeySchema, cacheFileSchema, cacheListItemSchema, cacheModeSchema, cacheOperationTypeSchema, cacheRecordingOpSchema, cacheRecordingSchema, cacheStatusSchema, captureEvalSpanError, caseDetailSchema, caseRowSchema, cellValueSchema, columnDefSchema, columnFormatSchema, columnKindSchema, createRunRequestSchema, createRunner, defaultConfigKeySchema, defineEval, deriveScopedSummaryFromCases, deriveStatusFromCaseRows, deriveStatusFromChildStatuses, deserializeCacheRecording, deserializeCacheValue, discoveryIssueSchema, evalAssert, evalChartAggregateSchema, evalChartAxisSchema, evalChartBuiltinMetricSchema, evalChartColorSchema, 
evalChartConfigSchema, evalChartMetricSchema, evalChartTooltipExtraSchema, evalChartTypeSchema, evalChartsConfigSchema, evalFreshnessStatusSchema, evalLog, evalSpan, evalStatAggregateSchema, evalStatItemSchema, evalStatsConfigSchema, evalSummarySchema, evalTracer, extractApiCalls, extractCacheEntries, extractCacheHits, extractLlmCalls, fileRefSchema, getCaseRowCaseKey, getCaseRowEvalKey, getCurrentScope, getEvalCaseInput, getEvalDisplayStatus, getEvalRegistry, getEvalStartTime, getEvalTitle, getNestedAttribute, hashCacheKey, hashCacheKeySync, incrementEvalOutput, isInEvalScope, jsonCellSchema, llmCallMetricFormatSchema, llmCallMetricPlacementSchema, llmCallMetricSchema, llmCallPricingSchema, llmCallsConfigSchema, mergeEvalOutput, nextEvalId, numberDisplayOptionsSchema, removeDefaultConfigSchema, repoFile, repoFileRefSchema, resolveApiCallsConfig, resolveLlmCallsConfig, runArtifactRefSchema, runCli, runInEvalRuntimeScope, runInEvalScope, runInExistingEvalScope, runLogEntrySchema, runLogLevelSchema, runLogLocationSchema, runLogPhaseSchema, runLogsConfigSchema, runManifestSchema, runSummarySchema, scoreTraceSchema, serializeCacheRecording, serializeCacheValue, serializedCacheSpanSchema, setEvalOutput, setScopeCacheContext, spanCacheOptionsSchema, sseEnvelopeSchema, startEvalBackgroundJob, traceAttributeDisplayFormatSchema, traceAttributeDisplayInputSchema, traceAttributeDisplayPlacementSchema, traceAttributeDisplaySchema, traceCacheRefSchema, traceDisplayConfigSchema, traceDisplayInputConfigSchema, traceSpanErrorSchema, traceSpanKindSchema, traceSpanSchema, traceSpanWarningSchema, trialSelectionModeSchema, updateManualScoreRequestSchema, z };
|
|
3757
|
+
export { type AgentEvalsConfig, type ApiCallEntry, type ApiCallMetric, type ApiCallMetricFormat, type ApiCallMetricPlacement, type ApiCallMetricValue, type ApiCallsConfigInput, type AssertionFailure, type CacheActivityEntry, type CacheAdapter, type CacheDebugKeyEntry, type CacheDebugKeyFile, type CacheDebugKeyWrite, type CacheEntry, type CacheEntryWithDebugKey, type CacheFile, type CacheHitEntry, type CacheKeyHashInput, type CacheKeyHashOptions, type CacheListItem, type CacheMode, type CacheOperationType, type CacheRecording, type CacheRecordingFrame, type CacheRecordingOp, type CacheScopeContext, type CacheStatus, type CallDerivedAttribute, type CallDerivedAttributeContext, type CaptureEvalSpanErrorLevel, type CaptureEvalSpanErrorOptions, type CaseDetail, type CaseRow, type CellValue, type ColumnDef, type ColumnFormat, type ColumnKind, type CreateRunRequest, DEFAULT_API_CALLS_CONFIG, DEFAULT_LLM_CALLS_CONFIG, type DefaultConfigKey, type DerivedStatus, type DiscoveryIssue, EvalAssertionError, type EvalCase, type EvalCaseScope, type EvalChartAggregate, type EvalChartAxis, type EvalChartBuiltinMetric, type EvalChartColor, type EvalChartConfig, type EvalChartMetric, type EvalChartTooltipExtra, type EvalChartType, type EvalChartsConfig, type EvalColumnOverride, type EvalColumns, type EvalDefinition, type EvalDeriveContext, type EvalDisplayStatus, type EvalExecuteContext, type EvalExpectation, type EvalFreshnessStatus, type EvalManualScoreDef, type EvalOutputs, type EvalOutputsSchema, type EvalRunner, type EvalRuntimeScope, type EvalScoreContext, type EvalScoreDef, type EvalScoreFn, type EvalSetOutput, type EvalStartTime, type EvalStatAggregate, type EvalStatItem, type EvalStatsConfig, type EvalSummary, type EvalTimeUnit, type EvalTraceSpan, type EvalTraceSpanError, type EvalTraceSpanWarning, type EvalTraceTree, type FileRef, type JsonCell, type LlmCallEntry, type LlmCallMetric, type LlmCallMetricFormat, type LlmCallMetricPlacement, type LlmCallMetricValue, type 
LlmCallPricing, type LlmCallsConfigInput, type NumberDisplayOptions, type RemoveDefaultConfig, type RepoFileRef, type ResolvedApiCallMetric, type ResolvedApiCallsConfig, type ResolvedCallDerivedAttribute, type ResolvedLlmCallMetric, type ResolvedLlmCallPricing, type ResolvedLlmCallsConfig, type RunArtifactRef, type RunInEvalScopeOptions, type RunLogEntry, type RunLogLevel, type RunLogLocation, type RunLogPhase, type RunLogsConfigInput, type RunManifest, type RunSummary, type ScalarCell, type ScopedCaseSummary, type ScoreTrace, type SerializedCacheSpan, type SerializedCacheValue, type SpanCacheOptions, type SseEnvelope, type SseEventType, type TraceActiveSpan, type TraceAttributeDisplay, type TraceAttributeDisplayFormat, type TraceAttributeDisplayInput, type TraceAttributeDisplayPlacement, type TraceAttributeTransform, type TraceAttributeTransformContext, type TraceCacheInfo, type TraceCacheRef, type TraceDisplayConfig, type TraceDisplayInputConfig, type TraceSpanInfo, type TrialSelectionMode, type UpdateManualScoreRequest, advanceEvalTime, agentEvalsConfigSchema, apiCallMetricFormatSchema, apiCallMetricPlacementSchema, apiCallMetricSchema, apiCallsConfigSchema, appendToEvalOutput, applyDerivedCallAttributes, assertionFailureSchema, buildCaseKey, buildEvalKey, buildTraceTree, cacheDebugKeyEntrySchema, cacheDebugKeyFileSchema, cacheEntrySchema, cacheEntryWithDebugKeySchema, cacheFileSchema, cacheListItemSchema, cacheModeSchema, cacheOperationTypeSchema, cacheRecordingOpSchema, cacheRecordingSchema, cacheStatusSchema, captureEvalSpanError, caseDetailSchema, caseRowSchema, cellValueSchema, columnDefSchema, columnFormatSchema, columnKindSchema, createRunRequestSchema, createRunner, defaultConfigKeySchema, defineEval, deriveScopedSummaryFromCases, deriveStatusFromCaseRows, deriveStatusFromChildStatuses, deserializeCacheRecording, deserializeCacheValue, discoveryIssueSchema, evalAssert, evalChartAggregateSchema, evalChartAxisSchema, evalChartBuiltinMetricSchema, 
evalChartColorSchema, evalChartConfigSchema, evalChartMetricSchema, evalChartTooltipExtraSchema, evalChartTypeSchema, evalChartsConfigSchema, evalExpect, evalFreshnessStatusSchema, evalLog, evalSpan, evalStatAggregateSchema, evalStatItemSchema, evalStatsConfigSchema, evalSummarySchema, evalTracer, extractApiCalls, extractCacheEntries, extractCacheHits, extractLlmCalls, fileRefSchema, getCaseRowCaseKey, getCaseRowEvalKey, getCurrentScope, getEvalCaseInput, getEvalDisplayStatus, getEvalRegistry, getEvalStartTime, getEvalTitle, getNestedAttribute, hashCacheKey, hashCacheKeySync, incrementEvalOutput, isInEvalScope, jsonCellSchema, llmCallMetricFormatSchema, llmCallMetricPlacementSchema, llmCallMetricSchema, llmCallPricingSchema, llmCallsConfigSchema, mergeEvalOutput, nextEvalId, numberDisplayOptionsSchema, removeDefaultConfigSchema, repoFile, repoFileRefSchema, resolveApiCallsConfig, resolveLlmCallsConfig, runArtifactRefSchema, runCli, runInEvalRuntimeScope, runInEvalScope, runInExistingEvalScope, runLogEntrySchema, runLogLevelSchema, runLogLocationSchema, runLogPhaseSchema, runLogsConfigSchema, runManifestSchema, runSummarySchema, scoreTraceSchema, serializeCacheRecording, serializeCacheValue, serializedCacheSpanSchema, setEvalOutput, setScopeCacheContext, spanCacheOptionsSchema, sseEnvelopeSchema, startEvalBackgroundJob, traceAttributeDisplayFormatSchema, traceAttributeDisplayInputSchema, traceAttributeDisplayPlacementSchema, traceAttributeDisplaySchema, traceCacheRefSchema, traceDisplayConfigSchema, traceDisplayInputConfigSchema, traceSpanErrorSchema, traceSpanKindSchema, traceSpanSchema, traceSpanWarningSchema, trialSelectionModeSchema, updateManualScoreRequestSchema, z };
|
package/dist/index.mjs
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import { $ as llmCallsConfigSchema, $t as traceSpanKindSchema, A as extractApiCalls, An as
|
|
2
|
-
import { n as createRunner, t as runCli } from "./cli-T6FYE2Bk.mjs";
|
|
3
|
-
import "./src-CbVnqjW3.mjs";
|
|
4
|
-
export { DEFAULT_API_CALLS_CONFIG, DEFAULT_LLM_CALLS_CONFIG, EvalAssertionError, advanceEvalTime, agentEvalsConfigSchema, apiCallMetricFormatSchema, apiCallMetricPlacementSchema, apiCallMetricSchema, apiCallsConfigSchema, appendToEvalOutput, applyDerivedCallAttributes, assertionFailureSchema, buildCaseKey, buildEvalKey, buildTraceTree, cacheDebugKeyEntrySchema, cacheDebugKeyFileSchema, cacheEntrySchema, cacheEntryWithDebugKeySchema, cacheFileSchema, cacheListItemSchema, cacheModeSchema, cacheOperationTypeSchema, cacheRecordingOpSchema, cacheRecordingSchema, cacheStatusSchema, captureEvalSpanError, caseDetailSchema, caseRowSchema, cellValueSchema, columnDefSchema, columnFormatSchema, columnKindSchema, createRunRequestSchema, createRunner, defaultConfigKeySchema, defineEval, deriveScopedSummaryFromCases, deriveStatusFromCaseRows, deriveStatusFromChildStatuses, deserializeCacheRecording, deserializeCacheValue, discoveryIssueSchema, evalAssert, evalChartAggregateSchema, evalChartAxisSchema, evalChartBuiltinMetricSchema, evalChartColorSchema, evalChartConfigSchema, evalChartMetricSchema, evalChartTooltipExtraSchema, evalChartTypeSchema, evalChartsConfigSchema, evalFreshnessStatusSchema, evalLog, evalSpan, evalStatAggregateSchema, evalStatItemSchema, evalStatsConfigSchema, evalSummarySchema, evalTracer, extractApiCalls, extractCacheEntries, extractCacheHits, extractLlmCalls, fileRefSchema, getCaseRowCaseKey, getCaseRowEvalKey, getCurrentScope, getEvalCaseInput, getEvalDisplayStatus, getEvalRegistry, getEvalStartTime, getEvalTitle, getNestedAttribute, hashCacheKey, hashCacheKeySync, incrementEvalOutput, isInEvalScope, jsonCellSchema, llmCallMetricFormatSchema, llmCallMetricPlacementSchema, llmCallMetricSchema, llmCallPricingSchema, llmCallsConfigSchema, mergeEvalOutput, nextEvalId, numberDisplayOptionsSchema, removeDefaultConfigSchema, repoFile, repoFileRefSchema, resolveApiCallsConfig, resolveLlmCallsConfig, runArtifactRefSchema, runCli, runInEvalRuntimeScope, 
runInEvalScope, runInExistingEvalScope, runLogEntrySchema, runLogLevelSchema, runLogLocationSchema, runLogPhaseSchema, runLogsConfigSchema, runManifestSchema, runSummarySchema, scoreTraceSchema, serializeCacheRecording, serializeCacheValue, serializedCacheSpanSchema, setEvalOutput, setScopeCacheContext, spanCacheOptionsSchema, sseEnvelopeSchema, startEvalBackgroundJob, traceAttributeDisplayFormatSchema, traceAttributeDisplayInputSchema, traceAttributeDisplayPlacementSchema, traceAttributeDisplaySchema, traceCacheRefSchema, traceDisplayConfigSchema, traceDisplayInputConfigSchema, traceSpanErrorSchema, traceSpanKindSchema, traceSpanSchema, traceSpanWarningSchema, trialSelectionModeSchema, updateManualScoreRequestSchema, z };
|
|
1
|
+
import { $ as llmCallsConfigSchema, $t as traceSpanKindSchema, A as extractApiCalls, An as getCurrentScope, At as evalChartTypeSchema, B as runSummarySchema, Bn as setEvalOutput, Bt as cacheRecordingOpSchema, Cn as evalExpect, Ct as evalChartAggregateSchema, D as sseEnvelopeSchema, Dt as evalChartConfigSchema, E as updateManualScoreRequestSchema, En as appendToEvalOutput, Et as evalChartColorSchema, F as getEvalDisplayStatus, Fn as mergeEvalOutput, Ft as cacheEntryWithDebugKeySchema, G as apiCallMetricPlacementSchema, Gt as traceCacheRefSchema, H as DEFAULT_LLM_CALLS_CONFIG, Hn as startEvalBackgroundJob, Ht as cacheStatusSchema, I as deriveScopedSummaryFromCases, In as nextEvalId, It as cacheFileSchema, J as defaultConfigKeySchema, Jt as traceAttributeDisplayPlacementSchema, K as apiCallMetricSchema, Kt as traceAttributeDisplayFormatSchema, L as deriveStatusFromCaseRows, Ln as runInEvalRuntimeScope, Lt as cacheListItemSchema, M as applyDerivedCallAttributes, Mn as getEvalStartTime, Mt as cacheDebugKeyEntrySchema, N as getNestedAttribute, Nn as incrementEvalOutput, Nt as cacheDebugKeyFileSchema, O as extractCacheEntries, On as evalAssert, Ot as evalChartMetricSchema, P as getEvalTitle, Pn as isInEvalScope, Pt as cacheEntrySchema, Q as llmCallPricingSchema, Qt as traceSpanErrorSchema, R as deriveStatusFromChildStatuses, Rn as runInEvalScope, Rt as cacheModeSchema, Sn as repoFile, St as scoreTraceSchema, T as createRunRequestSchema, Tn as advanceEvalTime, Tt as evalChartBuiltinMetricSchema, U as agentEvalsConfigSchema, Un as defineEval, Ut as serializedCacheSpanSchema, V as DEFAULT_API_CALLS_CONFIG, Vn as setScopeCacheContext, Vt as cacheRecordingSchema, W as apiCallMetricFormatSchema, Wn as getEvalRegistry, Wt as spanCacheOptionsSchema, X as llmCallMetricPlacementSchema, Xt as traceDisplayConfigSchema, Y as llmCallMetricFormatSchema, Yt as traceAttributeDisplaySchema, Z as llmCallMetricSchema, Zt as traceDisplayInputConfigSchema, _n as hashCacheKeySync, _t as 
evalSummarySchema, an as columnKindSchema, at as buildCaseKey, bn as serializeCacheRecording, bt as runLogLocationSchema, cn as numberDisplayOptionsSchema, ct as getCaseRowEvalKey, dn as z, dt as caseRowSchema, en as traceSpanSchema, et as removeDefaultConfigSchema, fn as buildTraceTree, ft as discoveryIssueSchema, gn as hashCacheKey, gt as evalStatsConfigSchema, hn as evalTracer, ht as evalStatItemSchema, in as columnFormatSchema, it as trialSelectionModeSchema, j as extractLlmCalls, jn as getEvalCaseInput, jt as evalChartsConfigSchema, k as extractCacheHits, kn as evalLog, kt as evalChartTooltipExtraSchema, ln as repoFileRefSchema, lt as assertionFailureSchema, mn as evalSpan, mt as evalStatAggregateSchema, nn as cellValueSchema, nt as resolveLlmCallsConfig, on as fileRefSchema, ot as buildEvalKey, pn as captureEvalSpanError, pt as evalFreshnessStatusSchema, q as apiCallsConfigSchema, qt as traceAttributeDisplayInputSchema, rn as columnDefSchema, rt as runLogsConfigSchema, sn as jsonCellSchema, st as getCaseRowCaseKey, tn as traceSpanWarningSchema, tt as resolveApiCallsConfig, un as runArtifactRefSchema, ut as caseDetailSchema, vn as deserializeCacheRecording, vt as runLogEntrySchema, wn as EvalAssertionError, wt as evalChartAxisSchema, xn as serializeCacheValue, xt as runLogPhaseSchema, yn as deserializeCacheValue, yt as runLogLevelSchema, z as runManifestSchema, zn as runInExistingEvalScope, zt as cacheOperationTypeSchema } from "./runOrchestration-DB0dwGrd.mjs";
|
|
2
|
+
import { n as createRunner, t as runCli } from "./cli-DRkwWgTj.mjs";
|
|
3
|
+
import "./src-BwKm3sKU.mjs";
|
|
4
|
+
export { DEFAULT_API_CALLS_CONFIG, DEFAULT_LLM_CALLS_CONFIG, EvalAssertionError, advanceEvalTime, agentEvalsConfigSchema, apiCallMetricFormatSchema, apiCallMetricPlacementSchema, apiCallMetricSchema, apiCallsConfigSchema, appendToEvalOutput, applyDerivedCallAttributes, assertionFailureSchema, buildCaseKey, buildEvalKey, buildTraceTree, cacheDebugKeyEntrySchema, cacheDebugKeyFileSchema, cacheEntrySchema, cacheEntryWithDebugKeySchema, cacheFileSchema, cacheListItemSchema, cacheModeSchema, cacheOperationTypeSchema, cacheRecordingOpSchema, cacheRecordingSchema, cacheStatusSchema, captureEvalSpanError, caseDetailSchema, caseRowSchema, cellValueSchema, columnDefSchema, columnFormatSchema, columnKindSchema, createRunRequestSchema, createRunner, defaultConfigKeySchema, defineEval, deriveScopedSummaryFromCases, deriveStatusFromCaseRows, deriveStatusFromChildStatuses, deserializeCacheRecording, deserializeCacheValue, discoveryIssueSchema, evalAssert, evalChartAggregateSchema, evalChartAxisSchema, evalChartBuiltinMetricSchema, evalChartColorSchema, evalChartConfigSchema, evalChartMetricSchema, evalChartTooltipExtraSchema, evalChartTypeSchema, evalChartsConfigSchema, evalExpect, evalFreshnessStatusSchema, evalLog, evalSpan, evalStatAggregateSchema, evalStatItemSchema, evalStatsConfigSchema, evalSummarySchema, evalTracer, extractApiCalls, extractCacheEntries, extractCacheHits, extractLlmCalls, fileRefSchema, getCaseRowCaseKey, getCaseRowEvalKey, getCurrentScope, getEvalCaseInput, getEvalDisplayStatus, getEvalRegistry, getEvalStartTime, getEvalTitle, getNestedAttribute, hashCacheKey, hashCacheKeySync, incrementEvalOutput, isInEvalScope, jsonCellSchema, llmCallMetricFormatSchema, llmCallMetricPlacementSchema, llmCallMetricSchema, llmCallPricingSchema, llmCallsConfigSchema, mergeEvalOutput, nextEvalId, numberDisplayOptionsSchema, removeDefaultConfigSchema, repoFile, repoFileRefSchema, resolveApiCallsConfig, resolveLlmCallsConfig, runArtifactRefSchema, runCli, 
runInEvalRuntimeScope, runInEvalScope, runInExistingEvalScope, runLogEntrySchema, runLogLevelSchema, runLogLocationSchema, runLogPhaseSchema, runLogsConfigSchema, runManifestSchema, runSummarySchema, scoreTraceSchema, serializeCacheRecording, serializeCacheValue, serializedCacheSpanSchema, setEvalOutput, setScopeCacheContext, spanCacheOptionsSchema, sseEnvelopeSchema, startEvalBackgroundJob, traceAttributeDisplayFormatSchema, traceAttributeDisplayInputSchema, traceAttributeDisplayPlacementSchema, traceAttributeDisplaySchema, traceCacheRefSchema, traceDisplayConfigSchema, traceDisplayInputConfigSchema, traceSpanErrorSchema, traceSpanKindSchema, traceSpanSchema, traceSpanWarningSchema, trialSelectionModeSchema, updateManualScoreRequestSchema, z };
|
package/dist/runChild.mjs
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import { B as runSummarySchema,
|
|
1
|
+
import { B as runSummarySchema, Dn as configureEvalRunLogs, T as createRunRequestSchema, b as loadConfig, gt as evalStatsConfigSchema, jt as evalChartsConfigSchema, ot as buildEvalKey, r as getTargetEvals$1, rn as columnDefSchema, t as executeRun, v as parseEvalDiscovery, w as createFsCacheStore, z as runManifestSchema } from "./runOrchestration-DB0dwGrd.mjs";
|
|
2
2
|
import { createHash } from "node:crypto";
|
|
3
3
|
import { readFile } from "node:fs/promises";
|
|
4
4
|
import { relative } from "node:path";
|
|
@@ -2,9 +2,9 @@ import { createRequire, registerHooks } from "node:module";
|
|
|
2
2
|
import { createHash } from "node:crypto";
|
|
3
3
|
import { mkdir, readFile, readdir, rename, rm, stat, writeFile } from "node:fs/promises";
|
|
4
4
|
import { extname, isAbsolute, join, relative, resolve } from "node:path";
|
|
5
|
-
import {
|
|
5
|
+
import { formatWithOptions, isDeepStrictEqual } from "node:util";
|
|
6
6
|
import { AsyncLocalStorage } from "node:async_hooks";
|
|
7
|
-
import {
|
|
7
|
+
import { z, z as z$1 } from "zod/v4";
|
|
8
8
|
import { Buffer as Buffer$1 } from "node:buffer";
|
|
9
9
|
import { gunzipSync, gzipSync } from "node:zlib";
|
|
10
10
|
import { getCompositeKey } from "@ls-stack/utils/getCompositeKey";
|
|
@@ -29,23 +29,6 @@ function defineEval(definition) {
|
|
|
29
29
|
});
|
|
30
30
|
}
|
|
31
31
|
//#endregion
|
|
32
|
-
//#region ../sdk/src/repoFile.ts
|
|
33
|
-
/**
|
|
34
|
-
* Create a file reference that can be emitted via `setEvalOutput(...)` and rendered
|
|
35
|
-
* by a column configured with `format: 'image' | 'audio' | 'video' | 'file'`.
|
|
36
|
-
*
|
|
37
|
-
* @param path Relative or absolute path to the repository file.
|
|
38
|
-
* @param mimeType Optional MIME type hint for UI rendering.
|
|
39
|
-
* @returns A repo-backed file reference suitable for file/media columns.
|
|
40
|
-
*/
|
|
41
|
-
function repoFile(path, mimeType) {
|
|
42
|
-
return {
|
|
43
|
-
source: "repo",
|
|
44
|
-
path,
|
|
45
|
-
mimeType
|
|
46
|
-
};
|
|
47
|
-
}
|
|
48
|
-
//#endregion
|
|
49
32
|
//#region ../sdk/src/runtime.ts
|
|
50
33
|
const scopeStorage = new AsyncLocalStorage();
|
|
51
34
|
const runtimeScopeStorage = new AsyncLocalStorage();
|
|
@@ -639,10 +622,12 @@ function incrementEvalOutput(key, delta) {
|
|
|
639
622
|
});
|
|
640
623
|
}
|
|
641
624
|
/**
|
|
642
|
-
* Assert a condition for the current eval case and throw on failure.
|
|
625
|
+
* Assert a truthy condition for the current eval case and throw on failure.
|
|
643
626
|
*
|
|
644
627
|
* Calls made outside `runInEvalScope(...)` are ignored so shared workflow code
|
|
645
|
-
* can safely reuse `evalAssert(...)` when it also runs outside an eval.
|
|
628
|
+
* can safely reuse `evalAssert(...)` when it also runs outside an eval. The
|
|
629
|
+
* TypeScript assertion signature still narrows the checked value after the
|
|
630
|
+
* call.
|
|
646
631
|
*/
|
|
647
632
|
function evalAssert(condition, message) {
|
|
648
633
|
if (condition) return;
|
|
@@ -653,6 +638,156 @@ function evalAssert(condition, message) {
|
|
|
653
638
|
throw error;
|
|
654
639
|
}
|
|
655
640
|
//#endregion
|
|
641
|
+
//#region ../sdk/src/evalExpect.ts
|
|
642
|
+
// Inspect options handed to `formatWithOptions` when rendering values inside
// expectation failure messages. The limits keep messages bounded in size.
const expectFormatOptions = {
  // Levels of nesting to expand.
  depth: 5,
  // Array-like entries rendered before the rest is elided.
  maxArrayLength: 50,
  // String characters rendered before the rest is elided.
  maxStringLength: 1000,
  // Line width at which the output is wrapped.
  breakLength: 80,
  // Grouping of short inner entries onto a single line.
  compact: 3
};
|
|
649
|
+
/**
 * Render a value for use in an expectation failure message.
 *
 * The value is passed to `formatWithOptions` together with the shared
 * `expectFormatOptions`. NOTE(review): a string value is handed over in the
 * format-string position, so it comes back unquoted — confirm that is the
 * intended message style.
 *
 * @param value Any value to describe.
 * @returns The formatted text.
 */
function formatExpectValue(value) {
  const rendered = formatWithOptions(expectFormatOptions, value);
  return rendered;
}
|
|
652
|
+
/**
 * Tell whether a value is a non-null object (arrays included).
 *
 * @param value Any value.
 * @returns `true` when `typeof value` is `"object"` and the value is not `null`.
 */
function isRecordLike$4(value) {
  if (value === null) return false;
  return typeof value === "object";
}
|
|
655
|
+
/**
 * Tell whether a value exposes a numeric `length`.
 *
 * Strings always qualify. Any other value qualifies only when it is a
 * non-null object that has a `length` property whose value is a number.
 *
 * @param value Any value.
 * @returns `true` when `value.length` can be compared as a number.
 */
function isLengthLike(value) {
  if (typeof value === "string") return true;
  if (!isRecordLike$4(value)) return false;
  if (!("length" in value)) return false;
  return typeof value.length === "number";
}
|
|
659
|
+
/**
 * Check whether `received` contains every entry described by `expected`.
 *
 * Nested objects in `expected` are matched recursively as subsets, so
 * `received` may carry extra keys at any depth. Arrays and all other values
 * must be deeply strictly equal to the corresponding received value.
 *
 * Date, RegExp, Map, Set, and Error instances are compared with
 * `isDeepStrictEqual` rather than recursed into: they expose no own
 * enumerable entries describing their content, so a subset walk over them
 * would accept any received object.
 *
 * @param received Value under test; anything other than a non-null object never matches.
 * @param expected Object whose own enumerable entries must all be satisfied by `received`.
 * @returns `true` when every expected entry is present and matches.
 */
function matchesObjectSubset(received, expected) {
  if (!isRecordLike$4(received)) return false;
  for (const [key, expectedValue] of Object.entries(expected)) {
    if (!(key in received)) return false;
    const receivedValue = received[key];
    // Built-ins whose content is not visible through Object.entries().
    const isOpaqueBuiltin = expectedValue instanceof Date || expectedValue instanceof RegExp || expectedValue instanceof Map || expectedValue instanceof Set || expectedValue instanceof Error;
    const matchAsSubset = isRecordLike$4(expectedValue) && !Array.isArray(expectedValue) && !isOpaqueBuiltin;
    if (matchAsSubset) {
      if (!matchesObjectSubset(receivedValue, expectedValue)) return false;
      continue;
    }
    if (!isDeepStrictEqual(receivedValue, expectedValue)) return false;
  }
  return true;
}
|
|
672
|
+
/**
 * Check whether a container holds a value.
 *
 * - String container: `expected` must be a string and a substring of it.
 * - Array container: uses `Array.prototype.includes`.
 * - Set container: uses `Set.prototype.has`.
 * - Anything else never contains the value.
 *
 * @param received Candidate container.
 * @param expected Value to look for.
 * @returns `true` when `received` contains `expected`.
 */
function containsValue(received, expected) {
  if (typeof received === "string") {
    if (typeof expected !== "string") return false;
    return received.includes(expected);
  }
  if (Array.isArray(received)) {
    return received.includes(expected);
  }
  return received instanceof Set ? received.has(expected) : false;
}
|
|
678
|
+
/**
 * Resolve a dot-separated property path against a value.
 *
 * The path is split on `.` and each segment is looked up with the `in`
 * operator. An empty path, a non-object encountered along the way, or a
 * missing segment all yield a "not found" result.
 *
 * @param received Root value to walk.
 * @param path Dot-separated property path, e.g. `"a.b"`.
 * @returns `{ exists, value }`; `value` is `undefined` whenever `exists` is `false`.
 */
function getPropertyAtPath(received, path) {
  const missing = {
    exists: false,
    value: void 0
  };
  if (path === "") return missing;
  const segments = path.split(".");
  let cursor = received;
  for (let index = 0; index < segments.length; index++) {
    const segment = segments[index];
    if (!isRecordLike$4(cursor)) return missing;
    if (!(segment in cursor)) return missing;
    cursor = cursor[segment];
  }
  return {
    exists: true,
    value: cursor
  };
}
|
|
696
|
+
/**
 * Matcher object returned by `evalExpect(...)`.
 *
 * Each matcher computes a boolean outcome plus a positive and a negated
 * failure message, then defers to `check(...)`, which reports failures
 * through `evalAssert(false, message)`.
 */
var EvalExpectationImpl = class EvalExpectationImpl {
  received;
  negated;
  /**
   * @param received Value under test.
   * @param negated Whether matchers on this instance are inverted.
   */
  constructor(received, negated) {
    this.received = received;
    this.negated = negated;
  }
  /** A new expectation over the same value with the negation flag flipped. */
  get not() {
    const flipped = !this.negated;
    return new EvalExpectationImpl(this.received, flipped);
  }
  /** Passes when the received value is `Object.is`-identical to `expected`. */
  toBe(expected) {
    const pass = Object.is(this.received, expected);
    const actualText = formatExpectValue(this.received);
    const expectedText = formatExpectValue(expected);
    this.check(pass, `Expected ${actualText} to be ${expectedText}`, `Expected ${actualText} not to be ${expectedText}`);
  }
  /** Passes when the received value is deeply strictly equal to `expected`. */
  toEqual(expected) {
    const pass = isDeepStrictEqual(this.received, expected);
    const actualText = formatExpectValue(this.received);
    const expectedText = formatExpectValue(expected);
    this.check(pass, `Expected ${actualText} to equal ${expectedText}`, `Expected ${actualText} not to equal ${expectedText}`);
  }
  /** Passes when `matchesObjectSubset` accepts the received value for `expected`. */
  toMatchObject(expected) {
    const pass = matchesObjectSubset(this.received, expected);
    const actualText = formatExpectValue(this.received);
    const expectedText = formatExpectValue(expected);
    this.check(pass, `Expected ${actualText} to match object ${expectedText}`, `Expected ${actualText} not to match object ${expectedText}`);
  }
  /** Passes when `containsValue` finds `expected` inside the received value. */
  toContain(expected) {
    const pass = containsValue(this.received, expected);
    const actualText = formatExpectValue(this.received);
    const expectedText = formatExpectValue(expected);
    this.check(pass, `Expected ${actualText} to contain ${expectedText}`, `Expected ${actualText} not to contain ${expectedText}`);
  }
  /** Passes when the received value has a numeric `length` equal to `expected`. */
  toHaveLength(expected) {
    const pass = isLengthLike(this.received) && this.received.length === expected;
    const actualText = formatExpectValue(this.received);
    this.check(pass, `Expected ${actualText} to have length ${expected}`, `Expected ${actualText} not to have length ${expected}`);
  }
  /**
   * Passes when the dot-separated `path` resolves on the received value and,
   * if a second argument was supplied, the resolved value deeply equals it.
   */
  toHaveProperty(path, ...expected) {
    const lookup = getPropertyAtPath(this.received, path);
    // Distinguish "no value given" from an explicit `undefined` argument.
    const valueGiven = expected.length !== 0;
    const expectedValue = expected[0];
    let pass = lookup.exists;
    if (pass && valueGiven) pass = isDeepStrictEqual(lookup.value, expectedValue);
    const expectedSuffix = valueGiven ? ` with value ${formatExpectValue(expectedValue)}` : "";
    const actualText = formatExpectValue(this.received);
    this.check(pass, `Expected ${actualText} to have property "${path}"${expectedSuffix}`, `Expected ${actualText} not to have property "${path}"${expectedSuffix}`);
  }
  /** Passes when the received value is a number strictly above `expected`. */
  toBeGreaterThan(expected) {
    this.checkNumberComparison("to be greater than", expected, (actual) => actual > expected);
  }
  /** Passes when the received value is a number at or above `expected`. */
  toBeGreaterThanOrEqual(expected) {
    this.checkNumberComparison("to be greater than or equal to", expected, (actual) => actual >= expected);
  }
  /** Passes when the received value is a number strictly below `expected`. */
  toBeLessThan(expected) {
    this.checkNumberComparison("to be less than", expected, (actual) => actual < expected);
  }
  /** Passes when the received value is a number at or below `expected`. */
  toBeLessThanOrEqual(expected) {
    this.checkNumberComparison("to be less than or equal to", expected, (actual) => actual <= expected);
  }
  /**
   * Passes when the received value is a finite number whose distance from
   * `expected` is below `10 ** -precision / 2`.
   */
  toBeCloseTo(expected, precision = 2) {
    const tolerance = 10 ** -precision / 2;
    const actual = this.received;
    let pass = false;
    if (typeof actual === "number" && Number.isFinite(actual)) {
      pass = Math.abs(actual - expected) < tolerance;
    }
    const actualText = formatExpectValue(actual);
    this.check(pass, `Expected ${actualText} to be close to ${expected} at ${precision} decimals`, `Expected ${actualText} not to be close to ${expected} at ${precision} decimals`);
  }
  /** Passes when the received value is a string matched by the `expected` RegExp. */
  toMatch(expected) {
    // Test against a fresh copy so the caller's regex `lastIndex` is neither read nor advanced.
    const freshPattern = new RegExp(expected.source, expected.flags);
    const actual = this.received;
    const pass = typeof actual === "string" && freshPattern.test(actual);
    const actualText = formatExpectValue(actual);
    this.check(pass, `Expected ${actualText} to match ${expected}`, `Expected ${actualText} not to match ${expected}`);
  }
  /**
   * Report a matcher outcome, honoring the negation flag.
   *
   * @param pass Raw matcher outcome before negation.
   * @param positiveMessage Failure message for the non-negated form.
   * @param negativeMessage Failure message for the negated form.
   */
  check(pass, positiveMessage, negativeMessage) {
    const satisfied = this.negated ? !pass : pass;
    if (satisfied) return;
    const failureMessage = this.negated ? negativeMessage : positiveMessage;
    evalAssert(false, failureMessage);
  }
  /**
   * Shared implementation of the numeric comparison matchers.
   *
   * @param label Phrase inserted into the failure message.
   * @param expected Bound shown in the failure message.
   * @param predicate Comparison applied to the received value once it is known to be a number.
   */
  checkNumberComparison(label, expected, predicate) {
    const actual = this.received;
    const pass = typeof actual === "number" && predicate(actual);
    const actualText = formatExpectValue(actual);
    this.check(pass, `Expected ${actualText} ${label} ${expected}`, `Expected ${actualText} not ${label} ${expected}`);
  }
};
|
|
764
|
+
/**
 * Start an expectation chain for a value in the current eval case.
 *
 * Failed expectations record assertion failures and throw only while an eval
 * case scope is active, matching `evalAssert(...)`.
 *
 * @param value Value to run matchers against.
 * @returns A non-negated `EvalExpectationImpl`; use `.not` to invert it.
 */
function evalExpect(value) {
  const expectation = new EvalExpectationImpl(value, false);
  return expectation;
}
|
|
773
|
+
//#endregion
|
|
774
|
+
//#region ../sdk/src/repoFile.ts
|
|
775
|
+
/**
 * Build a repo-backed file reference.
 *
 * The reference can be emitted via `setEvalOutput(...)` and rendered by a
 * column configured with `format: 'image' | 'audio' | 'video' | 'file'`.
 *
 * @param path Relative or absolute path to the repository file.
 * @param mimeType Optional MIME type hint for UI rendering.
 * @returns An object tagged with `source: "repo"` carrying `path` and `mimeType`.
 */
function repoFile(path, mimeType) {
  const fileRef = {
    source: "repo",
    path,
    mimeType
  };
  return fileRef;
}
|
|
790
|
+
//#endregion
|
|
656
791
|
//#region ../sdk/src/cacheSerialization.ts
|
|
657
792
|
const serializedCacheValueMarker = "__agentEvalsCacheSerialization";
|
|
658
793
|
const jsonSafeCacheValueVersion = "json-safe-v1";
|
|
@@ -6189,4 +6324,4 @@ function toLastRunStatus(status) {
|
|
|
6189
6324
|
return status === "pending" ? null : status;
|
|
6190
6325
|
}
|
|
6191
6326
|
//#endregion
|
|
6192
|
-
export { llmCallsConfigSchema as $, traceSpanKindSchema as $t, extractApiCalls as A,
|
|
6327
|
+
export { llmCallsConfigSchema as $, traceSpanKindSchema as $t, extractApiCalls as A, getCurrentScope as An, evalChartTypeSchema as At, runSummarySchema as B, setEvalOutput as Bn, cacheRecordingOpSchema as Bt, validateCharts as C, evalExpect as Cn, evalChartAggregateSchema as Ct, sseEnvelopeSchema as D, configureEvalRunLogs as Dn, evalChartConfigSchema as Dt, updateManualScoreRequestSchema as E, appendToEvalOutput as En, evalChartColorSchema as Et, getEvalDisplayStatus as F, mergeEvalOutput as Fn, cacheEntryWithDebugKeySchema as Ft, apiCallMetricPlacementSchema as G, traceCacheRefSchema as Gt, DEFAULT_LLM_CALLS_CONFIG as H, startEvalBackgroundJob as Hn, cacheStatusSchema as Ht, deriveScopedSummaryFromCases as I, nextEvalId as In, cacheFileSchema as It, defaultConfigKeySchema as J, traceAttributeDisplayPlacementSchema as Jt, apiCallMetricSchema as K, traceAttributeDisplayFormatSchema as Kt, deriveStatusFromCaseRows as L, runInEvalRuntimeScope as Ln, cacheListItemSchema as Lt, applyDerivedCallAttributes as M, getEvalStartTime as Mn, cacheDebugKeyEntrySchema as Mt, getNestedAttribute as N, incrementEvalOutput as Nn, cacheDebugKeyFileSchema as Nt, extractCacheEntries as O, evalAssert as On, evalChartMetricSchema as Ot, getEvalTitle as P, isInEvalScope as Pn, cacheEntrySchema as Pt, llmCallPricingSchema as Q, traceSpanErrorSchema as Qt, deriveStatusFromChildStatuses as R, runInEvalScope as Rn, cacheModeSchema as Rt, normalizeScoreDef as S, repoFile as Sn, scoreTraceSchema as St, createRunRequestSchema as T, advanceEvalTime as Tn, evalChartBuiltinMetricSchema as Tt, agentEvalsConfigSchema as U, defineEval as Un, serializedCacheSpanSchema as Ut, DEFAULT_API_CALLS_CONFIG as V, setScopeCacheContext as Vn, cacheRecordingSchema as Vt, apiCallMetricFormatSchema as W, getEvalRegistry as Wn, spanCacheOptionsSchema as Wt, llmCallMetricPlacementSchema as X, traceDisplayConfigSchema as Xt, llmCallMetricFormatSchema as Y, traceAttributeDisplaySchema as Yt, llmCallMetricSchema as Z, 
traceDisplayInputConfigSchema as Zt, loadEvalModule as _, hashCacheKeySync as _n, evalSummarySchema as _t, getLastRunStatuses as a, columnKindSchema as an, buildCaseKey as at, loadConfig as b, serializeCacheRecording as bn, runLogLocationSchema as bt, loadPersistedRunSnapshots as c, numberDisplayOptionsSchema as cn, getCaseRowEvalKey as ct, persistRunState as d, z$1 as dn, caseRowSchema as dt, traceSpanSchema as en, removeDefaultConfigSchema as et, recomputeEvalStatusesInRuns as f, buildTraceTree as fn, discoveryIssueSchema as ft, deriveEvalFreshness as g, hashCacheKey as gn, evalStatsConfigSchema as gt, resolveArtifactPath as h, evalTracer as hn, evalStatItemSchema as ht, generateRunId as i, columnFormatSchema as in, trialSelectionModeSchema as it, extractLlmCalls as j, getEvalCaseInput as jn, evalChartsConfigSchema as jt, extractCacheHits as k, evalLog as kn, evalChartTooltipExtraSchema as kt, nextShortIdFromSnapshots as l, repoFileRefSchema as ln, assertionFailureSchema as lt, runTouchesEval as m, evalSpan as mn, evalStatAggregateSchema as mt, getTargetEvalKeys as n, cellValueSchema as nn, resolveLlmCallsConfig as nt, getLatestRunInfos as o, fileRefSchema as on, buildEvalKey as ot, recomputePersistedCaseStatus as p, captureEvalSpanError as pn, evalFreshnessStatusSchema as pt, apiCallsConfigSchema as q, traceAttributeDisplayInputSchema as qt, getTargetEvals as r, columnDefSchema as rn, runLogsConfigSchema as rt, loadPersistedRunSnapshot as s, jsonCellSchema as sn, getCaseRowCaseKey as st, executeRun as t, traceSpanWarningSchema as tn, resolveApiCallsConfig as tt, persistCaseDetail as u, runArtifactRefSchema as un, caseDetailSchema as ut, parseEvalDiscovery as v, deserializeCacheRecording as vn, runLogEntrySchema as vt, createFsCacheStore as w, EvalAssertionError as wn, evalChartAxisSchema as wt, buildDeclaredColumnDefs as x, serializeCacheValue as xn, runLogPhaseSchema as xt, resolveEvalDefaultConfig as y, deserializeCacheValue as yn, runLogLevelSchema as yt, 
runManifestSchema as z, runInExistingEvalScope as zn, cacheOperationTypeSchema as zt };
|
|
@@ -1,2 +1,2 @@
|
|
|
1
|
-
import { n as initRunner, t as getRunnerInstance } from "./runner-
|
|
1
|
+
import { n as initRunner, t as getRunnerInstance } from "./runner-DRINcaN_.mjs";
|
|
2
2
|
export { getRunnerInstance, initRunner };
|
|
@@ -1,5 +1,5 @@
|
|
|
1
|
-
import { n as createRunner } from "./cli-
|
|
2
|
-
import "./src-
|
|
1
|
+
import { n as createRunner } from "./cli-DRkwWgTj.mjs";
|
|
2
|
+
import "./src-BwKm3sKU.mjs";
|
|
3
3
|
//#region ../../apps/server/src/runner.ts
|
|
4
4
|
let runnerInstance = null;
|
|
5
5
|
function getRunnerInstance() {
|
package/package.json
CHANGED
|
@@ -39,8 +39,9 @@ file wires up cases and scoring; the real `evalTracer.span(...)` calls sit
|
|
|
39
39
|
inside the workflow, agent, or tool functions that both production and evals
|
|
40
40
|
invoke.
|
|
41
41
|
|
|
42
|
-
`evalTracer`, `evalSpan`, output helpers, `evalLog`,
|
|
43
|
-
ambient no-ops when called outside an eval case scope, so
|
|
42
|
+
`evalTracer`, `evalSpan`, output helpers, `evalLog`, `evalAssert`, and
|
|
43
|
+
`evalExpect` are ambient no-ops when called outside an eval case scope, so
|
|
44
|
+
leaving them in
|
|
44
45
|
production paths is safe — they only record anything when the product code runs
|
|
45
46
|
inside an eval's `execute`. Use `isInEvalScope()` to branch on eval-only behavior in shared code
|
|
46
47
|
(e.g. skip a real network side effect): it returns `null` outside eval-owned
|
|
@@ -67,6 +68,7 @@ import {
|
|
|
67
68
|
appendToEvalOutput,
|
|
68
69
|
captureEvalSpanError,
|
|
69
70
|
evalAssert,
|
|
71
|
+
evalExpect,
|
|
70
72
|
evalSpan,
|
|
71
73
|
evalTracer,
|
|
72
74
|
getEvalCaseInput,
|
|
@@ -122,6 +124,7 @@ export async function runRefundWorkflow(input: RefundInput) {
|
|
|
122
124
|
setEvalOutput('reviewId', reviewId);
|
|
123
125
|
mergeEvalOutput('metadata', { approved: result.approved });
|
|
124
126
|
evalAssert(result.approved, 'refund workflow should approve the case');
|
|
127
|
+
evalExpect(result.finalText).toMatch(/refund/i);
|
|
125
128
|
evalSpan.setAttribute('output', { result, reviewId });
|
|
126
129
|
return result;
|
|
127
130
|
},
|
|
@@ -462,8 +465,9 @@ When adding or changing evals:
|
|
|
462
465
|
1. Put the tracing + ambient SDK calls in the product code that runs in both
|
|
463
466
|
production and evals. Keep eval files thin.
|
|
464
467
|
2. Use realistic cases drawn from real product flows; avoid placeholder inputs.
|
|
465
|
-
3. `evalAssert` for hard invariants
|
|
466
|
-
|
|
468
|
+
3. `evalAssert` for hard invariants and truthy type narrowing, `evalExpect`
|
|
469
|
+
for non-trivial comparisons, `scores` for graded signals, `passThreshold`
|
|
470
|
+
only on scores that should gate pass/fail.
|
|
467
471
|
4. Surface reviewable values through execute-context `setOutput` or ambient
|
|
468
472
|
`setEvalOutput` in shared workflow code, and shape them with `columns`
|
|
469
473
|
formats from the `ColumnFormat` type.
|
package/dist/src-CbVnqjW3.mjs
DELETED