@ls-stack/agent-eval 0.26.2 → 0.26.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{app-BxR4SbjA.mjs → app-C-yzV9J8.mjs} +3 -3
- package/dist/apps/web/dist/assets/{index-CFF1eYUm.js → index-DR2haqvV.js} +1 -1
- package/dist/apps/web/dist/index.html +1 -1
- package/dist/bin.mjs +1 -1
- package/dist/{cli-C4yumCXE.mjs → cli-T6FYE2Bk.mjs} +3 -3
- package/dist/index.d.mts +3 -4
- package/dist/index.mjs +3 -3
- package/dist/runChild.mjs +1 -1
- package/dist/{runOrchestration-B31SV_Bq.mjs → runOrchestration-CW_Fzr2c.mjs} +10 -6
- package/dist/{runner-LHN7L-xk.mjs → runner-BsRloAm3.mjs} +2 -2
- package/dist/{runner-D5QMY3B_.mjs → runner-DesCrD7Z.mjs} +1 -1
- package/dist/src-CbVnqjW3.mjs +3 -0
- package/package.json +1 -1
- package/skills/agent-eval/SKILL.md +20 -13
- package/dist/src-CLMrRle2.mjs +0 -3
|
@@ -25,7 +25,7 @@
|
|
|
25
25
|
href="https://fonts.googleapis.com/css2?family=Geist:wght@400;500;600&family=JetBrains+Mono:wght@400;500&display=swap"
|
|
26
26
|
rel="stylesheet"
|
|
27
27
|
/>
|
|
28
|
-
<script type="module" crossorigin src="/assets/index-CFF1eYUm.js"></script>
|
|
28
|
+
<script type="module" crossorigin src="/assets/index-DR2haqvV.js"></script>
|
|
29
29
|
<link rel="stylesheet" crossorigin href="/assets/index-DOXT0Y9V.css">
|
|
30
30
|
</head>
|
|
31
31
|
<body>
|
package/dist/bin.mjs
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import { B as runSummarySchema, C as validateCharts, F as getEvalDisplayStatus, I as deriveScopedSummaryFromCases, P as getEvalTitle, S as normalizeScoreDef, Un as getEvalRegistry, _ as loadEvalModule, a as getLastRunStatuses, b as loadConfig, c as loadPersistedRunSnapshots, ct as getCaseRowEvalKey, d as persistRunState, f as recomputeEvalStatusesInRuns, g as deriveEvalFreshness, h as resolveArtifactPath, i as generateRunId, l as nextShortIdFromSnapshots, m as runTouchesEval, n as getTargetEvalKeys, nt as resolveLlmCallsConfig, o as getLatestRunInfos, ot as buildEvalKey, p as recomputePersistedCaseStatus, s as loadPersistedRunSnapshot, st as getCaseRowCaseKey, tt as resolveApiCallsConfig, u as persistCaseDetail, v as parseEvalDiscovery, w as createFsCacheStore, x as buildDeclaredColumnDefs, y as resolveEvalDefaultConfig } from "./runOrchestration-
|
|
1
|
+
import { B as runSummarySchema, C as validateCharts, F as getEvalDisplayStatus, I as deriveScopedSummaryFromCases, P as getEvalTitle, S as normalizeScoreDef, Un as getEvalRegistry, _ as loadEvalModule, a as getLastRunStatuses, b as loadConfig, c as loadPersistedRunSnapshots, ct as getCaseRowEvalKey, d as persistRunState, f as recomputeEvalStatusesInRuns, g as deriveEvalFreshness, h as resolveArtifactPath, i as generateRunId, l as nextShortIdFromSnapshots, m as runTouchesEval, n as getTargetEvalKeys, nt as resolveLlmCallsConfig, o as getLatestRunInfos, ot as buildEvalKey, p as recomputePersistedCaseStatus, s as loadPersistedRunSnapshot, st as getCaseRowCaseKey, tt as resolveApiCallsConfig, u as persistCaseDetail, v as parseEvalDiscovery, w as createFsCacheStore, x as buildDeclaredColumnDefs, y as resolveEvalDefaultConfig } from "./runOrchestration-CW_Fzr2c.mjs";
|
|
2
2
|
import { createHash } from "node:crypto";
|
|
3
3
|
import { mkdir, readFile, rm, writeFile } from "node:fs/promises";
|
|
4
4
|
import { dirname, join, relative, resolve } from "node:path";
|
|
@@ -1041,8 +1041,8 @@ async function commandApp(args) {
|
|
|
1041
1041
|
const { serve } = await import("@hono/node-server");
|
|
1042
1042
|
const bundledWebDist = resolve(currentDir, "apps/web/dist");
|
|
1043
1043
|
if (existsSync(bundledWebDist)) process.env.AGENT_EVALS_WEB_DIST = bundledWebDist;
|
|
1044
|
-
const appModule = await import("./app-BxR4SbjA.mjs");
|
|
1045
|
-
const runnerModule = await import("./runner-D5QMY3B_.mjs");
|
|
1044
|
+
const appModule = await import("./app-C-yzV9J8.mjs");
|
|
1045
|
+
const runnerModule = await import("./runner-DesCrD7Z.mjs");
|
|
1046
1046
|
if (!isHonoAppModule(appModule)) throw new Error("Server app module is invalid");
|
|
1047
1047
|
if (!isServerRunnerModule(runnerModule)) throw new Error("Server runner module is invalid");
|
|
1048
1048
|
await runnerModule.initRunner();
|
package/dist/index.d.mts
CHANGED
|
@@ -2288,7 +2288,7 @@ type CacheMode = z$1.infer<typeof cacheModeSchema>;
|
|
|
2288
2288
|
/** Options accepted by an `evalTracer.span` call to opt the span into caching. */
|
|
2289
2289
|
declare const spanCacheOptionsSchema: z$1.ZodObject<{
|
|
2290
2290
|
key: z$1.ZodUnknown;
|
|
2291
|
-
namespace: z$1.
|
|
2291
|
+
namespace: z$1.ZodString;
|
|
2292
2292
|
serializeFileBytes: z$1.ZodOptional<z$1.ZodBoolean>;
|
|
2293
2293
|
}, z$1.core.$strip>;
|
|
2294
2294
|
/** Options accepted by an `evalTracer.span` call to opt the span into caching. */
|
|
@@ -3168,7 +3168,7 @@ type CacheAdapter = {
|
|
|
3168
3168
|
type CacheScopeContext = {
|
|
3169
3169
|
adapter: CacheAdapter;
|
|
3170
3170
|
mode: CacheMode;
|
|
3171
|
-
evalId: string; /** Hash of the eval source file
|
|
3171
|
+
evalId: string; /** Hash of the eval source file, stored as cache metadata for inspection. */
|
|
3172
3172
|
codeFingerprint: string;
|
|
3173
3173
|
};
|
|
3174
3174
|
/** Active recording frame captured while a cached operation body executes. */
|
|
@@ -3419,8 +3419,7 @@ type TraceCacheInfo = {
|
|
|
3419
3419
|
//#region ../sdk/src/cacheKey.d.ts
|
|
3420
3420
|
/** Components folded into a deterministic cache key hash. */
|
|
3421
3421
|
type CacheKeyHashInput = {
|
|
3422
|
-
/** Cache namespace, usually derived from the eval id and operation name. */namespace: string; /**
|
|
3423
|
-
codeFingerprint: string; /** User-authored cache key value. */
|
|
3422
|
+
/** Cache namespace, usually derived from the eval id and operation name. */namespace: string; /** User-authored cache key value. */
|
|
3424
3423
|
key: unknown;
|
|
3425
3424
|
};
|
|
3426
3425
|
/** Optional controls for cache key hashing. */
|
package/dist/index.mjs
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import { $ as llmCallsConfigSchema, $t as traceSpanKindSchema, A as extractApiCalls, An as getEvalStartTime, At as evalChartTypeSchema, B as runSummarySchema, Bn as startEvalBackgroundJob, Bt as cacheRecordingOpSchema, Cn as advanceEvalTime, Ct as evalChartAggregateSchema, D as sseEnvelopeSchema, Dn as evalLog, Dt as evalChartConfigSchema, E as updateManualScoreRequestSchema, En as evalAssert, Et as evalChartColorSchema, F as getEvalDisplayStatus, Fn as runInEvalRuntimeScope, Ft as cacheEntryWithDebugKeySchema, G as apiCallMetricPlacementSchema, Gt as traceCacheRefSchema, H as DEFAULT_LLM_CALLS_CONFIG, Hn as defineEval, Ht as cacheStatusSchema, I as deriveScopedSummaryFromCases, In as runInEvalScope, It as cacheFileSchema, J as defaultConfigKeySchema, Jt as traceAttributeDisplayPlacementSchema, K as apiCallMetricSchema, Kt as traceAttributeDisplayFormatSchema, L as deriveStatusFromCaseRows, Ln as runInExistingEvalScope, Lt as cacheListItemSchema, M as applyDerivedCallAttributes, Mn as isInEvalScope, Mt as cacheDebugKeyEntrySchema, N as getNestedAttribute, Nn as mergeEvalOutput, Nt as cacheDebugKeyFileSchema, O as extractCacheEntries, On as getCurrentScope, Ot as evalChartMetricSchema, P as getEvalTitle, Pn as nextEvalId, Pt as cacheEntrySchema, Q as llmCallPricingSchema, Qt as traceSpanErrorSchema, R as deriveStatusFromChildStatuses, Rn as setEvalOutput, Rt as cacheModeSchema, Sn as EvalAssertionError, St as scoreTraceSchema, T as createRunRequestSchema, Tt as evalChartBuiltinMetricSchema, U as agentEvalsConfigSchema, Un as getEvalRegistry, Ut as serializedCacheSpanSchema, V as DEFAULT_API_CALLS_CONFIG, Vn as repoFile, Vt as cacheRecordingSchema, W as apiCallMetricFormatSchema, Wt as spanCacheOptionsSchema, X as llmCallMetricPlacementSchema, Xt as traceDisplayConfigSchema, Y as llmCallMetricFormatSchema, Yt as traceAttributeDisplaySchema, Z as llmCallMetricSchema, Zt as traceDisplayInputConfigSchema, _n as hashCacheKeySync, _t as evalSummarySchema, an as 
columnKindSchema, at as buildCaseKey, bn as serializeCacheRecording, bt as runLogLocationSchema, cn as numberDisplayOptionsSchema, ct as getCaseRowEvalKey, dn as z, dt as caseRowSchema, en as traceSpanSchema, et as removeDefaultConfigSchema, fn as buildTraceTree, ft as discoveryIssueSchema, gn as hashCacheKey, gt as evalStatsConfigSchema, hn as evalTracer, ht as evalStatItemSchema, in as columnFormatSchema, it as trialSelectionModeSchema, j as extractLlmCalls, jn as incrementEvalOutput, jt as evalChartsConfigSchema, k as extractCacheHits, kn as getEvalCaseInput, kt as evalChartTooltipExtraSchema, ln as repoFileRefSchema, lt as assertionFailureSchema, mn as evalSpan, mt as evalStatAggregateSchema, nn as cellValueSchema, nt as resolveLlmCallsConfig, on as fileRefSchema, ot as buildEvalKey, pn as captureEvalSpanError, pt as evalFreshnessStatusSchema, q as apiCallsConfigSchema, qt as traceAttributeDisplayInputSchema, rn as columnDefSchema, rt as runLogsConfigSchema, sn as jsonCellSchema, st as getCaseRowCaseKey, tn as traceSpanWarningSchema, tt as resolveApiCallsConfig, un as runArtifactRefSchema, ut as caseDetailSchema, vn as deserializeCacheRecording, vt as runLogEntrySchema, wn as appendToEvalOutput, wt as evalChartAxisSchema, xn as serializeCacheValue, xt as runLogPhaseSchema, yn as deserializeCacheValue, yt as runLogLevelSchema, z as runManifestSchema, zn as setScopeCacheContext, zt as cacheOperationTypeSchema } from "./runOrchestration-
|
|
2
|
-
import { n as createRunner, t as runCli } from "./cli-C4yumCXE.mjs";
|
|
3
|
-
import "./src-CLMrRle2.mjs";
|
|
1
|
+
import { $ as llmCallsConfigSchema, $t as traceSpanKindSchema, A as extractApiCalls, An as getEvalStartTime, At as evalChartTypeSchema, B as runSummarySchema, Bn as startEvalBackgroundJob, Bt as cacheRecordingOpSchema, Cn as advanceEvalTime, Ct as evalChartAggregateSchema, D as sseEnvelopeSchema, Dn as evalLog, Dt as evalChartConfigSchema, E as updateManualScoreRequestSchema, En as evalAssert, Et as evalChartColorSchema, F as getEvalDisplayStatus, Fn as runInEvalRuntimeScope, Ft as cacheEntryWithDebugKeySchema, G as apiCallMetricPlacementSchema, Gt as traceCacheRefSchema, H as DEFAULT_LLM_CALLS_CONFIG, Hn as defineEval, Ht as cacheStatusSchema, I as deriveScopedSummaryFromCases, In as runInEvalScope, It as cacheFileSchema, J as defaultConfigKeySchema, Jt as traceAttributeDisplayPlacementSchema, K as apiCallMetricSchema, Kt as traceAttributeDisplayFormatSchema, L as deriveStatusFromCaseRows, Ln as runInExistingEvalScope, Lt as cacheListItemSchema, M as applyDerivedCallAttributes, Mn as isInEvalScope, Mt as cacheDebugKeyEntrySchema, N as getNestedAttribute, Nn as mergeEvalOutput, Nt as cacheDebugKeyFileSchema, O as extractCacheEntries, On as getCurrentScope, Ot as evalChartMetricSchema, P as getEvalTitle, Pn as nextEvalId, Pt as cacheEntrySchema, Q as llmCallPricingSchema, Qt as traceSpanErrorSchema, R as deriveStatusFromChildStatuses, Rn as setEvalOutput, Rt as cacheModeSchema, Sn as EvalAssertionError, St as scoreTraceSchema, T as createRunRequestSchema, Tt as evalChartBuiltinMetricSchema, U as agentEvalsConfigSchema, Un as getEvalRegistry, Ut as serializedCacheSpanSchema, V as DEFAULT_API_CALLS_CONFIG, Vn as repoFile, Vt as cacheRecordingSchema, W as apiCallMetricFormatSchema, Wt as spanCacheOptionsSchema, X as llmCallMetricPlacementSchema, Xt as traceDisplayConfigSchema, Y as llmCallMetricFormatSchema, Yt as traceAttributeDisplaySchema, Z as llmCallMetricSchema, Zt as traceDisplayInputConfigSchema, _n as hashCacheKeySync, _t as evalSummarySchema, an as 
columnKindSchema, at as buildCaseKey, bn as serializeCacheRecording, bt as runLogLocationSchema, cn as numberDisplayOptionsSchema, ct as getCaseRowEvalKey, dn as z, dt as caseRowSchema, en as traceSpanSchema, et as removeDefaultConfigSchema, fn as buildTraceTree, ft as discoveryIssueSchema, gn as hashCacheKey, gt as evalStatsConfigSchema, hn as evalTracer, ht as evalStatItemSchema, in as columnFormatSchema, it as trialSelectionModeSchema, j as extractLlmCalls, jn as incrementEvalOutput, jt as evalChartsConfigSchema, k as extractCacheHits, kn as getEvalCaseInput, kt as evalChartTooltipExtraSchema, ln as repoFileRefSchema, lt as assertionFailureSchema, mn as evalSpan, mt as evalStatAggregateSchema, nn as cellValueSchema, nt as resolveLlmCallsConfig, on as fileRefSchema, ot as buildEvalKey, pn as captureEvalSpanError, pt as evalFreshnessStatusSchema, q as apiCallsConfigSchema, qt as traceAttributeDisplayInputSchema, rn as columnDefSchema, rt as runLogsConfigSchema, sn as jsonCellSchema, st as getCaseRowCaseKey, tn as traceSpanWarningSchema, tt as resolveApiCallsConfig, un as runArtifactRefSchema, ut as caseDetailSchema, vn as deserializeCacheRecording, vt as runLogEntrySchema, wn as appendToEvalOutput, wt as evalChartAxisSchema, xn as serializeCacheValue, xt as runLogPhaseSchema, yn as deserializeCacheValue, yt as runLogLevelSchema, z as runManifestSchema, zn as setScopeCacheContext, zt as cacheOperationTypeSchema } from "./runOrchestration-CW_Fzr2c.mjs";
|
|
2
|
+
import { n as createRunner, t as runCli } from "./cli-T6FYE2Bk.mjs";
|
|
3
|
+
import "./src-CbVnqjW3.mjs";
|
|
4
4
|
export { DEFAULT_API_CALLS_CONFIG, DEFAULT_LLM_CALLS_CONFIG, EvalAssertionError, advanceEvalTime, agentEvalsConfigSchema, apiCallMetricFormatSchema, apiCallMetricPlacementSchema, apiCallMetricSchema, apiCallsConfigSchema, appendToEvalOutput, applyDerivedCallAttributes, assertionFailureSchema, buildCaseKey, buildEvalKey, buildTraceTree, cacheDebugKeyEntrySchema, cacheDebugKeyFileSchema, cacheEntrySchema, cacheEntryWithDebugKeySchema, cacheFileSchema, cacheListItemSchema, cacheModeSchema, cacheOperationTypeSchema, cacheRecordingOpSchema, cacheRecordingSchema, cacheStatusSchema, captureEvalSpanError, caseDetailSchema, caseRowSchema, cellValueSchema, columnDefSchema, columnFormatSchema, columnKindSchema, createRunRequestSchema, createRunner, defaultConfigKeySchema, defineEval, deriveScopedSummaryFromCases, deriveStatusFromCaseRows, deriveStatusFromChildStatuses, deserializeCacheRecording, deserializeCacheValue, discoveryIssueSchema, evalAssert, evalChartAggregateSchema, evalChartAxisSchema, evalChartBuiltinMetricSchema, evalChartColorSchema, evalChartConfigSchema, evalChartMetricSchema, evalChartTooltipExtraSchema, evalChartTypeSchema, evalChartsConfigSchema, evalFreshnessStatusSchema, evalLog, evalSpan, evalStatAggregateSchema, evalStatItemSchema, evalStatsConfigSchema, evalSummarySchema, evalTracer, extractApiCalls, extractCacheEntries, extractCacheHits, extractLlmCalls, fileRefSchema, getCaseRowCaseKey, getCaseRowEvalKey, getCurrentScope, getEvalCaseInput, getEvalDisplayStatus, getEvalRegistry, getEvalStartTime, getEvalTitle, getNestedAttribute, hashCacheKey, hashCacheKeySync, incrementEvalOutput, isInEvalScope, jsonCellSchema, llmCallMetricFormatSchema, llmCallMetricPlacementSchema, llmCallMetricSchema, llmCallPricingSchema, llmCallsConfigSchema, mergeEvalOutput, nextEvalId, numberDisplayOptionsSchema, removeDefaultConfigSchema, repoFile, repoFileRefSchema, resolveApiCallsConfig, resolveLlmCallsConfig, runArtifactRefSchema, runCli, runInEvalRuntimeScope, 
runInEvalScope, runInExistingEvalScope, runLogEntrySchema, runLogLevelSchema, runLogLocationSchema, runLogPhaseSchema, runLogsConfigSchema, runManifestSchema, runSummarySchema, scoreTraceSchema, serializeCacheRecording, serializeCacheValue, serializedCacheSpanSchema, setEvalOutput, setScopeCacheContext, spanCacheOptionsSchema, sseEnvelopeSchema, startEvalBackgroundJob, traceAttributeDisplayFormatSchema, traceAttributeDisplayInputSchema, traceAttributeDisplayPlacementSchema, traceAttributeDisplaySchema, traceCacheRefSchema, traceDisplayConfigSchema, traceDisplayInputConfigSchema, traceSpanErrorSchema, traceSpanKindSchema, traceSpanSchema, traceSpanWarningSchema, trialSelectionModeSchema, updateManualScoreRequestSchema, z };
|
package/dist/runChild.mjs
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import { B as runSummarySchema, T as createRunRequestSchema, Tn as configureEvalRunLogs, b as loadConfig, gt as evalStatsConfigSchema, jt as evalChartsConfigSchema, ot as buildEvalKey, r as getTargetEvals$1, rn as columnDefSchema, t as executeRun, v as parseEvalDiscovery, w as createFsCacheStore, z as runManifestSchema } from "./runOrchestration-B31SV_Bq.mjs";
|
|
1
|
+
import { B as runSummarySchema, T as createRunRequestSchema, Tn as configureEvalRunLogs, b as loadConfig, gt as evalStatsConfigSchema, jt as evalChartsConfigSchema, ot as buildEvalKey, r as getTargetEvals$1, rn as columnDefSchema, t as executeRun, v as parseEvalDiscovery, w as createFsCacheStore, z as runManifestSchema } from "./runOrchestration-CW_Fzr2c.mjs";
|
|
2
2
|
import { createHash } from "node:crypto";
|
|
3
3
|
import { readFile } from "node:fs/promises";
|
|
4
4
|
import { relative } from "node:path";
|
|
@@ -1436,7 +1436,6 @@ function createTraceCache(generateSpanId) {
|
|
|
1436
1436
|
const namespace = info.namespace ?? `${cacheCtx.evalId}__${info.name}`;
|
|
1437
1437
|
const keyHash = await hashCacheKey({
|
|
1438
1438
|
namespace,
|
|
1439
|
-
codeFingerprint: cacheCtx.codeFingerprint,
|
|
1440
1439
|
key: info.key
|
|
1441
1440
|
}, { serializeFileBytes: info.serializeFileBytes === true });
|
|
1442
1441
|
const activeSpan = scope.activeSpanStack.at(-1);
|
|
@@ -1879,10 +1878,9 @@ async function traceSpanInternal(info, fn) {
|
|
|
1879
1878
|
const cacheCtx = scope.cacheContext;
|
|
1880
1879
|
if (cacheOpts !== void 0 && cacheCtx !== void 0 && scope.replayingDepth === 0) {
|
|
1881
1880
|
const ctx = cacheCtx;
|
|
1882
|
-
const namespace = cacheOpts
|
|
1881
|
+
const namespace = getRequiredSpanCacheNamespace(cacheOpts);
|
|
1883
1882
|
const keyHash = await hashCacheKey({
|
|
1884
1883
|
namespace,
|
|
1885
|
-
codeFingerprint: ctx.codeFingerprint,
|
|
1886
1884
|
key: cacheOpts.key
|
|
1887
1885
|
}, { serializeFileBytes: cacheOpts.serializeFileBytes === true });
|
|
1888
1886
|
mergeSpanAttributes(spanRecord, {
|
|
@@ -1966,6 +1964,12 @@ async function traceSpanInternal(info, fn) {
|
|
|
1966
1964
|
scope.activeSpanStack.pop();
|
|
1967
1965
|
}
|
|
1968
1966
|
}
|
|
1967
|
+
function getRequiredSpanCacheNamespace(cacheOpts) {
|
|
1968
|
+
if (!isRecordLike$1(cacheOpts)) throw new Error("Cached spans require a non-empty cache.namespace");
|
|
1969
|
+
const namespace = cacheOpts.namespace;
|
|
1970
|
+
if (typeof namespace !== "string" || namespace.length === 0) throw new Error("Cached spans require a non-empty cache.namespace");
|
|
1971
|
+
return namespace;
|
|
1972
|
+
}
|
|
1969
1973
|
/**
|
|
1970
1974
|
* Trace builder used to create hierarchical spans and checkpoints during eval
|
|
1971
1975
|
* execution.
|
|
@@ -2248,8 +2252,8 @@ const cacheModeSchema = z.enum([
|
|
|
2248
2252
|
const spanCacheOptionsSchema = z.object({
|
|
2249
2253
|
/** Arbitrary JSON-safe value used to derive the cache key. */
|
|
2250
2254
|
key: z.unknown(),
|
|
2251
|
-
/**
|
|
2252
|
-
namespace: z.string().
|
|
2255
|
+
/** Required cache namespace shared by span cache entries in the same domain. */
|
|
2256
|
+
namespace: z.string().min(1),
|
|
2253
2257
|
/**
|
|
2254
2258
|
* Include native `Blob`/`File` bytes in the cache key. By default only stable
|
|
2255
2259
|
* metadata (`type`, `size`, plus `name`/`lastModified` for `File`) is used.
|
|
@@ -4197,7 +4201,7 @@ async function writeDebugKeyFile(debugDir, debugFile) {
|
|
|
4197
4201
|
await mkdir(debugDir, { recursive: true });
|
|
4198
4202
|
const filePath = ownerPath(debugDir, debugFile.owner);
|
|
4199
4203
|
const tmpPath = `${filePath}.${process.pid.toString()}.tmp`;
|
|
4200
|
-
await writeFile(tmpPath, JSON.stringify(debugFile));
|
|
4204
|
+
await writeFile(tmpPath, JSON.stringify(debugFile, null, 2));
|
|
4201
4205
|
await rename(tmpPath, filePath);
|
|
4202
4206
|
}
|
|
4203
4207
|
function pruneEntries(entries, namespace, maxEntries, protectedKey) {
|
|
@@ -1,5 +1,5 @@
|
|
|
1
|
-
import { n as createRunner } from "./cli-C4yumCXE.mjs";
|
|
2
|
-
import "./src-CLMrRle2.mjs";
|
|
1
|
+
import { n as createRunner } from "./cli-T6FYE2Bk.mjs";
|
|
2
|
+
import "./src-CbVnqjW3.mjs";
|
|
3
3
|
//#region ../../apps/server/src/runner.ts
|
|
4
4
|
let runnerInstance = null;
|
|
5
5
|
function getRunnerInstance() {
|
|
@@ -1,2 +1,2 @@
|
|
|
1
|
-
import { n as initRunner, t as getRunnerInstance } from "./runner-LHN7L-xk.mjs";
|
|
1
|
+
import { n as initRunner, t as getRunnerInstance } from "./runner-BsRloAm3.mjs";
|
|
2
2
|
export { getRunnerInstance, initRunner };
|
package/package.json
CHANGED
|
@@ -87,7 +87,10 @@ export async function runRefundWorkflow(input: RefundInput) {
|
|
|
87
87
|
{
|
|
88
88
|
kind: 'llm',
|
|
89
89
|
name: 'plan-refund',
|
|
90
|
-
cache: {
|
|
90
|
+
cache: {
|
|
91
|
+
namespace: 'refund-workflow__plan-refund',
|
|
92
|
+
key: { prompt: input.message, model: 'gpt-4o-mini' },
|
|
93
|
+
},
|
|
91
94
|
},
|
|
92
95
|
async () => {
|
|
93
96
|
let text: string;
|
|
@@ -319,15 +322,18 @@ Their shapes live in the types; no need to memorize the option set.
|
|
|
319
322
|
|
|
320
323
|
## Cached operations
|
|
321
324
|
|
|
322
|
-
Wrap a costly pure span in `cache: { key }` so later runs replay its
|
|
323
|
-
effects without re-executing:
|
|
325
|
+
Wrap a costly pure span in `cache: { namespace, key }` so later runs replay its
|
|
326
|
+
recorded effects without re-executing:
|
|
324
327
|
|
|
325
328
|
```ts
|
|
326
329
|
await evalTracer.span(
|
|
327
330
|
{
|
|
328
331
|
kind: 'llm',
|
|
329
332
|
name: 'plan-refund',
|
|
330
|
-
cache: {
|
|
333
|
+
cache: {
|
|
334
|
+
namespace: 'refund-workflow__plan-refund',
|
|
335
|
+
key: { prompt: input.message, model: 'gpt-4o-mini' },
|
|
336
|
+
},
|
|
331
337
|
},
|
|
332
338
|
async () => {
|
|
333
339
|
const result = await llm.complete(input.message);
|
|
@@ -368,12 +374,13 @@ Mental model:
|
|
|
368
374
|
namespace, and hit/miss status. When called directly from the case body
|
|
369
375
|
(no surrounding span), the ref is recorded on the case detail's `cacheRefs`
|
|
370
376
|
array.
|
|
371
|
-
-
|
|
372
|
-
|
|
373
|
-
-
|
|
374
|
-
|
|
375
|
-
|
|
376
|
-
|
|
377
|
+
- Cache identity is the namespace plus the authored key. Source-file
|
|
378
|
+
fingerprints are stored as metadata for inspection, but do not participate in
|
|
379
|
+
cache-key hashing.
|
|
380
|
+
- Cached spans require an explicit `cache.namespace`; value caches default to
|
|
381
|
+
`${evalId}__${name}` and can be overridden with `namespace`. Matching
|
|
382
|
+
namespaces share entries across operations/evals that use the same authored
|
|
383
|
+
key.
|
|
377
384
|
- Authored eval ids are unique within one eval file. The exact eval identity is
|
|
378
385
|
the workspace-relative file path plus eval id, so the same id can be reused in
|
|
379
386
|
different files. Case ids must be unique within one eval; duplicate case ids
|
|
@@ -462,9 +469,9 @@ When adding or changing evals:
|
|
|
462
469
|
formats from the `ColumnFormat` type.
|
|
463
470
|
5. Promote high-signal span attributes with `traceDisplay` so they surface in
|
|
464
471
|
the trace tree and detail pane.
|
|
465
|
-
6. Cache costly pure spans with `cache: { key }` and pure spanless values with
|
|
466
|
-
`evalTracer.cache(...)`; never cache operations whose external side effects
|
|
467
|
-
you depend on.
|
|
472
|
+
6. Cache costly pure spans with `cache: { namespace, key }` and pure spanless
|
|
473
|
+
values with `evalTracer.cache(...)`; never cache operations whose external
|
|
474
|
+
side effects you depend on.
|
|
468
475
|
7. Sanity-check after changes: `agent-evals list`, then
|
|
469
476
|
`agent-evals run --eval <id>`; use `--file <path|glob>` to target one file
|
|
470
477
|
when multiple files use the same eval id.
|
package/dist/src-CLMrRle2.mjs
DELETED