@ls-stack/agent-eval 0.36.0 → 0.37.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/runChild.mjs CHANGED
@@ -1,4 +1,4 @@
1
- import { A as createRunRequestSchema, C as loadConfig, D as createFsCacheStore, It as manualInputDescriptorSchema, K as runManifestSchema, Ot as evalStatsConfigSchema, Qt as evalChartsConfigSchema, Xn as configureEvalRunLogs, q as runSummarySchema, r as getTargetEvals$1, t as executeRun, vt as buildEvalKey, wn as columnDefSchema, x as parseEvalDiscovery } from "./runOrchestration-V1TxX8es.mjs";
1
+ import { At as evalStatsConfigSchema, C as parseEvalDiscovery, En as columnDefSchema, J as runManifestSchema, M as createRunRequestSchema, Qn as configureEvalRunLogs, Rt as manualInputDescriptorSchema, T as loadConfig, Y as runSummarySchema, bt as buildEvalKey, en as evalChartsConfigSchema, k as createFsCacheStore, p as persistRunState, r as getTargetEvals$1, t as executeRun } from "./runOrchestration-C4o5TcIu.mjs";
2
2
  import { createHash } from "node:crypto";
3
3
  import { readFile } from "node:fs/promises";
4
4
  import { relative } from "node:path";
@@ -14,6 +14,7 @@ const evalMetaSchema = z.object({
14
14
  sourceFingerprint: z.string().nullable(),
15
15
  columnDefs: z.array(columnDefSchema),
16
16
  caseCount: z.number().nullable(),
17
+ caseIds: z.array(z.string()).optional(),
17
18
  stats: evalStatsConfigSchema.optional(),
18
19
  charts: evalChartsConfigSchema.optional(),
19
20
  manualInputDescriptor: manualInputDescriptorSchema.optional(),
@@ -27,6 +28,7 @@ const runChildContextSchema = z.object({
27
28
  summary: runSummarySchema,
28
29
  evals: z.array(evalMetaSchema).optional()
29
30
  });
31
+ let activeContext;
30
32
  function sendMessage(message) {
31
33
  if (process.send === void 0) return;
32
34
  process.send(message);
@@ -93,6 +95,7 @@ async function main() {
93
95
  process.exit(1);
94
96
  });
95
97
  const context = await readContext(process.argv[2]);
98
+ activeContext = context;
96
99
  process.chdir(context.workspaceRoot);
97
100
  const config = await loadConfig();
98
101
  configureEvalRunLogs({ captureConsole: config.runLogs?.captureConsole !== false });
@@ -153,7 +156,46 @@ async function main() {
153
156
  evals: [...evals.values()]
154
157
  });
155
158
  }
156
- await main();
159
/**
 * Last-resort handler for an error that escaped `main()` in the run child.
 *
 * Marks the process as failed (`process.exitCode = 1`) and prints the
 * formatted error. If the run context was already loaded, it also persists
 * the run with status `"error"` and reports a `run.error` event through
 * `sendMessage`.
 *
 * This function does not reject when persisting fails: the persistence error
 * is logged and the `run.error` event is still sent, so the caller can go on
 * to its own cleanup.
 *
 * @param {unknown} error - The value `main()` rejected with.
 * @returns {Promise<void>}
 */
async function handleFatalRunChildError(error) {
  const message = formatUnknownErrorDetails(error);
  process.exitCode = 1;
  console.error(message);
  // The failure happened before the context was read: there is no run
  // directory or manifest to report against.
  if (activeContext === void 0) return;
  const endedAt = new Date().toISOString();
  try {
    // NOTE(review): empty `cases` / `caseDetails` / `listeners` are passed here;
    // confirm persistRunState does not discard case rows that were already
    // written before the failure.
    await persistRunState({
      runDir: activeContext.runDir,
      manifest: {
        ...activeContext.manifest,
        status: "error",
        endedAt
      },
      summary: {
        ...activeContext.summary,
        status: "error",
        errorMessage: message
      },
      cases: [],
      caseDetails: new Map(),
      listeners: new Set()
    });
  } catch (persistError) {
    // A failed write must not suppress the run.error event below, nor reject
    // out of this handler and skip whatever the caller does afterwards.
    console.error(formatUnknownErrorDetails(persistError));
  }
  sendMessage({
    type: "event",
    event: {
      type: "run.error",
      runId: activeContext.manifest.id,
      timestamp: endedAt,
      payload: { message }
    }
  });
}
191
/**
 * Turns an arbitrary thrown value into a printable string.
 *
 * - `Error` instances yield their stack, falling back to the message.
 * - Strings are returned unchanged.
 * - Other values use `String(value)`, except that a value which would render
 *   as the uninformative `"[object Object]"` is serialised with
 *   `JSON.stringify` when that is possible.
 *
 * Never throws, including for prototype-less objects, cyclic objects or
 * objects containing BigInt values.
 *
 * @param {unknown} error - Any thrown or rejected value.
 * @returns {string} Human-readable description of the value.
 */
function formatUnknownErrorDetails(error) {
  if (error instanceof Error) return error.stack ?? error.message;
  if (typeof error === "string") return error;
  let text;
  try {
    text = String(error);
  } catch {
    // String() throws for objects that cannot be converted to a primitive,
    // e.g. Object.create(null).
    text = Object.prototype.toString.call(error);
  }
  if (text !== "[object Object]") return text;
  try {
    // JSON.stringify may return undefined (e.g. toJSON returning undefined).
    return JSON.stringify(error) ?? text;
  } catch {
    // Cyclic structures or BigInt members: keep the generic tag.
    return text;
  }
}
196
+ await main().catch(async (error) => {
197
+ await handleFatalRunChildError(error);
198
+ });
157
199
  process.disconnect();
158
200
  //#endregion
159
201
  export {};
@@ -2980,6 +2980,8 @@ const evalSummarySchema = z.object({
2980
2980
  currentCommitSha: z.string().nullable(),
2981
2981
  columnDefs: z.array(columnDefSchema),
2982
2982
  caseCount: z.number().nullable(),
2983
+ /** Authored case ids discovered for this eval, when case generation has run. */
2984
+ caseIds: z.array(z.string()).optional(),
2983
2985
  lastRunStatus: z.enum([
2984
2986
  "pass",
2985
2987
  "fail",
@@ -3626,6 +3628,11 @@ const runManifestSchema = z.object({
3626
3628
  "cancelled",
3627
3629
  "error"
3628
3630
  ]),
3631
+ /**
3632
+ * Temporary runs are persisted like normal runs, but are deleted before the
3633
+ * next run starts. Older persisted runs default to durable history.
3634
+ */
3635
+ temporary: z.boolean().optional().default(false),
3629
3636
  startedAt: z.string(),
3630
3637
  endedAt: z.string().nullable(),
3631
3638
  /**
@@ -4521,6 +4528,11 @@ const createRunRequestSchema = z.object({
4521
4528
  }),
4522
4529
  trials: z.number().min(1),
4523
4530
  /**
4531
+ * Persist this run as temporary history. Temporary runs are visible while
4532
+ * present, then deleted before the next run of any kind starts.
4533
+ */
4534
+ temporary: z.boolean().optional(),
4535
+ /**
4524
4536
  * Optional cache controls for the run. When omitted, the cache is used in
4525
4537
  * its default read-through / write-on-miss mode.
4526
4538
  */
@@ -6188,6 +6200,27 @@ function runTouchesEval(params) {
6188
6200
  if (params.target.mode === "evalIds") return params.target.evalKeys?.includes(params.evalKey) ?? params.target.evalIds?.includes(params.evalId ?? params.evalKey) ?? false;
6189
6201
  return false;
6190
6202
  }
6203
/**
 * Drops every run whose manifest has `temporary === true` from `params.runs`
 * and removes its run directory from disk.
 *
 * A temporary run that is still `"running"` is first marked `"cancelled"` in
 * both manifest and summary (with `endedAt` and `totalDurationMs` filled in)
 * and handed to `params.cancelRunningRun`.
 *
 * Every temporary run is processed even if removing one directory fails, so a
 * single filesystem error cannot leave later temporary runs uncancelled. The
 * first removal error is rethrown once the loop has finished.
 *
 * @param {object} params
 * @param {Map<string, object>} params.runs - Run id to run state; entries for
 *   temporary runs are deleted from this map.
 * @param {(run: object) => void} params.cancelRunningRun - Called for each
 *   temporary run that was still running.
 * @returns {Promise<number>} Number of temporary runs whose directory was removed.
 * @throws The first error raised by `rm`, after all runs were attempted.
 */
async function deleteTemporaryRuns(params) {
  let deletedRuns = 0;
  let hasFailure = false;
  let firstFailure;
  // Iterate over a snapshot because entries are deleted from the map in the loop.
  for (const [runId, run] of [...params.runs]) {
    if (run.manifest.temporary !== true) continue;
    if (run.manifest.status === "running") {
      const endedAt = new Date();
      run.manifest.status = "cancelled";
      run.manifest.endedAt = endedAt.toISOString();
      run.summary.status = "cancelled";
      run.summary.totalDurationMs = endedAt.getTime() - new Date(run.manifest.startedAt).getTime();
      params.cancelRunningRun(run);
    }
    params.runs.delete(runId);
    try {
      await rm(run.runDir, {
        recursive: true,
        force: true
      });
      deletedRuns += 1;
    } catch (error) {
      // Remember the first failure but keep cleaning up the remaining runs.
      if (!hasFailure) {
        hasFailure = true;
        firstFailure = error;
      }
    }
  }
  if (hasFailure) throw firstFailure;
  return deletedRuns;
}
6191
6224
  async function recomputeEvalStatusesInRuns(params) {
6192
6225
  let updatedRuns = 0;
6193
6226
  for (const run of params.runs) {
@@ -6384,6 +6417,12 @@ function encodeCaseDetailFileName(caseId) {
6384
6417
  return encodeURIComponent(caseId);
6385
6418
  }
6386
6419
  //#endregion
6420
+ //#region ../runner/src/stackFormatting.ts
6421
+ const orphanedAnsiSgrPattern = /\[(?:\d{1,3}(?:;\d{1,3})*)?m/g;
6422
+ function stripTerminalControlCodes(value) {
6423
+ return stripVTControlCharacters(value).replaceAll(orphanedAnsiSgrPattern, "");
6424
+ }
6425
+ //#endregion
6387
6426
  //#region ../runner/src/moduleIsolation.ts
6388
6427
  const isolationParam = "agent-evals-isolate";
6389
6428
  const pathSegmentSeparatorPattern = /[\\/]+/;
@@ -6474,12 +6513,6 @@ async function runWithModuleIsolation(context, fn) {
6474
6513
  return await isolationStorage.run(context, fn);
6475
6514
  }
6476
6515
  //#endregion
6477
- //#region ../runner/src/stackFormatting.ts
6478
- const orphanedAnsiSgrPattern = /\[(?:\d{1,3}(?:;\d{1,3})*)?m/g;
6479
- function stripTerminalControlCodes(value) {
6480
- return stripVTControlCharacters(value).replaceAll(orphanedAnsiSgrPattern, "");
6481
- }
6482
- //#endregion
6483
6516
  //#region ../runner/src/runExecution.ts
6484
6517
  function filterEvalCases(cases, caseIds) {
6485
6518
  if (!caseIds || caseIds.length === 0) return cases;
@@ -7061,6 +7094,8 @@ async function executeRun({ runState, request, runDir, config, cacheStore, lastR
7061
7094
  const duplicateCaseIds = findDuplicateCaseIds(runnableCases);
7062
7095
  if (duplicateCaseIds.length > 0) throw new Error(`Duplicate case id${duplicateCaseIds.length === 1 ? "" : "s"} in ${evalMeta.filePath}#${evalMeta.id}: ${duplicateCaseIds.join(", ")}`);
7063
7096
  const cases = filterEvalCases(runnableCases, request.target.caseIds);
7097
+ evalMeta.caseCount = runnableCases.length;
7098
+ evalMeta.caseIds = runnableCases.map((evalCase) => evalCase.id);
7064
7099
  runState.summary.totalCases += cases.length;
7065
7100
  const defaultConfig = resolveEvalDefaultConfig({
7066
7101
  evalDef,
@@ -7269,4 +7304,4 @@ function toLastRunStatus(status) {
7269
7304
  return status === "pending" ? null : status;
7270
7305
  }
7271
7306
  //#endregion
7272
- export { apiCallMetricSchema as $, getCurrentScope as $n, cacheDebugKeyEntrySchema as $t, createRunRequestSchema as A, repoFileRefSchema as An, runLogEntrySchema as At, getNestedAttribute as B, deserializeCacheValue as Bn, manualInputNumberFieldSchema as Bt, loadConfig as C, cellValueSchema as Cn, caseRowSchema as Ct, createFsCacheStore as D, fileRefSchema as Dn, evalStatItemSchema as Dt, validateCharts as E, columnKindSchema as En, evalStatAggregateSchema as Et, extractApiCalls as F, evalSpan as Fn, manualInputBooleanFieldSchema as Ft, deriveStatusFromChildStatuses as G, readManualInputFile as Gn, evalChartAxisSchema as Gt, getEvalDisplayStatus as H, serializeCacheValue as Hn, manualInputSelectOptionSchema as Ht, extractLlmCalls as I, evalTracer as In, manualInputDescriptorSchema as It, DEFAULT_API_CALLS_CONFIG as J, advanceEvalTime as Jn, evalChartConfigSchema as Jt, runManifestSchema as K, evalExpect as Kn, evalChartBuiltinMetricSchema as Kt, simulateLlmCallCost as L, hashCacheKey as Ln, manualInputFieldDescriptorSchema as Lt, sseEnvelopeSchema as M, z$1 as Mn, runLogLocationSchema as Mt, extractCacheEntries as N, buildTraceTree as Nn, runLogPhaseSchema as Nt, configReloadStateSchema as O, jsonCellSchema as On, evalStatsConfigSchema as Ot, extractCacheHits as P, captureEvalSpanError as Pn, scoreTraceSchema as Pt, apiCallMetricPlacementSchema as Q, evalLog as Qn, evalChartsConfigSchema as Qt, simulateTokenAllocation as R, hashCacheKeySync as Rn, manualInputJsonFieldSchema as Rt, resolveEvalDefaultConfig as S, traceSpanWarningSchema as Sn, caseDetailSchema as St, normalizeScoreDef as T, columnFormatSchema as Tn, evalFreshnessStatusSchema as Tt, deriveScopedSummaryFromCases as U, repoFile as Un, manualInputTextFieldSchema as Ut, getEvalTitle as V, serializeCacheRecording as Vn, manualInputSelectFieldSchema as Vt, deriveStatusFromCaseRows as W, manualInputFileValueSchema as Wn, evalChartAggregateSchema as Wt, agentEvalsConfigSchema as X, configureEvalRunLogs as 
Xn, evalChartTooltipExtraSchema as Xt, DEFAULT_LLM_CALLS_CONFIG as Y, appendToEvalOutput as Yn, evalChartMetricSchema as Yt, apiCallMetricFormatSchema as Z, evalAssert as Zn, evalChartTypeSchema as Zt, buildManualInputDescriptor as _, traceDisplayConfigSchema as _n, buildCaseKey as _t, getLastRunStatuses as a, cacheModeSchema as an, nextEvalId as ar, llmCallCostCurrencySchema as at, loadEvalModule as b, traceSpanKindSchema as bn, getCaseRowEvalKey as bt, loadPersistedRunSnapshots as c, cacheRecordingSchema as cn, runInExistingEvalScope as cr, llmCallMetricSchema as ct, persistRunState as d, spanCacheOptionsSchema as dn, startEvalBackgroundJob as dr, llmCallsConfigSchema as dt, cacheDebugKeyFileSchema as en, getEvalCaseInput as er, apiCallsConfigSchema as et, recomputeEvalStatusesInRuns as f, traceCacheRefSchema as fn, defineEval as fr, removeDefaultConfigSchema as ft, resolveArtifactPath as g, traceAttributeDisplaySchema as gn, trialSelectionModeSchema as gt, resolveTracePresentation as h, traceAttributeDisplayPlacementSchema as hn, runLogsConfigSchema as ht, generateRunId as i, cacheListItemSchema as in, mergeEvalOutput as ir, evalDeriveConfigSchema as it, updateManualScoreRequestSchema as j, runArtifactRefSchema as jn, runLogLevelSchema as jt, configReloadStatusSchema as k, numberDisplayOptionsSchema as kn, evalSummarySchema as kt, nextShortIdFromSnapshots as l, cacheStatusSchema as ln, setEvalOutput as lr, llmCallPricingRateSchema as lt, runTouchesEval as m, traceAttributeDisplayInputSchema as mn, resolveLlmCallsConfig as mt, getTargetEvalKeys as n, cacheEntryWithDebugKeySchema as nn, incrementEvalOutput as nr, evalColumnOverrideSchema as nt, getLatestRunInfos as o, cacheOperationTypeSchema as on, runInEvalRuntimeScope as or, llmCallMetricFormatSchema as ot, recomputePersistedCaseStatus as p, traceAttributeDisplayFormatSchema as pn, getEvalRegistry as pr, resolveApiCallsConfig as pt, runSummarySchema as q, EvalAssertionError as qn, evalChartColorSchema as qt, 
getTargetEvals as r, cacheFileSchema as rn, isInEvalScope as rr, evalColumnsSchema as rt, loadPersistedRunSnapshot as s, cacheRecordingOpSchema as sn, runInEvalScope as sr, llmCallMetricPlacementSchema as st, executeRun as t, cacheEntrySchema as tn, getEvalStartTime as tr, defaultConfigKeySchema as tt, persistCaseDetail as u, serializedCacheSpanSchema as un, setScopeCacheContext as ur, llmCallPricingSchema as ut, parseManualInputValues as v, traceDisplayInputConfigSchema as vn, buildEvalKey as vt, buildDeclaredColumnDefs as w, columnDefSchema as wn, discoveryIssueSchema as wt, parseEvalDiscovery as x, traceSpanSchema as xn, assertionFailureSchema as xt, deriveEvalFreshness as y, traceSpanErrorSchema as yn, getCaseRowCaseKey as yt, applyDerivedCallAttributes as z, deserializeCacheRecording as zn, manualInputMultilineFieldSchema as zt };
7307
+ export { apiCallMetricFormatSchema as $, evalAssert as $n, evalChartTypeSchema as $t, configReloadStateSchema as A, jsonCellSchema as An, evalStatsConfigSchema as At, simulateTokenAllocation as B, hashCacheKeySync as Bn, manualInputJsonFieldSchema as Bt, parseEvalDiscovery as C, traceSpanSchema as Cn, assertionFailureSchema as Ct, normalizeScoreDef as D, columnFormatSchema as Dn, evalFreshnessStatusSchema as Dt, buildDeclaredColumnDefs as E, columnDefSchema as En, discoveryIssueSchema as Et, extractCacheEntries as F, buildTraceTree as Fn, runLogPhaseSchema as Ft, deriveScopedSummaryFromCases as G, repoFile as Gn, manualInputTextFieldSchema as Gt, getNestedAttribute as H, deserializeCacheValue as Hn, manualInputNumberFieldSchema as Ht, extractCacheHits as I, captureEvalSpanError as In, scoreTraceSchema as It, runManifestSchema as J, evalExpect as Jn, evalChartBuiltinMetricSchema as Jt, deriveStatusFromCaseRows as K, manualInputFileValueSchema as Kn, evalChartAggregateSchema as Kt, extractApiCalls as L, evalSpan as Ln, manualInputBooleanFieldSchema as Lt, createRunRequestSchema as M, repoFileRefSchema as Mn, runLogEntrySchema as Mt, updateManualScoreRequestSchema as N, runArtifactRefSchema as Nn, runLogLevelSchema as Nt, validateCharts as O, columnKindSchema as On, evalStatAggregateSchema as Ot, sseEnvelopeSchema as P, z$1 as Pn, runLogLocationSchema as Pt, agentEvalsConfigSchema as Q, configureEvalRunLogs as Qn, evalChartTooltipExtraSchema as Qt, extractLlmCalls as R, evalTracer as Rn, manualInputDescriptorSchema as Rt, loadEvalModule as S, traceSpanKindSchema as Sn, getCaseRowEvalKey as St, loadConfig as T, cellValueSchema as Tn, caseRowSchema as Tt, getEvalTitle as U, serializeCacheRecording as Un, manualInputSelectFieldSchema as Ut, applyDerivedCallAttributes as V, deserializeCacheRecording as Vn, manualInputMultilineFieldSchema as Vt, getEvalDisplayStatus as W, serializeCacheValue as Wn, manualInputSelectOptionSchema as Wt, DEFAULT_API_CALLS_CONFIG as X, 
advanceEvalTime as Xn, evalChartConfigSchema as Xt, runSummarySchema as Y, EvalAssertionError as Yn, evalChartColorSchema as Yt, DEFAULT_LLM_CALLS_CONFIG as Z, appendToEvalOutput as Zn, evalChartMetricSchema as Zt, resolveTracePresentation as _, traceAttributeDisplayPlacementSchema as _n, runLogsConfigSchema as _t, generateRunId as a, cacheFileSchema as an, isInEvalScope as ar, evalColumnsSchema as at, parseManualInputValues as b, traceDisplayInputConfigSchema as bn, buildEvalKey as bt, loadPersistedRunSnapshot as c, cacheOperationTypeSchema as cn, runInEvalRuntimeScope as cr, llmCallMetricFormatSchema as ct, persistCaseDetail as d, cacheStatusSchema as dn, setEvalOutput as dr, llmCallPricingRateSchema as dt, evalChartsConfigSchema as en, evalLog as er, apiCallMetricPlacementSchema as et, deleteTemporaryRuns as f, serializedCacheSpanSchema as fn, setScopeCacheContext as fr, llmCallPricingSchema as ft, runTouchesEval as g, traceAttributeDisplayInputSchema as gn, resolveLlmCallsConfig as gt, recomputePersistedCaseStatus as h, traceAttributeDisplayFormatSchema as hn, getEvalRegistry as hr, resolveApiCallsConfig as ht, stripTerminalControlCodes as i, cacheEntryWithDebugKeySchema as in, incrementEvalOutput as ir, evalColumnOverrideSchema as it, configReloadStatusSchema as j, numberDisplayOptionsSchema as jn, evalSummarySchema as jt, createFsCacheStore as k, fileRefSchema as kn, evalStatItemSchema as kt, loadPersistedRunSnapshots as l, cacheRecordingOpSchema as ln, runInEvalScope as lr, llmCallMetricPlacementSchema as lt, recomputeEvalStatusesInRuns as m, traceCacheRefSchema as mn, defineEval as mr, removeDefaultConfigSchema as mt, getTargetEvalKeys as n, cacheDebugKeyFileSchema as nn, getEvalCaseInput as nr, apiCallsConfigSchema as nt, getLastRunStatuses as o, cacheListItemSchema as on, mergeEvalOutput as or, evalDeriveConfigSchema as ot, persistRunState as p, spanCacheOptionsSchema as pn, startEvalBackgroundJob as pr, llmCallsConfigSchema as pt, 
deriveStatusFromChildStatuses as q, readManualInputFile as qn, evalChartAxisSchema as qt, getTargetEvals as r, cacheEntrySchema as rn, getEvalStartTime as rr, defaultConfigKeySchema as rt, getLatestRunInfos as s, cacheModeSchema as sn, nextEvalId as sr, llmCallCostCurrencySchema as st, executeRun as t, cacheDebugKeyEntrySchema as tn, getCurrentScope as tr, apiCallMetricSchema as tt, nextShortIdFromSnapshots as u, cacheRecordingSchema as un, runInExistingEvalScope as ur, llmCallMetricSchema as ut, resolveArtifactPath as v, traceAttributeDisplaySchema as vn, trialSelectionModeSchema as vt, resolveEvalDefaultConfig as w, traceSpanWarningSchema as wn, caseDetailSchema as wt, deriveEvalFreshness as x, traceSpanErrorSchema as xn, getCaseRowCaseKey as xt, buildManualInputDescriptor as y, traceDisplayConfigSchema as yn, buildCaseKey as yt, simulateLlmCallCost as z, hashCacheKey as zn, manualInputFieldDescriptorSchema as zt };
@@ -1,5 +1,5 @@
1
- import { n as createRunner } from "./cli-Dg3abrOv.mjs";
2
- import "./src-DBypR4TV.mjs";
1
+ import { n as createRunner } from "./cli-CwGcJYWe.mjs";
2
+ import "./src--13_4uDG.mjs";
3
3
  //#region ../../apps/server/src/runner.ts
4
4
  let runnerInstance = null;
5
5
  function getRunnerInstance() {
@@ -1,2 +1,2 @@
1
- import { n as initRunner, t as getRunnerInstance } from "./runner-BCs5rzej.mjs";
1
+ import { n as initRunner, t as getRunnerInstance } from "./runner-BTH8m_Er.mjs";
2
2
  export { getRunnerInstance, initRunner };
@@ -0,0 +1,3 @@
1
+ import "./runOrchestration-C4o5TcIu.mjs";
2
+ import "./cli-CwGcJYWe.mjs";
3
+ export {};
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@ls-stack/agent-eval",
3
- "version": "0.36.0",
3
+ "version": "0.37.0",
4
4
  "type": "module",
5
5
  "bin": {
6
6
  "agent-evals": "./dist/bin.mjs"
@@ -32,8 +32,8 @@
32
32
  "@types/node": "^24.7.2",
33
33
  "typescript": "^5.9.2",
34
34
  "@agent-evals/runner": "0.0.1",
35
- "@agent-evals/shared": "0.0.1",
36
- "@agent-evals/sdk": "0.0.1"
35
+ "@agent-evals/sdk": "0.0.1",
36
+ "@agent-evals/shared": "0.0.1"
37
37
  },
38
38
  "scripts": {
39
39
  "build": "pnpm --filter @agent-evals/web build && pnpm --filter @agent-evals/shared build && pnpm --filter @agent-evals/sdk build && pnpm --filter @agent-evals/runner build && tsdown --filter cli-js && tsdown --filter cli-types",
@@ -27,7 +27,13 @@ display rules), read the TypeScript declarations shipped with the package:
27
27
  - Unfiltered `agent-evals run` is disabled by default; use `--eval` or `--case`
28
28
  for targeted CLI runs. Set `allowCliRunAll: true` in
29
29
  `agent-evals.config.ts` to opt into run-all CLI behavior. The web UI can
30
- still run grouped evals and confirms before starting more than five.
30
+ still run grouped evals and confirms before starting more than five. On a
31
+ single eval page, the Run chevron can open a picker to run specific authored
32
+ case ids; those case-picked runs are temporary by default and can be made
33
+ durable in the modal.
34
+ - `agent-evals run --temporary` persists a run like normal history, but deletes
35
+ it before the next run starts. Temporary runs appear in `show-runs` and the UI
36
+ while present; normal runs are never deleted by temporary-run cleanup.
31
37
  - `agent-evals app` watches `agent-evals.config.ts` and reloads config in
32
38
  place when the runner is idle. If config changes during an active run, the UI
33
39
  shows a pending reload banner and blocks new runs until the current run
@@ -507,6 +513,8 @@ Run output lives under `.agent-evals/runs/<run-id>/` and cache entries under
507
513
  `.agent-evals/cache/<eval-id>.json`. Files in a run directory include run
508
514
  metadata, a run summary, per-case results, and per-case trace JSON. Inspect
509
515
  these when debugging persisted output, costs, columns, traces, or failures.
516
+ Temporary runs use the same directory layout, but are removed before the next
517
+ run of any kind starts.
510
518
 
511
519
  Use `agent-evals show-runs` when you need stable file
512
520
  paths before reading saved output: