@ls-stack/agent-eval 0.15.0 → 0.16.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{app-B7FUWsVm.mjs → app-B8e-oWYc.mjs} +3 -3
- package/dist/apps/web/dist/assets/index-BZ1TdyEg.js +117 -0
- package/dist/apps/web/dist/assets/{index-BVnLr79e.css → index-MARPw1bH.css} +1 -1
- package/dist/apps/web/dist/index.html +2 -2
- package/dist/bin.mjs +1 -1
- package/dist/{cli-B-sCTyz8.mjs → cli-BmrtjQj_.mjs} +150 -4
- package/dist/index.d.mts +14 -0
- package/dist/index.mjs +3 -3
- package/dist/runChild.mjs +1 -1
- package/dist/{runOrchestration-B3fYtpKo.mjs → runOrchestration-BDyNrRQT.mjs} +2 -0
- package/dist/{runner-vunKoSBu.mjs → runner-CsZqhbiA.mjs} +2 -2
- package/dist/{runner-Dt-Ynv6s.mjs → runner-DABFPXkx.mjs} +1 -1
- package/dist/src-CEAJYN_X.mjs +3 -0
- package/package.json +3 -3
- package/skills/agent-eval/SKILL.md +53 -45
- package/dist/apps/web/dist/assets/index-Cx2CvM6a.js +0 -117
- package/dist/src-jaOlXwb5.mjs +0 -3
package/dist/index.mjs
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import { $ as caseRowSchema, $t as appendToEvalOutput, A as getEvalTitle, At as traceDisplayConfigSchema, B as apiCallMetricFormatSchema, Bt as fileRefSchema, C as createRunRequestSchema, Ct as serializedCacheSpanSchema, D as extractApiCalls, Dt as traceAttributeDisplayInputSchema, E as extractCacheHits, Et as traceAttributeDisplayFormatSchema, F as runManifestSchema, Ft as traceSpanWarningSchema, G as llmCallMetricPlacementSchema, Gt as z, H as apiCallMetricSchema, Ht as numberDisplayOptionsSchema, I as runSummarySchema, It as cellValueSchema, J as resolveApiCallsConfig, Jt as evalSpan, K as llmCallMetricSchema, Kt as buildTraceTree, L as DEFAULT_API_CALLS_CONFIG, Lt as columnDefSchema, M as deriveScopedSummaryFromCases, Mt as traceSpanErrorSchema, N as deriveStatusFromCaseRows, Nt as traceSpanKindSchema, O as extractLlmCalls, Ot as traceAttributeDisplayPlacementSchema, P as deriveStatusFromChildStatuses, Pt as traceSpanSchema, Q as caseDetailSchema, Qt as EvalAssertionError, R as DEFAULT_LLM_CALLS_CONFIG, Rt as columnFormatSchema, St as cacheStatusSchema, T as sseEnvelopeSchema, Tt as traceCacheRefSchema, U as apiCallsConfigSchema, Ut as repoFileRefSchema, V as apiCallMetricPlacementSchema, Vt as jsonCellSchema, W as llmCallMetricFormatSchema, Wt as runArtifactRefSchema, X as trialSelectionModeSchema, Xt as hashCacheKey, Y as resolveLlmCallsConfig, Yt as evalTracer, Z as assertionFailureSchema, Zt as hashCacheKeySync, _t as cacheListItemSchema, an as mergeEvalOutput, at as scoreTraceSchema, bt as cacheRecordingOpSchema, cn as runInEvalScope, ct as evalChartBuiltinMetricSchema, dn as setScopeCacheContext, dt as evalChartMetricSchema, en as evalAssert, et as evalFreshnessStatusSchema, fn as startEvalBackgroundJob, ft as evalChartTooltipExtraSchema, gt as cacheFileSchema, hn as getEvalRegistry, ht as cacheEntrySchema, in as isInEvalScope, it as evalSummarySchema, j as getEvalDisplayStatus, jt as traceDisplayInputConfigSchema, k as getNestedAttribute, kt as 
traceAttributeDisplaySchema, ln as runInExistingEvalScope, lt as evalChartColorSchema, mn as defineEval, mt as evalChartsConfigSchema, nn as getEvalCaseInput, nt as evalStatItemSchema, on as nextEvalId, ot as evalChartAggregateSchema, pn as repoFile, pt as evalChartTypeSchema, q as llmCallsConfigSchema, qt as captureEvalSpanError, rn as incrementEvalOutput, rt as evalStatsConfigSchema, sn as runInEvalRuntimeScope, st as evalChartAxisSchema, tn as getCurrentScope, tt as evalStatAggregateSchema, un as setEvalOutput, ut as evalChartConfigSchema, vt as cacheModeSchema, w as updateManualScoreRequestSchema, wt as spanCacheOptionsSchema, xt as cacheRecordingSchema, yt as cacheOperationTypeSchema, z as agentEvalsConfigSchema, zt as columnKindSchema } from "./runOrchestration-B3fYtpKo.mjs";
|
|
2
|
-
import { n as createRunner, t as runCli } from "./cli-B-sCTyz8.mjs";
|
|
3
|
-
import "./src-jaOlXwb5.mjs";
|
|
1
|
+
import { $ as caseRowSchema, $t as appendToEvalOutput, A as getEvalTitle, At as traceDisplayConfigSchema, B as apiCallMetricFormatSchema, Bt as fileRefSchema, C as createRunRequestSchema, Ct as serializedCacheSpanSchema, D as extractApiCalls, Dt as traceAttributeDisplayInputSchema, E as extractCacheHits, Et as traceAttributeDisplayFormatSchema, F as runManifestSchema, Ft as traceSpanWarningSchema, G as llmCallMetricPlacementSchema, Gt as z, H as apiCallMetricSchema, Ht as numberDisplayOptionsSchema, I as runSummarySchema, It as cellValueSchema, J as resolveApiCallsConfig, Jt as evalSpan, K as llmCallMetricSchema, Kt as buildTraceTree, L as DEFAULT_API_CALLS_CONFIG, Lt as columnDefSchema, M as deriveScopedSummaryFromCases, Mt as traceSpanErrorSchema, N as deriveStatusFromCaseRows, Nt as traceSpanKindSchema, O as extractLlmCalls, Ot as traceAttributeDisplayPlacementSchema, P as deriveStatusFromChildStatuses, Pt as traceSpanSchema, Q as caseDetailSchema, Qt as EvalAssertionError, R as DEFAULT_LLM_CALLS_CONFIG, Rt as columnFormatSchema, St as cacheStatusSchema, T as sseEnvelopeSchema, Tt as traceCacheRefSchema, U as apiCallsConfigSchema, Ut as repoFileRefSchema, V as apiCallMetricPlacementSchema, Vt as jsonCellSchema, W as llmCallMetricFormatSchema, Wt as runArtifactRefSchema, X as trialSelectionModeSchema, Xt as hashCacheKey, Y as resolveLlmCallsConfig, Yt as evalTracer, Z as assertionFailureSchema, Zt as hashCacheKeySync, _t as cacheListItemSchema, an as mergeEvalOutput, at as scoreTraceSchema, bt as cacheRecordingOpSchema, cn as runInEvalScope, ct as evalChartBuiltinMetricSchema, dn as setScopeCacheContext, dt as evalChartMetricSchema, en as evalAssert, et as evalFreshnessStatusSchema, fn as startEvalBackgroundJob, ft as evalChartTooltipExtraSchema, gt as cacheFileSchema, hn as getEvalRegistry, ht as cacheEntrySchema, in as isInEvalScope, it as evalSummarySchema, j as getEvalDisplayStatus, jt as traceDisplayInputConfigSchema, k as getNestedAttribute, kt as 
traceAttributeDisplaySchema, ln as runInExistingEvalScope, lt as evalChartColorSchema, mn as defineEval, mt as evalChartsConfigSchema, nn as getEvalCaseInput, nt as evalStatItemSchema, on as nextEvalId, ot as evalChartAggregateSchema, pn as repoFile, pt as evalChartTypeSchema, q as llmCallsConfigSchema, qt as captureEvalSpanError, rn as incrementEvalOutput, rt as evalStatsConfigSchema, sn as runInEvalRuntimeScope, st as evalChartAxisSchema, tn as getCurrentScope, tt as evalStatAggregateSchema, un as setEvalOutput, ut as evalChartConfigSchema, vt as cacheModeSchema, w as updateManualScoreRequestSchema, wt as spanCacheOptionsSchema, xt as cacheRecordingSchema, yt as cacheOperationTypeSchema, z as agentEvalsConfigSchema, zt as columnKindSchema } from "./runOrchestration-BDyNrRQT.mjs";
|
|
2
|
+
import { n as createRunner, t as runCli } from "./cli-BmrtjQj_.mjs";
|
|
3
|
+
import "./src-CEAJYN_X.mjs";
|
|
4
4
|
export { DEFAULT_API_CALLS_CONFIG, DEFAULT_LLM_CALLS_CONFIG, EvalAssertionError, agentEvalsConfigSchema, apiCallMetricFormatSchema, apiCallMetricPlacementSchema, apiCallMetricSchema, apiCallsConfigSchema, appendToEvalOutput, assertionFailureSchema, buildTraceTree, cacheEntrySchema, cacheFileSchema, cacheListItemSchema, cacheModeSchema, cacheOperationTypeSchema, cacheRecordingOpSchema, cacheRecordingSchema, cacheStatusSchema, captureEvalSpanError, caseDetailSchema, caseRowSchema, cellValueSchema, columnDefSchema, columnFormatSchema, columnKindSchema, createRunRequestSchema, createRunner, defineEval, deriveScopedSummaryFromCases, deriveStatusFromCaseRows, deriveStatusFromChildStatuses, evalAssert, evalChartAggregateSchema, evalChartAxisSchema, evalChartBuiltinMetricSchema, evalChartColorSchema, evalChartConfigSchema, evalChartMetricSchema, evalChartTooltipExtraSchema, evalChartTypeSchema, evalChartsConfigSchema, evalFreshnessStatusSchema, evalSpan, evalStatAggregateSchema, evalStatItemSchema, evalStatsConfigSchema, evalSummarySchema, evalTracer, extractApiCalls, extractCacheHits, extractLlmCalls, fileRefSchema, getCurrentScope, getEvalCaseInput, getEvalDisplayStatus, getEvalRegistry, getEvalTitle, getNestedAttribute, hashCacheKey, hashCacheKeySync, incrementEvalOutput, isInEvalScope, jsonCellSchema, llmCallMetricFormatSchema, llmCallMetricPlacementSchema, llmCallMetricSchema, llmCallsConfigSchema, mergeEvalOutput, nextEvalId, numberDisplayOptionsSchema, repoFile, repoFileRefSchema, resolveApiCallsConfig, resolveLlmCallsConfig, runArtifactRefSchema, runCli, runInEvalRuntimeScope, runInEvalScope, runInExistingEvalScope, runManifestSchema, runSummarySchema, scoreTraceSchema, serializedCacheSpanSchema, setEvalOutput, setScopeCacheContext, spanCacheOptionsSchema, sseEnvelopeSchema, startEvalBackgroundJob, traceAttributeDisplayFormatSchema, traceAttributeDisplayInputSchema, traceAttributeDisplayPlacementSchema, traceAttributeDisplaySchema, traceCacheRefSchema, 
traceDisplayConfigSchema, traceDisplayInputConfigSchema, traceSpanErrorSchema, traceSpanKindSchema, traceSpanSchema, traceSpanWarningSchema, trialSelectionModeSchema, updateManualScoreRequestSchema, z };
|
package/dist/runChild.mjs
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import { C as createRunRequestSchema, F as runManifestSchema, I as runSummarySchema, Lt as columnDefSchema, S as createFsCacheStore, mt as evalChartsConfigSchema, rt as evalStatsConfigSchema, t as executeRun, v as parseEvalMetas, y as loadConfig } from "./runOrchestration-B3fYtpKo.mjs";
|
|
1
|
+
import { C as createRunRequestSchema, F as runManifestSchema, I as runSummarySchema, Lt as columnDefSchema, S as createFsCacheStore, mt as evalChartsConfigSchema, rt as evalStatsConfigSchema, t as executeRun, v as parseEvalMetas, y as loadConfig } from "./runOrchestration-BDyNrRQT.mjs";
|
|
2
2
|
import { createHash } from "node:crypto";
|
|
3
3
|
import { readFile } from "node:fs/promises";
|
|
4
4
|
import { relative } from "node:path";
|
|
@@ -3913,6 +3913,7 @@ const agentEvalsConfigSchema = z.object({
|
|
|
3913
3913
|
trialSelection: trialSelectionModeSchema.optional(),
|
|
3914
3914
|
concurrency: z.number().optional(),
|
|
3915
3915
|
staleAfterDays: z.number().optional(),
|
|
3916
|
+
allowCliRunAll: z.boolean().optional(),
|
|
3916
3917
|
traceDisplay: traceDisplayInputConfigSchema.optional(),
|
|
3917
3918
|
llmCalls: llmCallsConfigSchema.optional(),
|
|
3918
3919
|
apiCalls: apiCallsConfigSchema.optional(),
|
|
@@ -4881,6 +4882,7 @@ const defaultConfig = {
|
|
|
4881
4882
|
trialSelection: "lowestScore",
|
|
4882
4883
|
concurrency: 2,
|
|
4883
4884
|
staleAfterDays: 14,
|
|
4885
|
+
allowCliRunAll: false,
|
|
4884
4886
|
traceDisplay: { attributes: [{
|
|
4885
4887
|
path: "input",
|
|
4886
4888
|
label: "Input",
|
|
@@ -1,5 +1,5 @@
|
|
|
1
|
-
import { n as createRunner } from "./cli-B-sCTyz8.mjs";
|
|
2
|
-
import "./src-jaOlXwb5.mjs";
|
|
1
|
+
import { n as createRunner } from "./cli-BmrtjQj_.mjs";
|
|
2
|
+
import "./src-CEAJYN_X.mjs";
|
|
3
3
|
//#region ../../apps/server/src/runner.ts
|
|
4
4
|
let runnerInstance = null;
|
|
5
5
|
function getRunnerInstance() {
|
|
@@ -1,2 +1,2 @@
|
|
|
1
|
-
import { n as initRunner, t as getRunnerInstance } from "./runner-vunKoSBu.mjs";
|
|
1
|
+
import { n as initRunner, t as getRunnerInstance } from "./runner-CsZqhbiA.mjs";
|
|
2
2
|
export { getRunnerInstance, initRunner };
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@ls-stack/agent-eval",
|
|
3
|
-
"version": "0.15.0",
|
|
3
|
+
"version": "0.16.1",
|
|
4
4
|
"type": "module",
|
|
5
5
|
"bin": {
|
|
6
6
|
"agent-evals": "./dist/bin.mjs"
|
|
@@ -59,8 +59,8 @@
|
|
|
59
59
|
"@types/node": "^24.7.2",
|
|
60
60
|
"typescript": "^5.9.2",
|
|
61
61
|
"@agent-evals/runner": "0.0.1",
|
|
62
|
-
"@agent-evals/
|
|
63
|
-
"@agent-evals/
|
|
62
|
+
"@agent-evals/shared": "0.0.1",
|
|
63
|
+
"@agent-evals/sdk": "0.0.1"
|
|
64
64
|
},
|
|
65
65
|
"scripts": {
|
|
66
66
|
"build": "pnpm --filter @agent-evals/web build && tsdown",
|
|
@@ -24,6 +24,10 @@ display rules), read the TypeScript declarations shipped with the package:
|
|
|
24
24
|
Unknown help targets exit non-zero instead of falling back to global help.
|
|
25
25
|
- The CLI automatically loads `.env` from the current workspace. Shell-provided
|
|
26
26
|
environment variables win; pass `--no-env` to disable `.env` loading once.
|
|
27
|
+
- Unfiltered `agent-evals run` is disabled by default; use `--eval` or `--case`
|
|
28
|
+
for targeted CLI runs. Set `allowCliRunAll: true` in
|
|
29
|
+
`agent-evals.config.ts` to opt into run-all CLI behavior. The web UI can
|
|
30
|
+
still run grouped evals and confirms before starting more than five.
|
|
27
31
|
|
|
28
32
|
Assume that enumerated tables in this document may lag behind the types —
|
|
29
33
|
treat the types as source of truth when they disagree.
|
|
@@ -117,21 +121,19 @@ export async function runRefundWorkflow(input: RefundInput) {
|
|
|
117
121
|
}
|
|
118
122
|
```
|
|
119
123
|
|
|
120
|
-
Span `kind` values are open-ended strings
|
|
121
|
-
the UI for every kind used during the app session. Use familiar kinds such as
|
|
124
|
+
Span `kind` values are open-ended strings. Use familiar kinds such as
|
|
122
125
|
`agent`, `tool`, `llm`, `api`, `retrieval`, `scorer`, or `checkpoint` when they
|
|
123
126
|
fit, and preserve external tracer kinds such as `mastra.workflow.step` when they
|
|
124
|
-
are more specific.
|
|
125
|
-
|
|
127
|
+
are more specific. Only the `input` and `output` span attributes are promoted
|
|
128
|
+
automatically; use `traceDisplay` for other span attributes such as `model`,
|
|
126
129
|
`usage`, or `costUsd`.
|
|
127
130
|
|
|
128
131
|
Use `captureEvalSpanError(error)` for recoverable errors on the active
|
|
129
132
|
`evalTracer.span(...)`, such as optional model/tool failures that fall back and
|
|
130
133
|
continue. You can pass one error, multiple error arguments, or an array. The
|
|
131
|
-
span is still marked `error
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
visible in span detail without changing an otherwise successful span's status.
|
|
134
|
+
span is still marked `error`. Pass `'warning'` or `{ level: 'warning' }` as the
|
|
135
|
+
final argument for diagnostics that should not change an otherwise successful
|
|
136
|
+
span's status.
|
|
135
137
|
|
|
136
138
|
If a span callback throws, the SDK automatically marks that span as `error`,
|
|
137
139
|
stores the thrown error on it, and rethrows so the case errors. Use that for
|
|
@@ -208,12 +210,11 @@ or if the case errors. Scores without `passThreshold` are informational.
|
|
|
208
210
|
|
|
209
211
|
Score functions run in their own trace scope, separate from the execution
|
|
210
212
|
trace, so LLM-as-judge scorers can use `evalTracer.span(...)` and cached spans
|
|
211
|
-
without polluting the agent trajectory.
|
|
212
|
-
|
|
213
|
-
stay private to that score.
|
|
213
|
+
without polluting the agent trajectory. Outputs set inside a scorer stay
|
|
214
|
+
private to that score.
|
|
214
215
|
|
|
215
|
-
`manualScores` declares score columns that reviewers fill in
|
|
216
|
-
|
|
216
|
+
`manualScores` declares score columns that reviewers fill in after a run.
|
|
217
|
+
Pending values keep the eval in an `unscored` state instead of failing.
|
|
217
218
|
|
|
218
219
|
See `EvalScoreDef` / `EvalManualScoreDef` in the types for the full shape
|
|
219
220
|
(format, threshold, column overrides).
|
|
@@ -245,22 +246,20 @@ See `EvalScoreDef` / `EvalManualScoreDef` in the types for the full shape
|
|
|
245
246
|
detail pane; it supports aggregation across subtrees (`scope`, `mode`) and
|
|
246
247
|
user-defined `transform(...)` for derived views (e.g. currency conversion).
|
|
247
248
|
See the `TraceDisplayInputConfig` type.
|
|
248
|
-
- `llmCalls` (in `agent-evals.config.ts`) configures
|
|
249
|
-
|
|
250
|
-
`costUsd`, `input`, `output`, etc. read from conventional
|
|
251
|
-
Override `kinds` to broaden the filter, override
|
|
252
|
-
non-default span shapes, and add entries to
|
|
253
|
-
user metrics (`format: 'string' | 'number' |
|
|
254
|
-
'boolean'`, `placements: ['header' | 'body']`).
|
|
255
|
-
|
|
256
|
-
|
|
257
|
-
|
|
258
|
-
`
|
|
259
|
-
`
|
|
260
|
-
from conventional attribute paths. Override `kinds` or
|
|
249
|
+
- `llmCalls` (in `agent-evals.config.ts`) configures how LLM-call spans are
|
|
250
|
+
summarized for review. Defaults to `kind: 'llm'` spans with `model`,
|
|
251
|
+
`usage.*`, `costUsd`, `input`, `output`, etc. read from conventional
|
|
252
|
+
attribute paths. Override `kinds` to broaden the filter, override
|
|
253
|
+
`attributes.<field>` for non-default span shapes, and add entries to
|
|
254
|
+
`metrics` to surface arbitrary user metrics (`format: 'string' | 'number' |
|
|
255
|
+
'duration' | 'json' | 'boolean'`, `placements: ['header' | 'body']`).
|
|
256
|
+
- `apiCalls` (in `agent-evals.config.ts`) configures how API-call spans are
|
|
257
|
+
summarized for review. Defaults to `kind: 'api'`, `'http'`, `'http.client'`,
|
|
258
|
+
and `'fetch'` spans with `method`, `url`, `statusCode`, `request`,
|
|
259
|
+
`response`, `requestBody`, `responseBody`, `headers`, `durationMs`, and
|
|
260
|
+
`error` read from conventional attribute paths. Override `kinds` or
|
|
261
261
|
`attributes.<field>` for external tracers, and add `metrics` with the same
|
|
262
|
-
formats and placements as LLM-call metrics.
|
|
263
|
-
matching spans exist.
|
|
262
|
+
formats and placements as LLM-call metrics.
|
|
264
263
|
|
|
265
264
|
Stats rows and history charts on the eval card are opt-in via `stats` /
|
|
266
265
|
`charts` on the eval definition. Their shapes live in the types; no need to
|
|
@@ -313,8 +312,7 @@ Mental model:
|
|
|
313
312
|
span, that span gets a `cache.refs` entry with the value cache name, key,
|
|
314
313
|
namespace, and hit/miss status. When called directly from the case body
|
|
315
314
|
(no surrounding span), the ref is recorded on the case detail's `cacheRefs`
|
|
316
|
-
array
|
|
317
|
-
each hit can be expanded for inspection or deleted by namespace/key.
|
|
315
|
+
array.
|
|
318
316
|
- The cache key folds in a source-file fingerprint, so editing the eval busts
|
|
319
317
|
the cache automatically.
|
|
320
318
|
- `cache.namespace` on spans or `namespace` on value caches can share entries
|
|
@@ -335,19 +333,31 @@ Mental model:
|
|
|
335
333
|
recorded SDK effects preserve richer built-ins such as `Date`, `Map`, `Set`,
|
|
336
334
|
typed arrays, `URL`, `Headers`, `Blob`, and `File` on hits. Cache keys still
|
|
337
335
|
use the deterministic key-hashing rules above.
|
|
338
|
-
- Cache mode per run is controlled by CLI flags (see `agent-evals run --help`)
|
|
339
|
-
and by a chevron menu on each eval card in the UI.
|
|
340
|
-
- The UI Stop action cancels the whole active run by terminating that run's
|
|
341
|
-
isolated execution process.
|
|
336
|
+
- Cache mode per run is controlled by CLI flags (see `agent-evals run --help`).
|
|
342
337
|
|
|
343
338
|
## Artifacts
|
|
344
339
|
|
|
345
340
|
Run output lives under `.agent-evals/runs/<run-id>/` and cache entries under
|
|
346
341
|
`.agent-evals/cache/<eval-id>.json`. Files in a run directory include run
|
|
347
342
|
metadata, a run summary, per-case results, and per-case trace JSON. Inspect
|
|
348
|
-
these when debugging persisted output, costs, columns, traces, or failures
|
|
349
|
-
|
|
350
|
-
|
|
343
|
+
these when debugging persisted output, costs, columns, traces, or failures.
|
|
344
|
+
|
|
345
|
+
Use `agent-evals show-runs` when you need stable file
|
|
346
|
+
paths before reading saved output:
|
|
347
|
+
|
|
348
|
+
```sh
|
|
349
|
+
agent-evals show-runs
|
|
350
|
+
agent-evals show-runs latest --json
|
|
351
|
+
jq . .agent-evals/runs/<run-id>/summary.json
|
|
352
|
+
jq -s . .agent-evals/runs/<run-id>/cases.jsonl
|
|
353
|
+
jq . .agent-evals/runs/<run-id>/case-details/<case-id>.json
|
|
354
|
+
jq . .agent-evals/runs/<run-id>/traces/<case-id>.json
|
|
355
|
+
```
|
|
356
|
+
|
|
357
|
+
Run ids can be full timestamp ids, short ids such as `r0` from
|
|
358
|
+
`agent-evals show-runs`, or `latest`. `show-runs` is only an artifact index;
|
|
359
|
+
the files themselves remain the source of truth for detailed results and
|
|
360
|
+
traces.
|
|
351
361
|
|
|
352
362
|
## Module mocking
|
|
353
363
|
|
|
@@ -386,15 +396,13 @@ When adding or changing evals:
|
|
|
386
396
|
4. Surface reviewable values through execute-context `setOutput` or ambient
|
|
387
397
|
`setEvalOutput` in shared workflow code, and shape them with `columns`
|
|
388
398
|
formats from the `ColumnFormat` type.
|
|
389
|
-
5. Promote high-signal span attributes with `traceDisplay` so
|
|
390
|
-
|
|
399
|
+
5. Promote high-signal span attributes with `traceDisplay` so they surface in
|
|
400
|
+
the trace tree and detail pane.
|
|
391
401
|
6. Cache costly pure spans with `cache: { key }` and pure spanless values with
|
|
392
402
|
`evalTracer.cache(...)`; never cache operations whose external side effects
|
|
393
403
|
you depend on.
|
|
394
404
|
7. Sanity-check after changes: `agent-evals list`, then
|
|
395
|
-
`agent-evals run --eval <id>`.
|
|
396
|
-
|
|
397
|
-
|
|
398
|
-
|
|
399
|
-
`agent-evals run --inspect-brk --eval <id> --case <case-id>` and attach a
|
|
400
|
-
Node.js debugger before continuing execution.
|
|
405
|
+
`agent-evals run --eval <id>`.
|
|
406
|
+
8. Locate saved artifacts with `agent-evals show-runs latest --json`, then read
|
|
407
|
+
the relevant `summary.json`, `cases.jsonl`, `case-details/<case-id>.json`,
|
|
408
|
+
or `traces/<case-id>.json` file directly.
|