@ls-stack/agent-eval 0.23.0 → 0.25.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,2 +1,2 @@
1
- import { n as initRunner, t as getRunnerInstance } from "./runner-4yNYRvmF.mjs";
1
+ import { n as initRunner, t as getRunnerInstance } from "./runner-cj1TkR-H.mjs";
2
2
  export { getRunnerInstance, initRunner };
@@ -1,5 +1,5 @@
1
- import { n as createRunner } from "./cli-D3QNOcPN.mjs";
2
- import "./src-CcXfWT4M.mjs";
1
+ import { n as createRunner } from "./cli-ETfZ15RB.mjs";
2
+ import "./src-CyNb2ycA.mjs";
3
3
  //#region ../../apps/server/src/runner.ts
4
4
  let runnerInstance = null;
5
5
  function getRunnerInstance() {
@@ -0,0 +1,3 @@
1
+ import "./runOrchestration-B31SV_Bq.mjs";
2
+ import "./cli-ETfZ15RB.mjs";
3
+ export {};
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@ls-stack/agent-eval",
3
- "version": "0.23.0",
3
+ "version": "0.25.0",
4
4
  "type": "module",
5
5
  "bin": {
6
6
  "agent-evals": "./dist/bin.mjs"
@@ -274,10 +274,12 @@ See `EvalScoreDef` / `EvalManualScoreDef` in the types for the full shape
274
274
  attribute paths. `latencyMs` is time to first token; duration, total tokens,
275
275
  tokens/sec, and USD costs are derived. Override `kinds` to broaden the filter,
276
276
  override `attributes.<field>` for non-default primitive span shapes, configure
277
- `pricing` to derive USD costs from token counts by model/provider, and add
278
- entries to `metrics` to surface arbitrary user metrics (`format: 'string' |
279
- 'number' | 'duration' | 'json' | 'boolean'`, `placements: ['header' |
280
- 'body']`).
277
+ `pricing` to derive USD costs from token counts by model/provider, add
278
+ `derivedAttributes` to persist computed values back onto matching LLM spans
279
+ before trace consumers run, and add entries to `metrics` to surface arbitrary
280
+ user metrics (`format: 'string' | 'number' | 'duration' | 'json' |
281
+ 'boolean'`, `placements: ['header' | 'body']`). `derivedAttributes` keys are
282
+ dot-paths under `span.attributes`; return `undefined` to skip that span.
281
283
  - Default usage config derives missing eval outputs from matching LLM/API spans
282
284
  before `outputsSchema` and scores run: `apiCalls`, `costUsd`, `llmTurns`,
283
285
  `inputTokens`, `outputTokens`, `totalTokens`, `cachedInputTokens`,
@@ -298,7 +300,8 @@ cacheCreationInputTokens` so cache details are not double-counted.
298
300
  and `'fetch'` spans with `method`, `url`, `statusCode`, `request`,
299
301
  `response`, `requestBody`, `responseBody`, `headers`, `durationMs`, and
300
302
  `error` read from conventional attribute paths. Override `kinds` or
301
- `attributes.<field>` for external tracers, and add `metrics` with the same
303
+ `attributes.<field>` for external tracers, add `derivedAttributes` for
304
+ computed persisted API span attributes, and add `metrics` with the same
302
305
  formats and placements as LLM-call metrics.
303
306
  - `runLogs` (in `agent-evals.config.ts`) controls case log capture. Use
304
307
  `runLogs: { captureConsole: false }` to keep console output in the terminal
@@ -371,6 +374,10 @@ Mental model:
371
374
  across operations/evals, but the source-file fingerprint still participates
372
375
  in the final key. Shared namespaces are reusable across evals in the same
373
376
  file; evals in different files miss even with the same namespace and key.
377
+ - Authored eval ids only need to be unique within one eval file. The exact eval identity is
378
+ the workspace-relative file path plus eval id, so the same id can be reused in
379
+ different files. Case ids must be unique within one eval; duplicate case ids
380
+ are reported as run errors.
374
381
  - Cache keys should be deterministic primitives, arrays, and plain objects.
375
382
  `Buffer`, `ArrayBuffer`, and typed arrays hash by bytes. Native `Blob`/`File`
376
383
  keys use stable metadata by default (`type`, `size`, plus
@@ -459,7 +466,8 @@ When adding or changing evals:
459
466
  `evalTracer.cache(...)`; never cache operations whose external side effects
460
467
  you depend on.
461
468
  7. Sanity-check after changes: `agent-evals list`, then
462
- `agent-evals run --eval <id>`.
469
+ `agent-evals run --eval <id>`; use `--file <path|glob>` to target one file
470
+ when multiple files use the same eval id.
463
471
  8. Locate saved artifacts with `agent-evals show-runs latest --json`, then read
464
472
  the relevant `summary.json`, `cases.jsonl`, `case-details/<case-id>.json`,
465
473
  or `traces/<case-id>.json` file directly.