@ls-stack/agent-eval 0.16.1 → 0.18.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,2 +1,2 @@
1
- import { n as initRunner, t as getRunnerInstance } from "./runner-CsZqhbiA.mjs";
1
+ import { n as initRunner, t as getRunnerInstance } from "./runner-RmZPRz-h.mjs";
2
2
  export { getRunnerInstance, initRunner };
@@ -1,5 +1,5 @@
1
- import { n as createRunner } from "./cli-BmrtjQj_.mjs";
2
- import "./src-CEAJYN_X.mjs";
1
+ import { n as createRunner } from "./cli-3zANEAhG.mjs";
2
+ import "./src-BC4OrajN.mjs";
3
3
  //#region ../../apps/server/src/runner.ts
4
4
  let runnerInstance = null;
5
5
  function getRunnerInstance() {
@@ -0,0 +1,3 @@
1
+ import "./runOrchestration-BBg_VUH5.mjs";
2
+ import "./cli-3zANEAhG.mjs";
3
+ export {};
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@ls-stack/agent-eval",
3
- "version": "0.16.1",
3
+ "version": "0.18.0",
4
4
  "type": "module",
5
5
  "bin": {
6
6
  "agent-evals": "./dist/bin.mjs"
@@ -59,8 +59,8 @@
59
59
  "@types/node": "^24.7.2",
60
60
  "typescript": "^5.9.2",
61
61
  "@agent-evals/runner": "0.0.1",
62
- "@agent-evals/shared": "0.0.1",
63
- "@agent-evals/sdk": "0.0.1"
62
+ "@agent-evals/sdk": "0.0.1",
63
+ "@agent-evals/shared": "0.0.1"
64
64
  },
65
65
  "scripts": {
66
66
  "build": "pnpm --filter @agent-evals/web build && tsdown",
@@ -39,10 +39,10 @@ file wires up cases and scoring; the real `evalTracer.span(...)` calls sit
39
39
  inside the workflow, agent, or tool functions that both production and evals
40
40
  invoke.
41
41
 
42
- `evalTracer`, `evalSpan`, output helpers, and `evalAssert` are ambient no-ops
43
- when called outside an eval case scope, so leaving them in production paths is
44
- safe — they only record anything when the product code runs inside an eval's
45
- `execute`. Use `isInEvalScope()` to branch on eval-only behavior in shared code
42
+ `evalTracer`, `evalSpan`, output helpers, `evalLog`, and `evalAssert` are
43
+ ambient no-ops when called outside an eval case scope, so leaving them in
44
+ production paths is safe — they only record anything when the product code runs
45
+ inside an eval's `execute`. Use `isInEvalScope()` to branch on eval-only behavior in shared code
46
46
  (e.g. skip a real network side effect): it returns `null` outside eval-owned
47
47
  work and returns `'env'`, `'cases'`, `'eval'`, `'derive'`, `'outputsSchema'`, or
48
48
  `'scorer'` during runner phases. Top-level modules imported while a run is being
@@ -52,6 +52,12 @@ prepared see `'env'`; code called from `execute` sees `'eval'`. Use
52
52
  scope it returns `undefined`. Use `nextEvalId()` inside eval-scoped code when a
53
53
  stable generated id is needed; it includes the eval file, eval id, case id, and
54
54
  a per-case sequence number, and throws outside an eval case scope.
55
+ Use `evalLog(level, ...args)` for intentional per-case logs. The runner also
56
+ captures `console.log`, `console.info`, `console.warn`, and `console.error`
57
+ during case-owned phases by default. Log arguments are stored as JSON-safe
58
+ values and rendered with the JSON viewer; collapsed previews include best-effort
59
+ code locations when stack data is available; previews are capped; and logs
60
+ inside cached operations are not replayed from cache hits.
55
61
 
56
62
  ### Product code (instrumented once, reused everywhere)
57
63
 
@@ -260,6 +266,10 @@ See `EvalScoreDef` / `EvalManualScoreDef` in the types for the full shape
260
266
  `error` read from conventional attribute paths. Override `kinds` or
261
267
  `attributes.<field>` for external tracers, and add `metrics` with the same
262
268
  formats and placements as LLM-call metrics.
269
+ - `runLogs` (in `agent-evals.config.ts`) controls case log capture. Use
270
+ `runLogs: { captureConsole: false }` to keep console output in the terminal
271
+ without persisting console calls to case details. Manual `evalLog(...)` calls
272
+ are still captured.
263
273
 
264
274
  Stats rows and history charts on the eval card are opt-in via `stats` /
265
275
  `charts` on the eval definition. Their shapes live in the types; no need to
@@ -329,6 +339,11 @@ Mental model:
329
339
  `.agent-evals/cache/<owner>.json`; each namespace is capped at 100 entries by
330
340
  default. Configure `cache.maxEntriesPerNamespace` for the default cap and
331
341
  `cache.maxEntriesByNamespace` for exact namespace-specific caps.
342
+ - Authored raw cache keys are stored for debugging under
343
+ `.agent-evals/cache-debug/<owner>.json`. This folder may include prompts,
344
+ user inputs, or other sensitive data, should be gitignored, and is not needed
345
+ for cache reuse. The UI Cache tab shows the raw key when it is available and
346
+ can be filtered to hits or new entries added by cache misses/refreshes.
332
347
  - Cached payloads use advanced serialization/deserialization with the Web API plugin set, so return values and
333
348
  recorded SDK effects preserve richer built-ins such as `Date`, `Map`, `Set`,
334
349
  typed arrays, `URL`, `Headers`, `Blob`, and `File` on hits. Cache keys still