npm - @ls-stack/agent-eval - Versions diffs - 0.16.1 → 0.18.0 - Mend

@ls-stack/agent-eval 0.16.1 → 0.18.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (18) hide show

package/dist/{app-B8e-oWYc.mjs → app-hAlVvT-Q.mjs} +23 -4
package/dist/apps/web/dist/assets/index-BovjyzD8.css +1 -0
package/dist/apps/web/dist/assets/index-BxcwUS7V.js +118 -0
package/dist/apps/web/dist/index.html +2 -2
package/dist/bin.mjs +1 -1
package/dist/{cli-BmrtjQj_.mjs → cli-3zANEAhG.mjs} +64 -13
package/dist/index.d.mts +311 -20
package/dist/index.mjs +4 -4
package/dist/runChild.mjs +2 -1
package/dist/{runOrchestration-BDyNrRQT.mjs → runOrchestration-BBg_VUH5.mjs} +937 -1842
package/dist/{runner-DABFPXkx.mjs → runner-DxlahWDo.mjs} +1 -1
package/dist/{runner-CsZqhbiA.mjs → runner-RmZPRz-h.mjs} +2 -2
package/dist/src-BC4OrajN.mjs +3 -0
package/package.json +3 -3
package/skills/agent-eval/SKILL.md +19 -4
package/dist/apps/web/dist/assets/index-BZ1TdyEg.js +0 -117
package/dist/apps/web/dist/assets/index-MARPw1bH.css +0 -1
package/dist/src-CEAJYN_X.mjs +0 -3

package/dist/{runner-DABFPXkx.mjs → runner-DxlahWDo.mjs} RENAMED Viewed

@@ -1,2 +1,2 @@
-import { n as initRunner, t as getRunnerInstance } from "./runner-CsZqhbiA.mjs";
+import { n as initRunner, t as getRunnerInstance } from "./runner-RmZPRz-h.mjs";
 export { getRunnerInstance, initRunner };

package/dist/{runner-CsZqhbiA.mjs → runner-RmZPRz-h.mjs} RENAMED Viewed

@@ -1,5 +1,5 @@
-import { n as createRunner } from "./cli-BmrtjQj_.mjs";
-import "./src-CEAJYN_X.mjs";
+import { n as createRunner } from "./cli-3zANEAhG.mjs";
+import "./src-BC4OrajN.mjs";
 //#region ../../apps/server/src/runner.ts
 let runnerInstance = null;
 function getRunnerInstance() {

package/dist/src-BC4OrajN.mjs ADDED Viewed

@@ -0,0 +1,3 @@
+import "./runOrchestration-BBg_VUH5.mjs";
+import "./cli-3zANEAhG.mjs";
+export {};

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@ls-stack/agent-eval",
-  "version": "0.16.1",
+  "version": "0.18.0",
   "type": "module",
   "bin": {
     "agent-evals": "./dist/bin.mjs"
@@ -59,8 +59,8 @@
     "@types/node": "^24.7.2",
     "typescript": "^5.9.2",
     "@agent-evals/runner": "0.0.1",
-    "@agent-evals/shared": "0.0.1",
-    "@agent-evals/sdk": "0.0.1"
+    "@agent-evals/sdk": "0.0.1",
+    "@agent-evals/shared": "0.0.1"
   },
   "scripts": {
     "build": "pnpm --filter @agent-evals/web build && tsdown",

package/skills/agent-eval/SKILL.md CHANGED Viewed

@@ -39,10 +39,10 @@ file wires up cases and scoring; the real `evalTracer.span(...)` calls sit
 inside the workflow, agent, or tool functions that both production and evals
 invoke.
-`evalTracer`, `evalSpan`, output helpers, and `evalAssert` are ambient no-ops
-when called outside an eval case scope, so leaving them in production paths is
-safe — they only record anything when the product code runs inside an eval's
-`execute`. Use `isInEvalScope()` to branch on eval-only behavior in shared code
+`evalTracer`, `evalSpan`, output helpers, `evalLog`, and `evalAssert` are
+ambient no-ops when called outside an eval case scope, so leaving them in
+production paths is safe — they only record anything when the product code runs
+inside an eval's `execute`. Use `isInEvalScope()` to branch on eval-only behavior in shared code
 (e.g. skip a real network side effect): it returns `null` outside eval-owned
 work and returns `'env'`, `'cases'`, `'eval'`, `'derive'`, `'outputsSchema'`, or
 `'scorer'` during runner phases. Top-level modules imported while a run is being
@@ -52,6 +52,12 @@ prepared see `'env'`; code called from `execute` sees `'eval'`. Use
 scope it returns `undefined`. Use `nextEvalId()` inside eval-scoped code when a
 stable generated id is needed; it includes the eval file, eval id, case id, and
 a per-case sequence number, and throws outside an eval case scope.
+Use `evalLog(level, ...args)` for intentional per-case logs. The runner also
+captures `console.log`, `console.info`, `console.warn`, and `console.error`
+during case-owned phases by default. Log arguments are stored as JSON-safe
+values and rendered with the JSON viewer; collapsed previews include best-effort
+code locations when stack data is available; previews are capped; and logs
+inside cached operations are not replayed from cache hits.
 ### Product code (instrumented once, reused everywhere)
@@ -260,6 +266,10 @@ See `EvalScoreDef` / `EvalManualScoreDef` in the types for the full shape
   `error` read from conventional attribute paths. Override `kinds` or
   `attributes.<field>` for external tracers, and add `metrics` with the same
   formats and placements as LLM-call metrics.
+- `runLogs` (in `agent-evals.config.ts`) controls case log capture. Use
+  `runLogs: { captureConsole: false }` to keep console output in the terminal
+  without persisting console calls to case details. Manual `evalLog(...)` calls
+  are still captured.
 Stats rows and history charts on the eval card are opt-in via `stats` /
 `charts` on the eval definition. Their shapes live in the types; no need to
@@ -329,6 +339,11 @@ Mental model:
   `.agent-evals/cache/<owner>.json`; each namespace is capped at 100 entries by
   default. Configure `cache.maxEntriesPerNamespace` for the default cap and
   `cache.maxEntriesByNamespace` for exact namespace-specific caps.
+- Authored raw cache keys are stored for debugging under
+  `.agent-evals/cache-debug/<owner>.json`. This folder may include prompts,
+  user inputs, or other sensitive data, should be gitignored, and is not needed
+  for cache reuse. The UI Cache tab shows the raw key when it is available and
+  can be filtered to hits or new entries added by cache misses/refreshes.
 - Cached payloads use advanced serialization/deserialization with the Web API plugin set, so return values and
   recorded SDK effects preserve richer built-ins such as `Date`, `Map`, `Set`,
   typed arrays, `URL`, `Headers`, `Blob`, and `File` on hits. Cache keys still