npm - @ls-stack/agent-eval - Versions diffs - 0.40.0 → 0.42.0 - Mend

@ls-stack/agent-eval 0.40.0 → 0.42.0

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.

Files changed (18)

package/dist/{app-0k8Y1OBk.mjs → app-1vE5Ryry.mjs} +5 -5
package/dist/apps/web/dist/assets/index-5CB9eJZy.js +140 -0
package/dist/apps/web/dist/assets/index-eFM9VIsz.css +1 -0
package/dist/apps/web/dist/index.html +2 -2
package/dist/bin.mjs +1 -1
package/dist/{cli-Sg8UdOnm.mjs → cli-Bk5g-bat.mjs} +43 -11
package/dist/index.d.mts +3035 -3369
package/dist/index.mjs +4 -4
package/dist/runChild.mjs +5 -3
package/dist/{runOrchestration-CTzVNrDP.mjs → runOrchestration-DhTiT4V0.mjs} +4635 -4262
package/dist/{runner-njK_CtXC.mjs → runner-B1Cyevvr.mjs} +1 -1
package/dist/{runner-Bb9JdFkg.mjs → runner-BG0L4yId.mjs} +2 -2
package/dist/src-t6OVp1li.mjs +13 -0
package/package.json +3 -3
package/skills/agent-eval/SKILL.md +24 -1
package/dist/apps/web/dist/assets/index-C4v6dWcv.js +0 -140
package/dist/apps/web/dist/assets/index-C58_zLA9.css +0 -1
package/dist/src-BZzPFS8r.mjs +0 -3

package/dist/{runner-njK_CtXC.mjs → runner-B1Cyevvr.mjs} RENAMED Viewed

@@ -1,2 +1,2 @@
-import { n as initRunner, t as getRunnerInstance } from "./runner-Bb9JdFkg.mjs";
+import { n as initRunner, t as getRunnerInstance } from "./runner-BG0L4yId.mjs";
 export { getRunnerInstance, initRunner };

package/dist/{runner-Bb9JdFkg.mjs → runner-BG0L4yId.mjs} RENAMED Viewed

@@ -1,5 +1,5 @@
-import { n as createRunner } from "./cli-Sg8UdOnm.mjs";
-import "./src-BZzPFS8r.mjs";
+import { n as createRunner } from "./cli-Bk5g-bat.mjs";
+import "./src-t6OVp1li.mjs";
 //#region ../../apps/server/src/runner.ts
 let runnerInstance = null;
 function getRunnerInstance() {

package/dist/src-t6OVp1li.mjs ADDED Viewed

@@ -0,0 +1,13 @@
+import { Lt as defineEval$1, rt as matchesEvalTags$1 } from "./runOrchestration-DhTiT4V0.mjs";
+import "./cli-Bk5g-bat.mjs";
+//#region src/index.ts
+/** Register an eval definition with typed tag support. */
+function defineEval(definition) {
+	defineEval$1(definition);
+}
+/** Return whether the active eval case has tags matching the typed input. */
+function matchesEvalTags(input) {
+	return matchesEvalTags$1(input);
+}
+//#endregion
+export { matchesEvalTags as n, defineEval as t };

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@ls-stack/agent-eval",
-  "version": "0.40.0",
+  "version": "0.42.0",
   "type": "module",
   "bin": {
     "agent-evals": "./dist/bin.mjs"
@@ -33,8 +33,8 @@
     "@types/node": "^24.7.2",
     "typescript": "^5.9.2",
     "@agent-evals/runner": "0.0.1",
-    "@agent-evals/sdk": "0.0.1",
-    "@agent-evals/shared": "0.0.1"
+    "@agent-evals/shared": "0.0.1",
+    "@agent-evals/sdk": "0.0.1"
   },
   "scripts": {
     "build": "pnpm --filter @agent-evals/web build && pnpm --filter @agent-evals/shared build && pnpm --filter @agent-evals/sdk build && pnpm --filter @agent-evals/runner build && tsdown --filter cli-js && tsdown --filter cli-types",

package/skills/agent-eval/SKILL.md CHANGED Viewed

@@ -25,7 +25,8 @@ display rules), read the TypeScript declarations shipped with the package:
 - The CLI automatically loads `.env` from the current workspace. Shell-provided
   environment variables win; pass `--no-env` to disable `.env` loading once.
 - Unfiltered `agent-evals run` is disabled by default; use `--eval` or `--case`
-  for targeted CLI runs. Set `allowCliRunAll: true` in
+  for targeted CLI runs, or `--tags-filter <expr>` to run cases matching tags.
+  Set `allowCliRunAll: true` in
   `agent-evals.config.ts` to opt into run-all CLI behavior. The web UI can
   still run grouped evals and confirms before starting more than five. On a
   single eval page, the Run chevron can open a picker to run specific authored
@@ -69,6 +70,25 @@ during case-owned phases by default; log arguments are stored as JSON-safe
 values and rendered with the JSON viewer, collapsed previews include best-effort
 code locations when stack data is available, previews are capped, and logs
 inside cached operations are not replayed from cache hits.
+Use eval tags to target related coverage without naming every case:
+`AgentEvalsConfig.tags` applies workspace-wide tags, `defineEval({ tags })`
+adds eval tags, `case.tags` adds case-only tags, and `removeTags` disables a
+configured global tag for one eval. CLI filters support Vitest-style tag
+expressions such as `agent-evals run --tags-filter "refunds && !slow"`.
+Inside eval-scoped code, use `matchesEvalTags('tag')` or
+`matchesEvalTags({ all, any, not })`; it uses typed exact tag names and returns
+`false` outside a case scope. Projects can narrow tag names with a `.d.ts`
+module augmentation:
+```ts
+import '@ls-stack/agent-eval';
+declare module '@ls-stack/agent-eval' {
+  interface AgentEvalTagRegistry {
+    tags: 'refunds' | 'media' | 'manual' | 'slow';
+  }
+}
+```
 ### Product code (instrumented once, reused everywhere)
@@ -333,6 +353,9 @@ See `EvalScoreDef` / `EvalManualScoreDef` in the types for the full shape
   column from the runs table when every rendered row is missing the value,
   `null`, or an empty string; `0` and `false` still count as values, and the
   value remains available in case details and raw output data.
+  In the case detail Output tab, string outputs that look like Markdown render
+  as Markdown even without `format: 'markdown'`, with a Preview/Raw toggle for
+  inspecting the original text.
 - `deriveFromTracing` can be authored globally in `agent-evals.config.ts` or
   locally on one eval. Prefer the keyed map form for shared metrics:
   `deriveFromTracing: { toolCalls: ({ trace }) => trace.findSpansByKind('tool').length }`.