@agent-native/core 0.52.0 → 0.53.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (212) hide show
  1. package/README.md +41 -95
  2. package/blueprints/action/crud.md +98 -0
  3. package/blueprints/channel/discord.md +74 -0
  4. package/blueprints/provider/stripe.md +87 -0
  5. package/blueprints/sandbox/docker.md +78 -0
  6. package/dist/action.d.ts +24 -0
  7. package/dist/action.d.ts.map +1 -1
  8. package/dist/action.js +4 -0
  9. package/dist/action.js.map +1 -1
  10. package/dist/agent/observational-memory/compactor.d.ts +43 -0
  11. package/dist/agent/observational-memory/compactor.d.ts.map +1 -0
  12. package/dist/agent/observational-memory/compactor.js +50 -0
  13. package/dist/agent/observational-memory/compactor.js.map +1 -0
  14. package/dist/agent/observational-memory/config.d.ts +37 -0
  15. package/dist/agent/observational-memory/config.d.ts.map +1 -0
  16. package/dist/agent/observational-memory/config.js +48 -0
  17. package/dist/agent/observational-memory/config.js.map +1 -0
  18. package/dist/agent/observational-memory/index.d.ts +26 -0
  19. package/dist/agent/observational-memory/index.d.ts.map +1 -0
  20. package/dist/agent/observational-memory/index.js +25 -0
  21. package/dist/agent/observational-memory/index.js.map +1 -0
  22. package/dist/agent/observational-memory/internal-run.d.ts +37 -0
  23. package/dist/agent/observational-memory/internal-run.d.ts.map +1 -0
  24. package/dist/agent/observational-memory/internal-run.js +59 -0
  25. package/dist/agent/observational-memory/internal-run.js.map +1 -0
  26. package/dist/agent/observational-memory/message-text.d.ts +13 -0
  27. package/dist/agent/observational-memory/message-text.d.ts.map +1 -0
  28. package/dist/agent/observational-memory/message-text.js +46 -0
  29. package/dist/agent/observational-memory/message-text.js.map +1 -0
  30. package/dist/agent/observational-memory/migrations.d.ts +13 -0
  31. package/dist/agent/observational-memory/migrations.d.ts.map +1 -0
  32. package/dist/agent/observational-memory/migrations.js +43 -0
  33. package/dist/agent/observational-memory/migrations.js.map +1 -0
  34. package/dist/agent/observational-memory/observer.d.ts +37 -0
  35. package/dist/agent/observational-memory/observer.d.ts.map +1 -0
  36. package/dist/agent/observational-memory/observer.js +82 -0
  37. package/dist/agent/observational-memory/observer.js.map +1 -0
  38. package/dist/agent/observational-memory/plugin.d.ts +16 -0
  39. package/dist/agent/observational-memory/plugin.d.ts.map +1 -0
  40. package/dist/agent/observational-memory/plugin.js +26 -0
  41. package/dist/agent/observational-memory/plugin.js.map +1 -0
  42. package/dist/agent/observational-memory/prompts.d.ts +27 -0
  43. package/dist/agent/observational-memory/prompts.d.ts.map +1 -0
  44. package/dist/agent/observational-memory/prompts.js +42 -0
  45. package/dist/agent/observational-memory/prompts.js.map +1 -0
  46. package/dist/agent/observational-memory/read.d.ts +47 -0
  47. package/dist/agent/observational-memory/read.d.ts.map +1 -0
  48. package/dist/agent/observational-memory/read.js +99 -0
  49. package/dist/agent/observational-memory/read.js.map +1 -0
  50. package/dist/agent/observational-memory/reflector.d.ts +31 -0
  51. package/dist/agent/observational-memory/reflector.d.ts.map +1 -0
  52. package/dist/agent/observational-memory/reflector.js +76 -0
  53. package/dist/agent/observational-memory/reflector.js.map +1 -0
  54. package/dist/agent/observational-memory/schema.d.ts +267 -0
  55. package/dist/agent/observational-memory/schema.d.ts.map +1 -0
  56. package/dist/agent/observational-memory/schema.js +48 -0
  57. package/dist/agent/observational-memory/schema.js.map +1 -0
  58. package/dist/agent/observational-memory/store.d.ts +52 -0
  59. package/dist/agent/observational-memory/store.d.ts.map +1 -0
  60. package/dist/agent/observational-memory/store.js +197 -0
  61. package/dist/agent/observational-memory/store.js.map +1 -0
  62. package/dist/agent/observational-memory/types.d.ts +61 -0
  63. package/dist/agent/observational-memory/types.d.ts.map +1 -0
  64. package/dist/agent/observational-memory/types.js +9 -0
  65. package/dist/agent/observational-memory/types.js.map +1 -0
  66. package/dist/agent/production-agent.d.ts +15 -0
  67. package/dist/agent/production-agent.d.ts.map +1 -1
  68. package/dist/agent/production-agent.js +240 -1
  69. package/dist/agent/production-agent.js.map +1 -1
  70. package/dist/agent/run-loop-with-resume.d.ts.map +1 -1
  71. package/dist/agent/run-loop-with-resume.js +49 -0
  72. package/dist/agent/run-loop-with-resume.js.map +1 -1
  73. package/dist/agent/run-store.d.ts +17 -0
  74. package/dist/agent/run-store.d.ts.map +1 -1
  75. package/dist/agent/run-store.js +55 -0
  76. package/dist/agent/run-store.js.map +1 -1
  77. package/dist/agent/runtime-context.d.ts +30 -0
  78. package/dist/agent/runtime-context.d.ts.map +1 -1
  79. package/dist/agent/runtime-context.js +54 -1
  80. package/dist/agent/runtime-context.js.map +1 -1
  81. package/dist/agent/tool-call-journal.d.ts +101 -0
  82. package/dist/agent/tool-call-journal.d.ts.map +1 -0
  83. package/dist/agent/tool-call-journal.js +214 -0
  84. package/dist/agent/tool-call-journal.js.map +1 -0
  85. package/dist/agent/types.d.ts +24 -0
  86. package/dist/agent/types.d.ts.map +1 -1
  87. package/dist/agent/types.js.map +1 -1
  88. package/dist/cli/add.d.ts +109 -0
  89. package/dist/cli/add.d.ts.map +1 -0
  90. package/dist/cli/add.js +352 -0
  91. package/dist/cli/add.js.map +1 -0
  92. package/dist/cli/connect.d.ts +2 -2
  93. package/dist/cli/connect.d.ts.map +1 -1
  94. package/dist/cli/connect.js +92 -24
  95. package/dist/cli/connect.js.map +1 -1
  96. package/dist/cli/eval.d.ts +17 -0
  97. package/dist/cli/eval.d.ts.map +1 -0
  98. package/dist/cli/eval.js +121 -0
  99. package/dist/cli/eval.js.map +1 -0
  100. package/dist/cli/index.js +44 -3
  101. package/dist/cli/index.js.map +1 -1
  102. package/dist/cli/mcp.d.ts.map +1 -1
  103. package/dist/cli/mcp.js +11 -5
  104. package/dist/cli/mcp.js.map +1 -1
  105. package/dist/cli/plan-local.d.ts +66 -5
  106. package/dist/cli/plan-local.d.ts.map +1 -1
  107. package/dist/cli/plan-local.js +495 -19
  108. package/dist/cli/plan-local.js.map +1 -1
  109. package/dist/cli/skills.d.ts +2 -2
  110. package/dist/cli/skills.d.ts.map +1 -1
  111. package/dist/cli/skills.js +70 -59
  112. package/dist/cli/skills.js.map +1 -1
  113. package/dist/client/AssistantChat.d.ts.map +1 -1
  114. package/dist/client/AssistantChat.js +118 -92
  115. package/dist/client/AssistantChat.js.map +1 -1
  116. package/dist/client/agent-chat-adapter.d.ts.map +1 -1
  117. package/dist/client/agent-chat-adapter.js +16 -0
  118. package/dist/client/agent-chat-adapter.js.map +1 -1
  119. package/dist/client/chat/tool-call-display.d.ts +20 -1
  120. package/dist/client/chat/tool-call-display.d.ts.map +1 -1
  121. package/dist/client/chat/tool-call-display.js +32 -7
  122. package/dist/client/chat/tool-call-display.js.map +1 -1
  123. package/dist/client/sse-event-processor.d.ts +13 -0
  124. package/dist/client/sse-event-processor.d.ts.map +1 -1
  125. package/dist/client/sse-event-processor.js +21 -0
  126. package/dist/client/sse-event-processor.js.map +1 -1
  127. package/dist/db/client.d.ts +4 -2
  128. package/dist/db/client.d.ts.map +1 -1
  129. package/dist/db/client.js +6 -4
  130. package/dist/db/client.js.map +1 -1
  131. package/dist/deploy/route-discovery.d.ts.map +1 -1
  132. package/dist/deploy/route-discovery.js +1 -0
  133. package/dist/deploy/route-discovery.js.map +1 -1
  134. package/dist/eval/agent-runner.d.ts +63 -0
  135. package/dist/eval/agent-runner.d.ts.map +1 -0
  136. package/dist/eval/agent-runner.js +142 -0
  137. package/dist/eval/agent-runner.js.map +1 -0
  138. package/dist/eval/define-eval.d.ts +29 -0
  139. package/dist/eval/define-eval.d.ts.map +1 -0
  140. package/dist/eval/define-eval.js +43 -0
  141. package/dist/eval/define-eval.js.map +1 -0
  142. package/dist/eval/index.d.ts +18 -0
  143. package/dist/eval/index.d.ts.map +1 -0
  144. package/dist/eval/index.js +17 -0
  145. package/dist/eval/index.js.map +1 -0
  146. package/dist/eval/report.d.ts +8 -0
  147. package/dist/eval/report.d.ts.map +1 -0
  148. package/dist/eval/report.js +44 -0
  149. package/dist/eval/report.js.map +1 -0
  150. package/dist/eval/runner.d.ts +67 -0
  151. package/dist/eval/runner.d.ts.map +1 -0
  152. package/dist/eval/runner.js +256 -0
  153. package/dist/eval/runner.js.map +1 -0
  154. package/dist/eval/scorer.d.ts +83 -0
  155. package/dist/eval/scorer.d.ts.map +1 -0
  156. package/dist/eval/scorer.js +195 -0
  157. package/dist/eval/scorer.js.map +1 -0
  158. package/dist/eval/types.d.ts +162 -0
  159. package/dist/eval/types.d.ts.map +1 -0
  160. package/dist/eval/types.js +20 -0
  161. package/dist/eval/types.js.map +1 -0
  162. package/dist/observability/traces.d.ts.map +1 -1
  163. package/dist/observability/traces.js +100 -1
  164. package/dist/observability/traces.js.map +1 -1
  165. package/dist/observability/tracing.d.ts +73 -0
  166. package/dist/observability/tracing.d.ts.map +1 -0
  167. package/dist/observability/tracing.js +126 -0
  168. package/dist/observability/tracing.js.map +1 -0
  169. package/dist/onboarding/default-steps.d.ts.map +1 -1
  170. package/dist/onboarding/default-steps.js +4 -1
  171. package/dist/onboarding/default-steps.js.map +1 -1
  172. package/dist/provider-api/actions/query-staged-dataset.d.ts +1 -1
  173. package/dist/scripts/agent-engines/list-agent-engines.d.ts.map +1 -1
  174. package/dist/scripts/agent-engines/list-agent-engines.js +10 -3
  175. package/dist/scripts/agent-engines/list-agent-engines.js.map +1 -1
  176. package/dist/server/action-discovery.d.ts.map +1 -1
  177. package/dist/server/action-discovery.js +4 -0
  178. package/dist/server/action-discovery.js.map +1 -1
  179. package/dist/server/agent-chat-plugin.d.ts +9 -0
  180. package/dist/server/agent-chat-plugin.d.ts.map +1 -1
  181. package/dist/server/agent-chat-plugin.js +118 -110
  182. package/dist/server/agent-chat-plugin.js.map +1 -1
  183. package/dist/server/agent-teams.d.ts +62 -0
  184. package/dist/server/agent-teams.d.ts.map +1 -1
  185. package/dist/server/agent-teams.js +99 -2
  186. package/dist/server/agent-teams.js.map +1 -1
  187. package/dist/server/core-routes-plugin.d.ts.map +1 -1
  188. package/dist/server/core-routes-plugin.js +7 -4
  189. package/dist/server/core-routes-plugin.js.map +1 -1
  190. package/dist/server/credential-provider.d.ts.map +1 -1
  191. package/dist/server/credential-provider.js +2 -0
  192. package/dist/server/credential-provider.js.map +1 -1
  193. package/dist/server/framework-request-handler.d.ts.map +1 -1
  194. package/dist/server/framework-request-handler.js +33 -1
  195. package/dist/server/framework-request-handler.js.map +1 -1
  196. package/dist/server/index.d.ts +1 -0
  197. package/dist/server/index.d.ts.map +1 -1
  198. package/dist/server/index.js +1 -0
  199. package/dist/server/index.js.map +1 -1
  200. package/dist/templates/workspace-core/.agents/skills/external-agents/SKILL.md +10 -0
  201. package/dist/templates/workspace-core/.agents/skills/harness-agents/SKILL.md +20 -0
  202. package/dist/templates/workspace-core/.agents/skills/observability/SKILL.md +20 -0
  203. package/docs/content/agent-teams.md +32 -0
  204. package/docs/content/blueprint-installer.md +73 -0
  205. package/docs/content/evals.md +141 -0
  206. package/docs/content/pr-visual-recap.md +7 -4
  207. package/docs/content/sandbox-adapters.md +134 -0
  208. package/docs/content/template-plan.md +20 -8
  209. package/package.json +5 -1
  210. package/src/templates/workspace-core/.agents/skills/external-agents/SKILL.md +10 -0
  211. package/src/templates/workspace-core/.agents/skills/harness-agents/SKILL.md +20 -0
  212. package/src/templates/workspace-core/.agents/skills/observability/SKILL.md +20 -0
@@ -0,0 +1,256 @@
1
+ /**
2
+ * The evals runner: discover `*.eval.ts` / `evals/*.ts` files, run each eval
3
+ * through its scorer pipeline against the *real* agent loop, score, and report.
4
+ *
5
+ * It is the engine behind `agent-native eval` — when used as a CI deploy gate
6
+ * the CLI exits non-zero if any eval scores below its threshold.
7
+ *
8
+ * Two layers:
9
+ * - `scoreEval` / `runEvals` — pure orchestration over an `AgentRunner` and
10
+ * a list of evals. Fully unit-testable with an injected runner (no model).
11
+ * - `discoverEvalFiles` / `loadEvals` — filesystem discovery + dynamic import
12
+ * of author-written eval modules.
13
+ *
14
+ * Results are also (best-effort) written to the observability eval store so a
15
+ * dashboard can surface CI eval history next to production run evals.
16
+ */
17
+ import nodePath from "node:path";
18
+ import { pathToFileURL } from "node:url";
19
+ import { insertEvalResult } from "../observability/store.js";
20
+ import { DEFAULT_EVAL_THRESHOLD } from "./define-eval.js";
21
+ import { clamp01 } from "./scorer.js";
22
+ import { createAgentRunner } from "./agent-runner.js";
23
+ // ─── Scoring orchestration ────────────────────────────────────────────
24
/**
 * Execute one scorer's stages in order: preprocess → analyze → score → reason.
 *
 * `preprocess`, `analyze` and `generateReason` are optional; a missing stage
 * passes its input straight through (or yields no reason). The raw score is
 * passed through `clamp01` before being compared with `threshold`.
 *
 * @param scorer    Scorer definition (`name`, `generateScore`, optional stages).
 * @param run       Agent run output handed to the first stage.
 * @param runner    Supplies `analyzeContext()` for the analyze stage.
 * @param threshold Minimum score (inclusive) for the scorer to pass.
 * @returns `{ scorer, score, reason, passed }`; never rejects.
 */
async function runScorer(scorer, run, runner, threshold) {
    try {
        let analysis = run;
        if (scorer.preprocess) {
            analysis = await scorer.preprocess(run);
        }
        if (scorer.analyze) {
            analysis = await scorer.analyze(analysis, runner.analyzeContext());
        }
        const score = clamp01(await scorer.generateScore(analysis));
        let reason;
        if (scorer.generateReason) {
            reason = await scorer.generateReason({ run, analysis, score });
        }
        return { scorer: scorer.name, score, reason, passed: score >= threshold };
    }
    catch (err) {
        // A throwing scorer counts as a failed scorer rather than a crashed run,
        // so a single bad scorer cannot take down the whole CI gate.
        return {
            scorer: scorer.name,
            score: 0,
            reason: `Scorer errored: ${err instanceof Error ? err.message : String(err)}`,
            passed: false,
        };
    }
}
53
/**
 * Score a single eval case: obtain a run (through the case's own `run` hook
 * when it defines one, otherwise straight from `runner.runAgent`), then feed
 * that run to every scorer in declaration order.
 *
 * The effective threshold is `opts.thresholdOverride`, else the case's own
 * `threshold`, else `DEFAULT_EVAL_THRESHOLD`.
 *
 * @param evalCase Eval definition (`name`, `input`, `scorers`, optional `run` / `threshold`).
 * @param runner   Agent runner used to produce the run and the analyze context.
 * @param opts     Optional `{ thresholdOverride }`.
 * @returns Result row with per-scorer results, the mean score (0 when there
 *          are no scorers), the run duration and the run error, if any.
 */
export async function scoreEval(evalCase, runner, opts = {}) {
    const threshold = opts.thresholdOverride ?? evalCase.threshold ?? DEFAULT_EVAL_THRESHOLD;
    const run = evalCase.run
        ? await evalCase.run({
            input: evalCase.input,
            runAgent: (input) => runner.runAgent(input),
        })
        : await runner.runAgent(evalCase.input);
    // Scorers run sequentially; the running total feeds the mean below.
    const scores = [];
    let total = 0;
    for (const scorer of evalCase.scorers) {
        const result = await runScorer(scorer, run, runner, threshold);
        scores.push(result);
        total += result.score;
    }
    const avgScore = scores.length > 0 ? total / scores.length : 0;
    return {
        eval: evalCase.name,
        threshold,
        scores,
        // The case fails when the run itself errored or any scorer is sub-threshold.
        passed: run.ok && scores.every((s) => s.passed),
        avgScore,
        durationMs: run.durationMs,
        error: run.ok ? undefined : run.error,
    };
}
84
/**
 * Score every eval in `evals`, one after another, against a single runner and
 * summarise the outcome.
 *
 * When `opts.persist` is truthy each row is also handed to `persistEvalRow`;
 * a failure there is deliberately ignored so reporting never breaks the run.
 *
 * @param evals  Eval definitions to execute, in order.
 * @param runner Agent runner shared by all evals.
 * @param opts   Optional `{ thresholdOverride, persist }`.
 * @returns `{ total, passed, failed, results }`.
 */
export async function runEvals(evals, runner, opts = {}) {
    const results = [];
    let passed = 0;
    for (const evalCase of evals) {
        const row = await scoreEval(evalCase, runner, opts);
        results.push(row);
        if (row.passed) {
            passed += 1;
        }
        if (opts.persist) {
            try {
                await persistEvalRow(row);
            }
            catch {
                // Best-effort write: persistence problems must not fail the suite.
            }
        }
    }
    return {
        total: results.length,
        passed,
        failed: results.length - passed,
        results,
    };
}
101
/**
 * Best-effort write of one eval result to the observability eval store so a
 * dashboard can show CI eval history alongside production run evals. We write
 * one row per (eval × scorer), tagged `evalType: "automated"` with a synthetic
 * `eval:` run id shared by all rows of the same eval.
 *
 * Rejects if the store write fails; the caller decides whether to ignore that.
 *
 * TODO(live-sampling): the same scorer list should also run on a sampled
 * fraction of *real* production runs. That hook belongs in the agent loop's
 * (not-yet-added) post-run processor seam: when a run finishes, roll the
 * configured sample rate and, if it hits, replay the run output through these
 * scorers and write the rows here. Wiring it now would require the in-loop
 * processor seam another wave is adding — so this is the single intended
 * attachment point, intentionally left as a note.
 *
 * @param row Result row for one eval (`eval`, `threshold`, `scores`).
 */
async function persistEvalRow(row) {
    // Import `randomUUID` explicitly instead of relying on the bare `crypto`
    // global: that global is not exposed by default before Node 19, and the
    // resulting ReferenceError would silently disable persistence.
    const { randomUUID } = await import("node:crypto");
    const runId = `eval:${row.eval}:${Date.now()}`;
    for (const s of row.scores) {
        const result = {
            id: randomUUID(),
            runId,
            threadId: null,
            userId: null,
            evalType: "automated",
            criteria: `eval:${row.eval}:${s.scorer}`,
            score: s.score,
            reasoning: s.reason ?? null,
            metadata: {
                source: "cli-eval",
                threshold: row.threshold,
                passed: s.passed,
            },
            createdAt: Date.now(),
        };
        await insertEvalResult(result);
    }
}
137
+ // ─── Discovery + loading ──────────────────────────────────────────────
138
+ const EVAL_FILE_RE = /\.eval\.(ts|js|mjs)$/;
139
+ const SKIP_DIRS = new Set(["node_modules", "dist", ".git", ".output", "build"]);
140
+ /**
141
+ * Walk `root` for eval files. Matches two conventions:
142
+ * - any `**\/*.eval.ts` (co-located with code), and
143
+ * - any `*.ts` directly inside an `evals/` directory.
144
+ * `pattern` further filters by substring of the relative path.
145
+ */
146
+ export async function discoverEvalFiles(root, pattern) {
147
+ const fs = await import("node:fs");
148
+ const out = [];
149
+ function isEvalFile(full, parentName) {
150
+ const base = nodePath.basename(full);
151
+ if (EVAL_FILE_RE.test(base))
152
+ return true;
153
+ if (parentName === "evals" && /\.(ts|js|mjs)$/.test(base)) {
154
+ // Skip obvious support files inside evals/.
155
+ return !/\.(spec|test|d)\.(ts|js|mjs)$/.test(base);
156
+ }
157
+ return false;
158
+ }
159
+ function walk(dir, parentName) {
160
+ let entries;
161
+ try {
162
+ entries = fs.readdirSync(dir, { withFileTypes: true });
163
+ }
164
+ catch {
165
+ return;
166
+ }
167
+ for (const entry of entries) {
168
+ const full = nodePath.join(dir, entry.name);
169
+ if (entry.isDirectory()) {
170
+ if (SKIP_DIRS.has(entry.name) || entry.name.startsWith("."))
171
+ continue;
172
+ walk(full, entry.name);
173
+ }
174
+ else if (entry.isFile() && isEvalFile(full, parentName)) {
175
+ out.push(full);
176
+ }
177
+ }
178
+ }
179
+ walk(root, nodePath.basename(root));
180
+ out.sort();
181
+ if (!pattern)
182
+ return out;
183
+ return out.filter((f) => nodePath.relative(root, f).includes(pattern));
184
+ }
185
/**
 * Pull `Eval` definitions out of a dynamically-imported eval module.
 *
 * The default export is inspected first, then every named export in
 * enumeration order. An export may be a single eval or an array of evals
 * (flattened one level). A value counts as an eval when it is an object with
 * a string `name`, an array `scorers` and a truthy `input`.
 *
 * The same eval object reachable through several exports (for example both
 * `export default myEval` and `export const myEval`) is returned only once,
 * so it is not run and reported twice.
 *
 * @param mod Module namespace object returned by `import()`.
 * @returns Unique eval definitions in first-seen order.
 */
function extractEvals(mod) {
    const candidates = [];
    if (mod.default !== undefined) {
        candidates.push(mod.default);
    }
    for (const [key, value] of Object.entries(mod)) {
        if (key !== "default") {
            candidates.push(value);
        }
    }
    // A Set keeps insertion order and drops repeated references.
    const evals = new Set();
    for (const c of candidates.flat()) {
        if (c &&
            typeof c === "object" &&
            typeof c.name === "string" &&
            Array.isArray(c.scorers) &&
            c.input) {
            evals.add(c);
        }
    }
    return [...evals];
}
207
/**
 * Find every eval file under `root` (optionally narrowed by `pattern`),
 * import each one in discovery order and collect the evals they export.
 *
 * @param root    Directory to search.
 * @param pattern Optional filter forwarded to `discoverEvalFiles`.
 * @returns `{ files, evals }` — the discovered file paths and the evals
 *          extracted from them.
 */
export async function loadEvals(root, pattern) {
    const files = await discoverEvalFiles(root, pattern);
    const evals = [];
    for (const file of files) {
        // Dynamic import needs a file:// URL rather than a bare filesystem path.
        const moduleUrl = pathToFileURL(file).href;
        const mod = await import(moduleUrl);
        for (const found of extractEvals(mod)) {
            evals.push(found);
        }
    }
    return { files, evals };
}
217
/**
 * End-to-end entry point: resolve the evals, build (or accept) a runner,
 * score everything and return the report. The CLI wraps this and maps
 * `report.failed > 0` to a non-zero exit code (the CI gate).
 *
 * @param opts Optional settings:
 *   - `cwd`: search root, defaults to `process.cwd()`;
 *   - `evals`: pre-built evals, skipping filesystem discovery;
 *   - `pattern`: discovery filter;
 *   - `runner`: injected runner, skipping runner creation;
 *   - `actions` / `systemPrompt`: passed to `createAgentRunner`;
 *   - `thresholdOverride`: forwarded to scoring;
 *   - `persist`: whether to store results, defaults to `true`.
 * @returns `{ report, files }`; `files` is empty when `opts.evals` was given.
 */
export async function runEvalSuite(opts = {}) {
    const cwd = opts.cwd ?? process.cwd();
    let files = [];
    let evals = opts.evals;
    if (!evals) {
        ({ files, evals } = await loadEvals(cwd, opts.pattern));
    }
    let runner = opts.runner;
    if (runner === undefined || runner === null) {
        // No runner injected: build one over the app's real action surface.
        const actions = opts.actions ?? (await discoverActions(cwd));
        runner = await createAgentRunner({
            actions,
            systemPrompt: opts.systemPrompt,
        });
    }
    const report = await runEvals(evals, runner, {
        thresholdOverride: opts.thresholdOverride,
        persist: opts.persist ?? true,
    });
    return { report, files };
}
241
/**
 * Discover the app's actions from `<cwd>/actions/` so the agent under test
 * gets the real tool surface. `autoDiscoverActions` is imported lazily to keep
 * server-only dependencies out of any browser bundle that might touch this
 * module's types.
 *
 * @param cwd Project root containing the `actions` directory.
 * @returns Whatever `autoDiscoverActions` resolves to, or `{}` when the import
 *          or the discovery fails for any reason.
 */
async function discoverActions(cwd) {
    try {
        const { autoDiscoverActions } = await import("../server/action-discovery.js");
        // The trailing slash marks the URL as a directory.
        const actionsUrl = pathToFileURL(`${nodePath.join(cwd, "actions")}/`).href;
        return await autoDiscoverActions(actionsUrl);
    }
    catch {
        // Fall back to an empty action set rather than failing the suite.
        return {};
    }
}
256
+ //# sourceMappingURL=runner.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"runner.js","sourceRoot":"","sources":["../../src/eval/runner.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;GAeG;AAEH,OAAO,QAAQ,MAAM,WAAW,CAAC;AACjC,OAAO,EAAE,aAAa,EAAE,MAAM,UAAU,CAAC;AAGzC,OAAO,EAAE,gBAAgB,EAAE,MAAM,2BAA2B,CAAC;AAG7D,OAAO,EAAE,sBAAsB,EAAE,MAAM,kBAAkB,CAAC;AAC1D,OAAO,EAAE,OAAO,EAAE,MAAM,aAAa,CAAC;AAEtC,OAAO,EAAE,iBAAiB,EAAE,MAAM,mBAAmB,CAAC;AAStD,yEAAyE;AAEzE,yEAAyE;AACzE,KAAK,UAAU,SAAS,CACtB,MAA+B,EAC/B,GAAmB,EACnB,MAAmB,EACnB,SAAiB;IAEjB,IAAI,CAAC;QACH,MAAM,GAAG,GAAG,MAAM,CAAC,UAAU,CAAC,CAAC,CAAC,MAAM,MAAM,CAAC,UAAU,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC;QACnE,MAAM,QAAQ,GAAG,MAAM,CAAC,OAAO;YAC7B,CAAC,CAAC,MAAM,MAAM,CAAC,OAAO,CAAC,GAAY,EAAE,MAAM,CAAC,cAAc,EAAE,CAAC;YAC7D,CAAC,CAAC,GAAG,CAAC;QACR,MAAM,QAAQ,GAAG,MAAM,MAAM,CAAC,aAAa,CAAC,QAAiB,CAAC,CAAC;QAC/D,MAAM,KAAK,GAAG,OAAO,CAAC,QAAQ,CAAC,CAAC;QAChC,MAAM,MAAM,GAAG,MAAM,CAAC,cAAc;YAClC,CAAC,CAAC,MAAM,MAAM,CAAC,cAAc,CAAC;gBAC1B,GAAG;gBACH,QAAQ,EAAE,QAAiB;gBAC3B,KAAK;aACN,CAAC;YACJ,CAAC,CAAC,SAAS,CAAC;QACd,OAAO,EAAE,MAAM,EAAE,MAAM,CAAC,IAAI,EAAE,KAAK,EAAE,MAAM,EAAE,MAAM,EAAE,KAAK,IAAI,SAAS,EAAE,CAAC;IAC5E,CAAC;IAAC,OAAO,GAAG,EAAE,CAAC;QACb,uEAAuE;QACvE,kEAAkE;QAClE,OAAO;YACL,MAAM,EAAE,MAAM,CAAC,IAAI;YACnB,KAAK,EAAE,CAAC;YACR,MAAM,EAAE,mBAAmB,GAAG,YAAY,KAAK,CAAC,CAAC,CAAC,GAAG,CAAC,OAAO,CAAC,CAAC,CAAC,MAAM,CAAC,GAAG,CAAC,EAAE;YAC7E,MAAM,EAAE,KAAK;SACd,CAAC;IACJ,CAAC;AACH,CAAC;AAED,wEAAwE;AACxE,MAAM,CAAC,KAAK,UAAU,SAAS,CAC7B,QAAc,EACd,MAAmB,EACnB,OAAuC,EAAE;IAEzC,MAAM,SAAS,GACb,IAAI,CAAC,iBAAiB,IAAI,QAAQ,CAAC,SAAS,IAAI,sBAAsB,CAAC;IAEzE,IAAI,GAAmB,CAAC;IACxB,IAAI,QAAQ,CAAC,GAAG,EAAE,CAAC;QACjB,GAAG,GAAG,MAAM,QAAQ,CAAC,GAAG,CAAC;YACvB,KAAK,EAAE,QAAQ,CAAC,KAAK;YACrB,QAAQ,EAAE,CAAC,KAAK,EAAE,EAAE,CAAC,MAAM,CAAC,QAAQ,CAAC,KAAK,CAAC;SAC5C,CAAC,CAAC;IACL,CAAC;SAAM,CAAC;QACN,GAAG,GAAG,MAAM,MAAM,CAAC,QAAQ,CAAC,QAAQ,CAAC,KAAK,CAAC,CAAC;IAC9C,CAAC;IAED,MAAM,MAAM,GAAmB,EAAE,CAAC;IAClC,KAAK,MAAM,MAAM,IAAI,QAAQ,CAAC,OAAO,EAAE,CAAC;QACtC,MAAM,CAAC,IAAI,CAAC,MAAM,SAAS,CAAC,MAAM,EAAE,GAAG,EAAE
,MAAM,EAAE,SAAS,CAAC,CAAC,CAAC;IAC/D,CAAC;IAED,MAAM,QAAQ,GACZ,MAAM,CAAC,MAAM,GAAG,CAAC;QACf,CAAC,CAAC,MAAM,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,GAAG,CAAC,CAAC,KAAK,EAAE,CAAC,CAAC,GAAG,MAAM,CAAC,MAAM;QACzD,CAAC,CAAC,CAAC,CAAC;IAER,OAAO;QACL,IAAI,EAAE,QAAQ,CAAC,IAAI;QACnB,SAAS;QACT,MAAM;QACN,mEAAmE;QACnE,MAAM,EAAE,GAAG,CAAC,EAAE,IAAI,MAAM,CAAC,KAAK,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,MAAM,CAAC;QAC/C,QAAQ;QACR,UAAU,EAAE,GAAG,CAAC,UAAU;QAC1B,KAAK,EAAE,GAAG,CAAC,EAAE,CAAC,CAAC,CAAC,SAAS,CAAC,CAAC,CAAC,GAAG,CAAC,KAAK;KACtC,CAAC;AACJ,CAAC;AAED,sEAAsE;AACtE,MAAM,CAAC,KAAK,UAAU,QAAQ,CAC5B,KAAa,EACb,MAAmB,EACnB,OAA0D,EAAE;IAE5D,MAAM,OAAO,GAAoB,EAAE,CAAC;IACpC,KAAK,MAAM,QAAQ,IAAI,KAAK,EAAE,CAAC;QAC7B,MAAM,GAAG,GAAG,MAAM,SAAS,CAAC,QAAQ,EAAE,MAAM,EAAE,IAAI,CAAC,CAAC;QACpD,OAAO,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC;QAClB,IAAI,IAAI,CAAC,OAAO;YAAE,MAAM,cAAc,CAAC,GAAG,CAAC,CAAC,KAAK,CAAC,GAAG,EAAE,GAAE,CAAC,CAAC,CAAC;IAC9D,CAAC;IAED,MAAM,MAAM,GAAG,OAAO,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,MAAM,CAAC;IACtD,OAAO;QACL,KAAK,EAAE,OAAO,CAAC,MAAM;QACrB,MAAM;QACN,MAAM,EAAE,OAAO,CAAC,MAAM,GAAG,MAAM;QAC/B,OAAO;KACR,CAAC;AACJ,CAAC;AAED;;;;;;;;;;;;;GAaG;AACH,KAAK,UAAU,cAAc,CAAC,GAAkB;IAC9C,MAAM,KAAK,GAAG,QAAQ,GAAG,CAAC,IAAI,IAAI,IAAI,CAAC,GAAG,EAAE,EAAE,CAAC;IAC/C,KAAK,MAAM,CAAC,IAAI,GAAG,CAAC,MAAM,EAAE,CAAC;QAC3B,MAAM,MAAM,GAA4B;YACtC,EAAE,EAAE,MAAM,CAAC,UAAU,EAAE;YACvB,KAAK;YACL,QAAQ,EAAE,IAAI;YACd,MAAM,EAAE,IAAI;YACZ,QAAQ,EAAE,WAAW;YACrB,QAAQ,EAAE,QAAQ,GAAG,CAAC,IAAI,IAAI,CAAC,CAAC,MAAM,EAAE;YACxC,KAAK,EAAE,CAAC,CAAC,KAAK;YACd,SAAS,EAAE,CAAC,CAAC,MAAM,IAAI,IAAI;YAC3B,QAAQ,EAAE;gBACR,MAAM,EAAE,UAAU;gBAClB,SAAS,EAAE,GAAG,CAAC,SAAS;gBACxB,MAAM,EAAE,CAAC,CAAC,MAAM;aACjB;YACD,SAAS,EAAE,IAAI,CAAC,GAAG,EAAE;SACtB,CAAC;QACF,MAAM,gBAAgB,CAAC,MAAM,CAAC,CAAC;IACjC,CAAC;AACH,CAAC;AAED,yEAAyE;AAEzE,MAAM,YAAY,GAAG,sBAAsB,CAAC;AAC5C,MAAM,SAAS,GAAG,IAAI,GAAG,CAAC,CAAC,cAAc,EAAE,MAAM,EAAE,MAAM,EAAE,SAAS,EAAE,OAAO,CAAC,CAAC,CAAC;AAEhF;;;;;GAKG;AACH,MAAM,CAAC,KAAK,UAAU,
iBAAiB,CACrC,IAAY,EACZ,OAAgB;IAEhB,MAAM,EAAE,GAAG,MAAM,MAAM,CAAC,SAAS,CAAC,CAAC;IACnC,MAAM,GAAG,GAAa,EAAE,CAAC;IAEzB,SAAS,UAAU,CAAC,IAAY,EAAE,UAAkB;QAClD,MAAM,IAAI,GAAG,QAAQ,CAAC,QAAQ,CAAC,IAAI,CAAC,CAAC;QACrC,IAAI,YAAY,CAAC,IAAI,CAAC,IAAI,CAAC;YAAE,OAAO,IAAI,CAAC;QACzC,IAAI,UAAU,KAAK,OAAO,IAAI,gBAAgB,CAAC,IAAI,CAAC,IAAI,CAAC,EAAE,CAAC;YAC1D,4CAA4C;YAC5C,OAAO,CAAC,+BAA+B,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;QACrD,CAAC;QACD,OAAO,KAAK,CAAC;IACf,CAAC;IAED,SAAS,IAAI,CAAC,GAAW,EAAE,UAAkB;QAC3C,IAAI,OAAmC,CAAC;QACxC,IAAI,CAAC;YACH,OAAO,GAAG,EAAE,CAAC,WAAW,CAAC,GAAG,EAAE,EAAE,aAAa,EAAE,IAAI,EAAE,CAAC,CAAC;QACzD,CAAC;QAAC,MAAM,CAAC;YACP,OAAO;QACT,CAAC;QACD,KAAK,MAAM,KAAK,IAAI,OAAO,EAAE,CAAC;YAC5B,MAAM,IAAI,GAAG,QAAQ,CAAC,IAAI,CAAC,GAAG,EAAE,KAAK,CAAC,IAAI,CAAC,CAAC;YAC5C,IAAI,KAAK,CAAC,WAAW,EAAE,EAAE,CAAC;gBACxB,IAAI,SAAS,CAAC,GAAG,CAAC,KAAK,CAAC,IAAI,CAAC,IAAI,KAAK,CAAC,IAAI,CAAC,UAAU,CAAC,GAAG,CAAC;oBAAE,SAAS;gBACtE,IAAI,CAAC,IAAI,EAAE,KAAK,CAAC,IAAI,CAAC,CAAC;YACzB,CAAC;iBAAM,IAAI,KAAK,CAAC,MAAM,EAAE,IAAI,UAAU,CAAC,IAAI,EAAE,UAAU,CAAC,EAAE,CAAC;gBAC1D,GAAG,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;YACjB,CAAC;QACH,CAAC;IACH,CAAC;IAED,IAAI,CAAC,IAAI,EAAE,QAAQ,CAAC,QAAQ,CAAC,IAAI,CAAC,CAAC,CAAC;IACpC,GAAG,CAAC,IAAI,EAAE,CAAC;IAEX,IAAI,CAAC,OAAO;QAAE,OAAO,GAAG,CAAC;IACzB,OAAO,GAAG,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,QAAQ,CAAC,QAAQ,CAAC,IAAI,EAAE,CAAC,CAAC,CAAC,QAAQ,CAAC,OAAO,CAAC,CAAC,CAAC;AACzE,CAAC;AAED,yEAAyE;AACzE,SAAS,YAAY,CAAC,GAA4B;IAChD,MAAM,UAAU,GAAc,EAAE,CAAC;IACjC,IAAI,GAAG,CAAC,OAAO,KAAK,SAAS;QAAE,UAAU,CAAC,IAAI,CAAC,GAAG,CAAC,OAAO,CAAC,CAAC;IAC5D,KAAK,MAAM,CAAC,GAAG,EAAE,KAAK,CAAC,IAAI,MAAM,CAAC,OAAO,CAAC,GAAG,CAAC,EAAE,CAAC;QAC/C,IAAI,GAAG,KAAK,SAAS;YAAE,SAAS;QAChC,UAAU,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;IACzB,CAAC;IAED,MAAM,KAAK,GAAW,EAAE,CAAC;IACzB,KAAK,MAAM,CAAC,IAAI,UAAU,CAAC,IAAI,EAAE,EAAE,CAAC;QAClC,IACE,CAAC;YACD,OAAO,CAAC,KAAK,QAAQ;YACrB,OAAQ,CAAU,CAAC,IAAI,KAAK,QAAQ;YACpC,KAAK,CAAC,OAAO,CAAE,CAAU,CAAC,OAAO,CAAC;YACjC,CAAU,CAAC,KAAK,EACjB,CAAC;YACD,KAAK,CAAC,IAAI,CAAC,CAAS
,CAAC,CAAC;QACxB,CAAC;IACH,CAAC;IACD,OAAO,KAAK,CAAC;AACf,CAAC;AAED,8EAA8E;AAC9E,MAAM,CAAC,KAAK,UAAU,SAAS,CAC7B,IAAY,EACZ,OAAgB;IAEhB,MAAM,KAAK,GAAG,MAAM,iBAAiB,CAAC,IAAI,EAAE,OAAO,CAAC,CAAC;IACrD,MAAM,KAAK,GAAW,EAAE,CAAC;IACzB,KAAK,MAAM,IAAI,IAAI,KAAK,EAAE,CAAC;QACzB,MAAM,GAAG,GAAG,CAAC,MAAM,MAAM,CAAC,aAAa,CAAC,IAAI,CAAC,CAAC,IAAI,CAAC,CAGlD,CAAC;QACF,KAAK,CAAC,IAAI,CAAC,GAAG,YAAY,CAAC,GAAG,CAAC,CAAC,CAAC;IACnC,CAAC;IACD,OAAO,EAAE,KAAK,EAAE,KAAK,EAAE,CAAC;AAC1B,CAAC;AAuBD;;;GAGG;AACH,MAAM,CAAC,KAAK,UAAU,YAAY,CAChC,OAA4B,EAAE;IAE9B,MAAM,GAAG,GAAG,IAAI,CAAC,GAAG,IAAI,OAAO,CAAC,GAAG,EAAE,CAAC;IAEtC,IAAI,KAAK,GAAa,EAAE,CAAC;IACzB,IAAI,KAAK,GAAG,IAAI,CAAC,KAAK,CAAC;IACvB,IAAI,CAAC,KAAK,EAAE,CAAC;QACX,MAAM,MAAM,GAAG,MAAM,SAAS,CAAC,GAAG,EAAE,IAAI,CAAC,OAAO,CAAC,CAAC;QAClD,KAAK,GAAG,MAAM,CAAC,KAAK,CAAC;QACrB,KAAK,GAAG,MAAM,CAAC,KAAK,CAAC;IACvB,CAAC;IAED,MAAM,MAAM,GACV,IAAI,CAAC,MAAM;QACX,CAAC,MAAM,iBAAiB,CAAC;YACvB,OAAO,EAAE,IAAI,CAAC,OAAO,IAAI,CAAC,MAAM,eAAe,CAAC,GAAG,CAAC,CAAC;YACrD,YAAY,EAAE,IAAI,CAAC,YAAY;SAChC,CAAC,CAAC,CAAC;IAEN,MAAM,MAAM,GAAG,MAAM,QAAQ,CAAC,KAAK,EAAE,MAAM,EAAE;QAC3C,iBAAiB,EAAE,IAAI,CAAC,iBAAiB;QACzC,OAAO,EAAE,IAAI,CAAC,OAAO,IAAI,IAAI;KAC9B,CAAC,CAAC;IACH,OAAO,EAAE,MAAM,EAAE,KAAK,EAAE,CAAC;AAC3B,CAAC;AAED;;;;GAIG;AACH,KAAK,UAAU,eAAe,CAC5B,GAAW;IAEX,IAAI,CAAC;QACH,MAAM,EAAE,mBAAmB,EAAE,GAC3B,MAAM,MAAM,CAAC,+BAA+B,CAAC,CAAC;QAChD,MAAM,UAAU,GAAG,QAAQ,CAAC,IAAI,CAAC,GAAG,EAAE,SAAS,CAAC,CAAC;QACjD,OAAO,MAAM,mBAAmB,CAAC,aAAa,CAAC,UAAU,GAAG,GAAG,CAAC,CAAC,IAAI,CAAC,CAAC;IACzE,CAAC;IAAC,MAAM,CAAC;QACP,OAAO,EAAE,CAAC;IACZ,CAAC;AACH,CAAC","sourcesContent":["/**\n * The evals runner: discover `*.eval.ts` / `evals/*.ts` files, run each eval\n * through its scorer pipeline against the *real* agent loop, score, and report.\n *\n * It is the engine behind `agent-native eval` — when used as a CI deploy gate\n * the CLI exits non-zero if any eval scores below its threshold.\n *\n * Two layers:\n * - `scoreEval` / `runEvals` — pure orchestration over an `AgentRunner` 
and\n * a list of evals. Fully unit-testable with an injected runner (no model).\n * - `discoverEvalFiles` / `loadEvals` — filesystem discovery + dynamic import\n * of author-written eval modules.\n *\n * Results are also (best-effort) written to the observability eval store so a\n * dashboard can surface CI eval history next to production run evals.\n */\n\nimport nodePath from \"node:path\";\nimport { pathToFileURL } from \"node:url\";\n\nimport type { ActionEntry } from \"../agent/production-agent.js\";\nimport { insertEvalResult } from \"../observability/store.js\";\nimport type { EvalResult as ObservabilityEvalResult } from \"../observability/types.js\";\n\nimport { DEFAULT_EVAL_THRESHOLD } from \"./define-eval.js\";\nimport { clamp01 } from \"./scorer.js\";\nimport type { AgentRunner } from \"./agent-runner.js\";\nimport { createAgentRunner } from \"./agent-runner.js\";\nimport type {\n AgentRunOutput,\n Eval,\n EvalResultRow,\n EvalRunReport,\n ScorerResult,\n} from \"./types.js\";\n\n// ─── Scoring orchestration ────────────────────────────────────────────\n\n/** Run one scorer's pipeline (preprocess → analyze → score → reason). */\nasync function runScorer(\n scorer: Eval[\"scorers\"][number],\n run: AgentRunOutput,\n runner: AgentRunner,\n threshold: number,\n): Promise<ScorerResult> {\n try {\n const pre = scorer.preprocess ? await scorer.preprocess(run) : run;\n const analysis = scorer.analyze\n ? await scorer.analyze(pre as never, runner.analyzeContext())\n : pre;\n const rawScore = await scorer.generateScore(analysis as never);\n const score = clamp01(rawScore);\n const reason = scorer.generateReason\n ? 
await scorer.generateReason({\n run,\n analysis: analysis as never,\n score,\n })\n : undefined;\n return { scorer: scorer.name, score, reason, passed: score >= threshold };\n } catch (err) {\n // A scorer that throws is a failed scorer, not a crashed run — degrade\n // gracefully so one bad scorer can't take down the whole CI gate.\n return {\n scorer: scorer.name,\n score: 0,\n reason: `Scorer errored: ${err instanceof Error ? err.message : String(err)}`,\n passed: false,\n };\n }\n}\n\n/** Run a single eval: invoke the agent, then score with each scorer. */\nexport async function scoreEval(\n evalCase: Eval,\n runner: AgentRunner,\n opts: { thresholdOverride?: number } = {},\n): Promise<EvalResultRow> {\n const threshold =\n opts.thresholdOverride ?? evalCase.threshold ?? DEFAULT_EVAL_THRESHOLD;\n\n let run: AgentRunOutput;\n if (evalCase.run) {\n run = await evalCase.run({\n input: evalCase.input,\n runAgent: (input) => runner.runAgent(input),\n });\n } else {\n run = await runner.runAgent(evalCase.input);\n }\n\n const scores: ScorerResult[] = [];\n for (const scorer of evalCase.scorers) {\n scores.push(await runScorer(scorer, run, runner, threshold));\n }\n\n const avgScore =\n scores.length > 0\n ? scores.reduce((s, r) => s + r.score, 0) / scores.length\n : 0;\n\n return {\n eval: evalCase.name,\n threshold,\n scores,\n // A run that errored, or any sub-threshold scorer, fails the case.\n passed: run.ok && scores.every((s) => s.passed),\n avgScore,\n durationMs: run.durationMs,\n error: run.ok ? undefined : run.error,\n };\n}\n\n/** Run a batch of evals against one runner and aggregate a report. 
*/\nexport async function runEvals(\n evals: Eval[],\n runner: AgentRunner,\n opts: { thresholdOverride?: number; persist?: boolean } = {},\n): Promise<EvalRunReport> {\n const results: EvalResultRow[] = [];\n for (const evalCase of evals) {\n const row = await scoreEval(evalCase, runner, opts);\n results.push(row);\n if (opts.persist) await persistEvalRow(row).catch(() => {});\n }\n\n const passed = results.filter((r) => r.passed).length;\n return {\n total: results.length,\n passed,\n failed: results.length - passed,\n results,\n };\n}\n\n/**\n * Best-effort write of one eval result to the observability eval store so a\n * dashboard can show CI eval history alongside production run evals. We write\n * one row per (eval × scorer), tagged `evalType: \"automated\"` with a synthetic\n * `eval:` run id.\n *\n * TODO(live-sampling): the same scorer list should also run on a sampled\n * fraction of *real* production runs. That hook belongs in the agent loop's\n * (not-yet-added) post-run processor seam: when a run finishes, roll the\n * configured sample rate and, if it hits, replay the run output through these\n * scorers and write the rows here. Wiring it now would require the in-loop\n * processor seam another wave is adding — so this is the single intended\n * attachment point, intentionally left as a note.\n */\nasync function persistEvalRow(row: EvalResultRow): Promise<void> {\n const runId = `eval:${row.eval}:${Date.now()}`;\n for (const s of row.scores) {\n const result: ObservabilityEvalResult = {\n id: crypto.randomUUID(),\n runId,\n threadId: null,\n userId: null,\n evalType: \"automated\",\n criteria: `eval:${row.eval}:${s.scorer}`,\n score: s.score,\n reasoning: s.reason ?? 
null,\n metadata: {\n source: \"cli-eval\",\n threshold: row.threshold,\n passed: s.passed,\n },\n createdAt: Date.now(),\n };\n await insertEvalResult(result);\n }\n}\n\n// ─── Discovery + loading ──────────────────────────────────────────────\n\nconst EVAL_FILE_RE = /\\.eval\\.(ts|js|mjs)$/;\nconst SKIP_DIRS = new Set([\"node_modules\", \"dist\", \".git\", \".output\", \"build\"]);\n\n/**\n * Walk `root` for eval files. Matches two conventions:\n * - any `**\\/*.eval.ts` (co-located with code), and\n * - any `*.ts` directly inside an `evals/` directory.\n * `pattern` further filters by substring of the relative path.\n */\nexport async function discoverEvalFiles(\n root: string,\n pattern?: string,\n): Promise<string[]> {\n const fs = await import(\"node:fs\");\n const out: string[] = [];\n\n function isEvalFile(full: string, parentName: string): boolean {\n const base = nodePath.basename(full);\n if (EVAL_FILE_RE.test(base)) return true;\n if (parentName === \"evals\" && /\\.(ts|js|mjs)$/.test(base)) {\n // Skip obvious support files inside evals/.\n return !/\\.(spec|test|d)\\.(ts|js|mjs)$/.test(base);\n }\n return false;\n }\n\n function walk(dir: string, parentName: string): void {\n let entries: import(\"node:fs\").Dirent[];\n try {\n entries = fs.readdirSync(dir, { withFileTypes: true });\n } catch {\n return;\n }\n for (const entry of entries) {\n const full = nodePath.join(dir, entry.name);\n if (entry.isDirectory()) {\n if (SKIP_DIRS.has(entry.name) || entry.name.startsWith(\".\")) continue;\n walk(full, entry.name);\n } else if (entry.isFile() && isEvalFile(full, parentName)) {\n out.push(full);\n }\n }\n }\n\n walk(root, nodePath.basename(root));\n out.sort();\n\n if (!pattern) return out;\n return out.filter((f) => nodePath.relative(root, f).includes(pattern));\n}\n\n/** Pull `Eval` definitions out of a dynamically-imported eval module. 
*/\nfunction extractEvals(mod: Record<string, unknown>): Eval[] {\n const candidates: unknown[] = [];\n if (mod.default !== undefined) candidates.push(mod.default);\n for (const [key, value] of Object.entries(mod)) {\n if (key === \"default\") continue;\n candidates.push(value);\n }\n\n const evals: Eval[] = [];\n for (const c of candidates.flat()) {\n if (\n c &&\n typeof c === \"object\" &&\n typeof (c as Eval).name === \"string\" &&\n Array.isArray((c as Eval).scorers) &&\n (c as Eval).input\n ) {\n evals.push(c as Eval);\n }\n }\n return evals;\n}\n\n/** Discover and import all eval files under `root`, returning their evals. */\nexport async function loadEvals(\n root: string,\n pattern?: string,\n): Promise<{ files: string[]; evals: Eval[] }> {\n const files = await discoverEvalFiles(root, pattern);\n const evals: Eval[] = [];\n for (const file of files) {\n const mod = (await import(pathToFileURL(file).href)) as Record<\n string,\n unknown\n >;\n evals.push(...extractEvals(mod));\n }\n return { files, evals };\n}\n\n// ─── High-level entry used by the CLI ─────────────────────────────────\n\nexport interface RunEvalSuiteOptions {\n /** App root to discover eval files + actions under. Defaults to cwd. */\n cwd?: string;\n /** Substring filter on the eval file path. */\n pattern?: string;\n /** Global threshold override (wins over per-eval thresholds). */\n thresholdOverride?: number;\n /** App actions to expose to the agent. Auto-discovered when omitted. */\n actions?: Record<string, ActionEntry>;\n /** System prompt for runs. */\n systemPrompt?: string;\n /** Write results to the observability eval store (default true). */\n persist?: boolean;\n /** Pre-built runner (tests inject this to avoid touching engine/loop). */\n runner?: AgentRunner;\n /** Pre-loaded evals (tests inject this to skip filesystem discovery). */\n evals?: Eval[];\n}\n\n/**\n * End-to-end: load evals, build a runner, score, report. 
The CLI wraps this\n * and maps `report.failed > 0` to a non-zero exit code (the CI gate).\n */\nexport async function runEvalSuite(\n opts: RunEvalSuiteOptions = {},\n): Promise<{ report: EvalRunReport; files: string[] }> {\n const cwd = opts.cwd ?? process.cwd();\n\n let files: string[] = [];\n let evals = opts.evals;\n if (!evals) {\n const loaded = await loadEvals(cwd, opts.pattern);\n files = loaded.files;\n evals = loaded.evals;\n }\n\n const runner =\n opts.runner ??\n (await createAgentRunner({\n actions: opts.actions ?? (await discoverActions(cwd)),\n systemPrompt: opts.systemPrompt,\n }));\n\n const report = await runEvals(evals, runner, {\n thresholdOverride: opts.thresholdOverride,\n persist: opts.persist ?? true,\n });\n return { report, files };\n}\n\n/**\n * Discover the app's actions so the agent under test has the real tool\n * surface. Lazy-imports `autoDiscoverActions` to keep server-only deps out of\n * any browser bundle that might touch this module's types.\n */\nasync function discoverActions(\n cwd: string,\n): Promise<Record<string, ActionEntry>> {\n try {\n const { autoDiscoverActions } =\n await import(\"../server/action-discovery.js\");\n const actionsDir = nodePath.join(cwd, \"actions\");\n return await autoDiscoverActions(pathToFileURL(actionsDir + \"/\").href);\n } catch {\n return {};\n }\n}\n"]}
@@ -0,0 +1,83 @@
1
+ /**
2
+ * `createScorer` and a batteries-included set of built-in scorers.
3
+ *
4
+ * A scorer is a 4-step pipeline (preprocess → analyze → generateScore →
5
+ * generateReason). `createScorer` is a thin identity-with-validation factory:
6
+ * it enforces the one hard contract (`generateScore` is required) and returns
7
+ * a fully-typed `Scorer`. Built-in scorers below show both flavors:
8
+ *
9
+ * - `exactMatch` / `contains` / `usesTool` — pure-JS analyze, no model.
10
+ * - `llmJudge` — analyze runs an LLM judge through the resolved engine
11
+ * (provider-agnostic; the model is whatever the runner resolved).
12
+ */
13
+ import type { AgentRunOutput, Scorer, ScorerDefinition } from "./types.js";
14
+ /**
15
+ * Create a scorer from a 4-step pipeline definition.
16
+ *
17
+ * `generateScore` is the only required step. `preprocess`/`analyze` default to
18
+ * identity (the scorer sees the raw `AgentRunOutput`), and `generateReason` is
19
+ * optional. Throws if `name` is not a non-empty string or `generateScore` is not a function.
20
+ */
21
+ export declare function createScorer<Pre = AgentRunOutput, Ana = Pre>(def: ScorerDefinition<Pre, Ana>): Scorer<Pre, Ana>;
22
+ /** Clamp any number into [0, 1]; non-finite input (`NaN`, `Infinity`, `-Infinity`) is coerced to 0. */
23
+ export declare function clamp01(n: number): number;
24
+ /**
25
+ * `exactMatch` — 1.0 when the agent's text equals `expected`, else 0.0. Both sides are
26
+ * trimmed, and lower-cased unless `opts.caseSensitive` is set. Pure JS, no model.
27
+ */
28
+ export declare function exactMatch(expected: string, opts?: {
29
+ caseSensitive?: boolean;
30
+ }): Scorer<AgentRunOutput, {
31
+ match: boolean;
32
+ }>;
33
+ /**
34
+ * `contains` — 1.0 when the agent's text contains every required substring
35
+ * (case-insensitive by default). Score is the fraction matched, so a partial
36
+ * hit still surfaces signal. Empty needles are ignored; with none left the score is 1. Pure JS, no model.
37
+ */
38
+ export declare function contains(needles: string | string[], opts?: {
39
+ caseSensitive?: boolean;
40
+ }): Scorer<AgentRunOutput, {
41
+ found: string[];
42
+ missing: string[];
43
+ }>;
44
+ /**
45
+ * `usesTool` — 1.0 when `toolName` is an element of the run's `toolCalls`, else 0.0.
46
+ * Useful as a behavioral gate ("the agent must call send-email"). Pure JS.
47
+ */
48
+ export declare function usesTool(toolName: string): Scorer<AgentRunOutput, {
49
+ used: boolean;
50
+ }>;
51
+ interface JudgeVerdict { // verdict parsed from the judge model's JSON reply
52
+ score: number; // raw score as reported by the judge, before normalization to [0, 1]
53
+ reasoning: string; // judge's explanation; empty string when the reply carried none
54
+ }
55
+ export interface LlmJudgeOptions {
56
+ /** Scorer name (defaults to `llm_judge`). */
57
+ name?: string;
58
+ /** What is being judged, e.g. "helpfulness". Placed under `## Criteria` in the judge prompt. */
59
+ criteria: string;
60
+ /** A rubric describing what the lowest vs highest score means. Added under `## Rubric` when set. */
61
+ rubric?: string;
62
+ /**
63
+ * The score scale the judge is told to use. Output is normalized to [0,1]
64
+ * as `(score - min) / (max - min)` and then clamped. Defaults to a 0..1 scale.
65
+ */
66
+ scoreRange?: {
67
+ min: number;
68
+ max: number;
69
+ };
70
+ }
71
+ /**
72
+ * `llmJudge` — an LLM-as-judge scorer. The analyze step asks the resolved
73
+ * engine to score the agent output against a natural-language rubric and emit
74
+ * `{ "score": <n>, "reasoning": "<why>" }`. The model is whatever the runner
75
+ * resolved from the engine registry — this scorer NEVER hardcodes a provider
76
+ * or model, so evals stay provider-agnostic. An unparseable reply scores 0.
77
+ */
78
+ export declare function llmJudge(opts: LlmJudgeOptions): Scorer<AgentRunOutput, {
79
+ verdict: JudgeVerdict | null;
80
+ normalized: number;
81
+ }>;
82
+ export {};
83
+ //# sourceMappingURL=scorer.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"scorer.d.ts","sourceRoot":"","sources":["../../src/eval/scorer.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;GAWG;AAEH,OAAO,KAAK,EAAE,cAAc,EAAE,MAAM,EAAE,gBAAgB,EAAE,MAAM,YAAY,CAAC;AAE3E;;;;;;GAMG;AACH,wBAAgB,YAAY,CAAC,GAAG,GAAG,cAAc,EAAE,GAAG,GAAG,GAAG,EAC1D,GAAG,EAAE,gBAAgB,CAAC,GAAG,EAAE,GAAG,CAAC,GAC9B,MAAM,CAAC,GAAG,EAAE,GAAG,CAAC,CAgBlB;AAED,4DAA4D;AAC5D,wBAAgB,OAAO,CAAC,CAAC,EAAE,MAAM,GAAG,MAAM,CAGzC;AASD;;;GAGG;AACH,wBAAgB,UAAU,CACxB,QAAQ,EAAE,MAAM,EAChB,IAAI,GAAE;IAAE,aAAa,CAAC,EAAE,OAAO,CAAA;CAAO,GACrC,MAAM,CAAC,cAAc,EAAE;IAAE,KAAK,EAAE,OAAO,CAAA;CAAE,CAAC,CAiB5C;AAED;;;;GAIG;AACH,wBAAgB,QAAQ,CACtB,OAAO,EAAE,MAAM,GAAG,MAAM,EAAE,EAC1B,IAAI,GAAE;IAAE,aAAa,CAAC,EAAE,OAAO,CAAA;CAAO,GACrC,MAAM,CAAC,cAAc,EAAE;IAAE,KAAK,EAAE,MAAM,EAAE,CAAC;IAAC,OAAO,EAAE,MAAM,EAAE,CAAA;CAAE,CAAC,CAyBhE;AAED;;;GAGG;AACH,wBAAgB,QAAQ,CACtB,QAAQ,EAAE,MAAM,GACf,MAAM,CAAC,cAAc,EAAE;IAAE,IAAI,EAAE,OAAO,CAAA;CAAE,CAAC,CAe3C;AAID,UAAU,YAAY;IACpB,KAAK,EAAE,MAAM,CAAC;IACd,SAAS,EAAE,MAAM,CAAC;CACnB;AAsBD,MAAM,WAAW,eAAe;IAC9B,6CAA6C;IAC7C,IAAI,CAAC,EAAE,MAAM,CAAC;IACd,gDAAgD;IAChD,QAAQ,EAAE,MAAM,CAAC;IACjB,iDAAiD;IACjD,MAAM,CAAC,EAAE,MAAM,CAAC;IAChB;;;OAGG;IACH,UAAU,CAAC,EAAE;QAAE,GAAG,EAAE,MAAM,CAAC;QAAC,GAAG,EAAE,MAAM,CAAA;KAAE,CAAC;CAC3C;AAED;;;;;;GAMG;AACH,wBAAgB,QAAQ,CACtB,IAAI,EAAE,eAAe,GACpB,MAAM,CACP,cAAc,EACd;IAAE,OAAO,EAAE,YAAY,GAAG,IAAI,CAAC;IAAC,UAAU,EAAE,MAAM,CAAA;CAAE,CACrD,CAmDA"}
@@ -0,0 +1,195 @@
1
+ /**
2
+ * `createScorer` and a batteries-included set of built-in scorers.
3
+ *
4
+ * A scorer is a 4-step pipeline (preprocess → analyze → generateScore →
5
+ * generateReason). `createScorer` is a thin identity-with-validation factory:
6
+ * it enforces the one hard contract (`generateScore` is required) and returns
7
+ * a fully-typed `Scorer`. Built-in scorers below show both flavors:
8
+ *
9
+ * - `exactMatch` / `contains` / `usesTool` — pure-JS analyze, no model.
10
+ * - `llmJudge` — analyze runs an LLM judge through the resolved engine
11
+ * (provider-agnostic; the model is whatever the runner resolved).
12
+ */
/**
 * Create a scorer from a 4-step pipeline definition
 * (preprocess → analyze → generateScore → generateReason).
 *
 * `generateScore` is the only required step. `preprocess`, `analyze` and
 * `generateReason` are copied through unchanged and stay `undefined` when the
 * definition omits them.
 *
 * @param def - Pipeline definition; must carry a non-empty string `name` and
 *   a `generateScore` function.
 * @returns A new object holding exactly the five scorer fields, in the order
 *   `name`, `preprocess`, `analyze`, `generateScore`, `generateReason`.
 * @throws {TypeError} When `def` is `null` or `undefined`.
 * @throws {Error} When `name` is missing or not a string, or when
 *   `generateScore` is not a function.
 */
export function createScorer(def) {
  // Fail with an actionable message rather than the engine's generic
  // "Cannot read properties of undefined" when no definition is passed.
  // Still a TypeError, so existing `instanceof` checks keep working.
  if (def == null) {
    throw new TypeError("createScorer: a scorer definition object is required");
  }
  if (!def.name || typeof def.name !== "string") {
    throw new Error("createScorer: `name` is required");
  }
  if (typeof def.generateScore !== "function") {
    throw new Error(`createScorer("${def.name}"): \`generateScore\` is required`);
  }
  // Copy only the known steps so stray properties on `def` never leak into
  // the returned scorer.
  const { name, preprocess, analyze, generateScore, generateReason } = def;
  return { name, preprocess, analyze, generateScore, generateReason };
}
/**
 * Constrain a score to the closed interval [0, 1]. Anything that
 * `Number.isFinite` rejects (NaN, ±Infinity) collapses to 0.
 */
export function clamp01(n) {
  return Number.isFinite(n) ? Math.min(1, Math.max(0, n)) : 0;
}
41
+ // ─── Built-in JS scorers ──────────────────────────────────────────────
/** Canonical form for lenient text comparison: strip surrounding whitespace, then lower-case. */
function normalize(s) {
  const trimmed = s.trim();
  return trimmed.toLowerCase();
}
/**
 * `exactMatch` — binary scorer: 1.0 if the agent's text equals `expected`
 * once both sides are trimmed (and lower-cased, unless `opts.caseSensitive`
 * is set), otherwise 0.0. Pure JS, no model.
 */
export function exactMatch(expected, opts = {}) {
  // One canonicalisation for both sides; `opts` is consulted on every run.
  const canonical = (value) =>
    opts.caseSensitive ? value.trim() : normalize(value);

  return createScorer({
    name: "exact_match",
    analyze: (run) => {
      const actual = canonical(run.text);
      const want = canonical(expected);
      return { match: actual === want };
    },
    generateScore: ({ match }) => (match ? 1 : 0),
    generateReason: ({ analysis }) => {
      if (analysis.match) {
        return `Output exactly matched expected text`;
      }
      return `Output did not exactly match expected text`;
    },
  });
}
/**
 * `contains` — scores the fraction of required substrings found in the
 * agent's text (1.0 when all are present), comparing case-insensitively
 * unless `opts.caseSensitive` is set. Falsy needles are dropped up front; if
 * none remain the score is 1. Pure JS, no model.
 */
export function contains(needles, opts = {}) {
  const required = (Array.isArray(needles) ? needles : [needles]).filter(
    (needle) => Boolean(needle),
  );

  return createScorer({
    name: "contains",
    analyze: (run) => {
      // Apply the same case folding to the haystack and to every needle.
      const fold = (s) => (opts.caseSensitive ? s : s.toLowerCase());
      const haystack = fold(run.text);
      const found = [];
      const missing = [];
      required.forEach((phrase) => {
        const bucket = haystack.includes(fold(phrase)) ? found : missing;
        bucket.push(phrase);
      });
      return { found, missing };
    },
    generateScore: ({ found }) => {
      if (required.length === 0) {
        return 1;
      }
      return found.length / required.length;
    },
    generateReason: ({ analysis }) =>
      analysis.missing.length === 0
        ? `All ${required.length} required phrase(s) present`
        : `Missing: ${analysis.missing.join(", ")}`,
  });
}
/**
 * `usesTool` — behavioral gate: 1.0 when `toolName` is among the run's
 * `toolCalls`, 0.0 otherwise (e.g. "the agent must call send-email").
 * The scorer is named `uses_tool:<toolName>`. Pure JS.
 */
export function usesTool(toolName) {
  return createScorer({
    name: `uses_tool:${toolName}`,
    analyze: (run) => {
      const used = run.toolCalls.includes(toolName);
      return { used };
    },
    generateScore: ({ used }) => {
      if (used) {
        return 1;
      }
      return 0;
    },
    generateReason: ({ analysis }) => {
      if (analysis.used) {
        return `Agent called \`${toolName}\``;
      }
      return `Agent never called \`${toolName}\``;
    },
  });
}
/**
 * Find the index of the `}` that closes the object opened at `text[start]`.
 *
 * Braces inside double-quoted strings are ignored, and backslash escapes
 * inside strings are honoured, so `"a } b"` and `"say \"}\""` do not end the
 * object early. Returns -1 when the braces never balance.
 */
function findJsonObjectEnd(text, start) {
  let depth = 0;
  let inString = false;
  let escaped = false;
  for (let i = start; i < text.length; i++) {
    const ch = text[i];
    if (inString) {
      if (escaped) escaped = false;
      else if (ch === "\\") escaped = true;
      else if (ch === '"') inString = false;
      continue;
    }
    if (ch === '"') {
      inString = true;
    } else if (ch === "{") {
      depth++;
    } else if (ch === "}") {
      depth--;
      if (depth === 0) return i;
    }
  }
  return -1;
}

/**
 * Pull the first JSON object carrying a numeric `score` out of model text
 * (which may be wrapped in prose or a ```json fence) and turn it into a
 * verdict.
 *
 * Each `{` is tried in order as the start of a brace-balanced candidate. This
 * replaces a greedy first-`{`-to-last-`}` match, which broke as soon as the
 * reply had a stray brace in surrounding prose or a second object.
 *
 * @param text - Raw judge reply.
 * @returns `{ score, reasoning }` (`reasoning` is `""` when absent or not a
 *   string), or `null` when `text` is not a string or holds no such object.
 *   Never throws.
 */
function parseJudgeVerdict(text) {
  if (typeof text !== "string") return null;
  for (
    let start = text.indexOf("{");
    start !== -1;
    start = text.indexOf("{", start + 1)
  ) {
    const end = findJsonObjectEnd(text, start);
    // Unbalanced from this `{`; a later `{` may still open a complete object.
    if (end === -1) continue;
    try {
      const parsed = JSON.parse(text.slice(start, end + 1));
      if (typeof parsed.score === "number") {
        return {
          score: parsed.score,
          reasoning: typeof parsed.reasoning === "string" ? parsed.reasoning : "",
        };
      }
    } catch {
      // Not JSON (e.g. braces used in prose) — fall through to the next `{`.
    }
  }
  return null;
}
/**
 * `llmJudge` — an LLM-as-judge scorer. Its analyze step builds a prompt from
 * the criteria, the optional rubric, the agent's text and its tool calls,
 * sends it through `ctx.judge`, and expects
 * `{ "score": <n>, "reasoning": "<why>" }` back. The raw score is rescaled
 * from `scoreRange` (default 0..1) and clamped to [0, 1]; a reply that cannot
 * be parsed scores 0. No provider or model is named here — the judge call
 * goes through whatever `ctx.judge` is backed by.
 */
export function llmJudge(opts) {
  const range = opts.scoreRange;
  const lo = range?.min ?? 0;
  const hi = range?.max ?? 1;

  // Map a raw judge score from [lo, hi] onto [0, 1]. A degenerate range
  // (hi <= lo) passes the raw score through unchanged.
  const rescale = (raw) => (hi > lo ? (raw - lo) / (hi - lo) : raw);

  const buildPrompt = (run) => {
    const criteriaSection = `${opts.criteria}${opts.rubric ? `\n\n## Rubric\n${opts.rubric}` : ""}`;
    const outputSection = run.text || "(no text output)";
    const toolsSection = run.toolCalls.length
      ? run.toolCalls.join(", ")
      : "(none)";
    return `You are an expert evaluator. Score the agent output below against the criteria.

## Criteria
${criteriaSection}

## Agent Output
${outputSection}

## Tools the agent used
${toolsSection}

## Instructions
Respond with ONLY a JSON object (no markdown, no prose outside the JSON):
{"score": <number between ${lo} and ${hi}>, "reasoning": "<brief explanation>"}`;
  };

  return createScorer({
    name: opts.name ?? "llm_judge",
    analyze: async (run, ctx) => {
      const prompt = buildPrompt(run);
      const reply = await ctx.judge({
        systemPrompt: "You are an evaluation judge. Respond only with valid JSON.",
        prompt,
        maxOutputTokens: 512,
      });
      const verdict = parseJudgeVerdict(reply);
      if (verdict === null) {
        return { verdict, normalized: 0 };
      }
      return { verdict, normalized: rescale(verdict.score) };
    },
    generateScore: ({ normalized }) => clamp01(normalized),
    generateReason: ({ analysis }) => {
      const { verdict } = analysis;
      return verdict === null
        ? "Judge did not return a parseable verdict"
        : verdict.reasoning || "(no reasoning provided)";
    },
  });
}
195
+ //# sourceMappingURL=scorer.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"scorer.js","sourceRoot":"","sources":["../../src/eval/scorer.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;GAWG;AAIH;;;;;;GAMG;AACH,MAAM,UAAU,YAAY,CAC1B,GAA+B;IAE/B,IAAI,CAAC,GAAG,CAAC,IAAI,IAAI,OAAO,GAAG,CAAC,IAAI,KAAK,QAAQ,EAAE,CAAC;QAC9C,MAAM,IAAI,KAAK,CAAC,kCAAkC,CAAC,CAAC;IACtD,CAAC;IACD,IAAI,OAAO,GAAG,CAAC,aAAa,KAAK,UAAU,EAAE,CAAC;QAC5C,MAAM,IAAI,KAAK,CACb,iBAAiB,GAAG,CAAC,IAAI,mCAAmC,CAC7D,CAAC;IACJ,CAAC;IACD,OAAO;QACL,IAAI,EAAE,GAAG,CAAC,IAAI;QACd,UAAU,EAAE,GAAG,CAAC,UAAU;QAC1B,OAAO,EAAE,GAAG,CAAC,OAAO;QACpB,aAAa,EAAE,GAAG,CAAC,aAAa;QAChC,cAAc,EAAE,GAAG,CAAC,cAAc;KACnC,CAAC;AACJ,CAAC;AAED,4DAA4D;AAC5D,MAAM,UAAU,OAAO,CAAC,CAAS;IAC/B,IAAI,CAAC,MAAM,CAAC,QAAQ,CAAC,CAAC,CAAC;QAAE,OAAO,CAAC,CAAC;IAClC,OAAO,IAAI,CAAC,GAAG,CAAC,CAAC,EAAE,IAAI,CAAC,GAAG,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,CAAC;AACrC,CAAC;AAED,yEAAyE;AAEzE,+EAA+E;AAC/E,SAAS,SAAS,CAAC,CAAS;IAC1B,OAAO,CAAC,CAAC,IAAI,EAAE,CAAC,WAAW,EAAE,CAAC;AAChC,CAAC;AAED;;;GAGG;AACH,MAAM,UAAU,UAAU,CACxB,QAAgB,EAChB,OAAoC,EAAE;IAEtC,OAAO,YAAY,CAAqC;QACtD,IAAI,EAAE,aAAa;QACnB,OAAO,CAAC,GAAG;YACT,MAAM,MAAM,GAAG,IAAI,CAAC,aAAa,CAAC,CAAC,CAAC,GAAG,CAAC,IAAI,CAAC,IAAI,EAAE,CAAC,CAAC,CAAC,SAAS,CAAC,GAAG,CAAC,IAAI,CAAC,CAAC;YAC1E,MAAM,IAAI,GAAG,IAAI,CAAC,aAAa,CAAC,CAAC,CAAC,QAAQ,CAAC,IAAI,EAAE,CAAC,CAAC,CAAC,SAAS,CAAC,QAAQ,CAAC,CAAC;YACxE,OAAO,EAAE,KAAK,EAAE,MAAM,KAAK,IAAI,EAAE,CAAC;QACpC,CAAC;QACD,aAAa,CAAC,EAAE,KAAK,EAAE;YACrB,OAAO,KAAK,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC;QACvB,CAAC;QACD,cAAc,CAAC,EAAE,QAAQ,EAAE;YACzB,OAAO,QAAQ,CAAC,KAAK;gBACnB,CAAC,CAAC,sCAAsC;gBACxC,CAAC,CAAC,4CAA4C,CAAC;QACnD,CAAC;KACF,CAAC,CAAC;AACL,CAAC;AAED;;;;GAIG;AACH,MAAM,UAAU,QAAQ,CACtB,OAA0B,EAC1B,OAAoC,EAAE;IAEtC,MAAM,IAAI,GAAG,CAAC,KAAK,CAAC,OAAO,CAAC,OAAO,CAAC,CAAC,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC;IAC5E,OAAO,YAAY,CAAyD;QAC1E,IAAI,EAAE,UAAU;QAChB,OAAO,CAAC,GAAG;YACT,MAAM,GAAG,GAAG,IAAI,CAAC,aAAa,CAAC,CAAC,CAAC,GAAG,CAAC,IAAI,CAAC,CAAC,CAAC,GAAG,CAAC,IAAI,CAAC,WAAW,EAAE,CAAC;
YACnE,MAAM,KAAK,GAAa,EAAE,CAAC;YAC3B,MAAM,OAAO,GAAa,EAAE,CAAC;YAC7B,KAAK,MAAM,CAAC,IAAI,IAAI,EAAE,CAAC;gBACrB,MAAM,MAAM,GAAG,IAAI,CAAC,aAAa,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,WAAW,EAAE,CAAC;gBACxD,IAAI,GAAG,CAAC,QAAQ,CAAC,MAAM,CAAC;oBAAE,KAAK,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;;oBACnC,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;YACvB,CAAC;YACD,OAAO,EAAE,KAAK,EAAE,OAAO,EAAE,CAAC;QAC5B,CAAC;QACD,aAAa,CAAC,EAAE,KAAK,EAAE;YACrB,OAAO,IAAI,CAAC,MAAM,KAAK,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,KAAK,CAAC,MAAM,GAAG,IAAI,CAAC,MAAM,CAAC;QAC5D,CAAC;QACD,cAAc,CAAC,EAAE,QAAQ,EAAE;YACzB,IAAI,QAAQ,CAAC,OAAO,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;gBAClC,OAAO,OAAO,IAAI,CAAC,MAAM,6BAA6B,CAAC;YACzD,CAAC;YACD,OAAO,YAAY,QAAQ,CAAC,OAAO,CAAC,IAAI,CAAC,IAAI,CAAC,EAAE,CAAC;QACnD,CAAC;KACF,CAAC,CAAC;AACL,CAAC;AAED;;;GAGG;AACH,MAAM,UAAU,QAAQ,CACtB,QAAgB;IAEhB,OAAO,YAAY,CAAoC;QACrD,IAAI,EAAE,aAAa,QAAQ,EAAE;QAC7B,OAAO,CAAC,GAAG;YACT,OAAO,EAAE,IAAI,EAAE,GAAG,CAAC,SAAS,CAAC,QAAQ,CAAC,QAAQ,CAAC,EAAE,CAAC;QACpD,CAAC;QACD,aAAa,CAAC,EAAE,IAAI,EAAE;YACpB,OAAO,IAAI,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC;QACtB,CAAC;QACD,cAAc,CAAC,EAAE,QAAQ,EAAE;YACzB,OAAO,QAAQ,CAAC,IAAI;gBAClB,CAAC,CAAC,kBAAkB,QAAQ,IAAI;gBAChC,CAAC,CAAC,wBAAwB,QAAQ,IAAI,CAAC;QAC3C,CAAC;KACF,CAAC,CAAC;AACL,CAAC;AASD;;;;GAIG;AACH,SAAS,iBAAiB,CAAC,IAAY;IACrC,MAAM,KAAK,GAAG,IAAI,CAAC,KAAK,CAAC,aAAa,CAAC,CAAC;IACxC,IAAI,CAAC,KAAK;QAAE,OAAO,IAAI,CAAC;IACxB,IAAI,CAAC;QACH,MAAM,MAAM,GAAG,IAAI,CAAC,KAAK,CAAC,KAAK,CAAC,CAAC,CAAC,CAA0B,CAAC;QAC7D,IAAI,OAAO,MAAM,CAAC,KAAK,KAAK,QAAQ;YAAE,OAAO,IAAI,CAAC;QAClD,OAAO;YACL,KAAK,EAAE,MAAM,CAAC,KAAK;YACnB,SAAS,EAAE,OAAO,MAAM,CAAC,SAAS,KAAK,QAAQ,CAAC,CAAC,CAAC,MAAM,CAAC,SAAS,CAAC,CAAC,CAAC,EAAE;SACxE,CAAC;IACJ,CAAC;IAAC,MAAM,CAAC;QACP,OAAO,IAAI,CAAC;IACd,CAAC;AACH,CAAC;AAgBD;;;;;;GAMG;AACH,MAAM,UAAU,QAAQ,CACtB,IAAqB;IAKrB,MAAM,GAAG,GAAG,IAAI,CAAC,UAAU,EAAE,GAAG,IAAI,CAAC,CAAC;IACtC,MAAM,GAAG,GAAG,IAAI,CAAC,UAAU,EAAE,GAAG,IAAI,CAAC,CAAC;IACtC,MAAM,IAAI,GAAG,IAAI,CAAC,IAAI,IAAI,WAAW,CAAC;IAEtC,OAAO,YAAY,CAGjB;QACA,I
AAI;QACJ,KAAK,CAAC,OAAO,CAAC,GAAG,EAAE,GAAG;YACpB,MAAM,MAAM,GAAG;;;EAGnB,IAAI,CAAC,QAAQ,GAAG,IAAI,CAAC,MAAM,CAAC,CAAC,CAAC,kBAAkB,IAAI,CAAC,MAAM,EAAE,CAAC,CAAC,CAAC,EAAE;;;EAGlE,GAAG,CAAC,IAAI,IAAI,kBAAkB;;;EAG9B,GAAG,CAAC,SAAS,CAAC,MAAM,CAAC,CAAC,CAAC,GAAG,CAAC,SAAS,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,QAAQ;;;;4BAIhC,GAAG,QAAQ,GAAG,wCAAwC,CAAC;YAE7E,MAAM,IAAI,GAAG,MAAM,GAAG,CAAC,KAAK,CAAC;gBAC3B,YAAY,EACV,4DAA4D;gBAC9D,MAAM;gBACN,eAAe,EAAE,GAAG;aACrB,CAAC,CAAC;YACH,MAAM,OAAO,GAAG,iBAAiB,CAAC,IAAI,CAAC,CAAC;YACxC,MAAM,UAAU,GACd,OAAO,KAAK,IAAI;gBACd,CAAC,CAAC,CAAC;gBACH,CAAC,CAAC,GAAG,GAAG,GAAG;oBACT,CAAC,CAAC,CAAC,OAAO,CAAC,KAAK,GAAG,GAAG,CAAC,GAAG,CAAC,GAAG,GAAG,GAAG,CAAC;oBACrC,CAAC,CAAC,OAAO,CAAC,KAAK,CAAC;YACtB,OAAO,EAAE,OAAO,EAAE,UAAU,EAAE,CAAC;QACjC,CAAC;QACD,aAAa,CAAC,EAAE,UAAU,EAAE;YAC1B,OAAO,OAAO,CAAC,UAAU,CAAC,CAAC;QAC7B,CAAC;QACD,cAAc,CAAC,EAAE,QAAQ,EAAE;YACzB,IAAI,QAAQ,CAAC,OAAO,KAAK,IAAI,EAAE,CAAC;gBAC9B,OAAO,0CAA0C,CAAC;YACpD,CAAC;YACD,OAAO,QAAQ,CAAC,OAAO,CAAC,SAAS,IAAI,yBAAyB,CAAC;QACjE,CAAC;KACF,CAAC,CAAC;AACL,CAAC","sourcesContent":["/**\n * `createScorer` and a batteries-included set of built-in scorers.\n *\n * A scorer is a 4-step pipeline (preprocess → analyze → generateScore →\n * generateReason). `createScorer` is a thin identity-with-validation factory:\n * it enforces the one hard contract (`generateScore` is required) and returns\n * a fully-typed `Scorer`. Built-in scorers below show both flavors:\n *\n * - `exactMatch` / `contains` — pure-JS analyze, no model.\n * - `llmJudge` — analyze runs an LLM judge through the resolved engine\n * (provider-agnostic; the model is whatever the runner resolved).\n */\n\nimport type { AgentRunOutput, Scorer, ScorerDefinition } from \"./types.js\";\n\n/**\n * Create a scorer from a 4-step pipeline definition.\n *\n * `generateScore` is the only required step. 
`preprocess`/`analyze` default to\n * identity (the scorer sees the raw `AgentRunOutput`), and `generateReason` is\n * optional.\n */\nexport function createScorer<Pre = AgentRunOutput, Ana = Pre>(\n def: ScorerDefinition<Pre, Ana>,\n): Scorer<Pre, Ana> {\n if (!def.name || typeof def.name !== \"string\") {\n throw new Error(\"createScorer: `name` is required\");\n }\n if (typeof def.generateScore !== \"function\") {\n throw new Error(\n `createScorer(\"${def.name}\"): \\`generateScore\\` is required`,\n );\n }\n return {\n name: def.name,\n preprocess: def.preprocess,\n analyze: def.analyze,\n generateScore: def.generateScore,\n generateReason: def.generateReason,\n };\n}\n\n/** Clamp any number into [0, 1]; coerce non-finite to 0. */\nexport function clamp01(n: number): number {\n if (!Number.isFinite(n)) return 0;\n return Math.max(0, Math.min(1, n));\n}\n\n// ─── Built-in JS scorers ──────────────────────────────────────────────\n\n/** Normalize for forgiving text comparison (case + surrounding whitespace). */\nfunction normalize(s: string): string {\n return s.trim().toLowerCase();\n}\n\n/**\n * `exactMatch` — 1.0 when the agent's (trimmed, case-insensitive by default)\n * text equals `expected`, else 0.0. Pure JS, no model.\n */\nexport function exactMatch(\n expected: string,\n opts: { caseSensitive?: boolean } = {},\n): Scorer<AgentRunOutput, { match: boolean }> {\n return createScorer<AgentRunOutput, { match: boolean }>({\n name: \"exact_match\",\n analyze(run) {\n const actual = opts.caseSensitive ? run.text.trim() : normalize(run.text);\n const want = opts.caseSensitive ? expected.trim() : normalize(expected);\n return { match: actual === want };\n },\n generateScore({ match }) {\n return match ? 1 : 0;\n },\n generateReason({ analysis }) {\n return analysis.match\n ? 
`Output exactly matched expected text`\n : `Output did not exactly match expected text`;\n },\n });\n}\n\n/**\n * `contains` — 1.0 when the agent's text contains every required substring\n * (case-insensitive by default). Score is the fraction matched, so a partial\n * hit still surfaces signal. Pure JS, no model.\n */\nexport function contains(\n needles: string | string[],\n opts: { caseSensitive?: boolean } = {},\n): Scorer<AgentRunOutput, { found: string[]; missing: string[] }> {\n const list = (Array.isArray(needles) ? needles : [needles]).filter(Boolean);\n return createScorer<AgentRunOutput, { found: string[]; missing: string[] }>({\n name: \"contains\",\n analyze(run) {\n const hay = opts.caseSensitive ? run.text : run.text.toLowerCase();\n const found: string[] = [];\n const missing: string[] = [];\n for (const n of list) {\n const needle = opts.caseSensitive ? n : n.toLowerCase();\n if (hay.includes(needle)) found.push(n);\n else missing.push(n);\n }\n return { found, missing };\n },\n generateScore({ found }) {\n return list.length === 0 ? 1 : found.length / list.length;\n },\n generateReason({ analysis }) {\n if (analysis.missing.length === 0) {\n return `All ${list.length} required phrase(s) present`;\n }\n return `Missing: ${analysis.missing.join(\", \")}`;\n },\n });\n}\n\n/**\n * `usesTool` — 1.0 when the agent invoked the named tool/action at least once.\n * Useful as a behavioral gate (\"the agent must call send-email\"). Pure JS.\n */\nexport function usesTool(\n toolName: string,\n): Scorer<AgentRunOutput, { used: boolean }> {\n return createScorer<AgentRunOutput, { used: boolean }>({\n name: `uses_tool:${toolName}`,\n analyze(run) {\n return { used: run.toolCalls.includes(toolName) };\n },\n generateScore({ used }) {\n return used ? 1 : 0;\n },\n generateReason({ analysis }) {\n return analysis.used\n ? 
`Agent called \\`${toolName}\\``\n : `Agent never called \\`${toolName}\\``;\n },\n });\n}\n\n// ─── Built-in LLM-judge scorer ────────────────────────────────────────\n\ninterface JudgeVerdict {\n score: number;\n reasoning: string;\n}\n\n/**\n * Pull the first JSON object out of model text (which may be wrapped in prose\n * or a ```json fence) and parse it into a verdict. Returns null on garbage so\n * the caller can degrade gracefully instead of throwing.\n */\nfunction parseJudgeVerdict(text: string): JudgeVerdict | null {\n const match = text.match(/\\{[\\s\\S]*\\}/);\n if (!match) return null;\n try {\n const parsed = JSON.parse(match[0]) as Partial<JudgeVerdict>;\n if (typeof parsed.score !== \"number\") return null;\n return {\n score: parsed.score,\n reasoning: typeof parsed.reasoning === \"string\" ? parsed.reasoning : \"\",\n };\n } catch {\n return null;\n }\n}\n\nexport interface LlmJudgeOptions {\n /** Scorer name (defaults to `llm_judge`). */\n name?: string;\n /** What is being judged, e.g. \"helpfulness\". */\n criteria: string;\n /** A rubric describing what 0.0 vs 1.0 means. */\n rubric?: string;\n /**\n * The score scale the judge is told to use. Output is normalized to [0,1].\n * Defaults to a 0..1 scale.\n */\n scoreRange?: { min: number; max: number };\n}\n\n/**\n * `llmJudge` — an LLM-as-judge scorer. The analyze step asks the resolved\n * engine to score the agent output against a natural-language rubric and emit\n * `{ \"score\": <n>, \"reasoning\": \"<why>\" }`. The model is whatever the runner\n * resolved from the engine registry — this scorer NEVER hardcodes a provider\n * or model, so evals stay provider-agnostic.\n */\nexport function llmJudge(\n opts: LlmJudgeOptions,\n): Scorer<\n AgentRunOutput,\n { verdict: JudgeVerdict | null; normalized: number }\n> {\n const min = opts.scoreRange?.min ?? 0;\n const max = opts.scoreRange?.max ?? 1;\n const name = opts.name ?? 
\"llm_judge\";\n\n return createScorer<\n AgentRunOutput,\n { verdict: JudgeVerdict | null; normalized: number }\n >({\n name,\n async analyze(run, ctx) {\n const prompt = `You are an expert evaluator. Score the agent output below against the criteria.\n\n## Criteria\n${opts.criteria}${opts.rubric ? `\\n\\n## Rubric\\n${opts.rubric}` : \"\"}\n\n## Agent Output\n${run.text || \"(no text output)\"}\n\n## Tools the agent used\n${run.toolCalls.length ? run.toolCalls.join(\", \") : \"(none)\"}\n\n## Instructions\nRespond with ONLY a JSON object (no markdown, no prose outside the JSON):\n{\"score\": <number between ${min} and ${max}>, \"reasoning\": \"<brief explanation>\"}`;\n\n const text = await ctx.judge({\n systemPrompt:\n \"You are an evaluation judge. Respond only with valid JSON.\",\n prompt,\n maxOutputTokens: 512,\n });\n const verdict = parseJudgeVerdict(text);\n const normalized =\n verdict === null\n ? 0\n : max > min\n ? (verdict.score - min) / (max - min)\n : verdict.score;\n return { verdict, normalized };\n },\n generateScore({ normalized }) {\n return clamp01(normalized);\n },\n generateReason({ analysis }) {\n if (analysis.verdict === null) {\n return \"Judge did not return a parseable verdict\";\n }\n return analysis.verdict.reasoning || \"(no reasoning provided)\";\n },\n });\n}\n"]}