@tangle-network/agent-eval 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js ADDED
@@ -0,0 +1,1293 @@
1
+ // src/client.ts
2
/**
 * Thin HTTP client used by end-to-end workflows to drive a product's API.
 *
 * Route paths are looked up by name in `config.routes`, and the session
 * cookie captured by `login()` is replayed on every later request.
 */
var ProductClient = class {
  baseUrl;
  routes;
  cookies = "";

  /**
   * @param {{ baseUrl: string, routes: Record<string, string> }} config
   */
  constructor(config) {
    // Trailing slashes are stripped so `${baseUrl}${path}` never yields "//".
    this.baseUrl = config.baseUrl.replace(/\/+$/, "");
    this.routes = config.routes;
  }

  /**
   * Resolve a configured route path by name.
   * @param {string} name
   * @returns {string}
   * @throws {Error} when the route is missing from the configuration.
   */
  route(name) {
    const path = this.routes[name];
    if (!path) throw new Error(`Route "${name}" not configured`);
    return path;
  }

  /**
   * Create an account.
   * @returns {Promise<{ userId: unknown }>}
   * @throws {Error} when the response carries no `user.id`.
   */
  async signup(name, email, password) {
    const res = await this.post(this.route("signup"), { name, email, password });
    const user = res.user;
    if (!user?.id) throw new Error(`Signup failed: ${JSON.stringify(res)}`);
    return { userId: user.id };
  }

  /**
   * Log in and remember the session cookie(s) for subsequent requests.
   * @throws {Error} when the response body carries no `user`.
   */
  async login(email, password) {
    const res = await fetch(`${this.baseUrl}${this.route("login")}`, {
      method: "POST",
      headers: { "Content-Type": "application/json", "Origin": this.baseUrl },
      body: JSON.stringify({ email, password }),
      redirect: "manual"
    });
    const sessionCookies = this.#readCookies(res.headers);
    if (sessionCookies) {
      this.cookies = sessionCookies;
    }
    const body = await res.json();
    if (!body.user) throw new Error(`Login failed: ${JSON.stringify(body)}`);
  }

  /**
   * Create a workspace and return its id.
   * @throws {Error} when the response carries no `workspace.id`.
   */
  async createWorkspace(name, type = "project") {
    const res = await this.post(this.route("workspaces"), { name, type });
    const ws = res.workspace;
    if (!ws?.id) throw new Error(`Workspace creation failed: ${JSON.stringify(res)}`);
    return ws.id;
  }

  /**
   * Create a thread inside a workspace and return its id.
   * @throws {Error} when the response carries no `thread.id`.
   */
  async createThread(workspaceId) {
    const res = await this.post(this.route("threads"), { workspaceId });
    const thread = res.thread;
    if (!thread?.id) throw new Error(`Thread creation failed: ${JSON.stringify(res)}`);
    return thread.id;
  }

  /**
   * Send a chat message and consume the newline-delimited JSON event stream.
   *
   * Text is accumulated from `message.part.updated` events that carry a
   * `data.delta`; `:::type ... :::` blocks are then extracted from the text.
   *
   * @param {unknown} _opts Currently unused; kept for interface compatibility.
   * @returns {Promise<{ text: string, blocks: Array<{ type: string, title: string }> }>}
   * @throws {Error} when the response is not OK or has no body.
   */
  async chat(workspaceId, threadId, content, _opts) {
    const res = await fetch(`${this.baseUrl}${this.route("chat")}`, {
      method: "POST",
      headers: {
        "Content-Type": "application/json",
        "Origin": this.baseUrl,
        "Cookie": this.cookies
      },
      body: JSON.stringify({ workspaceId, threadId, content })
    });
    if (!res.ok || !res.body) throw new Error(`Chat failed: ${res.status}`);

    let text = "";
    const consumeLine = (line) => {
      if (!line.trim()) return;
      let event;
      try {
        event = JSON.parse(line);
      } catch {
        // Best effort: lines that are not valid JSON are skipped.
        return;
      }
      if (event?.type === "message.part.updated" && event.data?.delta) {
        text += event.data.delta;
      }
    };

    const reader = res.body.getReader();
    const decoder = new TextDecoder();
    let buf = "";
    while (true) {
      const { done, value } = await reader.read();
      if (done) break;
      buf += decoder.decode(value, { stream: true });
      const lines = buf.split("\n");
      // The last piece may be an incomplete line; keep it for the next chunk.
      buf = lines.pop() ?? "";
      for (const line of lines) consumeLine(line);
    }
    // Flush the decoder and process a final event that had no trailing
    // newline; otherwise the end of the response would be silently lost.
    buf += decoder.decode();
    consumeLine(buf);

    const blocks = [];
    for (const match of text.matchAll(/:::(\w+)\s*\n([\s\S]*?)\n\s*:::/g)) {
      const fields = {};
      for (const line of match[2].split("\n")) {
        const idx = line.indexOf(":");
        if (idx > 0) fields[line.slice(0, idx).trim()] = line.slice(idx + 1).trim();
      }
      blocks.push({ type: match[1], title: fields.title ?? "" });
    }
    return { text, blocks };
  }

  /** @returns {Promise<unknown[]>} `tasks` from the response, or `[]`. */
  async getTasks(workspaceId) {
    const res = await this.get(this.#workspaceQuery("tasks", workspaceId));
    return res.tasks ?? [];
  }

  /** @returns {Promise<unknown[]>} `events` from the response, or `[]`. */
  async getEvents(workspaceId) {
    const res = await this.get(this.#workspaceQuery("events", workspaceId));
    return res.events ?? [];
  }

  /** @returns {Promise<unknown[]>} `actions` from the approvals response, or `[]`. */
  async getApprovals(workspaceId) {
    const res = await this.get(this.#workspaceQuery("approvals", workspaceId));
    return res.actions ?? [];
  }

  /**
   * Fetch the vault tree and flatten it to the paths of all `file` nodes.
   * @returns {Promise<string[]>}
   */
  async getVaultTree(workspaceId) {
    const res = await this.get(this.#workspaceQuery("vault", workspaceId));
    const paths = [];
    const collect = (nodes) => {
      for (const node of nodes) {
        if (node.type === "file" && node.path) paths.push(node.path);
        if (node.children) collect(node.children);
      }
    };
    collect(res.tree ?? []);
    return paths;
  }

  /** Mark a pending action as approved. */
  async approveAction(workspaceId, id) {
    await this.patch(this.route("approvals"), { workspaceId, id, status: "approved" });
  }

  /** Mark a pending action as rejected, with an optional reason. */
  async rejectAction(workspaceId, id, reason) {
    await this.patch(this.route("approvals"), { workspaceId, id, status: "rejected", reason });
  }

  /** @returns {Promise<unknown[]>} `generations` from the response, or `[]`. */
  async getGenerations(workspaceId) {
    const res = await this.get(this.#workspaceQuery("generations", workspaceId));
    return res.generations ?? [];
  }

  /** Generic GET for custom routes */
  async get(path) {
    return this.#send("GET", path);
  }

  /** Generic POST for custom routes */
  async post(path, body) {
    return this.#send("POST", path, body);
  }

  /** Generic PATCH for custom routes */
  async patch(path, body) {
    return this.#send("PATCH", path, body);
  }

  /** Build `<route>?workspaceId=<id>` with the id safely percent-encoded. */
  #workspaceQuery(routeName, workspaceId) {
    return `${this.route(routeName)}?workspaceId=${encodeURIComponent(workspaceId)}`;
  }

  /**
   * Extract `name=value` pairs from the Set-Cookie header(s).
   * Uses `Headers.getSetCookie()` when the runtime provides it so that every
   * cookie is kept; otherwise falls back to the first cookie of the combined
   * header value.
   */
  #readCookies(headers) {
    if (typeof headers.getSetCookie === "function") {
      const all = headers.getSetCookie();
      if (all.length > 0) {
        return all.map((cookie) => cookie.split(";")[0]).join("; ");
      }
    }
    const combined = headers.get("set-cookie");
    return combined ? combined.split(";")[0] : "";
  }

  /** Shared fetch plumbing: GET sends only the cookie, others send a JSON body. */
  async #send(method, path, body) {
    const init = { headers: { "Cookie": this.cookies } };
    if (method !== "GET") {
      init.method = method;
      init.headers = {
        "Content-Type": "application/json",
        "Origin": this.baseUrl,
        "Cookie": this.cookies
      };
      init.body = JSON.stringify(body);
    }
    const res = await fetch(`${this.baseUrl}${path}`, init);
    return res.json();
  }
};
161
/**
 * Run a workflow callback against a client and summarize its checks.
 *
 * Any exception raised by the workflow is converted into a single failing
 * `fatal_error` check instead of propagating to the caller.
 *
 * @param {object} client Passed through to `workflow`.
 * @param {string} name Label copied into the result.
 * @param {(client: object) => Promise<Iterable<{ passed: boolean }>>} workflow
 * @returns {Promise<{ name: string, passed: boolean, duration: number, detail: string, checks: object[] }>}
 */
async function runE2EWorkflow(client, name, workflow) {
  const startedAt = Date.now();
  let checks;
  try {
    checks = [...(await workflow(client))];
  } catch (failure) {
    const actual = failure instanceof Error ? failure.message : String(failure);
    checks = [{ name: "fatal_error", passed: false, expected: "no crash", actual }];
  }

  let passedCount = 0;
  for (const check of checks) {
    if (check.passed) passedCount += 1;
  }

  return {
    name,
    passed: checks.every((check) => check.passed),
    duration: Date.now() - startedAt,
    detail: `${passedCount}/${checks.length} checks passed`,
    checks
  };
}
183
+
184
+ // src/judges.ts
185
/**
 * Build a judge that asks the model to grade the conversation as a senior
 * professional of the given domain.
 *
 * The returned judge sends one chat request (system rubric + transcript) and
 * hands the raw response to `parseJudgeResponse` under the name
 * "domain_expert". Each agent response is truncated to 2000 characters.
 *
 * @param {string} domain Interpolated into the system prompt.
 * @returns {(tc: object, input: { scenario: object, turns: object[] }) => Promise<object[]>}
 */
function createDomainExpertJudge(domain) {
  const rubric = `You are a senior ${domain} professional with 20+ years of experience. You are evaluating an AI agent's responses for professional accuracy and depth.

Score STRICTLY. A 5 means "a junior professional could do this." An 8 means "solid mid-career work." A 10 means "I would hire this agent."

Evaluate:
1. **domain_accuracy** (0-10): Are the technical terms correct? Are the recommendations what you'd actually do? Would this advice cause problems if followed?
2. **professional_depth** (0-10): Does it go beyond surface-level? Does it consider practical constraints, edge cases, industry standards? Or is it generic textbook advice?

Respond with JSON only: [{"dimension":"domain_accuracy","score":N,"reasoning":"...","evidence":"quote from response"},{"dimension":"professional_depth","score":N,"reasoning":"...","evidence":"quote"}]`;

  return async (tc, input) => {
    const { scenario, turns } = input;
    const transcript = turns
      .map((turn, index) => [
        `Turn ${index + 1}:`,
        `User: ${turn.userMessage}`,
        `Agent: ${turn.agentResponse.slice(0, 2e3)}`
      ].join("\n"))
      .join("\n\n---\n\n");
    const briefing = [
      `Persona: ${scenario.persona} (${scenario.label})`,
      `Scenario: ${scenario.thesis}`,
      "",
      transcript
    ].join("\n");

    const resp = await tc.chat({
      model: "gpt-4o",
      messages: [
        { role: "system", content: rubric },
        { role: "user", content: briefing }
      ],
      temperature: 0.1,
      maxTokens: 800
    });
    return parseJudgeResponse("domain_expert", resp);
  };
}
218
/**
 * Judge that asks the model to review the code blocks collected from the
 * agent's responses (executability, completeness, reusability).
 *
 * Returns a single zero score without calling the model when no code blocks
 * were captured. Each block's code is truncated to 3000 characters.
 *
 * @param {object} tc Client exposing `chat(request)`.
 * @param {{ scenario: object, artifacts: { codeBlocks: Array<{ language: string, code: string }> } }} input
 * @returns {Promise<object[]>}
 */
var codeExecutionJudge = async (tc, input) => {
  const { scenario, artifacts } = input;
  const snippets = artifacts.codeBlocks;
  if (snippets.length === 0) {
    const noCode = {
      judgeName: "code_execution",
      dimension: "code_execution",
      score: 0,
      reasoning: "No code blocks found in agent response."
    };
    return [noCode];
  }

  const fence = "```";
  const listing = snippets
    .map((snippet, index) => [
      `Block ${index + 1} (${snippet.language}):`,
      `${fence}${snippet.language}`,
      snippet.code.slice(0, 3e3),
      fence
    ].join("\n"))
    .join("\n\n");

  const rubric = `You are a principal software engineer reviewing code written by an AI agent.

Score STRICTLY:
1. **executability** (0-10): Would this code run without errors? Check: import errors, undefined variables, missing deps, syntax errors. A 5 means "would run with minor fixes." A 10 means "copy-paste and it works."
2. **completeness** (0-10): Does it handle the FULL task, or just the happy path? A 5 means "handles the main case." A 10 means "production-ready."
3. **reusability** (0-10): Could this be saved as a tool and reused? A 5 means "works for this case." A 10 means "general-purpose tool."

Respond with JSON only: [{"dimension":"executability","score":N,"reasoning":"...","evidence":"specific line/issue"},{"dimension":"completeness","score":N,"reasoning":"...","evidence":"..."},{"dimension":"reusability","score":N,"reasoning":"...","evidence":"..."}]`;

  const resp = await tc.chat({
    model: "gpt-4o",
    messages: [
      { role: "system", content: rubric },
      { role: "user", content: [`Task: ${scenario.thesis}`, "", listing].join("\n") }
    ],
    temperature: 0.1,
    maxTokens: 1e3
  });
  return parseJudgeResponse("code_execution", resp);
};
257
/**
 * Judge that asks the model whether the agent stays coherent across turns
 * (consistency, progression, feedback integration).
 *
 * With fewer than two turns it returns a fixed neutral score of 5 without
 * calling the model. Each agent response is truncated to 1500 characters and
 * annotated with its full length.
 *
 * @param {object} tc Client exposing `chat(request)`.
 * @param {{ scenario: object, turns: object[] }} input
 * @returns {Promise<object[]>}
 */
var coherenceJudge = async (tc, input) => {
  const { scenario, turns } = input;
  if (turns.length < 2) {
    const neutral = {
      judgeName: "coherence",
      dimension: "coherence",
      score: 5,
      reasoning: "Single-turn scenario \u2014 coherence not fully testable."
    };
    return [neutral];
  }

  const transcript = turns
    .map((turn, index) => [
      `Turn ${index + 1}:`,
      `User: ${turn.userMessage}`,
      `Agent (${turn.agentResponse.length} chars): ${turn.agentResponse.slice(0, 1500)}`
    ].join("\n"))
    .join("\n\n---\n\n");

  const rubric = `You evaluate whether an AI agent maintains coherence across a multi-turn conversation.

Score STRICTLY:
1. **consistency** (0-10): Does the agent contradict itself across turns? Does it remember what it said/built earlier?
2. **progression** (0-10): Does each turn BUILD on the previous? Or does it start fresh? A 5 means "vaguely related." A 10 means "each turn clearly advances the work."
3. **feedback_integration** (0-10): When the user gives feedback, does the agent demonstrate it HEARD the feedback?

Respond with JSON only: [{"dimension":"consistency","score":N,"reasoning":"..."},{"dimension":"progression","score":N,"reasoning":"..."},{"dimension":"feedback_integration","score":N,"reasoning":"..."}]`;

  const resp = await tc.chat({
    model: "gpt-4o",
    messages: [
      { role: "system", content: rubric },
      { role: "user", content: [`Scenario: ${scenario.thesis}`, "", transcript].join("\n") }
    ],
    temperature: 0.1,
    maxTokens: 800
  });
  return parseJudgeResponse("coherence", resp);
};
294
/**
 * Red-team judge: asks the model to hunt for hallucination, false confidence
 * and the single worst failure in the agent's responses.
 *
 * The rubric states that all three dimensions are scored so that 10 is the
 * best outcome. Each agent response is truncated to 1500 characters.
 *
 * @param {object} tc Client exposing `chat(request)`.
 * @param {{ scenario: object, turns: object[] }} input
 * @returns {Promise<object[]>}
 */
var adversarialJudge = async (tc, input) => {
  const { scenario, turns } = input;
  const transcript = turns
    .map((turn, index) => [
      `Turn ${index + 1}:`,
      `User: ${turn.userMessage}`,
      `Agent: ${turn.agentResponse.slice(0, 1500)}`
    ].join("\n"))
    .join("\n\n---\n\n");

  const rubric = `You are a red-team evaluator. Find the WEAKEST point in the AI agent's responses. Be adversarial.

1. **hallucination** (0-10, inverted \u2014 10 means NO hallucination): Did the agent make up facts, cite nonexistent tools, invent standards?
2. **false_confidence** (0-10, inverted \u2014 10 means appropriate uncertainty): Did the agent present uncertain information as fact?
3. **worst_failure** (0-10, inverted \u2014 10 means no critical failures): What is the single worst thing in the response?

Be harsh. If everything is genuinely good, say so \u2014 but look hard first.

Respond with JSON only: [{"dimension":"hallucination","score":N,"reasoning":"...","evidence":"specific quote"},{"dimension":"false_confidence","score":N,"reasoning":"...","evidence":"..."},{"dimension":"worst_failure","score":N,"reasoning":"...","evidence":"..."}]`;

  const briefing = [
    `Persona: ${scenario.persona}`,
    `Scenario: ${scenario.thesis}`,
    "",
    transcript
  ].join("\n");

  const resp = await tc.chat({
    model: "gpt-4o",
    messages: [
      { role: "system", content: rubric },
      { role: "user", content: briefing }
    ],
    temperature: 0.2,
    maxTokens: 800
  });
  return parseJudgeResponse("adversarial", resp);
};
325
/**
 * Build a judge from a caller-supplied system prompt.
 *
 * The returned judge sends the persona, scenario thesis and transcript (agent
 * responses truncated to 2000 characters) and parses the reply with
 * `parseJudgeResponse` under `name`.
 *
 * @param {string} name Judge name attached to every parsed score.
 * @param {string} systemPrompt Rubric sent as the system message.
 * @param {{ model?: string, temperature?: number, maxTokens?: number }} [opts]
 *   Overrides; defaults are "gpt-4o", 0.1 and 1000.
 * @returns {(tc: object, input: { scenario: object, turns: object[] }) => Promise<object[]>}
 */
function createCustomJudge(name, systemPrompt, opts) {
  return async (tc, input) => {
    const { scenario, turns } = input;
    const transcript = turns
      .map((turn, index) => [
        `Turn ${index + 1}:`,
        `User: ${turn.userMessage}`,
        `Agent: ${turn.agentResponse.slice(0, 2e3)}`
      ].join("\n"))
      .join("\n\n---\n\n");
    const briefing = [
      `Persona: ${scenario.persona} (${scenario.label})`,
      `Scenario: ${scenario.thesis}`,
      "",
      transcript
    ].join("\n");

    const request = {
      model: opts?.model ?? "gpt-4o",
      messages: [
        { role: "system", content: systemPrompt },
        { role: "user", content: briefing }
      ],
      temperature: opts?.temperature ?? 0.1,
      maxTokens: opts?.maxTokens ?? 1e3
    };
    const resp = await tc.chat(request);
    return parseJudgeResponse(name, resp);
  };
}
350
/**
 * Assemble the standard judge panel: a domain-expert judge for `domain`
 * followed by the code-execution, coherence and adversarial judges.
 *
 * @param {string} domain Forwarded to `createDomainExpertJudge`.
 * @returns {Function[]} Judges in evaluation order.
 */
function defaultJudges(domain) {
  const panel = [createDomainExpertJudge(domain)];
  panel.push(codeExecutionJudge, coherenceJudge, adversarialJudge);
  return panel;
}
358
/**
 * Convert a judge model's chat response into score records.
 *
 * The first choice's message content is stripped of ```json fences, the
 * outermost `[...]` span is extracted, and each element becomes a
 * `{ judgeName, dimension, score, reasoning, evidence }` record with the
 * score clamped to the 0-10 range.
 *
 * Never throws: any failure (missing response, invalid JSON, non-array
 * payload, missing or non-numeric score) is logged and reported as a single
 * `parse_error` record with score 0, so callers can count and exclude it
 * instead of averaging a NaN.
 *
 * @param {string} judgeName Name attached to every returned record.
 * @param {object | null | undefined} resp Chat completion response.
 * @returns {Array<{ judgeName: string, dimension: string, score: number, reasoning: string, evidence?: unknown }>}
 */
function parseJudgeResponse(judgeName, resp) {
  // Resolved outside the try block, with optional chaining on `resp` itself,
  // so the error path below can never throw while building its report.
  const rawContent = resp?.choices?.[0]?.message?.content ?? "";
  const content = typeof rawContent === "string" ? rawContent : String(rawContent);
  try {
    let cleaned = content.replace(/```json\n?|\n?```/g, "").trim();
    // Models sometimes wrap the array in prose; keep only the bracketed span.
    const arrayMatch = cleaned.match(/\[[\s\S]*\]/);
    if (arrayMatch) cleaned = arrayMatch[0];
    const parsed = JSON.parse(cleaned);
    if (!Array.isArray(parsed)) {
      throw new TypeError("expected a JSON array of dimension scores");
    }
    return parsed.map((p) => {
      const numeric = typeof p.score === "number" ? p.score : Number.parseFloat(p.score);
      if (!Number.isFinite(numeric)) {
        throw new TypeError(`invalid score for dimension "${p.dimension}"`);
      }
      return {
        judgeName,
        dimension: p.dimension,
        score: Math.max(0, Math.min(10, numeric)),
        reasoning: p.reasoning ?? "",
        evidence: p.evidence
      };
    });
  } catch (err) {
    const message = err instanceof Error ? err.message : String(err);
    console.log(` [parse_error] ${judgeName}: ${message.slice(0, 50)} | response: ${content.slice(0, 100)}`);
    return [{
      judgeName,
      dimension: "parse_error",
      score: 0,
      reasoning: `Parse failed: ${message.slice(0, 100)}. Raw: ${content.slice(0, 200)}`
    }];
  }
}
383
+
384
+ // src/statistics.ts
385
// Dimensions the adversarial rubric labels as "inverted".
var INVERTED_DIMENSIONS = /* @__PURE__ */ new Set(["hallucination", "false_confidence", "worst_failure"]);

/**
 * Return a shallow copy of `scores` with every entry passed through as-is.
 *
 * Entries in INVERTED_DIMENSIONS are looked up, but both branches return the
 * score object unchanged, so this is currently an identity mapping.
 *
 * @param {Array<{ dimension: string, score: number }>} scores
 * @returns {Array<{ dimension: string, score: number }>} New array holding the same objects.
 */
function normalizeScores(scores) {
  return scores.map((entry) => {
    if (!INVERTED_DIMENSIONS.has(entry.dimension)) {
      return entry;
    }
    return entry;
  });
}
398
/**
 * Weighted arithmetic mean of `{ score, weight }` entries.
 *
 * A missing (null/undefined) weight counts as 1. Returns 0 for an empty list
 * or when the weights do not sum to a positive number.
 *
 * @param {Array<{ score: number, weight?: number }>} scores
 * @returns {number}
 */
function weightedMean(scores) {
  if (scores.length === 0) return 0;
  const totals = scores.reduce(
    (acc, { score, weight }) => {
      const effectiveWeight = weight ?? 1;
      acc.sum += score * effectiveWeight;
      acc.weight += effectiveWeight;
      return acc;
    },
    { sum: 0, weight: 0 }
  );
  if (totals.weight > 0) {
    return totals.sum / totals.weight;
  }
  return 0;
}
409
/**
 * Percentile-bootstrap confidence interval for the mean of `scores`.
 *
 * Draws 1000 resamples (with replacement, via Math.random) of the same size
 * as the input, sorts their means and reads the lower/upper percentiles.
 * Results vary between calls because the resampling is random.
 *
 * @param {number[]} scores
 * @param {number} [confidence=0.95] Coverage level of the interval.
 * @returns {{ mean: number, lower: number, upper: number }}
 *   All zeros for empty input; all equal to the value for a single score.
 */
function confidenceInterval(scores, confidence = 0.95) {
  const sampleSize = scores.length;
  if (sampleSize === 0) {
    return { mean: 0, lower: 0, upper: 0 };
  }
  if (sampleSize === 1) {
    const only = scores[0];
    return { mean: only, lower: only, upper: only };
  }

  let total = 0;
  for (const value of scores) total += value;
  const mean = total / sampleSize;

  const resamples = 1e3;
  const resampledMeans = Array.from({ length: resamples }, () => {
    let drawSum = 0;
    for (let draw = 0; draw < sampleSize; draw++) {
      drawSum += scores[Math.floor(Math.random() * sampleSize)];
    }
    return drawSum / sampleSize;
  });
  resampledMeans.sort((x, y) => x - y);

  const tail = (1 - confidence) / 2;
  const lowerIdx = Math.floor(tail * resamples);
  const upperIdx = Math.min(Math.floor((1 - tail) * resamples) - 1, resamples - 1);
  return { mean, lower: resampledMeans[lowerIdx], upper: resampledMeans[upperIdx] };
}
433
/**
 * Agreement coefficient across judges, computed as
 * `1 - observedDisagreement / expectedDisagreement` on squared differences.
 *
 * Scores are grouped per dimension in arrival order: a score joins the
 * dimension's latest group until that group holds as many ratings as there
 * are judge sets, then a new group is started. Observed disagreement is the
 * mean squared difference of pairs inside a group; expected disagreement is
 * the mean squared difference over all pairs of pooled ratings.
 *
 * @param {Array<Array<{ dimension: string, score: number }>>} judgeScores
 *   One array of scores per judge.
 * @returns {number} 1 when fewer than two judge sets are given, when no group
 *   has at least two ratings, or when the pooled ratings do not vary.
 */
function interRaterReliability(judgeScores) {
  const judgeCount = judgeScores.length;
  if (judgeCount < 2) return 1;

  const groupsByDimension = /* @__PURE__ */ new Map();
  for (const judgeSet of judgeScores) {
    for (const rating of judgeSet) {
      let groups = groupsByDimension.get(rating.dimension);
      if (groups === undefined) {
        groups = [];
        groupsByDimension.set(rating.dimension, groups);
      }
      const openGroup = groups[groups.length - 1];
      if (openGroup !== undefined && openGroup.length < judgeCount) {
        openGroup.push(rating.score);
      } else {
        groups.push([rating.score]);
      }
    }
  }

  const pooled = [];
  let withinSum = 0;
  let withinPairs = 0;
  for (const groups of groupsByDimension.values()) {
    for (const group of groups) {
      // A group rated by a single judge carries no agreement information.
      if (group.length < 2) continue;
      pooled.push(...group);
      for (let first = 0; first < group.length; first++) {
        for (let second = first + 1; second < group.length; second++) {
          withinSum += (group[first] - group[second]) ** 2;
          withinPairs += 1;
        }
      }
    }
  }
  if (withinPairs === 0 || pooled.length < 2) return 1;
  const observedDisagreement = withinSum / withinPairs;

  let overallSum = 0;
  let overallPairs = 0;
  for (let first = 0; first < pooled.length; first++) {
    for (let second = first + 1; second < pooled.length; second++) {
      overallSum += (pooled[first] - pooled[second]) ** 2;
      overallPairs += 1;
    }
  }
  const expectedDisagreement = overallPairs > 0 ? overallSum / overallPairs : 0;
  if (expectedDisagreement === 0) return 1;
  return 1 - observedDisagreement / expectedDisagreement;
}
474
/**
 * Two-sided Mann-Whitney U test using the normal approximation.
 *
 * Values are pooled and ranked with ties receiving their average rank. The
 * returned `u` is the smaller of the two U statistics and `p` is derived from
 * `normalCdf` without tie or continuity correction.
 *
 * NaN values are ignored: they cannot be ranked, and previously made the
 * tie-run scan below never advance (NaN !== NaN), hanging the function.
 *
 * @param {number[]} a First sample.
 * @param {number[]} b Second sample.
 * @returns {{ u: number, p: number }} `{ u: 0, p: 1 }` when either sample has
 *   no rankable values.
 */
function mannWhitneyU(a, b) {
  const sampleA = a.filter((v) => !Number.isNaN(v));
  const sampleB = b.filter((v) => !Number.isNaN(v));
  const n1 = sampleA.length;
  const n2 = sampleB.length;
  if (n1 === 0 || n2 === 0) return { u: 0, p: 1 };

  const combined = [
    ...sampleA.map((v) => ({ v, group: "a" })),
    ...sampleB.map((v) => ({ v, group: "b" }))
  ].sort((x, y) => x.v - y.v);

  // Assign 1-based ranks; each run of equal values shares the run's mean rank.
  const ranks = new Array(combined.length);
  let i = 0;
  while (i < combined.length) {
    let j = i + 1;
    while (j < combined.length && combined[j].v === combined[i].v) j++;
    const avgRank = (i + 1 + j) / 2;
    for (let k = i; k < j; k++) ranks[k] = avgRank;
    i = j;
  }

  let r1 = 0;
  for (let k = 0; k < combined.length; k++) {
    if (combined[k].group === "a") r1 += ranks[k];
  }

  const u1 = r1 - n1 * (n1 + 1) / 2;
  const u2 = n1 * n2 - u1;
  const u = Math.min(u1, u2);
  const mu = n1 * n2 / 2;
  const sigma = Math.sqrt(n1 * n2 * (n1 + n2 + 1) / 12);
  if (sigma === 0) return { u, p: 1 };
  const z = Math.abs(u - mu) / sigma;
  const p = 2 * (1 - normalCdf(z));
  return { u, p };
}
505
/**
 * Fraction of a target that has been reached, clamped to the range [0, 1].
 *
 * @param {number} current Progress so far.
 * @param {number} target Goal value; a non-positive target counts as met.
 * @returns {number}
 */
function partialCredit(current, target) {
  if (target <= 0) {
    return 1;
  }
  const ratio = current / target;
  const floored = Math.max(0, ratio);
  return Math.min(1, floored);
}
509
/**
 * Standard normal cumulative distribution function Φ(x).
 *
 * Uses Φ(x) = (1 + erf(x / √2)) / 2 with the Abramowitz & Stegun 7.1.26
 * polynomial approximation of erf. The approximation must be evaluated at
 * z = |x| / √2 throughout; the earlier code used |x| for the `t` term and
 * z only inside the exponential, which skewed results (Φ(1.96) came out near
 * 0.981 instead of 0.975).
 *
 * @param {number} x
 * @returns {number} Probability in [0, 1].
 */
function normalCdf(x) {
  const a1 = 0.254829592;
  const a2 = -0.284496736;
  const a3 = 1.421413741;
  const a4 = -1.453152027;
  const a5 = 1.061405429;
  const p = 0.3275911;
  const sign = x < 0 ? -1 : 1;
  const z = Math.abs(x) / Math.SQRT2;
  const t = 1 / (1 + p * z);
  // Horner evaluation of the degree-5 polynomial in t.
  const poly = ((((a5 * t + a4) * t + a3) * t + a2) * t + a1) * t;
  const erf = 1 - poly * Math.exp(-z * z);
  return 0.5 * (1 + sign * erf);
}
522
+
523
+ // src/executor.ts
524
/**
 * Run one multi-turn scenario against the model, extract artifacts from the
 * responses, run artifact checks and the judge panel, and aggregate a score.
 *
 * @param {object} tc Client exposing `chat({ model, messages, temperature, maxTokens })`.
 * @param {object} scenario Reads `id`, `persona`, `turns[].user`,
 *   `systemPromptAppend`, `artifactChecks` and `dimensions`.
 * @param {object} config Reads `systemPrompt`, `judges`, and optionally
 *   `model` (default "gpt-4o"), `blockPattern`, `toolCallPatterns` and
 *   `artifactChecker`.
 * @returns {Promise<object>} Scenario result: turns, artifact check results,
 *   all judge scores, the number of judge errors, the overall score, total
 *   duration and the collected artifacts.
 *
 * Judges run sequentially with a 3s pause after each success and up to three
 * attempts each (10s, then 20s back-off); a judge that keeps failing is
 * recorded as an `error` score and excluded from the overall score.
 */
async function executeScenario(tc, scenario, config) {
  // Caller-supplied patterns may lack the `g` flag or carry a stale
  // lastIndex; iterating a fresh global copy with matchAll can neither loop
  // forever nor skip matches.
  const matchesOf = (pattern, text) => {
    const flags = pattern.flags.includes("g") ? pattern.flags : `${pattern.flags}g`;
    return text.matchAll(new RegExp(pattern.source, flags));
  };
  // Parse "key: value" lines from the inside of a ::: block.
  const parseFields = (blockBody) => {
    const fields = {};
    for (const line of (blockBody ?? "").split("\n")) {
      const idx = line.indexOf(":");
      if (idx > 0) fields[line.slice(0, idx).trim()] = line.slice(idx + 1).trim();
    }
    return fields;
  };

  const startTime = Date.now();
  const model = config.model ?? "gpt-4o";
  const systemPrompt = [
    config.systemPrompt,
    scenario.systemPromptAppend ?? ""
  ].filter(Boolean).join("\n\n");
  const messages = [
    { role: "system", content: systemPrompt }
  ];
  const turns = [];
  const allCodeBlocks = [];
  const allBlocks = [];
  const allToolCalls = [];
  const blockRe = config.blockPattern ?? /:::(\w+)\s*\n([\s\S]*?)\n\s*:::/g;
  const toolCallPatterns = config.toolCallPatterns ?? [];

  for (const [i, turn] of scenario.turns.entries()) {
    const turnStart = Date.now();
    messages.push({ role: "user", content: turn.user });
    const resp = await tc.chat({
      model,
      messages,
      temperature: 0.4,
      maxTokens: 3e3
    });
    const content = resp.choices?.[0]?.message?.content ?? "";
    messages.push({ role: "assistant", content });

    // Code blocks are tracked per turn so `containsCode` describes this
    // response only, not everything seen since the first turn.
    const turnCodeBlocks = [];
    for (const codeMatch of content.matchAll(/```(\w+)?\n([\s\S]*?)```/g)) {
      turnCodeBlocks.push({ language: codeMatch[1] ?? "text", code: codeMatch[2] });
    }
    allCodeBlocks.push(...turnCodeBlocks);

    const turnBlocks = [];
    for (const blockMatch of matchesOf(blockRe, content)) {
      const fields = parseFields(blockMatch[2]);
      allBlocks.push({ type: blockMatch[1], fields });
      turnBlocks.push({ type: blockMatch[1], title: fields.title ?? "" });
    }

    let hasToolCall = false;
    for (const pattern of toolCallPatterns) {
      for (const toolMatch of matchesOf(pattern, content)) {
        allToolCalls.push(toolMatch[0]);
        hasToolCall = true;
      }
    }

    turns.push({
      turnIndex: i,
      userMessage: turn.user,
      agentResponse: content,
      durationMs: Date.now() - turnStart,
      blocksExtracted: turnBlocks,
      containsCode: turnCodeBlocks.length > 0,
      containsToolCall: hasToolCall
    });
  }

  const artifacts = {
    vaultFiles: [],
    blocksExtracted: allBlocks,
    codeBlocks: allCodeBlocks,
    toolCalls: allToolCalls
  };

  const artifactResults = (scenario.artifactChecks ?? []).map((check) => {
    // A custom checker wins whenever it returns a truthy result.
    if (config.artifactChecker) {
      const custom = config.artifactChecker(check, artifacts);
      if (custom) return { check, ...custom };
    }
    switch (check.type) {
      case "block_extracted": {
        const count = allBlocks.filter((b) => b.type === check.target).length;
        return {
          check,
          passed: count >= (check.minCount ?? 1),
          detail: `Found ${count} ${check.target} blocks (need ${check.minCount ?? 1})`
        };
      }
      case "code_valid": {
        const hasCode = allCodeBlocks.some(
          (b) => b.language === check.target || b.code.includes(check.target)
        );
        return { check, passed: hasCode, detail: hasCode ? "Code block found" : "No matching code" };
      }
      default:
        return { check, passed: false, detail: `Check type "${check.type}" requires live environment` };
    }
  });

  const judgeInput = { scenario, turns, artifacts };
  const judgeResults = [];
  for (const judge of config.judges) {
    let lastErr = "";
    for (let attempt = 0; attempt < 3; attempt++) {
      try {
        if (attempt > 0) {
          const wait = attempt * 1e4;
          console.log(` judge retry ${attempt}/2 (waiting ${wait / 1e3}s)`);
          await new Promise((r) => setTimeout(r, wait));
        }
        const scores = await judge(tc, judgeInput);
        judgeResults.push(scores);
        // Fixed pause between judges.
        await new Promise((r) => setTimeout(r, 3e3));
        break;
      } catch (err) {
        lastErr = err instanceof Error ? err.message : String(err);
        if (attempt === 2) {
          judgeResults.push([{
            judgeName: "unknown",
            dimension: "error",
            score: 0,
            reasoning: `Judge failed after 3 attempts: ${lastErr.slice(0, 200)}`
          }]);
        }
      }
    }
  }

  const allScores = judgeResults.flat();
  const isFailure = (s) => s.dimension === "parse_error" || s.dimension === "error";
  const errorScores = allScores.filter(isFailure);
  const validScores = allScores.filter((s) => !isFailure(s));
  const normalized = normalizeScores(validScores);

  // Every dimension, listed by the scenario or not, currently weighs 1.
  const weightMap = /* @__PURE__ */ new Map();
  for (const dim of scenario.dimensions ?? []) {
    weightMap.set(dim, 1);
  }
  const overallScore = weightedMean(
    normalized.map((s) => ({
      score: s.score,
      weight: weightMap.get(s.dimension) ?? 1
    }))
  );

  return {
    scenarioId: scenario.id,
    persona: scenario.persona,
    turns,
    artifactResults,
    judgeScores: allScores,
    judgeErrors: errorScores.length,
    overallScore,
    totalDurationMs: Date.now() - startTime,
    artifacts
  };
}
673
+
674
+ // src/benchmark.ts
675
/**
 * Runs a list of scenarios through `executeScenario`, prints a console
 * report and returns an aggregated benchmark summary.
 */
var BenchmarkRunner = class {
  tc;
  config;

  /**
   * @param {object} tc Chat client handed to every scenario execution.
   * @param {object} config Reads `scenarios`, `judges`, `systemPrompt` and the
   *   optional `model`, `passThreshold` (default 6), `generation`,
   *   `promptVersion`, `blockPattern`, `toolCallPatterns`, `artifactChecker`.
   */
  constructor(tc, config) {
    this.tc = tc;
    this.config = config;
  }

  /**
   * Execute the scenarios sequentially and summarize the results.
   *
   * @param {object[]} [scenarios] Overrides `config.scenarios` when given.
   * @returns {Promise<object>} Report with `timestamp`, `generation`,
   *   `promptVersion`, `scenarioCount`, per-scenario `results` and a `summary`
   *   (`overallAvg`, `byPersona`, `byDimension`, `weakest`, `strongest`).
   */
  async run(scenarios) {
    const toRun = scenarios ?? this.config.scenarios;
    const passThreshold = this.config.passThreshold ?? 6;
    console.log("=".repeat(70));
    console.log(" AGENT EVAL \u2014 BENCHMARK");
    console.log(" Multi-turn scenarios x Multi-judge panel");
    console.log("=".repeat(70));
    console.log(`Scenarios: ${toRun.length}`);
    console.log(`Judges: ${this.config.judges.length}`);
    console.log(`Model: ${this.config.model ?? "gpt-4o"}`);
    console.log();

    const results = [];
    for (let i = 0; i < toRun.length; i++) {
      const scenario = toRun[i];
      console.log(`[${i + 1}/${toRun.length}] ${scenario.id} (${scenario.persona})`);
      console.log(` thesis: ${scenario.thesis}`);
      console.log(` turns: ${scenario.turns.length}`);
      const result = await executeScenario(this.tc, scenario, {
        systemPrompt: this.config.systemPrompt,
        model: this.config.model,
        judges: this.config.judges,
        // Forward the extraction hooks executeScenario understands; without
        // this they could not be used through the runner at all.
        blockPattern: this.config.blockPattern,
        toolCallPatterns: this.config.toolCallPatterns,
        artifactChecker: this.config.artifactChecker
      });
      results.push(result);

      for (const turn of result.turns) {
        const codeIcon = turn.containsCode ? "[code]" : "";
        const toolIcon = turn.containsToolCall ? "[tool]" : "";
        const blockCount = turn.blocksExtracted.length;
        const blockIcon = blockCount > 0 ? `[blocks:${blockCount}]` : "";
        console.log(` turn ${turn.turnIndex + 1}: ${(turn.durationMs / 1e3).toFixed(1)}s ${codeIcon} ${toolIcon} ${blockIcon} (${turn.agentResponse.length} chars)`);
      }
      for (const ar of result.artifactResults) {
        const icon = ar.passed ? "+" : "X";
        console.log(` artifact: [${icon}] ${ar.check.description} \u2014 ${ar.detail}`);
      }
      console.log(` judges:`);
      const byJudge = {};
      for (const js of result.judgeScores) {
        if (!byJudge[js.judgeName]) byJudge[js.judgeName] = { scores: [], dimensions: [] };
        byJudge[js.judgeName].scores.push(js.score);
        byJudge[js.judgeName].dimensions.push(`${js.dimension}=${js.score}`);
      }
      for (const [name, data] of Object.entries(byJudge)) {
        const avg = (data.scores.reduce((a, b) => a + b, 0) / data.scores.length).toFixed(1);
        console.log(` ${name.padEnd(16)} avg=${avg} [${data.dimensions.join(", ")}]`);
      }
      console.log(` OVERALL: ${result.overallScore.toFixed(1)}/10 (${(result.totalDurationMs / 1e3).toFixed(0)}s)`);
      console.log();
    }

    // Aggregate per persona (average score, pass count) and per dimension.
    const byPersona = {};
    const byDimension = {};
    for (const r of results) {
      if (!byPersona[r.persona]) byPersona[r.persona] = { avg: 0, passed: 0, total: 0 };
      byPersona[r.persona].total++;
      byPersona[r.persona].avg += r.overallScore;
      if (r.overallScore >= passThreshold) byPersona[r.persona].passed++;
      for (const js of r.judgeScores) {
        if (!byDimension[js.dimension]) byDimension[js.dimension] = { avg: 0, scores: [] };
        byDimension[js.dimension].scores.push(js.score);
      }
    }
    for (const p of Object.values(byPersona)) {
      p.avg = p.total > 0 ? p.avg / p.total : 0;
    }
    for (const d of Object.values(byDimension)) {
      d.avg = d.scores.length > 0 ? d.scores.reduce((a, b) => a + b, 0) / d.scores.length : 0;
    }

    // Three lowest- and three highest-scoring scenarios, with the individual
    // judge scores that explain them.
    const sorted = [...results].sort((a, b) => a.overallScore - b.overallScore);
    const weakest = sorted.slice(0, 3).map((r) => ({
      scenario: r.scenarioId,
      score: r.overallScore,
      reason: r.judgeScores.filter((s) => s.score < passThreshold).map((s) => `${s.dimension}=${s.score}`).join(", ") || "close to threshold"
    }));
    const strongest = sorted.slice(-3).reverse().map((r) => ({
      scenario: r.scenarioId,
      score: r.overallScore,
      reason: r.judgeScores.filter((s) => s.score >= 9).map((s) => `${s.dimension}=${s.score}`).join(", ") || "consistently strong"
    }));

    console.log("=".repeat(70));
    console.log(" RESULTS");
    console.log("=".repeat(70));
    const overallAvg = results.length > 0 ? results.reduce((s, r) => s + r.overallScore, 0) / results.length : 0;
    console.log(`Overall: ${overallAvg.toFixed(1)}/10`);
    console.log();
    console.log("By persona:");
    for (const [name, data] of Object.entries(byPersona)) {
      console.log(` ${name.padEnd(20)} ${data.avg.toFixed(1)}/10 (${data.passed}/${data.total} passed)`);
    }
    console.log();
    console.log("By dimension:");
    const dimEntries = Object.entries(byDimension).sort((a, b) => a[1].avg - b[1].avg);
    for (const [name, data] of dimEntries) {
      const min = Math.min(...data.scores);
      const max = Math.max(...data.scores);
      console.log(` ${name.padEnd(24)} avg=${data.avg.toFixed(1)} range=[${min}-${max}] n=${data.scores.length}`);
    }
    console.log();
    if (weakest.length > 0) {
      console.log("Weakest:");
      for (const w of weakest) {
        console.log(` ${w.scenario}: ${w.score.toFixed(1)} \u2014 ${w.reason}`);
      }
      console.log();
    }

    return {
      timestamp: (/* @__PURE__ */ new Date()).toISOString(),
      generation: this.config.generation ?? 1,
      promptVersion: this.config.promptVersion ?? "v1",
      scenarioCount: toRun.length,
      results,
      summary: { overallAvg, byPersona, byDimension, weakest, strongest }
    };
  }
};
795
+
796
+ // src/metrics.ts
797
// Price table keyed by model name. `estimateCost` multiplies these rates by
// token counts divided by 1000, i.e. the rates are applied per 1K tokens.
// Currency is presumably USD — confirm against the pricing source.
var MODEL_PRICING = {
  "gpt-4o": {
    input: 0.0025,
    output: 0.01
  },
  "gpt-4o-mini": {
    input: 0.00015,
    output: 0.0006
  },
  "gpt-4-turbo": {
    input: 0.01,
    output: 0.03
  },
  "claude-sonnet-4-20250514": {
    input: 0.003,
    output: 0.015
  },
  "claude-opus-4-20250514": {
    input: 0.015,
    output: 0.075
  },
  "claude-3-haiku-20240307": {
    input: 0.00025,
    output: 0.00125
  }
};
805
/**
 * Rough token estimate: one token per four characters, rounded up.
 *
 * @param {string} text
 * @returns {number} 0 for an empty string.
 */
function estimateTokens(text) {
  const charsPerToken = 4;
  const characterCount = text.length;
  return Math.ceil(characterCount / charsPerToken);
}
808
/**
 * Estimate the cost of a request from token counts and the MODEL_PRICING
 * table, whose rates are applied per 1000 tokens.
 *
 * @param {number} inputTokens
 * @param {number} outputTokens
 * @param {string} model Key into MODEL_PRICING.
 * @returns {number} Estimated cost, or 0 when the model has no pricing entry.
 */
function estimateCost(inputTokens, outputTokens, model) {
  // Own-property check: names such as "constructor" or "toString" would
  // otherwise resolve through Object.prototype and produce NaN.
  if (!Object.prototype.hasOwnProperty.call(MODEL_PRICING, model)) return 0;
  const pricing = MODEL_PRICING[model];
  if (!pricing) return 0;
  return inputTokens / 1e3 * pricing.input + outputTokens / 1e3 * pricing.output;
}
813
/**
 * Running tally of input/output tokens and estimated cost for one model.
 */
var TokenCounter = class {
  totalInput = 0;
  totalOutput = 0;
  totalCost = 0;
  model;

  /**
   * @param {string} [model="gpt-4o"] Model name passed to `estimateCost`.
   */
  constructor(model = "gpt-4o") {
    this.model = model;
  }

  /**
   * Add one turn's token counts to the totals.
   * @param {number} inputTokens
   * @param {number} outputTokens
   * @returns {number} Estimated cost of this turn alone.
   */
  record(inputTokens, outputTokens) {
    this.totalInput += inputTokens;
    this.totalOutput += outputTokens;
    const turnCost = estimateCost(inputTokens, outputTokens, this.model);
    this.totalCost += turnCost;
    return turnCost;
  }

  /**
   * Estimate token counts from raw text with `estimateTokens`, then record them.
   * @param {string} inputText
   * @param {string} outputText
   * @returns {{ inputTokens: number, outputTokens: number, cost: number }}
   */
  recordFromText(inputText, outputText) {
    const usage = {
      inputTokens: estimateTokens(inputText),
      outputTokens: estimateTokens(outputText)
    };
    usage.cost = this.record(usage.inputTokens, usage.outputTokens);
    return usage;
  }

  /** @returns {number} Input tokens recorded so far. */
  getTotalInput() {
    return this.totalInput;
  }

  /** @returns {number} Output tokens recorded so far. */
  getTotalOutput() {
    return this.totalOutput;
  }

  /** @returns {number} Sum of all per-turn cost estimates. */
  getTotalCost() {
    return this.totalCost;
  }
};
846
/**
 * Collects per-turn metrics for one workspace by combining caller-supplied
 * response measurements with a snapshot of the product state fetched through
 * the client.
 */
var MetricsCollector = class {
  client;
  workspaceId;
  metrics = [];
  /**
   * @param {object} client - Product client; `getTasks`, `getEvents`,
   *   `getApprovals` and `getVaultTree` are each called with the workspace id.
   * @param {string} workspaceId - Workspace whose state is sampled.
   */
  constructor(client, workspaceId) {
    this.client = client;
    this.workspaceId = workspaceId;
  }
  /**
   * Collect metrics after a turn completes and append them to the history.
   *
   * @param {number} turn - Turn number.
   * @param {number} responseLatencyMs
   * @param {number} responseChars
   * @param {number} codeBlocksProduced
   * @param {number} blocksExtracted
   * @param {number} completionCriteriaMet - Numerator of the completion ratio.
   * @param {number} completionCriteriaTotal - Denominator; 0 yields 0%.
   * @param {*} qualityScore - Stored as given (undefined when omitted).
   * @param {number} [inputTokens=0]
   * @param {number} [outputTokens=0]
   * @param {number} [estimatedCostUsd=0] - Cost of this turn only.
   * @returns {Promise<object>} The metrics record that was stored.
   */
  async collect(turn, responseLatencyMs, responseChars, codeBlocksProduced, blocksExtracted, completionCriteriaMet, completionCriteriaTotal, qualityScore, inputTokens = 0, outputTokens = 0, estimatedCostUsd = 0) {
    const state = await this.getState();
    // Read the previous record only after the await above, immediately before
    // the push below, so the running total is based on the latest stored turn.
    const previous = this.metrics[this.metrics.length - 1];
    const priorCostUsd = previous ? previous.totalCostUsd : 0;
    const m = {
      turn,
      timestamp: (/* @__PURE__ */ new Date()).toISOString(),
      tasks: state.tasks,
      events: state.events,
      proposals: state.proposals,
      vaultFiles: state.vaultFiles.length,
      responseLatencyMs,
      responseChars,
      codeBlocksProduced,
      blocksExtracted,
      qualityScore,
      inputTokens,
      outputTokens,
      estimatedCostUsd,
      // Cumulative cost over every collected turn, including this one
      // (previously this merely repeated the per-turn estimate).
      totalCostUsd: priorCostUsd + estimatedCostUsd,
      completionPercent: completionCriteriaTotal > 0 ? completionCriteriaMet / completionCriteriaTotal * 100 : 0
    };
    this.metrics.push(m);
    return m;
  }
  /**
   * Get current product state.
   *
   * Fetches tasks, events, approvals and the vault tree concurrently.
   *
   * @returns {Promise<object>} Task and event counts, approval counts grouped
   *   by status, the vault tree as returned by the client, and `codeBlocks` /
   *   `generations`, which are always 0 here.
   */
  async getState() {
    const [tasks, events, approvals, vaultFiles] = await Promise.all([
      this.client.getTasks(this.workspaceId),
      this.client.getEvents(this.workspaceId),
      this.client.getApprovals(this.workspaceId),
      this.client.getVaultTree(this.workspaceId)
    ]);
    return {
      tasks: tasks.length,
      events: events.length,
      proposals: {
        pending: approvals.filter((a) => a.status === "pending").length,
        approved: approvals.filter((a) => a.status === "approved").length,
        rejected: approvals.filter((a) => a.status === "rejected").length
      },
      vaultFiles,
      codeBlocks: 0,
      generations: 0
    };
  }
  /** Get all collected metrics (shallow copy of the internal list). */
  getMetrics() {
    return [...this.metrics];
  }
  /** Get convergence curve (completion% over turns). */
  getConvergenceCurve() {
    return this.metrics.map((m) => m.completionPercent);
  }
};
908
+
909
+ // src/convergence.ts
910
/**
 * Scores a list of completion criteria against successive state snapshots and
 * keeps the per-turn results.
 *
 * A criterion with a `progress` function contributes that function's result
 * clamped to [0, 1]; otherwise its `check` function contributes 1 when truthy
 * and 0 when falsy.
 */
var ConvergenceTracker = class {
  criteria;
  history = [];
  /**
   * @param {Array<object>} criteria - Objects with a `name` and either a
   *   `progress(state)` or a `check(state)` function.
   */
  constructor(criteria) {
    this.criteria = criteria;
  }
  /**
   * Score every criterion against `state` and append the result to the history.
   *
   * @param {number} turn - Turn label stored with the history entry.
   * @param {*} state - Passed unchanged to each criterion function.
   * @returns {{completionPercent: number, complete: boolean, criteriaStatus: object}}
   *   `criteriaStatus` maps each criterion name to its clamped credit or to
   *   the raw value returned by `check`. With no criteria the result is 100%.
   */
  record(turn, state) {
    const criteriaStatus = {};
    const criteriaCount = this.criteria.length;
    let totalCredit = 0;
    for (let i = 0; i < this.criteria.length; i++) {
      const criterion = this.criteria[i];
      let status;
      let credit;
      if (criterion.progress) {
        const raw = criterion.progress(state);
        credit = Math.min(1, Math.max(0, raw));
        status = credit;
      } else {
        status = criterion.check(state);
        credit = status ? 1 : 0;
      }
      criteriaStatus[criterion.name] = status;
      totalCredit += credit;
    }
    let completionPercent = 100;
    if (criteriaCount > 0) {
      completionPercent = totalCredit / criteriaCount * 100;
    }
    this.history.push({ turn, completionPercent, criteriaStatus });
    const complete = totalCredit >= criteriaCount;
    return { completionPercent, complete, criteriaStatus };
  }
  /** Completion percentages in the order they were recorded. */
  getCurve() {
    const curve = [];
    for (const entry of this.history) {
      curve.push(entry.completionPercent);
    }
    return curve;
  }
  /** Shallow copy of the recorded history, including per-criterion status. */
  getHistory() {
    return this.history.slice();
  }
  /** Turn of the first history entry at exactly 100%, or null if none. */
  getTurnToCompletion() {
    for (const entry of this.history) {
      if (entry.completionPercent === 100) {
        return entry.turn ?? null;
      }
    }
    return null;
  }
};
953
+
954
+ // src/registry.ts
955
/**
 * In-memory registry of scenarios.
 *
 * Scenarios can be added either as ScenarioFile records (converted with
 * `toScenario`, and also kept in their original form so they can be queried
 * by category) or as ready-made Scenario objects.
 */
var ScenarioRegistry = class {
  scenarios = [];
  scenarioFiles = [];
  /**
   * Register scenarios from ScenarioFile format.
   *
   * @param {Array<object>} files - Each file is stored as-is and also
   *   converted with `toScenario` and appended to the scenario list.
   */
  registerFiles(files) {
    this.scenarioFiles.push(...files);
    this.scenarios.push(...files.map(toScenario));
  }
  /**
   * Register pre-built Scenario objects directly.
   *
   * Scenarios added this way have no ScenarioFile record, so they are not
   * visible to `byCategory` or `listCategories`.
   *
   * @param {Array<object>} scenarios
   */
  register(scenarios) {
    this.scenarios.push(...scenarios);
  }
  /** Get all scenarios (shallow copy of the internal list). */
  all() {
    return [...this.scenarios];
  }
  /**
   * Get scenarios filtered by category.
   *
   * Only file-registered scenarios are considered; each match is converted
   * again with `toScenario`, so the returned objects are fresh instances.
   *
   * @param {string} category
   * @returns {Array<object>}
   */
  byCategory(category) {
    const fromFiles = this.scenarioFiles.filter((sf) => sf.category === category).map(toScenario);
    return fromFiles;
  }
  /**
   * List all categories with counts.
   *
   * @returns {Array<{category: string, count: number}>}
   */
  listCategories() {
    // Null-prototype accumulator: with a plain `{}` a category named
    // "constructor" or "toString" would start from the inherited function
    // instead of 0, and "__proto__" would be swallowed by the prototype setter.
    const counts = /* @__PURE__ */ Object.create(null);
    for (const sf of this.scenarioFiles) {
      counts[sf.category] = (counts[sf.category] ?? 0) + 1;
    }
    return Object.entries(counts).map(([category, count]) => ({ category, count }));
  }
  /** Get scenarios filtered by persona. */
  byPersona(persona) {
    return this.scenarios.filter((s) => s.persona === persona);
  }
  /** Get a single scenario by ID (undefined when not found). */
  byId(id) {
    return this.scenarios.find((s) => s.id === id);
  }
  /** Count total scenarios. */
  get count() {
    return this.scenarios.length;
  }
};
997
/**
 * Convert a ScenarioFile record into a Scenario object.
 *
 * `dimensions` always starts as an empty array. Control scenarios
 * (`sf.isControl` truthy) get a fixed generic assistant prompt as
 * `systemPromptAppend`; all others get `undefined`.
 *
 * @param {object} sf - ScenarioFile record.
 * @returns {object} Scenario built from the record.
 */
function toScenario(sf) {
  const { id, persona, label, thesis, turns, artifactChecks, isControl } = sf;
  let systemPromptAppend;
  if (isControl) {
    systemPromptAppend = "You are a helpful AI assistant.";
  }
  return {
    id,
    persona,
    label,
    thesis,
    dimensions: [],
    turns,
    artifactChecks,
    systemPromptAppend
  };
}
1009
+
1010
+ // src/driver.ts
1011
/**
 * Drives a product conversation on behalf of a simulated persona: a "driver"
 * LLM writes each user message, the product client sends it, pending
 * approvals are resolved, and completion criteria are scored after every turn.
 */
var AgentDriver = class {
  tc;
  client;
  driverModel;
  productContext;
  /**
   * @param {object} tc - LLM client; `tc.chat({ model, messages, temperature, maxTokens })`
   *   is used to generate the persona's messages.
   * @param {object} config
   * @param {object} config.client - Product client used for signup, login,
   *   workspace/thread creation, chat and approvals.
   * @param {string} [config.driverModel="claude-sonnet-4-6"] - Model for the driver LLM.
   * @param {string} [config.productContext=""] - Optional text added to the driver prompt.
   */
  constructor(tc, config) {
    this.tc = tc;
    this.client = config.client;
    this.driverModel = config.driverModel ?? "claude-sonnet-4-6";
    this.productContext = config.productContext ?? "";
  }
  /**
   * Run a persona through the product.
   *
   * Signs up a fresh account, creates a workspace and thread, then loops for
   * up to `persona.maxTurns` turns until the driver answers DONE or every
   * completion criterion is satisfied.
   *
   * @param {object} persona - Reads `id`, `role`, `goal`, `maxTurns`,
   *   `completionCriteria` and optional `feedbackPatterns`.
   * @returns {Promise<object>} Completion flag, turns to completion, per-turn
   *   metrics, final state and convergence curve. `totalCostUsd` is always 0
   *   and `finalQualityScore` always null (neither is tracked here).
   */
  async run(persona) {
    // The prompt shows the sentinel as `"DONE"`, so drivers often echo it with
    // quotes or a trailing period; accept those forms as well as the bare word.
    const donePattern = /^["'`]?DONE["'`]?[.!]?$/;
    const email = `eval-driver-${Date.now()}@test.agent-eval.local`;
    await this.client.signup(`Driver ${persona.role}`, email, "eval-driver-pass");
    await this.client.login(email, "eval-driver-pass");
    const workspaceId = await this.client.createWorkspace(`${persona.role} Eval`);
    const threadId = await this.client.createThread(workspaceId);
    const metrics = new MetricsCollector(this.client, workspaceId);
    const convergence = new ConvergenceTracker(persona.completionCriteria);
    const turnMetrics = [];
    const conversationHistory = [];
    let completed = false;
    let turnsToCompletion = null;
    for (let turn = 1; turn <= persona.maxTurns; turn++) {
      const state = await metrics.getState();
      const userMessage = await this.decideNextMessage(persona, state, conversationHistory);
      if (donePattern.test(userMessage)) {
        completed = true;
        // The DONE turn sends nothing to the product, so it is not counted.
        turnsToCompletion = turn - 1;
        break;
      }
      const turnStart = Date.now();
      const response = await this.client.chat(workspaceId, threadId, userMessage);
      const latency = Date.now() - turnStart;
      conversationHistory.push(
        { role: "user", content: userMessage },
        { role: "assistant", content: response.text }
      );
      // Fixed 2s pause before looking for approvals.
      await new Promise((r) => setTimeout(r, 2e3));
      await this.handleApprovals(persona, workspaceId, state);
      const postState = await metrics.getState();
      const conv = convergence.record(turn, postState);
      const codeBlockCount = (response.text.match(/```\w+\n/g) || []).length;
      // Sum the credit per criterion (numbers are partial credit, anything
      // else is pass/fail). Counting every truthy status as a full pass
      // inflated the per-turn completion percentage whenever a criterion
      // reported partial progress.
      const creditEarned = Object.values(conv.criteriaStatus).reduce(
        (sum, status) => sum + (typeof status === "number" ? status : status ? 1 : 0),
        0
      );
      const m = await metrics.collect(
        turn,
        latency,
        response.text.length,
        codeBlockCount,
        response.blocks.length,
        creditEarned,
        persona.completionCriteria.length
      );
      turnMetrics.push(m);
      // "+" fully met, "~" partial credit, "-" not met.
      const criteriaStr = Object.entries(conv.criteriaStatus).map(([k, v]) => {
        const partial = typeof v === "number" && v > 0 && v < 1;
        return `${k}:${partial ? "~" : v ? "+" : "-"}`;
      }).join(" ");
      console.log(` [turn ${turn}] ${conv.completionPercent.toFixed(0)}% \u2014 ${criteriaStr} (${(latency / 1e3).toFixed(1)}s)`);
      if (conv.complete) {
        completed = true;
        turnsToCompletion = turn;
        console.log(` COMPLETE at turn ${turn}`);
        break;
      }
    }
    const finalState = await metrics.getState();
    return {
      personaId: persona.id,
      completed,
      turnsToCompletion,
      totalTurns: turnMetrics.length,
      metrics: turnMetrics,
      finalState,
      convergenceCurve: convergence.getCurve(),
      totalCostUsd: 0,
      finalQualityScore: null
    };
  }
  /**
   * Use the driver LLM to decide what the "user" says next.
   *
   * @param {object} persona - Supplies `role` and `goal` for the prompt.
   * @param {object} state - Current product state as returned by the metrics collector.
   * @param {Array<{role: string, content: string}>} history - Conversation so far.
   * @returns {Promise<string>} The trimmed driver reply ("" when the response has no content).
   */
  async decideNextMessage(persona, state, history) {
    // Truncate to keep the prompt bounded: last reply to 2000 chars, and the
    // last six messages to 500 chars each.
    const lastResponse = history.length > 0 ? history[history.length - 1].content.slice(0, 2e3) : "(no conversation yet \u2014 this is the first message)";
    const recentHistory = history.slice(-6).map(
      (h) => `${h.role}: ${h.content.slice(0, 500)}`
    ).join("\n\n");
    const resp = await this.tc.chat({
      model: this.driverModel,
      messages: [{
        role: "system",
        content: `You are playing the role of a ${persona.role} testing an AI agent.
Your goal: ${persona.goal}

${this.productContext ? `Product context:
${this.productContext}
` : ""}
Current state:
- Tasks: ${state.tasks}
- Events: ${state.events}
- Proposals: pending=${state.proposals.pending}, approved=${state.proposals.approved}, rejected=${state.proposals.rejected}
- Vault files: ${state.vaultFiles.length} (${state.vaultFiles.slice(0, 10).join(", ")}${state.vaultFiles.length > 10 ? "..." : ""})

Completion criteria met: ${this.describeCompletion(persona, state)}

Decide what to do next:
1. If completion is 100% \u2014 respond with exactly "DONE"
2. If a proposal is pending \u2014 approve or reject it (with reason)
3. If the agent is on track \u2014 push for the next deliverable
4. If the agent is off track \u2014 give specific corrective feedback
5. If this is the first message \u2014 start with a clear, actionable request

Output ONLY your next message to the agent. Be specific. Be realistic.
Don't be patient \u2014 a real ${persona.role} wouldn't accept vague answers.`
      }, {
        role: "user",
        content: recentHistory ? `Recent conversation:
${recentHistory}

The agent just said:
${lastResponse}` : "No conversation yet. Send your opening message."
      }],
      temperature: 0.5,
      maxTokens: 500
    });
    const content = resp.choices?.[0]?.message?.content ?? "";
    return content.trim();
  }
  /**
   * Handle pending approvals based on persona feedback patterns.
   *
   * A pending action is rejected with the pattern's `response` when its title
   * contains a pattern's `trigger` (case-insensitive); otherwise it is approved.
   *
   * @param {object} persona - Optional `feedbackPatterns` of `{ trigger, response }`.
   * @param {string} workspaceId
   * @param {object} _state - Unused; kept for signature compatibility.
   * @returns {Promise<void>}
   */
  async handleApprovals(persona, workspaceId, _state) {
    const approvals = await this.client.getApprovals(workspaceId);
    const pending = approvals.filter((a) => a.status === "pending");
    for (const action of pending) {
      // An approval without a title cannot match any trigger; treat it as an
      // empty title instead of throwing on `.toLowerCase()`.
      const title = String(action.title ?? "").toLowerCase();
      const rejection = persona.feedbackPatterns?.find((fp) => title.includes(fp.trigger.toLowerCase()));
      if (rejection) {
        await this.client.rejectAction(workspaceId, action.id, rejection.response);
        console.log(` rejected: ${action.title} \u2014 ${rejection.response.slice(0, 60)}`);
      } else {
        await this.client.approveAction(workspaceId, action.id);
        console.log(` approved: ${action.title}`);
      }
    }
  }
  /**
   * Describe which completion criteria are met.
   *
   * @param {object} persona - Supplies `completionCriteria`.
   * @param {object} state - Passed to each criterion function.
   * @returns {string} "<met>/<total> — name: MET|NOT MET, ..."
   */
  describeCompletion(persona, state) {
    // Count from the boolean results rather than by searching the rendered
    // text for "MET"/"NOT", which miscounted any criterion whose name
    // contains "NOT" (e.g. "NOTES written").
    let metCount = 0;
    const results = persona.completionCriteria.map((c) => {
      // Criteria that only define `progress` count as met at full credit.
      const met = typeof c.check === "function" ? Boolean(c.check(state)) : c.progress(state) >= 1;
      if (met) metCount += 1;
      return `${c.name}: ${met ? "MET" : "NOT MET"}`;
    });
    return `${metCount}/${persona.completionCriteria.length} \u2014 ${results.join(", ")}`;
  }
};
1167
+
1168
+ // src/reporter.ts
1169
/**
 * Render a benchmark report as Markdown.
 *
 * Sections: header fields, a per-persona table, a per-dimension table sorted
 * by ascending average, and optional "Weakest" / "Strongest" scenario lists
 * (each omitted when empty).
 *
 * @param {object} report - Reads `timestamp`, `generation`, `promptVersion`,
 *   `scenarioCount` and `summary` (`overallAvg`, `byPersona`, `byDimension`,
 *   `weakest`, `strongest`).
 * @returns {string} Markdown text with lines joined by "\n".
 */
function formatBenchmarkReport(report) {
  const lines = [];
  lines.push(`# Benchmark Report`);
  lines.push(``);
  lines.push(`**Date:** ${report.timestamp}`);
  lines.push(`**Generation:** ${report.generation}`);
  lines.push(`**Prompt Version:** ${report.promptVersion}`);
  lines.push(`**Scenarios:** ${report.scenarioCount}`);
  lines.push(`**Overall Score:** ${report.summary.overallAvg.toFixed(1)}/10`);
  lines.push(``);
  lines.push(`## By Persona`);
  lines.push(``);
  lines.push(`| Persona | Avg | Passed | Total |`);
  lines.push(`|---------|-----|--------|-------|`);
  for (const [name, data] of Object.entries(report.summary.byPersona)) {
    lines.push(`| ${name} | ${data.avg.toFixed(1)} | ${data.passed} | ${data.total} |`);
  }
  lines.push(``);
  lines.push(`## By Dimension`);
  lines.push(``);
  lines.push(`| Dimension | Avg | Range | N |`);
  lines.push(`|-----------|-----|-------|---|`);
  // Weakest dimensions first.
  const dimEntries = Object.entries(report.summary.byDimension).sort((a, b) => a[1].avg - b[1].avg);
  for (const [name, data] of dimEntries) {
    // Math.min()/Math.max() with no arguments return Infinity/-Infinity, so a
    // dimension without scores would render as "Infinity--Infinity".
    const range = data.scores.length > 0 ? `${Math.min(...data.scores)}-${Math.max(...data.scores)}` : "n/a";
    lines.push(`| ${name} | ${data.avg.toFixed(1)} | ${range} | ${data.scores.length} |`);
  }
  lines.push(``);
  if (report.summary.weakest.length > 0) {
    lines.push(`## Weakest Scenarios`);
    lines.push(``);
    for (const w of report.summary.weakest) {
      lines.push(`- **${w.scenario}** (${w.score.toFixed(1)}): ${w.reason}`);
    }
    lines.push(``);
  }
  if (report.summary.strongest.length > 0) {
    lines.push(`## Strongest Scenarios`);
    lines.push(``);
    for (const s of report.summary.strongest) {
      lines.push(`- **${s.scenario}** (${s.score.toFixed(1)}): ${s.reason}`);
    }
    lines.push(``);
  }
  return lines.join("\n");
}
1216
/**
 * Render agent-driver results as Markdown.
 *
 * Each persona gets a summary list, a text bar chart of its convergence
 * curve (one "#" per 2 percentage points) and, when per-turn metrics exist,
 * a metrics table.
 *
 * @param {Array<object>} results - Driver results; reads `personaId`,
 *   `completed`, `turnsToCompletion`, `totalTurns`, `finalState`,
 *   `convergenceCurve` and `metrics`.
 * @returns {string} Markdown text with lines joined by "\n".
 */
function formatDriverReport(results) {
  const out = ["# Agent Driver Report", ""];
  for (const result of results) {
    out.push(
      `## Persona: ${result.personaId}`,
      "",
      `- **Completed:** ${result.completed ? "Yes" : "No"}`,
      `- **Turns to completion:** ${result.turnsToCompletion ?? "N/A"}`,
      `- **Total turns:** ${result.totalTurns}`,
      `- **Final state:** ${result.finalState.tasks} tasks, ${result.finalState.events} events, ${result.finalState.vaultFiles.length} vault files`,
      "",
      "### Convergence",
      "",
      "```"
    );
    // Curve entries are labelled by position, starting at turn 1.
    for (const [index, pct] of result.convergenceCurve.entries()) {
      const turnLabel = String(index + 1).padStart(2);
      const bar = "#".repeat(Math.round(pct / 2));
      out.push(` turn ${turnLabel}: ${bar} ${pct.toFixed(0)}%`);
    }
    out.push("```", "");
    if (result.metrics.length > 0) {
      out.push(
        "### Per-Turn Metrics",
        "",
        "| Turn | Tasks | Events | Vault | Latency | Completion |",
        "|------|-------|--------|-------|---------|------------|"
      );
      for (const m of result.metrics) {
        const latencySeconds = (m.responseLatencyMs / 1e3).toFixed(1);
        const completion = m.completionPercent.toFixed(0);
        out.push(`| ${m.turn} | ${m.tasks} | ${m.events} | ${m.vaultFiles} | ${latencySeconds}s | ${completion}% |`);
      }
      out.push("");
    }
  }
  return out.join("\n");
}
1251
/**
 * Print a console summary of agent-driver results: a banner, one line per
 * persona, and a final "<completed>/<total> personas completed" line.
 *
 * The `turns=` column shows `turnsToCompletion` when set, otherwise `totalTurns`.
 *
 * @param {Array<object>} results - Driver results; reads `personaId`,
 *   `completed`, `turnsToCompletion`, `totalTurns` and `finalState`.
 * @returns {void}
 */
function printDriverSummary(results) {
  const rule = "=".repeat(70);
  console.log(rule);
  console.log(" AGENT DRIVER \u2014 RESULTS");
  console.log(rule);
  let completedCount = 0;
  for (const result of results) {
    const { finalState } = result;
    let status = "INCOMPLETE";
    if (result.completed) {
      status = "COMPLETE";
      completedCount += 1;
    }
    const turns = result.turnsToCompletion ?? result.totalTurns;
    console.log(` ${result.personaId.padEnd(20)} ${status.padEnd(12)} turns=${turns} tasks=${finalState.tasks} events=${finalState.events} vault=${finalState.vaultFiles.length}`);
  }
  console.log();
  console.log(`${completedCount}/${results.length} personas completed`);
}
1264
+ export {
1265
+ AgentDriver,
1266
+ BenchmarkRunner,
1267
+ ConvergenceTracker,
1268
+ MODEL_PRICING,
1269
+ MetricsCollector,
1270
+ ProductClient,
1271
+ ScenarioRegistry,
1272
+ TokenCounter,
1273
+ adversarialJudge,
1274
+ codeExecutionJudge,
1275
+ coherenceJudge,
1276
+ confidenceInterval,
1277
+ createCustomJudge,
1278
+ createDomainExpertJudge,
1279
+ defaultJudges,
1280
+ estimateCost,
1281
+ estimateTokens,
1282
+ executeScenario,
1283
+ formatBenchmarkReport,
1284
+ formatDriverReport,
1285
+ interRaterReliability,
1286
+ mannWhitneyU,
1287
+ normalizeScores,
1288
+ partialCredit,
1289
+ printDriverSummary,
1290
+ runE2EWorkflow,
1291
+ weightedMean
1292
+ };
1293
+ //# sourceMappingURL=index.js.map