@archal/cli 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js ADDED
@@ -0,0 +1,3427 @@
1
+ #!/usr/bin/env node
2
+ var __require = /* @__PURE__ */ ((x) => typeof require !== "undefined" ? require : typeof Proxy !== "undefined" ? new Proxy(x, {
3
+ get: (a, b) => (typeof require !== "undefined" ? require : a)[b]
4
+ }) : x)(function(x) {
5
+ if (typeof require !== "undefined") return require.apply(this, arguments);
6
+ throw Error('Dynamic require of "' + x + '" is not supported');
7
+ });
8
+
9
+ // src/index.ts
10
+ import { Command as Command8 } from "commander";
11
+
12
+ // src/commands/run.ts
13
+ import { Command } from "commander";
14
+ import { existsSync as existsSync6 } from "fs";
15
+ import { resolve as resolve4 } from "path";
16
+
17
+ // src/runner/orchestrator.ts
18
+ import { readFileSync as readFileSync6, existsSync as existsSync5 } from "fs";
19
+ import { resolve as resolve3, dirname as dirname2 } from "path";
20
+
21
+ // src/runner/scenario-parser.ts
22
+ import { readFileSync } from "fs";
23
+ import { basename } from "path";
24
+
25
+ // src/utils/logger.ts
26
/** Numeric rank of each log level; a message is emitted when its rank is >= the configured level's rank. */
var LOG_LEVEL_PRIORITY = { debug: 0, info: 1, warn: 2, error: 3 };

/** ANSI color escape sequence used for each level tag. */
var LOG_LEVEL_COLORS = {
  debug: "\x1B[90m", // gray
  info: "\x1B[36m", // cyan
  warn: "\x1B[33m", // yellow
  error: "\x1B[31m" // red
};

// ANSI text attributes shared by the logger helpers.
var RESET = "\x1B[0m";
var BOLD = "\x1B[1m";
var DIM = "\x1B[2m";

/** Mutable logger settings; replaced wholesale by configureLogger(). */
var globalOptions = { level: "warn", quiet: false, json: false, verbose: false };
51
/**
 * Overlay `options` on the current logger settings.
 * Keys that `options` does not provide keep their current value.
 */
function configureLogger(options) {
  globalOptions = Object.assign({}, globalOptions, options);
}
54
/**
 * Decide whether a message at `level` should be written.
 * Quiet mode suppresses everything except errors; otherwise the message must
 * rank at or above the configured level.
 */
function shouldLog(level) {
  const silencedByQuiet = globalOptions.quiet && level !== "error";
  if (silencedByQuiet) {
    return false;
  }
  const threshold = LOG_LEVEL_PRIORITY[globalOptions.level];
  return LOG_LEVEL_PRIORITY[level] >= threshold;
}
60
/** Return the current time as an ISO-8601 string. */
function formatTimestamp() {
  const now = /* @__PURE__ */ new Date();
  return now.toISOString();
}
63
/**
 * Render a log entry as one line of text.
 *
 * In JSON mode the entry is serialized as-is. Otherwise the line is
 * "<dim timestamp> <colored LEVEL> <message>" followed by `key=value` pairs
 * from `entry.data`; non-string values are JSON-encoded.
 */
function formatLogEntry(entry) {
  if (globalOptions.json) {
    return JSON.stringify(entry);
  }
  const paddedLevel = entry.level.toUpperCase().padEnd(5);
  const levelTag = `${LOG_LEVEL_COLORS[entry.level]}${BOLD}${paddedLevel}${RESET}`;
  const head = `${DIM}${entry.timestamp}${RESET} ${levelTag} ${entry.message}`;
  const hasData = entry.data && Object.keys(entry.data).length > 0;
  if (!hasData) {
    return head;
  }
  const pairs = [];
  for (const [key, value] of Object.entries(entry.data)) {
    const rendered = typeof value === "string" ? value : JSON.stringify(value);
    pairs.push(`${DIM}${key}=${RESET}${rendered}`);
  }
  return `${head} ${pairs.join(" ")}`;
}
77
/**
 * Write one log line to stderr when `level` passes the current filter.
 * @param level   one of the keys of LOG_LEVEL_PRIORITY
 * @param message text of the log line
 * @param data    optional object of extra fields appended to the line
 */
function log(level, message, data) {
  if (shouldLog(level)) {
    const timestamp = formatTimestamp();
    const rendered = formatLogEntry({ level, message, timestamp, data });
    process.stderr.write(rendered + "\n");
  }
}
90
// Level-specific convenience wrappers around log().

/** Log at "debug" level. */
function debug(message, data) {
  log("debug", message, data);
}

/** Log at "info" level. */
function info(message, data) {
  log("info", message, data);
}

/** Log at "warn" level. */
function warn(message, data) {
  log("warn", message, data);
}

/** Log at "error" level (still written in quiet mode). */
function error(message, data) {
  log("error", message, data);
}
102
// Verbose-only status lines. Each writes straight to stderr and bypasses the
// level filter used by log().

/** Print a green "OK" status line (verbose mode only). */
function success(message) {
  if (globalOptions.verbose) {
    process.stderr.write(`\x1B[32m${BOLD} OK${RESET} ${message}\n`);
  }
}

/** Print a red "FAIL" status line (verbose mode only). */
function fail(message) {
  if (globalOptions.verbose) {
    process.stderr.write(`\x1B[31m${BOLD}FAIL${RESET} ${message}\n`);
  }
}

/** Print a dimmed "..." progress line (verbose mode only). */
function progress(message) {
  if (globalOptions.verbose) {
    process.stderr.write(`${DIM} ...${RESET} ${message}\n`);
  }
}
117
/**
 * Print `text` framed by two cyan rules of "=" (verbose mode only).
 * The rule is at least 40 characters wide, or the text length plus 4.
 */
function banner(text) {
  if (!globalOptions.verbose) return;
  const width = Math.max(text.length + 4, 40);
  const rule = `\x1B[36m${BOLD}${"=".repeat(width)}${RESET}`;
  process.stderr.write(`\n${rule}\n`);
  process.stderr.write(`\x1B[36m${BOLD} ${text}${RESET}\n`);
  process.stderr.write(`${rule}\n\n`);
}
129
/**
 * Print a plain-text table to stderr (verbose mode only).
 *
 * Column widths are the widest of the header and every cell in that column.
 * Missing cells are rendered as empty strings and non-string cells are
 * stringified, so width calculation and rendering agree. Previously the width
 * pass tolerated missing cells but the render pass threw on them.
 *
 * @param headers array of column titles
 * @param rows    array of rows, each an array of cell values
 */
function table(headers, rows) {
  if (!globalOptions.verbose) return;
  const cellText = (cell) => String(cell ?? "");
  const colWidths = headers.map(
    (h, i) => rows.reduce((max, row) => Math.max(max, cellText(row[i]).length), h.length)
  );
  const headerLine = headers.map((h, i) => h.padEnd(colWidths[i] ?? 0)).join(" ");
  const separator = colWidths.map((w) => "-".repeat(w)).join(" ");
  process.stderr.write(`${BOLD}${headerLine}${RESET}\n`);
  process.stderr.write(`${DIM}${separator}${RESET}\n`);
  for (const row of rows) {
    // Array.from visits holes too, so sparse rows still line up.
    const line = Array.from(row, (cell, i) => cellText(cell).padEnd(colWidths[i] ?? 0)).join(" ");
    process.stderr.write(`${line}\n`);
  }
}
150
+
151
+ // src/runner/scenario-parser.ts
152
/**
 * Split a scenario markdown document into its title and known sections.
 *
 * - The title is the first `# ` heading that appears before any `## ` section.
 * - Each `## ` heading starts a section, keyed by its lower-cased text; a
 *   repeated heading restarts that section.
 * - A `# ` line inside a section is kept as section content. Config sections
 *   use `#` for comments, and such lines used to overwrite the title.
 * - Both LF and CRLF line endings are accepted.
 *
 * @param markdown full scenario text
 * @returns {{title: string, setup: string, expectedBehavior: string,
 *            successCriteria: string, config: string}} trimmed section bodies,
 *          "" for sections that are absent
 */
function extractSections(markdown) {
  const sections = new Map();
  let title = "";
  let currentLines = null; // lines of the section being filled; null before the first "##"
  for (const line of markdown.split(/\r?\n/)) {
    const h2Match = line.match(/^##\s+(.+)/);
    if (h2Match) {
      currentLines = [];
      sections.set(h2Match[1].trim().toLowerCase(), currentLines);
      continue;
    }
    if (currentLines === null) {
      const h1Match = line.match(/^#\s+(.+)/);
      // Only the first top-level heading names the scenario.
      if (h1Match && title === "") {
        title = h1Match[1].trim();
      }
      continue; // nothing before the first section is collected
    }
    currentLines.push(line);
  }
  const getSection = (name) => {
    const content = sections.get(name);
    return content ? content.join("\n").trim() : "";
  };
  return {
    title,
    setup: getSection("setup"),
    expectedBehavior: getSection("expected behavior"),
    successCriteria: getSection("success criteria"),
    config: getSection("config")
  };
}
187
/**
 * Turn one line of the Success Criteria section into a criterion object.
 *
 * A leading list marker ("- ", "* " or "1. ") is removed. A leading "[D]" or
 * "[P]" tag (case-insensitive) sets the type explicitly; without a tag the
 * type is inferred from the wording.
 *
 * @param line  raw line of text
 * @param index zero-based position used to build the id ("criterion-<index+1>")
 * @returns {{id: string, description: string, type: string} | null}
 *          null when the line has no content
 */
function parseCriterionLine(line, index) {
  const text = line.trim().replace(/^[-*]\s+/, "").replace(/^\d+\.\s+/, "");
  if (!text) return null;
  const id = `criterion-${index + 1}`;
  const tagged = /^\[([DP])]\s*(.*)/i.exec(text);
  if (!tagged) {
    return { id, description: text, type: inferCriterionType(text) };
  }
  const isDeterministic = (tagged[1] ?? "").toUpperCase() === "D";
  return {
    id,
    description: tagged[2]?.trim() ?? "",
    type: isDeterministic ? "deterministic" : "probabilistic"
  };
}
208
/**
 * Guess whether an untagged criterion can be checked mechanically.
 *
 * Returns "deterministic" when the description matches any of the countable or
 * state-based phrasings below, otherwise "probabilistic".
 */
function inferCriterionType(description) {
  const matchesDeterministicPhrase = [
    // leading quantity comparisons
    /^exactly\s+\d+/i,
    /^at\s+least\s+\d+/i,
    /^at\s+most\s+\d+/i,
    /^no\s+more\s+than\s+\d+/i,
    /^fewer\s+than\s+\d+/i,
    /^more\s+than\s+\d+/i,
    // state assertions
    /\bis\s+(created|merged|closed|open|deleted|removed)\b/i,
    /\bexists?\b/i,
    /\bno\s+errors?\b/i,
    // explicit counts
    /\bcount\s+(is|equals|==)\b/i,
    /\b(should|must)\s+(have|contain)\s+exactly\b/i,
    /^\d+\s+\w+\s+(are|were|is|was)\b/i,
    /\b(zero|none)\s+\w+\s+(are|were|remain)\b/i
  ].some((pattern) => pattern.test(description));
  return matchesDeterministicPhrase ? "deterministic" : "probabilistic";
}
231
/**
 * Parse the "key: value" lines of a scenario's Config section.
 *
 * Blank lines, lines starting with "#", and lines without a colon are skipped.
 * Recognized keys (case-insensitive):
 *   twins                                       comma-separated list
 *   timeout, runs                               positive integers; other values are ignored
 *   evaluator / evaluator-model / evaluatormodel / model   stored as evaluatorModel
 * Unknown keys are reported at debug level.
 *
 * @param configText trimmed body of the Config section (may be "")
 * @returns object holding only the keys that were present and valid
 */
function parseConfigSection(configText) {
  const result = {};
  if (!configText) return result;
  const toPositiveInt = (raw) => {
    const parsed = parseInt(raw, 10);
    return !Number.isNaN(parsed) && parsed > 0 ? parsed : void 0;
  };
  const modelKeys = ["evaluator", "evaluator-model", "evaluatormodel", "model"];
  for (const rawLine of configText.split("\n")) {
    const entry = rawLine.trim();
    if (entry === "" || entry.startsWith("#")) continue;
    const separator = entry.indexOf(":");
    if (separator === -1) continue;
    const key = entry.slice(0, separator).trim().toLowerCase();
    const value = entry.slice(separator + 1).trim();
    if (key === "twins") {
      result.twins = value.split(",").map((t) => t.trim()).filter(Boolean);
    } else if (key === "timeout" || key === "runs") {
      const num = toPositiveInt(value);
      if (num !== void 0) {
        result[key] = num;
      }
    } else if (modelKeys.includes(key)) {
      result.evaluatorModel = value;
    } else {
      debug(`Unknown config key in scenario: "${key}"`);
    }
  }
  return result;
}
275
/**
 * Infer which twins a scenario needs from the words used in its Setup and
 * Expected Behavior text.
 *
 * Keywords are matched on word boundaries rather than as raw substrings.
 * Substring matching produced false positives such as "pr" in "approve",
 * "dm" in "admin", "story" in "history" and "repo" in "report".
 *   - Keywords of up to 4 characters must match a whole word, with an optional
 *     plural "s".
 *   - Longer keywords must start at a word boundary but may carry a suffix
 *     ("merge" -> "merged", "branch" -> "branches"); a trailing "y" also
 *     matches "ie…" ("story" -> "stories").
 *   - Spaces inside a keyword match any run of whitespace.
 *
 * @param setup            text of the Setup section
 * @param expectedBehavior text of the Expected Behavior section
 * @returns twin names in the fixed order github, slack, linear, jira
 */
function inferTwinsFromContent(setup, expectedBehavior) {
  const combined = `${setup}\n${expectedBehavior}`.toLowerCase();
  const twinKeywords = {
    github: ["github", "repository", "repo", "pull request", "pr", "issue", "commit", "branch", "merge"],
    slack: ["slack", "channel", "message", "thread", "workspace", "dm", "direct message"],
    linear: ["linear", "ticket", "project", "cycle", "backlog"],
    jira: ["jira", "sprint", "epic", "story", "board"]
  };
  const toPattern = (keyword) => {
    const escaped = keyword.replace(/[.*+?^${}()|[\]\\]/g, "\\$&").replace(/\s+/g, "\\s+");
    if (keyword.length <= 4) {
      // Short keywords are too ambiguous to allow arbitrary suffixes.
      return new RegExp(`\\b${escaped}s?\\b`);
    }
    const stem = escaped.endsWith("y") ? `${escaped.slice(0, -1)}(?:y|ie)` : escaped;
    return new RegExp(`\\b${stem}`);
  };
  const twins = [];
  for (const [twin, keywords] of Object.entries(twinKeywords)) {
    if (keywords.some((kw) => toPattern(kw).test(combined))) {
      twins.push(twin);
    }
  }
  return twins;
}
292
/**
 * Read a scenario markdown file as UTF-8 and parse it.
 * The path is passed on so a missing title can fall back to the file name.
 */
function parseScenarioFile(filePath) {
  return parseScenarioMarkdown(readFileSync(filePath, "utf-8"), filePath);
}
296
/**
 * Parse scenario markdown into a structured scenario object.
 *
 * Missing parts produce warnings rather than errors. A missing title falls
 * back to the source file name (dashes become spaces) or "Untitled Scenario".
 * Twins come from the Config section when listed there, otherwise they are
 * inferred from the Setup and Expected Behavior text. Timeout defaults to 120
 * and runs to 5.
 *
 * @param markdown   scenario text
 * @param sourcePath optional path the text was read from
 * @returns {{title, setup, expectedBehavior, successCriteria, config}}
 */
function parseScenarioMarkdown(markdown, sourcePath) {
  const sections = extractSections(markdown);
  if (!sections.title) {
    let fallbackTitle = "Untitled Scenario";
    if (sourcePath) {
      fallbackTitle = basename(sourcePath, ".md").replace(/-/g, " ");
    }
    warn(`Scenario missing title heading, using fallback: "${fallbackTitle}"`);
    sections.title = fallbackTitle;
  }
  const requiredSections = [
    ["setup", "Setup"],
    ["expectedBehavior", "Expected Behavior"],
    ["successCriteria", "Success Criteria"]
  ];
  for (const [key, heading] of requiredSections) {
    if (!sections[key]) {
      warn(`Scenario missing ## ${heading} section`);
    }
  }
  // Criteria are numbered by their position among the lines that parsed.
  const successCriteria = [];
  for (const line of sections.successCriteria.split("\n")) {
    const criterion = parseCriterionLine(line, successCriteria.length);
    if (criterion) {
      successCriteria.push(criterion);
    }
  }
  const parsedConfig = parseConfigSection(sections.config);
  const hasDeclaredTwins = parsedConfig.twins && parsedConfig.twins.length > 0;
  const config = {
    twins: hasDeclaredTwins ? parsedConfig.twins : inferTwinsFromContent(sections.setup, sections.expectedBehavior),
    timeout: parsedConfig.timeout ?? 120,
    runs: parsedConfig.runs ?? 5,
    evaluatorModel: parsedConfig.evaluatorModel
  };
  const countOfType = (type) => successCriteria.filter((c) => c.type === type).length;
  debug("Parsed scenario", {
    title: sections.title,
    criteriaCount: successCriteria.length,
    deterministicCount: countOfType("deterministic"),
    probabilisticCount: countOfType("probabilistic"),
    twins: config.twins.join(", ")
  });
  const { title, setup, expectedBehavior } = sections;
  return { title, setup, expectedBehavior, successCriteria, config };
}
345
/**
 * Check a parsed scenario and describe everything that is wrong with it.
 *
 * Timeout and runs must be finite numbers greater than zero. The earlier
 * `<= 0` comparison let NaN, undefined and non-numeric values through.
 *
 * @param scenario object produced by parseScenarioMarkdown
 * @returns array of human-readable error messages; empty when valid
 */
function validateScenario(scenario) {
  const errors = [];
  const isPositiveNumber = (value) => typeof value === "number" && Number.isFinite(value) && value > 0;
  if (!scenario.title) {
    errors.push("Scenario must have a title");
  }
  if (!scenario.setup) {
    errors.push("Scenario must have a Setup section");
  }
  if (!scenario.expectedBehavior) {
    errors.push("Scenario must have an Expected Behavior section");
  }
  if (scenario.successCriteria.length === 0) {
    errors.push("Scenario must have at least one success criterion");
  }
  for (const criterion of scenario.successCriteria) {
    if (!criterion.description) {
      errors.push(`Criterion ${criterion.id} has an empty description`);
    }
  }
  if (scenario.config.twins.length === 0) {
    errors.push("Scenario does not reference any known twins (specify in Config section or mention services in Setup/Expected Behavior)");
  }
  if (!isPositiveNumber(scenario.config.timeout)) {
    errors.push("Timeout must be a positive number");
  }
  if (!isPositiveNumber(scenario.config.runs)) {
    errors.push("Runs must be a positive number");
  }
  return errors;
}
375
+
376
+ // src/runner/seed-generator.ts
377
// Keyword tables used to pick a seed from a scenario's Setup text.
// Each entry: keywords to look for, the seed they point to, and the score
// added per matched keyword.

/** Seed mappings for the GitHub twin. */
var GITHUB_SEED_MAPPINGS = [
  // General repository shapes (weight 1).
  { keywords: ["empty", "blank", "new", "fresh", "clean", "no issues", "no pull requests", "bare"], seedName: "empty", weight: 1 },
  { keywords: ["small", "simple", "basic", "starter", "minimal", "few issues", "small team", "small project"], seedName: "small-project", weight: 1 },
  { keywords: ["enterprise", "large", "many issues", "complex", "multiple contributors", "ci/cd", "workflows", "protected branches", "teams", "organization"], seedName: "enterprise-repo", weight: 1 },
  // Specific situations (weight 2).
  { keywords: ["conflict", "merge conflict", "conflicting", "diverged", "cannot merge"], seedName: "merge-conflict", weight: 2 },
  { keywords: ["permission", "denied", "forbidden", "access denied", "unauthorized", "read-only"], seedName: "permissions-denied", weight: 2 },
  { keywords: ["rate limit", "throttle", "too many requests", "429"], seedName: "rate-limited", weight: 2 },
  { keywords: ["stale", "old", "inactive", "outdated", "abandoned", "stale issues", "untriaged"], seedName: "stale-issues", weight: 2 },
  { keywords: ["pagination", "large backlog", "many issues", "50 issues", "paginate", "page 2", "multiple pages"], seedName: "large-backlog", weight: 2 }
];

/** Seed mappings for the Slack twin. */
var SLACK_SEED_MAPPINGS = [
  { keywords: ["empty", "blank", "new workspace", "fresh", "clean"], seedName: "empty", weight: 1 },
  { keywords: ["small team", "few channels", "simple", "basic", "starter"], seedName: "small-team", weight: 1 },
  { keywords: ["engineering", "development", "engineering team", "developers", "incidents", "on-call", "sprints", "standups"], seedName: "engineering-team", weight: 1 },
  { keywords: ["support", "customer", "tickets", "help desk", "routing"], seedName: "support-team", weight: 1 },
  { keywords: ["busy", "high volume", "many messages", "active", "noisy"], seedName: "high-volume", weight: 1 }
];

/** Twin name -> mapping table. Twins not listed here have no seed mappings. */
var TWIN_SEED_REGISTRY = { github: GITHUB_SEED_MAPPINGS, slack: SLACK_SEED_MAPPINGS };

/** Seed used for a twin when no keyword matches. */
var DEFAULT_SEEDS = { github: "small-project", slack: "small-team" };
490
/**
 * Lower-case `text`, replace every character other than a-z, 0-9, whitespace
 * and "/" with a space, collapse runs of whitespace, and trim.
 */
function normalizeText(text) {
  return text.toLowerCase().replace(/[^a-z0-9\s/]/g, " ").replace(/\s+/g, " ").trim();
}

/**
 * Score one seed mapping against free text.
 *
 * Each keyword found in the text adds `mapping.weight` to the score. Keywords
 * are normalized the same way as the text before comparison. Without that,
 * hyphenated keywords such as "read-only" or "on-call" could never match,
 * because normalization turns the hyphen in the text into a space.
 *
 * @param text    free text, e.g. a scenario's Setup section
 * @param mapping {keywords: string[], weight: number, ...}
 * @returns {{score: number, matched: string[]}} `matched` holds the keywords
 *          exactly as written in the mapping
 */
function scoreMappingAgainstText(text, mapping) {
  const normalized = normalizeText(text);
  const matched = [];
  let score = 0;
  for (const keyword of mapping.keywords) {
    const needle = normalizeText(keyword);
    // An empty needle would match every text; skip it.
    if (needle !== "" && normalized.includes(needle)) {
      matched.push(keyword);
      score += mapping.weight;
    }
  }
  return { score, matched };
}
505
/**
 * Choose the seed for one twin from a scenario's setup text.
 *
 * Every mapping registered for the twin is scored against the text and the
 * highest score wins; on a tie the earlier mapping is kept. When nothing
 * matches, the twin's default seed is used. Confidence is the best score
 * divided by 5, capped at 1.
 *
 * Registry lookups only consider own properties, so a twin named after an
 * Object.prototype member (e.g. "constructor") is treated as unknown instead
 * of crashing the scoring loop.
 *
 * @param twinName         name of the twin
 * @param setupDescription text to score the mappings against
 * @returns {{twinName, seedName, confidence, matchedKeywords}}
 */
function selectSeedForTwin(twinName, setupDescription) {
  const ownValue = (table, key) => Object.prototype.hasOwnProperty.call(table, key) ? table[key] : void 0;
  const mappings = ownValue(TWIN_SEED_REGISTRY, twinName);
  if (!Array.isArray(mappings) || mappings.length === 0) {
    debug(`No seed mappings for twin "${twinName}", using "default"`);
    return {
      twinName,
      seedName: "default",
      confidence: 0,
      matchedKeywords: []
    };
  }
  let bestSeed = ownValue(DEFAULT_SEEDS, twinName) ?? "default";
  let bestScore = 0;
  let bestMatched = [];
  for (const mapping of mappings) {
    const { score, matched } = scoreMappingAgainstText(setupDescription, mapping);
    // Strictly greater: the first mapping to reach a score keeps it.
    if (score > bestScore) {
      bestScore = score;
      bestSeed = mapping.seedName;
      bestMatched = matched;
    }
  }
  // A score of 5 or more counts as full confidence; no match yields 0.
  const confidence = Math.min(bestScore / 5, 1);
  debug("Seed selection", {
    twin: twinName,
    seed: bestSeed,
    confidence: confidence.toFixed(2),
    matchedKeywords: bestMatched.join(", ")
  });
  return {
    twinName,
    seedName: bestSeed,
    confidence,
    matchedKeywords: bestMatched
  };
}
545
/**
 * Select a seed for each twin using the same setup description.
 * @returns one selection per entry of `twins`, in the same order
 */
function generateSeedSelections(twins, setupDescription) {
  const pickSeed = (twin) => selectSeedForTwin(twin, setupDescription);
  return twins.map(pickSeed);
}
548
/**
 * Apply manual seed overrides to a list of selections.
 *
 * A selection whose twin has a truthy entry in `overrides` is replaced by a
 * copy using that seed, with confidence 1 and a "(manual override)" marker.
 * Other selections are returned unchanged.
 *
 * @param selections array of seed selections
 * @param overrides  object mapping twin name -> seed name
 */
function overrideSeedSelection(selections, overrides) {
  const applyOverride = (selection) => {
    const forcedSeed = overrides[selection.twinName];
    if (!forcedSeed) {
      return selection;
    }
    debug(`Seed override for ${selection.twinName}: ${forcedSeed}`);
    return Object.assign({}, selection, {
      seedName: forcedSeed,
      confidence: 1,
      matchedKeywords: ["(manual override)"]
    });
  };
  return selections.map(applyOverride);
}
563
/**
 * List the distinct seed names registered for a twin, in registry order.
 *
 * Only own registry entries are considered, so names such as "toString" or
 * "constructor" return [] instead of throwing while iterating an inherited
 * Object.prototype member.
 *
 * @param twinName name of the twin
 * @returns array of seed names; empty for unknown twins
 */
function getAvailableSeeds(twinName) {
  const isRegistered = Object.prototype.hasOwnProperty.call(TWIN_SEED_REGISTRY, twinName);
  const mappings = isRegistered ? TWIN_SEED_REGISTRY[twinName] : void 0;
  if (!Array.isArray(mappings)) return [];
  return [...new Set(mappings.map((mapping) => mapping.seedName))];
}
572
+
573
+ // src/runner/agent-executor.ts
574
+ import { readFileSync as readFileSync2, writeFileSync, renameSync, existsSync, unlinkSync } from "fs";
575
+ import { createRequire } from "module";
576
+ import { tmpdir } from "os";
577
+ import { join, resolve } from "path";
578
+ import { fileURLToPath } from "url";
579
+
580
+ // src/utils/process.ts
581
+ import { spawn } from "child_process";
582
/**
 * Run a command and resolve once it closes, killing it if it runs too long.
 *
 * On timeout the child receives SIGTERM, then SIGKILL five seconds later if it
 * is still running. "Still running" is decided from exitCode/signalCode.
 * `child.killed` only records that a signal was sent, so the previous
 * `!child.killed` check meant SIGKILL was never delivered after SIGTERM.
 * The escalation timer is cleared when the child closes so it cannot keep the
 * event loop alive.
 *
 * @param options.command   executable to run
 * @param options.args      argument array
 * @param options.timeoutMs time allowed before the child is terminated
 * @param options.cwd       optional working directory
 * @param options.env       optional variables merged over process.env
 * @param options.pipeStdio when true the child inherits this process's stdio
 *                          and nothing is captured (default false)
 * @param options.onStdout  optional callback for each stdout chunk (string)
 * @param options.onStderr  optional callback for each stderr chunk (string)
 * @returns Promise of {exitCode, stdout, stderr, timedOut, durationMs};
 *          rejects when the process cannot be spawned or emits "error"
 */
function spawnWithTimeout(options) {
  const {
    command,
    args,
    timeoutMs,
    cwd,
    env,
    pipeStdio = false,
    onStdout,
    onStderr
  } = options;
  return new Promise((resolve8, reject) => {
    const startTime = Date.now();
    let timedOut = false;
    let stdoutBuf = "";
    let stderrBuf = "";
    let forceKillTimer;
    const spawnOpts = {
      cwd,
      env: env ? { ...process.env, ...env } : process.env,
      stdio: pipeStdio ? "inherit" : "pipe",
      shell: process.platform === "win32"
    };
    debug("Spawning process", { command, args: args.join(" "), timeoutMs });
    let child;
    try {
      child = spawn(command, args, spawnOpts);
    } catch (err) {
      const message = err instanceof Error ? err.message : String(err);
      reject(new Error(`Failed to spawn process "${command}": ${message}`));
      return;
    }
    const hasExited = () => child.exitCode !== null || child.signalCode !== null;
    const timer = setTimeout(() => {
      timedOut = true;
      debug("Process timed out, killing", { command, timeoutMs });
      child.kill("SIGTERM");
      forceKillTimer = setTimeout(() => {
        if (!hasExited()) {
          child.kill("SIGKILL");
        }
      }, 5e3);
    }, timeoutMs);
    if (!pipeStdio && child.stdout) {
      // Decode in the stream so multi-byte characters split across chunks survive.
      child.stdout.setEncoding("utf8");
      child.stdout.on("data", (text) => {
        stdoutBuf += text;
        if (onStdout) {
          onStdout(text);
        }
      });
    }
    if (!pipeStdio && child.stderr) {
      child.stderr.setEncoding("utf8");
      child.stderr.on("data", (text) => {
        stderrBuf += text;
        if (onStderr) {
          onStderr(text);
        }
      });
    }
    child.on("error", (err) => {
      clearTimeout(timer);
      reject(new Error(`Process "${command}" errored: ${err.message}`));
    });
    child.on("close", (exitCode) => {
      clearTimeout(timer);
      clearTimeout(forceKillTimer);
      const durationMs = Date.now() - startTime;
      debug("Process exited", { command, exitCode, durationMs, timedOut });
      resolve8({
        exitCode,
        stdout: stdoutBuf,
        stderr: stderrBuf,
        timedOut,
        durationMs
      });
    });
  });
}
659
/**
 * Start a child process with stdin, stdout and stderr all piped.
 *
 * Process errors are logged rather than thrown. The caller owns the returned
 * child and is responsible for stopping it.
 *
 * @param options {command, args, env?, cwd?}; `env` is merged over process.env
 * @returns the spawned ChildProcess
 */
function spawnMcpStdioProcess(options) {
  const { command, args, env, cwd } = options;
  debug("Spawning MCP stdio process", { command, args: args.join(" ") });
  const childEnv = env ? { ...process.env, ...env } : process.env;
  const useShell = process.platform === "win32";
  const child = spawn(command, args, {
    cwd,
    env: childEnv,
    stdio: ["pipe", "pipe", "pipe"],
    shell: useShell
  });
  child.on("error", (err) => {
    error(`MCP process "${command}" errored: ${err.message}`);
  });
  return child;
}
673
/**
 * Stop a child process and resolve once it has closed.
 *
 * Sends SIGTERM, then SIGKILL if the child has not exited within
 * `gracePeriodMs`. Exit is detected from exitCode/signalCode. `child.killed`
 * only records that a signal was sent, so relying on it (as before) returned
 * early for a still-running child and never escalated to SIGKILL.
 *
 * @param child         ChildProcess to stop
 * @param gracePeriodMs time to wait after SIGTERM before SIGKILL (default 5000)
 * @returns Promise resolved when the child has closed or had already exited
 */
function killProcess(child, gracePeriodMs = 5e3) {
  return new Promise((resolve8) => {
    const hasExited = () => child.exitCode != null || child.signalCode != null;
    if (hasExited()) {
      resolve8();
      return;
    }
    const forceKillTimer = setTimeout(() => {
      if (!hasExited()) {
        child.kill("SIGKILL");
      }
    }, gracePeriodMs);
    // Listen before signalling so a prompt close cannot be missed.
    child.once("close", () => {
      clearTimeout(forceKillTimer);
      resolve8();
    });
    child.kill("SIGTERM");
  });
}
691
+
692
+ // src/runner/agent-executor.ts
693
/** Directory containing this module. */
var __dirname = fileURLToPath(new URL(".", import.meta.url));

/**
 * Work out how to launch a twin server. The first option that applies wins:
 *   1. a local build at ../../twins/<twin>/dist/index.js relative to this module,
 *   2. the resolvable package "@archal/twin-<twin>",
 *   3. `npx @archal/twin-<twin>`.
 *
 * @param twinName name of the twin
 * @returns {{command: string, args: string[]}}
 */
function resolveTwinCommand(twinName) {
  const packageName = `@archal/twin-${twinName}`;
  const localBuild = resolve(__dirname, "..", "..", "twins", twinName, "dist", "index.js");
  if (existsSync(localBuild)) {
    return { command: "node", args: [localBuild] };
  }
  let installedEntry = null;
  try {
    installedEntry = createRequire(import.meta.url).resolve(packageName);
  } catch {
    // Package not resolvable from here; fall back to npx below.
    installedEntry = null;
  }
  if (installedEntry !== null) {
    return { command: "node", args: [installedEntry] };
  }
  return { command: "npx", args: [packageName] };
}
707
/**
 * Poll until `filePath` exists or `timeoutMs` elapses.
 *
 * The file is always checked at least once, including when `timeoutMs` is 0,
 * and once more at the deadline. The previous loop skipped the check entirely
 * for a zero timeout and could miss a file created during the final sleep.
 *
 * @param filePath  path to watch
 * @param timeoutMs maximum time to wait (default 10000)
 * @returns true if the file exists, false on timeout
 */
async function waitForFile(filePath, timeoutMs = 1e4) {
  const deadline = Date.now() + timeoutMs;
  for (;;) {
    if (existsSync(filePath)) return true;
    const remaining = deadline - Date.now();
    if (remaining <= 0) return false;
    // Poll every 100 ms, but never sleep past the deadline.
    await new Promise((r) => setTimeout(r, Math.min(100, remaining)));
  }
}
715
/**
 * Record each twin's initial state by starting it with its seed, waiting for
 * its state file, reading that file, and stopping the twin again.
 *
 * Twins are handled one at a time. A twin whose state file does not appear
 * within 10 s, or cannot be read or parsed, gets an empty state and a warning.
 *
 * @param twinConfigs array of {twinName, seedName, rateLimitMax?}
 * @returns {{beforeState: object, twinPaths: object}} state and state-file
 *          path per twin name
 */
async function captureSeedState(twinConfigs) {
  const runId = `archal-seed-${Date.now()}-${Math.random().toString(36).slice(2, 8)}`;
  const beforeState = {};
  const twinPaths = {};
  const readState = (twinName, stateFilePath) => {
    try {
      return JSON.parse(readFileSync2(stateFilePath, "utf-8"));
    } catch {
      warn(`Failed to read seed state for ${twinName}`);
      return {};
    }
  };
  for (const config of twinConfigs) {
    const { twinName, seedName } = config;
    const { command, args } = resolveTwinCommand(twinName);
    const stateFilePath = join(tmpdir(), `${runId}-${twinName}-state.json`);
    debug(`Capturing seed state for ${twinName}`, { seed: seedName });
    const twinArgs = args.concat(["--seed", seedName, "--state-file", stateFilePath]);
    if (config.rateLimitMax && config.rateLimitMax > 0) {
      twinArgs.push("--rate-limit", String(config.rateLimitMax));
    }
    const child = spawnMcpStdioProcess({ command, args: twinArgs });
    twinPaths[twinName] = { stateFile: stateFilePath };
    if (await waitForFile(stateFilePath, 1e4)) {
      beforeState[twinName] = readState(twinName, stateFilePath);
    } else {
      warn(`State file not written for ${twinName} within timeout`);
      beforeState[twinName] = {};
    }
    await killProcess(child, 3e3);
  }
  return { beforeState, twinPaths };
}
754
/**
 * Write the MCP server configuration for a run into the temp directory.
 *
 * Each twin gets a server entry whose arguments carry its seed, state file and
 * trace file, plus a rate limit when one is configured. The JSON is written to
 * "<config>.tmp" and then renamed into place.
 *
 * @param twinConfigs array of {twinName, seedName, rateLimitMax?}
 * @param runId       prefix used for every generated file name
 * @returns {{configPath: string, twinPaths: object}} config path and
 *          {stateFile, traceFile} per twin name
 */
function writeMcpConfig(twinConfigs, runId) {
  const tempFile = (suffix) => join(tmpdir(), `${runId}-${suffix}`);
  const twinPaths = {};
  const mcpServers = {};
  for (const config of twinConfigs) {
    const { twinName, seedName } = config;
    const { command, args: baseArgs } = resolveTwinCommand(twinName);
    const stateFile = tempFile(`${twinName}-state.json`);
    const traceFile = tempFile(`${twinName}-trace.json`);
    twinPaths[twinName] = { stateFile, traceFile };
    const twinArgs = baseArgs.concat([
      "--seed", seedName,
      "--state-file", stateFile,
      "--trace-file", traceFile
    ]);
    if (config.rateLimitMax && config.rateLimitMax > 0) {
      twinArgs.push("--rate-limit", String(config.rateLimitMax));
    }
    mcpServers[twinName] = { command, args: twinArgs };
  }
  const configPath = tempFile("mcp-config.json");
  const tmpPath = configPath + ".tmp";
  writeFileSync(tmpPath, JSON.stringify({ mcpServers }, null, 2));
  renameSync(tmpPath, configPath);
  debug("Wrote MCP config", { configPath, twins: Object.keys(mcpServers).join(", ") });
  return { configPath, twinPaths };
}
784
/**
 * Run the agent command with the MCP configuration exposed via environment
 * variables, and report how it finished.
 *
 * Variables set for the agent, overriding same-named entries in
 * `agentConfig.env`: MCP_CONFIG_PATH, ARCHAL_MCP_CONFIG, ARCHAL_MCP_SERVERS,
 * ARCHAL_TWIN_NAMES. Agent output is forwarded to the debug log.
 *
 * @param agentConfig    {command, args, cwd?, env?}
 * @param mcpConfigPath  path of the MCP config file
 * @param mcpServersJson JSON string describing the MCP servers
 * @param twinNames      array of twin names
 * @param timeoutMs      time allowed before the agent is terminated
 * @returns {{exitCode, stdout, stderr, timedOut, durationMs}}
 */
async function executeAgent(agentConfig, mcpConfigPath, mcpServersJson, twinNames, timeoutMs) {
  const agentEnv = Object.assign({}, agentConfig.env, {
    MCP_CONFIG_PATH: mcpConfigPath,
    ARCHAL_MCP_CONFIG: mcpConfigPath,
    ARCHAL_MCP_SERVERS: mcpServersJson,
    ARCHAL_TWIN_NAMES: twinNames.join(",")
  });
  info("Executing agent", {
    command: agentConfig.command,
    timeout: `${timeoutMs}ms`
  });
  const forwardTo = (label) => (chunk) => {
    debug(`[agent ${label}] ${chunk.trimEnd()}`);
  };
  const { exitCode, stdout, stderr, timedOut, durationMs } = await spawnWithTimeout({
    command: agentConfig.command,
    args: agentConfig.args,
    timeoutMs,
    cwd: agentConfig.cwd,
    env: agentEnv,
    onStdout: forwardTo("stdout"),
    onStderr: forwardTo("stderr")
  });
  if (timedOut) {
    warn(`Agent timed out after ${timeoutMs}ms`);
  } else if (exitCode !== 0) {
    warn(`Agent exited with code ${exitCode}`);
  } else {
    info("Agent completed successfully", { durationMs });
  }
  return { exitCode, stdout, stderr, timedOut, durationMs };
}
824
/**
 * Read each twin's state file and return the parsed states by twin name.
 * A missing, unreadable or unparsable file yields {} for that twin and a
 * warning.
 *
 * @param twinPaths object mapping twin name -> {stateFile, ...}
 */
function collectStateFromFiles(twinPaths) {
  const state = {};
  for (const name of Object.keys(twinPaths)) {
    const paths = twinPaths[name];
    let twinState = {};
    try {
      if (!existsSync(paths.stateFile)) {
        warn(`State file not found for twin "${name}" at ${paths.stateFile}`);
      } else {
        twinState = JSON.parse(readFileSync2(paths.stateFile, "utf-8"));
      }
    } catch (err) {
      const msg = err instanceof Error ? err.message : String(err);
      warn(`Failed to read state for twin "${name}": ${msg}`);
      twinState = {};
    }
    state[name] = twinState;
  }
  return state;
}
843
/**
 * Merge the trace files of all twins into one list ordered by timestamp.
 *
 * A trace file must contain a JSON array; anything else is reported at debug
 * level and skipped. Previously a JSON string would have been spread into
 * single characters. Entries are appended one by one, so very large traces do
 * not hit the argument limit of `push(...entries)`. Entries without a parsable
 * timestamp sort after all dated entries.
 *
 * @param twinPaths object mapping twin name -> {traceFile, ...}
 * @returns array of trace entries
 */
function collectTraceFromFiles(twinPaths) {
  const allTraces = [];
  for (const [name, paths] of Object.entries(twinPaths)) {
    try {
      if (!existsSync(paths.traceFile)) {
        debug(`Trace file not found for twin "${name}"`);
        continue;
      }
      const entries = JSON.parse(readFileSync2(paths.traceFile, "utf-8"));
      if (!Array.isArray(entries)) {
        debug(`Could not parse trace file for twin "${name}"`);
        continue;
      }
      for (const entry of entries) {
        allTraces.push(entry);
      }
    } catch {
      debug(`Could not parse trace file for twin "${name}"`);
    }
  }
  const timeOf = (entry) => {
    const ms = new Date(entry?.timestamp).getTime();
    return Number.isNaN(ms) ? Number.POSITIVE_INFINITY : ms;
  };
  allTraces.sort((a, b) => {
    const ta = timeOf(a);
    const tb = timeOf(b);
    // Compare rather than subtract: Infinity - Infinity would be NaN.
    return ta < tb ? -1 : ta > tb ? 1 : 0;
  });
  return allTraces;
}
861
/**
 * Best-effort removal of the temp files created for a run: the MCP config,
 * each twin's state and trace files, each seed state file, and the ".tmp"
 * sibling of each.
 *
 * Entries without a path are skipped. Previously a missing `traceFile` became
 * the relative path "undefined.tmp", which could delete an unrelated file in
 * the working directory. Deletion failures are ignored on purpose.
 *
 * @param mcpConfigPath path of the MCP config file
 * @param twinPaths     object mapping twin name -> {stateFile, traceFile?}
 * @param seedPaths     optional object mapping twin name -> {stateFile}
 */
function cleanupTempFiles(mcpConfigPath, twinPaths, seedPaths) {
  const basePaths = [mcpConfigPath];
  for (const paths of Object.values(twinPaths)) {
    basePaths.push(paths.stateFile, paths.traceFile);
  }
  if (seedPaths) {
    for (const paths of Object.values(seedPaths)) {
      basePaths.push(paths.stateFile);
    }
  }
  for (const base of basePaths) {
    if (typeof base !== "string" || base === "") continue;
    for (const file of [base, `${base}.tmp`]) {
      try {
        if (existsSync(file)) unlinkSync(file);
      } catch {
        // Cleanup is best-effort; a file that cannot be removed is left behind.
      }
    }
  }
}
879
/**
 * Split a command line into arguments.
 *
 * Arguments are separated by any whitespace; previously only a literal space
 * separated them, so tabs and newlines stayed inside an argument. Single or
 * double quotes group text, including whitespace and the other quote
 * character, into one argument, and the quotes are removed. An explicitly
 * empty quoted argument ("" or '') is kept as an empty string instead of being
 * dropped. Backslash escapes are not interpreted. An unterminated quote runs
 * to the end of the input.
 *
 * @param cmd command line text
 * @returns array of arguments; empty for blank input
 */
function splitCommand(cmd) {
  const result = [];
  let current = "";
  let started = false; // true once the current argument has begun, even if still empty
  let quoteChar = "";
  for (const ch of cmd) {
    if (quoteChar) {
      if (ch === quoteChar) {
        quoteChar = "";
      } else {
        current += ch;
      }
    } else if (ch === '"' || ch === "'") {
      quoteChar = ch;
      started = true;
    } else if (/\s/.test(ch)) {
      if (started) result.push(current);
      current = "";
      started = false;
    } else {
      current += ch;
      started = true;
    }
  }
  if (started) result.push(current);
  return result;
}
900
/**
 * Determine how to launch the agent. The first source that yields a command wins:
 *   1. the `agentCommand` string,
 *   2. `agent.command` (with optional `agent.args` and `agent.env`) in the JSON
 *      project config file,
 *   3. the ARCHAL_AGENT_COMMAND environment variable.
 *
 * A command string that contains no arguments (for example only whitespace)
 * is treated as absent. Previously it produced a config whose `command` was
 * undefined. A project config that cannot be read or parsed is logged as an
 * error and skipped.
 *
 * @param agentCommand      optional command line string
 * @param projectConfigPath optional path to the project config JSON
 * @returns {{command: string, args: string[], env?: object} | null}
 */
function resolveAgentConfig(agentCommand, projectConfigPath) {
  const fromCommandLine = (raw) => {
    if (!raw) return null;
    const [command, ...args] = splitCommand(raw);
    return command ? { command, args } : null;
  };
  const explicit = fromCommandLine(agentCommand);
  if (explicit) {
    return explicit;
  }
  if (projectConfigPath) {
    try {
      const raw = readFileSync2(projectConfigPath, "utf-8");
      const config = JSON.parse(raw);
      if (config.agent?.command) {
        return {
          command: config.agent.command,
          args: config.agent.args ?? [],
          env: config.agent.env
        };
      }
    } catch (err) {
      const message = err instanceof Error ? err.message : String(err);
      error(`Failed to load agent config from ${projectConfigPath}: ${message}`);
    }
  }
  return fromCommandLine(process.env["ARCHAL_AGENT_COMMAND"]);
}
934
+
935
+ // src/runner/reporter.ts
936
+ import { readFileSync as readFileSync3, existsSync as existsSync2 } from "fs";
937
+ import { createRequire as createRequire2 } from "module";
938
+ import { dirname, resolve as resolve2 } from "path";
939
+ import { fileURLToPath as fileURLToPath2 } from "url";
940
/** Directory containing this module. */
var __dirname2 = fileURLToPath2(new URL(".", import.meta.url));

// ANSI text attributes used by the reporter.
var RESET2 = "\x1B[0m";
var BOLD2 = "\x1B[1m";
var DIM2 = "\x1B[2m";

// ANSI foreground colors used by the reporter.
var GREEN = "\x1B[32m";
var RED = "\x1B[31m";
var YELLOW = "\x1B[33m";
var CYAN = "\x1B[36m";
948
/**
 * Print the run header to stderr: the scenario title followed by one line per
 * twin with the seed selected for it.
 *
 * @param scenarioTitle  title of the scenario
 * @param seedSelections array of {twinName, seedName, ...}
 */
function printHeader(scenarioTitle, seedSelections) {
  const titleLine = `\n${CYAN}${BOLD2}archal${RESET2} ${DIM2}\u2014${RESET2} ${scenarioTitle}\n`;
  process.stderr.write(titleLine);
  for (const { twinName, seedName } of seedSelections) {
    process.stderr.write(` ${DIM2}twin:${RESET2} ${twinName} ${DIM2}(seed: ${seedName})${RESET2}\n`);
  }
}
957
/**
 * Print the result line for one run to stderr.
 *
 * With an error the line shows "ERROR" and the message, shortened to 60
 * characters. Otherwise it shows the score as a whole percentage: green at 100
 * or above, yellow from 50, red below 50.
 *
 * @param runIndex  zero-based index of the run
 * @param totalRuns total number of runs
 * @param score     score of the run (used only when there is no error)
 * @param error2    optional error message
 */
function printRunProgress(runIndex, totalRuns, score, error2) {
  const runNumber = runIndex + 1;
  // Dot leader shrinks as the run numbers get wider so results stay aligned.
  const leaderWidth = Math.max(1, 20 - String(runNumber).length - String(totalRuns).length);
  const prefix = ` run ${runNumber}/${totalRuns} ${DIM2}${".".repeat(leaderWidth)}${RESET2}`;
  if (error2) {
    let shortError = error2;
    if (error2.length > 60) {
      shortError = error2.slice(0, 59) + "\u2026";
    }
    process.stderr.write(`${prefix} ${RED}ERROR${RESET2} ${DIM2}(${shortError})${RESET2}\n`);
    return;
  }
  let scoreColor = RED;
  if (score >= 100) {
    scoreColor = GREEN;
  } else if (score >= 50) {
    scoreColor = YELLOW;
  }
  process.stderr.write(`${prefix} ${scoreColor}${score.toFixed(0)}%${RESET2}\n`);
}
970
/**
 * Build the "actions" summary lines for a report from its first run's trace.
 *
 * Calls are grouped by tool name in order of first appearance, one line per
 * tool, followed by a line with the total call count.
 *
 * @param report object with a `runs` array; each run has a `trace` array
 * @returns array of lines; empty when there is no first run or its trace is empty
 */
function formatTraceSummary(report) {
  const trace = report.runs[0]?.trace;
  if (!report.runs[0] || trace.length === 0) return [];
  // Map preserves insertion order, i.e. the order tools first appear.
  const callsByTool = /* @__PURE__ */ new Map();
  for (const entry of trace) {
    const calls = callsByTool.get(entry.toolName);
    if (calls) {
      calls.push(entry);
    } else {
      callsByTool.set(entry.toolName, [entry]);
    }
  }
  const lines = [` ${DIM2}actions:${RESET2}`];
  for (const [tool, calls] of callsByTool) {
    const summary = summarizeToolCalls(tool, calls);
    const text = summary ? summary : `${tool} \xD7${calls.length}`;
    lines.push(` ${DIM2}\u2192${RESET2} ${text}`);
  }
  lines.push(` ${DIM2}\u2192 ${trace.length} total API calls${RESET2}`);
  return lines;
}
994
/**
 * Produces a short human-readable summary of all calls to one tool.
 *
 * The first present field among `issue_number`, `number`, `id`, `channel` and
 * `name` of each call's input is collected as that call's identifier. When
 * between one and six identifiers were found they are listed in parentheses.
 *
 * Entries without an input object, and identifiers that are null, are skipped
 * instead of throwing or being rendered as "null".
 *
 * @param {string} toolName - Tool name; `_` and `-` are shown as spaces.
 * @param {Array<{input?: object|null}>} entries - Trace entries for this tool.
 * @returns {string} Summary such as `create issue ×2 (#5, #7)`.
 */
function summarizeToolCalls(toolName, entries) {
  const count = entries.length;
  const ids = [];
  for (const entry of entries) {
    const input = entry?.input;
    if (input == null) continue; // no input recorded for this call
    const id = input["issue_number"] ?? input["number"] ?? input["id"] ?? input["channel"] ?? input["name"];
    if (id != null) {
      ids.push(id);
    }
  }
  const toolLabel = toolName.replace(/[_-]/g, " ");
  if (ids.length > 0 && ids.length <= 6) {
    // Numeric identifiers are shown issue-style with a leading "#".
    const idStr = ids.map((id) => typeof id === "number" ? `#${id}` : String(id)).join(", ");
    return `${toolLabel} \xD7${count} (${idStr})`;
  }
  if (count === 1) {
    return toolLabel;
  }
  return `${toolLabel} \xD7${count}`;
}
1014
/**
 * Renders a report in the requested output format.
 *
 * @param {object} report - Report to render.
 * @param {"terminal"|"json"|"junit"} format - Output format.
 * @returns {string|undefined} Rendered report; undefined for any other format value.
 */
function generateReport(report, format) {
  if (format === "terminal") {
    return formatTerminal(report);
  }
  if (format === "json") {
    return formatJson(report);
  }
  if (format === "junit") {
    return formatJunit(report);
  }
  return void 0;
}
1024
/**
 * Collects "twin fidelity" report lines for each named twin.
 *
 * For every twin a `fidelity.json` is looked up first at
 * `<module dir>/../../twins/<name>/fidelity.json`, then next to the resolved
 * `@archal/twin-<name>` package. Loading is best-effort: a twin whose file is
 * missing is skipped, and any error while reading or rendering a twin's file
 * is swallowed.
 *
 * @param {Iterable<string>} twinNames - Twin names to look up.
 * @returns {string[]} Styled lines (possibly empty).
 */
function loadTwinFidelity(twinNames) {
  const findInMonorepo = (twin) => {
    const candidate = resolve2(__dirname2, "..", "..", "twins", twin, "fidelity.json");
    return existsSync2(candidate) ? candidate : null;
  };
  const findInInstalledPackage = (twin) => {
    try {
      const packageEntry = createRequire2(import.meta.url).resolve(`@archal/twin-${twin}`);
      const candidate = resolve2(dirname(packageEntry), "..", "fidelity.json");
      return existsSync2(candidate) ? candidate : null;
    } catch {
      // Package not installed or not resolvable: treat as "no fidelity file".
      return null;
    }
  };

  const output = [];
  for (const name of twinNames) {
    try {
      const fidelityPath = findInMonorepo(name) ?? findInInstalledPackage(name);
      if (!fidelityPath) continue;
      const data = JSON.parse(readFileSync3(fidelityPath, "utf-8"));
      output.push(` ${DIM2}twin fidelity:${RESET2} ${data.twin} v${data.version}`);
      for (const cap of data.capabilities) {
        const icon = cap.supported ? `${GREEN}\u2713${RESET2}` : `${DIM2}\u2717${RESET2}`;
        output.push(` ${icon} ${DIM2}${cap.name}${RESET2}`);
      }
    } catch {
      // Best-effort: a broken fidelity file must not break the report.
    }
  }
  return output;
}
1057
/**
 * Renders the human-readable terminal report.
 *
 * Layout: optional trace summary, one dotted line per criterion (taken from
 * the first run's evaluations) with pass/fail/warn counts across runs,
 * per-run explanations for probabilistic criteria that did not pass in every
 * run, the satisfaction score, and optional twin fidelity lines.
 *
 * @param {object} report - Report with `runs`, `satisfactionScore` and optional
 *   `criterionDescriptions`, `criterionTypes` and `twinNames`.
 * @returns {string} Report text joined with newlines.
 */
function formatTerminal(report) {
  const ELLIPSIS = "\u2026";
  const { runs } = report;
  const totalRuns = runs.length;

  // Width-derived values do not depend on the criterion, so compute them once.
  const termWidth = process.stdout.columns ?? 80;
  const statusPlainLen = `pass ${totalRuns}/${totalRuns}`.length;
  const maxLabelLen = Math.max(20, termWidth - (2 + statusPlainLen) - 4);
  const maxExplLen = Math.max(40, termWidth - 12);

  const evaluationFor = (run, criterionId) => run.evaluations.find((e) => e.criterionId === criterionId);
  const shortenLabel = (label) => {
    if (label.length <= maxLabelLen) return label;
    // Prefer cutting at a word boundary unless that would drop too much text.
    const wordBreak = label.lastIndexOf(" ", maxLabelLen - 1);
    const keep = wordBreak > maxLabelLen * 0.6 ? wordBreak : maxLabelLen - 1;
    return label.slice(0, keep) + ELLIPSIS;
  };

  const out = [];
  const traceSummary = formatTraceSummary(report);
  if (traceSummary.length > 0) {
    out.push("", ...traceSummary);
  }
  out.push("");

  const criterionIds = totalRuns > 0 ? runs[0].evaluations.map((evaluation) => evaluation.criterionId) : [];
  for (const criterionId of criterionIds) {
    const passCount = runs.filter((run) => evaluationFor(run, criterionId)?.status === "pass").length;
    const allPassed = passCount === totalRuns;

    const description = report.criterionDescriptions?.[criterionId] ?? criterionId;
    const label = shortenLabel(description);
    const dotCount = Math.max(2, termWidth - 2 - label.length - 1 - statusPlainLen);
    const dots = `${DIM2}${".".repeat(dotCount)}${RESET2}`;

    let statusStr;
    if (allPassed) {
      statusStr = `${GREEN}pass ${passCount}/${totalRuns}${RESET2}`;
    } else if (passCount === 0) {
      statusStr = `${RED}fail ${passCount}/${totalRuns}${RESET2}`;
    } else {
      statusStr = `${YELLOW}warn ${passCount}/${totalRuns}${RESET2}`;
    }
    out.push(` ${label} ${dots} ${statusStr}`);

    if (allPassed || report.criterionTypes?.[criterionId] !== "probabilistic") continue;
    for (const run of runs) {
      const ev = evaluationFor(run, criterionId);
      if (!ev || ev.status === "pass" || !ev.explanation) continue;
      const shown = ev.explanation.length > maxExplLen ? ev.explanation.slice(0, maxExplLen - 1) + ELLIPSIS : ev.explanation;
      out.push(` ${DIM2}run ${run.runIndex + 1}: ${shown}${RESET2}`);
    }
  }

  out.push("");
  let scoreColor = RED;
  if (report.satisfactionScore >= 80) {
    scoreColor = GREEN;
  } else if (report.satisfactionScore >= 50) {
    scoreColor = YELLOW;
  }
  out.push(` ${BOLD2}satisfaction:${RESET2} ${scoreColor}${BOLD2}${report.satisfactionScore.toFixed(1)}%${RESET2} ${DIM2}(${totalRuns} runs)${RESET2}`);

  if (report.twinNames && report.twinNames.length > 0) {
    const fidelityLines = loadTwinFidelity(report.twinNames);
    if (fidelityLines.length > 0) {
      out.push("", ...fidelityLines);
    }
  }
  out.push("");
  return out.join("\n");
}
1130
/**
 * Serializes the report as pretty-printed JSON with two-space indentation.
 *
 * @param {unknown} report - Value to serialize.
 * @returns {string} JSON text.
 */
function formatJson(report) {
  const replacer = null;
  const indentWidth = 2;
  return JSON.stringify(report, replacer, indentWidth);
}
1133
/**
 * Escapes a value for safe inclusion in XML text or attribute values.
 *
 * - `null`/`undefined` become an empty string; other non-strings are
 *   converted with `String()` instead of throwing.
 * - Characters that XML 1.0 does not allow at all (C0 controls other than
 *   tab, LF and CR, plus U+FFFE/U+FFFF) are removed. This matters for error
 *   messages containing ANSI escape sequences, which would otherwise make the
 *   document unparseable.
 * - The five predefined entities (`& < > " '`) are escaped.
 *
 * @param {unknown} text - Value to escape.
 * @returns {string} XML-safe text.
 */
function escapeXml(text) {
  if (text === null || text === void 0) return "";
  const entities = {
    "&": "&amp;",
    "<": "&lt;",
    ">": "&gt;",
    '"': "&quot;",
    "'": "&apos;"
  };
  return String(text)
    .replace(/[\u0000-\u0008\u000B\u000C\u000E-\u001F\uFFFE\uFFFF]/g, "")
    .replace(/[&<>"']/g, (ch) => entities[ch]);
}
1136
/**
 * Renders the report as JUnit-style XML: one `<testsuite>` per run and one
 * `<testcase>` per criterion evaluation.
 *
 * A run with an `error` additionally gets a failing `agent-execution`
 * testcase. That synthetic testcase is included in the `tests` and `failures`
 * attributes of both the suite and the root element, so the declared counts
 * match the testcases actually emitted.
 *
 * @param {object} report - Report with `scenarioTitle` and `runs`
 *   (each with `runIndex`, `durationMs`, `evaluations`, optional `error`).
 * @returns {string} XML document.
 */
function formatJunit(report) {
  // Counts for one run, including the synthetic execution-error testcase.
  const countRun = (run) => {
    const executionFailures = run.error ? 1 : 0;
    return {
      tests: run.evaluations.length + executionFailures,
      failures: run.evaluations.filter((e) => e.status === "fail").length + executionFailures
    };
  };

  const lines = [];
  let totalTests = 0;
  let totalFailures = 0;
  let totalTime = 0;
  for (const run of report.runs) {
    const counts = countRun(run);
    totalTests += counts.tests;
    totalFailures += counts.failures;
    totalTime += run.durationMs;
  }
  lines.push('<?xml version="1.0" encoding="UTF-8"?>');
  lines.push(
    `<testsuites name="${escapeXml(report.scenarioTitle)}" tests="${totalTests}" failures="${totalFailures}" time="${(totalTime / 1e3).toFixed(3)}">`
  );
  for (const run of report.runs) {
    const counts = countRun(run);
    const runTime = (run.durationMs / 1e3).toFixed(3);
    lines.push(
      ` <testsuite name="Run ${run.runIndex + 1}" tests="${counts.tests}" failures="${counts.failures}" time="${runTime}">`
    );
    if (run.error) {
      lines.push(
        ` <testcase name="agent-execution" time="${runTime}">`
      );
      lines.push(
        ` <failure message="${escapeXml(run.error)}" type="ExecutionError">${escapeXml(run.error)}</failure>`
      );
      lines.push(" </testcase>");
    }
    for (const evaluation of run.evaluations) {
      const testName = escapeXml(evaluation.criterionId);
      lines.push(` <testcase name="${testName}" time="0.000">`);
      if (evaluation.status === "fail") {
        lines.push(
          ` <failure message="${escapeXml(evaluation.explanation)}" type="CriterionFailed">${escapeXml(evaluation.explanation)}</failure>`
        );
      } else if (evaluation.status === "partial") {
        // Partial results are not failures; they are reported as testcase output.
        lines.push(
          ` <system-out>PARTIAL: ${escapeXml(evaluation.explanation)} (confidence: ${(evaluation.confidence * 100).toFixed(0)}%)</system-out>`
        );
      }
      lines.push(" </testcase>");
    }
    lines.push(" </testsuite>");
  }
  lines.push("</testsuites>");
  return lines.join("\n");
}
1185
/**
 * Renders the report in the given format and writes it to stdout followed by
 * a newline.
 *
 * @param {object} report - Report to print.
 * @param {string} format - Output format passed through to generateReport.
 */
function printReport(report, format) {
  const rendered = generateReport(report, format);
  process.stdout.write(`${rendered}\n`);
}
1189
+
1190
+ // src/evaluator/deterministic.ts
1191
/**
 * Replaces every parenthesized group, together with the whitespace around it,
 * by a single space and trims the result.
 *
 * @param {string} text - Input text.
 * @returns {string} Text without parenthetical remarks.
 */
function stripParenthetical(text) {
  const parenthesizedGroup = /\s*\(.*?\)\s*/g;
  const withoutGroups = text.replace(parenthesizedGroup, " ");
  return withoutGroups.trim();
}
1194
/**
 * Reduces a predicate phrase to its leading part: parenthetical remarks are
 * removed, then everything from the first " on|in|to|from|of ..." phrase to
 * the end of the string is dropped.
 *
 * @param {string} pred - Raw predicate text.
 * @returns {string} Trimmed predicate.
 */
function cleanPredicate(pred) {
  const trailingPrepositionalPhrase = /\s+(?:on|in|to|from|of)\s+(?:the\s+)?.+$/;
  return stripParenthetical(pred).replace(trailingPrepositionalPhrase, "").trim();
}
1199
/**
 * Parses a natural-language criterion description into a structured assertion.
 *
 * The description is lowercased and stripped of parenthetical remarks, then
 * matched against a fixed, ordered list of patterns; the first match wins.
 * Every "<count> <subject> <verb> <predicate>" form passes its predicate
 * through cleanPredicate, so a trailing phrase such as "in the repo" is
 * dropped the same way for exact, minimum, maximum and bare-number counts.
 *
 * @param {string} description - Criterion description.
 * @returns {{type: string, subject: string, value?: number, predicate?: string, labelFilter?: string}|null}
 *   The parsed assertion, or null when no pattern matches.
 */
function parseAssertion(description) {
  const lower = stripParenthetical(description).toLowerCase().trim();
  const toInt = (digits) => Number.parseInt(digits ?? "0", 10);
  const subjectOf = (match, group) => match[group]?.trim() ?? "";
  const predicateOf = (match, group) => cleanPredicate(match[group]?.trim() ?? "");

  // "no <subject> labeled <label> are <predicate>"
  const noLabeledMatch = lower.match(/^no\s+(.+?)\s+labeled\s+["']?([^"']+?)["']?\s+(?:are|were|is|was|should be)\s+(.+)$/);
  if (noLabeledMatch) {
    return {
      type: "no_matching",
      subject: subjectOf(noLabeledMatch, 1),
      predicate: predicateOf(noLabeledMatch, 3),
      labelFilter: noLabeledMatch[2]?.trim()
    };
  }

  // "exactly N <subject> [are <predicate>]"
  const exactWithVerb = lower.match(/^exactly\s+(\d+)\s+(.+?)\s+(?:are|were|is|was|should be)\s+(.+)$/);
  if (exactWithVerb) {
    return {
      type: "exact_count",
      subject: subjectOf(exactWithVerb, 2),
      value: toInt(exactWithVerb[1]),
      predicate: predicateOf(exactWithVerb, 3)
    };
  }
  const exactWithoutVerb = lower.match(/^exactly\s+(\d+)\s+(.+)$/);
  if (exactWithoutVerb) {
    return {
      type: "exact_count",
      subject: subjectOf(exactWithoutVerb, 2),
      value: toInt(exactWithoutVerb[1])
    };
  }

  // "at least N <subject> [are <predicate>]"
  const minWithVerb = lower.match(/^at\s+least\s+(\d+)\s+(.+?)\s+(?:are|were|is|was|should be)\s+(.+)$/);
  if (minWithVerb) {
    return {
      type: "min_count",
      subject: subjectOf(minWithVerb, 2),
      value: toInt(minWithVerb[1]),
      predicate: predicateOf(minWithVerb, 3)
    };
  }
  const minWithoutVerb = lower.match(/^at\s+least\s+(\d+)\s+(.+)$/);
  if (minWithoutVerb) {
    return {
      type: "min_count",
      subject: subjectOf(minWithoutVerb, 2),
      value: toInt(minWithoutVerb[1])
    };
  }

  // "at most N ..." / "no more than N ..."
  const maxWithVerb = lower.match(/^(?:at\s+most|no\s+more\s+than)\s+(\d+)\s+(.+?)\s+(?:are|were|is|was|should be)\s+(.+)$/);
  if (maxWithVerb) {
    return {
      type: "max_count",
      subject: subjectOf(maxWithVerb, 2),
      value: toInt(maxWithVerb[1]),
      // Cleaned like the exact/min variants; an empty predicate stays undefined.
      predicate: predicateOf(maxWithVerb, 3) || void 0
    };
  }
  const maxWithoutVerb = lower.match(/^(?:at\s+most|no\s+more\s+than)\s+(\d+)\s+(.+)$/);
  if (maxWithoutVerb) {
    return {
      type: "max_count",
      subject: subjectOf(maxWithoutVerb, 2),
      value: toInt(maxWithoutVerb[1])
    };
  }

  // "fewer than N" is an inclusive maximum of N - 1.
  const fewerMatch = lower.match(/^fewer\s+than\s+(\d+)\s+(.+)/);
  if (fewerMatch) {
    return {
      type: "max_count",
      subject: subjectOf(fewerMatch, 2),
      value: Number.parseInt(fewerMatch[1] ?? "1", 10) - 1
    };
  }
  // "more than N" is an inclusive minimum of N + 1.
  const moreMatch = lower.match(/^more\s+than\s+(\d+)\s+(.+)/);
  if (moreMatch) {
    return {
      type: "min_count",
      subject: subjectOf(moreMatch, 2),
      value: toInt(moreMatch[1]) + 1
    };
  }

  // Bare "N <subject> are <predicate>" is treated as an exact count.
  const numSubjectMatch = lower.match(/^(\d+)\s+(.+?)\s+(?:are|were|is|was)\s+(.+)$/);
  if (numSubjectMatch) {
    return {
      type: "exact_count",
      subject: subjectOf(numSubjectMatch, 2),
      value: toInt(numSubjectMatch[1]),
      predicate: predicateOf(numSubjectMatch, 3)
    };
  }

  // Must be checked before the generic "no <subject>" pattern below.
  if (/^no\s+errors?\s+(in\s+)?(trace|log|output)/i.test(lower)) {
    return { type: "no_errors", subject: "trace" };
  }

  const stateMatch = lower.match(/^(?:the\s+)?(.+?)\s+(?:is|was|has been|should be)\s+(created|merged|closed|open|deleted|removed|resolved|approved|rejected)/);
  if (stateMatch) {
    return {
      type: "state_check",
      subject: subjectOf(stateMatch, 1),
      predicate: stateMatch[2]?.trim()
    };
  }

  const existsMatch = lower.match(/^(?:the\s+)?(.+?)\s+(?:exists?|is present|was created|has been created)/);
  if (existsMatch) {
    return { type: "exists", subject: subjectOf(existsMatch, 1) };
  }

  // "no/zero/none <subject> [remain|exist|...]" is an exact count of zero.
  const noneMatch = lower.match(/^(?:no|zero|none)\s+(.+?)(?:\s+(?:remain|exist|left|present|found))?\s*$/);
  if (noneMatch) {
    return {
      type: "exact_count",
      subject: subjectOf(noneMatch, 1),
      value: 0
    };
  }
  return null;
}
1311
/**
 * Merges the per-twin entity collections of a nested state object into one
 * flat `{ entityName: items[] }` map.
 *
 * Only array-valued properties of object-valued top-level entries are taken;
 * collections with the same name from different twins are concatenated.
 * When nothing could be collected the original state object is returned.
 *
 * @param {object} state - Twin state, either nested per twin or already flat.
 * @returns {object} The flattened map, or `state` itself when nothing was flattened.
 */
function flattenTwinState(state) {
  const isContainer = (candidate) => typeof candidate === "object" && candidate !== null && !Array.isArray(candidate);
  const merged = {};

  for (const twinData of Object.values(state)) {
    if (!isContainer(twinData)) continue;
    const collections = Object.entries(twinData).filter(([, data]) => Array.isArray(data));
    for (const [entityName, items] of collections) {
      const soFar = merged[entityName];
      merged[entityName] = Array.isArray(soFar) ? [...soFar, ...items] : items;
    }
  }

  return Object.keys(merged).length === 0 ? state : merged;
}
1335
/**
 * Finds the collection in the twin state that a subject phrase refers to.
 *
 * Matching runs in passes of decreasing strictness over the flattened state,
 * and a stricter pass always wins regardless of key order:
 *   1. exact key match ignoring case and whitespace, optionally with a
 *      trailing "s" on the key;
 *   2. substring containment between key and subject (arrays only);
 *   3. any single word of the subject contained in a key.
 * Empty keys, empty subjects and empty words never match in the fuzzy passes,
 * since an empty string is a substring of everything.
 *
 * @param {string} subject - Subject phrase from the assertion.
 * @param {object} state - Twin state (nested per twin or flat).
 * @returns {Array|null} Matching items, or null when nothing matches.
 */
function resolveSubjectInState(subject, state) {
  const flat = flattenTwinState(state);
  const normalize = (text) => text.replace(/\s+/g, "").toLowerCase();
  const normalizedSubject = normalize(subject);
  const candidates = Object.entries(flat).map(([key, value]) => ({
    key,
    value,
    normalizedKey: normalize(key)
  }));

  // Pass 1: exact matches first, so a fuzzy hit on an earlier key cannot shadow them.
  for (const { normalizedKey, value } of candidates) {
    if (normalizedKey === normalizedSubject || normalizedKey === `${normalizedSubject}s`) {
      if (Array.isArray(value)) return value;
      if (typeof value === "object" && value !== null) return [value];
    }
  }

  // Pass 2: substring containment in either direction.
  if (normalizedSubject.length > 0) {
    for (const { normalizedKey, value } of candidates) {
      if (normalizedKey.length === 0) continue;
      if (normalizedSubject.includes(normalizedKey) || normalizedKey.includes(normalizedSubject)) {
        if (Array.isArray(value)) return value;
      }
    }
  }

  // Pass 3: any word of the subject appearing inside a key.
  const subjectWords = subject.toLowerCase().split(/\s+/).filter((word) => word.length > 0);
  for (const { key, value } of candidates) {
    if (typeof value !== "object" || value === null) continue;
    const lowerKey = key.toLowerCase();
    if (subjectWords.some((word) => lowerKey.includes(word))) {
      return Array.isArray(value) ? value : Object.values(value);
    }
  }
  return null;
}
1360
/**
 * Keeps the items that satisfy a state predicate such as "open" or "merged".
 *
 * An object item matches when its `state` or `status` equals the predicate
 * case-insensitively, or when the predicate-specific check holds
 * (`merged`/`closed`/`deleted`/`resolved` set to true, a defined `id` for
 * "created", or the matching literal `state`). Non-object items never match.
 *
 * @param {Array} items - Candidate items.
 * @param {string} predicate - Predicate word.
 * @returns {Array} Matching items in their original order.
 */
function filterByPredicate(items, predicate) {
  const wanted = predicate.toLowerCase();

  const fieldEquals = (record, field) => record[field] !== void 0 && String(record[field]).toLowerCase() === wanted;

  const matchesFlag = (record) => {
    switch (wanted) {
      case "merged":
        return record["merged"] === true;
      case "closed":
        return record["state"] === "closed" || record["closed"] === true;
      case "open":
        return record["state"] === "open";
      case "created":
        return record["id"] !== void 0;
      case "deleted":
        return record["deleted"] === true;
      case "resolved":
        return record["state"] === "resolved" || record["resolved"] === true;
      default:
        return false;
    }
  };

  return items.filter((item) => {
    const isRecord = item !== null && typeof item === "object";
    return isRecord && (fieldEquals(item, "state") || fieldEquals(item, "status") || matchesFlag(item));
  });
}
1380
/**
 * Counts the trace entries that recorded an error.
 *
 * An entry counts only when its `error` is neither null nor undefined, so an
 * entry that simply has no `error` field is not reported as a failure.
 *
 * @param {Array<{error?: unknown}>} trace - Trace entries.
 * @returns {number} Number of entries with an error.
 */
function countTraceErrors(trace) {
  return trace.filter((entry) => entry?.error !== null && entry?.error !== void 0).length;
}
1383
/**
 * Evaluates a deterministic criterion against the recorded twin state.
 *
 * The criterion description is parsed into an assertion, which is then checked
 * against the state view. Unparseable descriptions and subjects that cannot be
 * found produce a "fail" result with reduced confidence.
 *
 * @param {{id: string, description: string}} criterion - Criterion to evaluate.
 * @param {{before: object, after: object, diff: {added: object}, trace: Array}} stateView
 *   State before/after the run, the state diff, and the tool-call trace.
 * @returns {{criterionId: string, status: string, confidence: number, explanation: string}|undefined}
 *   Evaluation result; undefined for an assertion type not handled here.
 */
function evaluateDeterministic(criterion, stateView) {
  const verdict = (status, confidence, explanation) => ({
    criterionId: criterion.id,
    status,
    confidence,
    explanation
  });

  const assertion = parseAssertion(criterion.description);
  if (!assertion) {
    debug(`Could not parse deterministic assertion: "${criterion.description}"`);
    return verdict("fail", 0.5, `Could not parse deterministic assertion from: "${criterion.description}"`);
  }
  debug("Parsed assertion", {
    type: assertion.type,
    subject: assertion.subject,
    value: String(assertion.value ?? ""),
    predicate: assertion.predicate ?? ""
  });

  const { type, subject, predicate, labelFilter } = assertion;
  const itemsIn = (snapshot) => resolveSubjectInState(subject, snapshot);
  const countVerdict = (actual) => evaluateCount(criterion.id, type, assertion.value ?? 0, actual, subject, predicate);

  if (type === "no_errors") {
    const errorCount = countTraceErrors(stateView.trace);
    return errorCount === 0 ? verdict("pass", 1, "No errors found in trace") : verdict("fail", 1, `Found ${errorCount} error(s) in trace`);
  }

  if (type === "exact_count" || type === "min_count" || type === "max_count") {
    const afterItems = itemsIn(stateView.after);
    if (afterItems === null) {
      // Fall back to the diff when the subject is not a known collection.
      const addedItems = stateView.diff.added[subject];
      if (addedItems) {
        return countVerdict(addedItems.length);
      }
      return verdict("fail", 0.5, `Could not find "${subject}" in twin state`);
    }
    if (!predicate) {
      return countVerdict(afterItems.length);
    }
    const beforeItems = itemsIn(stateView.before);
    const afterFiltered = filterByPredicate(afterItems, predicate);
    if (!beforeItems) {
      return countVerdict(afterFiltered.length);
    }
    // With a "before" snapshot, count only items that newly match the predicate.
    const beforeFiltered = filterByPredicate(beforeItems, predicate);
    return countVerdict(Math.max(0, afterFiltered.length - beforeFiltered.length));
  }

  if (type === "no_matching") {
    const items = itemsIn(stateView.after);
    if (!items) {
      return verdict("fail", 0.5, `Could not find "${subject}" in twin state`);
    }
    let labelFiltered = items;
    if (labelFilter) {
      const wantedLabel = labelFilter.toLowerCase();
      labelFiltered = items.filter((item) => {
        if (typeof item !== "object" || item === null) return false;
        const labels = item["labels"];
        if (!Array.isArray(labels)) return false;
        // Labels may be plain strings or objects carrying a `name`.
        return labels.some((label) => {
          const labelName = typeof label === "string" ? label : label?.["name"];
          return String(labelName).toLowerCase() === wantedLabel;
        });
      });
    }
    const matching = predicate ? filterByPredicate(labelFiltered, predicate) : labelFiltered;
    if (matching.length === 0) {
      return verdict("pass", 1, `No ${subject} labeled "${labelFilter}" are ${predicate}`);
    }
    return verdict("fail", 1, `Found ${matching.length} ${subject} labeled "${labelFilter}" that are ${predicate}`);
  }

  if (type === "exists") {
    const items = itemsIn(stateView.after);
    const found = items !== null && items.length > 0;
    return found ? verdict("pass", 1, `"${subject}" exists in twin state`) : verdict("fail", 1, `"${subject}" not found in twin state`);
  }

  if (type === "not_exists") {
    const items = itemsIn(stateView.after);
    const absent = items === null || items.length === 0;
    return absent ? verdict("pass", 1, `"${subject}" does not exist in twin state`) : verdict("fail", 1, `"${subject}" still exists in twin state`);
  }

  if (type === "state_check") {
    const items = itemsIn(stateView.after);
    if (!items || items.length === 0) {
      return verdict("fail", 0.8, `Could not find "${subject}" in twin state to check status`);
    }
    const matching = predicate ? filterByPredicate(items, predicate) : items;
    const expectedState = predicate ?? "in expected state";
    return matching.length > 0 ? verdict("pass", 1, `"${subject}" is ${expectedState}`) : verdict("fail", 1, `"${subject}" is not ${expectedState}`);
  }

  if (type === "comparison") {
    return verdict("fail", 0.3, `Comparison assertion type not fully implemented for: "${criterion.description}"`);
  }
  return void 0;
}
1547
/**
 * Compares an observed count with the expected count for a count assertion.
 *
 * @param {string} criterionId - Id copied into the result.
 * @param {"exact_count"|"min_count"|"max_count"} type - Comparison kind.
 * @param {number} expected - Expected count.
 * @param {number} actual - Observed count.
 * @param {string} subject - Subject used in the explanation.
 * @param {string} [predicate] - Optional predicate appended to the subject.
 * @returns {{criterionId: string, status: string, confidence: number, explanation: string}|undefined}
 *   Evaluation result; undefined for any other type.
 */
function evaluateCount(criterionId, type, expected, actual, subject, predicate) {
  const subjectDesc = predicate ? `${subject} ${predicate}` : subject;

  // Each rule is evaluated lazily so only the selected message is built.
  const rules = new Map([
    ["exact_count", {
      holds: () => actual === expected,
      onPass: () => `Found exactly ${expected} ${subjectDesc}`,
      onFail: () => `Expected exactly ${expected} ${subjectDesc}, found ${actual}`
    }],
    ["min_count", {
      holds: () => actual >= expected,
      onPass: () => `Found ${actual} ${subjectDesc} (>= ${expected})`,
      onFail: () => `Expected at least ${expected} ${subjectDesc}, found ${actual}`
    }],
    ["max_count", {
      holds: () => actual <= expected,
      onPass: () => `Found ${actual} ${subjectDesc} (<= ${expected})`,
      onFail: () => `Expected at most ${expected} ${subjectDesc}, found ${actual}`
    }]
  ]);

  const rule = rules.get(type);
  if (!rule) return void 0;
  const passed = rule.holds();
  return {
    criterionId,
    status: passed ? "pass" : "fail",
    confidence: 1,
    explanation: passed ? rule.onPass() : rule.onFail()
  };
}
1579
+
1580
+ // src/evaluator/llm-judge.ts
1581
+ import Anthropic from "@anthropic-ai/sdk";
1582
// System prompt for the LLM judge (sent as `system` by evaluateWithLlm).
// It asks the model to answer with a single JSON object holding `status`,
// `confidence` and `explanation`, which parseJudgeResponse then extracts.
// The text is sent at runtime: any edit changes what the model is asked.
var SYSTEM_PROMPT = `You are an evaluator for AI agent testing. You assess whether an agent successfully met a specific success criterion during a scenario run.

You will receive:
1. A success criterion to evaluate
2. The expected behavior description
3. The state of the digital twin before the agent ran
4. The state of the digital twin after the agent ran
5. A diff of state changes
6. The complete trace of tool calls the agent made

Your job is to determine if the criterion was met. Respond ONLY with valid JSON in this exact format:

{
"status": "pass" | "fail" | "partial",
"confidence": <number between 0 and 1>,
"explanation": "<brief explanation of your assessment>"
}

Rules:
- "pass" means the criterion is clearly satisfied
- "fail" means the criterion is clearly not satisfied
- "partial" means the criterion is partially satisfied or the evidence is ambiguous
- confidence is how certain you are in your assessment (1.0 = completely certain, 0.5 = uncertain)
- Keep explanations concise (1-2 sentences)
- Focus on observable evidence in the state and trace, not assumptions
- If the criterion is about quality or helpfulness, assess based on content present in the state`;
1608
/**
 * Builds the user message for the LLM judge: the criterion, the expected
 * behavior, summarized before/after state, the state diff and the agent trace.
 *
 * Traces longer than 50 calls are shortened to the first 25 and last 25
 * entries with a marker object noting how many calls were omitted.
 *
 * @param {object} context - Evaluation context with `criterion`,
 *   `expectedBehavior`, `stateBefore`, `stateAfter`, `stateDiff` and `trace`.
 * @returns {string} Markdown-style prompt text.
 */
function buildUserPrompt(context) {
  const { trace } = context;
  const callCount = trace.length;
  const isTruncated = callCount > 50;

  const formattedCalls = trace.map(({ toolName, input, output, error, durationMs }) => ({
    tool: toolName,
    input,
    output,
    error,
    durationMs
  }));

  let traceSummary = "";
  let traceToShow = formattedCalls;
  if (isTruncated) {
    traceSummary = `(Showing first 25 and last 25 of ${callCount} total calls)`;
    traceToShow = [
      ...formattedCalls.slice(0, 25),
      { note: `... ${callCount - 50} calls omitted ...` },
      ...formattedCalls.slice(-25)
    ];
  }

  const pretty = (value) => JSON.stringify(value, null, 2);
  const sections = [
    ["## Success Criterion", context.criterion.description],
    ["## Expected Behavior", context.expectedBehavior],
    ["## State Before", pretty(summarizeState(context.stateBefore))],
    ["## State After", pretty(summarizeState(context.stateAfter))],
    ["## State Diff", pretty(context.stateDiff)],
    [`## Agent Trace ${traceSummary}`, pretty(traceToShow)]
  ];
  return sections.map(([heading, body]) => `${heading}\n${body}`).join("\n\n");
}
1636
/**
 * Produces a compact copy of a state object for inclusion in a prompt.
 *
 * Arrays with more than 10 items are replaced by an object holding their
 * length plus their first and last three items; every other value is copied
 * by reference.
 *
 * @param {object} state - State to summarize.
 * @returns {object} Summary with the same keys as `state`.
 */
function summarizeState(state) {
  const condense = (value) => {
    const isLongArray = Array.isArray(value) && value.length > 10;
    if (!isLongArray) return value;
    return {
      _count: value.length,
      _first3: value.slice(0, 3),
      _last3: value.slice(-3)
    };
  };

  const summary = {};
  Object.keys(state).forEach((key) => {
    summary[key] = condense(state[key]);
  });
  return summary;
}
1655
/**
 * Finds the first substring of `text` that parses as a JSON object.
 *
 * Candidates start at each "{" (left to right) and end at each later "}"
 * (right to left), so the widest span is tried first and surrounding
 * commentary that itself contains braces does not hide a valid object.
 *
 * @param {string} text - Raw model output.
 * @returns {{found: boolean, value: object|null}} `found` tells whether any
 *   "{...}" span exists at all; `value` is the parsed object or null.
 */
function extractJudgeJson(text) {
  const opens = [];
  const closes = [];
  for (let i = 0; i < text.length; i++) {
    if (text[i] === "{") opens.push(i);
    else if (text[i] === "}") closes.push(i);
  }
  const found = opens.length > 0 && closes.length > 0 && closes[closes.length - 1] > opens[0];
  if (!found) return { found: false, value: null };
  for (const start of opens) {
    for (let c = closes.length - 1; c >= 0 && closes[c] > start; c--) {
      try {
        const candidate = JSON.parse(text.slice(start, closes[c] + 1));
        if (candidate !== null && typeof candidate === "object") {
          return { found: true, value: candidate };
        }
      } catch {
        // Not valid JSON for this span: deliberately ignored, try the next one.
      }
    }
  }
  return { found: true, value: null };
}

/**
 * Parses the LLM judge's reply into a normalized verdict.
 *
 * The reply may wrap the JSON object in extra prose. Any reply that contains
 * no parseable object, or whose `status` is not "pass", "fail" or "partial",
 * yields a "fail" verdict with confidence 0.3. `confidence` is clamped to
 * [0, 1] and defaults to 0.5; `explanation` defaults to a placeholder.
 *
 * @param {string} text - Raw text returned by the model.
 * @returns {{status: "pass"|"fail"|"partial", confidence: number, explanation: string}} Verdict.
 */
function parseJudgeResponse(text) {
  const { found, value: parsed } = extractJudgeJson(text);
  if (!found) {
    warn("LLM judge did not return valid JSON, defaulting to fail");
    return {
      status: "fail",
      confidence: 0.3,
      explanation: "Could not parse evaluator response"
    };
  }
  if (parsed === null) {
    warn("Failed to parse LLM judge JSON response");
    return {
      status: "fail",
      confidence: 0.3,
      explanation: "Could not parse evaluator response JSON"
    };
  }
  const status = parsed["status"];
  if (status !== "pass" && status !== "fail" && status !== "partial") {
    return {
      status: "fail",
      confidence: 0.3,
      explanation: `Invalid status from evaluator: ${String(status)}`
    };
  }
  const confidence = typeof parsed["confidence"] === "number" ? Math.max(0, Math.min(1, parsed["confidence"])) : 0.5;
  const explanation = typeof parsed["explanation"] === "string" ? parsed["explanation"] : "No explanation provided";
  return { status, confidence, explanation };
}
1687
// Lazily created Anthropic client and the API key it was created with.
var clientInstance = null;
var clientInstanceApiKey = null;
/**
 * Returns a cached Anthropic client for the given API key.
 *
 * The client is created on first use and reused afterwards. When a later call
 * passes a different key, a new client is created, so a stale key is never
 * reused silently.
 *
 * @param {string} apiKey - Anthropic API key.
 * @returns {Anthropic} Client configured with `apiKey`.
 */
function getClient(apiKey) {
  if (!clientInstance || clientInstanceApiKey !== apiKey) {
    clientInstance = new Anthropic({ apiKey });
    clientInstanceApiKey = apiKey;
  }
  return clientInstance;
}
1694
/**
 * Evaluates a probabilistic criterion by asking the LLM judge.
 *
 * Never rejects because of the judge call: a missing API key, a reply without
 * text content, and a failed API call each produce a "fail" result whose
 * explanation states the cause.
 *
 * @param {{id: string, description: string}} criterion - Criterion to evaluate.
 * @param {string} expectedBehavior - Expected behavior description.
 * @param {object} stateBefore - Twin state before the run.
 * @param {object} stateAfter - Twin state after the run.
 * @param {object} stateDiff - Diff between the two states.
 * @param {Array} trace - Tool-call trace of the run.
 * @param {{apiKey?: string, model: string}} options - Judge configuration.
 * @returns {Promise<{criterionId: string, status: string, confidence: number, explanation: string}>} Result.
 */
async function evaluateWithLlm(criterion, expectedBehavior, stateBefore, stateAfter, stateDiff, trace, options) {
  const outcome = (status, confidence, explanation) => ({
    criterionId: criterion.id,
    status,
    confidence,
    explanation
  });

  if (!options.apiKey) {
    error("No API key provided for LLM evaluation");
    return outcome("fail", 0, "No ANTHROPIC_API_KEY configured for probabilistic evaluation");
  }

  const client = getClient(options.apiKey);
  const userPrompt = buildUserPrompt({
    criterion,
    expectedBehavior,
    stateBefore,
    stateAfter,
    stateDiff,
    trace
  });
  debug("Calling LLM judge", {
    criterion: criterion.id,
    model: options.model,
    traceLength: String(trace.length)
  });

  try {
    const response = await client.messages.create({
      model: options.model,
      max_tokens: 512,
      system: SYSTEM_PROMPT,
      messages: [{ role: "user", content: userPrompt }]
    });
    const textBlock = response.content.find((block) => block.type === "text");
    if (!textBlock || textBlock.type !== "text") {
      return outcome("fail", 0.3, "LLM returned no text content");
    }
    const { status, confidence, explanation } = parseJudgeResponse(textBlock.text);
    debug("LLM judge result", {
      criterion: criterion.id,
      status,
      confidence: confidence.toFixed(2)
    });
    return outcome(status, confidence, explanation);
  } catch (err) {
    const message = err instanceof Error ? err.message : String(err);
    error(`LLM judge call failed: ${message}`);
    return outcome("fail", 0, `LLM evaluation failed: ${message}`);
  }
}
1763
+
1764
+ // src/evaluator/evaluator.ts
1765
/**
 * Computes a run's score as the mean of per-criterion points:
 * 100 for "pass", `50 * confidence` for "partial", 0 otherwise.
 *
 * @param {Array<{status: string, confidence: number}>} evaluations - Criterion results.
 * @returns {number} Score from 0 to 100; 0 for an empty list.
 */
function calculateOverallScore(evaluations) {
  if (evaluations.length === 0) return 0;

  const pointsFor = ({ status, confidence }) => {
    if (status === "pass") return 100;
    if (status === "partial") return 50 * confidence;
    return 0;
  };

  const totalScore = evaluations.reduce((sum, evaluation) => sum + pointsFor(evaluation), 0);
  return totalScore / evaluations.length;
}
1783
/**
 * Evaluates all criteria of a scenario for one run.
 *
 * Deterministic criteria are checked first against the recorded state, then
 * probabilistic criteria are sent to the LLM judge one at a time. The results
 * are returned in the order the criteria were declared.
 *
 * @param {Array<{id: string, type: string, description: string}>} criteria - Criteria to evaluate.
 * @param {object} context - Run context with `expectedBehavior`, `stateBefore`,
 *   `stateAfter`, `stateDiff` and `trace`.
 * @param {{apiKey?: string, model: string}} config - LLM judge configuration.
 * @returns {Promise<{evaluations: Array, overallScore: number}>} Results and the run score.
 */
async function evaluateRun(criteria, context, config) {
  const evaluations = [];
  const deterministicCriteria = criteria.filter((c) => c.type === "deterministic");
  const probabilisticCriteria = criteria.filter((c) => c.type === "probabilistic");
  info("Evaluating criteria", {
    total: String(criteria.length),
    deterministic: String(deterministicCriteria.length),
    probabilistic: String(probabilisticCriteria.length)
  });
  for (const criterion of deterministicCriteria) {
    progress(`Evaluating [D] ${criterion.description}`);
    const result = evaluateDeterministic(criterion, {
      before: context.stateBefore,
      after: context.stateAfter,
      diff: context.stateDiff,
      trace: context.trace
    });
    evaluations.push(result);
    debug("Deterministic evaluation", {
      criterion: criterion.id,
      status: result.status
    });
  }
  for (const criterion of probabilisticCriteria) {
    progress(`Evaluating [P] ${criterion.description}`);
    const result = await evaluateWithLlm(
      criterion,
      context.expectedBehavior,
      context.stateBefore,
      context.stateAfter,
      context.stateDiff,
      context.trace,
      { apiKey: config.apiKey, model: config.model }
    );
    evaluations.push(result);
    debug("Probabilistic evaluation", {
      criterion: criterion.id,
      status: result.status,
      confidence: result.confidence.toFixed(2)
    });
  }

  // Restore declaration order. Positions are looked up from a map built once
  // instead of scanning `criteria` twice per comparison.
  const positionById = /* @__PURE__ */ new Map();
  criteria.forEach((c, index) => {
    // Keep the first position for a repeated id, as a first-match scan would.
    if (!positionById.has(c.id)) positionById.set(c.id, index);
  });
  const positionOf = (evaluation) => positionById.get(evaluation.criterionId) ?? -1;
  evaluations.sort((a, b) => positionOf(a) - positionOf(b));

  const overallScore = calculateOverallScore(evaluations);
  info("Evaluation complete", {
    overallScore: overallScore.toFixed(1) + "%",
    passed: String(evaluations.filter((e) => e.status === "pass").length),
    failed: String(evaluations.filter((e) => e.status === "fail").length),
    partial: String(evaluations.filter((e) => e.status === "partial").length)
  });
  return { evaluations, overallScore };
}
1838
/**
 * Collapses per-run scores into one satisfaction score.
 *
 * @param {number[]} runScores Overall score of each run.
 * @returns {number} Mean of the scores rounded to one decimal place; 0 for no runs.
 */
function aggregateSatisfaction(runScores) {
  const runCount = runScores.length;
  if (runCount === 0) return 0;
  let total = 0;
  for (const score of runScores) {
    total += score;
  }
  const mean = total / runCount;
  return Math.round(mean * 10) / 10;
}
1843
/**
 * Builds a one-line, human-readable summary across all runs.
 *
 * Each criterion is classified as consistently passing (passed in every run),
 * consistently failing (failed in every run) or non-deterministic (anything else).
 *
 * @param {Array<Array<{criterionId: string, status: string}>>} evaluations One array of evaluations per run.
 * @param {number} satisfactionScore Aggregate score shown in the first sentence.
 * @returns {string} Sentences joined by single spaces.
 */
function generateSummary(evaluations, satisfactionScore) {
  const totalRuns = evaluations.length;
  const bucketOf = (status) => status === "pass" ? "passed" : status === "fail" ? "failed" : "partial";

  // Tally outcomes per criterion, keeping first-seen order.
  const tallies = /* @__PURE__ */ new Map();
  for (const runEvals of evaluations) {
    for (const { criterionId, status } of runEvals) {
      if (!tallies.has(criterionId)) {
        tallies.set(criterionId, { passed: 0, failed: 0, partial: 0 });
      }
      tallies.get(criterionId)[bucketOf(status)] += 1;
    }
  }

  const groups = { passing: [], failing: [], flaky: [] };
  tallies.forEach((counts, criterionId) => {
    const group = counts.passed === totalRuns ? "passing" : counts.failed === totalRuns ? "failing" : "flaky";
    groups[group].push(criterionId);
  });

  const sentences = [`Satisfaction: ${satisfactionScore.toFixed(1)}% across ${totalRuns} runs.`];
  if (groups.passing.length > 0) sentences.push(`Consistently passing: ${groups.passing.join(", ")}.`);
  if (groups.failing.length > 0) sentences.push(`Consistently failing: ${groups.failing.join(", ")}.`);
  if (groups.flaky.length > 0) sentences.push(`Non-deterministic: ${groups.flaky.join(", ")}.`);
  return sentences.join(" ");
}
1880
+
1881
+ // src/telemetry/recorder.ts
1882
import { mkdirSync as mkdirSync2, writeFileSync as writeFileSync3, readFileSync as readFileSync5, readdirSync, existsSync as existsSync4, statSync as statTraceFile, unlinkSync as unlinkTraceFile } from "fs";
import { join as join3 } from "path";
import { randomUUID } from "crypto";
1885
+
1886
+ // src/config/config.ts
1887
+ import { readFileSync as readFileSync4, writeFileSync as writeFileSync2, mkdirSync, existsSync as existsSync3 } from "fs";
1888
+ import { join as join2 } from "path";
1889
+ import { homedir } from "os";
1890
+ import { z } from "zod";
1891
// Directory (joined onto the user's home directory) and file name of the CLI config.
var ARCHAL_DIR_NAME = ".archal";
var CONFIG_FILE_NAME = "config.json";

// Evaluator settings. An apiKey of the form "env:NAME" is resolved from the
// environment by resolveApiKey().
var evaluatorConfigSchema = z.object({
  model: z.string().default("claude-sonnet-4-20250514"),
  apiKey: z.string().default("env:ANTHROPIC_API_KEY")
});

// Per-run defaults; both values must be positive integers.
var defaultsConfigSchema = z.object({
  runs: z.number().int().positive().default(5),
  timeout: z.number().int().positive().default(120)
});

// Shape of the config file. Every section has defaults, so parsing `{}`
// yields a complete configuration.
var configFileSchema = z.object({
  telemetry: z.boolean().default(false),
  evaluator: evaluatorConfigSchema.default({}),
  defaults: defaultsConfigSchema.default({})
});
1906
/**
 * @returns {string} Path of the archal directory inside the current user's home directory.
 */
function getArchalDir() {
  const home = homedir();
  return join2(home, ARCHAL_DIR_NAME);
}
1909
/**
 * @returns {string} Path of the config file inside the archal directory.
 */
function getConfigPath() {
  const archalDir = getArchalDir();
  return join2(archalDir, CONFIG_FILE_NAME);
}
1912
/**
 * Creates the archal directory (and any missing parents) if it does not exist yet.
 *
 * @returns {string} Path of the archal directory.
 */
function ensureArchalDir() {
  const archalDir = getArchalDir();
  if (existsSync3(archalDir)) {
    return archalDir;
  }
  mkdirSync(archalDir, { recursive: true });
  debug("Created archal directory", { path: archalDir });
  return archalDir;
}
1920
/**
 * Reads and validates the config file.
 *
 * A missing file yields the schema defaults. An unreadable, unparsable or
 * invalid file logs a warning and also yields the schema defaults.
 *
 * @returns {object} Config validated by `configFileSchema`.
 */
function loadConfigFile() {
  const configPath = getConfigPath();
  const schemaDefaults = () => configFileSchema.parse({});

  if (!existsSync3(configPath)) {
    debug("No config file found, using defaults", { path: configPath });
    return schemaDefaults();
  }

  try {
    const config = configFileSchema.parse(JSON.parse(readFileSync4(configPath, "utf-8")));
    debug("Loaded config file", { path: configPath });
    return config;
  } catch (err) {
    const message = err instanceof Error ? err.message : String(err);
    warn(`Failed to parse config file at ${configPath}: ${message}`);
    return schemaDefaults();
  }
}
1938
/**
 * Resolves an API key setting to its actual value.
 *
 * @param {string} apiKeyConfig Either a literal key or "env:NAME" to read from the environment.
 * @returns {string} The key; an empty string when the referenced environment variable is unset.
 */
function resolveApiKey(apiKeyConfig) {
  const ENV_PREFIX = "env:";
  if (!apiKeyConfig.startsWith(ENV_PREFIX)) {
    return apiKeyConfig;
  }
  const variableName = apiKeyConfig.slice(ENV_PREFIX.length);
  const fromEnv = process.env[variableName];
  return fromEnv ?? "";
}
1945
/**
 * Produces the effective configuration: config file values overridden by
 * environment variables (ARCHAL_TELEMETRY, ARCHAL_MODEL, ARCHAL_RUNS,
 * ARCHAL_TIMEOUT, ANTHROPIC_API_KEY).
 *
 * ARCHAL_RUNS and ARCHAL_TIMEOUT must parse to positive integers; an invalid
 * value is ignored with a warning and the config file's value is used instead.
 *
 * @returns {{telemetry: boolean, apiKey: string, model: string, runs: number,
 *   timeout: number, archalDir: string, configPath: string}}
 */
function loadConfig() {
  const file = loadConfigFile();

  // Reads a positive integer from the environment. Previously 0 or negative
  // values were accepted and non-numeric values bypassed the config file in
  // favour of hard-coded defaults.
  const positiveIntFromEnv = (name, fallback) => {
    const raw = process.env[name];
    if (raw === void 0) return fallback;
    const parsed = parseInt(raw, 10);
    if (Number.isNaN(parsed) || parsed <= 0) {
      warn(`Ignoring invalid ${name} value "${raw}"; using ${fallback}`);
      return fallback;
    }
    return parsed;
  };

  const envTelemetry = process.env["ARCHAL_TELEMETRY"];
  const envModel = process.env["ARCHAL_MODEL"];
  const envApiKey = process.env["ANTHROPIC_API_KEY"];
  const telemetry = envTelemetry !== void 0 ? envTelemetry === "true" : file.telemetry;
  const model = envModel ?? file.evaluator.model;
  const runs = positiveIntFromEnv("ARCHAL_RUNS", file.defaults.runs);
  const timeout = positiveIntFromEnv("ARCHAL_TIMEOUT", file.defaults.timeout);
  // ANTHROPIC_API_KEY, when set, takes precedence over the config file's key.
  const apiKey = envApiKey ?? resolveApiKey(file.evaluator.apiKey);
  return {
    telemetry,
    apiKey,
    model,
    runs,
    timeout,
    archalDir: getArchalDir(),
    configPath: getConfigPath()
  };
}
1967
/**
 * Merges the given partial config into what is stored on disk and writes the result.
 *
 * If the stored file is missing or cannot be read/validated, schema defaults
 * are used as the base. `evaluator` and `defaults` are merged key by key.
 *
 * @param {{telemetry?: boolean, evaluator?: object, defaults?: object}} config
 * @returns {void}
 */
function saveConfig(config) {
  const configPath = join2(ensureArchalDir(), CONFIG_FILE_NAME);

  const readStored = () => {
    if (!existsSync3(configPath)) {
      return configFileSchema.parse({});
    }
    try {
      return configFileSchema.parse(JSON.parse(readFileSync4(configPath, "utf-8")));
    } catch {
      return configFileSchema.parse({});
    }
  };

  const existing = readStored();
  const evaluator = { ...existing.evaluator, ...config.evaluator };
  const defaults = { ...existing.defaults, ...config.defaults };
  const merged = {
    telemetry: config.telemetry ?? existing.telemetry,
    evaluator,
    defaults
  };
  const serialized = JSON.stringify(merged, null, 2) + "\n";
  writeFileSync2(configPath, serialized, "utf-8");
  debug("Saved config file", { path: configPath });
}
1995
/**
 * Writes a config file containing the schema defaults unless one already exists.
 *
 * @returns {string} Path of the (existing or newly created) config file.
 */
function initConfig() {
  const configPath = getConfigPath();
  const alreadyPresent = existsSync3(configPath);
  if (alreadyPresent) {
    warn(`Config file already exists at ${configPath}`);
  } else {
    const defaultConfig = configFileSchema.parse({});
    ensureArchalDir();
    const serialized = JSON.stringify(defaultConfig, null, 2) + "\n";
    writeFileSync2(configPath, serialized, "utf-8");
  }
  return configPath;
}
2006
/**
 * Sets one config value from its dotted key and string value, then saves the file.
 *
 * Supported keys: telemetry, evaluator.model, evaluator.apiKey, defaults.runs,
 * defaults.timeout. `telemetry` becomes true only for the exact string "true".
 *
 * @param {string} key Dotted config key.
 * @param {string} value Raw value as typed by the user.
 * @returns {void}
 * @throws {Error} On an unknown key, or a runs/timeout value that is not a positive integer.
 */
function setConfigValue(key, value) {
  const file = loadConfigFile();
  switch (key) {
    case "telemetry": {
      saveConfig({ ...file, telemetry: value === "true" });
      return;
    }
    case "evaluator.model":
    case "evaluator.apiKey": {
      const prop = key.split(".")[1];
      const evaluator = { ...file.evaluator, [prop]: value };
      saveConfig({ ...file, evaluator });
      return;
    }
    case "defaults.runs":
    case "defaults.timeout": {
      const prop = key.split(".")[1];
      const numValue = parseInt(value, 10);
      const isValid = !Number.isNaN(numValue) && numValue > 0;
      if (!isValid) {
        throw new Error(`Invalid numeric value for ${key}: ${value}`);
      }
      const defaults = { ...file.defaults, [prop]: numValue };
      saveConfig({ ...file, defaults });
      return;
    }
    default:
      throw new Error(
        `Unknown config key: "${key}". Valid keys: telemetry, evaluator.model, evaluator.apiKey, defaults.runs, defaults.timeout`
      );
  }
}
2041
/**
 * Returns the effective configuration in a shape suitable for display.
 * The API key is masked: only its last four characters are shown after "***".
 *
 * @returns {{telemetry: boolean, evaluator: object, defaults: object, paths: object}}
 */
function getConfigDisplay() {
  const { telemetry, model, apiKey, runs, timeout, archalDir, configPath } = loadConfig();
  let maskedKey = "(not set)";
  if (apiKey) {
    maskedKey = "***" + apiKey.slice(-4);
  }
  return {
    telemetry,
    evaluator: { model, apiKey: maskedKey },
    defaults: { runs, timeout },
    paths: { archalDir, configFile: configPath }
  };
}
2059
+
2060
+ // src/telemetry/recorder.ts
2061
// Sub-directory of the archal directory that holds recorded traces.
var TRACES_DIR = "traces";
// Maximum number of trace files kept when pruning.
var MAX_STORED_TRACES = 100;
2063
/**
 * @returns {string} Path of the traces directory inside the archal directory.
 */
function getTracesDir() {
  const archalDir = getArchalDir();
  return join3(archalDir, TRACES_DIR);
}
2066
/**
 * Creates the traces directory (and the archal directory) if it does not exist yet.
 *
 * @returns {string} Path of the traces directory.
 */
function ensureTracesDir() {
  const tracesDir = getTracesDir();
  if (existsSync4(tracesDir)) {
    return tracesDir;
  }
  ensureArchalDir();
  mkdirSync2(tracesDir, { recursive: true });
  debug("Created traces directory", { path: tracesDir });
  return tracesDir;
}
2075
/**
 * @param {string} traceId Trace identifier.
 * @returns {string} Path of the JSON file that stores the trace.
 */
function traceFilePath(traceId) {
  const fileName = `${traceId}.json`;
  return join3(getTracesDir(), fileName);
}
2078
/**
 * Stores a report, together with the flattened trace entries of all its runs,
 * as a new JSON file in the traces directory, then prunes old traces.
 *
 * @param {object} report Report with `runs` (each having a `trace`),
 *   `scenarioTitle`, `timestamp` and `satisfactionScore`.
 * @returns {string} The newly generated trace id (a random UUID).
 */
function recordTrace(report) {
  const traceId = randomUUID();
  const tracesDir = ensureTracesDir();

  const entries = [];
  for (const { trace } of report.runs) {
    for (const entry of trace) {
      entries.push(entry);
    }
  }

  const { scenarioTitle, timestamp, satisfactionScore } = report;
  const stored = {
    id: traceId,
    scenarioTitle,
    timestamp,
    satisfactionScore,
    runCount: report.runs.length,
    entries,
    report
  };

  const filePath = traceFilePath(traceId);
  writeFileSync3(filePath, JSON.stringify(stored, null, 2), "utf-8");
  debug("Recorded trace", { id: traceId, path: filePath, entries: String(entries.length) });
  pruneOldTraces(tracesDir);
  return traceId;
}
2100
/**
 * Deletes the oldest trace files so that at most MAX_STORED_TRACES remain.
 *
 * Best effort: failures are logged at debug level and never thrown.
 *
 * Two fixes over the previous version:
 *  - `unlinkSync` was obtained through the bundler's `__require` shim, which
 *    throws when `require` is undefined (as in an ES module); the error was
 *    swallowed, so nothing was ever deleted. A static import is used instead.
 *  - Files were ordered by name, but names are random UUIDs, so the "old"
 *    files were chosen arbitrarily. Files are now ordered by modification time.
 *
 * @param {string} dir Directory containing the trace JSON files.
 * @returns {void}
 */
function pruneOldTraces(dir) {
  let files;
  try {
    files = readdirSync(dir).filter((f) => f.endsWith(".json"));
  } catch (err) {
    const message = err instanceof Error ? err.message : String(err);
    debug(`Could not list traces for pruning: ${message}`);
    return;
  }
  if (files.length <= MAX_STORED_TRACES) return;

  const dated = [];
  for (const file of files) {
    try {
      dated.push({ file, mtimeMs: statTraceFile(join3(dir, file)).mtimeMs });
    } catch {
      // The file disappeared or is unreadable; nothing to prune for it.
    }
  }
  // Newest first, so everything past the limit is the oldest surplus.
  dated.sort((a, b) => b.mtimeMs - a.mtimeMs);

  for (const { file } of dated.slice(MAX_STORED_TRACES)) {
    try {
      unlinkTraceFile(join3(dir, file));
      debug("Pruned old trace", { file });
    } catch (err) {
      const message = err instanceof Error ? err.message : String(err);
      debug(`Failed to prune trace ${file}: ${message}`);
    }
  }
}
2117
/**
 * Loads a stored trace by id, falling back to the first stored trace whose id
 * starts with the given string.
 *
 * @param {string} traceId Full trace id or a prefix of one.
 * @returns {object|null} The stored trace, or null when none matches or loading fails.
 */
function loadTrace(traceId) {
  const exactPath = traceFilePath(traceId);
  if (existsSync4(exactPath)) {
    return loadTraceByPath(exactPath);
  }
  const matchedId = findTraceByPrefix(traceId);
  return matchedId ? loadTraceByPath(traceFilePath(matchedId)) : null;
}
2128
/**
 * Reads and parses a trace file.
 *
 * @param {string} filePath Path of the trace JSON file.
 * @returns {object|null} Parsed content, or null (with a warning) if reading or parsing fails.
 */
function loadTraceByPath(filePath) {
  let parsed = null;
  try {
    parsed = JSON.parse(readFileSync5(filePath, "utf-8"));
  } catch (err) {
    const message = err instanceof Error ? err.message : String(err);
    warn(`Failed to load trace: ${message}`);
    return null;
  }
  return parsed;
}
2138
/**
 * Finds the first stored trace (in directory listing order) whose id starts with `prefix`.
 *
 * @param {string} prefix Beginning of a trace id.
 * @returns {string|null} The matching trace id, or null when there is none.
 */
function findTraceByPrefix(prefix) {
  const tracesDir = getTracesDir();
  if (!existsSync4(tracesDir)) return null;
  const match = readdirSync(tracesDir)
    .filter((f) => f.endsWith(".json"))
    .map((f) => f.replace(".json", ""))
    .find((id) => id.startsWith(prefix));
  return match ?? null;
}
2150
/**
 * Lists summaries of the most recently written traces.
 *
 * Trace file names are random UUIDs, so ordering by name (as done previously)
 * returned an arbitrary selection; files are now ordered by modification time,
 * newest first. Files that cannot be read or parsed are skipped.
 *
 * @param {number} [limit=20] Maximum number of trace files to consider.
 * @returns {Array<{id: string, scenarioTitle: string, timestamp: string,
 *   satisfactionScore: number, runCount: number, entryCount: number}>}
 */
function listTraces(limit = 20) {
  const dir = getTracesDir();
  if (!existsSync4(dir)) return [];

  const dated = [];
  for (const file of readdirSync(dir)) {
    if (!file.endsWith(".json")) continue;
    try {
      dated.push({ file, mtimeMs: statTraceFile(join3(dir, file)).mtimeMs });
    } catch {
      debug(`Skipping unreadable trace file: ${file}`);
    }
  }
  // Newest first; fall back to reverse name order for identical timestamps.
  dated.sort((a, b) => b.mtimeMs - a.mtimeMs || (a.file < b.file ? 1 : a.file > b.file ? -1 : 0));

  const results = [];
  for (const { file } of dated.slice(0, limit)) {
    try {
      const raw = readFileSync5(join3(dir, file), "utf-8");
      const stored = JSON.parse(raw);
      results.push({
        id: stored.id,
        scenarioTitle: stored.scenarioTitle,
        timestamp: stored.timestamp,
        satisfactionScore: stored.satisfactionScore,
        runCount: stored.runCount,
        entryCount: stored.entries.length
      });
    } catch {
      debug(`Skipping corrupted trace file: ${file}`);
    }
  }
  return results;
}
2173
/**
 * @param {string} traceId Trace id or id prefix.
 * @returns {string|null} The trace as pretty-printed JSON, or null if it cannot be loaded.
 */
function exportTraceAsJson(traceId) {
  const loaded = loadTrace(traceId);
  return loaded ? JSON.stringify(loaded, null, 2) : null;
}
2178
+
2179
+ // src/telemetry/anonymizer.ts
2180
+ import { createHash } from "crypto";
2181
// Patterns whose matches are replaced wholesale when stripping credentials.
var API_KEY_PATTERNS = [
  // "<name>: <value>" / "<name>=<value>" pairs for common secret-like names.
  /(?:api[_-]?key|token|secret|password|authorization|bearer)\s*[:=]\s*["']?([a-zA-Z0-9_\-/.+=]{16,})["']?/gi,
  // Prefixed token formats.
  /sk-[a-zA-Z0-9]{20,}/g,
  /ghp_[a-zA-Z0-9]{36}/g,
  /gho_[a-zA-Z0-9]{36}/g,
  /xoxb-[a-zA-Z0-9-]+/g,
  /xoxp-[a-zA-Z0-9-]+/g,
  /xoxa-[a-zA-Z0-9-]+/g,
  /glpat-[a-zA-Z0-9_-]{20}/g,
  // Bearer credentials.
  /Bearer\s+[a-zA-Z0-9_\-/.+=]{20,}/gi
];

// Matches email addresses inside free text.
var EMAIL_PATTERN = /[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}/g;

// Lower-cased field names whose string values are replaced by a hash.
var USERNAME_FIELDS = /* @__PURE__ */ new Set([
  "username", "user", "author", "assignee", "reviewer", "creator",
  "sender", "recipient", "login", "owner", "requester", "reporter",
  "committer", "name", "full_name", "display_name", "real_name"
]);

// Lower-cased field names whose string values are copied without any scrubbing.
var PASSTHROUGH_FIELDS = /* @__PURE__ */ new Set([
  "id", "type", "state", "status", "created_at", "updated_at",
  "timestamp", "number", "count", "merged", "closed", "draft",
  "locked", "private", "public", "archived"
]);
2230
/**
 * Produces a stable pseudonym for a value.
 *
 * @param {string} value Value to pseudonymize.
 * @param {string} [salt="archal"] Salt prepended (with a colon) before hashing.
 * @returns {string} "anon_" followed by the first 12 hex characters of the SHA-256 digest.
 */
function hashValue(value, salt = "archal") {
  const hasher = createHash("sha256");
  hasher.update(`${salt}:${value}`);
  const digest = hasher.digest("hex");
  return `anon_${digest.slice(0, 12)}`;
}
2234
/**
 * Replaces every match of each pattern in API_KEY_PATTERNS with "[REDACTED_KEY]".
 *
 * @param {string} text Text to scrub.
 * @returns {string} The scrubbed text.
 */
function stripApiKeys(text) {
  return API_KEY_PATTERNS.reduce(
    (scrubbed, pattern) => scrubbed.replace(pattern, "[REDACTED_KEY]"),
    text
  );
}
2241
/**
 * Replaces each email address in the text with "<hash of the address>@<original domain>".
 *
 * @param {string} text Text to scrub.
 * @returns {string} The text with pseudonymized addresses.
 */
function anonymizeEmails(text) {
  const pseudonymize = (email) => {
    const [, domain = "unknown"] = email.split("@");
    return `${hashValue(email)}@${domain}`;
  };
  return text.replace(EMAIL_PATTERN, pseudonymize);
}
2247
/**
 * @param {string} key Field name.
 * @returns {boolean} True when the lower-cased name is listed in USERNAME_FIELDS.
 */
function isUsernameField(key) {
  return USERNAME_FIELDS.has(key.toLowerCase());
}
2251
/**
 * @param {string} key Field name.
 * @returns {boolean} True when the lower-cased name is listed in PASSTHROUGH_FIELDS.
 */
function shouldPassthrough(key) {
  return PASSTHROUGH_FIELDS.has(key.toLowerCase());
}
2255
/**
 * Anonymizes a single value according to the field it belongs to.
 *
 * - null, undefined, booleans and numbers are returned unchanged.
 * - Strings in passthrough fields are returned unchanged; other strings have
 *   credentials redacted and emails pseudonymized, and strings in username
 *   fields are then replaced by a hash.
 * - Arrays are anonymized element by element; objects are anonymized recursively.
 *
 * Fix: elements of an array stored under a username field now inherit that
 * field name, so they are hashed as well. Previously every element was given
 * the synthetic key "<key>[<index>]", which never matches a username field.
 * Elements under any other key keep the synthetic key, so they are still
 * scrubbed exactly as before.
 *
 * @param {string} key Name of the field holding the value.
 * @param {*} value Value to anonymize.
 * @returns {*} The anonymized value.
 */
function anonymizeValue(key, value) {
  if (value === null || value === void 0) return value;
  if (typeof value === "boolean" || typeof value === "number") return value;
  if (typeof value === "string") {
    if (shouldPassthrough(key)) return value;
    let result = stripApiKeys(value);
    result = anonymizeEmails(result);
    if (isUsernameField(key)) {
      return hashValue(result);
    }
    return result;
  }
  if (Array.isArray(value)) {
    const inheritsUsernameKey = isUsernameField(key);
    return value.map(
      (item, idx) => anonymizeValue(inheritsUsernameKey ? key : `${key}[${idx}]`, item)
    );
  }
  if (typeof value === "object") {
    return anonymizeObject(value);
  }
  return value;
}
2275
/**
 * Returns a new plain object in which every own enumerable property of `obj`
 * has been passed through anonymizeValue.
 *
 * @param {object} obj Object to anonymize.
 * @returns {object} The anonymized copy.
 */
function anonymizeObject(obj) {
  const anonymized = {};
  Object.keys(obj).forEach((field) => {
    anonymized[field] = anonymizeValue(field, obj[field]);
  });
  return anonymized;
}
2282
/**
 * Returns a copy of a trace entry with its input, output and error anonymized.
 *
 * Fixes over the previous version:
 *  - Array outputs/details stay arrays (they were turned into index-keyed objects).
 *  - String outputs/details are scrubbed (they were copied verbatim).
 *  - Emails in the error message are pseudonymized in addition to key redaction.
 *  - A null or undefined input no longer throws.
 *
 * @param {{input: *, output: *, error?: {message: *, details?: *}|null}} entry
 * @returns {object} Anonymized copy; `error` is null when the entry has no error.
 */
function anonymizeTraceEntry(entry) {
  const scrubText = (text) => typeof text === "string" ? anonymizeEmails(stripApiKeys(text)) : text;
  // The field names "input", "output" and "details" are in neither the
  // username set nor the passthrough set, so values get the generic scrubbing.
  const anonymizedError = entry.error ? {
    ...entry.error,
    message: scrubText(entry.error.message),
    details: anonymizeValue("details", entry.error.details)
  } : null;
  return {
    ...entry,
    input: anonymizeValue("input", entry.input),
    output: anonymizeValue("output", entry.output),
    error: anonymizedError
  };
}
2294
/**
 * Anonymizes every entry of a trace.
 *
 * @param {Array<object>} entries Trace entries.
 * @returns {Array<object>} Anonymized copies, in the same order.
 */
function anonymizeTrace(entries) {
  const entryCount = String(entries.length);
  debug("Anonymizing trace", { entryCount });
  return entries.map(anonymizeTraceEntry);
}
2298
+
2299
+ // src/telemetry/uploader.ts
2300
// Endpoint named in the telemetry upload log messages.
var ARCHAL_CLOUD_ENDPOINT = "https://api.archal.dev/v1/traces";
// Maximum number of trace entries per upload batch.
var BATCH_SIZE = 50;
2302
/**
 * @returns {boolean} The `telemetry` flag of the effective configuration.
 */
function isTelemetryEnabled() {
  const { telemetry } = loadConfig();
  return telemetry;
}
2306
/**
 * Builds the metadata object attached to uploaded batches.
 *
 * Twin names are taken as the part of each trace entry's tool name before the
 * first underscore; the criteria count comes from the first run.
 *
 * @param {object} report Report whose runs carry `trace` and `evaluations`.
 * @returns {{cliVersion: string, nodeVersion: string, platform: string,
 *   twinNames: string[], criteriaCount: number}}
 */
function buildMetadata(report) {
  const prefixes = /* @__PURE__ */ new Set();
  report.runs.forEach((run) => {
    run.trace.forEach((entry) => {
      const [prefix] = entry.toolName.split("_");
      if (prefix) {
        prefixes.add(prefix);
      }
    });
  });
  const firstRun = report.runs[0];
  return {
    cliVersion: "0.1.0",
    nodeVersion: process.version,
    platform: process.platform,
    twinNames: [...prefixes],
    criteriaCount: firstRun?.evaluations.length ?? 0
  };
}
2324
/**
 * Splits entries into consecutive chunks of at most BATCH_SIZE items.
 *
 * @param {Array} entries Entries to split.
 * @returns {Array<Array>} The chunks, in order; empty when there are no entries.
 */
function batchEntries(entries) {
  const chunks = [];
  let offset = 0;
  while (offset < entries.length) {
    chunks.push(entries.slice(offset, offset + BATCH_SIZE));
    offset += BATCH_SIZE;
  }
  return chunks;
}
2331
/**
 * Anonymizes a report's trace entries and sends them in batches.
 *
 * Does nothing (and reports `success: false`) when telemetry is disabled.
 * Stops at, and returns, the first unsuccessful batch result.
 *
 * @param {string} traceId Id under which the trace was recorded.
 * @param {object} report Report to upload.
 * @returns {Promise<{success: boolean, message: string, traceId?: string}>}
 */
async function uploadTrace(traceId, report) {
  if (!isTelemetryEnabled()) {
    debug("Telemetry is disabled, skipping upload");
    return {
      success: false,
      message: "Telemetry is disabled. Enable with: archal config set telemetry true"
    };
  }

  const rawEntries = [];
  for (const { trace } of report.runs) {
    for (const entry of trace) {
      rawEntries.push(entry);
    }
  }
  const anonymizedEntries = anonymizeTrace(rawEntries);
  const metadata = buildMetadata(report);
  const batches = batchEntries(anonymizedEntries);
  const batchTotal = batches.length;
  info(`Preparing to upload trace ${traceId}`, {
    entries: String(anonymizedEntries.length),
    batches: String(batchTotal)
  });

  for (const [index, batch] of batches.entries()) {
    const batchNumber = index + 1;
    // Batch ids get a suffix only when the trace is split.
    const suffix = batchTotal > 1 ? `-batch-${batchNumber}` : "";
    const payload = {
      traceId: `${traceId}${suffix}`,
      scenarioTitle: report.scenarioTitle,
      satisfactionScore: report.satisfactionScore,
      runCount: report.runs.length,
      timestamp: report.timestamp,
      entries: batch,
      metadata
    };
    const outcome = await sendBatch(payload, batchNumber, batchTotal);
    if (!outcome.success) {
      return outcome;
    }
  }

  return {
    success: true,
    message: `Trace ${traceId} uploaded successfully (${anonymizedEntries.length} entries in ${batchTotal} batch(es))`,
    traceId
  };
}
2372
/**
 * Stub uploader: logs what would be sent for one batch and reports success.
 * No network request is made.
 *
 * @param {object} payload Batch payload (traceId, scenarioTitle, entries, satisfactionScore, ...).
 * @param {number} batchNum 1-based number of this batch.
 * @param {number} totalBatches Total number of batches.
 * @returns {Promise<{success: boolean, message: string, traceId: string}>}
 */
async function sendBatch(payload, batchNum, totalBatches) {
  const position = `${batchNum}/${totalBatches}`;
  const entryCount = String(payload.entries.length);
  debug(`Uploading batch ${position}`, {
    entries: entryCount,
    endpoint: ARCHAL_CLOUD_ENDPOINT
  });
  info(`[Telemetry stub] Would send batch ${position} to ${ARCHAL_CLOUD_ENDPOINT}`, {
    traceId: payload.traceId,
    scenario: payload.scenarioTitle,
    entries: entryCount,
    satisfaction: payload.satisfactionScore.toFixed(1) + "%"
  });
  const serializedLength = JSON.stringify(payload).length;
  info(`[Telemetry stub] Payload size: ${serializedLength} bytes`);
  return {
    success: true,
    message: `Batch ${position} sent`,
    traceId: payload.traceId
  };
}
2390
/**
 * Uploads the trace when telemetry is enabled. Never throws: failures are
 * reported as warnings.
 *
 * @param {string} traceId Id under which the trace was recorded.
 * @param {object} report Report to upload.
 * @returns {Promise<void>}
 */
async function uploadIfEnabled(traceId, report) {
  if (!isTelemetryEnabled()) return;
  let outcome;
  try {
    outcome = await uploadTrace(traceId, report);
  } catch (err) {
    const message = err instanceof Error ? err.message : String(err);
    warn(`Telemetry upload failed: ${message}`);
    return;
  }
  if (outcome.success) {
    info("Telemetry uploaded", { traceId });
    return;
  }
  warn(`Telemetry upload skipped: ${outcome.message}`);
}
2406
+
2407
+ // src/runner/orchestrator.ts
2408
/**
 * Compares two state snapshots key by key.
 *
 * - `added`: keys present only in `after`, with the new value (wrapped in an array if needed).
 * - `removed`: keys present only in `before`; for array values the `id` of each
 *   item (0 when it has none), otherwise `[0]`.
 * - `modified`: keys whose JSON serialization changed, with the new value
 *   (wrapped in an array if needed).
 *
 * @param {object} before Snapshot taken before the run.
 * @param {object} after Snapshot taken after the run.
 * @returns {{added: object, modified: object, removed: object}}
 */
function computeStateDiff(before, after) {
  const asList = (value) => (Array.isArray(value) ? value : [value]);
  const added = {};
  const modified = {};
  const removed = {};
  const keys = /* @__PURE__ */ new Set(Object.keys(before).concat(Object.keys(after)));
  keys.forEach((key) => {
    const previous = before[key];
    const current = after[key];
    const existedBefore = previous !== void 0;
    const existsAfter = current !== void 0;
    if (!existedBefore && existsAfter) {
      added[key] = asList(current);
      return;
    }
    if (existedBefore && !existsAfter) {
      removed[key] = Array.isArray(previous) ? previous.map((item) => item.id ?? 0) : [0];
      return;
    }
    if (JSON.stringify(previous) !== JSON.stringify(current)) {
      modified[key] = asList(current);
    }
  });
  return { added, modified, removed };
}
2424
/**
 * Executes one run of a scenario: captures seed state, writes the MCP config,
 * runs the agent, collects state and trace, then evaluates the criteria.
 *
 * Never throws. A timeout or any error yields a result in which every
 * criterion is marked "fail" and `error` describes the cause.
 *
 * Fix: temporary files are now cleaned up even when the agent, state
 * collection or diffing throws. Previously cleanup only ran on the success
 * path, so a failing run left its temp files behind.
 *
 * @param {number} runIndex 0-based run index.
 * @param {object} scenario Parsed scenario (title, successCriteria, expectedBehavior).
 * @param {object} agentConfig Agent command configuration passed to executeAgent.
 * @param {Array<{twinName: string, seedName: string}>} seedSelections
 * @param {{apiKey: string, model: string}} evaluatorConfig
 * @param {number} timeoutSeconds Agent timeout in seconds.
 * @param {number} [rateLimit] Forwarded to each twin as `rateLimitMax`.
 * @returns {Promise<{runIndex: number, evaluations: Array, overallScore: number,
 *   trace: Array, durationMs: number, error?: string}>}
 */
async function executeSingleRun(runIndex, scenario, agentConfig, seedSelections, evaluatorConfig, timeoutSeconds, rateLimit) {
  const startTime = Date.now();
  const runId = `archal-run-${runIndex}-${Date.now()}-${Math.random().toString(36).slice(2, 8)}`;
  info(`Starting run ${runIndex + 1}`, { scenario: scenario.title });
  const twinConfigs = seedSelections.map((sel) => ({
    twinName: sel.twinName,
    seedName: sel.seedName,
    rateLimitMax: rateLimit
  }));
  // Set once the temp files exist; cleared right before it is invoked so that
  // cleanup runs at most once.
  let pendingCleanup = null;
  try {
    progress("Capturing seed state...");
    const { beforeState, twinPaths: seedPaths } = await captureSeedState(twinConfigs);
    const { configPath: mcpConfigPath, twinPaths } = writeMcpConfig(twinConfigs, runId);
    pendingCleanup = () => cleanupTempFiles(mcpConfigPath, twinPaths, seedPaths);
    const mcpConfigData = JSON.parse(readFileSync6(mcpConfigPath, "utf-8"));
    const mcpServersJson = JSON.stringify(mcpConfigData.mcpServers);
    const twinNames = twinConfigs.map((c) => c.twinName);
    const agentResult = await executeAgent(
      agentConfig,
      mcpConfigPath,
      mcpServersJson,
      twinNames,
      timeoutSeconds * 1e3
    );
    const stateAfter = collectStateFromFiles(twinPaths);
    const trace = collectTraceFromFiles(twinPaths);
    const diff = computeStateDiff(beforeState, stateAfter);
    // Everything needed from disk has been read; release the temp files
    // before the (potentially slow) evaluation.
    const cleanupNow = pendingCleanup;
    pendingCleanup = null;
    cleanupNow();
    if (agentResult.timedOut) {
      const durationMs2 = Date.now() - startTime;
      return {
        runIndex,
        evaluations: scenario.successCriteria.map((c) => ({
          criterionId: c.id,
          status: "fail",
          confidence: 1,
          explanation: `Agent timed out after ${timeoutSeconds}s`
        })),
        overallScore: 0,
        trace,
        durationMs: durationMs2,
        error: `Agent timed out after ${timeoutSeconds}s`
      };
    }
    if (agentResult.exitCode !== 0 && agentResult.exitCode !== null) {
      warn(`Agent exited with non-zero code ${agentResult.exitCode} on run ${runIndex + 1}`);
    }
    progress(`Evaluating run ${runIndex + 1}...`);
    const evaluationResult = await evaluateRun(
      scenario.successCriteria,
      {
        expectedBehavior: scenario.expectedBehavior,
        stateBefore: beforeState,
        stateAfter,
        stateDiff: diff,
        trace
      },
      evaluatorConfig
    );
    const durationMs = Date.now() - startTime;
    return {
      runIndex,
      evaluations: evaluationResult.evaluations,
      overallScore: evaluationResult.overallScore,
      trace,
      durationMs
    };
  } catch (err) {
    const message = err instanceof Error ? err.message : String(err);
    error(`Run ${runIndex + 1} failed: ${message}`);
    const durationMs = Date.now() - startTime;
    return {
      runIndex,
      evaluations: scenario.successCriteria.map((c) => ({
        criterionId: c.id,
        status: "fail",
        confidence: 1,
        explanation: `Run failed: ${message}`
      })),
      overallScore: 0,
      trace: [],
      durationMs,
      error: message
    };
  } finally {
    // Only reached with a pending cleanup when the run failed before the
    // regular cleanup point.
    if (pendingCleanup) {
      try {
        pendingCleanup();
      } catch (cleanupErr) {
        const cleanupMessage = cleanupErr instanceof Error ? cleanupErr.message : String(cleanupErr);
        warn(`Failed to clean up temp files for run ${runIndex + 1}: ${cleanupMessage}`);
      }
    }
  }
}
2509
/**
 * Runs a scenario end to end: parses it, resolves seeds and the agent command,
 * executes the runs sequentially, aggregates the scores, records the trace,
 * uploads telemetry when enabled and prints the report.
 *
 * Explicit options take precedence over the loaded configuration. The project
 * config file ".archal.json" is looked up next to the scenario first, then in
 * the current working directory.
 *
 * @param {{scenarioPath: string, runs?: number, timeout?: number, model?: string,
 *   output?: string, seed?: string, agent?: string, agentConfig?: object,
 *   rateLimit?: number}} options
 * @returns {Promise<object>} The report that was printed.
 */
async function runScenario(options) {
  const config = loadConfig();
  const numRuns = options.runs ?? config.runs;
  const timeoutSeconds = options.timeout ?? config.timeout;
  const model = options.model ?? config.model;
  const outputFormat = options.output ?? "terminal";

  banner(`Archal: ${options.scenarioPath}`);
  const scenario = parseScenarioFile(options.scenarioPath);
  info(`Scenario: ${scenario.title}`, {
    criteria: String(scenario.successCriteria.length),
    twins: scenario.config.twins.join(", "),
    runs: String(numRuns),
    timeout: `${timeoutSeconds}s`
  });

  // Seeds: scenario-derived selections, optionally overridden for every twin.
  let seedSelections = generateSeedSelections(scenario.config.twins, scenario.setup);
  if (options.seed) {
    const overrides = {};
    scenario.config.twins.forEach((twin) => {
      overrides[twin] = options.seed;
    });
    seedSelections = overrideSeedSelection(seedSelections, overrides);
  }

  // Project config: first existing candidate wins.
  const scenarioDir = dirname2(resolve3(options.scenarioPath));
  const projectConfigPath = [scenarioDir, process.cwd()]
    .map((dir) => resolve3(dir, ".archal.json"))
    .find((candidate) => existsSync5(candidate));

  // Agent: explicit config, then resolved config, then the environment/echo fallback.
  const fallbackAgentConfig = () => {
    const envCommand = process.env["ARCHAL_AGENT_COMMAND"];
    return {
      command: envCommand ?? "echo",
      args: envCommand ? [] : ["No agent command configured"]
    };
  };
  const agentConfig = options.agentConfig ?? resolveAgentConfig(options.agent, projectConfigPath) ?? fallbackAgentConfig();
  if (agentConfig.command === "echo") {
    process.stderr.write("Warning: No agent command configured. Use --agent flag, set ARCHAL_AGENT_COMMAND, or create .archal.json\n");
  }

  printHeader(scenario.title, seedSelections);
  const evaluatorConfig = { apiKey: config.apiKey, model };

  const runs = [];
  let runIndex = 0;
  while (runIndex < numRuns) {
    const result = await executeSingleRun(
      runIndex,
      scenario,
      agentConfig,
      seedSelections,
      evaluatorConfig,
      timeoutSeconds,
      options.rateLimit
    );
    runs.push(result);
    printRunProgress(runIndex, numRuns, result.overallScore, result.error);
    runIndex += 1;
  }

  const satisfactionScore = aggregateSatisfaction(runs.map((r) => r.overallScore));
  const summary = generateSummary(runs.map((r) => r.evaluations), satisfactionScore);

  const criterionDescriptions = {};
  const criterionTypes = {};
  scenario.successCriteria.forEach((criterion) => {
    criterionDescriptions[criterion.id] = criterion.description;
    criterionTypes[criterion.id] = criterion.type;
  });

  const report = {
    scenarioTitle: scenario.title,
    satisfactionScore,
    criterionDescriptions,
    criterionTypes,
    twinNames: scenario.config.twins,
    runs,
    summary,
    timestamp: (/* @__PURE__ */ new Date()).toISOString()
  };
  const traceId = recordTrace(report);
  info("Trace recorded", { traceId });
  await uploadIfEnabled(traceId, report);
  printReport(report, outputFormat);
  return report;
}
2592
+
2593
+ // src/commands/run.ts
2594
/**
 * Builds the `run` sub-command, which executes a scenario against digital twins.
 *
 * The process exits with code 1 on invalid arguments, on any error, and when
 * the satisfaction score is below 100.
 *
 * Fixes over the previous version:
 *  - `--runs` and `--timeout` no longer carry commander defaults ("5"/"120").
 *    Those defaults were always passed on, so the configured defaults and the
 *    ARCHAL_RUNS / ARCHAL_TIMEOUT environment variables could never take
 *    effect. When a flag is omitted, `undefined` is passed and runScenario
 *    falls back to the loaded configuration.
 *  - `--rate-limit` is validated instead of passing NaN through.
 *
 * @returns {Command} The configured commander command.
 */
function createRunCommand() {
  const fail = (message) => {
    process.stderr.write(`Error: ${message}\n`);
    process.exit(1);
  };
  // Parses an optional integer flag; returns undefined when the flag is absent.
  const parseIntFlag = (raw, flag, minimum, requirement) => {
    if (raw === void 0) return void 0;
    const parsed = parseInt(raw, 10);
    if (Number.isNaN(parsed) || parsed < minimum) {
      fail(`${flag} must be ${requirement}`);
    }
    return parsed;
  };

  const cmd = new Command("run")
    .description("Execute a scenario against digital twins")
    .argument("<scenario>", "Path to scenario markdown file")
    .option("-n, --runs <count>", "Number of runs (default: configured value)")
    .option("-t, --timeout <seconds>", "Timeout per run in seconds (default: configured value)")
    .option("-m, --model <model>", "Evaluator model for probabilistic criteria")
    .option("-o, --output <format>", "Output format: terminal, json, junit", "terminal")
    .option("-a, --agent <command>", "Agent command to execute")
    .option("--seed <name>", "Override twin seed name")
    .option("--rate-limit <count>", "Rate limit: max total requests before 429")
    .option("-q, --quiet", "Suppress non-error output")
    .option("-v, --verbose", "Enable debug logging")
    .action(async (scenarioArg, opts) => {
      if (opts.quiet) {
        configureLogger({ quiet: true });
      }
      if (opts.verbose) {
        configureLogger({ verbose: true, level: "debug" });
      }
      const scenarioPath = resolve4(scenarioArg);
      if (!existsSync6(scenarioPath)) {
        fail(`Scenario file not found: ${scenarioPath}`);
      }
      if (!scenarioPath.endsWith(".md")) {
        fail(`Scenario file must be a markdown file (.md): ${scenarioPath}`);
      }
      const runs = parseIntFlag(opts.runs, "--runs", 1, "a positive integer");
      const timeout = parseIntFlag(opts.timeout, "--timeout", 1, "a positive integer");
      const rateLimit = parseIntFlag(opts.rateLimit, "--rate-limit", 0, "a non-negative integer");
      const validFormats = ["terminal", "json", "junit"];
      const outputFormat = opts.output;
      if (!validFormats.includes(outputFormat)) {
        fail(`--output must be one of: ${validFormats.join(", ")}`);
      }
      if (outputFormat === "json") {
        configureLogger({ json: true });
      }
      try {
        const report = await runScenario({
          scenarioPath,
          agent: opts.agent,
          runs,
          timeout,
          model: opts.model,
          output: outputFormat,
          seed: opts.seed,
          rateLimit
        });
        if (report.satisfactionScore < 100) {
          process.exit(1);
        }
      } catch (err) {
        const message = err instanceof Error ? err.message : String(err);
        fail(message);
      }
    });
  return cmd;
}
2659
+
2660
+ // src/commands/init.ts
2661
+ import { Command as Command2 } from "commander";
2662
+ import { existsSync as existsSync7, mkdirSync as mkdirSync3, writeFileSync as writeFileSync4 } from "fs";
2663
+ import { join as join4, resolve as resolve5 } from "path";
2664
// Markdown content of the sample scenario: title, setup, expected behavior,
// success criteria ([D] deterministic, [P] probabilistic) and config.
var SAMPLE_SCENARIO = `# Close Stale Issues

## Setup

A GitHub repository has stale issues in its backlog that need cleanup. Some issues are labeled "stale" and should be closed. Issues labeled "keep-open" must not be closed.

## Expected Behavior

The agent should list open issues, identify stale ones, close them with a comment, and skip any issue marked "keep-open".

## Success Criteria

- [D] At least 1 issue is closed
- [D] No issues labeled "keep-open" are closed
- [P] Comments on closed issues explain why they were closed

## Config

twins: github
timeout: 60
runs: 3
`;
2686
// JSON content of the sample project config: agent command plus run count and timeout.
var SAMPLE_CONFIG = `{
"agent": {
"command": "npx",
"args": ["tsx", "agent.ts"]
},
"runs": 3,
"timeout": 60
}
`;
2695
+ var SAMPLE_AGENT = `/**
2696
+ * Starter agent \u2014 closes stale GitHub issues via MCP.
2697
+ *
2698
+ * Archal sets ARCHAL_MCP_CONFIG pointing to a JSON file with MCP server config.
2699
+ * This agent connects to the GitHub twin, discovers the repo dynamically,
2700
+ * lists open issues, and closes stale ones.
2701
+ */
2702
+
2703
+ import { readFileSync } from 'node:fs';
2704
+ import { Client } from '@modelcontextprotocol/sdk/client/index.js';
2705
+ import { StdioClientTransport } from '@modelcontextprotocol/sdk/client/stdio.js';
2706
+
2707
+ interface McpServerConfig {
2708
+ command: string;
2709
+ args: string[];
2710
+ }
2711
+
2712
+ interface McpConfig {
2713
+ mcpServers: Record<string, McpServerConfig>;
2714
+ }
2715
+
2716
+ interface Issue {
2717
+ number: number;
2718
+ title: string;
2719
+ state: string;
2720
+ labels: Array<{ name: string }>;
2721
+ }
2722
+
2723
+ function parseToolResult(result: unknown): unknown {
2724
+ const content = (result as { content: unknown }).content;
2725
+ const text = typeof content === 'string'
2726
+ ? content
2727
+ : (content as Array<{ text?: string }>)[0]?.text ?? '[]';
2728
+ return JSON.parse(text);
2729
+ }
2730
+
2731
+ async function main(): Promise<void> {
2732
+ // 1. Read MCP config (Archal provides this via environment variable)
2733
+ const configPath = process.env['ARCHAL_MCP_CONFIG'];
2734
+ if (!configPath) {
2735
+ console.error('ARCHAL_MCP_CONFIG not set \u2014 are you running via archal run?');
2736
+ process.exit(1);
2737
+ }
2738
+
2739
+ const config: McpConfig = JSON.parse(readFileSync(configPath, 'utf-8'));
2740
+ const serverName = Object.keys(config.mcpServers)[0];
2741
+ if (!serverName) {
2742
+ console.error('No MCP servers in config');
2743
+ process.exit(1);
2744
+ }
2745
+
2746
+ const serverConfig = config.mcpServers[serverName]!;
2747
+
2748
+ // 2. Connect to the twin via MCP stdio transport
2749
+ const transport = new StdioClientTransport({
2750
+ command: serverConfig.command,
2751
+ args: serverConfig.args,
2752
+ });
2753
+
2754
+ const client = new Client({ name: 'my-agent', version: '1.0.0' });
2755
+ await client.connect(transport);
2756
+
2757
+ try {
2758
+ // 3. List available tools (useful for debugging)
2759
+ const { tools } = await client.listTools();
2760
+ console.error(\`Connected to \${serverName}: \${tools.length} tools available\`);
2761
+
2762
+ // 4. Discover the repository dynamically
2763
+ const repoResult = await client.callTool({
2764
+ name: 'search_repositories',
2765
+ arguments: { query: ' ' },
2766
+ });
2767
+ const repos = parseToolResult(repoResult) as { items: Array<{ full_name: string }> };
2768
+ const firstRepo = repos.items[0];
2769
+ if (!firstRepo) {
2770
+ console.error('No repositories found');
2771
+ process.exit(1);
2772
+ }
2773
+ const [owner, repo] = firstRepo.full_name.split('/');
2774
+ console.error(\`Found repo: \${owner}/\${repo}\`);
2775
+
2776
+ // 5. List all open issues
2777
+ const listResult = await client.callTool({
2778
+ name: 'list_issues',
2779
+ arguments: { owner, repo, state: 'open' },
2780
+ });
2781
+ const issues = parseToolResult(listResult) as Issue[];
2782
+
2783
+ // 6. Close stale issues (skip keep-open)
2784
+ for (const issue of issues) {
2785
+ const labelNames = issue.labels.map((l) => l.name);
2786
+
2787
+ if (!labelNames.includes('stale')) continue;
2788
+ if (labelNames.includes('keep-open')) {
2789
+ console.error(\`Skipping #\${issue.number} \u2014 labeled keep-open\`);
2790
+ continue;
2791
+ }
2792
+
2793
+ // Post a comment explaining closure
2794
+ await client.callTool({
2795
+ name: 'add_issue_comment',
2796
+ arguments: {
2797
+ owner,
2798
+ repo,
2799
+ issue_number: issue.number,
2800
+ body: 'Closing as stale. Reopen if still relevant.',
2801
+ },
2802
+ });
2803
+
2804
+ // Close the issue
2805
+ await client.callTool({
2806
+ name: 'update_issue',
2807
+ arguments: {
2808
+ owner,
2809
+ repo,
2810
+ issue_number: issue.number,
2811
+ state: 'closed',
2812
+ },
2813
+ });
2814
+
2815
+ console.error(\`Closed #\${issue.number} "\${issue.title}"\`);
2816
+ }
2817
+ } finally {
2818
+ await client.close();
2819
+ }
2820
+ }
2821
+
2822
+ main().catch((err) => {
2823
+ console.error(err);
2824
+ process.exit(1);
2825
+ });
2826
+ `;
2827
+ var SAMPLE_PACKAGE_JSON = `{
2828
+ "type": "module",
2829
+ "dependencies": {
2830
+ "@modelcontextprotocol/sdk": "^1.4.0"
2831
+ },
2832
+ "devDependencies": {
2833
+ "tsx": "^4.19.0"
2834
+ }
2835
+ }
2836
+ `;
2837
/**
 * Write `content` to `filePath` unless something already exists at that path.
 * Either outcome is reported through the logger's `info` channel.
 *
 * @param {string} filePath - Destination path.
 * @param {string} content - Text to write when the path is free.
 */
function writeIfMissing(filePath, content) {
  const alreadyPresent = existsSync7(filePath);
  if (alreadyPresent) {
    info(`Skipped ${filePath} (already exists)`);
    return;
  }
  writeFileSync4(filePath, content);
  info(`Created ${filePath}`);
}
2845
/**
 * Build the `archal init [directory]` command.
 *
 * The action creates the target directory when it is missing, writes the four
 * starter files (existing files are left alone by `writeIfMissing`), and prints
 * the follow-up steps to stderr.
 *
 * @returns {Command2} The configured commander command.
 */
function createInitCommand() {
  const scaffold = (directory) => {
    const targetDir = resolve5(directory);
    if (!existsSync7(targetDir)) {
      mkdirSync3(targetDir, { recursive: true });
    } else {
      warn(`Directory already exists: ${targetDir}`);
      warn("Skipping files that already exist.");
    }

    // File name -> starter content, written in this order.
    const starterFiles = [
      ["scenario.md", SAMPLE_SCENARIO],
      [".archal.json", SAMPLE_CONFIG],
      ["agent.ts", SAMPLE_AGENT],
      ["package.json", SAMPLE_PACKAGE_JSON]
    ];
    for (const [fileName, contents] of starterFiles) {
      writeIfMissing(join4(targetDir, fileName), contents);
    }

    success("Archal initialized. Next steps:");
    const nextSteps = [
      `\n1. cd ${directory} && npm install\n`,
      " 2. Edit scenario.md and agent.ts to fit your use case\n",
      " 3. Run: archal run scenario.md\n\n"
    ];
    for (const step of nextSteps) {
      process.stderr.write(step);
    }
  };

  return new Command2("init")
    .description("Initialize an Archal test directory with sample scenario and agent")
    .argument("[directory]", "Directory to initialize", "archal")
    .action(scaffold);
}
2870
+
2871
+ // src/commands/twin.ts
2872
+ import { Command as Command3 } from "commander";
2873
// Twins started by this CLI process, keyed by twin name. The registry is an
// in-memory Map, so `stop` and `status` only see twins that were started by
// the same process.
var runningTwins = /* @__PURE__ */ new Map();
// Twins the CLI knows how to launch, with the npm package behind each one.
var KNOWN_TWINS = [
  { name: "github", package: "@archal/twin-github", description: "GitHub digital twin" },
  { name: "slack", package: "@archal/twin-slack", description: "Slack digital twin" }
];
/**
 * Build the `archal twin` command group (`start`, `stop`, `list`, `status`).
 *
 * `start` validates the twin name, the transport and (for HTTP) the port
 * before spawning `npx <package> ...`, so a typo is reported by the CLI
 * instead of being forwarded to the child process.
 *
 * @returns {Command3} The configured commander command.
 */
function createTwinCommand() {
  // Matches the values advertised in the --transport option description.
  const supportedTransports = ["stdio", "http"];
  const cmd = new Command3("twin").description("Manage digital twin processes");
  cmd.command("start").description("Start a digital twin process").argument("<name>", "Twin name (e.g., github, slack)").option("--seed <seed>", "Seed name to load", "small-project").option("--transport <type>", "Transport type: stdio or http", "stdio").option("--port <port>", "Port for HTTP transport").action((name, opts) => {
    const knownTwin = KNOWN_TWINS.find((t) => t.name === name);
    if (!knownTwin) {
      const available = KNOWN_TWINS.map((t) => t.name).join(", ");
      error(`Unknown twin: "${name}". Available twins: ${available}`);
      process.exit(1);
    }
    if (!supportedTransports.includes(opts.transport)) {
      error(`Unknown transport: "${opts.transport}". Supported transports: ${supportedTransports.join(", ")}`);
      process.exit(1);
    }
    if (runningTwins.has(name)) {
      warn(`Twin "${name}" is already running (PID: ${runningTwins.get(name)?.pid ?? "unknown"})`);
      return;
    }
    const args = [knownTwin.package, "--seed", opts.seed, "--transport", opts.transport];
    // The port is only meaningful (and only forwarded) for the HTTP transport.
    if (opts.transport === "http" && opts.port) {
      const port = /^\d+$/.test(opts.port) ? Number(opts.port) : NaN;
      if (!Number.isInteger(port) || port > 65535) {
        error("--port must be an integer between 0 and 65535");
        process.exit(1);
      }
      args.push("--port", opts.port);
    }
    info(`Starting twin: ${name}`, { seed: opts.seed, transport: opts.transport });
    const child = spawnMcpStdioProcess({
      command: "npx",
      args
    });
    // A missing pid is recorded as 0.
    const pid = child.pid ?? 0;
    runningTwins.set(name, {
      name,
      pid,
      startedAt: (/* @__PURE__ */ new Date()).toISOString(),
      process: child
    });
    // Drop the registry entry as soon as the child goes away.
    child.on("exit", (code) => {
      info(`Twin "${name}" exited`, { code: String(code ?? "unknown") });
      runningTwins.delete(name);
    });
    success(`Twin "${name}" started (PID: ${pid})`);
  });
  cmd.command("stop").description("Stop a running digital twin").argument("<name>", "Twin name to stop").action(async (name) => {
    const twin = runningTwins.get(name);
    if (!twin) {
      error(`Twin "${name}" is not running`);
      const running = Array.from(runningTwins.keys());
      if (running.length > 0) {
        info(`Running twins: ${running.join(", ")}`);
      }
      process.exit(1);
    }
    info(`Stopping twin: ${name}`, { pid: String(twin.pid) });
    await killProcess(twin.process);
    runningTwins.delete(name);
    success(`Twin "${name}" stopped`);
  });
  cmd.command("list").description("List available digital twins").action(() => {
    const headers = ["Name", "Package", "Description", "Seeds"];
    const rows = KNOWN_TWINS.map((twin) => {
      const seeds = getAvailableSeeds(twin.name);
      return [
        twin.name,
        twin.package,
        twin.description,
        seeds.length > 0 ? seeds.join(", ") : "(default)"
      ];
    });
    table(headers, rows);
  });
  cmd.command("status").description("Show status of running digital twins").action(() => {
    if (runningTwins.size === 0) {
      info("No twins currently running");
      return;
    }
    const headers = ["Name", "PID", "Started", "Status"];
    const rows = [];
    for (const twin of runningTwins.values()) {
      // exitCode stays null while the child process is still running.
      const isAlive = twin.process.exitCode === null;
      rows.push([
        twin.name,
        String(twin.pid),
        twin.startedAt,
        isAlive ? "running" : `exited (${twin.process.exitCode})`
      ]);
    }
    table(headers, rows);
  });
  return cmd;
}
2961
+
2962
+ // src/commands/scenario.ts
2963
+ import { Command as Command4 } from "commander";
2964
+ import { existsSync as existsSync8, readdirSync as readdirSync2, writeFileSync as writeFileSync5, mkdirSync as mkdirSync4 } from "fs";
2965
+ import { resolve as resolve6, join as join5, extname } from "path";
2966
+ var SCENARIO_TEMPLATE = `# {{NAME}}
2967
+
2968
+ ## Setup
2969
+
2970
+ Describe the initial state of the digital twins here.
2971
+ What should exist before the agent starts?
2972
+
2973
+ ## Expected Behavior
2974
+
2975
+ Describe what the agent should do.
2976
+ What actions should it take? What workflow should it follow?
2977
+
2978
+ ## Success Criteria
2979
+
2980
+ - [D] Exactly N items should be created
2981
+ - [P] The agent should handle errors gracefully
2982
+ - [P] Output should be clear and well-structured
2983
+
2984
+ ## Config
2985
+
2986
+ twins: github
2987
+ timeout: 120
2988
+ runs: 5
2989
+ `;
2990
/**
 * Recursively collect the paths of all `.md` files below `dir`.
 *
 * @param {string} dir - Directory to walk.
 * @returns {string[]} Paths (each built by joining onto `dir`) in directory
 *   listing order; empty when `dir` does not exist.
 */
function findScenarioFiles(dir) {
  if (!existsSync8(dir)) {
    return [];
  }
  return readdirSync2(dir, { withFileTypes: true }).flatMap((entry) => {
    const entryPath = join5(dir, entry.name);
    if (entry.isDirectory()) {
      return findScenarioFiles(entryPath);
    }
    const isMarkdownFile = entry.isFile() && extname(entry.name) === ".md";
    return isMarkdownFile ? [entryPath] : [];
  });
}
3004
/**
 * Pick the scenarios directory for the current working directory.
 *
 * Checks `scenarios`, `test/scenarios` and `.archal/scenarios` in that order
 * and returns the first that exists; falls back to `scenarios` (which may not
 * exist yet) when none do.
 *
 * @returns {string} Absolute path of the chosen directory.
 */
function findScenariosDir() {
  const fallback = resolve6("scenarios");
  const searchOrder = [
    fallback,
    resolve6("test", "scenarios"),
    resolve6(".archal", "scenarios")
  ];
  const firstExisting = searchOrder.find((candidate) => existsSync8(candidate));
  return firstExisting ?? fallback;
}
3017
/**
 * Build the `archal scenario` command group (`list`, `validate`, `create`).
 *
 * - `list` parses every `.md` file under the scenarios directory and prints a
 *   table; files that fail to parse get a "(parse error)" row instead of
 *   aborting the listing.
 * - `validate` prints a summary of one scenario and exits with status 1 when
 *   it has validation errors or cannot be parsed.
 * - `create` scaffolds a new scenario file from SCENARIO_TEMPLATE.
 *
 * @returns {Command4} The configured commander command.
 */
function createScenarioCommand() {
  // Strip the current working directory prefix (either separator style) so
  // table rows show a short path. Paths outside the cwd are left unchanged.
  const toDisplayPath = (file) => file.replace(resolve6(".") + "\\", "").replace(resolve6(".") + "/", "");
  // First 80 characters of a section, with an ellipsis when truncated.
  const preview = (text) => `${text.slice(0, 80)}${text.length > 80 ? "..." : ""}`;

  const cmd = new Command4("scenario").description("Manage test scenarios");
  cmd.command("list").description("List available scenarios").option("-d, --dir <directory>", "Scenario directory to search").action((opts) => {
    const scenariosDir = opts.dir ? resolve6(opts.dir) : findScenariosDir();
    if (!existsSync8(scenariosDir)) {
      warn(`Scenarios directory not found: ${scenariosDir}`);
      info("Create a scenarios directory or use --dir to specify one");
      return;
    }
    const files = findScenarioFiles(scenariosDir);
    if (files.length === 0) {
      info("No scenario files found");
      info(`Create one with: archal scenario create my-scenario`);
      return;
    }
    const headers = ["Scenario", "Path", "Criteria", "Twins"];
    const rows = [];
    for (const file of files) {
      const relativePath = toDisplayPath(file);
      try {
        const scenario = parseScenarioFile(file);
        rows.push([
          scenario.title,
          relativePath,
          String(scenario.successCriteria.length),
          scenario.config.twins.join(", ") || "(auto)"
        ]);
      } catch (err) {
        // Keep listing the remaining files; show the parse failure in the row.
        const message = err instanceof Error ? err.message : String(err);
        rows.push([
          `(parse error)`,
          relativePath,
          "-",
          message
        ]);
      }
    }
    table(headers, rows);
    info(`
Found ${files.length} scenario(s) in ${scenariosDir}`);
  });
  cmd.command("validate").description("Parse and validate a scenario file").argument("<file>", "Path to scenario markdown file").action((file) => {
    const filePath = resolve6(file);
    if (!existsSync8(filePath)) {
      error(`File not found: ${filePath}`);
      process.exit(1);
    }
    try {
      const scenario = parseScenarioFile(filePath);
      const errors = validateScenario(scenario);
      info(`Scenario: ${scenario.title}`);
      info(`Setup: ${preview(scenario.setup)}`);
      info(`Expected Behavior: ${preview(scenario.expectedBehavior)}`);
      info(`Twins: ${scenario.config.twins.join(", ") || "(none detected)"}`);
      info(`Timeout: ${scenario.config.timeout}s`);
      info(`Runs: ${scenario.config.runs}`);
      process.stdout.write("\n");
      info("Success Criteria:");
      for (const criterion of scenario.successCriteria) {
        const tag = criterion.type === "deterministic" ? "[D]" : "[P]";
        info(` ${tag} ${criterion.description}`);
      }
      process.stdout.write("\n");
      if (errors.length === 0) {
        success("Scenario is valid");
      } else {
        fail(`Scenario has ${errors.length} validation error(s):`);
        for (const err of errors) {
          error(` - ${err}`);
        }
        process.exit(1);
      }
    } catch (err) {
      const message = err instanceof Error ? err.message : String(err);
      error(`Failed to parse scenario: ${message}`);
      process.exit(1);
    }
  });
  cmd.command("create").description("Scaffold a new scenario file").argument("<name>", "Scenario name (will be used as filename)").option("-d, --dir <directory>", "Directory to create scenario in").option("--twin <twin>", "Twin to configure (github, slack, etc.)", "github").action((name, opts) => {
    // Lower-case, spaces to dashes, then drop everything outside [a-z0-9-].
    const slug = name.toLowerCase().replace(/\s+/g, "-").replace(/[^a-z0-9-]/g, "");
    // A name with no letters or digits would otherwise produce ".md" or
    // "---.md"; reject it before touching the filesystem.
    if (!/[a-z0-9]/.test(slug)) {
      error(`Invalid scenario name: "${name}". Use at least one letter or digit.`);
      process.exit(1);
    }
    const scenariosDir = opts.dir ? resolve6(opts.dir) : findScenariosDir();
    if (!existsSync8(scenariosDir)) {
      mkdirSync4(scenariosDir, { recursive: true });
      info(`Created scenarios directory: ${scenariosDir}`);
    }
    const fileName = slug + ".md";
    const filePath = join5(scenariosDir, fileName);
    if (existsSync8(filePath)) {
      error(`Scenario file already exists: ${filePath}`);
      process.exit(1);
    }
    const displayName = name.replace(/-/g, " ").replace(/\b\w/g, (c) => c.toUpperCase());
    // Function replacers insert the text literally. A plain string replacement
    // would interpret "$&", "$1", "$$" etc. coming from user input.
    const content = SCENARIO_TEMPLATE.replace("{{NAME}}", () => displayName).replace("twins: github", () => `twins: ${opts.twin}`);
    writeFileSync5(filePath, content, "utf-8");
    success(`Created scenario: ${filePath}`);
    info(`Edit the file to define your test scenario, then run:`);
    info(` archal scenario validate ${filePath}`);
    info(` archal run ${filePath}`);
  });
  return cmd;
}
3118
+
3119
+ // src/commands/trace.ts
3120
+ import { Command as Command5 } from "commander";
3121
/**
 * Build the `archal trace` command group (`list`, `show`, `export`).
 *
 * - `list` prints a table of the most recent traces.
 * - `show` prints one trace: either a per-run table, or with `--run <index>`
 *   the evaluations (and optionally the entries) of a single run.
 * - `export` writes the trace JSON to stdout or to the `--output` file.
 *
 * @returns {Command5} The configured commander command.
 */
function createTraceCommand() {
  const cmd = new Command5("trace").description("Inspect and export run traces");
  cmd.command("list").description("List recent traces").option("-n, --limit <count>", "Number of traces to show", "20").action((opts) => {
    const limit = parseInt(opts.limit, 10);
    if (Number.isNaN(limit) || limit <= 0) {
      error("--limit must be a positive integer");
      process.exit(1);
    }
    const traces = listTraces(limit);
    if (traces.length === 0) {
      info("No traces found. Run a scenario first: archal run <scenario.md>");
      return;
    }
    const headers = ["ID", "Scenario", "Score", "Runs", "Entries", "Timestamp"];
    const rows = traces.map((t) => [
      t.id.slice(0, 8) + "...",
      // Titles longer than 30 characters are cut to 27 plus an ellipsis.
      t.scenarioTitle.length > 30 ? t.scenarioTitle.slice(0, 27) + "..." : t.scenarioTitle,
      t.satisfactionScore.toFixed(1) + "%",
      String(t.runCount),
      String(t.entryCount),
      formatTimestamp2(t.timestamp)
    ]);
    table(headers, rows);
    info(`
Showing ${traces.length} most recent trace(s)`);
    info('Use "archal trace show <id>" to view details');
  });
  cmd.command("show").description("Show detailed trace information").argument("<id>", "Trace ID (full or prefix)").option("--run <index>", "Show specific run (0-indexed)").option("--entries", "Show individual trace entries").action((id, opts) => {
    const trace = loadTrace(id);
    if (!trace) {
      error(`Trace not found: ${id}`);
      info('Use "archal trace list" to see available traces');
      process.exit(1);
    }
    process.stdout.write("\n");
    info(`Trace ID: ${trace.id}`);
    info(`Scenario: ${trace.scenarioTitle}`);
    info(`Timestamp: ${trace.timestamp}`);
    info(`Satisfaction: ${trace.satisfactionScore.toFixed(1)}%`);
    info(`Runs: ${trace.runCount}`);
    info(`Total entries: ${trace.entries.length}`);
    process.stdout.write("\n");
    const report = trace.report;
    if (opts.run !== void 0) {
      // A non-numeric or out-of-range index yields `undefined` here and is
      // reported by the guard below.
      const runIndex = parseInt(opts.run, 10);
      const run = report.runs[runIndex];
      if (!run) {
        error(`Run index ${runIndex} out of range (0-${report.runs.length - 1})`);
        process.exit(1);
      }
      info(`--- Run ${runIndex + 1} ---`);
      info(`Score: ${run.overallScore.toFixed(1)}%`);
      info(`Duration: ${run.durationMs}ms`);
      if (run.error) {
        error(`Error: ${run.error}`);
      }
      process.stdout.write("\n");
      info("Evaluations:");
      for (const evaluation of run.evaluations) {
        const status = evaluation.status.toUpperCase().padEnd(7);
        info(` [${status}] ${evaluation.criterionId}: ${evaluation.explanation} (${(evaluation.confidence * 100).toFixed(0)}% confidence)`);
      }
      if (opts.entries) {
        process.stdout.write("\n");
        info("Trace entries:");
        for (const entry of run.trace) {
          info(` ${entry.timestamp} ${entry.toolName} (${entry.durationMs}ms)${entry.error ? " ERROR" : ""}`);
          if (entry.error) {
            info(` Error: ${entry.error.code} - ${entry.error.message}`);
          }
        }
      }
    } else {
      info("Runs:");
      const runHeaders = ["Run", "Score", "Duration", "Evaluations", "Errors"];
      const runRows = report.runs.map((run) => [
        String(run.runIndex + 1),
        run.overallScore.toFixed(1) + "%",
        run.durationMs + "ms",
        `${run.evaluations.filter((e) => e.status === "pass").length}/${run.evaluations.length} pass`,
        run.error ?? "-"
      ]);
      table(runHeaders, runRows);
      process.stdout.write("\n");
      info(`Summary: ${report.summary}`);
      if (opts.entries) {
        process.stdout.write("\n");
        info("All trace entries:");
        const entryHeaders = ["Time", "Tool", "Duration", "Error"];
        // Only the first 50 entries are tabulated; the rest are summarized.
        const entryRows = trace.entries.slice(0, 50).map((e) => [
          formatTimestamp2(e.timestamp),
          e.toolName,
          e.durationMs + "ms",
          e.error ? `${e.error.code}: ${e.error.message}` : "-"
        ]);
        table(entryHeaders, entryRows);
        if (trace.entries.length > 50) {
          info(`
... and ${trace.entries.length - 50} more entries. Use "archal trace export ${id}" for full data.`);
        }
      }
    }
  });
  cmd.command("export").description("Export trace as JSON").argument("<id>", "Trace ID (full or prefix)").option("-o, --output <file>", "Output file path (default: stdout)").action(async (id, opts) => {
    const json = exportTraceAsJson(id);
    if (!json) {
      error(`Trace not found: ${id}`);
      info('Use "archal trace list" to see available traces');
      process.exit(1);
    }
    if (opts.output) {
      // This file is an ES module, where `require` is not defined, so the
      // bundler's __require shim throws "Dynamic require of ... is not
      // supported". Load the built-ins with dynamic import() instead.
      const fs = await import("node:fs");
      const nodePath = await import("node:path");
      const outPath = nodePath.resolve(opts.output);
      fs.writeFileSync(outPath, json, "utf-8");
      info(`Trace exported to: ${outPath}`);
    } else {
      process.stdout.write(json + "\n");
    }
  });
  return cmd;
}
3243
/**
 * Format an ISO timestamp for display in the user's locale.
 *
 * `new Date(...)` does not throw on unparseable input; it yields an invalid
 * Date whose `toLocaleString()` is the literal text "Invalid Date". The
 * validity check below makes the raw-input fallback actually reachable.
 *
 * @param {string} iso - Timestamp string, expected to be ISO 8601.
 * @returns {string} Locale-formatted date/time, or `iso` unchanged when it
 *   cannot be parsed as a date.
 */
function formatTimestamp2(iso) {
  const date = new Date(iso);
  if (Number.isNaN(date.getTime())) {
    return iso;
  }
  return date.toLocaleString();
}
3251
+
3252
+ // src/commands/config.ts
3253
+ import { Command as Command6 } from "commander";
3254
/**
 * Build the `archal config` command group (`show`, `set`, `init`, `path`).
 *
 * - `show` prints the configuration as sections, or as JSON with `--json`.
 * - `set` stores one value; the echoed value is masked when the key contains
 *   "apiKey".
 * - `init` creates the default config file; `--force` deletes an existing
 *   file first.
 * - `path` prints the config file path.
 *
 * @returns {Command6} The configured commander command.
 */
function createConfigCommand() {
  const cmd = new Command6("config").description("Manage Archal configuration");
  cmd.command("show").description("Print current configuration").option("--json", "Output as JSON").action((opts) => {
    const display = getConfigDisplay();
    if (opts.json) {
      process.stdout.write(JSON.stringify(display, null, 2) + "\n");
      return;
    }
    info("Current Archal configuration:\n");
    printConfigSection("General", {
      telemetry: String(display["telemetry"])
    });
    const evaluator = display["evaluator"];
    printConfigSection("Evaluator", {
      model: evaluator["model"] ?? "(not set)",
      apiKey: evaluator["apiKey"] ?? "(not set)"
    });
    const defaults = display["defaults"];
    printConfigSection("Defaults", {
      runs: String(defaults["runs"]),
      timeout: String(defaults["timeout"]) + "s"
    });
    const paths = display["paths"];
    printConfigSection("Paths", {
      archalDir: paths["archalDir"] ?? "(unknown)",
      configFile: paths["configFile"] ?? "(unknown)"
    });
    process.stdout.write("\n");
    info("Set values with: archal config set <key> <value>");
    info("Valid keys: telemetry, evaluator.model, evaluator.apiKey, defaults.runs, defaults.timeout");
  });
  cmd.command("set").description("Set a configuration value").argument("<key>", "Configuration key (e.g., evaluator.model, defaults.runs)").argument("<value>", "Value to set").action((key, value) => {
    try {
      setConfigValue(key, value);
      // Never echo an API key back to the terminal.
      success(`Set ${key} = ${key.includes("apiKey") ? "***" : value}`);
    } catch (err) {
      const message = err instanceof Error ? err.message : String(err);
      error(message);
      process.exit(1);
    }
  });
  cmd.command("init").description("Create default configuration file").option("--force", "Overwrite existing config").action(async (opts) => {
    const configPath = getConfigPath();
    if (opts.force) {
      // This file is an ES module, where `require` is not defined, so the
      // bundler's __require shim throws "Dynamic require of ... is not
      // supported". Load the built-in with dynamic import() instead.
      const fs = await import("node:fs");
      if (fs.existsSync(configPath)) {
        fs.unlinkSync(configPath);
      }
    }
    try {
      const path = initConfig();
      success(`Configuration initialized: ${path}`);
      info("\nNext steps:");
      info(" 1. Set your API key:");
      info(" archal config set evaluator.apiKey your-key-here");
      info(" or set ANTHROPIC_API_KEY environment variable");
      info("");
      info(" 2. Create a scenario:");
      info(" archal scenario create my-first-test");
      info("");
      info(" 3. Run it:");
      info(" archal run scenarios/my-first-test.md");
    } catch (err) {
      const message = err instanceof Error ? err.message : String(err);
      error(message);
      process.exit(1);
    }
  });
  cmd.command("path").description("Print the config file path").action(() => {
    process.stdout.write(getConfigPath() + "\n");
  });
  return cmd;
}
3327
/**
 * Print one configuration section to stdout: a bold heading, one line per
 * key/value pair with the key padded to 15 columns, then a blank line.
 *
 * @param {string} name - Section heading.
 * @param {Record<string, unknown>} values - Entries to print, in insertion order.
 */
function printConfigSection(name, values) {
  const emit = (text) => process.stdout.write(text);
  emit(`\x1B[1m ${name}\x1B[0m\n`);
  Object.entries(values).forEach(([key, value]) => {
    emit(` ${key.padEnd(15)} ${value}\n`);
  });
  emit("\n");
}
3336
+
3337
+ // src/commands/demo.ts
3338
+ import { Command as Command7 } from "commander";
3339
+ import { existsSync as existsSync9 } from "fs";
3340
+ import { resolve as resolve7, dirname as dirname3 } from "path";
3341
+ import { fileURLToPath as fileURLToPath3 } from "url";
3342
+ import { createRequire as createRequire3 } from "module";
3343
// Directory containing this module, derived from its file URL.
var __dirname3 = fileURLToPath3(new URL(".", import.meta.url));
/**
 * Locate the directory holding the bundled demo files.
 *
 * Looks first for `../demo` relative to this module, then for `demo` two
 * directory levels above the resolved entry point of `@archal/cli`. A
 * directory qualifies only when it contains `scenario.md`.
 *
 * @returns {string} Absolute path of the demo directory.
 * @throws {Error} When neither location contains `scenario.md`.
 */
function resolveDemoDir() {
  const containsScenario = (dir) => existsSync9(resolve7(dir, "scenario.md"));

  const siblingDemo = resolve7(__dirname3, "..", "demo");
  if (containsScenario(siblingDemo)) {
    return siblingDemo;
  }

  try {
    const cliEntry = createRequire3(import.meta.url).resolve("@archal/cli");
    const installedDemo = resolve7(dirname3(dirname3(cliEntry)), "demo");
    if (containsScenario(installedDemo)) {
      return installedDemo;
    }
  } catch {
    // Resolution failure is expected when the package is not installed;
    // fall through to the error below.
  }

  throw new Error("Demo files not found. Ensure @archal/cli is installed correctly.");
}
3361
+ function createDemoCommand() {
3362
+ const cmd = new Command7("demo").description("Run a built-in demo: good agent vs bad agent on the same scenario").option("-q, --quiet", "Suppress non-error output").option("-v, --verbose", "Enable debug logging").action(async (opts) => {
3363
+ if (opts.quiet) {
3364
+ configureLogger({ quiet: true });
3365
+ }
3366
+ if (opts.verbose) {
3367
+ configureLogger({ verbose: true, level: "debug" });
3368
+ }
3369
+ const demoDir = resolveDemoDir();
3370
+ const scenarioPath = resolve7(demoDir, "scenario.md");
3371
+ const goodAgentPath = resolve7(demoDir, "good-agent.mjs");
3372
+ const badAgentPath = resolve7(demoDir, "bad-agent.mjs");
3373
+ process.stderr.write("\n\x1B[36m\x1B[1marchal demo\x1B[0m \x1B[2m\u2014 same scenario, two agents\x1B[0m\n\n");
3374
+ process.stderr.write("\x1B[1m\x1B[32m\u25B8 Good agent\x1B[0m \x1B[2m(checks labels, skips keep-open)\x1B[0m\n");
3375
+ const goodReport = await runScenario({
3376
+ scenarioPath,
3377
+ agentConfig: { command: "node", args: [goodAgentPath] },
3378
+ runs: 1,
3379
+ timeout: 60,
3380
+ output: "terminal"
3381
+ });
3382
+ process.stderr.write("\n");
3383
+ process.stderr.write("\x1B[1m\x1B[31m\u25B8 Bad agent\x1B[0m \x1B[2m(closes everything, no comments)\x1B[0m\n");
3384
+ const badReport = await runScenario({
3385
+ scenarioPath,
3386
+ agentConfig: { command: "node", args: [badAgentPath] },
3387
+ runs: 1,
3388
+ timeout: 60,
3389
+ output: "terminal"
3390
+ });
3391
+ process.stderr.write("\n\x1B[2m\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\x1B[0m\n");
3392
+ process.stderr.write("\n Same scenario. Same digital twin. Different scores.\n");
3393
+ process.stderr.write(` Good agent: \x1B[32m${goodReport.satisfactionScore.toFixed(1)}%\x1B[0m
3394
+ `);
3395
+ process.stderr.write(` Bad agent: \x1B[31m${badReport.satisfactionScore.toFixed(1)}%\x1B[0m
3396
+
3397
+ `);
3398
+ process.stderr.write(" \x1B[2mThis is what archal does \u2014 it measures agent behavior,\n");
3399
+ process.stderr.write(" not just whether it runs.\x1B[0m\n\n");
3400
+ });
3401
+ return cmd;
3402
+ }
3403
+
3404
+ // src/index.ts
3405
// Root `archal` command: global flags, logger setup, subcommand registration.
var program = new Command8();
program
  .name("archal")
  .description("The QA layer for the software factory era \u2014 test AI agents against digital twins")
  .version("0.1.0")
  .option("-q, --quiet", "Suppress non-error output")
  .option("-v, --verbose", "Enable debug logging")
  .hook("preAction", () => {
    // Apply the global flags to the logger before any subcommand action runs.
    const { quiet, verbose } = program.opts();
    if (quiet) {
      configureLogger({ quiet: true });
    }
    if (verbose) {
      configureLogger({ verbose: true, level: "debug" });
    }
  });

// Subcommands are built and registered in this order.
const commandFactories = [
  createRunCommand,
  createInitCommand,
  createTwinCommand,
  createScenarioCommand,
  createTraceCommand,
  createConfigCommand,
  createDemoCommand
];
for (const createCommand of commandFactories) {
  program.addCommand(createCommand());
}

// Any rejection from an action is reported on stderr and exits with status 1.
program.parseAsync(process.argv).catch((err) => {
  const message = err instanceof Error ? err.message : String(err);
  process.stderr.write(`Error: ${message}\n`);
  process.exit(1);
});