@archal/cli 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +48 -0
- package/demo/bad-agent.mjs +39 -0
- package/demo/good-agent.mjs +51 -0
- package/demo/scenario.md +27 -0
- package/dist/index.d.ts +1 -0
- package/dist/index.js +3427 -0
- package/package.json +64 -0
package/dist/index.js
ADDED
|
@@ -0,0 +1,3427 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
var __require = /* @__PURE__ */ ((x) => typeof require !== "undefined" ? require : typeof Proxy !== "undefined" ? new Proxy(x, {
|
|
3
|
+
get: (a, b) => (typeof require !== "undefined" ? require : a)[b]
|
|
4
|
+
}) : x)(function(x) {
|
|
5
|
+
if (typeof require !== "undefined") return require.apply(this, arguments);
|
|
6
|
+
throw Error('Dynamic require of "' + x + '" is not supported');
|
|
7
|
+
});
|
|
8
|
+
|
|
9
|
+
// src/index.ts
|
|
10
|
+
import { Command as Command8 } from "commander";
|
|
11
|
+
|
|
12
|
+
// src/commands/run.ts
|
|
13
|
+
import { Command } from "commander";
|
|
14
|
+
import { existsSync as existsSync6 } from "fs";
|
|
15
|
+
import { resolve as resolve4 } from "path";
|
|
16
|
+
|
|
17
|
+
// src/runner/orchestrator.ts
|
|
18
|
+
import { readFileSync as readFileSync6, existsSync as existsSync5 } from "fs";
|
|
19
|
+
import { resolve as resolve3, dirname as dirname2 } from "path";
|
|
20
|
+
|
|
21
|
+
// src/runner/scenario-parser.ts
|
|
22
|
+
import { readFileSync } from "fs";
|
|
23
|
+
import { basename } from "path";
|
|
24
|
+
|
|
25
|
+
// src/utils/logger.ts
|
|
26
|
+
/** Severity rank of each log level; higher ranks are more severe. */
var LOG_LEVEL_PRIORITY = { debug: 0, info: 1, warn: 2, error: 3 };

/** ANSI color escape used for each level's tag in human-readable output. */
var LOG_LEVEL_COLORS = {
  debug: "\x1B[90m", // gray
  info: "\x1B[36m", // cyan
  warn: "\x1B[33m", // yellow
  error: "\x1B[31m" // red
};

// ANSI text attributes.
var RESET = "\x1B[0m";
var BOLD = "\x1B[1m";
var DIM = "\x1B[2m";

/** Mutable logger settings; replaced wholesale by configureLogger(). */
var globalOptions = { level: "warn", quiet: false, json: false, verbose: false };
|
|
51
|
+
/**
 * Merge `options` over the current logger settings.
 * @param {object} options - Partial settings; keys present here win.
 */
function configureLogger(options) {
  const merged = { ...globalOptions, ...options };
  globalOptions = merged;
}
|
|
54
|
+
/**
 * Decide whether a message at `level` should be emitted.
 * Quiet mode suppresses everything except "error"; otherwise the level's
 * rank must be at least the configured level's rank.
 * @param {string} level
 * @returns {boolean}
 */
function shouldLog(level) {
  const silencedByQuiet = globalOptions.quiet && level !== "error";
  if (silencedByQuiet) return false;
  const threshold = LOG_LEVEL_PRIORITY[globalOptions.level];
  return LOG_LEVEL_PRIORITY[level] >= threshold;
}
|
|
60
|
+
/**
 * Current time as an ISO-8601 string.
 * @returns {string}
 */
function formatTimestamp() {
  const now = /* @__PURE__ */ new Date();
  return now.toISOString();
}
|
|
63
|
+
/**
 * Render a log entry as one line of text.
 * In JSON mode the entry is serialized as-is; otherwise the line is
 * "<dim timestamp> <colored LEVEL> <message>" followed by `key=value` pairs
 * for any data attached to the entry.
 * @param {{level: string, message: string, timestamp: string, data?: object}} entry
 * @returns {string}
 */
function formatLogEntry(entry) {
  if (globalOptions.json) {
    return JSON.stringify(entry);
  }
  const tag = `${LOG_LEVEL_COLORS[entry.level]}${BOLD}${entry.level.toUpperCase().padEnd(5)}${RESET}`;
  const stamp = `${DIM}${entry.timestamp}${RESET}`;
  const head = `${stamp} ${tag} ${entry.message}`;
  const pairs = entry.data ? Object.entries(entry.data) : [];
  if (pairs.length === 0) {
    return head;
  }
  // Strings are printed raw; everything else goes through JSON.stringify.
  const rendered = pairs.map(([key, value]) => {
    const shown = typeof value === "string" ? value : JSON.stringify(value);
    return `${DIM}${key}=${RESET}${shown}`;
  });
  return `${head} ${rendered.join(" ")}`;
}
|
|
77
|
+
/**
 * Write one log line to stderr if `level` passes the current filter.
 * @param {string} level
 * @param {string} message
 * @param {object} [data] - Extra key/value context.
 */
function log(level, message, data) {
  if (shouldLog(level)) {
    const line = formatLogEntry({ level, message, timestamp: formatTimestamp(), data });
    process.stderr.write(line + "\n");
  }
}
|
|
90
|
+
/** Log `message` (with optional `data`) at "debug" level. */
function debug(message, data) {
  log("debug", message, data);
}

/** Log `message` (with optional `data`) at "info" level. */
function info(message, data) {
  log("info", message, data);
}

/** Log `message` (with optional `data`) at "warn" level. */
function warn(message, data) {
  log("warn", message, data);
}

/** Log `message` (with optional `data`) at "error" level. */
function error(message, data) {
  log("error", message, data);
}
|
|
102
|
+
/** Print a green "OK" status line to stderr; verbose mode only. */
function success(message) {
  if (globalOptions.verbose) {
    process.stderr.write(`\x1B[32m${BOLD} OK${RESET} ${message}\n`);
  }
}

/** Print a red "FAIL" status line to stderr; verbose mode only. */
function fail(message) {
  if (globalOptions.verbose) {
    process.stderr.write(`\x1B[31m${BOLD}FAIL${RESET} ${message}\n`);
  }
}

/** Print a dim "..." progress line to stderr; verbose mode only. */
function progress(message) {
  if (globalOptions.verbose) {
    process.stderr.write(`${DIM} ...${RESET} ${message}\n`);
  }
}
|
|
117
|
+
/**
 * Print `text` between two cyan rules on stderr; verbose mode only.
 * The rule is at least 40 characters, or the text length plus 4 if longer.
 * @param {string} text
 */
function banner(text) {
  if (!globalOptions.verbose) return;
  const rule = "=".repeat(Math.max(text.length + 4, 40));
  const cyanBold = `\x1B[36m${BOLD}`;
  process.stderr.write(`\n${cyanBold}${rule}${RESET}\n`);
  process.stderr.write(`${cyanBold} ${text}${RESET}\n`);
  process.stderr.write(`${cyanBold}${rule}${RESET}\n\n`);
}
|
|
129
|
+
/**
 * Print a padded text table to stderr; verbose mode only.
 * Each column is as wide as its header or its widest cell, whichever is larger.
 * @param {string[]} headers
 * @param {string[][]} rows
 */
function table(headers, rows) {
  if (!globalOptions.verbose) return;
  const colWidths = headers.map((header, col) => {
    let widestCell = 0;
    for (const row of rows) {
      widestCell = Math.max(widestCell, (row[col] ?? "").length);
    }
    return Math.max(header.length, widestCell);
  });
  // Pad every cell to its column width; cells past the last header get no padding.
  const renderRow = (cells) => cells.map((cell, col) => cell.padEnd(colWidths[col] ?? 0)).join(" ");
  const rule = colWidths.map((width) => "-".repeat(width)).join(" ");
  process.stderr.write(`${BOLD}${renderRow(headers)}${RESET}\n`);
  process.stderr.write(`${DIM}${rule}${RESET}\n`);
  for (const row of rows) {
    process.stderr.write(`${renderRow(row)}\n`);
  }
}
|
|
150
|
+
|
|
151
|
+
// src/runner/scenario-parser.ts
|
|
152
|
+
/**
 * Split scenario markdown into its title and the known `##` sections.
 *
 * The title is the first non-empty `# ` heading. Any later line starting
 * with `# ` is NOT treated as a heading: it is kept as content of the current
 * section, so `# comment` lines inside `## Config` (which parseConfigSection
 * skips) no longer overwrite the scenario title. Section names are matched
 * case-insensitively; if a section heading is repeated, the last one wins.
 *
 * @param {string} markdown - Raw scenario text.
 * @returns {{title: string, setup: string, expectedBehavior: string,
 *            successCriteria: string, config: string}} Trimmed section
 *          bodies; a missing section is an empty string.
 */
function extractSections(markdown) {
  let title = "";
  // Map rather than a plain object so a heading such as "## constructor"
  // cannot collide with Object.prototype members.
  const sections = new Map();
  let currentLines;
  for (const line of markdown.split("\n")) {
    const h1 = line.match(/^#\s+(.+)/);
    if (h1 && title === "") {
      title = h1[1].trim();
      continue;
    }
    const h2 = line.match(/^##\s+(.+)/);
    if (h2) {
      currentLines = [];
      sections.set(h2[1].trim().toLowerCase(), currentLines);
      continue;
    }
    // Lines before the first section heading are dropped.
    if (currentLines) {
      currentLines.push(line);
    }
  }
  const getSection = (name) => (sections.get(name) ?? []).join("\n").trim();
  return {
    title,
    setup: getSection("setup"),
    expectedBehavior: getSection("expected behavior"),
    successCriteria: getSection("success criteria"),
    config: getSection("config")
  };
}
|
|
187
|
+
/**
 * Parse one line of the Success Criteria section.
 *
 * Leading list markers ("- ", "* ", "1. ") are stripped. An explicit "[D]" or
 * "[P]" tag (case-insensitive) fixes the type; without a tag the type is
 * inferred from the wording.
 *
 * @param {string} line
 * @param {number} index - Zero-based position, used to build the id.
 * @returns {{id: string, description: string, type: string} | null}
 *          null for blank lines.
 */
function parseCriterionLine(line, index) {
  const text = line.trim().replace(/^[-*]\s+/, "").replace(/^\d+\.\s+/, "");
  if (!text) return null;
  const id = `criterion-${index + 1}`;
  const tagged = text.match(/^\[([DP])]\s*(.*)/i);
  if (!tagged) {
    return { id, description: text, type: inferCriterionType(text) };
  }
  const isDeterministic = (tagged[1] ?? "").toUpperCase() === "D";
  return {
    id,
    description: tagged[2]?.trim() ?? "",
    type: isDeterministic ? "deterministic" : "probabilistic"
  };
}
|
|
208
|
+
/**
 * Guess whether a criterion can be checked mechanically.
 * @param {string} description
 * @returns {"deterministic" | "probabilistic"} "deterministic" when the text
 *          matches any of the count/state phrasings below.
 */
function inferCriterionType(description) {
  const deterministicPatterns = [
    // Leading numeric quantifiers.
    /^exactly\s+\d+/i,
    /^at\s+least\s+\d+/i,
    /^at\s+most\s+\d+/i,
    /^no\s+more\s+than\s+\d+/i,
    /^fewer\s+than\s+\d+/i,
    /^more\s+than\s+\d+/i,
    // Concrete state assertions.
    /\bis\s+(created|merged|closed|open|deleted|removed)\b/i,
    /\bexists?\b/i,
    /\bno\s+errors?\b/i,
    /\bcount\s+(is|equals|==)\b/i,
    /\b(should|must)\s+(have|contain)\s+exactly\b/i,
    /^\d+\s+\w+\s+(are|were|is|was)\b/i,
    /\b(zero|none)\s+\w+\s+(are|were|remain)\b/i
  ];
  const looksDeterministic = deterministicPatterns.some((pattern) => pattern.test(description));
  return looksDeterministic ? "deterministic" : "probabilistic";
}
|
|
231
|
+
/**
 * Parse the `key: value` lines of a scenario's Config section.
 *
 * Blank lines, lines starting with "#", and lines without a colon are
 * skipped. Keys are case-insensitive. `timeout` and `runs` are stored only
 * when they parse to a positive integer. Unknown keys are reported at debug
 * level.
 *
 * @param {string} configText
 * @returns {{twins?: string[], timeout?: number, runs?: number, evaluatorModel?: string}}
 */
function parseConfigSection(configText) {
  const result = {};
  if (!configText) return result;
  const evaluatorKeys = ["evaluator", "evaluator-model", "evaluatormodel", "model"];
  const toPositiveInt = (raw) => {
    const parsed = parseInt(raw, 10);
    return !Number.isNaN(parsed) && parsed > 0 ? parsed : void 0;
  };
  for (const rawLine of configText.split("\n")) {
    const entry = rawLine.trim();
    if (!entry || entry.startsWith("#")) continue;
    const separatorAt = entry.indexOf(":");
    if (separatorAt === -1) continue;
    const key = entry.slice(0, separatorAt).trim().toLowerCase();
    // Only the first colon separates key from value, so values may contain colons.
    const value = entry.slice(separatorAt + 1).trim();
    if (key === "twins") {
      result.twins = value.split(",").map((t) => t.trim()).filter(Boolean);
    } else if (key === "timeout" || key === "runs") {
      const amount = toPositiveInt(value);
      if (amount !== void 0) {
        result[key] = amount;
      }
    } else if (evaluatorKeys.includes(key)) {
      result.evaluatorModel = value;
    } else {
      debug(`Unknown config key in scenario: "${key}"`);
    }
  }
  return result;
}
|
|
275
|
+
/**
 * Infer which twins a scenario needs from the wording of its Setup and
 * Expected Behavior sections.
 *
 * Keywords are matched case-insensitively at the START of a word, so
 * inflected forms such as "issues" or "merged" still count. Keywords of four
 * characters or fewer ("pr", "dm", "repo", "epic", "jira") must be a whole
 * word, optionally with a plural "s". Plain substring matching made "pr"
 * fire on "approve"/"process"/"project", "dm" on "admin", and "repo" on
 * "report", which selected unrelated twins.
 *
 * @param {string} setup
 * @param {string} expectedBehavior
 * @returns {string[]} Twin names in the fixed order github, slack, linear, jira.
 */
function inferTwinsFromContent(setup, expectedBehavior) {
  const combined = `${setup}\n${expectedBehavior}`.toLowerCase();
  const twinKeywords = {
    github: ["github", "repository", "repo", "pull request", "pr", "issue", "commit", "branch", "merge"],
    slack: ["slack", "channel", "message", "thread", "workspace", "dm", "direct message"],
    linear: ["linear", "ticket", "project", "cycle", "backlog"],
    jira: ["jira", "sprint", "epic", "story", "board"]
  };
  const isMentioned = (keyword) => {
    const escaped = keyword.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
    // Short keywords are too ambiguous as prefixes; require the whole word.
    const tail = keyword.length <= 4 ? "s?\\b" : "";
    return new RegExp(`\\b${escaped}${tail}`).test(combined);
  };
  return Object.entries(twinKeywords)
    .filter(([, keywords]) => keywords.some(isMentioned))
    .map(([twin]) => twin);
}
|
|
292
|
+
/**
 * Read a scenario markdown file (UTF-8) and parse it.
 * @param {string} filePath
 * @returns {object} The parsed scenario; see parseScenarioMarkdown.
 */
function parseScenarioFile(filePath) {
  return parseScenarioMarkdown(readFileSync(filePath, "utf-8"), filePath);
}
|
|
296
|
+
/**
 * Parse scenario markdown into a structured scenario.
 *
 * Missing pieces produce warnings rather than errors: a missing title falls
 * back to the file name (dashes turned into spaces) or "Untitled Scenario".
 * Twins come from the Config section when listed there, otherwise they are
 * inferred from the Setup / Expected Behavior text. Timeout defaults to 120
 * and runs to 5.
 *
 * @param {string} markdown
 * @param {string} [sourcePath] - Used only for the fallback title.
 * @returns {{title: string, setup: string, expectedBehavior: string,
 *            successCriteria: object[], config: object}}
 */
function parseScenarioMarkdown(markdown, sourcePath) {
  const sections = extractSections(markdown);
  if (!sections.title) {
    const fallbackTitle = sourcePath ? basename(sourcePath, ".md").replace(/-/g, " ") : "Untitled Scenario";
    warn(`Scenario missing title heading, using fallback: "${fallbackTitle}"`);
    sections.title = fallbackTitle;
  }
  const expectedSections = [
    ["setup", "Setup"],
    ["expectedBehavior", "Expected Behavior"],
    ["successCriteria", "Success Criteria"]
  ];
  for (const [field, heading] of expectedSections) {
    if (!sections[field]) {
      warn(`Scenario missing ## ${heading} section`);
    }
  }

  // Criterion ids are numbered by position among the lines that actually parsed.
  const successCriteria = [];
  for (const line of sections.successCriteria.split("\n")) {
    const criterion = parseCriterionLine(line, successCriteria.length);
    if (criterion) {
      successCriteria.push(criterion);
    }
  }

  const parsedConfig = parseConfigSection(sections.config);
  const hasDeclaredTwins = parsedConfig.twins && parsedConfig.twins.length > 0;
  const config = {
    twins: hasDeclaredTwins ? parsedConfig.twins : inferTwinsFromContent(sections.setup, sections.expectedBehavior),
    timeout: parsedConfig.timeout ?? 120,
    runs: parsedConfig.runs ?? 5,
    evaluatorModel: parsedConfig.evaluatorModel
  };

  const countOfType = (type) => successCriteria.filter((c) => c.type === type).length;
  debug("Parsed scenario", {
    title: sections.title,
    criteriaCount: successCriteria.length,
    deterministicCount: countOfType("deterministic"),
    probabilisticCount: countOfType("probabilistic"),
    twins: config.twins.join(", ")
  });

  const { title, setup, expectedBehavior } = sections;
  return { title, setup, expectedBehavior, successCriteria, config };
}
|
|
345
|
+
/**
 * Check a parsed scenario for completeness.
 * @param {object} scenario - Result of parseScenarioMarkdown.
 * @returns {string[]} Human-readable problems; empty when the scenario is valid.
 */
function validateScenario(scenario) {
  const errors = [];
  const require_ = (ok, message) => {
    if (!ok) errors.push(message);
  };
  const { successCriteria, config } = scenario;

  require_(scenario.title, "Scenario must have a title");
  require_(scenario.setup, "Scenario must have a Setup section");
  require_(scenario.expectedBehavior, "Scenario must have an Expected Behavior section");
  require_(successCriteria.length !== 0, "Scenario must have at least one success criterion");
  for (const criterion of successCriteria) {
    require_(criterion.description, `Criterion ${criterion.id} has an empty description`);
  }
  require_(
    config.twins.length !== 0,
    "Scenario does not reference any known twins (specify in Config section or mention services in Setup/Expected Behavior)"
  );
  require_(!(config.timeout <= 0), "Timeout must be a positive number");
  require_(!(config.runs <= 0), "Runs must be a positive number");
  return errors;
}
|
|
375
|
+
|
|
376
|
+
// src/runner/seed-generator.ts
|
|
377
|
+
/**
 * Keyword tables used to pick a seed from a scenario's setup text.
 * Each matched keyword adds `weight` to its mapping's score, so the
 * weight-2 edge-case seeds outrank the generic weight-1 seeds on a tie in
 * keyword count.
 */
var GITHUB_SEED_MAPPINGS = [
  // Generic repository shapes.
  { keywords: ["empty", "blank", "new", "fresh", "clean", "no issues", "no pull requests", "bare"], seedName: "empty", weight: 1 },
  { keywords: ["small", "simple", "basic", "starter", "minimal", "few issues", "small team", "small project"], seedName: "small-project", weight: 1 },
  { keywords: ["enterprise", "large", "many issues", "complex", "multiple contributors", "ci/cd", "workflows", "protected branches", "teams", "organization"], seedName: "enterprise-repo", weight: 1 },
  // Edge-case repositories.
  { keywords: ["conflict", "merge conflict", "conflicting", "diverged", "cannot merge"], seedName: "merge-conflict", weight: 2 },
  { keywords: ["permission", "denied", "forbidden", "access denied", "unauthorized", "read-only"], seedName: "permissions-denied", weight: 2 },
  { keywords: ["rate limit", "throttle", "too many requests", "429"], seedName: "rate-limited", weight: 2 },
  { keywords: ["stale", "old", "inactive", "outdated", "abandoned", "stale issues", "untriaged"], seedName: "stale-issues", weight: 2 },
  { keywords: ["pagination", "large backlog", "many issues", "50 issues", "paginate", "page 2", "multiple pages"], seedName: "large-backlog", weight: 2 }
];

var SLACK_SEED_MAPPINGS = [
  { keywords: ["empty", "blank", "new workspace", "fresh", "clean"], seedName: "empty", weight: 1 },
  { keywords: ["small team", "few channels", "simple", "basic", "starter"], seedName: "small-team", weight: 1 },
  { keywords: ["engineering", "development", "engineering team", "developers", "incidents", "on-call", "sprints", "standups"], seedName: "engineering-team", weight: 1 },
  { keywords: ["support", "customer", "tickets", "help desk", "routing"], seedName: "support-team", weight: 1 },
  { keywords: ["busy", "high volume", "many messages", "active", "noisy"], seedName: "high-volume", weight: 1 }
];

/** Seed mapping table for each twin that has one. */
var TWIN_SEED_REGISTRY = { github: GITHUB_SEED_MAPPINGS, slack: SLACK_SEED_MAPPINGS };

/** Seed used for a twin when no keyword matches. */
var DEFAULT_SEEDS = { github: "small-project", slack: "small-team" };
|
|
490
|
+
/**
 * Lowercase `text`, replace every character other than a-z, 0-9, whitespace
 * and "/" with a space, collapse whitespace runs, and trim.
 * @param {string} text
 * @returns {string}
 */
function normalizeText(text) {
  return text.toLowerCase().replace(/[^a-z0-9\s/]/g, " ").replace(/\s+/g, " ").trim();
}

/**
 * Score one seed mapping against free-form setup text.
 *
 * Both the text and each keyword are normalized before comparison. Matching
 * raw keywords against normalized text meant keywords containing punctuation
 * ("read-only", "on-call") could never match, because normalization turns the
 * hyphen in the text into a space.
 *
 * @param {string} text
 * @param {{keywords: string[], seedName: string, weight: number}} mapping
 * @returns {{score: number, matched: string[]}} `score` is `weight` times the
 *          number of matched keywords; `matched` lists them as written in the
 *          mapping.
 */
function scoreMappingAgainstText(text, mapping) {
  const normalized = normalizeText(text);
  const matched = mapping.keywords.filter((keyword) => {
    const needle = normalizeText(keyword);
    // An empty needle would be "included" in every string; never count it.
    return needle !== "" && normalized.includes(needle);
  });
  return { score: matched.length * mapping.weight, matched };
}
|
|
505
|
+
/**
 * Choose the seed for one twin from the scenario's setup text.
 *
 * The highest-scoring mapping wins and the earliest mapping wins ties. With
 * no match the twin's default seed is used, or "default" for twins without
 * one. Confidence is the winning score divided by 5, capped at 1.
 *
 * @param {string} twinName
 * @param {string} setupDescription
 * @returns {{twinName: string, seedName: string, confidence: number, matchedKeywords: string[]}}
 */
function selectSeedForTwin(twinName, setupDescription) {
  const mappings = TWIN_SEED_REGISTRY[twinName];
  if (!mappings || mappings.length === 0) {
    debug(`No seed mappings for twin "${twinName}", using "default"`);
    return { twinName, seedName: "default", confidence: 0, matchedKeywords: [] };
  }

  const fallback = { seedName: DEFAULT_SEEDS[twinName] ?? "default", score: 0, matched: [] };
  const winner = mappings.reduce((leader, mapping) => {
    const { score, matched } = scoreMappingAgainstText(setupDescription, mapping);
    // Strictly greater: the first mapping to reach a score keeps it.
    return score > leader.score ? { seedName: mapping.seedName, score, matched } : leader;
  }, fallback);

  let maxPossibleScore = 0;
  for (const mapping of mappings) {
    maxPossibleScore += mapping.keywords.length * mapping.weight;
  }
  const confidence = maxPossibleScore > 0 ? Math.min(winner.score / 5, 1) : 0;

  debug("Seed selection", {
    twin: twinName,
    seed: winner.seedName,
    confidence: confidence.toFixed(2),
    matchedKeywords: winner.matched.join(", ")
  });
  return { twinName, seedName: winner.seedName, confidence, matchedKeywords: winner.matched };
}
|
|
545
|
+
/**
 * Select a seed for every twin, in the order given.
 * @param {string[]} twins
 * @param {string} setupDescription
 * @returns {object[]} One selection per twin; see selectSeedForTwin.
 */
function generateSeedSelections(twins, setupDescription) {
  const selections = [];
  for (const twinName of twins) {
    selections.push(selectSeedForTwin(twinName, setupDescription));
  }
  return selections;
}
|
|
548
|
+
/**
 * Apply manual seed overrides on top of automatic selections.
 * @param {object[]} selections - Results of selectSeedForTwin.
 * @param {Object<string, string>} overrides - Seed name keyed by twin name.
 * @returns {object[]} New array; overridden entries are copies with
 *          confidence 1, all other entries are the original objects.
 */
function overrideSeedSelection(selections, overrides) {
  const resolved = [];
  for (const selection of selections) {
    const forcedSeed = overrides[selection.twinName];
    if (!forcedSeed) {
      resolved.push(selection);
      continue;
    }
    debug(`Seed override for ${selection.twinName}: ${forcedSeed}`);
    resolved.push({
      ...selection,
      seedName: forcedSeed,
      confidence: 1,
      matchedKeywords: ["(manual override)"]
    });
  }
  return resolved;
}
|
|
563
|
+
/**
 * List the distinct seed names registered for a twin, in table order.
 * @param {string} twinName
 * @returns {string[]} Empty when the twin has no mapping table.
 */
function getAvailableSeeds(twinName) {
  const mappings = TWIN_SEED_REGISTRY[twinName];
  if (!mappings) return [];
  return [.../* @__PURE__ */ new Set(mappings.map((mapping) => mapping.seedName))];
}
|
|
572
|
+
|
|
573
|
+
// src/runner/agent-executor.ts
|
|
574
|
+
import { readFileSync as readFileSync2, writeFileSync, renameSync, existsSync, unlinkSync } from "fs";
|
|
575
|
+
import { createRequire } from "module";
|
|
576
|
+
import { tmpdir } from "os";
|
|
577
|
+
import { join, resolve } from "path";
|
|
578
|
+
import { fileURLToPath } from "url";
|
|
579
|
+
|
|
580
|
+
// src/utils/process.ts
|
|
581
|
+
import { spawn } from "child_process";
|
|
582
|
+
/**
 * Run a command to completion with a hard time limit.
 *
 * When `timeoutMs` elapses the child is sent SIGTERM and, if it has still
 * not exited five seconds later, SIGKILL. Whether it has exited is judged
 * from `exitCode`/`signalCode`: `child.killed` only records that a signal was
 * sent, so gating the escalation on it meant SIGKILL was never delivered to a
 * child that ignored SIGTERM. Both timers are cleared once the child closes
 * or errors.
 *
 * @param {object} options
 * @param {string} options.command
 * @param {string[]} options.args
 * @param {number} options.timeoutMs
 * @param {string} [options.cwd]
 * @param {object} [options.env] - Merged over `process.env`.
 * @param {boolean} [options.pipeStdio=false] - When true the child inherits
 *        this process's stdio and nothing is captured.
 * @param {(text: string) => void} [options.onStdout] - Called per stdout chunk.
 * @param {(text: string) => void} [options.onStderr] - Called per stderr chunk.
 * @returns {Promise<{exitCode: number|null, stdout: string, stderr: string,
 *                    timedOut: boolean, durationMs: number}>}
 *          Rejects if the process cannot be spawned or emits "error".
 */
function spawnWithTimeout(options) {
  const {
    command,
    args,
    timeoutMs,
    cwd,
    env,
    pipeStdio = false,
    onStdout,
    onStderr
  } = options;
  return new Promise((resolve8, reject) => {
    const startTime = Date.now();
    let timedOut = false;
    let stdoutBuf = "";
    let stderrBuf = "";
    let forceKillTimer;
    const spawnOpts = {
      cwd,
      env: env ? { ...process.env, ...env } : process.env,
      stdio: pipeStdio ? "inherit" : "pipe",
      shell: process.platform === "win32"
    };
    debug("Spawning process", { command, args: args.join(" "), timeoutMs });
    let child;
    try {
      child = spawn(command, args, spawnOpts);
    } catch (err) {
      const message = err instanceof Error ? err.message : String(err);
      reject(new Error(`Failed to spawn process "${command}": ${message}`));
      return;
    }
    const hasExited = () => child.exitCode !== null || child.signalCode !== null;
    const timer = setTimeout(() => {
      timedOut = true;
      debug("Process timed out, killing", { command, timeoutMs });
      child.kill("SIGTERM");
      forceKillTimer = setTimeout(() => {
        if (!hasExited()) {
          child.kill("SIGKILL");
        }
      }, 5e3);
    }, timeoutMs);
    const clearTimers = () => {
      clearTimeout(timer);
      clearTimeout(forceKillTimer);
    };
    if (!pipeStdio && child.stdout) {
      child.stdout.on("data", (chunk) => {
        const text = chunk.toString();
        stdoutBuf += text;
        if (onStdout) {
          onStdout(text);
        }
      });
    }
    if (!pipeStdio && child.stderr) {
      child.stderr.on("data", (chunk) => {
        const text = chunk.toString();
        stderrBuf += text;
        if (onStderr) {
          onStderr(text);
        }
      });
    }
    child.on("error", (err) => {
      clearTimers();
      reject(new Error(`Process "${command}" errored: ${err.message}`));
    });
    child.on("close", (exitCode) => {
      clearTimers();
      const durationMs = Date.now() - startTime;
      debug("Process exited", { command, exitCode, durationMs, timedOut });
      resolve8({
        exitCode,
        stdout: stdoutBuf,
        stderr: stderrBuf,
        timedOut,
        durationMs
      });
    });
  });
}
|
|
659
|
+
/**
 * Start a long-running MCP server process with all three stdio streams piped.
 * Spawn errors are logged at error level, not thrown.
 * @param {{command: string, args: string[], env?: object, cwd?: string}} options
 * @returns {import("child_process").ChildProcess}
 */
function spawnMcpStdioProcess(options) {
  const { command, args, env, cwd } = options;
  debug("Spawning MCP stdio process", { command, args: args.join(" ") });
  const mergedEnv = env ? { ...process.env, ...env } : process.env;
  const server = spawn(command, args, {
    cwd,
    env: mergedEnv,
    stdio: ["pipe", "pipe", "pipe"],
    shell: process.platform === "win32"
  });
  server.on("error", (err) => {
    error(`MCP process "${command}" errored: ${err.message}`);
  });
  return server;
}
|
|
673
|
+
/**
 * Stop a child process: SIGTERM first, then SIGKILL if it has not exited
 * within `gracePeriodMs`.
 *
 * "Has exited" is judged from `exitCode`/`signalCode`. `child.killed` only
 * records that a signal was sent, so relying on it (a) skipped the SIGKILL
 * escalation entirely, (b) returned immediately for a child that had been
 * signalled but was still running, and (c) waited forever for a "close"
 * event from a child that had already been terminated by a signal.
 *
 * @param {import("child_process").ChildProcess} child
 * @param {number} [gracePeriodMs=5000] - Delay before escalating to SIGKILL.
 * @returns {Promise<void>} Resolves once the child has closed, or immediately
 *          if it had already exited.
 */
function killProcess(child, gracePeriodMs = 5e3) {
  return new Promise((resolve8) => {
    const hasExited = () => child.exitCode !== null || (child.signalCode ?? null) !== null;
    if (hasExited()) {
      resolve8();
      return;
    }
    const forceKillTimer = setTimeout(() => {
      if (!hasExited()) {
        child.kill("SIGKILL");
      }
    }, gracePeriodMs);
    // Listen before signalling so the close event cannot be missed.
    child.on("close", () => {
      clearTimeout(forceKillTimer);
      resolve8();
    });
    child.kill("SIGTERM");
  });
}
|
|
691
|
+
|
|
692
|
+
// src/runner/agent-executor.ts
|
|
693
|
+
/** Directory containing this module, derived from its file URL. */
var __dirname = fileURLToPath(new URL(".", import.meta.url));

/**
 * Work out how to launch a twin's server.
 *
 * Tried in order:
 *   1. a build at ../../twins/<twin>/dist/index.js relative to this module,
 *   2. the resolved entry of the `@archal/twin-<twin>` package,
 *   3. `npx @archal/twin-<twin>` as a last resort.
 *
 * @param {string} twinName
 * @returns {{command: string, args: string[]}}
 */
function resolveTwinCommand(twinName) {
  const packageName = `@archal/twin-${twinName}`;
  const localBuild = resolve(__dirname, "..", "..", "twins", twinName, "dist", "index.js");
  if (existsSync(localBuild)) {
    return { command: "node", args: [localBuild] };
  }
  let packageEntry;
  try {
    packageEntry = createRequire(import.meta.url).resolve(packageName);
  } catch {
    // Package cannot be resolved from here; let npx locate it.
    return { command: "npx", args: [packageName] };
  }
  return { command: "node", args: [packageEntry] };
}
|
|
707
|
+
/**
 * Poll every 100 ms until `filePath` exists or the timeout elapses.
 * @param {string} filePath
 * @param {number} [timeoutMs=10000]
 * @returns {Promise<boolean>} true if the file appeared in time.
 */
async function waitForFile(filePath, timeoutMs = 1e4) {
  const startedAt = Date.now();
  const pause = (ms) => new Promise((wake) => setTimeout(wake, ms));
  for (;;) {
    const withinBudget = Date.now() - startedAt < timeoutMs;
    if (!withinBudget) return false;
    if (existsSync(filePath)) return true;
    await pause(100);
  }
}
|
|
715
|
+
/**
 * Capture each twin's initial (seed) state.
 *
 * Twins are handled one at a time: the twin is started with `--seed` and
 * `--state-file`, its state file is awaited for up to 10 s, the file is read,
 * and the twin is stopped. A twin whose state cannot be obtained gets `{}`
 * and a warning.
 *
 * @param {{twinName: string, seedName: string, rateLimitMax?: number}[]} twinConfigs
 * @returns {Promise<{beforeState: object, twinPaths: object}>} Parsed state
 *          and `{ stateFile }` paths, both keyed by twin name.
 */
async function captureSeedState(twinConfigs) {
  const runId = `archal-seed-${Date.now()}-${Math.random().toString(36).slice(2, 8)}`;
  const beforeState = {};
  const twinPaths = {};
  const readSeedState = (twinName, stateFilePath) => {
    try {
      return JSON.parse(readFileSync2(stateFilePath, "utf-8"));
    } catch {
      warn(`Failed to read seed state for ${twinName}`);
      return {};
    }
  };
  for (const { twinName, seedName, rateLimitMax } of twinConfigs) {
    const { command, args } = resolveTwinCommand(twinName);
    const stateFilePath = join(tmpdir(), `${runId}-${twinName}-state.json`);
    debug(`Capturing seed state for ${twinName}`, { seed: seedName });
    const twinArgs = [...args, "--seed", seedName, "--state-file", stateFilePath];
    if (rateLimitMax && rateLimitMax > 0) {
      twinArgs.push("--rate-limit", String(rateLimitMax));
    }
    const child = spawnMcpStdioProcess({ command, args: twinArgs });
    twinPaths[twinName] = { stateFile: stateFilePath };
    if (await waitForFile(stateFilePath, 1e4)) {
      beforeState[twinName] = readSeedState(twinName, stateFilePath);
    } else {
      warn(`State file not written for ${twinName} within timeout`);
      beforeState[twinName] = {};
    }
    await killProcess(child, 3e3);
  }
  return { beforeState, twinPaths };
}
|
|
754
|
+
/**
 * Write the MCP client config that tells an agent how to launch each twin.
 *
 * The JSON is written to "<config>.tmp" and then renamed into place. Every
 * twin gets its own state and trace file in the OS temp directory, named
 * after `runId`.
 *
 * @param {{twinName: string, seedName: string, rateLimitMax?: number}[]} twinConfigs
 * @param {string} runId - Prefix for all generated file names.
 * @returns {{configPath: string, twinPaths: object}} `twinPaths` maps twin
 *          name to `{ stateFile, traceFile }`.
 */
function writeMcpConfig(twinConfigs, runId) {
  const tempFile = (suffix) => join(tmpdir(), `${runId}-${suffix}`);
  const twinPaths = {};
  const mcpServers = {};
  for (const { twinName, seedName, rateLimitMax } of twinConfigs) {
    const { command, args: baseArgs } = resolveTwinCommand(twinName);
    const stateFile = tempFile(`${twinName}-state.json`);
    const traceFile = tempFile(`${twinName}-trace.json`);
    twinPaths[twinName] = { stateFile, traceFile };
    const twinArgs = [...baseArgs, "--seed", seedName, "--state-file", stateFile, "--trace-file", traceFile];
    if (rateLimitMax && rateLimitMax > 0) {
      twinArgs.push("--rate-limit", String(rateLimitMax));
    }
    mcpServers[twinName] = { command, args: twinArgs };
  }
  const configPath = tempFile("mcp-config.json");
  const tmpPath = configPath + ".tmp";
  writeFileSync(tmpPath, JSON.stringify({ mcpServers }, null, 2));
  renameSync(tmpPath, configPath);
  debug("Wrote MCP config", { configPath, twins: Object.keys(mcpServers).join(", ") });
  return { configPath, twinPaths };
}
|
|
784
|
+
/**
 * Run the agent under test with the MCP configuration exposed through
 * environment variables (MCP_CONFIG_PATH, ARCHAL_MCP_CONFIG,
 * ARCHAL_MCP_SERVERS, ARCHAL_TWIN_NAMES). These take precedence over any
 * same-named entries in `agentConfig.env`.
 *
 * Agent output is captured and echoed at debug level. A timeout or non-zero
 * exit is logged as a warning, not thrown.
 *
 * @param {{command: string, args: string[], cwd?: string, env?: object}} agentConfig
 * @param {string} mcpConfigPath
 * @param {string} mcpServersJson
 * @param {string[]} twinNames
 * @param {number} timeoutMs
 * @returns {Promise<{exitCode: number|null, stdout: string, stderr: string,
 *                    timedOut: boolean, durationMs: number}>}
 */
async function executeAgent(agentConfig, mcpConfigPath, mcpServersJson, twinNames, timeoutMs) {
  const { command, args, cwd } = agentConfig;
  const agentEnv = {
    ...agentConfig.env,
    MCP_CONFIG_PATH: mcpConfigPath,
    ARCHAL_MCP_CONFIG: mcpConfigPath,
    ARCHAL_MCP_SERVERS: mcpServersJson,
    ARCHAL_TWIN_NAMES: twinNames.join(",")
  };
  info("Executing agent", { command, timeout: `${timeoutMs}ms` });
  const echo = (stream) => (chunk) => {
    debug(`[agent ${stream}] ${chunk.trimEnd()}`);
  };
  const result = await spawnWithTimeout({
    command,
    args,
    timeoutMs,
    cwd,
    env: agentEnv,
    onStdout: echo("stdout"),
    onStderr: echo("stderr")
  });
  const { exitCode, stdout, stderr, timedOut, durationMs } = result;
  if (timedOut) {
    warn(`Agent timed out after ${timeoutMs}ms`);
  } else if (exitCode !== 0) {
    warn(`Agent exited with code ${exitCode}`);
  } else {
    info("Agent completed successfully", { durationMs });
  }
  return { exitCode, stdout, stderr, timedOut, durationMs };
}
|
|
824
|
+
/**
 * Read every twin's state file into one object keyed by twin name.
 * A missing or unreadable file yields `{}` for that twin and a warning.
 * @param {Object<string, {stateFile: string}>} twinPaths
 * @returns {Object<string, object>}
 */
function collectStateFromFiles(twinPaths) {
  const loadState = (name, stateFile) => {
    try {
      if (!existsSync(stateFile)) {
        warn(`State file not found for twin "${name}" at ${stateFile}`);
        return {};
      }
      return JSON.parse(readFileSync2(stateFile, "utf-8"));
    } catch (err) {
      const msg = err instanceof Error ? err.message : String(err);
      warn(`Failed to read state for twin "${name}": ${msg}`);
      return {};
    }
  };
  const state = {};
  for (const [name, paths] of Object.entries(twinPaths)) {
    state[name] = loadState(name, paths.stateFile);
  }
  return state;
}
|
|
843
|
+
/**
 * Reads every twin's trace file and merges the entries into one list ordered
 * by `timestamp`.
 *
 * Trace files that are missing, unparseable, or do not contain a JSON array
 * are skipped (with a debug message) so one bad twin does not lose the rest.
 *
 * @param {object} twinPaths - Map of twin name to an object with a `traceFile` path.
 * @returns {Array<object>} All trace entries, sorted ascending by `timestamp`.
 */
function collectTraceFromFiles(twinPaths) {
  const allTraces = [];
  for (const [name, paths] of Object.entries(twinPaths)) {
    try {
      if (!existsSync(paths.traceFile)) {
        debug(`Trace file not found for twin "${name}"`);
        continue;
      }
      const entries = JSON.parse(readFileSync2(paths.traceFile, "utf-8"));
      // Spreading a non-array (e.g. a JSON string) would push garbage such as
      // individual characters, so anything that is not an array is rejected.
      if (!Array.isArray(entries)) {
        debug(`Could not parse trace file for twin "${name}": expected a JSON array`);
        continue;
      }
      // Push one at a time: push(...entries) passes every entry as a call
      // argument and can overflow the stack on very large traces.
      for (const entry of entries) {
        allTraces.push(entry);
      }
    } catch (err) {
      const msg = err instanceof Error ? err.message : String(err);
      debug(`Could not parse trace file for twin "${name}": ${msg}`);
    }
  }
  allTraces.sort((a, b) => new Date(a.timestamp).getTime() - new Date(b.timestamp).getTime());
  return allTraces;
}
|
|
861
|
+
/**
 * Best-effort removal of the temporary files created for a run.
 *
 * For each known path both the file and its `.tmp` sibling are removed.
 * Failures are deliberately ignored: cleanup must never fail a run.
 *
 * @param {string} mcpConfigPath - Path of the generated MCP config.
 * @param {object} twinPaths - Map of twin name to `{ stateFile, traceFile }`.
 * @param {object} [seedPaths] - Optional map of twin name to `{ stateFile }`.
 */
function cleanupTempFiles(mcpConfigPath, twinPaths, seedPaths) {
  const withTmpSibling = (file) => [file, file + ".tmp"];
  const targets = [
    ...withTmpSibling(mcpConfigPath),
    ...Object.values(twinPaths).flatMap((entry) => [
      ...withTmpSibling(entry.stateFile),
      ...withTmpSibling(entry.traceFile)
    ]),
    ...(seedPaths ? Object.values(seedPaths).flatMap((entry) => withTmpSibling(entry.stateFile)) : [])
  ];
  targets.forEach((target) => {
    try {
      if (existsSync(target)) {
        unlinkSync(target);
      }
    } catch {
      // Intentionally ignored: a leftover temp file is harmless.
    }
  });
}
|
|
879
|
+
/**
 * Splits a command line into arguments, honouring single and double quotes.
 *
 * Quotes group characters (including whitespace) into one argument and are
 * themselves removed; adjacent quoted and unquoted parts are joined
 * (`a"b c"d` becomes `ab cd`). Backslash escapes are not interpreted, and an
 * unterminated quote extends to the end of the input.
 *
 * Any whitespace character separates arguments, and an explicitly quoted
 * empty string (`""` or `''`) is kept as an empty argument.
 *
 * @param {string} cmd - Raw command line.
 * @returns {string[]} Parsed arguments; empty for blank input.
 */
function splitCommand(cmd) {
  const tokens = [];
  let current = "";
  // True once the current token has started, even if it is still empty
  // (which is the case right after an opening quote).
  let tokenStarted = false;
  let quoteChar = "";
  for (const ch of cmd) {
    if (quoteChar) {
      if (ch === quoteChar) {
        quoteChar = "";
      } else {
        current += ch;
      }
    } else if (ch === '"' || ch === "'") {
      quoteChar = ch;
      tokenStarted = true;
    } else if (/\s/.test(ch)) {
      if (tokenStarted) tokens.push(current);
      current = "";
      tokenStarted = false;
    } else {
      current += ch;
      tokenStarted = true;
    }
  }
  if (tokenStarted) tokens.push(current);
  return tokens;
}
|
|
900
|
+
/**
 * Determines how to launch the agent. Sources are tried in this order:
 *   1. the explicit `agentCommand` string,
 *   2. the `agent` section of the JSON project config file,
 *   3. the ARCHAL_AGENT_COMMAND environment variable.
 *
 * A command string with no usable executable (for example only whitespace)
 * is treated as absent, so the next source is tried instead of returning a
 * config whose `command` is undefined.
 *
 * @param {string} [agentCommand] - Command line given by the caller.
 * @param {string} [projectConfigPath] - Path to a JSON project config.
 * @returns {object|null} `{ command, args, env? }`, or null when nothing is configured.
 */
function resolveAgentConfig(agentCommand, projectConfigPath) {
  // Turns a raw command line into a config, or null when it has no executable.
  const fromCommandLine = (commandLine) => {
    const [command, ...args] = splitCommand(commandLine);
    return command ? { command, args } : null;
  };

  if (agentCommand) {
    const explicit = fromCommandLine(agentCommand);
    if (explicit) return explicit;
  }

  if (projectConfigPath) {
    try {
      const config = JSON.parse(readFileSync2(projectConfigPath, "utf-8"));
      // `config` may legitimately be null or a non-object JSON value.
      const agent = config?.agent;
      if (agent?.command) {
        return {
          command: agent.command,
          args: agent.args ?? [],
          env: agent.env
        };
      }
    } catch (err) {
      const message = err instanceof Error ? err.message : String(err);
      error(`Failed to load agent config from ${projectConfigPath}: ${message}`);
    }
  }

  const envCommand = process.env["ARCHAL_AGENT_COMMAND"];
  if (envCommand) {
    const fromEnv = fromCommandLine(envCommand);
    if (fromEnv) return fromEnv;
  }
  return null;
}
|
|
934
|
+
|
|
935
|
+
// src/runner/reporter.ts
|
|
936
|
+
import { readFileSync as readFileSync3, existsSync as existsSync2 } from "fs";
|
|
937
|
+
import { createRequire as createRequire2 } from "module";
|
|
938
|
+
import { dirname, resolve as resolve2 } from "path";
|
|
939
|
+
import { fileURLToPath as fileURLToPath2 } from "url";
|
|
940
|
+
// Directory containing this module, derived from the module URL.
var __dirname2 = fileURLToPath2(new URL(".", import.meta.url));

// ANSI escape sequences used to style reporter output.
var RESET2 = "\u001B[0m";
var BOLD2 = "\u001B[1m";
var DIM2 = "\u001B[2m";
var GREEN = "\u001B[32m";
var RED = "\u001B[31m";
var YELLOW = "\u001B[33m";
var CYAN = "\u001B[36m";
|
|
948
|
+
/**
 * Writes the run banner to stderr: the scenario title followed by one line
 * per twin and the seed selected for it.
 *
 * @param {string} scenarioTitle - Title shown after the product name.
 * @param {Array<object>} seedSelections - Items with `twinName` and `seedName`.
 */
function printHeader(scenarioTitle, seedSelections) {
  const banner = `${CYAN}${BOLD2}archal${RESET2} ${DIM2}\u2014${RESET2} ${scenarioTitle}`;
  process.stderr.write(`
${banner}
`);
  for (const { twinName, seedName } of seedSelections) {
    const twinLine = ` ${DIM2}twin:${RESET2} ${twinName} ${DIM2}(seed: ${seedName})${RESET2}`;
    process.stderr.write(`${twinLine}
`);
  }
}
|
|
957
|
+
/**
 * Writes a one-line progress entry for a finished run to stderr.
 *
 * With `error2` set, the line shows ERROR and the message (cut to 60
 * characters); otherwise it shows the score as a coloured percentage.
 *
 * @param {number} runIndex - Zero-based index of the run.
 * @param {number} totalRuns - Total number of runs.
 * @param {number} score - Score in percent; only read when there is no error.
 * @param {string} [error2] - Error message for a failed run.
 */
function printRunProgress(runIndex, totalRuns, score, error2) {
  const runNumber = runIndex + 1;
  // Leader dots shrink as the counters get wider, so the result column stays aligned.
  const leaderLength = Math.max(1, 20 - String(runNumber).length - String(totalRuns).length);
  const prefix = ` run ${runNumber}/${totalRuns} ${DIM2}${".".repeat(leaderLength)}${RESET2} `;

  if (error2) {
    const shortError = error2.length > 60 ? error2.slice(0, 59) + "\u2026" : error2;
    process.stderr.write(`${prefix}${RED}ERROR${RESET2} ${DIM2}(${shortError})${RESET2}
`);
    return;
  }

  let colour = RED;
  if (score >= 100) {
    colour = GREEN;
  } else if (score >= 50) {
    colour = YELLOW;
  }
  process.stderr.write(`${prefix}${colour}${score.toFixed(0)}%${RESET2}
`);
}
|
|
970
|
+
/**
 * Builds the "actions" section of the terminal report from the first run's
 * trace: one line per distinct tool (in order of first use) and a final
 * total.
 *
 * @param {object} report - Report whose `runs[0].trace` holds entries with a `toolName`.
 * @returns {string[]} Formatted lines; empty when there is no first run or its trace is empty.
 */
function formatTraceSummary(report) {
  const firstRun = report.runs[0];
  if (!firstRun || firstRun.trace.length === 0) return [];
  const trace = firstRun.trace;

  // Group the calls per tool in a single pass instead of re-filtering the
  // whole trace once for every distinct tool.
  const callsByTool = /* @__PURE__ */ new Map();
  for (const entry of trace) {
    const calls = callsByTool.get(entry.toolName);
    if (calls) {
      calls.push(entry);
    } else {
      callsByTool.set(entry.toolName, [entry]);
    }
  }

  const lines = [` ${DIM2}actions:${RESET2}`];
  for (const [tool, calls] of callsByTool) {
    const summary = summarizeToolCalls(tool, calls);
    // Fall back to a plain "tool ×count" line when no summary text is produced.
    lines.push(
      summary ? ` ${DIM2}\u2192${RESET2} ${summary}` : ` ${DIM2}\u2192${RESET2} ${tool} \xD7${calls.length}`
    );
  }
  lines.push(` ${DIM2}\u2192 ${trace.length} total API calls${RESET2}`);
  return lines;
}
|
|
994
|
+
/**
 * Produces a short human-readable summary of all calls to one tool.
 *
 * The tool name is shown with `_` and `-` replaced by spaces. When between
 * one and six calls carry an identifying input (`issue_number`, `number`,
 * `id`, `channel` or `name`, in that order of preference) the identifiers
 * are listed; numeric ones are prefixed with `#`.
 *
 * @param {string} toolName - Name of the tool.
 * @param {Array<object>} entries - Trace entries for that tool; `input` may be missing.
 * @returns {string} Summary such as `create issue ×2 (#5, #7)`.
 */
function summarizeToolCalls(toolName, entries) {
  const count = entries.length;
  const ids = [];
  for (const entry of entries) {
    // A call recorded without input must not crash the report.
    const input = entry.input ?? {};
    const id = input["issue_number"] ?? input["number"] ?? input["id"] ?? input["channel"] ?? input["name"];
    if (id !== void 0) {
      ids.push(id);
    }
  }
  const toolLabel = toolName.replace(/[_-]/g, " ");
  if (ids.length > 0 && ids.length <= 6) {
    const idStr = ids.map((id) => typeof id === "number" ? `#${id}` : String(id)).join(", ");
    return `${toolLabel} \xD7${count} (${idStr})`;
  }
  if (count === 1) {
    return toolLabel;
  }
  return `${toolLabel} \xD7${count}`;
}
|
|
1014
|
+
/**
 * Renders a report in the requested output format.
 *
 * @param {object} report - Report to render.
 * @param {string} format - One of "terminal", "json" or "junit".
 * @returns {string|undefined} Rendered report; undefined for any other format.
 */
function generateReport(report, format) {
  if (format === "terminal") {
    return formatTerminal(report);
  }
  if (format === "json") {
    return formatJson(report);
  }
  if (format === "junit") {
    return formatJunit(report);
  }
  return void 0;
}
|
|
1024
|
+
/**
 * Builds the "twin fidelity" section of the terminal report.
 *
 * For each twin a `fidelity.json` is looked up first at
 * `<module dir>/../../twins/<name>/fidelity.json`, then next to the resolved
 * `@archal/twin-<name>` package. Twins without such a file, or whose file
 * cannot be read, are skipped silently.
 *
 * @param {string[]} twinNames - Names of the twins used in the run.
 * @returns {string[]} Formatted lines: a header per twin followed by its capabilities.
 */
function loadTwinFidelity(twinNames) {
  // Returns the path of the twin's fidelity file, or null when none is found.
  const locateFidelityFile = (twin) => {
    const inRepo = resolve2(__dirname2, "..", "..", "twins", twin, "fidelity.json");
    if (existsSync2(inRepo)) {
      return inRepo;
    }
    try {
      const twinEntry = createRequire2(import.meta.url).resolve(`@archal/twin-${twin}`);
      const besidePackage = resolve2(dirname(twinEntry), "..", "fidelity.json");
      return existsSync2(besidePackage) ? besidePackage : null;
    } catch {
      // The twin package is not installed; there is nothing to report for it.
      return null;
    }
  };

  const lines = [];
  for (const name of twinNames) {
    try {
      const fidelityPath = locateFidelityFile(name);
      if (!fidelityPath) continue;
      const data = JSON.parse(readFileSync3(fidelityPath, "utf-8"));
      lines.push(` ${DIM2}twin fidelity:${RESET2} ${data.twin} v${data.version}`);
      for (const cap of data.capabilities) {
        const icon = cap.supported ? `${GREEN}\u2713${RESET2}` : `${DIM2}\u2717${RESET2}`;
        lines.push(` ${icon} ${DIM2}${cap.name}${RESET2}`);
      }
    } catch {
      // Fidelity information is optional; an unreadable file is ignored.
    }
  }
  return lines;
}
|
|
1057
|
+
/**
 * Renders the human-readable terminal report.
 *
 * Layout: optional trace summary, one line per criterion (label, dot leader,
 * pass/warn/fail with the pass ratio across runs), the overall satisfaction
 * score, and optional twin fidelity details. For probabilistic criteria that
 * did not pass in every run, the explanation of each non-passing run is
 * listed below the criterion.
 *
 * The criteria shown are those evaluated in the first run.
 *
 * @param {object} report - Report with `runs`, `satisfactionScore` and optional
 *   `criterionDescriptions`, `criterionTypes` and `twinNames`.
 * @returns {string} Report text, lines joined with "\n".
 */
function formatTerminal(report) {
  const runs = report.runs;
  const totalRuns = runs.length;
  const output = [];

  const traceSummary = formatTraceSummary(report);
  if (traceSummary.length > 0) {
    output.push("", ...traceSummary);
  }
  output.push("");

  const criterionIds = totalRuns > 0 ? runs[0].evaluations.map((evaluation) => evaluation.criterionId) : [];
  const evaluationFor = (run, criterionId) => run.evaluations.find((e) => e.criterionId === criterionId);
  // Cuts `text` so that, including the ellipsis, it is `limit` characters long.
  const ellipsize = (text, limit) => text.slice(0, limit - 1) + "\u2026";

  for (const criterionId of criterionIds) {
    const passCount = runs.filter((run) => evaluationFor(run, criterionId)?.status === "pass").length;
    const allPassed = passCount === totalRuns;
    const description = report.criterionDescriptions?.[criterionId] ?? criterionId;

    // Width bookkeeping: the status column is sized for the widest case
    // ("pass N/N") so every row lines up.
    const termWidth = process.stdout.columns ?? 80;
    const statusWidth = `pass ${totalRuns}/${totalRuns}`.length;
    const maxLabelLen = Math.max(20, termWidth - (2 + statusWidth) - 4);

    let label = description;
    if (description.length > maxLabelLen) {
      // Prefer cutting at a word boundary unless that would drop too much text.
      const wordBreak = description.lastIndexOf(" ", maxLabelLen - 1);
      label = wordBreak > maxLabelLen * 0.6 ? description.slice(0, wordBreak) + "\u2026" : ellipsize(description, maxLabelLen);
    }

    const dotCount = Math.max(2, termWidth - 2 - label.length - 1 - statusWidth);
    const dots = `${DIM2}${".".repeat(dotCount)}${RESET2}`;

    let colour = YELLOW;
    let verdict = "warn";
    if (allPassed) {
      colour = GREEN;
      verdict = "pass";
    } else if (passCount === 0) {
      colour = RED;
      verdict = "fail";
    }
    output.push(` ${label} ${dots} ${colour}${verdict} ${passCount}/${totalRuns}${RESET2}`);

    if (allPassed || report.criterionTypes?.[criterionId] !== "probabilistic") continue;
    for (const run of runs) {
      const ev = evaluationFor(run, criterionId);
      if (!ev || ev.status === "pass" || !ev.explanation) continue;
      const maxExplLen = Math.max(40, termWidth - 12);
      const shown = ev.explanation.length > maxExplLen ? ellipsize(ev.explanation, maxExplLen) : ev.explanation;
      output.push(` ${DIM2}run ${run.runIndex + 1}: ${shown}${RESET2}`);
    }
  }

  output.push("");
  let scoreColour = RED;
  if (report.satisfactionScore >= 80) {
    scoreColour = GREEN;
  } else if (report.satisfactionScore >= 50) {
    scoreColour = YELLOW;
  }
  output.push(` ${BOLD2}satisfaction:${RESET2} ${scoreColour}${BOLD2}${report.satisfactionScore.toFixed(1)}%${RESET2} ${DIM2}(${totalRuns} runs)${RESET2}`);

  if (report.twinNames && report.twinNames.length > 0) {
    const fidelityLines = loadTwinFidelity(report.twinNames);
    if (fidelityLines.length > 0) {
      output.push("", ...fidelityLines);
    }
  }
  output.push("");
  return output.join("\n");
}
|
|
1130
|
+
/**
 * Serialises the report as pretty-printed JSON (two-space indentation).
 *
 * @param {object} report - Report to serialise.
 * @returns {string} JSON text.
 */
function formatJson(report) {
  const indentWidth = 2;
  const serialized = JSON.stringify(report, null, indentWidth);
  return serialized;
}
|
|
1133
|
+
/**
 * Escapes the five XML special characters so `text` can be embedded in
 * element content or in a quoted attribute value.
 *
 * Non-string input is converted with String(); null and undefined become the
 * empty string so a missing explanation cannot crash report generation.
 *
 * @param {*} text - Value to escape.
 * @returns {string} Text with `& < > " '` replaced by their predefined entities.
 */
function escapeXml(text) {
  const entities = {
    "&": "&amp;",
    "<": "&lt;",
    ">": "&gt;",
    '"': "&quot;",
    "'": "&apos;"
  };
  // A single pass means an ampersand produced by one replacement is never escaped twice.
  return String(text ?? "").replace(/[&<>"']/g, (ch) => entities[ch]);
}
|
|
1136
|
+
/**
 * Renders the report as JUnit-style XML: one `<testsuite>` per run and one
 * `<testcase>` per evaluated criterion.
 *
 * A run that ended with an error additionally gets a failing
 * "agent-execution" testcase. That testcase is included in the `tests` and
 * `failures` counts of its suite and of the document, so the attributes
 * always agree with the testcases actually emitted.
 *
 * @param {object} report - Report with `scenarioTitle` and `runs`; each run has
 *   `runIndex`, `durationMs`, `evaluations` and an optional `error`.
 * @returns {string} XML document, lines joined with "\n".
 */
function formatJunit(report) {
  // Per-run counters, computed once and reused for the suite and document totals.
  const suites = report.runs.map((run) => {
    const executionCase = run.error ? 1 : 0;
    return {
      run,
      tests: run.evaluations.length + executionCase,
      failures: run.evaluations.filter((e) => e.status === "fail").length + executionCase,
      time: (run.durationMs / 1e3).toFixed(3)
    };
  });

  let totalTests = 0;
  let totalFailures = 0;
  let totalTime = 0;
  for (const suite of suites) {
    totalTests += suite.tests;
    totalFailures += suite.failures;
    totalTime += suite.run.durationMs;
  }

  const lines = [];
  lines.push('<?xml version="1.0" encoding="UTF-8"?>');
  lines.push(
    `<testsuites name="${escapeXml(report.scenarioTitle)}" tests="${totalTests}" failures="${totalFailures}" time="${(totalTime / 1e3).toFixed(3)}">`
  );
  for (const { run, tests, failures, time } of suites) {
    lines.push(
      ` <testsuite name="Run ${run.runIndex + 1}" tests="${tests}" failures="${failures}" time="${time}">`
    );
    if (run.error) {
      lines.push(
        ` <testcase name="agent-execution" time="${time}">`
      );
      lines.push(
        ` <failure message="${escapeXml(run.error)}" type="ExecutionError">${escapeXml(run.error)}</failure>`
      );
      lines.push(" </testcase>");
    }
    for (const evaluation of run.evaluations) {
      const testName = escapeXml(evaluation.criterionId);
      lines.push(` <testcase name="${testName}" time="0.000">`);
      if (evaluation.status === "fail") {
        lines.push(
          ` <failure message="${escapeXml(evaluation.explanation)}" type="CriterionFailed">${escapeXml(evaluation.explanation)}</failure>`
        );
      } else if (evaluation.status === "partial") {
        lines.push(
          ` <system-out>PARTIAL: ${escapeXml(evaluation.explanation)} (confidence: ${(evaluation.confidence * 100).toFixed(0)}%)</system-out>`
        );
      }
      lines.push(" </testcase>");
    }
    lines.push(" </testsuite>");
  }
  lines.push("</testsuites>");
  return lines.join("\n");
}
|
|
1185
|
+
/**
 * Renders the report in the given format and writes it, followed by a
 * newline, to stdout.
 *
 * @param {object} report - Report to print.
 * @param {string} format - Output format understood by generateReport.
 */
function printReport(report, format) {
  const rendered = generateReport(report, format);
  process.stdout.write(`${rendered}\n`);
}
|
|
1189
|
+
|
|
1190
|
+
// src/evaluator/deterministic.ts
|
|
1191
|
+
/**
 * Removes parenthesised remarks from a sentence.
 *
 * Every "(...)" group, together with the whitespace around it, is replaced
 * by a single space, and the result is trimmed.
 *
 * @param {string} text - Input sentence.
 * @returns {string} Sentence without parentheticals.
 */
function stripParenthetical(text) {
  const parenthetical = /\s*\(.*?\)\s*/;
  // Splitting on the pattern and re-joining with a space replaces every occurrence.
  const pieces = text.split(parenthetical);
  return pieces.join(" ").trim();
}
|
|
1194
|
+
/**
 * Reduces a predicate phrase to its leading part.
 *
 * Parenthetical remarks are stripped first; then everything from the first
 * " on / in / to / from / of <something>" to the end is dropped, so that
 * "closed in the repo" becomes "closed".
 *
 * @param {string} pred - Raw predicate text.
 * @returns {string} Cleaned, trimmed predicate.
 */
function cleanPredicate(pred) {
  const trailingPhrase = /\s+(?:on|in|to|from|of)\s+(?:the\s+)?.+$/;
  return stripParenthetical(pred).replace(trailingPhrase, "").trim();
}
|
|
1199
|
+
/**
 * Parses a natural-language success criterion into a structured assertion.
 *
 * Parenthetical remarks are removed and the text is lower-cased before
 * matching. Patterns are tried in a fixed order and the first match wins:
 *   1. "no <subject> labeled <label> are <predicate>"      -> no_matching
 *   2. "exactly N ...", "at least N ...",
 *      "at most N ..." / "no more than N ..."              -> exact/min/max_count
 *      (each first with a verb and predicate, then without)
 *   3. "fewer than N ..." / "more than N ..."              -> max_count N-1 / min_count N+1
 *   4. "N <subject> are <predicate>"                       -> exact_count
 *   5. "no errors in trace/log/output"                     -> no_errors
 *   6. "<subject> is/was <state>"                          -> state_check
 *   7. "<subject> exists / was created"                    -> exists
 *   8. "no/zero/none <subject> [remain ...]"               -> exact_count 0
 *
 * Every predicate captured by the counting patterns is passed through
 * cleanPredicate, so a trailing phrase such as "in the repo" does not end up
 * in the predicate.
 *
 * @param {string} description - Criterion text.
 * @returns {object|null} Assertion with `type`, `subject` and, depending on the
 *   type, `value`, `predicate`, `labelFilter`; null when nothing matches.
 */
function parseAssertion(description) {
  const lower = stripParenthetical(description).toLowerCase().trim();
  // Trimmed capture group, or "" when the group did not participate.
  const group = (match, index) => match[index]?.trim() ?? "";
  const count = (match, fallback = "0") => parseInt(match[1] ?? fallback, 10);

  const noLabeledMatch = lower.match(/^no\s+(.+?)\s+labeled\s+["']?([^"']+?)["']?\s+(?:are|were|is|was|should be)\s+(.+)$/);
  if (noLabeledMatch) {
    return {
      type: "no_matching",
      subject: group(noLabeledMatch, 1),
      predicate: cleanPredicate(group(noLabeledMatch, 3)),
      labelFilter: noLabeledMatch[2]?.trim()
    };
  }

  // Counting forms. For each quantifier the variant with a verb and predicate
  // is tried before the bare "<quantifier> N <subject>" variant.
  const countForms = [
    {
      type: "exact_count",
      withVerb: /^exactly\s+(\d+)\s+(.+?)\s+(?:are|were|is|was|should be)\s+(.+)$/,
      withoutVerb: /^exactly\s+(\d+)\s+(.+)$/
    },
    {
      type: "min_count",
      withVerb: /^at\s+least\s+(\d+)\s+(.+?)\s+(?:are|were|is|was|should be)\s+(.+)$/,
      withoutVerb: /^at\s+least\s+(\d+)\s+(.+)$/
    },
    {
      type: "max_count",
      withVerb: /^(?:at\s+most|no\s+more\s+than)\s+(\d+)\s+(.+?)\s+(?:are|were|is|was|should be)\s+(.+)$/,
      withoutVerb: /^(?:at\s+most|no\s+more\s+than)\s+(\d+)\s+(.+)$/
    }
  ];
  for (const { type, withVerb, withoutVerb } of countForms) {
    const verbMatch = lower.match(withVerb);
    if (verbMatch) {
      return {
        type,
        subject: group(verbMatch, 2),
        value: count(verbMatch),
        predicate: cleanPredicate(group(verbMatch, 3))
      };
    }
    const bareMatch = lower.match(withoutVerb);
    if (bareMatch) {
      return {
        type,
        subject: group(bareMatch, 2),
        value: count(bareMatch)
      };
    }
  }

  const fewerMatch = lower.match(/^fewer\s+than\s+(\d+)\s+(.+)/);
  if (fewerMatch) {
    return {
      type: "max_count",
      subject: group(fewerMatch, 2),
      value: count(fewerMatch, "1") - 1
    };
  }
  const moreMatch = lower.match(/^more\s+than\s+(\d+)\s+(.+)/);
  if (moreMatch) {
    return {
      type: "min_count",
      subject: group(moreMatch, 2),
      value: count(moreMatch) + 1
    };
  }

  const numSubjectMatch = lower.match(/^(\d+)\s+(.+?)\s+(?:are|were|is|was)\s+(.+)$/);
  if (numSubjectMatch) {
    return {
      type: "exact_count",
      subject: group(numSubjectMatch, 2),
      value: count(numSubjectMatch),
      predicate: cleanPredicate(group(numSubjectMatch, 3))
    };
  }

  if (/^no\s+errors?\s+(in\s+)?(trace|log|output)/i.test(lower)) {
    return { type: "no_errors", subject: "trace" };
  }

  const stateMatch = lower.match(/^(?:the\s+)?(.+?)\s+(?:is|was|has been|should be)\s+(created|merged|closed|open|deleted|removed|resolved|approved|rejected)/);
  if (stateMatch) {
    return {
      type: "state_check",
      subject: group(stateMatch, 1),
      predicate: stateMatch[2]?.trim()
    };
  }

  const existsMatch = lower.match(/^(?:the\s+)?(.+?)\s+(?:exists?|is present|was created|has been created)/);
  if (existsMatch) {
    return { type: "exists", subject: group(existsMatch, 1) };
  }

  const noneMatch = lower.match(/^(?:no|zero|none)\s+(.+?)(?:\s+(?:remain|exist|left|present|found))?\s*$/);
  if (noneMatch) {
    return {
      type: "exact_count",
      subject: group(noneMatch, 1),
      value: 0
    };
  }
  return null;
}
|
|
1311
|
+
/**
 * Merges the per-twin state objects into a single map of entity collections.
 *
 * Only array-valued properties of each twin's state are kept. Collections
 * that share a name across twins are concatenated in twin order. When no
 * twin contributes a collection, the input is returned unchanged.
 *
 * @param {object} state - Map of twin name to that twin's state object.
 * @returns {object} Map of entity name to array, or `state` itself when nothing was collected.
 */
function flattenTwinState(state) {
  const isRecord = (candidate) => typeof candidate === "object" && candidate !== null && !Array.isArray(candidate);
  const merged = {};
  for (const twinData of Object.values(state)) {
    if (!isRecord(twinData)) continue;
    for (const [entityName, entityData] of Object.entries(twinData)) {
      if (!Array.isArray(entityData)) continue;
      const existing = merged[entityName];
      merged[entityName] = Array.isArray(existing) ? [...existing, ...entityData] : entityData;
    }
  }
  return Object.keys(merged).length === 0 ? state : merged;
}
|
|
1335
|
+
/**
 * Finds the collection in the twin state that a criterion's subject refers to.
 *
 * Matching ignores case and whitespace and is attempted in decreasing order
 * of precision across all keys, so a precise match on any key always beats a
 * looser match on an earlier key:
 *   1. the key equals the subject, or the subject plus "s";
 *   2. the key contains the subject or the subject contains the key
 *      (array values only);
 *   3. the key contains any single word of the subject.
 *
 * An empty subject matches nothing.
 *
 * @param {string} subject - Subject phrase, e.g. "open issues".
 * @param {object} state - Twin state; flattened with flattenTwinState before lookup.
 * @returns {Array|null} Matching items (a single object is wrapped in an array), or null.
 */
function resolveSubjectInState(subject, state) {
  const flat = flattenTwinState(state);
  const normalize = (text) => text.replace(/\s+/g, "").toLowerCase();
  const normalizedSubject = normalize(subject);
  // "".includes("") is true for every key, so an empty subject would otherwise match everything.
  if (normalizedSubject === "") return null;

  const candidates = Object.entries(flat).map(([key, value]) => ({
    key,
    value,
    normalizedKey: normalize(key)
  }));

  // Pass 1: exact or simple-plural key match.
  for (const { normalizedKey, value } of candidates) {
    if (normalizedKey === normalizedSubject || normalizedKey === normalizedSubject + "s") {
      if (Array.isArray(value)) return value;
      if (typeof value === "object" && value !== null) return [value];
    }
  }

  // Pass 2: substring match in either direction, collections only.
  for (const { normalizedKey, value } of candidates) {
    if (!Array.isArray(value)) continue;
    if (normalizedSubject.includes(normalizedKey) || normalizedKey.includes(normalizedSubject)) {
      return value;
    }
  }

  // Pass 3: any word of the subject appears in the key.
  const subjectWords = subject.toLowerCase().split(/\s+/).filter((word) => word.length > 0);
  for (const { key, value } of candidates) {
    if (typeof value !== "object" || value === null) continue;
    const lowerKey = key.toLowerCase();
    if (subjectWords.some((word) => lowerKey.includes(word))) {
      return Array.isArray(value) ? value : Object.values(value);
    }
  }
  return null;
}
|
|
1360
|
+
/**
 * Keeps the items that satisfy a state predicate such as "closed" or "merged".
 *
 * An item matches when its `state` or `status` equals the predicate
 * (case-insensitively), when the predicate is one of merged / closed /
 * deleted / resolved and the item's property of that name is `true`, or
 * when the predicate is "created" and the item has an `id`. Non-object items
 * never match.
 *
 * @param {Array} items - Candidate items.
 * @param {string} predicate - Predicate word.
 * @returns {Array} Matching items, in their original order.
 */
function filterByPredicate(items, predicate) {
  const wanted = predicate.toLowerCase();
  // Predicates that can also be expressed as a boolean property of the same name.
  const flagPredicates = /* @__PURE__ */ new Set(["merged", "closed", "deleted", "resolved"]);
  const fieldEquals = (record, field) => record[field] !== void 0 && String(record[field]).toLowerCase() === wanted;

  return items.filter((item) => {
    if (item === null || typeof item !== "object") return false;
    if (fieldEquals(item, "state") || fieldEquals(item, "status")) return true;
    if (flagPredicates.has(wanted)) return item[wanted] === true;
    if (wanted === "created") return item["id"] !== void 0;
    return false;
  });
}
|
|
1380
|
+
/**
 * Counts the trace entries that recorded an error.
 *
 * An entry counts as failed only when its `error` field holds a value; both
 * `null` and a missing (`undefined`) field mean "no error".
 *
 * @param {Array<object>} trace - Trace entries.
 * @returns {number} Number of entries with an error.
 */
function countTraceErrors(trace) {
  return trace.filter((entry) => entry.error !== null && entry.error !== void 0).length;
}
|
|
1383
|
+
/**
 * Evaluates a deterministic criterion against the recorded twin state.
 *
 * The criterion description is parsed into an assertion, which is then
 * checked by type:
 *   - no_errors:   the trace contains no errored entries;
 *   - *_count:     number of matching items in the "after" state. With a
 *                  predicate and a resolvable "before" state, only the items
 *                  that newly match are counted (never below zero). When the
 *                  subject is absent from the "after" state, `diff.added`
 *                  is used instead;
 *   - no_matching: no item with the given label satisfies the predicate;
 *   - exists / not_exists / state_check: presence or state of the subject;
 *   - comparison:  not implemented, always fails with low confidence.
 *
 * @param {object} criterion - Criterion with `id` and `description`.
 * @param {object} stateView - Object with `before`, `after`, `diff.added` and `trace`.
 * @returns {object|undefined} `{ criterionId, status, confidence, explanation }`;
 *   undefined for an assertion type that is not handled.
 */
function evaluateDeterministic(criterion, stateView) {
  const verdict = (status, confidence, explanation) => ({
    criterionId: criterion.id,
    status,
    confidence,
    explanation
  });

  const assertion = parseAssertion(criterion.description);
  if (!assertion) {
    debug(`Could not parse deterministic assertion: "${criterion.description}"`);
    return verdict("fail", 0.5, `Could not parse deterministic assertion from: "${criterion.description}"`);
  }
  debug("Parsed assertion", {
    type: assertion.type,
    subject: assertion.subject,
    value: String(assertion.value ?? ""),
    predicate: assertion.predicate ?? ""
  });

  const { type, subject, predicate, labelFilter } = assertion;
  const itemsAfter = () => resolveSubjectInState(subject, stateView.after);
  const countVerdict = (actual) => evaluateCount(criterion.id, type, assertion.value ?? 0, actual, subject, predicate);

  const checkNoErrors = () => {
    const errorCount = countTraceErrors(stateView.trace);
    return errorCount === 0
      ? verdict("pass", 1, "No errors found in trace")
      : verdict("fail", 1, `Found ${errorCount} error(s) in trace`);
  };

  const checkCount = () => {
    const afterItems = itemsAfter();
    if (afterItems === null) {
      // The subject is unknown in the final state; fall back to what the diff reports as added.
      const addedItems = stateView.diff.added[subject];
      if (addedItems) {
        return countVerdict(addedItems.length);
      }
      return verdict("fail", 0.5, `Could not find "${subject}" in twin state`);
    }
    if (!predicate) {
      return countVerdict(afterItems.length);
    }
    const beforeItems = resolveSubjectInState(subject, stateView.before);
    const afterFiltered = filterByPredicate(afterItems, predicate);
    if (!beforeItems) {
      return countVerdict(afterFiltered.length);
    }
    // Count only the items that started matching during the run.
    const beforeFiltered = filterByPredicate(beforeItems, predicate);
    return countVerdict(Math.max(0, afterFiltered.length - beforeFiltered.length));
  };

  const carriesLabel = (item) => {
    if (typeof item !== "object" || item === null) return false;
    const labels = item["labels"];
    if (!Array.isArray(labels)) return false;
    // Labels may be plain strings or objects with a `name`.
    return labels.some((l) => {
      const labelName = typeof l === "string" ? l : l?.["name"];
      return String(labelName).toLowerCase() === labelFilter?.toLowerCase();
    });
  };

  const checkNoMatching = () => {
    const items = itemsAfter();
    if (!items) {
      return verdict("fail", 0.5, `Could not find "${subject}" in twin state`);
    }
    const labelFiltered = labelFilter ? items.filter(carriesLabel) : items;
    const matching = predicate ? filterByPredicate(labelFiltered, predicate) : labelFiltered;
    return matching.length === 0
      ? verdict("pass", 1, `No ${subject} labeled "${labelFilter}" are ${predicate}`)
      : verdict("fail", 1, `Found ${matching.length} ${subject} labeled "${labelFilter}" that are ${predicate}`);
  };

  const checkExists = () => {
    const items = itemsAfter();
    return items !== null && items.length > 0
      ? verdict("pass", 1, `"${subject}" exists in twin state`)
      : verdict("fail", 1, `"${subject}" not found in twin state`);
  };

  const checkNotExists = () => {
    const items = itemsAfter();
    return items === null || items.length === 0
      ? verdict("pass", 1, `"${subject}" does not exist in twin state`)
      : verdict("fail", 1, `"${subject}" still exists in twin state`);
  };

  const checkState = () => {
    const items = itemsAfter();
    if (!items || items.length === 0) {
      return verdict("fail", 0.8, `Could not find "${subject}" in twin state to check status`);
    }
    const matching = predicate ? filterByPredicate(items, predicate) : items;
    return matching.length > 0
      ? verdict("pass", 1, `"${subject}" is ${predicate ?? "in expected state"}`)
      : verdict("fail", 1, `"${subject}" is not ${predicate ?? "in expected state"}`);
  };

  switch (type) {
    case "no_errors":
      return checkNoErrors();
    case "exact_count":
    case "min_count":
    case "max_count":
      return checkCount();
    case "no_matching":
      return checkNoMatching();
    case "exists":
      return checkExists();
    case "not_exists":
      return checkNotExists();
    case "state_check":
      return checkState();
    case "comparison":
      return verdict("fail", 0.3, `Comparison assertion type not fully implemented for: "${criterion.description}"`);
  }
}
|
|
1547
|
+
/**
 * Compares an observed count with the expected one for a counting assertion.
 *
 * @param {string} criterionId - Identifier copied into the result.
 * @param {string} type - "exact_count", "min_count" or "max_count".
 * @param {number} expected - Expected count or bound.
 * @param {number} actual - Observed count.
 * @param {string} subject - Subject used in the explanation.
 * @param {string} [predicate] - Predicate appended to the subject in the explanation.
 * @returns {object|undefined} `{ criterionId, status, confidence, explanation }`;
 *   undefined for any other type.
 */
function evaluateCount(criterionId, type, expected, actual, subject, predicate) {
  const subjectDesc = predicate ? `${subject} ${predicate}` : subject;

  let passed;
  let passText;
  let failText;
  if (type === "exact_count") {
    passed = actual === expected;
    passText = `Found exactly ${expected} ${subjectDesc}`;
    failText = `Expected exactly ${expected} ${subjectDesc}, found ${actual}`;
  } else if (type === "min_count") {
    passed = actual >= expected;
    passText = `Found ${actual} ${subjectDesc} (>= ${expected})`;
    failText = `Expected at least ${expected} ${subjectDesc}, found ${actual}`;
  } else if (type === "max_count") {
    passed = actual <= expected;
    passText = `Found ${actual} ${subjectDesc} (<= ${expected})`;
    failText = `Expected at most ${expected} ${subjectDesc}, found ${actual}`;
  } else {
    return void 0;
  }

  return {
    criterionId,
    status: passed ? "pass" : "fail",
    confidence: 1,
    explanation: passed ? passText : failText
  };
}
|
|
1579
|
+
|
|
1580
|
+
// src/evaluator/llm-judge.ts
|
|
1581
|
+
import Anthropic from "@anthropic-ai/sdk";
|
|
1582
|
+
var SYSTEM_PROMPT = `You are an evaluator for AI agent testing. You assess whether an agent successfully met a specific success criterion during a scenario run.
|
|
1583
|
+
|
|
1584
|
+
You will receive:
|
|
1585
|
+
1. A success criterion to evaluate
|
|
1586
|
+
2. The expected behavior description
|
|
1587
|
+
3. The state of the digital twin before the agent ran
|
|
1588
|
+
4. The state of the digital twin after the agent ran
|
|
1589
|
+
5. A diff of state changes
|
|
1590
|
+
6. The complete trace of tool calls the agent made
|
|
1591
|
+
|
|
1592
|
+
Your job is to determine if the criterion was met. Respond ONLY with valid JSON in this exact format:
|
|
1593
|
+
|
|
1594
|
+
{
|
|
1595
|
+
"status": "pass" | "fail" | "partial",
|
|
1596
|
+
"confidence": <number between 0 and 1>,
|
|
1597
|
+
"explanation": "<brief explanation of your assessment>"
|
|
1598
|
+
}
|
|
1599
|
+
|
|
1600
|
+
Rules:
|
|
1601
|
+
- "pass" means the criterion is clearly satisfied
|
|
1602
|
+
- "fail" means the criterion is clearly not satisfied
|
|
1603
|
+
- "partial" means the criterion is partially satisfied or the evidence is ambiguous
|
|
1604
|
+
- confidence is how certain you are in your assessment (1.0 = completely certain, 0.5 = uncertain)
|
|
1605
|
+
- Keep explanations concise (1-2 sentences)
|
|
1606
|
+
- Focus on observable evidence in the state and trace, not assumptions
|
|
1607
|
+
- If the criterion is about quality or helpfulness, assess based on content present in the state`;
|
|
1608
|
+
/**
 * Renders the user message sent to the LLM judge.
 *
 * The message is a sequence of markdown sections: criterion, expected behavior,
 * state before/after (condensed via summarizeState), state diff, and the tool-call trace.
 * Traces longer than 50 calls are cut down to the first 25 and last 25 calls
 * with a marker object in between.
 *
 * @param {object} context - Holds criterion, expectedBehavior, stateBefore, stateAfter, stateDiff and trace.
 * @returns {string} The prompt text.
 */
function buildUserPrompt(context) {
  const callCount = context.trace.length;
  const isTruncated = callCount > 50;
  const calls = context.trace.map((entry) => ({
    tool: entry.toolName,
    input: entry.input,
    output: entry.output,
    error: entry.error,
    durationMs: entry.durationMs
  }));
  let traceSummary = "";
  let traceToShow = calls;
  if (isTruncated) {
    traceSummary = `(Showing first 25 and last 25 of ${callCount} total calls)`;
    traceToShow = [
      ...calls.slice(0, 25),
      { note: `... ${callCount - 50} calls omitted ...` },
      ...calls.slice(-25)
    ];
  }
  // Each section is a template literal so that an undefined value renders as
  // the text "undefined" rather than being dropped by join().
  const sections = [
    `## Success Criterion\n${context.criterion.description}`,
    `## Expected Behavior\n${context.expectedBehavior}`,
    `## State Before\n${JSON.stringify(summarizeState(context.stateBefore), null, 2)}`,
    `## State After\n${JSON.stringify(summarizeState(context.stateAfter), null, 2)}`,
    `## State Diff\n${JSON.stringify(context.stateDiff, null, 2)}`,
    `## Agent Trace ${traceSummary}\n${JSON.stringify(traceToShow, null, 2)}`
  ];
  return sections.join("\n\n");
}
|
|
1636
|
+
/**
 * Produces a condensed copy of a twin state object for inclusion in a prompt.
 *
 * Top-level arrays with more than 10 items are replaced by an object holding
 * their length plus the first and last three items; every other value is
 * carried over unchanged (by reference).
 *
 * @param {object} state - State keyed by collection name.
 * @returns {object} New object with the same keys.
 */
function summarizeState(state) {
  const condensed = {};
  for (const field of Object.keys(state)) {
    const value = state[field];
    const isLongList = Array.isArray(value) && value.length > 10;
    condensed[field] = isLongList
      ? { _count: value.length, _first3: value.slice(0, 3), _last3: value.slice(-3) }
      : value;
  }
  return condensed;
}
|
|
1655
|
+
/**
 * Extracts the judge verdict from the raw LLM reply.
 *
 * The first "{" through the last "}" in the text is parsed as JSON. Any
 * problem (no braces, invalid JSON, unknown status) yields a "fail" verdict
 * with confidence 0.3. A numeric confidence is clamped to [0, 1]; a missing or
 * non-numeric one becomes 0.5. A non-string explanation gets a placeholder.
 *
 * @param {string} text - Raw text returned by the model.
 * @returns {{status: string, confidence: number, explanation: string}}
 */
function parseJudgeResponse(text) {
  const unusable = (explanation) => ({ status: "fail", confidence: 0.3, explanation });
  const candidate = text.match(/\{[\s\S]*\}/);
  if (!candidate) {
    warn("LLM judge did not return valid JSON, defaulting to fail");
    return unusable("Could not parse evaluator response");
  }
  let parsed;
  try {
    parsed = JSON.parse(candidate[0]);
  } catch {
    warn("Failed to parse LLM judge JSON response");
    return unusable("Could not parse evaluator response JSON");
  }
  const { status, confidence: rawConfidence, explanation: rawExplanation } = parsed;
  if (!["pass", "fail", "partial"].includes(status)) {
    return unusable(`Invalid status from evaluator: ${String(status)}`);
  }
  const confidence = typeof rawConfidence === "number" ? Math.max(0, Math.min(1, rawConfidence)) : 0.5;
  const explanation = typeof rawExplanation === "string" ? rawExplanation : "No explanation provided";
  return { status, confidence, explanation };
}
|
|
1687
|
+
// Anthropic SDK client, created on first use and then reused for every later call.
var clientInstance = null;
/**
 * Returns the shared Anthropic client, constructing it on the first call.
 *
 * Note: the API key is only used when the client is first created; later
 * calls return the existing client regardless of the key passed in.
 *
 * @param {string} apiKey - Key handed to the Anthropic constructor on first use.
 * @returns {object} The shared client instance.
 */
function getClient(apiKey) {
  clientInstance = clientInstance ?? new Anthropic({ apiKey });
  return clientInstance;
}
|
|
1694
|
+
/**
 * Asks the LLM judge whether a single criterion was met.
 *
 * Never rejects because of the API call itself: a missing key, a reply with
 * no text block, or an error thrown by the SDK call all resolve to a "fail"
 * result whose explanation describes the problem.
 *
 * @param {object} criterion - Criterion being judged; `id` and `description` are read.
 * @param {string} expectedBehavior - Scenario's expected-behavior text.
 * @param {object} stateBefore - Twin state before the agent ran.
 * @param {object} stateAfter - Twin state after the agent ran.
 * @param {*} stateDiff - Diff between the two states.
 * @param {Array} trace - Tool-call trace entries.
 * @param {{apiKey: string, model: string}} options - Judge credentials and model name.
 * @returns {Promise<{criterionId: string, status: string, confidence: number, explanation: string}>}
 */
async function evaluateWithLlm(criterion, expectedBehavior, stateBefore, stateAfter, stateDiff, trace, options) {
  // Every exit path returns the same shape; only these three fields vary.
  const verdict = (status, confidence, explanation) => ({
    criterionId: criterion.id,
    status,
    confidence,
    explanation
  });
  if (!options.apiKey) {
    error("No API key provided for LLM evaluation");
    return verdict("fail", 0, "No ANTHROPIC_API_KEY configured for probabilistic evaluation");
  }
  const client = getClient(options.apiKey);
  const userPrompt = buildUserPrompt({
    criterion,
    expectedBehavior,
    stateBefore,
    stateAfter,
    stateDiff,
    trace
  });
  debug("Calling LLM judge", {
    criterion: criterion.id,
    model: options.model,
    traceLength: String(trace.length)
  });
  try {
    const response = await client.messages.create({
      model: options.model,
      max_tokens: 512,
      system: SYSTEM_PROMPT,
      messages: [{ role: "user", content: userPrompt }]
    });
    const textBlock = response.content.find((block) => block.type === "text");
    if (!textBlock) {
      return verdict("fail", 0.3, "LLM returned no text content");
    }
    const { status, confidence, explanation } = parseJudgeResponse(textBlock.text);
    debug("LLM judge result", {
      criterion: criterion.id,
      status,
      confidence: confidence.toFixed(2)
    });
    return verdict(status, confidence, explanation);
  } catch (err) {
    const message = err instanceof Error ? err.message : String(err);
    error(`LLM judge call failed: ${message}`);
    return verdict("fail", 0, `LLM evaluation failed: ${message}`);
  }
}
|
|
1763
|
+
|
|
1764
|
+
// src/evaluator/evaluator.ts
|
|
1765
|
+
/**
 * Averages per-criterion results into a 0-100 score.
 *
 * A "pass" is worth 100 points, a "partial" is worth 50 scaled by its
 * confidence, and anything else is worth nothing.
 *
 * @param {Array<{status: string, confidence: number}>} evaluations
 * @returns {number} Mean points per evaluation; 0 for an empty list.
 */
function calculateOverallScore(evaluations) {
  if (evaluations.length === 0) return 0;
  const points = evaluations.reduce((sum, { status, confidence }) => {
    if (status === "pass") return sum + 100;
    if (status === "partial") return sum + 50 * confidence;
    return sum;
  }, 0);
  return points / evaluations.length;
}
|
|
1783
|
+
/**
 * Evaluates every success criterion for one scenario run.
 *
 * Deterministic criteria are checked synchronously against the twin state;
 * probabilistic criteria are sent to the LLM judge one at a time. Results are
 * returned in the same order as `criteria`, together with the overall score.
 * Criteria whose `type` is neither "deterministic" nor "probabilistic" are
 * not evaluated.
 *
 * @param {Array<{id: string, type: string, description: string}>} criteria
 * @param {object} context - Holds expectedBehavior, stateBefore, stateAfter, stateDiff and trace.
 * @param {{apiKey: string, model: string}} config - LLM judge settings.
 * @returns {Promise<{evaluations: Array, overallScore: number}>}
 */
async function evaluateRun(criteria, context, config) {
  const evaluations = [];
  const deterministicCriteria = criteria.filter((c) => c.type === "deterministic");
  const probabilisticCriteria = criteria.filter((c) => c.type === "probabilistic");
  info("Evaluating criteria", {
    total: String(criteria.length),
    deterministic: String(deterministicCriteria.length),
    probabilistic: String(probabilisticCriteria.length)
  });
  for (const criterion of deterministicCriteria) {
    progress(`Evaluating [D] ${criterion.description}`);
    const result = evaluateDeterministic(criterion, {
      before: context.stateBefore,
      after: context.stateAfter,
      diff: context.stateDiff,
      trace: context.trace
    });
    evaluations.push(result);
    debug("Deterministic evaluation", {
      criterion: criterion.id,
      status: result.status
    });
  }
  // Judged sequentially, so progress lines and judge calls stay in criteria order.
  for (const criterion of probabilisticCriteria) {
    progress(`Evaluating [P] ${criterion.description}`);
    const result = await evaluateWithLlm(
      criterion,
      context.expectedBehavior,
      context.stateBefore,
      context.stateAfter,
      context.stateDiff,
      context.trace,
      { apiKey: config.apiKey, model: config.model }
    );
    evaluations.push(result);
    debug("Probabilistic evaluation", {
      criterion: criterion.id,
      status: result.status,
      confidence: result.confidence.toFixed(2)
    });
  }
  // Restore the caller's criteria order. Positions are looked up once in a
  // Map instead of scanning `criteria` twice per comparison. The first
  // occurrence of an id wins and unknown ids sort first (-1), matching what
  // findIndex would return.
  const positionById = new Map();
  criteria.forEach((c, index) => {
    if (!positionById.has(c.id)) {
      positionById.set(c.id, index);
    }
  });
  const positionOf = (evaluation) => positionById.get(evaluation.criterionId) ?? -1;
  evaluations.sort((a, b) => positionOf(a) - positionOf(b));
  const overallScore = calculateOverallScore(evaluations);
  info("Evaluation complete", {
    overallScore: overallScore.toFixed(1) + "%",
    passed: String(evaluations.filter((e) => e.status === "pass").length),
    failed: String(evaluations.filter((e) => e.status === "fail").length),
    partial: String(evaluations.filter((e) => e.status === "partial").length)
  });
  return { evaluations, overallScore };
}
|
|
1838
|
+
/**
 * Averages per-run scores into a single satisfaction score.
 *
 * @param {number[]} runScores - One score per run.
 * @returns {number} Mean score rounded to one decimal place; 0 for an empty list.
 */
function aggregateSatisfaction(runScores) {
  const runCount = runScores.length;
  if (runCount === 0) return 0;
  let total = 0;
  for (const score of runScores) {
    total += score;
  }
  const mean = total / runCount;
  return Math.round(mean * 10) / 10;
}
|
|
1843
|
+
/**
 * Writes a one-line, human-readable summary of a multi-run report.
 *
 * Criteria are grouped into those that passed in every run, those that failed
 * in every run, and the rest (reported as "Non-deterministic"). Empty groups
 * are left out of the text.
 *
 * @param {Array<Array<{criterionId: string, status: string}>>} evaluations - One list of results per run.
 * @param {number} satisfactionScore - Aggregate score shown with one decimal.
 * @returns {string} Sentences joined by single spaces.
 */
function generateSummary(evaluations, satisfactionScore) {
  const totalRuns = evaluations.length;
  // criterionId -> how many runs ended in each status (insertion order is kept).
  const tallies = new Map();
  for (const { criterionId, status } of evaluations.flat()) {
    const tally = tallies.get(criterionId) ?? { passed: 0, failed: 0, partial: 0 };
    const bucket = status === "pass" ? "passed" : status === "fail" ? "failed" : "partial";
    tally[bucket] += 1;
    tallies.set(criterionId, tally);
  }
  const alwaysPassing = [];
  const alwaysFailing = [];
  const mixed = [];
  for (const [criterionId, { passed, failed }] of tallies) {
    const group = passed === totalRuns ? alwaysPassing : failed === totalRuns ? alwaysFailing : mixed;
    group.push(criterionId);
  }
  const sentences = [`Satisfaction: ${satisfactionScore.toFixed(1)}% across ${totalRuns} runs.`];
  if (alwaysPassing.length > 0) {
    sentences.push(`Consistently passing: ${alwaysPassing.join(", ")}.`);
  }
  if (alwaysFailing.length > 0) {
    sentences.push(`Consistently failing: ${alwaysFailing.join(", ")}.`);
  }
  if (mixed.length > 0) {
    sentences.push(`Non-deterministic: ${mixed.join(", ")}.`);
  }
  return sentences.join(" ");
}
|
|
1880
|
+
|
|
1881
|
+
// src/telemetry/recorder.ts
|
|
1882
|
+
import { mkdirSync as mkdirSync2, writeFileSync as writeFileSync3, readFileSync as readFileSync5, readdirSync, existsSync as existsSync4, statSync as statTraceFileSync, unlinkSync as unlinkTraceFileSync } from "fs";
|
|
1883
|
+
import { join as join3 } from "path";
|
|
1884
|
+
import { randomUUID } from "crypto";
|
|
1885
|
+
|
|
1886
|
+
// src/config/config.ts
|
|
1887
|
+
import { readFileSync as readFileSync4, writeFileSync as writeFileSync2, mkdirSync, existsSync as existsSync3 } from "fs";
|
|
1888
|
+
import { join as join2 } from "path";
|
|
1889
|
+
import { homedir } from "os";
|
|
1890
|
+
import { z } from "zod";
|
|
1891
|
+
// Name of the archal directory; getArchalDir() joins it onto the user's home directory.
var ARCHAL_DIR_NAME = ".archal";
|
|
1892
|
+
// File name of the JSON config stored inside the archal directory (see getConfigPath()).
var CONFIG_FILE_NAME = "config.json";
|
|
1893
|
+
// LLM judge settings. An apiKey of the form "env:NAME" is resolved from the
// environment by resolveApiKey(); any other string is used as the key itself.
var evaluatorConfigSchema = z.object({
  model: z.string().default("claude-sonnet-4-20250514"),
  apiKey: z.string().default("env:ANTHROPIC_API_KEY")
});

// Per-invocation defaults; both values must be positive integers.
var defaultsConfigSchema = z.object({
  runs: z.number().int().positive().default(5),
  timeout: z.number().int().positive().default(120)
});

// Shape of the on-disk config file. Every field has a default, so parsing
// `{}` yields a complete configuration with telemetry off.
var configFileSchema = z.object({
  telemetry: z.boolean().default(false),
  evaluator: evaluatorConfigSchema.default({}),
  defaults: defaultsConfigSchema.default({})
});
|
|
1906
|
+
/**
 * Computes the path of the archal directory under the current user's home directory.
 *
 * @returns {string} Path of the directory; it is not created here.
 */
function getArchalDir() {
  const home = homedir();
  return join2(home, ARCHAL_DIR_NAME);
}
|
|
1909
|
+
/**
 * Computes the path of the config file inside the archal directory.
 *
 * @returns {string} Path of the config file; the file may not exist.
 */
function getConfigPath() {
  const baseDir = getArchalDir();
  return join2(baseDir, CONFIG_FILE_NAME);
}
|
|
1912
|
+
/**
 * Makes sure the archal directory exists, creating it (and any missing
 * parents) when necessary.
 *
 * @returns {string} Path of the directory.
 */
function ensureArchalDir() {
  const dir = getArchalDir();
  if (existsSync3(dir)) {
    return dir;
  }
  mkdirSync(dir, { recursive: true });
  debug("Created archal directory", { path: dir });
  return dir;
}
|
|
1920
|
+
/**
 * Reads and validates the on-disk config file.
 *
 * A missing file yields the schema defaults. A file that cannot be read,
 * parsed as JSON, or validated is reported with a warning and also yields the
 * schema defaults.
 *
 * @returns {object} Config matching configFileSchema.
 */
function loadConfigFile() {
  const configPath = getConfigPath();
  const schemaDefaults = () => configFileSchema.parse({});
  if (!existsSync3(configPath)) {
    debug("No config file found, using defaults", { path: configPath });
    return schemaDefaults();
  }
  try {
    const fileContents = readFileSync4(configPath, "utf-8");
    const config = configFileSchema.parse(JSON.parse(fileContents));
    debug("Loaded config file", { path: configPath });
    return config;
  } catch (err) {
    const reason = err instanceof Error ? err.message : String(err);
    warn(`Failed to parse config file at ${configPath}: ${reason}`);
    return schemaDefaults();
  }
}
|
|
1938
|
+
/**
 * Turns the configured apiKey setting into the actual key.
 *
 * A value of the form "env:NAME" is looked up in process.env (empty string
 * when the variable is unset); any other value is returned as-is.
 *
 * @param {string} apiKeyConfig - Raw `evaluator.apiKey` setting.
 * @returns {string} The resolved key, possibly empty.
 */
function resolveApiKey(apiKeyConfig) {
  const ENV_PREFIX = "env:";
  if (!apiKeyConfig.startsWith(ENV_PREFIX)) {
    return apiKeyConfig;
  }
  const variableName = apiKeyConfig.slice(ENV_PREFIX.length);
  return process.env[variableName] ?? "";
}
|
|
1945
|
+
/**
 * Resolves the effective configuration: config file values overridden by
 * environment variables.
 *
 * Overrides:
 * - ARCHAL_TELEMETRY: telemetry is on only when the value is exactly "true".
 * - ARCHAL_MODEL: judge model name.
 * - ARCHAL_RUNS / ARCHAL_TIMEOUT: must be positive integers. Anything else is
 *   ignored with a warning and the config-file value is used, consistent with
 *   the schema's `.positive()` rule and the check in setConfigValue().
 * - ANTHROPIC_API_KEY: judge API key; takes precedence over the file setting.
 *
 * An environment variable that is set but empty is treated as unset for the
 * model and API key, so it cannot blank out a value configured in the file.
 *
 * @returns {{telemetry: boolean, apiKey: string, model: string, runs: number,
 *   timeout: number, archalDir: string, configPath: string}}
 */
function loadConfig() {
  const file = loadConfigFile();
  const env = process.env;

  // Reads a positive-integer override, falling back when unset or invalid.
  const positiveIntFromEnv = (name, fallback) => {
    const raw = env[name];
    if (raw === void 0) return fallback;
    const trimmed = raw.trim();
    const parsed = /^\d+$/.test(trimmed) ? Number.parseInt(trimmed, 10) : Number.NaN;
    if (!Number.isSafeInteger(parsed) || parsed <= 0) {
      warn(`Ignoring invalid ${name}="${raw}": expected a positive integer`);
      return fallback;
    }
    return parsed;
  };

  const envTelemetry = env["ARCHAL_TELEMETRY"];
  const telemetry = envTelemetry !== void 0 ? envTelemetry === "true" : file.telemetry;
  const model = env["ARCHAL_MODEL"] || file.evaluator.model;
  const apiKey = env["ANTHROPIC_API_KEY"] || resolveApiKey(file.evaluator.apiKey);
  return {
    telemetry,
    apiKey,
    model,
    runs: positiveIntFromEnv("ARCHAL_RUNS", file.defaults.runs),
    timeout: positiveIntFromEnv("ARCHAL_TIMEOUT", file.defaults.timeout),
    archalDir: getArchalDir(),
    configPath: getConfigPath()
  };
}
|
|
1967
|
+
/**
 * Merges the given settings into the config file and writes it back.
 *
 * The current file is re-read and validated first; if it is missing or
 * unreadable the schema defaults are used as the base. `evaluator` and
 * `defaults` are merged key by key; `telemetry` is replaced only when the
 * caller supplies a value.
 *
 * @param {{telemetry?: boolean, evaluator?: object, defaults?: object}} config - Settings to apply.
 * @returns {void}
 */
function saveConfig(config) {
  const dir = ensureArchalDir();
  const configPath = join2(dir, CONFIG_FILE_NAME);
  const readExisting = () => {
    if (!existsSync3(configPath)) {
      return configFileSchema.parse({});
    }
    try {
      const fileContents = readFileSync4(configPath, "utf-8");
      return configFileSchema.parse(JSON.parse(fileContents));
    } catch {
      // Unreadable or invalid file: start again from the defaults.
      return configFileSchema.parse({});
    }
  };
  const existing = readExisting();
  const merged = {
    telemetry: config.telemetry ?? existing.telemetry,
    evaluator: { ...existing.evaluator, ...config.evaluator },
    defaults: { ...existing.defaults, ...config.defaults }
  };
  const serialized = JSON.stringify(merged, null, 2) + "\n";
  writeFileSync2(configPath, serialized, "utf-8");
  debug("Saved config file", { path: configPath });
}
|
|
1995
|
+
/**
 * Creates the config file with schema defaults unless one already exists.
 *
 * An existing file is left untouched and a warning is logged.
 *
 * @returns {string} Path of the config file.
 */
function initConfig() {
  const configPath = getConfigPath();
  if (!existsSync3(configPath)) {
    const defaultConfig = configFileSchema.parse({});
    ensureArchalDir();
    writeFileSync2(configPath, JSON.stringify(defaultConfig, null, 2) + "\n", "utf-8");
    return configPath;
  }
  warn(`Config file already exists at ${configPath}`);
  return configPath;
}
|
|
2006
|
+
/**
 * Sets one config value addressed by a dotted key and saves the file.
 *
 * Supported keys: telemetry, evaluator.model, evaluator.apiKey,
 * defaults.runs, defaults.timeout. For telemetry only the string "true"
 * enables it; the two `defaults.*` keys must parse to a positive integer.
 *
 * @param {string} key - Dotted config key.
 * @param {string} value - New value as text.
 * @returns {void}
 * @throws {Error} When the key is unknown or a numeric value is invalid.
 */
function setConfigValue(key, value) {
  const file = loadConfigFile();
  const segments = key.split(".");
  const [section, prop] = segments;
  const depth = segments.length;

  if (depth === 1 && section === "telemetry") {
    saveConfig({ ...file, telemetry: value === "true" });
    return;
  }

  const isEvaluatorKey = depth === 2 && section === "evaluator" && ["model", "apiKey"].includes(prop);
  if (isEvaluatorKey) {
    saveConfig({
      ...file,
      evaluator: { ...file.evaluator, [prop]: value }
    });
    return;
  }

  const isDefaultsKey = depth === 2 && section === "defaults" && ["runs", "timeout"].includes(prop);
  if (isDefaultsKey) {
    const numValue = parseInt(value, 10);
    if (Number.isNaN(numValue) || numValue <= 0) {
      throw new Error(`Invalid numeric value for ${key}: ${value}`);
    }
    saveConfig({
      ...file,
      defaults: { ...file.defaults, [prop]: numValue }
    });
    return;
  }

  throw new Error(
    `Unknown config key: "${key}". Valid keys: telemetry, evaluator.model, evaluator.apiKey, defaults.runs, defaults.timeout`
  );
}
|
|
2041
|
+
/**
 * Builds a printable view of the effective configuration.
 *
 * The API key is masked: only its last four characters are shown after
 * "***", or "(not set)" when no key is configured.
 *
 * @returns {{telemetry: boolean, evaluator: object, defaults: object, paths: object}}
 */
function getConfigDisplay() {
  const { telemetry, model, apiKey, runs, timeout, archalDir, configPath } = loadConfig();
  const maskedKey = apiKey ? "***" + apiKey.slice(-4) : "(not set)";
  return {
    telemetry,
    evaluator: { model, apiKey: maskedKey },
    defaults: { runs, timeout },
    paths: { archalDir, configFile: configPath }
  };
}
|
|
2059
|
+
|
|
2060
|
+
// src/telemetry/recorder.ts
|
|
2061
|
+
// Sub-directory of the archal directory that holds recorded trace files (see getTracesDir()).
var TRACES_DIR = "traces";
|
|
2062
|
+
// Upper bound on trace files kept on disk; pruneOldTraces() removes files beyond this count.
var MAX_STORED_TRACES = 100;
|
|
2063
|
+
/**
 * Computes the path of the traces directory inside the archal directory.
 *
 * @returns {string} Path of the directory; it is not created here.
 */
function getTracesDir() {
  const archalDir = getArchalDir();
  return join3(archalDir, TRACES_DIR);
}
|
|
2066
|
+
/**
 * Makes sure the traces directory exists, creating it and the archal
 * directory when necessary.
 *
 * @returns {string} Path of the traces directory.
 */
function ensureTracesDir() {
  const dir = getTracesDir();
  if (existsSync4(dir)) {
    return dir;
  }
  ensureArchalDir();
  mkdirSync2(dir, { recursive: true });
  debug("Created traces directory", { path: dir });
  return dir;
}
|
|
2075
|
+
/**
 * Maps a trace id to the path of its JSON file in the traces directory.
 *
 * @param {string} traceId - Trace identifier (used as the file's base name).
 * @returns {string} Path of `<traceId>.json`.
 */
function traceFilePath(traceId) {
  const fileName = `${traceId}.json`;
  return join3(getTracesDir(), fileName);
}
|
|
2078
|
+
/**
 * Persists a run report as a new trace file and prunes surplus traces.
 *
 * The stored document carries a fresh random UUID, summary fields copied
 * from the report, the trace entries of all runs concatenated in run order,
 * and the full report itself.
 *
 * @param {object} report - Report with scenarioTitle, timestamp, satisfactionScore and runs.
 * @returns {string} Id of the newly written trace.
 */
function recordTrace(report) {
  const traceId = randomUUID();
  const dir = ensureTracesDir();
  const allEntries = report.runs.flatMap((run) => run.trace);
  const { scenarioTitle, timestamp, satisfactionScore } = report;
  const stored = {
    id: traceId,
    scenarioTitle,
    timestamp,
    satisfactionScore,
    runCount: report.runs.length,
    entries: allEntries,
    report
  };
  const filePath = traceFilePath(traceId);
  writeFileSync3(filePath, JSON.stringify(stored, null, 2), "utf-8");
  debug("Recorded trace", { id: traceId, path: filePath, entries: String(allEntries.length) });
  pruneOldTraces(dir);
  return traceId;
}
|
|
2100
|
+
/**
 * Deletes the oldest trace files so that at most MAX_STORED_TRACES remain.
 *
 * Age is taken from each file's modification time: trace files are named by
 * random UUID, so their names carry no ordering information. Pruning is best
 * effort: failures are logged at debug level and never propagate, so a
 * clean-up problem cannot fail the run that just recorded a trace.
 *
 * The fs functions are imported statically because a dynamic `require` is
 * not available in this ES-module bundle.
 *
 * @param {string} dir - Directory containing the `*.json` trace files.
 * @returns {void}
 */
function pruneOldTraces(dir) {
  let traceFiles;
  try {
    traceFiles = readdirSync(dir).filter((f) => f.endsWith(".json"));
  } catch (err) {
    const reason = err instanceof Error ? err.message : String(err);
    debug("Could not list traces for pruning", { dir, error: reason });
    return;
  }
  if (traceFiles.length <= MAX_STORED_TRACES) {
    return;
  }
  const dated = [];
  for (const file of traceFiles) {
    try {
      dated.push({ file, mtimeMs: statTraceFileSync(join3(dir, file)).mtimeMs });
    } catch {
      // The file disappeared between listing and stat; nothing left to prune.
    }
  }
  // Newest first; everything past the retention limit is removed.
  dated.sort((a, b) => b.mtimeMs - a.mtimeMs);
  for (const { file } of dated.slice(MAX_STORED_TRACES)) {
    try {
      unlinkTraceFileSync(join3(dir, file));
      debug("Pruned old trace", { file });
    } catch (err) {
      const reason = err instanceof Error ? err.message : String(err);
      debug("Failed to prune trace", { file, error: reason });
    }
  }
}
|
|
2117
|
+
/**
 * Loads a stored trace by its id or by a prefix of its id.
 *
 * An exact file match is preferred; otherwise the first stored trace whose id
 * starts with the given text is used.
 *
 * @param {string} traceId - Full trace id or a leading part of one.
 * @returns {object|null} Parsed trace document, or null when nothing matches
 *   or the file cannot be loaded.
 */
function loadTrace(traceId) {
  const exactPath = traceFilePath(traceId);
  if (existsSync4(exactPath)) {
    return loadTraceByPath(exactPath);
  }
  const matchedId = findTraceByPrefix(traceId);
  return matchedId ? loadTraceByPath(traceFilePath(matchedId)) : null;
}
|
|
2128
|
+
/**
 * Reads and parses one trace file.
 *
 * @param {string} filePath - Path of the trace JSON file.
 * @returns {object|null} Parsed document, or null (with a warning) when the
 *   file cannot be read or is not valid JSON.
 */
function loadTraceByPath(filePath) {
  let contents;
  try {
    contents = readFileSync5(filePath, "utf-8");
    return JSON.parse(contents);
  } catch (err) {
    const reason = err instanceof Error ? err.message : String(err);
    warn(`Failed to load trace: ${reason}`);
    return null;
  }
}
|
|
2138
|
+
/**
 * Finds the id of a stored trace whose id starts with the given prefix.
 *
 * Files are examined in directory-listing order and the first match wins.
 *
 * @param {string} prefix - Leading part of a trace id.
 * @returns {string|null} Matching trace id, or null when the traces directory
 *   is missing or nothing matches.
 */
function findTraceByPrefix(prefix) {
  const dir = getTracesDir();
  if (!existsSync4(dir)) return null;
  const storedIds = readdirSync(dir)
    .filter((f) => f.endsWith(".json"))
    .map((f) => f.replace(".json", ""));
  const match = storedIds.find((id) => id.startsWith(prefix));
  return match ?? null;
}
|
|
2150
|
+
/**
 * Lists summaries of the most recently written traces.
 *
 * Recency is taken from each file's modification time: trace files are named
 * by random UUID, so sorting by file name would pick an arbitrary subset.
 * Files that cannot be stat'ed, read or parsed are skipped.
 *
 * @param {number} [limit=20] - Maximum number of traces to consider.
 * @returns {Array<{id: string, scenarioTitle: string, timestamp: *,
 *   satisfactionScore: number, runCount: number, entryCount: number}>}
 *   Summaries, newest first; empty when the traces directory does not exist.
 */
function listTraces(limit = 20) {
  const dir = getTracesDir();
  if (!existsSync4(dir)) return [];
  const candidates = [];
  for (const file of readdirSync(dir)) {
    if (!file.endsWith(".json")) continue;
    try {
      candidates.push({ file, mtimeMs: statTraceFileSync(join3(dir, file)).mtimeMs });
    } catch {
      debug(`Skipping unreadable trace file: ${file}`);
    }
  }
  candidates.sort((a, b) => b.mtimeMs - a.mtimeMs);
  const results = [];
  for (const { file } of candidates.slice(0, limit)) {
    try {
      const raw = readFileSync5(join3(dir, file), "utf-8");
      const stored = JSON.parse(raw);
      results.push({
        id: stored.id,
        scenarioTitle: stored.scenarioTitle,
        timestamp: stored.timestamp,
        satisfactionScore: stored.satisfactionScore,
        runCount: stored.runCount,
        entryCount: stored.entries.length
      });
    } catch {
      debug(`Skipping corrupted trace file: ${file}`);
    }
  }
  return results;
}
|
|
2173
|
+
/**
 * Serializes a stored trace as pretty-printed JSON.
 *
 * @param {string} traceId - Full trace id or a prefix accepted by loadTrace().
 * @returns {string|null} JSON text with 2-space indentation, or null when the
 *   trace cannot be loaded.
 */
function exportTraceAsJson(traceId) {
  const stored = loadTrace(traceId);
  return stored ? JSON.stringify(stored, null, 2) : null;
}
|
|
2178
|
+
|
|
2179
|
+
// src/telemetry/anonymizer.ts
|
|
2180
|
+
import { createHash } from "crypto";
|
|
2181
|
+
// Patterns for secret-looking substrings; stripApiKeys() replaces every match
// with a placeholder. All are global so every occurrence in a string is replaced.
var API_KEY_PATTERNS = [
  // Generic `name: value` / `name=value` pairs where the name suggests a
  // credential and the value is at least 16 characters long.
  /(?:api[_-]?key|token|secret|password|authorization|bearer)\s*[:=]\s*["']?([a-zA-Z0-9_\-/.+=]{16,})["']?/gi,
  // "sk-" prefixed keys.
  /sk-[a-zA-Z0-9]{20,}/g,
  // "ghp_" / "gho_" prefixed tokens (presumably GitHub token formats).
  /ghp_[a-zA-Z0-9]{36}/g,
  /gho_[a-zA-Z0-9]{36}/g,
  // "xoxb-" / "xoxp-" / "xoxa-" prefixed tokens (presumably Slack token formats).
  /xoxb-[a-zA-Z0-9-]+/g,
  /xoxp-[a-zA-Z0-9-]+/g,
  /xoxa-[a-zA-Z0-9-]+/g,
  // "glpat-" prefixed tokens (presumably GitLab personal access tokens).
  /glpat-[a-zA-Z0-9_-]{20}/g,
  // HTTP-style "Bearer <token>" values.
  /Bearer\s+[a-zA-Z0-9_\-/.+=]{20,}/gi
];
|
|
2192
|
+
// Matches email-address-shaped substrings; anonymizeEmails() rewrites every match.
var EMAIL_PATTERN = /[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}/g;
|
|
2193
|
+
// Lower-cased field names whose string values are replaced by a hash during
// anonymization (see isUsernameField()).
var USERNAME_FIELDS = /* @__PURE__ */ new Set([
  // account identifiers
  "username", "user", "login",
  // roles attached to an item
  "author", "assignee", "reviewer", "creator", "sender", "recipient",
  "owner", "requester", "reporter", "committer",
  // personal names
  "name", "full_name", "display_name", "real_name"
]);
|
|
2212
|
+
// Lower-cased field names whose string values are kept verbatim during
// anonymization (see shouldPassthrough()).
var PASSTHROUGH_FIELDS = /* @__PURE__ */ new Set([
  // identity and classification
  "id", "type", "state", "status", "number", "count",
  // timestamps
  "created_at", "updated_at", "timestamp",
  // flags
  "merged", "closed", "draft", "locked", "private", "public", "archived"
]);
|
|
2230
|
+
/**
 * Derives a short, stable pseudonym for a value.
 *
 * @param {string} value - Text to pseudonymize.
 * @param {string} [salt="archal"] - Prefix mixed into the hash input.
 * @returns {string} "anon_" followed by the first 12 hex digits of the
 *   SHA-256 of `<salt>:<value>`.
 */
function hashValue(value, salt = "archal") {
  const hasher = createHash("sha256");
  hasher.update(`${salt}:${value}`);
  const digest = hasher.digest("hex");
  return `anon_${digest.substring(0, 12)}`;
}
|
|
2234
|
+
/**
 * Replaces every substring matching one of API_KEY_PATTERNS with "[REDACTED_KEY]".
 *
 * Patterns are applied one after another, each to the output of the previous one.
 *
 * @param {string} text - Text that may contain secrets.
 * @returns {string} Text with matches redacted.
 */
function stripApiKeys(text) {
  return API_KEY_PATTERNS.reduce(
    (redacted, pattern) => redacted.replace(pattern, "[REDACTED_KEY]"),
    text
  );
}
|
|
2241
|
+
/**
 * Replaces each email address in the text with a pseudonymous one.
 *
 * The whole address is hashed to form the new local part; the original
 * domain is kept.
 *
 * @param {string} text - Text that may contain email addresses.
 * @returns {string} Text with addresses rewritten.
 */
function anonymizeEmails(text) {
  const pseudonymize = (email) => {
    const [, domain = "unknown"] = email.split("@");
    return `${hashValue(email)}@${domain}`;
  };
  return text.replace(EMAIL_PATTERN, (email) => pseudonymize(email));
}
|
|
2247
|
+
/**
 * Tells whether a field name is listed in USERNAME_FIELDS (case-insensitive).
 *
 * @param {string} key - Field name.
 * @returns {boolean}
 */
function isUsernameField(key) {
  return USERNAME_FIELDS.has(key.toLowerCase());
}
|
|
2251
|
+
/**
 * Tells whether a field name is listed in PASSTHROUGH_FIELDS (case-insensitive).
 *
 * @param {string} key - Field name.
 * @returns {boolean}
 */
function shouldPassthrough(key) {
  return PASSTHROUGH_FIELDS.has(key.toLowerCase());
}
|
|
2255
|
+
/**
 * Anonymizes one value according to its type and the field it belongs to.
 *
 * - null, undefined, booleans, numbers and other non-string primitives are
 *   returned unchanged.
 * - Strings under a passthrough field are returned unchanged; otherwise
 *   secrets are redacted and emails pseudonymized, and under a username field
 *   the result is replaced by its hash.
 * - Arrays are processed item by item, each under the key `<key>[<index>]`.
 * - Other objects are processed field by field.
 *
 * @param {string} key - Name of the field holding the value.
 * @param {*} value - Value to anonymize.
 * @returns {*} Anonymized value of the same general shape.
 */
function anonymizeValue(key, value) {
  if (value == null) return value;
  switch (typeof value) {
    case "string": {
      if (shouldPassthrough(key)) return value;
      const scrubbed = anonymizeEmails(stripApiKeys(value));
      return isUsernameField(key) ? hashValue(scrubbed) : scrubbed;
    }
    case "object":
      return Array.isArray(value)
        ? value.map((item, idx) => anonymizeValue(`${key}[${idx}]`, item))
        : anonymizeObject(value);
    default:
      return value;
  }
}
|
|
2275
|
+
/**
 * Returns a copy of an object with every own enumerable field anonymized.
 *
 * @param {object} obj - Object to copy.
 * @returns {object} New plain object with the same keys.
 */
function anonymizeObject(obj) {
  const scrubbed = {};
  for (const field of Object.keys(obj)) {
    scrubbed[field] = anonymizeValue(field, obj[field]);
  }
  return scrubbed;
}
|
|
2282
|
+
/**
 * Returns a copy of a trace entry that is safe to upload.
 *
 * `input`, `output` and `error.details` are run through anonymizeValue(), so
 * every shape is handled: strings are scrubbed, arrays stay arrays, objects
 * are processed field by field, and null/undefined are passed through without
 * throwing. The error message has both secrets and email addresses removed.
 * All other entry fields are copied unchanged.
 *
 * @param {object} entry - Trace entry with input, output and an optional error.
 * @returns {object} Anonymized copy; `error` is null when the entry has none.
 */
function anonymizeTraceEntry(entry) {
  const scrubError = (err) => ({
    ...err,
    message: typeof err.message === "string" ? anonymizeEmails(stripApiKeys(err.message)) : err.message,
    // An absent `details` stays undefined, as anonymizeValue passes it through.
    details: anonymizeValue("details", err.details)
  });
  return {
    ...entry,
    input: anonymizeValue("input", entry.input),
    output: anonymizeValue("output", entry.output),
    error: entry.error ? scrubError(entry.error) : null
  };
}
|
|
2294
|
+
/**
 * Anonymizes every entry of a trace.
 *
 * @param {Array<object>} entries - Trace entries.
 * @returns {Array<object>} New array of anonymized entries, in the same order.
 */
function anonymizeTrace(entries) {
  debug("Anonymizing trace", { entryCount: String(entries.length) });
  return entries.map((entry) => anonymizeTraceEntry(entry));
}
|
|
2298
|
+
|
|
2299
|
+
// src/telemetry/uploader.ts
|
|
2300
|
+
// URL for trace uploads. NOTE(review): presumably the target of sendBatch(); its use is outside this excerpt.
var ARCHAL_CLOUD_ENDPOINT = "https://api.archal.dev/v1/traces";
|
|
2301
|
+
// Maximum number of trace entries per upload batch (see batchEntries()).
var BATCH_SIZE = 50;
|
|
2302
|
+
/**
 * Reports whether telemetry is enabled in the effective configuration.
 *
 * The configuration is re-loaded on every call.
 *
 * @returns {boolean}
 */
function isTelemetryEnabled() {
  const { telemetry } = loadConfig();
  return telemetry;
}
|
|
2306
|
+
/**
 * Collects environment and scenario metadata to accompany an upload.
 *
 * Twin names are the text before the first "_" of each tool name, de-duplicated
 * in first-seen order; tool names with nothing before the first "_" are ignored.
 * The criteria count is the number of evaluations in the first run (0 when
 * there are no runs).
 *
 * @param {object} report - Report whose runs carry `trace` and `evaluations`.
 * @returns {{cliVersion: string, nodeVersion: string, platform: string,
 *   twinNames: string[], criteriaCount: number}}
 */
function buildMetadata(report) {
  const seenTwins = /* @__PURE__ */ new Set();
  for (const { trace } of report.runs) {
    for (const { toolName } of trace) {
      const [prefix] = toolName.split("_");
      if (prefix) {
        seenTwins.add(prefix);
      }
    }
  }
  const [firstRun] = report.runs;
  return {
    cliVersion: "0.1.0",
    nodeVersion: process.version,
    platform: process.platform,
    twinNames: [...seenTwins],
    criteriaCount: firstRun?.evaluations.length ?? 0
  };
}
|
|
2324
|
+
/**
 * Splits a list into consecutive batches.
 *
 * @param {Array} entries - Items to split; not modified.
 * @param {number} [batchSize=BATCH_SIZE] - Maximum items per batch; must be a
 *   positive integer.
 * @returns {Array<Array>} Batches in original order; the last one may be
 *   shorter. Empty when `entries` is empty.
 * @throws {RangeError} When `batchSize` is not a positive integer (a zero or
 *   negative step would otherwise never terminate).
 */
function batchEntries(entries, batchSize = BATCH_SIZE) {
  if (!Number.isInteger(batchSize) || batchSize <= 0) {
    throw new RangeError(`batchSize must be a positive integer, got ${batchSize}`);
  }
  const batches = [];
  for (let start = 0; start < entries.length; start += batchSize) {
    batches.push(entries.slice(start, start + batchSize));
  }
  return batches;
}
|
|
2331
|
+
/**
 * Anonymizes all trace entries of a report and sends them in batches.
 *
 * When more than one batch is needed, each payload's trace id gets a
 * `-batch-<n>` suffix. Sending stops at the first batch that fails, and that
 * batch's result is returned.
 *
 * @param {string} traceId - Identifier of the recorded trace.
 * @param {object} report - Scenario report; reads `runs[].trace`, `scenarioTitle`,
 *   `satisfactionScore` and `timestamp`.
 * @returns {Promise<{success: boolean, message: string, traceId?: string}>}
 *   `success: false` with an explanatory message when telemetry is disabled.
 */
async function uploadTrace(traceId, report) {
  if (!isTelemetryEnabled()) {
    debug("Telemetry is disabled, skipping upload");
    return {
      success: false,
      message: "Telemetry is disabled. Enable with: archal config set telemetry true"
    };
  }
  // Append entry by entry: spreading a whole trace into push(...) passes every
  // entry as a call argument and can throw a RangeError on very large traces.
  const allEntries = [];
  for (const run of report.runs) {
    for (const entry of run.trace) {
      allEntries.push(entry);
    }
  }
  const anonymizedEntries = anonymizeTrace(allEntries);
  const metadata = buildMetadata(report);
  const batches = batchEntries(anonymizedEntries);
  info(`Preparing to upload trace ${traceId}`, {
    entries: String(anonymizedEntries.length),
    batches: String(batches.length)
  });
  for (let i = 0; i < batches.length; i++) {
    const batch = batches[i];
    const payload = {
      traceId: `${traceId}${batches.length > 1 ? `-batch-${i + 1}` : ""}`,
      scenarioTitle: report.scenarioTitle,
      satisfactionScore: report.satisfactionScore,
      runCount: report.runs.length,
      timestamp: report.timestamp,
      entries: batch,
      metadata
    };
    const result = await sendBatch(payload, i + 1, batches.length);
    if (!result.success) {
      return result;
    }
  }
  return {
    success: true,
    message: `Trace ${traceId} uploaded successfully (${anonymizedEntries.length} entries in ${batches.length} batch(es))`,
    traceId
  };
}
|
|
2372
|
+
/**
 * Telemetry stub: logs what would be sent for one batch and reports success.
 * No network request is made.
 *
 * @param {object} payload - Batch payload; reads `entries`, `traceId`,
 *   `scenarioTitle` and `satisfactionScore`.
 * @param {number} batchNum - 1-based index of this batch.
 * @param {number} totalBatches - Total number of batches in the upload.
 * @returns {Promise<{success: boolean, message: string, traceId: string}>}
 */
async function sendBatch(payload, batchNum, totalBatches) {
  debug(`Uploading batch ${batchNum}/${totalBatches}`, {
    entries: String(payload.entries.length),
    endpoint: ARCHAL_CLOUD_ENDPOINT
  });
  info(`[Telemetry stub] Would send batch ${batchNum}/${totalBatches} to ${ARCHAL_CLOUD_ENDPOINT}`, {
    traceId: payload.traceId,
    scenario: payload.scenarioTitle,
    entries: String(payload.entries.length),
    satisfaction: payload.satisfactionScore.toFixed(1) + "%"
  });
  // String#length counts UTF-16 code units; measure the encoded size so the
  // "bytes" label stays accurate for non-ASCII payloads.
  const payloadBytes = Buffer.byteLength(JSON.stringify(payload), "utf8");
  info(`[Telemetry stub] Payload size: ${payloadBytes} bytes`);
  return {
    success: true,
    message: `Batch ${batchNum}/${totalBatches} sent`,
    traceId: payload.traceId
  };
}
|
|
2390
|
+
/**
 * Best-effort telemetry upload: does nothing when telemetry is disabled and
 * never lets an upload problem propagate to the caller.
 *
 * @param {string} traceId - Identifier of the recorded trace.
 * @param {object} report - Scenario report handed to `uploadTrace`.
 * @returns {Promise<void>}
 */
async function uploadIfEnabled(traceId, report) {
  const enabled = isTelemetryEnabled();
  if (!enabled) {
    return;
  }
  try {
    const outcome = await uploadTrace(traceId, report);
    if (!outcome.success) {
      warn(`Telemetry upload skipped: ${outcome.message}`);
    } else {
      info("Telemetry uploaded", { traceId });
    }
  } catch (err) {
    // Anything thrown during upload or logging is downgraded to a warning.
    const reason = err instanceof Error ? err.message : String(err);
    warn(`Telemetry upload failed: ${reason}`);
  }
}
|
|
2406
|
+
|
|
2407
|
+
// src/runner/orchestrator.ts
|
|
2408
|
+
/**
 * Computes a collection-level diff between two state snapshots.
 *
 * For every top-level key found in either snapshot:
 * - only in `after`: the value is stored in `added` (wrapped in an array
 *   unless it already is one);
 * - only in `before`: `removed` receives the `id` of each former item (0 when
 *   an item has no `id`), or `[0]` when the former value was not an array;
 * - in both, with different JSON serializations: the `after` value is stored
 *   in `modified` (wrapped in an array unless it already is one).
 *
 * @param {object|null|undefined} before - Snapshot taken before the run; a missing snapshot is treated as empty.
 * @param {object|null|undefined} after - Snapshot taken after the run; a missing snapshot is treated as empty.
 * @returns {{added: object, modified: object, removed: object}}
 */
function computeStateDiff(before, after) {
  const prior = before ?? {};
  const current = after ?? {};
  const diff = { added: {}, modified: {}, removed: {} };
  const asList = (value) => Array.isArray(value) ? value : [value];
  const allKeys = new Set([...Object.keys(prior), ...Object.keys(current)]);
  for (const key of allKeys) {
    const beforeVal = prior[key];
    const afterVal = current[key];
    if (beforeVal === void 0 && afterVal !== void 0) {
      diff.added[key] = asList(afterVal);
    } else if (beforeVal !== void 0 && afterVal === void 0) {
      // `item?.id` keeps null/undefined items from throwing; they map to 0 like any id-less item.
      diff.removed[key] = Array.isArray(beforeVal) ? beforeVal.map((item) => item?.id ?? 0) : [0];
    } else if (JSON.stringify(beforeVal) !== JSON.stringify(afterVal)) {
      diff.modified[key] = asList(afterVal);
    }
  }
  return diff;
}
|
|
2424
|
+
/**
 * Executes one run of a scenario: captures seed state, writes the MCP config,
 * runs the agent, collects resulting state and trace, and evaluates the
 * success criteria.
 *
 * The function never rejects. A timeout or any thrown error is converted into
 * a result in which every criterion is marked "fail" with score 0.
 *
 * @param {number} runIndex - Zero-based run index (log messages show it 1-based).
 * @param {object} scenario - Parsed scenario; reads `title`, `successCriteria`, `expectedBehavior`.
 * @param {object} agentConfig - Agent launch configuration passed to `executeAgent`.
 * @param {Array<{twinName: string, seedName: string}>} seedSelections - Seed chosen per twin.
 * @param {object} evaluatorConfig - Configuration passed to `evaluateRun`.
 * @param {number} timeoutSeconds - Agent timeout in seconds.
 * @param {number} [rateLimit] - Forwarded to each twin config as `rateLimitMax`.
 * @returns {Promise<object>} Run result with `runIndex`, `evaluations`,
 *   `overallScore`, `trace`, `durationMs` and, on timeout/failure, `error`.
 */
async function executeSingleRun(runIndex, scenario, agentConfig, seedSelections, evaluatorConfig, timeoutSeconds, rateLimit) {
  const startTime = Date.now();
  const runId = `archal-run-${runIndex}-${Date.now()}-${Math.random().toString(36).slice(2, 8)}`;
  info(`Starting run ${runIndex + 1}`, { scenario: scenario.title });
  const twinConfigs = seedSelections.map((sel) => ({
    twinName: sel.twinName,
    seedName: sel.seedName,
    rateLimitMax: rateLimit
  }));
  // Marks every success criterion as failed with the same explanation.
  const failEveryCriterion = (explanation) => scenario.successCriteria.map((c) => ({
    criterionId: c.id,
    status: "fail",
    confidence: 1,
    explanation
  }));
  try {
    progress("Capturing seed state...");
    const { beforeState, twinPaths: seedPaths } = await captureSeedState(twinConfigs);
    const { configPath: mcpConfigPath, twinPaths } = writeMcpConfig(twinConfigs, runId);
    let agentResult;
    let stateAfter;
    let trace;
    let diff;
    let collected = false;
    try {
      const mcpConfigData = JSON.parse(readFileSync6(mcpConfigPath, "utf-8"));
      const mcpServersJson = JSON.stringify(mcpConfigData.mcpServers);
      const twinNames = twinConfigs.map((c) => c.twinName);
      agentResult = await executeAgent(
        agentConfig,
        mcpConfigPath,
        mcpServersJson,
        twinNames,
        timeoutSeconds * 1e3
      );
      stateAfter = collectStateFromFiles(twinPaths);
      trace = collectTraceFromFiles(twinPaths);
      diff = computeStateDiff(beforeState, stateAfter);
      collected = true;
    } finally {
      // Remove the temp files even when the agent or state collection throws;
      // previously they were only removed on the success path.
      try {
        cleanupTempFiles(mcpConfigPath, twinPaths, seedPaths);
      } catch (cleanupErr) {
        // On the success path a cleanup failure still fails the run (as before);
        // on the failure path it must not mask the original error.
        if (collected) {
          throw cleanupErr;
        }
        const cleanupMessage = cleanupErr instanceof Error ? cleanupErr.message : String(cleanupErr);
        warn(`Failed to clean up temp files for run ${runIndex + 1}: ${cleanupMessage}`);
      }
    }
    if (agentResult.timedOut) {
      const timeoutMessage = `Agent timed out after ${timeoutSeconds}s`;
      return {
        runIndex,
        evaluations: failEveryCriterion(timeoutMessage),
        overallScore: 0,
        trace,
        durationMs: Date.now() - startTime,
        error: timeoutMessage
      };
    }
    // A non-zero exit is only reported; the run is still evaluated on its effects.
    if (agentResult.exitCode !== 0 && agentResult.exitCode !== null) {
      warn(`Agent exited with non-zero code ${agentResult.exitCode} on run ${runIndex + 1}`);
    }
    progress(`Evaluating run ${runIndex + 1}...`);
    const evaluationResult = await evaluateRun(
      scenario.successCriteria,
      {
        expectedBehavior: scenario.expectedBehavior,
        stateBefore: beforeState,
        stateAfter,
        stateDiff: diff,
        trace
      },
      evaluatorConfig
    );
    const durationMs = Date.now() - startTime;
    return {
      runIndex,
      evaluations: evaluationResult.evaluations,
      overallScore: evaluationResult.overallScore,
      trace,
      durationMs
    };
  } catch (err) {
    const message = err instanceof Error ? err.message : String(err);
    error(`Run ${runIndex + 1} failed: ${message}`);
    const durationMs = Date.now() - startTime;
    return {
      runIndex,
      evaluations: failEveryCriterion(`Run failed: ${message}`),
      overallScore: 0,
      trace: [],
      durationMs,
      error: message
    };
  }
}
|
|
2509
|
+
/**
 * Runs a scenario end to end: parses the scenario file, resolves seeds and
 * the agent command, executes the requested number of runs one after another,
 * aggregates the scores into a report, records and optionally uploads the
 * trace, and prints the report.
 *
 * @param {object} options - Reads `scenarioPath`, `runs`, `timeout`, `model`,
 *   `output`, `seed`, `agent`, `agentConfig` and `rateLimit`. `runs`,
 *   `timeout` and `model` fall back to the loaded config; `output` falls back
 *   to "terminal".
 * @returns {Promise<object>} The assembled report.
 */
async function runScenario(options) {
  const config = loadConfig();
  const numRuns = options.runs ?? config.runs;
  const timeoutSeconds = options.timeout ?? config.timeout;
  const model = options.model ?? config.model;
  const outputFormat = options.output ?? "terminal";

  banner(`Archal: ${options.scenarioPath}`);
  const scenario = parseScenarioFile(options.scenarioPath);
  const { twins } = scenario.config;
  info(`Scenario: ${scenario.title}`, {
    criteria: String(scenario.successCriteria.length),
    twins: twins.join(", "),
    runs: String(numRuns),
    timeout: `${timeoutSeconds}s`
  });

  // Seeds come from the scenario setup unless --seed forces one seed for every twin.
  let seedSelections = generateSeedSelections(twins, scenario.setup);
  if (options.seed) {
    const overrides = {};
    for (const twinName of twins) {
      overrides[twinName] = options.seed;
    }
    seedSelections = overrideSeedSelection(seedSelections, overrides);
  }

  // Project config: the scenario's own directory wins over the working directory.
  const scenarioDir = dirname2(resolve3(options.scenarioPath));
  const searchDirs = [scenarioDir, process.cwd()];
  let projectConfigPath;
  for (const dir of searchDirs) {
    const candidate = resolve3(dir, ".archal.json");
    if (!existsSync5(candidate)) {
      continue;
    }
    projectConfigPath = candidate;
    break;
  }

  // Last-resort agent: ARCHAL_AGENT_COMMAND, or an `echo` placeholder.
  const buildFallbackAgent = () => ({
    command: process.env["ARCHAL_AGENT_COMMAND"] ?? "echo",
    args: process.env["ARCHAL_AGENT_COMMAND"] ? [] : ["No agent command configured"]
  });
  const agentConfig = options.agentConfig ?? resolveAgentConfig(options.agent, projectConfigPath) ?? buildFallbackAgent();
  if (agentConfig.command === "echo") {
    process.stderr.write("Warning: No agent command configured. Use --agent flag, set ARCHAL_AGENT_COMMAND, or create .archal.json\n");
  }

  printHeader(scenario.title, seedSelections);
  const evaluatorConfig = { apiKey: config.apiKey, model };

  // Each run is awaited before the next one starts.
  const runs = [];
  for (let runIndex = 0; runIndex < numRuns; runIndex += 1) {
    const runResult = await executeSingleRun(
      runIndex,
      scenario,
      agentConfig,
      seedSelections,
      evaluatorConfig,
      timeoutSeconds,
      options.rateLimit
    );
    runs.push(runResult);
    printRunProgress(runIndex, numRuns, runResult.overallScore, runResult.error);
  }

  const satisfactionScore = aggregateSatisfaction(runs.map((run) => run.overallScore));
  const summary = generateSummary(runs.map((run) => run.evaluations), satisfactionScore);

  const criterionDescriptions = {};
  const criterionTypes = {};
  for (const { id, description, type } of scenario.successCriteria) {
    criterionDescriptions[id] = description;
    criterionTypes[id] = type;
  }

  const report = {
    scenarioTitle: scenario.title,
    satisfactionScore,
    criterionDescriptions,
    criterionTypes,
    twinNames: twins,
    runs,
    summary,
    timestamp: new Date().toISOString()
  };
  const traceId = recordTrace(report);
  info("Trace recorded", { traceId });
  await uploadIfEnabled(traceId, report);
  printReport(report, outputFormat);
  return report;
}
|
|
2592
|
+
|
|
2593
|
+
// src/commands/run.ts
|
|
2594
|
+
/**
 * Builds the `run` subcommand, which validates its arguments and executes a
 * scenario via `runScenario`.
 *
 * Any validation failure, any error thrown by the run, and any satisfaction
 * score below 100 make the process exit with code 1.
 *
 * @returns {Command} The configured commander command.
 */
function createRunCommand() {
  // Writes "Error: <message>" plus a newline to stderr, then exits with code 1.
  const exitWithError = (message) => {
    process.stderr.write(`Error: ${message}
`);
    process.exit(1);
  };
  const cmd = new Command("run").description("Execute a scenario against digital twins").argument("<scenario>", "Path to scenario markdown file").option("-n, --runs <count>", "Number of runs", "5").option("-t, --timeout <seconds>", "Timeout per run in seconds", "120").option("-m, --model <model>", "Evaluator model for probabilistic criteria").option("-o, --output <format>", "Output format: terminal, json, junit", "terminal").option("-a, --agent <command>", "Agent command to execute").option("--seed <name>", "Override twin seed name").option("--rate-limit <count>", "Rate limit: max total requests before 429").option("-q, --quiet", "Suppress non-error output").option("-v, --verbose", "Enable debug logging").action(async (scenarioArg, opts) => {
    if (opts.quiet) {
      configureLogger({ quiet: true });
    }
    if (opts.verbose) {
      configureLogger({ verbose: true, level: "debug" });
    }
    const scenarioPath = resolve4(scenarioArg);
    if (!existsSync6(scenarioPath)) {
      exitWithError(`Scenario file not found: ${scenarioPath}`);
    }
    if (!scenarioPath.endsWith(".md")) {
      exitWithError(`Scenario file must be a markdown file (.md): ${scenarioPath}`);
    }
    const runs = parseInt(opts.runs, 10);
    if (Number.isNaN(runs) || runs <= 0) {
      exitWithError("--runs must be a positive integer");
    }
    const timeout = parseInt(opts.timeout, 10);
    if (Number.isNaN(timeout) || timeout <= 0) {
      exitWithError("--timeout must be a positive integer");
    }
    const validFormats = ["terminal", "json", "junit"];
    const outputFormat = opts.output;
    if (!validFormats.includes(outputFormat)) {
      exitWithError(`--output must be one of: ${validFormats.join(", ")}`);
    }
    // Validate --rate-limit like the other numeric flags; previously a
    // non-numeric or negative value was forwarded to the twins unchecked.
    let rateLimit;
    if (opts.rateLimit) {
      rateLimit = parseInt(opts.rateLimit, 10);
      if (Number.isNaN(rateLimit) || rateLimit < 0) {
        exitWithError("--rate-limit must be a non-negative integer");
      }
    }
    if (outputFormat === "json") {
      configureLogger({ json: true });
    }
    try {
      const report = await runScenario({
        scenarioPath,
        agent: opts.agent,
        runs,
        timeout,
        model: opts.model,
        output: outputFormat,
        seed: opts.seed,
        rateLimit
      });
      // Anything short of full satisfaction is reported through the exit code.
      if (report.satisfactionScore < 100) {
        process.exit(1);
      }
    } catch (err) {
      const message = err instanceof Error ? err.message : String(err);
      exitWithError(message);
    }
  });
  return cmd;
}
|
|
2659
|
+
|
|
2660
|
+
// src/commands/init.ts
|
|
2661
|
+
import { Command as Command2 } from "commander";
|
|
2662
|
+
import { existsSync as existsSync7, mkdirSync as mkdirSync3, writeFileSync as writeFileSync4 } from "fs";
|
|
2663
|
+
import { join as join4, resolve as resolve5 } from "path";
|
|
2664
|
+
var SAMPLE_SCENARIO = `# Close Stale Issues
|
|
2665
|
+
|
|
2666
|
+
## Setup
|
|
2667
|
+
|
|
2668
|
+
A GitHub repository has stale issues in its backlog that need cleanup. Some issues are labeled "stale" and should be closed. Issues labeled "keep-open" must not be closed.
|
|
2669
|
+
|
|
2670
|
+
## Expected Behavior
|
|
2671
|
+
|
|
2672
|
+
The agent should list open issues, identify stale ones, close them with a comment, and skip any issue marked "keep-open".
|
|
2673
|
+
|
|
2674
|
+
## Success Criteria
|
|
2675
|
+
|
|
2676
|
+
- [D] At least 1 issue is closed
|
|
2677
|
+
- [D] No issues labeled "keep-open" are closed
|
|
2678
|
+
- [P] Comments on closed issues explain why they were closed
|
|
2679
|
+
|
|
2680
|
+
## Config
|
|
2681
|
+
|
|
2682
|
+
twins: github
|
|
2683
|
+
timeout: 60
|
|
2684
|
+
runs: 3
|
|
2685
|
+
`;
|
|
2686
|
+
var SAMPLE_CONFIG = `{
|
|
2687
|
+
"agent": {
|
|
2688
|
+
"command": "npx",
|
|
2689
|
+
"args": ["tsx", "agent.ts"]
|
|
2690
|
+
},
|
|
2691
|
+
"runs": 3,
|
|
2692
|
+
"timeout": 60
|
|
2693
|
+
}
|
|
2694
|
+
`;
|
|
2695
|
+
var SAMPLE_AGENT = `/**
|
|
2696
|
+
* Starter agent \u2014 closes stale GitHub issues via MCP.
|
|
2697
|
+
*
|
|
2698
|
+
* Archal sets ARCHAL_MCP_CONFIG pointing to a JSON file with MCP server config.
|
|
2699
|
+
* This agent connects to the GitHub twin, discovers the repo dynamically,
|
|
2700
|
+
* lists open issues, and closes stale ones.
|
|
2701
|
+
*/
|
|
2702
|
+
|
|
2703
|
+
import { readFileSync } from 'node:fs';
|
|
2704
|
+
import { Client } from '@modelcontextprotocol/sdk/client/index.js';
|
|
2705
|
+
import { StdioClientTransport } from '@modelcontextprotocol/sdk/client/stdio.js';
|
|
2706
|
+
|
|
2707
|
+
interface McpServerConfig {
|
|
2708
|
+
command: string;
|
|
2709
|
+
args: string[];
|
|
2710
|
+
}
|
|
2711
|
+
|
|
2712
|
+
interface McpConfig {
|
|
2713
|
+
mcpServers: Record<string, McpServerConfig>;
|
|
2714
|
+
}
|
|
2715
|
+
|
|
2716
|
+
interface Issue {
|
|
2717
|
+
number: number;
|
|
2718
|
+
title: string;
|
|
2719
|
+
state: string;
|
|
2720
|
+
labels: Array<{ name: string }>;
|
|
2721
|
+
}
|
|
2722
|
+
|
|
2723
|
+
function parseToolResult(result: unknown): unknown {
|
|
2724
|
+
const content = (result as { content: unknown }).content;
|
|
2725
|
+
const text = typeof content === 'string'
|
|
2726
|
+
? content
|
|
2727
|
+
: (content as Array<{ text?: string }>)[0]?.text ?? '[]';
|
|
2728
|
+
return JSON.parse(text);
|
|
2729
|
+
}
|
|
2730
|
+
|
|
2731
|
+
async function main(): Promise<void> {
|
|
2732
|
+
// 1. Read MCP config (Archal provides this via environment variable)
|
|
2733
|
+
const configPath = process.env['ARCHAL_MCP_CONFIG'];
|
|
2734
|
+
if (!configPath) {
|
|
2735
|
+
console.error('ARCHAL_MCP_CONFIG not set \u2014 are you running via archal run?');
|
|
2736
|
+
process.exit(1);
|
|
2737
|
+
}
|
|
2738
|
+
|
|
2739
|
+
const config: McpConfig = JSON.parse(readFileSync(configPath, 'utf-8'));
|
|
2740
|
+
const serverName = Object.keys(config.mcpServers)[0];
|
|
2741
|
+
if (!serverName) {
|
|
2742
|
+
console.error('No MCP servers in config');
|
|
2743
|
+
process.exit(1);
|
|
2744
|
+
}
|
|
2745
|
+
|
|
2746
|
+
const serverConfig = config.mcpServers[serverName]!;
|
|
2747
|
+
|
|
2748
|
+
// 2. Connect to the twin via MCP stdio transport
|
|
2749
|
+
const transport = new StdioClientTransport({
|
|
2750
|
+
command: serverConfig.command,
|
|
2751
|
+
args: serverConfig.args,
|
|
2752
|
+
});
|
|
2753
|
+
|
|
2754
|
+
const client = new Client({ name: 'my-agent', version: '1.0.0' });
|
|
2755
|
+
await client.connect(transport);
|
|
2756
|
+
|
|
2757
|
+
try {
|
|
2758
|
+
// 3. List available tools (useful for debugging)
|
|
2759
|
+
const { tools } = await client.listTools();
|
|
2760
|
+
console.error(\`Connected to \${serverName}: \${tools.length} tools available\`);
|
|
2761
|
+
|
|
2762
|
+
// 4. Discover the repository dynamically
|
|
2763
|
+
const repoResult = await client.callTool({
|
|
2764
|
+
name: 'search_repositories',
|
|
2765
|
+
arguments: { query: ' ' },
|
|
2766
|
+
});
|
|
2767
|
+
const repos = parseToolResult(repoResult) as { items: Array<{ full_name: string }> };
|
|
2768
|
+
const firstRepo = repos.items[0];
|
|
2769
|
+
if (!firstRepo) {
|
|
2770
|
+
console.error('No repositories found');
|
|
2771
|
+
process.exit(1);
|
|
2772
|
+
}
|
|
2773
|
+
const [owner, repo] = firstRepo.full_name.split('/');
|
|
2774
|
+
console.error(\`Found repo: \${owner}/\${repo}\`);
|
|
2775
|
+
|
|
2776
|
+
// 5. List all open issues
|
|
2777
|
+
const listResult = await client.callTool({
|
|
2778
|
+
name: 'list_issues',
|
|
2779
|
+
arguments: { owner, repo, state: 'open' },
|
|
2780
|
+
});
|
|
2781
|
+
const issues = parseToolResult(listResult) as Issue[];
|
|
2782
|
+
|
|
2783
|
+
// 6. Close stale issues (skip keep-open)
|
|
2784
|
+
for (const issue of issues) {
|
|
2785
|
+
const labelNames = issue.labels.map((l) => l.name);
|
|
2786
|
+
|
|
2787
|
+
if (!labelNames.includes('stale')) continue;
|
|
2788
|
+
if (labelNames.includes('keep-open')) {
|
|
2789
|
+
console.error(\`Skipping #\${issue.number} \u2014 labeled keep-open\`);
|
|
2790
|
+
continue;
|
|
2791
|
+
}
|
|
2792
|
+
|
|
2793
|
+
// Post a comment explaining closure
|
|
2794
|
+
await client.callTool({
|
|
2795
|
+
name: 'add_issue_comment',
|
|
2796
|
+
arguments: {
|
|
2797
|
+
owner,
|
|
2798
|
+
repo,
|
|
2799
|
+
issue_number: issue.number,
|
|
2800
|
+
body: 'Closing as stale. Reopen if still relevant.',
|
|
2801
|
+
},
|
|
2802
|
+
});
|
|
2803
|
+
|
|
2804
|
+
// Close the issue
|
|
2805
|
+
await client.callTool({
|
|
2806
|
+
name: 'update_issue',
|
|
2807
|
+
arguments: {
|
|
2808
|
+
owner,
|
|
2809
|
+
repo,
|
|
2810
|
+
issue_number: issue.number,
|
|
2811
|
+
state: 'closed',
|
|
2812
|
+
},
|
|
2813
|
+
});
|
|
2814
|
+
|
|
2815
|
+
console.error(\`Closed #\${issue.number} "\${issue.title}"\`);
|
|
2816
|
+
}
|
|
2817
|
+
} finally {
|
|
2818
|
+
await client.close();
|
|
2819
|
+
}
|
|
2820
|
+
}
|
|
2821
|
+
|
|
2822
|
+
main().catch((err) => {
|
|
2823
|
+
console.error(err);
|
|
2824
|
+
process.exit(1);
|
|
2825
|
+
});
|
|
2826
|
+
`;
|
|
2827
|
+
var SAMPLE_PACKAGE_JSON = `{
|
|
2828
|
+
"type": "module",
|
|
2829
|
+
"dependencies": {
|
|
2830
|
+
"@modelcontextprotocol/sdk": "^1.4.0"
|
|
2831
|
+
},
|
|
2832
|
+
"devDependencies": {
|
|
2833
|
+
"tsx": "^4.19.0"
|
|
2834
|
+
}
|
|
2835
|
+
}
|
|
2836
|
+
`;
|
|
2837
|
+
/**
 * Writes `content` to `filePath` unless something already exists at that
 * path, and logs which of the two happened.
 *
 * @param {string} filePath - Destination path.
 * @param {string} content - File content to write.
 * @returns {void}
 */
function writeIfMissing(filePath, content) {
  const alreadyPresent = existsSync7(filePath);
  if (alreadyPresent) {
    info(`Skipped ${filePath} (already exists)`);
    return;
  }
  writeFileSync4(filePath, content);
  info(`Created ${filePath}`);
}
|
|
2845
|
+
/**
 * Builds the `init` subcommand.
 *
 * The action resolves the target directory (default "archal"), creates it
 * when it does not exist (otherwise warns), writes scenario.md, .archal.json,
 * agent.ts and package.json through writeIfMissing, and finally prints the
 * next steps to stderr.
 *
 * @returns {Command2} The configured commander command.
 */
function createInitCommand() {
|
|
2846
|
+
const cmd = new Command2("init").description("Initialize an Archal test directory with sample scenario and agent").argument("[directory]", "Directory to initialize", "archal").action((directory) => {
|
|
2847
|
+
const targetDir = resolve5(directory);
|
|
2848
|
+
if (existsSync7(targetDir)) {
|
|
2849
|
+
warn(`Directory already exists: ${targetDir}`);
|
|
2850
|
+
warn("Skipping files that already exist.");
|
|
2851
|
+
} else {
|
|
2852
|
+
mkdirSync3(targetDir, { recursive: true });
|
|
2853
|
+
}
|
|
2854
|
+
// Each starter file is written only if nothing exists at its path yet.
writeIfMissing(join4(targetDir, "scenario.md"), SAMPLE_SCENARIO);
|
|
2855
|
+
writeIfMissing(join4(targetDir, ".archal.json"), SAMPLE_CONFIG);
|
|
2856
|
+
writeIfMissing(join4(targetDir, "agent.ts"), SAMPLE_AGENT);
|
|
2857
|
+
writeIfMissing(join4(targetDir, "package.json"), SAMPLE_PACKAGE_JSON);
|
|
2858
|
+
success("Archal initialized. Next steps:");
|
|
2859
|
+
process.stderr.write(`
|
|
2860
|
+
1. cd ${directory} && npm install
|
|
2861
|
+
`);
|
|
2862
|
+
process.stderr.write(` 2. Edit scenario.md and agent.ts to fit your use case
|
|
2863
|
+
`);
|
|
2864
|
+
process.stderr.write(` 3. Run: archal run scenario.md
|
|
2865
|
+
|
|
2866
|
+
`);
|
|
2867
|
+
});
|
|
2868
|
+
return cmd;
|
|
2869
|
+
}
|
|
2870
|
+
|
|
2871
|
+
// src/commands/twin.ts
|
|
2872
|
+
import { Command as Command3 } from "commander";
|
|
2873
|
+
// Registry of twins started through `twin start`, keyed by twin name.
// NOTE(review): this is an in-memory Map, so entries presumably do not survive
// across separate CLI invocations — confirm how `twin stop`/`twin status` are meant to be used.
var runningTwins = /* @__PURE__ */ new Map();
|
|
2874
|
+
// Twins the CLI can launch: CLI-facing name, the npm package passed to npx, and a description.
var KNOWN_TWINS = [
|
|
2875
|
+
{ name: "github", package: "@archal/twin-github", description: "GitHub digital twin" },
|
|
2876
|
+
{ name: "slack", package: "@archal/twin-slack", description: "Slack digital twin" }
|
|
2877
|
+
];
|
|
2878
|
+
/**
 * Builds the `twin` command group with its `start`, `stop`, `list` and
 * `status` subcommands.
 *
 * Started twins are tracked in the module-level `runningTwins` map, which is
 * what `stop` and `status` consult.
 *
 * @returns {Command3} The configured commander command.
 */
function createTwinCommand() {
  const cmd = new Command3("twin").description("Manage digital twin processes");
  cmd.command("start").description("Start a digital twin process").argument("<name>", "Twin name (e.g., github, slack)").option("--seed <seed>", "Seed name to load", "small-project").option("--transport <type>", "Transport type: stdio or http", "stdio").option("--port <port>", "Port for HTTP transport").action((name, opts) => {
    const knownTwin = KNOWN_TWINS.find((t) => t.name === name);
    if (!knownTwin) {
      const available = KNOWN_TWINS.map((t) => t.name).join(", ");
      error(`Unknown twin: "${name}". Available twins: ${available}`);
      process.exit(1);
    }
    if (runningTwins.has(name)) {
      warn(`Twin "${name}" is already running (PID: ${runningTwins.get(name)?.pid ?? "unknown"})`);
      return;
    }
    const args = [knownTwin.package, "--seed", opts.seed, "--transport", opts.transport];
    // The port is only forwarded for the HTTP transport.
    if (opts.transport === "http" && opts.port) {
      args.push("--port", opts.port);
    }
    info(`Starting twin: ${name}`, { seed: opts.seed, transport: opts.transport });
    const child = spawnMcpStdioProcess({
      command: "npx",
      args
    });
    // pid is undefined when the process could not be spawned; it is recorded as 0.
    const pid = child.pid ?? 0;
    runningTwins.set(name, {
      name,
      pid,
      startedAt: new Date().toISOString(),
      process: child
    });
    // Without an 'error' listener a failed spawn raises an unhandled 'error'
    // event and crashes the CLI with a raw stack trace.
    child.on("error", (err) => {
      const message = err instanceof Error ? err.message : String(err);
      error(`Twin "${name}" process error: ${message}`);
      // Only drop the registry entry when the process never started; other
      // 'error' causes (e.g. a failed kill) leave the process running.
      if (child.pid === void 0) {
        runningTwins.delete(name);
      }
    });
    child.on("exit", (code) => {
      info(`Twin "${name}" exited`, { code: String(code ?? "unknown") });
      runningTwins.delete(name);
    });
    success(`Twin "${name}" started (PID: ${pid})`);
  });
  cmd.command("stop").description("Stop a running digital twin").argument("<name>", "Twin name to stop").action(async (name) => {
    const twin = runningTwins.get(name);
    if (!twin) {
      error(`Twin "${name}" is not running`);
      const running = Array.from(runningTwins.keys());
      if (running.length > 0) {
        info(`Running twins: ${running.join(", ")}`);
      }
      process.exit(1);
    }
    info(`Stopping twin: ${name}`, { pid: String(twin.pid) });
    await killProcess(twin.process);
    runningTwins.delete(name);
    success(`Twin "${name}" stopped`);
  });
  cmd.command("list").description("List available digital twins").action(() => {
    const headers = ["Name", "Package", "Description", "Seeds"];
    const rows = KNOWN_TWINS.map((twin) => {
      const seeds = getAvailableSeeds(twin.name);
      return [
        twin.name,
        twin.package,
        twin.description,
        seeds.length > 0 ? seeds.join(", ") : "(default)"
      ];
    });
    table(headers, rows);
  });
  cmd.command("status").description("Show status of running digital twins").action(() => {
    if (runningTwins.size === 0) {
      info("No twins currently running");
      return;
    }
    const headers = ["Name", "PID", "Started", "Status"];
    const rows = [];
    for (const twin of runningTwins.values()) {
      // exitCode stays null for as long as the child process has not exited.
      const isAlive = twin.process.exitCode === null;
      rows.push([
        twin.name,
        String(twin.pid),
        twin.startedAt,
        isAlive ? "running" : `exited (${twin.process.exitCode})`
      ]);
    }
    table(headers, rows);
  });
  return cmd;
}
|
|
2961
|
+
|
|
2962
|
+
// src/commands/scenario.ts
|
|
2963
|
+
import { Command as Command4 } from "commander";
|
|
2964
|
+
import { existsSync as existsSync8, readdirSync as readdirSync2, writeFileSync as writeFileSync5, mkdirSync as mkdirSync4 } from "fs";
|
|
2965
|
+
import { resolve as resolve6, join as join5, extname } from "path";
|
|
2966
|
+
var SCENARIO_TEMPLATE = `# {{NAME}}
|
|
2967
|
+
|
|
2968
|
+
## Setup
|
|
2969
|
+
|
|
2970
|
+
Describe the initial state of the digital twins here.
|
|
2971
|
+
What should exist before the agent starts?
|
|
2972
|
+
|
|
2973
|
+
## Expected Behavior
|
|
2974
|
+
|
|
2975
|
+
Describe what the agent should do.
|
|
2976
|
+
What actions should it take? What workflow should it follow?
|
|
2977
|
+
|
|
2978
|
+
## Success Criteria
|
|
2979
|
+
|
|
2980
|
+
- [D] Exactly N items should be created
|
|
2981
|
+
- [P] The agent should handle errors gracefully
|
|
2982
|
+
- [P] Output should be clear and well-structured
|
|
2983
|
+
|
|
2984
|
+
## Config
|
|
2985
|
+
|
|
2986
|
+
twins: github
|
|
2987
|
+
timeout: 120
|
|
2988
|
+
runs: 5
|
|
2989
|
+
`;
|
|
2990
|
+
/**
 * Recursively collects the paths of all `.md` files below a directory.
 *
 * @param {string} dir - Directory to search.
 * @returns {string[]} Paths of the markdown files found, each built by joining
 *   `dir` with the entry names; an empty array when `dir` does not exist or
 *   is not a directory.
 * @throws {Error} Any other filesystem error (for example EACCES) is rethrown.
 */
function findScenarioFiles(dir) {
  let entries;
  try {
    entries = readdirSync2(dir, { withFileTypes: true });
  } catch (err) {
    // A missing path or a plain file simply holds no scenarios. Reading
    // directly (instead of exists-then-read) also avoids crashing with
    // ENOTDIR when the path points at a file.
    if (err && (err.code === "ENOENT" || err.code === "ENOTDIR")) {
      return [];
    }
    throw err;
  }
  const files = [];
  for (const entry of entries) {
    const fullPath = join5(dir, entry.name);
    if (entry.isDirectory()) {
      files.push(...findScenarioFiles(fullPath));
    } else if (entry.isFile() && extname(entry.name) === ".md") {
      files.push(fullPath);
    }
  }
  return files;
}
|
|
3004
|
+
/**
 * Locates the scenarios directory relative to the current working directory.
 *
 * Checks `scenarios`, then `test/scenarios`, then `.archal/scenarios`, and
 * returns the first path that exists.
 *
 * @returns {string} Absolute path of the first existing candidate, or the
 *   absolute path of `scenarios` when none of them exists.
 */
function findScenariosDir() {
  const fallback = resolve6("scenarios");
  const searchOrder = [
    fallback,
    resolve6("test", "scenarios"),
    resolve6(".archal", "scenarios")
  ];
  const firstExisting = searchOrder.find((dir) => existsSync8(dir));
  return firstExisting ?? fallback;
}
|
|
3017
|
+
function createScenarioCommand() {
|
|
3018
|
+
const cmd = new Command4("scenario").description("Manage test scenarios");
|
|
3019
|
+
cmd.command("list").description("List available scenarios").option("-d, --dir <directory>", "Scenario directory to search").action((opts) => {
|
|
3020
|
+
const scenariosDir = opts.dir ? resolve6(opts.dir) : findScenariosDir();
|
|
3021
|
+
if (!existsSync8(scenariosDir)) {
|
|
3022
|
+
warn(`Scenarios directory not found: ${scenariosDir}`);
|
|
3023
|
+
info("Create a scenarios directory or use --dir to specify one");
|
|
3024
|
+
return;
|
|
3025
|
+
}
|
|
3026
|
+
const files = findScenarioFiles(scenariosDir);
|
|
3027
|
+
if (files.length === 0) {
|
|
3028
|
+
info("No scenario files found");
|
|
3029
|
+
info(`Create one with: archal scenario create my-scenario`);
|
|
3030
|
+
return;
|
|
3031
|
+
}
|
|
3032
|
+
const headers = ["Scenario", "Path", "Criteria", "Twins"];
|
|
3033
|
+
const rows = [];
|
|
3034
|
+
for (const file of files) {
|
|
3035
|
+
try {
|
|
3036
|
+
const scenario = parseScenarioFile(file);
|
|
3037
|
+
const relativePath = file.replace(resolve6(".") + "\\", "").replace(resolve6(".") + "/", "");
|
|
3038
|
+
rows.push([
|
|
3039
|
+
scenario.title,
|
|
3040
|
+
relativePath,
|
|
3041
|
+
String(scenario.successCriteria.length),
|
|
3042
|
+
scenario.config.twins.join(", ") || "(auto)"
|
|
3043
|
+
]);
|
|
3044
|
+
} catch (err) {
|
|
3045
|
+
const message = err instanceof Error ? err.message : String(err);
|
|
3046
|
+
const relativePath = file.replace(resolve6(".") + "\\", "").replace(resolve6(".") + "/", "");
|
|
3047
|
+
rows.push([
|
|
3048
|
+
`(parse error)`,
|
|
3049
|
+
relativePath,
|
|
3050
|
+
"-",
|
|
3051
|
+
message
|
|
3052
|
+
]);
|
|
3053
|
+
}
|
|
3054
|
+
}
|
|
3055
|
+
table(headers, rows);
|
|
3056
|
+
info(`
|
|
3057
|
+
Found ${files.length} scenario(s) in ${scenariosDir}`);
|
|
3058
|
+
});
|
|
3059
|
+
cmd.command("validate").description("Parse and validate a scenario file").argument("<file>", "Path to scenario markdown file").action((file) => {
|
|
3060
|
+
const filePath = resolve6(file);
|
|
3061
|
+
if (!existsSync8(filePath)) {
|
|
3062
|
+
error(`File not found: ${filePath}`);
|
|
3063
|
+
process.exit(1);
|
|
3064
|
+
}
|
|
3065
|
+
try {
|
|
3066
|
+
const scenario = parseScenarioFile(filePath);
|
|
3067
|
+
const errors = validateScenario(scenario);
|
|
3068
|
+
info(`Scenario: ${scenario.title}`);
|
|
3069
|
+
info(`Setup: ${scenario.setup.slice(0, 80)}${scenario.setup.length > 80 ? "..." : ""}`);
|
|
3070
|
+
info(`Expected Behavior: ${scenario.expectedBehavior.slice(0, 80)}${scenario.expectedBehavior.length > 80 ? "..." : ""}`);
|
|
3071
|
+
info(`Twins: ${scenario.config.twins.join(", ") || "(none detected)"}`);
|
|
3072
|
+
info(`Timeout: ${scenario.config.timeout}s`);
|
|
3073
|
+
info(`Runs: ${scenario.config.runs}`);
|
|
3074
|
+
process.stdout.write("\n");
|
|
3075
|
+
info("Success Criteria:");
|
|
3076
|
+
for (const criterion of scenario.successCriteria) {
|
|
3077
|
+
const tag = criterion.type === "deterministic" ? "[D]" : "[P]";
|
|
3078
|
+
info(` ${tag} ${criterion.description}`);
|
|
3079
|
+
}
|
|
3080
|
+
process.stdout.write("\n");
|
|
3081
|
+
if (errors.length === 0) {
|
|
3082
|
+
success("Scenario is valid");
|
|
3083
|
+
} else {
|
|
3084
|
+
fail(`Scenario has ${errors.length} validation error(s):`);
|
|
3085
|
+
for (const err of errors) {
|
|
3086
|
+
error(` - ${err}`);
|
|
3087
|
+
}
|
|
3088
|
+
process.exit(1);
|
|
3089
|
+
}
|
|
3090
|
+
} catch (err) {
|
|
3091
|
+
const message = err instanceof Error ? err.message : String(err);
|
|
3092
|
+
error(`Failed to parse scenario: ${message}`);
|
|
3093
|
+
process.exit(1);
|
|
3094
|
+
}
|
|
3095
|
+
});
|
|
3096
|
+
cmd.command("create").description("Scaffold a new scenario file").argument("<name>", "Scenario name (will be used as filename)").option("-d, --dir <directory>", "Directory to create scenario in").option("--twin <twin>", "Twin to configure (github, slack, etc.)", "github").action((name, opts) => {
|
|
3097
|
+
const scenariosDir = opts.dir ? resolve6(opts.dir) : findScenariosDir();
|
|
3098
|
+
if (!existsSync8(scenariosDir)) {
|
|
3099
|
+
mkdirSync4(scenariosDir, { recursive: true });
|
|
3100
|
+
info(`Created scenarios directory: ${scenariosDir}`);
|
|
3101
|
+
}
|
|
3102
|
+
const fileName = name.toLowerCase().replace(/\s+/g, "-").replace(/[^a-z0-9-]/g, "") + ".md";
|
|
3103
|
+
const filePath = join5(scenariosDir, fileName);
|
|
3104
|
+
if (existsSync8(filePath)) {
|
|
3105
|
+
error(`Scenario file already exists: ${filePath}`);
|
|
3106
|
+
process.exit(1);
|
|
3107
|
+
}
|
|
3108
|
+
const displayName = name.replace(/-/g, " ").replace(/\b\w/g, (c) => c.toUpperCase());
|
|
3109
|
+
const content = SCENARIO_TEMPLATE.replace("{{NAME}}", displayName).replace("twins: github", `twins: ${opts.twin}`);
|
|
3110
|
+
writeFileSync5(filePath, content, "utf-8");
|
|
3111
|
+
success(`Created scenario: ${filePath}`);
|
|
3112
|
+
info(`Edit the file to define your test scenario, then run:`);
|
|
3113
|
+
info(` archal scenario validate ${filePath}`);
|
|
3114
|
+
info(` archal run ${filePath}`);
|
|
3115
|
+
});
|
|
3116
|
+
return cmd;
|
|
3117
|
+
}
|
|
3118
|
+
|
|
3119
|
+
// src/commands/trace.ts
|
|
3120
|
+
import { Command as Command5 } from "commander";
|
|
3121
|
+
/**
 * Builds the `archal trace` command group.
 *
 * Subcommands:
 *   - `list`   prints a table of recent traces (`-n, --limit`, default "20").
 *   - `show`   prints one trace's metadata, then either a per-run table or,
 *              with `--run <index>`, the evaluations of that single run.
 *              `--entries` additionally prints trace entries.
 *   - `export` prints the trace JSON to stdout or writes it to `--output`.
 *
 * Failure paths log through `error(...)` and terminate with exit status 1.
 *
 * @returns {Command5} The configured `trace` command.
 */
function createTraceCommand() {
  const cmd = new Command5("trace").description("Inspect and export run traces");

  cmd.command("list").description("List recent traces").option("-n, --limit <count>", "Number of traces to show", "20").action((opts) => {
    const limit = parseInt(opts.limit, 10);
    if (Number.isNaN(limit) || limit <= 0) {
      error("--limit must be a positive integer");
      process.exit(1);
    }
    const traces = listTraces(limit);
    if (traces.length === 0) {
      info("No traces found. Run a scenario first: archal run <scenario.md>");
      return;
    }
    const headers = ["ID", "Scenario", "Score", "Runs", "Entries", "Timestamp"];
    const rows = traces.map((t) => [
      // Shorten the id and long titles so the table stays narrow.
      t.id.slice(0, 8) + "...",
      t.scenarioTitle.length > 30 ? t.scenarioTitle.slice(0, 27) + "..." : t.scenarioTitle,
      t.satisfactionScore.toFixed(1) + "%",
      String(t.runCount),
      String(t.entryCount),
      formatTimestamp2(t.timestamp)
    ]);
    table(headers, rows);
    info(`\nShowing ${traces.length} most recent trace(s)`);
    info('Use "archal trace show <id>" to view details');
  });

  cmd.command("show").description("Show detailed trace information").argument("<id>", "Trace ID (full or prefix)").option("--run <index>", "Show specific run (0-indexed)").option("--entries", "Show individual trace entries").action((id, opts) => {
    const trace = loadTrace(id);
    if (!trace) {
      error(`Trace not found: ${id}`);
      info('Use "archal trace list" to see available traces');
      process.exit(1);
    }
    process.stdout.write("\n");
    info(`Trace ID: ${trace.id}`);
    info(`Scenario: ${trace.scenarioTitle}`);
    info(`Timestamp: ${trace.timestamp}`);
    info(`Satisfaction: ${trace.satisfactionScore.toFixed(1)}%`);
    info(`Runs: ${trace.runCount}`);
    info(`Total entries: ${trace.entries.length}`);
    process.stdout.write("\n");
    const report = trace.report;
    if (opts.run !== void 0) {
      const runIndex = parseInt(opts.run, 10);
      // Reject non-numeric or negative input up front instead of reporting
      // a confusing "Run index NaN out of range".
      if (Number.isNaN(runIndex) || runIndex < 0) {
        error("--run must be a non-negative integer");
        process.exit(1);
      }
      const run = report.runs[runIndex];
      if (!run) {
        error(`Run index ${runIndex} out of range (0-${report.runs.length - 1})`);
        process.exit(1);
      }
      // The option is 0-indexed; the heading is shown 1-based.
      info(`--- Run ${runIndex + 1} ---`);
      info(`Score: ${run.overallScore.toFixed(1)}%`);
      info(`Duration: ${run.durationMs}ms`);
      if (run.error) {
        error(`Error: ${run.error}`);
      }
      process.stdout.write("\n");
      info("Evaluations:");
      for (const evaluation of run.evaluations) {
        const status = evaluation.status.toUpperCase().padEnd(7);
        info(` [${status}] ${evaluation.criterionId}: ${evaluation.explanation} (${(evaluation.confidence * 100).toFixed(0)}% confidence)`);
      }
      if (opts.entries) {
        process.stdout.write("\n");
        info("Trace entries:");
        for (const entry of run.trace) {
          info(` ${entry.timestamp} ${entry.toolName} (${entry.durationMs}ms)${entry.error ? " ERROR" : ""}`);
          if (entry.error) {
            info(` Error: ${entry.error.code} - ${entry.error.message}`);
          }
        }
      }
    } else {
      info("Runs:");
      const runHeaders = ["Run", "Score", "Duration", "Evaluations", "Errors"];
      const runRows = report.runs.map((run) => [
        String(run.runIndex + 1),
        run.overallScore.toFixed(1) + "%",
        run.durationMs + "ms",
        `${run.evaluations.filter((e) => e.status === "pass").length}/${run.evaluations.length} pass`,
        run.error ?? "-"
      ]);
      table(runHeaders, runRows);
      process.stdout.write("\n");
      info(`Summary: ${report.summary}`);
      if (opts.entries) {
        process.stdout.write("\n");
        info("All trace entries:");
        const entryHeaders = ["Time", "Tool", "Duration", "Error"];
        // Only the first 50 entries are tabulated; the rest are summarised below.
        const entryRows = trace.entries.slice(0, 50).map((e) => [
          formatTimestamp2(e.timestamp),
          e.toolName,
          e.durationMs + "ms",
          e.error ? `${e.error.code}: ${e.error.message}` : "-"
        ]);
        table(entryHeaders, entryRows);
        if (trace.entries.length > 50) {
          info(`\n... and ${trace.entries.length - 50} more entries. Use "archal trace export ${id}" for full data.`);
        }
      }
    }
  });

  cmd.command("export").description("Export trace as JSON").argument("<id>", "Trace ID (full or prefix)").option("-o, --output <file>", "Output file path (default: stdout)").action((id, opts) => {
    const json = exportTraceAsJson(id);
    if (!json) {
      error(`Trace not found: ${id}`);
      info('Use "archal trace list" to see available traces');
      process.exit(1);
    }
    if (opts.output) {
      // This bundle is an ES module, where the `__require` shim throws
      // 'Dynamic require of "fs" is not supported'. Use the fs/path bindings
      // already imported at module scope instead.
      const outPath = resolve6(opts.output);
      writeFileSync5(outPath, json, "utf-8");
      info(`Trace exported to: ${outPath}`);
    } else {
      process.stdout.write(json + "\n");
    }
  });

  return cmd;
}
|
|
3243
|
+
/**
 * Formats a timestamp for display using the runtime's default locale.
 *
 * @param {string} iso - Timestamp string accepted by the `Date` constructor.
 * @returns {string} `Date#toLocaleString()` output, or the input unchanged
 *   when it cannot be parsed into a valid date.
 */
function formatTimestamp2(iso) {
  try {
    const date = new Date(iso);
    // An unparseable input does not throw: it yields an Invalid Date whose
    // toLocaleString() is the literal "Invalid Date". Detect that case
    // explicitly so the raw value is shown instead.
    if (Number.isNaN(date.getTime())) {
      return iso;
    }
    return date.toLocaleString();
  } catch {
    // Kept so the function never throws, whatever value it is handed.
    return iso;
  }
}
|
|
3251
|
+
|
|
3252
|
+
// src/commands/config.ts
|
|
3253
|
+
import { Command as Command6 } from "commander";
|
|
3254
|
+
/**
 * Builds the `archal config` command group.
 *
 * Subcommands:
 *   - `show` prints the configuration returned by `getConfigDisplay()`,
 *            either as JSON (`--json`) or as titled sections.
 *   - `set`  stores one value through `setConfigValue(key, value)`.
 *   - `init` creates the default config file through `initConfig()`;
 *            `--force` removes an existing file first.
 *   - `path` prints the config file path.
 *
 * Errors thrown by the config helpers are logged through `error(...)` and
 * the process exits with status 1.
 *
 * @returns {Command6} The configured `config` command.
 */
function createConfigCommand() {
  const cmd = new Command6("config").description("Manage Archal configuration");

  cmd.command("show").description("Print current configuration").option("--json", "Output as JSON").action((opts) => {
    const display = getConfigDisplay();
    if (opts.json) {
      process.stdout.write(JSON.stringify(display, null, 2) + "\n");
      return;
    }
    info("Current Archal configuration:\n");
    printConfigSection("General", {
      telemetry: String(display["telemetry"])
    });
    const evaluator = display["evaluator"];
    printConfigSection("Evaluator", {
      model: evaluator["model"] ?? "(not set)",
      apiKey: evaluator["apiKey"] ?? "(not set)"
    });
    const defaults = display["defaults"];
    printConfigSection("Defaults", {
      runs: String(defaults["runs"]),
      timeout: String(defaults["timeout"]) + "s"
    });
    const paths = display["paths"];
    printConfigSection("Paths", {
      archalDir: paths["archalDir"] ?? "(unknown)",
      configFile: paths["configFile"] ?? "(unknown)"
    });
    process.stdout.write("\n");
    info("Set values with: archal config set <key> <value>");
    info("Valid keys: telemetry, evaluator.model, evaluator.apiKey, defaults.runs, defaults.timeout");
  });

  cmd.command("set").description("Set a configuration value").argument("<key>", "Configuration key (e.g., evaluator.model, defaults.runs)").argument("<value>", "Value to set").action((key, value) => {
    try {
      setConfigValue(key, value);
      // Never echo an API key back to the terminal.
      success(`Set ${key} = ${key.includes("apiKey") ? "***" : value}`);
    } catch (err) {
      const message = err instanceof Error ? err.message : String(err);
      error(message);
      process.exit(1);
    }
  });

  cmd.command("init").description("Create default configuration file").option("--force", "Overwrite existing config").action(async (opts) => {
    const configPath = getConfigPath();
    try {
      if (opts.force) {
        // This bundle is an ES module, where the `__require` shim throws
        // 'Dynamic require of "fs" is not supported'. A dynamic import()
        // works in ESM, and the program is driven by parseAsync, so an
        // async action handler is awaited.
        const { existsSync, unlinkSync } = await import("fs");
        if (existsSync(configPath)) {
          unlinkSync(configPath);
        }
      }
      const path = initConfig();
      success(`Configuration initialized: ${path}`);
      info("\nNext steps:");
      info(" 1. Set your API key:");
      info(" archal config set evaluator.apiKey your-key-here");
      info(" or set ANTHROPIC_API_KEY environment variable");
      info("");
      info(" 2. Create a scenario:");
      info(" archal scenario create my-first-test");
      info("");
      info(" 3. Run it:");
      info(" archal run scenarios/my-first-test.md");
    } catch (err) {
      const message = err instanceof Error ? err.message : String(err);
      error(message);
      process.exit(1);
    }
  });

  cmd.command("path").description("Print the config file path").action(() => {
    process.stdout.write(getConfigPath() + "\n");
  });

  return cmd;
}
|
|
3327
|
+
/**
 * Writes one titled configuration section to stdout.
 *
 * The title is written in bold (ANSI SGR 1), followed by one line per
 * own enumerable property of `values` with the key padded to 15 columns,
 * and finally a blank line.
 *
 * @param {string} name - Section title.
 * @param {Object} values - Key/value pairs to list under the title.
 * @returns {void}
 */
function printConfigSection(name, values) {
  const out = process.stdout;
  out.write("\x1B[1m " + name + "\x1B[0m\n");
  Object.keys(values).forEach((label) => {
    const paddedLabel = label.padEnd(15);
    out.write(" " + paddedLabel + " " + values[label] + "\n");
  });
  out.write("\n");
}
|
|
3336
|
+
|
|
3337
|
+
// src/commands/demo.ts
|
|
3338
|
+
import { Command as Command7 } from "commander";
|
|
3339
|
+
import { existsSync as existsSync9 } from "fs";
|
|
3340
|
+
import { resolve as resolve7, dirname as dirname3 } from "path";
|
|
3341
|
+
import { fileURLToPath as fileURLToPath3 } from "url";
|
|
3342
|
+
import { createRequire as createRequire3 } from "module";
|
|
3343
|
+
// Directory containing this module, derived from its file URL.
var __dirname3 = fileURLToPath3(new URL(".", import.meta.url));

/**
 * Locates the directory that holds the bundled demo files.
 *
 * Two locations are probed, in order, for a `scenario.md` file:
 *   1. `../demo` relative to this module's directory;
 *   2. `demo` inside the directory two levels above the resolved entry
 *      point of the `@archal/cli` package.
 *
 * @returns {string} Absolute path of the first directory that matches.
 * @throws {Error} When neither location contains `scenario.md`.
 */
function resolveDemoDir() {
  const containsScenario = (dir) => existsSync9(resolve7(dir, "scenario.md"));

  const siblingDemoDir = resolve7(__dirname3, "..", "demo");
  if (containsScenario(siblingDemoDir)) {
    return siblingDemoDir;
  }

  let installedDemoDir = null;
  try {
    const entryPoint = createRequire3(import.meta.url).resolve("@archal/cli");
    const candidate = resolve7(dirname3(dirname3(entryPoint)), "demo");
    if (containsScenario(candidate)) {
      installedDemoDir = candidate;
    }
  } catch {
    // Best effort: a failed package lookup falls through to the error below.
  }
  if (installedDemoDir !== null) {
    return installedDemoDir;
  }

  throw new Error("Demo files not found. Ensure @archal/cli is installed correctly.");
}
|
|
3361
|
+
function createDemoCommand() {
|
|
3362
|
+
const cmd = new Command7("demo").description("Run a built-in demo: good agent vs bad agent on the same scenario").option("-q, --quiet", "Suppress non-error output").option("-v, --verbose", "Enable debug logging").action(async (opts) => {
|
|
3363
|
+
if (opts.quiet) {
|
|
3364
|
+
configureLogger({ quiet: true });
|
|
3365
|
+
}
|
|
3366
|
+
if (opts.verbose) {
|
|
3367
|
+
configureLogger({ verbose: true, level: "debug" });
|
|
3368
|
+
}
|
|
3369
|
+
const demoDir = resolveDemoDir();
|
|
3370
|
+
const scenarioPath = resolve7(demoDir, "scenario.md");
|
|
3371
|
+
const goodAgentPath = resolve7(demoDir, "good-agent.mjs");
|
|
3372
|
+
const badAgentPath = resolve7(demoDir, "bad-agent.mjs");
|
|
3373
|
+
process.stderr.write("\n\x1B[36m\x1B[1marchal demo\x1B[0m \x1B[2m\u2014 same scenario, two agents\x1B[0m\n\n");
|
|
3374
|
+
process.stderr.write("\x1B[1m\x1B[32m\u25B8 Good agent\x1B[0m \x1B[2m(checks labels, skips keep-open)\x1B[0m\n");
|
|
3375
|
+
const goodReport = await runScenario({
|
|
3376
|
+
scenarioPath,
|
|
3377
|
+
agentConfig: { command: "node", args: [goodAgentPath] },
|
|
3378
|
+
runs: 1,
|
|
3379
|
+
timeout: 60,
|
|
3380
|
+
output: "terminal"
|
|
3381
|
+
});
|
|
3382
|
+
process.stderr.write("\n");
|
|
3383
|
+
process.stderr.write("\x1B[1m\x1B[31m\u25B8 Bad agent\x1B[0m \x1B[2m(closes everything, no comments)\x1B[0m\n");
|
|
3384
|
+
const badReport = await runScenario({
|
|
3385
|
+
scenarioPath,
|
|
3386
|
+
agentConfig: { command: "node", args: [badAgentPath] },
|
|
3387
|
+
runs: 1,
|
|
3388
|
+
timeout: 60,
|
|
3389
|
+
output: "terminal"
|
|
3390
|
+
});
|
|
3391
|
+
process.stderr.write("\n\x1B[2m\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\x1B[0m\n");
|
|
3392
|
+
process.stderr.write("\n Same scenario. Same digital twin. Different scores.\n");
|
|
3393
|
+
process.stderr.write(` Good agent: \x1B[32m${goodReport.satisfactionScore.toFixed(1)}%\x1B[0m
|
|
3394
|
+
`);
|
|
3395
|
+
process.stderr.write(` Bad agent: \x1B[31m${badReport.satisfactionScore.toFixed(1)}%\x1B[0m
|
|
3396
|
+
|
|
3397
|
+
`);
|
|
3398
|
+
process.stderr.write(" \x1B[2mThis is what archal does \u2014 it measures agent behavior,\n");
|
|
3399
|
+
process.stderr.write(" not just whether it runs.\x1B[0m\n\n");
|
|
3400
|
+
});
|
|
3401
|
+
return cmd;
|
|
3402
|
+
}
|
|
3403
|
+
|
|
3404
|
+
// src/index.ts
|
|
3405
|
+
// CLI entry point: configure the root command, register every subcommand,
// then parse the process arguments.
var program = new Command8();

program.name("archal");
program.description("The QA layer for the software factory era \u2014 test AI agents against digital twins");
program.version("0.1.0");
program.option("-q, --quiet", "Suppress non-error output");
program.option("-v, --verbose", "Enable debug logging");

// Apply the global logging flags before any subcommand action runs.
program.hook("preAction", (_thisCommand) => {
  const { quiet, verbose } = program.opts();
  if (quiet) {
    configureLogger({ quiet: true });
  }
  if (verbose) {
    configureLogger({ verbose: true, level: "debug" });
  }
});

// Each factory is invoked and its command registered in this order.
const commandFactories = [
  createRunCommand,
  createInitCommand,
  createTwinCommand,
  createScenarioCommand,
  createTraceCommand,
  createConfigCommand,
  createDemoCommand
];
for (const buildCommand of commandFactories) {
  program.addCommand(buildCommand());
}

// Any rejection from parsing or from an action is reported on stderr and
// turned into a non-zero exit status.
program.parseAsync(process.argv).catch((err) => {
  const reason = err instanceof Error ? err.message : String(err);
  process.stderr.write("Error: " + reason + "\n");
  process.exit(1);
});
|