agent-regression-lab 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,155 @@
1
+ import OpenAI from "openai";
2
/**
 * One multi-turn conversation with the OpenAI Responses API.
 *
 * Every `next` call issues a single `responses.create` request, linked to the
 * prior turn through `previous_response_id`, and maps the reply to a
 * `tool_call`, `final`, or `error` result object. Tool names are translated
 * between internal names and provider-safe names in both directions.
 */
class OpenAIResponsesSession {
    client;
    model;
    input;
    previousResponseId;
    pendingToolCall;
    toolNameMap;
    providerTools;
    /**
     * @param client Object exposing `responses.create(body)`.
     * @param model Model identifier sent with every request.
     * @param input Run input; `instructions` and `availableTools` are read here,
     *   and the whole object is handed to the initial prompt builder.
     */
    constructor(client, model, input) {
        this.client = client;
        this.model = model;
        this.input = input;
        this.providerTools = input.availableTools.map((tool) => {
            const providerName = toProviderToolName(tool.name);
            return { internalName: tool.name, providerName, tool };
        });
        // Reverse lookup: provider-safe name -> internal name. When two tools
        // sanitize to the same provider name, the later one wins.
        this.toolNameMap = new Map();
        for (const { providerName, internalName } of this.providerTools) {
            this.toolNameMap.set(providerName, internalName);
        }
    }
    /**
     * Sends the next turn and interprets the model's reply.
     *
     * Never rejects: any failure (building the input, the API call, or reading
     * the response) is returned as `{ type: "error", message }`.
     *
     * @param event Runner event; see `buildInput` for the accepted types.
     * @returns A `tool_call`, `final`, or `error` result object.
     */
    async next(event) {
        try {
            const response = await this.client.responses.create({
                model: this.model,
                instructions: this.input.instructions,
                input: this.buildInput(event),
                tools: this.providerTools.map(({ providerName, tool }) => toOpenAITool(providerName, tool)),
                previous_response_id: this.previousResponseId,
                parallel_tool_calls: false,
            });
            this.previousResponseId = response.id;
            const functionCall = Array.isArray(response.output)
                ? response.output.find((item) => item?.type === "function_call")
                : undefined;
            if (!functionCall) {
                // No tool requested: the turn is either a final answer or unusable.
                const text = typeof response.output_text === "string" ? response.output_text : "";
                if (text.length === 0) {
                    return {
                        type: "error",
                        message: "OpenAI response did not include a function call or final output.",
                    };
                }
                return {
                    type: "final",
                    output: text,
                    metadata: {
                        responseId: response.id,
                        usage: response.usage ?? undefined,
                    },
                };
            }
            const providerToolName = String(functionCall.name);
            const internalToolName = this.toolNameMap.get(providerToolName);
            if (!internalToolName) {
                return {
                    type: "error",
                    message: `OpenAI requested unknown provider tool '${providerToolName}'.`,
                };
            }
            // Remember the call id so the following tool result can be tied to it.
            this.pendingToolCall = {
                callId: String(functionCall.call_id),
                providerToolName,
            };
            return {
                type: "tool_call",
                toolName: internalToolName,
                input: safeJsonParse(String(functionCall.arguments ?? "{}")),
                metadata: {
                    responseId: response.id,
                    providerToolName,
                    message: `OpenAI requested tool ${internalToolName}.`,
                },
            };
        }
        catch (error) {
            return {
                type: "error",
                message: error instanceof Error ? error.message : String(error),
            };
        }
    }
    /**
     * Builds the `input` array for the next request.
     *
     * - `run_started`: a single user message carrying the initial prompt.
     * - `tool_result`: a `function_call_output` item for the pending tool call,
     *   which is cleared afterwards.
     * - anything else: throws an Error built from `event.message`.
     *
     * @throws {Error} On a tool result with no pending call, or an unhandled event type.
     */
    buildInput(event) {
        switch (event.type) {
            case "run_started":
                return [
                    {
                        role: "user",
                        content: [{ type: "input_text", text: buildInitialPrompt(this.input) }],
                    },
                ];
            case "tool_result": {
                const pending = this.pendingToolCall;
                if (!pending) {
                    throw new Error("Received tool result without a pending provider tool call.");
                }
                // Strings pass through untouched; everything else is JSON-encoded.
                const output = typeof event.result === "string" ? event.result : JSON.stringify(event.result);
                this.pendingToolCall = undefined;
                return [{ type: "function_call_output", call_id: pending.callId, output }];
            }
            default:
                throw new Error(event.message);
        }
    }
}
112
/**
 * Agent adapter that opens OpenAI Responses sessions.
 *
 * `options` may carry an `apiKey`, a pre-built `client`, or both; a supplied
 * client takes precedence over constructing a new one from the key.
 */
export class OpenAIResponsesAgentAdapter {
    options;
    constructor(options) {
        this.options = options;
    }
    /**
     * Starts a new session for one run.
     *
     * The model comes from `input.metadata.model` when that is a non-empty
     * string; otherwise "gpt-4o-mini" is used.
     *
     * @throws {Error} When neither an API key nor a client was configured.
     */
    async startRun(input) {
        const { apiKey, client: injectedClient } = this.options;
        if (!apiKey && !injectedClient) {
            throw new Error("OPENAI_API_KEY is required for provider=openai.");
        }
        const requestedModel = input.metadata?.model;
        const hasRequestedModel = typeof requestedModel === "string" && requestedModel.length > 0;
        const model = hasRequestedModel ? requestedModel : "gpt-4o-mini";
        const client = injectedClient ?? new OpenAI({ apiKey });
        return new OpenAIResponsesSession(client, model, input);
    }
}
126
/**
 * Converts a tool spec into a function-tool definition for the request body.
 *
 * @param providerName Name the tool is exposed under to the provider.
 * @param tool Tool spec; `description` and `inputSchema` are optional.
 * @returns `{ type: "function", name, description, parameters }`. A missing
 *   description becomes "" and a missing schema becomes a fresh permissive
 *   object schema.
 */
function toOpenAITool(providerName, tool) {
    const { description, inputSchema } = tool;
    // Built per call so callers never share (and cannot mutate) a common default.
    const parameters = inputSchema ?? {
        type: "object",
        additionalProperties: true,
        properties: {},
    };
    return {
        type: "function",
        name: providerName,
        description: description ?? "",
        parameters,
    };
}
138
/**
 * Renders the first user message of a run: the task instructions, the
 * context as pretty-printed JSON, a bullet list of the available tools
 * (by provider-safe name), and a closing instruction.
 *
 * @param input Run input providing `instructions`, `context` and `availableTools`.
 * @returns The prompt text, with sections separated by blank lines.
 */
function buildInitialPrompt(input) {
    const contextJson = JSON.stringify(input.context, null, 2);
    const toolLines = input.availableTools.map((tool) => {
        const label = toProviderToolName(tool.name);
        const summary = tool.description ?? "No description";
        return `- ${label}: ${summary}`;
    });
    const sections = [
        `Task:\n${input.instructions}`,
        `Context:\n${contextJson}`,
        `Available tools:\n${toolLines.join("\n")}`,
        "Use tools when needed and provide a final answer when the task is complete.",
    ];
    return sections.join("\n\n");
}
145
/**
 * Produces a provider-safe tool name by replacing every character that is
 * not an ASCII letter, digit, underscore or hyphen with an underscore.
 *
 * @param internalName Tool name as used internally.
 * @returns The sanitized name (same length in UTF-16 units as the input).
 */
function toProviderToolName(internalName) {
    // Without the `u`/`i` flags, \w is exactly [A-Za-z0-9_].
    return internalName.replace(/[^\w-]/g, "_");
}
148
/**
 * Parses JSON text, falling back to an empty object when parsing fails.
 *
 * @param value JSON text to parse.
 * @returns The parsed value (any JSON type), or `{}` if `value` is not valid JSON.
 */
function safeJsonParse(value) {
    let parsed = {};
    try {
        parsed = JSON.parse(value);
    }
    catch {
        // Deliberate best-effort: malformed input keeps the empty-object default.
    }
    return parsed;
}
package/dist/config.js ADDED
@@ -0,0 +1,123 @@
1
+ import { statSync, readFileSync } from "node:fs";
2
+ import { resolve, relative, sep } from "node:path";
3
+ import { parse } from "yaml";
4
/** Absolute path of the config file, resolved once when this module loads. */
const CONFIG_PATH = resolve("agentlab.config.yaml");
/**
 * Reads and validates `agentlab.config.yaml`.
 *
 * @returns The parsed config object, or `{}` when the file does not exist.
 * @throws {Error} When the file exists but fails validation.
 */
export function loadAgentLabConfig() {
    if (exists(CONFIG_PATH)) {
        const config = parse(readFileSync(CONFIG_PATH, "utf8"));
        validateConfig(config);
        return config;
    }
    return {};
}
14
/**
 * Validates the top-level shape of the parsed config.
 *
 * The optional `tools` and `agents` fields must each be an array; every
 * entry is validated individually and entry names must be unique within
 * their own list.
 *
 * @throws {Error} On the first problem found (tools are checked before agents).
 */
function validateConfig(value) {
    if (!isObject(value)) {
        throw new Error("agentlab.config.yaml must contain a YAML object.");
    }
    const sections = [
        {
            field: "tools",
            validate: validateToolRegistration,
            notArrayMessage: "agentlab.config.yaml field 'tools' must be an array.",
            duplicateMessage: (name) => `agentlab.config.yaml defines duplicate tool '${name}'.`,
        },
        {
            field: "agents",
            validate: validateAgentRegistration,
            notArrayMessage: "agentlab.config.yaml field 'agents' must be an array.",
            duplicateMessage: (name) => `agentlab.config.yaml defines duplicate agent '${name}'.`,
        },
    ];
    for (const { field, validate, notArrayMessage, duplicateMessage } of sections) {
        const entries = value[field];
        if (entries === undefined) {
            continue;
        }
        if (!Array.isArray(entries)) {
            throw new Error(notArrayMessage);
        }
        const seenNames = new Set();
        for (const entry of entries) {
            // Validate first so `entry.name` is known to be usable below.
            validate(entry);
            if (seenNames.has(entry.name)) {
                throw new Error(duplicateMessage(entry.name));
            }
            seenNames.add(entry.name);
        }
    }
}
45
/**
 * Validates one entry of the config's `tools` array.
 *
 * Requires non-empty string `name`, `modulePath`, `exportName` and
 * `description`, an object `inputSchema`, and a `modulePath` that resolves to
 * an existing path inside the current working directory.
 *
 * @throws {Error} Describing the first failed requirement.
 */
function validateToolRegistration(value) {
    if (!isObject(value)) {
        throw new Error("Each tool registration in agentlab.config.yaml must be an object.");
    }
    const isNonEmptyString = (candidate) => typeof candidate === "string" && candidate.length > 0;
    if (!isNonEmptyString(value.name)) {
        throw new Error("Each tool registration must define a non-empty 'name'.");
    }
    for (const field of ["modulePath", "exportName", "description"]) {
        if (!isNonEmptyString(value[field])) {
            throw new Error(`Tool '${value.name}' must define a non-empty '${field}'.`);
        }
    }
    if (!isObject(value.inputSchema)) {
        throw new Error(`Tool '${value.name}' must define an object 'inputSchema'.`);
    }
    const cwd = process.cwd();
    const modulePath = resolve(value.modulePath);
    // The trailing separator keeps sibling directories that merely share a
    // name prefix with the working directory from passing the check.
    const staysInsideRepo = modulePath === cwd || modulePath.startsWith(`${cwd}${sep}`);
    if (!staysInsideRepo) {
        throw new Error(`Tool '${value.name}' modulePath must stay within the repo.`);
    }
    if (!exists(modulePath)) {
        throw new Error(`Tool '${value.name}' references missing module '${relative(cwd, modulePath)}'.`);
    }
}
73
/**
 * Validates one entry of the config's `agents` array.
 *
 * Every agent needs a non-empty `name` and a supported `provider`; `label`
 * (any provider) and `model` (openai only) must be non-empty strings when
 * present. `external_process` agents additionally need a non-empty `command`,
 * and their optional `args` / `envAllowlist` must be arrays of strings
 * (non-empty strings for `envAllowlist`).
 *
 * @throws {Error} Describing the first failed requirement.
 */
function validateAgentRegistration(value) {
    if (!isObject(value)) {
        throw new Error("Each agent registration in agentlab.config.yaml must be an object.");
    }
    const { name, provider, label, model, command, args, envAllowlist } = value;
    const isMissingText = (candidate) => typeof candidate !== "string" || candidate.length === 0;
    if (isMissingText(name)) {
        throw new Error("Each agent registration must define a non-empty 'name'.");
    }
    if (!["mock", "openai", "external_process"].includes(provider)) {
        throw new Error(`Agent '${name}' uses unsupported provider '${String(provider)}'.`);
    }
    if (label !== undefined && isMissingText(label)) {
        throw new Error(`Agent '${name}' must define a non-empty 'label' when provided.`);
    }
    if (provider === "openai" && model !== undefined && isMissingText(model)) {
        throw new Error(`Agent '${name}' must define a non-empty 'model' when provided.`);
    }
    if (provider !== "external_process") {
        return;
    }
    // Remaining checks apply to external-process agents only.
    if (isMissingText(command)) {
        throw new Error(`Agent '${name}' must define a non-empty 'command'.`);
    }
    if (args !== undefined) {
        const allStrings = Array.isArray(args) && args.every((arg) => typeof arg === "string");
        if (!allStrings) {
            throw new Error(`Agent '${name}' field 'args' must be an array of strings.`);
        }
    }
    if (envAllowlist !== undefined) {
        const allNonEmptyStrings = Array.isArray(envAllowlist)
            && envAllowlist.every((key) => typeof key === "string" && key.length > 0);
        if (!allNonEmptyStrings) {
            throw new Error(`Agent '${name}' field 'envAllowlist' must be an array of non-empty strings.`);
        }
    }
}
105
/**
 * Looks up an agent registration by name in the loaded config.
 *
 * The config is re-read on every call.
 *
 * @param name Agent name to find.
 * @returns The matching registration object.
 * @throws {Error} When no agent with that name is configured.
 */
export function getAgentRegistration(name) {
    const { agents } = loadAgentLabConfig();
    const registration = agents?.find((candidate) => candidate.name === name);
    if (!registration) {
        throw new Error(`agentlab.config.yaml does not define agent '${name}'.`);
    }
    return registration;
}
112
/**
 * Reports whether `path` can be stat-ed.
 *
 * @param path Filesystem path to probe.
 * @returns `true` when `statSync` succeeds, `false` when it throws for any reason.
 */
function exists(path) {
    let found = true;
    try {
        statSync(path);
    }
    catch {
        // Any stat failure (missing entry, permissions, ...) counts as "absent".
        found = false;
    }
    return found;
}
121
/**
 * Tells whether `value` is a non-null, non-array object.
 *
 * @returns `true` for any `typeof "object"` value except `null` and arrays.
 */
function isObject(value) {
    if (value === null || Array.isArray(value)) {
        return false;
    }
    return typeof value === "object";
}
@@ -0,0 +1,109 @@
1
/**
 * Runs every evaluator against a finished run.
 *
 * @param bundle Run bundle handed unchanged to each evaluator.
 * @param evaluators Evaluator definitions to apply.
 * @returns One result per evaluator, in the same order.
 */
export function evaluateScenario(bundle, evaluators) {
    return evaluators.map(function applyEvaluator(evaluator) {
        return evaluateOne(evaluator, bundle);
    });
}
4
/**
 * Dispatches one evaluator to its implementation based on `evaluator.type`.
 *
 * @param evaluator Evaluator definition (`id`, `type`, `mode`, `weight`, `config`).
 * @param bundle Run bundle; only the part the chosen evaluator needs is read.
 * @returns The evaluator's result, or a `fail` result (without `rawScore`)
 *   when the type is not supported.
 */
function evaluateOne(evaluator, bundle) {
    // Thunks keep the bundle reads lazy: only the selected evaluator touches it.
    const handlers = new Map([
        ["forbidden_tool", () => evaluateForbiddenTool(evaluator, bundle.toolCalls)],
        ["tool_call_assertion", () => evaluateToolCallAssertion(evaluator, bundle.toolCalls)],
        ["final_answer_contains", () => evaluateFinalAnswerContains(evaluator, bundle.run.finalOutput)],
        ["exact_final_answer", () => evaluateExactFinalAnswer(evaluator, bundle.run.finalOutput)],
        ["step_count_max", () => evaluateStepCountMax(evaluator, bundle.run.totalSteps)],
    ]);
    const handler = handlers.get(evaluator.type);
    if (handler) {
        return handler();
    }
    return {
        evaluatorId: evaluator.id,
        evaluatorType: evaluator.type,
        mode: evaluator.mode,
        status: "fail",
        weight: evaluator.weight,
        message: `Unsupported evaluator type '${evaluator.type}'.`,
    };
}
27
/**
 * Fails when any recorded tool call used a tool listed in `config.tools`.
 *
 * Configured entries are compared as strings; a missing or non-array
 * `config.tools` forbids nothing.
 *
 * @param evaluator Evaluator definition.
 * @param toolCalls Recorded tool calls, each with a `toolName`.
 * @returns Result with `rawScore` 0 (first offending tool named in the
 *   message) or 1 (no forbidden tool used).
 */
function evaluateForbiddenTool(evaluator, toolCalls) {
    const configured = evaluator.config.tools;
    const forbidden = new Set(Array.isArray(configured) ? configured.map(String) : []);
    const offender = toolCalls.find(({ toolName }) => forbidden.has(toolName));
    const header = {
        evaluatorId: evaluator.id,
        evaluatorType: evaluator.type,
        mode: evaluator.mode,
    };
    if (offender) {
        return {
            ...header,
            status: "fail",
            weight: evaluator.weight,
            rawScore: 0,
            message: `Forbidden tool '${offender.toolName}' was used.`,
        };
    }
    return {
        ...header,
        status: "pass",
        weight: evaluator.weight,
        rawScore: 1,
        message: "No forbidden tools were used.",
    };
}
40
/**
 * Passes when some recorded call used `config.tool` with an input whose
 * fields strictly equal every key/value in `config.match`.
 *
 * A missing or non-object `config.match` is treated as `{}`, which any
 * object input satisfies.
 *
 * @param evaluator Evaluator definition.
 * @param toolCalls Recorded tool calls (`toolName`, `input`).
 * @returns Result with `rawScore` 1 or 0 and the expected match in `details`.
 */
function evaluateToolCallAssertion(evaluator, toolCalls) {
    const { config } = evaluator;
    const tool = String(config.tool ?? "");
    const match = isObject(config.match) ? config.match : {};
    const observed = toolCalls.some((candidate) => candidate.toolName === tool && matches(candidate.input, match));
    const message = observed
        ? `Observed expected tool call for '${tool}'.`
        : `Expected tool call for '${tool}' was not found.`;
    return {
        evaluatorId: evaluator.id,
        evaluatorType: evaluator.type,
        mode: evaluator.mode,
        status: observed ? "pass" : "fail",
        weight: evaluator.weight,
        rawScore: observed ? 1 : 0,
        message,
        details: { expected: match },
    };
}
55
/**
 * Checks that the final answer contains every string listed in
 * `config.required_substrings`, comparing normalized text on both sides.
 *
 * @param evaluator Evaluator definition.
 * @param finalOutput Final answer text of the run.
 * @returns Result whose `rawScore` is the NUMBER of required substrings
 *   found (not a 0..1 value); the message lists any that are missing.
 */
function evaluateFinalAnswerContains(evaluator, finalOutput) {
    const configured = evaluator.config.required_substrings;
    const required = Array.isArray(configured) ? configured.map(String) : [];
    const haystack = normalizeText(finalOutput);
    const missing = [];
    required.forEach((needle) => {
        if (!haystack.includes(normalizeText(needle))) {
            missing.push(needle);
        }
    });
    const passed = missing.length === 0;
    const message = passed
        ? "Final answer contains all required substrings."
        : `Missing required substrings: ${missing.join(", ")}.`;
    return {
        evaluatorId: evaluator.id,
        evaluatorType: evaluator.type,
        mode: evaluator.mode,
        status: passed ? "pass" : "fail",
        weight: evaluator.weight,
        // Count of substrings found; equals required.length when nothing is missing.
        rawScore: required.length - missing.length,
        message,
    };
}
72
/**
 * Normalizes text for case- and whitespace-insensitive comparison:
 * lower-cases it, collapses each whitespace run to one space, and trims.
 *
 * Non-string input is coerced instead of throwing: `null`/`undefined`
 * (for example a run that produced no final output) become "", and any
 * other value goes through `String(...)`.
 *
 * @param value Text to normalize.
 * @returns The normalized string.
 */
function normalizeText(value) {
    const text = typeof value === "string" ? value : String(value ?? "");
    return text.toLowerCase().replace(/\s+/g, " ").trim();
}
75
/**
 * Passes when the final answer equals `config.expected`, ignoring leading
 * and trailing whitespace on both sides (comparison is case-sensitive).
 *
 * A missing `config.expected` is treated as "". A `null`/`undefined`
 * `finalOutput` (a run with no final answer) is treated as "" so the
 * evaluator reports a normal result instead of throwing.
 *
 * @param evaluator Evaluator definition.
 * @param finalOutput Final answer text of the run.
 * @returns Result with `rawScore` 1 on a match, 0 otherwise.
 */
function evaluateExactFinalAnswer(evaluator, finalOutput) {
    const expected = String(evaluator.config.expected ?? "");
    const actual = typeof finalOutput === "string" ? finalOutput : String(finalOutput ?? "");
    const passed = actual.trim() === expected.trim();
    return {
        evaluatorId: evaluator.id,
        evaluatorType: evaluator.type,
        mode: evaluator.mode,
        status: passed ? "pass" : "fail",
        weight: evaluator.weight,
        rawScore: passed ? 1 : 0,
        message: passed ? "Final answer matched exactly." : "Final answer did not match expected output.",
    };
}
88
/**
 * Passes when the run used at most `config.max_steps` steps.
 *
 * A missing `max_steps` counts as 0; the configured value is converted with
 * `Number(...)`.
 *
 * @param evaluator Evaluator definition.
 * @param stepCount Number of steps the run took.
 * @returns Result with `rawScore` 1 when within the limit, 0 otherwise.
 */
function evaluateStepCountMax(evaluator, stepCount) {
    const limit = Number(evaluator.config.max_steps ?? 0);
    const withinLimit = stepCount <= limit;
    let status = "fail";
    let rawScore = 0;
    let message = `Step count ${stepCount} exceeds max ${limit}.`;
    if (withinLimit) {
        status = "pass";
        rawScore = 1;
        message = `Step count ${stepCount} is within max ${limit}.`;
    }
    return {
        evaluatorId: evaluator.id,
        evaluatorType: evaluator.type,
        mode: evaluator.mode,
        status,
        weight: evaluator.weight,
        rawScore,
        message,
    };
}
101
/**
 * Shallow matcher: `input` must be a plain (non-array) object whose
 * properties strictly equal every key/value pair in `match`.
 *
 * @param input Tool call input to inspect.
 * @param match Expected key/value pairs; an empty object matches any object input.
 * @returns `true` when every expected pair is present with an identical value.
 */
function matches(input, match) {
    if (!isObject(input)) {
        return false;
    }
    for (const [key, expected] of Object.entries(match)) {
        if (input[key] !== expected) {
            return false;
        }
    }
    return true;
}
107
/**
 * Tells whether `value` is an object that is neither `null` nor an array.
 */
function isObject(value) {
    const isNonNullObject = value !== null && typeof value === "object";
    return isNonNullObject && !Array.isArray(value);
}
package/dist/index.js ADDED
@@ -0,0 +1,296 @@
1
+ #!/usr/bin/env node
2
+ import { createAgentFactory } from "./agent/factory.js";
3
+ import { getAgentRegistration } from "./config.js";
4
+ import { getRunErrorDetail } from "./runOutput.js";
5
/**
 * CLI entry point: reads the command from `process.argv` and dispatches it.
 *
 * Help and version aliases print and return; `list`, `run`, `show`,
 * `compare` and `ui` are awaited. A missing or unknown command prints the
 * usage text.
 */
async function main() {
    const command = process.argv[2];
    const args = process.argv.slice(3);
    if (["help", "--help", "-h"].includes(command)) {
        printUsage();
        return;
    }
    if (["version", "--version", "-v"].includes(command)) {
        printVersion();
        return;
    }
    const handlers = new Map([
        ["list", () => handleList(args)],
        ["run", () => handleRun(args)],
        ["show", () => handleShow(args)],
        ["compare", () => handleCompare(args)],
        ["ui", () => handleUi()],
    ]);
    const handler = handlers.get(command);
    if (!handler) {
        printUsage();
        return;
    }
    await handler();
}
37
/**
 * Prints the command-line usage summary with a single `console.log` call.
 */
function printUsage() {
    const runOptions = "[--agent <name>] [--provider mock|openai|external_process] [--model <model>] [--agent-label <label>]";
    const lines = [
        "Usage:",
        "agentlab list scenarios",
        `agentlab run <scenario-id> ${runOptions}`,
        `agentlab run --suite <suite-id> ${runOptions}`,
        "agentlab show <run-id>",
        "agentlab compare <baseline-run-id> <candidate-run-id>",
        "agentlab ui",
        "agentlab help",
        "agentlab version",
    ];
    console.log(lines.join("\n"));
}
48
/**
 * Prints the CLI version string.
 */
function printVersion() {
    const version = "0.1.0";
    console.log(version);
}
51
/**
 * Handles `agentlab list scenarios`: prints one tab-separated line per
 * scenario (id, suite, difficulty or "-", description or "").
 *
 * Any other first argument prints the usage text instead.
 */
async function handleList(args) {
    const [target] = args;
    if (target !== "scenarios") {
        printUsage();
        return;
    }
    // Loaded on demand so other commands do not pay for the scenarios module.
    const { listScenarios } = await import("./scenarios.js");
    for (const { id, suite, difficulty, description } of listScenarios()) {
        console.log(`${id}\t${suite}\t${difficulty ?? "-"}\t${description ?? ""}`);
    }
}
61
/**
 * Handles `agentlab run`: executes a single scenario, or every scenario of
 * a suite followed by an aggregate summary.
 *
 * The runtime config is validated before anything runs. When a suite is
 * given, its scenarios run sequentially and the pass/fail/error counts and
 * the rounded average score are printed; a positional scenario id is ignored
 * in that case. Without a suite, the positional scenario id is required.
 *
 * @param args Arguments following the `run` command.
 * @throws {Error} When no scenario id is given for a single run, or the
 *   suite has no scenarios.
 */
async function handleRun(args) {
    const { scenarioId, suite, runtimeConfig: requestedConfig } = parseRunArgs(args);
    const runtimeConfig = validateRuntimeConfig(requestedConfig);
    const { loadScenariosBySuite } = await import("./scenarios.js");
    if (!suite) {
        if (!scenarioId) {
            throw new Error("Missing scenario id.");
        }
        await executeOne(scenarioId, runtimeConfig);
        return;
    }
    const scenarios = loadScenariosBySuite(suite);
    if (scenarios.length === 0) {
        throw new Error(`No scenarios found for suite '${suite}'.`);
    }
    const runs = [];
    for (const scenario of scenarios) {
        // Sequential on purpose: each run prints its own summary as it finishes.
        runs.push(await executeOne(scenario.definition.id, runtimeConfig));
    }
    const countWithStatus = (status) => runs.filter((bundle) => bundle.run.status === status).length;
    const totalScore = runs.reduce((sum, bundle) => sum + bundle.run.score, 0);
    // runs.length > 0 here because an empty suite was rejected above.
    const avgScore = Math.round(totalScore / runs.length);
    console.log(`Suite: ${suite}`);
    console.log(`Passed: ${countWithStatus("pass")}/${runs.length}`);
    console.log(`Failed: ${countWithStatus("fail")}/${runs.length}`);
    console.log(`Errored: ${countWithStatus("error")}/${runs.length}`);
    console.log(`Average score: ${avgScore}`);
}
95
/**
 * Runs one scenario end to end: records the scenario and agent version in
 * storage, executes the run, saves the resulting bundle and prints a summary.
 *
 * @param scenarioId Id of the scenario to load and run.
 * @param runtimeConfig Validated runtime configuration for the agent.
 * @returns The run bundle, with `agentVersion` attached.
 */
async function executeOne(scenarioId, runtimeConfig) {
    // Heavy modules are loaded on demand, in parallel.
    const [storageModule, toolsModule, scenariosModule, runnerModule] = await Promise.all([
        import("./storage.js"),
        import("./tools.js"),
        import("./scenarios.js"),
        import("./runner.js"),
    ]);
    const { Storage } = storageModule;
    const { loadToolRegistry, loadToolSpecs } = toolsModule;
    const { loadScenarioById } = scenariosModule;
    const { runScenario } = runnerModule;
    const storage = new Storage();
    const toolSpecs = await loadToolSpecs();
    const tools = await loadToolRegistry();
    const { definition, filePath, fileHash } = loadScenarioById(scenarioId);
    const scenarioSummary = {
        id: definition.id,
        name: definition.name,
        suite: definition.suite,
        difficulty: definition.difficulty,
        description: definition.description,
    };
    storage.upsertScenario(scenarioSummary, definition, filePath, fileHash);
    const factory = createAgentFactory(runtimeConfig);
    const agentVersion = factory.createVersion(runtimeConfig);
    storage.upsertAgentVersion(agentVersion);
    const agentAdapter = factory.createAdapter();
    const bundle = await runScenario({
        agentAdapter,
        agentVersion,
        scenario: definition,
        scenarioFileHash: fileHash,
        toolSpecs,
        tools,
    });
    // Attached before saving/printing so both see the agent details.
    bundle.agentVersion = agentVersion;
    storage.saveRun(bundle);
    printRunSummary(bundle);
    return bundle;
}
129
/**
 * Handles `agentlab ui`: loads the UI server module on demand and awaits
 * its `startUiServer()`.
 */
async function handleUi() {
    const uiServerModule = await import("./ui/server.js");
    const { startUiServer } = uiServerModule;
    await startUiServer();
}
133
/**
 * Prints a short summary of a finished run.
 *
 * Always prints run id, scenario, status, score, agent and runtime; prints
 * provider, model and command only when the agent version has them. For any
 * status other than "pass" it also prints the termination reason and, when
 * available, the error detail.
 *
 * @param bundle Run bundle (`run`, optional `agentVersion`).
 */
function printRunSummary(bundle) {
    const { run, agentVersion } = bundle;
    console.log(`Run: ${run.id}`);
    console.log(`Scenario: ${run.scenarioId}`);
    console.log(`Status: ${run.status.toUpperCase()}`);
    console.log(`Score: ${run.score}/100`);
    console.log(`Agent: ${agentVersion?.label ?? run.agentVersionId}`);
    if (agentVersion?.provider) {
        console.log(`Provider: ${agentVersion.provider}`);
    }
    if (agentVersion?.modelId) {
        console.log(`Model: ${agentVersion.modelId}`);
    }
    if (agentVersion?.command) {
        const argText = (agentVersion.args ?? []).join(" ");
        // trim() drops the trailing space left when there are no args.
        console.log(`Command: ${agentVersion.command} ${argText}`.trim());
    }
    console.log(`Runtime: ${run.durationMs}ms`);
    if (run.status === "pass") {
        return;
    }
    console.log(`Reason: ${run.terminationReason}`);
    const errorDetail = getRunErrorDetail(bundle);
    if (errorDetail) {
        console.log(`Error: ${errorDetail}`);
    }
}
157
/**
 * Handles `agentlab show <run-id>`: prints the stored details of one run,
 * including agent information (when recorded), the termination reason, any
 * error detail, the final output and every evaluator result.
 *
 * @param args Arguments following the `show` command; the first is the run id.
 * @throws {Error} When the run id is missing or no such run is stored.
 */
async function handleShow(args) {
    const [runId] = args;
    if (!runId) {
        throw new Error("Missing run id.");
    }
    const { Storage } = await import("./storage.js");
    const bundle = new Storage().getRun(runId);
    if (!bundle) {
        throw new Error(`Run '${runId}' not found.`);
    }
    const { run, agentVersion } = bundle;
    console.log(`Run: ${run.id}`);
    console.log(`Scenario: ${run.scenarioId}`);
    console.log(`Status: ${run.status.toUpperCase()}`);
    console.log(`Score: ${run.score}/100`);
    if (agentVersion) {
        console.log(`Provider: ${agentVersion.provider ?? "unknown"}`);
        console.log(`Model: ${agentVersion.modelId ?? "unknown"}`);
        if (agentVersion.command) {
            const argText = (agentVersion.args ?? []).join(" ");
            // trim() drops the trailing space left when there are no args.
            console.log(`Command: ${agentVersion.command} ${argText}`.trim());
        }
    }
    console.log(`Termination: ${run.terminationReason}`);
    const errorDetail = getRunErrorDetail(bundle);
    if (errorDetail) {
        console.log(`Error: ${errorDetail}`);
    }
    console.log(`Final output: ${run.finalOutput}`);
    console.log("Evaluators:");
    for (const { evaluatorId, status, message } of bundle.evaluatorResults) {
        console.log(`- ${evaluatorId}: ${status.toUpperCase()} - ${message}`);
    }
}
190
/**
 * Handles `agentlab compare <baseline> <candidate>`: prints both runs'
 * status and score, the comparison notes, and any evaluator or tool diffs.
 *
 * @param args Arguments following the `compare` command: baseline run id,
 *   then candidate run id.
 * @throws {Error} When either run id is missing.
 */
async function handleCompare(args) {
    const [baselineRunId, candidateRunId] = args;
    if (!(baselineRunId && candidateRunId)) {
        throw new Error("Missing baseline or candidate run id.");
    }
    const { Storage } = await import("./storage.js");
    const comparison = new Storage().compareRuns(baselineRunId, candidateRunId);
    const { baseline, candidate, notes, evaluatorDiffs, toolDiffs } = comparison;
    const describeRun = ({ run }) => `${run.id} (${run.status.toUpperCase()} ${run.score}/100)`;
    console.log(`Scenario: ${baseline.run.scenarioId}`);
    console.log(`Baseline: ${describeRun(baseline)}`);
    console.log(`Candidate: ${describeRun(candidate)}`);
    console.log("Changes:");
    if (notes.length === 0) {
        console.log("- No material changes.");
    }
    else {
        notes.forEach((note) => {
            console.log(`- ${note}`);
        });
    }
    // Diff sections are printed only when they have entries.
    const diffSections = [
        ["Evaluator diffs:", evaluatorDiffs],
        ["Tool diffs:", toolDiffs],
    ];
    for (const [heading, diffs] of diffSections) {
        if (diffs.length > 0) {
            console.log(heading);
            for (const diff of diffs) {
                console.log(`- ${diff.note}`);
            }
        }
    }
}
223
/**
 * Parses the arguments of `agentlab run`.
 *
 * Recognized flags (each takes the following argument as its value):
 * `--suite`, `--provider` (mock | openai | external_process), `--agent`,
 * `--model`, `--agent-label`. The first remaining argument is the scenario
 * id; a second one is rejected. The provider defaults to "mock".
 *
 * A flag given as the last argument, with no value after it, is rejected
 * rather than silently stored as `undefined`.
 *
 * @param args Arguments following the `run` command.
 * @returns `{ scenarioId, suite, runtimeConfig }`; `scenarioId` and `suite`
 *   are `undefined` when not supplied.
 * @throws {Error} On a flag without a value, an unsupported provider, or an
 *   extra positional argument.
 */
function parseRunArgs(args) {
    const runtimeConfig = { provider: "mock" };
    let scenarioId;
    let suite;
    // Reads the value following the flag at `flagIndex`, failing if absent.
    const valueAfter = (flagIndex, missingMessage) => {
        const value = args[flagIndex + 1];
        if (value === undefined) {
            throw new Error(missingMessage);
        }
        return value;
    };
    for (let index = 0; index < args.length; index += 1) {
        const arg = args[index];
        switch (arg) {
            case "--suite":
                suite = valueAfter(index, "Missing suite id.");
                index += 1;
                break;
            case "--provider": {
                const provider = args[index + 1];
                if (provider !== "mock" && provider !== "openai" && provider !== "external_process") {
                    throw new Error(`Unsupported provider '${String(provider)}'.`);
                }
                runtimeConfig.provider = provider;
                index += 1;
                break;
            }
            case "--agent":
                runtimeConfig.agentName = valueAfter(index, "Missing value for '--agent'.");
                index += 1;
                break;
            case "--model":
                runtimeConfig.model = valueAfter(index, "Missing value for '--model'.");
                index += 1;
                break;
            case "--agent-label":
                runtimeConfig.label = valueAfter(index, "Missing value for '--agent-label'.");
                index += 1;
                break;
            default:
                // Positional argument: only one scenario id is accepted.
                if (scenarioId) {
                    throw new Error(`Unexpected argument '${arg}'.`);
                }
                scenarioId = arg;
        }
    }
    return { scenarioId, suite, runtimeConfig };
}
266
/**
 * Completes and validates the runtime config IN PLACE and returns it.
 *
 * When `agentName` is set, the named registration supplies the provider,
 * command, args and env allowlist, and fills in model and label unless they
 * were already given. Provider-specific rules then apply:
 * - openai: requires `OPENAI_API_KEY` in the environment; model defaults to "gpt-4o-mini".
 * - mock: label defaults to the agent name, then "mock-support-agent-v1".
 * - external_process: requires a command; label defaults to the agent name,
 *   then "external-process-agent".
 *
 * @param config Runtime config produced by argument parsing (mutated).
 * @returns The same `config` object.
 * @throws {Error} When a provider requirement is not met.
 */
function validateRuntimeConfig(config) {
    if (config.agentName) {
        const registration = getAgentRegistration(config.agentName);
        // The registration's provider overrides any provider chosen earlier;
        // explicit model/label values take precedence over the registration's.
        Object.assign(config, {
            provider: registration.provider,
            model: config.model ?? registration.model,
            label: config.label ?? registration.label ?? registration.name,
            command: registration.command,
            args: registration.args,
            envAllowlist: registration.envAllowlist,
        });
    }
    switch (config.provider) {
        case "openai":
            if (!process.env.OPENAI_API_KEY) {
                throw new Error("OPENAI_API_KEY is required when --provider openai is used.");
            }
            config.model = config.model ?? "gpt-4o-mini";
            break;
        case "mock":
            config.label = config.label ?? config.agentName ?? "mock-support-agent-v1";
            break;
        case "external_process":
            if (!config.command) {
                throw new Error("External process agents require a configured command.");
            }
            config.label = config.label ?? config.agentName ?? "external-process-agent";
            break;
        default:
            break;
    }
    return config;
}
293
// Run the CLI. Any failure is reported on stderr as a plain message and
// turned into a non-zero exit code without forcing an immediate exit.
main().catch((failure) => {
    const text = failure instanceof Error ? failure.message : String(failure);
    console.error(text);
    process.exitCode = 1;
});
package/dist/lib/fs.js ADDED
@@ -0,0 +1,8 @@
1
+ import { mkdirSync } from "node:fs";
2
+ import { dirname } from "node:path";
3
/**
 * Creates the directory at `path`, including any missing parents.
 * Does nothing when the directory already exists.
 *
 * @param path Directory to create.
 */
export function ensureDir(path) {
    const options = { recursive: true };
    mkdirSync(path, options);
}
6
/**
 * Ensures the directory that would contain `path` exists.
 *
 * @param path File (or directory) path whose parent directory is created.
 */
export function ensureParentDir(path) {
    const parent = dirname(path);
    ensureDir(parent);
}