agent-regression-lab 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +205 -0
- package/dist/agent/externalProcessAdapter.js +173 -0
- package/dist/agent/factory.js +84 -0
- package/dist/agent/mockAdapter.js +96 -0
- package/dist/agent/openaiResponsesAdapter.js +155 -0
- package/dist/config.js +123 -0
- package/dist/evaluators.js +109 -0
- package/dist/index.js +296 -0
- package/dist/lib/fs.js +8 -0
- package/dist/lib/id.js +16 -0
- package/dist/runOutput.js +13 -0
- package/dist/runner.js +199 -0
- package/dist/scenarios.js +155 -0
- package/dist/scoring.js +18 -0
- package/dist/storage.js +394 -0
- package/dist/tools.js +128 -0
- package/dist/trace.js +30 -0
- package/dist/types.js +1 -0
- package/dist/ui/App.js +85 -0
- package/dist/ui/client.js +10 -0
- package/dist/ui/server.js +147 -0
- package/package.json +53 -0
package/dist/lib/id.js
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
import { createHash, randomUUID } from "node:crypto";
|
|
2
|
+
/**
 * Computes the SHA-256 digest of the given text.
 *
 * @param {string} text - Content to hash.
 * @returns {string} Lowercase hexadecimal digest.
 */
export function hashText(text) {
    const hasher = createHash("sha256");
    hasher.update(text);
    return hasher.digest("hex");
}
|
|
5
|
+
/**
 * Creates an identifier for a new run.
 *
 * The id keeps the `run_<epoch-millis>` prefix so ids still sort by creation
 * time, and appends a short random suffix so that two runs created within the
 * same millisecond (or by parallel processes) do not receive the same id.
 *
 * @returns {string} Id of the form `run_<epoch-millis>_<8 hex chars>`.
 */
export function createRunId() {
    const uniqueSuffix = randomUUID().slice(0, 8);
    return `run_${Date.now()}_${uniqueSuffix}`;
}
|
|
8
|
+
/**
 * Creates a unique identifier for a trace event.
 *
 * @returns {string} `evt_` followed by a random UUID.
 */
export function createEventId() {
    const uuid = randomUUID();
    return `evt_${uuid}`;
}
|
|
11
|
+
/**
 * Creates a unique identifier for a tool call.
 *
 * @returns {string} `tool_` followed by a random UUID.
 */
export function createToolCallId() {
    const uuid = randomUUID();
    return `tool_${uuid}`;
}
|
|
14
|
+
/**
 * Derives a deterministic agent-version id from a label and its config.
 *
 * The label and the JSON-serialized config are joined with ":" and hashed via
 * `hashText`; the first 12 characters of that hash become the id suffix.
 *
 * @param {string} label - Label of the agent version.
 * @param {*} config - Configuration value; serialized with `JSON.stringify`.
 * @returns {string} `agent_` followed by 12 hash characters.
 */
export function createAgentVersionId(label, config) {
    const fingerprintSource = `${label}:${JSON.stringify(config)}`;
    const digest = hashText(fingerprintSource);
    return `agent_${digest.slice(0, 12)}`;
}
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
/**
 * Extracts the error detail of the most recent failure event in a run bundle.
 *
 * Scans `bundle.traceEvents` from newest to oldest and stops at the first
 * `agent_error` or `tool_call_failed` event it meets.
 *
 * @param {{ traceEvents: Iterable<{ type: string, payload: object }> }} bundle
 * @returns {string | undefined} `payload.message` (agent errors) or
 *   `payload.error` (tool failures) when it is a string; otherwise undefined.
 */
export function getRunErrorDetail(bundle) {
    const events = Array.from(bundle.traceEvents);
    for (let index = events.length - 1; index >= 0; index -= 1) {
        const entry = events[index];
        let detail;
        switch (entry.type) {
            case "agent_error":
                detail = entry.payload.message;
                break;
            case "tool_call_failed":
                detail = entry.payload.error;
                break;
            default:
                // Not a failure event; keep looking further back.
                continue;
        }
        // The newest failure event decides the result, even if its detail is not a string.
        if (typeof detail === "string") {
            return detail;
        }
        return undefined;
    }
    return undefined;
}
|
package/dist/runner.js
ADDED
|
@@ -0,0 +1,199 @@
|
|
|
1
|
+
import { performance } from "node:perf_hooks";
|
|
2
|
+
import { createToolCallId, createRunId } from "./lib/id.js";
|
|
3
|
+
import { evaluateScenario } from "./evaluators.js";
|
|
4
|
+
import { computeScore } from "./scoring.js";
|
|
5
|
+
import { TraceRecorder } from "./trace.js";
|
|
6
|
+
/**
 * Runs one scenario against an agent and returns the resulting run bundle.
 *
 * The runner drives the agent session turn by turn. Each turn either ends the
 * run (final output or agent error) or requests a tool, which is executed and
 * fed back to the agent as the next event. The loop stops on final output,
 * error, timeout, forbidden tool use, or when `max_steps` turns were used.
 * Afterwards the evaluators are run and the score is computed.
 *
 * Outcome rules:
 * - An agent turn that throws or times out is recorded in the trace and ends
 *   the run with status "error" instead of rejecting the returned promise.
 * - "step_limit_exceeded" is only reported when the step budget really ran
 *   out; it never overwrites another termination reason.
 * - The evaluator score decides pass/fail only for runs the runner itself did
 *   not already fail; a runner-level "fail" or "error" is never upgraded.
 *
 * @param {object} deps
 * @param {object} deps.scenario - Scenario definition (id, task, tools, runtime, context, evaluators).
 * @param {string} deps.scenarioFileHash - Hash of the scenario file, stored on the run.
 * @param {object} deps.agentVersion - Agent version (id, provider, modelId, command, args).
 * @param {Array<{ name: string }>} deps.toolSpecs - Specs of all known tools.
 * @param {Object<string, Function>} deps.tools - Tool handlers keyed by tool name.
 * @param {object} deps.agentAdapter - Adapter exposing `startRun(...)`, which yields a session with `next(event)`.
 * @returns {Promise<object>} Bundle `{ run, traceEvents, toolCalls, evaluatorResults }`.
 */
export async function runScenario(deps) {
    const runId = createRunId();
    const startedAt = new Date().toISOString();
    const runStart = performance.now();
    const trace = new TraceRecorder(runId, deps.scenario.id);
    const toolCalls = [];
    const maxSteps = deps.scenario.runtime?.max_steps ?? 8;
    const timeoutSeconds = deps.scenario.runtime?.timeout_seconds;
    const deadline = timeoutSeconds ? Date.now() + timeoutSeconds * 1000 : undefined;
    trace.record("runner", "run_started", {
        agentVersionId: deps.agentVersion.id,
        provider: deps.agentVersion.provider ?? "unknown",
        modelId: deps.agentVersion.modelId ?? "unknown",
        command: deps.agentVersion.command,
        args: deps.agentVersion.args,
        scenarioVersionHash: deps.scenarioFileHash,
        maxSteps,
        timeoutSeconds,
    });
    const availableTools = deps.toolSpecs.filter((tool) => deps.scenario.tools.allowed.includes(tool.name));
    const session = await deps.agentAdapter.startRun({
        instructions: deps.scenario.task.instructions,
        availableTools,
        context: deps.scenario.context?.variables ?? {},
        maxSteps,
        metadata: {
            scenarioId: deps.scenario.id,
            provider: deps.agentVersion.provider,
            model: deps.agentVersion.modelId,
        },
    });
    let finalOutput = "";
    let terminationReason = "completed";
    let status = "pass";
    let loopCount = 0;
    let event = {
        type: "run_started",
    };
    for (;;) {
        // Reaching this check with the budget used up means every previous
        // turn was a successful tool call and the agent never finished.
        if (loopCount >= maxSteps) {
            status = "fail";
            terminationReason = "step_limit_exceeded";
            trace.record("runner", "step_budget_exceeded", { maxSteps });
            break;
        }
        if (hasTimedOut(deadline)) {
            status = "error";
            terminationReason = "timeout_exceeded";
            trace.record("runner", "timeout_exceeded", { timeoutSeconds });
            break;
        }
        loopCount += 1;
        trace.record("agent", "agent_turn_started", { loopCount });
        let turn;
        try {
            turn = await raceWithTimeout(session.next(event), deadline, "Agent turn timed out.");
        }
        catch (error) {
            // Mirror the tool-failure path: record the failure in the trace
            // and finish the run with an "error" status instead of rejecting.
            const message = error instanceof Error ? error.message : String(error);
            status = "error";
            if (hasTimedOut(deadline)) {
                terminationReason = "timeout_exceeded";
                trace.record("runner", "timeout_exceeded", { timeoutSeconds, message });
            }
            else {
                terminationReason = "agent_error";
            }
            trace.record("agent", "agent_error", { message });
            break;
        }
        if (turn.type === "error") {
            status = "error";
            terminationReason = "agent_error";
            trace.record("agent", "agent_error", { message: turn.message });
            break;
        }
        if (turn.type === "final") {
            finalOutput = turn.output;
            trace.record("agent", "agent_final_output", { output: turn.output, metadata: turn.metadata ?? {} });
            break;
        }
        // Any other turn type is treated as a tool call request.
        const toolName = turn.toolName;
        const toolCallId = createToolCallId();
        trace.record("agent", "agent_message", { content: String(turn.metadata?.message ?? `Requesting ${toolName}`) });
        trace.record("agent", "tool_call_requested", { toolCallId, toolName, input: turn.input });
        if (!deps.scenario.tools.allowed.includes(toolName) || deps.scenario.tools.forbidden?.includes(toolName)) {
            status = "fail";
            terminationReason = "forbidden_tool_used";
            trace.record("runner", "forbidden_tool_attempted", { toolName });
            break;
        }
        const handler = deps.tools[toolName];
        if (!handler) {
            status = "error";
            terminationReason = "tool_error";
            trace.record("tool", "tool_call_failed", { toolCallId, toolName, error: "Tool handler missing" });
            break;
        }
        const started = performance.now();
        trace.record("tool", "tool_call_started", { toolCallId, toolName, input: turn.input });
        try {
            const result = await raceWithTimeout(handler(turn.input, { scenarioId: deps.scenario.id }), deadline, `Tool '${toolName}' timed out.`);
            const durationMs = Math.round(performance.now() - started);
            toolCalls.push({
                id: toolCallId,
                stepIndex: trace.getStepCount() + 1,
                toolName,
                input: turn.input,
                output: result,
                status: "pass",
                durationMs,
            });
            trace.record("tool", "tool_call_completed", { toolCallId, toolName, input: turn.input, output: result, durationMs });
            event = { type: "tool_result", toolName, result };
        }
        catch (error) {
            const message = error instanceof Error ? error.message : String(error);
            status = "error";
            if (hasTimedOut(deadline)) {
                terminationReason = "timeout_exceeded";
                trace.record("runner", "timeout_exceeded", { timeoutSeconds, message });
            }
            else {
                terminationReason = "tool_error";
            }
            toolCalls.push({
                id: toolCallId,
                stepIndex: trace.getStepCount() + 1,
                toolName,
                input: turn.input,
                status: "fail",
                errorMessage: message,
            });
            trace.record("tool", "tool_call_failed", { toolCallId, toolName, error: message });
            break;
        }
    }
    const finishedAt = new Date().toISOString();
    const durationMs = Math.round(performance.now() - runStart);
    const run = {
        id: runId,
        scenarioId: deps.scenario.id,
        scenarioFileHash: deps.scenarioFileHash,
        agentVersionId: deps.agentVersion.id,
        status,
        terminationReason,
        finalOutput,
        totalSteps: trace.getStepCount(),
        totalToolCalls: toolCalls.length,
        durationMs,
        score: 0,
        startedAt,
        finishedAt,
    };
    trace.record("evaluator", "evaluation_started", {});
    const evaluatorResults = evaluateScenario({
        run,
        traceEvents: trace.getEvents(),
        toolCalls,
        evaluatorResults: [],
    }, deps.scenario.evaluators);
    for (const result of evaluatorResults) {
        trace.record("evaluator", "evaluation_result", {
            evaluatorId: result.evaluatorId,
            status: result.status,
            message: result.message,
        });
    }
    trace.record("evaluator", "evaluation_finished", {});
    const finalScoring = computeScore(evaluatorResults);
    run.score = finalScoring.score;
    // Only a run the runner considers clean can be decided by the evaluators;
    // a forbidden-tool or step-limit failure must not be turned into a pass.
    if (run.status === "pass") {
        run.status = finalScoring.status;
        if (run.status === "fail") {
            run.terminationReason = "evaluator_failed";
        }
    }
    trace.record("runner", "run_finished", {
        status: run.status,
        terminationReason: run.terminationReason,
        totalSteps: run.totalSteps,
        durationMs: run.durationMs,
    });
    return {
        run,
        traceEvents: trace.getEvents(),
        toolCalls,
        evaluatorResults,
    };
}
|
|
182
|
+
/**
 * Tells whether the run deadline has passed.
 *
 * @param {number | undefined} deadline - Epoch-millisecond deadline; undefined means no deadline.
 * @returns {boolean} True when a deadline exists and the current time has reached it.
 */
function hasTimedOut(deadline) {
    if (deadline === undefined) {
        return false;
    }
    return Date.now() >= deadline;
}
|
|
185
|
+
/**
 * Awaits `promise`, rejecting if the run deadline passes first.
 *
 * @param {Promise<*>} promise - Work to wait for.
 * @param {number | undefined} deadline - Epoch-millisecond deadline; undefined disables the timeout.
 * @param {string} message - Message of the Error thrown on timeout.
 * @returns {Promise<*>} The settled value of `promise`.
 * @throws {Error} With `message` when the deadline has already passed or
 *   passes before `promise` settles; otherwise whatever `promise` rejects with.
 */
async function raceWithTimeout(promise, deadline, message) {
    if (deadline === undefined) {
        return promise;
    }
    const remainingMs = deadline - Date.now();
    if (remainingMs <= 0) {
        // The work is already in flight and its outcome is being discarded;
        // attach a handler so a later rejection is not reported as unhandled.
        Promise.resolve(promise).catch(() => { });
        throw new Error(message);
    }
    let timer;
    const timeout = new Promise((_, reject) => {
        timer = setTimeout(() => reject(new Error(message)), remainingMs);
    });
    try {
        return await Promise.race([promise, timeout]);
    }
    finally {
        // Without this the pending timer keeps the event loop alive until the
        // deadline even though the race has already been decided.
        clearTimeout(timer);
    }
}
|
|
@@ -0,0 +1,155 @@
|
|
|
1
|
+
import { readFileSync, readdirSync, statSync } from "node:fs";
|
|
2
|
+
import { createHash } from "node:crypto";
|
|
3
|
+
import { join, relative, resolve } from "node:path";
|
|
4
|
+
import { parse } from "yaml";
|
|
5
|
+
import { loadAgentLabConfig } from "./config.js";
|
|
6
|
+
import { getBuiltinToolSpecs } from "./tools.js";
|
|
7
|
+
const SCENARIOS_ROOT = resolve("scenarios");
|
|
8
|
+
/**
 * Collects every scenario file (`.yaml` / `.yml`) beneath a directory.
 *
 * @param {string} [root=SCENARIOS_ROOT] - Directory to search, including all subdirectories.
 * @returns {string[]} Sorted file paths; empty when `root` does not exist.
 */
export function listScenarioFiles(root = SCENARIOS_ROOT) {
    if (!safeExists(root)) {
        return [];
    }
    const scenarioExtensions = [".yaml", ".yml"];
    const found = [];
    // Explicit work list instead of recursion; the final sort makes the
    // traversal order irrelevant to the result.
    const pendingDirs = [root];
    while (pendingDirs.length > 0) {
        const currentDir = pendingDirs.pop();
        for (const dirent of readdirSync(currentDir, { withFileTypes: true })) {
            const entryPath = join(currentDir, dirent.name);
            if (dirent.isDirectory()) {
                pendingDirs.push(entryPath);
                continue;
            }
            const isScenarioFile = dirent.isFile() && scenarioExtensions.some((ext) => dirent.name.endsWith(ext));
            if (isScenarioFile) {
                found.push(entryPath);
            }
        }
    }
    return found.sort();
}
|
|
28
|
+
/**
 * Lists summary information for every scenario file under the default root.
 *
 * Each file is loaded and validated through `loadScenarioByPath`. The set of
 * known tool names is resolved once and shared by all files instead of being
 * rebuilt for every file.
 *
 * @returns {Array<{ id: *, name: *, suite: *, difficulty: *, description: * }>}
 *   One summary per scenario file, in the order returned by `listScenarioFiles`.
 */
export function listScenarios() {
    const filePaths = listScenarioFiles();
    if (filePaths.length === 0) {
        // Nothing to validate, so the tool configuration is never touched.
        return [];
    }
    const knownToolNames = getKnownToolNames();
    return filePaths.map((filePath) => {
        const { definition } = loadScenarioByPath(filePath, knownToolNames);
        const { id, name, suite, difficulty, description } = definition;
        return { id, name, suite, difficulty, description };
    });
}
|
|
40
|
+
/**
 * Loads the scenario whose definition has the given id.
 *
 * Scenario files are loaded (and validated) in listing order until a match is
 * found. The set of known tool names is resolved once for the whole search
 * instead of once per file.
 *
 * @param {string} scenarioId - Id to look for.
 * @returns {{ definition: object, filePath: string, fileHash: string }} The loaded scenario.
 * @throws {Error} When no scenario file defines `scenarioId`, or when a file
 *   visited during the search fails to load.
 */
export function loadScenarioById(scenarioId) {
    const filePaths = listScenarioFiles();
    if (filePaths.length > 0) {
        const knownToolNames = getKnownToolNames();
        for (const filePath of filePaths) {
            const loaded = loadScenarioByPath(filePath, knownToolNames);
            if (loaded.definition.id === scenarioId) {
                return loaded;
            }
        }
    }
    throw new Error(`Scenario '${scenarioId}' not found.`);
}
|
|
49
|
+
/**
 * Loads every scenario that belongs to the given suite.
 *
 * All scenario files are loaded and validated; those whose `definition.suite`
 * differs are dropped. The set of known tool names is resolved once and
 * reused for every file.
 *
 * @param {string} suite - Suite name to match exactly.
 * @returns {Array<{ definition: object, filePath: string, fileHash: string }>} Matching scenarios.
 */
export function loadScenariosBySuite(suite) {
    const filePaths = listScenarioFiles();
    if (filePaths.length === 0) {
        return [];
    }
    const knownToolNames = getKnownToolNames();
    return filePaths
        .map((filePath) => loadScenarioByPath(filePath, knownToolNames))
        .filter(({ definition }) => definition.suite === suite);
}
|
|
54
|
+
/**
 * Reads, parses and validates a single scenario file.
 *
 * @param {string} filePath - Path to the scenario YAML file.
 * @param {Set<string>} [knownToolNames] - Tool names the scenario may reference;
 *   defaults to the result of `getKnownToolNames()`.
 * @returns {{ definition: object, filePath: string, fileHash: string }} The parsed
 *   definition, the path relative to the current working directory, and the
 *   SHA-256 hex digest of the raw file content.
 * @throws {Error} When the file cannot be read or fails validation.
 */
export function loadScenarioByPath(filePath, knownToolNames = getKnownToolNames()) {
    const absolutePath = resolve(filePath);
    const raw = readFileSync(absolutePath, "utf8");
    const definition = parse(raw);
    validateScenario(definition, absolutePath, knownToolNames);
    const relativePath = relative(process.cwd(), absolutePath);
    const hasher = createHash("sha256");
    hasher.update(raw);
    return {
        definition,
        filePath: relativePath,
        fileHash: hasher.digest("hex"),
    };
}
|
|
65
|
+
/**
 * Validates the parsed content of a scenario file and throws on the first problem.
 *
 * Checks, in order: the value is an object; `id`, `name` and `suite` are
 * non-empty strings; `task.instructions` is a non-empty string;
 * `tools.allowed` is a non-empty list of known tool names; `tools.forbidden`,
 * when present, is a list of strings; `evaluators` is a non-empty list of
 * entries with string `id` and `type` and unique ids; `runtime.max_steps` and
 * `runtime.timeout_seconds`, when present, are positive integers; and every
 * `context.fixtures` entry is a string naming an existing file.
 *
 * @param {*} value - Parsed scenario content.
 * @param {string} filePath - Path of the scenario file, used in error messages.
 * @param {Set<string>} knownToolNames - Tool names that `tools.allowed` may reference.
 * @returns {void}
 * @throws {Error} Describing the first validation failure.
 */
function validateScenario(value, filePath, knownToolNames) {
    if (!isObject(value)) {
        throw new Error(`Scenario file '${filePath}' must contain a YAML object.`);
    }
    const requiredStrings = [
        ["id", value.id],
        ["name", value.name],
        ["suite", value.suite],
    ];
    for (const [field, candidate] of requiredStrings) {
        if (typeof candidate !== "string" || candidate.length === 0) {
            throw new Error(`Scenario file '${filePath}' is missing required string field '${field}'.`);
        }
    }
    if (!isObject(value.task) || typeof value.task.instructions !== "string" || value.task.instructions.length === 0) {
        throw new Error(`Scenario file '${filePath}' must define task.instructions.`);
    }
    if (!isObject(value.tools) || !Array.isArray(value.tools.allowed) || value.tools.allowed.length === 0) {
        throw new Error(`Scenario file '${filePath}' must define at least one allowed tool.`);
    }
    for (const toolName of value.tools.allowed) {
        if (typeof toolName !== "string") {
            throw new Error(`Scenario file '${filePath}' contains a non-string tool name in tools.allowed.`);
        }
        if (!knownToolNames.has(toolName)) {
            throw new Error(`Scenario file '${filePath}' references unknown allowed tool '${toolName}'.`);
        }
    }
    const forbidden = value.tools.forbidden;
    // An absent or empty (null) `forbidden` key is fine. Anything else must be
    // a list: a bare string would otherwise slip through and make a later
    // `forbidden.includes(name)` check perform substring matching.
    if (forbidden !== undefined && forbidden !== null) {
        if (!Array.isArray(forbidden)) {
            throw new Error(`Scenario file '${filePath}' field 'tools.forbidden' must be a list of tool names.`);
        }
        for (const toolName of forbidden) {
            if (typeof toolName !== "string") {
                throw new Error(`Scenario file '${filePath}' contains a non-string tool name in tools.forbidden.`);
            }
        }
    }
    if (!Array.isArray(value.evaluators) || value.evaluators.length === 0) {
        throw new Error(`Scenario file '${filePath}' must define at least one evaluator.`);
    }
    const evaluatorIds = new Set();
    for (const evaluator of value.evaluators) {
        if (!isObject(evaluator) || typeof evaluator.id !== "string" || typeof evaluator.type !== "string") {
            throw new Error(`Scenario file '${filePath}' has an invalid evaluator entry.`);
        }
        if (evaluatorIds.has(evaluator.id)) {
            throw new Error(`Scenario file '${filePath}' defines duplicate evaluator id '${evaluator.id}'.`);
        }
        evaluatorIds.add(evaluator.id);
    }
    if (isObject(value.runtime)) {
        validatePositiveInt(value.runtime.max_steps, "runtime.max_steps", filePath);
        validatePositiveInt(value.runtime.timeout_seconds, "runtime.timeout_seconds", filePath);
    }
    if (isObject(value.context) && Array.isArray(value.context.fixtures)) {
        for (const fixturePath of value.context.fixtures) {
            if (typeof fixturePath !== "string") {
                throw new Error(`Scenario file '${filePath}' contains a non-string fixture path.`);
            }
            // Fixture paths are resolved against the current working directory.
            const resolvedPath = resolve(fixturePath);
            if (!safeExists(resolvedPath) || !statSync(resolvedPath).isFile()) {
                throw new Error(`Scenario file '${filePath}' references missing fixture '${fixturePath}'.`);
            }
        }
    }
}
|
|
129
|
+
/**
 * Ensures an optional scenario field, when set, is a positive integer.
 *
 * @param {*} value - Field value; `undefined` is accepted as "not set".
 * @param {string} field - Field name used in the error message.
 * @param {string} filePath - Scenario file path used in the error message.
 * @returns {void}
 * @throws {Error} When `value` is defined but is not an integer greater than zero.
 */
function validatePositiveInt(value, field, filePath) {
    if (value === undefined) {
        return;
    }
    // Number.isInteger is false for every non-number, so no separate typeof check is needed.
    const isPositiveInteger = Number.isInteger(value) && value > 0;
    if (!isPositiveInteger) {
        throw new Error(`Scenario file '${filePath}' field '${field}' must be a positive integer.`);
    }
}
|
|
137
|
+
/**
 * Tells whether a value is a non-null, non-array object.
 *
 * @param {*} value - Value to inspect.
 * @returns {boolean} True for objects other than `null` and arrays.
 */
function isObject(value) {
    if (value === null || Array.isArray(value)) {
        return false;
    }
    return typeof value === "object";
}
|
|
140
|
+
/**
 * Checks whether a path can be stat'ed, without throwing.
 *
 * @param {string} path - File system path to probe.
 * @returns {boolean} True when `statSync(path)` succeeds; false when it throws for any reason.
 */
function safeExists(path) {
    let reachable = true;
    try {
        statSync(path);
    }
    catch {
        reachable = false;
    }
    return reachable;
}
|
|
149
|
+
/**
 * Builds the set of tool names a scenario may reference.
 *
 * Combines the names of the built-in tool specs with the names of the tools
 * listed in the agent lab config (if any).
 *
 * @returns {Set<string>} Built-in tool names followed by configured tool names.
 */
function getKnownToolNames() {
    const builtinNames = getBuiltinToolSpecs().map((spec) => spec.name);
    const configuredNames = Array.from(loadAgentLabConfig().tools ?? [], (tool) => tool.name);
    return new Set([...builtinNames, ...configuredNames]);
}
|
package/dist/scoring.js
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
// Minimum score (0-100) a run needs in order to pass.
const PASS_THRESHOLD = 80;
/**
 * Turns evaluator results into an overall score and pass/fail status.
 *
 * The score is the percentage of weight earned by passing "weighted" results,
 * rounded to the nearest integer. It is 100 when there are no weighted results
 * or their weights sum to zero. A failed "hard_gate" result forces status
 * "fail" but leaves the score untouched.
 *
 * @param {Array<{ mode: string, status: string, weight?: number }>} results - Evaluator results.
 * @returns {{ score: number, status: "pass" | "fail" }} Score and resulting status.
 */
export function computeScore(results) {
    let hardGateFailed = false;
    let weightedCount = 0;
    let totalWeight = 0;
    let earnedWeight = 0;
    for (const result of results) {
        if (result.mode === "hard_gate") {
            if (result.status === "fail") {
                hardGateFailed = true;
            }
            continue;
        }
        if (result.mode !== "weighted") {
            continue;
        }
        weightedCount += 1;
        const weight = result.weight ?? 0;
        totalWeight += weight;
        if (result.status === "pass") {
            earnedWeight += weight;
        }
    }
    let score = 100;
    if (weightedCount > 0 && totalWeight !== 0) {
        score = Math.round((earnedWeight / totalWeight) * 100);
    }
    const passed = !hardGateFailed && score >= PASS_THRESHOLD;
    return { score, status: passed ? "pass" : "fail" };
}
|