agent-regression-lab 0.2.0 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +53 -7
- package/dist/agent/factory.js +20 -6
- package/dist/agent/httpAdapter.js +5 -4
- package/dist/config.js +186 -3
- package/dist/evaluators.js +56 -1
- package/dist/index.js +143 -11
- package/dist/lib/id.js +3 -0
- package/dist/runOutput.js +46 -0
- package/dist/runner.js +31 -9
- package/dist/scenarios.js +90 -2
- package/dist/scoring.js +2 -2
- package/dist/storage.js +117 -7
- package/dist/tools.js +38 -0
- package/dist/trace.js +4 -2
- package/dist/ui/App.js +28 -2
- package/dist/ui-assets/client.js +82 -0
- package/docs/agents.md +143 -8
- package/docs/golden-suites.md +74 -0
- package/docs/integrations-and-live-services.md +58 -0
- package/docs/memory-and-stateful-agents.md +51 -0
- package/docs/release-checklist.md +30 -0
- package/docs/runtime-profiles.md +67 -0
- package/docs/scenarios.md +303 -56
- package/docs/troubleshooting.md +138 -0
- package/docs/variant-sets.md +63 -0
- package/package.json +2 -2
package/dist/storage.js
CHANGED
|
@@ -10,6 +10,10 @@ export class Storage {
|
|
|
10
10
|
ensureParentDir(DB_PATH);
|
|
11
11
|
this.db = new DatabaseSync(DB_PATH);
|
|
12
12
|
this.db.exec(`
|
|
13
|
+
PRAGMA journal_mode = WAL;
|
|
14
|
+
PRAGMA busy_timeout = 5000;
|
|
15
|
+
`);
|
|
16
|
+
this.db.exec(`
|
|
13
17
|
CREATE TABLE IF NOT EXISTS metadata (
|
|
14
18
|
key TEXT PRIMARY KEY,
|
|
15
19
|
value TEXT NOT NULL
|
|
@@ -35,6 +39,15 @@ export class Storage {
|
|
|
35
39
|
provider TEXT,
|
|
36
40
|
command TEXT,
|
|
37
41
|
args_json TEXT,
|
|
42
|
+
variant_set_name TEXT,
|
|
43
|
+
variant_label TEXT,
|
|
44
|
+
prompt_version TEXT,
|
|
45
|
+
model_version TEXT,
|
|
46
|
+
tool_schema_version TEXT,
|
|
47
|
+
config_label TEXT,
|
|
48
|
+
config_hash TEXT,
|
|
49
|
+
runtime_profile_name TEXT,
|
|
50
|
+
suite_definition_name TEXT,
|
|
38
51
|
config_json TEXT NOT NULL,
|
|
39
52
|
created_at TEXT NOT NULL
|
|
40
53
|
);
|
|
@@ -45,6 +58,15 @@ export class Storage {
|
|
|
45
58
|
scenario_file_hash TEXT NOT NULL,
|
|
46
59
|
agent_version_id TEXT NOT NULL,
|
|
47
60
|
suite_batch_id TEXT,
|
|
61
|
+
variant_set_name TEXT,
|
|
62
|
+
variant_label TEXT,
|
|
63
|
+
prompt_version TEXT,
|
|
64
|
+
model_version TEXT,
|
|
65
|
+
tool_schema_version TEXT,
|
|
66
|
+
config_label TEXT,
|
|
67
|
+
config_hash TEXT,
|
|
68
|
+
runtime_profile_name TEXT,
|
|
69
|
+
suite_definition_name TEXT,
|
|
48
70
|
status TEXT NOT NULL,
|
|
49
71
|
termination_reason TEXT NOT NULL,
|
|
50
72
|
final_output TEXT NOT NULL,
|
|
@@ -120,25 +142,41 @@ export class Storage {
|
|
|
120
142
|
upsertAgentVersion(agentVersion) {
|
|
121
143
|
const now = new Date().toISOString();
|
|
122
144
|
this.db
|
|
123
|
-
.prepare(`INSERT INTO agent_versions (
|
|
124
|
-
|
|
145
|
+
.prepare(`INSERT INTO agent_versions (
|
|
146
|
+
id, label, model_id, provider, command, args_json,
|
|
147
|
+
variant_set_name, variant_label, prompt_version, model_version, tool_schema_version,
|
|
148
|
+
config_label, config_hash, runtime_profile_name, suite_definition_name,
|
|
149
|
+
config_json, created_at
|
|
150
|
+
)
|
|
151
|
+
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
|
|
125
152
|
ON CONFLICT(id) DO UPDATE SET
|
|
126
153
|
label = excluded.label,
|
|
127
154
|
model_id = excluded.model_id,
|
|
128
155
|
provider = excluded.provider,
|
|
129
156
|
command = excluded.command,
|
|
130
157
|
args_json = excluded.args_json,
|
|
158
|
+
variant_set_name = excluded.variant_set_name,
|
|
159
|
+
variant_label = excluded.variant_label,
|
|
160
|
+
prompt_version = excluded.prompt_version,
|
|
161
|
+
model_version = excluded.model_version,
|
|
162
|
+
tool_schema_version = excluded.tool_schema_version,
|
|
163
|
+
config_label = excluded.config_label,
|
|
164
|
+
config_hash = excluded.config_hash,
|
|
165
|
+
runtime_profile_name = excluded.runtime_profile_name,
|
|
166
|
+
suite_definition_name = excluded.suite_definition_name,
|
|
131
167
|
config_json = excluded.config_json`)
|
|
132
|
-
.run(agentVersion.id, agentVersion.label, agentVersion.modelId ?? null, agentVersion.provider ?? null, agentVersion.command ?? null, JSON.stringify(agentVersion.args ?? []), JSON.stringify(agentVersion.config), now);
|
|
168
|
+
.run(agentVersion.id, agentVersion.label, agentVersion.modelId ?? null, agentVersion.provider ?? null, agentVersion.command ?? null, JSON.stringify(agentVersion.args ?? []), agentVersion.variantSetName ?? null, agentVersion.variantLabel ?? null, agentVersion.promptVersion ?? null, agentVersion.modelVersion ?? null, agentVersion.toolSchemaVersion ?? null, agentVersion.configLabel ?? null, agentVersion.configHash ?? null, agentVersion.runtimeProfileName ?? null, agentVersion.suiteDefinitionName ?? null, JSON.stringify(agentVersion.config), now);
|
|
133
169
|
}
|
|
134
170
|
saveRun(bundle) {
|
|
135
171
|
const run = bundle.run;
|
|
136
172
|
this.db
|
|
137
173
|
.prepare(`INSERT INTO runs (
|
|
138
174
|
id, scenario_id, scenario_file_hash, agent_version_id, status, termination_reason, final_output,
|
|
139
|
-
suite_batch_id,
|
|
140
|
-
|
|
141
|
-
|
|
175
|
+
suite_batch_id, variant_set_name, variant_label, prompt_version, model_version, tool_schema_version,
|
|
176
|
+
config_label, config_hash, runtime_profile_name, suite_definition_name,
|
|
177
|
+
total_steps, total_tool_calls, duration_ms, total_tokens, total_cost_usd, score, started_at, finished_at
|
|
178
|
+
) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`)
|
|
179
|
+
.run(run.id, run.scenarioId, run.scenarioFileHash, run.agentVersionId, run.status, run.terminationReason, run.finalOutput, run.suiteBatchId ?? null, run.variantSetName ?? null, run.variantLabel ?? null, run.promptVersion ?? null, run.modelVersion ?? null, run.toolSchemaVersion ?? null, run.configLabel ?? null, run.configHash ?? null, run.runtimeProfileName ?? null, run.suiteDefinitionName ?? null, run.totalSteps, run.totalToolCalls, run.durationMs, run.totalTokens ?? null, run.totalCostUsd ?? null, run.score, run.startedAt, run.finishedAt);
|
|
142
180
|
const insertStep = this.db.prepare(`INSERT INTO run_steps (id, run_id, step_index, timestamp, source, type, payload_json)
|
|
143
181
|
VALUES (?, ?, ?, ?, ?, ?, ?)`);
|
|
144
182
|
const insertTool = this.db.prepare(`INSERT INTO tool_calls (id, run_id, step_index, tool_name, input_json, output_json, status, duration_ms, error_message)
|
|
@@ -183,6 +221,7 @@ export class Storage {
|
|
|
183
221
|
return this.db
|
|
184
222
|
.prepare(`SELECT r.id, r.scenario_id as scenarioId, s.suite, r.agent_version_id as agentVersionId,
|
|
185
223
|
r.suite_batch_id as suiteBatchId,
|
|
224
|
+
r.variant_set_name as variantSetName, r.variant_label as variantLabel,
|
|
186
225
|
av.label as agentLabel, av.provider, av.model_id as modelId,
|
|
187
226
|
r.status, r.score, r.duration_ms as durationMs, r.total_steps as totalSteps,
|
|
188
227
|
r.started_at as startedAt
|
|
@@ -244,6 +283,11 @@ export class Storage {
|
|
|
244
283
|
}));
|
|
245
284
|
const agentVersion = this.db
|
|
246
285
|
.prepare(`SELECT id, label, model_id as modelId, provider, command, args_json, config_json
|
|
286
|
+
, variant_set_name as variantSetName, variant_label as variantLabel,
|
|
287
|
+
prompt_version as promptVersion, model_version as modelVersion,
|
|
288
|
+
tool_schema_version as toolSchemaVersion, config_label as configLabel,
|
|
289
|
+
config_hash as configHash, runtime_profile_name as runtimeProfileName,
|
|
290
|
+
suite_definition_name as suiteDefinitionName
|
|
247
291
|
FROM agent_versions WHERE id = ?`)
|
|
248
292
|
.get(run.agentVersionId);
|
|
249
293
|
return {
|
|
@@ -259,6 +303,15 @@ export class Storage {
|
|
|
259
303
|
provider: agentVersion.provider ?? undefined,
|
|
260
304
|
command: agentVersion.command ?? undefined,
|
|
261
305
|
args: agentVersion.args_json ? JSON.parse(agentVersion.args_json) : undefined,
|
|
306
|
+
variantSetName: agentVersion.variantSetName ?? undefined,
|
|
307
|
+
variantLabel: agentVersion.variantLabel ?? undefined,
|
|
308
|
+
promptVersion: agentVersion.promptVersion ?? undefined,
|
|
309
|
+
modelVersion: agentVersion.modelVersion ?? undefined,
|
|
310
|
+
toolSchemaVersion: agentVersion.toolSchemaVersion ?? undefined,
|
|
311
|
+
configLabel: agentVersion.configLabel ?? undefined,
|
|
312
|
+
configHash: agentVersion.configHash ?? undefined,
|
|
313
|
+
runtimeProfileName: agentVersion.runtimeProfileName ?? undefined,
|
|
314
|
+
suiteDefinitionName: agentVersion.suiteDefinitionName ?? undefined,
|
|
262
315
|
config: JSON.parse(agentVersion.config_json),
|
|
263
316
|
}
|
|
264
317
|
: undefined,
|
|
@@ -348,7 +401,10 @@ export class Storage {
|
|
|
348
401
|
getRunRecord(runId) {
|
|
349
402
|
return (this.db
|
|
350
403
|
.prepare(`SELECT id, scenario_id as scenarioId, scenario_file_hash as scenarioFileHash, agent_version_id as agentVersionId,
|
|
351
|
-
suite_batch_id as suiteBatchId,
|
|
404
|
+
suite_batch_id as suiteBatchId, variant_set_name as variantSetName, variant_label as variantLabel,
|
|
405
|
+
prompt_version as promptVersion, model_version as modelVersion, tool_schema_version as toolSchemaVersion,
|
|
406
|
+
config_label as configLabel, config_hash as configHash, runtime_profile_name as runtimeProfileName,
|
|
407
|
+
suite_definition_name as suiteDefinitionName,
|
|
352
408
|
status, termination_reason as terminationReason, final_output as finalOutput, total_steps as totalSteps,
|
|
353
409
|
total_tool_calls as totalToolCalls, duration_ms as durationMs, total_tokens as totalTokens,
|
|
354
410
|
total_cost_usd as totalCostUsd, score, started_at as startedAt, finished_at as finishedAt
|
|
@@ -388,6 +444,33 @@ export class Storage {
|
|
|
388
444
|
if (!names.has("args_json")) {
|
|
389
445
|
this.db.exec(`ALTER TABLE agent_versions ADD COLUMN args_json TEXT`);
|
|
390
446
|
}
|
|
447
|
+
if (!names.has("variant_set_name")) {
|
|
448
|
+
this.db.exec(`ALTER TABLE agent_versions ADD COLUMN variant_set_name TEXT`);
|
|
449
|
+
}
|
|
450
|
+
if (!names.has("variant_label")) {
|
|
451
|
+
this.db.exec(`ALTER TABLE agent_versions ADD COLUMN variant_label TEXT`);
|
|
452
|
+
}
|
|
453
|
+
if (!names.has("prompt_version")) {
|
|
454
|
+
this.db.exec(`ALTER TABLE agent_versions ADD COLUMN prompt_version TEXT`);
|
|
455
|
+
}
|
|
456
|
+
if (!names.has("model_version")) {
|
|
457
|
+
this.db.exec(`ALTER TABLE agent_versions ADD COLUMN model_version TEXT`);
|
|
458
|
+
}
|
|
459
|
+
if (!names.has("tool_schema_version")) {
|
|
460
|
+
this.db.exec(`ALTER TABLE agent_versions ADD COLUMN tool_schema_version TEXT`);
|
|
461
|
+
}
|
|
462
|
+
if (!names.has("config_label")) {
|
|
463
|
+
this.db.exec(`ALTER TABLE agent_versions ADD COLUMN config_label TEXT`);
|
|
464
|
+
}
|
|
465
|
+
if (!names.has("config_hash")) {
|
|
466
|
+
this.db.exec(`ALTER TABLE agent_versions ADD COLUMN config_hash TEXT`);
|
|
467
|
+
}
|
|
468
|
+
if (!names.has("runtime_profile_name")) {
|
|
469
|
+
this.db.exec(`ALTER TABLE agent_versions ADD COLUMN runtime_profile_name TEXT`);
|
|
470
|
+
}
|
|
471
|
+
if (!names.has("suite_definition_name")) {
|
|
472
|
+
this.db.exec(`ALTER TABLE agent_versions ADD COLUMN suite_definition_name TEXT`);
|
|
473
|
+
}
|
|
391
474
|
}
|
|
392
475
|
ensureRunColumns() {
|
|
393
476
|
const columns = this.db.prepare(`PRAGMA table_info(runs)`).all();
|
|
@@ -395,6 +478,33 @@ export class Storage {
|
|
|
395
478
|
if (!names.has("suite_batch_id")) {
|
|
396
479
|
this.db.exec(`ALTER TABLE runs ADD COLUMN suite_batch_id TEXT`);
|
|
397
480
|
}
|
|
481
|
+
if (!names.has("variant_set_name")) {
|
|
482
|
+
this.db.exec(`ALTER TABLE runs ADD COLUMN variant_set_name TEXT`);
|
|
483
|
+
}
|
|
484
|
+
if (!names.has("variant_label")) {
|
|
485
|
+
this.db.exec(`ALTER TABLE runs ADD COLUMN variant_label TEXT`);
|
|
486
|
+
}
|
|
487
|
+
if (!names.has("prompt_version")) {
|
|
488
|
+
this.db.exec(`ALTER TABLE runs ADD COLUMN prompt_version TEXT`);
|
|
489
|
+
}
|
|
490
|
+
if (!names.has("model_version")) {
|
|
491
|
+
this.db.exec(`ALTER TABLE runs ADD COLUMN model_version TEXT`);
|
|
492
|
+
}
|
|
493
|
+
if (!names.has("tool_schema_version")) {
|
|
494
|
+
this.db.exec(`ALTER TABLE runs ADD COLUMN tool_schema_version TEXT`);
|
|
495
|
+
}
|
|
496
|
+
if (!names.has("config_label")) {
|
|
497
|
+
this.db.exec(`ALTER TABLE runs ADD COLUMN config_label TEXT`);
|
|
498
|
+
}
|
|
499
|
+
if (!names.has("config_hash")) {
|
|
500
|
+
this.db.exec(`ALTER TABLE runs ADD COLUMN config_hash TEXT`);
|
|
501
|
+
}
|
|
502
|
+
if (!names.has("runtime_profile_name")) {
|
|
503
|
+
this.db.exec(`ALTER TABLE runs ADD COLUMN runtime_profile_name TEXT`);
|
|
504
|
+
}
|
|
505
|
+
if (!names.has("suite_definition_name")) {
|
|
506
|
+
this.db.exec(`ALTER TABLE runs ADD COLUMN suite_definition_name TEXT`);
|
|
507
|
+
}
|
|
398
508
|
}
|
|
399
509
|
getRunsBySuiteBatchId(suiteBatchId) {
|
|
400
510
|
const runIds = this.db
|
package/dist/tools.js
CHANGED
|
@@ -2,6 +2,38 @@ import { readFileSync } from "node:fs";
|
|
|
2
2
|
import { pathToFileURL } from "node:url";
|
|
3
3
|
import { resolve } from "node:path";
|
|
4
4
|
import { loadAgentLabConfig } from "./config.js";
|
|
5
|
+
/**
 * Builds a tool map in which every tool targeted by the runtime profile's
 * `tool_faults` list is replaced by a stub that injects the configured fault
 * instead of running the real tool.
 *
 * The input map is never mutated. When the profile is missing or declares no
 * faults, the very same `tools` object is returned. Faults naming a tool that
 * is not an own entry of the map are ignored; this deliberately excludes
 * inherited members such as `toString` or `constructor`, which a plain
 * property lookup would otherwise treat as existing tools.
 *
 * @param {object} tools - Tool implementations keyed by tool name.
 * @param {object|null|undefined} profile - Runtime profile; only
 *   `profile.tool_faults` is read. Each fault supplies `tool` and `mode`, and
 *   optionally `timeout_ms`, `error_message` or `partial_output`.
 * @param {object} trace - Recorder whose `record(source, type, payload, options)`
 *   is invoked every time a faulted tool is called.
 * @returns {object} `tools` itself when nothing is faulted, otherwise a
 *   shallow copy with the faulted entries replaced.
 */
export function applyRuntimeProfileToTools(tools, profile, trace) {
    if (!profile?.tool_faults?.length) {
        return tools;
    }
    const wrapped = { ...tools };
    for (const fault of profile.tool_faults) {
        // Own-property check: `wrapped` is a plain object, so a bare
        // `wrapped[fault.tool]` lookup would also find Object.prototype members.
        if (!Object.hasOwn(wrapped, fault.tool) || !wrapped[fault.tool]) {
            continue;
        }
        wrapped[fault.tool] = async (input, context) => {
            // Recorded with countStep: false, as in the original call.
            trace.record("system", "tool_fault_injected", {
                tool: fault.tool,
                mode: fault.mode,
            }, { countStep: false });
            if (fault.mode === "timeout") {
                // Wait out the configured delay (5000 ms by default), then fail
                // with an error tagged `timeout_exceeded`.
                await waitUnref(fault.timeout_ms ?? 5000);
                const timeoutError = new Error(`Injected timeout for ${fault.tool}`);
                timeoutError.code = "timeout_exceeded";
                throw timeoutError;
            }
            if (fault.mode === "error") {
                throw new Error(fault.error_message ?? `Injected failure for ${fault.tool}`);
            }
            if (fault.mode === "malformed_output") {
                return "MALFORMED_OUTPUT";
            }
            // Any other mode resolves to the configured partial output, or an
            // empty object when none was provided.
            return fault.partial_output ?? {};
        };
    }
    return wrapped;
}
|
|
5
37
|
function loadFixture(path) {
|
|
6
38
|
const raw = readFileSync(resolve(path), "utf8");
|
|
7
39
|
return JSON.parse(raw);
|
|
@@ -372,3 +404,9 @@ function assertObject(value) {
|
|
|
372
404
|
throw new Error("Tool input must be an object.");
|
|
373
405
|
}
|
|
374
406
|
}
|
|
407
|
+
/**
 * Resolves (with no value) once `timeoutMs` milliseconds have elapsed.
 *
 * When the handle returned by `setTimeout` exposes `unref`, it is called on
 * the handle right after scheduling.
 *
 * @param {number} timeoutMs - Delay passed straight to `setTimeout`.
 * @returns {Promise<void>}
 */
function waitUnref(timeoutMs) {
    return new Promise((done) => {
        const handle = setTimeout(done, timeoutMs);
        // Handles without an `unref` member are left untouched.
        if (handle.unref != null) {
            handle.unref();
        }
    });
}
|
package/dist/trace.js
CHANGED
|
@@ -8,8 +8,10 @@ export class TraceRecorder {
|
|
|
8
8
|
this.runId = runId;
|
|
9
9
|
this.scenarioId = scenarioId;
|
|
10
10
|
}
|
|
11
|
-
record(source, type, payload) {
|
|
12
|
-
|
|
11
|
+
record(source, type, payload, options) {
|
|
12
|
+
if (options?.countStep !== false) {
|
|
13
|
+
this.stepIndex += 1;
|
|
14
|
+
}
|
|
13
15
|
this.events.push({
|
|
14
16
|
eventId: createEventId(),
|
|
15
17
|
runId: this.runId,
|
package/dist/ui/App.js
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import { jsx as _jsx, jsxs as _jsxs } from "react/jsx-runtime";
|
|
1
|
+
import { jsx as _jsx, jsxs as _jsxs, Fragment as _Fragment } from "react/jsx-runtime";
|
|
2
2
|
import { useEffect, useState } from "react";
|
|
3
3
|
export function App() {
|
|
4
4
|
const route = getRoute();
|
|
@@ -37,7 +37,18 @@ function RunDetailPage(props) {
|
|
|
37
37
|
if (!detail) {
|
|
38
38
|
return _jsx(EmptyState, { title: "Loading run", description: "Fetching run detail from the local lab." });
|
|
39
39
|
}
|
|
40
|
-
return (_jsxs("section", { children: [_jsxs("div", { className: "hero", children: [_jsx("h1", { children: detail.run.id }), _jsx("p", { children: detail.run.scenarioId })] }), _jsxs("div", { className: "stats", children: [_jsx(Stat, { label: "Status", value: _jsx("span", { className: `pill ${detail.run.status}`, children: detail.run.status }) }), _jsx(Stat, { label: "Score", value: detail.run.score }), _jsx(Stat, { label: "Runtime", value: `${detail.run.durationMs}ms` }), _jsx(Stat, { label: "Steps", value: detail.run.totalSteps })] }), _jsxs("div", { className: "panel-grid", children: [_jsxs("section", { className: "panel", children: [_jsx("h2", { children: "Summary" }), _jsxs("p", { children: [_jsx("strong", { children: "Provider:" }), " ", detail.agentVersion?.provider ?? "-"] }), _jsxs("p", { children: [_jsx("strong", { children: "Model:" }), " ", detail.agentVersion?.modelId ?? "-"] }), detail.agentVersion?.command ? (_jsxs("p", { children: [_jsx("strong", { children: "Command:" }), " ", detail.agentVersion.command, " ", (detail.agentVersion.args ?? []).join(" ")] })) : null, _jsxs("p", { children: [_jsx("strong", { children: "Termination:" }), " ", detail.run.terminationReason] }), detail.errorDetail ? _jsxs("p", { children: [_jsx("strong", { children: "Error:" }), " ", detail.errorDetail] }) : null, _jsx("p", { children: _jsx("strong", { children: "Final output:" }) }), _jsx("pre", { children: detail.run.finalOutput || "(none)" })] }), _jsxs("section", { className: "panel", children: [_jsx("h2", { children: "Evaluators" }), _jsx("ul", { className: "stack", children: detail.evaluatorResults.map((result) => (_jsxs("li", { children: [_jsx("span", { className: `pill ${result.status}`, children: result.status }), " ", result.evaluatorId, _jsx("div", { className: "muted", children: result.message })] }, result.evaluatorId))) })] })] }), _jsxs("section", { className: "panel", children: [_jsx("h2", { children: "Tool Calls" }), detail.toolCalls.length === 0 ? 
_jsx("p", { className: "muted", children: "No tool calls recorded." }) : null, _jsx("ul", { className: "stack", children: detail.toolCalls.map((call) => (_jsxs("li", { children: [_jsx("strong", { children: call.toolName }), " ", _jsx("span", { className: `pill ${call.status}`, children: call.status }), _jsx("pre", { children: JSON.stringify({ input: call.input, output: call.output }, null, 2) })] }, call.id))) })] }), _jsxs("section", { className: "panel", children: [_jsx("h2", { children: "Trace" }), _jsx("ol", { className: "timeline", children: detail.traceEvents.map((event) => (_jsxs("li", { children: [_jsxs("div", { children: [_jsxs("strong", { children: [event.stepIndex, ". ", event.type] }), " ", _jsx("span", { className: "muted", children: event.source })] }), _jsx("pre", { children: JSON.stringify(event.payload, null, 2) })] }, event.eventId))) })] })] }));
|
|
40
|
+
return (_jsxs("section", { children: [_jsxs("div", { className: "hero", children: [_jsx("h1", { children: detail.run.id }), _jsx("p", { children: detail.run.scenarioId })] }), _jsx(FailureSummaryPanel, { detail: detail }), _jsxs("div", { className: "stats", children: [_jsx(Stat, { label: "Status", value: _jsx("span", { className: `pill ${detail.run.status}`, children: detail.run.status }) }), _jsx(Stat, { label: "Score", value: detail.run.score }), _jsx(Stat, { label: "Runtime", value: `${detail.run.durationMs}ms` }), _jsx(Stat, { label: "Steps", value: detail.run.totalSteps })] }), _jsxs("div", { className: "panel-grid", children: [_jsxs("section", { className: "panel", children: [_jsx("h2", { children: "Summary" }), _jsxs("p", { children: [_jsx("strong", { children: "Provider:" }), " ", detail.agentVersion?.provider ?? "-"] }), _jsxs("p", { children: [_jsx("strong", { children: "Model:" }), " ", detail.agentVersion?.modelId ?? "-"] }), _jsx(RunIdentitySummary, { detail: detail }), detail.agentVersion?.command ? (_jsxs("p", { children: [_jsx("strong", { children: "Command:" }), " ", detail.agentVersion.command, " ", (detail.agentVersion.args ?? []).join(" ")] })) : null, _jsxs("p", { children: [_jsx("strong", { children: "Termination:" }), " ", detail.run.terminationReason] }), detail.errorDetail ? 
_jsxs("p", { children: [_jsx("strong", { children: "Error:" }), " ", detail.errorDetail] }) : null, _jsx("p", { children: _jsx("strong", { children: "Final output:" }) }), _jsx("pre", { children: detail.run.finalOutput || "(none)" })] }), _jsxs("section", { className: "panel", children: [_jsx("h2", { children: "Evaluators" }), _jsx("ul", { className: "stack", children: detail.evaluatorResults.map((result) => (_jsxs("li", { children: [_jsx("span", { className: `pill ${result.status}`, children: result.status }), " ", result.evaluatorId, _jsx("div", { className: "muted", children: result.message })] }, result.evaluatorId))) })] })] }), _jsxs("section", { className: "panel", children: [_jsx("h2", { children: "Tool Calls" }), detail.toolCalls.length === 0 ? _jsx("p", { className: "muted", children: "No tool calls recorded." }) : null, _jsx("ul", { className: "stack", children: detail.toolCalls.map((call) => (_jsxs("li", { children: [_jsx("strong", { children: call.toolName }), " ", _jsx("span", { className: `pill ${call.status}`, children: call.status }), _jsx("pre", { children: JSON.stringify({ input: call.input, output: call.output }, null, 2) })] }, call.id))) })] }), _jsxs("section", { className: "panel", children: [_jsx("h2", { children: "Trace" }), _jsx("ol", { className: "timeline", children: detail.traceEvents.map((event) => (_jsxs("li", { children: [_jsxs("div", { children: [_jsxs("strong", { children: [event.stepIndex, ". ", event.type] }), " ", _jsx("span", { className: "muted", children: event.source })] }), _jsx("pre", { children: JSON.stringify(event.payload, null, 2) })] }, event.eventId))) })] })] }));
|
|
41
|
+
}
|
|
42
|
+
/**
 * Renders the "Failures First" panel of the run detail page.
 *
 * Returns `null` when `getFailureSummaryItems` reports nothing for the run;
 * otherwise a panel section holding the run status pill, the termination
 * reason and one list entry per summary item.
 *
 * @param {object} props - Component props; `props.detail` is the run detail.
 */
export function FailureSummaryPanel(props) {
    const { detail } = props;
    const items = getFailureSummaryItems(detail);
    if (items.length === 0) {
        return null;
    }
    const heading = _jsx("h2", { children: "Failures First" });
    const statusLine = _jsxs("p", {
        children: [
            _jsx("strong", { children: "Status:" }),
            " ",
            _jsx("span", { className: `pill ${detail.run.status}`, children: detail.run.status }),
        ],
    });
    const terminationLine = _jsxs("p", {
        children: [
            _jsx("strong", { children: "Termination:" }),
            " ",
            detail.run.terminationReason,
        ],
    });
    // The item text doubles as the React key of its list entry.
    const entries = items.map((item) => _jsx("li", { children: item }, item));
    const list = _jsx("ul", { className: "stack", children: entries });
    return _jsxs("section", {
        className: "panel",
        children: [heading, statusLine, terminationLine, list],
    });
}
|
|
49
|
+
/**
 * Renders the identity metadata of a run as a fragment of labelled
 * paragraphs, in this order: variant set, variant, prompt version, model
 * version, tool schema version, config label, runtime profile, suite
 * definition. A field that is null or undefined is shown as "-".
 *
 * @param {object} props - Component props; fields are read from `props.detail.run`.
 */
export function RunIdentitySummary(props) {
    const { run } = props.detail;
    const fields = [
        ["Variant set:", run.variantSetName],
        ["Variant:", run.variantLabel],
        ["Prompt version:", run.promptVersion],
        ["Model version:", run.modelVersion],
        ["Tool schema version:", run.toolSchemaVersion],
        ["Config label:", run.configLabel],
        ["Runtime profile:", run.runtimeProfileName],
        ["Suite definition:", run.suiteDefinitionName],
    ];
    const rows = fields.map(([caption, value]) => _jsxs("p", {
        children: [_jsx("strong", { children: caption }), " ", value ?? "-"],
    }));
    return _jsxs(_Fragment, { children: rows });
}
|
|
42
53
|
function ComparePage(props) {
|
|
43
54
|
const [data, setData] = useState(null);
|
|
@@ -95,6 +106,21 @@ function Stat(props) {
|
|
|
95
106
|
function EmptyState(props) {
|
|
96
107
|
return (_jsxs("section", { className: "empty", children: [_jsx("h1", { children: props.title }), _jsx("p", { children: props.description })] }));
|
|
97
108
|
}
|
|
109
|
+
/**
 * Collects the human-readable failure lines for a run detail.
 *
 * The list starts with an "Error: ..." line when `detail.errorDetail` is
 * truthy, followed by one "Evaluator <id>: <message>" line per evaluator
 * result whose status is "fail". When neither produced a line and the run
 * status is not "pass", a single generic hint is returned instead.
 *
 * @param {object} detail - Run detail; reads `errorDetail`, `evaluatorResults` and `run.status`.
 * @returns {string[]} Failure lines, empty when there is nothing to report.
 */
export function getFailureSummaryItems(detail) {
    const errorLines = detail.errorDetail ? [`Error: ${detail.errorDetail}`] : [];
    const evaluatorLines = [...detail.evaluatorResults]
        .filter((result) => result.status === "fail")
        .map((result) => `Evaluator ${result.evaluatorId}: ${result.message}`);
    const items = [...errorLines, ...evaluatorLines];
    const didNotPass = detail.run.status !== "pass";
    if (didNotPass && items.length === 0) {
        items.push("Run did not pass. Inspect evaluator results and trace for the first divergence.");
    }
    return items;
}
|
|
98
124
|
function signed(value) {
|
|
99
125
|
return value > 0 ? `+${value}` : `${value}`;
|
|
100
126
|
}
|
package/dist/ui-assets/client.js
CHANGED
|
@@ -21816,6 +21816,7 @@ function RunDetailPage(props) {
|
|
|
21816
21816
|
/* @__PURE__ */ (0, import_jsx_runtime.jsx)("h1", { children: detail.run.id }),
|
|
21817
21817
|
/* @__PURE__ */ (0, import_jsx_runtime.jsx)("p", { children: detail.run.scenarioId })
|
|
21818
21818
|
] }),
|
|
21819
|
+
/* @__PURE__ */ (0, import_jsx_runtime.jsx)(FailureSummaryPanel, { detail }),
|
|
21819
21820
|
/* @__PURE__ */ (0, import_jsx_runtime.jsxs)("div", { className: "stats", children: [
|
|
21820
21821
|
/* @__PURE__ */ (0, import_jsx_runtime.jsx)(Stat, { label: "Status", value: /* @__PURE__ */ (0, import_jsx_runtime.jsx)("span", { className: `pill ${detail.run.status}`, children: detail.run.status }) }),
|
|
21821
21822
|
/* @__PURE__ */ (0, import_jsx_runtime.jsx)(Stat, { label: "Score", value: detail.run.score }),
|
|
@@ -21835,6 +21836,7 @@ function RunDetailPage(props) {
|
|
|
21835
21836
|
" ",
|
|
21836
21837
|
detail.agentVersion?.modelId ?? "-"
|
|
21837
21838
|
] }),
|
|
21839
|
+
/* @__PURE__ */ (0, import_jsx_runtime.jsx)(RunIdentitySummary, { detail }),
|
|
21838
21840
|
detail.agentVersion?.command ? /* @__PURE__ */ (0, import_jsx_runtime.jsxs)("p", { children: [
|
|
21839
21841
|
/* @__PURE__ */ (0, import_jsx_runtime.jsx)("strong", { children: "Command:" }),
|
|
21840
21842
|
" ",
|
|
@@ -21892,6 +21894,71 @@ function RunDetailPage(props) {
|
|
|
21892
21894
|
] })
|
|
21893
21895
|
] });
|
|
21894
21896
|
}
|
|
21897
|
+
// Bundled copy of the "Failures First" panel: returns null when
// getFailureSummaryItems yields no lines, otherwise a panel section with the
// run status pill, the termination reason and one list entry per line.
function FailureSummaryPanel(props) {
  const { jsx, jsxs } = import_jsx_runtime;
  const { detail } = props;
  const lines = getFailureSummaryItems(detail);
  if (lines.length === 0) {
    return null;
  }
  const statusPill = jsx("span", { className: `pill ${detail.run.status}`, children: detail.run.status });
  // Each line's text is also used as the key of its list entry.
  const entries = lines.map((line) => jsx("li", { children: line }, line));
  return jsxs("section", { className: "panel", children: [
    jsx("h2", { children: "Failures First" }),
    jsxs("p", { children: [jsx("strong", { children: "Status:" }), " ", statusPill] }),
    jsxs("p", { children: [jsx("strong", { children: "Termination:" }), " ", detail.run.terminationReason] }),
    jsx("ul", { className: "stack", children: entries })
  ] });
}
|
|
21917
|
+
// Bundled copy of the run identity summary: a fragment of labelled
// paragraphs, one per identity field of props.detail.run, showing "-" for any
// field that is null or undefined.
function RunIdentitySummary(props) {
  const { jsx, jsxs, Fragment } = import_jsx_runtime;
  const run = props.detail.run;
  const paragraph = (caption, value) => jsxs("p", { children: [
    jsx("strong", { children: caption }),
    " ",
    value ?? "-"
  ] });
  return jsxs(Fragment, { children: [
    paragraph("Variant set:", run.variantSetName),
    paragraph("Variant:", run.variantLabel),
    paragraph("Prompt version:", run.promptVersion),
    paragraph("Model version:", run.modelVersion),
    paragraph("Tool schema version:", run.toolSchemaVersion),
    paragraph("Config label:", run.configLabel),
    paragraph("Runtime profile:", run.runtimeProfileName),
    paragraph("Suite definition:", run.suiteDefinitionName)
  ] });
}
|
|
21895
21962
|
function ComparePage(props) {
|
|
21896
21963
|
const [data, setData] = (0, import_react.useState)(null);
|
|
21897
21964
|
(0, import_react.useEffect)(() => {
|
|
@@ -22092,6 +22159,21 @@ function EmptyState(props) {
|
|
|
22092
22159
|
/* @__PURE__ */ (0, import_jsx_runtime.jsx)("p", { children: props.description })
|
|
22093
22160
|
] });
|
|
22094
22161
|
}
|
|
22162
|
+
// Bundled copy of the failure-line collector: an "Error: ..." line when
// detail.errorDetail is truthy, one line per evaluator result with status
// "fail", and a generic hint when nothing was collected but the run status is
// not "pass".
function getFailureSummaryItems(detail) {
  const lines = detail.errorDetail ? [`Error: ${detail.errorDetail}`] : [];
  for (const { status, evaluatorId, message } of detail.evaluatorResults) {
    if (status !== "fail") {
      continue;
    }
    lines.push(`Evaluator ${evaluatorId}: ${message}`);
  }
  const runFailed = detail.run.status !== "pass";
  if (runFailed && lines.length === 0) {
    lines.push("Run did not pass. Inspect evaluator results and trace for the first divergence.");
  }
  return lines;
}
|
|
22095
22177
|
function signed(value) {
|
|
22096
22178
|
return value > 0 ? `+${value}` : `${value}`;
|
|
22097
22179
|
}
|
package/docs/agents.md
CHANGED
|
@@ -2,15 +2,25 @@
|
|
|
2
2
|
|
|
3
3
|
Named agents are configured in `agentlab.config.yaml`.
|
|
4
4
|
|
|
5
|
-
|
|
5
|
+
Agents remain the stable execution unit even when you introduce Tier 1 comparison features. You still run one named agent at a time, but you can now group multiple named agents into a `variant_set` for prompt/model/config comparisons.
|
|
6
|
+
|
|
7
|
+
This repo supports four provider modes:
|
|
6
8
|
|
|
7
9
|
- `mock`
|
|
8
10
|
- `openai`
|
|
9
11
|
- `external_process`
|
|
12
|
+
- `http`
|
|
13
|
+
|
|
14
|
+
Choose the simplest provider that answers the engineering question you actually have:
|
|
15
|
+
|
|
16
|
+
- `mock` for deterministic harness verification
|
|
17
|
+
- `openai` for real model behavior on deterministic tools
|
|
18
|
+
- `external_process` for local agents where the runner should still own the tool loop
|
|
19
|
+
- `http` for real running services that own their own memory and internal orchestration
|
|
10
20
|
|
|
11
21
|
## Named Agent Config
|
|
12
22
|
|
|
13
|
-
Example:
|
|
23
|
+
Example covering all providers:
|
|
14
24
|
|
|
15
25
|
```yaml
|
|
16
26
|
agents:
|
|
@@ -29,14 +39,31 @@ agents:
|
|
|
29
39
|
args:
|
|
30
40
|
- custom_agents/node_agent.mjs
|
|
31
41
|
label: custom-node-agent
|
|
42
|
+
|
|
43
|
+
- name: my-production-agent
|
|
44
|
+
provider: http
|
|
45
|
+
url: http://localhost:3000/api/chat
|
|
46
|
+
label: my-production-agent
|
|
32
47
|
```
|
|
33
48
|
|
|
34
49
|
Run a named agent with:
|
|
35
50
|
|
|
36
51
|
```bash
|
|
37
52
|
agentlab run support.refund-correct-order --agent mock-default
|
|
53
|
+
agentlab run internal-teams.memory-followup-recall --agent my-production-agent
|
|
54
|
+
```
|
|
55
|
+
|
|
56
|
+
Use a named variant set when you want to run one scenario or one suite against multiple agent variants and compare the results later:
|
|
57
|
+
|
|
58
|
+
```bash
|
|
59
|
+
agentlab run support.refund-correct-order --variant-set refund-agent-model-comparison
|
|
60
|
+
agentlab run --suite-def pre_merge --variant-set refund-agent-model-comparison
|
|
38
61
|
```
|
|
39
62
|
|
|
63
|
+
Each run records the underlying agent plus richer identity metadata such as `variant_label`, `prompt_version`, `model_version`, `tool_schema_version`, and `config_label`. Those fields appear in CLI summaries, `show`, stored run history, and the UI.
|
|
64
|
+
|
|
65
|
+
---
|
|
66
|
+
|
|
40
67
|
## Mock
|
|
41
68
|
|
|
42
69
|
The built-in mock adapter is the best path for deterministic smoke tests and baseline examples.
|
|
@@ -47,6 +74,8 @@ Use it when you want:
|
|
|
47
74
|
- stable docs examples
|
|
48
75
|
- predictable benchmark behavior
|
|
49
76
|
|
|
77
|
+
---
|
|
78
|
+
|
|
50
79
|
## OpenAI
|
|
51
80
|
|
|
52
81
|
The OpenAI path uses your API key and a configured model.
|
|
@@ -65,6 +94,8 @@ agentlab run support.refund-correct-order --agent openai-cheap
|
|
|
65
94
|
|
|
66
95
|
The OpenAI path is useful, but less deterministic than the mock path.
|
|
67
96
|
|
|
97
|
+
---
|
|
98
|
+
|
|
68
99
|
## External Process
|
|
69
100
|
|
|
70
101
|
External-process agents communicate with the runner over line-delimited JSON on stdin/stdout.
|
|
@@ -110,14 +141,12 @@ Run one of them with:
|
|
|
110
141
|
agentlab run support.refund-via-config-tool --agent custom-node-agent
|
|
111
142
|
```
|
|
112
143
|
|
|
113
|
-
|
|
144
|
+
### Environment Allowlist
|
|
114
145
|
|
|
115
146
|
External-process agents can optionally define `envAllowlist`.
|
|
116
147
|
|
|
117
148
|
Use it when a child process needs specific environment variables passed through.
|
|
118
149
|
|
|
119
|
-
Example shape:
|
|
120
|
-
|
|
121
150
|
```yaml
|
|
122
151
|
agents:
|
|
123
152
|
- name: custom-agent
|
|
@@ -131,13 +160,117 @@ agents:
|
|
|
131
160
|
|
|
132
161
|
Only allow through what the child actually needs.
|
|
133
162
|
|
|
163
|
+
---
|
|
164
|
+
|
|
165
|
+
## HTTP
|
|
166
|
+
|
|
167
|
+
The `http` provider is for testing real production agents that run as HTTP services — Express, FastAPI, Next.js API routes, or any service that accepts a POST and returns a JSON response.
|
|
168
|
+
|
|
169
|
+
Unlike the other providers, HTTP agents manage their own conversation history and tool execution internally. agentlab sends the current message and a `conversation_id` each turn, then evaluates the reply.
|
|
170
|
+
|
|
171
|
+
Use HTTP agents with `type: conversation` scenarios. See [scenarios.md](scenarios.md) for the conversation scenario format.
|
|
172
|
+
|
|
173
|
+
This is the default choice when validating memoryful or stateful agents that already run as a service.
|
|
174
|
+
|
|
175
|
+
HTTP agents can be included inside a `variant_set` the same way as other named agents. Runtime-profile fault injection is currently applied only to task/tool-loop runs. Conversation scenarios may still reference a runtime profile for reusable authoring, but agentlab does not currently intercept the tools an HTTP agent runs internally.
|
|
176
|
+
|
|
177
|
+
### Minimal Config
|
|
178
|
+
|
|
179
|
+
```yaml
|
|
180
|
+
agents:
|
|
181
|
+
- name: my-agent
|
|
182
|
+
provider: http
|
|
183
|
+
url: http://localhost:3000/api/chat
|
|
184
|
+
```
|
|
185
|
+
|
|
186
|
+
Default contract: agentlab posts `{ message, conversation_id }` and expects `{ message }` in the response.
|
|
187
|
+
|
|
188
|
+
### Custom Field Names
|
|
189
|
+
|
|
190
|
+
If your agent uses different field names:
|
|
191
|
+
|
|
192
|
+
```yaml
|
|
193
|
+
agents:
|
|
194
|
+
- name: my-agent-custom
|
|
195
|
+
provider: http
|
|
196
|
+
url: http://localhost:3000/api/chat
|
|
197
|
+
request_template:
|
|
198
|
+
query: "{{message}}"
|
|
199
|
+
session_id: "{{conversation_id}}"
|
|
200
|
+
response_field: reply
|
|
201
|
+
```
|
|
202
|
+
|
|
203
|
+
`request_template` values support three placeholders:
|
|
204
|
+
|
|
205
|
+
- `{{message}}` — the current step message
|
|
206
|
+
- `{{conversation_id}}` — the UUID generated for this run (consistent across all steps)
|
|
207
|
+
- `{{env.VAR_NAME}}` — reads from the environment at runtime
|
|
208
|
+
|
|
209
|
+
Whitespace inside `{{ }}` is ignored: `{{ message }}` and `{{message}}` are identical.
|
|
210
|
+
|
|
211
|
+
### Auth and Timeout
|
|
212
|
+
|
|
213
|
+
```yaml
|
|
214
|
+
agents:
|
|
215
|
+
- name: my-agent-auth
|
|
216
|
+
provider: http
|
|
217
|
+
url: http://localhost:3000/api/chat
|
|
218
|
+
headers:
|
|
219
|
+
Authorization: "Bearer {{env.MY_AGENT_TOKEN}}"
|
|
220
|
+
timeout_ms: 10000
|
|
221
|
+
```
|
|
222
|
+
|
|
223
|
+
`timeout_ms` defaults to 30000 (30 seconds) if not set.
|
|
224
|
+
|
|
225
|
+
Header values also support `{{message}}`, `{{conversation_id}}`, and `{{env.VAR_NAME}}` placeholders.
|
|
226
|
+
|
|
227
|
+
### Full Config Reference
|
|
228
|
+
|
|
229
|
+
| Field | Required | Default | Description |
|
|
230
|
+
|-------|----------|---------|-------------|
|
|
231
|
+
| `url` | yes | — | HTTP endpoint to POST to |
|
|
232
|
+
| `request_template` | no | `{ message, conversation_id }` | Custom request body shape |
|
|
233
|
+
| `response_field` | no | `message` | Field to read the reply from |
|
|
234
|
+
| `headers` | no | `{}` | Additional HTTP headers |
|
|
235
|
+
| `timeout_ms` | no | `30000` | Per-request timeout in milliseconds |
|
|
236
|
+
| `label` | no | agent name | Display label in CLI output and run history |
|
|
237
|
+
|
|
238
|
+
### How It Works
|
|
239
|
+
|
|
240
|
+
For each run of a conversation scenario:
|
|
241
|
+
|
|
242
|
+
1. agentlab generates a UUID `conversation_id` once at the start of the run
|
|
243
|
+
2. for every step, it POSTs the current message and `conversation_id` to your agent
|
|
244
|
+
3. your agent is responsible for maintaining conversation history using that id
|
|
245
|
+
4. agentlab reads the reply, measures latency, and runs per-step evaluators
|
|
246
|
+
5. if a hard-gate evaluator fails, the run stops immediately
|
|
247
|
+
|
|
248
|
+
### Error Handling
|
|
249
|
+
|
|
250
|
+
HTTP provider runs can end with these termination reasons:
|
|
251
|
+
|
|
252
|
+
| Reason | Cause |
|
|
253
|
+
|--------|-------|
|
|
254
|
+
| `http_connection_failed` | Could not connect to the URL |
|
|
255
|
+
| `http_error` | Agent returned HTTP 4xx or 5xx |
|
|
256
|
+
| `timeout_exceeded` | Request exceeded `timeout_ms` |
|
|
257
|
+
| `invalid_response_format` | Response is not valid JSON, or the expected field is missing |
|
|
258
|
+
| `evaluator_failed` | A per-step hard-gate evaluator failed |
|
|
259
|
+
|
|
260
|
+
Infrastructure errors (`http_connection_failed`, `http_error`, `timeout_exceeded`, `invalid_response_format`) always produce `status: error` and `score: 0`.
|
|
261
|
+
|
|
262
|
+
---
|
|
263
|
+
|
|
134
264
|
## Best Practices
|
|
135
265
|
|
|
136
|
-
- use named agents instead of ad hoc
|
|
266
|
+
- use named agents instead of ad hoc provider flags
|
|
137
267
|
- keep labels stable so compare output stays readable
|
|
138
268
|
- prefer the mock path for smoke tests and docs
|
|
139
|
-
- use external-process agents when you want to wrap a local Node or Python agent
|
|
140
|
-
-
|
|
269
|
+
- use external-process agents when you want to wrap a local Node or Python agent
|
|
270
|
+
- use http agents when your agent is already running as a service
|
|
271
|
+
- keep the runner authoritative for tools and termination (external_process and mock)
|
|
272
|
+
- keep your agent authoritative for tools and history (http)
|
|
273
|
+
- choose the simplest provider that answers the engineering question you actually have
|
|
141
274
|
|
|
142
275
|
## Common Errors
|
|
143
276
|
|
|
@@ -148,5 +281,7 @@ Typical failures:
|
|
|
148
281
|
- missing external-process `command`
|
|
149
282
|
- invalid `args` or `envAllowlist`
|
|
150
283
|
- child process returning invalid JSON
|
|
284
|
+
- http agent service not running or not reachable at the configured `url` when the run starts
|
|
285
|
+
- http agent returning a field name that doesn't match `response_field`
|
|
151
286
|
|
|
152
287
|
See [troubleshooting.md](troubleshooting.md) for fixes.
|