agent-regression-lab 0.2.0 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/storage.js CHANGED
@@ -10,6 +10,10 @@ export class Storage {
10
10
  ensureParentDir(DB_PATH);
11
11
  this.db = new DatabaseSync(DB_PATH);
12
12
  this.db.exec(`
13
+ PRAGMA journal_mode = WAL;
14
+ PRAGMA busy_timeout = 5000;
15
+ `);
16
+ this.db.exec(`
13
17
  CREATE TABLE IF NOT EXISTS metadata (
14
18
  key TEXT PRIMARY KEY,
15
19
  value TEXT NOT NULL
@@ -35,6 +39,15 @@ export class Storage {
35
39
  provider TEXT,
36
40
  command TEXT,
37
41
  args_json TEXT,
42
+ variant_set_name TEXT,
43
+ variant_label TEXT,
44
+ prompt_version TEXT,
45
+ model_version TEXT,
46
+ tool_schema_version TEXT,
47
+ config_label TEXT,
48
+ config_hash TEXT,
49
+ runtime_profile_name TEXT,
50
+ suite_definition_name TEXT,
38
51
  config_json TEXT NOT NULL,
39
52
  created_at TEXT NOT NULL
40
53
  );
@@ -45,6 +58,15 @@ export class Storage {
45
58
  scenario_file_hash TEXT NOT NULL,
46
59
  agent_version_id TEXT NOT NULL,
47
60
  suite_batch_id TEXT,
61
+ variant_set_name TEXT,
62
+ variant_label TEXT,
63
+ prompt_version TEXT,
64
+ model_version TEXT,
65
+ tool_schema_version TEXT,
66
+ config_label TEXT,
67
+ config_hash TEXT,
68
+ runtime_profile_name TEXT,
69
+ suite_definition_name TEXT,
48
70
  status TEXT NOT NULL,
49
71
  termination_reason TEXT NOT NULL,
50
72
  final_output TEXT NOT NULL,
@@ -120,25 +142,41 @@ export class Storage {
120
142
  upsertAgentVersion(agentVersion) {
    // Insert-or-update an agent_versions row keyed by id (SQLite UPSERT via
    // ON CONFLICT). All mutable columns are refreshed from the incoming record;
    // created_at is intentionally absent from the conflict branch so the
    // original creation timestamp survives re-registration of the same id.
    const now = new Date().toISOString();
    this.db
      .prepare(`INSERT INTO agent_versions (
        id, label, model_id, provider, command, args_json,
        variant_set_name, variant_label, prompt_version, model_version, tool_schema_version,
        config_label, config_hash, runtime_profile_name, suite_definition_name,
        config_json, created_at
      )
      VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
      ON CONFLICT(id) DO UPDATE SET
        label = excluded.label,
        model_id = excluded.model_id,
        provider = excluded.provider,
        command = excluded.command,
        args_json = excluded.args_json,
        variant_set_name = excluded.variant_set_name,
        variant_label = excluded.variant_label,
        prompt_version = excluded.prompt_version,
        model_version = excluded.model_version,
        tool_schema_version = excluded.tool_schema_version,
        config_label = excluded.config_label,
        config_hash = excluded.config_hash,
        runtime_profile_name = excluded.runtime_profile_name,
        suite_definition_name = excluded.suite_definition_name,
        config_json = excluded.config_json`)
      // Optional identity fields are stored as NULL when absent; args and
      // config are serialized to JSON text (args defaults to an empty list).
      .run(agentVersion.id, agentVersion.label, agentVersion.modelId ?? null, agentVersion.provider ?? null, agentVersion.command ?? null, JSON.stringify(agentVersion.args ?? []), agentVersion.variantSetName ?? null, agentVersion.variantLabel ?? null, agentVersion.promptVersion ?? null, agentVersion.modelVersion ?? null, agentVersion.toolSchemaVersion ?? null, agentVersion.configLabel ?? null, agentVersion.configHash ?? null, agentVersion.runtimeProfileName ?? null, agentVersion.suiteDefinitionName ?? null, JSON.stringify(agentVersion.config), now);
  }
134
170
  saveRun(bundle) {
135
171
  const run = bundle.run;
136
172
  this.db
137
173
  .prepare(`INSERT INTO runs (
138
174
  id, scenario_id, scenario_file_hash, agent_version_id, status, termination_reason, final_output,
139
- suite_batch_id, total_steps, total_tool_calls, duration_ms, total_tokens, total_cost_usd, score, started_at, finished_at
140
- ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`)
141
- .run(run.id, run.scenarioId, run.scenarioFileHash, run.agentVersionId, run.status, run.terminationReason, run.finalOutput, run.suiteBatchId ?? null, run.totalSteps, run.totalToolCalls, run.durationMs, run.totalTokens ?? null, run.totalCostUsd ?? null, run.score, run.startedAt, run.finishedAt);
175
+ suite_batch_id, variant_set_name, variant_label, prompt_version, model_version, tool_schema_version,
176
+ config_label, config_hash, runtime_profile_name, suite_definition_name,
177
+ total_steps, total_tool_calls, duration_ms, total_tokens, total_cost_usd, score, started_at, finished_at
178
+ ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`)
179
+ .run(run.id, run.scenarioId, run.scenarioFileHash, run.agentVersionId, run.status, run.terminationReason, run.finalOutput, run.suiteBatchId ?? null, run.variantSetName ?? null, run.variantLabel ?? null, run.promptVersion ?? null, run.modelVersion ?? null, run.toolSchemaVersion ?? null, run.configLabel ?? null, run.configHash ?? null, run.runtimeProfileName ?? null, run.suiteDefinitionName ?? null, run.totalSteps, run.totalToolCalls, run.durationMs, run.totalTokens ?? null, run.totalCostUsd ?? null, run.score, run.startedAt, run.finishedAt);
142
180
  const insertStep = this.db.prepare(`INSERT INTO run_steps (id, run_id, step_index, timestamp, source, type, payload_json)
143
181
  VALUES (?, ?, ?, ?, ?, ?, ?)`);
144
182
  const insertTool = this.db.prepare(`INSERT INTO tool_calls (id, run_id, step_index, tool_name, input_json, output_json, status, duration_ms, error_message)
@@ -183,6 +221,7 @@ export class Storage {
183
221
  return this.db
184
222
  .prepare(`SELECT r.id, r.scenario_id as scenarioId, s.suite, r.agent_version_id as agentVersionId,
185
223
  r.suite_batch_id as suiteBatchId,
224
+ r.variant_set_name as variantSetName, r.variant_label as variantLabel,
186
225
  av.label as agentLabel, av.provider, av.model_id as modelId,
187
226
  r.status, r.score, r.duration_ms as durationMs, r.total_steps as totalSteps,
188
227
  r.started_at as startedAt
@@ -244,6 +283,11 @@ export class Storage {
244
283
  }));
245
284
  const agentVersion = this.db
246
285
  .prepare(`SELECT id, label, model_id as modelId, provider, command, args_json, config_json
286
+ , variant_set_name as variantSetName, variant_label as variantLabel,
287
+ prompt_version as promptVersion, model_version as modelVersion,
288
+ tool_schema_version as toolSchemaVersion, config_label as configLabel,
289
+ config_hash as configHash, runtime_profile_name as runtimeProfileName,
290
+ suite_definition_name as suiteDefinitionName
247
291
  FROM agent_versions WHERE id = ?`)
248
292
  .get(run.agentVersionId);
249
293
  return {
@@ -259,6 +303,15 @@ export class Storage {
259
303
  provider: agentVersion.provider ?? undefined,
260
304
  command: agentVersion.command ?? undefined,
261
305
  args: agentVersion.args_json ? JSON.parse(agentVersion.args_json) : undefined,
306
+ variantSetName: agentVersion.variantSetName ?? undefined,
307
+ variantLabel: agentVersion.variantLabel ?? undefined,
308
+ promptVersion: agentVersion.promptVersion ?? undefined,
309
+ modelVersion: agentVersion.modelVersion ?? undefined,
310
+ toolSchemaVersion: agentVersion.toolSchemaVersion ?? undefined,
311
+ configLabel: agentVersion.configLabel ?? undefined,
312
+ configHash: agentVersion.configHash ?? undefined,
313
+ runtimeProfileName: agentVersion.runtimeProfileName ?? undefined,
314
+ suiteDefinitionName: agentVersion.suiteDefinitionName ?? undefined,
262
315
  config: JSON.parse(agentVersion.config_json),
263
316
  }
264
317
  : undefined,
@@ -348,7 +401,10 @@ export class Storage {
348
401
  getRunRecord(runId) {
349
402
  return (this.db
350
403
  .prepare(`SELECT id, scenario_id as scenarioId, scenario_file_hash as scenarioFileHash, agent_version_id as agentVersionId,
351
- suite_batch_id as suiteBatchId,
404
+ suite_batch_id as suiteBatchId, variant_set_name as variantSetName, variant_label as variantLabel,
405
+ prompt_version as promptVersion, model_version as modelVersion, tool_schema_version as toolSchemaVersion,
406
+ config_label as configLabel, config_hash as configHash, runtime_profile_name as runtimeProfileName,
407
+ suite_definition_name as suiteDefinitionName,
352
408
  status, termination_reason as terminationReason, final_output as finalOutput, total_steps as totalSteps,
353
409
  total_tool_calls as totalToolCalls, duration_ms as durationMs, total_tokens as totalTokens,
354
410
  total_cost_usd as totalCostUsd, score, started_at as startedAt, finished_at as finishedAt
@@ -388,6 +444,33 @@ export class Storage {
388
444
  if (!names.has("args_json")) {
389
445
  this.db.exec(`ALTER TABLE agent_versions ADD COLUMN args_json TEXT`);
390
446
  }
447
+ if (!names.has("variant_set_name")) {
448
+ this.db.exec(`ALTER TABLE agent_versions ADD COLUMN variant_set_name TEXT`);
449
+ }
450
+ if (!names.has("variant_label")) {
451
+ this.db.exec(`ALTER TABLE agent_versions ADD COLUMN variant_label TEXT`);
452
+ }
453
+ if (!names.has("prompt_version")) {
454
+ this.db.exec(`ALTER TABLE agent_versions ADD COLUMN prompt_version TEXT`);
455
+ }
456
+ if (!names.has("model_version")) {
457
+ this.db.exec(`ALTER TABLE agent_versions ADD COLUMN model_version TEXT`);
458
+ }
459
+ if (!names.has("tool_schema_version")) {
460
+ this.db.exec(`ALTER TABLE agent_versions ADD COLUMN tool_schema_version TEXT`);
461
+ }
462
+ if (!names.has("config_label")) {
463
+ this.db.exec(`ALTER TABLE agent_versions ADD COLUMN config_label TEXT`);
464
+ }
465
+ if (!names.has("config_hash")) {
466
+ this.db.exec(`ALTER TABLE agent_versions ADD COLUMN config_hash TEXT`);
467
+ }
468
+ if (!names.has("runtime_profile_name")) {
469
+ this.db.exec(`ALTER TABLE agent_versions ADD COLUMN runtime_profile_name TEXT`);
470
+ }
471
+ if (!names.has("suite_definition_name")) {
472
+ this.db.exec(`ALTER TABLE agent_versions ADD COLUMN suite_definition_name TEXT`);
473
+ }
391
474
  }
392
475
  ensureRunColumns() {
393
476
  const columns = this.db.prepare(`PRAGMA table_info(runs)`).all();
@@ -395,6 +478,33 @@ export class Storage {
395
478
  if (!names.has("suite_batch_id")) {
396
479
  this.db.exec(`ALTER TABLE runs ADD COLUMN suite_batch_id TEXT`);
397
480
  }
481
+ if (!names.has("variant_set_name")) {
482
+ this.db.exec(`ALTER TABLE runs ADD COLUMN variant_set_name TEXT`);
483
+ }
484
+ if (!names.has("variant_label")) {
485
+ this.db.exec(`ALTER TABLE runs ADD COLUMN variant_label TEXT`);
486
+ }
487
+ if (!names.has("prompt_version")) {
488
+ this.db.exec(`ALTER TABLE runs ADD COLUMN prompt_version TEXT`);
489
+ }
490
+ if (!names.has("model_version")) {
491
+ this.db.exec(`ALTER TABLE runs ADD COLUMN model_version TEXT`);
492
+ }
493
+ if (!names.has("tool_schema_version")) {
494
+ this.db.exec(`ALTER TABLE runs ADD COLUMN tool_schema_version TEXT`);
495
+ }
496
+ if (!names.has("config_label")) {
497
+ this.db.exec(`ALTER TABLE runs ADD COLUMN config_label TEXT`);
498
+ }
499
+ if (!names.has("config_hash")) {
500
+ this.db.exec(`ALTER TABLE runs ADD COLUMN config_hash TEXT`);
501
+ }
502
+ if (!names.has("runtime_profile_name")) {
503
+ this.db.exec(`ALTER TABLE runs ADD COLUMN runtime_profile_name TEXT`);
504
+ }
505
+ if (!names.has("suite_definition_name")) {
506
+ this.db.exec(`ALTER TABLE runs ADD COLUMN suite_definition_name TEXT`);
507
+ }
398
508
  }
399
509
  getRunsBySuiteBatchId(suiteBatchId) {
400
510
  const runIds = this.db
package/dist/tools.js CHANGED
@@ -2,6 +2,38 @@ import { readFileSync } from "node:fs";
2
2
  import { pathToFileURL } from "node:url";
3
3
  import { resolve } from "node:path";
4
4
  import { loadAgentLabConfig } from "./config.js";
5
/**
 * Apply a runtime profile's tool-fault configuration to a tool map.
 *
 * Returns the original map untouched when the profile declares no faults.
 * Otherwise returns a shallow copy in which every faulted tool that exists in
 * the map is replaced by an async handler that records a
 * `tool_fault_injected` trace event (without counting a step) and then
 * simulates the configured failure mode instead of invoking the real tool:
 * `timeout` waits then throws with code `timeout_exceeded`, `error` throws,
 * `malformed_output` returns the literal string "MALFORMED_OUTPUT", and any
 * other mode returns `partial_output` (or `{}` when unset).
 */
export function applyRuntimeProfileToTools(tools, profile, trace) {
  const faults = profile?.tool_faults;
  if (!faults?.length) {
    return tools;
  }
  const faulted = { ...tools };
  for (const fault of faults) {
    // Faults referencing tools that are not in the map are silently ignored.
    if (!faulted[fault.tool]) {
      continue;
    }
    faulted[fault.tool] = async (input, context) => {
      trace.record("system", "tool_fault_injected", {
        tool: fault.tool,
        mode: fault.mode,
      }, { countStep: false });
      switch (fault.mode) {
        case "timeout": {
          // Simulate a hung tool, then surface a coded timeout error.
          await waitUnref(fault.timeout_ms ?? 5000);
          const timeoutError = new Error(`Injected timeout for ${fault.tool}`);
          timeoutError.code = "timeout_exceeded";
          throw timeoutError;
        }
        case "error":
          throw new Error(fault.error_message ?? `Injected failure for ${fault.tool}`);
        case "malformed_output":
          return "MALFORMED_OUTPUT";
        default:
          // Partial-output style faults return the configured payload.
          return fault.partial_output ?? {};
      }
    };
  }
  return faulted;
}
5
37
  function loadFixture(path) {
6
38
  const raw = readFileSync(resolve(path), "utf8");
7
39
  return JSON.parse(raw);
@@ -372,3 +404,9 @@ function assertObject(value) {
372
404
  throw new Error("Tool input must be an object.");
373
405
  }
374
406
  }
407
/**
 * Resolve after `timeoutMs` milliseconds without keeping the Node.js event
 * loop alive: the timer is unref'd (when the runtime supports it) so a
 * pending injected-timeout wait never blocks process exit.
 */
function waitUnref(timeoutMs) {
  return new Promise((done) => {
    setTimeout(done, timeoutMs).unref?.();
  });
}
package/dist/trace.js CHANGED
@@ -8,8 +8,10 @@ export class TraceRecorder {
8
8
  this.runId = runId;
9
9
  this.scenarioId = scenarioId;
10
10
  }
11
- record(source, type, payload) {
12
- this.stepIndex += 1;
11
+ record(source, type, payload, options) {
12
+ if (options?.countStep !== false) {
13
+ this.stepIndex += 1;
14
+ }
13
15
  this.events.push({
14
16
  eventId: createEventId(),
15
17
  runId: this.runId,
package/dist/ui/App.js CHANGED
@@ -1,4 +1,4 @@
1
- import { jsx as _jsx, jsxs as _jsxs } from "react/jsx-runtime";
1
+ import { jsx as _jsx, jsxs as _jsxs, Fragment as _Fragment } from "react/jsx-runtime";
2
2
  import { useEffect, useState } from "react";
3
3
  export function App() {
4
4
  const route = getRoute();
@@ -37,7 +37,18 @@ function RunDetailPage(props) {
37
37
  if (!detail) {
38
38
  return _jsx(EmptyState, { title: "Loading run", description: "Fetching run detail from the local lab." });
39
39
  }
40
- return (_jsxs("section", { children: [_jsxs("div", { className: "hero", children: [_jsx("h1", { children: detail.run.id }), _jsx("p", { children: detail.run.scenarioId })] }), _jsxs("div", { className: "stats", children: [_jsx(Stat, { label: "Status", value: _jsx("span", { className: `pill ${detail.run.status}`, children: detail.run.status }) }), _jsx(Stat, { label: "Score", value: detail.run.score }), _jsx(Stat, { label: "Runtime", value: `${detail.run.durationMs}ms` }), _jsx(Stat, { label: "Steps", value: detail.run.totalSteps })] }), _jsxs("div", { className: "panel-grid", children: [_jsxs("section", { className: "panel", children: [_jsx("h2", { children: "Summary" }), _jsxs("p", { children: [_jsx("strong", { children: "Provider:" }), " ", detail.agentVersion?.provider ?? "-"] }), _jsxs("p", { children: [_jsx("strong", { children: "Model:" }), " ", detail.agentVersion?.modelId ?? "-"] }), detail.agentVersion?.command ? (_jsxs("p", { children: [_jsx("strong", { children: "Command:" }), " ", detail.agentVersion.command, " ", (detail.agentVersion.args ?? []).join(" ")] })) : null, _jsxs("p", { children: [_jsx("strong", { children: "Termination:" }), " ", detail.run.terminationReason] }), detail.errorDetail ? _jsxs("p", { children: [_jsx("strong", { children: "Error:" }), " ", detail.errorDetail] }) : null, _jsx("p", { children: _jsx("strong", { children: "Final output:" }) }), _jsx("pre", { children: detail.run.finalOutput || "(none)" })] }), _jsxs("section", { className: "panel", children: [_jsx("h2", { children: "Evaluators" }), _jsx("ul", { className: "stack", children: detail.evaluatorResults.map((result) => (_jsxs("li", { children: [_jsx("span", { className: `pill ${result.status}`, children: result.status }), " ", result.evaluatorId, _jsx("div", { className: "muted", children: result.message })] }, result.evaluatorId))) })] })] }), _jsxs("section", { className: "panel", children: [_jsx("h2", { children: "Tool Calls" }), detail.toolCalls.length === 0 ? 
_jsx("p", { className: "muted", children: "No tool calls recorded." }) : null, _jsx("ul", { className: "stack", children: detail.toolCalls.map((call) => (_jsxs("li", { children: [_jsx("strong", { children: call.toolName }), " ", _jsx("span", { className: `pill ${call.status}`, children: call.status }), _jsx("pre", { children: JSON.stringify({ input: call.input, output: call.output }, null, 2) })] }, call.id))) })] }), _jsxs("section", { className: "panel", children: [_jsx("h2", { children: "Trace" }), _jsx("ol", { className: "timeline", children: detail.traceEvents.map((event) => (_jsxs("li", { children: [_jsxs("div", { children: [_jsxs("strong", { children: [event.stepIndex, ". ", event.type] }), " ", _jsx("span", { className: "muted", children: event.source })] }), _jsx("pre", { children: JSON.stringify(event.payload, null, 2) })] }, event.eventId))) })] })] }));
40
+ return (_jsxs("section", { children: [_jsxs("div", { className: "hero", children: [_jsx("h1", { children: detail.run.id }), _jsx("p", { children: detail.run.scenarioId })] }), _jsx(FailureSummaryPanel, { detail: detail }), _jsxs("div", { className: "stats", children: [_jsx(Stat, { label: "Status", value: _jsx("span", { className: `pill ${detail.run.status}`, children: detail.run.status }) }), _jsx(Stat, { label: "Score", value: detail.run.score }), _jsx(Stat, { label: "Runtime", value: `${detail.run.durationMs}ms` }), _jsx(Stat, { label: "Steps", value: detail.run.totalSteps })] }), _jsxs("div", { className: "panel-grid", children: [_jsxs("section", { className: "panel", children: [_jsx("h2", { children: "Summary" }), _jsxs("p", { children: [_jsx("strong", { children: "Provider:" }), " ", detail.agentVersion?.provider ?? "-"] }), _jsxs("p", { children: [_jsx("strong", { children: "Model:" }), " ", detail.agentVersion?.modelId ?? "-"] }), _jsx(RunIdentitySummary, { detail: detail }), detail.agentVersion?.command ? (_jsxs("p", { children: [_jsx("strong", { children: "Command:" }), " ", detail.agentVersion.command, " ", (detail.agentVersion.args ?? []).join(" ")] })) : null, _jsxs("p", { children: [_jsx("strong", { children: "Termination:" }), " ", detail.run.terminationReason] }), detail.errorDetail ? 
_jsxs("p", { children: [_jsx("strong", { children: "Error:" }), " ", detail.errorDetail] }) : null, _jsx("p", { children: _jsx("strong", { children: "Final output:" }) }), _jsx("pre", { children: detail.run.finalOutput || "(none)" })] }), _jsxs("section", { className: "panel", children: [_jsx("h2", { children: "Evaluators" }), _jsx("ul", { className: "stack", children: detail.evaluatorResults.map((result) => (_jsxs("li", { children: [_jsx("span", { className: `pill ${result.status}`, children: result.status }), " ", result.evaluatorId, _jsx("div", { className: "muted", children: result.message })] }, result.evaluatorId))) })] })] }), _jsxs("section", { className: "panel", children: [_jsx("h2", { children: "Tool Calls" }), detail.toolCalls.length === 0 ? _jsx("p", { className: "muted", children: "No tool calls recorded." }) : null, _jsx("ul", { className: "stack", children: detail.toolCalls.map((call) => (_jsxs("li", { children: [_jsx("strong", { children: call.toolName }), " ", _jsx("span", { className: `pill ${call.status}`, children: call.status }), _jsx("pre", { children: JSON.stringify({ input: call.input, output: call.output }, null, 2) })] }, call.id))) })] }), _jsxs("section", { className: "panel", children: [_jsx("h2", { children: "Trace" }), _jsx("ol", { className: "timeline", children: detail.traceEvents.map((event) => (_jsxs("li", { children: [_jsxs("div", { children: [_jsxs("strong", { children: [event.stepIndex, ". ", event.type] }), " ", _jsx("span", { className: "muted", children: event.source })] }), _jsx("pre", { children: JSON.stringify(event.payload, null, 2) })] }, event.eventId))) })] })] }));
41
+ }
42
/**
 * "Failures First" panel for the run detail page.
 *
 * Renders nothing when getFailureSummaryItems yields no items; otherwise
 * shows the run status pill, the termination reason, and one list entry per
 * failure item (each item string doubles as its list key).
 */
export function FailureSummaryPanel(props) {
    const run = props.detail.run;
    const failureItems = getFailureSummaryItems(props.detail);
    if (failureItems.length === 0) {
        return null;
    }
    const statusRow = _jsxs("p", { children: [_jsx("strong", { children: "Status:" }), " ", _jsx("span", { className: `pill ${run.status}`, children: run.status })] });
    const terminationRow = _jsxs("p", { children: [_jsx("strong", { children: "Termination:" }), " ", run.terminationReason] });
    const itemList = _jsx("ul", { className: "stack", children: failureItems.map((item) => (_jsx("li", { children: item }, item))) });
    return (_jsxs("section", { className: "panel", children: [_jsx("h2", { children: "Failures First" }), statusRow, terminationRow, itemList] }));
}
49
/**
 * Read-only identity metadata rows for a run: variant set/label, prompt,
 * model and tool-schema versions, config label, runtime profile, and suite
 * definition. Missing values render as "-".
 */
export function RunIdentitySummary(props) {
    const run = props.detail.run;
    // Each row is a <p><strong>label</strong> value</p> pair.
    const row = (label, value) => _jsxs("p", { children: [_jsx("strong", { children: label }), " ", value ?? "-"] });
    return (_jsxs(_Fragment, { children: [
        row("Variant set:", run.variantSetName),
        row("Variant:", run.variantLabel),
        row("Prompt version:", run.promptVersion),
        row("Model version:", run.modelVersion),
        row("Tool schema version:", run.toolSchemaVersion),
        row("Config label:", run.configLabel),
        row("Runtime profile:", run.runtimeProfileName),
        row("Suite definition:", run.suiteDefinitionName),
    ] }));
}
42
53
  function ComparePage(props) {
43
54
  const [data, setData] = useState(null);
@@ -95,6 +106,21 @@ function Stat(props) {
95
106
  function EmptyState(props) {
96
107
  return (_jsxs("section", { className: "empty", children: [_jsx("h1", { children: props.title }), _jsx("p", { children: props.description })] }));
97
108
  }
109
/**
 * Build the "Failures First" item strings for a run detail payload.
 *
 * Order: the run-level error detail (if any) first, then one entry per
 * failing evaluator. When nothing concrete failed but the run status is not
 * "pass", a single generic pointer to the trace is returned instead.
 * A passing run with no errors yields an empty array.
 */
export function getFailureSummaryItems(detail) {
    const items = [];
    if (detail.errorDetail) {
        items.push(`Error: ${detail.errorDetail}`);
    }
    const failing = detail.evaluatorResults.filter((result) => result.status === "fail");
    items.push(...failing.map((result) => `Evaluator ${result.evaluatorId}: ${result.message}`));
    if (items.length === 0 && detail.run.status !== "pass") {
        items.push("Run did not pass. Inspect evaluator results and trace for the first divergence.");
    }
    return items;
}
98
124
  function signed(value) {
99
125
  return value > 0 ? `+${value}` : `${value}`;
100
126
  }
@@ -21816,6 +21816,7 @@ function RunDetailPage(props) {
21816
21816
  /* @__PURE__ */ (0, import_jsx_runtime.jsx)("h1", { children: detail.run.id }),
21817
21817
  /* @__PURE__ */ (0, import_jsx_runtime.jsx)("p", { children: detail.run.scenarioId })
21818
21818
  ] }),
21819
+ /* @__PURE__ */ (0, import_jsx_runtime.jsx)(FailureSummaryPanel, { detail }),
21819
21820
  /* @__PURE__ */ (0, import_jsx_runtime.jsxs)("div", { className: "stats", children: [
21820
21821
  /* @__PURE__ */ (0, import_jsx_runtime.jsx)(Stat, { label: "Status", value: /* @__PURE__ */ (0, import_jsx_runtime.jsx)("span", { className: `pill ${detail.run.status}`, children: detail.run.status }) }),
21821
21822
  /* @__PURE__ */ (0, import_jsx_runtime.jsx)(Stat, { label: "Score", value: detail.run.score }),
@@ -21835,6 +21836,7 @@ function RunDetailPage(props) {
21835
21836
  " ",
21836
21837
  detail.agentVersion?.modelId ?? "-"
21837
21838
  ] }),
21839
+ /* @__PURE__ */ (0, import_jsx_runtime.jsx)(RunIdentitySummary, { detail }),
21838
21840
  detail.agentVersion?.command ? /* @__PURE__ */ (0, import_jsx_runtime.jsxs)("p", { children: [
21839
21841
  /* @__PURE__ */ (0, import_jsx_runtime.jsx)("strong", { children: "Command:" }),
21840
21842
  " ",
@@ -21892,6 +21894,71 @@ function RunDetailPage(props) {
21892
21894
  ] })
21893
21895
  ] });
21894
21896
  }
21897
// Bundled copy of FailureSummaryPanel ("Failures First" panel).
// Renders null when getFailureSummaryItems produces no items; otherwise shows
// the run status pill, the termination reason, and one <li> per failure item
// (the item string is reused as the React key).
function FailureSummaryPanel(props) {
  const failureItems = getFailureSummaryItems(props.detail);
  if (failureItems.length === 0) {
    return null;
  }
  return /* @__PURE__ */ (0, import_jsx_runtime.jsxs)("section", { className: "panel", children: [
    /* @__PURE__ */ (0, import_jsx_runtime.jsx)("h2", { children: "Failures First" }),
    /* @__PURE__ */ (0, import_jsx_runtime.jsxs)("p", { children: [
      /* @__PURE__ */ (0, import_jsx_runtime.jsx)("strong", { children: "Status:" }),
      " ",
      /* @__PURE__ */ (0, import_jsx_runtime.jsx)("span", { className: `pill ${props.detail.run.status}`, children: props.detail.run.status })
    ] }),
    /* @__PURE__ */ (0, import_jsx_runtime.jsxs)("p", { children: [
      /* @__PURE__ */ (0, import_jsx_runtime.jsx)("strong", { children: "Termination:" }),
      " ",
      props.detail.run.terminationReason
    ] }),
    /* @__PURE__ */ (0, import_jsx_runtime.jsx)("ul", { className: "stack", children: failureItems.map((item) => /* @__PURE__ */ (0, import_jsx_runtime.jsx)("li", { children: item }, item)) })
  ] });
}
21917
// Bundled copy of RunIdentitySummary: identity metadata rows for a run
// (variant set/label, prompt/model/tool-schema versions, config label,
// runtime profile, suite definition). Missing values render as "-".
function RunIdentitySummary(props) {
  const run = props.detail.run;
  // Each row is a <p><strong>label</strong> value</p> pair.
  const row = (label, value) => /* @__PURE__ */ (0, import_jsx_runtime.jsxs)("p", { children: [
    /* @__PURE__ */ (0, import_jsx_runtime.jsx)("strong", { children: label }),
    " ",
    value ?? "-"
  ] });
  return /* @__PURE__ */ (0, import_jsx_runtime.jsxs)(import_jsx_runtime.Fragment, { children: [
    row("Variant set:", run.variantSetName),
    row("Variant:", run.variantLabel),
    row("Prompt version:", run.promptVersion),
    row("Model version:", run.modelVersion),
    row("Tool schema version:", run.toolSchemaVersion),
    row("Config label:", run.configLabel),
    row("Runtime profile:", run.runtimeProfileName),
    row("Suite definition:", run.suiteDefinitionName)
  ] });
}
21895
21962
  function ComparePage(props) {
21896
21963
  const [data, setData] = (0, import_react.useState)(null);
21897
21964
  (0, import_react.useEffect)(() => {
@@ -22092,6 +22159,21 @@ function EmptyState(props) {
22092
22159
  /* @__PURE__ */ (0, import_jsx_runtime.jsx)("p", { children: props.description })
22093
22160
  ] });
22094
22161
  }
22162
// Bundled copy of getFailureSummaryItems: builds the "Failures First" items.
// Order: run-level error detail first (if any), then one entry per failing
// evaluator; a non-"pass" run with no concrete failures gets a single generic
// pointer to the trace. A passing run with no errors yields [].
function getFailureSummaryItems(detail) {
  const items = [];
  if (detail.errorDetail) {
    items.push(`Error: ${detail.errorDetail}`);
  }
  const failing = detail.evaluatorResults.filter((result) => result.status === "fail");
  items.push(...failing.map((result) => `Evaluator ${result.evaluatorId}: ${result.message}`));
  if (items.length === 0 && detail.run.status !== "pass") {
    items.push("Run did not pass. Inspect evaluator results and trace for the first divergence.");
  }
  return items;
}
22095
22177
  function signed(value) {
22096
22178
  return value > 0 ? `+${value}` : `${value}`;
22097
22179
  }
package/docs/agents.md CHANGED
@@ -2,15 +2,25 @@
2
2
 
3
3
  Named agents are configured in `agentlab.config.yaml`.
4
4
 
5
- This repo currently supports three provider modes:
5
+ Agents remain the stable execution unit even when you introduce Tier 1 comparison features. You still run one named agent at a time, but you can now group multiple named agents into a `variant_set` for prompt/model/config comparisons.
6
+
7
+ This repo supports four provider modes:
6
8
 
7
9
  - `mock`
8
10
  - `openai`
9
11
  - `external_process`
12
+ - `http`
13
+
14
+ Choose the simplest provider that answers the engineering question you actually have:
15
+
16
+ - `mock` for deterministic harness verification
17
+ - `openai` for real model behavior on deterministic tools
18
+ - `external_process` for local agents where the runner should still own the tool loop
19
+ - `http` for real running services that own their own memory and internal orchestration
10
20
 
11
21
  ## Named Agent Config
12
22
 
13
- Example:
23
+ Example covering all providers:
14
24
 
15
25
  ```yaml
16
26
  agents:
@@ -29,14 +39,31 @@ agents:
29
39
  args:
30
40
  - custom_agents/node_agent.mjs
31
41
  label: custom-node-agent
42
+
43
+ - name: my-production-agent
44
+ provider: http
45
+ url: http://localhost:3000/api/chat
46
+ label: my-production-agent
32
47
  ```
33
48
 
34
49
  Run a named agent with:
35
50
 
36
51
  ```bash
37
52
  agentlab run support.refund-correct-order --agent mock-default
53
+ agentlab run internal-teams.memory-followup-recall --agent my-production-agent
54
+ ```
55
+
56
+ Use a named variant set when you want to run one scenario or one suite against multiple agent variants and compare the results later:
57
+
58
+ ```bash
59
+ agentlab run support.refund-correct-order --variant-set refund-agent-model-comparison
60
+ agentlab run --suite-def pre_merge --variant-set refund-agent-model-comparison
38
61
  ```
39
62
 
63
+ Each run records the underlying agent plus richer identity metadata such as `variant_label`, `prompt_version`, `model_version`, `tool_schema_version`, and `config_label`. Those fields appear in CLI summaries, `show`, stored run history, and the UI.
64
+
65
+ ---
66
+
40
67
  ## Mock
41
68
 
42
69
  The built-in mock adapter is the best path for deterministic smoke tests and baseline examples.
@@ -47,6 +74,8 @@ Use it when you want:
47
74
  - stable docs examples
48
75
  - predictable benchmark behavior
49
76
 
77
+ ---
78
+
50
79
  ## OpenAI
51
80
 
52
81
  The OpenAI path uses your API key and a configured model.
@@ -65,6 +94,8 @@ agentlab run support.refund-correct-order --agent openai-cheap
65
94
 
66
95
  The OpenAI path is useful, but less deterministic than the mock path.
67
96
 
97
+ ---
98
+
68
99
  ## External Process
69
100
 
70
101
  External-process agents communicate with the runner over line-delimited JSON on stdin/stdout.
@@ -110,14 +141,12 @@ Run one of them with:
110
141
  agentlab run support.refund-via-config-tool --agent custom-node-agent
111
142
  ```
112
143
 
113
- ## Environment Allowlist
144
+ ### Environment Allowlist
114
145
 
115
146
  External-process agents can optionally define `envAllowlist`.
116
147
 
117
148
  Use it when a child process needs specific environment variables passed through.
118
149
 
119
- Example shape:
120
-
121
150
  ```yaml
122
151
  agents:
123
152
  - name: custom-agent
@@ -131,13 +160,117 @@ agents:
131
160
 
132
161
  Only allow through what the child actually needs.
133
162
 
163
+ ---
164
+
165
+ ## HTTP
166
+
167
+ The `http` provider is for testing real production agents that run as HTTP services — Express, FastAPI, Next.js API routes, or any service that accepts a POST and returns a JSON response.
168
+
169
+ Unlike the other providers, HTTP agents manage their own conversation history and tool execution internally. agentlab sends the current message and a `conversation_id` each turn, then evaluates the reply.
170
+
171
+ Use HTTP agents with `type: conversation` scenarios. See [scenarios.md](scenarios.md) for the conversation scenario format.
172
+
173
+ This is the default choice when validating memoryful or stateful agents that already run as a service.
174
+
175
+ HTTP agents can be included inside a `variant_set` the same way as other named agents. Runtime-profile fault injection is currently applied only to task/tool-loop runs. Conversation scenarios may still reference a runtime profile for reusable authoring, but ARL does not currently intercept tools that an HTTP agent executes internally.
176
+
177
+ ### Minimal Config
178
+
179
+ ```yaml
180
+ agents:
181
+ - name: my-agent
182
+ provider: http
183
+ url: http://localhost:3000/api/chat
184
+ ```
185
+
186
+ Default contract: agentlab posts `{ message, conversation_id }` and expects `{ message }` in the response.
187
+
188
+ ### Custom Field Names
189
+
190
+ If your agent uses different field names:
191
+
192
+ ```yaml
193
+ agents:
194
+ - name: my-agent-custom
195
+ provider: http
196
+ url: http://localhost:3000/api/chat
197
+ request_template:
198
+ query: "{{message}}"
199
+ session_id: "{{conversation_id}}"
200
+ response_field: reply
201
+ ```
202
+
203
+ `request_template` values support three placeholders:
204
+
205
+ - `{{message}}` — the current step message
206
+ - `{{conversation_id}}` — the UUID generated for this run (consistent across all steps)
207
+ - `{{env.VAR_NAME}}` — reads from the environment at runtime
208
+
209
+ Whitespace inside `{{ }}` is ignored: `{{ message }}` and `{{message}}` are identical.
210
+
211
+ ### Auth and Timeout
212
+
213
+ ```yaml
214
+ agents:
215
+ - name: my-agent-auth
216
+ provider: http
217
+ url: http://localhost:3000/api/chat
218
+ headers:
219
+ Authorization: "Bearer {{env.MY_AGENT_TOKEN}}"
220
+ timeout_ms: 10000
221
+ ```
222
+
223
+ `timeout_ms` defaults to 30000 (30 seconds) if not set.
224
+
225
+ Header values also support `{{message}}`, `{{conversation_id}}`, and `{{env.VAR_NAME}}` placeholders.
226
+
227
+ ### Full Config Reference
228
+
229
+ | Field | Required | Default | Description |
230
+ |-------|----------|---------|-------------|
231
+ | `url` | yes | — | HTTP endpoint to POST to |
232
+ | `request_template` | no | `{ message, conversation_id }` | Custom request body shape |
233
+ | `response_field` | no | `message` | Field to read the reply from |
234
+ | `headers` | no | `{}` | Additional HTTP headers |
235
+ | `timeout_ms` | no | `30000` | Per-request timeout in milliseconds |
236
+ | `label` | no | agent name | Display label in CLI output and run history |
237
+
238
+ ### How It Works
239
+
240
+ For each step in a conversation scenario:
241
+
242
+ 1. agentlab generates a UUID `conversation_id` once at the start of the run
243
+ 2. for every step, it POSTs the current message and `conversation_id` to your agent
244
+ 3. your agent is responsible for maintaining conversation history using that id
245
+ 4. agentlab reads the reply, measures latency, and runs per-step evaluators
246
+ 5. if a hard-gate evaluator fails, the run stops immediately
247
+
248
+ ### Error Handling
249
+
250
+ HTTP provider runs can end with these termination reasons:
251
+
252
+ | Reason | Cause |
253
+ |--------|-------|
254
+ | `http_connection_failed` | Could not connect to the URL |
255
+ | `http_error` | Agent returned HTTP 4xx or 5xx |
256
+ | `timeout_exceeded` | Request exceeded `timeout_ms` |
257
+ | `invalid_response_format` | Response is not valid JSON, or the expected field is missing |
258
+ | `evaluator_failed` | A per-step hard-gate evaluator failed |
259
+
260
+ Infrastructure errors (`http_connection_failed`, `http_error`, `timeout_exceeded`, `invalid_response_format`) always produce `status: error` and `score: 0`.
261
+
262
+ ---
263
+
134
264
  ## Best Practices
135
265
 
136
- - use named agents instead of ad hoc local command strings
266
+ - use named agents instead of ad hoc provider flags
137
267
  - keep labels stable so compare output stays readable
138
268
  - prefer the mock path for smoke tests and docs
139
- - use external-process agents when you want to wrap a local Node or Python agent implementation
140
- - keep the runner authoritative for tools and termination
269
+ - use external-process agents when you want to wrap a local Node or Python agent
270
+ - use http agents when your agent is already running as a service
271
+ - keep the runner authoritative for tools and termination (external_process and mock)
272
+ - keep your agent authoritative for tools and history (http)
273
+ - choose the simplest provider that answers the engineering question you actually have
141
274
 
142
275
  ## Common Errors
143
276
 
@@ -148,5 +281,7 @@ Typical failures:
148
281
  - missing external-process `command`
149
282
  - invalid `args` or `envAllowlist`
150
283
  - child process returning invalid JSON
284
+ - no http agent service listening at the configured `url` when the test starts
285
+ - http agent returning a field name that doesn't match `response_field`
151
286
 
152
287
  See [troubleshooting.md](troubleshooting.md) for fixes.