agent-regression-lab 0.2.0 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/storage.js CHANGED
@@ -10,6 +10,10 @@ export class Storage {
10
10
  ensureParentDir(DB_PATH);
11
11
  this.db = new DatabaseSync(DB_PATH);
12
12
  this.db.exec(`
13
+ PRAGMA journal_mode = WAL;
14
+ PRAGMA busy_timeout = 5000;
15
+ `);
16
+ this.db.exec(`
13
17
  CREATE TABLE IF NOT EXISTS metadata (
14
18
  key TEXT PRIMARY KEY,
15
19
  value TEXT NOT NULL
@@ -35,6 +39,15 @@ export class Storage {
35
39
  provider TEXT,
36
40
  command TEXT,
37
41
  args_json TEXT,
42
+ variant_set_name TEXT,
43
+ variant_label TEXT,
44
+ prompt_version TEXT,
45
+ model_version TEXT,
46
+ tool_schema_version TEXT,
47
+ config_label TEXT,
48
+ config_hash TEXT,
49
+ runtime_profile_name TEXT,
50
+ suite_definition_name TEXT,
38
51
  config_json TEXT NOT NULL,
39
52
  created_at TEXT NOT NULL
40
53
  );
@@ -45,6 +58,15 @@ export class Storage {
45
58
  scenario_file_hash TEXT NOT NULL,
46
59
  agent_version_id TEXT NOT NULL,
47
60
  suite_batch_id TEXT,
61
+ variant_set_name TEXT,
62
+ variant_label TEXT,
63
+ prompt_version TEXT,
64
+ model_version TEXT,
65
+ tool_schema_version TEXT,
66
+ config_label TEXT,
67
+ config_hash TEXT,
68
+ runtime_profile_name TEXT,
69
+ suite_definition_name TEXT,
48
70
  status TEXT NOT NULL,
49
71
  termination_reason TEXT NOT NULL,
50
72
  final_output TEXT NOT NULL,
@@ -120,25 +142,41 @@ export class Storage {
120
142
  upsertAgentVersion(agentVersion) {
    // Insert-or-update an agent_versions row keyed by id (SQLite UPSERT via
    // ON CONFLICT). All mutable columns are refreshed from the incoming record;
    // created_at is intentionally absent from the conflict branch so the
    // original creation timestamp survives re-registration of the same id.
    const now = new Date().toISOString();
    this.db
      .prepare(`INSERT INTO agent_versions (
        id, label, model_id, provider, command, args_json,
        variant_set_name, variant_label, prompt_version, model_version, tool_schema_version,
        config_label, config_hash, runtime_profile_name, suite_definition_name,
        config_json, created_at
      )
      VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
      ON CONFLICT(id) DO UPDATE SET
        label = excluded.label,
        model_id = excluded.model_id,
        provider = excluded.provider,
        command = excluded.command,
        args_json = excluded.args_json,
        variant_set_name = excluded.variant_set_name,
        variant_label = excluded.variant_label,
        prompt_version = excluded.prompt_version,
        model_version = excluded.model_version,
        tool_schema_version = excluded.tool_schema_version,
        config_label = excluded.config_label,
        config_hash = excluded.config_hash,
        runtime_profile_name = excluded.runtime_profile_name,
        suite_definition_name = excluded.suite_definition_name,
        config_json = excluded.config_json`)
      // Optional identity fields are stored as NULL when absent; args and
      // config are serialized to JSON text (args defaults to an empty list).
      .run(agentVersion.id, agentVersion.label, agentVersion.modelId ?? null, agentVersion.provider ?? null, agentVersion.command ?? null, JSON.stringify(agentVersion.args ?? []), agentVersion.variantSetName ?? null, agentVersion.variantLabel ?? null, agentVersion.promptVersion ?? null, agentVersion.modelVersion ?? null, agentVersion.toolSchemaVersion ?? null, agentVersion.configLabel ?? null, agentVersion.configHash ?? null, agentVersion.runtimeProfileName ?? null, agentVersion.suiteDefinitionName ?? null, JSON.stringify(agentVersion.config), now);
  }
134
170
  saveRun(bundle) {
135
171
  const run = bundle.run;
136
172
  this.db
137
173
  .prepare(`INSERT INTO runs (
138
174
  id, scenario_id, scenario_file_hash, agent_version_id, status, termination_reason, final_output,
139
- suite_batch_id, total_steps, total_tool_calls, duration_ms, total_tokens, total_cost_usd, score, started_at, finished_at
140
- ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`)
141
- .run(run.id, run.scenarioId, run.scenarioFileHash, run.agentVersionId, run.status, run.terminationReason, run.finalOutput, run.suiteBatchId ?? null, run.totalSteps, run.totalToolCalls, run.durationMs, run.totalTokens ?? null, run.totalCostUsd ?? null, run.score, run.startedAt, run.finishedAt);
175
+ suite_batch_id, variant_set_name, variant_label, prompt_version, model_version, tool_schema_version,
176
+ config_label, config_hash, runtime_profile_name, suite_definition_name,
177
+ total_steps, total_tool_calls, duration_ms, total_tokens, total_cost_usd, score, started_at, finished_at
178
+ ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`)
179
+ .run(run.id, run.scenarioId, run.scenarioFileHash, run.agentVersionId, run.status, run.terminationReason, run.finalOutput, run.suiteBatchId ?? null, run.variantSetName ?? null, run.variantLabel ?? null, run.promptVersion ?? null, run.modelVersion ?? null, run.toolSchemaVersion ?? null, run.configLabel ?? null, run.configHash ?? null, run.runtimeProfileName ?? null, run.suiteDefinitionName ?? null, run.totalSteps, run.totalToolCalls, run.durationMs, run.totalTokens ?? null, run.totalCostUsd ?? null, run.score, run.startedAt, run.finishedAt);
142
180
  const insertStep = this.db.prepare(`INSERT INTO run_steps (id, run_id, step_index, timestamp, source, type, payload_json)
143
181
  VALUES (?, ?, ?, ?, ?, ?, ?)`);
144
182
  const insertTool = this.db.prepare(`INSERT INTO tool_calls (id, run_id, step_index, tool_name, input_json, output_json, status, duration_ms, error_message)
@@ -183,6 +221,7 @@ export class Storage {
183
221
  return this.db
184
222
  .prepare(`SELECT r.id, r.scenario_id as scenarioId, s.suite, r.agent_version_id as agentVersionId,
185
223
  r.suite_batch_id as suiteBatchId,
224
+ r.variant_set_name as variantSetName, r.variant_label as variantLabel,
186
225
  av.label as agentLabel, av.provider, av.model_id as modelId,
187
226
  r.status, r.score, r.duration_ms as durationMs, r.total_steps as totalSteps,
188
227
  r.started_at as startedAt
@@ -244,6 +283,11 @@ export class Storage {
244
283
  }));
245
284
  const agentVersion = this.db
246
285
  .prepare(`SELECT id, label, model_id as modelId, provider, command, args_json, config_json
286
+ , variant_set_name as variantSetName, variant_label as variantLabel,
287
+ prompt_version as promptVersion, model_version as modelVersion,
288
+ tool_schema_version as toolSchemaVersion, config_label as configLabel,
289
+ config_hash as configHash, runtime_profile_name as runtimeProfileName,
290
+ suite_definition_name as suiteDefinitionName
247
291
  FROM agent_versions WHERE id = ?`)
248
292
  .get(run.agentVersionId);
249
293
  return {
@@ -259,6 +303,15 @@ export class Storage {
259
303
  provider: agentVersion.provider ?? undefined,
260
304
  command: agentVersion.command ?? undefined,
261
305
  args: agentVersion.args_json ? JSON.parse(agentVersion.args_json) : undefined,
306
+ variantSetName: agentVersion.variantSetName ?? undefined,
307
+ variantLabel: agentVersion.variantLabel ?? undefined,
308
+ promptVersion: agentVersion.promptVersion ?? undefined,
309
+ modelVersion: agentVersion.modelVersion ?? undefined,
310
+ toolSchemaVersion: agentVersion.toolSchemaVersion ?? undefined,
311
+ configLabel: agentVersion.configLabel ?? undefined,
312
+ configHash: agentVersion.configHash ?? undefined,
313
+ runtimeProfileName: agentVersion.runtimeProfileName ?? undefined,
314
+ suiteDefinitionName: agentVersion.suiteDefinitionName ?? undefined,
262
315
  config: JSON.parse(agentVersion.config_json),
263
316
  }
264
317
  : undefined,
@@ -348,7 +401,10 @@ export class Storage {
348
401
  getRunRecord(runId) {
349
402
  return (this.db
350
403
  .prepare(`SELECT id, scenario_id as scenarioId, scenario_file_hash as scenarioFileHash, agent_version_id as agentVersionId,
351
- suite_batch_id as suiteBatchId,
404
+ suite_batch_id as suiteBatchId, variant_set_name as variantSetName, variant_label as variantLabel,
405
+ prompt_version as promptVersion, model_version as modelVersion, tool_schema_version as toolSchemaVersion,
406
+ config_label as configLabel, config_hash as configHash, runtime_profile_name as runtimeProfileName,
407
+ suite_definition_name as suiteDefinitionName,
352
408
  status, termination_reason as terminationReason, final_output as finalOutput, total_steps as totalSteps,
353
409
  total_tool_calls as totalToolCalls, duration_ms as durationMs, total_tokens as totalTokens,
354
410
  total_cost_usd as totalCostUsd, score, started_at as startedAt, finished_at as finishedAt
@@ -388,6 +444,33 @@ export class Storage {
388
444
  if (!names.has("args_json")) {
389
445
  this.db.exec(`ALTER TABLE agent_versions ADD COLUMN args_json TEXT`);
390
446
  }
447
+ if (!names.has("variant_set_name")) {
448
+ this.db.exec(`ALTER TABLE agent_versions ADD COLUMN variant_set_name TEXT`);
449
+ }
450
+ if (!names.has("variant_label")) {
451
+ this.db.exec(`ALTER TABLE agent_versions ADD COLUMN variant_label TEXT`);
452
+ }
453
+ if (!names.has("prompt_version")) {
454
+ this.db.exec(`ALTER TABLE agent_versions ADD COLUMN prompt_version TEXT`);
455
+ }
456
+ if (!names.has("model_version")) {
457
+ this.db.exec(`ALTER TABLE agent_versions ADD COLUMN model_version TEXT`);
458
+ }
459
+ if (!names.has("tool_schema_version")) {
460
+ this.db.exec(`ALTER TABLE agent_versions ADD COLUMN tool_schema_version TEXT`);
461
+ }
462
+ if (!names.has("config_label")) {
463
+ this.db.exec(`ALTER TABLE agent_versions ADD COLUMN config_label TEXT`);
464
+ }
465
+ if (!names.has("config_hash")) {
466
+ this.db.exec(`ALTER TABLE agent_versions ADD COLUMN config_hash TEXT`);
467
+ }
468
+ if (!names.has("runtime_profile_name")) {
469
+ this.db.exec(`ALTER TABLE agent_versions ADD COLUMN runtime_profile_name TEXT`);
470
+ }
471
+ if (!names.has("suite_definition_name")) {
472
+ this.db.exec(`ALTER TABLE agent_versions ADD COLUMN suite_definition_name TEXT`);
473
+ }
391
474
  }
392
475
  ensureRunColumns() {
393
476
  const columns = this.db.prepare(`PRAGMA table_info(runs)`).all();
@@ -395,6 +478,33 @@ export class Storage {
395
478
  if (!names.has("suite_batch_id")) {
396
479
  this.db.exec(`ALTER TABLE runs ADD COLUMN suite_batch_id TEXT`);
397
480
  }
481
+ if (!names.has("variant_set_name")) {
482
+ this.db.exec(`ALTER TABLE runs ADD COLUMN variant_set_name TEXT`);
483
+ }
484
+ if (!names.has("variant_label")) {
485
+ this.db.exec(`ALTER TABLE runs ADD COLUMN variant_label TEXT`);
486
+ }
487
+ if (!names.has("prompt_version")) {
488
+ this.db.exec(`ALTER TABLE runs ADD COLUMN prompt_version TEXT`);
489
+ }
490
+ if (!names.has("model_version")) {
491
+ this.db.exec(`ALTER TABLE runs ADD COLUMN model_version TEXT`);
492
+ }
493
+ if (!names.has("tool_schema_version")) {
494
+ this.db.exec(`ALTER TABLE runs ADD COLUMN tool_schema_version TEXT`);
495
+ }
496
+ if (!names.has("config_label")) {
497
+ this.db.exec(`ALTER TABLE runs ADD COLUMN config_label TEXT`);
498
+ }
499
+ if (!names.has("config_hash")) {
500
+ this.db.exec(`ALTER TABLE runs ADD COLUMN config_hash TEXT`);
501
+ }
502
+ if (!names.has("runtime_profile_name")) {
503
+ this.db.exec(`ALTER TABLE runs ADD COLUMN runtime_profile_name TEXT`);
504
+ }
505
+ if (!names.has("suite_definition_name")) {
506
+ this.db.exec(`ALTER TABLE runs ADD COLUMN suite_definition_name TEXT`);
507
+ }
398
508
  }
399
509
  getRunsBySuiteBatchId(suiteBatchId) {
400
510
  const runIds = this.db
package/dist/tools.js CHANGED
@@ -2,6 +2,38 @@ import { readFileSync } from "node:fs";
2
2
  import { pathToFileURL } from "node:url";
3
3
  import { resolve } from "node:path";
4
4
  import { loadAgentLabConfig } from "./config.js";
5
/**
 * Apply a runtime profile's tool-fault configuration to a tool map.
 *
 * Returns the original map untouched when the profile declares no faults.
 * Otherwise returns a shallow copy in which every faulted tool that exists in
 * the map is replaced by an async handler that records a
 * `tool_fault_injected` trace event (without counting a step) and then
 * simulates the configured failure mode instead of invoking the real tool:
 * `timeout` waits then throws with code `timeout_exceeded`, `error` throws,
 * `malformed_output` returns the literal string "MALFORMED_OUTPUT", and any
 * other mode returns `partial_output` (or `{}` when unset).
 */
export function applyRuntimeProfileToTools(tools, profile, trace) {
  const faults = profile?.tool_faults;
  if (!faults?.length) {
    return tools;
  }
  const faulted = { ...tools };
  for (const fault of faults) {
    // Faults referencing tools that are not in the map are silently ignored.
    if (!faulted[fault.tool]) {
      continue;
    }
    faulted[fault.tool] = async (input, context) => {
      trace.record("system", "tool_fault_injected", {
        tool: fault.tool,
        mode: fault.mode,
      }, { countStep: false });
      switch (fault.mode) {
        case "timeout": {
          // Simulate a hung tool, then surface a coded timeout error.
          await waitUnref(fault.timeout_ms ?? 5000);
          const timeoutError = new Error(`Injected timeout for ${fault.tool}`);
          timeoutError.code = "timeout_exceeded";
          throw timeoutError;
        }
        case "error":
          throw new Error(fault.error_message ?? `Injected failure for ${fault.tool}`);
        case "malformed_output":
          return "MALFORMED_OUTPUT";
        default:
          // Partial-output style faults return the configured payload.
          return fault.partial_output ?? {};
      }
    };
  }
  return faulted;
}
5
37
  function loadFixture(path) {
6
38
  const raw = readFileSync(resolve(path), "utf8");
7
39
  return JSON.parse(raw);
@@ -372,3 +404,9 @@ function assertObject(value) {
372
404
  throw new Error("Tool input must be an object.");
373
405
  }
374
406
  }
407
/**
 * Resolve after `timeoutMs` milliseconds without keeping the Node.js event
 * loop alive: the timer is unref'd (when the runtime supports it) so a
 * pending injected-timeout wait never blocks process exit.
 */
function waitUnref(timeoutMs) {
  return new Promise((done) => {
    setTimeout(done, timeoutMs).unref?.();
  });
}
package/dist/trace.js CHANGED
@@ -8,8 +8,10 @@ export class TraceRecorder {
8
8
  this.runId = runId;
9
9
  this.scenarioId = scenarioId;
10
10
  }
11
- record(source, type, payload) {
12
- this.stepIndex += 1;
11
+ record(source, type, payload, options) {
12
+ if (options?.countStep !== false) {
13
+ this.stepIndex += 1;
14
+ }
13
15
  this.events.push({
14
16
  eventId: createEventId(),
15
17
  runId: this.runId,
package/dist/ui/App.js CHANGED
@@ -1,4 +1,4 @@
1
- import { jsx as _jsx, jsxs as _jsxs } from "react/jsx-runtime";
1
+ import { jsx as _jsx, jsxs as _jsxs, Fragment as _Fragment } from "react/jsx-runtime";
2
2
  import { useEffect, useState } from "react";
3
3
  export function App() {
4
4
  const route = getRoute();
@@ -37,7 +37,18 @@ function RunDetailPage(props) {
37
37
  if (!detail) {
38
38
  return _jsx(EmptyState, { title: "Loading run", description: "Fetching run detail from the local lab." });
39
39
  }
40
- return (_jsxs("section", { children: [_jsxs("div", { className: "hero", children: [_jsx("h1", { children: detail.run.id }), _jsx("p", { children: detail.run.scenarioId })] }), _jsxs("div", { className: "stats", children: [_jsx(Stat, { label: "Status", value: _jsx("span", { className: `pill ${detail.run.status}`, children: detail.run.status }) }), _jsx(Stat, { label: "Score", value: detail.run.score }), _jsx(Stat, { label: "Runtime", value: `${detail.run.durationMs}ms` }), _jsx(Stat, { label: "Steps", value: detail.run.totalSteps })] }), _jsxs("div", { className: "panel-grid", children: [_jsxs("section", { className: "panel", children: [_jsx("h2", { children: "Summary" }), _jsxs("p", { children: [_jsx("strong", { children: "Provider:" }), " ", detail.agentVersion?.provider ?? "-"] }), _jsxs("p", { children: [_jsx("strong", { children: "Model:" }), " ", detail.agentVersion?.modelId ?? "-"] }), detail.agentVersion?.command ? (_jsxs("p", { children: [_jsx("strong", { children: "Command:" }), " ", detail.agentVersion.command, " ", (detail.agentVersion.args ?? []).join(" ")] })) : null, _jsxs("p", { children: [_jsx("strong", { children: "Termination:" }), " ", detail.run.terminationReason] }), detail.errorDetail ? _jsxs("p", { children: [_jsx("strong", { children: "Error:" }), " ", detail.errorDetail] }) : null, _jsx("p", { children: _jsx("strong", { children: "Final output:" }) }), _jsx("pre", { children: detail.run.finalOutput || "(none)" })] }), _jsxs("section", { className: "panel", children: [_jsx("h2", { children: "Evaluators" }), _jsx("ul", { className: "stack", children: detail.evaluatorResults.map((result) => (_jsxs("li", { children: [_jsx("span", { className: `pill ${result.status}`, children: result.status }), " ", result.evaluatorId, _jsx("div", { className: "muted", children: result.message })] }, result.evaluatorId))) })] })] }), _jsxs("section", { className: "panel", children: [_jsx("h2", { children: "Tool Calls" }), detail.toolCalls.length === 0 ? 
_jsx("p", { className: "muted", children: "No tool calls recorded." }) : null, _jsx("ul", { className: "stack", children: detail.toolCalls.map((call) => (_jsxs("li", { children: [_jsx("strong", { children: call.toolName }), " ", _jsx("span", { className: `pill ${call.status}`, children: call.status }), _jsx("pre", { children: JSON.stringify({ input: call.input, output: call.output }, null, 2) })] }, call.id))) })] }), _jsxs("section", { className: "panel", children: [_jsx("h2", { children: "Trace" }), _jsx("ol", { className: "timeline", children: detail.traceEvents.map((event) => (_jsxs("li", { children: [_jsxs("div", { children: [_jsxs("strong", { children: [event.stepIndex, ". ", event.type] }), " ", _jsx("span", { className: "muted", children: event.source })] }), _jsx("pre", { children: JSON.stringify(event.payload, null, 2) })] }, event.eventId))) })] })] }));
40
+ return (_jsxs("section", { children: [_jsxs("div", { className: "hero", children: [_jsx("h1", { children: detail.run.id }), _jsx("p", { children: detail.run.scenarioId })] }), _jsx(FailureSummaryPanel, { detail: detail }), _jsxs("div", { className: "stats", children: [_jsx(Stat, { label: "Status", value: _jsx("span", { className: `pill ${detail.run.status}`, children: detail.run.status }) }), _jsx(Stat, { label: "Score", value: detail.run.score }), _jsx(Stat, { label: "Runtime", value: `${detail.run.durationMs}ms` }), _jsx(Stat, { label: "Steps", value: detail.run.totalSteps })] }), _jsxs("div", { className: "panel-grid", children: [_jsxs("section", { className: "panel", children: [_jsx("h2", { children: "Summary" }), _jsxs("p", { children: [_jsx("strong", { children: "Provider:" }), " ", detail.agentVersion?.provider ?? "-"] }), _jsxs("p", { children: [_jsx("strong", { children: "Model:" }), " ", detail.agentVersion?.modelId ?? "-"] }), _jsx(RunIdentitySummary, { detail: detail }), detail.agentVersion?.command ? (_jsxs("p", { children: [_jsx("strong", { children: "Command:" }), " ", detail.agentVersion.command, " ", (detail.agentVersion.args ?? []).join(" ")] })) : null, _jsxs("p", { children: [_jsx("strong", { children: "Termination:" }), " ", detail.run.terminationReason] }), detail.errorDetail ? 
_jsxs("p", { children: [_jsx("strong", { children: "Error:" }), " ", detail.errorDetail] }) : null, _jsx("p", { children: _jsx("strong", { children: "Final output:" }) }), _jsx("pre", { children: detail.run.finalOutput || "(none)" })] }), _jsxs("section", { className: "panel", children: [_jsx("h2", { children: "Evaluators" }), _jsx("ul", { className: "stack", children: detail.evaluatorResults.map((result) => (_jsxs("li", { children: [_jsx("span", { className: `pill ${result.status}`, children: result.status }), " ", result.evaluatorId, _jsx("div", { className: "muted", children: result.message })] }, result.evaluatorId))) })] })] }), _jsxs("section", { className: "panel", children: [_jsx("h2", { children: "Tool Calls" }), detail.toolCalls.length === 0 ? _jsx("p", { className: "muted", children: "No tool calls recorded." }) : null, _jsx("ul", { className: "stack", children: detail.toolCalls.map((call) => (_jsxs("li", { children: [_jsx("strong", { children: call.toolName }), " ", _jsx("span", { className: `pill ${call.status}`, children: call.status }), _jsx("pre", { children: JSON.stringify({ input: call.input, output: call.output }, null, 2) })] }, call.id))) })] }), _jsxs("section", { className: "panel", children: [_jsx("h2", { children: "Trace" }), _jsx("ol", { className: "timeline", children: detail.traceEvents.map((event) => (_jsxs("li", { children: [_jsxs("div", { children: [_jsxs("strong", { children: [event.stepIndex, ". ", event.type] }), " ", _jsx("span", { className: "muted", children: event.source })] }), _jsx("pre", { children: JSON.stringify(event.payload, null, 2) })] }, event.eventId))) })] })] }));
41
+ }
42
/**
 * "Failures First" panel for the run detail page.
 *
 * Renders nothing when getFailureSummaryItems yields no items; otherwise
 * shows the run status pill, the termination reason, and one list entry per
 * failure item (each item string doubles as its list key).
 */
export function FailureSummaryPanel(props) {
    const run = props.detail.run;
    const failureItems = getFailureSummaryItems(props.detail);
    if (failureItems.length === 0) {
        return null;
    }
    const statusRow = _jsxs("p", { children: [_jsx("strong", { children: "Status:" }), " ", _jsx("span", { className: `pill ${run.status}`, children: run.status })] });
    const terminationRow = _jsxs("p", { children: [_jsx("strong", { children: "Termination:" }), " ", run.terminationReason] });
    const itemList = _jsx("ul", { className: "stack", children: failureItems.map((item) => (_jsx("li", { children: item }, item))) });
    return (_jsxs("section", { className: "panel", children: [_jsx("h2", { children: "Failures First" }), statusRow, terminationRow, itemList] }));
}
49
/**
 * Read-only identity metadata rows for a run: variant set/label, prompt,
 * model and tool-schema versions, config label, runtime profile, and suite
 * definition. Missing values render as "-".
 */
export function RunIdentitySummary(props) {
    const run = props.detail.run;
    // Each row is a <p><strong>label</strong> value</p> pair.
    const row = (label, value) => _jsxs("p", { children: [_jsx("strong", { children: label }), " ", value ?? "-"] });
    return (_jsxs(_Fragment, { children: [
        row("Variant set:", run.variantSetName),
        row("Variant:", run.variantLabel),
        row("Prompt version:", run.promptVersion),
        row("Model version:", run.modelVersion),
        row("Tool schema version:", run.toolSchemaVersion),
        row("Config label:", run.configLabel),
        row("Runtime profile:", run.runtimeProfileName),
        row("Suite definition:", run.suiteDefinitionName),
    ] }));
}
42
53
  function ComparePage(props) {
43
54
  const [data, setData] = useState(null);
@@ -95,6 +106,21 @@ function Stat(props) {
95
106
  function EmptyState(props) {
96
107
  return (_jsxs("section", { className: "empty", children: [_jsx("h1", { children: props.title }), _jsx("p", { children: props.description })] }));
97
108
  }
109
/**
 * Build the "Failures First" item strings for a run detail payload.
 *
 * Order: the run-level error detail (if any) first, then one entry per
 * failing evaluator. When nothing concrete failed but the run status is not
 * "pass", a single generic pointer to the trace is returned instead.
 * A passing run with no errors yields an empty array.
 */
export function getFailureSummaryItems(detail) {
    const items = [];
    if (detail.errorDetail) {
        items.push(`Error: ${detail.errorDetail}`);
    }
    const failing = detail.evaluatorResults.filter((result) => result.status === "fail");
    items.push(...failing.map((result) => `Evaluator ${result.evaluatorId}: ${result.message}`));
    if (items.length === 0 && detail.run.status !== "pass") {
        items.push("Run did not pass. Inspect evaluator results and trace for the first divergence.");
    }
    return items;
}
98
124
  function signed(value) {
99
125
  return value > 0 ? `+${value}` : `${value}`;
100
126
  }
@@ -21816,6 +21816,7 @@ function RunDetailPage(props) {
21816
21816
  /* @__PURE__ */ (0, import_jsx_runtime.jsx)("h1", { children: detail.run.id }),
21817
21817
  /* @__PURE__ */ (0, import_jsx_runtime.jsx)("p", { children: detail.run.scenarioId })
21818
21818
  ] }),
21819
+ /* @__PURE__ */ (0, import_jsx_runtime.jsx)(FailureSummaryPanel, { detail }),
21819
21820
  /* @__PURE__ */ (0, import_jsx_runtime.jsxs)("div", { className: "stats", children: [
21820
21821
  /* @__PURE__ */ (0, import_jsx_runtime.jsx)(Stat, { label: "Status", value: /* @__PURE__ */ (0, import_jsx_runtime.jsx)("span", { className: `pill ${detail.run.status}`, children: detail.run.status }) }),
21821
21822
  /* @__PURE__ */ (0, import_jsx_runtime.jsx)(Stat, { label: "Score", value: detail.run.score }),
@@ -21835,6 +21836,7 @@ function RunDetailPage(props) {
21835
21836
  " ",
21836
21837
  detail.agentVersion?.modelId ?? "-"
21837
21838
  ] }),
21839
+ /* @__PURE__ */ (0, import_jsx_runtime.jsx)(RunIdentitySummary, { detail }),
21838
21840
  detail.agentVersion?.command ? /* @__PURE__ */ (0, import_jsx_runtime.jsxs)("p", { children: [
21839
21841
  /* @__PURE__ */ (0, import_jsx_runtime.jsx)("strong", { children: "Command:" }),
21840
21842
  " ",
@@ -21892,6 +21894,71 @@ function RunDetailPage(props) {
21892
21894
  ] })
21893
21895
  ] });
21894
21896
  }
21897
// Bundled copy of FailureSummaryPanel ("Failures First" panel).
// Renders null when getFailureSummaryItems produces no items; otherwise shows
// the run status pill, the termination reason, and one <li> per failure item
// (the item string is reused as the React key).
function FailureSummaryPanel(props) {
  const failureItems = getFailureSummaryItems(props.detail);
  if (failureItems.length === 0) {
    return null;
  }
  return /* @__PURE__ */ (0, import_jsx_runtime.jsxs)("section", { className: "panel", children: [
    /* @__PURE__ */ (0, import_jsx_runtime.jsx)("h2", { children: "Failures First" }),
    /* @__PURE__ */ (0, import_jsx_runtime.jsxs)("p", { children: [
      /* @__PURE__ */ (0, import_jsx_runtime.jsx)("strong", { children: "Status:" }),
      " ",
      /* @__PURE__ */ (0, import_jsx_runtime.jsx)("span", { className: `pill ${props.detail.run.status}`, children: props.detail.run.status })
    ] }),
    /* @__PURE__ */ (0, import_jsx_runtime.jsxs)("p", { children: [
      /* @__PURE__ */ (0, import_jsx_runtime.jsx)("strong", { children: "Termination:" }),
      " ",
      props.detail.run.terminationReason
    ] }),
    /* @__PURE__ */ (0, import_jsx_runtime.jsx)("ul", { className: "stack", children: failureItems.map((item) => /* @__PURE__ */ (0, import_jsx_runtime.jsx)("li", { children: item }, item)) })
  ] });
}
21917
// Bundled copy of RunIdentitySummary: identity metadata rows for a run
// (variant set/label, prompt/model/tool-schema versions, config label,
// runtime profile, suite definition). Missing values render as "-".
function RunIdentitySummary(props) {
  const run = props.detail.run;
  // Each row is a <p><strong>label</strong> value</p> pair.
  const row = (label, value) => /* @__PURE__ */ (0, import_jsx_runtime.jsxs)("p", { children: [
    /* @__PURE__ */ (0, import_jsx_runtime.jsx)("strong", { children: label }),
    " ",
    value ?? "-"
  ] });
  return /* @__PURE__ */ (0, import_jsx_runtime.jsxs)(import_jsx_runtime.Fragment, { children: [
    row("Variant set:", run.variantSetName),
    row("Variant:", run.variantLabel),
    row("Prompt version:", run.promptVersion),
    row("Model version:", run.modelVersion),
    row("Tool schema version:", run.toolSchemaVersion),
    row("Config label:", run.configLabel),
    row("Runtime profile:", run.runtimeProfileName),
    row("Suite definition:", run.suiteDefinitionName)
  ] });
}
21895
21962
  function ComparePage(props) {
21896
21963
  const [data, setData] = (0, import_react.useState)(null);
21897
21964
  (0, import_react.useEffect)(() => {
@@ -22092,6 +22159,21 @@ function EmptyState(props) {
22092
22159
  /* @__PURE__ */ (0, import_jsx_runtime.jsx)("p", { children: props.description })
22093
22160
  ] });
22094
22161
  }
22162
// Bundled copy of getFailureSummaryItems: builds the "Failures First" items.
// Order: run-level error detail first (if any), then one entry per failing
// evaluator; a non-"pass" run with no concrete failures gets a single generic
// pointer to the trace. A passing run with no errors yields [].
function getFailureSummaryItems(detail) {
  const items = [];
  if (detail.errorDetail) {
    items.push(`Error: ${detail.errorDetail}`);
  }
  const failing = detail.evaluatorResults.filter((result) => result.status === "fail");
  items.push(...failing.map((result) => `Evaluator ${result.evaluatorId}: ${result.message}`));
  if (items.length === 0 && detail.run.status !== "pass") {
    items.push("Run did not pass. Inspect evaluator results and trace for the first divergence.");
  }
  return items;
}
22095
22177
  function signed(value) {
22096
22178
  return value > 0 ? `+${value}` : `${value}`;
22097
22179
  }
package/docs/agents.md CHANGED
@@ -2,15 +2,25 @@
2
2
 
3
3
  Named agents are configured in `agentlab.config.yaml`.
4
4
 
5
- This repo currently supports three provider modes:
5
+ Agents remain the stable execution unit even when you introduce Tier 1 comparison features. You still run one named agent at a time, but you can now group multiple named agents into a `variant_set` for prompt/model/config comparisons.
6
+
7
+ This repo supports four provider modes:
6
8
 
7
9
  - `mock`
8
10
  - `openai`
9
11
  - `external_process`
12
+ - `http`
13
+
14
+ Choose the simplest provider that answers the engineering question you actually have:
15
+
16
+ - `mock` for deterministic harness verification
17
+ - `openai` for real model behavior on deterministic tools
18
+ - `external_process` for local agents where the runner should still own the tool loop
19
+ - `http` for real running services that own their own memory and internal orchestration
10
20
 
11
21
  ## Named Agent Config
12
22
 
13
- Example:
23
+ Example covering all providers:
14
24
 
15
25
  ```yaml
16
26
  agents:
@@ -29,14 +39,31 @@ agents:
29
39
  args:
30
40
  - custom_agents/node_agent.mjs
31
41
  label: custom-node-agent
42
+
43
+ - name: my-production-agent
44
+ provider: http
45
+ url: http://localhost:3000/api/chat
46
+ label: my-production-agent
32
47
  ```
33
48
 
34
49
  Run a named agent with:
35
50
 
36
51
  ```bash
37
52
  agentlab run support.refund-correct-order --agent mock-default
53
+ agentlab run internal-teams.memory-followup-recall --agent my-production-agent
54
+ ```
55
+
56
+ Use a named variant set when you want to run one scenario or one suite against multiple agent variants and compare the results later:
57
+
58
+ ```bash
59
+ agentlab run support.refund-correct-order --variant-set refund-agent-model-comparison
60
+ agentlab run --suite-def pre_merge --variant-set refund-agent-model-comparison
38
61
  ```
39
62
 
63
+ Each run records the underlying agent plus richer identity metadata such as `variant_label`, `prompt_version`, `model_version`, `tool_schema_version`, and `config_label`. Those fields appear in CLI summaries, `show`, stored run history, and the UI.
64
+
65
+ ---
66
+
40
67
  ## Mock
41
68
 
42
69
  The built-in mock adapter is the best path for deterministic smoke tests and baseline examples.
@@ -47,6 +74,8 @@ Use it when you want:
47
74
  - stable docs examples
48
75
  - predictable benchmark behavior
49
76
 
77
+ ---
78
+
50
79
  ## OpenAI
51
80
 
52
81
  The OpenAI path uses your API key and a configured model.
@@ -65,6 +94,8 @@ agentlab run support.refund-correct-order --agent openai-cheap
65
94
 
66
95
  The OpenAI path is useful, but less deterministic than the mock path.
67
96
 
97
+ ---
98
+
68
99
  ## External Process
69
100
 
70
101
  External-process agents communicate with the runner over line-delimited JSON on stdin/stdout.
@@ -110,14 +141,12 @@ Run one of them with:
110
141
  agentlab run support.refund-via-config-tool --agent custom-node-agent
111
142
  ```
112
143
 
113
- ## Environment Allowlist
144
+ ### Environment Allowlist
114
145
 
115
146
  External-process agents can optionally define `envAllowlist`.
116
147
 
117
148
  Use it when a child process needs specific environment variables passed through.
118
149
 
119
- Example shape:
120
-
121
150
  ```yaml
122
151
  agents:
123
152
  - name: custom-agent
@@ -131,13 +160,117 @@ agents:
131
160
 
132
161
  Only allow through what the child actually needs.
133
162
 
163
+ ---
164
+
165
+ ## HTTP
166
+
167
+ The `http` provider is for testing real production agents that run as HTTP services — Express, FastAPI, Next.js API routes, or any service that accepts a POST and returns a JSON response.
168
+
169
+ Unlike the other providers, HTTP agents manage their own conversation history and tool execution internally. agentlab sends the current message and a `conversation_id` each turn, then evaluates the reply.
170
+
171
+ Use HTTP agents with `type: conversation` scenarios. See [scenarios.md](scenarios.md) for the conversation scenario format.
172
+
173
+ This is the default choice when validating memoryful or stateful agents that already run as a service.
174
+
175
+ HTTP agents can be included inside a `variant_set` the same way as other named agents. Runtime-profile fault injection is currently applied only to task/tool-loop runs. Conversation scenarios may still reference a runtime profile for reusable authoring, but ARL does not currently intercept tools that an HTTP agent executes internally.
176
+
177
+ ### Minimal Config
178
+
179
+ ```yaml
180
+ agents:
181
+ - name: my-agent
182
+ provider: http
183
+ url: http://localhost:3000/api/chat
184
+ ```
185
+
186
+ Default contract: agentlab posts `{ message, conversation_id }` and expects `{ message }` in the response.
187
+
188
+ ### Custom Field Names
189
+
190
+ If your agent uses different field names:
191
+
192
+ ```yaml
193
+ agents:
194
+ - name: my-agent-custom
195
+ provider: http
196
+ url: http://localhost:3000/api/chat
197
+ request_template:
198
+ query: "{{message}}"
199
+ session_id: "{{conversation_id}}"
200
+ response_field: reply
201
+ ```
202
+
203
+ `request_template` values support three placeholders:
204
+
205
+ - `{{message}}` — the current step message
206
+ - `{{conversation_id}}` — the UUID generated for this run (consistent across all steps)
207
+ - `{{env.VAR_NAME}}` — reads from the environment at runtime
208
+
209
+ Whitespace inside `{{ }}` is ignored: `{{ message }}` and `{{message}}` are identical.
210
+
211
+ ### Auth and Timeout
212
+
213
+ ```yaml
214
+ agents:
215
+ - name: my-agent-auth
216
+ provider: http
217
+ url: http://localhost:3000/api/chat
218
+ headers:
219
+ Authorization: "Bearer {{env.MY_AGENT_TOKEN}}"
220
+ timeout_ms: 10000
221
+ ```
222
+
223
+ `timeout_ms` defaults to 30000 (30 seconds) if not set.
224
+
225
+ Header values also support `{{message}}`, `{{conversation_id}}`, and `{{env.VAR_NAME}}` placeholders.
226
+
227
+ ### Full Config Reference
228
+
229
+ | Field | Required | Default | Description |
230
+ |-------|----------|---------|-------------|
231
+ | `url` | yes | — | HTTP endpoint to POST to |
232
+ | `request_template` | no | `{ message, conversation_id }` | Custom request body shape |
233
+ | `response_field` | no | `message` | Field to read the reply from |
234
+ | `headers` | no | `{}` | Additional HTTP headers |
235
+ | `timeout_ms` | no | `30000` | Per-request timeout in milliseconds |
236
+ | `label` | no | agent name | Display label in CLI output and run history |
237
+
238
+ ### How It Works
239
+
240
+ For each step in a conversation scenario:
241
+
242
+ 1. agentlab generates a UUID `conversation_id` once at the start of the run
243
+ 2. for every step, it POSTs the current message and `conversation_id` to your agent
244
+ 3. your agent is responsible for maintaining conversation history using that id
245
+ 4. agentlab reads the reply, measures latency, and runs per-step evaluators
246
+ 5. if a hard-gate evaluator fails, the run stops immediately
247
+
248
+ ### Error Handling
249
+
250
+ HTTP provider runs can end with these termination reasons:
251
+
252
+ | Reason | Cause |
253
+ |--------|-------|
254
+ | `http_connection_failed` | Could not connect to the URL |
255
+ | `http_error` | Agent returned HTTP 4xx or 5xx |
256
+ | `timeout_exceeded` | Request exceeded `timeout_ms` |
257
+ | `invalid_response_format` | Response is not valid JSON, or the expected field is missing |
258
+ | `evaluator_failed` | A per-step hard-gate evaluator failed |
259
+
260
+ Infrastructure errors (`http_connection_failed`, `http_error`, `timeout_exceeded`, `invalid_response_format`) always produce `status: error` and `score: 0`.
261
+
262
+ ---
263
+
134
264
  ## Best Practices
135
265
 
136
- - use named agents instead of ad hoc local command strings
266
+ - use named agents instead of ad hoc provider flags
137
267
  - keep labels stable so compare output stays readable
138
268
  - prefer the mock path for smoke tests and docs
139
- - use external-process agents when you want to wrap a local Node or Python agent implementation
140
- - keep the runner authoritative for tools and termination
269
+ - use external-process agents when you want to wrap a local Node or Python agent
270
+ - use http agents when your agent is already running as a service
271
+ - keep the runner authoritative for tools and termination (external_process and mock)
272
+ - keep your agent authoritative for tools and history (http)
273
+ - choose the simplest provider that answers the engineering question you actually have
141
274
 
142
275
  ## Common Errors
143
276
 
@@ -148,5 +281,7 @@ Typical failures:
148
281
  - missing external-process `command`
149
282
  - invalid `args` or `envAllowlist`
150
283
  - child process returning invalid JSON
284
+ - no http agent service listening at the configured `url` when the test starts
285
+ - http agent returning a field name that doesn't match `response_field`
151
286
 
152
287
  See [troubleshooting.md](troubleshooting.md) for fixes.