npm - agent-regression-lab - Versions diffs - 0.1.1 → 0.2.0 - Mend

agent-regression-lab 0.1.1 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (20)

package/README.md +140 -123
package/dist/agent/httpAdapter.js +78 -0
package/dist/agent/mockAdapter.js +210 -13
package/dist/config.js +37 -1
package/dist/conversationEvaluators.js +167 -0
package/dist/conversationRunner.js +199 -0
package/dist/index.js +287 -102
package/dist/lib/id.js +3 -0
package/dist/scenarios.js +121 -9
package/dist/storage.js +193 -29
package/dist/tools.js +246 -0
package/dist/ui/App.js +39 -3
package/dist/ui/server.js +18 -0
package/dist/ui-assets/client.js +83 -3
package/docs/agents.md +152 -0
package/docs/release-checklist.md +64 -0
package/docs/scenarios.md +172 -0
package/docs/tools.md +102 -0
package/docs/troubleshooting.md +158 -0
package/package.json +3 -2

package/dist/index.js CHANGED

@@ -1,6 +1,8 @@
 #!/usr/bin/env node
+import packageJson from "../package.json" with { type: "json" };
 import { createAgentFactory } from "./agent/factory.js";
 import { getAgentRegistration } from "./config.js";
+import { createSuiteBatchId } from "./lib/id.js";
 import { getRunErrorDetail } from "./runOutput.js";
 async function main() {
     const [, , command, ...args] = process.argv;
@@ -9,27 +11,27 @@ async function main() {
         case "--help":
         case "-h":
             printUsage();
-            return;
+            break;
         case "version":
         case "--version":
         case "-v":
             printVersion();
-            return;
+            break;
         case "list":
             await handleList(args);
-            return;
+            break;
         case "run":
             await handleRun(args);
-            return;
+            break;
         case "show":
             await handleShow(args);
-            return;
+            break;
         case "compare":
             await handleCompare(args);
-            return;
+            break;
         case "ui":
             await handleUi();
-            return;
+            break;
         default:
             printUsage();
     }
@@ -37,16 +39,17 @@ async function main() {
 function printUsage() {
     console.log(`Usage:
   agentlab list scenarios
-  agentlab run <scenario-id> [--agent <name>] [--provider mock|openai|external_process] [--model <model>] [--agent-label <label>]
-  agentlab run --suite <suite-id> [--agent <name>] [--provider mock|openai|external_process] [--model <model>] [--agent-label <label>]
+  agentlab run <scenario-id> [--agent <name>] [--provider mock|openai|external_process|http] [--model <model>] [--agent-label <label>]
+  agentlab run --suite <suite-id> [--agent <name>] [--provider mock|openai|external_process|http] [--model <model>] [--agent-label <label>]
   agentlab show <run-id>
   agentlab compare <baseline-run-id> <candidate-run-id>
+  agentlab compare --suite <baseline-batch-id> <candidate-batch-id>
   agentlab ui
   agentlab help
   agentlab version`);
 }
 function printVersion() {
-    console.log("0.1.0");
+    console.log(packageJson.version);
 }
 async function handleList(args) {
     if (args[0] !== "scenarios") {
@@ -64,35 +67,56 @@ async function handleRun(args) {
     const { loadScenariosBySuite } = await import("./scenarios.js");
     if (parsed.suite) {
         const suite = parsed.suite;
-        if (!suite) {
-            throw new Error("Missing suite id.");
-        }
         const scenarios = loadScenariosBySuite(suite);
         if (scenarios.length === 0) {
             throw new Error(`No scenarios found for suite '${suite}'.`);
         }
+        const suiteBatchId = createSuiteBatchId();
         const runs = [];
         for (const scenario of scenarios) {
-            runs.push(await executeOne(scenario.definition.id, runtimeConfig));
+            runs.push(await executeOne(scenario.definition.id, runtimeConfig, suiteBatchId));
         }
-        const passed = runs.filter((bundle) => bundle.run.status === "pass").length;
-        const failed = runs.filter((bundle) => bundle.run.status === "fail").length;
-        const errored = runs.filter((bundle) => bundle.run.status === "error").length;
-        const avgScore = Math.round(runs.reduce((sum, bundle) => sum + bundle.run.score, 0) / runs.length);
-        console.log(`Suite: ${suite}`);
-        console.log(`Passed: ${passed}/${runs.length}`);
-        console.log(`Failed: ${failed}/${runs.length}`);
-        console.log(`Errored: ${errored}/${runs.length}`);
-        console.log(`Average score: ${avgScore}`);
+        printSuiteSummary(suite, runs, suiteBatchId);
         return;
     }
     const scenarioId = parsed.scenarioId;
     if (!scenarioId) {
         throw new Error("Missing scenario id.");
     }
-    await executeOne(scenarioId, runtimeConfig);
+    // Detect scenario type to route to the right runner
+    const { listScenarioFiles } = await import("./scenarios.js");
+    const { parse } = await import("yaml");
+    const { readFileSync } = await import("node:fs");
+    const { resolve } = await import("node:path");
+    let scenarioType = "task";
+    for (const filePath of listScenarioFiles()) {
+        const raw = readFileSync(resolve(filePath), "utf8");
+        const parsedYaml = parse(raw);
+        if (parsedYaml.id === scenarioId) {
+            scenarioType = parsedYaml.type === "conversation" ? "conversation" : "task";
+            break;
+        }
+    }
+    if (scenarioType === "conversation") {
+        if (runtimeConfig.provider !== "http") {
+            throw new Error(`Scenario '${scenarioId}' is a conversation scenario and requires provider: http. Use --agent <name> with a configured HTTP agent.`);
+        }
+        const httpConfig = {
+            name: runtimeConfig.agentName ?? "http-agent",
+            provider: "http",
+            url: runtimeConfig.url,
+            request_template: runtimeConfig.request_template,
+            response_field: runtimeConfig.response_field,
+            headers: runtimeConfig.headers,
+            timeout_ms: runtimeConfig.timeout_ms,
+        };
+        await executeConversation(scenarioId, httpConfig, runtimeConfig.label);
+    }
+    else {
+        await executeOne(scenarioId, runtimeConfig);
+    }
 }
-async function executeOne(scenarioId, runtimeConfig) {
+async function executeOne(scenarioId, runtimeConfig, suiteBatchId) {
     const [{ Storage }, { loadToolRegistry, loadToolSpecs }, { loadScenarioById }, { runScenario }] = await Promise.all([
         import("./storage.js"),
         import("./tools.js"),
@@ -100,31 +124,119 @@ async function executeOne(scenarioId, runtimeConfig) {
         import("./runner.js"),
     ]);
     const storage = new Storage();
-    const toolSpecs = await loadToolSpecs();
-    const toolRegistry = await loadToolRegistry();
-    const loaded = loadScenarioById(scenarioId);
-    storage.upsertScenario({
-        id: loaded.definition.id,
-        name: loaded.definition.name,
-        suite: loaded.definition.suite,
-        difficulty: loaded.definition.difficulty,
-        description: loaded.definition.description,
-    }, loaded.definition, loaded.filePath, loaded.fileHash);
-    const factory = createAgentFactory(runtimeConfig);
-    const agentVersion = factory.createVersion(runtimeConfig);
-    storage.upsertAgentVersion(agentVersion);
-    const bundle = await runScenario({
-        agentAdapter: factory.createAdapter(),
-        agentVersion,
-        scenario: loaded.definition,
-        scenarioFileHash: loaded.fileHash,
-        toolSpecs,
-        tools: toolRegistry,
-    });
-    bundle.agentVersion = agentVersion;
-    storage.saveRun(bundle);
-    printRunSummary(bundle);
-    return bundle;
+    try {
+        const toolSpecs = await loadToolSpecs();
+        const toolRegistry = await loadToolRegistry();
+        const loaded = loadScenarioById(scenarioId);
+        storage.upsertScenario({
+            id: loaded.definition.id,
+            name: loaded.definition.name,
+            suite: loaded.definition.suite,
+            difficulty: loaded.definition.difficulty,
+            description: loaded.definition.description,
+        }, loaded.definition, loaded.filePath, loaded.fileHash);
+        const factory = createAgentFactory(runtimeConfig);
+        const agentVersion = factory.createVersion(runtimeConfig);
+        storage.upsertAgentVersion(agentVersion);
+        const bundle = await runScenario({
+            agentAdapter: factory.createAdapter(),
+            agentVersion,
+            scenario: loaded.definition,
+            scenarioFileHash: loaded.fileHash,
+            toolSpecs,
+            tools: toolRegistry,
+        });
+        bundle.run.suiteBatchId = suiteBatchId;
+        bundle.agentVersion = agentVersion;
+        storage.saveRun(bundle);
+        printRunSummary(bundle);
+        return bundle;
+    }
+    finally {
+        storage.close();
+    }
+}
+export async function executeConversation(scenarioId, httpConfig, label, suiteBatchId) {
+    const [{ Storage }, { loadConversationScenarioById }, { runConversation }, { createAgentVersionId }] = await Promise.all([
+        import("./storage.js"),
+        import("./scenarios.js"),
+        import("./conversationRunner.js"),
+        import("./lib/id.js"),
+    ]);
+    const storage = new Storage();
+    try {
+        const loaded = loadConversationScenarioById(scenarioId);
+        storage.upsertScenario({
+            id: loaded.definition.id,
+            name: loaded.definition.name,
+            suite: loaded.definition.suite,
+            difficulty: loaded.definition.difficulty,
+            description: loaded.definition.description,
+        }, loaded.definition, loaded.filePath, loaded.fileHash);
+        const agentLabel = label ?? httpConfig.label ?? httpConfig.name;
+        const agentConfig = { provider: "http", url: httpConfig.url, agentName: httpConfig.name };
+        const agentVersion = {
+            id: createAgentVersionId(agentLabel, agentConfig),
+            label: agentLabel,
+            provider: "http",
+            config: agentConfig,
+        };
+        storage.upsertAgentVersion(agentVersion);
+        const bundle = await runConversation({
+            httpConfig,
+            agentVersion,
+            scenario: loaded.definition,
+            scenarioFileHash: loaded.fileHash,
+        });
+        bundle.run.suiteBatchId = suiteBatchId;
+        bundle.agentVersion = agentVersion;
+        storage.saveRun(bundle);
+        printConversationSummary(bundle, httpConfig.url, loaded.definition.steps.length);
+        return bundle;
+    }
+    finally {
+        storage.close();
+    }
+}
+function printSuiteSummary(suite, runs, suiteBatchId) {
+    const passed = runs.filter((bundle) => bundle.run.status === "pass").length;
+    const failed = runs.filter((bundle) => bundle.run.status === "fail").length;
+    const errored = runs.filter((bundle) => bundle.run.status === "error").length;
+    const avgScore = Math.round(runs.reduce((sum, bundle) => sum + bundle.run.score, 0) / runs.length);
+    console.log(`Suite: ${suite}`);
+    console.log(`Passed: ${passed}/${runs.length}`);
+    console.log(`Failed: ${failed}/${runs.length}`);
+    console.log(`Errored: ${errored}/${runs.length}`);
+    console.log(`Average score: ${avgScore}`);
+    console.log(`Suite batch: ${suiteBatchId}`);
+}
+function printConversationSummary(bundle, agentUrl, totalSteps) {
+    const statusLabel = bundle.run.status.toUpperCase();
+    console.log(`run ${bundle.run.scenarioId} — ${statusLabel}`);
+    console.log(`  agent: ${bundle.agentVersion?.label ?? bundle.run.agentVersionId} (${agentUrl})`);
+    console.log(`  turns completed: ${bundle.run.totalSteps}/${totalSteps}`);
+    const stepEvals = bundle.evaluatorResults.filter((r) => r.evaluatorId.startsWith("step_"));
+    const stepIndices = new Set(stepEvals.map((r) => {
+        const match = r.evaluatorId.match(/^step_(\d+)_/);
+        return match ? parseInt(match[1], 10) : -1;
+    }));
+    for (const stepIndex of [...stepIndices].sort((a, b) => a - b)) {
+        const resultsForStep = stepEvals.filter((r) => r.evaluatorId.startsWith(`step_${stepIndex}_`));
+        const allPass = resultsForStep.every((r) => r.status === "pass");
+        const stepStatus = allPass ? "pass" : "FAIL";
+        const details = resultsForStep.map((r) => {
+            if (r.evaluatorType === "response_latency_max") {
+                const latencyMatch = r.message.match(/(\d+)ms/);
+                return latencyMatch ? `latency ${latencyMatch[1]}ms ✓` : r.message;
+            }
+            return `${r.evaluatorType} ${r.status === "pass" ? "✓" : "✗"}`;
+        });
+        console.log(`  step ${stepIndex + 1}: ${stepStatus}${details.length > 0 ? ` (${details.join(", ")})` : ""}`);
+    }
+    if (bundle.run.status !== "pass") {
+        console.log(`  run stopped (${bundle.run.terminationReason})`);
+    }
+    console.log(`  run id: ${bundle.run.id}`);
 }
 async function handleUi() {
     const { startUiServer } = await import("./ui/server.js");
@@ -161,64 +273,122 @@ async function handleShow(args) {
     }
     const { Storage } = await import("./storage.js");
     const storage = new Storage();
-    const bundle = storage.getRun(runId);
-    if (!bundle) {
-        throw new Error(`Run '${runId}' not found.`);
-    }
-    console.log(`Run: ${bundle.run.id}`);
-    console.log(`Scenario: ${bundle.run.scenarioId}`);
-    console.log(`Status: ${bundle.run.status.toUpperCase()}`);
-    console.log(`Score: ${bundle.run.score}/100`);
-    if (bundle.agentVersion) {
-        console.log(`Provider: ${bundle.agentVersion.provider ?? "unknown"}`);
-        console.log(`Model: ${bundle.agentVersion.modelId ?? "unknown"}`);
-        if (bundle.agentVersion.command) {
-            console.log(`Command: ${bundle.agentVersion.command} ${(bundle.agentVersion.args ?? []).join(" ")}`.trim());
+    try {
+        const bundle = storage.getRun(runId);
+        if (!bundle) {
+            throw new Error(`Run '${runId}' not found.`);
+        }
+        console.log(`Run: ${bundle.run.id}`);
+        console.log(`Scenario: ${bundle.run.scenarioId}`);
+        console.log(`Status: ${bundle.run.status.toUpperCase()}`);
+        console.log(`Score: ${bundle.run.score}/100`);
+        if (bundle.agentVersion) {
+            console.log(`Provider: ${bundle.agentVersion.provider ?? "unknown"}`);
+            console.log(`Model: ${bundle.agentVersion.modelId ?? "unknown"}`);
+            if (bundle.agentVersion.command) {
+                console.log(`Command: ${bundle.agentVersion.command} ${(bundle.agentVersion.args ?? []).join(" ")}`.trim());
+            }
+        }
+        console.log(`Termination: ${bundle.run.terminationReason}`);
+        const errorDetail = getRunErrorDetail(bundle);
+        if (errorDetail) {
+            console.log(`Error: ${errorDetail}`);
+        }
+        console.log(`Final output: ${bundle.run.finalOutput}`);
+        console.log("Evaluators:");
+        for (const result of bundle.evaluatorResults) {
+            console.log(`- ${result.evaluatorId}: ${result.status.toUpperCase()} - ${result.message}`);
         }
     }
-    console.log(`Termination: ${bundle.run.terminationReason}`);
-    const errorDetail = getRunErrorDetail(bundle);
-    if (errorDetail) {
-        console.log(`Error: ${errorDetail}`);
-    }
-    console.log(`Final output: ${bundle.run.finalOutput}`);
-    console.log("Evaluators:");
-    for (const result of bundle.evaluatorResults) {
-        console.log(`- ${result.evaluatorId}: ${result.status.toUpperCase()} - ${result.message}`);
+    finally {
+        storage.close();
     }
 }
 async function handleCompare(args) {
-    const [baselineRunId, candidateRunId] = args;
-    if (!baselineRunId || !candidateRunId) {
-        throw new Error("Missing baseline or candidate run id.");
-    }
+    const isSuiteCompare = args[0] === "--suite";
     const { Storage } = await import("./storage.js");
     const storage = new Storage();
-    const comparison = storage.compareRuns(baselineRunId, candidateRunId);
-    console.log(`Scenario: ${comparison.baseline.run.scenarioId}`);
-    console.log(`Baseline: ${comparison.baseline.run.id} (${comparison.baseline.run.status.toUpperCase()} ${comparison.baseline.run.score}/100)`);
-    console.log(`Candidate: ${comparison.candidate.run.id} (${comparison.candidate.run.status.toUpperCase()} ${comparison.candidate.run.score}/100)`);
-    console.log("Changes:");
-    if (comparison.notes.length === 0) {
-        console.log("- No material changes.");
-    }
-    else {
-        for (const note of comparison.notes) {
-            console.log(`- ${note}`);
+    try {
+        if (isSuiteCompare) {
+            const baselineBatchId = args[1];
+            const candidateBatchId = args[2];
+            if (!baselineBatchId || !candidateBatchId) {
+                throw new Error("Missing baseline or candidate suite batch id.");
+            }
+            const comparison = storage.compareSuites(baselineBatchId, candidateBatchId);
+            console.log(`Suite: ${comparison.suite}`);
+            console.log(`Baseline batch: ${comparison.baselineBatchId}`);
+            console.log(`Candidate batch: ${comparison.candidateBatchId}`);
+            console.log(`Classification: ${comparison.classification.toUpperCase()}`);
+            console.log(`Pass delta: ${signedMetric(comparison.deltas.pass)}`);
+            console.log(`Fail delta: ${signedMetric(comparison.deltas.fail)}`);
+            console.log(`Error delta: ${signedMetric(comparison.deltas.error)}`);
+            console.log(`Average score delta: ${signedMetric(comparison.deltas.averageScore)}`);
+            console.log(`Average runtime delta: ${signedMetric(comparison.deltas.averageRuntimeMs)}ms`);
+            console.log(`Average steps delta: ${signedMetric(comparison.deltas.averageSteps)}`);
+            if (comparison.notes.length > 0) {
+                console.log("Notes:");
+                for (const note of comparison.notes) {
+                    console.log(`- ${note}`);
+                }
+            }
+            if (comparison.regressions.length > 0) {
+                console.log("Regressions:");
+                for (const regression of comparison.regressions) {
+                    console.log(`- ${regression.scenarioId}: ${regression.comparison.classification}`);
+                }
+            }
+            if (comparison.improvements.length > 0) {
+                console.log("Improvements:");
+                for (const improvement of comparison.improvements) {
+                    console.log(`- ${improvement.scenarioId}: ${improvement.comparison.classification}`);
+                }
+            }
+            if (comparison.missingFromCandidate.length > 0) {
+                console.log(`Missing from candidate: ${comparison.missingFromCandidate.join(", ")}`);
+            }
+            if (comparison.missingFromBaseline.length > 0) {
+                console.log(`Missing from baseline: ${comparison.missingFromBaseline.join(", ")}`);
+            }
+            return;
         }
-    }
-    if (comparison.evaluatorDiffs.length > 0) {
-        console.log("Evaluator diffs:");
-        for (const diff of comparison.evaluatorDiffs) {
-            console.log(`- ${diff.note}`);
+        const [baselineRunId, candidateRunId] = args;
+        if (!baselineRunId || !candidateRunId) {
+            throw new Error("Missing baseline or candidate run id.");
         }
-    }
-    if (comparison.toolDiffs.length > 0) {
-        console.log("Tool diffs:");
-        for (const diff of comparison.toolDiffs) {
-            console.log(`- ${diff.note}`);
+        const comparison = storage.compareRuns(baselineRunId, candidateRunId);
+        console.log(`Scenario: ${comparison.baseline.run.scenarioId}`);
+        console.log(`Baseline: ${comparison.baseline.run.id} (${comparison.baseline.run.status.toUpperCase()} ${comparison.baseline.run.score}/100)`);
+        console.log(`Candidate: ${comparison.candidate.run.id} (${comparison.candidate.run.status.toUpperCase()} ${comparison.candidate.run.score}/100)`);
+        console.log(`Classification: ${comparison.classification.toUpperCase()}`);
+        console.log("Changes:");
+        if (comparison.notes.length === 0) {
+            console.log("- No material changes.");
+        }
+        else {
+            for (const note of comparison.notes) {
+                console.log(`- ${note}`);
+            }
+        }
+        if (comparison.evaluatorDiffs.length > 0) {
+            console.log("Evaluator diffs:");
+            for (const diff of comparison.evaluatorDiffs) {
+                console.log(`- ${diff.note}`);
+            }
+        }
+        if (comparison.toolDiffs.length > 0) {
+            console.log("Tool diffs:");
+            for (const diff of comparison.toolDiffs) {
+                console.log(`- ${diff.note}`);
+            }
         }
     }
+    finally {
+        storage.close();
+    }
+}
+function signedMetric(value) {
+    return value > 0 ? `+${value}` : `${value}`;
 }
 function parseRunArgs(args) {
     const runtimeConfig = { provider: "mock" };
@@ -233,7 +403,7 @@ function parseRunArgs(args) {
         }
         if (arg === "--provider") {
             const provider = args[index + 1];
-            if (provider !== "mock" && provider !== "openai" && provider !== "external_process") {
+            if (provider !== "mock" && provider !== "openai" && provider !== "external_process" && provider !== "http") {
                 throw new Error(`Unsupported provider '${String(provider)}'.`);
             }
             runtimeConfig.provider = provider;
@@ -267,11 +437,20 @@ function validateRuntimeConfig(config) {
     if (config.agentName) {
         const registration = getAgentRegistration(config.agentName);
         config.provider = registration.provider;
-        config.model = config.model ?? registration.model;
         config.label = config.label ?? registration.label ?? registration.name;
-        config.command = registration.command;
-        config.args = registration.args;
-        config.envAllowlist = registration.envAllowlist;
+        if (registration.provider !== "http") {
+            config.model = config.model ?? registration.model;
+            config.command = registration.command;
+            config.args = registration.args;
+            config.envAllowlist = registration.envAllowlist;
+        }
+        else {
+            config.url = registration.url;
+            config.request_template = registration.request_template;
+            config.response_field = registration.response_field;
+            config.headers = registration.headers;
+            config.timeout_ms = registration.timeout_ms;
+        }
     }
     if (config.provider === "openai") {
         if (!process.env.OPENAI_API_KEY) {
@@ -288,6 +467,12 @@ function validateRuntimeConfig(config) {
         }
         config.label = config.label ?? config.agentName ?? "external-process-agent";
     }
+    if (config.provider === "http") {
+        if (!config.url) {
+            throw new Error("HTTP agents require a configured url. Use --agent <name> with provider: http in agentlab.config.yaml.");
+        }
+        config.label = config.label ?? config.agentName ?? "http-agent";
+    }
     return config;
 }
 main().catch((error) => {

package/dist/lib/id.js CHANGED

@@ -5,6 +5,9 @@ export function hashText(text) {
 export function createRunId() {
     return `run_${Date.now()}`;
 }
+export function createSuiteBatchId() {
+    return `suite_${Date.now()}_${randomUUID().slice(0, 8)}`;
+}
 export function createEventId() {
     return `evt_${randomUUID()}`;
 }

package/dist/scenarios.js CHANGED

@@ -26,15 +26,31 @@ export function listScenarioFiles(root = SCENARIOS_ROOT) {
     return results.sort();
 }
 export function listScenarios() {
-    return listScenarioFiles().map((filePath) => {
-        const { definition } = loadScenarioByPath(filePath, getKnownToolNames());
-        return {
-            id: definition.id,
-            name: definition.name,
-            suite: definition.suite,
-            difficulty: definition.difficulty,
-            description: definition.description,
-        };
+    return listScenarioFiles().flatMap((filePath) => {
+        try {
+            const scenarioType = getScenarioType(filePath);
+            if (scenarioType === "conversation") {
+                const { definition } = loadConversationScenarioByPath(filePath);
+                return [{
+                        id: definition.id,
+                        name: definition.name,
+                        suite: definition.suite,
+                        difficulty: definition.difficulty,
+                        description: definition.description,
+                    }];
+            }
+            const { definition } = loadScenarioByPath(filePath, getKnownToolNames());
+            return [{
+                    id: definition.id,
+                    name: definition.name,
+                    suite: definition.suite,
+                    difficulty: definition.difficulty,
+                    description: definition.description,
+                }];
+        }
+        catch {
+            return [];
+        }
     });
 }
 export function loadScenarioById(scenarioId) {
@@ -48,6 +64,7 @@ export function loadScenarioById(scenarioId) {
 }
 export function loadScenariosBySuite(suite) {
     return listScenarioFiles()
+        .filter((filePath) => getScenarioType(filePath) === "task")
         .map((filePath) => loadScenarioByPath(filePath, getKnownToolNames()))
         .filter(({ definition }) => definition.suite === suite);
 }
@@ -153,3 +170,98 @@ function getKnownToolNames() {
     }
     return names;
 }
+export function getScenarioType(filePath) {
+    const absolutePath = resolve(filePath);
+    const raw = readFileSync(absolutePath, "utf8");
+    const parsed = parse(raw);
+    if (isObject(parsed) && parsed.type === "conversation") {
+        return "conversation";
+    }
+    return "task";
+}
+export function loadConversationScenarioByPath(filePath) {
+    const absolutePath = resolve(filePath);
+    const raw = readFileSync(absolutePath, "utf8");
+    const parsed = parse(raw);
+    validateConversationScenario(parsed, absolutePath);
+    return {
+        definition: parsed,
+        filePath: relative(process.cwd(), absolutePath),
+        fileHash: createHash("sha256").update(raw).digest("hex"),
+    };
+}
+export function loadConversationScenarioById(scenarioId) {
+    for (const filePath of listScenarioFiles()) {
+        const absolutePath = resolve(filePath);
+        const raw = readFileSync(absolutePath, "utf8");
+        const parsed = parse(raw);
+        if (parsed.type === "conversation" && parsed.id === scenarioId) {
+            return loadConversationScenarioByPath(filePath);
+        }
+    }
+    throw new Error(`Conversation scenario '${scenarioId}' not found.`);
+}
+const VALID_CONVERSATION_EVALUATOR_TYPES = new Set([
+    "response_contains",
+    "response_not_contains",
+    "response_matches_regex",
+    "response_latency_max",
+    "step_count_max",
+    "exact_final_answer",
+    "final_answer_contains",
+]);
+function validateConversationEvaluatorList(evaluators, context, filePath) {
+    if (!Array.isArray(evaluators)) {
+        throw new Error(`Conversation scenario '${filePath}' ${context} evaluators must be an array.`);
+    }
+    for (let i = 0; i < evaluators.length; i += 1) {
+        const ev = evaluators[i];
+        if (!isObject(ev)) {
+            throw new Error(`Conversation scenario '${filePath}' ${context} evaluator ${i} must be an object.`);
+        }
+        if (typeof ev.type !== "string" || !VALID_CONVERSATION_EVALUATOR_TYPES.has(ev.type)) {
+            throw new Error(`Conversation scenario '${filePath}' ${context} evaluator ${i} has invalid type '${String(ev.type)}'. ` +
+                `Valid types: ${[...VALID_CONVERSATION_EVALUATOR_TYPES].join(", ")}.`);
+        }
+        if (ev.mode !== "hard_gate" && ev.mode !== "weighted") {
+            throw new Error(`Conversation scenario '${filePath}' ${context} evaluator ${i} must have mode: hard_gate or weighted.`);
+        }
+    }
+}
+function validateConversationScenario(value, filePath) {
+    if (!isObject(value)) {
+        throw new Error(`Scenario file '${filePath}' must contain a YAML object.`);
+    }
+    for (const field of ["id", "name", "suite"]) {
+        if (typeof value[field] !== "string" || value[field].length === 0) {
+            throw new Error(`Conversation scenario '${filePath}' is missing required string field '${field}'.`);
+        }
+    }
+    if (value.type !== "conversation") {
+        throw new Error(`Scenario file '${filePath}' does not have type: conversation.`);
+    }
+    if ("tools" in value) {
+        throw new Error(`Conversation scenario '${filePath}' must not define 'tools'. HTTP agents manage their own tools internally.`);
+    }
+    if (!Array.isArray(value.steps) || value.steps.length === 0) {
+        throw new Error(`Conversation scenario '${filePath}' must define at least one step.`);
+    }
+    for (let i = 0; i < value.steps.length; i += 1) {
+        const step = value.steps[i];
+        if (!isObject(step)) {
+            throw new Error(`Conversation scenario '${filePath}' step ${i} must be an object.`);
+        }
+        if (step.role !== "user") {
+            throw new Error(`Conversation scenario '${filePath}' step ${i} must have role: user.`);
+        }
+        if (typeof step.message !== "string" || step.message.length === 0) {
+            throw new Error(`Conversation scenario '${filePath}' step ${i} must have a non-empty message.`);
+        }
+        if (step.evaluators !== undefined) {
+            validateConversationEvaluatorList(step.evaluators, `step ${i}`, filePath);
+        }
+    }
+    if (value.evaluators !== undefined) {
+        validateConversationEvaluatorList(value.evaluators, "end-of-run evaluators", filePath);
+    }
+}