npm - agent-regression-lab - Versions diffs - 0.2.0 → 0.4.0 - Mend

agent-regression-lab 0.2.0 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (40) hide show

package/README.md +78 -11
package/bin/agentlab.js +2 -0
package/dist/agent/factory.js +20 -6
package/dist/agent/httpAdapter.js +5 -4
package/dist/config.js +199 -12
package/dist/evaluators.js +56 -1
package/dist/index.js +157 -11
package/dist/init.js +88 -0
package/dist/lib/id.js +3 -0
package/dist/runOutput.js +46 -0
package/dist/runner.js +31 -9
package/dist/scenarios.js +90 -2
package/dist/scoring.js +2 -2
package/dist/storage.js +117 -7
package/dist/tools.js +56 -2
package/dist/trace.js +4 -2
package/dist/ui/App.js +75 -7
package/dist/ui-assets/client.css +92 -0
package/dist/ui-assets/client.js +183 -19
package/docs/agents.md +143 -8
package/docs/coding-agents.md +74 -0
package/docs/golden-suites.md +74 -0
package/docs/integrations-and-live-services.md +58 -0
package/docs/memory-and-stateful-agents.md +51 -0
package/docs/release-checklist.md +30 -0
package/docs/runtime-profiles.md +67 -0
package/docs/scenarios.md +303 -56
package/docs/superpowers/plans/2026-04-13-phase-2-lite-phase-3-plan.md +160 -0
package/docs/superpowers/plans/2026-04-13-phase-one-npm-tools-plan.md +502 -0
package/docs/superpowers/specs/2026-04-13-phase-2-lite-phase-3-design.md +164 -0
package/docs/tools.md +34 -3
package/docs/troubleshooting.md +193 -0
package/docs/variant-sets.md +63 -0
package/examples/coding-tools/README.md +21 -0
package/examples/coding-tools/index.js +11 -0
package/examples/coding-tools/package.json +8 -0
package/examples/support-tools/README.md +21 -0
package/examples/support-tools/index.js +8 -0
package/examples/support-tools/package.json +8 -0
package/package.json +7 -5

package/dist/index.js CHANGED Viewed

@@ -1,9 +1,11 @@
 #!/usr/bin/env node
 import packageJson from "../package.json" with { type: "json" };
+import { pathToFileURL } from "node:url";
 import { createAgentFactory } from "./agent/factory.js";
-import { getAgentRegistration } from "./config.js";
-import { createSuiteBatchId } from "./lib/id.js";
-import { getRunErrorDetail } from "./runOutput.js";
+import { getAgentRegistration, getVariantSet } from "./config.js";
+import { createConfigHash, createSuiteBatchId } from "./lib/id.js";
+import { formatCliErrorMessage, formatRunIdentityLines, getFailedEvaluatorSummaries, getRunErrorDetail } from "./runOutput.js";
+import { initProject } from "./init.js";
 async function main() {
     const [, , command, ...args] = process.argv;
     switch (command) {
@@ -32,15 +34,21 @@ async function main() {
         case "ui":
             await handleUi();
             break;
+        case "init":
+            await handleInit(args);
+            break;
         default:
             printUsage();
     }
 }
 function printUsage() {
     console.log(`Usage:
+  agentlab init <project-name>
   agentlab list scenarios
   agentlab run <scenario-id> [--agent <name>] [--provider mock|openai|external_process|http] [--model <model>] [--agent-label <label>]
   agentlab run --suite <suite-id> [--agent <name>] [--provider mock|openai|external_process|http] [--model <model>] [--agent-label <label>]
+  agentlab run --suite-def <name> [--agent <name>]
+  agentlab run <scenario-id> [--variant-set <name>]
   agentlab show <run-id>
   agentlab compare <baseline-run-id> <candidate-run-id>
   agentlab compare --suite <baseline-batch-id> <candidate-batch-id>
@@ -61,10 +69,25 @@ async function handleList(args) {
         console.log(`${scenario.id}\t${scenario.suite}\t${scenario.difficulty ?? "-"}\t${scenario.description ?? ""}`);
     }
 }
+async function handleInit(args) { // CLI handler for `agentlab init <project-name>`; args are the argv tokens that follow the command.
+    const projectName = args[0];
+    if (!projectName) { // no project name supplied: report the error and the usage line on stderr
+        console.error("Error: project-name is required.");
+        console.error("Usage: agentlab init <project-name>");
+        process.exit(1); // terminates the process immediately with status 1
+    }
+    await initProject(projectName); // initProject is imported from ./init.js
+}
 async function handleRun(args) {
     const parsed = parseRunArgs(args);
     const runtimeConfig = validateRuntimeConfig(parsed.runtimeConfig);
-    const { loadScenariosBySuite } = await import("./scenarios.js");
+    const { loadScenariosBySuite, loadScenariosBySuiteDefinition } = await import("./scenarios.js");
+    if (parsed.suite && parsed.suiteDefinition) {
+        throw new Error("--suite and --suite-def cannot be used together.");
+    }
+    if (parsed.runtimeConfig.agentName && parsed.variantSetName) {
+        throw new Error("--agent and --variant-set cannot be used together.");
+    }
     if (parsed.suite) {
         const suite = parsed.suite;
         const scenarios = loadScenariosBySuite(suite);
@@ -73,16 +96,53 @@ async function handleRun(args) {
         }
         const suiteBatchId = createSuiteBatchId();
         const runs = [];
-        for (const scenario of scenarios) {
-            runs.push(await executeOne(scenario.definition.id, runtimeConfig, suiteBatchId));
+        if (parsed.variantSetName) {
+            console.log(`Variant set: ${parsed.variantSetName}`);
+            for (const scenario of scenarios) {
+                runs.push(...await executeVariantSetScenario(scenario.definition.id, parsed.variantSetName, suiteBatchId));
+            }
+        }
+        else {
+            for (const scenario of scenarios) {
+                runs.push(await executeOne(scenario.definition.id, runtimeConfig, suiteBatchId));
+            }
         }
         printSuiteSummary(suite, runs, suiteBatchId);
         return;
     }
+    if (parsed.suiteDefinition) {
+        const suiteDefinition = parsed.suiteDefinition;
+        const scenarios = loadScenariosBySuiteDefinition(suiteDefinition);
+        if (scenarios.length === 0) {
+            throw new Error(`No scenarios found for suite definition '${suiteDefinition}'.`);
+        }
+        const suiteBatchId = createSuiteBatchId();
+        const runs = [];
+        console.log(`Suite definition: ${suiteDefinition}`);
+        if (parsed.variantSetName) {
+            console.log(`Variant set: ${parsed.variantSetName}`);
+            for (const scenario of scenarios) {
+                runs.push(...await executeVariantSetScenario(scenario.definition.id, parsed.variantSetName, suiteBatchId, suiteDefinition));
+            }
+        }
+        else {
+            const suiteRuntimeConfig = { ...runtimeConfig, suiteDefinitionName: suiteDefinition };
+            for (const scenario of scenarios) {
+                runs.push(await executeOne(scenario.definition.id, suiteRuntimeConfig, suiteBatchId));
+            }
+        }
+        printSuiteSummary(suiteDefinition, runs, suiteBatchId);
+        return;
+    }
     const scenarioId = parsed.scenarioId;
     if (!scenarioId) {
         throw new Error("Missing scenario id.");
     }
+    if (parsed.variantSetName) {
+        console.log(`Variant set: ${parsed.variantSetName}`);
+        await executeVariantSetScenario(scenarioId, parsed.variantSetName);
+        return;
+    }
     // Detect scenario type to route to the right runner
     const { listScenarioFiles } = await import("./scenarios.js");
     const { parse } = await import("yaml");
@@ -97,6 +157,12 @@ async function handleRun(args) {
             break;
         }
     }
+    if (scenarioType === "task" && runtimeConfig.provider === "http") {
+        throw new Error(`Scenario '${scenarioId}' is a task scenario. HTTP agents (provider: http) only work with ` +
+            `type: conversation scenarios.\n` +
+            `To test an HTTP agent, create a conversation scenario (type: conversation) — ` +
+            `conversation scenarios do not use a tools: block. See docs/scenarios.md for the format.`);
+    }
     if (scenarioType === "conversation") {
         if (runtimeConfig.provider !== "http") {
             throw new Error(`Scenario '${scenarioId}' is a conversation scenario and requires provider: http. Use --agent <name> with a configured HTTP agent.`);
@@ -147,6 +213,15 @@ async function executeOne(scenarioId, runtimeConfig, suiteBatchId) {
             tools: toolRegistry,
         });
         bundle.run.suiteBatchId = suiteBatchId;
+        bundle.run.variantSetName = agentVersion.variantSetName;
+        bundle.run.variantLabel = agentVersion.variantLabel;
+        bundle.run.promptVersion = agentVersion.promptVersion;
+        bundle.run.modelVersion = agentVersion.modelVersion;
+        bundle.run.toolSchemaVersion = agentVersion.toolSchemaVersion;
+        bundle.run.configLabel = agentVersion.configLabel;
+        bundle.run.configHash = agentVersion.configHash;
+        bundle.run.runtimeProfileName = loaded.definition.runtime_profile;
+        bundle.run.suiteDefinitionName = runtimeConfig.suiteDefinitionName;
         bundle.agentVersion = agentVersion;
         storage.saveRun(bundle);
         printRunSummary(bundle);
@@ -156,6 +231,45 @@ async function executeOne(scenarioId, runtimeConfig, suiteBatchId) {
         storage.close();
     }
 }
+export async function executeVariantSetScenario(scenarioId, variantSetName, suiteBatchId, suiteDefinitionName) { // Runs one scenario once per variant of the named variant set and resolves to the array of executeOne results.
+    const variantSet = getVariantSet(variantSetName); // getVariantSet is imported from ./config.js
+    const runs = [];
+    for (const variant of variantSet.variants) { // each run is awaited before the next variant starts
+        const registration = getAgentRegistration(variant.agent); // variant.agent names the agent registration to look up
+        const runtimeConfig = buildVariantRuntimeConfig(registration, variantSet.name, variant, suiteDefinitionName);
+        runs.push(await executeOne(scenarioId, runtimeConfig, suiteBatchId)); // suiteBatchId and suiteDefinitionName are omitted by the single-scenario caller in handleRun
+    }
+    return runs;
+}
+function buildVariantRuntimeConfig(registration, variantSetName, variant, suiteDefinitionName) { // Builds the runtime config for one variant: the registration's fields plus variant metadata and a configHash.
+    const runtimeConfig = {
+        ...registration, // start from every field of the agent registration; the keys below override same-named fields
+        agentName: registration.name,
+        label: registration.label ?? variant.label, // the registration's label wins; variant.label is used only when it is null or undefined
+        variantSetName,
+        variantLabel: variant.label,
+        promptVersion: variant.prompt_version, // snake_case variant keys are mapped to camelCase config keys
+        modelVersion: variant.model_version,
+        toolSchemaVersion: variant.tool_schema_version,
+        configLabel: variant.config_label,
+        suiteDefinitionName,
+    };
+    runtimeConfig.configHash = createConfigHash({ // hash input is exactly the fields listed here; suiteDefinitionName is not among them
+        provider: runtimeConfig.provider,
+        agentName: runtimeConfig.agentName,
+        label: runtimeConfig.label,
+        model: runtimeConfig.model,
+        command: runtimeConfig.command,
+        args: runtimeConfig.args ?? [], // a missing args field is hashed as an empty list
+        variantSetName,
+        variantLabel: variant.label,
+        promptVersion: variant.prompt_version,
+        modelVersion: variant.model_version,
+        toolSchemaVersion: variant.tool_schema_version,
+        configLabel: variant.config_label,
+    });
+    return runtimeConfig;
+}
 export async function executeConversation(scenarioId, httpConfig, label, suiteBatchId) {
     const [{ Storage }, { loadConversationScenarioById }, { runConversation }, { createAgentVersionId }] = await Promise.all([
         import("./storage.js"),
@@ -257,6 +371,9 @@ function printRunSummary(bundle) {
     if (bundle.agentVersion?.command) {
         console.log(`Command: ${bundle.agentVersion.command} ${(bundle.agentVersion.args ?? []).join(" ")}`.trim());
     }
+    for (const line of formatRunIdentityLines(bundle)) {
+        console.log(line);
+    }
     console.log(`Runtime: ${bundle.run.durationMs}ms`);
     if (bundle.run.status !== "pass") {
         console.log(`Reason: ${bundle.run.terminationReason}`);
@@ -264,6 +381,13 @@ function printRunSummary(bundle) {
         if (errorDetail) {
             console.log(`Error: ${errorDetail}`);
         }
+        const failedEvaluators = getFailedEvaluatorSummaries(bundle);
+        if (failedEvaluators.length > 0) {
+            console.log("Failed evaluators:");
+            for (const summary of failedEvaluators) {
+                console.log(`- ${summary}`);
+            }
+        }
     }
 }
 async function handleShow(args) {
@@ -394,6 +518,8 @@ function parseRunArgs(args) {
     const runtimeConfig = { provider: "mock" };
     let scenarioId;
     let suite;
+    let suiteDefinition;
+    let variantSetName;
     for (let index = 0; index < args.length; index += 1) {
         const arg = args[index];
         if (arg === "--suite") {
@@ -401,6 +527,16 @@ function parseRunArgs(args) {
             index += 1;
             continue;
         }
+        if (arg === "--suite-def") {
+            suiteDefinition = args[index + 1];
+            index += 1;
+            continue;
+        }
+        if (arg === "--variant-set") {
+            variantSetName = args[index + 1];
+            index += 1;
+            continue;
+        }
         if (arg === "--provider") {
             const provider = args[index + 1];
             if (provider !== "mock" && provider !== "openai" && provider !== "external_process" && provider !== "http") {
@@ -431,7 +567,7 @@ function parseRunArgs(args) {
         }
         throw new Error(`Unexpected argument '${arg}'.`);
     }
-    return { scenarioId, suite, runtimeConfig };
+    return { scenarioId, suite, suiteDefinition, variantSetName, runtimeConfig };
 }
 function validateRuntimeConfig(config) {
     if (config.agentName) {
@@ -475,7 +611,17 @@ function validateRuntimeConfig(config) {
     }
     return config;
 }
-main().catch((error) => {
-    console.error(error instanceof Error ? error.message : String(error));
-    process.exitCode = 1;
-});
+if (isEntrypoint()) { // start the CLI only when this file is the launched script, so importing the module does not call main()
+    main().catch((error) => {
+        const message = error instanceof Error ? error.message : String(error);
+        console.error(formatCliErrorMessage(message)); // formatCliErrorMessage is imported from ./runOutput.js
+        process.exitCode = 1; // records the failure status without forcing an immediate exit
+    });
+}
+function isEntrypoint() { // True when process.argv[1], converted to a file URL, equals this module's import.meta.url.
+    const entry = process.argv[1];
+    if (!entry) { // no script path present in argv
+        return false;
+    }
+    return import.meta.url === pathToFileURL(entry).href; // NOTE(review): exact string comparison; a symlinked launcher path may not equal the module URL — confirm against how the bin script starts the CLI
+}

package/dist/init.js ADDED Viewed

@@ -0,0 +1,88 @@
+import { existsSync, mkdirSync, writeFileSync } from "node:fs";
+import { join } from "node:path";
+const SAMPLE_SCENARIO = `id: sample.hello-world
+name: Hello World Sample
+suite: sample
+description: A minimal example to verify your setup.
+difficulty: easy
+tags:
+  - smoke
+  - sample
+task:
+  instructions: |
+    Say hello to the user and confirm the system is working.
+  context:
+    user_name: Alice
+tools:
+  allowed: []
+runtime:
+  max_steps: 5
+evaluators:
+  - id: greeting-output
+    type: final_answer_contains
+    mode: hard_gate
+    config:
+      required_substrings:
+        - "Hello"
+`; // SAMPLE_SCENARIO: YAML text that initProject writes to scenarios/sample/hello-world.yaml
+const SAMPLE_FIXTURE = `{
+  "users": [
+    { "id": "user_001", "name": "Alice", "email": "alice@example.com" }
+  ]
+}
+`; // SAMPLE_FIXTURE: JSON text that initProject writes to fixtures/users.json
+const SAMPLE_CONFIG = `# Agent Regression Lab Configuration
+# Docs: https://github.com/YakshithK/agent-regression-lab#readme
+agents:
+  - name: mock-default
+    provider: mock
+    label: mock-default
+  # Uncomment and configure to test with OpenAI:
+  # - name: openai-test
+  #   provider: openai
+  #   model: gpt-4o-mini
+  #   label: openai-test
+# Tools can be registered from either:
+# 1. repo-local files
+# 2. installed npm packages
+#
+# tools:
+#   - name: my.local_tool
+#     modulePath: ./tools/customTool.ts
+#     exportName: customTool
+#     description: My repo-local custom tool.
+#     inputSchema:
+#       type: object
+#
+#   - name: support.find_duplicate_charge
+#     package: "@agentlab/example-support-tools"
+#     exportName: findDuplicateCharge
+#     description: Find the duplicated charge order id for a given customer.
+#     inputSchema:
+#       type: object
+`; // SAMPLE_CONFIG: YAML text that initProject writes to agentlab.config.yaml; only the mock-default agent is active, the rest is commented out
+export async function initProject(projectName) { // Scaffolds a sample project directory named projectName under the current working directory.
+    const targetDir = join(process.cwd(), projectName);
+    if (existsSync(targetDir)) { // refuses to reuse an existing path; existsSync is true for any existing entry, not only directories
+        throw new Error(`Directory '${projectName}' already exists.`);
+    }
+    // Create directory structure
+    mkdirSync(targetDir, { recursive: true }); // the function is declared async but only calls the synchronous fs functions
+    mkdirSync(join(targetDir, "scenarios"), { recursive: true });
+    mkdirSync(join(targetDir, "scenarios", "sample"), { recursive: true });
+    mkdirSync(join(targetDir, "fixtures"), { recursive: true });
+    // Write files
+    writeFileSync(join(targetDir, "scenarios", "sample", "hello-world.yaml"), SAMPLE_SCENARIO);
+    writeFileSync(join(targetDir, "fixtures", "users.json"), SAMPLE_FIXTURE);
+    writeFileSync(join(targetDir, "agentlab.config.yaml"), SAMPLE_CONFIG);
+    console.log(`Created '${projectName}' with sample scenario.`);
+    console.log("");
+    console.log("Next steps:"); // the remaining lines print follow-up commands for the user
+    console.log(`  cd ${projectName}`);
+    console.log("  npm install @agentlab/example-support-tools"); // NOTE(review): the generated sample scenario allows no tools, so this install looks optional — confirm it belongs in the default steps
+    console.log("  # then register package-backed tools in agentlab.config.yaml if needed");
+    console.log("  agentlab run sample.hello-world --agent mock-default");
+}

package/dist/lib/id.js CHANGED Viewed

@@ -17,3 +17,6 @@ export function createToolCallId() {
 export function createAgentVersionId(label, config) { // Returns "agent_" followed by the first 12 characters of hashText("<label>:<JSON of config>").
     return `agent_${hashText(`${label}:${JSON.stringify(config)}`).slice(0, 12)}`;
 }
+export function createConfigHash(input) { // Hashes a config object by delegating to createAgentVersionId with the fixed label "config".
+    return createAgentVersionId("config", input); // the result therefore carries the same "agent_" prefix as agent version ids
+}

package/dist/runOutput.js CHANGED Viewed

@@ -1,5 +1,11 @@
 export function getRunErrorDetail(bundle) {
     for (const event of [...bundle.traceEvents].reverse()) {
+        if (event.type === "conversation_finished") {
+            const errorMessage = event.payload.errorMessage;
+            if (typeof errorMessage === "string") {
+                return errorMessage;
+            }
+        }
         if (event.type === "agent_error") {
             const message = event.payload.message;
             return typeof message === "string" ? message : undefined;
@@ -11,3 +17,43 @@ export function getRunErrorDetail(bundle) {
     }
     return undefined;
 }
+export function formatCliErrorMessage(message) { // Maps known low-level error text to a friendlier CLI message; any other message is returned unchanged.
+    if (message.includes("database is locked")) { // substring match on the raw error text
+        return "SQLite database is locked. Retry the run sequentially or wait for the current run to finish.";
+    }
+    return message;
+}
+export function getFailedEvaluatorSummaries(bundle) { // Returns one "<evaluatorId>: <message>" string per entry of bundle.evaluatorResults whose status is "fail".
+    return bundle.evaluatorResults
+        .filter((result) => result.status === "fail") // results with any other status are ignored
+        .map((result) => `${result.evaluatorId}: ${result.message}`); // NOTE(review): a result without a message would print as "undefined" — confirm message is always set
+}
+export function formatRunIdentityLines(bundle) { // Returns "Label: value" lines for the identity fields set on bundle.run, in the fixed order below.
+    const lines = [];
+    const run = bundle.run;
+    if (run.variantSetName) { // each field is emitted only when truthy, so unset or empty values produce no line
+        lines.push(`Variant set: ${run.variantSetName}`);
+    }
+    if (run.variantLabel) {
+        lines.push(`Variant: ${run.variantLabel}`);
+    }
+    if (run.promptVersion) {
+        lines.push(`Prompt version: ${run.promptVersion}`);
+    }
+    if (run.modelVersion) {
+        lines.push(`Model version: ${run.modelVersion}`);
+    }
+    if (run.toolSchemaVersion) {
+        lines.push(`Tool schema version: ${run.toolSchemaVersion}`);
+    }
+    if (run.configLabel) {
+        lines.push(`Config label: ${run.configLabel}`);
+    }
+    if (run.runtimeProfileName) {
+        lines.push(`Runtime profile: ${run.runtimeProfileName}`);
+    }
+    if (run.suiteDefinitionName) {
+        lines.push(`Suite definition: ${run.suiteDefinitionName}`);
+    }
+    return lines; // an empty array when none of the fields are set
+}

package/dist/runner.js CHANGED Viewed

@@ -1,7 +1,9 @@
 import { performance } from "node:perf_hooks";
+import { getRuntimeProfile } from "./config.js";
 import { createToolCallId, createRunId } from "./lib/id.js";
 import { evaluateScenario } from "./evaluators.js";
 import { computeScore } from "./scoring.js";
+import { applyRuntimeProfileToTools } from "./tools.js";
 import { TraceRecorder } from "./trace.js";
 export async function runScenario(deps) {
     const runId = createRunId();
@@ -9,6 +11,8 @@ export async function runScenario(deps) {
     const runStart = performance.now();
     const trace = new TraceRecorder(runId, deps.scenario.id);
     const toolCalls = [];
+    const runtimeProfile = deps.scenario.runtime_profile ? getRuntimeProfile(deps.scenario.runtime_profile) : undefined;
+    const tools = applyRuntimeProfileToTools(deps.tools, runtimeProfile, trace);
     const maxSteps = deps.scenario.runtime?.max_steps ?? 8;
     const timeoutSeconds = deps.scenario.runtime?.timeout_seconds;
     const deadline = timeoutSeconds ? Date.now() + timeoutSeconds * 1000 : undefined;
@@ -22,6 +26,9 @@ export async function runScenario(deps) {
         maxSteps,
         timeoutSeconds,
     });
+    trace.record("system", "runtime_profile_applied", {
+        name: runtimeProfile?.name ?? null,
+    }, { countStep: false });
     const availableTools = deps.toolSpecs.filter((tool) => deps.scenario.tools.allowed.includes(tool.name));
     const session = await deps.agentAdapter.startRun({
         instructions: deps.scenario.task.instructions,
@@ -72,7 +79,7 @@ export async function runScenario(deps) {
             trace.record("runner", "forbidden_tool_attempted", { toolName });
             break;
         }
-        const handler = deps.tools[toolName];
+        const handler = tools[toolName];
         if (!handler) {
             status = "error";
             terminationReason = "tool_error";
@@ -98,7 +105,8 @@ export async function runScenario(deps) {
         }
         catch (error) {
             const message = error instanceof Error ? error.message : String(error);
-            if (deadline && Date.now() >= deadline) {
+            const isInjectedTimeout = error instanceof Error && error.code === "timeout_exceeded";
+            if (isInjectedTimeout || (deadline && Date.now() >= deadline)) {
                 status = "error";
                 terminationReason = "timeout_exceeded";
                 trace.record("runner", "timeout_exceeded", { timeoutSeconds, message });
@@ -182,18 +190,32 @@ export async function runScenario(deps) {
 function hasTimedOut(deadline) { // True once Date.now() has reached the deadline; an undefined deadline never times out.
     return deadline !== undefined && Date.now() >= deadline;
 }
+function toolRaceTimeoutError(message) { // Creates an Error whose code property is "timeout_exceeded".
+    const error = new Error(message);
+    error.code = "timeout_exceeded"; // runScenario's catch block checks this code to classify the failure as a timeout
+    return error;
+}
 async function raceWithTimeout(promise, deadline, message) { // Settles with the given promise, or rejects with a timeout error once the deadline (epoch milliseconds) passes.
     if (deadline === undefined) { // no deadline configured: hand back the promise without racing it
         return promise;
     }
     const remainingMs = deadline - Date.now();
     if (remainingMs <= 0) { // the deadline has already passed before the race starts
-        throw new Error(message);
+        throw toolRaceTimeoutError(message);
+    }
+    let timeoutHandle;
+    try {
+        return await Promise.race([
+            promise,
+            new Promise((_, reject) => {
+                timeoutHandle = setTimeout(() => reject(toolRaceTimeoutError(message)), remainingMs);
+                timeoutHandle.unref?.(); // optional call: invoked only when the timer handle provides unref
+            }),
+        ]);
+    }
+    finally {
+        if (timeoutHandle !== undefined) { // cancel the timer however the race ended
+            clearTimeout(timeoutHandle);
+        }
     }
-    return await Promise.race([
-        promise,
-        new Promise((_, reject) => {
-            setTimeout(() => reject(new Error(message)), remainingMs);
-        }),
-    ]);
 }

package/dist/scenarios.js CHANGED Viewed

@@ -2,9 +2,20 @@ import { readFileSync, readdirSync, statSync } from "node:fs";
 import { createHash } from "node:crypto";
 import { join, relative, resolve } from "node:path";
 import { parse } from "yaml";
-import { loadAgentLabConfig } from "./config.js";
+import { getRuntimeProfile, getSuiteDefinition, loadAgentLabConfig } from "./config.js";
 import { getBuiltinToolSpecs } from "./tools.js";
 const SCENARIOS_ROOT = resolve("scenarios");
+const VALID_TASK_EVALUATOR_TYPES = new Set([ // evaluator `type` values that validateScenario accepts; any other type is rejected with an error
+    "exact_final_answer",
+    "final_answer_contains",
+    "forbidden_tool",
+    "tool_call_assertion",
+    "step_count_max",
+    "tool_call_count_max",
+    "tool_repeat_max",
+    "cost_max",
+]);
+const VALID_EVALUATOR_MODES = new Set(["hard_gate", "weighted"]); // evaluator `mode` values that validateScenario accepts
 export function listScenarioFiles(root = SCENARIOS_ROOT) {
     if (!safeExists(root)) {
         return [];
@@ -55,6 +66,8 @@ export function listScenarios() {
 }
 export function loadScenarioById(scenarioId) {
     for (const filePath of listScenarioFiles()) {
+        if (getScenarioType(filePath) !== "task")
+            continue;
         const loaded = loadScenarioByPath(filePath, getKnownToolNames());
         if (loaded.definition.id === scenarioId) {
             return loaded;
@@ -68,6 +81,20 @@ export function loadScenariosBySuite(suite) {
         .map((filePath) => loadScenarioByPath(filePath, getKnownToolNames()))
         .filter(({ definition }) => definition.suite === suite);
 }
+export function loadScenariosBySuiteDefinition(name) { // Loads every scenario file, keeps those matched by the named suite definition's include selectors minus its exclude selectors, sorted by id.
+    const suiteDefinition = getSuiteDefinition(name); // getSuiteDefinition is imported from ./config.js
+    const knownToolNames = getKnownToolNames(); // computed once and reused for every file
+    const scenarioFiles = listScenarioFiles(resolve("scenarios")); // resolved at call time; the module-level SCENARIOS_ROOT uses the same expression but is resolved at load time
+    const loadedScenarios = scenarioFiles.map((filePath) => loadScenarioRecordByPath(filePath, knownToolNames)); // loads task and conversation scenario files alike
+    const included = loadedScenarios
+        .filter(({ definition }) => matchesSuiteDefinitionInclude(definition, suiteDefinition));
+    const excludedIds = new Set(loadedScenarios // ids of every scenario matched by an exclude selector
+        .filter(({ definition }) => matchesSuiteDefinitionExclude(definition, suiteDefinition))
+        .map(({ definition }) => definition.id));
+    return included
+        .filter(({ definition }) => !excludedIds.has(definition.id)) // exclusion takes precedence over inclusion
+        .sort((left, right) => left.definition.id.localeCompare(right.definition.id)); // ordered by scenario id
+}
 export function loadScenarioByPath(filePath, knownToolNames = getKnownToolNames()) {
     const absolutePath = resolve(filePath);
     const raw = readFileSync(absolutePath, "utf8");
@@ -107,7 +134,10 @@ function validateScenario(value, filePath, knownToolNames) {
             throw new Error(`Scenario file '${filePath}' references unknown allowed tool '${toolName}'.`);
         }
     }
-    if (Array.isArray(value.tools.forbidden)) {
+    if (value.tools.forbidden !== undefined) {
+        if (!Array.isArray(value.tools.forbidden)) {
+            throw new Error(`Scenario file '${filePath}' field 'tools.forbidden' must be an array of strings.`);
+        }
         for (const toolName of value.tools.forbidden) {
             if (typeof toolName !== "string") {
                 throw new Error(`Scenario file '${filePath}' contains a non-string tool name in tools.forbidden.`);
@@ -122,6 +152,17 @@ function validateScenario(value, filePath, knownToolNames) {
         if (!isObject(evaluator) || typeof evaluator.id !== "string" || typeof evaluator.type !== "string") {
             throw new Error(`Scenario file '${filePath}' has an invalid evaluator entry.`);
         }
+        if (!VALID_TASK_EVALUATOR_TYPES.has(evaluator.type)) {
+            throw new Error(`Scenario file '${filePath}' evaluator '${evaluator.id}' has invalid type '${evaluator.type}'. ` +
+                `Valid types: ${[...VALID_TASK_EVALUATOR_TYPES].join(", ")}.`);
+        }
+        if (!VALID_EVALUATOR_MODES.has(evaluator.mode)) {
+            throw new Error(`Scenario file '${filePath}' evaluator '${evaluator.id}' has invalid mode '${String(evaluator.mode)}'. ` +
+                `Valid modes: hard_gate, weighted.`);
+        }
+        if (!isObject(evaluator.config)) {
+            throw new Error(`Scenario file '${filePath}' evaluator '${evaluator.id}' must define an object config.`);
+        }
         if (evaluatorIds.has(evaluator.id)) {
             throw new Error(`Scenario file '${filePath}' defines duplicate evaluator id '${evaluator.id}'.`);
         }
@@ -131,6 +172,12 @@ function validateScenario(value, filePath, knownToolNames) {
         validatePositiveInt(value.runtime.max_steps, "runtime.max_steps", filePath);
         validatePositiveInt(value.runtime.timeout_seconds, "runtime.timeout_seconds", filePath);
     }
+    if (value.runtime_profile !== undefined) {
+        if (typeof value.runtime_profile !== "string" || value.runtime_profile.length === 0) {
+            throw new Error(`Scenario file '${filePath}' field 'runtime_profile' must be a non-empty string.`);
+        }
+        getRuntimeProfile(value.runtime_profile);
+    }
     if (isObject(value.context) && Array.isArray(value.context.fixtures)) {
         for (const fixturePath of value.context.fixtures) {
             if (typeof fixturePath !== "string") {
@@ -226,6 +273,17 @@ function validateConversationEvaluatorList(evaluators, context, filePath) {
         if (ev.mode !== "hard_gate" && ev.mode !== "weighted") {
             throw new Error(`Conversation scenario '${filePath}' ${context} evaluator ${i} must have mode: hard_gate or weighted.`);
         }
+        if (ev.type === "response_contains" || ev.type === "response_not_contains") {
+            if (!isObject(ev.config)) {
+                throw new Error(`Conversation scenario '${filePath}' ${context} evaluator ${i} must define an object config.`);
+            }
+            if ("text" in ev.config) {
+                throw new Error(`Conversation scenario '${filePath}' ${context} evaluator ${i} uses stale 'config.text'; use 'config.keywords: string[]'.`);
+            }
+            if (!Array.isArray(ev.config.keywords) || ev.config.keywords.some((kw) => typeof kw !== "string")) {
+                throw new Error(`Conversation scenario '${filePath}' ${context} evaluator ${i} must define config.keywords as a string array.`);
+            }
+        }
     }
 }
 function validateConversationScenario(value, filePath) {
@@ -240,6 +298,12 @@ function validateConversationScenario(value, filePath) {
     if (value.type !== "conversation") {
         throw new Error(`Scenario file '${filePath}' does not have type: conversation.`);
     }
+    if (value.runtime_profile !== undefined) {
+        if (typeof value.runtime_profile !== "string" || value.runtime_profile.length === 0) {
+            throw new Error(`Conversation scenario '${filePath}' field 'runtime_profile' must be a non-empty string.`);
+        }
+        getRuntimeProfile(value.runtime_profile);
+    }
     if ("tools" in value) {
         throw new Error(`Conversation scenario '${filePath}' must not define 'tools'. HTTP agents manage their own tools internally.`);
     }
@@ -265,3 +329,27 @@ function validateConversationScenario(value, filePath) {
         validateConversationEvaluatorList(value.evaluators, "end-of-run evaluators", filePath);
     }
 }
+function loadScenarioRecordByPath(filePath, knownToolNames = getKnownToolNames()) { // Loads a scenario file with the loader that matches its type.
+    if (getScenarioType(filePath) === "conversation") { // conversation files use their own loader, which takes no tool-name list
+        return loadConversationScenarioByPath(filePath);
+    }
+    return loadScenarioByPath(filePath, knownToolNames); // every other type goes through the task-scenario loader
+}
+function matchesSuiteDefinitionInclude(definition, suiteDefinition) { // True when the scenario matches any selector in suiteDefinition.include.
+    return matchesSuiteDefinitionSelectors(definition, suiteDefinition.include); // include is passed through unchecked, unlike exclude
+}
+function matchesSuiteDefinitionExclude(definition, suiteDefinition) { // True when an exclude section exists and the scenario matches any of its selectors.
+    return suiteDefinition.exclude !== undefined && matchesSuiteDefinitionSelectors(definition, suiteDefinition.exclude); // a missing exclude section excludes nothing
+}
+function matchesSuiteDefinitionSelectors(definition, selectors) { // True when the scenario matches at least one selector: listed id, shared tag, or listed suite.
+    if (selectors.scenarios?.includes(definition.id)) { // explicit scenario id
+        return true;
+    }
+    if (selectors.tags?.some((tag) => definition.tags?.includes(tag) ?? false)) { // any selector tag present on the scenario; a scenario without tags never matches here
+        return true;
+    }
+    if (selectors.suites?.includes(definition.suite)) { // the scenario's suite name is listed
+        return true;
+    }
+    return false; // selector lists that are absent are skipped by the optional chaining above
+}

package/dist/scoring.js CHANGED Viewed

@@ -4,9 +4,9 @@ export function computeScore(results) {
     const weighted = results.filter((result) => result.mode === "weighted");
     let score = 100;
     if (weighted.length > 0) {
-        const totalWeight = weighted.reduce((sum, result) => sum + (result.weight ?? 0), 0);
+        const totalWeight = weighted.reduce((sum, result) => sum + (result.weight ?? 1), 0);
         const earnedWeight = weighted.reduce((sum, result) => {
-            const weight = result.weight ?? 0;
+            const weight = result.weight ?? 1;
             return sum + (result.status === "pass" ? weight : 0);
         }, 0);
         score = totalWeight === 0 ? 100 : Math.round((earnedWeight / totalWeight) * 100);