npm - agent-regression-lab - Versions diffs - 0.1.1 → 0.3.0 - Mend

agent-regression-lab 0.1.1 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (31) hide show

package/README.md +186 -123
package/dist/agent/factory.js +20 -6
package/dist/agent/httpAdapter.js +79 -0
package/dist/agent/mockAdapter.js +210 -13
package/dist/config.js +223 -4
package/dist/conversationEvaluators.js +167 -0
package/dist/conversationRunner.js +199 -0
package/dist/evaluators.js +56 -1
package/dist/index.js +428 -111
package/dist/lib/id.js +6 -0
package/dist/runOutput.js +46 -0
package/dist/runner.js +31 -9
package/dist/scenarios.js +211 -11
package/dist/scoring.js +2 -2
package/dist/storage.js +305 -31
package/dist/tools.js +284 -0
package/dist/trace.js +4 -2
package/dist/ui/App.js +67 -5
package/dist/ui/server.js +18 -0
package/dist/ui-assets/client.js +165 -3
package/docs/agents.md +287 -0
package/docs/golden-suites.md +74 -0
package/docs/integrations-and-live-services.md +58 -0
package/docs/memory-and-stateful-agents.md +51 -0
package/docs/release-checklist.md +94 -0
package/docs/runtime-profiles.md +67 -0
package/docs/scenarios.md +419 -0
package/docs/tools.md +102 -0
package/docs/troubleshooting.md +296 -0
package/docs/variant-sets.md +63 -0
package/package.json +4 -3

package/dist/conversationEvaluators.js ADDED Viewed

@@ -0,0 +1,167 @@
/**
 * Runs every per-step evaluator against a single agent reply.
 *
 * @param {string} reply - Agent reply text for the step.
 * @param {number} latencyMs - Measured latency of the step, in milliseconds.
 * @param {Array<object>} evaluators - Evaluator definitions for the step.
 * @param {number} stepIndex - Index of the step; embedded in each evaluator id
 *   as `step_<stepIndex>_<type>_<position>`.
 * @returns {Array<object>} One result per evaluator, in input order. Empty when
 *   `evaluators` is not an array.
 */
export function evaluateStep(reply, latencyMs, evaluators, stepIndex) {
    // A step without an evaluator list has nothing to check; do not throw.
    if (!Array.isArray(evaluators)) {
        return [];
    }
    return evaluators.map((evaluator, i) => {
        const evaluatorId = `step_${stepIndex}_${evaluator.type}_${i}`;
        return evaluateStepOne(evaluator, evaluatorId, reply, latencyMs);
    });
}
/**
 * Runs every end-of-run evaluator against the conversation's final reply.
 *
 * @param {string} finalReply - Last reply produced by the agent.
 * @param {number} totalTurns - Number of turns that completed.
 * @param {Array<object>} evaluators - End-of-run evaluator definitions.
 * @returns {Array<object>} One result per evaluator, in input order, with ids
 *   of the form `run_<type>_<position>`. Empty when `evaluators` is not an array.
 */
export function evaluateConversationEnd(finalReply, totalTurns, evaluators) {
    // A scenario without end-of-run evaluators has nothing to check; do not throw.
    if (!Array.isArray(evaluators)) {
        return [];
    }
    return evaluators.map((evaluator, i) => {
        const evaluatorId = `run_${evaluator.type}_${i}`;
        return evaluateEndOne(evaluator, evaluatorId, finalReply, totalTurns);
    });
}
/**
 * Evaluates one per-step evaluator against a single agent reply.
 *
 * Supported evaluator types:
 *  - `response_contains`: every `config.keywords` entry occurs in the reply
 *    (case-insensitive).
 *  - `response_not_contains`: no `config.keywords` entry occurs in the reply
 *    (case-insensitive).
 *  - `response_matches_regex`: `config.pattern`, compiled with the `i` flag,
 *    matches the reply. A pattern that does not compile fails and says so.
 *  - `response_latency_max`: `latencyMs` is at most `config.ms`.
 * Any other type produces a failing result without a `rawScore`.
 *
 * @param {object} evaluator - Evaluator definition; `type`, `mode`, `weight`
 *   and `config` are read.
 * @param {string} evaluatorId - Id copied into the result.
 * @param {*} reply - Agent reply. Non-string values are converted with
 *   `String`; `null`/`undefined` become the empty string.
 * @param {number} latencyMs - Measured latency of the step, in milliseconds.
 * @returns {object} Evaluator result (`evaluatorId`, `evaluatorType`, `mode`,
 *   `weight`, `status`, `rawScore`, `message`).
 */
function evaluateStepOne(evaluator, evaluatorId, reply, latencyMs) {
    // Coerce instead of crashing on `.toLowerCase()` when the agent returns a non-string.
    const replyText = typeof reply === "string" ? reply : String(reply ?? "");
    const normalizedReply = replyText.toLowerCase();
    const config = evaluator.config ?? {};
    // Shared result shape for every supported evaluator type.
    const toResult = (passed, message) => ({
        evaluatorId,
        evaluatorType: evaluator.type,
        mode: evaluator.mode,
        weight: evaluator.weight,
        status: passed ? "pass" : "fail",
        rawScore: passed ? 1 : 0,
        message,
    });
    switch (evaluator.type) {
        case "response_contains": {
            const keywords = toStringArray(config.keywords);
            const missing = keywords.filter((kw) => !normalizedReply.includes(kw.toLowerCase()));
            const passed = missing.length === 0;
            return toResult(passed, passed
                ? "All required keywords found in reply."
                : `Missing keywords: ${missing.join(", ")}.`);
        }
        case "response_not_contains": {
            const keywords = toStringArray(config.keywords);
            const found = keywords.find((kw) => normalizedReply.includes(kw.toLowerCase()));
            const passed = found === undefined;
            return toResult(passed, passed
                ? "No forbidden keywords found in reply."
                : `Forbidden keyword found: "${found}".`);
        }
        case "response_matches_regex": {
            const pattern = String(config.pattern ?? "");
            let regex;
            try {
                regex = new RegExp(pattern, "i");
            }
            catch (error) {
                // Report the misconfiguration instead of claiming the reply failed to match.
                const reason = error instanceof Error ? error.message : String(error);
                return toResult(false, `Invalid regex pattern /${pattern}/ (${reason}).`);
            }
            const passed = regex.test(replyText);
            return toResult(passed, passed
                ? `Reply matches pattern /${pattern}/.`
                : `Reply does not match pattern /${pattern}/.`);
        }
        case "response_latency_max": {
            const maxMs = Number(config.ms ?? 0);
            const passed = latencyMs <= maxMs;
            return toResult(passed, passed
                ? `Response latency ${latencyMs}ms is within limit ${maxMs}ms.`
                : `Response latency ${latencyMs}ms exceeds limit ${maxMs}ms.`);
        }
        default:
            return {
                evaluatorId,
                evaluatorType: evaluator.type,
                mode: evaluator.mode,
                weight: evaluator.weight,
                status: "fail",
                message: `Unsupported step evaluator type '${evaluator.type}'.`,
            };
    }
}
/**
 * Evaluates one end-of-run evaluator against the conversation outcome.
 *
 * Supported evaluator types:
 *  - `step_count_max`: `totalTurns` is at most `config.max`. `config.max_steps`
 *    is accepted as a fallback key, matching the single-run evaluator.
 *  - `exact_final_answer`: the trimmed final reply equals the trimmed
 *    `config.expected`.
 *  - `final_answer_contains`: every `config.keywords` entry occurs in the final
 *    reply (case-insensitive).
 * Per-step-only types and unknown types produce a failing result without a
 * `rawScore`.
 *
 * @param {object} evaluator - Evaluator definition; `type`, `mode`, `weight`
 *   and `config` are read.
 * @param {string} evaluatorId - Id copied into the result.
 * @param {*} finalReply - Last agent reply. Non-string values are converted
 *   with `String`; `null`/`undefined` become the empty string.
 * @param {number} totalTurns - Number of turns that completed.
 * @returns {object} Evaluator result.
 */
function evaluateEndOne(evaluator, evaluatorId, finalReply, totalTurns) {
    // Coerce so evaluators that never read the reply (e.g. step_count_max) cannot crash on it.
    const replyText = typeof finalReply === "string" ? finalReply : String(finalReply ?? "");
    const normalizedReply = replyText.toLowerCase();
    const config = evaluator.config ?? {};
    // Shared result shape for every supported evaluator type.
    const toResult = (passed, message) => ({
        evaluatorId,
        evaluatorType: evaluator.type,
        mode: evaluator.mode,
        weight: evaluator.weight,
        status: passed ? "pass" : "fail",
        rawScore: passed ? 1 : 0,
        message,
    });
    switch (evaluator.type) {
        case "step_count_max": {
            // `max` wins; `max_steps` is the older key still honoured elsewhere in the package.
            const rawMax = config.max ?? config.max_steps;
            const max = Number(rawMax ?? 0);
            const passed = totalTurns <= max;
            return toResult(passed, passed
                ? `Turn count ${totalTurns} is within max ${max}.`
                : `Turn count ${totalTurns} exceeds max ${max}.`);
        }
        case "exact_final_answer": {
            const expected = String(config.expected ?? "");
            const passed = replyText.trim() === expected.trim();
            return toResult(passed, passed ? "Final reply matched exactly." : "Final reply did not match expected output.");
        }
        case "final_answer_contains": {
            const keywords = toStringArray(config.keywords);
            const missing = keywords.filter((kw) => !normalizedReply.includes(kw.toLowerCase()));
            const passed = missing.length === 0;
            return toResult(passed, passed
                ? "Final reply contains all required keywords."
                : `Missing keywords in final reply: ${missing.join(", ")}.`);
        }
        case "response_contains":
        case "response_not_contains":
        case "response_matches_regex":
        case "response_latency_max":
            return {
                evaluatorId,
                evaluatorType: evaluator.type,
                mode: evaluator.mode,
                weight: evaluator.weight,
                status: "fail",
                message: `Evaluator type '${evaluator.type}' is only valid as a per-step evaluator, not end-of-run.`,
            };
        default:
            return {
                evaluatorId,
                evaluatorType: evaluator.type,
                mode: evaluator.mode,
                weight: evaluator.weight,
                status: "fail",
                message: `Unsupported end-of-run evaluator type '${evaluator.type}'.`,
            };
    }
}
/**
 * Normalizes a keyword-style config value into an array of strings.
 *
 * - An array is returned with every element converted via `String`.
 * - A non-empty string is treated as a single keyword, so an evaluator
 *   configured with one bare keyword is not silently reduced to "no keywords".
 * - Anything else (including the empty string) yields an empty array.
 *
 * @param {*} value - Raw config value.
 * @returns {string[]} Normalized keyword list.
 */
function toStringArray(value) {
    if (Array.isArray(value)) {
        return value.map(String);
    }
    if (typeof value === "string" && value !== "") {
        return [value];
    }
    return [];
}

package/dist/conversationRunner.js ADDED Viewed

@@ -0,0 +1,199 @@
+import { performance } from "node:perf_hooks";
+import { randomUUID } from "node:crypto";
+import { callHttpAgent } from "./agent/httpAdapter.js";
+import { evaluateStep, evaluateConversationEnd } from "./conversationEvaluators.js";
+import { computeScore } from "./scoring.js";
+import { TraceRecorder } from "./trace.js";
+import { createRunId } from "./lib/id.js";
/**
 * Runs a multi-step conversation scenario against an HTTP agent and returns
 * the run bundle produced by `buildBundle`.
 *
 * For each scenario step, in order, the step message is sent through
 * `callHttpAgent` with one conversation id shared by all steps. After each
 * reply the step's evaluators (if any) are run. The run ends early when:
 *  - the agent call throws: status `error`, empty final output, score 0, and a
 *    termination reason taken from the error's `code` when it is one of the
 *    known codes, otherwise `http_connection_failed`;
 *  - a step evaluator in `hard_gate` mode fails: status `fail`, termination
 *    reason `evaluator_failed`.
 * Otherwise the scenario-level evaluators run against the last reply and the
 * final status and score come from `computeScore` over all results.
 *
 * @param {object} deps
 * @param {object} deps.httpConfig - `url`, `request_template`,
 *   `response_field`, optional `headers` (default `{}`) and `timeout_ms`
 *   (default 30000).
 * @param {object} deps.agentVersion - Agent version record; `id` is traced.
 * @param {object} deps.scenario - Scenario with `id`, `steps` and optional
 *   `evaluators`.
 * @param {string} deps.scenarioFileHash - Hash recorded with the run.
 * @returns {Promise<object>} The run bundle.
 */
export async function runConversation(deps) {
    const { httpConfig, agentVersion, scenario, scenarioFileHash } = deps;
    const runId = createRunId();
    const startedAt = new Date().toISOString();
    const runStart = performance.now();
    const trace = new TraceRecorder(runId, scenario.id);
    const conversationId = randomUUID();
    const allEvaluatorResults = [];
    // Error codes that map 1:1 onto a termination reason; anything else is
    // reported as a connection failure.
    const knownErrorCodes = new Set([
        "http_connection_failed",
        "http_error",
        "timeout_exceeded",
        "invalid_response_format",
    ]);
    trace.record("runner", "conversation_started", {
        conversationId,
        stepCount: scenario.steps.length,
        agentUrl: httpConfig.url,
        agentVersionId: agentVersion.id,
        scenarioVersionHash: scenarioFileHash,
    });
    let finalOutput = "";
    let terminationReason = "completed";
    let status = "pass";
    let completedSteps = 0;
    // Records the closing trace event using the run state at call time.
    const recordFinished = (extra = {}) => {
        trace.record("runner", "conversation_finished", {
            status,
            terminationReason,
            totalTurns: completedSteps,
            durationMs: Math.round(performance.now() - runStart),
            ...extra,
        });
    };
    // Builds the bundle from the run state at call time; `overrides` replaces individual fields.
    const finish = (overrides = {}) => buildBundle({
        runId,
        scenario,
        scenarioFileHash,
        agentVersion,
        startedAt,
        runStart,
        status,
        terminationReason,
        finalOutput,
        completedSteps,
        allEvaluatorResults,
        trace,
        ...overrides,
    });
    for (let stepIndex = 0; stepIndex < scenario.steps.length; stepIndex += 1) {
        const step = scenario.steps[stepIndex];
        trace.record("runner", "turn_started", {
            stepIndex,
            message: step.message,
            conversationId,
        });
        let reply;
        let latencyMs;
        try {
            const result = await callHttpAgent({
                url: httpConfig.url,
                message: step.message,
                conversationId,
                request_template: httpConfig.request_template,
                response_field: httpConfig.response_field,
                headers: httpConfig.headers ?? {},
                timeout_ms: httpConfig.timeout_ms ?? 30000,
            });
            reply = result.reply;
            latencyMs = result.latencyMs;
        }
        catch (error) {
            // Optional chaining: a thrown null/undefined must not crash the handler itself.
            const code = error?.code;
            const message = error instanceof Error ? error.message : String(error);
            status = "error";
            terminationReason = knownErrorCodes.has(code) ? code : "http_connection_failed";
            recordFinished({ errorMessage: message });
            // An errored run reports no final output and a zero score.
            return finish({ finalOutput: "", score: 0 });
        }
        completedSteps += 1;
        finalOutput = reply;
        trace.record("runner", "turn_completed", {
            stepIndex,
            reply,
            latencyMs,
        });
        if (step.evaluators && step.evaluators.length > 0) {
            const stepResults = evaluateStep(reply, latencyMs, step.evaluators, stepIndex);
            for (const result of stepResults) {
                trace.record("evaluator", "step_evaluation_result", {
                    stepIndex,
                    evaluatorId: result.evaluatorId,
                    status: result.status,
                    message: result.message,
                });
                allEvaluatorResults.push(result);
            }
            const hardGateFailed = stepResults.some((r) => r.mode === "hard_gate" && r.status === "fail");
            if (hardGateFailed) {
                status = "fail";
                terminationReason = "evaluator_failed";
                recordFinished();
                // No explicit score: buildBundle derives it from the results collected so far.
                return finish();
            }
        }
    }
    // End-of-run evaluators
    if (scenario.evaluators && scenario.evaluators.length > 0) {
        trace.record("evaluator", "evaluation_started", {});
        const endResults = evaluateConversationEnd(finalOutput, completedSteps, scenario.evaluators);
        for (const result of endResults) {
            trace.record("evaluator", "evaluation_result", {
                evaluatorId: result.evaluatorId,
                status: result.status,
                message: result.message,
            });
            allEvaluatorResults.push(result);
        }
        trace.record("evaluator", "evaluation_finished", {});
    }
    const scoring = computeScore(allEvaluatorResults);
    status = scoring.status;
    if (status === "fail" && terminationReason === "completed") {
        terminationReason = "evaluator_failed";
    }
    recordFinished();
    return finish({ score: scoring.score });
}
/**
 * Assembles the bundle returned for a conversation run.
 *
 * The duration is measured from `input.runStart` to now, and the finish
 * timestamp is taken at call time. When `input.score` is `null` or
 * `undefined`, the score is derived from `computeScore` over the collected
 * evaluator results. Conversation runs report zero tool calls.
 *
 * @param {object} input - Run state: `runId`, `scenario`, `scenarioFileHash`,
 *   `agentVersion`, `startedAt`, `runStart`, `status`, `terminationReason`,
 *   `finalOutput`, `completedSteps`, `allEvaluatorResults`, `trace`, and an
 *   optional `score`.
 * @returns {object} `{ run, traceEvents, toolCalls, evaluatorResults, agentVersion }`.
 */
function buildBundle(input) {
    const elapsedMs = Math.round(performance.now() - input.runStart);
    const finishedAt = new Date().toISOString();
    let score = input.score;
    if (score === undefined || score === null) {
        // No explicit score supplied: derive it from the evaluator results.
        score = computeScore(input.allEvaluatorResults).score;
    }
    return {
        run: {
            id: input.runId,
            scenarioId: input.scenario.id,
            scenarioFileHash: input.scenarioFileHash,
            agentVersionId: input.agentVersion.id,
            status: input.status,
            terminationReason: input.terminationReason,
            finalOutput: input.finalOutput,
            totalSteps: input.completedSteps,
            totalToolCalls: 0,
            durationMs: elapsedMs,
            score,
            startedAt: input.startedAt,
            finishedAt,
        },
        traceEvents: input.trace.getEvents(),
        toolCalls: [],
        evaluatorResults: input.allEvaluatorResults,
        agentVersion: input.agentVersion,
    };
}

package/dist/evaluators.js CHANGED Viewed

@@ -13,6 +13,12 @@ function evaluateOne(evaluator, bundle) {
             return evaluateExactFinalAnswer(evaluator, bundle.run.finalOutput);
         case "step_count_max":
             return evaluateStepCountMax(evaluator, bundle.run.totalSteps);
+        case "tool_call_count_max":
+            return evaluateToolCallCountMax(evaluator, bundle.run.totalToolCalls);
+        case "tool_repeat_max":
+            return evaluateToolRepeatMax(evaluator, bundle.toolCalls);
+        case "cost_max":
+            return evaluateCostMax(evaluator, bundle.run.totalCostUsd);
         default:
             return {
                 evaluatorId: evaluator.id,
@@ -86,7 +92,8 @@ function evaluateExactFinalAnswer(evaluator, finalOutput) {
     };
 }
 function evaluateStepCountMax(evaluator, stepCount) {
-    const max = Number(evaluator.config.max_steps ?? 0);
+    const rawMax = evaluator.config.max ?? evaluator.config.max_steps;
+    const max = Number(rawMax ?? 0);
     const passed = stepCount <= max;
     return {
         evaluatorId: evaluator.id,
@@ -98,6 +105,54 @@ function evaluateStepCountMax(evaluator, stepCount) {
         message: passed ? `Step count ${stepCount} is within max ${max}.` : `Step count ${stepCount} exceeds max ${max}.`,
     };
 }
/**
 * Checks that a run's total tool-call count does not exceed `config.max`
 * (treated as 0 when absent).
 *
 * @param {object} evaluator - Evaluator definition; `id`, `type`, `mode`,
 *   `weight` and `config.max` are read.
 * @param {number} totalToolCalls - Tool calls made during the run.
 * @returns {object} Evaluator result with `status` "pass" or "fail".
 */
function evaluateToolCallCountMax(evaluator, totalToolCalls) {
    const limit = Number(evaluator.config.max ?? 0);
    const withinLimit = totalToolCalls <= limit;
    let message;
    if (withinLimit) {
        message = `Tool call count ${totalToolCalls} is within max ${limit}.`;
    }
    else {
        message = `Tool call count ${totalToolCalls} exceeds max ${limit}.`;
    }
    return {
        evaluatorId: evaluator.id,
        evaluatorType: evaluator.type,
        mode: evaluator.mode,
        status: withinLimit ? "pass" : "fail",
        weight: evaluator.weight,
        rawScore: withinLimit ? 1 : 0,
        message,
    };
}
/**
 * Checks that the tool named by `config.tool` was called at most `config.max`
 * times (0 when absent).
 *
 * @param {object} evaluator - Evaluator definition; `id`, `type`, `mode`,
 *   `weight`, `config.tool` and `config.max` are read.
 * @param {Array<object>} toolCalls - Recorded tool calls; each entry's
 *   `toolName` is compared against the configured tool.
 * @returns {object} Evaluator result with `status` "pass" or "fail".
 */
function evaluateToolRepeatMax(evaluator, toolCalls) {
    const toolName = String(evaluator.config.tool ?? "");
    const limit = Number(evaluator.config.max ?? 0);
    // Tally the calls that targeted the configured tool.
    const uses = toolCalls.reduce((tally, call) => (call.toolName === toolName ? tally + 1 : tally), 0);
    const withinLimit = uses <= limit;
    let message;
    if (withinLimit) {
        message = `Tool '${toolName}' usage count ${uses} is within max ${limit}.`;
    }
    else {
        message = `Tool '${toolName}' usage count ${uses} exceeds max ${limit}.`;
    }
    return {
        evaluatorId: evaluator.id,
        evaluatorType: evaluator.type,
        mode: evaluator.mode,
        status: withinLimit ? "pass" : "fail",
        weight: evaluator.weight,
        rawScore: withinLimit ? 1 : 0,
        message,
    };
}
/**
 * Checks that a run's total cost does not exceed `config.max_usd` (0 when
 * absent). A `null` or `undefined` total cost is counted as 0.
 *
 * @param {object} evaluator - Evaluator definition; `id`, `type`, `mode`,
 *   `weight` and `config.max_usd` are read.
 * @param {number|null|undefined} totalCostUsd - Total cost recorded for the run.
 * @returns {object} Evaluator result with `status` "pass" or "fail".
 */
function evaluateCostMax(evaluator, totalCostUsd) {
    const budgetUsd = Number(evaluator.config.max_usd ?? 0);
    const spentUsd = totalCostUsd === undefined || totalCostUsd === null ? 0 : totalCostUsd;
    const withinBudget = spentUsd <= budgetUsd;
    let message;
    if (withinBudget) {
        message = `Total cost ${spentUsd} is within max ${budgetUsd}.`;
    }
    else {
        message = `Total cost ${spentUsd} exceeds max ${budgetUsd}.`;
    }
    return {
        evaluatorId: evaluator.id,
        evaluatorType: evaluator.type,
        mode: evaluator.mode,
        status: withinBudget ? "pass" : "fail",
        weight: evaluator.weight,
        rawScore: withinBudget ? 1 : 0,
        message,
    };
}
 function matches(input, match) {
     if (!isObject(input)) {
         return false;