agent-regression-lab 0.1.1 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,167 @@
1
/**
 * Applies every per-step evaluator to a single agent reply.
 *
 * Each result is tagged with an id of the form
 * `step_<stepIndex>_<evaluator.type>_<position in the evaluators array>`.
 *
 * @param {string} reply - Agent reply text for this step.
 * @param {number} latencyMs - Latency measured for this step, in milliseconds.
 * @param {Array<object>} evaluators - Evaluator definitions attached to the step.
 * @param {number} stepIndex - Index of the step within the scenario.
 * @returns {Array<object>} One evaluator result per evaluator, in input order.
 */
export function evaluateStep(reply, latencyMs, evaluators, stepIndex) {
    const results = [];
    evaluators.forEach((evaluator, position) => {
        const id = `step_${stepIndex}_${evaluator.type}_${position}`;
        results.push(evaluateStepOne(evaluator, id, reply, latencyMs));
    });
    return results;
}
7
/**
 * Applies every end-of-run evaluator to the final state of a conversation.
 *
 * Each result is tagged with an id of the form
 * `run_<evaluator.type>_<position in the evaluators array>`.
 *
 * @param {string} finalReply - Last reply received from the agent.
 * @param {number} totalTurns - Number of turns that were completed.
 * @param {Array<object>} evaluators - Scenario-level evaluator definitions.
 * @returns {Array<object>} One evaluator result per evaluator, in input order.
 */
export function evaluateConversationEnd(finalReply, totalTurns, evaluators) {
    const results = [];
    evaluators.forEach((evaluator, position) => {
        const id = `run_${evaluator.type}_${position}`;
        results.push(evaluateEndOne(evaluator, id, finalReply, totalTurns));
    });
    return results;
}
13
/**
 * Evaluates one per-step evaluator against a single agent reply.
 *
 * Supported evaluator types:
 *  - `response_contains`      — every `config.keywords` entry must occur in the reply (case-insensitive).
 *  - `response_not_contains`  — no `config.keywords` entry may occur in the reply (case-insensitive).
 *  - `response_matches_regex` — reply must match `config.pattern` (compiled with the `i` flag).
 *  - `response_latency_max`   — `latencyMs` must be <= `config.ms`.
 * Any other type yields a failing result.
 *
 * @param {object} evaluator - Evaluator definition (`type`, `mode`, `weight`, optional `config`).
 * @param {string} evaluatorId - Id to stamp on the result.
 * @param {string} reply - Agent reply; non-string values are coerced to a string instead of throwing.
 * @param {number} latencyMs - Latency measured for this step, in milliseconds.
 * @returns {object} Result with `evaluatorId`, `evaluatorType`, `mode`, `weight`, `status`,
 *   `rawScore` (1 on pass, 0 on fail) and `message`.
 */
function evaluateStepOne(evaluator, evaluatorId, reply, latencyMs) {
    // A missing config block is treated as empty so the evaluator fails cleanly instead of throwing.
    const config = evaluator.config ?? {};
    const replyText = typeof reply === "string" ? reply : String(reply ?? "");
    const normalizedReply = replyText.toLowerCase();
    // Single place that builds the result, so every branch (including failures) has the same shape.
    const toResult = (passed, message) => ({
        evaluatorId,
        evaluatorType: evaluator.type,
        mode: evaluator.mode,
        weight: evaluator.weight,
        status: passed ? "pass" : "fail",
        rawScore: passed ? 1 : 0,
        message,
    });
    switch (evaluator.type) {
        case "response_contains": {
            const keywords = toStringArray(config.keywords);
            const missing = keywords.filter((kw) => !normalizedReply.includes(kw.toLowerCase()));
            const passed = missing.length === 0;
            return toResult(passed, passed
                ? "All required keywords found in reply."
                : `Missing keywords: ${missing.join(", ")}.`);
        }
        case "response_not_contains": {
            const keywords = toStringArray(config.keywords);
            const found = keywords.find((kw) => normalizedReply.includes(kw.toLowerCase()));
            const passed = found === undefined;
            return toResult(passed, passed
                ? "No forbidden keywords found in reply."
                : `Forbidden keyword found: "${found}".`);
        }
        case "response_matches_regex": {
            const pattern = String(config.pattern ?? "");
            let regex;
            try {
                regex = new RegExp(pattern, "i");
            }
            catch (error) {
                // Still a failure, but say why: an uncompilable pattern is a scenario
                // authoring error, not a reply that failed to match.
                const reason = error instanceof Error ? error.message : String(error);
                return toResult(false, `Invalid regex pattern /${pattern}/: ${reason}`);
            }
            const passed = regex.test(replyText);
            return toResult(passed, passed
                ? `Reply matches pattern /${pattern}/.`
                : `Reply does not match pattern /${pattern}/.`);
        }
        case "response_latency_max": {
            const maxMs = Number(config.ms ?? 0);
            const passed = latencyMs <= maxMs;
            return toResult(passed, passed
                ? `Response latency ${latencyMs}ms is within limit ${maxMs}ms.`
                : `Response latency ${latencyMs}ms exceeds limit ${maxMs}ms.`);
        }
        default:
            return toResult(false, `Unsupported step evaluator type '${evaluator.type}'.`);
    }
}
95
/**
 * Evaluates one end-of-run evaluator against the final state of a conversation.
 *
 * Supported evaluator types:
 *  - `step_count_max`        — `totalTurns` must be <= `config.max` (falls back to `config.max_steps`).
 *  - `exact_final_answer`    — trimmed final reply must equal trimmed `config.expected`.
 *  - `final_answer_contains` — every `config.keywords` entry must occur in the final reply (case-insensitive).
 * The per-step types (`response_*`) and unknown types yield a failing result.
 *
 * @param {object} evaluator - Evaluator definition (`type`, `mode`, `weight`, optional `config`).
 * @param {string} evaluatorId - Id to stamp on the result.
 * @param {string} finalReply - Last agent reply; non-string values are coerced to a string instead of throwing.
 * @param {number} totalTurns - Number of completed turns.
 * @returns {object} Result with `evaluatorId`, `evaluatorType`, `mode`, `weight`, `status`,
 *   `rawScore` (1 on pass, 0 on fail) and `message`.
 */
function evaluateEndOne(evaluator, evaluatorId, finalReply, totalTurns) {
    // A missing config block is treated as empty so the evaluator fails cleanly instead of throwing.
    const config = evaluator.config ?? {};
    const replyText = typeof finalReply === "string" ? finalReply : String(finalReply ?? "");
    // Single place that builds the result, so every branch (including failures) has the same shape.
    const toResult = (passed, message) => ({
        evaluatorId,
        evaluatorType: evaluator.type,
        mode: evaluator.mode,
        weight: evaluator.weight,
        status: passed ? "pass" : "fail",
        rawScore: passed ? 1 : 0,
        message,
    });
    switch (evaluator.type) {
        case "step_count_max": {
            // Accept the `max_steps` spelling as a fallback, as the run-level step_count_max evaluator does.
            const max = Number(config.max ?? config.max_steps ?? 0);
            const passed = totalTurns <= max;
            return toResult(passed, passed
                ? `Turn count ${totalTurns} is within max ${max}.`
                : `Turn count ${totalTurns} exceeds max ${max}.`);
        }
        case "exact_final_answer": {
            const expected = String(config.expected ?? "");
            const passed = replyText.trim() === expected.trim();
            return toResult(passed, passed
                ? "Final reply matched exactly."
                : "Final reply did not match expected output.");
        }
        case "final_answer_contains": {
            const normalizedReply = replyText.toLowerCase();
            const keywords = toStringArray(config.keywords);
            const missing = keywords.filter((kw) => !normalizedReply.includes(kw.toLowerCase()));
            const passed = missing.length === 0;
            return toResult(passed, passed
                ? "Final reply contains all required keywords."
                : `Missing keywords in final reply: ${missing.join(", ")}.`);
        }
        case "response_contains":
        case "response_not_contains":
        case "response_matches_regex":
        case "response_latency_max":
            return toResult(false, `Evaluator type '${evaluator.type}' is only valid as a per-step evaluator, not end-of-run.`);
        default:
            return toResult(false, `Unsupported end-of-run evaluator type '${evaluator.type}'.`);
    }
}
165
/**
 * Normalizes a config value into an array of strings.
 *
 * - Arrays are returned with every element converted via `String`.
 * - A single non-empty string is wrapped in a one-element array, so a scalar
 *   config value (e.g. `keywords: refund`) is not silently discarded.
 * - Anything else yields an empty array.
 *
 * @param {unknown} value - Raw config value.
 * @returns {string[]} Normalized list of strings.
 */
function toStringArray(value) {
    if (Array.isArray(value)) {
        return value.map(String);
    }
    if (typeof value === "string" && value !== "") {
        return [value];
    }
    return [];
}
@@ -0,0 +1,199 @@
1
+ import { performance } from "node:perf_hooks";
2
+ import { randomUUID } from "node:crypto";
3
+ import { callHttpAgent } from "./agent/httpAdapter.js";
4
+ import { evaluateStep, evaluateConversationEnd } from "./conversationEvaluators.js";
5
+ import { computeScore } from "./scoring.js";
6
+ import { TraceRecorder } from "./trace.js";
7
+ import { createRunId } from "./lib/id.js";
8
/**
 * Runs a multi-step conversation scenario against an HTTP agent and returns the run bundle.
 *
 * For each scenario step, in order:
 *  1. the step message is sent through `callHttpAgent`;
 *  2. if the call throws, the run ends immediately with status `"error"` and score 0;
 *  3. otherwise the step's evaluators (if any) are applied, and a failing `hard_gate`
 *     evaluator ends the run immediately with status `"fail"`.
 * After the last step the scenario-level evaluators (if any) are applied and the final
 * status and score are taken from `computeScore`.
 *
 * Every exit path records a `conversation_finished` trace event before building the bundle.
 *
 * @param {object} deps
 * @param {object} deps.httpConfig - `url`, `request_template`, `response_field`, optional `headers` and `timeout_ms`.
 * @param {object} deps.agentVersion - Agent version record; its `id` is stored on the run.
 * @param {object} deps.scenario - Scenario with `id`, `steps` and optional `evaluators`.
 * @param {string} deps.scenarioFileHash - Hash recorded with the run and the start event.
 * @returns {Promise<object>} Bundle produced by `buildBundle`.
 */
export async function runConversation(deps) {
    const { httpConfig, agentVersion, scenario, scenarioFileHash } = deps;
    const runId = createRunId();
    const startedAt = new Date().toISOString();
    const runStart = performance.now();
    const trace = new TraceRecorder(runId, scenario.id);
    const conversationId = randomUUID();
    const allEvaluatorResults = [];
    // Error codes that are reported verbatim as the termination reason;
    // anything else is reported as a connection failure.
    const knownErrorCodes = new Set([
        "http_connection_failed",
        "http_error",
        "timeout_exceeded",
        "invalid_response_format",
    ]);
    let finalOutput = "";
    let completedSteps = 0;
    // Shared exit path: emit the closing trace event, then assemble the bundle.
    const finish = ({ status, terminationReason, output, score, errorMessage }) => {
        const payload = {
            status,
            terminationReason,
            totalTurns: completedSteps,
            durationMs: Math.round(performance.now() - runStart),
        };
        if (errorMessage !== undefined) {
            payload.errorMessage = errorMessage;
        }
        trace.record("runner", "conversation_finished", payload);
        return buildBundle({
            runId,
            scenario,
            scenarioFileHash,
            agentVersion,
            startedAt,
            runStart,
            status,
            terminationReason,
            finalOutput: output,
            completedSteps,
            allEvaluatorResults,
            trace,
            // When undefined, buildBundle derives the score from the evaluator results.
            score,
        });
    };
    trace.record("runner", "conversation_started", {
        conversationId,
        stepCount: scenario.steps.length,
        agentUrl: httpConfig.url,
        agentVersionId: agentVersion.id,
        scenarioVersionHash: scenarioFileHash,
    });
    for (let stepIndex = 0; stepIndex < scenario.steps.length; stepIndex += 1) {
        const step = scenario.steps[stepIndex];
        trace.record("runner", "turn_started", {
            stepIndex,
            message: step.message,
            conversationId,
        });
        let reply;
        let latencyMs;
        try {
            const result = await callHttpAgent({
                url: httpConfig.url,
                message: step.message,
                conversationId,
                request_template: httpConfig.request_template,
                response_field: httpConfig.response_field,
                headers: httpConfig.headers ?? {},
                timeout_ms: httpConfig.timeout_ms ?? 30000,
            });
            reply = result.reply;
            latencyMs = result.latencyMs;
        }
        catch (error) {
            // `?.` guards against a nullish thrown value, which would otherwise
            // raise a TypeError inside this handler and mask the original failure.
            const code = error?.code;
            return finish({
                status: "error",
                terminationReason: knownErrorCodes.has(code) ? code : "http_connection_failed",
                output: "",
                score: 0,
                errorMessage: error instanceof Error ? error.message : String(error),
            });
        }
        completedSteps += 1;
        finalOutput = reply;
        trace.record("runner", "turn_completed", {
            stepIndex,
            reply,
            latencyMs,
        });
        if (step.evaluators && step.evaluators.length > 0) {
            const stepResults = evaluateStep(reply, latencyMs, step.evaluators, stepIndex);
            for (const result of stepResults) {
                trace.record("evaluator", "step_evaluation_result", {
                    stepIndex,
                    evaluatorId: result.evaluatorId,
                    status: result.status,
                    message: result.message,
                });
                allEvaluatorResults.push(result);
            }
            const hardGateFailed = stepResults.some((r) => r.mode === "hard_gate" && r.status === "fail");
            if (hardGateFailed) {
                return finish({
                    status: "fail",
                    terminationReason: "evaluator_failed",
                    output: finalOutput,
                });
            }
        }
    }
    // End-of-run evaluators
    if (scenario.evaluators && scenario.evaluators.length > 0) {
        trace.record("evaluator", "evaluation_started", {});
        const endResults = evaluateConversationEnd(finalOutput, completedSteps, scenario.evaluators);
        for (const result of endResults) {
            trace.record("evaluator", "evaluation_result", {
                evaluatorId: result.evaluatorId,
                status: result.status,
                message: result.message,
            });
            allEvaluatorResults.push(result);
        }
        trace.record("evaluator", "evaluation_finished", {});
    }
    const scoring = computeScore(allEvaluatorResults);
    return finish({
        status: scoring.status,
        // All steps ran; a failing score is attributed to the evaluators.
        terminationReason: scoring.status === "fail" ? "evaluator_failed" : "completed",
        output: finalOutput,
        score: scoring.score,
    });
}
172
/**
 * Assembles the run bundle returned by the conversation runner.
 *
 * The run duration is measured from `input.runStart` to the moment of the call,
 * and `finishedAt` is stamped at the moment of the call. When `input.score` is
 * null or undefined the score is derived from `computeScore(allEvaluatorResults)`.
 * Tool-call data is always empty (`totalToolCalls: 0`, `toolCalls: []`).
 *
 * @param {object} input - Run state: `runId`, `scenario`, `scenarioFileHash`, `agentVersion`,
 *   `startedAt`, `runStart`, `status`, `terminationReason`, `finalOutput`, `completedSteps`,
 *   `allEvaluatorResults`, `trace`, and optional `score`.
 * @returns {object} `{ run, traceEvents, toolCalls, evaluatorResults, agentVersion }`.
 */
function buildBundle(input) {
    const elapsedMs = Math.round(performance.now() - input.runStart);
    const finishedAt = new Date().toISOString();
    let score = input.score;
    if (score === undefined || score === null) {
        score = computeScore(input.allEvaluatorResults).score;
    }
    return {
        run: {
            id: input.runId,
            scenarioId: input.scenario.id,
            scenarioFileHash: input.scenarioFileHash,
            agentVersionId: input.agentVersion.id,
            status: input.status,
            terminationReason: input.terminationReason,
            finalOutput: input.finalOutput,
            totalSteps: input.completedSteps,
            totalToolCalls: 0,
            durationMs: elapsedMs,
            score,
            startedAt: input.startedAt,
            finishedAt,
        },
        traceEvents: input.trace.getEvents(),
        toolCalls: [],
        evaluatorResults: input.allEvaluatorResults,
        agentVersion: input.agentVersion,
    };
}
@@ -13,6 +13,12 @@ function evaluateOne(evaluator, bundle) {
13
13
  return evaluateExactFinalAnswer(evaluator, bundle.run.finalOutput);
14
14
  case "step_count_max":
15
15
  return evaluateStepCountMax(evaluator, bundle.run.totalSteps);
16
+ case "tool_call_count_max":
17
+ return evaluateToolCallCountMax(evaluator, bundle.run.totalToolCalls);
18
+ case "tool_repeat_max":
19
+ return evaluateToolRepeatMax(evaluator, bundle.toolCalls);
20
+ case "cost_max":
21
+ return evaluateCostMax(evaluator, bundle.run.totalCostUsd);
16
22
  default:
17
23
  return {
18
24
  evaluatorId: evaluator.id,
@@ -86,7 +92,8 @@ function evaluateExactFinalAnswer(evaluator, finalOutput) {
86
92
  };
87
93
  }
88
94
  function evaluateStepCountMax(evaluator, stepCount) {
89
- const max = Number(evaluator.config.max_steps ?? 0);
95
+ const rawMax = evaluator.config.max ?? evaluator.config.max_steps;
96
+ const max = Number(rawMax ?? 0);
90
97
  const passed = stepCount <= max;
91
98
  return {
92
99
  evaluatorId: evaluator.id,
@@ -98,6 +105,54 @@ function evaluateStepCountMax(evaluator, stepCount) {
98
105
  message: passed ? `Step count ${stepCount} is within max ${max}.` : `Step count ${stepCount} exceeds max ${max}.`,
99
106
  };
100
107
  }
108
/**
 * Checks that the total number of tool calls does not exceed `config.max`.
 *
 * A missing `config.max` is read as 0.
 *
 * @param {object} evaluator - Evaluator definition (`id`, `type`, `mode`, `weight`, `config`).
 * @param {number} totalToolCalls - Total tool calls made during the run.
 * @returns {object} Evaluator result; `rawScore` is 1 on pass and 0 on fail.
 */
function evaluateToolCallCountMax(evaluator, totalToolCalls) {
    const limit = Number(evaluator.config.max ?? 0);
    const withinLimit = totalToolCalls <= limit;
    let message;
    if (withinLimit) {
        message = `Tool call count ${totalToolCalls} is within max ${limit}.`;
    }
    else {
        message = `Tool call count ${totalToolCalls} exceeds max ${limit}.`;
    }
    return {
        evaluatorId: evaluator.id,
        evaluatorType: evaluator.type,
        mode: evaluator.mode,
        status: withinLimit ? "pass" : "fail",
        weight: evaluator.weight,
        rawScore: withinLimit ? 1 : 0,
        message,
    };
}
123
/**
 * Checks that one specific tool was not called more than `config.max` times.
 *
 * Calls are counted where `call.toolName` equals `config.tool`. A missing
 * `config.tool` is read as the empty string and a missing `config.max` as 0.
 *
 * @param {object} evaluator - Evaluator definition (`id`, `type`, `mode`, `weight`, `config`).
 * @param {Array<object>} toolCalls - Recorded tool calls, each with a `toolName`.
 * @returns {object} Evaluator result; `rawScore` is 1 on pass and 0 on fail.
 */
function evaluateToolRepeatMax(evaluator, toolCalls) {
    const toolName = String(evaluator.config.tool ?? "");
    const limit = Number(evaluator.config.max ?? 0);
    const uses = toolCalls.reduce((total, call) => (call.toolName === toolName ? total + 1 : total), 0);
    const withinLimit = uses <= limit;
    let message;
    if (withinLimit) {
        message = `Tool '${toolName}' usage count ${uses} is within max ${limit}.`;
    }
    else {
        message = `Tool '${toolName}' usage count ${uses} exceeds max ${limit}.`;
    }
    return {
        evaluatorId: evaluator.id,
        evaluatorType: evaluator.type,
        mode: evaluator.mode,
        status: withinLimit ? "pass" : "fail",
        weight: evaluator.weight,
        rawScore: withinLimit ? 1 : 0,
        message,
    };
}
140
/**
 * Checks that the run's total cost does not exceed `config.max_usd`.
 *
 * A null/undefined `totalCostUsd` is read as 0, and a missing `config.max_usd` as 0.
 *
 * @param {object} evaluator - Evaluator definition (`id`, `type`, `mode`, `weight`, `config`).
 * @param {number|null|undefined} totalCostUsd - Total cost recorded for the run.
 * @returns {object} Evaluator result; `rawScore` is 1 on pass and 0 on fail.
 */
function evaluateCostMax(evaluator, totalCostUsd) {
    const limitUsd = Number(evaluator.config.max_usd ?? 0);
    const spent = totalCostUsd ?? 0;
    const withinBudget = spent <= limitUsd;
    let message;
    if (withinBudget) {
        message = `Total cost ${spent} is within max ${limitUsd}.`;
    }
    else {
        message = `Total cost ${spent} exceeds max ${limitUsd}.`;
    }
    return {
        evaluatorId: evaluator.id,
        evaluatorType: evaluator.type,
        mode: evaluator.mode,
        status: withinBudget ? "pass" : "fail",
        weight: evaluator.weight,
        rawScore: withinBudget ? 1 : 0,
        message,
    };
}
101
156
  function matches(input, match) {
102
157
  if (!isObject(input)) {
103
158
  return false;