agent-regression-lab 0.1.1 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +186 -123
- package/dist/agent/factory.js +20 -6
- package/dist/agent/httpAdapter.js +79 -0
- package/dist/agent/mockAdapter.js +210 -13
- package/dist/config.js +223 -4
- package/dist/conversationEvaluators.js +167 -0
- package/dist/conversationRunner.js +199 -0
- package/dist/evaluators.js +56 -1
- package/dist/index.js +428 -111
- package/dist/lib/id.js +6 -0
- package/dist/runOutput.js +46 -0
- package/dist/runner.js +31 -9
- package/dist/scenarios.js +211 -11
- package/dist/scoring.js +2 -2
- package/dist/storage.js +305 -31
- package/dist/tools.js +284 -0
- package/dist/trace.js +4 -2
- package/dist/ui/App.js +67 -5
- package/dist/ui/server.js +18 -0
- package/dist/ui-assets/client.js +165 -3
- package/docs/agents.md +287 -0
- package/docs/golden-suites.md +74 -0
- package/docs/integrations-and-live-services.md +58 -0
- package/docs/memory-and-stateful-agents.md +51 -0
- package/docs/release-checklist.md +94 -0
- package/docs/runtime-profiles.md +67 -0
- package/docs/scenarios.md +419 -0
- package/docs/tools.md +102 -0
- package/docs/troubleshooting.md +296 -0
- package/docs/variant-sets.md +63 -0
- package/package.json +4 -3
package/dist/lib/id.js
CHANGED
|
@@ -5,6 +5,9 @@ export function hashText(text) {
|
|
|
5
5
|
/**
 * Creates the identifier for a single run.
 *
 * The millisecond timestamp alone is not unique: two runs started within the
 * same millisecond (for example from parallel CLI processes) would receive the
 * same id. A short random suffix is appended, mirroring createSuiteBatchId, so
 * ids stay unique while still beginning with `run_<timestamp>`.
 *
 * @returns {string} An id of the form `run_<epoch ms>_<8 hex chars>`.
 */
export function createRunId() {
    return `run_${Date.now()}_${randomUUID().slice(0, 8)}`;
}
|
|
8
|
+
/**
 * Creates the identifier for a suite batch.
 *
 * @returns {string} `suite_<epoch ms>_<first 8 characters of a random UUID>`.
 */
export function createSuiteBatchId() {
    const startedAt = Date.now();
    const suffix = randomUUID().slice(0, 8);
    return ["suite", startedAt, suffix].join("_");
}
|
|
8
11
|
/**
 * Creates the identifier for a trace event.
 *
 * @returns {string} `evt_` followed by a full random UUID.
 */
export function createEventId() {
    const uuid = randomUUID();
    return ["evt", uuid].join("_");
}
|
|
@@ -14,3 +17,6 @@ export function createToolCallId() {
|
|
|
14
17
|
/**
 * Derives an agent version id from a label and its configuration.
 *
 * The label and the JSON-serialized config are joined with `:` and hashed;
 * the first 12 characters of that hash become the id suffix. JSON.stringify is
 * sensitive to key order, so configs that differ only in key order produce
 * different ids.
 *
 * @param {string} label - Name that scopes the id.
 * @param {unknown} config - Value serialized with JSON.stringify.
 * @returns {string} `agent_<12 hash characters>`.
 */
export function createAgentVersionId(label, config) {
    const serializedConfig = JSON.stringify(config);
    const digest = hashText(`${label}:${serializedConfig}`);
    return `agent_${digest.slice(0, 12)}`;
}
|
|
20
|
+
/**
 * Hashes an arbitrary configuration value.
 *
 * Delegates to createAgentVersionId with the fixed label "config", so the
 * result carries the same `agent_` prefix as agent version ids.
 *
 * @param {unknown} input - Configuration value to hash.
 * @returns {string} The id produced by createAgentVersionId.
 */
export function createConfigHash(input) {
    const label = "config";
    return createAgentVersionId(label, input);
}
|
package/dist/runOutput.js
CHANGED
|
@@ -1,5 +1,11 @@
|
|
|
1
1
|
export function getRunErrorDetail(bundle) {
|
|
2
2
|
for (const event of [...bundle.traceEvents].reverse()) {
|
|
3
|
+
if (event.type === "conversation_finished") {
|
|
4
|
+
const errorMessage = event.payload.errorMessage;
|
|
5
|
+
if (typeof errorMessage === "string") {
|
|
6
|
+
return errorMessage;
|
|
7
|
+
}
|
|
8
|
+
}
|
|
3
9
|
if (event.type === "agent_error") {
|
|
4
10
|
const message = event.payload.message;
|
|
5
11
|
return typeof message === "string" ? message : undefined;
|
|
@@ -11,3 +17,43 @@ export function getRunErrorDetail(bundle) {
|
|
|
11
17
|
}
|
|
12
18
|
return undefined;
|
|
13
19
|
}
|
|
20
|
+
/**
 * Rewrites low-level error text into a message suitable for CLI output.
 *
 * String input behaves exactly as before. Non-string input no longer raises a
 * TypeError from `.includes`: an Error contributes its `message`, and anything
 * else is converted with String(), so the real failure is always reported.
 *
 * @param {unknown} message - Error text (or a thrown value) to format.
 * @returns {string} A friendlier hint for SQLite lock errors, otherwise the
 *   message text unchanged.
 */
export function formatCliErrorMessage(message) {
    let text;
    if (typeof message === "string") {
        text = message;
    }
    else if (message instanceof Error) {
        text = message.message;
    }
    else {
        text = String(message);
    }
    if (text.includes("database is locked")) {
        return "SQLite database is locked. Retry the run sequentially or wait for the current run to finish.";
    }
    return text;
}
|
|
26
|
+
/**
 * Summarizes every failed evaluator result in a run bundle.
 *
 * @param {{ evaluatorResults: Array<{ status: string, evaluatorId: string, message: string }> }} bundle
 *   Run bundle whose evaluator results are inspected.
 * @returns {string[]} One `<evaluatorId>: <message>` entry per result whose
 *   status is "fail", in the original order.
 */
export function getFailedEvaluatorSummaries(bundle) {
    return bundle.evaluatorResults.flatMap((result) => {
        if (result.status !== "fail") {
            return [];
        }
        return [`${result.evaluatorId}: ${result.message}`];
    });
}
|
|
31
|
+
/**
 * Builds the human-readable identity lines for a run.
 *
 * Each known identity field on `bundle.run` that holds a truthy value yields
 * one `<Label>: <value>` line. The label/field pairs live in a single table so
 * a new field is one new row rather than another copy-pasted conditional.
 *
 * @param {{ run: Record<string, unknown> }} bundle - Run bundle to describe.
 * @returns {string[]} Lines in fixed display order; empty when no identity
 *   field is set.
 */
export function formatRunIdentityLines(bundle) {
    // Display order is significant: it is the order the lines are printed in.
    const identityFields = [
        ["Variant set", "variantSetName"],
        ["Variant", "variantLabel"],
        ["Prompt version", "promptVersion"],
        ["Model version", "modelVersion"],
        ["Tool schema version", "toolSchemaVersion"],
        ["Config label", "configLabel"],
        ["Runtime profile", "runtimeProfileName"],
        ["Suite definition", "suiteDefinitionName"],
    ];
    const run = bundle.run;
    const lines = [];
    for (const [label, field] of identityFields) {
        // Truthiness check: unset and empty-string fields are both omitted.
        if (run[field]) {
            lines.push(`${label}: ${run[field]}`);
        }
    }
    return lines;
}
|
package/dist/runner.js
CHANGED
|
@@ -1,7 +1,9 @@
|
|
|
1
1
|
import { performance } from "node:perf_hooks";
|
|
2
|
+
import { getRuntimeProfile } from "./config.js";
|
|
2
3
|
import { createToolCallId, createRunId } from "./lib/id.js";
|
|
3
4
|
import { evaluateScenario } from "./evaluators.js";
|
|
4
5
|
import { computeScore } from "./scoring.js";
|
|
6
|
+
import { applyRuntimeProfileToTools } from "./tools.js";
|
|
5
7
|
import { TraceRecorder } from "./trace.js";
|
|
6
8
|
export async function runScenario(deps) {
|
|
7
9
|
const runId = createRunId();
|
|
@@ -9,6 +11,8 @@ export async function runScenario(deps) {
|
|
|
9
11
|
const runStart = performance.now();
|
|
10
12
|
const trace = new TraceRecorder(runId, deps.scenario.id);
|
|
11
13
|
const toolCalls = [];
|
|
14
|
+
const runtimeProfile = deps.scenario.runtime_profile ? getRuntimeProfile(deps.scenario.runtime_profile) : undefined;
|
|
15
|
+
const tools = applyRuntimeProfileToTools(deps.tools, runtimeProfile, trace);
|
|
12
16
|
const maxSteps = deps.scenario.runtime?.max_steps ?? 8;
|
|
13
17
|
const timeoutSeconds = deps.scenario.runtime?.timeout_seconds;
|
|
14
18
|
const deadline = timeoutSeconds ? Date.now() + timeoutSeconds * 1000 : undefined;
|
|
@@ -22,6 +26,9 @@ export async function runScenario(deps) {
|
|
|
22
26
|
maxSteps,
|
|
23
27
|
timeoutSeconds,
|
|
24
28
|
});
|
|
29
|
+
trace.record("system", "runtime_profile_applied", {
|
|
30
|
+
name: runtimeProfile?.name ?? null,
|
|
31
|
+
}, { countStep: false });
|
|
25
32
|
const availableTools = deps.toolSpecs.filter((tool) => deps.scenario.tools.allowed.includes(tool.name));
|
|
26
33
|
const session = await deps.agentAdapter.startRun({
|
|
27
34
|
instructions: deps.scenario.task.instructions,
|
|
@@ -72,7 +79,7 @@ export async function runScenario(deps) {
|
|
|
72
79
|
trace.record("runner", "forbidden_tool_attempted", { toolName });
|
|
73
80
|
break;
|
|
74
81
|
}
|
|
75
|
-
const handler =
|
|
82
|
+
const handler = tools[toolName];
|
|
76
83
|
if (!handler) {
|
|
77
84
|
status = "error";
|
|
78
85
|
terminationReason = "tool_error";
|
|
@@ -98,7 +105,8 @@ export async function runScenario(deps) {
|
|
|
98
105
|
}
|
|
99
106
|
catch (error) {
|
|
100
107
|
const message = error instanceof Error ? error.message : String(error);
|
|
101
|
-
|
|
108
|
+
const isInjectedTimeout = error instanceof Error && error.code === "timeout_exceeded";
|
|
109
|
+
if (isInjectedTimeout || (deadline && Date.now() >= deadline)) {
|
|
102
110
|
status = "error";
|
|
103
111
|
terminationReason = "timeout_exceeded";
|
|
104
112
|
trace.record("runner", "timeout_exceeded", { timeoutSeconds, message });
|
|
@@ -182,18 +190,32 @@ export async function runScenario(deps) {
|
|
|
182
190
|
/**
 * Reports whether a run deadline has passed.
 *
 * @param {number | undefined} deadline - Epoch milliseconds, or undefined when
 *   the run has no timeout.
 * @returns {boolean} False when there is no deadline; otherwise whether the
 *   current time has reached it.
 */
function hasTimedOut(deadline) {
    if (deadline === undefined) {
        return false;
    }
    return Date.now() >= deadline;
}
|
|
193
|
+
/**
 * Creates the error used to signal that a raced operation ran out of time.
 *
 * The `code` property lets callers recognize the timeout without matching on
 * the message text.
 *
 * @param {string} message - Error message.
 * @returns {Error & { code: string }} Error whose `code` is "timeout_exceeded".
 */
function toolRaceTimeoutError(message) {
    return Object.assign(new Error(message), { code: "timeout_exceeded" });
}
|
|
185
198
|
/**
 * Awaits a promise, rejecting if the deadline passes first.
 *
 * @param {Promise<unknown>} promise - Operation to wait for.
 * @param {number | undefined} deadline - Epoch milliseconds; undefined
 *   disables the timeout entirely.
 * @param {string} message - Message for the timeout error.
 * @returns {Promise<unknown>} The settled value of `promise`.
 * @throws {Error} An error from toolRaceTimeoutError when the deadline has
 *   already passed or passes before `promise` settles.
 */
async function raceWithTimeout(promise, deadline, message) {
    if (deadline === undefined) {
        return promise;
    }
    const remainingMs = deadline - Date.now();
    if (remainingMs <= 0) {
        throw toolRaceTimeoutError(message);
    }
    let timer;
    const expiry = new Promise((_resolve, reject) => {
        timer = setTimeout(() => {
            reject(toolRaceTimeoutError(message));
        }, remainingMs);
        // Do not let a pending timeout keep the process alive.
        timer.unref?.();
    });
    try {
        return await Promise.race([promise, expiry]);
    }
    finally {
        // Always release the timer, whichever side of the race settled first.
        if (timer !== undefined) {
            clearTimeout(timer);
        }
    }
}
|
package/dist/scenarios.js
CHANGED
|
@@ -2,9 +2,20 @@ import { readFileSync, readdirSync, statSync } from "node:fs";
|
|
|
2
2
|
import { createHash } from "node:crypto";
|
|
3
3
|
import { join, relative, resolve } from "node:path";
|
|
4
4
|
import { parse } from "yaml";
|
|
5
|
-
import { loadAgentLabConfig } from "./config.js";
|
|
5
|
+
import { getRuntimeProfile, getSuiteDefinition, loadAgentLabConfig } from "./config.js";
|
|
6
6
|
import { getBuiltinToolSpecs } from "./tools.js";
|
|
7
7
|
const SCENARIOS_ROOT = resolve("scenarios");
|
|
8
|
+
// Evaluator types accepted in task scenarios. Insertion order is preserved
// because the set is spread into the "Valid types" list of validation errors.
const VALID_TASK_EVALUATOR_TYPES = new Set([
    // Final-answer checks
    "exact_final_answer",
    "final_answer_contains",
    // Tool-usage checks
    "forbidden_tool",
    "tool_call_assertion",
    // Budget checks
    "step_count_max",
    "tool_call_count_max",
    "tool_repeat_max",
    "cost_max",
]);
// Evaluator modes accepted in task scenarios.
const VALID_EVALUATOR_MODES = new Set([
    "hard_gate",
    "weighted",
]);
|
|
8
19
|
export function listScenarioFiles(root = SCENARIOS_ROOT) {
|
|
9
20
|
if (!safeExists(root)) {
|
|
10
21
|
return [];
|
|
@@ -26,19 +37,37 @@ export function listScenarioFiles(root = SCENARIOS_ROOT) {
|
|
|
26
37
|
return results.sort();
|
|
27
38
|
}
|
|
28
39
|
/**
 * Lists a summary of every loadable scenario, task and conversation alike.
 *
 * Listing is deliberately best-effort: a file that cannot be read, parsed or
 * validated is left out instead of failing the whole listing.
 * NOTE(review): skipped files are not reported anywhere; consider surfacing
 * them if silent omissions confuse users.
 *
 * The known tool names are resolved at most once, on the first task scenario,
 * rather than once per file. The lookup stays inside the try block so that a
 * failure there still only drops task scenarios, as before.
 *
 * @returns {Array<{ id: string, name: string, suite: string, difficulty: unknown, description: unknown }>}
 *   One summary per scenario that loaded successfully.
 */
export function listScenarios() {
    let knownToolNames;
    return listScenarioFiles().flatMap((filePath) => {
        try {
            let loaded;
            if (getScenarioType(filePath) === "conversation") {
                loaded = loadConversationScenarioByPath(filePath);
            }
            else {
                if (knownToolNames === undefined) {
                    knownToolNames = getKnownToolNames();
                }
                loaded = loadScenarioByPath(filePath, knownToolNames);
            }
            const { id, name, suite, difficulty, description } = loaded.definition;
            return [{ id, name, suite, difficulty, description }];
        }
        catch {
            return [];
        }
    });
}
|
|
40
67
|
export function loadScenarioById(scenarioId) {
|
|
41
68
|
for (const filePath of listScenarioFiles()) {
|
|
69
|
+
if (getScenarioType(filePath) !== "task")
|
|
70
|
+
continue;
|
|
42
71
|
const loaded = loadScenarioByPath(filePath, getKnownToolNames());
|
|
43
72
|
if (loaded.definition.id === scenarioId) {
|
|
44
73
|
return loaded;
|
|
@@ -48,9 +77,24 @@ export function loadScenarioById(scenarioId) {
|
|
|
48
77
|
}
|
|
49
78
|
/**
 * Loads every task scenario that belongs to the given suite.
 *
 * Conversation scenarios are ignored. The known tool names are resolved once
 * and shared across all files, as loadScenariosBySuiteDefinition already does,
 * instead of being recomputed for every file.
 *
 * @param {string} suite - Suite name to match against `definition.suite`.
 * @returns {Array<object>} Loaded task scenario records for that suite.
 * @throws {Error} Whatever loadScenarioByPath throws for an invalid task file.
 */
export function loadScenariosBySuite(suite) {
    const taskFiles = listScenarioFiles()
        .filter((filePath) => getScenarioType(filePath) === "task");
    if (taskFiles.length === 0) {
        // No task files: skip the tool-name lookup entirely, as before.
        return [];
    }
    const knownToolNames = getKnownToolNames();
    return taskFiles
        .map((filePath) => loadScenarioByPath(filePath, knownToolNames))
        .filter(({ definition }) => definition.suite === suite);
}
|
|
84
|
+
/**
 * Loads the scenarios selected by a named suite definition.
 *
 * Every scenario file (task or conversation) is loaded. A scenario is kept
 * when it matches the definition's include selectors and its id is not among
 * the ids matched by the exclude selectors.
 *
 * @param {string} name - Suite definition name, resolved via getSuiteDefinition.
 * @returns {Array<object>} Selected scenario records sorted by definition id.
 */
export function loadScenariosBySuiteDefinition(name) {
    const suiteDefinition = getSuiteDefinition(name);
    const knownToolNames = getKnownToolNames();
    const records = [];
    for (const filePath of listScenarioFiles(resolve("scenarios"))) {
        records.push(loadScenarioRecordByPath(filePath, knownToolNames));
    }
    const candidates = [];
    const excludedIds = new Set();
    for (const record of records) {
        if (matchesSuiteDefinitionInclude(record.definition, suiteDefinition)) {
            candidates.push(record);
        }
        if (matchesSuiteDefinitionExclude(record.definition, suiteDefinition)) {
            excludedIds.add(record.definition.id);
        }
    }
    const selected = candidates.filter((record) => !excludedIds.has(record.definition.id));
    selected.sort((left, right) => left.definition.id.localeCompare(right.definition.id));
    return selected;
}
|
|
54
98
|
export function loadScenarioByPath(filePath, knownToolNames = getKnownToolNames()) {
|
|
55
99
|
const absolutePath = resolve(filePath);
|
|
56
100
|
const raw = readFileSync(absolutePath, "utf8");
|
|
@@ -90,7 +134,10 @@ function validateScenario(value, filePath, knownToolNames) {
|
|
|
90
134
|
throw new Error(`Scenario file '${filePath}' references unknown allowed tool '${toolName}'.`);
|
|
91
135
|
}
|
|
92
136
|
}
|
|
93
|
-
if (
|
|
137
|
+
if (value.tools.forbidden !== undefined) {
|
|
138
|
+
if (!Array.isArray(value.tools.forbidden)) {
|
|
139
|
+
throw new Error(`Scenario file '${filePath}' field 'tools.forbidden' must be an array of strings.`);
|
|
140
|
+
}
|
|
94
141
|
for (const toolName of value.tools.forbidden) {
|
|
95
142
|
if (typeof toolName !== "string") {
|
|
96
143
|
throw new Error(`Scenario file '${filePath}' contains a non-string tool name in tools.forbidden.`);
|
|
@@ -105,6 +152,17 @@ function validateScenario(value, filePath, knownToolNames) {
|
|
|
105
152
|
if (!isObject(evaluator) || typeof evaluator.id !== "string" || typeof evaluator.type !== "string") {
|
|
106
153
|
throw new Error(`Scenario file '${filePath}' has an invalid evaluator entry.`);
|
|
107
154
|
}
|
|
155
|
+
if (!VALID_TASK_EVALUATOR_TYPES.has(evaluator.type)) {
|
|
156
|
+
throw new Error(`Scenario file '${filePath}' evaluator '${evaluator.id}' has invalid type '${evaluator.type}'. ` +
|
|
157
|
+
`Valid types: ${[...VALID_TASK_EVALUATOR_TYPES].join(", ")}.`);
|
|
158
|
+
}
|
|
159
|
+
if (!VALID_EVALUATOR_MODES.has(evaluator.mode)) {
|
|
160
|
+
throw new Error(`Scenario file '${filePath}' evaluator '${evaluator.id}' has invalid mode '${String(evaluator.mode)}'. ` +
|
|
161
|
+
`Valid modes: hard_gate, weighted.`);
|
|
162
|
+
}
|
|
163
|
+
if (!isObject(evaluator.config)) {
|
|
164
|
+
throw new Error(`Scenario file '${filePath}' evaluator '${evaluator.id}' must define an object config.`);
|
|
165
|
+
}
|
|
108
166
|
if (evaluatorIds.has(evaluator.id)) {
|
|
109
167
|
throw new Error(`Scenario file '${filePath}' defines duplicate evaluator id '${evaluator.id}'.`);
|
|
110
168
|
}
|
|
@@ -114,6 +172,12 @@ function validateScenario(value, filePath, knownToolNames) {
|
|
|
114
172
|
validatePositiveInt(value.runtime.max_steps, "runtime.max_steps", filePath);
|
|
115
173
|
validatePositiveInt(value.runtime.timeout_seconds, "runtime.timeout_seconds", filePath);
|
|
116
174
|
}
|
|
175
|
+
if (value.runtime_profile !== undefined) {
|
|
176
|
+
if (typeof value.runtime_profile !== "string" || value.runtime_profile.length === 0) {
|
|
177
|
+
throw new Error(`Scenario file '${filePath}' field 'runtime_profile' must be a non-empty string.`);
|
|
178
|
+
}
|
|
179
|
+
getRuntimeProfile(value.runtime_profile);
|
|
180
|
+
}
|
|
117
181
|
if (isObject(value.context) && Array.isArray(value.context.fixtures)) {
|
|
118
182
|
for (const fixturePath of value.context.fixtures) {
|
|
119
183
|
if (typeof fixturePath !== "string") {
|
|
@@ -153,3 +217,139 @@ function getKnownToolNames() {
|
|
|
153
217
|
}
|
|
154
218
|
return names;
|
|
155
219
|
}
|
|
220
|
+
/**
 * Determines whether a scenario file is a conversation or a task scenario.
 *
 * @param {string} filePath - Path to the scenario YAML file.
 * @returns {"conversation" | "task"} "conversation" only when the parsed
 *   document is an object whose `type` is "conversation"; "task" otherwise.
 */
export function getScenarioType(filePath) {
    const source = readFileSync(resolve(filePath), "utf8");
    const document = parse(source);
    const isConversation = isObject(document) && document.type === "conversation";
    return isConversation ? "conversation" : "task";
}
|
|
229
|
+
/**
 * Reads, parses and validates a conversation scenario file.
 *
 * @param {string} filePath - Path to the scenario YAML file.
 * @returns {{ definition: object, filePath: string, fileHash: string }} The
 *   parsed definition, the path relative to the current working directory,
 *   and the SHA-256 hex digest of the raw file text.
 * @throws {Error} When validateConversationScenario rejects the document.
 */
export function loadConversationScenarioByPath(filePath) {
    const absolutePath = resolve(filePath);
    const source = readFileSync(absolutePath, "utf8");
    const definition = parse(source);
    validateConversationScenario(definition, absolutePath);
    const relativePath = relative(process.cwd(), absolutePath);
    const fileHash = createHash("sha256").update(source).digest("hex");
    return { definition, filePath: relativePath, fileHash };
}
|
|
240
|
+
/**
 * Finds and loads the conversation scenario with the given id.
 *
 * Files are probed with a raw parse first; only the matching file is fully
 * loaded and validated.
 *
 * @param {string} scenarioId - Id of the conversation scenario to load.
 * @returns {{ definition: object, filePath: string, fileHash: string }} The
 *   record produced by loadConversationScenarioByPath.
 * @throws {Error} When no conversation scenario has that id, or when the
 *   matching file fails validation.
 */
export function loadConversationScenarioById(scenarioId) {
    for (const filePath of listScenarioFiles()) {
        const raw = readFileSync(resolve(filePath), "utf8");
        const parsed = parse(raw);
        // An empty or scalar YAML document does not parse to an object; skip
        // it rather than crash on `parsed.type`, matching getScenarioType.
        if (!isObject(parsed)) {
            continue;
        }
        if (parsed.type === "conversation" && parsed.id === scenarioId) {
            return loadConversationScenarioByPath(filePath);
        }
    }
    throw new Error(`Conversation scenario '${scenarioId}' not found.`);
}
|
|
251
|
+
// Evaluator types accepted in conversation scenarios. Insertion order is
// preserved because the set is spread into the "Valid types" list of
// validation errors.
const VALID_CONVERSATION_EVALUATOR_TYPES = new Set([
    // Response checks
    "response_contains",
    "response_not_contains",
    "response_matches_regex",
    "response_latency_max",
    // Budget check
    "step_count_max",
    // Final-answer checks
    "exact_final_answer",
    "final_answer_contains",
]);
|
|
260
|
+
/**
 * Validates a list of conversation evaluators.
 *
 * Checks, in order for each entry: it is an object, its type is a known
 * conversation evaluator type, and its mode is hard_gate or weighted. The
 * response_contains and response_not_contains types must also carry an object
 * config with a string-array `keywords` and no legacy `text` key.
 *
 * @param {unknown} evaluators - Value expected to be an array of evaluators.
 * @param {string} context - Where the list sits (used in error messages).
 * @param {string} filePath - Scenario file path (used in error messages).
 * @throws {Error} On the first rule that is violated.
 */
function validateConversationEvaluatorList(evaluators, context, filePath) {
    const scope = `Conversation scenario '${filePath}' ${context}`;
    if (!Array.isArray(evaluators)) {
        throw new Error(`${scope} evaluators must be an array.`);
    }
    const keywordTypes = ["response_contains", "response_not_contains"];
    for (const [index, evaluator] of evaluators.entries()) {
        const subject = `${scope} evaluator ${index}`;
        if (!isObject(evaluator)) {
            throw new Error(`${subject} must be an object.`);
        }
        const hasKnownType = typeof evaluator.type === "string" && VALID_CONVERSATION_EVALUATOR_TYPES.has(evaluator.type);
        if (!hasKnownType) {
            const validTypes = [...VALID_CONVERSATION_EVALUATOR_TYPES].join(", ");
            throw new Error(`${subject} has invalid type '${String(evaluator.type)}'. Valid types: ${validTypes}.`);
        }
        if (!["hard_gate", "weighted"].includes(evaluator.mode)) {
            throw new Error(`${subject} must have mode: hard_gate or weighted.`);
        }
        if (!keywordTypes.includes(evaluator.type)) {
            continue;
        }
        const config = evaluator.config;
        if (!isObject(config)) {
            throw new Error(`${subject} must define an object config.`);
        }
        if ("text" in config) {
            throw new Error(`${subject} uses stale 'config.text'; use 'config.keywords: string[]'.`);
        }
        const hasStringKeywords = Array.isArray(config.keywords) && config.keywords.every((keyword) => typeof keyword === "string");
        if (!hasStringKeywords) {
            throw new Error(`${subject} must define config.keywords as a string array.`);
        }
    }
}
|
|
289
|
+
/**
 * Validates the parsed document of a conversation scenario file.
 *
 * Checks, in order: the document is an object; `id`, `name` and `suite` are
 * non-empty strings; `type` is "conversation"; an optional `runtime_profile`
 * is a non-empty string accepted by getRuntimeProfile; `tools` is absent;
 * `steps` is a non-empty array of user steps with non-empty messages; and any
 * per-step or end-of-run evaluator lists are valid.
 *
 * @param {unknown} value - Parsed YAML document.
 * @param {string} filePath - Scenario file path (used in error messages).
 * @throws {Error} On the first rule that is violated.
 */
function validateConversationScenario(value, filePath) {
    if (!isObject(value)) {
        throw new Error(`Scenario file '${filePath}' must contain a YAML object.`);
    }
    for (const field of ["id", "name", "suite"]) {
        if (typeof value[field] !== "string" || value[field].length === 0) {
            throw new Error(`Conversation scenario '${filePath}' is missing required string field '${field}'.`);
        }
    }
    if (value.type !== "conversation") {
        throw new Error(`Scenario file '${filePath}' does not have type: conversation.`);
    }
    if (value.runtime_profile !== undefined) {
        if (typeof value.runtime_profile !== "string" || value.runtime_profile.length === 0) {
            throw new Error(`Conversation scenario '${filePath}' field 'runtime_profile' must be a non-empty string.`);
        }
        // Called for validation only; the returned profile is not used here.
        // NOTE(review): presumably throws for an unknown profile name — confirm in config.js.
        getRuntimeProfile(value.runtime_profile);
    }
    if ("tools" in value) {
        throw new Error(`Conversation scenario '${filePath}' must not define 'tools'. HTTP agents manage their own tools internally.`);
    }
    if (!Array.isArray(value.steps) || value.steps.length === 0) {
        throw new Error(`Conversation scenario '${filePath}' must define at least one step.`);
    }
    for (let i = 0; i < value.steps.length; i += 1) {
        const step = value.steps[i];
        if (!isObject(step)) {
            throw new Error(`Conversation scenario '${filePath}' step ${i} must be an object.`);
        }
        if (step.role !== "user") {
            throw new Error(`Conversation scenario '${filePath}' step ${i} must have role: user.`);
        }
        if (typeof step.message !== "string" || step.message.length === 0) {
            throw new Error(`Conversation scenario '${filePath}' step ${i} must have a non-empty message.`);
        }
        if (step.evaluators !== undefined) {
            validateConversationEvaluatorList(step.evaluators, `step ${i}`, filePath);
        }
    }
    if (value.evaluators !== undefined) {
        // The list validator appends "evaluators" / "evaluator N" to the
        // context itself, so the context must not repeat the word; otherwise
        // messages read "end-of-run evaluators evaluators must be an array".
        validateConversationEvaluatorList(value.evaluators, "end-of-run", filePath);
    }
}
|
|
332
|
+
/**
 * Loads a scenario file with the loader that matches its type.
 *
 * @param {string} filePath - Path to the scenario YAML file.
 * @param {unknown} [knownToolNames] - Tool names passed to the task loader;
 *   defaults to getKnownToolNames(). Not used for conversation scenarios.
 * @returns {object} The record returned by the selected loader.
 */
function loadScenarioRecordByPath(filePath, knownToolNames = getKnownToolNames()) {
    return getScenarioType(filePath) === "conversation"
        ? loadConversationScenarioByPath(filePath)
        : loadScenarioByPath(filePath, knownToolNames);
}
|
|
338
|
+
/**
 * Tests a scenario definition against a suite definition's include selectors.
 *
 * @param {object} definition - Scenario definition.
 * @param {{ include: object }} suiteDefinition - Suite definition.
 * @returns {boolean} Whether any include selector matches.
 */
function matchesSuiteDefinitionInclude(definition, suiteDefinition) {
    const includeSelectors = suiteDefinition.include;
    return matchesSuiteDefinitionSelectors(definition, includeSelectors);
}
|
|
341
|
+
/**
 * Tests a scenario definition against a suite definition's exclude selectors.
 *
 * @param {object} definition - Scenario definition.
 * @param {{ exclude?: object }} suiteDefinition - Suite definition.
 * @returns {boolean} False when no exclude selectors are defined; otherwise
 *   whether any of them matches.
 */
function matchesSuiteDefinitionExclude(definition, suiteDefinition) {
    if (suiteDefinition.exclude === undefined) {
        return false;
    }
    return matchesSuiteDefinitionSelectors(definition, suiteDefinition.exclude);
}
|
|
344
|
+
/**
 * Tests a scenario definition against one set of selectors.
 *
 * A definition matches when its id is listed in `scenarios`, when it shares at
 * least one tag with `tags`, or when its suite is listed in `suites`. Selector
 * lists that are absent never match.
 *
 * @param {{ id: string, suite: string, tags?: string[] }} definition
 *   Scenario definition.
 * @param {{ scenarios?: string[], tags?: string[], suites?: string[] }} selectors
 *   Selector lists to test against.
 * @returns {boolean} Whether any selector matches.
 */
function matchesSuiteDefinitionSelectors(definition, selectors) {
    // `||` keeps the id -> tags -> suites evaluation order and short-circuits
    // on the first match.
    return Boolean(selectors.scenarios?.includes(definition.id) ||
        selectors.tags?.some((tag) => definition.tags?.includes(tag) ?? false) ||
        selectors.suites?.includes(definition.suite));
}
|
package/dist/scoring.js
CHANGED
|
@@ -4,9 +4,9 @@ export function computeScore(results) {
|
|
|
4
4
|
const weighted = results.filter((result) => result.mode === "weighted");
|
|
5
5
|
let score = 100;
|
|
6
6
|
if (weighted.length > 0) {
|
|
7
|
-
const totalWeight = weighted.reduce((sum, result) => sum + (result.weight ??
|
|
7
|
+
const totalWeight = weighted.reduce((sum, result) => sum + (result.weight ?? 1), 0);
|
|
8
8
|
const earnedWeight = weighted.reduce((sum, result) => {
|
|
9
|
-
const weight = result.weight ??
|
|
9
|
+
const weight = result.weight ?? 1;
|
|
10
10
|
return sum + (result.status === "pass" ? weight : 0);
|
|
11
11
|
}, 0);
|
|
12
12
|
score = totalWeight === 0 ? 100 : Math.round((earnedWeight / totalWeight) * 100);
|