agent-regression-lab 0.2.0 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +53 -7
- package/dist/agent/factory.js +20 -6
- package/dist/agent/httpAdapter.js +5 -4
- package/dist/config.js +186 -3
- package/dist/evaluators.js +56 -1
- package/dist/index.js +143 -11
- package/dist/lib/id.js +3 -0
- package/dist/runOutput.js +46 -0
- package/dist/runner.js +31 -9
- package/dist/scenarios.js +90 -2
- package/dist/scoring.js +2 -2
- package/dist/storage.js +117 -7
- package/dist/tools.js +38 -0
- package/dist/trace.js +4 -2
- package/dist/ui/App.js +28 -2
- package/dist/ui-assets/client.js +82 -0
- package/docs/agents.md +143 -8
- package/docs/golden-suites.md +74 -0
- package/docs/integrations-and-live-services.md +58 -0
- package/docs/memory-and-stateful-agents.md +51 -0
- package/docs/release-checklist.md +30 -0
- package/docs/runtime-profiles.md +67 -0
- package/docs/scenarios.md +303 -56
- package/docs/troubleshooting.md +138 -0
- package/docs/variant-sets.md +63 -0
- package/package.json +2 -2
package/dist/index.js
CHANGED
|
@@ -1,9 +1,10 @@
|
|
|
1
1
|
#!/usr/bin/env node
|
|
2
2
|
import packageJson from "../package.json" with { type: "json" };
|
|
3
|
+
import { pathToFileURL } from "node:url";
|
|
3
4
|
import { createAgentFactory } from "./agent/factory.js";
|
|
4
|
-
import { getAgentRegistration } from "./config.js";
|
|
5
|
-
import { createSuiteBatchId } from "./lib/id.js";
|
|
6
|
-
import { getRunErrorDetail } from "./runOutput.js";
|
|
5
|
+
import { getAgentRegistration, getVariantSet } from "./config.js";
|
|
6
|
+
import { createConfigHash, createSuiteBatchId } from "./lib/id.js";
|
|
7
|
+
import { formatCliErrorMessage, formatRunIdentityLines, getFailedEvaluatorSummaries, getRunErrorDetail } from "./runOutput.js";
|
|
7
8
|
async function main() {
|
|
8
9
|
const [, , command, ...args] = process.argv;
|
|
9
10
|
switch (command) {
|
|
@@ -41,6 +42,8 @@ function printUsage() {
|
|
|
41
42
|
agentlab list scenarios
|
|
42
43
|
agentlab run <scenario-id> [--agent <name>] [--provider mock|openai|external_process|http] [--model <model>] [--agent-label <label>]
|
|
43
44
|
agentlab run --suite <suite-id> [--agent <name>] [--provider mock|openai|external_process|http] [--model <model>] [--agent-label <label>]
|
|
45
|
+
agentlab run --suite-def <name> [--agent <name>]
|
|
46
|
+
agentlab run <scenario-id> [--variant-set <name>]
|
|
44
47
|
agentlab show <run-id>
|
|
45
48
|
agentlab compare <baseline-run-id> <candidate-run-id>
|
|
46
49
|
agentlab compare --suite <baseline-batch-id> <candidate-batch-id>
|
|
@@ -64,7 +67,13 @@ async function handleList(args) {
|
|
|
64
67
|
async function handleRun(args) {
|
|
65
68
|
const parsed = parseRunArgs(args);
|
|
66
69
|
const runtimeConfig = validateRuntimeConfig(parsed.runtimeConfig);
|
|
67
|
-
const { loadScenariosBySuite } = await import("./scenarios.js");
|
|
70
|
+
const { loadScenariosBySuite, loadScenariosBySuiteDefinition } = await import("./scenarios.js");
|
|
71
|
+
if (parsed.suite && parsed.suiteDefinition) {
|
|
72
|
+
throw new Error("--suite and --suite-def cannot be used together.");
|
|
73
|
+
}
|
|
74
|
+
if (parsed.runtimeConfig.agentName && parsed.variantSetName) {
|
|
75
|
+
throw new Error("--agent and --variant-set cannot be used together.");
|
|
76
|
+
}
|
|
68
77
|
if (parsed.suite) {
|
|
69
78
|
const suite = parsed.suite;
|
|
70
79
|
const scenarios = loadScenariosBySuite(suite);
|
|
@@ -73,16 +82,53 @@ async function handleRun(args) {
|
|
|
73
82
|
}
|
|
74
83
|
const suiteBatchId = createSuiteBatchId();
|
|
75
84
|
const runs = [];
|
|
76
|
-
|
|
77
|
-
|
|
85
|
+
if (parsed.variantSetName) {
|
|
86
|
+
console.log(`Variant set: ${parsed.variantSetName}`);
|
|
87
|
+
for (const scenario of scenarios) {
|
|
88
|
+
runs.push(...await executeVariantSetScenario(scenario.definition.id, parsed.variantSetName, suiteBatchId));
|
|
89
|
+
}
|
|
90
|
+
}
|
|
91
|
+
else {
|
|
92
|
+
for (const scenario of scenarios) {
|
|
93
|
+
runs.push(await executeOne(scenario.definition.id, runtimeConfig, suiteBatchId));
|
|
94
|
+
}
|
|
78
95
|
}
|
|
79
96
|
printSuiteSummary(suite, runs, suiteBatchId);
|
|
80
97
|
return;
|
|
81
98
|
}
|
|
99
|
+
if (parsed.suiteDefinition) {
|
|
100
|
+
const suiteDefinition = parsed.suiteDefinition;
|
|
101
|
+
const scenarios = loadScenariosBySuiteDefinition(suiteDefinition);
|
|
102
|
+
if (scenarios.length === 0) {
|
|
103
|
+
throw new Error(`No scenarios found for suite definition '${suiteDefinition}'.`);
|
|
104
|
+
}
|
|
105
|
+
const suiteBatchId = createSuiteBatchId();
|
|
106
|
+
const runs = [];
|
|
107
|
+
console.log(`Suite definition: ${suiteDefinition}`);
|
|
108
|
+
if (parsed.variantSetName) {
|
|
109
|
+
console.log(`Variant set: ${parsed.variantSetName}`);
|
|
110
|
+
for (const scenario of scenarios) {
|
|
111
|
+
runs.push(...await executeVariantSetScenario(scenario.definition.id, parsed.variantSetName, suiteBatchId, suiteDefinition));
|
|
112
|
+
}
|
|
113
|
+
}
|
|
114
|
+
else {
|
|
115
|
+
const suiteRuntimeConfig = { ...runtimeConfig, suiteDefinitionName: suiteDefinition };
|
|
116
|
+
for (const scenario of scenarios) {
|
|
117
|
+
runs.push(await executeOne(scenario.definition.id, suiteRuntimeConfig, suiteBatchId));
|
|
118
|
+
}
|
|
119
|
+
}
|
|
120
|
+
printSuiteSummary(suiteDefinition, runs, suiteBatchId);
|
|
121
|
+
return;
|
|
122
|
+
}
|
|
82
123
|
const scenarioId = parsed.scenarioId;
|
|
83
124
|
if (!scenarioId) {
|
|
84
125
|
throw new Error("Missing scenario id.");
|
|
85
126
|
}
|
|
127
|
+
if (parsed.variantSetName) {
|
|
128
|
+
console.log(`Variant set: ${parsed.variantSetName}`);
|
|
129
|
+
await executeVariantSetScenario(scenarioId, parsed.variantSetName);
|
|
130
|
+
return;
|
|
131
|
+
}
|
|
86
132
|
// Detect scenario type to route to the right runner
|
|
87
133
|
const { listScenarioFiles } = await import("./scenarios.js");
|
|
88
134
|
const { parse } = await import("yaml");
|
|
@@ -97,6 +143,12 @@ async function handleRun(args) {
|
|
|
97
143
|
break;
|
|
98
144
|
}
|
|
99
145
|
}
|
|
146
|
+
if (scenarioType === "task" && runtimeConfig.provider === "http") {
|
|
147
|
+
throw new Error(`Scenario '${scenarioId}' is a task scenario. HTTP agents (provider: http) only work with ` +
|
|
148
|
+
`type: conversation scenarios.\n` +
|
|
149
|
+
`To test an HTTP agent, create a conversation scenario (type: conversation) — ` +
|
|
150
|
+
`conversation scenarios do not use a tools: block. See docs/scenarios.md for the format.`);
|
|
151
|
+
}
|
|
100
152
|
if (scenarioType === "conversation") {
|
|
101
153
|
if (runtimeConfig.provider !== "http") {
|
|
102
154
|
throw new Error(`Scenario '${scenarioId}' is a conversation scenario and requires provider: http. Use --agent <name> with a configured HTTP agent.`);
|
|
@@ -147,6 +199,15 @@ async function executeOne(scenarioId, runtimeConfig, suiteBatchId) {
|
|
|
147
199
|
tools: toolRegistry,
|
|
148
200
|
});
|
|
149
201
|
bundle.run.suiteBatchId = suiteBatchId;
|
|
202
|
+
bundle.run.variantSetName = agentVersion.variantSetName;
|
|
203
|
+
bundle.run.variantLabel = agentVersion.variantLabel;
|
|
204
|
+
bundle.run.promptVersion = agentVersion.promptVersion;
|
|
205
|
+
bundle.run.modelVersion = agentVersion.modelVersion;
|
|
206
|
+
bundle.run.toolSchemaVersion = agentVersion.toolSchemaVersion;
|
|
207
|
+
bundle.run.configLabel = agentVersion.configLabel;
|
|
208
|
+
bundle.run.configHash = agentVersion.configHash;
|
|
209
|
+
bundle.run.runtimeProfileName = loaded.definition.runtime_profile;
|
|
210
|
+
bundle.run.suiteDefinitionName = runtimeConfig.suiteDefinitionName;
|
|
150
211
|
bundle.agentVersion = agentVersion;
|
|
151
212
|
storage.saveRun(bundle);
|
|
152
213
|
printRunSummary(bundle);
|
|
@@ -156,6 +217,45 @@ async function executeOne(scenarioId, runtimeConfig, suiteBatchId) {
|
|
|
156
217
|
storage.close();
|
|
157
218
|
}
|
|
158
219
|
}
|
|
220
|
+
export async function executeVariantSetScenario(scenarioId, variantSetName, suiteBatchId, suiteDefinitionName) {
|
|
221
|
+
const variantSet = getVariantSet(variantSetName);
|
|
222
|
+
const runs = [];
|
|
223
|
+
for (const variant of variantSet.variants) {
|
|
224
|
+
const registration = getAgentRegistration(variant.agent);
|
|
225
|
+
const runtimeConfig = buildVariantRuntimeConfig(registration, variantSet.name, variant, suiteDefinitionName);
|
|
226
|
+
runs.push(await executeOne(scenarioId, runtimeConfig, suiteBatchId));
|
|
227
|
+
}
|
|
228
|
+
return runs;
|
|
229
|
+
}
|
|
230
|
+
function buildVariantRuntimeConfig(registration, variantSetName, variant, suiteDefinitionName) {
|
|
231
|
+
const runtimeConfig = {
|
|
232
|
+
...registration,
|
|
233
|
+
agentName: registration.name,
|
|
234
|
+
label: registration.label ?? variant.label,
|
|
235
|
+
variantSetName,
|
|
236
|
+
variantLabel: variant.label,
|
|
237
|
+
promptVersion: variant.prompt_version,
|
|
238
|
+
modelVersion: variant.model_version,
|
|
239
|
+
toolSchemaVersion: variant.tool_schema_version,
|
|
240
|
+
configLabel: variant.config_label,
|
|
241
|
+
suiteDefinitionName,
|
|
242
|
+
};
|
|
243
|
+
runtimeConfig.configHash = createConfigHash({
|
|
244
|
+
provider: runtimeConfig.provider,
|
|
245
|
+
agentName: runtimeConfig.agentName,
|
|
246
|
+
label: runtimeConfig.label,
|
|
247
|
+
model: runtimeConfig.model,
|
|
248
|
+
command: runtimeConfig.command,
|
|
249
|
+
args: runtimeConfig.args ?? [],
|
|
250
|
+
variantSetName,
|
|
251
|
+
variantLabel: variant.label,
|
|
252
|
+
promptVersion: variant.prompt_version,
|
|
253
|
+
modelVersion: variant.model_version,
|
|
254
|
+
toolSchemaVersion: variant.tool_schema_version,
|
|
255
|
+
configLabel: variant.config_label,
|
|
256
|
+
});
|
|
257
|
+
return runtimeConfig;
|
|
258
|
+
}
|
|
159
259
|
export async function executeConversation(scenarioId, httpConfig, label, suiteBatchId) {
|
|
160
260
|
const [{ Storage }, { loadConversationScenarioById }, { runConversation }, { createAgentVersionId }] = await Promise.all([
|
|
161
261
|
import("./storage.js"),
|
|
@@ -257,6 +357,9 @@ function printRunSummary(bundle) {
|
|
|
257
357
|
if (bundle.agentVersion?.command) {
|
|
258
358
|
console.log(`Command: ${bundle.agentVersion.command} ${(bundle.agentVersion.args ?? []).join(" ")}`.trim());
|
|
259
359
|
}
|
|
360
|
+
for (const line of formatRunIdentityLines(bundle)) {
|
|
361
|
+
console.log(line);
|
|
362
|
+
}
|
|
260
363
|
console.log(`Runtime: ${bundle.run.durationMs}ms`);
|
|
261
364
|
if (bundle.run.status !== "pass") {
|
|
262
365
|
console.log(`Reason: ${bundle.run.terminationReason}`);
|
|
@@ -264,6 +367,13 @@ function printRunSummary(bundle) {
|
|
|
264
367
|
if (errorDetail) {
|
|
265
368
|
console.log(`Error: ${errorDetail}`);
|
|
266
369
|
}
|
|
370
|
+
const failedEvaluators = getFailedEvaluatorSummaries(bundle);
|
|
371
|
+
if (failedEvaluators.length > 0) {
|
|
372
|
+
console.log("Failed evaluators:");
|
|
373
|
+
for (const summary of failedEvaluators) {
|
|
374
|
+
console.log(`- ${summary}`);
|
|
375
|
+
}
|
|
376
|
+
}
|
|
267
377
|
}
|
|
268
378
|
}
|
|
269
379
|
async function handleShow(args) {
|
|
@@ -394,6 +504,8 @@ function parseRunArgs(args) {
|
|
|
394
504
|
const runtimeConfig = { provider: "mock" };
|
|
395
505
|
let scenarioId;
|
|
396
506
|
let suite;
|
|
507
|
+
let suiteDefinition;
|
|
508
|
+
let variantSetName;
|
|
397
509
|
for (let index = 0; index < args.length; index += 1) {
|
|
398
510
|
const arg = args[index];
|
|
399
511
|
if (arg === "--suite") {
|
|
@@ -401,6 +513,16 @@ function parseRunArgs(args) {
|
|
|
401
513
|
index += 1;
|
|
402
514
|
continue;
|
|
403
515
|
}
|
|
516
|
+
if (arg === "--suite-def") {
|
|
517
|
+
suiteDefinition = args[index + 1];
|
|
518
|
+
index += 1;
|
|
519
|
+
continue;
|
|
520
|
+
}
|
|
521
|
+
if (arg === "--variant-set") {
|
|
522
|
+
variantSetName = args[index + 1];
|
|
523
|
+
index += 1;
|
|
524
|
+
continue;
|
|
525
|
+
}
|
|
404
526
|
if (arg === "--provider") {
|
|
405
527
|
const provider = args[index + 1];
|
|
406
528
|
if (provider !== "mock" && provider !== "openai" && provider !== "external_process" && provider !== "http") {
|
|
@@ -431,7 +553,7 @@ function parseRunArgs(args) {
|
|
|
431
553
|
}
|
|
432
554
|
throw new Error(`Unexpected argument '${arg}'.`);
|
|
433
555
|
}
|
|
434
|
-
return { scenarioId, suite, runtimeConfig };
|
|
556
|
+
return { scenarioId, suite, suiteDefinition, variantSetName, runtimeConfig };
|
|
435
557
|
}
|
|
436
558
|
function validateRuntimeConfig(config) {
|
|
437
559
|
if (config.agentName) {
|
|
@@ -475,7 +597,17 @@ function validateRuntimeConfig(config) {
|
|
|
475
597
|
}
|
|
476
598
|
return config;
|
|
477
599
|
}
|
|
478
|
-
|
|
479
|
-
|
|
480
|
-
|
|
481
|
-
|
|
600
|
+
if (isEntrypoint()) {
|
|
601
|
+
main().catch((error) => {
|
|
602
|
+
const message = error instanceof Error ? error.message : String(error);
|
|
603
|
+
console.error(formatCliErrorMessage(message));
|
|
604
|
+
process.exitCode = 1;
|
|
605
|
+
});
|
|
606
|
+
}
|
|
607
|
+
function isEntrypoint() {
|
|
608
|
+
const entry = process.argv[1];
|
|
609
|
+
if (!entry) {
|
|
610
|
+
return false;
|
|
611
|
+
}
|
|
612
|
+
return import.meta.url === pathToFileURL(entry).href;
|
|
613
|
+
}
|
package/dist/lib/id.js
CHANGED
|
@@ -17,3 +17,6 @@ export function createToolCallId() {
|
|
|
17
17
|
export function createAgentVersionId(label, config) {
|
|
18
18
|
return `agent_${hashText(`${label}:${JSON.stringify(config)}`).slice(0, 12)}`;
|
|
19
19
|
}
|
|
20
|
+
export function createConfigHash(input) {
|
|
21
|
+
return createAgentVersionId("config", input);
|
|
22
|
+
}
|
package/dist/runOutput.js
CHANGED
|
@@ -1,5 +1,11 @@
|
|
|
1
1
|
export function getRunErrorDetail(bundle) {
|
|
2
2
|
for (const event of [...bundle.traceEvents].reverse()) {
|
|
3
|
+
if (event.type === "conversation_finished") {
|
|
4
|
+
const errorMessage = event.payload.errorMessage;
|
|
5
|
+
if (typeof errorMessage === "string") {
|
|
6
|
+
return errorMessage;
|
|
7
|
+
}
|
|
8
|
+
}
|
|
3
9
|
if (event.type === "agent_error") {
|
|
4
10
|
const message = event.payload.message;
|
|
5
11
|
return typeof message === "string" ? message : undefined;
|
|
@@ -11,3 +17,43 @@ export function getRunErrorDetail(bundle) {
|
|
|
11
17
|
}
|
|
12
18
|
return undefined;
|
|
13
19
|
}
|
|
20
|
+
export function formatCliErrorMessage(message) {
|
|
21
|
+
if (message.includes("database is locked")) {
|
|
22
|
+
return "SQLite database is locked. Retry the run sequentially or wait for the current run to finish.";
|
|
23
|
+
}
|
|
24
|
+
return message;
|
|
25
|
+
}
|
|
26
|
+
export function getFailedEvaluatorSummaries(bundle) {
|
|
27
|
+
return bundle.evaluatorResults
|
|
28
|
+
.filter((result) => result.status === "fail")
|
|
29
|
+
.map((result) => `${result.evaluatorId}: ${result.message}`);
|
|
30
|
+
}
|
|
31
|
+
export function formatRunIdentityLines(bundle) {
|
|
32
|
+
const lines = [];
|
|
33
|
+
const run = bundle.run;
|
|
34
|
+
if (run.variantSetName) {
|
|
35
|
+
lines.push(`Variant set: ${run.variantSetName}`);
|
|
36
|
+
}
|
|
37
|
+
if (run.variantLabel) {
|
|
38
|
+
lines.push(`Variant: ${run.variantLabel}`);
|
|
39
|
+
}
|
|
40
|
+
if (run.promptVersion) {
|
|
41
|
+
lines.push(`Prompt version: ${run.promptVersion}`);
|
|
42
|
+
}
|
|
43
|
+
if (run.modelVersion) {
|
|
44
|
+
lines.push(`Model version: ${run.modelVersion}`);
|
|
45
|
+
}
|
|
46
|
+
if (run.toolSchemaVersion) {
|
|
47
|
+
lines.push(`Tool schema version: ${run.toolSchemaVersion}`);
|
|
48
|
+
}
|
|
49
|
+
if (run.configLabel) {
|
|
50
|
+
lines.push(`Config label: ${run.configLabel}`);
|
|
51
|
+
}
|
|
52
|
+
if (run.runtimeProfileName) {
|
|
53
|
+
lines.push(`Runtime profile: ${run.runtimeProfileName}`);
|
|
54
|
+
}
|
|
55
|
+
if (run.suiteDefinitionName) {
|
|
56
|
+
lines.push(`Suite definition: ${run.suiteDefinitionName}`);
|
|
57
|
+
}
|
|
58
|
+
return lines;
|
|
59
|
+
}
|
package/dist/runner.js
CHANGED
|
@@ -1,7 +1,9 @@
|
|
|
1
1
|
import { performance } from "node:perf_hooks";
|
|
2
|
+
import { getRuntimeProfile } from "./config.js";
|
|
2
3
|
import { createToolCallId, createRunId } from "./lib/id.js";
|
|
3
4
|
import { evaluateScenario } from "./evaluators.js";
|
|
4
5
|
import { computeScore } from "./scoring.js";
|
|
6
|
+
import { applyRuntimeProfileToTools } from "./tools.js";
|
|
5
7
|
import { TraceRecorder } from "./trace.js";
|
|
6
8
|
export async function runScenario(deps) {
|
|
7
9
|
const runId = createRunId();
|
|
@@ -9,6 +11,8 @@ export async function runScenario(deps) {
|
|
|
9
11
|
const runStart = performance.now();
|
|
10
12
|
const trace = new TraceRecorder(runId, deps.scenario.id);
|
|
11
13
|
const toolCalls = [];
|
|
14
|
+
const runtimeProfile = deps.scenario.runtime_profile ? getRuntimeProfile(deps.scenario.runtime_profile) : undefined;
|
|
15
|
+
const tools = applyRuntimeProfileToTools(deps.tools, runtimeProfile, trace);
|
|
12
16
|
const maxSteps = deps.scenario.runtime?.max_steps ?? 8;
|
|
13
17
|
const timeoutSeconds = deps.scenario.runtime?.timeout_seconds;
|
|
14
18
|
const deadline = timeoutSeconds ? Date.now() + timeoutSeconds * 1000 : undefined;
|
|
@@ -22,6 +26,9 @@ export async function runScenario(deps) {
|
|
|
22
26
|
maxSteps,
|
|
23
27
|
timeoutSeconds,
|
|
24
28
|
});
|
|
29
|
+
trace.record("system", "runtime_profile_applied", {
|
|
30
|
+
name: runtimeProfile?.name ?? null,
|
|
31
|
+
}, { countStep: false });
|
|
25
32
|
const availableTools = deps.toolSpecs.filter((tool) => deps.scenario.tools.allowed.includes(tool.name));
|
|
26
33
|
const session = await deps.agentAdapter.startRun({
|
|
27
34
|
instructions: deps.scenario.task.instructions,
|
|
@@ -72,7 +79,7 @@ export async function runScenario(deps) {
|
|
|
72
79
|
trace.record("runner", "forbidden_tool_attempted", { toolName });
|
|
73
80
|
break;
|
|
74
81
|
}
|
|
75
|
-
const handler =
|
|
82
|
+
const handler = tools[toolName];
|
|
76
83
|
if (!handler) {
|
|
77
84
|
status = "error";
|
|
78
85
|
terminationReason = "tool_error";
|
|
@@ -98,7 +105,8 @@ export async function runScenario(deps) {
|
|
|
98
105
|
}
|
|
99
106
|
catch (error) {
|
|
100
107
|
const message = error instanceof Error ? error.message : String(error);
|
|
101
|
-
|
|
108
|
+
const isInjectedTimeout = error instanceof Error && error.code === "timeout_exceeded";
|
|
109
|
+
if (isInjectedTimeout || (deadline && Date.now() >= deadline)) {
|
|
102
110
|
status = "error";
|
|
103
111
|
terminationReason = "timeout_exceeded";
|
|
104
112
|
trace.record("runner", "timeout_exceeded", { timeoutSeconds, message });
|
|
@@ -182,18 +190,32 @@ export async function runScenario(deps) {
|
|
|
182
190
|
function hasTimedOut(deadline) {
|
|
183
191
|
return deadline !== undefined && Date.now() >= deadline;
|
|
184
192
|
}
|
|
193
|
+
function toolRaceTimeoutError(message) {
|
|
194
|
+
const error = new Error(message);
|
|
195
|
+
error.code = "timeout_exceeded";
|
|
196
|
+
return error;
|
|
197
|
+
}
|
|
185
198
|
async function raceWithTimeout(promise, deadline, message) {
|
|
186
199
|
if (deadline === undefined) {
|
|
187
200
|
return promise;
|
|
188
201
|
}
|
|
189
202
|
const remainingMs = deadline - Date.now();
|
|
190
203
|
if (remainingMs <= 0) {
|
|
191
|
-
throw
|
|
204
|
+
throw toolRaceTimeoutError(message);
|
|
205
|
+
}
|
|
206
|
+
let timeoutHandle;
|
|
207
|
+
try {
|
|
208
|
+
return await Promise.race([
|
|
209
|
+
promise,
|
|
210
|
+
new Promise((_, reject) => {
|
|
211
|
+
timeoutHandle = setTimeout(() => reject(toolRaceTimeoutError(message)), remainingMs);
|
|
212
|
+
timeoutHandle.unref?.();
|
|
213
|
+
}),
|
|
214
|
+
]);
|
|
215
|
+
}
|
|
216
|
+
finally {
|
|
217
|
+
if (timeoutHandle !== undefined) {
|
|
218
|
+
clearTimeout(timeoutHandle);
|
|
219
|
+
}
|
|
192
220
|
}
|
|
193
|
-
return await Promise.race([
|
|
194
|
-
promise,
|
|
195
|
-
new Promise((_, reject) => {
|
|
196
|
-
setTimeout(() => reject(new Error(message)), remainingMs);
|
|
197
|
-
}),
|
|
198
|
-
]);
|
|
199
221
|
}
|
package/dist/scenarios.js
CHANGED
|
@@ -2,9 +2,20 @@ import { readFileSync, readdirSync, statSync } from "node:fs";
|
|
|
2
2
|
import { createHash } from "node:crypto";
|
|
3
3
|
import { join, relative, resolve } from "node:path";
|
|
4
4
|
import { parse } from "yaml";
|
|
5
|
-
import { loadAgentLabConfig } from "./config.js";
|
|
5
|
+
import { getRuntimeProfile, getSuiteDefinition, loadAgentLabConfig } from "./config.js";
|
|
6
6
|
import { getBuiltinToolSpecs } from "./tools.js";
|
|
7
7
|
const SCENARIOS_ROOT = resolve("scenarios");
|
|
8
|
+
const VALID_TASK_EVALUATOR_TYPES = new Set([
|
|
9
|
+
"exact_final_answer",
|
|
10
|
+
"final_answer_contains",
|
|
11
|
+
"forbidden_tool",
|
|
12
|
+
"tool_call_assertion",
|
|
13
|
+
"step_count_max",
|
|
14
|
+
"tool_call_count_max",
|
|
15
|
+
"tool_repeat_max",
|
|
16
|
+
"cost_max",
|
|
17
|
+
]);
|
|
18
|
+
const VALID_EVALUATOR_MODES = new Set(["hard_gate", "weighted"]);
|
|
8
19
|
export function listScenarioFiles(root = SCENARIOS_ROOT) {
|
|
9
20
|
if (!safeExists(root)) {
|
|
10
21
|
return [];
|
|
@@ -55,6 +66,8 @@ export function listScenarios() {
|
|
|
55
66
|
}
|
|
56
67
|
export function loadScenarioById(scenarioId) {
|
|
57
68
|
for (const filePath of listScenarioFiles()) {
|
|
69
|
+
if (getScenarioType(filePath) !== "task")
|
|
70
|
+
continue;
|
|
58
71
|
const loaded = loadScenarioByPath(filePath, getKnownToolNames());
|
|
59
72
|
if (loaded.definition.id === scenarioId) {
|
|
60
73
|
return loaded;
|
|
@@ -68,6 +81,20 @@ export function loadScenariosBySuite(suite) {
|
|
|
68
81
|
.map((filePath) => loadScenarioByPath(filePath, getKnownToolNames()))
|
|
69
82
|
.filter(({ definition }) => definition.suite === suite);
|
|
70
83
|
}
|
|
84
|
+
export function loadScenariosBySuiteDefinition(name) {
|
|
85
|
+
const suiteDefinition = getSuiteDefinition(name);
|
|
86
|
+
const knownToolNames = getKnownToolNames();
|
|
87
|
+
const scenarioFiles = listScenarioFiles(resolve("scenarios"));
|
|
88
|
+
const loadedScenarios = scenarioFiles.map((filePath) => loadScenarioRecordByPath(filePath, knownToolNames));
|
|
89
|
+
const included = loadedScenarios
|
|
90
|
+
.filter(({ definition }) => matchesSuiteDefinitionInclude(definition, suiteDefinition));
|
|
91
|
+
const excludedIds = new Set(loadedScenarios
|
|
92
|
+
.filter(({ definition }) => matchesSuiteDefinitionExclude(definition, suiteDefinition))
|
|
93
|
+
.map(({ definition }) => definition.id));
|
|
94
|
+
return included
|
|
95
|
+
.filter(({ definition }) => !excludedIds.has(definition.id))
|
|
96
|
+
.sort((left, right) => left.definition.id.localeCompare(right.definition.id));
|
|
97
|
+
}
|
|
71
98
|
export function loadScenarioByPath(filePath, knownToolNames = getKnownToolNames()) {
|
|
72
99
|
const absolutePath = resolve(filePath);
|
|
73
100
|
const raw = readFileSync(absolutePath, "utf8");
|
|
@@ -107,7 +134,10 @@ function validateScenario(value, filePath, knownToolNames) {
|
|
|
107
134
|
throw new Error(`Scenario file '${filePath}' references unknown allowed tool '${toolName}'.`);
|
|
108
135
|
}
|
|
109
136
|
}
|
|
110
|
-
if (
|
|
137
|
+
if (value.tools.forbidden !== undefined) {
|
|
138
|
+
if (!Array.isArray(value.tools.forbidden)) {
|
|
139
|
+
throw new Error(`Scenario file '${filePath}' field 'tools.forbidden' must be an array of strings.`);
|
|
140
|
+
}
|
|
111
141
|
for (const toolName of value.tools.forbidden) {
|
|
112
142
|
if (typeof toolName !== "string") {
|
|
113
143
|
throw new Error(`Scenario file '${filePath}' contains a non-string tool name in tools.forbidden.`);
|
|
@@ -122,6 +152,17 @@ function validateScenario(value, filePath, knownToolNames) {
|
|
|
122
152
|
if (!isObject(evaluator) || typeof evaluator.id !== "string" || typeof evaluator.type !== "string") {
|
|
123
153
|
throw new Error(`Scenario file '${filePath}' has an invalid evaluator entry.`);
|
|
124
154
|
}
|
|
155
|
+
if (!VALID_TASK_EVALUATOR_TYPES.has(evaluator.type)) {
|
|
156
|
+
throw new Error(`Scenario file '${filePath}' evaluator '${evaluator.id}' has invalid type '${evaluator.type}'. ` +
|
|
157
|
+
`Valid types: ${[...VALID_TASK_EVALUATOR_TYPES].join(", ")}.`);
|
|
158
|
+
}
|
|
159
|
+
if (!VALID_EVALUATOR_MODES.has(evaluator.mode)) {
|
|
160
|
+
throw new Error(`Scenario file '${filePath}' evaluator '${evaluator.id}' has invalid mode '${String(evaluator.mode)}'. ` +
|
|
161
|
+
`Valid modes: hard_gate, weighted.`);
|
|
162
|
+
}
|
|
163
|
+
if (!isObject(evaluator.config)) {
|
|
164
|
+
throw new Error(`Scenario file '${filePath}' evaluator '${evaluator.id}' must define an object config.`);
|
|
165
|
+
}
|
|
125
166
|
if (evaluatorIds.has(evaluator.id)) {
|
|
126
167
|
throw new Error(`Scenario file '${filePath}' defines duplicate evaluator id '${evaluator.id}'.`);
|
|
127
168
|
}
|
|
@@ -131,6 +172,12 @@ function validateScenario(value, filePath, knownToolNames) {
|
|
|
131
172
|
validatePositiveInt(value.runtime.max_steps, "runtime.max_steps", filePath);
|
|
132
173
|
validatePositiveInt(value.runtime.timeout_seconds, "runtime.timeout_seconds", filePath);
|
|
133
174
|
}
|
|
175
|
+
if (value.runtime_profile !== undefined) {
|
|
176
|
+
if (typeof value.runtime_profile !== "string" || value.runtime_profile.length === 0) {
|
|
177
|
+
throw new Error(`Scenario file '${filePath}' field 'runtime_profile' must be a non-empty string.`);
|
|
178
|
+
}
|
|
179
|
+
getRuntimeProfile(value.runtime_profile);
|
|
180
|
+
}
|
|
134
181
|
if (isObject(value.context) && Array.isArray(value.context.fixtures)) {
|
|
135
182
|
for (const fixturePath of value.context.fixtures) {
|
|
136
183
|
if (typeof fixturePath !== "string") {
|
|
@@ -226,6 +273,17 @@ function validateConversationEvaluatorList(evaluators, context, filePath) {
|
|
|
226
273
|
if (ev.mode !== "hard_gate" && ev.mode !== "weighted") {
|
|
227
274
|
throw new Error(`Conversation scenario '${filePath}' ${context} evaluator ${i} must have mode: hard_gate or weighted.`);
|
|
228
275
|
}
|
|
276
|
+
if (ev.type === "response_contains" || ev.type === "response_not_contains") {
|
|
277
|
+
if (!isObject(ev.config)) {
|
|
278
|
+
throw new Error(`Conversation scenario '${filePath}' ${context} evaluator ${i} must define an object config.`);
|
|
279
|
+
}
|
|
280
|
+
if ("text" in ev.config) {
|
|
281
|
+
throw new Error(`Conversation scenario '${filePath}' ${context} evaluator ${i} uses stale 'config.text'; use 'config.keywords: string[]'.`);
|
|
282
|
+
}
|
|
283
|
+
if (!Array.isArray(ev.config.keywords) || ev.config.keywords.some((kw) => typeof kw !== "string")) {
|
|
284
|
+
throw new Error(`Conversation scenario '${filePath}' ${context} evaluator ${i} must define config.keywords as a string array.`);
|
|
285
|
+
}
|
|
286
|
+
}
|
|
229
287
|
}
|
|
230
288
|
}
|
|
231
289
|
function validateConversationScenario(value, filePath) {
|
|
@@ -240,6 +298,12 @@ function validateConversationScenario(value, filePath) {
|
|
|
240
298
|
if (value.type !== "conversation") {
|
|
241
299
|
throw new Error(`Scenario file '${filePath}' does not have type: conversation.`);
|
|
242
300
|
}
|
|
301
|
+
if (value.runtime_profile !== undefined) {
|
|
302
|
+
if (typeof value.runtime_profile !== "string" || value.runtime_profile.length === 0) {
|
|
303
|
+
throw new Error(`Conversation scenario '${filePath}' field 'runtime_profile' must be a non-empty string.`);
|
|
304
|
+
}
|
|
305
|
+
getRuntimeProfile(value.runtime_profile);
|
|
306
|
+
}
|
|
243
307
|
if ("tools" in value) {
|
|
244
308
|
throw new Error(`Conversation scenario '${filePath}' must not define 'tools'. HTTP agents manage their own tools internally.`);
|
|
245
309
|
}
|
|
@@ -265,3 +329,27 @@ function validateConversationScenario(value, filePath) {
|
|
|
265
329
|
validateConversationEvaluatorList(value.evaluators, "end-of-run evaluators", filePath);
|
|
266
330
|
}
|
|
267
331
|
}
|
|
332
|
+
function loadScenarioRecordByPath(filePath, knownToolNames = getKnownToolNames()) {
|
|
333
|
+
if (getScenarioType(filePath) === "conversation") {
|
|
334
|
+
return loadConversationScenarioByPath(filePath);
|
|
335
|
+
}
|
|
336
|
+
return loadScenarioByPath(filePath, knownToolNames);
|
|
337
|
+
}
|
|
338
|
+
function matchesSuiteDefinitionInclude(definition, suiteDefinition) {
|
|
339
|
+
return matchesSuiteDefinitionSelectors(definition, suiteDefinition.include);
|
|
340
|
+
}
|
|
341
|
+
function matchesSuiteDefinitionExclude(definition, suiteDefinition) {
|
|
342
|
+
return suiteDefinition.exclude !== undefined && matchesSuiteDefinitionSelectors(definition, suiteDefinition.exclude);
|
|
343
|
+
}
|
|
344
|
+
function matchesSuiteDefinitionSelectors(definition, selectors) {
|
|
345
|
+
if (selectors.scenarios?.includes(definition.id)) {
|
|
346
|
+
return true;
|
|
347
|
+
}
|
|
348
|
+
if (selectors.tags?.some((tag) => definition.tags?.includes(tag) ?? false)) {
|
|
349
|
+
return true;
|
|
350
|
+
}
|
|
351
|
+
if (selectors.suites?.includes(definition.suite)) {
|
|
352
|
+
return true;
|
|
353
|
+
}
|
|
354
|
+
return false;
|
|
355
|
+
}
|
package/dist/scoring.js
CHANGED
|
@@ -4,9 +4,9 @@ export function computeScore(results) {
|
|
|
4
4
|
const weighted = results.filter((result) => result.mode === "weighted");
|
|
5
5
|
let score = 100;
|
|
6
6
|
if (weighted.length > 0) {
|
|
7
|
-
const totalWeight = weighted.reduce((sum, result) => sum + (result.weight ??
|
|
7
|
+
const totalWeight = weighted.reduce((sum, result) => sum + (result.weight ?? 1), 0);
|
|
8
8
|
const earnedWeight = weighted.reduce((sum, result) => {
|
|
9
|
-
const weight = result.weight ??
|
|
9
|
+
const weight = result.weight ?? 1;
|
|
10
10
|
return sum + (result.status === "pass" ? weight : 0);
|
|
11
11
|
}, 0);
|
|
12
12
|
score = totalWeight === 0 ? 100 : Math.round((earnedWeight / totalWeight) * 100);
|