@langwatch/scenario 0.2.0-prerelease.0 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -38,6 +38,7 @@ __export(index_exports, {
38
38
  UserSimulatorAgentAdapter: () => UserSimulatorAgentAdapter,
39
39
  agent: () => agent,
40
40
  allAgentRoles: () => allAgentRoles,
41
+ default: () => index_default,
41
42
  defineConfig: () => defineConfig,
42
43
  fail: () => fail,
43
44
  judge: () => judge,
@@ -45,6 +46,7 @@ __export(index_exports, {
45
46
  message: () => message,
46
47
  proceed: () => proceed,
47
48
  run: () => run,
49
+ scenario: () => scenario,
48
50
  scenarioProjectConfigSchema: () => scenarioProjectConfigSchema,
49
51
  succeed: () => succeed,
50
52
  user: () => user,
@@ -52,31 +54,28 @@ __export(index_exports, {
52
54
  });
53
55
  module.exports = __toCommonJS(index_exports);
54
56
 
55
- // src/script/index.ts
56
- var message = (message2) => {
57
- return (_state, executor) => executor.message(message2);
58
- };
59
- var agent = (content) => {
60
- return (_state, executor) => executor.agent(content);
61
- };
62
- var judge = (content) => {
63
- return (_state, executor) => executor.judge(content);
64
- };
65
- var user = (content) => {
66
- return (_state, executor) => executor.user(content);
67
- };
68
- var proceed = (turns, onTurn, onStep) => {
69
- return (_state, executor) => executor.proceed(turns, onTurn, onStep);
70
- };
71
- var succeed = (reasoning) => {
72
- return (_state, executor) => executor.succeed(reasoning);
73
- };
74
- var fail = (reasoning) => {
75
- return (_state, executor) => executor.fail(reasoning);
76
- };
57
+ // src/agents/index.ts
58
+ var agents_exports = {};
59
+ __export(agents_exports, {
60
+ judgeAgent: () => judgeAgent,
61
+ userSimulatorAgent: () => userSimulatorAgent
62
+ });
77
63
 
78
- // src/execution/scenario-execution.ts
79
- var import_rxjs = require("rxjs");
64
+ // src/agents/judge-agent.ts
65
+ var import_ai = require("ai");
66
+ var import_zod2 = require("zod");
67
+
68
+ // src/domain/index.ts
69
+ var domain_exports = {};
70
+ __export(domain_exports, {
71
+ AgentAdapter: () => AgentAdapter,
72
+ AgentRole: () => AgentRole,
73
+ JudgeAgentAdapter: () => JudgeAgentAdapter,
74
+ UserSimulatorAgentAdapter: () => UserSimulatorAgentAdapter,
75
+ allAgentRoles: () => allAgentRoles,
76
+ defineConfig: () => defineConfig,
77
+ scenarioProjectConfigSchema: () => scenarioProjectConfigSchema
78
+ });
80
79
 
81
80
  // src/domain/core/config.ts
82
81
  var import_zod = require("zod");
@@ -120,264 +119,93 @@ var JudgeAgentAdapter = class {
120
119
  }
121
120
  };
122
121
 
123
- // src/utils/ids.ts
124
- var import_xksuid = require("xksuid");
125
- var batchRunId = null;
126
- function generateThreadId() {
127
- return `thread_${(0, import_xksuid.generate)()}`;
128
- }
129
- function generateScenarioRunId() {
130
- return `scenariorun_${(0, import_xksuid.generate)()}`;
131
- }
132
- function generateScenarioId() {
133
- return `scenario_${(0, import_xksuid.generate)()}`;
134
- }
135
- function getBatchRunId() {
136
- if (!batchRunId) {
137
- batchRunId = process.env.SCENARIO_BATCH_RUN_ID ?? `scenariobatchrun_${(0, import_xksuid.generate)()}`;
138
- }
139
- return batchRunId;
140
- }
141
- function generateMessageId() {
142
- return `scenariomsg_${(0, import_xksuid.generate)()}`;
143
- }
144
-
145
- // src/execution/scenario-execution-state.ts
146
- var ScenarioExecutionState = class {
147
- _history = [];
148
- _turn = 0;
149
- _partialResult = null;
150
- _threadId = "";
151
- _agents = [];
152
- _pendingMessages = /* @__PURE__ */ new Map();
153
- _pendingRolesOnTurn = [];
154
- _pendingAgentsOnTurn = /* @__PURE__ */ new Set();
155
- _agentTimes = /* @__PURE__ */ new Map();
156
- _totalStartTime = 0;
157
- /**
158
- * Creates a new ScenarioExecutionState.
159
- */
160
- constructor() {
161
- this._totalStartTime = Date.now();
162
- }
163
- setThreadId(threadId) {
164
- this._threadId = threadId;
165
- }
166
- setAgents(agents) {
167
- this._agents = agents;
168
- this._pendingMessages.clear();
169
- this._agentTimes.clear();
170
- }
171
- appendMessage(role, content) {
172
- const message2 = { role, content };
173
- this._history.push({ ...message2, id: generateMessageId() });
174
- }
175
- appendUserMessage(content) {
176
- this.appendMessage("user", content);
177
- }
178
- appendAssistantMessage(content) {
179
- this.appendMessage("assistant", content);
180
- }
181
- addMessage(message2, fromAgentIdx) {
182
- this._history.push({ ...message2, id: generateMessageId() });
183
- for (let idx = 0; idx < this._agents.length; idx++) {
184
- if (idx === fromAgentIdx) continue;
185
- if (!this._pendingMessages.has(idx)) {
186
- this._pendingMessages.set(idx, []);
187
- }
188
- this._pendingMessages.get(idx).push(message2);
189
- }
190
- }
191
- addMessages(messages, fromAgentIdx) {
192
- for (const message2 of messages) {
193
- this.addMessage(message2, fromAgentIdx);
122
+ // src/agents/utils.ts
123
+ var toolMessageRole = "tool";
124
+ var assistantMessageRole = "assistant";
125
+ var userMessageRole = "user";
126
+ var groupMessagesByToolBoundaries = (messages) => {
127
+ const segments = [];
128
+ let currentSegment = [];
129
+ for (const message2 of messages) {
130
+ currentSegment.push(message2);
131
+ if (message2.role === toolMessageRole) {
132
+ segments.push(currentSegment);
133
+ currentSegment = [];
194
134
  }
195
135
  }
196
- getPendingMessages(agentIdx) {
197
- return this._pendingMessages.get(agentIdx) || [];
198
- }
199
- clearPendingMessages(agentIdx) {
200
- this._pendingMessages.set(agentIdx, []);
201
- }
202
- newTurn() {
203
- this._pendingAgentsOnTurn = new Set(this._agents);
204
- this._pendingRolesOnTurn = [
205
- "User" /* USER */,
206
- "Agent" /* AGENT */,
207
- "Judge" /* JUDGE */
208
- ];
209
- if (this._turn === null) {
210
- this._turn = 1;
211
- } else {
212
- this._turn++;
213
- }
136
+ if (currentSegment.length > 0) {
137
+ segments.push(currentSegment);
214
138
  }
215
- removePendingRole(role) {
216
- const index = this._pendingRolesOnTurn.indexOf(role);
217
- if (index > -1) {
218
- this._pendingRolesOnTurn.splice(index, 1);
139
+ return segments;
140
+ };
141
+ var segmentHasToolMessages = (segment) => {
142
+ return segment.some((message2) => {
143
+ if (message2.role === toolMessageRole) return true;
144
+ if (message2.role === assistantMessageRole && Array.isArray(message2.content)) {
145
+ return message2.content.some((part) => part.type === "tool-call");
219
146
  }
220
- }
221
- removePendingAgent(agent2) {
222
- this._pendingAgentsOnTurn.delete(agent2);
223
- }
224
- getNextAgentForRole(role) {
225
- for (let i = 0; i < this._agents.length; i++) {
226
- const agent2 = this._agents[i];
227
- if (agent2.role === role && this._pendingAgentsOnTurn.has(agent2)) {
228
- return { index: i, agent: agent2 };
147
+ return false;
148
+ });
149
+ };
150
+ var reverseSegmentRoles = (segment) => {
151
+ return segment.map((message2) => {
152
+ const hasStringContent = typeof message2.content === "string";
153
+ if (!hasStringContent) return message2;
154
+ const roleMap = {
155
+ [userMessageRole]: assistantMessageRole,
156
+ [assistantMessageRole]: userMessageRole
157
+ };
158
+ const newRole = roleMap[message2.role];
159
+ if (!newRole) return message2;
160
+ return {
161
+ role: newRole,
162
+ content: message2.content
163
+ };
164
+ });
165
+ };
166
+ var messageRoleReversal = (messages) => {
167
+ const segments = groupMessagesByToolBoundaries(messages);
168
+ const processedSegments = segments.map(
169
+ (segment) => segmentHasToolMessages(segment) ? segment : reverseSegmentRoles(segment)
170
+ );
171
+ return processedSegments.flat();
172
+ };
173
+ var criterionToParamName = (criterion) => {
174
+ return criterion.replace(/"/g, "").replace(/[^a-zA-Z0-9]/g, "_").replace(/ /g, "_").toLowerCase().substring(0, 70);
175
+ };
176
+
177
+ // src/config/load.ts
178
+ var import_promises = __toESM(require("fs/promises"));
179
+ var import_node_path = __toESM(require("path"));
180
+ var import_node_url = require("url");
181
+ async function loadScenarioProjectConfig() {
182
+ const cwd = process.cwd();
183
+ const configNames = [
184
+ "scenario.config.js",
185
+ "scenario.config.mjs"
186
+ ];
187
+ for (const name of configNames) {
188
+ const fullPath = import_node_path.default.join(cwd, name);
189
+ try {
190
+ await import_promises.default.access(fullPath);
191
+ const configModule = await import((0, import_node_url.pathToFileURL)(fullPath).href);
192
+ const config2 = configModule.default || configModule;
193
+ const parsed = scenarioProjectConfigSchema.safeParse(config2);
194
+ if (!parsed.success) {
195
+ throw new Error(
196
+ `Invalid config file ${name}: ${JSON.stringify(parsed.error.format(), null, 2)}`
197
+ );
198
+ }
199
+ return parsed.data;
200
+ } catch (error) {
201
+ if (error instanceof Error && "code" in error && error.code === "ENOENT") {
202
+ continue;
229
203
  }
204
+ throw error;
230
205
  }
231
- return null;
232
- }
233
- addAgentTime(agentIdx, time) {
234
- const currentTime = this._agentTimes.get(agentIdx) || 0;
235
- this._agentTimes.set(agentIdx, currentTime + time);
236
- }
237
- hasResult() {
238
- return this._partialResult !== null;
239
- }
240
- setResult(result) {
241
- this._partialResult = result;
242
- }
243
- get lastMessage() {
244
- return this._history[this._history.length - 1];
245
- }
246
- get lastUserMessage() {
247
- return this._history.findLast((message2) => message2.role === "user");
248
- }
249
- get lastAssistantMessage() {
250
- return this._history.findLast((message2) => message2.role === "assistant");
251
- }
252
- get lastToolCall() {
253
- return this._history.findLast((message2) => message2.role === "tool");
254
- }
255
- getLastToolCallByToolName(toolName) {
256
- const toolMessage = this._history.findLast(
257
- (message2) => message2.role === "tool" && message2.content.find(
258
- (part) => part.type === "tool-result" && part.toolName === toolName
259
- )
260
- );
261
- return toolMessage;
262
- }
263
- hasToolCall(toolName) {
264
- return this._history.some(
265
- (message2) => message2.role === "tool" && message2.content.find(
266
- (part) => part.type === "tool-result" && part.toolName === toolName
267
- )
268
- );
269
- }
270
- get history() {
271
- return this._history;
272
- }
273
- get historyWithoutLastMessage() {
274
- return this._history.slice(0, -1);
275
- }
276
- get historyWithoutLastUserMessage() {
277
- const lastUserMessageIndex = this._history.findLastIndex((message2) => message2.role === "user");
278
- if (lastUserMessageIndex === -1) return this._history;
279
- return this._history.slice(0, lastUserMessageIndex);
280
- }
281
- get turn() {
282
- return this._turn;
283
- }
284
- set turn(turn) {
285
- this._turn = turn;
286
- }
287
- get threadId() {
288
- return this._threadId;
289
- }
290
- get agents() {
291
- return this._agents;
292
- }
293
- get pendingRolesOnTurn() {
294
- return this._pendingRolesOnTurn;
295
- }
296
- set pendingRolesOnTurn(roles) {
297
- this._pendingRolesOnTurn = roles;
298
- }
299
- get pendingAgentsOnTurn() {
300
- return Array.from(this._pendingAgentsOnTurn);
301
- }
302
- set pendingAgentsOnTurn(agents) {
303
- this._pendingAgentsOnTurn = new Set(agents);
304
- }
305
- get partialResult() {
306
- return this._partialResult;
307
- }
308
- get totalTime() {
309
- return Date.now() - this._totalStartTime;
310
- }
311
- get agentTimes() {
312
- return new Map(this._agentTimes);
313
206
  }
314
- removeLastPendingRole() {
315
- this._pendingRolesOnTurn.pop();
316
- }
317
- };
318
-
319
- // src/events/schema.ts
320
- var import_core = require("@ag-ui/core");
321
- var import_zod2 = require("zod");
322
- var ScenarioRunStatus = /* @__PURE__ */ ((ScenarioRunStatus2) => {
323
- ScenarioRunStatus2["SUCCESS"] = "SUCCESS";
324
- ScenarioRunStatus2["ERROR"] = "ERROR";
325
- ScenarioRunStatus2["CANCELLED"] = "CANCELLED";
326
- ScenarioRunStatus2["IN_PROGRESS"] = "IN_PROGRESS";
327
- ScenarioRunStatus2["PENDING"] = "PENDING";
328
- ScenarioRunStatus2["FAILED"] = "FAILED";
329
- return ScenarioRunStatus2;
330
- })(ScenarioRunStatus || {});
331
- var baseEventSchema = import_zod2.z.object({
332
- type: import_zod2.z.nativeEnum(import_core.EventType),
333
- timestamp: import_zod2.z.number().optional(),
334
- rawEvent: import_zod2.z.any().optional()
335
- });
336
- var baseScenarioEventSchema = baseEventSchema.extend({
337
- batchRunId: import_zod2.z.string(),
338
- scenarioId: import_zod2.z.string(),
339
- scenarioRunId: import_zod2.z.string()
340
- });
341
- var scenarioRunStartedSchema = baseScenarioEventSchema.extend({
342
- type: import_zod2.z.literal("SCENARIO_RUN_STARTED" /* RUN_STARTED */),
343
- metadata: import_zod2.z.object({
344
- name: import_zod2.z.string(),
345
- description: import_zod2.z.string().optional()
346
- // config: z.record(z.unknown()).optional(),
347
- })
348
- });
349
- var scenarioRunFinishedSchema = baseScenarioEventSchema.extend({
350
- type: import_zod2.z.literal("SCENARIO_RUN_FINISHED" /* RUN_FINISHED */),
351
- status: import_zod2.z.nativeEnum(ScenarioRunStatus)
352
- // error: z
353
- // .object({
354
- // message: z.string(),
355
- // code: z.string().optional(),
356
- // stack: z.string().optional(),
357
- // })
358
- // .optional(),
359
- // metrics: z.record(z.number()).optional(),
360
- });
361
- var scenarioMessageSnapshotSchema = import_core.MessagesSnapshotEventSchema.merge(
362
- baseScenarioEventSchema.extend({
363
- type: import_zod2.z.literal("SCENARIO_MESSAGE_SNAPSHOT" /* MESSAGE_SNAPSHOT */)
364
- })
365
- );
366
- var scenarioEventSchema = import_zod2.z.discriminatedUnion("type", [
367
- scenarioRunStartedSchema,
368
- scenarioRunFinishedSchema,
369
- scenarioMessageSnapshotSchema
370
- ]);
371
- var successSchema = import_zod2.z.object({ success: import_zod2.z.boolean() });
372
- var errorSchema = import_zod2.z.object({ error: import_zod2.z.string() });
373
- var stateSchema = import_zod2.z.object({
374
- state: import_zod2.z.object({
375
- messages: import_zod2.z.array(import_zod2.z.any()),
376
- status: import_zod2.z.string()
377
- })
378
- });
379
- var runsSchema = import_zod2.z.object({ runs: import_zod2.z.array(import_zod2.z.string()) });
380
- var eventsSchema = import_zod2.z.object({ events: import_zod2.z.array(scenarioEventSchema) });
207
+ return await scenarioProjectConfigSchema.parseAsync({});
208
+ }
381
209
 
382
210
  // src/utils/logger.ts
383
211
  var Logger = class _Logger {
@@ -445,300 +273,645 @@ var Logger = class _Logger {
445
273
  }
446
274
  };
447
275
 
448
- // src/execution/scenario-execution.ts
449
- var batchRunId2 = getBatchRunId();
450
- function convertAgentReturnTypesToMessages(response, role) {
451
- if (typeof response === "string")
452
- return [{ role, content: response }];
453
- if (Array.isArray(response))
454
- return response;
455
- if (typeof response === "object" && "role" in response)
456
- return [response];
457
- return [];
458
- }
459
- var ScenarioExecution = class {
460
- state = new ScenarioExecutionState();
461
- eventSubject = new import_rxjs.Subject();
462
- logger = new Logger("scenario.execution.ScenarioExecution");
463
- config;
464
- /**
465
- * An observable stream of events that occur during the scenario execution.
466
- * Subscribe to this to monitor the progress of the scenario in real-time.
467
- */
468
- events$ = this.eventSubject.asObservable();
469
- /**
470
- * Creates a new ScenarioExecution instance.
471
- * @param config The scenario configuration.
472
- * @param script The script steps to execute.
473
- */
474
- constructor(config2, script) {
475
- this.config = {
476
- id: config2.id ?? generateScenarioId(),
477
- name: config2.name,
478
- description: config2.description,
479
- agents: config2.agents,
480
- script,
481
- verbose: config2.verbose ?? false,
482
- maxTurns: config2.maxTurns ?? 10,
483
- threadId: config2.threadId ?? generateThreadId()
484
- };
485
- this.reset();
486
- }
487
- /**
488
- * The history of messages in the conversation.
489
- */
490
- get history() {
491
- return this.state.history;
276
+ // src/config/index.ts
277
+ var logger = new Logger("scenario.config");
278
+ var configLoaded = false;
279
+ var config = null;
280
+ var configLoadPromise = null;
281
+ async function loadProjectConfig() {
282
+ if (configLoaded) {
283
+ return;
492
284
  }
493
- /**
494
- * The unique identifier for the conversation thread.
495
- */
496
- get threadId() {
497
- return this.state.threadId;
285
+ if (configLoadPromise) {
286
+ return configLoadPromise;
498
287
  }
499
- /**
500
- * Executes the entire scenario from start to finish.
501
- * This will run through the script and any automatic proceeding logic until a
502
- * final result (success, failure, or error) is determined.
503
- * @returns A promise that resolves with the final result of the scenario.
504
- */
505
- async execute() {
506
- this.reset();
507
- const scenarioRunId = generateScenarioRunId();
508
- this.emitRunStarted({ scenarioRunId });
288
+ configLoadPromise = (async () => {
509
289
  try {
510
- for (const scriptStep of this.config.script) {
511
- this.logger.debug(`[${this.config.id}] Executing script step`, {
512
- scriptStep
513
- });
514
- const result = await scriptStep(this.state, this);
515
- this.emitMessageSnapshot({ scenarioRunId });
516
- if (result && typeof result === "object" && "success" in result) {
517
- this.emitRunFinished({
518
- scenarioRunId,
519
- status: result.success ? "SUCCESS" /* SUCCESS */ : "FAILED" /* FAILED */
520
- });
521
- return result;
522
- }
523
- }
524
- this.emitRunFinished({ scenarioRunId, status: "FAILED" /* FAILED */ });
525
- return this.reachedMaxTurns([
526
- "Reached end of script without conclusion, add one of the following to the end of the script:",
527
- "- `Scenario.proceed()` to let the simulation continue to play out",
528
- "- `Scenario.judge()` to force criteria judgement",
529
- "- `Scenario.succeed()` or `Scenario.fail()` to end the test with an explicit result"
530
- ].join("\n"));
290
+ config = await loadScenarioProjectConfig();
291
+ logger.info("loaded scenario project config", { config });
531
292
  } catch (error) {
532
- this.emitRunFinished({
533
- scenarioRunId,
534
- status: "ERROR" /* ERROR */
535
- });
536
- throw error;
537
- }
538
- }
539
- /**
540
- * Executes a single step in the scenario.
541
- * A step usually corresponds to a single agent's turn. This method is useful
542
- * for manually controlling the scenario's progress.
543
- * @returns A promise that resolves with the new messages added during the step, or a final scenario result if the step concludes the scenario.
544
- */
545
- async step() {
546
- const result = await this._step();
547
- if (result === null) throw new Error("No result from step");
548
- return result;
549
- }
550
- async _step(goToNextTurn = true, onTurn) {
551
- if (this.state.pendingRolesOnTurn.length === 0) {
552
- if (!goToNextTurn) return null;
553
- this.state.newTurn();
554
- if (onTurn) await onTurn(this.state);
555
- if (this.state.turn != null && this.state.turn >= this.config.maxTurns)
556
- return this.reachedMaxTurns();
557
- }
558
- const currentRole = this.state.pendingRolesOnTurn[0];
559
- const { idx, agent: nextAgent } = this.nextAgentForRole(currentRole);
560
- if (!nextAgent) {
561
- this.state.removePendingRole(currentRole);
562
- return this._step(goToNextTurn, onTurn);
293
+ logger.error("error loading scenario project config", { error });
294
+ } finally {
295
+ configLoaded = true;
563
296
  }
564
- this.state.removePendingAgent(nextAgent);
565
- return await this.callAgent(idx, currentRole);
297
+ })();
298
+ return configLoadPromise;
299
+ }
300
+ async function getProjectConfig() {
301
+ await loadProjectConfig();
302
+ return config;
303
+ }
304
+
305
+ // src/utils/config.ts
306
+ function mergeConfig(config2, projectConfig) {
307
+ if (!projectConfig) {
308
+ return config2;
566
309
  }
567
- async callAgent(idx, role, judgmentRequest = false) {
568
- const agent2 = this.state.agents[idx];
569
- const startTime = Date.now();
570
- const agentInput = {
571
- threadId: this.state.threadId,
572
- messages: this.state.history,
573
- newMessages: this.state.getPendingMessages(idx),
574
- requestedRole: role,
575
- judgmentRequest,
576
- scenarioState: this.state,
577
- scenarioConfig: this.config
578
- };
579
- const agentResponse = await agent2.call(agentInput);
580
- const endTime = Date.now();
581
- this.state.addAgentTime(idx, endTime - startTime);
582
- this.state.clearPendingMessages(idx);
583
- if (typeof agentResponse === "object" && agentResponse && "success" in agentResponse) {
584
- return agentResponse;
585
- }
586
- const messages = convertAgentReturnTypesToMessages(
587
- agentResponse,
588
- role === "User" /* USER */ ? "user" : "assistant"
589
- );
590
- this.state.addMessages(messages, idx);
591
- return messages;
310
+ return {
311
+ ...projectConfig.defaultModel,
312
+ ...config2
313
+ };
314
+ }
315
+ function mergeAndValidateConfig(config2, projectConfig) {
316
+ var _a;
317
+ const mergedConfig = mergeConfig(config2, projectConfig);
318
+ mergedConfig.model = mergedConfig.model ?? ((_a = projectConfig == null ? void 0 : projectConfig.defaultModel) == null ? void 0 : _a.model);
319
+ if (!mergedConfig.model) {
320
+ throw new Error("Model is required");
592
321
  }
593
- nextAgentForRole(role) {
594
- for (const agent2 of this.state.agents) {
595
- if (agent2.role === role && this.state.pendingAgentsOnTurn.includes(agent2) && this.state.pendingRolesOnTurn.includes(role)) {
596
- return { idx: this.state.agents.indexOf(agent2), agent: agent2 };
322
+ return mergedConfig;
323
+ }
324
+
325
+ // src/agents/judge-agent.ts
326
+ function buildSystemPrompt(criteria, description) {
327
+ const criteriaList = (criteria == null ? void 0 : criteria.map((criterion, idx) => `${idx + 1}. ${criterion}`).join("\n")) || "No criteria provided";
328
+ return `
329
+ <role>
330
+ You are an LLM as a judge watching a simulated conversation as it plays out live to determine if the agent under test meets the criteria or not.
331
+ </role>
332
+
333
+ <goal>
334
+ Your goal is to determine if you already have enough information to make a verdict of the scenario below, or if the conversation should continue for longer.
335
+ If you do have enough information, use the finish_test tool to determine if all the criteria have been met, if not, use the continue_test tool to let the next step play out.
336
+ </goal>
337
+
338
+ <scenario>
339
+ ${description}
340
+ </scenario>
341
+
342
+ <criteria>
343
+ ${criteriaList}
344
+ </criteria>
345
+
346
+ <rules>
347
+ - Be strict, do not let the conversation continue if the agent already broke one of the "do not" or "should not" criteria.
348
+ - DO NOT make any judgment calls that are not explicitly listed in the success or failure criteria, withhold judgement if necessary
349
+ </rules>
350
+ `.trim();
351
+ }
352
+ function buildContinueTestTool() {
353
+ return (0, import_ai.tool)({
354
+ description: "Continue the test with the next step",
355
+ parameters: import_zod2.z.object({})
356
+ });
357
+ }
358
+ function buildFinishTestTool(criteria) {
359
+ const criteriaNames = criteria.map(criterionToParamName);
360
+ return (0, import_ai.tool)({
361
+ description: "Complete the test with a final verdict",
362
+ parameters: import_zod2.z.object({
363
+ criteria: import_zod2.z.object(
364
+ Object.fromEntries(
365
+ criteriaNames.map((name, idx) => [
366
+ name,
367
+ import_zod2.z.enum(["true", "false", "inconclusive"]).describe(criteria[idx])
368
+ ])
369
+ )
370
+ ).strict().describe("Strict verdict for each criterion"),
371
+ reasoning: import_zod2.z.string().describe("Explanation of what the final verdict should be"),
372
+ verdict: import_zod2.z.enum(["success", "failure", "inconclusive"]).describe("The final verdict of the test")
373
+ })
374
+ });
375
+ }
376
+ var judgeAgent = (cfg) => {
377
+ return {
378
+ role: "Judge" /* JUDGE */,
379
+ criteria: cfg.criteria,
380
+ call: async (input) => {
381
+ var _a;
382
+ const systemPrompt = cfg.systemPrompt ?? buildSystemPrompt(cfg.criteria, input.scenarioConfig.description);
383
+ const messages = [
384
+ { role: "system", content: systemPrompt },
385
+ ...input.messages
386
+ ];
387
+ const isLastMessage = input.scenarioState.currentTurn === input.scenarioConfig.maxTurns;
388
+ const projectConfig = await getProjectConfig();
389
+ const mergedConfig = mergeAndValidateConfig(cfg, projectConfig);
390
+ if (!mergedConfig.model) {
391
+ throw new Error("Model is required for the judge agent");
597
392
  }
598
- }
599
- return { idx: -1, agent: null };
600
- }
601
- reachedMaxTurns(errorMessage) {
602
- var _a;
603
- const agentRoleAgentsIdx = this.state.agents.map((agent2, i) => ({ agent: agent2, idx: i })).filter(({ agent: agent2 }) => agent2.role === "Agent" /* AGENT */).map(({ idx }) => idx);
604
- const agentTimes = agentRoleAgentsIdx.map((i) => this.state.agentTimes.get(i) || 0);
605
- const totalAgentTime = agentTimes.reduce((sum, time) => sum + time, 0);
606
- return {
607
- success: false,
608
- messages: this.state.history,
609
- reasoning: errorMessage || `Reached maximum turns (${this.config.maxTurns || 10}) without conclusion`,
610
- passedCriteria: [],
611
- failedCriteria: ((_a = this.getJudgeAgent()) == null ? void 0 : _a.criteria) ?? [],
612
- totalTime: this.state.totalTime,
613
- agentTime: totalAgentTime
614
- };
615
- }
616
- getJudgeAgent() {
617
- return this.state.agents.find((agent2) => agent2 instanceof JudgeAgentAdapter) ?? null;
618
- }
619
- consumeUntilRole(role) {
620
- while (this.state.pendingRolesOnTurn.length > 0) {
621
- const nextRole = this.state.pendingRolesOnTurn[0];
622
- if (nextRole === role) break;
623
- this.state.pendingRolesOnTurn.pop();
624
- }
625
- }
626
- async scriptCallAgent(role, content, judgmentRequest = false) {
627
- this.consumeUntilRole(role);
628
- let index = -1;
629
- let agent2 = null;
630
- const nextAgent = this.state.getNextAgentForRole(role);
631
- if (!nextAgent) {
632
- this.state.newTurn();
633
- this.consumeUntilRole(role);
634
- const nextAgent2 = this.state.getNextAgentForRole(role);
635
- if (!nextAgent2) {
636
- let roleClass = "";
637
- switch (role) {
638
- case "User" /* USER */:
639
- roleClass = "a scenario.userSimulatorAgent()";
640
- break;
641
- case "Agent" /* AGENT */:
642
- roleClass = "a scenario.agent()";
643
- break;
644
- case "Judge" /* JUDGE */:
645
- roleClass = "a scenario.judgeAgent()";
646
- break;
647
- default:
648
- roleClass = "your agent";
649
- }
650
- if (content)
651
- throw new Error(
652
- `Cannot generate a message for role \`${role}\` with content \`${content}\` because no agent with this role was found, please add ${roleClass} to the scenario \`agents\` list`
653
- );
654
- throw new Error(
655
- `Cannot generate a message for role \`${role}\` because no agent with this role was found, please add ${roleClass} to the scenario \`agents\` list`
656
- );
393
+ const tools = {
394
+ continue_test: buildContinueTestTool(),
395
+ finish_test: buildFinishTestTool(cfg.criteria)
396
+ };
397
+ const enforceJudgement = input.judgmentRequest;
398
+ const hasCriteria = cfg.criteria.length && cfg.criteria.length > 0;
399
+ if (enforceJudgement && !hasCriteria) {
400
+ return {
401
+ success: false,
402
+ messages: [],
403
+ reasoning: "JudgeAgent: No criteria was provided to be judged against",
404
+ metCriteria: [],
405
+ unmetCriteria: []
406
+ };
657
407
  }
658
- index = nextAgent2.index;
659
- agent2 = nextAgent2.agent;
660
- } else {
661
- index = nextAgent.index;
662
- agent2 = nextAgent.agent;
663
- }
664
- this.state.removePendingAgent(agent2);
665
- if (content) {
666
- if (typeof content === "string") {
667
- if (role === "User" /* USER */) {
668
- this.state.addMessage({ role: "user", content });
669
- } else {
670
- this.state.addMessage({ role: "assistant", content });
671
- }
672
- } else {
673
- this.state.addMessage(content);
408
+ const toolChoice = (isLastMessage || enforceJudgement) && hasCriteria ? { type: "tool", toolName: "finish_test" } : "required";
409
+ const completion = await (0, import_ai.generateText)({
410
+ model: mergedConfig.model,
411
+ messages,
412
+ temperature: mergedConfig.temperature ?? 0,
413
+ maxTokens: mergedConfig.maxTokens,
414
+ tools,
415
+ toolChoice
416
+ });
417
+ let args;
418
+ if ((_a = completion.toolCalls) == null ? void 0 : _a.length) {
419
+ const toolCall = completion.toolCalls[0];
420
+ switch (toolCall.toolName) {
421
+ case "finish_test": {
422
+ args = toolCall.args;
423
+ const verdict = args.verdict || "inconclusive";
424
+ const reasoning = args.reasoning || "No reasoning provided";
425
+ const criteria = args.criteria || {};
426
+ const criteriaValues = Object.values(criteria);
427
+ const metCriteria = cfg.criteria.filter((_, i) => criteriaValues[i] === "true");
428
+ const unmetCriteria = cfg.criteria.filter((_, i) => criteriaValues[i] !== "true");
429
+ return {
430
+ success: verdict === "success",
431
+ messages: input.messages,
432
+ reasoning,
433
+ metCriteria,
434
+ unmetCriteria
435
+ };
436
+ }
437
+ case "continue_test":
438
+ return [];
439
+ default:
440
+ return {
441
+ success: false,
442
+ messages: input.messages,
443
+ reasoning: `JudgeAgent: Unknown tool call: ${toolCall.toolName}`,
444
+ metCriteria: [],
445
+ unmetCriteria: cfg.criteria
446
+ };
447
+ }
674
448
  }
675
- return null;
449
+ return {
450
+ success: false,
451
+ messages: input.messages,
452
+ reasoning: `JudgeAgent: No tool call found in LLM output`,
453
+ metCriteria: [],
454
+ unmetCriteria: cfg.criteria
455
+ };
676
456
  }
677
- const result = await this.callAgent(index, role, judgmentRequest);
678
- if (Array.isArray(result))
679
- return null;
680
- return result;
457
+ };
458
+ };
459
+
460
+ // src/agents/user-simulator-agent.ts
461
+ var import_ai2 = require("ai");
462
+ function buildSystemPrompt2(description) {
463
+ return `
464
+ <role>
465
+ You are pretending to be a user, you are testing an AI Agent (shown as the user role) based on a scenario.
466
+ Approach this naturally, as a human user would, with very short inputs, few words, all lowercase, imperative, not periods, like when they google or talk to chatgpt.
467
+ </role>
468
+
469
+ <goal>
470
+ Your goal (assistant) is to interact with the Agent Under Test (user) as if you were a human user to see if it can complete the scenario successfully.
471
+ </goal>
472
+
473
+ <scenario>
474
+ ${description}
475
+ </scenario>
476
+
477
+ <rules>
478
+ - DO NOT carry over any requests yourself, YOU ARE NOT the assistant today, you are the user
479
+ </rules>
480
+ `.trim();
481
+ }
482
/**
 * Creates an agent adapter that plays the user side of the conversation.
 *
 * On every call it builds a system prompt from the scenario description,
 * prepends a canned assistant greeting to the conversation so far, reverses
 * the message roles and asks the configured model for the next user message.
 *
 * @param config2 - Optional per-agent settings; merged with the project
 *   configuration on every call.
 * @returns An agent object with role "User" and an async `call(input)` that
 *   resolves to a `{ role: "user", content }` message.
 * @throws {Error} From `call` when no model is configured, or when the model
 *   returns empty text.
 */
var userSimulatorAgent = (config2) => ({
  role: "User" /* USER */,
  call: async (input) => {
    const conversation = [
      { role: "system", content: buildSystemPrompt2(input.scenarioConfig.description) },
      { role: "assistant", content: "Hello, how can I help you today" },
      ...input.messages
    ];
    const settings = mergeAndValidateConfig(config2 ?? {}, await getProjectConfig());
    if (!settings.model) {
      throw new Error("Model is required for the user simulator agent");
    }
    // Roles are swapped before the request is sent to the model.
    const { text } = await (0, import_ai2.generateText)({
      model: settings.model,
      messages: messageRoleReversal(conversation),
      temperature: settings.temperature ?? 0,
      maxTokens: settings.maxTokens
    });
    if (!text) {
      throw new Error("No response content from LLM");
    }
    return { role: "user", content: text };
  }
});
512
+
513
+ // src/execution/index.ts
514
+ var execution_exports = {};
515
+ __export(execution_exports, {
516
+ ScenarioExecution: () => ScenarioExecution,
517
+ ScenarioExecutionState: () => ScenarioExecutionState
518
+ });
519
+
520
+ // src/execution/scenario-execution.ts
521
+ var import_rxjs = require("rxjs");
522
+
523
+ // src/utils/ids.ts
524
+ var import_xksuid = require("xksuid");
525
+ var batchRunId = null;
526
+ function generateThreadId() {
527
+ return `thread_${(0, import_xksuid.generate)()}`;
528
+ }
529
+ function generateScenarioRunId() {
530
+ return `scenariorun_${(0, import_xksuid.generate)()}`;
531
+ }
532
+ function generateScenarioId() {
533
+ return `scenario_${(0, import_xksuid.generate)()}`;
534
+ }
535
+ function getBatchRunId() {
536
+ if (!batchRunId) {
537
+ batchRunId = process.env.SCENARIO_BATCH_RUN_ID ?? `scenariobatchrun_${(0, import_xksuid.generate)()}`;
538
+ }
539
+ return batchRunId;
540
+ }
541
+ function generateMessageId() {
542
+ return `scenariomsg_${(0, import_xksuid.generate)()}`;
543
+ }
544
+
545
+ // src/execution/scenario-execution-state.ts
546
/**
 * Mutable state of a single scenario run: the message history, the current
 * turn counter and the thread id, plus the scenario config it was built from.
 */
var ScenarioExecutionState = class {
  _messages = [];
  _currentTurn = 0;
  _threadId = "";
  description;
  config;
  /**
   * @param config2 - The scenario configuration; its `description` is also
   *   exposed directly on the state.
   */
  constructor(config2) {
    this.config = config2;
    this.description = config2.description;
  }
  /** The message history (the live internal array, not a copy). */
  get messages() {
    return this._messages;
  }
  get currentTurn() {
    return this._currentTurn;
  }
  set currentTurn(turn) {
    this._currentTurn = turn;
  }
  get threadId() {
    return this._threadId;
  }
  set threadId(value) {
    this._threadId = value;
  }
  /**
   * Adds a message to the conversation history.
   *
   * @param message - The message to add. A shallow copy tagged with a freshly
   *   generated `id` is stored; the argument itself is not modified.
   */
  addMessage(message2) {
    this._messages.push({ ...message2, id: generateMessageId() });
  }
  /**
   * @returns The most recently added message.
   * @throws {Error} When the history is empty.
   */
  lastMessage() {
    if (this._messages.length === 0) {
      throw new Error("No messages in history");
    }
    return this._messages[this._messages.length - 1];
  }
  /**
   * @returns The most recent message whose role is "user".
   * @throws {Error} When the history is empty or holds no user message.
   */
  lastUserMessage() {
    if (this._messages.length === 0) {
      throw new Error("No messages in history");
    }
    const lastMessage = this._messages.findLast((message2) => message2.role === "user");
    if (!lastMessage) {
      throw new Error("No user message in history");
    }
    return lastMessage;
  }
  /**
   * @param toolName - Name of the tool to look for.
   * @returns The most recent "tool" message carrying a result for `toolName`.
   * @throws {Error} When the history is empty or holds no such message.
   */
  lastToolCall(toolName) {
    if (this._messages.length === 0) {
      throw new Error("No messages in history");
    }
    const lastMessage = this._messages.findLast(
      (message2) => this._isToolResultFor(message2, toolName)
    );
    if (!lastMessage) {
      throw new Error("No tool call message in history");
    }
    return lastMessage;
  }
  /**
   * @param toolName - Name of the tool to look for.
   * @returns `true` when any "tool" message carries a result for `toolName`.
   */
  hasToolCall(toolName) {
    return this._messages.some(
      (message2) => this._isToolResultFor(message2, toolName)
    );
  }
  /**
   * Shared predicate for `lastToolCall` / `hasToolCall`.
   * Tool messages whose content is not an array of parts (for example a plain
   * string) are treated as non-matching instead of raising a TypeError.
   */
  _isToolResultFor(message2, toolName) {
    return message2.role === "tool" && Array.isArray(message2.content) && message2.content.some(
      (part) => part.type === "tool-result" && part.toolName === toolName
    );
  }
};
615
+
616
+ // src/events/schema.ts
617
+ var import_core = require("@ag-ui/core");
618
+ var import_zod3 = require("zod");
619
+ var Verdict = /* @__PURE__ */ ((Verdict2) => {
620
+ Verdict2["SUCCESS"] = "success";
621
+ Verdict2["FAILURE"] = "failure";
622
+ Verdict2["INCONCLUSIVE"] = "inconclusive";
623
+ return Verdict2;
624
+ })(Verdict || {});
625
+ var ScenarioRunStatus = /* @__PURE__ */ ((ScenarioRunStatus2) => {
626
+ ScenarioRunStatus2["SUCCESS"] = "SUCCESS";
627
+ ScenarioRunStatus2["ERROR"] = "ERROR";
628
+ ScenarioRunStatus2["CANCELLED"] = "CANCELLED";
629
+ ScenarioRunStatus2["IN_PROGRESS"] = "IN_PROGRESS";
630
+ ScenarioRunStatus2["PENDING"] = "PENDING";
631
+ ScenarioRunStatus2["FAILED"] = "FAILED";
632
+ return ScenarioRunStatus2;
633
+ })(ScenarioRunStatus || {});
634
+ var baseEventSchema = import_zod3.z.object({
635
+ type: import_zod3.z.nativeEnum(import_core.EventType),
636
+ timestamp: import_zod3.z.number(),
637
+ rawEvent: import_zod3.z.any().optional()
638
+ });
639
+ var batchRunIdSchema = import_zod3.z.string();
640
+ var scenarioRunIdSchema = import_zod3.z.string();
641
+ var scenarioIdSchema = import_zod3.z.string();
642
+ var baseScenarioEventSchema = baseEventSchema.extend({
643
+ batchRunId: batchRunIdSchema,
644
+ scenarioId: scenarioIdSchema,
645
+ scenarioRunId: scenarioRunIdSchema,
646
+ scenarioSetId: import_zod3.z.string().optional().default("default")
647
+ });
648
+ var scenarioRunStartedSchema = baseScenarioEventSchema.extend({
649
+ type: import_zod3.z.literal("SCENARIO_RUN_STARTED" /* RUN_STARTED */),
650
+ metadata: import_zod3.z.object({
651
+ name: import_zod3.z.string().optional(),
652
+ description: import_zod3.z.string().optional()
653
+ })
654
+ });
655
+ var scenarioResultsSchema = import_zod3.z.object({
656
+ verdict: import_zod3.z.nativeEnum(Verdict),
657
+ reasoning: import_zod3.z.string().optional(),
658
+ metCriteria: import_zod3.z.array(import_zod3.z.string()),
659
+ unmetCriteria: import_zod3.z.array(import_zod3.z.string()),
660
+ error: import_zod3.z.string().optional()
661
+ });
662
+ var scenarioRunFinishedSchema = baseScenarioEventSchema.extend({
663
+ type: import_zod3.z.literal("SCENARIO_RUN_FINISHED" /* RUN_FINISHED */),
664
+ status: import_zod3.z.nativeEnum(ScenarioRunStatus),
665
+ results: scenarioResultsSchema.optional().nullable()
666
+ });
667
+ var scenarioMessageSnapshotSchema = import_core.MessagesSnapshotEventSchema.merge(
668
+ baseScenarioEventSchema.extend({
669
+ type: import_zod3.z.literal("SCENARIO_MESSAGE_SNAPSHOT" /* MESSAGE_SNAPSHOT */)
670
+ })
671
+ );
672
+ var scenarioEventSchema = import_zod3.z.discriminatedUnion("type", [
673
+ scenarioRunStartedSchema,
674
+ scenarioRunFinishedSchema,
675
+ scenarioMessageSnapshotSchema
676
+ ]);
677
+ var successSchema = import_zod3.z.object({ success: import_zod3.z.boolean() });
678
+ var errorSchema = import_zod3.z.object({ error: import_zod3.z.string() });
679
+ var stateSchema = import_zod3.z.object({
680
+ state: import_zod3.z.object({
681
+ messages: import_zod3.z.array(import_zod3.z.any()),
682
+ status: import_zod3.z.string()
683
+ })
684
+ });
685
+ var runsSchema = import_zod3.z.object({ runs: import_zod3.z.array(import_zod3.z.string()) });
686
+ var eventsSchema = import_zod3.z.object({ events: import_zod3.z.array(scenarioEventSchema) });
687
+
688
+ // src/execution/scenario-execution.ts
689
+ var batchRunId2 = getBatchRunId();
690
+ var ScenarioExecution = class {
691
+ state;
692
+ eventSubject = new import_rxjs.Subject();
693
+ logger = new Logger("scenario.execution.ScenarioExecution");
694
+ config;
695
+ agents = [];
696
+ pendingRolesOnTurn = [];
697
+ pendingAgentsOnTurn = /* @__PURE__ */ new Set();
698
+ pendingMessages = /* @__PURE__ */ new Map();
699
+ partialResult = null;
700
+ agentTimes = /* @__PURE__ */ new Map();
701
+ totalStartTime = 0;
696
702
  /**
697
- * Executes a user turn.
698
- * If content is provided, it's used as the user's message.
699
- * If not, the user simulator agent is called to generate a message.
700
- * This is part of the `ScenarioExecutionLike` interface used by script steps.
701
- * @param content The optional content of the user's message.
703
+ * An observable stream of events that occur during the scenario execution.
704
+ * Subscribe to this to monitor the progress of the scenario in real-time.
702
705
  */
703
- async user(content) {
704
- await this.scriptCallAgent("User" /* USER */, content);
706
+ events$ = this.eventSubject.asObservable();
707
+ /**
708
+ * Creates a new ScenarioExecution instance.
709
+ * @param config The scenario configuration.
710
+ * @param script The script steps to execute.
711
+ */
712
+ constructor(config2, script) {
713
+ this.config = {
714
+ id: config2.id ?? generateScenarioId(),
715
+ name: config2.name,
716
+ description: config2.description,
717
+ agents: config2.agents,
718
+ script,
719
+ verbose: config2.verbose ?? false,
720
+ maxTurns: config2.maxTurns ?? 10,
721
+ threadId: config2.threadId ?? generateThreadId(),
722
+ setId: config2.setId
723
+ };
724
+ this.state = new ScenarioExecutionState(this.config);
725
+ this.reset();
705
726
  }
706
727
  /**
707
- * Executes an agent turn.
708
- * If content is provided, it's used as the agent's message.
709
- * If not, the agent under test is called to generate a response.
710
- * This is part of the `ScenarioExecutionLike` interface used by script steps.
711
- * @param content The optional content of the agent's message.
728
+ * The history of messages in the conversation.
712
729
  */
713
- async agent(content) {
714
- await this.scriptCallAgent("Agent" /* AGENT */, content);
730
+ get messages() {
731
+ return this.state.messages;
715
732
  }
716
733
  /**
717
- * Invokes the judge agent to evaluate the current state of the conversation.
718
- * This is part of the `ScenarioExecutionLike` interface used by script steps.
719
- * @param content Optional message to pass to the judge.
720
- * @returns A promise that resolves with the scenario result if the judge makes a final decision, otherwise null.
734
+ * The unique identifier for the conversation thread.
721
735
  */
722
- async judge(content) {
723
- return await this.scriptCallAgent("Judge" /* JUDGE */, content, true);
736
+ get threadId() {
737
+ return this.state.threadId;
724
738
  }
725
739
  /**
726
- * Lets the scenario proceed automatically for a specified number of turns.
727
- * This simulates the natural flow of conversation between agents.
728
- * This is part of the `ScenarioExecutionLike` interface used by script steps.
729
- * @param turns The number of turns to proceed. If undefined, runs until a conclusion or max turns is reached.
730
- * @param onTurn A callback executed at the end of each turn.
731
- * @param onStep A callback executed after each agent interaction.
732
- * @returns A promise that resolves with the scenario result if a conclusion is reached.
740
+ * The total elapsed time for the scenario execution.
733
741
  */
734
- async proceed(turns, onTurn, onStep) {
735
- let initialTurn = this.state.turn;
736
- while (true) {
737
- const goToNextTurn = turns === void 0 || initialTurn === null || this.state.turn != null && this.state.turn + 1 < initialTurn + turns;
738
- const nextMessage = await this._step(goToNextTurn, onTurn);
739
- if (initialTurn === null)
740
- initialTurn = this.state.turn;
741
- if (nextMessage === null) {
742
+ get totalTime() {
743
+ return Date.now() - this.totalStartTime;
744
+ }
745
+ /**
746
+ * Executes the entire scenario from start to finish.
747
+ * This will run through the script and any automatic proceeding logic until a
748
+ * final result (success, failure, or error) is determined.
749
+ * @returns A promise that resolves with the final result of the scenario.
750
+ */
751
+ async execute() {
752
+ this.reset();
753
+ const scenarioRunId = generateScenarioRunId();
754
+ this.emitRunStarted({ scenarioRunId });
755
+ try {
756
+ for (const scriptStep of this.config.script) {
757
+ this.logger.debug(`[${this.config.id}] Executing script step`, {
758
+ scriptStep
759
+ });
760
+ const result = await scriptStep(this.state, this);
761
+ this.emitMessageSnapshot({ scenarioRunId });
762
+ if (result && typeof result === "object" && "success" in result) {
763
+ this.emitRunFinished({
764
+ scenarioRunId,
765
+ status: result.success ? "SUCCESS" /* SUCCESS */ : "FAILED" /* FAILED */,
766
+ result
767
+ });
768
+ return result;
769
+ }
770
+ }
771
+ this.emitRunFinished({ scenarioRunId, status: "FAILED" /* FAILED */ });
772
+ return this.reachedMaxTurns([
773
+ "Reached end of script without conclusion, add one of the following to the end of the script:",
774
+ "- `Scenario.proceed()` to let the simulation continue to play out",
775
+ "- `Scenario.judge()` to force criteria judgement",
776
+ "- `Scenario.succeed()` or `Scenario.fail()` to end the test with an explicit result"
777
+ ].join("\n"));
778
+ } catch (error) {
779
+ const errorResult = {
780
+ success: false,
781
+ messages: this.state.messages,
782
+ reasoning: `Scenario failed with error: ${error instanceof Error ? error.message : String(error)}`,
783
+ metCriteria: [],
784
+ unmetCriteria: [],
785
+ error: error instanceof Error ? error.message : String(error)
786
+ };
787
+ this.emitRunFinished({
788
+ scenarioRunId,
789
+ status: "ERROR" /* ERROR */,
790
+ result: errorResult
791
+ });
792
+ return errorResult;
793
+ }
794
+ }
795
+ /**
796
+ * Executes a single step in the scenario.
797
+ * A step usually corresponds to a single agent's turn. This method is useful
798
+ * for manually controlling the scenario's progress.
799
+ * @returns A promise that resolves with the new messages added during the step, or a final scenario result if the step concludes the scenario.
800
+ */
801
+ async step() {
802
+ const result = await this._step();
803
+ if (result === null) throw new Error("No result from step");
804
+ return result;
805
+ }
806
+ async _step(goToNextTurn = true, onTurn) {
807
+ if (this.pendingRolesOnTurn.length === 0) {
808
+ if (!goToNextTurn) return null;
809
+ this.newTurn();
810
+ if (onTurn) await onTurn(this.state);
811
+ if (this.state.currentTurn >= this.config.maxTurns)
812
+ return this.reachedMaxTurns();
813
+ }
814
+ const currentRole = this.pendingRolesOnTurn[0];
815
+ const { idx, agent: nextAgent } = this.nextAgentForRole(currentRole);
816
+ if (!nextAgent) {
817
+ this.removePendingRole(currentRole);
818
+ return this._step(goToNextTurn, onTurn);
819
+ }
820
+ this.removePendingAgent(nextAgent);
821
+ return await this.callAgent(idx, currentRole);
822
+ }
823
+ async callAgent(idx, role, judgmentRequest = false) {
824
+ const agent2 = this.agents[idx];
825
+ const startTime = Date.now();
826
+ const agentInput = {
827
+ threadId: this.state.threadId,
828
+ messages: this.state.messages,
829
+ newMessages: this.pendingMessages.get(idx) ?? [],
830
+ requestedRole: role,
831
+ judgmentRequest,
832
+ scenarioState: this.state,
833
+ scenarioConfig: this.config
834
+ };
835
+ const agentResponse = await agent2.call(agentInput);
836
+ const endTime = Date.now();
837
+ this.addAgentTime(idx, endTime - startTime);
838
+ this.pendingMessages.delete(idx);
839
+ if (agentResponse && typeof agentResponse === "object" && "success" in agentResponse) {
840
+ return agentResponse;
841
+ }
842
+ const currentAgentTime = this.agentTimes.get(idx) ?? 0;
843
+ this.agentTimes.set(idx, currentAgentTime + (Date.now() - startTime));
844
+ const messages = convertAgentReturnTypesToMessages(
845
+ agentResponse,
846
+ role === "User" /* USER */ ? "user" : "assistant"
847
+ );
848
+ for (const message2 of messages) {
849
+ this.state.addMessage(message2);
850
+ this.broadcastMessage(message2, idx);
851
+ }
852
+ return messages;
853
+ }
854
+ /**
855
+ * Adds a message to the conversation history.
856
+ * This is part of the `ScenarioExecutionLike` interface used by script steps.
857
+ * @param message The message to add.
858
+ */
859
+ async message(message2) {
860
+ if (message2.role === "user") {
861
+ await this.scriptCallAgent("User" /* USER */, message2);
862
+ } else if (message2.role === "assistant") {
863
+ await this.scriptCallAgent("Agent" /* AGENT */, message2);
864
+ } else {
865
+ this.state.addMessage(message2);
866
+ this.broadcastMessage(message2);
867
+ }
868
+ }
869
+ /**
870
+ * Executes a user turn.
871
+ * If content is provided, it's used as the user's message.
872
+ * If not, the user simulator agent is called to generate a message.
873
+ * This is part of the `ScenarioExecutionLike` interface used by script steps.
874
+ * @param content The optional content of the user's message.
875
+ */
876
+ async user(content) {
877
+ await this.scriptCallAgent("User" /* USER */, content);
878
+ }
879
+ /**
880
+ * Executes an agent turn.
881
+ * If content is provided, it's used as the agent's message.
882
+ * If not, the agent under test is called to generate a response.
883
+ * This is part of the `ScenarioExecutionLike` interface used by script steps.
884
+ * @param content The optional content of the agent's message.
885
+ */
886
+ async agent(content) {
887
+ await this.scriptCallAgent("Agent" /* AGENT */, content);
888
+ }
889
+ /**
890
+ * Invokes the judge agent to evaluate the current state of the conversation.
891
+ * This is part of the `ScenarioExecutionLike` interface used by script steps.
892
+ * @param content Optional message to pass to the judge.
893
+ * @returns A promise that resolves with the scenario result if the judge makes a final decision, otherwise null.
894
+ */
895
+ async judge(content) {
896
+ return await this.scriptCallAgent("Judge" /* JUDGE */, content, true);
897
+ }
898
+ /**
899
+ * Lets the scenario proceed automatically for a specified number of turns.
900
+ * This simulates the natural flow of conversation between agents.
901
+ * This is part of the `ScenarioExecutionLike` interface used by script steps.
902
+ * @param turns The number of turns to proceed. If undefined, runs until a conclusion or max turns is reached.
903
+ * @param onTurn A callback executed at the end of each turn.
904
+ * @param onStep A callback executed after each agent interaction.
905
+ * @returns A promise that resolves with the scenario result if a conclusion is reached.
906
+ */
907
+ async proceed(turns, onTurn, onStep) {
908
+ let initialTurn = this.state.currentTurn;
909
+ while (true) {
910
+ const goToNextTurn = turns === void 0 || initialTurn === null || this.state.currentTurn != null && this.state.currentTurn + 1 < initialTurn + turns;
911
+ const nextMessage = await this._step(goToNextTurn, onTurn);
912
+ if (initialTurn === null)
913
+ initialTurn = this.state.currentTurn;
914
+ if (nextMessage === null) {
742
915
  return null;
743
916
  }
744
917
  if (onStep) await onStep(this.state);
@@ -755,10 +928,10 @@ var ScenarioExecution = class {
755
928
  async succeed(reasoning) {
756
929
  return {
757
930
  success: true,
758
- messages: this.state.history,
931
+ messages: this.state.messages,
759
932
  reasoning: reasoning || "Scenario marked as successful with Scenario.succeed()",
760
- passedCriteria: [],
761
- failedCriteria: []
933
+ metCriteria: [],
934
+ unmetCriteria: []
762
935
  };
763
936
  }
764
937
  /**
@@ -770,25 +943,147 @@ var ScenarioExecution = class {
770
943
  async fail(reasoning) {
771
944
  return {
772
945
  success: false,
773
- messages: this.state.history,
946
+ messages: this.state.messages,
774
947
  reasoning: reasoning || "Scenario marked as failed with Scenario.fail()",
775
- passedCriteria: [],
776
- failedCriteria: []
948
+ metCriteria: [],
949
+ unmetCriteria: []
777
950
  };
778
951
  }
952
+ addAgentTime(agentIdx, time) {
953
+ const currentTime = this.agentTimes.get(agentIdx) || 0;
954
+ this.agentTimes.set(agentIdx, currentTime + time);
955
+ }
956
+ hasResult() {
957
+ return this.partialResult !== null;
958
+ }
959
+ setResult(result) {
960
+ this.partialResult = result;
961
+ }
962
+ async scriptCallAgent(role, content, judgmentRequest = false) {
963
+ this.consumeUntilRole(role);
964
+ let index = -1;
965
+ let agent2 = null;
966
+ let nextAgent = this.getNextAgentForRole(role);
967
+ if (!nextAgent) {
968
+ this.newTurn();
969
+ this.consumeUntilRole(role);
970
+ nextAgent = this.getNextAgentForRole(role);
971
+ }
972
+ if (!nextAgent) {
973
+ let roleClass = "";
974
+ switch (role) {
975
+ case "User" /* USER */:
976
+ roleClass = "a scenario.userSimulatorAgent()";
977
+ break;
978
+ case "Agent" /* AGENT */:
979
+ roleClass = "a scenario.agent()";
980
+ break;
981
+ case "Judge" /* JUDGE */:
982
+ roleClass = "a scenario.judgeAgent()";
983
+ break;
984
+ default:
985
+ roleClass = "your agent";
986
+ }
987
+ if (content)
988
+ throw new Error(
989
+ `Cannot generate a message for role \`${role}\` with content \`${content}\` because no agent with this role was found, please add ${roleClass} to the scenario \`agents\` list`
990
+ );
991
+ throw new Error(
992
+ `Cannot generate a message for role \`${role}\` because no agent with this role was found, please add ${roleClass} to the scenario \`agents\` list`
993
+ );
994
+ }
995
+ index = nextAgent.index;
996
+ agent2 = nextAgent.agent;
997
+ this.removePendingAgent(agent2);
998
+ if (content) {
999
+ const message2 = typeof content === "string" ? { role: role === "User" /* USER */ ? "user" : "assistant", content } : content;
1000
+ this.state.addMessage(message2);
1001
+ this.broadcastMessage(message2, index);
1002
+ return null;
1003
+ }
1004
+ const result = await this.callAgent(index, role, judgmentRequest);
1005
+ if (result && typeof result === "object" && "success" in result) {
1006
+ return result;
1007
+ }
1008
+ return null;
1009
+ }
779
1010
  reset() {
780
- this.state = new ScenarioExecutionState();
781
- this.state.setThreadId(this.config.threadId || generateThreadId());
782
- this.state.setAgents(this.config.agents);
783
- this.state.newTurn();
784
- this.state.turn = 0;
785
- }
786
- // =====================================================
787
- // Event Emission Methods
788
- // =====================================================
789
- // These methods handle the creation and emission of
790
- // scenario events for external consumption and monitoring
791
- // =====================================================
1011
+ this.state = new ScenarioExecutionState(this.config);
1012
+ this.state.threadId = this.config.threadId || generateThreadId();
1013
+ this.setAgents(this.config.agents);
1014
+ this.newTurn();
1015
+ this.state.currentTurn = 0;
1016
+ this.totalStartTime = Date.now();
1017
+ this.pendingMessages.clear();
1018
+ }
1019
+ nextAgentForRole(role) {
1020
+ for (const agent2 of this.agents) {
1021
+ if (agent2.role === role && this.pendingAgentsOnTurn.has(agent2) && this.pendingRolesOnTurn.includes(role)) {
1022
+ return { idx: this.agents.indexOf(agent2), agent: agent2 };
1023
+ }
1024
+ }
1025
+ return { idx: -1, agent: null };
1026
+ }
1027
+ newTurn() {
1028
+ this.pendingAgentsOnTurn = new Set(this.agents);
1029
+ this.pendingRolesOnTurn = [
1030
+ "User" /* USER */,
1031
+ "Agent" /* AGENT */,
1032
+ "Judge" /* JUDGE */
1033
+ ];
1034
+ if (this.state.currentTurn === null) {
1035
+ this.state.currentTurn = 1;
1036
+ } else {
1037
+ this.state.currentTurn++;
1038
+ }
1039
+ }
1040
+ removePendingRole(role) {
1041
+ const index = this.pendingRolesOnTurn.indexOf(role);
1042
+ if (index > -1) {
1043
+ this.pendingRolesOnTurn.splice(index, 1);
1044
+ }
1045
+ }
1046
+ removePendingAgent(agent2) {
1047
+ this.pendingAgentsOnTurn.delete(agent2);
1048
+ }
1049
+ getNextAgentForRole(role) {
1050
+ for (let i = 0; i < this.agents.length; i++) {
1051
+ const agent2 = this.agents[i];
1052
+ if (agent2.role === role && this.pendingAgentsOnTurn.has(agent2)) {
1053
+ return { index: i, agent: agent2 };
1054
+ }
1055
+ }
1056
+ return null;
1057
+ }
1058
+ setAgents(agents) {
1059
+ this.agents = agents;
1060
+ this.agentTimes.clear();
1061
+ }
1062
+ consumeUntilRole(role) {
1063
+ while (this.pendingRolesOnTurn.length > 0) {
1064
+ const nextRole = this.pendingRolesOnTurn[0];
1065
+ if (nextRole === role) break;
1066
+ this.pendingRolesOnTurn.pop();
1067
+ }
1068
+ }
1069
+ reachedMaxTurns(errorMessage) {
1070
+ var _a;
1071
+ const agentRoleAgentsIdx = this.agents.map((agent2, i) => ({ agent: agent2, idx: i })).filter(({ agent: agent2 }) => agent2.role === "Agent" /* AGENT */).map(({ idx }) => idx);
1072
+ const agentTimes = agentRoleAgentsIdx.map((i) => this.agentTimes.get(i) || 0);
1073
+ const totalAgentTime = agentTimes.reduce((sum, time) => sum + time, 0);
1074
+ return {
1075
+ success: false,
1076
+ messages: this.state.messages,
1077
+ reasoning: errorMessage || `Reached maximum turns (${this.config.maxTurns || 10}) without conclusion`,
1078
+ metCriteria: [],
1079
+ unmetCriteria: ((_a = this.getJudgeAgent()) == null ? void 0 : _a.criteria) ?? [],
1080
+ totalTime: this.totalTime,
1081
+ agentTime: totalAgentTime
1082
+ };
1083
+ }
1084
+ getJudgeAgent() {
1085
+ return this.agents.find((agent2) => agent2 instanceof JudgeAgentAdapter) ?? null;
1086
+ }
792
1087
  /**
793
1088
  * Emits an event to the event stream for external consumption.
794
1089
  */
@@ -800,11 +1095,13 @@ var ScenarioExecution = class {
800
1095
  */
801
1096
  makeBaseEvent({ scenarioRunId }) {
802
1097
  return {
1098
+ type: "placeholder",
1099
+ // This will be replaced by the specific event type
1100
+ timestamp: Date.now(),
803
1101
  batchRunId: batchRunId2,
804
1102
  scenarioId: this.config.id,
805
1103
  scenarioRunId,
806
- timestamp: Date.now(),
807
- rawEvent: void 0
1104
+ scenarioSetId: this.config.setId
808
1105
  };
809
1106
  }
810
1107
  /**
@@ -827,7 +1124,7 @@ var ScenarioExecution = class {
827
1124
  this.emitEvent({
828
1125
  ...this.makeBaseEvent({ scenarioRunId }),
829
1126
  type: "SCENARIO_MESSAGE_SNAPSHOT" /* MESSAGE_SNAPSHOT */,
830
- messages: this.state.history
1127
+ messages: this.state.messages
831
1128
  // Add any other required fields from MessagesSnapshotEventSchema
832
1129
  });
833
1130
  }
@@ -836,53 +1133,60 @@ var ScenarioExecution = class {
836
1133
  */
837
1134
  emitRunFinished({
838
1135
  scenarioRunId,
839
- status
1136
+ status,
1137
+ result
840
1138
  }) {
841
- this.emitEvent({
1139
+ const event = {
842
1140
  ...this.makeBaseEvent({ scenarioRunId }),
1141
+ scenarioSetId: this.config.setId ?? "default",
843
1142
  type: "SCENARIO_RUN_FINISHED" /* RUN_FINISHED */,
844
- status
845
- // Add error/metrics fields if needed
846
- });
847
- }
848
- };
849
-
850
- // src/config/load.ts
851
- var import_promises = __toESM(require("fs/promises"));
852
- var import_node_path = __toESM(require("path"));
853
- var import_node_url = require("url");
854
- async function loadScenarioProjectConfig() {
855
- const cwd = process.cwd();
856
- const configNames = [
857
- "scenario.config.js",
858
- "scenario.config.mjs"
859
- ];
860
- for (const name of configNames) {
861
- const fullPath = import_node_path.default.join(cwd, name);
862
- try {
863
- await import_promises.default.access(fullPath);
864
- const configModule = await import((0, import_node_url.pathToFileURL)(fullPath).href);
865
- const config2 = configModule.default || configModule;
866
- const parsed = scenarioProjectConfigSchema.safeParse(config2);
867
- if (!parsed.success) {
868
- throw new Error(
869
- `Invalid config file ${name}: ${JSON.stringify(parsed.error.format(), null, 2)}`
870
- );
1143
+ status,
1144
+ results: {
1145
+ verdict: (result == null ? void 0 : result.success) ? "success" /* SUCCESS */ : "failure" /* FAILURE */,
1146
+ metCriteria: (result == null ? void 0 : result.metCriteria) ?? [],
1147
+ unmetCriteria: (result == null ? void 0 : result.unmetCriteria) ?? [],
1148
+ reasoning: result == null ? void 0 : result.reasoning,
1149
+ error: result == null ? void 0 : result.error
871
1150
  }
872
- return parsed.data;
873
- } catch (error) {
874
- if (error instanceof Error && "code" in error && error.code === "ENOENT") {
875
- continue;
1151
+ };
1152
+ this.emitEvent(event);
1153
+ this.eventSubject.complete();
1154
+ }
1155
+ /**
1156
+ * Distributes a message to all other agents in the scenario.
1157
+ *
1158
+ * @param message - The message to broadcast.
1159
+ * @param fromAgentIdx - The index of the agent that sent the message, to avoid echoing.
1160
+ */
1161
+ broadcastMessage(message2, fromAgentIdx) {
1162
+ for (let idx = 0; idx < this.agents.length; idx++) {
1163
+ if (idx === fromAgentIdx) continue;
1164
+ if (!this.pendingMessages.has(idx)) {
1165
+ this.pendingMessages.set(idx, []);
876
1166
  }
877
- throw error;
1167
+ this.pendingMessages.get(idx).push(message2);
878
1168
  }
879
1169
  }
880
- return await scenarioProjectConfigSchema.parseAsync({});
881
- }
882
-
883
- // src/events/event-bus.ts
884
- var import_rxjs2 = require("rxjs");
885
-
1170
+ };
1171
/**
 * Normalizes whatever an agent returned into an array of messages.
 *
 * - a string becomes a single `{ role, content }` message;
 * - an array is returned as-is;
 * - an object with a `role` key is wrapped in an array;
 * - anything else (including `null` and `undefined`) yields an empty array.
 *
 * @param response - The raw value returned by the agent.
 * @param role - Role assigned to the message when `response` is a string.
 * @returns The messages to append to the conversation.
 */
function convertAgentReturnTypesToMessages(response, role) {
  if (typeof response === "string") {
    return [{ role, content: response }];
  }
  if (Array.isArray(response)) {
    return response;
  }
  // `typeof null` is "object" and `"role" in null` throws, so rule null out first.
  if (response !== null && typeof response === "object" && "role" in response) {
    return [response];
  }
  return [];
}
1180
+
1181
+ // src/runner/index.ts
1182
+ var runner_exports = {};
1183
+ __export(runner_exports, {
1184
+ run: () => run
1185
+ });
1186
+
1187
+ // src/events/event-bus.ts
1188
+ var import_rxjs2 = require("rxjs");
1189
+
886
1190
  // src/events/event-reporter.ts
887
1191
  var EventReporter = class {
888
1192
  eventsEndpoint;
@@ -892,16 +1196,16 @@ var EventReporter = class {
892
1196
  this.eventsEndpoint = new URL("/api/scenario-events", config2.endpoint);
893
1197
  this.apiKey = config2.apiKey ?? "";
894
1198
  if (!process.env.SCENARIO_DISABLE_SIMULATION_REPORT_INFO) {
895
- console.log("=== Scenario Simulation Reporting ===");
896
1199
  if (!this.apiKey) {
897
- console.warn("LangWatch API key not configured, simulations will be local");
898
- console.warn(`To enable simulation reporting in the LangWatch dashboard, configure your LangWatch API key (via LANGWATCH_API_KEY, or scenario.config.js)`);
1200
+ console.log(
1201
+ "\u27A1\uFE0F LangWatch API key not configured, simulations will only output the final result"
1202
+ );
1203
+ console.log(
1204
+ "To visualize the conversations in real time, configure your LangWatch API key (via LANGWATCH_API_KEY, or scenario.config.js)"
1205
+ );
899
1206
  } else {
900
- console.log("Simulation reporting is enabled");
901
- console.log(`Endpoint: ${config2.endpoint} -> ${this.eventsEndpoint.href}`);
902
- console.log(`API Key: ${!this.apiKey ? "not configured" : "configured"}`);
1207
+ console.log(`simulation reporting is enabled, endpoint:(${this.eventsEndpoint}) api_key_configured:(${this.apiKey.length > 0 ? "true" : "false"})`);
903
1208
  }
904
- console.log("=== Scenario Simulation Reporting ===");
905
1209
  }
906
1210
  }
907
1211
  /**
@@ -953,13 +1257,25 @@ var EventReporter = class {
953
1257
  };
954
1258
 
955
1259
  // src/events/event-bus.ts
956
- var EventBus = class {
1260
+ var EventBus = class _EventBus {
1261
+ static registry = /* @__PURE__ */ new Set();
957
1262
  events$ = new import_rxjs2.Subject();
958
1263
  eventReporter;
959
1264
  processingPromise = null;
960
1265
  logger = new Logger("scenario.events.EventBus");
1266
+ static globalListeners = [];
961
1267
  constructor(config2) {
962
1268
  this.eventReporter = new EventReporter(config2);
1269
+ _EventBus.registry.add(this);
1270
+ for (const listener of _EventBus.globalListeners) {
1271
+ listener(this);
1272
+ }
1273
+ }
1274
+ static getAllBuses() {
1275
+ return _EventBus.registry;
1276
+ }
1277
+ static addGlobalListener(listener) {
1278
+ _EventBus.globalListeners.push(listener);
963
1279
  }
964
1280
  /**
965
1281
  * Publishes an event into the processing pipeline.
@@ -1014,7 +1330,7 @@ var EventBus = class {
1014
1330
  */
1015
1331
  async drain() {
1016
1332
  this.logger.debug("Draining event stream");
1017
- this.events$.unsubscribe();
1333
+ this.events$.complete();
1018
1334
  if (this.processingPromise) {
1019
1335
  await this.processingPromise;
1020
1336
  }
@@ -1027,6 +1343,45 @@ var EventBus = class {
1027
1343
  this.logger.debug("Subscribing to event stream");
1028
1344
  return source$.subscribe(this.events$);
1029
1345
  }
1346
+ /**
1347
+ * Expose the events$ observable for external subscription (read-only).
1348
+ */
1349
+ get eventsObservable() {
1350
+ return this.events$.asObservable();
1351
+ }
1352
+ };
1353
+
1354
+ // src/script/index.ts
1355
+ var script_exports = {};
1356
+ __export(script_exports, {
1357
+ agent: () => agent,
1358
+ fail: () => fail,
1359
+ judge: () => judge,
1360
+ message: () => message,
1361
+ proceed: () => proceed,
1362
+ succeed: () => succeed,
1363
+ user: () => user
1364
+ });
1365
+ var message = (message2) => {
1366
+ return (_state, executor) => executor.message(message2);
1367
+ };
1368
+ var agent = (content) => {
1369
+ return (_state, executor) => executor.agent(content);
1370
+ };
1371
+ var judge = (content) => {
1372
+ return (_state, executor) => executor.judge(content);
1373
+ };
1374
+ var user = (content) => {
1375
+ return (_state, executor) => executor.user(content);
1376
+ };
1377
+ var proceed = (turns, onTurn, onStep) => {
1378
+ return (_state, executor) => executor.proceed(turns, onTurn, onStep);
1379
+ };
1380
+ var succeed = (reasoning) => {
1381
+ return (_state, executor) => executor.succeed(reasoning);
1382
+ };
1383
+ var fail = (reasoning) => {
1384
+ return (_state, executor) => executor.fail(reasoning);
1030
1385
  };
1031
1386
 
1032
1387
  // src/runner/run.ts
@@ -1071,8 +1426,8 @@ async function run(cfg) {
1071
1426
  console.log(`Scenario failed: ${cfg.name}`);
1072
1427
  console.log(`Reasoning: ${result.reasoning}`);
1073
1428
  console.log("--------------------------------");
1074
- console.log(`Passed criteria: ${result.passedCriteria.join("\n- ")}`);
1075
- console.log(`Failed criteria: ${result.failedCriteria.join("\n- ")}`);
1429
+ console.log(`Met criteria: ${result.metCriteria.join("\n- ")}`);
1430
+ console.log(`Unmet criteria: ${result.unmetCriteria.join("\n- ")}`);
1076
1431
  console.log(result.messages.map(formatMessage).join("\n"));
1077
1432
  }
1078
1433
  return result;
@@ -1125,301 +1480,15 @@ function formatPart(part) {
1125
1480
  }
1126
1481
  }
1127
1482
 
1128
- // src/agents/judge-agent.ts
1129
- var import_ai = require("ai");
1130
- var import_zod3 = require("zod");
1131
-
1132
- // src/agents/utils.ts
1133
- var toolMessageRole = "tool";
1134
- var assistantMessageRole = "assistant";
1135
- var userMessageRole = "user";
1136
- var groupMessagesByToolBoundaries = (messages) => {
1137
- const segments = [];
1138
- let currentSegment = [];
1139
- for (const message2 of messages) {
1140
- currentSegment.push(message2);
1141
- if (message2.role === toolMessageRole) {
1142
- segments.push(currentSegment);
1143
- currentSegment = [];
1144
- }
1145
- }
1146
- if (currentSegment.length > 0) {
1147
- segments.push(currentSegment);
1148
- }
1149
- return segments;
1150
- };
1151
- var segmentHasToolMessages = (segment) => {
1152
- return segment.some((message2) => {
1153
- if (message2.role === toolMessageRole) return true;
1154
- if (message2.role === assistantMessageRole && Array.isArray(message2.content)) {
1155
- return message2.content.some((part) => part.type === "tool-call");
1156
- }
1157
- return false;
1158
- });
1159
- };
1160
- var reverseSegmentRoles = (segment) => {
1161
- return segment.map((message2) => {
1162
- const hasStringContent = typeof message2.content === "string";
1163
- if (!hasStringContent) return message2;
1164
- const roleMap = {
1165
- [userMessageRole]: assistantMessageRole,
1166
- [assistantMessageRole]: userMessageRole
1167
- };
1168
- const newRole = roleMap[message2.role];
1169
- if (!newRole) return message2;
1170
- return {
1171
- role: newRole,
1172
- content: message2.content
1173
- };
1174
- });
1175
- };
1176
- var messageRoleReversal = (messages) => {
1177
- const segments = groupMessagesByToolBoundaries(messages);
1178
- const processedSegments = segments.map(
1179
- (segment) => segmentHasToolMessages(segment) ? segment : reverseSegmentRoles(segment)
1180
- );
1181
- return processedSegments.flat();
1182
- };
1183
- var criterionToParamName = (criterion) => {
1184
- return criterion.replace(/"/g, "").replace(/[^a-zA-Z0-9]/g, "_").replace(/ /g, "_").toLowerCase().substring(0, 70);
1185
- };
1186
-
1187
- // src/config/index.ts
1188
- var logger = new Logger("scenario.config");
1189
- var configLoaded = false;
1190
- var config = null;
1191
- var configLoadPromise = null;
1192
- async function loadProjectConfig() {
1193
- if (configLoaded) {
1194
- return;
1195
- }
1196
- if (configLoadPromise) {
1197
- return configLoadPromise;
1198
- }
1199
- configLoadPromise = (async () => {
1200
- try {
1201
- config = await loadScenarioProjectConfig();
1202
- logger.info("loaded scenario project config", { config });
1203
- } catch (error) {
1204
- logger.error("error loading scenario project config", { error });
1205
- } finally {
1206
- configLoaded = true;
1207
- }
1208
- })();
1209
- return configLoadPromise;
1210
- }
1211
- async function getProjectConfig() {
1212
- await loadProjectConfig();
1213
- return config;
1214
- }
1215
-
1216
- // src/utils/config.ts
1217
- function mergeConfig(config2, projectConfig) {
1218
- if (!projectConfig) {
1219
- return config2;
1220
- }
1221
- return {
1222
- ...projectConfig.defaultModel,
1223
- ...config2
1224
- };
1225
- }
1226
- function mergeAndValidateConfig(config2, projectConfig) {
1227
- var _a;
1228
- const mergedConfig = mergeConfig(config2, projectConfig);
1229
- mergedConfig.model = mergedConfig.model ?? ((_a = projectConfig == null ? void 0 : projectConfig.defaultModel) == null ? void 0 : _a.model);
1230
- if (!mergedConfig.model) {
1231
- throw new Error("Model is required");
1232
- }
1233
- return mergedConfig;
1234
- }
1235
-
1236
- // src/agents/judge-agent.ts
1237
- function buildSystemPrompt(criteria, description) {
1238
- const criteriaList = (criteria == null ? void 0 : criteria.map((criterion, idx) => `${idx + 1}. ${criterion}`).join("\n")) || "No criteria provided";
1239
- return `
1240
- <role>
1241
- You are an LLM as a judge watching a simulated conversation as it plays out live to determine if the agent under test meets the criteria or not.
1242
- </role>
1243
-
1244
- <goal>
1245
- Your goal is to determine if you already have enough information to make a verdict of the scenario below, or if the conversation should continue for longer.
1246
- If you do have enough information, use the finish_test tool to determine if all the criteria have been met, if not, use the continue_test tool to let the next step play out.
1247
- </goal>
1248
-
1249
- <scenario>
1250
- ${description}
1251
- </scenario>
1252
-
1253
- <criteria>
1254
- ${criteriaList}
1255
- </criteria>
1256
-
1257
- <rules>
1258
- - Be strict, do not let the conversation continue if the agent already broke one of the "do not" or "should not" criteria.
1259
- - DO NOT make any judgment calls that are not explicitly listed in the success or failure criteria, withhold judgement if necessary
1260
- </rules>
1261
- `.trim();
1262
- }
1263
- function buildContinueTestTool() {
1264
- return (0, import_ai.tool)({
1265
- description: "Continue the test with the next step",
1266
- parameters: import_zod3.z.object({})
1267
- });
1268
- }
1269
- function buildFinishTestTool(criteria) {
1270
- const criteriaNames = criteria.map(criterionToParamName);
1271
- return (0, import_ai.tool)({
1272
- description: "Complete the test with a final verdict",
1273
- parameters: import_zod3.z.object({
1274
- criteria: import_zod3.z.object(
1275
- Object.fromEntries(
1276
- criteriaNames.map((name, idx) => [
1277
- name,
1278
- import_zod3.z.enum(["true", "false", "inconclusive"]).describe(criteria[idx])
1279
- ])
1280
- )
1281
- ).strict().describe("Strict verdict for each criterion"),
1282
- reasoning: import_zod3.z.string().describe("Explanation of what the final verdict should be"),
1283
- verdict: import_zod3.z.enum(["success", "failure", "inconclusive"]).describe("The final verdict of the test")
1284
- })
1285
- });
1286
- }
1287
- var judgeAgent = (cfg) => {
1288
- return {
1289
- role: "Judge" /* JUDGE */,
1290
- criteria: cfg.criteria,
1291
- call: async (input) => {
1292
- var _a;
1293
- const systemPrompt = cfg.systemPrompt ?? buildSystemPrompt(cfg.criteria, input.scenarioConfig.description);
1294
- const messages = [
1295
- { role: "system", content: systemPrompt },
1296
- ...input.messages
1297
- ];
1298
- const isLastMessage = input.scenarioState.turn == input.scenarioConfig.maxTurns;
1299
- const projectConfig = await getProjectConfig();
1300
- const mergedConfig = mergeAndValidateConfig(cfg, projectConfig);
1301
- if (!mergedConfig.model) {
1302
- throw new Error("Model is required for the judge agent");
1303
- }
1304
- const tools = {
1305
- continue_test: buildContinueTestTool(),
1306
- finish_test: buildFinishTestTool(cfg.criteria)
1307
- };
1308
- const enforceJudgement = input.judgmentRequest;
1309
- const hasCriteria = cfg.criteria.length && cfg.criteria.length > 0;
1310
- if (enforceJudgement && !hasCriteria) {
1311
- return {
1312
- success: false,
1313
- messages: [],
1314
- reasoning: "JudgeAgent: No criteria was provided to be judged against",
1315
- passedCriteria: [],
1316
- failedCriteria: []
1317
- };
1318
- }
1319
- const toolChoice = (isLastMessage || enforceJudgement) && hasCriteria ? { type: "tool", toolName: "finish_test" } : "required";
1320
- const completion = await (0, import_ai.generateText)({
1321
- model: mergedConfig.model,
1322
- messages,
1323
- temperature: mergedConfig.temperature ?? 0,
1324
- maxTokens: mergedConfig.maxTokens,
1325
- tools,
1326
- toolChoice
1327
- });
1328
- let args;
1329
- if ((_a = completion.toolCalls) == null ? void 0 : _a.length) {
1330
- const toolCall = completion.toolCalls[0];
1331
- switch (toolCall.toolName) {
1332
- case "finish_test": {
1333
- args = toolCall.args;
1334
- const verdict = args.verdict || "inconclusive";
1335
- const reasoning = args.reasoning || "No reasoning provided";
1336
- const criteria = args.criteria || {};
1337
- const criteriaValues = Object.values(criteria);
1338
- const passedCriteria = cfg.criteria.filter((_, i) => criteriaValues[i] === "true");
1339
- const failedCriteria = cfg.criteria.filter((_, i) => criteriaValues[i] !== "true");
1340
- return {
1341
- success: verdict === "success",
1342
- messages: input.messages,
1343
- reasoning,
1344
- passedCriteria,
1345
- failedCriteria
1346
- };
1347
- }
1348
- case "continue_test":
1349
- return [];
1350
- default:
1351
- return {
1352
- success: false,
1353
- messages: input.messages,
1354
- reasoning: `JudgeAgent: Unknown tool call: ${toolCall.toolName}`,
1355
- passedCriteria: [],
1356
- failedCriteria: cfg.criteria
1357
- };
1358
- }
1359
- }
1360
- return {
1361
- success: false,
1362
- messages: input.messages,
1363
- reasoning: `JudgeAgent: No tool call found in LLM output`,
1364
- passedCriteria: [],
1365
- failedCriteria: cfg.criteria
1366
- };
1367
- }
1368
- };
1369
- };
1370
-
1371
- // src/agents/user-simulator-agent.ts
1372
- var import_ai2 = require("ai");
1373
- function buildSystemPrompt2(description) {
1374
- return `
1375
- <role>
1376
- You are pretending to be a user, you are testing an AI Agent (shown as the user role) based on a scenario.
1377
- Approach this naturally, as a human user would, with very short inputs, few words, all lowercase, imperative, not periods, like when they google or talk to chatgpt.
1378
- </role>
1379
-
1380
- <goal>
1381
- Your goal (assistant) is to interact with the Agent Under Test (user) as if you were a human user to see if it can complete the scenario successfully.
1382
- </goal>
1383
-
1384
- <scenario>
1385
- ${description}
1386
- </scenario>
1387
-
1388
- <rules>
1389
- - DO NOT carry over any requests yourself, YOU ARE NOT the assistant today, you are the user
1390
- </rules>
1391
- `.trim();
1392
- }
1393
- var userSimulatorAgent = (config2) => {
1394
- return {
1395
- role: "User" /* USER */,
1396
- call: async (input) => {
1397
- const systemPrompt = buildSystemPrompt2(input.scenarioConfig.description);
1398
- const messages = [
1399
- { role: "system", content: systemPrompt },
1400
- { role: "assistant", content: "Hello, how can I help you today" },
1401
- ...input.messages
1402
- ];
1403
- const projectConfig = await getProjectConfig();
1404
- const mergedConfig = mergeAndValidateConfig(config2 ?? {}, projectConfig);
1405
- if (!mergedConfig.model) {
1406
- throw new Error("Model is required for the user simulator agent");
1407
- }
1408
- const reversedMessages = messageRoleReversal(messages);
1409
- const completion = await (0, import_ai2.generateText)({
1410
- model: mergedConfig.model,
1411
- messages: reversedMessages,
1412
- temperature: mergedConfig.temperature ?? 0,
1413
- maxTokens: mergedConfig.maxTokens
1414
- });
1415
- const messageContent = completion.text;
1416
- if (!messageContent) {
1417
- throw new Error("No response content from LLM");
1418
- }
1419
- return { role: "user", content: messageContent };
1420
- }
1421
- };
1483
+ // src/index.ts
1484
+ var scenario = {
1485
+ ...agents_exports,
1486
+ ...domain_exports,
1487
+ ...execution_exports,
1488
+ ...runner_exports,
1489
+ ...script_exports
1422
1490
  };
1491
+ var index_default = scenario;
1423
1492
  // Annotate the CommonJS export names for ESM import in node:
1424
1493
  0 && (module.exports = {
1425
1494
  AgentAdapter,
@@ -1437,6 +1506,7 @@ var userSimulatorAgent = (config2) => {
1437
1506
  message,
1438
1507
  proceed,
1439
1508
  run,
1509
+ scenario,
1440
1510
  scenarioProjectConfigSchema,
1441
1511
  succeed,
1442
1512
  user,