@langwatch/scenario 0.2.0-prerelease.0 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.mjs CHANGED
@@ -1,28 +1,33 @@
1
- // src/script/index.ts
2
- var message = (message2) => {
3
- return (_state, executor) => executor.message(message2);
4
- };
5
- var agent = (content) => {
6
- return (_state, executor) => executor.agent(content);
7
- };
8
- var judge = (content) => {
9
- return (_state, executor) => executor.judge(content);
10
- };
11
- var user = (content) => {
12
- return (_state, executor) => executor.user(content);
13
- };
14
- var proceed = (turns, onTurn, onStep) => {
15
- return (_state, executor) => executor.proceed(turns, onTurn, onStep);
16
- };
17
- var succeed = (reasoning) => {
18
- return (_state, executor) => executor.succeed(reasoning);
19
- };
20
- var fail = (reasoning) => {
21
- return (_state, executor) => executor.fail(reasoning);
22
- };
1
+ import {
2
+ EventBus,
3
+ Logger
4
+ } from "./chunk-ORWSJC5F.mjs";
5
+ import {
6
+ __export
7
+ } from "./chunk-7P6ASYW6.mjs";
23
8
 
24
- // src/execution/scenario-execution.ts
25
- import { Subject } from "rxjs";
9
+ // src/agents/index.ts
10
+ var agents_exports = {};
11
+ __export(agents_exports, {
12
+ judgeAgent: () => judgeAgent,
13
+ userSimulatorAgent: () => userSimulatorAgent
14
+ });
15
+
16
+ // src/agents/judge-agent.ts
17
+ import { generateText, tool } from "ai";
18
+ import { z as z2 } from "zod";
19
+
20
+ // src/domain/index.ts
21
+ var domain_exports = {};
22
+ __export(domain_exports, {
23
+ AgentAdapter: () => AgentAdapter,
24
+ AgentRole: () => AgentRole,
25
+ JudgeAgentAdapter: () => JudgeAgentAdapter,
26
+ UserSimulatorAgentAdapter: () => UserSimulatorAgentAdapter,
27
+ allAgentRoles: () => allAgentRoles,
28
+ defineConfig: () => defineConfig,
29
+ scenarioProjectConfigSchema: () => scenarioProjectConfigSchema
30
+ });
26
31
 
27
32
  // src/domain/core/config.ts
28
33
  import { z } from "zod";
@@ -66,347 +71,448 @@ var JudgeAgentAdapter = class {
66
71
  }
67
72
  };
68
73
 
69
- // src/utils/ids.ts
70
- import { generate, parse } from "xksuid";
71
- var batchRunId = null;
72
- function generateThreadId() {
73
- return `thread_${generate()}`;
74
- }
75
- function generateScenarioRunId() {
76
- return `scenariorun_${generate()}`;
77
- }
78
- function generateScenarioId() {
79
- return `scenario_${generate()}`;
80
- }
81
- function getBatchRunId() {
82
- if (!batchRunId) {
83
- batchRunId = process.env.SCENARIO_BATCH_RUN_ID ?? `scenariobatchrun_${generate()}`;
84
- }
85
- return batchRunId;
86
- }
87
- function generateMessageId() {
88
- return `scenariomsg_${generate()}`;
89
- }
90
-
91
- // src/execution/scenario-execution-state.ts
92
- var ScenarioExecutionState = class {
93
- _history = [];
94
- _turn = 0;
95
- _partialResult = null;
96
- _threadId = "";
97
- _agents = [];
98
- _pendingMessages = /* @__PURE__ */ new Map();
99
- _pendingRolesOnTurn = [];
100
- _pendingAgentsOnTurn = /* @__PURE__ */ new Set();
101
- _agentTimes = /* @__PURE__ */ new Map();
102
- _totalStartTime = 0;
103
- /**
104
- * Creates a new ScenarioExecutionState.
105
- */
106
- constructor() {
107
- this._totalStartTime = Date.now();
108
- }
109
- setThreadId(threadId) {
110
- this._threadId = threadId;
111
- }
112
- setAgents(agents) {
113
- this._agents = agents;
114
- this._pendingMessages.clear();
115
- this._agentTimes.clear();
116
- }
117
- appendMessage(role, content) {
118
- const message2 = { role, content };
119
- this._history.push({ ...message2, id: generateMessageId() });
120
- }
121
- appendUserMessage(content) {
122
- this.appendMessage("user", content);
123
- }
124
- appendAssistantMessage(content) {
125
- this.appendMessage("assistant", content);
126
- }
127
- addMessage(message2, fromAgentIdx) {
128
- this._history.push({ ...message2, id: generateMessageId() });
129
- for (let idx = 0; idx < this._agents.length; idx++) {
130
- if (idx === fromAgentIdx) continue;
131
- if (!this._pendingMessages.has(idx)) {
132
- this._pendingMessages.set(idx, []);
133
- }
134
- this._pendingMessages.get(idx).push(message2);
135
- }
136
- }
137
- addMessages(messages, fromAgentIdx) {
138
- for (const message2 of messages) {
139
- this.addMessage(message2, fromAgentIdx);
140
- }
141
- }
142
- getPendingMessages(agentIdx) {
143
- return this._pendingMessages.get(agentIdx) || [];
144
- }
145
- clearPendingMessages(agentIdx) {
146
- this._pendingMessages.set(agentIdx, []);
147
- }
148
- newTurn() {
149
- this._pendingAgentsOnTurn = new Set(this._agents);
150
- this._pendingRolesOnTurn = [
151
- "User" /* USER */,
152
- "Agent" /* AGENT */,
153
- "Judge" /* JUDGE */
154
- ];
155
- if (this._turn === null) {
156
- this._turn = 1;
157
- } else {
158
- this._turn++;
159
- }
160
- }
161
- removePendingRole(role) {
162
- const index = this._pendingRolesOnTurn.indexOf(role);
163
- if (index > -1) {
164
- this._pendingRolesOnTurn.splice(index, 1);
74
+ // src/agents/utils.ts
75
+ var toolMessageRole = "tool";
76
+ var assistantMessageRole = "assistant";
77
+ var userMessageRole = "user";
78
+ var groupMessagesByToolBoundaries = (messages) => {
79
+ const segments = [];
80
+ let currentSegment = [];
81
+ for (const message2 of messages) {
82
+ currentSegment.push(message2);
83
+ if (message2.role === toolMessageRole) {
84
+ segments.push(currentSegment);
85
+ currentSegment = [];
165
86
  }
166
87
  }
167
- removePendingAgent(agent2) {
168
- this._pendingAgentsOnTurn.delete(agent2);
88
+ if (currentSegment.length > 0) {
89
+ segments.push(currentSegment);
169
90
  }
170
- getNextAgentForRole(role) {
171
- for (let i = 0; i < this._agents.length; i++) {
172
- const agent2 = this._agents[i];
173
- if (agent2.role === role && this._pendingAgentsOnTurn.has(agent2)) {
174
- return { index: i, agent: agent2 };
175
- }
91
+ return segments;
92
+ };
93
+ var segmentHasToolMessages = (segment) => {
94
+ return segment.some((message2) => {
95
+ if (message2.role === toolMessageRole) return true;
96
+ if (message2.role === assistantMessageRole && Array.isArray(message2.content)) {
97
+ return message2.content.some((part) => part.type === "tool-call");
176
98
  }
177
- return null;
178
- }
179
- addAgentTime(agentIdx, time) {
180
- const currentTime = this._agentTimes.get(agentIdx) || 0;
181
- this._agentTimes.set(agentIdx, currentTime + time);
182
- }
183
- hasResult() {
184
- return this._partialResult !== null;
185
- }
186
- setResult(result) {
187
- this._partialResult = result;
188
- }
189
- get lastMessage() {
190
- return this._history[this._history.length - 1];
191
- }
192
- get lastUserMessage() {
193
- return this._history.findLast((message2) => message2.role === "user");
194
- }
195
- get lastAssistantMessage() {
196
- return this._history.findLast((message2) => message2.role === "assistant");
197
- }
198
- get lastToolCall() {
199
- return this._history.findLast((message2) => message2.role === "tool");
200
- }
201
- getLastToolCallByToolName(toolName) {
202
- const toolMessage = this._history.findLast(
203
- (message2) => message2.role === "tool" && message2.content.find(
204
- (part) => part.type === "tool-result" && part.toolName === toolName
205
- )
206
- );
207
- return toolMessage;
208
- }
209
- hasToolCall(toolName) {
210
- return this._history.some(
211
- (message2) => message2.role === "tool" && message2.content.find(
212
- (part) => part.type === "tool-result" && part.toolName === toolName
213
- )
214
- );
215
- }
216
- get history() {
217
- return this._history;
218
- }
219
- get historyWithoutLastMessage() {
220
- return this._history.slice(0, -1);
221
- }
222
- get historyWithoutLastUserMessage() {
223
- const lastUserMessageIndex = this._history.findLastIndex((message2) => message2.role === "user");
224
- if (lastUserMessageIndex === -1) return this._history;
225
- return this._history.slice(0, lastUserMessageIndex);
226
- }
227
- get turn() {
228
- return this._turn;
229
- }
230
- set turn(turn) {
231
- this._turn = turn;
232
- }
233
- get threadId() {
234
- return this._threadId;
235
- }
236
- get agents() {
237
- return this._agents;
238
- }
239
- get pendingRolesOnTurn() {
240
- return this._pendingRolesOnTurn;
241
- }
242
- set pendingRolesOnTurn(roles) {
243
- this._pendingRolesOnTurn = roles;
244
- }
245
- get pendingAgentsOnTurn() {
246
- return Array.from(this._pendingAgentsOnTurn);
247
- }
248
- set pendingAgentsOnTurn(agents) {
249
- this._pendingAgentsOnTurn = new Set(agents);
250
- }
251
- get partialResult() {
252
- return this._partialResult;
253
- }
254
- get totalTime() {
255
- return Date.now() - this._totalStartTime;
256
- }
257
- get agentTimes() {
258
- return new Map(this._agentTimes);
259
- }
260
- removeLastPendingRole() {
261
- this._pendingRolesOnTurn.pop();
262
- }
99
+ return false;
100
+ });
101
+ };
102
+ var reverseSegmentRoles = (segment) => {
103
+ return segment.map((message2) => {
104
+ const hasStringContent = typeof message2.content === "string";
105
+ if (!hasStringContent) return message2;
106
+ const roleMap = {
107
+ [userMessageRole]: assistantMessageRole,
108
+ [assistantMessageRole]: userMessageRole
109
+ };
110
+ const newRole = roleMap[message2.role];
111
+ if (!newRole) return message2;
112
+ return {
113
+ role: newRole,
114
+ content: message2.content
115
+ };
116
+ });
117
+ };
118
+ var messageRoleReversal = (messages) => {
119
+ const segments = groupMessagesByToolBoundaries(messages);
120
+ const processedSegments = segments.map(
121
+ (segment) => segmentHasToolMessages(segment) ? segment : reverseSegmentRoles(segment)
122
+ );
123
+ return processedSegments.flat();
124
+ };
125
+ var criterionToParamName = (criterion) => {
126
+ return criterion.replace(/"/g, "").replace(/[^a-zA-Z0-9]/g, "_").replace(/ /g, "_").toLowerCase().substring(0, 70);
263
127
  };
264
128
 
265
- // src/events/schema.ts
266
- import { EventType, MessagesSnapshotEventSchema } from "@ag-ui/core";
267
- import { z as z2 } from "zod";
268
- var ScenarioRunStatus = /* @__PURE__ */ ((ScenarioRunStatus2) => {
269
- ScenarioRunStatus2["SUCCESS"] = "SUCCESS";
270
- ScenarioRunStatus2["ERROR"] = "ERROR";
271
- ScenarioRunStatus2["CANCELLED"] = "CANCELLED";
272
- ScenarioRunStatus2["IN_PROGRESS"] = "IN_PROGRESS";
273
- ScenarioRunStatus2["PENDING"] = "PENDING";
274
- ScenarioRunStatus2["FAILED"] = "FAILED";
275
- return ScenarioRunStatus2;
276
- })(ScenarioRunStatus || {});
277
- var baseEventSchema = z2.object({
278
- type: z2.nativeEnum(EventType),
279
- timestamp: z2.number().optional(),
280
- rawEvent: z2.any().optional()
281
- });
282
- var baseScenarioEventSchema = baseEventSchema.extend({
283
- batchRunId: z2.string(),
284
- scenarioId: z2.string(),
285
- scenarioRunId: z2.string()
286
- });
287
- var scenarioRunStartedSchema = baseScenarioEventSchema.extend({
288
- type: z2.literal("SCENARIO_RUN_STARTED" /* RUN_STARTED */),
289
- metadata: z2.object({
290
- name: z2.string(),
291
- description: z2.string().optional()
292
- // config: z.record(z.unknown()).optional(),
293
- })
294
- });
295
- var scenarioRunFinishedSchema = baseScenarioEventSchema.extend({
296
- type: z2.literal("SCENARIO_RUN_FINISHED" /* RUN_FINISHED */),
297
- status: z2.nativeEnum(ScenarioRunStatus)
298
- // error: z
299
- // .object({
300
- // message: z.string(),
301
- // code: z.string().optional(),
302
- // stack: z.string().optional(),
303
- // })
304
- // .optional(),
305
- // metrics: z.record(z.number()).optional(),
306
- });
307
- var scenarioMessageSnapshotSchema = MessagesSnapshotEventSchema.merge(
308
- baseScenarioEventSchema.extend({
309
- type: z2.literal("SCENARIO_MESSAGE_SNAPSHOT" /* MESSAGE_SNAPSHOT */)
310
- })
311
- );
312
- var scenarioEventSchema = z2.discriminatedUnion("type", [
313
- scenarioRunStartedSchema,
314
- scenarioRunFinishedSchema,
315
- scenarioMessageSnapshotSchema
316
- ]);
317
- var successSchema = z2.object({ success: z2.boolean() });
318
- var errorSchema = z2.object({ error: z2.string() });
319
- var stateSchema = z2.object({
320
- state: z2.object({
321
- messages: z2.array(z2.any()),
322
- status: z2.string()
323
- })
324
- });
325
- var runsSchema = z2.object({ runs: z2.array(z2.string()) });
326
- var eventsSchema = z2.object({ events: z2.array(scenarioEventSchema) });
327
-
328
- // src/utils/logger.ts
329
- var Logger = class _Logger {
330
- constructor(context) {
331
- this.context = context;
332
- }
333
- /**
334
- * Creates a logger with context (e.g., class name)
335
- */
336
- static create(context) {
337
- return new _Logger(context);
338
- }
339
- /**
340
- * Checks if logging should occur based on LOG_LEVEL env var
341
- */
342
- shouldLog(level) {
343
- const logLevel = (process.env.SCENARIO_LOG_LEVEL || "").toLowerCase();
344
- const levels = ["error", "warn", "info", "debug"];
345
- const currentLevelIndex = levels.indexOf(logLevel);
346
- const requestedLevelIndex = levels.indexOf(level);
347
- return currentLevelIndex >= 0 && requestedLevelIndex <= currentLevelIndex;
348
- }
349
- formatMessage(message2) {
350
- return this.context ? `[${this.context}] ${message2}` : message2;
351
- }
352
- error(message2, data) {
353
- if (this.shouldLog("error")) {
354
- const formattedMessage = this.formatMessage(message2);
355
- if (data) {
356
- console.error(formattedMessage, data);
357
- } else {
358
- console.error(formattedMessage);
129
+ // src/config/load.ts
130
+ import fs from "node:fs/promises";
131
+ import path from "node:path";
132
+ import { pathToFileURL } from "node:url";
133
+ async function loadScenarioProjectConfig() {
134
+ const cwd = process.cwd();
135
+ const configNames = [
136
+ "scenario.config.js",
137
+ "scenario.config.mjs"
138
+ ];
139
+ for (const name of configNames) {
140
+ const fullPath = path.join(cwd, name);
141
+ try {
142
+ await fs.access(fullPath);
143
+ const configModule = await import(pathToFileURL(fullPath).href);
144
+ const config2 = configModule.default || configModule;
145
+ const parsed = scenarioProjectConfigSchema.safeParse(config2);
146
+ if (!parsed.success) {
147
+ throw new Error(
148
+ `Invalid config file ${name}: ${JSON.stringify(parsed.error.format(), null, 2)}`
149
+ );
359
150
  }
360
- }
361
- }
362
- warn(message2, data) {
363
- if (this.shouldLog("warn")) {
364
- const formattedMessage = this.formatMessage(message2);
365
- if (data) {
366
- console.warn(formattedMessage, data);
367
- } else {
368
- console.warn(formattedMessage);
151
+ return parsed.data;
152
+ } catch (error) {
153
+ if (error instanceof Error && "code" in error && error.code === "ENOENT") {
154
+ continue;
369
155
  }
156
+ throw error;
370
157
  }
371
158
  }
372
- info(message2, data) {
373
- if (this.shouldLog("info")) {
374
- const formattedMessage = this.formatMessage(message2);
375
- if (data) {
376
- console.info(formattedMessage, data);
377
- } else {
378
- console.info(formattedMessage);
379
- }
380
- }
159
+ return await scenarioProjectConfigSchema.parseAsync({});
160
+ }
161
+
162
+ // src/config/index.ts
163
+ var logger = new Logger("scenario.config");
164
+ var configLoaded = false;
165
+ var config = null;
166
+ var configLoadPromise = null;
167
+ async function loadProjectConfig() {
168
+ if (configLoaded) {
169
+ return;
381
170
  }
382
- debug(message2, data) {
383
- if (this.shouldLog("debug")) {
384
- const formattedMessage = this.formatMessage(message2);
385
- if (data) {
386
- console.log(formattedMessage, data);
387
- } else {
388
- console.log(formattedMessage);
389
- }
390
- }
171
+ if (configLoadPromise) {
172
+ return configLoadPromise;
391
173
  }
392
- };
174
+ configLoadPromise = (async () => {
175
+ try {
176
+ config = await loadScenarioProjectConfig();
177
+ logger.info("loaded scenario project config", { config });
178
+ } catch (error) {
179
+ logger.error("error loading scenario project config", { error });
180
+ } finally {
181
+ configLoaded = true;
182
+ }
183
+ })();
184
+ return configLoadPromise;
185
+ }
186
+ async function getProjectConfig() {
187
+ await loadProjectConfig();
188
+ return config;
189
+ }
393
190
 
394
- // src/execution/scenario-execution.ts
395
- var batchRunId2 = getBatchRunId();
396
- function convertAgentReturnTypesToMessages(response, role) {
397
- if (typeof response === "string")
398
- return [{ role, content: response }];
399
- if (Array.isArray(response))
400
- return response;
401
- if (typeof response === "object" && "role" in response)
402
- return [response];
403
- return [];
191
+ // src/utils/config.ts
192
+ function mergeConfig(config2, projectConfig) {
193
+ if (!projectConfig) {
194
+ return config2;
195
+ }
196
+ return {
197
+ ...projectConfig.defaultModel,
198
+ ...config2
199
+ };
200
+ }
201
+ function mergeAndValidateConfig(config2, projectConfig) {
202
+ var _a;
203
+ const mergedConfig = mergeConfig(config2, projectConfig);
204
+ mergedConfig.model = mergedConfig.model ?? ((_a = projectConfig == null ? void 0 : projectConfig.defaultModel) == null ? void 0 : _a.model);
205
+ if (!mergedConfig.model) {
206
+ throw new Error("Model is required");
207
+ }
208
+ return mergedConfig;
209
+ }
210
+
211
+ // src/agents/judge-agent.ts
212
+ function buildSystemPrompt(criteria, description) {
213
+ const criteriaList = (criteria == null ? void 0 : criteria.map((criterion, idx) => `${idx + 1}. ${criterion}`).join("\n")) || "No criteria provided";
214
+ return `
215
+ <role>
216
+ You are an LLM as a judge watching a simulated conversation as it plays out live to determine if the agent under test meets the criteria or not.
217
+ </role>
218
+
219
+ <goal>
220
+ Your goal is to determine if you already have enough information to make a verdict of the scenario below, or if the conversation should continue for longer.
221
+ If you do have enough information, use the finish_test tool to determine if all the criteria have been met, if not, use the continue_test tool to let the next step play out.
222
+ </goal>
223
+
224
+ <scenario>
225
+ ${description}
226
+ </scenario>
227
+
228
+ <criteria>
229
+ ${criteriaList}
230
+ </criteria>
231
+
232
+ <rules>
233
+ - Be strict, do not let the conversation continue if the agent already broke one of the "do not" or "should not" criteria.
234
+ - DO NOT make any judgment calls that are not explicitly listed in the success or failure criteria, withhold judgement if necessary
235
+ </rules>
236
+ `.trim();
237
+ }
238
+ function buildContinueTestTool() {
239
+ return tool({
240
+ description: "Continue the test with the next step",
241
+ parameters: z2.object({})
242
+ });
243
+ }
244
+ function buildFinishTestTool(criteria) {
245
+ const criteriaNames = criteria.map(criterionToParamName);
246
+ return tool({
247
+ description: "Complete the test with a final verdict",
248
+ parameters: z2.object({
249
+ criteria: z2.object(
250
+ Object.fromEntries(
251
+ criteriaNames.map((name, idx) => [
252
+ name,
253
+ z2.enum(["true", "false", "inconclusive"]).describe(criteria[idx])
254
+ ])
255
+ )
256
+ ).strict().describe("Strict verdict for each criterion"),
257
+ reasoning: z2.string().describe("Explanation of what the final verdict should be"),
258
+ verdict: z2.enum(["success", "failure", "inconclusive"]).describe("The final verdict of the test")
259
+ })
260
+ });
404
261
  }
262
+ var judgeAgent = (cfg) => {
263
+ return {
264
+ role: "Judge" /* JUDGE */,
265
+ criteria: cfg.criteria,
266
+ call: async (input) => {
267
+ var _a;
268
+ const systemPrompt = cfg.systemPrompt ?? buildSystemPrompt(cfg.criteria, input.scenarioConfig.description);
269
+ const messages = [
270
+ { role: "system", content: systemPrompt },
271
+ ...input.messages
272
+ ];
273
+ const isLastMessage = input.scenarioState.currentTurn === input.scenarioConfig.maxTurns;
274
+ const projectConfig = await getProjectConfig();
275
+ const mergedConfig = mergeAndValidateConfig(cfg, projectConfig);
276
+ if (!mergedConfig.model) {
277
+ throw new Error("Model is required for the judge agent");
278
+ }
279
+ const tools = {
280
+ continue_test: buildContinueTestTool(),
281
+ finish_test: buildFinishTestTool(cfg.criteria)
282
+ };
283
+ const enforceJudgement = input.judgmentRequest;
284
+ const hasCriteria = cfg.criteria.length && cfg.criteria.length > 0;
285
+ if (enforceJudgement && !hasCriteria) {
286
+ return {
287
+ success: false,
288
+ messages: [],
289
+ reasoning: "JudgeAgent: No criteria was provided to be judged against",
290
+ metCriteria: [],
291
+ unmetCriteria: []
292
+ };
293
+ }
294
+ const toolChoice = (isLastMessage || enforceJudgement) && hasCriteria ? { type: "tool", toolName: "finish_test" } : "required";
295
+ const completion = await generateText({
296
+ model: mergedConfig.model,
297
+ messages,
298
+ temperature: mergedConfig.temperature ?? 0,
299
+ maxTokens: mergedConfig.maxTokens,
300
+ tools,
301
+ toolChoice
302
+ });
303
+ let args;
304
+ if ((_a = completion.toolCalls) == null ? void 0 : _a.length) {
305
+ const toolCall = completion.toolCalls[0];
306
+ switch (toolCall.toolName) {
307
+ case "finish_test": {
308
+ args = toolCall.args;
309
+ const verdict = args.verdict || "inconclusive";
310
+ const reasoning = args.reasoning || "No reasoning provided";
311
+ const criteria = args.criteria || {};
312
+ const criteriaValues = Object.values(criteria);
313
+ const metCriteria = cfg.criteria.filter((_, i) => criteriaValues[i] === "true");
314
+ const unmetCriteria = cfg.criteria.filter((_, i) => criteriaValues[i] !== "true");
315
+ return {
316
+ success: verdict === "success",
317
+ messages: input.messages,
318
+ reasoning,
319
+ metCriteria,
320
+ unmetCriteria
321
+ };
322
+ }
323
+ case "continue_test":
324
+ return [];
325
+ default:
326
+ return {
327
+ success: false,
328
+ messages: input.messages,
329
+ reasoning: `JudgeAgent: Unknown tool call: ${toolCall.toolName}`,
330
+ metCriteria: [],
331
+ unmetCriteria: cfg.criteria
332
+ };
333
+ }
334
+ }
335
+ return {
336
+ success: false,
337
+ messages: input.messages,
338
+ reasoning: `JudgeAgent: No tool call found in LLM output`,
339
+ metCriteria: [],
340
+ unmetCriteria: cfg.criteria
341
+ };
342
+ }
343
+ };
344
+ };
345
+
346
+ // src/agents/user-simulator-agent.ts
347
+ import { generateText as generateText2 } from "ai";
348
+ function buildSystemPrompt2(description) {
349
+ return `
350
+ <role>
351
+ You are pretending to be a user, you are testing an AI Agent (shown as the user role) based on a scenario.
352
+ Approach this naturally, as a human user would, with very short inputs, few words, all lowercase, imperative, not periods, like when they google or talk to chatgpt.
353
+ </role>
354
+
355
+ <goal>
356
+ Your goal (assistant) is to interact with the Agent Under Test (user) as if you were a human user to see if it can complete the scenario successfully.
357
+ </goal>
358
+
359
+ <scenario>
360
+ ${description}
361
+ </scenario>
362
+
363
+ <rules>
364
+ - DO NOT carry over any requests yourself, YOU ARE NOT the assistant today, you are the user
365
+ </rules>
366
+ `.trim();
367
+ }
368
+ var userSimulatorAgent = (config2) => {
369
+ return {
370
+ role: "User" /* USER */,
371
+ call: async (input) => {
372
+ const systemPrompt = buildSystemPrompt2(input.scenarioConfig.description);
373
+ const messages = [
374
+ { role: "system", content: systemPrompt },
375
+ { role: "assistant", content: "Hello, how can I help you today" },
376
+ ...input.messages
377
+ ];
378
+ const projectConfig = await getProjectConfig();
379
+ const mergedConfig = mergeAndValidateConfig(config2 ?? {}, projectConfig);
380
+ if (!mergedConfig.model) {
381
+ throw new Error("Model is required for the user simulator agent");
382
+ }
383
+ const reversedMessages = messageRoleReversal(messages);
384
+ const completion = await generateText2({
385
+ model: mergedConfig.model,
386
+ messages: reversedMessages,
387
+ temperature: mergedConfig.temperature ?? 0,
388
+ maxTokens: mergedConfig.maxTokens
389
+ });
390
+ const messageContent = completion.text;
391
+ if (!messageContent) {
392
+ throw new Error("No response content from LLM");
393
+ }
394
+ return { role: "user", content: messageContent };
395
+ }
396
+ };
397
+ };
398
+
399
+ // src/execution/index.ts
400
+ var execution_exports = {};
401
+ __export(execution_exports, {
402
+ ScenarioExecution: () => ScenarioExecution,
403
+ ScenarioExecutionState: () => ScenarioExecutionState
404
+ });
405
+
406
+ // src/execution/scenario-execution.ts
407
+ import { Subject } from "rxjs";
408
+
409
+ // src/utils/ids.ts
410
+ import { generate, parse } from "xksuid";
411
+ var batchRunId = null;
412
+ function generateThreadId() {
413
+ return `thread_${generate()}`;
414
+ }
415
+ function generateScenarioRunId() {
416
+ return `scenariorun_${generate()}`;
417
+ }
418
+ function generateScenarioId() {
419
+ return `scenario_${generate()}`;
420
+ }
421
+ function getBatchRunId() {
422
+ if (!batchRunId) {
423
+ batchRunId = process.env.SCENARIO_BATCH_RUN_ID ?? `scenariobatchrun_${generate()}`;
424
+ }
425
+ return batchRunId;
426
+ }
427
+ function generateMessageId() {
428
+ return `scenariomsg_${generate()}`;
429
+ }
430
+
431
+ // src/execution/scenario-execution-state.ts
432
+ var ScenarioExecutionState = class {
433
+ _messages = [];
434
+ _currentTurn = 0;
435
+ _threadId = "";
436
+ description;
437
+ config;
438
+ constructor(config2) {
439
+ this.config = config2;
440
+ this.description = config2.description;
441
+ }
442
+ get messages() {
443
+ return this._messages;
444
+ }
445
+ get currentTurn() {
446
+ return this._currentTurn;
447
+ }
448
+ set currentTurn(turn) {
449
+ this._currentTurn = turn;
450
+ }
451
+ get threadId() {
452
+ return this._threadId;
453
+ }
454
+ set threadId(value) {
455
+ this._threadId = value;
456
+ }
457
+ /**
458
+ * Adds a message to the conversation history.
459
+ *
460
+ * @param message - The message to add.
461
+ */
462
+ addMessage(message2) {
463
+ this._messages.push({ ...message2, id: generateMessageId() });
464
+ }
465
+ lastMessage() {
466
+ if (this._messages.length === 0) {
467
+ throw new Error("No messages in history");
468
+ }
469
+ return this._messages[this._messages.length - 1];
470
+ }
471
+ lastUserMessage() {
472
+ if (this._messages.length === 0) {
473
+ throw new Error("No messages in history");
474
+ }
475
+ const lastMessage = this._messages.findLast((message2) => message2.role === "user");
476
+ if (!lastMessage) {
477
+ throw new Error("No user message in history");
478
+ }
479
+ return lastMessage;
480
+ }
481
+ lastToolCall(toolName) {
482
+ if (this._messages.length === 0) {
483
+ throw new Error("No messages in history");
484
+ }
485
+ const lastMessage = this._messages.findLast((message2) => message2.role === "tool" && message2.content.find(
486
+ (part) => part.type === "tool-result" && part.toolName === toolName
487
+ ));
488
+ if (!lastMessage) {
489
+ throw new Error("No tool call message in history");
490
+ }
491
+ return lastMessage;
492
+ }
493
+ hasToolCall(toolName) {
494
+ return this._messages.some(
495
+ (message2) => message2.role === "tool" && message2.content.find(
496
+ (part) => part.type === "tool-result" && part.toolName === toolName
497
+ )
498
+ );
499
+ }
500
+ };
501
+
502
+ // src/execution/scenario-execution.ts
503
+ var batchRunId2 = getBatchRunId();
405
504
  var ScenarioExecution = class {
406
- state = new ScenarioExecutionState();
505
+ state;
407
506
  eventSubject = new Subject();
408
507
  logger = new Logger("scenario.execution.ScenarioExecution");
409
508
  config;
509
+ agents = [];
510
+ pendingRolesOnTurn = [];
511
+ pendingAgentsOnTurn = /* @__PURE__ */ new Set();
512
+ pendingMessages = /* @__PURE__ */ new Map();
513
+ partialResult = null;
514
+ agentTimes = /* @__PURE__ */ new Map();
515
+ totalStartTime = 0;
410
516
  /**
411
517
  * An observable stream of events that occur during the scenario execution.
412
518
  * Subscribe to this to monitor the progress of the scenario in real-time.
@@ -426,15 +532,17 @@ var ScenarioExecution = class {
426
532
  script,
427
533
  verbose: config2.verbose ?? false,
428
534
  maxTurns: config2.maxTurns ?? 10,
429
- threadId: config2.threadId ?? generateThreadId()
535
+ threadId: config2.threadId ?? generateThreadId(),
536
+ setId: config2.setId
430
537
  };
538
+ this.state = new ScenarioExecutionState(this.config);
431
539
  this.reset();
432
540
  }
433
541
  /**
434
542
  * The history of messages in the conversation.
435
543
  */
436
- get history() {
437
- return this.state.history;
544
+ get messages() {
545
+ return this.state.messages;
438
546
  }
439
547
  /**
440
548
  * The unique identifier for the conversation thread.
@@ -442,6 +550,12 @@ var ScenarioExecution = class {
442
550
  get threadId() {
443
551
  return this.state.threadId;
444
552
  }
553
+ /**
554
+ * The total elapsed time for the scenario execution.
555
+ */
556
+ get totalTime() {
557
+ return Date.now() - this.totalStartTime;
558
+ }
445
559
  /**
446
560
  * Executes the entire scenario from start to finish.
447
561
  * This will run through the script and any automatic proceeding logic until a
@@ -462,7 +576,8 @@ var ScenarioExecution = class {
462
576
  if (result && typeof result === "object" && "success" in result) {
463
577
  this.emitRunFinished({
464
578
  scenarioRunId,
465
- status: result.success ? "SUCCESS" /* SUCCESS */ : "FAILED" /* FAILED */
579
+ status: result.success ? "SUCCESS" /* SUCCESS */ : "FAILED" /* FAILED */,
580
+ result
466
581
  });
467
582
  return result;
468
583
  }
@@ -475,11 +590,20 @@ var ScenarioExecution = class {
475
590
  "- `Scenario.succeed()` or `Scenario.fail()` to end the test with an explicit result"
476
591
  ].join("\n"));
477
592
  } catch (error) {
593
+ const errorResult = {
594
+ success: false,
595
+ messages: this.state.messages,
596
+ reasoning: `Scenario failed with error: ${error instanceof Error ? error.message : String(error)}`,
597
+ metCriteria: [],
598
+ unmetCriteria: [],
599
+ error: error instanceof Error ? error.message : String(error)
600
+ };
478
601
  this.emitRunFinished({
479
602
  scenarioRunId,
480
- status: "ERROR" /* ERROR */
603
+ status: "ERROR" /* ERROR */,
604
+ result: errorResult
481
605
  });
482
- throw error;
606
+ return errorResult;
483
607
  }
484
608
  }
485
609
  /**
@@ -494,29 +618,29 @@ var ScenarioExecution = class {
494
618
  return result;
495
619
  }
496
620
  async _step(goToNextTurn = true, onTurn) {
497
- if (this.state.pendingRolesOnTurn.length === 0) {
621
+ if (this.pendingRolesOnTurn.length === 0) {
498
622
  if (!goToNextTurn) return null;
499
- this.state.newTurn();
623
+ this.newTurn();
500
624
  if (onTurn) await onTurn(this.state);
501
- if (this.state.turn != null && this.state.turn >= this.config.maxTurns)
625
+ if (this.state.currentTurn >= this.config.maxTurns)
502
626
  return this.reachedMaxTurns();
503
627
  }
504
- const currentRole = this.state.pendingRolesOnTurn[0];
628
+ const currentRole = this.pendingRolesOnTurn[0];
505
629
  const { idx, agent: nextAgent } = this.nextAgentForRole(currentRole);
506
630
  if (!nextAgent) {
507
- this.state.removePendingRole(currentRole);
631
+ this.removePendingRole(currentRole);
508
632
  return this._step(goToNextTurn, onTurn);
509
633
  }
510
- this.state.removePendingAgent(nextAgent);
634
+ this.removePendingAgent(nextAgent);
511
635
  return await this.callAgent(idx, currentRole);
512
636
  }
513
637
  async callAgent(idx, role, judgmentRequest = false) {
514
- const agent2 = this.state.agents[idx];
638
+ const agent2 = this.agents[idx];
515
639
  const startTime = Date.now();
516
640
  const agentInput = {
517
641
  threadId: this.state.threadId,
518
- messages: this.state.history,
519
- newMessages: this.state.getPendingMessages(idx),
642
+ messages: this.state.messages,
643
+ newMessages: this.pendingMessages.get(idx) ?? [],
520
644
  requestedRole: role,
521
645
  judgmentRequest,
522
646
  scenarioState: this.state,
@@ -524,106 +648,22 @@ var ScenarioExecution = class {
524
648
  };
525
649
  const agentResponse = await agent2.call(agentInput);
526
650
  const endTime = Date.now();
527
- this.state.addAgentTime(idx, endTime - startTime);
528
- this.state.clearPendingMessages(idx);
529
- if (typeof agentResponse === "object" && agentResponse && "success" in agentResponse) {
651
+ this.addAgentTime(idx, endTime - startTime);
652
+ this.pendingMessages.delete(idx);
653
+ if (agentResponse && typeof agentResponse === "object" && "success" in agentResponse) {
530
654
  return agentResponse;
531
655
  }
656
+ const currentAgentTime = this.agentTimes.get(idx) ?? 0;
657
+ this.agentTimes.set(idx, currentAgentTime + (Date.now() - startTime));
532
658
  const messages = convertAgentReturnTypesToMessages(
533
659
  agentResponse,
534
660
  role === "User" /* USER */ ? "user" : "assistant"
535
661
  );
536
- this.state.addMessages(messages, idx);
537
- return messages;
538
- }
539
- nextAgentForRole(role) {
540
- for (const agent2 of this.state.agents) {
541
- if (agent2.role === role && this.state.pendingAgentsOnTurn.includes(agent2) && this.state.pendingRolesOnTurn.includes(role)) {
542
- return { idx: this.state.agents.indexOf(agent2), agent: agent2 };
543
- }
544
- }
545
- return { idx: -1, agent: null };
546
- }
547
- reachedMaxTurns(errorMessage) {
548
- var _a;
549
- const agentRoleAgentsIdx = this.state.agents.map((agent2, i) => ({ agent: agent2, idx: i })).filter(({ agent: agent2 }) => agent2.role === "Agent" /* AGENT */).map(({ idx }) => idx);
550
- const agentTimes = agentRoleAgentsIdx.map((i) => this.state.agentTimes.get(i) || 0);
551
- const totalAgentTime = agentTimes.reduce((sum, time) => sum + time, 0);
552
- return {
553
- success: false,
554
- messages: this.state.history,
555
- reasoning: errorMessage || `Reached maximum turns (${this.config.maxTurns || 10}) without conclusion`,
556
- passedCriteria: [],
557
- failedCriteria: ((_a = this.getJudgeAgent()) == null ? void 0 : _a.criteria) ?? [],
558
- totalTime: this.state.totalTime,
559
- agentTime: totalAgentTime
560
- };
561
- }
562
- getJudgeAgent() {
563
- return this.state.agents.find((agent2) => agent2 instanceof JudgeAgentAdapter) ?? null;
564
- }
565
- consumeUntilRole(role) {
566
- while (this.state.pendingRolesOnTurn.length > 0) {
567
- const nextRole = this.state.pendingRolesOnTurn[0];
568
- if (nextRole === role) break;
569
- this.state.pendingRolesOnTurn.pop();
570
- }
571
- }
572
- async scriptCallAgent(role, content, judgmentRequest = false) {
573
- this.consumeUntilRole(role);
574
- let index = -1;
575
- let agent2 = null;
576
- const nextAgent = this.state.getNextAgentForRole(role);
577
- if (!nextAgent) {
578
- this.state.newTurn();
579
- this.consumeUntilRole(role);
580
- const nextAgent2 = this.state.getNextAgentForRole(role);
581
- if (!nextAgent2) {
582
- let roleClass = "";
583
- switch (role) {
584
- case "User" /* USER */:
585
- roleClass = "a scenario.userSimulatorAgent()";
586
- break;
587
- case "Agent" /* AGENT */:
588
- roleClass = "a scenario.agent()";
589
- break;
590
- case "Judge" /* JUDGE */:
591
- roleClass = "a scenario.judgeAgent()";
592
- break;
593
- default:
594
- roleClass = "your agent";
595
- }
596
- if (content)
597
- throw new Error(
598
- `Cannot generate a message for role \`${role}\` with content \`${content}\` because no agent with this role was found, please add ${roleClass} to the scenario \`agents\` list`
599
- );
600
- throw new Error(
601
- `Cannot generate a message for role \`${role}\` because no agent with this role was found, please add ${roleClass} to the scenario \`agents\` list`
602
- );
603
- }
604
- index = nextAgent2.index;
605
- agent2 = nextAgent2.agent;
606
- } else {
607
- index = nextAgent.index;
608
- agent2 = nextAgent.agent;
609
- }
610
- this.state.removePendingAgent(agent2);
611
- if (content) {
612
- if (typeof content === "string") {
613
- if (role === "User" /* USER */) {
614
- this.state.addMessage({ role: "user", content });
615
- } else {
616
- this.state.addMessage({ role: "assistant", content });
617
- }
618
- } else {
619
- this.state.addMessage(content);
620
- }
621
- return null;
662
+ for (const message2 of messages) {
663
+ this.state.addMessage(message2);
664
+ this.broadcastMessage(message2, idx);
622
665
  }
623
- const result = await this.callAgent(index, role, judgmentRequest);
624
- if (Array.isArray(result))
625
- return null;
626
- return result;
666
+ return messages;
627
667
  }
628
668
  /**
629
669
  * Adds a message to the conversation history.
@@ -637,6 +677,7 @@ var ScenarioExecution = class {
637
677
  await this.scriptCallAgent("Agent" /* AGENT */, message2);
638
678
  } else {
639
679
  this.state.addMessage(message2);
680
+ this.broadcastMessage(message2);
640
681
  }
641
682
  }
642
683
  /**
@@ -678,12 +719,12 @@ var ScenarioExecution = class {
678
719
  * @returns A promise that resolves with the scenario result if a conclusion is reached.
679
720
  */
680
721
  async proceed(turns, onTurn, onStep) {
681
- let initialTurn = this.state.turn;
722
+ let initialTurn = this.state.currentTurn;
682
723
  while (true) {
683
- const goToNextTurn = turns === void 0 || initialTurn === null || this.state.turn != null && this.state.turn + 1 < initialTurn + turns;
724
+ const goToNextTurn = turns === void 0 || initialTurn === null || this.state.currentTurn != null && this.state.currentTurn + 1 < initialTurn + turns;
684
725
  const nextMessage = await this._step(goToNextTurn, onTurn);
685
726
  if (initialTurn === null)
686
- initialTurn = this.state.turn;
727
+ initialTurn = this.state.currentTurn;
687
728
  if (nextMessage === null) {
688
729
  return null;
689
730
  }
@@ -701,10 +742,10 @@ var ScenarioExecution = class {
701
742
  async succeed(reasoning) {
702
743
  return {
703
744
  success: true,
704
- messages: this.state.history,
745
+ messages: this.state.messages,
705
746
  reasoning: reasoning || "Scenario marked as successful with Scenario.succeed()",
706
- passedCriteria: [],
707
- failedCriteria: []
747
+ metCriteria: [],
748
+ unmetCriteria: []
708
749
  };
709
750
  }
710
751
  /**
@@ -716,656 +757,385 @@ var ScenarioExecution = class {
716
757
  async fail(reasoning) {
717
758
  return {
718
759
  success: false,
719
- messages: this.state.history,
760
+ messages: this.state.messages,
720
761
  reasoning: reasoning || "Scenario marked as failed with Scenario.fail()",
721
- passedCriteria: [],
722
- failedCriteria: []
762
+ metCriteria: [],
763
+ unmetCriteria: []
723
764
  };
724
765
  }
725
- reset() {
726
- this.state = new ScenarioExecutionState();
727
- this.state.setThreadId(this.config.threadId || generateThreadId());
728
- this.state.setAgents(this.config.agents);
729
- this.state.newTurn();
730
- this.state.turn = 0;
731
- }
732
- // =====================================================
733
- // Event Emission Methods
734
- // =====================================================
735
- // These methods handle the creation and emission of
736
- // scenario events for external consumption and monitoring
737
- // =====================================================
738
- /**
739
- * Emits an event to the event stream for external consumption.
740
- */
741
- emitEvent(event) {
742
- this.eventSubject.next(event);
743
- }
744
- /**
745
- * Creates base event properties shared across all scenario events.
746
- */
747
- makeBaseEvent({ scenarioRunId }) {
748
- return {
749
- batchRunId: batchRunId2,
750
- scenarioId: this.config.id,
751
- scenarioRunId,
752
- timestamp: Date.now(),
753
- rawEvent: void 0
754
- };
755
- }
756
- /**
757
- * Emits a run started event to indicate scenario execution has begun.
758
- */
759
- emitRunStarted({ scenarioRunId }) {
760
- this.emitEvent({
761
- ...this.makeBaseEvent({ scenarioRunId }),
762
- type: "SCENARIO_RUN_STARTED" /* RUN_STARTED */,
763
- metadata: {
764
- name: this.config.name,
765
- description: this.config.description
766
- }
767
- });
766
+ addAgentTime(agentIdx, time) {
767
+ const currentTime = this.agentTimes.get(agentIdx) || 0;
768
+ this.agentTimes.set(agentIdx, currentTime + time);
768
769
  }
769
- /**
770
- * Emits a message snapshot event containing current conversation history.
771
- */
772
- emitMessageSnapshot({ scenarioRunId }) {
773
- this.emitEvent({
774
- ...this.makeBaseEvent({ scenarioRunId }),
775
- type: "SCENARIO_MESSAGE_SNAPSHOT" /* MESSAGE_SNAPSHOT */,
776
- messages: this.state.history
777
- // Add any other required fields from MessagesSnapshotEventSchema
778
- });
770
+ hasResult() {
771
+ return this.partialResult !== null;
779
772
  }
780
- /**
781
- * Emits a run finished event with the final execution status.
782
- */
783
- emitRunFinished({
784
- scenarioRunId,
785
- status
786
- }) {
787
- this.emitEvent({
788
- ...this.makeBaseEvent({ scenarioRunId }),
789
- type: "SCENARIO_RUN_FINISHED" /* RUN_FINISHED */,
790
- status
791
- // Add error/metrics fields if needed
792
- });
773
+ setResult(result) {
774
+ this.partialResult = result;
793
775
  }
794
- };
795
-
796
- // src/config/load.ts
797
- import fs from "node:fs/promises";
798
- import path from "node:path";
799
- import { pathToFileURL } from "node:url";
800
- async function loadScenarioProjectConfig() {
801
- const cwd = process.cwd();
802
- const configNames = [
803
- "scenario.config.js",
804
- "scenario.config.mjs"
805
- ];
806
- for (const name of configNames) {
807
- const fullPath = path.join(cwd, name);
808
- try {
809
- await fs.access(fullPath);
810
- const configModule = await import(pathToFileURL(fullPath).href);
811
- const config2 = configModule.default || configModule;
812
- const parsed = scenarioProjectConfigSchema.safeParse(config2);
813
- if (!parsed.success) {
814
- throw new Error(
815
- `Invalid config file ${name}: ${JSON.stringify(parsed.error.format(), null, 2)}`
816
- );
817
- }
818
- return parsed.data;
819
- } catch (error) {
820
- if (error instanceof Error && "code" in error && error.code === "ENOENT") {
821
- continue;
822
- }
823
- throw error;
776
+ async scriptCallAgent(role, content, judgmentRequest = false) {
777
+ this.consumeUntilRole(role);
778
+ let index = -1;
779
+ let agent2 = null;
780
+ let nextAgent = this.getNextAgentForRole(role);
781
+ if (!nextAgent) {
782
+ this.newTurn();
783
+ this.consumeUntilRole(role);
784
+ nextAgent = this.getNextAgentForRole(role);
824
785
  }
825
- }
826
- return await scenarioProjectConfigSchema.parseAsync({});
827
- }
828
-
829
- // src/events/event-bus.ts
830
- import { concatMap, EMPTY, catchError, Subject as Subject2 } from "rxjs";
831
-
832
- // src/events/event-reporter.ts
833
- var EventReporter = class {
834
- eventsEndpoint;
835
- apiKey;
836
- logger = new Logger("scenario.events.EventReporter");
837
- constructor(config2) {
838
- this.eventsEndpoint = new URL("/api/scenario-events", config2.endpoint);
839
- this.apiKey = config2.apiKey ?? "";
840
- if (!process.env.SCENARIO_DISABLE_SIMULATION_REPORT_INFO) {
841
- console.log("=== Scenario Simulation Reporting ===");
842
- if (!this.apiKey) {
843
- console.warn("LangWatch API key not configured, simulations will be local");
844
- console.warn(`To enable simulation reporting in the LangWatch dashboard, configure your LangWatch API key (via LANGWATCH_API_KEY, or scenario.config.js)`);
845
- } else {
846
- console.log("Simulation reporting is enabled");
847
- console.log(`Endpoint: ${config2.endpoint} -> ${this.eventsEndpoint.href}`);
848
- console.log(`API Key: ${!this.apiKey ? "not configured" : "configured"}`);
786
+ if (!nextAgent) {
787
+ let roleClass = "";
788
+ switch (role) {
789
+ case "User" /* USER */:
790
+ roleClass = "a scenario.userSimulatorAgent()";
791
+ break;
792
+ case "Agent" /* AGENT */:
793
+ roleClass = "a scenario.agent()";
794
+ break;
795
+ case "Judge" /* JUDGE */:
796
+ roleClass = "a scenario.judgeAgent()";
797
+ break;
798
+ default:
799
+ roleClass = "your agent";
849
800
  }
850
- console.log("=== Scenario Simulation Reporting ===");
851
- }
852
- }
853
- /**
854
- * Posts an event to the configured endpoint.
855
- * Logs success/failure but doesn't throw - event posting shouldn't break scenario execution.
856
- */
857
- async postEvent(event) {
858
- this.logger.debug(`[${event.type}] Posting event`, {
859
- event
860
- });
861
- if (!this.eventsEndpoint) {
862
- this.logger.warn(
863
- "No LANGWATCH_ENDPOINT configured, skipping event posting"
801
+ if (content)
802
+ throw new Error(
803
+ `Cannot generate a message for role \`${role}\` with content \`${content}\` because no agent with this role was found, please add ${roleClass} to the scenario \`agents\` list`
804
+ );
805
+ throw new Error(
806
+ `Cannot generate a message for role \`${role}\` because no agent with this role was found, please add ${roleClass} to the scenario \`agents\` list`
864
807
  );
865
- return;
866
808
  }
867
- try {
868
- const response = await fetch(this.eventsEndpoint.href, {
869
- method: "POST",
870
- body: JSON.stringify(event),
871
- headers: {
872
- "Content-Type": "application/json",
873
- "X-Auth-Token": this.apiKey
874
- }
875
- });
876
- this.logger.debug(
877
- `[${event.type}] Event POST response status: ${response.status}`
878
- );
879
- if (response.ok) {
880
- const data = await response.json();
881
- this.logger.debug(`[${event.type}] Event POST response:`, data);
882
- } else {
883
- const errorText = await response.text();
884
- this.logger.error(`[${event.type}] Event POST failed:`, {
885
- status: response.status,
886
- statusText: response.statusText,
887
- error: errorText,
888
- event
889
- });
890
- }
891
- } catch (error) {
892
- this.logger.error(`[${event.type}] Event POST error:`, {
893
- error,
894
- event,
895
- endpoint: this.eventsEndpoint
896
- });
897
- }
898
- }
899
- };
900
-
901
- // src/events/event-bus.ts
902
- var EventBus = class {
903
- events$ = new Subject2();
904
- eventReporter;
905
- processingPromise = null;
906
- logger = new Logger("scenario.events.EventBus");
907
- constructor(config2) {
908
- this.eventReporter = new EventReporter(config2);
909
- }
910
- /**
911
- * Publishes an event into the processing pipeline.
912
- */
913
- publish(event) {
914
- this.logger.debug(`[${event.type}] Publishing event`, {
915
- event
916
- });
917
- this.events$.next(event);
918
- }
919
- /**
920
- * Begins listening for and processing events.
921
- * Returns a promise that resolves when a RUN_FINISHED event is fully processed.
922
- */
923
- listen() {
924
- this.logger.debug("Listening for events");
925
- if (this.processingPromise) {
926
- return this.processingPromise;
927
- }
928
- this.processingPromise = new Promise((resolve, reject) => {
929
- this.events$.pipe(
930
- concatMap(async (event) => {
931
- this.logger.debug(`[${event.type}] Processing event`, {
932
- event
933
- });
934
- await this.eventReporter.postEvent(event);
935
- return event;
936
- }),
937
- catchError((error) => {
938
- this.logger.error("Error in event stream:", error);
939
- return EMPTY;
940
- })
941
- ).subscribe({
942
- next: (event) => {
943
- this.logger.debug(`[${event.type}] Event processed`, {
944
- event
945
- });
946
- if (event.type === "SCENARIO_RUN_FINISHED" /* RUN_FINISHED */) {
947
- resolve();
948
- }
949
- },
950
- error: (error) => {
951
- this.logger.error("Error in event stream:", error);
952
- reject(error);
953
- }
954
- });
955
- });
956
- return this.processingPromise;
957
- }
958
- /**
959
- * Stops accepting new events and drains the processing queue.
960
- */
961
- async drain() {
962
- this.logger.debug("Draining event stream");
963
- this.events$.unsubscribe();
964
- if (this.processingPromise) {
965
- await this.processingPromise;
966
- }
967
- }
968
- /**
969
- * Subscribes to an event stream.
970
- * @param source$ - The event stream to subscribe to.
971
- */
972
- subscribeTo(source$) {
973
- this.logger.debug("Subscribing to event stream");
974
- return source$.subscribe(this.events$);
975
- }
976
- };
977
-
978
- // src/runner/run.ts
979
- async function run(cfg) {
980
- if (!cfg.name) {
981
- throw new Error("Scenario name is required");
982
- }
983
- if (!cfg.description) {
984
- throw new Error("Scenario description is required");
985
- }
986
- if ((cfg.maxTurns || 10) < 1) {
987
- throw new Error("Max turns must be at least 1");
988
- }
989
- if (cfg.agents.length === 0) {
990
- throw new Error("At least one agent is required");
991
- }
992
- if (!cfg.agents.find((agent2) => agent2.role === "Agent" /* AGENT */)) {
993
- throw new Error("At least one non-user/non-judge agent is required");
994
- }
995
- cfg.agents.forEach((agent2, i) => {
996
- if (!allAgentRoles.includes(agent2.role)) {
997
- throw new Error(`Agent ${i} has invalid role: ${agent2.role}`);
998
- }
999
- });
1000
- if (!cfg.threadId) {
1001
- cfg.threadId = generateThreadId();
1002
- }
1003
- const steps = cfg.script || [proceed()];
1004
- const execution = new ScenarioExecution(cfg, steps);
1005
- let eventBus = null;
1006
- let subscription = null;
1007
- try {
1008
- const projectConfig = await loadScenarioProjectConfig();
1009
- eventBus = new EventBus({
1010
- endpoint: projectConfig.langwatchEndpoint ?? process.env.LANGWATCH_ENDPOINT ?? "https://app.langwatch.ai",
1011
- apiKey: projectConfig.langwatchApiKey ?? process.env.LANGWATCH_API_KEY
1012
- });
1013
- eventBus.listen();
1014
- subscription = eventBus.subscribeTo(execution.events$);
1015
- const result = await execution.execute();
1016
- if (cfg.verbose && !result.success) {
1017
- console.log(`Scenario failed: ${cfg.name}`);
1018
- console.log(`Reasoning: ${result.reasoning}`);
1019
- console.log("--------------------------------");
1020
- console.log(`Passed criteria: ${result.passedCriteria.join("\n- ")}`);
1021
- console.log(`Failed criteria: ${result.failedCriteria.join("\n- ")}`);
1022
- console.log(result.messages.map(formatMessage).join("\n"));
809
+ index = nextAgent.index;
810
+ agent2 = nextAgent.agent;
811
+ this.removePendingAgent(agent2);
812
+ if (content) {
813
+ const message2 = typeof content === "string" ? { role: role === "User" /* USER */ ? "user" : "assistant", content } : content;
814
+ this.state.addMessage(message2);
815
+ this.broadcastMessage(message2, index);
816
+ return null;
1023
817
  }
1024
- return result;
1025
- } finally {
1026
- await (eventBus == null ? void 0 : eventBus.drain());
1027
- subscription == null ? void 0 : subscription.unsubscribe();
818
+ const result = await this.callAgent(index, role, judgmentRequest);
819
+ if (result && typeof result === "object" && "success" in result) {
820
+ return result;
821
+ }
822
+ return null;
1028
823
  }
1029
- }
1030
- function formatMessage(m) {
1031
- switch (m.role) {
1032
- case "user":
1033
- return `User: ${m.content}`;
1034
- case "assistant":
1035
- return `Assistant: ${formatParts(m.content)}`;
1036
- case "tool":
1037
- return `Tool: ${formatParts(m.content)}`;
1038
- default:
1039
- return `${m.role}: ${m.content}`;
824
+ reset() {
825
+ this.state = new ScenarioExecutionState(this.config);
826
+ this.state.threadId = this.config.threadId || generateThreadId();
827
+ this.setAgents(this.config.agents);
828
+ this.newTurn();
829
+ this.state.currentTurn = 0;
830
+ this.totalStartTime = Date.now();
831
+ this.pendingMessages.clear();
1040
832
  }
1041
- }
1042
- function formatParts(part) {
1043
- if (typeof part === "string") {
1044
- return part;
833
+ nextAgentForRole(role) {
834
+ for (const agent2 of this.agents) {
835
+ if (agent2.role === role && this.pendingAgentsOnTurn.has(agent2) && this.pendingRolesOnTurn.includes(role)) {
836
+ return { idx: this.agents.indexOf(agent2), agent: agent2 };
837
+ }
838
+ }
839
+ return { idx: -1, agent: null };
1045
840
  }
1046
- if (Array.isArray(part)) {
1047
- if (part.length === 1) {
1048
- return formatPart(part[0]);
841
+ newTurn() {
842
+ this.pendingAgentsOnTurn = new Set(this.agents);
843
+ this.pendingRolesOnTurn = [
844
+ "User" /* USER */,
845
+ "Agent" /* AGENT */,
846
+ "Judge" /* JUDGE */
847
+ ];
848
+ if (this.state.currentTurn === null) {
849
+ this.state.currentTurn = 1;
850
+ } else {
851
+ this.state.currentTurn++;
1049
852
  }
1050
- return `
1051
- ${part.map(formatPart).join("\n")}`;
1052
853
  }
1053
- return "Unknown content: " + JSON.stringify(part);
1054
- }
1055
- function formatPart(part) {
1056
- switch (part.type) {
1057
- case "text":
1058
- return part.text;
1059
- case "file":
1060
- return `(file): ${part.filename} ${typeof part.data === "string" ? `url:${part.data}` : "base64:omitted"}`;
1061
- case "tool-call":
1062
- return `(tool call): ${part.toolName} id:${part.toolCallId} args:(${JSON.stringify(part.args)})`;
1063
- case "tool-result":
1064
- return `(tool result): ${part.toolName} id:${part.toolCallId} result:(${JSON.stringify(part.result)})`;
1065
- case "reasoning":
1066
- return `(reasoning): ${part.text}`;
1067
- case "redacted-reasoning":
1068
- return `(redacted reasoning): ${part.data}`;
1069
- default:
1070
- return `Unknown content: ${JSON.stringify(part)}`;
854
+ removePendingRole(role) {
855
+ const index = this.pendingRolesOnTurn.indexOf(role);
856
+ if (index > -1) {
857
+ this.pendingRolesOnTurn.splice(index, 1);
858
+ }
1071
859
  }
1072
- }
1073
-
1074
- // src/agents/judge-agent.ts
1075
- import { generateText, tool } from "ai";
1076
- import { z as z3 } from "zod";
1077
-
1078
- // src/agents/utils.ts
1079
- var toolMessageRole = "tool";
1080
- var assistantMessageRole = "assistant";
1081
- var userMessageRole = "user";
1082
- var groupMessagesByToolBoundaries = (messages) => {
1083
- const segments = [];
1084
- let currentSegment = [];
1085
- for (const message2 of messages) {
1086
- currentSegment.push(message2);
1087
- if (message2.role === toolMessageRole) {
1088
- segments.push(currentSegment);
1089
- currentSegment = [];
860
+ removePendingAgent(agent2) {
861
+ this.pendingAgentsOnTurn.delete(agent2);
862
+ }
863
+ getNextAgentForRole(role) {
864
+ for (let i = 0; i < this.agents.length; i++) {
865
+ const agent2 = this.agents[i];
866
+ if (agent2.role === role && this.pendingAgentsOnTurn.has(agent2)) {
867
+ return { index: i, agent: agent2 };
868
+ }
1090
869
  }
870
+ return null;
1091
871
  }
1092
- if (currentSegment.length > 0) {
1093
- segments.push(currentSegment);
872
+ setAgents(agents) {
873
+ this.agents = agents;
874
+ this.agentTimes.clear();
1094
875
  }
1095
- return segments;
1096
- };
1097
- var segmentHasToolMessages = (segment) => {
1098
- return segment.some((message2) => {
1099
- if (message2.role === toolMessageRole) return true;
1100
- if (message2.role === assistantMessageRole && Array.isArray(message2.content)) {
1101
- return message2.content.some((part) => part.type === "tool-call");
876
+ consumeUntilRole(role) {
877
+ while (this.pendingRolesOnTurn.length > 0) {
878
+ const nextRole = this.pendingRolesOnTurn[0];
879
+ if (nextRole === role) break;
880
+ this.pendingRolesOnTurn.pop();
1102
881
  }
1103
- return false;
1104
- });
1105
- };
1106
- var reverseSegmentRoles = (segment) => {
1107
- return segment.map((message2) => {
1108
- const hasStringContent = typeof message2.content === "string";
1109
- if (!hasStringContent) return message2;
1110
- const roleMap = {
1111
- [userMessageRole]: assistantMessageRole,
1112
- [assistantMessageRole]: userMessageRole
1113
- };
1114
- const newRole = roleMap[message2.role];
1115
- if (!newRole) return message2;
882
+ }
883
+ reachedMaxTurns(errorMessage) {
884
+ var _a;
885
+ const agentRoleAgentsIdx = this.agents.map((agent2, i) => ({ agent: agent2, idx: i })).filter(({ agent: agent2 }) => agent2.role === "Agent" /* AGENT */).map(({ idx }) => idx);
886
+ const agentTimes = agentRoleAgentsIdx.map((i) => this.agentTimes.get(i) || 0);
887
+ const totalAgentTime = agentTimes.reduce((sum, time) => sum + time, 0);
1116
888
  return {
1117
- role: newRole,
1118
- content: message2.content
889
+ success: false,
890
+ messages: this.state.messages,
891
+ reasoning: errorMessage || `Reached maximum turns (${this.config.maxTurns || 10}) without conclusion`,
892
+ metCriteria: [],
893
+ unmetCriteria: ((_a = this.getJudgeAgent()) == null ? void 0 : _a.criteria) ?? [],
894
+ totalTime: this.totalTime,
895
+ agentTime: totalAgentTime
1119
896
  };
1120
- });
1121
- };
1122
- var messageRoleReversal = (messages) => {
1123
- const segments = groupMessagesByToolBoundaries(messages);
1124
- const processedSegments = segments.map(
1125
- (segment) => segmentHasToolMessages(segment) ? segment : reverseSegmentRoles(segment)
1126
- );
1127
- return processedSegments.flat();
1128
- };
1129
- var criterionToParamName = (criterion) => {
1130
- return criterion.replace(/"/g, "").replace(/[^a-zA-Z0-9]/g, "_").replace(/ /g, "_").toLowerCase().substring(0, 70);
1131
- };
1132
-
1133
- // src/config/index.ts
1134
- var logger = new Logger("scenario.config");
1135
- var configLoaded = false;
1136
- var config = null;
1137
- var configLoadPromise = null;
1138
- async function loadProjectConfig() {
1139
- if (configLoaded) {
1140
- return;
1141
897
  }
1142
- if (configLoadPromise) {
1143
- return configLoadPromise;
898
+ getJudgeAgent() {
899
+ return this.agents.find((agent2) => agent2 instanceof JudgeAgentAdapter) ?? null;
1144
900
  }
1145
- configLoadPromise = (async () => {
1146
- try {
1147
- config = await loadScenarioProjectConfig();
1148
- logger.info("loaded scenario project config", { config });
1149
- } catch (error) {
1150
- logger.error("error loading scenario project config", { error });
1151
- } finally {
1152
- configLoaded = true;
1153
- }
1154
- })();
1155
- return configLoadPromise;
1156
- }
1157
- async function getProjectConfig() {
1158
- await loadProjectConfig();
1159
- return config;
1160
- }
1161
-
1162
- // src/utils/config.ts
1163
- function mergeConfig(config2, projectConfig) {
1164
- if (!projectConfig) {
1165
- return config2;
901
+ /**
902
+ * Emits an event to the event stream for external consumption.
903
+ */
904
+ emitEvent(event) {
905
+ this.eventSubject.next(event);
1166
906
  }
1167
- return {
1168
- ...projectConfig.defaultModel,
1169
- ...config2
1170
- };
1171
- }
1172
- function mergeAndValidateConfig(config2, projectConfig) {
1173
- var _a;
1174
- const mergedConfig = mergeConfig(config2, projectConfig);
1175
- mergedConfig.model = mergedConfig.model ?? ((_a = projectConfig == null ? void 0 : projectConfig.defaultModel) == null ? void 0 : _a.model);
1176
- if (!mergedConfig.model) {
1177
- throw new Error("Model is required");
907
+ /**
908
+ * Creates base event properties shared across all scenario events.
909
+ */
910
+ makeBaseEvent({ scenarioRunId }) {
911
+ return {
912
+ type: "placeholder",
913
+ // This will be replaced by the specific event type
914
+ timestamp: Date.now(),
915
+ batchRunId: batchRunId2,
916
+ scenarioId: this.config.id,
917
+ scenarioRunId,
918
+ scenarioSetId: this.config.setId
919
+ };
1178
920
  }
1179
- return mergedConfig;
1180
- }
1181
-
1182
- // src/agents/judge-agent.ts
1183
- function buildSystemPrompt(criteria, description) {
1184
- const criteriaList = (criteria == null ? void 0 : criteria.map((criterion, idx) => `${idx + 1}. ${criterion}`).join("\n")) || "No criteria provided";
1185
- return `
1186
- <role>
1187
- You are an LLM as a judge watching a simulated conversation as it plays out live to determine if the agent under test meets the criteria or not.
1188
- </role>
1189
-
1190
- <goal>
1191
- Your goal is to determine if you already have enough information to make a verdict of the scenario below, or if the conversation should continue for longer.
1192
- If you do have enough information, use the finish_test tool to determine if all the criteria have been met, if not, use the continue_test tool to let the next step play out.
1193
- </goal>
1194
-
1195
- <scenario>
1196
- ${description}
1197
- </scenario>
1198
-
1199
- <criteria>
1200
- ${criteriaList}
1201
- </criteria>
1202
-
1203
- <rules>
1204
- - Be strict, do not let the conversation continue if the agent already broke one of the "do not" or "should not" criteria.
1205
- - DO NOT make any judgment calls that are not explicitly listed in the success or failure criteria, withhold judgement if necessary
1206
- </rules>
1207
- `.trim();
1208
- }
1209
- function buildContinueTestTool() {
1210
- return tool({
1211
- description: "Continue the test with the next step",
1212
- parameters: z3.object({})
1213
- });
1214
- }
1215
- function buildFinishTestTool(criteria) {
1216
- const criteriaNames = criteria.map(criterionToParamName);
1217
- return tool({
1218
- description: "Complete the test with a final verdict",
1219
- parameters: z3.object({
1220
- criteria: z3.object(
1221
- Object.fromEntries(
1222
- criteriaNames.map((name, idx) => [
1223
- name,
1224
- z3.enum(["true", "false", "inconclusive"]).describe(criteria[idx])
1225
- ])
1226
- )
1227
- ).strict().describe("Strict verdict for each criterion"),
1228
- reasoning: z3.string().describe("Explanation of what the final verdict should be"),
1229
- verdict: z3.enum(["success", "failure", "inconclusive"]).describe("The final verdict of the test")
1230
- })
1231
- });
1232
- }
1233
- var judgeAgent = (cfg) => {
1234
- return {
1235
- role: "Judge" /* JUDGE */,
1236
- criteria: cfg.criteria,
1237
- call: async (input) => {
1238
- var _a;
1239
- const systemPrompt = cfg.systemPrompt ?? buildSystemPrompt(cfg.criteria, input.scenarioConfig.description);
1240
- const messages = [
1241
- { role: "system", content: systemPrompt },
1242
- ...input.messages
1243
- ];
1244
- const isLastMessage = input.scenarioState.turn == input.scenarioConfig.maxTurns;
1245
- const projectConfig = await getProjectConfig();
1246
- const mergedConfig = mergeAndValidateConfig(cfg, projectConfig);
1247
- if (!mergedConfig.model) {
1248
- throw new Error("Model is required for the judge agent");
921
+ /**
922
+ * Emits a run started event to indicate scenario execution has begun.
923
+ */
924
+ emitRunStarted({ scenarioRunId }) {
925
+ this.emitEvent({
926
+ ...this.makeBaseEvent({ scenarioRunId }),
927
+ type: "SCENARIO_RUN_STARTED" /* RUN_STARTED */,
928
+ metadata: {
929
+ name: this.config.name,
930
+ description: this.config.description
1249
931
  }
1250
- const tools = {
1251
- continue_test: buildContinueTestTool(),
1252
- finish_test: buildFinishTestTool(cfg.criteria)
1253
- };
1254
- const enforceJudgement = input.judgmentRequest;
1255
- const hasCriteria = cfg.criteria.length && cfg.criteria.length > 0;
1256
- if (enforceJudgement && !hasCriteria) {
1257
- return {
1258
- success: false,
1259
- messages: [],
1260
- reasoning: "JudgeAgent: No criteria was provided to be judged against",
1261
- passedCriteria: [],
1262
- failedCriteria: []
1263
- };
932
+ });
933
+ }
934
+ /**
935
+ * Emits a message snapshot event containing current conversation history.
936
+ */
937
+ emitMessageSnapshot({ scenarioRunId }) {
938
+ this.emitEvent({
939
+ ...this.makeBaseEvent({ scenarioRunId }),
940
+ type: "SCENARIO_MESSAGE_SNAPSHOT" /* MESSAGE_SNAPSHOT */,
941
+ messages: this.state.messages
942
+ // Add any other required fields from MessagesSnapshotEventSchema
943
+ });
944
+ }
945
+ /**
946
+ * Emits a run finished event with the final execution status.
947
+ */
948
+ emitRunFinished({
949
+ scenarioRunId,
950
+ status,
951
+ result
952
+ }) {
953
+ const event = {
954
+ ...this.makeBaseEvent({ scenarioRunId }),
955
+ scenarioSetId: this.config.setId ?? "default",
956
+ type: "SCENARIO_RUN_FINISHED" /* RUN_FINISHED */,
957
+ status,
958
+ results: {
959
+ verdict: (result == null ? void 0 : result.success) ? "success" /* SUCCESS */ : "failure" /* FAILURE */,
960
+ metCriteria: (result == null ? void 0 : result.metCriteria) ?? [],
961
+ unmetCriteria: (result == null ? void 0 : result.unmetCriteria) ?? [],
962
+ reasoning: result == null ? void 0 : result.reasoning,
963
+ error: result == null ? void 0 : result.error
1264
964
  }
1265
- const toolChoice = (isLastMessage || enforceJudgement) && hasCriteria ? { type: "tool", toolName: "finish_test" } : "required";
1266
- const completion = await generateText({
1267
- model: mergedConfig.model,
1268
- messages,
1269
- temperature: mergedConfig.temperature ?? 0,
1270
- maxTokens: mergedConfig.maxTokens,
1271
- tools,
1272
- toolChoice
1273
- });
1274
- let args;
1275
- if ((_a = completion.toolCalls) == null ? void 0 : _a.length) {
1276
- const toolCall = completion.toolCalls[0];
1277
- switch (toolCall.toolName) {
1278
- case "finish_test": {
1279
- args = toolCall.args;
1280
- const verdict = args.verdict || "inconclusive";
1281
- const reasoning = args.reasoning || "No reasoning provided";
1282
- const criteria = args.criteria || {};
1283
- const criteriaValues = Object.values(criteria);
1284
- const passedCriteria = cfg.criteria.filter((_, i) => criteriaValues[i] === "true");
1285
- const failedCriteria = cfg.criteria.filter((_, i) => criteriaValues[i] !== "true");
1286
- return {
1287
- success: verdict === "success",
1288
- messages: input.messages,
1289
- reasoning,
1290
- passedCriteria,
1291
- failedCriteria
1292
- };
1293
- }
1294
- case "continue_test":
1295
- return [];
1296
- default:
1297
- return {
1298
- success: false,
1299
- messages: input.messages,
1300
- reasoning: `JudgeAgent: Unknown tool call: ${toolCall.toolName}`,
1301
- passedCriteria: [],
1302
- failedCriteria: cfg.criteria
1303
- };
1304
- }
965
+ };
966
+ this.emitEvent(event);
967
+ this.eventSubject.complete();
968
+ }
969
+ /**
970
+ * Distributes a message to all other agents in the scenario.
971
+ *
972
+ * @param message - The message to broadcast.
973
+ * @param fromAgentIdx - The index of the agent that sent the message, to avoid echoing.
974
+ */
975
+ broadcastMessage(message2, fromAgentIdx) {
976
+ for (let idx = 0; idx < this.agents.length; idx++) {
977
+ if (idx === fromAgentIdx) continue;
978
+ if (!this.pendingMessages.has(idx)) {
979
+ this.pendingMessages.set(idx, []);
1305
980
  }
1306
- return {
1307
- success: false,
1308
- messages: input.messages,
1309
- reasoning: `JudgeAgent: No tool call found in LLM output`,
1310
- passedCriteria: [],
1311
- failedCriteria: cfg.criteria
1312
- };
981
+ this.pendingMessages.get(idx).push(message2);
1313
982
  }
1314
- };
983
+ }
1315
984
  };
985
+ function convertAgentReturnTypesToMessages(response, role) {
986
+ if (typeof response === "string")
987
+ return [{ role, content: response }];
988
+ if (Array.isArray(response))
989
+ return response;
990
+ if (typeof response === "object" && "role" in response)
991
+ return [response];
992
+ return [];
993
+ }
1316
994
 
1317
- // src/agents/user-simulator-agent.ts
1318
- import { generateText as generateText2 } from "ai";
1319
- function buildSystemPrompt2(description) {
1320
- return `
1321
- <role>
1322
- You are pretending to be a user, you are testing an AI Agent (shown as the user role) based on a scenario.
1323
- Approach this naturally, as a human user would, with very short inputs, few words, all lowercase, imperative, not periods, like when they google or talk to chatgpt.
1324
- </role>
1325
-
1326
- <goal>
1327
- Your goal (assistant) is to interact with the Agent Under Test (user) as if you were a human user to see if it can complete the scenario successfully.
1328
- </goal>
995
+ // src/runner/index.ts
996
+ var runner_exports = {};
997
+ __export(runner_exports, {
998
+ run: () => run
999
+ });
1329
1000
 
1330
- <scenario>
1331
- ${description}
1332
- </scenario>
1001
+ // src/script/index.ts
1002
+ var script_exports = {};
1003
+ __export(script_exports, {
1004
+ agent: () => agent,
1005
+ fail: () => fail,
1006
+ judge: () => judge,
1007
+ message: () => message,
1008
+ proceed: () => proceed,
1009
+ succeed: () => succeed,
1010
+ user: () => user
1011
+ });
1012
+ var message = (message2) => {
1013
+ return (_state, executor) => executor.message(message2);
1014
+ };
1015
+ var agent = (content) => {
1016
+ return (_state, executor) => executor.agent(content);
1017
+ };
1018
+ var judge = (content) => {
1019
+ return (_state, executor) => executor.judge(content);
1020
+ };
1021
+ var user = (content) => {
1022
+ return (_state, executor) => executor.user(content);
1023
+ };
1024
+ var proceed = (turns, onTurn, onStep) => {
1025
+ return (_state, executor) => executor.proceed(turns, onTurn, onStep);
1026
+ };
1027
+ var succeed = (reasoning) => {
1028
+ return (_state, executor) => executor.succeed(reasoning);
1029
+ };
1030
+ var fail = (reasoning) => {
1031
+ return (_state, executor) => executor.fail(reasoning);
1032
+ };
1333
1033
 
1334
- <rules>
1335
- - DO NOT carry over any requests yourself, YOU ARE NOT the assistant today, you are the user
1336
- </rules>
1337
- `.trim();
1034
+ // src/runner/run.ts
1035
+ async function run(cfg) {
1036
+ if (!cfg.name) {
1037
+ throw new Error("Scenario name is required");
1038
+ }
1039
+ if (!cfg.description) {
1040
+ throw new Error("Scenario description is required");
1041
+ }
1042
+ if ((cfg.maxTurns || 10) < 1) {
1043
+ throw new Error("Max turns must be at least 1");
1044
+ }
1045
+ if (cfg.agents.length === 0) {
1046
+ throw new Error("At least one agent is required");
1047
+ }
1048
+ if (!cfg.agents.find((agent2) => agent2.role === "Agent" /* AGENT */)) {
1049
+ throw new Error("At least one non-user/non-judge agent is required");
1050
+ }
1051
+ cfg.agents.forEach((agent2, i) => {
1052
+ if (!allAgentRoles.includes(agent2.role)) {
1053
+ throw new Error(`Agent ${i} has invalid role: ${agent2.role}`);
1054
+ }
1055
+ });
1056
+ if (!cfg.threadId) {
1057
+ cfg.threadId = generateThreadId();
1058
+ }
1059
+ const steps = cfg.script || [proceed()];
1060
+ const execution = new ScenarioExecution(cfg, steps);
1061
+ let eventBus = null;
1062
+ let subscription = null;
1063
+ try {
1064
+ const projectConfig = await loadScenarioProjectConfig();
1065
+ eventBus = new EventBus({
1066
+ endpoint: projectConfig.langwatchEndpoint ?? process.env.LANGWATCH_ENDPOINT ?? "https://app.langwatch.ai",
1067
+ apiKey: projectConfig.langwatchApiKey ?? process.env.LANGWATCH_API_KEY
1068
+ });
1069
+ eventBus.listen();
1070
+ subscription = eventBus.subscribeTo(execution.events$);
1071
+ const result = await execution.execute();
1072
+ if (cfg.verbose && !result.success) {
1073
+ console.log(`Scenario failed: ${cfg.name}`);
1074
+ console.log(`Reasoning: ${result.reasoning}`);
1075
+ console.log("--------------------------------");
1076
+ console.log(`Met criteria: ${result.metCriteria.join("\n- ")}`);
1077
+ console.log(`Unmet criteria: ${result.unmetCriteria.join("\n- ")}`);
1078
+ console.log(result.messages.map(formatMessage).join("\n"));
1079
+ }
1080
+ return result;
1081
+ } finally {
1082
+ await (eventBus == null ? void 0 : eventBus.drain());
1083
+ subscription == null ? void 0 : subscription.unsubscribe();
1084
+ }
1338
1085
  }
1339
- var userSimulatorAgent = (config2) => {
1340
- return {
1341
- role: "User" /* USER */,
1342
- call: async (input) => {
1343
- const systemPrompt = buildSystemPrompt2(input.scenarioConfig.description);
1344
- const messages = [
1345
- { role: "system", content: systemPrompt },
1346
- { role: "assistant", content: "Hello, how can I help you today" },
1347
- ...input.messages
1348
- ];
1349
- const projectConfig = await getProjectConfig();
1350
- const mergedConfig = mergeAndValidateConfig(config2 ?? {}, projectConfig);
1351
- if (!mergedConfig.model) {
1352
- throw new Error("Model is required for the user simulator agent");
1353
- }
1354
- const reversedMessages = messageRoleReversal(messages);
1355
- const completion = await generateText2({
1356
- model: mergedConfig.model,
1357
- messages: reversedMessages,
1358
- temperature: mergedConfig.temperature ?? 0,
1359
- maxTokens: mergedConfig.maxTokens
1360
- });
1361
- const messageContent = completion.text;
1362
- if (!messageContent) {
1363
- throw new Error("No response content from LLM");
1364
- }
1365
- return { role: "user", content: messageContent };
1086
+ function formatMessage(m) {
1087
+ switch (m.role) {
1088
+ case "user":
1089
+ return `User: ${m.content}`;
1090
+ case "assistant":
1091
+ return `Assistant: ${formatParts(m.content)}`;
1092
+ case "tool":
1093
+ return `Tool: ${formatParts(m.content)}`;
1094
+ default:
1095
+ return `${m.role}: ${m.content}`;
1096
+ }
1097
+ }
1098
+ function formatParts(part) {
1099
+ if (typeof part === "string") {
1100
+ return part;
1101
+ }
1102
+ if (Array.isArray(part)) {
1103
+ if (part.length === 1) {
1104
+ return formatPart(part[0]);
1366
1105
  }
1367
- };
1106
+ return `
1107
+ ${part.map(formatPart).join("\n")}`;
1108
+ }
1109
+ return "Unknown content: " + JSON.stringify(part);
1110
+ }
1111
+ function formatPart(part) {
1112
+ switch (part.type) {
1113
+ case "text":
1114
+ return part.text;
1115
+ case "file":
1116
+ return `(file): ${part.filename} ${typeof part.data === "string" ? `url:${part.data}` : "base64:omitted"}`;
1117
+ case "tool-call":
1118
+ return `(tool call): ${part.toolName} id:${part.toolCallId} args:(${JSON.stringify(part.args)})`;
1119
+ case "tool-result":
1120
+ return `(tool result): ${part.toolName} id:${part.toolCallId} result:(${JSON.stringify(part.result)})`;
1121
+ case "reasoning":
1122
+ return `(reasoning): ${part.text}`;
1123
+ case "redacted-reasoning":
1124
+ return `(redacted reasoning): ${part.data}`;
1125
+ default:
1126
+ return `Unknown content: ${JSON.stringify(part)}`;
1127
+ }
1128
+ }
1129
+
1130
+ // src/index.ts
1131
+ var scenario = {
1132
+ ...agents_exports,
1133
+ ...domain_exports,
1134
+ ...execution_exports,
1135
+ ...runner_exports,
1136
+ ...script_exports
1368
1137
  };
1138
+ var index_default = scenario;
1369
1139
  export {
1370
1140
  AgentAdapter,
1371
1141
  AgentRole,
@@ -1375,6 +1145,7 @@ export {
1375
1145
  UserSimulatorAgentAdapter,
1376
1146
  agent,
1377
1147
  allAgentRoles,
1148
+ index_default as default,
1378
1149
  defineConfig,
1379
1150
  fail,
1380
1151
  judge,
@@ -1382,6 +1153,7 @@ export {
1382
1153
  message,
1383
1154
  proceed,
1384
1155
  run,
1156
+ scenario,
1385
1157
  scenarioProjectConfigSchema,
1386
1158
  succeed,
1387
1159
  user,