@langwatch/scenario 0.2.6 → 0.2.11

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.d.ts CHANGED
@@ -1,5 +1,5 @@
1
1
  import * as ai from 'ai';
2
- import { CoreMessage, CoreToolMessage, LanguageModel } from 'ai';
2
+ import { CoreMessage, CoreUserMessage, CoreAssistantMessage, CoreToolMessage, LanguageModel } from 'ai';
3
3
  import { z } from 'zod';
4
4
  import { Observable } from 'rxjs';
5
5
 
@@ -72,7 +72,6 @@ type AgentReturnTypes = string | CoreMessage | CoreMessage[] | ScenarioResult;
72
72
  */
73
73
  declare abstract class AgentAdapter {
74
74
  role: AgentRole;
75
- constructor(input: AgentInput);
76
75
  /**
77
76
  * Process the input and generate a response.
78
77
  *
@@ -91,7 +90,6 @@ declare abstract class AgentAdapter {
91
90
  */
92
91
  declare abstract class UserSimulatorAgentAdapter implements AgentAdapter {
93
92
  role: AgentRole;
94
- constructor(input: AgentInput);
95
93
  /**
96
94
  * Process the input and generate a user message.
97
95
  *
@@ -110,7 +108,6 @@ declare abstract class JudgeAgentAdapter implements AgentAdapter {
110
108
  * The criteria the judge will use to evaluate the conversation.
111
109
  */
112
110
  abstract criteria: string[];
113
- constructor(input: AgentInput);
114
111
  /**
115
112
  * Process the input and evaluate the conversation.
116
113
  *
@@ -331,7 +328,12 @@ interface ScenarioExecutionStateLike {
331
328
  * Retrieves the last user message from the execution state.
332
329
  * @returns The last user message.
333
330
  */
334
- lastUserMessage(): CoreMessage;
331
+ lastUserMessage(): CoreUserMessage;
332
+ /**
333
+ * Retrieves the last agent message from the execution state.
334
+ * @returns The last agent message.
335
+ */
336
+ lastAgentMessage(): CoreAssistantMessage;
335
337
  /**
336
338
  * Retrieves the last tool call message for a specific tool.
337
339
  * @param toolName - The name of the tool.
@@ -475,6 +477,32 @@ interface JudgeAgentConfig extends TestingAgentConfig {
475
477
  /**
476
478
  * Agent that evaluates conversations against success criteria.
477
479
  *
480
+ * This is the default judge agent that is used if no judge agent is provided.
481
+ * It is a simple agent that uses function calling to make structured decisions
482
+ * and provides detailed reasoning for its verdicts.
483
+ *
484
+ * @param cfg - Configuration for the judge agent.
485
+ */
486
+ declare class JudgeAgent extends JudgeAgentAdapter {
487
+ private readonly cfg;
488
+ private logger;
489
+ role: AgentRole;
490
+ criteria: string[];
491
+ constructor(cfg: JudgeAgentConfig);
492
+ call(input: AgentInput): Promise<never[] | {
493
+ success: boolean;
494
+ messages: CoreMessage[];
495
+ reasoning: string;
496
+ metCriteria: string[];
497
+ unmetCriteria: string[];
498
+ }>;
499
+ private generateText;
500
+ }
501
+ /**
502
+ * Factory function for creating JudgeAgent instances.
503
+ *
504
+ * JudgeAgent evaluates conversations against success criteria.
505
+ *
478
506
  * The JudgeAgent watches conversations in real-time and makes decisions about
479
507
  * whether the agent under test is meeting the specified criteria. It can either
480
508
  * allow the conversation to continue or end it with a success/failure verdict.
@@ -520,18 +548,18 @@ interface JudgeAgentConfig extends TestingAgentConfig {
520
548
  * main();
521
549
  * ```
522
550
  */
523
- declare const judgeAgent: (cfg: JudgeAgentConfig) => {
524
- role: AgentRole.JUDGE;
525
- criteria: string[];
526
- call: (input: AgentInput) => Promise<never[] | {
527
- success: boolean;
528
- messages: CoreMessage[];
529
- reasoning: string;
530
- metCriteria: string[];
531
- unmetCriteria: string[];
532
- }>;
533
- };
551
+ declare const judgeAgent: (cfg: JudgeAgentConfig) => JudgeAgent;
534
552
 
553
+ declare class UserSimulatorAgent extends UserSimulatorAgentAdapter {
554
+ private readonly cfg?;
555
+ private logger;
556
+ constructor(cfg?: TestingAgentConfig | undefined);
557
+ call: (input: AgentInput) => Promise<{
558
+ role: "user";
559
+ content: string;
560
+ }>;
561
+ private generateText;
562
+ }
535
563
  /**
536
564
  * Agent that simulates realistic user behavior in scenario conversations.
537
565
  *
@@ -618,16 +646,10 @@ declare const judgeAgent: (cfg: JudgeAgentConfig) => {
618
646
  * main();
619
647
  * ```
620
648
  *
621
- * @note
649
+ * **Implementation Notes:**
622
650
  * - Uses role reversal internally to work around LLM biases toward assistant roles
623
651
  */
624
- declare const userSimulatorAgent: (config?: TestingAgentConfig) => {
625
- role: AgentRole.USER;
626
- call: (input: AgentInput) => Promise<{
627
- role: "user";
628
- content: string;
629
- }>;
630
- };
652
+ declare const userSimulatorAgent: (config?: TestingAgentConfig) => UserSimulatorAgent;
631
653
 
632
654
  type agents_FinishTestArgs = FinishTestArgs;
633
655
  type agents_JudgeAgentConfig = JudgeAgentConfig;
@@ -998,11 +1020,60 @@ declare const scenarioEventSchema: z.ZodDiscriminatedUnion<"type", [z.ZodObject<
998
1020
  type ScenarioEvent = z.infer<typeof scenarioEventSchema>;
999
1021
 
1000
1022
  /**
1001
- * Manages the execution of a single scenario.
1023
+ * Manages the execution of a single scenario test.
1024
+ *
1025
+ * This class orchestrates the interaction between agents (user simulator, agent under test,
1026
+ * and judge), executes the test script step-by-step, and manages the scenario's state
1027
+ * throughout execution. It also emits events that can be subscribed to for real-time
1028
+ * monitoring of the scenario's progress.
1029
+ *
1030
+ * ## Execution Flow Overview
1031
+ *
1032
+ * The execution follows a turn-based system where agents take turns responding. The key
1033
+ * concepts are:
1034
+ * - **Script Steps**: Functions in the scenario script like `user()`, `agent()`, `proceed()`, etc.
1035
+ * - **Agent Interactions**: Individual agent responses that occur when an agent takes their turn
1036
+ * - **Turns**: Groups of agent interactions that happen in sequence
1037
+ *
1038
+ * ## Message Broadcasting System
1002
1039
  *
1003
- * This class orchestrates the interaction between agents, executes the script,
1004
- * and manages the scenario's state. It also emits events that can be subscribed to
1005
- * for observing the scenario's progress.
1040
+ * The class implements a sophisticated message broadcasting system that ensures all agents
1041
+ * can "hear" each other's messages:
1042
+ *
1043
+ * 1. **Message Creation**: When an agent sends a message, it's added to the conversation history
1044
+ * 2. **Broadcasting**: The message is immediately broadcast to all other agents via `broadcastMessage()`
1045
+ * 3. **Queue Management**: Each agent has a pending message queue (`pendingMessages`) that stores
1046
+ * messages from other agents
1047
+ * 4. **Agent Input**: When an agent is called, it receives both the full conversation history
1048
+ * and any new pending messages that have been broadcast to it
1049
+ * 5. **Queue Clearing**: After an agent processes its pending messages, its queue is cleared
1050
+ *
1051
+ * This creates a realistic conversation environment where agents can respond contextually
1052
+ * to the full conversation history and any new messages from other agents.
1053
+ *
1054
+ * ## Example Message Flow
1055
+ *
1056
+ * ```
1057
+ * Turn 1:
1058
+ * 1. User Agent sends: "Hello"
1059
+ * - Added to conversation history
1060
+ * - Broadcast to Agent and Judge (pendingMessages[1] = ["Hello"], pendingMessages[2] = ["Hello"])
1061
+ *
1062
+ * 2. Agent is called:
1063
+ * - Receives: full conversation + pendingMessages[1] = ["Hello"]
1064
+ * - Sends: "Hi there! How can I help you?"
1065
+ * - Added to conversation history
1066
+ * - Broadcast to User and Judge (pendingMessages[0] = ["Hi there!..."], pendingMessages[2] = ["Hello", "Hi there!..."])
1067
+ * - pendingMessages[1] is cleared
1068
+ *
1069
+ * 3. Judge is called:
1070
+ * - Receives: full conversation + pendingMessages[2] = ["Hello", "Hi there!..."]
1071
+ * - Evaluates and decides to continue
1072
+ * - pendingMessages[2] is cleared
1073
+ * ```
1074
+ *
1075
+ * Each script step can trigger one or more agent interactions depending on the step type.
1076
+ * For example, a `proceed(5)` step might trigger 10 agent interactions across 5 turns.
1006
1077
  *
1007
1078
  * Note: This is an internal class. Most users will interact with the higher-level
1008
1079
  * `scenario.run()` function instead of instantiating this class directly.
@@ -1022,9 +1093,10 @@ type ScenarioEvent = z.infer<typeof scenarioEventSchema>;
1022
1093
  * }),
1023
1094
  * ],
1024
1095
  * script: [
1025
- * scenario.user("Hello"),
1026
- * scenario.agent(),
1027
- * scenario.judge(),
1096
+ * scenario.user("Hello"), // Script step 1: triggers 1 agent interaction
1097
+ * scenario.agent(), // Script step 2: triggers 1 agent interaction
1098
+ * scenario.proceed(3), // Script step 3: triggers multiple agent interactions
1099
+ * scenario.judge(), // Script step 4: triggers 1 agent interaction
1028
1100
  * ]
1029
1101
  * });
1030
1102
  *
@@ -1032,34 +1104,62 @@ type ScenarioEvent = z.infer<typeof scenarioEventSchema>;
1032
1104
  * ```
1033
1105
  */
1034
1106
  declare class ScenarioExecution implements ScenarioExecutionLike {
1107
+ /** The current state of the scenario execution */
1035
1108
  private state;
1036
- private eventSubject;
1109
+ /** Logger for debugging and monitoring */
1037
1110
  private logger;
1111
+ /** Finalized configuration with all defaults applied */
1038
1112
  private config;
1113
+ /** Array of all agents participating in the scenario */
1039
1114
  private agents;
1115
+ /** Roles that still need to act in the current turn (USER, AGENT, JUDGE) */
1040
1116
  private pendingRolesOnTurn;
1117
+ /** Agents that still need to act in the current turn */
1041
1118
  private pendingAgentsOnTurn;
1119
+ /**
1120
+ * Message queues for each agent. When an agent sends a message, it gets
1121
+ * broadcast to all other agents' pending message queues. When an agent
1122
+ * is called, it receives these pending messages as part of its input.
1123
+ *
1124
+ * Key: agent index, Value: array of pending messages for that agent
1125
+ */
1042
1126
  private pendingMessages;
1127
+ /** Intermediate result set by agents that make final decisions */
1043
1128
  private partialResult;
1129
+ /** Accumulated execution time for each agent (for performance tracking) */
1044
1130
  private agentTimes;
1131
+ /** Timestamp when execution started (for total time calculation) */
1045
1132
  private totalStartTime;
1133
+ /** Event stream for monitoring scenario progress */
1134
+ private eventSubject;
1046
1135
  /**
1047
1136
  * An observable stream of events that occur during the scenario execution.
1048
1137
  * Subscribe to this to monitor the progress of the scenario in real-time.
1138
+ *
1139
+ * Events include:
1140
+ * - RUN_STARTED: When scenario execution begins
1141
+ * - MESSAGE_SNAPSHOT: After each message is added to the conversation
1142
+ * - RUN_FINISHED: When scenario execution completes (success/failure/error)
1049
1143
  */
1050
1144
  readonly events$: Observable<ScenarioEvent>;
1051
1145
  /**
1052
1146
  * Creates a new ScenarioExecution instance.
1053
- * @param config The scenario configuration.
1054
- * @param script The script steps to execute.
1147
+ *
1148
+ * @param config - The scenario configuration containing agents, settings, and metadata
1149
+ * @param script - The ordered sequence of script steps that define the test flow
1055
1150
  */
1056
1151
  constructor(config: ScenarioConfig, script: ScriptStep[]);
1057
1152
  /**
1058
- * The history of messages in the conversation.
1153
+ * Gets the complete conversation history as an array of messages.
1154
+ *
1155
+ * @returns Array of CoreMessage objects representing the full conversation
1059
1156
  */
1060
1157
  get messages(): CoreMessage[];
1061
1158
  /**
1062
- * The unique identifier for the conversation thread.
1159
+ * Gets the unique identifier for the conversation thread.
1160
+ * This ID is used to maintain conversation context across multiple runs.
1161
+ *
1162
+ * @returns The thread identifier string
1063
1163
  */
1064
1164
  get threadId(): string;
1065
1165
  /**
@@ -1068,85 +1168,422 @@ declare class ScenarioExecution implements ScenarioExecutionLike {
1068
1168
  private get totalTime();
1069
1169
  /**
1070
1170
  * Executes the entire scenario from start to finish.
1071
- * This will run through the script and any automatic proceeding logic until a
1072
- * final result (success, failure, or error) is determined.
1073
- * @returns A promise that resolves with the final result of the scenario.
1171
+ *
1172
+ * This method runs through all script steps sequentially until a final result
1173
+ * (success, failure, or error) is determined. Each script step can trigger one or
1174
+ * more agent interactions depending on the step type:
1175
+ * - `user()` and `agent()` steps typically trigger one agent interaction each
1176
+ * - `proceed()` steps can trigger multiple agent interactions across multiple turns
1177
+ * - `judge()` steps trigger the judge agent to evaluate the conversation
1178
+ * - `succeed()` and `fail()` steps immediately end the scenario
1179
+ *
1180
+ * The execution will stop early if:
1181
+ * - A script step returns a ScenarioResult
1182
+ * - The maximum number of turns is reached
1183
+ * - An error occurs during execution
1184
+ *
1185
+ * @returns A promise that resolves with the final result of the scenario
1186
+ * @throws Error if an unhandled exception occurs during execution
1187
+ *
1188
+ * @example
1189
+ * ```typescript
1190
+ * const execution = new ScenarioExecution(config, script);
1191
+ * const result = await execution.execute();
1192
+ * console.log(`Scenario ${result.success ? 'passed' : 'failed'}`);
1193
+ * ```
1074
1194
  */
1075
1195
  execute(): Promise<ScenarioResult>;
1076
1196
  /**
1077
- * Executes a single step in the scenario.
1078
- * A step usually corresponds to a single agent's turn. This method is useful
1079
- * for manually controlling the scenario's progress.
1080
- * @returns A promise that resolves with the new messages added during the step, or a final scenario result if the step concludes the scenario.
1197
+ * Executes a single agent interaction in the scenario.
1198
+ *
1199
+ * This method is for manual step-by-step execution of the scenario, where each call
1200
+ * represents one agent taking their turn. This is different from script steps (like
1201
+ * `user()`, `agent()`, `proceed()`, etc.) which are functions in the scenario script.
1202
+ *
1203
+ * Each call to this method will:
1204
+ * - Progress to the next turn if needed
1205
+ * - Find the next agent that should act
1206
+ * - Execute that agent's response
1207
+ * - Return either new messages or a final scenario result
1208
+ *
1209
+ * Note: This method is primarily for debugging or custom execution flows. Most users
1210
+ * will use `execute()` to run the entire scenario automatically.
1211
+ *
1212
+ * @returns A promise that resolves with either:
1213
+ * - Array of new messages added during the agent interaction, or
1214
+ * - A final ScenarioResult if the interaction concludes the scenario
1215
+ * @throws Error if no result is returned from the step
1216
+ *
1217
+ * @example
1218
+ * ```typescript
1219
+ * const execution = new ScenarioExecution(config, script);
1220
+ *
1221
+ * // Execute one agent interaction at a time
1222
+ * const messages = await execution.step();
1223
+ * if (Array.isArray(messages)) {
1224
+ * console.log('New messages:', messages);
1225
+ * } else {
1226
+ * console.log('Scenario finished:', messages.success);
1227
+ * }
1228
+ * ```
1081
1229
  */
1082
1230
  step(): Promise<CoreMessage[] | ScenarioResult>;
1083
1231
  private _step;
1232
+ /**
1233
+ * Calls a specific agent to generate a response or make a decision.
1234
+ *
1235
+ * This method is the core of agent interaction. It prepares the agent's input
1236
+ * by combining the conversation history with any pending messages that have been
1237
+ * broadcast to this agent, then calls the agent and processes its response.
1238
+ *
1239
+ * The agent input includes:
1240
+ * - Full conversation history (this.state.messages)
1241
+ * - New messages that have been broadcast to this agent (this.pendingMessages.get(idx))
1242
+ * - The role the agent is being asked to play
1243
+ * - Whether this is a judgment request (for judge agents)
1244
+ * - Current scenario state and configuration
1245
+ *
1246
+ * After the agent responds:
1247
+ * - Performance timing is recorded
1248
+ * - Pending messages for this agent are cleared (they've been processed)
1249
+ * - If the agent returns a ScenarioResult, it's returned immediately
1250
+ * - Otherwise, the agent's messages are added to the conversation and broadcast
1251
+ *
1252
+ * @param idx - The index of the agent in the agents array
1253
+ * @param role - The role the agent is being asked to play (USER, AGENT, or JUDGE)
1254
+ * @param judgmentRequest - Whether this is a judgment request (for judge agents)
1255
+ * @returns A promise that resolves with either:
1256
+ * - Array of messages if the agent generated a response, or
1257
+ * - ScenarioResult if the agent made a final decision
1258
+ * @throws Error if the agent call fails
1259
+ */
1084
1260
  private callAgent;
1085
1261
  /**
1086
1262
  * Adds a message to the conversation history.
1087
- * This is part of the `ScenarioExecutionLike` interface used by script steps.
1088
- * @param message The message to add.
1263
+ *
1264
+ * This method is part of the ScenarioExecutionLike interface used by script steps.
1265
+ * It automatically routes the message to the appropriate agent based on the message role:
1266
+ * - "user" messages are routed to USER role agents
1267
+ * - "assistant" messages are routed to AGENT role agents
1268
+ * - Other message types are added directly to the conversation
1269
+ *
1270
+ * @param message - The CoreMessage to add to the conversation
1271
+ *
1272
+ * @example
1273
+ * ```typescript
1274
+ * await execution.message({
1275
+ * role: "user",
1276
+ * content: "Hello, how are you?"
1277
+ * });
1278
+ * ```
1089
1279
  */
1090
1280
  message(message: CoreMessage): Promise<void>;
1091
1281
  /**
1092
- * Executes a user turn.
1093
- * If content is provided, it's used as the user's message.
1094
- * If not, the user simulator agent is called to generate a message.
1095
- * This is part of the `ScenarioExecutionLike` interface used by script steps.
1096
- * @param content The optional content of the user's message.
1282
+ * Executes a user turn in the conversation.
1283
+ *
1284
+ * If content is provided, it's used directly as the user's message. If not provided,
1285
+ * the user simulator agent is called to generate an appropriate response based on
1286
+ * the current conversation context.
1287
+ *
1288
+ * This method is part of the ScenarioExecutionLike interface used by script steps.
1289
+ *
1290
+ * @param content - Optional content for the user's message. Can be a string or CoreMessage.
1291
+ * If not provided, the user simulator agent will generate the content.
1292
+ *
1293
+ * @example
1294
+ * ```typescript
1295
+ * // Use provided content
1296
+ * await execution.user("What's the weather like?");
1297
+ *
1298
+ * // Let user simulator generate content
1299
+ * await execution.user();
1300
+ *
1301
+ * // Use a CoreMessage object
1302
+ * await execution.user({
1303
+ * role: "user",
1304
+ * content: "Tell me a joke"
1305
+ * });
1306
+ * ```
1097
1307
  */
1098
1308
  user(content?: string | CoreMessage): Promise<void>;
1099
1309
  /**
1100
- * Executes an agent turn.
1101
- * If content is provided, it's used as the agent's message.
1102
- * If not, the agent under test is called to generate a response.
1103
- * This is part of the `ScenarioExecutionLike` interface used by script steps.
1104
- * @param content The optional content of the agent's message.
1310
+ * Executes an agent turn in the conversation.
1311
+ *
1312
+ * If content is provided, it's used directly as the agent's response. If not provided,
1313
+ * the agent under test is called to generate a response based on the current conversation
1314
+ * context and any pending messages.
1315
+ *
1316
+ * This method is part of the ScenarioExecutionLike interface used by script steps.
1317
+ *
1318
+ * @param content - Optional content for the agent's response. Can be a string or CoreMessage.
1319
+ * If not provided, the agent under test will generate the response.
1320
+ *
1321
+ * @example
1322
+ * ```typescript
1323
+ * // Let agent generate response
1324
+ * await execution.agent();
1325
+ *
1326
+ * // Use provided content
1327
+ * await execution.agent("The weather is sunny today!");
1328
+ *
1329
+ * // Use a CoreMessage object
1330
+ * await execution.agent({
1331
+ * role: "assistant",
1332
+ * content: "I'm here to help you with weather information."
1333
+ * });
1334
+ * ```
1105
1335
  */
1106
1336
  agent(content?: string | CoreMessage): Promise<void>;
1107
1337
  /**
1108
1338
  * Invokes the judge agent to evaluate the current state of the conversation.
1109
- * This is part of the `ScenarioExecutionLike` interface used by script steps.
1110
- * @param content Optional message to pass to the judge.
1111
- * @returns A promise that resolves with the scenario result if the judge makes a final decision, otherwise null.
1339
+ *
1340
+ * The judge agent analyzes the conversation history and determines whether the
1341
+ * scenario criteria have been met. This can result in either:
1342
+ * - A final scenario result (success/failure) if the judge makes a decision
1343
+ * - Null if the judge needs more information and the conversation should continue
1344
+ *
1345
+ * This method is part of the ScenarioExecutionLike interface used by script steps.
1346
+ *
1347
+ * @param content - Optional message to pass to the judge agent for additional context
1348
+ * @returns A promise that resolves with:
1349
+ * - ScenarioResult if the judge makes a final decision, or
1350
+ * - Null if the conversation should continue
1351
+ *
1352
+ * @example
1353
+ * ```typescript
1354
+ * // Let judge evaluate current state
1355
+ * const result = await execution.judge();
1356
+ * if (result) {
1357
+ * console.log(`Judge decided: ${result.success ? 'pass' : 'fail'}`);
1358
+ * }
1359
+ *
1360
+ * // Provide additional context to judge
1361
+ * const resultWithContext = await execution.judge("Please consider the user's satisfaction level");
1362
+ * ```
1112
1363
  */
1113
1364
  judge(content?: string | CoreMessage): Promise<ScenarioResult | null>;
1114
1365
  /**
1115
1366
  * Lets the scenario proceed automatically for a specified number of turns.
1116
- * This simulates the natural flow of conversation between agents.
1117
- * This is part of the `ScenarioExecutionLike` interface used by script steps.
1118
- * @param turns The number of turns to proceed. If undefined, runs until a conclusion or max turns is reached.
1119
- * @param onTurn A callback executed at the end of each turn.
1120
- * @param onStep A callback executed after each agent interaction.
1121
- * @returns A promise that resolves with the scenario result if a conclusion is reached.
1367
+ *
1368
+ * This method is a script step that simulates natural conversation flow by allowing
1369
+ * agents to interact automatically without explicit script steps. It can trigger
1370
+ * multiple agent interactions across multiple turns, making it useful for testing
1371
+ * scenarios where you want to see how agents behave in extended conversations.
1372
+ *
1373
+ * Unlike other script steps that typically trigger one agent interaction each,
1374
+ * this step can trigger many agent interactions depending on the number of turns
1375
+ * and the agents' behavior.
1376
+ *
1377
+ * The method will continue until:
1378
+ * - The specified number of turns is reached
1379
+ * - A final scenario result is determined
1380
+ * - The maximum turns limit is reached
1381
+ *
1382
+ * @param turns - The number of turns to proceed. If undefined, runs until a conclusion
1383
+ * or max turns is reached
1384
+ * @param onTurn - Optional callback executed at the end of each turn. Receives the
1385
+ * current execution state
1386
+ * @param onStep - Optional callback executed after each agent interaction. Receives
1387
+ * the current execution state
1388
+ * @returns A promise that resolves with:
1389
+ * - ScenarioResult if a conclusion is reached during the proceeding, or
1390
+ * - Null if the specified turns complete without conclusion
1391
+ *
1392
+ * @example
1393
+ * ```typescript
1394
+ * // Proceed for 5 turns
1395
+ * const result = await execution.proceed(5);
1396
+ *
1397
+ * // Proceed until conclusion with callbacks
1398
+ * const resultWithCallbacks = await execution.proceed(
1399
+ * undefined,
1400
+ * (state) => console.log(`Turn ${state.currentTurn} completed`),
1401
+ * (state) => console.log(`Agent interaction completed, ${state.messages.length} messages`)
1402
+ * );
1403
+ * ```
1122
1404
  */
1123
1405
  proceed(turns?: number, onTurn?: (state: ScenarioExecutionStateLike) => void | Promise<void>, onStep?: (state: ScenarioExecutionStateLike) => void | Promise<void>): Promise<ScenarioResult | null>;
1124
1406
  /**
1125
1407
  * Immediately ends the scenario with a success verdict.
1126
- * This is part of the `ScenarioExecutionLike` interface used by script steps.
1127
- * @param reasoning An optional explanation for the success.
1128
- * @returns A promise that resolves with the final successful scenario result.
1408
+ *
1409
+ * This method forces the scenario to end successfully, regardless of the current
1410
+ * conversation state. It's useful for scenarios where you want to explicitly
1411
+ * mark success based on specific conditions or external factors.
1412
+ *
1413
+ * This method is part of the ScenarioExecutionLike interface used by script steps.
1414
+ *
1415
+ * @param reasoning - Optional explanation for why the scenario is being marked as successful
1416
+ * @returns A promise that resolves with the final successful scenario result
1417
+ *
1418
+ * @example
1419
+ * ```typescript
1420
+ * // Mark success with default reasoning
1421
+ * const result = await execution.succeed();
1422
+ *
1423
+ * // Mark success with custom reasoning
1424
+ * const resultWithReasoning = await execution.succeed(
1425
+ * "User successfully completed the onboarding flow"
1426
+ * );
1427
+ * ```
1129
1428
  */
1130
1429
  succeed(reasoning?: string): Promise<ScenarioResult>;
1131
1430
  /**
1132
1431
  * Immediately ends the scenario with a failure verdict.
1133
- * This is part of the `ScenarioExecutionLike` interface used by script steps.
1134
- * @param reasoning An optional explanation for the failure.
1135
- * @returns A promise that resolves with the final failed scenario result.
1432
+ *
1433
+ * This method forces the scenario to end with failure, regardless of the current
1434
+ * conversation state. It's useful for scenarios where you want to explicitly
1435
+ * mark failure based on specific conditions or external factors.
1436
+ *
1437
+ * This method is part of the ScenarioExecutionLike interface used by script steps.
1438
+ *
1439
+ * @param reasoning - Optional explanation for why the scenario is being marked as failed
1440
+ * @returns A promise that resolves with the final failed scenario result
1441
+ *
1442
+ * @example
1443
+ * ```typescript
1444
+ * // Mark failure with default reasoning
1445
+ * const result = await execution.fail();
1446
+ *
1447
+ * // Mark failure with custom reasoning
1448
+ * const resultWithReasoning = await execution.fail(
1449
+ * "Agent failed to provide accurate weather information"
1450
+ * );
1451
+ * ```
1136
1452
  */
1137
1453
  fail(reasoning?: string): Promise<ScenarioResult>;
1454
+ /**
1455
+ * Adds execution time for a specific agent to the performance tracking.
1456
+ *
1457
+ * This method is used internally to track how long each agent takes to respond,
1458
+ * which is included in the final scenario result for performance analysis.
1459
+ * The accumulated time for each agent is used to calculate total agent response
1460
+ * times in the scenario result.
1461
+ *
1462
+ * @param agentIdx - The index of the agent in the agents array
1463
+ * @param time - The execution time in milliseconds to add to the agent's total
1464
+ *
1465
+ * @example
1466
+ * ```typescript
1467
+ * // This is typically called internally by the execution engine
1468
+ * execution.addAgentTime(0, 1500); // Agent at index 0 took 1.5 seconds
1469
+ * ```
1470
+ */
1138
1471
  addAgentTime(agentIdx: number, time: number): void;
1472
+ /**
1473
+ * Checks if a partial result has been set for the scenario.
1474
+ *
1475
+ * This method is used internally to determine if a scenario has already reached
1476
+ * a conclusion (success or failure) but hasn't been finalized yet. Partial results
1477
+ * are typically set by agents that make final decisions (like judge agents) and
1478
+ * are later finalized with the complete message history.
1479
+ *
1480
+ * @returns True if a partial result exists, false otherwise
1481
+ *
1482
+ * @example
1483
+ * ```typescript
1484
+ * // This is typically used internally by the execution engine
1485
+ * if (execution.hasResult()) {
1486
+ * console.log('Scenario has reached a conclusion');
1487
+ * }
1488
+ * ```
1489
+ */
1139
1490
  hasResult(): boolean;
1491
+ /**
1492
+ * Sets a partial result for the scenario.
1493
+ *
1494
+ * This method is used internally to store intermediate results that may be
1495
+ * finalized later with the complete message history. Partial results are typically
1496
+ * created by agents that make final decisions (like judge agents) and contain
1497
+ * the success/failure status, reasoning, and criteria evaluation, but not the
1498
+ * complete message history.
1499
+ *
1500
+ * @param result - The partial result without the messages field. Should include
1501
+ * success status, reasoning, and criteria evaluation.
1502
+ *
1503
+ * @example
1504
+ * ```typescript
1505
+ * // This is typically called internally by agents that make final decisions
1506
+ * execution.setResult({
1507
+ * success: true,
1508
+ * reasoning: "Agent provided accurate weather information",
1509
+ * metCriteria: ["Provides accurate weather data"],
1510
+ * unmetCriteria: []
1511
+ * });
1512
+ * ```
1513
+ */
1140
1514
  setResult(result: Omit<ScenarioResult, "messages">): void;
1515
+ /**
1516
+ * Internal method to handle script step calls to agents.
1517
+ *
1518
+ * This method is the core logic for executing script steps that involve agent
1519
+ * interactions. It handles finding the appropriate agent for the given role,
1520
+ * managing turn progression, and executing the agent's response.
1521
+ *
1522
+ * The method will:
1523
+ * - Find the next available agent for the specified role
1524
+ * - Progress to a new turn if no agent is available
1525
+ * - Execute the agent with the provided content or let it generate content
1526
+ * - Handle judgment requests for judge agents
1527
+ * - Return a final result if the agent makes a decision
1528
+ *
1529
+ * @param role - The role of the agent to call (USER, AGENT, or JUDGE)
1530
+ * @param content - Optional content to use instead of letting the agent generate it
1531
+ * @param judgmentRequest - Whether this is a judgment request (for judge agents)
1532
+ * @returns A promise that resolves with a ScenarioResult if the agent makes a final
1533
+ * decision, or null if the conversation should continue
1534
+ * @throws Error if no agent is found for the specified role
1535
+ */
1141
1536
  private scriptCallAgent;
1537
+ /**
1538
+ * Resets the scenario execution to its initial state.
1539
+ *
1540
+ * This method is called at the beginning of each execution to ensure a clean
1541
+ * state. It creates a new execution state, initializes agents, sets up the
1542
+ * first turn, and clears any pending messages or partial results.
1543
+ *
1544
+ * The reset process:
1545
+ * - Creates a new ScenarioExecutionState with the current config
1546
+ * - Sets up the thread ID (generates a new one if not provided)
1547
+ * - Initializes all agents
1548
+ * - Starts the first turn
1549
+ * - Records the start time for performance tracking
1550
+ * - Clears any pending messages
1551
+ */
1142
1552
  private reset;
1143
1553
  private nextAgentForRole;
1554
+ /**
1555
+ * Starts a new turn in the scenario execution.
1556
+ *
1557
+ * This method is called when transitioning to a new turn. It resets the pending
1558
+ * agents and roles for the turn, allowing all agents to participate again in
1559
+ * the new turn. The turn counter is incremented to track the current turn number.
1560
+ *
1561
+ * A turn represents a cycle where agents can take actions. Each turn can involve
1562
+ * multiple agent interactions as agents respond to each other's messages.
1563
+ */
1144
1564
  private newTurn;
1145
1565
  private removePendingRole;
1146
1566
  private removePendingAgent;
1147
1567
  private getNextAgentForRole;
1148
1568
  private setAgents;
1149
1569
  private consumeUntilRole;
1570
+ /**
1571
+ * Creates a failure result when the maximum number of turns is reached.
1572
+ *
1573
+ * This method is called when the scenario execution reaches the maximum number
1574
+ * of turns without reaching a conclusion. It creates a failure result with
1575
+ * appropriate reasoning and includes performance metrics.
1576
+ *
1577
+ * The result includes:
1578
+ * - All messages from the conversation
1579
+ * - Failure reasoning explaining the turn limit was reached
1580
+ * - Empty met criteria (since no conclusion was reached)
1581
+ * - All judge criteria as unmet (since no evaluation was completed)
1582
+ * - Total execution time and agent response times
1583
+ *
1584
+ * @param errorMessage - Optional custom error message to use instead of the default
1585
+ * @returns A ScenarioResult indicating failure due to reaching max turns
1586
+ */
1150
1587
  private reachedMaxTurns;
1151
1588
  private getJudgeAgent;
1152
1589
  /**
@@ -1172,12 +1609,61 @@ declare class ScenarioExecution implements ScenarioExecutionLike {
1172
1609
  /**
1173
1610
  * Distributes a message to all other agents in the scenario.
1174
1611
  *
1175
- * @param message - The message to broadcast.
1176
- * @param fromAgentIdx - The index of the agent that sent the message, to avoid echoing.
1612
+ * This method implements the message broadcasting system that allows agents to
1613
+ * "hear" messages from other agents. When an agent sends a message, it needs to
1614
+ * be distributed to all other agents so they can respond appropriately.
1615
+ *
1616
+ * The broadcasting process:
1617
+ * 1. Iterates through all agents in the scenario
1618
+ * 2. Skips the agent that sent the message (to avoid echo)
1619
+ * 3. Adds the message to each agent's pending message queue
1620
+ * 4. Agents will receive these messages when they're called next
1621
+ *
1622
+ * This creates a realistic conversation environment where agents can see
1623
+ * the full conversation history and respond contextually.
1624
+ *
1625
+ * @param message - The message to broadcast to all other agents
1626
+ * @param fromAgentIdx - The index of the agent that sent the message (to avoid echoing back to sender)
1627
+ *
1628
+ * @example
1629
+ * ```typescript
1630
+ * // When agent 0 sends a message, it gets broadcast to agents 1 and 2
1631
+ * execution.broadcastMessage(
1632
+ * { role: "user", content: "Hello" },
1633
+ * 0 // fromAgentIdx
1634
+ * );
1635
+ * // Now agents 1 and 2 have this message in their pendingMessages queue
1636
+ * ```
1177
1637
  */
1178
1638
  private broadcastMessage;
1639
+ /**
1640
+ * Executes a single script step with proper error handling and logging.
1641
+ *
1642
+ * This method is responsible for executing each script step function with
1643
+ * comprehensive error handling and logging. It provides the execution context
1644
+ * to the script step and handles any errors that occur during execution.
1645
+ *
1646
+ * The method:
1647
+ * - Logs the start of script step execution
1648
+ * - Calls the script step function with the current state and execution context
1649
+ * - Logs the completion of the script step
1650
+ * - Handles and logs any errors that occur
1651
+ * - Re-throws errors to maintain the original error context
1652
+ *
1653
+ * @param scriptStep - The script step function to execute (user, agent, judge, etc.)
1654
+ * @param stepIndex - The index of the script step for logging and debugging context
1655
+ * @returns The result of the script step execution (void, ScenarioResult, or null)
1656
+ * @throws Error if the script step throws an error (preserves original error)
1657
+ */
1658
+ private executeScriptStep;
1179
1659
  }
1180
1660
 
1661
+ declare enum StateChangeEventType {
1662
+ MESSAGE_ADDED = "MESSAGE_ADDED"
1663
+ }
1664
+ type StateChangeEvent = {
1665
+ type: StateChangeEventType.MESSAGE_ADDED;
1666
+ };
1181
1667
  /**
1182
1668
  * Manages the state of a scenario execution.
1183
1669
  * This class implements the ScenarioExecutionStateLike interface and provides
@@ -1188,6 +1674,9 @@ declare class ScenarioExecutionState implements ScenarioExecutionStateLike {
1188
1674
  private _messages;
1189
1675
  private _currentTurn;
1190
1676
  private _threadId;
1677
+ /** Event stream for message additions */
1678
+ private eventSubject;
1679
+ readonly events$: Observable<StateChangeEvent>;
1191
1680
  description: string;
1192
1681
  config: ScenarioConfig;
1193
1682
  constructor(config: ScenarioConfig);
@@ -1203,7 +1692,8 @@ declare class ScenarioExecutionState implements ScenarioExecutionStateLike {
1203
1692
  */
1204
1693
  addMessage(message: CoreMessage): void;
1205
1694
  lastMessage(): CoreMessage;
1206
- lastUserMessage(): CoreMessage;
1695
+ lastUserMessage(): CoreUserMessage;
1696
+ lastAgentMessage(): CoreAssistantMessage;
1207
1697
  lastToolCall(toolName: string): CoreToolMessage;
1208
1698
  hasToolCall(toolName: string): boolean;
1209
1699
  }
@@ -1212,8 +1702,11 @@ type execution_ScenarioExecution = ScenarioExecution;
1212
1702
  declare const execution_ScenarioExecution: typeof ScenarioExecution;
1213
1703
  type execution_ScenarioExecutionState = ScenarioExecutionState;
1214
1704
  declare const execution_ScenarioExecutionState: typeof ScenarioExecutionState;
1705
+ type execution_StateChangeEvent = StateChangeEvent;
1706
+ type execution_StateChangeEventType = StateChangeEventType;
1707
+ declare const execution_StateChangeEventType: typeof StateChangeEventType;
1215
1708
  declare namespace execution {
1216
- export { execution_ScenarioExecution as ScenarioExecution, execution_ScenarioExecutionState as ScenarioExecutionState };
1709
+ export { execution_ScenarioExecution as ScenarioExecution, execution_ScenarioExecutionState as ScenarioExecutionState, type execution_StateChangeEvent as StateChangeEvent, execution_StateChangeEventType as StateChangeEventType };
1217
1710
  }
1218
1711
 
1219
1712
  /**
@@ -1375,4 +1868,4 @@ declare namespace script {
1375
1868
  type ScenarioApi = typeof agents & typeof domain & typeof execution & typeof runner & typeof script;
1376
1869
  declare const scenario: ScenarioApi;
1377
1870
 
1378
- export { AgentAdapter, type AgentInput, type AgentReturnTypes, AgentRole, DEFAULT_MAX_TURNS, DEFAULT_TEMPERATURE, DEFAULT_VERBOSE, type FinishTestArgs, JudgeAgentAdapter, type JudgeAgentConfig, type ScenarioConfig, type ScenarioConfigFinal, ScenarioExecution, type ScenarioExecutionLike, ScenarioExecutionState, type ScenarioExecutionStateLike, type ScenarioProjectConfig, type ScenarioResult, type ScriptStep, type TestingAgentConfig, type TestingAgentInferenceConfig, UserSimulatorAgentAdapter, agent, allAgentRoles, scenario as default, defineConfig, fail, judge, judgeAgent, message, proceed, run, scenario, scenarioProjectConfigSchema, succeed, user, userSimulatorAgent };
1871
+ export { AgentAdapter, type AgentInput, type AgentReturnTypes, AgentRole, DEFAULT_MAX_TURNS, DEFAULT_TEMPERATURE, DEFAULT_VERBOSE, type FinishTestArgs, JudgeAgentAdapter, type JudgeAgentConfig, type ScenarioConfig, type ScenarioConfigFinal, ScenarioExecution, type ScenarioExecutionLike, ScenarioExecutionState, type ScenarioExecutionStateLike, type ScenarioProjectConfig, type ScenarioResult, type ScriptStep, type StateChangeEvent, StateChangeEventType, type TestingAgentConfig, type TestingAgentInferenceConfig, UserSimulatorAgentAdapter, agent, allAgentRoles, scenario as default, defineConfig, fail, judge, judgeAgent, message, proceed, run, scenario, scenarioProjectConfigSchema, succeed, user, userSimulatorAgent };