@langwatch/scenario 0.2.9 → 0.2.11
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +50 -15
- package/dist/{chunk-7H6OGEQ5.mjs → chunk-7HLDX5EL.mjs} +9 -14
- package/dist/{chunk-YPJZSK4J.mjs → chunk-OL4RFXV4.mjs} +23 -11
- package/dist/index.d.mts +559 -72
- package/dist/index.d.ts +559 -72
- package/dist/index.js +746 -212
- package/dist/index.mjs +711 -187
- package/dist/integrations/vitest/config.d.mts +37 -0
- package/dist/integrations/vitest/config.d.ts +37 -0
- package/dist/integrations/vitest/config.js +3 -276
- package/dist/integrations/vitest/config.mjs +3 -10
- package/dist/integrations/vitest/reporter.js +69 -17
- package/dist/integrations/vitest/reporter.mjs +182 -4
- package/dist/integrations/vitest/setup.js +24 -12
- package/dist/integrations/vitest/setup.mjs +2 -2
- package/package.json +1 -2
- package/dist/chunk-K7KLHTDI.mjs +0 -146
package/dist/index.mjs
CHANGED
|
@@ -17,11 +17,11 @@ import {
|
|
|
17
17
|
getBatchRunId,
|
|
18
18
|
getProjectConfig,
|
|
19
19
|
scenarioProjectConfigSchema
|
|
20
|
-
} from "./chunk-
|
|
20
|
+
} from "./chunk-7HLDX5EL.mjs";
|
|
21
21
|
import {
|
|
22
22
|
Logger,
|
|
23
|
-
|
|
24
|
-
} from "./chunk-
|
|
23
|
+
getEnv
|
|
24
|
+
} from "./chunk-OL4RFXV4.mjs";
|
|
25
25
|
import {
|
|
26
26
|
__export
|
|
27
27
|
} from "./chunk-7P6ASYW6.mjs";
|
|
@@ -163,88 +163,109 @@ function buildFinishTestTool(criteria) {
|
|
|
163
163
|
})
|
|
164
164
|
});
|
|
165
165
|
}
|
|
166
|
-
var
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
|
|
183
|
-
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
}
|
|
198
|
-
const toolChoice = (isLastMessage || enforceJudgement) && hasCriteria ? { type: "tool", toolName: "finish_test" } : "required";
|
|
199
|
-
const completion = await generateText({
|
|
200
|
-
model: mergedConfig.model,
|
|
201
|
-
messages,
|
|
202
|
-
temperature: mergedConfig.temperature ?? 0,
|
|
203
|
-
maxTokens: mergedConfig.maxTokens,
|
|
204
|
-
tools,
|
|
205
|
-
toolChoice
|
|
206
|
-
});
|
|
207
|
-
let args;
|
|
208
|
-
if ((_a = completion.toolCalls) == null ? void 0 : _a.length) {
|
|
209
|
-
const toolCall = completion.toolCalls[0];
|
|
210
|
-
switch (toolCall.toolName) {
|
|
211
|
-
case "finish_test": {
|
|
212
|
-
args = toolCall.args;
|
|
213
|
-
const verdict = args.verdict || "inconclusive";
|
|
214
|
-
const reasoning = args.reasoning || "No reasoning provided";
|
|
215
|
-
const criteria = args.criteria || {};
|
|
216
|
-
const criteriaValues = Object.values(criteria);
|
|
217
|
-
const metCriteria = cfg.criteria.filter((_, i) => criteriaValues[i] === "true");
|
|
218
|
-
const unmetCriteria = cfg.criteria.filter((_, i) => criteriaValues[i] !== "true");
|
|
219
|
-
return {
|
|
220
|
-
success: verdict === "success",
|
|
221
|
-
messages: input.messages,
|
|
222
|
-
reasoning,
|
|
223
|
-
metCriteria,
|
|
224
|
-
unmetCriteria
|
|
225
|
-
};
|
|
226
|
-
}
|
|
227
|
-
case "continue_test":
|
|
228
|
-
return [];
|
|
229
|
-
default:
|
|
230
|
-
return {
|
|
231
|
-
success: false,
|
|
232
|
-
messages: input.messages,
|
|
233
|
-
reasoning: `JudgeAgent: Unknown tool call: ${toolCall.toolName}`,
|
|
234
|
-
metCriteria: [],
|
|
235
|
-
unmetCriteria: cfg.criteria
|
|
236
|
-
};
|
|
237
|
-
}
|
|
238
|
-
}
|
|
166
|
+
var JudgeAgent = class extends JudgeAgentAdapter {
|
|
167
|
+
constructor(cfg) {
|
|
168
|
+
super();
|
|
169
|
+
this.cfg = cfg;
|
|
170
|
+
this.criteria = cfg.criteria;
|
|
171
|
+
this.role = "Judge" /* JUDGE */;
|
|
172
|
+
}
|
|
173
|
+
logger = new Logger("JudgeAgent");
|
|
174
|
+
role = "Judge" /* JUDGE */;
|
|
175
|
+
criteria;
|
|
176
|
+
async call(input) {
|
|
177
|
+
var _a;
|
|
178
|
+
const cfg = this.cfg;
|
|
179
|
+
const systemPrompt = cfg.systemPrompt ?? buildSystemPrompt(cfg.criteria, input.scenarioConfig.description);
|
|
180
|
+
const messages = [
|
|
181
|
+
{ role: "system", content: systemPrompt },
|
|
182
|
+
...input.messages
|
|
183
|
+
];
|
|
184
|
+
const isLastMessage = input.scenarioState.currentTurn === input.scenarioConfig.maxTurns;
|
|
185
|
+
const projectConfig = await getProjectConfig();
|
|
186
|
+
const mergedConfig = mergeAndValidateConfig(cfg, projectConfig);
|
|
187
|
+
if (!mergedConfig.model) {
|
|
188
|
+
throw new Error("Model is required for the judge agent");
|
|
189
|
+
}
|
|
190
|
+
const tools = {
|
|
191
|
+
continue_test: buildContinueTestTool(),
|
|
192
|
+
finish_test: buildFinishTestTool(cfg.criteria)
|
|
193
|
+
};
|
|
194
|
+
const enforceJudgement = input.judgmentRequest;
|
|
195
|
+
const hasCriteria = cfg.criteria.length && cfg.criteria.length > 0;
|
|
196
|
+
if (enforceJudgement && !hasCriteria) {
|
|
239
197
|
return {
|
|
240
198
|
success: false,
|
|
241
|
-
messages:
|
|
242
|
-
reasoning:
|
|
199
|
+
messages: [],
|
|
200
|
+
reasoning: "JudgeAgent: No criteria was provided to be judged against",
|
|
243
201
|
metCriteria: [],
|
|
244
|
-
unmetCriteria:
|
|
202
|
+
unmetCriteria: []
|
|
245
203
|
};
|
|
246
204
|
}
|
|
247
|
-
|
|
205
|
+
const toolChoice = (isLastMessage || enforceJudgement) && hasCriteria ? { type: "tool", toolName: "finish_test" } : "required";
|
|
206
|
+
const completion = await this.generateText({
|
|
207
|
+
model: mergedConfig.model,
|
|
208
|
+
messages,
|
|
209
|
+
temperature: mergedConfig.temperature ?? 0,
|
|
210
|
+
maxTokens: mergedConfig.maxTokens,
|
|
211
|
+
tools,
|
|
212
|
+
toolChoice
|
|
213
|
+
});
|
|
214
|
+
let args;
|
|
215
|
+
if ((_a = completion.toolCalls) == null ? void 0 : _a.length) {
|
|
216
|
+
const toolCall = completion.toolCalls[0];
|
|
217
|
+
switch (toolCall.toolName) {
|
|
218
|
+
case "finish_test": {
|
|
219
|
+
args = toolCall.args;
|
|
220
|
+
const verdict = args.verdict || "inconclusive";
|
|
221
|
+
const reasoning = args.reasoning || "No reasoning provided";
|
|
222
|
+
const criteria = args.criteria || {};
|
|
223
|
+
const criteriaValues = Object.values(criteria);
|
|
224
|
+
const metCriteria = cfg.criteria.filter(
|
|
225
|
+
(_, i) => criteriaValues[i] === "true"
|
|
226
|
+
);
|
|
227
|
+
const unmetCriteria = cfg.criteria.filter(
|
|
228
|
+
(_, i) => criteriaValues[i] !== "true"
|
|
229
|
+
);
|
|
230
|
+
return {
|
|
231
|
+
success: verdict === "success",
|
|
232
|
+
messages: input.messages,
|
|
233
|
+
reasoning,
|
|
234
|
+
metCriteria,
|
|
235
|
+
unmetCriteria
|
|
236
|
+
};
|
|
237
|
+
}
|
|
238
|
+
case "continue_test":
|
|
239
|
+
return [];
|
|
240
|
+
default:
|
|
241
|
+
return {
|
|
242
|
+
success: false,
|
|
243
|
+
messages: input.messages,
|
|
244
|
+
reasoning: `JudgeAgent: Unknown tool call: ${toolCall.toolName}`,
|
|
245
|
+
metCriteria: [],
|
|
246
|
+
unmetCriteria: cfg.criteria
|
|
247
|
+
};
|
|
248
|
+
}
|
|
249
|
+
}
|
|
250
|
+
return {
|
|
251
|
+
success: false,
|
|
252
|
+
messages: input.messages,
|
|
253
|
+
reasoning: `JudgeAgent: No tool call found in LLM output`,
|
|
254
|
+
metCriteria: [],
|
|
255
|
+
unmetCriteria: cfg.criteria
|
|
256
|
+
};
|
|
257
|
+
}
|
|
258
|
+
async generateText(input) {
|
|
259
|
+
try {
|
|
260
|
+
return await generateText(input);
|
|
261
|
+
} catch (error) {
|
|
262
|
+
this.logger.error("Error generating text", { error });
|
|
263
|
+
throw error;
|
|
264
|
+
}
|
|
265
|
+
}
|
|
266
|
+
};
|
|
267
|
+
var judgeAgent = (cfg) => {
|
|
268
|
+
return new JudgeAgent(cfg);
|
|
248
269
|
};
|
|
249
270
|
|
|
250
271
|
// src/agents/user-simulator-agent.ts
|
|
@@ -269,52 +290,75 @@ ${description}
|
|
|
269
290
|
</rules>
|
|
270
291
|
`.trim();
|
|
271
292
|
}
|
|
272
|
-
var
|
|
273
|
-
|
|
274
|
-
|
|
275
|
-
|
|
276
|
-
|
|
277
|
-
|
|
278
|
-
|
|
279
|
-
|
|
280
|
-
|
|
281
|
-
|
|
282
|
-
|
|
283
|
-
|
|
284
|
-
|
|
285
|
-
|
|
286
|
-
|
|
287
|
-
|
|
288
|
-
|
|
289
|
-
|
|
290
|
-
|
|
291
|
-
|
|
292
|
-
|
|
293
|
-
|
|
294
|
-
|
|
295
|
-
|
|
296
|
-
|
|
297
|
-
|
|
298
|
-
|
|
293
|
+
var UserSimulatorAgent = class extends UserSimulatorAgentAdapter {
|
|
294
|
+
constructor(cfg) {
|
|
295
|
+
super();
|
|
296
|
+
this.cfg = cfg;
|
|
297
|
+
}
|
|
298
|
+
logger = new Logger(this.constructor.name);
|
|
299
|
+
call = async (input) => {
|
|
300
|
+
const config = this.cfg;
|
|
301
|
+
const systemPrompt = (config == null ? void 0 : config.systemPrompt) ?? buildSystemPrompt2(input.scenarioConfig.description);
|
|
302
|
+
const messages = [
|
|
303
|
+
{ role: "system", content: systemPrompt },
|
|
304
|
+
{ role: "assistant", content: "Hello, how can I help you today" },
|
|
305
|
+
...input.messages
|
|
306
|
+
];
|
|
307
|
+
const projectConfig = await getProjectConfig();
|
|
308
|
+
const mergedConfig = mergeAndValidateConfig(config ?? {}, projectConfig);
|
|
309
|
+
if (!mergedConfig.model) {
|
|
310
|
+
throw new Error("Model is required for the user simulator agent");
|
|
311
|
+
}
|
|
312
|
+
const reversedMessages = messageRoleReversal(messages);
|
|
313
|
+
const completion = await this.generateText({
|
|
314
|
+
model: mergedConfig.model,
|
|
315
|
+
messages: reversedMessages,
|
|
316
|
+
temperature: mergedConfig.temperature ?? DEFAULT_TEMPERATURE,
|
|
317
|
+
maxTokens: mergedConfig.maxTokens
|
|
318
|
+
});
|
|
319
|
+
const messageContent = completion.text;
|
|
320
|
+
if (!messageContent) {
|
|
321
|
+
throw new Error("No response content from LLM");
|
|
299
322
|
}
|
|
323
|
+
return { role: "user", content: messageContent };
|
|
300
324
|
};
|
|
325
|
+
async generateText(input) {
|
|
326
|
+
try {
|
|
327
|
+
return await generateText2(input);
|
|
328
|
+
} catch (error) {
|
|
329
|
+
this.logger.error("Error generating text", { error });
|
|
330
|
+
throw error;
|
|
331
|
+
}
|
|
332
|
+
}
|
|
333
|
+
};
|
|
334
|
+
var userSimulatorAgent = (config) => {
|
|
335
|
+
return new UserSimulatorAgent(config);
|
|
301
336
|
};
|
|
302
337
|
|
|
303
338
|
// src/execution/index.ts
|
|
304
339
|
var execution_exports = {};
|
|
305
340
|
__export(execution_exports, {
|
|
306
341
|
ScenarioExecution: () => ScenarioExecution,
|
|
307
|
-
ScenarioExecutionState: () => ScenarioExecutionState
|
|
342
|
+
ScenarioExecutionState: () => ScenarioExecutionState,
|
|
343
|
+
StateChangeEventType: () => StateChangeEventType
|
|
308
344
|
});
|
|
309
345
|
|
|
310
346
|
// src/execution/scenario-execution.ts
|
|
311
|
-
import { Subject } from "rxjs";
|
|
347
|
+
import { filter, Subject as Subject2 } from "rxjs";
|
|
312
348
|
|
|
313
349
|
// src/execution/scenario-execution-state.ts
|
|
350
|
+
import { Subject } from "rxjs";
|
|
351
|
+
var StateChangeEventType = /* @__PURE__ */ ((StateChangeEventType2) => {
|
|
352
|
+
StateChangeEventType2["MESSAGE_ADDED"] = "MESSAGE_ADDED";
|
|
353
|
+
return StateChangeEventType2;
|
|
354
|
+
})(StateChangeEventType || {});
|
|
314
355
|
var ScenarioExecutionState = class {
|
|
315
356
|
_messages = [];
|
|
316
357
|
_currentTurn = 0;
|
|
317
358
|
_threadId = "";
|
|
359
|
+
/** Event stream for message additions */
|
|
360
|
+
eventSubject = new Subject();
|
|
361
|
+
events$ = this.eventSubject.asObservable();
|
|
318
362
|
description;
|
|
319
363
|
config;
|
|
320
364
|
constructor(config) {
|
|
@@ -342,7 +386,9 @@ var ScenarioExecutionState = class {
|
|
|
342
386
|
* @param message - The message to add.
|
|
343
387
|
*/
|
|
344
388
|
addMessage(message2) {
|
|
345
|
-
|
|
389
|
+
const messageWithId = { ...message2, id: generateMessageId() };
|
|
390
|
+
this._messages.push(messageWithId);
|
|
391
|
+
this.eventSubject.next({ type: "MESSAGE_ADDED" /* MESSAGE_ADDED */ });
|
|
346
392
|
}
|
|
347
393
|
lastMessage() {
|
|
348
394
|
if (this._messages.length === 0) {
|
|
@@ -354,7 +400,9 @@ var ScenarioExecutionState = class {
|
|
|
354
400
|
if (this._messages.length === 0) {
|
|
355
401
|
throw new Error("No messages in history");
|
|
356
402
|
}
|
|
357
|
-
const lastMessage = this._messages.findLast(
|
|
403
|
+
const lastMessage = this._messages.findLast(
|
|
404
|
+
(message2) => message2.role === "user"
|
|
405
|
+
);
|
|
358
406
|
if (!lastMessage) {
|
|
359
407
|
throw new Error("No user message in history");
|
|
360
408
|
}
|
|
@@ -364,7 +412,9 @@ var ScenarioExecutionState = class {
|
|
|
364
412
|
if (this._messages.length === 0) {
|
|
365
413
|
throw new Error("No messages in history");
|
|
366
414
|
}
|
|
367
|
-
const lastMessage = this._messages.findLast(
|
|
415
|
+
const lastMessage = this._messages.findLast(
|
|
416
|
+
(message2) => message2.role === "assistant"
|
|
417
|
+
);
|
|
368
418
|
if (!lastMessage) {
|
|
369
419
|
throw new Error("No agent message in history");
|
|
370
420
|
}
|
|
@@ -374,9 +424,11 @@ var ScenarioExecutionState = class {
|
|
|
374
424
|
if (this._messages.length === 0) {
|
|
375
425
|
throw new Error("No messages in history");
|
|
376
426
|
}
|
|
377
|
-
const lastMessage = this._messages.findLast(
|
|
378
|
-
(
|
|
379
|
-
|
|
427
|
+
const lastMessage = this._messages.findLast(
|
|
428
|
+
(message2) => message2.role === "tool" && message2.content.find(
|
|
429
|
+
(part) => part.type === "tool-result" && part.toolName === toolName
|
|
430
|
+
)
|
|
431
|
+
);
|
|
380
432
|
return lastMessage;
|
|
381
433
|
}
|
|
382
434
|
hasToolCall(toolName) {
|
|
@@ -388,7 +440,7 @@ var ScenarioExecutionState = class {
|
|
|
388
440
|
}
|
|
389
441
|
};
|
|
390
442
|
|
|
391
|
-
// src/utils/
|
|
443
|
+
// src/utils/convert-core-messages-to-agui-messages.ts
|
|
392
444
|
function convertCoreMessagesToAguiMessages(coreMessages) {
|
|
393
445
|
const aguiMessages = [];
|
|
394
446
|
for (const msg of coreMessages) {
|
|
@@ -457,30 +509,53 @@ function convertCoreMessagesToAguiMessages(coreMessages) {
|
|
|
457
509
|
}
|
|
458
510
|
return aguiMessages;
|
|
459
511
|
}
|
|
460
|
-
var
|
|
512
|
+
var convert_core_messages_to_agui_messages_default = convertCoreMessagesToAguiMessages;
|
|
461
513
|
|
|
462
514
|
// src/execution/scenario-execution.ts
|
|
463
515
|
var ScenarioExecution = class {
|
|
516
|
+
/** The current state of the scenario execution */
|
|
464
517
|
state;
|
|
465
|
-
|
|
518
|
+
/** Logger for debugging and monitoring */
|
|
466
519
|
logger = new Logger("scenario.execution.ScenarioExecution");
|
|
520
|
+
/** Finalized configuration with all defaults applied */
|
|
467
521
|
config;
|
|
522
|
+
/** Array of all agents participating in the scenario */
|
|
468
523
|
agents = [];
|
|
524
|
+
/** Roles that still need to act in the current turn (USER, AGENT, JUDGE) */
|
|
469
525
|
pendingRolesOnTurn = [];
|
|
526
|
+
/** Agents that still need to act in the current turn */
|
|
470
527
|
pendingAgentsOnTurn = /* @__PURE__ */ new Set();
|
|
528
|
+
/**
|
|
529
|
+
* Message queues for each agent. When an agent sends a message, it gets
|
|
530
|
+
* broadcast to all other agents' pending message queues. When an agent
|
|
531
|
+
* is called, it receives these pending messages as part of its input.
|
|
532
|
+
*
|
|
533
|
+
* Key: agent index, Value: array of pending messages for that agent
|
|
534
|
+
*/
|
|
471
535
|
pendingMessages = /* @__PURE__ */ new Map();
|
|
536
|
+
/** Intermediate result set by agents that make final decisions */
|
|
472
537
|
partialResult = null;
|
|
538
|
+
/** Accumulated execution time for each agent (for performance tracking) */
|
|
473
539
|
agentTimes = /* @__PURE__ */ new Map();
|
|
540
|
+
/** Timestamp when execution started (for total time calculation) */
|
|
474
541
|
totalStartTime = 0;
|
|
542
|
+
/** Event stream for monitoring scenario progress */
|
|
543
|
+
eventSubject = new Subject2();
|
|
475
544
|
/**
|
|
476
545
|
* An observable stream of events that occur during the scenario execution.
|
|
477
546
|
* Subscribe to this to monitor the progress of the scenario in real-time.
|
|
547
|
+
*
|
|
548
|
+
* Events include:
|
|
549
|
+
* - RUN_STARTED: When scenario execution begins
|
|
550
|
+
* - MESSAGE_SNAPSHOT: After each message is added to the conversation
|
|
551
|
+
* - RUN_FINISHED: When scenario execution completes (success/failure/error)
|
|
478
552
|
*/
|
|
479
553
|
events$ = this.eventSubject.asObservable();
|
|
480
554
|
/**
|
|
481
555
|
* Creates a new ScenarioExecution instance.
|
|
482
|
-
*
|
|
483
|
-
* @param
|
|
556
|
+
*
|
|
557
|
+
* @param config - The scenario configuration containing agents, settings, and metadata
|
|
558
|
+
* @param script - The ordered sequence of script steps that define the test flow
|
|
484
559
|
*/
|
|
485
560
|
constructor(config, script) {
|
|
486
561
|
this.config = {
|
|
@@ -498,13 +573,18 @@ var ScenarioExecution = class {
|
|
|
498
573
|
this.reset();
|
|
499
574
|
}
|
|
500
575
|
/**
|
|
501
|
-
*
|
|
576
|
+
* Gets the complete conversation history as an array of messages.
|
|
577
|
+
*
|
|
578
|
+
* @returns Array of CoreMessage objects representing the full conversation
|
|
502
579
|
*/
|
|
503
580
|
get messages() {
|
|
504
581
|
return this.state.messages;
|
|
505
582
|
}
|
|
506
583
|
/**
|
|
507
|
-
*
|
|
584
|
+
* Gets the unique identifier for the conversation thread.
|
|
585
|
+
* This ID is used to maintain conversation context across multiple runs.
|
|
586
|
+
*
|
|
587
|
+
* @returns The thread identifier string
|
|
508
588
|
*/
|
|
509
589
|
get threadId() {
|
|
510
590
|
return this.state.threadId;
|
|
@@ -517,21 +597,43 @@ var ScenarioExecution = class {
|
|
|
517
597
|
}
|
|
518
598
|
/**
|
|
519
599
|
* Executes the entire scenario from start to finish.
|
|
520
|
-
*
|
|
521
|
-
*
|
|
522
|
-
*
|
|
600
|
+
*
|
|
601
|
+
* This method runs through all script steps sequentially until a final result
|
|
602
|
+
* (success, failure, or error) is determined. Each script step can trigger one or
|
|
603
|
+
* more agent interactions depending on the step type:
|
|
604
|
+
* - `user()` and `agent()` steps typically trigger one agent interaction each
|
|
605
|
+
* - `proceed()` steps can trigger multiple agent interactions across multiple turns
|
|
606
|
+
* - `judge()` steps trigger the judge agent to evaluate the conversation
|
|
607
|
+
* - `succeed()` and `fail()` steps immediately end the scenario
|
|
608
|
+
*
|
|
609
|
+
* The execution will stop early if:
|
|
610
|
+
* - A script step returns a ScenarioResult
|
|
611
|
+
* - The maximum number of turns is reached
|
|
612
|
+
* - An error occurs during execution
|
|
613
|
+
*
|
|
614
|
+
* @returns A promise that resolves with the final result of the scenario
|
|
615
|
+
* @throws Error if an unhandled exception occurs during execution
|
|
616
|
+
*
|
|
617
|
+
* @example
|
|
618
|
+
* ```typescript
|
|
619
|
+
* const execution = new ScenarioExecution(config, script);
|
|
620
|
+
* const result = await execution.execute();
|
|
621
|
+
* console.log(`Scenario ${result.success ? 'passed' : 'failed'}`);
|
|
622
|
+
* ```
|
|
523
623
|
*/
|
|
524
624
|
async execute() {
|
|
525
625
|
this.reset();
|
|
526
626
|
const scenarioRunId = generateScenarioRunId();
|
|
527
627
|
this.emitRunStarted({ scenarioRunId });
|
|
628
|
+
const subscription = this.state.events$.pipe(
|
|
629
|
+
filter((event) => event.type === "MESSAGE_ADDED" /* MESSAGE_ADDED */)
|
|
630
|
+
).subscribe(() => {
|
|
631
|
+
this.emitMessageSnapshot({ scenarioRunId });
|
|
632
|
+
});
|
|
528
633
|
try {
|
|
529
|
-
for (
|
|
530
|
-
this.
|
|
531
|
-
|
|
532
|
-
});
|
|
533
|
-
const result = await scriptStep(this.state, this);
|
|
534
|
-
this.emitMessageSnapshot({ scenarioRunId });
|
|
634
|
+
for (let i = 0; i < this.config.script.length; i++) {
|
|
635
|
+
const scriptStep = this.config.script[i];
|
|
636
|
+
const result = await this.executeScriptStep(scriptStep, i);
|
|
535
637
|
if (result && typeof result === "object" && "success" in result) {
|
|
536
638
|
this.emitRunFinished({
|
|
537
639
|
scenarioRunId,
|
|
@@ -551,27 +653,58 @@ var ScenarioExecution = class {
|
|
|
551
653
|
].join("\n")
|
|
552
654
|
);
|
|
553
655
|
} catch (error) {
|
|
656
|
+
const errorInfo = extractErrorInfo(error);
|
|
554
657
|
const errorResult = {
|
|
555
658
|
success: false,
|
|
556
659
|
messages: this.state.messages,
|
|
557
|
-
reasoning: `Scenario failed with error: ${
|
|
660
|
+
reasoning: `Scenario failed with error: ${errorInfo.message}`,
|
|
558
661
|
metCriteria: [],
|
|
559
662
|
unmetCriteria: [],
|
|
560
|
-
error:
|
|
663
|
+
error: JSON.stringify(errorInfo)
|
|
561
664
|
};
|
|
562
665
|
this.emitRunFinished({
|
|
563
666
|
scenarioRunId,
|
|
564
667
|
status: "ERROR" /* ERROR */,
|
|
565
668
|
result: errorResult
|
|
566
669
|
});
|
|
567
|
-
|
|
670
|
+
throw error;
|
|
671
|
+
} finally {
|
|
672
|
+
subscription.unsubscribe();
|
|
568
673
|
}
|
|
569
674
|
}
|
|
570
675
|
/**
|
|
571
|
-
* Executes a single
|
|
572
|
-
*
|
|
573
|
-
* for
|
|
574
|
-
*
|
|
676
|
+
* Executes a single agent interaction in the scenario.
|
|
677
|
+
*
|
|
678
|
+
* This method is for manual step-by-step execution of the scenario, where each call
|
|
679
|
+
* represents one agent taking their turn. This is different from script steps (like
|
|
680
|
+
* `user()`, `agent()`, `proceed()`, etc.) which are functions in the scenario script.
|
|
681
|
+
*
|
|
682
|
+
* Each call to this method will:
|
|
683
|
+
* - Progress to the next turn if needed
|
|
684
|
+
* - Find the next agent that should act
|
|
685
|
+
* - Execute that agent's response
|
|
686
|
+
* - Return either new messages or a final scenario result
|
|
687
|
+
*
|
|
688
|
+
* Note: This method is primarily for debugging or custom execution flows. Most users
|
|
689
|
+
* will use `execute()` to run the entire scenario automatically.
|
|
690
|
+
*
|
|
691
|
+
* @returns A promise that resolves with either:
|
|
692
|
+
* - Array of new messages added during the agent interaction, or
|
|
693
|
+
* - A final ScenarioResult if the interaction concludes the scenario
|
|
694
|
+
* @throws Error if no result is returned from the step
|
|
695
|
+
*
|
|
696
|
+
* @example
|
|
697
|
+
* ```typescript
|
|
698
|
+
* const execution = new ScenarioExecution(config, script);
|
|
699
|
+
*
|
|
700
|
+
* // Execute one agent interaction at a time
|
|
701
|
+
* const messages = await execution.step();
|
|
702
|
+
* if (Array.isArray(messages)) {
|
|
703
|
+
* console.log('New messages:', messages);
|
|
704
|
+
* } else {
|
|
705
|
+
* console.log('Scenario finished:', messages.success);
|
|
706
|
+
* }
|
|
707
|
+
* ```
|
|
575
708
|
*/
|
|
576
709
|
async step() {
|
|
577
710
|
const result = await this._step();
|
|
@@ -595,6 +728,34 @@ var ScenarioExecution = class {
|
|
|
595
728
|
this.removePendingAgent(nextAgent);
|
|
596
729
|
return await this.callAgent(idx, currentRole);
|
|
597
730
|
}
|
|
731
|
+
/**
|
|
732
|
+
* Calls a specific agent to generate a response or make a decision.
|
|
733
|
+
*
|
|
734
|
+
* This method is the core of agent interaction. It prepares the agent's input
|
|
735
|
+
* by combining the conversation history with any pending messages that have been
|
|
736
|
+
* broadcast to this agent, then calls the agent and processes its response.
|
|
737
|
+
*
|
|
738
|
+
* The agent input includes:
|
|
739
|
+
* - Full conversation history (this.state.messages)
|
|
740
|
+
* - New messages that have been broadcast to this agent (this.pendingMessages.get(idx))
|
|
741
|
+
* - The role the agent is being asked to play
|
|
742
|
+
* - Whether this is a judgment request (for judge agents)
|
|
743
|
+
* - Current scenario state and configuration
|
|
744
|
+
*
|
|
745
|
+
* After the agent responds:
|
|
746
|
+
* - Performance timing is recorded
|
|
747
|
+
* - Pending messages for this agent are cleared (they've been processed)
|
|
748
|
+
* - If the agent returns a ScenarioResult, it's returned immediately
|
|
749
|
+
* - Otherwise, the agent's messages are added to the conversation and broadcast
|
|
750
|
+
*
|
|
751
|
+
* @param idx - The index of the agent in the agents array
|
|
752
|
+
* @param role - The role the agent is being asked to play (USER, AGENT, or JUDGE)
|
|
753
|
+
* @param judgmentRequest - Whether this is a judgment request (for judge agents)
|
|
754
|
+
* @returns A promise that resolves with either:
|
|
755
|
+
* - Array of messages if the agent generated a response, or
|
|
756
|
+
* - ScenarioResult if the agent made a final decision
|
|
757
|
+
* @throws Error if the agent call fails
|
|
758
|
+
*/
|
|
598
759
|
async callAgent(idx, role, judgmentRequest = false) {
|
|
599
760
|
const agent2 = this.agents[idx];
|
|
600
761
|
const startTime = Date.now();
|
|
@@ -607,29 +768,55 @@ var ScenarioExecution = class {
|
|
|
607
768
|
scenarioState: this.state,
|
|
608
769
|
scenarioConfig: this.config
|
|
609
770
|
};
|
|
610
|
-
|
|
611
|
-
|
|
612
|
-
|
|
613
|
-
|
|
614
|
-
|
|
615
|
-
|
|
616
|
-
|
|
617
|
-
|
|
618
|
-
|
|
619
|
-
|
|
620
|
-
|
|
621
|
-
|
|
622
|
-
|
|
623
|
-
|
|
624
|
-
|
|
625
|
-
|
|
771
|
+
try {
|
|
772
|
+
const agentResponse = await agent2.call(agentInput);
|
|
773
|
+
const endTime = Date.now();
|
|
774
|
+
this.addAgentTime(idx, endTime - startTime);
|
|
775
|
+
this.pendingMessages.delete(idx);
|
|
776
|
+
if (agentResponse && typeof agentResponse === "object" && "success" in agentResponse) {
|
|
777
|
+
return agentResponse;
|
|
778
|
+
}
|
|
779
|
+
const currentAgentTime = this.agentTimes.get(idx) ?? 0;
|
|
780
|
+
this.agentTimes.set(idx, currentAgentTime + (Date.now() - startTime));
|
|
781
|
+
const messages = convertAgentReturnTypesToMessages(
|
|
782
|
+
agentResponse,
|
|
783
|
+
role === "User" /* USER */ ? "user" : "assistant"
|
|
784
|
+
);
|
|
785
|
+
for (const message2 of messages) {
|
|
786
|
+
this.state.addMessage(message2);
|
|
787
|
+
this.broadcastMessage(message2, idx);
|
|
788
|
+
}
|
|
789
|
+
return messages;
|
|
790
|
+
} catch (error) {
|
|
791
|
+
this.logger.error(
|
|
792
|
+
`[${this.config.id}] Error calling agent ${agent2.constructor.name}`,
|
|
793
|
+
{
|
|
794
|
+
error: error instanceof Error ? error.message : String(error),
|
|
795
|
+
agent: agent2.constructor.name,
|
|
796
|
+
agentInput
|
|
797
|
+
}
|
|
798
|
+
);
|
|
799
|
+
throw error;
|
|
626
800
|
}
|
|
627
|
-
return messages;
|
|
628
801
|
}
|
|
629
802
|
/**
|
|
630
803
|
* Adds a message to the conversation history.
|
|
631
|
-
*
|
|
632
|
-
*
|
|
804
|
+
*
|
|
805
|
+
* This method is part of the ScenarioExecutionLike interface used by script steps.
|
|
806
|
+
* It automatically routes the message to the appropriate agent based on the message role:
|
|
807
|
+
* - "user" messages are routed to USER role agents
|
|
808
|
+
* - "assistant" messages are routed to AGENT role agents
|
|
809
|
+
* - Other message types are added directly to the conversation
|
|
810
|
+
*
|
|
811
|
+
* @param message - The CoreMessage to add to the conversation
|
|
812
|
+
*
|
|
813
|
+
* @example
|
|
814
|
+
* ```typescript
|
|
815
|
+
* await execution.message({
|
|
816
|
+
* role: "user",
|
|
817
|
+
* content: "Hello, how are you?"
|
|
818
|
+
* });
|
|
819
|
+
* ```
|
|
633
820
|
*/
|
|
634
821
|
async message(message2) {
|
|
635
822
|
if (message2.role === "user") {
|
|
@@ -642,42 +829,134 @@ var ScenarioExecution = class {
|
|
|
642
829
|
}
|
|
643
830
|
}
|
|
644
831
|
/**
|
|
645
|
-
* Executes a user turn.
|
|
646
|
-
*
|
|
647
|
-
* If
|
|
648
|
-
*
|
|
649
|
-
*
|
|
832
|
+
* Executes a user turn in the conversation.
|
|
833
|
+
*
|
|
834
|
+
* If content is provided, it's used directly as the user's message. If not provided,
|
|
835
|
+
* the user simulator agent is called to generate an appropriate response based on
|
|
836
|
+
* the current conversation context.
|
|
837
|
+
*
|
|
838
|
+
* This method is part of the ScenarioExecutionLike interface used by script steps.
|
|
839
|
+
*
|
|
840
|
+
* @param content - Optional content for the user's message. Can be a string or CoreMessage.
|
|
841
|
+
* If not provided, the user simulator agent will generate the content.
|
|
842
|
+
*
|
|
843
|
+
* @example
|
|
844
|
+
* ```typescript
|
|
845
|
+
* // Use provided content
|
|
846
|
+
* await execution.user("What's the weather like?");
|
|
847
|
+
*
|
|
848
|
+
* // Let user simulator generate content
|
|
849
|
+
* await execution.user();
|
|
850
|
+
*
|
|
851
|
+
* // Use a CoreMessage object
|
|
852
|
+
* await execution.user({
|
|
853
|
+
* role: "user",
|
|
854
|
+
* content: "Tell me a joke"
|
|
855
|
+
* });
|
|
856
|
+
* ```
|
|
650
857
|
*/
|
|
651
858
|
async user(content) {
|
|
652
859
|
await this.scriptCallAgent("User" /* USER */, content);
|
|
653
860
|
}
|
|
654
861
|
/**
|
|
655
|
-
* Executes an agent turn.
|
|
656
|
-
*
|
|
657
|
-
* If
|
|
658
|
-
*
|
|
659
|
-
*
|
|
862
|
+
* Executes an agent turn in the conversation.
|
|
863
|
+
*
|
|
864
|
+
* If content is provided, it's used directly as the agent's response. If not provided,
|
|
865
|
+
* the agent under test is called to generate a response based on the current conversation
|
|
866
|
+
* context and any pending messages.
|
|
867
|
+
*
|
|
868
|
+
* This method is part of the ScenarioExecutionLike interface used by script steps.
|
|
869
|
+
*
|
|
870
|
+
* @param content - Optional content for the agent's response. Can be a string or CoreMessage.
|
|
871
|
+
* If not provided, the agent under test will generate the response.
|
|
872
|
+
*
|
|
873
|
+
* @example
|
|
874
|
+
* ```typescript
|
|
875
|
+
* // Let agent generate response
|
|
876
|
+
* await execution.agent();
|
|
877
|
+
*
|
|
878
|
+
* // Use provided content
|
|
879
|
+
* await execution.agent("The weather is sunny today!");
|
|
880
|
+
*
|
|
881
|
+
* // Use a CoreMessage object
|
|
882
|
+
* await execution.agent({
|
|
883
|
+
* role: "assistant",
|
|
884
|
+
* content: "I'm here to help you with weather information."
|
|
885
|
+
* });
|
|
886
|
+
* ```
|
|
660
887
|
*/
|
|
661
888
|
async agent(content) {
|
|
662
889
|
await this.scriptCallAgent("Agent" /* AGENT */, content);
|
|
663
890
|
}
|
|
664
891
|
/**
|
|
665
892
|
* Invokes the judge agent to evaluate the current state of the conversation.
|
|
666
|
-
*
|
|
667
|
-
*
|
|
668
|
-
*
|
|
893
|
+
*
|
|
894
|
+
* The judge agent analyzes the conversation history and determines whether the
|
|
895
|
+
* scenario criteria have been met. This can result in either:
|
|
896
|
+
* - A final scenario result (success/failure) if the judge makes a decision
|
|
897
|
+
* - Null if the judge needs more information or conversation to continue
|
|
898
|
+
*
|
|
899
|
+
* This method is part of the ScenarioExecutionLike interface used by script steps.
|
|
900
|
+
*
|
|
901
|
+
* @param content - Optional message to pass to the judge agent for additional context
|
|
902
|
+
* @returns A promise that resolves with:
|
|
903
|
+
* - ScenarioResult if the judge makes a final decision, or
|
|
904
|
+
* - Null if the conversation should continue
|
|
905
|
+
*
|
|
906
|
+
* @example
|
|
907
|
+
* ```typescript
|
|
908
|
+
* // Let judge evaluate current state
|
|
909
|
+
* const result = await execution.judge();
|
|
910
|
+
* if (result) {
|
|
911
|
+
* console.log(`Judge decided: ${result.success ? 'pass' : 'fail'}`);
|
|
912
|
+
* }
|
|
913
|
+
*
|
|
914
|
+
* // Provide additional context to judge
|
|
915
|
+
* const result = await execution.judge("Please consider the user's satisfaction level");
|
|
916
|
+
* ```
|
|
669
917
|
*/
|
|
670
918
|
async judge(content) {
|
|
671
919
|
return await this.scriptCallAgent("Judge" /* JUDGE */, content, true);
|
|
672
920
|
}
|
|
673
921
|
/**
|
|
674
922
|
* Lets the scenario proceed automatically for a specified number of turns.
|
|
675
|
-
*
|
|
676
|
-
* This is
|
|
677
|
-
*
|
|
678
|
-
*
|
|
679
|
-
*
|
|
680
|
-
*
|
|
923
|
+
*
|
|
924
|
+
* This method is a script step that simulates natural conversation flow by allowing
|
|
925
|
+
* agents to interact automatically without explicit script steps. It can trigger
|
|
926
|
+
* multiple agent interactions across multiple turns, making it useful for testing
|
|
927
|
+
* scenarios where you want to see how agents behave in extended conversations.
|
|
928
|
+
*
|
|
929
|
+
* Unlike other script steps that typically trigger one agent interaction each,
|
|
930
|
+
* this step can trigger many agent interactions depending on the number of turns
|
|
931
|
+
* and the agents' behavior.
|
|
932
|
+
*
|
|
933
|
+
* The method will continue until:
|
|
934
|
+
* - The specified number of turns is reached
|
|
935
|
+
* - A final scenario result is determined
|
|
936
|
+
* - The maximum turns limit is reached
|
|
937
|
+
*
|
|
938
|
+
* @param turns - The number of turns to proceed. If undefined, runs until a conclusion
|
|
939
|
+
* or max turns is reached
|
|
940
|
+
* @param onTurn - Optional callback executed at the end of each turn. Receives the
|
|
941
|
+
* current execution state
|
|
942
|
+
* @param onStep - Optional callback executed after each agent interaction. Receives
|
|
943
|
+
* the current execution state
|
|
944
|
+
* @returns A promise that resolves with:
|
|
945
|
+
* - ScenarioResult if a conclusion is reached during the proceeding, or
|
|
946
|
+
* - Null if the specified turns complete without conclusion
|
|
947
|
+
*
|
|
948
|
+
* @example
|
|
949
|
+
* ```typescript
|
|
950
|
+
* // Proceed for 5 turns
|
|
951
|
+
* const result = await execution.proceed(5);
|
|
952
|
+
*
|
|
953
|
+
* // Proceed until conclusion with callbacks
|
|
954
|
+
* const finalResult = await execution.proceed(
|
|
955
|
+
* undefined,
|
|
956
|
+
* (state) => console.log(`Turn ${state.currentTurn} completed`),
|
|
957
|
+
* (state) => console.log(`Agent interaction completed, ${state.messages.length} messages`)
|
|
958
|
+
* );
|
|
959
|
+
* ```
|
|
681
960
|
*/
|
|
682
961
|
async proceed(turns, onTurn, onStep) {
|
|
683
962
|
let initialTurn = this.state.currentTurn;
|
|
@@ -695,9 +974,26 @@ var ScenarioExecution = class {
|
|
|
695
974
|
}
|
|
696
975
|
/**
|
|
697
976
|
* Immediately ends the scenario with a success verdict.
|
|
698
|
-
*
|
|
699
|
-
*
|
|
700
|
-
*
|
|
977
|
+
*
|
|
978
|
+
* This method forces the scenario to end successfully, regardless of the current
|
|
979
|
+
* conversation state. It's useful for scenarios where you want to explicitly
|
|
980
|
+
* mark success based on specific conditions or external factors.
|
|
981
|
+
*
|
|
982
|
+
* This method is part of the ScenarioExecutionLike interface used by script steps.
|
|
983
|
+
*
|
|
984
|
+
* @param reasoning - Optional explanation for why the scenario is being marked as successful
|
|
985
|
+
* @returns A promise that resolves with the final successful scenario result
|
|
986
|
+
*
|
|
987
|
+
* @example
|
|
988
|
+
* ```typescript
|
|
989
|
+
* // Mark success with default reasoning
|
|
990
|
+
* const result = await execution.succeed();
|
|
991
|
+
*
|
|
992
|
+
* // Mark success with custom reasoning
|
|
993
|
+
* const customResult = await execution.succeed(
|
|
994
|
+
* "User successfully completed the onboarding flow"
|
|
995
|
+
* );
|
|
996
|
+
* ```
|
|
701
997
|
*/
|
|
702
998
|
async succeed(reasoning) {
|
|
703
999
|
return {
|
|
@@ -710,9 +1006,26 @@ var ScenarioExecution = class {
|
|
|
710
1006
|
}
|
|
711
1007
|
/**
|
|
712
1008
|
* Immediately ends the scenario with a failure verdict.
|
|
713
|
-
*
|
|
714
|
-
*
|
|
715
|
-
*
|
|
1009
|
+
*
|
|
1010
|
+
* This method forces the scenario to end with failure, regardless of the current
|
|
1011
|
+
* conversation state. It's useful for scenarios where you want to explicitly
|
|
1012
|
+
* mark failure based on specific conditions or external factors.
|
|
1013
|
+
*
|
|
1014
|
+
* This method is part of the ScenarioExecutionLike interface used by script steps.
|
|
1015
|
+
*
|
|
1016
|
+
* @param reasoning - Optional explanation for why the scenario is being marked as failed
|
|
1017
|
+
* @returns A promise that resolves with the final failed scenario result
|
|
1018
|
+
*
|
|
1019
|
+
* @example
|
|
1020
|
+
* ```typescript
|
|
1021
|
+
* // Mark failure with default reasoning
|
|
1022
|
+
* const result = await execution.fail();
|
|
1023
|
+
*
|
|
1024
|
+
* // Mark failure with custom reasoning
|
|
1025
|
+
* const customResult = await execution.fail(
|
|
1026
|
+
* "Agent failed to provide accurate weather information"
|
|
1027
|
+
* );
|
|
1028
|
+
* ```
|
|
716
1029
|
*/
|
|
717
1030
|
async fail(reasoning) {
|
|
718
1031
|
return {
|
|
@@ -723,16 +1036,95 @@ var ScenarioExecution = class {
|
|
|
723
1036
|
unmetCriteria: []
|
|
724
1037
|
};
|
|
725
1038
|
}
|
|
1039
|
+
/**
|
|
1040
|
+
* Adds execution time for a specific agent to the performance tracking.
|
|
1041
|
+
*
|
|
1042
|
+
* This method is used internally to track how long each agent takes to respond,
|
|
1043
|
+
* which is included in the final scenario result for performance analysis.
|
|
1044
|
+
* The accumulated time for each agent is used to calculate total agent response
|
|
1045
|
+
* times in the scenario result.
|
|
1046
|
+
*
|
|
1047
|
+
* @param agentIdx - The index of the agent in the agents array
|
|
1048
|
+
* @param time - The execution time in milliseconds to add to the agent's total
|
|
1049
|
+
*
|
|
1050
|
+
* @example
|
|
1051
|
+
* ```typescript
|
|
1052
|
+
* // This is typically called internally by the execution engine
|
|
1053
|
+
* execution.addAgentTime(0, 1500); // Agent at index 0 took 1.5 seconds
|
|
1054
|
+
* ```
|
|
1055
|
+
*/
|
|
726
1056
|
addAgentTime(agentIdx, time) {
|
|
727
1057
|
const currentTime = this.agentTimes.get(agentIdx) || 0;
|
|
728
1058
|
this.agentTimes.set(agentIdx, currentTime + time);
|
|
729
1059
|
}
|
|
1060
|
+
/**
|
|
1061
|
+
* Checks if a partial result has been set for the scenario.
|
|
1062
|
+
*
|
|
1063
|
+
* This method is used internally to determine if a scenario has already reached
|
|
1064
|
+
* a conclusion (success or failure) but hasn't been finalized yet. Partial results
|
|
1065
|
+
* are typically set by agents that make final decisions (like judge agents) and
|
|
1066
|
+
* are later finalized with the complete message history.
|
|
1067
|
+
*
|
|
1068
|
+
* @returns True if a partial result exists, false otherwise
|
|
1069
|
+
*
|
|
1070
|
+
* @example
|
|
1071
|
+
* ```typescript
|
|
1072
|
+
* // This is typically used internally by the execution engine
|
|
1073
|
+
* if (execution.hasResult()) {
|
|
1074
|
+
* console.log('Scenario has reached a conclusion');
|
|
1075
|
+
* }
|
|
1076
|
+
* ```
|
|
1077
|
+
*/
|
|
730
1078
|
hasResult() {
|
|
731
1079
|
return this.partialResult !== null;
|
|
732
1080
|
}
|
|
1081
|
+
/**
|
|
1082
|
+
* Sets a partial result for the scenario.
|
|
1083
|
+
*
|
|
1084
|
+
* This method is used internally to store intermediate results that may be
|
|
1085
|
+
* finalized later with the complete message history. Partial results are typically
|
|
1086
|
+
* created by agents that make final decisions (like judge agents) and contain
|
|
1087
|
+
* the success/failure status, reasoning, and criteria evaluation, but not the
|
|
1088
|
+
* complete message history.
|
|
1089
|
+
*
|
|
1090
|
+
* @param result - The partial result without the messages field. Should include
|
|
1091
|
+
* success status, reasoning, and criteria evaluation.
|
|
1092
|
+
*
|
|
1093
|
+
* @example
|
|
1094
|
+
* ```typescript
|
|
1095
|
+
* // This is typically called internally by agents that make final decisions
|
|
1096
|
+
* execution.setResult({
|
|
1097
|
+
* success: true,
|
|
1098
|
+
* reasoning: "Agent provided accurate weather information",
|
|
1099
|
+
* metCriteria: ["Provides accurate weather data"],
|
|
1100
|
+
* unmetCriteria: []
|
|
1101
|
+
* });
|
|
1102
|
+
* ```
|
|
1103
|
+
*/
|
|
733
1104
|
setResult(result) {
|
|
734
1105
|
this.partialResult = result;
|
|
735
1106
|
}
|
|
1107
|
+
/**
|
|
1108
|
+
* Internal method to handle script step calls to agents.
|
|
1109
|
+
*
|
|
1110
|
+
* This method is the core logic for executing script steps that involve agent
|
|
1111
|
+
* interactions. It handles finding the appropriate agent for the given role,
|
|
1112
|
+
* managing turn progression, and executing the agent's response.
|
|
1113
|
+
*
|
|
1114
|
+
* The method will:
|
|
1115
|
+
* - Find the next available agent for the specified role
|
|
1116
|
+
* - Progress to a new turn if no agent is available
|
|
1117
|
+
* - Execute the agent with the provided content or let it generate content
|
|
1118
|
+
* - Handle judgment requests for judge agents
|
|
1119
|
+
* - Return a final result if the agent makes a decision
|
|
1120
|
+
*
|
|
1121
|
+
* @param role - The role of the agent to call (USER, AGENT, or JUDGE)
|
|
1122
|
+
* @param content - Optional content to use instead of letting the agent generate it
|
|
1123
|
+
* @param judgmentRequest - Whether this is a judgment request (for judge agents)
|
|
1124
|
+
* @returns A promise that resolves with a ScenarioResult if the agent makes a final
|
|
1125
|
+
* decision, or null if the conversation should continue
|
|
1126
|
+
* @throws Error if no agent is found for the specified role
|
|
1127
|
+
*/
|
|
736
1128
|
async scriptCallAgent(role, content, judgmentRequest = false) {
|
|
737
1129
|
this.consumeUntilRole(role);
|
|
738
1130
|
let index = -1;
|
|
@@ -784,6 +1176,21 @@ var ScenarioExecution = class {
|
|
|
784
1176
|
}
|
|
785
1177
|
return null;
|
|
786
1178
|
}
|
|
1179
|
+
/**
|
|
1180
|
+
* Resets the scenario execution to its initial state.
|
|
1181
|
+
*
|
|
1182
|
+
* This method is called at the beginning of each execution to ensure a clean
|
|
1183
|
+
* state. It creates a new execution state, initializes agents, sets up the
|
|
1184
|
+
* first turn, and clears any pending messages or partial results.
|
|
1185
|
+
*
|
|
1186
|
+
* The reset process:
|
|
1187
|
+
* - Creates a new ScenarioExecutionState with the current config
|
|
1188
|
+
* - Sets up the thread ID (generates new one if not provided)
|
|
1189
|
+
* - Initializes all agents
|
|
1190
|
+
* - Starts the first turn
|
|
1191
|
+
* - Records the start time for performance tracking
|
|
1192
|
+
* - Clears any pending messages
|
|
1193
|
+
*/
|
|
787
1194
|
reset() {
|
|
788
1195
|
this.state = new ScenarioExecutionState(this.config);
|
|
789
1196
|
this.state.threadId = this.config.threadId || generateThreadId();
|
|
@@ -801,6 +1208,16 @@ var ScenarioExecution = class {
|
|
|
801
1208
|
}
|
|
802
1209
|
return { idx: -1, agent: null };
|
|
803
1210
|
}
|
|
1211
|
+
/**
|
|
1212
|
+
* Starts a new turn in the scenario execution.
|
|
1213
|
+
*
|
|
1214
|
+
* This method is called when transitioning to a new turn. It resets the pending
|
|
1215
|
+
* agents and roles for the turn, allowing all agents to participate again in
|
|
1216
|
+
* the new turn. The turn counter is incremented to track the current turn number.
|
|
1217
|
+
*
|
|
1218
|
+
* A turn represents a cycle where agents can take actions. Each turn can involve
|
|
1219
|
+
* multiple agent interactions as agents respond to each other's messages.
|
|
1220
|
+
*/
|
|
804
1221
|
newTurn() {
|
|
805
1222
|
this.pendingAgentsOnTurn = new Set(this.agents);
|
|
806
1223
|
this.pendingRolesOnTurn = [
|
|
@@ -843,6 +1260,23 @@ var ScenarioExecution = class {
|
|
|
843
1260
|
this.pendingRolesOnTurn.pop();
|
|
844
1261
|
}
|
|
845
1262
|
}
|
|
1263
|
+
/**
|
|
1264
|
+
* Creates a failure result when the maximum number of turns is reached.
|
|
1265
|
+
*
|
|
1266
|
+
* This method is called when the scenario execution reaches the maximum number
|
|
1267
|
+
* of turns without reaching a conclusion. It creates a failure result with
|
|
1268
|
+
* appropriate reasoning and includes performance metrics.
|
|
1269
|
+
*
|
|
1270
|
+
* The result includes:
|
|
1271
|
+
* - All messages from the conversation
|
|
1272
|
+
* - Failure reasoning explaining the turn limit was reached
|
|
1273
|
+
* - Empty met criteria (since no conclusion was reached)
|
|
1274
|
+
* - All judge criteria as unmet (since no evaluation was completed)
|
|
1275
|
+
* - Total execution time and agent response times
|
|
1276
|
+
*
|
|
1277
|
+
* @param errorMessage - Optional custom error message to use instead of the default
|
|
1278
|
+
* @returns A ScenarioResult indicating failure due to reaching max turns
|
|
1279
|
+
*/
|
|
846
1280
|
reachedMaxTurns(errorMessage) {
|
|
847
1281
|
var _a;
|
|
848
1282
|
const agentRoleAgentsIdx = this.agents.map((agent2, i) => ({ agent: agent2, idx: i })).filter(({ agent: agent2 }) => agent2.role === "Agent" /* AGENT */).map(({ idx }) => idx);
|
|
@@ -903,7 +1337,7 @@ var ScenarioExecution = class {
|
|
|
903
1337
|
this.emitEvent({
|
|
904
1338
|
...this.makeBaseEvent({ scenarioRunId }),
|
|
905
1339
|
type: "SCENARIO_MESSAGE_SNAPSHOT" /* MESSAGE_SNAPSHOT */,
|
|
906
|
-
messages:
|
|
1340
|
+
messages: convert_core_messages_to_agui_messages_default(this.state.messages)
|
|
907
1341
|
// Add any other required fields from MessagesSnapshotEventSchema
|
|
908
1342
|
});
|
|
909
1343
|
}
|
|
@@ -934,8 +1368,31 @@ var ScenarioExecution = class {
|
|
|
934
1368
|
/**
|
|
935
1369
|
* Distributes a message to all other agents in the scenario.
|
|
936
1370
|
*
|
|
937
|
-
*
|
|
938
|
-
*
|
|
1371
|
+
* This method implements the message broadcasting system that allows agents to
|
|
1372
|
+
* "hear" messages from other agents. When an agent sends a message, it needs to
|
|
1373
|
+
* be distributed to all other agents so they can respond appropriately.
|
|
1374
|
+
*
|
|
1375
|
+
* The broadcasting process:
|
|
1376
|
+
* 1. Iterates through all agents in the scenario
|
|
1377
|
+
* 2. Skips the agent that sent the message (to avoid echo)
|
|
1378
|
+
* 3. Adds the message to each agent's pending message queue
|
|
1379
|
+
* 4. Agents will receive these messages when they're called next
|
|
1380
|
+
*
|
|
1381
|
+
* This creates a realistic conversation environment where agents can see
|
|
1382
|
+
* the full conversation history and respond contextually.
|
|
1383
|
+
*
|
|
1384
|
+
* @param message - The message to broadcast to all other agents
|
|
1385
|
+
* @param fromAgentIdx - The index of the agent that sent the message (to avoid echoing back to sender)
|
|
1386
|
+
*
|
|
1387
|
+
* @example
|
|
1388
|
+
* ```typescript
|
|
1389
|
+
* // When agent 0 sends a message, it gets broadcast to agents 1 and 2
|
|
1390
|
+
* execution.broadcastMessage(
|
|
1391
|
+
* { role: "user", content: "Hello" },
|
|
1392
|
+
* 0 // fromAgentIdx
|
|
1393
|
+
* );
|
|
1394
|
+
* // Now agents 1 and 2 have this message in their pendingMessages queue
|
|
1395
|
+
* ```
|
|
939
1396
|
*/
|
|
940
1397
|
broadcastMessage(message2, fromAgentIdx) {
|
|
941
1398
|
for (let idx = 0; idx < this.agents.length; idx++) {
|
|
@@ -946,6 +1403,58 @@ var ScenarioExecution = class {
|
|
|
946
1403
|
this.pendingMessages.get(idx).push(message2);
|
|
947
1404
|
}
|
|
948
1405
|
}
|
|
1406
|
+
/**
|
|
1407
|
+
* Executes a single script step with proper error handling and logging.
|
|
1408
|
+
*
|
|
1409
|
+
* This method is responsible for executing each script step function with
|
|
1410
|
+
* comprehensive error handling and logging. It provides the execution context
|
|
1411
|
+
* to the script step and handles any errors that occur during execution.
|
|
1412
|
+
*
|
|
1413
|
+
* The method:
|
|
1414
|
+
* - Logs the start of script step execution
|
|
1415
|
+
* - Calls the script step function with the current state and execution context
|
|
1416
|
+
* - Logs the completion of the script step
|
|
1417
|
+
* - Handles and logs any errors that occur
|
|
1418
|
+
* - Re-throws errors to maintain the original error context
|
|
1419
|
+
*
|
|
1420
|
+
* @param scriptStep - The script step function to execute (user, agent, judge, etc.)
|
|
1421
|
+
* @param stepIndex - The index of the script step for logging and debugging context
|
|
1422
|
+
* @returns The result of the script step execution (void, ScenarioResult, or null)
|
|
1423
|
+
* @throws Error if the script step throws an error (preserves original error)
|
|
1424
|
+
*/
|
|
1425
|
+
async executeScriptStep(scriptStep, stepIndex) {
|
|
1426
|
+
const functionString = scriptStep.toString();
|
|
1427
|
+
try {
|
|
1428
|
+
this.logger.debug(
|
|
1429
|
+
`[${this.config.id}] Executing script step ${stepIndex + 1}`,
|
|
1430
|
+
{
|
|
1431
|
+
stepIndex,
|
|
1432
|
+
function: functionString
|
|
1433
|
+
}
|
|
1434
|
+
);
|
|
1435
|
+
const result = await scriptStep(this.state, this);
|
|
1436
|
+
this.logger.debug(
|
|
1437
|
+
`[${this.config.id}] Script step ${stepIndex + 1} completed`,
|
|
1438
|
+
{
|
|
1439
|
+
stepIndex,
|
|
1440
|
+
hasResult: result !== null && result !== void 0,
|
|
1441
|
+
resultType: typeof result
|
|
1442
|
+
}
|
|
1443
|
+
);
|
|
1444
|
+
return result;
|
|
1445
|
+
} catch (error) {
|
|
1446
|
+
const errorMessage = error instanceof Error ? error.message : String(error);
|
|
1447
|
+
this.logger.error(
|
|
1448
|
+
`[${this.config.id}] Script step ${stepIndex + 1} failed`,
|
|
1449
|
+
{
|
|
1450
|
+
stepIndex,
|
|
1451
|
+
error: errorMessage,
|
|
1452
|
+
function: functionString
|
|
1453
|
+
}
|
|
1454
|
+
);
|
|
1455
|
+
throw error;
|
|
1456
|
+
}
|
|
1457
|
+
}
|
|
949
1458
|
};
|
|
950
1459
|
function convertAgentReturnTypesToMessages(response, role) {
|
|
951
1460
|
if (typeof response === "string")
|
|
@@ -954,6 +1463,19 @@ function convertAgentReturnTypesToMessages(response, role) {
|
|
|
954
1463
|
if (typeof response === "object" && "role" in response) return [response];
|
|
955
1464
|
return [];
|
|
956
1465
|
}
|
|
1466
|
+
/**
 * Normalizes any thrown value into a plain, loggable description.
 *
 * - For `Error` instances: returns `name`, `message` and `stack` copied from
 *   the error.
 * - For anything else: returns `name` set to the value's `typeof` and
 *   `message` set to its string form (no `stack` key).
 *
 * This function never throws. `String(value)` raises a TypeError for values
 * that cannot be converted to a primitive (for example null-prototype objects
 * or objects whose `toString` throws); in that case the message falls back to
 * the `Object.prototype.toString` tag so the original failure is not masked
 * by a secondary error raised while describing it.
 *
 * @param error - The thrown value to describe (may be any type).
 * @returns An object with `name` and `message`, plus `stack` for Error instances.
 */
function extractErrorInfo(error) {
  if (error instanceof Error) {
    return {
      name: error.name,
      message: error.message,
      stack: error.stack
    };
  }
  let message;
  try {
    message = String(error);
  } catch (_conversionError) {
    // The value refuses string coercion; use the built-in type tag instead.
    message = Object.prototype.toString.call(error);
  }
  return {
    name: typeof error,
    message
  };
}
|
|
957
1479
|
|
|
958
1480
|
// src/runner/index.ts
|
|
959
1481
|
var runner_exports = {};
|
|
@@ -1024,9 +1546,10 @@ async function run(cfg) {
|
|
|
1024
1546
|
let eventBus = null;
|
|
1025
1547
|
let subscription = null;
|
|
1026
1548
|
try {
|
|
1549
|
+
const envConfig = getEnv();
|
|
1027
1550
|
eventBus = new EventBus({
|
|
1028
|
-
endpoint:
|
|
1029
|
-
apiKey:
|
|
1551
|
+
endpoint: envConfig.LANGWATCH_ENDPOINT,
|
|
1552
|
+
apiKey: envConfig.LANGWATCH_API_KEY
|
|
1030
1553
|
});
|
|
1031
1554
|
eventBus.listen();
|
|
1032
1555
|
subscription = eventBus.subscribeTo(execution.events$);
|
|
@@ -1107,6 +1630,7 @@ export {
|
|
|
1107
1630
|
JudgeAgentAdapter,
|
|
1108
1631
|
ScenarioExecution,
|
|
1109
1632
|
ScenarioExecutionState,
|
|
1633
|
+
StateChangeEventType,
|
|
1110
1634
|
UserSimulatorAgentAdapter,
|
|
1111
1635
|
agent,
|
|
1112
1636
|
allAgentRoles,
|