@langwatch/scenario 0.2.0-prerelease.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +272 -0
- package/dist/index.d.mts +1193 -0
- package/dist/index.d.ts +1193 -0
- package/dist/index.js +1444 -0
- package/dist/index.mjs +1389 -0
- package/package.json +78 -0
package/dist/index.d.mts
ADDED
|
@@ -0,0 +1,1193 @@
|
|
|
1
|
+
import * as ai from 'ai';
|
|
2
|
+
import { CoreMessage, CoreToolMessage, LanguageModel } from 'ai';
|
|
3
|
+
import { Observable } from 'rxjs';
|
|
4
|
+
import { z } from 'zod';
|
|
5
|
+
|
|
6
|
+
/**
 * Outcome produced when a scenario finishes.
 *
 * NOTE(review): the member descriptions below are read off the names and
 * types only; confirm details (especially time units) against the
 * implementation.
 */
interface ScenarioResult {
    /** Overall verdict of the run. */
    success: boolean;
    /** Optional explanation accompanying the verdict. */
    reasoning?: string;
    /** Criteria reported as satisfied. */
    passedCriteria: Array<string>;
    /** Criteria reported as not satisfied. */
    failedCriteria: Array<string>;
    /** Messages attached to the result (presumably the conversation; TODO confirm). */
    messages: Array<CoreMessage>;
    /** Total run time, when reported (unit not visible here; TODO confirm). */
    totalTime?: number;
    /** Time attributed to agents, when reported (unit not visible here; TODO confirm). */
    agentTime?: number;
}
|
|
15
|
+
/**
 * Contract for the mutable state of a running scenario: the conversation,
 * the participating agents, per-turn bookkeeping and timing.
 *
 * NOTE(review): the grouping comments below are inferred from member names
 * and signatures only; verify behavior against the implementing class.
 */
interface ScenarioExecutionStateLike {
    // --- Read-only views over the conversation ---
    readonly lastMessage: CoreMessage | undefined;
    readonly lastUserMessage: CoreMessage | undefined;
    readonly lastAssistantMessage: CoreMessage | undefined;
    readonly lastToolCall: CoreToolMessage | undefined;

    // --- Conversation data ---
    history: Array<CoreMessage>;
    historyWithoutLastMessage: Array<CoreMessage>;
    historyWithoutLastUserMessage: Array<CoreMessage>;
    threadId: string;

    // --- Turn and agent bookkeeping ---
    /** Current turn number, or `null` (presumably before the first turn; TODO confirm). */
    turn: number | null;
    agents: Array<AgentAdapter>;
    pendingRolesOnTurn: Array<AgentRole>;
    pendingAgentsOnTurn: Array<AgentAdapter>;

    // --- Result and timing ---
    /** Result fields without `messages`, or `null` when no result has been set. */
    partialResult: Omit<ScenarioResult, "messages"> | null;
    totalTime: number;
    /** Map keyed and valued by numbers (presumably agent index to time; TODO confirm). */
    agentTimes: Map<number, number>;

    // --- Adding messages ---
    addMessage(message: CoreMessage, fromAgentIdx?: number): void;
    addMessages(messages: Array<CoreMessage>, fromAgentIdx?: number): void;
    appendMessage(role: CoreMessage["role"], content: string): void;
    appendUserMessage(content: string): void;
    appendAssistantMessage(content: string): void;

    // --- Per-agent pending messages ---
    getPendingMessages(agentIdx: number): Array<CoreMessage>;
    clearPendingMessages(agentIdx: number): void;

    // --- Configuration setters ---
    setThreadId(threadId: string): void;
    setAgents(agents: Array<AgentAdapter>): void;

    // --- Turn progression ---
    newTurn(): void;
    removePendingRole(role: AgentRole): void;
    removeLastPendingRole(): void;
    removePendingAgent(agent: AgentAdapter): void;
    /** Returns the next agent for `role` with its index, or `null` if there is none. */
    getNextAgentForRole(role: AgentRole): {
        index: number;
        agent: AgentAdapter;
    } | null;
    addAgentTime(agentIdx: number, time: number): void;

    // --- Result handling ---
    hasResult(): boolean;
    setResult(result: Omit<ScenarioResult, "messages">): void;

    // --- Tool-call queries ---
    getLastToolCallByToolName(toolName: string): CoreToolMessage | undefined;
    hasToolCall(toolName: string): boolean;
}
|
|
54
|
+
|
|
55
|
+
/**
 * Parsed (output) value of the `defaultModel` section.
 * `temperature` is always present here because the schema wraps it in `ZodDefault`.
 */
type ScenarioProjectDefaultModelOutput = {
    model: ai.LanguageModelV1;
    temperature: number;
    maxTokens?: number | undefined;
};
/**
 * Accepted (input) value of the `defaultModel` section; `temperature` may be omitted.
 */
type ScenarioProjectDefaultModelInput = {
    model: ai.LanguageModelV1;
    temperature?: number | undefined;
    maxTokens?: number | undefined;
};
/**
 * Zod object type for the `defaultModel` section.
 */
type ScenarioProjectDefaultModelSchema = z.ZodObject<{
    model: z.ZodType<ai.LanguageModelV1, z.ZodTypeDef, ai.LanguageModelV1>;
    temperature: z.ZodDefault<z.ZodOptional<z.ZodNumber>>;
    maxTokens: z.ZodOptional<z.ZodNumber>;
}, "strip", z.ZodTypeAny, ScenarioProjectDefaultModelOutput, ScenarioProjectDefaultModelInput>;
/**
 * Zod schema for the project-level configuration.
 *
 * All three top-level keys (`defaultModel`, `langwatchEndpoint`,
 * `langwatchApiKey`) are optional. The outer object uses zod's "strict"
 * unknown-keys mode, while the nested `defaultModel` object uses "strip".
 */
declare const scenarioProjectConfigSchema: z.ZodObject<{
    defaultModel: z.ZodOptional<ScenarioProjectDefaultModelSchema>;
    langwatchEndpoint: z.ZodOptional<z.ZodString>;
    langwatchApiKey: z.ZodOptional<z.ZodString>;
}, "strict", z.ZodTypeAny, {
    defaultModel?: ScenarioProjectDefaultModelOutput | undefined;
    langwatchEndpoint?: string | undefined;
    langwatchApiKey?: string | undefined;
}, {
    defaultModel?: ScenarioProjectDefaultModelInput | undefined;
    langwatchEndpoint?: string | undefined;
    langwatchApiKey?: string | undefined;
}>;
|
|
88
|
+
/**
 * Project-level configuration value: the output type of
 * `scenarioProjectConfigSchema`.
 */
type ScenarioProjectConfig = z.output<typeof scenarioProjectConfigSchema>;
/**
 * Declares a project configuration.
 *
 * NOTE(review): the parameter uses the schema's output type, so
 * `defaultModel.temperature` is required at the call site even though the
 * schema supplies a default for it. Confirm whether the schema's input type
 * was intended. Whether the value is validated or returned unchanged is not
 * visible from this declaration.
 *
 * @param config The project configuration.
 * @returns A project configuration of the same type.
 */
declare function defineConfig(config: ScenarioProjectConfig): ScenarioProjectConfig;
|
|
90
|
+
|
|
91
|
+
/**
 * User-facing configuration describing a single scenario.
 */
interface ScenarioConfig {
    /**
     * Name of the scenario.
     */
    name: string;
    /**
     * Description of what the scenario is testing.
     */
    description: string;
    /**
     * Agents taking part in the scenario.
     */
    agents: Array<AgentAdapter>;
    /**
     * Unique identifier for the scenario.
     * A UUID is generated when this is omitted.
     */
    id?: string;
    /**
     * Script steps to execute for the scenario.
     */
    script?: Array<ScriptStep>;
    /**
     * Enables verbose logging. Defaults to false.
     */
    verbose?: boolean;
    /**
     * Upper bound on the number of turns to execute. Defaults to 20.
     */
    maxTurns?: number;
    /**
     * Thread ID to use for the conversation.
     * A new thread is created when this is omitted.
     */
    threadId?: string;
}
|
|
130
|
+
/**
 * Final, normalized scenario configuration.
 *
 * Every optional field of `ScenarioConfig` (`id`, `script`, `verbose`,
 * `maxTurns`, `threadId`) has been filled with its default, so all members
 * are required here. Expressed with `Required<ScenarioConfig>` so the two
 * types cannot drift apart through a hand-maintained key list; any optional
 * field added to `ScenarioConfig` later becomes required here as well.
 * @internal
 */
interface ScenarioConfigFinal extends Required<ScenarioConfig> {
}
|
|
142
|
+
/**
 * Execution context handed to scenario script steps.
 * Exposes the conversation so far together with the operations that control
 * the flow of the scenario.
 */
interface ScenarioExecutionLike {
    /**
     * Messages exchanged so far in the conversation.
     */
    readonly history: Array<CoreMessage>;
    /**
     * Identifier of the conversation thread.
     */
    readonly threadId: string;
    /**
     * Appends the given message to the conversation.
     * @param message Message to append.
     */
    message(message: CoreMessage): Promise<void>;
    /**
     * Appends a user message to the conversation.
     * When `content` is omitted, the user simulator generates the message.
     * @param content Content of the user message.
     */
    user(content?: CoreMessage | string): Promise<void>;
    /**
     * Appends an agent message to the conversation.
     * When `content` is omitted, the agent under test generates the message.
     * @param content Content of the agent message.
     */
    agent(content?: CoreMessage | string): Promise<void>;
    /**
     * Asks the judge agent to evaluate the current state.
     * @param content Optional message for the judge.
     * @returns The scenario result when the judge reaches a final decision.
     */
    judge(content?: CoreMessage | string): Promise<ScenarioResult | null>;
    /**
     * Lets the scenario run on its own for a number of turns.
     * @param turns How many turns to run. When omitted, runs until the scenario ends.
     * @param onTurn Optional callback invoked at the end of each turn.
     * @param onStep Optional callback invoked after each agent interaction.
     * @returns The scenario result if the scenario ends.
     */
    proceed(
        turns?: number,
        onTurn?: (state: ScenarioExecutionStateLike) => void | Promise<void>,
        onStep?: (state: ScenarioExecutionStateLike) => void | Promise<void>
    ): Promise<ScenarioResult | null>;
    /**
     * Finishes the scenario as a success.
     * @param reasoning Optional reasoning for the success.
     * @returns The final scenario result.
     */
    succeed(reasoning?: string): Promise<ScenarioResult>;
    /**
     * Finishes the scenario as a failure.
     * @param reasoning Optional reasoning for the failure.
     * @returns The final scenario result.
     */
    fail(reasoning?: string): Promise<ScenarioResult>;
}
|
|
199
|
+
/**
 * One step of a scenario script.
 *
 * A step is a function that receives the current state and an executor and
 * performs an action. It may complete synchronously or return a promise, and
 * in either case may yield nothing, `null`, or a ScenarioResult.
 */
type ScriptStep = (
    state: ScenarioExecutionStateLike,
    executor: ScenarioExecutionLike
) => void | ScenarioResult | null | Promise<void | ScenarioResult | null>;
|
|
204
|
+
|
|
205
|
+
/**
 * Roles an agent can be asked to play in a scenario.
 */
declare enum AgentRole {
    /** The user side of the conversation. */
    USER = "User",
    /** The agent side of the conversation. */
    AGENT = "Agent",
    /** The judge role. */
    JUDGE = "Judge"
}
/**
 * Read-only tuple listing every `AgentRole` member, in the order
 * USER, AGENT, JUDGE.
 */
declare const allAgentRoles: readonly [AgentRole.USER, AgentRole.AGENT, AgentRole.JUDGE];
|
|
211
|
+
/**
 * Argument passed to an agent's `call` method.
 */
interface AgentInput {
    /**
     * Unique identifier of the conversation thread.
     */
    threadId: string;
    /**
     * Complete message history of the conversation.
     */
    messages: Array<CoreMessage>;
    /**
     * Messages added since this agent was last called.
     */
    newMessages: Array<CoreMessage>;
    /**
     * Role the agent is asked to play in this turn.
     */
    requestedRole: AgentRole;
    /**
     * True when a judgment is being requested in this turn.
     */
    judgmentRequest: boolean;
    /**
     * Current state of the scenario execution.
     */
    scenarioState: ScenarioExecutionStateLike;
    /**
     * Configuration of the scenario being run.
     */
    scenarioConfig: ScenarioConfig;
}
|
|
244
|
+
/**
 * Values an agent's `call` method may resolve to: a ScenarioResult, a list
 * of messages, a single message, or a plain string.
 */
type AgentReturnTypes = ScenarioResult | Array<CoreMessage> | CoreMessage | string;
|
|
249
|
+
/**
 * Abstract base class used to plug a custom agent into the Scenario framework.
 *
 * Wrap any existing agent implementation (LLM calls, agent frameworks, or
 * complex multi-step systems) in a subclass so it can take part in a
 * scenario. The adapter is given structured input describing the
 * conversation state and answers in one of the standardized return formats.
 *
 * @example
 * ```typescript
 * class MyAgent extends AgentAdapter {
 *   role = AgentRole.AGENT;
 *
 *   async call(input: AgentInput): Promise<AgentReturnTypes> {
 *     const fromUser = input.messages.find(m => m.role === 'user');
 *     if (fromUser) {
 *       return `You said: ${fromUser.content}`;
 *     }
 *     return "Hello!";
 *   }
 * }
 * ```
 */
declare abstract class AgentAdapter {
    constructor(input: AgentInput);
    /** Role this adapter plays. */
    role: AgentRole;
    /**
     * Handles one invocation of the agent and produces its response.
     *
     * Subclasses must implement this method. It is given structured
     * information about the current conversation state and must answer in
     * one of the supported return formats.
     *
     * @param input AgentInput containing conversation history, thread context, and scenario state.
     * @returns The agent's response.
     */
    abstract call(input: AgentInput): Promise<AgentReturnTypes>;
}
|
|
287
|
+
/**
 * Abstract base class for user simulator agents.
 * These agents produce the user messages that drive the conversation.
 */
declare abstract class UserSimulatorAgentAdapter implements AgentAdapter {
    constructor(input: AgentInput);
    /** Role this adapter plays. */
    role: AgentRole;
    /**
     * Handles one invocation and produces a user message.
     *
     * @param input AgentInput containing conversation history, thread context, and scenario state.
     * @returns The user's response.
     */
    abstract call(input: AgentInput): Promise<AgentReturnTypes>;
}
|
|
302
|
+
/**
 * Abstract base class for judge agents.
 * A judge evaluates the conversation and decides whether it succeeded or failed.
 */
declare abstract class JudgeAgentAdapter implements AgentAdapter {
    constructor(input: AgentInput);
    /** Role this adapter plays. */
    role: AgentRole;
    /**
     * Criteria the judge uses when evaluating the conversation.
     */
    abstract criteria: Array<string>;
    /**
     * Handles one invocation and evaluates the conversation.
     *
     * @param input AgentInput containing conversation history, thread context, and scenario state.
     * @returns A ScenarioResult when the conversation should end; any other
     * return value lets the conversation continue.
     */
    abstract call(input: AgentInput): Promise<AgentReturnTypes>;
}
|
|
321
|
+
|
|
322
|
+
/**
|
|
323
|
+
* Scenario script DSL (Domain Specific Language) module.
|
|
324
|
+
*
|
|
325
|
+
* This module provides a collection of functions that form a declarative language
|
|
326
|
+
* for controlling scenario execution flow. These functions can be used to create
|
|
327
|
+
* scripts that precisely control how conversations unfold, when evaluations occur,
|
|
328
|
+
* and when scenarios should succeed or fail.
|
|
329
|
+
*/
|
|
330
|
+
|
|
331
|
+
/**
 * Script step factory that inserts a fixed message into the conversation.
 *
 * Any CoreMessage-compatible value is accepted and is injected at this point
 * of the script, which makes it handy for simulating tool responses, system
 * messages, or specific conversational states.
 *
 * @param message Message to insert into the conversation.
 * @returns A ScriptStep usable in a scenario script.
 */
declare const message: (message: CoreMessage) => ScriptStep;
|
|
342
|
+
/**
 * Script step factory for the agent's turn.
 *
 * With `content`, that value is used as the agent's response. Without it,
 * the agent under test is called to produce a response from the current
 * conversation state.
 *
 * @param content Optional agent response, as a string or a full message object.
 *                When undefined, the agent under test generates the content.
 * @returns A ScriptStep usable in a scenario script.
 */
declare const agent: (content?: CoreMessage | string) => ScriptStep;
|
|
354
|
+
/**
 * Script step factory that asks the judge agent to evaluate the conversation.
 *
 * The judge is forced to decide whether the scenario should continue or end
 * with a success/failure verdict, based on its configured criteria.
 *
 * @param content Optional message content for the judge. Usually left undefined
 *                so the judge evaluates purely on its criteria.
 * @returns A ScriptStep usable in a scenario script.
 */
declare const judge: (content?: CoreMessage | string) => ScriptStep;
|
|
366
|
+
/**
 * Script step factory for the user's turn.
 *
 * With `content`, that value is used as the user message. Without it, the
 * user simulator agent generates a suitable message from the scenario context.
 *
 * @param content Optional user message, as a string or a full message object.
 *                When undefined, the user simulator generates the content.
 * @returns A ScriptStep usable in a scenario script.
 */
declare const user: (content?: CoreMessage | string) => ScriptStep;
|
|
378
|
+
/**
 * Let the scenario proceed automatically for a specified number of turns.
 *
 * This function allows the scenario to run automatically with the normal
 * agent interaction flow (user -> agent -> judge evaluation). You can
 * optionally provide callbacks to execute custom logic at each turn or step.
 *
 * @param turns Number of turns to proceed automatically. If undefined, proceeds until
 *              the judge agent decides to end the scenario or the scenario's
 *              `maxTurns` limit is reached.
 * @param onTurn Optional callback function called at the end of each turn.
 * @param onStep Optional callback function called after each agent interaction.
 * @returns A ScriptStep function that can be used in scenario scripts.
 */
declare const proceed: (turns?: number, onTurn?: (state: ScenarioExecutionStateLike) => void | Promise<void>, onStep?: (state: ScenarioExecutionStateLike) => void | Promise<void>) => ScriptStep;
|
|
392
|
+
/**
 * Script step factory that finishes the scenario with a success verdict.
 *
 * The scenario is concluded immediately and marked as successful.
 *
 * @param reasoning Optional explanation of why the scenario succeeded.
 * @returns A ScriptStep usable in a scenario script.
 */
declare const succeed: (reasoning?: string) => ScriptStep;
|
|
401
|
+
/**
 * Script step factory that finishes the scenario with a failure verdict.
 *
 * The scenario is concluded immediately and marked as failed.
 *
 * @param reasoning Optional explanation of why the scenario failed.
 * @returns A ScriptStep usable in a scenario script.
 */
declare const fail: (reasoning?: string) => ScriptStep;
|
|
410
|
+
|
|
411
|
+
/**
 * Kinds of events a scenario can emit.
 */
declare enum ScenarioEventType {
    /**
     * Emitted when a scenario run starts.
     */
    RUN_STARTED = "SCENARIO_RUN_STARTED",
    /**
     * Emitted when a scenario run finishes.
     */
    RUN_FINISHED = "SCENARIO_RUN_FINISHED",
    /**
     * Carries a snapshot of the messages in a scenario.
     */
    MESSAGE_SNAPSHOT = "SCENARIO_MESSAGE_SNAPSHOT"
}
|
|
428
|
+
/**
 * Status values for a scenario run.
 */
declare enum ScenarioRunStatus {
    /**
     * The run completed successfully.
     */
    SUCCESS = "SUCCESS",
    /**
     * The run failed with an error.
     */
    ERROR = "ERROR",
    /**
     * The run was cancelled.
     */
    CANCELLED = "CANCELLED",
    /**
     * The run is currently in progress.
     */
    IN_PROGRESS = "IN_PROGRESS",
    /**
     * The run is waiting to be executed.
     */
    PENDING = "PENDING",
    /**
     * The run failed.
     */
    FAILED = "FAILED"
}
|
|
457
|
+
/** Zod shape of the fields carried by the run-started and run-finished events. */
type ScenarioEventSchemaBaseShape = {
    timestamp: z.ZodOptional<z.ZodNumber>;
    rawEvent: z.ZodOptional<z.ZodAny>;
    batchRunId: z.ZodString;
    scenarioId: z.ZodString;
    scenarioRunId: z.ZodString;
};
/** Value of the `metadata` field on a run-started event. */
type ScenarioEventSchemaRunMetadata = {
    name: string;
    description?: string | undefined;
};
/** Value of a run-started event (its input and output forms are the same). */
type ScenarioEventSchemaRunStarted = {
    type: ScenarioEventType.RUN_STARTED;
    batchRunId: string;
    scenarioId: string;
    scenarioRunId: string;
    metadata: ScenarioEventSchemaRunMetadata;
    timestamp?: number | undefined;
    rawEvent?: any;
};
/** Value of a run-finished event (its input and output forms are the same). */
type ScenarioEventSchemaRunFinished = {
    type: ScenarioEventType.RUN_FINISHED;
    status: ScenarioRunStatus;
    batchRunId: string;
    scenarioId: string;
    scenarioRunId: string;
    timestamp?: number | undefined;
    rawEvent?: any;
};
/** Zod shape that the developer, system, assistant and user message schemas extend. */
type ScenarioEventSchemaMessageBaseShape = {
    id: z.ZodString;
    role: z.ZodString;
    content: z.ZodOptional<z.ZodString>;
    name: z.ZodOptional<z.ZodString>;
};
/** Value of a snapshot message whose `content` is a required string. */
type ScenarioEventSchemaTextMessage<TRole extends string> = {
    id: string;
    role: TRole;
    content: string;
    name?: string | undefined;
};
/** Zod object type for a snapshot message whose `content` is a required string. */
type ScenarioEventSchemaTextMessageSchema<TRole extends string> = z.ZodObject<z.objectUtil.extendShape<ScenarioEventSchemaMessageBaseShape, {
    role: z.ZodLiteral<TRole>;
    content: z.ZodString;
}>, "strip", z.ZodTypeAny, ScenarioEventSchemaTextMessage<TRole>, ScenarioEventSchemaTextMessage<TRole>>;
/** Value of the `function` field of a tool call. */
type ScenarioEventSchemaToolCallFunction = {
    name: string;
    arguments: string;
};
/** Value of one entry in an assistant message's `toolCalls`. */
type ScenarioEventSchemaToolCall = {
    function: ScenarioEventSchemaToolCallFunction;
    type: "function";
    id: string;
};
/** Value of an assistant snapshot message; `content` and `toolCalls` are optional. */
type ScenarioEventSchemaAssistantMessage = {
    id: string;
    role: "assistant";
    name?: string | undefined;
    content?: string | undefined;
    toolCalls?: ScenarioEventSchemaToolCall[] | undefined;
};
/** Zod object type for an assistant snapshot message. */
type ScenarioEventSchemaAssistantMessageSchema = z.ZodObject<z.objectUtil.extendShape<ScenarioEventSchemaMessageBaseShape, {
    role: z.ZodLiteral<"assistant">;
    content: z.ZodOptional<z.ZodString>;
    toolCalls: z.ZodOptional<z.ZodArray<z.ZodObject<{
        id: z.ZodString;
        type: z.ZodLiteral<"function">;
        function: z.ZodObject<{
            name: z.ZodString;
            arguments: z.ZodString;
        }, "strip", z.ZodTypeAny, ScenarioEventSchemaToolCallFunction, ScenarioEventSchemaToolCallFunction>;
    }, "strip", z.ZodTypeAny, ScenarioEventSchemaToolCall, ScenarioEventSchemaToolCall>, "many">>;
}>, "strip", z.ZodTypeAny, ScenarioEventSchemaAssistantMessage, ScenarioEventSchemaAssistantMessage>;
/** Value of a tool snapshot message. */
type ScenarioEventSchemaToolMessage = {
    id: string;
    role: "tool";
    content: string;
    toolCallId: string;
};
/** Zod object type for a tool snapshot message. */
type ScenarioEventSchemaToolMessageSchema = z.ZodObject<{
    id: z.ZodString;
    content: z.ZodString;
    role: z.ZodLiteral<"tool">;
    toolCallId: z.ZodString;
}, "strip", z.ZodTypeAny, ScenarioEventSchemaToolMessage, ScenarioEventSchemaToolMessage>;
/** Any message that can appear in a message snapshot, discriminated on `role`. */
type ScenarioEventSchemaMessage =
    | ScenarioEventSchemaTextMessage<"developer">
    | ScenarioEventSchemaTextMessage<"system">
    | ScenarioEventSchemaAssistantMessage
    | ScenarioEventSchemaTextMessage<"user">
    | ScenarioEventSchemaToolMessage;
/** Value of a message-snapshot event (its input and output forms are the same). */
type ScenarioEventSchemaMessageSnapshot = {
    messages: ScenarioEventSchemaMessage[];
    type: ScenarioEventType.MESSAGE_SNAPSHOT;
    batchRunId: string;
    scenarioId: string;
    scenarioRunId: string;
    timestamp?: number | undefined;
    rawEvent?: any;
};
/**
 * Zod schema for scenario events.
 *
 * A discriminated union on `type` with three variants, in this order:
 * run started, run finished, and message snapshot. Every variant carries
 * `batchRunId`, `scenarioId` and `scenarioRunId`, plus optional `timestamp`
 * and `rawEvent`. The snapshot's `messages` array is itself a discriminated
 * union on `role` (developer, system, assistant, user, tool).
 */
declare const scenarioEventSchema: z.ZodDiscriminatedUnion<"type", [z.ZodObject<ScenarioEventSchemaBaseShape & {
    type: z.ZodLiteral<ScenarioEventType.RUN_STARTED>;
    metadata: z.ZodObject<{
        name: z.ZodString;
        description: z.ZodOptional<z.ZodString>;
    }, "strip", z.ZodTypeAny, ScenarioEventSchemaRunMetadata, ScenarioEventSchemaRunMetadata>;
}, "strip", z.ZodTypeAny, ScenarioEventSchemaRunStarted, ScenarioEventSchemaRunStarted>, z.ZodObject<ScenarioEventSchemaBaseShape & {
    type: z.ZodLiteral<ScenarioEventType.RUN_FINISHED>;
    status: z.ZodNativeEnum<typeof ScenarioRunStatus>;
}, "strip", z.ZodTypeAny, ScenarioEventSchemaRunFinished, ScenarioEventSchemaRunFinished>, z.ZodObject<{
    messages: z.ZodArray<z.ZodDiscriminatedUnion<"role", [
        ScenarioEventSchemaTextMessageSchema<"developer">,
        ScenarioEventSchemaTextMessageSchema<"system">,
        ScenarioEventSchemaAssistantMessageSchema,
        ScenarioEventSchemaTextMessageSchema<"user">,
        ScenarioEventSchemaToolMessageSchema
    ]>, "many">;
} & {
    timestamp: z.ZodOptional<z.ZodNumber>;
    rawEvent: z.ZodOptional<z.ZodAny>;
    batchRunId: z.ZodString;
    scenarioId: z.ZodString;
    scenarioRunId: z.ZodString;
    type: z.ZodLiteral<ScenarioEventType.MESSAGE_SNAPSHOT>;
}, "strip", z.ZodTypeAny, ScenarioEventSchemaMessageSnapshot, ScenarioEventSchemaMessageSnapshot>]>;
|
|
745
|
+
/**
 * Union of every scenario event, derived from `scenarioEventSchema`.
 */
type ScenarioEvent = z.output<typeof scenarioEventSchema>;
|
|
749
|
+
|
|
750
|
+
/**
|
|
751
|
+
* Manages the execution of a single scenario.
|
|
752
|
+
*
|
|
753
|
+
* This class orchestrates the interaction between agents, executes the script,
|
|
754
|
+
* and manages the scenario's state. It also emits events that can be subscribed to
|
|
755
|
+
* for observing the scenario's progress.
|
|
756
|
+
*
|
|
757
|
+
* @example
|
|
758
|
+
* ```typescript
|
|
759
|
+
* import { scenario, user, agent, succeed, judge } from "@langwatch/scenario";
|
|
760
|
+
*
|
|
761
|
+
* const myScenario = scenario(
|
|
762
|
+
* {
|
|
763
|
+
* name: "My First Scenario",
|
|
764
|
+
* description: "A simple test of the agent's greeting.",
|
|
765
|
+
* agents: [
|
|
766
|
+
* scenario.userSimulatorAgent(),
|
|
767
|
+
* scenario.judgeAgent({
|
|
768
|
+
* criteria: [
|
|
769
|
+
* "Agent should respond with a greeting",
|
|
770
|
+
* "Agent should ask for the user's name",
|
|
771
|
+
* "Agent should respond with a farewell",
|
|
772
|
+
* ],
|
|
773
|
+
* }),
|
|
774
|
+
* ],
|
|
775
|
+
* },
|
|
776
|
+
* [
|
|
777
|
+
* user("Hello"),
|
|
778
|
+
* agent("Hi, how can I help you?"),
|
|
779
|
+
* succeed("Agent responded correctly."),
|
|
780
|
+
* ]
|
|
781
|
+
* );
|
|
782
|
+
*
|
|
783
|
+
* const execution = new ScenarioExecution(myScenario.config, myScenario.script);
|
|
784
|
+
*
|
|
785
|
+
* execution.events$.subscribe(event => {
|
|
786
|
+
* console.log("Scenario event:", event);
|
|
787
|
+
* });
|
|
788
|
+
*
|
|
789
|
+
* const result = await execution.execute();
|
|
790
|
+
* console.log("Scenario result:", result.success);
|
|
791
|
+
* ```
|
|
792
|
+
*/
|
|
793
|
+
declare class ScenarioExecution implements ScenarioExecutionLike {
|
|
794
|
+
private state;
|
|
795
|
+
private eventSubject;
|
|
796
|
+
private logger;
|
|
797
|
+
private config;
|
|
798
|
+
/**
|
|
799
|
+
* An observable stream of events that occur during the scenario execution.
|
|
800
|
+
* Subscribe to this to monitor the progress of the scenario in real-time.
|
|
801
|
+
*/
|
|
802
|
+
readonly events$: Observable<ScenarioEvent>;
|
|
803
|
+
/**
|
|
804
|
+
* Creates a new ScenarioExecution instance.
|
|
805
|
+
* @param config The scenario configuration.
|
|
806
|
+
* @param script The script steps to execute.
|
|
807
|
+
*/
|
|
808
|
+
constructor(config: ScenarioConfig, script: ScriptStep[]);
|
|
809
|
+
/**
|
|
810
|
+
* The history of messages in the conversation.
|
|
811
|
+
*/
|
|
812
|
+
get history(): CoreMessage[];
|
|
813
|
+
/**
|
|
814
|
+
* The unique identifier for the conversation thread.
|
|
815
|
+
*/
|
|
816
|
+
get threadId(): string;
|
|
817
|
+
/**
|
|
818
|
+
* Executes the entire scenario from start to finish.
|
|
819
|
+
* This will run through the script and any automatic proceeding logic until a
|
|
820
|
+
* final result (success, failure, or error) is determined.
|
|
821
|
+
* @returns A promise that resolves with the final result of the scenario.
|
|
822
|
+
*/
|
|
823
|
+
execute(): Promise<ScenarioResult>;
|
|
824
|
+
/**
|
|
825
|
+
* Executes a single step in the scenario.
|
|
826
|
+
* A step usually corresponds to a single agent's turn. This method is useful
|
|
827
|
+
* for manually controlling the scenario's progress.
|
|
828
|
+
* @returns A promise that resolves with the new messages added during the step, or a final scenario result if the step concludes the scenario.
|
|
829
|
+
*/
|
|
830
|
+
step(): Promise<CoreMessage[] | ScenarioResult>;
|
|
831
|
+
private _step;
|
|
832
|
+
private callAgent;
|
|
833
|
+
private nextAgentForRole;
|
|
834
|
+
private reachedMaxTurns;
|
|
835
|
+
private getJudgeAgent;
|
|
836
|
+
private consumeUntilRole;
|
|
837
|
+
private scriptCallAgent;
|
|
838
|
+
/**
|
|
839
|
+
* Adds a message to the conversation history.
|
|
840
|
+
* This is part of the `ScenarioExecutionLike` interface used by script steps.
|
|
841
|
+
* @param message The message to add.
|
|
842
|
+
*/
|
|
843
|
+
message(message: CoreMessage): Promise<void>;
|
|
844
|
+
/**
|
|
845
|
+
* Executes a user turn.
|
|
846
|
+
* If content is provided, it's used as the user's message.
|
|
847
|
+
* If not, the user simulator agent is called to generate a message.
|
|
848
|
+
* This is part of the `ScenarioExecutionLike` interface used by script steps.
|
|
849
|
+
* @param content The optional content of the user's message.
|
|
850
|
+
*/
|
|
851
|
+
user(content?: string | CoreMessage): Promise<void>;
|
|
852
|
+
/**
|
|
853
|
+
* Executes an agent turn.
|
|
854
|
+
* If content is provided, it's used as the agent's message.
|
|
855
|
+
* If not, the agent under test is called to generate a response.
|
|
856
|
+
* This is part of the `ScenarioExecutionLike` interface used by script steps.
|
|
857
|
+
* @param content The optional content of the agent's message.
|
|
858
|
+
*/
|
|
859
|
+
agent(content?: string | CoreMessage): Promise<void>;
|
|
860
|
+
/**
|
|
861
|
+
* Invokes the judge agent to evaluate the current state of the conversation.
|
|
862
|
+
* This is part of the `ScenarioExecutionLike` interface used by script steps.
|
|
863
|
+
* @param content Optional message to pass to the judge.
|
|
864
|
+
* @returns A promise that resolves with the scenario result if the judge makes a final decision, otherwise null.
|
|
865
|
+
*/
|
|
866
|
+
judge(content?: string | CoreMessage): Promise<ScenarioResult | null>;
|
|
867
|
+
/**
|
|
868
|
+
* Lets the scenario proceed automatically for a specified number of turns.
|
|
869
|
+
* This simulates the natural flow of conversation between agents.
|
|
870
|
+
* This is part of the `ScenarioExecutionLike` interface used by script steps.
|
|
871
|
+
* @param turns The number of turns to proceed. If undefined, runs until a conclusion or max turns is reached.
|
|
872
|
+
* @param onTurn A callback executed at the end of each turn.
|
|
873
|
+
* @param onStep A callback executed after each agent interaction.
|
|
874
|
+
* @returns A promise that resolves with the scenario result if a conclusion is reached.
|
|
875
|
+
*/
|
|
876
|
+
proceed(turns?: number, onTurn?: (state: ScenarioExecutionStateLike) => void | Promise<void>, onStep?: (state: ScenarioExecutionStateLike) => void | Promise<void>): Promise<ScenarioResult | null>;
|
|
877
|
+
/**
|
|
878
|
+
* Immediately ends the scenario with a success verdict.
|
|
879
|
+
* This is part of the `ScenarioExecutionLike` interface used by script steps.
|
|
880
|
+
* @param reasoning An optional explanation for the success.
|
|
881
|
+
* @returns A promise that resolves with the final successful scenario result.
|
|
882
|
+
*/
|
|
883
|
+
succeed(reasoning?: string): Promise<ScenarioResult>;
|
|
884
|
+
/**
|
|
885
|
+
* Immediately ends the scenario with a failure verdict.
|
|
886
|
+
* This is part of the `ScenarioExecutionLike` interface used by script steps.
|
|
887
|
+
* @param reasoning An optional explanation for the failure.
|
|
888
|
+
* @returns A promise that resolves with the final failed scenario result.
|
|
889
|
+
*/
|
|
890
|
+
fail(reasoning?: string): Promise<ScenarioResult>;
|
|
891
|
+
private reset;
|
|
892
|
+
/**
|
|
893
|
+
* Emits an event to the event stream for external consumption.
|
|
894
|
+
*/
|
|
895
|
+
private emitEvent;
|
|
896
|
+
/**
|
|
897
|
+
* Creates base event properties shared across all scenario events.
|
|
898
|
+
*/
|
|
899
|
+
private makeBaseEvent;
|
|
900
|
+
/**
|
|
901
|
+
* Emits a run started event to indicate scenario execution has begun.
|
|
902
|
+
*/
|
|
903
|
+
private emitRunStarted;
|
|
904
|
+
/**
|
|
905
|
+
* Emits a message snapshot event containing current conversation history.
|
|
906
|
+
*/
|
|
907
|
+
private emitMessageSnapshot;
|
|
908
|
+
/**
|
|
909
|
+
* Emits a run finished event with the final execution status.
|
|
910
|
+
*/
|
|
911
|
+
private emitRunFinished;
|
|
912
|
+
}
|
|
913
|
+
|
|
914
|
+
/**
|
|
915
|
+
* Manages the state of a scenario execution.
|
|
916
|
+
* This class implements the ScenarioExecutionStateLike interface and provides
|
|
917
|
+
* the internal logic for tracking conversation history, turns, results, and
|
|
918
|
+
* other related information.
|
|
919
|
+
*/
|
|
920
|
+
declare class ScenarioExecutionState implements ScenarioExecutionStateLike {
|
|
921
|
+
private _history;
|
|
922
|
+
private _turn;
|
|
923
|
+
private _partialResult;
|
|
924
|
+
private _threadId;
|
|
925
|
+
private _agents;
|
|
926
|
+
private _pendingMessages;
|
|
927
|
+
private _pendingRolesOnTurn;
|
|
928
|
+
private _pendingAgentsOnTurn;
|
|
929
|
+
private _agentTimes;
|
|
930
|
+
private _totalStartTime;
|
|
931
|
+
/**
|
|
932
|
+
* Creates a new ScenarioExecutionState.
|
|
933
|
+
*/
|
|
934
|
+
constructor();
|
|
935
|
+
setThreadId(threadId: string): void;
|
|
936
|
+
setAgents(agents: AgentAdapter[]): void;
|
|
937
|
+
appendMessage(role: CoreMessage["role"], content: string): void;
|
|
938
|
+
appendUserMessage(content: string): void;
|
|
939
|
+
appendAssistantMessage(content: string): void;
|
|
940
|
+
addMessage(message: CoreMessage, fromAgentIdx?: number): void;
|
|
941
|
+
addMessages(messages: CoreMessage[], fromAgentIdx?: number): void;
|
|
942
|
+
getPendingMessages(agentIdx: number): CoreMessage[];
|
|
943
|
+
clearPendingMessages(agentIdx: number): void;
|
|
944
|
+
newTurn(): void;
|
|
945
|
+
removePendingRole(role: AgentRole): void;
|
|
946
|
+
removePendingAgent(agent: AgentAdapter): void;
|
|
947
|
+
getNextAgentForRole(role: AgentRole): {
|
|
948
|
+
index: number;
|
|
949
|
+
agent: AgentAdapter;
|
|
950
|
+
} | null;
|
|
951
|
+
addAgentTime(agentIdx: number, time: number): void;
|
|
952
|
+
hasResult(): boolean;
|
|
953
|
+
setResult(result: Omit<ScenarioResult, "messages">): void;
|
|
954
|
+
get lastMessage(): CoreMessage | undefined;
|
|
955
|
+
get lastUserMessage(): CoreMessage | undefined;
|
|
956
|
+
get lastAssistantMessage(): CoreMessage | undefined;
|
|
957
|
+
get lastToolCall(): CoreToolMessage | undefined;
|
|
958
|
+
getLastToolCallByToolName(toolName: string): CoreToolMessage | undefined;
|
|
959
|
+
hasToolCall(toolName: string): boolean;
|
|
960
|
+
get history(): CoreMessage[];
|
|
961
|
+
get historyWithoutLastMessage(): CoreMessage[];
|
|
962
|
+
get historyWithoutLastUserMessage(): CoreMessage[];
|
|
963
|
+
get turn(): number | null;
|
|
964
|
+
set turn(turn: number);
|
|
965
|
+
get threadId(): string;
|
|
966
|
+
get agents(): AgentAdapter[];
|
|
967
|
+
get pendingRolesOnTurn(): AgentRole[];
|
|
968
|
+
set pendingRolesOnTurn(roles: AgentRole[]);
|
|
969
|
+
get pendingAgentsOnTurn(): AgentAdapter[];
|
|
970
|
+
set pendingAgentsOnTurn(agents: AgentAdapter[]);
|
|
971
|
+
get partialResult(): Omit<ScenarioResult, "messages"> | null;
|
|
972
|
+
get totalTime(): number;
|
|
973
|
+
get agentTimes(): Map<number, number>;
|
|
974
|
+
removeLastPendingRole(): void;
|
|
975
|
+
}
|
|
976
|
+
|
|
977
|
+
/**
|
|
978
|
+
* High-level interface for running a scenario test.
|
|
979
|
+
*
|
|
980
|
+
* This is the main entry point for executing scenario tests. It creates a
|
|
981
|
+
* ScenarioExecution instance and runs it.
|
|
982
|
+
*
|
|
983
|
+
* @param cfg Configuration for the scenario test.
|
|
984
|
+
* @param cfg.name Human-readable name for the scenario.
|
|
985
|
+
* @param cfg.description Detailed description of what the scenario tests.
|
|
986
|
+
* @param cfg.agents List of agent adapters (agent under test, user simulator, judge).
|
|
987
|
+
* @param cfg.maxTurns Maximum conversation turns before timeout (default: 10).
|
|
988
|
+
* @param cfg.verbose Show detailed output during execution.
|
|
989
|
+
* @param cfg.script Optional script steps to control scenario flow.
|
|
990
|
+
* @param cfg.threadId Optional ID for the conversation thread.
|
|
991
|
+
* @returns A promise that resolves with the ScenarioResult containing the test outcome,
|
|
992
|
+
* conversation history, success/failure status, and detailed reasoning.
|
|
993
|
+
*
|
|
994
|
+
* @example
|
|
995
|
+
* ```typescript
|
|
996
|
+
* import { run, AgentAdapter, AgentRole, user, agent } from '@langwatch/scenario';
|
|
997
|
+
*
|
|
998
|
+
* const myAgent: AgentAdapter = {
|
|
999
|
+
* role: AgentRole.AGENT,
|
|
1000
|
+
* async call(input) {
|
|
1001
|
+
* return `The user said: ${input.messages.at(-1)?.content}`;
|
|
1002
|
+
* }
|
|
1003
|
+
* };
|
|
1004
|
+
*
|
|
1005
|
+
* async function main() {
|
|
1006
|
+
* const result = await run({
|
|
1007
|
+
* name: "Customer Service Test",
|
|
1008
|
+
* description: "A simple test to see if the agent responds.",
|
|
1009
|
+
* agents: [myAgent],
|
|
1010
|
+
* script: [
|
|
1011
|
+
* user("Hello, world!"),
|
|
1012
|
+
* agent(),
|
|
1013
|
+
* ],
|
|
1014
|
+
* });
|
|
1015
|
+
*
|
|
1016
|
+
* if (result.success) {
|
|
1017
|
+
* console.log("Scenario passed!");
|
|
1018
|
+
* } else {
|
|
1019
|
+
* console.error(`Scenario failed: ${result.reasoning}`);
|
|
1020
|
+
* }
|
|
1021
|
+
* }
|
|
1022
|
+
*
|
|
1023
|
+
* main();
|
|
1024
|
+
* ```
|
|
1025
|
+
*/
|
|
1026
|
+
declare function run(cfg: ScenarioConfig): Promise<ScenarioResult>;
|
|
1027
|
+
|
|
1028
|
+
/**
|
|
1029
|
+
* Configuration for the inference parameters of a testing agent.
|
|
1030
|
+
*/
|
|
1031
|
+
interface TestingAgentInferenceConfig {
|
|
1032
|
+
/**
|
|
1033
|
+
* The language model to use for generating responses.
|
|
1034
|
+
* If not provided, a default model will be used.
|
|
1035
|
+
*/
|
|
1036
|
+
model?: LanguageModel;
|
|
1037
|
+
/**
|
|
1038
|
+
* The temperature for the language model.
|
|
1039
|
+
* Defaults to 0.
|
|
1040
|
+
*/
|
|
1041
|
+
temperature?: number;
|
|
1042
|
+
/**
|
|
1043
|
+
* The maximum number of tokens to generate.
|
|
1044
|
+
*/
|
|
1045
|
+
maxTokens?: number;
|
|
1046
|
+
}
|
|
1047
|
+
/**
|
|
1048
|
+
* General configuration for a testing agent.
|
|
1049
|
+
*/
|
|
1050
|
+
interface TestingAgentConfig extends TestingAgentInferenceConfig {
|
|
1051
|
+
/**
|
|
1052
|
+
* The name of the agent.
|
|
1053
|
+
*/
|
|
1054
|
+
name?: string;
|
|
1055
|
+
}
|
|
1056
|
+
/**
|
|
1057
|
+
* The arguments for finishing a test, used by the judge agent's tool.
|
|
1058
|
+
*/
|
|
1059
|
+
interface FinishTestArgs {
|
|
1060
|
+
/**
|
|
1061
|
+
* A record of the criteria and their results.
|
|
1062
|
+
*/
|
|
1063
|
+
criteria: Record<string, "true" | "false" | "inconclusive">;
|
|
1064
|
+
/**
|
|
1065
|
+
* The reasoning behind the verdict.
|
|
1066
|
+
*/
|
|
1067
|
+
reasoning: string;
|
|
1068
|
+
/**
|
|
1069
|
+
* The final verdict of the test.
|
|
1070
|
+
*/
|
|
1071
|
+
verdict: "success" | "failure" | "inconclusive";
|
|
1072
|
+
}
|
|
1073
|
+
|
|
1074
|
+
/**
|
|
1075
|
+
* Configuration for the judge agent.
|
|
1076
|
+
*/
|
|
1077
|
+
interface JudgeAgentConfig extends TestingAgentConfig {
|
|
1078
|
+
/**
|
|
1079
|
+
* A custom system prompt to override the default behavior of the judge.
|
|
1080
|
+
*/
|
|
1081
|
+
systemPrompt?: string;
|
|
1082
|
+
/**
|
|
1083
|
+
* The criteria that the judge will use to evaluate the conversation.
|
|
1084
|
+
*/
|
|
1085
|
+
criteria: string[];
|
|
1086
|
+
}
|
|
1087
|
+
/**
|
|
1088
|
+
* Agent that evaluates conversations against success criteria.
|
|
1089
|
+
*
|
|
1090
|
+
* The JudgeAgent watches conversations in real-time and makes decisions about
|
|
1091
|
+
* whether the agent under test is meeting the specified criteria. It can either
|
|
1092
|
+
* allow the conversation to continue or end it with a success/failure verdict.
|
|
1093
|
+
*
|
|
1094
|
+
* The judge uses function calling to make structured decisions and provides
|
|
1095
|
+
* detailed reasoning for its verdicts. It evaluates each criterion independently
|
|
1096
|
+
* and provides comprehensive feedback about what worked and what didn't.
|
|
1097
|
+
*
|
|
1098
|
+
* @param cfg Configuration for the judge agent.
|
|
1099
|
+
* @param cfg.criteria List of success criteria to evaluate against.
|
|
1100
|
+
* @param cfg.model Optional. The language model to use for generating responses.
|
|
1101
|
+
* @param cfg.temperature Optional. The temperature to use for the model.
|
|
1102
|
+
* @param cfg.maxTokens Optional. The maximum number of tokens to generate.
|
|
1103
|
+
* @param cfg.systemPrompt Optional. A custom system prompt to override the default judge behavior.
|
|
1104
|
+
*
|
|
1105
|
+
* @example
|
|
1106
|
+
* ```typescript
|
|
1107
|
+
* import { run, judgeAgent, AgentRole, user, agent, AgentAdapter } from '@langwatch/scenario';
|
|
1108
|
+
*
|
|
1109
|
+
* const myAgent: AgentAdapter = {
|
|
1110
|
+
* role: AgentRole.AGENT,
|
|
1111
|
+
* async call(input) {
|
|
1112
|
+
* return `The user said: ${input.messages.at(-1)?.content}`;
|
|
1113
|
+
* }
|
|
1114
|
+
* };
|
|
1115
|
+
*
|
|
1116
|
+
* async function main() {
|
|
1117
|
+
* const result = await run({
|
|
1118
|
+
* name: "Judge Agent Test",
|
|
1119
|
+
* description: "A simple test to see if the judge agent works.",
|
|
1120
|
+
* agents: [
|
|
1121
|
+
* myAgent,
|
|
1122
|
+
* judgeAgent({
|
|
1123
|
+
* criteria: ["The agent must respond to the user."],
|
|
1124
|
+
* }),
|
|
1125
|
+
* ],
|
|
1126
|
+
* script: [
|
|
1127
|
+
* user("Hello!"),
|
|
1128
|
+
* agent(),
|
|
1129
|
+
* ],
|
|
1130
|
+
* });
|
|
1131
|
+
* }
|
|
1132
|
+
* main();
|
|
1133
|
+
* ```
|
|
1134
|
+
*/
|
|
1135
|
+
declare const judgeAgent: (cfg: JudgeAgentConfig) => {
|
|
1136
|
+
role: AgentRole.JUDGE;
|
|
1137
|
+
criteria: string[];
|
|
1138
|
+
call: (input: AgentInput) => Promise<never[] | {
|
|
1139
|
+
success: boolean;
|
|
1140
|
+
messages: CoreMessage[];
|
|
1141
|
+
reasoning: string;
|
|
1142
|
+
passedCriteria: string[];
|
|
1143
|
+
failedCriteria: string[];
|
|
1144
|
+
}>;
|
|
1145
|
+
};
|
|
1146
|
+
|
|
1147
|
+
/**
|
|
1148
|
+
* Agent that simulates realistic user behavior in scenario conversations.
|
|
1149
|
+
*
|
|
1150
|
+
* This agent generates user messages that are appropriate for the given scenario
|
|
1151
|
+
* context, simulating how a real human user would interact with the agent under test.
|
|
1152
|
+
* It uses an LLM to generate natural, contextually relevant user inputs that help
|
|
1153
|
+
* drive the conversation forward according to the scenario description.
|
|
1154
|
+
*
|
|
1155
|
+
* @param config Optional configuration for the agent.
|
|
1156
|
+
* @param config.model The language model to use for generating responses.
|
|
1157
|
+
* @param config.temperature The temperature to use for the model.
|
|
1158
|
+
* @param config.maxTokens The maximum number of tokens to generate.
|
|
1159
|
+
*
|
|
1160
|
+
* @example
|
|
1161
|
+
* ```typescript
|
|
1162
|
+
* import { run, userSimulatorAgent, AgentRole, user, agent, AgentAdapter } from '@langwatch/scenario';
|
|
1163
|
+
*
|
|
1164
|
+
* const myAgent: AgentAdapter = {
|
|
1165
|
+
* role: AgentRole.AGENT,
|
|
1166
|
+
* async call(input) {
|
|
1167
|
+
* return `The user said: ${input.messages.at(-1)?.content}`;
|
|
1168
|
+
* }
|
|
1169
|
+
* };
|
|
1170
|
+
*
|
|
1171
|
+
* async function main() {
|
|
1172
|
+
* const result = await run({
|
|
1173
|
+
* name: "User Simulator Test",
|
|
1174
|
+
* description: "A simple test to see if the user simulator works.",
|
|
1175
|
+
* agents: [myAgent, userSimulatorAgent()],
|
|
1176
|
+
* script: [
|
|
1177
|
+
* user(),
|
|
1178
|
+
* agent(),
|
|
1179
|
+
* ],
|
|
1180
|
+
* });
|
|
1181
|
+
* }
|
|
1182
|
+
* main();
|
|
1183
|
+
* ```
|
|
1184
|
+
*/
|
|
1185
|
+
declare const userSimulatorAgent: (config?: TestingAgentConfig) => {
|
|
1186
|
+
role: AgentRole.USER;
|
|
1187
|
+
call: (input: AgentInput) => Promise<{
|
|
1188
|
+
role: "user";
|
|
1189
|
+
content: string;
|
|
1190
|
+
}>;
|
|
1191
|
+
};
|
|
1192
|
+
|
|
1193
|
+
export { AgentAdapter, type AgentInput, type AgentReturnTypes, AgentRole, type FinishTestArgs, JudgeAgentAdapter, type ScenarioConfig, type ScenarioConfigFinal, ScenarioExecution, type ScenarioExecutionLike, ScenarioExecutionState, type ScenarioExecutionStateLike, type ScenarioProjectConfig, type ScenarioResult, type ScriptStep, type TestingAgentConfig, type TestingAgentInferenceConfig, UserSimulatorAgentAdapter, agent, allAgentRoles, defineConfig, fail, judge, judgeAgent, message, proceed, run, scenarioProjectConfigSchema, succeed, user, userSimulatorAgent };
|