@langwatch/scenario 0.4.0 → 0.4.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.d.mts +70 -47
- package/dist/index.d.ts +70 -47
- package/dist/index.js +153 -87
- package/dist/index.mjs +153 -87
- package/dist/integrations/vitest/setup.js +1 -1
- package/dist/integrations/vitest/setup.mjs +1 -1
- package/package.json +4 -4
package/dist/index.d.mts
CHANGED
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
import * as ai from 'ai';
|
|
2
|
-
import {
|
|
2
|
+
import { ModelMessage, UserModelMessage, AssistantModelMessage, ToolModelMessage, LanguageModel, generateText } from 'ai';
|
|
3
3
|
import { z } from 'zod/v4';
|
|
4
4
|
import { SpanProcessor, ReadableSpan } from '@opentelemetry/sdk-trace-base';
|
|
5
5
|
import { RealtimeSession } from '@openai/agents/realtime';
|
|
@@ -8,11 +8,11 @@ import { z as z$1 } from 'zod';
|
|
|
8
8
|
|
|
9
9
|
/**
|
|
10
10
|
* The possible return types from an agent's `call` method.
|
|
11
|
-
* - string |
|
|
11
|
+
* - string | ModelMessage | ModelMessage[]: Agent generated response
|
|
12
12
|
* - JudgeResult: Judge made a final decision
|
|
13
13
|
* - null: Judge wants to continue observing (no decision yet)
|
|
14
14
|
*/
|
|
15
|
-
type AgentReturnTypes = string |
|
|
15
|
+
type AgentReturnTypes = string | ModelMessage | ModelMessage[] | JudgeResult | null;
|
|
16
16
|
|
|
17
17
|
declare enum AgentRole {
|
|
18
18
|
USER = "User",
|
|
@@ -20,6 +20,18 @@ declare enum AgentRole {
|
|
|
20
20
|
JUDGE = "Judge"
|
|
21
21
|
}
|
|
22
22
|
declare const allAgentRoles: readonly [AgentRole.USER, AgentRole.AGENT, AgentRole.JUDGE];
|
|
23
|
+
/**
|
|
24
|
+
* Encapsulates a request for the judge agent to evaluate the conversation.
|
|
25
|
+
*
|
|
26
|
+
* When present on AgentInput, signals the judge to produce a verdict.
|
|
27
|
+
* Optionally carries inline criteria that override the judge's own criteria.
|
|
28
|
+
*/
|
|
29
|
+
interface JudgmentRequest {
|
|
30
|
+
/**
|
|
31
|
+
* Optional criteria to evaluate, overriding the judge agent's configured criteria.
|
|
32
|
+
*/
|
|
33
|
+
criteria?: string[];
|
|
34
|
+
}
|
|
23
35
|
/**
|
|
24
36
|
* Input provided to an agent's `call` method.
|
|
25
37
|
*/
|
|
@@ -31,19 +43,19 @@ interface AgentInput {
|
|
|
31
43
|
/**
|
|
32
44
|
* The full history of messages in the conversation.
|
|
33
45
|
*/
|
|
34
|
-
messages:
|
|
46
|
+
messages: ModelMessage[];
|
|
35
47
|
/**
|
|
36
48
|
* New messages added since the last time this agent was called.
|
|
37
49
|
*/
|
|
38
|
-
newMessages:
|
|
50
|
+
newMessages: ModelMessage[];
|
|
39
51
|
/**
|
|
40
52
|
* The role the agent is being asked to play in this turn.
|
|
41
53
|
*/
|
|
42
54
|
requestedRole: AgentRole;
|
|
43
55
|
/**
|
|
44
|
-
*
|
|
56
|
+
* When set, requests the judge to produce a verdict, optionally with inline criteria.
|
|
45
57
|
*/
|
|
46
|
-
judgmentRequest
|
|
58
|
+
judgmentRequest?: JudgmentRequest;
|
|
47
59
|
/**
|
|
48
60
|
* The current state of the scenario execution.
|
|
49
61
|
*/
|
|
@@ -191,7 +203,7 @@ interface ScenarioExecutionLike {
|
|
|
191
203
|
/**
|
|
192
204
|
* The history of messages in the conversation.
|
|
193
205
|
*/
|
|
194
|
-
readonly messages:
|
|
206
|
+
readonly messages: ModelMessage[];
|
|
195
207
|
/**
|
|
196
208
|
* The ID of the conversation thread.
|
|
197
209
|
*/
|
|
@@ -200,25 +212,27 @@ interface ScenarioExecutionLike {
|
|
|
200
212
|
* Adds a message to the conversation.
|
|
201
213
|
* @param message The message to add.
|
|
202
214
|
*/
|
|
203
|
-
message(message:
|
|
215
|
+
message(message: ModelMessage): Promise<void>;
|
|
204
216
|
/**
|
|
205
217
|
* Adds a user message to the conversation.
|
|
206
218
|
* If no content is provided, the user simulator will generate a message.
|
|
207
219
|
* @param content The content of the user message.
|
|
208
220
|
*/
|
|
209
|
-
user(content?: string |
|
|
221
|
+
user(content?: string | ModelMessage): Promise<void>;
|
|
210
222
|
/**
|
|
211
223
|
* Adds an agent message to the conversation.
|
|
212
224
|
* If no content is provided, the agent under test will generate a message.
|
|
213
225
|
* @param content The content of the agent message.
|
|
214
226
|
*/
|
|
215
|
-
agent(content?: string |
|
|
227
|
+
agent(content?: string | ModelMessage): Promise<void>;
|
|
216
228
|
/**
|
|
217
229
|
* Invokes the judge agent to evaluate the current state.
|
|
218
|
-
* @param
|
|
230
|
+
* @param options Optional options with inline criteria to evaluate as a checkpoint.
|
|
219
231
|
* @returns The result of the scenario if the judge makes a final decision.
|
|
220
232
|
*/
|
|
221
|
-
judge(
|
|
233
|
+
judge(options?: {
|
|
234
|
+
criteria?: string[];
|
|
235
|
+
}): Promise<ScenarioResult | null>;
|
|
222
236
|
/**
|
|
223
237
|
* Proceeds with the scenario automatically for a number of turns.
|
|
224
238
|
* @param turns The number of turns to proceed. Defaults to running until the scenario ends.
|
|
@@ -258,7 +272,7 @@ interface ScenarioResult {
|
|
|
258
272
|
/**
|
|
259
273
|
* The sequence of messages exchanged during the scenario.
|
|
260
274
|
*/
|
|
261
|
-
messages:
|
|
275
|
+
messages: ModelMessage[];
|
|
262
276
|
/**
|
|
263
277
|
* The reasoning behind the scenario's outcome.
|
|
264
278
|
*/
|
|
@@ -299,7 +313,7 @@ interface ScenarioExecutionStateLike {
|
|
|
299
313
|
/**
|
|
300
314
|
* The sequence of messages exchanged during the scenario.
|
|
301
315
|
*/
|
|
302
|
-
get messages():
|
|
316
|
+
get messages(): ModelMessage[];
|
|
303
317
|
/**
|
|
304
318
|
* The unique identifier for the execution thread.
|
|
305
319
|
*/
|
|
@@ -313,28 +327,28 @@ interface ScenarioExecutionStateLike {
|
|
|
313
327
|
*
|
|
314
328
|
* @param message - The core message to add.
|
|
315
329
|
*/
|
|
316
|
-
addMessage(message:
|
|
330
|
+
addMessage(message: ModelMessage): void;
|
|
317
331
|
/**
|
|
318
332
|
* Retrieves the last message from the execution state.
|
|
319
333
|
* @returns The last message.
|
|
320
334
|
*/
|
|
321
|
-
lastMessage():
|
|
335
|
+
lastMessage(): ModelMessage;
|
|
322
336
|
/**
|
|
323
337
|
* Retrieves the last user message from the execution state.
|
|
324
338
|
* @returns The last user message.
|
|
325
339
|
*/
|
|
326
|
-
lastUserMessage():
|
|
340
|
+
lastUserMessage(): UserModelMessage;
|
|
327
341
|
/**
|
|
328
342
|
* Retrieves the last agent message from the execution state.
|
|
329
343
|
* @returns The last agent message.
|
|
330
344
|
*/
|
|
331
|
-
lastAgentMessage():
|
|
345
|
+
lastAgentMessage(): AssistantModelMessage;
|
|
332
346
|
/**
|
|
333
347
|
* Retrieves the last tool call message for a specific tool.
|
|
334
348
|
* @param toolName - The name of the tool.
|
|
335
349
|
* @returns The last tool call message.
|
|
336
350
|
*/
|
|
337
|
-
lastToolCall(toolName: string):
|
|
351
|
+
lastToolCall(toolName: string): ToolModelMessage;
|
|
338
352
|
/**
|
|
339
353
|
* Checks if a tool call for a specific tool exists in the execution state.
|
|
340
354
|
* @param toolName - The name of the tool.
|
|
@@ -364,6 +378,7 @@ declare const domain_DEFAULT_MAX_TURNS: typeof DEFAULT_MAX_TURNS;
|
|
|
364
378
|
declare const domain_DEFAULT_VERBOSE: typeof DEFAULT_VERBOSE;
|
|
365
379
|
type domain_JudgeAgentAdapter = JudgeAgentAdapter;
|
|
366
380
|
declare const domain_JudgeAgentAdapter: typeof JudgeAgentAdapter;
|
|
381
|
+
type domain_JudgmentRequest = JudgmentRequest;
|
|
367
382
|
type domain_ScenarioConfig = ScenarioConfig;
|
|
368
383
|
type domain_ScenarioConfigFinal = ScenarioConfigFinal;
|
|
369
384
|
type domain_ScenarioExecutionLike = ScenarioExecutionLike;
|
|
@@ -377,7 +392,7 @@ declare const domain_allAgentRoles: typeof allAgentRoles;
|
|
|
377
392
|
declare const domain_defineConfig: typeof defineConfig;
|
|
378
393
|
declare const domain_scenarioProjectConfigSchema: typeof scenarioProjectConfigSchema;
|
|
379
394
|
declare namespace domain {
|
|
380
|
-
export { domain_AgentAdapter as AgentAdapter, type domain_AgentInput as AgentInput, type domain_AgentReturnTypes as AgentReturnTypes, domain_AgentRole as AgentRole, domain_DEFAULT_MAX_TURNS as DEFAULT_MAX_TURNS, domain_DEFAULT_VERBOSE as DEFAULT_VERBOSE, domain_JudgeAgentAdapter as JudgeAgentAdapter, type domain_ScenarioConfig as ScenarioConfig, type domain_ScenarioConfigFinal as ScenarioConfigFinal, type domain_ScenarioExecutionLike as ScenarioExecutionLike, type domain_ScenarioExecutionStateLike as ScenarioExecutionStateLike, type domain_ScenarioProjectConfig as ScenarioProjectConfig, type domain_ScenarioResult as ScenarioResult, type domain_ScriptStep as ScriptStep, domain_UserSimulatorAgentAdapter as UserSimulatorAgentAdapter, domain_allAgentRoles as allAgentRoles, domain_defineConfig as defineConfig, domain_scenarioProjectConfigSchema as scenarioProjectConfigSchema };
|
|
395
|
+
export { domain_AgentAdapter as AgentAdapter, type domain_AgentInput as AgentInput, type domain_AgentReturnTypes as AgentReturnTypes, domain_AgentRole as AgentRole, domain_DEFAULT_MAX_TURNS as DEFAULT_MAX_TURNS, domain_DEFAULT_VERBOSE as DEFAULT_VERBOSE, domain_JudgeAgentAdapter as JudgeAgentAdapter, type domain_JudgmentRequest as JudgmentRequest, type domain_ScenarioConfig as ScenarioConfig, type domain_ScenarioConfigFinal as ScenarioConfigFinal, type domain_ScenarioExecutionLike as ScenarioExecutionLike, type domain_ScenarioExecutionStateLike as ScenarioExecutionStateLike, type domain_ScenarioProjectConfig as ScenarioProjectConfig, type domain_ScenarioResult as ScenarioResult, type domain_ScriptStep as ScriptStep, domain_UserSimulatorAgentAdapter as UserSimulatorAgentAdapter, domain_allAgentRoles as allAgentRoles, domain_defineConfig as defineConfig, domain_scenarioProjectConfigSchema as scenarioProjectConfigSchema };
|
|
381
396
|
}
|
|
382
397
|
|
|
383
398
|
/**
|
|
@@ -475,7 +490,7 @@ interface JudgeAgentConfig extends TestingAgentConfig {
|
|
|
475
490
|
/**
|
|
476
491
|
* The criteria that the judge will use to evaluate the conversation.
|
|
477
492
|
*/
|
|
478
|
-
criteria
|
|
493
|
+
criteria?: string[];
|
|
479
494
|
/**
|
|
480
495
|
* Optional span collector for telemetry. Defaults to global singleton.
|
|
481
496
|
*/
|
|
@@ -554,7 +569,7 @@ declare class JudgeAgent extends JudgeAgentAdapter {
|
|
|
554
569
|
* main();
|
|
555
570
|
* ```
|
|
556
571
|
*/
|
|
557
|
-
declare const judgeAgent: (cfg
|
|
572
|
+
declare const judgeAgent: (cfg?: JudgeAgentConfig) => JudgeAgent;
|
|
558
573
|
|
|
559
574
|
/**
|
|
560
575
|
* Transforms OpenTelemetry spans into a complete plain-text digest for judge evaluation.
|
|
@@ -1334,6 +1349,8 @@ declare class ScenarioExecution implements ScenarioExecutionLike {
|
|
|
1334
1349
|
private currentTurnSpan?;
|
|
1335
1350
|
/** Timestamp when execution started (for total time calculation) */
|
|
1336
1351
|
private totalStartTime;
|
|
1352
|
+
/** Accumulated results from inline judge checkpoints */
|
|
1353
|
+
private checkpointResults;
|
|
1337
1354
|
/** Event stream for monitoring scenario progress */
|
|
1338
1355
|
private eventSubject;
|
|
1339
1356
|
/**
|
|
@@ -1554,24 +1571,23 @@ declare class ScenarioExecution implements ScenarioExecutionLike {
|
|
|
1554
1571
|
*
|
|
1555
1572
|
* This method is part of the ScenarioExecutionLike interface used by script steps.
|
|
1556
1573
|
*
|
|
1557
|
-
* @param
|
|
1574
|
+
* @param options - Optional options with inline criteria to evaluate as a checkpoint.
|
|
1558
1575
|
* @returns A promise that resolves with:
|
|
1559
1576
|
* - ScenarioResult if the judge makes a final decision, or
|
|
1560
1577
|
* - Null if the conversation should continue
|
|
1561
1578
|
*
|
|
1562
1579
|
* @example
|
|
1563
1580
|
* ```typescript
|
|
1564
|
-
* // Let judge evaluate
|
|
1581
|
+
* // Let judge evaluate with its configured criteria
|
|
1565
1582
|
* const result = await execution.judge();
|
|
1566
|
-
* if (result) {
|
|
1567
|
-
* console.log(`Judge decided: ${result.success ? 'pass' : 'fail'}`);
|
|
1568
|
-
* }
|
|
1569
1583
|
*
|
|
1570
|
-
* //
|
|
1571
|
-
* const result = await execution.judge(
|
|
1584
|
+
* // Evaluate inline criteria as a checkpoint
|
|
1585
|
+
* const result = await execution.judge({ criteria: ["Agent responded helpfully"] });
|
|
1572
1586
|
* ```
|
|
1573
1587
|
*/
|
|
1574
|
-
judge(
|
|
1588
|
+
judge(options?: {
|
|
1589
|
+
criteria?: string[];
|
|
1590
|
+
}): Promise<ScenarioResult | null>;
|
|
1575
1591
|
/**
|
|
1576
1592
|
* Lets the scenario proceed automatically for a specified number of turns.
|
|
1577
1593
|
*
|
|
@@ -1718,6 +1734,8 @@ declare class ScenarioExecution implements ScenarioExecutionLike {
|
|
|
1718
1734
|
* - Clears the result from any previous execution
|
|
1719
1735
|
*/
|
|
1720
1736
|
private reset;
|
|
1737
|
+
/** Compiles all accumulated checkpoint results into aggregated met/unmet criteria. */
|
|
1738
|
+
private get compiledCheckpoints();
|
|
1721
1739
|
private nextAgentForRole;
|
|
1722
1740
|
/**
|
|
1723
1741
|
* Starts a new turn in the scenario execution.
|
|
@@ -1847,7 +1865,7 @@ declare class ScenarioExecutionState implements ScenarioExecutionStateLike {
|
|
|
1847
1865
|
description: string;
|
|
1848
1866
|
config: ScenarioConfig;
|
|
1849
1867
|
constructor(config: ScenarioConfig);
|
|
1850
|
-
get messages():
|
|
1868
|
+
get messages(): ModelMessage[];
|
|
1851
1869
|
get currentTurn(): number;
|
|
1852
1870
|
set currentTurn(turn: number);
|
|
1853
1871
|
get threadId(): string;
|
|
@@ -1858,10 +1876,10 @@ declare class ScenarioExecutionState implements ScenarioExecutionStateLike {
|
|
|
1858
1876
|
* @param message - The message to add.
|
|
1859
1877
|
* @param traceId - Optional trace ID to associate with the message.
|
|
1860
1878
|
*/
|
|
1861
|
-
addMessage(message:
|
|
1879
|
+
addMessage(message: ModelMessage & {
|
|
1862
1880
|
traceId?: string;
|
|
1863
1881
|
}): void;
|
|
1864
|
-
lastMessage():
|
|
1882
|
+
lastMessage(): ModelMessage & {
|
|
1865
1883
|
id: string;
|
|
1866
1884
|
traceId?: string;
|
|
1867
1885
|
};
|
|
@@ -1869,10 +1887,10 @@ declare class ScenarioExecutionState implements ScenarioExecutionStateLike {
|
|
|
1869
1887
|
id: string;
|
|
1870
1888
|
traceId?: string;
|
|
1871
1889
|
};
|
|
1872
|
-
lastAgentMessage():
|
|
1890
|
+
lastAgentMessage(): AssistantModelMessage & {
|
|
1873
1891
|
traceId?: string;
|
|
1874
1892
|
};
|
|
1875
|
-
lastToolCall(toolName: string):
|
|
1893
|
+
lastToolCall(toolName: string): ToolModelMessage & {
|
|
1876
1894
|
traceId?: string;
|
|
1877
1895
|
};
|
|
1878
1896
|
hasToolCall(toolName: string): boolean;
|
|
@@ -1957,14 +1975,14 @@ declare namespace runner {
|
|
|
1957
1975
|
/**
|
|
1958
1976
|
* Add a specific message to the conversation.
|
|
1959
1977
|
*
|
|
1960
|
-
* This function allows you to inject any
|
|
1978
|
+
* This function allows you to inject any ModelMessage compatible message directly
|
|
1961
1979
|
* into the conversation at a specific point in the script. Useful for
|
|
1962
1980
|
* simulating tool responses, system messages, or specific conversational states.
|
|
1963
1981
|
*
|
|
1964
1982
|
* @param message The message to add to the conversation.
|
|
1965
1983
|
* @returns A ScriptStep function that can be used in scenario scripts.
|
|
1966
1984
|
*/
|
|
1967
|
-
declare const message: (message:
|
|
1985
|
+
declare const message: (message: ModelMessage) => ScriptStep;
|
|
1968
1986
|
/**
|
|
1969
1987
|
* Generate or specify an agent response in the conversation.
|
|
1970
1988
|
*
|
|
@@ -1976,19 +1994,24 @@ declare const message: (message: CoreMessage) => ScriptStep;
|
|
|
1976
1994
|
* If undefined, the agent under test will generate content automatically.
|
|
1977
1995
|
* @returns A ScriptStep function that can be used in scenario scripts.
|
|
1978
1996
|
*/
|
|
1979
|
-
declare const agent: (content?: string |
|
|
1997
|
+
declare const agent: (content?: string | ModelMessage) => ScriptStep;
|
|
1980
1998
|
/**
|
|
1981
1999
|
* Invoke the judge agent to evaluate the current conversation state.
|
|
1982
2000
|
*
|
|
1983
|
-
*
|
|
1984
|
-
*
|
|
1985
|
-
*
|
|
2001
|
+
* When criteria are provided inline, the judge evaluates only those criteria
|
|
2002
|
+
* as a checkpoint: if all pass, the scenario continues; if any fail, the
|
|
2003
|
+
* scenario fails immediately. This is the preferred way to pass criteria
|
|
2004
|
+
* when using scripts.
|
|
2005
|
+
*
|
|
2006
|
+
* When no criteria are provided, the judge uses its own configured criteria
|
|
2007
|
+
* and returns a final verdict (success or failure), ending the scenario.
|
|
1986
2008
|
*
|
|
1987
|
-
* @param
|
|
1988
|
-
* the judge evaluate based on its criteria.
|
|
2009
|
+
* @param options Optional options object with inline criteria to evaluate.
|
|
1989
2010
|
* @returns A ScriptStep function that can be used in scenario scripts.
|
|
1990
2011
|
*/
|
|
1991
|
-
declare const judge: (
|
|
2012
|
+
declare const judge: (options?: {
|
|
2013
|
+
criteria: string[];
|
|
2014
|
+
}) => ScriptStep;
|
|
1992
2015
|
/**
|
|
1993
2016
|
* Generate or specify a user message in the conversation.
|
|
1994
2017
|
*
|
|
@@ -2000,7 +2023,7 @@ declare const judge: (content?: string | CoreMessage) => ScriptStep;
|
|
|
2000
2023
|
* If undefined, the user simulator will generate content automatically.
|
|
2001
2024
|
* @returns A ScriptStep function that can be used in scenario scripts.
|
|
2002
2025
|
*/
|
|
2003
|
-
declare const user: (content?: string |
|
|
2026
|
+
declare const user: (content?: string | ModelMessage) => ScriptStep;
|
|
2004
2027
|
/**
|
|
2005
2028
|
* Let the scenario proceed automatically for a specified number of turns.
|
|
2006
2029
|
*
|
|
@@ -2048,4 +2071,4 @@ declare namespace script {
|
|
|
2048
2071
|
type ScenarioApi = typeof agents & typeof domain & typeof execution & typeof runner & typeof script;
|
|
2049
2072
|
declare const scenario: ScenarioApi;
|
|
2050
2073
|
|
|
2051
|
-
export { AgentAdapter, type AgentInput, type AgentReturnTypes, AgentRole, type AudioResponseEvent, DEFAULT_MAX_TURNS, DEFAULT_VERBOSE, type FinishTestArgs, type InvokeLLMParams, type InvokeLLMResult, JudgeAgentAdapter, type JudgeAgentConfig, type JudgeResult, JudgeSpanCollector, JudgeSpanDigestFormatter, RealtimeAgentAdapter, type RealtimeAgentAdapterConfig, type ScenarioConfig, type ScenarioConfigFinal, ScenarioExecution, type ScenarioExecutionLike, ScenarioExecutionState, type ScenarioExecutionStateLike, type ScenarioProjectConfig, type ScenarioResult, type ScriptStep, type StateChangeEvent, StateChangeEventType, type TestingAgentConfig, UserSimulatorAgentAdapter, agent, allAgentRoles, scenario as default, defineConfig, fail, judge, judgeAgent, judgeSpanCollector, judgeSpanDigestFormatter, message, proceed, run, scenario, scenarioProjectConfigSchema, succeed, user, userSimulatorAgent };
|
|
2074
|
+
export { AgentAdapter, type AgentInput, type AgentReturnTypes, AgentRole, type AudioResponseEvent, DEFAULT_MAX_TURNS, DEFAULT_VERBOSE, type FinishTestArgs, type InvokeLLMParams, type InvokeLLMResult, JudgeAgentAdapter, type JudgeAgentConfig, type JudgeResult, JudgeSpanCollector, JudgeSpanDigestFormatter, type JudgmentRequest, RealtimeAgentAdapter, type RealtimeAgentAdapterConfig, type ScenarioConfig, type ScenarioConfigFinal, ScenarioExecution, type ScenarioExecutionLike, ScenarioExecutionState, type ScenarioExecutionStateLike, type ScenarioProjectConfig, type ScenarioResult, type ScriptStep, type StateChangeEvent, StateChangeEventType, type TestingAgentConfig, UserSimulatorAgentAdapter, agent, allAgentRoles, scenario as default, defineConfig, fail, judge, judgeAgent, judgeSpanCollector, judgeSpanDigestFormatter, message, proceed, run, scenario, scenarioProjectConfigSchema, succeed, user, userSimulatorAgent };
|