@langwatch/scenario 0.2.0-prerelease.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +272 -0
- package/dist/index.d.mts +1193 -0
- package/dist/index.d.ts +1193 -0
- package/dist/index.js +1444 -0
- package/dist/index.mjs +1389 -0
- package/package.json +78 -0
package/README.md
ADDED
@@ -0,0 +1,272 @@
# Scenario

[![npm version](https://badge.fury.io/js/%40getscenario%2Fscenario.svg)](https://badge.fury.io/js/%40getscenario%2Fscenario)

A powerful TypeScript library for testing AI agents in realistic, scripted scenarios.

Scenario provides a declarative DSL for defining test cases, allowing you to control conversation flow, simulate user behavior, and evaluate agent performance against predefined criteria.
## Features

- **Declarative DSL**: Write clear and concise tests with a simple, powerful scripting language.
- **Realistic Simulation**: Use the `userSimulatorAgent` to generate natural user interactions.
- **Automated Evaluation**: Employ the `judgeAgent` to automatically assess conversations against success criteria.
- **Flexible & Extensible**: Easily integrate any AI agent that conforms to a simple `AgentAdapter` interface.
- **Detailed Reporting**: Get rich results including conversation history, success/failure reasoning, and performance metrics.
- **TypeScript First**: Full type safety and autocompletion in your editor.
## Installation

```bash
pnpm add @getscenario/scenario
# or
npm install @getscenario/scenario
# or
yarn add @getscenario/scenario
```
## Quick Start

Create your first scenario test in under a minute.

```typescript
// echo.test.ts
import { run, AgentRole, AgentAdapter, user, agent, succeed } from "@getscenario/scenario";

// 1. Create an adapter for your agent
const echoAgent: AgentAdapter = {
  role: AgentRole.AGENT,
  call: async (input) => {
    // This agent simply echoes back the last message content
    const lastMessage = input.messages[input.messages.length - 1];
    return `You said: ${lastMessage.content}`;
  },
};

// 2. Define and run your scenario
async function testEchoAgent() {
  const result = await run({
    name: "Echo Agent Test",
    description: "The agent should echo back the user's message.",
    agents: [echoAgent],
    script: [
      user("Hello world!"),
      agent("You said: Hello world!"), // You can assert the agent's response directly
      succeed("Agent correctly echoed the message."),
    ],
  });

  if (result.success) {
    console.log("✅ Scenario passed!");
  } else {
    console.error(`❌ Scenario failed: ${result.reasoning}`);
  }
}

testEchoAgent();
```
## Usage with a Test Runner

Scenario integrates seamlessly with test runners like [Vitest](https://vitest.dev/) or [Jest](https://jestjs.io/). Here's a more advanced example testing an AI-powered weather agent.

```typescript
// weather.test.ts
import { describe, it, expect } from "vitest";
import { openai } from "@ai-sdk/openai";
import { run, userSimulatorAgent, AgentRole, AgentAdapter, user, agent, succeed } from "@getscenario/scenario";
import { generateText, tool } from "ai";
import { z } from "zod";

describe("Weather Agent", () => {
  it("should get the weather for a city", async () => {
    // 1. Define the tools your agent can use
    const getCurrentWeather = tool({
      description: "Get the current weather in a given city.",
      parameters: z.object({
        city: z.string().describe("The city to get the weather for."),
      }),
      execute: async ({ city }) => `The weather in ${city} is cloudy with a temperature of 24°C.`,
    });

    // 2. Create an adapter for your agent
    const weatherAgent: AgentAdapter = {
      role: AgentRole.AGENT,
      call: async (input) => {
        const response = await generateText({
          model: openai("gpt-4.1-mini"),
          system: `You are a helpful assistant that may help the user with weather information.`,
          messages: input.messages,
          tools: { get_current_weather: getCurrentWeather },
        });

        if (response.toolCalls?.length) {
          // For simplicity, we'll just return the arguments of the first tool call
          const { toolName, args } = response.toolCalls[0];
          return {
            role: "tool",
            content: [{ type: "tool-result", toolName, result: args }],
          };
        }

        return response.text;
      },
    };

    // 3. Define and run your scenario
    const result = await run({
      name: "Checking the weather",
      description: "The user asks for the weather in a specific city, and the agent should use the weather tool to find it.",
      agents: [
        weatherAgent,
        userSimulatorAgent({ model: openai("gpt-4.1-mini") }),
      ],
      script: [
        user("What's the weather like in Barcelona?"),
        agent(),
        // You can use inline assertions within your script
        (state) => {
          expect(state.hasToolCall("get_current_weather")).toBe(true);
        },
        succeed("Agent correctly used the weather tool."),
      ],
    });

    // 4. Assert the final result
    expect(result.success).toBe(true);
  });
});
```
## Core Concepts

### `run(config)`

The main function to execute a scenario. It takes a configuration object and returns a promise that resolves with the final `ScenarioResult`.
### `ScenarioConfig`

The configuration object for a scenario.

- `name: string`: A human-readable name for the scenario.
- `description: string`: A detailed description of what the scenario tests.
- `agents: AgentAdapter[]`: A list of agents participating in the scenario.
- `script?: ScriptStep[]`: An optional array of steps to control the scenario flow. If not provided, the scenario will proceed automatically.
- `maxTurns?: number`: The maximum number of conversation turns before a timeout. Defaults to 10.
- `verbose?: boolean`: Enables detailed logging during execution.
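For reference, here is a minimal sketch that sets every `ScenarioConfig` field at once; `myAgent` is a hypothetical stand-in for your own adapter.

```typescript
import { run, AgentRole, AgentAdapter, user, agent, succeed } from "@getscenario/scenario";

// Hypothetical agent under test, kept trivial for the example
const myAgent: AgentAdapter = {
  role: AgentRole.AGENT,
  call: async () => "Hello!",
};

const result = await run({
  name: "Config reference",                                        // required
  description: "Demonstrates the optional ScenarioConfig fields.", // required
  agents: [myAgent],                                               // required
  script: [user("Hi"), agent(), succeed()],                        // optional; omit to proceed automatically
  maxTurns: 5,                                                     // optional; defaults to 10
  verbose: true,                                                   // optional; detailed logging
});

console.log(result.success, result.reasoning);
```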
### Agents

Agents are the participants in a scenario. They are defined by the `AgentAdapter` interface.

```typescript
export interface AgentAdapter {
  role: AgentRole; // USER, AGENT, or JUDGE
  call: (input: AgentInput) => Promise<AgentReturnTypes>;
}
```

Scenario provides built-in agents for common testing needs:

- `userSimulatorAgent(config)`: Simulates a human user, generating realistic messages based on the scenario description.
- `judgeAgent(config)`: Evaluates the conversation against a set of criteria and determines if the scenario succeeds or fails.
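For instance, a scenario can be driven end to end by the built-in agents: the simulator plays the user and the judge decides the outcome. This is only a sketch: the local `./weather-agent` import stands in for the adapter from the earlier example, and the `criteria` option on `judgeAgent` is an assumption based on the description above, so check the exported typings for the exact shape.

```typescript
import { openai } from "@ai-sdk/openai";
import { run, userSimulatorAgent, judgeAgent } from "@getscenario/scenario";
import { weatherAgent } from "./weather-agent"; // the AgentAdapter from the example above

const result = await run({
  name: "Fully simulated weather check",
  description: "The user wants the weather in Barcelona and the agent should answer using its weather tool.",
  agents: [
    weatherAgent,
    userSimulatorAgent({ model: openai("gpt-4.1-mini") }),
    judgeAgent({
      model: openai("gpt-4.1-mini"),
      // NOTE: `criteria` is assumed here; check the judgeAgent typings in your version.
      criteria: ["The agent reports the current weather for the requested city."],
    }),
  ],
  // No script: the scenario proceeds automatically until the judge reaches a verdict.
});
```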
### Scripting

Scripts provide fine-grained control over the scenario's execution. A script is an array of `ScriptStep` functions.

A `ScriptStep` is a function that receives the current `ScenarioExecutionState` and the `ScenarioExecutionLike` context.
**Built-in Script Steps:**

- `user(content?)`: A user turn. If `content` is provided, it's used as the message. Otherwise, the `userSimulatorAgent` generates one.
- `agent(content?)`: An agent turn. If `content` is provided, it's used as the message. Otherwise, the agent under test generates a response.
- `judge(content?)`: Forces the `judgeAgent` to make a decision.
- `message(message)`: Adds a specific `CoreMessage` to the conversation.
- `proceed(turns?, onTurn?, onStep?)`: Lets the scenario run automatically.
- `succeed(reasoning?)`: Ends the scenario with a success verdict.
- `fail(reasoning?)`: Ends the scenario with a failure verdict.
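To illustrate the steps not shown elsewhere in this README, the sketch below seeds the conversation with `message`, hands control back with `proceed`, and then forces a verdict with `judge`. It assumes the `ai` SDK's `CoreMessage` shape for `message`, a hypothetical `supportAgent` adapter, and `judgeAgent` options mirroring `userSimulatorAgent`'s.

```typescript
import { openai } from "@ai-sdk/openai";
import { run, userSimulatorAgent, judgeAgent, agent, message, proceed, judge } from "@getscenario/scenario";
import { supportAgent } from "./support-agent"; // hypothetical AgentAdapter under test

await run({
  name: "Scripted start, automatic finish",
  description: "The user asks to cancel an order; the agent should confirm the cancellation.",
  agents: [
    supportAgent,
    userSimulatorAgent({ model: openai("gpt-4.1-mini") }),
    judgeAgent({ model: openai("gpt-4.1-mini") }), // options assumed to mirror userSimulatorAgent
  ],
  script: [
    message({ role: "user", content: "I need to cancel order #1234." }), // exact CoreMessage
    agent(),    // let the agent under test respond
    proceed(3), // run up to 3 more turns automatically
    judge(),    // force the judge to reach a verdict now
  ],
});
```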
You can also provide your own functions as script steps for making assertions:

```typescript
import { expect } from "vitest";
import { user, agent, succeed } from "@getscenario/scenario";

const script = [
  user("Hello"),
  agent(),
  (state) => {
    // Make assertions on the state
    expect(state.lastAssistantMessage?.content).toContain("Hi there");
  },
  succeed(),
];
```
## Configuration

You can configure project-wide defaults by creating a `scenario.config.js` or `scenario.config.mjs` file in your project root.

```js
// scenario.config.mjs
import { defineConfig } from "@getscenario/scenario/config";
import { openai } from "@ai-sdk/openai";

export default defineConfig({
  // Set a default model provider for all agents (e.g., userSimulatorAgent, judgeAgent)
  defaultModel: {
    model: openai("gpt-4o-mini"),
    temperature: 0.1,
  },

  // Configure the LangWatch reporting endpoint and API key
  langwatchEndpoint: "https://app.langwatch.ai",
  langwatchApiKey: process.env.LANGWATCH_API_KEY,
});
```

The library will automatically load this configuration.
### All Configuration Options

The following configuration options are all optional. You can specify any combination of them in your `scenario.config.js` file.

- `defaultModel` _(Optional)_: An object to configure the default AI model for all agents.
  - `model`: **(Required if `defaultModel` is set)** An instance of a language model from a provider like `@ai-sdk/openai`.
  - `temperature` _(Optional)_: The default temperature for the model (e.g., `0.1`).
  - `maxTokens` _(Optional)_: The default maximum number of tokens for the model to generate.
- `langwatchEndpoint` _(Optional)_: The endpoint for the LangWatch reporting service. If not specified, it defaults to the `LANGWATCH_ENDPOINT` environment variable, or `https://app.langwatch.ai`.
- `langwatchApiKey` _(Optional)_: Your LangWatch API key for authenticating with the reporting service. If not specified, it defaults to the `LANGWATCH_API_KEY` environment variable.
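Putting these together, a config file that sets every option could look like the sketch below; the values are illustrative, and the environment-variable fallbacks are spelled out even though the library applies them by default.

```js
// scenario.config.mjs — every option set, with illustrative values
import { defineConfig } from "@getscenario/scenario/config";
import { openai } from "@ai-sdk/openai";

export default defineConfig({
  defaultModel: {
    model: openai("gpt-4o-mini"), // required when defaultModel is set
    temperature: 0.1,             // optional
    maxTokens: 1024,              // optional
  },
  langwatchEndpoint: process.env.LANGWATCH_ENDPOINT ?? "https://app.langwatch.ai",
  langwatchApiKey: process.env.LANGWATCH_API_KEY,
});
```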
### Environment Variables

You can control the library's behavior with the following environment variables:

- `SCENARIO_LOG_LEVEL`: Sets the verbosity of the internal logger. Can be `error`, `warn`, `info`, or `debug`. By default, logging is silent.
- `SCENARIO_DISABLE_SIMULATION_REPORT_INFO`: Set to `true` to disable the "Scenario Simulation Reporting" banner that is printed to the console when a test run starts.
- `LANGWATCH_API_KEY`: Your LangWatch API key. This is used as a fallback if `langwatchApiKey` is not set in your config file.
- `LANGWATCH_ENDPOINT`: The LangWatch reporting endpoint. This is used as a fallback if `langwatchEndpoint` is not set in your config file.
## Development

This project uses `pnpm` for package management.

### Getting Started

```bash
# Install dependencies
pnpm install

# Build the project
pnpm run build

# Run tests
pnpm test
```

## License

MIT