@langwatch/scenario 0.2.0-prerelease.0

package/README.md ADDED
# Scenario

![scenario](../assets/scenario-wide.webp)

[![npm version](https://badge.fury.io/js/%40langwatch%2Fscenario.svg)](https://badge.fury.io/js/%40langwatch%2Fscenario)

A powerful TypeScript library for testing AI agents in realistic, scripted scenarios.

Scenario provides a declarative DSL for defining test cases, allowing you to control conversation flow, simulate user behavior, and evaluate agent performance against predefined criteria.
## Features

- **Declarative DSL**: Write clear and concise tests with a simple, powerful scripting language.
- **Realistic Simulation**: Use the `userSimulatorAgent` to generate natural user interactions.
- **Automated Evaluation**: Employ the `judgeAgent` to automatically assess conversations against success criteria.
- **Flexible & Extensible**: Easily integrate any AI agent that conforms to a simple `AgentAdapter` interface.
- **Detailed Reporting**: Get rich results including conversation history, success/failure reasoning, and performance metrics.
- **TypeScript First**: Full type safety and autocompletion in your editor.
## Installation

```bash
pnpm add @langwatch/scenario
# or
npm install @langwatch/scenario
# or
yarn add @langwatch/scenario
```
## Quick Start

Create your first scenario test in under a minute.

```typescript
// echo.test.ts
import { run, AgentRole, AgentAdapter, user, agent, succeed } from "@langwatch/scenario";

// 1. Create an adapter for your agent
const echoAgent: AgentAdapter = {
  role: AgentRole.AGENT,
  call: async (input) => {
    // This agent simply echoes back the last message content
    const lastMessage = input.messages[input.messages.length - 1];
    return `You said: ${lastMessage.content}`;
  },
};

// 2. Define and run your scenario
async function testEchoAgent() {
  const result = await run({
    name: "Echo Agent Test",
    description: "The agent should echo back the user's message.",
    agents: [echoAgent],
    script: [
      user("Hello world!"),
      agent(), // let the echo agent generate its reply
      (state) => {
        // Assert the agent's response directly
        if (state.lastAssistantMessage?.content !== "You said: Hello world!") {
          throw new Error("Agent did not echo the message");
        }
      },
      succeed("Agent correctly echoed the message."),
    ],
  });

  if (result.success) {
    console.log("✅ Scenario passed!");
  } else {
    console.error(`❌ Scenario failed: ${result.reasoning}`);
  }
}

testEchoAgent();
```
## Usage with a Test Runner

Scenario integrates seamlessly with test runners like [Vitest](https://vitest.dev/) or [Jest](https://jestjs.io/). Here's a more advanced example testing an AI-powered weather agent.

```typescript
// weather.test.ts
import { describe, it, expect } from "vitest";
import { openai } from "@ai-sdk/openai";
import { run, userSimulatorAgent, AgentRole, AgentAdapter, user, agent, succeed } from "@langwatch/scenario";
import { generateText, tool } from "ai";
import { z } from "zod";

describe("Weather Agent", () => {
  it("should get the weather for a city", async () => {
    // 1. Define the tools your agent can use
    const getCurrentWeather = tool({
      description: "Get the current weather in a given city.",
      parameters: z.object({
        city: z.string().describe("The city to get the weather for."),
      }),
      execute: async ({ city }) => `The weather in ${city} is cloudy with a temperature of 24°C.`,
    });

    // 2. Create an adapter for your agent
    const weatherAgent: AgentAdapter = {
      role: AgentRole.AGENT,
      call: async (input) => {
        const response = await generateText({
          model: openai("gpt-4.1-mini"),
          system: `You are a helpful assistant that may help the user with weather information.`,
          messages: input.messages,
          tools: { get_current_weather: getCurrentWeather },
        });

        if (response.toolCalls?.length) {
          // For simplicity, return the first tool call's arguments as its result
          const { toolCallId, toolName, args } = response.toolCalls[0];
          return {
            role: "tool",
            content: [{ type: "tool-result", toolCallId, toolName, result: args }],
          };
        }

        return response.text;
      },
    };

    // 3. Define and run your scenario
    const result = await run({
      name: "Checking the weather",
      description: "The user asks for the weather in a specific city, and the agent should use the weather tool to find it.",
      agents: [
        weatherAgent,
        userSimulatorAgent({ model: openai("gpt-4.1-mini") }),
      ],
      script: [
        user("What's the weather like in Barcelona?"),
        agent(),
        // You can use inline assertions within your script
        (state) => {
          expect(state.hasToolCall("get_current_weather")).toBe(true);
        },
        succeed("Agent correctly used the weather tool."),
      ],
    });

    // 4. Assert the final result
    expect(result.success).toBe(true);
  });
});
```
## Core Concepts

### `run(config)`

The main function to execute a scenario. It takes a configuration object and returns a promise that resolves with the final `ScenarioResult`.

### `ScenarioConfig`

The configuration object for a scenario.

- `name: string`: A human-readable name for the scenario.
- `description: string`: A detailed description of what the scenario tests.
- `agents: AgentAdapter[]`: A list of agents participating in the scenario.
- `script?: ScriptStep[]`: An optional array of steps to control the scenario flow. If not provided, the scenario will proceed automatically.
- `maxTurns?: number`: The maximum number of conversation turns before a timeout. Defaults to 10.
- `verbose?: boolean`: Enables detailed logging during execution.

The sketch below shows these fields in use.
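It is illustrative only: the stand-in agent, names, and model are placeholders, and only the fields documented above appear.

```typescript
import { openai } from "@ai-sdk/openai";
import { run, userSimulatorAgent, user, agent, succeed, AgentRole, AgentAdapter } from "@langwatch/scenario";

// A stand-in agent under test, used only to make the example self-contained.
const refundAgent: AgentAdapter = {
  role: AgentRole.AGENT,
  call: async () => "Refunds are available within 30 days of purchase.",
};

const result = await run({
  name: "Refund policy question",
  description: "The user asks about refunds; the agent should state the 30-day policy.",
  agents: [refundAgent, userSimulatorAgent({ model: openai("gpt-4.1-mini") })],
  script: [user("Do you offer refunds?"), agent(), succeed()], // optional; omit to let the scenario proceed automatically
  maxTurns: 6,   // optional, defaults to 10
  verbose: true, // optional, logs each step
});

console.log(result.success, result.reasoning);
```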
### Agents

Agents are the participants in a scenario. They are defined by the `AgentAdapter` interface.

```typescript
export interface AgentAdapter {
  role: AgentRole; // USER, AGENT, or JUDGE
  call: (input: AgentInput) => Promise<AgentReturnTypes>;
}
```

Scenario provides built-in agents for common testing needs:

- `userSimulatorAgent(config)`: Simulates a human user, generating realistic messages based on the scenario description.
- `judgeAgent(config)`: Evaluates the conversation against a set of criteria and determines whether the scenario succeeds or fails.

A short sketch of wiring both built-in agents into a scenario follows.
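Note that this README does not spell out the options `judgeAgent` accepts; the `criteria` list below is an assumption modeled on `userSimulatorAgent`'s `model` option, so check the package's types for the exact shape.

```typescript
import { openai } from "@ai-sdk/openai";
import { userSimulatorAgent, judgeAgent } from "@langwatch/scenario";

// Simulated user, driven by the scenario description.
const simulator = userSimulatorAgent({ model: openai("gpt-4.1-mini") });

// Judge that decides success or failure.
// The `criteria` option is an assumption, not confirmed by this README.
const judge = judgeAgent({
  model: openai("gpt-4.1-mini"),
  criteria: [
    "The agent answers the user's question",
    "The agent does not invent information",
  ],
});

// Pass both alongside the agent under test:
//   agents: [weatherAgent, simulator, judge]
```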
### Scripting

Scripts provide fine-grained control over the scenario's execution. A script is an array of `ScriptStep` functions.

A `ScriptStep` is a function that receives the current `ScenarioExecutionState` and the `ScenarioExecutionLike` context.

**Built-in Script Steps:**

- `user(content?)`: A user turn. If `content` is provided, it's used as the message. Otherwise, the `userSimulatorAgent` generates one.
- `agent(content?)`: An agent turn. If `content` is provided, it's used as the message. Otherwise, the agent under test generates a response.
- `judge(content?)`: Forces the `judgeAgent` to make a decision.
- `message(message)`: Adds a specific `CoreMessage` to the conversation.
- `proceed(turns?, onTurn?, onStep?)`: Lets the scenario run automatically.
- `succeed(reasoning?)`: Ends the scenario with a success verdict.
- `fail(reasoning?)`: Ends the scenario with a failure verdict.

The sketch below illustrates the non-interactive steps (`message`, `proceed`, and `judge`).
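The parameters passed to the `proceed` callback are assumed from the signature above and may differ in the actual types.

```typescript
import { user, message, proceed, judge } from "@langwatch/scenario";

const script = [
  // Seed the conversation with an explicit CoreMessage.
  message({ role: "system", content: "The user is a returning customer." }),
  user("I never received my order."),
  // Let the simulated user and the agent under test exchange up to 3 turns,
  // logging the last agent reply after each turn (callback shape assumed).
  proceed(3, (state) => {
    console.log("last agent reply:", state.lastAssistantMessage?.content);
  }),
  // Hand the final verdict to the judgeAgent.
  judge(),
];
```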
You can also provide your own functions as script steps for making assertions:

```typescript
import { expect } from "vitest";
import { user, agent, succeed } from "@langwatch/scenario";

const script = [
  user("Hello"),
  agent(),
  (state) => {
    // Make assertions on the state
    expect(state.lastAssistantMessage?.content).toContain("Hi there");
  },
  succeed(),
];
```
## Configuration

You can configure project-wide defaults by creating a `scenario.config.js` or `scenario.config.mjs` file in your project root.

```js
// scenario.config.mjs
import { defineConfig } from "@langwatch/scenario/config";
import { openai } from "@ai-sdk/openai";

export default defineConfig({
  // Set a default model provider for all agents (e.g., userSimulatorAgent, judgeAgent)
  defaultModel: {
    model: openai("gpt-4o-mini"),
    temperature: 0.1,
  },

  // Configure the LangWatch reporting endpoint and API key
  langwatchEndpoint: "https://app.langwatch.ai",
  langwatchApiKey: process.env.LANGWATCH_API_KEY,
});
```

The library will automatically load this configuration.
### All Configuration Options

The following configuration options are all optional. You can specify any combination of them in your `scenario.config.js` file.

- `defaultModel` _(Optional)_: An object to configure the default AI model for all agents.
  - `model`: **(Required if `defaultModel` is set)** An instance of a language model from a provider like `@ai-sdk/openai`.
  - `temperature` _(Optional)_: The default temperature for the model (e.g., `0.1`).
  - `maxTokens` _(Optional)_: The default maximum number of tokens for the model to generate.
- `langwatchEndpoint` _(Optional)_: The endpoint for the LangWatch reporting service. If not specified, it defaults to the `LANGWATCH_ENDPOINT` environment variable, or `https://app.langwatch.ai`.
- `langwatchApiKey` _(Optional)_: Your LangWatch API key for authenticating with the reporting service. If not specified, it defaults to the `LANGWATCH_API_KEY` environment variable.

Another short example follows, this time leaving the LangWatch settings to their environment-variable fallbacks.
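The model name and token limit here are illustrative.

```js
// scenario.config.mjs
import { defineConfig } from "@langwatch/scenario/config";
import { openai } from "@ai-sdk/openai";

export default defineConfig({
  defaultModel: {
    model: openai("gpt-4.1-mini"),
    temperature: 0,
    maxTokens: 1024, // cap output length for the simulator and judge agents
  },
  // langwatchEndpoint and langwatchApiKey are omitted on purpose:
  // they fall back to LANGWATCH_ENDPOINT and LANGWATCH_API_KEY.
});
```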
### Environment Variables

You can control the library's behavior with the following environment variables:

- `SCENARIO_LOG_LEVEL`: Sets the verbosity of the internal logger. Can be `error`, `warn`, `info`, or `debug`. By default, logging is silent.
- `SCENARIO_DISABLE_SIMULATION_REPORT_INFO`: Set to `true` to disable the "Scenario Simulation Reporting" banner that is printed to the console when a test run starts.
- `LANGWATCH_API_KEY`: Your LangWatch API key. This is used as a fallback if `langwatchApiKey` is not set in your config file.
- `LANGWATCH_ENDPOINT`: The LangWatch reporting endpoint. This is used as a fallback if `langwatchEndpoint` is not set in your config file.

An example invocation follows.
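It assumes the `pnpm test` script shown in the Development section below; the API key value is a placeholder.

```bash
export LANGWATCH_API_KEY="your-api-key"
SCENARIO_LOG_LEVEL=debug pnpm test
```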
## Development

This project uses `pnpm` for package management.

### Getting Started

```bash
# Install dependencies
pnpm install

# Build the project
pnpm run build

# Run tests
pnpm test
```
## License

MIT