@langwatch/scenario 0.2.0-prerelease.0 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +72 -17
- package/dist/chunk-7P6ASYW6.mjs +9 -0
- package/dist/chunk-ORWSJC5F.mjs +309 -0
- package/dist/index.d.mts +642 -515
- package/dist/index.d.ts +642 -515
- package/dist/index.js +977 -907
- package/dist/index.mjs +845 -1073
- package/dist/integrations/vitest/reporter.d.mts +9 -0
- package/dist/integrations/vitest/reporter.d.ts +9 -0
- package/dist/integrations/vitest/reporter.js +168 -0
- package/dist/integrations/vitest/reporter.mjs +139 -0
- package/dist/integrations/vitest/setup.d.mts +2 -0
- package/dist/integrations/vitest/setup.d.ts +2 -0
- package/dist/integrations/vitest/setup.js +377 -0
- package/dist/integrations/vitest/setup.mjs +51 -0
- package/package.json +17 -5
package/dist/index.mjs
CHANGED
|
@@ -1,28 +1,33 @@
|
|
|
1
|
-
|
|
2
|
-
|
|
3
|
-
|
|
4
|
-
};
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
};
|
|
8
|
-
var judge = (content) => {
|
|
9
|
-
return (_state, executor) => executor.judge(content);
|
|
10
|
-
};
|
|
11
|
-
var user = (content) => {
|
|
12
|
-
return (_state, executor) => executor.user(content);
|
|
13
|
-
};
|
|
14
|
-
var proceed = (turns, onTurn, onStep) => {
|
|
15
|
-
return (_state, executor) => executor.proceed(turns, onTurn, onStep);
|
|
16
|
-
};
|
|
17
|
-
var succeed = (reasoning) => {
|
|
18
|
-
return (_state, executor) => executor.succeed(reasoning);
|
|
19
|
-
};
|
|
20
|
-
var fail = (reasoning) => {
|
|
21
|
-
return (_state, executor) => executor.fail(reasoning);
|
|
22
|
-
};
|
|
1
|
+
import {
|
|
2
|
+
EventBus,
|
|
3
|
+
Logger
|
|
4
|
+
} from "./chunk-ORWSJC5F.mjs";
|
|
5
|
+
import {
|
|
6
|
+
__export
|
|
7
|
+
} from "./chunk-7P6ASYW6.mjs";
|
|
23
8
|
|
|
24
|
-
// src/
|
|
25
|
-
|
|
9
|
+
// src/agents/index.ts
|
|
10
|
+
var agents_exports = {};
|
|
11
|
+
__export(agents_exports, {
|
|
12
|
+
judgeAgent: () => judgeAgent,
|
|
13
|
+
userSimulatorAgent: () => userSimulatorAgent
|
|
14
|
+
});
|
|
15
|
+
|
|
16
|
+
// src/agents/judge-agent.ts
|
|
17
|
+
import { generateText, tool } from "ai";
|
|
18
|
+
import { z as z2 } from "zod";
|
|
19
|
+
|
|
20
|
+
// src/domain/index.ts
|
|
21
|
+
var domain_exports = {};
|
|
22
|
+
__export(domain_exports, {
|
|
23
|
+
AgentAdapter: () => AgentAdapter,
|
|
24
|
+
AgentRole: () => AgentRole,
|
|
25
|
+
JudgeAgentAdapter: () => JudgeAgentAdapter,
|
|
26
|
+
UserSimulatorAgentAdapter: () => UserSimulatorAgentAdapter,
|
|
27
|
+
allAgentRoles: () => allAgentRoles,
|
|
28
|
+
defineConfig: () => defineConfig,
|
|
29
|
+
scenarioProjectConfigSchema: () => scenarioProjectConfigSchema
|
|
30
|
+
});
|
|
26
31
|
|
|
27
32
|
// src/domain/core/config.ts
|
|
28
33
|
import { z } from "zod";
|
|
@@ -66,347 +71,448 @@ var JudgeAgentAdapter = class {
|
|
|
66
71
|
}
|
|
67
72
|
};
|
|
68
73
|
|
|
69
|
-
// src/utils
|
|
70
|
-
|
|
71
|
-
var
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
function getBatchRunId() {
|
|
82
|
-
if (!batchRunId) {
|
|
83
|
-
batchRunId = process.env.SCENARIO_BATCH_RUN_ID ?? `scenariobatchrun_${generate()}`;
|
|
84
|
-
}
|
|
85
|
-
return batchRunId;
|
|
86
|
-
}
|
|
87
|
-
function generateMessageId() {
|
|
88
|
-
return `scenariomsg_${generate()}`;
|
|
89
|
-
}
|
|
90
|
-
|
|
91
|
-
// src/execution/scenario-execution-state.ts
|
|
92
|
-
var ScenarioExecutionState = class {
|
|
93
|
-
_history = [];
|
|
94
|
-
_turn = 0;
|
|
95
|
-
_partialResult = null;
|
|
96
|
-
_threadId = "";
|
|
97
|
-
_agents = [];
|
|
98
|
-
_pendingMessages = /* @__PURE__ */ new Map();
|
|
99
|
-
_pendingRolesOnTurn = [];
|
|
100
|
-
_pendingAgentsOnTurn = /* @__PURE__ */ new Set();
|
|
101
|
-
_agentTimes = /* @__PURE__ */ new Map();
|
|
102
|
-
_totalStartTime = 0;
|
|
103
|
-
/**
|
|
104
|
-
* Creates a new ScenarioExecutionState.
|
|
105
|
-
*/
|
|
106
|
-
constructor() {
|
|
107
|
-
this._totalStartTime = Date.now();
|
|
108
|
-
}
|
|
109
|
-
setThreadId(threadId) {
|
|
110
|
-
this._threadId = threadId;
|
|
111
|
-
}
|
|
112
|
-
setAgents(agents) {
|
|
113
|
-
this._agents = agents;
|
|
114
|
-
this._pendingMessages.clear();
|
|
115
|
-
this._agentTimes.clear();
|
|
116
|
-
}
|
|
117
|
-
appendMessage(role, content) {
|
|
118
|
-
const message2 = { role, content };
|
|
119
|
-
this._history.push({ ...message2, id: generateMessageId() });
|
|
120
|
-
}
|
|
121
|
-
appendUserMessage(content) {
|
|
122
|
-
this.appendMessage("user", content);
|
|
123
|
-
}
|
|
124
|
-
appendAssistantMessage(content) {
|
|
125
|
-
this.appendMessage("assistant", content);
|
|
126
|
-
}
|
|
127
|
-
addMessage(message2, fromAgentIdx) {
|
|
128
|
-
this._history.push({ ...message2, id: generateMessageId() });
|
|
129
|
-
for (let idx = 0; idx < this._agents.length; idx++) {
|
|
130
|
-
if (idx === fromAgentIdx) continue;
|
|
131
|
-
if (!this._pendingMessages.has(idx)) {
|
|
132
|
-
this._pendingMessages.set(idx, []);
|
|
133
|
-
}
|
|
134
|
-
this._pendingMessages.get(idx).push(message2);
|
|
135
|
-
}
|
|
136
|
-
}
|
|
137
|
-
addMessages(messages, fromAgentIdx) {
|
|
138
|
-
for (const message2 of messages) {
|
|
139
|
-
this.addMessage(message2, fromAgentIdx);
|
|
140
|
-
}
|
|
141
|
-
}
|
|
142
|
-
getPendingMessages(agentIdx) {
|
|
143
|
-
return this._pendingMessages.get(agentIdx) || [];
|
|
144
|
-
}
|
|
145
|
-
clearPendingMessages(agentIdx) {
|
|
146
|
-
this._pendingMessages.set(agentIdx, []);
|
|
147
|
-
}
|
|
148
|
-
newTurn() {
|
|
149
|
-
this._pendingAgentsOnTurn = new Set(this._agents);
|
|
150
|
-
this._pendingRolesOnTurn = [
|
|
151
|
-
"User" /* USER */,
|
|
152
|
-
"Agent" /* AGENT */,
|
|
153
|
-
"Judge" /* JUDGE */
|
|
154
|
-
];
|
|
155
|
-
if (this._turn === null) {
|
|
156
|
-
this._turn = 1;
|
|
157
|
-
} else {
|
|
158
|
-
this._turn++;
|
|
159
|
-
}
|
|
160
|
-
}
|
|
161
|
-
removePendingRole(role) {
|
|
162
|
-
const index = this._pendingRolesOnTurn.indexOf(role);
|
|
163
|
-
if (index > -1) {
|
|
164
|
-
this._pendingRolesOnTurn.splice(index, 1);
|
|
74
|
+
// src/agents/utils.ts
|
|
75
|
+
var toolMessageRole = "tool";
|
|
76
|
+
var assistantMessageRole = "assistant";
|
|
77
|
+
var userMessageRole = "user";
|
|
78
|
+
var groupMessagesByToolBoundaries = (messages) => {
|
|
79
|
+
const segments = [];
|
|
80
|
+
let currentSegment = [];
|
|
81
|
+
for (const message2 of messages) {
|
|
82
|
+
currentSegment.push(message2);
|
|
83
|
+
if (message2.role === toolMessageRole) {
|
|
84
|
+
segments.push(currentSegment);
|
|
85
|
+
currentSegment = [];
|
|
165
86
|
}
|
|
166
87
|
}
|
|
167
|
-
|
|
168
|
-
|
|
88
|
+
if (currentSegment.length > 0) {
|
|
89
|
+
segments.push(currentSegment);
|
|
169
90
|
}
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
|
|
91
|
+
return segments;
|
|
92
|
+
};
|
|
93
|
+
var segmentHasToolMessages = (segment) => {
|
|
94
|
+
return segment.some((message2) => {
|
|
95
|
+
if (message2.role === toolMessageRole) return true;
|
|
96
|
+
if (message2.role === assistantMessageRole && Array.isArray(message2.content)) {
|
|
97
|
+
return message2.content.some((part) => part.type === "tool-call");
|
|
176
98
|
}
|
|
177
|
-
return
|
|
178
|
-
}
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
|
|
183
|
-
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
return
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
}
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
|
|
203
|
-
|
|
204
|
-
|
|
205
|
-
)
|
|
206
|
-
);
|
|
207
|
-
return toolMessage;
|
|
208
|
-
}
|
|
209
|
-
hasToolCall(toolName) {
|
|
210
|
-
return this._history.some(
|
|
211
|
-
(message2) => message2.role === "tool" && message2.content.find(
|
|
212
|
-
(part) => part.type === "tool-result" && part.toolName === toolName
|
|
213
|
-
)
|
|
214
|
-
);
|
|
215
|
-
}
|
|
216
|
-
get history() {
|
|
217
|
-
return this._history;
|
|
218
|
-
}
|
|
219
|
-
get historyWithoutLastMessage() {
|
|
220
|
-
return this._history.slice(0, -1);
|
|
221
|
-
}
|
|
222
|
-
get historyWithoutLastUserMessage() {
|
|
223
|
-
const lastUserMessageIndex = this._history.findLastIndex((message2) => message2.role === "user");
|
|
224
|
-
if (lastUserMessageIndex === -1) return this._history;
|
|
225
|
-
return this._history.slice(0, lastUserMessageIndex);
|
|
226
|
-
}
|
|
227
|
-
get turn() {
|
|
228
|
-
return this._turn;
|
|
229
|
-
}
|
|
230
|
-
set turn(turn) {
|
|
231
|
-
this._turn = turn;
|
|
232
|
-
}
|
|
233
|
-
get threadId() {
|
|
234
|
-
return this._threadId;
|
|
235
|
-
}
|
|
236
|
-
get agents() {
|
|
237
|
-
return this._agents;
|
|
238
|
-
}
|
|
239
|
-
get pendingRolesOnTurn() {
|
|
240
|
-
return this._pendingRolesOnTurn;
|
|
241
|
-
}
|
|
242
|
-
set pendingRolesOnTurn(roles) {
|
|
243
|
-
this._pendingRolesOnTurn = roles;
|
|
244
|
-
}
|
|
245
|
-
get pendingAgentsOnTurn() {
|
|
246
|
-
return Array.from(this._pendingAgentsOnTurn);
|
|
247
|
-
}
|
|
248
|
-
set pendingAgentsOnTurn(agents) {
|
|
249
|
-
this._pendingAgentsOnTurn = new Set(agents);
|
|
250
|
-
}
|
|
251
|
-
get partialResult() {
|
|
252
|
-
return this._partialResult;
|
|
253
|
-
}
|
|
254
|
-
get totalTime() {
|
|
255
|
-
return Date.now() - this._totalStartTime;
|
|
256
|
-
}
|
|
257
|
-
get agentTimes() {
|
|
258
|
-
return new Map(this._agentTimes);
|
|
259
|
-
}
|
|
260
|
-
removeLastPendingRole() {
|
|
261
|
-
this._pendingRolesOnTurn.pop();
|
|
262
|
-
}
|
|
99
|
+
return false;
|
|
100
|
+
});
|
|
101
|
+
};
|
|
102
|
+
var reverseSegmentRoles = (segment) => {
|
|
103
|
+
return segment.map((message2) => {
|
|
104
|
+
const hasStringContent = typeof message2.content === "string";
|
|
105
|
+
if (!hasStringContent) return message2;
|
|
106
|
+
const roleMap = {
|
|
107
|
+
[userMessageRole]: assistantMessageRole,
|
|
108
|
+
[assistantMessageRole]: userMessageRole
|
|
109
|
+
};
|
|
110
|
+
const newRole = roleMap[message2.role];
|
|
111
|
+
if (!newRole) return message2;
|
|
112
|
+
return {
|
|
113
|
+
role: newRole,
|
|
114
|
+
content: message2.content
|
|
115
|
+
};
|
|
116
|
+
});
|
|
117
|
+
};
|
|
118
|
+
var messageRoleReversal = (messages) => {
|
|
119
|
+
const segments = groupMessagesByToolBoundaries(messages);
|
|
120
|
+
const processedSegments = segments.map(
|
|
121
|
+
(segment) => segmentHasToolMessages(segment) ? segment : reverseSegmentRoles(segment)
|
|
122
|
+
);
|
|
123
|
+
return processedSegments.flat();
|
|
124
|
+
};
|
|
125
|
+
var criterionToParamName = (criterion) => {
|
|
126
|
+
return criterion.replace(/"/g, "").replace(/[^a-zA-Z0-9]/g, "_").replace(/ /g, "_").toLowerCase().substring(0, 70);
|
|
263
127
|
};
|
|
264
128
|
|
|
265
|
-
// src/
|
|
266
|
-
import
|
|
267
|
-
import
|
|
268
|
-
|
|
269
|
-
|
|
270
|
-
|
|
271
|
-
|
|
272
|
-
|
|
273
|
-
|
|
274
|
-
|
|
275
|
-
|
|
276
|
-
|
|
277
|
-
|
|
278
|
-
|
|
279
|
-
|
|
280
|
-
|
|
281
|
-
|
|
282
|
-
|
|
283
|
-
|
|
284
|
-
|
|
285
|
-
|
|
286
|
-
});
|
|
287
|
-
var scenarioRunStartedSchema = baseScenarioEventSchema.extend({
|
|
288
|
-
type: z2.literal("SCENARIO_RUN_STARTED" /* RUN_STARTED */),
|
|
289
|
-
metadata: z2.object({
|
|
290
|
-
name: z2.string(),
|
|
291
|
-
description: z2.string().optional()
|
|
292
|
-
// config: z.record(z.unknown()).optional(),
|
|
293
|
-
})
|
|
294
|
-
});
|
|
295
|
-
var scenarioRunFinishedSchema = baseScenarioEventSchema.extend({
|
|
296
|
-
type: z2.literal("SCENARIO_RUN_FINISHED" /* RUN_FINISHED */),
|
|
297
|
-
status: z2.nativeEnum(ScenarioRunStatus)
|
|
298
|
-
// error: z
|
|
299
|
-
// .object({
|
|
300
|
-
// message: z.string(),
|
|
301
|
-
// code: z.string().optional(),
|
|
302
|
-
// stack: z.string().optional(),
|
|
303
|
-
// })
|
|
304
|
-
// .optional(),
|
|
305
|
-
// metrics: z.record(z.number()).optional(),
|
|
306
|
-
});
|
|
307
|
-
var scenarioMessageSnapshotSchema = MessagesSnapshotEventSchema.merge(
|
|
308
|
-
baseScenarioEventSchema.extend({
|
|
309
|
-
type: z2.literal("SCENARIO_MESSAGE_SNAPSHOT" /* MESSAGE_SNAPSHOT */)
|
|
310
|
-
})
|
|
311
|
-
);
|
|
312
|
-
var scenarioEventSchema = z2.discriminatedUnion("type", [
|
|
313
|
-
scenarioRunStartedSchema,
|
|
314
|
-
scenarioRunFinishedSchema,
|
|
315
|
-
scenarioMessageSnapshotSchema
|
|
316
|
-
]);
|
|
317
|
-
var successSchema = z2.object({ success: z2.boolean() });
|
|
318
|
-
var errorSchema = z2.object({ error: z2.string() });
|
|
319
|
-
var stateSchema = z2.object({
|
|
320
|
-
state: z2.object({
|
|
321
|
-
messages: z2.array(z2.any()),
|
|
322
|
-
status: z2.string()
|
|
323
|
-
})
|
|
324
|
-
});
|
|
325
|
-
var runsSchema = z2.object({ runs: z2.array(z2.string()) });
|
|
326
|
-
var eventsSchema = z2.object({ events: z2.array(scenarioEventSchema) });
|
|
327
|
-
|
|
328
|
-
// src/utils/logger.ts
|
|
329
|
-
var Logger = class _Logger {
|
|
330
|
-
constructor(context) {
|
|
331
|
-
this.context = context;
|
|
332
|
-
}
|
|
333
|
-
/**
|
|
334
|
-
* Creates a logger with context (e.g., class name)
|
|
335
|
-
*/
|
|
336
|
-
static create(context) {
|
|
337
|
-
return new _Logger(context);
|
|
338
|
-
}
|
|
339
|
-
/**
|
|
340
|
-
* Checks if logging should occur based on LOG_LEVEL env var
|
|
341
|
-
*/
|
|
342
|
-
shouldLog(level) {
|
|
343
|
-
const logLevel = (process.env.SCENARIO_LOG_LEVEL || "").toLowerCase();
|
|
344
|
-
const levels = ["error", "warn", "info", "debug"];
|
|
345
|
-
const currentLevelIndex = levels.indexOf(logLevel);
|
|
346
|
-
const requestedLevelIndex = levels.indexOf(level);
|
|
347
|
-
return currentLevelIndex >= 0 && requestedLevelIndex <= currentLevelIndex;
|
|
348
|
-
}
|
|
349
|
-
formatMessage(message2) {
|
|
350
|
-
return this.context ? `[${this.context}] ${message2}` : message2;
|
|
351
|
-
}
|
|
352
|
-
error(message2, data) {
|
|
353
|
-
if (this.shouldLog("error")) {
|
|
354
|
-
const formattedMessage = this.formatMessage(message2);
|
|
355
|
-
if (data) {
|
|
356
|
-
console.error(formattedMessage, data);
|
|
357
|
-
} else {
|
|
358
|
-
console.error(formattedMessage);
|
|
129
|
+
// src/config/load.ts
|
|
130
|
+
import fs from "node:fs/promises";
|
|
131
|
+
import path from "node:path";
|
|
132
|
+
import { pathToFileURL } from "node:url";
|
|
133
|
+
async function loadScenarioProjectConfig() {
|
|
134
|
+
const cwd = process.cwd();
|
|
135
|
+
const configNames = [
|
|
136
|
+
"scenario.config.js",
|
|
137
|
+
"scenario.config.mjs"
|
|
138
|
+
];
|
|
139
|
+
for (const name of configNames) {
|
|
140
|
+
const fullPath = path.join(cwd, name);
|
|
141
|
+
try {
|
|
142
|
+
await fs.access(fullPath);
|
|
143
|
+
const configModule = await import(pathToFileURL(fullPath).href);
|
|
144
|
+
const config2 = configModule.default || configModule;
|
|
145
|
+
const parsed = scenarioProjectConfigSchema.safeParse(config2);
|
|
146
|
+
if (!parsed.success) {
|
|
147
|
+
throw new Error(
|
|
148
|
+
`Invalid config file ${name}: ${JSON.stringify(parsed.error.format(), null, 2)}`
|
|
149
|
+
);
|
|
359
150
|
}
|
|
360
|
-
|
|
361
|
-
|
|
362
|
-
|
|
363
|
-
|
|
364
|
-
const formattedMessage = this.formatMessage(message2);
|
|
365
|
-
if (data) {
|
|
366
|
-
console.warn(formattedMessage, data);
|
|
367
|
-
} else {
|
|
368
|
-
console.warn(formattedMessage);
|
|
151
|
+
return parsed.data;
|
|
152
|
+
} catch (error) {
|
|
153
|
+
if (error instanceof Error && "code" in error && error.code === "ENOENT") {
|
|
154
|
+
continue;
|
|
369
155
|
}
|
|
156
|
+
throw error;
|
|
370
157
|
}
|
|
371
158
|
}
|
|
372
|
-
|
|
373
|
-
|
|
374
|
-
|
|
375
|
-
|
|
376
|
-
|
|
377
|
-
|
|
378
|
-
|
|
379
|
-
|
|
380
|
-
|
|
159
|
+
return await scenarioProjectConfigSchema.parseAsync({});
|
|
160
|
+
}
|
|
161
|
+
|
|
162
|
+
// src/config/index.ts
|
|
163
|
+
var logger = new Logger("scenario.config");
|
|
164
|
+
var configLoaded = false;
|
|
165
|
+
var config = null;
|
|
166
|
+
var configLoadPromise = null;
|
|
167
|
+
async function loadProjectConfig() {
|
|
168
|
+
if (configLoaded) {
|
|
169
|
+
return;
|
|
381
170
|
}
|
|
382
|
-
|
|
383
|
-
|
|
384
|
-
const formattedMessage = this.formatMessage(message2);
|
|
385
|
-
if (data) {
|
|
386
|
-
console.log(formattedMessage, data);
|
|
387
|
-
} else {
|
|
388
|
-
console.log(formattedMessage);
|
|
389
|
-
}
|
|
390
|
-
}
|
|
171
|
+
if (configLoadPromise) {
|
|
172
|
+
return configLoadPromise;
|
|
391
173
|
}
|
|
392
|
-
|
|
174
|
+
configLoadPromise = (async () => {
|
|
175
|
+
try {
|
|
176
|
+
config = await loadScenarioProjectConfig();
|
|
177
|
+
logger.info("loaded scenario project config", { config });
|
|
178
|
+
} catch (error) {
|
|
179
|
+
logger.error("error loading scenario project config", { error });
|
|
180
|
+
} finally {
|
|
181
|
+
configLoaded = true;
|
|
182
|
+
}
|
|
183
|
+
})();
|
|
184
|
+
return configLoadPromise;
|
|
185
|
+
}
|
|
186
|
+
async function getProjectConfig() {
|
|
187
|
+
await loadProjectConfig();
|
|
188
|
+
return config;
|
|
189
|
+
}
|
|
393
190
|
|
|
394
|
-
// src/
|
|
395
|
-
|
|
396
|
-
|
|
397
|
-
|
|
398
|
-
|
|
399
|
-
|
|
400
|
-
|
|
401
|
-
|
|
402
|
-
|
|
403
|
-
|
|
191
|
+
// src/utils/config.ts
|
|
192
|
+
function mergeConfig(config2, projectConfig) {
|
|
193
|
+
if (!projectConfig) {
|
|
194
|
+
return config2;
|
|
195
|
+
}
|
|
196
|
+
return {
|
|
197
|
+
...projectConfig.defaultModel,
|
|
198
|
+
...config2
|
|
199
|
+
};
|
|
200
|
+
}
|
|
201
|
+
function mergeAndValidateConfig(config2, projectConfig) {
|
|
202
|
+
var _a;
|
|
203
|
+
const mergedConfig = mergeConfig(config2, projectConfig);
|
|
204
|
+
mergedConfig.model = mergedConfig.model ?? ((_a = projectConfig == null ? void 0 : projectConfig.defaultModel) == null ? void 0 : _a.model);
|
|
205
|
+
if (!mergedConfig.model) {
|
|
206
|
+
throw new Error("Model is required");
|
|
207
|
+
}
|
|
208
|
+
return mergedConfig;
|
|
209
|
+
}
|
|
210
|
+
|
|
211
|
+
// src/agents/judge-agent.ts
|
|
212
|
+
function buildSystemPrompt(criteria, description) {
|
|
213
|
+
const criteriaList = (criteria == null ? void 0 : criteria.map((criterion, idx) => `${idx + 1}. ${criterion}`).join("\n")) || "No criteria provided";
|
|
214
|
+
return `
|
|
215
|
+
<role>
|
|
216
|
+
You are an LLM as a judge watching a simulated conversation as it plays out live to determine if the agent under test meets the criteria or not.
|
|
217
|
+
</role>
|
|
218
|
+
|
|
219
|
+
<goal>
|
|
220
|
+
Your goal is to determine if you already have enough information to make a verdict of the scenario below, or if the conversation should continue for longer.
|
|
221
|
+
If you do have enough information, use the finish_test tool to determine if all the criteria have been met, if not, use the continue_test tool to let the next step play out.
|
|
222
|
+
</goal>
|
|
223
|
+
|
|
224
|
+
<scenario>
|
|
225
|
+
${description}
|
|
226
|
+
</scenario>
|
|
227
|
+
|
|
228
|
+
<criteria>
|
|
229
|
+
${criteriaList}
|
|
230
|
+
</criteria>
|
|
231
|
+
|
|
232
|
+
<rules>
|
|
233
|
+
- Be strict, do not let the conversation continue if the agent already broke one of the "do not" or "should not" criteria.
|
|
234
|
+
- DO NOT make any judgment calls that are not explicitly listed in the success or failure criteria, withhold judgement if necessary
|
|
235
|
+
</rules>
|
|
236
|
+
`.trim();
|
|
237
|
+
}
|
|
238
|
+
function buildContinueTestTool() {
|
|
239
|
+
return tool({
|
|
240
|
+
description: "Continue the test with the next step",
|
|
241
|
+
parameters: z2.object({})
|
|
242
|
+
});
|
|
243
|
+
}
|
|
244
|
+
function buildFinishTestTool(criteria) {
|
|
245
|
+
const criteriaNames = criteria.map(criterionToParamName);
|
|
246
|
+
return tool({
|
|
247
|
+
description: "Complete the test with a final verdict",
|
|
248
|
+
parameters: z2.object({
|
|
249
|
+
criteria: z2.object(
|
|
250
|
+
Object.fromEntries(
|
|
251
|
+
criteriaNames.map((name, idx) => [
|
|
252
|
+
name,
|
|
253
|
+
z2.enum(["true", "false", "inconclusive"]).describe(criteria[idx])
|
|
254
|
+
])
|
|
255
|
+
)
|
|
256
|
+
).strict().describe("Strict verdict for each criterion"),
|
|
257
|
+
reasoning: z2.string().describe("Explanation of what the final verdict should be"),
|
|
258
|
+
verdict: z2.enum(["success", "failure", "inconclusive"]).describe("The final verdict of the test")
|
|
259
|
+
})
|
|
260
|
+
});
|
|
404
261
|
}
|
|
262
|
+
var judgeAgent = (cfg) => {
|
|
263
|
+
return {
|
|
264
|
+
role: "Judge" /* JUDGE */,
|
|
265
|
+
criteria: cfg.criteria,
|
|
266
|
+
call: async (input) => {
|
|
267
|
+
var _a;
|
|
268
|
+
const systemPrompt = cfg.systemPrompt ?? buildSystemPrompt(cfg.criteria, input.scenarioConfig.description);
|
|
269
|
+
const messages = [
|
|
270
|
+
{ role: "system", content: systemPrompt },
|
|
271
|
+
...input.messages
|
|
272
|
+
];
|
|
273
|
+
const isLastMessage = input.scenarioState.currentTurn === input.scenarioConfig.maxTurns;
|
|
274
|
+
const projectConfig = await getProjectConfig();
|
|
275
|
+
const mergedConfig = mergeAndValidateConfig(cfg, projectConfig);
|
|
276
|
+
if (!mergedConfig.model) {
|
|
277
|
+
throw new Error("Model is required for the judge agent");
|
|
278
|
+
}
|
|
279
|
+
const tools = {
|
|
280
|
+
continue_test: buildContinueTestTool(),
|
|
281
|
+
finish_test: buildFinishTestTool(cfg.criteria)
|
|
282
|
+
};
|
|
283
|
+
const enforceJudgement = input.judgmentRequest;
|
|
284
|
+
const hasCriteria = cfg.criteria.length && cfg.criteria.length > 0;
|
|
285
|
+
if (enforceJudgement && !hasCriteria) {
|
|
286
|
+
return {
|
|
287
|
+
success: false,
|
|
288
|
+
messages: [],
|
|
289
|
+
reasoning: "JudgeAgent: No criteria was provided to be judged against",
|
|
290
|
+
metCriteria: [],
|
|
291
|
+
unmetCriteria: []
|
|
292
|
+
};
|
|
293
|
+
}
|
|
294
|
+
const toolChoice = (isLastMessage || enforceJudgement) && hasCriteria ? { type: "tool", toolName: "finish_test" } : "required";
|
|
295
|
+
const completion = await generateText({
|
|
296
|
+
model: mergedConfig.model,
|
|
297
|
+
messages,
|
|
298
|
+
temperature: mergedConfig.temperature ?? 0,
|
|
299
|
+
maxTokens: mergedConfig.maxTokens,
|
|
300
|
+
tools,
|
|
301
|
+
toolChoice
|
|
302
|
+
});
|
|
303
|
+
let args;
|
|
304
|
+
if ((_a = completion.toolCalls) == null ? void 0 : _a.length) {
|
|
305
|
+
const toolCall = completion.toolCalls[0];
|
|
306
|
+
switch (toolCall.toolName) {
|
|
307
|
+
case "finish_test": {
|
|
308
|
+
args = toolCall.args;
|
|
309
|
+
const verdict = args.verdict || "inconclusive";
|
|
310
|
+
const reasoning = args.reasoning || "No reasoning provided";
|
|
311
|
+
const criteria = args.criteria || {};
|
|
312
|
+
const criteriaValues = Object.values(criteria);
|
|
313
|
+
const metCriteria = cfg.criteria.filter((_, i) => criteriaValues[i] === "true");
|
|
314
|
+
const unmetCriteria = cfg.criteria.filter((_, i) => criteriaValues[i] !== "true");
|
|
315
|
+
return {
|
|
316
|
+
success: verdict === "success",
|
|
317
|
+
messages: input.messages,
|
|
318
|
+
reasoning,
|
|
319
|
+
metCriteria,
|
|
320
|
+
unmetCriteria
|
|
321
|
+
};
|
|
322
|
+
}
|
|
323
|
+
case "continue_test":
|
|
324
|
+
return [];
|
|
325
|
+
default:
|
|
326
|
+
return {
|
|
327
|
+
success: false,
|
|
328
|
+
messages: input.messages,
|
|
329
|
+
reasoning: `JudgeAgent: Unknown tool call: ${toolCall.toolName}`,
|
|
330
|
+
metCriteria: [],
|
|
331
|
+
unmetCriteria: cfg.criteria
|
|
332
|
+
};
|
|
333
|
+
}
|
|
334
|
+
}
|
|
335
|
+
return {
|
|
336
|
+
success: false,
|
|
337
|
+
messages: input.messages,
|
|
338
|
+
reasoning: `JudgeAgent: No tool call found in LLM output`,
|
|
339
|
+
metCriteria: [],
|
|
340
|
+
unmetCriteria: cfg.criteria
|
|
341
|
+
};
|
|
342
|
+
}
|
|
343
|
+
};
|
|
344
|
+
};
|
|
345
|
+
|
|
346
|
+
// src/agents/user-simulator-agent.ts
|
|
347
|
+
import { generateText as generateText2 } from "ai";
|
|
348
|
+
function buildSystemPrompt2(description) {
|
|
349
|
+
return `
|
|
350
|
+
<role>
|
|
351
|
+
You are pretending to be a user, you are testing an AI Agent (shown as the user role) based on a scenario.
|
|
352
|
+
Approach this naturally, as a human user would, with very short inputs, few words, all lowercase, imperative, not periods, like when they google or talk to chatgpt.
|
|
353
|
+
</role>
|
|
354
|
+
|
|
355
|
+
<goal>
|
|
356
|
+
Your goal (assistant) is to interact with the Agent Under Test (user) as if you were a human user to see if it can complete the scenario successfully.
|
|
357
|
+
</goal>
|
|
358
|
+
|
|
359
|
+
<scenario>
|
|
360
|
+
${description}
|
|
361
|
+
</scenario>
|
|
362
|
+
|
|
363
|
+
<rules>
|
|
364
|
+
- DO NOT carry over any requests yourself, YOU ARE NOT the assistant today, you are the user
|
|
365
|
+
</rules>
|
|
366
|
+
`.trim();
|
|
367
|
+
}
|
|
368
|
+
var userSimulatorAgent = (config2) => {
|
|
369
|
+
return {
|
|
370
|
+
role: "User" /* USER */,
|
|
371
|
+
call: async (input) => {
|
|
372
|
+
const systemPrompt = buildSystemPrompt2(input.scenarioConfig.description);
|
|
373
|
+
const messages = [
|
|
374
|
+
{ role: "system", content: systemPrompt },
|
|
375
|
+
{ role: "assistant", content: "Hello, how can I help you today" },
|
|
376
|
+
...input.messages
|
|
377
|
+
];
|
|
378
|
+
const projectConfig = await getProjectConfig();
|
|
379
|
+
const mergedConfig = mergeAndValidateConfig(config2 ?? {}, projectConfig);
|
|
380
|
+
if (!mergedConfig.model) {
|
|
381
|
+
throw new Error("Model is required for the user simulator agent");
|
|
382
|
+
}
|
|
383
|
+
const reversedMessages = messageRoleReversal(messages);
|
|
384
|
+
const completion = await generateText2({
|
|
385
|
+
model: mergedConfig.model,
|
|
386
|
+
messages: reversedMessages,
|
|
387
|
+
temperature: mergedConfig.temperature ?? 0,
|
|
388
|
+
maxTokens: mergedConfig.maxTokens
|
|
389
|
+
});
|
|
390
|
+
const messageContent = completion.text;
|
|
391
|
+
if (!messageContent) {
|
|
392
|
+
throw new Error("No response content from LLM");
|
|
393
|
+
}
|
|
394
|
+
return { role: "user", content: messageContent };
|
|
395
|
+
}
|
|
396
|
+
};
|
|
397
|
+
};
|
|
398
|
+
|
|
399
|
+
// src/execution/index.ts
|
|
400
|
+
var execution_exports = {};
|
|
401
|
+
__export(execution_exports, {
|
|
402
|
+
ScenarioExecution: () => ScenarioExecution,
|
|
403
|
+
ScenarioExecutionState: () => ScenarioExecutionState
|
|
404
|
+
});
|
|
405
|
+
|
|
406
|
+
// src/execution/scenario-execution.ts
|
|
407
|
+
import { Subject } from "rxjs";
|
|
408
|
+
|
|
409
|
+
// src/utils/ids.ts
|
|
410
|
+
import { generate, parse } from "xksuid";
|
|
411
|
+
var batchRunId = null;
|
|
412
|
+
function generateThreadId() {
|
|
413
|
+
return `thread_${generate()}`;
|
|
414
|
+
}
|
|
415
|
+
function generateScenarioRunId() {
|
|
416
|
+
return `scenariorun_${generate()}`;
|
|
417
|
+
}
|
|
418
|
+
function generateScenarioId() {
|
|
419
|
+
return `scenario_${generate()}`;
|
|
420
|
+
}
|
|
421
|
+
function getBatchRunId() {
|
|
422
|
+
if (!batchRunId) {
|
|
423
|
+
batchRunId = process.env.SCENARIO_BATCH_RUN_ID ?? `scenariobatchrun_${generate()}`;
|
|
424
|
+
}
|
|
425
|
+
return batchRunId;
|
|
426
|
+
}
|
|
427
|
+
function generateMessageId() {
|
|
428
|
+
return `scenariomsg_${generate()}`;
|
|
429
|
+
}
|
|
430
|
+
|
|
431
|
+
// src/execution/scenario-execution-state.ts
|
|
432
|
+
var ScenarioExecutionState = class {
|
|
433
|
+
_messages = [];
|
|
434
|
+
_currentTurn = 0;
|
|
435
|
+
_threadId = "";
|
|
436
|
+
description;
|
|
437
|
+
config;
|
|
438
|
+
constructor(config2) {
|
|
439
|
+
this.config = config2;
|
|
440
|
+
this.description = config2.description;
|
|
441
|
+
}
|
|
442
|
+
get messages() {
|
|
443
|
+
return this._messages;
|
|
444
|
+
}
|
|
445
|
+
get currentTurn() {
|
|
446
|
+
return this._currentTurn;
|
|
447
|
+
}
|
|
448
|
+
set currentTurn(turn) {
|
|
449
|
+
this._currentTurn = turn;
|
|
450
|
+
}
|
|
451
|
+
get threadId() {
|
|
452
|
+
return this._threadId;
|
|
453
|
+
}
|
|
454
|
+
set threadId(value) {
|
|
455
|
+
this._threadId = value;
|
|
456
|
+
}
|
|
457
|
+
/**
|
|
458
|
+
* Adds a message to the conversation history.
|
|
459
|
+
*
|
|
460
|
+
* @param message - The message to add.
|
|
461
|
+
*/
|
|
462
|
+
addMessage(message2) {
|
|
463
|
+
this._messages.push({ ...message2, id: generateMessageId() });
|
|
464
|
+
}
|
|
465
|
+
lastMessage() {
|
|
466
|
+
if (this._messages.length === 0) {
|
|
467
|
+
throw new Error("No messages in history");
|
|
468
|
+
}
|
|
469
|
+
return this._messages[this._messages.length - 1];
|
|
470
|
+
}
|
|
471
|
+
lastUserMessage() {
|
|
472
|
+
if (this._messages.length === 0) {
|
|
473
|
+
throw new Error("No messages in history");
|
|
474
|
+
}
|
|
475
|
+
const lastMessage = this._messages.findLast((message2) => message2.role === "user");
|
|
476
|
+
if (!lastMessage) {
|
|
477
|
+
throw new Error("No user message in history");
|
|
478
|
+
}
|
|
479
|
+
return lastMessage;
|
|
480
|
+
}
|
|
481
|
+
lastToolCall(toolName) {
|
|
482
|
+
if (this._messages.length === 0) {
|
|
483
|
+
throw new Error("No messages in history");
|
|
484
|
+
}
|
|
485
|
+
const lastMessage = this._messages.findLast((message2) => message2.role === "tool" && message2.content.find(
|
|
486
|
+
(part) => part.type === "tool-result" && part.toolName === toolName
|
|
487
|
+
));
|
|
488
|
+
if (!lastMessage) {
|
|
489
|
+
throw new Error("No tool call message in history");
|
|
490
|
+
}
|
|
491
|
+
return lastMessage;
|
|
492
|
+
}
|
|
493
|
+
hasToolCall(toolName) {
|
|
494
|
+
return this._messages.some(
|
|
495
|
+
(message2) => message2.role === "tool" && message2.content.find(
|
|
496
|
+
(part) => part.type === "tool-result" && part.toolName === toolName
|
|
497
|
+
)
|
|
498
|
+
);
|
|
499
|
+
}
|
|
500
|
+
};
|
|
501
|
+
|
|
502
|
+
// src/execution/scenario-execution.ts
|
|
503
|
+
var batchRunId2 = getBatchRunId();
|
|
405
504
|
var ScenarioExecution = class {
|
|
406
|
-
state
|
|
505
|
+
state;
|
|
407
506
|
eventSubject = new Subject();
|
|
408
507
|
logger = new Logger("scenario.execution.ScenarioExecution");
|
|
409
508
|
config;
|
|
509
|
+
agents = [];
|
|
510
|
+
pendingRolesOnTurn = [];
|
|
511
|
+
pendingAgentsOnTurn = /* @__PURE__ */ new Set();
|
|
512
|
+
pendingMessages = /* @__PURE__ */ new Map();
|
|
513
|
+
partialResult = null;
|
|
514
|
+
agentTimes = /* @__PURE__ */ new Map();
|
|
515
|
+
totalStartTime = 0;
|
|
410
516
|
/**
|
|
411
517
|
* An observable stream of events that occur during the scenario execution.
|
|
412
518
|
* Subscribe to this to monitor the progress of the scenario in real-time.
|
|
@@ -426,15 +532,17 @@ var ScenarioExecution = class {
|
|
|
426
532
|
script,
|
|
427
533
|
verbose: config2.verbose ?? false,
|
|
428
534
|
maxTurns: config2.maxTurns ?? 10,
|
|
429
|
-
threadId: config2.threadId ?? generateThreadId()
|
|
535
|
+
threadId: config2.threadId ?? generateThreadId(),
|
|
536
|
+
setId: config2.setId
|
|
430
537
|
};
|
|
538
|
+
this.state = new ScenarioExecutionState(this.config);
|
|
431
539
|
this.reset();
|
|
432
540
|
}
|
|
433
541
|
/**
|
|
434
542
|
* The history of messages in the conversation.
|
|
435
543
|
*/
|
|
436
|
-
get
|
|
437
|
-
return this.state.
|
|
544
|
+
get messages() {
|
|
545
|
+
return this.state.messages;
|
|
438
546
|
}
|
|
439
547
|
/**
|
|
440
548
|
* The unique identifier for the conversation thread.
|
|
@@ -442,6 +550,12 @@ var ScenarioExecution = class {
|
|
|
442
550
|
get threadId() {
|
|
443
551
|
return this.state.threadId;
|
|
444
552
|
}
|
|
553
|
+
/**
|
|
554
|
+
* The total elapsed time for the scenario execution.
|
|
555
|
+
*/
|
|
556
|
+
get totalTime() {
|
|
557
|
+
return Date.now() - this.totalStartTime;
|
|
558
|
+
}
|
|
445
559
|
/**
|
|
446
560
|
* Executes the entire scenario from start to finish.
|
|
447
561
|
* This will run through the script and any automatic proceeding logic until a
|
|
@@ -462,7 +576,8 @@ var ScenarioExecution = class {
|
|
|
462
576
|
if (result && typeof result === "object" && "success" in result) {
|
|
463
577
|
this.emitRunFinished({
|
|
464
578
|
scenarioRunId,
|
|
465
|
-
status: result.success ? "SUCCESS" /* SUCCESS */ : "FAILED" /* FAILED
|
|
579
|
+
status: result.success ? "SUCCESS" /* SUCCESS */ : "FAILED" /* FAILED */,
|
|
580
|
+
result
|
|
466
581
|
});
|
|
467
582
|
return result;
|
|
468
583
|
}
|
|
@@ -475,11 +590,20 @@ var ScenarioExecution = class {
|
|
|
475
590
|
"- `Scenario.succeed()` or `Scenario.fail()` to end the test with an explicit result"
|
|
476
591
|
].join("\n"));
|
|
477
592
|
} catch (error) {
|
|
593
|
+
const errorResult = {
|
|
594
|
+
success: false,
|
|
595
|
+
messages: this.state.messages,
|
|
596
|
+
reasoning: `Scenario failed with error: ${error instanceof Error ? error.message : String(error)}`,
|
|
597
|
+
metCriteria: [],
|
|
598
|
+
unmetCriteria: [],
|
|
599
|
+
error: error instanceof Error ? error.message : String(error)
|
|
600
|
+
};
|
|
478
601
|
this.emitRunFinished({
|
|
479
602
|
scenarioRunId,
|
|
480
|
-
status: "ERROR" /* ERROR
|
|
603
|
+
status: "ERROR" /* ERROR */,
|
|
604
|
+
result: errorResult
|
|
481
605
|
});
|
|
482
|
-
|
|
606
|
+
return errorResult;
|
|
483
607
|
}
|
|
484
608
|
}
|
|
485
609
|
/**
|
|
@@ -494,29 +618,29 @@ var ScenarioExecution = class {
|
|
|
494
618
|
return result;
|
|
495
619
|
}
|
|
496
620
|
async _step(goToNextTurn = true, onTurn) {
|
|
497
|
-
if (this.
|
|
621
|
+
if (this.pendingRolesOnTurn.length === 0) {
|
|
498
622
|
if (!goToNextTurn) return null;
|
|
499
|
-
this.
|
|
623
|
+
this.newTurn();
|
|
500
624
|
if (onTurn) await onTurn(this.state);
|
|
501
|
-
if (this.state.
|
|
625
|
+
if (this.state.currentTurn >= this.config.maxTurns)
|
|
502
626
|
return this.reachedMaxTurns();
|
|
503
627
|
}
|
|
504
|
-
const currentRole = this.
|
|
628
|
+
const currentRole = this.pendingRolesOnTurn[0];
|
|
505
629
|
const { idx, agent: nextAgent } = this.nextAgentForRole(currentRole);
|
|
506
630
|
if (!nextAgent) {
|
|
507
|
-
this.
|
|
631
|
+
this.removePendingRole(currentRole);
|
|
508
632
|
return this._step(goToNextTurn, onTurn);
|
|
509
633
|
}
|
|
510
|
-
this.
|
|
634
|
+
this.removePendingAgent(nextAgent);
|
|
511
635
|
return await this.callAgent(idx, currentRole);
|
|
512
636
|
}
|
|
513
637
|
async callAgent(idx, role, judgmentRequest = false) {
|
|
514
|
-
const agent2 = this.
|
|
638
|
+
const agent2 = this.agents[idx];
|
|
515
639
|
const startTime = Date.now();
|
|
516
640
|
const agentInput = {
|
|
517
641
|
threadId: this.state.threadId,
|
|
518
|
-
messages: this.state.
|
|
519
|
-
newMessages: this.
|
|
642
|
+
messages: this.state.messages,
|
|
643
|
+
newMessages: this.pendingMessages.get(idx) ?? [],
|
|
520
644
|
requestedRole: role,
|
|
521
645
|
judgmentRequest,
|
|
522
646
|
scenarioState: this.state,
|
|
@@ -524,106 +648,22 @@ var ScenarioExecution = class {
|
|
|
524
648
|
};
|
|
525
649
|
const agentResponse = await agent2.call(agentInput);
|
|
526
650
|
const endTime = Date.now();
|
|
527
|
-
this.
|
|
528
|
-
this.
|
|
529
|
-
if (typeof agentResponse === "object" &&
|
|
651
|
+
this.addAgentTime(idx, endTime - startTime);
|
|
652
|
+
this.pendingMessages.delete(idx);
|
|
653
|
+
if (agentResponse && typeof agentResponse === "object" && "success" in agentResponse) {
|
|
530
654
|
return agentResponse;
|
|
531
655
|
}
|
|
656
|
+
const currentAgentTime = this.agentTimes.get(idx) ?? 0;
|
|
657
|
+
this.agentTimes.set(idx, currentAgentTime + (Date.now() - startTime));
|
|
532
658
|
const messages = convertAgentReturnTypesToMessages(
|
|
533
659
|
agentResponse,
|
|
534
660
|
role === "User" /* USER */ ? "user" : "assistant"
|
|
535
661
|
);
|
|
536
|
-
|
|
537
|
-
|
|
538
|
-
|
|
539
|
-
nextAgentForRole(role) {
|
|
540
|
-
for (const agent2 of this.state.agents) {
|
|
541
|
-
if (agent2.role === role && this.state.pendingAgentsOnTurn.includes(agent2) && this.state.pendingRolesOnTurn.includes(role)) {
|
|
542
|
-
return { idx: this.state.agents.indexOf(agent2), agent: agent2 };
|
|
543
|
-
}
|
|
544
|
-
}
|
|
545
|
-
return { idx: -1, agent: null };
|
|
546
|
-
}
|
|
547
|
-
reachedMaxTurns(errorMessage) {
|
|
548
|
-
var _a;
|
|
549
|
-
const agentRoleAgentsIdx = this.state.agents.map((agent2, i) => ({ agent: agent2, idx: i })).filter(({ agent: agent2 }) => agent2.role === "Agent" /* AGENT */).map(({ idx }) => idx);
|
|
550
|
-
const agentTimes = agentRoleAgentsIdx.map((i) => this.state.agentTimes.get(i) || 0);
|
|
551
|
-
const totalAgentTime = agentTimes.reduce((sum, time) => sum + time, 0);
|
|
552
|
-
return {
|
|
553
|
-
success: false,
|
|
554
|
-
messages: this.state.history,
|
|
555
|
-
reasoning: errorMessage || `Reached maximum turns (${this.config.maxTurns || 10}) without conclusion`,
|
|
556
|
-
passedCriteria: [],
|
|
557
|
-
failedCriteria: ((_a = this.getJudgeAgent()) == null ? void 0 : _a.criteria) ?? [],
|
|
558
|
-
totalTime: this.state.totalTime,
|
|
559
|
-
agentTime: totalAgentTime
|
|
560
|
-
};
|
|
561
|
-
}
|
|
562
|
-
getJudgeAgent() {
|
|
563
|
-
return this.state.agents.find((agent2) => agent2 instanceof JudgeAgentAdapter) ?? null;
|
|
564
|
-
}
|
|
565
|
-
consumeUntilRole(role) {
|
|
566
|
-
while (this.state.pendingRolesOnTurn.length > 0) {
|
|
567
|
-
const nextRole = this.state.pendingRolesOnTurn[0];
|
|
568
|
-
if (nextRole === role) break;
|
|
569
|
-
this.state.pendingRolesOnTurn.pop();
|
|
570
|
-
}
|
|
571
|
-
}
|
|
572
|
-
async scriptCallAgent(role, content, judgmentRequest = false) {
|
|
573
|
-
this.consumeUntilRole(role);
|
|
574
|
-
let index = -1;
|
|
575
|
-
let agent2 = null;
|
|
576
|
-
const nextAgent = this.state.getNextAgentForRole(role);
|
|
577
|
-
if (!nextAgent) {
|
|
578
|
-
this.state.newTurn();
|
|
579
|
-
this.consumeUntilRole(role);
|
|
580
|
-
const nextAgent2 = this.state.getNextAgentForRole(role);
|
|
581
|
-
if (!nextAgent2) {
|
|
582
|
-
let roleClass = "";
|
|
583
|
-
switch (role) {
|
|
584
|
-
case "User" /* USER */:
|
|
585
|
-
roleClass = "a scenario.userSimulatorAgent()";
|
|
586
|
-
break;
|
|
587
|
-
case "Agent" /* AGENT */:
|
|
588
|
-
roleClass = "a scenario.agent()";
|
|
589
|
-
break;
|
|
590
|
-
case "Judge" /* JUDGE */:
|
|
591
|
-
roleClass = "a scenario.judgeAgent()";
|
|
592
|
-
break;
|
|
593
|
-
default:
|
|
594
|
-
roleClass = "your agent";
|
|
595
|
-
}
|
|
596
|
-
if (content)
|
|
597
|
-
throw new Error(
|
|
598
|
-
`Cannot generate a message for role \`${role}\` with content \`${content}\` because no agent with this role was found, please add ${roleClass} to the scenario \`agents\` list`
|
|
599
|
-
);
|
|
600
|
-
throw new Error(
|
|
601
|
-
`Cannot generate a message for role \`${role}\` because no agent with this role was found, please add ${roleClass} to the scenario \`agents\` list`
|
|
602
|
-
);
|
|
603
|
-
}
|
|
604
|
-
index = nextAgent2.index;
|
|
605
|
-
agent2 = nextAgent2.agent;
|
|
606
|
-
} else {
|
|
607
|
-
index = nextAgent.index;
|
|
608
|
-
agent2 = nextAgent.agent;
|
|
609
|
-
}
|
|
610
|
-
this.state.removePendingAgent(agent2);
|
|
611
|
-
if (content) {
|
|
612
|
-
if (typeof content === "string") {
|
|
613
|
-
if (role === "User" /* USER */) {
|
|
614
|
-
this.state.addMessage({ role: "user", content });
|
|
615
|
-
} else {
|
|
616
|
-
this.state.addMessage({ role: "assistant", content });
|
|
617
|
-
}
|
|
618
|
-
} else {
|
|
619
|
-
this.state.addMessage(content);
|
|
620
|
-
}
|
|
621
|
-
return null;
|
|
662
|
+
for (const message2 of messages) {
|
|
663
|
+
this.state.addMessage(message2);
|
|
664
|
+
this.broadcastMessage(message2, idx);
|
|
622
665
|
}
|
|
623
|
-
|
|
624
|
-
if (Array.isArray(result))
|
|
625
|
-
return null;
|
|
626
|
-
return result;
|
|
666
|
+
return messages;
|
|
627
667
|
}
|
|
628
668
|
/**
|
|
629
669
|
* Adds a message to the conversation history.
|
|
@@ -637,6 +677,7 @@ var ScenarioExecution = class {
|
|
|
637
677
|
await this.scriptCallAgent("Agent" /* AGENT */, message2);
|
|
638
678
|
} else {
|
|
639
679
|
this.state.addMessage(message2);
|
|
680
|
+
this.broadcastMessage(message2);
|
|
640
681
|
}
|
|
641
682
|
}
|
|
642
683
|
/**
|
|
@@ -678,12 +719,12 @@ var ScenarioExecution = class {
|
|
|
678
719
|
* @returns A promise that resolves with the scenario result if a conclusion is reached.
|
|
679
720
|
*/
|
|
680
721
|
async proceed(turns, onTurn, onStep) {
|
|
681
|
-
let initialTurn = this.state.
|
|
722
|
+
let initialTurn = this.state.currentTurn;
|
|
682
723
|
while (true) {
|
|
683
|
-
const goToNextTurn = turns === void 0 || initialTurn === null || this.state.
|
|
724
|
+
const goToNextTurn = turns === void 0 || initialTurn === null || this.state.currentTurn != null && this.state.currentTurn + 1 < initialTurn + turns;
|
|
684
725
|
const nextMessage = await this._step(goToNextTurn, onTurn);
|
|
685
726
|
if (initialTurn === null)
|
|
686
|
-
initialTurn = this.state.
|
|
727
|
+
initialTurn = this.state.currentTurn;
|
|
687
728
|
if (nextMessage === null) {
|
|
688
729
|
return null;
|
|
689
730
|
}
|
|
@@ -701,10 +742,10 @@ var ScenarioExecution = class {
|
|
|
701
742
|
async succeed(reasoning) {
|
|
702
743
|
return {
|
|
703
744
|
success: true,
|
|
704
|
-
messages: this.state.
|
|
745
|
+
messages: this.state.messages,
|
|
705
746
|
reasoning: reasoning || "Scenario marked as successful with Scenario.succeed()",
|
|
706
|
-
|
|
707
|
-
|
|
747
|
+
metCriteria: [],
|
|
748
|
+
unmetCriteria: []
|
|
708
749
|
};
|
|
709
750
|
}
|
|
710
751
|
/**
|
|
@@ -716,656 +757,385 @@ var ScenarioExecution = class {
|
|
|
716
757
|
async fail(reasoning) {
|
|
717
758
|
return {
|
|
718
759
|
success: false,
|
|
719
|
-
messages: this.state.
|
|
760
|
+
messages: this.state.messages,
|
|
720
761
|
reasoning: reasoning || "Scenario marked as failed with Scenario.fail()",
|
|
721
|
-
|
|
722
|
-
|
|
762
|
+
metCriteria: [],
|
|
763
|
+
unmetCriteria: []
|
|
723
764
|
};
|
|
724
765
|
}
|
|
725
|
-
|
|
726
|
-
|
|
727
|
-
this.
|
|
728
|
-
this.state.setAgents(this.config.agents);
|
|
729
|
-
this.state.newTurn();
|
|
730
|
-
this.state.turn = 0;
|
|
731
|
-
}
|
|
732
|
-
// =====================================================
|
|
733
|
-
// Event Emission Methods
|
|
734
|
-
// =====================================================
|
|
735
|
-
// These methods handle the creation and emission of
|
|
736
|
-
// scenario events for external consumption and monitoring
|
|
737
|
-
// =====================================================
|
|
738
|
-
/**
|
|
739
|
-
* Emits an event to the event stream for external consumption.
|
|
740
|
-
*/
|
|
741
|
-
emitEvent(event) {
|
|
742
|
-
this.eventSubject.next(event);
|
|
743
|
-
}
|
|
744
|
-
/**
|
|
745
|
-
* Creates base event properties shared across all scenario events.
|
|
746
|
-
*/
|
|
747
|
-
makeBaseEvent({ scenarioRunId }) {
|
|
748
|
-
return {
|
|
749
|
-
batchRunId: batchRunId2,
|
|
750
|
-
scenarioId: this.config.id,
|
|
751
|
-
scenarioRunId,
|
|
752
|
-
timestamp: Date.now(),
|
|
753
|
-
rawEvent: void 0
|
|
754
|
-
};
|
|
755
|
-
}
|
|
756
|
-
/**
|
|
757
|
-
* Emits a run started event to indicate scenario execution has begun.
|
|
758
|
-
*/
|
|
759
|
-
emitRunStarted({ scenarioRunId }) {
|
|
760
|
-
this.emitEvent({
|
|
761
|
-
...this.makeBaseEvent({ scenarioRunId }),
|
|
762
|
-
type: "SCENARIO_RUN_STARTED" /* RUN_STARTED */,
|
|
763
|
-
metadata: {
|
|
764
|
-
name: this.config.name,
|
|
765
|
-
description: this.config.description
|
|
766
|
-
}
|
|
767
|
-
});
|
|
766
|
+
addAgentTime(agentIdx, time) {
|
|
767
|
+
const currentTime = this.agentTimes.get(agentIdx) || 0;
|
|
768
|
+
this.agentTimes.set(agentIdx, currentTime + time);
|
|
768
769
|
}
|
|
769
|
-
|
|
770
|
-
|
|
771
|
-
*/
|
|
772
|
-
emitMessageSnapshot({ scenarioRunId }) {
|
|
773
|
-
this.emitEvent({
|
|
774
|
-
...this.makeBaseEvent({ scenarioRunId }),
|
|
775
|
-
type: "SCENARIO_MESSAGE_SNAPSHOT" /* MESSAGE_SNAPSHOT */,
|
|
776
|
-
messages: this.state.history
|
|
777
|
-
// Add any other required fields from MessagesSnapshotEventSchema
|
|
778
|
-
});
|
|
770
|
+
hasResult() {
|
|
771
|
+
return this.partialResult !== null;
|
|
779
772
|
}
|
|
780
|
-
|
|
781
|
-
|
|
782
|
-
*/
|
|
783
|
-
emitRunFinished({
|
|
784
|
-
scenarioRunId,
|
|
785
|
-
status
|
|
786
|
-
}) {
|
|
787
|
-
this.emitEvent({
|
|
788
|
-
...this.makeBaseEvent({ scenarioRunId }),
|
|
789
|
-
type: "SCENARIO_RUN_FINISHED" /* RUN_FINISHED */,
|
|
790
|
-
status
|
|
791
|
-
// Add error/metrics fields if needed
|
|
792
|
-
});
|
|
773
|
+
setResult(result) {
|
|
774
|
+
this.partialResult = result;
|
|
793
775
|
}
|
|
794
|
-
|
|
795
|
-
|
|
796
|
-
|
|
797
|
-
|
|
798
|
-
|
|
799
|
-
|
|
800
|
-
|
|
801
|
-
|
|
802
|
-
|
|
803
|
-
"scenario.config.js",
|
|
804
|
-
"scenario.config.mjs"
|
|
805
|
-
];
|
|
806
|
-
for (const name of configNames) {
|
|
807
|
-
const fullPath = path.join(cwd, name);
|
|
808
|
-
try {
|
|
809
|
-
await fs.access(fullPath);
|
|
810
|
-
const configModule = await import(pathToFileURL(fullPath).href);
|
|
811
|
-
const config2 = configModule.default || configModule;
|
|
812
|
-
const parsed = scenarioProjectConfigSchema.safeParse(config2);
|
|
813
|
-
if (!parsed.success) {
|
|
814
|
-
throw new Error(
|
|
815
|
-
`Invalid config file ${name}: ${JSON.stringify(parsed.error.format(), null, 2)}`
|
|
816
|
-
);
|
|
817
|
-
}
|
|
818
|
-
return parsed.data;
|
|
819
|
-
} catch (error) {
|
|
820
|
-
if (error instanceof Error && "code" in error && error.code === "ENOENT") {
|
|
821
|
-
continue;
|
|
822
|
-
}
|
|
823
|
-
throw error;
|
|
776
|
+
async scriptCallAgent(role, content, judgmentRequest = false) {
|
|
777
|
+
this.consumeUntilRole(role);
|
|
778
|
+
let index = -1;
|
|
779
|
+
let agent2 = null;
|
|
780
|
+
let nextAgent = this.getNextAgentForRole(role);
|
|
781
|
+
if (!nextAgent) {
|
|
782
|
+
this.newTurn();
|
|
783
|
+
this.consumeUntilRole(role);
|
|
784
|
+
nextAgent = this.getNextAgentForRole(role);
|
|
824
785
|
}
|
|
825
|
-
|
|
826
|
-
|
|
827
|
-
|
|
828
|
-
|
|
829
|
-
|
|
830
|
-
|
|
831
|
-
|
|
832
|
-
|
|
833
|
-
|
|
834
|
-
|
|
835
|
-
|
|
836
|
-
|
|
837
|
-
|
|
838
|
-
|
|
839
|
-
this.apiKey = config2.apiKey ?? "";
|
|
840
|
-
if (!process.env.SCENARIO_DISABLE_SIMULATION_REPORT_INFO) {
|
|
841
|
-
console.log("=== Scenario Simulation Reporting ===");
|
|
842
|
-
if (!this.apiKey) {
|
|
843
|
-
console.warn("LangWatch API key not configured, simulations will be local");
|
|
844
|
-
console.warn(`To enable simulation reporting in the LangWatch dashboard, configure your LangWatch API key (via LANGWATCH_API_KEY, or scenario.config.js)`);
|
|
845
|
-
} else {
|
|
846
|
-
console.log("Simulation reporting is enabled");
|
|
847
|
-
console.log(`Endpoint: ${config2.endpoint} -> ${this.eventsEndpoint.href}`);
|
|
848
|
-
console.log(`API Key: ${!this.apiKey ? "not configured" : "configured"}`);
|
|
786
|
+
if (!nextAgent) {
|
|
787
|
+
let roleClass = "";
|
|
788
|
+
switch (role) {
|
|
789
|
+
case "User" /* USER */:
|
|
790
|
+
roleClass = "a scenario.userSimulatorAgent()";
|
|
791
|
+
break;
|
|
792
|
+
case "Agent" /* AGENT */:
|
|
793
|
+
roleClass = "a scenario.agent()";
|
|
794
|
+
break;
|
|
795
|
+
case "Judge" /* JUDGE */:
|
|
796
|
+
roleClass = "a scenario.judgeAgent()";
|
|
797
|
+
break;
|
|
798
|
+
default:
|
|
799
|
+
roleClass = "your agent";
|
|
849
800
|
}
|
|
850
|
-
|
|
851
|
-
|
|
852
|
-
|
|
853
|
-
|
|
854
|
-
|
|
855
|
-
|
|
856
|
-
*/
|
|
857
|
-
async postEvent(event) {
|
|
858
|
-
this.logger.debug(`[${event.type}] Posting event`, {
|
|
859
|
-
event
|
|
860
|
-
});
|
|
861
|
-
if (!this.eventsEndpoint) {
|
|
862
|
-
this.logger.warn(
|
|
863
|
-
"No LANGWATCH_ENDPOINT configured, skipping event posting"
|
|
801
|
+
if (content)
|
|
802
|
+
throw new Error(
|
|
803
|
+
`Cannot generate a message for role \`${role}\` with content \`${content}\` because no agent with this role was found, please add ${roleClass} to the scenario \`agents\` list`
|
|
804
|
+
);
|
|
805
|
+
throw new Error(
|
|
806
|
+
`Cannot generate a message for role \`${role}\` because no agent with this role was found, please add ${roleClass} to the scenario \`agents\` list`
|
|
864
807
|
);
|
|
865
|
-
return;
|
|
866
808
|
}
|
|
867
|
-
|
|
868
|
-
|
|
869
|
-
|
|
870
|
-
|
|
871
|
-
|
|
872
|
-
|
|
873
|
-
|
|
874
|
-
|
|
875
|
-
});
|
|
876
|
-
this.logger.debug(
|
|
877
|
-
`[${event.type}] Event POST response status: ${response.status}`
|
|
878
|
-
);
|
|
879
|
-
if (response.ok) {
|
|
880
|
-
const data = await response.json();
|
|
881
|
-
this.logger.debug(`[${event.type}] Event POST response:`, data);
|
|
882
|
-
} else {
|
|
883
|
-
const errorText = await response.text();
|
|
884
|
-
this.logger.error(`[${event.type}] Event POST failed:`, {
|
|
885
|
-
status: response.status,
|
|
886
|
-
statusText: response.statusText,
|
|
887
|
-
error: errorText,
|
|
888
|
-
event
|
|
889
|
-
});
|
|
890
|
-
}
|
|
891
|
-
} catch (error) {
|
|
892
|
-
this.logger.error(`[${event.type}] Event POST error:`, {
|
|
893
|
-
error,
|
|
894
|
-
event,
|
|
895
|
-
endpoint: this.eventsEndpoint
|
|
896
|
-
});
|
|
897
|
-
}
|
|
898
|
-
}
|
|
899
|
-
};
|
|
900
|
-
|
|
901
|
-
// src/events/event-bus.ts
|
|
902
|
-
var EventBus = class {
|
|
903
|
-
events$ = new Subject2();
|
|
904
|
-
eventReporter;
|
|
905
|
-
processingPromise = null;
|
|
906
|
-
logger = new Logger("scenario.events.EventBus");
|
|
907
|
-
constructor(config2) {
|
|
908
|
-
this.eventReporter = new EventReporter(config2);
|
|
909
|
-
}
|
|
910
|
-
/**
|
|
911
|
-
* Publishes an event into the processing pipeline.
|
|
912
|
-
*/
|
|
913
|
-
publish(event) {
|
|
914
|
-
this.logger.debug(`[${event.type}] Publishing event`, {
|
|
915
|
-
event
|
|
916
|
-
});
|
|
917
|
-
this.events$.next(event);
|
|
918
|
-
}
|
|
919
|
-
/**
|
|
920
|
-
* Begins listening for and processing events.
|
|
921
|
-
* Returns a promise that resolves when a RUN_FINISHED event is fully processed.
|
|
922
|
-
*/
|
|
923
|
-
listen() {
|
|
924
|
-
this.logger.debug("Listening for events");
|
|
925
|
-
if (this.processingPromise) {
|
|
926
|
-
return this.processingPromise;
|
|
927
|
-
}
|
|
928
|
-
this.processingPromise = new Promise((resolve, reject) => {
|
|
929
|
-
this.events$.pipe(
|
|
930
|
-
concatMap(async (event) => {
|
|
931
|
-
this.logger.debug(`[${event.type}] Processing event`, {
|
|
932
|
-
event
|
|
933
|
-
});
|
|
934
|
-
await this.eventReporter.postEvent(event);
|
|
935
|
-
return event;
|
|
936
|
-
}),
|
|
937
|
-
catchError((error) => {
|
|
938
|
-
this.logger.error("Error in event stream:", error);
|
|
939
|
-
return EMPTY;
|
|
940
|
-
})
|
|
941
|
-
).subscribe({
|
|
942
|
-
next: (event) => {
|
|
943
|
-
this.logger.debug(`[${event.type}] Event processed`, {
|
|
944
|
-
event
|
|
945
|
-
});
|
|
946
|
-
if (event.type === "SCENARIO_RUN_FINISHED" /* RUN_FINISHED */) {
|
|
947
|
-
resolve();
|
|
948
|
-
}
|
|
949
|
-
},
|
|
950
|
-
error: (error) => {
|
|
951
|
-
this.logger.error("Error in event stream:", error);
|
|
952
|
-
reject(error);
|
|
953
|
-
}
|
|
954
|
-
});
|
|
955
|
-
});
|
|
956
|
-
return this.processingPromise;
|
|
957
|
-
}
|
|
958
|
-
/**
|
|
959
|
-
* Stops accepting new events and drains the processing queue.
|
|
960
|
-
*/
|
|
961
|
-
async drain() {
|
|
962
|
-
this.logger.debug("Draining event stream");
|
|
963
|
-
this.events$.unsubscribe();
|
|
964
|
-
if (this.processingPromise) {
|
|
965
|
-
await this.processingPromise;
|
|
966
|
-
}
|
|
967
|
-
}
|
|
968
|
-
/**
|
|
969
|
-
* Subscribes to an event stream.
|
|
970
|
-
* @param source$ - The event stream to subscribe to.
|
|
971
|
-
*/
|
|
972
|
-
subscribeTo(source$) {
|
|
973
|
-
this.logger.debug("Subscribing to event stream");
|
|
974
|
-
return source$.subscribe(this.events$);
|
|
975
|
-
}
|
|
976
|
-
};
|
|
977
|
-
|
|
978
|
-
// src/runner/run.ts
|
|
979
|
-
async function run(cfg) {
|
|
980
|
-
if (!cfg.name) {
|
|
981
|
-
throw new Error("Scenario name is required");
|
|
982
|
-
}
|
|
983
|
-
if (!cfg.description) {
|
|
984
|
-
throw new Error("Scenario description is required");
|
|
985
|
-
}
|
|
986
|
-
if ((cfg.maxTurns || 10) < 1) {
|
|
987
|
-
throw new Error("Max turns must be at least 1");
|
|
988
|
-
}
|
|
989
|
-
if (cfg.agents.length === 0) {
|
|
990
|
-
throw new Error("At least one agent is required");
|
|
991
|
-
}
|
|
992
|
-
if (!cfg.agents.find((agent2) => agent2.role === "Agent" /* AGENT */)) {
|
|
993
|
-
throw new Error("At least one non-user/non-judge agent is required");
|
|
994
|
-
}
|
|
995
|
-
cfg.agents.forEach((agent2, i) => {
|
|
996
|
-
if (!allAgentRoles.includes(agent2.role)) {
|
|
997
|
-
throw new Error(`Agent ${i} has invalid role: ${agent2.role}`);
|
|
998
|
-
}
|
|
999
|
-
});
|
|
1000
|
-
if (!cfg.threadId) {
|
|
1001
|
-
cfg.threadId = generateThreadId();
|
|
1002
|
-
}
|
|
1003
|
-
const steps = cfg.script || [proceed()];
|
|
1004
|
-
const execution = new ScenarioExecution(cfg, steps);
|
|
1005
|
-
let eventBus = null;
|
|
1006
|
-
let subscription = null;
|
|
1007
|
-
try {
|
|
1008
|
-
const projectConfig = await loadScenarioProjectConfig();
|
|
1009
|
-
eventBus = new EventBus({
|
|
1010
|
-
endpoint: projectConfig.langwatchEndpoint ?? process.env.LANGWATCH_ENDPOINT ?? "https://app.langwatch.ai",
|
|
1011
|
-
apiKey: projectConfig.langwatchApiKey ?? process.env.LANGWATCH_API_KEY
|
|
1012
|
-
});
|
|
1013
|
-
eventBus.listen();
|
|
1014
|
-
subscription = eventBus.subscribeTo(execution.events$);
|
|
1015
|
-
const result = await execution.execute();
|
|
1016
|
-
if (cfg.verbose && !result.success) {
|
|
1017
|
-
console.log(`Scenario failed: ${cfg.name}`);
|
|
1018
|
-
console.log(`Reasoning: ${result.reasoning}`);
|
|
1019
|
-
console.log("--------------------------------");
|
|
1020
|
-
console.log(`Passed criteria: ${result.passedCriteria.join("\n- ")}`);
|
|
1021
|
-
console.log(`Failed criteria: ${result.failedCriteria.join("\n- ")}`);
|
|
1022
|
-
console.log(result.messages.map(formatMessage).join("\n"));
|
|
809
|
+
index = nextAgent.index;
|
|
810
|
+
agent2 = nextAgent.agent;
|
|
811
|
+
this.removePendingAgent(agent2);
|
|
812
|
+
if (content) {
|
|
813
|
+
const message2 = typeof content === "string" ? { role: role === "User" /* USER */ ? "user" : "assistant", content } : content;
|
|
814
|
+
this.state.addMessage(message2);
|
|
815
|
+
this.broadcastMessage(message2, index);
|
|
816
|
+
return null;
|
|
1023
817
|
}
|
|
1024
|
-
|
|
1025
|
-
|
|
1026
|
-
|
|
1027
|
-
|
|
818
|
+
const result = await this.callAgent(index, role, judgmentRequest);
|
|
819
|
+
if (result && typeof result === "object" && "success" in result) {
|
|
820
|
+
return result;
|
|
821
|
+
}
|
|
822
|
+
return null;
|
|
1028
823
|
}
|
|
1029
|
-
|
|
1030
|
-
|
|
1031
|
-
|
|
1032
|
-
|
|
1033
|
-
|
|
1034
|
-
|
|
1035
|
-
|
|
1036
|
-
|
|
1037
|
-
return `Tool: ${formatParts(m.content)}`;
|
|
1038
|
-
default:
|
|
1039
|
-
return `${m.role}: ${m.content}`;
|
|
824
|
+
reset() {
|
|
825
|
+
this.state = new ScenarioExecutionState(this.config);
|
|
826
|
+
this.state.threadId = this.config.threadId || generateThreadId();
|
|
827
|
+
this.setAgents(this.config.agents);
|
|
828
|
+
this.newTurn();
|
|
829
|
+
this.state.currentTurn = 0;
|
|
830
|
+
this.totalStartTime = Date.now();
|
|
831
|
+
this.pendingMessages.clear();
|
|
1040
832
|
}
|
|
1041
|
-
|
|
1042
|
-
|
|
1043
|
-
|
|
1044
|
-
|
|
833
|
+
nextAgentForRole(role) {
|
|
834
|
+
for (const agent2 of this.agents) {
|
|
835
|
+
if (agent2.role === role && this.pendingAgentsOnTurn.has(agent2) && this.pendingRolesOnTurn.includes(role)) {
|
|
836
|
+
return { idx: this.agents.indexOf(agent2), agent: agent2 };
|
|
837
|
+
}
|
|
838
|
+
}
|
|
839
|
+
return { idx: -1, agent: null };
|
|
1045
840
|
}
|
|
1046
|
-
|
|
1047
|
-
|
|
1048
|
-
|
|
841
|
+
newTurn() {
|
|
842
|
+
this.pendingAgentsOnTurn = new Set(this.agents);
|
|
843
|
+
this.pendingRolesOnTurn = [
|
|
844
|
+
"User" /* USER */,
|
|
845
|
+
"Agent" /* AGENT */,
|
|
846
|
+
"Judge" /* JUDGE */
|
|
847
|
+
];
|
|
848
|
+
if (this.state.currentTurn === null) {
|
|
849
|
+
this.state.currentTurn = 1;
|
|
850
|
+
} else {
|
|
851
|
+
this.state.currentTurn++;
|
|
1049
852
|
}
|
|
1050
|
-
return `
|
|
1051
|
-
${part.map(formatPart).join("\n")}`;
|
|
1052
853
|
}
|
|
1053
|
-
|
|
1054
|
-
|
|
1055
|
-
|
|
1056
|
-
|
|
1057
|
-
|
|
1058
|
-
return part.text;
|
|
1059
|
-
case "file":
|
|
1060
|
-
return `(file): ${part.filename} ${typeof part.data === "string" ? `url:${part.data}` : "base64:omitted"}`;
|
|
1061
|
-
case "tool-call":
|
|
1062
|
-
return `(tool call): ${part.toolName} id:${part.toolCallId} args:(${JSON.stringify(part.args)})`;
|
|
1063
|
-
case "tool-result":
|
|
1064
|
-
return `(tool result): ${part.toolName} id:${part.toolCallId} result:(${JSON.stringify(part.result)})`;
|
|
1065
|
-
case "reasoning":
|
|
1066
|
-
return `(reasoning): ${part.text}`;
|
|
1067
|
-
case "redacted-reasoning":
|
|
1068
|
-
return `(redacted reasoning): ${part.data}`;
|
|
1069
|
-
default:
|
|
1070
|
-
return `Unknown content: ${JSON.stringify(part)}`;
|
|
854
|
+
removePendingRole(role) {
|
|
855
|
+
const index = this.pendingRolesOnTurn.indexOf(role);
|
|
856
|
+
if (index > -1) {
|
|
857
|
+
this.pendingRolesOnTurn.splice(index, 1);
|
|
858
|
+
}
|
|
1071
859
|
}
|
|
1072
|
-
|
|
1073
|
-
|
|
1074
|
-
|
|
1075
|
-
|
|
1076
|
-
|
|
1077
|
-
|
|
1078
|
-
|
|
1079
|
-
|
|
1080
|
-
|
|
1081
|
-
var userMessageRole = "user";
|
|
1082
|
-
var groupMessagesByToolBoundaries = (messages) => {
|
|
1083
|
-
const segments = [];
|
|
1084
|
-
let currentSegment = [];
|
|
1085
|
-
for (const message2 of messages) {
|
|
1086
|
-
currentSegment.push(message2);
|
|
1087
|
-
if (message2.role === toolMessageRole) {
|
|
1088
|
-
segments.push(currentSegment);
|
|
1089
|
-
currentSegment = [];
|
|
860
|
+
removePendingAgent(agent2) {
|
|
861
|
+
this.pendingAgentsOnTurn.delete(agent2);
|
|
862
|
+
}
|
|
863
|
+
getNextAgentForRole(role) {
|
|
864
|
+
for (let i = 0; i < this.agents.length; i++) {
|
|
865
|
+
const agent2 = this.agents[i];
|
|
866
|
+
if (agent2.role === role && this.pendingAgentsOnTurn.has(agent2)) {
|
|
867
|
+
return { index: i, agent: agent2 };
|
|
868
|
+
}
|
|
1090
869
|
}
|
|
870
|
+
return null;
|
|
1091
871
|
}
|
|
1092
|
-
|
|
1093
|
-
|
|
872
|
+
setAgents(agents) {
|
|
873
|
+
this.agents = agents;
|
|
874
|
+
this.agentTimes.clear();
|
|
1094
875
|
}
|
|
1095
|
-
|
|
1096
|
-
|
|
1097
|
-
|
|
1098
|
-
|
|
1099
|
-
|
|
1100
|
-
if (message2.role === assistantMessageRole && Array.isArray(message2.content)) {
|
|
1101
|
-
return message2.content.some((part) => part.type === "tool-call");
|
|
876
|
+
consumeUntilRole(role) {
|
|
877
|
+
while (this.pendingRolesOnTurn.length > 0) {
|
|
878
|
+
const nextRole = this.pendingRolesOnTurn[0];
|
|
879
|
+
if (nextRole === role) break;
|
|
880
|
+
this.pendingRolesOnTurn.pop();
|
|
1102
881
|
}
|
|
1103
|
-
|
|
1104
|
-
|
|
1105
|
-
|
|
1106
|
-
|
|
1107
|
-
|
|
1108
|
-
const
|
|
1109
|
-
if (!hasStringContent) return message2;
|
|
1110
|
-
const roleMap = {
|
|
1111
|
-
[userMessageRole]: assistantMessageRole,
|
|
1112
|
-
[assistantMessageRole]: userMessageRole
|
|
1113
|
-
};
|
|
1114
|
-
const newRole = roleMap[message2.role];
|
|
1115
|
-
if (!newRole) return message2;
|
|
882
|
+
}
|
|
883
|
+
reachedMaxTurns(errorMessage) {
|
|
884
|
+
var _a;
|
|
885
|
+
const agentRoleAgentsIdx = this.agents.map((agent2, i) => ({ agent: agent2, idx: i })).filter(({ agent: agent2 }) => agent2.role === "Agent" /* AGENT */).map(({ idx }) => idx);
|
|
886
|
+
const agentTimes = agentRoleAgentsIdx.map((i) => this.agentTimes.get(i) || 0);
|
|
887
|
+
const totalAgentTime = agentTimes.reduce((sum, time) => sum + time, 0);
|
|
1116
888
|
return {
|
|
1117
|
-
|
|
1118
|
-
|
|
889
|
+
success: false,
|
|
890
|
+
messages: this.state.messages,
|
|
891
|
+
reasoning: errorMessage || `Reached maximum turns (${this.config.maxTurns || 10}) without conclusion`,
|
|
892
|
+
metCriteria: [],
|
|
893
|
+
unmetCriteria: ((_a = this.getJudgeAgent()) == null ? void 0 : _a.criteria) ?? [],
|
|
894
|
+
totalTime: this.totalTime,
|
|
895
|
+
agentTime: totalAgentTime
|
|
1119
896
|
};
|
|
1120
|
-
});
|
|
1121
|
-
};
|
|
1122
|
-
var messageRoleReversal = (messages) => {
|
|
1123
|
-
const segments = groupMessagesByToolBoundaries(messages);
|
|
1124
|
-
const processedSegments = segments.map(
|
|
1125
|
-
(segment) => segmentHasToolMessages(segment) ? segment : reverseSegmentRoles(segment)
|
|
1126
|
-
);
|
|
1127
|
-
return processedSegments.flat();
|
|
1128
|
-
};
|
|
1129
|
-
var criterionToParamName = (criterion) => {
|
|
1130
|
-
return criterion.replace(/"/g, "").replace(/[^a-zA-Z0-9]/g, "_").replace(/ /g, "_").toLowerCase().substring(0, 70);
|
|
1131
|
-
};
|
|
1132
|
-
|
|
1133
|
-
// src/config/index.ts
|
|
1134
|
-
var logger = new Logger("scenario.config");
|
|
1135
|
-
var configLoaded = false;
|
|
1136
|
-
var config = null;
|
|
1137
|
-
var configLoadPromise = null;
|
|
1138
|
-
async function loadProjectConfig() {
|
|
1139
|
-
if (configLoaded) {
|
|
1140
|
-
return;
|
|
1141
897
|
}
|
|
1142
|
-
|
|
1143
|
-
return
|
|
898
|
+
getJudgeAgent() {
|
|
899
|
+
return this.agents.find((agent2) => agent2 instanceof JudgeAgentAdapter) ?? null;
|
|
1144
900
|
}
|
|
1145
|
-
|
|
1146
|
-
|
|
1147
|
-
|
|
1148
|
-
|
|
1149
|
-
|
|
1150
|
-
logger.error("error loading scenario project config", { error });
|
|
1151
|
-
} finally {
|
|
1152
|
-
configLoaded = true;
|
|
1153
|
-
}
|
|
1154
|
-
})();
|
|
1155
|
-
return configLoadPromise;
|
|
1156
|
-
}
|
|
1157
|
-
async function getProjectConfig() {
|
|
1158
|
-
await loadProjectConfig();
|
|
1159
|
-
return config;
|
|
1160
|
-
}
|
|
1161
|
-
|
|
1162
|
-
// src/utils/config.ts
|
|
1163
|
-
function mergeConfig(config2, projectConfig) {
|
|
1164
|
-
if (!projectConfig) {
|
|
1165
|
-
return config2;
|
|
901
|
+
/**
|
|
902
|
+
* Emits an event to the event stream for external consumption.
|
|
903
|
+
*/
|
|
904
|
+
emitEvent(event) {
|
|
905
|
+
this.eventSubject.next(event);
|
|
1166
906
|
}
|
|
1167
|
-
|
|
1168
|
-
|
|
1169
|
-
|
|
1170
|
-
}
|
|
1171
|
-
|
|
1172
|
-
|
|
1173
|
-
|
|
1174
|
-
|
|
1175
|
-
|
|
1176
|
-
|
|
1177
|
-
|
|
907
|
+
/**
|
|
908
|
+
* Creates base event properties shared across all scenario events.
|
|
909
|
+
*/
|
|
910
|
+
makeBaseEvent({ scenarioRunId }) {
|
|
911
|
+
return {
|
|
912
|
+
type: "placeholder",
|
|
913
|
+
// This will be replaced by the specific event type
|
|
914
|
+
timestamp: Date.now(),
|
|
915
|
+
batchRunId: batchRunId2,
|
|
916
|
+
scenarioId: this.config.id,
|
|
917
|
+
scenarioRunId,
|
|
918
|
+
scenarioSetId: this.config.setId
|
|
919
|
+
};
|
|
1178
920
|
}
|
|
1179
|
-
|
|
1180
|
-
|
|
1181
|
-
|
|
1182
|
-
|
|
1183
|
-
|
|
1184
|
-
|
|
1185
|
-
|
|
1186
|
-
|
|
1187
|
-
|
|
1188
|
-
|
|
1189
|
-
|
|
1190
|
-
<goal>
|
|
1191
|
-
Your goal is to determine if you already have enough information to make a verdict of the scenario below, or if the conversation should continue for longer.
|
|
1192
|
-
If you do have enough information, use the finish_test tool to determine if all the criteria have been met, if not, use the continue_test tool to let the next step play out.
|
|
1193
|
-
</goal>
|
|
1194
|
-
|
|
1195
|
-
<scenario>
|
|
1196
|
-
${description}
|
|
1197
|
-
</scenario>
|
|
1198
|
-
|
|
1199
|
-
<criteria>
|
|
1200
|
-
${criteriaList}
|
|
1201
|
-
</criteria>
|
|
1202
|
-
|
|
1203
|
-
<rules>
|
|
1204
|
-
- Be strict, do not let the conversation continue if the agent already broke one of the "do not" or "should not" criteria.
|
|
1205
|
-
- DO NOT make any judgment calls that are not explicitly listed in the success or failure criteria, withhold judgement if necessary
|
|
1206
|
-
</rules>
|
|
1207
|
-
`.trim();
|
|
1208
|
-
}
|
|
1209
|
-
function buildContinueTestTool() {
|
|
1210
|
-
return tool({
|
|
1211
|
-
description: "Continue the test with the next step",
|
|
1212
|
-
parameters: z3.object({})
|
|
1213
|
-
});
|
|
1214
|
-
}
|
|
1215
|
-
function buildFinishTestTool(criteria) {
|
|
1216
|
-
const criteriaNames = criteria.map(criterionToParamName);
|
|
1217
|
-
return tool({
|
|
1218
|
-
description: "Complete the test with a final verdict",
|
|
1219
|
-
parameters: z3.object({
|
|
1220
|
-
criteria: z3.object(
|
|
1221
|
-
Object.fromEntries(
|
|
1222
|
-
criteriaNames.map((name, idx) => [
|
|
1223
|
-
name,
|
|
1224
|
-
z3.enum(["true", "false", "inconclusive"]).describe(criteria[idx])
|
|
1225
|
-
])
|
|
1226
|
-
)
|
|
1227
|
-
).strict().describe("Strict verdict for each criterion"),
|
|
1228
|
-
reasoning: z3.string().describe("Explanation of what the final verdict should be"),
|
|
1229
|
-
verdict: z3.enum(["success", "failure", "inconclusive"]).describe("The final verdict of the test")
|
|
1230
|
-
})
|
|
1231
|
-
});
|
|
1232
|
-
}
|
|
1233
|
-
var judgeAgent = (cfg) => {
|
|
1234
|
-
return {
|
|
1235
|
-
role: "Judge" /* JUDGE */,
|
|
1236
|
-
criteria: cfg.criteria,
|
|
1237
|
-
call: async (input) => {
|
|
1238
|
-
var _a;
|
|
1239
|
-
const systemPrompt = cfg.systemPrompt ?? buildSystemPrompt(cfg.criteria, input.scenarioConfig.description);
|
|
1240
|
-
const messages = [
|
|
1241
|
-
{ role: "system", content: systemPrompt },
|
|
1242
|
-
...input.messages
|
|
1243
|
-
];
|
|
1244
|
-
const isLastMessage = input.scenarioState.turn == input.scenarioConfig.maxTurns;
|
|
1245
|
-
const projectConfig = await getProjectConfig();
|
|
1246
|
-
const mergedConfig = mergeAndValidateConfig(cfg, projectConfig);
|
|
1247
|
-
if (!mergedConfig.model) {
|
|
1248
|
-
throw new Error("Model is required for the judge agent");
|
|
921
|
+
/**
|
|
922
|
+
* Emits a run started event to indicate scenario execution has begun.
|
|
923
|
+
*/
|
|
924
|
+
emitRunStarted({ scenarioRunId }) {
|
|
925
|
+
this.emitEvent({
|
|
926
|
+
...this.makeBaseEvent({ scenarioRunId }),
|
|
927
|
+
type: "SCENARIO_RUN_STARTED" /* RUN_STARTED */,
|
|
928
|
+
metadata: {
|
|
929
|
+
name: this.config.name,
|
|
930
|
+
description: this.config.description
|
|
1249
931
|
}
|
|
1250
|
-
|
|
1251
|
-
|
|
1252
|
-
|
|
1253
|
-
|
|
1254
|
-
|
|
1255
|
-
|
|
1256
|
-
|
|
1257
|
-
|
|
1258
|
-
|
|
1259
|
-
|
|
1260
|
-
|
|
1261
|
-
|
|
1262
|
-
|
|
1263
|
-
|
|
932
|
+
});
|
|
933
|
+
}
|
|
934
|
+
/**
|
|
935
|
+
* Emits a message snapshot event containing current conversation history.
|
|
936
|
+
*/
|
|
937
|
+
emitMessageSnapshot({ scenarioRunId }) {
|
|
938
|
+
this.emitEvent({
|
|
939
|
+
...this.makeBaseEvent({ scenarioRunId }),
|
|
940
|
+
type: "SCENARIO_MESSAGE_SNAPSHOT" /* MESSAGE_SNAPSHOT */,
|
|
941
|
+
messages: this.state.messages
|
|
942
|
+
// Add any other required fields from MessagesSnapshotEventSchema
|
|
943
|
+
});
|
|
944
|
+
}
|
|
945
|
+
/**
|
|
946
|
+
* Emits a run finished event with the final execution status.
|
|
947
|
+
*/
|
|
948
|
+
emitRunFinished({
|
|
949
|
+
scenarioRunId,
|
|
950
|
+
status,
|
|
951
|
+
result
|
|
952
|
+
}) {
|
|
953
|
+
const event = {
|
|
954
|
+
...this.makeBaseEvent({ scenarioRunId }),
|
|
955
|
+
scenarioSetId: this.config.setId ?? "default",
|
|
956
|
+
type: "SCENARIO_RUN_FINISHED" /* RUN_FINISHED */,
|
|
957
|
+
status,
|
|
958
|
+
results: {
|
|
959
|
+
verdict: (result == null ? void 0 : result.success) ? "success" /* SUCCESS */ : "failure" /* FAILURE */,
|
|
960
|
+
metCriteria: (result == null ? void 0 : result.metCriteria) ?? [],
|
|
961
|
+
unmetCriteria: (result == null ? void 0 : result.unmetCriteria) ?? [],
|
|
962
|
+
reasoning: result == null ? void 0 : result.reasoning,
|
|
963
|
+
error: result == null ? void 0 : result.error
|
|
1264
964
|
}
|
|
1265
|
-
|
|
1266
|
-
|
|
1267
|
-
|
|
1268
|
-
|
|
1269
|
-
|
|
1270
|
-
|
|
1271
|
-
|
|
1272
|
-
|
|
1273
|
-
|
|
1274
|
-
|
|
1275
|
-
|
|
1276
|
-
|
|
1277
|
-
|
|
1278
|
-
|
|
1279
|
-
|
|
1280
|
-
const verdict = args.verdict || "inconclusive";
|
|
1281
|
-
const reasoning = args.reasoning || "No reasoning provided";
|
|
1282
|
-
const criteria = args.criteria || {};
|
|
1283
|
-
const criteriaValues = Object.values(criteria);
|
|
1284
|
-
const passedCriteria = cfg.criteria.filter((_, i) => criteriaValues[i] === "true");
|
|
1285
|
-
const failedCriteria = cfg.criteria.filter((_, i) => criteriaValues[i] !== "true");
|
|
1286
|
-
return {
|
|
1287
|
-
success: verdict === "success",
|
|
1288
|
-
messages: input.messages,
|
|
1289
|
-
reasoning,
|
|
1290
|
-
passedCriteria,
|
|
1291
|
-
failedCriteria
|
|
1292
|
-
};
|
|
1293
|
-
}
|
|
1294
|
-
case "continue_test":
|
|
1295
|
-
return [];
|
|
1296
|
-
default:
|
|
1297
|
-
return {
|
|
1298
|
-
success: false,
|
|
1299
|
-
messages: input.messages,
|
|
1300
|
-
reasoning: `JudgeAgent: Unknown tool call: ${toolCall.toolName}`,
|
|
1301
|
-
passedCriteria: [],
|
|
1302
|
-
failedCriteria: cfg.criteria
|
|
1303
|
-
};
|
|
1304
|
-
}
|
|
965
|
+
};
|
|
966
|
+
this.emitEvent(event);
|
|
967
|
+
this.eventSubject.complete();
|
|
968
|
+
}
|
|
969
|
+
/**
|
|
970
|
+
* Distributes a message to all other agents in the scenario.
|
|
971
|
+
*
|
|
972
|
+
* @param message - The message to broadcast.
|
|
973
|
+
* @param fromAgentIdx - The index of the agent that sent the message, to avoid echoing.
|
|
974
|
+
*/
|
|
975
|
+
broadcastMessage(message2, fromAgentIdx) {
|
|
976
|
+
for (let idx = 0; idx < this.agents.length; idx++) {
|
|
977
|
+
if (idx === fromAgentIdx) continue;
|
|
978
|
+
if (!this.pendingMessages.has(idx)) {
|
|
979
|
+
this.pendingMessages.set(idx, []);
|
|
1305
980
|
}
|
|
1306
|
-
|
|
1307
|
-
success: false,
|
|
1308
|
-
messages: input.messages,
|
|
1309
|
-
reasoning: `JudgeAgent: No tool call found in LLM output`,
|
|
1310
|
-
passedCriteria: [],
|
|
1311
|
-
failedCriteria: cfg.criteria
|
|
1312
|
-
};
|
|
981
|
+
this.pendingMessages.get(idx).push(message2);
|
|
1313
982
|
}
|
|
1314
|
-
}
|
|
983
|
+
}
|
|
1315
984
|
};
|
|
985
|
+
/**
 * Normalizes whatever an agent returned into an array of messages.
 *
 * @param response - A string, an array of messages, a single message object
 *   (anything with a `role` property), or any other value.
 * @param role - Role assigned to the message built from a string response.
 * @returns A one-element array for a string or a message object, the array
 *   itself (not a copy) for an array response, and `[]` for everything else,
 *   including `null` and `undefined`.
 */
function convertAgentReturnTypesToMessages(response, role) {
  if (typeof response === "string") {
    return [{ role, content: response }];
  }
  if (Array.isArray(response)) {
    return response;
  }
  // `typeof null` is "object" and `"role" in null` throws a TypeError, so
  // null has to be excluded before the `in` check.
  if (response !== null && typeof response === "object" && "role" in response) {
    return [response];
  }
  return [];
}
|
|
1316
994
|
|
|
1317
|
-
// src/
|
|
1318
|
-
|
|
1319
|
-
|
|
1320
|
-
|
|
1321
|
-
|
|
1322
|
-
You are pretending to be a user, you are testing an AI Agent (shown as the user role) based on a scenario.
|
|
1323
|
-
Approach this naturally, as a human user would, with very short inputs, few words, all lowercase, imperative, not periods, like when they google or talk to chatgpt.
|
|
1324
|
-
</role>
|
|
1325
|
-
|
|
1326
|
-
<goal>
|
|
1327
|
-
Your goal (assistant) is to interact with the Agent Under Test (user) as if you were a human user to see if it can complete the scenario successfully.
|
|
1328
|
-
</goal>
|
|
995
|
+
// src/runner/index.ts
|
|
996
|
+
// Namespace object for the runner module; `run` is registered through an
// accessor so the binding is only read when the accessor is invoked.
var runner_exports = {};
__export(runner_exports, { run: () => run });
|
|
1329
1000
|
|
|
1330
|
-
|
|
1331
|
-
|
|
1332
|
-
|
|
1001
|
+
// src/script/index.ts
|
|
1002
|
+
// Namespace object for the script-step helpers. Each entry is an accessor
// returning the helper of the same name; the key order below is kept as-is.
var script_exports = {};
__export(script_exports, {
  agent: () => agent, fail: () => fail, judge: () => judge,
  message: () => message, proceed: () => proceed,
  succeed: () => succeed, user: () => user
});
|
|
1012
|
+
/**
 * Builds a script step that forwards `message2` to `executor.message`.
 * The step ignores its first (state) argument and returns the executor's result.
 */
var message = (message2) => (_scenarioState, stepExecutor) => stepExecutor.message(message2);
|
|
1015
|
+
/**
 * Builds a script step that calls `executor.agent(content)`.
 * The step ignores its first (state) argument and returns the executor's result.
 */
var agent = (content) => (_scenarioState, stepExecutor) => stepExecutor.agent(content);
|
|
1018
|
+
/**
 * Builds a script step that calls `executor.judge(content)`.
 * The step ignores its first (state) argument and returns the executor's result.
 */
var judge = (content) => (_scenarioState, stepExecutor) => stepExecutor.judge(content);
|
|
1021
|
+
/**
 * Builds a script step that calls `executor.user(content)`.
 * The step ignores its first (state) argument and returns the executor's result.
 */
var user = (content) => (_scenarioState, stepExecutor) => stepExecutor.user(content);
|
|
1024
|
+
/**
 * Builds a script step that calls `executor.proceed(turns, onTurn, onStep)`,
 * passing all three arguments through unchanged (including when omitted).
 */
var proceed = (turns, onTurn, onStep) => (_scenarioState, stepExecutor) => stepExecutor.proceed(turns, onTurn, onStep);
|
|
1027
|
+
/**
 * Builds a script step that calls `executor.succeed(reasoning)`.
 * The step ignores its first (state) argument and returns the executor's result.
 */
var succeed = (reasoning) => (_scenarioState, stepExecutor) => stepExecutor.succeed(reasoning);
|
|
1030
|
+
/**
 * Builds a script step that calls `executor.fail(reasoning)`.
 * The step ignores its first (state) argument and returns the executor's result.
 */
var fail = (reasoning) => (_scenarioState, stepExecutor) => stepExecutor.fail(reasoning);
|
|
1333
1033
|
|
|
1334
|
-
|
|
1335
|
-
|
|
1336
|
-
|
|
1337
|
-
|
|
1034
|
+
// src/runner/run.ts
|
|
1035
|
+
/**
 * Validates a scenario configuration, executes it and returns the result.
 *
 * While the scenario runs, the execution's `events$` stream is subscribed to
 * an EventBus whose endpoint and API key come from the project config, then
 * the LANGWATCH_ENDPOINT / LANGWATCH_API_KEY environment variables.
 *
 * @param cfg - Scenario configuration. `name`, `description` and a non-empty
 *   `agents` array with at least one "Agent"-role entry are required;
 *   `maxTurns`, when given, must be at least 1. `cfg.threadId` is filled in on
 *   the passed object when absent. `cfg.script` defaults to `[proceed()]`.
 * @returns The value resolved by the execution's `execute()` call.
 * @throws {Error} When the configuration fails one of the checks below.
 */
async function run(cfg) {
  if (!cfg.name) {
    throw new Error("Scenario name is required");
  }
  if (!cfg.description) {
    throw new Error("Scenario description is required");
  }
  // `??` rather than `||`: an explicit `maxTurns: 0` must be rejected here
  // instead of being silently replaced by the default of 10.
  if ((cfg.maxTurns ?? 10) < 1) {
    throw new Error("Max turns must be at least 1");
  }
  // A missing `agents` list is reported like an empty one rather than
  // surfacing as a TypeError on `.length`.
  if (!Array.isArray(cfg.agents) || cfg.agents.length === 0) {
    throw new Error("At least one agent is required");
  }
  if (!cfg.agents.some((agent2) => agent2.role === "Agent" /* AGENT */)) {
    throw new Error("At least one non-user/non-judge agent is required");
  }
  cfg.agents.forEach((agent2, i) => {
    if (!allAgentRoles.includes(agent2.role)) {
      throw new Error(`Agent ${i} has invalid role: ${agent2.role}`);
    }
  });
  if (!cfg.threadId) {
    cfg.threadId = generateThreadId();
  }
  const steps = cfg.script || [proceed()];
  const execution = new ScenarioExecution(cfg, steps);
  let eventBus = null;
  let subscription = null;
  try {
    const projectConfig = await loadScenarioProjectConfig();
    eventBus = new EventBus({
      endpoint: projectConfig.langwatchEndpoint ?? process.env.LANGWATCH_ENDPOINT ?? "https://app.langwatch.ai",
      apiKey: projectConfig.langwatchApiKey ?? process.env.LANGWATCH_API_KEY
    });
    eventBus.listen();
    subscription = eventBus.subscribeTo(execution.events$);
    const result = await execution.execute();
    if (cfg.verbose && !result.success) {
      console.log(`Scenario failed: ${cfg.name}`);
      console.log(`Reasoning: ${result.reasoning}`);
      console.log("--------------------------------");
      console.log(`Met criteria: ${result.metCriteria.join("\n- ")}`);
      console.log(`Unmet criteria: ${result.unmetCriteria.join("\n- ")}`);
      console.log(result.messages.map(formatMessage).join("\n"));
    }
    return result;
  } finally {
    try {
      // Wait for the bus to drain before settling (presumably this flushes
      // pending events; see EventBus).
      if (eventBus != null) {
        await eventBus.drain();
      }
    } finally {
      // Runs even when drain() rejects, so the subscription is never leaked.
      if (subscription != null) {
        subscription.unsubscribe();
      }
    }
  }
}
|
|
1339
|
-
|
|
1340
|
-
|
|
1341
|
-
|
|
1342
|
-
|
|
1343
|
-
|
|
1344
|
-
|
|
1345
|
-
|
|
1346
|
-
|
|
1347
|
-
|
|
1348
|
-
|
|
1349
|
-
|
|
1350
|
-
|
|
1351
|
-
|
|
1352
|
-
|
|
1353
|
-
|
|
1354
|
-
|
|
1355
|
-
|
|
1356
|
-
|
|
1357
|
-
|
|
1358
|
-
temperature: mergedConfig.temperature ?? 0,
|
|
1359
|
-
maxTokens: mergedConfig.maxTokens
|
|
1360
|
-
});
|
|
1361
|
-
const messageContent = completion.text;
|
|
1362
|
-
if (!messageContent) {
|
|
1363
|
-
throw new Error("No response content from LLM");
|
|
1364
|
-
}
|
|
1365
|
-
return { role: "user", content: messageContent };
|
|
1086
|
+
/**
 * Renders one conversation message as a human-readable line for verbose logs.
 *
 * @param m - Message with a `role` and a `content` that is either a string or
 *   an array of content parts.
 * @returns `"<Label>: <content>"`, where the label is "User", "Assistant" or
 *   "Tool" for those roles and the raw role otherwise.
 */
function formatMessage(m) {
  switch (m.role) {
    case "user":
      // Content can be an array of parts (e.g. multimodal input); plain
      // interpolation would print "[object Object]" for it.
      return `User: ${formatParts(m.content)}`;
    case "assistant":
      return `Assistant: ${formatParts(m.content)}`;
    case "tool":
      return `Tool: ${formatParts(m.content)}`;
    default:
      // String content prints unchanged; anything else goes through the same
      // part formatting instead of being coerced to a string.
      return `${m.role}: ${formatParts(m.content)}`;
  }
}

/**
 * Formats message content that is either a string or an array of parts.
 *
 * @param part - The message content.
 * @returns The string itself; the single part inline for a one-element array;
 *   for other arrays a leading newline followed by one part per line; and an
 *   "Unknown content: …" JSON dump for any other value.
 */
function formatParts(part) {
  if (typeof part === "string") {
    return part;
  }
  if (Array.isArray(part)) {
    if (part.length === 1) {
      return formatPart(part[0]);
    }
    return `\n${part.map(formatPart).join("\n")}`;
  }
  return "Unknown content: " + JSON.stringify(part);
}

/**
 * Formats a single content part according to its `type`.
 *
 * @param part - Content part object with a `type` discriminator.
 * @returns A short textual description of the part; unrecognized types are
 *   dumped as JSON.
 */
function formatPart(part) {
  switch (part.type) {
    case "text":
      return part.text;
    case "image": {
      // Never dump raw image bytes/base64 into the log; only URLs are shown.
      const { image } = part;
      const isUrl = image instanceof URL || (typeof image === "string" && /^https?:\/\//i.test(image));
      return `(image): ${isUrl ? `url:${image}` : "data:omitted"}`;
    }
    case "file":
      return `(file): ${part.filename} ${typeof part.data === "string" ? `url:${part.data}` : "base64:omitted"}`;
    case "tool-call":
      return `(tool call): ${part.toolName} id:${part.toolCallId} args:(${JSON.stringify(part.args)})`;
    case "tool-result":
      return `(tool result): ${part.toolName} id:${part.toolCallId} result:(${JSON.stringify(part.result)})`;
    case "reasoning":
      return `(reasoning): ${part.text}`;
    case "redacted-reasoning":
      return `(redacted reasoning): ${part.data}`;
    default:
      return `Unknown content: ${JSON.stringify(part)}`;
  }
}
|
|
1129
|
+
|
|
1130
|
+
// src/index.ts
|
|
1131
|
+
// Aggregate namespace: copies the enumerable members of each module namespace
// into one object, in this order (a later source wins on a key clash).
var scenario = Object.assign(
  {},
  agents_exports,
  domain_exports,
  execution_exports,
  runner_exports,
  script_exports
);
// Alias bound to the same object as `scenario`.
var index_default = scenario;
|
|
1369
1139
|
export {
|
|
1370
1140
|
AgentAdapter,
|
|
1371
1141
|
AgentRole,
|
|
@@ -1375,6 +1145,7 @@ export {
|
|
|
1375
1145
|
UserSimulatorAgentAdapter,
|
|
1376
1146
|
agent,
|
|
1377
1147
|
allAgentRoles,
|
|
1148
|
+
index_default as default,
|
|
1378
1149
|
defineConfig,
|
|
1379
1150
|
fail,
|
|
1380
1151
|
judge,
|
|
@@ -1382,6 +1153,7 @@ export {
|
|
|
1382
1153
|
message,
|
|
1383
1154
|
proceed,
|
|
1384
1155
|
run,
|
|
1156
|
+
scenario,
|
|
1385
1157
|
scenarioProjectConfigSchema,
|
|
1386
1158
|
succeed,
|
|
1387
1159
|
user,
|