@langwatch/scenario 0.2.0-prerelease.0 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +72 -17
- package/dist/chunk-7P6ASYW6.mjs +9 -0
- package/dist/chunk-ORWSJC5F.mjs +309 -0
- package/dist/index.d.mts +642 -515
- package/dist/index.d.ts +642 -515
- package/dist/index.js +977 -907
- package/dist/index.mjs +845 -1073
- package/dist/integrations/vitest/reporter.d.mts +9 -0
- package/dist/integrations/vitest/reporter.d.ts +9 -0
- package/dist/integrations/vitest/reporter.js +168 -0
- package/dist/integrations/vitest/reporter.mjs +139 -0
- package/dist/integrations/vitest/setup.d.mts +2 -0
- package/dist/integrations/vitest/setup.d.ts +2 -0
- package/dist/integrations/vitest/setup.js +377 -0
- package/dist/integrations/vitest/setup.mjs +51 -0
- package/package.json +17 -5
package/dist/index.js
CHANGED
|
@@ -38,6 +38,7 @@ __export(index_exports, {
|
|
|
38
38
|
UserSimulatorAgentAdapter: () => UserSimulatorAgentAdapter,
|
|
39
39
|
agent: () => agent,
|
|
40
40
|
allAgentRoles: () => allAgentRoles,
|
|
41
|
+
default: () => index_default,
|
|
41
42
|
defineConfig: () => defineConfig,
|
|
42
43
|
fail: () => fail,
|
|
43
44
|
judge: () => judge,
|
|
@@ -45,6 +46,7 @@ __export(index_exports, {
|
|
|
45
46
|
message: () => message,
|
|
46
47
|
proceed: () => proceed,
|
|
47
48
|
run: () => run,
|
|
49
|
+
scenario: () => scenario,
|
|
48
50
|
scenarioProjectConfigSchema: () => scenarioProjectConfigSchema,
|
|
49
51
|
succeed: () => succeed,
|
|
50
52
|
user: () => user,
|
|
@@ -52,31 +54,28 @@ __export(index_exports, {
|
|
|
52
54
|
});
|
|
53
55
|
module.exports = __toCommonJS(index_exports);
|
|
54
56
|
|
|
55
|
-
// src/
|
|
56
|
-
var
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
};
|
|
62
|
-
var judge = (content) => {
|
|
63
|
-
return (_state, executor) => executor.judge(content);
|
|
64
|
-
};
|
|
65
|
-
var user = (content) => {
|
|
66
|
-
return (_state, executor) => executor.user(content);
|
|
67
|
-
};
|
|
68
|
-
var proceed = (turns, onTurn, onStep) => {
|
|
69
|
-
return (_state, executor) => executor.proceed(turns, onTurn, onStep);
|
|
70
|
-
};
|
|
71
|
-
var succeed = (reasoning) => {
|
|
72
|
-
return (_state, executor) => executor.succeed(reasoning);
|
|
73
|
-
};
|
|
74
|
-
var fail = (reasoning) => {
|
|
75
|
-
return (_state, executor) => executor.fail(reasoning);
|
|
76
|
-
};
|
|
57
|
+
// src/agents/index.ts
|
|
58
|
+
var agents_exports = {};
|
|
59
|
+
__export(agents_exports, {
|
|
60
|
+
judgeAgent: () => judgeAgent,
|
|
61
|
+
userSimulatorAgent: () => userSimulatorAgent
|
|
62
|
+
});
|
|
77
63
|
|
|
78
|
-
// src/
|
|
79
|
-
var
|
|
64
|
+
// src/agents/judge-agent.ts
|
|
65
|
+
var import_ai = require("ai");
|
|
66
|
+
var import_zod2 = require("zod");
|
|
67
|
+
|
|
68
|
+
// src/domain/index.ts
|
|
69
|
+
var domain_exports = {};
|
|
70
|
+
__export(domain_exports, {
|
|
71
|
+
AgentAdapter: () => AgentAdapter,
|
|
72
|
+
AgentRole: () => AgentRole,
|
|
73
|
+
JudgeAgentAdapter: () => JudgeAgentAdapter,
|
|
74
|
+
UserSimulatorAgentAdapter: () => UserSimulatorAgentAdapter,
|
|
75
|
+
allAgentRoles: () => allAgentRoles,
|
|
76
|
+
defineConfig: () => defineConfig,
|
|
77
|
+
scenarioProjectConfigSchema: () => scenarioProjectConfigSchema
|
|
78
|
+
});
|
|
80
79
|
|
|
81
80
|
// src/domain/core/config.ts
|
|
82
81
|
var import_zod = require("zod");
|
|
@@ -120,264 +119,93 @@ var JudgeAgentAdapter = class {
|
|
|
120
119
|
}
|
|
121
120
|
};
|
|
122
121
|
|
|
123
|
-
// src/utils
|
|
124
|
-
var
|
|
125
|
-
var
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
function getBatchRunId() {
|
|
136
|
-
if (!batchRunId) {
|
|
137
|
-
batchRunId = process.env.SCENARIO_BATCH_RUN_ID ?? `scenariobatchrun_${(0, import_xksuid.generate)()}`;
|
|
138
|
-
}
|
|
139
|
-
return batchRunId;
|
|
140
|
-
}
|
|
141
|
-
function generateMessageId() {
|
|
142
|
-
return `scenariomsg_${(0, import_xksuid.generate)()}`;
|
|
143
|
-
}
|
|
144
|
-
|
|
145
|
-
// src/execution/scenario-execution-state.ts
|
|
146
|
-
var ScenarioExecutionState = class {
|
|
147
|
-
_history = [];
|
|
148
|
-
_turn = 0;
|
|
149
|
-
_partialResult = null;
|
|
150
|
-
_threadId = "";
|
|
151
|
-
_agents = [];
|
|
152
|
-
_pendingMessages = /* @__PURE__ */ new Map();
|
|
153
|
-
_pendingRolesOnTurn = [];
|
|
154
|
-
_pendingAgentsOnTurn = /* @__PURE__ */ new Set();
|
|
155
|
-
_agentTimes = /* @__PURE__ */ new Map();
|
|
156
|
-
_totalStartTime = 0;
|
|
157
|
-
/**
|
|
158
|
-
* Creates a new ScenarioExecutionState.
|
|
159
|
-
*/
|
|
160
|
-
constructor() {
|
|
161
|
-
this._totalStartTime = Date.now();
|
|
162
|
-
}
|
|
163
|
-
setThreadId(threadId) {
|
|
164
|
-
this._threadId = threadId;
|
|
165
|
-
}
|
|
166
|
-
setAgents(agents) {
|
|
167
|
-
this._agents = agents;
|
|
168
|
-
this._pendingMessages.clear();
|
|
169
|
-
this._agentTimes.clear();
|
|
170
|
-
}
|
|
171
|
-
appendMessage(role, content) {
|
|
172
|
-
const message2 = { role, content };
|
|
173
|
-
this._history.push({ ...message2, id: generateMessageId() });
|
|
174
|
-
}
|
|
175
|
-
appendUserMessage(content) {
|
|
176
|
-
this.appendMessage("user", content);
|
|
177
|
-
}
|
|
178
|
-
appendAssistantMessage(content) {
|
|
179
|
-
this.appendMessage("assistant", content);
|
|
180
|
-
}
|
|
181
|
-
addMessage(message2, fromAgentIdx) {
|
|
182
|
-
this._history.push({ ...message2, id: generateMessageId() });
|
|
183
|
-
for (let idx = 0; idx < this._agents.length; idx++) {
|
|
184
|
-
if (idx === fromAgentIdx) continue;
|
|
185
|
-
if (!this._pendingMessages.has(idx)) {
|
|
186
|
-
this._pendingMessages.set(idx, []);
|
|
187
|
-
}
|
|
188
|
-
this._pendingMessages.get(idx).push(message2);
|
|
189
|
-
}
|
|
190
|
-
}
|
|
191
|
-
addMessages(messages, fromAgentIdx) {
|
|
192
|
-
for (const message2 of messages) {
|
|
193
|
-
this.addMessage(message2, fromAgentIdx);
|
|
122
|
+
// src/agents/utils.ts
|
|
123
|
+
var toolMessageRole = "tool";
|
|
124
|
+
var assistantMessageRole = "assistant";
|
|
125
|
+
var userMessageRole = "user";
|
|
126
|
+
var groupMessagesByToolBoundaries = (messages) => {
|
|
127
|
+
const segments = [];
|
|
128
|
+
let currentSegment = [];
|
|
129
|
+
for (const message2 of messages) {
|
|
130
|
+
currentSegment.push(message2);
|
|
131
|
+
if (message2.role === toolMessageRole) {
|
|
132
|
+
segments.push(currentSegment);
|
|
133
|
+
currentSegment = [];
|
|
194
134
|
}
|
|
195
135
|
}
|
|
196
|
-
|
|
197
|
-
|
|
198
|
-
}
|
|
199
|
-
clearPendingMessages(agentIdx) {
|
|
200
|
-
this._pendingMessages.set(agentIdx, []);
|
|
201
|
-
}
|
|
202
|
-
newTurn() {
|
|
203
|
-
this._pendingAgentsOnTurn = new Set(this._agents);
|
|
204
|
-
this._pendingRolesOnTurn = [
|
|
205
|
-
"User" /* USER */,
|
|
206
|
-
"Agent" /* AGENT */,
|
|
207
|
-
"Judge" /* JUDGE */
|
|
208
|
-
];
|
|
209
|
-
if (this._turn === null) {
|
|
210
|
-
this._turn = 1;
|
|
211
|
-
} else {
|
|
212
|
-
this._turn++;
|
|
213
|
-
}
|
|
136
|
+
if (currentSegment.length > 0) {
|
|
137
|
+
segments.push(currentSegment);
|
|
214
138
|
}
|
|
215
|
-
|
|
216
|
-
|
|
217
|
-
|
|
218
|
-
|
|
139
|
+
return segments;
|
|
140
|
+
};
|
|
141
|
+
var segmentHasToolMessages = (segment) => {
|
|
142
|
+
return segment.some((message2) => {
|
|
143
|
+
if (message2.role === toolMessageRole) return true;
|
|
144
|
+
if (message2.role === assistantMessageRole && Array.isArray(message2.content)) {
|
|
145
|
+
return message2.content.some((part) => part.type === "tool-call");
|
|
219
146
|
}
|
|
220
|
-
|
|
221
|
-
|
|
222
|
-
|
|
223
|
-
|
|
224
|
-
|
|
225
|
-
|
|
226
|
-
|
|
227
|
-
|
|
228
|
-
|
|
147
|
+
return false;
|
|
148
|
+
});
|
|
149
|
+
};
|
|
150
|
+
var reverseSegmentRoles = (segment) => {
|
|
151
|
+
return segment.map((message2) => {
|
|
152
|
+
const hasStringContent = typeof message2.content === "string";
|
|
153
|
+
if (!hasStringContent) return message2;
|
|
154
|
+
const roleMap = {
|
|
155
|
+
[userMessageRole]: assistantMessageRole,
|
|
156
|
+
[assistantMessageRole]: userMessageRole
|
|
157
|
+
};
|
|
158
|
+
const newRole = roleMap[message2.role];
|
|
159
|
+
if (!newRole) return message2;
|
|
160
|
+
return {
|
|
161
|
+
role: newRole,
|
|
162
|
+
content: message2.content
|
|
163
|
+
};
|
|
164
|
+
});
|
|
165
|
+
};
|
|
166
|
+
var messageRoleReversal = (messages) => {
|
|
167
|
+
const segments = groupMessagesByToolBoundaries(messages);
|
|
168
|
+
const processedSegments = segments.map(
|
|
169
|
+
(segment) => segmentHasToolMessages(segment) ? segment : reverseSegmentRoles(segment)
|
|
170
|
+
);
|
|
171
|
+
return processedSegments.flat();
|
|
172
|
+
};
|
|
173
|
+
var criterionToParamName = (criterion) => {
|
|
174
|
+
return criterion.replace(/"/g, "").replace(/[^a-zA-Z0-9]/g, "_").replace(/ /g, "_").toLowerCase().substring(0, 70);
|
|
175
|
+
};
|
|
176
|
+
|
|
177
|
+
// src/config/load.ts
|
|
178
|
+
var import_promises = __toESM(require("fs/promises"));
|
|
179
|
+
var import_node_path = __toESM(require("path"));
|
|
180
|
+
var import_node_url = require("url");
|
|
181
|
+
async function loadScenarioProjectConfig() {
|
|
182
|
+
const cwd = process.cwd();
|
|
183
|
+
const configNames = [
|
|
184
|
+
"scenario.config.js",
|
|
185
|
+
"scenario.config.mjs"
|
|
186
|
+
];
|
|
187
|
+
for (const name of configNames) {
|
|
188
|
+
const fullPath = import_node_path.default.join(cwd, name);
|
|
189
|
+
try {
|
|
190
|
+
await import_promises.default.access(fullPath);
|
|
191
|
+
const configModule = await import((0, import_node_url.pathToFileURL)(fullPath).href);
|
|
192
|
+
const config2 = configModule.default || configModule;
|
|
193
|
+
const parsed = scenarioProjectConfigSchema.safeParse(config2);
|
|
194
|
+
if (!parsed.success) {
|
|
195
|
+
throw new Error(
|
|
196
|
+
`Invalid config file ${name}: ${JSON.stringify(parsed.error.format(), null, 2)}`
|
|
197
|
+
);
|
|
198
|
+
}
|
|
199
|
+
return parsed.data;
|
|
200
|
+
} catch (error) {
|
|
201
|
+
if (error instanceof Error && "code" in error && error.code === "ENOENT") {
|
|
202
|
+
continue;
|
|
229
203
|
}
|
|
204
|
+
throw error;
|
|
230
205
|
}
|
|
231
|
-
return null;
|
|
232
|
-
}
|
|
233
|
-
addAgentTime(agentIdx, time) {
|
|
234
|
-
const currentTime = this._agentTimes.get(agentIdx) || 0;
|
|
235
|
-
this._agentTimes.set(agentIdx, currentTime + time);
|
|
236
|
-
}
|
|
237
|
-
hasResult() {
|
|
238
|
-
return this._partialResult !== null;
|
|
239
|
-
}
|
|
240
|
-
setResult(result) {
|
|
241
|
-
this._partialResult = result;
|
|
242
|
-
}
|
|
243
|
-
get lastMessage() {
|
|
244
|
-
return this._history[this._history.length - 1];
|
|
245
|
-
}
|
|
246
|
-
get lastUserMessage() {
|
|
247
|
-
return this._history.findLast((message2) => message2.role === "user");
|
|
248
|
-
}
|
|
249
|
-
get lastAssistantMessage() {
|
|
250
|
-
return this._history.findLast((message2) => message2.role === "assistant");
|
|
251
|
-
}
|
|
252
|
-
get lastToolCall() {
|
|
253
|
-
return this._history.findLast((message2) => message2.role === "tool");
|
|
254
|
-
}
|
|
255
|
-
getLastToolCallByToolName(toolName) {
|
|
256
|
-
const toolMessage = this._history.findLast(
|
|
257
|
-
(message2) => message2.role === "tool" && message2.content.find(
|
|
258
|
-
(part) => part.type === "tool-result" && part.toolName === toolName
|
|
259
|
-
)
|
|
260
|
-
);
|
|
261
|
-
return toolMessage;
|
|
262
|
-
}
|
|
263
|
-
hasToolCall(toolName) {
|
|
264
|
-
return this._history.some(
|
|
265
|
-
(message2) => message2.role === "tool" && message2.content.find(
|
|
266
|
-
(part) => part.type === "tool-result" && part.toolName === toolName
|
|
267
|
-
)
|
|
268
|
-
);
|
|
269
|
-
}
|
|
270
|
-
get history() {
|
|
271
|
-
return this._history;
|
|
272
|
-
}
|
|
273
|
-
get historyWithoutLastMessage() {
|
|
274
|
-
return this._history.slice(0, -1);
|
|
275
|
-
}
|
|
276
|
-
get historyWithoutLastUserMessage() {
|
|
277
|
-
const lastUserMessageIndex = this._history.findLastIndex((message2) => message2.role === "user");
|
|
278
|
-
if (lastUserMessageIndex === -1) return this._history;
|
|
279
|
-
return this._history.slice(0, lastUserMessageIndex);
|
|
280
|
-
}
|
|
281
|
-
get turn() {
|
|
282
|
-
return this._turn;
|
|
283
|
-
}
|
|
284
|
-
set turn(turn) {
|
|
285
|
-
this._turn = turn;
|
|
286
|
-
}
|
|
287
|
-
get threadId() {
|
|
288
|
-
return this._threadId;
|
|
289
|
-
}
|
|
290
|
-
get agents() {
|
|
291
|
-
return this._agents;
|
|
292
|
-
}
|
|
293
|
-
get pendingRolesOnTurn() {
|
|
294
|
-
return this._pendingRolesOnTurn;
|
|
295
|
-
}
|
|
296
|
-
set pendingRolesOnTurn(roles) {
|
|
297
|
-
this._pendingRolesOnTurn = roles;
|
|
298
|
-
}
|
|
299
|
-
get pendingAgentsOnTurn() {
|
|
300
|
-
return Array.from(this._pendingAgentsOnTurn);
|
|
301
|
-
}
|
|
302
|
-
set pendingAgentsOnTurn(agents) {
|
|
303
|
-
this._pendingAgentsOnTurn = new Set(agents);
|
|
304
|
-
}
|
|
305
|
-
get partialResult() {
|
|
306
|
-
return this._partialResult;
|
|
307
|
-
}
|
|
308
|
-
get totalTime() {
|
|
309
|
-
return Date.now() - this._totalStartTime;
|
|
310
|
-
}
|
|
311
|
-
get agentTimes() {
|
|
312
|
-
return new Map(this._agentTimes);
|
|
313
206
|
}
|
|
314
|
-
|
|
315
|
-
|
|
316
|
-
}
|
|
317
|
-
};
|
|
318
|
-
|
|
319
|
-
// src/events/schema.ts
|
|
320
|
-
var import_core = require("@ag-ui/core");
|
|
321
|
-
var import_zod2 = require("zod");
|
|
322
|
-
var ScenarioRunStatus = /* @__PURE__ */ ((ScenarioRunStatus2) => {
|
|
323
|
-
ScenarioRunStatus2["SUCCESS"] = "SUCCESS";
|
|
324
|
-
ScenarioRunStatus2["ERROR"] = "ERROR";
|
|
325
|
-
ScenarioRunStatus2["CANCELLED"] = "CANCELLED";
|
|
326
|
-
ScenarioRunStatus2["IN_PROGRESS"] = "IN_PROGRESS";
|
|
327
|
-
ScenarioRunStatus2["PENDING"] = "PENDING";
|
|
328
|
-
ScenarioRunStatus2["FAILED"] = "FAILED";
|
|
329
|
-
return ScenarioRunStatus2;
|
|
330
|
-
})(ScenarioRunStatus || {});
|
|
331
|
-
var baseEventSchema = import_zod2.z.object({
|
|
332
|
-
type: import_zod2.z.nativeEnum(import_core.EventType),
|
|
333
|
-
timestamp: import_zod2.z.number().optional(),
|
|
334
|
-
rawEvent: import_zod2.z.any().optional()
|
|
335
|
-
});
|
|
336
|
-
var baseScenarioEventSchema = baseEventSchema.extend({
|
|
337
|
-
batchRunId: import_zod2.z.string(),
|
|
338
|
-
scenarioId: import_zod2.z.string(),
|
|
339
|
-
scenarioRunId: import_zod2.z.string()
|
|
340
|
-
});
|
|
341
|
-
var scenarioRunStartedSchema = baseScenarioEventSchema.extend({
|
|
342
|
-
type: import_zod2.z.literal("SCENARIO_RUN_STARTED" /* RUN_STARTED */),
|
|
343
|
-
metadata: import_zod2.z.object({
|
|
344
|
-
name: import_zod2.z.string(),
|
|
345
|
-
description: import_zod2.z.string().optional()
|
|
346
|
-
// config: z.record(z.unknown()).optional(),
|
|
347
|
-
})
|
|
348
|
-
});
|
|
349
|
-
var scenarioRunFinishedSchema = baseScenarioEventSchema.extend({
|
|
350
|
-
type: import_zod2.z.literal("SCENARIO_RUN_FINISHED" /* RUN_FINISHED */),
|
|
351
|
-
status: import_zod2.z.nativeEnum(ScenarioRunStatus)
|
|
352
|
-
// error: z
|
|
353
|
-
// .object({
|
|
354
|
-
// message: z.string(),
|
|
355
|
-
// code: z.string().optional(),
|
|
356
|
-
// stack: z.string().optional(),
|
|
357
|
-
// })
|
|
358
|
-
// .optional(),
|
|
359
|
-
// metrics: z.record(z.number()).optional(),
|
|
360
|
-
});
|
|
361
|
-
var scenarioMessageSnapshotSchema = import_core.MessagesSnapshotEventSchema.merge(
|
|
362
|
-
baseScenarioEventSchema.extend({
|
|
363
|
-
type: import_zod2.z.literal("SCENARIO_MESSAGE_SNAPSHOT" /* MESSAGE_SNAPSHOT */)
|
|
364
|
-
})
|
|
365
|
-
);
|
|
366
|
-
var scenarioEventSchema = import_zod2.z.discriminatedUnion("type", [
|
|
367
|
-
scenarioRunStartedSchema,
|
|
368
|
-
scenarioRunFinishedSchema,
|
|
369
|
-
scenarioMessageSnapshotSchema
|
|
370
|
-
]);
|
|
371
|
-
var successSchema = import_zod2.z.object({ success: import_zod2.z.boolean() });
|
|
372
|
-
var errorSchema = import_zod2.z.object({ error: import_zod2.z.string() });
|
|
373
|
-
var stateSchema = import_zod2.z.object({
|
|
374
|
-
state: import_zod2.z.object({
|
|
375
|
-
messages: import_zod2.z.array(import_zod2.z.any()),
|
|
376
|
-
status: import_zod2.z.string()
|
|
377
|
-
})
|
|
378
|
-
});
|
|
379
|
-
var runsSchema = import_zod2.z.object({ runs: import_zod2.z.array(import_zod2.z.string()) });
|
|
380
|
-
var eventsSchema = import_zod2.z.object({ events: import_zod2.z.array(scenarioEventSchema) });
|
|
207
|
+
return await scenarioProjectConfigSchema.parseAsync({});
|
|
208
|
+
}
|
|
381
209
|
|
|
382
210
|
// src/utils/logger.ts
|
|
383
211
|
var Logger = class _Logger {
|
|
@@ -445,300 +273,645 @@ var Logger = class _Logger {
|
|
|
445
273
|
}
|
|
446
274
|
};
|
|
447
275
|
|
|
448
|
-
// src/
|
|
449
|
-
var
|
|
450
|
-
|
|
451
|
-
|
|
452
|
-
|
|
453
|
-
|
|
454
|
-
|
|
455
|
-
|
|
456
|
-
return [response];
|
|
457
|
-
return [];
|
|
458
|
-
}
|
|
459
|
-
var ScenarioExecution = class {
|
|
460
|
-
state = new ScenarioExecutionState();
|
|
461
|
-
eventSubject = new import_rxjs.Subject();
|
|
462
|
-
logger = new Logger("scenario.execution.ScenarioExecution");
|
|
463
|
-
config;
|
|
464
|
-
/**
|
|
465
|
-
* An observable stream of events that occur during the scenario execution.
|
|
466
|
-
* Subscribe to this to monitor the progress of the scenario in real-time.
|
|
467
|
-
*/
|
|
468
|
-
events$ = this.eventSubject.asObservable();
|
|
469
|
-
/**
|
|
470
|
-
* Creates a new ScenarioExecution instance.
|
|
471
|
-
* @param config The scenario configuration.
|
|
472
|
-
* @param script The script steps to execute.
|
|
473
|
-
*/
|
|
474
|
-
constructor(config2, script) {
|
|
475
|
-
this.config = {
|
|
476
|
-
id: config2.id ?? generateScenarioId(),
|
|
477
|
-
name: config2.name,
|
|
478
|
-
description: config2.description,
|
|
479
|
-
agents: config2.agents,
|
|
480
|
-
script,
|
|
481
|
-
verbose: config2.verbose ?? false,
|
|
482
|
-
maxTurns: config2.maxTurns ?? 10,
|
|
483
|
-
threadId: config2.threadId ?? generateThreadId()
|
|
484
|
-
};
|
|
485
|
-
this.reset();
|
|
486
|
-
}
|
|
487
|
-
/**
|
|
488
|
-
* The history of messages in the conversation.
|
|
489
|
-
*/
|
|
490
|
-
get history() {
|
|
491
|
-
return this.state.history;
|
|
276
|
+
// src/config/index.ts
|
|
277
|
+
var logger = new Logger("scenario.config");
|
|
278
|
+
var configLoaded = false;
|
|
279
|
+
var config = null;
|
|
280
|
+
var configLoadPromise = null;
|
|
281
|
+
async function loadProjectConfig() {
|
|
282
|
+
if (configLoaded) {
|
|
283
|
+
return;
|
|
492
284
|
}
|
|
493
|
-
|
|
494
|
-
|
|
495
|
-
*/
|
|
496
|
-
get threadId() {
|
|
497
|
-
return this.state.threadId;
|
|
285
|
+
if (configLoadPromise) {
|
|
286
|
+
return configLoadPromise;
|
|
498
287
|
}
|
|
499
|
-
|
|
500
|
-
* Executes the entire scenario from start to finish.
|
|
501
|
-
* This will run through the script and any automatic proceeding logic until a
|
|
502
|
-
* final result (success, failure, or error) is determined.
|
|
503
|
-
* @returns A promise that resolves with the final result of the scenario.
|
|
504
|
-
*/
|
|
505
|
-
async execute() {
|
|
506
|
-
this.reset();
|
|
507
|
-
const scenarioRunId = generateScenarioRunId();
|
|
508
|
-
this.emitRunStarted({ scenarioRunId });
|
|
288
|
+
configLoadPromise = (async () => {
|
|
509
289
|
try {
|
|
510
|
-
|
|
511
|
-
|
|
512
|
-
scriptStep
|
|
513
|
-
});
|
|
514
|
-
const result = await scriptStep(this.state, this);
|
|
515
|
-
this.emitMessageSnapshot({ scenarioRunId });
|
|
516
|
-
if (result && typeof result === "object" && "success" in result) {
|
|
517
|
-
this.emitRunFinished({
|
|
518
|
-
scenarioRunId,
|
|
519
|
-
status: result.success ? "SUCCESS" /* SUCCESS */ : "FAILED" /* FAILED */
|
|
520
|
-
});
|
|
521
|
-
return result;
|
|
522
|
-
}
|
|
523
|
-
}
|
|
524
|
-
this.emitRunFinished({ scenarioRunId, status: "FAILED" /* FAILED */ });
|
|
525
|
-
return this.reachedMaxTurns([
|
|
526
|
-
"Reached end of script without conclusion, add one of the following to the end of the script:",
|
|
527
|
-
"- `Scenario.proceed()` to let the simulation continue to play out",
|
|
528
|
-
"- `Scenario.judge()` to force criteria judgement",
|
|
529
|
-
"- `Scenario.succeed()` or `Scenario.fail()` to end the test with an explicit result"
|
|
530
|
-
].join("\n"));
|
|
290
|
+
config = await loadScenarioProjectConfig();
|
|
291
|
+
logger.info("loaded scenario project config", { config });
|
|
531
292
|
} catch (error) {
|
|
532
|
-
|
|
533
|
-
|
|
534
|
-
|
|
535
|
-
});
|
|
536
|
-
throw error;
|
|
537
|
-
}
|
|
538
|
-
}
|
|
539
|
-
/**
|
|
540
|
-
* Executes a single step in the scenario.
|
|
541
|
-
* A step usually corresponds to a single agent's turn. This method is useful
|
|
542
|
-
* for manually controlling the scenario's progress.
|
|
543
|
-
* @returns A promise that resolves with the new messages added during the step, or a final scenario result if the step concludes the scenario.
|
|
544
|
-
*/
|
|
545
|
-
async step() {
|
|
546
|
-
const result = await this._step();
|
|
547
|
-
if (result === null) throw new Error("No result from step");
|
|
548
|
-
return result;
|
|
549
|
-
}
|
|
550
|
-
async _step(goToNextTurn = true, onTurn) {
|
|
551
|
-
if (this.state.pendingRolesOnTurn.length === 0) {
|
|
552
|
-
if (!goToNextTurn) return null;
|
|
553
|
-
this.state.newTurn();
|
|
554
|
-
if (onTurn) await onTurn(this.state);
|
|
555
|
-
if (this.state.turn != null && this.state.turn >= this.config.maxTurns)
|
|
556
|
-
return this.reachedMaxTurns();
|
|
557
|
-
}
|
|
558
|
-
const currentRole = this.state.pendingRolesOnTurn[0];
|
|
559
|
-
const { idx, agent: nextAgent } = this.nextAgentForRole(currentRole);
|
|
560
|
-
if (!nextAgent) {
|
|
561
|
-
this.state.removePendingRole(currentRole);
|
|
562
|
-
return this._step(goToNextTurn, onTurn);
|
|
293
|
+
logger.error("error loading scenario project config", { error });
|
|
294
|
+
} finally {
|
|
295
|
+
configLoaded = true;
|
|
563
296
|
}
|
|
564
|
-
|
|
565
|
-
|
|
297
|
+
})();
|
|
298
|
+
return configLoadPromise;
|
|
299
|
+
}
|
|
300
|
+
async function getProjectConfig() {
|
|
301
|
+
await loadProjectConfig();
|
|
302
|
+
return config;
|
|
303
|
+
}
|
|
304
|
+
|
|
305
|
+
// src/utils/config.ts
|
|
306
|
+
function mergeConfig(config2, projectConfig) {
|
|
307
|
+
if (!projectConfig) {
|
|
308
|
+
return config2;
|
|
566
309
|
}
|
|
567
|
-
|
|
568
|
-
|
|
569
|
-
|
|
570
|
-
|
|
571
|
-
|
|
572
|
-
|
|
573
|
-
|
|
574
|
-
|
|
575
|
-
|
|
576
|
-
|
|
577
|
-
|
|
578
|
-
};
|
|
579
|
-
const agentResponse = await agent2.call(agentInput);
|
|
580
|
-
const endTime = Date.now();
|
|
581
|
-
this.state.addAgentTime(idx, endTime - startTime);
|
|
582
|
-
this.state.clearPendingMessages(idx);
|
|
583
|
-
if (typeof agentResponse === "object" && agentResponse && "success" in agentResponse) {
|
|
584
|
-
return agentResponse;
|
|
585
|
-
}
|
|
586
|
-
const messages = convertAgentReturnTypesToMessages(
|
|
587
|
-
agentResponse,
|
|
588
|
-
role === "User" /* USER */ ? "user" : "assistant"
|
|
589
|
-
);
|
|
590
|
-
this.state.addMessages(messages, idx);
|
|
591
|
-
return messages;
|
|
310
|
+
return {
|
|
311
|
+
...projectConfig.defaultModel,
|
|
312
|
+
...config2
|
|
313
|
+
};
|
|
314
|
+
}
|
|
315
|
+
function mergeAndValidateConfig(config2, projectConfig) {
|
|
316
|
+
var _a;
|
|
317
|
+
const mergedConfig = mergeConfig(config2, projectConfig);
|
|
318
|
+
mergedConfig.model = mergedConfig.model ?? ((_a = projectConfig == null ? void 0 : projectConfig.defaultModel) == null ? void 0 : _a.model);
|
|
319
|
+
if (!mergedConfig.model) {
|
|
320
|
+
throw new Error("Model is required");
|
|
592
321
|
}
|
|
593
|
-
|
|
594
|
-
|
|
595
|
-
|
|
596
|
-
|
|
322
|
+
return mergedConfig;
|
|
323
|
+
}
|
|
324
|
+
|
|
325
|
+
// src/agents/judge-agent.ts
|
|
326
|
+
function buildSystemPrompt(criteria, description) {
|
|
327
|
+
const criteriaList = (criteria == null ? void 0 : criteria.map((criterion, idx) => `${idx + 1}. ${criterion}`).join("\n")) || "No criteria provided";
|
|
328
|
+
return `
|
|
329
|
+
<role>
|
|
330
|
+
You are an LLM as a judge watching a simulated conversation as it plays out live to determine if the agent under test meets the criteria or not.
|
|
331
|
+
</role>
|
|
332
|
+
|
|
333
|
+
<goal>
|
|
334
|
+
Your goal is to determine if you already have enough information to make a verdict of the scenario below, or if the conversation should continue for longer.
|
|
335
|
+
If you do have enough information, use the finish_test tool to determine if all the criteria have been met, if not, use the continue_test tool to let the next step play out.
|
|
336
|
+
</goal>
|
|
337
|
+
|
|
338
|
+
<scenario>
|
|
339
|
+
${description}
|
|
340
|
+
</scenario>
|
|
341
|
+
|
|
342
|
+
<criteria>
|
|
343
|
+
${criteriaList}
|
|
344
|
+
</criteria>
|
|
345
|
+
|
|
346
|
+
<rules>
|
|
347
|
+
- Be strict, do not let the conversation continue if the agent already broke one of the "do not" or "should not" criteria.
|
|
348
|
+
- DO NOT make any judgment calls that are not explicitly listed in the success or failure criteria, withhold judgement if necessary
|
|
349
|
+
</rules>
|
|
350
|
+
`.trim();
|
|
351
|
+
}
|
|
352
|
+
function buildContinueTestTool() {
|
|
353
|
+
return (0, import_ai.tool)({
|
|
354
|
+
description: "Continue the test with the next step",
|
|
355
|
+
parameters: import_zod2.z.object({})
|
|
356
|
+
});
|
|
357
|
+
}
|
|
358
|
+
function buildFinishTestTool(criteria) {
|
|
359
|
+
const criteriaNames = criteria.map(criterionToParamName);
|
|
360
|
+
return (0, import_ai.tool)({
|
|
361
|
+
description: "Complete the test with a final verdict",
|
|
362
|
+
parameters: import_zod2.z.object({
|
|
363
|
+
criteria: import_zod2.z.object(
|
|
364
|
+
Object.fromEntries(
|
|
365
|
+
criteriaNames.map((name, idx) => [
|
|
366
|
+
name,
|
|
367
|
+
import_zod2.z.enum(["true", "false", "inconclusive"]).describe(criteria[idx])
|
|
368
|
+
])
|
|
369
|
+
)
|
|
370
|
+
).strict().describe("Strict verdict for each criterion"),
|
|
371
|
+
reasoning: import_zod2.z.string().describe("Explanation of what the final verdict should be"),
|
|
372
|
+
verdict: import_zod2.z.enum(["success", "failure", "inconclusive"]).describe("The final verdict of the test")
|
|
373
|
+
})
|
|
374
|
+
});
|
|
375
|
+
}
|
|
376
|
+
var judgeAgent = (cfg) => {
|
|
377
|
+
return {
|
|
378
|
+
role: "Judge" /* JUDGE */,
|
|
379
|
+
criteria: cfg.criteria,
|
|
380
|
+
call: async (input) => {
|
|
381
|
+
var _a;
|
|
382
|
+
const systemPrompt = cfg.systemPrompt ?? buildSystemPrompt(cfg.criteria, input.scenarioConfig.description);
|
|
383
|
+
const messages = [
|
|
384
|
+
{ role: "system", content: systemPrompt },
|
|
385
|
+
...input.messages
|
|
386
|
+
];
|
|
387
|
+
const isLastMessage = input.scenarioState.currentTurn === input.scenarioConfig.maxTurns;
|
|
388
|
+
const projectConfig = await getProjectConfig();
|
|
389
|
+
const mergedConfig = mergeAndValidateConfig(cfg, projectConfig);
|
|
390
|
+
if (!mergedConfig.model) {
|
|
391
|
+
throw new Error("Model is required for the judge agent");
|
|
597
392
|
}
|
|
598
|
-
|
|
599
|
-
|
|
600
|
-
|
|
601
|
-
|
|
602
|
-
|
|
603
|
-
|
|
604
|
-
|
|
605
|
-
|
|
606
|
-
|
|
607
|
-
|
|
608
|
-
|
|
609
|
-
|
|
610
|
-
|
|
611
|
-
|
|
612
|
-
totalTime: this.state.totalTime,
|
|
613
|
-
agentTime: totalAgentTime
|
|
614
|
-
};
|
|
615
|
-
}
|
|
616
|
-
getJudgeAgent() {
|
|
617
|
-
return this.state.agents.find((agent2) => agent2 instanceof JudgeAgentAdapter) ?? null;
|
|
618
|
-
}
|
|
619
|
-
consumeUntilRole(role) {
|
|
620
|
-
while (this.state.pendingRolesOnTurn.length > 0) {
|
|
621
|
-
const nextRole = this.state.pendingRolesOnTurn[0];
|
|
622
|
-
if (nextRole === role) break;
|
|
623
|
-
this.state.pendingRolesOnTurn.pop();
|
|
624
|
-
}
|
|
625
|
-
}
|
|
626
|
-
async scriptCallAgent(role, content, judgmentRequest = false) {
|
|
627
|
-
this.consumeUntilRole(role);
|
|
628
|
-
let index = -1;
|
|
629
|
-
let agent2 = null;
|
|
630
|
-
const nextAgent = this.state.getNextAgentForRole(role);
|
|
631
|
-
if (!nextAgent) {
|
|
632
|
-
this.state.newTurn();
|
|
633
|
-
this.consumeUntilRole(role);
|
|
634
|
-
const nextAgent2 = this.state.getNextAgentForRole(role);
|
|
635
|
-
if (!nextAgent2) {
|
|
636
|
-
let roleClass = "";
|
|
637
|
-
switch (role) {
|
|
638
|
-
case "User" /* USER */:
|
|
639
|
-
roleClass = "a scenario.userSimulatorAgent()";
|
|
640
|
-
break;
|
|
641
|
-
case "Agent" /* AGENT */:
|
|
642
|
-
roleClass = "a scenario.agent()";
|
|
643
|
-
break;
|
|
644
|
-
case "Judge" /* JUDGE */:
|
|
645
|
-
roleClass = "a scenario.judgeAgent()";
|
|
646
|
-
break;
|
|
647
|
-
default:
|
|
648
|
-
roleClass = "your agent";
|
|
649
|
-
}
|
|
650
|
-
if (content)
|
|
651
|
-
throw new Error(
|
|
652
|
-
`Cannot generate a message for role \`${role}\` with content \`${content}\` because no agent with this role was found, please add ${roleClass} to the scenario \`agents\` list`
|
|
653
|
-
);
|
|
654
|
-
throw new Error(
|
|
655
|
-
`Cannot generate a message for role \`${role}\` because no agent with this role was found, please add ${roleClass} to the scenario \`agents\` list`
|
|
656
|
-
);
|
|
393
|
+
const tools = {
|
|
394
|
+
continue_test: buildContinueTestTool(),
|
|
395
|
+
finish_test: buildFinishTestTool(cfg.criteria)
|
|
396
|
+
};
|
|
397
|
+
const enforceJudgement = input.judgmentRequest;
|
|
398
|
+
const hasCriteria = cfg.criteria.length && cfg.criteria.length > 0;
|
|
399
|
+
if (enforceJudgement && !hasCriteria) {
|
|
400
|
+
return {
|
|
401
|
+
success: false,
|
|
402
|
+
messages: [],
|
|
403
|
+
reasoning: "JudgeAgent: No criteria was provided to be judged against",
|
|
404
|
+
metCriteria: [],
|
|
405
|
+
unmetCriteria: []
|
|
406
|
+
};
|
|
657
407
|
}
|
|
658
|
-
|
|
659
|
-
|
|
660
|
-
|
|
661
|
-
|
|
662
|
-
|
|
663
|
-
|
|
664
|
-
|
|
665
|
-
|
|
666
|
-
|
|
667
|
-
|
|
668
|
-
|
|
669
|
-
|
|
670
|
-
|
|
671
|
-
|
|
672
|
-
|
|
673
|
-
|
|
408
|
+
const toolChoice = (isLastMessage || enforceJudgement) && hasCriteria ? { type: "tool", toolName: "finish_test" } : "required";
|
|
409
|
+
const completion = await (0, import_ai.generateText)({
|
|
410
|
+
model: mergedConfig.model,
|
|
411
|
+
messages,
|
|
412
|
+
temperature: mergedConfig.temperature ?? 0,
|
|
413
|
+
maxTokens: mergedConfig.maxTokens,
|
|
414
|
+
tools,
|
|
415
|
+
toolChoice
|
|
416
|
+
});
|
|
417
|
+
let args;
|
|
418
|
+
if ((_a = completion.toolCalls) == null ? void 0 : _a.length) {
|
|
419
|
+
const toolCall = completion.toolCalls[0];
|
|
420
|
+
switch (toolCall.toolName) {
|
|
421
|
+
case "finish_test": {
|
|
422
|
+
args = toolCall.args;
|
|
423
|
+
const verdict = args.verdict || "inconclusive";
|
|
424
|
+
const reasoning = args.reasoning || "No reasoning provided";
|
|
425
|
+
const criteria = args.criteria || {};
|
|
426
|
+
const criteriaValues = Object.values(criteria);
|
|
427
|
+
const metCriteria = cfg.criteria.filter((_, i) => criteriaValues[i] === "true");
|
|
428
|
+
const unmetCriteria = cfg.criteria.filter((_, i) => criteriaValues[i] !== "true");
|
|
429
|
+
return {
|
|
430
|
+
success: verdict === "success",
|
|
431
|
+
messages: input.messages,
|
|
432
|
+
reasoning,
|
|
433
|
+
metCriteria,
|
|
434
|
+
unmetCriteria
|
|
435
|
+
};
|
|
436
|
+
}
|
|
437
|
+
case "continue_test":
|
|
438
|
+
return [];
|
|
439
|
+
default:
|
|
440
|
+
return {
|
|
441
|
+
success: false,
|
|
442
|
+
messages: input.messages,
|
|
443
|
+
reasoning: `JudgeAgent: Unknown tool call: ${toolCall.toolName}`,
|
|
444
|
+
metCriteria: [],
|
|
445
|
+
unmetCriteria: cfg.criteria
|
|
446
|
+
};
|
|
447
|
+
}
|
|
674
448
|
}
|
|
675
|
-
return
|
|
449
|
+
return {
|
|
450
|
+
success: false,
|
|
451
|
+
messages: input.messages,
|
|
452
|
+
reasoning: `JudgeAgent: No tool call found in LLM output`,
|
|
453
|
+
metCriteria: [],
|
|
454
|
+
unmetCriteria: cfg.criteria
|
|
455
|
+
};
|
|
676
456
|
}
|
|
677
|
-
|
|
678
|
-
|
|
679
|
-
|
|
680
|
-
|
|
457
|
+
};
|
|
458
|
+
};
|
|
459
|
+
|
|
460
|
+
// src/agents/user-simulator-agent.ts
|
|
461
|
+
var import_ai2 = require("ai");
|
|
462
|
+
/**
 * Builds the system prompt given to the model that plays the simulated user.
 *
 * @param description - Scenario description, interpolated into the <scenario> section.
 * @returns The prompt text with leading and trailing whitespace removed.
 */
function buildSystemPrompt2(description) {
  const rawPrompt = `
<role>
You are pretending to be a user, you are testing an AI Agent (shown as the user role) based on a scenario.
Approach this naturally, as a human user would, with very short inputs, few words, all lowercase, imperative, not periods, like when they google or talk to chatgpt.
</role>

<goal>
Your goal (assistant) is to interact with the Agent Under Test (user) as if you were a human user to see if it can complete the scenario successfully.
</goal>

<scenario>
${description}
</scenario>

<rules>
- DO NOT carry over any requests yourself, YOU ARE NOT the assistant today, you are the user
</rules>
`;
  // The template starts and ends with a newline purely for readability; strip them.
  return rawPrompt.trim();
}
|
|
482
|
+
/**
 * Creates an agent with the USER role that asks an LLM to write the next user message.
 *
 * @param config2 - Optional per-agent settings; merged with the project config
 *   through `mergeAndValidateConfig` (defined elsewhere in this file).
 * @returns An agent object exposing `role` and an async `call(input)`.
 */
var userSimulatorAgent = (config2) => {
  const call = async (input) => {
    // Conversation sent to the model: system prompt, a canned greeting, then the history.
    const conversation = [
      { role: "system", content: buildSystemPrompt2(input.scenarioConfig.description) },
      { role: "assistant", content: "Hello, how can I help you today" },
      ...input.messages
    ];
    const projectConfig = await getProjectConfig();
    const mergedConfig = mergeAndValidateConfig(config2 ?? {}, projectConfig);
    if (!mergedConfig.model) {
      throw new Error("Model is required for the user simulator agent");
    }
    // NOTE(review): messageRoleReversal is defined elsewhere; presumably it swaps
    // user/assistant roles so the model answers in the user's voice — confirm.
    const { text: messageContent } = await (0, import_ai2.generateText)({
      model: mergedConfig.model,
      messages: messageRoleReversal(conversation),
      temperature: mergedConfig.temperature ?? 0,
      maxTokens: mergedConfig.maxTokens
    });
    if (!messageContent) {
      throw new Error("No response content from LLM");
    }
    return { role: "user", content: messageContent };
  };
  return { role: "User" /* USER */, call };
};
|
|
512
|
+
|
|
513
|
+
// src/execution/index.ts
|
|
514
|
+
var execution_exports = {};
|
|
515
|
+
__export(execution_exports, {
|
|
516
|
+
ScenarioExecution: () => ScenarioExecution,
|
|
517
|
+
ScenarioExecutionState: () => ScenarioExecutionState
|
|
518
|
+
});
|
|
519
|
+
|
|
520
|
+
// src/execution/scenario-execution.ts
|
|
521
|
+
var import_rxjs = require("rxjs");
|
|
522
|
+
|
|
523
|
+
// src/utils/ids.ts
|
|
524
|
+
var import_xksuid = require("xksuid");
|
|
525
|
+
var batchRunId = null;
|
|
526
|
+
/**
 * Creates a new conversation-thread identifier.
 *
 * @returns A string of the form `thread_<id>`, where `<id>` comes from xksuid's `generate()`.
 */
function generateThreadId() {
  const suffix = (0, import_xksuid.generate)();
  return `thread_${suffix}`;
}
|
|
529
|
+
/**
 * Creates a new identifier for a single scenario run.
 *
 * @returns A string of the form `scenariorun_<id>`, where `<id>` comes from xksuid's `generate()`.
 */
function generateScenarioRunId() {
  const suffix = (0, import_xksuid.generate)();
  return `scenariorun_${suffix}`;
}
|
|
532
|
+
/**
 * Creates a new scenario identifier.
 *
 * @returns A string of the form `scenario_<id>`, where `<id>` comes from xksuid's `generate()`.
 */
function generateScenarioId() {
  const suffix = (0, import_xksuid.generate)();
  return `scenario_${suffix}`;
}
|
|
535
|
+
/**
 * Returns the batch-run identifier, creating and caching it on first use.
 *
 * The value is taken from the SCENARIO_BATCH_RUN_ID environment variable when it
 * is set; otherwise a fresh `scenariobatchrun_<id>` is generated. The result is
 * stored in the module-level `batchRunId` so later calls return the same value.
 *
 * @returns The cached batch-run identifier.
 */
function getBatchRunId() {
  if (batchRunId) {
    return batchRunId;
  }
  batchRunId = process.env.SCENARIO_BATCH_RUN_ID ?? `scenariobatchrun_${(0, import_xksuid.generate)()}`;
  return batchRunId;
}
|
|
541
|
+
/**
 * Creates a new identifier for a message in the scenario history.
 *
 * @returns A string of the form `scenariomsg_<id>`, where `<id>` comes from xksuid's `generate()`.
 */
function generateMessageId() {
  const suffix = (0, import_xksuid.generate)();
  return `scenariomsg_${suffix}`;
}
|
|
544
|
+
|
|
545
|
+
// src/execution/scenario-execution-state.ts
|
|
546
|
+
/**
 * Mutable state of one scenario run: the message history, the current turn
 * counter and the thread id, plus the scenario config it was created from.
 */
var ScenarioExecutionState = class {
  _messages = [];
  _currentTurn = 0;
  _threadId = "";
  description;
  config;
  /**
   * @param config2 - Scenario configuration; its `description` is copied onto the state.
   */
  constructor(config2) {
    this.config = config2;
    this.description = config2.description;
  }
  /** The message history (the internal array itself, not a copy). */
  get messages() {
    return this._messages;
  }
  get currentTurn() {
    return this._currentTurn;
  }
  set currentTurn(turn) {
    this._currentTurn = turn;
  }
  get threadId() {
    return this._threadId;
  }
  set threadId(value) {
    this._threadId = value;
  }
  /**
   * Adds a message to the conversation history.
   *
   * @param message - The message to add; it is stored as a copy carrying a freshly generated `id`.
   */
  addMessage(message2) {
    const stamped = { ...message2, id: generateMessageId() };
    this._messages.push(stamped);
  }
  /**
   * @returns The most recent message.
   * @throws Error when the history is empty.
   */
  lastMessage() {
    const count = this._messages.length;
    if (count === 0) {
      throw new Error("No messages in history");
    }
    return this._messages[count - 1];
  }
  /**
   * @returns The most recent message whose role is "user".
   * @throws Error when the history is empty or contains no user message.
   */
  lastUserMessage() {
    if (this._messages.length === 0) {
      throw new Error("No messages in history");
    }
    const isFromUser = (candidate) => candidate.role === "user";
    const found = this._messages.findLast(isFromUser);
    if (!found) {
      throw new Error("No user message in history");
    }
    return found;
  }
  /**
   * Finds the most recent "tool" message holding a `tool-result` part for the given tool.
   *
   * @param toolName - Name of the tool to look for.
   * @returns The matching tool message.
   * @throws Error when the history is empty or no such message exists.
   */
  lastToolCall(toolName) {
    if (this._messages.length === 0) {
      throw new Error("No messages in history");
    }
    const carriesResult = (candidate) => candidate.role === "tool" && candidate.content.find(
      (part) => part.type === "tool-result" && part.toolName === toolName
    );
    const found = this._messages.findLast(carriesResult);
    if (!found) {
      throw new Error("No tool call message in history");
    }
    return found;
  }
  /**
   * @param toolName - Name of the tool to look for.
   * @returns Whether any "tool" message holds a `tool-result` part for that tool.
   */
  hasToolCall(toolName) {
    const carriesResult = (candidate) => candidate.role === "tool" && candidate.content.find(
      (part) => part.type === "tool-result" && part.toolName === toolName
    );
    return this._messages.some(carriesResult);
  }
};
|
|
615
|
+
|
|
616
|
+
// src/events/schema.ts
|
|
617
|
+
var import_core = require("@ag-ui/core");
|
|
618
|
+
var import_zod3 = require("zod");
|
|
619
|
+
// Possible verdicts reported in a scenario's results.
var Verdict = /* @__PURE__ */ ((members) => {
  Object.assign(members, {
    SUCCESS: "success",
    FAILURE: "failure",
    INCONCLUSIVE: "inconclusive"
  });
  return members;
})(Verdict || {});
|
|
625
|
+
// Status values carried by SCENARIO_RUN_FINISHED events.
var ScenarioRunStatus = /* @__PURE__ */ ((members) => {
  Object.assign(members, {
    SUCCESS: "SUCCESS",
    ERROR: "ERROR",
    CANCELLED: "CANCELLED",
    IN_PROGRESS: "IN_PROGRESS",
    PENDING: "PENDING",
    FAILED: "FAILED"
  });
  return members;
})(ScenarioRunStatus || {});
|
|
634
|
+
// Fields shared by every event: the event type, a numeric timestamp and an
// optional raw payload.
var baseEventSchema = import_zod3.z.object({
  type: import_zod3.z.nativeEnum(import_core.EventType),
  timestamp: import_zod3.z.number(),
  rawEvent: import_zod3.z.any().optional()
});

// Identifier schemas (all plain strings).
var batchRunIdSchema = import_zod3.z.string();
var scenarioRunIdSchema = import_zod3.z.string();
var scenarioIdSchema = import_zod3.z.string();

// Base for scenario events: adds the identifiers; `scenarioSetId` falls back to
// "default" when omitted.
var baseScenarioEventSchema = baseEventSchema.extend({
  batchRunId: batchRunIdSchema,
  scenarioId: scenarioIdSchema,
  scenarioRunId: scenarioRunIdSchema,
  scenarioSetId: import_zod3.z.string().optional().default("default")
});

// SCENARIO_RUN_STARTED: carries optional name/description metadata.
var scenarioRunStartedSchema = baseScenarioEventSchema.extend({
  type: import_zod3.z.literal("SCENARIO_RUN_STARTED" /* RUN_STARTED */),
  metadata: import_zod3.z.object({
    name: import_zod3.z.string().optional(),
    description: import_zod3.z.string().optional()
  })
});

// Outcome of a finished run.
var scenarioResultsSchema = import_zod3.z.object({
  verdict: import_zod3.z.nativeEnum(Verdict),
  reasoning: import_zod3.z.string().optional(),
  metCriteria: import_zod3.z.array(import_zod3.z.string()),
  unmetCriteria: import_zod3.z.array(import_zod3.z.string()),
  error: import_zod3.z.string().optional()
});

// SCENARIO_RUN_FINISHED: status plus optional/nullable results.
var scenarioRunFinishedSchema = baseScenarioEventSchema.extend({
  type: import_zod3.z.literal("SCENARIO_RUN_FINISHED" /* RUN_FINISHED */),
  status: import_zod3.z.nativeEnum(ScenarioRunStatus),
  results: scenarioResultsSchema.optional().nullable()
});

// SCENARIO_MESSAGE_SNAPSHOT: @ag-ui/core's messages-snapshot schema merged with
// the scenario base fields.
var scenarioMessageSnapshotSchema = import_core.MessagesSnapshotEventSchema.merge(
  baseScenarioEventSchema.extend({
    type: import_zod3.z.literal("SCENARIO_MESSAGE_SNAPSHOT" /* MESSAGE_SNAPSHOT */)
  })
);

// Union of all scenario events, discriminated on `type`.
var scenarioEventSchema = import_zod3.z.discriminatedUnion("type", [
  scenarioRunStartedSchema,
  scenarioRunFinishedSchema,
  scenarioMessageSnapshotSchema
]);

// Small response-shaped schemas.
var successSchema = import_zod3.z.object({ success: import_zod3.z.boolean() });
var errorSchema = import_zod3.z.object({ error: import_zod3.z.string() });
var stateSchema = import_zod3.z.object({
  state: import_zod3.z.object({
    messages: import_zod3.z.array(import_zod3.z.any()),
    status: import_zod3.z.string()
  })
});
var runsSchema = import_zod3.z.object({ runs: import_zod3.z.array(import_zod3.z.string()) });
var eventsSchema = import_zod3.z.object({ events: import_zod3.z.array(scenarioEventSchema) });
|
|
687
|
+
|
|
688
|
+
// src/execution/scenario-execution.ts
|
|
689
|
+
var batchRunId2 = getBatchRunId();
|
|
690
|
+
var ScenarioExecution = class {
|
|
691
|
+
state;
|
|
692
|
+
eventSubject = new import_rxjs.Subject();
|
|
693
|
+
logger = new Logger("scenario.execution.ScenarioExecution");
|
|
694
|
+
config;
|
|
695
|
+
agents = [];
|
|
696
|
+
pendingRolesOnTurn = [];
|
|
697
|
+
pendingAgentsOnTurn = /* @__PURE__ */ new Set();
|
|
698
|
+
pendingMessages = /* @__PURE__ */ new Map();
|
|
699
|
+
partialResult = null;
|
|
700
|
+
agentTimes = /* @__PURE__ */ new Map();
|
|
701
|
+
totalStartTime = 0;
|
|
696
702
|
/**
|
|
697
|
-
*
|
|
698
|
-
*
|
|
699
|
-
* If not, the user simulator agent is called to generate a message.
|
|
700
|
-
* This is part of the `ScenarioExecutionLike` interface used by script steps.
|
|
701
|
-
* @param content The optional content of the user's message.
|
|
703
|
+
* An observable stream of events that occur during the scenario execution.
|
|
704
|
+
* Subscribe to this to monitor the progress of the scenario in real-time.
|
|
702
705
|
*/
|
|
703
|
-
|
|
704
|
-
|
|
706
|
+
events$ = this.eventSubject.asObservable();
|
|
707
|
+
/**
|
|
708
|
+
* Creates a new ScenarioExecution instance.
|
|
709
|
+
* @param config The scenario configuration.
|
|
710
|
+
* @param script The script steps to execute.
|
|
711
|
+
*/
|
|
712
|
+
constructor(config2, script) {
|
|
713
|
+
this.config = {
|
|
714
|
+
id: config2.id ?? generateScenarioId(),
|
|
715
|
+
name: config2.name,
|
|
716
|
+
description: config2.description,
|
|
717
|
+
agents: config2.agents,
|
|
718
|
+
script,
|
|
719
|
+
verbose: config2.verbose ?? false,
|
|
720
|
+
maxTurns: config2.maxTurns ?? 10,
|
|
721
|
+
threadId: config2.threadId ?? generateThreadId(),
|
|
722
|
+
setId: config2.setId
|
|
723
|
+
};
|
|
724
|
+
this.state = new ScenarioExecutionState(this.config);
|
|
725
|
+
this.reset();
|
|
705
726
|
}
|
|
706
727
|
/**
|
|
707
|
-
*
|
|
708
|
-
* If content is provided, it's used as the agent's message.
|
|
709
|
-
* If not, the agent under test is called to generate a response.
|
|
710
|
-
* This is part of the `ScenarioExecutionLike` interface used by script steps.
|
|
711
|
-
* @param content The optional content of the agent's message.
|
|
728
|
+
* The history of messages in the conversation.
|
|
712
729
|
*/
|
|
713
|
-
|
|
714
|
-
|
|
730
|
+
get messages() {
|
|
731
|
+
return this.state.messages;
|
|
715
732
|
}
|
|
716
733
|
/**
|
|
717
|
-
*
|
|
718
|
-
* This is part of the `ScenarioExecutionLike` interface used by script steps.
|
|
719
|
-
* @param content Optional message to pass to the judge.
|
|
720
|
-
* @returns A promise that resolves with the scenario result if the judge makes a final decision, otherwise null.
|
|
734
|
+
* The unique identifier for the conversation thread.
|
|
721
735
|
*/
|
|
722
|
-
|
|
723
|
-
return
|
|
736
|
+
get threadId() {
|
|
737
|
+
return this.state.threadId;
|
|
724
738
|
}
|
|
725
739
|
/**
|
|
726
|
-
*
|
|
727
|
-
* This simulates the natural flow of conversation between agents.
|
|
728
|
-
* This is part of the `ScenarioExecutionLike` interface used by script steps.
|
|
729
|
-
* @param turns The number of turns to proceed. If undefined, runs until a conclusion or max turns is reached.
|
|
730
|
-
* @param onTurn A callback executed at the end of each turn.
|
|
731
|
-
* @param onStep A callback executed after each agent interaction.
|
|
732
|
-
* @returns A promise that resolves with the scenario result if a conclusion is reached.
|
|
740
|
+
* The total elapsed time for the scenario execution.
|
|
733
741
|
*/
|
|
734
|
-
|
|
735
|
-
|
|
736
|
-
|
|
737
|
-
|
|
738
|
-
|
|
739
|
-
|
|
740
|
-
|
|
741
|
-
|
|
742
|
+
get totalTime() {
|
|
743
|
+
return Date.now() - this.totalStartTime;
|
|
744
|
+
}
|
|
745
|
+
/**
|
|
746
|
+
* Executes the entire scenario from start to finish.
|
|
747
|
+
* This will run through the script and any automatic proceeding logic until a
|
|
748
|
+
* final result (success, failure, or error) is determined.
|
|
749
|
+
* @returns A promise that resolves with the final result of the scenario.
|
|
750
|
+
*/
|
|
751
|
+
async execute() {
|
|
752
|
+
this.reset();
|
|
753
|
+
const scenarioRunId = generateScenarioRunId();
|
|
754
|
+
this.emitRunStarted({ scenarioRunId });
|
|
755
|
+
try {
|
|
756
|
+
for (const scriptStep of this.config.script) {
|
|
757
|
+
this.logger.debug(`[${this.config.id}] Executing script step`, {
|
|
758
|
+
scriptStep
|
|
759
|
+
});
|
|
760
|
+
const result = await scriptStep(this.state, this);
|
|
761
|
+
this.emitMessageSnapshot({ scenarioRunId });
|
|
762
|
+
if (result && typeof result === "object" && "success" in result) {
|
|
763
|
+
this.emitRunFinished({
|
|
764
|
+
scenarioRunId,
|
|
765
|
+
status: result.success ? "SUCCESS" /* SUCCESS */ : "FAILED" /* FAILED */,
|
|
766
|
+
result
|
|
767
|
+
});
|
|
768
|
+
return result;
|
|
769
|
+
}
|
|
770
|
+
}
|
|
771
|
+
this.emitRunFinished({ scenarioRunId, status: "FAILED" /* FAILED */ });
|
|
772
|
+
return this.reachedMaxTurns([
|
|
773
|
+
"Reached end of script without conclusion, add one of the following to the end of the script:",
|
|
774
|
+
"- `Scenario.proceed()` to let the simulation continue to play out",
|
|
775
|
+
"- `Scenario.judge()` to force criteria judgement",
|
|
776
|
+
"- `Scenario.succeed()` or `Scenario.fail()` to end the test with an explicit result"
|
|
777
|
+
].join("\n"));
|
|
778
|
+
} catch (error) {
|
|
779
|
+
const errorResult = {
|
|
780
|
+
success: false,
|
|
781
|
+
messages: this.state.messages,
|
|
782
|
+
reasoning: `Scenario failed with error: ${error instanceof Error ? error.message : String(error)}`,
|
|
783
|
+
metCriteria: [],
|
|
784
|
+
unmetCriteria: [],
|
|
785
|
+
error: error instanceof Error ? error.message : String(error)
|
|
786
|
+
};
|
|
787
|
+
this.emitRunFinished({
|
|
788
|
+
scenarioRunId,
|
|
789
|
+
status: "ERROR" /* ERROR */,
|
|
790
|
+
result: errorResult
|
|
791
|
+
});
|
|
792
|
+
return errorResult;
|
|
793
|
+
}
|
|
794
|
+
}
|
|
795
|
+
/**
|
|
796
|
+
* Executes a single step in the scenario.
|
|
797
|
+
* A step usually corresponds to a single agent's turn. This method is useful
|
|
798
|
+
* for manually controlling the scenario's progress.
|
|
799
|
+
* @returns A promise that resolves with the new messages added during the step, or a final scenario result if the step concludes the scenario.
|
|
800
|
+
*/
|
|
801
|
+
async step() {
|
|
802
|
+
const result = await this._step();
|
|
803
|
+
if (result === null) throw new Error("No result from step");
|
|
804
|
+
return result;
|
|
805
|
+
}
|
|
806
|
+
async _step(goToNextTurn = true, onTurn) {
|
|
807
|
+
if (this.pendingRolesOnTurn.length === 0) {
|
|
808
|
+
if (!goToNextTurn) return null;
|
|
809
|
+
this.newTurn();
|
|
810
|
+
if (onTurn) await onTurn(this.state);
|
|
811
|
+
if (this.state.currentTurn >= this.config.maxTurns)
|
|
812
|
+
return this.reachedMaxTurns();
|
|
813
|
+
}
|
|
814
|
+
const currentRole = this.pendingRolesOnTurn[0];
|
|
815
|
+
const { idx, agent: nextAgent } = this.nextAgentForRole(currentRole);
|
|
816
|
+
if (!nextAgent) {
|
|
817
|
+
this.removePendingRole(currentRole);
|
|
818
|
+
return this._step(goToNextTurn, onTurn);
|
|
819
|
+
}
|
|
820
|
+
this.removePendingAgent(nextAgent);
|
|
821
|
+
return await this.callAgent(idx, currentRole);
|
|
822
|
+
}
|
|
823
|
+
async callAgent(idx, role, judgmentRequest = false) {
|
|
824
|
+
const agent2 = this.agents[idx];
|
|
825
|
+
const startTime = Date.now();
|
|
826
|
+
const agentInput = {
|
|
827
|
+
threadId: this.state.threadId,
|
|
828
|
+
messages: this.state.messages,
|
|
829
|
+
newMessages: this.pendingMessages.get(idx) ?? [],
|
|
830
|
+
requestedRole: role,
|
|
831
|
+
judgmentRequest,
|
|
832
|
+
scenarioState: this.state,
|
|
833
|
+
scenarioConfig: this.config
|
|
834
|
+
};
|
|
835
|
+
const agentResponse = await agent2.call(agentInput);
|
|
836
|
+
const endTime = Date.now();
|
|
837
|
+
this.addAgentTime(idx, endTime - startTime);
|
|
838
|
+
this.pendingMessages.delete(idx);
|
|
839
|
+
if (agentResponse && typeof agentResponse === "object" && "success" in agentResponse) {
|
|
840
|
+
return agentResponse;
|
|
841
|
+
}
|
|
842
|
+
const currentAgentTime = this.agentTimes.get(idx) ?? 0;
|
|
843
|
+
this.agentTimes.set(idx, currentAgentTime + (Date.now() - startTime));
|
|
844
|
+
const messages = convertAgentReturnTypesToMessages(
|
|
845
|
+
agentResponse,
|
|
846
|
+
role === "User" /* USER */ ? "user" : "assistant"
|
|
847
|
+
);
|
|
848
|
+
for (const message2 of messages) {
|
|
849
|
+
this.state.addMessage(message2);
|
|
850
|
+
this.broadcastMessage(message2, idx);
|
|
851
|
+
}
|
|
852
|
+
return messages;
|
|
853
|
+
}
|
|
854
|
+
/**
|
|
855
|
+
* Adds a message to the conversation history.
|
|
856
|
+
* This is part of the `ScenarioExecutionLike` interface used by script steps.
|
|
857
|
+
* @param message The message to add.
|
|
858
|
+
*/
|
|
859
|
+
async message(message2) {
|
|
860
|
+
if (message2.role === "user") {
|
|
861
|
+
await this.scriptCallAgent("User" /* USER */, message2);
|
|
862
|
+
} else if (message2.role === "assistant") {
|
|
863
|
+
await this.scriptCallAgent("Agent" /* AGENT */, message2);
|
|
864
|
+
} else {
|
|
865
|
+
this.state.addMessage(message2);
|
|
866
|
+
this.broadcastMessage(message2);
|
|
867
|
+
}
|
|
868
|
+
}
|
|
869
|
+
/**
|
|
870
|
+
* Executes a user turn.
|
|
871
|
+
* If content is provided, it's used as the user's message.
|
|
872
|
+
* If not, the user simulator agent is called to generate a message.
|
|
873
|
+
* This is part of the `ScenarioExecutionLike` interface used by script steps.
|
|
874
|
+
* @param content The optional content of the user's message.
|
|
875
|
+
*/
|
|
876
|
+
async user(content) {
|
|
877
|
+
await this.scriptCallAgent("User" /* USER */, content);
|
|
878
|
+
}
|
|
879
|
+
/**
|
|
880
|
+
* Executes an agent turn.
|
|
881
|
+
* If content is provided, it's used as the agent's message.
|
|
882
|
+
* If not, the agent under test is called to generate a response.
|
|
883
|
+
* This is part of the `ScenarioExecutionLike` interface used by script steps.
|
|
884
|
+
* @param content The optional content of the agent's message.
|
|
885
|
+
*/
|
|
886
|
+
async agent(content) {
|
|
887
|
+
await this.scriptCallAgent("Agent" /* AGENT */, content);
|
|
888
|
+
}
|
|
889
|
+
/**
|
|
890
|
+
* Invokes the judge agent to evaluate the current state of the conversation.
|
|
891
|
+
* This is part of the `ScenarioExecutionLike` interface used by script steps.
|
|
892
|
+
* @param content Optional message to pass to the judge.
|
|
893
|
+
* @returns A promise that resolves with the scenario result if the judge makes a final decision, otherwise null.
|
|
894
|
+
*/
|
|
895
|
+
async judge(content) {
|
|
896
|
+
return await this.scriptCallAgent("Judge" /* JUDGE */, content, true);
|
|
897
|
+
}
|
|
898
|
+
/**
|
|
899
|
+
* Lets the scenario proceed automatically for a specified number of turns.
|
|
900
|
+
* This simulates the natural flow of conversation between agents.
|
|
901
|
+
* This is part of the `ScenarioExecutionLike` interface used by script steps.
|
|
902
|
+
* @param turns The number of turns to proceed. If undefined, runs until a conclusion or max turns is reached.
|
|
903
|
+
* @param onTurn A callback executed at the end of each turn.
|
|
904
|
+
* @param onStep A callback executed after each agent interaction.
|
|
905
|
+
* @returns A promise that resolves with the scenario result if a conclusion is reached.
|
|
906
|
+
*/
|
|
907
|
+
async proceed(turns, onTurn, onStep) {
|
|
908
|
+
let initialTurn = this.state.currentTurn;
|
|
909
|
+
while (true) {
|
|
910
|
+
const goToNextTurn = turns === void 0 || initialTurn === null || this.state.currentTurn != null && this.state.currentTurn + 1 < initialTurn + turns;
|
|
911
|
+
const nextMessage = await this._step(goToNextTurn, onTurn);
|
|
912
|
+
if (initialTurn === null)
|
|
913
|
+
initialTurn = this.state.currentTurn;
|
|
914
|
+
if (nextMessage === null) {
|
|
742
915
|
return null;
|
|
743
916
|
}
|
|
744
917
|
if (onStep) await onStep(this.state);
|
|
@@ -755,10 +928,10 @@ var ScenarioExecution = class {
|
|
|
755
928
|
async succeed(reasoning) {
|
|
756
929
|
return {
|
|
757
930
|
success: true,
|
|
758
|
-
messages: this.state.
|
|
931
|
+
messages: this.state.messages,
|
|
759
932
|
reasoning: reasoning || "Scenario marked as successful with Scenario.succeed()",
|
|
760
|
-
|
|
761
|
-
|
|
933
|
+
metCriteria: [],
|
|
934
|
+
unmetCriteria: []
|
|
762
935
|
};
|
|
763
936
|
}
|
|
764
937
|
/**
|
|
@@ -770,25 +943,147 @@ var ScenarioExecution = class {
|
|
|
770
943
|
async fail(reasoning) {
|
|
771
944
|
return {
|
|
772
945
|
success: false,
|
|
773
|
-
messages: this.state.
|
|
946
|
+
messages: this.state.messages,
|
|
774
947
|
reasoning: reasoning || "Scenario marked as failed with Scenario.fail()",
|
|
775
|
-
|
|
776
|
-
|
|
948
|
+
metCriteria: [],
|
|
949
|
+
unmetCriteria: []
|
|
777
950
|
};
|
|
778
951
|
}
|
|
952
|
+
addAgentTime(agentIdx, time) {
|
|
953
|
+
const currentTime = this.agentTimes.get(agentIdx) || 0;
|
|
954
|
+
this.agentTimes.set(agentIdx, currentTime + time);
|
|
955
|
+
}
|
|
956
|
+
hasResult() {
|
|
957
|
+
return this.partialResult !== null;
|
|
958
|
+
}
|
|
959
|
+
setResult(result) {
|
|
960
|
+
this.partialResult = result;
|
|
961
|
+
}
|
|
962
|
+
async scriptCallAgent(role, content, judgmentRequest = false) {
|
|
963
|
+
this.consumeUntilRole(role);
|
|
964
|
+
let index = -1;
|
|
965
|
+
let agent2 = null;
|
|
966
|
+
let nextAgent = this.getNextAgentForRole(role);
|
|
967
|
+
if (!nextAgent) {
|
|
968
|
+
this.newTurn();
|
|
969
|
+
this.consumeUntilRole(role);
|
|
970
|
+
nextAgent = this.getNextAgentForRole(role);
|
|
971
|
+
}
|
|
972
|
+
if (!nextAgent) {
|
|
973
|
+
let roleClass = "";
|
|
974
|
+
switch (role) {
|
|
975
|
+
case "User" /* USER */:
|
|
976
|
+
roleClass = "a scenario.userSimulatorAgent()";
|
|
977
|
+
break;
|
|
978
|
+
case "Agent" /* AGENT */:
|
|
979
|
+
roleClass = "a scenario.agent()";
|
|
980
|
+
break;
|
|
981
|
+
case "Judge" /* JUDGE */:
|
|
982
|
+
roleClass = "a scenario.judgeAgent()";
|
|
983
|
+
break;
|
|
984
|
+
default:
|
|
985
|
+
roleClass = "your agent";
|
|
986
|
+
}
|
|
987
|
+
if (content)
|
|
988
|
+
throw new Error(
|
|
989
|
+
`Cannot generate a message for role \`${role}\` with content \`${content}\` because no agent with this role was found, please add ${roleClass} to the scenario \`agents\` list`
|
|
990
|
+
);
|
|
991
|
+
throw new Error(
|
|
992
|
+
`Cannot generate a message for role \`${role}\` because no agent with this role was found, please add ${roleClass} to the scenario \`agents\` list`
|
|
993
|
+
);
|
|
994
|
+
}
|
|
995
|
+
index = nextAgent.index;
|
|
996
|
+
agent2 = nextAgent.agent;
|
|
997
|
+
this.removePendingAgent(agent2);
|
|
998
|
+
if (content) {
|
|
999
|
+
const message2 = typeof content === "string" ? { role: role === "User" /* USER */ ? "user" : "assistant", content } : content;
|
|
1000
|
+
this.state.addMessage(message2);
|
|
1001
|
+
this.broadcastMessage(message2, index);
|
|
1002
|
+
return null;
|
|
1003
|
+
}
|
|
1004
|
+
const result = await this.callAgent(index, role, judgmentRequest);
|
|
1005
|
+
if (result && typeof result === "object" && "success" in result) {
|
|
1006
|
+
return result;
|
|
1007
|
+
}
|
|
1008
|
+
return null;
|
|
1009
|
+
}
|
|
779
1010
|
reset() {
|
|
780
|
-
this.state = new ScenarioExecutionState();
|
|
781
|
-
this.state.
|
|
782
|
-
this.
|
|
783
|
-
this.
|
|
784
|
-
this.state.
|
|
785
|
-
|
|
786
|
-
|
|
787
|
-
|
|
788
|
-
|
|
789
|
-
|
|
790
|
-
|
|
791
|
-
|
|
1011
|
+
this.state = new ScenarioExecutionState(this.config);
|
|
1012
|
+
this.state.threadId = this.config.threadId || generateThreadId();
|
|
1013
|
+
this.setAgents(this.config.agents);
|
|
1014
|
+
this.newTurn();
|
|
1015
|
+
this.state.currentTurn = 0;
|
|
1016
|
+
this.totalStartTime = Date.now();
|
|
1017
|
+
this.pendingMessages.clear();
|
|
1018
|
+
}
|
|
1019
|
+
nextAgentForRole(role) {
|
|
1020
|
+
for (const agent2 of this.agents) {
|
|
1021
|
+
if (agent2.role === role && this.pendingAgentsOnTurn.has(agent2) && this.pendingRolesOnTurn.includes(role)) {
|
|
1022
|
+
return { idx: this.agents.indexOf(agent2), agent: agent2 };
|
|
1023
|
+
}
|
|
1024
|
+
}
|
|
1025
|
+
return { idx: -1, agent: null };
|
|
1026
|
+
}
|
|
1027
|
+
newTurn() {
|
|
1028
|
+
this.pendingAgentsOnTurn = new Set(this.agents);
|
|
1029
|
+
this.pendingRolesOnTurn = [
|
|
1030
|
+
"User" /* USER */,
|
|
1031
|
+
"Agent" /* AGENT */,
|
|
1032
|
+
"Judge" /* JUDGE */
|
|
1033
|
+
];
|
|
1034
|
+
if (this.state.currentTurn === null) {
|
|
1035
|
+
this.state.currentTurn = 1;
|
|
1036
|
+
} else {
|
|
1037
|
+
this.state.currentTurn++;
|
|
1038
|
+
}
|
|
1039
|
+
}
|
|
1040
|
+
removePendingRole(role) {
|
|
1041
|
+
const index = this.pendingRolesOnTurn.indexOf(role);
|
|
1042
|
+
if (index > -1) {
|
|
1043
|
+
this.pendingRolesOnTurn.splice(index, 1);
|
|
1044
|
+
}
|
|
1045
|
+
}
|
|
1046
|
+
removePendingAgent(agent2) {
|
|
1047
|
+
this.pendingAgentsOnTurn.delete(agent2);
|
|
1048
|
+
}
|
|
1049
|
+
getNextAgentForRole(role) {
|
|
1050
|
+
for (let i = 0; i < this.agents.length; i++) {
|
|
1051
|
+
const agent2 = this.agents[i];
|
|
1052
|
+
if (agent2.role === role && this.pendingAgentsOnTurn.has(agent2)) {
|
|
1053
|
+
return { index: i, agent: agent2 };
|
|
1054
|
+
}
|
|
1055
|
+
}
|
|
1056
|
+
return null;
|
|
1057
|
+
}
|
|
1058
|
+
setAgents(agents) {
|
|
1059
|
+
this.agents = agents;
|
|
1060
|
+
this.agentTimes.clear();
|
|
1061
|
+
}
|
|
1062
|
+
consumeUntilRole(role) {
|
|
1063
|
+
while (this.pendingRolesOnTurn.length > 0) {
|
|
1064
|
+
const nextRole = this.pendingRolesOnTurn[0];
|
|
1065
|
+
if (nextRole === role) break;
|
|
1066
|
+
this.pendingRolesOnTurn.pop();
|
|
1067
|
+
}
|
|
1068
|
+
}
|
|
1069
|
+
reachedMaxTurns(errorMessage) {
|
|
1070
|
+
var _a;
|
|
1071
|
+
const agentRoleAgentsIdx = this.agents.map((agent2, i) => ({ agent: agent2, idx: i })).filter(({ agent: agent2 }) => agent2.role === "Agent" /* AGENT */).map(({ idx }) => idx);
|
|
1072
|
+
const agentTimes = agentRoleAgentsIdx.map((i) => this.agentTimes.get(i) || 0);
|
|
1073
|
+
const totalAgentTime = agentTimes.reduce((sum, time) => sum + time, 0);
|
|
1074
|
+
return {
|
|
1075
|
+
success: false,
|
|
1076
|
+
messages: this.state.messages,
|
|
1077
|
+
reasoning: errorMessage || `Reached maximum turns (${this.config.maxTurns || 10}) without conclusion`,
|
|
1078
|
+
metCriteria: [],
|
|
1079
|
+
unmetCriteria: ((_a = this.getJudgeAgent()) == null ? void 0 : _a.criteria) ?? [],
|
|
1080
|
+
totalTime: this.totalTime,
|
|
1081
|
+
agentTime: totalAgentTime
|
|
1082
|
+
};
|
|
1083
|
+
}
|
|
1084
|
+
getJudgeAgent() {
|
|
1085
|
+
return this.agents.find((agent2) => agent2 instanceof JudgeAgentAdapter) ?? null;
|
|
1086
|
+
}
|
|
792
1087
|
/**
|
|
793
1088
|
* Emits an event to the event stream for external consumption.
|
|
794
1089
|
*/
|
|
@@ -800,11 +1095,13 @@ var ScenarioExecution = class {
|
|
|
800
1095
|
*/
|
|
801
1096
|
makeBaseEvent({ scenarioRunId }) {
|
|
802
1097
|
return {
|
|
1098
|
+
type: "placeholder",
|
|
1099
|
+
// This will be replaced by the specific event type
|
|
1100
|
+
timestamp: Date.now(),
|
|
803
1101
|
batchRunId: batchRunId2,
|
|
804
1102
|
scenarioId: this.config.id,
|
|
805
1103
|
scenarioRunId,
|
|
806
|
-
|
|
807
|
-
rawEvent: void 0
|
|
1104
|
+
scenarioSetId: this.config.setId
|
|
808
1105
|
};
|
|
809
1106
|
}
|
|
810
1107
|
/**
|
|
@@ -827,7 +1124,7 @@ var ScenarioExecution = class {
|
|
|
827
1124
|
this.emitEvent({
|
|
828
1125
|
...this.makeBaseEvent({ scenarioRunId }),
|
|
829
1126
|
type: "SCENARIO_MESSAGE_SNAPSHOT" /* MESSAGE_SNAPSHOT */,
|
|
830
|
-
messages: this.state.
|
|
1127
|
+
messages: this.state.messages
|
|
831
1128
|
// Add any other required fields from MessagesSnapshotEventSchema
|
|
832
1129
|
});
|
|
833
1130
|
}
|
|
@@ -836,53 +1133,60 @@ var ScenarioExecution = class {
|
|
|
836
1133
|
*/
|
|
837
1134
|
emitRunFinished({
|
|
838
1135
|
scenarioRunId,
|
|
839
|
-
status
|
|
1136
|
+
status,
|
|
1137
|
+
result
|
|
840
1138
|
}) {
|
|
841
|
-
|
|
1139
|
+
const event = {
|
|
842
1140
|
...this.makeBaseEvent({ scenarioRunId }),
|
|
1141
|
+
scenarioSetId: this.config.setId ?? "default",
|
|
843
1142
|
type: "SCENARIO_RUN_FINISHED" /* RUN_FINISHED */,
|
|
844
|
-
status
|
|
845
|
-
|
|
846
|
-
|
|
847
|
-
|
|
848
|
-
|
|
849
|
-
|
|
850
|
-
|
|
851
|
-
var import_promises = __toESM(require("fs/promises"));
|
|
852
|
-
var import_node_path = __toESM(require("path"));
|
|
853
|
-
var import_node_url = require("url");
|
|
854
|
-
async function loadScenarioProjectConfig() {
|
|
855
|
-
const cwd = process.cwd();
|
|
856
|
-
const configNames = [
|
|
857
|
-
"scenario.config.js",
|
|
858
|
-
"scenario.config.mjs"
|
|
859
|
-
];
|
|
860
|
-
for (const name of configNames) {
|
|
861
|
-
const fullPath = import_node_path.default.join(cwd, name);
|
|
862
|
-
try {
|
|
863
|
-
await import_promises.default.access(fullPath);
|
|
864
|
-
const configModule = await import((0, import_node_url.pathToFileURL)(fullPath).href);
|
|
865
|
-
const config2 = configModule.default || configModule;
|
|
866
|
-
const parsed = scenarioProjectConfigSchema.safeParse(config2);
|
|
867
|
-
if (!parsed.success) {
|
|
868
|
-
throw new Error(
|
|
869
|
-
`Invalid config file ${name}: ${JSON.stringify(parsed.error.format(), null, 2)}`
|
|
870
|
-
);
|
|
1143
|
+
status,
|
|
1144
|
+
results: {
|
|
1145
|
+
verdict: (result == null ? void 0 : result.success) ? "success" /* SUCCESS */ : "failure" /* FAILURE */,
|
|
1146
|
+
metCriteria: (result == null ? void 0 : result.metCriteria) ?? [],
|
|
1147
|
+
unmetCriteria: (result == null ? void 0 : result.unmetCriteria) ?? [],
|
|
1148
|
+
reasoning: result == null ? void 0 : result.reasoning,
|
|
1149
|
+
error: result == null ? void 0 : result.error
|
|
871
1150
|
}
|
|
872
|
-
|
|
873
|
-
|
|
874
|
-
|
|
875
|
-
|
|
1151
|
+
};
|
|
1152
|
+
this.emitEvent(event);
|
|
1153
|
+
this.eventSubject.complete();
|
|
1154
|
+
}
|
|
1155
|
+
/**
|
|
1156
|
+
* Distributes a message to all other agents in the scenario.
|
|
1157
|
+
*
|
|
1158
|
+
* @param message - The message to broadcast.
|
|
1159
|
+
* @param fromAgentIdx - The index of the agent that sent the message, to avoid echoing.
|
|
1160
|
+
*/
|
|
1161
|
+
broadcastMessage(message2, fromAgentIdx) {
|
|
1162
|
+
for (let idx = 0; idx < this.agents.length; idx++) {
|
|
1163
|
+
if (idx === fromAgentIdx) continue;
|
|
1164
|
+
if (!this.pendingMessages.has(idx)) {
|
|
1165
|
+
this.pendingMessages.set(idx, []);
|
|
876
1166
|
}
|
|
877
|
-
|
|
1167
|
+
this.pendingMessages.get(idx).push(message2);
|
|
878
1168
|
}
|
|
879
1169
|
}
|
|
880
|
-
|
|
881
|
-
|
|
882
|
-
|
|
883
|
-
|
|
884
|
-
|
|
885
|
-
|
|
1170
|
+
};
|
|
1171
|
+
function convertAgentReturnTypesToMessages(response, role) {
|
|
1172
|
+
if (typeof response === "string")
|
|
1173
|
+
return [{ role, content: response }];
|
|
1174
|
+
if (Array.isArray(response))
|
|
1175
|
+
return response;
|
|
1176
|
+
if (typeof response === "object" && "role" in response)
|
|
1177
|
+
return [response];
|
|
1178
|
+
return [];
|
|
1179
|
+
}
|
|
1180
|
+
|
|
1181
|
+
// src/runner/index.ts
|
|
1182
|
+
var runner_exports = {};
|
|
1183
|
+
__export(runner_exports, {
|
|
1184
|
+
run: () => run
|
|
1185
|
+
});
|
|
1186
|
+
|
|
1187
|
+
// src/events/event-bus.ts
|
|
1188
|
+
var import_rxjs2 = require("rxjs");
|
|
1189
|
+
|
|
886
1190
|
// src/events/event-reporter.ts
|
|
887
1191
|
var EventReporter = class {
|
|
888
1192
|
eventsEndpoint;
|
|
@@ -892,16 +1196,16 @@ var EventReporter = class {
|
|
|
892
1196
|
this.eventsEndpoint = new URL("/api/scenario-events", config2.endpoint);
|
|
893
1197
|
this.apiKey = config2.apiKey ?? "";
|
|
894
1198
|
if (!process.env.SCENARIO_DISABLE_SIMULATION_REPORT_INFO) {
|
|
895
|
-
console.log("=== Scenario Simulation Reporting ===");
|
|
896
1199
|
if (!this.apiKey) {
|
|
897
|
-
console.
|
|
898
|
-
|
|
1200
|
+
console.log(
|
|
1201
|
+
"\u27A1\uFE0F LangWatch API key not configured, simulations will only output the final result"
|
|
1202
|
+
);
|
|
1203
|
+
console.log(
|
|
1204
|
+
"To visualize the conversations in real time, configure your LangWatch API key (via LANGWATCH_API_KEY, or scenario.config.js)"
|
|
1205
|
+
);
|
|
899
1206
|
} else {
|
|
900
|
-
console.log(
|
|
901
|
-
console.log(`Endpoint: ${config2.endpoint} -> ${this.eventsEndpoint.href}`);
|
|
902
|
-
console.log(`API Key: ${!this.apiKey ? "not configured" : "configured"}`);
|
|
1207
|
+
console.log(`simulation reporting is enabled, endpoint:(${this.eventsEndpoint}) api_key_configured:(${this.apiKey.length > 0 ? "true" : "false"})`);
|
|
903
1208
|
}
|
|
904
|
-
console.log("=== Scenario Simulation Reporting ===");
|
|
905
1209
|
}
|
|
906
1210
|
}
|
|
907
1211
|
/**
|
|
@@ -953,13 +1257,25 @@ var EventReporter = class {
|
|
|
953
1257
|
};
|
|
954
1258
|
|
|
955
1259
|
// src/events/event-bus.ts
|
|
956
|
-
var EventBus = class {
|
|
1260
|
+
var EventBus = class _EventBus {
|
|
1261
|
+
static registry = /* @__PURE__ */ new Set();
|
|
957
1262
|
events$ = new import_rxjs2.Subject();
|
|
958
1263
|
eventReporter;
|
|
959
1264
|
processingPromise = null;
|
|
960
1265
|
logger = new Logger("scenario.events.EventBus");
|
|
1266
|
+
static globalListeners = [];
|
|
961
1267
|
constructor(config2) {
|
|
962
1268
|
this.eventReporter = new EventReporter(config2);
|
|
1269
|
+
_EventBus.registry.add(this);
|
|
1270
|
+
for (const listener of _EventBus.globalListeners) {
|
|
1271
|
+
listener(this);
|
|
1272
|
+
}
|
|
1273
|
+
}
|
|
1274
|
+
static getAllBuses() {
|
|
1275
|
+
return _EventBus.registry;
|
|
1276
|
+
}
|
|
1277
|
+
static addGlobalListener(listener) {
|
|
1278
|
+
_EventBus.globalListeners.push(listener);
|
|
963
1279
|
}
|
|
964
1280
|
/**
|
|
965
1281
|
* Publishes an event into the processing pipeline.
|
|
@@ -1014,7 +1330,7 @@ var EventBus = class {
|
|
|
1014
1330
|
*/
|
|
1015
1331
|
async drain() {
|
|
1016
1332
|
this.logger.debug("Draining event stream");
|
|
1017
|
-
this.events$.
|
|
1333
|
+
this.events$.complete();
|
|
1018
1334
|
if (this.processingPromise) {
|
|
1019
1335
|
await this.processingPromise;
|
|
1020
1336
|
}
|
|
@@ -1027,6 +1343,45 @@ var EventBus = class {
|
|
|
1027
1343
|
this.logger.debug("Subscribing to event stream");
|
|
1028
1344
|
return source$.subscribe(this.events$);
|
|
1029
1345
|
}
|
|
1346
|
+
/**
|
|
1347
|
+
* Expose the events$ observable for external subscription (read-only).
|
|
1348
|
+
*/
|
|
1349
|
+
get eventsObservable() {
|
|
1350
|
+
return this.events$.asObservable();
|
|
1351
|
+
}
|
|
1352
|
+
};
|
|
1353
|
+
|
|
1354
|
+
// src/script/index.ts
|
|
1355
|
+
var script_exports = {};
|
|
1356
|
+
__export(script_exports, {
|
|
1357
|
+
agent: () => agent,
|
|
1358
|
+
fail: () => fail,
|
|
1359
|
+
judge: () => judge,
|
|
1360
|
+
message: () => message,
|
|
1361
|
+
proceed: () => proceed,
|
|
1362
|
+
succeed: () => succeed,
|
|
1363
|
+
user: () => user
|
|
1364
|
+
});
|
|
1365
|
+
var message = (message2) => {
|
|
1366
|
+
return (_state, executor) => executor.message(message2);
|
|
1367
|
+
};
|
|
1368
|
+
var agent = (content) => {
|
|
1369
|
+
return (_state, executor) => executor.agent(content);
|
|
1370
|
+
};
|
|
1371
|
+
var judge = (content) => {
|
|
1372
|
+
return (_state, executor) => executor.judge(content);
|
|
1373
|
+
};
|
|
1374
|
+
var user = (content) => {
|
|
1375
|
+
return (_state, executor) => executor.user(content);
|
|
1376
|
+
};
|
|
1377
|
+
var proceed = (turns, onTurn, onStep) => {
|
|
1378
|
+
return (_state, executor) => executor.proceed(turns, onTurn, onStep);
|
|
1379
|
+
};
|
|
1380
|
+
var succeed = (reasoning) => {
|
|
1381
|
+
return (_state, executor) => executor.succeed(reasoning);
|
|
1382
|
+
};
|
|
1383
|
+
var fail = (reasoning) => {
|
|
1384
|
+
return (_state, executor) => executor.fail(reasoning);
|
|
1030
1385
|
};
|
|
1031
1386
|
|
|
1032
1387
|
// src/runner/run.ts
|
|
@@ -1071,8 +1426,8 @@ async function run(cfg) {
|
|
|
1071
1426
|
console.log(`Scenario failed: ${cfg.name}`);
|
|
1072
1427
|
console.log(`Reasoning: ${result.reasoning}`);
|
|
1073
1428
|
console.log("--------------------------------");
|
|
1074
|
-
console.log(`
|
|
1075
|
-
console.log(`
|
|
1429
|
+
console.log(`Met criteria: ${result.metCriteria.join("\n- ")}`);
|
|
1430
|
+
console.log(`Unmet criteria: ${result.unmetCriteria.join("\n- ")}`);
|
|
1076
1431
|
console.log(result.messages.map(formatMessage).join("\n"));
|
|
1077
1432
|
}
|
|
1078
1433
|
return result;
|
|
@@ -1125,301 +1480,15 @@ function formatPart(part) {
|
|
|
1125
1480
|
}
|
|
1126
1481
|
}
|
|
1127
1482
|
|
|
1128
|
-
// src/
|
|
1129
|
-
var
|
|
1130
|
-
|
|
1131
|
-
|
|
1132
|
-
|
|
1133
|
-
|
|
1134
|
-
|
|
1135
|
-
var userMessageRole = "user";
|
|
1136
|
-
var groupMessagesByToolBoundaries = (messages) => {
|
|
1137
|
-
const segments = [];
|
|
1138
|
-
let currentSegment = [];
|
|
1139
|
-
for (const message2 of messages) {
|
|
1140
|
-
currentSegment.push(message2);
|
|
1141
|
-
if (message2.role === toolMessageRole) {
|
|
1142
|
-
segments.push(currentSegment);
|
|
1143
|
-
currentSegment = [];
|
|
1144
|
-
}
|
|
1145
|
-
}
|
|
1146
|
-
if (currentSegment.length > 0) {
|
|
1147
|
-
segments.push(currentSegment);
|
|
1148
|
-
}
|
|
1149
|
-
return segments;
|
|
1150
|
-
};
|
|
1151
|
-
var segmentHasToolMessages = (segment) => {
|
|
1152
|
-
return segment.some((message2) => {
|
|
1153
|
-
if (message2.role === toolMessageRole) return true;
|
|
1154
|
-
if (message2.role === assistantMessageRole && Array.isArray(message2.content)) {
|
|
1155
|
-
return message2.content.some((part) => part.type === "tool-call");
|
|
1156
|
-
}
|
|
1157
|
-
return false;
|
|
1158
|
-
});
|
|
1159
|
-
};
|
|
1160
|
-
var reverseSegmentRoles = (segment) => {
|
|
1161
|
-
return segment.map((message2) => {
|
|
1162
|
-
const hasStringContent = typeof message2.content === "string";
|
|
1163
|
-
if (!hasStringContent) return message2;
|
|
1164
|
-
const roleMap = {
|
|
1165
|
-
[userMessageRole]: assistantMessageRole,
|
|
1166
|
-
[assistantMessageRole]: userMessageRole
|
|
1167
|
-
};
|
|
1168
|
-
const newRole = roleMap[message2.role];
|
|
1169
|
-
if (!newRole) return message2;
|
|
1170
|
-
return {
|
|
1171
|
-
role: newRole,
|
|
1172
|
-
content: message2.content
|
|
1173
|
-
};
|
|
1174
|
-
});
|
|
1175
|
-
};
|
|
1176
|
-
var messageRoleReversal = (messages) => {
|
|
1177
|
-
const segments = groupMessagesByToolBoundaries(messages);
|
|
1178
|
-
const processedSegments = segments.map(
|
|
1179
|
-
(segment) => segmentHasToolMessages(segment) ? segment : reverseSegmentRoles(segment)
|
|
1180
|
-
);
|
|
1181
|
-
return processedSegments.flat();
|
|
1182
|
-
};
|
|
1183
|
-
var criterionToParamName = (criterion) => {
|
|
1184
|
-
return criterion.replace(/"/g, "").replace(/[^a-zA-Z0-9]/g, "_").replace(/ /g, "_").toLowerCase().substring(0, 70);
|
|
1185
|
-
};
|
|
1186
|
-
|
|
1187
|
-
// src/config/index.ts
|
|
1188
|
-
var logger = new Logger("scenario.config");
|
|
1189
|
-
var configLoaded = false;
|
|
1190
|
-
var config = null;
|
|
1191
|
-
var configLoadPromise = null;
|
|
1192
|
-
async function loadProjectConfig() {
|
|
1193
|
-
if (configLoaded) {
|
|
1194
|
-
return;
|
|
1195
|
-
}
|
|
1196
|
-
if (configLoadPromise) {
|
|
1197
|
-
return configLoadPromise;
|
|
1198
|
-
}
|
|
1199
|
-
configLoadPromise = (async () => {
|
|
1200
|
-
try {
|
|
1201
|
-
config = await loadScenarioProjectConfig();
|
|
1202
|
-
logger.info("loaded scenario project config", { config });
|
|
1203
|
-
} catch (error) {
|
|
1204
|
-
logger.error("error loading scenario project config", { error });
|
|
1205
|
-
} finally {
|
|
1206
|
-
configLoaded = true;
|
|
1207
|
-
}
|
|
1208
|
-
})();
|
|
1209
|
-
return configLoadPromise;
|
|
1210
|
-
}
|
|
1211
|
-
async function getProjectConfig() {
|
|
1212
|
-
await loadProjectConfig();
|
|
1213
|
-
return config;
|
|
1214
|
-
}
|
|
1215
|
-
|
|
1216
|
-
// src/utils/config.ts
|
|
1217
|
-
function mergeConfig(config2, projectConfig) {
|
|
1218
|
-
if (!projectConfig) {
|
|
1219
|
-
return config2;
|
|
1220
|
-
}
|
|
1221
|
-
return {
|
|
1222
|
-
...projectConfig.defaultModel,
|
|
1223
|
-
...config2
|
|
1224
|
-
};
|
|
1225
|
-
}
|
|
1226
|
-
function mergeAndValidateConfig(config2, projectConfig) {
|
|
1227
|
-
var _a;
|
|
1228
|
-
const mergedConfig = mergeConfig(config2, projectConfig);
|
|
1229
|
-
mergedConfig.model = mergedConfig.model ?? ((_a = projectConfig == null ? void 0 : projectConfig.defaultModel) == null ? void 0 : _a.model);
|
|
1230
|
-
if (!mergedConfig.model) {
|
|
1231
|
-
throw new Error("Model is required");
|
|
1232
|
-
}
|
|
1233
|
-
return mergedConfig;
|
|
1234
|
-
}
|
|
1235
|
-
|
|
1236
|
-
// src/agents/judge-agent.ts
|
|
1237
|
-
function buildSystemPrompt(criteria, description) {
|
|
1238
|
-
const criteriaList = (criteria == null ? void 0 : criteria.map((criterion, idx) => `${idx + 1}. ${criterion}`).join("\n")) || "No criteria provided";
|
|
1239
|
-
return `
|
|
1240
|
-
<role>
|
|
1241
|
-
You are an LLM as a judge watching a simulated conversation as it plays out live to determine if the agent under test meets the criteria or not.
|
|
1242
|
-
</role>
|
|
1243
|
-
|
|
1244
|
-
<goal>
|
|
1245
|
-
Your goal is to determine if you already have enough information to make a verdict of the scenario below, or if the conversation should continue for longer.
|
|
1246
|
-
If you do have enough information, use the finish_test tool to determine if all the criteria have been met, if not, use the continue_test tool to let the next step play out.
|
|
1247
|
-
</goal>
|
|
1248
|
-
|
|
1249
|
-
<scenario>
|
|
1250
|
-
${description}
|
|
1251
|
-
</scenario>
|
|
1252
|
-
|
|
1253
|
-
<criteria>
|
|
1254
|
-
${criteriaList}
|
|
1255
|
-
</criteria>
|
|
1256
|
-
|
|
1257
|
-
<rules>
|
|
1258
|
-
- Be strict, do not let the conversation continue if the agent already broke one of the "do not" or "should not" criteria.
|
|
1259
|
-
- DO NOT make any judgment calls that are not explicitly listed in the success or failure criteria, withhold judgement if necessary
|
|
1260
|
-
</rules>
|
|
1261
|
-
`.trim();
|
|
1262
|
-
}
|
|
1263
|
-
function buildContinueTestTool() {
|
|
1264
|
-
return (0, import_ai.tool)({
|
|
1265
|
-
description: "Continue the test with the next step",
|
|
1266
|
-
parameters: import_zod3.z.object({})
|
|
1267
|
-
});
|
|
1268
|
-
}
|
|
1269
|
-
function buildFinishTestTool(criteria) {
|
|
1270
|
-
const criteriaNames = criteria.map(criterionToParamName);
|
|
1271
|
-
return (0, import_ai.tool)({
|
|
1272
|
-
description: "Complete the test with a final verdict",
|
|
1273
|
-
parameters: import_zod3.z.object({
|
|
1274
|
-
criteria: import_zod3.z.object(
|
|
1275
|
-
Object.fromEntries(
|
|
1276
|
-
criteriaNames.map((name, idx) => [
|
|
1277
|
-
name,
|
|
1278
|
-
import_zod3.z.enum(["true", "false", "inconclusive"]).describe(criteria[idx])
|
|
1279
|
-
])
|
|
1280
|
-
)
|
|
1281
|
-
).strict().describe("Strict verdict for each criterion"),
|
|
1282
|
-
reasoning: import_zod3.z.string().describe("Explanation of what the final verdict should be"),
|
|
1283
|
-
verdict: import_zod3.z.enum(["success", "failure", "inconclusive"]).describe("The final verdict of the test")
|
|
1284
|
-
})
|
|
1285
|
-
});
|
|
1286
|
-
}
|
|
1287
|
-
var judgeAgent = (cfg) => {
|
|
1288
|
-
return {
|
|
1289
|
-
role: "Judge" /* JUDGE */,
|
|
1290
|
-
criteria: cfg.criteria,
|
|
1291
|
-
call: async (input) => {
|
|
1292
|
-
var _a;
|
|
1293
|
-
const systemPrompt = cfg.systemPrompt ?? buildSystemPrompt(cfg.criteria, input.scenarioConfig.description);
|
|
1294
|
-
const messages = [
|
|
1295
|
-
{ role: "system", content: systemPrompt },
|
|
1296
|
-
...input.messages
|
|
1297
|
-
];
|
|
1298
|
-
const isLastMessage = input.scenarioState.turn == input.scenarioConfig.maxTurns;
|
|
1299
|
-
const projectConfig = await getProjectConfig();
|
|
1300
|
-
const mergedConfig = mergeAndValidateConfig(cfg, projectConfig);
|
|
1301
|
-
if (!mergedConfig.model) {
|
|
1302
|
-
throw new Error("Model is required for the judge agent");
|
|
1303
|
-
}
|
|
1304
|
-
const tools = {
|
|
1305
|
-
continue_test: buildContinueTestTool(),
|
|
1306
|
-
finish_test: buildFinishTestTool(cfg.criteria)
|
|
1307
|
-
};
|
|
1308
|
-
const enforceJudgement = input.judgmentRequest;
|
|
1309
|
-
const hasCriteria = cfg.criteria.length && cfg.criteria.length > 0;
|
|
1310
|
-
if (enforceJudgement && !hasCriteria) {
|
|
1311
|
-
return {
|
|
1312
|
-
success: false,
|
|
1313
|
-
messages: [],
|
|
1314
|
-
reasoning: "JudgeAgent: No criteria was provided to be judged against",
|
|
1315
|
-
passedCriteria: [],
|
|
1316
|
-
failedCriteria: []
|
|
1317
|
-
};
|
|
1318
|
-
}
|
|
1319
|
-
const toolChoice = (isLastMessage || enforceJudgement) && hasCriteria ? { type: "tool", toolName: "finish_test" } : "required";
|
|
1320
|
-
const completion = await (0, import_ai.generateText)({
|
|
1321
|
-
model: mergedConfig.model,
|
|
1322
|
-
messages,
|
|
1323
|
-
temperature: mergedConfig.temperature ?? 0,
|
|
1324
|
-
maxTokens: mergedConfig.maxTokens,
|
|
1325
|
-
tools,
|
|
1326
|
-
toolChoice
|
|
1327
|
-
});
|
|
1328
|
-
let args;
|
|
1329
|
-
if ((_a = completion.toolCalls) == null ? void 0 : _a.length) {
|
|
1330
|
-
const toolCall = completion.toolCalls[0];
|
|
1331
|
-
switch (toolCall.toolName) {
|
|
1332
|
-
case "finish_test": {
|
|
1333
|
-
args = toolCall.args;
|
|
1334
|
-
const verdict = args.verdict || "inconclusive";
|
|
1335
|
-
const reasoning = args.reasoning || "No reasoning provided";
|
|
1336
|
-
const criteria = args.criteria || {};
|
|
1337
|
-
const criteriaValues = Object.values(criteria);
|
|
1338
|
-
const passedCriteria = cfg.criteria.filter((_, i) => criteriaValues[i] === "true");
|
|
1339
|
-
const failedCriteria = cfg.criteria.filter((_, i) => criteriaValues[i] !== "true");
|
|
1340
|
-
return {
|
|
1341
|
-
success: verdict === "success",
|
|
1342
|
-
messages: input.messages,
|
|
1343
|
-
reasoning,
|
|
1344
|
-
passedCriteria,
|
|
1345
|
-
failedCriteria
|
|
1346
|
-
};
|
|
1347
|
-
}
|
|
1348
|
-
case "continue_test":
|
|
1349
|
-
return [];
|
|
1350
|
-
default:
|
|
1351
|
-
return {
|
|
1352
|
-
success: false,
|
|
1353
|
-
messages: input.messages,
|
|
1354
|
-
reasoning: `JudgeAgent: Unknown tool call: ${toolCall.toolName}`,
|
|
1355
|
-
passedCriteria: [],
|
|
1356
|
-
failedCriteria: cfg.criteria
|
|
1357
|
-
};
|
|
1358
|
-
}
|
|
1359
|
-
}
|
|
1360
|
-
return {
|
|
1361
|
-
success: false,
|
|
1362
|
-
messages: input.messages,
|
|
1363
|
-
reasoning: `JudgeAgent: No tool call found in LLM output`,
|
|
1364
|
-
passedCriteria: [],
|
|
1365
|
-
failedCriteria: cfg.criteria
|
|
1366
|
-
};
|
|
1367
|
-
}
|
|
1368
|
-
};
|
|
1369
|
-
};
|
|
1370
|
-
|
|
1371
|
-
// src/agents/user-simulator-agent.ts
|
|
1372
|
-
var import_ai2 = require("ai");
|
|
1373
|
-
function buildSystemPrompt2(description) {
|
|
1374
|
-
return `
|
|
1375
|
-
<role>
|
|
1376
|
-
You are pretending to be a user, you are testing an AI Agent (shown as the user role) based on a scenario.
|
|
1377
|
-
Approach this naturally, as a human user would, with very short inputs, few words, all lowercase, imperative, not periods, like when they google or talk to chatgpt.
|
|
1378
|
-
</role>
|
|
1379
|
-
|
|
1380
|
-
<goal>
|
|
1381
|
-
Your goal (assistant) is to interact with the Agent Under Test (user) as if you were a human user to see if it can complete the scenario successfully.
|
|
1382
|
-
</goal>
|
|
1383
|
-
|
|
1384
|
-
<scenario>
|
|
1385
|
-
${description}
|
|
1386
|
-
</scenario>
|
|
1387
|
-
|
|
1388
|
-
<rules>
|
|
1389
|
-
- DO NOT carry over any requests yourself, YOU ARE NOT the assistant today, you are the user
|
|
1390
|
-
</rules>
|
|
1391
|
-
`.trim();
|
|
1392
|
-
}
|
|
1393
|
-
var userSimulatorAgent = (config2) => {
|
|
1394
|
-
return {
|
|
1395
|
-
role: "User" /* USER */,
|
|
1396
|
-
call: async (input) => {
|
|
1397
|
-
const systemPrompt = buildSystemPrompt2(input.scenarioConfig.description);
|
|
1398
|
-
const messages = [
|
|
1399
|
-
{ role: "system", content: systemPrompt },
|
|
1400
|
-
{ role: "assistant", content: "Hello, how can I help you today" },
|
|
1401
|
-
...input.messages
|
|
1402
|
-
];
|
|
1403
|
-
const projectConfig = await getProjectConfig();
|
|
1404
|
-
const mergedConfig = mergeAndValidateConfig(config2 ?? {}, projectConfig);
|
|
1405
|
-
if (!mergedConfig.model) {
|
|
1406
|
-
throw new Error("Model is required for the user simulator agent");
|
|
1407
|
-
}
|
|
1408
|
-
const reversedMessages = messageRoleReversal(messages);
|
|
1409
|
-
const completion = await (0, import_ai2.generateText)({
|
|
1410
|
-
model: mergedConfig.model,
|
|
1411
|
-
messages: reversedMessages,
|
|
1412
|
-
temperature: mergedConfig.temperature ?? 0,
|
|
1413
|
-
maxTokens: mergedConfig.maxTokens
|
|
1414
|
-
});
|
|
1415
|
-
const messageContent = completion.text;
|
|
1416
|
-
if (!messageContent) {
|
|
1417
|
-
throw new Error("No response content from LLM");
|
|
1418
|
-
}
|
|
1419
|
-
return { role: "user", content: messageContent };
|
|
1420
|
-
}
|
|
1421
|
-
};
|
|
1483
|
+
// src/index.ts
|
|
1484
|
+
var scenario = {
|
|
1485
|
+
...agents_exports,
|
|
1486
|
+
...domain_exports,
|
|
1487
|
+
...execution_exports,
|
|
1488
|
+
...runner_exports,
|
|
1489
|
+
...script_exports
|
|
1422
1490
|
};
|
|
1491
|
+
var index_default = scenario;
|
|
1423
1492
|
// Annotate the CommonJS export names for ESM import in node:
|
|
1424
1493
|
0 && (module.exports = {
|
|
1425
1494
|
AgentAdapter,
|
|
@@ -1437,6 +1506,7 @@ var userSimulatorAgent = (config2) => {
|
|
|
1437
1506
|
message,
|
|
1438
1507
|
proceed,
|
|
1439
1508
|
run,
|
|
1509
|
+
scenario,
|
|
1440
1510
|
scenarioProjectConfigSchema,
|
|
1441
1511
|
succeed,
|
|
1442
1512
|
user,
|