@elizaos/cli 1.4.4 → 1.4.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/BrowserWebSocketTransport-5YQPVDV7.js +7 -0
- package/dist/EnhancedEvaluationEngine-APOQ6INN.js +473 -0
- package/dist/EvaluationEngine-Y7ZQJBRC.js +9 -0
- package/dist/LocalEnvironmentProvider-JWFGG4IN.js +15 -0
- package/dist/NodeWebSocketTransport-PUO724EY.js +8 -0
- package/dist/ScreenRecorder-YK246DNJ.js +10 -0
- package/dist/agent-start-6QJQAMKA.js +13 -0
- package/dist/bidi-2SVNH6F7.js +15309 -0
- package/dist/{bun-exec-ULMPAIQC.js → bun-exec-NH4UCUY4.js} +1 -1
- package/dist/chunk-2ESYSVXG.js +48 -0
- package/dist/chunk-3AEYIKBZ.js +432 -0
- package/dist/chunk-5IWKEMEF.js +239 -0
- package/dist/chunk-5WZO2HMM.js +2644 -0
- package/dist/chunk-ABGBVB74.js +3501 -0
- package/dist/{chunk-NSNXXD3I.js → chunk-BCO32GR6.js} +2 -2
- package/dist/chunk-CGXTFHQP.js +25 -0
- package/dist/chunk-EXUFDTUD.js +3948 -0
- package/dist/chunk-FGGNHEXZ.js +211860 -0
- package/dist/chunk-FWYHSCLF.js +243 -0
- package/dist/chunk-I57T3WPO.js +165 -0
- package/dist/chunk-LBZLMFFF.js +221 -0
- package/dist/chunk-LG7YDBMV.js +401 -0
- package/dist/chunk-NHKLUXNE.js +166 -0
- package/dist/chunk-PUZHCSGF.js +828 -0
- package/dist/chunk-PWDR7CPA.js +7828 -0
- package/dist/{chunk-N5G5XSGP.js → chunk-Q6M2K53X.js} +3 -3
- package/dist/chunk-SVHCNBHM.js +289 -0
- package/dist/{chunk-HOC6B3QV.js → chunk-VFFOOPYS.js} +4 -238
- package/dist/chunk-WX37MM4G.js +292 -0
- package/dist/chunk-XFJIHUT3.js +6 -0
- package/dist/chunk-XPPESCCM.js +787 -0
- package/dist/chunk-YBDC5OZO.js +40 -0
- package/dist/commands/agent/actions/index.js +2 -2
- package/dist/commands/agent/index.js +2 -2
- package/dist/commands/create/actions/index.js +4 -3
- package/dist/commands/create/index.js +5 -4
- package/dist/commands/shared/index.js +1 -1
- package/dist/index.js +66796 -4986
- package/dist/js-yaml-KADNMPWR.js +35 -0
- package/dist/matrix-orchestrator-3WLRK7GG.js +1070 -0
- package/dist/matrix-runner-KDPETCKQ.js +160 -0
- package/dist/matrix-schema-PCO2KGJY.js +102 -0
- package/dist/parameter-override-ALOPPXCE.js +487 -0
- package/dist/{plugin-creator-TCUFII32.js → plugin-creator-J7GNPMPG.js} +1 -1
- package/dist/process-manager-IU2A3BTQ.js +9 -0
- package/dist/{registry-ELONUC44.js → registry-65KMEA7N.js} +2 -2
- package/dist/resource-monitor-EHZSH2P6.js +15 -0
- package/dist/run-isolation-PGLZ37Y7.js +29 -0
- package/dist/runtime-factory-Q4U5YBNV.js +22 -0
- package/dist/schema-C25LVPEK.js +17 -0
- package/dist/src/commands/report/src/assets/report_template.html +1704 -0
- package/dist/src-EJG4ILDC.js +5 -0
- package/dist/templates/plugin-quick-starter/package.json +2 -2
- package/dist/templates/plugin-starter/package.json +2 -2
- package/dist/templates/project-starter/package.json +4 -4
- package/dist/templates/project-tee-starter/package.json +4 -4
- package/dist/typescript-ZF3IK2DJ.js +5 -0
- package/dist/{utils-X6UXPLKD.js → utils-QFD2PW4X.js} +2 -2
- package/package.json +14 -8
- package/templates/plugin-quick-starter/package.json +2 -2
- package/templates/plugin-starter/package.json +2 -2
- package/templates/project-starter/package.json +4 -4
- package/templates/project-tee-starter/package.json +4 -4
- package/dist/chunk-3RG5ZIWI.js +0 -10
|
@@ -0,0 +1,401 @@
|
|
|
1
|
+
import {
|
|
2
|
+
askAgentViaApi
|
|
3
|
+
} from "./chunk-PUZHCSGF.js";
|
|
4
|
+
import {
|
|
5
|
+
bunExec
|
|
6
|
+
} from "./chunk-I4L4T7QX.js";
|
|
7
|
+
|
|
8
|
+
// src/commands/scenario/src/TrajectoryReconstructor.ts
|
|
9
|
+
/**
 * Converts a memory's `createdAt` value into an ISO-8601 string.
 *
 * @param {*} createdAt - Raw `createdAt` value from a memory (may be missing).
 * @param {number} fallbackMs - Epoch milliseconds used when `createdAt` is
 *   falsy or cannot be parsed as a date.
 * @returns {string} ISO-8601 timestamp.
 */
function toIsoTimestamp(createdAt, fallbackMs) {
  const parsed = new Date(createdAt || fallbackMs);
  // toISOString() throws a RangeError on an invalid Date, so fall back instead.
  return Number.isNaN(parsed.getTime()) ? new Date(fallbackMs).toISOString() : parsed.toISOString();
}

/**
 * Extracts the observation text from an action result.
 *
 * Preference order: `text`, `stdout`, `output`, the result itself when it is a
 * string, and finally the JSON form of the whole result object.
 *
 * @param {*} actionResult - The `actionResult` value stored on the memory content.
 * @returns {string} Always a string, so callers can safely use string methods.
 */
function extractObservationText(actionResult) {
  if (typeof actionResult === "string") {
    return actionResult;
  }
  if (!actionResult || typeof actionResult !== "object") {
    return "";
  }
  const preferred = actionResult.text || actionResult.stdout || actionResult.output;
  if (!preferred) {
    return JSON.stringify(actionResult) ?? "";
  }
  if (typeof preferred === "string") {
    return preferred;
  }
  // A non-string field (number, object, array) used to flow through unchanged
  // and crash on `.substring`; coerce it to text instead.
  return typeof preferred === "object" ? JSON.stringify(preferred) ?? "" : String(preferred);
}

/**
 * Dumps one memory to the console for debugging.
 *
 * @param {object} mem - Memory record to describe.
 * @param {number} index - Zero-based position of the memory in the list.
 * @param {number} total - Total number of memories being dumped.
 */
function logMemoryDetails(mem, index, total) {
  console.log(`\n--- Memory ${index + 1}/${total} ---`);
  console.log(`ID: ${mem.id}`);
  console.log(`CreatedAt: ${mem.createdAt} (${toIsoTimestamp(mem.createdAt, 0)})`);
  console.log(`Type: ${mem.type || "undefined"}`);
  console.log(`Content Type: ${typeof mem.content}`);
  const { content } = mem;
  if (!content || typeof content !== "object") {
    console.log(`Raw Content:`, content);
    return;
  }
  console.log(`Content.type: ${content.type}`);
  console.log(`Content keys:`, Object.keys(content));
  if (content.type === "action_result") {
    console.log(`\u{1F3AF} FOUND ACTION_RESULT - FULL CONTENT:`, JSON.stringify(content, null, 2));
  } else if (content.type === "user" || content.type === "agent") {
    console.log(
      `\u{1F4AC} MESSAGE CONTENT:`,
      JSON.stringify({ type: content.type, text: content.text, content: content.content }, null, 2)
    );
  } else {
    console.log(`\u{1F4CB} OTHER CONTENT:`, JSON.stringify(content, null, 2));
  }
}

/**
 * Rebuilds a thought / action / observation trajectory from the
 * `action_result` memories that the runtime returns for its agent.
 */
var TrajectoryReconstructor = class {
  runtime;
  /**
   * @param {object} runtime - Object exposing `agentId` and an async
   *   `getMemories(query)` method.
   */
  constructor(runtime) {
    this.runtime = runtime;
  }
  /**
   * Reconstruct trajectory from memories (using same approach as TrajectoryContainsActionEvaluator)
   *
   * Fetches up to 100 "messages" memories for the agent, keeps those in the
   * selected room, and emits an optional thought step plus an action step and
   * an observation step for every `action_result` memory.
   *
   * @param {string} roomId - Room to reconstruct. When no fetched memory
   *   belongs to this room, the first roomId seen in the fetched memories is
   *   used instead.
   * @param {number} [timeWindowMs=30000] - Only used to compute the reported
   *   `startTime`; memories are NOT filtered by time.
   * @returns {Promise<{steps: object[], runId: (string|undefined), startTime: number, endTime: number, totalSteps: number}>}
   */
  async reconstructTrajectory(roomId, timeWindowMs = 3e4) {
    const endTime = Date.now();
    const startTime = endTime - timeWindowMs;
    const fetched = await this.runtime.getMemories({
      tableName: "messages",
      agentId: this.runtime.agentId,
      count: 100,
      unique: false
    });
    // Drop nullish entries up front so later property reads cannot throw.
    const allMemories = (fetched ?? []).filter(Boolean);
    console.log(`\n\u{1F50D} [TrajectoryReconstructor] ===== MEMORY ANALYSIS START =====`);
    console.log(
      `\u{1F50D} [TrajectoryReconstructor] Found ${allMemories.length} total memories for agent`
    );
    console.log(`\u{1F50D} [TrajectoryReconstructor] All roomIds found in memories:`);
    // Single pass: a Map keeps first-seen order and the per-room counts.
    const roomCounts = /* @__PURE__ */ new Map();
    for (const mem of allMemories) {
      if (mem.roomId) {
        roomCounts.set(mem.roomId, (roomCounts.get(mem.roomId) ?? 0) + 1);
      }
    }
    const uniqueRoomIds = [...roomCounts.keys()];
    uniqueRoomIds.forEach((rId, i) => {
      console.log(`  ${i + 1}. ${rId} (${roomCounts.get(rId)} memories)`);
    });
    console.log(`\u{1F50D} [TrajectoryReconstructor] Original roomId: ${roomId}`);
    // Prefer the requested room. Only fall back to the first room found when
    // the requested one has no memories; otherwise an unrelated room that
    // happens to come first would shadow the caller's room.
    const actualRoomId = roomCounts.has(roomId) || uniqueRoomIds.length === 0 ? roomId : uniqueRoomIds[0];
    if (actualRoomId !== roomId) {
      console.log(
        `\u{1F527} [TrajectoryReconstructor] ROOMID MISMATCH DETECTED - Using actual roomId: ${actualRoomId}`
      );
    }
    const memories = allMemories.filter((mem) => mem.roomId === actualRoomId);
    console.log(
      `\u{1F50D} [TrajectoryReconstructor] Found ${memories.length} memories using actual roomId`
    );
    memories.forEach((mem, index) => logMemoryDetails(mem, index, memories.length));
    console.log(`\u{1F50D} [TrajectoryReconstructor] ===== MEMORY ANALYSIS END =====\n`);
    const actionMemories = memories.filter(
      (mem) => typeof mem.content === "object" && mem.content?.type === "action_result"
    );
    const steps = [];
    const runIds = /* @__PURE__ */ new Set();
    console.log(
      `\u{1F3AF} [TrajectoryReconstructor] Processing ${actionMemories.length} action memories...`
    );
    for (const memory of actionMemories) {
      const content = memory.content;
      console.log(`\n\u{1F504} Processing action memory ${memory.id}...`);
      console.log(`   actionName: ${content?.actionName}`);
      console.log(`   actionParams:`, content?.actionParams);
      console.log(`   actionResult:`, content?.actionResult);
      console.log(`   thought:`, content?.thought);
      console.log(`   planThought:`, content?.planThought);
      console.log(`   actionStatus:`, content?.actionStatus);
      // NOTE(review): assumes action_result content may carry a string `runId`;
      // the set was previously never filled, so `runId` was always undefined. Confirm.
      if (typeof content?.runId === "string" && content.runId) {
        runIds.add(content.runId);
      }
      const actionName = content?.actionName || "unknown";
      const actionParams = content?.actionParams || {};
      const actionResult = content?.actionResult || {};
      const thought = content?.thought || content?.planThought || "";
      const observationContent = extractObservationText(actionResult);
      console.log(
        `   \u{1F4CB} Extracted observation (${observationContent.length} chars):`,
        observationContent.substring(0, 200)
      );
      // All steps from one memory share its timestamp; the stable sort below
      // therefore keeps them in thought -> action -> observation order.
      const timestamp = toIsoTimestamp(memory.createdAt, Date.now());
      if (typeof thought === "string" && thought.trim()) {
        const thoughtStep = {
          type: "thought",
          timestamp,
          content: thought
        };
        steps.push(thoughtStep);
        console.log(`   \u{1F4AD} Created thought step:`, JSON.stringify(thoughtStep, null, 2));
      }
      const actionStep = {
        type: "action",
        timestamp,
        content: {
          name: actionName,
          parameters: actionParams
        }
      };
      steps.push(actionStep);
      console.log(`   \u26A1 Created action step:`, JSON.stringify(actionStep, null, 2));
      const observationStep = {
        type: "observation",
        timestamp,
        content: observationContent
      };
      steps.push(observationStep);
      console.log(`   \u{1F441}\uFE0F Created observation step:`, JSON.stringify(observationStep, null, 2));
    }
    steps.sort((a, b) => a.timestamp.localeCompare(b.timestamp));
    return {
      steps,
      runId: runIds.size === 1 ? Array.from(runIds)[0] : void 0,
      startTime,
      endTime,
      totalSteps: steps.length
    };
  }
  /**
   * Get latest trajectory for a room (convenience method) with retry logic
   *
   * Calls `reconstructTrajectory` until it yields at least one step or the
   * attempts are exhausted, sleeping between attempts.
   *
   * @param {string} roomId - Room to reconstruct.
   * @param {{maxRetries?: number, retryDelayMs?: number}} [options] - Retry
   *   tuning; defaults (3 attempts, 2000 ms apart) match the previous
   *   hard-coded values.
   * @returns {Promise<object[]>} The trajectory steps, or `[]` when every
   *   attempt found none.
   */
  async getLatestTrajectory(roomId, { maxRetries = 3, retryDelayMs = 2e3 } = {}) {
    console.log(`\u{1F50D} [TrajectoryReconstructor] Starting reconstruction for room: ${roomId}`);
    for (let attempt = 1; attempt <= maxRetries; attempt++) {
      console.log(`\n\u{1F504} [TrajectoryReconstructor] ===== ATTEMPT ${attempt}/${maxRetries} =====`);
      const trajectory = await this.reconstructTrajectory(roomId, 3e4);
      console.log(
        `\u{1F4CA} [TrajectoryReconstructor] Found ${trajectory.steps.length} trajectory steps on attempt ${attempt}`
      );
      console.log(
        `\u{1F4CA} [TrajectoryReconstructor] Time window: ${trajectory.startTime} - ${trajectory.endTime}`
      );
      if (trajectory.steps.length > 0) {
        console.log(
          `\u2705 [TrajectoryReconstructor] SUCCESS on attempt ${attempt}: Found ${trajectory.steps.length} trajectory steps`
        );
        console.log(
          `\u{1F4CA} [TrajectoryReconstructor] Actions found:`,
          trajectory.steps.filter((s) => s.type === "action").map((s) => s.content.name)
        );
        console.log(
          `\u{1F4CA} [TrajectoryReconstructor] First step sample:`,
          JSON.stringify(trajectory.steps[0], null, 2)
        );
        console.log(`\u{1F4CA} [TrajectoryReconstructor] ===== SUCCESS END =====\n`);
        return trajectory.steps;
      }
      console.log(
        `\u26A0\uFE0F [TrajectoryReconstructor] Attempt ${attempt} found 0 steps. ${attempt < maxRetries ? "Retrying..." : "Final attempt failed."}`
      );
      if (attempt < maxRetries) {
        console.log(`\u23F3 [TrajectoryReconstructor] Waiting ${retryDelayMs}ms before retry...`);
        await new Promise((resolve) => setTimeout(resolve, retryDelayMs));
      }
    }
    console.log(
      `\u274C [TrajectoryReconstructor] All ${maxRetries} attempts failed - returning empty trajectory`
    );
    console.log(`\u{1F4CA} [TrajectoryReconstructor] ===== FINAL FAILURE =====\n`);
    return [];
  }
};
|
|
209
|
+
|
|
210
|
+
// src/commands/scenario/src/LocalEnvironmentProvider.ts
|
|
211
|
+
import fs from "fs/promises";
|
|
212
|
+
import path from "path";
|
|
213
|
+
import os from "os";
|
|
214
|
+
/**
 * Resolves a `virtual_fs` key to an absolute path inside `rootDir`.
 *
 * @param {string} rootDir - Directory every virtual file must live in.
 * @param {string} filePath - Path taken from the scenario's `virtual_fs` map.
 * @returns {string} The joined path.
 * @throws {Error} When the joined path is `rootDir` itself or lies outside it
 *   (for example `../evil.txt`).
 */
function resolveVirtualFsPath(rootDir, filePath) {
  const fullPath = path.join(rootDir, filePath);
  const relative = path.relative(rootDir, fullPath);
  const escapesRoot = relative === "" || relative === ".." || relative.startsWith(`..${path.sep}`) || path.isAbsolute(relative);
  if (escapesRoot) {
    throw new Error(
      `Invalid virtual_fs path "${filePath}": it must resolve to a file inside the scenario directory`
    );
  }
  return fullPath;
}

/**
 * Maps a code step's `lang` to the executable and arguments used to run it.
 *
 * @param {{lang?: string, code: string}} step - Scenario run step with code.
 * @returns {{execCommand: (string|undefined), execArgs: string[]}} Unknown
 *   languages are passed through as the command name with a `-c` argument.
 */
function resolveStepCommand(step) {
  switch (step.lang) {
    case "bash":
    case "sh":
      return { execCommand: "sh", execArgs: ["-c", step.code] };
    case "node":
    case "javascript":
      return { execCommand: "node", execArgs: ["-e", step.code] };
    case "python":
    case "python3":
      return { execCommand: "python3", execArgs: ["-c", step.code] };
    default:
      return { execCommand: step.lang, execArgs: ["-c", step.code] };
  }
}

/**
 * Runs scenario steps on the local machine inside a throwaway temp directory.
 *
 * Natural-language steps are sent to an agent through `askAgentViaApi`;
 * code steps are executed with `bunExec` using the temp directory as cwd.
 */
var LocalEnvironmentProvider = class {
  tempDir = null;
  server = null;
  agentId = null;
  runtime = null;
  serverPort = null;
  trajectoryReconstructor = null;
  /**
   * @param {object} [server] - Pre-created server, required for `input` steps.
   * @param {string} [agentId] - Agent to address, required for `input` steps.
   * @param {object} [runtime] - When given, a TrajectoryReconstructor is
   *   created from it so `input` steps can report a trajectory.
   * @param {number} [serverPort] - Port forwarded to `askAgentViaApi`.
   */
  constructor(server, agentId, runtime, serverPort) {
    this.server = server ?? null;
    this.agentId = agentId ?? null;
    this.runtime = runtime ?? null;
    this.serverPort = serverPort ?? null;
    this.trajectoryReconstructor = runtime ? new TrajectoryReconstructor(runtime) : null;
    console.log(`\u{1F527} [DEBUG] LocalEnvironmentProvider CONSTRUCTOR:`);
    console.log(`\u{1F527} [DEBUG]   - Server: ${server ? "present" : "null"}`);
    console.log(`\u{1F527} [DEBUG]   - Agent ID: ${agentId}`);
    console.log(`\u{1F527} [DEBUG]   - Runtime: ${runtime ? "present" : "null"}`);
    console.log(`\u{1F527} [DEBUG]   - Server Port: ${serverPort}`);
  }
  /**
   * Creates the temp directory and materialises `scenario.setup.virtual_fs`.
   *
   * @param {object} scenario - Scenario whose optional `setup.virtual_fs`
   *   maps relative file paths to file contents.
   * @returns {Promise<void>}
   * @throws {Error} When a `virtual_fs` path would land outside the temp
   *   directory. The temp directory stays recorded so `teardown()` removes it.
   */
  async setup(scenario) {
    const tempDirPrefix = path.join(os.tmpdir(), "eliza-scenario-run-");
    this.tempDir = await fs.mkdtemp(tempDirPrefix);
    const virtualFs = scenario.setup?.virtual_fs;
    if (!virtualFs) {
      return;
    }
    for (const [filePath, content] of Object.entries(virtualFs)) {
      // Scenario files are external input: never write outside the sandbox.
      const fullPath = resolveVirtualFsPath(this.tempDir, filePath);
      await fs.mkdir(path.dirname(fullPath), { recursive: true });
      await fs.writeFile(fullPath, content);
    }
  }
  /**
   * Reads every regular file under the temp directory.
   *
   * @returns {Promise<Record<string, string>>} Map of path (relative to the
   *   temp directory) to UTF-8 content. Files that cannot be read map to
   *   "[binary or unreadable]". Returns `{}` before `setup()` or when the
   *   directory walk fails.
   */
  async captureFileSystem() {
    if (!this.tempDir) {
      return {};
    }
    const files = {};
    const readDirRecursive = async (dirPath, basePath = "") => {
      const entries = await fs.readdir(dirPath, { withFileTypes: true });
      for (const entry of entries) {
        const fullPath = path.join(dirPath, entry.name);
        const relativePath = path.join(basePath, entry.name);
        if (entry.isDirectory()) {
          await readDirRecursive(fullPath, relativePath);
        } else if (entry.isFile()) {
          try {
            files[relativePath] = await fs.readFile(fullPath, "utf-8");
          } catch {
            // Deliberate best effort: one unreadable file must not abort the capture.
            files[relativePath] = "[binary or unreadable]";
          }
        }
      }
    };
    try {
      await readDirRecursive(this.tempDir);
      return files;
    } catch (error) {
      console.warn("Failed to capture file system state:", error);
      return {};
    }
  }
  /**
   * Executes every step of `scenario.run` in order.
   *
   * @param {object} scenario - Scenario whose `run` array holds steps with
   *   either `input` (natural language for the agent) or `code` (+ `lang`).
   * @returns {Promise<object[]>} One result per step with `exitCode`,
   *   `stdout`, `stderr`, `files`, `startedAtMs`, `endedAtMs`, `durationMs`,
   *   and, for input steps, `trajectory`.
   * @throws {Error} When `setup()` was not called, when an input step is run
   *   without a server and agent id, or when a step has neither `input` nor
   *   `code`. Failures of the executed code are recorded in the result
   *   instead of being thrown.
   */
  async run(scenario) {
    if (!this.tempDir) {
      throw new Error("Setup must be called before run.");
    }
    const results = [];
    for (const step of scenario.run) {
      const startedAtMs = Date.now();
      if (step.input) {
        if (!this.server || !this.agentId) {
          throw new Error(
            "LocalEnvironmentProvider requires a pre-created server and agent for NL input"
          );
        }
        const { response, roomId } = await askAgentViaApi(
          this.server,
          this.agentId,
          step.input,
          3e4,
          // timeout
          this.serverPort
          // Pass the actual server port
        );
        // NOTE(review): fixed 3 s pause before reading memories, presumably so
        // the runtime can persist action results first. Confirm.
        await new Promise((resolve) => setTimeout(resolve, 3e3));
        const trajectory = this.trajectoryReconstructor && roomId ? await this.trajectoryReconstructor.getLatestTrajectory(roomId) : [];
        console.log(`\u{1F50D} [Trajectory Debug] Room ID: ${roomId}, Steps found: ${trajectory.length}`);
        if (trajectory.length > 0) {
          console.log(`\u{1F4CA} [Trajectory Debug] First step:`, JSON.stringify(trajectory[0], null, 2));
        }
        const endedAtMs = Date.now();
        const durationMs = endedAtMs - startedAtMs;
        results.push({
          exitCode: 0,
          stdout: response,
          stderr: "",
          files: await this.captureFileSystem(),
          startedAtMs,
          endedAtMs,
          durationMs,
          trajectory
          // Add trajectory to execution result
        });
      } else if (step.code) {
        const { execCommand, execArgs } = resolveStepCommand(step);
        try {
          const result = await bunExec(execCommand, execArgs, { cwd: this.tempDir });
          const { stdout, stderr } = result;
          const files = await this.captureFileSystem();
          const endedAtMs = Date.now();
          results.push({
            exitCode: result.exitCode || 0,
            stdout,
            stderr,
            files,
            startedAtMs,
            endedAtMs,
            durationMs: endedAtMs - startedAtMs
          });
        } catch (error) {
          const files = await this.captureFileSystem();
          const endedAtMs = Date.now();
          // Optional chaining: a thrown null/undefined must not crash the handler.
          const exitCode = error?.exitCode !== void 0 ? error.exitCode : 1;
          const stdout = error?.stdout || "";
          // Fall back to the error message when the process produced no stderr.
          const stderr = error?.stderr || error?.message || "";
          results.push({
            exitCode,
            stdout,
            stderr,
            files,
            startedAtMs,
            endedAtMs,
            durationMs: endedAtMs - startedAtMs
          });
        }
      } else {
        throw new Error("Step must have either input or code");
      }
    }
    return results;
  }
  /**
   * Deletes the temp directory, if one exists, and forgets it.
   *
   * @returns {Promise<void>}
   */
  async teardown() {
    if (this.tempDir) {
      await fs.rm(this.tempDir, { recursive: true, force: true });
      this.tempDir = null;
    }
  }
};
|
|
397
|
+
|
|
398
|
+
export {
|
|
399
|
+
TrajectoryReconstructor,
|
|
400
|
+
LocalEnvironmentProvider
|
|
401
|
+
};
|
|
@@ -0,0 +1,166 @@
|
|
|
1
|
+
// src/commands/scenario/src/schema.ts
|
|
2
|
+
import { z } from "zod";
|
|
3
|
+
// ---------------------------------------------------------------------------
// Small factories shared by the schema declarations below.
// ---------------------------------------------------------------------------

/** Builds a fresh record schema whose values may be anything. */
function anyRecord() {
  return z.record(z.any());
}

/** True when `Date.parse` can interpret the given string. */
function isParsableTimestamp(value) {
  return !Number.isNaN(Date.parse(value));
}

// ---------------------------------------------------------------------------
// Evaluation results
// ---------------------------------------------------------------------------

// Structured outcome reported for a single evaluator.
var EnhancedEvaluationResultSchema = z.object({
  evaluator_type: z.string(),
  success: z.boolean(),
  summary: z.string(),
  details: anyRecord()
});

// One entry of the judge's capability checklist.
var CapabilityCheckSchema = z.object({
  capability: z.string(),
  achieved: z.boolean(),
  reasoning: z.string()
});

// Shape of the LLM judge's structured answer.
var LLMJudgeResultSchema = z.object({
  qualitative_summary: z.string(),
  capability_checklist: z.array(CapabilityCheckSchema)
});

// ---------------------------------------------------------------------------
// Evaluation definitions (discriminated on `type`)
// ---------------------------------------------------------------------------

// Fields common to every evaluation definition.
var BaseEvaluationSchema = z.object({
  type: z.string()
});

/**
 * Extends the base evaluation with a literal `type` tag followed by the
 * evaluator-specific fields.
 */
function evaluationOfType(typeName, extraShape) {
  return BaseEvaluationSchema.extend({
    type: z.literal(typeName),
    ...extraShape
  });
}

var EvaluationSchema = z.discriminatedUnion("type", [
  evaluationOfType("string_contains", {
    value: z.string(),
    case_sensitive: z.boolean().optional()
  }),
  evaluationOfType("regex_match", {
    pattern: z.string()
  }),
  evaluationOfType("file_exists", {
    path: z.string()
  }),
  evaluationOfType("trajectory_contains_action", {
    action: z.string()
  }),
  evaluationOfType("llm_judge", {
    prompt: z.string(),
    expected: z.string(),
    model_type: z.string().optional(),
    temperature: z.number().min(0).max(2).optional(),
    // JSON schema object for response validation
    json_schema: anyRecord().optional(),
    // Custom capabilities for evaluation
    capabilities: z.array(z.string()).min(1, "Capabilities array must not be empty").optional()
  }),
  evaluationOfType("execution_time", {
    max_duration_ms: z.number(),
    min_duration_ms: z.number().optional(),
    target_duration_ms: z.number().optional()
  })
]);

// ---------------------------------------------------------------------------
// Mocks and plugins
// ---------------------------------------------------------------------------

// Matching strategies accepted by a mock's `when` clause.
var MockWhenSchema = z.object({
  // Exact argument matching
  args: z.array(z.any()).optional(),
  // Input parameter matching (extracted from args)
  input: anyRecord().optional(),
  // Request context matching
  context: anyRecord().optional(),
  // Custom JavaScript matcher function
  matcher: z.string().optional(),
  // Partial argument matching
  partialArgs: z.array(z.any()).optional()
});

// Error a mock can simulate instead of responding.
var MockErrorSchema = z.object({
  code: z.string(),
  message: z.string(),
  status: z.number().optional()
});

// Response metadata: simulated network delay and random-failure probability.
var MockMetadataSchema = z.object({
  delay: z.number().optional(),
  probability: z.number().min(0).max(1).optional()
});

var MockSchema = z.object({
  service: z.string().optional(),
  method: z.string(),
  when: MockWhenSchema.optional(),
  // Static response
  response: z.any(),
  // Dynamic response generation
  responseFn: z.string().optional(),
  error: MockErrorSchema.optional(),
  metadata: MockMetadataSchema.optional()
});

// A plugin is referenced either by bare name or by a full configuration object.
var PluginReferenceSchema = z.union([
  z.string(),
  z.object({
    name: z.string(),
    version: z.string().optional(),
    config: anyRecord().optional(),
    enabled: z.boolean().optional().default(true)
  })
]);

// ---------------------------------------------------------------------------
// Scenario definition
// ---------------------------------------------------------------------------

var SetupSchema = z.object({
  mocks: z.array(MockSchema).optional(),
  virtual_fs: z.record(z.string()).optional()
});

var RunStepSchema = z.object({
  name: z.string().optional(),
  lang: z.string().optional(),
  code: z.string().optional(),
  // Natural language input to agent
  input: z.string().optional(),
  evaluations: z.array(EvaluationSchema)
});

var JudgmentSchema = z.object({
  strategy: z.enum(["all_pass", "any_pass"])
});

var ScenarioSchema = z.object({
  name: z.string(),
  description: z.string(),
  plugins: z.array(PluginReferenceSchema).optional(),
  environment: z.object({
    type: z.enum(["e2b", "local"])
  }),
  setup: SetupSchema.optional(),
  run: z.array(RunStepSchema),
  judgment: JudgmentSchema
});

// ---------------------------------------------------------------------------
// Scenario run result
// ---------------------------------------------------------------------------

// Required metrics; `catchall` allows additional numeric metrics.
var RunMetricsSchema = z.object({
  execution_time_seconds: z.number().min(0),
  llm_calls: z.number().int().min(0),
  total_tokens: z.number().int().min(0)
}).catchall(z.number());

// One step of the recorded agent trajectory.
var TrajectoryStepSchema = z.object({
  type: z.enum(["thought", "action", "observation"]),
  timestamp: z.string().refine(isParsableTimestamp, {
    message: "Timestamp must be a valid ISO string"
  }),
  content: z.any()
});

var ScenarioRunResultSchema = z.object({
  run_id: z.string().min(1, "Run ID cannot be empty"),
  matrix_combination_id: z.string().min(1, "Matrix combination ID cannot be empty"),
  parameters: anyRecord(),
  metrics: RunMetricsSchema,
  final_agent_response: z.string().optional(),
  evaluations: z.array(EnhancedEvaluationResultSchema),
  trajectory: z.array(TrajectoryStepSchema),
  error: z.string().nullable()
});
|
|
158
|
+
|
|
159
|
+
export {
|
|
160
|
+
EnhancedEvaluationResultSchema,
|
|
161
|
+
CapabilityCheckSchema,
|
|
162
|
+
LLMJudgeResultSchema,
|
|
163
|
+
EvaluationSchema,
|
|
164
|
+
ScenarioSchema,
|
|
165
|
+
ScenarioRunResultSchema
|
|
166
|
+
};
|