comfy-qa 1.6.0 → 1.7.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/src/agent/demo-editor.ts +0 -450
- package/src/agent/demo-research.ts +0 -725
package/package.json
CHANGED
package/src/agent/demo-editor.ts
DELETED
|
@@ -1,450 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* Demo Editor Agent — reads actions.jsonl from research, extracts active
|
|
3
|
-
* bursts (narration + surrounding actions), removes LLM thinking gaps,
|
|
4
|
-
* and assembles a tight final MP4 using ffmpeg.
|
|
5
|
-
*
|
|
6
|
-
* Key improvement: instead of keeping whole feature segments (which include
|
|
7
|
-
* 15-30s LLM response wait times), we extract only the "activity bursts"
|
|
8
|
-
* — windows around narrations and actions where something is actually
|
|
9
|
-
* happening on screen.
|
|
10
|
-
*
|
|
11
|
-
* Output:
|
|
12
|
-
* - final_demo.mp4 (polished video with idle time removed)
|
|
13
|
-
* - edit_plan.json (the editing decisions made)
|
|
14
|
-
*
|
|
15
|
-
* Usage:
|
|
16
|
-
* bun src/agent/demo-editor.ts .comfy-qa/.research/comfy-registry/
|
|
17
|
-
*/
|
|
18
|
-
import * as fs from "node:fs";
|
|
19
|
-
import * as path from "node:path";
|
|
20
|
-
import { $ } from "bun";
|
|
21
|
-
|
|
22
|
-
// ---------------------------------------------------------------------------
|
|
23
|
-
// Types
|
|
24
|
-
// ---------------------------------------------------------------------------
|
|
25
|
-
|
|
26
|
-
interface ActionLogEntry {
|
|
27
|
-
ts: number;
|
|
28
|
-
offsetMs: number;
|
|
29
|
-
type: string;
|
|
30
|
-
feature?: string;
|
|
31
|
-
chapter?: string;
|
|
32
|
-
action?: string;
|
|
33
|
-
selector?: string;
|
|
34
|
-
text?: string;
|
|
35
|
-
success?: boolean;
|
|
36
|
-
error?: string;
|
|
37
|
-
screenshot?: string;
|
|
38
|
-
}
|
|
39
|
-
|
|
40
|
-
interface Segment {
|
|
41
|
-
feature: string;
|
|
42
|
-
chapter: string;
|
|
43
|
-
startMs: number;
|
|
44
|
-
endMs: number;
|
|
45
|
-
narration: string;
|
|
46
|
-
actions: ActionLogEntry[];
|
|
47
|
-
success: boolean;
|
|
48
|
-
hasError: boolean;
|
|
49
|
-
retryCount: number;
|
|
50
|
-
}
|
|
51
|
-
|
|
52
|
-
/** A burst is a tight window of activity within a segment */
|
|
53
|
-
interface Burst {
|
|
54
|
-
feature: string;
|
|
55
|
-
chapter: string;
|
|
56
|
-
startMs: number;
|
|
57
|
-
endMs: number;
|
|
58
|
-
hasNarration: boolean;
|
|
59
|
-
eventCount: number;
|
|
60
|
-
}
|
|
61
|
-
|
|
62
|
-
interface EditPlan {
|
|
63
|
-
product: string;
|
|
64
|
-
totalRawMs: number;
|
|
65
|
-
segments: { feature: string; chapter: string; success: boolean; durationMs: number; action: string; reason: string }[];
|
|
66
|
-
bursts: Burst[];
|
|
67
|
-
estimatedFinalMs: number;
|
|
68
|
-
}
|
|
69
|
-
|
|
70
|
-
// ---------------------------------------------------------------------------
|
|
71
|
-
// Parse actions.jsonl into segments
|
|
72
|
-
// ---------------------------------------------------------------------------
|
|
73
|
-
|
|
74
|
-
function parseLog(logPath: string): ActionLogEntry[] {
|
|
75
|
-
const lines = fs.readFileSync(logPath, "utf-8").trim().split("\n");
|
|
76
|
-
return lines.map((l) => JSON.parse(l) as ActionLogEntry);
|
|
77
|
-
}
|
|
78
|
-
|
|
79
|
-
function groupIntoSegments(entries: ActionLogEntry[]): Segment[] {
|
|
80
|
-
const segments: Segment[] = [];
|
|
81
|
-
let current: Partial<Segment> | null = null;
|
|
82
|
-
let currentChapter = "";
|
|
83
|
-
|
|
84
|
-
for (const entry of entries) {
|
|
85
|
-
if (entry.type === "chapter_start") {
|
|
86
|
-
currentChapter = entry.chapter ?? "";
|
|
87
|
-
}
|
|
88
|
-
|
|
89
|
-
if (entry.type === "feature_start") {
|
|
90
|
-
current = {
|
|
91
|
-
feature: entry.feature ?? "",
|
|
92
|
-
chapter: currentChapter,
|
|
93
|
-
startMs: entry.offsetMs,
|
|
94
|
-
endMs: entry.offsetMs,
|
|
95
|
-
narration: "",
|
|
96
|
-
actions: [],
|
|
97
|
-
success: false,
|
|
98
|
-
hasError: false,
|
|
99
|
-
retryCount: 0,
|
|
100
|
-
};
|
|
101
|
-
}
|
|
102
|
-
|
|
103
|
-
if (current) {
|
|
104
|
-
current.actions!.push(entry);
|
|
105
|
-
current.endMs = entry.offsetMs;
|
|
106
|
-
|
|
107
|
-
if (entry.type === "narrate" && !current.narration) {
|
|
108
|
-
current.narration = entry.text ?? "";
|
|
109
|
-
}
|
|
110
|
-
if (entry.type === "error") {
|
|
111
|
-
current.hasError = true;
|
|
112
|
-
current.retryCount = (current.retryCount ?? 0) + 1;
|
|
113
|
-
}
|
|
114
|
-
}
|
|
115
|
-
|
|
116
|
-
if (entry.type === "feature_end" && current) {
|
|
117
|
-
current.success = entry.success ?? false;
|
|
118
|
-
current.endMs = entry.offsetMs;
|
|
119
|
-
segments.push(current as Segment);
|
|
120
|
-
current = null;
|
|
121
|
-
}
|
|
122
|
-
}
|
|
123
|
-
|
|
124
|
-
return segments;
|
|
125
|
-
}
|
|
126
|
-
|
|
127
|
-
// ---------------------------------------------------------------------------
|
|
128
|
-
// Extract activity bursts from segments
|
|
129
|
-
// ---------------------------------------------------------------------------
|
|
130
|
-
|
|
131
|
-
const BURST_PAD_BEFORE = 1000; // 1s padding before first event in burst
|
|
132
|
-
const BURST_PAD_AFTER = 1500; // 1.5s padding after last event in burst
|
|
133
|
-
const MIN_BURST_MS = 3000; // Minimum burst duration
|
|
134
|
-
|
|
135
|
-
interface TimedEvent {
|
|
136
|
-
offsetMs: number;
|
|
137
|
-
endMs: number;
|
|
138
|
-
type: string;
|
|
139
|
-
text?: string;
|
|
140
|
-
}
|
|
141
|
-
|
|
142
|
-
/**
|
|
143
|
-
* Extract activity bursts from a segment.
|
|
144
|
-
*
|
|
145
|
-
* For narrations, we estimate end time as the time of the next event
|
|
146
|
-
* (since the agent waits for TTS playback before proceeding).
|
|
147
|
-
* For actions, we use a small duration (500ms).
|
|
148
|
-
*
|
|
149
|
-
* Events that overlap or are close (within GAP_TOLERANCE) form one burst.
|
|
150
|
-
* Gaps longer than that (typically LLM thinking time) are cut.
|
|
151
|
-
*/
|
|
152
|
-
function extractBursts(seg: Segment): Burst[] {
|
|
153
|
-
// Collect all meaningful events (narrate, action, screenshot)
|
|
154
|
-
const rawEvents = seg.actions.filter(
|
|
155
|
-
(a) => a.type === "narrate" || (a.type === "action" && a.success !== false),
|
|
156
|
-
);
|
|
157
|
-
|
|
158
|
-
if (rawEvents.length === 0) return [];
|
|
159
|
-
|
|
160
|
-
// Build timed events: narration endMs = next event's timestamp (TTS plays until then)
|
|
161
|
-
const events: TimedEvent[] = [];
|
|
162
|
-
for (let i = 0; i < rawEvents.length; i++) {
|
|
163
|
-
const a = rawEvents[i];
|
|
164
|
-
if (a.type === "narrate") {
|
|
165
|
-
// Estimate TTS duration from word count (~150 wpm)
|
|
166
|
-
const words = (a.text ?? "").split(/\s+/).length;
|
|
167
|
-
const ttsDurMs = Math.max(2500, (words / 150) * 60 * 1000);
|
|
168
|
-
|
|
169
|
-
// Also find the next meaningful event timestamp
|
|
170
|
-
const idx = seg.actions.indexOf(a);
|
|
171
|
-
let nextTs = Infinity;
|
|
172
|
-
for (let j = idx + 1; j < seg.actions.length; j++) {
|
|
173
|
-
const n = seg.actions[j];
|
|
174
|
-
if (n.type === "narrate" || n.type === "action") {
|
|
175
|
-
nextTs = n.offsetMs;
|
|
176
|
-
break;
|
|
177
|
-
}
|
|
178
|
-
}
|
|
179
|
-
|
|
180
|
-
// Use the SHORTER of: estimated TTS duration, or time until next event.
|
|
181
|
-
// This avoids including LLM thinking time that happens after TTS finishes.
|
|
182
|
-
const endMs = a.offsetMs + Math.min(ttsDurMs, nextTs === Infinity ? ttsDurMs : nextTs - a.offsetMs);
|
|
183
|
-
events.push({ offsetMs: a.offsetMs, endMs, type: "narrate", text: a.text });
|
|
184
|
-
} else {
|
|
185
|
-
events.push({ offsetMs: a.offsetMs, endMs: a.offsetMs + 500, type: "action" });
|
|
186
|
-
}
|
|
187
|
-
}
|
|
188
|
-
|
|
189
|
-
events.sort((a, b) => a.offsetMs - b.offsetMs);
|
|
190
|
-
|
|
191
|
-
// Merge events into bursts. Gap tolerance: 3s (covers visual pause between actions).
|
|
192
|
-
// Anything longer = LLM thinking time, which we want to cut.
|
|
193
|
-
const GAP_TOLERANCE = 3000;
|
|
194
|
-
const groups: TimedEvent[][] = [[events[0]]];
|
|
195
|
-
let groupEnd = events[0].endMs;
|
|
196
|
-
|
|
197
|
-
for (let i = 1; i < events.length; i++) {
|
|
198
|
-
if (events[i].offsetMs <= groupEnd + GAP_TOLERANCE) {
|
|
199
|
-
groups[groups.length - 1].push(events[i]);
|
|
200
|
-
groupEnd = Math.max(groupEnd, events[i].endMs);
|
|
201
|
-
} else {
|
|
202
|
-
groups.push([events[i]]);
|
|
203
|
-
groupEnd = events[i].endMs;
|
|
204
|
-
}
|
|
205
|
-
}
|
|
206
|
-
|
|
207
|
-
return groups.map((g) => {
|
|
208
|
-
const firstStart = g[0].offsetMs;
|
|
209
|
-
const lastEnd = Math.max(...g.map((e) => e.endMs));
|
|
210
|
-
const startMs = Math.max(seg.startMs, firstStart - BURST_PAD_BEFORE);
|
|
211
|
-
const endMs = Math.min(seg.endMs, lastEnd + BURST_PAD_AFTER);
|
|
212
|
-
const hasNarration = g.some((e) => e.type === "narrate");
|
|
213
|
-
return {
|
|
214
|
-
feature: seg.feature,
|
|
215
|
-
chapter: seg.chapter,
|
|
216
|
-
startMs,
|
|
217
|
-
endMs: Math.max(endMs, startMs + MIN_BURST_MS),
|
|
218
|
-
hasNarration,
|
|
219
|
-
eventCount: g.length,
|
|
220
|
-
};
|
|
221
|
-
});
|
|
222
|
-
}
|
|
223
|
-
|
|
224
|
-
// ---------------------------------------------------------------------------
|
|
225
|
-
// Build edit plan
|
|
226
|
-
// ---------------------------------------------------------------------------
|
|
227
|
-
|
|
228
|
-
function createEditPlan(entries: ActionLogEntry[]): EditPlan {
|
|
229
|
-
const segments = groupIntoSegments(entries);
|
|
230
|
-
const totalRawMs = entries.length > 0 ? entries[entries.length - 1].offsetMs : 0;
|
|
231
|
-
|
|
232
|
-
const segmentReports: EditPlan["segments"] = [];
|
|
233
|
-
const allBursts: Burst[] = [];
|
|
234
|
-
|
|
235
|
-
for (const seg of segments) {
|
|
236
|
-
const durationMs = seg.endMs - seg.startMs;
|
|
237
|
-
|
|
238
|
-
if (!seg.success) {
|
|
239
|
-
segmentReports.push({
|
|
240
|
-
feature: seg.feature,
|
|
241
|
-
chapter: seg.chapter,
|
|
242
|
-
success: false,
|
|
243
|
-
durationMs,
|
|
244
|
-
action: "cut",
|
|
245
|
-
reason: "Feature demonstration failed",
|
|
246
|
-
});
|
|
247
|
-
continue;
|
|
248
|
-
}
|
|
249
|
-
|
|
250
|
-
if (durationMs < 1500) {
|
|
251
|
-
segmentReports.push({
|
|
252
|
-
feature: seg.feature,
|
|
253
|
-
chapter: seg.chapter,
|
|
254
|
-
success: true,
|
|
255
|
-
durationMs,
|
|
256
|
-
action: "cut",
|
|
257
|
-
reason: "Too short to be meaningful",
|
|
258
|
-
});
|
|
259
|
-
continue;
|
|
260
|
-
}
|
|
261
|
-
|
|
262
|
-
const bursts = extractBursts(seg);
|
|
263
|
-
// Only keep bursts that have narration (visual-only bursts are usually noise)
|
|
264
|
-
const goodBursts = bursts.filter((b) => b.hasNarration);
|
|
265
|
-
|
|
266
|
-
if (goodBursts.length === 0) {
|
|
267
|
-
// Fallback: keep all bursts if none have narration
|
|
268
|
-
const fallback = bursts.length > 0 ? bursts : [];
|
|
269
|
-
allBursts.push(...fallback);
|
|
270
|
-
segmentReports.push({
|
|
271
|
-
feature: seg.feature,
|
|
272
|
-
chapter: seg.chapter,
|
|
273
|
-
success: true,
|
|
274
|
-
durationMs,
|
|
275
|
-
action: fallback.length > 0 ? "burst" : "cut",
|
|
276
|
-
reason: fallback.length > 0
|
|
277
|
-
? `No narrated bursts, keeping ${fallback.length} action bursts`
|
|
278
|
-
: "No usable bursts",
|
|
279
|
-
});
|
|
280
|
-
} else {
|
|
281
|
-
allBursts.push(...goodBursts);
|
|
282
|
-
const burstMs = goodBursts.reduce((a, b) => a + (b.endMs - b.startMs), 0);
|
|
283
|
-
const saved = durationMs - burstMs;
|
|
284
|
-
segmentReports.push({
|
|
285
|
-
feature: seg.feature,
|
|
286
|
-
chapter: seg.chapter,
|
|
287
|
-
success: true,
|
|
288
|
-
durationMs,
|
|
289
|
-
action: "burst",
|
|
290
|
-
reason: `${goodBursts.length} burst(s), ${(burstMs / 1000).toFixed(1)}s kept, ${(saved / 1000).toFixed(1)}s idle removed`,
|
|
291
|
-
});
|
|
292
|
-
}
|
|
293
|
-
}
|
|
294
|
-
|
|
295
|
-
const estimatedFinalMs = allBursts.reduce((a, b) => a + (b.endMs - b.startMs), 0);
|
|
296
|
-
|
|
297
|
-
return {
|
|
298
|
-
product: "",
|
|
299
|
-
totalRawMs,
|
|
300
|
-
segments: segmentReports,
|
|
301
|
-
bursts: allBursts,
|
|
302
|
-
estimatedFinalMs,
|
|
303
|
-
};
|
|
304
|
-
}
|
|
305
|
-
|
|
306
|
-
// ---------------------------------------------------------------------------
|
|
307
|
-
// Generate ffmpeg script
|
|
308
|
-
// ---------------------------------------------------------------------------
|
|
309
|
-
|
|
310
|
-
function generateFfmpegScript(plan: EditPlan, rawVideoPath: string, outputPath: string): string {
|
|
311
|
-
const audioPath = rawVideoPath.replace("raw_video.webm", "narration.wav");
|
|
312
|
-
const hasAudio = fs.existsSync(audioPath);
|
|
313
|
-
|
|
314
|
-
if (plan.bursts.length === 0) {
|
|
315
|
-
return `#!/bin/bash\necho "No usable bursts found"`;
|
|
316
|
-
}
|
|
317
|
-
|
|
318
|
-
const concatListPath = outputPath.replace(".mp4", "-segments.txt");
|
|
319
|
-
const segmentCmds: string[] = [];
|
|
320
|
-
|
|
321
|
-
for (let i = 0; i < plan.bursts.length; i++) {
|
|
322
|
-
const b = plan.bursts[i];
|
|
323
|
-
const startSec = (b.startMs / 1000).toFixed(3);
|
|
324
|
-
const endSec = (b.endMs / 1000).toFixed(3);
|
|
325
|
-
const segFile = outputPath.replace(".mp4", `-seg${i}.mp4`);
|
|
326
|
-
|
|
327
|
-
if (hasAudio) {
|
|
328
|
-
segmentCmds.push(
|
|
329
|
-
`ffmpeg -y -ss ${startSec} -to ${endSec} -i "${rawVideoPath}" ` +
|
|
330
|
-
`-ss ${startSec} -to ${endSec} -i "${audioPath}" ` +
|
|
331
|
-
`-map 0:v -map 1:a -c:v libx264 -preset fast -c:a aac -b:a 128k -shortest "${segFile}"`,
|
|
332
|
-
);
|
|
333
|
-
} else {
|
|
334
|
-
segmentCmds.push(
|
|
335
|
-
`ffmpeg -y -ss ${startSec} -to ${endSec} -i "${rawVideoPath}" ` +
|
|
336
|
-
`-c:v libx264 -preset fast "${segFile}"`,
|
|
337
|
-
);
|
|
338
|
-
}
|
|
339
|
-
}
|
|
340
|
-
|
|
341
|
-
const concatEntries = plan.bursts.map((_, i) =>
|
|
342
|
-
`file '${outputPath.replace(".mp4", `-seg${i}.mp4`)}'`,
|
|
343
|
-
);
|
|
344
|
-
|
|
345
|
-
return `#!/bin/bash
|
|
346
|
-
set -e
|
|
347
|
-
|
|
348
|
-
# Cut individual bursts (video + audio synced at same timestamps)
|
|
349
|
-
${segmentCmds.join("\n")}
|
|
350
|
-
|
|
351
|
-
# Create concat list
|
|
352
|
-
cat > "${concatListPath}" << 'CONCAT'
|
|
353
|
-
${concatEntries.join("\n")}
|
|
354
|
-
CONCAT
|
|
355
|
-
|
|
356
|
-
# Concatenate all bursts into final video
|
|
357
|
-
ffmpeg -y -f concat -safe 0 -i "${concatListPath}" -c copy "${outputPath}"
|
|
358
|
-
|
|
359
|
-
echo "Final video: ${outputPath}"
|
|
360
|
-
|
|
361
|
-
# Cleanup
|
|
362
|
-
rm -f ${plan.bursts.map((_, i) => `"${outputPath.replace(".mp4", `-seg${i}.mp4`)}"`).join(" ")}
|
|
363
|
-
rm -f "${concatListPath}"
|
|
364
|
-
`;
|
|
365
|
-
}
|
|
366
|
-
|
|
367
|
-
// ---------------------------------------------------------------------------
|
|
368
|
-
// Main
|
|
369
|
-
// ---------------------------------------------------------------------------
|
|
370
|
-
|
|
371
|
-
export async function editDemoResearch(researchDir: string) {
|
|
372
|
-
const logPath = path.join(researchDir, "actions.jsonl");
|
|
373
|
-
if (!fs.existsSync(logPath)) {
|
|
374
|
-
console.error(`No actions.jsonl found in ${researchDir}`);
|
|
375
|
-
process.exit(1);
|
|
376
|
-
}
|
|
377
|
-
|
|
378
|
-
console.log(`\n Editor Agent: processing ${researchDir}\n`);
|
|
379
|
-
|
|
380
|
-
// Parse and plan
|
|
381
|
-
const entries = parseLog(logPath);
|
|
382
|
-
const plan = createEditPlan(entries);
|
|
383
|
-
|
|
384
|
-
const rawSec = (plan.totalRawMs / 1000).toFixed(1);
|
|
385
|
-
const finalSec = (plan.estimatedFinalMs / 1000).toFixed(1);
|
|
386
|
-
const ratio = plan.totalRawMs > 0 ? ((1 - plan.estimatedFinalMs / plan.totalRawMs) * 100).toFixed(0) : "0";
|
|
387
|
-
|
|
388
|
-
console.log(` Raw footage: ${rawSec}s`);
|
|
389
|
-
console.log(` Final video: ${finalSec}s (${ratio}% idle removed)`);
|
|
390
|
-
console.log(` Bursts: ${plan.bursts.length}`);
|
|
391
|
-
console.log(` Segments: ${plan.segments.length} features`);
|
|
392
|
-
|
|
393
|
-
// Save edit plan
|
|
394
|
-
const planPath = path.join(researchDir, "edit_plan.json");
|
|
395
|
-
fs.writeFileSync(planPath, JSON.stringify(plan, null, 2));
|
|
396
|
-
console.log(` Plan: ${planPath}`);
|
|
397
|
-
|
|
398
|
-
// Generate ffmpeg script
|
|
399
|
-
const rawVideoPath = path.join(researchDir, "raw_video.webm");
|
|
400
|
-
const outputPath = path.join(researchDir, "final_demo.mp4");
|
|
401
|
-
|
|
402
|
-
if (fs.existsSync(rawVideoPath)) {
|
|
403
|
-
const scriptPath = path.join(researchDir, "edit.sh");
|
|
404
|
-
const script = generateFfmpegScript(plan, rawVideoPath, outputPath);
|
|
405
|
-
fs.writeFileSync(scriptPath, script, { mode: 0o755 });
|
|
406
|
-
console.log(` Script: ${scriptPath}\n`);
|
|
407
|
-
|
|
408
|
-
try {
|
|
409
|
-
console.log(" Running ffmpeg...");
|
|
410
|
-
await $`bash ${scriptPath}`.quiet();
|
|
411
|
-
console.log(` Done: ${outputPath}`);
|
|
412
|
-
} catch (err: any) {
|
|
413
|
-
console.log(` ffmpeg failed — run manually: bash ${scriptPath}`);
|
|
414
|
-
}
|
|
415
|
-
} else {
|
|
416
|
-
console.log(` No raw_video.webm found — skipping video assembly`);
|
|
417
|
-
}
|
|
418
|
-
|
|
419
|
-
// Print segment report
|
|
420
|
-
console.log("\n --- Segment Report ---");
|
|
421
|
-
for (const s of plan.segments) {
|
|
422
|
-
const dur = (s.durationMs / 1000).toFixed(1);
|
|
423
|
-
const icon = s.action === "cut" ? "X" : "OK";
|
|
424
|
-
console.log(` [${icon}] [${dur}s] ${s.chapter} / ${s.feature}: ${s.reason}`);
|
|
425
|
-
}
|
|
426
|
-
|
|
427
|
-
// Print burst details
|
|
428
|
-
console.log("\n --- Burst Detail ---");
|
|
429
|
-
for (const b of plan.bursts) {
|
|
430
|
-
const dur = ((b.endMs - b.startMs) / 1000).toFixed(1);
|
|
431
|
-
console.log(` [${dur}s] ${b.chapter} / ${b.feature} (${b.eventCount} events${b.hasNarration ? ", narrated" : ""})`);
|
|
432
|
-
}
|
|
433
|
-
console.log("");
|
|
434
|
-
}
|
|
435
|
-
|
|
436
|
-
// ---------------------------------------------------------------------------
|
|
437
|
-
// CLI
|
|
438
|
-
// ---------------------------------------------------------------------------
|
|
439
|
-
|
|
440
|
-
if (import.meta.main) {
|
|
441
|
-
const dir = process.argv[2];
|
|
442
|
-
if (!dir) {
|
|
443
|
-
console.error("Usage: bun src/agent/demo-editor.ts <research-dir>");
|
|
444
|
-
process.exit(1);
|
|
445
|
-
}
|
|
446
|
-
editDemoResearch(path.resolve(dir)).catch((err) => {
|
|
447
|
-
console.error(err);
|
|
448
|
-
process.exit(1);
|
|
449
|
-
});
|
|
450
|
-
}
|
|
@@ -1,725 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* Demo Research Agent — explores a website guided by a feature checklist,
|
|
3
|
-
* narrates discoveries via TTS, and logs every action with timestamps.
|
|
4
|
-
*
|
|
5
|
-
* Output:
|
|
6
|
-
* - raw_video.webm (full Playwright recording)
|
|
7
|
-
* - actions.jsonl (timestamped action log with feature markers)
|
|
8
|
-
* - screenshots/ (per-feature screenshots)
|
|
9
|
-
*
|
|
10
|
-
* Usage:
|
|
11
|
-
* bun src/agent/demo-research.ts demo/checklists/registry-web.yaml
|
|
12
|
-
*/
|
|
13
|
-
import { chromium, type Page, type BrowserContext } from "playwright";
|
|
14
|
-
import * as fs from "node:fs";
|
|
15
|
-
import * as path from "node:path";
|
|
16
|
-
import * as yaml from "yaml";
|
|
17
|
-
import { applyHud } from "../../lib/demowright/dist/setup.mjs";
|
|
18
|
-
|
|
19
|
-
// ---------------------------------------------------------------------------
|
|
20
|
-
// Types
|
|
21
|
-
// ---------------------------------------------------------------------------
|
|
22
|
-
|
|
23
|
-
interface FeatureItem {
|
|
24
|
-
id: string;
|
|
25
|
-
description: string;
|
|
26
|
-
action?: string;
|
|
27
|
-
narration_hint: string;
|
|
28
|
-
success_hint?: string;
|
|
29
|
-
}
|
|
30
|
-
|
|
31
|
-
interface Chapter {
|
|
32
|
-
name: string;
|
|
33
|
-
goal: string;
|
|
34
|
-
features: FeatureItem[];
|
|
35
|
-
}
|
|
36
|
-
|
|
37
|
-
interface Checklist {
|
|
38
|
-
product: string;
|
|
39
|
-
url: string;
|
|
40
|
-
staging_url_env?: string;
|
|
41
|
-
persona: string;
|
|
42
|
-
narration_style: Record<string, string>;
|
|
43
|
-
chapters: Chapter[];
|
|
44
|
-
conclusion: { narration: string };
|
|
45
|
-
}
|
|
46
|
-
|
|
47
|
-
interface ActionLogEntry {
|
|
48
|
-
ts: number;
|
|
49
|
-
offsetMs: number;
|
|
50
|
-
type: "narrate" | "action" | "feature_start" | "feature_end" | "chapter_start" | "chapter_end" | "screenshot" | "error";
|
|
51
|
-
feature?: string;
|
|
52
|
-
chapter?: string;
|
|
53
|
-
action?: string;
|
|
54
|
-
selector?: string;
|
|
55
|
-
text?: string;
|
|
56
|
-
success?: boolean;
|
|
57
|
-
error?: string;
|
|
58
|
-
screenshot?: string;
|
|
59
|
-
}
|
|
60
|
-
|
|
61
|
-
interface AgentDecision {
|
|
62
|
-
narration: string;
|
|
63
|
-
actions: Array<{
|
|
64
|
-
type: "click" | "type" | "scroll" | "hover" | "wait" | "key";
|
|
65
|
-
selector?: string;
|
|
66
|
-
text?: string;
|
|
67
|
-
x?: number;
|
|
68
|
-
y?: number;
|
|
69
|
-
key?: string;
|
|
70
|
-
ms?: number;
|
|
71
|
-
}>;
|
|
72
|
-
done: boolean;
|
|
73
|
-
observation: string;
|
|
74
|
-
}
|
|
75
|
-
|
|
76
|
-
// ---------------------------------------------------------------------------
|
|
77
|
-
// LLM integration (Anthropic direct or OpenRouter)
|
|
78
|
-
// ---------------------------------------------------------------------------
|
|
79
|
-
|
|
80
|
-
async function callLLM(
|
|
81
|
-
systemPrompt: string,
|
|
82
|
-
messages: Array<{ role: "user" | "assistant"; content: any }>,
|
|
83
|
-
): Promise<string> {
|
|
84
|
-
const openRouterKey = process.env.OPENROUTER_API_KEY;
|
|
85
|
-
const anthropicKey = process.env.ANTHROPIC_API_KEY;
|
|
86
|
-
|
|
87
|
-
if (openRouterKey) {
|
|
88
|
-
return callOpenRouter(systemPrompt, messages, openRouterKey);
|
|
89
|
-
}
|
|
90
|
-
if (anthropicKey) {
|
|
91
|
-
return callAnthropic(systemPrompt, messages);
|
|
92
|
-
}
|
|
93
|
-
|
|
94
|
-
// Fallback: claude CLI
|
|
95
|
-
const prompt = `${systemPrompt}\n\n${messages.map((m) => (typeof m.content === "string" ? m.content : JSON.stringify(m.content))).join("\n")}`;
|
|
96
|
-
const proc = Bun.spawn(["claude", "--print", "--model", "claude-sonnet-4-6"], {
|
|
97
|
-
stdin: new TextEncoder().encode(prompt),
|
|
98
|
-
stdout: "pipe",
|
|
99
|
-
stderr: "pipe",
|
|
100
|
-
});
|
|
101
|
-
const output = await new Response(proc.stdout).text();
|
|
102
|
-
await proc.exited;
|
|
103
|
-
return output;
|
|
104
|
-
}
|
|
105
|
-
|
|
106
|
-
async function callAnthropic(
|
|
107
|
-
systemPrompt: string,
|
|
108
|
-
messages: Array<{ role: "user" | "assistant"; content: any }>,
|
|
109
|
-
): Promise<string> {
|
|
110
|
-
const Anthropic = (await import("@anthropic-ai/sdk")).default;
|
|
111
|
-
const client = new Anthropic();
|
|
112
|
-
const response = await client.messages.create({
|
|
113
|
-
model: "claude-sonnet-4-6",
|
|
114
|
-
max_tokens: 2048,
|
|
115
|
-
system: systemPrompt,
|
|
116
|
-
messages,
|
|
117
|
-
});
|
|
118
|
-
return response.content[0].type === "text" ? response.content[0].text : "";
|
|
119
|
-
}
|
|
120
|
-
|
|
121
|
-
async function callOpenRouter(
|
|
122
|
-
systemPrompt: string,
|
|
123
|
-
messages: Array<{ role: "user" | "assistant"; content: any }>,
|
|
124
|
-
apiKey: string,
|
|
125
|
-
): Promise<string> {
|
|
126
|
-
// Convert Anthropic-style messages to OpenAI-style for OpenRouter
|
|
127
|
-
const openaiMessages: any[] = [{ role: "system", content: systemPrompt }];
|
|
128
|
-
|
|
129
|
-
for (const msg of messages) {
|
|
130
|
-
if (typeof msg.content === "string") {
|
|
131
|
-
openaiMessages.push({ role: msg.role, content: msg.content });
|
|
132
|
-
} else if (Array.isArray(msg.content)) {
|
|
133
|
-
// Convert Anthropic content blocks to OpenAI format
|
|
134
|
-
const parts: any[] = [];
|
|
135
|
-
for (const block of msg.content) {
|
|
136
|
-
if (block.type === "text") {
|
|
137
|
-
parts.push({ type: "text", text: block.text });
|
|
138
|
-
} else if (block.type === "image") {
|
|
139
|
-
parts.push({
|
|
140
|
-
type: "image_url",
|
|
141
|
-
image_url: {
|
|
142
|
-
url: `data:${block.source.media_type};base64,${block.source.data}`,
|
|
143
|
-
},
|
|
144
|
-
});
|
|
145
|
-
}
|
|
146
|
-
}
|
|
147
|
-
openaiMessages.push({ role: msg.role, content: parts });
|
|
148
|
-
}
|
|
149
|
-
}
|
|
150
|
-
|
|
151
|
-
const res = await fetch("https://openrouter.ai/api/v1/chat/completions", {
|
|
152
|
-
method: "POST",
|
|
153
|
-
headers: {
|
|
154
|
-
Authorization: `Bearer ${apiKey}`,
|
|
155
|
-
"Content-Type": "application/json",
|
|
156
|
-
"HTTP-Referer": "https://github.com/comfy-org/comfy-qa",
|
|
157
|
-
"X-Title": "Comfy QA Demo Research",
|
|
158
|
-
},
|
|
159
|
-
body: JSON.stringify({
|
|
160
|
-
model: "anthropic/claude-sonnet-4",
|
|
161
|
-
max_tokens: 2048,
|
|
162
|
-
messages: openaiMessages,
|
|
163
|
-
}),
|
|
164
|
-
});
|
|
165
|
-
|
|
166
|
-
if (!res.ok) {
|
|
167
|
-
const errText = await res.text();
|
|
168
|
-
throw new Error(`OpenRouter API error ${res.status}: ${errText.slice(0, 300)}`);
|
|
169
|
-
}
|
|
170
|
-
|
|
171
|
-
const json = (await res.json()) as any;
|
|
172
|
-
return json.choices?.[0]?.message?.content ?? "";
|
|
173
|
-
}
|
|
174
|
-
|
|
175
|
-
async function askAgent(
|
|
176
|
-
checklist: Checklist,
|
|
177
|
-
chapter: Chapter,
|
|
178
|
-
feature: FeatureItem,
|
|
179
|
-
pageState: { screenshot: string; a11yTree: string; url: string; title: string },
|
|
180
|
-
history: string[],
|
|
181
|
-
attempt: number,
|
|
182
|
-
): Promise<AgentDecision> {
|
|
183
|
-
const systemPrompt = `You are a demo presenter exploring a website to create a narrated video demo.
|
|
184
|
-
|
|
185
|
-
Product: ${checklist.product}
|
|
186
|
-
Persona: ${checklist.persona}
|
|
187
|
-
|
|
188
|
-
You are currently demonstrating a feature. Your job is to:
|
|
189
|
-
1. Write a short narration (1-2 sentences, first person, conversational) explaining what you're doing and WHY it matters to the user.
|
|
190
|
-
2. Decide what Playwright actions to take to demonstrate the feature.
|
|
191
|
-
3. Report whether you've successfully demonstrated the feature.
|
|
192
|
-
|
|
193
|
-
CRITICAL RULES FOR "done":
|
|
194
|
-
- Set "done": true as soon as you have NARRATED the feature and it is VISIBLE on the page. You do NOT need to click every element.
|
|
195
|
-
- If the relevant content is already visible in the screenshot or accessibility tree, narrate it and set "done": true immediately.
|
|
196
|
-
- Scrolling to see content and narrating it IS a successful demonstration. You don't need to interact further.
|
|
197
|
-
- If you've already narrated the feature on a previous attempt, set "done": true.
|
|
198
|
-
- Do NOT keep trying different selectors if the content is already visible. Just narrate and finish.
|
|
199
|
-
- Maximum 2 actions per response. Prefer scroll + wait over complex click sequences.
|
|
200
|
-
|
|
201
|
-
IMPORTANT: You are in a headless browser — there is NO URL bar. To navigate to a different page, use {"type": "navigate", "text": "https://full-url-here"}. Do NOT try to use keyboard shortcuts like Ctrl+L or type URLs into input fields.
|
|
202
|
-
|
|
203
|
-
Respond with ONLY a JSON object (no markdown):
|
|
204
|
-
{
|
|
205
|
-
"narration": "What to say (first person, explain user benefit)",
|
|
206
|
-
"actions": [
|
|
207
|
-
{"type": "click", "selector": "CSS selector"},
|
|
208
|
-
{"type": "type", "selector": "CSS selector", "text": "text to type"},
|
|
209
|
-
{"type": "scroll", "y": 300},
|
|
210
|
-
{"type": "hover", "selector": "CSS selector"},
|
|
211
|
-
{"type": "wait", "ms": 1000},
|
|
212
|
-
{"type": "key", "key": "Enter"},
|
|
213
|
-
{"type": "navigate", "text": "https://example.com/page"}
|
|
214
|
-
],
|
|
215
|
-
"done": true/false,
|
|
216
|
-
"observation": "What I see on the page"
|
|
217
|
-
}`;
|
|
218
|
-
|
|
219
|
-
const userContent: any[] = [
|
|
220
|
-
{
|
|
221
|
-
type: "text",
|
|
222
|
-
text: `## Current Task
|
|
223
|
-
Chapter: ${chapter.name} — ${chapter.goal}
|
|
224
|
-
Feature: ${feature.id} — ${feature.description}
|
|
225
|
-
${feature.action ? `Suggested action: ${feature.action}` : ""}
|
|
226
|
-
Narration hint: ${feature.narration_hint}
|
|
227
|
-
${feature.success_hint ? `Success criteria: ${feature.success_hint}` : ""}
|
|
228
|
-
Attempt: ${attempt}/3
|
|
229
|
-
${attempt > 1 ? "IMPORTANT: If you can see the relevant content on the page, just narrate it and set done=true. Do not keep retrying." : ""}
|
|
230
|
-
|
|
231
|
-
## Page State
|
|
232
|
-
URL: ${pageState.url}
|
|
233
|
-
Title: ${pageState.title}
|
|
234
|
-
|
|
235
|
-
## Accessibility Tree (first 2000 chars)
|
|
236
|
-
${pageState.a11yTree.slice(0, 2000)}
|
|
237
|
-
|
|
238
|
-
## Recent History
|
|
239
|
-
${history.slice(-8).join("\n") || "(start)"}`,
|
|
240
|
-
},
|
|
241
|
-
];
|
|
242
|
-
|
|
243
|
-
// Include screenshot as vision input
|
|
244
|
-
if (pageState.screenshot) {
|
|
245
|
-
userContent.push({
|
|
246
|
-
type: "image",
|
|
247
|
-
source: {
|
|
248
|
-
type: "base64",
|
|
249
|
-
media_type: "image/png",
|
|
250
|
-
data: pageState.screenshot,
|
|
251
|
-
},
|
|
252
|
-
});
|
|
253
|
-
}
|
|
254
|
-
|
|
255
|
-
const text = await callLLM(systemPrompt, [{ role: "user", content: userContent }]);
|
|
256
|
-
|
|
257
|
-
// Parse JSON response
|
|
258
|
-
const jsonMatch = text.match(/\{[\s\S]*\}/);
|
|
259
|
-
if (!jsonMatch) {
|
|
260
|
-
return {
|
|
261
|
-
narration: "",
|
|
262
|
-
actions: [],
|
|
263
|
-
done: false,
|
|
264
|
-
observation: `Could not parse agent response: ${text.slice(0, 200)}`,
|
|
265
|
-
};
|
|
266
|
-
}
|
|
267
|
-
try {
|
|
268
|
-
return JSON.parse(jsonMatch[0]) as AgentDecision;
|
|
269
|
-
} catch {
|
|
270
|
-
return {
|
|
271
|
-
narration: "",
|
|
272
|
-
actions: [],
|
|
273
|
-
done: false,
|
|
274
|
-
observation: "JSON parse failed",
|
|
275
|
-
};
|
|
276
|
-
}
|
|
277
|
-
}
|
|
278
|
-
|
|
279
|
-
// ---------------------------------------------------------------------------
|
|
280
|
-
// Page state capture
|
|
281
|
-
// ---------------------------------------------------------------------------
|
|
282
|
-
|
|
283
|
-
async function getA11ySnapshot(page: Page): Promise<string> {
|
|
284
|
-
try {
|
|
285
|
-
const tree = await page.accessibility.snapshot();
|
|
286
|
-
return tree ? formatA11y(tree, 0) : "(empty)";
|
|
287
|
-
} catch {
|
|
288
|
-
return "(unavailable)";
|
|
289
|
-
}
|
|
290
|
-
}
|
|
291
|
-
|
|
292
|
-
function formatA11y(node: any, depth: number): string {
|
|
293
|
-
const indent = " ".repeat(depth);
|
|
294
|
-
let line = `${indent}[${node.role}]`;
|
|
295
|
-
if (node.name) line += ` "${node.name}"`;
|
|
296
|
-
if (node.value) line += ` val="${node.value}"`;
|
|
297
|
-
let result = line + "\n";
|
|
298
|
-
if (node.children) {
|
|
299
|
-
for (const child of node.children.slice(0, 40)) {
|
|
300
|
-
result += formatA11y(child, depth + 1);
|
|
301
|
-
}
|
|
302
|
-
}
|
|
303
|
-
return result;
|
|
304
|
-
}
|
|
305
|
-
|
|
306
|
-
async function captureState(page: Page) {
|
|
307
|
-
const buf = await page.screenshot({ type: "png" });
|
|
308
|
-
return {
|
|
309
|
-
screenshot: buf.toString("base64"),
|
|
310
|
-
a11yTree: await getA11ySnapshot(page),
|
|
311
|
-
url: page.url(),
|
|
312
|
-
title: await page.title(),
|
|
313
|
-
};
|
|
314
|
-
}
|
|
315
|
-
|
|
316
|
-
// ---------------------------------------------------------------------------
|
|
317
|
-
// Action execution
|
|
318
|
-
// ---------------------------------------------------------------------------
|
|
319
|
-
|
|
320
|
-
async function executeAction(
|
|
321
|
-
page: Page,
|
|
322
|
-
action: AgentDecision["actions"][0],
|
|
323
|
-
): Promise<{ success: boolean; result: string }> {
|
|
324
|
-
try {
|
|
325
|
-
switch (action.type) {
|
|
326
|
-
case "click":
|
|
327
|
-
if (action.selector) {
|
|
328
|
-
await page.click(action.selector, { timeout: 5000 });
|
|
329
|
-
return { success: true, result: `Clicked: ${action.selector}` };
|
|
330
|
-
}
|
|
331
|
-
if (action.x !== undefined && action.y !== undefined) {
|
|
332
|
-
await page.mouse.click(action.x, action.y);
|
|
333
|
-
return { success: true, result: `Clicked (${action.x},${action.y})` };
|
|
334
|
-
}
|
|
335
|
-
return { success: false, result: "Click: no target" };
|
|
336
|
-
|
|
337
|
-
case "type":
|
|
338
|
-
if (action.selector && action.text) {
|
|
339
|
-
await page.fill(action.selector, action.text, { timeout: 5000 });
|
|
340
|
-
return { success: true, result: `Typed "${action.text}" → ${action.selector}` };
|
|
341
|
-
}
|
|
342
|
-
if (action.text) {
|
|
343
|
-
await page.keyboard.type(action.text, { delay: 80 });
|
|
344
|
-
return { success: true, result: `Typed: "${action.text}"` };
|
|
345
|
-
}
|
|
346
|
-
return { success: false, result: "Type: no text" };
|
|
347
|
-
|
|
348
|
-
case "scroll":
|
|
349
|
-
await page.mouse.wheel(0, action.y ?? 300);
|
|
350
|
-
return { success: true, result: `Scrolled ${action.y ?? 300}px` };
|
|
351
|
-
|
|
352
|
-
case "hover":
|
|
353
|
-
if (action.selector) {
|
|
354
|
-
await page.hover(action.selector, { timeout: 5000 });
|
|
355
|
-
return { success: true, result: `Hovered: ${action.selector}` };
|
|
356
|
-
}
|
|
357
|
-
return { success: false, result: "Hover: no target" };
|
|
358
|
-
|
|
359
|
-
case "wait":
|
|
360
|
-
await page.waitForTimeout(action.ms ?? 1000);
|
|
361
|
-
return { success: true, result: `Waited ${action.ms ?? 1000}ms` };
|
|
362
|
-
|
|
363
|
-
case "key":
|
|
364
|
-
if (action.key) {
|
|
365
|
-
await page.keyboard.press(action.key);
|
|
366
|
-
return { success: true, result: `Key: ${action.key}` };
|
|
367
|
-
}
|
|
368
|
-
return { success: false, result: "Key: none" };
|
|
369
|
-
|
|
370
|
-
case "navigate":
|
|
371
|
-
if (action.text) {
|
|
372
|
-
await page.goto(action.text, { waitUntil: "domcontentloaded", timeout: 15000 });
|
|
373
|
-
await page.waitForTimeout(1500);
|
|
374
|
-
return { success: true, result: `Navigated: ${action.text}` };
|
|
375
|
-
}
|
|
376
|
-
return { success: false, result: "Navigate: no URL" };
|
|
377
|
-
|
|
378
|
-
default:
|
|
379
|
-
return { success: false, result: `Unknown: ${action.type}` };
|
|
380
|
-
}
|
|
381
|
-
} catch (err: any) {
|
|
382
|
-
return { success: false, result: `Failed: ${err.message?.slice(0, 150)}` };
|
|
383
|
-
}
|
|
384
|
-
}
|
|
385
|
-
|
|
386
|
-
// ---------------------------------------------------------------------------
|
|
387
|
-
// TTS (reuse Gemini TTS from fixture)
|
|
388
|
-
// ---------------------------------------------------------------------------
|
|
389
|
-
|
|
390
|
-
async function generateTTS(text: string): Promise<Buffer | null> {
|
|
391
|
-
const apiKey = process.env.GEMINI_API_KEY;
|
|
392
|
-
if (!apiKey || !text.trim()) return null;
|
|
393
|
-
|
|
394
|
-
const url = `https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash-preview-tts:generateContent?key=${apiKey}`;
|
|
395
|
-
const body = {
|
|
396
|
-
contents: [{ parts: [{ text }] }],
|
|
397
|
-
generationConfig: {
|
|
398
|
-
responseModalities: ["AUDIO"],
|
|
399
|
-
speechConfig: {
|
|
400
|
-
voiceConfig: { prebuiltVoiceConfig: { voiceName: "Kore" } },
|
|
401
|
-
},
|
|
402
|
-
},
|
|
403
|
-
};
|
|
404
|
-
|
|
405
|
-
try {
|
|
406
|
-
const res = await fetch(url, {
|
|
407
|
-
method: "POST",
|
|
408
|
-
headers: { "Content-Type": "application/json" },
|
|
409
|
-
body: JSON.stringify(body),
|
|
410
|
-
});
|
|
411
|
-
if (!res.ok) return null;
|
|
412
|
-
const json = (await res.json()) as any;
|
|
413
|
-
const b64 = json?.candidates?.[0]?.content?.parts?.[0]?.inlineData?.data;
|
|
414
|
-
if (!b64) return null;
|
|
415
|
-
const pcm = Buffer.from(b64, "base64");
|
|
416
|
-
return pcmToWav(pcm, 24000, 1);
|
|
417
|
-
} catch {
|
|
418
|
-
return null;
|
|
419
|
-
}
|
|
420
|
-
}
|
|
421
|
-
|
|
422
|
-
function pcmToWav(pcm: Buffer, sampleRate: number, channels: number): Buffer {
|
|
423
|
-
const header = Buffer.alloc(44);
|
|
424
|
-
const dataSize = pcm.length;
|
|
425
|
-
header.write("RIFF", 0);
|
|
426
|
-
header.writeUInt32LE(36 + dataSize, 4);
|
|
427
|
-
header.write("WAVE", 8);
|
|
428
|
-
header.write("fmt ", 12);
|
|
429
|
-
header.writeUInt32LE(16, 16);
|
|
430
|
-
header.writeUInt16LE(1, 20);
|
|
431
|
-
header.writeUInt16LE(channels, 22);
|
|
432
|
-
header.writeUInt32LE(sampleRate, 24);
|
|
433
|
-
header.writeUInt32LE(sampleRate * channels * 2, 28);
|
|
434
|
-
header.writeUInt16LE(channels * 2, 32);
|
|
435
|
-
header.writeUInt16LE(16, 34);
|
|
436
|
-
header.write("data", 36);
|
|
437
|
-
header.writeUInt32LE(dataSize, 40);
|
|
438
|
-
return Buffer.concat([header, pcm]);
|
|
439
|
-
}
|
|
440
|
-
|
|
441
|
-
// ---------------------------------------------------------------------------
|
|
442
|
-
// Play audio in browser (inject base64 WAV)
|
|
443
|
-
// ---------------------------------------------------------------------------
|
|
444
|
-
|
|
445
|
-
async function playAudioInBrowser(page: Page, wavBuf: Buffer): Promise<number> {
|
|
446
|
-
const b64 = wavBuf.toString("base64");
|
|
447
|
-
const durationMs = await page.evaluate(async (data: string) => {
|
|
448
|
-
const binary = atob(data);
|
|
449
|
-
const bytes = new Uint8Array(binary.length);
|
|
450
|
-
for (let i = 0; i < binary.length; i++) bytes[i] = binary.charCodeAt(i);
|
|
451
|
-
const blob = new Blob([bytes], { type: "audio/wav" });
|
|
452
|
-
const url = URL.createObjectURL(blob);
|
|
453
|
-
const audio = new Audio(url);
|
|
454
|
-
await audio.play();
|
|
455
|
-
const dur = audio.duration * 1000;
|
|
456
|
-
return isFinite(dur) ? dur : 3000;
|
|
457
|
-
}, b64);
|
|
458
|
-
return durationMs;
|
|
459
|
-
}
|
|
460
|
-
|
|
461
|
-
// ---------------------------------------------------------------------------
|
|
462
|
-
// Main research loop
|
|
463
|
-
// ---------------------------------------------------------------------------
|
|
464
|
-
|
|
465
|
-
/**
 * Main research loop: drives a recorded Playwright session through every
 * chapter/feature in the checklist, letting the LLM agent decide actions,
 * narrating via TTS, and writing a timestamped action log (actions.jsonl),
 * per-feature screenshots, a mixed narration WAV, and the raw screen
 * recording (raw_video.webm) into `.comfy-qa/.research/<product-slug>/`.
 *
 * @param checklistPath - Path to the YAML checklist describing the product,
 *   its URL (optionally overridden via `staging_url_env`), chapters,
 *   features, and an optional conclusion narration.
 */
export async function runDemoResearch(checklistPath: string) {
  // Load checklist
  const raw = fs.readFileSync(checklistPath, "utf-8");
  const checklist = yaml.parse(raw) as Checklist;

  // Resolve URL: an env-var override (staging) wins over the static URL.
  const baseUrl = checklist.staging_url_env
    ? process.env[checklist.staging_url_env] ?? checklist.url
    : checklist.url;

  // Setup output dir
  const productSlug = checklist.product.toLowerCase().replace(/\s+/g, "-");
  const outputDir = path.resolve(`.comfy-qa/.research/${productSlug}`);
  fs.mkdirSync(outputDir, { recursive: true });
  fs.mkdirSync(path.join(outputDir, "screenshots"), { recursive: true });

  // Action log: one JSON object per line; "w" truncates any previous run.
  const logPath = path.join(outputDir, "actions.jsonl");
  const logStream = fs.createWriteStream(logPath, { flags: "w" });
  const startMs = Date.now();

  // Append one entry to actions.jsonl (ts/offsetMs filled in here) and echo
  // a human-readable line to the console keyed by entry type.
  function log(entry: Omit<ActionLogEntry, "ts" | "offsetMs">) {
    const now = Date.now();
    const full: ActionLogEntry = { ts: now, offsetMs: now - startMs, ...entry };
    logStream.write(JSON.stringify(full) + "\n");
    const prefix = `[${((now - startMs) / 1000).toFixed(1)}s]`;
    if (entry.type === "narrate") console.log(` ${prefix} 🎤 ${entry.text}`);
    else if (entry.type === "action") console.log(` ${prefix} ▶ ${entry.action} ${entry.success ? "✓" : "✗"}`);
    else if (entry.type === "feature_start") console.log(` ${prefix} 📌 ${entry.feature}: ${entry.text}`);
    else if (entry.type === "feature_end") console.log(` ${prefix} ${entry.success ? "✅" : "❌"} ${entry.feature}`);
    else if (entry.type === "chapter_start") console.log(`\n ${prefix} 📖 Chapter: ${entry.chapter}`);
    else if (entry.type === "error") console.log(` ${prefix} ⚠ ${entry.error}`);
  }

  // Launch browser
  console.log(`\n🔬 Research Agent: ${checklist.product}`);
  console.log(` URL: ${baseUrl}`);
  console.log(` Output: ${outputDir}\n`);

  const browser = await chromium.launch({ headless: true });
  const context = await browser.newContext({
    viewport: { width: 1280, height: 720 },
    recordVideo: { dir: outputDir, size: { width: 1280, height: 720 } },
  });

  // Apply HUD overlay (cursor, keystrokes)
  await applyHud(context, {
    cursor: true,
    keyboard: true,
    cursorStyle: "default",
    actionDelay: 200,
  });

  const page = await context.newPage();
  const history: string[] = [];

  // Navigate to site
  await page.goto(baseUrl, { waitUntil: "domcontentloaded", timeout: 30000 });
  await page.waitForTimeout(2000);

  // Audio segments for later muxing
  const audioSegments: Array<{ offsetMs: number; wavBuf: Buffer }> = [];

  // -- Research loop --
  for (const chapter of checklist.chapters) {
    log({ type: "chapter_start", chapter: chapter.name, text: chapter.goal });

    for (const feature of chapter.features) {
      log({ type: "feature_start", feature: feature.id, text: feature.description });

      // Auto-execute navigation actions from checklist before asking the agent
      if (feature.action) {
        const navMatch = feature.action.match(/navigate\s+to\s+(\S+)/i);
        if (navMatch) {
          // Relative targets are resolved against the product's base URL.
          const target = navMatch[1].startsWith("http") ? navMatch[1] : new URL(navMatch[1], baseUrl).href;
          try {
            await page.goto(target, { waitUntil: "domcontentloaded", timeout: 15000 });
            await page.waitForTimeout(1500);
            log({ type: "action", feature: feature.id, action: `navigate ${target}`, success: true });
            history.push(`[${feature.id}] Navigated to ${target}`);
          } catch (err: any) {
            log({ type: "error", feature: feature.id, error: `Auto-navigate failed: ${err.message?.slice(0, 100)}` });
          }
        }
      }

      // Up to 3 agent turns per feature; stop early once the agent says done.
      let demonstrated = false;
      for (let attempt = 1; attempt <= 3 && !demonstrated; attempt++) {
        try {
          // Capture page state
          const state = await captureState(page);

          // Ask Claude what to do
          const decision = await askAgent(checklist, chapter, feature, state, history, attempt);

          // Narrate: speak in-page so the recording captures the audio, and
          // also keep the WAV for the offline narration track.
          if (decision.narration) {
            log({ type: "narrate", feature: feature.id, text: decision.narration });
            const wav = await generateTTS(decision.narration);
            if (wav) {
              audioSegments.push({ offsetMs: Date.now() - startMs, wavBuf: wav });
              const durMs = await playAudioInBrowser(page, wav).catch(() => 3000);
              await page.waitForTimeout(Math.max(durMs - 500, 500));
            }
          }

          // Execute actions
          for (const action of decision.actions) {
            const result = await executeAction(page, action);
            log({
              type: "action",
              feature: feature.id,
              action: `${action.type} ${action.selector ?? action.text ?? action.key ?? ""}`.trim(),
              selector: action.selector,
              success: result.success,
              error: result.success ? undefined : result.result,
            });
            history.push(`[${feature.id}] ${result.result}`);
            await page.waitForTimeout(400); // visual pause
          }

          if (decision.observation) {
            history.push(`[observe] ${decision.observation}`);
          }

          demonstrated = decision.done;
        } catch (err: any) {
          // A failed turn is logged but does not abort the feature loop.
          log({ type: "error", feature: feature.id, error: err.message?.slice(0, 200) });
        }
      }

      // Screenshot for this feature
      const ssPath = path.join(outputDir, "screenshots", `${feature.id}.png`);
      await page.screenshot({ path: ssPath }).catch(() => {});
      log({ type: "screenshot", feature: feature.id, screenshot: ssPath });

      log({ type: "feature_end", feature: feature.id, success: demonstrated });
    }

    log({ type: "chapter_end", chapter: chapter.name });
  }

  // Conclusion narration
  if (checklist.conclusion?.narration) {
    log({ type: "narrate", text: checklist.conclusion.narration });
    const wav = await generateTTS(checklist.conclusion.narration);
    if (wav) {
      audioSegments.push({ offsetMs: Date.now() - startMs, wavBuf: wav });
      await playAudioInBrowser(page, wav).catch(() => {});
      await page.waitForTimeout(3000);
    }
  }

  // Save audio track: mix all narration segments onto one timeline-length WAV.
  if (audioSegments.length > 0) {
    const totalMs = Date.now() - startMs;
    const wavPath = path.join(outputDir, "narration.wav");
    const wavBuf = buildWavTrack(audioSegments, totalMs);
    if (wavBuf) fs.writeFileSync(wavPath, wavBuf);
    console.log(`\n 🔊 Audio: ${wavPath}`);
  }

  // Stop recording: video path must be read before the context closes.
  await page.waitForTimeout(1000);
  const videoPath = await page.video()?.path();
  await context.close();
  await browser.close();

  if (videoPath) {
    const dest = path.join(outputDir, "raw_video.webm");
    fs.renameSync(videoPath, dest);
    console.log(` 🎬 Video: ${dest}`);
  }

  logStream.end();
  console.log(` 📋 Log: ${logPath}`);
  console.log(`\n✅ Research complete for ${checklist.product}\n`);
}
|
|
643
|
-
|
|
644
|
-
// ---------------------------------------------------------------------------
|
|
645
|
-
// WAV track builder (stereo mix of all narration segments)
|
|
646
|
-
// ---------------------------------------------------------------------------
|
|
647
|
-
|
|
648
|
-
function parseWav(buf: Buffer) {
|
|
649
|
-
const dataOff = buf.indexOf("data") + 8;
|
|
650
|
-
if (dataOff < 8) return { float32: new Float32Array(0), sampleRate: 24000, channels: 1, sampleCount: 0, durationMs: 0 };
|
|
651
|
-
const sr = buf.readUInt32LE(24);
|
|
652
|
-
const ch = buf.readUInt16LE(22);
|
|
653
|
-
const pcm = buf.subarray(dataOff);
|
|
654
|
-
const count = pcm.length / 2;
|
|
655
|
-
const f32 = new Float32Array(count);
|
|
656
|
-
for (let i = 0; i < count; i++) f32[i] = pcm.readInt16LE(i * 2) / 32768;
|
|
657
|
-
return { float32: f32, sampleRate: sr, channels: ch, sampleCount: count, durationMs: (count / ch / sr) * 1000 };
|
|
658
|
-
}
|
|
659
|
-
|
|
660
|
-
function buildWavTrack(segments: Array<{ offsetMs: number; wavBuf: Buffer }>, totalMs: number): Buffer | null {
|
|
661
|
-
if (!segments.length) return null;
|
|
662
|
-
const sr = 24000;
|
|
663
|
-
const ch = 2;
|
|
664
|
-
const totalSamples = Math.ceil((totalMs / 1000) * sr * ch);
|
|
665
|
-
const track = new Float32Array(totalSamples);
|
|
666
|
-
|
|
667
|
-
for (const seg of segments) {
|
|
668
|
-
const p = parseWav(seg.wavBuf);
|
|
669
|
-
const off = Math.floor((seg.offsetMs / 1000) * sr) * ch;
|
|
670
|
-
const stereo =
|
|
671
|
-
p.channels === 1
|
|
672
|
-
? (() => {
|
|
673
|
-
const s = new Float32Array(p.sampleCount * 2);
|
|
674
|
-
for (let i = 0; i < p.sampleCount; i++) {
|
|
675
|
-
s[i * 2] = p.float32[i];
|
|
676
|
-
s[i * 2 + 1] = p.float32[i];
|
|
677
|
-
}
|
|
678
|
-
return s;
|
|
679
|
-
})()
|
|
680
|
-
: p.float32;
|
|
681
|
-
for (let i = 0; i < stereo.length && off + i < track.length; i++) {
|
|
682
|
-
track[off + i] += stereo[i];
|
|
683
|
-
}
|
|
684
|
-
}
|
|
685
|
-
|
|
686
|
-
const int16 = new Int16Array(track.length);
|
|
687
|
-
for (let i = 0; i < track.length; i++) {
|
|
688
|
-
const s = Math.max(-1, Math.min(1, track[i]));
|
|
689
|
-
int16[i] = s < 0 ? s * 0x8000 : s * 0x7fff;
|
|
690
|
-
}
|
|
691
|
-
|
|
692
|
-
const dataBytes = int16.length * 2;
|
|
693
|
-
const buf = Buffer.alloc(44 + dataBytes);
|
|
694
|
-
buf.write("RIFF", 0);
|
|
695
|
-
buf.writeUInt32LE(36 + dataBytes, 4);
|
|
696
|
-
buf.write("WAVE", 8);
|
|
697
|
-
buf.write("fmt ", 12);
|
|
698
|
-
buf.writeUInt32LE(16, 16);
|
|
699
|
-
buf.writeUInt16LE(1, 20);
|
|
700
|
-
buf.writeUInt16LE(ch, 22);
|
|
701
|
-
buf.writeUInt32LE(sr, 24);
|
|
702
|
-
buf.writeUInt32LE(sr * ch * 2, 28);
|
|
703
|
-
buf.writeUInt16LE(ch * 2, 32);
|
|
704
|
-
buf.writeUInt16LE(16, 34);
|
|
705
|
-
buf.write("data", 36);
|
|
706
|
-
buf.writeUInt32LE(dataBytes, 40);
|
|
707
|
-
Buffer.from(int16.buffer).copy(buf, 44);
|
|
708
|
-
return buf;
|
|
709
|
-
}
|
|
710
|
-
|
|
711
|
-
// ---------------------------------------------------------------------------
|
|
712
|
-
// CLI entrypoint
|
|
713
|
-
// ---------------------------------------------------------------------------
|
|
714
|
-
|
|
715
|
-
if (import.meta.main) {
|
|
716
|
-
const checklistPath = process.argv[2];
|
|
717
|
-
if (!checklistPath) {
|
|
718
|
-
console.error("Usage: bun src/agent/demo-research.ts <checklist.yaml>");
|
|
719
|
-
process.exit(1);
|
|
720
|
-
}
|
|
721
|
-
runDemoResearch(path.resolve(checklistPath)).catch((err) => {
|
|
722
|
-
console.error(err);
|
|
723
|
-
process.exit(1);
|
|
724
|
-
});
|
|
725
|
-
}
|