comfy-qa 1.1.0 → 1.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,450 @@
1
+ /**
2
+ * Demo Editor Agent — reads actions.jsonl from research, extracts active
3
+ * bursts (narration + surrounding actions), removes LLM thinking gaps,
4
+ * and assembles a tight final MP4 using ffmpeg.
5
+ *
6
+ * Key improvement: instead of keeping whole feature segments (which include
7
+ * 15-30s LLM response wait times), we extract only the "activity bursts"
8
+ * — windows around narrations and actions where something is actually
9
+ * happening on screen.
10
+ *
11
+ * Output:
12
+ * - final_demo.mp4 (polished video with idle time removed)
13
+ * - edit_plan.json (the editing decisions made)
14
+ *
15
+ * Usage:
16
+ * bun src/agent/demo-editor.ts .comfy-qa/.research/comfy-registry/
17
+ */
18
+ import * as fs from "node:fs";
19
+ import * as path from "node:path";
20
+ import { $ } from "bun";
21
+
22
+ // ---------------------------------------------------------------------------
23
+ // Types
24
+ // ---------------------------------------------------------------------------
25
+
26
/**
 * One parsed line of `actions.jsonl`.
 *
 * Only `ts`, `offsetMs` and `type` are always present; the optional fields
 * are filled in depending on the kind of entry.
 */
interface ActionLogEntry {
  /** Timestamp of the entry. Not read by the editor — presumably wall-clock; TODO confirm unit. */
  ts: number;
  /** Position of the entry on the raw recording timeline, in milliseconds. */
  offsetMs: number;
  /**
   * Entry kind. The editor reacts to `chapter_start`, `feature_start`,
   * `feature_end`, `narrate`, `action` and `error`.
   */
  type: string;
  /** Feature name; read from `feature_start` entries. */
  feature?: string;
  /** Chapter name; read from `chapter_start` entries. */
  chapter?: string;
  /** Not read by the editor — presumably the kind of UI action performed. */
  action?: string;
  /** Not read by the editor — presumably the target element of the action. */
  selector?: string;
  /** Narration text; read from `narrate` entries. */
  text?: string;
  /** Outcome flag; read from `feature_end` and `action` entries. */
  success?: boolean;
  /** Not read by the editor — presumably an error description. */
  error?: string;
  /** Not read by the editor — presumably a screenshot path. */
  screenshot?: string;
}
39
+
40
/**
 * Everything logged for one feature demonstration, from its `feature_start`
 * entry up to and including the matching `feature_end` entry.
 */
interface Segment {
  /** Feature name taken from the `feature_start` entry. */
  feature: string;
  /** Chapter that was current when the feature started. */
  chapter: string;
  /** Timeline offset (ms) of the `feature_start` entry. */
  startMs: number;
  /** Timeline offset (ms) of the last entry recorded for the feature. */
  endMs: number;
  /** Text of the first non-empty narration inside the feature, or "". */
  narration: string;
  /** Every log entry that belongs to the feature, in log order. */
  actions: ActionLogEntry[];
  /** `success` flag reported by the `feature_end` entry. */
  success: boolean;
  /** True when at least one `error` entry was logged for the feature. */
  hasError: boolean;
  /** Number of `error` entries logged for the feature. */
  retryCount: number;
}
51
+
52
/**
 * A burst is a tight window of activity within a segment: the slice of the
 * raw recording that is kept in the final video.
 */
interface Burst {
  /** Feature the burst was cut from. */
  feature: string;
  /** Chapter the feature belongs to. */
  chapter: string;
  /** Start of the kept window on the raw timeline, in milliseconds. */
  startMs: number;
  /** End of the kept window on the raw timeline, in milliseconds. */
  endMs: number;
  /** True when at least one narration falls inside the burst. */
  hasNarration: boolean;
  /** Number of narration/action events grouped into the burst. */
  eventCount: number;
}
61
+
62
/** The editing decisions for one recording; serialized to `edit_plan.json`. */
interface EditPlan {
  /** Product name (the planner currently leaves this empty). */
  product: string;
  /** Timeline offset (ms) of the last log entry, used as the raw footage length. */
  totalRawMs: number;
  /** One report per feature segment describing what was done with it and why. */
  segments: {
    feature: string;
    chapter: string;
    success: boolean;
    durationMs: number;
    /** "cut" when the segment is dropped, "burst" when bursts of it are kept. */
    action: string;
    /** Human-readable justification for `action`. */
    reason: string;
  }[];
  /** Windows of the raw recording that make up the final video. */
  bursts: Burst[];
  /** Sum of the durations of all kept bursts, in milliseconds. */
  estimatedFinalMs: number;
}
69
+
70
+ // ---------------------------------------------------------------------------
71
+ // Parse actions.jsonl into segments
72
+ // ---------------------------------------------------------------------------
73
+
74
/**
 * Read a JSON-lines action log and parse every non-blank line.
 *
 * Blank lines (including the single empty "line" of an empty file) are
 * skipped instead of being handed to `JSON.parse`, so an empty log yields an
 * empty array rather than an exception.
 *
 * @param logPath Path of the `actions.jsonl` file to read.
 * @returns The parsed entries in file order. The entries are cast, not
 *   validated, so callers must tolerate missing optional fields.
 * @throws SyntaxError when a non-blank line is not valid JSON; the message
 *   names the file and the 1-based line number.
 */
function parseLog(logPath: string): ActionLogEntry[] {
  const entries: ActionLogEntry[] = [];
  const lines = fs.readFileSync(logPath, "utf-8").split("\n");

  lines.forEach((rawLine, index) => {
    // trim() also removes the "\r" left behind by CRLF line endings.
    const line = rawLine.trim();
    if (line === "") return;

    try {
      entries.push(JSON.parse(line) as ActionLogEntry);
    } catch (err: unknown) {
      const detail = err instanceof Error ? err.message : String(err);
      throw new SyntaxError(`${logPath}:${index + 1}: invalid JSON in action log (${detail})`);
    }
  });

  return entries;
}
78
+
79
/**
 * Fold the flat action log into one segment per completed feature.
 *
 * A segment opens on `feature_start` and is emitted when the matching
 * `feature_end` arrives. Entries logged while no feature is open are ignored
 * (apart from `chapter_start`, which updates the chapter used for later
 * features). A feature that is still open when another `feature_start`
 * arrives, or when the log ends, is discarded.
 *
 * @param entries Log entries in chronological order.
 * @returns The completed segments in the order their `feature_end` was seen.
 */
function groupIntoSegments(entries: ActionLogEntry[]): Segment[] {
  const finished: Segment[] = [];
  let open: Segment | null = null;
  let chapter = "";

  for (const entry of entries) {
    const { type, offsetMs } = entry;

    if (type === "chapter_start") {
      chapter = entry.chapter ?? "";
    } else if (type === "feature_start") {
      // Starting a feature replaces whatever segment was still open.
      open = {
        feature: entry.feature ?? "",
        chapter,
        startMs: offsetMs,
        endMs: offsetMs,
        narration: "",
        actions: [],
        success: false,
        hasError: false,
        retryCount: 0,
      };
    }

    if (open === null) continue;

    // Every entry seen while a feature is open belongs to it, including the
    // feature_start / feature_end markers themselves.
    open.actions.push(entry);
    open.endMs = offsetMs;

    if (type === "narrate") {
      // Only the first non-empty narration is remembered.
      if (!open.narration) open.narration = entry.text ?? "";
    } else if (type === "error") {
      open.hasError = true;
      open.retryCount += 1;
    } else if (type === "feature_end") {
      open.success = entry.success ?? false;
      finished.push(open);
      open = null;
    }
  }

  return finished;
}
126
+
127
+ // ---------------------------------------------------------------------------
128
+ // Extract activity bursts from segments
129
+ // ---------------------------------------------------------------------------
130
+
131
const BURST_PAD_BEFORE = 1000; // 1s padding before first event in burst
const BURST_PAD_AFTER = 1500; // 1.5s padding after last event in burst
const MIN_BURST_MS = 3000; // Minimum burst duration

/** A log event together with its estimated end time on the raw timeline. */
interface TimedEvent {
  offsetMs: number;
  endMs: number;
  type: string;
  text?: string;
}

/**
 * Extract activity bursts from a segment.
 *
 * Only narrations and actions that did not fail count as events. A narration
 * is assumed to last for its estimated TTS duration (~150 words per minute,
 * at least 2.5s) or until the next narration/action entry, whichever is
 * shorter, so LLM thinking time after the speech is not kept. An action is
 * assumed to last 500ms.
 *
 * Events that overlap or are close (within 3s) form one burst; longer gaps
 * (typically LLM thinking time) are cut. Each burst is padded, kept inside
 * the segment, and stretched to `MIN_BURST_MS` where the segment has room:
 * first forwards, then backwards. Bursts that end up touching or overlapping
 * are merged so no footage is duplicated.
 *
 * @param seg Segment whose `actions` are in log order.
 * @returns Non-overlapping bursts in chronological order, all within
 *   `[seg.startMs, seg.endMs]`; empty when the segment has no usable events.
 */
function extractBursts(seg: Segment): Burst[] {
  const ACTION_DURATION_MS = 500;
  const MIN_TTS_MS = 2500;
  const WORDS_PER_MINUTE = 150;
  // Gap tolerance: 3s covers the visual pause between actions.
  const GAP_TOLERANCE = 3000;

  const actions = seg.actions;

  // nextMeaningfulAt[i] = offset of the first narrate/action entry after
  // index i (failed actions included), or Infinity when there is none.
  // Built in one backward pass instead of rescanning for every narration.
  const nextMeaningfulAt = new Array<number>(actions.length);
  let upcoming = Infinity;
  for (let i = actions.length - 1; i >= 0; i--) {
    nextMeaningfulAt[i] = upcoming;
    const kind = actions[i].type;
    if (kind === "narrate" || kind === "action") upcoming = actions[i].offsetMs;
  }

  const events: TimedEvent[] = [];
  actions.forEach((a, i) => {
    if (a.type === "narrate") {
      // filter(Boolean) drops the empty tokens produced by empty text or
      // leading/trailing whitespace.
      const words = (a.text ?? "").split(/\s+/).filter(Boolean).length;
      const ttsDurMs = Math.max(MIN_TTS_MS, (words / WORDS_PER_MINUTE) * 60 * 1000);
      const untilNext = Math.max(0, nextMeaningfulAt[i] - a.offsetMs);
      events.push({
        offsetMs: a.offsetMs,
        endMs: a.offsetMs + Math.min(ttsDurMs, untilNext),
        type: "narrate",
        text: a.text,
      });
    } else if (a.type === "action" && a.success !== false) {
      events.push({ offsetMs: a.offsetMs, endMs: a.offsetMs + ACTION_DURATION_MS, type: "action" });
    }
  });

  if (events.length === 0) return [];
  events.sort((a, b) => a.offsetMs - b.offsetMs);

  // Turn one group of nearby events into a padded burst inside the segment.
  const toBurst = (group: TimedEvent[], lastEnd: number): Burst => {
    let startMs = Math.max(seg.startMs, group[0].offsetMs - BURST_PAD_BEFORE);
    let endMs = Math.min(seg.endMs, lastEnd + BURST_PAD_AFTER);
    if (endMs - startMs < MIN_BURST_MS) {
      // Too short: grow forwards first, then backwards, never leaving the segment.
      endMs = Math.max(endMs, Math.min(seg.endMs, startMs + MIN_BURST_MS));
      startMs = Math.min(startMs, Math.max(seg.startMs, endMs - MIN_BURST_MS));
    }
    return {
      feature: seg.feature,
      chapter: seg.chapter,
      startMs,
      endMs,
      hasNarration: group.some((e) => e.type === "narrate"),
      eventCount: group.length,
    };
  };

  // Group events separated by at most GAP_TOLERANCE.
  const bursts: Burst[] = [];
  let group: TimedEvent[] = [events[0]];
  let groupEnd = events[0].endMs;
  for (let i = 1; i < events.length; i++) {
    const ev = events[i];
    if (ev.offsetMs <= groupEnd + GAP_TOLERANCE) {
      group.push(ev);
      groupEnd = Math.max(groupEnd, ev.endMs);
    } else {
      bursts.push(toBurst(group, groupEnd));
      group = [ev];
      groupEnd = ev.endMs;
    }
  }
  bursts.push(toBurst(group, groupEnd));

  // Stretching to the minimum duration can make neighbours meet; merge them
  // so the same footage is never cut twice.
  const merged: Burst[] = [];
  for (const burst of bursts) {
    const prev = merged[merged.length - 1];
    if (prev !== undefined && burst.startMs <= prev.endMs) {
      prev.startMs = Math.min(prev.startMs, burst.startMs);
      prev.endMs = Math.max(prev.endMs, burst.endMs);
      prev.hasNarration = prev.hasNarration || burst.hasNarration;
      prev.eventCount += burst.eventCount;
    } else {
      merged.push(burst);
    }
  }
  return merged;
}
223
+
224
+ // ---------------------------------------------------------------------------
225
+ // Build edit plan
226
+ // ---------------------------------------------------------------------------
227
+
228
/**
 * Decide, feature by feature, which parts of the recording are kept.
 *
 * Failed segments and segments shorter than 1.5s are cut. For the others the
 * narrated bursts are kept; when a segment has bursts but none is narrated,
 * all of its bursts are kept instead; a segment without any burst is cut.
 *
 * @param entries Parsed action log in chronological order.
 * @returns The plan: one report per segment, the kept bursts in segment
 *   order, and the summed duration of those bursts.
 */
function createEditPlan(entries: ActionLogEntry[]): EditPlan {
  const reports: EditPlan["segments"] = [];
  const kept: Burst[] = [];
  const totalSpan = (list: Burst[]): number =>
    list.reduce((sum, b) => sum + (b.endMs - b.startMs), 0);

  for (const seg of groupIntoSegments(entries)) {
    const durationMs = seg.endMs - seg.startMs;
    const note = (success: boolean, action: string, reason: string): void => {
      reports.push({ feature: seg.feature, chapter: seg.chapter, success, durationMs, action, reason });
    };

    if (!seg.success) {
      note(false, "cut", "Feature demonstration failed");
      continue;
    }
    if (durationMs < 1500) {
      note(true, "cut", "Too short to be meaningful");
      continue;
    }

    const bursts = extractBursts(seg);
    // Visual-only bursts are usually noise, so narrated ones win when present.
    const narrated = bursts.filter((b) => b.hasNarration);

    if (narrated.length > 0) {
      kept.push(...narrated);
      const burstMs = totalSpan(narrated);
      const saved = durationMs - burstMs;
      note(
        true,
        "burst",
        `${narrated.length} burst(s), ${(burstMs / 1000).toFixed(1)}s kept, ${(saved / 1000).toFixed(1)}s idle removed`,
      );
    } else if (bursts.length > 0) {
      // Fallback: nothing narrated, keep the action-only bursts.
      kept.push(...bursts);
      note(true, "burst", `No narrated bursts, keeping ${bursts.length} action bursts`);
    } else {
      note(true, "cut", "No usable bursts");
    }
  }

  return {
    product: "",
    // The offset of the last log entry stands in for the raw footage length.
    totalRawMs: entries.length > 0 ? entries[entries.length - 1].offsetMs : 0,
    segments: reports,
    bursts: kept,
    estimatedFinalMs: totalSpan(kept),
  };
}
305
+
306
+ // ---------------------------------------------------------------------------
307
+ // Generate ffmpeg script
308
+ // ---------------------------------------------------------------------------
309
+
310
/**
 * Quote `value` as one single-quoted word. Embedded single quotes are written
 * as `'\''`; this form is used both for bash words and for `file` lines of
 * the ffmpeg concat list.
 */
function shellQuote(value: string): string {
  return `'${value.replace(/'/g, "'\\''")}'`;
}

/** Path of the file called `fileName` in the same directory as `filePath`. */
function besideFile(filePath: string, fileName: string): string {
  const { root, dir } = path.parse(filePath);
  return path.format({ root, dir, base: fileName });
}

/**
 * Build the bash script that cuts every burst out of the raw recording and
 * concatenates the pieces into the final video.
 *
 * When `narration.wav` exists next to the raw video, each piece takes its
 * audio from that file at the same timestamps. Temporary files are derived
 * from the output file's name (`<stem>-seg<i>.mp4`, `<stem>-segments.txt`)
 * and are removed by the script once the final video is written. All paths
 * are single-quoted, so spaces, quotes, `$` or backticks in a path cannot
 * alter the script.
 *
 * @param plan Edit plan whose `bursts` give the windows to keep.
 * @param rawVideoPath Path of the raw recording.
 * @param outputPath Path of the final video to produce.
 * @returns The script text; a two-line script that only prints a notice when
 *   the plan has no bursts.
 */
function generateFfmpegScript(plan: EditPlan, rawVideoPath: string, outputPath: string): string {
  const audioPath = besideFile(rawVideoPath, "narration.wav");
  const hasAudio = fs.existsSync(audioPath);

  if (plan.bursts.length === 0) {
    return `#!/bin/bash\necho "No usable bursts found"`;
  }

  // Derive temp names from the file name only, so a directory that happens
  // to contain ".mp4" is never rewritten.
  const stem = path.parse(outputPath).name;
  const concatListPath = besideFile(outputPath, `${stem}-segments.txt`);
  const segFiles = plan.bursts.map((_, i) => besideFile(outputPath, `${stem}-seg${i}.mp4`));

  const segmentCmds = plan.bursts.map((b, i) => {
    const startSec = (b.startMs / 1000).toFixed(3);
    const endSec = (b.endMs / 1000).toFixed(3);
    const window = `-ss ${startSec} -to ${endSec}`;
    const target = shellQuote(segFiles[i]);

    return hasAudio
      ? `ffmpeg -y ${window} -i ${shellQuote(rawVideoPath)} ` +
          `${window} -i ${shellQuote(audioPath)} ` +
          `-map 0:v -map 1:a -c:v libx264 -preset fast -c:a aac -b:a 128k -shortest ${target}`
      : `ffmpeg -y ${window} -i ${shellQuote(rawVideoPath)} -c:v libx264 -preset fast ${target}`;
  });

  // Assembled line by line so the heredoc terminator is guaranteed to sit at
  // column 0 regardless of how this source file is indented.
  return [
    "#!/bin/bash",
    "set -e",
    "",
    "# Cut individual bursts (video + audio synced at same timestamps)",
    ...segmentCmds,
    "",
    "# Create concat list",
    `cat > ${shellQuote(concatListPath)} << 'CONCAT'`,
    ...segFiles.map((f) => `file ${shellQuote(f)}`),
    "CONCAT",
    "",
    "# Concatenate all bursts into final video",
    `ffmpeg -y -f concat -safe 0 -i ${shellQuote(concatListPath)} -c copy ${shellQuote(outputPath)}`,
    "",
    `echo "Final video:" ${shellQuote(outputPath)}`,
    "",
    "# Cleanup",
    `rm -f ${segFiles.map(shellQuote).join(" ")}`,
    `rm -f ${shellQuote(concatListPath)}`,
    "",
  ].join("\n");
}
366
+
367
+ // ---------------------------------------------------------------------------
368
+ // Main
369
+ // ---------------------------------------------------------------------------
370
+
371
/**
 * Turn a research directory into an edited demo.
 *
 * Reads `actions.jsonl`, writes `edit_plan.json`, and — when
 * `raw_video.webm` is present — writes `edit.sh` and runs it with bash to
 * produce `final_demo.mp4`. A summary, a per-segment report and a per-burst
 * report are printed to stdout.
 *
 * The process exits with status 1 when `actions.jsonl` is missing. A failing
 * ffmpeg run is reported, with its reason, but does not abort the function:
 * the script stays on disk so it can be re-run by hand.
 *
 * @param researchDir Directory holding `actions.jsonl` and the raw recording.
 */
export async function editDemoResearch(researchDir: string): Promise<void> {
  const logPath = path.join(researchDir, "actions.jsonl");
  if (!fs.existsSync(logPath)) {
    console.error(`No actions.jsonl found in ${researchDir}`);
    process.exit(1);
  }

  console.log(`\n Editor Agent: processing ${researchDir}\n`);

  // Parse and plan
  const entries = parseLog(logPath);
  const plan = createEditPlan(entries);

  const rawSec = (plan.totalRawMs / 1000).toFixed(1);
  const finalSec = (plan.estimatedFinalMs / 1000).toFixed(1);
  const ratio = plan.totalRawMs > 0 ? ((1 - plan.estimatedFinalMs / plan.totalRawMs) * 100).toFixed(0) : "0";

  console.log(` Raw footage: ${rawSec}s`);
  console.log(` Final video: ${finalSec}s (${ratio}% idle removed)`);
  console.log(` Bursts: ${plan.bursts.length}`);
  console.log(` Segments: ${plan.segments.length} features`);

  // Save edit plan
  const planPath = path.join(researchDir, "edit_plan.json");
  fs.writeFileSync(planPath, JSON.stringify(plan, null, 2));
  console.log(` Plan: ${planPath}`);

  // Generate ffmpeg script
  const rawVideoPath = path.join(researchDir, "raw_video.webm");
  const outputPath = path.join(researchDir, "final_demo.mp4");

  if (fs.existsSync(rawVideoPath)) {
    const scriptPath = path.join(researchDir, "edit.sh");
    const script = generateFfmpegScript(plan, rawVideoPath, outputPath);
    fs.writeFileSync(scriptPath, script, { mode: 0o755 });
    console.log(` Script: ${scriptPath}\n`);

    if (plan.bursts.length === 0) {
      // The generated script only prints a notice in this case; running it
      // and then announcing "Done" would claim a video that was never made.
      console.log(` No usable bursts — skipping video assembly`);
    } else {
      try {
        console.log(" Running ffmpeg...");
        await $`bash ${scriptPath}`.quiet();
        console.log(` Done: ${outputPath}`);
      } catch (err: unknown) {
        // Best effort: keep going so the reports below are still printed,
        // but say why the run failed instead of discarding the error.
        const reason = err instanceof Error ? err.message : String(err);
        console.log(` ffmpeg failed (${reason}) — run manually: bash ${scriptPath}`);
      }
    }
  } else {
    console.log(` No raw_video.webm found — skipping video assembly`);
  }

  // Print segment report
  console.log("\n --- Segment Report ---");
  for (const s of plan.segments) {
    const dur = (s.durationMs / 1000).toFixed(1);
    const icon = s.action === "cut" ? "X" : "OK";
    console.log(` [${icon}] [${dur}s] ${s.chapter} / ${s.feature}: ${s.reason}`);
  }

  // Print burst details
  console.log("\n --- Burst Detail ---");
  for (const b of plan.bursts) {
    const dur = ((b.endMs - b.startMs) / 1000).toFixed(1);
    console.log(` [${dur}s] ${b.chapter} / ${b.feature} (${b.eventCount} events${b.hasNarration ? ", narrated" : ""})`);
  }
  console.log("");
}
435
+
436
+ // ---------------------------------------------------------------------------
437
+ // CLI
438
+ // ---------------------------------------------------------------------------
439
+
440
// Entry point when the file is run directly: the first CLI argument is the
// research directory. Any failure ends the process with exit status 1.
if (import.meta.main) {
  const [, , dir] = process.argv;
  if (dir) {
    editDemoResearch(path.resolve(dir)).catch((err) => {
      console.error(err);
      process.exit(1);
    });
  } else {
    console.error("Usage: bun src/agent/demo-editor.ts <research-dir>");
    process.exit(1);
  }
}